# choose to corrupt one part of the triple to_mod = data_rng.choice(3) # corrupt with some other part if to_mod == 0: while word_a_new == word_a: word_a_new = data_rng.choice(relationships.indices_of_words_in_synsets) #sample_cumulative_discrete_distribution(ngram_reader.cumulative_word_frequencies, rng=data_rng) elif to_mod == 1: while word_b_new == word_b: word_b_new = data_rng.choice(relationships.indices_of_words_in_synsets) #sample_cumulative_discrete_distribution(ngram_reader.cumulative_word_frequencies, rng=data_rng) elif to_mod == 2: while rel_index_new == rel_index: # rel_index_new = data_rng.randint(N_relationships) rel_index_new = data_rng.randint(relationships.N_relationships) augmented_cost, cost = model.update_v(word_a, word_b, rel_index, word_a_new, word_b_new, rel_index_new) if not np.isfinite(cost): print 'nan detected' save_model('nan_dump.pkl.gz') import IPython IPython.embed() costs.append(cost) augmented_costs.append(augmented_cost) if i % print_freq == 0: sys.stdout.write('\r k %i: pair : %d / %d' % (model.k, i, block_size)) sys.stdout.flush() if args['semantic_blocks_to_run'] > 1: print print '%i intermediate mean %f' % (block_num, np.mean(costs[-block_size:]))
# --- semantic-similarity training pass (Python 2 script code) ---
# Trains model.update_v on word pairs sampled by corpus frequency, restricted
# to indices that have precomputed pairwise similarities, then records the
# mean/std cost into stats_for_k.
# NOTE(review): indentation reconstructed from a whitespace-mangled paste —
# the nesting of the progress/print sections is inferred; confirm against the
# original file.
this_count = 0  # NOTE(review): not used in this excerpt — possibly vestigial
augmented_costs = []
costs = []
for block_num in xrange(args['semantic_blocks_to_run']):
    for i in xrange(args['semantic_block_size']):
        # rejection-sample a frequency-weighted word index that is in the
        # similarity intersection
        train_i = -1
        while train_i not in indices_in_intersection:
            train_i = sample_cumulative_discrete_distribution(ngram_reader.cumulative_word_frequencies, rng=data_rng)
        for j in xrange(args['k_nearest']):
            # rejection-sample a partner word the same way
            train_j = -1
            while train_j not in indices_in_intersection:
                train_j = sample_cumulative_discrete_distribution(ngram_reader.cumulative_word_frequencies, rng=data_rng)
            # -inf marks "no similarity available" for this pair; skip it
            if word_similarity.word_pairwise_sims[train_i, train_j] == -np.inf:
                continue
            sim = word_similarity.word_pairwise_sims[train_i, train_j]
            augmented_cost, cost = model.update_v(train_i, train_j, sim)
            augmented_costs.append(augmented_cost)
            costs.append(cost)
        if i % print_freq == 0:
            # \r rewrites the same console line for a compact progress display
            sys.stdout.write('\r k %i: pair : %d / %d' % (model.k, i, args['semantic_block_size']))
            sys.stdout.flush()
    if args['semantic_blocks_to_run'] > 1:
        # newline after the \r progress line, then the running mean over the
        # most recent block's costs
        print
        print '%i intermediate mean %f' % (block_num, np.mean(costs[-args['semantic_block_size']:]))
print
# summary statistics over every pair trained in this pass
stats_for_k['semantic_mean'] = np.mean(costs)
stats_for_k['semantic_std'] = np.std(costs)
print 'semantic mean cost \t%f' % stats_for_k['semantic_mean']
print 'semantic std cost \t%f' % stats_for_k['semantic_std']
# NOTE(review): reflowed duplicate of the semantic-similarity sampling loop
# above; the excerpt begins mid-iteration (the train_i = -1 initialization is
# outside this view) and the nesting shown is inferred from a
# whitespace-mangled paste — confirm against the original file.
while train_i not in indices_in_intersection:
    train_i = sample_cumulative_discrete_distribution(
        ngram_reader.cumulative_word_frequencies, rng=data_rng)
for j in xrange(args['k_nearest']):
    # rejection-sample a partner word with a known similarity entry
    train_j = -1
    while train_j not in indices_in_intersection:
        train_j = sample_cumulative_discrete_distribution(
            ngram_reader.cumulative_word_frequencies, rng=data_rng)
    # -inf marks "no similarity available" for this pair; skip it
    if word_similarity.word_pairwise_sims[
            train_i, train_j] == -np.inf:
        continue
    sim = word_similarity.word_pairwise_sims[train_i, train_j]
    augmented_cost, cost = model.update_v(
        train_i, train_j, sim)
    augmented_costs.append(augmented_cost)
    costs.append(cost)
if i % print_freq == 0:
    # \r rewrites the same console line as a progress indicator
    sys.stdout.write(
        '\r k %i: pair : %d / %d' %
        (model.k, i, args['semantic_block_size']))
    sys.stdout.flush()
if args['semantic_blocks_to_run'] > 1:
    # newline after the \r progress line, then the mean over the last block
    print
    print '%i intermediate mean %f' % (
        block_num, np.mean(
            costs[-args['semantic_block_size']:]))
print
word_a_new = data_rng.choice( relationships.indices_of_words_in_synsets ) #sample_cumulative_discrete_distribution(ngram_reader.cumulative_word_frequencies, rng=data_rng) elif to_mod == 1: while word_b_new == word_b: word_b_new = data_rng.choice( relationships.indices_of_words_in_synsets ) #sample_cumulative_discrete_distribution(ngram_reader.cumulative_word_frequencies, rng=data_rng) elif to_mod == 2: while rel_index_new == rel_index: # rel_index_new = data_rng.randint(N_relationships) rel_index_new = data_rng.randint( relationships.N_relationships) augmented_cost, cost = model.update_v( word_a, word_b, rel_index, word_a_new, word_b_new, rel_index_new) if not np.isfinite(cost): print 'nan detected' save_model('nan_dump.pkl.gz') import IPython IPython.embed() costs.append(cost) augmented_costs.append(augmented_cost) if i % print_freq == 0: sys.stdout.write('\r k %i: pair : %d / %d' % (model.k, i, block_size)) sys.stdout.flush() if args['semantic_blocks_to_run'] > 1: