with open(os.path.join(args['base_dir'], 'params.json'), 'w') as f:
    json.dump(args, f)
pprint(args)

# N_relationships = len(relationships.relationships)

# middle column of the n-gram window (integer division in Python 2)
replacement_column_index = args['sequence_length'] / 2

# separate seeded streams for model init, data sampling, and validation sampling
rng = np.random.RandomState(args['random_seed'])
data_rng = np.random.RandomState(args['random_seed'])
validation_rng = np.random.RandomState(args['random_seed'] + 1)
random.seed(args['random_seed'])

# set up syntactic
ngram_reader = NgramReader(args['ngram_filename'],
                           vocab_size=args['ngram_vocab_size'],
                           train_proportion=args['train_proportion'],
                           test_proportion=args['test_proportion'])
testing_block = ngram_reader.testing_block()
print 'corpus contains %i ngrams' % (ngram_reader.number_of_ngrams)

# set up semantic
# num_semantic_training = int(relationships.N * 0.98)
# semantic_training = relationships.data[:num_semantic_training]
# semantic_testing = relationships.data[num_semantic_training:]
relationship_path = join(base_dir, 'relationships.pkl.gz')
vocabulary_path = join(base_dir, 'vocabulary.pkl.gz')
try:
    # reuse cached relationships if a previous run left them in base_dir
    with gzip.open(relationship_path) as f:
        relationships = cPickle.load(f)
    print 'loaded relationships from %s' % relationship_path
except:
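# A minimal, self-contained sketch (not from the original source; toy seed) of
# why the setup above creates separate RandomState instances: identically
# seeded streams replay the same draws, and consuming from one stream never
# perturbs another, so data sampling stays decoupled from validation sampling.
import numpy as np

a = np.random.RandomState(1234)
b = np.random.RandomState(1234)
assert a.randint(100) == b.randint(100)  # same seed, same first draw

v = np.random.RandomState(1234 + 1)      # offset seed gives an independent stream
a.randint(100)                           # advancing `a` leaves `v` untouched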
        try:
            # some vocabulary entries may not be ascii-encodable; skip those
            text = words[i].encode('ascii', 'ignore')
            ax.text(X[i, 0], X[i, 1], text)
        except:
            pass

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('model')
    parser.add_argument('--start', type=int, default=0)
    parser.add_argument('--end', type=int, default=100)
    args = parser.parse_args()

    with gzip.open(args.model, 'rb') as f:
        model = cPickle.load(f)
    E = model.get_embeddings()
    try:
        vocabulary = model.vocabulary
    except AttributeError:
        # older model pickles don't store the vocabulary; rebuild it from the n-gram file
        ngram_filename = DEFAULT_NGRAM_FILENAME
        from ngrams import NgramReader
        reader = NgramReader(ngram_filename, vocab_size=model.vocab_size)
        vocabulary = reader.word_array
    do_plot(E, vocabulary, start=args.start, end=args.end)
    plt.title(args.model)
    plt.show()
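# Hedged usage sketch: the script filename below is hypothetical, but the
# positional `model` argument and the --start/--end flags come straight from
# the parser above. The call plots the embeddings of vocabulary words
# start..end from a pickled model:
#
#   python plot_embeddings.py /path/to/model.pkl.gz --start 0 --end 200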
parser.add_argument('--ngram_file',
                    default='/cl/nldata/books_google_ngrams_eng/5grams_size3.hd5')
parser.add_argument('--reduction_function', default='max',
                    help='"max" or "mean"')
args = parser.parse_args()

# reduce the set of pairwise similarity scores for a word pair to a single value
if args.reduction_function == 'max':
    reduction_fn = np.max
elif args.reduction_function == 'mean':
    reduction_fn = np.mean
else:
    print 'unknown function %s, using np.max' % args.reduction_function
    reduction_fn = np.max

reader = NgramReader(args.ngram_file)

# load cached synset paths if given, else compute them over all of WordNet
if args.wunsch_paths:
    with gzip.open(args.wunsch_paths) as f:
        paths = cPickle.load(f)
else:
    paths = WunschPaths(wn.all_synsets())
sim_fn = partial(scaled_lch_similarity, paths)

similarity_matrix = make_similarity_matrix(reader.word_array[:args.vocab_size],
                                           similarity_fn=sim_fn,
                                           reduction_fn=reduction_fn)

print 'writing to file %s' % args.filename
with open(args.filename, 'w') as f:
    # save through the open handle so numpy doesn't append .npy to the name
    np.save(f, similarity_matrix)
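# Hedged usage sketch: the script name is hypothetical, and --filename,
# --vocab_size, and --wunsch_paths are assumed to be declared in the elided
# part of the parser, since the code above reads args.filename,
# args.vocab_size, and args.wunsch_paths:
#
#   python make_similarity_matrix.py --ngram_file 5grams_size3.hd5 \
#       --reduction_function mean --vocab_size 50000 --filename sims.npy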
    if 'simple_joint' not in args:
        # backward compatibility
        args['simple_joint'] = False
    # rewrite in case we've copied the model file into this folder
    args['base_dir'] = base_dir
else:
    model_loaded = False

# dump the params
with open(os.path.join(args['base_dir'], 'params.json'), 'w') as f:
    json.dump(args, f)
pprint(args)

# middle column of the n-gram window (integer division in Python 2)
replacement_column_index = args['sequence_length'] / 2

ngram_reader = NgramReader(args['ngram_filename'],
                           vocab_size=args['vocab_size'],
                           train_proportion=args['train_proportion'],
                           test_proportion=args['test_proportion'])
testing_block = ngram_reader.testing_block()
vocabulary = ngram_reader.word_array
print 'corpus contains %i ngrams' % (ngram_reader.number_of_ngrams)

rng = np.random.RandomState(args['random_seed'])
data_rng = np.random.RandomState(args['random_seed'])
validation_rng = np.random.RandomState(args['random_seed'] + 1)
random.seed(args['random_seed'])

if not args['dont_run_semantic']:
    print 'loading semantic similarities'
    word_similarity = semantic_module.WordSimilarity(vocabulary,
                                                     args['word_similarity_file'],
                                                     memmap_filename=args['word_similarity_memmap'])
    print 'computing terms with semantic distance'
    # keep only vocabulary indices with at least one finite pairwise similarity
    indices_in_intersection = set(i for i, v in enumerate(map(compose(np.any, np.isfinite),
                                                              word_similarity.word_pairwise_sims))
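# A minimal sketch (toy data; this local `compose` stands in for whatever
# composition helper the original module imports) of the per-row test above:
# compose(np.any, np.isfinite) asks whether a word has at least one finite
# pairwise similarity, i.e. whether it intersects the similarity resource.
import numpy as np

def compose(f, g):
    return lambda x: f(g(x))

has_finite_sim = compose(np.any, np.isfinite)
print has_finite_sim(np.array([0.31, np.nan, -np.inf]))  # True: 0.31 is finite
print has_finite_sim(np.array([np.nan, np.inf]))         # False: no finite entries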