        with open(os.path.join(args['base_dir'], 'params.json'), 'w') as f:
            json.dump(args, f)

    pprint(args)

    # N_relationships = len(relationships.relationships)
    replacement_column_index = args['sequence_length'] // 2  # index of the middle (replaced) word in the n-gram window

    rng = np.random.RandomState(args['random_seed'])
    data_rng = np.random.RandomState(args['random_seed'])
    validation_rng = np.random.RandomState(args['random_seed'] + 1)
    random.seed(args['random_seed'])


    # set up syntactic
    ngram_reader = NgramReader(args['ngram_filename'],
                               vocab_size=args['ngram_vocab_size'],
                               train_proportion=args['train_proportion'],
                               test_proportion=args['test_proportion'])
    testing_block = ngram_reader.testing_block()
    print 'corpus contains %i ngrams' % (ngram_reader.number_of_ngrams)

    # set up semantic
    # num_semantic_training = int(relationships.N * 0.98)
    # semantic_training = relationships.data[:num_semantic_training]
    # semantic_testing = relationships.data[num_semantic_training:]

    relationship_path = join(base_dir, 'relationships.pkl.gz')
    vocabulary_path = join(base_dir, 'vocabulary.pkl.gz')
    try:
        with gzip.open(relationship_path) as f:
            relationships = cPickle.load(f)
        print 'loaded relationships from %s' % relationship_path
    except IOError:
        relationships = None  # the fallback that rebuilds relationships is cut off in this excerpt

    # The lines below appear to belong to the plotting helper (do_plot) called in the
    # __main__ block that follows (a hedged sketch of do_plot is given after that block):
    # inside a loop over the plotted words, each 2-D embedding coordinate is labelled
    # with its ASCII-sanitized word.
    #     try:
    #         text = words[i].encode('ascii', 'ignore')
    #         ax.text(X[i, 0], X[i, 1], text)
    #     except:
    #         pass


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('model')
    parser.add_argument('--start', type=int, default=0)
    parser.add_argument('--end', type=int, default=100)
    args = parser.parse_args()

    with gzip.open(args.model, 'rb') as f:
        model = cPickle.load(f)

    E = model.get_embeddings()
    try:
        vocabulary = model.vocabulary
    except AttributeError:  # older pickled models may not carry a vocabulary attribute
        ngram_filename = DEFAULT_NGRAM_FILENAME
        from ngrams import NgramReader
        reader = NgramReader(ngram_filename, vocab_size=model.vocab_size)
        vocabulary = reader.word_array

    do_plot(E, vocabulary, start=args.start, end=args.end)
    plt.title(args.model)
    plt.show()
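
# The do_plot helper called above is not part of this excerpt. The function below is a
# hedged stand-in with the same call signature, assuming E is a (vocab_size x dim)
# embedding matrix and the second argument maps row indices to words; the PCA-via-SVD
# projection is an assumption (the original could equally use t-SNE or another method).
import numpy as np
import matplotlib.pyplot as plt

def do_plot_sketch(E, words, start=0, end=100):
    # center the selected embedding rows
    X = np.asarray(E[start:end], dtype=float)
    X = X - X.mean(axis=0)
    # plain PCA: project onto the top two right singular vectors
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    X2 = X.dot(Vt[:2].T)
    fig, ax = plt.subplots()
    ax.scatter(X2[:, 0], X2[:, 1], s=5)
    for i in xrange(X2.shape[0]):
        try:
            # label each point with its word, dropping non-ASCII characters
            text = words[start + i].encode('ascii', 'ignore')
            ax.text(X2[i, 0], X2[i, 1], text)
        except Exception:
            pass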
Example #3
        if 'simple_joint' not in args:  # backward compatibility
            args['simple_joint'] = False
        # rewrite in case we've copied the model file into this folder
        args['base_dir'] = base_dir
    else:
        model_loaded = False
        # dump the params
        with open(os.path.join(args['base_dir'], 'params.json'), 'w') as f:
            json.dump(args, f)

    pprint(args)

    replacement_column_index = args['sequence_length'] // 2  # index of the middle (replaced) word in the n-gram window

    ngram_reader = NgramReader(args['ngram_filename'],
                               vocab_size=args['vocab_size'],
                               train_proportion=args['train_proportion'],
                               test_proportion=args['test_proportion'])
    testing_block = ngram_reader.testing_block()
    vocabulary = ngram_reader.word_array
    print 'corpus contains %i ngrams' % (ngram_reader.number_of_ngrams)

    rng = np.random.RandomState(args['random_seed'])
    data_rng = np.random.RandomState(args['random_seed'])
    validation_rng = np.random.RandomState(args['random_seed'] + 1)
    random.seed(args['random_seed'])

    if not args['dont_run_semantic']:
        print 'loading semantic similarities'
        word_similarity = semantic_module.WordSimilarity(
            vocabulary,
            args['word_similarity_file'],
            memmap_filename=args['word_similarity_memmap'])
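
# semantic_module.WordSimilarity is not shown in these excerpts. Below is a standalone,
# hypothetical sketch (not the project's implementation) that only mirrors the
# constructor arguments and the word_pairwise_sims attribute used here, assuming the
# similarity file holds a square vocabulary-by-vocabulary array saved with np.save.
import numpy as np

class WordSimilaritySketch(object):
    def __init__(self, vocabulary, similarity_filename, memmap_filename=None):
        self.vocabulary = list(vocabulary)
        self.word_to_index = dict((w, i) for i, w in enumerate(self.vocabulary))
        if memmap_filename is not None:
            # memory-map the matrix so a large vocabulary doesn't have to fit in RAM
            self.word_pairwise_sims = np.load(memmap_filename, mmap_mode='r')
        else:
            self.word_pairwise_sims = np.load(similarity_filename)

    def similarity(self, word_a, word_b):
        # precomputed similarity of two words; non-finite entries mark missing pairs
        return self.word_pairwise_sims[self.word_to_index[word_a],
                                       self.word_to_index[word_b]]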
Example #4
    parser.add_argument(
        '--ngram_file',
        default='/cl/nldata/books_google_ngrams_eng/5grams_size3.hd5')
    parser.add_argument('--reduction_function',
                        default='max',
                        help='"max" or "mean"')
    args = parser.parse_args()

    if args.reduction_function == 'max':
        reduction_fn = np.max
    elif args.reduction_function == 'mean':
        reduction_fn = np.mean
    else:
        print 'unknown function %s, using np.max' % args.reduction_function
        reduction_fn = np.max
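    # For intuition (hypothetical numbers): if the synset pairs of two words score
    # [0.2, 0.9, 0.5] under the similarity function, np.max keeps 0.9 (the closest
    # sense pair) while np.mean gives roughly 0.53 (the average over all sense pairs).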
    reader = NgramReader(args.ngram_file)
    if args.wunsch_paths:
        with gzip.open(args.wunsch_paths) as f:
            paths = cPickle.load(f)
    else:
        paths = WunschPaths(wn.all_synsets())

    sim_fn = partial(scaled_lch_similarity, paths)
    similarity_matrix = make_similarity_matrix(
        reader.word_array[:args.vocab_size],
        similarity_fn=sim_fn,
        reduction_fn=reduction_fn)

    print 'writing to file %s' % args.filename
    with open(args.filename, 'wb') as f:
        np.save(f, similarity_matrix)
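
# make_similarity_matrix, scaled_lch_similarity and WunschPaths are not shown in these
# excerpts. The standalone sketch below is a hedged stand-in for the same idea, using
# NLTK's built-in (unscaled) Leacock-Chodorow similarity instead of the WunschPaths-based
# scoring; every name and detail here is an assumption, not the project's implementation.
import numpy as np
from nltk.corpus import wordnet as wn

def lch_similarity_sketch(synset_a, synset_b):
    # LCH similarity is only defined between synsets of the same part of speech
    if synset_a.pos() != synset_b.pos():
        return None
    try:
        return synset_a.lch_similarity(synset_b)
    except Exception:
        return None

def make_similarity_matrix_sketch(words, similarity_fn=lch_similarity_sketch,
                                  reduction_fn=np.max):
    # symmetric word-by-word matrix; NaN marks pairs with no comparable synsets
    n = len(words)
    matrix = np.full((n, n), np.nan)
    for i in range(n):
        for j in range(i, n):
            scores = [similarity_fn(s1, s2)
                      for s1 in wn.synsets(words[i])
                      for s2 in wn.synsets(words[j])]
            scores = [s for s in scores if s is not None]
            if scores:
                matrix[i, j] = matrix[j, i] = reduction_fn(scores)
    return matrix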
Example #5
        if 'simple_joint' not in args:  # backward compatibility
            args['simple_joint'] = False
        # rewrite in case we've copied the model file into this folder
        args['base_dir'] = base_dir
    else:
        model_loaded = False
        # dump the params
        with open(os.path.join(args['base_dir'], 'params.json'), 'w') as f:
            json.dump(args, f)

    pprint(args)


    replacement_column_index = args['sequence_length'] // 2  # index of the middle (replaced) word in the n-gram window

    ngram_reader = NgramReader(args['ngram_filename'],
                               vocab_size=args['vocab_size'],
                               train_proportion=args['train_proportion'],
                               test_proportion=args['test_proportion'])
    testing_block = ngram_reader.testing_block()
    vocabulary = ngram_reader.word_array
    print 'corpus contains %i ngrams' % (ngram_reader.number_of_ngrams)

    rng = np.random.RandomState(args['random_seed'])
    data_rng = np.random.RandomState(args['random_seed'])
    validation_rng = np.random.RandomState(args['random_seed'] + 1)
    random.seed(args['random_seed'])

    if not args['dont_run_semantic']:
        print 'loading semantic similarities'
        word_similarity = semantic_module.WordSimilarity(
            vocabulary,
            args['word_similarity_file'],
            memmap_filename=args['word_similarity_memmap'])
        print 'computing terms with semantic distance'
        indices_in_intersection = set(i for i, v in enumerate(map(compose(np.any, np.isfinite),
                                                                  word_similarity.word_pairwise_sims))
Example #6
        with open(os.path.join(args['base_dir'], 'params.json'), 'w') as f:
            json.dump(args, f)

    pprint(args)

    # N_relationships = len(relationships.relationships)
    replacement_column_index = args['sequence_length'] // 2  # index of the middle (replaced) word in the n-gram window

    rng = np.random.RandomState(args['random_seed'])
    data_rng = np.random.RandomState(args['random_seed'])
    validation_rng = np.random.RandomState(args['random_seed'] + 1)
    random.seed(args['random_seed'])

    # set up syntactic
    ngram_reader = NgramReader(args['ngram_filename'],
                               vocab_size=args['ngram_vocab_size'],
                               train_proportion=args['train_proportion'],
                               test_proportion=args['test_proportion'])
    testing_block = ngram_reader.testing_block()
    print 'corpus contains %i ngrams' % (ngram_reader.number_of_ngrams)

    # set up semantic
    # num_semantic_training = int(relationships.N * 0.98)
    # semantic_training = relationships.data[:num_semantic_training]
    # semantic_testing = relationships.data[num_semantic_training:]

    relationship_path = join(base_dir, 'relationships.pkl.gz')
    vocabulary_path = join(base_dir, 'vocabulary.pkl.gz')
    try:
        with gzip.open(relationship_path) as f:
            relationships = cPickle.load(f)
        print 'loaded relationships from %s' % relationship_path