Example #1
def initialize_data_and_model(config, train_phase, layout='dict'):
    c = config
    fuel_path = fuel.config.data_path[0]
    vocab_main = None
    vocab_keys = None
    if not c['encoder']:
        if not c['vocab_keys_path']:
            raise ValueError(
                'Error: Should specify vocab_keys_path when no encoder')
        vocab_keys = Vocabulary(
            os.path.join(fuel.config.data_path[0], c['vocab_keys_path']))

    if c['vocab_path']:
        vocab_main = Vocabulary(
            os.path.join(fuel.config.data_path[0], c['vocab_path']))
    # TODO: change name of class LanguageModellingData... very ill-named.
    data = LanguageModellingData(c['data_path'], layout, vocab=vocab_main)

    vocab_main = data.vocab

    model = Seq2Seq(c['emb_dim'],
                    c['dim'],
                    c['num_input_words'],
                    c['num_output_words'],
                    data.vocab,
                    proximity_coef=c['proximity_coef'],
                    proximity_distance=c['proximity_distance'],
                    encoder=c['encoder'],
                    decoder=c['decoder'],
                    shared_rnn=c['shared_rnn'],
                    translate_layer=c['translate_layer'],
                    word_dropout=c['word_dropout'],
                    tied_in_out=c['tied_in_out'],
                    vocab_keys=vocab_keys,
                    reconstruction_coef=c['reconstruction_coef'],
                    provide_targets=c['provide_targets'],
                    weights_init=Uniform(width=0.1),
                    biases_init=Constant(0.))

    model.initialize()

    if c['embedding_path'] and ((train_phase or c['freeze_pretrained'])
                                or c['provide_targets']):
        if c['provide_targets'] and c['freeze_pretrained']:
            raise ValueError("Can't provide_targets and use freeze_pretrained."
                             "In that case, simply use freeze_pretrained")

        # if encoder embeddings are frozen, then we should load them
        # as they're not saved with the models parameters
        emb_full_path = os.path.join(fuel_path, c['embedding_path'])
        embedding_matrix = numpy.load(emb_full_path)
        if c['provide_targets']:
            model.set_def_embeddings(embedding_matrix, 'target')
            logger.debug("Pre-trained targets loaded")
        else:
            model.set_def_embeddings(embedding_matrix, 'main')
            logger.debug("Pre-trained encoder embeddings loaded")

    return data, model
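A hypothetical configuration for the helper above, for illustration only: the key names mirror the accesses made in the code, but every value is a placeholder and the paths are assumed to be relative to fuel.config.data_path[0].

config = {
    'data_path': 'lm_data',               # placeholder path
    'vocab_path': 'lm_data/vocab.txt',    # placeholder path
    'vocab_keys_path': '',                # only required when no encoder is used
    'encoder': 'lstm',                    # placeholder value
    'decoder': 'lstm',                    # placeholder value
    'emb_dim': 300,
    'dim': 512,
    'num_input_words': 10000,
    'num_output_words': 10000,
    'proximity_coef': 0.0,
    'proximity_distance': 'l2',           # placeholder value
    'shared_rnn': False,
    'translate_layer': None,
    'word_dropout': 0.0,
    'tied_in_out': False,
    'reconstruction_coef': 0.0,
    'provide_targets': False,
    'embedding_path': '',                 # set to a .npy file to load pre-trained embeddings
    'freeze_pretrained': False,
}
data, model = initialize_data_and_model(config, train_phase=True)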
Example #2
def initialize_data_and_model(config):
    c = config
    vocab = None
    if c['vocab_path']:
        vocab = Vocabulary(
            os.path.join(fuel.config.data_path[0], c['vocab_path']))
    data = ExtractiveQAData(path=c['data_path'],
                            vocab=vocab,
                            layout=c['layout'])
    # TODO: fix me, I'm so ugly (I mean the access of a private attribute)
    if c['dict_path']:
        dict_vocab = data.vocab
        if c['dict_vocab_path']:
            dict_vocab = Vocabulary(
                os.path.join(fuel.config.data_path[0], c['dict_vocab_path']))
        data._retrieval = Retrieval(
            data.vocab,
            Dictionary(os.path.join(fuel.config.data_path[0], c['dict_path'])),
            max_def_length=c['max_def_length'],
            with_too_long_defs=c['with_too_long_defs'],
            max_def_per_word=c['max_def_per_word'],
            with_too_many_defs=c['with_too_many_defs'],
            # This should fix --exclude_top_k
            vocab_def=dict_vocab)
    logger.debug("Data loaded")
    qam = ExtractiveQAModel(
        c['dim'],
        c['emb_dim'],
        c['readout_dims'],
        c['num_input_words'],
        c['def_num_input_words'],
        data.vocab,
        coattention=c['coattention'],
        use_definitions=bool(c['dict_path']),
        def_word_gating=c['def_word_gating'],
        compose_type=c['compose_type'],
        reuse_word_embeddings=c['reuse_word_embeddings'],
        bidir_encoder=c['bidir_encoder'],
        random_unk=c['random_unk'],
        def_reader=c['def_reader'],
        weights_init=(GlorotUniform() if not c['init_width'] else Uniform(
            width=c['init_width'])),
        recurrent_weights_init=(GlorotUniform() if not c['rec_init_width'] else
                                Uniform(width=c['rec_init_width'])),
        biases_init=Constant(0.))
    qam.initialize()
    logger.debug("Model created")
    if c['embedding_path']:
        qam.set_embeddings(
            numpy.load(
                os.path.join(fuel.config.data_path[0], c['embedding_path'])))
        logger.debug("Embeddings loaded")
    return data, qam
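As above, a hypothetical configuration sketch for the QA variant; the key names follow the accesses in this function, the values are placeholders (the 'compose_type' and 'def_reader' values reuse strings that appear in the tests below), and an empty 'dict_path' disables definitions.

config = {
    'data_path': 'squad',                 # placeholder path
    'layout': 'squad',                    # placeholder value
    'vocab_path': '',
    'dict_path': '',                      # empty: train without a dictionary
    'dict_vocab_path': '',
    'max_def_length': 30,
    'with_too_long_defs': 'drop',
    'max_def_per_word': 10,
    'with_too_many_defs': 'random',       # placeholder value
    'dim': 200,
    'emb_dim': 300,
    'readout_dims': [200],
    'num_input_words': 10000,
    'def_num_input_words': 10000,
    'coattention': True,
    'def_word_gating': 'none',            # placeholder value
    'compose_type': 'sum',
    'reuse_word_embeddings': False,
    'bidir_encoder': True,
    'random_unk': False,
    'def_reader': 'LSTMReadDefinitions',
    'init_width': 0,
    'rec_init_width': 0,
    'embedding_path': '',
}
data, qam = initialize_data_and_model(config)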
Example #3
def test_squad_to_h5py_dataset():
    corenlp = None
    try:
        port = get_free_port()
        corenlp = start_corenlp(port)

        test_dir = tempfile.mkdtemp()
        json_path = os.path.join(test_dir, 'data.json')
        h5_path = os.path.join(test_dir, 'data.h5')
        with open(json_path, 'w') as json_file:
            print(TEST_SQUAD_RAW_DATA, file=json_file)
        squad_to_h5py_dataset(json_path, h5_path,
                              "http://localhost:{}".format(port))
        with h5py.File(h5_path, 'r') as h5_file:
            vocab = Vocabulary.build(h5_file['text'], top_k=100)
        add_words_ids_to_squad(h5_path, vocab)

        dataset = SQuADDataset(h5_path, ('all', ))
        stream = dataset.get_example_stream()
        stream = dataset.apply_default_transformers(stream)
        example = next(stream.get_epoch_iterator(as_dict=True))
        answer_span = slice(example['answer_begins'][0],
                            example['answer_ends'][0])
        assert example['questions'].tolist() == map(vocab.word_to_id, [
            u'To', u'whom', u'did', u'the', u'Virgin', u'Mary', u'allegedly',
            u'appear', u'in', u'1858', u'in', u'Lourdes', u'France', u'?'
        ])
        assert example['contexts'][answer_span].tolist() == map(
            vocab.word_to_id, [u'Saint', u'Bernadette', u'Soubirous'])
    finally:
        if corenlp and corenlp.returncode is None:
            corenlp.kill()
Example #4
def test_language_model():
    with temporary_content_path(TEST_VOCAB) as path:
        vocab = Vocabulary(path)
    with temporary_content_path(TEST_DICT_JSON, suffix=".json") as path:
        dict_ = Dictionary(path)

    floatX = theano.config.floatX

    def make_data_and_mask(data):
        data = [[str2vec(s, 3) for s in row] for row in data]
        data = np.array(data)
        mask = np.ones((data.shape[0], data.shape[1]),
                        dtype=floatX)
        return data, mask
    words_val, mask_val = make_data_and_mask([['p', 'e', 'a'], ['a', 'e', 'p']])
    mask_val[1, 2] = 0
    print "data:"
    print words_val
    print "mask:"
    print mask_val
    mask_def_emb_val = np.asarray([[0, 1], [0,0]])

    # With the dictionary
    retrieval = Retrieval(vocab, dict_, exclude_top_k=7)
    lm = LanguageModel(7, 5, vocab.size(), vocab.size(),
                       vocab=vocab, retrieval=retrieval,
                       compose_type='transform_and_sum',
                       weights_init=Uniform(width=0.1),
                       biases_init=Uniform(width=0.1))
    lm.initialize()
    words = tensor.ltensor3('words')
    mask = tensor.matrix('mask', dtype=floatX)
    costs = lm.apply(words, mask)
    cg = ComputationGraph(costs)
    def_mean, = VariableFilter(name='_dict_word_embeddings')(cg)
    def_mean_f = theano.function([words], def_mean)

    perplexities = VariableFilter(name_regex='perplexity.*')(cg)
    mask_def_emb, = VariableFilter(name='mask_def_emb')(cg)
    perplexities_f = theano.function([words, mask], perplexities)
    perplexities_v = perplexities_f(words_val, mask_val)
    mask_emb_f = theano.function([words, mask], mask_def_emb)
    mask_def_v = mask_emb_f(words_val, mask_val)
    for v, p in zip(perplexities_v, perplexities):
        print p.name, ":", v
    assert np.allclose(mask_def_v, mask_def_emb_val)
Example #5
def test_vocab_op():
    with temporary_content_path(TEST_VOCAB) as path:
        vocab = Vocabulary(path)
    op = WordToIdOp(vocab)

    input_ = tensor.as_tensor_variable([ord('d'), ord(' '), ord('c'), 0, 0])
    assert op(input_).eval() == 0

    input_ = tensor.as_tensor_variable([ord('a')])
    assert op(input_).eval() == 5

    input_ = tensor.as_tensor_variable([[ord('a'), 0], [ord('b'), 0]])
    assert list(op(input_).eval()) == [5, 6]
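For reference, the integer literals above are character codes padded with zeros to a fixed length, the same convention str2vec follows in the other tests. A plausible stand-in, for illustration only and not necessarily the library's implementation:

def encode_word(word, length):
    # pad the character codes of `word` with zeros up to `length`
    codes = [ord(ch) for ch in word][:length]
    return codes + [0] * (length - len(codes))

assert encode_word('d c', 5) == [ord('d'), ord(' '), ord('c'), 0, 0]
assert encode_word('a', 1) == [ord('a')]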
Example #6
def test_retrieval():
    with temporary_content_path(TEST_VOCAB, ".txt") as path:
        vocab = Vocabulary(path)
    with temporary_content_path(TEST_DICT_JSON, ".json") as path:
        dict_ = Dictionary(path)

    # check a super simple case
    batch = [['a']]
    defs, def_map = Retrieval(vocab, dict_).retrieve(batch)
    assert defs == [[3, 6, 7, 4], [3, 8, 9, 4]]
    assert def_map == [(0, 0, 0), (0, 0, 1)]

    # check that vectors are handled correctly
    batch = numpy.array([ord('d'), ord(' '), ord('c'), 0, 0])[None, None, :]
    defs, def_map = Retrieval(vocab, dict_).retrieve(batch)
    assert defs == [[3, 5, 6, 4]]
    assert def_map == [(0, 0, 0)]

    # check a complex case
    batch = [['a', 'b', 'b'], ['d c', 'a', 'b']]
    defs, def_map = Retrieval(vocab, dict_).retrieve(batch)
    assert defs == [[3, 6, 7, 4],
                    [3, 8, 9, 4],
                    [3, 9, 8, 4],
                    [3, 5, 6, 4]]
    assert def_map == [(0, 0, 0), (0, 0, 1),
                       (0, 1, 2),
                       (0, 2, 2),
                       (1, 0, 3),
                       (1, 1, 0), (1, 1, 1),
                       (1, 2, 2)]

    # check a complex case with exclude top k
    batch = [['a', 'b', 'c', 'd'], ['a', 'e', 'b']]
    exclude_top_k = 7 # should exclude 'a', 'b', 'c', 'd' and only define 'e'
    defs, def_map = Retrieval(vocab, dict_, exclude_top_k=exclude_top_k).retrieve(batch)
    assert defs == [[3, 6, 7, 8, 4]]
    assert def_map == [(1, 1, 0)]

    # check the op
    retrieval_op = RetrievalOp(Retrieval(vocab, dict_))
    batch = tensor.as_tensor_variable(
        [[[ord('d'), ord(' '), ord('c'), 0, 0],
          [ord('e'), 0, 0, 0, 0]]])
    defs_var, mask_var,  def_map_var = retrieval_op(batch)
    assert defs_var.eval().tolist() == [[3, 5, 6, 4, 0],
                                        [3, 6, 7, 8, 4]]
    assert_allclose(mask_var.eval(), [[1, 1, 1, 1, 0], [1, 1, 1, 1, 1]])
    assert def_map_var.eval().tolist() == [[0, 0, 0], [0, 1, 1]]
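The def_map triples asserted above read as (batch row, position within that row, index into defs). A tiny, purely illustrative decoder that groups them per word position:

def group_defs(def_map):
    # map each (row, position) pair to the list of definition indices it uses
    grouped = {}
    for row, pos, def_idx in def_map:
        grouped.setdefault((row, pos), []).append(def_idx)
    return grouped

# For the "complex case" batch [['a', 'b', 'b'], ['d c', 'a', 'b']] this yields
# {(0, 0): [0, 1], (0, 1): [2], (0, 2): [2], (1, 0): [3], (1, 1): [0, 1], (1, 2): [2]}
print(group_defs([(0, 0, 0), (0, 0, 1), (0, 1, 2), (0, 2, 2),
                  (1, 0, 3), (1, 1, 0), (1, 1, 1), (1, 2, 2)]))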
Example #7
# NB: not named test_*, so the test runner does not collect it.
def t_e_s_t_language_model():
    V = 50
    gen = FakeTextGenerator(V, 6, 6, 1.0, 0.2)
    n_sentences = 3
    len_sentences = 7
    data = [gen.sample_sentence(len_sentences) for i in range(n_sentences)]
    vocab_list = '\n'.join(list(set(gen.vocabulary)))
    dict_json = json.dumps(gen.dictionary)
    print "JSON dict:", dict_json

    with temporary_content_path(vocab_list) as path:
        vocab = Vocabulary(path)
    with temporary_content_path(dict_json) as path:
        dict_ = Dictionary(path)

    data = [[str2vec(s, gen.tok_len) for s in row] for row in data]
    data = numpy.array(data)
    print "Data:", data

    # With the dictionary
    lm = LanguageModel(vocab=vocab,
                       dict_=dict_,
                       dim=10,
                       weights_init=Uniform(width=0.1),
                       biases_init=Uniform(width=0.1))
    lm.initialize()
    costs = lm.apply(tensor.as_tensor_variable(data),
                     numpy.ones((data.shape[0], data.shape[1])))
    cg = ComputationGraph(costs)
    def_spans, = VariableFilter(name='def_spans')(cg)
    f = theano.function([], [costs, def_spans])
    costs_value, def_spans_value = f()
    assert def_spans_value.tolist() == [[0, 2], [2, 4], [4, 5], [5, 7]]

    # Without the dictionary
    lm2 = LanguageModel(vocab=vocab,
                        dim=10,
                        weights_init=Uniform(width=0.1),
                        biases_init=Uniform(width=0.1))
    costs2 = lm2.apply(tensor.as_tensor_variable(data),
                       numpy.ones((data.shape[0], data.shape[1])))
    costs2.eval()
Example #8
def test_extractive_qa_model():
    with temporary_content_path(TEST_VOCAB) as path:
        vocab = Vocabulary(path)
    with temporary_content_path(TEST_DICT_JSON) as path:
        dict_ = Dictionary(path)

    def make_data_and_mask(data):
        data = [[vocab.word_to_id(s) for s in row] for row in data]
        data = numpy.array(data)
        mask = numpy.ones((data.shape[0], data.shape[1]),
                          dtype=theano.config.floatX)
        return data, mask

    # create some dummy data
    contexts, context_mask = make_data_and_mask([['a', 'a', 'a', 'b'],
                                                 ['b', 'a', 'b', 'a'],
                                                 ['a', 'b', 'b', 'b']])
    questions, question_mask = make_data_and_mask([['a', 'a'], ['b', 'a'],
                                                   ['a', 'b']])
    answer_begins = [0, 0, 1]
    answer_ends = [1, 2, 2]

    for coattention in [False, True]:
        qam = ExtractiveQAModel(vocab=vocab,
                                dim=10,
                                emb_dim=10,
                                num_input_words=10,
                                compose_type='sum',
                                use_definitions=False,
                                reuse_word_embeddings=False,
                                def_reader='LSTMReadDefinitions',
                                coattention=coattention,
                                weights_init=Uniform(width=0.1),
                                biases_init=Uniform(width=0.1))
        qam.initialize()

        costs = qam.apply(tensor.as_tensor_variable(contexts), context_mask,
                          tensor.as_tensor_variable(questions), question_mask,
                          tensor.as_tensor_variable(answer_begins),
                          tensor.as_tensor_variable(answer_ends))
        assert costs.eval().shape == (3, )
Example #9
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Builds a vocabulary")
    parser.add_argument("--top-k",
                        type=int,
                        help="Top most frequent words to leave")
    parser.add_argument("--keys-only",
                        action='store_true',
                        help="Build vocab of all keys")
    parser.add_argument("--with-keys",
                        action='store_true',
                        help="Count keys and words in definitions")
    parser.add_argument("dictionary", help="Input dictionary")
    parser.add_argument("vocabulary", help="Output vocabulary")
    args = parser.parse_args()

    text = []
    if args.dictionary.endswith('.json'):
        text = collections.defaultdict(int)
    for f_name in args.dictionary.split(","):
        logging.info("Processing " + f_name)
        assert (f_name.endswith('.json'))
        logging.info(
            "Will build the vocabulary from definitions in a dictionary")
        dict_ = json.load(open(f_name, "r"))
        for word, list_defs in dict_.items():
            if args.keys_only or args.with_keys:
                text[word] += 1
            if not args.keys_only:
                for def_ in list_defs:
                    for def_word in def_:
                        text[def_word] += 1

        logging.info("{} words".format(len(text)))

    vocab = Vocabulary.build(text, args.top_k)
    vocab.save(args.vocabulary)
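The JSON dictionaries consumed here map a word to a list of definitions, each of which is a list of tokens; that is exactly what the nested loops above iterate over. A self-contained sketch of that shape and of the counting step, with made-up entries:

import collections

dict_ = {
    "cat": [["a", "small", "feline"], ["a", "pet"]],
    "dog": [["a", "loyal", "animal"]],
}
text = collections.defaultdict(int)
for word, list_defs in dict_.items():
    for def_ in list_defs:
        for def_word in def_:
            text[def_word] += 1
# text now maps every definition token to its count and could be passed to
# Vocabulary.build(text, top_k) as done above.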
Example #10
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Digitizes text and add a vocab")
    parser.add_argument("vocab", help="Vocabulary")
    parser.add_argument("--type",
                        choices=("squad", "snli"),
                        default='squad',
                        help="What kind of data should be converted")
    parser.add_argument("h5", help="Destination")
    args = parser.parse_args()

    vocab = Vocabulary(args.vocab)

    if args.type == 'squad':
        add_words_ids_to_squad(args.h5, vocab)
    elif args.type == 'snli':
        add_word_ids_to_snli(args.h5, vocab)
    else:
        raise NotImplementedError()
Example #11
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("List undefined tokens")
    parser.add_argument("vocabulary", help="Input vocabulary")
    parser.add_argument("dictionary", help="Input dictionary")
    args = parser.parse_args()

    undefined_tokens_and_freqs = []
    vocab = Vocabulary(args.vocabulary)
    with open(args.dictionary) as f:
        dictionary = json.load(f)
    for w, c in zip(vocab.words, vocab.frequencies):
        if w not in dictionary:
            undefined_tokens_and_freqs.append((w, c))

    undefined_tokens_and_freqs = sorted(undefined_tokens_and_freqs,
                                        key=lambda x: x[1],
                                        reverse=True)
    for w, c in undefined_tokens_and_freqs:
        print(w)
Example #12
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Converts GLOVE embeddings to a numpy array")
    parser.add_argument("txt", help="GLOVE data in txt format")
    parser.add_argument("npy", help="Destination for npy format")
    parser.add_argument("vocab_out", help="output vocabulary")
    parser.add_argument("--vocab",
                        default="",
                        help="Performs subsetting based on passed vocab")

    # OOV handling
    parser.add_argument("--try-lowercase",
                        action="store_true",
                        help="Try lowercase")

    args = parser.parse_args()

    if args.vocab == "":
        raise NotImplementedError("Not implemented")
        embeddings = []
        dim = None
        with open(args.txt) as src:
            for i, line in enumerate(src):
                tokens = line.strip().split()
                features = map(float, tokens[1:])
                dim = len(features)
                embeddings.append(features)
                if i and i % 100000 == 0:
                    print i
        embeddings = [[0.] * dim] * len(
            Vocabulary.SPECIAL_TOKEN_MAP) + embeddings
        np.save(args.npy, embeddings)
    else:
        vocab = Vocabulary(args.vocab)

        print('Computing GloVe')

        # Loading
        embeddings_index = {}
        f = open(args.txt)

        print('Reading GloVe file')
        for line in f:
            values = line.split(' ')
            word = values[0]
            dim = len(values[1:])
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

        f.close()

        # Embedding matrix: larger than necessary
        f_out = open(args.vocab_out, 'w')
        n_specials = len(Vocabulary.SPECIAL_TOKEN_MAP.values())

        embedding_matrix = np.zeros((vocab.size() + n_specials, dim))
        for special_token in Vocabulary.SPECIAL_TOKEN_MAP.values():
            line = '<' + special_token + '>' + " 0\n"
            f_out.write(line.encode('utf-8'))

        i = n_specials
        #i = 0
        for word, count in zip(vocab.words, vocab.frequencies):
            embedding_vector = embeddings_index.get(word)
            if args.try_lowercase and not isinstance(embedding_vector,
                                                     np.ndarray):
                embedding_vector = embeddings_index.get(word.lower())
            in_glove = embedding_vector is not None
            last_comp = None
            if in_glove:
                last_comp = embedding_vector[-1]
            #print "i: {}, word {}, count {}, in_glove {}, last {}".format(i, word, count, in_glove, last_comp)
            if in_glove:
                try:
                    embedding_matrix[i] = embedding_vector
                except:
                    print "error idx", i
                # else, null vector
                #print "writing:", line, i
                line = word + " " + str(count) + "\n"
                f_out.write(line.encode('utf-8'))
                i += 1
            if i and i % 10000 == 0:
                print "i:", i
        f_out.close()
        np.save(args.npy, embedding_matrix[:i])
Example #13
        kwargs_emb = {"normalize": args.normalize,
                      "lowercase": args.lowercase}
    else:
        kwargs_emb = {"dim": 300,
                      "vocab_size": args.vocab_size}
    emb = load_embedding(args.emb_filename, format=args.emb_format,
                         load_kwargs=kwargs_emb, lowercase_if_OOV=False, 
                         lemmatize_if_OOV=False, normalize=False)


    model_name = args.emb_filename.split('/')[-2]
    # TODO: need to feed dim and vocab_size? or useless?
  
    vocab_defs, dict_, test_dict = None, None, None
    if is_dict_embedding:
        vocab_defs = Vocabulary(vocab_defs_fname)
        fname_dict = os.path.join(args.root_dicts, "all.json")
        fname_test_dict = os.path.join(args.root_dicts, "test.json")
        dict_ = load_dict(fname_dict)
        test_dict = load_dict(fname_test_dict)

    dirname = os.path.join('results/figures/', model_name)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    diff_ranks = []
    for name, data in datasets:
        print "dataset:", name
        print_coverage(data, emb)
        print ""
        rank_model, rank_truth = compute_ranks(data, emb)
Example #14
def initialize_data_and_model(config):
    c = config
    fuel_path = fuel.config.data_path[0]
    vocab_main = None
    if c['vocab_path']:
        vocab_main = Vocabulary(
            os.path.join(fuel.config.data_path[0], c['vocab_path']))
    data = LanguageModellingData(c['data_path'], c['layout'], vocab=vocab_main)

    vocab_main = data.vocab

    retrieval = None
    if c['dict_path'] and not c['embedding_path']:
        dict_full_path = os.path.join(fuel_path, c['dict_path'])
        dict_ = Dictionary(dict_full_path)
        logger.debug("Loaded dictionary with {} entries".format(
            dict_.num_entries()))
        vocab_def = data.vocab
        if c['dict_vocab_path']:
            if not c['standalone_def_lookup']:
                raise ValueError(
                    "Standalone def lookup mandatory with separate vocabs")
            vocab_def = Vocabulary(
                os.path.join(fuel.config.data_path[0], c['dict_vocab_path']))

        retrieval = Retrieval(vocab_main,
                              dict_,
                              c['max_def_length'],
                              with_too_long_defs='drop',
                              exclude_top_k=c['exclude_top_k'],
                              vocab_def=vocab_def,
                              max_def_per_word=c['max_def_per_word'])
    elif c['embedding_path']:
        assert (c['dict_path'])
        emb_full_path = os.path.join(fuel_path, c['embedding_path'])
        embedding_matrix = numpy.load(emb_full_path)
        dict_full_path = os.path.join(fuel_path, c['dict_path'])
        dict_ = Dictionary(dict_full_path)  # should be key=value=word
        if not c['standalone_def_lookup']:
            raise ValueError("Standalone def lookup mandatory")

        vocab_def = data.vocab
        if c['dict_vocab_path']:
            vocab_def = Vocabulary(
                os.path.join(fuel.config.data_path[0], c['dict_vocab_path']))

        retrieval = Retrieval(data.vocab,
                              dict_,
                              max_def_length=1,
                              with_too_long_defs='drop',
                              exclude_top_k=c['exclude_top_k'],
                              vocab_def=vocab_def,
                              max_def_per_word=1,
                              add_bod_eod=False)

    lm = LanguageModel(c['emb_dim'],
                       c['emb_def_dim'],
                       c['dim'],
                       c['num_input_words'],
                       c['def_num_input_words'],
                       c['num_output_words'],
                       data.vocab,
                       retrieval,
                       c['def_reader'],
                       c['standalone_def_lookup'],
                       c['standalone_def_rnn'],
                       c['disregard_word_embeddings'],
                       c['compose_type'],
                       very_rare_threshold=c['very_rare_threshold'],
                       cache_size=c['cache_size'],
                       weights_init=Uniform(width=0.1),
                       biases_init=Constant(0.))
    lm.initialize()

    if c['embedding_path']:
        lm.set_def_embeddings(embedding_matrix)
        logger.debug("Embeddings loaded")

    return (data, lm, retrieval)
Example #15
def main():
    parser = argparse.ArgumentParser(
        "Generate synthetic data and outputs in files")
    parser.add_argument("path",
                        type=str,
                        help="Top most frequent words to leave")
    parser.add_argument("n_primes", type=int, help="# of primes")
    parser.add_argument("n_non_primes", type=int, help="# of non-primes")
    parser.add_argument("features_size", type=int, help="Features size")
    parser.add_argument("markov_order", type=int, help="Markov order")
    parser.add_argument("n_sentences", type=int, help="# sentences")
    parser.add_argument("pc_train", type=float, help="% train sentences")
    parser.add_argument("pc_valid", type=float, help="% valid sentences")
    parser.add_argument("sample_temperature",
                        type=float,
                        default=1.0,
                        help="% valid sentences")
    parser.add_argument("min_sentence_len", type=int, default=6)
    parser.add_argument("max_sentence_len", type=int, default=20)
    parser.add_argument("min_def_len", type=int, default=6)
    parser.add_argument("max_def_len", type=int, default=20)

    args = parser.parse_args()

    print "Number of sentences:", args.n_sentences
    assert (0 < args.pc_train + args.pc_valid < 1)
    assert (os.path.exists(args.path) == False)
    os.makedirs(args.path)
    args.pc_test = 1 - (args.pc_train + args.pc_valid)

    gen = FakeTextGenerator(args.n_primes, args.n_non_primes,
                            args.features_size, args.markov_order,
                            args.sample_temperature, args.min_def_len,
                            args.max_def_len)

    data = gen.create_corpus(args.n_sentences, args.min_sentence_len,
                             args.max_sentence_len, args.pc_train,
                             args.pc_valid)

    train_data, valid_data, test_data = data

    concat_sentences = lambda sentences: [' '.join(s) for s in sentences]
    train_data = concat_sentences(train_data)
    test_data = concat_sentences(test_data)
    valid_data = concat_sentences(valid_data)

    all_data = train_data + valid_data + test_data
    with temporary_content_path('\n'.join(all_data)) as path:
        vocab = Vocabulary.build(path, sort_by='lexicographical')
        vocab.save(os.path.join(args.path, "vocab.txt"))

    dict_json = json.dumps(gen.dictionary)
    write_data(os.path.join(args.path, "dict.json"), dict_json)

    write_data(os.path.join(args.path, "train.txt"), '\n'.join(train_data))
    write_data(os.path.join(args.path, "valid.txt"), '\n'.join(valid_data))
    write_data(os.path.join(args.path, "test.txt"), '\n'.join(test_data))

    args_json = json.dumps(vars(args), indent=4, sort_keys=True)
    write_data(os.path.join(args.path, "params.json"), args_json)

    write_data(os.path.join(args.path, "generator.p"), pickle.dumps(gen))
Example #16
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Builds a dictionary")
    parser.add_argument("--target_coverage_text",
                        type=float,
                        help="Target coverage of text")
    parser.add_argument("--target_coverage_def",
                        type=float,
                        help="Target coverage of def")
    parser.add_argument("--vocab_text", type=str, help="Vocabulary of text")
    parser.add_argument("--vocab_def", type=str, help="Vocabulary of def")
    parser.add_argument("--step_size", type=int, default=30)
    parser.add_argument("--target", type=str, default="Final path")
    args = parser.parse_args()

    vocab_text = Vocabulary(args.vocab_text)
    vocab_def = Vocabulary(args.vocab_def)

    # Greedy solution is optimal
    # I also approximate greedy a bit by adding word by word. This is fine, vocabs are big
    target_coverage_text = np.sum(
        vocab_text.frequencies) * args.target_coverage_text
    target_coverage_def = np.sum(
        vocab_def.frequencies) * args.target_coverage_def
    current_vocab = set([])

    # Of course I could use binsearch
    for id in range(vocab_def.size() // args.step_size):
        for id2 in range(args.step_size):
            current_vocab.add(vocab_def.id_to_word(id * args.step_size + id2))

        current_vocab_mod = set(current_vocab)

        current_coverage_def = 0.0
        current_coverage_text = 0.0

        for w in current_vocab_mod:
            current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(
                w)]
            current_coverage_text += vocab_text.frequencies[
                vocab_text.word_to_id(w)]

        id_text = 0
        while current_coverage_text < target_coverage_text:
            while vocab_text.id_to_word(id_text) in current_vocab_mod:
                id_text += 1
                if id_text >= vocab_text.size():
                    raise Exception("Perhaps try lower target coverage")

            w = vocab_text.id_to_word(id_text)
            current_vocab_mod.add(w)
            current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(
                w)]
            current_coverage_text += vocab_text.frequencies[id_text]

        if current_coverage_def > target_coverage_def:
            current_vocab = current_vocab_mod
            break

        print(
            "After adding {} words I covered {} of def and {} of text occurences"
            .format(
                len(current_vocab_mod),
                current_coverage_def / float(np.sum(vocab_def.frequencies)),
                current_coverage_text / float(np.sum(vocab_text.frequencies))))

    # To be safe rechecking shortlist works
    current_coverage_def = 0
    current_coverage_text = 0
    for w in current_vocab:
        current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(w)]
        current_coverage_text += vocab_text.frequencies[vocab_text.word_to_id(
            w)]

    print(
        "Sanity check: after adding {} words I covered {} of def and {} of text occurences"
        .format(len(current_vocab),
                current_coverage_def / float(np.sum(vocab_def.frequencies)),
                current_coverage_text / float(np.sum(vocab_text.frequencies))))

    vocab_result = Vocabulary.build(
        {word: vocab_text.word_freq(word)
         for word in current_vocab})
    vocab_result.save(args.target)
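A small numeric illustration of the coverage targets computed above, using made-up frequencies: the target is a fraction of the total token count, and the shortlist grows until its summed frequencies reach that target.

import numpy as np

frequencies = np.array([50, 30, 15, 5])           # made-up vocabulary frequencies
target_coverage_text = np.sum(frequencies) * 0.8  # 0.8 of 100 tokens -> 80.0
covered = frequencies[0] + frequencies[1]         # add the two most frequent words
assert covered >= target_coverage_text            # 80 >= 80: the shortlist suffices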
Example #17
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Builds a dictionary")
    parser.add_argument("--top-k",
                        type=int,
                        help="Top most frequent words to leave")
    parser.add_argument(
        "--vocab-text",
        default=None,
        help="Vocab corresponding to the main if text is a dictionary.")
    parser.add_argument(
        "--weight-dict-entries",
        action='store_true',
        help="Weight dict entries according to the freqs from a vocab.")
    parser.add_argument(
        "--exclude-top-k",
        type=int,
        help="Ignore definitions of a number of most frequent words")
    parser.add_argument(
        "text",
        help="The text to use. Can be a text file, an .h5 file, or a .json "
        "dictionary, in which case you need to use --vocab-text as well.")
    parser.add_argument("vocab", help="Destination")
    args = parser.parse_args()

    text = []
    if args.vocab_text:
        text = collections.defaultdict(int)
        vocab_text = Vocabulary(args.vocab_text)
    for f_name in args.text.split(","):
        logging.info("Processing " + f_name)
        if f_name.endswith('.h5'):
            with h5py.File(f_name) as h5_file:
                if 'text' not in h5_file.keys():
                    print("Missing text field from " + f_name)
                    continue  # skip files without a text field
                text.extend(h5_file['text'][:])
        elif f_name.endswith('.json'):
            logging.info(
                "Will build the vocabulary from definitions in a dictionary")
            dict_ = json.load(open(f_name, "r"))
            for word, list_defs in dict_.items():
                text_vocab_id = vocab_text.word_to_id(word)

                if (text_vocab_id != vocab_text.unk
                        and text_vocab_id < args.exclude_top_k):
                    continue

                for def_ in list_defs:
                    for def_word in def_:
                        if args.weight_dict_entries:
                            text[def_word] += vocab_text.word_freq(word)
                        else:
                            text[def_word] += 1
        else:
            with open(f_name) as file_:

                def data():
                    for line in file_:
                        for word in line.strip().split():
                            try:
                                yield text_type(word, 'utf-8')
                            except:
                                print("Skipped word " + word)

                text.extend(data())
        logging.info("{} words".format(len(text)))

    vocab = Vocabulary.build(text, args.top_k)
    vocab.save(args.vocab)
Example #18
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=
        'Write the list of words in embeddings but not in dict vocabulary')
    parser.add_argument('embeddings', type=str)
    parser.add_argument('vocabulary', type=str)
    parser.add_argument('vocabulary_counts', type=str)
    parser.add_argument('absent_words', type=str)

    args = parser.parse_args()

    print "read first file {}".format(args.embeddings)
    embeddings = read_embedding_file(args.embeddings)
    print "read vocabulary file {}".format(args.vocabulary)
    vocabulary = Vocabulary(args.vocabulary)
    print "read vocabulary for counts estimation file {}".format(
        args.vocabulary_counts)
    vocabulary_counts = Vocabulary(args.vocabulary_counts)

    vocabulary = set(vocabulary.words)  # faster lookup

    absent_in_vocab = set(
        [w for w in embeddings.keys() if w not in vocabulary])
    print("Number of absent words in vocab", len(absent_in_vocab))
    absent_in_vocab = sorted(list(absent_in_vocab),
                             key=lambda w: vocabulary_counts.word_freq(w),
                             reverse=True)

    with open(args.absent_words, 'w') as f:
        for w in absent_in_vocab:
            # write one absent word per line
            f.write(w + '\n')
Example #19
def _initialize_simple_model_and_data(c):

    if c['vocab']:
        vocab = Vocabulary(c['vocab'])
    else:
        vocab = None
    # Load data
    data = SNLIData(path=c['data_path'], layout=c['layout'], vocab=vocab)

    if vocab is None:
        vocab = data.vocab

    if c.get('vocab_text', ''):
        vocab_text = Vocabulary(c['vocab_text'])
    else:
        vocab_text = vocab

    # Dict
    if c['dict_path']:
        dict = Dictionary(c['dict_path'])
        logging.info("Loaded dict with {} entries".format(dict.num_entries()))

        if len(c['vocab_def']):
            retrieval_vocab = Vocabulary(c['vocab_def'])
        else:
            retrieval_vocab = data.vocab

        retrieval = Retrieval(vocab_text=vocab_text,
                              vocab_def=retrieval_vocab,
                              dictionary=dict,
                              max_def_length=c['max_def_length'],
                              with_too_long_defs=c['with_too_long_defs'],
                              exclude_top_k=c['exclude_top_k'],
                              max_def_per_word=c['max_def_per_word'])

        data.set_retrieval(retrieval)
    else:
        retrieval = None
        dict = None
        retrieval_vocab = None

    def_emb_dim = (c.get('def_emb_dim', 0)
                   if c.get('def_emb_dim', 0) > 0 else c['emb_dim'])
    def_emb_translate_dim = (c.get('def_emb_translate_dim', 0)
                             if c.get('def_emb_translate_dim', 0) > 0
                             else def_emb_dim)

    # Initialize
    simple = NLISimple(
        # Baseline arguments
        emb_dim=c['emb_dim'],
        vocab=data.vocab,
        encoder=c['encoder'],
        dropout=c['dropout'],
        num_input_words=c['num_input_words'],
        mlp_dim=c['mlp_dim'],

        # Dict lookup kwargs (will get refactored)
        translate_dim=c['translate_dim'],
        retrieval=retrieval,
        compose_type=c['compose_type'],
        reader_type=c['reader_type'],
        disregard_word_embeddings=c['disregard_word_embeddings'],
        def_vocab=retrieval_vocab,
        def_emb_dim=c['def_emb_dim'],
        combiner_dropout=c['combiner_dropout'],
        share_def_lookup=c['share_def_lookup'],
        combiner_dropout_type=c['combiner_dropout_type'],
        combiner_bn=c['combiner_bn'],
        combiner_gating=c['combiner_gating'],
        combiner_shortcut=c['combiner_shortcut'],
        combiner_reader_translate=c['combiner_reader_translate'],
        def_dim=c['def_dim'],
        num_input_def_words=c['num_input_def_words'],
        def_emb_translate_dim=def_emb_translate_dim,

        # Init
        weights_init=GlorotUniform(),
        biases_init=Constant(0.0))
    simple.push_initialization_config()
    if c['encoder'] == 'rnn':
        simple._rnn_encoder.weights_init = Uniform(std=0.1)
    simple.initialize()

    if c.get('embedding_def_path', ''):
        embeddings = np.load(c['embedding_def_path'])
        simple.set_def_embeddings(embeddings.astype(theano.config.floatX))

    if c['embedding_path']:
        embeddings = np.load(c['embedding_path'])
        simple.set_embeddings(embeddings.astype(theano.config.floatX))

    return simple, data, dict, retrieval, vocab
Example #20
def train_snli_model(new_training_job,
                     config,
                     save_path,
                     params,
                     fast_start,
                     fuel_server,
                     seed,
                     model='simple'):
    if config['exclude_top_k'] > config['num_input_words'] and config[
            'num_input_words'] > 0:
        raise Exception("Some words have neither word nor def embedding")
    c = config
    logger = configure_logger(name="snli_baseline_training",
                              log_file=os.path.join(save_path, "log.txt"))
    if not os.path.exists(save_path):
        logger.info("Start a new job")
        os.mkdir(save_path)
    else:
        logger.info("Continue an existing job")
    with open(os.path.join(save_path, "cmd.txt"), "w") as f:
        f.write(" ".join(sys.argv))

    # Make data paths nice
    for path in [
            'dict_path', 'embedding_def_path', 'embedding_path', 'vocab',
            'vocab_def', 'vocab_text'
    ]:
        if c.get(path, ''):
            if not os.path.isabs(c[path]):
                c[path] = os.path.join(fuel.config.data_path[0], c[path])

    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    main_loop_best_val_path = os.path.join(save_path, 'main_loop_best_val.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')

    # Save config to save_path
    json.dump(config, open(os.path.join(save_path, "config.json"), "w"))

    if model == 'simple':
        nli_model, data, used_dict, used_retrieval, _ = _initialize_simple_model_and_data(
            c)
    elif model == 'esim':
        nli_model, data, used_dict, used_retrieval, _ = _initialize_esim_model_and_data(
            c)
    else:
        raise NotImplementedError()

    # Compute cost
    s1, s2 = T.lmatrix('sentence1'), T.lmatrix('sentence2')

    if c['dict_path']:
        assert os.path.exists(c['dict_path'])
        s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix(
            'sentence2_def_map')
        def_mask = T.fmatrix("def_mask")
        defs = T.lmatrix("defs")
    else:
        s1_def_map, s2_def_map = None, None
        def_mask = None
        defs = None

    s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask')
    y = T.ivector('label')

    cg = {}
    for train_phase in [True, False]:
        # NOTE: Please don't change outputs of cg
        if train_phase:
            with batch_normalization(nli_model):
                pred = nli_model.apply(s1,
                                       s1_mask,
                                       s2,
                                       s2_mask,
                                       def_mask=def_mask,
                                       defs=defs,
                                       s1_def_map=s1_def_map,
                                       s2_def_map=s2_def_map,
                                       train_phase=train_phase)
        else:
            pred = nli_model.apply(s1,
                                   s1_mask,
                                   s2,
                                   s2_mask,
                                   def_mask=def_mask,
                                   defs=defs,
                                   s1_def_map=s1_def_map,
                                   s2_def_map=s2_def_map,
                                   train_phase=train_phase)

        cost = CategoricalCrossEntropy().apply(y.flatten(), pred)
        error_rate = MisclassificationRate().apply(y.flatten(), pred)
        cg[train_phase] = ComputationGraph([cost, error_rate])

    # Weight decay (TODO: Make it less bug prone)
    if model == 'simple':
        weights_to_decay = VariableFilter(
            bricks=[dense for dense, relu, bn in nli_model._mlp],
            roles=[WEIGHT])(cg[True].variables)
        weight_decay = np.float32(c['l2']) * sum(
            (w**2).sum() for w in weights_to_decay)
    elif model == 'esim':
        weight_decay = 0.0
    else:
        raise NotImplementedError()

    final_cost = cg[True].outputs[0] + weight_decay
    final_cost.name = 'final_cost'

    # Add updates for population parameters

    if c.get("bn", True):
        pop_updates = get_batch_normalization_updates(cg[True])
        extra_updates = [(p, m * 0.1 + p * (1 - 0.1)) for p, m in pop_updates]
    else:
        pop_updates = []
        extra_updates = []

    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            loaded_params = load_parameters(src)
            cg[True].set_parameter_values(loaded_params)
            for param, m in pop_updates:
                param.set_value(loaded_params[get_brick(
                    param).get_hierarchical_name(param)])

    if os.path.exists(os.path.join(save_path, "main_loop.tar")):
        logger.warning("Manually loading BN stats :(")
        with open(os.path.join(save_path, "main_loop.tar")) as src:
            loaded_params = load_parameters(src)

        for param, m in pop_updates:
            param.set_value(
                loaded_params[get_brick(param).get_hierarchical_name(param)])

    if theano.config.compute_test_value != 'off':
        test_value_data = next(
            data.get_stream('train', batch_size=4).get_epoch_iterator())
        s1.tag.test_value = test_value_data[0]
        s1_mask.tag.test_value = test_value_data[1]
        s2.tag.test_value = test_value_data[2]
        s2_mask.tag.test_value = test_value_data[3]
        y.tag.test_value = test_value_data[4]

    # Freeze embeddings
    if not c['train_emb']:
        frozen_params = [
            p for E in nli_model.get_embeddings_lookups() for p in E.parameters
        ]
        train_params = [p for p in cg[True].parameters]
        assert len(set(frozen_params) & set(train_params)) > 0
    else:
        frozen_params = []
    if not c.get('train_def_emb', 1):
        frozen_params_def = [
            p for E in nli_model.get_def_embeddings_lookups()
            for p in E.parameters
        ]
        train_params = [p for p in cg[True].parameters]
        assert len(set(frozen_params_def) & set(train_params)) > 0
        frozen_params += frozen_params_def
    train_params = [p for p in cg[True].parameters if p not in frozen_params]
    train_params_keys = [
        get_brick(p).get_hierarchical_name(p) for p in train_params
    ]

    # Optimizer
    algorithm = GradientDescent(cost=final_cost,
                                on_unused_sources='ignore',
                                parameters=train_params,
                                step_rule=Adam(learning_rate=c['lr']))
    algorithm.add_updates(extra_updates)
    m = Model(final_cost)

    parameters = m.get_parameter_dict()  # Blocks version mismatch
    logger.info("Trainable parameters" + "\n" +
                pprint.pformat([(key, parameters[key].get_value().shape)
                                for key in sorted(train_params_keys)],
                               width=120))
    logger.info("# of parameters {}".format(
        sum([
            np.prod(parameters[key].get_value().shape)
            for key in sorted(train_params_keys)
        ])))

    ### Monitored args ###
    train_monitored_vars = [final_cost] + cg[True].outputs
    monitored_vars = cg[False].outputs
    val_acc = monitored_vars[1]
    to_monitor_names = [
        'def_unk_ratio', 's1_merged_input_rootmean2', 's1_def_mean_rootmean2',
        's1_gate_rootmean2', 's1_compose_gate_rootmean2'
    ]
    for k in to_monitor_names:
        train_v, valid_v = VariableFilter(name=k)(
            cg[True]), VariableFilter(name=k)(cg[False])
        if len(train_v):
            logger.info("Adding {} tracking".format(k))
            train_monitored_vars.append(train_v[0])
            monitored_vars.append(valid_v[0])
        else:
            logger.warning("Didnt find {} in cg".format(k))

    if c['monitor_parameters']:
        for name in train_params_keys:
            param = parameters[name]
            num_elements = numpy.product(param.get_value().shape)
            norm = param.norm(2) / num_elements
            grad_norm = algorithm.gradients[param].norm(2) / num_elements
            step_norm = algorithm.steps[param].norm(2) / num_elements
            stats = tensor.stack(norm, grad_norm, step_norm,
                                 step_norm / grad_norm)
            stats.name = name + '_stats'
            train_monitored_vars.append(stats)

    regular_training_stream = data.get_stream('train',
                                              batch_size=c['batch_size'],
                                              seed=seed)

    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=regular_training_stream.sources,
            hwm=100,
            produces_examples=regular_training_stream.produces_examples)
    else:
        training_stream = regular_training_stream

    ### Build extensions ###

    extensions = [
        # Load(main_loop_path, load_iteration_state=True, load_log=True)
        #     .set_conditions(before_training=not new_training_job),
        StartFuelServer(regular_training_stream,
                        stream_path,
                        hwm=100,
                        script_path=os.path.join(
                            os.path.dirname(__file__),
                            "../bin/start_fuel_server.py"),
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq']),
        ProgressBar(),
        RetrievalPrintStats(retrieval=used_retrieval,
                            every_n_batches=c['mon_freq_valid'],
                            before_training=not fast_start),
        Timestamp(),
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq']),
    ]

    if c['layout'] == 'snli':
        validation = DataStreamMonitoring(monitored_vars,
                                          data.get_stream('valid',
                                                          batch_size=14,
                                                          seed=seed),
                                          before_training=not fast_start,
                                          on_resumption=True,
                                          after_training=True,
                                          every_n_batches=c['mon_freq_valid'],
                                          prefix='valid')
        extensions.append(validation)
    elif c['layout'] == 'mnli':
        validation = DataStreamMonitoring(monitored_vars,
                                          data.get_stream('valid_matched',
                                                          batch_size=14,
                                                          seed=seed),
                                          every_n_batches=c['mon_freq_valid'],
                                          on_resumption=True,
                                          after_training=True,
                                          prefix='valid_matched')
        validation_mismatched = DataStreamMonitoring(
            monitored_vars,
            data.get_stream('valid_mismatched', batch_size=14, seed=seed),
            every_n_batches=c['mon_freq_valid'],
            before_training=not fast_start,
            on_resumption=True,
            after_training=True,
            prefix='valid_mismatched')
        extensions.extend([validation, validation_mismatched])
    else:
        raise NotImplementedError()

    # Similarity trackers for embeddings
    if len(c.get('vocab_def', '')):
        retrieval_vocab = Vocabulary(c['vocab_def'])
    else:
        retrieval_vocab = data.vocab

    retrieval_all = Retrieval(vocab_text=retrieval_vocab,
                              dictionary=used_dict,
                              max_def_length=c['max_def_length'],
                              exclude_top_k=0,
                              max_def_per_word=c['max_def_per_word'])

    for name in [
            's1_word_embeddings', 's1_dict_word_embeddings',
            's1_translated_word_embeddings'
    ]:
        variables = VariableFilter(name=name)(cg[False])
        if len(variables):
            s1_emb = variables[0]
            logger.info("Adding similarity tracking for " + name)
            # A bit sloppy about downcast

            if "dict" in name:
                embedder = construct_dict_embedder(theano.function(
                    [s1, defs, def_mask, s1_def_map],
                    s1_emb,
                    allow_input_downcast=True),
                                                   vocab=data.vocab,
                                                   retrieval=retrieval_all)
                extensions.append(
                    SimilarityWordEmbeddingEval(
                        embedder=embedder,
                        prefix=name,
                        every_n_batches=c['mon_freq_valid'],
                        before_training=not fast_start))
            else:
                embedder = construct_embedder(theano.function(
                    [s1], s1_emb, allow_input_downcast=True),
                                              vocab=data.vocab)
                extensions.append(
                    SimilarityWordEmbeddingEval(
                        embedder=embedder,
                        prefix=name,
                        every_n_batches=c['mon_freq_valid'],
                        before_training=not fast_start))

    track_the_best = TrackTheBest(validation.record_name(val_acc),
                                  before_training=not fast_start,
                                  every_n_epochs=c['save_freq_epochs'],
                                  after_training=not fast_start,
                                  every_n_batches=c['mon_freq_valid'],
                                  choose_best=min)
    extensions.append(track_the_best)

    # Special care for serializing embeddings
    if len(c.get('embedding_path', '')) or len(c.get('embedding_def_path',
                                                     '')):
        extensions.insert(
            0,
            LoadNoUnpickling(main_loop_path,
                             load_iteration_state=True,
                             load_log=True).set_conditions(
                                 before_training=not new_training_job))
        extensions.append(
            Checkpoint(main_loop_path,
                       parameters=train_params + [p for p, m in pop_updates],
                       save_main_loop=False,
                       save_separately=['log', 'iteration_state'],
                       before_training=not fast_start,
                       every_n_epochs=c['save_freq_epochs'],
                       after_training=not fast_start).add_condition(
                           ['after_batch', 'after_epoch'],
                           OnLogRecord(track_the_best.notification_name),
                           (main_loop_best_val_path, )))
    else:
        extensions.insert(
            0,
            Load(main_loop_path, load_iteration_state=True,
                 load_log=True).set_conditions(
                     before_training=not new_training_job))
        extensions.append(
            Checkpoint(main_loop_path,
                       parameters=cg[True].parameters +
                       [p for p, m in pop_updates],
                       before_training=not fast_start,
                       every_n_epochs=c['save_freq_epochs'],
                       after_training=not fast_start).add_condition(
                           ['after_batch', 'after_epoch'],
                           OnLogRecord(track_the_best.notification_name),
                           (main_loop_best_val_path, )))

    extensions.extend([
        DumpCSVSummaries(save_path,
                         every_n_batches=c['mon_freq_valid'],
                         after_training=True),
        DumpTensorflowSummaries(save_path,
                                after_epoch=True,
                                every_n_batches=c['mon_freq_valid'],
                                after_training=True),
        Printing(every_n_batches=c['mon_freq_valid']),
        PrintMessage(msg="save_path={}".format(save_path),
                     every_n_batches=c['mon_freq']),
        FinishAfter(after_n_batches=c['n_batches']).add_condition(
            ['after_batch'],
            OnLogStatusExceed('iterations_done', c['n_batches']))
    ])

    logger.info(extensions)

    ### Run training ###

    if "VISDOM_SERVER" in os.environ:
        print("Running visdom server")
        ret = subprocess.Popen([
            os.path.join(os.path.dirname(__file__), "../visdom_plotter.py"),
            "--visdom-server={}".format(os.environ['VISDOM_SERVER']),
            "--folder={}".format(save_path)
        ])
        time.sleep(0.1)
        if ret.returncode is not None:
            raise Exception()
        atexit.register(lambda: os.kill(ret.pid, signal.SIGINT))

    model = Model(cost)
    for p, m in pop_updates:
        model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=model,
                         extensions=extensions)

    assert os.path.exists(save_path)
    main_loop.run()
Example #21
def evaluate(c, tar_path, *args, **kwargs):
    """
    Performs rudimentary evaluation of SNLI/MNLI run

    * Runs on valid and test given network
    * Saves all predictions
    * Saves embedding matrix
    * Saves results.json and predictions.csv
    """

    # Load and configure
    model = kwargs['model']
    assert c.endswith("json")
    c = json.load(open(c))

    # Very ugly absolute path fix
    ABS_PATHS = [
        "data/", "/mnt/users/jastrzebski/local/dict_based_learning/data/",
        "/data/cf9ffb48-61bd-40dc-a011-b2e7e5acfd72/"
    ]
    from six import string_types
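    # Strip any known absolute prefix so that the paths can be re-rooted
    # relative to fuel's data_path below.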
    for abs_path in ABS_PATHS:
        for k in c:
            if isinstance(c[k], string_types):
                if c[k].startswith(abs_path):
                    c[k] = c[k][len(abs_path):]

    # Make data paths nice
    for path in [
            'dict_path', 'embedding_def_path', 'embedding_path', 'vocab',
            'vocab_def', 'vocab_text'
    ]:
        if c.get(path, ''):
            if not os.path.isabs(c[path]):
                c[path] = os.path.join(fuel.config.data_path[0], c[path])

    logging.info("Updating config with " + str(kwargs))
    c.update(**kwargs)

    # NOTE: Doubling max_def_per_word ensures we don't miss a crucial definition
    # for definition-heavy words; usually a good idea.
    c['max_def_per_word'] = c['max_def_per_word'] * 2

    assert tar_path.endswith("tar")
    dest_path = os.path.dirname(tar_path)
    prefix = os.path.splitext(os.path.basename(tar_path))[0]

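    # Symbolic inputs; definition-related inputs are only built when a
    # dictionary is used.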
    s1_decoded, s2_decoded = T.lmatrix('sentence1'), T.lmatrix('sentence2')

    if c['dict_path']:
        s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix(
            'sentence2_def_map')
        def_mask = T.fmatrix("def_mask")
        defs = T.lmatrix("defs")
    else:
        s1_def_map, s2_def_map = None, None
        def_mask = None
        defs = None

    s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask')

    if model == 'simple':
        model, data, used_dict, used_retrieval, used_vocab = _initialize_simple_model_and_data(
            c)
    elif model == 'esim':
        model, data, used_dict, used_retrieval, used_vocab = _initialize_esim_model_and_data(
            c)
    else:
        raise NotImplementedError()

    pred = model.apply(s1_decoded,
                       s1_mask,
                       s2_decoded,
                       s2_mask,
                       def_mask=def_mask,
                       defs=defs,
                       s1_def_map=s1_def_map,
                       s2_def_map=s2_def_map,
                       train_phase=False)
    cg = ComputationGraph([pred])
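    # Batch-norm population statistics are shared variables rather than
    # trainable parameters, so they are collected here and restored separately
    # after the model parameters are loaded.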
    if c.get("bn", True):
        bn_params = [
            p for p in VariableFilter(bricks=[BatchNormalization])(cg)
            if hasattr(p, "set_value")
        ]
    else:
        bn_params = []

    # Load model
    model = Model(cg.outputs)
    parameters = model.get_parameter_dict()  # Blocks version mismatch
    logging.info(
        "Trainable parameters" + "\n" +
        pprint.pformat([(key, parameters[key].get_value().shape)
                        for key in sorted([
                            get_brick(param).get_hierarchical_name(param)
                            for param in cg.parameters
                        ])],
                       width=120))
    logging.info("# of parameters {}".format(
        sum([
            np.prod(parameters[key].get_value().shape) for key in sorted([
                get_brick(param).get_hierarchical_name(param)
                for param in cg.parameters
            ])
        ])))
    with open(tar_path, 'rb') as src:
        params = load_parameters(src)

        loaded_params_set = set(params.keys())
        model_params_set = set([
            get_brick(param).get_hierarchical_name(param)
            for param in cg.parameters
        ])

        logging.info("Loaded extra parameters")
        logging.info(loaded_params_set - model_params_set)
        logging.info("Missing parameters")
        logging.info(model_params_set - loaded_params_set)
    model.set_parameter_values(params)

    if c.get("bn", True):
        logging.info("Loading " + str([
            get_brick(param).get_hierarchical_name(param)
            for param in bn_params
        ]))
        for param in bn_params:
            param.set_value(
                params[get_brick(param).get_hierarchical_name(param)])
        for p in bn_params:
            model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p

    # Read logs
    logs = pd.read_csv(os.path.join(dest_path, "logs.csv"))
    best_val_error = logs['valid_misclassificationrate_apply_error_rate'].min()
    logging.info("Best measured valid error rate: " + str(best_val_error))

    # NOTE(kudkudak): We need this to have comparable mean rank and embedding scores
    reference_vocab = Vocabulary(
        os.path.join(fuel.config.data_path[0], c['data_path'], 'vocab.txt'))
    vocab_all = Vocabulary(
        os.path.join(
            fuel.config.data_path[0], c['data_path'],
            'vocab_all.txt'))  # Can include OOV words, which is interesting
    retrieval_all = Retrieval(vocab_text=used_vocab,
                              dictionary=used_dict,
                              max_def_length=c['max_def_length'],
                              exclude_top_k=0,
                              max_def_per_word=c['max_def_per_word'])
    # logging.info("Calculating dict and word embeddings for vocab.txt and vocab_all.txt")
    # for name in ['s1_word_embeddings', 's1_dict_word_embeddings']:
    #     variables = VariableFilter(name=name)(cg)
    #     if len(variables):
    #         s1_emb = variables[0]
    #         # A bit sloppy about downcast
    #
    #         if "dict" in name:
    #             embedder = construct_dict_embedder(
    #                 theano.function([s1_decoded, defs, def_mask, s1_def_map], s1_emb, allow_input_downcast=True),
    #                 vocab=data.vocab, retrieval=retrieval_all)
    #         else:
    #             embedder = construct_embedder(theano.function([s1_decoded], s1_emb, allow_input_downcast=True),
    #                 vocab=data.vocab)
    #
    #         for v_name, v in [("vocab_all", vocab_all), ("vocab", reference_vocab)]:
    #             logging.info("Calculating {} embeddings for {}".format(name, v_name))

    # Predict
    predict_fnc = theano.function(cg.inputs, pred)
    results = {}
    batch_size = 14
    for subset in ['valid', 'test']:
        logging.info("Predicting on " + subset)
        stream = data.get_stream(subset, batch_size=batch_size, seed=778)
        it = stream.get_epoch_iterator()
        rows = []
        for ex in tqdm.tqdm(it, total=10000 / batch_size):
            ex = dict(zip(stream.sources, ex))
            inp = [ex[v.name] for v in cg.inputs]
            prob = predict_fnc(*inp)
            label_pred = np.argmax(prob, axis=1)

            for id in range(len(prob)):
                s1_decoded = used_vocab.decode(ex['sentence1'][id]).split()
                s2_decoded = used_vocab.decode(ex['sentence2'][id]).split()

                assert used_vocab == data.vocab

                s1_decoded = [
                    '*' + w + '*'
                    if used_vocab.word_to_id(w) > c['num_input_words'] else w
                    for w in s1_decoded
                ]
                s2_decoded = [
                    '*' + w + '*'
                    if used_vocab.word_to_id(w) > c['num_input_words'] else w
                    for w in s2_decoded
                ]

                # Different difficulty metrics

                # text_unk_percentage
                s1_no_pad = [w for w in ex['sentence1'][id] if w != 0]
                s2_no_pad = [w for w in ex['sentence2'][id] if w != 0]

                s1_unk_percentage = sum([
                    1. for w in s1_no_pad if w == used_vocab.unk
                ]) / len(s1_no_pad)
                s2_unk_percentage = sum([
                    1. for w in s2_no_pad if w == used_vocab.unk
                ]) / len(s2_no_pad)

                # mean freq word
                s1_mean_freq = np.mean([
                    0 if w == data.vocab.unk else used_vocab._id_to_freq[w]
                    for w in s1_no_pad
                ])
                s2_mean_freq = np.mean([
                    0 if w == data.vocab.unk else used_vocab._id_to_freq[w]
                    for w in s2_no_pad
                ])

                # mean rank word (UNK is max rank)
                # NOTE(kudkudak): Will break if we reindex unk between vocabs :P
                s1_mean_rank = np.mean([
                    reference_vocab.size() if reference_vocab.word_to_id(
                        used_vocab.id_to_word(w)) == reference_vocab.unk else
                    reference_vocab.word_to_id(used_vocab.id_to_word(w))
                    for w in s1_no_pad
                ])

                s2_mean_rank = np.mean([
                    reference_vocab.size() if reference_vocab.word_to_id(
                        used_vocab.id_to_word(w)) == reference_vocab.unk else
                    reference_vocab.word_to_id(used_vocab.id_to_word(w))
                    for w in s2_no_pad
                ])

                rows.append({
                    "pred": label_pred[id],
                    "true_label": ex['label'][id],
                    "s1": ' '.join(s1_decoded),
                    "s2": ' '.join(s2_decoded),
                    "s1_unk_percentage": s1_unk_percentage,
                    "s2_unk_percentage": s2_unk_percentage,
                    "s1_mean_freq": s1_mean_freq,
                    "s2_mean_freq": s2_mean_freq,
                    "s1_mean_rank": s1_mean_rank,
                    "s2_mean_rank": s2_mean_rank,
                    "p_0": prob[id, 0],
                    "p_1": prob[id, 1],
                    "p_2": prob[id, 2]
                })

        preds = pd.DataFrame(rows, columns=rows[0].keys())
        preds.to_csv(
            os.path.join(dest_path,
                         prefix + '_predictions_{}.csv'.format(subset)))
        results[subset] = {}
        results[subset]['misclassification'] = 1 - np.mean(
            preds.pred == preds.true_label)

        if subset == "valid" and np.abs(
            (1 - np.mean(preds.pred == preds.true_label)) -
                best_val_acc) > 0.001:
            logging.error("!!!")
            logging.error(
                "Found different best_val_acc. Probably due to changed specification of the model class."
            )
            logging.error("Discrepancy {}".format(
                (1 - np.mean(preds.pred == preds.true_label)) - best_val_acc))
            logging.error("!!!")

        logging.info(results)

    json.dump(results,
              open(os.path.join(dest_path, prefix + '_results.json'), "w"))
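A minimal usage sketch (the run directory and file names below are hypothetical): evaluate expects the path to the run's JSON config, the trained .tar checkpoint (with logs.csv sitting next to it), and a model keyword of either "simple" or "esim"; it then writes <prefix>_predictions_valid.csv, <prefix>_predictions_test.csv and <prefix>_results.json next to the checkpoint.

# Hypothetical paths; only the "json"/"tar" suffixes and the model keyword are required.
evaluate("runs/esim_snli/config.json",
         "runs/esim_snli/model_best.tar",
         model="esim")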
Example #22
0
def _initialize_esim_model_and_data(c):

    if c['vocab']:
        vocab = Vocabulary(c['vocab'])
    else:
        vocab = None

    # Load data
    data = SNLIData(path=c['data_path'], layout=c['layout'], vocab=vocab)

    if vocab is None:
        vocab = data.vocab

    if c.get('vocab_text', ''):
        vocab_text = Vocabulary(c['vocab_text'])
    else:
        vocab_text = vocab

    # def_emb_dim defaults to emb_dim
    # def_emb_translate_dim defaults to def_emb_dim
    def_emb_dim = c.get('def_emb_dim',
                        0) if c.get('def_emb_dim', 0) > 0 else c['emb_dim']
    def_emb_translate_dim = c.get(
        'def_emb_translate_dim',
        0) if c.get('def_emb_translate_dim', 0) > 0 else def_emb_dim

    # Dict
    if c['dict_path']:
        dict = Dictionary(c['dict_path'])
        logging.info("Loaded dict with {} entries".format(dict.num_entries()))

        if len(c['vocab_def']):
            retrieval_vocab = Vocabulary(c['vocab_def'])
        else:
            retrieval_vocab = data.vocab

        retrieval = Retrieval(vocab_text=vocab_text,
                              vocab_def=retrieval_vocab,
                              dictionary=dict,
                              max_def_length=c['max_def_length'],
                              with_too_long_defs=c['with_too_long_defs'],
                              exclude_top_k=c['exclude_top_k'],
                              max_def_per_word=c['max_def_per_word'])

        data.set_retrieval(retrieval)

        num_input_def_words = c['num_input_def_words'] if c[
            'num_input_def_words'] > 0 else c['num_input_words']

        # TODO: Refactor lookup passing to reader. Very inconvenient ATM
        if c['reader_type'] == "rnn":
            def_reader = LSTMReadDefinitions(
                num_input_words=num_input_def_words,
                weights_init=Uniform(width=0.1),
                biases_init=Constant(0.),
                dim=c['def_dim'],
                emb_dim=def_emb_dim,
                vocab=vocab,
                lookup=None)
        elif c['reader_type'] == "mean":
            def_reader = MeanPoolReadDefinitions(
                num_input_words=num_input_def_words,
                translate=c['combiner_reader_translate'],
                vocab=vocab,
                weights_init=Uniform(width=0.1),
                lookup=None,
                dim=def_emb_translate_dim,
                biases_init=Constant(0.),
                emb_dim=def_emb_dim)
        else:
            raise NotImplementedError()

        def_combiner = MeanPoolCombiner(
            dim=c['def_dim'],
            emb_dim=def_emb_translate_dim,
            dropout=c['combiner_dropout'],
            dropout_type=c['combiner_dropout_type'],
            def_word_gating=c['combiner_gating'],
            shortcut_unk_and_excluded=c['combiner_shortcut'],
            num_input_words=num_input_def_words,
            exclude_top_k=c['exclude_top_k'],
            vocab=vocab,
            compose_type=c['compose_type'],
            weights_init=Uniform(width=0.1),
            biases_init=Constant(0.))

    else:
        retrieval = None
        dict = None
        def_combiner = None
        def_reader = None

    # Initialize

    simple = ESIM(
        # Baseline arguments
        emb_dim=c['emb_dim'],
        vocab=data.vocab,
        encoder=c['encoder'],
        dropout=c['dropout'],
        def_emb_translate_dim=def_emb_translate_dim,
        num_input_words=c['num_input_words'],
        def_dim=c['def_dim'],
        dim=c['dim'],
        bn=c.get('bn', True),
        def_combiner=def_combiner,
        def_reader=def_reader,

        # Init
        weights_init=GlorotUniform(),
        biases_init=Constant(0.0))
    simple.push_initialization_config()
    # TODO: Not sure anymore why we do that
    if c['encoder'] == 'bilstm':
        for enc in simple._rnns:
            enc.weights_init = Uniform(std=0.1)
    simple.initialize()

    if c['embedding_path']:
        embeddings = np.load(c['embedding_path'])
        simple.set_embeddings(embeddings.astype(theano.config.floatX))

    if c.get('embedding_def_path', ''):
        embeddings = np.load(c['embedding_def_path'])
        simple.set_def_embeddings(embeddings.astype(theano.config.floatX))

    return simple, data, dict, retrieval, vocab
Example #23
0
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Converts GLOVE embeddings to a numpy array")
    parser.add_argument("txt", help="GLOVE data in txt format")
    parser.add_argument("npy", help="Destination for npy format")
    parser.add_argument("--vocab",
                        default="",
                        help="Performs subsetting based on passed vocab")
    parser.add_argument("--dict",
                        default="",
                        help="Performs subsetting based on passed dict")

    # OOV handling
    parser.add_argument("--try-lemma", action="store_true", help="Try lemma")
    parser.add_argument("--try-lowercase", default="", help="Try lowercase")

    args = parser.parse_args()

    if args.dict and not args.vocab:
        # usually you'd want to use both, I suppose
        raise NotImplementedError("Not implemented")
    if args.try_lemma or args.try_lowercase:
        # TODO(kudkudak): Implement
        raise NotImplementedError("Not implemented yet")

    if args.vocab == "":
        embeddings = []
        dim = None
        with open(args.txt) as src:
            for i, line in enumerate(src):
                tokens = line.strip().split()
                features = [float(x) for x in tokens[1:]]
                dim = len(features)
                embeddings.append(features)
                if i and i % 100000 == 0:
                    print(i)
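        # Prepend all-zero rows for the special tokens so that row indices in
        # the saved matrix line up with Vocabulary ids.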
        embeddings = [[0.] * dim] * len(
            Vocabulary.SPECIAL_TOKEN_MAP) + embeddings
        numpy.save(args.npy, embeddings)
    else:
        vocab = Vocabulary(args.vocab)
        if args.dict:
            dict_ = Dictionary(args.dict)

        print('Computing GloVe')

        # Loading
        embeddings_index = {}
        print('Reading GloVe file')
        with open(args.txt) as f:
            for line in f:
                values = line.split(' ')
                word = values[0]
                dim = len(values[1:])
                coefs = numpy.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs

        # Embedding matrix
        print('Building embedding matrix')
        embedding_matrix = numpy.zeros((vocab.size(), dim))
        for word in vocab._word_to_id:
            embedding_vector = embeddings_index.get(word)
            in_glove = embedding_vector is not None
            if args.dict:
                in_dict = len(dict_.get_definitions(word)) > 0

            if in_glove and (not args.dict or in_dict):
                # words not found in embedding index will be all-zeros.
                embedding_matrix[vocab.word_to_id(word)] = embedding_vector
            else:
                if not in_glove:
                    print(u'Missing from GloVe: {}'.format(word))
                else:
                    print(u'Missing from dict: {}'.format(word))

        numpy.save(args.npy, embedding_matrix)
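A minimal sketch of consuming the packed matrix (the output path below is hypothetical): when --vocab is passed, the saved array has one row per vocabulary entry, and rows stay all-zero for words missing from GloVe (or from the dictionary, if --dict is also given).

import numpy

emb = numpy.load("glove_snli.npy")   # hypothetical output of this script
print(emb.shape)                     # (vocab.size(), glove_dim); zero rows mark missing words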
Example #24
0
def vocab(self):
    if not self._vocab:
        logger.debug("Loading default vocab")
        self._vocab = Vocabulary(os.path.join(self._path, "vocab.txt"))
    return self._vocab
Example #25
0
def main():
    logging.basicConfig(
        level='DEBUG',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Crawl definitions for a vocabulary")
    parser.add_argument("--api_key",
                        help="Wordnik API key to use")
    # NOTE(kudkudak): wordnik has useCanonical which tries to do stuff like Cats -> cat
    # but it doesn't really work well
    parser.add_argument("--just-lemmas", action="store_true",
                        help="Just use the lemmas as the definition")
    parser.add_argument("--just-lowercase", action="store_true",
                        help="Just lowercase as the definition")
    parser.add_argument("--add-lemma-defs", action="store_true",
                        help="Add definitions from lemmas")
    parser.add_argument("--add-lower-defs", action="store_true",
        help="Add definitions from lowercase")
    parser.add_argument("--add-lower-lemma-defs", action="store_true",
        help="Add definitions from lowercase version of word and lemmas")
    parser.add_argument("--add-dictname-to-defs", action="store_true",
        help="Adds dictionary name to definition")
    parser.add_argument("--wordnet", action="store_true",
        help="Crawl WordNet")
    parser.add_argument("--add-identity", action="store_true",
                        help="Identity mapping dictionary")
    parser.add_argument("--add-spelling-if-no-def", action="store_true",
                        help="Add spelling if there is no definition")
    parser.add_argument("--add-spelling", action="store_true",
                        help="Always add spelling")
    parser.add_argument("--crawl-also-lowercase", default=0, type=int,
        help="If true will crawl also lower-cased version")
    parser.add_argument("--crawl-also-lemma", default=0, type=int,
        help="If true will crawl also lemma version")
    parser.add_argument("--remove-out-of-vocabulary", action="store_true",
                        help="Remove entries of dict which do not appear in vocab")
    parser.add_argument("vocab", help="Vocabulary path")
    parser.add_argument("dict", help="Destination path for the dictionary")
    args = parser.parse_args()

    vocab = Vocabulary(args.vocab)
    dict_ = Dictionary(args.dict)

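    # Exactly one crawling / post-processing mode is applied per run; the
    # branches below are checked in priority order.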
    try:
        if args.api_key:
            port = get_free_port()
            popen = start_corenlp(port)
            url = "http://localhost:{}".format(port)
            dict_.crawl_wordnik(
                vocab, args.api_key, url,
                crawl_also_lowercase=args.crawl_also_lowercase,
                crawl_also_lemma=args.crawl_also_lemma)
        elif args.wordnet:
            port = get_free_port()
            popen = start_corenlp(port)
            url = "http://localhost:{}".format(port)
            dict_.crawl_wordnet(url)
        elif args.add_lemma_defs or args.add_lower_lemma_defs:
            # NOTE(kudkudak): A bit ugly, but this covers the case where e.g.
            # "Cats" would not get the lemma definitions of "cat"
            # without try_lower=True
            dict_.add_from_lemma_definitions(vocab, try_lower=args.add_lower_lemma_defs)
        elif args.add_lower_defs:
            dict_.add_from_lowercase_definitions(vocab)
        elif args.add_dictname_to_defs:
            dict_.add_dictname_to_defs(vocab)
        elif args.add_spelling_if_no_def:
            dict_.add_spelling(vocab)
        elif args.add_spelling:
            dict_.add_spelling(vocab, only_if_no_def=False)
        elif args.just_lemmas:
            dict_.crawl_lemmas(vocab)
        elif args.just_lowercase:
            dict_.crawl_lowercase(vocab)
        elif args.add_identity:
            dict_.add_identity_mapping(vocab)
        elif args.remove_out_of_vocabulary:
            dict_.remove_out_of_vocabulary(vocab)
        else:
            raise ValueError("don't know what to do")
    finally:
        if 'popen' in locals() and popen and popen.returncode is None:
            popen.kill()
Example #26
0
def main():
    parser = argparse.ArgumentParser(
        "Analyze coverage of either a dictionary or pretrained embeddings on a given vocab."
    )
    parser.add_argument("--dict", default="", help="Dictionary.")
    parser.add_argument(
        "--embedding",
        default="",
        help=
        "Path to embeddings. Can either be a npy file or a raw glove txt file."
    )
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help=
        "Optional: report statistics assuming the top_k most frequent words are already covered by word embeddings (applies to either dict or embedding)"
    )
    parser.add_argument("--step_size",
                        type=int,
                        help="Report each",
                        default=10000)
    parser.add_argument("--uncovered", help="Destination for uncovered files")
    parser.add_argument("vocab", help="Vocabulary")
    args = parser.parse_args()

    assert (args.vocab.endswith(".txt"))

    vocab = Vocabulary(args.vocab)
    words = vocab.words
    freqs = numpy.array(vocab.frequencies)
    total = float(freqs.sum())
    coverage = numpy.cumsum(freqs) / total
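    # Vocabulary frequencies are assumed to be sorted by rank, so coverage[i]
    # is the fraction of all token occurrences covered by the i+1 most
    # frequent words.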
    print("Cumulative distribution:")
    for i in range(args.step_size,
                   args.step_size * (len(freqs) // args.step_size),
                   args.step_size):
        print(i, coverage[i] * 100)

    if not args.dict and not args.embedding:
        return

    uncovered_file = io.open('/dev/null', 'w')
    if args.uncovered:
        uncovered_file = io.open(args.uncovered, 'w', encoding='utf-8')

    if args.dict and args.top_k:
        print("Analysing dictionary coverage, excluding the top {} words".format(
            args.top_k))

    n_covered = 0
    n_covered_by_lowercasing = 0
    if args.dict:
        source_name = "dictionary"
        dict_ = Dictionary(args.dict)
        print("Dictionary has {} entries".format(dict_.num_entries()))

        n_more_def_than_1 = 0
        for i in range(args.top_k, len(freqs)):
            if len(dict_.get_definitions(words[i])) > 1:
                n_more_def_than_1 += freqs[i]
            if dict_.get_definitions(words[i]):
                n_covered += freqs[i]
            elif dict_.get_definitions(words[i].lower()):
                n_covered_by_lowercasing += freqs[i]
    elif args.embedding:
        source_name = "glove embeddings"
        # Loading (note: now only supports GloVe format)
        word_set = set([])
        if args.embedding.endswith(".txt"):
            with open(args.embedding) as f:
                for line in f:
                    values = line.split(' ')
                    word = values[0]
                    word_set.add(word)
        elif args.embedding.endswith(".npy"):
            print(
                "Warning: assuming that embeddings from .npy file are ordered according to the same vocabulary file as the one passed (using pack_glove --vocab vocab_passed_here)"
            )
            emb_matrix = numpy.load(args.embedding)
            for i, emb in enumerate(emb_matrix):
                if not numpy.all(emb == 0):
                    word_set.add(words[i])

        print("Glove embeddings has {} entries".format(len(word_set)))

        for i in range(args.top_k, len(freqs)):
            if words[i] in word_set:
                n_covered += freqs[i]
            elif words[i].lower() in word_set:
                n_covered_by_lowercasing += freqs[i]
    else:
        raise NotImplementedError()

    print("Analysing coverage of " + source_name)
    if args.top_k:
        print("The first " + str(args.top_k) +
              " ranked words are covered by word embeddings.")
        print("This amounts to " + str(100 * coverage[args.top_k - 1]) +
              "% of occurences.")
    else:
        print("Case when no word embeddings are used (args.top_k=0). " +
              source_name + " provides all embeddings")
    print(source_name +
          " covers {} % of total occurrences".format(100 * n_covered / total))
    print(
        "Querying not-found words as lowercased words additionally covers {}% of total occurrences"
        .format(100 * n_covered_by_lowercasing / total))

    if args.top_k:
        n_not_covered_by_embs = total * (1 - coverage[args.top_k - 1])
        print(
            source_name +
            " covers an additional {}% of occurrences not covered by word embeddings"
            .format(100 * n_covered / n_not_covered_by_embs))
        print(
            "Querying not-found words as lowercased words additionally covers {}% of occurrences not covered by word embeddings"
            .format(100 * n_covered_by_lowercasing / n_not_covered_by_embs))
        if args.dict:
            print(
                "Occurrences of dictionary defs with >1 def not covered by word embeddings: {}%"
                .format(100 * n_more_def_than_1 / n_not_covered_by_embs))

    uncovered_file.close()