def initialize_data_and_model(config, train_phase, layout='dict'):
    c = config
    fuel_path = fuel.config.data_path[0]

    vocab_main = None
    vocab_keys = None
    if not c['encoder']:
        if not c['vocab_keys_path']:
            raise ValueError(
                'Error: should specify vocab_keys_path when there is no encoder')
        vocab_keys = Vocabulary(
            os.path.join(fuel.config.data_path[0], c['vocab_keys_path']))
    if c['vocab_path']:
        vocab_main = Vocabulary(
            os.path.join(fuel.config.data_path[0], c['vocab_path']))
    # TODO: change name of class LanguageModellingData... very ill-named.
    data = LanguageModellingData(c['data_path'], layout, vocab=vocab_main)
    vocab_main = data.vocab

    model = Seq2Seq(c['emb_dim'],
                    c['dim'],
                    c['num_input_words'],
                    c['num_output_words'],
                    data.vocab,
                    proximity_coef=c['proximity_coef'],
                    proximity_distance=c['proximity_distance'],
                    encoder=c['encoder'],
                    decoder=c['decoder'],
                    shared_rnn=c['shared_rnn'],
                    translate_layer=c['translate_layer'],
                    word_dropout=c['word_dropout'],
                    tied_in_out=c['tied_in_out'],
                    vocab_keys=vocab_keys,
                    reconstruction_coef=c['reconstruction_coef'],
                    provide_targets=c['provide_targets'],
                    weights_init=Uniform(width=0.1),
                    biases_init=Constant(0.))
    model.initialize()

    if c['embedding_path'] and ((train_phase or c['freeze_pretrained'])
                                or c['provide_targets']):
        if c['provide_targets'] and c['freeze_pretrained']:
            raise ValueError("Can't use provide_targets together with "
                             "freeze_pretrained; in that case, simply use "
                             "freeze_pretrained.")
        # If encoder embeddings are frozen, we have to load them here,
        # as they are not saved with the model's parameters.
        emb_full_path = os.path.join(fuel_path, c['embedding_path'])
        embedding_matrix = numpy.load(emb_full_path)
        if c['provide_targets']:
            model.set_def_embeddings(embedding_matrix, 'target')
            logger.debug("Pre-trained targets loaded")
        else:
            model.set_def_embeddings(embedding_matrix, 'main')
            logger.debug("Pre-trained encoder embeddings loaded")
    return data, model
def initialize_data_and_model(config):
    c = config
    vocab = None
    if c['vocab_path']:
        vocab = Vocabulary(
            os.path.join(fuel.config.data_path[0], c['vocab_path']))
    data = ExtractiveQAData(path=c['data_path'],
                            vocab=vocab,
                            layout=c['layout'])
    # TODO: fix me, I'm so ugly (I mean the access of a private attribute)
    if c['dict_path']:
        dict_vocab = data.vocab
        if c['dict_vocab_path']:
            dict_vocab = Vocabulary(
                os.path.join(fuel.config.data_path[0], c['dict_vocab_path']))
        data._retrieval = Retrieval(
            data.vocab,
            Dictionary(
                os.path.join(fuel.config.data_path[0], c['dict_path'])),
            max_def_length=c['max_def_length'],
            with_too_long_defs=c['with_too_long_defs'],
            max_def_per_word=c['max_def_per_word'],
            with_too_many_defs=c['with_too_many_defs'],
            # This should fix --exclude_top_k
            vocab_def=dict_vocab)
    logger.debug("Data loaded")

    qam = ExtractiveQAModel(
        c['dim'],
        c['emb_dim'],
        c['readout_dims'],
        c['num_input_words'],
        c['def_num_input_words'],
        data.vocab,
        coattention=c['coattention'],
        use_definitions=bool(c['dict_path']),
        def_word_gating=c['def_word_gating'],
        compose_type=c['compose_type'],
        reuse_word_embeddings=c['reuse_word_embeddings'],
        bidir_encoder=c['bidir_encoder'],
        random_unk=c['random_unk'],
        def_reader=c['def_reader'],
        weights_init=(GlorotUniform() if not c['init_width']
                      else Uniform(width=c['init_width'])),
        recurrent_weights_init=(GlorotUniform() if not c['rec_init_width']
                                else Uniform(width=c['rec_init_width'])),
        biases_init=Constant(0.))
    qam.initialize()
    logger.debug("Model created")

    if c['embedding_path']:
        qam.set_embeddings(
            numpy.load(
                os.path.join(fuel.config.data_path[0], c['embedding_path'])))
        logger.debug("Embeddings loaded")
    return data, qam
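# Usage sketch (illustrative only): the initializer above is driven by a config
# dictionary defined elsewhere in the project. The keys below are exactly the
# ones the function reads, but the values and paths are invented assumptions;
# the real defaults live in the project's config files.
def _example_initialize_qa():
    example_config = {
        'data_path': 'squad', 'layout': 'squad', 'vocab_path': '',
        'dict_path': '', 'dict_vocab_path': '',
        'max_def_length': 30, 'with_too_long_defs': 'drop',
        'max_def_per_word': 20, 'with_too_many_defs': 'random',
        'dim': 128, 'emb_dim': 300, 'readout_dims': [],
        'num_input_words': 10000, 'def_num_input_words': 10000,
        'coattention': True, 'def_word_gating': 'none',
        'compose_type': 'sum', 'reuse_word_embeddings': False,
        'bidir_encoder': True, 'random_unk': False,
        'def_reader': 'LSTMReadDefinitions',
        'init_width': 0.1, 'rec_init_width': 0.1,
        'embedding_path': '',
    }
    data, qam = initialize_data_and_model(example_config)
    return data, qam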
def test_squad_to_h5py_dataset():
    corenlp = None
    try:
        port = get_free_port()
        corenlp = start_corenlp(port)

        test_dir = tempfile.mkdtemp()
        json_path = os.path.join(test_dir, 'data.json')
        h5_path = os.path.join(test_dir, 'data.h5')
        with open(json_path, 'w') as json_file:
            print(TEST_SQUAD_RAW_DATA, file=json_file)
        squad_to_h5py_dataset(json_path, h5_path,
                              "http://localhost:{}".format(port))
        with h5py.File(h5_path, 'r') as h5_file:
            vocab = Vocabulary.build(h5_file['text'], top_k=100)
        add_words_ids_to_squad(h5_path, vocab)

        dataset = SQuADDataset(h5_path, ('all', ))
        stream = dataset.get_example_stream()
        stream = dataset.apply_default_transformers(stream)
        example = next(stream.get_epoch_iterator(as_dict=True))
        answer_span = slice(example['answer_begins'][0],
                            example['answer_ends'][0])
        assert example['questions'].tolist() == map(vocab.word_to_id, [
            u'To', u'whom', u'did', u'the', u'Virgin', u'Mary', u'allegedly',
            u'appear', u'in', u'1858', u'in', u'Lourdes', u'France', u'?'
        ])
        assert example['contexts'][answer_span].tolist() == map(
            vocab.word_to_id, [u'Saint', u'Bernadette', u'Soubirous'])
    finally:
        if corenlp and corenlp.returncode is None:
            corenlp.kill()
def test_language_model():
    with temporary_content_path(TEST_VOCAB) as path:
        vocab = Vocabulary(path)
    with temporary_content_path(TEST_DICT_JSON, suffix=".json") as path:
        dict_ = Dictionary(path)

    floatX = theano.config.floatX

    def make_data_and_mask(data):
        data = [[str2vec(s, 3) for s in row] for row in data]
        data = np.array(data)
        mask = np.ones((data.shape[0], data.shape[1]), dtype=floatX)
        return data, mask

    words_val, mask_val = make_data_and_mask([['p', 'e', 'a'],
                                              ['a', 'e', 'p']])
    mask_val[1, 2] = 0
    print "data:"
    print words_val
    print "mask:"
    print mask_val

    mask_def_emb_val = np.asarray([[0, 1], [0, 0]])

    # With the dictionary
    retrieval = Retrieval(vocab, dict_, exclude_top_k=7)
    lm = LanguageModel(7, 5, vocab.size(), vocab.size(),
                       vocab=vocab,
                       retrieval=retrieval,
                       compose_type='transform_and_sum',
                       weights_init=Uniform(width=0.1),
                       biases_init=Uniform(width=0.1))
    lm.initialize()

    words = tensor.ltensor3('words')
    mask = tensor.matrix('mask', dtype=floatX)
    costs = lm.apply(words, mask)
    cg = ComputationGraph(costs)

    def_mean, = VariableFilter(name='_dict_word_embeddings')(cg)
    def_mean_f = theano.function([words], def_mean)

    perplexities = VariableFilter(name_regex='perplexity.*')(cg)
    mask_def_emb, = VariableFilter(name='mask_def_emb')(cg)
    perplexities_f = theano.function([words, mask], perplexities)
    perplexities_v = perplexities_f(words_val, mask_val)
    mask_emb_f = theano.function([words, mask], mask_def_emb)
    mask_def_v = mask_emb_f(words_val, mask_val)

    for v, p in zip(perplexities_v, perplexities):
        print p.name, ":", v

    assert np.allclose(mask_def_v, mask_def_emb_val)
def test_vocab_op():
    with temporary_content_path(TEST_VOCAB) as path:
        vocab = Vocabulary(path)
    op = WordToIdOp(vocab)

    input_ = tensor.as_tensor_variable([ord('d'), ord(' '), ord('c'), 0, 0])
    assert op(input_).eval() == 0
    input_ = tensor.as_tensor_variable([ord('a')])
    assert op(input_).eval() == 5
    input_ = tensor.as_tensor_variable([[ord('a'), 0], [ord('b'), 0]])
    assert list(op(input_).eval()) == [5, 6]
def test_retrieval():
    with temporary_content_path(TEST_VOCAB, ".txt") as path:
        vocab = Vocabulary(path)
    with temporary_content_path(TEST_DICT_JSON, ".json") as path:
        dict_ = Dictionary(path)

    # check a super simple case
    batch = [['a']]
    defs, def_map = Retrieval(vocab, dict_).retrieve(batch)
    assert defs == [[3, 6, 7, 4], [3, 8, 9, 4]]
    assert def_map == [(0, 0, 0), (0, 0, 1)]

    # check that vectors are handled correctly
    batch = numpy.array([ord('d'), ord(' '), ord('c'), 0, 0])[None, None, :]
    defs, def_map = Retrieval(vocab, dict_).retrieve(batch)
    assert defs == [[3, 5, 6, 4]]
    assert def_map == [(0, 0, 0)]

    # check a complex case
    batch = [['a', 'b', 'b'], ['d c', 'a', 'b']]
    defs, def_map = Retrieval(vocab, dict_).retrieve(batch)
    assert defs == [[3, 6, 7, 4], [3, 8, 9, 4], [3, 9, 8, 4], [3, 5, 6, 4]]
    assert def_map == [(0, 0, 0), (0, 0, 1), (0, 1, 2), (0, 2, 2),
                       (1, 0, 3), (1, 1, 0), (1, 1, 1), (1, 2, 2)]

    # check a complex case with exclude top k
    batch = [['a', 'b', 'c', 'd'], ['a', 'e', 'b']]
    exclude_top_k = 7  # should exclude 'a', 'b', 'c', 'd' and only define 'e'
    defs, def_map = Retrieval(
        vocab, dict_, exclude_top_k=exclude_top_k).retrieve(batch)
    assert defs == [[3, 6, 7, 8, 4]]
    assert def_map == [(1, 1, 0)]

    # check the op
    retrieval_op = RetrievalOp(Retrieval(vocab, dict_))
    batch = tensor.as_tensor_variable(
        [[[ord('d'), ord(' '), ord('c'), 0, 0],
          [ord('e'), 0, 0, 0, 0]]])
    defs_var, mask_var, def_map_var = retrieval_op(batch)
    assert defs_var.eval().tolist() == [[3, 5, 6, 4, 0], [3, 6, 7, 8, 4]]
    assert_allclose(mask_var.eval(), [[1, 1, 1, 1, 0], [1, 1, 1, 1, 1]])
    assert def_map_var.eval().tolist() == [[0, 0, 0], [0, 1, 1]]
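# Reading the assertions above: Retrieval.retrieve(batch) appears to return
#   defs    -- a list of definitions, each a list of word ids wrapped in
#              begin/end markers (ids 3 and 4 in the toy vocabulary), and
#   def_map -- one (row, position, def_index) triple per retrieved definition,
#              linking a word occurrence in the batch to an entry of `defs`.
# The helper below only restates that interpretation of the observed behaviour;
# it is an illustration, not part of the project's API.
def _explain_def_map(defs, def_map):
    # e.g. for batch = [['a']]: (0, 0, 0) and (0, 0, 1) both point at the word
    # in row 0, position 0, which has two definitions, defs[0] and defs[1].
    for row, pos, def_index in def_map:
        print "word at (row={}, pos={}) -> definition {}".format(
            row, pos, defs[def_index])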
def t_e_s_t_language_model():
    V = 50
    gen = FakeTextGenerator(V, 6, 6, 1.0, 0.2)
    n_sentences = 3
    len_sentences = 7
    data = [gen.sample_sentence(len_sentences) for i in range(n_sentences)]
    vocab_list = '\n'.join(list(set(gen.vocabulary)))
    dict_json = json.dumps(gen.dictionary)
    print "JSON dict:", dict_json

    with temporary_content_path(vocab_list) as path:
        vocab = Vocabulary(path)
    with temporary_content_path(dict_json) as path:
        dict_ = Dictionary(path)

    data = [[str2vec(s, gen.tok_len) for s in row] for row in data]
    data = numpy.array(data)
    print "Data:", data

    # With the dictionary
    lm = LanguageModel(vocab=vocab, dict_=dict_, dim=10,
                       weights_init=Uniform(width=0.1),
                       biases_init=Uniform(width=0.1))
    lm.initialize()
    costs = lm.apply(tensor.as_tensor_variable(data),
                     numpy.ones((data.shape[0], data.shape[1])))
    cg = ComputationGraph(costs)
    def_spans, = VariableFilter(name='def_spans')(cg)
    f = theano.function([], [costs, def_spans])
    costs_value, def_spans_value = f()
    assert def_spans_value.tolist() == [[0, 2], [2, 4], [4, 5], [5, 7]]

    # Without the dictionary
    lm2 = LanguageModel(vocab=vocab, dim=10,
                        weights_init=Uniform(width=0.1),
                        biases_init=Uniform(width=0.1))
    costs2 = lm2.apply(tensor.as_tensor_variable(data),
                       numpy.ones((data.shape[0], data.shape[1])))
    costs2.eval()
def test_extractive_qa_model():
    with temporary_content_path(TEST_VOCAB) as path:
        vocab = Vocabulary(path)
    with temporary_content_path(TEST_DICT_JSON) as path:
        dict_ = Dictionary(path)

    def make_data_and_mask(data):
        data = [[vocab.word_to_id(s) for s in row] for row in data]
        data = numpy.array(data)
        mask = numpy.ones((data.shape[0], data.shape[1]),
                          dtype=theano.config.floatX)
        return data, mask

    # create some dummy data
    contexts, context_mask = make_data_and_mask([['a', 'a', 'a', 'b'],
                                                 ['b', 'a', 'b', 'a'],
                                                 ['a', 'b', 'b', 'b']])
    questions, question_mask = make_data_and_mask([['a', 'a'], ['b', 'a'],
                                                   ['a', 'b']])
    answer_begins = [0, 0, 1]
    answer_ends = [1, 2, 2]

    for coattention in [False, True]:
        qam = ExtractiveQAModel(vocab=vocab, dim=10, emb_dim=10,
                                num_input_words=10, compose_type='sum',
                                use_definitions=False,
                                reuse_word_embeddings=False,
                                def_reader='LSTMReadDefinitions',
                                coattention=coattention,
                                weights_init=Uniform(width=0.1),
                                biases_init=Uniform(width=0.1))
        qam.initialize()
        costs = qam.apply(tensor.as_tensor_variable(contexts), context_mask,
                          tensor.as_tensor_variable(questions), question_mask,
                          tensor.as_tensor_variable(answer_begins),
                          tensor.as_tensor_variable(answer_ends))
        assert costs.eval().shape == (3, )
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Builds a vocabulary")
    parser.add_argument("--top-k", type=int,
                        help="Top most frequent words to leave")
    parser.add_argument("--keys-only", action='store_true',
                        help="Build vocab of all keys")
    parser.add_argument("--with-keys", action='store_true',
                        help="Count keys and words in definitions")
    parser.add_argument("dictionary", help="Input dictionary")
    parser.add_argument("vocabulary", help="Output vocabulary")
    args = parser.parse_args()

    text = []
    if args.dictionary.endswith('.json'):
        text = collections.defaultdict(int)
    for f_name in args.dictionary.split(","):
        logging.info("Processing " + f_name)
        assert f_name.endswith('.json')
        logging.info(
            "Will build the vocabulary from definitions in a dictionary")
        dict_ = json.load(open(f_name, "r"))
        for word, list_defs in dict_.items():
            if args.keys_only or args.with_keys:
                text[word] += 1
            if not args.keys_only:
                for def_ in list_defs:
                    for def_word in def_:
                        text[def_word] += 1

    logging.info("{} words".format(len(text)))
    vocab = Vocabulary.build(text, args.top_k)
    vocab.save(args.vocabulary)
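# As the calls scattered through this repo suggest, Vocabulary.build appears to
# accept several kinds of input: a word -> count mapping (as in the script
# above), a plain iterable of tokens, or an h5py text dataset. The toy sketch
# below only illustrates the count-mapping form; the counts and the output
# file name are made up.
def _example_build_vocab():
    counts = {u'cat': 10, u'dog': 7, u'axolotl': 1}
    vocab = Vocabulary.build(counts, top_k=2)  # keep the 2 most frequent words
    vocab.save("/tmp/example_vocab.txt")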
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Digitizes text and adds a vocab")
    parser.add_argument("vocab", help="Vocabulary")
    parser.add_argument("--type", choices=("squad", "snli"), default='squad',
                        help="What kind of data should be converted")
    parser.add_argument("h5", help="Destination")
    args = parser.parse_args()

    vocab = Vocabulary(args.vocab)
    if args.type == 'squad':
        add_words_ids_to_squad(args.h5, vocab)
    elif args.type == 'snli':
        add_word_ids_to_snli(args.h5, vocab)
    else:
        raise NotImplementedError()
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("List undefined tokens")
    parser.add_argument("vocabulary", help="Input vocabulary")
    parser.add_argument("dictionary", help="Input dictionary")
    args = parser.parse_args()

    undefined_tokens_and_freqs = []
    vocab = Vocabulary(args.vocabulary)
    with open(args.dictionary) as f:
        dictionary = json.load(f)

    for w, c in zip(vocab.words, vocab.frequencies):
        if w not in dictionary:
            undefined_tokens_and_freqs.append((w, c))

    undefined_tokens_and_freqs = sorted(undefined_tokens_and_freqs,
                                        key=lambda x: x[1],
                                        reverse=True)
    for w, c in undefined_tokens_and_freqs:
        print(w)
def main(): logging.basicConfig( level='INFO', format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") parser = argparse.ArgumentParser( "Converts GLOVE embeddings to a numpy array") parser.add_argument("txt", help="GLOVE data in txt format") parser.add_argument("npy", help="Destination for npy format") parser.add_argument("vocab_out", help="output vocabulary") parser.add_argument("--vocab", default="", help="Performs subsetting based on passed vocab") # OOV handling parser.add_argument("--try-lowercase", action="store_true", help="Try lowercase") args = parser.parse_args() if args.vocab == "": raise NotImplementedError("Not implemented") embeddings = [] dim = None with open(args.txt) as src: for i, line in enumerate(src): tokens = line.strip().split() features = map(float, tokens[1:]) dim = len(features) embeddings.append(features) if i and i % 100000 == 0: print i embeddings = [[0.] * dim] * len( Vocabulary.SPECIAL_TOKEN_MAP) + embeddings np.save(args.npy, embeddings) else: vocab = Vocabulary(args.vocab) print('Computing GloVe') # Loading embeddings_index = {} f = open(args.txt) print('Reading GloVe file') for line in f: values = line.split(' ') word = values[0] dim = len(values[1:]) coefs = np.asarray(values[1:], dtype='float32') embeddings_index[word] = coefs f.close() # Embedding matrix: larger than necessary f_out = open(args.vocab_out, 'w') n_specials = len(Vocabulary.SPECIAL_TOKEN_MAP.values()) embedding_matrix = np.zeros((vocab.size() + n_specials, dim)) for special_token in Vocabulary.SPECIAL_TOKEN_MAP.values(): line = '<' + special_token + '>' + " 0\n" f_out.write(line.encode('utf-8')) i = n_specials #i = 0 for word, count in zip(vocab.words, vocab.frequencies): embedding_vector = embeddings_index.get(word) if args.try_lowercase and not isinstance(embedding_vector, np.ndarray): embedding_vector = embeddings_index.get(word.lower()) in_glove = embedding_vector is not None last_comp = None if in_glove: last_comp = embedding_vector[-1] #print "i: {}, word {}, count {}, in_glove {}, last {}".format(i, word, count, in_glove, last_comp) if in_glove: try: embedding_matrix[i] = embedding_vector except: print "error idx", i # else, null vector #print "writing:", line, i line = word + " " + str(count) + "\n" f_out.write(line.encode('utf-8')) i += 1 if i and i % 10000 == 0: print "i:", i f_out.close() np.save(args.npy, embedding_matrix[:i])
kwargs_emb = {"normalize": args.normalize, "lowercase": args.lowercase} else: kwargs_emb = {"dim": 300, "vocab_size": args.vocab_size} emb = load_embedding(args.emb_filename, format=args.emb_format, load_kwargs=kwargs_emb, lowercase_if_OOV=False, lemmatize_if_OOV=False, normalize=False) model_name = args.emb_filename.split('/')[-2] # TODO: need to feed dim and vocab_size? or useless? vocab_defs, dict_, test_dict = None, None, None if is_dict_embedding: vocab_defs = Vocabulary(vocab_defs_fname) fname_dict = os.path.join(args.root_dicts, "all.json") fname_test_dict = os.path.join(args.root_dicts, "test.json") dict_ = load_dict(fname_dict) test_dict = load_dict(fname_test_dict) dirname = os.path.join('results/figures/', model_name) if not os.path.exists(dirname): os.makedirs(dirname) diff_ranks = [] for name, data in datasets: print "dataset:", name print_coverage(data, emb) print "" rank_model, rank_truth = compute_ranks(data, emb)
def initialize_data_and_model(config):
    c = config
    fuel_path = fuel.config.data_path[0]

    vocab_main = None
    if c['vocab_path']:
        vocab_main = Vocabulary(
            os.path.join(fuel.config.data_path[0], c['vocab_path']))
    data = LanguageModellingData(c['data_path'], c['layout'],
                                 vocab=vocab_main)
    vocab_main = data.vocab

    retrieval = None
    if c['dict_path'] and not c['embedding_path']:
        dict_full_path = os.path.join(fuel_path, c['dict_path'])
        dict_ = Dictionary(dict_full_path)
        logger.debug("Loaded dictionary with {} entries".format(
            dict_.num_entries()))

        vocab_def = data.vocab
        if c['dict_vocab_path']:
            if not c['standalone_def_lookup']:
                raise ValueError(
                    "Standalone def lookup mandatory with separate vocabs")
            vocab_def = Vocabulary(
                os.path.join(fuel.config.data_path[0], c['dict_vocab_path']))

        retrieval = Retrieval(vocab_main, dict_, c['max_def_length'],
                              with_too_long_defs='drop',
                              exclude_top_k=c['exclude_top_k'],
                              vocab_def=vocab_def,
                              max_def_per_word=c['max_def_per_word'])
    elif c['embedding_path']:
        assert c['dict_path']
        emb_full_path = os.path.join(fuel_path, c['embedding_path'])
        embedding_matrix = numpy.load(emb_full_path)
        dict_full_path = os.path.join(fuel_path, c['dict_path'])
        dict_ = Dictionary(dict_full_path)  # should be key=value=word
        if not c['standalone_def_lookup']:
            raise ValueError("Standalone def lookup mandatory")

        vocab_def = data.vocab
        if c['dict_vocab_path']:
            vocab_def = Vocabulary(
                os.path.join(fuel.config.data_path[0], c['dict_vocab_path']))

        retrieval = Retrieval(data.vocab, dict_,
                              max_def_length=1,
                              with_too_long_defs='drop',
                              exclude_top_k=c['exclude_top_k'],
                              vocab_def=vocab_def,
                              max_def_per_word=1,
                              add_bod_eod=False)

    lm = LanguageModel(c['emb_dim'], c['emb_def_dim'], c['dim'],
                       c['num_input_words'], c['def_num_input_words'],
                       c['num_output_words'], data.vocab, retrieval,
                       c['def_reader'], c['standalone_def_lookup'],
                       c['standalone_def_rnn'],
                       c['disregard_word_embeddings'], c['compose_type'],
                       very_rare_threshold=c['very_rare_threshold'],
                       cache_size=c['cache_size'],
                       weights_init=Uniform(width=0.1),
                       biases_init=Constant(0.))
    lm.initialize()

    if c['embedding_path']:
        lm.set_def_embeddings(embedding_matrix)
        logger.debug("Embeddings loaded")

    return (data, lm, retrieval)
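# The initializer above supports two mutually exclusive retrieval set-ups:
# (a) dict_path set and embedding_path empty -> real definitions are retrieved
#     and read by a definition reader; (b) embedding_path set (dict_path must
#     also point at a key=value=word identity dictionary) -> each "definition"
#     is a single token looked up in a pre-trained embedding matrix.
# The two fragments below only illustrate which keys each branch reads; the
# values and file names are invented, and the real defaults live in the
# project's config files.
_EXAMPLE_LM_CONFIG_WITH_DICT = {
    'dict_path': 'dict_wordnet.json', 'embedding_path': '',
    'dict_vocab_path': '', 'standalone_def_lookup': False,
    'max_def_length': 30, 'exclude_top_k': 10000, 'max_def_per_word': 20,
}
_EXAMPLE_LM_CONFIG_WITH_PRETRAINED = {
    'dict_path': 'dict_identity.json',  # key=value=word, see comment above
    'embedding_path': 'glove_w_specials.npy',
    'dict_vocab_path': 'glove_vocab.txt', 'standalone_def_lookup': True,
    'exclude_top_k': 10000,
}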
def main():
    parser = argparse.ArgumentParser(
        "Generate synthetic data and write it to files")
    parser.add_argument("path", type=str, help="Output directory")
    parser.add_argument("n_primes", type=int, help="# of primes")
    parser.add_argument("n_non_primes", type=int, help="# of non-primes")
    parser.add_argument("features_size", type=int, help="Features size")
    parser.add_argument("markov_order", type=int, help="Markov order")
    parser.add_argument("n_sentences", type=int, help="# sentences")
    parser.add_argument("pc_train", type=float, help="% train sentences")
    parser.add_argument("pc_valid", type=float, help="% valid sentences")
    parser.add_argument("sample_temperature", type=float, default=1.0,
                        help="Sampling temperature")
    parser.add_argument("min_sentence_len", type=int, default=6)
    parser.add_argument("max_sentence_len", type=int, default=20)
    parser.add_argument("min_def_len", type=int, default=6)
    parser.add_argument("max_def_len", type=int, default=20)
    args = parser.parse_args()

    print "Number of sentences:", args.n_sentences
    assert 0 < args.pc_train + args.pc_valid < 1
    assert not os.path.exists(args.path)
    os.makedirs(args.path)
    args.pc_test = 1 - (args.pc_train + args.pc_valid)

    gen = FakeTextGenerator(args.n_primes, args.n_non_primes,
                            args.features_size, args.markov_order,
                            args.sample_temperature, args.min_def_len,
                            args.max_def_len)
    data = gen.create_corpus(args.n_sentences, args.min_sentence_len,
                             args.max_sentence_len, args.pc_train,
                             args.pc_valid)
    train_data, valid_data, test_data = data

    concat_sentences = lambda sentences: [' '.join(s) for s in sentences]
    train_data = concat_sentences(train_data)
    test_data = concat_sentences(test_data)
    valid_data = concat_sentences(valid_data)

    all_data = train_data + valid_data + test_data
    with temporary_content_path('\n'.join(all_data)) as path:
        vocab = Vocabulary.build(path, sort_by='lexicographical')
        vocab.save(os.path.join(args.path, "vocab.txt"))

    dict_json = json.dumps(gen.dictionary)
    write_data(os.path.join(args.path, "dict.json"), dict_json)
    write_data(os.path.join(args.path, "train.txt"), '\n'.join(train_data))
    write_data(os.path.join(args.path, "valid.txt"), '\n'.join(valid_data))
    write_data(os.path.join(args.path, "test.txt"), '\n'.join(test_data))

    args_json = json.dumps(vars(args), indent=4, sort_keys=True)
    write_data(os.path.join(args.path, "params.json"), args_json)
    write_data(os.path.join(args.path, "generator.p"), pickle.dumps(gen))
def main(): logging.basicConfig( level='INFO', format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") parser = argparse.ArgumentParser("Builds a dictionary") parser.add_argument("--target_coverage_text", type=float, help="Target coverage of text") parser.add_argument("--target_coverage_def", type=float, help="Target coverage of def") parser.add_argument("--vocab_text", type=str, help="Vocabulary of text") parser.add_argument("--vocab_def", type=str, help="Vocabulary of def") parser.add_argument("--step_size", type=int, default=30) parser.add_argument("--target", type=str, default="Final path") args = parser.parse_args() vocab_text = Vocabulary(args.vocab_text) vocab_def = Vocabulary(args.vocab_def) # Greedy solution is optimal # I also approximate greedy a bit by adding word by word. This is fine, vocabs are big target_coverage_text = np.sum( vocab_text.frequencies) * args.target_coverage_text target_coverage_def = np.sum( vocab_def.frequencies) * args.target_coverage_def current_vocab = set([]) # Of course I could use binsearch for id in range(vocab_def.size() / args.step_size): for id2 in range(args.step_size): current_vocab.add(vocab_def.id_to_word(id * args.step_size + id2)) current_vocab_mod = set(current_vocab) current_coverage_def = 0.0 current_coverage_text = 0.0 for w in current_vocab_mod: current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id( w)] current_coverage_text += vocab_text.frequencies[ vocab_text.word_to_id(w)] id_text = 0 while current_coverage_text < target_coverage_text: while vocab_text.id_to_word(id_text) in current_vocab_mod: id_text += 1 if id_text >= vocab_text.size(): raise Exception("Perhaps try lower target coverage") w = vocab_text.id_to_word(id_text) current_vocab_mod.add(w) current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id( w)] current_coverage_text += vocab_text.frequencies[id_text] if current_coverage_def > target_coverage_def: current_vocab = current_vocab_mod break print( "After adding {} words I covered {} of def and {} of text occurences" .format( len(current_vocab_mod), current_coverage_def / float(np.sum(vocab_def.frequencies)), current_coverage_text / float(np.sum(vocab_text.frequencies)))) # To be safe rechecking shortlist works current_coverage_def = 0 current_coverage_text = 0 for w in current_vocab: current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(w)] current_coverage_text += vocab_text.frequencies[vocab_text.word_to_id( w)] print( "Sanity check: after adding {} words I covered {} of def and {} of text occurences" .format(len(current_vocab), current_coverage_def / float(np.sum(vocab_def.frequencies)), current_coverage_text / float(np.sum(vocab_text.frequencies)))) vocab_result = Vocabulary.build( {word: vocab_text.word_freq(word) for word in current_vocab}) vocab_result.save(args.target)
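# The selection loop above grows a definition-vocabulary shortlist until it
# appears to cover a target fraction of both the definition corpus and the
# text corpus. The toy function below only illustrates the underlying coverage
# computation on a made-up frequency list; it does not use the project's
# Vocabulary class, and it assumes numpy is imported as np, as in the script
# above.
def _coverage_after_top_k(frequencies, k):
    """Fraction of token occurrences covered by the k most frequent words."""
    freqs = np.asarray(sorted(frequencies, reverse=True), dtype=float)
    return freqs[:k].sum() / freqs.sum()

# e.g. _coverage_after_top_k([50, 30, 15, 4, 1], 2) == 0.8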
def main(): logging.basicConfig( level='INFO', format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") parser = argparse.ArgumentParser("Builds a dictionary") parser.add_argument("--top-k", type=int, help="Top most frequent words to leave") parser.add_argument( "--vocab-text", default=None, help="Vocab corresponding to the main if text is a dictionary.") parser.add_argument( "--weight-dict-entries", action='store_true', help="Weight dict entries according to the freqs from a vocab.") parser.add_argument( "--exclude-top-k", type=int, help="Ignore definitions of a number of most frequent words") parser.add_argument( "text", help= "The text to use. Can be a text file or .h5 or a dictionary with format.json in which case you need to use --vocab-text as well." ) parser.add_argument("vocab", help="Destination") args = parser.parse_args() text = [] if args.vocab_text: text = collections.defaultdict(int) vocab_text = Vocabulary(args.vocab_text) for f_name in args.text.split(","): logging.info("Processing " + f_name) if f_name.endswith('.h5'): with h5py.File(f_name) as h5_file: if 'text' not in h5_file.keys(): print("Missing text field from " + f_name) text.extend(h5_file['text'][:]) elif f_name.endswith('.json'): logging.info( "Will build the vocabulary from definitions in a dictionary") dict_ = json.load(open(f_name, "r")) for word, list_defs in dict_.items(): text_vocab_id = vocab_text.word_to_id(word) if (text_vocab_id != vocab_text.unk and text_vocab_id < args.exclude_top_k): continue for def_ in list_defs: for def_word in def_: if args.weight_dict_entries: text[def_word] += vocab_text.word_freq(word) else: text[def_word] += 1 else: with open(f_name) as file_: def data(): for line in file_: for word in line.strip().split(): try: yield text_type(word, 'utf-8') except: print("Skipped word " + word) text.extend(data()) logging.info("{} words".format(len(text))) vocab = Vocabulary.build(text, args.top_k) vocab.save(args.vocab)
if __name__ == "__main__": parser = argparse.ArgumentParser( description= 'Write the list of words in embeddings but not in dict vocabulary') parser.add_argument('embeddings', type=str) parser.add_argument('vocabulary', type=str) parser.add_argument('vocabulary_counts', type=str) parser.add_argument('absent_words', type=str) args = parser.parse_args() print "read first file {}".format(args.embeddings) embeddings = read_embedding_file(args.embeddings) print "read vocabulary file {}".format(args.vocabulary) vocabulary = Vocabulary(args.vocabulary) print "read vocabulary for counts estimation file {}".format( args.vocabulary_counts) vocabulary_counts = Vocabulary(args.vocabulary_counts) vocabulary = set(vocabulary.words) # faster lookup absent_in_vocab = set( [w for w in embeddings.keys() if w not in vocabulary]) print("Number of absent words in vocab", len(absent_in_vocab)) absent_in_vocab = sorted(list(absent_in_vocab), key=lambda w: vocabulary_counts.word_freq(w), reverse=True) with open(args.absent_words, 'w') as f: for w in absent_in_vocab:
def _initialize_simple_model_and_data(c): if c['vocab']: vocab = Vocabulary(c['vocab']) else: vocab = None # Load data data = SNLIData(path=c['data_path'], layout=c['layout'], vocab=vocab) if vocab is None: vocab = data.vocab if c.get('vocab_text', ''): vocab_text = Vocabulary(c['vocab_text']) else: vocab_text = vocab # Dict if c['dict_path']: dict = Dictionary(c['dict_path']) logging.info("Loaded dict with {} entries".format(dict.num_entries())) if len(c['vocab_def']): retrieval_vocab = Vocabulary(c['vocab_def']) else: retrieval_vocab = data.vocab retrieval = Retrieval(vocab_text=vocab_text, vocab_def=retrieval_vocab, dictionary=dict, max_def_length=c['max_def_length'], with_too_long_defs=c['with_too_long_defs'], exclude_top_k=c['exclude_top_k'], max_def_per_word=c['max_def_per_word']) data.set_retrieval(retrieval) else: retrieval = None dict = None retrieval_vocab = None def_emb_dim = c.get('def_emb_dim', 0) if c.get('def_emb_dim', 0) > 0 else c['emb_dim'] def_emb_translate_dim = c.get( 'def_emb_translate_dim', 0) if c.get('def_emb_translate_dim', 0) > 0 else def_emb_dim # Initialize simple = NLISimple( # Baseline arguments emb_dim=c['emb_dim'], vocab=data.vocab, encoder=c['encoder'], dropout=c['dropout'], num_input_words=c['num_input_words'], mlp_dim=c['mlp_dim'], # Dict lookup kwargs (will get refactored) translate_dim=c['translate_dim'], retrieval=retrieval, compose_type=c['compose_type'], reader_type=c['reader_type'], disregard_word_embeddings=c['disregard_word_embeddings'], def_vocab=retrieval_vocab, def_emb_dim=c['def_emb_dim'], combiner_dropout=c['combiner_dropout'], share_def_lookup=c['share_def_lookup'], combiner_dropout_type=c['combiner_dropout_type'], combiner_bn=c['combiner_bn'], combiner_gating=c['combiner_gating'], combiner_shortcut=c['combiner_shortcut'], combiner_reader_translate=c['combiner_reader_translate'], def_dim=c['def_dim'], num_input_def_words=c['num_input_def_words'], def_emb_translate_dim=def_emb_translate_dim, # Init weights_init=GlorotUniform(), biases_init=Constant(0.0)) simple.push_initialization_config() if c['encoder'] == 'rnn': simple._rnn_encoder.weights_init = Uniform(std=0.1) simple.initialize() if c.get('embedding_def_path', ''): embeddings = np.load(c['embedding_def_path']) simple.set_def_embeddings(embeddings.astype(theano.config.floatX)) if c['embedding_path']: embeddings = np.load(c['embedding_path']) simple.set_embeddings(embeddings.astype(theano.config.floatX)) return simple, data, dict, retrieval, vocab
def train_snli_model(new_training_job, config, save_path, params, fast_start, fuel_server, seed, model='simple'): if config['exclude_top_k'] > config['num_input_words'] and config[ 'num_input_words'] > 0: raise Exception("Some words have neither word nor def embedding") c = config logger = configure_logger(name="snli_baseline_training", log_file=os.path.join(save_path, "log.txt")) if not os.path.exists(save_path): logger.info("Start a new job") os.mkdir(save_path) else: logger.info("Continue an existing job") with open(os.path.join(save_path, "cmd.txt"), "w") as f: f.write(" ".join(sys.argv)) # Make data paths nice for path in [ 'dict_path', 'embedding_def_path', 'embedding_path', 'vocab', 'vocab_def', 'vocab_text' ]: if c.get(path, ''): if not os.path.isabs(c[path]): c[path] = os.path.join(fuel.config.data_path[0], c[path]) main_loop_path = os.path.join(save_path, 'main_loop.tar') main_loop_best_val_path = os.path.join(save_path, 'main_loop_best_val.tar') stream_path = os.path.join(save_path, 'stream.pkl') # Save config to save_path json.dump(config, open(os.path.join(save_path, "config.json"), "w")) if model == 'simple': nli_model, data, used_dict, used_retrieval, _ = _initialize_simple_model_and_data( c) elif model == 'esim': nli_model, data, used_dict, used_retrieval, _ = _initialize_esim_model_and_data( c) else: raise NotImplementedError() # Compute cost s1, s2 = T.lmatrix('sentence1'), T.lmatrix('sentence2') if c['dict_path']: assert os.path.exists(c['dict_path']) s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix( 'sentence2_def_map') def_mask = T.fmatrix("def_mask") defs = T.lmatrix("defs") else: s1_def_map, s2_def_map = None, None def_mask = None defs = None s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask') y = T.ivector('label') cg = {} for train_phase in [True, False]: # NOTE: Please don't change outputs of cg if train_phase: with batch_normalization(nli_model): pred = nli_model.apply(s1, s1_mask, s2, s2_mask, def_mask=def_mask, defs=defs, s1_def_map=s1_def_map, s2_def_map=s2_def_map, train_phase=train_phase) else: pred = nli_model.apply(s1, s1_mask, s2, s2_mask, def_mask=def_mask, defs=defs, s1_def_map=s1_def_map, s2_def_map=s2_def_map, train_phase=train_phase) cost = CategoricalCrossEntropy().apply(y.flatten(), pred) error_rate = MisclassificationRate().apply(y.flatten(), pred) cg[train_phase] = ComputationGraph([cost, error_rate]) # Weight decay (TODO: Make it less bug prone) if model == 'simple': weights_to_decay = VariableFilter( bricks=[dense for dense, relu, bn in nli_model._mlp], roles=[WEIGHT])(cg[True].variables) weight_decay = np.float32(c['l2']) * sum( (w**2).sum() for w in weights_to_decay) elif model == 'esim': weight_decay = 0.0 else: raise NotImplementedError() final_cost = cg[True].outputs[0] + weight_decay final_cost.name = 'final_cost' # Add updates for population parameters if c.get("bn", True): pop_updates = get_batch_normalization_updates(cg[True]) extra_updates = [(p, m * 0.1 + p * (1 - 0.1)) for p, m in pop_updates] else: pop_updates = [] extra_updates = [] if params: logger.debug("Load parameters from {}".format(params)) with open(params) as src: loaded_params = load_parameters(src) cg[True].set_parameter_values(loaded_params) for param, m in pop_updates: param.set_value(loaded_params[get_brick( param).get_hierarchical_name(param)]) if os.path.exists(os.path.join(save_path, "main_loop.tar")): logger.warning("Manually loading BN stats :(") with open(os.path.join(save_path, "main_loop.tar")) as src: loaded_params = 
load_parameters(src) for param, m in pop_updates: param.set_value( loaded_params[get_brick(param).get_hierarchical_name(param)]) if theano.config.compute_test_value != 'off': test_value_data = next( data.get_stream('train', batch_size=4).get_epoch_iterator()) s1.tag.test_value = test_value_data[0] s1_mask.tag.test_value = test_value_data[1] s2.tag.test_value = test_value_data[2] s2_mask.tag.test_value = test_value_data[3] y.tag.test_value = test_value_data[4] # Freeze embeddings if not c['train_emb']: frozen_params = [ p for E in nli_model.get_embeddings_lookups() for p in E.parameters ] train_params = [p for p in cg[True].parameters] assert len(set(frozen_params) & set(train_params)) > 0 else: frozen_params = [] if not c.get('train_def_emb', 1): frozen_params_def = [ p for E in nli_model.get_def_embeddings_lookups() for p in E.parameters ] train_params = [p for p in cg[True].parameters] assert len(set(frozen_params_def) & set(train_params)) > 0 frozen_params += frozen_params_def train_params = [p for p in cg[True].parameters if p not in frozen_params] train_params_keys = [ get_brick(p).get_hierarchical_name(p) for p in train_params ] # Optimizer algorithm = GradientDescent(cost=final_cost, on_unused_sources='ignore', parameters=train_params, step_rule=Adam(learning_rate=c['lr'])) algorithm.add_updates(extra_updates) m = Model(final_cost) parameters = m.get_parameter_dict() # Blocks version mismatch logger.info("Trainable parameters" + "\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted(train_params_keys)], width=120)) logger.info("# of parameters {}".format( sum([ np.prod(parameters[key].get_value().shape) for key in sorted(train_params_keys) ]))) ### Monitored args ### train_monitored_vars = [final_cost] + cg[True].outputs monitored_vars = cg[False].outputs val_acc = monitored_vars[1] to_monitor_names = [ 'def_unk_ratio', 's1_merged_input_rootmean2', 's1_def_mean_rootmean2', 's1_gate_rootmean2', 's1_compose_gate_rootmean2' ] for k in to_monitor_names: train_v, valid_v = VariableFilter(name=k)( cg[True]), VariableFilter(name=k)(cg[False]) if len(train_v): logger.info("Adding {} tracking".format(k)) train_monitored_vars.append(train_v[0]) monitored_vars.append(valid_v[0]) else: logger.warning("Didnt find {} in cg".format(k)) if c['monitor_parameters']: for name in train_params_keys: param = parameters[name] num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements grad_norm = algorithm.gradients[param].norm(2) / num_elements step_norm = algorithm.steps[param].norm(2) / num_elements stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' train_monitored_vars.append(stats) regular_training_stream = data.get_stream('train', batch_size=c['batch_size'], seed=seed) if fuel_server: # the port will be configured by the StartFuelServer extension training_stream = ServerDataStream( sources=regular_training_stream.sources, hwm=100, produces_examples=regular_training_stream.produces_examples) else: training_stream = regular_training_stream ### Build extensions ### extensions = [ # Load(main_loop_path, load_iteration_state=True, load_log=True) # .set_conditions(before_training=not new_training_job), StartFuelServer(regular_training_stream, stream_path, hwm=100, script_path=os.path.join( os.path.dirname(__file__), "../bin/start_fuel_server.py"), before_training=fuel_server), Timing(every_n_batches=c['mon_freq']), ProgressBar(), RetrievalPrintStats(retrieval=used_retrieval, 
every_n_batches=c['mon_freq_valid'], before_training=not fast_start), Timestamp(), TrainingDataMonitoring(train_monitored_vars, prefix="train", every_n_batches=c['mon_freq']), ] if c['layout'] == 'snli': validation = DataStreamMonitoring(monitored_vars, data.get_stream('valid', batch_size=14, seed=seed), before_training=not fast_start, on_resumption=True, after_training=True, every_n_batches=c['mon_freq_valid'], prefix='valid') extensions.append(validation) elif c['layout'] == 'mnli': validation = DataStreamMonitoring(monitored_vars, data.get_stream('valid_matched', batch_size=14, seed=seed), every_n_batches=c['mon_freq_valid'], on_resumption=True, after_training=True, prefix='valid_matched') validation_mismatched = DataStreamMonitoring( monitored_vars, data.get_stream('valid_mismatched', batch_size=14, seed=seed), every_n_batches=c['mon_freq_valid'], before_training=not fast_start, on_resumption=True, after_training=True, prefix='valid_mismatched') extensions.extend([validation, validation_mismatched]) else: raise NotImplementedError() # Similarity trackers for embeddings if len(c.get('vocab_def', '')): retrieval_vocab = Vocabulary(c['vocab_def']) else: retrieval_vocab = data.vocab retrieval_all = Retrieval(vocab_text=retrieval_vocab, dictionary=used_dict, max_def_length=c['max_def_length'], exclude_top_k=0, max_def_per_word=c['max_def_per_word']) for name in [ 's1_word_embeddings', 's1_dict_word_embeddings', 's1_translated_word_embeddings' ]: variables = VariableFilter(name=name)(cg[False]) if len(variables): s1_emb = variables[0] logger.info("Adding similarity tracking for " + name) # A bit sloppy about downcast if "dict" in name: embedder = construct_dict_embedder(theano.function( [s1, defs, def_mask, s1_def_map], s1_emb, allow_input_downcast=True), vocab=data.vocab, retrieval=retrieval_all) extensions.append( SimilarityWordEmbeddingEval( embedder=embedder, prefix=name, every_n_batches=c['mon_freq_valid'], before_training=not fast_start)) else: embedder = construct_embedder(theano.function( [s1], s1_emb, allow_input_downcast=True), vocab=data.vocab) extensions.append( SimilarityWordEmbeddingEval( embedder=embedder, prefix=name, every_n_batches=c['mon_freq_valid'], before_training=not fast_start)) track_the_best = TrackTheBest(validation.record_name(val_acc), before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start, every_n_batches=c['mon_freq_valid'], choose_best=min) extensions.append(track_the_best) # Special care for serializing embeddings if len(c.get('embedding_path', '')) or len(c.get('embedding_def_path', '')): extensions.insert( 0, LoadNoUnpickling(main_loop_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) extensions.append( Checkpoint(main_loop_path, parameters=train_params + [p for p, m in pop_updates], save_main_loop=False, save_separately=['log', 'iteration_state'], before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start).add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best.notification_name), (main_loop_best_val_path, ))) else: extensions.insert( 0, Load(main_loop_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) extensions.append( Checkpoint(main_loop_path, parameters=cg[True].parameters + [p for p, m in pop_updates], before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start).add_condition( ['after_batch', 
'after_epoch'], OnLogRecord(track_the_best.notification_name), (main_loop_best_val_path, ))) extensions.extend([ DumpCSVSummaries(save_path, every_n_batches=c['mon_freq_valid'], after_training=True), DumpTensorflowSummaries(save_path, after_epoch=True, every_n_batches=c['mon_freq_valid'], after_training=True), Printing(every_n_batches=c['mon_freq_valid']), PrintMessage(msg="save_path={}".format(save_path), every_n_batches=c['mon_freq']), FinishAfter(after_n_batches=c['n_batches']).add_condition( ['after_batch'], OnLogStatusExceed('iterations_done', c['n_batches'])) ]) logger.info(extensions) ### Run training ### if "VISDOM_SERVER" in os.environ: print("Running visdom server") ret = subprocess.Popen([ os.path.join(os.path.dirname(__file__), "../visdom_plotter.py"), "--visdom-server={}".format(os.environ['VISDOM_SERVER']), "--folder={}".format(save_path) ]) time.sleep(0.1) if ret.returncode is not None: raise Exception() atexit.register(lambda: os.kill(ret.pid, signal.SIGINT)) model = Model(cost) for p, m in pop_updates: model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p main_loop = MainLoop(algorithm, training_stream, model=model, extensions=extensions) assert os.path.exists(save_path) main_loop.run()
def evaluate(c, tar_path, *args, **kwargs): """ Performs rudimentary evaluation of SNLI/MNLI run * Runs on valid and test given network * Saves all predictions * Saves embedding matrix * Saves results.json and predictions.csv """ # Load and configure model = kwargs['model'] assert c.endswith("json") c = json.load(open(c)) # Very ugly absolute path fix ABS_PATHS = [ "data/", "/mnt/users/jastrzebski/local/dict_based_learning/data/", "/data/cf9ffb48-61bd-40dc-a011-b2e7e5acfd72/" ] from six import string_types for abs_path in ABS_PATHS: for k in c: if isinstance(c[k], string_types): if c[k].startswith(abs_path): c[k] = c[k][len(abs_path):] # Make data paths nice for path in [ 'dict_path', 'embedding_def_path', 'embedding_path', 'vocab', 'vocab_def', 'vocab_text' ]: if c.get(path, ''): if not os.path.isabs(c[path]): c[path] = os.path.join(fuel.config.data_path[0], c[path]) logging.info("Updating config with " + str(kwargs)) c.update(**kwargs) # NOTE: This assures we don't miss crucial definition for some def heavy words # usually it is a good idea c['max_def_per_word'] = c['max_def_per_word'] * 2 assert tar_path.endswith("tar") dest_path = os.path.dirname(tar_path) prefix = os.path.splitext(os.path.basename(tar_path))[0] s1_decoded, s2_decoded = T.lmatrix('sentence1'), T.lmatrix('sentence2') if c['dict_path']: s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix( 'sentence2_def_map') def_mask = T.fmatrix("def_mask") defs = T.lmatrix("defs") else: s1_def_map, s2_def_map = None, None def_mask = None defs = None s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask') if model == 'simple': model, data, used_dict, used_retrieval, used_vocab = _initialize_simple_model_and_data( c) elif model == 'esim': model, data, used_dict, used_retrieval, used_vocab = _initialize_esim_model_and_data( c) else: raise NotImplementedError() pred = model.apply(s1_decoded, s1_mask, s2_decoded, s2_mask, def_mask=def_mask, defs=defs, s1_def_map=s1_def_map, s2_def_map=s2_def_map, train_phase=False) cg = ComputationGraph([pred]) if c.get("bn", True): bn_params = [ p for p in VariableFilter(bricks=[BatchNormalization])(cg) if hasattr(p, "set_value") ] else: bn_params = [] # Load model model = Model(cg.outputs) parameters = model.get_parameter_dict() # Blocks version mismatch logging.info( "Trainable parameters" + "\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted([ get_brick(param).get_hierarchical_name(param) for param in cg.parameters ])], width=120)) logging.info("# of parameters {}".format( sum([ np.prod(parameters[key].get_value().shape) for key in sorted([ get_brick(param).get_hierarchical_name(param) for param in cg.parameters ]) ]))) with open(tar_path) as src: params = load_parameters(src) loaded_params_set = set(params.keys()) model_params_set = set([ get_brick(param).get_hierarchical_name(param) for param in cg.parameters ]) logging.info("Loaded extra parameters") logging.info(loaded_params_set - model_params_set) logging.info("Missing parameters") logging.info(model_params_set - loaded_params_set) model.set_parameter_values(params) if c.get("bn", True): logging.info("Loading " + str([ get_brick(param).get_hierarchical_name(param) for param in bn_params ])) for param in bn_params: param.set_value( params[get_brick(param).get_hierarchical_name(param)]) for p in bn_params: model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p # Read logs logs = pd.read_csv(os.path.join(dest_path, "logs.csv")) best_val_acc = 
logs['valid_misclassificationrate_apply_error_rate'].min() logging.info("Best measured valid acc: " + str(best_val_acc)) # NOTE(kudkudak): We need this to have comparable mean rank and embedding scores reference_vocab = Vocabulary( os.path.join(fuel.config.data_path[0], c['data_path'], 'vocab.txt')) vocab_all = Vocabulary( os.path.join( fuel.config.data_path[0], c['data_path'], 'vocab_all.txt')) # Can include OOV words, which is interesting retrieval_all = Retrieval(vocab_text=used_vocab, dictionary=used_dict, max_def_length=c['max_def_length'], exclude_top_k=0, max_def_per_word=c['max_def_per_word']) # logging.info("Calculating dict and word embeddings for vocab.txt and vocab_all.txt") # for name in ['s1_word_embeddings', 's1_dict_word_embeddings']: # variables = VariableFilter(name=name)(cg) # if len(variables): # s1_emb = variables[0] # # A bit sloppy about downcast # # if "dict" in name: # embedder = construct_dict_embedder( # theano.function([s1_decoded, defs, def_mask, s1_def_map], s1_emb, allow_input_downcast=True), # vocab=data.vocab, retrieval=retrieval_all) # else: # embedder = construct_embedder(theano.function([s1_decoded], s1_emb, allow_input_downcast=True), # vocab=data.vocab) # # for v_name, v in [("vocab_all", vocab_all), ("vocab", reference_vocab)]: # logging.info("Calculating {} embeddings for {}".format(name, v_name)) # Predict predict_fnc = theano.function(cg.inputs, pred) results = {} batch_size = 14 for subset in ['valid', 'test']: logging.info("Predicting on " + subset) stream = data.get_stream(subset, batch_size=batch_size, seed=778) it = stream.get_epoch_iterator() rows = [] for ex in tqdm.tqdm(it, total=10000 / batch_size): ex = dict(zip(stream.sources, ex)) inp = [ex[v.name] for v in cg.inputs] prob = predict_fnc(*inp) label_pred = np.argmax(prob, axis=1) for id in range(len(prob)): s1_decoded = used_vocab.decode(ex['sentence1'][id]).split() s2_decoded = used_vocab.decode(ex['sentence2'][id]).split() assert used_vocab == data.vocab s1_decoded = [ '*' + w + '*' if used_vocab.word_to_id(w) > c['num_input_words'] else w for w in s1_decoded ] s2_decoded = [ '*' + w + '*' if used_vocab.word_to_id(w) > c['num_input_words'] else w for w in s2_decoded ] # Different difficulty metrics # text_unk_percentage s1_no_pad = [w for w in ex['sentence1'][id] if w != 0] s2_no_pad = [w for w in ex['sentence2'][id] if w != 0] s1_unk_percentage = sum([ 1. for w in s1_no_pad if w == used_vocab.unk ]) / len(s1_no_pad) s2_unk_percentage = sum([ 1. 
for w in s1_no_pad if w == used_vocab.unk ]) / len(s2_no_pad) # mean freq word s1_mean_freq = np.mean([ 0 if w == data.vocab.unk else used_vocab._id_to_freq[w] for w in s1_no_pad ]) s2_mean_freq = np.mean([ 0 if w == data.vocab.unk else used_vocab._id_to_freq[w] for w in s2_no_pad ]) # mean rank word (UNK is max rank) # NOTE(kudkudak): Will break if we reindex unk between vocabs :P s1_mean_rank = np.mean([ reference_vocab.size() if reference_vocab.word_to_id( used_vocab.id_to_word(w)) == reference_vocab.unk else reference_vocab.word_to_id(used_vocab.id_to_word(w)) for w in s1_no_pad ]) s2_mean_rank = np.mean([ reference_vocab.size() if reference_vocab.word_to_id( used_vocab.id_to_word(w)) == reference_vocab.unk else reference_vocab.word_to_id(used_vocab.id_to_word(w)) for w in s2_no_pad ]) rows.append({ "pred": label_pred[id], "true_label": ex['label'][id], "s1": ' '.join(s1_decoded), "s2": ' '.join(s2_decoded), "s1_unk_percentage": s1_unk_percentage, "s2_unk_percentage": s2_unk_percentage, "s1_mean_freq": s1_mean_freq, "s2_mean_freq": s2_mean_freq, "s1_mean_rank": s1_mean_rank, "s2_mean_rank": s2_mean_rank, "p_0": prob[id, 0], "p_1": prob[id, 1], "p_2": prob[id, 2] }) preds = pd.DataFrame(rows, columns=rows[0].keys()) preds.to_csv( os.path.join(dest_path, prefix + '_predictions_{}.csv'.format(subset))) results[subset] = {} results[subset]['misclassification'] = 1 - np.mean( preds.pred == preds.true_label) if subset == "valid" and np.abs( (1 - np.mean(preds.pred == preds.true_label)) - best_val_acc) > 0.001: logging.error("!!!") logging.error( "Found different best_val_acc. Probably due to changed specification of the model class." ) logging.error("Discrepancy {}".format( (1 - np.mean(preds.pred == preds.true_label)) - best_val_acc)) logging.error("!!!") logging.info(results) json.dump(results, open(os.path.join(dest_path, prefix + '_results.json'), "w"))
def _initialize_esim_model_and_data(c): if c['vocab']: vocab = Vocabulary(c['vocab']) else: vocab = None # Load data data = SNLIData(path=c['data_path'], layout=c['layout'], vocab=vocab) if vocab is None: vocab = data.vocab if c.get('vocab_text', ''): vocab_text = Vocabulary(c['vocab_text']) else: vocab_text = vocab # def_emb_dim defaults to emb_dim # def_emb_translate_dim default to def_emb_dim def_emb_dim = c.get('def_emb_dim', 0) if c.get('def_emb_dim', 0) > 0 else c['emb_dim'] def_emb_translate_dim = c.get( 'def_emb_translate_dim', 0) if c.get('def_emb_translate_dim', 0) > 0 else def_emb_dim # Dict if c['dict_path']: dict = Dictionary(c['dict_path']) logging.info("Loaded dict with {} entries".format(dict.num_entries())) if len(c['vocab_def']): retrieval_vocab = Vocabulary(c['vocab_def']) else: retrieval_vocab = data.vocab retrieval = Retrieval(vocab_text=vocab_text, vocab_def=retrieval_vocab, dictionary=dict, max_def_length=c['max_def_length'], with_too_long_defs=c['with_too_long_defs'], exclude_top_k=c['exclude_top_k'], max_def_per_word=c['max_def_per_word']) data.set_retrieval(retrieval) num_input_def_words = c['num_input_def_words'] if c[ 'num_input_def_words'] > 0 else c['num_input_words'] # TODO: Refactor lookup passing to reader. Very incoventient ATM if c['reader_type'] == "rnn": def_reader = LSTMReadDefinitions( num_input_words=num_input_def_words, weights_init=Uniform(width=0.1), biases_init=Constant(0.), dim=c['def_dim'], emb_dim=def_emb_dim, vocab=vocab, lookup=None) elif c['reader_type'] == "mean": def_reader = MeanPoolReadDefinitions( num_input_words=num_input_def_words, translate=c['combiner_reader_translate'], vocab=vocab, weights_init=Uniform(width=0.1), lookup=None, dim=def_emb_translate_dim, biases_init=Constant(0.), emb_dim=def_emb_dim) else: raise NotImplementedError() def_combiner = MeanPoolCombiner( dim=c['def_dim'], emb_dim=def_emb_translate_dim, dropout=c['combiner_dropout'], dropout_type=c['combiner_dropout_type'], def_word_gating=c['combiner_gating'], shortcut_unk_and_excluded=c['combiner_shortcut'], num_input_words=num_input_def_words, exclude_top_k=c['exclude_top_k'], vocab=vocab, compose_type=c['compose_type'], weights_init=Uniform(width=0.1), biases_init=Constant(0.)) else: retrieval = None dict = None def_combiner = None def_reader = None # Initialize simple = ESIM( # Baseline arguments emb_dim=c['emb_dim'], vocab=data.vocab, encoder=c['encoder'], dropout=c['dropout'], def_emb_translate_dim=def_emb_translate_dim, num_input_words=c['num_input_words'], def_dim=c['def_dim'], dim=c['dim'], bn=c.get('bn', True), def_combiner=def_combiner, def_reader=def_reader, # Init weights_init=GlorotUniform(), biases_init=Constant(0.0)) simple.push_initialization_config() # TODO: Not sure anymore why we do that if c['encoder'] == 'bilstm': for enc in simple._rnns: enc.weights_init = Uniform(std=0.1) simple.initialize() if c['embedding_path']: embeddings = np.load(c['embedding_path']) simple.set_embeddings(embeddings.astype(theano.config.floatX)) if c.get('embedding_def_path', ''): embeddings = np.load(c['embedding_def_path']) simple.set_def_embeddings(embeddings.astype(theano.config.floatX)) return simple, data, dict, retrieval, vocab
def main(): logging.basicConfig( level='INFO', format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") parser = argparse.ArgumentParser( "Converts GLOVE embeddings to a numpy array") parser.add_argument("txt", help="GLOVE data in txt format") parser.add_argument("npy", help="Destination for npy format") parser.add_argument("--vocab", default="", help="Performs subsetting based on passed vocab") parser.add_argument("--dict", default="", help="Performs subsetting based on passed dict") # OOV handling parser.add_argument("--try-lemma", action="store_true", help="Try lemma") parser.add_argument("--try-lowercase", default="", help="Try lowercase") args = parser.parse_args() if args.dict and not args.vocab: # usually you'd want to use both, I suppose raise NotImplementedError("Not implemented") if args.try_lemma or args.try_lowercase: # TODO(kudkudak): Implement raise NotImplementedError("Not implemented yet") if args.vocab == "": embeddings = [] dim = None with open(args.txt) as src: for i, line in enumerate(src): tokens = line.strip().split() features = map(float, tokens[1:]) dim = len(features) embeddings.append(features) if i and i % 100000 == 0: print i embeddings = [[0.] * dim] * len( Vocabulary.SPECIAL_TOKEN_MAP) + embeddings numpy.save(args.npy, embeddings) else: vocab = Vocabulary(args.vocab) if args.dict: dict_ = Dictionary(args.dict) print('Computing GloVe') # Loading embeddings_index = {} f = open(args.txt) print('Reading GloVe file') for line in f: values = line.split(' ') word = values[0] dim = len(values[1:]) coefs = numpy.asarray(values[1:], dtype='float32') embeddings_index[word] = coefs f.close() # Embedding matrix print('Reading GloVe file') embedding_matrix = numpy.zeros((vocab.size(), dim)) for word in vocab._word_to_id: embedding_vector = embeddings_index.get(word) in_glove = embedding_vector is not None if args.dict: in_dict = len(dict_.get_definitions(word)) > 0 if in_glove and (not args.dict or in_dict): # words not found in embedding index will be all-zeros. embedding_matrix[vocab.word_to_id(word)] = embedding_vector else: if not in_glove: print(u'Missing from GloVe: {}'.format(word)) else: print(u'Missing from dict: {}'.format(word)) numpy.save(args.npy, embedding_matrix)
    def vocab(self):
        if not self._vocab:
            logger.debug("Loading default vocab")
            self._vocab = Vocabulary(os.path.join(self._path, "vocab.txt"))
        return self._vocab
def main(): logging.basicConfig( level='DEBUG', format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") parser = argparse.ArgumentParser("Crawl definitions for a vocabulary") parser.add_argument("--api_key", help="Wordnik API key to use") # NOTE(kudkudak): wordnik has useCanonical which tries to do stuff like Cats -> cat # but it doesn't really work well parser.add_argument("--just-lemmas", action="store_true", help="Just use the lemmas as the definition") parser.add_argument("--just-lowercase", action="store_true", help="Just lowercase as the definition") parser.add_argument("--add-lemma-defs", action="store_true", help="Add definitions from lemmas") parser.add_argument("--add-lower-defs", action="store_true", help="Add definitions from lowercase") parser.add_argument("--add-lower-lemma-defs", action="store_true", help="Add definitions from lowercase version of word and lemmas") parser.add_argument("--add-dictname-to-defs", action="store_true", help="Adds dictionary name to definition") parser.add_argument("--wordnet", action="store_true", help="Crawl WordNet") parser.add_argument("--add-identity", action="store_true", help="Identity mapping dictionary") parser.add_argument("--add-spelling-if-no-def", action="store_true", help="Add spelling if there is no definition") parser.add_argument("--add-spelling", action="store_true", help="Always add spelling") parser.add_argument("--crawl-also-lowercase", default=0, type=int, help="If true will crawl also lower-cased version") parser.add_argument("--crawl-also-lemma", default=0, type=int, help="If true will crawl also lemma version") parser.add_argument("--remove-out-of-vocabulary", action="store_true", help="Remove entries of dict which do not appear in vocab") parser.add_argument("vocab", help="Vocabulary path") parser.add_argument("dict", help="Destination path for the dictionary") args = parser.parse_args() vocab = Vocabulary(args.vocab) dict_ = Dictionary(args.dict) try: if args.api_key: port = get_free_port() popen = start_corenlp(port) url = "http://localhost:{}".format(port) dict_.crawl_wordnik( vocab, args.api_key, url, crawl_also_lowercase=args.crawl_also_lowercase, crawl_also_lemma=args.crawl_also_lemma) elif args.wordnet: port = get_free_port() popen = start_corenlp(port) url = "http://localhost:{}".format(port) dict_.crawl_wordnet(url) elif args.add_lemma_defs or args.add_lower_lemma_defs: # NOTE(kudkudak): A bit ugly, but this covers case where # we have Cats which do not get added lemmas # from cat without try_lower=True dict_.add_from_lemma_definitions(vocab, try_lower=args.add_lower_lemma_defs) elif args.add_lower_defs: dict_.add_from_lowercase_definitions(vocab) elif args.add_dictname_to_defs: dict_.add_dictname_to_defs(vocab) elif args.add_spelling_if_no_def: dict_.add_spelling(vocab) elif args.add_spelling: dict_.add_spelling(vocab, only_if_no_def=False) elif args.just_lemmas: dict_.crawl_lemmas(vocab) elif args.just_lowercase: dict_.crawl_lowercase(vocab) elif args.add_identity: dict_.add_identity_mapping(vocab) elif args.remove_out_of_vocabulary: dict_.remove_out_of_vocabulary(vocab) else: raise ValueError("don't know what to do") finally: if 'popen' in locals() and popen and popen.returncode is None: popen.kill()
def main(): parser = argparse.ArgumentParser( "Analyze coverage of either a dictionary or pretrained embeddings on a given vocab." ) parser.add_argument("--dict", default="", help="Dictionary.") parser.add_argument( "--embedding", default="", help= "Path to embeddings. Can either be a npy file or a raw glove txt file." ) parser.add_argument( "--top_k", type=int, default=0, help= "Optional, provide statistics for excluding top_k words from source (either dict or embedding)" ) parser.add_argument("--step_size", type=int, help="Report each", default=10000) parser.add_argument("--uncovered", help="Destination for uncovered files") parser.add_argument("vocab", help="Vocabulary") args = parser.parse_args() assert (args.vocab.endswith(".txt")) vocab = Vocabulary(args.vocab) words = vocab.words freqs = numpy.array(vocab.frequencies) total = float(freqs.sum()) coverage = numpy.cumsum(freqs) / total print("Cumulative distribution:") for i in range(args.step_size, args.step_size * (len(freqs) / args.step_size), args.step_size): print(i, coverage[i] * 100) if not args.dict and not args.embedding: return uncovered_file = io.open('/dev/null', 'w') if args.uncovered: uncovered_file = io.open(args.uncovered, 'w', encoding='utf-8') if args.dict and args.top_k: print("Analysing coverage of dict of text") n_covered = 0 n_covered_by_lowercasing = 0 if args.dict: source_name = "dictionary" dict_ = Dictionary(args.dict) print("Dictionary has {} entries".format(dict_.num_entries())) n_more_def_than_1 = 0 for i in range(args.top_k, len(freqs)): if len(dict_.get_definitions(words[i])) > 1: n_more_def_than_1 += freqs[i] if dict_.get_definitions(words[i]): n_covered += freqs[i] elif dict_.get_definitions(words[i].lower()): n_covered_by_lowercasing += freqs[i] elif args.embedding: source_name = "glove embeddings" # Loading (note: now only supports GloVe format) word_set = set([]) if args.embedding.endswith(".txt"): with open(args.embedding) as f: for line in f: values = line.split(' ') word = values[0] word_set.add(word) f.close() elif args.embedding.endswith(".npy"): print( "Warning: assuming that embeddings from .npy file are ordered according to the same vocabulary file as the one passed (using pack_glove --vocab vocab_passed_here)" ) emb_matrix = numpy.load(args.embedding) for i, emb in enumerate(emb_matrix): if not numpy.all(emb == 0): word_set.add(words[i]) print("Glove embeddings has {} entries".format(len(word_set))) for i in range(args.top_k, len(freqs)): if words[i] in word_set: n_covered += freqs[i] elif words[i].lower() in word_set: n_covered_by_lowercasing += freqs[i] else: raise NotImplementedError() print("Analysing coverage of " + source_name) if args.top_k: print("The first " + str(args.top_k) + " ranked words are covered by word embeddings.") print("This amounts to " + str(100 * coverage[args.top_k - 1]) + "% of occurences.") else: print("Case when no word embeddings are used (args.top_k=0). 
" + source_name + " provides all embeddings") print(source_name + " covers {} % of total occurences".format(100 * n_covered / total)) print( "Querying not found words as lowercased words additionally covers {}% of total occurences" .format(100 * n_covered_by_lowercasing / total)) if args.top_k: n_not_covered_by_embs = total * (1 - coverage[args.top_k - 1]) print( source_name + " covers additional {}% of occurences not covered by word embeddings" .format(100 * n_covered / n_not_covered_by_embs)) print( "Querying not found words as lowercased words additionally covers {}% of occurences not covered by word embeddings" .format(100 * n_covered_by_lowercasing / n_not_covered_by_embs)) if args.dict: print( "Occurences of dictionary defs with >1 def not covered by word embeddings: {}%" .format(100 * n_more_def_than_1 / n_not_covered_by_embs)) uncovered_file.close()