def test_language_model():
    """Build a LanguageModel with dictionary retrieval on a toy vocabulary,
    print the per-word perplexities and check the definition-embedding mask."""
    with temporary_content_path(TEST_VOCAB) as path:
        vocab = Vocabulary(path)
    with temporary_content_path(TEST_DICT_JSON, suffix=".json") as path:
        dict_ = Dictionary(path)

    floatX = theano.config.floatX

    def make_data_and_mask(data):
        data = [[str2vec(s, 3) for s in row] for row in data]
        data = np.array(data)
        mask = np.ones((data.shape[0], data.shape[1]), dtype=floatX)
        return data, mask

    words_val, mask_val = make_data_and_mask([['p', 'e', 'a'], ['a', 'e', 'p']])
    mask_val[1, 2] = 0
    print "data:"
    print words_val
    print "mask:"
    print mask_val
    mask_def_emb_val = np.asarray([[0, 1], [0, 0]])

    # With the dictionary
    retrieval = Retrieval(vocab, dict_, exclude_top_k=7)
    lm = LanguageModel(7, 5, vocab.size(), vocab.size(),
                       vocab=vocab,
                       retrieval=retrieval,
                       compose_type='transform_and_sum',
                       weights_init=Uniform(width=0.1),
                       biases_init=Uniform(width=0.1))
    lm.initialize()

    words = tensor.ltensor3('words')
    mask = tensor.matrix('mask', dtype=floatX)
    costs = lm.apply(words, mask)
    cg = ComputationGraph(costs)

    def_mean, = VariableFilter(name='_dict_word_embeddings')(cg)
    def_mean_f = theano.function([words], def_mean)

    perplexities = VariableFilter(name_regex='perplexity.*')(cg)
    mask_def_emb, = VariableFilter(name='mask_def_emb')(cg)

    perplexities_f = theano.function([words, mask], perplexities)
    perplexities_v = perplexities_f(words_val, mask_val)
    mask_emb_f = theano.function([words, mask], mask_def_emb)
    mask_def_v = mask_emb_f(words_val, mask_val)

    for v, p in zip(perplexities_v, perplexities):
        print p.name, ":", v

    assert np.allclose(mask_def_v, mask_def_emb_val)
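# How this test might be run (a sketch; the module path
# tests/test_language_model.py is an assumption, not stated above):
#
#   python -m pytest tests/test_language_model.py -s
#
# or simply call test_language_model() from an interpreter; it only relies on
# the in-repo TEST_VOCAB / TEST_DICT_JSON fixtures, no external data.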
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Converts GLOVE embeddings to a numpy array")
    parser.add_argument("txt", help="GLOVE data in txt format")
    parser.add_argument("npy", help="Destination for npy format")
    parser.add_argument("vocab_out", help="output vocabulary")
    parser.add_argument("--vocab", default="",
                        help="Performs subsetting based on passed vocab")
    # OOV handling
    parser.add_argument("--try-lowercase", action="store_true",
                        help="Try lowercase")

    args = parser.parse_args()

    if args.vocab == "":
        raise NotImplementedError("Not implemented")
        # NOTE: unreachable; conversion without a vocabulary is disabled
        # by the raise above.
        embeddings = []
        dim = None
        with open(args.txt) as src:
            for i, line in enumerate(src):
                tokens = line.strip().split()
                features = map(float, tokens[1:])
                dim = len(features)
                embeddings.append(features)
                if i and i % 100000 == 0:
                    print i
        embeddings = [[0.] * dim] * len(
            Vocabulary.SPECIAL_TOKEN_MAP) + embeddings
        np.save(args.npy, embeddings)
    else:
        vocab = Vocabulary(args.vocab)
        print('Computing GloVe')

        # Load the GloVe text file into a word -> vector index
        embeddings_index = {}
        f = open(args.txt)
        print('Reading GloVe file')
        for line in f:
            values = line.split(' ')
            word = values[0]
            dim = len(values[1:])
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()

        # Embedding matrix: larger than necessary
        f_out = open(args.vocab_out, 'w')
        n_specials = len(Vocabulary.SPECIAL_TOKEN_MAP.values())
        embedding_matrix = np.zeros((vocab.size() + n_specials, dim))

        # Special tokens come first and keep zero vectors
        for special_token in Vocabulary.SPECIAL_TOKEN_MAP.values():
            line = '<' + special_token + '>' + " 0\n"
            f_out.write(line.encode('utf-8'))

        i = n_specials
        for word, count in zip(vocab.words, vocab.frequencies):
            embedding_vector = embeddings_index.get(word)
            if args.try_lowercase and not isinstance(embedding_vector,
                                                     np.ndarray):
                embedding_vector = embeddings_index.get(word.lower())

            in_glove = embedding_vector is not None
            if in_glove:
                try:
                    embedding_matrix[i] = embedding_vector
                except:
                    print "error idx", i
            # else: leave the null vector

            line = word + " " + str(count) + "\n"
            f_out.write(line.encode('utf-8'))
            i += 1
            if i and i % 10000 == 0:
                print "i:", i

        f_out.close()
        np.save(args.npy, embedding_matrix[:i])
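# A hypothetical invocation of the converter above (the script name
# pack_glove_vocab.py and the file paths are assumptions; the flags match the
# argparse definition in main()):
#
#   python pack_glove_vocab.py glove.840B.300d.txt glove_snli.npy vocab_glove.txt \
#       --vocab data/snli/vocab.txt --try-lowercase
#
# The resulting .npy holds one zero row per special token followed by one row
# per vocabulary word, and vocab_out lists those words with their counts.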
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser("Builds a dictionary")
    parser.add_argument("--target_coverage_text", type=float,
                        help="Target coverage of text")
    parser.add_argument("--target_coverage_def", type=float,
                        help="Target coverage of def")
    parser.add_argument("--vocab_text", type=str, help="Vocabulary of text")
    parser.add_argument("--vocab_def", type=str, help="Vocabulary of def")
    parser.add_argument("--step_size", type=int, default=30)
    parser.add_argument("--target", type=str, default="Final path")
    args = parser.parse_args()

    vocab_text = Vocabulary(args.vocab_text)
    vocab_def = Vocabulary(args.vocab_def)

    # The greedy solution is optimal. Adding words in chunks of step_size is a
    # further approximation, which is fine because the vocabularies are big.
    target_coverage_text = np.sum(
        vocab_text.frequencies) * args.target_coverage_text
    target_coverage_def = np.sum(
        vocab_def.frequencies) * args.target_coverage_def
    current_vocab = set([])

    # Of course binary search could be used instead of this linear scan
    for id in range(vocab_def.size() / args.step_size):
        # Take the next step_size most frequent definition words
        for id2 in range(args.step_size):
            current_vocab.add(vocab_def.id_to_word(id * args.step_size + id2))

        current_vocab_mod = set(current_vocab)

        current_coverage_def = 0.0
        current_coverage_text = 0.0
        for w in current_vocab_mod:
            current_coverage_def += vocab_def.frequencies[
                vocab_def.word_to_id(w)]
            current_coverage_text += vocab_text.frequencies[
                vocab_text.word_to_id(w)]

        # Then add text words until the text coverage target is met
        id_text = 0
        while current_coverage_text < target_coverage_text:
            while vocab_text.id_to_word(id_text) in current_vocab_mod:
                id_text += 1
                if id_text >= vocab_text.size():
                    raise Exception("Perhaps try lower target coverage")
            w = vocab_text.id_to_word(id_text)
            current_vocab_mod.add(w)
            current_coverage_def += vocab_def.frequencies[
                vocab_def.word_to_id(w)]
            current_coverage_text += vocab_text.frequencies[id_text]

        if current_coverage_def > target_coverage_def:
            current_vocab = current_vocab_mod
            break

    print(
        "After adding {} words I covered {} of def and {} of text occurrences"
        .format(
            len(current_vocab_mod),
            current_coverage_def / float(np.sum(vocab_def.frequencies)),
            current_coverage_text / float(np.sum(vocab_text.frequencies))))

    # To be safe, recheck that the shortlist works
    current_coverage_def = 0
    current_coverage_text = 0
    for w in current_vocab:
        current_coverage_def += vocab_def.frequencies[vocab_def.word_to_id(w)]
        current_coverage_text += vocab_text.frequencies[
            vocab_text.word_to_id(w)]

    print(
        "Sanity check: after adding {} words I covered {} of def and {} of text occurrences"
        .format(
            len(current_vocab),
            current_coverage_def / float(np.sum(vocab_def.frequencies)),
            current_coverage_text / float(np.sum(vocab_text.frequencies))))

    vocab_result = Vocabulary.build(
        {word: vocab_text.word_freq(word) for word in current_vocab})
    vocab_result.save(args.target)
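# A hypothetical invocation of the shortlist builder above (the script name
# build_shortlist_vocab.py and the paths are assumptions; the flags match the
# argparse definition in main()):
#
#   python build_shortlist_vocab.py --vocab_text data/snli/vocab.txt \
#       --vocab_def data/snli/vocab_def.txt \
#       --target_coverage_text 0.95 --target_coverage_def 0.95 \
#       --step_size 30 --target data/snli/vocab_shortlist.txt
#
# The saved vocabulary covers at least the requested fraction of both the text
# and the definition corpora, built greedily from the most frequent words.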
def evaluate(c, tar_path, *args, **kwargs):
    """
    Performs rudimentary evaluation of a SNLI/MNLI run

    * Runs on valid and test given network
    * Saves all predictions
    * Saves embedding matrix
    * Saves results.json and predictions.csv
    """
    # Load and configure
    model = kwargs['model']
    assert c.endswith("json")
    c = json.load(open(c))

    # Very ugly absolute path fix
    ABS_PATHS = [
        "data/",
        "/mnt/users/jastrzebski/local/dict_based_learning/data/",
        "/data/cf9ffb48-61bd-40dc-a011-b2e7e5acfd72/"
    ]
    from six import string_types
    for abs_path in ABS_PATHS:
        for k in c:
            if isinstance(c[k], string_types):
                if c[k].startswith(abs_path):
                    c[k] = c[k][len(abs_path):]

    # Make data paths nice
    for path in [
            'dict_path', 'embedding_def_path', 'embedding_path', 'vocab',
            'vocab_def', 'vocab_text'
    ]:
        if c.get(path, ''):
            if not os.path.isabs(c[path]):
                c[path] = os.path.join(fuel.config.data_path[0], c[path])

    logging.info("Updating config with " + str(kwargs))
    c.update(**kwargs)

    # NOTE: This ensures we don't miss a crucial definition for
    # definition-heavy words; usually it is a good idea.
    c['max_def_per_word'] = c['max_def_per_word'] * 2

    assert tar_path.endswith("tar")
    dest_path = os.path.dirname(tar_path)
    prefix = os.path.splitext(os.path.basename(tar_path))[0]

    s1_decoded, s2_decoded = T.lmatrix('sentence1'), T.lmatrix('sentence2')

    if c['dict_path']:
        s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix(
            'sentence2_def_map')
        def_mask = T.fmatrix("def_mask")
        defs = T.lmatrix("defs")
    else:
        s1_def_map, s2_def_map = None, None
        def_mask = None
        defs = None

    s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask')

    if model == 'simple':
        model, data, used_dict, used_retrieval, used_vocab = \
            _initialize_simple_model_and_data(c)
    elif model == 'esim':
        model, data, used_dict, used_retrieval, used_vocab = \
            _initialize_esim_model_and_data(c)
    else:
        raise NotImplementedError()

    pred = model.apply(
        s1_decoded, s1_mask, s2_decoded, s2_mask,
        def_mask=def_mask, defs=defs,
        s1_def_map=s1_def_map, s2_def_map=s2_def_map,
        train_phase=False)
    cg = ComputationGraph([pred])
    if c.get("bn", True):
        bn_params = [
            p for p in VariableFilter(bricks=[BatchNormalization])(cg)
            if hasattr(p, "set_value")
        ]
    else:
        bn_params = []

    # Load model
    model = Model(cg.outputs)
    parameters = model.get_parameter_dict()  # Blocks version mismatch
    logging.info(
        "Trainable parameters" + "\n" +
        pprint.pformat(
            [(key, parameters[key].get_value().shape)
             for key in sorted([
                 get_brick(param).get_hierarchical_name(param)
                 for param in cg.parameters
             ])],
            width=120))
    logging.info("# of parameters {}".format(
        sum([
            np.prod(parameters[key].get_value().shape)
            for key in sorted([
                get_brick(param).get_hierarchical_name(param)
                for param in cg.parameters
            ])
        ])))

    with open(tar_path) as src:
        params = load_parameters(src)

        loaded_params_set = set(params.keys())
        model_params_set = set([
            get_brick(param).get_hierarchical_name(param)
            for param in cg.parameters
        ])

        logging.info("Loaded extra parameters")
        logging.info(loaded_params_set - model_params_set)
        logging.info("Missing parameters")
        logging.info(model_params_set - loaded_params_set)
    model.set_parameter_values(params)

    if c.get("bn", True):
        logging.info("Loading " + str([
            get_brick(param).get_hierarchical_name(param)
            for param in bn_params
        ]))
        for param in bn_params:
            param.set_value(
                params[get_brick(param).get_hierarchical_name(param)])
        for p in bn_params:
            model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p

    # Read logs
    logs = pd.read_csv(os.path.join(dest_path, "logs.csv"))
    best_val_acc = logs['valid_misclassificationrate_apply_error_rate'].min()
    logging.info("Best measured valid acc: " + str(best_val_acc))

    # NOTE(kudkudak): We need this to have comparable mean rank and embedding scores
    reference_vocab = Vocabulary(
        os.path.join(fuel.config.data_path[0], c['data_path'], 'vocab.txt'))
    vocab_all = Vocabulary(
        os.path.join(fuel.config.data_path[0], c['data_path'],
                     'vocab_all.txt'))  # Can include OOV words, which is interesting
    retrieval_all = Retrieval(
        vocab_text=used_vocab,
        dictionary=used_dict,
        max_def_length=c['max_def_length'],
        exclude_top_k=0,
        max_def_per_word=c['max_def_per_word'])

    # NOTE: a commented-out block here originally computed dict and plain word
    # embeddings for vocab.txt and vocab_all.txt (via construct_dict_embedder /
    # construct_embedder on the s1_word_embeddings / s1_dict_word_embeddings
    # variables); it is left disabled.

    # Predict
    predict_fnc = theano.function(cg.inputs, pred)
    results = {}
    batch_size = 14
    for subset in ['valid', 'test']:
        logging.info("Predicting on " + subset)
        stream = data.get_stream(subset, batch_size=batch_size, seed=778)
        it = stream.get_epoch_iterator()
        rows = []
        for ex in tqdm.tqdm(it, total=10000 / batch_size):
            ex = dict(zip(stream.sources, ex))
            inp = [ex[v.name] for v in cg.inputs]
            prob = predict_fnc(*inp)
            label_pred = np.argmax(prob, axis=1)

            for id in range(len(prob)):
                s1_decoded = used_vocab.decode(ex['sentence1'][id]).split()
                s2_decoded = used_vocab.decode(ex['sentence2'][id]).split()

                assert used_vocab == data.vocab

                # Mark words outside the input shortlist with asterisks
                s1_decoded = [
                    '*' + w + '*'
                    if used_vocab.word_to_id(w) > c['num_input_words'] else w
                    for w in s1_decoded
                ]
                s2_decoded = [
                    '*' + w + '*'
                    if used_vocab.word_to_id(w) > c['num_input_words'] else w
                    for w in s2_decoded
                ]

                # Different difficulty metrics

                # text_unk_percentage
                s1_no_pad = [w for w in ex['sentence1'][id] if w != 0]
                s2_no_pad = [w for w in ex['sentence2'][id] if w != 0]

                s1_unk_percentage = sum([
                    1. for w in s1_no_pad if w == used_vocab.unk
                ]) / len(s1_no_pad)
                s2_unk_percentage = sum([
                    1. for w in s2_no_pad if w == used_vocab.unk
                ]) / len(s2_no_pad)
                # mean freq word
                s1_mean_freq = np.mean([
                    0 if w == data.vocab.unk else used_vocab._id_to_freq[w]
                    for w in s1_no_pad
                ])
                s2_mean_freq = np.mean([
                    0 if w == data.vocab.unk else used_vocab._id_to_freq[w]
                    for w in s2_no_pad
                ])

                # mean rank word (UNK is max rank)
                # NOTE(kudkudak): Will break if we reindex unk between vocabs :P
                s1_mean_rank = np.mean([
                    reference_vocab.size()
                    if reference_vocab.word_to_id(used_vocab.id_to_word(w)) ==
                    reference_vocab.unk else
                    reference_vocab.word_to_id(used_vocab.id_to_word(w))
                    for w in s1_no_pad
                ])
                s2_mean_rank = np.mean([
                    reference_vocab.size()
                    if reference_vocab.word_to_id(used_vocab.id_to_word(w)) ==
                    reference_vocab.unk else
                    reference_vocab.word_to_id(used_vocab.id_to_word(w))
                    for w in s2_no_pad
                ])

                rows.append({
                    "pred": label_pred[id],
                    "true_label": ex['label'][id],
                    "s1": ' '.join(s1_decoded),
                    "s2": ' '.join(s2_decoded),
                    "s1_unk_percentage": s1_unk_percentage,
                    "s2_unk_percentage": s2_unk_percentage,
                    "s1_mean_freq": s1_mean_freq,
                    "s2_mean_freq": s2_mean_freq,
                    "s1_mean_rank": s1_mean_rank,
                    "s2_mean_rank": s2_mean_rank,
                    "p_0": prob[id, 0],
                    "p_1": prob[id, 1],
                    "p_2": prob[id, 2]
                })

        preds = pd.DataFrame(rows, columns=rows[0].keys())
        preds.to_csv(
            os.path.join(dest_path,
                         prefix + '_predictions_{}.csv'.format(subset)))

        results[subset] = {}
        results[subset]['misclassification'] = 1 - np.mean(
            preds.pred == preds.true_label)

        if subset == "valid" and np.abs(
                (1 - np.mean(preds.pred == preds.true_label)) -
                best_val_acc) > 0.001:
            logging.error("!!!")
            logging.error(
                "Found different best_val_acc. Probably due to changed "
                "specification of the model class.")
            logging.error("Discrepancy {}".format(
                (1 - np.mean(preds.pred == preds.true_label)) - best_val_acc))
            logging.error("!!!")

    logging.info(results)
    json.dump(results,
              open(os.path.join(dest_path, prefix + '_results.json'), "w"))
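# A hedged usage sketch: evaluate() expects a JSON config, a .tar checkpoint
# with trained parameters, and the model family passed through kwargs
# ('simple' or 'esim'). The paths below are placeholders, not files from this
# repository:
#
#   evaluate("results/esim_run/config.json", "results/esim_run/model.tar",
#            model="esim")
#
# This writes <prefix>_predictions_valid.csv, <prefix>_predictions_test.csv
# and <prefix>_results.json next to the tar file.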
def main():
    logging.basicConfig(
        level='INFO',
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Converts GLOVE embeddings to a numpy array")
    parser.add_argument("txt", help="GLOVE data in txt format")
    parser.add_argument("npy", help="Destination for npy format")
    parser.add_argument("--vocab", default="",
                        help="Performs subsetting based on passed vocab")
    parser.add_argument("--dict", default="",
                        help="Performs subsetting based on passed dict")
    # OOV handling
    parser.add_argument("--try-lemma", action="store_true", help="Try lemma")
    parser.add_argument("--try-lowercase", default="", help="Try lowercase")

    args = parser.parse_args()

    if args.dict and not args.vocab:
        # Usually you'd want to use both
        raise NotImplementedError("Not implemented")
    if args.try_lemma or args.try_lowercase:
        # TODO(kudkudak): Implement
        raise NotImplementedError("Not implemented yet")

    if args.vocab == "":
        embeddings = []
        dim = None
        with open(args.txt) as src:
            for i, line in enumerate(src):
                tokens = line.strip().split()
                features = map(float, tokens[1:])
                dim = len(features)
                embeddings.append(features)
                if i and i % 100000 == 0:
                    print i
        embeddings = [[0.] * dim] * len(
            Vocabulary.SPECIAL_TOKEN_MAP) + embeddings
        numpy.save(args.npy, embeddings)
    else:
        vocab = Vocabulary(args.vocab)
        if args.dict:
            dict_ = Dictionary(args.dict)

        print('Computing GloVe')

        # Load the GloVe text file into a word -> vector index
        embeddings_index = {}
        f = open(args.txt)
        print('Reading GloVe file')
        for line in f:
            values = line.split(' ')
            word = values[0]
            dim = len(values[1:])
            coefs = numpy.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()

        # Embedding matrix
        embedding_matrix = numpy.zeros((vocab.size(), dim))
        for word in vocab._word_to_id:
            embedding_vector = embeddings_index.get(word)
            in_glove = embedding_vector is not None
            if args.dict:
                in_dict = len(dict_.get_definitions(word)) > 0

            if in_glove and (not args.dict or in_dict):
                embedding_matrix[vocab.word_to_id(word)] = embedding_vector
            else:
                # Words not found in GloVe (or the dict) keep all-zero rows
                if not in_glove:
                    print(u'Missing from GloVe: {}'.format(word))
                else:
                    print(u'Missing from dict: {}'.format(word))

        numpy.save(args.npy, embedding_matrix)
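# A hypothetical invocation of this variant (the script name pack_glove.py and
# the paths are assumptions; the arguments match the argparse definition in
# main()):
#
#   python pack_glove.py glove.840B.300d.txt glove_subset.npy \
#       --vocab data/snli/vocab.txt --dict data/snli/dict.json
#
# With --vocab (and optionally --dict) the output matrix has vocab.size() rows
# indexed by the vocabulary's word ids; rows for words missing from GloVe or
# from the dictionary stay all-zero.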