vocab.update(line.strip() for line in f)

# Read the embeddings file (text word2vec format or pickled Polyglot format)
if options.w2v_format:
    words, embs = read_text_embs(options.vectors)
else:
    words, embs = read_pickle_embs(options.vectors)
dim = len(embs[0])
word_to_ix = {w: i for (i, w) in enumerate(words)}

# Training set: every embedded word becomes an instance; count how many
# of them also appear in the target vocabulary.
in_vocab = 0
for word, emb in zip(words, embs):
    if word == POLYGLOT_UNK or word == W2V_UNK:
        continue
    if word in vocab:
        in_vocab += 1
    training_instances.append(Instance(charseq(word, c2i), emb))
training_char_count = len(c2i)
print "Total in Embeddings vocabulary:", len(words)
print "Training set character count: ", training_char_count

# Test set: vocabulary words with no pre-trained embedding; Mimick will
# infer embeddings for these (the zero vectors are placeholders).
if len(vocab) > 0:
    total = len(vocab)
    for v in vocab:
        if v not in word_to_ix:  # dict lookup instead of scanning the words list
            test_instances.append(Instance(charseq(v, c2i), np.array([0.0] * dim)))
    print "Total Number of output words:", total
    print "Total in Training Vocabulary:", in_vocab
    print "Percentage in-vocab:", float(in_vocab) / total  # float() avoids Py2 integer division
    print "Total character count: ", len(c2i)
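# --- Hedged sketch: helpers assumed by the snippet above. ---
# `Instance`, `charseq`, and `read_text_embs` are defined elsewhere in the
# codebase; the definitions below are a minimal reconstruction of what their
# usage here implies, not the repository's exact code.
import collections
import codecs
import numpy as np

# One example: a word as a character-index sequence plus its target embedding.
Instance = collections.namedtuple("Instance", ["chars", "word_emb"])

def charseq(word, c2i):
    # Map a word to its character-index sequence, growing c2i as new
    # characters appear (this is why len(c2i) is reported as the
    # character count after dataset construction).
    ids = []
    for c in word:
        if c not in c2i:
            c2i[c] = len(c2i)
        ids.append(c2i[c])
    return ids

def read_text_embs(path):
    # Read word2vec-style text embeddings: "<word> <v1> <v2> ...",
    # optionally preceded by a "<count> <dim>" header line.
    words, embs = [], []
    with codecs.open(path, "r", "utf-8") as emb_file:
        for line in emb_file:
            parts = line.rstrip().split(" ")
            if len(parts) < 3:
                continue  # skip header / malformed lines
            words.append(parts[0])
            embs.append(np.array([float(x) for x in parts[1:]]))
    return words, embs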
training_instances = dataset["training_instances"]
test_instances = dataset["test_instances"]
populate_test_insts_from_vocab = len(test_instances) == 0
emb_dim = len(training_instances[0].word_emb)

# Load words to write
vocab_words = {}
if populate_test_insts_from_vocab:
    train_words = set(wordify(w, i2c) for w in training_instances)  # set for O(1) membership tests
for filename in options.vocab:
    with codecs.open(filename, "r", "utf-8") as vocab_file:
        for vw in vocab_file.readlines():
            vw = vw.strip()
            if vw in vocab_words:
                continue  # skip duplicates across vocab files
            vocab_words[vw] = np.zeros(emb_dim)
            if populate_test_insts_from_vocab and vw not in train_words:
                test_instances.append(Instance(charseq(vw, c2i), np.zeros(emb_dim)))
if populate_test_insts_from_vocab:
    # need to update i2c in case charseq() saw new characters
    i2c = {i: c for c, i in c2i.items()}

if not options.cnn:
    model = LSTMMimick(c2i, options.num_lstm_layers, options.char_dim,
                       options.hidden_dim, emb_dim)
else:
    model = CNNMimick(c2i, options.num_conv_layers, options.char_dim, options.hidden_dim,
                      options.window_width, options.pooling_maxk, options.w_stride, emb_dim)

trainer = dy.MomentumSGDTrainer(model.model, options.learning_rate, 0.9)
root_logger.info("Training Algorithm: {}".format(type(trainer)))
root_logger.info("Number training instances: {}".format(len(training_instances)))
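# --- Hedged sketch: driving the trainer built above. ---
# A training loop presumably follows in the original file; the sketch below
# only illustrates how the MomentumSGDTrainer would typically be used with
# these instances. `predict_emb` is a hypothetical method name standing in
# for whichever LSTMMimick/CNNMimick method maps a character-index sequence
# to a predicted embedding (a DyNet expression); the squared-distance loss
# against the pre-trained embedding is likewise an assumption.
import random
import dynet as dy

def train_epoch(model, trainer, training_instances):
    random.shuffle(training_instances)
    total_loss = 0.0
    for instance in training_instances:
        dy.renew_cg()  # fresh computation graph per example
        predicted = model.predict_emb(instance.chars)
        # Squared distance between predicted and pre-trained embedding.
        loss = dy.squared_distance(predicted, dy.inputVector(instance.word_emb))
        total_loss += loss.scalar_value()  # runs the forward pass
        loss.backward()
        trainer.update()
    return total_loss / len(training_instances)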