Example 1
    # read target vocabulary (one word per line) into a set;
    # requires `import codecs` at module level
    vocab = set()
    if options.vocab:
        with codecs.open(options.vocab, "r", "utf-8") as f:
            vocab.update(line.strip() for line in f)

    # read embeddings file
    if options.w2v_format:
        words, embs = read_text_embs(options.vectors)
    else:
        words, embs = read_pickle_embs(options.vectors)
    dim = len(embs[0])
    word_to_ix = {w: i for (i, w) in enumerate(words)}

    # every embedding (except the UNK tokens) becomes a training instance;
    # in_vocab counts how many target-vocabulary words already have embeddings
    in_vocab = 0
    for word, emb in zip(words, embs):
        if word == POLYGLOT_UNK or word == W2V_UNK: continue
        if word in vocab:
            in_vocab += 1
        training_instances.append(Instance(charseq(word, c2i), emb))
    training_char_count = len(c2i)
    print "Total in Embeddings vocabulary:", len(words)
    print "Training set character count: ", training_char_count

    # Test set: vocabulary words with no pre-trained embedding;
    # the mimick model will infer embeddings for these
    if len(vocab) > 0:
        total = len(vocab)
        for v in vocab:
            if v not in words:
                test_instances.append(
                    Instance(charseq(v, c2i), np.array([0.0] * dim)))
        print "Total Number of output words:", total
        print "Total in Training Vocabulary:", in_vocab
        print "Percentage in-vocab:", in_vocab / total
        print "Total character count: ", len(c2i)
Example 2
    training_instances = dataset["training_instances"]
    test_instances = dataset["test_instances"]
    populate_test_insts_from_vocab = len(test_instances) == 0
    emb_dim = len(training_instances[0].word_emb)

    # Load words to write
    vocab_words = {}
    if populate_test_insts_from_vocab:
        # a set makes the repeated `vw not in train_words` checks O(1)
        train_words = set(wordify(w, i2c) for w in training_instances)
    with codecs.open(options.vocab, "r", "utf-8") as vocab_file:
        for vw in vocab_file.readlines():
            vw = vw.strip()
            vocab_words[vw] = np.zeros(emb_dim)
            if populate_test_insts_from_vocab and vw not in train_words:
                test_instances.append(
                    Instance(charseq(vw, c2i), np.zeros(emb_dim)))

    if populate_test_insts_from_vocab:
        # rebuild i2c in case charseq added new characters to c2i
        i2c = {i: c for c, i in c2i.items()}

    if not options.cnn:
        model = LSTMMimick(c2i, options.num_lstm_layers, options.char_dim,
                           options.hidden_dim, emb_dim)
    else:
        model = CNNMimick(c2i, options.num_conv_layers, options.char_dim,
                          options.hidden_dim, options.window_width,
                          options.pooling_maxk, options.w_stride, emb_dim)

    trainer = dy.MomentumSGDTrainer(model.model, options.learning_rate, 0.9)
    root_logger.info("Training Algorithm: {}".format(type(trainer)))
Example 3
    training_instances = dataset["training_instances"]
    test_instances = dataset["test_instances"]
    populate_test_insts_from_vocab = len(test_instances) == 0
    emb_dim = len(training_instances[0].word_emb)

    # Load words to write
    vocab_words = {}
    if populate_test_insts_from_vocab:
        # a set makes the repeated `vw not in train_words` checks O(1)
        train_words = set(wordify(w, i2c) for w in training_instances)
    for filename in options.vocab:
        with codecs.open(filename, "r", "utf-8") as vocab_file:
            for vw in vocab_file.readlines():
                vw = vw.strip()
                if vw in vocab_words: continue
                vocab_words[vw] = np.zeros(emb_dim)
                if populate_test_insts_from_vocab and vw not in train_words:
                    test_instances.append(Instance(charseq(vw, c2i), np.zeros(emb_dim)))

    if populate_test_insts_from_vocab:
        # rebuild i2c in case charseq added new characters to c2i
        i2c = {i: c for c, i in c2i.items()}

    if not options.cnn:
        model = LSTMMimick(c2i, options.num_lstm_layers, options.char_dim, options.hidden_dim, emb_dim)
    else:
        model = CNNMimick(c2i, options.num_conv_layers, options.char_dim,
                          options.hidden_dim, options.window_width,
                          options.pooling_maxk, options.w_stride, emb_dim)

    trainer = dy.MomentumSGDTrainer(model.model, options.learning_rate, 0.9)
    root_logger.info("Training Algorithm: {}".format(type(trainer)))

    root_logger.info("Number training instances: {}".format(len(training_instances)))