def train(model, opt):
    for i in range(epoch):
        print("epoch {}".format(i + 1))
        start = time.time()
        total_loss = 0
        gen1 = gens.word_list(train_txt)
        gen2 = gens.batch(gens.sorted_parallel(gen1, embed_size * batch_size), batch_size)
        batchs = [b for b in gen2]
        bl = list(range(len(batchs)))
        random.shuffle(bl)
        for n, j in enumerate(bl):
            # Skip sentences that are too short for the context window.
            if window_size > len(batchs[j][0]) - 4:
                continue
            tag0 = batchs[j][:]
            tags = [[int(c) for c in a[:-1]] for a in tag0]
            batch = fill_batch([b[-1].split() for b in batchs[j]])
            batch = [convert_word(b) for b in batch]
            tags = xp.array(tags, dtype=xp.int32)
            model.zerograds()  # clear gradients accumulated by the previous batch
            accum_loss = forward(*batch, *tags, model)
            accum_loss.backward()
            opt.update()
            total_loss += accum_loss.data
        print('total_loss: {}'.format(total_loss))
        evaluate(model)
        serializers.save_npz("{}{}".format(ssweM, i), model)
        print("time: {}".format(time.time() - start))
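# fill_batch is used throughout this file but not defined here. A minimal
# sketch, assuming it pads every sequence in a batch to the length of the
# longest one; the default filler token is an assumption (the tag calls
# below pass token=-1 or token=0 explicitly):
def fill_batch(batch, token='</s>'):
    max_len = max(len(seq) for seq in batch)
    return [seq + [token] * (max_len - len(seq)) for seq in batch]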
def evaluate(model, word2id):
    c_p = 0
    correct_p = 0
    c_r = 0
    correct_r = 0
    m = model.copy()
    m.volatile = True
    gen1 = gens.word_list(test_txt)
    gen2 = gens.batch(gens.sorted_parallel(gen1, embed_size * batch_size), batch_size)
    batchs = [b for b in gen2]
    for batch in batchs:
        tag0 = batch[:]
        tags = [a[0] for a in tag0]
        batch = [b[1:] for b in batch]
        batch = fill_batch([b[-1].split() for b in batch])
        pres = forward(batch, tags, m, word2id, mode=False)
        a, b, c, d = precision_recall_f(pres, tags)
        c_p += a
        correct_p += b
        c_r += c
        correct_r += d
    precision = correct_p / c_p
    recall = correct_r / c_r
    f_measure = (2 * precision * recall) / (precision + recall)
    print('Precision:\t{}'.format(precision))
    print('Recall:\t{}'.format(recall))
    print('F-value\t{}'.format(f_measure))
def evaluate(model, word2id):
    c_p = 0
    correct_p = 0
    c_r = 0
    correct_r = 0
    m = model.copy()
    m.volatile = True
    gen1 = gens.word_list(dev_txt)
    gen2 = gens.batch(gens.sorted_parallel(gen1, embed_size * batch_size), batch_size)
    batchs = [b for b in gen2]
    for batch in batchs:
        tag0 = batch[:]
        tags = [a[:-1] for a in tag0]
        batch = [b[1:] for b in batch]
        batch = fill_batch([b[-1].split() for b in batch])
        tags = fill_batch(tags, token=-1)
        pres, cons = forward(batch, tags, m, word2id, mode=False)
        a, b, c, d = precision_recall_f(pres, tags, cons)
        c_p += a
        correct_p += b
        c_r += c
        correct_r += d
    try:
        precision = correct_p / c_p
        recall = correct_r / c_r
        # F0.5 measure: weights precision more heavily than recall.
        f_measure = (1 + 0.5**2) * precision * recall / (0.5**2 * precision + recall)
    except ZeroDivisionError:
        precision = 'nothing'
        recall = 'nothing'
        f_measure = 'nothing'
    print('Precision:\t{}'.format(precision))
    print('Recall:\t{}'.format(recall))
    print('F-value\t{}'.format(f_measure))
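# precision_recall_f is also external. A rough sketch of the counting it is
# assumed to do, given how its four return values are accumulated above
# (predicted positives, correct among predicted, gold positives, recalled gold).
# It assumes predictions have already been decoded to integer labels and that
# label 0 means "no tag"; both points are assumptions, not the original code:
def precision_recall_f(pres, tags, cons=None):
    pred_pos = correct_pred = gold_pos = recalled = 0
    for i, (pre, tag) in enumerate(zip(pres, tags)):
        for k, gold in enumerate(tag):
            if cons is not None and not cons[i][k]:
                continue  # skip padded positions
            pred = pre[k]
            if pred != 0:
                pred_pos += 1
                if pred == gold:
                    correct_pred += 1
            if gold != 0:
                gold_pos += 1
                if pred == gold:
                    recalled += 1
    return pred_pos, correct_pred, gold_pos, recalled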
def test():
    sta = pickle.load(open(state_model, "rb"))
    word2id = pickle.load(open(vocab_dict, 'rb'))
    c_p = 0
    correct_p = 0
    c_r = 0
    correct_r = 0
    predicts = []
    total_predicts = []
    total_tags = []
    total_batchs = []
    model = BLSTMw2v(sta["vocab_size"], sta["embed_size"], sta["hidden_size"], output_size)
    serializers.load_npz(load_model, model)
    if gpu >= 0:
        model.to_gpu()
    for j, batchs in enumerate(gens.batch(gens.word_list(test_txt), batch_size)):
        tag0 = batchs[:]
        tags = [a[0] for a in tag0]
        batch = fill_batch([b[-1].split() for b in batchs])
        total_batchs.append(batchs)
        pres = forward(batch, tags, model, word2id, mode=False)
        a, b, c, d = precision_recall_f(pres, tags)
        c_p += a
        correct_p += b
        c_r += c
        correct_r += d
    evaluate(model, word2id)
def train():
    id2word = {}
    word2id = {}
    word_freq = collections.defaultdict(lambda: 0)
    id2word[0] = "<unk>"
    word2id["<unk>"] = 0
    id2word[1] = "</s>"
    word2id["</s>"] = 1
    id2word[-1] = "EOS"
    word2id["EOS"] = -1
    word2id, id2word, word_list, word_freq = make_dict(train_txt, word2id, id2word, word_freq)
    word2vec_model = gensim.models.Word2Vec.load_word2vec_format(
        './entity_vector/entity_vector.model.bin', binary=True)
    model = BLSTMw2v(vocab_size, embed_size, hidden_size, output_size)
    model.initialize_embed(word2vec_model, word_list, word2id)
    if gpu >= 0:
        cuda.get_device(gpu).use()
        model.to_gpu()
    opt = O.Adam()
    opt.setup(model)
    gen1 = gens.word_list(train_txt)
    gen2 = gens.batch(gens.sorted_parallel(gen1, embed_size * batch_size), batch_size)
    batchs = [b for b in gen2]
    bl = list(range(len(batchs)))
    random.shuffle(bl)
    for i in range(epoch):
        print("epoch {}".format(i + 1))
        start = time.time()
        total_loss = 0
        for n, j in enumerate(bl):
            tag0 = batchs[j][:]
            tags = [a[0] for a in tag0]
            batch = fill_batch([b[-1].split() for b in batchs[j]])
            model.zerograds()  # clear gradients from the previous batch
            accum_loss, pres = forward(batch, tags, model, word2id, mode=True)  # compute the loss
            accum_loss.backward()  # backpropagate the error
            opt.update()  # update the parameters
            total_loss += accum_loss.data
        print("total_loss {}".format(total_loss))
        serializers.save_npz("{}{}".format(load_model, i), model)
        evaluate(model, word2id)
        print("time: {}".format(time.time() - start))
    pickle.dump(dict(word2id), open(vocab_dict, 'wb'))
    serializers.save_npz(load_model, model)
    state_d = {}
    state_d["vocab_size"] = vocab_size
    state_d["hidden_size"] = hidden_size
    state_d["embed_size"] = embed_size
    pickle.dump(state_d, open(state_model, "wb"))
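# make_dict is defined elsewhere. A minimal sketch of what it is assumed to do:
# scan the training file, count word frequencies, and extend the given
# word2id / id2word mappings. The line format (tab-separated fields with the
# sentence in the last field) and the id-assignment scheme are assumptions:
def make_dict(path, word2id, id2word, word_freq):
    word_list = []
    with open(path) as f:
        for line in f:
            for w in line.rstrip('\n').split('\t')[-1].split():
                word_freq[w] += 1
                if w not in word2id:
                    new_id = len(word2id)
                    word2id[w] = new_id
                    id2word[new_id] = w
                word_list.append(w)
    return word2id, id2word, word_list, word_freq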
def train():
    id2word = {}
    word2id = {}
    word_freq = collections.defaultdict(lambda: 0)
    id2word[0] = "<unk>"
    word2id["<unk>"] = 0
    id2word[1] = "<s>"
    word2id["<s>"] = 1
    id2word[2] = "</s>"
    word2id["</s>"] = 2
    word2id, id2word, word_list, word_freq = make_dict(train_txt, word2id, id2word, word_freq)
    model = BiLSTM(vocab_size, embed_size, hidden_size, output_size, extra_hidden_size)
    model.initialize_embed('../data/embedding.txt', word2id)
    if torch.cuda.is_available():
        model.cuda()
    opt = optim.Adam(model.parameters(), lr=0.001)
    for i in range(1, epoch + 1):
        print("\nepoch {}".format(i))
        total_loss = 0
        gen1 = gens.word_list(train_txt)
        gen2 = gens.batch(gens.sorted_parallel(gen1, embed_size * batch_size), batch_size)
        batchs = [b for b in gen2]
        bl = list(range(len(batchs)))
        random.shuffle(bl)
        for n, j in enumerate(bl):
            tag0 = batchs[j][:]
            tags = [[int(c) for c in a[:-1]] for a in tag0]
            batch = fill_batch([b[-1].split() for b in batchs[j]])
            tags = fill_batch(tags, token=0)
            opt.zero_grad()  # clear gradients from the previous batch
            accum_loss, pres, cons = forward(batch, tags, model, word2id, mode=True)
            accum_loss.backward()
            opt.step()
            total_loss += accum_loss.data[0]
        print("total_loss {}".format(total_loss))
        evaluate(model, word2id)
        torch.save(model.state_dict(), "{}{}".format(load_model, i))
    torch.save(model.state_dict(), load_model)
    with open(vocab_dict, mode='wb') as f:
        pickle.dump(word2id, f)
def test():
    sta = pickle.load(open(state_model, "rb"))
    word2id = pickle.load(open(vocab_dict, 'rb'))
    c_p = 0
    correct_p = 0
    c_r = 0
    correct_r = 0
    predicts = []
    total_predicts = []
    total_tags = []
    total_batchs = []
    model = BLSTMw2v(sta["vocab_size"], sta["embed_size"], sta["hidden_size"], output_size)
    serializers.load_npz(load_model, model)
    if gpu >= 0:
        model.to_gpu()
    for batchs in gens.batch(gens.word_list(test_txt), batch_size):
        tag0 = batchs[:]
        tags = [a[:-1] for a in tag0]
        batchs = [b[1:] for b in batchs]
        total_batchs.append(batchs)
        batchs = fill_batch([b[-1].split() for b in batchs])
        tags = fill_batch(tags, token=-1)
        pres, cons = forward(batchs, tags, model, word2id, mode=False)
        for num, a in enumerate(zip(pres, tags)):
            # Decoded label sequence for the non-padded positions (currently unused).
            pre_l = [int(xp.argmax(a[0].data[k]))
                     for k in range(len(a[0].data)) if cons[num][k]]
        a, b, c, d = precision_recall_f(pres, tags, cons)
        c_p += a
        correct_p += b
        c_r += c
        correct_r += d
    precision = correct_p / c_p
    recall = correct_r / c_r
    f_measure = 2 * precision * recall / (precision + recall)
    print('Precision:\t{}'.format(precision))
    print('Recall:\t{}'.format(recall))
    print('F-value\t{}'.format(f_measure))
def test():
    res = []
    word2id = pickle.load(open(vocab_dict, 'rb'))
    model = BiLSTM(vocab_size, embed_size, hidden_size, output_size, extra_hidden_size)
    model.load_state_dict(torch.load(load_model))
    if torch.cuda.is_available():
        model = model.cuda()
    for i in range(1, epoch + 1):
        print("\nepoch {}".format(i))
        total_loss = 0
        gen1 = gens.word_list(test_txt)
        gen2 = gens.batch(gens.sorted_parallel(gen1, embed_size * batch_size), batch_size)
        batchs = [b for b in gen2]
        bl = list(range(len(batchs)))
        random.shuffle(bl)
        for n, j in enumerate(bl):
            tag0 = batchs[j][:]
            tags = [[int(c) for c in a[:-1]] for a in tag0]
            batch = fill_batch([b[-1].split() for b in batchs[j]])
            tags = fill_batch(tags, token=0)
            accum_loss, pres, cons = forward(batch, tags, model, word2id, mode=True)
            total_loss += accum_loss.data[0]
            pres = np.array(pres, dtype=np.int64).T
            # Write each predicted label sequence next to its input sentence.
            for pre, text in zip(pres, batch):
                pre = [str(p) for p in pre]
                res.append(' '.join(pre) + '\t' + ' '.join(text))
        print("total_loss {}".format(total_loss))
    with open('./save1.txt', 'w') as f:
        f.write('\n'.join(res))
    evaluate(model, word2id)  # use this if you also want the F-measure
def evaluate(model, word2id):
    c_p = 0
    correct_p = 0
    c_r = 0
    correct_r = 0
    gen1 = gens.word_list(test_txt)
    gen2 = gens.batch(gens.sorted_parallel(gen1, embed_size * batch_size), batch_size)
    batchs = [b for b in gen2]
    for batch in batchs:
        tag0 = batch[:]
        tags = [a[:-1] for a in tag0]
        batch = fill_batch([b[-1].split() for b in batch])
        tags = fill_batch(tags, token=-1)
        pres, cons = forward(batch, tags, model, word2id, mode=False)
        a, b, c, d = precision_recall_f(pres, tags, cons)
        c_p += a
        correct_p += b
        c_r += c
        correct_r += d
    try:
        precision = correct_p / c_p
        recall = correct_r / c_r
        f_measure = (1 + 0.5**2) * precision * recall / (0.5**2 * precision + recall)
        print('Precision:\t{}'.format(precision))
        print('Recall:\t{}'.format(recall))
        print('F-value\t{}'.format(f_measure))
    except ZeroDivisionError:
        precision = 0
        recall = 0
        f_measure = 0
        print('Precision:\tnothing')
        print('Recall:\tnothing')
        print('F-value\tnothing')
    return precision, recall, f_measure
def train():
    id2word = {}
    word2id = {}
    word_freq = collections.defaultdict(lambda: 0)
    id2word[0] = "<unk>"
    word2id["<unk>"] = 0
    id2word[1] = "<s>"
    word2id["<s>"] = 1
    id2word[-1] = "</s>"
    word2id["</s>"] = -1
    word2id, id2word, word_list, word_freq = make_dict(train_txt, word2id, id2word, word_freq)
    word2vec_model = load_word2vec_format('embedding.txt')
    model = BLSTMw2v(vocab_size, embed_size, hidden_size, output_size)
    model.initialize_embed(word2vec_model, word_list, word2id, id2word)
    if gpu >= 0:
        cuda.get_device(gpu).use()  # make the specified GPU current
        model.to_gpu()  # copy the model to the GPU
    opt = O.Adam(alpha=0.001)
    opt.setup(model)
    for i in range(epoch):
        print("epoch {}".format(i + 1))
        start = time.time()
        total_loss = 0
        gen1 = gens.word_list(train_txt)
        gen2 = gens.batch(gens.sorted_parallel(gen1, embed_size * batch_size), batch_size)
        batchs = [b for b in gen2]
        bl = list(range(len(batchs)))
        random.shuffle(bl)
        for n, j in enumerate(bl):
            tag0 = batchs[j][:]
            tags = [[int(c) for c in a[:-1]] for a in tag0]
            batch = fill_batch([b[-1].split() for b in batchs[j]])
            tags = fill_batch(tags, token=0)
            model.zerograds()  # clear gradients from the previous batch
            accum_loss, pres, cons = forward(batch, tags, model, word2id, mode=True)
            accum_loss.backward()
            opt.update()
            total_loss += accum_loss.data
        print("total_loss {}".format(total_loss))
        serializers.save_npz("{}{}".format(load_model, i), model)
        evaluate(model, word2id)
        # Dump the learned embedding matrix in word2vec text format.
        with open('embeding.txt', 'w') as ff:
            ff.write('{} {}\n'.format(len(model.x2e.W.data) - 3, embed_size))
            for num in range(2, len(model.x2e.W.data) - 1):
                ff.write('{} {}\n'.format(
                    id2word[num],
                    ' '.join(str(model.x2e.W.data[num][k]) for k in range(embed_size))))
        print("time: {}".format(time.time() - start))
    pickle.dump(dict(word2id), open(vocab_dict, 'wb'))
    serializers.save_npz(load_model, model)
    state_d = {}
    state_d["vocab_size"] = vocab_size
    state_d["hidden_size"] = hidden_size
    state_d["embed_size"] = embed_size
    pickle.dump(state_d, open(state_model, "wb"))