Example #1
import sys

from conllu import parse  # assumed: the standard conllu package's parse(); the original may use its own parser


def load_set(fn, src_field, dst_field, ctag, morpho_conf=None):
    with open(fn, 'r', encoding='utf-8') as f:
        conllu_data = parse(f.read())
    sys.stderr.write('Loaded: %d from %s\n' % (len(conllu_data), fn))

    src_descr = {}
    src_descr['i2c'], src_descr['c2i'], src_descr['max_len'] = build_dict(
        conllu_data, src_field, 3, [UNK, EOS])
    dst_descr = {}
    dst_descr['i2c'], dst_descr['c2i'], dst_descr['max_len'] = build_dict(
        conllu_data, dst_field, 3, [UNK, GO, EOS])
    dst_descr['max_len'] += 1
    pos_descr = {}
    pos_descr['i2c'], pos_descr['c2i'] = build_tag_dict(conllu_data, ctag, 1)

    feats_to_use = []
    if morpho_conf is None:
        feats_dict = build_feats_dict(conllu_data)
    else:
        feats_dict = load_feats_dict_from_morpho_config(morpho_conf)
        feats_to_use = list(feats_dict.keys())

    feats_dict['FirstWord'] = {
        'name_ft': None,
        'c2i': {
            '#None': 0,
            'Yes': 1
        },
        'i2c': ['#None', 'Yes']
    }
    feats_to_use.append('FirstWord')

    if not feats_to_use:
        data_set, feats_to_use = convert_conllu_to_dataset(
            conllu_data, src_field, dst_field, src_descr['c2i'],
            src_descr['max_len'], dst_descr['c2i'], dst_descr['max_len'],
            pos_descr['c2i'], feats_dict, 0, 0)  # 3, 3)
    else:
        data_set, _ = convert_conllu_to_dataset(
            conllu_data, src_field, dst_field, src_descr['c2i'],
            src_descr['max_len'], dst_descr['c2i'], dst_descr['max_len'],
            pos_descr['c2i'], feats_dict, 0, 0, feats_to_use)

    print('INFO: trainset size: %d' % len(data_set))

    return data_set, src_descr['max_len'], dst_descr['max_len'], src_descr['c2i'], dst_descr['c2i'], \
           pos_descr['c2i'], pos_descr['i2c'], feats_dict, feats_to_use
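A minimal usage sketch for load_set; the file name and field arguments ('train.conllu', 'form', 'lemma', 'upostag') are placeholders rather than values taken from the original project:

# Hypothetical call; the nine return values mirror the return statement above.
(train_set, src_max_len, dst_max_len, src_c2i, dst_c2i,
 pos_c2i, pos_i2c, feats_dict, feats_to_use) = load_set(
     'train.conllu', 'form', 'lemma', 'upostag')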
Example #2
import tensorflow as tf
import pickle
from main import Summodel
from data import build_dict, build_dataset, batch_iter

with open("args.pickle", "rb") as f:
    args = pickle.load(f)

print("Loading dictionary...")
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict(
    "valid")
print("Loading validation dataset...")
valid_x, valid_y = build_dataset("valid", word_dict, article_max_len,
                                 summary_max_len)
valid_x_len = list(map(lambda x: len([y for y in x if y != 0]), valid_x))

with tf.Session() as sess:
    print("Loading saved model...")
    model = Summodel(reversed_dict,
                     article_max_len,
                     summary_max_len,
                     args,
                     Forward_only=True)
    saver = tf.train.Saver(tf.global_variables())
    ckpt = tf.train.get_checkpoint_state("./saved_model/")
    saver.restore(sess, ckpt.model_checkpoint_path)

    batches = batch_iter(valid_x, valid_y, args.batch_size, 1)

    print("Writing summaries to 'train/result.txt'...")
    for batch_x, batch_y in batches:
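For orientation, a minimal sketch of a generator matching batch_iter's call signature (inputs, outputs, batch_size, num_epochs); the project's actual data.batch_iter may differ:

def batch_iter_sketch(inputs, outputs, batch_size, num_epochs):
    # Yield the paired data in fixed-size slices, once per epoch.
    num_batches = (len(inputs) - 1) // batch_size + 1
    for _ in range(num_epochs):
        for i in range(num_batches):
            start = i * batch_size
            yield inputs[start:start + batch_size], outputs[start:start + batch_size]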
Example #3
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.optim as optim

from data import read_corpus, build_dict, TAG_MAP, NER_DataSet, condtraints
from bi_lstm_crf import BiLSTM_CRF
from trainer import train, evaluate, load_model

train_corpus_path = './datasets/train_data'
test_corpus_path = './datasets/test_data'

if __name__ == '__main__':

    # prepare data
    corpus = read_corpus(train_corpus_path)
    dct = build_dict(corpus)

    # build dataloader
    np.random.shuffle(corpus)
    train_ds = NER_DataSet(corpus[:-5000], dct)
    val_ds = NER_DataSet(corpus[-5000:], dct)

    train_dl = DataLoader(train_ds,
                          batch_size=32,
                          shuffle=True,
                          drop_last=True,
                          num_workers=0)
    val_dl = DataLoader(val_ds,
                        batch_size=32,
                        shuffle=False,
                        drop_last=True,
                        num_workers=0)
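The shuffle-and-slice split above can also be expressed with torch.utils.data.random_split; a small alternative sketch that keeps the same 5000-sample validation size (the resulting Subset objects work with DataLoader just like the datasets above):

from torch.utils.data import random_split

full_ds = NER_DataSet(corpus, dct)
n_val = 5000
train_subset, val_subset = random_split(full_ds, [len(full_ds) - n_val, n_val])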
Example #4
import os

import torch
from torch.utils.data import DataLoader

from data import build_dict, NewsDataSet, CATEGIRY_LIST
import trainer

if __name__ == "__main__":
    device = torch.device('cuda:7' if torch.cuda.is_available() else 'cpu')
    logger.info('using device: {}'.format(device))

    train_file = os.path.abspath('./datasets/cnews/cnews.train.txt')
    valid_file = os.path.abspath('./datasets/cnews/cnews.val.txt')
    test_file = os.path.abspath('./datasets/cnews/cnews.test.txt')

    logger.info('load and preprocess data...')

    # build dictionary
    num_words = 5000  # the size of the dictionary
    dct = build_dict([train_file, valid_file], num_words=num_words)

    # build dataset and dataloader
    train_ds = NewsDataSet(train_file, dct)
    train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)

    valid_ds = NewsDataSet(valid_file, dct)
    valid_dl = DataLoader(valid_ds, batch_size=64)

    test_ds = NewsDataSet(test_file, dct)
    test_dl = DataLoader(test_ds, batch_size=64)

    # build model

    model = TextCNN(class_num=len(CATEGIRY_LIST), embed_size=len(dct))
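build_dict here takes a list of text files plus a num_words cap and returns a dictionary-like vocabulary whose length is used above. A minimal sketch of such a function, assuming a plain frequency-based vocabulary; the project's data.build_dict may differ (e.g. it may tokenize by character for Chinese news):

from collections import Counter

def build_dict_sketch(files, num_words=5000):
    # Keep the num_words most frequent tokens and map each one to an index.
    counter = Counter()
    for path in files:
        with open(path, encoding='utf-8') as f:
            for line in f:
                counter.update(line.split())
    vocab = ['<pad>'] + [w for w, _ in counter.most_common(num_words - 1)]
    return {w: i for i, w in enumerate(vocab)}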
Example #5
        output, hidden = model(input, hidden)
        output = output.squeeze()
        output = softmax(output, dim=0)
        p = output[current_idx].data  # probability of the current token
        total_p += math.log(p)  # natural log (base e)
    return math.exp(-total_p * (1 / sentence_len))


def evaluate(model, test_dataset, dict):
    ppl = 0
    for sentence in test_dataset:
        ppl += evaluate_iter(model, sentence, dict)
    ppl = ppl / len(test_dataset)
    print("evaluation ppl:", ppl)
    return ppl


if __name__ == '__main__':
    dataset = data.get_dataset(file_path)
    dict = data.build_dict(dataset)
    config.vocab_size = len(dict)
    train_dataset, test_dataset = data.split_data(
        dataset, train_proportion=config.train_proportion)
    train_tokens = data.tokenize(train_dataset, dict)
    model = RNNModel(config)
    train_batch_source = data.batchify(train_tokens,
                                       config.batch_size)  # train directly on the batchified data
    train(model, batch_source=train_batch_source)

    # test
    evaluate(model, test_dataset, dict)
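The per-sentence value returned by evaluate_iter is exp(-(1/N) * Σ log p_i), i.e. the inverse geometric mean of the token probabilities. A tiny self-contained check of the formula with made-up probabilities:

import math

probs = [0.25, 0.10, 0.50, 0.20]              # toy token probabilities, illustration only
total_p = sum(math.log(p) for p in probs)
print(math.exp(-total_p * (1 / len(probs))))  # ~4.47, the inverse geometric mean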
Example #6
        word_sim = fluid.layers.reduce_sum(word_sim, dim=-1)
        word_sim = fluid.layers.reshape(word_sim, shape=[-1])
        pred = fluid.layers.sigmoid(word_sim)

        # Define the loss from the estimated output probability. Note that we use
        # sigmoid_cross_entropy_with_logits, which fuses the sigmoid and the cross
        # entropy into a single step for better numerical optimization, so the input
        # is word_sim rather than pred.

        loss = fluid.layers.sigmoid_cross_entropy_with_logits(word_sim, label)
        loss = fluid.layers.reduce_mean(loss)

        # Return the results of the forward pass; PaddlePaddle computes the
        # backward pass automatically via the backward() function.
        return pred, loss


corpus = preprocess_data()
word2id_dict, word2id_freq, id2word_dict = build_dict(corpus)
corpus = [word2id_dict[word] for word in corpus]
vocab_size = len(word2id_dict)
print(f"vocab size: {vocab_size}")
corpus = subsampling(corpus, word2id_freq)
print("after subsampling %d tokens in the corpus" % len(corpus))
# print(f"finish create dateset: {len(dataset)} sample")

step = 0
learning_rate = 0.001


# Define a function that uses the word embeddings to look up similar words.
# query_token is the word to query, k is the number of most similar words to return,
# and embed is the learned word-embedding parameter.
# Word-to-word similarity is measured with the cosine distance between embeddings.
# In the implementation below, x is the embedding of the query word and the
# embedding parameter matrix W holds the embeddings of all words.
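A minimal NumPy sketch of such a query function, reusing the word2id_dict and id2word_dict built above; the tutorial's own implementation may differ in its details:

import numpy as np

def get_similar_tokens_sketch(query_token, k, embed):
    # embed: the learned embedding matrix W, shape [vocab_size, embedding_size]
    W = embed
    x = W[word2id_dict[query_token]]
    # Cosine similarity between the query embedding and every row of W.
    cos = np.dot(W, x) / (np.linalg.norm(W, axis=1) * np.linalg.norm(x) + 1e-9)
    for i in np.argsort(-cos)[:k]:
        print('word closest to "%s": %s' % (query_token, id2word_dict[i]))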
Example #7
                        action="store_true",
                        help="Use only 50K samples of data")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    add_arguments(parser)
    args = parser.parse_args()
    with open("args.pickle", "wb") as f:
        pickle.dump(args, f)

    if not os.path.exists("saved_model"):
        os.mkdir("saved_model")

    print("Building dictionary...")
    word_dict, reversed_dict, art_max_len, sum_max_len = build_dict('train')
    print("Loading training dataset...")
    train_x, train_y = build_dataset("train", word_dict, art_max_len,
                                     sum_max_len)
    with tf.Session() as sess:
        model = Summodel(reversed_dict,
                         art_max_len,
                         sum_max_len,
                         args,
                         Forward_only=False)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())

        batches = batch_iter(train_x, train_y, args.batch_size,
                             args.num_epochs)
        num_batches_per_epoch = (len(train_x) - 1) // args.batch_size + 1
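num_batches_per_epoch uses the integer-ceiling idiom (n - 1) // batch_size + 1, which rounds the division up so that a final partial batch still counts. A quick check with made-up sizes:

n, batch_size = 50000, 64            # illustrative values only
print((n - 1) // batch_size + 1)     # 782, i.e. math.ceil(50000 / 64)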