import sys

# Assumed import: calling `parse` on raw CoNLL-U text suggests the `conllu`
# package. build_dict, build_tag_dict, build_feats_dict,
# load_feats_dict_from_morpho_config, convert_conllu_to_dataset and the
# UNK/GO/EOS constants come from the surrounding module (not shown here).
from conllu import parse


def load_set(fn, src_field, dst_field, ctag, morpho_conf=None):
    with open(fn, 'r', encoding='utf-8') as f:
        conllu_data = parse(f.read())
    sys.stderr.write('Loaded: %d from %s\n' % (len(conllu_data), fn))

    # character dictionaries and max lengths for the source and target fields
    src_descr = {}
    src_descr['i2c'], src_descr['c2i'], src_descr['max_len'] = build_dict(
        conllu_data, src_field, 3, [UNK, EOS])

    dst_descr = {}
    dst_descr['i2c'], dst_descr['c2i'], dst_descr['max_len'] = build_dict(
        conllu_data, dst_field, 3, [UNK, GO, EOS])
    dst_descr['max_len'] += 1

    pos_descr = {}
    pos_descr['i2c'], pos_descr['c2i'] = build_tag_dict(conllu_data, ctag, 1)

    feats_to_use = []
    if morpho_conf is None:
        feats_dict = build_feats_dict(conllu_data)
    else:
        feats_dict = load_feats_dict_from_morpho_config(morpho_conf)
        feats_to_use = list(feats_dict.keys())

    # synthetic binary feature marking the first word of a sentence
    feats_dict['FirstWord'] = {
        'name_ft': None,
        'c2i': {'#None': 0, 'Yes': 1},
        'i2c': ['#None', 'Yes']
    }
    feats_to_use.append('FirstWord')

    if 0 == len(feats_to_use):
        # note: 'FirstWord' is always appended above, so this branch is
        # unreachable as written
        data_set, feats_to_use = convert_conllu_to_dataset(
            conllu_data, src_field, dst_field,
            src_descr['c2i'], src_descr['max_len'],
            dst_descr['c2i'], dst_descr['max_len'],
            pos_descr['c2i'], feats_dict, 0, 0)  # 3, 3)
    else:
        data_set, _ = convert_conllu_to_dataset(
            conllu_data, src_field, dst_field,
            src_descr['c2i'], src_descr['max_len'],
            dst_descr['c2i'], dst_descr['max_len'],
            pos_descr['c2i'], feats_dict, 0, 0, feats_to_use)

    print('INFO: trainset size: %d' % len(data_set))
    return (data_set, src_descr['max_len'], dst_descr['max_len'],
            src_descr['c2i'], dst_descr['c2i'],
            pos_descr['c2i'], pos_descr['i2c'], feats_dict, feats_to_use)
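# build_dict is not shown in this snippet. Below is a minimal sketch of the
# interface load_set assumes -- returning (i2c, c2i, max_len) built from the
# characters of one token field, with a minimum-frequency cutoff and special
# tokens prepended. The name and the threshold semantics are assumptions,
# not the original implementation.
from collections import Counter


def build_dict_sketch(conllu_data, field, min_freq, specials):
    counts = Counter()
    max_len = 0
    for sentence in conllu_data:
        for token in sentence:
            value = token[field]
            counts.update(value)  # counts individual characters
            max_len = max(max_len, len(value))
    i2c = list(specials) + [c for c, n in counts.most_common() if n >= min_freq]
    c2i = {c: i for i, c in enumerate(i2c)}
    return i2c, c2i, max_len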
import pickle

import tensorflow as tf

from main import Summodel
from data import build_dict, build_dataset, batch_iter

with open("args.pickle", "rb") as f:
    args = pickle.load(f)

print("Loading dictionary...")
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("valid")

print("Loading validation dataset...")
valid_x, valid_y = build_dataset("valid", word_dict, article_max_len,
                                 summary_max_len)
# effective (non-padding) length of each article
valid_x_len = list(map(lambda x: len([y for y in x if y != 0]), valid_x))

with tf.Session() as sess:
    print("Loading saved model...")
    model = Summodel(reversed_dict, article_max_len, summary_max_len, args,
                     Forward_only=True)
    saver = tf.train.Saver(tf.global_variables())
    ckpt = tf.train.get_checkpoint_state("./saved_model/")
    saver.restore(sess, ckpt.model_checkpoint_path)

    batches = batch_iter(valid_x, valid_y, args.batch_size, 1)

    print("Writing summaries to 'train/result.txt'...")
    for batch_x, batch_y in batches:
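        # -- assumed completion: the loop body was truncated in the original.
        # A minimal sketch of one decoding step; the placeholder/op names
        # (model.X, model.X_len, model.prediction) are guesses about
        # Summodel's interface, not the original code.
        batch_x_len = [len([w for w in x if w != 0]) for x in batch_x]
        feed = {model.X: batch_x, model.X_len: batch_x_len}
        predictions = sess.run(model.prediction, feed_dict=feed)
        with open("train/result.txt", "a", encoding="utf-8") as out:
            for pred in predictions:
                words = [reversed_dict[idx] for idx in pred
                         if idx in reversed_dict]
                out.write(" ".join(words) + "\n")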
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.optim as optim

from data import read_corpus, build_dict, TAG_MAP, NER_DataSet, condtraints
from bi_lstm_crf import BiLSTM_CRF
from trainer import train, evaluate, load_model

train_corpus_path = './datasets/train_data'
test_corpus_path = './datasets/test_data'

if __name__ == '__main__':
    # prepare data
    corpus = read_corpus(train_corpus_path)
    dct = build_dict(corpus)

    # build dataloaders: shuffle, then hold out the last 5000 sentences
    # for validation
    np.random.shuffle(corpus)
    train_ds = NER_DataSet(corpus[:-5000], dct)
    val_ds = NER_DataSet(corpus[-5000:], dct)
    train_dl = DataLoader(train_ds, batch_size=32, shuffle=True,
                          drop_last=True, num_workers=0)
    val_dl = DataLoader(val_ds, batch_size=32, shuffle=False, drop_last=True,
                        num_workers=0)  # assumed: closing args mirror train_dl
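    # -- hypothetical continuation showing how the imported pieces might be
    # wired together; BiLSTM_CRF's constructor arguments and train()'s
    # signature are assumptions based on the import list, not the original.
    model = BiLSTM_CRF(vocab_size=len(dct), tag_map=TAG_MAP,
                       constraints=condtraints)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    train(model, train_dl, optimizer)
    evaluate(model, val_dl)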
import os
import logging

import torch
from torch.utils.data import DataLoader

from data import build_dict, NewsDataSet, CATEGIRY_LIST
from model import TextCNN  # assumed module: TextCNN's import is not shown
import trainer

logger = logging.getLogger(__name__)  # assumed: logger setup is not shown

if __name__ == "__main__":
    device = torch.device('cuda:7' if torch.cuda.is_available() else 'cpu')
    logger.info('using device: {}'.format(device))

    train_file = os.path.abspath('./datasets/cnews/cnews.train.txt')
    valid_file = os.path.abspath('./datasets/cnews/cnews.val.txt')
    test_file = os.path.abspath('./datasets/cnews/cnews.test.txt')

    logger.info('load and preprocess data...')

    # build dictionary
    num_words = 5000  # the size of the dictionary
    dct = build_dict([train_file, valid_file], num_words=num_words)

    # build datasets and dataloaders
    train_ds = NewsDataSet(train_file, dct)
    train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
    valid_ds = NewsDataSet(valid_file, dct)
    valid_dl = DataLoader(valid_ds, batch_size=64)
    test_ds = NewsDataSet(test_file, dct)
    test_dl = DataLoader(test_ds, batch_size=64)

    # build model
    model = TextCNN(class_num=len(CATEGIRY_LIST), embed_size=len(dct))
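    # -- hypothetical continuation: the script is truncated after the model
    # is built. trainer.train()/trainer.evaluate() signatures are assumptions.
    model = model.to(device)
    trainer.train(model, train_dl, valid_dl, device=device)
    trainer.evaluate(model, test_dl, device=device)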
        # (tail of evaluate_iter -- the function header and the loop that
        # feeds the sentence token by token are truncated above)
        output, hidden = model(input, hidden)
        output = output.squeeze()
        output = softmax(output, dim=0)
        p = output[current_idx].data  # probability of the target token
        total_p += math.log(p)        # natural logarithm (base e)
    return math.exp(-total_p * (1 / sentence_len))


def evaluate(model, test_dataset, dict):
    ppl = 0
    for sentence in test_dataset:
        ppl += evaluate_iter(model, sentence, dict)
    ppl = ppl / len(test_dataset)
    print("evaluation ppl:", ppl)
    return ppl


if __name__ == '__main__':
    dataset = data.get_dataset(file_path)
    dict = data.build_dict(dataset)
    config.vocab_size = len(dict)
    train_dataset, test_dataset = data.split_data(
        dataset, train_proportion=config.train_proportion)
    train_tokens = data.tokenize(train_dataset, dict)

    model = RNNModel(config)
    train_batch_source = data.batchify(train_tokens, config.batch_size)
    # train directly on the pre-batchified data
    train(model, batch_source=train_batch_source)

    # test
    evaluate(model, test_dataset, dict)
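# -- hypothetical sketch of the truncated evaluate_iter head, assuming an RNN
# language model fed one token id at a time; init_hidden and the tensor
# shapes are assumptions, not the original code.
import math

import torch
from torch.nn.functional import softmax


def evaluate_iter_sketch(model, sentence, dict):
    sentence_len = len(sentence)
    total_p = 0.0
    hidden = model.init_hidden(1)  # assumed helper for a zero initial state
    for i in range(sentence_len - 1):
        input = torch.tensor([[dict[sentence[i]]]])  # current token id
        current_idx = dict[sentence[i + 1]]          # next-token target
        output, hidden = model(input, hidden)
        output = softmax(output.squeeze(), dim=0)
        total_p += math.log(output[current_idx].item())
    # perplexity = exp(-(1/N) * sum(log p))
    return math.exp(-total_p * (1 / sentence_len))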
    # (inside the skip-gram forward pass; earlier lines are truncated)
    word_sim = fluid.layers.reduce_sum(word_sim, dim=-1)
    word_sim = fluid.layers.reshape(word_sim, shape=[-1])
    pred = fluid.layers.sigmoid(word_sim)

    # Define the loss from the estimated output probability. Note that we use
    # sigmoid_cross_entropy_with_logits: fusing the sigmoid and the cross
    # entropy into a single step optimizes better, which is why the input is
    # word_sim (the logits) rather than pred.
    loss = fluid.layers.sigmoid_cross_entropy_with_logits(word_sim, label)
    loss = fluid.layers.reduce_mean(loss)

    # Return the forward results; PaddlePaddle derives the backward pass
    # automatically via the backward function.
    return pred, loss


corpus = preprocess_data()
word2id_dict, word2id_freq, id2word_dict = build_dict(corpus)
corpus = [word2id_dict[word] for word in corpus]
vocab_size = len(word2id_dict)
print(f"vocab size: {vocab_size}")

corpus = subsampling(corpus, word2id_freq)
print("after subsampling %d tokens in the corpus" % len(corpus))
# print(f"finish create dataset: {len(dataset)} sample")

step = 0
learning_rate = 0.001

# Define a function that uses the word embeddings to look up similar words.
# query_token is the word to query, k is how many most-similar words to
# return, and embed is the learned word-embedding parameter. We measure
# word-to-word similarity by the cosine distance between embeddings.
# Concretely, x is the embedding of the query word, and the embedding
# parameter matrix W holds the embeddings of all words.
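# -- hypothetical implementation of the query described by the comments
# above (the original body is truncated); the function name and the exact
# normalization are assumptions.
import numpy as np


def get_similar_tokens(query_token, k, embed):
    W = embed  # [vocab_size, embed_size] matrix of all word embeddings
    x = W[word2id_dict[query_token]]
    # cosine similarity between the query embedding and every row of W
    cos = np.dot(W, x) / np.sqrt(np.sum(W * W, axis=1) * np.sum(x * x) + 1e-9)
    flat = cos.flatten()
    # pick the k highest-scoring indices, sorted by descending similarity
    indices = np.argpartition(flat, -k)[-k:]
    indices = indices[np.argsort(-flat[indices])]
    for i in indices:
        print('for word %s, the similar word is %s'
              % (query_token, id2word_dict[i]))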
# (tail of the add_arguments(parser) definition; earlier arguments truncated)
                        action="store_true",
                        help="Use only 50K samples of data")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    add_arguments(parser)
    args = parser.parse_args()
    with open("args.pickle", "wb") as f:
        pickle.dump(args, f)
    if not os.path.exists("saved_model"):
        os.mkdir("saved_model")

    print("Building dictionary...")
    word_dict, reversed_dict, art_max_len, sum_max_len = build_dict('train')
    print("Loading training dataset...")
    train_x, train_y = build_dataset("train", word_dict, art_max_len,
                                     sum_max_len)

    with tf.Session() as sess:
        model = Summodel(reversed_dict, art_max_len, sum_max_len, args,
                         Forward_only=False)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())

        batches = batch_iter(train_x, train_y, args.batch_size,
                             args.num_epochs)
        num_batches_per_epoch = (len(train_x) - 1) // args.batch_size + 1
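        # -- hypothetical continuation: the training loop was truncated here.
        # The attribute names (X, X_len, decoder_target, update, global_step,
        # loss) are guesses about Summodel's interface, not the original code.
        for batch_x, batch_y in batches:
            batch_x_len = [len([w for w in x if w != 0]) for x in batch_x]
            feed = {model.X: batch_x, model.X_len: batch_x_len,
                    model.decoder_target: batch_y}
            _, step, loss = sess.run(
                [model.update, model.global_step, model.loss], feed_dict=feed)
            # checkpoint once per epoch
            if step % num_batches_per_epoch == 0:
                saver.save(sess, "./saved_model/model.ckpt", global_step=step)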