import random

import nltk

from corpus_process import Corpus


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


def sent2labels(sent):
    return [label for token, postag, label in sent]


def sent2tokens(sent):
    return [token for token, postag, label in sent]


files_paths = [
    'data/sensitive1.tsv',
    'data/sensitive2.tsv',
    'data/sensitive3.tsv'
]
corpus = Corpus(files_paths)
sents, tags = corpus.read_tsv()

# Attach POS tags so each token becomes a (token, postag, label) triple.
data = []
for i, sent in enumerate(sents):
    tmp = []
    for j, t in enumerate(nltk.pos_tag(sent)):
        tmp.append((t[0], t[1], tags[i][j]))
    data.append(tmp)

random.shuffle(data)

# The first 996 shuffled sentences are held out; the rest form the training split.
X_train = [sent2features(s) for s in data[996:]]
y_train = [sent2labels(s) for s in data[996:]]
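# word2features is called above but defined elsewhere in the project. Below is
# a minimal sketch of a typical sklearn-crfsuite-style feature function over
# (token, postag, label) triples; the project's actual features may differ.

def word2features(sent, i):
    """Hypothetical per-token feature dict for the CRF (sketch only)."""
    token, postag, _ = sent[i]
    features = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': postag,
    }
    if i > 0:
        prev_token, prev_postag, _ = sent[i - 1]
        features.update({'-1:word.lower()': prev_token.lower(), '-1:postag': prev_postag})
    else:
        features['BOS'] = True  # beginning of sentence
    if i < len(sent) - 1:
        next_token, next_postag, _ = sent[i + 1]
        features.update({'+1:word.lower()': next_token.lower(), '+1:postag': next_postag})
    else:
        features['EOS'] = True  # end of sentence
    return features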
import os

from corpus_process import Corpus

data_dir = os.path.join(os.pardir, 'data')
tsv_files = ['sensitive1.tsv', 'sensitive2.tsv', 'sensitive3.tsv']
corpus = Corpus([os.path.join(data_dir, f) for f in tsv_files])
import numpy as np
import torch

from corpus_process import Corpus  # assumed module path for this project's Corpus


def create_pretrain_embeddings(corp, embedding_dim, pretrained_file):
    # Load pretrained word vectors into a dict keyed by word.
    pretrain_words = {}
    with open(pretrained_file) as f:
        for line in f:
            infos = line.split()
            wd = infos[0]
            vec = np.array(infos[1:]).astype(float)
            pretrain_words[wd] = vec

    # Fill the embedding matrix row by row; words without a pretrained
    # vector fall back to a random normal initialization.
    word_idx = corp.r_wordIDs
    vocab_num = corp.wordNum
    weights_matrix = np.zeros((vocab_num, embedding_dim))
    for idx in word_idx.keys():
        try:
            weights_matrix[idx] = pretrain_words[word_idx[idx]]
        except KeyError:
            weights_matrix[idx] = np.random.normal(size=(embedding_dim,))

    if torch.cuda.is_available():  # run on GPU
        return torch.Tensor(weights_matrix).cuda()
    return torch.Tensor(weights_matrix)


if __name__ == '__main__':
    train_files = [
        'Datafiles/technology_201501.data',
        'Datafiles/technology_201502.data',
        'Datafiles/technology_201503.data',
        'Datafiles/technology_201504.data'
    ]
    test_file = 'Datafiles/technology_201505_test.data'
    valid_file = 'Datafiles/technology_201505_valid.data'
    corp = Corpus(train_files, test_file, valid_file)
    print(corp.userNum, len(corp.user_history.keys()))
    his = create_history_tensor(corp.userNum, corp.user_history)
    print(his.size())
    print(his[-1])
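# create_history_tensor is defined elsewhere in this project. A minimal sketch
# under assumed semantics: user_history maps a user index to a list of
# conversation ids, and each row is zero-padded to the longest history. The
# project's actual padding and ordering may differ.

def create_history_tensor(user_num, user_history):
    # Hypothetical sketch, not the project's actual definition.
    max_len = max((len(h) for h in user_history.values()), default=1)
    his = torch.zeros(user_num, max_len, dtype=torch.long)
    for uid, conv_ids in user_history.items():
        his[uid, :len(conv_ids)] = torch.tensor(conv_ids, dtype=torch.long)
    if torch.cuda.is_available():  # mirror the GPU handling used above
        return his.cuda()
    return his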
import json
import os

from corpus_process import Corpus

files = ['sensitive1.tsv', 'sensitive2.tsv', 'sensitive3.tsv']
tsv_path = [os.path.join(os.pardir, 'data', f) for f in files]
vocabulary_path = os.path.join(os.pardir, 'data', 'vocabulary_corpus.json')

corpus = Corpus(tsv_path)
x, _ = corpus.read_tsv()

# Assign ids starting at 1, leaving 0 free (e.g. for padding).
vocabulary = {}
for sent in x:
    for tok in sent:
        if tok not in vocabulary:
            vocabulary[tok] = len(vocabulary) + 1

with open(vocabulary_path, 'w') as fd:
    json.dump(vocabulary, fd)
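# Usage sketch: since ids start at 1, index 0 can double as padding when
# encoding sentences to fixed-length sequences. `encode`, `max_len`, and
# `unk_id` are hypothetical names, not part of the script above.

def encode(sent, vocabulary, max_len=50, unk_id=0):
    # Map tokens to ids (unknown tokens share id 0 with padding in this sketch),
    # then truncate or right-pad with zeros to max_len.
    ids = [vocabulary.get(tok, unk_id) for tok in sent]
    return ids[:max_len] + [0] * max(0, max_len - len(ids))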
import torch
from torch.utils.data import DataLoader, random_split
from transformers import BertForTokenClassification, AdamW
from pytorch_pretrained_bert import BertAdam
from seqeval.metrics import accuracy_score, f1_score, classification_report

from corpus_process import Corpus

files_train_paths = ['data/sensitive1.tsv', 'data/sensitive3.tsv']
files_test_path = ['data/sensitive2.tsv']
pretrained_dataset = 'bert-base-uncased'

# ========================================
#                 DATA
# ========================================
data_train = Corpus(files_train_paths).get_dataset_bert()
data_test = Corpus(files_test_path).get_dataset_bert()

# Split off 10% of the training data for validation.
val_size = int(0.1 * len(data_train))
train_size = len(data_train) - val_size
train_dataset, val_dataset = random_split(data_train, [train_size, val_size])

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
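# A minimal sketch of one fine-tuning epoch. It assumes each batch from
# get_dataset_bert() unpacks as (input_ids, attention_mask, labels); the real
# field layout depends on Corpus. num_labels below is a hypothetical value.

num_labels = 9  # hypothetical tag-set size; derive from the corpus labels in practice
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForTokenClassification.from_pretrained(pretrained_dataset, num_labels=num_labels)
model.to(device)
optimizer = AdamW(model.parameters(), lr=3e-5)

model.train()
for input_ids, attention_mask, labels in train_loader:
    optimizer.zero_grad()
    outputs = model(input_ids.to(device),
                    attention_mask=attention_mask.to(device),
                    labels=labels.to(device))
    outputs.loss.backward()  # loss is outputs[0] on older transformers versions
    optimizer.step()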
import os

import torch
import torch.optim as optim
from torch.utils import data

# Project helpers (Corpus, find_files, form_dataset, create_tensor,
# create_pretrain_embeddings, create_history_tensor, create_arc_info,
# DCR, MyDataset, train_epoch, evaluate) come from this repo's modules.


def train(config):
    print('Start training: ' + config.filename + ' ' + config.modelname)
    trainfile, testfile, validfile = find_files(config.filename, config.month_num)
    modelname = config.modelname

    # Process the corpus and prepare pretrained parameters (if any)
    corp = Corpus(trainfile, testfile, validfile)
    config.user_num, config.conv_num, config.vocab_num = corp.userNum, corp.convNum, corp.wordNum
    train_convs, train_convs_per_user, test_data, dev_data = form_dataset(corp, config.batch_size)
    conv_data = create_tensor(corp.convs)
    if config.pretrained_file == 'NULL':
        embedding_matrix = None
    else:
        embedding_matrix = create_pretrain_embeddings(corp, config.embedding_dim, config.pretrained_file)

    # Set the model and saving path
    if modelname == 'DCR':
        user_history = create_history_tensor(corp.userNum, corp.user_history)
        arcs = create_arc_info(conv_data, no_time=True)
        model = DCR(config, conv_data, user_history, arcs, embedding_matrix)
        path_name = str(config.batch_size) + "_" + str(config.factor_dim) + "_" + str(config.embedding_dim) + "_" + \
            str(config.kernal_num) + "_" + str(config.hidden_dim) + "_" + str(config.gcn_layers_num) + "_" + \
            str(config.neg_sample_num) + "_" + str(config.lr) + "_" + str(int(config.pos_weight)) + "_" + config.att
        if config.month_num != 4:
            path_name += "_" + str(config.month_num) + "m_" + str(config.runtime)
        else:
            path_name += "_" + str(config.runtime)
        if config.pretrained_file == 'NULL':
            path_name += "_npembed"
        if config.no_lstm:
            path_name += "_nlstm"
        if config.no_gcn:
            path_name += "_ngcn"
        if config.mlp_layers_num == 0:
            path_name += "_nmlp"
    else:
        print('Modelname Wrong!')
        exit()

    res_path = "BestResults/" + modelname + "/" + config.filename + "/"
    mod_path = "BestModels/" + modelname + "/" + config.filename + "/"
    if not os.path.isdir(res_path):
        os.makedirs(res_path)
    if not os.path.isdir(mod_path):
        os.makedirs(mod_path)
    mod_path += path_name + '.model'
    res_path += path_name + '.data'

    # Set the optimizing parameters
    loss_weights = torch.Tensor([1, config.pos_weight])
    if torch.cuda.is_available():  # run on GPU
        model = model.cuda()
        loss_weights = loss_weights.cuda()
    if config.optim == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=config.lr)
    else:
        optimizer = optim.SGD(model.parameters(), lr=config.lr)
    # Inverse square-root learning-rate decay.
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 1.0 / ((epoch + 1) ** 0.5))

    best_dev_map = 0.0
    best_epoch = -1
    best_epoch_loss = 999999
    no_improve = 0

    # Begin training
    for epoch in range(config.max_epoch):
        train_data = MyDataset(train_convs, train_convs_per_user, train=True, num_sampling=config.neg_sample_num)
        train_loader = data.DataLoader(train_data, batch_size=config.batch_size, num_workers=0, shuffle=True)
        loss = train_epoch(model, train_loader, loss_weights, optimizer, epoch)
        dev_map = evaluate(model, dev_data, dev=True)
        if dev_map > best_dev_map:
            no_improve = 0
            best_dev_map = dev_map
            os.system('rm ' + mod_path)  # drop the previous best checkpoint
            best_epoch = epoch
            best_epoch_loss = loss
            print('New Best Dev!!! MAP: %g' % best_dev_map)
            torch.save(model.state_dict(), mod_path)
        else:
            no_improve += 1
            print('Current Best Dev MAP: %g, Dev MAP: %g' % (best_dev_map, dev_map))
        if no_improve > 8:  # early-stopping patience
            break
        scheduler.step()

    model.load_state_dict(torch.load(mod_path))

    # Evaluate and save results
    res = evaluate(model, test_data)
    print('Result in test set: MAP: %g, Precision@1: %g, Precision@5: %g, nDCG@5: %g, nDCG@10: %g, MRR: %g'
          % (res[0], res[1], res[2], res[3], res[4], res[5]))
    with open(res_path, 'w') as f:
        f.write('MAP\tPre@1\tPre@5\tnDCG@5\tnDCG@10\tMRR\n')
        f.write('%g\t%g\t%g\t%g\t%g\t%g\n' % (res[0], res[1], res[2], res[3], res[4], res[5]))
        if modelname != "Pop" and modelname != "Random":
            f.write('Dev MAP: %g\n' % best_dev_map)
            f.write('Best epoch: %d\n' % best_epoch)
            f.write('Best epoch loss: %g\n' % best_epoch_loss)
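# Usage sketch: train() reads hyperparameters from a config object. Below is a
# hypothetical argparse setup covering a few of the config.* fields accessed
# above; the full flag set would mirror every attribute train() uses, and the
# defaults are illustrative only.

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--filename', default='technology')
    parser.add_argument('--modelname', default='DCR')
    parser.add_argument('--month_num', type=int, default=4)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--pretrained_file', default='NULL')
    parser.add_argument('--max_epoch', type=int, default=50)
    config = parser.parse_args()
    train(config)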