    fields=fields, path=filepath, separator=' ',
    train='train.txt', validation='valid.txt', test='test.txt')

BATCH_SIZE = 16

WORD.build_vocab(train_data, vectors='glove.6B.100d', unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)
# print(word_frqs[:20])

train_iter = data.Iterator(dataset=train_data, batch_size=BATCH_SIZE, device=device,
                           repeat=False, sort_within_batch=True, shuffle=True)
valid_iter = data.Iterator(dataset=valid_data, batch_size=BATCH_SIZE, device=device,
                           repeat=False, sort_within_batch=True, shuffle=False)
test_iter = data.Iterator(dataset=test_data, batch_size=BATCH_SIZE, device=device,
                          repeat=False, sort_within_batch=True, shuffle=False)
def test(args): train_data, val_data, test_data, SRC, TGT = prepare_data(args) BATCH_SIZE = args.batch_size best_bleu_loss = 0 pad_idx = TGT.vocab.stoi["<pad>"] print("Size of source vocabulary:", len(SRC.vocab)) print("Size of target vocabulary:", len(TGT.vocab)) print("FC matrix:", args.hidden_dim, args.ff_dim) model = transformer.make_model(len(SRC.vocab), len(TGT.vocab), d_model=args.hidden_dim, d_ff=args.ff_dim, N=args.num_blocks, compress=args.compress, compress_att=args.compress_attn, compress_mode=args.compress_mode, num_compress_enc = args.num_enc_blocks_comp, num_compress_dec = args.num_dec_blocks_comp,) model.to(device) if args.load_model: print('load model from [%s]' % args.load_model, file=sys.stderr) params = torch.load(args.load_model, map_location=lambda storage, loc: storage) state_dict = params['model'] # opts = params[''] model.load_state_dict(state_dict, strict=False) if args.debug: #fast check number of parameters model_full = transformer.make_model(len(SRC.vocab), len(TGT.vocab), d_model=args.hidden_dim, d_ff=args.ff_dim, \ N=6, compress=False, \ num_compress_enc=0, num_compress_dec=0) debug_compress_info(model_full,model) # exit() criterion = train_utils.LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1) criterion.to(device) if args.multi_gpu: devices = list(np.arange(args.num_devices)) model_parallel = nn.DataParallel(model, device_ids=devices) test_iter = data.Iterator(test_data, batch_size=args.batch_size, train=False, sort=False, repeat=False, device=device) print("Number of examples in test: ", args.batch_size*len([_ for _ in test_iter])) # test_loss_fn = train_utils.LossCompute(model.generator, criterion, model_opt) os.makedirs(args.save_to_file, exist_ok=True) if args.multi_gpu: model_parallel.eval() start_infer_time = time.time() bleu_loss = train_utils.test_decode(model_parallel.module, SRC, TGT, test_iter, 10000, \ to_words=True, file_path=os.path.join(args.save_to_file, args.exp_name)) print("Time for inference: ", time.time() - start_infer_time) else: model.eval() bleu_loss = train_utils.test_decode(model, SRC, TGT, test_iter, -1,\ to_words=True, file_path=os.path.join(args.save_to_file, args.exp_name)) print() # print("Test perplexity ", np.exp(loss)) print("Total bleu:", bleu_loss)
#############################
# define iterators
train_iter = data.BucketIterator(train_data, batch_size=params['BATCH_SIZE'], device=DEVICE,
                                 sort_within_batch=True, sort_key=lambda x: len(x.text),
                                 train=True, repeat=False)
# train_iter = data.Iterator(train_data, batch_size=1, train=False, sort=False, repeat=False, device=DEVICE)
valid_iter = data.Iterator(valid_data, batch_size=1, train=False, sort=False, repeat=False, device=DEVICE)
test_iter = data.Iterator(test_data, batch_size=1, train=False, sort=False, repeat=False, device=DEVICE)

print_data_info(train_data, valid_data, test_data, SRC, LABEL)
#############################

run_lrp(test_iter, vocab=SRC.vocab, model_file='sa_model4.pt')
PAD_INDEX = AMR_SRC.vocab.stoi[PAD_TOKEN]
print_data_info(my_data, NL_SRC, AMR_SRC)

train_iter = data.BucketIterator(my_data["train"], batch_size=BATCH_SIZE, train=True,
                                 sort_within_batch=True,
                                 sort_key=lambda x: (len(x.src), len(x.trg)),
                                 repeat=False, device=DEVICE)
valid_iter = data.Iterator(my_data["val"], batch_size=1, train=False, sort=False,
                           repeat=False, device=DEVICE)

model = make_autoencoder(len(NL_SRC.vocab), len(AMR_SRC.vocab), emb_size=500,
                         hidden_size=500, num_layers=2, dropout=0.5)
dev_perplexities = train(model, num_epochs=NUM_EPOCHS, print_every=500,
                         num_batches=NUM_BATCHES, error_per=error_per)
torch.save(model, f'{exp_name}.pt')
            TEXT.vocab.vectors[i] = vectors[wv_index]
            match_embedding += 1
        else:
            TEXT.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(-0.25, 0.25)
else:
    print("Error: Need word embedding pt file")
    exit(1)

print("Embedding match number {} out of {}".format(match_embedding, len(TEXT.vocab)))

train_iter = data.Iterator(train, batch_size=args.batch_size, device="cuda",
                           train=True, repeat=False, sort=False, shuffle=True,
                           sort_within_batch=False)
dev_iter = data.Iterator(dev, batch_size=args.batch_size, device="cuda",
                         train=False, repeat=False, sort=False, shuffle=False,
                         sort_within_batch=False)
test_iter = data.Iterator(test, batch_size=args.batch_size, device="cuda",
                          train=False,
# BucketIterator
# BPTTIterator
# To build an iterator for the training set only:
# train_iter = data.BucketIterator(dataset=train, batch_size=8, shuffle=True, sort_within_batch=False, repeat=False)
# Build iterators for the training and validation sets at the same time
train_iter, valid_iter = data.BucketIterator.splits(
    (train, valid),  # the Dataset objects the iterators are built from
    batch_sizes=(train_batch_size, valid_batch_size),
    device=torch_device,  # if using a GPU, replace this with the GPU's device index
    sort_key=lambda x: len(x.sentence),  # the BucketIterator buckets examples by text length
    sort_within_batch=False,
    repeat=False
)
test_iter = data.Iterator(test, batch_size=test_batch_size, device=torch_device,
                          sort=False, sort_within_batch=False, repeat=False)

from torchtext.vocab import Vectors
import os

cache = '../vector_cache'
if not os.path.exists(cache):
    os.mkdir(cache)
vectors = Vectors(name='glove.6B.' + str(vocab_dimension) + 'd.txt', cache=cache)

print("build vocab: start")
TEXT.build_vocab(train, vectors=vectors)
vocab = TEXT.vocab
def preprocess(question, equation, lQueryVars, sni_model, fields, use_sni): # handle $'s question = question.replace('$', ' $ ') question = question.replace('. ', ' . ') question = question.replace('?', ' ? ') question = re.sub(r',([\d\d\d])', r'\1', question) # join equations if needed equation = ' , '.join(equation) # seperate equation at operators equation = equation.replace('[', ' ( ') equation = equation.replace(']', ' ) ') equation = equation.replace('+', ' + ') equation = equation.replace('-', ' - ') equation = equation.replace('*', ' * ') equation = equation.replace('/', ' / ') equation = equation.replace('(', ' ( ') equation = equation.replace(')', ' ) ') equation = equation.replace('=', ' = ') equation = equation.replace('^', ' ^ ') equation = equation.split() question = question.split() # prevent inplace changes on question question_copy = [t for t in question] # prepend and postpend null tokens to question to allow for sni window size # of three question_copy = ['null', 'null', 'null' ] + question_copy + ['null', 'null', 'null'] # find and replace constants in question and equation i = 0 constants = dict() for j, token in enumerate(question): if isFloat(token): example = question_copy[j - 3:j + 4] ex = data.Example.fromlist([' '.join(example), ''], fields) dataset = data.Dataset([ex], fields) inp = None iterator = data.Iterator(dataset, batch_size=1) iterator.repeat = False for batch in iterator: inp = batch.text.t() #.cuda() #inp = inp.cuda(device=0) if (not use_sni) or (use_sni and isSignificant(inp, sni_model)): token = float(token) character = '[' + chr(97 + i) + ']' for symbol in equation: if isFloat(symbol) and float(symbol) == float(token): equation[equation.index(symbol)] = character constants[character] = str(token) for q in question: if isFloat(q) and float(q) == token: question[question.index(q)] = character i += 1 # find and replace variables in equation variables = [ x for x in equation if x not in ['+', '-', '*', '/', ',', '**', '(', ')', '='] and not isFloat(x) and not re.match(r'\[[a-z]\]', x) ] variables = np.unique(variables) i = 0 for v in variables: #equation = [x if x!=v else 'VAR_' + str(i) for x in equation] equation = [x if x != v else 'VAR' for x in equation] #equation = [x if x!=v else '[a]' for x in equation] i += 1 question = ' '.join(question) equation = ''.join(equation) # simplify equation print('equation (before):', equation) equation = equation.split(',') for i, x in enumerate(equation): x = x.replace('[', '') x = x.replace(']', '') x = x.split('=') x = str('(' + str(x[0]) + ')' + '-' + '(' + str(x[1]) + ')') parse_expr(x, evaluate=False) x = sympy.simplify(x) x = str(x) x = x.replace(' ', '') for k in constants.keys(): x = x.replace(k.strip('[').strip(']'), k) equation[i] = x + '=0' equation = sorted(equation) equation = ','.join(equation) j = 0 print('EQUATION:', equation) constants_in_equation = re.findall(r'\[[a-z]\]', equation) print(constants_in_equation) for k in sorted(constants_in_equation, reverse=False): #equation = equation.replace(k, '[' + chr(107 + j) + ']') equation = equation.replace(k, '[]') j += 1 print('EQUATION_:', equation) #print('constants:', constants) print('equation (after): ', equation) return question, equation, constants
def main():
    src_dir = "data/src"
    model_dir = "data/model"
    eval_dir = "data/eval"
    corpus = "lang8_small"
    en_emb = "glove"
    de_emb = "glove"
    seq_train = False
    emb_dim = 200
    batch_size = 1500

    # Data Loading
    vocab_file = os.path.join(model_dir, "%s.vocab" % (corpus))
    model_file = os.path.join(model_dir, "%s.%s.%s.transformer.pt" % (corpus, en_emb, de_emb))
    if not os.path.exists(eval_dir):
        os.makedirs(eval_dir)

    # Computing Unit
    device = torch.device("cpu")

    # Loading Data
    bos_word = '<s>'
    eos_word = '</s>'
    blank_word = '<blank>'
    min_freq = 2
    spacy_en = spacy.load('en')

    def tokenize(text):
        return [tkn.text for tkn in spacy_en.tokenizer(text)]

    TEXT = data.Field(tokenize=tokenize, init_token=bos_word,
                      eos_token=eos_word, pad_token=blank_word)

    test = datasets.TranslationDataset(path=os.path.join(src_dir, corpus),
                                       exts=('.test.src', '.test.trg'),
                                       fields=(TEXT, TEXT))

    # use the same order as the original data
    test_iter = data.Iterator(test, batch_size=batch_size, device=device,
                              sort=False, repeat=False, train=False)

    random_idx = random.randint(0, len(test) - 1)
    print(test[random_idx].src)
    print(test[random_idx].trg)

    # Vocabulary
    TEXT.vocab = torch.load(vocab_file)
    pad_idx = TEXT.vocab.stoi["<blank>"]
    print("Load %s vocabulary; vocab size = %d" % (corpus, len(TEXT.vocab)))

    # Word Embedding
    encoder_emb, decoder_emb = get_emb(en_emb, de_emb, TEXT.vocab, device, d_model=emb_dim)

    # Translation
    model = BuildModel(len(TEXT.vocab), encoder_emb, decoder_emb, d_model=emb_dim).to(device)
    model.load_state_dict(torch.load(model_file))
    model.eval()

    print("Predicting %s ..." % (corpus))
    src, trg, pred = [], [], []
    for batch in (rebatch(pad_idx, b) for b in test_iter):
        out = greedy_decode(model, TEXT.vocab, batch.src, batch.src_mask)
        # print("SRC OUT: ", src.shape, out.shape)
        probs = model.generator(out)
        _, prediction = torch.max(probs, dim=-1)

        source = [[TEXT.vocab.itos[word] for word in words[1:]] for words in batch.src]
        target = [[TEXT.vocab.itos[word] for word in words[1:]] for words in batch.trg]
        translation = [[TEXT.vocab.itos[word] for word in words] for words in prediction]

        for i in range(len(translation)):
            src.append(' '.join(source[i]).split('</s>')[0])
            trg.append(' '.join(target[i]).split('</s>')[0])
            pred.append(' '.join(translation[i]).split('</s>')[0])

            # skip examples with unknown words in src or trg
            if '<unk>' in src[-1] or '<unk>' in trg[-1]:
                continue

            print("Source:", src[-1])
            print("Target:", trg[-1])
            print("Translation:", pred[-1])
            print()

    prefix = os.path.join(eval_dir, '%s.%s.%s.eval' % (corpus, en_emb, de_emb))
    for sentences, ext in zip([src, trg, pred], ['.src', '.trg', '.pred']):
        with open(prefix + ext, 'w+') as f:
            f.write('\n'.join(sentences))
logging.info(f'Unique tokens in TARGET vocab: {len(target_variable.vocab)}')

# Automatically shuffles and buckets the input sequences into sequences of similar length
train_iter, valid_iter = data.BucketIterator.splits(
    (train_data, valid_data),
    sort_key=lambda x: len(x.tweet),  # what function/field to use to group the data
    batch_size=BATCH_SIZE,
    device=device)

# Don't want to shuffle test data, so use a standard iterator
dev_iter = data.Iterator(dev_data, batch_size=BATCH_SIZE, device=device,
                         train=False, sort=False, sort_within_batch=False)
test_iter = data.Iterator(test_data, batch_size=BATCH_SIZE, device=device,
                          train=False, sort=False, sort_within_batch=False)

emb_shape = text_variable.vocab.vectors.shape
input_dim = emb_shape[0]
embedding_dim = emb_shape[1]
output_dim = 1
pretrained_embeddings = text_variable.vocab.vectors
def get_vocabularies_and_iterators(experiment, data_dir=None, max_len=30): """ Creates vocabularies and iterators for the experiment :param experiment: the Experiment object including all settings about the experiment :param data_dir: the directory where data is stored in. If None, default is applied :param max_len: the max length, default is the sentence max length considered during tokenization process :return: src vocabulary, trg vocabulary, datasets and iteratotrs + sample iterator if dataset europarl is used """ device = experiment.get_device() #### Create torchtext fields ####### SRC, TRG voc_limit = experiment.voc_limit min_freq = experiment.min_freq corpus = experiment.corpus language_code = experiment.lang_code reduce = experiment.reduce print("Vocabulary limit:", voc_limit) reverse_input = experiment.reverse_input print("Source reversed:", reverse_input) print("Required samples:") print(experiment.train_samples, experiment.val_samples, experiment.test_samples) PREPRO = False if corpus == "europarl" else True MODE = "w" src_tokenizer, trg_tokenizer = get_custom_tokenizer( "en", mode=MODE, prepro=PREPRO), get_custom_tokenizer(language_code, mode=MODE, prepro=PREPRO) src_vocab = Field(tokenize=lambda s: src_tokenizer.tokenize(s), include_lengths=False, init_token=None, eos_token=None, pad_token=PAD_TOKEN, unk_token=UNK_TOKEN, lower=True) trg_vocab = Field(tokenize=lambda s: trg_tokenizer.tokenize(s), include_lengths=False, init_token=SOS_TOKEN, eos_token=EOS_TOKEN, pad_token=PAD_TOKEN, unk_token=UNK_TOKEN, lower=True) print("Fields created!") ####### create splits ########## if corpus == "europarl": root = os.path.expanduser(DATA_DIR_PREPRO) if not data_dir: data_dir = os.path.join(root, corpus, language_code, "splits", str(max_len)) # local directory # check if files have been preprocessed try: files = os.listdir(data_dir) if len(files) < 8: print( "ERROR: Not enough training files found at {}!\nTraining the model on the Europarl dataset requires train, val, test and samples splits for each language!" .format(data_dir)) print( "Please drerun the script 'preprocess.py' for the given <lang_code>!" ) except FileNotFoundError: print("ERROR: Training files not found at {}!".format(data_dir)) print( "Please run the 'preprocess.py' script for the given <lang_code> before training the model!" ) exit(-1) print("Loading data...") start = time.time() file_type = experiment.tok exts = ("." + experiment.get_src_lang(), "." + experiment.get_trg_lang()) train, val, test = Seq2SeqDataset.splits(fields=(src_vocab, trg_vocab), exts=exts, train="train." + file_type, validation="val." + file_type, test="test." + file_type, path=data_dir, reduce=reduce, truncate=experiment.truncate) ### samples is used to check translations during the training phase samples = Seq2SeqDataset.splits(fields=(src_vocab, trg_vocab), exts=exts, train="samples." 
+ file_type, validation="", test="", path=data_dir) end = time.time() print("Duration: {}".format(convert_time_unit(end - start))) print("Total number of sentences: {}".format( (len(train) + len(val) + len(test)))) else: #### Training on IWSLT torchtext corpus ##### print("Loading data...") start = time.time() path = os.path.expanduser(os.path.join(DATA_DIR_PREPRO, "iwslt")) os.makedirs(path, exist_ok=True) exts = (".en", ".de") if experiment.get_src_lang() == "en" else (".de", ".en") ## see: https://lukemelas.github.io/machine-translation.html train, val, test = datasets.IWSLT.splits( root=path, exts=exts, fields=(src_vocab, trg_vocab), filter_pred=lambda x: max(len(vars(x)['src']), len(vars(x)['trg']) ) <= experiment.truncate) samples = None end = time.time() print("Duration: {}".format(convert_time_unit(end - start))) print("Total number of sentences: {}".format( (len(train) + len(val) + len(test)))) if voc_limit > 0: src_vocab.build_vocab(train, min_freq=min_freq, max_size=voc_limit) trg_vocab.build_vocab(train, min_freq=min_freq, max_size=voc_limit) print("Vocabularies created!") else: src_vocab.build_vocab(train, min_freq=min_freq) trg_vocab.build_vocab(train, min_freq=min_freq) print("Vocabularies created!") #### Iterators ##### # Create iterators to process text in batches of approx. the same length train_iter = data.BucketIterator(train, batch_size=experiment.batch_size, device=device, repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)), shuffle=True) val_iter = data.BucketIterator(val, 1, device=device, repeat=False, sort_key=lambda x: (len(x.src)), shuffle=True) test_iter = data.Iterator(test, batch_size=1, device=device, repeat=False, sort_key=lambda x: (len(x.src)), shuffle=False) if samples[0].examples: samples_iter = data.Iterator(samples[0], batch_size=1, device=device, repeat=False, shuffle=False, sort_key=lambda x: (len(x.src))) else: samples_iter = None return src_vocab, trg_vocab, train_iter, val_iter, test_iter, train, val, test, samples, samples_iter
with open(args.dic, 'rb') as dic_file:
    dictionary = pickle.load(dic_file)

# Reconstruct the dictionary in torchtext.
counter = Counter({'<unk>': 0, '</s>': 0})
TEXT.vocab = vocab.Vocab(counter, specials=['<unk>', '</s>'])
TEXT.vocab.itos = dictionary.idx2word
TEXT.vocab.stoi = defaultdict(vocab._default_unk_index, dictionary.word2idx)
TEXT.vocab.load_vectors('glove.6B.%dd' % args.embedding_dim)
itos = TEXT.vocab.itos if args.p else None
print('Vocab size %d' % len(TEXT.vocab))

train_iter = data.Iterator(dataset=train, batch_size=args.batch_size,
                           sort_key=lambda x: len(x.context), sort=True, repeat=False)
valid_iter = data.Iterator(dataset=valid, batch_size=args.batch_size,
                           sort_key=lambda x: len(x.context), sort=True, repeat=False)

print('Initializing the model')
if args.load_model != '':
    with open(args.load_model, 'rb') as f:
        model = torch.load(f).cuda()
elif args.decider_type == 'cnncontext':
    model = CNNContextClassifier(len(TEXT.vocab),
text_field = data.Field(lower=True, fix_length=40)
label_field = data.Field(unk_token=None, pad_token=None)
train_data, valid_data, test_data = load_data(text_field, label_field)
args.vocab_size = len(text_field.vocab)
args.target_size = len(label_field.vocab)
args.weight_matrix = text_field.vocab.vectors
print(label_field.vocab.itos)
# print(label_field.vocab.itos)

print("\nParameters:")
for attr, value in sorted(args.__dict__.items()):
    print("\t{}={}".format(attr.upper(), value))

train_iter = data.Iterator(dataset=train_data, batch_size=args.batch_size, shuffle=True)
valid_iter = data.Iterator(dataset=valid_data, batch_size=args.batch_size, shuffle=False)
test_iter = data.Iterator(dataset=test_data, batch_size=args.batch_size, shuffle=False)

if args.train is True:
    print("Start training...")
    esim = model.ESIM(args)
    if args.cuda:
        esim = esim.cuda()
    train.train(train_iter, valid_iter, esim, args)
else:
    path='../data/tweet/multi/top{}/train.csv'.format(emoji_num),
    format='csv',
    fields=[('Id', ID), ('Text', TEXT), ('Label', LABEL)],
    skip_header=True)
valid = data.TabularDataset(
    path='../data/tweet/multi/top{}/valid.csv'.format(emoji_num),
    format='csv',
    fields=[('Id', ID), ('Text', TEXT), ('Label', LABEL)],
    skip_header=True)
test = data.TabularDataset(
    path='../data/tweet/multi/top{}/test.csv'.format(emoji_num),
    format='csv',
    fields=[('Id', ID), ('Text', TEXT), ('Label', LABEL)],
    skip_header=True)

TEXT.build_vocab(train, valid, test, min_freq=5)
print('Building vocabulary Finished.')

train_iter = data.BucketIterator(dataset=train, batch_size=batch_size,
                                 sort_key=lambda x: len(x.Text), device=device, repeat=False)
valid_iter = data.Iterator(dataset=valid, batch_size=batch_size, device=device,
                           shuffle=False, repeat=False)
test_iter = data.Iterator(dataset=test, batch_size=batch_size, device=device,
                          shuffle=False, repeat=False)

train_dl = datahelper.BatchWrapper(train_iter, ["Text", "Label"])
valid_dl = datahelper.BatchWrapper(valid_iter, ["Text", "Label"])
test_dl = datahelper.BatchWrapper(test_iter, ["Text", "Label"])
print('Reading data done.')

word_matrix = datahelper.wordlist_to_matrix("../data/embedding/top5embedding.txt",
                                            TEXT.vocab.itos, device, embedding_dim)


def predict_on(model, data_dl, loss_func, device, model_state_path=None):
    if model_state_path:
        model.load_state_dict(torch.load(model_state_path))
    print('Start predicting...')
TEXT.build_vocab(trainds, valds, max_size=100000, vectors=vec)
# build vocab for labels
LABEL.build_vocab(trainds)

outputs_last_layer_cove = MTLSTM(n_vocab=len(TEXT.vocab), vectors=TEXT.vocab.vectors)
outputs_both_layer_cove = MTLSTM(n_vocab=len(TEXT.vocab), vectors=TEXT.vocab.vectors, layer0=True)
outputs_both_layer_cove_with_glove = MTLSTM(n_vocab=len(TEXT.vocab), vectors=TEXT.vocab.vectors,
                                            layer0=True, residual_embeddings=True)

traindl, valdl = data.BucketIterator.splits(
    datasets=(trainds, valds),         # the train and validation TabularDatasets
    batch_sizes=(64, len(valid)),      # batch sizes for train and validation
    sort_key=lambda x: len(x.moment),  # the attribute the text is sorted on
    device=None,                       # -1 means CPU; 0 or None means GPU
    sort_within_batch=True,
    repeat=False)
test_iter = data.Iterator(tst, batch_size=64, device=None, sort=False,
                          sort_within_batch=False, repeat=False)


class BatchGenerator:
    def __init__(self, dl, x_field, y_field):
        self.dl, self.x_field, self.y_field = dl, x_field, y_field

    def __iter__(self):
        for batch in self.dl:
            X = getattr(batch, self.x_field)
            y = getattr(batch, self.y_field)
            yield (X, y)


train_batch_it = BatchGenerator(traindl, 'moment', 'social')
valid_batch_it = BatchGenerator(valdl, 'moment', 'social')
test_batch_it = BatchGenerator(test_iter, 'moment', 'hmid')
dev, test = data.TabularDataset.splits(path=args.output, validation='valid.txt',
                                       test='test.txt', format='tsv', fields=field)
TEXT.build_vocab(train, dev, test)
ED.build_vocab(train, dev)

total_num = len(test)
print('total num of example: {}'.format(total_num))

# load the model
if args.gpu == -1:
    # Load all tensors onto the CPU
    test_iter = data.Iterator(test, batch_size=args.batch_size, train=False, repeat=False,
                              sort=False, shuffle=False, sort_within_batch=False)
    model = torch.load(args.dete_model, map_location=lambda storage, loc: storage)
    model.config.cuda = False
else:
    test_iter = data.Iterator(test, batch_size=args.batch_size,
                              device=torch.device('cuda', args.gpu), train=False,
                              repeat=False, sort=False, shuffle=False,
                              sort_within_batch=False)
def train(data_path, train_path, val_path, test_path, mf, epochs, bs, opt, net_type, ly, hs, num_dir, emb_dim, embfix, pretrained_emb, dropout, pred_filter, save_path, save, verbose=False): ############################################################################ # Load data ############################################################################ embfix = False # Delete this line later pretrained_emb = False # Delete this line later cuda = int(torch.cuda.is_available()) - 1 TEXT = data.Field(lower=True, init_token="<start>", eos_token="<end>") LABELS = data.Field(sequential=False) VAR_VALUES_VAL = data.Field(sequential=False) VAR_VALUES_TEST = data.Field(sequential=False) ANS_VAL = data.Field(sequential=False) ANS_TEST = data.Field(sequential=False) """ train, val, test = data.TabularDataset.splits( path=data_path, train=train_path, validation=val_path, test=test_path, format='tsv', fields=[('text', TEXT), ('label', LABELS), ('var_values', VAR_VALUES_VAL), ('ans', ANS)]) """ train = data.TabularDataset(path=data_path + train_path, format='tsv', fields=[('text', TEXT), ('label', LABELS), ('var_values', VAR_VALUES_VAL), ('ans', ANS_VAL)]) val = data.TabularDataset(path=data_path + val_path, format='tsv', fields=[('text', TEXT), ('label', LABELS), ('var_values', VAR_VALUES_VAL), ('ans', ANS_VAL)]) test = data.TabularDataset(path=data_path + test_path, format='tsv', fields=[('text', TEXT), ('label', LABELS), ('var_values', VAR_VALUES_TEST), ('ans', ANS_TEST)]) prevecs = None if (pretrained_emb == True): print('USING PRETRAINED EMB') TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=emb_dim), min_freq=mf) prevecs = TEXT.vocab.vectors else: TEXT.build_vocab(train) LABELS.build_vocab(train) print(len(LABELS.vocab.itos)) VAR_VALUES_VAL.build_vocab(val) VAR_VALUES_TEST.build_vocab(test) ANS_VAL.build_vocab(val) ANS_TEST.build_vocab(test) if not os.path.isdir(save_path): os.makedirs(save_path) torch.save(LABELS.vocab.itos, save_path + 'LABELS_vocab_itos.pt') snis = [eq.count('[') for eq in LABELS.vocab.itos] """ train_iter, val_iter, test_iter = data.BucketIterator.splits( (train, val, test), batch_sizes=(bs, bs, bs), sort_key=lambda x: len(x.text)) """ train_iter = data.BucketIterator(train, batch_size=bs, sort_key=lambda x: len(x.text), train=True) val_iter = data.Iterator(val, batch_size=bs, repeat=False, train=False, sort=False, shuffle=False) test_iter = data.Iterator(test, batch_size=len(test), repeat=False, train=False, sort=False, shuffle=False) num_classes = len(LABELS.vocab) input_size = len(TEXT.vocab) ############################################################################ # Build the model ############################################################################ model = m.Model(input_size=input_size, hidden_size=hs, num_classes=num_classes, prevecs=prevecs, num_layers=ly, num_dir=num_dir, batch_size=bs, emb_dim=emb_dim, embfix=embfix, dropout=dropout, net_type=net_type) criterion = nn.CrossEntropyLoss() # Select optimizer if (opt == 'adamax'): optimizer = torch.optim.Adamax(model.parameters()) elif (opt == 'adam'): optimizer = torch.optim.Adam(model.parameters()) elif (opt == 'sgd'): optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.5) else: #print('Optimizer unknown, defaulting to adamax') optimizer = torch.optim.Adamax(model.parameters()) ############################################################################ # Training the Model ############################################################################ if cuda == 0: model = model.cuda() 
hyperparams = { 'mf': mf, 'epochs': epochs, 'bs': bs, 'opt': opt, 'net_type': net_type, 'ly': ly, 'hs': hs, 'num_dir': num_dir, 'emb_dim': emb_dim, 'embfix': embfix, 'pretrained_emb': pretrained_emb, 'dropout': dropout, 'pred_filter': pred_filter } print('Training:', hyperparams) #print('pretrained_emb:', pretrained_emb) #print('embfix:', embfix) results = [] best_true_acc = 0 for epoch in range(epochs): tot_loss = 0 train_iter.repeat = False for batch_count, batch in enumerate(train_iter): model.zero_grad() inp = batch.text.t() preds = model(inp) #print(F.softmax(preds)) loss = criterion(preds, batch.label) loss.backward() optimizer.step() tot_loss += loss.data[0] # load correct solver solver = None if 'tencent' in data_path: solver = tencent_solver if 'kushman' in data_path: solver = kushman_solver if 'ms_draw' in data_path: solver = msdraw_solver if 'mawps' in data_path: solver = mawps_solver (avg_loss, accuracy, true_acc, corrects, size, t5_acc, t5_corrects, mrr, eval_preds) = evaluate(val_iter, model, TEXT, emb_dim, LABELS, VAR_VALUES_VAL, ANS_VAL, snis, pred_filter=pred_filter, solver=solver) print('Classification acc (VAL):', accuracy) (_, test_acc, test_true_acc, _, _, _, _, _, test_eval_preds) = evaluate(test_iter, model, TEXT, emb_dim, LABELS, VAR_VALUES_TEST, ANS_TEST, snis, pred_filter=pred_filter, solver=solver) # save best preds file if true_acc > best_true_acc: if not os.path.isdir(save_path): os.makedirs(save_path) predictions_file = open(save_path + 'predictions.txt', 'w') for line in eval_preds: predictions_file.write(line + '\n') predictions_file.close() if save: if not os.path.isdir(save_path): os.makedirs(save_path) torch.save(model, save_path + '{}_e{}.pt'.format(accuracy, epoch)) results = np.append( results, { 'epoch': epoch, 'avg_loss': avg_loss, 'accuracy': accuracy, 'true_acc': true_acc, 'corrects': corrects, 'size': size, 't5_acc': t5_acc, 't5_corrects': t5_corrects, 'mrr': mrr, 'preds': eval_preds, 'test_eval_preds': test_eval_preds, 'test_true_acc': test_true_acc, 'test_acc': test_acc }) if verbose: print('\nEvaluation - loss: {:.6f} acc: {:.4f}%({}/{}) ' \ 'true_acc: {:.4f}%(todo/todo) t5_acc: {:.4f}%({}/{}) MRR:' \ '{:.6f}\n'.format(avg_loss, accuracy, corrects, size, t5_acc, t5_corrects, size, mrr)) #print('Best Accuracy:', np.sort([i['accuracy'] for i in results])[-1]) #print('Best True Accuracy:', np.sort([i['true_acc'] for i in results])[-1]) return results
def tokenizer(txt):
    return list(jieba.cut(txt))


TEXT = data.Field(sequential=True, tokenize=tokenizer, pad_token='<pad>')
LABEL = data.Field(sequential=False, use_vocab=False)

ftrain = 'train3.tsv'
train = data.TabularDataset(path=os.path.join(DATA, ftrain), format='tsv',
                            fields=[('seq1', TEXT),
                                    ('seq2', TEXT),
                                    ('lbl', LABEL)])
TEXT.build_vocab(train)
train_iter = data.Iterator(train, batch_size=4, sort=False, repeat=False)

# vocab = TEXT.vocab
embedding = torch.nn.Embedding(num_embeddings=len(TEXT.vocab.itos),
                               embedding_dim=10,
                               padding_idx=TEXT.vocab.stoi[TEXT.pad_token])

for sample in train_iter:
    seq1, seq2, lbl = [getattr(sample, name) for name in ['seq1', 'seq2', 'lbl']]
    embedding(seq1.unsqueeze(-1))
    embedding(seq2.unsqueeze(-1))

fvalid = 'train2.tsv'
TEXT2 = data.Field(sequential=True, tokenize=tokenizer, pad_token='<pad>')
    path='/content/drive/My Drive/dataset/Cornell-Movie-Quotes-Corpus/',
    train='train.csv', validation='validation.csv', test='test.csv',
    format='csv',
    fields=[('src', SRC), ('trg', TRG), ('label_src', LABEL_SRC), ('label_trg', LABEL_TRG)])
"""

SRC.build_vocab(train_ds, vectors=english_fasttext_vectors)
TRG.build_vocab(train_ds, vectors=english_fasttext_vectors)
# SRC.build_vocab(train_ds)
# TRG.build_vocab(train_ds)
print(TRG.vocab.stoi)
print(len(TRG.vocab.stoi))

from torchtext import data

batch_size = 64
train_dl = data.Iterator(train_ds, batch_size=batch_size, train=True)
val_dl = data.Iterator(val_ds, batch_size=batch_size, train=False, sort=False)

batch = next(iter(val_dl))
print(batch.src[0].shape)
print(batch.trg[0].shape)
print(batch.label_src.shape)
print(batch.trg[0][:, 1:])
print(batch.trg[0])


class EncoderRNN(nn.Module):
    def __init__(self, emb_size, hidden_size, vocab_size, text_embedding_vectors,
                 emotion_size, dropout=0):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        if text_embedding_vectors == None:
            self.embedding = nn.Embedding(vocab_size, emb_size)
        else:
print('Valid Example: {}'.format('\n'.join([
    '{} ---- {}'.format(example.text, example.label)
    for example in valid_data.examples[:5]
])))
print('Test Example: {}'.format('\n'.join([
    '{} ---- {}'.format(example.text, example.label)
    for example in test_data.examples[:5]
])))

train_iter = data.BucketIterator(dataset=train_data, batch_size=BATCH_SIZE,
                                 sort_key=lambda x: len(x.text))
valid_iter = data.BucketIterator(dataset=valid_data, batch_size=BATCH_SIZE,
                                 sort_key=lambda x: len(x.text))
test_iter = data.Iterator(dataset=test_data, batch_size=BATCH_SIZE, sort=False)

# build model
from text_classify.model import RNN, WordAVGModel, TextCNN
from text_classify.transformer import Transformer

embedding_size = TEXT.vocab.vectors.shape[1] if USE_PRE_TRAIN_MODEL else EMBEDDING_SIZE
# model = RNN(input_size=len(TEXT.vocab), embedding_size=embedding_size, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, output_size=len(LABEL.vocab))
# model = TextCNN(input_size=len(TEXT.vocab), embedding_size=embedding_size, output_size=len(LABEL.vocab), pooling_method='avg')
model = WordAVGModel(vocab_size=len(TEXT.vocab), embedding_dim=embedding_size,
                     output_dim=len(LABEL.vocab))
# model = Transformer(input_size=len(TEXT.vocab), d_model=embedding_size, num_head=4, d_ff=HIDDEN_SIZE, output_size=len(LABEL.vocab), pad=TEXT.vocab.stoi['<pad>'], use_mask=True)
utils.weight_init(model)
TEXT, LABEL, train_iter, valid_iter = \
    iters.build_iters_lm(ftrain=opt.ftrain, fvalid=opt.fvalid,
                         bsz=opt.batch_size, level=opt.level)

ftest = change_file_encoding(opt.ftest)
test = data.TabularDataset(path=ftest, format='tsv',
                           fields=[('index', INDEX),
                                   ('seq1', TEXT),
                                   ('seq2', TEXT)])
test_iter = data.Iterator(test, batch_size=opt.batch_size, sort=False, repeat=False)

location = opt.gpu if torch.cuda.is_available() and opt.gpu != -1 else 'cpu'
device = torch.device(location)

encoder = Encoder(len(TEXT.vocab.stoi), opt.rnn_size, TEXT.vocab.stoi[PAD_WORD],
                  opt.enc_layers, opt.dropout, opt.bidirection)
model = PhraseSim(encoder, opt.dropout).to(device)
init_model(opt, model)

if opt.load_idx != -1:
    basename = "{}-epoch-{}".format(opt.exp, opt.load_idx)
    model_fname = basename + ".model"
    location = {
def prepare_data_and_model(Model, args, using_gpu=True): if args.test: ## # narvi #train_path = "/home/zhouy/thesis/data/text_classification_data/train_try.csv" #test_path = "/home/zhouy/thesis/data/text_classification_data/test_try.csv" # tut thinkstation # train_path = "/media/yi/harddrive/codes/thesis_sentimentAnalysis/data/text_classification_data/train_try.csv" # test_path = "/media/yi/harddrive/codes/thesis_sentimentAnalysis/data/text_classification_data/test_try.csv" # # tripadvisor dataset # # xps test_path = "D:\\sentimentAnalysis\\data\\text_classification_data\\test_model_data\\rev_sent_5_score_train_test\\tripadvisor\\test_try.csv" train_path = "D:\\sentimentAnalysis\\data\\text_classification_data\\test_model_data\\rev_sent_5_score_train_test\\tripadvisor\\train_try.csv" else: # original dataset # # narvi #train_path = "/home/zhouy/thesis/data/text_classification_data/tripadvisor_train_dataset.csv" #test_path = "/home/zhouy/thesis/data/text_classification_data/tripadvisor_test_dataset.csv" # # tut thinkstation # train_path = "/home/yi/sentimentAnalysis/algos/5_ToxicCommentClassification-pytorch/data/train.csv" # test_path = "/home/yi/sentimentAnalysis/algos/5_ToxicCommentClassification-pytorch/data/test.csv" # # xps # train_path = "D:/sentimentAnalysis/algos/5_ToxicCommentClassification-pytorch/data/train.csv" # test_path = "D:/sentimentAnalysis/algos/5_ToxicCommentClassification-pytorch/data/test.csv" # tripadvisor dataset # xps train_path = "D:/sentimentAnalysis/data/text_classification_data/tripadvisor_train_dataset.csv" test_path = "D:/sentimentAnalysis/data/text_classification_data/tripadvisor_test_dataset.csv" def tokenize(text): fileters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' trans_map = str.maketrans(fileters, " " * len(fileters)) text = text.translate(trans_map) text = [ tok.text for tok in spacy_en.tokenizer(text) if tok.text != ' ' ] tokenized_text = [] auxiliary_verbs = ['am', 'is', 'are', 'was', 'were', "'s"] for token in text: if token == "n't": tmp = 'not' elif token == "'ll": tmp = 'will' elif token in auxiliary_verbs: tmp = 'be' else: tmp = token tokenized_text.append(tmp) return tokenized_text if args.dataset == 'tripadvisor': TEXT = data.Field(tokenize=tokenize, lower=True, batch_first=True, truncate_first=True) LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True) test = CustomDataset(test_path, text_field=TEXT, label_field=LABEL, test=True) train = CustomDataset(train_path, text_field=TEXT, label_field=LABEL) # should save the above train, test, these two variables. 
if args.wordembedding == "glove-6b": vectors = GloVe(name='6B', dim=args.embed_dim) elif args.wordembedding == "FastText": vectors = FastText(language='en') else: NotImplementedError # # FastText # vectors = FastText(name='6B', dim=args.embed_dim) vectors.unk_init = init.xavier_uniform # 下面这行代码报错 # TEXT.build_vocab(train, vectors=vectors, max_size=30000) TEXT.build_vocab(train, vectors=vectors, max_size=10000, min_freq=10) LABEL.build_vocab(train) print('train.fields', train.fields) print('train.name', getattr(train, 'text')) print('len(train)', len(train)) print('vars(train[0])', vars(train[0])) # using the training corpus to create the vocabulary train_iter = data.Iterator(dataset=train, batch_size=args.batch_size, train=True, repeat=False, device=0 if using_gpu else -1) test_iter = data.Iterator(dataset=test, batch_size=args.batch_size, train=False, sort=False, device=0 if using_gpu else -1) # the number of unique words num_tokens = len(TEXT.vocab.itos) args.num_tokens = num_tokens dev_iter = test_iter elif args.dataset == 'SST': text_field = data.Field(batch_first=True, lower=True, tokenize=tokenize) label_field = data.Field(sequential=False, batch_first=True) train_data, dev_data, test_data = datasets.SST.splits( text_field, label_field, fine_grained=True) vectors = GloVe(name='6B', dim=args.embed_dim) text_field.build_vocab(train_data, vectors=vectors, min_freq=1) label_field.build_vocab(train_data) train_iter = data.Iterator(train_data, batch_size=args.batch_size, device=0 if using_gpu else -1, train=True, repeat=False, sort=False, shuffle=True) dev_iter = data.Iterator(dev_data, batch_size=args.batch_size, device=0 if using_gpu else -1, train=False, repeat=False, sort=False, shuffle=False) test_iter = data.Iterator(test_data, batch_size=args.batch_size, device=0 if using_gpu else -1, train=False, repeat=False, sort=False, shuffle=False) # train_iter, dev_iter, test_iter = sst(text_field, label_field) # train_iter, dev_iter, test_iter = SST.iters(batch_size=16, device=0 if using_gpu else -1, vectors="glove.6B.300d") # config.target_class = train_iter.dataset.NUM_CLASSES args.num_tokens = len(text_field.vocab) args.num_classes = len(label_field.vocab) - 1 print("num_classes: ", args.num_classes) if args.model == "VDCNN": net = Model(depth=29, vocabulary_size=args.num_tokens, embed_size=16, n_classes=args.num_classes, k=2, optional_shortcut=True) else: net = Model(args) # # copy pretrained glove word embedding into the model # net.embedding.weight.data.copy_(TEXT.vocab.vectors) if using_gpu: net.cuda() return train_iter, test_iter, net
def train(self, args): ws = self.ws records = self.records logger = ws.logger('DeepSPEnv.train') model = self.sp_model model.train() optim = torch.optim.Adam(model.parameters()) train_iter = data.Iterator(records, 1) # one sequence at a time epoch_size = len(train_iter) state = self.load_training_state() if not args.restart and state: self.load_model('int') train_iter.load_state_dict(state['train_iter']) optim.load_state_dict(state['optim']) current_run = state['current_run'] loss_avg, mae_avg, acc_avg = state['avg'] start_epoch = train_iter.epoch n_samples = state['n_samples'] initial = train_iter._iterations_this_epoch else: if not args.restart: logger.info('nothing to resume, starting from scratch') n_samples = 0 # track total #samples for plotting now = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S") current_run = str(self.ws.log_path / ('DeepSPEnv.train/run-%s/' % now)) loss_avg = [] mae_avg = [] acc_avg = [] start_epoch = 0 initial = 0 writer = SummaryWriter(str(current_run)) for epoch in range(start_epoch, args.n_epochs): epoch_iter = iter(tqdm(islice(train_iter, epoch_size - initial), total=epoch_size, initial=initial, desc=f'Epoch {epoch+1:3d}: ', unit='bz')) initial = 0 try: # training for batch in critical(epoch_iter): # critical section on one batch i = train_iter._iterations_this_epoch n_samples += len(batch) # backprop on one batch optim.zero_grad() hidden = None losses = [] maes = [] for q, s in zip(batch.question, batch.score): q_index = q[0].item() if q_index == -1: continue q = self.questions[q_index] q['text'] = torch.tensor(q['text']) q['knowledge'] = torch.tensor(q['knowledge']) q['difficulty'] = torch.tensor([q['difficulty']]) s = s.float() s_, hidden = model(q, s, hidden) losses.append(F.mse_loss(s_.view(1), s).view(1)) maes.append(F.l1_loss(s_.view(1), s).item()) if not losses: continue loss = torch.cat(losses).mean() loss.backward() optim.step() # log loss loss_avg.append(loss.item()) mae_avg.extend(maes) acc_avg.extend(np.asarray(maes) < 0.5) if args.log_every == len(loss_avg): writer.add_scalar('DeepSPEnv.train/loss', np.mean(loss_avg), n_samples) writer.add_scalar('DeepSPEnv.train/mae', np.mean(mae_avg), n_samples) writer.add_scalar('DeepSPEnv.train/acc', np.mean(acc_avg), n_samples) loss_avg = [] mae_avg = [] acc_avg = [] # save model if args.save_every > 0 and i % args.save_every == 0: self.save_model(f'{epoch}.{i}') # save after one epoch self.save_model(epoch + 1) except KeyboardInterrupt: self.save_training_state({ 'current_run': current_run, 'optim': optim.state_dict(), 'train_iter': train_iter.state_dict(), 'n_samples': n_samples, 'avg': (loss_avg, mae_avg, acc_avg) }) self.save_model('int') raise
def load_data(opt):
    # fix_length is taken from the options
    TEXT = data.Field(sequential=True, fix_length=opt.max_text_len)  # word- or character-level
    LABEL = data.Field(sequential=False, use_vocab=False)

    # load
    # word/ or article/
    train_path = opt.data_path + opt.text_type + '/train_set.csv'
    val_path = opt.data_path + opt.text_type + '/val_set.csv'
    test_path = opt.data_path + opt.text_type + '/test_set.csv'

    train_path = 'D:/git/dataset/val_set.csv'
    test_path = 'D:/git/dataset/val_set.csv'
    val_path = 'D:/git/dataset/val_set.csv'

    # aug for data augmentation
    if opt.aug:
        print('make augmentation datasets!')
    train = GrandDataset(train_path, text_field=TEXT, label_field=LABEL,
                         text_type=opt.text_type, test=False, aug=opt.aug)
    val = GrandDataset(val_path, text_field=TEXT, label_field=LABEL,
                       text_type=opt.text_type, test=False)
    test = GrandDataset(test_path, text_field=TEXT, label_field=None,
                        text_type=opt.text_type, test=True)

    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    embedding_path = '{}/{}_{}.txt'.format(opt.embedding_path, opt.text_type, opt.embedding_dim)
    vectors = Vectors(name=embedding_path, cache=cache)
    print('load word2vec vectors from {}'.format(embedding_path))
    vectors.unk_init = init.xavier_uniform_  # initialization for tokens missing from the pretrained vectors

    # build the vocabulary
    print('building {} vocabulary......'.format(opt.text_type))
    TEXT.build_vocab(train, val, test, min_freq=5, vectors=vectors)
    # LABEL.build_vocab(train)

    # build the iterators
    # For test_iter, shuffle, sort and repeat must all be False, otherwise torchtext scrambles the sample order.
    # For variable-length inputs, set sort_within_batch=True so each batch is sorted in descending order by sort_key.
    train_iter = data.BucketIterator(dataset=train, batch_size=opt.batch_size, shuffle=True,
                                     sort_within_batch=False, repeat=False, device=opt.device)
    # val_iter = data.BucketIterator(dataset=val, batch_size=opt.batch_size, sort_within_batch=False, repeat=False,
    #                                device=opt.device)
    # train_iter = data.Iterator(dataset=train, batch_size=opt.batch_size, train=True, repeat=False, device=opt.device)
    val_iter = data.Iterator(dataset=val, batch_size=opt.batch_size, shuffle=False, sort=False,
                             repeat=False, device=opt.device)
    test_iter = data.Iterator(dataset=test, batch_size=opt.batch_size, shuffle=False, sort=False,
                              repeat=False, device=opt.device)

    return train_iter, val_iter, test_iter, len(TEXT.vocab), TEXT.vocab.vectors
labels.build_vocab(train, dev, test)

if os.path.isfile(args.vector_cache):
    questions.vocab.vectors = torch.load(args.vector_cache)
else:
    questions.vocab.load_vectors(wv_dir=args.data_cache, wv_type=args.word_vectors,
                                 wv_dim=args.d_embed)
    os.makedirs(os.path.dirname(args.vector_cache), exist_ok=True)
    torch.save(questions.vocab.vectors, args.vector_cache)

# get iterators
train_iter = data.Iterator(train, batch_size=args.batch_size, device=args.gpu,
                           train=True, repeat=False, sort=False, shuffle=False)
dev_iter = data.Iterator(dev, batch_size=args.batch_size, device=args.gpu,
                         train=True, repeat=False, sort=False, shuffle=False)
test_iter = data.Iterator(test, batch_size=args.batch_size, device=args.gpu,
                          train=True, repeat=False,
def mkiters(self, train): args = self.args c = Counter([len(x.out) for x in train]) t1, t2, t3 = [], [], [] print("Sorting training data by len") for x in train: l = len(x.out) if l < 100: t1.append(x) elif l > 100 and l < 220: t2.append(x) else: t3.append(x) t1d = data.Dataset(t1, self.fields) t2d = data.Dataset(t2, self.fields) t3d = data.Dataset(t3, self.fields) valid = data.TabularDataset(path=args.path.replace("train", "val"), format='tsv', fields=self.fields) print("ds sizes:", end='\t') for ds in [t1d, t2d, t3d, valid]: print(len(ds.examples), end='\t') for x in ds: x.rawent = x.ent.split(" ; ") x.ent = self.vec_ents(x.ent, self.ENT) x.rel = self.mkGraphs(x.rel, len(x.ent[1])) if args.sparse: x.rel = (self.adjToSparse(x.rel[0]), x.rel[1]) x.tgt = x.out x.out = [ y.split("_")[0] + ">" if "_" in y else y for y in x.out ] x.sordertgt = torch.LongTensor( [int(y) + 3 for y in x.sorder.split(" ")]) x.sorder = [[ int(z) for z in y.strip().split(" ") if len(z) > 0 ] for y in x.sorder.split("-1")[:-1]] ds.fields["tgt"] = self.TGT ds.fields["rawent"] = data.RawField() ds.fields["sordertgt"] = data.RawField() self.t1_iter = data.Iterator(t1d, args.t1size, device=args.device, sort_key=lambda x: len(x.out), repeat=False, train=True) self.t2_iter = data.Iterator(t2d, args.t2size, device=args.device, sort_key=lambda x: len(x.out), repeat=False, train=True) self.t3_iter = data.Iterator(t3d, args.t3size, device=args.device, sort_key=lambda x: len(x.out), repeat=False, train=True) self.val_iter = data.Iterator(valid, args.t3size, device=args.device, sort_key=lambda x: len(x.out), sort=False, repeat=False, train=False)
model.load_state_dict(torch.load('age-3features-model.pt'))
model = model.to(device)

USER_ID = data.Field()
test_data = data.TabularDataset(path=base_dir + "embedding/test_3features.csv",
                                format='csv', skip_header=True,
                                fields=[('user_id', USER_ID),
                                        ('creative_id', creative_id_TEXT),
                                        ('ad_id', ad_id_TEXT),
                                        ('advertiser_id', advertiser_id_TEXT),
                                        ('product_id', product_id_TEXT)])
USER_ID.build_vocab(test_data)

test_iterator = data.Iterator(test_data, batch_size=BATCH_SIZE, sort=False,
                              sort_within_batch=False, device=device)

result_list = []
user_id_list = []
total_predictions = None

model.eval()
with torch.no_grad():
    for i, batch in enumerate(valid_iterator):
        print(i)
        creative_id_text, creative_id_text_length = batch.creative_id
        advertiser_id_text, advertiser_id_text_length = batch.advertiser_id
        ad_id_text, ad_id_text_length = batch.ad_id
        product_id_text, product_id_text_length = batch.product_id
        predictions = model(creative_id_text, creative_id_text_length,
def train(args): train_data, val_data, test_data, SRC, TGT = prepare_data(args) BATCH_SIZE = args.batch_size best_bleu_loss = 0 pad_idx = TGT.vocab.stoi["<pad>"] print("Size of source vocabulary:", len(SRC.vocab)) print("Size of target vocabulary:", len(TGT.vocab)) print("FC matrix:", args.hidden_dim, args.ff_dim) print(args.compress) model = transformer.make_model(len(SRC.vocab), len(TGT.vocab), d_model=args.hidden_dim, d_ff=args.ff_dim, N=args.num_blocks, compress=args.compress, compress_att=args.compress_attn, compress_mode=args.compress_mode, num_compress_enc=args.num_enc_blocks_comp, num_compress_dec=args.num_dec_blocks_comp ) model.to(device) if args.load_model: print('load model from [%s]' % args.load_model, file=sys.stderr) params = torch.load(args.load_model, map_location=lambda storage, loc: storage) # TODO args = params['args'] state_dict = params['model'] # opts = params[''] model.load_state_dict(state_dict) criterion = train_utils.LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1) # criterion = nn.NLLLoss(reduction="sum", ignore_index=0) criterion.to(device) train_iter = data.BucketIterator(train_data, batch_size=BATCH_SIZE, train=True, sort_within_batch=True, sort_key=lambda x: (len(x.src), len(x.trg)), repeat=False, device=device) valid_iter = data.Iterator(val_data, batch_size=BATCH_SIZE, train=False, sort=False, repeat=False, device=device) model_opt = opt.WrapperOpt(model.src_embed[0].d_model, 2, 4000, torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)) # train_time = begin_time = time.time() valid_params = (SRC, TGT, valid_iter) print("Number of examples in train: ", BATCH_SIZE * len([_ for _ in train_iter])) print("Number of examples in validation: ", BATCH_SIZE * len([_ for _ in valid_iter])) model_parameters = filter(lambda p: p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) print("Number of parameters: ", params) if args.debug: model2 = transformer.make_model(len(SRC.vocab), len(TGT.vocab), d_model=args.hidden_dim, d_ff=args.ff_dim, N=args.num_blocks, compress=True,compress_att=True, compress_mode=args.compress_mode, num_compress_enc=args.num_enc_blocks_comp, num_compress_dec=args.num_dec_blocks_comp) # print("Tranable parameters in fc module ", params2) debug_compress_info(model, model2) exit() os.makedirs(os.path.dirname(args.save_to), exist_ok=True) if args.multi_gpu: devices = list(np.arange(args.num_devices)) model_parallel = nn.DataParallel(model, device_ids=devices) logger_file = {}#Logger(name=args.exp_name) logger_file['bleu'] = [] logger_file['loss'] = [] for epoch in range(args.max_epoch): print("=" * 80) print("Epoch ", epoch + 1) print("=" * 80) print("Train...") if args.multi_gpu: model_parallel.train() train_loss_fn = MultiGPULossCompute(model.generator, criterion, devices=devices, opt=model_opt) train_model = model_parallel else: train_loss_fn = train_utils.LossCompute(model.generator, criterion, model_opt) model.train() _, logger_file = train_utils.run_epoch(args, (train_utils.rebatch(pad_idx, b) for b in train_iter), model_parallel if args.multi_gpu else model, train_loss_fn, valid_params=valid_params, epoch_num=epoch, logger=logger_file) if args.multi_gpu: model_parallel.eval() val_loss_fn = MultiGPULossCompute(model.generator, criterion, devices=devices, opt=model_opt) else: model.eval() val_loss_fn = train_utils.LossCompute(model.generator, criterion, model_opt) print("Validation...") loss, bleu_loss = train_utils.run_epoch(args, 
(train_utils.rebatch(pad_idx, b) for b in valid_iter),\ model_parallel if args.multi_gpu else model, val_loss_fn, valid_params=valid_params, is_valid=True) if bleu_loss > best_bleu_loss: best_bleu_loss = bleu_loss model_state_dict = model.state_dict() model_file = args.save_to + args.exp_name + 'valid.bin' checkpoint = { 'model': model_state_dict, } print('save model without optimizer [%s]' % model_file, file=sys.stderr) torch.save(checkpoint, model_file) print() print("Validation perplexity ", np.exp(loss)) with open("./logs/"+args.exp_name, 'wb') as f_out: pickle.dump(logger_file, f_out)
def getIter(self, dataset, **kwargs):
    if 'device' not in kwargs:
        kwargs = dict(kwargs, device=self.device)
    else:
        kwargs = dict(kwargs)  # just in case
    return data.Iterator(dataset, **kwargs)
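# Illustrative sketch only: a minimal, self-contained example of the pattern the
# getIter helper above wraps, written against the legacy torchtext Field/Dataset/Iterator
# API used throughout these snippets. All names here (TEXT, LABEL, dataset, it) are
# assumptions for demonstration, not project code.
from torchtext import data

TEXT = data.Field(sequential=True, lower=True)
LABEL = data.Field(sequential=False, use_vocab=False)
fields = [('text', TEXT), ('label', LABEL)]
examples = [data.Example.fromlist(["a tiny example sentence", 0], fields),
            data.Example.fromlist(["another one", 1], fields)]
dataset = data.Dataset(examples, fields)
TEXT.build_vocab(dataset)

# Equivalent to getIter(dataset, batch_size=2, ...) with the default device filled in.
it = data.Iterator(dataset, batch_size=2, device='cpu',
                   train=False, sort=False, repeat=False, shuffle=False)
for batch in it:
    print(batch.text.shape, batch.label)  # each field becomes an attribute on the Batch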
    skip_header=True,
    fields=[('Text', TEXT), ('Label', LABEL)])
TEXT.build_vocab(train_data, vectors=vectors)
vocab_size = len(TEXT.vocab)
weight_matrix = TEXT.vocab.vectors

train_iter, valid_iter = data.BucketIterator.splits(
    (train_data, valid_data), batch_size=batch_size, shuffle=True,
    device=device, sort_key=lambda x: len(x.Text))
test_iter = data.Iterator(test_data, batch_size=batch_size, shuffle=False,
                          device=device, sort=False, repeat=False)


def evaluate_accuracy(data_iter, net):
    acc_sum, valid_loss, n = 0.0, 0.0, 0
    valid_batch_num = 0
    net.eval()
    for context in data_iter:
        valid_batch_num += 1
        X = context.Text
        X = X.to(device).long()
        y = context.Label
        y = y.to(device).long()
        y_hat = net(X)
def load_data(option):
    # ======
    Text_filed = data.Field(sequential=True, fix_length=option.max_text_len)
    Label_field = data.Field(sequential=False, use_vocab=False)

    # ======
    train_path = option.data_path + option.text_type + '/train_set.csv'
    val_path = option.data_path + option.text_type + '/val_set.csv'
    test_path = option.data_path + option.text_type + '/test_set.csv'

    if option.aug:
        print('make augmentation datasets!')
    train = buildDataset(train_path, text_field=Text_filed, label_field=Label_field,
                         text_type=option.text_type, test=False, aug=option.aug)
    val = buildDataset(val_path, text_field=Text_filed, label_field=Label_field,
                       text_type=option.text_type, test=False)
    test = buildDataset(test_path, text_field=Text_filed, label_field=None,
                        text_type=option.text_type, test=True)

    # ======
    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    embedding_path = '{}/{}_{}_.txt'.format(option.embedding_path, option.text_type, option.emb_size)
    print('embedding_path:', embedding_path)
    #
    vectors = Vectors(name=embedding_path, cache=cache)
    print('load word2vec vectors from {}'.format(embedding_path))
    # To control how missing vector entries are initialized, set vector.unk_init
    # (e.g. init.xavier_uniform) before passing the vectors to build_vocab.
    vectors.unk_init = init.xavier_uniform_

    # ====== build vocab
    print('building {} vocabulary......'.format(option.text_type))
    Text_filed.build_vocab(train, val, test, min_freq=option.min_freq, vectors=vectors)
    print('vocabulary has been made!\n')

    # ====== build iterators
    '''
    1. For test_iter, shuffle, sort and repeat must all be False, otherwise torchtext scrambles the sample order.
    2. For variable-length input sequences, set sort_within_batch=True so that each batch is sorted in descending order by sort_key.
    '''
    print('building {} Iterator......'.format(option.text_type))
    train_iter = data.BucketIterator(dataset=train, batch_size=option.batch_size, shuffle=True,
                                     sort_within_batch=False, repeat=False, device=option.device)
    val_iter = data.Iterator(dataset=val, batch_size=option.batch_size, shuffle=False,
                             sort=False, repeat=False, device=option.device)
    test_iter = data.Iterator(dataset=test, batch_size=option.batch_size, shuffle=False,
                              sort=False, repeat=False, device=option.device)
    print('Iterator has been made!\n')

    return train_iter, val_iter, test_iter, len(Text_filed.vocab), Text_filed.vocab.vectors
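# Illustrative sketch only: a hypothetical driver for load_data above. Every attribute
# placed on `option` is an assumption about what the surrounding project passes in, and
# the batch attribute names depend on how buildDataset names its fields.
from argparse import Namespace

option = Namespace(max_text_len=100, data_path='./data/', text_type='word', aug=False,
                   embedding_path='./emb', emb_size=300, min_freq=5,
                   batch_size=64, device=-1)  # device=-1 selects the CPU in the legacy API
train_iter, val_iter, test_iter, vocab_size, vectors = load_data(option)

for batch in train_iter:
    pass  # each dataset field becomes an attribute on the batch, e.g. batch.text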