def dataset_initialize(self):
    print("-- Preparing Data --")
    self.transform = [
        transforms.Resize((256, 256), Image.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ]
    self.train_data_loader = dataset(root_dir=self.args.DATA.data_path,
                                     dataset=self.args.DATA.dataset,
                                     mode='train',
                                     direction=self.args.DATA.direction,
                                     transform=self.transform)
    self.val_data_loader = dataset(root_dir=self.args.DATA.data_path,
                                   dataset=self.args.DATA.dataset,
                                   mode='val',
                                   direction=self.args.DATA.direction,
                                   transform=self.transform)
    self.train_loader = DataLoader(self.train_data_loader,
                                   batch_size=self.args.DATA.batch_size,
                                   shuffle=True,
                                   num_workers=self.args.SETTINGS.num_workers,
                                   drop_last=True)
    self.val_loader = DataLoader(self.val_data_loader,
                                 batch_size=self.args.DATA.batch_size,
                                 shuffle=True,
                                 num_workers=self.args.SETTINGS.num_workers,
                                 drop_last=True)
    print("-- Dataset DONE --")
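# Usage sketch, not from the source: the transform above is passed to the
# custom `dataset` class as a plain list, so that class presumably wraps it
# in transforms.Compose itself. A minimal, self-contained illustration of
# what the composed pipeline would do to one (dummy) image:
from PIL import Image
from torchvision import transforms

transform = transforms.Compose([
    transforms.Resize((256, 256), Image.BICUBIC),
    transforms.ToTensor(),
    # maps each channel from [0, 1] to [-1, 1]
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
img = transform(Image.new('RGB', (512, 512)))  # dummy black image as a stand-in
print(img.shape)  # torch.Size([3, 256, 256]), values in [-1, 1]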
def make_data(text, title, authors, label):
    text_data = []
    # body text: one entry per document field
    for each_x in text:
        text_data.append(make_text_data1(each_x['text_file'], each_x['text_dict'],
                                         each_x['doc_len'], each_x['text_len']))
    text_data.append(
        make_text_data2(title['text_file'], title['text_dict'], title['text_len']))
    text_data.append(
        make_text_data2(authors['text_file'], authors['text_dict'],
                        authors['text_len'], sep=','))
    label_data = make_label_data(label['label_file'], label['label_dict'])
    return dataset(text_data, label_data)
def makeData(srcFile, tgtFile, srcDicts, tgtDicts, sort=False, char=False):
    src, tgt = [], []
    raw_src, raw_tgt = [], []
    sizes = []
    count, ignored = 0, 0
    print('Processing %s & %s ...' % (srcFile, tgtFile))
    srcF = open(srcFile)
    tgtF = open(tgtFile)
    while True:
        sline = srcF.readline()
        tline = tgtF.readline()
        # normal end of file
        if sline == "" and tline == "":
            break
        # source or target does not have same number of lines
        if sline == "" or tline == "":
            print('WARNING: source and target do not have the same number of sentences')
            break
        sline = sline.strip()
        tline = tline.strip()
        # source and/or target are empty
        if sline == "" or tline == "":
            print('WARNING: ignoring an empty line (' + str(count + 1) + ')')
            ignored += 1
            continue
        if opt.lower:
            sline = sline.lower()
            tline = tline.lower()
        srcWords = sline.split()
        tgtWords = tline.split()
        # length filter disabled:
        # if opt.src_length == 0 or (len(srcWords) <= opt.src_length and len(tgtWords) <= opt.tgt_length):
        if char:
            srcWords = [word + " " for word in srcWords]
            tgtWords = list(" ".join(tgtWords))
        else:
            srcWords = [word + " " for word in srcWords]
            tgtWords = [word + " " for word in tgtWords]
        src += [srcDicts.convertToIdx(srcWords, dict.UNK_WORD)]
        tgt += [tgtDicts.convertToIdx(tgtWords, dict.UNK_WORD,
                                      dict.BOS_WORD, dict.EOS_WORD)]
        raw_src += [srcWords]
        raw_tgt += [tgtWords]
        sizes += [len(srcWords)]
        # else:
        #     ignored += 1  # belonged to the disabled length filter above
        count += 1
        if count % opt.report_every == 0:
            print('... %d sentences prepared' % count)
    srcF.close()
    tgtF.close()
    if opt.shuffle == 1:
        print('... shuffling sentences')
        perm = torch.randperm(len(src))
        src = [src[idx] for idx in perm]
        tgt = [tgt[idx] for idx in perm]
        sizes = [sizes[idx] for idx in perm]
        raw_src = [raw_src[idx] for idx in perm]
        raw_tgt = [raw_tgt[idx] for idx in perm]
    if sort:
        print('... sorting sentences by size')
        _, perm = torch.sort(torch.Tensor(sizes))
        src = [src[idx] for idx in perm]
        tgt = [tgt[idx] for idx in perm]
        raw_src = [raw_src[idx] for idx in perm]
        raw_tgt = [raw_tgt[idx] for idx in perm]
    print('Prepared %d sentences (%d ignored due to length == 0 or > %d)' %
          (len(src), ignored, opt.src_length))
    return dataset(src, tgt, raw_src, raw_tgt)
def makeData(srcFile, tgtFile, srcDicts, tgtDicts, sort=False, char=False):
    '''
    Convert the words of the language files to index values using the dictionaries.
    Inputs:
        srcFile: str, filename of the source-language file
        tgtFile: str, filename of the target-language file
        srcDicts: the prebuilt source-language dictionary
        tgtDicts: the prebuilt target-language dictionary
        sort: bool, whether to sort sentences by length
        char: bool, whether to tokenize at the character level
    Return:
        dataset: the wrapped dataset, holding src, tgt, raw_src, raw_tgt
            src: list, one torch.LongTensor of indices per sample
            tgt: list, one torch.LongTensor of indices per sample
            raw_src: list of lists, the words of each sample
            raw_tgt: list of lists, the words of each sample
    '''
    src, tgt = [], []  # one list of indices per line
    raw_src, raw_tgt = [], []  # one list of words per line
    sizes = []  # number of words in each sample
    count, ignored = 0, 0
    print('Processing %s & %s ...' % (srcFile, tgtFile))
    srcF = open(srcFile)
    tgtF = open(tgtFile)
    while True:
        # read one line at a time
        sline = srcF.readline()
        tline = tgtF.readline()
        # normal end of file
        if sline == "" and tline == "":
            break
        # source or target does not have same number of lines
        if sline == "" or tline == "":
            print('WARNING: source and target do not have the same number of sentences')
            break
        sline = sline.strip()
        tline = tline.strip()
        # source and/or target are empty
        if sline == "" or tline == "":
            print('WARNING: ignoring an empty line (' + str(count + 1) + ')')
            ignored += 1
            continue
        if opt.lower:
            sline = sline.lower()
            tline = tline.lower()
        srcWords = sline.split()
        tgtWords = tline.split()
        # length filter disabled; the source side is truncated to 300 words instead
        # if opt.src_length == 0 or (len(srcWords) <= opt.src_length and len(tgtWords) <= opt.tgt_length):
        if char:
            srcWords = [word + " " for word in srcWords]
            tgtWords = list(" ".join(tgtWords))
        else:
            srcWords = [word + " " for word in srcWords[:300]]
            tgtWords = [word + " " for word in tgtWords]
        src += [srcDicts.convertToIdx(srcWords, dict.UNK_WORD)]  # src needs no BOS/EOS
        tgt += [tgtDicts.convertToIdx(tgtWords, dict.UNK_WORD,
                                      dict.BOS_WORD, dict.EOS_WORD)]
        raw_src += [srcWords]
        raw_tgt += [tgtWords]
        sizes += [len(srcWords)]  # word count of each src line
        # else:
        #     ignored += 1  # belonged to the disabled length filter above
        count += 1
        if count % opt.report_every == 0:
            print('... %d sentences prepared' % count)
    srcF.close()
    tgtF.close()
    if opt.shuffle == 1:
        print('... shuffling sentences')
        perm = torch.randperm(len(src))
        src = [src[idx] for idx in perm]
        tgt = [tgt[idx] for idx in perm]
        sizes = [sizes[idx] for idx in perm]
        raw_src = [raw_src[idx] for idx in perm]
        raw_tgt = [raw_tgt[idx] for idx in perm]
    if sort:
        print('... sorting sentences by size')
        _, perm = torch.sort(torch.Tensor(sizes))
        src = [src[idx] for idx in perm]
        tgt = [tgt[idx] for idx in perm]
        raw_src = [raw_src[idx] for idx in perm]
        raw_tgt = [raw_tgt[idx] for idx in perm]
    print('Prepared %d sentences (%d ignored due to length == 0 or > %d)' %
          (len(src), ignored, opt.src_length))
    return dataset(src, tgt, raw_src, raw_tgt)  # wrap into a dataset
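# A minimal sketch of what convertToIdx is assumed to do in the makeData
# variants above (modeled on the OpenNMT-style Dict class; the real method
# also returns a torch.LongTensor, and here the vocabulary keys would carry
# the trailing space the code appends to each word -- both simplified away):
UNK_WORD, BOS_WORD, EOS_WORD = '<unk>', '<s>', '</s>'

def convert_to_idx(words, vocab, unk=UNK_WORD, bos=None, eos=None):
    # map each word to its index, falling back to the <unk> index;
    # optionally bracket the sequence with BOS/EOS, as done for tgt above
    seq = ([bos] if bos else []) + words + ([eos] if eos else [])
    return [vocab.get(w, vocab[unk]) for w in seq]

vocab = {UNK_WORD: 0, BOS_WORD: 1, EOS_WORD: 2, 'hello': 3, 'world': 4}
print(convert_to_idx(['hello', 'new'], vocab))                              # [3, 0]
print(convert_to_idx(['hello', 'new'], vocab, bos=BOS_WORD, eos=EOS_WORD))  # [1, 3, 0, 2]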
def makeData(srcFile, tgtFile, srcDicts, tgtDicts, sort=False, char=False):
    src, tgt, from_train = [], [], []
    raw_src, raw_tgt = [], []
    sizes = []
    count, ignored = 0, 0
    print('Processing %s & %s ...' % (srcFile, tgtFile))
    srcF = open(srcFile, encoding='utf-8')
    tgtF = open(tgtFile, encoding='utf-8')
    not_in_train_cnt = 0
    while True:
        sline = srcF.readline()
        tline = tgtF.readline()
        # normal end of file
        if sline == "" and tline == "":
            break
        # source or target does not have same number of lines
        if sline == "" or tline == "":
            print('WARNING: source and target do not have the same number of sentences')
            break
        sline = sline.strip()
        tline = tline.strip()
        # source and/or target are empty
        if sline == "" or tline == "":
            print('WARNING: ignoring an empty line (' + str(count + 1) + ')')
            ignored += 1
            continue
        if opt.lower:
            sline = sline.lower()
            tline = tline.lower()
        srcWords = sline.split()
        tgtWords = tline.split()
        # sentence lengths within the limits
        if opt.src_length == 0 or (len(srcWords) <= opt.src_length
                                   and len(tgtWords) <= opt.tgt_length):
            if char:
                srcWords = [word + " " for word in srcWords]
                tgtWords = list(" ".join(tgtWords))
            else:
                srcWords = [word + " " for word in srcWords]
                tgtWords = [word + " " for word in tgtWords]
            # the sentence as ids
            src += [srcDicts.convertToIdx(srcWords, dict.UNK_WORD)]
            # the target sentence gets BOS and EOS added
            tgt += [tgtDicts.convertToIdx(tgtWords, dict.UNK_WORD,
                                          dict.BOS_WORD, dict.EOS_WORD)]
            # the raw sentences (still lowercased from above)
            raw_src += [srcWords]
            raw_tgt += [tgtWords]
            sizes += [len(srcWords)]
            # flag samples whose target contains a held-out label or <unk>
            in_train = True
            for word in new_label + ['<unk>']:
                if word in map(lambda x: x.strip(), tgtWords):
                    in_train = False
                    not_in_train_cnt += 1
            from_train += [[in_train]]
        else:
            ignored += 1
        count += 1
        if count % opt.report_every == 0:
            print('... %d sentences prepared' % count)
    srcF.close()
    tgtF.close()
    print(not_in_train_cnt)
    if opt.shuffle == 1:
        print('... shuffling sentences')
        perm = torch.randperm(len(src))
        src = [src[idx] for idx in perm]
        tgt = [tgt[idx] for idx in perm]
        sizes = [sizes[idx] for idx in perm]
        raw_src = [raw_src[idx] for idx in perm]
        raw_tgt = [raw_tgt[idx] for idx in perm]
        from_train = [from_train[idx] for idx in perm]
    if sort:
        print('... sorting sentences by size')
        _, perm = torch.sort(torch.Tensor(sizes))
        src = [src[idx] for idx in perm]
        tgt = [tgt[idx] for idx in perm]
        raw_src = [raw_src[idx] for idx in perm]
        raw_tgt = [raw_tgt[idx] for idx in perm]
        from_train = [from_train[idx] for idx in perm]
    print('Prepared %d sentences (%d ignored due to length == 0 or > %d)' %
          (len(src), ignored, opt.src_length))
    # sanity check: hard-coded ids of the held-out labels plus <unk>
    new_id = [57, 58, 59, 60, 56, 1]
    for i in range(len(from_train)):
        flag = 0
        for label in tgt[i]:
            if label in new_id:
                flag = 1
        if flag:
            assert from_train[i][0] is not True
        else:
            assert from_train[i][0] is True
    dtst = dataset(src, tgt, raw_src, raw_tgt, from_train)
    print('len tgt {}, known {}'.format(len(dtst.tgt), len(dtst.from_known)))
    for i in range(len(dtst.from_known)):
        flag = 0
        for label in dtst.tgt[i]:
            if label in new_id:
                flag = 1
        if flag:
            print(i, dtst.tgt[i], dtst.from_known[i][0])
            assert dtst.from_known[i][0] is not True
        else:
            print(i, dtst.tgt[i], dtst.from_known[i][0])
            # hint: found that train has only 10534 samples
            assert dtst.from_known[i][0] is True
    # import time
    # time.sleep(10)
    return dtst
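# Sketch of the from_train flag computed above, assuming new_label is a
# module-level list of label strings held out from training (the names below
# are hypothetical): a sample counts as "from train" only if its target
# contains none of the held-out labels and no <unk>.
new_label = ['label_x', 'label_y']  # hypothetical held-out labels

def is_from_train(tgt_words):
    stripped = [w.strip() for w in tgt_words]  # words carry a trailing space
    return not any(w in stripped for w in new_label + ['<unk>'])

print(is_from_train(['a ', 'b ']))        # True
print(is_from_train(['a ', 'label_x ']))  # False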
def test(args):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    transform = [transforms.Resize((256, 256), Image.BICUBIC),
                 transforms.ToTensor(),
                 transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    test_data_loader = dataset(root_dir=args.DATA.data_path,
                               dataset=args.DATA.dataset,
                               mode='test',
                               direction=args.DATA.direction,
                               transform=transform)
    test_loader = DataLoader(test_data_loader, batch_size=1, shuffle=True,
                             num_workers=4)

    # Define models
    generator = Generator(args.MODEL.ngf, args.MODEL.input_nc,
                          args.MODEL.output_nc)

    # Load weights
    checkpoint = args.EVALUATION.evaluation_path
    checkpoint = os.path.join(checkpoint, "best.pth.tar")
    load(generator, checkpoint)
    generator.to(device)

    L1_loss = torch.nn.L1Loss()
    t_loss = 0
    generator.eval()
    for i, batch in enumerate(test_loader):
        input = batch['input'].to(device)
        target = batch['target'].to(device)

        # Inference: run the generator without tracking gradients
        with torch.no_grad():
            output = generator(input)
        test_loss = L1_loss(output, target)
        t_loss += test_loss

        if args.EVALUATION.plot:
            # Display input/output side by side
            input = input.detach().cpu().numpy()
            input = input.squeeze().transpose((1, 2, 0))
            output = output.detach().cpu().numpy()
            output = output.squeeze().transpose((1, 2, 0))
            fig = plt.figure()
            ax1 = fig.add_subplot(1, 2, 1)
            ax1.set_title('input')
            ax1.imshow(input)
            ax2 = fig.add_subplot(1, 2, 2)
            ax2.set_title('output')
            ax2.imshow(output)
            plt.pause(1)
        print("Iter:", i, "L1 loss:", test_loss.item())
    print("Final Test Loss: ", t_loss.item() / len(test_loader))
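# Side note, not in the source: the arrays plotted above are still normalized
# to [-1, 1] (Normalize with mean 0.5, std 0.5 per channel), and plt.imshow
# clips float RGB data to [0, 1], so the displayed images lose half the range.
# A common fix is to undo the normalization before plotting:
def denormalize(img):
    # inverse of Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)): back to [0, 1]
    return img * 0.5 + 0.5

# e.g. ax2.imshow(denormalize(output)) instead of ax2.imshow(output)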
def main(args):
    train_loader, test_loader = dataset()
    model_obj = STNTrain(train_loader, test_loader, learning_rate=0.001)
    model_obj.train(25, args.checkpoint_path, args.export_path)
def makeData(qFile, rFile, Dicts):
    qidx, ridx = [], []
    print('Processing %s & %s ...' % (qFile, rFile))
    qF = open(qFile, 'r', encoding='utf-8')
    aF = open(rFile, 'r', encoding='utf-8')

    # Replies: convert each line to ids, then right-pad with 0 up to
    # opt.a_length or keep only the last opt.a_length ids.
    for aline in aF.readlines():
        aline = aline.strip()
        aWords = [word + " " for word in aline.split()]
        aWords = Dicts.convertToIdx(aWords, dict.UNK_WORD)
        if len(aWords) < opt.a_length:
            aWords.extend([0] * (opt.a_length - len(aWords)))
        else:
            aWords = aWords[-opt.a_length:]
        ridx.append(aWords)

    # Questions: utterances are grouped into sessions delimited by an 'END'
    # line; each utterance is padded/cut to opt.q_length ids and each session
    # to opt.utter_length utterances.
    qSession = []
    for line in qF.readlines():
        qline = line.strip()
        if qline != 'END':
            qWords = [word + " " for word in qline.split()]
            qWords = Dicts.convertToIdx(qWords, dict.UNK_WORD)
            if len(qWords) < opt.q_length:
                qWords.extend([0] * (opt.q_length - len(qWords)))
            else:
                qWords = qWords[-opt.q_length:]
            qSession.append(qWords)
        else:
            if len(qSession) <= opt.utter_length:
                qSession.extend(
                    (opt.utter_length - len(qSession)) * [[0] * opt.q_length])
            else:
                qSession = qSession[-opt.utter_length:]
            qidx.append(qSession)
            qSession = []

    print(len(qidx), len(ridx))
    qF.close()
    aF.close()
    return dataset(torch.Tensor(qidx), torch.Tensor(ridx))
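# The reply and question loops above repeat the same pad-or-truncate pattern
# (left-truncate to keep the most recent tokens, right-pad with 0). A small
# sketch of that logic in isolation (names are illustrative, not from the source):
def pad_or_truncate(ids, length, pad_id=0):
    # keep the last `length` ids, or pad on the right up to `length`
    if len(ids) >= length:
        return ids[-length:]
    return ids + [pad_id] * (length - len(ids))

print(pad_or_truncate([5, 6, 7], 5))           # [5, 6, 7, 0, 0]
print(pad_or_truncate([1, 2, 3, 4, 5, 6], 4))  # [3, 4, 5, 6]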
def makeData(srcFile, tgtFile, srcDicts, tgtDicts, sort=False):
    src, tgt = [], []
    raw_src, raw_tgt = [], []
    sizes = []
    count, ignored = 0, 0
    print('Processing %s & %s ...' % (srcFile, tgtFile))
    srcF = open(srcFile, encoding='utf8')
    tgtF = open(tgtFile, encoding='utf8')
    while True:
        sline = srcF.readline()
        tline = tgtF.readline()
        # normal end of file
        if sline == "" and tline == "":
            break
        # source or target does not have same number of lines
        if sline == "" or tline == "":
            print('WARNING: source and target do not have the same number of sentences')
            break
        sline = sline.strip()
        tline = tline.strip()
        # source and/or target are empty
        if sline == "" or tline == "":
            print('WARNING: ignoring an empty line (' + str(count + 1) + ')')
            ignored += 1
            continue
        sline = sline.lower()
        tline = tline.lower()
        srcWords = sline.split()
        tgtWords = tline.split()
        # truncate both sides if truncation lengths are configured
        if opt.trun_src > 0:
            srcWords = srcWords[:opt.trun_src]
        if opt.trun_tgt > 0:
            tgtWords = tgtWords[:opt.trun_tgt]
        srcWords = [word + " " for word in srcWords]
        tgtWords = [word + " " for word in tgtWords]
        src += [srcDicts.convertToIdx(srcWords, dict.UNK_WORD)]
        tgt += [tgtDicts.convertToIdx(tgtWords, dict.UNK_WORD,
                                      dict.BOS_WORD, dict.EOS_WORD)]
        raw_src += [srcWords]
        raw_tgt += [tgtWords]
        sizes += [len(srcWords)]
        count += 1
        if count % 1000 == 0:
            print('... %d sentences prepared' % count)
    srcF.close()
    tgtF.close()
    print('Prepared %d sentences (%d ignored due to length == 0)' %
          (len(src), ignored))
    return dataset(src, tgt, raw_src, raw_tgt)