def read_standard_data(self, path, sentences, debug_mode=False):
    label_classes = SNLIConfig.label_classes
    premise_data = []
    hypothesis_data = []
    labels = []
    if debug_mode:
        i = 320
        with open(path, 'r', encoding='utf-8') as file:
            for line in file:
                tokens = line.strip().split('\t')
                labels.append(label_classes[tokens[0].strip()])
                premise_data.append(sentences[int(tokens[1].strip())])
                hypothesis_data.append(sentences[int(tokens[2].strip())])
                i -= 1
                if i == 0:
                    break
        logging(f'loading data {len(premise_data)} from {path}')
        return premise_data, hypothesis_data, labels
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = line.strip().split('\t')
            labels.append(label_classes[tokens[0].strip()])
            premise_data.append(sentences[int(tokens[1].strip())])
            hypothesis_data.append(sentences[int(tokens[2].strip())])
    logging(f'loading data {len(premise_data)} from {path}')
    return premise_data, hypothesis_data, labels
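
# --- Usage sketch (illustration only, not part of the dataset class) ---
# read_standard_data above expects a tab-separated index file in which every line is
# "<label>\t<premise_sentence_idx>\t<hypothesis_sentence_idx>", with the two indices
# pointing into the pre-loaded `sentences` list. The label strings and integer ids below
# are assumptions for the demo; the real mapping lives in SNLIConfig.label_classes.
def _demo_snli_index_format():
    sentences = ['A man plays a guitar.', 'A person makes music.', 'Nobody plays anything.']
    label_classes = {'entailment': 0, 'neutral': 1, 'contradiction': 2}  # assumed mapping
    sample_lines = ['entailment\t0\t1', 'contradiction\t0\t2']
    for line in sample_lines:
        label, pre_idx, hypo_idx = (t.strip() for t in line.split('\t'))
        print(label_classes[label], '|', sentences[int(pre_idx)], '|', sentences[int(hypo_idx)])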
def data2tokens(self):
    logging(f'{self.path} in data2tokens')
    for sen in self.datas:
        tokens = self.tokenizer.tokenize(sen)[:self.sen_len - 1]
        self.data_tokens.append(['[CLS]'] + tokens)
        self.label_tokens.append(tokens + ['[SEP]'])
def token2seq(self, vocab: 'Vocab', maxlen: int):
    logging(f'{self.dataset_name} {self.dataset_path} is seq maxlen {maxlen}')
    if not self.using_bert:
        self.vocab = vocab
        self.maxlen = maxlen
        if self.dataset_name == 'SNLI':
            assert len(self.data_token['pre']) == len(self.data_token['hypo']) == len(self.labels)
            for tokens in self.data_token['pre']:
                s_len = min(len(tokens), maxlen)
                self.data_tensor['pre_len'].append(torch.tensor(s_len, dtype=torch.long))
                self.data_tensor['pre'].append(self.__encode_tokens(tokens))
            for tokens in self.data_token['hypo']:
                s_len = min(len(tokens), maxlen)
                self.data_tensor['hypo_len'].append(torch.tensor(s_len, dtype=torch.long))
                self.data_tensor['hypo'].append(self.__encode_tokens(tokens))
        else:
            for tokens in self.data_token:
                self.data_tensor.append(self.__encode_tokens(tokens))
    else:
        if self.dataset_name == 'SNLI':
            for i in tqdm(range(len(self.premise_data))):
                t = self.tokenizer(text=self.premise_data[i],
                                   text_pair=self.hypothesis_data[i],
                                   max_length=maxlen * 2,
                                   truncation=True,
                                   padding='max_length')
                self.data_token['comb'].append(torch.tensor(t['input_ids'], dtype=torch.long))
                self.data_types.append(torch.tensor(t['token_type_ids'], dtype=torch.long))
                self.data_masks.append(torch.tensor(t['attention_mask'], dtype=torch.long))
        else:
            for sen in tqdm(self.data):
                t = self.tokenizer(sen, max_length=maxlen, truncation=True, padding='max_length')
                self.data_token.append(torch.tensor(t['input_ids'], dtype=torch.long))
                self.data_types.append(torch.tensor(t['token_type_ids'], dtype=torch.long))
                self.data_masks.append(torch.tensor(t['attention_mask'], dtype=torch.long))
        self.data_tensor = self.data_token
    for label in self.labels:
        self.labels_tensor.append(torch.tensor(label))
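
# --- Illustration of the BatchEncoding fields consumed by the BERT branch of token2seq ---
# A minimal sketch, assuming a Hugging Face BertTokenizer (the checkpoint name
# 'bert-base-uncased' is an assumption, not taken from this repo's config). Encoding a
# premise/hypothesis pair with truncation and padding='max_length' yields the three lists
# that token2seq turns into tensors: input_ids, token_type_ids and attention_mask.
def _demo_bert_pair_encoding(maxlen=16):
    from transformers import BertTokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    t = tokenizer(text='A man plays a guitar.',
                  text_pair='A person makes music.',
                  max_length=maxlen * 2,
                  truncation=True,
                  padding='max_length')
    print(len(t['input_ids']), t['token_type_ids'], t['attention_mask'])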
def data2token(self):
    logging(f'{self.dataset_name} {self.dataset_path} is tokenizing')
    if self.using_bert:
        pass
    elif self.dataset_name == 'SNLI':
        with tqdm(total=len(self.premise_data) + len(self.hypothesis_data)) as pbar:
            for sen in self.premise_data:
                self.data_token['pre'].append(self.tokenizer(sen))
                pbar.update(1)
            for sen in self.hypothesis_data:
                self.data_token['hypo'].append(self.tokenizer(sen))
                pbar.update(1)
    else:
        for sen in tqdm(self.data):
            self.data_token.append(self.tokenizer(sen))
def token2idx(self):
    logging(f'{self.path} in token2idx')
    for tokens in self.data_tokens:
        self.data_idx.append(self.tokenizer.convert_tokens_to_ids(tokens))
        self.data_mask.append([1] * len(tokens))
    for tokens in self.label_tokens:
        self.label_idx.append(self.tokenizer.convert_tokens_to_ids(tokens))
    for i in range(len(self.data_idx)):
        if len(self.data_idx[i]) < self.sen_len:
            self.data_idx[i] += [0] * (self.sen_len - len(self.data_idx[i]))
            self.label_idx[i] += [0] * (self.sen_len - len(self.label_idx[i]))
            self.data_mask[i] += [0] * (self.sen_len - len(self.data_mask[i]))
def token2idx(self):
    logging(f'{self.path} in token2idx')
    for tokens in self.premise_data_tokens:
        self.premise_data_idx.append(self.tokenizer.convert_tokens_to_ids(tokens))
        self.premise_data_mask.append([1] * len(tokens))
    for tokens in self.premise_label_tokens:
        self.premise_label_idx.append(self.tokenizer.convert_tokens_to_ids(tokens))
    for i in range(len(self.premise_data_idx)):
        if len(self.premise_data_idx[i]) < self.sen_len:
            self.premise_data_idx[i] += [0] * (self.sen_len - len(self.premise_data_idx[i]))
            self.premise_label_idx[i] += [0] * (self.sen_len - len(self.premise_label_idx[i]))
            self.premise_data_mask[i] += [0] * (self.sen_len - len(self.premise_data_mask[i]))
    for tokens in self.hypothesis_data_tokens:
        self.hypothesis_data_idx.append(self.tokenizer.convert_tokens_to_ids(tokens))
        self.hypothesis_data_mask.append([1] * len(tokens))
    for tokens in self.hypothesis_label_tokens:
        self.hypothesis_label_idx.append(self.tokenizer.convert_tokens_to_ids(tokens))
    for i in range(len(self.hypothesis_data_idx)):
        if len(self.hypothesis_data_idx[i]) < self.sen_len:
            self.hypothesis_data_idx[i] += [0] * (self.sen_len - len(self.hypothesis_data_idx[i]))
            self.hypothesis_label_idx[i] += [0] * (self.sen_len - len(self.hypothesis_label_idx[i]))
            self.hypothesis_data_mask[i] += [0] * (self.sen_len - len(self.hypothesis_data_mask[i]))
    for i in range(len(self.premise_data_idx)):
        self.whole_sen.append(self.premise_data_idx[i] + self.hypothesis_data_idx[i])
        self.whole_mask.append(self.premise_data_mask[i] + self.hypothesis_data_mask[i])
        # token type ids: 0 for every premise position, 1 for every hypothesis position
        self.whole_type.append([0] * len(self.premise_data_idx[i]) +
                               [1] * len(self.hypothesis_data_idx[i]))
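
# --- Illustration of how whole_sen / whole_mask / whole_type line up (sketch only) ---
# The padded premise ids are followed by the padded hypothesis ids, so the token type ids
# are 0 for every premise position and 1 for every hypothesis position, mirroring BERT's
# segment-embedding convention. The ids below are made up; sen_len=4 is an assumption.
def _demo_pair_concatenation(sen_len=4):
    premise_idx = [101, 2158, 0, 0]        # already padded to sen_len
    hypothesis_idx = [2003, 2835, 102, 0]  # already padded to sen_len
    whole_sen = premise_idx + hypothesis_idx
    whole_type = [0] * len(premise_idx) + [1] * len(hypothesis_idx)
    assert len(whole_sen) == len(whole_type) == 2 * sen_len
    print(whole_sen, whole_type)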
def data2tokens(self):
    logging(f'{self.path} in data2tokens')
    for sen in self.premise_data:
        tokens = self.tokenizer.tokenize(sen)[:self.sen_len - 1]
        self.premise_data_tokens.append(['[CLS]'] + tokens)
        self.premise_label_tokens.append(tokens + ['[SEP]'])
    for sen in self.hypothesis_data:
        tokens = self.tokenizer.tokenize(sen)[:self.sen_len - 1]
        self.hypothesis_data_tokens.append(['[CLS]'] + tokens)
        self.hypothesis_label_tokens.append(tokens + ['[SEP]'])
def __init__(self, dataset_name, model_name, device: torch.device, is_load=True, vocab=None):
    assert dataset_name in baseline_config_dataset
    assert model_name in baseline_config_models_list
    self.dataset_name = dataset_name
    self.model_name = model_name
    self.dataset_config = baseline_config_dataset[self.dataset_name]
    self.device = device
    self.net = None
    self.mode_is_training = False
    if 'Bert' not in model_name:
        self.vocab = vocab if vocab else self.__build_vocab()
    else:
        self.vocab = None
    is_entailment = dataset_name == 'SNLI'
    if model_name in {'LSTM', 'LSTM_E'}:
        self.net = self.__build_LSTM(is_bid=False, is_load=is_load, is_entailment=is_entailment)
    elif 'BidLSTM' in model_name:
        self.net = self.__build_LSTM(is_bid=True, is_load=is_load, is_entailment=is_entailment)
    elif 'TextCNN' in model_name:
        self.net = self.__build_TextCNN(is_load=is_load, is_entailment=is_entailment)
    elif 'Bert' in model_name:
        self.net = self.__build_Bert(is_load=is_load, is_entailment=is_entailment)
    self.net.to(device)
    self.net.eval()
    logging(f'is_load {is_load} loading baseline {self.dataset_name} {self.model_name}')
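
# --- Hypothetical usage sketch of the constructor above ---
# The enclosing class name is not shown in this excerpt, so `BaselineModelBuilder` below is
# a placeholder; 'IMDB' and 'BidLSTM' are example names assumed to appear in
# baseline_config_dataset and baseline_config_models_list.
#
#   device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#   builder = BaselineModelBuilder('IMDB', 'BidLSTM', device, is_load=True)
#   builder.net   # the wrapped nn.Module, already moved to `device` and set to eval mode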
def read_standard_data(self, path, debug_mode=False):
    path_list = []
    logging(f'start loading data from {path}')
    dirs = os.listdir(path)
    for dir in dirs:
        if dir == 'pos' or dir == 'neg':
            file_list = os.listdir(os.path.join(path, dir))
            file_list = map(lambda x: os.path.join(path, dir, x), file_list)
            path_list += list(file_list)
    datas = []
    labels = []
    if debug_mode:
        # debug mode: read at most 320 examples
        i = 320
        for p in path_list:
            label = 0 if 'neg' in p else 1
            with open(p, 'r', encoding='utf-8') as file:
                datas.append(file.readline())
                labels.append(label)
            i -= 1
            if i == 0:
                break
        logging(f'loading data {len(datas)} from {path}')
        return datas, labels
    for p in path_list:
        label = 0 if 'neg' in p else 1
        with open(p, 'r', encoding='utf-8') as file:
            datas.append(file.readline())
            labels.append(label)
    logging(f'loading data {len(datas)} from {path}')
    return datas, labels
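
# --- Expected directory layout and label rule (illustration only) ---
# read_standard_data above scans only the 'pos' and 'neg' sub-directories of `path`,
# reads one review per file, and derives the label from the file path. The example paths
# below follow the style of the aclImdb release; only the pos/neg folder names matter.
def _demo_imdb_label_rule():
    for p in ['aclImdb/train/pos/0_10.txt', 'aclImdb/train/neg/0_2.txt']:  # example paths
        print(p, '->', 0 if 'neg' in p else 1)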
def read_standard_data(self, path, debug_mode=False):
    data = []
    labels = []
    if debug_mode:
        i = 320
        with open(path, 'r', encoding='utf-8') as file:
            for line in file:
                i -= 1
                line = line.strip('\n')
                data.append(line[:-1])
                labels.append(int(line[-1]))
                if i == 0:
                    break
        logging(f'loading data {len(data)} from {path}')
        return data, labels
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip('\n')
            data.append(line[:-1])
            labels.append(int(line[-1]))
    logging(f'loading data {len(data)} from {path}')
    return data, labels
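
# --- Line format illustration (sketch only) ---
# Each line in the file is the raw sentence immediately followed by a single digit class
# label; read_standard_data above splits them by slicing off the last character. The sample
# lines below are made up to show the convention.
def _demo_trailing_label_format():
    for line in ['this movie was great1\n', 'terrible plot and acting0\n']:
        line = line.strip('\n')
        print(repr(line[:-1]), '->', int(line[-1]))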
def __build_vocab(self):
    logging(f'{self.dataset_name} {self.model_name} is building vocab')
    train_data_path = self.dataset_config.train_data_path
    train_dataset = baseline_MyDataset(self.dataset_name, train_data_path, is_to_tokens=True)
    if self.dataset_name == 'SNLI':
        return baseline_Vocab(
            train_dataset.data_token['pre'] + train_dataset.data_token['hypo'],
            is_using_pretrained=True,
            is_special=True,
            vocab_limit_size=self.dataset_config.vocab_limit_size,
            word_vec_file_path=self.dataset_config.pretrained_word_vectors_path)
    return baseline_Vocab(
        train_dataset.data_token,
        is_using_pretrained=True,
        is_special=False,
        vocab_limit_size=self.dataset_config.vocab_limit_size,
        word_vec_file_path=self.dataset_config.pretrained_word_vectors_path)
def main():
    best_path = baseline_config_model_load_path[dataset_name].get(model_name)
    best_state = None
    # when resuming from a checkpoint, recover its accuracy from the file name
    best_acc = 0.0 if not is_load_model else float(re.findall(r"_\d\.\d+_", best_path)[0][1:-1])
    save_acc_limit = args.save_acc_limit
    epoch = args.epoch
    for ep in range(epoch):
        logging(f'epoch {ep} start train')
        train_loss = train()
        logging(f'epoch {ep} start evaluate')
        evaluate_loss, acc = evaluate()
        if acc > best_acc:
            best_acc = acc
            best_path = baseline_config_model_save_path_format.format(
                dataset_name, model_name, acc, get_time(), note)
            best_state = copy.deepcopy(model.net.state_dict())
        if epoch > 3 and (ep + 1) % (epoch // 3) == 0 \
                and best_acc > save_acc_limit and best_state is not None:
            logging(f'saving best model acc {best_acc:.5f} in {best_path}')
            torch.save(best_state, best_path)
            best_state = None
        warmup_scheduler.step(ep + 1)
        scheduler.step(evaluate_loss, ep + 1)
        logging(f'epoch {ep} done! train_loss {train_loss:.5f} evaluate_loss {evaluate_loss:.5f} \n'
                f'acc {acc:.5f} now best_acc {best_acc:.5f}')
    if best_acc > save_acc_limit and best_state is not None:
        logging(f'saving best model acc {best_acc:.5f} in {best_path}')
        torch.save(best_state, best_path)
        best_state = None
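
# --- Illustration of the accuracy-from-filename convention used in main() (sketch only) ---
# When resuming, main() recovers best_acc from the checkpoint path, which is expected to
# embed the accuracy between underscores. The file name below is invented for the demo; the
# real pattern comes from baseline_config_model_save_path_format.
def _demo_acc_from_checkpoint_name():
    import re
    best_path = 'IMDB_LSTM_0.89123_2021-01-01_12:00:00_.pt'  # hypothetical file name
    acc = float(re.findall(r"_\d\.\d+_", best_path)[0][1:-1])
    print(acc)  # 0.89123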
parser.add_argument('--epoch', type=int, default=20)
parser.add_argument('--batch', type=int, default=64)
parser.add_argument('--lr', type=float, default=1e-3)
parser.add_argument('--note', type=str, default='')
parser.add_argument('--load_model', choices=[True, False], default='no', type=parse_bool)
parser.add_argument('--cuda', type=str, default='3')
parser.add_argument('--skip_loss', type=float, default=0.15)
parser.add_argument('--only_evaluate', type=parse_bool, default='no')
parser.add_argument('--scratch', type=parse_bool, default='no')
args = parser.parse_args()

if baseline_debug_mode:
    logging('the debug mode is on!!!')
    time.sleep(1)

dataset_name = args.dataset
dataset_config = baseline_config_dataset[dataset_name]
batch = args.batch if not baseline_debug_mode else 4
lr = args.lr
note = args.note
is_load_model = args.load_model
model_name = args.model
using_bert = model_name in {'Bert', 'Bert_E'}
if args.cuda == 'cpu':
    device = torch.device('cpu')
else:
    device = torch.device('cuda:' + args.cuda)
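
# --- parse_bool is referenced above but not defined in this excerpt ---
# A minimal sketch of a compatible helper, assuming it maps yes/no style strings to booleans
# (the accepted spellings here are guesses, not the repo's actual implementation).
def _parse_bool_sketch(value: str) -> bool:
    return str(value).strip().lower() in {'yes', 'y', 'true', '1'}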