Example #1
 def read_standard_data(self, path, sentences, debug_mode=False):
     label_classes = SNLIConfig.label_classes
     premise_data = []
     hypothesis_data = []
     labels = []
     if debug_mode:
         i = 320
         with open(path, 'r', encoding='utf-8') as file:
             for line in file:
                 tokens = line.strip().split('\t')
                 labels.append(label_classes[tokens[0].strip()])
                 premise_data.append(sentences[int(tokens[1].strip())])
                 hypothesis_data.append(sentences[int(tokens[2].strip())])
                 i -= 1
                 if i == 0:
                     break
         logging(f'loading data {len(premise_data)} from {path}')
         return premise_data, hypothesis_data, labels
     with open(path, 'r', encoding='utf-8') as file:
         for line in file:
             tokens = line.strip().split('\t')
             labels.append(label_classes[tokens[0].strip()])
             premise_data.append(sentences[int(tokens[1].strip())])
             hypothesis_data.append(sentences[int(tokens[2].strip())])
     logging(f'loading data {len(premise_data)} from {path}')
     return premise_data, hypothesis_data, labels
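
The loader above assumes a tab-separated file in which each line holds a label string followed by two indices into a pre-loaded sentence list. A self-contained sketch of how one such line is decomposed; the label mapping stands in for SNLIConfig.label_classes, which is not shown in these excerpts:

    # illustrative only: the label mapping and sentence list are assumptions
    label_classes = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
    sentences = ['A man plays guitar.', 'Someone is making music.']
    line = 'entailment\t0\t1'
    tokens = line.strip().split('\t')
    label = label_classes[tokens[0].strip()]          # 0
    premise = sentences[int(tokens[1].strip())]       # 'A man plays guitar.'
    hypothesis = sentences[int(tokens[2].strip())]    # 'Someone is making music.'
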
Example #2
    def data2tokens(self):
        logging(f'{self.path} in data2tokens')
        for sen in self.datas:
            data_tokens = ['[CLS]']
            data_tokens += self.tokenizer.tokenize(sen)[:self.sen_len - 1]
            label_tokens = self.tokenizer.tokenize(sen)[:self.sen_len - 1]
            label_tokens += ['[SEP]']
            self.data_tokens.append(data_tokens)
            self.label_tokens.append(label_tokens)
    def token2seq(self, vocab: 'Vocab', maxlen: int):
        logging(
            f'{self.dataset_name} {self.dataset_path} is seq maxlen {maxlen}')
        if not self.using_bert:
            self.vocab = vocab
            self.maxlen = maxlen
            if self.dataset_name == 'SNLI':
                assert len(self.data_token['pre']) == len(
                    self.data_token['hypo']) == len(self.labels)
                for tokens in self.data_token['pre']:
                    s_len = min(len(tokens), maxlen)
                    self.data_tensor['pre_len'].append(
                        torch.tensor(s_len, dtype=torch.long))
                    self.data_tensor['pre'].append(
                        self.__encode_tokens(tokens))
                for tokens in self.data_token['hypo']:
                    s_len = min(len(tokens), maxlen)
                    self.data_tensor['hypo_len'].append(
                        torch.tensor(s_len, dtype=torch.long))
                    self.data_tensor['hypo'].append(
                        self.__encode_tokens(tokens))
            else:
                for tokens in self.data_token:
                    self.data_tensor.append(self.__encode_tokens(tokens))
        else:
            if self.dataset_name == 'SNLI':
                for i in tqdm(range(len(self.premise_data))):
                    t = self.tokenizer(text=self.premise_data[i],
                                       text_pair=self.hypothesis_data[i],
                                       max_length=maxlen * 2,
                                       truncation=True,
                                       padding='max_length')
                    self.data_token['comb'].append(
                        torch.tensor(t['input_ids'], dtype=torch.long))
                    self.data_types.append(
                        torch.tensor(t['token_type_ids'], dtype=torch.long))
                    self.data_masks.append(
                        torch.tensor(t['attention_mask'], dtype=torch.long))
            else:
                for sen in tqdm(self.data):
                    t = self.tokenizer(sen,
                                       max_length=maxlen,
                                       truncation=True,
                                       padding='max_length')
                    self.data_token.append(
                        torch.tensor(t['input_ids'], dtype=torch.long))
                    self.data_types.append(
                        torch.tensor(t['token_type_ids'], dtype=torch.long))
                    self.data_masks.append(
                        torch.tensor(t['attention_mask'], dtype=torch.long))
                self.data_tensor = self.data_token

        for label in self.labels:
            self.labels_tensor.append(torch.tensor(label))
    def data2token(self):
        logging(f'{self.dataset_name} {self.dataset_path} is tokenizing')
        if self.using_bert:
            pass
        elif self.dataset_name == 'SNLI':
            with tqdm(total=len(self.premise_data) +
                      len(self.hypothesis_data)) as pbar:
                for sen in self.premise_data:
                    self.data_token['pre'].append(self.tokenizer(sen))
                    pbar.update(1)
                for sen in self.hypothesis_data:
                    self.data_token['hypo'].append(self.tokenizer(sen))
                    pbar.update(1)
        else:
            for sen in tqdm(self.data):
                self.data_token.append(self.tokenizer(sen))
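
Taken together, data2token fills data_token with word-level tokens and token2seq then converts them into fixed-length index tensors plus true lengths (or hands the work to the Hugging Face tokenizer in the BERT branch). __encode_tokens is not shown; a self-contained sketch of the effect it presumably has on one premise, with a toy vocabulary standing in for the Vocab object:

    # toy illustration; the word-to-index mapping and padding id are assumptions
    import torch

    maxlen = 5
    word2idx = {'<pad>': 0, 'a': 1, 'man': 2, 'plays': 3, 'guitar': 4}
    tokens = ['a', 'man', 'plays', 'guitar']

    s_len = min(len(tokens), maxlen)
    ids = [word2idx[w] for w in tokens][:maxlen]
    ids += [word2idx['<pad>']] * (maxlen - len(ids))   # pad up to maxlen

    pre = torch.tensor(ids, dtype=torch.long)          # tensor([1, 2, 3, 4, 0])
    pre_len = torch.tensor(s_len, dtype=torch.long)    # tensor(4)
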
Example #5
    def token2idx(self):
        logging(f'{self.path} in token2idx')
        for tokens in self.data_tokens:
            self.data_idx.append(self.tokenizer.convert_tokens_to_ids(tokens))
            self.data_mask.append([1] * len(tokens))

        for tokens in self.label_tokens:
            self.label_idx.append(self.tokenizer.convert_tokens_to_ids(tokens))

        for i in range(len(self.data_idx)):
            if len(self.data_idx[i]) < self.sen_len:
                self.data_idx[i] += [0] * (self.sen_len - len(self.data_idx[i]))
                self.label_idx[i] += [0] * (self.sen_len - len(self.label_idx[i]))
                self.data_mask[i] += [0] * (self.sen_len - len(self.data_mask[i]))
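
Because data2tokens already truncates every sequence to at most sen_len tokens, the padding loop above leaves every index, label, and mask row exactly sen_len wide, with zeros marking the padded tail. A self-contained illustration with made-up ids:

    # made-up ids; sen_len = 6 is an assumption for the illustration
    sen_len = 6
    data_idx = [101, 1037, 2158, 102]
    data_mask = [1] * len(data_idx)
    if len(data_idx) < sen_len:
        data_idx += [0] * (sen_len - len(data_idx))
        data_mask += [0] * (sen_len - len(data_mask))
    # data_idx  -> [101, 1037, 2158, 102, 0, 0]
    # data_mask -> [1, 1, 1, 1, 0, 0]
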
Example #6
    def token2idx(self):
        logging(f'{self.path} in token2idx')
        for tokens in self.premise_data_tokens:
            self.premise_data_idx.append(
                self.tokenizer.convert_tokens_to_ids(tokens))
            self.premise_data_mask.append([1] * len(tokens))

        for tokens in self.premise_label_tokens:
            self.premise_label_idx.append(
                self.tokenizer.convert_tokens_to_ids(tokens))

        for i in range(len(self.premise_data_idx)):
            if len(self.premise_data_idx[i]) < self.sen_len:
                self.premise_data_idx[i] += [0] * (
                    self.sen_len - len(self.premise_data_idx[i]))
                self.premise_label_idx[i] += [0] * (
                    self.sen_len - len(self.premise_label_idx[i]))
                self.premise_data_mask[i] += [0] * (
                    self.sen_len - len(self.premise_data_mask[i]))

        for tokens in self.hypothesis_data_tokens:
            self.hypothesis_data_idx.append(
                self.tokenizer.convert_tokens_to_ids(tokens))
            self.hypothesis_data_mask.append([1] * len(tokens))

        for tokens in self.hypothesis_label_tokens:
            self.hypothesis_label_idx.append(
                self.tokenizer.convert_tokens_to_ids(tokens))

        for i in range(len(self.hypothesis_data_idx)):
            if len(self.hypothesis_data_idx[i]) < self.sen_len:
                self.hypothesis_data_idx[i] += [0] * (
                    self.sen_len - len(self.hypothesis_data_idx[i]))
                self.hypothesis_label_idx[i] += [0] * (
                    self.sen_len - len(self.hypothesis_label_idx[i]))
                self.hypothesis_data_mask[i] += [0] * (
                    self.sen_len - len(self.hypothesis_data_mask[i]))

        for i in range(len(self.premise_data_idx)):
            self.whole_sen.append(self.premise_data_idx[i] +
                                  self.hypothesis_data_idx[i])
            self.whole_mask.append(self.premise_data_mask[i] +
                                   self.hypothesis_data_mask[i])
            # token type ids: 0 for premise positions, 1 for hypothesis positions
            self.whole_type.append([0] * len(self.premise_data_idx[i]) +
                                   [1] * len(self.hypothesis_data_idx[i]))
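
The whole_* lists follow the usual BERT pair-encoding convention: premise and hypothesis ids and masks are concatenated, and the type vector marks premise positions with 0 and hypothesis positions with 1. A tiny runnable illustration with made-up ids:

    # made-up ids: three premise tokens followed by two hypothesis tokens
    premise_idx = [101, 2054, 102]
    hypothesis_idx = [2003, 102]
    whole_sen = premise_idx + hypothesis_idx                          # [101, 2054, 102, 2003, 102]
    whole_type = [0] * len(premise_idx) + [1] * len(hypothesis_idx)   # [0, 0, 0, 1, 1]
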
Example #7
    def data2tokens(self):
        logging(f'{self.path} in data2tokens')
        for sen in self.premise_data:
            data_tokens = ['[CLS]']
            data_tokens += self.tokenizer.tokenize(sen)[:self.sen_len - 1]
            label_tokens = self.tokenizer.tokenize(sen)[:self.sen_len - 1]
            label_tokens += ['[SEP]']
            self.premise_data_tokens.append(data_tokens)
            self.premise_label_tokens.append(label_tokens)

        for sen in self.hypothesis_data:
            data_tokens = ['[CLS]']
            data_tokens += self.tokenizer.tokenize(sen)[:self.sen_len - 1]
            label_tokens = self.tokenizer.tokenize(sen)[:self.sen_len - 1]
            label_tokens += ['[SEP]']
            self.hypothesis_data_tokens.append(data_tokens)
            self.hypothesis_label_tokens.append(label_tokens)
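
For every sentence, data_tokens and label_tokens are the same truncated token sequence shifted by one special token: the input starts with [CLS] and the target ends with [SEP], which looks like an input/target pair for next-token prediction. An illustration with an already tokenized sentence (the tokenizer itself is skipped here):

    # illustrative; `tokens` stands in for self.tokenizer.tokenize(sen)
    tokens = ['a', 'man', 'sleeps', 'on', 'a', 'bench']
    sen_len = 4
    data_tokens = ['[CLS]'] + tokens[:sen_len - 1]     # ['[CLS]', 'a', 'man', 'sleeps']
    label_tokens = tokens[:sen_len - 1] + ['[SEP]']    # ['a', 'man', 'sleeps', '[SEP]']
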
Example #8
    def __init__(self,
                 dataset_name,
                 model_name,
                 device: torch.device,
                 is_load=True,
                 vocab=None):
        assert dataset_name in baseline_config_dataset
        assert model_name in baseline_config_models_list

        self.dataset_name = dataset_name
        self.model_name = model_name
        self.dataset_config = baseline_config_dataset[self.dataset_name]
        self.device = device
        self.net = None
        self.mode_is_training = False
        if 'Bert' not in model_name:
            self.vocab = vocab if vocab else self.__build_vocab()
        else:
            self.vocab = None

        is_entailment = dataset_name == 'SNLI'
        if model_name in {'LSTM', 'LSTM_E'}:
            self.net = self.__build_LSTM(is_bid=False,
                                         is_load=is_load,
                                         is_entailment=is_entailment)
        elif 'BidLSTM' in model_name:
            self.net = self.__build_LSTM(is_bid=True,
                                         is_load=is_load,
                                         is_entailment=is_entailment)
        elif 'TextCNN' in model_name:
            self.net = self.__build_TextCNN(is_load=is_load,
                                            is_entailment=is_entailment)
        elif 'Bert' in model_name:
            self.net = self.__build_Bert(is_load=is_load,
                                         is_entailment=is_entailment)

        self.net.to(device)
        self.net.eval()

        logging(
            f'is_load {is_load} loading baseline {self.dataset_name} {self.model_name}'
        )
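
A hedged usage sketch for this constructor; BaselineModelHolder is an assumed name for the class whose __init__ is shown above, and the dataset/model names must appear in baseline_config_dataset and baseline_config_models_list:

    # hypothetical usage; the class name is an assumption, not from the original code
    import torch

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    holder = BaselineModelHolder('SNLI', 'BidLSTM', device, is_load=True)
    net = holder.net   # already moved to `device` and switched to eval() by __init__
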
Example #9
 def read_standard_data(self, path, debug_mode=False):
     path_list = []
     logging(f'start loading data from {path}')
     dirs = os.listdir(path)
     for dir in dirs:
         if dir == 'pos' or dir == 'neg':
             file_list = os.listdir(os.path.join(path, dir))
             file_list = map(lambda x: os.path.join(path, dir, x),
                             file_list)
             path_list += list(file_list)
     datas = []
     labels = []
     if debug_mode:
         i = 320
         for p in path_list:
             label = 0 if 'neg' in p else 1
             with open(p, 'r', encoding='utf-8') as file:
                 datas.append(file.readline())
                 labels.append(label)
             i -= 1
             if i == 0:
                 break
         # return here so the full pass below is not run again in debug mode
         logging(f'loading data {len(datas)} from {path}')
         return datas, labels
     for p in path_list:
         label = 0 if 'neg' in p else 1
         with open(p, 'r', encoding='utf-8') as file:
             datas.append(file.readline())
             labels.append(label)
     logging(f'loading data {len(datas)} from {path}')
     return datas, labels
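
This reader expects an IMDB-style directory layout: path contains pos/ and neg/ sub-directories with one plain-text review per file, and the label is derived purely from the directory name:

    # layout inferred from the code above; the concrete path is illustrative
    #   <path>/pos/*.txt  -> label 1
    #   <path>/neg/*.txt  -> label 0
    p = 'data/train/neg/review_0001.txt'
    label = 0 if 'neg' in p else 1    # -> 0
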
Example #10
 def read_standard_data(self, path, debug_mode=False):
     data = []
     labels = []
     if debug_mode:
         i = 320
         with open(path, 'r', encoding='utf-8') as file:
             for line in file:
                 i -= 1
                 line = line.strip('\n')
                 data.append(line[:-1])
                 labels.append(int(line[-1]))
                 if i == 0:
                     break
         logging(f'loading data {len(data)} from {path}')
         return data, labels
     with open(path, 'r', encoding='utf-8') as file:
         for line in file:
             line = line.strip('\n')
             data.append(line[:-1])
             labels.append(int(line[-1]))
     logging(f'loading data {len(data)} from {path}')
     return data, labels
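
Here the label is simply the final character of each stripped line, and everything before it (including any separator) is kept as the text. Illustration:

    # illustrative line; the exact separator, if any, is not specified by the code
    line = 'a gripping and well acted thriller 1'
    data = line[:-1]         # 'a gripping and well acted thriller '
    label = int(line[-1])    # 1
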
Example #11
    def __build_vocab(self):
        logging(f'{self.dataset_name} {self.model_name} is building vocab')
        train_data_path = self.dataset_config.train_data_path
        train_dataset = baseline_MyDataset(self.dataset_name,
                                           train_data_path,
                                           is_to_tokens=True)

        if self.dataset_name == 'SNLI':
            return baseline_Vocab(
                train_dataset.data_token['pre'] +
                train_dataset.data_token['hypo'],
                is_using_pretrained=True,
                is_special=True,
                vocab_limit_size=self.dataset_config.vocab_limit_size,
                word_vec_file_path=self.dataset_config.pretrained_word_vectors_path)
        return baseline_Vocab(
            train_dataset.data_token,
            is_using_pretrained=True,
            is_special=False,
            vocab_limit_size=self.dataset_config.vocab_limit_size,
            word_vec_file_path=self.dataset_config.pretrained_word_vectors_path
        )
def main():
    best_path = baseline_config_model_load_path[dataset_name].get(model_name)
    best_state = None
    best_acc = 0.0 if not is_load_model else float(
        re.findall(r'_\d\.\d+_', best_path)[0][1:-1])
    save_acc_limit = args.save_acc_limit
    epoch = args.epoch

    for ep in range(epoch):
        logging(f'epoch {ep} start train')
        train_loss = train()
        logging(f'epoch {ep} start evaluate')
        evaluate_loss, acc = evaluate()
        if acc > best_acc:
            best_acc = acc
            best_path = baseline_config_model_save_path_format.format(
                dataset_name, model_name, acc, get_time(), note)
            best_state = copy.deepcopy(model.net.state_dict())

        if (epoch > 3 and (ep + 1) % (epoch // 3) == 0
                and best_acc > save_acc_limit and best_state is not None):
            logging(f'saving best model acc {best_acc:.5f} in {best_path}')
            torch.save(best_state, best_path)
            best_state = None

        warmup_scheduler.step(ep + 1)
        scheduler.step(evaluate_loss, ep + 1)

        logging(
            f'epoch {ep} done! train_loss {train_loss:.5f} evaluate_loss {evaluate_loss:.5f} \n'
            f'acc {acc:.5f} now best_acc {best_acc:.5f}')

    if best_acc > save_acc_limit and best_state is not None:
        logging(f'saving best model acc {best_acc:.5f} in {best_path}')
        torch.save(best_state, best_path)
        best_state = None
parser.add_argument('--epoch', type=int, default=20)
parser.add_argument('--batch', type=int, default=64)
parser.add_argument('--lr', type=float, default=1e-3)
parser.add_argument('--note', type=str, default='')
parser.add_argument('--load_model',
                    choices=[True, False],
                    default='no',
                    type=parse_bool)
parser.add_argument('--cuda', type=str, default='3')
parser.add_argument('--skip_loss', type=float, default=0.15)
parser.add_argument('--only_evaluate', type=parse_bool, default='no')
parser.add_argument('--scratch', type=parse_bool, default='no')
args = parser.parse_args()

if baseline_debug_mode:
    logging('the debug mode is on!!!')
    time.sleep(1)

dataset_name = args.dataset
dataset_config = baseline_config_dataset[dataset_name]
batch = args.batch if not baseline_debug_mode else 4
lr = args.lr
note = args.note
is_load_model = args.load_model
model_name = args.model
using_bert = model_name in {'Bert', 'Bert_E'}
if args.cuda == 'cpu':
    device = torch.device('cpu')
else:
    device = torch.device('cuda:' + args.cuda)
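
The parse_bool helper used for --load_model, --only_evaluate, and --scratch is referenced but not included in these excerpts; a minimal stand-in consistent with the string defaults 'no'/'yes' could look like this:

    # hypothetical stand-in for the parse_bool referenced above
    def parse_bool(value: str) -> bool:
        return str(value).lower() in {'yes', 'true', '1'}
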