Esempio n. 1
0
    def __init__(self, params, batch_size):
        """Load cached token embeddings and split the corpus into train/dev/test.

        Args:
            params: Mapping read for ``'lang_set'`` (joined with ``'_'`` to
                pick the embedding file) and ``'task_name'`` (iterable of
                ``task_type`` values to keep).
            batch_size: Stored unchanged as ``self.batch_size``.

        Rows whose ``data_set`` column equals ``'test'`` become the test
        split; the remaining rows are cut 95% / 5% (in file order) into the
        train and dev splits.
        """
        split_per = 0.95
        self.batch_size = batch_size
        self.lang_set = params['lang_set']
        self.token_embd_list = []
        embd_path = corpus_embd_file_path_format.format(
            '_'.join(self.lang_set))
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only point this at embedding files produced by a trusted source.
        with open(embd_path, "rb") as f:
            while True:
                try:
                    self.token_embd_list.append(pickle.load(f))
                except EOFError:
                    # End of the stream of concatenated pickles. Any other
                    # error (e.g. a corrupt record) is allowed to propagate
                    # instead of silently truncating the data.
                    break
        self.corpus_df = pd.read_csv(corpus_csv_path,
                                     sep='\001',
                                     encoding='utf-8')
        # Keep rows that belong to ANY requested task. AND-ing one equality
        # test per task selects nothing as soon as two tasks are requested.
        task_names = list(params['task_name'])
        if task_names:
            self.corpus_df = self.corpus_df[
                self.corpus_df['task_type'].isin(task_names)]
        self.corpus_label = self.corpus_df[
            self.corpus_df['data_set'] != 'test']
        self.corpus_unlabel = self.corpus_df[self.corpus_df['data_set'] ==
                                             'test']
        # Dev is the complement of train. A negative slice such as
        # ``[-int(n * 0.05):]`` degenerates to ``[-0:]`` (everything) for
        # small corpora, which would make dev overlap train.
        n_train = int(len(self.corpus_label) * split_per)
        self.corpus_df_train = self.corpus_label[:n_train]
        self.corpus_df_dev = self.corpus_label[n_train:]
        self.corpus_df_test = self.corpus_unlabel

        logger.info("load {} data of {}".format(len(self.token_embd_list),
                                                "/".join(self.lang_set)))
Esempio n. 2
0
 def run_train(self, epochs=1):
     """Train for ``epochs`` epochs, evaluating periodically and saving.

     Epoch numbers continue from ``self.params['epoch_num']``. Batches are
     numbered from ``self.iter_count + 1`` so the step counter carries
     across epochs. Whenever the step number is a multiple of 100 the
     model is evaluated with ``self.run_eval()`` and the metrics are
     logged. After each epoch the model is saved under
     ``<model_name_prefix>_<task names joined by '_'>_<epoch>``.

     Args:
         epochs: Number of epochs to run.
     """
     first_epoch = self.params['epoch_num'] + 1
     for epoch_i in range(first_epoch, first_epoch + epochs):
         logger.info("epoch: {}".format(epoch_i))
         self.model.train()
         # Default to the current counter so that an empty data stream
         # leaves ``self.iter_count`` unchanged instead of resetting it.
         iter_i = self.iter_count
         for iter_i, data_train in enumerate(
                 self.data_stream(data_set='train'), self.iter_count + 1):
             # data_train = [batch_size, (input_batch, target_batch)]
             self.run_train_batch(data_train)
             if iter_i % 100 == 0:
                 self.model.eval()
                 # Captured before run_eval(); presumably the score of the
                 # latest training batch - confirm in run_train_batch.
                 score_tmp = self.score
                 self.run_eval()
                 logger.info(
                     "step:{} loss: {:.4f} train_metric: {:.4f} dev_metric: {:.4f}"
                     .format(iter_i, self.loss, score_tmp, self.score))
                 # Switch back to training mode; otherwise every batch
                 # after the first evaluation would run in eval mode.
                 self.model.train()
         model_name = "_".join([
             self.params['model_name_prefix'],
             '_'.join(self.params['task_name']),
             str(epoch_i)
         ])
         self.save_model(model_name=model_name)
         self.iter_count = iter_i
Esempio n. 3
0
 def __init__(self,
              params,
              model,
              data_stream,
              optimizer=None,
              loss_func=None,
              eval_func=None,
              USE_CUDA=True):
     """Wire together the model, data stream and training components.

     Args:
         params: Stored as ``self.params``.
         model: Passed through ``check_gpu`` and stored as ``self.model``.
         data_stream: Stored as ``self.data_stream``.
         optimizer: Used as given, or built by ``self.build_optimizer()``
             when ``None``.
         loss_func: Used as given, or built by ``self.build_loss_func()``
             when ``None``.
         eval_func: Used as given, or built by ``self.build_eval_func()``
             when ``None``.
         USE_CUDA: Accepted for backward compatibility; not read here.
     """
     self.params = params
     self.model = check_gpu(model)
     self.data_stream = data_stream
     self.optimizer = self.build_optimizer(
     ) if optimizer is None else optimizer
     self.loss_func = self.build_loss_func(
     ) if loss_func is None else loss_func
     self.eval_func = self.build_eval_func(
     ) if eval_func is None else eval_func
     self.iter_count = 0
     self.loss = 0
     self.score = 0
     # Bytes per element come from the tensor itself. Parsing the last two
     # characters of the dtype name fails for dtypes such as int8, uint8
     # or bool.
     total_storage_space = sum(
         p.numel() * p.element_size()
         for p in self.model.parameters()) / pow(2, 20)
     logger.info(
         "Total Model Storage Space: {:.0f} MB".format(total_storage_space))
Esempio n. 4
0
    def __init__(self, params, data_set):
        """Read the corpus CSV and keep only the rows of the requested split.

        Args:
            params: Stored as ``self.params``.
            data_set: ``'train'`` keeps rows tagged ``'training'`` or
                ``'dev'`` and shuffles them; ``'eval'`` keeps ``'trial'``
                rows; ``'predict'`` keeps ``'test'`` rows. Any other value
                is logged as invalid and the whole corpus is kept.
        """
        corpus_csv = os.path.join(data_path, 'SemEval2021_Task2_corpus.csv')
        corpus = pd.read_csv(corpus_csv, sep='\001', encoding='utf-8')
        self.data_set = data_set

        # Split name -> values of the 'data_set' column that belong to it.
        split_tags = (
            ('train', ['training', 'dev']),
            ('eval', ['trial']),
            ('predict', ['test']),
        )
        for split_name, tags in split_tags:
            if data_set == split_name:
                corpus = corpus[corpus['data_set'].isin(tags)]
                break
        else:
            logger.info('Invalid data_set: {}'.format(data_set))

        # Only the training split is shuffled.
        if data_set == 'train':
            corpus = corpus.sample(frac=1).reset_index(drop=True)
        self.corpus_df = corpus

        self.params = params
        # NOTE(review): ``batch_size`` is not a parameter of this method; it
        # resolves to a module-level name that must exist at call time.
        self.batch_size = batch_size

        self.n_corpus = len(self.corpus_df)
        logger.info("data num : {}".format(self.n_corpus))
Esempio n. 5
0
def load_model_xml(reloaded):
    """Rebuild a ``TransformerModel`` and its parameters from a checkpoint.

    Args:
        reloaded: Mapping with the keys ``'params'``, ``'dico_id2word'``,
            ``'dico_word2id'``, ``'dico_counts'`` and ``'model'``.

    Returns:
        A ``(model, params)`` tuple. ``params`` is an ``AttrDict`` extended
        with the dictionary (``'dico'``), the vocabulary size (``n_words``)
        and the indices of the special tokens.
    """
    params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(params.lang2id.keys()))

    # Rebuild the vocabulary and attach it to the parameters.
    vocab = dictionary.Dictionary(
        reloaded['dico_id2word'],
        reloaded['dico_word2id'],
        reloaded['dico_counts'],
    )
    params['dico'] = vocab

    # Vocabulary size and special-token indices are derived from the vocab.
    params.n_words = len(vocab)
    special_tokens = (
        ('bos_index', dictionary.BOS_WORD),
        ('eos_index', dictionary.EOS_WORD),
        ('pad_index', dictionary.PAD_WORD),
        ('unk_index', dictionary.UNK_WORD),
        ('mask_index', dictionary.MASK_WORD),
    )
    for attr_name, token in special_tokens:
        setattr(params, attr_name, vocab.index(token))

    # Build the network, move it to the GPU when one is available, then
    # load the stored weights with frozen=True.
    # NOTE(review): the meaning of the two positional ``True`` flags is not
    # visible here - check the TransformerModel signature.
    model = framework.check_gpu(
        TransformerModel(params, params['dico'], True, True))
    framework.load_model_params(
        model=model,
        model_params_from_file=reloaded['model'],
        frozen=True,
    )

    return model, params
Esempio n. 6
0
def load_model_params(model, model_params_from_file, frozen=None):
    """Load matching tensors from a checkpoint into ``model`` and log them.

    Every named parameter of ``model`` whose name is a key of
    ``model_params_from_file`` is initialised from the stored tensor.
    Loading is non-strict, so parameters absent from the file keep their
    current values. One log line is written per parameter.

    Args:
        model: Object exposing ``named_parameters()`` and
            ``load_state_dict()``.
        model_params_from_file: Mapping from parameter name to tensor.
        frozen: ``None`` leaves ``requires_grad`` untouched. ``True`` or
            ``False`` sets ``requires_grad = not frozen`` on each model
            parameter that is initialised from the file.
    """
    model_params = {}
    for para_name, para_value in model.named_parameters():
        from_file = para_name in model_params_from_file
        if from_file:
            model_params[para_name] = model_params_from_file[para_name]
            if frozen is not None:
                # load_state_dict() only copies values, so the flag has to
                # be set on the model's own parameter. Setting it on the
                # checkpoint tensor never freezes anything in the model.
                para_value.requires_grad = not frozen
        log_format = ("[{}]{}{}[{}] **INIT_FROM_FILE**"
                      if from_file else "[{}]{}{}[{}]")
        logger.info(log_format.format(
            'Not Frozen' if para_value.requires_grad else 'Frozen',
            para_name,
            list(para_value.size()),
            str(para_value.dtype).split(".")[-1],
        ))
    model.load_state_dict(model_params, strict=False)
Esempio n. 7
0
    prediction = torch.argmax(output, dim=1)
    correct_num = (prediction == label).sum().float()
    total_num = len(label)
    acc = correct_num / total_num
    return acc


def check_gpu(x):
    """Return ``x.cuda()`` when CUDA is available, otherwise ``x`` itself.

    ``None`` is passed through unchanged.
    """
    if x is not None and torch.cuda.is_available():
        return x.cuda()
    return x


# Pick the device name once at import time and log which one is in use.
GPU_OR_CPU = 'cuda' if torch.cuda.is_available() else 'cpu'
logger.info("USE GPU" if GPU_OR_CPU == 'cuda' else "USE CPU")

if __name__ == '__main__':
    logger.info("END")

# b = torch.from_numpy(a)
# feature_tensor_single.detach().numpy()
# The learnable parameters of a model are returned by net.parameters().
# params = list(net.parameters())
Esempio n. 8
0
# BERT-flow experiment directory and the checkpoint file inside it.
model_bertflow_path = os.path.join(
    data_root_path,
    'BERT-flow',
    'exp',
    'exp_t_STS-B_ep_1.00_lr_5.00e-05_e_avg-last-2_f_11_1.00e-03_allsplits',
)
model_bertflow_file = os.path.join(model_bertflow_path, 'model.ckpt-269')

# Corpus artefacts stored under data_path. The "{}" in the *_format path is
# a placeholder to be filled in with str.format().
corpus_embd_file_path = os.path.join(data_path, "corpus_embd.npy")
corpus_embd_file_path_format = os.path.join(
    data_path, "corpus_embd_{}.npy")
corpus_token_file_path = os.path.join(data_path, "corpus_token.txt")

corpus_csv_path = os.path.join(
    data_path, 'SemEval2021_Task2_corpus.csv')

# print(tensor.device)
# print(torch.cuda.device_count())


if __name__ == "__main__":
    # Smoke test: read every pickled record stored in the embedding file.
    import pickle
    from util_tools import logger
    logger.info("BEGIN")
    corpus_list = []
    # NOTE(security): pickle can execute arbitrary code while loading; only
    # use this on files produced by a trusted source.
    with open(corpus_embd_file_path, "rb") as f:
        while True:
            try:
                corpus_list.append(pickle.load(f))
            except EOFError:
                # Normal end of the stream of concatenated pickles. Other
                # errors propagate instead of being silently swallowed.
                break
    logger.info("END")

Esempio n. 9
0
                           for x, y in zip(df_batch['sent1_token_keyword_idx'],
                                           df_batch['sent2_token_keyword_idx'])
                           ]
            label = np.array(df_batch['label']).astype(np.int)
            label = framework.check_gpu(torch.tensor(label, dtype=torch.long))
            yield (token_embd_batch, keyword_idx), label


if __name__ == '__main__':
    # Script entry point: build the model, optimizer, loss and data stream,
    # then train for 30 epochs.
    batch_size = 32
    params = get_params()
    model = load_model(params)

    # Only parameters that still require gradients are optimised.
    trainable_params = (p for p in model.parameters() if p.requires_grad)
    optimizer = optim.Adam(trainable_params, lr=0.01)
    loss_func = nn.CrossEntropyLoss()
    data_stream = generate_data(params=params, batch_size=batch_size)

    trainer = framework.framework(
        params=params,
        model=model,
        data_stream=data_stream,
        optimizer=optimizer,
        loss_func=loss_func,
    )
    trainer.run_train(epochs=30)

    torch.cuda.empty_cache()
    logger.info('END')