import codecs
import json
import math
from collections import Counter

import data_utils


def get_ngram_idf(train_data_path, devtest_data_path, to_file, threshold=1):
    print("==" * 40)
    print("==" * 40)

    train_microblogs = json.load(open(train_data_path, "r", encoding="utf-8"))
    devtest_microblogs = json.load(open(devtest_data_path, "r", encoding="utf-8"))
    all_microblogs = train_microblogs + devtest_microblogs
    total = len(all_microblogs)
    # count the number of documents each word appears in (document frequency)
    IDF_Counter = Counter()

    # vocab = set([])
    vocab = {}
    for microblog in all_microblogs:
        for word in set(microblog["response"]):
            # if float(microblog["sentiment score"]) < 0:
            IDF_Counter[word] += 1
            # vocab.add(word)
            data_utils.set_dict_key_value(vocab, word)
    # drop words whose document frequency is below threshold
    data_utils.removeItemsInDict(vocab, threshold)

    # idf: log(N / (df + 1))
    dict_idf = {}
    for word in vocab:
        dict_idf[word] = math.log(total / float(IDF_Counter[word] + 1))
    # dict_idf = sorted(dict_idf.items(), key=lambda e: e[1], reverse=False)
    data_utils.save_params(dict_idf, to_file)


def get_entropy(idf_dic, from_file, to_file):
    with codecs.open(from_file, 'r', encoding='utf8') as f1:
        dict_entropy = {}
        for line in f1:
            line_list = line.strip().split(" ")
            entropy = 0.0
            for word in line_list:
                # words pruned from the IDF vocab contribute 0 (assumption to avoid a KeyError)
                entropy += idf_dic.get(word, 0.0)
            dict_entropy[line.strip()] = entropy / len(line_list)
        data_utils.save_params(dict_entropy, to_file)
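# A minimal usage sketch for the two helpers above (hedged): the file paths and the
# threshold value are hypothetical, and data_utils.load_params is assumed to be the
# counterpart of data_utils.save_params, as in the loaders further below.
get_ngram_idf("data/train.json", "data/devtest.json", "cache/idf.pkl", threshold=2)
idf_dic = data_utils.load_params("cache/idf.pkl")
get_entropy(idf_dic, "data/candidates.txt", "cache/line_scores.pkl")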
    def init_embedding(self):
        self.word_embed_file = self.config.word_embed_file
        self.word_dim = self.config.word_dim
        self.threshold = self.config.threshold
        self.we_file = self.config.we_file
        self.w2i_file = self.config.w2i_file
        self.r2i_file = self.config.r2i_file
        self.u2i_file = self.config.u2i_file

        # build the vocabularies and word embedding matrix on first run; otherwise load cached params
        if self.config.init:
            self.utter_vocab, self.res_vocab, self.vocab = self.build_vocab()
            self.embed = data_utils.load_word_embedding(
                self.vocab, self.word_embed_file, self.config, self.word_dim)
            data_utils.save_params(self.vocab, self.w2i_file)
            data_utils.save_params(self.res_vocab, self.r2i_file)
            data_utils.save_params(self.utter_vocab, self.u2i_file)
            data_utils.save_params(self.embed, self.we_file)
        else:
            self.embed = data_utils.load_params(self.we_file)
            self.vocab = data_utils.load_params(self.w2i_file)
            self.res_vocab = data_utils.load_params(self.r2i_file)
            self.utter_vocab = data_utils.load_params(self.u2i_file)
            self.embed = self.embed.astype(np.float32)
        print("vocab size: %d" % len(self.vocab), "we shape: ",
              self.embed.shape)
    def init_embedding(self):
        self.word_embed_file = self.config.word_embed_file
        self.word_dim = self.config.word_dim
        self.char_dim = self.config.char_dim
        self.ner_dim = self.config.ner_dim
        self.pos_dim = self.config.pos_dim
        self.threshold = self.config.threshold

        self.we_file = self.config.we_file
        self.w2i_file = self.config.w2i_file
        self.c2i_file = self.config.c2i_file
        self.n2i_file = self.config.n2i_file
        self.p2i_file = self.config.p2i_file

        # word embeddings are built or loaded depending on config.init;
        # the char/ner/pos embeddings below are always randomly initialized
        if self.config.init:
            self.word_vocab, self.char_vocab, self.ner_vocab, self.pos_vocab = \
                self.build_vocab()
            self.embed = data_utils.load_word_embedding(
                self.word_vocab, self.word_embed_file, self.config,
                self.word_dim)
            data_utils.save_params(self.word_vocab, self.w2i_file)
            data_utils.save_params(self.char_vocab, self.c2i_file)
            data_utils.save_params(self.ner_vocab, self.n2i_file)
            data_utils.save_params(self.pos_vocab, self.p2i_file)
            data_utils.save_params(self.embed, self.we_file)
        else:
            self.embed = data_utils.load_params(self.we_file)
            self.word_vocab = data_utils.load_params(self.w2i_file)
            self.char_vocab = data_utils.load_params(self.c2i_file)
            self.ner_vocab = data_utils.load_params(self.n2i_file)
            self.pos_vocab = data_utils.load_params(self.p2i_file)
            self.embed = self.embed.astype(np.float32)
        self.char_embed = np.random.uniform(
            -0.25, 0.25, (len(self.char_vocab), self.char_dim)).astype(np.float32)
        self.ner_embed = np.random.uniform(
            -0.25, 0.25, (len(self.ner_vocab), self.ner_dim)).astype(np.float32)
        self.pos_embed = np.random.uniform(
            -0.25, 0.25, (len(self.pos_vocab), self.pos_dim)).astype(np.float32)
        print("vocab size: %d" % len(self.word_vocab), "we shape: ",
              self.embed.shape)
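# A hedged consumption sketch for the loader above: "loader" is a hypothetical instance
# of the enclosing class, and the assumption (based on the w2i naming) is that
# word_vocab maps tokens to row indices of the embed matrix.
loader.init_embedding()
unk_id = loader.word_vocab.get("<unk>", 0)          # hypothetical unknown-token entry
token_ids = [loader.word_vocab.get(w, unk_id) for w in ["good", "morning"]]
token_vectors = loader.embed[token_ids]             # shape: (2, word_dim)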
Example #5
    def train(self, output_dir, train_batch_size, gradient_accumulation_steps, seed,
              epochs, data_path, pretrained_path, valid_path, no_cuda=False, dropout=0.3,
              weight_decay=0.01, warmup_proportion=0.1, learning_rate=5e-5, adam_epsilon=1e-8,
              max_seq_length=128, squeeze=True, max_grad_norm=1.0, eval_batch_size=32, epoch_save_model=False,
              model_name='BERT', embedding_path=None, split_train_data=False, motherfile=False):
        if os.path.exists(output_dir) and os.listdir(output_dir):
            raise ValueError("Output directory (%s) already exists and is not empty." % output_dir)
        
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S',
                            level=logging.INFO,
                            filename=os.path.join(output_dir, "log.txt"))
        logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
        logger = logging.getLogger(__name__)

        if gradient_accumulation_steps < 1:
            raise ValueError("Invalid gradient_accumulation_steps parameter: %d, should be >= 1"
                             % gradient_accumulation_steps)

        train_batch_size = train_batch_size // gradient_accumulation_steps
    
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        # add one for IGNORE label
        if motherfile:
            print(data_path)
            train_examples, train_label_list = get_examples_from_motherfile(data_path, 'train')
            val_examples, val_label_list = get_examples_from_motherfile(data_path, 'test')
            train_label_list.extend(val_label_list)
            label_list = list(set(train_label_list))
        elif split_train_data:
            examples, label_list = get_examples(data_path, 'train')
            random.shuffle(examples)
            train_examples = examples[0:int(len(examples)*0.6)]
            val_examples = examples[int(len(examples)*0.6):int(len(examples)*0.8)]
            eval_examples = examples[int(len(examples)*0.8):]
        else:
            train_examples, label_list = get_examples(data_path, 'train')
        num_labels = len(label_list) + 1
        num_train_optimization_steps = int(
            len(train_examples) / train_batch_size / gradient_accumulation_steps) * epochs
        
        hidden_size = 300 if pretrained_path is None else 768 if 'base' in pretrained_path else 1024
        device = 'cuda:0' if (torch.cuda.is_available() and not no_cuda) else 'cpu'
        logger.info(device)
        print(pretrained_path)
        if model_name == 'HERBERT':
            model = AutoTokenizerForTokenClassification(
                pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=dropout,
                device=device)
        elif model_name == 'BERT_MULTILINGUAL':
            model = BertBaseMultilingualCased(
                pretrained_path=pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=dropout,
                device=device)
        elif model_name == 'Reformer':
            model = Reformer(n_labels=num_labels, hidden_size=512,
                             dropout=dropout, device=device, max_seq_length=max_seq_length,
                             batch_size=train_batch_size)
        else:
            model = XLMRForTokenClassification(
                pretrained_path=pretrained_path, n_labels=num_labels,
                hidden_size=hidden_size, dropout=dropout, device=device)

        model.to(device)
        no_decay = ['bias', 'final_layer_norm.weight']

        params = list(model.named_parameters())

        optimizer_grouped_parameters = [
            {'params': [p for n, p in params if not any(
                nd in n for nd in no_decay)], 'weight_decay': weight_decay},
            {'params': [p for n, p in params if any(
                nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        warmup_steps = int(warmup_proportion * num_train_optimization_steps)
        optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps)

        train_features = convert_examples_to_features(
            train_examples, label_list, max_seq_length, model.encode_word)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        train_data = create_dataset(train_features)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(
            train_data, sampler=train_sampler, batch_size=train_batch_size)
        if not split_train_data:
            val_examples, _ = get_examples(valid_path, 'valid')
        val_features = convert_examples_to_features(
            val_examples, label_list, max_seq_length, model.encode_word)

        val_data = create_dataset(val_features)
        
        best_val_f1 = 0.0

        for epoch_no in range(1, epochs+1):
            logger.info("Epoch %d" % epoch_no)
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            
            model.train()
            steps = len(train_dataloader)
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, label_ids, l_mask, valid_ids = batch
                loss = model(input_ids, label_ids, l_mask, valid_ids)
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps

                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if step % 5 == 0:
                    logger.info('Step = %d/%d; Loss = %.4f' % (step+1, steps, tr_loss / (step+1)))
                if (step + 1) % gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()

            logger.info("\nTesting on validation set...")
            f1, report = evaluate_model(model, val_data, label_list, eval_batch_size, device)
            print(report)
            if f1 > best_val_f1:
                best_val_f1 = f1
                logger.info("\nFound better f1=%.4f on validation set. Saving model\n" % f1)
                logger.info("%s\n" % report)
                torch.save(model.state_dict(), os.path.join(output_dir, 'model.pt'))
                save_params(output_dir, dropout, num_labels, label_list)

            if epoch_save_model:
                epoch_output_dir = os.path.join(output_dir, "e%03d" % epoch_no)
                os.makedirs(epoch_output_dir)
                torch.save(model.state_dict(), os.path.join(epoch_output_dir, 'model.pt'))
                save_params(epoch_output_dir, dropout, num_labels, label_list)
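# A hedged invocation sketch for the trainer above: "trainer" and the file paths are
# hypothetical placeholders; only the keyword names come from the signature of train().
trainer.train(
    output_dir="out/xlmr_run", train_batch_size=32, gradient_accumulation_steps=2,
    seed=42, epochs=5, data_path="data/ner_corpus", pretrained_path="xlmr.base",
    valid_path="data/ner_corpus")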
Example #6
    def train(self, model, x_train, y_train, label_map, epochs, train_batch_size, seed, x_valid, y_valid, gradient_accumulation_steps, output_dir, max_seq_length=128,
              weight_decay=0.01, warmup_proportion=0.1, learning_rate=0.01, adam_epsilon=1e-8, no_cuda=False, max_grad_norm=1.0, eval_batch_size=32,
              epoch_save_model=False, dropout=0.2, save=True, logger=None):
        if not logger:
            logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
            logger = logging.getLogger(__name__)

        num_train_optimization_steps = int(
            len(x_train) / train_batch_size / gradient_accumulation_steps) * epochs
        params = list(model.named_parameters())
        no_decay = ['bias', 'final_layer_norm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in params if not any(
                nd in n for nd in no_decay)], 'weight_decay': weight_decay},
            {'params': [p for n, p in params if any(
                nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        warmup_steps = int(warmup_proportion * num_train_optimization_steps)
        optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps)

        if save and os.path.exists(output_dir) and os.listdir(output_dir):
            raise ValueError("Output directory (%s) already exists and is not empty." % output_dir)
        
        if save and not os.path.exists(output_dir):
            os.makedirs(output_dir)

        device = 'cuda:1' if (torch.cuda.is_available() and not no_cuda) else 'cpu'
        logger.info(device)
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        model.to(device)
        best_val_f1 = 0.0
        best_precision = 0.0
        best_recall = 0.0
        epoch_times = []
        # number of batches per epoch, including a possible partial final batch
        steps = (len(x_train) + train_batch_size - 1) // train_batch_size
        for epoch_no in range(1, epochs+1):
            start = timer()
            logger.info("Epoch %d" % epoch_no)
            tr_loss = 0
            model.train()
            for step in range(0, steps):
                div = int(step*train_batch_size)
                if len(x_train) > div + train_batch_size:
                    input_ids, label_ids, l_mask, valid_ids = get_batch(
                        x_train[div:div + train_batch_size], y_train[div:div + train_batch_size],
                        device=device, embed_method=model.encode_word, max_seq_length=max_seq_length,
                        label_map=label_map, batch_size=train_batch_size)
                else:
                    input_ids, label_ids, l_mask, valid_ids = get_batch(
                        x_train[div:], y_train[div:], device=device,
                        embed_method=model.encode_word, max_seq_length=max_seq_length,
                        label_map=label_map, batch_size=train_batch_size)
                loss = model(input_ids, label_ids, l_mask, valid_ids)
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps
                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), max_grad_norm)
                tr_loss += loss.item()
                if step % 5 == 0:
                    logger.info('Step = %d/%d; Loss = %.4f' % (step+1, steps, tr_loss / (step+1)))
                if (step + 1) % gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
            # record wall-clock time for the full epoch
            epoch_times.append(timer() - start)
            logger.info("\nTesting on validation set...")
            f1, report, precision, recall = self.evaluate_model(model, x_valid, y_valid, label_map, eval_batch_size, device, max_seq_length)
            print(report)
            if f1 > best_val_f1:
                best_val_f1 = f1
                best_precision = precision
                best_recall = recall
                logger.info("\nFound better f1=%.4f on validation set. Saving model\n" % f1)
                print(report)
                if save:
                    torch.save(model.state_dict(), os.path.join(output_dir, 'model.pt'))
                    save_params(output_dir, dropout, len(label_map.keys()), list(label_map.keys()))

            if save and epoch_save_model:
                epoch_output_dir = os.path.join(output_dir, "e%03d" % epoch_no)
                os.makedirs(epoch_output_dir)
                torch.save(model.state_dict(), os.path.join(epoch_output_dir, 'model.pt'))
                save_params(epoch_output_dir, dropout, len(label_map.keys()), list(label_map.keys()))
        print("Avg. epoch time: %.4f" % np.mean(epoch_times))
        return best_val_f1, best_precision, best_recall
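# A hedged usage sketch of the trainer above: "trainer", the model, the tokenized
# sentences and the label_map are hypothetical placeholders; only the keyword names
# and the three return values come from the signature of train().
best_f1, best_precision, best_recall = trainer.train(
    model=model,
    x_train=x_train, y_train=y_train,
    label_map={"O": 1, "B-PER": 2, "I-PER": 3},
    epochs=3, train_batch_size=16, seed=42,
    x_valid=x_valid, y_valid=y_valid,
    gradient_accumulation_steps=1,
    output_dir="out/ner_run")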