Example #1
    def __prepare_train_data(self, X_r: list, y_r: list, source2index: dict,
                             target2index: dict):
        X_p, y_p = [], []
        for source, target in zip(X_r, y_r):
            X_p.append(
                prepare_sequence(['<s>'] + source + ['</s>'],
                                 source2index).view(1, -1))
            y_p.append(
                prepare_sequence(['<s>'] + target + ['</s>'],
                                 target2index).view(1, -1))

        train_data = list(zip(X_p, y_p))
        return train_data
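A note on the helper used throughout: every example on this page calls some variant of prepare_sequence. Signatures differ (later variants also take a device, a gpu flag, or a pad_length), but a minimal sketch of the classic form, assuming to_ix is a token-to-index dict, is:

import torch

def prepare_sequence(seq, to_ix):
    # Map each token to its integer index and wrap it in a LongTensor.
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)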
Example #2
def train(epoch):
    print("Training Epoch:", epoch)
    att_g.train()
    total_loss = 0
    total_img_loss = 0
    total_text_loss = 0
    for idx, (img, caps, info) in enumerate(train_dataset):
        img = img.unsqueeze(0)  # add a batch dimension
        img = img.to(device)
        img_loss, text_loss = 0, 0
        optimizer.zero_grad()
        for s in caps:
            seq = prepare_sequence(s, w_map, device, tag=True)
            r_img, r_text, attn = att_g(img, seq)
            img_loss += img_criterion(r_img, img)
            text_loss += text_criterion(r_text, seq)
        loss = img_loss + text_loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_img_loss += img_loss.item()
        total_text_loss += text_loss.item()
        step = idx + 1
        if step % LOG_FREQ == 0:
            print("Step: %d, Loss: %.2f, ImgLoss: %.2f, TextLoss: %.2f" % (step, total_loss/step, total_img_loss/step, total_text_loss/step))
Example #3
    def forward(self, word_input, anneal=False):

        char_idx = []
        for sentence in word_input:
            sentence_chars = []
            for token_idx in sentence:
                token_chars = []
                token = self.vocab.idx2key[token_idx]
                if len(token) <= 20:
                    token_chars.append(
                        prepare_sequence(token, self.char_set) +
                        [self.char_set["<pad>"]] * (20 - len(token)))
                else:
                    # Over-long tokens: keep the first 13 and last 7
                    # characters so every token is exactly 20 chars long.
                    token_chars.append(
                        prepare_sequence(token[0:13] + token[-7:],
                                         self.char_set))

                sentence_chars.append(token_chars[0].copy())
            char_idx.append(sentence_chars.copy())

        char_input, _ = pad_packed_sequence(
            pack_sequence([torch.LongTensor(seq) for seq in char_idx],
                          enforce_sorted=False),
            batch_first=True,
            padding_value=self.char_set["<pad>"],
        )
        char_input = char_input.to(self.device)

        batch_size = word_input.size(0)
        seq_len = word_input.size(1)
        char_output = self.char_encoder(
            char_input.reshape(-1, char_input.size(2))).reshape(
                batch_size, seq_len, -1)
        word_output = self.word_encoder(word_input, char_output)
        y = self.decoder(word_output)

        if anneal:
            preds = F.log_softmax(y / self.T, dim=2)
        else:
            preds = F.log_softmax(y, dim=2)

        return {"last_preds": preds}  # , "embeddings": word_output}
Example #4
def train(target_dir,
          embedding_dim,
          hidden_dim,
          glove_file):
    torch.manual_seed(1)
    train_word_to_ix, train_tag_to_ix, train_sents_idx, train_labels_idx = pickle.load(
        open(target_dir + "CoNLL_train.pkl", "rb"))
    test_word_to_ix, test_tag_to_ix, test_sents_idx, test_labels_idx = pickle.load(
        open(target_dir + "CoNLL_test.pkl", "rb"))
    model = LSTMTagger(embedding_dim,
                       hidden_dim,
                       len(train_word_to_ix),
                       len(train_tag_to_ix),
                       target_dir,
                       glove_file)
    criterion = nn.NLLLoss()
    optimizer = optim.RMSprop(model.parameters())

    EPOCHS = 2
    for epoch in range(EPOCHS):
        total_loss = 0
        for i, (sentence, tags) in tqdm(enumerate(zip(train_sents_idx, train_labels_idx))):
            model.zero_grad()
            model.hidden = model.init_hidden()
            # Convert the words to a tensor of word indices
            sentence_in = utils.prepare_sequence(sentence)
            # Convert the tags to a tensor of tag indices
            targets = utils.prepare_sequence(tags)
            tag_scores = model(sentence_in)
            loss = criterion(tag_scores, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        f1_score_train_sents_avg = inference.evaluate(model,
                                                      train_sents_idx[:len(test_sents_idx)],
                                                      train_labels_idx[:len(test_sents_idx)])
        f1_score_test_sents_avg = inference.evaluate(model,
                                                     test_sents_idx,
                                                     test_labels_idx)
        print("[{}] EPOCH {} - LOSS: {:.8f} TRAIN_DATA_F1_SCORE: {} TEST_DATA_F1_SCORE: {}".
                format(datetime.datetime.today(), epoch + 1, loss,
                       f1_score_train_sents_avg, f1_score_test_sents_avg))
Example #5
    def __predict_sentence(self, src_batch):
        """
        predict sentence
        :param src_batch: get the source sentence
        :return:
        """
        hyp_batch = ''

        inputs = prepare_sequence(['<s>'] + src_batch + ['</s>'], self.data_model.source2index).view(1, -1)
        start_decode = Variable(LongTensor([[self.data_model.target2index['<s>']] * inputs.size(1)]))
        show_preds = self.qrnn(inputs, [inputs.size(1)], start_decode)
        outputs = torch.max(show_preds, dim=1)[1].view(len(inputs), -1)
        for pred in outputs.data.tolist():
            for each_pred in pred:
                hyp_batch += self.data_model.index2target[each_pred]
        hyp_batch = hyp_batch.replace('<s>', '')
        hyp_batch = hyp_batch.replace('</s>', '')
        return hyp_batch
Example #6
    def generate_seq(self, hidden_f, seq):
        batch_size = 1
        outputs = torch.zeros(seq.size(0), batch_size,
                              self.text_encoder.vocab_size).to(self.device)

        hidden_f = hidden_f.view(hidden_f.size(1), -1)
        hidden = hidden_f.mean(1).view(2, 1, self.text_decoder.hidden_dim // 2)
        hidden = (hidden, hidden)

        input = prepare_sequence([START_TAG], self.w_map, self.device)

        for t in range(0, len(seq)):
            output, hidden = self.text_decoder(input, hidden)
            outputs[t] = output
            top1 = output.max(1)[1]
            if top1.item() == END_TAG:  # stop once the end tag is generated
                break
            input = top1

        outputs = outputs.view(outputs.size(0), -1)
        return outputs
Example #7
def test(fert_model, out_path):

    toks = 0
    all_out_ferts = []

    print("Starting evaluation on test set... (%d sentences)" % len(test_data))
    for sentence in test_data:
        fert_model.zero_grad()
        fert_model.hidden = fert_model.init_hidden()

        sentence_in = utils.prepare_sequence(sentence,
                                             word_to_ix,
                                             gpu=args.gpu)

        fert_scores = fert_model(sentence_in.view(1, -1))
        if args.model_type == 'regression':
            out_ferts = fert_scores.cpu().data.numpy().flatten()
        else:
            expected = True
            if expected:
                # Softmax over fertility classes, then take the expected
                # fertility: sum over k of (k + 1) * P(k).
                ss = fert_scores.cpu().data.numpy()
                probs = np.exp(ss) / np.tile(
                    np.exp(ss).sum(2)[:, :, None], (1, 1, ss.shape[2]))
                out_ferts = (probs[0, :, :] * np.tile(
                    (1. + np.arange(ss.shape[2])), (ss.shape[1], 1))).sum(1)
            else:
                values, indices = torch.max(fert_scores, 2)
                out_ferts = indices.cpu().data.numpy().flatten() + 1

        toks += out_ferts.shape[0]
        all_out_ferts.append(out_ferts.tolist())

    print("Writing predicted fertility values..")
    # Write fertility values to file
    with open(out_path, 'w') as f:
        for ferts in all_out_ferts:
            for fert in ferts:
                f.write("%s " % fert)
            f.write("\n")
Example #8
def infer(model, sent_idx):
    with torch.no_grad():
        inputs = utils.prepare_sequence(sent_idx)
        tag_scores = model(inputs)
        _, pred_tag = torch.max(tag_scores.data, 1)
        return pred_tag
Example #9
def eval(fert_model, curEpoch=None):

    correct = 0
    toks = 0
    num_matches = num_pred = num_gold = 0
    all_out_ferts = []
    # all_targets = np.array([])

    print("Starting evaluation on dev set... (%d sentences)" % len(dev_data))

    for start_idx, end_idx in dev_order:

        dev_sents = dev_data[start_idx:end_idx + 1]
        target_ferts = dev_ferts[start_idx:end_idx + 1]

        fert_model.zero_grad()
        fert_model.hidden = fert_model.init_hidden(len(dev_sents))

        batch_sents = torch.stack([
            utils.prepare_sequence(sentence, word_to_ix, gpu=args.gpu)
            for sentence in dev_sents
        ])
        batch_ferts = torch.stack([
            utils.prepare_sequence(ferts, gpu=args.gpu)
            for ferts in target_ferts
        ])

        fert_scores = fert_model(batch_sents)

        if args.model_type == 'regression':
            out_ferts = fert_scores.cpu().data.numpy().flatten()
            out_ferts = np.round(out_ferts)

            gold_ferts = batch_ferts.cpu().data.numpy().flatten()
            correct += np.count_nonzero(out_ferts == gold_ferts)
            toks += out_ferts.shape[0]
            num_matches += np.count_nonzero(
                np.logical_and(out_ferts == gold_ferts, gold_ferts != 1))
            num_pred += np.count_nonzero(out_ferts != 1)
            num_gold += np.count_nonzero(gold_ferts != 1)
        else:
            values, indices = torch.max(fert_scores, 2)
            out_ferts = indices.cpu().data.numpy().flatten() + 1

            gold_ferts = batch_ferts.cpu().data.numpy().flatten()
            correct += np.count_nonzero(out_ferts == gold_ferts)
            toks += out_ferts.shape[0]
            num_matches += np.count_nonzero(
                np.logical_and(out_ferts == gold_ferts, gold_ferts != 1))
            num_pred += np.count_nonzero(out_ferts != 1)
            num_gold += np.count_nonzero(gold_ferts != 1)

        all_out_ferts.append(out_ferts.tolist())
        # all_targets = np.append(all_targets, batch_ferts)

    precision = num_matches / num_pred
    recall = num_matches / num_gold
    f1 = 2. * num_matches / (num_pred + num_gold)
    avg_tok_accuracy = correct / toks

    print("Dev Set Accuracy: %f" % avg_tok_accuracy)
    print("Dev Set Precision: %f" % precision)
    print("Dev Set Recall: %f" % recall)
    print("Dev Set F1: %f" % f1)

    return f1  #avg_tok_accuracy
Example #10
def main():

    ############################################################################################

    if not os.path.isfile(args.model_name):
        if args.model_type == 'regression':
            fert_model = models.BiLSTMRegressor(args.emb_dim, args.hidden_dim,
                                                len(word_to_ix), args.max_fert,
                                                args.n_layers, args.dropout,
                                                args.gpu)
        else:
            fert_model = models.BiLSTMTagger(args.emb_dim, args.hidden_dim,
                                             len(word_to_ix), args.max_fert,
                                             args.n_layers, args.mlp_dim,
                                             args.dropout, args.gpu)
        custom_weight = torch.ones(args.max_fert)
        custom_weight[0] = 0.5  #0.6 # TODO: set this as a hyperparameter?
        if args.gpu:
            fert_model = fert_model.cuda()
            custom_weight = custom_weight.cuda()
        if args.model_type == 'regression':
            loss_function = nn.MSELoss()
        else:
            loss_function = nn.NLLLoss(weight=custom_weight)

        #optimizer = optim.SGD(fert_model.parameters(), lr=0.1)
        #optimizer = optim.Adam(fert_model.parameters(), lr=0.001)
        #optimizer = optim.Adam(fert_model.parameters(), lr=0.001, weight_decay=0.001)
        optimizer = optim.Adam(fert_model.parameters(), lr=0.001)
        print("Training fertility predictor model...")
        patience_counter = 0
        prev_avg_tok_accuracy = 0
        best_avg_tok_accuracy = 0
        random.shuffle(train_order)

        for epoch in range(args.epochs):
            accuracies = []
            sent = 0
            tokens = 0
            cum_loss = 0
            batch_idx = 1

            num_matches = num_pred = num_gold = 0
            print("Starting epoch %d .." % epoch)
            for start_idx, end_idx in train_order:
                train_sents = training_data[start_idx:end_idx + 1]
                target_ferts = training_ferts[start_idx:end_idx + 1]
                sent += end_idx - start_idx + 1
                tokens += sum([len(sentence) for sentence in train_sents])

                metric = "MSE" if args.model_type == 'regression' else "Average Accuracy"

                if batch_idx % 100 == 0:
                    print("[Epoch %d] Sentence %d/%d, Tokens %d, "
                          "Cum_Loss: %f, %s: %f" %
                          (epoch, sent, len(training_data), tokens,
                           cum_loss / tokens, metric,
                           sum(accuracies) / len(accuracies)))

                # Step 1. Remember that Pytorch accumulates gradients.  We need to clear them out
                # before each instance
                fert_model.zero_grad()

                # Also, we need to clear out the hidden state of the LSTM, detaching it from its
                # history on the last instance.
                fert_model.hidden = fert_model.init_hidden(len(train_sents))

                # Step 2. Get our inputs ready for the network, that is, turn them into Variables
                # of word indices.
                batch_sents = torch.stack([
                    utils.prepare_sequence(sentence, word_to_ix, gpu=args.gpu)
                    for sentence in train_sents
                ])
                batch_ferts = torch.stack([
                    utils.prepare_sequence(ferts, gpu=args.gpu)
                    for ferts in target_ferts
                ])

                # Step 3. Run our forward pass.
                fert_scores = fert_model(batch_sents)

                if args.model_type == 'regression':
                    out_ferts = fert_scores.cpu().data.numpy().flatten()

                    err = (out_ferts -
                           batch_ferts.float().cpu().data.numpy().flatten())
                    sent_acc = sum(err**2 / out_ferts.shape[0])
                    accuracies.append(sent_acc)  # This is actually MSE.

                    out_ferts = np.round(out_ferts)
                    gold_ferts = batch_ferts.cpu().data.numpy().flatten()
                    #sent_acc = np.count_nonzero(out_ferts==gold_ferts) / out_ferts.shape[0]
                    #accuracies.append(sent_acc)
                    num_matches += np.count_nonzero(
                        np.logical_and(out_ferts == gold_ferts,
                                       gold_ferts != 1))
                    num_pred += np.count_nonzero(out_ferts != 1)
                    num_gold += np.count_nonzero(gold_ferts != 1)

                    # Step 4. Compute the loss, gradients, and update the parameters
                    loss = loss_function(fert_scores, batch_ferts.float())
                else:
                    values, indices = torch.max(fert_scores, 2)
                    out_ferts = indices.cpu().data.numpy().flatten() + 1

                    gold_ferts = batch_ferts.cpu().data.numpy().flatten()
                    sent_acc = np.count_nonzero(
                        out_ferts == gold_ferts) / out_ferts.shape[0]
                    accuracies.append(sent_acc)
                    num_matches += np.count_nonzero(
                        np.logical_and(out_ferts == gold_ferts,
                                       gold_ferts != 1))
                    num_pred += np.count_nonzero(out_ferts != 1)
                    num_gold += np.count_nonzero(gold_ferts != 1)

                    # Step 4. Compute the loss, gradients, and update the parameters
                    loss = loss_function(
                        fert_scores.view(
                            len(train_sents) * len(train_sents[0]), -1),
                        batch_ferts.view(-1) - 1)

                cum_loss += loss.item()
                loss.backward()
                optimizer.step()
                batch_idx += 1

            precision = num_matches / num_pred
            recall = num_matches / num_gold
            f1 = 2. * num_matches / (num_pred + num_gold)

            print("Loss: %f" % loss.cpu().data.numpy())
            print("Accuracy: %f" % np.mean(accuracies))
            print("Precision: %f" % precision)
            print("Recall: %f" % recall)
            print("F1: %f" % f1)
            print("Evaluating on dev set...")
            avg_tok_accuracy = eval(fert_model, epoch)
            if avg_tok_accuracy > best_avg_tok_accuracy:
                best_avg_tok_accuracy = avg_tok_accuracy
                print("Saving model..")
                torch.save(fert_model, args.model_name)

            # Early Stopping
            if avg_tok_accuracy <= prev_avg_tok_accuracy:
                patience_counter += 1
                if patience_counter == args.patience:
                    print(
                        "Model hasn't improved on dev set for %d epochs. Stopping Training."
                        % patience_counter)
                    break

            prev_avg_tok_accuracy = avg_tok_accuracy

    else:
        print("Loading tagger model from " + args.model_name + "...")
        fert_model = torch.load(args.model_name)
        if args.gpu:
            fert_model = fert_model.cuda()

    if args.test:
        out_path = args.write_fertilities if args.write_fertilities else args.test_source_path + ".fert.predicted"
        test(fert_model, out_path)
Example #11
    def seg(self, text, user_dict=None):
        text = text.strip()
        if len(text) == 0:
            return []

        results = []

        if user_dict is None:
            new_dict = self.userdict
        else:
            new_dict = Trie()
            new_dict.add_dict_word(user_dict)

        # word_list = utils.get_word(text)
        word_list = list(text)  # one entry per character
        # print(word_list)
        sentence_tensor = utils.prepare_sequence(word_list, self.char2index)
        # print((sentence_tensor))

        with torch.no_grad():
            sentence, text, mask = utils.collate_fn_without_label([
                (sentence_tensor, text)
            ])
            batch_size, seq_len = sentence.shape
            nb_labels = len(utils.tag_to_ix)
            text_score = torch.zeros(batch_size, seq_len, nb_labels).float()
            for i in range(batch_size):
                matchs = new_dict.cut(text[i])
                matchs.extend(process_eng(text[i]))
                # print(matchs)
                for m in matchs:
                    weight = new_dict.get_weight(m[2]) * 10.0
                    if len(m[2]) == 1:
                        text_score[i, m[0],
                                   utils.tag_to_ix[SINGLE_TAG]] = weight
                    elif len(m[2]) == 2:
                        text_score[i, m[0],
                                   utils.tag_to_ix[BEGIN_TAG]] = weight
                        text_score[i, m[0] + 1,
                                   utils.tag_to_ix[END_TAG]] = weight
                    else:
                        text_score[i, m[0],
                                   utils.tag_to_ix[BEGIN_TAG]] = weight
                        text_score[i, m[1] - 1,
                                   utils.tag_to_ix[END_TAG]] = weight
                        text_score[i, m[0] + 1:m[1] - 1,
                                   utils.tag_to_ix[MIDDLE_TAG]] = weight

            masks = mask.to(self.device)
            sen = sentence.to(self.device)
            temp_pred = self.segmentor_model(sen, mask=masks,
                                             text=text_score)[1]
            for i in range(batch_size):
                result = ''
                for j in range(len(temp_pred[i])):
                    # if text[i][j] == ' ':
                    #     continue
                    result += text[i][j]
                    if temp_pred[i][j] == 4 or temp_pred[i][j] == 3:  # word-final tags
                        results.append(result)
                        result = ''
        return results
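seg() above relies on utils.collate_fn_without_label to batch and mask the index tensors. A minimal sketch of what such a collate function might look like, hypothetical since utils is not shown, is:

import torch

def collate_fn_without_label(batch):
    # batch: list of (index_tensor, raw_text) pairs.
    tensors, texts = zip(*batch)
    max_len = max(t.size(0) for t in tensors)
    sentences = torch.zeros(len(tensors), max_len, dtype=torch.long)
    mask = torch.zeros(len(tensors), max_len, dtype=torch.bool)
    for i, t in enumerate(tensors):
        sentences[i, :t.size(0)] = t   # left-aligned, zero-padded
        mask[i, :t.size(0)] = True     # True on real tokens
    return sentences, list(texts), mask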
Example #12
    def preprocess(self, bin_dataframe):
        # bin_np = bin_dataframe.as_matrix()
        bin_np = bin_dataframe.to_numpy()
        docNr = -1

        bin_tweets = []

        bin_tweet_lengths = []
        bin_tweets_text = []

        previous_match = ""

        match = []
        for i in range(bin_np.shape[0]):

            if bin_np[i][1] is None or i == bin_np.shape[0] - 1:  # append all docs including the last one
                if (i == bin_np.shape[0] - 1):  # append last line
                    tweet_text = utils.lstToString(utils.strToLst(bin_np[i][1])).split()
                    tweet, tweet_length = utils.prepare_sequence(
                        tweet_text, self.word_to_ix,
                        pad_length=self.pad_length)
                    bin_tweets.append(tweet)
                    bin_tweet_lengths.append(tweet_length)
                    bin_tweets_text.append(tweet_text)


                if (docNr != -1):
                    #bin_tweets = np.asarray(bin_tweets)


                    try:
                        tag_id = self.tag_to_ix[target]

                        if target.startswith("B-") or target.startswith("I-"):
                            ec_id = self.ec_to_ix[target[2:]]
                        else:
                            ec_id = self.ec_to_ix[target]
                    except KeyError:
                        # print(target)
                        if target.startswith("B-"):
                            tag_id = self.tag_to_ix["B-Other"]
                        elif target.startswith("I-"):
                            tag_id = self.tag_to_ix["I-Other"]

                        ec_id = self.ec_to_ix["Other"]

                    if target=="O":
                        event_duration_idx = self.event_to_ix["non-event"]
                    else:
                        event_duration_idx = self.event_to_ix["event"]                                  
                    if event_id==-1:
                        independent_event_idx = self.event_to_ix["non-event"]
                    else:
                        independent_event_idx = self.event_to_ix["event"]

                    #print (len(bin_tweets))
                    #print (torch.stack(bin_tweets))
                    match.append([
                        torch.stack(bin_tweets), tag_id, ec_id,
                        event_duration_idx, independent_event_idx,
                        event_type, event_id, bin_tweet_lengths
                    ])

                    #print (utils.getDictionaryKeyByIdx(self.tag_to_ix,tag_id),utils.getDictionaryKeyByIdx(self.ec_to_ix,ec_id),utils.getDictionaryKeyByIdx(self.event_to_ix,event_id))

                    # match=np.append(match,bin_tokens)
                    # match['match_bins'].append(bin)

                docNr += 1
                if i != bin_np.shape[0] - 1:
                    infoDict = utils.strToLst(bin_np[i][0])
                    # print('infoDict', infoDict)

                    if previous_match != infoDict['doc']:
                        # print (infoDict['doc'])

                        # match = {'match_bins': np.empty((0)),"match_name": infoDict['doc']}
                        previous_match = infoDict['doc']

                        # below two lines should be interchanged i think
                        match = []
                        self.matches.append(match)

                    bin_tweets = []
                    bin_tweet_lengths = []
                    bin_tweets_text = []
                    target = infoDict['corrected_tags']
                    event_type = infoDict['event_type']
                    event_id = infoDict['event_id']
                    match_name = infoDict['doc']


                    # {'bin': infoDict['bin'],'targets': infoDict['corrected_tags'],'tweets':[],'timestamps':[],'tokens':""}
            else:

                # bin['tweets'].append(strToLst(bin_np[i][1]))
                # bin_tokens+=" "+lstToString(strToLst(bin_np[i][1]))
                # bin['timestamps'].append(int(bin_np[i][0]))
                # print ((lstToString(strToLst(bin_np[i][1])).split()))
                #print (bin_tokens)
                tweet_text = utils.lstToString(utils.strToLst(bin_np[i][1])).split()
                tweet, tweet_length = utils.prepare_sequence(
                    tweet_text, self.word_to_ix, pad_length=self.pad_length)

                bin_tweets.append(tweet)
                bin_tweet_lengths.append(tweet_length)
                bin_tweets_text.append(tweet_text)
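preprocess() above expects utils.prepare_sequence to accept a pad_length and to return both the padded tensor and the original length. A minimal sketch of such a variant, an assumption since the real helper may handle OOV tokens and truncation differently, is:

import torch

def prepare_sequence(seq, to_ix, pad_length, pad_idx=0):
    # Map tokens to indices, truncate to pad_length, then right-pad;
    # return the pre-padding length as well.
    idxs = [to_ix.get(w, pad_idx) for w in seq][:pad_length]
    length = len(idxs)
    idxs += [pad_idx] * (pad_length - length)
    return torch.tensor(idxs, dtype=torch.long), length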
def eval(tagger_model, k, dev_or_test="dev"):

    if k == -1:
        eval_data = dev_data if dev_or_test == "dev" else test_data
    else:
        eval_data = dev_datasets[k]
    correct = 0
    toks = 0
    hypTags = []
    goldTags = []
    all_out_tags = np.array([])
    all_targets = np.array([])
    logProbs = []
    print("Starting evaluation on %s set... (%d sentences)" % (dev_or_test, len(eval_data)))
    lang_id = []
    if args.model_type=="universal":
        lang_id = [lang]
    sentCount = 0
    for sentence, morph in eval_data:
        tagger_model.zero_grad()
        tagger_model.char_hidden = tagger_model.init_hidden()
        tagger_model.hidden = tagger_model.init_hidden()
        sent_in = []
        sentCount += 1
        for word in sentence:
            s_appended_word  = lang_id + [c for c in word] + lang_id
            word_in = utils.prepare_sequence(s_appended_word, char_to_ix, args.gpu)
            sent_in.append(word_in)

        #targets = utils.prepare_sequence(morph, labels_to_ix, args.gpu)
        if args.sum_word_char:
            word_seq = utils.prepare_sequence(sentence, word_to_ix, args.gpu)
        else:
            word_seq = None

        if args.model_type=="specific":
            tag_scores = tagger_model(sent_in, word_idxs=word_seq, lang=langs[-1], test=True)
        else:
            tag_scores = tagger_model(sent_in, word_idxs=word_seq, test=True)

        tag_scores = tag_scores[:, :-1]

        #values, indices = torch.topk(tag_scores, k=100, dim=1)
        values, indices = torch.max(tag_scores, 1)
        out_tags = indices.cpu().data.numpy()
        #for i in range(out_tags.shape[0]):
        #    hypTags.append([utils.unfreeze_dict(ix_to_labels[idx]) for idx in out_tags[i]])
        hypTags.append([ix_to_labels[idx] for idx in out_tags])
        scores = values.cpu().data.numpy()
        #logProbs += [list(scores[i]) for i in range(scores.shape[0])]
        #all_out_tags = np.append(all_out_tags, out_tags)
        goldTags.append(morph)
        #targets = targets.cpu().data.numpy()
        #correct += np.count_nonzero(out_tags==targets)
        #print(out_tags)
        #correct += np.count_nonzero(np.array([ix_to_labels[idx] for idx in out_tags])==np.array(morph))
        toks += len(sentence)

    avg_tok_accuracy = correct / toks  # note: `correct` is never incremented above

    prefix = args.model_type + "_"
    if args.sum_word_char:
        prefix += "_wc-sum"

    prefix += "-".join([l for l in langs]) + "_" + dev_or_test

    if args.sent_attn:
        prefix += "-sent_attn"

    if args.tgt_size:
        prefix += "_" + str(args.tgt_size)

    write = True
    folds = 10
    dev_size = int(len(training_data) / folds) if args.jackknife else None
    if write:
        utils.write_unimorph(args.treebank_path, hypTags, logProbs, sentCount, k, dev_or_test=dev_or_test, dev_size=dev_size)

    return avg_tok_accuracy
Example #14
def train(k, training_data_jack, dev_data_jack):

    if not os.path.isfile(args.model_name) or args.continue_train:
        if args.continue_train:
            print("Loading tagger model from " + args.model_name + "...")
            tagger_model = torch.load(args.model_name, map_location=lambda storage, loc: storage)
            if args.gpu:
                tagger_model = tagger_model.cuda()

        else:
            tagger_model = models.BiLSTMTagger(args, word_freq, langs, len(char_to_ix), len(word_to_ix), len(labels_to_ix))
            if args.gpu:
                tagger_model = tagger_model.cuda()

        loss_function = nn.NLLLoss()

        if args.optim=="sgd":
            optimizer = optim.SGD(tagger_model.parameters(), lr=0.1)
        elif args.optim=="adam":
            optimizer = optim.Adam(tagger_model.parameters())
        elif args.optim=="adagrad":
            optimizer = optim.Adagrad(tagger_model.parameters())
        elif args.optim=="rmsprop":
            optimizer = optim.RMSprop(tagger_model.parameters())

        print("Training tagger model...")
        patience_counter = 0
        prev_avg_tok_accuracy = 0
        for epoch in range(args.epochs):
            accuracies = []
            sent = 0
            tokens = 0
            cum_loss = 0
            correct = 0
            print("Starting epoch %d .." %epoch)
            for lang in langs:
                lang_id = []
                if args.model_type=="universal":
                    lang_id = [lang]
                for sentence, morph in training_data_jack:
                    sent += 1

                    if sent % 100 == 0:
                        print("[Epoch %d] Sentence %d/%d, Tokens %d, "
                              "Cum_Loss: %f, Average Accuracy: %f" %
                              (epoch, sent, len(training_data_jack), tokens,
                               cum_loss / tokens, correct / tokens))

                    tagger_model.zero_grad()
                    sent_in = []
                    tokens += len(sentence)

                    for word in sentence:
                        s_appended_word  = lang_id + [c for c in word] + lang_id
                        word_in = utils.prepare_sequence(s_appended_word, char_to_ix, args.gpu)
                        # targets = utils.prepare_sequence(s_appended_word[1:], char_to_ix, args.gpu)
                        sent_in.append(word_in)

                    # sent_in = torch.stack(sent_in)
                    tagger_model.char_hidden = tagger_model.init_hidden()
                    tagger_model.hidden = tagger_model.init_hidden()

                    targets = utils.prepare_sequence(morph, labels_to_ix, args.gpu)

                    if args.sum_word_char:
                        word_seq = utils.prepare_sequence(sentence, word_to_ix, args.gpu)
                    else:
                        word_seq = None

                    if args.model_type=="specific" or args.model_type=="joint":
                        tag_scores = tagger_model(sent_in, word_idxs=word_seq, lang=lang)
                    else:
                        tag_scores = tagger_model(sent_in, word_idxs=word_seq)

                    values, indices = torch.max(tag_scores, 1)
                    out_tags = indices.cpu().data.numpy().flatten()
                    correct += np.count_nonzero(out_tags==targets.cpu().data.numpy())
                    loss = loss_function(tag_scores, targets)
                    cum_loss += loss.item()
                    loss.backward()
                    optimizer.step()

            print("Loss: %f" % loss.cpu().data.numpy())
            print("Accuracy: %f" %(correct/tokens))
            print("Saving model..")
            torch.save(tagger_model, args.model_name)
            #print("Evaluating on dev set...")
            #avg_tok_accuracy = eval(tagger_model, curEpoch=epoch)

            # Early Stopping
            #if avg_tok_accuracy <= prev_avg_tok_accuracy:
            #    patience_counter += 1
            #    if patience_counter==args.patience:
            #        print("Model hasn't improved on dev set for %d epochs. Stopping Training." % patience_counter)
            #        break

            #prev_avg_tok_accuracy = avg_tok_accuracy
    else:
        print("Loading tagger model from " + args.model_name + "...")
        tagger_model = torch.load(args.model_name, map_location=lambda storage, loc: storage)
        if args.gpu:
            tagger_model = tagger_model.cuda()

    if args.test:
        avg_tok_accuracy = eval(tagger_model, args.fold, dev_or_test=dev_or_test)
   
    return tagger_model
Example #15
        test_data.append((test_sentence[i], test_pos[i], test_labels[i]))

    for i in range(len(val_sentence)):
        val_data.append((val_sentence[i], val_pos[i], val_labels[i]))

    word_to_ix = {}
    label_to_ix = {}
    pos_to_ix = {}

    utils.append_to_vocab(train_data, word_to_ix, label_to_ix, pos_to_ix)
    utils.append_to_vocab(test_data, word_to_ix, label_to_ix, pos_to_ix)
    utils.append_to_vocab(val_data, word_to_ix, label_to_ix, pos_to_ix)

    idx_train_data = []
    for sentence, pos, tags in train_data:
        idx_sentences = utils.prepare_sequence(sentence, word_to_ix)
        idx_labels = utils.prepare_sequence(tags, label_to_ix)
        idx_pos = utils.prepare_sequence(pos, pos_to_ix)
        idx_train_data.append((idx_sentences, idx_pos, idx_labels))

    idx_test_data = []
    for sentence, pos, tags in test_data:
        idx_sentences = utils.prepare_sequence(sentence, word_to_ix)
        idx_labels = utils.prepare_sequence(tags, label_to_ix)
        idx_pos = utils.prepare_sequence(pos, pos_to_ix)
        idx_test_data.append((idx_sentences, idx_pos, idx_labels))

    idx_val_data = []
    for sentence, pos, tags in val_data:
        idx_sentences = utils.prepare_sequence(sentence, word_to_ix)
        idx_labels = utils.prepare_sequence(tags, label_to_ix)
        idx_pos = utils.prepare_sequence(pos, pos_to_ix)
        idx_val_data.append((idx_sentences, idx_pos, idx_labels))
Example #16
def eval(tagger_model, curEpoch=None, dev_or_test="dev"):

    eval_data = dev_data if dev_or_test == "dev" else test_data
    correct = 0
    toks = 0
    hypTags = []
    goldTags = []
    all_out_tags = np.array([])
    all_targets = np.array([])
    print("Starting evaluation on %s set... (%d sentences)" %
          (dev_or_test, len(eval_data)))
    lang_id = []
    if args.model_type == "universal":
        lang_id = [lang]
    s = 0
    for sentence, morph in eval_data:
        tagger_model.zero_grad()
        tagger_model.char_hidden = tagger_model.init_hidden()
        tagger_model.hidden = tagger_model.init_hidden()
        sent_in = []

        for word in sentence:
            s_appended_word = lang_id + [c for c in word] + lang_id
            word_in = utils.prepare_sequence(s_appended_word, char_to_ix,
                                             args.gpu)
            sent_in.append(word_in)

        targets = utils.prepare_sequence(morph, labels_to_ix, args.gpu)

        if args.sum_word_char:
            word_seq = utils.prepare_sequence(sentence, word_to_ix, args.gpu)
        else:
            word_seq = None

        if args.model_type == "specific":
            tag_scores = tagger_model(sent_in,
                                      word_idxs=word_seq,
                                      lang=langs[-1],
                                      test=True)
        else:
            tag_scores = tagger_model(sent_in, word_idxs=word_seq, test=True)

        values, indices = torch.max(tag_scores, 1)
        out_tags = indices.cpu().data.numpy().flatten()
        hypTags += [ix_to_labels[idx] for idx in out_tags]
        goldTags.append(morph)
        targets = targets.cpu().data.numpy()
        correct += np.count_nonzero(out_tags == targets)
        toks += len(sentence)

    avg_tok_accuracy = correct / toks

    prefix = args.model_type + "_"
    if args.sum_word_char:
        prefix += "_wc-sum"

    if dev_or_test == "dev":
        prefix += "-".join([l for l in langs
                            ]) + "_" + dev_or_test + "_" + str(curEpoch)
    else:
        prefix += "-".join([l for l in langs]) + "_" + dev_or_test

    if args.sent_attn:
        prefix += "-sent_attn"

    if args.tgt_size:
        prefix += "_" + str(args.tgt_size)

    finalTgts = []
    for tags in goldTags:
        for tag in tags:
            finalTgts.append(tag)

    f1_score, f1_micro_score = utils.computeF1(hypTags,
                                               finalTgts,
                                               prefix,
                                               labels_to_ix,
                                               baseline=True,
                                               write_results=True)
    print("Test Set Accuracy: %f" % avg_tok_accuracy)
    print("Test Set Avg F1 Score (Macro): %f" % f1_score)
    print("Test Set Avg F1 Score (Micro): %f" % f1_micro_score)

    with open(prefix + '_results_f1.txt', 'a') as file:
        file.write("\nAccuracy: " + str(avg_tok_accuracy) + "\n")

    return avg_tok_accuracy, f1_score
Example #17
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
model.cuda()
time_epoch = time.time()
print("训练集句子数:",training_data_sentence_num)
print("训练集字数:",training_data_character_num)
for epoch in range(10):
    for sentence, tags in training_data:
        model.zero_grad()
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.LongTensor([tag_to_ix[t] for t in tags])
        neg_log_likelihood = model.neg_log_likelihood(sentence_in, targets)
        neg_log_likelihood.backward()
        optimizer.step()
    print("第", epoch, "轮:", time.time() - time_epoch, "秒,",(time.time() - time_epoch)/60,"分钟")
    time_epoch = time.time()
torch.save(model, 'word_segment_model.pkl')  # save the entire network
# Load all of the test-set data
test_data,test_data_sentence_num,test_data_character_num = utils.generate_test_data("./icwb2-data/testing/pku_test_gold_BIO.utf8")

f = open("./icwb2-data/testing/test_model_10epoch.txt",'a')
tag_ind = ['B', 'I', 'O', '', '']  # trailing empty entries are placeholders (likely START/STOP tags)
print("测试集句子数:",test_data_sentence_num)
print("测试集字数:",test_data_character_num)
for item in test_data:
    prediction = model(utils.prepare_sequence(item, word_to_ix))
    for tag_index in prediction[1]:
        f.write(tag_ind[tag_index] + '\n')

f.close()
Example #18
def eval_on_dev(tagger_model, curEpoch=None, dev_or_test="dev"):

    correct = 0
    toks = 0
    all_out_tags = np.array([])
    all_targets = np.array([])

    eval_order = dev_order if dev_or_test == "dev" else test_order
    eval_data = dev_data if dev_or_test == "dev" else test_data

    print("Starting evaluation on %s set... (%d sentences)" %
          (dev_or_test, len(eval_data)))

    lang_id = []
    if args.model_type == "universal":
        lang_id = [langs[-1]]

    for start_idx, end_idx in eval_order:

        cur_eval_data = eval_data[start_idx:end_idx + 1]
        eval_sents = [elem[0] for elem in cur_eval_data]
        morph_sents = [elem[1] for elem in cur_eval_data]

        sents_in = []

        for i, sentence in enumerate(eval_sents):
            sent_in = []
            for word in sentence:
                s_appended_word = lang_id + [c for c in word] + lang_id
                word_in = utils.prepare_sequence(s_appended_word, char_to_ix,
                                                 args.gpu)
                # targets = utils.prepare_sequence(s_appended_word[1:], char_to_ix, args.gpu)
                sent_in.append(word_in)
            sents_in.append(sent_in)

        tagger_model.zero_grad()
        tagger_model.char_hidden = tagger_model.init_hidden()
        tagger_model.hidden = tagger_model.init_hidden()

        all_word_seq = []
        for sentence in eval_sents:
            word_seq = utils.prepare_sequence(sentence, word_to_ix, args.gpu)
            all_word_seq.append(word_seq)

        if args.model_type == "specific" or args.model_type == "joint":
            lstm_feats, graph, maxVal = tagger_model(sents_in,
                                                     morph_sents,
                                                     word_idxs=all_word_seq,
                                                     langs=[langs[-1]] *
                                                     len(sents_in),
                                                     test=True)
        else:
            lstm_feats, graph, maxVal = tagger_model(sents_in,
                                                     morph_sents,
                                                     word_idxs=all_word_seq,
                                                     test=True)

        for k in range(len(eval_sents)):
            hypSeq = tagger_model.getBestSequence(graph, k)
            targets = [utils.unfreeze_dict(tags) for tags in morph_sents[k]]
            correct += utils.getCorrectCount(targets, hypSeq)
            toks += len(eval_sents[k])
            all_out_tags = np.append(all_out_tags, hypSeq)
            all_targets = np.append(all_targets, targets)
    avg_tok_accuracy = correct / toks

    prefix = args.model_name
    prefix += "_" + dev_or_test

    if args.sent_attn:
        prefix += "sent_attn"

    if args.tgt_size:
        prefix += "_" + str(args.tgt_size)

    write = dev_or_test == "test"

    f1_score, f1_micro_score = utils.computeF1(all_out_tags,
                                               all_targets,
                                               prefix,
                                               write_results=write)
    print("Test Set Accuracy: %f" % avg_tok_accuracy)
    print("Test Set Avg F1 Score (Macro): %f" % f1_score)
    print("Test Set Avg F1 Score (Micro): %f" % f1_micro_score)

    if write:
        with open(prefix + '_results_f1.txt', 'a') as file:
            file.write("\nAccuracy: " + str(avg_tok_accuracy) + "\n")
            for target, hyp in zip(all_targets, all_out_tags):
                file.write(str(target) + "\n")
                file.write(str(hyp) + "\n")

    return avg_tok_accuracy, f1_score
Example #19
def main():
    if not os.path.isfile(args.model_name) or args.continue_train:
        if args.continue_train:
            print("Loading tagger model from " + args.model_name + "...")
            tagger_model = torch.load(
                args.model_name, map_location=lambda storage, loc: storage)
            if args.gpu:
                tagger_model = tagger_model.cuda()
        else:
            print("Creating new model...")
            tagger_model = factorial_crf_tagger.DynamicCRF(
                args, word_freq, langs, len(char_to_ix), len(word_to_ix),
                unique_tags)
            if args.gpu:
                tagger_model = tagger_model.cuda()

        if args.unit_test:
            tests = unit.TestBP()
            labelSum = sum([tag.size() for tag in tagger_model.uniqueTags])
            # Create dummy LSTM features
            lstm_feats = utils.get_var(
                torch.randn(len(training_data[0][0]), labelSum), args.gpu)
            tests.setUp(tagger_model, training_data[0][1],
                        len(training_data[0][0]), lstm_feats)

        loss_function = nn.NLLLoss()
        # Provide (N,C) log probability values as input
        # loss_function = nn.CrossEntropyLoss()

        if args.optim == "sgd":
            optimizer = optim.SGD(tagger_model.parameters(), lr=1.0)
        elif args.optim == "adam":
            optimizer = optim.Adam(tagger_model.parameters())
        elif args.optim == "adagrad":
            optimizer = optim.Adagrad(tagger_model.parameters())

        print("Training FCRF-LSTM model...")
        patience_counter = 0
        prev_avg_tok_accuracy = 0
        for epoch in range(args.epochs):
            accuracies = []
            sent = 0
            batch_idx = 0
            tokens = 0
            cum_loss = 0
            correct = 0
            random.shuffle(train_order)
            print("Starting epoch %d .." % epoch)

            start_time = time.time()
            for start_idx, end_idx in train_order:
                train_data = training_data[start_idx:end_idx + 1]
                train_sents = [elem[0] for elem in train_data]
                morph_sents = [elem[1] for elem in train_data]

                lang_ids = train_lang_ids[start_idx:end_idx + 1]

                sent += end_idx - start_idx + 1
                tokens += sum([len(sentence) for sentence in train_sents])
                batch_idx += 1

                if batch_idx % 5 == 0:
                    print("[Epoch %d] Sentence %d/%d, Tokens %d, "
                          "Cum_Loss: %f, Time: %f, Tokens/Sec: %d" %
                          (epoch, sent, len(training_data), tokens,
                           cum_loss / tokens, time.time() - start_time,
                           tokens / (time.time() - start_time)))
                    # Average Accuracy: correct/tokens (disabled)

                tagger_model.zero_grad()

                sents_in = []

                for i, sentence in enumerate(train_sents):
                    sent_in = []
                    lang_id = []
                    if args.model_type == "universal":
                        lang_id = [lang_ids[i]]

                    for word in sentence:
                        s_appended_word = lang_id + [c for c in word] + lang_id
                        word_in = utils.prepare_sequence(
                            s_appended_word, char_to_ix, args.gpu)
                        # targets = utils.prepare_sequence(s_appended_word[1:], char_to_ix, args.gpu)
                        sent_in.append(word_in)
                    sents_in.append(sent_in)

                # sents_in = torch.stack(sent_in)
                tagger_model.char_hidden = tagger_model.init_hidden()
                tagger_model.hidden = tagger_model.init_hidden()

                if args.sum_word_char:
                    all_word_seq = []
                    for sentence in train_sents:
                        word_seq = utils.prepare_sequence(
                            sentence, word_to_ix, args.gpu)
                        all_word_seq.append(word_seq)
                else:
                    all_word_seq = None

                if args.model_type == "specific" or args.model_type == "joint":
                    lstm_feat_sents, graph, maxVal = tagger_model(
                        sents_in,
                        morph_sents,
                        word_idxs=all_word_seq,
                        langs=lang_ids)
                else:
                    lstm_feat_sents, graph, maxVal = tagger_model(
                        sents_in, morph_sents, word_idxs=all_word_seq)

                # Skip parameter updates if marginals are not within a threshold
                if maxVal > 10.00:
                    print("Skipping parameter updates...")
                    continue

                # Compute the loss, gradients, and update the parameters
                all_factors_batch = []

                for k in range(len(train_sents)):
                    all_factors = tagger_model.get_scores(
                        graph, morph_sents[k], lstm_feat_sents[k], k)
                    all_factors_batch.append(all_factors)

                loss = tagger_model.compute_loss(all_factors_batch,
                                                 loss_function)
                # print("Loss:", loss)

                cum_loss += loss.item()
                loss.backward()
                # tagger_model.gradient_check(all_factors_batch[0])
                optimizer.step()

            print("Loss: %f" % loss.cpu().data.numpy())
            print("Saving model..")
            torch.save(tagger_model, args.model_name)
            if (epoch + 1) % 4 == 0:
                print("Evaluating on dev set...")
                avg_tok_accuracy, f1_score = eval_on_dev(tagger_model,
                                                         curEpoch=epoch)

                # Early Stopping
                if avg_tok_accuracy <= prev_avg_tok_accuracy:
                    patience_counter += 1
                    if patience_counter == args.patience:
                        print(
                            "Model hasn't improved on dev set for %d epochs. Stopping Training."
                            % patience_counter)
                        break

                prev_avg_tok_accuracy = avg_tok_accuracy
    else:
        print("Loading tagger model from " + args.model_name + "...")
        tagger_model = torch.load(args.model_name,
                                  map_location=lambda storage, loc: storage)
        if args.gpu:
            tagger_model = tagger_model.cuda()
        else:
            tagger_model.gpu = False

        if args.visualize:
            print("[Visualization Mode]")
            utils.plot_heatmap(unique_tags, tagger_model.pairwise_weights,
                               "pair")
            #utils.plot_heatmap(unique_tags, tagger_model.transition_weights, "trans")
            #utils.plot_heatmap(unique_tags, tagger_model.lang_pairwise_weights, "pair", lang_idx=1)
            print("Stored plots in figures/ directory!")

        if args.test:
            avg_tok_accuracy, f1_score = eval_on_dev(tagger_model,
                                                     dev_or_test="test")
Example #20
F_value_best = 0


if GPU_available:
    model.cuda()
    print('moved model to GPU!!!')

end = time.time()
eps = 0.00000000001
for epoch in range(start_epoch, 100):
    losses = AverageMeter()
    for i, (sentence, tags) in enumerate(training_data):
        model.zero_grad()

        # prepare torch.Tensor
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
        if GPU_available:
            sentence_in = sentence_in.cuda()
            targets = targets.cuda()
        loss = model.neg_log_likelihood(sentence_in, targets)

        losses.update(loss.item(), 1)

        loss.backward()
        optimizer.step()

    if train_mode:
        print('Epoch: [{0}]\t'
              'Loss: {losses.avg:.4f}\t'
              'Time: {epoch_time:.2f}'.format(epoch, losses=losses,
                                              epoch_time=time.time() - end))
Example #21
def prepare_data(path, with_pos=False):
    """
        prepare data before training/evaluating
    """
    print("------- prep data --------")
    # Load data
    sentence_path = os.path.join(path, 'train/sentences.txt')
    test_sentence_path = os.path.join(path, 'test/sentences.txt')
    val_sentence_path = os.path.join(path, 'val/sentences.txt')

    label_path = os.path.join(path, 'train/labels.txt')
    test_label_path = os.path.join(path, 'test/labels.txt')
    val_label_path = os.path.join(path, 'val/labels.txt')

    if with_pos:
        pos_path = os.path.join(path, 'train/pos.txt')
        test_pos_path = os.path.join(path, 'test/pos.txt')
        val_pos_path = os.path.join(path, 'val/pos.txt')

    train_sentence = []
    train_labels = []
    train_pos = []

    test_sentence = []
    test_labels = []
    test_pos = []

    val_sentence = []
    val_labels = []
    val_pos = []

    utils.load_data(sentence_path, train_sentence)
    utils.load_data(label_path, train_labels)
    if with_pos:
        utils.load_data(pos_path, train_pos)

    utils.load_data(test_sentence_path, test_sentence)
    utils.load_data(test_label_path, test_labels)
    if with_pos:
        utils.load_data(test_pos_path, test_pos)

    utils.load_data(val_sentence_path, val_sentence)
    utils.load_data(val_label_path, val_labels)
    if with_pos:
        utils.load_data(val_pos_path, val_pos)

    train_data = []
    test_data = []
    val_data = []

    if with_pos:
        for i in range(len(train_sentence)):
            train_data.append(
                (train_sentence[i], train_pos[i], train_labels[i]))

        for i in range(len(test_sentence)):
            test_data.append((test_sentence[i], test_pos[i], test_labels[i]))

        for i in range(len(val_sentence)):
            val_data.append((val_sentence[i], val_pos[i], val_labels[i]))
    else:
        for i in range(len(train_sentence)):
            train_data.append((train_sentence[i], train_labels[i]))

        for i in range(len(test_sentence)):
            test_data.append((test_sentence[i], test_labels[i]))

        for i in range(len(val_sentence)):
            val_data.append((val_sentence[i], val_labels[i]))

    utils.append_to_vocab(train_data, word_to_ix, label_to_ix, pos_to_ix,
                          with_pos)
    utils.append_to_vocab(test_data, word_to_ix, label_to_ix, pos_to_ix,
                          with_pos)
    utils.append_to_vocab(val_data, word_to_ix, label_to_ix, pos_to_ix,
                          with_pos)

    if with_pos:
        for sentence, pos, tags in train_data:
            idx_sentences = utils.prepare_sequence(sentence, word_to_ix)
            idx_labels = utils.prepare_sequence(tags, label_to_ix)
            idx_pos = utils.prepare_sequence(pos, pos_to_ix)
            idx_train_data.append((idx_sentences, idx_pos, idx_labels))

        for sentence, pos, tags in test_data:
            idx_sentences = utils.prepare_sequence(sentence, word_to_ix)
            idx_labels = utils.prepare_sequence(tags, label_to_ix)
            idx_pos = utils.prepare_sequence(pos, pos_to_ix)
            idx_test_data.append((idx_sentences, idx_pos, idx_labels))

        for sentence, pos, tags in val_data:
            idx_sentences = utils.prepare_sequence(sentence, word_to_ix)
            idx_labels = utils.prepare_sequence(tags, label_to_ix)
            idx_pos = utils.prepare_sequence(pos, pos_to_ix)
            idx_val_data.append((idx_sentences, idx_pos, idx_labels))
    else:
        for sentence, tags in train_data:
            idx_sentences = utils.prepare_sequence(sentence, word_to_ix)
            idx_labels = utils.prepare_sequence(tags, label_to_ix)
            idx_train_data.append((idx_sentences, idx_labels))

        for sentence, tags in test_data:
            idx_sentences = utils.prepare_sequence(sentence, word_to_ix)
            idx_labels = utils.prepare_sequence(tags, label_to_ix)
            idx_test_data.append((idx_sentences, idx_labels))

        for sentence, tags in val_data:
            idx_sentences = utils.prepare_sequence(sentence, word_to_ix)
            idx_labels = utils.prepare_sequence(tags, label_to_ix)
            idx_val_data.append((idx_sentences, idx_labels))

    print("------- prep data done ------\n")