Example #1
import codecs

from tqdm import tqdm


def read_data(input_file):
    """Reads BIO-formatted data and groups it into sentences."""
    with codecs.open(input_file, "r", encoding="utf-8") as f:
        lines = []
        words = []
        labels = []
        f_lines = f.readlines()
        for line in tqdm(f_lines, total=len(f_lines), desc="Process {}".format(input_file)):
            contends = line.strip()
            if contends.startswith("-DOCSTART-"):
                words.append('')
                continue

            # Guard the words[-1] check below when the file starts blank.
            if len(contends) == 0 and not len(words):
                words.append("")

            # A blank line after a final "." closes the current sentence;
            # note that sentences not ending in "." are never flushed.
            if len(contends) == 0 and words[-1] == '.':
                lbl = ' '.join([label for label in labels if len(label) > 0])
                w = ' '.join([word for word in words if len(word) > 0])
                lines.append([lbl, w])
                words = []
                labels = []
                continue
            # First column is the token, last column is the BIO label
            # (split once instead of three times per line).
            fields = contends.split(' ')
            words.append(fields[0])
            labels.append(fields[-1].replace("-", "_"))
        return lines
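
A minimal usage sketch, assuming a CoNLL-style file on disk; the path and the printed values are hypothetical:

# "train.txt" must hold "token ... label" lines separated by blank lines,
# with each sentence ending in ".".
sentences = read_data("train.txt")
labels, text = sentences[0]  # label string first, token string second
print(labels)  # e.g. "B_ORG O B_MISC O"
print(text)    # e.g. "EU rejects German call"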
Example #2
from tqdm import tqdm


def create_vocabs(df,
                  tokenizer,
                  idx2labels_path,
                  markup="IO",
                  idx2cls_path=None,
                  pad_idx=0,
                  is_cls=False,
                  idx2labels=None,
                  idx2cls=None):
    if idx2labels is None:
        label2idx = {"[PAD]": pad_idx, '[CLS]': 1, '[SEP]': 2, "X": 3}
        idx2label = ["[PAD]", '[CLS]', '[SEP]', "X"]
    else:
        label2idx = {label: idx for idx, label in enumerate(idx2labels)}
        idx2label = idx2labels
    cls2idx = None
    if is_cls:
        # Honour a pre-built class vocab if one was passed in; the original
        # reset idx2cls to [] here, so cls2idx always started empty.
        if idx2cls is None:
            idx2cls = []
        cls2idx = {label: idx for idx, label in enumerate(idx2cls)}
    for _, row in tqdm(df.iterrows(),
                       total=len(df),
                       leave=False,
                       desc="Creating labels vocabs"):
        labels = row.labels.split()
        origin_tokens = row.text.split()
        if is_cls and row.cls not in cls2idx:
            cls2idx[row.cls] = len(cls2idx)
            idx2cls.append(row.cls)
        prev_label = ""
        for origin_token, label in zip(origin_tokens, labels):
            # BIO markup opens each entity with B_; IO markup uses I_ only.
            if markup == "BIO":
                prefix = "B_"
            else:
                prefix = "I_"
            if label != "O":
                label = label.split("_")[1]
                # A continuation of the same entity keeps the I_ prefix.
                if label == prev_label:
                    prefix = "I_"
                prev_label = label
            else:
                prev_label = label
            # Sub-word pieces after the first get the dummy label "X";
            # note that "O" also receives the prefix (e.g. "I_O").
            cur_tokens = tokenizer.tokenize(origin_token)
            bert_label = [prefix + label] + ["X"] * (len(cur_tokens) - 1)
            for label_ in bert_label:
                if label_ not in label2idx:
                    label2idx[label_] = len(label2idx)
                    idx2label.append(label_)
    with open(idx2labels_path, "w", encoding="utf-8") as f:
        for label in idx2label:
            f.write("{}\n".format(label))

    if is_cls:
        with open(idx2cls_path, "w", encoding="utf-8") as f:
            for label in idx2cls:
                f.write("{}\n".format(label))

    return label2idx, idx2label, cls2idx, idx2cls
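
A sketch of how the vocab builder might be driven, assuming a DataFrame with "labels" and "text" columns and a WordPiece tokenizer; the transformers tokenizer below is an assumption, as the original repo may use a different BERT wrapper:

import pandas as pd
from transformers import BertTokenizer

df = pd.DataFrame({"labels": ["B_PER I_PER O"],
                   "text": ["John Smith spoke"]})
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")  # assumption
label2idx, idx2label, cls2idx, idx2cls = create_vocabs(
    df, tokenizer, "idx2labels.txt", markup="BIO")  # hypothetical path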
Example #3
import torch
from tqdm import tqdm

# `transformed_result_cls` is a helper from the surrounding repo.

def predict(dl, model, id2cls):
    model.eval()
    preds_cpu_cls = []
    # Disable gradient tracking for inference.
    with torch.no_grad():
        for batch in tqdm(dl, total=len(dl), leave=False, desc="Predicting"):
            preds_cls = model.forward(batch)
            # No gold targets at predict time, so the predictions are passed
            # in both slots and target decoding is switched off.
            preds_cpu_ = transformed_result_cls([preds_cls], [preds_cls],
                                                id2cls, False)
            preds_cpu_cls.extend(preds_cpu_)

    return preds_cpu_cls
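
A hedged call sketch; dl and model are assumed to be the repo's DataLoader and classification model, and the class names are hypothetical:

id2cls = ["negative", "positive"]         # hypothetical class names
predictions = predict(dl, model, id2cls)  # dl, model: assumed repo objects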
Example #4
import pandas as pd
from tqdm import tqdm

# `Reader` is the repo's FactRuEval-2016 corpus reader.

def _read_split(data_dir, df_path, desc):
    # The dev and test splits went through identical code, duplicated
    # verbatim in the original; it is factored into this helper.
    reader = Reader(data_dir)
    reader.read_dir()
    texts, tags = reader.split()
    res_tags = []
    res_tokens = []
    for tag, tokens in tqdm(zip(tags, texts), total=len(tags), desc=desc):
        # Keep only sentences that actually carry tags.
        if len(tag):
            res_tags.append(tag)
            res_tokens.append(tokens)
    df = pd.DataFrame({
        "labels": list(map(" ".join, res_tags)),
        "text": list(map(" ".join, res_tokens))
    })
    # True when a sentence contains no entities at all ("O"-only labels).
    df["clf"] = df["labels"].apply(
        lambda x: all(y.split("_")[0] == "O" for y in x.split()))
    df.to_csv(df_path, index=False, sep="\t")


def fact_ru_eval_preprocess(dev_dir, test_dir, dev_df_path, test_df_path):
    _read_split(dev_dir, dev_df_path, "Process FactRuEval2016 dev set.")
    _read_split(test_dir, test_df_path, "Process FactRuEval2016 test set.")
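
A call sketch with hypothetical directory and output paths (the output is tab-separated, so .tsv names are used here):

fact_ru_eval_preprocess("factrueval/devset", "factrueval/testset",
                        "dev.tsv", "test.tsv")  # paths are hypothetical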
Example #5
import codecs

from tqdm import tqdm


def read_data_pos(input_file):
    """Reads "token label pos" data and groups it into sentences."""
    with codecs.open(input_file, "r", encoding="utf-8") as f:
        lines = []
        words = []
        labels = []
        poses = []
        f_lines = f.readlines()
        for line in tqdm(f_lines,
                         total=len(f_lines),
                         desc="Process {}".format(input_file)):
            contends = line.strip()

            if contends.startswith("-DOCSTART-"):
                words.append('')
                continue

            # Not used for Thai data: guards against a leading blank line.
            if len(contends) == 0 and not len(words):
                words.append("")

            # A blank line ends the sentence; checking labels > 0 skips
            # consecutive blank lines.
            if len(contends) == 0 and len(labels) > 0:
                lbl = ' '.join([label for label in labels if len(label) > 0])
                w = ' '.join([word for word in words if len(word) > 0])
                p = ' '.join([pos for pos in poses if len(pos) > 0])
                lines.append([lbl, w, p])
                words = []
                labels = []
                poses = []
                continue

            # Columns: token, label, POS tag. The original guard tested
            # for >= 2 fields but indexed the third, so it is >= 3 here.
            fields = contends.split(' ')
            if len(fields) >= 3:
                words.append(fields[0])
                labels.append(fields[1].replace("-", "_"))
                poses.append(fields[2])

    return lines
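
A sketch of the expected input and a call; the file name is hypothetical and the three-column format is inferred from the indexing above:

# train_pos.txt (one "token label pos" triple per line, blank line
# between sentences):
#   EU B_ORG NNP
#   rejects O VBZ
sentences = read_data_pos("train_pos.txt")
labels, words, poses = sentences[0]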
Example #6
import torch
from tqdm import tqdm

# `transformed_result_cls` and `flat_classification_report` are helpers
# from the surrounding repo.

def validate_step(dl, model, id2cls):
    model.eval()
    preds_cpu_cls, targets_cpu_cls = [], []
    # Disable gradient tracking for validation.
    with torch.no_grad():
        for batch in tqdm(dl, total=len(dl), leave=False, desc="Validation"):
            preds_cls = model.forward(batch)
            # The gold class ids sit in the last element of the batch.
            preds_cpu_, targets_cpu_ = transformed_result_cls([preds_cls],
                                                              [batch[-1]],
                                                              id2cls)
            preds_cpu_cls.extend(preds_cpu_)
            targets_cpu_cls.extend(targets_cpu_)
    clf_report_cls = flat_classification_report([targets_cpu_cls],
                                                [preds_cpu_cls],
                                                digits=4)
    return clf_report_cls
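
A hedged sketch of a validation call; valid_dl, model, and idx2cls are assumed repo objects:

report = validate_step(valid_dl, model, idx2cls)
print(report)  # per-class precision/recall/F1 table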
Example #7
import logging

import torch
from tqdm import tqdm


def train_step(dl, model, optimizer, num_epoch=1):
    model.train()
    epoch_loss = 0
    idx = 0
    pr = tqdm(dl, total=len(dl), leave=False)
    for batch in pr:
        idx += 1
        # Clear gradients once per step (the original also called
        # optimizer.zero_grad() after the step, which was redundant).
        model.zero_grad()
        loss = model.score(batch)
        loss.backward()
        optimizer.step()
        # loss.item() replaces the deprecated loss.data.cpu().tolist().
        epoch_loss += loss.item()
        pr.set_description("train loss: {}".format(epoch_loss / idx))
        # Releasing cached GPU memory every step trades speed for headroom.
        torch.cuda.empty_cache()
    logging.info("\nepoch {}, average train epoch loss={:.5}\n".format(
        num_epoch, epoch_loss / idx))
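
A sketch of the outer loop this step would sit in; the optimizer choice and learning rate are assumptions, not the repo's settings:

import torch

optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)  # assumption
for epoch in range(1, 6):
    train_step(train_dl, model, optimizer, num_epoch=epoch)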
Example #8
import torch
from tqdm import tqdm

# `transformed_result` and `transformed_result_cls` are helpers from the
# surrounding repo.

def predict(dl, model, id2label, id2cls=None):
    model.eval()
    preds_cpu = []
    preds_cpu_cls = []
    # Disable gradient tracking for inference.
    with torch.no_grad():
        for batch in tqdm(dl, total=len(dl), leave=False, desc="Predicting"):
            labels_mask = batch[1]
            preds = model.forward(batch)
            # A joint model returns (token predictions, class predictions).
            if id2cls is not None:
                preds, preds_cls = preds
                preds_cpu_ = transformed_result_cls([preds_cls], [preds_cls],
                                                    id2cls, False)
                preds_cpu_cls.extend(preds_cpu_)

            preds_cpu_ = transformed_result([preds], [labels_mask], id2label)
            preds_cpu.extend(preds_cpu_)
    if id2cls is not None:
        return preds_cpu, preds_cpu_cls
    return preds_cpu
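
A sketch of the two calling conventions; all names are assumed repo objects:

ner_preds = predict(dl, model, idx2label)             # NER only
ner_preds, cls_preds = predict(dl, model, idx2label,  # joint model
                               id2cls=idx2cls)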
Example #9
from tqdm import tqdm


def create_vocabs(df, idx2cls_path, idx2cls=None):
    cls2idx = {}
    if idx2cls is not None:
        # Seed the mapping from a pre-built class vocab.
        cls2idx = {label: idx for idx, label in enumerate(idx2cls)}
    else:
        idx2cls = []
    for _, row in tqdm(df.iterrows(),
                       total=len(df),
                       leave=False,
                       desc="Creating class vocab"):
        if row.cls not in cls2idx:
            cls2idx[row.cls] = len(cls2idx)
            idx2cls.append(row.cls)

    with open(idx2cls_path, "w", encoding="utf-8") as f:
        for label in idx2cls:
            f.write("{}\n".format(label))

    return cls2idx, idx2cls
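
A self-contained sketch with a hypothetical DataFrame and output path:

import pandas as pd

df = pd.DataFrame({"cls": ["pos", "neg", "pos"]})
cls2idx, idx2cls = create_vocabs(df, "idx2cls.txt")
# cls2idx == {"pos": 0, "neg": 1}, idx2cls == ["pos", "neg"]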
Example #10
import torch
from tqdm import tqdm

# `transformed_result`, `transformed_result_cls`, and
# `flat_classification_report` are helpers from the surrounding repo.

def validate_step(dl, model, id2label, sup_labels, id2cls=None):
    model.eval()
    preds_cpu, targets_cpu = [], []
    preds_cpu_cls, targets_cpu_cls = [], []
    # Disable gradient tracking for validation.
    with torch.no_grad():
        for batch in tqdm(dl, total=len(dl), leave=False):
            labels_mask, labels_ids = batch[1], batch[3]
            preds = model.forward(batch)
            # A joint model returns (token predictions, class predictions).
            if id2cls is not None:
                preds, preds_cls = preds
                preds_cpu_, targets_cpu_ = transformed_result_cls(
                    [preds_cls], [batch[-1]], id2cls)
                preds_cpu_cls.extend(preds_cpu_)
                targets_cpu_cls.extend(targets_cpu_)
            preds_cpu_, targets_cpu_ = transformed_result(
                [preds], [labels_mask], id2label, [labels_ids])
            preds_cpu.extend(preds_cpu_)
            targets_cpu.extend(targets_cpu_)
    # Token-level report restricted to sup_labels, plus an optional
    # sentence-classification report.
    clf_report = flat_classification_report(targets_cpu, preds_cpu,
                                            labels=sup_labels, digits=3)
    if id2cls is not None:
        clf_report_cls = flat_classification_report([targets_cpu_cls],
                                                    [preds_cpu_cls],
                                                    digits=3)
        return clf_report, clf_report_cls
    return clf_report
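
A hedged call sketch; the DataLoader, model, vocabs, and label list are assumptions about the surrounding repo:

report = validate_step(valid_dl, model, idx2label,
                       sup_labels=["B_PER", "I_PER", "B_ORG", "I_ORG"])
print(report)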