def get_sentence_boundaries(path_tok, path_conll):
    """
    :type path_tok: str
    :type path_conll: str
    :rtype: list of (int, int)

    Compute sentence boundaries based on the tokenized file and the sentence-split file.
    """
    edus = read_edus(path_tok) # list of list of int
    sentences = read_sentences(path_conll) # list of list of str

    # Assign EDU ID to each token in the sentence list.
    tokens_with_edu_ids = utils.flatten_lists(edus)
    assert len(tokens_with_edu_ids) == len(utils.flatten_lists(sentences))
    sentences_with_edu_ids = assign_edu_ids_to_sentences(sentences, tokens_with_edu_ids)

    # Adjustment
    sentences_with_edu_ids = adjust(sentences_with_edu_ids, n_edus=len(edus))
    assert len(tokens_with_edu_ids) == len(utils.flatten_lists(sentences_with_edu_ids))

    # Compute boundaries
    bnds = compute_boundaries(sentences_with_edu_ids)

    # Check
    test_boundaries(bnds, n_edus=len(edus))
    return bnds
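Every example on this page relies on a `flatten_lists` helper (exposed in some snippets as `utils.flatten_lists`). Its definition is not included here; a minimal sketch of what such a helper presumably looks like, flattening exactly one level of nesting:

def flatten_lists(nested):
    # [[a, b], [c]] -> [a, b, c]; flattens one level of nesting
    return [item for sublist in nested for item in sublist]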
Example #2
def get_paragraph_boundaries(path_tok, path_tok2):
    """
    :type path_tok: str
    :type path_tok2: str
    :rtype: list of tuple of int

    Compute paragraph boundaries based on the tokenized file and the paragraph-split file.
    """
    edus = read_edus(path_tok)  # list of list of int
    paragraphs = read_paragraphs(path_tok2)  # list of list of str

    # Assign EDU ID to each token in the paragraph list
    tokens_with_edu_ids = utils.flatten_lists(edus)
    assert len(tokens_with_edu_ids) == len(utils.flatten_lists(paragraphs))
    paragraphs_with_edu_ids = assign_edu_ids_to_sentences(
        paragraphs, tokens_with_edu_ids)

    # Adjust
    paragraphs_with_edu_ids = adjust(paragraphs_with_edu_ids, n_edus=len(edus))
    assert len(tokens_with_edu_ids) == len(
        utils.flatten_lists(paragraphs_with_edu_ids))

    # Compute boundaries
    bnds = compute_boundaries(paragraphs_with_edu_ids)

    # Check
    test_boundaries(bnds, n_edus=len(edus))
    return bnds
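The `compute_boundaries` helper used in the two examples above is not shown. Assuming that, after `assign_edu_ids_to_sentences` and `adjust`, each sentence or paragraph is a non-empty list of consecutive EDU ids, a plausible sketch is:

def compute_boundaries(units_with_edu_ids):
    # Sketch under the assumption above: each unit spans the EDUs
    # from its first to its last assigned EDU id (inclusive).
    return [(unit[0], unit[-1]) for unit in units_with_edu_ids]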
Example #3
def evaluate_entity_label(pred, label, classes):
    pred = flatten_lists(pred)
    label = flatten_lists(label)
    assert len(pred) == len(label)
    cla = [i.split('-')[-1] for i in classes if i != 'O']
    cla = list(set(cla))
    cla2ind = dict((c, ind) for ind, c in enumerate(cla))
    index = 0
    pred_entities = np.zeros(len(cla), dtype=int)  #TP+FP
    label_entities = np.zeros(len(cla), dtype=int)  #TP+FN
    acc = np.zeros(len(cla), dtype=int)  #TP
    while index < len(label):
        label_tag = label[index]
        if label_tag == 'O':
            index += 1
        else:
            c = label_tag.split('-')[-1]
            c = cla2ind[c]
            next_tag = 'I' + label_tag[1:]
            j = index + 1
            while j < len(label) and label[j] == next_tag:
                j += 1
            label_entities[c] += 1
            label_entity = ''.join(label[index:j])
            pred_entity = ''.join(pred[index:j])
            if label_entity == pred_entity:
                acc[c] += 1
            index = j
    # Count entities on the predicted side (TP + FP)
    index = 0
    while index < len(pred):
        pred_tag = pred[index]
        if pred_tag == 'O':
            index += 1
        elif pred_tag.split('-')[0] == 'B':
            c = pred_tag.split('-')[-1]
            c = cla2ind[c]
            next_tag = 'I' + pred_tag[1:]
            j = index + 1
            while j < len(pred) and pred[j] == next_tag:
                j += 1
            pred_entities[c] += 1
            index = j
        else:
            index += 1
    units = []
    TP = acc
    FP = pred_entities - acc
    FN = label_entities - acc
    TN = acc.sum() - acc
    for c, ind in cla2ind.items():
        units.append(Eval_unit(TP[ind], FP[ind], FN[ind], TN[ind], c))
    return units
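`Eval_unit` is constructed from per-class TP/FP/FN/TN counts but is not defined on this page. A minimal sketch of such a container, assuming it derives precision, recall, and F1 from the counts:

class Eval_unit:
    # Hypothetical container for per-class counts and derived scores
    def __init__(self, tp, fp, fn, tn, name):
        self.tp, self.fp, self.fn, self.tn, self.name = tp, fp, fn, tn, name

    @property
    def precision(self):
        return self.tp / (self.tp + self.fp) if (self.tp + self.fp) else 0.0

    @property
    def recall(self):
        return self.tp / (self.tp + self.fn) if (self.tp + self.fn) else 0.0

    @property
    def f1(self):
        p, r = self.precision, self.recall
        return 2 * p * r / (p + r) if (p + r) else 0.0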
Example #4
def main():
    config = utils.Config()

    filenames = os.listdir(
        os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                     "preprocessed"))
    filenames = [n for n in filenames if n.endswith(".paragraph.boundaries")]
    filenames = [
        n.replace(".paragraph.boundaries", ".edus") for n in filenames
    ]
    filenames.sort()

    for filename in filenames:
        # Path
        path_edus = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                 "tmp.preprocessing", filename + ".tokenized")
        path_conll = os.path.join(
            config.getpath("data"), "ptbwsj_wo_rstdt", "tmp.preprocessing",
            filename.replace(".edus", ".sentences.conll"))
        path_out = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                "preprocessed", filename + ".postags")

        # Read
        edus = utils.read_lines(
            path_edus,
            process=lambda line: line.split())  # list of list of str
        tokens_e = utils.flatten_lists(edus)  # list of str

        sentences = utils.read_conll(
            path_conll,
            keys=["ID", "FORM", "LEMMA", "POSTAG", "_1", "HEAD",
                  "DEPREL"])  # list of list of {str: str}
        conll_lines = utils.flatten_lists(sentences)  # list of {str: str}
        tokens_s = [conll_line["FORM"]
                    for conll_line in conll_lines]  # list of str
        postags_s = [conll_line["POSTAG"]
                     for conll_line in conll_lines]  # list of str

        # Check that the tokens in the EDU file match the tokens in the CoNLL file
        for token_e, token_s, postag_s in zip(tokens_e, tokens_s, postags_s):
            if token_e != token_s:
                raise ValueError("Error! %s != %s" % (token_e, token_s))

        # Create the POSTAG-version of EDUs
        postag_i = 0
        edus_postag = []
        for edu in edus:
            edu_postag = [postags_s[postag_i + i] for i in range(len(edu))]
            edus_postag.append(edu_postag)
            postag_i += len(edu)

        # Write
        with open(path_out, "w") as f:
            for edu_postag in edus_postag:
                f.write("%s\n" % " ".join(edu_postag))
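The `utils.read_conll` helper is not shown; presumably it splits a CoNLL-style file on blank lines into sentences and maps each token line's columns onto the given keys. A rough sketch under that assumption (hypothetical, not the project's actual implementation):

def read_conll(path, keys):
    # Sketch: one dict per token, grouped into sentences by blank lines
    sentences, sentence = [], []
    with open(path) as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            sentence.append(dict(zip(keys, line.split("\t"))))
    if sentence:
        sentences.append(sentence)
    return sentences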
def evaluate(tag_lists, target_tag_lists):
    # Evaluate tag-level accuracy
    correct_count = 0.

    # Flatten the nested lists
    tag_lists = flatten_lists(tag_lists)
    target_tag_lists = flatten_lists(target_tag_lists)
    assert len(tag_lists) == len(target_tag_lists)

    for pred, tgt in zip(tag_lists, target_tag_lists):
        if pred == tgt:
            correct_count += 1.
    return correct_count / len(tag_lists)
Example #6
def evaluate_single_label(pred, label, classes):
    pred = flatten_lists(pred)
    label = flatten_lists(label)
    matrix = confusion_matrix(pred, label, classes)
    TP = np.diag(matrix)
    FP = matrix.sum(axis=1) - TP
    FN = matrix.sum(axis=0) - TP
    TN = matrix.sum() - TP - FN - FP
    unit_list = []
    for i in range(len(classes)):
        cla = classes[i]
        unit = Eval_unit(TP[i], FP[i], FN[i], TN[i], cla)
        unit_list.append(unit)
    return unit_list
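`confusion_matrix(pred, label, classes)` is another assumed helper. Given how its output is consumed above (row sums yield TP+FP and column sums yield TP+FN), rows presumably index the predicted class and columns the gold class; a sketch, assuming numpy is imported as np as in the other snippets:

def confusion_matrix(pred, label, classes):
    # Sketch: matrix[i, j] counts tokens predicted as classes[i]
    # whose gold label is classes[j]
    idx = {c: i for i, c in enumerate(classes)}
    matrix = np.zeros((len(classes), len(classes)), dtype=int)
    for p, l in zip(pred, label):
        matrix[idx[p], idx[l]] += 1
    return matrix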
Example #7
    def __init__(self, golden_tags, predict_tags, remove_O=True):
        self.golden_tags = flatten_lists(golden_tags)
        self.predict_tags = flatten_lists(predict_tags)

        if remove_O:
            self._remove_Otags()

        self.tagset = set(self.golden_tags)
        self.correct_tags_number = self.count_correct_tags()
        self.predict_tags_counter = Counter(self.predict_tags)
        self.golden_tags_counter = Counter(self.golden_tags)
        self.precision_scores = self.cal_precision()
        self.recall_scores = self.cal_recall()
        self.f1_scores = self.cal_f1()
Example #8
File: work.py Project: darr/dlner
def ensemble_evaluate(results, targets, remove_O=False):
    """Ensemble multiple models."""
    for i in range(len(results)):
        results[i] = flatten_lists(results[i])

    pred = []
    for result in zip(*results):
        ensemble_tag = Counter(result).most_common(1)[0][0]
        pred.append(ensemble_tag)

    tag_lists = flatten_lists(targets)
    assert len(pred) == len(tag_lists)

    print("Ensemble results of the four models:")
    _print_metrics(tag_lists, pred)
Example #9
def ensemble_evaluate(results, targets, remove_O=False):
    """Ensemble multiple models."""
    for i in range(len(results)):
        results[i] = flatten_lists(results[i])

    pred_tags = []
    for result in zip(*results):
        ensemble_tag = Counter(result).most_common(1)[0][0]
        pred_tags.append(ensemble_tag)

    targets = flatten_lists(targets)
    assert len(pred_tags) == len(targets)

    print("Ensemble results of the four models:")
    metrics = Metrics(targets, pred_tags, remove_O=remove_O)
    metrics.report_scores(dtype='ensemble')
def ensemble_evaluate(results, targets, remove_O=False):
    """Ensemble multiple models."""
    for i in range(len(results)):
        results[i] = flatten_lists(results[i])

    pred_tags = []
    for result in zip(*results):
        ensemble_tag = Counter(result).most_common(1)[0][0]
        pred_tags.append(ensemble_tag)

    targets = flatten_lists(targets)
    assert len(pred_tags) == len(targets)

    print("Ensemble results of the four models:")
    metrics = Metrics(targets, pred_tags, remove_O=remove_O)
    metrics.report_scores()
    metrics.report_confusion_matrix()
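The vote itself is just `Counter(...).most_common(1)` applied position by position; a toy illustration of the mechanics:

from collections import Counter

model_a = ["B-PER", "O", "B-LOC"]
model_b = ["B-PER", "O", "O"]
model_c = ["O", "O", "B-LOC"]

voted = [Counter(tags).most_common(1)[0][0]
         for tags in zip(model_a, model_b, model_c)]
print(voted)  # ['B-PER', 'O', 'B-LOC']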
Example #11
def convert_edus(edus, raw_lines):
    """
    :type edus: list of str
    :type raw_lines: list of str
    :rtype: list of str
    """
    edu_positions = []
    for edu_i in range(len(edus)):
        raw = []
        for char_i in range(len(edus[edu_i])):
            if edus[edu_i][char_i] == " ":
                continue
            raw.append(edu_i)
        edu_positions.append(raw)
    edu_positions = utils.flatten_lists(edu_positions)

    flatten_raw_lines = list("".join(utils.flatten_lists(raw_lines)))
    result_positions = [-1 for _ in flatten_raw_lines]
    result_i = 0
    cur_char_i = 0
    cur_edu_i = 0
    for char in flatten_raw_lines:
        if char == " ":
            result_positions[result_i] = cur_edu_i
        else:
            edu_i = edu_positions[cur_char_i]
            result_positions[result_i] = edu_i
            cur_char_i += 1
            assert edu_i == cur_edu_i or edu_i == cur_edu_i + 1
            cur_edu_i = edu_i
        result_i += 1

    new_edus = []
    for edu_i in range(len(edus)):
        b = result_positions.index(edu_i)
        e = b + result_positions.count(edu_i)
        new_edu = "".join(flatten_raw_lines[b:e])
        new_edu = new_edu.strip()
        new_edus.append(new_edu)

    return new_edus
    def compute_span_vectors(
                self,
                edus,
                edus_postag,
                sbnds,
                pbnds,
                padded_edu_vectors,
                mask_bwd,
                mask_fwd,
                batch_spans):
        """
        :type edus: list of list of str
        :type edus_postag: list of list of str
        :type sbnds: list of (int, int)
        :type pbnds: list of (int, int)
        :type padded_edu_vectors: Variable(shape=(n_edus+2, bilstm_dim), dtype=np.float32)
        :type mask_bwd: Variable(shape=(1, bilstm_dim), dtype=np.float32)
        :type mask_fwd: Variable(shape=(1, bilstm_dim), dtype=np.float32)
        :type batch_spans: list of list of (int, int)
        :rtype: Variable(shape=(batch_size * n_spans, bilstm_dim + tempfeat_dim), dtype=np.float32)
        """
        batch_size = len(batch_spans)
        n_spans = len(batch_spans[0])
        total_spans = batch_size * n_spans
        for spans in batch_spans:
            assert len(spans) == n_spans

        # Reshape
        flatten_batch_spans = utils.flatten_lists(batch_spans) # total_spans * (int, int)
        # NOTE that indices in batch_spans should be shifted by +1 due to the boundary padding
        bm1_indices = [(b-1)+1 for b,e in flatten_batch_spans] # total_spans * int
        b_indices = [b+1 for b,e in flatten_batch_spans] # total_spans * int
        e_indices = [e+1 for b,e in flatten_batch_spans] # total_spans * int
        ep1_indices = [(e+1)+1 for b,e in flatten_batch_spans] # total_spans * int

        # Feature extraction
        bm1_padded_edu_vectors = F.get_item(padded_edu_vectors, bm1_indices) # (total_spans, bilstm_dim)
        b_padded_edu_vectors = F.get_item(padded_edu_vectors, b_indices) # (total_spans, bilstm_dim)
        e_padded_edu_vectors = F.get_item(padded_edu_vectors, e_indices) # (total_spans, bilstm_dim)
        ep1_padded_edu_vectors = F.get_item(padded_edu_vectors, ep1_indices) # (total_spans, bilstm_dim)
        mask_bwd = F.broadcast_to(mask_bwd, (total_spans, self.bilstm_dim)) # (total_spans, bilstm_dim)
        mask_fwd = F.broadcast_to(mask_fwd, (total_spans, self.bilstm_dim)) # (total_spans, bilstm_dim)
        span_vectors = mask_bwd * (e_padded_edu_vectors - bm1_padded_edu_vectors) \
                        + mask_fwd * (b_padded_edu_vectors - ep1_padded_edu_vectors) # (total_spans, bilstm_dim)

        # Template features
        tempfeat_vectors = self.template_feature_extractor.extract_batch_features(
                                        edus=edus,
                                        edus_postag=edus_postag,
                                        sbnds=sbnds,
                                        pbnds=pbnds,
                                        spans=flatten_batch_spans) # (total_spans, tempfeat_dim)
        tempfeat_vectors = utils.convert_ndarray_to_variable(tempfeat_vectors, seq=False) # (total_spans, tempfeat_dim)
        span_vectors = F.concat([span_vectors, tempfeat_vectors], axis=1) # (total_spans, bilstm_dim + tempfeat_dim)

        return span_vectors
    def __init__(self, golden_tags, predict_tags, remove_O=False):

        # [[t1, t2], [t3, t4]...] --> [t1, t2, t3, t4...]
        self.golden_tags = flatten_lists(golden_tags)
        self.predict_tags = flatten_lists(predict_tags)

        if remove_O:  # Remove O tags; keep only entity tags
            self._remove_Otags()

        self.tagset = set(self.golden_tags)
        self.correct_tags_number = self.count_correct_tags()
        self.predict_tags_counter = Counter(self.predict_tags)
        self.golden_tags_counter = Counter(self.golden_tags)

        self.precision_scores = self.cal_precision()

        self.recall_scores = self.cal_recall()

        self.f1_scores = self.cal_f1()
Example #14
    def __init__(self, golden_tags, predict_tags, remove_O=False):
        self.golden_tags = flatten_lists(golden_tags)
        self.predict_tags = flatten_lists(predict_tags)

        if remove_O:  # Skip non-entity (O) tags
            self._remove_Otags()

        # All distinct tags
        self.tagset = set(self.golden_tags)
        self.correct_tags_number = self.count_correct_tags()
        self.predict_tags_count = Counter(self.predict_tags)
        self.golden_tags_count = Counter(self.golden_tags)

        # Precision
        self.precision_scores = self.cal_precision()
        # Recall
        self.recall_scores = self.cal_recall()
        # F1
        self.f1_scores = self.cal_f1()
def ensemble_evaluate(results, targets):
    """Ensemble multiple models."""
    for i in range(len(results)):
        results[i] = flatten_lists(results[i])

    pred_tags = []
    for result in zip(*results):
        ensemble_tag = Counter(result).most_common(1)[0][0]
        pred_tags.append(ensemble_tag)

    targets = flatten_lists(targets)
    assert len(pred_tags) == len(targets)

    correct = 0
    for pred, tgt in zip(pred_tags, targets):
        if pred == tgt:
            correct += 1.
    accuracy = correct / len(targets)

    print("Ensemble accuracy of the four models: {:.2f}%".format(accuracy * 100))
    def __init__(self, golden_tags, predict_tags, remove_O=False):

        # [[t1, t2], [t3, t4]...] --> [t1, t2, t3, t4...]
        self.golden_tags = flatten_lists(golden_tags)
        self.predict_tags = flatten_lists(predict_tags)

        if remove_O:  # Remove O tags; we only care about entity tags
            self._remove_Otags()

        # Helper variables for the computations below
        self.tagset = set(self.golden_tags)
        self.correct_tags_number = self.count_correct_tags()
        self.predict_tags_counter = Counter(self.predict_tags)
        self.golden_tags_counter = Counter(self.golden_tags)

        # Precision
        self.precision_scores = self.cal_precision()

        # Recall
        self.recall_scores = self.cal_recall()

        # F1 score
        self.f1_scores = self.cal_f1()
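The helper methods called in these `Metrics` constructors (`count_correct_tags`, `cal_precision`, `cal_recall`, `cal_f1`) are not shown on this page. A sketch of how they are presumably implemented from the counters set up in `__init__`:

    def count_correct_tags(self):
        # Per-tag count of positions where gold and predicted tags agree
        correct = {}
        for gold, pred in zip(self.golden_tags, self.predict_tags):
            if gold == pred:
                correct[gold] = correct.get(gold, 0) + 1
        return correct

    def cal_precision(self):
        return {tag: self.correct_tags_number.get(tag, 0) /
                     max(self.predict_tags_counter[tag], 1)
                for tag in self.tagset}

    def cal_recall(self):
        return {tag: self.correct_tags_number.get(tag, 0) /
                     max(self.golden_tags_counter[tag], 1)
                for tag in self.tagset}

    def cal_f1(self):
        f1 = {}
        for tag in self.tagset:
            p, r = self.precision_scores[tag], self.recall_scores[tag]
            f1[tag] = 2 * p * r / (p + r + 1e-10)
        return f1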
def main():
    config = utils.Config()

    filenames = os.listdir(
        os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                     "preprocessed"))
    filenames = [n for n in filenames if n.endswith(".paragraph.boundaries")]
    filenames = [
        n.replace(".paragraph.boundaries", ".edus") for n in filenames
    ]
    filenames.sort()

    with open(
            os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                         "tmp.preprocessing", "filelist.corenlp2.txt"),
            "w") as ff:
        for filename in filenames:
            # Path
            path_edus = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                                     "tmp.preprocessing",
                                     filename + ".tokenized")
            path_sbnds = os.path.join(
                config.getpath("data"), "ptbwsj_wo_rstdt", "preprocessed",
                filename.replace(".edus", ".sentence.noproj.boundaries"))
            path_sents = os.path.join(config.getpath("data"),
                                      "ptbwsj_wo_rstdt", "tmp.preprocessing",
                                      filename.replace(".edus", ".sentences"))

            # Read
            edus = utils.read_lines(
                path_edus,
                process=lambda line: line.split())  # list of list of str
            sbnds = utils.read_lines(
                path_sbnds,
                process=lambda line: tuple(
                    int(x) for x in line.split()))  # list of (int, int)

            # Create sentences based on the sentence boundaries
            sentences = []
            for begin_i, end_i in sbnds:
                sentence = edus[begin_i:end_i + 1]  # list of list of str
                sentence = utils.flatten_lists(sentence)  # list of str
                sentences.append(sentence)

            # Write
            with open(path_sents, "w") as fs:
                for sentence in sentences:
                    fs.write("%s\n" % " ".join(sentence))
            ff.write("%s\n" % path_sents)
Example #18
def predict_health_cb(data, vectoriser, classifier):
    """Predict health labels for CB.

    Args:
        data (:obj:`list` of :obj:`tuple`): Crunchbase IDs and list of
            categories.
        vectoriser (str): Path to a pickled vectoriser.
        classifier (str): Path to a pickled classifier.
    Return:
        output (:obj:`list` of :obj:`dict`): Crunchbase IDs and bool.

    """
    with open(vectoriser, 'rb') as h:
        vec = pickle.load(h)

    with open(classifier, 'rb') as h:
        clf = pickle.load(h)

    # Store index.
    data_idx = [tup[0] for tup in data]
    labels = clf.predict(vec.transform(flatten_lists([tup[1] for tup in data])))

    return [{'id':id_, 'is_health':pred}
                for id_, pred in zip(data_idx, labels)]
def process(path_in, path_out):
    utils.mkdir(path_out)

    nlp_no_ssplit = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
    nlp_no_ssplit.tokenizer = nlp_no_ssplit.tokenizer.tokens_from_list
    nlp_no_ssplit.add_pipe(prevent_sentence_boundary_detection,
                           name="prevent-sbd",
                           before="parser")

    filenames = os.listdir(path_in)
    filenames = [n for n in filenames if n.endswith(".edu.txt.dep")]
    filenames.sort()

    skip_count = 0
    for filename in pyprind.prog_bar(filenames):
        edus, sents, sbnds, disc_arcs = read_data(
            os.path.join(path_in, filename))

        if edus is None:
            print("Skipped %s" % filename)
            skip_count += 1
            continue

        assert len(sents) == len(sbnds)

        with open(
                os.path.join(path_out,
                             filename.replace(".edu.txt.dep", ".edus.tokens")),
                "w") as f:
            for edu in edus:
                edu = " ".join(edu)
                f.write("%s\n" % edu)

        with open(
                os.path.join(path_out,
                             filename.replace(".edu.txt.dep", ".sbnds")),
                "w") as f:
            for begin_i, end_i in sbnds:
                f.write("%d %d\n" % (begin_i, end_i))

        with open(
                os.path.join(path_out,
                             filename.replace(".edu.txt.dep", ".pbnds")),
                "w") as f:
            n_sents = len(sents)
            f.write("0 %d\n" % (n_sents - 1))

        with open(
                os.path.join(path_out,
                             filename.replace(".edu.txt.dep", ".arcs")),
                "w") as f:
            disc_arcs = sorted(disc_arcs, key=lambda x: x[1])
            disc_arcs = ["%d-%d-%s" % (h, d, l) for h, d, l in disc_arcs]
            disc_arcs = " ".join(disc_arcs)
            f.write("%s\n" % disc_arcs)

        sents_postags = []
        sents_arcs = []
        for sent in sents:
            doc = nlp_no_ssplit(sent)
            sents_ = list(doc.sents)
            assert len(sents_) == 1
            sent = sents_[0]
            postags = [token.tag_ for token in sent]
            arcs = []
            found_root = False
            for token in sent:
                head = token.head.i + 1
                dep = token.i + 1
                label = token.dep_
                if head == dep:
                    assert label == "ROOT"
                    assert not found_root  # Only one token can be the root of dependency graph
                    head = 0
                    found_root = True
                syn_arc = (head, dep, label)
                arcs.append(syn_arc)
            assert found_root
            arcs = ["%d-%d-%s" % (h, d, l) for h, d, l in arcs]
            sents_postags.append(postags)
            sents_arcs.append(arcs)
        postags = utils.flatten_lists(sents_postags)  # List[str]
        arcs = utils.flatten_lists(sents_arcs)  # List[str]

        with open(os.path.join(path_out, filename.replace(".edu.txt.dep", ".edus.postags")), "w") as fp,\
             open(os.path.join(path_out, filename.replace(".edu.txt.dep", ".edus.arcs")), "w") as fa:
            begin_tok_i = 0
            for edu in edus:
                length = len(edu)

                sub_postags = postags[begin_tok_i:begin_tok_i + length]
                sub_postags = " ".join(sub_postags)
                fp.write("%s\n" % sub_postags)

                sub_arcs = arcs[begin_tok_i:begin_tok_i + length]
                sub_arcs = " ".join(sub_arcs)
                fa.write("%s\n" % sub_arcs)

                begin_tok_i += length

    print("Processed %d files; skipped %d files." %
          (len(filenames) - skip_count, skip_count))
Example #20
def main(args):
    path = args.path

    filenames = os.listdir(path)
    filenames = [n for n in filenames if n.endswith(".edus.tokens")]
    filenames.sort()

    for filename in pyprind.prog_bar(filenames):

        edus = utils.read_lines(
            os.path.join(path, filename),
            process=lambda line: line.split())  # List[List[str]]
        sents = utils.read_lines(
            os.path.join(path, filename.replace(".edus.tokens",
                                                ".sents.tokens")),
            process=lambda line: line.split())  # List[List[str]]
        sents_postags = utils.read_lines(
            os.path.join(path,
                         filename.replace(".edus.tokens", ".sents.postags")),
            process=lambda line: line.split())  # List[List[str]]
        sents_arcs = utils.read_lines(
            os.path.join(path, filename.replace(".edus.tokens",
                                                ".sents.arcs")),
            process=lambda line: line.split())  # List[List[str]]
        postags = utils.flatten_lists(sents_postags)  # List[str]
        arcs = utils.flatten_lists(sents_arcs)  # List[str]

        # Ending positions of gold EDUs
        edu_end_positions = []
        tok_i = 0
        for edu in edus:
            length = len(edu)
            edu_end_positions.append(tok_i + length - 1)
            tok_i += length

        # Ending positions of sentences
        sent_end_positions = []
        tok_i = 0
        for sent in sents:
            length = len(sent)
            sent_end_positions.append(tok_i + length - 1)
            tok_i += length

        # Every sentence-ending position must coincide with a gold EDU-ending position
        assert set(sent_end_positions) <= set(edu_end_positions)

        # Sentence boundaries
        sbnds = []
        tok_i = 0
        sent_i = 0
        begin_edu_i = 0
        for end_edu_i, edu in enumerate(edus):
            tok_i += len(edu)
            if tok_i - 1 == sent_end_positions[sent_i]:
                sbnds.append((begin_edu_i, end_edu_i))
                sent_i += 1
                begin_edu_i = end_edu_i + 1
        assert sent_i == len(sent_end_positions)
        with open(
                os.path.join(path, filename.replace(".edus.tokens", ".sbnds")),
                "w") as f:
            for begin_i, end_i in sbnds:
                f.write("%d %d\n" % (begin_i, end_i))

        # Extract POS tags and dependency arcs corresponding to each EDU
        with open(os.path.join(path, filename.replace(".edus.tokens", ".edus.postags")), "w") as fp,\
             open(os.path.join(path, filename.replace(".edus.tokens", ".edus.arcs")), "w") as fa:
            begin_tok_i = 0
            for edu in edus:
                length = len(edu)

                sub_postags = postags[begin_tok_i:begin_tok_i + length]
                sub_postags = " ".join(sub_postags)
                fp.write("%s\n" % sub_postags)

                sub_arcs = arcs[begin_tok_i:begin_tok_i + length]
                sub_arcs = " ".join(sub_arcs)
                fa.write("%s\n" % sub_arcs)

                begin_tok_i += length
        assert begin_tok_i - 1 == edu_end_positions[-1]