Example #1
def break2common_sentences(p1, p2):
    """finds the positions of the common sentence ending

    Breaking is done according to the text and to the ucca annotation of both passages
    returns two lists each containing positions of sentence endings
    guarentees same number of positions is acquired and the last position is the passage end"""
    # break to sentences
    broken1 = break2sentences(p1)
    broken2 = break2sentences(p2)

    # find common endings
    positions1 = []
    positions2 = []
    i = 0
    j = 0
    while j < len(broken2) and i < len(broken1):
        position1, reg1 = _choose_ending_position(p1, broken1[i])
        position2, reg2 = _choose_ending_position(p2, broken2[j])
        if i + 1 < len(broken1):
            pos_after1, one_after1 = _choose_ending_position(
                p1, broken1[i + 1])
        else:
            pos_after1, one_after1 = position1, reg1
        if j + 1 < len(broken2):
            pos_after2, one_after2 = _choose_ending_position(
                p2, broken2[j + 1])
        else:
            pos_after2, one_after2 = position2, reg2

        if reg1 == reg2:
            positions1.append(position1)
            positions2.append(position2)
        # deal with addition or subtraction of a sentence ending
        elif one_after1 == reg2:
            i += 1
            positions1.append(pos_after1)
            positions2.append(position2)
        elif reg1 == one_after2:
            j += 1
            positions1.append(position1)
            positions2.append(pos_after2)
        i += 1
        j += 1

    # add last sentence in case skipped
    position1, reg1 = _choose_ending_position(p1, broken1[-1])
    position2, reg2 = _choose_ending_position(p2, broken2[-1])
    if (not positions1) or (not positions2) or (positions1[-1] != position1 and
                                                positions2[-1] != position2):
        positions1.append(broken1[-1])
        positions2.append(broken2[-1])
    elif positions1[-1] != position1 and positions2[-1] == position2:
        positions1[-1] = position1
    elif positions1[-1] == position1 and positions2[-1] != position2:
        positions2[-1] = broken2[-1]
    return positions1, positions2
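A minimal usage sketch for break2common_sentences, assuming file2passage from ucca.ioutil (as in the later examples) and two pre-annotated passage files covering the same text; the file names are placeholders:

from ucca.ioutil import file2passage

# Two UCCA annotations of the same underlying text (hypothetical file names).
p1 = file2passage("annotation_a.xml")
p2 = file2passage("annotation_b.xml")

# Both lists have the same length and end at the respective passage ends.
ends1, ends2 = break2common_sentences(p1, p2)
assert len(ends1) == len(ends2)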
Example #2
def break2common_sentences(p1, p2):
    """finds the positions of the common sentence ending

    Breaking is done according to the text and to the ucca annotation of both passages
    returns two lists each containing positions of sentence endings
    guarentees same number of positions is acquired and the last position is the passage end"""
    # break to sentences
    broken1 = break2sentences(p1)
    broken2 = break2sentences(p2)

    # find common endings
    positions1 = []
    positions2 = []
    i = 0
    j = 0
    while j < len(broken2) and i < len(broken1):
        position1, reg1 = _choose_ending_position(p1, broken1[i])
        position2, reg2 = _choose_ending_position(p2, broken2[j])
        if i + 1 < len(broken1):
            pos_after1, one_after1 = _choose_ending_position(p1, broken1[i + 1])
        else:
            pos_after1, one_after1 = position1, reg1
        if j + 1 < len(broken2):
            pos_after2, one_after2 = _choose_ending_position(p2, broken2[j + 1])
        else:
            pos_after2, one_after2 = position2, reg2

        if reg1 == reg2:
            positions1.append(position1)
            positions2.append(position2)
        # deal with addition or subtraction of a sentence ending
        elif one_after1 == reg2:
            i += 1
            positions1.append(pos_after1)
            positions2.append(position2)
        elif reg1 == one_after2:
            j += 1
            positions1.append(position1)
            positions2.append(pos_after2)
        i += 1
        j += 1

    # add last sentence in case skipped
    position1, reg1 = _choose_ending_position(p1, broken1[-1])
    position2, reg2 = _choose_ending_position(p2, broken2[-1])
    if (not positions1) or (not positions2) or (
                    positions1[-1] != position1 and positions2[-1] != position2):
        positions1.append(broken1[-1])
        positions2.append(broken2[-1])
    elif positions1[-1] != position1 and positions2[-1] == position2:
        positions1[-1] = position1
    elif positions1[-1] == position1 and positions2[-1] != position2:
        positions2[-1] = broken2[-1]
    return positions1, positions2
Example #3
def main(args):
    print(
        "id,passages,paragraphs,sentences,nodes,terminals,non-terminals,implicit,linkage,discont,"
        "edges,primary,remote,linkage,parents,children,mult-parents")
    data = []
    for passage in get_passages_with_progress_bar(args.filenames):
        terminals = passage.layer(layer0.LAYER_ID).all
        non_terminals = [
            n for n in passage.layer(layer1.LAYER_ID).all if n.ID != "1.1"
        ]
        non_linkage = [n for n in non_terminals if n.tag != NodeTags.Linkage]
        linkage_nodes = passage.layer(layer1.LAYER_ID).top_linkages
        edges = {e for n in non_terminals for e in n}
        remote = [e for e in edges if e.attrib.get("remote")]
        linkage_edges = [e for n in linkage_nodes for e in n]
        fields = (
            int(passage.ID),
            1,
            len({t.paragraph
                 for t in terminals}),
            len(break2sentences(passage)),
            len(terminals) + len(non_terminals),
            len(terminals),
            len(non_terminals),
            len([n for n in non_linkage if n.attrib.get("implicit")]),
            len(linkage_nodes),
            len([
                n for n in non_linkage
                if n.tag == NodeTags.Foundational and n.discontiguous
            ]),
            len(edges),
            len(edges) - len(remote) - len(linkage_edges),
            len(remote),
            len(linkage_edges),
            sum(
                len([p for p in n.parents if p.ID != "1.1"])
                for n in non_linkage),
            sum(len(n.children) for n in non_linkage),
            len([
                n for n in non_linkage
                if len([p for p in n.parents if p.ID != "1.1"]) > 1
            ]),
        )
        if not args.summary:
            with tqdm.external_write_mode():
                print(",".join("%d" % f for f in fields))
        data.append(fields)
    data = np.array(data, dtype=int)
    if args.outfile:
        np.savetxt(args.outfile,
                   data[data[:, 0].argsort()],
                   fmt="%i",
                   delimiter="\t")
    if args.summary:
        print(",".join("%d" % f for f in data.sum(axis=0)))
Example #4
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="files to process")
    argparser.add_argument("-o", "--outfile", help="output file for data")
    args = argparser.parse_args()

    print("id,passages,paragraphs,sentences,nodes,terminals,non-terminals,implicit,linkage,discont,"
          "edges,primary,remote,linkage,parents,children,mult-parents")
    data = []
    for pattern in args.filenames:
        for filename in glob.glob(pattern):
            passage = file2passage(filename)
            terminals = passage.layer(layer0.LAYER_ID).all
            non_terminals = [n for n in passage.layer(layer1.LAYER_ID).all if n.ID != "1.1"]
            non_linkage = [n for n in non_terminals if n.tag != NodeTags.Linkage]
            linkage_nodes = passage.layer(layer1.LAYER_ID).top_linkages
            edges = {e for n in non_terminals for e in n}
            remote = [e for e in edges if e.attrib.get("remote")]
            linkage_edges = [e for n in linkage_nodes for e in n]
            fields = (int(passage.ID),
                      1,
                      len({t.paragraph for t in terminals}),
                      len(break2sentences(passage)),
                      len(terminals) + len(non_terminals),
                      len(terminals),
                      len(non_terminals),
                      len([n for n in non_linkage if n.attrib.get("implicit")]),
                      len(linkage_nodes),
                      len([n for n in non_linkage if n.tag == NodeTags.Foundational and n.discontiguous]),
                      len(edges),
                      len(edges) - len(remote) - len(linkage_edges),
                      len(remote),
                      len(linkage_edges),
                      sum(len([p for p in n.parents if p.ID != "1.1"]) for n in non_linkage),
                      sum(len(n.children) for n in non_linkage),
                      len([n for n in non_linkage if len([p for p in n.parents if p.ID != "1.1"]) > 1]),
                      )
            print(",".join("%d" % f for f in fields))
            data.append(fields)
    data = np.array(data, dtype=int)
    if args.outfile:
        np.savetxt(args.outfile, data[data[:, 0].argsort()], fmt="%i", delimiter="\t")

    sys.exit(0)
Example #5
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="files to process")
    argparser.add_argument("-o", "--outfile", help="output file for data")
    args = argparser.parse_args()

    print("id,passages,paragraphs,sentences,nodes,terminals,non-terminals,implicit,linkage,discont,"
          "edges,primary,remote,linkage,parents,children,mult-parents")
    data = []
    for pattern in args.filenames:
        for filename in glob.glob(pattern):
            passage = file2passage(filename)
            terminals = passage.layer(layer0.LAYER_ID).all
            non_terminals = [n for n in passage.layer(layer1.LAYER_ID).all if n.ID != "1.1"]
            non_linkage = [n for n in non_terminals if n.tag != NodeTags.Linkage]
            linkage_nodes = passage.layer(layer1.LAYER_ID).top_linkages
            edges = {e for n in non_terminals for e in n}
            remote = [e for e in edges if e.attrib.get("remote")]
            linkage_edges = [e for n in linkage_nodes for e in n]
            fields = (int(passage.ID),
                      1,
                      len({t.paragraph for t in terminals}),
                      len(break2sentences(passage)),
                      len(terminals) + len(non_terminals),
                      len(terminals),
                      len(non_terminals),
                      len([n for n in non_linkage if n.attrib.get("implicit")]),
                      len(linkage_nodes),
                      len([n for n in non_linkage if n.tag == NodeTags.Foundational and n.discontiguous]),
                      len(edges),
                      len(edges) - len(remote) - len(linkage_edges),
                      len(remote),
                      len(linkage_edges),
                      sum(len([p for p in n.parents if p.ID != "1.1"]) for n in non_linkage),
                      sum(len(n.children) for n in non_linkage),
                      len([n for n in non_linkage if len([p for p in n.parents if p.ID != "1.1"]) > 1]),
                      )
            print(",".join("%d" % f for f in fields))
            data.append(fields)
    data = np.array(data, dtype=int)
    if args.outfile:
        np.savetxt(args.outfile, data[data[:, 0].argsort()], fmt="%i", delimiter="\t")

    sys.exit(0)
Example #6
def test_break2sentences(create, breaks):
    """Tests identifying correctly sentence ends. """
    assert textutil.break2sentences(create()) == breaks
Example #7
def test_break2sentences(self):
    """Tests identifying sentence ends correctly."""
    p = TestUtil.create_multi_passage()
    self.assertSequenceEqual(textutil.break2sentences(p), [4, 7, 11])
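The expected value [4, 7, 11] lists the positions of the sentence-ending terminals in the test passage. An illustrative check of break2sentences on a stand-alone passage, with a placeholder file name:

from ucca import textutil
from ucca.ioutil import file2passage

passage = file2passage("passage.xml")     # any UCCA passage file
print(textutil.break2sentences(passage))  # e.g. [4, 7, 11] for a three-sentence passage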