def break2common_sentences(p1, p2):
    """Finds the positions of the common sentence endings.

    Breaking is done according to the text and to the UCCA annotation of both
    passages. Returns two lists, each containing positions of sentence endings;
    guarantees that the same number of positions is acquired and that the last
    position is the passage end.
    """
    # break to sentences
    broken1 = break2sentences(p1)
    broken2 = break2sentences(p2)
    # find common endings
    positions1 = []
    positions2 = []
    i = 0
    j = 0
    while j < len(broken2) and i < len(broken1):
        position1, reg1 = _choose_ending_position(p1, broken1[i])
        position2, reg2 = _choose_ending_position(p2, broken2[j])
        if i + 1 < len(broken1):
            pos_after1, one_after1 = _choose_ending_position(p1, broken1[i + 1])
        else:
            pos_after1, one_after1 = position1, reg1
        if j + 1 < len(broken2):
            pos_after2, one_after2 = _choose_ending_position(p2, broken2[j + 1])
        else:
            pos_after2, one_after2 = position2, reg2
        if reg1 == reg2:
            positions1.append(position1)
            positions2.append(position2)
        # deal with addition or subtraction of a sentence ending
        elif one_after1 == reg2:
            i += 1
            positions1.append(pos_after1)
            positions2.append(position2)
        elif reg1 == one_after2:
            j += 1
            positions1.append(position1)
            positions2.append(pos_after2)
        i += 1
        j += 1
    # add last sentence in case skipped
    position1, reg1 = _choose_ending_position(p1, broken1[-1])
    position2, reg2 = _choose_ending_position(p2, broken2[-1])
    if (not positions1) or (not positions2) or (
            positions1[-1] != position1 and positions2[-1] != position2):
        positions1.append(broken1[-1])
        positions2.append(broken2[-1])
    elif positions1[-1] != position1 and positions2[-1] == position2:
        positions1[-1] = position1
    elif positions1[-1] == position1 and positions2[-1] != position2:
        positions2[-1] = broken2[-1]
    return positions1, positions2
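# Usage sketch for break2common_sentences, assuming two UCCA annotations of the
# same text stored as XML files; the paths are placeholders and the
# ucca.ioutil import location is an assumption.
from ucca.ioutil import file2passage

p1 = file2passage("annotator1/504.xml")
p2 = file2passage("annotator2/504.xml")
ends1, ends2 = break2common_sentences(p1, p2)
# Both lists are guaranteed to have the same length and to end at the passage
# end, so corresponding sentence-end positions can be paired directly.
for e1, e2 in zip(ends1, ends2):
    print(e1, e2)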
def main(args):
    print("id,passages,paragraphs,sentences,nodes,terminals,non-terminals,implicit,linkage,discont,"
          "edges,primary,remote,linkage,parents,children,mult-parents")
    data = []
    # one row of statistics per passage, in the same order as the header above
    for passage in get_passages_with_progress_bar(args.filenames):
        terminals = passage.layer(layer0.LAYER_ID).all
        non_terminals = [n for n in passage.layer(layer1.LAYER_ID).all if n.ID != "1.1"]
        non_linkage = [n for n in non_terminals if n.tag != NodeTags.Linkage]
        linkage_nodes = passage.layer(layer1.LAYER_ID).top_linkages
        edges = {e for n in non_terminals for e in n}
        remote = [e for e in edges if e.attrib.get("remote")]
        linkage_edges = [e for n in linkage_nodes for e in n]
        fields = (int(passage.ID),
                  1,
                  len({t.paragraph for t in terminals}),
                  len(break2sentences(passage)),
                  len(terminals) + len(non_terminals),
                  len(terminals),
                  len(non_terminals),
                  len([n for n in non_linkage if n.attrib.get("implicit")]),
                  len(linkage_nodes),
                  len([n for n in non_linkage if n.tag == NodeTags.Foundational and n.discontiguous]),
                  len(edges),
                  len(edges) - len(remote) - len(linkage_edges),
                  len(remote),
                  len(linkage_edges),
                  sum(len([p for p in n.parents if p.ID != "1.1"]) for n in non_linkage),
                  sum(len(n.children) for n in non_linkage),
                  len([n for n in non_linkage if len([p for p in n.parents if p.ID != "1.1"]) > 1]),
                  )
        if not args.summary:
            # write through tqdm so the progress bar is not garbled
            with tqdm.external_write_mode():
                print(",".join("%d" % f for f in fields))
        data.append(fields)
    data = np.array(data, dtype=int)
    if args.outfile:
        np.savetxt(args.outfile, data[data[:, 0].argsort()], fmt="%i", delimiter="\t")
    if args.summary:
        print(",".join("%d" % f for f in data.sum(axis=0)))
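# A minimal sketch of the argument parser this main(args) variant expects. Only
# the attribute names it reads (filenames, outfile, summary) are taken from the
# code above; the exact flag spellings below are assumptions.
import argparse

def parse_args():
    argparser = argparse.ArgumentParser(description="Print per-passage UCCA statistics")
    argparser.add_argument("filenames", nargs="+", help="passage files to process")
    argparser.add_argument("-o", "--outfile", help="output file for data")
    argparser.add_argument("-s", "--summary", action="store_true",
                           help="print only the totals row")
    return argparser.parse_args()

if __name__ == "__main__":
    main(parse_args())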
def main():
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filenames", nargs="+", help="files to process")
    argparser.add_argument("-o", "--outfile", help="output file for data")
    args = argparser.parse_args()

    print("id,passages,paragraphs,sentences,nodes,terminals,non-terminals,implicit,linkage,discont,"
          "edges,primary,remote,linkage,parents,children,mult-parents")
    data = []
    for pattern in args.filenames:
        for filename in glob.glob(pattern):
            passage = file2passage(filename)
            terminals = passage.layer(layer0.LAYER_ID).all
            non_terminals = [n for n in passage.layer(layer1.LAYER_ID).all if n.ID != "1.1"]
            non_linkage = [n for n in non_terminals if n.tag != NodeTags.Linkage]
            linkage_nodes = passage.layer(layer1.LAYER_ID).top_linkages
            edges = {e for n in non_terminals for e in n}
            remote = [e for e in edges if e.attrib.get("remote")]
            linkage_edges = [e for n in linkage_nodes for e in n]
            fields = (int(passage.ID),
                      1,
                      len({t.paragraph for t in terminals}),
                      len(break2sentences(passage)),
                      len(terminals) + len(non_terminals),
                      len(terminals),
                      len(non_terminals),
                      len([n for n in non_linkage if n.attrib.get("implicit")]),
                      len(linkage_nodes),
                      len([n for n in non_linkage if n.tag == NodeTags.Foundational and n.discontiguous]),
                      len(edges),
                      len(edges) - len(remote) - len(linkage_edges),
                      len(remote),
                      len(linkage_edges),
                      sum(len([p for p in n.parents if p.ID != "1.1"]) for n in non_linkage),
                      sum(len(n.children) for n in non_linkage),
                      len([n for n in non_linkage if len([p for p in n.parents if p.ID != "1.1"]) > 1]),
                      )
            print(",".join("%d" % f for f in fields))
            data.append(fields)
    data = np.array(data, dtype=int)
    if args.outfile:
        np.savetxt(args.outfile, data[data[:, 0].argsort()], fmt="%i", delimiter="\t")
    sys.exit(0)
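# Invocation sketch for the self-contained main() above (the glob pattern and
# output path are placeholders). The function reads sys.argv via argparse and
# ends with sys.exit(0), so SystemExit is caught when calling it
# programmatically rather than from the command line.
import sys

sys.argv = ["statistics.py", "passages/*.xml", "-o", "stats.tsv"]
try:
    main()
except SystemExit:
    pass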
def test_break2sentences(create, breaks):
    """Tests that sentence ends are identified correctly."""
    assert textutil.break2sentences(create()) == breaks
def test_break2sentences(self):
    """Tests that sentence ends are identified correctly."""
    p = TestUtil.create_multi_passage()
    self.assertSequenceEqual(textutil.break2sentences(p), [4, 7, 11])
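# Usage sketch tying the expected values above to break2sentences: the returned
# numbers are 1-based positions of sentence-final terminals, so they can be used
# to slice layer 0 into sentences. The passage path is a placeholder and the
# ucca.ioutil import location is an assumption.
from ucca import layer0
from ucca.ioutil import file2passage
from ucca.textutil import break2sentences

passage = file2passage("504.xml")
terminals = passage.layer(layer0.LAYER_ID).all
ends = break2sentences(passage)          # e.g. [4, 7, 11] for the test passage
starts = [0] + ends[:-1]
sentences = [terminals[start:end] for start, end in zip(starts, ends)]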