Example #1
def test_site_terminals():
    elem = load_xml("test_files/site1.xml")
    passage = convert.from_site(elem)
    terms = passage.layer(layer0.LAYER_ID).all

    assert passage.ID == "118"
    assert len(terms) == 15

    # There are two punctuation signs (dots, positions 5 and 11), which
    # also serve as paragraph end points. All others are words whose text
    # is their positions, so test that text, punctuation (yes/no) and
    # paragraphs are all converted correctly
    for i, t in enumerate(terms):
        # i starts at 0, positions at 1, hence 5,11 ==> 4,10
        if i in (4, 10):
            assert t.text == "." and t.punct
        else:
            assert t.text == str(i + 1) and not t.punct
        if i < 5:
            par = 1
        elif i < 11:
            par = 2
        else:
            par = 3
        assert t.paragraph == par
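
For reference, a minimal sketch of the pattern all of these examples share: parse a site-format annotation XML file and convert it to a Passage. The file name below is hypothetical; the converter and Passage attributes are the ones exercised throughout these examples.

# Minimal usage sketch for convert.from_site (hypothetical file name).
from xml.etree import ElementTree as ETree

from ucca import convert, layer0

root = ETree.parse("my_passage_site.xml").getroot()  # site-format XML root
passage = convert.from_site(root)                    # build a ucca Passage
terminals = passage.layer(layer0.LAYER_ID).all       # layer 0 holds the terminals
print(passage.ID, len(terminals))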
Example #2
    def test_site_terminals(self):
        elem = self._load_xml('./site1.xml')
        passage = convert.from_site(elem)
        terms = passage.layer(layer0.LAYER_ID).all

        self.assertEqual(passage.ID, '118')
        self.assertEqual(len(terms), 15)

        # There are two punctuation signs (dots, positions 5 and 11), which
        # also serve as paragraph end points. All others are words whose text
        # is their positions, so test that text, punctuation (yes/no) and
        # paragraphs are all converted correctly
        for i, t in enumerate(terms):
            # i starts at 0, positions at 1, hence 5,11 ==> 4,10
            if i in (4, 10):
                self.assertTrue(t.text == '.' and t.punct is True)
            else:
                self.assertTrue(t.text == str(i + 1) and t.punct is False)
            if i < 5:
                par = 1
            elif i < 11:
                par = 2
            else:
                par = 3
            self.assertEqual(t.paragraph, par)
Example #3
    def test_site_simple(self):
        elem = self._load_xml('./site2.xml')
        passage = convert.from_site(elem)
        terms = passage.layer(layer0.LAYER_ID).all
        l1 = passage.layer('1')

        # The Terminals in the passage are just like in test_site_terminals,
        # with this layer1 hierarchy: [[1 C] [2 E] L] [3 4 . H]
        # with the linker having a remark and the parallel scene is uncertain
        head = l1.heads[0]
        self.assertEqual(len(head), 12)  # including all 'unused' terminals
        self.assertEqual(head[9].tag, layer1.EdgeTags.Linker)
        self.assertEqual(head[10].tag, layer1.EdgeTags.ParallelScene)
        linker = head.children[9]
        self._test_edges(linker, [layer1.EdgeTags.Center,
                                  layer1.EdgeTags.Elaborator])
        self.assertTrue(linker.extra['remarks'], '"remark"')
        center = linker.children[0]
        elab = linker.children[1]
        self._test_terms(center, terms[0:1])
        self._test_terms(elab, terms[1:2])
        ps = head.children[10]
        self._test_edges(ps, [layer1.EdgeTags.Terminal,
                              layer1.EdgeTags.Terminal,
                              layer1.EdgeTags.Punctuation])
        self.assertTrue(ps.attrib.get('uncertain'))
        self.assertEqual(ps.children[0], terms[2])
        self.assertEqual(ps.children[1], terms[3])
        self.assertEqual(ps.children[2].children[0], terms[4])
Example #4
def test_to_standard():
    passage = convert.from_site(load_xml("test_files/site3.xml"))
    ref = load_xml("test_files/standard3.xml")  # old format of xml
    new_ref = convert.to_standard(convert.from_standard(ref))   # converting to the new xml format
    root = convert.to_standard(passage)
    assert (textutil.indent_xml(ETree.tostring(new_ref)).splitlines() ==
            textutil.indent_xml(ETree.tostring(root)).splitlines())
Example #5
def main(args):
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.filename, encoding="utf-8") as f:
        t = list(map(str.split, f))
        if not args.verbose:
            t = tqdm(t, desc="Downloading", unit=" passages")
        for passage_id, id_field in t:
            if not args.verbose:
                t.set_postfix({
                    "passage_id": passage_id,
                    args.method: id_field
                })
            if args.verbose:
                with external_write_mode():
                    print("Getting passage " + passage_id + " with " +
                          args.method + "=" + id_field,
                          end="\t")
            xml_root = get_by_method(id_field=id_field.split(","),
                                     passage_id=passage_id,
                                     **vars(args))
            if xml_root is None:
                continue
            if args.write_site:
                site_filename = passage_id + "_site_download.xml"
                with open(site_filename, "w", encoding="utf-8") as fsite:
                    print(tostring(xml_root).decode(), file=fsite)
                if args.verbose:
                    with external_write_mode():
                        print("Wrote '%s'" % site_filename)
            if args.write:
                write_passage(convert.from_site(xml_root),
                              outdir=args.outdir,
                              verbose=args.verbose)
Example #6
    def test_site_simple(self):
        elem = TestUtil.load_xml("test_files/site2.xml")
        passage = convert.from_site(elem)
        terms = passage.layer(layer0.LAYER_ID).all
        l1 = passage.layer("1")

        # The Terminals in the passage are just like in test_site_terminals,
        # with this layer1 hierarchy: [[1 C] [2 E] L] [3 4 . H]
        # with the linker having a remark and the parallel scene is uncertain
        head = l1.heads[0]
        self.assertEqual(len(head), 12)  # including all "unused" terminals
        self.assertEqual(head[9].tag, layer1.EdgeTags.Linker)
        self.assertEqual(head[10].tag, layer1.EdgeTags.ParallelScene)
        linker = head.children[9]
        self._test_edges(linker,
                         [layer1.EdgeTags.Center, layer1.EdgeTags.Elaborator])
        self.assertTrue(linker.extra["remarks"], '"remark"')
        center = linker.children[0]
        elab = linker.children[1]
        self._test_terms(center, terms[0:1])
        self._test_terms(elab, terms[1:2])
        ps = head.children[10]
        self._test_edges(ps, [
            layer1.EdgeTags.Terminal, layer1.EdgeTags.Terminal,
            layer1.EdgeTags.Punctuation
        ])
        self.assertTrue(ps.attrib.get("uncertain"))
        self.assertEqual(ps.children[0], terms[2])
        self.assertEqual(ps.children[1], terms[3])
        self.assertEqual(ps.children[2].children[0], terms[4])
Example #7
def fix_tokenization(passage, words_set, lang, cw):
    tokenizer = get_tokenizer(lang=lang)
    elem = to_site(passage)
    state = State()
    ever_changed = False
    for paragraph in elem.iterfind(SiteCfg.Paths.Paragraphs):
        while True:
            changed = False
            terminals = list(paragraph.iter(SiteCfg.Tags.Terminal))
            preterminals = get_parents(paragraph, terminals)
            preterminal_parents = get_parents(paragraph, preterminals)
            is_puncts = [
                p.get(SiteCfg.Attr.ElemTag) == SiteCfg.Types.Punct
                for p in preterminals
            ]
            for i in false_indices(is_puncts):
                start, end = expand_to_neighboring_punct(i, is_puncts)
                if retokenize(i, start, end, terminals, preterminals,
                              preterminal_parents, passage.ID, tokenizer,
                              state, cw, words_set):
                    ever_changed = changed = True
                    break
            if not changed:
                break
    return from_site(elem) if ever_changed else None
Example #8
    def test_site_terminals(self):
        elem = TestUtil.load_xml("test_files/site1.xml")
        passage = convert.from_site(elem)
        terms = passage.layer(layer0.LAYER_ID).all

        self.assertEqual(passage.ID, "118")
        self.assertEqual(len(terms), 15)

        # There are two punctuation signs (dots, positions 5 and 11), which
        # also serve as paragraph end points. All others are words whose text
        # is their positions, so test that text, punctuation (yes/no) and
        # paragraphs are all converted correctly
        for i, t in enumerate(terms):
            # i starts at 0, positions at 1, hence 5,11 ==> 4,10
            if i in (4, 10):
                self.assertTrue(t.text == "." and t.punct is True)
            else:
                self.assertTrue(t.text == str(i + 1) and t.punct is False)
            if i < 5:
                par = 1
            elif i < 11:
                par = 2
            else:
                par = 3
            self.assertEqual(t.paragraph, par)
Example #9
def test_site_simple():
    elem = load_xml("test_files/site2.xml")
    passage = convert.from_site(elem)
    terms = passage.layer(layer0.LAYER_ID).all
    l1 = passage.layer("1")

    # The Terminals in the passage are just like in test_site_terminals,
    # with this layer1 hierarchy: [[1 C] [2 E] L] [3 4 . H]
    # with the linker having a remark and the parallel scene is uncertain
    head = l1.heads[0]
    assert len(head) == 12  # including all "unused" terminals
    assert head[9].tag == layer1.EdgeTags.Linker
    assert head[10].tag == layer1.EdgeTags.ParallelScene
    linker = head.children[9]
    _test_edges(linker, [layer1.EdgeTags.Center, layer1.EdgeTags.Elaborator])
    assert linker.extra["remarks"], '"remark"'
    center = linker.children[0]
    elab = linker.children[1]
    _test_terms(center, terms[0:1])
    _test_terms(elab, terms[1:2])
    ps = head.children[10]
    _test_edges(ps, [
        layer1.EdgeTags.Terminal, layer1.EdgeTags.Terminal,
        layer1.EdgeTags.Punctuation
    ])
    assert ps.attrib.get("uncertain")
    assert ps.children[0] == terms[2]
    assert ps.children[1] == terms[3]
    assert ps.children[2].children[0] == terms[4]
Example #10
def main():
    opt_parser = cmd_line_parser()
    (options, args) = opt_parser.parse_args()
    if len(args) > 0:
        opt_parser.error("all arguments must be flagged")

    if (options.guessed is None) or (options.ref is
                                     None) or (options.db_filename is None):
        opt_parser.error("missing arguments. type --help for help.")
    if (options.pid is not None and options.from_xids is not None):
        opt_parser.error("inconsistent parameters. \
        you can't have both a pid and from_xids parameters.")

    keys = [options.guessed, options.ref]
    if options.from_xids:
        xmls = ucca_db.get_by_xids(options.db_filename, options.host, keys)
    else:
        xmls = ucca_db.get_xml_trees(options.db_filename, options.host,
                                     options.pid, keys)

    guessed, ref = [convert.from_site(x) for x in xmls]
    if options.units or options.fscore or options.errors:
        evaluate(guessed,
                 ref,
                 units=options.units,
                 fscore=options.fscore,
                 errors=options.errors,
                 verbose=True)
Example #11
def main(args):
    keys = [args.guessed, args.ref]
    xmls = api.get_by_xids(db_name=args.db_filename, host_name=args.host, xids=keys) if args.from_xids else \
        api.get_xml_trees(db_name=args.db_filename, host_name=args.host, pid=args.pid, usernames=keys)
    guessed, ref = [convert.from_site(x) for x in xmls]
    if args.units or args.fscore or args.errors:
        evaluate(guessed, ref, units=args.units, fscore=args.fscore, errors=args.errors,
                 constructions=args.constructions, verbose=True)
Example #12
def main(argv):
    t = tqdm(globals()[argv[1]]("pgserver", "work", *argv[2:]),
             unit=" passages",
             desc="Downloading XMLs")
    for xml in t:
        p = convert.from_site(xml)
        t.set_postfix(ID=p.ID)
        convert.passage2file(p, p.ID + ".xml")
Example #13
def test_site_discontiguous_with_remote():
    elem = load_xml("test_files/site4.xml")
    passage = convert.from_site(elem)
    s1 = passage.layer(layer1.LAYER_ID).heads[0].state
    remote_a1 = [e.child for e in s1 if e.attrib.get("remote") and e.tag == layer1.EdgeTags.Participant]
    assert len(remote_a1) == 1
    a1 = remote_a1[0]
    remote_a2 = [e.child for e in a1 if e.attrib.get("remote") and e.tag == layer1.EdgeTags.Participant]
    assert len(remote_a2) == 1
Example #14
def test_site_discontiguous_with_implicit():
    elem = load_xml("test_files/site5.xml")
    passage = convert.from_site(elem)
    s1 = passage.layer(layer1.LAYER_ID).heads[0].state
    remote_t1 = [
        e.child for e in s1
        if e.child.attrib.get("implicit") and e.tag == layer1.EdgeTags.Time
    ]
    assert len(remote_t1) == 1
Example #15
def print_passages_to_file(host_name,
                           db_name,
                           paids,
                           write_xml=False,
                           write_site_xml=False,
                           prefix='',
                           start_index=0):
    """
    Returns for that user a list of submitted passages and a list of assigned but not submitted passages.
    Each passage is given in the format: (<passage ID>, <source>, <recent submitted xid or -1 if not submitted>,
    <number of tokens in the passage>, <number of units in the passage>, <number of scenes in the passage>,
    <average length of a scene>). It also returns a distribution of the categories.
    write_xml: determines whether to write it to a file, named <prefix><the number of the xml>.xml
    skip_first: the index of the passage where it should start looking (the ones before are skipped)
    """
    c = get_cursor(host_name, db_name)

    for paid in paids:
        if paid < start_index:  # skipping training passages
            continue
        c.execute("SELECT passage,source FROM passages WHERE id=%s", (paid, ))
        r = c.fetchone()
        if r is not None:
            source = r[1]
            c.execute(
                "SELECT id, xml,uid,ts FROM xmls WHERE paid=%s ORDER BY ts DESC",
                (paid, ))
            r = c.fetchone()
            if r is not None:
                xid = r[0]
                uid = r[2]
                ts = r[3]
                print('\t'.join(
                    [str(paid),
                     str(uid),
                     str(source),
                     str(xid),
                     str(ts)]))

                if write_site_xml:
                    f = open(prefix + str(paid) + '_site.xml',
                             'w',
                             encoding='utf-8')
                    f.write(r[1] + '\n')
                    f.close()
                # noinspection PyBroadException
                try:
                    ucca_dag = convert.from_site(fromstring(r[1]))
                except Exception:
                    sys.stderr.write("Skipped xid,paid " + str((xid, paid)) +
                                     "\n")
                    continue
                if write_xml:
                    f = open(prefix + str(paid) + '.xml', 'w')
                    f.write(tostring(convert.to_standard(ucca_dag)).decode())
                    f.close()
Example #16
def main():
    db_name = sys.argv[1]
    with open(db_name + '.xids') as f:
        xids = tuple(int(x.strip()) for x in f.readlines())
    conn = sqlite3.connect(db_name + '.db')
    c = conn.cursor()
    print("SELECT xml FROM xmls WHERE id IN " + str(xids))
    c.execute("SELECT xml FROM xmls WHERE id IN " + str(xids))
    passages = [convert.from_site(ETree.fromstring(x[0])) for x in c]
    print(set(p.ID for p in passages))
    with open(db_name + '.pickle', 'wb') as f:
        pickle.dump(passages, f)
Example #17
def get_predicates(host_name, db_name, only_complex=True, start_index=100):
    """
    Returns a list of all the predicates in the UCCA corpus.
    usernames -- the names of the users whose completed passages we should take.
    only_complex -- only the multi-word predicates will be returned.
    start_index -- the minimal passage number to be taken into account.
    """

    def _complex(u):
        "Returns True if u is complex, i.e., if it has more than one child which is not an F or punct"
        if u is None or u.tag != 'FN':
            return False
        non_function_count = 0
        non_function_u = None
        for e in u.outgoing:
            if e.child.tag == 'FN' and e.tag != 'F':
                non_function_count += 1
                non_function_u = e.child
        return True if non_function_count > 1 else _complex(non_function_u)

    predicate_distribution = collections.Counter()

    c = get_cursor(host_name, db_name)
    # uid = get_uid(host_name, db_name, username)
    # get all the completed xmls
    c.execute("SELECT id, xml FROM xmls WHERE status=%s AND reviewOf<>%s ORDER BY ts DESC", (1, -1))
    L = c.fetchall()

    wspace = re.compile("\\s+")

    with open('preds', 'w') as f:
        for r in tqdm(L):
            xid = r[0]
            try:
                ucca_dag = convert.from_site(fromstring(r[1]))
            except Exception:
                print("Skipped.", file=sys.stderr)
                continue

            # gathering statistics
            scenes = [x for x in ucca_dag.layer("1").all if x.tag == "FN" and x.is_scene()]
            temp = []
            for sc in scenes:
                main_relation = sc.process if sc.process is not None else sc.state
                if only_complex and not _complex(main_relation):
                    continue
                try:
                    print(main_relation.to_text(), file=f)
                except UnicodeEncodeError:
                    print("Skipped (encoding issue).", file=sys.stderr)
                    continue
Example #18
    def test_site_advanced(self):
        elem = self._load_xml('./site3.xml')
        passage = convert.from_site(elem)
        terms = passage.layer(layer0.LAYER_ID).all
        l1 = passage.layer('1')

        # This passage has the same terminals as the simple and terminals test,
        # and has the same layer1 units for the first paragraph as the simple
        # test. In addition, it has the following annotation:
        # [6 7 8 9 H] [10 F] .
        # the 6-9 H has remote D which is [10 F]. Inside of 6-9, we have [8 S]
        # and [6 7 .. 9 A], where [6 E] and [7 .. 9 C].
        # [12 H] [13 H] [14 H] [15 L], where 15 linkage links 12, 13 and 14 and
        # [15 L] has an implicit Center unit
        head, lkg = l1.heads
        self._test_edges(head, [layer1.EdgeTags.Linker,
                                layer1.EdgeTags.ParallelScene,
                                layer1.EdgeTags.ParallelScene,
                                layer1.EdgeTags.Function,
                                layer1.EdgeTags.Punctuation,
                                layer1.EdgeTags.ParallelScene,
                                layer1.EdgeTags.ParallelScene,
                                layer1.EdgeTags.ParallelScene,
                                layer1.EdgeTags.Linker])

        # we only take what we haven't checked already
        ps1, func, punct, ps2, ps3, ps4, link = head.children[2:]
        self._test_edges(ps1, [layer1.EdgeTags.Participant,
                               layer1.EdgeTags.Process,
                               layer1.EdgeTags.Adverbial])
        self.assertTrue(ps1[2].attrib.get('remote'))
        ps1_a, ps1_p, ps1_d = ps1.children
        self._test_edges(ps1_a, [layer1.EdgeTags.Elaborator,
                                 layer1.EdgeTags.Center])
        self._test_terms(ps1_a.children[0], terms[5:6])
        self._test_terms(ps1_a.children[1], terms[6:9:2])
        self._test_terms(ps1_p, terms[7:8])
        self.assertEqual(ps1_d, func)
        self._test_terms(func, terms[9:10])
        self._test_terms(punct, terms[10:11])
        self._test_terms(ps2, terms[11:12])
        self._test_terms(ps3, terms[12:13])
        self._test_terms(ps4, terms[13:14])
        self.assertEqual(len(link), 2)
        self.assertEqual(link[0].tag, layer1.EdgeTags.Center)
        self.assertTrue(link.children[0].attrib.get('implicit'))
        self.assertEqual(link[1].tag, layer1.EdgeTags.Elaborator)
        self.assertEqual(link.children[1][0].tag, layer1.EdgeTags.Terminal)
        self.assertEqual(link.children[1][0].child, terms[14])
        self.assertEqual(lkg.relation, link)
        self.assertSequenceEqual(lkg.arguments, [ps2, ps3, ps4])
Example #19
def test_site_advanced():
    elem = load_xml("test_files/site3.xml")
    passage = convert.from_site(elem)
    terms = passage.layer(layer0.LAYER_ID).all
    l1 = passage.layer("1")

    # This passage has the same terminals as the simple and terminals test,
    # and has the same layer1 units for the first paragraph as the simple
    # test. In addition, it has the following annotation:
    # [6 7 8 9 H] [10 F] .
    # the 6-9 H has remote D which is [10 F]. Inside of 6-9, we have [8 S]
    # and [6 7 ... 9 A], where [6 E] and [7 ... 9 C].
    # [12 H] [13 H] [14 H] [15 L], where 15 linkage links 12, 13 and 14 and
    # [15 L] has an implicit Center unit
    head, lkg = l1.heads
    _test_edges(head, [layer1.EdgeTags.Linker,
                       layer1.EdgeTags.ParallelScene,
                       layer1.EdgeTags.ParallelScene,
                       layer1.EdgeTags.Function,
                       layer1.EdgeTags.Punctuation,
                       layer1.EdgeTags.ParallelScene,
                       layer1.EdgeTags.ParallelScene,
                       layer1.EdgeTags.ParallelScene,
                       layer1.EdgeTags.Linker])

    # we only take what we haven't checked already
    ps1, func, punct, ps2, ps3, ps4, link = head.children[2:]
    _test_edges(ps1, [layer1.EdgeTags.Participant,
                      layer1.EdgeTags.Process,
                      layer1.EdgeTags.Adverbial])
    assert ps1[2].attrib.get("remote")
    ps1_a, ps1_p, ps1_d = ps1.children
    _test_edges(ps1_a, [layer1.EdgeTags.Elaborator,
                        layer1.EdgeTags.Center])
    _test_terms(ps1_a.children[0], terms[5:6])
    _test_terms(ps1_a.children[1], terms[6:9:2])
    _test_terms(ps1_p, terms[7:8])
    assert ps1_d == func
    _test_terms(func, terms[9:10])
    _test_terms(punct, terms[10:11])
    _test_terms(ps2, terms[11:12])
    _test_terms(ps3, terms[12:13])
    _test_terms(ps4, terms[13:14])
    assert len(link) == 2
    assert link[0].tag == layer1.EdgeTags.Center
    assert link.children[0].attrib.get("implicit")
    assert link[1].tag == layer1.EdgeTags.Elaborator
    assert link.children[1][0].tag == layer1.EdgeTags.Terminal
    assert link.children[1][0].child == terms[14]
    assert lkg.relation == link
    assert lkg.arguments == [ps2, ps3, ps4]
Example #20
    def test_site_advanced(self):
        elem = TestUtil.load_xml('test_files/site3.xml')
        passage = convert.from_site(elem)
        terms = passage.layer(layer0.LAYER_ID).all
        l1 = passage.layer('1')

        # This passage has the same terminals as the simple and terminals test,
        # and has the same layer1 units for the first paragraph as the simple
        # test. In addition, it has the following annotation:
        # [6 7 8 9 H] [10 F] .
        # the 6-9 H has remote D which is [10 F]. Inside of 6-9, we have [8 S]
        # and [6 7 ... 9 A], where [6 E] and [7 ... 9 C].
        # [12 H] [13 H] [14 H] [15 L], where 15 linkage links 12, 13 and 14 and
        # [15 L] has an implicit Center unit
        head, lkg = l1.heads
        self._test_edges(head, [
            layer1.EdgeTags.Linker, layer1.EdgeTags.ParallelScene,
            layer1.EdgeTags.ParallelScene, layer1.EdgeTags.Function,
            layer1.EdgeTags.Punctuation, layer1.EdgeTags.ParallelScene,
            layer1.EdgeTags.ParallelScene, layer1.EdgeTags.ParallelScene,
            layer1.EdgeTags.Linker
        ])

        # we only take what we haven't checked already
        ps1, func, punct, ps2, ps3, ps4, link = head.children[2:]
        self._test_edges(ps1, [
            layer1.EdgeTags.Participant, layer1.EdgeTags.Process,
            layer1.EdgeTags.Adverbial
        ])
        self.assertTrue(ps1[2].attrib.get('remote'))
        ps1_a, ps1_p, ps1_d = ps1.children
        self._test_edges(ps1_a,
                         [layer1.EdgeTags.Elaborator, layer1.EdgeTags.Center])
        self._test_terms(ps1_a.children[0], terms[5:6])
        self._test_terms(ps1_a.children[1], terms[6:9:2])
        self._test_terms(ps1_p, terms[7:8])
        self.assertEqual(ps1_d, func)
        self._test_terms(func, terms[9:10])
        self._test_terms(punct, terms[10:11])
        self._test_terms(ps2, terms[11:12])
        self._test_terms(ps3, terms[12:13])
        self._test_terms(ps4, terms[13:14])
        self.assertEqual(len(link), 2)
        self.assertEqual(link[0].tag, layer1.EdgeTags.Center)
        self.assertTrue(link.children[0].attrib.get('implicit'))
        self.assertEqual(link[1].tag, layer1.EdgeTags.Elaborator)
        self.assertEqual(link.children[1][0].tag, layer1.EdgeTags.Terminal)
        self.assertEqual(link.children[1][0].child, terms[14])
        self.assertEqual(lkg.relation, link)
        self.assertSequenceEqual(lkg.arguments, [ps2, ps3, ps4])
Example #21
def run_file(path, eng):
    """Site XML file ==> prints list of sceneness results"""
    with open(path) as f:
        root = ETree.ElementTree().parse(f)
    passage = convert.from_site(root)
    words = [x.text for x in passage.layer('0').words]
    print(' '.join(words))
    for word in words:
        all_tagsets = eng.get_forms(word)
        all_postags = set()
        for tagset in all_tagsets.values():
            all_postags.update(tagset)
        print('{}\t{}'.format(word, all_postags))
        if eng.is_dual_vn(word):
            print(all_tagsets)
            print('========')
Example #22
def run_file(path, eng, stats):
    """Site XML file ==> prints list of sceneness results"""
    with open(path) as f:
        root = ETree.ElementTree().parse(f)
    passage = convert.from_site(root)

    sc = scenes.extract_possible_scenes(passage)
    heads = [scenes.extract_head(x) for x in sc]

    for s, h in zip(sc, heads):
        if h is None:
            stats.heads.append(Result(s))
            continue
        out = eng.get_categories(s, h)
        if out == 'implicit':
            stats.heads.append(Result(s))
        elif out == 'no base form':
            stats.lemmas.append(Result(s, h))
        elif out[2]:
            stats.fulls.append(Result(s, h, *out))
        else:
            stats.no_cats.append(Result(s, h, *out))
Example #24
def main():
    opt_parser = cmd_line_parser()
    (options, args) = opt_parser.parse_args()
    if len(args) > 0:
        opt_parser.error("all arguments must be flagged")

    if (options.guessed is None) or (options.ref is None) or (options.db_filename is None):
        opt_parser.error("missing arguments. type --help for help.")
    if options.pid is not None and options.from_xids is not None:
        opt_parser.error("inconsistent parameters. \
        you can't have both a pid and from_xids parameters.")

    keys = [options.guessed, options.ref]
    if options.from_xids:
        xmls = ucca_db.get_by_xids(options.db_filename, options.host, keys)
    else:
        xmls = ucca_db.get_xml_trees(options.db_filename, options.host,
                                     options.pid, keys)

    guessed, ref = [convert.from_site(x) for x in xmls]
    if options.units or options.fscore or options.errors:
        evaluate(guessed, ref,
                 units=options.units, fscore=options.fscore, errors=options.errors, verbose=True)
Example #25
def fix_tokenization(passage, words_set, lang, cw):
    tokenizer = get_tokenizer(lang=lang)
    elem = to_site(passage)
    state = State()
    ever_changed = False
    for paragraph in elem.iterfind(SiteCfg.Paths.Paragraphs):
        while True:
            changed = False
            terminals = list(paragraph.iter(SiteCfg.Tags.Terminal))
            preterminals = get_parents(paragraph, terminals)
            preterminal_parents = get_parents(paragraph, preterminals)
            is_puncts = [p.get(SiteCfg.Attr.ElemTag) == SiteCfg.Types.Punct for p in preterminals]
            for i in false_indices(is_puncts):
                start, end = expand_to_neighboring_punct(i, is_puncts)
                if retokenize(i, start, end, terminals, preterminals,
                              preterminal_parents, passage.ID, tokenizer,
                              state,
                              cw, words_set):
                    ever_changed = changed = True
                    break
            if not changed:
                break
    return from_site(elem) if ever_changed else None
Example #26
def test_to_standard():
    passage = convert.from_site(load_xml("test_files/site3.xml"))
    ref = load_xml("test_files/standard3.xml")  # old format of xml
    new_ref = convert.to_standard(convert.from_standard(ref))   # converting to the new xml format
    root = convert.to_standard(passage)
    assert ETree.tostring(new_ref) == ETree.tostring(root)
Example #27
 def test_to_standard(self):
     passage = convert.from_site(TestUtil.load_xml("test_files/site3.xml"))
     ref = TestUtil.load_xml("test_files/standard3.xml")
     root = convert.to_standard(passage)
     self.assertEqual(ETree.tostring(ref), ETree.tostring(root))
Example #28
def test_to_site():
    passage = loaded()
    root = convert.to_site(passage)
    copy = convert.from_site(root)
    assert passage.equals(copy)
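
The analogous round trip through the standard format appears in the neighboring examples; here is a sketch under the same assumptions (hypothetical file name):

# Round-trip sketch: site XML -> Passage -> standard XML -> Passage.
from xml.etree import ElementTree as ETree

from ucca import convert

site_root = ETree.parse("site3.xml").getroot()  # hypothetical local copy
passage = convert.from_site(site_root)
standard_root = convert.to_standard(passage)    # Passage -> standard-format element
copy = convert.from_standard(standard_root)     # and back to a Passage
assert passage.equals(copy)                     # same equality check as the tests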
Example #29
def test_from_standard():
    passage = loaded()
    ref = convert.from_site(load_xml("test_files/site3.xml"))
    assert passage.equals(ref, ordered=True)
Example #30
 def test_from_standard(self):
     passage = convert.from_standard(self._load_xml('./standard3.xml'))
     ref = convert.from_site(self._load_xml('./site3.xml'))
     self.assertTrue(passage.equals(ref, ordered=True))
Example #31
def get_tasks(db, host, username):
    """
    Returns for that user a list of submitted passages
    and a list of assigned but not submitted passages.
    Each passage is given in the format:
    (<passage ID>, <source>, <recent submitted xid or -1 if not submitted>,
    <number of tokens in the passage>,
     <number of units in the passage>, <number of scenes in the passage>,
    <average length of a scene>).
     It also returns a distribution of the categories.
    """
    output_submitted = []
    category_distribution = Counter()
    # the categories of scenes. can be A, E or H
    scene_distribution = Counter()

    uid = get_uid(db, username)
    cur = get_cursor(db, username)
    cur.execute("SELECT pid,status FROM tasks WHERE uid=" +
                PLACE_HOLDER, (uid,))
    r = cur.fetchall()
    submitted_paids = [x[0] for x in r if x[1] == 1]
    incomplete_paids = [x[0] for x in r if x[1] == 0]

    wspace = re.compile("\\s+")

    for paid in submitted_paids:
        sum_scene_length = 0
        if paid < 100:  # skipping training passages
            continue
        cur.execute("SELECT passage,source FROM passages WHERE id=" +
                    PLACE_HOLDER, (paid,))
        r = cur.fetchone()
        if r:
            num_tokens = len(wspace.split(r[0])) - 1
            source = r[1]
            cur.execute("SELECT id, xml FROM xmls WHERE paid=" +
                        PLACE_HOLDER + " AND uid=" + PLACE_HOLDER +
                        " AND status=" + PLACE_HOLDER +
                        " ORDER BY ts DESC", (paid, uid, 1))
            r = cur.fetchone()
            if r:
                xid = r[0]
                # noinspection PyBroadException
                try:
                    ucca_dag = convert.from_site(fromstring(r[1]))
                except Exception:
                    sys.stderr.write("Skipped.\n")
                    continue
                num_units = len([x for x in ucca_dag.layer(layer1.LAYER_ID).all
                                 if x.tag == NT.Foundational]) - 1
                for node in ucca_dag.layer(layer1.LAYER_ID).all:
                    category_distribution.update([e.tag for e in node
                                                  if e.tag
                                                  not in
                                                  [ET.Punctuation, ET.LinkArgument, ET.LinkRelation, ET.Terminal]])
                # getting the scene categories
                scenes = [x for x in ucca_dag.layer(layer1.LAYER_ID).all
                          if x.tag == NT.Foundational and x.is_scene()]
                scene_distribution.update([linkage_type(sc) for sc in scenes])
                sum_scene_length += sum([unit_length(x) for x in scenes])

        output_submitted.append((paid, source, xid,
                                 num_tokens, num_units, len(scenes),
                                 1.0 * sum_scene_length / len(scenes)))

    return output_submitted, category_distribution, scene_distribution
Example #32
 def test_from_standard(self):
     passage = convert.from_standard(TestUtil.load_xml("test_files/standard3.xml"))
     ref = convert.from_site(TestUtil.load_xml("test_files/site3.xml"))
     self.assertTrue(passage.equals(ref, ordered=True))
Example #33
 def test_to_site(self):
     passage = convert.from_standard(
         TestUtil.load_xml("test_files/standard3.xml"))
     root = convert.to_site(passage)
     copy = convert.from_site(root)
     self.assertTrue(passage.equals(copy))
Example #34
        output.append(output2)
    return (output)


def get_sentences(P):
    """
    P is the output of the simplification system. Return all the sentences in each passage
    """
    dirpath = '/Mypath/System_output'
    folder = nltk.data.find(dirpath)
    corpusReader = nltk.corpus.PlaintextCorpusReader(folder, P)
    d = len(corpusReader.sents())
    return (corpusReader.sents()[:d])


index = list(range(0, 100))

for t in index:
    f1 = open('UCCAannotated_source/%s.xml' % t)
    xml_string1 = f1.read()
    f1.close()
    xml_object1 = fromstring(xml_string1)
    P1 = convert.from_site(xml_object1)
    L1 = get_scenes(P1)
    L2 = get_sentences('%s.txt' % t)
    s = open('s%s.txt' % t, 'w')
    s.write('%s\n' % L1)
    s.write('%s\n' % L2)

    s.close()
Example #35
def get_tasks(db, host, username):
    """
    Returns for that user a list of submitted passages
    and a list of assigned but not submitted passages.
    Each passage is given in the format:
    (<passage ID>, <source>, <recent submitted xid or -1 if not submitted>,
    <number of tokens in the passage>,
     <number of units in the passage>, <number of scenes in the passage>,
    <average length of a scene>).
     It also returns a distribution of the categories.
    """
    output_submitted = []
    category_distribution = Counter()
    # the categories of scenes. can be A, E or H
    scene_distribution = Counter()

    uid = get_uid(db, username)
    cur = get_cursor(db, username)
    cur.execute("SELECT pid,status FROM tasks WHERE uid=" + PLACE_HOLDER,
                (uid, ))
    r = cur.fetchall()
    submitted_paids = [x[0] for x in r if x[1] == 1]
    incomplete_paids = [x[0] for x in r if x[1] == 0]

    wspace = re.compile("\\s+")

    for paid in submitted_paids:
        sum_scene_length = 0
        if paid < 100:  # skipping training passages
            continue
        cur.execute(
            "SELECT passage,source FROM passages WHERE id=" + PLACE_HOLDER,
            (paid, ))
        r = cur.fetchone()
        if r:
            num_tokens = len(wspace.split(r[0])) - 1
            source = r[1]
            cur.execute(
                "SELECT id, xml FROM xmls WHERE paid=" + PLACE_HOLDER +
                " AND uid=" + PLACE_HOLDER + " AND status=" + PLACE_HOLDER +
                " ORDER BY ts DESC", (paid, uid, 1))
            r = cur.fetchone()
            if r:
                xid = r[0]
                # noinspection PyBroadException
                try:
                    ucca_dag = convert.from_site(fromstring(r[1]))
                except Exception:
                    sys.stderr.write("Skipped.\n")
                    continue
                num_units = len([
                    x for x in ucca_dag.layer(layer1.LAYER_ID).all
                    if x.tag == NT.Foundational
                ]) - 1
                for node in ucca_dag.layer(layer1.LAYER_ID).all:
                    category_distribution.update([
                        e.tag for e in node if e.tag not in [
                            ET.Punctuation, ET.LinkArgument, ET.LinkRelation,
                            ET.Terminal
                        ]
                    ])
                # getting the scene categories
                scenes = [
                    x for x in ucca_dag.layer(layer1.LAYER_ID).all
                    if x.tag == NT.Foundational and x.is_scene()
                ]
                scene_distribution.update([linkage_type(sc) for sc in scenes])
                sum_scene_length += sum([unit_length(x) for x in scenes])

        output_submitted.append(
            (paid, source, xid, num_tokens, num_units, len(scenes),
             1.0 * sum_scene_length / len(scenes)))

    return output_submitted, category_distribution, scene_distribution
Example #36
 def test_possible_scenes(self):
     """Tests that the API isn't broken, not validity of the result."""
     elem = ConversionTests._load_xml('./site3.xml')
     passage = convert.from_site(elem)
     scenes.extract_possible_scenes(passage)
Example #37
 def test_to_site(self):
     passage = convert.from_standard(self._load_xml('./standard3.xml'))
     root = convert.to_site(passage)
     copy = convert.from_site(root)
     self.assertTrue(passage.equals(copy))
Example #38
    # return predicate_distribution


def get_cursor(host_name, db_name):
    con = get_connection(db_name, host_name)
    c = con.cursor()
    c.execute("SET search_path TO oabend")
    return c


def get_connection(db_name, host_name):
    global CONNECTION
    CONNECTION = psycopg2.connect(host=host_name, database=db_name)
    return CONNECTION


# with open("ids.txt") as f_ids:
#     for i in tqdm(list(f_ids), unit=" passages", desc="Downloading XMLs"):
#         for xml in get_xml_trees("pgserver", "work", i):
#             p = convert.from_site(xml)
#             convert.passage2file(p, "downloaded/" + p.ID + ".xml")


if __name__ == "__main__":
    t = tqdm(globals()[sys.argv[1]]("pgserver", "work", *sys.argv[2:]),
             unit=" passages", desc="Downloading XMLs")
    for xml in t:
        p = convert.from_site(xml)
        t.set_postfix(ID=p.ID)
        convert.passage2file(p, p.ID + ".xml")
Example #39
 def test_to_site(self):
     passage = convert.from_standard(TestUtil.load_xml("test_files/standard3.xml"))
     root = convert.to_site(passage)
     copy = convert.from_site(root)
     self.assertTrue(passage.equals(copy))
Example #40
 def test_from_standard(self):
     passage = convert.from_standard(
         TestUtil.load_xml("test_files/standard3.xml"))
     ref = convert.from_site(TestUtil.load_xml("test_files/site3.xml"))
     self.assertTrue(passage.equals(ref, ordered=True))
Example #42
def test_to_standard():
    passage = convert.from_site(load_xml("test_files/site3.xml"))
    ref = load_xml("test_files/standard3.xml")
    root = convert.to_standard(passage)
    assert ETree.tostring(ref) == ETree.tostring(root)
Example #43
def main():
    print(
        align.align("what has is by the meaning of the word is",
                    "what is the men for the wk is are be"))

    # read xml files
    print("reading db xmls")
    p = []
    for filename in filenames:
        with open(add_path(filename), "rb") as fl:
            p += pickle.load(fl)[0]
        print(
            "read ", filename, " it starts with ",
            tuple(term.text for term in textutil.extract_terminals(
                convert.from_site(p[-1]))[:6]))
    # convert xml to passages
    p = list(map(convert.from_site, p))

    print("reading passage xmls")
    # read passage files
    for filename in passage_filenames:
        print("reading" + filename)
        if os.path.isfile(add_path(os.path.splitext(filename)[0] + ".pkl")):
            with open(add_path(os.path.splitext(filename)[0] + ".pkl"),
                      "rb") as fl:
                p.append(pickle.load(fl))
        else:
            p.append(file2passage(add_path(filename)))
            with open(add_path(os.path.splitext(filename)[0] + ".pkl"),
                      "wb") as fl:
                pickle.dump(p[-1], fl)
                print("dumping",
                      add_path(os.path.splitext(filename)[0] + ".pkl"))

    all_filenames = filenames + passage_filenames
    print("read ", all_filenames)
    word2word = align.align_yields(p[0], p[1])
    assert align.reverse_mapping(word2word) == align.align_yields(
        p[1], p[0]), "align_yields asymmetrical"

    # create similarity matrix
    sources = []
    goals = []
    names = []
    i = 0
    while i < len(p):
        names.append(all_filenames[i])
        sources.append(p[i])
        i += 1
        goals.append(p[i])
        i += 1
    chunksize = 1
    if (len(goals) > 100):
        chunksize = int(len(goals) / POOL_SIZE / 10)
    print("multithreading with chunksize", chunksize)
    pool = Pool(POOL_SIZE)
    if r2s:
        results = pool.starmap(distances, zip(goals, sources, names),
                               chunksize)
    else:
        results = pool.starmap(distances, zip(sources, goals, names),
                               chunksize)
    print(results)
    pool.close()
    pool.join()
    sym_mat = []
    keys = []
    for row, key in results:
        keys.append(key)
        sym_mat.append(row)
    print("functions and matrix")
    print(funcs + keys)
    for item in sym_mat:
        print(item)
    print("overall token analysis")
    print(align.token_level_analysis(p))
    output_path = trial_name + "output.csv"
    with open(output_path, "w") as f:
        print("writing output to " + output_path)
        writer = csv.writer(f)
        writer.writerows(sym_mat)
    send_mail("*****@*****.**", "finished",
              os.path.abspath(output_path))
    return
Example #44
 def test_to_standard(self):
     passage = convert.from_site(self._load_xml('./site3.xml'))
     ref = self._load_xml('./standard3.xml')
     root = convert.to_standard(passage)
     self.assertEqual(ETree.tostring(ref), ETree.tostring(root))