Beispiel #1
0
 def test_annotate_all(self):
     """Every test passage must be fully annotated, both directly and after a standard-format round trip."""
     passages = [
         convert.from_standard(TestUtil.load_xml("test_files/standard3.xml")),
         TestUtil.create_passage(),
         TestUtil.create_crossing_passage(),
         TestUtil.create_discontiguous(),
         TestUtil.create_multi_passage(),
     ]
     list(textutil.annotate_all(passages))  # annotate in place first
     pairs = ((p, p) for p in passages)
     for passage, compare in textutil.annotate_all(pairs, as_array=True, as_tuples=True):
         assert passage is compare
         round_trip = convert.from_standard(convert.to_standard(passage))
         for p in (passage, round_trip):
             not_annotated = "Passage %s is not annotated" % passage.ID
             self.assertTrue(is_annotated(p, as_array=True), not_annotated)
             self.assertTrue(is_annotated(p, as_array=False), not_annotated)
             for terminal in p.layer(layer0.LAYER_ID).all:
                 for attr in textutil.Attr:
                     self.assertIn(attr.key, terminal.extra,
                                   "Terminal %s in passage %s has no %s" %
                                   (terminal, passage.ID, attr.name))
                 self.assertIsNotNone(terminal.tok,
                                      "Terminal %s in passage %s has no annotation" %
                                      (terminal, passage.ID))
                 self.assertEqual(len(terminal.tok), len(textutil.Attr))
Beispiel #2
0
def test_annotate_all(as_array, convert_and_back):
    """Verify annotate_all marks every terminal, either in the token array or in terminal.extra."""
    passages = [create() for create in PASSAGES]
    list(textutil.annotate_all(passages))  # annotate in place first
    pairs = ((p, p) for p in passages)
    for passage, compare in textutil.annotate_all(pairs, as_array=as_array, as_tuples=True):
        assert passage is compare
        # Index 0 selects the passage itself, 1 its standard-format round trip.
        candidates = (passage, convert.from_standard(convert.to_standard(passage)))
        p = candidates[convert_and_back]
        assert textutil.is_annotated(p, as_array=as_array), "Passage %s is not annotated" % passage.ID
        for terminal in p.layer(layer0.LAYER_ID).all:
            if as_array:
                assert terminal.tok is not None, \
                    "Terminal %s in passage %s has no annotation" % (terminal, passage.ID)
                assert len(terminal.tok) == len(textutil.Attr)
            else:
                for attr in textutil.Attr:
                    assert attr.key in terminal.extra, "Terminal %s in passage %s has no %s" % (
                        terminal, passage.ID, attr.name)
Beispiel #3
0
def test_annotate_all(as_array, convert_and_back):
    """Both annotation modes (array / extra) must cover all terminals of all test passages."""
    passages = [make() for make in PASSAGES]
    list(textutil.annotate_all(passages))
    annotated = textutil.annotate_all(((p, p) for p in passages), as_array=as_array, as_tuples=True)
    for passage, compare in annotated:
        assert passage is compare
        # convert_and_back indexes into (original, round-tripped-through-standard).
        p = (passage, convert.from_standard(convert.to_standard(passage)))[convert_and_back]
        assert textutil.is_annotated(p, as_array=as_array), "Passage %s is not annotated" % passage.ID
        for terminal in p.layer(layer0.LAYER_ID).all:
            if not as_array:
                for attr in textutil.Attr:
                    assert attr.key in terminal.extra, "Terminal %s in passage %s has no %s" % (
                        terminal, passage.ID, attr.name)
            else:
                assert terminal.tok is not None, "Terminal %s in passage %s has no annotation" % (terminal, passage.ID)
                assert len(terminal.tok) == len(textutil.Attr)
Beispiel #4
0
    def parse_sentence(self, sentence):
        """Parse a single sentence string with TUPA into a UCCA passage.

        Returns the parsed passage, or None if parsing failed. TUPA's console
        output is suppressed by redirecting stdout to os.devnull for the
        duration of the call; stdout is always restored afterwards.
        """
        reg_stdout = sys.stdout
        devnull = open(os.devnull, 'w', encoding='UTF-8')
        sys.stdout = devnull
        parsed_passage = None

        try:
            # BUG FIX: the original used '= +1' (assignment of positive one),
            # which reset the counter to 1 on every call instead of
            # incrementing it, so every passage got the same ID.
            TupaParser.__passage_counter += 1
            passage_id = TupaParser.__passage_counter

            # from_text will convert the sentence into a ucca structure.
            # annotate_all will annotate the structure with information from the Spacy parse.
            # annotate_all returns a generator - one that will yield only one object - hence
            # we call next
            unparsed_passage = next(
                annotate_all(from_text(sentence, passage_id,
                                       one_per_line=True)))

            # The 'tupa.parse class's parse method expects a list of unparsed-message. We also need to set
            # the 'evaluate' argument to True, otherwise we get incorrect results. (Ofir Arviv advised as such).
            # The parse method also returns a generator, hence the need to call next.
            # The actual object returned is a tuple of the parsed-passage and an internal score object. We're
            # not interested in the score though, so we just extract the parsed-passage
            parsed_passage_and_score = next(
                self.__parser.parse([unparsed_passage], evaluate=True))
            internal_parsed_passage = parsed_passage_and_score[0]
            parsed_passage = TupaParser.__get_ucca_parsed_passage_from_passage(
                internal_parsed_passage)
        except Exception:
            # Preserve the original best-effort contract (None on failure).
            # The original achieved this accidentally: a 'return' inside
            # 'finally' silently swallowed every exception.
            parsed_passage = None
        finally:
            sys.stdout = reg_stdout
            devnull.close()  # the original leaked this file handle
        return parsed_passage
Beispiel #5
0
def parse_spacy(passages, lang, verbose=False):
    """Dependency-annotate passages via spaCy and rebuild each as a CoNLL-U-style passage.

    Yields (original_passage, parsed_passage) pairs.
    """
    # zip(passages) wraps each passage in a 1-tuple, matching as_tuples=True;
    # hence the 'passage,' unpacking below.
    for passage, in annotate_all(zip(passages),
                                 as_array=True,
                                 as_tuples=True,
                                 lang=lang,
                                 verbose=verbose):
        terminals = sorted(passage.layer(layer0.LAYER_ID).all,
                           key=operator.attrgetter("position"))
        # Index 0 is a dummy root node; real tokens start at index 1, aligned
        # with terminal positions.
        dep_nodes = [ConlluConverter.Node()] + [
            ConlluConverter.Node(t.position,
                                 terminal=t,
                                 token=ConlluConverter.Token(t.text, t.tag))
            for t in terminals
        ]
        for dep_node in dep_nodes[1:]:
            dep_node.token.paragraph = dep_node.terminal.paragraph
            # HEAD is stored in the annotation array; a nonzero value appears
            # to be a relative offset, converted to an absolute position here
            # — TODO confirm against the Attr.HEAD implementation.
            head = Attr.HEAD(dep_node.terminal.tok[Attr.HEAD.value])
            if head:
                head += dep_node.position
            rel = Attr.DEP(dep_node.terminal.tok[Attr.DEP.value],
                           lang=passage.attrib.get("lang", lang))
            assert head is not None and rel is not None, \
                "head=%r, rel=%r for token %d in:\n%s" % (head, rel, dep_node.position, " ".join(map(str, terminals)))
            edge = ConlluConverter.Edge(head, rel, remote=False)
            # Detach the terminal before linking so the converter builds fresh
            # terminals; all reads of dep_node.terminal happen above this line.
            dep_node.terminal = None
            edge.link_head(dep_nodes)
            dep_node.add_edges([edge])
        parsed = ConlluConverter().build_passage(dep_nodes, passage.ID)
        yield passage, parsed
Beispiel #6
0
def main(args):
    """Annotate passages per spec (CoNLL-U copy / UDPipe / StanfordNLP when configured) and write them out."""
    if not (args.as_array or args.as_extra):
        args.as_extra = True  # default to extra-dict annotation when neither mode was requested
    for spec in read_specs(args, converters=FROM_FORMAT_NO_PLACEHOLDERS):
        kwargs = dict(as_array=args.as_array, as_extra=args.as_extra,
                      verbose=args.verbose, lang=spec.lang)
        passages = spec.passages
        if spec.conllu:
            passages = copy_annotation(passages, spec.conllu, by_id=args.by_id, **kwargs)
        elif spec.udpipe:
            passages = annotate_udpipe(passages, spec.udpipe, **kwargs)
        elif spec.stanfordnlp:
            passages = annotate_stanfordnlp(passages, spec.stanfordnlp, **kwargs)
        if args.verbose:
            stream = passages
        else:
            stream = tqdm(passages, unit=" passages", desc="Annotating " + spec.out_dir)
        # Replace existing annotation when copying from CoNLL-U or when no
        # external annotator was configured.
        replace = spec.conllu or not (spec.udpipe or spec.stanfordnlp)
        for passage in annotate_all(stream, replace=replace, **kwargs):
            if passage.extra.get("format") == "amr" and args.as_array:
                from semstr.conversion.amr import AmrConverter
                AmrConverter.introduce_placeholders(passage)
            write_passage(passage, outdir=spec.out_dir, verbose=args.verbose, binary=args.binary)
Beispiel #7
0
def main(args):
    """Find terminals matching word/category/dependency filters and write the matches to a text file.

    Writes one line per match ("<passage ID> <functional parent>") to a file
    named after the active filters in each spec's output directory.
    """
    words = args.word or []
    categories = list(args.category or ())
    dependencies = list(args.dependency or ())
    if args.case_insensitive:
        words = list(map(str.lower, words))
    for spec in read_specs(args, converters=FROM_FORMAT):
        if args.dependency:
            # Dependency filtering needs annotation: UDPipe when available,
            # otherwise spaCy via annotate_all.
            spec.passages = annotate_udpipe(spec.passages, spec.udpipe) if spec.udpipe else \
                annotate_all(spec.passages, as_array=True, replace=not spec.udpipe, lang=spec.lang)
        t = tqdm(spec.passages, unit=" passages", desc="Finding")
        # BUG FIX: tqdm.set_postfix replaces the entire postfix on every call,
        # so the original back-to-back calls kept only the last filter shown;
        # build the postfix once and set it in a single call.
        postfix = {}
        if words:
            postfix["words"] = ",".join(words)
        if categories:
            postfix["categories"] = ",".join(categories)
        if dependencies:
            postfix["dependencies"] = ",".join(dependencies)
        if postfix:
            t.set_postfix(**postfix)
        found = 0
        filename = os.path.join(
            spec.out_dir, "_".join(words + categories + dependencies) + ".txt")
        with open(filename, "w", encoding="utf-8") as f:
            for passage in t:
                for terminal in passage.layer(layer0.LAYER_ID).all:
                    parent = terminal.parents[0]
                    word = terminal.text
                    if args.case_insensitive:
                        word = word.lower()
                    # A terminal matches only if it passes every active filter.
                    if (not words or word in words) and (
                            not categories or parent.ftag in categories) and (
                                not dependencies or get_annotation(
                                    terminal, spec.udpipe) in dependencies):
                        print(passage.ID, parent.fparent, file=f)
                        found += 1
                        # Re-send the filter postfix so the count doesn't wipe it.
                        t.set_postfix(found=found, **postfix)
        print("Wrote '%s'" % filename)
Beispiel #8
0
def main(args):
    """(Re)build a SQLite index of terminals (text, ftag, fparent, dependency) for each spec.

    Creates find.db in the spec's output directory with one row per terminal,
    indexed by passage ID, text, functional tag, and dependency.
    """
    for spec in read_specs(args, converters=FROM_FORMAT):
        spec.passages = annotate_udpipe(spec.passages, spec.udpipe) if spec.udpipe else \
            annotate_all(spec.passages, as_array=True, replace=not spec.udpipe, lang=spec.lang)
        filename = os.path.join(spec.out_dir, "find.db")
        with sqlite3.connect(filename) as conn:
            c = conn.cursor()
            # BUG FIX: a plain DROP TABLE raises sqlite3.OperationalError on a
            # fresh database; IF EXISTS makes the rebuild idempotent.
            c.execute("DROP TABLE IF EXISTS terminals")
            c.execute(
                "CREATE TABLE terminals (pid, tid, text, ftag, fparent, dep)")
            c.execute("CREATE INDEX idx_terminals_pid ON terminals (pid)")
            c.execute("CREATE INDEX idx_terminals_text ON terminals (text)")
            c.execute("CREATE INDEX idx_terminals_ftag ON terminals (ftag)")
            c.execute("CREATE INDEX idx_terminals_dep ON terminals (dep)")
            for passage in tqdm(spec.passages,
                                unit=" passages",
                                desc="Creating " + filename):
                rows = []
                for terminal in passage.layer(layer0.LAYER_ID).all:
                    parent = terminal.parents[0]
                    rows.append(
                        (passage.ID, terminal.ID, terminal.text, parent.ftag,
                         str(parent.fparent),
                         get_annotation(terminal, spec.udpipe)))
                c.executemany("INSERT INTO terminals VALUES (?,?,?,?,?,?)",
                              rows)
                # Commit per passage so progress survives an interruption.
                conn.commit()
Beispiel #9
0
def main(args):
    """Annotate all passages from the given files and write them to the output directory."""
    passages = get_passages_with_progress_bar(args.filenames, desc="Annotating")
    for passage in annotate_all(passages, replace=True, as_array=args.as_array,
                                verbose=args.verbose):
        # Sanity check: annotation must be complete before writing.
        assert is_annotated(
            passage, args.as_array), "Passage %s is not annotated" % passage.ID
        write_passage(passage, outdir=args.out_dir, verbose=args.verbose)
Beispiel #10
0
 def from_format(self, lines, passage_id, return_original=False, save_original=True, remove_cycles=True, **kwargs):
     """Generate passages built from the given AMR lines, annotated as token arrays."""
     self.passage_id = passage_id
     self.return_original = return_original
     self.save_original = save_original
     self.remove_cycles = remove_cycles
     # Extensions explicitly enabled via keyword arguments.
     self.extensions = [ext for ext in EXTENSIONS if kwargs.get(ext)]
     # Everything belonging to a disabled extension is excluded.
     self.excluded = set()
     for ext, items in EXTENSIONS.items():
         if ext not in self.extensions:
             self.excluded.update(items)
     annotated = textutil.annotate_all(self._init_passages(self._amr_generator(lines)),
                                       as_array=True, as_tuples=True)
     for passage, amr, amr_id in annotated:
         yield self._build_passage(passage, amr, amr_id)
Beispiel #11
0
 def parse(self, passages, display=True, write=False):
     """Parse the given passage(s), yielding one result per passage.

     :param passages: a single passage or an iterable of passages
     :param display: whether to print/update progress and statistics
     :param write: passed through to PassageParser.parse
     """
     passages, total = generate_and_len(single_to_iter(passages))
     if self.config.args.ignore_case:
         passages = to_lower_case(passages)
     pr_width = len(str(total))
     id_width = 1
     # Annotate lazily in a background thread (bounded queue) and wrap with a
     # progress bar when display is on.
     passages = self.add_progress_bar(ThreadedGenerator(
         textutil.annotate_all(
             passages,
             as_array=True,
             lang=self.config.args.lang,
             verbose=self.config.args.verbose > 2,
             vocab=self.model.config.vocab(lang=self.config.args.lang)),
         queue_maxsize=100),
                                      display=display)
     for i, passage in enumerate(passages, start=1):
         parser = PassageParser(passage, self.config, self.models,
                                self.training, self.evaluation)
         if self.config.args.verbose and display:
             # Verbose mode: print a one-line header per passage.
             progress = "%3d%% %*d/%d" % (
                 i / total * 100, pr_width, i,
                 total) if total and i <= total else "%d" % i
             id_width = max(id_width, len(str(passage.ID)))
             print("%s %2s %-6s %-*s" %
                   (progress, parser.lang, parser.in_format, id_width,
                    passage.ID),
                   end=self.config.line_end)
         else:
             # Non-verbose: show running statistics in the progress-bar postfix.
             passages.set_description()
             postfix = {parser.lang + " " + parser.in_format: passage.ID}
             if display:
                 postfix["|t/s|"] = self.tokens_per_second()
                 if self.correct_action_count:
                     postfix["|a|"] = percents_str(
                         self.correct_action_count,
                         self.action_count,
                         fraction=False)
                 if self.correct_label_count:
                     postfix["|l|"] = percents_str(self.correct_label_count,
                                                   self.label_count,
                                                   fraction=False)
                 if self.evaluation and self.num_passages:
                     postfix["|F1|"] = self.f1 / self.num_passages
             passages.set_postfix(**postfix)
         self.seen_per_format[parser.in_format] += 1
         # During training, optionally cap how many passages per input format
         # are used.
         if self.training and self.config.args.max_training_per_format and \
                 self.seen_per_format[parser.in_format] > self.config.args.max_training_per_format:
             self.config.print("skipped", level=1)
             continue
         assert not (self.training and parser.in_format
                     == "text"), "Cannot train on unannotated plain text"
         yield parser.parse(display=display, write=write)
         self.update_counts(parser)
     if self.num_passages and display:
         self.summary()
Beispiel #12
0
def main(args):
    """Annotate passages per spec (preferring UDPipe, then CoNLL-U copy) and write them out."""
    for spec in read_specs(args, converters=FROM_FORMAT):
        if spec.udpipe:
            spec.passages = annotate_udpipe(spec.passages, spec.udpipe,
                                            as_array=args.as_array, verbose=args.verbose)
        elif spec.conllu:
            spec.passages = copy_annotation(spec.passages, spec.conllu,
                                            as_array=args.as_array, verbose=args.verbose)
        stream = spec.passages if args.verbose else tqdm(
            spec.passages, unit=" passages", desc="Annotating " + spec.out_dir)
        for passage in annotate_all(stream, as_array=args.as_array,
                                    replace=not spec.udpipe, lang=spec.lang,
                                    verbose=args.verbose):
            write_passage(passage, outdir=spec.out_dir, verbose=args.verbose, binary=args.binary)
Beispiel #13
0
def main(args):
    """Convert each passage, logging every applied rule to a CSV report, and write the results."""
    textutil.BATCH_SIZE = 1  # annotate one passage at a time
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.outfile, "w", encoding="utf-8", newline="") as report:
        writer = csv.writer(report)
        writer.writerow(("rule", "passage", "terminal", "pos", "before", "after"))
        passages = get_passages_with_progress_bar(args.passages, desc="Converting")
        for passage in annotate_all(passages, verbose=args.verbose):
            convert_passage(passage, report_writer=writer)
            write_passage(passage, outdir=args.outdir, prefix=args.prefix, verbose=args.verbose)
            report.flush()  # keep the report current even if interrupted
    print("Wrote '%s'" % args.outfile)
Beispiel #14
0
def main(args):
    """Annotate passages from each spec and write them to its output directory."""
    for passages, out_dir, lang in read_specs(args):
        stream = passages if args.verbose else tqdm(
            passages, unit=" passages", desc="Annotating " + out_dir)
        for passage in annotate_all(stream, as_array=args.as_array,
                                    replace=True, lang=lang,
                                    verbose=args.verbose):
            write_passage(passage, outdir=out_dir,
                          verbose=args.verbose, binary=args.binary)
Beispiel #15
0
def main(args):
    """Convert passages one by one, recording rule applications to a CSV report file."""
    textutil.BATCH_SIZE = 1  # process a single passage per annotation batch
    os.makedirs(args.outdir, exist_ok=True)
    header = ("rule", "passage", "terminal", "pos", "before", "after")
    with open(args.outfile, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        annotated = annotate_all(
            get_passages_with_progress_bar(args.passages, desc="Converting"),
            verbose=args.verbose)
        for passage in annotated:
            convert_passage(passage, report_writer=writer)
            write_passage(passage, outdir=args.outdir,
                          prefix=args.prefix, verbose=args.verbose)
            f.flush()  # persist report rows as we go
    print("Wrote '%s'" % args.outfile)
Beispiel #16
0
 def parse(self, passages, display=True, write=False, accuracies=None):
     """Parse the given passage(s), yielding one result per passage.

     :param passages: a single passage or an iterable of passages
     :param display: whether to print/update progress and statistics
     :param write: passed through to PassageParser.parse
     :param accuracies: passed through to PassageParser.parse
     """
     passages, total = generate_and_len(single_to_iter(passages))
     if self.config.args.ignore_case:
         passages = to_lower_case(passages)
     pr_width = len(str(total))
     id_width = 1
     # Annotate lazily; wrap with a progress bar when display is on.
     passages = self.add_progress_bar(textutil.annotate_all(
         passages, as_array=True, as_extra=False, lang=self.config.args.lang, verbose=self.config.args.verbose > 2,
         vocab=self.model.config.vocab(lang=self.config.args.lang)), display=display)
     for i, passage in enumerate(passages, start=1):
         parser = PassageParser(passage, self.config, self.models, self.training, self.evaluation)
         if self.config.args.verbose and display:
             # Verbose mode: print a one-line header per passage.
             progress = "%3d%% %*d/%d" % (i / total * 100, pr_width, i, total) if total and i <= total else "%d" % i
             id_width = max(id_width, len(str(passage.ID)))
             print("%s %2s %-6s %-*s" % (progress, parser.lang, parser.in_format, id_width, passage.ID),
                   end=self.config.line_end)
         else:
             # Non-verbose: show running statistics in the progress-bar postfix.
             passages.set_description()
             postfix = {parser.lang + " " + parser.in_format: passage.ID}
             if display:
                 postfix["|t/s|"] = self.tokens_per_second()
                 if self.correct_action_count:
                     postfix["|a|"] = percents_str(self.correct_action_count, self.action_count, fraction=False)
                 if self.correct_label_count:
                     postfix["|l|"] = percents_str(self.correct_label_count, self.label_count, fraction=False)
                 if self.evaluation and self.num_passages:
                     postfix["|F1|"] = self.f1 / self.num_passages
             passages.set_postfix(**postfix)
         self.seen_per_format[parser.in_format] += 1
         # During training, optionally cap how many passages per input format
         # are used.
         if self.training and self.config.args.max_training_per_format and \
                 self.seen_per_format[parser.in_format] > self.config.args.max_training_per_format:
             self.config.print("skipped", level=1)
             continue
         assert not (self.training and parser.in_format == "text"), "Cannot train on unannotated plain text"
         yield parser.parse(display=display, write=write, accuracies=accuracies)
         self.update_counts(parser)
     if self.num_passages and display:
         self.summary()
Beispiel #17
0
 def from_format(self,
                 lines,
                 passage_id,
                 return_original=False,
                 save_original=True,
                 remove_cycles=True,
                 wikification=True,
                 placeholders=True,
                 **kwargs):
     """Generate passages built from the given lines.

     When ``placeholders`` is true, the initialized passages are first run
     through textutil.annotate_all (as arrays, as tuples) before building.
     """
     # Record the conversion options on the instance.
     options = dict(passage_id=passage_id,
                    return_original=return_original,
                    save_original=save_original,
                    remove_cycles=remove_cycles,
                    wikification=wikification,
                    placeholders=placeholders)
     for name, value in options.items():
         setattr(self, name, value)
     self.set_extensions(**kwargs)
     pairs = self._init_passages(self._amr_generator(lines), **kwargs)
     if placeholders:
         pairs = textutil.annotate_all(pairs, as_array=True, as_tuples=True)
     for passage, graph in pairs:
         yield self._build_passage(passage, graph)
Beispiel #18
0
def main(args):
    """Annotate every passage from the input files, verify completeness, and write the results."""
    annotated = annotate_all(get_passages_with_progress_bar(args.filenames, desc="Annotating"),
                             replace=True, as_array=args.as_array, verbose=args.verbose)
    for passage in annotated:
        assert is_annotated(passage, args.as_array), "Passage %s is not annotated" % passage.ID
        write_passage(passage, outdir=args.out_dir, verbose=args.verbose)