Example 1
def lix(text: Annotation = Annotation("<text>"),
        sentence: Annotation = Annotation("<sentence>"),
        word: Annotation = Annotation("<token:word>"),
        pos: Annotation = Annotation("<token:pos>"),
        out: Output = Output("<text>:readability.lix",
                             description="LIX values for text chunks"),
        skip_pos: List[str] = ["MAD", "MID", "PAD"],
        fmt: str = "%.2f"):
    """Create LIX annotation for text."""
    # Read annotation files and get parent_children relations
    text_children, _orphans = text.get_children(sentence)
    word_pos = list(word.read_attributes((word, pos)))
    sentence_children, _orphans = sentence.get_children(word)
    sentence_children = list(sentence_children)

    # Calculate LIX for every text element
    lix_annotation = []
    for text in text_children:
        in_sentences = []
        for sentence_index in text:
            s = sentence_children[sentence_index]
            in_sentences.append(
                list(
                    actual_words([word_pos[token_index] for token_index in s],
                                 skip_pos)))
        lix_annotation.append(fmt % lix_calc(in_sentences))

    out.write(lix_annotation)
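The helpers actual_words and lix_calc are defined elsewhere in the module and are not shown here. As a point of reference, a minimal stand-alone sketch of the standard LIX formula (average sentence length plus the percentage of words longer than six characters), which lix_calc is assumed to implement:

def lix_calc_sketch(sentences):
    """Hypothetical stand-in for lix_calc; sentences is a list of word lists."""
    n_sentences = len(sentences)
    words = [w for s in sentences for w in s]
    n_words = len(words)
    n_long = sum(1 for w in words if len(w) > 6)
    if n_sentences == 0 or n_words == 0:
        return 0.0
    # LIX = average sentence length + percentage of words longer than 6 characters
    return n_words / n_sentences + 100 * n_long / n_words
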
Example 2
def ovix(text: Annotation = Annotation("<text>"),
         word: Annotation = Annotation("<token:word>"),
         pos: Annotation = Annotation("<token:pos>"),
         out: Output = Output("<text>:readability.ovix",
                              description="OVIX values for text chunks"),
         skip_pos: List[str] = ["MAD", "MID", "PAD"],
         fmt: str = "%.2f"):
    """Create OVIX annotation for text."""
    text_children, _orphans = text.get_children(word)
    word_pos = list(word.read_attributes((word, pos)))

    # Calculate OVIX for every text element
    ovix_annotation = []
    for text in text_children:
        in_words = list(
            actual_words([word_pos[token_index] for token_index in text],
                         skip_pos))
        ovix_annotation.append(fmt % ovix_calc(in_words))

    out.write(ovix_annotation)
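Likewise, ovix_calc is not shown above; a minimal sketch assuming the standard OVIX (word variation index) formula over token and type counts:

import math

def ovix_calc_sketch(words):
    """Hypothetical stand-in for ovix_calc; words is a flat list of word forms."""
    tokens = len(words)
    types = len(set(words))
    if tokens == 0 or tokens == types:
        # Undefined for empty input or when every token is unique
        return 0.0
    # OVIX = log(tokens) / log(2 - log(types) / log(tokens))
    return math.log(tokens) / math.log(2 - math.log(types) / math.log(tokens))
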
Example 3
def relations(
        out: OutputData = OutputData("korp.relations"),
        word: Annotation = Annotation("<token:word>"),
        pos: Annotation = Annotation("<token:pos>"),
        lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
        dephead: Annotation = Annotation("<token:dephead>"),
        deprel: Annotation = Annotation("<token:deprel>"),
        sentence_id: Annotation = Annotation("<sentence>:misc.id"),
        ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
        baseform: Annotation = Annotation("<token>:saldo.baseform")):
    """Find certain dependencies between words, to be used by the Word Picture feature in Korp."""
    sentence_ids = sentence_id.read()
    sentence_tokens, _ = sentence_id.get_children(word)

    annotations = list(
        word.read_attributes(
            (word, pos, lemgram, dephead, deprel, ref, baseform)))

    # http://stp.ling.uu.se/~nivre/swedish_treebank/dep.html
    # Tuples with relations (head, rel, dep) to be found (with indexes) and an optional tuple specifying which info
    # should be stored and how
    rels = [
        # "han har sprungit" (he has run)
        ({1: "VB", 2: "SS", 3: "NN"}, {1: "VB", 4: "VG", 5: "VB"}, (5, 2, 3, "")),
        ({1: "VB", 2: "(SS|OO|IO|OA)", 3: "NN"},),
        ({1: "VB", 2: "(RA|TA)", 3: "(AB|NN)"},),
        # "ges vid behov" (given when needed)
        ({1: "VB", 2: "(RA|TA)", 3: "PP"}, {3: "PP", 4: "(PA|HD)", 5: "NN"}, (1, 2, 5, "%(3)s")),
        # "stor hund" (big dog)
        ({1: "NN", 2: "(AT|ET)", 3: "JJ"},),
        # "brödet som bakats" (the bread that was baked)
        ({1: "NN", 2: "ET", 3: "VB"}, {3: "VB", 4: "SS", 5: "HP"}, (1, 2, 3, "%(5)s")),
        # "barnen i skolan" (the children in school), "hundarna i Sverige" (the dogs in Sweden)
        ({1: "NN", 2: "ET", 3: "PP"}, {3: "PP", 4: "PA", 5: "(NN|PM)"}, (1, 2, 5, "%(3)s")),
        # "på bordet" (on the table)
        ({1: "PP", 2: "PA", 3: "NN"},),
        # "fullständigt galen" (completely crazy)
        ({1: "JJ", 2: "AA", 3: "AB"},),
    ]

    null_rels = [
        ("VB", ["OO"]),  # Verb som saknar objekt
    ]

    triples = []

    for sentid, sent in zip(sentence_ids, sentence_tokens):
        incomplete = {}  # Tokens looking for heads, with head as key
        tokens = {}  # Tokens in same sentence, with token_index as key

        # Link the tokens together
        for token_index in sent:
            token_word, token_pos, token_lem, token_dh, token_dr, token_ref, token_bf = annotations[
                token_index]
            token_word = token_word.lower()

            if token_lem == "|":
                token_lem = token_word

            this = {
                "pos": token_pos,
                "lemgram": token_lem,
                "word": token_word,
                "head": None,
                "dep": [],
                "ref": token_ref,
                "bf": token_bf
            }

            tokens[token_index] = this

            if token_dh != "-":
                token_dh = int(token_dh)
                # This token is looking for a head (token is not root)
                dep_triple = (token_dr, this)
                if token_dh in tokens:
                    # Found head. Link them together both ways
                    this["head"] = (token_dr, tokens[token_dh])
                    tokens[token_dh]["dep"].append(dep_triple)
                else:
                    incomplete.setdefault(token_dh, []).append(
                        (token_index, dep_triple))

            # Is someone else looking for the current token as head?
            if token_index in incomplete:
                for t in incomplete[token_index]:
                    # Store (deprel, head) to mirror the eager branch above
                    tokens[t[0]]["head"] = (t[1][0], this)
                    this["dep"].append(t[1])
                del incomplete[token_index]

        assert not incomplete, "incomplete is not empty"

        def _match(pattern, value):
            return bool(re.match(r"^%s$" % pattern, value))

        def _findrel(head, rel, dep):
            result = []
            if isinstance(head, dict):
                for d in head["dep"]:
                    if _match(rel, d[0]) and _match(dep, d[1]["pos"]):
                        result.append(d[1])
            if isinstance(dep, dict):
                h = dep["head"]
                if h and _match(rel, h[0]) and _match(head, h[1]["pos"]):
                    result.append(h[1])
            return result

        # Look for relations
        for v in list(tokens.values()):
            for d in v["dep"]:
                for rel in rels:
                    r = rel[0]
                    if _match(";".join([x[1] for x in sorted(r.items())]),
                              ";".join([v["pos"], d[0], d[1]["pos"]])):
                        triple = None
                        if len(rel) == 1:
                            triple = ((v["lemgram"], v["word"], v["pos"],
                                       v["ref"]), d[0],
                                      (d[1]["lemgram"], d[1]["word"],
                                       d[1]["pos"], d[1]["ref"]), ("", None),
                                      sentid, v["ref"], d[1]["ref"])
                        else:
                            lookup = dict(zip(map(str, sorted(r.keys())),
                                              (v, d[0], d[1])))
                            i = set(rel[0].keys()).intersection(rel[1].keys()).pop()
                            rel2 = [x[1] for x in sorted(rel[1].items())]
                            index1 = list(rel[0].keys()).index(i)
                            index2 = list(rel[1].keys()).index(i)
                            if index1 == 2 and index2 == 0:
                                result = _findrel(d[1], rel2[1], rel2[2])
                                if result:
                                    lookup.update(zip(map(str, sorted(rel[1].keys())),
                                                      (d[1], rel2[1], result[0])))
                            elif index1 == 0 and index2 == 0:
                                result = _findrel(v, rel2[1], rel2[2])
                                if result:
                                    lookup.update(zip(map(str, sorted(rel[1].keys())),
                                                      (v, rel2[1], result[0])))

                            pp = rel[-1]
                            if len(lookup) > 3:
                                lookup_bf = {key: val["bf"]
                                             for key, val in lookup.items()
                                             if isinstance(val, dict)}
                                lookup_ref = {key: val["ref"]
                                              for key, val in lookup.items()
                                              if isinstance(val, dict)}
                                triple = ((lookup[str(pp[0])]["lemgram"],
                                           lookup[str(pp[0])]["word"],
                                           lookup[str(pp[0])]["pos"],
                                           lookup[str(pp[0])]["ref"]),
                                          lookup[str(pp[1])],
                                          (lookup[str(pp[2])]["lemgram"],
                                           lookup[str(pp[2])]["word"],
                                           lookup[str(pp[2])]["pos"],
                                           lookup[str(pp[2])]["ref"]),
                                          (pp[3] % lookup_bf,
                                           pp[3] % lookup_ref), sentid,
                                          lookup[str(pp[0])]["ref"],
                                          lookup[str(pp[2])]["ref"])
                        if triple:
                            triples.extend(_mutate_triple(triple))
                            break
            token_rels = [d[0] for d in v["dep"]]
            for nrel in null_rels:
                if nrel[0] == v["pos"]:
                    missing_rels = [x for x in nrel[1] if x not in token_rels]
                    for mrel in missing_rels:
                        triple = ((v["lemgram"], v["word"], v["pos"],
                                   v["ref"]), mrel, ("", "", "", v["ref"]),
                                  ("", None), sentid, v["ref"], v["ref"])
                        triples.extend(_mutate_triple(triple))

    triples = sorted(set(triples))

    out_data = "\n".join([
        "\t".join(
            (head, headpos, rel, dep, deppos, extra, sentid, refhead, refdep,
             str(bfhead), str(bfdep), str(wfhead), str(wfdep)))
        for (head, headpos, rel, dep, deppos, extra, sentid, refhead, refdep,
             bfhead, bfdep, wfhead, wfdep) in triples
    ])
    out.write(out_data)
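To make the pattern matching above concrete, here is a small stand-alone reproduction of how one rels entry is tested against a head-relation-dependent triple (the token values are hypothetical; _match follows the definition in the function):

import re

def _match(pattern, value):
    return bool(re.match(r"^%s$" % pattern, value))

# First pattern of the "stor hund" (big dog) rule: head NN, relation AT or ET, dependent JJ
r = {1: "NN", 2: "(AT|ET)", 3: "JJ"}
pattern = ";".join(x[1] for x in sorted(r.items()))  # "NN;(AT|ET);JJ"

# A noun head with an attributive adjective dependent (hypothetical values)
head_pos, deprel, dep_pos = "NN", "AT", "JJ"
print(_match(pattern, ";".join([head_pos, deprel, dep_pos])))  # True
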
Example 4
def annotate(
        out_complemgrams: Output = Output(
            "<token>:saldo.complemgram",
            description="Compound analysis using lemgrams"),
        out_compwf: Output = Output(
            "<token>:saldo.compwf",
            description="Compound analysis using wordforms"),
        out_baseform: Output = Output(
            "<token>:saldo.baseform2",
            description="Baseform including baseforms derived from compounds"),
        word: Annotation = Annotation("<token:word>"),
        msd: Annotation = Annotation("<token:msd>"),
        baseform_tmp: Annotation = Annotation("<token>:saldo.baseform"),
        saldo_comp_model: Model = Model("[saldo.comp_model]"),
        nst_model: Model = Model("[saldo.comp_nst_model]"),
        stats_model: Model = Model("[saldo.comp_stats_model]"),
        complemgramfmt: str = util.SCORESEP + "%.3e",
        delimiter: str = util.DELIM,
        compdelim: str = util.COMPSEP,
        affix: str = util.AFFIX,
        cutoff: bool = True,
        saldo_comp_lexicon=None,
        stats_lexicon=None):
    """Divide compound words into prefix(es) and suffix.

    - out_complemgram is the resulting annotation file for compound lemgrams
      and their probabilities
    - out_compwf is the resulting annotation file for compound wordforms
    - out_baseform is the resulting annotation file for baseforms (including baseforms for compounds)
    - word and msd are existing annotations for wordforms and MSDs
    - baseform_tmp is the existing temporary annotation file for baseforms (not including compounds)
    - saldo_comp_model is the Saldo compound model
    - nst_model is the NST part of speech compound model
    - stats_model is the statistics model (pickled file)
    - complemgramfmt is a format string for how to print the complemgram and its probability
      (use an empty string to omit the probability)
    - saldo_comp_lexicon, stats_lexicon: these arguments cannot be set from the command line,
      but are used in the catapult. These arguments must be last.
    """
    ##################
    # Load models
    ##################
    if not saldo_comp_lexicon:
        saldo_comp_lexicon = SaldoCompLexicon(saldo_comp_model.path)

    with open(nst_model.path, "rb") as f:
        nst_model = pickle.load(f)

    if not stats_lexicon:
        stats_lexicon = StatsLexicon(stats_model.path)

    word_msd_baseform_annotations = list(
        word.read_attributes((word, msd, baseform_tmp)))

    # Create alternative lexicon (for words within the file)
    altlexicon = InFileLexicon(word_msd_baseform_annotations)

    ##################
    # Do annotation
    ##################
    complem_annotation = []
    compwf_annotation = []
    baseform_annotation = []

    previous_compounds = {}

    for word, msd, baseform_orig in word_msd_baseform_annotations:
        key = (word, msd)
        if key in previous_compounds:
            compounds = previous_compounds[key]
        else:
            compounds = compound(saldo_comp_lexicon, altlexicon, word, msd)

            if compounds:
                compounds = rank_compounds(compounds, nst_model, stats_lexicon)

                if cutoff:
                    # Only keep analyses with the same length (or +1) as the most probable one
                    best_length = len(compounds[0][1])
                    i = 0
                    for c in compounds:
                        if not best_length <= len(c[1]) <= best_length + 1:
                            break
                        i += 1
                    compounds = compounds[:i]

            previous_compounds[key] = compounds

        # Create complem and compwf annotations
        make_complem_and_compwf(complem_annotation, compwf_annotation,
                                complemgramfmt, compounds, compdelim,
                                delimiter, affix)

        # Create new baseform annotation if necessary
        if baseform_orig != affix:
            baseform_annotation.append(baseform_orig)
        else:
            make_new_baseforms(baseform_annotation, msd, compounds,
                               stats_lexicon, altlexicon, delimiter, affix)

    out_complemgrams.write(complem_annotation)
    out_compwf.write(compwf_annotation)
    out_baseform.write(baseform_annotation)
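The cutoff step above keeps only analyses whose segment count is at most one more than that of the top-ranked analysis. A self-contained illustration with hypothetical ranked data (each entry is a (score, segments) pair):

compounds = [
    (0.9, ("glas", "skål")),          # two segments (best analysis)
    (0.7, ("glass", "kål")),          # two segments
    (0.2, ("gla", "s", "skål")),      # three segments: still within best + 1
    (0.1, ("g", "la", "s", "skål")),  # four segments: cut off
]

best_length = len(compounds[0][1])
i = 0
for c in compounds:
    if not best_length <= len(c[1]) <= best_length + 1:
        break
    i += 1
compounds = compounds[:i]  # keeps the first three analyses
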
Example 5
def annotate(
        out_phrase: Output = Output("phrase_structure.phrase",
                                    description="Phrase segments"),
        out_phrase_name: Output = Output(
            "phrase_structure.phrase:phrase_structure.name",
            description="Phrase names"),
        out_phrase_func: Output = Output(
            "phrase_structure.phrase:phrase_structure.func",
            description="Phrase functions"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("<token:word>"),
        sentence: Annotation = Annotation("<sentence>"),
        pos: Annotation = Annotation("<token:pos>"),
        msd: Annotation = Annotation("<token:msd>"),
        ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
        dephead_ref: Annotation = Annotation("<token:dephead_ref>"),
        deprel: Annotation = Annotation("<token:deprel>")):
    """Annotate sentence with phrase structures."""
    sentences, _orphans = sentence.get_children(word)
    token_annotations = list(
        ref.read_attributes([ref, word, pos, msd, dephead_ref, deprel]))
    token_spans = list(token.read_spans())

    def get_token_span(index):
        return token_spans[index]

    nodes = []

    for s in sentences:
        tokenlist = [Token(None)]
        for token_index in s:
            token = token_annotations[token_index]
            tokenlist.append(Token(token))

        # Get PS tree
        sen = Sentence(tokenlist)
        if not sen.is_cyclic():
            tree = convert_sentence(sen).top.to_tree_str()
            # print(pprint.pformat(tree), file=sys.stderr)

            # Make nodes
            children = flatten_tree(tree[1], [])
            log.debug("\n\nSENTENCE:")
            position = 0
            open_elem_stack = []
            for child in children:
                if not child[0].startswith("WORD:"):
                    start_pos = get_token_span(s[position])[0]
                    open_elem_stack.append(child + (start_pos, ))
                    log.debug(
                        f"<phrase name={child[0]} func={child[1]}> {s[position]}"
                    )
                else:
                    # Close nodes
                    while open_elem_stack[-1][2] == child[2]:
                        start_pos = open_elem_stack[-1][3]
                        end_pos = get_token_span(s[position - 1])[1]
                        nodes.append(
                            ((start_pos, end_pos), open_elem_stack[-1][0],
                             open_elem_stack[-1][1]))
                        log.debug(
                            f"</phrase name={open_elem_stack[-1][0]} func={open_elem_stack[-1][1]}> {start_pos}-{end_pos}"
                        )
                        open_elem_stack.pop()
                    position += 1
                    log.debug(f"   {child[0][5:]}")

            # Close remaining open nodes
            end_pos = get_token_span(s[-1])[1]
            for elem in reversed(open_elem_stack):
                start_pos = elem[3]
                nodes.append(((start_pos, end_pos), elem[0], elem[1]))
                log.debug(
                    f"</phrase name={elem[0]} func={elem[1]}> {start_pos}-{end_pos}"
                )

    # Sort nodes
    sorted_nodes = sorted(nodes)

    # Write annotations
    out_phrase.write([i[0] for i in sorted_nodes])
    out_phrase_name.write([i[1] for i in sorted_nodes])
    out_phrase_func.write([i[2] for i in sorted_nodes])
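The final step writes three parallel annotations derived from the same sorted node list; a minimal illustration with hypothetical nodes (each node is ((start, end), phrase name, phrase function)):

nodes = [((13, 30), "VP", "ROOT"), ((0, 12), "NP", "SS")]
sorted_nodes = sorted(nodes)  # sorted by span start

spans = [n[0] for n in sorted_nodes]  # -> out_phrase
names = [n[1] for n in sorted_nodes]  # -> out_phrase_name
funcs = [n[2] for n in sorted_nodes]  # -> out_phrase_func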