Example #1
def lix(text: Annotation = Annotation("<text>"),
        sentence: Annotation = Annotation("<sentence>"),
        word: Annotation = Annotation("<token:word>"),
        pos: Annotation = Annotation("<token:pos>"),
        out: Output = Output("<text>:readability.lix",
                             description="LIX values for text chunks"),
        skip_pos: List[str] = ["MAD", "MID", "PAD"],
        fmt: str = "%.2f"):
    """Create LIX annotation for text."""
    # Read annotation files and get parent_children relations
    text_children, _orphans = text.get_children(sentence)
    word_pos = list(word.read_attributes((word, pos)))
    sentence_children, _orphans = sentence.get_children(word)
    sentence_children = list(sentence_children)

    # Calculate LIX for every text element
    lix_annotation = []
    for text_sentences in text_children:
        in_sentences = []
        for sentence_index in text_sentences:
            s = sentence_children[sentence_index]
            in_sentences.append(
                list(
                    actual_words([word_pos[token_index] for token_index in s],
                                 skip_pos)))
        lix_annotation.append(fmt % lix_calc(in_sentences))

    out.write(lix_annotation)
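The helpers actual_words and lix_calc are defined elsewhere in the module. Below is a minimal sketch of what they might look like, assuming the standard LIX definition (average sentence length plus the percentage of words longer than six characters); the names appear in the example above, but the bodies here are assumptions, not the pipeline's actual implementation.

def actual_words(token_pos_pairs, skip_pos):
    """Yield the word form of every (word, pos) pair whose POS tag is not in skip_pos."""
    for word, pos_tag in token_pos_pairs:
        if pos_tag not in skip_pos:
            yield word


def lix_calc(sentences):
    """LIX = words / sentences + 100 * long_words / words, where long words have more than 6 characters."""
    words = [w for sentence in sentences for w in sentence]
    if not sentences or not words:
        return 0.0
    long_words = sum(1 for w in words if len(w) > 6)
    return len(words) / len(sentences) + 100 * long_words / len(words)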
Example #2
def contextual(out: Output = Output("{chunk}:geo.geo_context", description="Geographical places with coordinates"),
               chunk: Annotation = Annotation("{chunk}"),
               context: Annotation = Annotation("[geo.context_chunk]"),
               ne_type: Annotation = Annotation("swener.ne:swener.type"),
               ne_subtype: Annotation = Annotation("swener.ne:swener.subtype"),
               ne_name: Annotation = Annotation("swener.ne:swener.name"),
               model: Model = Model("[geo.model]"),
               method: str = "populous",
               language: list = []):
    """Annotate chunks with location data, based on locations contained within the text.

    context = text chunk to use for disambiguating places (when applicable).
    chunk = text chunk to which the annotation will be added.
    """
    model = load_model(model, language=language)

    ne_type_annotation = list(ne_type.read())
    ne_subtype_annotation = list(ne_subtype.read())
    ne_name_annotation = list(ne_name.read())

    children_context_chunk, _orphans = context.get_children(chunk)
    children_chunk_ne, _orphans = chunk.get_children(ne_type)

    out_annotation = chunk.create_empty_attribute()

    for chunks in children_context_chunk:
        all_locations = []  # TODO: Maybe not needed for anything?
        context_locations = []
        chunk_locations = defaultdict(list)

        for ch in chunks:
            for n in children_chunk_ne[ch]:
                if ne_type_annotation[n] == "LOC" and "PPL" in ne_subtype_annotation[n]:
                    location_text = ne_name_annotation[n].replace("\n", " ").replace("  ", " ")
                    location_data = model.get(location_text.lower())
                    if location_data:
                        all_locations.append((location_text, list(location_data)))
                        context_locations.append((location_text, list(location_data)))
                        chunk_locations[ch].append((location_text, list(location_data)))
                    else:
                        pass
                        # log.info("No location found for %s" % ne_name_annotation[n].replace("%", "%%"))

        chunk_locations = most_populous(chunk_locations)

        for c in chunks:
            out_annotation[c] = _format_location(chunk_locations.get(c, ()))

    out.write(out_annotation)
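most_populous and _format_location are defined elsewhere. The sketch below shows one plausible shape for the disambiguation step, assuming each location candidate carries its population as the last element of its data tuple; both the data layout and the body are assumptions, not the module's actual code.

def most_populous(chunk_locations):
    """For each chunk, keep only the most populous database entry for every place name (assumed layout)."""
    result = {}
    for chunk_index, candidates in chunk_locations.items():
        picked = set()
        for location_text, location_data in candidates:
            # Assumption: every entry ends with a population figure, e.g. (name, country, lat, lon, population)
            best = max(location_data, key=lambda entry: entry[-1])
            picked.add((location_text, tuple(best)))
        result[chunk_index] = picked
    return result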
Example #3
def msdtag(out: Output = Output(
    "<token>:hunpos.msd",
    cls="token:msd",
    description="Part-of-speeches with morphological descriptions"),
           word: Annotation = Annotation("<token:word>"),
           sentence: Annotation = Annotation("<sentence>"),
           binary: Binary = Binary("[hunpos.binary]"),
           model: Model = Model("[hunpos.model]"),
           morphtable: Optional[Model] = Model("[hunpos.morphtable]"),
           patterns: Optional[Model] = Model("[hunpos.patterns]"),
           tag_mapping=None,
           encoding: str = util.UTF8):
    """POS/MSD tag using the Hunpos tagger."""
    if isinstance(tag_mapping, str) and tag_mapping:
        tag_mapping = util.tagsets.mappings[tag_mapping]
    elif tag_mapping is None or tag_mapping == "":
        tag_mapping = {}

    pattern_list = []

    if patterns:
        with open(patterns.path, encoding="utf-8") as pat:
            for line in pat:
                if line.strip() and not line.startswith("#"):
                    name, pattern, tags = line.strip().split("\t", 2)
                    pattern_list.append(
                        (name, re.compile("^%s$" % pattern), tags))

    def replace_word(w):
        """Replace word with alias if word matches a regex pattern."""
        for p in pattern_list:
            if re.match(p[1], w):
                return "[[%s]]" % p[0]
        return w

    sentences, _orphans = sentence.get_children(word)
    token_word = list(word.read())
    stdin = SENT_SEP.join(
        TOK_SEP.join(
            replace_word(token_word[token_index]) for token_index in sent)
        for sent in sentences)
    args = [model.path]
    if morphtable:
        args.extend(["-m", morphtable.path])
    stdout, _ = util.system.call_binary(binary, args, stdin, encoding=encoding)

    out_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_index, tagged_token in zip(
                sent,
                tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            tag = tag_mapping.get(tag, tag)
            out_annotation[token_index] = tag

    out.write(out_annotation)
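The separator and column constants used above are module-level and not shown. The values below are assumptions that make the example self-contained; the real module may define them differently.

# Assumed constants: Hunpos reads one token per line and writes "word<TAB>tag" lines,
# with a blank line between sentences.
SENT_SEP = "\n\n"
TOK_SEP = "\n"
TAG_SEP = "\t"
TAG_COLUMN = 1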
Example #4
def annotate(
        lang: Language = Language(),
        model: Model = Model("[treetagger.model]"),
        tt_binary: Binary = Binary("[treetagger.binary]"),
        out_upos: Output = Output("<token>:treetagger.upos",
                                  cls="token:upos",
                                  description="Part-of-speeches in UD"),
        out_pos: Output = Output(
            "<token>:treetagger.pos",
            cls="token:pos",
            description="Part-of-speeches from TreeTagger"),
        out_baseform: Output = Output("<token>:treetagger.baseform",
                                      description="Baseforms from TreeTagger"),
        word: Annotation = Annotation("<token:word>"),
        sentence: Annotation = Annotation("<sentence>"),
        encoding: str = util.UTF8):
    """POS/MSD tag and lemmatize using TreeTagger."""
    sentences, _orphans = sentence.get_children(word)
    word_annotation = list(word.read())
    stdin = SENT_SEP.join(
        TOK_SEP.join(word_annotation[token_index] for token_index in sent)
        for sent in sentences)
    args = ["-token", "-lemma", "-no-unknown", "-eos-tag", "<eos>", model.path]

    stdout, stderr = util.system.call_binary(tt_binary,
                                             args,
                                             stdin,
                                             encoding=encoding)
    log.debug("Message from TreeTagger:\n%s", stderr)

    # Write pos and upos annotations.
    out_upos_annotation = word.create_empty_attribute()
    out_pos_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent,
                                          tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            out_pos_annotation[token_id] = tag
            out_upos_annotation[token_id] = util.tagsets.pos_to_upos(
                tag, lang, TAG_SETS.get(lang))
    out_pos.write(out_pos_annotation)
    out_upos.write(out_upos_annotation)

    # Write lemma annotations.
    out_lemma_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent,
                                          tagged_sent.strip().split(TOK_SEP)):
            lem = tagged_token.strip().split(TAG_SEP)[LEM_COLUMN]
            out_lemma_annotation[token_id] = lem
    out_baseform.write(out_lemma_annotation)
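As in the previous example, the separators and column indices are module-level constants that are not shown. A plausible set of values, assuming TreeTagger is run with "-token -lemma" and therefore prints "word<TAB>tag<TAB>lemma" per token; these are assumptions, not the module's actual definitions.

SENT_SEP = "\n<eos>\n"  # matches the "-eos-tag <eos>" argument passed to the binary
TOK_SEP = "\n"
TAG_SEP = "\t"
TAG_COLUMN = 1          # POS tag column in the TreeTagger output
LEM_COLUMN = 2          # lemma column in the TreeTagger output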
Example #5
def number_relative(out: Output = Output("{annotation}:misc.number_rel_{parent}"),
                    parent: Annotation = Annotation("{parent}"),
                    child: Annotation = Annotation("{annotation}"),
                    prefix: str = "",
                    zfill: bool = False,
                    start: int = START_DEFAULT):
    """Number chunks by their relative position within a parent."""
    parent_children, _orphans = parent.get_children(child)

    out.write(("{prefix}{nr:0{length}d}".format(prefix=prefix,
                                                length=len(str(len(parent) - 1 + start))
                                                if zfill else 0,
                                                nr=cnr)
               for parent in parent_children
               for cnr, _index in enumerate(parent, start)))
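A small worked example of the format string used above, with made-up values, to show what the zero padding does.

# Illustration only: prefix="s", start=1, zfill=True, and a parent with 12 children.
# The field width is len(str(12 - 1 + 1)) == 2, so the children are numbered s01 ... s12.
prefix, start, n_children = "s", 1, 12
length = len(str(n_children - 1 + start))
print(["{prefix}{nr:0{length}d}".format(prefix=prefix, length=length, nr=nr)
       for nr in range(start, start + n_children)][:3])  # ['s01', 's02', 's03']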
Example #6
def number_by_parent(out: Output = Output("{annotation}:misc.number_by_parent_{parent_annotation}__{parent_attribute}"),
                     chunk: Annotation = Annotation("{annotation}"),
                     parent_order: Annotation = Annotation("{parent_annotation}:{parent_attribute}"),
                     prefix: str = "",
                     zfill: bool = False,
                     start: int = START_DEFAULT):
    """Number chunks by (parent_order, chunk order)."""
    parent_children, _orphans = parent_order.get_children(chunk)

    child_order = {child_index: (parent_nr, child_index)
                   for parent_index, parent_nr in enumerate(parent_order.read())
                   for child_index in parent_children[parent_index]}

    def _order(index, _value):
        return child_order.get(index)

    _read_chunks_and_write_new_ordering(out, chunk, _order, prefix, zfill, start)
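_read_chunks_and_write_new_ordering is shared with the other numbering functions in the module and is not shown. A plausible sketch, assuming collections.defaultdict is imported and that chunks sharing the same sort key get the same number; this is an assumption about the helper, not its actual implementation.

def _read_chunks_and_write_new_ordering(out, chunk, order, prefix="", zfill=False, start=START_DEFAULT):
    """Number chunks according to the sort key returned by order(index, value) and write the result."""
    new_order = defaultdict(list)
    for index, value in enumerate(chunk.read()):
        new_order[order(index, value)].append(index)

    out_annotation = chunk.create_empty_attribute()
    nr_digits = len(str(len(new_order) - 1 + start))
    for nr, key in enumerate(sorted(new_order), start):
        formatted = "{prefix}{nr:0{length}d}".format(
            prefix=prefix, length=nr_digits if zfill else 0, nr=nr)
        for index in new_order[key]:
            out_annotation[index] = formatted
    out.write(out_annotation)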
Example #7
def nominal_ratio(text: Annotation = Annotation("<text>"),
                  pos: Annotation = Annotation("<token:pos>"),
                  out: Output = Output(
                      "<text>:readability.nk",
                      description="Nominal ratios for text chunks"),
                  noun_pos: List[str] = ["NN", "PP", "PC"],
                  verb_pos: List[str] = ["PN", "AB", "VB"],
                  fmt: str = "%.2f"):
    """Create nominal ratio annotation for text."""
    text_children, _orphans = text.get_children(pos)
    pos_annotation = list(pos.read())

    # Calculate nominal ratio for every text element
    nk_annotation = []
    for text_tokens in text_children:
        in_pos = [pos_annotation[token_index] for token_index in text_tokens]
        nk_annotation.append(fmt %
                             nominal_ratio_calc(in_pos, noun_pos, verb_pos))
    out.write(nk_annotation)
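nominal_ratio_calc is not shown above. A minimal sketch, assuming the usual definition of nominal ratio (nouns, prepositions and participles divided by pronouns, adverbs and verbs, which matches the default noun_pos and verb_pos tag lists); the body is an assumption, not the pipeline's actual implementation.

def nominal_ratio_calc(pos_tags, noun_pos, verb_pos):
    """Nominal ratio = count of nominal tags / count of verbal tags."""
    nominal_count = sum(1 for tag in pos_tags if tag in noun_pos)
    verbal_count = sum(1 for tag in pos_tags if tag in verb_pos)
    if verbal_count == 0:
        return 0.0  # avoid division by zero for chunks without verbal tags
    return nominal_count / verbal_count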
Example #8
def ovix(text: Annotation = Annotation("<text>"),
         word: Annotation = Annotation("<token:word>"),
         pos: Annotation = Annotation("<token:pos>"),
         out: Output = Output("<text>:readability.ovix",
                              description="OVIX values for text chunks"),
         skip_pos: List[str] = ["MAD", "MID", "PAD"],
         fmt: str = "%.2f"):
    """Create OVIX annotation for text."""
    text_children, _orphans = text.get_children(word)
    word_pos = list(word.read_attributes((word, pos)))

    # Calculate OVIX for every text element
    ovix_annotation = []
    for text_tokens in text_children:
        in_words = list(
            actual_words([word_pos[token_index] for token_index in text_tokens],
                         skip_pos))
        ovix_annotation.append(fmt % ovix_calc(in_words))

    out.write(ovix_annotation)
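ovix_calc is not shown above. A minimal sketch, assuming the standard OVIX (word variation index) formula; the guards and the body are assumptions, not the pipeline's actual implementation.

import math

def ovix_calc(words):
    """OVIX = log(tokens) / log(2 - log(types) / log(tokens))."""
    tokens = len(words)
    types = len(set(words))
    if tokens <= 1 or types == 0 or types == tokens:
        return 0.0  # degenerate cases where the formula is undefined
    return math.log(tokens) / math.log(2 - math.log(types) / math.log(tokens))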
Example #9
def annotate(
        maltjar: Binary = Binary("[malt.jar]"),
        model: Model = Model("[malt.model]"),
        out_dephead: Output = Output(
            "<token>:malt.dephead",
            cls="token:dephead",
            description="Positions of the dependency heads"),
        out_dephead_ref: Output = Output(
            "<token>:malt.dephead_ref",
            cls="token:dephead_ref",
            description="Sentence-relative positions of the dependency heads"),
        out_deprel: Output = Output(
            "<token>:malt.deprel",
            cls="token:deprel",
            description="Dependency relations to the head"),
        word: Annotation = Annotation("<token:word>"),
        pos: Annotation = Annotation("<token:pos>"),
        msd: Annotation = Annotation("<token:msd>"),
        ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
        sentence: Annotation = Annotation("<sentence>"),
        token: Annotation = Annotation("<token>"),
        encoding: str = util.UTF8,
        process_dict=None):
    """
    Run the Malt parser, either in an already started process defined in process_dict or in a new process (default).

    The process_dict argument should never be set from the command line.
    """
    if process_dict is None:
        process = maltstart(maltjar, model, encoding)
    else:
        process = process_dict["process"]
        # If the process seems dead, spawn a new one
        if process.stdin.closed or process.stdout.closed or process.poll():
            util.system.kill_process(process)
            process = maltstart(maltjar,
                                model,
                                encoding,
                                send_empty_sentence=True)
            process_dict["process"] = process

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    word_annotation = list(word.read())
    pos_annotation = list(pos.read())
    msd_annotation = list(msd.read())
    ref_annotation = list(ref.read())

    def conll_token(nr, token_index):
        form = word_annotation[token_index]
        lemma = UNDEF
        pos = cpos = pos_annotation[token_index]
        feats = re.sub(r"[ ,.]", "|",
                       msd_annotation[token_index]).replace("+", "/")
        return TAG_SEP.join((str(nr), form, lemma, cpos, pos, feats))

    stdin = SENT_SEP.join(
        TOK_SEP.join(
            conll_token(n + 1, token_index)
            for n, token_index in enumerate(sent)) for sent in sentences)

    if encoding:
        stdin = stdin.encode(encoding)

    keep_process = (len(stdin) < RESTART_THRESHOLD_LENGTH
                    and process_dict is not None)
    log.info("Stdin length: %s, keep process: %s", len(stdin), keep_process)

    if process_dict is not None:
        process_dict["restart"] = not keep_process

    if keep_process:
        # Chatting with malt: send a SENT_SEP and read correct number of lines
        stdin_fd, stdout_fd = process.stdin, process.stdout
        stdin_fd.write(stdin + SENT_SEP.encode(util.UTF8))
        stdin_fd.flush()

        malt_sentences = []
        for sent in sentences:
            malt_sent = []
            for _ in sent:
                line = stdout_fd.readline()
                if encoding:
                    line = line.decode(encoding)
                malt_sent.append(line)
            line = stdout_fd.readline()
            assert line == b"\n"
            malt_sentences.append(malt_sent)
    else:
        # Otherwise use communicate which buffers properly
        stdout, _ = process.communicate(stdin)
        if encoding:
            stdout = stdout.decode(encoding)
        malt_sentences = (malt_sent.split(TOK_SEP)
                          for malt_sent in stdout.split(SENT_SEP))

    out_dephead_annotation = word.create_empty_attribute()
    out_dephead_ref_annotation = out_dephead_annotation.copy()
    out_deprel_annotation = out_dephead_annotation.copy()
    for (sent, malt_sent) in zip(sentences, malt_sentences):
        for (token_index, malt_tok) in zip(sent, malt_sent):
            cols = [(None if col == UNDEF else col)
                    for col in malt_tok.split(TAG_SEP)]
            out_deprel_annotation[token_index] = cols[DEPREL_COLUMN]
            head = int(cols[HEAD_COLUMN])
            out_dephead_annotation[token_index] = str(sent[head - 1]) if head else "-"
            out_dephead_ref_annotation[token_index] = str(
                ref_annotation[sent[head - 1]]) if head else ""

    out_dephead.write(out_dephead_annotation)
    out_dephead_ref.write(out_dephead_ref_annotation)
    out_deprel.write(out_deprel_annotation)
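The constants driving the CoNLL round trip above (SENT_SEP, TOK_SEP, TAG_SEP, UNDEF, HEAD_COLUMN, DEPREL_COLUMN, RESTART_THRESHOLD_LENGTH) are module-level and not shown. A plausible set of values, assuming MaltParser reads and writes CoNLL-X-style tab-separated columns; these are assumptions, not the module's actual definitions.

SENT_SEP = "\n\n"                 # blank line between sentences
TOK_SEP = "\n"                    # one token per line
TAG_SEP = "\t"                    # columns: ID FORM LEMMA CPOSTAG POSTAG FEATS HEAD DEPREL ...
UNDEF = "_"                       # CoNLL placeholder for empty fields
HEAD_COLUMN = 6                   # index of the dependency head column in the parser output
DEPREL_COLUMN = 7                 # index of the dependency relation column in the parser output
RESTART_THRESHOLD_LENGTH = 64000  # assumed input size above which the long-lived process is restarted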
Example #10
def annotate(token: Annotation = Annotation("<token>"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             reference: Annotation = Annotation(
                 "<token>:misc.number_rel_<sentence>"),
             out_sense: Output = Output("<token>:saldo.sense",
                                        cls="token:sense",
                                        description="SALDO identifier"),
             out_lemgram: Output = Output("<token>:saldo.lemgram",
                                          description="SALDO lemgram"),
             out_baseform: Output = Output("<token>:saldo.baseform",
                                           cls="token:baseform",
                                           description="Baseform from SALDO"),
             models: List[Model] = [Model("[saldo.model]")],
             msd: Optional[Annotation] = Annotation("<token:msd>"),
             delimiter: str = util.DELIM,
             affix: str = util.AFFIX,
             precision: str = Config("saldo.precision"),
             precision_filter: str = "max",
             min_precision: float = 0.66,
             skip_multiword: bool = False,
             allow_multiword_overlap: bool = False,
             word_separator: str = "",
             lexicons=None):
    """Use the Saldo lexicon model (and optionally other older lexicons) to annotate pos-tagged words.

    - token, word, msd, sentence, reference: existing annotations
    - out_baseform, out_lemgram, out_sense: resulting annotations to be written
    - models: a list of pickled lexica, typically the Saldo model (saldo.pickle)
      and optional lexicons for older Swedish.
    - delimiter: delimiter character to put between ambiguous results
    - affix: an optional character to put before and after results
    - precision: a format string for how to print the precision for each annotation, e.g. ":%.3f"
      (use empty string for no precision)
    - precision_filter: an optional filter; the following values are currently supported:
        max: only use the annotations that are most probable
        first: only use the most probable annotation (or one of the most probable if more than one)
        none: use all annotations
    - min_precision: only use annotations with a probability score higher than this
    - skip_multiword: set to True to disable multi word annotations
    - allow_multiword_overlap: by default we do some cleanup among overlapping multi word annotations.
      By setting this to True, all overlaps will be allowed.
    - word_separator: an optional character used to split the values of "word" into several word variations
    - lexicons: this argument cannot be set from the command line, but is used in the catapult.
      This argument must be last.
    """
    # Allow use of multiple lexicons
    models_list = [(m.path.stem, m) for m in models]
    if not lexicons:
        lexicon_list = [(name, SaldoLexicon(lex.path))
                        for name, lex in models_list]
    # Use pre-loaded lexicons (from catapult)
    else:
        lexicon_list = []
        for name, _lex in models_list:
            assert lexicons.get(name) is not None, "Lexicon %s not found!" % name
            lexicon_list.append((name, lexicons[name]))

    # Maximum number of gaps in multi-word units.
    # TODO: Set to 0 for hist-mode? since many (most?) multi-word in the old lexicons are inseparable (half öre etc)
    max_gaps = 1

    # Combine annotation names in the SALDO lexicon with the output annotations
    annotations = []
    if out_baseform:
        annotations.append((out_baseform, "gf"))
    if out_lemgram:
        annotations.append((out_lemgram, "lem"))
    if out_sense:
        annotations.append((out_sense, "saldo"))

    if skip_multiword:
        log.info("Skipping multi word annotations")

    min_precision = float(min_precision)

    # If min_precision is 0, skip almost all part-of-speech checking (verb multi-word expressions still won't be
    # allowed to span over other verbs)
    skip_pos_check = (min_precision == 0.0)

    word_annotation = list(word.read())
    ref_annotation = list(reference.read())
    if msd:
        msd_annotation = list(msd.read())

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    out_annotation = word.create_empty_attribute()

    for sent in sentences:
        incomplete_multis = []  # [{annotation, words, [ref], is_particle, lastwordWasGap, numberofgaps}]
        complete_multis = []  # ([ref], annotation)
        sentence_tokens = {}

        for token_index in sent:
            theword = word_annotation[token_index]
            ref = ref_annotation[token_index]
            msdtag = msd_annotation[token_index] if msd else ""

            annotation_info = {}
            sentence_tokens[ref] = {
                "token_index": token_index,
                "annotations": annotation_info
            }

            # Support for multiple values of word
            if word_separator:
                thewords = [w for w in theword.split(word_separator) if w]
            else:
                thewords = [theword]

            # First use MSD tags to find the most probable single word annotations
            ann_tags_words = find_single_word(thewords, lexicon_list, msdtag,
                                              precision, min_precision,
                                              precision_filter,
                                              annotation_info)

            # Find multi-word expressions
            if not skip_multiword:
                find_multiword_expressions(incomplete_multis, complete_multis,
                                           thewords, ref, msdtag, max_gaps,
                                           ann_tags_words, msd_annotation,
                                           sent, skip_pos_check)

            # Loop to next token

        if not allow_multiword_overlap:
            # Check that we don't have any unwanted overlaps
            remove_unwanted_overlaps(complete_multis)

        # Then save the rest of the multi word expressions in sentence_tokens
        save_multiwords(complete_multis, sentence_tokens)

        for tok in list(sentence_tokens.values()):
            out_annotation[tok["token_index"]] = _join_annotation(
                tok["annotations"], delimiter, affix)

        # Loop to next sentence

    for out_annotation_obj, annotation_name in annotations:
        out_annotation_obj.write(
            [v.get(annotation_name, delimiter) for v in out_annotation])
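_join_annotation turns the per-token annotation_info dict into the delimited strings that are finally written. A minimal sketch, assuming each value list should be joined with the delimiter and wrapped in the affix character (the usual cwbset-style encoding); this is an assumption, not the module's actual helper.

def _join_annotation(annotation_info, delimiter, affix):
    """Join each list of values into one delimited string wrapped in the affix character."""
    return {name: (affix + delimiter.join(values) + affix) if values else affix
            for name, values in annotation_info.items()}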
Example #11
def annotate(
        wsdjar: Binary = Binary("[wsd.jar]"),
        sense_model: Model = Model("[wsd.sense_model]"),
        context_model: Model = Model("[wsd.context_model]"),
        out: Output = Output(
            "<token>:wsd.sense",
            cls="token:sense",
            description="Sense disambiguated SALDO identifiers"),
        sentence: Annotation = Annotation("<sentence>"),
        word: Annotation = Annotation("<token:word>"),
        ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
        lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
        saldo: Annotation = Annotation("<token>:saldo.sense"),
        pos: Annotation = Annotation("<token:pos>"),
        token: Annotation = Annotation("<token>"),
        prob_format: str = Config("wsd.prob_format"),
        default_prob: float = Config("wsd.default_prob"),
        encoding: str = util.UTF8):
    """Run the word sense disambiguation tool (saldowsd.jar) to add probabilities to the saldo annotation.

    Unanalyzed senses (e.g. multiword expressions) receive the probability value given by default_prob.
      - wsdjar is the name of the Java program to be used for the WSD
      - sense_model and context_model are the models to be used with wsdjar
      - out is the resulting annotation file
      - sentence is an existing annotation for sentences and their children (words)
      - word is an existing annotation for word forms
      - ref is an existing annotation for word references
      - lemgram and saldo are existing annotations for inflection tables and meanings
      - pos is an existing annotation for part-of-speech
      - prob_format is a format string for how to print the sense probability
      - default_prob is the default value for unanalyzed senses
    """
    word_annotation = list(word.read())
    ref_annotation = list(ref.read())
    lemgram_annotation = list(lemgram.read())
    saldo_annotation = list(saldo.read())
    pos_annotation = list(pos.read())

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    # Start WSD process
    process = wsd_start(wsdjar, sense_model.path, context_model.path, encoding)

    # Construct input and send to WSD
    stdin = build_input(sentences, word_annotation, ref_annotation,
                        lemgram_annotation, saldo_annotation, pos_annotation)

    if encoding:
        stdin = stdin.encode(encoding)

    stdout, stderr = process.communicate(stdin)
    # TODO: Solve hack line below!
    # Problem is that regular messages "Reading sense vectors.." are also piped to stderr.
    if len(stderr) > 52:
        util.system.kill_process(process)
        log.error(str(stderr))
        return

    if encoding:
        stdout = stdout.decode(encoding)

    process_output(word, out, stdout, sentences, saldo_annotation, prob_format,
                   default_prob)

    # Kill running subprocess
    util.system.kill_process(process)
    return
Example #12
def relations(
        out: OutputData = OutputData("korp.relations"),
        word: Annotation = Annotation("<token:word>"),
        pos: Annotation = Annotation("<token:pos>"),
        lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
        dephead: Annotation = Annotation("<token:dephead>"),
        deprel: Annotation = Annotation("<token:deprel>"),
        sentence_id: Annotation = Annotation("<sentence>:misc.id"),
        ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
        baseform: Annotation = Annotation("<token>:saldo.baseform")):
    """Find certain dependencies between words, to be used by the Word Picture feature in Korp."""
    sentence_ids = sentence_id.read()
    sentence_tokens, _ = sentence_id.get_children(word)

    annotations = list(
        word.read_attributes(
            (word, pos, lemgram, dephead, deprel, ref, baseform)))

    # http://stp.ling.uu.se/~nivre/swedish_treebank/dep.html
    # Tuples with relations (head, rel, dep) to be found (with indexes) and an optional tuple specifying which info
    # should be stored and how
    rels = [
        ({1: "VB", 2: "SS", 3: "NN"},
         {1: "VB", 4: "VG", 5: "VB"}, (5, 2, 3, "")),              # "han har sprungit"
        ({1: "VB", 2: "(SS|OO|IO|OA)", 3: "NN"},),
        ({1: "VB", 2: "(RA|TA)", 3: "(AB|NN)"},),
        ({1: "VB", 2: "(RA|TA)", 3: "PP"},
         {3: "PP", 4: "(PA|HD)", 5: "NN"}, (1, 2, 5, "%(3)s")),    # "ges vid behov"
        ({1: "NN", 2: "(AT|ET)", 3: "JJ"},),                       # "stor hund"
        ({1: "NN", 2: "ET", 3: "VB"},
         {3: "VB", 4: "SS", 5: "HP"}, (1, 2, 3, "%(5)s")),         # "brödet som bakats"
        ({1: "NN", 2: "ET", 3: "PP"},
         {3: "PP", 4: "PA", 5: "(NN|PM)"}, (1, 2, 5, "%(3)s")),    # "barnen i skolan", "hundarna i Sverige"
        ({1: "PP", 2: "PA", 3: "NN"},),                            # "på bordet"
        ({1: "JJ", 2: "AA", 3: "AB"},)                             # "fullständigt galen"
    ]

    null_rels = [
        ("VB", ["OO"]),  # Verbs lacking an object
    ]

    triples = []

    for sentid, sent in zip(sentence_ids, sentence_tokens):
        incomplete = {}  # Tokens looking for heads, with head as key
        tokens = {}  # Tokens in same sentence, with token_index as key

        # Link the tokens together
        for token_index in sent:
            token_word, token_pos, token_lem, token_dh, token_dr, token_ref, token_bf = annotations[
                token_index]
            token_word = token_word.lower()

            if token_lem == "|":
                token_lem = token_word

            this = {
                "pos": token_pos,
                "lemgram": token_lem,
                "word": token_word,
                "head": None,
                "dep": [],
                "ref": token_ref,
                "bf": token_bf
            }

            tokens[token_index] = this

            if not token_dh == "-":
                token_dh = int(token_dh)
                # This token is looking for a head (token is not root)
                dep_triple = (token_dr, this)
                if token_dh in tokens:
                    # Found head. Link them together both ways
                    this["head"] = (token_dr, tokens[token_dh])
                    tokens[token_dh]["dep"].append(dep_triple)
                else:
                    incomplete.setdefault(token_dh, []).append(
                        (token_index, dep_triple))

            # Is someone else looking for the current token as head?
            if token_index in incomplete:
                for t in incomplete[token_index]:
                    tokens[t[0]]["head"] = this
                    this["dep"].append(t[1])
                del incomplete[token_index]

        assert not incomplete, "incomplete is not empty"

        def _match(pattern, value):
            return bool(re.match(r"^%s$" % pattern, value))

        def _findrel(head, rel, dep):
            result = []
            if isinstance(head, dict):
                for d in head["dep"]:
                    if _match(rel, d[0]) and _match(dep, d[1]["pos"]):
                        result.append(d[1])
            if isinstance(dep, dict):
                h = dep["head"]
                if h and _match(rel, h[0]) and _match(head, h[1]["pos"]):
                    result.append(h[1])
            return result

        # Look for relations
        for v in list(tokens.values()):
            for d in v["dep"]:
                for rel in rels:
                    r = rel[0]
                    if _match(";".join([x[1] for x in sorted(r.items())]),
                              ";".join([v["pos"], d[0], d[1]["pos"]])):
                        triple = None
                        if len(rel) == 1:
                            triple = ((v["lemgram"], v["word"], v["pos"],
                                       v["ref"]), d[0],
                                      (d[1]["lemgram"], d[1]["word"],
                                       d[1]["pos"], d[1]["ref"]), ("", None),
                                      sentid, v["ref"], d[1]["ref"])
                        else:
                            lookup = dict(zip(map(str, sorted(r.keys())),
                                              (v, d[0], d[1])))
                            i = set(rel[0].keys()).intersection(
                                set(rel[1].keys())).pop()
                            rel2 = [x[1] for x in sorted(rel[1].items())]
                            index1 = list(rel[0].keys()).index(i)
                            index2 = list(rel[1].keys()).index(i)
                            if index1 == 2 and index2 == 0:
                                result = _findrel(d[1], rel2[1], rel2[2])
                                if result:
                                    lookup.update(
                                        dict(zip(map(str, sorted(rel[1].keys())),
                                                 (d[1], rel2[1], result[0]))))
                            elif index1 == 0 and index2 == 0:
                                result = _findrel(v, rel2[1], rel2[2])
                                if result:
                                    lookup.update(
                                        dict(zip(map(str, sorted(rel[1].keys())),
                                                 (v, rel2[1], result[0]))))

                            pp = rel[-1]
                            if len(lookup) > 3:
                                lookup_bf = {key: val["bf"]
                                             for key, val in lookup.items()
                                             if isinstance(val, dict)}
                                lookup_ref = {key: val["ref"]
                                              for key, val in lookup.items()
                                              if isinstance(val, dict)}
                                triple = ((lookup[str(pp[0])]["lemgram"],
                                           lookup[str(pp[0])]["word"],
                                           lookup[str(pp[0])]["pos"],
                                           lookup[str(pp[0])]["ref"]),
                                          lookup[str(pp[1])],
                                          (lookup[str(pp[2])]["lemgram"],
                                           lookup[str(pp[2])]["word"],
                                           lookup[str(pp[2])]["pos"],
                                           lookup[str(pp[2])]["ref"]),
                                          (pp[3] % lookup_bf,
                                           pp[3] % lookup_ref), sentid,
                                          lookup[str(pp[0])]["ref"],
                                          lookup[str(pp[2])]["ref"])
                        if triple:
                            triples.extend(_mutate_triple(triple))
                            break
            token_rels = [d[0] for d in v["dep"]]
            for nrel in null_rels:
                if nrel[0] == v["pos"]:
                    missing_rels = [x for x in nrel[1] if x not in token_rels]
                    for mrel in missing_rels:
                        triple = ((v["lemgram"], v["word"], v["pos"],
                                   v["ref"]), mrel, ("", "", "", v["ref"]),
                                  ("", None), sentid, v["ref"], v["ref"])
                        triples.extend(_mutate_triple(triple))

    triples = sorted(set(triples))

    out_data = "\n".join([
        "\t".join(
            (head, headpos, rel, dep, deppos, extra, sentid, refhead, refdep,
             str(bfhead), str(bfdep), str(wfhead), str(wfdep)))
        for (head, headpos, rel, dep, deppos, extra, sentid, refhead, refdep,
             bfhead, bfdep, wfhead, wfdep) in triples
    ])
    out.write(out_data)
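A small illustration of how the first element of each rels entry is matched: the dict values are joined (sorted by key) into a regex and tested against the "headpos;deprel;deppos" string of a candidate arc. The values below are made up.

import re

pattern = ";".join(tag for _, tag in sorted({1: "VB", 2: "(SS|OO|IO|OA)", 3: "NN"}.items()))
print(pattern)                                           # VB;(SS|OO|IO|OA);NN
print(bool(re.match(r"^%s$" % pattern, "VB;OO;NN")))     # True: verb with a noun object
print(bool(re.match(r"^%s$" % pattern, "VB;AA;NN")))     # False: AA is not one of the listed relations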
Example #13
def annotate_text(out: Output, lexical_classes_token: Annotation, text: Annotation, token: Annotation,
                  saldoids, cutoff, types, delimiter, affix, freq_model, decimals):
    """
    Annotate text chunks with lexical classes.

    - out: resulting annotation file
    - lexical_classes_token: existing annotation with lexical classes on token level.
    - text, token: existing annotations for the text-IDs and the tokens.
    - saldoids: existing annotation with saldoIDs, needed when types=True.
    - cutoff: value for limiting the resulting Bring classes.
              The result will contain all words with the top x frequencies.
              Words with frequency = 1 will be removed from the result.
    - types: if True, count every class only once per saldo ID occurrence.
    - delimiter: delimiter character to put between ambiguous results.
    - affix: optional character to put before and after results to mark a set.
    - freq_model: pickled file with reference frequencies.
    - decimals: number of decimals to keep in output.
    """
    cutoff = int(cutoff)
    text_children, _orphans = text.get_children(token, preserve_parent_annotation_order=True)
    classes = list(lexical_classes_token.read())
    sense = list(saldoids.read()) if types else None

    if freq_model:
        freq_model = util.PickledLexicon(freq_model.path)

    out_annotation = text.create_empty_attribute()

    for text_index, words in enumerate(text_children):
        seen_types = set()
        class_freqs = defaultdict(int)

        for token_index in words:
            # Count only sense types
            if types:
                senses = str(sorted([s.split(util.SCORESEP)[0] for s in sense[token_index].strip(util.AFFIX).split(util.DELIM)]))
                if senses in seen_types:
                    continue
                else:
                    seen_types.add(senses)

            rogwords = classes[token_index].strip(util.AFFIX).split(util.DELIM) if classes[token_index] != util.AFFIX else []
            for w in rogwords:
                class_freqs[w] += 1

        if freq_model:
            for c in class_freqs:
                # Relative frequency
                rel = class_freqs[c] / len(words)
                # Calculate class dominance
                ref_freq = freq_model.lookup(c.replace("_", " "), 0)
                if not ref_freq:
                    log.error("Class '%s' is missing" % c)
                    continue
                class_freqs[c] = (rel / ref_freq)

        # Sort words according to frequency/dominance
        ordered_words = sorted(class_freqs.items(), key=lambda x: x[1], reverse=True)
        if freq_model:
            # Remove words with dominance < 1
            ordered_words = [w for w in ordered_words if w[1] >= 1]
        else:
            # Remove words with frequency 1
            ordered_words = [w for w in ordered_words if w[1] > 1]

        if len(ordered_words) > cutoff:
            cutoff_freq = ordered_words[cutoff - 1][1]
            ordered_words = [w for w in ordered_words if w[1] >= cutoff_freq]

        # Join words and frequencies/dominances
        ordered_words = [util.SCORESEP.join([word, str(round(freq, decimals))]) for word, freq in ordered_words]
        out_annotation[text_index] = util.cwbset(ordered_words, delimiter, affix) if ordered_words else affix

    out.write(out_annotation)
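A short worked example of the dominance computation above, with made-up numbers.

# A class seen 4 times among 100 tokens has relative frequency 0.04. If the reference model
# lists 0.01 for that class, its dominance is 0.04 / 0.01 = 4.0, so it survives the
# "dominance >= 1" filter; a class with dominance 0.5 would be dropped.
class_count, n_words, ref_freq = 4, 100, 0.01
dominance = (class_count / n_words) / ref_freq
print(dominance)  # 4.0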
Example #14
def annotate(out_ne: Output = Output("swener.ne", cls="named_entity", description="Named entity segments from SweNER"),
             out_ne_ex: Output = Output("swener.ne:swener.ex", description="Named entity expressions from SweNER"),
             out_ne_type: Output = Output("swener.ne:swener.type", cls="named_entity:type",
                                          description="Named entity types from SweNER"),
             out_ne_subtype: Output = Output("swener.ne:swener.subtype", cls="named_entity:subtype",
                                             description="Named entity sub types from SweNER"),
             out_ne_name: Output = Output("swener.ne:swener.name", cls="named_entity:name",
                                          description="Names in SweNER named entities"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             token: Annotation = Annotation("<token>"),
             binary: Binary = Binary("[swener.binary]"),
             process_dict=None):
    """Tag named entities using HFST-SweNER.

    SweNER is either run in an already started process defined in
    process_dict, or a new process is started (default).
    - word, sentence, token: existing annotations
    - out_ne, out_ne_ex, out_ne_type, out_ne_subtype, out_ne_name: resulting annotations for the named entities
    - process_dict is used in the catapult and should never be set from the command line
    """
    if process_dict is None:
        process = swenerstart(binary, "", util.UTF8, verbose=False)
    # else:
    #     process = process_dict["process"]
    #     # If process seems dead, spawn a new one
    #     if process.stdin.closed or process.stdout.closed or process.poll():
    #         util.system.kill_process(process)
    #         process = swenerstart("", encoding, verbose=False)
    #         process_dict["process"] = process

    # Get sentence annotation
    sentences, _orphans = sentence.get_children(token, orphan_alert=True)

    # Collect all text
    word_annotation = list(word.read())
    stdin = SENT_SEP.join(TOK_SEP.join(word_annotation[token_index] for token_index in sent)
                          for sent in sentences)
    # Escape <, > and &
    stdin = xml.sax.saxutils.escape(stdin)

    # keep_process = len(stdin) < RESTART_THRESHOLD_LENGTH and process_dict is not None
    # log.info("Stdin length: %s, keep process: %s", len(stdin), keep_process)

    # if process_dict is not None:
    #     process_dict["restart"] = not keep_process

    # # Does not work as of now since swener does not have an interactive mode
    # if keep_process:
    #     # Chatting with swener: send a SENT_SEP and read correct number of lines
    #     stdin_fd, stdout_fd = process.stdin, process.stdout
    #     stdin_fd.write(stdin.encode(encoding) + SENT_SEP)
    #     stdin_fd.flush()
    #     stout = stdout_fd.readlines()

    # else:
    # Otherwise use communicate which buffers properly
    # log.info("STDIN %s %s", type(stdin.encode(encoding)), stdin.encode(encoding))
    stdout, _ = process.communicate(stdin.encode(util.UTF8))
    # log.info("STDOUT %s %s", type(stdout.decode(encoding)), stdout.decode(encoding))

    parse_swener_output(sentences, token, stdout.decode(util.UTF8), out_ne, out_ne_ex, out_ne_type, out_ne_subtype,
                        out_ne_name)
Example #15
def annotate(
        out_phrase: Output = Output("phrase_structure.phrase",
                                    description="Phrase segments"),
        out_phrase_name: Output = Output(
            "phrase_structure.phrase:phrase_structure.name",
            description="Phrase names"),
        out_phrase_func: Output = Output(
            "phrase_structure.phrase:phrase_structure.func",
            description="Phrase functions"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("<token:word>"),
        sentence: Annotation = Annotation("<sentence>"),
        pos: Annotation = Annotation("<token:pos>"),
        msd: Annotation = Annotation("<token:msd>"),
        ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
        dephead_ref: Annotation = Annotation("<token:dephead_ref>"),
        deprel: Annotation = Annotation("<token:deprel>")):
    """Annotate sentence with phrase structures."""
    sentences, _orphans = sentence.get_children(word)
    token_annotations = list(
        ref.read_attributes([ref, word, pos, msd, dephead_ref, deprel]))
    token_spans = list(token.read_spans())

    def get_token_span(index):
        return token_spans[index]

    nodes = []

    for s in sentences:
        tokenlist = [Token(None)]
        for token_index in s:
            tok = token_annotations[token_index]
            tokenlist.append(Token(tok))

        # Get PS tree
        sen = Sentence(tokenlist)
        if not sen.is_cyclic():
            tree = convert_sentence(sen).top.to_tree_str()
            # print(pprint.pformat(tree), file=sys.stderr)

            # Make nodes
            children = flatten_tree(tree[1], [])
            log.debug("\n\nSENTENCE:")
            position = 0
            open_elem_stack = []
            for child in children:
                if not child[0].startswith("WORD:"):
                    start_pos = get_token_span(s[position])[0]
                    open_elem_stack.append(child + (start_pos, ))
                    log.debug(
                        f"<phrase name={child[0]} func={child[1]}> {s[position]}"
                    )
                else:
                    # Close nodes
                    while open_elem_stack[-1][2] == child[2]:
                        start_pos = open_elem_stack[-1][3]
                        end_pos = get_token_span(s[position - 1])[1]
                        nodes.append(
                            ((start_pos, end_pos), open_elem_stack[-1][0],
                             open_elem_stack[-1][1]))
                        log.debug(
                            f"</phrase name={open_elem_stack[-1][0]} func={open_elem_stack[-1][1]}> {start_pos}-{end_pos}"
                        )
                        open_elem_stack.pop()
                    position += 1
                    log.debug(f"   {child[0][5:]}")

            # Close remaining open nodes
            end_pos = get_token_span(s[-1])[1]
            for elem in reversed(open_elem_stack):
                start_pos = elem[3]
                nodes.append(((start_pos, end_pos), elem[0], elem[1]))
                log.debug(
                    f"</phrase name={elem[0]} func={elem[1]}> {start_pos}-{end_pos}"
                )

    # Sort nodes
    sorted_nodes = sorted(nodes)

    # Write annotations
    out_phrase.write([i[0] for i in sorted_nodes])
    out_phrase_name.write([i[1] for i in sorted_nodes])
    out_phrase_func.write([i[2] for i in sorted_nodes])