Example #1
def msdtag(out: Output = Output(
    "<token>:hunpos.msd",
    cls="token:msd",
    description="Part-of-speeches with morphological descriptions"),
           word: Annotation = Annotation("<token:word>"),
           sentence: Annotation = Annotation("<sentence>"),
           binary: Binary = Binary("[hunpos.binary]"),
           model: Model = Model("[hunpos.model]"),
           morphtable: Optional[Model] = Model("[hunpos.morphtable]"),
           patterns: Optional[Model] = Model("[hunpos.patterns]"),
           tag_mapping=None,
           encoding: str = util.UTF8):
    """POS/MSD tag using the Hunpos tagger."""
    if isinstance(tag_mapping, str) and tag_mapping:
        tag_mapping = util.tagsets.mappings[tag_mapping]
    elif tag_mapping is None or tag_mapping == "":
        tag_mapping = {}

    pattern_list = []

    if patterns:
        with open(patterns.path, encoding="utf-8") as pat:
            for line in pat:
                if line.strip() and not line.startswith("#"):
                    name, pattern, tags = line.strip().split("\t", 2)
                    pattern_list.append(
                        (name, re.compile("^%s$" % pattern), tags))

    def replace_word(w):
        """Replace word with alias if word matches a regex pattern."""
        for p in pattern_list:
            if re.match(p[1], w):
                return "[[%s]]" % p[0]
        return w

    sentences, _orphans = sentence.get_children(word)
    token_word = list(word.read())
    stdin = SENT_SEP.join(
        TOK_SEP.join(
            replace_word(token_word[token_index]) for token_index in sent)
        for sent in sentences)
    args = [model.path]
    if morphtable:
        args.extend(["-m", morphtable.path])
    stdout, _ = util.system.call_binary(binary, args, stdin, encoding=encoding)

    out_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_index, tagged_token in zip(
                sent,
                tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            tag = tag_mapping.get(tag, tag)
            out_annotation[token_index] = tag

    out.write(out_annotation)
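
A note on the inputs: the pattern file read above is expected to hold one entry per line with a name, a regular expression and a tag list separated by tabs, and lines starting with "#" are skipped. The following is a minimal, self-contained sketch of that parsing and of replace_word; the file contents are made up for illustration, while the real file comes from [hunpos.patterns].

import re

# Hypothetical pattern-file contents (name <TAB> regex <TAB> tags).
pattern_file_lines = [
    "# comment lines are skipped",
    "number\t[0-9]+\tRG",
]

pattern_list = []
for line in pattern_file_lines:
    if line.strip() and not line.startswith("#"):
        name, pattern, tags = line.strip().split("\t", 2)
        pattern_list.append((name, re.compile("^%s$" % pattern), tags))

def replace_word(w):
    """Replace a word with its pattern alias, mirroring the helper in msdtag."""
    for name, regex, _tags in pattern_list:
        if regex.match(w):
            return "[[%s]]" % name
    return w

print(replace_word("2021"))   # -> [[number]]
print(replace_word("hello"))  # -> hello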
Example #2
def contextual(out: Output = Output("{chunk}:geo.geo_context", description="Geographical places with coordinates"),
               chunk: Annotation = Annotation("{chunk}"),
               context: Annotation = Annotation("[geo.context_chunk]"),
               ne_type: Annotation = Annotation("swener.ne:swener.type"),
               ne_subtype: Annotation = Annotation("swener.ne:swener.subtype"),
               ne_name: Annotation = Annotation("swener.ne:swener.name"),
               model: Model = Model("[geo.model]"),
               method: str = "populous",
               language: list = []):
    """Annotate chunks with location data, based on locations contained within the text.

    context = text chunk to use for disambiguating places (when applicable).
    chunk = text chunk to which the annotation will be added.
    """
    model = load_model(model, language=language)

    ne_type_annotation = list(ne_type.read())
    ne_subtype_annotation = list(ne_subtype.read())
    ne_name_annotation = list(ne_name.read())

    children_context_chunk, _orphans = context.get_children(chunk)
    children_chunk_ne, _orphans = chunk.get_children(ne_type)

    out_annotation = chunk.create_empty_attribute()

    for chunks in children_context_chunk:
        all_locations = []  # TODO: Maybe not needed for anything?
        context_locations = []
        chunk_locations = defaultdict(list)

        for ch in chunks:
            for n in children_chunk_ne[ch]:
                if ne_type_annotation[n] == "LOC" and "PPL" in ne_subtype_annotation[n]:
                    location_text = ne_name_annotation[n].replace("\n", " ").replace("  ", " ")
                    location_data = model.get(location_text.lower())
                    if location_data:
                        all_locations.append((location_text, list(location_data)))
                        context_locations.append((location_text, list(location_data)))
                        chunk_locations[ch].append((location_text, list(location_data)))
                    else:
                        pass
                        # log.info("No location found for %s" % ne_name_annotation[n].replace("%", "%%"))

        chunk_locations = most_populous(chunk_locations)

        for c in chunks:
            out_annotation[c] = _format_location(chunk_locations.get(c, ()))

    out.write(out_annotation)
Example #3
def stanza_lem_model(
        model: ModelOutput = ModelOutput("stanza/lem/sv_suc_lemmatizer.pt")):
    """Download and unzip the Stanza POS-tagging model."""
    zip_model = Model("stanza/lem/synt_stanza_full.zip")
    zip_model.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/lem_stanza.zip")
    zip_model.unzip()
    zip_model.remove()
Example #4
def build_korp_stats(out: ModelOutput = ModelOutput("saldo/stats.pickle"),
                     _saldom: Model = Model("saldo/saldom.xml")):
    """Download Korp's word frequency file and convert it to a model."""
    txt_file = Model("saldo/stats_all.txt")
    try:
        log.info("Downloading Korp stats file...")
        download_stats_file(
            "https://svn.spraakdata.gu.se/sb-arkiv/pub/frekvens/stats_all.txt",
            txt_file.path)

        log.info("Building frequency model...")
        make_model(txt_file.path, out.path)
    finally:
        # Clean up
        txt_file.remove()
Example #5
def diapivot_annotate(
        out: Output = Output(
            "<token>:hist.diapivot",
            description="SALDO IDs corresponding to lemgrams"),
        lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
        model: Model = Model("hist/diapivot.pickle")):
    """Annotate each lemgram with its corresponding saldo_id according to model.

    Args:
        out (str, optional): Resulting annotation file.
            Defaults to Output("<token>:hist.diapivot", description="SALDO IDs corresponding to lemgrams").
        lemgram (str, optional): Existing lemgram annotation. Defaults to Annotation("<token>:saldo.lemgram").
        model (str, optional): Crosslink model. Defaults to Model("hist/diapivot.pickle").
    """
    lexicon = PivotLexicon(model)
    lemgram_annotation = list(lemgram.read())

    out_annotation = []

    for lemgrams in lemgram_annotation:
        saldo_ids = []
        for lemgram in lemgrams.split(util.DELIM):
            s_i = lexicon.get_exactMatch(lemgram)
            if s_i:
                saldo_ids += [s_i]
        out_annotation.append(util.AFFIX + util.DELIM.join(set(saldo_ids)) +
                              util.AFFIX if saldo_ids else util.AFFIX)

    out.write(out_annotation)
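
The value written for each token is the set of matched SALDO IDs joined with util.DELIM and wrapped in util.AFFIX, or just the affix when nothing matched. A small sketch of that formatting, assuming (not shown in the code above) the Sparv convention that both separators are "|":

# Assumed separator values; the real ones come from util.DELIM and util.AFFIX.
DELIM = "|"
AFFIX = "|"

def format_ids(saldo_ids):
    """Mirror the joining logic in diapivot_annotate (sorted here only for deterministic output)."""
    return AFFIX + DELIM.join(sorted(set(saldo_ids))) + AFFIX if saldo_ids else AFFIX

print(format_ids(["saldo_id_2", "saldo_id_1"]))  # -> |saldo_id_1|saldo_id_2|
print(format_ids([]))                            # -> |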
Example #6
def swefn_words(out: Output = Output("<token>:lexical_classes.swefn",
                                     description="Lexical classes for tokens from SweFN"),
                model: Model = Model("[lexical_classes.swefn_word_model]"),
                saldoids: Annotation = Annotation("<token:sense>"),
                pos: Annotation = Annotation("<token:pos>"),
                pos_limit: List[str] = ["NN", "VB", "JJ", "AB"],
                disambiguate: bool = True,
                connect_ids: bool = False,
                delimiter: str = util.DELIM,
                affix: str = util.AFFIX,
                scoresep: str = util.SCORESEP,
                lexicon=None):
    """Swefn specific wrapper for annotate_words. See annotate_words for more info."""

    # SweFN annotation function
    def annotate_swefn(saldo_ids, lexicon, connect_IDs=False, scoresep=util.SCORESEP):
        swefnid = set()
        if saldo_ids:
            for sid in saldo_ids:
                if connect_IDs:
                    swefnid = swefnid.union(set(i + scoresep + sid for i in lexicon.lookup(sid, default=set())))
                else:
                    swefnid = swefnid.union(lexicon.lookup(sid, default=set()))
        return sorted(swefnid)

    annotate_words(out, model, saldoids, pos, annotate_swefn, pos_limit=pos_limit, disambiguate=disambiguate,
                   connect_ids=connect_ids, delimiter=delimiter, affix=affix, scoresep=scoresep, lexicon=lexicon)
Example #7
def blingbring_words(out: Output = Output("<token>:lexical_classes.blingbring",
                                          description="Lexical classes for tokens from Blingbring"),
                     model: Model = Model("[lexical_classes.bb_word_model]"),
                     saldoids: Annotation = Annotation("<token:sense>"),
                     pos: Annotation = Annotation("<token:pos>"),
                     pos_limit: List[str] = ["NN", "VB", "JJ", "AB"],
                     class_set: str = "bring",
                     disambiguate: bool = True,
                     connect_ids: bool = False,
                     delimiter: str = util.DELIM,
                     affix: str = util.AFFIX,
                     scoresep: str = util.SCORESEP,
                     lexicon=None):
    """Blingbring specific wrapper for annotate_words. See annotate_words for more info."""
    # pos_limit="NN VB JJ AB" | None

    if class_set not in ["bring", "roget_head", "roget_subsection", "roget_section", "roget_class"]:
        log.warning("Class '%s' not available. Fallback to 'bring'.")
        class_set = "bring"

    # Blingbring annotation function
    def annotate_bring(saldo_ids, lexicon, connect_IDs=False, scoresep=util.SCORESEP):
        rogetid = set()
        if saldo_ids:
            for sid in saldo_ids:
                if connect_IDs:
                    rogetid = rogetid.union(set(i + scoresep + sid for i in lexicon.lookup(sid, default=set())))
                else:
                    rogetid = rogetid.union(lexicon.lookup(sid, default=dict()).get(class_set, set()))
        return sorted(rogetid)

    annotate_words(out, model, saldoids, pos, annotate_bring, pos_limit=pos_limit, disambiguate=disambiguate,
                   class_set=class_set, connect_ids=connect_ids, delimiter=delimiter, affix=affix, scoresep=scoresep,
                   lexicon=lexicon)
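
Both wrappers pass an inner annotation function to annotate_words that unions the lexicon lookups for each sense ID and, when connect_IDs is set, prefixes every class with the sense it came from. The sketch below isolates that logic with a stand-in lexicon; the class names, sense ID and SCORESEP value are made up, and the real lexicon is loaded from the model file.

SCORESEP = ":"  # assumed separator; the real value comes from util.SCORESEP

class ToyLexicon:
    """Stand-in exposing the lookup() interface used by annotate_swefn/annotate_bring."""

    def __init__(self, data):
        self.data = data

    def lookup(self, key, default=None):
        return self.data.get(key, default)

lexicon = ToyLexicon({"sense..1": {"ClassA", "ClassB"}})

def annotate(saldo_ids, lexicon, connect_IDs=False, scoresep=SCORESEP):
    classes = set()
    for sid in saldo_ids or []:
        found = lexicon.lookup(sid, default=set())
        if connect_IDs:
            classes |= {c + scoresep + sid for c in found}
        else:
            classes |= set(found)
    return sorted(classes)

print(annotate(["sense..1"], lexicon))                    # ['ClassA', 'ClassB']
print(annotate(["sense..1"], lexicon, connect_IDs=True))  # ['ClassA:sense..1', 'ClassB:sense..1']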
Example #8
def build_nst_comp(out: ModelOutput = ModelOutput("saldo/nst_comp_pos.pickle"),
                   nst_lexicon: Model = Model("saldo/nst_utf8.txt")):
    """Download NST lexicon and convert it to a compound POS model.

    The NST lexicon can be retrieved from SVN with credentials:
    svn export https://svn.spraakdata.gu.se/sb-arkiv/lexikon/NST_svensk_leksikon/nst_utf8.txt saldo/nst_utf8.txt
    """
    log.info("Building compound POS probability model...")
    make_model(nst_lexicon, out)
Example #9
def tokenize(
        text: Text = Text(),
        out: Output = Output("segment.token",
                             cls="token",
                             description="Token segments"),
        chunk: Annotation = Annotation("[segment.token_chunk]"),
        segmenter: str = Config("segment.token_segmenter"),
        existing_segments: Optional[str] = Config("segment.existing_tokens"),
        model: Optional[Model] = Model("[segment.tokenizer_config]"),
        token_list: Optional[Model] = Model("[segment.token_list]")):
    """Tokenize text."""
    do_segmentation(text=text,
                    out=out,
                    chunk=chunk,
                    segmenter=segmenter,
                    existing_segments=existing_segments,
                    model=model,
                    token_list=token_list)
Example #10
def stanza_pos_model(model: ModelOutput = ModelOutput(
    "stanza/pos/full_sv_talbanken_tagger.pt"),
                     pretrain: ModelOutput = ModelOutput(
                         "stanza/pos/full_sv_talbanken.pretrain.pt")):
    """Download and unzip the Stanza POS-tagging model."""
    zip_model = Model("stanza/pos/synt_stanza_full.zip")
    zip_model.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/morph_stanza_full.zip"
    )
    zip_model.unzip()
    zip_model.remove()
Example #11
def stanza_dep_model(
        model: ModelOutput = ModelOutput("stanza/dep/sv_talbanken_parser.pt"),
        pretrain: ModelOutput = ModelOutput(
            "stanza/dep/sv_talbanken.pretrain.pt")):
    """Download and unzip the Stanza dependency model."""
    zip_model = Model("stanza/dep/synt_stanza_full.zip")
    zip_model.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/synt_stanza_full.zip"
    )
    zip_model.unzip()
    zip_model.remove()
Example #12
def hist_morphtable(out: ModelOutput = ModelOutput("hunpos/hist/dalinm-swedberg_saldo_suc-tags.morphtable"),
                    swedberg: Model = Model("hunpos/hist/swedberg-gender.hunpos"),
                    dalin: Model = Model("hunpos/hist/dalinm.hunpos"),
                    saldosuc_morphtable: Model = Model("hunpos/saldo_suc-tags.morphtable")):
    """Read files and make a morphtable together with the information from SALDO (saldosuc_morphtable).

    Args:
        out (str, optional): Resulting morphtable file to be written.
            Defaults to ModelOutput("hunpos/hist/dalinm-swedberg_saldo_suc-tags.morphtable").
        swedberg (str, optional): Wordlist from Swedberg and corresponding SALDO MSD-tags.
            Defaults to Model("hunpos/hist/swedberg-gender.hunpos").
        dalin (str, optional): Wordlist from Dalin and corresponding SALDO MSD-tags.
            Defaults to Model("hunpos/hist/dalinm.hunpos").
        saldosuc_morphtable (str, optional): SALDO Hunpos morphtable.
            Defaults to Model("hunpos/saldo_suc-tags.morphtable").
    """
    words = {}
    _read_saldosuc(words, saldosuc_morphtable.path)
    for fil in [dalin, swedberg]:
        for line in open(fil.path, encoding="utf-8").readlines():
            if not line.strip():
                continue
            xs = line.split("\t")
            word, msd = xs[0].strip(), xs[1].strip()
            if " " in word:
                if msd.startswith("nn"):  # We assume that the head of a noun mwe is the last word
                    word = word.split()[-1]
                if msd.startswith("vb"):  # We assume that the head of a verbal mwe is the first word
                    word = word.split()[0]

            # If the tag is not present, we try to translate it anyway
            suc = SALDO_TO_SUC.get(msd, "")
            if not suc:
                suc = _force_parse(msd)
            if suc:
                words.setdefault(word.lower(), set()).update(suc)
                words.setdefault(word.title(), set()).update(suc)
    with open(out.path, encoding="UTF-8", mode="w") as out:
        for w, ts in list(words.items()):
            line = ("\t".join([w] + list(ts)) + "\n")
            out.write(line)
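
The resulting morphtable has one line per word form: the form followed by all of its tags, tab-separated. A tiny illustration of the write loop above, with made-up word forms and tags (sorted here only for a stable output):

words = {
    "hus": {"NN.NEU.SIN.IND.NOM"},
    "springa": {"VB.INF.AKT", "NN.UTR.SIN.IND.NOM"},
}
for w, ts in words.items():
    # Each printed line is the word form and its tags, tab-separated,
    # matching the layout hist_morphtable writes to the morphtable file.
    print("\t".join([w] + sorted(ts)))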
Example #13
def build_tokenlist(
        saldo_model: Model = Model("saldo/saldo.pickle"),
        out: ModelOutput = ModelOutput(
            "segment/bettertokenizer.sv.saldo-tokens"),
        segmenter: str = Config("segment.token_wordlist_segmenter"),
        model: Model = Model("segment/bettertokenizer.sv")):
    """Build a list of words from a SALDO model, to help BetterWordTokenizer."""
    segmenter_args = []
    if model:
        if model.path.suffix in [".pickle", ".pkl"]:  # Path.suffix includes the leading dot
            with open(model.path, "rb") as m:
                model_arg = pickle.load(m)
        else:
            model_arg = model.path
        segmenter_args.append(model_arg)
    assert segmenter in SEGMENTERS, "Available segmenters: %s" % ", ".join(
        sorted(SEGMENTERS))
    segmenter = SEGMENTERS[segmenter]
    segmenter = segmenter(*segmenter_args)
    assert hasattr(
        segmenter, "span_tokenize"
    ), "Segmenter needs a 'span_tokenize' method: %r" % segmenter

    wordforms = set()

    # Skip strings already handled by the tokenizer.
    # Also skip words ending in comma (used by some multi word expressions in SALDO).
    with open(saldo_model.path, "rb") as F:
        lexicon = pickle.load(F)
        for w in lexicon:
            w2 = list(map(split_triple, lexicon[w]))
            mwu_extras = [
                contw for w3 in w2 for cont in w3[2] for contw in cont
                if contw not in lexicon
            ]
            for wf in mwu_extras + [w]:
                spans = list(segmenter.span_tokenize(wf))
                if len(spans) > 1 and not wf.endswith(","):
                    wordforms.add(wf)

    out.write("\n".join(sorted(wordforms)))
Example #14
def predict(doc: str = Document,
            model: str = Model("[vw_topic_modelling.model]"),
            modeljson: str = Model("[vw_topic_modelling.modeljson]"),
            order=None,
            struct=None,
            parent: str = Annotation("{chunk}"),
            word: str = Annotation("<token:word>"),
            out: str = Output("{chunk}:vw_topic_modelling.prediction", description="Predicted attributes"),
            pos: str = Annotation("<token:pos>"),
            raw: bool = False):
    """Predict a structural attribute."""
    raw = raw == "true"

    m_json = json.load(open(modeljson))

    data = (
        Example(None, text.words, text.span)
        for text in texts([(order, struct, parent, word, pos)],
                          map_label=lambda _: "?",
                          min_word_length=m_json["min_word_length"],
                          banned_pos=m_json["banned_pos"])
    )

    index_to_label = m_json["index_to_label"]

    args = ["--initial_regressor", model]

    if raw:
        predictions = (
            util.cwbset(index_to_label[str(s)] + ":" + str(v) for s, v in ss)
            for ss, _span in vw_predict(args, data, raw=True)
        )
    else:
        predictions = (
            index_to_label[str(s)]
            for s, _span in vw_predict(args, data)
        )

    util.write_annotation(doc, out, predictions)
Example #15
def annotate(
        lang: Language = Language(),
        model: Model = Model("[treetagger.model]"),
        tt_binary: Binary = Binary("[treetagger.binary]"),
        out_upos: Output = Output("<token>:treetagger.upos",
                                  cls="token:upos",
                                  description="Part-of-speeches in UD"),
        out_pos: Output = Output(
            "<token>:treetagger.pos",
            cls="token:pos",
            description="Part-of-speeches from TreeTagger"),
        out_baseform: Output = Output("<token>:treetagger.baseform",
                                      description="Baseforms from TreeTagger"),
        word: Annotation = Annotation("<token:word>"),
        sentence: Annotation = Annotation("<sentence>"),
        encoding: str = util.UTF8):
    """POS/MSD tag and lemmatize using TreeTagger."""
    sentences, _orphans = sentence.get_children(word)
    word_annotation = list(word.read())
    stdin = SENT_SEP.join(
        TOK_SEP.join(word_annotation[token_index] for token_index in sent)
        for sent in sentences)
    args = ["-token", "-lemma", "-no-unknown", "-eos-tag", "<eos>", model.path]

    stdout, stderr = util.system.call_binary(tt_binary,
                                             args,
                                             stdin,
                                             encoding=encoding)
    log.debug("Message from TreeTagger:\n%s", stderr)

    # Write pos and upos annotations.
    out_upos_annotation = word.create_empty_attribute()
    out_pos_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent,
                                          tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            out_pos_annotation[token_id] = tag
            out_upos_annotation[token_id] = util.tagsets.pos_to_upos(
                tag, lang, TAG_SETS.get(lang))
    out_pos.write(out_pos_annotation)
    out_upos.write(out_upos_annotation)

    # Write lemma annotations.
    out_lemma_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent,
                                          tagged_sent.strip().split(TOK_SEP)):
            lem = tagged_token.strip().split(TAG_SEP)[LEM_COLUMN]
            out_lemma_annotation[token_id] = lem
    out_baseform.write(out_lemma_annotation)
Example #16
def annotate(
        sense: Annotation = Annotation("<token>:saldo.sense"),
        out_scores: Output = Output("<token>:sensaldo.sentiment_score",
                                    description="SenSALDO sentiment score"),
        out_labels: Output = Output("<token>:sensaldo.sentiment_label",
                                    description="SenSALDO sentiment label"),
        model: Model = Model("[sensaldo.model]"),
        lexicon=None):
    """Assign sentiment values to tokens based on their sense annotation.

    When more than one sense is possible, calculate a weighted mean.
    - sense: existing annotation with saldoIDs.
    - out_scores, out_labels: resulting annotation file.
    - model: pickled lexicon with saldoIDs as keys.
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """
    if not lexicon:
        lexicon = util.PickledLexicon(model.path)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = sense.read()
    result_scores = []
    result_labels = []

    for token in sense:
        # Get set of senses for each token and sort them according to their probabilities
        token_senses = [
            tuple(s.rsplit(util.SCORESEP, 1)) if util.SCORESEP in s else
            (s, -1.0) for s in token.split(util.DELIM) if s
        ]
        token_senses.sort(key=lambda x: float(x[1]), reverse=True)

        # Lookup the sentiment score for the most probable sense and assign a sentiment label
        if token_senses:
            best_sense = token_senses[0][0]
            score = lexicon.lookup(best_sense, None)
        else:
            score = None

        if score:
            result_scores.append(score)
            result_labels.append(SENTIMENT_LABLES.get(int(score)))
        else:
            result_scores.append(None)
            result_labels.append(None)

    out_scores.write(result_scores)
    out_labels.write(result_labels)
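
Each sense annotation value is first split into (sense, score) pairs and sorted so that the most probable sense comes first. A self-contained sketch of that parsing, assuming Sparv-style values such as "|id_1:0.75|id_2:0.25|" with "|" as delimiter and ":" as score separator (both assumed here in place of util.DELIM and util.SCORESEP):

DELIM = "|"     # assumed value of util.DELIM
SCORESEP = ":"  # assumed value of util.SCORESEP

def ranked_senses(value):
    """Split an annotation value into (sense, score) pairs, most probable first."""
    senses = [tuple(s.rsplit(SCORESEP, 1)) if SCORESEP in s else (s, -1.0)
              for s in value.split(DELIM) if s]
    senses.sort(key=lambda x: float(x[1]), reverse=True)
    return senses

print(ranked_senses("|id_2:0.25|id_1:0.75|"))  # [('id_1', '0.75'), ('id_2', '0.25')]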
Example #17
def annotate(corpus_text: Text = Text(),
             lang: Language = Language(),
             conf_file: Model = Model("[freeling.conf]"),
             fl_binary: Binary = Binary("[freeling.binary]"),
             sentence_chunk: Optional[Annotation] = Annotation("[freeling.sentence_chunk]"),
             out_token: Output = Output("freeling.token", cls="token", description="Token segments"),
             out_word: Output = Output("<token>:freeling.word", cls="token:word", description="Token strings"),
             out_baseform: Output = Output("<token>:freeling.baseform", description="Baseforms from FreeLing"),
             out_upos: Output = Output("<token>:freeling.upos", cls="token:upos", description="Part-of-speeches in UD"),
             out_pos: Output = Output("<token>:freeling.pos", cls="token:pos",
                                      description="Part-of-speeches from FreeLing"),
             out_sentence: Optional[Output] = Output("freeling.sentence", cls="sentence", description="Sentence segments"),
             sentence_annotation: Optional[Annotation] = Annotation("[freeling.sentence_annotation]")):
    """Run FreeLing and output sentences, tokens, baseforms, upos and pos."""
    main(corpus_text, lang, conf_file, fl_binary, sentence_chunk, out_token, out_word, out_baseform, out_upos, out_pos,
         out_sentence, sentence_annotation)
Example #18
def swefn_text(out: Output = Output("<text>:lexical_classes.swefn",
                                    description="Lexical classes for text chunks from SweFN"),
               lexical_classes_token: Annotation = Annotation("<token>:lexical_classes.swefn"),
               text: Annotation = Annotation("<text>"),
               token: Annotation = Annotation("<token>"),
               saldoids: Optional[Annotation] = Annotation("<token:sense>"),
               cutoff: int = 3,
               types: bool = False,
               delimiter: str = util.DELIM,
               affix: str = util.AFFIX,
               freq_model: Model = Model("[lexical_classes.swefn_freq_model]"),
               decimals: int = 3):
    """Annotate text chunks with SweFN classes."""
    annotate_text(out=out, lexical_classes_token=lexical_classes_token, text=text, token=token,
                  saldoids=saldoids, cutoff=cutoff, types=types, delimiter=delimiter, affix=affix,
                  freq_model=freq_model, decimals=decimals)
Example #19
def word_weights(doc: str = Document,
                 model: str = Model("[vw_topic_modelling.model]"),
                 word: str = Annotation("<token:word>"),
                 pos: str = Annotation("<token:pos>"),
                 out: str = Output("<token>:vw_topic_modelling:label_weights", description="Label weights per word")):
    """
    Report the weight for each label for each word.

    Both model and model.json must exist. See --train and --predict.
    """
    m_json = json.load(open(model + ".json"))
    index_to_label = m_json["index_to_label"]
    min_word_length = int(m_json["min_word_length"] or "0")
    banned_pos = (m_json["banned_pos"] or "").split()
    words = list(util.read_annotation(doc, word))
    poss = util.read_annotation(doc, pos) if pos else []
    data = (Example(None, vw_normalize(word))
            for n, word in enumerate(words)
            if len(word) >= min_word_length
            if not pos or poss[n] not in banned_pos)
    weights = defaultdict(list)
    with tempfile.NamedTemporaryFile() as tmp:
        args = ["--initial_regressor", model, "--invert_hash", tmp.name]
        for _ in vw_predict(args, data):
            pass
        for line in open(tmp.name, "r").readlines():
            # allmänna[1]:14342849:0.0139527
            colons = line.split(":")
            if len(colons) == 3:
                word, _hash, weight = colons
                if word[-1] == "]":
                    bracesplit = word.rsplit("[", 1)
                else:
                    bracesplit = []
                if len(bracesplit) == 2:
                    word, index = bracesplit
                    n = int(index[:-1]) + 1
                else:
                    n = 1
                weights[word].append(index_to_label[str(n)] + ":" + weight)
    ws = (
        util.cwbset(weights[vw_normalize(word)])
        for word in words
        if vw_normalize(word) in weights
    )
    util.write_annotation(doc, out, ws)
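
The --invert_hash file parsed in the loop above contains lines of the form word[index]:hash:weight, as in the "allmänna[1]:14342849:0.0139527" comment. The sketch below pulls that per-line parsing out into a standalone function for clarity.

def parse_invert_hash_line(line):
    """Parse one Vowpal Wabbit --invert_hash line into (word, label index, weight)."""
    parts = line.strip().split(":")
    if len(parts) != 3:
        return None
    word, _hash, weight = parts
    if word.endswith("]"):
        word, index = word.rsplit("[", 1)
        label_index = int(index[:-1]) + 1  # same "+ 1" offset as in word_weights
    else:
        label_index = 1
    return word, label_index, weight

print(parse_invert_hash_line("allmänna[1]:14342849:0.0139527"))  # ('allmänna', 2, '0.0139527')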
Example #20
def sentence(
        text: Text = Text(),
        out: Output = Output("segment.sentence",
                             cls="sentence",
                             description="Sentence segments"),
        chunk: Optional[Annotation] = Annotation("[segment.sentence_chunk]"),
        segmenter: str = Config("segment.sentence_segmenter"),
        existing_segments: Optional[str] = Config(
            "segment.existing_sentences"),
        model: Optional[Model] = Model("[segment.sentence_model]")):
    """Split text into sentences."""
    do_segmentation(text=text,
                    out=out,
                    chunk=chunk,
                    segmenter=segmenter,
                    existing_segments=existing_segments,
                    model=model)
Example #21
def build_dalin(out: ModelOutput = ModelOutput("hist/dalin.pickle")):
    """Download Dalin morphology XML and save as a pickle file."""
    # Download dalinm.xml
    xml_model = Model("hist/dalinm.xml")
    xml_model.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/dalinm/dalinm.xml")

    # Create pickle file
    lmf_to_pickle(xml_model.path, out.path)

    # Clean up
    xml_model.remove()
Example #22
def swefn_model(
        out: ModelOutput = ModelOutput("lexical_classes/swefn.pickle")):
    """Download and build SweFN model."""
    # Download swefn.xml and build swefn.pickle
    raw_file = Model("lexical_classes/swefn.xml")
    raw_file.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/swefn/swefn.xml")
    lexicon = read_swefn(raw_file.path)
    out.write_pickle(lexicon)

    # Clean up
    raw_file.remove()
Example #23
def build_swedberg(out: ModelOutput = ModelOutput("hist/swedberg.pickle")):
    """Download Swedberg morphology XML and save as a pickle file."""
    # Download swedbergm.xml
    xml_model = Model("hist/swedbergm.xml")
    xml_model.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/swedbergm/swedbergm.xml"
    )

    # Create pickle file
    lmf_to_pickle(xml_model.path, out.path)

    # Clean up
    xml_model.remove()
Example #24
def annotate_full(corpus_text: Text = Text(),
                  lang: Language = Language(),
                  conf_file: Model = Model("[freeling.conf]"),
                  fl_binary: Binary = Binary("[freeling.binary]"),
                  sentence_chunk: Annotation = Annotation("[freeling.sentence_chunk]"),
                  out_token: Output = Output("freeling.token", cls="token", description="Token segments"),
                  out_word: Output = Output("<token>:freeling.word", cls="token:word", description="Token strings"),
                  out_baseform: Output = Output("<token>:freeling.baseform", description="Baseforms from FreeLing"),
                  out_upos: Output = Output("<token>:freeling.upos", cls="token:upos",
                                            description="Part-of-speeches in UD"),
                  out_pos: Output = Output("<token>:freeling.pos", cls="token:pos",
                                           description="Part-of-speeches from FreeLing"),
                  out_ne_type: Output = Output("<token>:freeling.ne_type", cls="token:named_entity_type",
                                               description="Named entitiy types from FreeLing"),
                  out_sentence: Optional[Output] = Output("freeling.sentence", cls="sentence",
                                                          description="Sentence segments"),
                  sentence_annotation: Optional[Annotation] = Annotation("[freeling.sentence_annotation]")):
    """Run FreeLing and output the usual annotations plus named entity types."""
    main(corpus_text, lang, conf_file, fl_binary, sentence_chunk, out_token, out_word, out_baseform, out_upos, out_pos,
         out_sentence, sentence_annotation, out_ne_type)
Example #25
def metadata(out: Output = Output("{chunk}:geo.geo_metadata", description="Geographical places with coordinates"),
             chunk: Annotation = Annotation("{chunk}"),
             source: Annotation = Annotation("[geo.metadata_source]"),
             model: Model = Model("[geo.model]"),
             method: str = "populous",
             language: list = []):
    """Get location data based on metadata containing location names."""
    geomodel = load_model(model, language=language)

    same_target_source = chunk.split()[0] == source.split()[0]
    chunk_annotation = list(chunk.read())
    source_annotation = list(source.read())

    # If location source and target chunk are not the same, we need
    # to find the parent/child relations between them.
    if not same_target_source:
        target_source_parents = list(source.get_parents(chunk))

    chunk_locations = {}

    for i, _ in enumerate(chunk_annotation):
        if same_target_source:
            location_source = source_annotation[i]
        else:
            location_source = source_annotation[target_source_parents[i]] if target_source_parents[
                i] is not None else None

        if location_source:
            location_data = geomodel.get(location_source.strip().lower())
            if location_data:
                chunk_locations[i] = [(location_source, list(location_data))]
        else:
            chunk_locations[i] = []

    chunk_locations = most_populous(chunk_locations)

    out_annotation = chunk.create_empty_attribute()
    for c in chunk_locations:
        out_annotation[c] = _format_location(chunk_locations.get(c, ()))

    out.write(out_annotation)
Example #26
def build_diapivot(out: ModelOutput = ModelOutput("hist/diapivot.pickle")):
    """Download diapivot XML dictionary and save as a pickle file."""
    # Download diapivot.xml
    xml_model = Model("hist/diapivot.xml")
    xml_model.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/diapivot/diapivot.xml")

    # Create pickle file
    xml_lexicon = read_xml(xml_model.path)
    log.info("Saving cross lexicon in Pickle format")
    picklex = {}
    for lem in xml_lexicon:
        lemgrams = []
        for saldo, match in list(xml_lexicon[lem].items()):
            lemgrams.append(PART_DELIM1.join([saldo, match]))
        picklex[lem] = sorted(lemgrams)

    out.write_pickle(picklex)

    # Clean up
    xml_model.remove()
Example #27
def build_saldo(out: ModelOutput = ModelOutput("saldo/saldo.pickle"),
                saldom: Model = Model("saldo/saldom.xml")):
    """Save SALDO morphology as a pickle file."""
    lmf_to_pickle(saldom.path, out.path)
Example #28
def _download(url, gzip, out):
    gzip_model = Model(gzip)
    gzip_model.download(url)
    gzip_model.ungzip(out.path)
    gzip_model.remove()
Example #29
def annotate(
        maltjar: Binary = Binary("[malt.jar]"),
        model: Model = Model("[malt.model]"),
        out_dephead: Output = Output(
            "<token>:malt.dephead",
            cls="token:dephead",
            description="Positions of the dependency heads"),
        out_dephead_ref: Output = Output(
            "<token>:malt.dephead_ref",
            cls="token:dephead_ref",
            description="Sentence-relative positions of the dependency heads"),
        out_deprel: Output = Output(
            "<token>:malt.deprel",
            cls="token:deprel",
            description="Dependency relations to the head"),
        word: Annotation = Annotation("<token:word>"),
        pos: Annotation = Annotation("<token:pos>"),
        msd: Annotation = Annotation("<token:msd>"),
        ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
        sentence: Annotation = Annotation("<sentence>"),
        token: Annotation = Annotation("<token>"),
        encoding: str = util.UTF8,
        process_dict=None):
    """
    Run the malt parser, in an already started process defined in process_dict, or start a new process (default).

    The process_dict argument should never be set from the command line.
    """
    if process_dict is None:
        process = maltstart(maltjar, model, encoding)
    else:
        process = process_dict["process"]
        # If process seems dead, spawn a new
        if process.stdin.closed or process.stdout.closed or process.poll():
            util.system.kill_process(process)
            process = maltstart(maltjar,
                                model,
                                encoding,
                                send_empty_sentence=True)
            process_dict["process"] = process

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    word_annotation = list(word.read())
    pos_annotation = list(pos.read())
    msd_annotation = list(msd.read())
    ref_annotation = list(ref.read())

    def conll_token(nr, token_index):
        form = word_annotation[token_index]
        lemma = UNDEF
        pos = cpos = pos_annotation[token_index]
        feats = re.sub(r"[ ,.]", "|",
                       msd_annotation[token_index]).replace("+", "/")
        return TAG_SEP.join((str(nr), form, lemma, cpos, pos, feats))

    stdin = SENT_SEP.join(
        TOK_SEP.join(
            conll_token(n + 1, token_index)
            for n, token_index in enumerate(sent)) for sent in sentences)

    if encoding:
        stdin = stdin.encode(encoding)

    keep_process = len(
        stdin) < RESTART_THRESHOLD_LENGTH and process_dict is not None
    log.info("Stdin length: %s, keep process: %s", len(stdin), keep_process)

    if process_dict is not None:
        process_dict["restart"] = not keep_process

    if keep_process:
        # Chatting with malt: send a SENT_SEP and read correct number of lines
        stdin_fd, stdout_fd = process.stdin, process.stdout
        stdin_fd.write(stdin + SENT_SEP.encode(util.UTF8))
        stdin_fd.flush()

        malt_sentences = []
        for sent in sentences:
            malt_sent = []
            for _ in sent:
                line = stdout_fd.readline()
                if encoding:
                    line = line.decode(encoding)
                malt_sent.append(line)
            line = stdout_fd.readline()
            assert line == b"\n"
            malt_sentences.append(malt_sent)
    else:
        # Otherwise use communicate which buffers properly
        stdout, _ = process.communicate(stdin)
        if encoding:
            stdout = stdout.decode(encoding)
        malt_sentences = (malt_sent.split(TOK_SEP)
                          for malt_sent in stdout.split(SENT_SEP))

    out_dephead_annotation = word.create_empty_attribute()
    out_dephead_ref_annotation = out_dephead_annotation.copy()
    out_deprel_annotation = out_dephead_annotation.copy()
    for (sent, malt_sent) in zip(sentences, malt_sentences):
        for (token_index, malt_tok) in zip(sent, malt_sent):
            cols = [(None if col == UNDEF else col)
                    for col in malt_tok.split(TAG_SEP)]
            out_deprel_annotation[token_index] = cols[DEPREL_COLUMN]
            head = int(cols[HEAD_COLUMN])
            out_dephead_annotation[token_index] = str(sent[head -
                                                           1]) if head else "-"
            out_dephead_ref_annotation[token_index] = str(
                ref_annotation[sent[head - 1]]) if head else ""

    out_dephead.write(out_dephead_annotation)
    out_dephead_ref.write(out_dephead_ref_annotation)
    out_deprel.write(out_deprel_annotation)
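
Each token sent to MaltParser is one tab-separated CoNLL-style line built by conll_token: position, word form, lemma placeholder, coarse POS, POS and the MSD features with their separators rewritten. A standalone sketch, assuming TAG_SEP is a tab and UNDEF is "_" (both assumptions following common CoNLL conventions, not taken from the code above):

import re

TAG_SEP = "\t"  # assumed
UNDEF = "_"     # assumed

def conll_token(nr, form, pos, msd):
    """Build one CoNLL-style input line the way annotate's conll_token helper does."""
    feats = re.sub(r"[ ,.]", "|", msd).replace("+", "/")
    return TAG_SEP.join((str(nr), form, UNDEF, pos, pos, feats))

print(conll_token(1, "katten", "NN", "NN.UTR.SIN.DEF.NOM"))
# -> "1\tkatten\t_\tNN\tNN\tNN|UTR|SIN|DEF|NOM"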
Example #30
def annotate(token: Annotation = Annotation("<token>"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             reference: Annotation = Annotation(
                 "<token>:misc.number_rel_<sentence>"),
             out_sense: Output = Output("<token>:saldo.sense",
                                        cls="token:sense",
                                        description="SALDO identifier"),
             out_lemgram: Output = Output("<token>:saldo.lemgram",
                                          description="SALDO lemgram"),
             out_baseform: Output = Output("<token>:saldo.baseform",
                                           cls="token:baseform",
                                           description="Baseform from SALDO"),
             models: List[Model] = [Model("[saldo.model]")],
             msd: Optional[Annotation] = Annotation("<token:msd>"),
             delimiter: str = util.DELIM,
             affix: str = util.AFFIX,
             precision: str = Config("saldo.precision"),
             precision_filter: str = "max",
             min_precision: float = 0.66,
             skip_multiword: bool = False,
             allow_multiword_overlap: bool = False,
             word_separator: str = "",
             lexicons=None):
    """Use the Saldo lexicon model (and optionally other older lexicons) to annotate pos-tagged words.

    - token, word, msd, sentence, reference: existing annotations
    - out_baseform, out_lemgram, out_sense: resulting annotations to be written
    - models: a list of pickled lexica, typically the Saldo model (saldo.pickle)
      and optional lexicons for older Swedish.
    - delimiter: delimiter character to put between ambiguous results
    - affix: an optional character to put before and after results
    - precision: a format string for how to print the precision for each annotation, e.g. ":%.3f"
      (use empty string for no precision)
    - precision_filter: an optional filter, currently there are the following values:
        max: only use the annotations that are most probable
        first: only use the most probable annotation (or one of the most probable if more than one)
        none: use all annotations
    - min_precision: only use annotations with a probability score higher than this
    - skip_multiword: set to True to disable multi word annotations
    - allow_multiword_overlap: by default we do some cleanup among overlapping multi word annotations.
      By setting this to True, all overlaps will be allowed.
    - word_separator: an optional character used to split the values of "word" into several word variations
    - lexicons: this argument cannot be set from the command line, but is used in the catapult.
      This argument must be last.
    """
    # Allow use of multiple lexicons
    models_list = [(m.path.stem, m) for m in models]
    if not lexicons:
        lexicon_list = [(name, SaldoLexicon(lex.path))
                        for name, lex in models_list]
    # Use pre-loaded lexicons (from catapult)
    else:
        lexicon_list = []
        for name, _lex in models_list:
            assert lexicons.get(
                name, None) is not None, "Lexicon %s not found!" % name
            lexicon_list.append((name, lexicons[name]))

    # Maximum number of gaps in multi-word units.
    # TODO: Set to 0 for hist-mode? since many (most?) multi-word in the old lexicons are inseparable (half öre etc)
    max_gaps = 1

    # Combine annotation names in the SALDO lexicon with the output annotations
    annotations = []
    if out_baseform:
        annotations.append((out_baseform, "gf"))
    if out_lemgram:
        annotations.append((out_lemgram, "lem"))
    if out_sense:
        annotations.append((out_sense, "saldo"))

    if skip_multiword:
        log.info("Skipping multi word annotations")

    min_precision = float(min_precision)

    # If min_precision is 0, skip almost all part-of-speech checking (verb multi-word expressions still won't be
    # allowed to span over other verbs)
    skip_pos_check = (min_precision == 0.0)

    word_annotation = list(word.read())
    ref_annotation = list(reference.read())
    if msd:
        msd_annotation = list(msd.read())

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    out_annotation = word.create_empty_attribute()

    for sent in sentences:
        incomplete_multis = []  # [{annotation, words, [ref], is_particle, lastwordWasGap, numberofgaps}]
        complete_multis = []  # ([ref], annotation)
        sentence_tokens = {}

        for token_index in sent:
            theword = word_annotation[token_index]
            ref = ref_annotation[token_index]
            msdtag = msd_annotation[token_index] if msd else ""

            annotation_info = {}
            sentence_tokens[ref] = {
                "token_index": token_index,
                "annotations": annotation_info
            }

            # Support for multiple values of word
            if word_separator:
                thewords = [w for w in theword.split(word_separator) if w]
            else:
                thewords = [theword]

            # First use MSD tags to find the most probable single word annotations
            ann_tags_words = find_single_word(thewords, lexicon_list, msdtag,
                                              precision, min_precision,
                                              precision_filter,
                                              annotation_info)

            # Find multi-word expressions
            if not skip_multiword:
                find_multiword_expressions(incomplete_multis, complete_multis,
                                           thewords, ref, msdtag, max_gaps,
                                           ann_tags_words, msd_annotation,
                                           sent, skip_pos_check)

            # Loop to next token

        if not allow_multiword_overlap:
            # Check that we don't have any unwanted overlaps
            remove_unwanted_overlaps(complete_multis)

        # Then save the rest of the multi word expressions in sentence_tokens
        save_multiwords(complete_multis, sentence_tokens)

        for tok in list(sentence_tokens.values()):
            out_annotation[tok["token_index"]] = _join_annotation(
                tok["annotations"], delimiter, affix)

        # Loop to next sentence

    for out_annotation_obj, annotation_name in annotations:
        out_annotation_obj.write(
            [v.get(annotation_name, delimiter) for v in out_annotation])