Example #1
def make_new_baseforms(OUT_baseform, tokid, msd_tag, compounds, stats_lexicon,
                       altlexicon, delimiter, affix):
    """Add a list of baseforms to the dictionary OUT_baseform[tokid]."""
    baseform_list = []
    msd_tag = msd_tag[:msd_tag.find('.')]
    for comp in compounds:
        comp = comp[1]
        base_suffix = comp[-1][1][:comp[-1][1].find('.')]
        prefix = comp[0][0]
        # If the first letter is upper case, check whether one of the affixes is a name:
        if prefix[0] == prefix[0].upper():
            if not any(True for a in comp if "pm" in a[1][a[1].find('.'):]):
                baseform = ''.join(affix[0].lower()
                                   for affix in comp[:-1]) + base_suffix
            else:
                baseform = ''.join(affix[0]
                                   for affix in comp[:-1]) + base_suffix
        else:
            baseform = ''.join(affix[0] for affix in comp[:-1]) + base_suffix

        # Keep this baseform if it occurs with the MSD tag in stats_lexicon or is found in altlexicon
        if baseform not in baseform_list:
            if stats_lexicon.lookup_word_tag_freq(
                    baseform, msd_tag) > 0 or altlexicon.lookup(
                        baseform.lower()) != []:
                baseform_list.append(baseform)

    # Update dictionary
    OUT_baseform[tokid] = util.cwbset(baseform_list, delimiter, affix) if (
        compounds and baseform_list) else affix
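
All of these examples revolve around util.cwbset from the Sparv pipeline. A minimal sketch of its assumed behaviour (not the actual Sparv implementation): join the values with the delimiter and wrap the result in the affix, producing a Corpus Workbench set attribute.

def cwbset(values, delimiter="|", affix="|"):
    # Sketch only: join the values and wrap them in the affix character.
    return affix + delimiter.join(values) + affix


print(cwbset(["bil", "tvättbil"]))   # -> "|bil|tvättbil|"
print(cwbset(["Tid:3.0"]))           # -> "|Tid:3.0|"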
Example #2
def process_output(out, stdout, in_sentences, SALDO, sensefmt, default_prob):
    """Parse WSD output and write annotation."""
    OUT = {}

    # Split output into sentences
    out_sentences = stdout.strip()
    out_sentences = out_sentences.split("\t".join(["_", "_", "_", "_", SENT_SEP, "_", "_"]))
    out_sentences = [i for i in out_sentences if i]

    # Split output into tokens
    for out_sent, in_sent in zip(out_sentences, in_sentences):
        out_tokens = [t for t in out_sent.split("\n") if t]
        for (out_tok, in_tok) in zip(out_tokens, in_sent):
            out_prob = out_tok.split("\t")[6]
            out_prob = [i for i in out_prob.split("|") if i != "_"]
            out_meanings = [i for i in out_tok.split("\t")[5].split("|") if i != "_"]
            saldo = [i for i in SALDO[in_tok].strip(util.AFFIX).split(util.DELIM) if i]

            new_saldo = []
            if out_prob:
                for meaning in saldo:
                    if meaning in out_meanings:
                        i = out_meanings.index(meaning)
                        new_saldo.append(meaning + sensefmt % float(out_prob[i]))
                    else:
                        new_saldo.append(meaning + sensefmt % float(default_prob))
            else:
                new_saldo = [meaning + sensefmt % float(default_prob) for meaning in saldo]

            # Sort by probability
            new_saldo = sorted(new_saldo, key=lambda x: float(x.split(":")[-1]), reverse=True)
            OUT[in_tok] = util.cwbset(new_saldo)

    util.write_annotation(out, OUT)
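
Here sensefmt is assumed to be a printf-style format string that appends the probability after a ":" separator, which is consistent with the sort above splitting each value on ":" and reading the last field. A hypothetical illustration:

sensefmt = ":%.3f"                           # assumed value, not taken from the source
print("kasta..1" + sensefmt % 0.985)         # -> "kasta..1:0.985"
print("kasta..1" + sensefmt % float("0.5"))  # default_prob is also passed through float()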
Example #3
def make_complem_and_compwf(OUT_complem, OUT_compwf, complemgramfmt, tokid,
                            compounds, compdelim, delimiter, affix):
    """Add a list of compound lemgrams to the dictionary OUT_complem[tokid]
    and a list of compound wordforms to OUT_compwf."""
    complem_list = []
    compwf_list = []
    for comp in compounds:
        prob = comp[0]
        comp = comp[1]
        complems = True
        for a in comp:
            if a[1] == '0':
                complems = False
                break
        if complems:
            if complemgramfmt:
                # Construct complemgram + lemprob
                complem_list.append(
                    compdelim.join(affix[1]
                                   for affix in comp) + complemgramfmt % prob)
            else:
                complem_list.append(compdelim.join(affix[1] for affix in comp))

        # If the first letter is upper case, check whether one of the affixes may be a name:
        if comp[0][0][0] == comp[0][0][0].upper():
            if not any([True for a in comp if "pm" in a[1][a[1].find('.'):]] +
                       [True for a in comp if "PM" in a[2]]):
                wf = compdelim.join(affix[0].lower() for affix in comp)
            else:
                wf = compdelim.join(affix[0] for affix in comp)
        else:
            wf = compdelim.join(affix[0] for affix in comp)

        if wf not in compwf_list:
            compwf_list.append(wf)

    # Update dictionaries
    OUT_complem[tokid] = util.cwbset(
        complem_list, delimiter,
        affix) if compounds and complem_list else affix
    OUT_compwf[tokid] = util.cwbset(compwf_list, delimiter,
                                    affix) if compounds else affix
Example #4
def make_complem_and_compwf(out_complem, out_compwf, complemgramfmt, compounds,
                            compdelim, delimiter, affix):
    """Add a list of compound lemgrams to out_complem and a list of compound wordforms to out_compwf."""
    complem_list = []
    compwf_list = []
    for comp in compounds:
        prob = comp[0]
        comp = comp[1]
        complems = True
        for a in comp:
            if a[1] == "0":
                complems = False
                break
        if complems:
            if complemgramfmt:
                # Construct complemgram + lemprob
                complem_list.append(
                    compdelim.join(affix[1]
                                   for affix in comp) + complemgramfmt % prob)
            else:
                complem_list.append(compdelim.join(affix[1] for affix in comp))

        # If the first letter is upper case, check whether one of the affixes may be a name:
        if comp[0][0][0] == comp[0][0][0].upper():
            if not any([True for a in comp if "pm" in a[1][a[1].find("."):]] +
                       [True for a in comp if "PM" in a[2]]):
                wf = compdelim.join(affix[0].lower() for affix in comp)
            else:
                wf = compdelim.join(affix[0] for affix in comp)
        else:
            wf = compdelim.join(affix[0] for affix in comp)

        if wf not in compwf_list:
            compwf_list.append(wf)

    # Add to annotations
    out_complem.append(
        util.cwbset(complem_list, delimiter, affix
                    ) if compounds and complem_list else affix)
    out_compwf.append(
        util.cwbset(compwf_list, delimiter, affix) if compounds else affix)
Example #5
def truncateset(string,
                maxlength=4095,
                delimiter="|",
                affix="|",
                encoding="UTF-8"):
    """Truncate a Corpus Workbench set to a maximum length."""
    if len(string) <= maxlength or string == "|":
        return string
    else:
        length = 1  # Including the last affix
        values = string[1:-1].split("|")
        for i, value in enumerate(values):
            length += len(value.encode(encoding)) + 1
            if length > maxlength:
                return util.cwbset(values[:i], delimiter, affix)
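
A hypothetical call (values and lengths made up, and util.cwbset assumed available as in the examples above), showing that the set is cut at a value boundary and stays a well-formed CWB set:

long_set = "|" + "|".join("value%04d" % i for i in range(1000)) + "|"
short_set = truncateset(long_set)
print(len(long_set), len(short_set))   # 10001 vs. a length <= 4095
print(short_set.endswith("|"))         # True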
Example #6
def ufeatstag(out: Output = Output(
    "<token>:misc.ufeats",
    cls="token:ufeats",
    description="Universal morphological features"),
              pos: Annotation = Annotation("<token:pos>"),
              msd: Annotation = Annotation("<token:msd>")):
    """Convert SUC MSD tags to universal features."""
    pos_tags = pos.read()
    msd_tags = msd.read()
    out_annotation = []

    for pos_tag, msd_tag in zip(pos_tags, msd_tags):
        feats = util.tagsets.suc_to_feats(pos_tag, msd_tag)
        out_annotation.append(util.cwbset(feats))

    out.write(out_annotation)
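
For illustration only, util.tagsets.suc_to_feats is assumed to return a list of Feature=Value strings, which cwbset joins into one set-valued attribute per token (the feature values below are made up):

feats = ["Case=Nom", "Definite=Def", "Gender=Com", "Number=Sing"]  # hypothetical
print(util.cwbset(feats))  # -> "|Case=Nom|Definite=Def|Gender=Com|Number=Sing|"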
Example #7
def process_output(word: Annotation, out: Output, stdout, in_sentences,
                   saldo_annotation, prob_format, default_prob):
    """Parse WSD output and write annotation."""
    out_annotation = word.create_empty_attribute()

    # Split output into sentences
    out_sentences = stdout.strip()
    out_sentences = out_sentences.split("\t".join(
        ["_", "_", "_", "_", SENT_SEP, "_", "_"]))
    out_sentences = [i for i in out_sentences if i]

    # Split output into tokens
    for out_sent, in_sent in zip(out_sentences, in_sentences):
        out_tokens = [t for t in out_sent.split("\n") if t]
        for (out_tok, in_tok) in zip(out_tokens, in_sent):
            out_prob = out_tok.split("\t")[6]
            out_prob = [i for i in out_prob.split("|") if i != "_"]
            out_meanings = [
                i for i in out_tok.split("\t")[5].split("|") if i != "_"
            ]
            saldo = [
                i for i in saldo_annotation[in_tok].strip(util.AFFIX).split(
                    util.DELIM) if i
            ]

            new_saldo = []
            if out_prob:
                for meaning in saldo:
                    if meaning in out_meanings:
                        i = out_meanings.index(meaning)
                        new_saldo.append((meaning, float(out_prob[i])))
                    else:
                        new_saldo.append((meaning, default_prob))
            else:
                new_saldo = [(meaning, default_prob) for meaning in saldo]

            # Sort by probability
            new_saldo.sort(key=lambda x: (-x[1], x[0]))
            # Format probability according to prob_format
            new_saldo = [
                saldo + prob_format % prob if prob_format else saldo
                for saldo, prob in new_saldo
            ]
            out_annotation[in_tok] = util.cwbset(new_saldo)

    out.write(out_annotation)
Example #8
def word_weights(doc: str = Document,
                 model: str = Model("[vw_topic_modelling.model]"),
                 word: str = Annotation("<token:word>"),
                 pos: str = Annotation("<token:pos>"),
                 out: str = Output("<token>:vw_topic_modelling:label_weights", description="Label weights per word")):
    """
    Report the weight for each label for each word.

    Both model and model.json must exist. See --train and --predict.
    """
    m_json = json.load(open(model + ".json"))
    index_to_label = m_json["index_to_label"]
    min_word_length = int(m_json["min_word_length"] or "0")
    banned_pos = (m_json["banned_pos"] or "").split()
    words = list(util.read_annotation(doc, word))
    poss = util.read_annotation(doc, pos) if pos else []
    data = (Example(None, vw_normalize(word))
            for n, word in enumerate(words)
            if len(word) >= min_word_length
            if not pos or poss[n] not in banned_pos)
    weights = defaultdict(list)
    with tempfile.NamedTemporaryFile() as tmp:
        args = ["--initial_regressor", model, "--invert_hash", tmp.name]
        for _ in vw_predict(args, data):
            pass
        for line in open(tmp.name, "r").readlines():
            # allmänna[1]:14342849:0.0139527
            colons = line.split(":")
            if len(colons) == 3:
                word, _hash, weight = colons
                if word[-1] == "]":
                    bracesplit = word.rsplit("[", 1)
                else:
                    bracesplit = []
                if len(bracesplit) == 2:
                    word, index = bracesplit
                    n = int(index[:-1]) + 1
                else:
                    n = 1
                weights[word].append(index_to_label[str(n)] + ":" + weight)
    ws = (
        util.cwbset(weights[vw_normalize(word)])
        for word in words
        if vw_normalize(word) in weights
    )
    util.write_annotation(doc, out, ws)
Example #9
def predict(order,
            struct,
            doc: str = Document,
            model: str = Model("[vw_topic_modelling.model]"),
            modeljson: str = Model("[vw_topic_modelling.modeljson]"),
            parent: str = Annotation("{chunk}"),
            word: str = Annotation("<token:word>"),
            out: str = Output("{chunk}:vw_topic_modelling.prediction", description="Predicted attributes"),
            pos: str = Annotation("<token:pos>"),
            raw: bool = False):
    """Predict a structural attribute."""
    raw = raw == "true"

    m_json = json.load(open(modeljson))

    data = (
        Example(None, text.words, text.span)
        for text in texts([(order, struct, parent, word, pos)],
                          map_label=lambda _: "?",
                          min_word_length=m_json["min_word_length"],
                          banned_pos=m_json["banned_pos"])
    )

    index_to_label = m_json["index_to_label"]

    args = ["--initial_regressor", model]

    if raw:
        predictions = (
            util.cwbset(index_to_label[str(s)] + ":" + str(v) for s, v in ss)
            for ss, _span in vw_predict(args, data, raw=True)
        )
    else:
        predictions = (
            index_to_label[str(s)]
            for s, _span in vw_predict(args, data)
        )

    util.write_annotation(doc, out, predictions)
Example #10
def annotate_words(out: Output, model: Model, saldoids: Annotation, pos: Annotation, annotate, pos_limit: List[str],
                   class_set=None, disambiguate=True, connect_ids=False, delimiter=util.DELIM, affix=util.AFFIX,
                   scoresep=util.SCORESEP, lexicon=None):
    """
    Annotate words with blingbring classes (rogetID).

    - out: resulting annotation.
    - model: pickled lexicon with saldoIDs as keys.
    - saldoids, pos: existing annotation with saldoIDs/parts of speech.
    - annotate: annotation function, returns an iterable containing annotations
        for one token ID. (annotate_bring() or annotate_swefn())
    - pos_limit: parts of speech that will be annotated.
        Set to None to annotate all pos.
    - class_set: output Bring classes or Roget IDs ("bring", "roget_head",
        "roget_subsection", "roget_section" or "roget_class").
        Set to None when not annotating blingbring.
    - disambiguate: use WSD and use only the most likely saldo ID.
    - connect_ids: for sweFN: append the saldo ID after each sweFN ID.
    - delimiter: delimiter character to put between ambiguous results.
    - affix: optional character to put before and after results to mark a set.
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """
    if not lexicon:
        lexicon = util.PickledLexicon(model.path)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = saldoids.read()
    token_pos = list(pos.read())
    out_annotation = pos.create_empty_attribute()

    # Check if the saldo IDs are ranked (= word senses have been disambiguated)
    wsd = saldoids.split()[1].split(".")[0] == "wsd"

    for token_index, token_sense in enumerate(sense):

        # Check if part of speech of this token is allowed
        if not pos_ok(token_pos, token_index, pos_limit):
            saldo_ids = None
            out_annotation[token_index] = affix
            continue

        if wsd and util.SCORESEP in token_sense:
            ranked_saldo = token_sense.strip(util.AFFIX).split(util.DELIM) \
                if token_sense != util.AFFIX else None
            saldo_tuples = [(i.split(util.SCORESEP)[0], i.split(util.SCORESEP)[1]) for i in ranked_saldo]

            if not disambiguate:
                saldo_ids = [i[0] for i in saldo_tuples]

            # Only take the most likely analysis into account.
            # Handle wsd with equal probability for several words
            else:
                saldo_ids = [saldo_tuples[0]]
                del saldo_tuples[0]
                while saldo_tuples and (saldo_tuples[0][1] == saldo_ids[0][1]):
                    saldo_ids = [saldo_tuples[0]]
                    del saldo_tuples[0]

                saldo_ids = [i[0] for i in saldo_ids]

        else:  # No WSD
            saldo_ids = token_sense.strip(util.AFFIX).split(util.DELIM) \
                if token_sense != util.AFFIX else None

        result = annotate(saldo_ids, lexicon, connect_ids, scoresep)
        out_annotation[token_index] = util.cwbset(result, delimiter, affix) if result else affix
    out.write(out_annotation)
Example #11
def _format_location(location_data):
    """Format location as city;country;latitude;longitude"""
    return util.cwbset(";".join((y[0], y[3], y[1], y[2])) for x, y in location_data)
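
A hypothetical input, matching the indexing above under the assumption that each entry is a pair whose second element is (city, latitude, longitude, country):

location_data = [("göteborg", ("Göteborg", "57.71", "11.97", "SE"))]  # made-up values
print(_format_location(location_data))   # -> "|Göteborg;SE;57.71;11.97|"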
Example #12
def annotate_doc(out,
                 in_token_annotation,
                 text_children,
                 saldoids=None,
                 cutoff=10,
                 types=False,
                 delimiter=util.DELIM,
                 affix=util.AFFIX,
                 freq_model=None,
                 decimals=3):
    """
    Annotate documents with lexical classes.
    - out: resulting annotation file
    - in_token_annotation: existing annotation with lexical classes on token level.
    - text_children: existing annotation for text-IDs and their word children.
    - saldoids: existing annotation with saldoIDs, needed when types=True.
    - cutoff: value for limiting the resulting bring classes.
              The result will contain all words with the top x frequencies.
              Words with frequency = 1 will be removed from the result.
    - types: if True, count every class only once per saldo ID occurrence.
    - delimiter: delimiter character to put between ambiguous results.
    - affix: optional character to put before and after results to mark a set.
    - freq_model: pickled file with reference frequencies.
    - decimals: number of decimals to keep in output.
    """
    cutoff = int(cutoff)
    types = util.strtobool(types)
    text_children = util.read_annotation(text_children)
    classes = util.read_annotation(in_token_annotation)
    sense = util.read_annotation(saldoids) if types else None

    if freq_model:
        freq_model = util.PickledLexicon(freq_model)

    out_doc = {}

    for textid, words in text_children.items():
        seen_types = set()
        class_freqs = defaultdict(int)
        words = words.split()

        for tokid in words:
            # Count only sense types
            if types:
                senses = str(
                    sorted([
                        s.split(util.SCORESEP)[0] for s in sense[tokid].strip(
                            util.AFFIX).split(util.DELIM)
                    ]))
                if senses in seen_types:
                    continue
                else:
                    seen_types.add(senses)

            rogwords = classes[tokid].strip(util.AFFIX).split(
                util.DELIM) if classes[tokid] != util.AFFIX else []
            for w in rogwords:
                class_freqs[w] += 1

        if freq_model:
            for c in class_freqs:
                # Relative frequency
                rel = class_freqs[c] / len(words)
                # Calculate class dominance
                ref_freq = freq_model.lookup(c.replace("_", " "), 0)
                if not ref_freq:
                    util.log.error("Class '%s' is missing" % c)
                class_freqs[c] = (rel / ref_freq)

        # Sort words according to frequency/dominance
        ordered_words = sorted(class_freqs.items(),
                               key=lambda x: x[1],
                               reverse=True)
        if freq_model:
            # Remove words with dominance < 1
            ordered_words = [w for w in ordered_words if w[1] >= 1]
        else:
            # Remove words with frequency 1
            ordered_words = [w for w in ordered_words if w[1] > 1]

        if len(ordered_words) > cutoff:
            cutoff_freq = ordered_words[cutoff - 1][1]
            ordered_words = [w for w in ordered_words if w[1] >= cutoff_freq]

        # Join words and frequencies/dominances
        ordered_words = [
            util.SCORESEP.join([word, str(round(freq, decimals))])
            for word, freq in ordered_words
        ]
        out_doc[textid] = util.cwbset(ordered_words, delimiter,
                                      affix) if ordered_words else affix

    util.write_annotation(out, out_doc)
Example #13
def annotate_words(out,
                   model,
                   saldoids,
                   pos,
                   annotate,
                   pos_limit,
                   class_set=None,
                   disambiguate=True,
                   connect_ids=False,
                   delimiter=util.DELIM,
                   affix=util.AFFIX,
                   scoresep=util.SCORESEP,
                   lexicon=None):
    """
    Annotate words with blingbring classes (rogetID).
    - out: resulting annotation file.
    - model: pickled lexicon with saldoIDs as keys.
    - saldoids, pos: existing annotation with saldoIDs/parts of speech.
    - annotate: annotation function, returns an iterable containing annotations
        for one token ID. (annotate_bb() or annotate_swefn())
    - pos_limit: parts of speech that will be annotated.
        Set to None to annotate all pos.
    - class_set: output Bring classes or Roget IDs ("bring", "roget_head",
        "roget_subsection", "roget_section" or "roget_class").
        Set to None when not annotating blingbring.
    - disambiguate: use WSD and use only the most likely saldo ID.
    - connect_ids: for sweFN: append the saldo ID after each sweFN ID.
    - delimiter: delimiter character to put between ambiguous results.
    - affix: optional character to put before and after results to mark a set.
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """

    if not lexicon:
        lexicon = util.PickledLexicon(model)
    # Otherwise use pre-loaded lexicon (from catapult)

    if pos_limit.lower() == "none":
        pos_limit = None

    result_dict = {}
    sense = util.read_annotation(saldoids)
    token_pos = util.read_annotation(pos)

    for tokid in sense:

        # Check if part of speech of this token is allowed
        if not pos_ok(token_pos, tokid, pos_limit):
            saldo_ids = None
            result_dict[tokid] = affix
            continue

        if util.SCORESEP in sense[tokid]:  # WSD
            ranked_saldo = sense[tokid].strip(util.AFFIX).split(util.DELIM) \
                if sense[tokid] != util.AFFIX else None
            saldo_tuples = [(i.split(util.SCORESEP)[0],
                             i.split(util.SCORESEP)[1]) for i in ranked_saldo]

            if not disambiguate:
                saldo_ids = [i[0] for i in saldo_tuples]

            # Only take the most likely analysis into account.
            # Handle wsd with equal probability for several words
            else:
                saldo_ids = [saldo_tuples[0]]
                del saldo_tuples[0]
                while saldo_tuples and (saldo_tuples[0][1] == saldo_ids[0][1]):
                    saldo_ids = [saldo_tuples[0]]
                    del saldo_tuples[0]

                saldo_ids = [i[0] for i in saldo_ids]

        else:  # No WSD
            saldo_ids = sense[tokid].strip(util.AFFIX).split(util.DELIM) \
                if sense[tokid] != util.AFFIX else None

        result = annotate(saldo_ids, lexicon, connect_ids, scoresep)
        result_dict[tokid] = util.cwbset(result, delimiter,
                                         affix) if result else affix
    util.write_annotation(out, result_dict)
Example #14
def annotate_text(out: Output, lexical_classes_token: Annotation, text: Annotation, token: Annotation,
                  saldoids, cutoff, types, delimiter, affix, freq_model, decimals):
    """
    Annotate text chunks with lexical classes.

    - out: resulting annotation file
    - lexical_classes_token: existing annotation with lexical classes on token level.
    - text, token: existing annotations for the text-IDs and the tokens.
    - saldoids: existing annotation with saldoIDs, needed when types=True.
    - cutoff: value for limiting the resulting bring classes.
              The result will contain all words with the top x frequencies.
              Words with frequency = 1 will be removed from the result.
    - types: if True, count every class only once per saldo ID occurrence.
    - delimiter: delimiter character to put between ambiguous results.
    - affix: optional character to put before and after results to mark a set.
    - freq_model: pickled file with reference frequencies.
    - decimals: number of decimals to keep in output.
    """
    cutoff = int(cutoff)
    text_children, _orphans = text.get_children(token, preserve_parent_annotation_order=True)
    classes = list(lexical_classes_token.read())
    sense = list(saldoids.read()) if types else None

    if freq_model:
        freq_model = util.PickledLexicon(freq_model.path)

    out_annotation = text.create_empty_attribute()

    for text_index, words in enumerate(text_children):
        seen_types = set()
        class_freqs = defaultdict(int)

        for token_index in words:
            # Count only sense types
            if types:
                senses = str(sorted([s.split(util.SCORESEP)[0] for s in sense[token_index].strip(util.AFFIX).split(util.DELIM)]))
                if senses in seen_types:
                    continue
                else:
                    seen_types.add(senses)

            rogwords = classes[token_index].strip(util.AFFIX).split(util.DELIM) if classes[token_index] != util.AFFIX else []
            for w in rogwords:
                class_freqs[w] += 1

        if freq_model:
            for c in class_freqs:
                # Relative frequency
                rel = class_freqs[c] / len(words)
                # Calculate class dominance
                ref_freq = freq_model.lookup(c.replace("_", " "), 0)
                if not ref_freq:
                    log.error("Class '%s' is missing" % c)
                class_freqs[c] = (rel / ref_freq)

        # Sort words according to frequency/dominance
        ordered_words = sorted(class_freqs.items(), key=lambda x: x[1], reverse=True)
        if freq_model:
            # Remove words with dominance < 1
            ordered_words = [w for w in ordered_words if w[1] >= 1]
        else:
            # Remove words with frequency 1
            ordered_words = [w for w in ordered_words if w[1] > 1]

        if len(ordered_words) > cutoff:
            cutoff_freq = ordered_words[cutoff - 1][1]
            ordered_words = [w for w in ordered_words if w[1] >= cutoff_freq]

        # Join words and frequencies/dominances
        ordered_words = [util.SCORESEP.join([word, str(round(freq, decimals))]) for word, freq in ordered_words]
        out_annotation[text_index] = util.cwbset(ordered_words, delimiter, affix) if ordered_words else affix

    out.write(out_annotation)
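
Each resulting per-text value is a set of class:score strings, assuming util.SCORESEP is the ":" separator also used for splitting in the WSD examples above; the classes and numbers below are made up:

ordered_words = ["Tid:3.0", "Rörelse:2.0"]     # hypothetical classes and scores
print(util.cwbset(ordered_words, "|", "|"))    # -> "|Tid:3.0|Rörelse:2.0|"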