Ejemplo n.º 1
0
def blingbring_words(out: Output = Output("<token>:lexical_classes.blingbring",
                                          description="Lexical classes for tokens from Blingbring"),
                     model: Model = Model("[lexical_classes.bb_word_model]"),
                     saldoids: Annotation = Annotation("<token:sense>"),
                     pos: Annotation = Annotation("<token:pos>"),
                     pos_limit: List[str] = ["NN", "VB", "JJ", "AB"],
                     class_set: str = "bring",
                     disambiguate: bool = True,
                     connect_ids: bool = False,
                     delimiter: str = util.DELIM,
                     affix: str = util.AFFIX,
                     scoresep: str = util.SCORESEP,
                     lexicon=None):
    """Blingbring specific wrapper for annotate_words. See annotate_words for more info.

    Args:
        out: Output annotation for the lexical classes.
        model: Blingbring word model, passed on to annotate_words.
        saldoids: Sense annotation, passed on to annotate_words.
        pos: Part-of-speech annotation, passed on to annotate_words.
        pos_limit: Part-of-speech tags passed on to annotate_words as its pos_limit.
        class_set: Which class set to read from each lexicon entry. Must be one of
            "bring", "roget_head", "roget_subsection", "roget_section" or "roget_class";
            any other value is replaced by "bring" after logging a warning.
        disambiguate: Passed on to annotate_words.
        connect_ids: Passed on to annotate_words.
        delimiter: Passed on to annotate_words.
        affix: Passed on to annotate_words.
        scoresep: Passed on to annotate_words.
        lexicon: Optional lexicon object, passed on to annotate_words.
    """
    # pos_limit="NN VB JJ AB" | None

    if class_set not in ["bring", "roget_head", "roget_subsection", "roget_section", "roget_class"]:
        # The offending value must be supplied as a logging argument, otherwise
        # the "%s" placeholder is emitted verbatim.
        log.warning("Class '%s' not available. Fallback to 'bring'.", class_set)
        class_set = "bring"

    # Blingbring annotation function
    def annotate_bring(saldo_ids, lexicon, connect_IDs=False, scoresep=util.SCORESEP):
        """Return the sorted set of classes found in the lexicon for the given SALDO IDs."""
        rogetid = set()
        if saldo_ids:
            for sid in saldo_ids:
                if connect_IDs:
                    # NOTE(review): this branch iterates over the lookup result directly and
                    # does not select class_set, unlike the branch below - confirm intended.
                    rogetid = rogetid.union(set(i + scoresep + sid for i in lexicon.lookup(sid, default=set())))
                else:
                    rogetid = rogetid.union(lexicon.lookup(sid, default=dict()).get(class_set, set()))
        return sorted(rogetid)

    annotate_words(out, model, saldoids, pos, annotate_bring, pos_limit=pos_limit, disambiguate=disambiguate,
                   class_set=class_set, connect_ids=connect_ids, delimiter=delimiter, affix=affix, scoresep=scoresep,
                   lexicon=lexicon)
def uppercase(
    word: Annotation = Annotation("<token:word>"),
    out: Output = Output("<token>:uppercase.upper"),
    # some_config_variable: str = Config("uppercase.some_setting")
):
    """Write an upper-cased copy of every value in the word annotation."""
    upper_values = [token_text.upper() for token_text in word.read()]
    out.write(upper_values)
Ejemplo n.º 3
0
def diapivot_annotate(
        out: Output = Output(
            "<token>:hist.diapivot",
            description="SALDO IDs corresponding to lemgrams"),
        lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
        model: Model = Model("hist/diapivot.pickle")):
    """Annotate each lemgram with its corresponding saldo_id according to model.

    Every value of the lemgram annotation is split on util.DELIM, each part is
    looked up with PivotLexicon.get_exactMatch, and the matches are written as
    a util.AFFIX-wrapped, util.DELIM-separated string. Values without any match
    are written as a single util.AFFIX.

    Args:
        out: Resulting annotation.
        lemgram: Existing lemgram annotation.
        model: Crosslink model handed to PivotLexicon.
    """
    lexicon = PivotLexicon(model)

    out_annotation = []

    for lemgrams in lemgram.read():
        saldo_ids = []
        # Use a distinct loop name so the 'lemgram' parameter is not shadowed.
        for single_lemgram in lemgrams.split(util.DELIM):
            s_i = lexicon.get_exactMatch(single_lemgram)
            if s_i:
                saldo_ids.append(s_i)
        if saldo_ids:
            # Remove duplicates while keeping first-seen order. Joining a plain
            # set would make the output order vary between runs.
            unique_ids = list(dict.fromkeys(saldo_ids))
            out_annotation.append(util.AFFIX + util.DELIM.join(unique_ids) + util.AFFIX)
        else:
            out_annotation.append(util.AFFIX)

    out.write(out_annotation)
Ejemplo n.º 4
0
def swefn_words(out: Output = Output("<token>:lexical_classes.swefn",
                                     description="Lexical classes for tokens from SweFN"),
                model: Model = Model("[lexical_classes.swefn_word_model]"),
                saldoids: Annotation = Annotation("<token:sense>"),
                pos: Annotation = Annotation("<token:pos>"),
                pos_limit: List[str] = ["NN", "VB", "JJ", "AB"],
                disambiguate: bool = True,
                connect_ids: bool = False,
                delimiter: str = util.DELIM,
                affix: str = util.AFFIX,
                scoresep: str = util.SCORESEP,
                lexicon=None):
    """SweFN flavour of annotate_words; all options are forwarded to annotate_words."""

    def annotate_swefn(saldo_ids, lexicon, connect_IDs=False, scoresep=util.SCORESEP):
        """Return the sorted SweFN entries the lexicon holds for the given SALDO IDs."""
        collected = set()
        # A falsy saldo_ids (None, empty) yields no entries at all.
        for saldo_id in saldo_ids or ():
            entries = lexicon.lookup(saldo_id, default=set())
            if connect_IDs:
                # Tag every entry with the SALDO ID it came from.
                collected.update(entry + scoresep + saldo_id for entry in entries)
            else:
                collected.update(entries)
        return sorted(collected)

    annotate_words(
        out, model, saldoids, pos, annotate_swefn,
        pos_limit=pos_limit,
        disambiguate=disambiguate,
        connect_ids=connect_ids,
        delimiter=delimiter,
        affix=affix,
        scoresep=scoresep,
        lexicon=lexicon,
    )
Ejemplo n.º 5
0
def pretty(doc: Document = Document(),
           docid: AnnotationData = AnnotationData("<docid>"),
           out: Export = Export("xml_pretty/[xml_export.filename]"),
           token: Annotation = Annotation("<token>"),
           word: Annotation = Annotation("[export.word]"),
           annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"),
           source_annotations: SourceAnnotations = SourceAnnotations("xml_export.source_annotations"),
           header_annotations: SourceAnnotations = SourceAnnotations("xml_export.header_annotations"),
           remove_namespaces: bool = Config("export.remove_module_namespaces", False),
           sparv_namespace: str = Config("export.sparv_namespace"),
           source_namespace: str = Config("export.source_namespace"),
           include_empty_attributes: bool = Config("xml_export.include_empty_attributes")):
    """Export annotations to pretty XML in export_dir.

    Args:
        doc: Name of the original document.
        docid: Annotation with document IDs.
        out: Path and filename pattern for resulting file.
        token: The token annotation; only its name is used here (as token_name).
        word: Annotation containing the token strings.
        annotations: List of elements:attributes (annotations) to include.
        source_annotations: List of elements:attributes from the original document
            to be kept. If not specified, everything will be kept.
        header_annotations: List of header elements from the original document to include
            in the export. If not specified, all headers will be kept.
        remove_namespaces: Whether to remove module "namespaces" from element and attribute names.
            Disabled by default.
        sparv_namespace: The namespace to be added to all Sparv annotations.
        source_namespace: The namespace to be added to all annotations present in the source.
        include_empty_attributes: Whether to include attributes even when they are empty.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    token_name = token.name

    # Read words and document ID
    word_annotation = list(word.read())
    docid_annotation = docid.read()

    # Get annotation spans, annotations list etc.
    annotation_list, _, export_names = util.get_annotation_names(annotations, source_annotations, doc=doc,
                                                                 token_name=token_name,
                                                                 remove_namespaces=remove_namespaces,
                                                                 sparv_namespace=sparv_namespace,
                                                                 source_namespace=source_namespace)
    h_annotations, h_export_names = util.get_header_names(header_annotations, doc=doc)
    export_names.update(h_export_names)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, h_annotations,
                                                              doc=doc, split_overlaps=True)
    xmlstr = xml_utils.make_pretty_xml(span_positions, annotation_dict, export_names, token_name, word_annotation,
                                       docid_annotation, include_empty_attributes, sparv_namespace)

    # Write XML to file. The encoding is given explicitly so the result does not
    # depend on the platform's default locale encoding.
    with open(out, mode="w", encoding="utf-8") as outfile:
        outfile.write(xmlstr)
    log.info("Exported: %s", out)
Ejemplo n.º 6
0
def msdtag(out: Output = Output(
    "<token>:hunpos.msd",
    cls="token:msd",
    description="Part-of-speeches with morphological descriptions"),
           word: Annotation = Annotation("<token:word>"),
           sentence: Annotation = Annotation("<sentence>"),
           binary: Binary = Binary("[hunpos.binary]"),
           model: Model = Model("[hunpos.model]"),
           morphtable: Optional[Model] = Model("[hunpos.morphtable]"),
           patterns: Optional[Model] = Model("[hunpos.patterns]"),
           tag_mapping=None,
           encoding: str = util.UTF8):
    """Tag every token with a POS/MSD tag by running the Hunpos binary.

    Words matching a regex from the optional patterns file are replaced by an
    alias ("[[name]]") before being sent to the tagger. The tags read back from
    the tagger are optionally translated through tag_mapping, which may be a
    ready-made mapping or the name of one in util.tagsets.mappings.
    """
    # Resolve the tag mapping: nothing given -> empty mapping, name -> named mapping.
    if tag_mapping is None or tag_mapping == "":
        tag_mapping = {}
    elif isinstance(tag_mapping, str):
        tag_mapping = util.tagsets.mappings[tag_mapping]

    # Load (name, compiled regex, tags) triples from the tab-separated patterns file.
    pattern_list = []
    if patterns:
        with open(patterns.path, encoding="utf-8") as pattern_file:
            for raw_line in pattern_file:
                stripped = raw_line.strip()
                # Skip blank lines and lines starting with a comment marker.
                if not stripped or raw_line.startswith("#"):
                    continue
                name, pattern, tags = stripped.split("\t", 2)
                pattern_list.append((name, re.compile("^%s$" % pattern), tags))

    def replace_word(w):
        """Return the alias of the first pattern matching w, or w itself."""
        for alias, regex, _tags in pattern_list:
            if regex.match(w):
                return "[[%s]]" % alias
        return w

    sentences, _orphans = sentence.get_children(word)
    token_word = list(word.read())

    # Build the tagger input from the sentence/token structure.
    tagger_input = SENT_SEP.join(
        TOK_SEP.join(replace_word(token_word[token_index]) for token_index in sent)
        for sent in sentences)

    args = [model.path]
    if morphtable:
        args.extend(["-m", morphtable.path])
    stdout, _ = util.system.call_binary(binary, args, tagger_input, encoding=encoding)

    # Map the tagger output back onto the token indices.
    out_annotation = word.create_empty_attribute()
    tagged_sentences = stdout.strip().split(SENT_SEP)
    for sent, tagged_sent in zip(sentences, tagged_sentences):
        tagged_tokens = tagged_sent.strip().split(TOK_SEP)
        for token_index, tagged_token in zip(sent, tagged_tokens):
            fields = tagged_token.strip().split(TAG_SEP)
            tag = fields[TAG_COLUMN]
            out_annotation[token_index] = tag_mapping.get(tag, tag)

    out.write(out_annotation)
Ejemplo n.º 7
0
def override(chunk: Annotation, repl: Annotation, out: Output):
    """Write the values of 'chunk', substituting each one whose 'repl' value is non-empty."""
    def empty(val):
        # A value counts as empty when it is falsy or consists of a lone "|".
        return not val or val == "|"

    replacements = list(repl.read())
    out.write(original if empty(replacements[position]) else replacements[position]
              for position, original in enumerate(chunk.read()))
Ejemplo n.º 8
0
def struct_to_token(
        attr: Annotation = Annotation("{struct}:{attr}"),
        token: Annotation = Annotation("<token>"),
        out: Output = Output("<token>:misc.from_struct_{struct}_{attr}")):
    """Give every token the attribute value of its parent structure ("" when it has none)."""
    token_parents = token.get_parents(attr)
    attr_values = list(attr.read())

    out_values = []
    for parent_index in token_parents:
        if parent_index is None:
            # Token without a parent in the structural annotation.
            out_values.append("")
        else:
            out_values.append(attr_values[parent_index])

    out.write(out_values)
Ejemplo n.º 9
0
def upostag(out: Output = Output("<token>:misc.upos",
                                 cls="token:upos",
                                 description="Part-of-speeches in UD"),
            pos: Annotation = Annotation("<token:pos>")):
    """Translate each SUC POS tag to UPOS via util.tagsets.pos_to_upos and write the result."""
    upos_tags = [util.tagsets.pos_to_upos(suc_tag, "swe", "SUC") for suc_tag in pos.read()]
    out.write(upos_tags)
Ejemplo n.º 10
0
def number_by_position(out: Output = Output("{annotation}:misc.number_position"),
                       chunk: Annotation = Annotation("{annotation}"),
                       prefix: str = "",
                       zfill: bool = False,
                       start: int = START_DEFAULT):
    """Number chunks using their spans as the sort key."""
    chunk_spans = list(chunk.read_spans())

    # The ordering key of a chunk is its span; the chunk's value is ignored.
    _read_chunks_and_write_new_ordering(
        out, chunk, lambda index, _value: chunk_spans[index], prefix, zfill, start)
Ejemplo n.º 11
0
def concat(out: Output,
           left: Annotation,
           right: Annotation,
           separator: str = "",
           merge_twins: bool = False):
    """Join the values of two annotations pairwise, placing 'separator' in between.

    With merge_twins set to True, a pair of identical values is written once
    instead of being concatenated.
    """
    right_values = list(right.read())

    def _joined(position, left_value):
        right_value = right_values[position]
        if merge_twins and left_value == right_value:
            return left_value
        return f"{left_value}{separator}{right_value}"

    out.write(_joined(position, left_value)
              for position, left_value in enumerate(left.read()))
Ejemplo n.º 12
0
def annotate(
        sense: Annotation = Annotation("<token>:saldo.sense"),
        out_scores: Output = Output("<token>:sensaldo.sentiment_score",
                                    description="SenSALDO sentiment score"),
        out_labels: Output = Output("<token>:sensaldo.sentiment_label",
                                    description="SenSALDO sentiment label"),
        model: Model = Model("[sensaldo.model]"),
        lexicon=None):
    """Assign sentiment values to tokens based on their sense annotation.

    For every token the sense with the highest probability is selected and
    looked up in the lexicon; the score found and its label are written out.
    - sense: existing annotation with saldoIDs.
    - out_scores, out_labels: resulting annotations.
    - model: pickled lexicon, loaded when no lexicon is supplied.
    - lexicon: this argument cannot be set from the command line,
      but is used in the catapult. This argument must be last.
    """
    # Load the lexicon unless a pre-loaded one was supplied (from catapult).
    if not lexicon:
        lexicon = util.PickledLexicon(model.path)

    result_scores = []
    result_labels = []

    for token in sense.read():
        # Split the token value into (sense, probability) pairs; senses
        # without a probability get -1.0.
        candidates = []
        for s in token.split(util.DELIM):
            if not s:
                continue
            if util.SCORESEP in s:
                candidates.append(tuple(s.rsplit(util.SCORESEP, 1)))
            else:
                candidates.append((s, -1.0))

        # Look up the sentiment score of the most probable sense.
        score = None
        if candidates:
            ranked = sorted(candidates, key=lambda candidate: float(candidate[1]), reverse=True)
            score = lexicon.lookup(ranked[0][0], None)

        # NOTE(review): any falsy score (e.g. a numeric 0) is treated as missing here -
        # confirm that lexicon scores are never falsy for valid entries.
        has_score = bool(score)
        result_scores.append(score if has_score else None)
        result_labels.append(SENTIMENT_LABLES.get(int(score)) if has_score else None)

    out_scores.write(result_scores)
    out_labels.write(result_labels)
Ejemplo n.º 13
0
def number_relative(out: Output = Output("{annotation}:misc.number_rel_{parent}"),
                    parent: Annotation = Annotation("{parent}"),
                    child: Annotation = Annotation("{annotation}"),
                    prefix: str = "",
                    zfill: bool = False,
                    start: int = START_DEFAULT):
    """Number each child by its position among the children of its parent.

    Counting begins at 'start' for every parent. With zfill enabled, numbers
    are zero-padded to the width of the highest number within that parent.
    """
    parent_children, _orphans = parent.get_children(child)

    def _numbers():
        for children in parent_children:
            for child_nr, _index in enumerate(children, start):
                # Padding width: number of digits of the last number in this parent.
                width = len(str(len(children) - 1 + start)) if zfill else 0
                yield "{prefix}{nr:0{length}d}".format(prefix=prefix, length=width, nr=child_nr)

    out.write(_numbers())
Ejemplo n.º 14
0
def ufeatstag(out: Output = Output(
    "<token>:misc.ufeats",
    cls="token:ufeats",
    description="Universal morphological features"),
              pos: Annotation = Annotation("<token:pos>"),
              msd: Annotation = Annotation("<token:msd>")):
    """Convert each (POS, MSD) pair with util.tagsets.suc_to_feats and write it via util.cwbset."""
    tag_pairs = zip(pos.read(), msd.read())
    out_annotation = [
        util.cwbset(util.tagsets.suc_to_feats(pos_tag, msd_tag))
        for pos_tag, msd_tag in tag_pairs
    ]
    out.write(out_annotation)
Ejemplo n.º 15
0
def annotate(corpus_text: Text = Text(),
             lang: Language = Language(),
             conf_file: Model = Model("[freeling.conf]"),
             fl_binary: Binary = Binary("[freeling.binary]"),
             sentence_chunk: Optional[Annotation] = Annotation("[freeling.sentence_chunk]"),
             out_token: Output = Output("freeling.token", cls="token", description="Token segments"),
             out_word: Output = Output("<token>:freeling.word", cls="token:word", description="Token strings"),
             out_baseform: Output = Output("<token>:freeling.baseform", description="Baseforms from FreeLing"),
             out_upos: Output = Output("<token>:freeling.upos", cls="token:upos", description="Part-of-speeches in UD"),
             out_pos: Output = Output("<token>:freeling.pos", cls="token:pos",
                                      description="Part-of-speeches from FreeLing"),
             out_sentence: Optional[Output] = Output("freeling.sentence", cls="sentence", description="Sentence segments"),
             sentence_annotation: Optional[Annotation] = Annotation("[freeling.sentence_annotation]")):
    """Run FreeLing and output sentences, tokens, baseforms, upos and pos.

    All arguments are handed over positionally to main(). The default for
    'lang' is an instance, Language(), rather than the bare class, matching
    annotate_full.
    """
    main(corpus_text, lang, conf_file, fl_binary, sentence_chunk, out_token, out_word, out_baseform, out_upos, out_pos,
         out_sentence, sentence_annotation)
Ejemplo n.º 16
0
def postag(out: Output = Output("<token>:hunpos.pos",
                                cls="token:pos",
                                description="Part-of-speech tags"),
           msd: Annotation = Annotation("<token>:hunpos.msd")):
    """Write the first "."-separated field of every MSD value as the POS tag."""
    from sparv.modules.misc import misc

    pos_field = 0
    msd_separator = "."
    misc.select(out, msd, index=pos_field, separator=msd_separator)
Ejemplo n.º 17
0
def swefn_text(out: Output = Output("<text>:lexical_classes.swefn",
                                    description="Lexical classes for text chunks from SweFN"),
               lexical_classes_token: Annotation = Annotation("<token>:lexical_classes.swefn"),
               text: Annotation = Annotation("<text>"),
               token: Annotation = Annotation("<token>"),
               saldoids: Optional[Annotation] = Annotation("<token:sense>"),
               cutoff: int = 3,
               types: bool = False,
               delimiter: str = util.DELIM,
               affix: str = util.AFFIX,
               freq_model: Model = Model("[lexical_classes.swefn_freq_model]"),
               decimals: int = 3):
    """SweFN flavour of annotate_text; every argument is forwarded unchanged by keyword."""
    annotate_text(
        out=out,
        lexical_classes_token=lexical_classes_token,
        text=text,
        token=token,
        saldoids=saldoids,
        cutoff=cutoff,
        types=types,
        delimiter=delimiter,
        affix=affix,
        freq_model=freq_model,
        decimals=decimals,
    )
Ejemplo n.º 18
0
def number_by_parent(out: Output = Output("{annotation}:misc.number_by_parent_{parent_annotation}__{parent_attribute}"),
                     chunk: Annotation = Annotation("{annotation}"),
                     parent_order: Annotation = Annotation("{parent_annotation}:{parent_attribute}"),
                     prefix: str = "",
                     zfill: bool = False,
                     start: int = START_DEFAULT):
    """Number chunks, ordered first by their parent's attribute value and then by chunk index."""
    parent_children, _orphans = parent_order.get_children(chunk)

    # Sort key for every chunk that has a parent: (parent value, chunk index).
    sort_keys = {}
    for parent_index, parent_value in enumerate(parent_order.read()):
        for child_index in parent_children[parent_index]:
            sort_keys[child_index] = (parent_value, child_index)

    # Chunks without a parent are absent from sort_keys and therefore get the key None.
    _read_chunks_and_write_new_ordering(
        out, chunk, lambda index, _value: sort_keys.get(index), prefix, zfill, start)
Ejemplo n.º 19
0
def translate_tag(out: Output, tag: Annotation, mapping: dict = {}):
    """Translate tags through a mapping; tags absent from the mapping are kept as they are.

    'mapping' may also be the name of an entry in util.tagsets.mappings
    (for example parole_to_suc or suc_to_simple).
    """
    table = util.tagsets.mappings[mapping] if isinstance(mapping, str) else mapping
    out.write(table.get(source_tag, source_tag) for source_tag in tag.read())
def word_weights(doc: str = Document,
                 model: str = Model("[vw_topic_modelling.model]"),
                 word: str = Annotation("<token:word>"),
                 pos: str = Annotation("<token:pos>"),
                 out: str = Output("<token>:vw_topic_modelling:label_weights", description="Label weights per word")):
    """
    Report the weight for each label for each word.

    Both model and model.json must exist. See --train and --predict.

    The model is run over the words through vw_predict with --invert_hash, the
    resulting "feature:hash:weight" lines are parsed, and the weights of every
    word are written as a util.cwbset of "label:weight" items.
    """
    # Close the JSON file deterministically instead of leaking the handle.
    with open(model + ".json") as json_file:
        m_json = json.load(json_file)
    index_to_label = m_json["index_to_label"]
    min_word_length = int(m_json["min_word_length"] or "0")
    banned_pos = (m_json["banned_pos"] or "").split()
    words = list(util.read_annotation(doc, word))
    # Materialised like 'words', because the values are accessed by index below.
    poss = list(util.read_annotation(doc, pos)) if pos else []
    data = (Example(None, vw_normalize(token))
            for n, token in enumerate(words)
            if len(token) >= min_word_length
            if not pos or poss[n] not in banned_pos)
    weights = defaultdict(list)
    with tempfile.NamedTemporaryFile() as tmp:
        args = ["--initial_regressor", model, "--invert_hash", tmp.name]
        # The predictions themselves are not needed; the run only fills tmp.
        for _ in vw_predict(args, data):
            pass
        with open(tmp.name, "r") as hash_file:
            for line in hash_file:
                # allmänna[1]:14342849:0.0139527
                # Drop the line break so it does not become part of the weight.
                colons = line.rstrip("\n").split(":")
                if len(colons) != 3:
                    continue
                feature, _hash, weight = colons
                # A trailing "[i]" carries a label index; without it n is 1.
                bracesplit = feature.rsplit("[", 1) if feature.endswith("]") else []
                if len(bracesplit) == 2:
                    feature, index = bracesplit
                    n = int(index[:-1]) + 1
                else:
                    n = 1
                weights[feature].append(index_to_label[str(n)] + ":" + weight)
    # NOTE(review): words without any weight are skipped, so the output can hold
    # fewer values than there are tokens - confirm this is what write_annotation expects.
    ws = (
        util.cwbset(weights[vw_normalize(token)])
        for token in words
        if vw_normalize(token) in weights
    )
    util.write_annotation(doc, out, ws)
Ejemplo n.º 21
0
def lix(text: Annotation = Annotation("<text>"),
        sentence: Annotation = Annotation("<sentence>"),
        word: Annotation = Annotation("<token:word>"),
        pos: Annotation = Annotation("<token:pos>"),
        out: Output = Output("<text>:readability.lix",
                             description="LIX values for text chunks"),
        skip_pos: List[str] = ["MAD", "MID", "PAD"],
        fmt: str = "%.2f"):
    """Compute a LIX value for every text element and write it formatted with 'fmt'."""
    # Parent/child relations: text -> sentences and sentence -> tokens.
    text_children, _orphans = text.get_children(sentence)
    word_pos = list(word.read_attributes((word, pos)))
    sentence_children, _orphans = sentence.get_children(word)
    sentence_children = list(sentence_children)

    def _words_in(sentence_index):
        """Return the words of one sentence after filtering through actual_words."""
        tokens = [word_pos[token_index] for token_index in sentence_children[sentence_index]]
        return list(actual_words(tokens, skip_pos))

    lix_annotation = [
        fmt % lix_calc([_words_in(sentence_index) for sentence_index in sentence_indices])
        for sentence_indices in text_children
    ]

    out.write(lix_annotation)
Ejemplo n.º 22
0
def ids(doc: Document = Document(),
        annotation: Annotation = Annotation("{annotation}"),
        out: Output = Output("{annotation}:misc.id",
                             description="Unique ID for {annotation}"),
        docid: AnnotationData = AnnotationData("<docid>"),
        prefix: str = ""):
    """Generate one ID per span of an existing annotation, prefixed with 'prefix' and the document ID."""
    id_prefix = prefix + docid.read()
    span_count = len(list(annotation.read()))

    # Use doc name and annotation name as seed for the IDs
    _reset_id("{}/{}".format(doc, annotation), span_count)

    generated = []
    for _ in range(span_count):
        # The IDs generated so far are handed to _make_id together with the prefix.
        generated.append(_make_id(id_prefix, generated))
    out.write(generated)
Ejemplo n.º 23
0
def find_replace_regex(chunk: Annotation,
                       out: Output,
                       find: str = "",
                       sub: str = ""):
    """Apply a regular expression substitution to every value of an annotation.

    N.B: When writing regular expressions in YAML they should be enclosed in single quotes.
    """
    values = chunk.read()
    out.write(re.sub(find, sub, value) for value in values)
Ejemplo n.º 24
0
def number_by_attribute(out: Output = Output("{annotation}:misc.number_by_{attribute}"),
                        chunk: Annotation = Annotation("{annotation}:{attribute}"),
                        prefix: str = "",
                        zfill: bool = False,
                        start: int = START_DEFAULT):
    """Number chunks in the order given by _natural_sorting of their attribute values."""
    # The ordering key depends on the attribute value only; the index is ignored.
    _read_chunks_and_write_new_ordering(
        out, chunk, lambda _index, value: _natural_sorting(value), prefix, zfill, start)
Ejemplo n.º 25
0
def nominal_ratio(text: Annotation = Annotation("<text>"),
                  pos: Annotation = Annotation("<token:pos>"),
                  out: Output = Output(
                      "<text>:readability.nk",
                      description="Nominal ratios for text chunks"),
                  noun_pos: List[str] = ["NN", "PP", "PC"],
                  verb_pos: List[str] = ["PN", "AB", "VB"],
                  fmt: str = "%.2f"):
    """Compute a nominal ratio for every text element and write it formatted with 'fmt'."""
    text_children, _orphans = text.get_children(pos)
    pos_annotation = list(pos.read())

    # Calculate the nominal ratio for every text element
    nk_annotation = [
        fmt % nominal_ratio_calc([pos_annotation[token_index] for token_index in token_indices],
                                 noun_pos, verb_pos)
        for token_indices in text_children
    ]
    out.write(nk_annotation)
Ejemplo n.º 26
0
def vrt(doc: Document = Document(),
        out: Export = Export("vrt/{doc}.vrt"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations(
            "cwb.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace")):
    """Export annotations to vrt.

    - doc: name of the original document.
    - out: path of the resulting file; its directory is created if missing.
    - token: the token annotation; only its name is used here.
    - word: annotation containing the token strings.
    - annotations: list of elements:attributes (annotations) to include.
    - source_annotations: list of elements:attributes from the original document
      to be kept. If not specified, everything will be kept.
    - remove_namespaces, sparv_namespace, source_namespace: passed on to
      util.get_annotation_names.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Read words
    word_annotation = list(word.read())

    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations,
        source_annotations,
        doc=doc,
        token_name=token.name,
        remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    span_positions, annotation_dict = util.gather_annotations(annotation_list,
                                                              export_names,
                                                              doc=doc)
    vrt_data = create_vrt(span_positions, token.name, word_annotation,
                          token_attributes, annotation_dict, export_names)

    # Write result to file. The encoding is given explicitly so the result does
    # not depend on the platform's default locale encoding.
    with open(out, "w", encoding="utf-8") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)
Ejemplo n.º 27
0
def annotate_full(corpus_text: Text = Text(),
                  lang: Language = Language(),
                  conf_file: Model = Model("[freeling.conf]"),
                  fl_binary: Binary = Binary("[freeling.binary]"),
                  sentence_chunk: Annotation = Annotation("[freeling.sentence_chunk]"),
                  out_token: Output = Output("freeling.token", cls="token", description="Token segments"),
                  out_word: Output = Output("<token>:freeling.word", cls="token:word", description="Token strings"),
                  out_baseform: Output = Output("<token>:freeling.baseform", description="Baseforms from FreeLing"),
                  out_upos: Output = Output("<token>:freeling.upos", cls="token:upos",
                                            description="Part-of-speeches in UD"),
                  out_pos: Output = Output("<token>:freeling.pos", cls="token:pos",
                                           description="Part-of-speeches from FreeLing"),
                  out_ne_type: Output = Output("<token>:freeling.ne_type", cls="token:named_entity_type",
                                               description="Named entitiy types from FreeLing"),
                  out_sentence: Optional[Output] = Output("freeling.sentence", cls="sentence",
                                                          description="Sentence segments"),
                  sentence_annotation: Optional[Annotation] = Annotation("[freeling.sentence_annotation]")):
    """Run FreeLing with named entity types in addition to the usual annotations.

    Every argument is handed over positionally to main(); out_ne_type goes last.
    """
    main(
        corpus_text,
        lang,
        conf_file,
        fl_binary,
        sentence_chunk,
        out_token,
        out_word,
        out_baseform,
        out_upos,
        out_pos,
        out_sentence,
        sentence_annotation,
        out_ne_type,
    )
Ejemplo n.º 28
0
def ovix(text: Annotation = Annotation("<text>"),
         word: Annotation = Annotation("<token:word>"),
         pos: Annotation = Annotation("<token:pos>"),
         out: Output = Output("<text>:readability.ovix",
                              description="OVIX values for text chunks"),
         skip_pos: List[str] = ["MAD", "MID", "PAD"],
         fmt: str = "%.2f"):
    """Compute an OVIX value for every text element and write it formatted with 'fmt'."""
    text_children, _orphans = text.get_children(word)
    word_pos = list(word.read_attributes((word, pos)))

    def _words_in(token_indices):
        """Return the words of one text element after filtering through actual_words."""
        tokens = [word_pos[token_index] for token_index in token_indices]
        return list(actual_words(tokens, skip_pos))

    ovix_annotation = [fmt % ovix_calc(_words_in(token_indices))
                       for token_indices in text_children]

    out.write(ovix_annotation)
Ejemplo n.º 29
0
def _read_chunks_and_write_new_ordering(out: Output, chunk: Annotation, order, prefix="", zfill=False,
                                        start=START_DEFAULT):
    """Shared implementation behind the numbering functions.

    'order' is called as order(index, value) for every chunk and returns a sort
    key. Chunks sharing a key receive the same number; numbers are handed out
    in sorted key order, counting from 'start'.
    """
    # Group chunk indices by their sort key.
    indices_by_key = defaultdict(list)
    for position, value in enumerate(chunk.read()):
        indices_by_key[order(position, value)].append(position)

    out_annotation = chunk.create_empty_attribute()

    # Width of the highest number, used for zero-padding when zfill is set.
    nr_digits = len(str(len(indices_by_key) - 1 + start))
    width = nr_digits if zfill else 0

    for nr, key in enumerate(sorted(indices_by_key), start):
        label = "{prefix}{nr:0{length}d}".format(prefix=prefix, length=width, nr=nr)
        for index in indices_by_key[key]:
            out_annotation[index] = label

    out.write(out_annotation)
Ejemplo n.º 30
0
def select(out: Output,
           annotation: Annotation,
           index: Optional[int] = 0,
           separator: Optional[str] = " ",):
    """Pick one field out of every value of an annotation.

    Each value is split on 'separator' (a single space by default) and the
    field at position 'index' is written. Every value must therefore contain
    at least index + 1 fields. An index given as a string is converted to int.
    """
    position = int(index) if isinstance(index, str) else index
    out.write(fields[position]
              for fields in (value.split(separator) for value in annotation.read()))