Example #1
def build_model(out: ModelOutput = ModelOutput("malt/swemalt-1.7.2.mco"),
                _maltjar: Binary = Binary("[malt.jar]")):
    """Download model for MALT Parser.

    Won't download the model unless maltjar has been installed.
    """
    out.download("http://maltparser.org/mco/swedish_parser/swemalt-1.7.2.mco")
Example #2
def msdtag(out: Output = Output(
    "<token>:hunpos.msd",
    cls="token:msd",
    description="Part-of-speeches with morphological descriptions"),
           word: Annotation = Annotation("<token:word>"),
           sentence: Annotation = Annotation("<sentence>"),
           binary: Binary = Binary("[hunpos.binary]"),
           model: Model = Model("[hunpos.model]"),
           morphtable: Optional[Model] = Model("[hunpos.morphtable]"),
           patterns: Optional[Model] = Model("[hunpos.patterns]"),
           tag_mapping=None,
           encoding: str = util.UTF8):
    """POS/MSD tag using the Hunpos tagger."""
    if isinstance(tag_mapping, str) and tag_mapping:
        tag_mapping = util.tagsets.mappings[tag_mapping]
    elif tag_mapping is None or tag_mapping == "":
        tag_mapping = {}

    pattern_list = []

    if patterns:
        with open(patterns.path, encoding="utf-8") as pat:
            for line in pat:
                if line.strip() and not line.startswith("#"):
                    name, pattern, tags = line.strip().split("\t", 2)
                    pattern_list.append(
                        (name, re.compile("^%s$" % pattern), tags))

    def replace_word(w):
        """Replace word with alias if word matches a regex pattern."""
        for p in pattern_list:
            if re.match(p[1], w):
                return "[[%s]]" % p[0]
        return w

    sentences, _orphans = sentence.get_children(word)
    token_word = list(word.read())
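    # Build the Hunpos input: tokens joined with TOK_SEP inside each sentence,
    # sentences joined with SENT_SEP; words matching a pattern are replaced by their alias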
    stdin = SENT_SEP.join(
        TOK_SEP.join(
            replace_word(token_word[token_index]) for token_index in sent)
        for sent in sentences)
    args = [model.path]
    if morphtable:
        args.extend(["-m", morphtable.path])
    stdout, _ = util.system.call_binary(binary, args, stdin, encoding=encoding)

    out_annotation = word.create_empty_attribute()
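    # Walk the Hunpos output sentence by sentence and write each tag
    # (mapped through tag_mapping, if any) back onto its token index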
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_index, tagged_token in zip(
                sent,
                tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            tag = tag_mapping.get(tag, tag)
            out_annotation[token_index] = tag

    out.write(out_annotation)
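
The function above relies on module-level separator and column constants that are not part of this excerpt. A minimal sketch of plausible definitions, assuming Hunpos's usual one-token-per-line, tab-separated output (names and values here are assumptions, not copied from the module):

# Hypothetical constants for msdtag() above: Hunpos reads one token per line
# with a blank line between sentences and writes "word<TAB>tag" lines back.
SENT_SEP = "\n\n"   # separates sentences in stdin/stdout
TOK_SEP = "\n"      # separates tokens within a sentence
TAG_SEP = "\t"      # separates columns in a Hunpos output line
TAG_COLUMN = 1      # index of the predicted tag after splitting a line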
Example #3
def annotate(
        lang: Language = Language(),
        model: Model = Model("[treetagger.model]"),
        tt_binary: Binary = Binary("[treetagger.binary]"),
        out_upos: Output = Output("<token>:treetagger.upos",
                                  cls="token:upos",
                                  description="Part-of-speeches in UD"),
        out_pos: Output = Output(
            "<token>:treetagger.pos",
            cls="token:pos",
            description="Part-of-speeches from TreeTagger"),
        out_baseform: Output = Output("<token>:treetagger.baseform",
                                      description="Baseforms from TreeTagger"),
        word: Annotation = Annotation("<token:word>"),
        sentence: Annotation = Annotation("<sentence>"),
        encoding: str = util.UTF8):
    """POS/MSD tag and lemmatize using TreeTagger."""
    sentences, _orphans = sentence.get_children(word)
    word_annotation = list(word.read())
    stdin = SENT_SEP.join(
        TOK_SEP.join(word_annotation[token_index] for token_index in sent)
        for sent in sentences)
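    # -token/-lemma make TreeTagger echo each token and append its lemma,
    # -no-unknown falls back to the word form for unknown lemmas, and
    # -eos-tag makes <eos> mark sentence boundaries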
    args = ["-token", "-lemma", "-no-unknown", "-eos-tag", "<eos>", model.path]

    stdout, stderr = util.system.call_binary(tt_binary,
                                             args,
                                             stdin,
                                             encoding=encoding)
    log.debug("Message from TreeTagger:\n%s", stderr)

    # Write pos and upos annotations.
    out_upos_annotation = word.create_empty_attribute()
    out_pos_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent,
                                          tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            out_pos_annotation[token_id] = tag
            out_upos_annotation[token_id] = util.tagsets.pos_to_upos(
                tag, lang, TAG_SETS.get(lang))
    out_pos.write(out_pos_annotation)
    out_upos.write(out_upos_annotation)

    # Write lemma annotations.
    out_lemma_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent,
                                          tagged_sent.strip().split(TOK_SEP)):
            lem = tagged_token.strip().split(TAG_SEP)[LEM_COLUMN]
            out_lemma_annotation[token_id] = lem
    out_baseform.write(out_lemma_annotation)
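
With the flags above, each TreeTagger output line carries three tab-separated fields (token, tag, lemma), which is what the two loops index into. A hedged sketch of matching column constants (assumed, not copied from the module):

# Hypothetical constants for the TreeTagger wrapper: with "-token -lemma"
# every output line looks like "word<TAB>tag<TAB>lemma".
TOK_SEP = "\n"            # one token per line
SENT_SEP = "\n<eos>\n"    # "-eos-tag <eos>" marks sentence boundaries
TAG_SEP = "\t"
TAG_COLUMN = 1            # POS tag, read by the first loop
LEM_COLUMN = 2            # lemma, read by the second loop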
Example #4
def annotate(corpus_text: Text = Text(),
             lang: Language = Language(),
             conf_file: Model = Model("[freeling.conf]"),
             fl_binary: Binary = Binary("[freeling.binary]"),
             sentence_chunk: Optional[Annotation] = Annotation("[freeling.sentence_chunk]"),
             out_token: Output = Output("freeling.token", cls="token", description="Token segments"),
             out_word: Output = Output("<token>:freeling.word", cls="token:word", description="Token strings"),
             out_baseform: Output = Output("<token>:freeling.baseform", description="Baseforms from FreeLing"),
             out_upos: Output = Output("<token>:freeling.upos", cls="token:upos", description="Part-of-speeches in UD"),
             out_pos: Output = Output("<token>:freeling.pos", cls="token:pos",
                                      description="Part-of-speeches from FreeLing"),
             out_sentence: Optional[Output] = Output("freeling.sentence", cls="sentence", description="Sentence segments"),
             sentence_annotation: Optional[Annotation] = Annotation("[freeling.sentence_annotation]")):
    """Run FreeLing and output sentences, tokens, baseforms, upos and pos."""
    main(corpus_text, lang, conf_file, fl_binary, sentence_chunk, out_token, out_word, out_baseform, out_upos, out_pos,
         out_sentence, sentence_annotation)
Example #5
def annotate_full(corpus_text: Text = Text(),
                  lang: Language = Language(),
                  conf_file: Model = Model("[freeling.conf]"),
                  fl_binary: Binary = Binary("[freeling.binary]"),
                  sentence_chunk: Annotation = Annotation("[freeling.sentence_chunk]"),
                  out_token: Output = Output("freeling.token", cls="token", description="Token segments"),
                  out_word: Output = Output("<token>:freeling.word", cls="token:word", description="Token strings"),
                  out_baseform: Output = Output("<token>:freeling.baseform", description="Baseforms from FreeLing"),
                  out_upos: Output = Output("<token>:freeling.upos", cls="token:upos",
                                            description="Part-of-speeches in UD"),
                  out_pos: Output = Output("<token>:freeling.pos", cls="token:pos",
                                           description="Part-of-speeches from FreeLing"),
                  out_ne_type: Output = Output("<token>:freeling.ne_type", cls="token:named_entity_type",
                                               description="Named entitiy types from FreeLing"),
                  out_sentence: Optional[Output] = Output("freeling.sentence", cls="sentence",
                                                          description="Sentence segments"),
                  sentence_annotation: Optional[Annotation] = Annotation("[freeling.sentence_annotation]")):
    """Run FreeLing and output the usual annotations plus named entity types."""
    main(corpus_text, lang, conf_file, fl_binary, sentence_chunk, out_token, out_word, out_baseform, out_upos, out_pos,
         out_sentence, sentence_annotation, out_ne_type)
Example #6
def get_rus_model(out: ModelOutput = ModelOutput("treetagger/rus.par"),
                  tt_binary: Binary = Binary("[treetagger.binary]")):
    """Download TreeTagger language model."""
    gzip = "treetagger/russian.par.gz"
    url = "http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/russian.par.gz"
    _download(url, gzip, out)
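
The _download helper is not shown in this excerpt. A minimal sketch of what it might do, using only the standard library and assuming ModelOutput exposes a path attribute like the Model objects in the other examples (all names and behaviour here are assumptions):

import gzip
import os
import shutil
import urllib.request


def _download(url, gzip_path, out):
    """Hypothetical helper: fetch a gzipped model and unpack it to out.path."""
    # In this sketch gzip_path is only a scratch location for the archive;
    # the real helper presumably resolves it against the pipeline's model directory.
    os.makedirs(os.path.dirname(gzip_path) or ".", exist_ok=True)
    urllib.request.urlretrieve(url, gzip_path)
    with gzip.open(gzip_path, "rb") as src, open(out.path, "wb") as dst:
        shutil.copyfileobj(src, dst)
    os.remove(gzip_path)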
Example #7
def annotate(
        maltjar: Binary = Binary("[malt.jar]"),
        model: Model = Model("[malt.model]"),
        out_dephead: Output = Output(
            "<token>:malt.dephead",
            cls="token:dephead",
            description="Positions of the dependency heads"),
        out_dephead_ref: Output = Output(
            "<token>:malt.dephead_ref",
            cls="token:dephead_ref",
            description="Sentence-relative positions of the dependency heads"),
        out_deprel: Output = Output(
            "<token>:malt.deprel",
            cls="token:deprel",
            description="Dependency relations to the head"),
        word: Annotation = Annotation("<token:word>"),
        pos: Annotation = Annotation("<token:pos>"),
        msd: Annotation = Annotation("<token:msd>"),
        ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
        sentence: Annotation = Annotation("<sentence>"),
        token: Annotation = Annotation("<token>"),
        encoding: str = util.UTF8,
        process_dict=None):
    """
    Run the malt parser, either in an already started process defined in process_dict or in a new process (default).

    The process_dict argument should never be set from the command line.
    """
    if process_dict is None:
        process = maltstart(maltjar, model, encoding)
    else:
        process = process_dict["process"]
        # If the process seems dead, spawn a new one
        if process.stdin.closed or process.stdout.closed or process.poll():
            util.system.kill_process(process)
            process = maltstart(maltjar,
                                model,
                                encoding,
                                send_empty_sentence=True)
            process_dict["process"] = process

    sentences, orphans = sentence.get_children(token)
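    # Tokens outside any sentence ("orphans") are parsed as one extra sentence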
    sentences.append(orphans)

    word_annotation = list(word.read())
    pos_annotation = list(pos.read())
    msd_annotation = list(msd.read())
    ref_annotation = list(ref.read())

    def conll_token(nr, token_index):
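        """Build one tab-separated CoNLL-style token line: id, form, lemma, cpos, pos, feats."""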
        form = word_annotation[token_index]
        lemma = UNDEF
        pos = cpos = pos_annotation[token_index]
        feats = re.sub(r"[ ,.]", "|",
                       msd_annotation[token_index]).replace("+", "/")
        return TAG_SEP.join((str(nr), form, lemma, cpos, pos, feats))

    stdin = SENT_SEP.join(
        TOK_SEP.join(
            conll_token(n + 1, token_index)
            for n, token_index in enumerate(sent)) for sent in sentences)

    if encoding:
        stdin = stdin.encode(encoding)

    keep_process = len(stdin) < RESTART_THRESHOLD_LENGTH and process_dict is not None
    log.info("Stdin length: %s, keep process: %s", len(stdin), keep_process)

    if process_dict is not None:
        process_dict["restart"] = not keep_process

    if keep_process:
        # Chatting with malt: send a SENT_SEP and read the correct number of lines
        stdin_fd, stdout_fd = process.stdin, process.stdout
        stdin_fd.write(stdin + SENT_SEP.encode(util.UTF8))
        stdin_fd.flush()

        malt_sentences = []
        for sent in sentences:
            malt_sent = []
            for _ in sent:
                line = stdout_fd.readline()
                if encoding:
                    line = line.decode(encoding)
                malt_sent.append(line)
            line = stdout_fd.readline()
            assert line == b"\n"
            malt_sentences.append(malt_sent)
    else:
        # Otherwise use communicate(), which buffers properly
        stdout, _ = process.communicate(stdin)
        if encoding:
            stdout = stdout.decode(encoding)
        malt_sentences = (malt_sent.split(TOK_SEP)
                          for malt_sent in stdout.split(SENT_SEP))

    out_dephead_annotation = word.create_empty_attribute()
    out_dephead_ref_annotation = out_dephead_annotation.copy()
    out_deprel_annotation = out_dephead_annotation.copy()
    for (sent, malt_sent) in zip(sentences, malt_sentences):
        for (token_index, malt_tok) in zip(sent, malt_sent):
            cols = [(None if col == UNDEF else col)
                    for col in malt_tok.split(TAG_SEP)]
            out_deprel_annotation[token_index] = cols[DEPREL_COLUMN]
            head = int(cols[HEAD_COLUMN])
            out_dephead_annotation[token_index] = (
                str(sent[head - 1]) if head else "-")
            out_dephead_ref_annotation[token_index] = (
                str(ref_annotation[sent[head - 1]]) if head else "")

    out_dephead.write(out_dephead_annotation)
    out_dephead_ref.write(out_dephead_ref_annotation)
    out_deprel.write(out_deprel_annotation)
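
The MALT wrapper above exchanges CoNLL-style tab-separated lines with the parser; the module constants are not part of this excerpt. A hedged sketch, assuming MaltParser's standard CoNLL-X column order in which HEAD and DEPREL are the 7th and 8th columns (all values below are assumptions):

# Hypothetical constants for the MALT annotate() above.
TOK_SEP = "\n"
SENT_SEP = "\n\n"
TAG_SEP = "\t"
UNDEF = "_"                         # CoNLL placeholder for missing values
HEAD_COLUMN = 6                     # 0-based index of HEAD in MaltParser output
DEPREL_COLUMN = 7                   # 0-based index of DEPREL in MaltParser output
RESTART_THRESHOLD_LENGTH = 64000    # arbitrary input-size cut-off for reusing a process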
Example #8
def annotate(
        wsdjar: Binary = Binary("[wsd.jar]"),
        sense_model: Model = Model("[wsd.sense_model]"),
        context_model: Model = Model("[wsd.context_model]"),
        out: Output = Output(
            "<token>:wsd.sense",
            cls="token:sense",
            description="Sense disambiguated SALDO identifiers"),
        sentence: Annotation = Annotation("<sentence>"),
        word: Annotation = Annotation("<token:word>"),
        ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
        lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
        saldo: Annotation = Annotation("<token>:saldo.sense"),
        pos: Annotation = Annotation("<token:pos>"),
        token: Annotation = Annotation("<token>"),
        prob_format: str = Config("wsd.prob_format"),
        default_prob: float = Config("wsd.default_prob"),
        encoding: str = util.UTF8):
    """Run the word sense disambiguation tool (saldowsd.jar) to add probabilities to the saldo annotation.

    Unanalyzed senses (e.g. multiword expressions) receive the probability value given by default_prob.
      - wsdjar is the name of the Java program used for the WSD
      - sense_model and context_model are the models to be used with wsdjar
      - out is the resulting annotation file
      - sentence is an existing annotation for sentences and their children (words)
      - word is an existing annotation for wordforms
      - ref is an existing annotation for word references
      - lemgram and saldo are existing annotations for inflection tables and meanings
      - pos is an existing annotation for part-of-speech
      - prob_format is a format string for how to print the sense probability
      - default_prob is the default value for unanalyzed senses
    """
    word_annotation = list(word.read())
    ref_annotation = list(ref.read())
    lemgram_annotation = list(lemgram.read())
    saldo_annotation = list(saldo.read())
    pos_annotation = list(pos.read())

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    # Start WSD process
    process = wsd_start(wsdjar, sense_model.path, context_model.path, encoding)

    # Construct input and send to WSD
    stdin = build_input(sentences, word_annotation, ref_annotation,
                        lemgram_annotation, saldo_annotation, pos_annotation)

    if encoding:
        stdin = stdin.encode(encoding)

    stdout, stderr = process.communicate(stdin)
    # TODO: Solve hack line below!
    # Problem is that regular messages "Reading sense vectors.." are also piped to stderr.
    if len(stderr) > 52:
        util.system.kill_process(process)
        log.error(str(stderr))
        return

    if encoding:
        stdout = stdout.decode(encoding)

    process_output(word, out, stdout, sentences, saldo_annotation, prob_format,
                   default_prob)

    # Kill running subprocess
    util.system.kill_process(process)
    return
Example #9
def annotate(out_ne: Output = Output("swener.ne", cls="named_entity", description="Named entity segments from SweNER"),
             out_ne_ex: Output = Output("swener.ne:swener.ex", description="Named entity expressions from SweNER"),
             out_ne_type: Output = Output("swener.ne:swener.type", cls="named_entity:type",
                                          description="Named entity types from SweNER"),
             out_ne_subtype: Output = Output("swener.ne:swener.subtype", cls="named_entity:subtype",
                                             description="Named entity sub types from SweNER"),
             out_ne_name: Output = Output("swener.ne:swener.name", cls="named_entity:name",
                                          description="Names in SweNER named entities"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             token: Annotation = Annotation("<token>"),
             binary: Binary = Binary("[swener.binary]"),
             process_dict=None):
    """Tag named entities using HFST-SweNER.

    SweNER is either run in an already started process defined in
    process_dict, or a new process is started (default).
    - word, sentence, token: existing annotations
    - out_ne, out_ne_ex, out_ne_type, out_ne_subtype, out_ne_name: resulting annotations for the named entities
    - process_dict is used in the catapult and should never be set from the command line
    """
    if process_dict is None:
        process = swenerstart(binary, "", util.UTF8, verbose=False)
    # else:
    #     process = process_dict["process"]
    #     # If process seems dead, spawn a new one
    #     if process.stdin.closed or process.stdout.closed or process.poll():
    #         util.system.kill_process(process)
    #         process = swenerstart("", encoding, verbose=False)
    #         process_dict["process"] = process

    # Get sentence annotation
    sentences, _orphans = sentence.get_children(token, orphan_alert=True)

    # Collect all text
    word_annotation = list(word.read())
    stdin = SENT_SEP.join(TOK_SEP.join(word_annotation[token_index] for token_index in sent)
                          for sent in sentences)
    # Escape <, > and &
    stdin = xml.sax.saxutils.escape(stdin)

    # keep_process = len(stdin) < RESTART_THRESHOLD_LENGTH and process_dict is not None
    # log.info("Stdin length: %s, keep process: %s", len(stdin), keep_process)

    # if process_dict is not None:
    #     process_dict["restart"] = not keep_process

    # # Does not work as of now since swener does not have an interactive mode
    # if keep_process:
    #     # Chatting with swener: send a SENT_SEP and read correct number of lines
    #     stdin_fd, stdout_fd = process.stdin, process.stdout
    #     stdin_fd.write(stdin.encode(encoding) + SENT_SEP)
    #     stdin_fd.flush()
    #     stout = stdout_fd.readlines()

    # else:
    # Otherwise use communicate which buffers properly
    # log.info("STDIN %s %s", type(stdin.encode(encoding)), stdin.encode(encoding))
    stdout, _ = process.communicate(stdin.encode(util.UTF8))
    # log.info("STDOUT %s %s", type(stdout.decode(encoding)), stdout.decode(encoding))

    parse_swener_output(sentences, token, stdout.decode(util.UTF8), out_ne, out_ne_ex, out_ne_type, out_ne_subtype,
                        out_ne_name)