def build_model(out: ModelOutput = ModelOutput("malt/swemalt-1.7.2.mco"),
                _maltjar: Binary = Binary("[malt.jar]")):
    """Download model for MALT Parser.

    Won't download the model unless maltjar has been installed.
    """
    out.download("http://maltparser.org/mco/swedish_parser/swemalt-1.7.2.mco")
def msdtag(out: Output = Output("<token>:hunpos.msd", cls="token:msd",
                                description="Part-of-speeches with morphological descriptions"),
           word: Annotation = Annotation("<token:word>"),
           sentence: Annotation = Annotation("<sentence>"),
           binary: Binary = Binary("[hunpos.binary]"),
           model: Model = Model("[hunpos.model]"),
           morphtable: Optional[Model] = Model("[hunpos.morphtable]"),
           patterns: Optional[Model] = Model("[hunpos.patterns]"),
           tag_mapping=None,
           encoding: str = util.UTF8):
    """POS/MSD tag using the Hunpos tagger."""
    if isinstance(tag_mapping, str) and tag_mapping:
        tag_mapping = util.tagsets.mappings[tag_mapping]
    elif tag_mapping is None or tag_mapping == "":
        tag_mapping = {}

    pattern_list = []
    if patterns:
        with open(patterns.path, encoding="utf-8") as pat:
            for line in pat:
                if line.strip() and not line.startswith("#"):
                    name, pattern, tags = line.strip().split("\t", 2)
                    pattern_list.append((name, re.compile("^%s$" % pattern), tags))

    def replace_word(w):
        """Replace word with alias if word matches a regex pattern."""
        for p in pattern_list:
            if re.match(p[1], w):
                return "[[%s]]" % p[0]
        return w

    sentences, _orphans = sentence.get_children(word)
    token_word = list(word.read())
    stdin = SENT_SEP.join(
        TOK_SEP.join(replace_word(token_word[token_index]) for token_index in sent)
        for sent in sentences)

    args = [model.path]
    if morphtable:
        args.extend(["-m", morphtable.path])

    stdout, _ = util.system.call_binary(binary, args, stdin, encoding=encoding)

    out_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_index, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            tag = tag_mapping.get(tag, tag)
            out_annotation[token_index] = tag

    out.write(out_annotation)
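# Illustrative sketch (not part of the pipeline): the Hunpos patterns file read by
# msdtag() above is expected to hold tab-separated lines of "name<TAB>regex<TAB>tags"
# (lines starting with "#" are skipped). A token matching a regex is sent to Hunpos
# as the alias "[[name]]" instead of its raw word form. The pattern name, regex and
# tags below are hypothetical examples.
def _demo_replace_word():
    import re
    pattern_list = []
    for line in ["ordinal\t[0-9]+:a\tRO.NOM", "# a comment line"]:
        if line.strip() and not line.startswith("#"):
            name, pattern, tags = line.strip().split("\t", 2)
            pattern_list.append((name, re.compile("^%s$" % pattern), tags))
    for w in ("3:a", "katt"):
        alias = next(("[[%s]]" % n for n, rx, _ in pattern_list if rx.match(w)), w)
        print(w, "->", alias)  # prints "3:a -> [[ordinal]]" and "katt -> katt"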
def annotate(lang: Language = Language(),
             model: Model = Model("[treetagger.model]"),
             tt_binary: Binary = Binary("[treetagger.binary]"),
             out_upos: Output = Output("<token>:treetagger.upos", cls="token:upos",
                                       description="Part-of-speeches in UD"),
             out_pos: Output = Output("<token>:treetagger.pos", cls="token:pos",
                                      description="Part-of-speeches from TreeTagger"),
             out_baseform: Output = Output("<token>:treetagger.baseform",
                                           description="Baseforms from TreeTagger"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             encoding: str = util.UTF8):
    """POS/MSD tag and lemmatize using TreeTagger."""
    sentences, _orphans = sentence.get_children(word)
    word_annotation = list(word.read())
    stdin = SENT_SEP.join(TOK_SEP.join(word_annotation[token_index] for token_index in sent)
                          for sent in sentences)
    args = ["-token", "-lemma", "-no-unknown", "-eos-tag", "<eos>", model.path]

    stdout, stderr = util.system.call_binary(tt_binary, args, stdin, encoding=encoding)
    log.debug("Message from TreeTagger:\n%s", stderr)

    # Write pos and upos annotations.
    out_upos_annotation = word.create_empty_attribute()
    out_pos_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            out_pos_annotation[token_id] = tag
            out_upos_annotation[token_id] = util.tagsets.pos_to_upos(tag, lang, TAG_SETS.get(lang))
    out_pos.write(out_pos_annotation)
    out_upos.write(out_upos_annotation)

    # Write lemma annotations.
    out_lemma_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            lem = tagged_token.strip().split(TAG_SEP)[LEM_COLUMN]
            out_lemma_annotation[token_id] = lem
    out_baseform.write(out_lemma_annotation)
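# Illustrative sketch (assumptions, not part of the module): with "-token -lemma",
# TreeTagger emits one "word<TAB>tag<TAB>lemma" line per token, so the module
# constants are assumed to be TAG_SEP = "\t", TAG_COLUMN = 1 and LEM_COLUMN = 2.
# The output line below is a made-up example.
def _demo_parse_treetagger_line():
    TAG_SEP, TAG_COLUMN, LEM_COLUMN = "\t", 1, 2  # assumed values of the module constants
    tagged_token = "katterna\tNN.UTR.PLU.DEF.NOM\tkatt"  # hypothetical TreeTagger output line
    cols = tagged_token.strip().split(TAG_SEP)
    print("tag:", cols[TAG_COLUMN], "lemma:", cols[LEM_COLUMN])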
def annotate(corpus_text: Text = Text(),
             lang: Language = Language(),
             conf_file: Model = Model("[freeling.conf]"),
             fl_binary: Binary = Binary("[freeling.binary]"),
             sentence_chunk: Optional[Annotation] = Annotation("[freeling.sentence_chunk]"),
             out_token: Output = Output("freeling.token", cls="token", description="Token segments"),
             out_word: Output = Output("<token>:freeling.word", cls="token:word", description="Token strings"),
             out_baseform: Output = Output("<token>:freeling.baseform", description="Baseforms from FreeLing"),
             out_upos: Output = Output("<token>:freeling.upos", cls="token:upos",
                                       description="Part-of-speeches in UD"),
             out_pos: Output = Output("<token>:freeling.pos", cls="token:pos",
                                      description="Part-of-speeches from FreeLing"),
             out_sentence: Optional[Output] = Output("freeling.sentence", cls="sentence",
                                                     description="Sentence segments"),
             sentence_annotation: Optional[Annotation] = Annotation("[freeling.sentence_annotation]")):
    """Run FreeLing and output sentences, tokens, baseforms, upos and pos."""
    main(corpus_text, lang, conf_file, fl_binary, sentence_chunk, out_token, out_word, out_baseform,
         out_upos, out_pos, out_sentence, sentence_annotation)
def annotate_full(corpus_text: Text = Text(),
                  lang: Language = Language(),
                  conf_file: Model = Model("[freeling.conf]"),
                  fl_binary: Binary = Binary("[freeling.binary]"),
                  sentence_chunk: Annotation = Annotation("[freeling.sentence_chunk]"),
                  out_token: Output = Output("freeling.token", cls="token", description="Token segments"),
                  out_word: Output = Output("<token>:freeling.word", cls="token:word", description="Token strings"),
                  out_baseform: Output = Output("<token>:freeling.baseform", description="Baseforms from FreeLing"),
                  out_upos: Output = Output("<token>:freeling.upos", cls="token:upos",
                                            description="Part-of-speeches in UD"),
                  out_pos: Output = Output("<token>:freeling.pos", cls="token:pos",
                                           description="Part-of-speeches from FreeLing"),
                  out_ne_type: Output = Output("<token>:freeling.ne_type", cls="token:named_entity_type",
                                               description="Named entity types from FreeLing"),
                  out_sentence: Optional[Output] = Output("freeling.sentence", cls="sentence",
                                                          description="Sentence segments"),
                  sentence_annotation: Optional[Annotation] = Annotation("[freeling.sentence_annotation]")):
    """Run FreeLing and output the usual annotations plus named entity types."""
    main(corpus_text, lang, conf_file, fl_binary, sentence_chunk, out_token, out_word, out_baseform,
         out_upos, out_pos, out_sentence, sentence_annotation, out_ne_type)
def get_rus_model(out: ModelOutput = ModelOutput("treetagger/rus.par"),
                  tt_binary: Binary = Binary("[treetagger.binary]")):
    """Download TreeTagger language model."""
    gzip = "treetagger/russian.par.gz"
    url = "http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/russian.par.gz"
    _download(url, gzip, out)
def annotate(maltjar: Binary = Binary("[malt.jar]"),
             model: Model = Model("[malt.model]"),
             out_dephead: Output = Output("<token>:malt.dephead", cls="token:dephead",
                                          description="Positions of the dependency heads"),
             out_dephead_ref: Output = Output("<token>:malt.dephead_ref", cls="token:dephead_ref",
                                              description="Sentence-relative positions of the dependency heads"),
             out_deprel: Output = Output("<token>:malt.deprel", cls="token:deprel",
                                         description="Dependency relations to the head"),
             word: Annotation = Annotation("<token:word>"),
             pos: Annotation = Annotation("<token:pos>"),
             msd: Annotation = Annotation("<token:msd>"),
             ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
             sentence: Annotation = Annotation("<sentence>"),
             token: Annotation = Annotation("<token>"),
             encoding: str = util.UTF8,
             process_dict=None):
    """Run the MALT parser, either in an already started process defined in process_dict,
    or in a newly started process (default).

    The process_dict argument should never be set from the command line.
    """
    if process_dict is None:
        process = maltstart(maltjar, model, encoding)
    else:
        process = process_dict["process"]
        # If the process seems dead, spawn a new one
        if process.stdin.closed or process.stdout.closed or process.poll():
            util.system.kill_process(process)
            process = maltstart(maltjar, model, encoding, send_empty_sentence=True)
            process_dict["process"] = process

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    word_annotation = list(word.read())
    pos_annotation = list(pos.read())
    msd_annotation = list(msd.read())
    ref_annotation = list(ref.read())

    def conll_token(nr, token_index):
        form = word_annotation[token_index]
        lemma = UNDEF
        pos = cpos = pos_annotation[token_index]
        feats = re.sub(r"[ ,.]", "|", msd_annotation[token_index]).replace("+", "/")
        return TAG_SEP.join((str(nr), form, lemma, cpos, pos, feats))

    stdin = SENT_SEP.join(TOK_SEP.join(conll_token(n + 1, token_index)
                                       for n, token_index in enumerate(sent))
                          for sent in sentences)

    if encoding:
        stdin = stdin.encode(encoding)

    keep_process = len(stdin) < RESTART_THRESHOLD_LENGTH and process_dict is not None
    log.info("Stdin length: %s, keep process: %s", len(stdin), keep_process)

    if process_dict is not None:
        process_dict["restart"] = not keep_process

    if keep_process:
        # Chatting with malt: send a SENT_SEP and read the correct number of lines
        stdin_fd, stdout_fd = process.stdin, process.stdout
        stdin_fd.write(stdin + SENT_SEP.encode(util.UTF8))
        stdin_fd.flush()

        malt_sentences = []
        for sent in sentences:
            malt_sent = []
            for _ in sent:
                line = stdout_fd.readline()
                if encoding:
                    line = line.decode(encoding)
                malt_sent.append(line)
            line = stdout_fd.readline()
            assert line == b"\n"
            malt_sentences.append(malt_sent)
    else:
        # Otherwise use communicate, which buffers properly
        stdout, _ = process.communicate(stdin)
        if encoding:
            stdout = stdout.decode(encoding)
        malt_sentences = (malt_sent.split(TOK_SEP) for malt_sent in stdout.split(SENT_SEP))

    out_dephead_annotation = word.create_empty_attribute()
    out_dephead_ref_annotation = out_dephead_annotation.copy()
    out_deprel_annotation = out_dephead_annotation.copy()
    for (sent, malt_sent) in zip(sentences, malt_sentences):
        for (token_index, malt_tok) in zip(sent, malt_sent):
            cols = [(None if col == UNDEF else col) for col in malt_tok.split(TAG_SEP)]
            out_deprel_annotation[token_index] = cols[DEPREL_COLUMN]
            head = int(cols[HEAD_COLUMN])
            out_dephead_annotation[token_index] = str(sent[head - 1]) if head else "-"
            out_dephead_ref_annotation[token_index] = str(ref_annotation[sent[head - 1]]) if head else ""

    out_dephead.write(out_dephead_annotation)
    out_dephead_ref.write(out_dephead_ref_annotation)
    out_deprel.write(out_deprel_annotation)
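# Illustrative sketch (assumptions, not part of the module): conll_token() above builds one
# tab-separated CoNLL-style line per token (id, form, lemma, cpos, pos, feats), and MALT
# answers with full CoNLL-X lines from which HEAD_COLUMN and DEPREL_COLUMN are read back.
# The separator, UNDEF marker and column indices below are assumed values of the module
# constants, and the token data is made up.
def _demo_malt_io():
    TAG_SEP, UNDEF = "\t", "_"  # assumed module constants
    nr, form, pos, feats = 1, "katten", "NN", "UTR|SIN|DEF|NOM"
    conll_in = TAG_SEP.join((str(nr), form, UNDEF, pos, pos, feats))
    print(conll_in)  # "1\tkatten\t_\tNN\tNN\tUTR|SIN|DEF|NOM"
    # Hypothetical parser output line for the same token:
    conll_out = "1\tkatten\t_\tNN\tNN\tUTR|SIN|DEF|NOM\t2\tSS\t_\t_"
    HEAD_COLUMN, DEPREL_COLUMN = 6, 7  # assumed column indices
    cols = conll_out.split(TAG_SEP)
    print("head:", cols[HEAD_COLUMN], "deprel:", cols[DEPREL_COLUMN])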
def annotate(wsdjar: Binary = Binary("[wsd.jar]"),
             sense_model: Model = Model("[wsd.sense_model]"),
             context_model: Model = Model("[wsd.context_model]"),
             out: Output = Output("<token>:wsd.sense", cls="token:sense",
                                  description="Sense disambiguated SALDO identifiers"),
             sentence: Annotation = Annotation("<sentence>"),
             word: Annotation = Annotation("<token:word>"),
             ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
             lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
             saldo: Annotation = Annotation("<token>:saldo.sense"),
             pos: Annotation = Annotation("<token:pos>"),
             token: Annotation = Annotation("<token>"),
             prob_format: str = Config("wsd.prob_format"),
             default_prob: float = Config("wsd.default_prob"),
             encoding: str = util.UTF8):
    """Run the word sense disambiguation tool (saldowsd.jar) to add probabilities to the saldo annotation.

    Unanalyzed senses (e.g. multiword expressions) receive the probability value given by default_prob.

    - wsdjar is the name of the Java program to be used for the WSD
    - sense_model and context_model are the models to be used with wsdjar
    - out is the resulting annotation file
    - sentence is an existing annotation for sentences and their children (words)
    - word is an existing annotation for wordforms
    - ref is an existing annotation for word references
    - lemgram and saldo are existing annotations for inflection tables and meanings
    - pos is an existing annotation for part-of-speech
    - prob_format is a format string for how to print the sense probability
    - default_prob is the default value for unanalyzed senses
    """
    word_annotation = list(word.read())
    ref_annotation = list(ref.read())
    lemgram_annotation = list(lemgram.read())
    saldo_annotation = list(saldo.read())
    pos_annotation = list(pos.read())

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    # Start WSD process
    process = wsd_start(wsdjar, sense_model.path, context_model.path, encoding)

    # Construct input and send to WSD
    stdin = build_input(sentences, word_annotation, ref_annotation, lemgram_annotation,
                        saldo_annotation, pos_annotation)

    if encoding:
        stdin = stdin.encode(encoding)

    stdout, stderr = process.communicate(stdin)
    # TODO: Solve hack line below!
    # Problem is that regular messages "Reading sense vectors.." are also piped to stderr.
    if len(stderr) > 52:
        util.system.kill_process(process)
        log.error(str(stderr))
        return

    if encoding:
        stdout = stdout.decode(encoding)

    process_output(word, out, stdout, sentences, saldo_annotation, prob_format, default_prob)

    # Kill running subprocess
    util.system.kill_process(process)
    return
def annotate(out_ne: Output = Output("swener.ne", cls="named_entity",
                                     description="Named entity segments from SweNER"),
             out_ne_ex: Output = Output("swener.ne:swener.ex",
                                        description="Named entity expressions from SweNER"),
             out_ne_type: Output = Output("swener.ne:swener.type", cls="named_entity:type",
                                          description="Named entity types from SweNER"),
             out_ne_subtype: Output = Output("swener.ne:swener.subtype", cls="named_entity:subtype",
                                             description="Named entity subtypes from SweNER"),
             out_ne_name: Output = Output("swener.ne:swener.name", cls="named_entity:name",
                                          description="Names in SweNER named entities"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             token: Annotation = Annotation("<token>"),
             binary: Binary = Binary("[swener.binary]"),
             process_dict=None):
    """Tag named entities using HFST-SweNER.

    SweNER is either run in an already started process defined in process_dict,
    or a new process is started (default).

    - word, sentence, token: existing annotations
    - out_ne, out_ne_ex, out_ne_type, out_ne_subtype, out_ne_name: resulting annotations for the named entities
    - process_dict is used in the catapult and should never be set from the command line
    """
    if process_dict is None:
        process = swenerstart(binary, "", util.UTF8, verbose=False)
    # else:
    #     process = process_dict["process"]
    #     # If the process seems dead, spawn a new one
    #     if process.stdin.closed or process.stdout.closed or process.poll():
    #         util.system.kill_process(process)
    #         process = swenerstart("", encoding, verbose=False)
    #         process_dict["process"] = process

    # Get sentence annotation
    sentences, _orphans = sentence.get_children(token, orphan_alert=True)

    # Collect all text
    word_annotation = list(word.read())
    stdin = SENT_SEP.join(TOK_SEP.join(word_annotation[token_index] for token_index in sent)
                          for sent in sentences)

    # Escape <, > and &
    stdin = xml.sax.saxutils.escape(stdin)

    # keep_process = len(stdin) < RESTART_THRESHOLD_LENGTH and process_dict is not None
    # log.info("Stdin length: %s, keep process: %s", len(stdin), keep_process)

    # if process_dict is not None:
    #     process_dict["restart"] = not keep_process

    # # Does not work as of now since swener does not have an interactive mode
    # if keep_process:
    #     # Chatting with swener: send a SENT_SEP and read the correct number of lines
    #     stdin_fd, stdout_fd = process.stdin, process.stdout
    #     stdin_fd.write(stdin.encode(encoding) + SENT_SEP)
    #     stdin_fd.flush()
    #     stout = stdout_fd.readlines()

    # else:
    # Otherwise use communicate, which buffers properly
    # log.info("STDIN %s %s", type(stdin.encode(encoding)), stdin.encode(encoding))
    stdout, _ = process.communicate(stdin.encode(util.UTF8))
    # log.info("STDOUT %s %s", type(stdout.decode(encoding)), stdout.decode(encoding))

    parse_swener_output(sentences, token, stdout.decode(util.UTF8), out_ne, out_ne_ex, out_ne_type,
                        out_ne_subtype, out_ne_name)
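# Illustrative sketch (not part of the module): SweNER marks up entities inline, so the raw
# token text is XML-escaped before being piped to the process, exactly as annotate() above
# does with xml.sax.saxutils.escape. The input string below is a made-up example.
def _demo_escape():
    import xml.sax.saxutils
    print(xml.sax.saxutils.escape("AT&T <förkortning>"))  # "AT&amp;T &lt;förkortning&gt;"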