def lix(text: Annotation = Annotation("<text>"),
        sentence: Annotation = Annotation("<sentence>"),
        word: Annotation = Annotation("<token:word>"),
        pos: Annotation = Annotation("<token:pos>"),
        out: Output = Output("<text>:readability.lix", description="LIX values for text chunks"),
        skip_pos: List[str] = ["MAD", "MID", "PAD"],
        fmt: str = "%.2f"):
    """Create LIX annotation for text."""
    # Read annotation files and get parent_children relations
    text_children, _orphans = text.get_children(sentence)
    word_pos = list(word.read_attributes((word, pos)))
    sentence_children, _orphans = sentence.get_children(word)
    sentence_children = list(sentence_children)

    # Calculate LIX for every text element
    lix_annotation = []
    for text in text_children:
        in_sentences = []
        for sentence_index in text:
            s = sentence_children[sentence_index]
            in_sentences.append(list(actual_words([word_pos[token_index] for token_index in s], skip_pos)))
        lix_annotation.append(fmt % lix_calc(in_sentences))

    out.write(lix_annotation)
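
# The `lix_calc` helper used above is defined elsewhere in this module. As a hedged
# illustration only (assuming the standard LIX definition: average sentence length plus
# the percentage of words longer than six characters), a minimal sketch could look like:
def _lix_calc_sketch(sentences):
    """Compute LIX for a list of sentences, each given as a list of word strings (sketch)."""
    words = [w for sent in sentences for w in sent]
    if not words or not sentences:
        return 0.0  # placeholder for whatever edge-case handling the real helper does
    long_words = [w for w in words if len(w) > 6]
    return len(words) / len(sentences) + 100 * len(long_words) / len(words)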
def contextual(out: Output = Output("{chunk}:geo.geo_context", description="Geographical places with coordinates"),
               chunk: Annotation = Annotation("{chunk}"),
               context: Annotation = Annotation("[geo.context_chunk]"),
               ne_type: Annotation = Annotation("swener.ne:swener.type"),
               ne_subtype: Annotation = Annotation("swener.ne:swener.subtype"),
               ne_name: Annotation = Annotation("swener.ne:swener.name"),
               model: Model = Model("[geo.model]"),
               method: str = "populous",
               language: list = []):
    """Annotate chunks with location data, based on locations contained within the text.

    context = text chunk to use for disambiguating places (when applicable).
    chunk = text chunk to which the annotation will be added.
    """
    model = load_model(model, language=language)

    ne_type_annotation = list(ne_type.read())
    ne_subtype_annotation = list(ne_subtype.read())
    ne_name_annotation = list(ne_name.read())

    children_context_chunk, _orphans = context.get_children(chunk)
    children_chunk_ne, _orphans = chunk.get_children(ne_type)

    out_annotation = chunk.create_empty_attribute()

    for chunks in children_context_chunk:
        all_locations = []  # TODO: Maybe not needed for anything?
        context_locations = []
        chunk_locations = defaultdict(list)

        for ch in chunks:
            for n in children_chunk_ne[ch]:
                if ne_type_annotation[n] == "LOC" and "PPL" in ne_subtype_annotation[n]:
                    # Normalize whitespace in the entity name
                    location_text = ne_name_annotation[n].replace("\n", " ").replace("  ", " ")
                    location_data = model.get(location_text.lower())
                    if location_data:
                        all_locations.append((location_text, list(location_data)))
                        context_locations.append((location_text, list(location_data)))
                        chunk_locations[ch].append((location_text, list(location_data)))
                    else:
                        pass
                        # log.info("No location found for %s" % ne_name_annotation[n].replace("%", "%%"))

        chunk_locations = most_populous(chunk_locations)

        for c in chunks:
            out_annotation[c] = _format_location(chunk_locations.get(c, ()))

    out.write(out_annotation)
def msdtag(out: Output = Output("<token>:hunpos.msd", cls="token:msd",
                                description="Part-of-speeches with morphological descriptions"),
           word: Annotation = Annotation("<token:word>"),
           sentence: Annotation = Annotation("<sentence>"),
           binary: Binary = Binary("[hunpos.binary]"),
           model: Model = Model("[hunpos.model]"),
           morphtable: Optional[Model] = Model("[hunpos.morphtable]"),
           patterns: Optional[Model] = Model("[hunpos.patterns]"),
           tag_mapping=None,
           encoding: str = util.UTF8):
    """POS/MSD tag using the Hunpos tagger."""
    if isinstance(tag_mapping, str) and tag_mapping:
        tag_mapping = util.tagsets.mappings[tag_mapping]
    elif tag_mapping is None or tag_mapping == "":
        tag_mapping = {}

    pattern_list = []

    if patterns:
        with open(patterns.path, encoding="utf-8") as pat:
            for line in pat:
                if line.strip() and not line.startswith("#"):
                    name, pattern, tags = line.strip().split("\t", 2)
                    pattern_list.append((name, re.compile("^%s$" % pattern), tags))

    def replace_word(w):
        """Replace word with alias if word matches a regex pattern."""
        for p in pattern_list:
            if re.match(p[1], w):
                return "[[%s]]" % p[0]
        return w

    sentences, _orphans = sentence.get_children(word)
    token_word = list(word.read())
    stdin = SENT_SEP.join(TOK_SEP.join(replace_word(token_word[token_index]) for token_index in sent)
                          for sent in sentences)

    args = [model.path]
    if morphtable:
        args.extend(["-m", morphtable.path])

    stdout, _ = util.system.call_binary(binary, args, stdin, encoding=encoding)

    out_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_index, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            tag = tag_mapping.get(tag, tag)
            out_annotation[token_index] = tag

    out.write(out_annotation)
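
# Illustration of the aliasing step above, with hypothetical pattern names and regexes
# (not taken from any shipped patterns file). Each patterns-file line is expected to hold
# a name, a regular expression and its tags, separated by tabs, with "#" starting comments.
def _replace_word_demo():
    pattern_list = [
        ("year", re.compile(r"^[12]\d{3}$"), "RG.NOM"),
        ("url", re.compile(r"^https?://\S+$"), "NN.NEU.SIN.IND.NOM"),
    ]

    def replace_word(w):
        for name, regex, _tags in pattern_list:
            if re.match(regex, w):
                return "[[%s]]" % name
        return w

    assert replace_word("1987") == "[[year]]"
    assert replace_word("katt") == "katt"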
def annotate(lang: Language = Language(),
             model: Model = Model("[treetagger.model]"),
             tt_binary: Binary = Binary("[treetagger.binary]"),
             out_upos: Output = Output("<token>:treetagger.upos", cls="token:upos",
                                       description="Part-of-speeches in UD"),
             out_pos: Output = Output("<token>:treetagger.pos", cls="token:pos",
                                      description="Part-of-speeches from TreeTagger"),
             out_baseform: Output = Output("<token>:treetagger.baseform", description="Baseforms from TreeTagger"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             encoding: str = util.UTF8):
    """POS/MSD tag and lemmatize using TreeTagger."""
    sentences, _orphans = sentence.get_children(word)
    word_annotation = list(word.read())
    stdin = SENT_SEP.join(TOK_SEP.join(word_annotation[token_index] for token_index in sent)
                          for sent in sentences)
    args = ["-token", "-lemma", "-no-unknown", "-eos-tag", "<eos>", model.path]

    stdout, stderr = util.system.call_binary(tt_binary, args, stdin, encoding=encoding)
    log.debug("Message from TreeTagger:\n%s", stderr)

    # Write pos and upos annotations.
    out_upos_annotation = word.create_empty_attribute()
    out_pos_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            out_pos_annotation[token_id] = tag
            out_upos_annotation[token_id] = util.tagsets.pos_to_upos(tag, lang, TAG_SETS.get(lang))
    out_pos.write(out_pos_annotation)
    out_upos.write(out_upos_annotation)

    # Write lemma annotations.
    out_lemma_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            lem = tagged_token.strip().split(TAG_SEP)[LEM_COLUMN]
            out_lemma_annotation[token_id] = lem
    out_baseform.write(out_lemma_annotation)
def number_relative(out: Output = Output("{annotation}:misc.number_rel_{parent}"),
                    parent: Annotation = Annotation("{parent}"),
                    child: Annotation = Annotation("{annotation}"),
                    prefix: str = "",
                    zfill: bool = False,
                    start: int = START_DEFAULT):
    """Number chunks by their relative position within a parent."""
    parent_children, _orphans = parent.get_children(child)

    out.write(("{prefix}{nr:0{length}d}".format(prefix=prefix,
                                                length=len(str(len(parent) - 1 + start)) if zfill else 0,
                                                nr=cnr)
               for parent in parent_children
               for cnr, _index in enumerate(parent, start)))
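
# Example of the zero-padding logic above: with zfill=True and start=1, a parent with
# twelve children gets the numbers 01 ... 12, since the pad width is taken from the
# largest number that will be produced (hypothetical values for illustration).
def _number_relative_demo():
    children = list(range(12))
    start = 1
    width = len(str(len(children) - 1 + start))  # == 2
    numbers = ["{nr:0{length}d}".format(nr=cnr, length=width) for cnr, _ in enumerate(children, start)]
    assert numbers[0] == "01" and numbers[-1] == "12"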
def number_by_parent(out: Output = Output("{annotation}:misc.number_by_parent_{parent_annotation}__{parent_attribute}"),
                     chunk: Annotation = Annotation("{annotation}"),
                     parent_order: Annotation = Annotation("{parent_annotation}:{parent_attribute}"),
                     prefix: str = "",
                     zfill: bool = False,
                     start: int = START_DEFAULT):
    """Number chunks by (parent_order, chunk order)."""
    parent_children, _orphans = parent_order.get_children(chunk)

    child_order = {child_index: (parent_nr, child_index)
                   for parent_index, parent_nr in enumerate(parent_order.read())
                   for child_index in parent_children[parent_index]}

    def _order(index, _value):
        return child_order.get(index)

    _read_chunks_and_write_new_ordering(out, chunk, _order, prefix, zfill, start)
def nominal_ratio(text: Annotation = Annotation("<text>"),
                  pos: Annotation = Annotation("<token:pos>"),
                  out: Output = Output("<text>:readability.nk", description="Nominal ratios for text chunks"),
                  noun_pos: List[str] = ["NN", "PP", "PC"],
                  verb_pos: List[str] = ["PN", "AB", "VB"],
                  fmt: str = "%.2f"):
    """Create nominal ratio annotation for text."""
    text_children, _orphans = text.get_children(pos)
    pos_annotation = list(pos.read())

    # Calculate nominal ratio for every text element
    nk_annotation = []
    for text in text_children:
        in_pos = [pos_annotation[token_index] for token_index in text]
        nk_annotation.append(fmt % nominal_ratio_calc(in_pos, noun_pos, verb_pos))
    out.write(nk_annotation)
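
# `nominal_ratio_calc` is defined elsewhere in this module. Under the usual definition of
# the nominal ratio (nominalkvot) it divides the count of noun-like tags (NN, PP, PC by
# default) by the count of verb-like tags (PN, AB, VB). A hedged sketch, with an assumed
# zero-verb fallback:
def _nominal_ratio_calc_sketch(pos_tags, noun_pos, verb_pos):
    nouns = sum(1 for p in pos_tags if p in noun_pos)
    verbs = sum(1 for p in pos_tags if p in verb_pos)
    return nouns / verbs if verbs else 0.0  # the real helper may treat this case differently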
def ovix(text: Annotation = Annotation("<text>"),
         word: Annotation = Annotation("<token:word>"),
         pos: Annotation = Annotation("<token:pos>"),
         out: Output = Output("<text>:readability.ovix", description="OVIX values for text chunks"),
         skip_pos: List[str] = ["MAD", "MID", "PAD"],
         fmt: str = "%.2f"):
    """Create OVIX annotation for text."""
    text_children, _orphans = text.get_children(word)
    word_pos = list(word.read_attributes((word, pos)))

    # Calculate OVIX for every text element
    ovix_annotation = []
    for text in text_children:
        in_words = list(actual_words([word_pos[token_index] for token_index in text], skip_pos))
        ovix_annotation.append(fmt % ovix_calc(in_words))
    out.write(ovix_annotation)
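
# `ovix_calc` is defined elsewhere in this module. Assuming the standard word variation
# index formula, OVIX = log(tokens) / log(2 - log(types) / log(tokens)), a hedged sketch:
def _ovix_calc_sketch(words):
    import math
    tokens = len(words)
    types = len(set(words))
    if tokens < 2 or types == tokens:
        return 0.0  # placeholder; the real helper may handle degenerate cases differently
    return math.log(tokens) / math.log(2 - math.log(types) / math.log(tokens))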
def annotate(maltjar: Binary = Binary("[malt.jar]"),
             model: Model = Model("[malt.model]"),
             out_dephead: Output = Output("<token>:malt.dephead", cls="token:dephead",
                                          description="Positions of the dependency heads"),
             out_dephead_ref: Output = Output("<token>:malt.dephead_ref", cls="token:dephead_ref",
                                              description="Sentence-relative positions of the dependency heads"),
             out_deprel: Output = Output("<token>:malt.deprel", cls="token:deprel",
                                         description="Dependency relations to the head"),
             word: Annotation = Annotation("<token:word>"),
             pos: Annotation = Annotation("<token:pos>"),
             msd: Annotation = Annotation("<token:msd>"),
             ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
             sentence: Annotation = Annotation("<sentence>"),
             token: Annotation = Annotation("<token>"),
             encoding: str = util.UTF8,
             process_dict=None):
    """Run the Malt parser, either in an already started process defined in process_dict,
    or in a newly started process (default).

    The process_dict argument should never be set from the command line.
    """
    if process_dict is None:
        process = maltstart(maltjar, model, encoding)
    else:
        process = process_dict["process"]
        # If the process seems dead, spawn a new one
        if process.stdin.closed or process.stdout.closed or process.poll():
            util.system.kill_process(process)
            process = maltstart(maltjar, model, encoding, send_empty_sentence=True)
            process_dict["process"] = process

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    word_annotation = list(word.read())
    pos_annotation = list(pos.read())
    msd_annotation = list(msd.read())
    ref_annotation = list(ref.read())

    def conll_token(nr, token_index):
        form = word_annotation[token_index]
        lemma = UNDEF
        pos = cpos = pos_annotation[token_index]
        feats = re.sub(r"[ ,.]", "|", msd_annotation[token_index]).replace("+", "/")
        return TAG_SEP.join((str(nr), form, lemma, cpos, pos, feats))

    stdin = SENT_SEP.join(TOK_SEP.join(conll_token(n + 1, token_index) for n, token_index in enumerate(sent))
                          for sent in sentences)

    if encoding:
        stdin = stdin.encode(encoding)

    keep_process = len(stdin) < RESTART_THRESHOLD_LENGTH and process_dict is not None
    log.info("Stdin length: %s, keep process: %s", len(stdin), keep_process)

    if process_dict is not None:
        process_dict["restart"] = not keep_process

    if keep_process:
        # Chatting with malt: send a SENT_SEP and read the correct number of lines
        stdin_fd, stdout_fd = process.stdin, process.stdout
        stdin_fd.write(stdin + SENT_SEP.encode(util.UTF8))
        stdin_fd.flush()

        malt_sentences = []
        for sent in sentences:
            malt_sent = []
            for _ in sent:
                line = stdout_fd.readline()
                if encoding:
                    line = line.decode(encoding)
                malt_sent.append(line)
            line = stdout_fd.readline()
            assert line == b"\n"
            malt_sentences.append(malt_sent)
    else:
        # Otherwise use communicate, which buffers properly
        stdout, _ = process.communicate(stdin)
        if encoding:
            stdout = stdout.decode(encoding)
        malt_sentences = (malt_sent.split(TOK_SEP) for malt_sent in stdout.split(SENT_SEP))

    out_dephead_annotation = word.create_empty_attribute()
    out_dephead_ref_annotation = out_dephead_annotation.copy()
    out_deprel_annotation = out_dephead_annotation.copy()
    for (sent, malt_sent) in zip(sentences, malt_sentences):
        for (token_index, malt_tok) in zip(sent, malt_sent):
            cols = [(None if col == UNDEF else col) for col in malt_tok.split(TAG_SEP)]
            out_deprel_annotation[token_index] = cols[DEPREL_COLUMN]
            head = int(cols[HEAD_COLUMN])
            out_dephead_annotation[token_index] = str(sent[head - 1]) if head else "-"
            out_dephead_ref_annotation[token_index] = str(ref_annotation[sent[head - 1]]) if head else ""

    out_dephead.write(out_dephead_annotation)
    out_dephead_ref.write(out_dephead_ref_annotation)
    out_deprel.write(out_deprel_annotation)
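
# Illustration of the CoNLL-style token lines built by `conll_token` above. The column
# separator and the UNDEF placeholder are assumed here to be the conventional tab and "_";
# the actual values come from this module's TAG_SEP and UNDEF constants.
def _conll_token_demo():
    undef, tag_sep = "_", "\t"
    form, pos_tag, msd_tag = "hunden", "NN", "NN.UTR.SIN.DEF.NOM"
    feats = re.sub(r"[ ,.]", "|", msd_tag).replace("+", "/")
    line = tag_sep.join(("1", form, undef, pos_tag, pos_tag, feats))
    assert line == "1\thunden\t_\tNN\tNN\tNN|UTR|SIN|DEF|NOM"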
def annotate(token: Annotation = Annotation("<token>"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             reference: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
             out_sense: Output = Output("<token>:saldo.sense", cls="token:sense", description="SALDO identifier"),
             out_lemgram: Output = Output("<token>:saldo.lemgram", description="SALDO lemgram"),
             out_baseform: Output = Output("<token>:saldo.baseform", cls="token:baseform",
                                           description="Baseform from SALDO"),
             models: List[Model] = [Model("[saldo.model]")],
             msd: Optional[Annotation] = Annotation("<token:msd>"),
             delimiter: str = util.DELIM,
             affix: str = util.AFFIX,
             precision: str = Config("saldo.precision"),
             precision_filter: str = "max",
             min_precision: float = 0.66,
             skip_multiword: bool = False,
             allow_multiword_overlap: bool = False,
             word_separator: str = "",
             lexicons=None):
    """Use the SALDO lexicon model (and optionally other, older lexicons) to annotate POS-tagged words.

    - token, word, msd, sentence, reference: existing annotations
    - out_baseform, out_lemgram, out_sense: resulting annotations to be written
    - models: a list of pickled lexicons, typically the SALDO model (saldo.pickle)
      and optional lexicons for older Swedish.
    - delimiter: delimiter character to put between ambiguous results
    - affix: an optional character to put before and after results
    - precision: a format string for how to print the precision for each annotation, e.g. ":%.3f"
      (use an empty string for no precision)
    - precision_filter: an optional filter, currently one of the following values:
        max: only use the annotations that are most probable
        first: only use the most probable annotation (or one of the most probable if there is a tie)
        none: use all annotations
    - min_precision: only use annotations with a probability score higher than this
    - skip_multiword: set to True to disable multi-word annotations
    - allow_multiword_overlap: by default some cleanup is done among overlapping multi-word annotations.
      Set this to True to allow all overlaps.
    - word_separator: an optional character used to split the values of "word" into several word variations
    - lexicons: this argument cannot be set from the command line, but is used in the catapult.
      This argument must be last.
    """
    # Allow use of multiple lexicons
    models_list = [(m.path.stem, m) for m in models]
    if not lexicons:
        lexicon_list = [(name, SaldoLexicon(lex.path)) for name, lex in models_list]
    # Use pre-loaded lexicons (from catapult)
    else:
        lexicon_list = []
        for name, _lex in models_list:
            assert lexicons.get(name, None) is not None, "Lexicon %s not found!" % name
            lexicon_list.append((name, lexicons[name]))

    # Maximum number of gaps in multi-word units.
    # TODO: Set to 0 for hist-mode? Since many (most?) multi-word units in the old lexicons
    # are inseparable (half öre etc.)
    max_gaps = 1

    # Combine annotation names in the SALDO lexicon with the output annotations
    annotations = []
    if out_baseform:
        annotations.append((out_baseform, "gf"))
    if out_lemgram:
        annotations.append((out_lemgram, "lem"))
    if out_sense:
        annotations.append((out_sense, "saldo"))

    if skip_multiword:
        log.info("Skipping multi word annotations")

    min_precision = float(min_precision)

    # If min_precision is 0, skip almost all part-of-speech checking (verb multi-word expressions
    # still won't be allowed to span over other verbs)
    skip_pos_check = (min_precision == 0.0)

    word_annotation = list(word.read())
    ref_annotation = list(reference.read())
    if msd:
        msd_annotation = list(msd.read())

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    out_annotation = word.create_empty_attribute()

    for sent in sentences:
        incomplete_multis = []  # [{annotation, words, [ref], is_particle, lastwordWasGap, numberofgaps}]
        complete_multis = []    # ([ref], annotation)
        sentence_tokens = {}

        for token_index in sent:
            theword = word_annotation[token_index]
            ref = ref_annotation[token_index]
            msdtag = msd_annotation[token_index] if msd else ""

            annotation_info = {}
            sentence_tokens[ref] = {"token_index": token_index, "annotations": annotation_info}

            # Support for multiple values of word
            if word_separator:
                thewords = [w for w in theword.split(word_separator) if w]
            else:
                thewords = [theword]

            # First use MSD tags to find the most probable single word annotations
            ann_tags_words = find_single_word(thewords, lexicon_list, msdtag, precision, min_precision,
                                              precision_filter, annotation_info)

            # Find multi-word expressions
            if not skip_multiword:
                find_multiword_expressions(incomplete_multis, complete_multis, thewords, ref, msdtag, max_gaps,
                                           ann_tags_words, msd_annotation, sent, skip_pos_check)

            # Loop to next token

        if not allow_multiword_overlap:
            # Check that we don't have any unwanted overlaps
            remove_unwanted_overlaps(complete_multis)

        # Then save the rest of the multi-word expressions in sentence_tokens
        save_multiwords(complete_multis, sentence_tokens)

        for tok in list(sentence_tokens.values()):
            out_annotation[tok["token_index"]] = _join_annotation(tok["annotations"], delimiter, affix)

        # Loop to next sentence

    for out_annotation_obj, annotation_name in annotations:
        out_annotation_obj.write([v.get(annotation_name, delimiter) for v in out_annotation])
def annotate(wsdjar: Binary = Binary("[wsd.jar]"),
             sense_model: Model = Model("[wsd.sense_model]"),
             context_model: Model = Model("[wsd.context_model]"),
             out: Output = Output("<token>:wsd.sense", cls="token:sense",
                                  description="Sense disambiguated SALDO identifiers"),
             sentence: Annotation = Annotation("<sentence>"),
             word: Annotation = Annotation("<token:word>"),
             ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
             lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
             saldo: Annotation = Annotation("<token>:saldo.sense"),
             pos: Annotation = Annotation("<token:pos>"),
             token: Annotation = Annotation("<token>"),
             prob_format: str = Config("wsd.prob_format"),
             default_prob: float = Config("wsd.default_prob"),
             encoding: str = util.UTF8):
    """Run the word sense disambiguation tool (saldowsd.jar) to add probabilities to the SALDO annotation.

    Unanalyzed senses (e.g. multiword expressions) receive the probability value given by default_prob.

    - wsdjar is the name of the Java program to be used for the WSD
    - sense_model and context_model are the models to be used with wsdjar
    - out is the resulting annotation file
    - sentence is an existing annotation for sentences and their children (words)
    - word is an existing annotation for word forms
    - ref is an existing annotation for word references
    - lemgram and saldo are existing annotations for inflection tables and meanings
    - pos is an existing annotation for part-of-speech
    - prob_format is a format string for how to print the sense probability
    - default_prob is the default value for unanalyzed senses
    """
    word_annotation = list(word.read())
    ref_annotation = list(ref.read())
    lemgram_annotation = list(lemgram.read())
    saldo_annotation = list(saldo.read())
    pos_annotation = list(pos.read())

    sentences, orphans = sentence.get_children(token)
    sentences.append(orphans)

    # Start WSD process
    process = wsd_start(wsdjar, sense_model.path, context_model.path, encoding)

    # Construct input and send to WSD
    stdin = build_input(sentences, word_annotation, ref_annotation, lemgram_annotation, saldo_annotation,
                        pos_annotation)
    if encoding:
        stdin = stdin.encode(encoding)

    stdout, stderr = process.communicate(stdin)
    # TODO: Solve the hack below!
    # The problem is that regular messages ("Reading sense vectors...") are also piped to stderr.
    if len(stderr) > 52:
        util.system.kill_process(process)
        log.error(str(stderr))
        return

    if encoding:
        stdout = stdout.decode(encoding)

    process_output(word, out, stdout, sentences, saldo_annotation, prob_format, default_prob)

    # Kill the running subprocess
    util.system.kill_process(process)
    return
def relations(out: OutputData = OutputData("korp.relations"),
              word: Annotation = Annotation("<token:word>"),
              pos: Annotation = Annotation("<token:pos>"),
              lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
              dephead: Annotation = Annotation("<token:dephead>"),
              deprel: Annotation = Annotation("<token:deprel>"),
              sentence_id: Annotation = Annotation("<sentence>:misc.id"),
              ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
              baseform: Annotation = Annotation("<token>:saldo.baseform")):
    """Find certain dependencies between words, to be used by the Word Picture feature in Korp."""
    sentence_ids = sentence_id.read()
    sentence_tokens, _ = sentence_id.get_children(word)

    annotations = list(word.read_attributes((word, pos, lemgram, dephead, deprel, ref, baseform)))

    # http://stp.ling.uu.se/~nivre/swedish_treebank/dep.html
    # Tuples with relations (head, rel, dep) to be found (with indexes) and an optional tuple
    # specifying which info should be stored and how
    rels = [
        ({1: "VB", 2: "SS", 3: "NN"}, {1: "VB", 4: "VG", 5: "VB"}, (5, 2, 3, "")),  # "han har sprungit"
        ({1: "VB", 2: "(SS|OO|IO|OA)", 3: "NN"},),
        ({1: "VB", 2: "(RA|TA)", 3: "(AB|NN)"},),
        ({1: "VB", 2: "(RA|TA)", 3: "PP"}, {3: "PP", 4: "(PA|HD)", 5: "NN"}, (1, 2, 5, "%(3)s")),  # "ges vid behov"
        ({1: "NN", 2: "(AT|ET)", 3: "JJ"},),  # "stor hund"
        ({1: "NN", 2: "ET", 3: "VB"}, {3: "VB", 4: "SS", 5: "HP"}, (1, 2, 3, "%(5)s")),  # "brödet som bakats"
        ({1: "NN", 2: "ET", 3: "PP"}, {3: "PP", 4: "PA", 5: "(NN|PM)"}, (1, 2, 5, "%(3)s")),  # "barnen i skolan", "hundarna i Sverige"
        ({1: "PP", 2: "PA", 3: "NN"},),  # "på bordet"
        ({1: "JJ", 2: "AA", 3: "AB"},)   # "fullständigt galen"
    ]

    null_rels = [
        ("VB", ["OO"]),  # Verbs lacking an object
    ]

    triples = []

    for sentid, sent in zip(sentence_ids, sentence_tokens):
        incomplete = {}  # Tokens looking for heads, with head as key
        tokens = {}      # Tokens in the same sentence, with token_index as key

        # Link the tokens together
        for token_index in sent:
            token_word, token_pos, token_lem, token_dh, token_dr, token_ref, token_bf = annotations[token_index]
            token_word = token_word.lower()

            if token_lem == "|":
                token_lem = token_word

            this = {
                "pos": token_pos,
                "lemgram": token_lem,
                "word": token_word,
                "head": None,
                "dep": [],
                "ref": token_ref,
                "bf": token_bf
            }

            tokens[token_index] = this

            if not token_dh == "-":
                token_dh = int(token_dh)
                # This token is looking for a head (token is not root)
                dep_triple = (token_dr, this)
                if token_dh in tokens:
                    # Found head. Link them together both ways
                    this["head"] = (token_dr, tokens[token_dh])
                    tokens[token_dh]["dep"].append(dep_triple)
                else:
                    incomplete.setdefault(token_dh, []).append((token_index, dep_triple))

            # Is someone else looking for the current token as head?
            if token_index in incomplete:
                for t in incomplete[token_index]:
                    tokens[t[0]]["head"] = this
                    this["dep"].append(t[1])
                del incomplete[token_index]

        assert not incomplete, "incomplete is not empty"

        def _match(pattern, value):
            return bool(re.match(r"^%s$" % pattern, value))

        def _findrel(head, rel, dep):
            result = []
            if isinstance(head, dict):
                for d in head["dep"]:
                    if _match(rel, d[0]) and _match(dep, d[1]["pos"]):
                        result.append(d[1])
            if isinstance(dep, dict):
                h = dep["head"]
                if h and _match(rel, h[0]) and _match(head, h[1]["pos"]):
                    result.append(h[1])
            return result

        # Look for relations
        for v in list(tokens.values()):
            for d in v["dep"]:
                for rel in rels:
                    r = rel[0]
                    if _match(";".join([x[1] for x in sorted(r.items())]), ";".join([v["pos"], d[0], d[1]["pos"]])):
                        triple = None
                        if len(rel) == 1:
                            triple = ((v["lemgram"], v["word"], v["pos"], v["ref"]), d[0],
                                      (d[1]["lemgram"], d[1]["word"], d[1]["pos"], d[1]["ref"]),
                                      ("", None), sentid, v["ref"], d[1]["ref"])
                        else:
                            lookup = dict(list(zip(list(map(str, sorted(r.keys()))), (v, d[0], d[1]))))
                            i = set(rel[0].keys()).intersection(set(rel[1].keys())).pop()
                            rel2 = [x[1] for x in sorted(rel[1].items())]
                            index1 = list(rel[0].keys()).index(i)
                            index2 = list(rel[1].keys()).index(i)
                            if index1 == 2 and index2 == 0:
                                result = _findrel(d[1], rel2[1], rel2[2])
                                if result:
                                    lookup.update(dict(list(zip(list(map(str, sorted(rel[1].keys()))),
                                                                (d[1], rel2[1], result[0])))))
                            elif index1 == 0 and index2 == 0:
                                result = _findrel(v, rel2[1], rel2[2])
                                if result:
                                    lookup.update(dict(list(zip(list(map(str, sorted(rel[1].keys()))),
                                                                (v, rel2[1], result[0])))))

                            pp = rel[-1]
                            if len(list(lookup.keys())) > 3:
                                lookup_bf = dict((key, val["bf"]) for key, val in list(lookup.items())
                                                 if isinstance(val, dict))
                                lookup_ref = dict((key, val["ref"]) for key, val in list(lookup.items())
                                                  if isinstance(val, dict))
                                triple = ((lookup[str(pp[0])]["lemgram"], lookup[str(pp[0])]["word"],
                                           lookup[str(pp[0])]["pos"], lookup[str(pp[0])]["ref"]),
                                          lookup[str(pp[1])],
                                          (lookup[str(pp[2])]["lemgram"], lookup[str(pp[2])]["word"],
                                           lookup[str(pp[2])]["pos"], lookup[str(pp[2])]["ref"]),
                                          (pp[3] % lookup_bf, pp[3] % lookup_ref), sentid,
                                          lookup[str(pp[0])]["ref"], lookup[str(pp[2])]["ref"])

                        if triple:
                            triples.extend(_mutate_triple(triple))
                            break

            token_rels = [d[0] for d in v["dep"]]
            for nrel in null_rels:
                if nrel[0] == v["pos"]:
                    missing_rels = [x for x in nrel[1] if x not in token_rels]
                    for mrel in missing_rels:
                        triple = ((v["lemgram"], v["word"], v["pos"], v["ref"]), mrel,
                                  ("", "", "", v["ref"]), ("", None), sentid, v["ref"], v["ref"])
                        triples.extend(_mutate_triple(triple))

    triples = sorted(set(triples))

    out_data = "\n".join(["\t".join((head, headpos, rel, dep, deppos, extra, sentid, refhead, refdep,
                                     str(bfhead), str(bfdep), str(wfhead), str(wfdep)))
                          for (head, headpos, rel, dep, deppos, extra, sentid, refhead, refdep,
                               bfhead, bfdep, wfhead, wfdep) in triples])
    out.write(out_data)
def annotate_text(out: Output, lexical_classes_token: Annotation, text: Annotation, token: Annotation,
                  saldoids, cutoff, types, delimiter, affix, freq_model, decimals):
    """Annotate text chunks with lexical classes.

    - out: resulting annotation file
    - lexical_classes_token: existing annotation with lexical classes on token level.
    - text, token: existing annotations for the text IDs and the tokens.
    - saldoids: existing annotation with SALDO IDs, needed when types=True.
    - cutoff: value for limiting the resulting Bring classes. The result will contain all words
      with the top x frequencies. Words with frequency = 1 will be removed from the result.
    - types: if True, count every class only once per SALDO ID occurrence.
    - delimiter: delimiter character to put between ambiguous results.
    - affix: optional character to put before and after results to mark a set.
    - freq_model: pickled file with reference frequencies.
    - decimals: number of decimals to keep in the output.
    """
    cutoff = int(cutoff)
    text_children, _orphans = text.get_children(token, preserve_parent_annotation_order=True)
    classes = list(lexical_classes_token.read())
    sense = list(saldoids.read()) if types else None

    if freq_model:
        freq_model = util.PickledLexicon(freq_model.path)

    out_annotation = text.create_empty_attribute()

    for text_index, words in enumerate(text_children):
        seen_types = set()
        class_freqs = defaultdict(int)

        for token_index in words:
            # Count only sense types
            if types:
                senses = str(sorted([s.split(util.SCORESEP)[0]
                                     for s in sense[token_index].strip(util.AFFIX).split(util.DELIM)]))
                if senses in seen_types:
                    continue
                else:
                    seen_types.add(senses)

            rogwords = classes[token_index].strip(util.AFFIX).split(util.DELIM) \
                if classes[token_index] != util.AFFIX else []
            for w in rogwords:
                class_freqs[w] += 1

        if freq_model:
            for c in class_freqs:
                # Relative frequency
                rel = class_freqs[c] / len(words)
                # Calculate class dominance
                ref_freq = freq_model.lookup(c.replace("_", " "), 0)
                if not ref_freq:
                    log.error("Class '%s' is missing" % c)
                class_freqs[c] = (rel / ref_freq)

        # Sort words according to frequency/dominance
        ordered_words = sorted(class_freqs.items(), key=lambda x: x[1], reverse=True)
        if freq_model:
            # Remove words with dominance < 1
            ordered_words = [w for w in ordered_words if w[1] >= 1]
        else:
            # Remove words with frequency 1
            ordered_words = [w for w in ordered_words if w[1] > 1]

        if len(ordered_words) > cutoff:
            cutoff_freq = ordered_words[cutoff - 1][1]
            ordered_words = [w for w in ordered_words if w[1] >= cutoff_freq]

        # Join words and frequencies/dominances
        ordered_words = [util.SCORESEP.join([word, str(round(freq, decimals))]) for word, freq in ordered_words]
        out_annotation[text_index] = util.cwbset(ordered_words, delimiter, affix) if ordered_words else affix

    out.write(out_annotation)
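
# Worked example of the dominance computation above (hypothetical numbers): if a text has
# 200 tokens and a class is counted 4 times, its relative frequency is 4 / 200 = 0.02.
# With a reference frequency of 0.01 the dominance is 0.02 / 0.01 = 2.0, so the class is
# kept (dominance >= 1); with a reference frequency of 0.05 it is 0.4 and is dropped.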
def annotate(out_ne: Output = Output("swener.ne", cls="named_entity", description="Named entity segments from SweNER"),
             out_ne_ex: Output = Output("swener.ne:swener.ex", description="Named entity expressions from SweNER"),
             out_ne_type: Output = Output("swener.ne:swener.type", cls="named_entity:type",
                                          description="Named entity types from SweNER"),
             out_ne_subtype: Output = Output("swener.ne:swener.subtype", cls="named_entity:subtype",
                                             description="Named entity sub types from SweNER"),
             out_ne_name: Output = Output("swener.ne:swener.name", cls="named_entity:name",
                                          description="Names in SweNER named entities"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             token: Annotation = Annotation("<token>"),
             binary: Binary = Binary("[swener.binary]"),
             process_dict=None):
    """Tag named entities using HFST-SweNER.

    SweNER is either run in an already started process defined in process_dict,
    or a new process is started (default).

    - word, sentence, token: existing annotations
    - out_ne_ex, out_ne_type, out_ne_subtype: resulting annotation files for the named entities
    - process_dict is used in the catapult and should never be set from the command line
    """
    if process_dict is None:
        process = swenerstart(binary, "", util.UTF8, verbose=False)
    # else:
    #     process = process_dict["process"]
    #     # If the process seems dead, spawn a new one
    #     if process.stdin.closed or process.stdout.closed or process.poll():
    #         util.system.kill_process(process)
    #         process = swenerstart("", encoding, verbose=False)
    #         process_dict["process"] = process

    # Get sentence annotation
    sentences, _orphans = sentence.get_children(token, orphan_alert=True)

    # Collect all text
    word_annotation = list(word.read())
    stdin = SENT_SEP.join(TOK_SEP.join(word_annotation[token_index] for token_index in sent)
                          for sent in sentences)

    # Escape <, > and &
    stdin = xml.sax.saxutils.escape(stdin)

    # keep_process = len(stdin) < RESTART_THRESHOLD_LENGTH and process_dict is not None
    # log.info("Stdin length: %s, keep process: %s", len(stdin), keep_process)

    # if process_dict is not None:
    #     process_dict["restart"] = not keep_process

    # # Does not work as of now since swener does not have an interactive mode
    # if keep_process:
    #     # Chatting with swener: send a SENT_SEP and read correct number of lines
    #     stdin_fd, stdout_fd = process.stdin, process.stdout
    #     stdin_fd.write(stdin.encode(encoding) + SENT_SEP)
    #     stdin_fd.flush()
    #     stout = stdout_fd.readlines()
    # else:

    # Otherwise use communicate which buffers properly
    # log.info("STDIN %s %s", type(stdin.encode(encoding)), stdin.encode(encoding))
    stdout, _ = process.communicate(stdin.encode(util.UTF8))
    # log.info("STDOUT %s %s", type(stdout.decode(encoding)), stdout.decode(encoding))

    parse_swener_output(sentences, token, stdout.decode(util.UTF8), out_ne, out_ne_ex, out_ne_type, out_ne_subtype,
                        out_ne_name)
def annotate(out_phrase: Output = Output("phrase_structure.phrase", description="Phrase segments"),
             out_phrase_name: Output = Output("phrase_structure.phrase:phrase_structure.name",
                                              description="Phrase names"),
             out_phrase_func: Output = Output("phrase_structure.phrase:phrase_structure.func",
                                              description="Phrase functions"),
             token: Annotation = Annotation("<token>"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             pos: Annotation = Annotation("<token:pos>"),
             msd: Annotation = Annotation("<token:msd>"),
             ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
             dephead_ref: Annotation = Annotation("<token:dephead_ref>"),
             deprel: Annotation = Annotation("<token:deprel>")):
    """Annotate sentences with phrase structures."""
    sentences, _orphans = sentence.get_children(word)
    token_annotations = list(ref.read_attributes([ref, word, pos, msd, dephead_ref, deprel]))
    token_spans = list(token.read_spans())

    def get_token_span(index):
        return token_spans[index]

    nodes = []

    for s in sentences:
        tokenlist = [Token(None)]
        for token_index in s:
            token = token_annotations[token_index]
            tokenlist.append(Token(token))

        # Get PS tree
        sen = Sentence(tokenlist)
        if not sen.is_cyclic():
            tree = convert_sentence(sen).top.to_tree_str()
            # print(pprint.pformat(tree), file=sys.stderr)

            # Make nodes
            children = flatten_tree(tree[1], [])
            log.debug("\n\nSENTENCE:")
            position = 0
            open_elem_stack = []
            for child in children:
                if not child[0].startswith("WORD:"):
                    start_pos = get_token_span(s[position])[0]
                    open_elem_stack.append(child + (start_pos,))
                    log.debug(f"<phrase name={child[0]} func={child[1]}> {s[position]}")
                else:
                    # Close nodes
                    while open_elem_stack[-1][2] == child[2]:
                        start_pos = open_elem_stack[-1][3]
                        end_pos = get_token_span(s[position - 1])[1]
                        nodes.append(((start_pos, end_pos), open_elem_stack[-1][0], open_elem_stack[-1][1]))
                        log.debug(f"</phrase name={open_elem_stack[-1][0]} func={open_elem_stack[-1][1]}> "
                                  f"{start_pos}-{end_pos}")
                        open_elem_stack.pop()
                    position += 1
                    log.debug(f" {child[0][5:]}")

            # Close remaining open nodes
            end_pos = get_token_span(s[-1])[1]
            for elem in reversed(open_elem_stack):
                start_pos = elem[3]
                nodes.append(((start_pos, end_pos), elem[0], elem[1]))
                log.debug(f"</phrase name={elem[0]} func={elem[1]}> {start_pos}-{end_pos}")

    # Sort nodes
    sorted_nodes = sorted(nodes)

    # Write annotations
    out_phrase.write([i[0] for i in sorted_nodes])
    out_phrase_name.write([i[1] for i in sorted_nodes])
    out_phrase_func.write([i[2] for i in sorted_nodes])