# Assumed module-level imports for the excerpts below. The exact Sparv API
# import path depends on the Sparv version, and module-local helpers such as
# actual_words, lix_calc, ovix_calc, _mutate_triple, SaldoCompLexicon and
# Token/Sentence are defined elsewhere in their respective modules.
import logging
import pickle
import re
from typing import List

from sparv import Annotation, Model, Output, OutputData
import sparv.util as util

log = logging.getLogger(__name__)  # assumed logger setup


def lix(text: Annotation = Annotation("<text>"),
        sentence: Annotation = Annotation("<sentence>"),
        word: Annotation = Annotation("<token:word>"),
        pos: Annotation = Annotation("<token:pos>"),
        out: Output = Output("<text>:readability.lix", description="LIX values for text chunks"),
        skip_pos: List[str] = ["MAD", "MID", "PAD"],
        fmt: str = "%.2f"):
    """Create LIX annotation for text."""
    # Read annotation files and get parent-children relations
    text_children, _orphans = text.get_children(sentence)
    word_pos = list(word.read_attributes((word, pos)))
    sentence_children, _orphans = sentence.get_children(word)
    sentence_children = list(sentence_children)

    # Calculate LIX for every text element
    lix_annotation = []
    for text in text_children:
        in_sentences = []
        for sentence_index in text:
            s = sentence_children[sentence_index]
            in_sentences.append(list(actual_words([word_pos[token_index] for token_index in s], skip_pos)))
        lix_annotation.append(fmt % lix_calc(in_sentences))

    out.write(lix_annotation)
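# For reference, a minimal standalone sketch of the computation lix_calc is
# assumed to perform (the standard LIX definition: mean sentence length plus
# the percentage of words longer than six characters). Illustration only, not
# the pipeline's implementation; _lix_sketch is a hypothetical name.
def _lix_sketch(sentences):
    """Compute LIX for a list of sentences, each a list of word strings."""
    n_words = sum(len(s) for s in sentences)
    if not sentences or not n_words:
        return 0.0
    n_long = sum(1 for s in sentences for w in s if len(w) > 6)
    return n_words / len(sentences) + 100 * n_long / n_words

# Example: _lix_sketch([["detta", "är", "en", "exempelmening"]]) -> 4.0 + 25.0 = 29.0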
def ovix(text: Annotation = Annotation("<text>"),
         word: Annotation = Annotation("<token:word>"),
         pos: Annotation = Annotation("<token:pos>"),
         out: Output = Output("<text>:readability.ovix", description="OVIX values for text chunks"),
         skip_pos: List[str] = ["MAD", "MID", "PAD"],
         fmt: str = "%.2f"):
    """Create OVIX annotation for text."""
    text_children, _orphans = text.get_children(word)
    word_pos = list(word.read_attributes((word, pos)))

    # Calculate OVIX for every text element
    ovix_annotation = []
    for text in text_children:
        in_words = list(actual_words([word_pos[token_index] for token_index in text], skip_pos))
        ovix_annotation.append(fmt % ovix_calc(in_words))

    out.write(ovix_annotation)
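# Likewise, a minimal sketch of the word variation index that ovix_calc is
# assumed to compute: OVIX = log(n) / log(2 - log(v) / log(n)) for n tokens
# and v distinct types. Illustration only; _ovix_sketch is a hypothetical
# name, and the guard for degenerate samples (where the denominator would be
# zero) is a choice made here, not necessarily the pipeline's.
import math

def _ovix_sketch(words):
    """Compute OVIX for a flat list of word tokens."""
    n = len(words)
    v = len(set(words))
    if n < 2 or v == n:  # all-unique samples make log(2 - 1) == 0
        return 0.0
    return math.log(n) / math.log(2 - math.log(v) / math.log(n))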
def relations(out: OutputData = OutputData("korp.relations"),
              word: Annotation = Annotation("<token:word>"),
              pos: Annotation = Annotation("<token:pos>"),
              lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
              dephead: Annotation = Annotation("<token:dephead>"),
              deprel: Annotation = Annotation("<token:deprel>"),
              sentence_id: Annotation = Annotation("<sentence>:misc.id"),
              ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
              baseform: Annotation = Annotation("<token>:saldo.baseform")):
    """Find certain dependencies between words, to be used by the Word Picture feature in Korp."""
    sentence_ids = sentence_id.read()
    sentence_tokens, _ = sentence_id.get_children(word)
    annotations = list(word.read_attributes((word, pos, lemgram, dephead, deprel, ref, baseform)))

    # http://stp.ling.uu.se/~nivre/swedish_treebank/dep.html
    # Tuples with relations (head, rel, dep) to be found (with indexes) and an
    # optional tuple specifying which info should be stored and how
    rels = [
        ({1: "VB", 2: "SS", 3: "NN"},
         {1: "VB", 4: "VG", 5: "VB"},
         (5, 2, 3, "")),  # "han har sprungit"
        ({1: "VB", 2: "(SS|OO|IO|OA)", 3: "NN"},),
        ({1: "VB", 2: "(RA|TA)", 3: "(AB|NN)"},),
        ({1: "VB", 2: "(RA|TA)", 3: "PP"},
         {3: "PP", 4: "(PA|HD)", 5: "NN"},
         (1, 2, 5, "%(3)s")),  # "ges vid behov"
        ({1: "NN", 2: "(AT|ET)", 3: "JJ"},),  # "stor hund"
        ({1: "NN", 2: "ET", 3: "VB"},
         {3: "VB", 4: "SS", 5: "HP"},
         (1, 2, 3, "%(5)s")),  # "brödet som bakats"
        ({1: "NN", 2: "ET", 3: "PP"},
         {3: "PP", 4: "PA", 5: "(NN|PM)"},
         (1, 2, 5, "%(3)s")),  # "barnen i skolan", "hundarna i Sverige"
        ({1: "PP", 2: "PA", 3: "NN"},),  # "på bordet"
        ({1: "JJ", 2: "AA", 3: "AB"},)  # "fullständigt galen"
    ]

    null_rels = [
        ("VB", ["OO"]),  # Verbs lacking an object
    ]

    triples = []

    for sentid, sent in zip(sentence_ids, sentence_tokens):
        incomplete = {}  # Tokens looking for heads, with head as key
        tokens = {}  # Tokens in the same sentence, with token_index as key

        # Link the tokens together
        for token_index in sent:
            token_word, token_pos, token_lem, token_dh, token_dr, token_ref, token_bf = annotations[token_index]
            token_word = token_word.lower()

            if token_lem == "|":
                token_lem = token_word

            this = {
                "pos": token_pos,
                "lemgram": token_lem,
                "word": token_word,
                "head": None,
                "dep": [],
                "ref": token_ref,
                "bf": token_bf
            }

            tokens[token_index] = this

            if token_dh != "-":
                token_dh = int(token_dh)
                # This token is looking for a head (token is not root)
                dep_triple = (token_dr, this)
                if token_dh in tokens:
                    # Found head. Link them together both ways
                    this["head"] = (token_dr, tokens[token_dh])
                    tokens[token_dh]["dep"].append(dep_triple)
                else:
                    incomplete.setdefault(token_dh, []).append((token_index, dep_triple))

            # Is someone else looking for the current token as head?
            if token_index in incomplete:
                for t in incomplete[token_index]:
                    # Link both ways, mirroring the "found head" case above:
                    # the head slot holds a (deprel, head_token) pair, as
                    # expected by _findrel below
                    tokens[t[0]]["head"] = (t[1][0], this)
                    this["dep"].append(t[1])
                del incomplete[token_index]

        assert not incomplete, "incomplete is not empty"

        def _match(pattern, value):
            return bool(re.match(r"^%s$" % pattern, value))

        def _findrel(head, rel, dep):
            result = []
            if isinstance(head, dict):
                for d in head["dep"]:
                    if _match(rel, d[0]) and _match(dep, d[1]["pos"]):
                        result.append(d[1])
            if isinstance(dep, dict):
                h = dep["head"]
                if h and _match(rel, h[0]) and _match(head, h[1]["pos"]):
                    result.append(h[1])
            return result

        # Look for relations
        for v in tokens.values():
            for d in v["dep"]:
                for rel in rels:
                    r = rel[0]
                    if _match(";".join(x[1] for x in sorted(r.items())),
                              ";".join([v["pos"], d[0], d[1]["pos"]])):
                        triple = None
                        if len(rel) == 1:
                            triple = ((v["lemgram"], v["word"], v["pos"], v["ref"]), d[0],
                                      (d[1]["lemgram"], d[1]["word"], d[1]["pos"], d[1]["ref"]),
                                      ("", None), sentid, v["ref"], d[1]["ref"])
                        else:
                            lookup = dict(zip(map(str, sorted(r.keys())), (v, d[0], d[1])))
                            i = set(rel[0].keys()).intersection(set(rel[1].keys())).pop()
                            rel2 = [x[1] for x in sorted(rel[1].items())]
                            index1 = list(rel[0].keys()).index(i)
                            index2 = list(rel[1].keys()).index(i)
                            if index1 == 2 and index2 == 0:
                                result = _findrel(d[1], rel2[1], rel2[2])
                                if result:
                                    lookup.update(zip(map(str, sorted(rel[1].keys())),
                                                      (d[1], rel2[1], result[0])))
                            elif index1 == 0 and index2 == 0:
                                result = _findrel(v, rel2[1], rel2[2])
                                if result:
                                    lookup.update(zip(map(str, sorted(rel[1].keys())),
                                                      (v, rel2[1], result[0])))

                            pp = rel[-1]
                            if len(lookup) > 3:
                                lookup_bf = {key: val["bf"] for key, val in lookup.items()
                                             if isinstance(val, dict)}
                                lookup_ref = {key: val["ref"] for key, val in lookup.items()
                                              if isinstance(val, dict)}
                                triple = ((lookup[str(pp[0])]["lemgram"], lookup[str(pp[0])]["word"],
                                           lookup[str(pp[0])]["pos"], lookup[str(pp[0])]["ref"]),
                                          lookup[str(pp[1])],
                                          (lookup[str(pp[2])]["lemgram"], lookup[str(pp[2])]["word"],
                                           lookup[str(pp[2])]["pos"], lookup[str(pp[2])]["ref"]),
                                          (pp[3] % lookup_bf, pp[3] % lookup_ref),
                                          sentid, lookup[str(pp[0])]["ref"], lookup[str(pp[2])]["ref"])
                        if triple:
                            triples.extend(_mutate_triple(triple))
                            break

            token_rels = [d[0] for d in v["dep"]]
            for nrel in null_rels:
                if nrel[0] == v["pos"]:
                    missing_rels = [x for x in nrel[1] if x not in token_rels]
                    for mrel in missing_rels:
                        triple = ((v["lemgram"], v["word"], v["pos"], v["ref"]), mrel,
                                  ("", "", "", v["ref"]), ("", None), sentid, v["ref"], v["ref"])
                        triples.extend(_mutate_triple(triple))

    triples = sorted(set(triples))

    out_data = "\n".join(["\t".join((head, headpos, rel, dep, deppos, extra, sentid, refhead, refdep,
                                     str(bfhead), str(bfdep), str(wfhead), str(wfdep)))
                          for (head, headpos, rel, dep, deppos, extra, sentid, refhead, refdep,
                               bfhead, bfdep, wfhead, wfdep) in triples])
    out.write(out_data)
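# To illustrate the matching convention used in relations() above: each
# pattern dict in rels is flattened (in key order) to a ";"-joined regex over
# (head POS, relation, dependent POS) and matched against the corresponding
# triple from the parse. For the "stor hund" pattern, for example:
#
#     r = {1: "NN", 2: "(AT|ET)", 3: "JJ"}
#     regex = ";".join(x[1] for x in sorted(r.items()))  # -> "NN;(AT|ET);JJ"
#     bool(re.match("^%s$" % regex, "NN;AT;JJ"))  # -> True
#     bool(re.match("^%s$" % regex, "NN;SS;JJ"))  # -> False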
def annotate(out_complemgrams: Output = Output("<token>:saldo.complemgram",
                                               description="Compound analysis using lemgrams"),
             out_compwf: Output = Output("<token>:saldo.compwf",
                                         description="Compound analysis using wordforms"),
             out_baseform: Output = Output("<token>:saldo.baseform2",
                                           description="Baseform including baseforms derived from compounds"),
             word: Annotation = Annotation("<token:word>"),
             msd: Annotation = Annotation("<token:msd>"),
             baseform_tmp: Annotation = Annotation("<token>:saldo.baseform"),
             saldo_comp_model: Model = Model("[saldo.comp_model]"),
             nst_model: Model = Model("[saldo.comp_nst_model]"),
             stats_model: Model = Model("[saldo.comp_stats_model]"),
             complemgramfmt: str = util.SCORESEP + "%.3e",
             delimiter: str = util.DELIM,
             compdelim: str = util.COMPSEP,
             affix: str = util.AFFIX,
             cutoff: bool = True,
             saldo_comp_lexicon=None,
             stats_lexicon=None):
    """Divide compound words into prefix(es) and suffix.

    - out_complemgrams is the resulting annotation file for compound lemgrams and their probabilities
    - out_compwf is the resulting annotation file for compound wordforms
    - out_baseform is the resulting annotation file for baseforms (including baseforms for compounds)
    - word and msd are existing annotations for wordforms and MSDs
    - baseform_tmp is the existing temporary annotation file for baseforms (not including compounds)
    - saldo_comp_model is the Saldo compound model
    - nst_model is the NST part-of-speech compound model
    - stats_model is the statistics model (pickled file)
    - complemgramfmt is a format string for how to print the complemgram and its probability
      (use an empty string to omit the probability)
    - saldo_comp_lexicon, stats_lexicon: these arguments cannot be set from the command line,
      but are used in the catapult. These arguments must be last.
    """
    ##################
    # Load models
    ##################
    if not saldo_comp_lexicon:
        saldo_comp_lexicon = SaldoCompLexicon(saldo_comp_model.path)

    with open(nst_model.path, "rb") as f:
        nst_model = pickle.load(f)

    if not stats_lexicon:
        stats_lexicon = StatsLexicon(stats_model.path)

    word_msd_baseform_annotations = list(word.read_attributes((word, msd, baseform_tmp)))

    # Create alternative lexicon (for words within the file)
    altlexicon = InFileLexicon(word_msd_baseform_annotations)

    ##################
    # Do annotation
    ##################
    complem_annotation = []
    compwf_annotation = []
    baseform_annotation = []

    previous_compounds = {}

    for word, msd, baseform_orig in word_msd_baseform_annotations:
        key = (word, msd)
        if key in previous_compounds:
            compounds = previous_compounds[key]
        else:
            compounds = compound(saldo_comp_lexicon, altlexicon, word, msd)

            if compounds:
                compounds = rank_compounds(compounds, nst_model, stats_lexicon)

                if cutoff:
                    # Only keep analyses with the same length (or +1) as the most probable one
                    best_length = len(compounds[0][1])
                    i = 0
                    for c in compounds:
                        if len(c[1]) > best_length + 1 or len(c[1]) < best_length:
                            break
                        i += 1
                    compounds = compounds[:i]

            previous_compounds[key] = compounds

        # Create complem and compwf annotations
        make_complem_and_compwf(complem_annotation, compwf_annotation, complemgramfmt,
                                compounds, compdelim, delimiter, affix)

        # Create new baseform annotation if necessary
        if baseform_orig != affix:
            baseform_annotation.append(baseform_orig)
        else:
            make_new_baseforms(baseform_annotation, msd, compounds, stats_lexicon,
                               altlexicon, delimiter, affix)

    out_complemgrams.write(complem_annotation)
    out_compwf.write(compwf_annotation)
    out_baseform.write(baseform_annotation)
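# To illustrate the cutoff logic in annotate() above: analyses are ranked by
# probability, and only those whose segment count equals the best analysis's
# count (or exceeds it by one) are kept. With hypothetical segment counts for
# four ranked analyses:
#
#     lengths = [2, 2, 3, 4]      # len(c[1]) for each ranked analysis
#     best_length = lengths[0]    # 2
#     # the loop accepts 2, 2 and 3, breaks at 4 -> compounds[:3] survive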
def annotate(out_phrase: Output = Output("phrase_structure.phrase", description="Phrase segments"),
             out_phrase_name: Output = Output("phrase_structure.phrase:phrase_structure.name",
                                              description="Phrase names"),
             out_phrase_func: Output = Output("phrase_structure.phrase:phrase_structure.func",
                                              description="Phrase functions"),
             token: Annotation = Annotation("<token>"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             pos: Annotation = Annotation("<token:pos>"),
             msd: Annotation = Annotation("<token:msd>"),
             ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
             dephead_ref: Annotation = Annotation("<token:dephead_ref>"),
             deprel: Annotation = Annotation("<token:deprel>")):
    """Annotate sentence with phrase structures."""
    sentences, _orphans = sentence.get_children(word)
    token_annotations = list(ref.read_attributes([ref, word, pos, msd, dephead_ref, deprel]))
    token_spans = list(token.read_spans())

    def get_token_span(index):
        return token_spans[index]

    nodes = []

    for s in sentences:
        tokenlist = [Token(None)]
        for token_index in s:
            token = token_annotations[token_index]
            tokenlist.append(Token(token))

        # Get PS tree
        sen = Sentence(tokenlist)
        if not sen.is_cyclic():
            tree = convert_sentence(sen).top.to_tree_str()
            # print(pprint.pformat(tree), file=sys.stderr)

            # Make nodes
            children = flatten_tree(tree[1], [])
            log.debug("\n\nSENTENCE:")
            position = 0
            open_elem_stack = []
            for child in children:
                if not child[0].startswith("WORD:"):
                    start_pos = get_token_span(s[position])[0]
                    open_elem_stack.append(child + (start_pos,))
                    log.debug(f"<phrase name={child[0]} func={child[1]}> {s[position]}")
                else:
                    # Close nodes
                    while open_elem_stack[-1][2] == child[2]:
                        start_pos = open_elem_stack[-1][3]
                        end_pos = get_token_span(s[position - 1])[1]
                        nodes.append(((start_pos, end_pos), open_elem_stack[-1][0], open_elem_stack[-1][1]))
                        log.debug(f"</phrase name={open_elem_stack[-1][0]} "
                                  f"func={open_elem_stack[-1][1]}> {start_pos}-{end_pos}")
                        open_elem_stack.pop()
                    position += 1
                    log.debug(f" {child[0][5:]}")

            # Close remaining open nodes
            end_pos = get_token_span(s[-1])[1]
            for elem in reversed(open_elem_stack):
                start_pos = elem[3]
                nodes.append(((start_pos, end_pos), elem[0], elem[1]))
                log.debug(f"</phrase name={elem[0]} func={elem[1]}> {start_pos}-{end_pos}")

    # Sort nodes
    sorted_nodes = sorted(nodes)

    # Write annotations
    out_phrase.write([i[0] for i in sorted_nodes])
    out_phrase_name.write([i[1] for i in sorted_nodes])
    out_phrase_func.write([i[2] for i in sorted_nodes])