def make_new_baseforms(OUT_baseform, tokid, msd_tag, compounds, stats_lexicon, altlexicon, delimiter, affix):
    """Add a list of baseforms to the dictionary OUT_baseform[tokid]."""
    baseform_list = []
    msd_tag = msd_tag[:msd_tag.find('.')]

    for comp in compounds:
        comp = comp[1]
        base_suffix = comp[-1][1][:comp[-1][1].find('.')]
        prefix = comp[0][0]

        # If the first letter is upper case, check whether one of the affixes is a name:
        if prefix[0] == prefix[0].upper():
            if not any(True for a in comp if "pm" in a[1][a[1].find('.'):]):
                baseform = ''.join(affix[0].lower() for affix in comp[:-1]) + base_suffix
            else:
                baseform = ''.join(affix[0] for affix in comp[:-1]) + base_suffix
        else:
            baseform = ''.join(affix[0] for affix in comp[:-1]) + base_suffix

        # Check if this baseform with the MSD tag occurs in stats_lexicon
        if baseform not in baseform_list:
            if stats_lexicon.lookup_word_tag_freq(baseform, msd_tag) > 0 \
                    or altlexicon.lookup(baseform.lower()) != []:
                baseform_list.append(baseform)

    # Update dictionary
    OUT_baseform[tokid] = util.cwbset(baseform_list, delimiter, affix) if (compounds and baseform_list) else affix

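# Hedged illustration (not taken from the pipeline itself): the code above assumes each
# compound analysis is a (probability, parts) pair where every part is a (wordform, lemgram)
# tuple. Under that assumption, a hypothetical analysis of "vändskiva" would look like this,
# and the reconstructed baseform is the prefix wordforms joined with the baseform of the last
# part's lemgram:
#
#     compounds = [(0.5, [("vänd", "vända..vb.1"), ("skiva", "skiva..nn.1")])]
#     # -> baseform "vändskiva", kept only if stats_lexicon or altlexicon recognises it
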
def process_output(out, stdout, in_sentences, SALDO, sensefmt, default_prob):
    """Parse WSD output and write annotation."""
    OUT = {}

    # Split output into sentences
    out_sentences = stdout.strip()
    out_sentences = out_sentences.split("\t".join(["_", "_", "_", "_", SENT_SEP, "_", "_"]))
    out_sentences = [i for i in out_sentences if i]

    # Split output into tokens
    for out_sent, in_sent in zip(out_sentences, in_sentences):
        out_tokens = [t for t in out_sent.split("\n") if t]
        for (out_tok, in_tok) in zip(out_tokens, in_sent):
            out_prob = out_tok.split("\t")[6]
            out_prob = [i for i in out_prob.split("|") if i != "_"]
            out_meanings = [i for i in out_tok.split("\t")[5].split("|") if i != "_"]
            saldo = [i for i in SALDO[in_tok].strip(util.AFFIX).split(util.DELIM) if i]

            new_saldo = []
            if out_prob:
                for meaning in saldo:
                    if meaning in out_meanings:
                        i = out_meanings.index(meaning)
                        new_saldo.append(meaning + sensefmt % float(out_prob[i]))
                    else:
                        new_saldo.append(meaning + sensefmt % float(default_prob))
            else:
                new_saldo = [meaning + sensefmt % float(default_prob) for meaning in saldo]

            # Sort by probability
            new_saldo = sorted(new_saldo, key=lambda x: float(x.split(":")[-1]), reverse=True)
            OUT[in_tok] = util.cwbset(new_saldo)

    util.write_annotation(out, OUT)

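# Hedged sketch of the assumed WSD output format: one tab-separated line per token with
# seven fields, where field 6 (index 5) holds "|"-separated sense candidates and field 7
# (index 6) their probabilities; sentences are separated by a line whose fifth field is
# SENT_SEP. A hypothetical token line (field contents are illustrative only):
#
#     "kastar\tkasta..vb.1\tVB\t_\t_\t|kasta..1|kasta..2|\t|0.84|0.16|"
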
def make_complem_and_compwf(OUT_complem, OUT_compwf, complemgramfmt, tokid, compounds, compdelim, delimiter, affix):
    """Add a list of compound lemgrams to the dictionary OUT_complem[tokid]
    and a list of compound wordforms to OUT_compwf."""
    complem_list = []
    compwf_list = []

    for comp in compounds:
        prob = comp[0]
        comp = comp[1]
        complems = True
        for a in comp:
            if a[1] == '0':
                complems = False
                break
        if complems:
            if complemgramfmt:
                # Construct complemgram + lemprob
                complem_list.append(compdelim.join(affix[1] for affix in comp) + complemgramfmt % prob)
            else:
                complem_list.append(compdelim.join(affix[1] for affix in comp))

        # If the first letter is upper case, check whether one of the affixes may be a name:
        if comp[0][0][0] == comp[0][0][0].upper():
            if not any([True for a in comp if "pm" in a[1][a[1].find('.'):]]
                       + [True for a in comp if "PM" in a[2]]):
                wf = compdelim.join(affix[0].lower() for affix in comp)
            else:
                wf = compdelim.join(affix[0] for affix in comp)
        else:
            wf = compdelim.join(affix[0] for affix in comp)

        if wf not in compwf_list:
            compwf_list.append(wf)

    # Update dictionaries
    OUT_complem[tokid] = util.cwbset(complem_list, delimiter, affix) if compounds and complem_list else affix
    OUT_compwf[tokid] = util.cwbset(compwf_list, delimiter, affix) if compounds else affix

def make_complem_and_compwf(out_complem, out_compwf, complemgramfmt, compounds, compdelim, delimiter, affix):
    """Add a list of compound lemgrams to out_complem and a list of compound wordforms to out_compwf."""
    complem_list = []
    compwf_list = []

    for comp in compounds:
        prob = comp[0]
        comp = comp[1]
        complems = True
        for a in comp:
            if a[1] == "0":
                complems = False
                break
        if complems:
            if complemgramfmt:
                # Construct complemgram + lemprob
                complem_list.append(compdelim.join(affix[1] for affix in comp) + complemgramfmt % prob)
            else:
                complem_list.append(compdelim.join(affix[1] for affix in comp))

        # If the first letter is upper case, check whether one of the affixes may be a name:
        if comp[0][0][0] == comp[0][0][0].upper():
            if not any([True for a in comp if "pm" in a[1][a[1].find("."):]]
                       + [True for a in comp if "PM" in a[2]]):
                wf = compdelim.join(affix[0].lower() for affix in comp)
            else:
                wf = compdelim.join(affix[0] for affix in comp)
        else:
            wf = compdelim.join(affix[0] for affix in comp)

        if wf not in compwf_list:
            compwf_list.append(wf)

    # Add to annotations
    out_complem.append(util.cwbset(complem_list, delimiter, affix) if compounds and complem_list else affix)
    out_compwf.append(util.cwbset(compwf_list, delimiter, affix) if compounds else affix)

def truncateset(string, maxlength=4095, delimiter="|", affix="|", encoding="UTF-8"):
    """Truncate a Corpus Workbench set to a maximum length."""
    if len(string) <= maxlength or string == "|":
        return string
    else:
        length = 1  # Including the last affix
        values = string[1:-1].split("|")
        for i, value in enumerate(values):
            length += len(value.encode(encoding)) + 1
            if length > maxlength:
                return util.cwbset(values[:i], delimiter, affix)

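# All of these functions rely on util.cwbset to serialize value lists. As a rough,
# non-authoritative sketch of the assumed format (the real helper lives in Sparv's util
# module and also handles sorting and truncation): values are joined with the delimiter
# and wrapped in the affix character, and the empty set is a lone "|".
def _cwbset_sketch(values, delimiter="|", affix="|"):
    """Illustrative only: ["a", "b"] -> "|a|b|", [] -> "|"."""
    values = list(values)
    return (affix + delimiter.join(values) + affix) if values else affix

# With that format, truncateset keeps a valid prefix of the set within the byte budget,
# dropping whole trailing values rather than cutting one in half, e.g.
# truncateset("|alpha|beta|gamma|", maxlength=12) -> "|alpha|beta|".
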
def ufeatstag(out: Output = Output("<token>:misc.ufeats",
                                   cls="token:ufeats",
                                   description="Universal morphological features"),
              pos: Annotation = Annotation("<token:pos>"),
              msd: Annotation = Annotation("<token:msd>")):
    """Convert SUC MSD tags to universal features."""
    pos_tags = pos.read()
    msd_tags = msd.read()
    out_annotation = []

    for pos_tag, msd_tag in zip(pos_tags, msd_tags):
        feats = util.tagsets.suc_to_feats(pos_tag, msd_tag)
        out_annotation.append(util.cwbset(feats))

    out.write(out_annotation)

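# Hedged expectation of what the conversion yields (the exact feature inventory depends on
# util.tagsets.suc_to_feats): a SUC pair like ("NN", "NN.UTR.SIN.DEF.NOM") should map to
# universal features along the lines of ["Case=Nom", "Definite=Def", "Gender=Com",
# "Number=Sing"], which util.cwbset then renders as
# "|Case=Nom|Definite=Def|Gender=Com|Number=Sing|".
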
def process_output(word: Annotation, out: Output, stdout, in_sentences, saldo_annotation, prob_format, default_prob):
    """Parse WSD output and write annotation."""
    out_annotation = word.create_empty_attribute()

    # Split output into sentences
    out_sentences = stdout.strip()
    out_sentences = out_sentences.split("\t".join(["_", "_", "_", "_", SENT_SEP, "_", "_"]))
    out_sentences = [i for i in out_sentences if i]

    # Split output into tokens
    for out_sent, in_sent in zip(out_sentences, in_sentences):
        out_tokens = [t for t in out_sent.split("\n") if t]
        for (out_tok, in_tok) in zip(out_tokens, in_sent):
            out_prob = out_tok.split("\t")[6]
            out_prob = [i for i in out_prob.split("|") if i != "_"]
            out_meanings = [i for i in out_tok.split("\t")[5].split("|") if i != "_"]
            saldo = [i for i in saldo_annotation[in_tok].strip(util.AFFIX).split(util.DELIM) if i]

            new_saldo = []
            if out_prob:
                for meaning in saldo:
                    if meaning in out_meanings:
                        i = out_meanings.index(meaning)
                        new_saldo.append((meaning, float(out_prob[i])))
                    else:
                        new_saldo.append((meaning, default_prob))
            else:
                new_saldo = [(meaning, default_prob) for meaning in saldo]

            # Sort by probability
            new_saldo.sort(key=lambda x: (-x[1], x[0]))
            # Format probability according to prob_format
            new_saldo = [saldo + prob_format % prob if prob_format else saldo for saldo, prob in new_saldo]
            out_annotation[in_tok] = util.cwbset(new_saldo)

    out.write(out_annotation)

def word_weights(doc: str = Document,
                 model: str = Model("[vw_topic_modelling.model]"),
                 word: str = Annotation("<token:word>"),
                 pos: str = Annotation("<token:pos>"),
                 out: str = Output("<token>:vw_topic_modelling:label_weights", description="Label weights per word")):
    """
    Report the weight for each label for each word.

    Both model and model.json must exist. See --train and --predict.
    """
    m_json = json.load(open(model + ".json"))
    index_to_label = m_json["index_to_label"]
    min_word_length = int(m_json["min_word_length"] or "0")
    banned_pos = (m_json["banned_pos"] or "").split()
    words = list(util.read_annotation(doc, word))
    poss = util.read_annotation(doc, pos) if pos else []
    data = (Example(None, vw_normalize(word))
            for n, word in enumerate(words)
            if len(word) >= min_word_length
            if not pos or poss[n] not in banned_pos)
    weights = defaultdict(list)

    with tempfile.NamedTemporaryFile() as tmp:
        args = ["--initial_regressor", model, "--invert_hash", tmp.name]
        for _ in vw_predict(args, data):
            pass
        for line in open(tmp.name, "r").readlines():
            # Lines in the inverted hash file look like "allmänna[1]:14342849:0.0139527",
            # i.e. word (with an optional label index in brackets), feature hash and weight.
            colons = line.strip().split(":")
            if len(colons) == 3:
                word, _hash, weight = colons
                if word[-1] == "]":
                    bracesplit = word.rsplit("[", 1)
                else:
                    bracesplit = []
                if len(bracesplit) == 2:
                    word, index = bracesplit
                    n = int(index[:-1]) + 1
                else:
                    n = 1
                weights[word].append(index_to_label[str(n)] + ":" + weight)

    ws = (util.cwbset(weights[vw_normalize(word)])
          for word in words
          if vw_normalize(word) in weights)
    util.write_annotation(doc, out, ws)

def predict(order, struct,  # required parameters (no defaults), so they must come before the defaulted ones
            doc: str = Document,
            model: str = Model("[vw_topic_modelling.model]"),
            modeljson: str = Model("[vw_topic_modelling.modeljson]"),
            parent: str = Annotation("{chunk}"),
            word: str = Annotation("<token:word>"),
            out: str = Output("{chunk}:vw_topic_modelling.prediction", description="Predicted attributes"),
            pos: str = Annotation("<token:pos>"),
            raw: bool = False):
    """Predict a structural attribute."""
    # Accept both a boolean and the string "true" (e.g. when passed from the command line)
    raw = raw is True or raw == "true"
    m_json = json.load(open(modeljson))
    data = (
        Example(None, text.words, text.span)
        for text in texts([(order, struct, parent, word, pos)],
                          map_label=lambda _: "?",
                          min_word_length=m_json["min_word_length"],
                          banned_pos=m_json["banned_pos"])
    )
    index_to_label = m_json["index_to_label"]
    args = ["--initial_regressor", model]
    if raw:
        predictions = (
            util.cwbset(index_to_label[str(s)] + ":" + str(v) for s, v in ss)
            for ss, _span in vw_predict(args, data, raw=True)
        )
    else:
        predictions = (
            index_to_label[str(s)]
            for s, _span in vw_predict(args, data)
        )
    util.write_annotation(doc, out, predictions)

def annotate_words(out: Output, model: Model, saldoids: Annotation, pos: Annotation, annotate, pos_limit: List[str],
                   class_set=None, disambiguate=True, connect_ids=False,
                   delimiter=util.DELIM, affix=util.AFFIX, scoresep=util.SCORESEP, lexicon=None):
    """
    Annotate words with blingbring classes (rogetID).

    - out: resulting annotation file.
    - model: pickled lexicon with saldoIDs as keys.
    - saldoids, pos: existing annotation with saldoIDs/parts of speech.
    - annotate: annotation function, returns an iterable containing annotations
        for one token ID. (annotate_bring() or annotate_swefn())
    - pos_limit: parts of speech that will be annotated. Set to None to annotate all pos.
    - class_set: output Bring classes or Roget IDs ("bring", "roget_head",
        "roget_subsection", "roget_section" or "roget_class").
        Set to None when not annotating blingbring.
    - disambiguate: use WSD and use only the most likely saldo ID.
    - connect_ids: for sweFN: paste saldo ID after each sweFN ID.
    - delimiter: delimiter character to put between ambiguous results.
    - affix: optional character to put before and after results to mark a set.
    - lexicon: this argument cannot be set from the command line, but is used in the catapult.
        This argument must be last.
    """
    if not lexicon:
        lexicon = util.PickledLexicon(model.path)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = saldoids.read()
    token_pos = list(pos.read())
    out_annotation = pos.create_empty_attribute()

    # Check if the saldo IDs are ranked (= word senses have been disambiguated)
    wsd = saldoids.split()[1].split(".")[0] == "wsd"

    for token_index, token_sense in enumerate(sense):

        # Check if the part of speech of this token is allowed
        if not pos_ok(token_pos, token_index, pos_limit):
            saldo_ids = None
            out_annotation[token_index] = affix
            continue

        if wsd and util.SCORESEP in token_sense:
            ranked_saldo = token_sense.strip(util.AFFIX).split(util.DELIM) \
                if token_sense != util.AFFIX else None
            saldo_tuples = [(i.split(util.SCORESEP)[0], i.split(util.SCORESEP)[1]) for i in ranked_saldo]

            if not disambiguate:
                saldo_ids = [i[0] for i in saldo_tuples]

            # Only take the most likely analysis into account.
            # Handle wsd with equal probability for several words
            else:
                saldo_ids = [saldo_tuples[0]]
                del saldo_tuples[0]
                while saldo_tuples and (saldo_tuples[0][1] == saldo_ids[0][1]):
                    saldo_ids = [saldo_tuples[0]]
                    del saldo_tuples[0]
                saldo_ids = [i[0] for i in saldo_ids]

        else:  # No WSD
            saldo_ids = token_sense.strip(util.AFFIX).split(util.DELIM) \
                if token_sense != util.AFFIX else None

        result = annotate(saldo_ids, lexicon, connect_ids, scoresep)
        out_annotation[token_index] = util.cwbset(result, delimiter, affix) if result else affix
    out.write(out_annotation)

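# Hedged illustration of the assumed sense format (in Sparv, util.AFFIX and util.DELIM are
# "|" and util.SCORESEP is ":"): a WSD-ranked token such as
#
#     "|kasta..1:0.84|kasta..2:0.16|"
#
# splits into [("kasta..1", "0.84"), ("kasta..2", "0.16")]; with disambiguate=True only the
# most likely saldo ID is passed on to the annotate function, otherwise all IDs are kept.
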
def _format_location(location_data):
    """Format location as city;country;latitude;longitude"""
    return util.cwbset(";".join((y[0], y[3], y[1], y[2])) for x, y in location_data)

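# Hypothetical example of the assumed input shape: each entry pairs a key with a
# (city, latitude, longitude, country) tuple of strings, so
#
#     _format_location([("tok1", ("Stockholm", "59.33", "18.06", "Sweden"))])
#
# would yield "|Stockholm;Sweden;59.33;18.06|".
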
def annotate_doc(out, in_token_annotation, text_children, saldoids=None, cutoff=10, types=False,
                 delimiter=util.DELIM, affix=util.AFFIX, freq_model=None, decimals=3):
    """
    Annotate documents with lexical classes.

    - out: resulting annotation file
    - in_token_annotation: existing annotation with lexical classes on token level.
    - text_children: existing annotation for text-IDs and their word children.
    - saldoids: existing annotation with saldoIDs, needed when types=True.
    - cutoff: value for limiting the resulting bring classes.
        The result will contain all words with the top x frequencies.
        Words with frequency = 1 will be removed from the result.
    - types: if True, count every class only once per saldo ID occurrence.
    - delimiter: delimiter character to put between ambiguous results.
    - affix: optional character to put before and after results to mark a set.
    - freq_model: pickled file with reference frequencies.
    - decimals: number of decimals to keep in output.
    """
    cutoff = int(cutoff)
    types = util.strtobool(types)
    text_children = util.read_annotation(text_children)
    classes = util.read_annotation(in_token_annotation)
    sense = util.read_annotation(saldoids) if types else None
    if freq_model:
        freq_model = util.PickledLexicon(freq_model)

    out_doc = {}

    for textid, words in text_children.items():
        seen_types = set()
        class_freqs = defaultdict(int)
        words = words.split()

        for tokid in words:
            # Count only sense types
            if types:
                senses = str(sorted([s.split(util.SCORESEP)[0] for s in
                                     sense[tokid].strip(util.AFFIX).split(util.DELIM)]))
                if senses in seen_types:
                    continue
                else:
                    seen_types.add(senses)

            rogwords = classes[tokid].strip(util.AFFIX).split(util.DELIM) if classes[tokid] != util.AFFIX else []
            for w in rogwords:
                class_freqs[w] += 1

        if freq_model:
            for c in class_freqs:
                # Relative frequency
                rel = class_freqs[c] / len(words)
                # Calculate class dominance
                ref_freq = freq_model.lookup(c.replace("_", " "), 0)
                if not ref_freq:
                    util.log.error("Class '%s' is missing" % c)
                class_freqs[c] = (rel / ref_freq)

        # Sort words according to frequency/dominance
        ordered_words = sorted(class_freqs.items(), key=lambda x: x[1], reverse=True)
        if freq_model:
            # Remove words with dominance < 1
            ordered_words = [w for w in ordered_words if w[1] >= 1]
        else:
            # Remove words with frequency 1
            ordered_words = [w for w in ordered_words if w[1] > 1]

        if len(ordered_words) > cutoff:
            cutoff_freq = ordered_words[cutoff - 1][1]
            ordered_words = [w for w in ordered_words if w[1] >= cutoff_freq]

        # Join words and frequencies/dominances
        ordered_words = [util.SCORESEP.join([word, str(round(freq, decimals))]) for word, freq in ordered_words]
        out_doc[textid] = util.cwbset(ordered_words, delimiter, affix) if ordered_words else affix

    util.write_annotation(out, out_doc)

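# Worked example of the dominance calculation above (numbers are illustrative only): a class
# seen 5 times in a 100-word text has relative frequency 5 / 100 = 0.05; with a reference
# frequency of 0.02 its dominance is 0.05 / 0.02 = 2.5, so it survives the "dominance >= 1"
# filter, whereas the same count against a reference frequency of 0.1 gives dominance 0.5
# and is dropped.
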
def annotate_words(out, model, saldoids, pos, annotate, pos_limit, class_set=None, disambiguate=True,
                   connect_ids=False, delimiter=util.DELIM, affix=util.AFFIX, scoresep=util.SCORESEP, lexicon=None):
    """
    Annotate words with blingbring classes (rogetID).

    - out: resulting annotation file.
    - model: pickled lexicon with saldoIDs as keys.
    - saldoids, pos: existing annotation with saldoIDs/parts of speech.
    - annotate: annotation function, returns an iterable containing annotations
        for one token ID. (annotate_bb() or annotate_swefn())
    - pos_limit: parts of speech that will be annotated. Set to None to annotate all pos.
    - class_set: output Bring classes or Roget IDs ("bring", "roget_head",
        "roget_subsection", "roget_section" or "roget_class").
        Set to None when not annotating blingbring.
    - disambiguate: use WSD and use only the most likely saldo ID.
    - connect_ids: for sweFN: paste saldo ID after each sweFN ID.
    - delimiter: delimiter character to put between ambiguous results.
    - affix: optional character to put before and after results to mark a set.
    - lexicon: this argument cannot be set from the command line, but is used in the catapult.
        This argument must be last.
    """
    if not lexicon:
        lexicon = util.PickledLexicon(model)
    # Otherwise use pre-loaded lexicon (from catapult)

    if pos_limit.lower() == "none":
        pos_limit = None

    result_dict = {}
    sense = util.read_annotation(saldoids)
    token_pos = util.read_annotation(pos)

    for tokid in sense:

        # Check if the part of speech of this token is allowed
        if not pos_ok(token_pos, tokid, pos_limit):
            saldo_ids = None
            result_dict[tokid] = affix
            continue

        if util.SCORESEP in sense[tokid]:  # WSD
            ranked_saldo = sense[tokid].strip(util.AFFIX).split(util.DELIM) \
                if sense[tokid] != util.AFFIX else None
            saldo_tuples = [(i.split(util.SCORESEP)[0], i.split(util.SCORESEP)[1]) for i in ranked_saldo]

            if not disambiguate:
                saldo_ids = [i[0] for i in saldo_tuples]

            # Only take the most likely analysis into account.
            # Handle wsd with equal probability for several words
            else:
                saldo_ids = [saldo_tuples[0]]
                del saldo_tuples[0]
                while saldo_tuples and (saldo_tuples[0][1] == saldo_ids[0][1]):
                    saldo_ids = [saldo_tuples[0]]
                    del saldo_tuples[0]
                saldo_ids = [i[0] for i in saldo_ids]

        else:  # No WSD
            saldo_ids = sense[tokid].strip(util.AFFIX).split(util.DELIM) \
                if sense[tokid] != util.AFFIX else None

        result = annotate(saldo_ids, lexicon, connect_ids, scoresep)
        result_dict[tokid] = util.cwbset(result, delimiter, affix) if result else affix
    util.write_annotation(out, result_dict)

def annotate_text(out: Output, lexical_classes_token: Annotation, text: Annotation, token: Annotation,
                  saldoids, cutoff, types, delimiter, affix, freq_model, decimals):
    """
    Annotate text chunks with lexical classes.

    - out: resulting annotation file
    - lexical_classes_token: existing annotation with lexical classes on token level.
    - text, token: existing annotations for the text-IDs and the tokens.
    - saldoids: existing annotation with saldoIDs, needed when types=True.
    - cutoff: value for limiting the resulting bring classes.
        The result will contain all words with the top x frequencies.
        Words with frequency = 1 will be removed from the result.
    - types: if True, count every class only once per saldo ID occurrence.
    - delimiter: delimiter character to put between ambiguous results.
    - affix: optional character to put before and after results to mark a set.
    - freq_model: pickled file with reference frequencies.
    - decimals: number of decimals to keep in output.
    """
    cutoff = int(cutoff)
    text_children, _orphans = text.get_children(token, preserve_parent_annotation_order=True)
    classes = list(lexical_classes_token.read())
    sense = list(saldoids.read()) if types else None
    if freq_model:
        freq_model = util.PickledLexicon(freq_model.path)

    out_annotation = text.create_empty_attribute()

    for text_index, words in enumerate(text_children):
        seen_types = set()
        class_freqs = defaultdict(int)

        for token_index in words:
            # Count only sense types
            if types:
                senses = str(sorted([s.split(util.SCORESEP)[0] for s in
                                     sense[token_index].strip(util.AFFIX).split(util.DELIM)]))
                if senses in seen_types:
                    continue
                else:
                    seen_types.add(senses)

            rogwords = classes[token_index].strip(util.AFFIX).split(util.DELIM) \
                if classes[token_index] != util.AFFIX else []
            for w in rogwords:
                class_freqs[w] += 1

        if freq_model:
            for c in class_freqs:
                # Relative frequency
                rel = class_freqs[c] / len(words)
                # Calculate class dominance
                ref_freq = freq_model.lookup(c.replace("_", " "), 0)
                if not ref_freq:
                    log.error("Class '%s' is missing" % c)
                class_freqs[c] = (rel / ref_freq)

        # Sort words according to frequency/dominance
        ordered_words = sorted(class_freqs.items(), key=lambda x: x[1], reverse=True)
        if freq_model:
            # Remove words with dominance < 1
            ordered_words = [w for w in ordered_words if w[1] >= 1]
        else:
            # Remove words with frequency 1
            ordered_words = [w for w in ordered_words if w[1] > 1]

        if len(ordered_words) > cutoff:
            cutoff_freq = ordered_words[cutoff - 1][1]
            ordered_words = [w for w in ordered_words if w[1] >= cutoff_freq]

        # Join words and frequencies/dominances
        ordered_words = [util.SCORESEP.join([word, str(round(freq, decimals))]) for word, freq in ordered_words]
        out_annotation[text_index] = util.cwbset(ordered_words, delimiter, affix) if ordered_words else affix

    out.write(out_annotation)