def msdtag(out: Output = Output("<token>:hunpos.msd", cls="token:msd",
                                description="Part-of-speeches with morphological descriptions"),
           word: Annotation = Annotation("<token:word>"),
           sentence: Annotation = Annotation("<sentence>"),
           binary: Binary = Binary("[hunpos.binary]"),
           model: Model = Model("[hunpos.model]"),
           morphtable: Optional[Model] = Model("[hunpos.morphtable]"),
           patterns: Optional[Model] = Model("[hunpos.patterns]"),
           tag_mapping=None,
           encoding: str = util.UTF8):
    """POS/MSD tag using the Hunpos tagger."""
    if isinstance(tag_mapping, str) and tag_mapping:
        tag_mapping = util.tagsets.mappings[tag_mapping]
    elif tag_mapping is None or tag_mapping == "":
        tag_mapping = {}

    pattern_list = []
    if patterns:
        with open(patterns.path, encoding="utf-8") as pat:
            for line in pat:
                if line.strip() and not line.startswith("#"):
                    name, pattern, tags = line.strip().split("\t", 2)
                    pattern_list.append((name, re.compile("^%s$" % pattern), tags))

    def replace_word(w):
        """Replace word with alias if word matches a regex pattern."""
        for p in pattern_list:
            if re.match(p[1], w):
                return "[[%s]]" % p[0]
        return w

    sentences, _orphans = sentence.get_children(word)
    token_word = list(word.read())
    stdin = SENT_SEP.join(
        TOK_SEP.join(replace_word(token_word[token_index]) for token_index in sent)
        for sent in sentences)

    args = [model.path]
    if morphtable:
        args.extend(["-m", morphtable.path])

    stdout, _ = util.system.call_binary(binary, args, stdin, encoding=encoding)

    out_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_index, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            tag = tag_mapping.get(tag, tag)
            out_annotation[token_index] = tag
    out.write(out_annotation)

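# Minimal sketch of the patterns file parsed above (hypothetical example
# values): each non-comment line holds an alias name, a regular expression
# and an associated tag string, separated by tabs. Tokens matching the regex
# are sent to Hunpos as the alias "[[name]]".
#
#   ordinal<TAB>[0-9]+:e<TAB>NN
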
def build_korp_stats(out: ModelOutput = ModelOutput("saldo/stats.pickle"),
                     _saldom: Model = Model("saldo/saldom.xml")):
    """Download Korp's word frequency file and convert it to a model."""
    txt_file = Model("saldo/stats_all.txt")
    try:
        log.info("Downloading Korp stats file...")
        download_stats_file("https://svn.spraakdata.gu.se/sb-arkiv/pub/frekvens/stats_all.txt", txt_file.path)

        log.info("Building frequency model...")
        make_model(txt_file.path, out.path)
    finally:
        # Clean up
        txt_file.remove()

def diapivot_annotate(out: Output = Output("<token>:hist.diapivot",
                                           description="SALDO IDs corresponding to lemgrams"),
                      lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
                      model: Model = Model("hist/diapivot.pickle")):
    """Annotate each lemgram with its corresponding saldo_id according to model.

    Args:
        out (str, optional): Resulting annotation file.
            Defaults to Output("<token>:hist.diapivot", description="SALDO IDs corresponding to lemgrams").
        lemgram (str, optional): Existing lemgram annotation. Defaults to Annotation("<token>:saldo.lemgram").
        model (str, optional): Crosslink model. Defaults to Model("hist/diapivot.pickle").
    """
    lexicon = PivotLexicon(model)
    lemgram_annotation = list(lemgram.read())

    out_annotation = []

    for lemgrams in lemgram_annotation:
        saldo_ids = []
        for lemgram in lemgrams.split(util.DELIM):
            s_i = lexicon.get_exactMatch(lemgram)
            if s_i:
                saldo_ids += [s_i]
        out_annotation.append(util.AFFIX + util.DELIM.join(set(saldo_ids)) + util.AFFIX if saldo_ids else util.AFFIX)

    out.write(out_annotation)

def blingbring_words(out: Output = Output("<token>:lexical_classes.blingbring",
                                          description="Lexical classes for tokens from Blingbring"),
                     model: Model = Model("[lexical_classes.bb_word_model]"),
                     saldoids: Annotation = Annotation("<token:sense>"),
                     pos: Annotation = Annotation("<token:pos>"),
                     pos_limit: List[str] = ["NN", "VB", "JJ", "AB"],
                     class_set: str = "bring",
                     disambiguate: bool = True,
                     connect_ids: bool = False,
                     delimiter: str = util.DELIM,
                     affix: str = util.AFFIX,
                     scoresep: str = util.SCORESEP,
                     lexicon=None):
    """Blingbring specific wrapper for annotate_words. See annotate_words for more info."""
    # pos_limit="NN VB JJ AB" | None

    if class_set not in ["bring", "roget_head", "roget_subsection", "roget_section", "roget_class"]:
        log.warning("Class '%s' not available. Fallback to 'bring'.", class_set)
        class_set = "bring"

    # Blingbring annotation function
    def annotate_bring(saldo_ids, lexicon, connect_IDs=False, scoresep=util.SCORESEP):
        rogetid = set()
        if saldo_ids:
            for sid in saldo_ids:
                if connect_IDs:
                    rogetid = rogetid.union(set(i + scoresep + sid for i in lexicon.lookup(sid, default=set())))
                else:
                    rogetid = rogetid.union(lexicon.lookup(sid, default=dict()).get(class_set, set()))
        return sorted(rogetid)

    annotate_words(out, model, saldoids, pos, annotate_bring, pos_limit=pos_limit, disambiguate=disambiguate,
                   class_set=class_set, connect_ids=connect_ids, delimiter=delimiter, affix=affix,
                   scoresep=scoresep, lexicon=lexicon)

def swefn_words(out: Output = Output("<token>:lexical_classes.swefn",
                                     description="Lexical classes for tokens from SweFN"),
                model: Model = Model("[lexical_classes.swefn_word_model]"),
                saldoids: Annotation = Annotation("<token:sense>"),
                pos: Annotation = Annotation("<token:pos>"),
                pos_limit: List[str] = ["NN", "VB", "JJ", "AB"],
                disambiguate: bool = True,
                connect_ids: bool = False,
                delimiter: str = util.DELIM,
                affix: str = util.AFFIX,
                scoresep: str = util.SCORESEP,
                lexicon=None):
    """Swefn specific wrapper for annotate_words. See annotate_words for more info."""
    # SweFN annotation function
    def annotate_swefn(saldo_ids, lexicon, connect_IDs=False, scoresep=util.SCORESEP):
        swefnid = set()
        if saldo_ids:
            for sid in saldo_ids:
                if connect_IDs:
                    swefnid = swefnid.union(set(i + scoresep + sid for i in lexicon.lookup(sid, default=set())))
                else:
                    swefnid = swefnid.union(lexicon.lookup(sid, default=set()))
        return sorted(swefnid)

    annotate_words(out, model, saldoids, pos, annotate_swefn, pos_limit=pos_limit, disambiguate=disambiguate,
                   connect_ids=connect_ids, delimiter=delimiter, affix=affix, scoresep=scoresep, lexicon=lexicon)

def stanza_lem_model(model: ModelOutput = ModelOutput("stanza/lem/sv_suc_lemmatizer.pt")):
    """Download and unzip the Stanza lemmatization model."""
    zip_model = Model("stanza/lem/synt_stanza_full.zip")
    zip_model.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/lem_stanza.zip")
    zip_model.unzip()
    zip_model.remove()

def build_model(out: ModelOutput = ModelOutput("sensaldo/sensaldo.pickle")):
    """Download and build SenSALDO model."""
    # Download and extract sensaldo-base-v02.txt
    zip_model = Model("sensaldo/sensaldo-v02.zip")
    zip_model.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lexikon/sensaldo/sensaldo-v02.zip")
    zip_model.unzip()
    tsv_model = Model("sensaldo/sensaldo-base-v02.txt")

    # Read sensaldo tsv dictionary and save as a pickle file
    lexicon = read_sensaldo(tsv_model)
    out.write_pickle(lexicon)

    # Clean up
    zip_model.remove()
    tsv_model.remove()
    Model("sensaldo/sensaldo-fullform-v02.txt").remove()

def tokenize(text: Text = Text(),
             out: Output = Output("segment.token", cls="token", description="Token segments"),
             chunk: Annotation = Annotation("[segment.token_chunk]"),
             segmenter: str = Config("segment.token_segmenter"),
             existing_segments: Optional[str] = Config("segment.existing_tokens"),
             model: Optional[Model] = Model("[segment.tokenizer_config]"),
             token_list: Optional[Model] = Model("[segment.token_list]")):
    """Tokenize text."""
    do_segmentation(text=text, out=out, chunk=chunk, segmenter=segmenter, existing_segments=existing_segments,
                    model=model, token_list=token_list)

def build_nst_comp(out: ModelOutput = ModelOutput("saldo/nst_comp_pos.pickle"),
                   nst_lexicon: Model = Model("saldo/nst_utf8.txt")):
    """Download NST lexicon and convert it to a compound POS model.

    The NST lexicon can be retrieved from SVN with credentials:
    svn export https://svn.spraakdata.gu.se/sb-arkiv/lexikon/NST_svensk_leksikon/nst_utf8.txt saldo/nst_utf8.txt
    """
    log.info("Building compound POS probability model...")
    make_model(nst_lexicon, out)

def blingbring_model(out: ModelOutput = ModelOutput("lexical_classes/blingbring.pickle")):
    """Download and build Blingbring model."""
    # Download roget hierarchy
    classmap = Model("lexical_classes/roget_hierarchy.xml")
    classmap.download("https://github.com/spraakbanken/sparv-models/raw/master/lexical_classes/roget_hierarchy.xml")

    # Download blingbring.txt and build blingbring.pickle
    raw_file = Model("lexical_classes/blingbring.txt")
    raw_file.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lexikon/bring/blingbring.txt")
    lexicon = read_blingbring(raw_file.path, classmap.path)
    out.write_pickle(lexicon)

    # Clean up
    raw_file.remove()
    classmap.remove()

def stanza_dep_model(model: ModelOutput = ModelOutput("stanza/dep/sv_talbanken_parser.pt"),
                     pretrain: ModelOutput = ModelOutput("stanza/dep/sv_talbanken.pretrain.pt")):
    """Download and unzip the Stanza dependency model."""
    zip_model = Model("stanza/dep/synt_stanza_full.zip")
    zip_model.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/synt_stanza_full.zip")
    zip_model.unzip()
    zip_model.remove()

def stanza_pos_model(model: ModelOutput = ModelOutput("stanza/pos/full_sv_talbanken_tagger.pt"),
                     pretrain: ModelOutput = ModelOutput("stanza/pos/full_sv_talbanken.pretrain.pt")):
    """Download and unzip the Stanza POS-tagging model."""
    zip_model = Model("stanza/pos/synt_stanza_full.zip")
    zip_model.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/morph_stanza_full.zip")
    zip_model.unzip()
    zip_model.remove()

def build_tokenlist(saldo_model: Model = Model("saldo/saldo.pickle"),
                    out: ModelOutput = ModelOutput("segment/bettertokenizer.sv.saldo-tokens"),
                    segmenter: str = Config("segment.token_wordlist_segmenter"),
                    model: Model = Model("segment/bettertokenizer.sv")):
    """Build a list of words from a SALDO model, to help BetterWordTokenizer."""
    segmenter_args = []
    if model:
        # Note: Path.suffix includes the leading dot.
        if model.path.suffix in [".pickle", ".pkl"]:
            with open(model.path, "rb") as m:
                model_arg = pickle.load(m)
        else:
            model_arg = model.path
        segmenter_args.append(model_arg)
    assert segmenter in SEGMENTERS, "Available segmenters: %s" % ", ".join(sorted(SEGMENTERS))
    segmenter = SEGMENTERS[segmenter]
    segmenter = segmenter(*segmenter_args)
    assert hasattr(segmenter, "span_tokenize"), "Segmenter needs a 'span_tokenize' method: %r" % segmenter

    wordforms = set()

    # Skip strings already handled by the tokenizer.
    # Also skip words ending in comma (used by some multi word expressions in SALDO).
    with open(saldo_model.path, "rb") as F:
        lexicon = pickle.load(F)
    for w in lexicon:
        w2 = list(map(split_triple, lexicon[w]))
        mwu_extras = [contw for w3 in w2 for cont in w3[2] for contw in cont if contw not in lexicon]
        for wf in mwu_extras + [w]:
            spans = list(segmenter.span_tokenize(wf))
            if len(spans) > 1 and not wf.endswith(","):
                wordforms.add(wf)

    out.write("\n".join(sorted(wordforms)))

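# The resulting token list is a plain text file with one wordform per line,
# holding only those SALDO wordforms (and multiword-expression components)
# that the chosen segmenter would otherwise split into several tokens.
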
def hist_morphtable(out: ModelOutput = ModelOutput("hunpos/hist/dalinm-swedberg_saldo_suc-tags.morphtable"),
                    swedberg: Model = Model("hunpos/hist/swedberg-gender.hunpos"),
                    dalin: Model = Model("hunpos/hist/dalinm.hunpos"),
                    saldosuc_morphtable: Model = Model("hunpos/saldo_suc-tags.morphtable")):
    """Read files and make a morphtable together with the information from SALDO (saldosuc_morphtable).

    Args:
        out (str, optional): Resulting morphtable file to be written.
            Defaults to ModelOutput("hunpos/hist/dalinm-swedberg_saldo_suc-tags.morphtable").
        swedberg (str, optional): Wordlist from Swedberg and corresponding SALDO MSD-tags.
            Defaults to Model("hunpos/hist/swedberg-gender.hunpos").
        dalin (str, optional): Wordlist from Dalin and corresponding SALDO MSD-tags.
            Defaults to Model("hunpos/hist/dalinm.hunpos").
        saldosuc_morphtable (str, optional): SALDO Hunpos morphtable.
            Defaults to Model("hunpos/saldo_suc-tags.morphtable").
    """
    words = {}
    _read_saldosuc(words, saldosuc_morphtable.path)

    for fil in [dalin, swedberg]:
        for line in open(fil.path, encoding="utf-8").readlines():
            if not line.strip():
                continue
            xs = line.split("\t")
            word, msd = xs[0].strip(), xs[1].strip()

            if " " in word:
                if msd.startswith("nn"):  # We assume that the head of a noun mwe is the last word
                    word = word.split()[-1]
                if msd.startswith("vb"):  # We assume that the head of a verbal mwe is the first word
                    word = word.split()[0]

            # If the tag is not present, we try to translate it anyway
            suc = SALDO_TO_SUC.get(msd, "")
            if not suc:
                suc = _force_parse(msd)
            if suc:
                words.setdefault(word.lower(), set()).update(suc)
                words.setdefault(word.title(), set()).update(suc)

    with open(out.path, encoding="UTF-8", mode="w") as out:
        for w, ts in list(words.items()):
            line = ("\t".join([w] + list(ts)) + "\n")
            out.write(line)

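# Each line of the morphtable written above is a word form followed by all of
# its possible SUC tags, tab-separated (schematic form):
#
#   <wordform><TAB><suc_tag_1><TAB><suc_tag_2>...
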
def swefn_model(out: ModelOutput = ModelOutput("lexical_classes/swefn.pickle")):
    """Download and build SweFN model."""
    # Download swefn.xml and build swefn.pickle
    raw_file = Model("lexical_classes/swefn.xml")
    raw_file.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/swefn/swefn.xml")
    lexicon = read_swefn(raw_file.path)
    out.write_pickle(lexicon)

    # Clean up
    raw_file.remove()

def build_dalin(out: ModelOutput = ModelOutput("hist/dalin.pickle")):
    """Download Dalin morphology XML and save as a pickle file."""
    # Download dalinm.xml
    xml_model = Model("hist/dalinm.xml")
    xml_model.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/dalinm/dalinm.xml")

    # Create pickle file
    lmf_to_pickle(xml_model.path, out.path)

    # Clean up
    xml_model.remove()

def build_swedberg(out: ModelOutput = ModelOutput("hist/swedberg.pickle")):
    """Download Swedberg morphology XML and save as a pickle file."""
    # Download swedbergm.xml
    xml_model = Model("hist/swedbergm.xml")
    xml_model.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/swedbergm/swedbergm.xml")

    # Create pickle file
    lmf_to_pickle(xml_model.path, out.path)

    # Clean up
    xml_model.remove()

def predict(doc: str = Document,
            model: str = Model("[vw_topic_modelling.model]"),
            modeljson: str = Model("[vw_topic_modelling.modeljson]"),
            order=None,
            struct=None,
            parent: str = Annotation("{chunk}"),
            word: str = Annotation("<token:word>"),
            out: str = Output("{chunk}:vw_topic_modelling.prediction", description="Predicted attributes"),
            pos: str = Annotation("<token:pos>"),
            raw: bool = False):
    """Predict a structural attribute."""
    raw = raw == "true"

    m_json = json.load(open(modeljson))

    data = (
        Example(None, text.words, text.span)
        for text in texts([(order, struct, parent, word, pos)],
                          map_label=lambda _: "?",
                          min_word_length=m_json["min_word_length"],
                          banned_pos=m_json["banned_pos"])
    )

    index_to_label = m_json["index_to_label"]

    args = ["--initial_regressor", model]

    if raw:
        predictions = (
            util.cwbset(index_to_label[str(s)] + ":" + str(v) for s, v in ss)
            for ss, _span in vw_predict(args, data, raw=True)
        )
    else:
        predictions = (
            index_to_label[str(s)]
            for s, _span in vw_predict(args, data)
        )

    util.write_annotation(doc, out, predictions)

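# Output sketch: with raw=True each chunk gets a cwb-set of "label:score"
# pairs (one per label returned by vw_predict); otherwise only the single
# predicted label per chunk is written to the prediction annotation.
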
def annotate(lang: Language = Language(),
             model: Model = Model("[treetagger.model]"),
             tt_binary: Binary = Binary("[treetagger.binary]"),
             out_upos: Output = Output("<token>:treetagger.upos", cls="token:upos",
                                       description="Part-of-speeches in UD"),
             out_pos: Output = Output("<token>:treetagger.pos", cls="token:pos",
                                      description="Part-of-speeches from TreeTagger"),
             out_baseform: Output = Output("<token>:treetagger.baseform", description="Baseforms from TreeTagger"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             encoding: str = util.UTF8):
    """POS/MSD tag and lemmatize using TreeTagger."""
    sentences, _orphans = sentence.get_children(word)
    word_annotation = list(word.read())
    stdin = SENT_SEP.join(TOK_SEP.join(word_annotation[token_index] for token_index in sent)
                          for sent in sentences)
    args = ["-token", "-lemma", "-no-unknown", "-eos-tag", "<eos>", model.path]

    stdout, stderr = util.system.call_binary(tt_binary, args, stdin, encoding=encoding)
    log.debug("Message from TreeTagger:\n%s", stderr)

    # Write pos and upos annotations.
    out_upos_annotation = word.create_empty_attribute()
    out_pos_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            out_pos_annotation[token_id] = tag
            out_upos_annotation[token_id] = util.tagsets.pos_to_upos(tag, lang, TAG_SETS.get(lang))
    out_pos.write(out_pos_annotation)
    out_upos.write(out_upos_annotation)

    # Write lemma annotations.
    out_lemma_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            lem = tagged_token.strip().split(TAG_SEP)[LEM_COLUMN]
            out_lemma_annotation[token_id] = lem
    out_baseform.write(out_lemma_annotation)

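# Sketch of the TreeTagger output consumed above (assuming TAG_SEP is a tab
# and TAG_COLUMN/LEM_COLUMN pick out the tag and lemma fields): with the
# "-token -lemma" flags each token line has roughly the form
#
#   <word><TAB><tag><TAB><lemma>
#
# and sentences are separated via the "<eos>" end-of-sentence tag.
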
def annotate(sense: Annotation = Annotation("<token>:saldo.sense"),
             out_scores: Output = Output("<token>:sensaldo.sentiment_score", description="SenSALDO sentiment score"),
             out_labels: Output = Output("<token>:sensaldo.sentiment_label", description="SenSALDO sentiment label"),
             model: Model = Model("[sensaldo.model]"),
             lexicon=None):
    """Assign sentiment values to tokens based on their sense annotation.

    When more than one sense is possible, calculate a weighted mean.
    - sense: existing annotation with saldoIDs.
    - out_scores, out_labels: resulting annotation file.
    - model: pickled lexicon with saldoIDs as keys.
    - lexicon: this argument cannot be set from the command line, but is used in the catapult.
      This argument must be last.
    """
    if not lexicon:
        lexicon = util.PickledLexicon(model.path)
    # Otherwise use pre-loaded lexicon (from catapult)

    sense = sense.read()
    result_scores = []
    result_labels = []

    for token in sense:
        # Get set of senses for each token and sort them according to their probabilities
        token_senses = [tuple(s.rsplit(util.SCORESEP, 1)) if util.SCORESEP in s else (s, -1.0)
                        for s in token.split(util.DELIM) if s]
        token_senses.sort(key=lambda x: float(x[1]), reverse=True)

        # Lookup the sentiment score for the most probable sense and assign a sentiment label
        if token_senses:
            best_sense = token_senses[0][0]
            score = lexicon.lookup(best_sense, None)
        else:
            score = None

        if score:
            result_scores.append(score)
            result_labels.append(SENTIMENT_LABLES.get(int(score)))
        else:
            result_scores.append(None)
            result_labels.append(None)

    out_scores.write(result_scores)
    out_labels.write(result_labels)

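# Illustrative sketch of the sense input handled above, assuming util.DELIM
# is "|" and util.SCORESEP is ":" (saldo IDs and probabilities are made up):
# a token annotated as "|gris..1:0.7|gris..2:0.3|" is split into
# [("gris..1", "0.7"), ("gris..2", "0.3")], and the sentiment score is looked
# up for the most probable sense, "gris..1".
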
def contextual(out: Output = Output("{chunk}:geo.geo_context", description="Geographical places with coordinates"),
               chunk: Annotation = Annotation("{chunk}"),
               context: Annotation = Annotation("[geo.context_chunk]"),
               ne_type: Annotation = Annotation("swener.ne:swener.type"),
               ne_subtype: Annotation = Annotation("swener.ne:swener.subtype"),
               ne_name: Annotation = Annotation("swener.ne:swener.name"),
               model: Model = Model("[geo.model]"),
               method: str = "populous",
               language: list = []):
    """Annotate chunks with location data, based on locations contained within the text.

    context = text chunk to use for disambiguating places (when applicable).
    chunk = text chunk to which the annotation will be added.
    """
    model = load_model(model, language=language)

    ne_type_annotation = list(ne_type.read())
    ne_subtype_annotation = list(ne_subtype.read())
    ne_name_annotation = list(ne_name.read())

    children_context_chunk, _orphans = context.get_children(chunk)
    children_chunk_ne, _orphans = chunk.get_children(ne_type)

    out_annotation = chunk.create_empty_attribute()

    for chunks in children_context_chunk:
        all_locations = []  # TODO: Maybe not needed for anything?
        context_locations = []
        chunk_locations = defaultdict(list)

        for ch in chunks:
            for n in children_chunk_ne[ch]:
                if ne_type_annotation[n] == "LOC" and "PPL" in ne_subtype_annotation[n]:
                    location_text = ne_name_annotation[n].replace("\n", " ").replace("  ", " ")
                    location_data = model.get(location_text.lower())
                    if location_data:
                        all_locations.append((location_text, list(location_data)))
                        context_locations.append((location_text, list(location_data)))
                        chunk_locations[ch].append((location_text, list(location_data)))
                    else:
                        pass
                        # log.info("No location found for %s" % ne_name_annotation[n].replace("%", "%%"))

        chunk_locations = most_populous(chunk_locations)

        for c in chunks:
            out_annotation[c] = _format_location(chunk_locations.get(c, ()))

    out.write(out_annotation)

def swefn_text(out: Output = Output("<text>:lexical_classes.swefn",
                                    description="Lexical classes for text chunks from SweFN"),
               lexical_classes_token: Annotation = Annotation("<token>:lexical_classes.swefn"),
               text: Annotation = Annotation("<text>"),
               token: Annotation = Annotation("<token>"),
               saldoids: Optional[Annotation] = Annotation("<token:sense>"),
               cutoff: int = 3,
               types: bool = False,
               delimiter: str = util.DELIM,
               affix: str = util.AFFIX,
               freq_model: Model = Model("[lexical_classes.swefn_freq_model]"),
               decimals: int = 3):
    """Annotate text chunks with SweFN classes."""
    annotate_text(out=out, lexical_classes_token=lexical_classes_token, text=text, token=token,
                  saldoids=saldoids, cutoff=cutoff, types=types, delimiter=delimiter, affix=affix,
                  freq_model=freq_model, decimals=decimals)

def annotate(corpus_text: Text = Text(),
             lang: Language = Language(),
             conf_file: Model = Model("[freeling.conf]"),
             fl_binary: Binary = Binary("[freeling.binary]"),
             sentence_chunk: Optional[Annotation] = Annotation("[freeling.sentence_chunk]"),
             out_token: Output = Output("freeling.token", cls="token", description="Token segments"),
             out_word: Output = Output("<token>:freeling.word", cls="token:word", description="Token strings"),
             out_baseform: Output = Output("<token>:freeling.baseform", description="Baseforms from FreeLing"),
             out_upos: Output = Output("<token>:freeling.upos", cls="token:upos",
                                       description="Part-of-speeches in UD"),
             out_pos: Output = Output("<token>:freeling.pos", cls="token:pos",
                                      description="Part-of-speeches from FreeLing"),
             out_sentence: Optional[Output] = Output("freeling.sentence", cls="sentence",
                                                     description="Sentence segments"),
             sentence_annotation: Optional[Annotation] = Annotation("[freeling.sentence_annotation]")):
    """Run FreeLing and output sentences, tokens, baseforms, upos and pos."""
    main(corpus_text, lang, conf_file, fl_binary, sentence_chunk, out_token, out_word, out_baseform, out_upos,
         out_pos, out_sentence, sentence_annotation)

def sentence(text: Text = Text(),
             out: Output = Output("segment.sentence", cls="sentence", description="Sentence segments"),
             chunk: Optional[Annotation] = Annotation("[segment.sentence_chunk]"),
             segmenter: str = Config("segment.sentence_segmenter"),
             existing_segments: Optional[str] = Config("segment.existing_sentences"),
             model: Optional[Model] = Model("[segment.sentence_model]")):
    """Split text into sentences."""
    do_segmentation(text=text, out=out, chunk=chunk, segmenter=segmenter, existing_segments=existing_segments,
                    model=model)

def word_weights(doc: str = Document,
                 model: str = Model("[vw_topic_modelling.model]"),
                 word: str = Annotation("<token:word>"),
                 pos: str = Annotation("<token:pos>"),
                 out: str = Output("<token>:vw_topic_modelling:label_weights", description="Label weights per word")):
    """Report the weight for each label for each word.

    Both model and model.json must exist. See --train and --predict.
    """
    m_json = json.load(open(model + ".json"))
    index_to_label = m_json["index_to_label"]
    min_word_length = int(m_json["min_word_length"] or "0")
    banned_pos = (m_json["banned_pos"] or "").split()

    words = list(util.read_annotation(doc, word))
    poss = util.read_annotation(doc, pos) if pos else []

    data = (Example(None, vw_normalize(word))
            for n, word in enumerate(words)
            if len(word) >= min_word_length
            if not pos or poss[n] not in banned_pos)

    weights = defaultdict(list)

    with tempfile.NamedTemporaryFile() as tmp:
        args = ["--initial_regressor", model, "--invert_hash", tmp.name]
        for _ in vw_predict(args, data):
            pass
        for line in open(tmp.name, "r").readlines():
            # allmänna[1]:14342849:0.0139527
            colons = line.split(":")
            if len(colons) == 3:
                word, _hash, weight = colons
                if word[-1] == "]":
                    bracesplit = word.rsplit("[", 1)
                else:
                    bracesplit = []
                if len(bracesplit) == 2:
                    word, index = bracesplit
                    n = int(index[:-1]) + 1
                else:
                    n = 1
                weights[word].append(index_to_label[str(n)] + ":" + weight)

    ws = (util.cwbset(weights[vw_normalize(word)])
          for word in words
          if vw_normalize(word) in weights)

    util.write_annotation(doc, out, ws)

def annotate_full(corpus_text: Text = Text(),
                  lang: Language = Language(),
                  conf_file: Model = Model("[freeling.conf]"),
                  fl_binary: Binary = Binary("[freeling.binary]"),
                  sentence_chunk: Annotation = Annotation("[freeling.sentence_chunk]"),
                  out_token: Output = Output("freeling.token", cls="token", description="Token segments"),
                  out_word: Output = Output("<token>:freeling.word", cls="token:word", description="Token strings"),
                  out_baseform: Output = Output("<token>:freeling.baseform", description="Baseforms from FreeLing"),
                  out_upos: Output = Output("<token>:freeling.upos", cls="token:upos",
                                            description="Part-of-speeches in UD"),
                  out_pos: Output = Output("<token>:freeling.pos", cls="token:pos",
                                           description="Part-of-speeches from FreeLing"),
                  out_ne_type: Output = Output("<token>:freeling.ne_type", cls="token:named_entity_type",
                                               description="Named entity types from FreeLing"),
                  out_sentence: Optional[Output] = Output("freeling.sentence", cls="sentence",
                                                          description="Sentence segments"),
                  sentence_annotation: Optional[Annotation] = Annotation("[freeling.sentence_annotation]")):
    """Run FreeLing and output the usual annotations plus named entity types."""
    main(corpus_text, lang, conf_file, fl_binary, sentence_chunk, out_token, out_word, out_baseform, out_upos,
         out_pos, out_sentence, sentence_annotation, out_ne_type)

def metadata(out: Output = Output("{chunk}:geo.geo_metadata", description="Geographical places with coordinates"),
             chunk: Annotation = Annotation("{chunk}"),
             source: Annotation = Annotation("[geo.metadata_source]"),
             model: Model = Model("[geo.model]"),
             method: str = "populous",
             language: list = []):
    """Get location data based on metadata containing location names."""
    geomodel = load_model(model, language=language)

    same_target_source = chunk.split()[0] == source.split()[0]
    chunk_annotation = list(chunk.read())
    source_annotation = list(source.read())

    # If location source and target chunk are not the same, we need
    # to find the parent/child relations between them.
    if not same_target_source:
        target_source_parents = list(source.get_parents(chunk))

    chunk_locations = {}

    for i, _ in enumerate(chunk_annotation):
        if same_target_source:
            location_source = source_annotation[i]
        else:
            location_source = source_annotation[target_source_parents[i]] \
                if target_source_parents[i] is not None else None

        if location_source:
            location_data = geomodel.get(location_source.strip().lower())
            if location_data:
                chunk_locations[i] = [(location_source, list(location_data))]
        else:
            chunk_locations[i] = []

    chunk_locations = most_populous(chunk_locations)

    out_annotation = chunk.create_empty_attribute()
    for c in chunk_locations:
        out_annotation[c] = _format_location(chunk_locations.get(c, ()))

    out.write(out_annotation)

def build_diapivot(out: ModelOutput = ModelOutput("hist/diapivot.pickle")):
    """Download diapivot XML dictionary and save as a pickle file."""
    # Download diapivot.xml
    xml_model = Model("hist/diapivot.xml")
    xml_model.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/diapivot/diapivot.xml")

    # Create pickle file
    xml_lexicon = read_xml(xml_model.path)
    log.info("Saving cross lexicon in Pickle format")
    picklex = {}
    for lem in xml_lexicon:
        lemgrams = []
        for saldo, match in list(xml_lexicon[lem].items()):
            lemgrams.append(PART_DELIM1.join([saldo, match]))
        picklex[lem] = sorted(lemgrams)
    out.write_pickle(picklex)

    # Clean up
    xml_model.remove()

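# Rough shape of the pickled lexicon built above: each lemgram key maps to a
# sorted list of strings of the form "<saldo_id><PART_DELIM1><match_type>"
# (the IDs and match types come from the diapivot XML; shown schematically).
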
def build_model(out: ModelOutput = ModelOutput("geo/geo.pickle")):
    """Download and build geo model."""
    # Download and extract cities1000.txt
    cities_zip = Model("geo/cities1000.zip")
    cities_zip.download("http://download.geonames.org/export/dump/cities1000.zip")
    cities_zip.unzip()

    # Download and extract alternateNames.txt
    names_zip = Model("geo/alternateNames.zip")
    names_zip.download("http://download.geonames.org/export/dump/alternateNames.zip")
    names_zip.unzip()

    pickle_model(Model("geo/cities1000.txt"), Model("geo/alternateNames.txt"), out)

    # Clean up
    cities_zip.remove()
    names_zip.remove()
    Model("geo/iso-languagecodes.txt").remove()
    Model("geo/cities1000.txt").remove()
    Model("geo/alternateNames.txt").remove()

def build_saldo(out: ModelOutput = ModelOutput("saldo/saldo.pickle"),
                saldom: Model = Model("saldo/saldom.xml")):
    """Save SALDO morphology as a pickle file."""
    lmf_to_pickle(saldom.path, out.path)