def build_model(out: ModelOutput = ModelOutput("malt/swemalt-1.7.2.mco"), _maltjar: Binary = Binary("[malt.jar]")): """Download model for MALT Parser. Won't download model unless maltjar has been installed. """ out.download("http://maltparser.org/mco/swedish_parser/swemalt-1.7.2.mco")
def stanza_pos_model(model: ModelOutput = ModelOutput("stanza/pos/full_sv_talbanken_tagger.pt"),
                     pretrain: ModelOutput = ModelOutput("stanza/pos/full_sv_talbanken.pretrain.pt")):
    """Download and unzip the Stanza POS-tagging model."""
    zip_model = Model("stanza/pos/synt_stanza_full.zip")
    zip_model.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/morph_stanza_full.zip")
    zip_model.unzip()
    zip_model.remove()
def stanza_dep_model(model: ModelOutput = ModelOutput("stanza/dep/sv_talbanken_parser.pt"),
                     pretrain: ModelOutput = ModelOutput("stanza/dep/sv_talbanken.pretrain.pt")):
    """Download and unzip the Stanza dependency model."""
    zip_model = Model("stanza/dep/synt_stanza_full.zip")
    zip_model.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/synt_stanza_full.zip")
    zip_model.unzip()
    zip_model.remove()
def swefn_model(out: ModelOutput = ModelOutput("lexical_classes/swefn.pickle")):
    """Download and build SweFN model."""
    # Download swefn.xml and build swefn.pickle
    raw_file = Model("lexical_classes/swefn.xml")
    raw_file.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/swefn/swefn.xml")
    lexicon = read_swefn(raw_file.path)
    out.write_pickle(lexicon)

    # Clean up
    raw_file.remove()
def stanza_lem_model(model: ModelOutput = ModelOutput("stanza/lem/sv_suc_lemmatizer.pt")):
    """Download and unzip the Stanza lemmatization model."""
    zip_model = Model("stanza/lem/synt_stanza_full.zip")
    zip_model.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/lem_stanza.zip")
    zip_model.unzip()
    zip_model.remove()
def build_model(out: ModelOutput = ModelOutput("sensaldo/sensaldo.pickle")): """Download and build SenSALDO model.""" # Download and extract sensaldo-base-v02.txt zip_model = Model("sensaldo/sensaldo-v02.zip") zip_model.download( "https://svn.spraakdata.gu.se/sb-arkiv/pub/lexikon/sensaldo/sensaldo-v02.zip" ) zip_model.unzip() tsv_model = Model("sensaldo/sensaldo-base-v02.txt") # Read sensaldo tsv dictionary and save as a pickle file lexicon = read_sensaldo(tsv_model) out.write_pickle(lexicon) # Clean up zip_model.remove() tsv_model.remove() Model("sensaldo/sensaldo-fullform-v02.txt").remove()
def build_nst_comp(out: ModelOutput = ModelOutput("saldo/nst_comp_pos.pickle"),
                   nst_lexicon: Model = Model("saldo/nst_utf8.txt")):
    """Convert the NST lexicon to a compound POS model.

    The NST lexicon can be retrieved from SVN with credentials:
    svn export https://svn.spraakdata.gu.se/sb-arkiv/lexikon/NST_svensk_leksikon/nst_utf8.txt saldo/nst_utf8.txt
    """
    log.info("Building compound POS probability model...")
    make_model(nst_lexicon, out)
def stanza_resources_file(resources_file: ModelOutput = ModelOutput("stanza/resources.json")):
    """Write a Stanza resources.json file to keep Stanza from complaining."""
    res = json.dumps({
        "sv": {
            "lang_name": "Swedish",
            "tokenize": {
                "orchid": {},
                "best": {}
            },
            "default_processors": {
                "tokenize": "orchid"
            },
            "default_dependencies": {},
        }
    })
    resources_file.write(res)
def morphtable_inputs(suc: ModelOutput = ModelOutput("hunpos/suc3_morphtable.words"),
                      morphtable_base: ModelOutput = ModelOutput("hunpos/suc.morphtable"),
                      morphtable_patterns: ModelOutput = ModelOutput("hunpos/suc.patterns")):
    """Download the files needed to build the SALDO morphtable."""
    suc.download("https://github.com/spraakbanken/sparv-models/raw/master/hunpos/suc3_morphtable.words")
    morphtable_base.download("https://github.com/spraakbanken/sparv-models/raw/master/hunpos/suc.morphtable")
    morphtable_patterns.download("https://github.com/spraakbanken/sparv-models/raw/master/hunpos/suc.patterns")
def blingbring_model(out: ModelOutput = ModelOutput("lexical_classes/blingbring.pickle")):
    """Download and build Blingbring model."""
    # Download roget hierarchy
    classmap = Model("lexical_classes/roget_hierarchy.xml")
    classmap.download("https://github.com/spraakbanken/sparv-models/raw/master/lexical_classes/roget_hierarchy.xml")

    # Download blingbring.txt and build blingbring.pickle
    raw_file = Model("lexical_classes/blingbring.txt")
    raw_file.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lexikon/bring/blingbring.txt")
    lexicon = read_blingbring(raw_file.path, classmap.path)
    out.write_pickle(lexicon)

    # Clean up
    raw_file.remove()
    classmap.remove()
def hist_morphtable(out: ModelOutput = ModelOutput("hunpos/hist/dalinm-swedberg_saldo_suc-tags.morphtable"),
                    swedberg: Model = Model("hunpos/hist/swedberg-gender.hunpos"),
                    dalin: Model = Model("hunpos/hist/dalinm.hunpos"),
                    saldosuc_morphtable: Model = Model("hunpos/saldo_suc-tags.morphtable")):
    """Read files and make a morphtable together with the information from SALDO (saldosuc_morphtable).

    Args:
        out (ModelOutput, optional): Resulting morphtable file to be written.
            Defaults to ModelOutput("hunpos/hist/dalinm-swedberg_saldo_suc-tags.morphtable").
        swedberg (Model, optional): Wordlist from Swedberg and corresponding SALDO MSD-tags.
            Defaults to Model("hunpos/hist/swedberg-gender.hunpos").
        dalin (Model, optional): Wordlist from Dalin and corresponding SALDO MSD-tags.
            Defaults to Model("hunpos/hist/dalinm.hunpos").
        saldosuc_morphtable (Model, optional): SALDO Hunpos morphtable.
            Defaults to Model("hunpos/saldo_suc-tags.morphtable").
    """
    words = {}
    _read_saldosuc(words, saldosuc_morphtable.path)

    for fil in [dalin, swedberg]:
        with open(fil.path, encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                xs = line.split("\t")
                word, msd = xs[0].strip(), xs[1].strip()

                if " " in word:
                    if msd.startswith("nn"):
                        # We assume that the head of a noun mwe is the last word
                        word = word.split()[-1]
                    if msd.startswith("vb"):
                        # We assume that the head of a verbal mwe is the first word
                        word = word.split()[0]

                # If the tag is not present, we try to translate it anyway
                suc = SALDO_TO_SUC.get(msd, "")
                if not suc:
                    suc = _force_parse(msd)
                if suc:
                    words.setdefault(word.lower(), set()).update(suc)
                    words.setdefault(word.title(), set()).update(suc)

    with open(out.path, encoding="UTF-8", mode="w") as outfile:
        for w, ts in list(words.items()):
            line = "\t".join([w] + list(ts)) + "\n"
            outfile.write(line)
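# Illustrative sketch (not part of the pipeline): reading back a morphtable of the
# format written by hist_morphtable() above, i.e. one "word TAB tag1 TAB tag2 ..."
# entry per line. The function name is hypothetical.
def read_morphtable_sketch(path):
    """Read a Hunpos morphtable into a dict mapping each word to a set of tags."""
    table = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            word, *tags = line.rstrip("\n").split("\t")
            table[word] = set(tags)
    return table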
def build_tokenlist(saldo_model: Model = Model("saldo/saldo.pickle"),
                    out: ModelOutput = ModelOutput("segment/bettertokenizer.sv.saldo-tokens"),
                    segmenter: str = Config("segment.token_wordlist_segmenter"),
                    model: Model = Model("segment/bettertokenizer.sv")):
    """Build a list of words from a SALDO model, to help BetterWordTokenizer."""
    segmenter_args = []
    if model:
        if model.path.suffix in [".pickle", ".pkl"]:
            with open(model.path, "rb") as m:
                model_arg = pickle.load(m)
        else:
            model_arg = model.path
        segmenter_args.append(model_arg)
    assert segmenter in SEGMENTERS, "Available segmenters: %s" % ", ".join(sorted(SEGMENTERS))
    segmenter = SEGMENTERS[segmenter]
    segmenter = segmenter(*segmenter_args)
    assert hasattr(segmenter, "span_tokenize"), "Segmenter needs a 'span_tokenize' method: %r" % segmenter

    wordforms = set()

    # Skip strings already handled by the tokenizer.
    # Also skip words ending in comma (used by some multi word expressions in SALDO).
    with open(saldo_model.path, "rb") as F:
        lexicon = pickle.load(F)
        for w in lexicon:
            w2 = list(map(split_triple, lexicon[w]))
            mwu_extras = [contw for w3 in w2 for cont in w3[2] for contw in cont
                          if contw not in lexicon]
            for wf in mwu_extras + [w]:
                spans = list(segmenter.span_tokenize(wf))
                if len(spans) > 1 and not wf.endswith(","):
                    wordforms.add(wf)

    out.write("\n".join(sorted(wordforms)))
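# Illustrative sketch (not part of the pipeline): the token list written by
# build_tokenlist() above is a plain text file with one wordform per line, so a
# consumer could load it as shown here. The function name is hypothetical.
def load_tokenlist_sketch(path):
    """Load the saldo-tokens word list into a set of wordforms."""
    with open(path, encoding="utf-8") as f:
        return set(line.rstrip("\n") for line in f if line.strip())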
def build_dalin(out: ModelOutput = ModelOutput("hist/dalin.pickle")):
    """Download Dalin morphology XML and save as a pickle file."""
    # Download dalinm.xml
    xml_model = Model("hist/dalinm.xml")
    xml_model.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/dalinm/dalinm.xml")

    # Create pickle file
    lmf_to_pickle(xml_model.path, out.path)

    # Clean up
    xml_model.remove()
def build_diapivot(out: ModelOutput = ModelOutput("hist/diapivot.pickle")):
    """Download diapivot XML dictionary and save as a pickle file."""
    # Download diapivot.xml
    xml_model = Model("hist/diapivot.xml")
    xml_model.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/diapivot/diapivot.xml")

    # Create pickle file
    xml_lexicon = read_xml(xml_model.path)
    log.info("Saving cross lexicon in Pickle format")
    picklex = {}
    for lem in xml_lexicon:
        lemgrams = []
        for saldo, match in list(xml_lexicon[lem].items()):
            lemgrams.append(PART_DELIM1.join([saldo, match]))
        picklex[lem] = sorted(lemgrams)
    out.write_pickle(picklex)

    # Clean up
    xml_model.remove()
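# Illustrative sketch (not part of the pipeline): reading back the diapivot pickle
# written above. Each entry maps a lemgram to a sorted list of strings of the form
# "saldo" + PART_DELIM1 + "match", so the delimiter must be supplied by the caller.
# It assumes write_pickle() stores a plain pickle; the function name and its delim
# parameter are hypothetical.
def load_diapivot_sketch(path, delim):
    """Return a dict mapping each lemgram to a list of (saldo, match) pairs."""
    import pickle
    with open(path, "rb") as f:
        picklex = pickle.load(f)
    return {lem: [tuple(entry.split(delim)) for entry in entries]
            for lem, entries in picklex.items()}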
def build_swedberg(out: ModelOutput = ModelOutput("hist/swedberg.pickle")):
    """Download Swedberg morphology XML and save as a pickle file."""
    # Download swedbergm.xml
    xml_model = Model("hist/swedbergm.xml")
    xml_model.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/swedbergm/swedbergm.xml")

    # Create pickle file
    lmf_to_pickle(xml_model.path, out.path)

    # Clean up
    xml_model.remove()
def build_korp_stats(out: ModelOutput = ModelOutput("saldo/stats.pickle"),
                     _saldom: Model = Model("saldo/saldom.xml")):
    """Download Korp's word frequency file and convert it to a model."""
    txt_file = Model("saldo/stats_all.txt")
    try:
        log.info("Downloading Korp stats file...")
        download_stats_file("https://svn.spraakdata.gu.se/sb-arkiv/pub/frekvens/stats_all.txt", txt_file.path)

        log.info("Building frequency model...")
        make_model(txt_file.path, out.path)
    finally:
        # Clean up
        txt_file.remove()
def build_model(sense_model: ModelOutput = ModelOutput("wsd/ALL_512_128_w10_A2_140403_ctx1.bin"),
                context_model: ModelOutput = ModelOutput("wsd/lem_cbow0_s512_w10_NEW2_ctx.bin")):
    """Download models for SALDO-based word sense disambiguation."""
    # Download sense model
    sense_model.download("https://github.com/spraakbanken/sparv-wsd/raw/master/models/scouse/ALL_512_128_w10_A2_140403_ctx1.bin")

    # Download context model
    context_model.download("https://github.com/spraakbanken/sparv-wsd/raw/master/models/scouse/lem_cbow0_s512_w10_NEW2_ctx.bin")
def build_model(out: ModelOutput = ModelOutput("geo/geo.pickle")): """Download and build geo model.""" # Download and extract cities1000.txt cities_zip = Model("geo/cities1000.zip") cities_zip.download("http://download.geonames.org/export/dump/cities1000.zip") cities_zip.unzip() # Download and extract alternateNames.txt names_zip = Model("geo/alternateNames.zip") names_zip.download("http://download.geonames.org/export/dump/alternateNames.zip") names_zip.unzip() pickle_model(Model("geo/cities1000.txt"), Model("geo/alternateNames.txt"), out) # Clean up cities_zip.remove() names_zip.remove() Model("geo/iso-languagecodes.txt").remove() Model("geo/cities1000.txt").remove() Model("geo/alternateNames.txt").remove()
def download_dalin_wordlist(out: ModelOutput = ModelOutput("hunpos/hist/dalinm.hunpos")):
    """Download Dalin wordlist."""
    out.download("https://github.com/spraakbanken/sparv-models/raw/master/hunpos/hist/dalinm.hunpos")
def download_swedberg_wordlist(out: ModelOutput = ModelOutput("hunpos/hist/swedberg-gender.hunpos")):
    """Download Swedberg wordlist."""
    out.download("https://github.com/spraakbanken/sparv-models/raw/master/hunpos/hist/swedberg-gender.hunpos")
def download_bettertokenizer(out: ModelOutput = ModelOutput("segment/bettertokenizer.sv")):
    """Download model for use with BetterWordTokenizer."""
    out.download("https://github.com/spraakbanken/sparv-models/raw/master/segment/bettertokenizer.sv")
def download_punkt_model(out: ModelOutput = ModelOutput("segment/punkt-nltk-svenska.pickle")):
    """Download model for use with PunktSentenceTokenizer."""
    out.download("https://github.com/spraakbanken/sparv-models/raw/master/segment/punkt-nltk-svenska.pickle")
def train(file_list: str,
          doc: str = Document,
          modelfile: str = ModelOutput("vw_topic_modelling/?.model"),
          jsonfile: str = ModelOutput("vw_topic_modelling/?.model.json"),
          dry_run_labels: bool = False,
          label_map_json=None,
          bound=None,
          min_word_length: int = 0,
          banned_pos=""):
    """Train a model using vowpal wabbit.

    Creates outprefix.model and outprefix.model.json.

    file_list is a file with 5*N lines of annotation filenames:
    first N copies of: order,
    then N copies of: annotation_struct,
    then N copies of: parent,
    then N copies of: word,
    then N copies of: pos.
    """
    with open(file_list, "r") as fp:
        files = fp.read().split()
    order_struct_parent_word_pos = interleave(files, 5)

    map_label = _make_label_map(label_map_json)
    min_word_length = int(min_word_length) if min_word_length else 0

    # Look at the structs annotations to get the labels and their distribution:
    _, structs, _, _, _ = list(zip(*order_struct_parent_word_pos))
    # TODO: skip labels with very low occurrences
    labels = Counter(map_label(label)
                     for annotfile in structs
                     for label in util.read_annotation(doc, annotfile)
                     if map_label(label))
    N = sum(labels.values())
    if bound:
        bound = int(bound)
        N = min(bound, N)

    k = len(labels)
    label_to_index = {}
    index_to_label = {}
    answer = {}
    for i, (label, occurrences) in enumerate(labels.items(), start=1):
        w = float(N) / occurrences
        log.info(f"{label}: occurrences: {occurrences}, weight: {w}")
        answer[label] = ("%s:%s | " % (i, w)).encode()
        label_to_index[label] = i
        index_to_label[i] = label

    if dry_run_labels:
        from pprint import pprint
        pprint(labels.most_common())
        print(json.dumps({l: l for l in labels}, indent=2))
        log.info(f"texts: {N}, labels: {k}")
        sys.exit()

    def itertexts():
        return _take(bound, texts(order_struct_parent_word_pos, map_label,
                                  min_word_length, banned_pos))

    # Train model
    args = ["--oaa", str(k),
            "--passes", "10",
            "--cache", "--kill_cache",
            "--bit_precision", "24",
            "--final_regressor", modelfile]
    data = (Example(answer[text.label], text.words)
            for text in every(10, itertexts(), invert=True))
    vw_train(args, data)

    # Performance evaluation
    args = ["--initial_regressor", modelfile]
    target = []

    def data_iterator():
        for text in every(10, itertexts()):
            target.append(label_to_index[text.label])
            yield Example(None, text.words)

    predicted = [int(s) for s, _tag in vw_predict(args, data_iterator())]
    N_eval = len(predicted)
    assert len(predicted) == len(target)

    order = list(range(1, 1 + k))
    info = dict(
        min_word_length=min_word_length,
        banned_pos=banned_pos,
        labels=[index_to_label[i] for i in order],
        index_to_label=index_to_label,
        label_to_index=label_to_index,
        N_train=N - N_eval,
        N_eval=N_eval,
        stats={index_to_label[i]: p.as_dict()
               for i, p in list(multiclass_performance(target, predicted).items())},
        confusion_matrix=confusion_matrix(target, predicted, order))
    with open(jsonfile, "w") as f:
        json.dump(info, f, sort_keys=True, indent=2)
    log.info(f"Wrote {jsonfile}")
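# Illustrative sketch (not part of the pipeline): writing a file_list of the shape
# train() expects, i.e. 5*N lines with N order files, then N struct files, then
# N parent files, then N word files, then N pos files. The helper name is hypothetical.
def write_file_list_sketch(path, order, structs, parents, words, pos):
    """Write annotation filenames grouped as described in train()'s docstring."""
    assert len(order) == len(structs) == len(parents) == len(words) == len(pos)
    with open(path, "w", encoding="utf-8") as fp:
        for group in (order, structs, parents, words, pos):
            fp.write("\n".join(group) + "\n")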
def download_saldo(out: ModelOutput = ModelOutput("saldo/saldom.xml")):
    """Download SALDO morphology XML."""
    out.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lexikon/saldom/saldom.xml")
def metashare_template(model: ModelOutput = ModelOutput("sbx_metadata/sbx-metashare-template.xml")):
    """Download the SBX META-SHARE template."""
    model.download("https://raw.githubusercontent.com/spraakbanken/sparv-sbx-metadata/main/data/sbx-metashare-template.xml")
def download_nst_comp(out: ModelOutput = ModelOutput("saldo/nst_comp_pos.pickle")):
    """Download compound POS model from sparv-models repo."""
    out.download("https://github.com/spraakbanken/sparv-models/raw/master/saldo/nst_comp_pos.pickle")
def get_rus_model(out: ModelOutput = ModelOutput("treetagger/rus.par"),
                  tt_binary: Binary = Binary("[treetagger.binary]")):
    """Download TreeTagger language model for Russian."""
    gzip_file = "treetagger/russian.par.gz"
    url = "http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/russian.par.gz"
    _download(url, gzip_file, out)
def hunpos_model(model: ModelOutput = ModelOutput("hunpos/suc3_suc-tags_default-setting_utf8.model")):
    """Download the Hunpos model."""
    model.download("https://github.com/spraakbanken/sparv-models/raw/master/hunpos/suc3_suc-tags_default-setting_utf8.model")
def build_saldo(out: ModelOutput = ModelOutput("saldo/saldo.pickle"),
                saldom: Model = Model("saldo/saldom.xml")):
    """Save SALDO morphology as a pickle file."""
    lmf_to_pickle(saldom.path, out.path)
def saldo_morphtable(out: ModelOutput = ModelOutput("hunpos/saldo_suc-tags.morphtable"),
                     saldo_model: Model = Model("saldo/saldo.pickle"),
                     suc: Model = Model("hunpos/suc3_morphtable.words"),
                     morphtable_base: Model = Model("hunpos/suc.morphtable"),
                     morphtable_patterns: Model = Model("hunpos/suc.patterns"),
                     add_capitalized: bool = True,
                     add_lowercase: bool = False):
    """Create a morphtable file for use with Hunpos.

    A morphtable contains wordforms from SALDO's morphology (with accompanying tags) which are
    missing in SUC3. Since the morphtable is case sensitive, both the original form and a
    capitalized form are saved.

    Args:
        out (ModelOutput, optional): Resulting morphtable file to be written.
            Defaults to ModelOutput("hunpos/saldo_suc-tags.morphtable").
        saldo_model (Model, optional): Path to a pickled SALDO model.
            Defaults to Model("saldo/saldo.pickle").
        suc (Model, optional): Tab-separated file with wordforms from SUC, containing:
            frequency, wordform, tag. Defaults to Model("hunpos/suc3_morphtable.words").
        morphtable_base (Model, optional): Existing morphtable file, whose contents will be
            included in the new one. Defaults to Model("hunpos/suc.morphtable").
        morphtable_patterns (Model, optional): Optional file with regular expressions.
            Defaults to Model("hunpos/suc.patterns").
        add_capitalized (bool, optional): Whether or not capitalized word forms should be added.
            Defaults to True.
        add_lowercase (bool, optional): Whether or not lower case word forms should be added.
            Defaults to False.
    """
    lex = saldo.SaldoLexicon(saldo_model.path)
    tags = defaultdict(set)

    # Get all wordforms from SALDO
    for word in list(lex.lexicon.keys()):
        words = lex.lookup(word)
        # Filter out multi word expressions
        words = [x for x in words if len(x[2]) == 0]
        if words:
            # Only use MSD not containing "-"
            for w in words:
                for msd in w[1]:
                    if "-" not in msd:
                        tags[word].add(msd)
                        if add_capitalized:
                            # Add a capitalized form of the word
                            capitalized = word[0].upper() + word[1:]
                            if not word == capitalized:
                                tags[capitalized].add(msd)
                        if add_lowercase:
                            # Add a lower case form of the word
                            lower = word.lower()
                            if not word == lower:
                                tags[lower].add(msd)

    # Read SUC words
    with open(suc.path, encoding="UTF-8") as suctags:
        for line in suctags:
            _, word, msd = line.strip("\n").split("\t")

            # Don't keep SALDO words already in SUC
            if word in tags:
                del tags[word]
            # If the word is not a name, and exists as lowercase in SALDO, remove it
            elif not msd.startswith("PM") and not word.lower() == word and word.lower() in tags:
                del tags[word.lower()]

    # Read regular expressions from pattern file
    pattern_list = []
    if morphtable_patterns:
        with open(morphtable_patterns.path, encoding="UTF-8") as pat:
            for line in pat:
                if line.strip() and not line.startswith("#"):
                    pattern_name, _, pattern_tags = line.strip().split("\t", 2)
                    pattern_list.append("[[%s]]\t%s\n" % (pattern_name, pattern_tags))

    # Write morphtable: base file contents, then patterns, then SALDO wordforms
    with open(out.path, encoding="UTF-8", mode="w") as outfile:
        if morphtable_base:
            with open(morphtable_base.path, encoding="UTF-8") as base:
                for line in base:
                    outfile.write(line)

        for pattern in pattern_list:
            outfile.write(pattern)

        for word in sorted(tags):
            outfile.write("%s\t%s\n" % (word, "\t".join(tags[word])))