def stanza_lem_model(
        model: ModelOutput = ModelOutput("stanza/lem/sv_suc_lemmatizer.pt")):
    """Download and unzip the Stanza lemmatization model.

    Fixed docstring: this fetches the lemmatizer (sv_suc_lemmatizer.pt),
    not the POS-tagging model as previously stated.
    """
    # The archive contains the lemmatizer model file declared in `model`.
    zip_model = Model("stanza/lem/synt_stanza_full.zip")
    zip_model.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/lem_stanza.zip")
    zip_model.unzip()
    # Drop the archive once its contents have been extracted.
    zip_model.remove()
def stanza_dep_model(
        model: ModelOutput = ModelOutput("stanza/dep/sv_talbanken_parser.pt"),
        pretrain: ModelOutput = ModelOutput(
            "stanza/dep/sv_talbanken.pretrain.pt")):
    """Download and unzip the Stanza dependency model."""
    # Fetch the zipped model distribution, extract it, then remove the archive.
    archive = Model("stanza/dep/synt_stanza_full.zip")
    archive.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/synt_stanza_full.zip"
    )
    archive.unzip()
    archive.remove()
def stanza_pos_model(model: ModelOutput = ModelOutput(
        "stanza/pos/full_sv_talbanken_tagger.pt"),
        pretrain: ModelOutput = ModelOutput(
            "stanza/pos/full_sv_talbanken.pretrain.pt")):
    """Download and unzip the Stanza POS-tagging model."""
    # Fetch the zipped model distribution, extract it, then remove the archive.
    archive = Model("stanza/pos/synt_stanza_full.zip")
    archive.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/morph_stanza_full.zip"
    )
    archive.unzip()
    archive.remove()
def build_dalin(out: ModelOutput = ModelOutput("hist/dalin.pickle")):
    """Download Dalin morphology XML and save as a pickle file."""
    # Fetch the raw LMF XML source.
    source = Model("hist/dalinm.xml")
    source.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/dalinm/dalinm.xml")
    # Convert the LMF XML into the pickle model.
    lmf_to_pickle(source.path, out.path)
    # Discard the raw XML now that the pickle exists.
    source.remove()
def swefn_model(
        out: ModelOutput = ModelOutput("lexical_classes/swefn.pickle")):
    """Download and build SweFN model."""
    # Fetch swefn.xml, the raw SweFN source.
    xml_file = Model("lexical_classes/swefn.xml")
    xml_file.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/swefn/swefn.xml")
    # Parse the XML and persist the lexicon as a pickle.
    out.write_pickle(read_swefn(xml_file.path))
    # Clean up
    xml_file.remove()
def build_swedberg(out: ModelOutput = ModelOutput("hist/swedberg.pickle")):
    """Download Swedberg morphology XML and save as a pickle file."""
    # Download swedbergm.xml (previous comment wrongly said "diapivot.xml" —
    # a copy-paste leftover from build_diapivot).
    xml_model = Model("hist/swedbergm.xml")
    xml_model.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/swedbergm/swedbergm.xml"
    )
    # Create pickle file
    lmf_to_pickle(xml_model.path, out.path)
    # Clean up
    xml_model.remove()
def build_korp_stats(out: ModelOutput = ModelOutput("saldo/stats.pickle"),
                     _saldom: Model = Model("saldo/saldom.xml")):
    """Download Korp's word frequency file and convert it to a model."""
    stats_file = Model("saldo/stats_all.txt")
    try:
        log.info("Downloading Korp stats file...")
        download_stats_file(
            "https://svn.spraakdata.gu.se/sb-arkiv/pub/frekvens/stats_all.txt",
            stats_file.path)
        log.info("Building frequency model...")
        make_model(stats_file.path, out.path)
    finally:
        # Always remove the raw stats file, even if model building failed.
        stats_file.remove()
def build_model(out: ModelOutput = ModelOutput("sensaldo/sensaldo.pickle")):
    """Download and build SenSALDO model."""
    # Fetch and unpack the SenSALDO distribution archive.
    archive = Model("sensaldo/sensaldo-v02.zip")
    archive.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lexikon/sensaldo/sensaldo-v02.zip"
    )
    archive.unzip()

    # Read the base TSV dictionary and store it as a pickle.
    base_tsv = Model("sensaldo/sensaldo-base-v02.txt")
    out.write_pickle(read_sensaldo(base_tsv))

    # Clean up: the archive, the base file, and the unused fullform file.
    archive.remove()
    base_tsv.remove()
    Model("sensaldo/sensaldo-fullform-v02.txt").remove()
def build_model(out: ModelOutput = ModelOutput("geo/geo.pickle")):
    """Download and build geo model.

    Fix: download GeoNames dumps over HTTPS instead of plain HTTP,
    consistent with every other download URL in this module.
    """
    # Download and extract cities1000.txt
    cities_zip = Model("geo/cities1000.zip")
    cities_zip.download("https://download.geonames.org/export/dump/cities1000.zip")
    cities_zip.unzip()

    # Download and extract alternateNames.txt
    names_zip = Model("geo/alternateNames.zip")
    names_zip.download("https://download.geonames.org/export/dump/alternateNames.zip")
    names_zip.unzip()

    # Build the pickle model from the two extracted text files.
    pickle_model(Model("geo/cities1000.txt"), Model("geo/alternateNames.txt"),
                 out)

    # Clean up: archives, extracted files, and the language-codes file that
    # the alternateNames archive also contains.
    cities_zip.remove()
    names_zip.remove()
    Model("geo/iso-languagecodes.txt").remove()
    Model("geo/cities1000.txt").remove()
    Model("geo/alternateNames.txt").remove()
def blingbring_model(
        out: ModelOutput = ModelOutput("lexical_classes/blingbring.pickle")):
    """Download and build Blingbring model."""
    # Roget class hierarchy used when reading the Bring word list.
    classmap = Model("lexical_classes/roget_hierarchy.xml")
    classmap.download(
        "https://github.com/spraakbanken/sparv-models/raw/master/lexical_classes/roget_hierarchy.xml"
    )

    # Fetch blingbring.txt and build the pickle model from it.
    wordlist = Model("lexical_classes/blingbring.txt")
    wordlist.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lexikon/bring/blingbring.txt"
    )
    out.write_pickle(read_blingbring(wordlist.path, classmap.path))

    # Clean up
    wordlist.remove()
    classmap.remove()
def build_diapivot(out: ModelOutput = ModelOutput("hist/diapivot.pickle")):
    """Download diapivot XML dictionary and save as a pickle file.

    Fix/idiom: dropped the redundant ``list()`` around ``.items()`` and
    replaced the manual append-loop with a dict comprehension; the resulting
    pickle content is unchanged.
    """
    # Download diapivot.xml
    xml_model = Model("hist/diapivot.xml")
    xml_model.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/diapivot/diapivot.xml")

    # Create pickle file: each lemgram maps to a sorted list of
    # "saldo<PART_DELIM1>match" strings.
    xml_lexicon = read_xml(xml_model.path)
    log.info("Saving cross lexicon in Pickle format")
    picklex = {
        lem: sorted(PART_DELIM1.join([saldo, match])
                    for saldo, match in xml_lexicon[lem].items())
        for lem in xml_lexicon
    }
    out.write_pickle(picklex)

    # Clean up
    xml_model.remove()
def _download(url, gzip, out):
    """Download a gzipped file from *url*, extract it to *out*, and clean up."""
    compressed = Model(gzip)
    compressed.download(url)
    compressed.ungzip(out.path)
    # The gzip archive is no longer needed once extracted.
    compressed.remove()