Exemple #1
0
def stanza_lem_model(
    model: ModelOutput = ModelOutput("stanza/lem/sv_suc_lemmatizer.pt"),
):
    """Download and unzip the Stanza lemmatization model.

    ``model`` is not referenced in the body; it only declares the output
    file, which is presumably produced by unpacking the archive.
    """
    archive = Model("stanza/lem/synt_stanza_full.zip")
    archive.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/lem_stanza.zip")
    archive.unzip()

    # The archive itself is not needed once it has been unpacked.
    archive.remove()
Exemple #2
0
def stanza_dep_model(
    model: ModelOutput = ModelOutput("stanza/dep/sv_talbanken_parser.pt"),
    pretrain: ModelOutput = ModelOutput("stanza/dep/sv_talbanken.pretrain.pt"),
):
    """Fetch the Stanza dependency parsing archive and unpack it.

    ``model`` and ``pretrain`` are not referenced in the body; they only
    declare the output files, which are presumably produced by unpacking
    the archive.
    """
    archive = Model("stanza/dep/synt_stanza_full.zip")
    archive.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/synt_stanza_full.zip")
    archive.unzip()

    # The archive itself is not needed once it has been unpacked.
    archive.remove()
Exemple #3
0
def stanza_pos_model(
    model: ModelOutput = ModelOutput("stanza/pos/full_sv_talbanken_tagger.pt"),
    pretrain: ModelOutput = ModelOutput("stanza/pos/full_sv_talbanken.pretrain.pt"),
):
    """Fetch the Stanza POS-tagging archive and unpack it.

    ``model`` and ``pretrain`` are not referenced in the body; they only
    declare the output files, which are presumably produced by unpacking
    the archive.
    """
    archive = Model("stanza/pos/synt_stanza_full.zip")
    archive.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/morph_stanza_full.zip")
    archive.unzip()

    # The archive itself is not needed once it has been unpacked.
    archive.remove()
Exemple #4
0
def build_dalin(out: ModelOutput = ModelOutput("hist/dalin.pickle")):
    """Fetch the Dalin morphology XML and convert it into a pickle file.

    The XML is downloaded to ``hist/dalinm.xml``, handed to
    ``lmf_to_pickle`` together with the path of ``out``, and removed
    afterwards.
    """
    source_xml = Model("hist/dalinm.xml")
    source_xml.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/dalinm/dalinm.xml")

    # Convert the downloaded XML into the pickle file at ``out``.
    lmf_to_pickle(source_xml.path, out.path)

    # The XML source is only an intermediate file.
    source_xml.remove()
Exemple #5
0
def swefn_model(
    out: ModelOutput = ModelOutput("lexical_classes/swefn.pickle"),
):
    """Fetch the SweFN XML and store the parsed lexicon as a pickle.

    The XML is downloaded to ``lexical_classes/swefn.xml``, read with
    ``read_swefn``, written to ``out`` via ``write_pickle``, and the XML
    is removed afterwards.
    """
    source_xml = Model("lexical_classes/swefn.xml")
    source_xml.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/swefn/swefn.xml")

    # Parse the XML and write the result straight to the output model.
    out.write_pickle(read_swefn(source_xml.path))

    # The XML source is only an intermediate file.
    source_xml.remove()
Exemple #6
0
def build_swedberg(out: ModelOutput = ModelOutput("hist/swedberg.pickle")):
    """Fetch the Swedberg morphology XML and convert it into a pickle file.

    The XML is downloaded to ``hist/swedbergm.xml``, handed to
    ``lmf_to_pickle`` together with the path of ``out``, and removed
    afterwards.
    """
    # Fetch swedbergm.xml.
    source_xml = Model("hist/swedbergm.xml")
    source_xml.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/swedbergm/swedbergm.xml")

    # Convert the downloaded XML into the pickle file at ``out``.
    lmf_to_pickle(source_xml.path, out.path)

    # The XML source is only an intermediate file.
    source_xml.remove()
Exemple #7
0
def build_korp_stats(
    out: ModelOutput = ModelOutput("saldo/stats.pickle"),
    _saldom: Model = Model("saldo/saldom.xml"),
):
    """Fetch Korp's word frequency file and turn it into a model.

    The raw statistics are downloaded to ``saldo/stats_all.txt`` and passed
    to ``make_model`` together with the path of ``out``. The text file is
    removed whether or not those steps succeed.

    ``_saldom`` is not referenced in the body.
    """
    raw_stats = Model("saldo/stats_all.txt")
    stats_url = "https://svn.spraakdata.gu.se/sb-arkiv/pub/frekvens/stats_all.txt"
    try:
        log.info("Downloading Korp stats file...")
        download_stats_file(stats_url, raw_stats.path)

        log.info("Building frequency model...")
        make_model(raw_stats.path, out.path)
    finally:
        # Always discard the intermediate text file, even on failure.
        raw_stats.remove()
Exemple #8
0
def build_model(out: ModelOutput = ModelOutput("sensaldo/sensaldo.pickle")):
    """Fetch the SenSALDO archive and store its lexicon as a pickle.

    The zip archive is downloaded and unpacked, ``sensaldo-base-v02.txt``
    is read with ``read_sensaldo``, and the result is written to ``out``.
    The archive and both text files are removed afterwards.
    """
    archive = Model("sensaldo/sensaldo-v02.zip")
    archive.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lexikon/sensaldo/sensaldo-v02.zip")
    archive.unzip()

    # The base lexicon is read from this file, presumably extracted from
    # the archive above.
    base_tsv = Model("sensaldo/sensaldo-base-v02.txt")
    out.write_pickle(read_sensaldo(base_tsv))

    # Remove the archive and the text files; the full-form file is never
    # read here, only deleted.
    archive.remove()
    base_tsv.remove()
    Model("sensaldo/sensaldo-fullform-v02.txt").remove()
Exemple #9
0
def build_model(out: ModelOutput = ModelOutput("geo/geo.pickle")):
    """Fetch the GeoNames dumps and build the geo model.

    Two archives (``cities1000.zip`` and ``alternateNames.zip``) are
    downloaded and unpacked, ``pickle_model`` is called with the two text
    files and ``out``, and the archives and text files are removed.
    """
    # City data.
    cities_archive = Model("geo/cities1000.zip")
    cities_archive.download("http://download.geonames.org/export/dump/cities1000.zip")
    cities_archive.unzip()

    # Alternate names.
    names_archive = Model("geo/alternateNames.zip")
    names_archive.download("http://download.geonames.org/export/dump/alternateNames.zip")
    names_archive.unzip()

    cities_txt = Model("geo/cities1000.txt")
    names_txt = Model("geo/alternateNames.txt")
    pickle_model(cities_txt, names_txt, out)

    # Remove both archives, then the text files left next to them.
    cities_archive.remove()
    names_archive.remove()
    for leftover in (
        "geo/iso-languagecodes.txt",
        "geo/cities1000.txt",
        "geo/alternateNames.txt",
    ):
        Model(leftover).remove()
Exemple #10
0
def blingbring_model(
    out: ModelOutput = ModelOutput("lexical_classes/blingbring.pickle"),
):
    """Fetch the Blingbring sources and store the lexicon as a pickle.

    The Roget hierarchy XML and ``blingbring.txt`` are downloaded, combined
    by ``read_blingbring``, and the result is written to ``out``. Both
    downloaded files are removed afterwards.
    """
    # Roget hierarchy, passed to read_blingbring as its second argument.
    roget_xml = Model("lexical_classes/roget_hierarchy.xml")
    roget_xml.download("https://github.com/spraakbanken/sparv-models/raw/master/lexical_classes/roget_hierarchy.xml")

    # Raw Blingbring text file.
    bring_txt = Model("lexical_classes/blingbring.txt")
    bring_txt.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lexikon/bring/blingbring.txt")

    out.write_pickle(read_blingbring(bring_txt.path, roget_xml.path))

    # Both downloads are intermediate files only.
    bring_txt.remove()
    roget_xml.remove()
def build_diapivot(out: ModelOutput = ModelOutput("hist/diapivot.pickle")):
    """Fetch the diapivot XML dictionary and store it as a pickle.

    The XML is downloaded to ``hist/diapivot.xml`` and read with
    ``read_xml``. Each entry's ``(saldo, match)`` pairs are joined with
    ``PART_DELIM1`` and stored as a sorted list under the entry's key.
    The mapping is written to ``out`` and the XML is removed.
    """
    source_xml = Model("hist/diapivot.xml")
    source_xml.download("https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/diapivot/diapivot.xml")

    parsed = read_xml(source_xml.path)
    log.info("Saving cross lexicon in Pickle format")

    # One sorted list of "saldo<delim>match" strings per entry.
    cross_lexicon = {
        lem: sorted(
            PART_DELIM1.join([saldo, match])
            for saldo, match in parsed[lem].items()
        )
        for lem in parsed
    }
    out.write_pickle(cross_lexicon)

    # The XML source is only an intermediate file.
    source_xml.remove()
Exemple #12
0
def _download(url, gzip, out):
    """Download a gzip file from ``url`` and decompress it to ``out``.

    ``gzip`` is the name passed to ``Model`` for the intermediate
    compressed file, which is removed after ``ungzip`` has written to
    ``out.path``.
    """
    compressed = Model(gzip)
    compressed.download(url)
    compressed.ungzip(out.path)

    # Only the decompressed output is kept.
    compressed.remove()