Example no. 1
def build_model(out: ModelOutput = ModelOutput("malt/swemalt-1.7.2.mco"),
                _maltjar: Binary = Binary("[malt.jar]")):
    """Download model for MALT Parser.

    The model will not be downloaded unless the MALT jar has been installed.
    """
    out.download("http://maltparser.org/mco/swedish_parser/swemalt-1.7.2.mco")
Example no. 2
def stanza_pos_model(model: ModelOutput = ModelOutput(
    "stanza/pos/full_sv_talbanken_tagger.pt"),
                     pretrain: ModelOutput = ModelOutput(
                         "stanza/pos/full_sv_talbanken.pretrain.pt")):
    """Download and unzip the Stanza POS-tagging model."""
    zip_model = Model("stanza/pos/synt_stanza_full.zip")
    zip_model.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/morph_stanza_full.zip"
    )
    zip_model.unzip()
    zip_model.remove()
Example no. 3
def stanza_dep_model(
        model: ModelOutput = ModelOutput("stanza/dep/sv_talbanken_parser.pt"),
        pretrain: ModelOutput = ModelOutput(
            "stanza/dep/sv_talbanken.pretrain.pt")):
    """Download and unzip the Stanza dependency model."""
    zip_model = Model("stanza/dep/synt_stanza_full.zip")
    zip_model.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/synt_stanza_full.zip"
    )
    zip_model.unzip()
    zip_model.remove()
Example no. 4
def swefn_model(
        out: ModelOutput = ModelOutput("lexical_classes/swefn.pickle")):
    """Download and build SweFN model."""
    # Download swefn.xml and build swefn.pickle
    raw_file = Model("lexical_classes/swefn.xml")
    raw_file.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/swefn/swefn.xml")
    lexicon = read_swefn(raw_file.path)
    out.write_pickle(lexicon)

    # Clean up
    raw_file.remove()
Example no. 5
def stanza_lem_model(
        model: ModelOutput = ModelOutput("stanza/lem/sv_suc_lemmatizer.pt")):
    """Download and unzip the Stanza POS-tagging model."""
    zip_model = Model("stanza/lem/synt_stanza_full.zip")
    zip_model.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/stanza/lem_stanza.zip")
    zip_model.unzip()
    zip_model.remove()
Example no. 6
def build_model(out: ModelOutput = ModelOutput("sensaldo/sensaldo.pickle")):
    """Download and build SenSALDO model."""
    # Download and extract sensaldo-base-v02.txt
    zip_model = Model("sensaldo/sensaldo-v02.zip")
    zip_model.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lexikon/sensaldo/sensaldo-v02.zip"
    )
    zip_model.unzip()
    tsv_model = Model("sensaldo/sensaldo-base-v02.txt")

    # Read sensaldo tsv dictionary and save as a pickle file
    lexicon = read_sensaldo(tsv_model)
    out.write_pickle(lexicon)

    # Clean up
    zip_model.remove()
    tsv_model.remove()
    Model("sensaldo/sensaldo-fullform-v02.txt").remove()
Example no. 7
def build_nst_comp(out: ModelOutput = ModelOutput("saldo/nst_comp_pos.pickle"),
                   nst_lexicon: Model = Model("saldo/nst_utf8.txt")):
    """Download NST lexicon and convert it to a compound POS model.

    The NST lexicon can be retrieved from SVN with credentials:
    svn export https://svn.spraakdata.gu.se/sb-arkiv/lexikon/NST_svensk_leksikon/nst_utf8.txt saldo/nst_utf8.txt
    """
    log.info("Building compound POS probability model...")
    make_model(nst_lexicon, out)
Example no. 8
def stanza_resources_file(
        resources_file: ModelOutput = ModelOutput("stanza/resources.json")):
    """Download and unzip the Stanza dependency model."""
    # Write resources.json file to keep Stanza from complaining
    res = json.dumps({
        "sv": {
            "lang_name": "Swedish",
            "tokenize": {
                "orchid": {},
                "best": {}
            },
            "default_processors": {
                "tokenize": "orchid"
            },
            "default_dependencies": {},
        }
    })
    resources_file.write(res)
Example no. 9
def morphtable_inputs(
        suc: ModelOutput = ModelOutput("hunpos/suc3_morphtable.words"),
        morphtable_base: ModelOutput = ModelOutput("hunpos/suc.morphtable"),
        morphtable_patterns: ModelOutput = ModelOutput("hunpos/suc.patterns")):
    """Download the files needed to build the SALDO morphtable."""
    suc.download(
        "https://github.com/spraakbanken/sparv-models/raw/master/hunpos/suc3_morphtable.words"
    )

    morphtable_base.download(
        "https://github.com/spraakbanken/sparv-models/raw/master/hunpos/suc.morphtable"
    )

    morphtable_patterns.download(
        "https://github.com/spraakbanken/sparv-models/raw/master/hunpos/suc.patterns"
    )
Example no. 10
def blingbring_model(
        out: ModelOutput = ModelOutput("lexical_classes/blingbring.pickle")):
    """Download and build Blingbring model."""
    # Download roget hierarchy
    classmap = Model("lexical_classes/roget_hierarchy.xml")
    classmap.download(
        "https://github.com/spraakbanken/sparv-models/raw/master/lexical_classes/roget_hierarchy.xml"
    )

    # Download blingbring.txt and build blingbring.pickle
    raw_file = Model("lexical_classes/blingbring.txt")
    raw_file.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lexikon/bring/blingbring.txt"
    )
    lexicon = read_blingbring(raw_file.path, classmap.path)
    out.write_pickle(lexicon)

    # Clean up
    raw_file.remove()
    classmap.remove()
Example no. 11
def hist_morphtable(out: ModelOutput = ModelOutput("hunpos/hist/dalinm-swedberg_saldo_suc-tags.morphtable"),
                    swedberg: Model = Model("hunpos/hist/swedberg-gender.hunpos"),
                    dalin: Model = Model("hunpos/hist/dalinm.hunpos"),
                    saldosuc_morphtable: Model = Model("hunpos/saldo_suc-tags.morphtable")):
    """Read files and make a morphtable together with the information from SALDO (saldosuc_morphtable).

    Args:
        out (str, optional): Resulting morphtable file to be written.
            Defaults to ModelOutput("hunpos/hist/dalinm-swedberg_saldo_suc-tags.morphtable").
        swedberg (str, optional): Wordlist from Swedberg and corresponding SALDO MSD-tags.
            Defaults to Model("hunpos/hist/swedberg-gender.hunpos").
        dalin (str, optional): Wordlist from Dalin and corresponding SALDO MSD-tags.
            Defaults to Model("hunpos/hist/dalinm.hunpos").
        saldosuc_morphtable (str, optional): SALDO Hunpos morphtable.
            Defaults to Model("hunpos/saldo_suc-tags.morphtable").
    """
    words = {}
    _read_saldosuc(words, saldosuc_morphtable.path)
    for fil in [dalin, swedberg]:
        for line in open(fil.path, encoding="utf-8").readlines():
            if not line.strip():
                continue
            xs = line.split("\t")
            word, msd = xs[0].strip(), xs[1].strip()
            if " " in word:
                if msd.startswith("nn"):  # We assume that the head of a noun mwe is the last word
                    word = word.split()[-1]
                if msd.startswith("vb"):  # We assume that the head of a verbal mwe is the first word
                    word = word.split()[0]

            # If the tag is not present, we try to translate it anyway
            suc = SALDO_TO_SUC.get(msd, "")
            if not suc:
                suc = _force_parse(msd)
            if suc:
                words.setdefault(word.lower(), set()).update(suc)
                words.setdefault(word.title(), set()).update(suc)
    with open(out.path, encoding="UTF-8", mode="w") as out:
        for w, ts in list(words.items()):
            line = ("\t".join([w] + list(ts)) + "\n")
            out.write(line)
Example no. 12
def build_tokenlist(
        saldo_model: Model = Model("saldo/saldo.pickle"),
        out: ModelOutput = ModelOutput(
            "segment/bettertokenizer.sv.saldo-tokens"),
        segmenter: str = Config("segment.token_wordlist_segmenter"),
        model: Model = Model("segment/bettertokenizer.sv")):
    """Build a list of words from a SALDO model, to help BetterWordTokenizer."""
    segmenter_args = []
    if model:
        if model.path.suffix in ["pickle", "pkl"]:
            with open(model, "rb") as m:
                model_arg = pickle.load(m)
        else:
            model_arg = model.path
        segmenter_args.append(model_arg)
    assert segmenter in SEGMENTERS, "Available segmenters: %s" % ", ".join(
        sorted(SEGMENTERS))
    segmenter = SEGMENTERS[segmenter]
    segmenter = segmenter(*segmenter_args)
    assert hasattr(
        segmenter, "span_tokenize"
    ), "Segmenter needs a 'span_tokenize' method: %r" % segmenter

    wordforms = set()

    # Skip strings already handled by the tokenizer.
    # Also skip words ending in comma (used by some multi word expressions in SALDO).
    with open(saldo_model.path, "rb") as F:
        lexicon = pickle.load(F)
        for w in lexicon:
            w2 = list(map(split_triple, lexicon[w]))
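            # Collect continuation words of multi-word units (presumably the third
            # element of each split triple) that are not themselves lexicon entries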
            mwu_extras = [
                contw for w3 in w2 for cont in w3[2] for contw in cont
                if contw not in lexicon
            ]
            for wf in mwu_extras + [w]:
                spans = list(segmenter.span_tokenize(wf))
                if len(spans) > 1 and not wf.endswith(","):
                    wordforms.add(wf)

    out.write("\n".join(sorted(wordforms)))
Example no. 13
def build_dalin(out: ModelOutput = ModelOutput("hist/dalin.pickle")):
    """Download Dalin morphology XML and save as a pickle file."""
    # Download dalinm.xml
    xml_model = Model("hist/dalinm.xml")
    xml_model.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/dalinm/dalinm.xml")

    # Create pickle file
    lmf_to_pickle(xml_model.path, out.path)

    # Clean up
    xml_model.remove()
Example no. 14
def build_diapivot(out: ModelOutput = ModelOutput("hist/diapivot.pickle")):
    """Download diapivot XML dictionary and save as a pickle file."""
    # Download diapivot.xml
    xml_model = Model("hist/diapivot.xml")
    xml_model.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/diapivot/diapivot.xml")

    # Create pickle file
    xml_lexicon = read_xml(xml_model.path)
    log.info("Saving cross lexicon in Pickle format")
    picklex = {}
    for lem in xml_lexicon:
        lemgrams = []
        for saldo, match in list(xml_lexicon[lem].items()):
            lemgrams.append(PART_DELIM1.join([saldo, match]))
        picklex[lem] = sorted(lemgrams)

    out.write_pickle(picklex)

    # Clean up
    xml_model.remove()
Example no. 15
def build_swedberg(out: ModelOutput = ModelOutput("hist/swedberg.pickle")):
    """Download Swedberg morphology XML and save as a pickle file."""
    # Download swedbergm.xml
    xml_model = Model("hist/swedbergm.xml")
    xml_model.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lmf/swedbergm/swedbergm.xml"
    )

    # Create pickle file
    lmf_to_pickle(xml_model.path, out.path)

    # Clean up
    xml_model.remove()
Example no. 16
def build_korp_stats(out: ModelOutput = ModelOutput("saldo/stats.pickle"),
                     _saldom: Model = Model("saldo/saldom.xml")):
    """Download Korp's word frequency file and convert it to a model."""
    txt_file = Model("saldo/stats_all.txt")
    try:
        log.info("Downloading Korp stats file...")
        download_stats_file(
            "https://svn.spraakdata.gu.se/sb-arkiv/pub/frekvens/stats_all.txt",
            txt_file.path)

        log.info("Building frequency model...")
        make_model(txt_file.path, out.path)
    finally:
        # Clean up
        txt_file.remove()
Example no. 17
def build_model(sense_model: ModelOutput = ModelOutput(
    "wsd/ALL_512_128_w10_A2_140403_ctx1.bin"),
                context_model: ModelOutput = ModelOutput(
                    "wsd/lem_cbow0_s512_w10_NEW2_ctx.bin")):
    """Download models for SALDO-based word sense disambiguation."""
    # Download sense model
    sense_model.download(
        "https://github.com/spraakbanken/sparv-wsd/raw/master/models/scouse/ALL_512_128_w10_A2_140403_ctx1.bin"
    )

    # Download context model
    context_model.download(
        "https://github.com/spraakbanken/sparv-wsd/raw/master/models/scouse/lem_cbow0_s512_w10_NEW2_ctx.bin"
    )
Example no. 18
def build_model(out: ModelOutput = ModelOutput("geo/geo.pickle")):
    """Download and build geo model."""
    # Download and extract cities1000.txt
    cities_zip = Model("geo/cities1000.zip")
    cities_zip.download("http://download.geonames.org/export/dump/cities1000.zip")
    cities_zip.unzip()

    # Download and extract alternateNames.txt
    names_zip = Model("geo/alternateNames.zip")
    names_zip.download("http://download.geonames.org/export/dump/alternateNames.zip")
    names_zip.unzip()

    pickle_model(Model("geo/cities1000.txt"), Model("geo/alternateNames.txt"), out)

    # Clean up
    cities_zip.remove()
    names_zip.remove()
    Model("geo/iso-languagecodes.txt").remove()
    Model("geo/cities1000.txt").remove()
    Model("geo/alternateNames.txt").remove()
Example no. 19
def download_dalin_wordlist(out: ModelOutput = ModelOutput("hunpos/hist/dalinm.hunpos")):
    """Download Dalin wordlist."""
    out.download("https://github.com/spraakbanken/sparv-models/raw/master/hunpos/hist/dalinm.hunpos")
Example no. 20
def download_swedberg_wordlist(out: ModelOutput = ModelOutput("hunpos/hist/swedberg-gender.hunpos")):
    """Download Swedberg wordlist."""
    out.download("https://github.com/spraakbanken/sparv-models/raw/master/hunpos/hist/swedberg-gender.hunpos")
Example no. 21
def download_bettertokenizer(
        out: ModelOutput = ModelOutput("segment/bettertokenizer.sv")):
    """Download model for use with BetterWordTokenizer."""
    out.download(
        "https://github.com/spraakbanken/sparv-models/raw/master/segment/bettertokenizer.sv"
    )
Example no. 22
def download_punkt_model(
        out: ModelOutput = ModelOutput("segment/punkt-nltk-svenska.pickle")):
    """Download model for use with PunktSentenceTokenizer."""
    out.download(
        "https://github.com/spraakbanken/sparv-models/raw/master/segment/punkt-nltk-svenska.pickle"
    )
Example no. 23
def train(doc: str = Document,
          file_list: str = "",
          modelfile: str = ModelOutput("vw_topic_modelling/?.model"),
          jsonfile: str = ModelOutput("vw_topic_modelling/?.model.json"),
          dry_run_labels: bool = False,
          label_map_json=None,
          bound=None,
          min_word_length: int = 0,
          banned_pos=""):
    """
    Train a model using vowpal wabbit.

    Creates the model file (modelfile) and a corresponding .json metadata file (jsonfile).

    file_list is a file with 5*N lines of annotation filenames:
    first N copies of: order,
     then N copies of: annotation_struct,
     then N copies of: parent,
     then N copies of: word,
     then N copies of: pos.
    """

    with open(file_list, "r") as fp:
        files = fp.read().split()
    order_struct_parent_word_pos = interleave(files, 5)
    map_label = _make_label_map(label_map_json)
    min_word_length = int(min_word_length) if min_word_length else 0

    # Look at the structs annotations to get the labels and their distribution:
    _, structs, _, _, _ = list(zip(*order_struct_parent_word_pos))
    # TODO: skip labels with very low occurrences
    labels = Counter(map_label(label)
                     for annotfile in structs
                     for label in util.read_annotation(doc, annotfile)
                     if map_label(label))
    N = sum(labels.values())
    if bound:
        bound = int(bound)
        N = min(bound, N)
    k = len(labels)
    label_to_index = {}
    index_to_label = {}
    answer = {}
    for i, (label, occurrences) in enumerate(iter(list(labels.items())), start=1):
        w = float(N) / occurrences
        log.info(f"{label}: occurrences: {occurrences}, weight: {w}")
        answer[label] = ("%s:%s | " % (i, w)).encode()
        label_to_index[label] = i
        index_to_label[i] = label

    if dry_run_labels == "true":
        from pprint import pprint
        pprint(labels.most_common())
        print(json.dumps({l: l for l in labels}, indent=2))
        log.info(f"texts: {N}, labels: {k}")
        sys.exit()

    def itertexts():
        return _take(bound, texts(order_struct_parent_word_pos, map_label, min_word_length, banned_pos))

    # Train model
    args = ["--oaa", str(k),
            "--passes", "10",
            "--cache", "--kill_cache",
            "--bit_precision", "24",
            "--final_regressor", modelfile]
    data = (
        Example(answer[text.label], text.words)
        for text in every(10, itertexts(), invert=True)
    )
    vw_train(args, data)

    # Performance evaluation
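    # Evaluate on the held-out texts: every(10, itertexts()) presumably yields each
    # 10th text, the complement of what every(10, ..., invert=True) fed to training.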
    args = ["--initial_regressor", modelfile]
    target = []

    def data_iterator():
        for text in every(10, itertexts()):
            target.append(label_to_index[text.label])
            yield Example(None, text.words)

    predicted = [int(s) for s, _tag in vw_predict(args, data_iterator())]
    N_eval = len(predicted)

    assert len(predicted) == len(target)

    order = list(range(1, 1 + k))
    info = dict(
        min_word_length=min_word_length,
        banned_pos=banned_pos,
        labels=[index_to_label[i] for i in order],
        index_to_label=index_to_label,
        label_to_index=label_to_index,
        N_train=N - N_eval,
        N_eval=N_eval,
        stats={index_to_label[i]: p.as_dict()
               for i, p in
               list(multiclass_performance(target, predicted).items())},
        confusion_matrix=confusion_matrix(target, predicted, order))
    with open(jsonfile, "w") as f:
        json.dump(info, f, sort_keys=True, indent=2)
    log.info(f"Wrote {jsonfile}")
Example no. 24
def download_saldo(out: ModelOutput = ModelOutput("saldo/saldom.xml")):
    """Download SALDO morphology XML."""
    out.download(
        "https://svn.spraakdata.gu.se/sb-arkiv/pub/lexikon/saldom/saldom.xml")
Example no. 25
def metashare_template(model: ModelOutput = ModelOutput(
    "sbx_metadata/sbx-metashare-template.xml")):
    """Download the SBX META-SHARE template."""
    model.download(
        "https://raw.githubusercontent.com/spraakbanken/sparv-sbx-metadata/main/data/sbx-metashare-template.xml"
    )
Example no. 26
def download_nst_comp(
        out: ModelOutput = ModelOutput("saldo/nst_comp_pos.pickle")):
    """Download compound POS model from sparv-models repo."""
    out.download(
        "https://github.com/spraakbanken/sparv-models/raw/master/saldo/nst_comp_pos.pickle"
    )
Example no. 27
def get_rus_model(out: ModelOutput = ModelOutput("treetagger/rus.par"),
                  tt_binary: Binary = Binary("[treetagger.binary]")):
    """Download TreeTagger language model."""
    gzip = "treetagger/russian.par.gz"
    url = "http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/russian.par.gz"
    _download(url, gzip, out)
Example no. 28
def hunpos_model(model: ModelOutput = ModelOutput(
    "hunpos/suc3_suc-tags_default-setting_utf8.model")):
    """Download the Hunpos model."""
    model.download(
        "https://github.com/spraakbanken/sparv-models/raw/master/hunpos/suc3_suc-tags_default-setting_utf8.model"
    )
Example no. 29
def build_saldo(out: ModelOutput = ModelOutput("saldo/saldo.pickle"),
                saldom: Model = Model("saldo/saldom.xml")):
    """Save SALDO morphology as a pickle file."""
    lmf_to_pickle(saldom.path, out.path)
Example no. 30
def saldo_morphtable(out: ModelOutput = ModelOutput(
    "hunpos/saldo_suc-tags.morphtable"),
                     saldo_model: Model = Model("saldo/saldo.pickle"),
                     suc: Model = Model("hunpos/suc3_morphtable.words"),
                     morphtable_base: Model = Model("hunpos/suc.morphtable"),
                     morphtable_patterns: Model = Model("hunpos/suc.patterns"),
                     add_capitalized: bool = True,
                     add_lowercase: bool = False):
    """Create a morphtable file for use with Hunpos.

    A morphtable contains wordforms from SALDO's morphology (with accompanying tags) which are missing in SUC3.
    Since the morphtable is case sensitive, both the original form and a capitalized form
    are saved.

    Args:
        out (str, optional): Resulting morphtable file to be written.
            Defaults to ModelOutput("hunpos/saldo_suc-tags.morphtable").
        saldo_model (str, optional): Path to a pickled SALDO model.
            Defaults to Model("saldo/saldo.pickle").
        suc (str, optional): Tab-separated file with wordforms from SUC, containing: frequency, wordform, tag.
            Defaults to Model("hunpos/suc3_morphtable.words").
        morphtable_base (str, optional): Existing morphtable file, whose contents will be included in the new one.
            Defaults to Model("hunpos/suc.morphtable").
        morphtable_patterns (str, optional): Optional file with regular expressions.
            Defaults to Model("hunpos/suc.patterns").
        add_capitalized (bool, optional): Whether or not capitalized word forms should be added. Defaults to True.
        add_lowercase (bool, optional): Whether or not lower case word forms should be added. Defaults to False.
    """
    lex = saldo.SaldoLexicon(saldo_model.path)
    tags = defaultdict(set)

    # Get all wordforms from SALDO
    for word in list(lex.lexicon.keys()):
        words = lex.lookup(word)
        # Filter out multi word expressions
        words = [x for x in words if len(x[2]) == 0]
        if words:
            # Only use MSD not containing "-"
            for w in words:
                for msd in w[1]:
                    if "-" not in msd:
                        tags[word].add(msd)
                        if add_capitalized:
                            # Add a capitalized form of the word
                            capitalized = word[0].upper() + word[1:]
                            if not word == capitalized:
                                tags[capitalized].add(msd)
                        if add_lowercase:
                            # Add a lower case form of the word
                            lower = word.lower()
                            if not word == lower:
                                tags[lower].add(msd)

    # Read SUC words
    with open(suc.path, encoding="UTF-8") as suctags:
        for line in suctags:
            _, word, msd = line.strip("\n").split("\t")

            # Don't keep SALDO words already in SUC
            if word in tags:
                del tags[word]
            # If the word is not a name, and exists as lowercase in SALDO, remove it
            elif not msd.startswith("PM") and not word.lower(
            ) == word and word.lower() in tags:
                del tags[word.lower()]

    # Read regular expressions from pattern file
    pattern_list = []
    if morphtable_patterns:
        with open(morphtable_patterns.path, encoding="UTF-8") as pat:
            for line in pat:
                if line.strip() and not line.startswith("#"):
                    pattern_name, _, pattern_tags = line.strip().split("\t", 2)
                    pattern_list.append("[[%s]]\t%s\n" %
                                        (pattern_name, pattern_tags))

    with open(out.path, encoding="UTF-8", mode="w") as out:
        if morphtable_base:
            with open(morphtable_base.path, encoding="UTF-8") as base:
                for line in base:
                    out.write(line)

        for pattern in pattern_list:
            out.write(pattern)

        for word in sorted(tags):
            out.write("%s\t%s\n" % (word, "\t".join(tags[word])))