Example 1
0
def evaluate_goldstandard(gs_file, trained_model_bin, feature_dict_json, affix_list):
    """Evaluate a stored Morfessor model against a gold-standard file.

    :param gs_file: path to the gold-standard segmentation file
    :param trained_model_bin: path to the pickled (binary) trained model
    :param feature_dict_json: path to the JSON feature dictionary produced
        by InfixerModel
    :param affix_list: affixes forwarded to InfixerEvaluation
    """

    # get stored model and feature dict
    # compound_separator is a regex, so use a raw string: plain '\s+' is an
    # invalid escape sequence (SyntaxWarning on Python 3.12+)
    io = MorfessorIO(encoding='utf-8', compound_separator=r'\s+',
                     atom_separator=None, lowercase=False)
    trained_model = io.read_binary_model_file(trained_model_bin)
    feature_dict = InfixerModel.get_features_dict_from_file(feature_dict_json)

    # eval and segment to file
    evaluator = InfixerEvaluation(trained_model, feature_dict, affix_list)
    evaluator.evaluate_model(gs_file)
Example 2
0
def evaluate_goldstandard(gs_file, trained_model_bin, feature_dict_json,
                          affix_list):
    """Evaluate a stored Morfessor model against a gold-standard file.

    :param gs_file: path to the gold-standard segmentation file
    :param trained_model_bin: path to the pickled (binary) trained model
    :param feature_dict_json: path to the JSON feature dictionary produced
        by InfixerModel
    :param affix_list: affixes forwarded to InfixerEvaluation
    """

    # get stored model and feature dict
    # compound_separator is a regex, so use a raw string: plain '\s+' is an
    # invalid escape sequence (SyntaxWarning on Python 3.12+)
    io = MorfessorIO(encoding='utf-8',
                     compound_separator=r'\s+',
                     atom_separator=None,
                     lowercase=False)
    trained_model = io.read_binary_model_file(trained_model_bin)
    feature_dict = InfixerModel.get_features_dict_from_file(feature_dict_json)

    # eval and segment to file
    evaluator = InfixerEvaluation(trained_model, feature_dict, affix_list)
    evaluator.evaluate_model(gs_file)
Example 3
0
    def __init__(self, morfessor_model, feature_dict, affix_list):
        """Set up an evaluation object from a trained model and its metadata.

        :param morfessor_model: a trained Morfessor Baseline object
        :param feature_dict: the output dictionary from ModelBuilder object
        :param affix_list: affixes used to construct the AffixFilter
        """
        # keep references to the trained model and its feature dictionary
        self._model = morfessor_model
        self._feature_dict = feature_dict

        # filter built from the supplied affixes
        self._affix_filter = AffixFilter(affix_list)

        # Morfessor's own IO helper, used for reading/writing model files
        self._io_manager = MorfessorIO()
Example 4
0
if __name__ == "__main__":
    # CLI entry point: loads a pre-trained Morfessor model and an input
    # corpus, then builds the word-level vocabulary used downstream.
    # NOTE(review): `config`, `seq_length`, `numpy`, `codecs`, and
    # `MorfessorIO` are assumed to be defined/imported elsewhere in the file.
    parser = argparse.ArgumentParser()
    parser.add_argument("-mf",
                        required=True,
                        help="The morfessor model to use",
                        dest="morf_model")
    parser.add_argument("input", nargs=1, help="Input text file")
    options = parser.parse_args()

    # Load config parameters
    # NOTE(review): injects every key of `config` into module scope (this
    # works at module level where locals() is globals(), e.g. it provides
    # `seq_length` below) — fragile; confirm `config` is a dict defined
    # elsewhere in this file.
    locals().update(config)
    numpy.random.seed(0)  # fixed seed for reproducibility

    # Read in the morfessor model (binary/pickled file written by MorfessorIO)
    morf_segmenter = MorfessorIO().read_binary_model_file(options.morf_model)

    # Whitespace-tokenize the entire input file
    with codecs.open(options.input[0], 'r', 'utf-8') as f:
        data = f.read().split()

    # Trim the token list so len(data) - 1 is a multiple of seq_length;
    # the extra token presumably serves as the final prediction target —
    # TODO confirm against the training loop that consumes `data`.
    if len(data) % seq_length > 0:
        data = data[:len(data) - len(data) % seq_length + 1]
    else:
        data = data[:len(data) - seq_length + 1]
    nsamples = len(data) // seq_length  # number of full sequences

    # Read in word-level data: vocabulary and word<->index lookup tables
    words = set(data)
    vocab_size = len(words)
    word_to_ix = {word: i for i, word in enumerate(words)}
    ix_to_word = {i: word for i, word in enumerate(words)}
Example 5
0
def morfessor_main(train_files, dampening, cycle, save_file=None):
    """Calls an implementation of the Morfessor model.

    Runs online training over the corpus files, refines the same model with
    batch training, then optionally pickles it to disk.

    :param dampening: 'none', 'ones', or 'log'
    :param train_files: input files for model training
    :param cycle: from {'init', 'test', 'final'}; appended to save_file when
        writing the binary model
    :param save_file: base name of output files (if needed)
    :return: trained morfessor.BaselineModel
    :raises ArgumentException: if `dampening` is not a recognized type
    """

    # define input variables normally input at command line
    # all arguments are equal to their args.item equivalent in original
    # script's main()

    trainfiles = train_files  # input files for training
    progress = True  # show progress bar
    encoding = 'utf-8'  # if None, tries UTF-8 and/or local encoding
    cseparator = r'\s+'  # regex separator for compound segmentation
    # (raw string: plain '\s' is an invalid escape sequence)
    separator = None  # separator for atom segmentation
    lowercase = False  # makes all inputs lowercase
    forcesplit = ['-']  # list of chars to force a split on
    corpusweight = 1.0  # load annotation data for tuning the corpus weight param
    skips = False  # use random skips for frequently seen compounds to speed up training
    nosplit = None  # if the expression matches the two surrounding characters, do not allow splitting
    algorithm = 'recursive'  # 'recursive' or 'viterbi'
    finish_threshold = 0.005  # train stops when the improvement of last iteration is smaller than this
    maxepochs = None  # ceiling on number of training epochs
    develannots = None  # dev-data annotations (None = not used)
    splitprob = None  # initialize new words by random split using given probability
    epochinterval = 10000  # epoch interval for online training
    algparams = ()  # no extra parameters needed for the 'recursive' algorithm

    # Progress bar handling (module-level flag read by the progress-bar code)
    global show_progress_bar
    show_progress_bar = progress

    # build I/O and model
    io = MorfessorIO(encoding=encoding,
                     compound_separator=cseparator,
                     atom_separator=separator,
                     lowercase=lowercase)

    model = BaselineModel(forcesplit_list=forcesplit,
                          corpusweight=corpusweight,
                          use_skips=skips,
                          nosplit_re=nosplit)

    # Set frequency dampening function (plain defs instead of lambda
    # assignment, per PEP 8 E731)
    if dampening == 'none':
        dampfunc = None
    elif dampening == 'log':
        def dampfunc(count):
            return int(round(math.log(count + 1, 2)))
    elif dampening == 'ones':
        def dampfunc(count):
            return 1
    else:
        raise ArgumentException("unknown dampening type '%s'" % dampening)

    # for use when building a new model or doing online training
    # this is the online+batch training model
    if len(trainfiles) > 0:

        time_start = time.time()

        data = io.read_corpus_files(trainfiles)
        # online pass first, then a batch pass refines the same model;
        # the batch pass's epochs/cost are the values reported below
        epochs, total_cost = model.train_online(data, dampfunc, epochinterval,
                                                algorithm, algparams,
                                                splitprob, maxepochs)
        epochs, total_cost = model.train_batch(algorithm, algparams,
                                               develannots, finish_threshold,
                                               maxepochs)
        _logger.info("Epochs: %s" % epochs)

        time_end = time.time()
        _logger.info("Final cost: %s" % total_cost)
        _logger.info("Training time: %.3fs" % (time_end - time_start))

    else:
        _logger.warning("No training data files specified.")

    # if save file is present, write binary model to file
    if isinstance(save_file, str):
        outfile_bin = save_file + cycle + "_bin"
        io.write_binary_model_file(outfile_bin, model)

    # return model object for further manipulation
    return model
Example 6
0
def morfessor_main(train_files, dampening, cycle, save_file=None):
    """Calls an implementation of the Morfessor model.

    Runs online training over the corpus files, refines the same model with
    batch training, then optionally pickles it to disk.

    :param dampening: 'none', 'ones', or 'log'
    :param train_files: input files for model training
    :param cycle: from {'init', 'test', 'final'}
    :param save_file: base name of output files (if needed)
    :return: trained morfessor.BaselineModel
    :raises ArgumentException: if `dampening` is not a recognized type
    """

    # define input variables normally input at command line
    # all arguments are equal to their args.item equivalent in original
    # script's main()

    trainfiles = train_files    # input files for training
    progress = True             # show progress bar
    encoding = 'utf-8'          # if None, tries UTF-8 and/or local encoding
    cseparator = r'\s+'         # regex separator for compound segmentation
                                # (raw string: '\s' is an invalid escape)
    separator = None            # separator for atom segmentation
    lowercase = False           # makes all inputs lowercase
    forcesplit = ['-']          # list of chars to force a split on
    corpusweight = 1.0          # load annotation data for tuning the corpus weight param
    skips = False               # use random skips for frequently seen compounds to speed up training
    nosplit = None              # if the expression matches the two surrounding characters, do not allow splitting
    algorithm = 'recursive'     # 'recursive' or 'viterbi'
    finish_threshold = 0.005    # train stops when the improvement of last iteration is smaller than this
    maxepochs = None            # ceiling on number of training epochs
    develannots = None          # dev-data annotations (None = not used)
    splitprob = None            # initialize new words by random split using given probability
    epochinterval = 10000       # epoch interval for online training
    algparams = ()              # no extra parameters needed for the 'recursive' algorithm

    # Progress bar handling (module-level flag read by the progress-bar code)
    global show_progress_bar
    show_progress_bar = progress

    # build I/O and model
    io = MorfessorIO(encoding=encoding,
                     compound_separator=cseparator,
                     atom_separator=separator,
                     lowercase=lowercase)

    model = BaselineModel(forcesplit_list=forcesplit,
                          corpusweight=corpusweight,
                          use_skips=skips,
                          nosplit_re=nosplit)

    # Set frequency dampening function (plain defs instead of lambda
    # assignment, per PEP 8 E731)
    if dampening == 'none':
        dampfunc = None
    elif dampening == 'log':
        def dampfunc(count):
            return int(round(math.log(count + 1, 2)))
    elif dampening == 'ones':
        def dampfunc(count):
            return 1
    else:
        raise ArgumentException("unknown dampening type '%s'" % dampening)

    # for use when building a new model or doing online training
    # this is the online+batch training model
    if len(trainfiles) > 0:

        time_start = time.time()

        data = io.read_corpus_files(trainfiles)
        # online pass first, then a batch pass refines the same model;
        # the batch pass's epochs/cost are the values reported below
        epochs, total_cost = model.train_online(data, dampfunc, epochinterval,
                                                algorithm, algparams,
                                                splitprob, maxepochs)
        epochs, total_cost = model.train_batch(algorithm, algparams, develannots,
                                               finish_threshold, maxepochs)
        _logger.info("Epochs: %s" % epochs)

        time_end = time.time()
        _logger.info("Final cost: %s" % total_cost)
        _logger.info("Training time: %.3fs" % (time_end - time_start))

    else:
        _logger.warning("No training data files specified.")

    # if save file is present, write binary model to file
    if isinstance(save_file, str):
        # NOTE(review): unlike the sibling implementation, `cycle` is NOT
        # part of the output filename here, so successive cycles overwrite
        # the same file — confirm whether this is intended.
        outfile_bin = save_file + "_bin"
        io.write_binary_model_file(outfile_bin, model)

    # return model object for further manipulation
    return model
"""
Trains a morfessor morphological segmenter
for use in the morphological RNN
"""
from morfessor.io import MorfessorIO
from morfessor.baseline import BaselineModel

import argparse
import subprocess
import os

if __name__ == "__main__":
    # Parse command-line options: a training corpus and an output basename.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--training-data",
                            required=True,
                            dest="training_data",
                            help="Corpus to train Morfessor on")
    arg_parser.add_argument("--output",
                            dest="output",
                            default="./morfessor_model",
                            help="Output filename for the Morfessor model")
    args = arg_parser.parse_args()

    # Train a baseline Morfessor model on the corpus.
    io_handler = MorfessorIO()
    corpus_words = io_handler.read_corpus_file(args.training_data)
    segmenter = BaselineModel()
    # count_modifier maps every count to 1: train on types, not tokens.
    segmenter.load_data(corpus_words, count_modifier=lambda x: 1)
    segmenter.train_batch()

    # Write the trained model next to the requested output name, tagged
    # with the corpus filename.
    corpus_tag = os.path.basename(args.training_data)
    io_handler.write_binary_model_file(args.output + "-" + corpus_tag + ".bin",
                                       segmenter)