import math

import morfessor


def da_trainer(datapath):
    io = morfessor.MorfessorIO()

    train_data = list(io.read_corpus_file(datapath))

    # Three damping variants: word types (each word counts once),
    # log-scaled token counts, and raw token counts.
    model_types = morfessor.BaselineModel()
    model_logtokens = morfessor.BaselineModel()
    model_tokens = morfessor.BaselineModel()

    model_types.load_data(train_data, count_modifier=lambda x: 1)

    def log_func(x):
        return int(round(math.log(x + 1, 2)))

    model_logtokens.load_data(train_data, count_modifier=log_func)
    model_tokens.load_data(train_data)

    models = [model_types, model_logtokens, model_tokens]

    for i, model in enumerate(models):
        model.train_batch()
        io.write_binary_model_file("model" + str(i), model)
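
For illustration, one of the models saved above can be reloaded and applied to a new word (a minimal sketch; "model0" and the sample word are placeholders):

io = morfessor.MorfessorIO()
model = io.read_binary_model_file("model0")
segments, cost = model.viterbi_segment("uncommonly")
print(segments, cost)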
Example #2
import morfessor


def Base_SegModel(data, corpusweight):
    io = morfessor.MorfessorIO()
    train_data = list(io.read_corpus_file(data))
    # Type-based model: each distinct word counts once.
    model_types = morfessor.BaselineModel(corpusweight=corpusweight)
    model_types.load_data(train_data, count_modifier=lambda x: 1)
    model_types.train_batch()
    # Token-based model: every occurrence counts.
    model_tokens = morfessor.BaselineModel()
    model_tokens.load_data(train_data)
    model_tokens.train_batch()

    return model_types, model_tokens
Example #3
import morfessor


def _load_baseline():
    baseline = morfessor.BaselineModel()
    io = morfessor.MorfessorIO(encoding='latin-1')

    # REFERENCE_BASELINE_SEGMENTATION is a module-level constant holding the
    # path to a pre-computed reference segmentation file.
    baseline.load_segmentations(
        io.read_segmentation_file(REFERENCE_BASELINE_SEGMENTATION))
    return baseline
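
Once loaded, the reference model can segment words directly (a minimal sketch; the input word is an arbitrary placeholder):

baseline = _load_baseline()
segments, cost = baseline.viterbi_segment("segmentation")
print(segments)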
Example #4
import morfessor
import morfessor.baseline


def Base_SegModel(data, average_morph_length):
    io = morfessor.MorfessorIO()
    train_data = list(io.read_corpus_file(data))
    baseline_model = morfessor.BaselineModel(corpusweight=1.0)
    # Adjust the corpus weight during training so that the average morph
    # length approaches the requested target.
    updater = morfessor.baseline.MorphLengthCorpusWeight(average_morph_length)
    baseline_model.set_corpus_weight_updater(updater)
    baseline_model.load_data(train_data, count_modifier=lambda x: 1)
    baseline_model.train_batch()

    return baseline_model
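
A minimal usage sketch, with a placeholder corpus path and target morph length:

model = Base_SegModel("corpus.txt", average_morph_length=5.0)
print(model.viterbi_segment("lengthened")[0])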
Example #5
import collections
import lzma
import math
import os
import sys

import morfessor


def main(d):
    # parse_name() is defined elsewhere in the original module; it derives
    # the frequency threshold, corpus weight and damping scheme from the
    # directory name.
    freq, alpha, damp = parse_name(d)

    word_count = collections.Counter()
    parent_dir = os.path.dirname(d)
    for f in os.listdir(parent_dir):
        if f.endswith(".xz") and not f.startswith("dev") and not f.startswith(
                "eval") and not f.startswith("test"):
            print("Read {}".format(f), file=sys.stderr)
            for line in lzma.open(os.path.join(parent_dir, f),
                                  'rt',
                                  encoding='utf-8'):
                for word in line.strip().split():
                    word_count[word] += 1
    print("Corpora read", file=sys.stderr)
    allowed_chars = {
        line.strip()
        for line in open(os.path.join(parent_dir, 'allowed_chars'),
                         encoding='utf-8') if len(line.strip()) == 1
    }

    model = morfessor.BaselineModel(corpusweight=alpha)
    assert damp in {'types', 'tokens', 'logtokens'}
    # Frequency damping: 'types' counts each word once, 'logtokens' uses
    # log-scaled counts, and 'tokens' uses raw counts (count_modifier=None).
    damp_func = None
    if damp == 'types':
        damp_func = lambda x: 1
    elif damp == 'logtokens':
        damp_func = lambda x: int(round(math.log(x + 1, 2)))

    data = [(v, k) for k, v in word_count.items()
            if all(c in allowed_chars for c in k)]
    model.load_data(data, freqthreshold=freq, count_modifier=damp_func)
    model.train_batch()

    io = morfessor.MorfessorIO()
    io.write_binary_model_file(os.path.join(d, 'model.bin'), model)

    io.write_segmentation_file(os.path.join(d, 'model.txt'),
                               model.get_segmentations())

    s = set()  # morph vocabulary collected while writing the word map
    with open(os.path.join(d, 'wordmap'), 'w', encoding='utf-8') as outf:
        for k in word_count.keys():
            parts = model.viterbi_segment(k)[0]
            rparts = []
            for p in parts:
                if not all(c in allowed_chars for c in p):
                    p = '<UNK>'
                s.add(p)
                rparts.append(p)
            print("{}\t{}".format(k, " ".join(rparts)), file=outf)

    with open(os.path.join(d, 'vocab'), 'w', encoding='utf-8') as outf:
        for morph in s:
            print(morph, file=outf)
Example #6
    # Snippet from a class body; assumes `import morfessor` and
    # `import flatcat` (Morfessor FlatCat) at module level.
    def __init__(self,
                 corpus_weight: float = 1.0,
                 ppl_threshold: float = 100) -> None:
        # Force a split at every hyphen in both models.
        self.morfessor_model = morfessor.BaselineModel(
            corpusweight=corpus_weight, forcesplit_list=["-"])

        props = flatcat.MorphUsageProperties(ppl_threshold=ppl_threshold)
        self.flatcat_model = flatcat.FlatcatModel(props,
                                                  corpusweight=corpus_weight,
                                                  forcesplit=["-"],
                                                  ml_emissions_epoch=0)
        self.flatcat_model.postprocessing.append(
            flatcat.HeuristicPostprocessor())
Example #7
import morfessor


def train_seg(infile, outfile):
    io = morfessor.MorfessorIO()

    print("Open corpus file")
    train_data = list(io.read_corpus_file(infile))

    model_types = morfessor.BaselineModel()

    # Type-based counts: each distinct word counts once.
    model_types.load_data(train_data, count_modifier=lambda x: 1)

    print("Training data...")
    model_types.train_batch()

    print("Write bin file")
    io.write_binary_model_file(outfile, model_types)
Example #8
import morfessor


def train_model(input_file, output_file=None):

    # setup input and model objects
    morf_io = morfessor.MorfessorIO()
    morf_model = morfessor.BaselineModel()

    # build a corpus from input file
    train_data = morf_io.read_corpus_file(input_file)

    # load data into model; the optional "count_modifier" parameter sets
    # frequency dampening (by default, every token occurrence is counted)
    morf_model.load_data(train_data)

    # train the model in batch form (online training also available)
    morf_model.train_batch()

    # optionally pickle model
    if output_file is not None:
        morf_io.write_binary_model_file(output_file, morf_model)

    return morf_model
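
A minimal usage sketch (the file names are placeholders): train on a corpus, persist the model, and segment a new word:

model = train_model("corpus.txt", output_file="model.bin")
segments, cost = model.viterbi_segment("retraining")
print(" + ".join(segments))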
Example #9
import os

import morfessor


def train_morfessor(corpus, split_prob):
    """
    Train a Morfessor Baseline model.

    Lowercase the input text; use random skips for frequently seen compounds
    to speed up training; initialize new words by random splitting with the
    split probability split_prob.
    """

    io = morfessor.MorfessorIO(compound_separator=r"[^-\w]+", lowercase=True)

    train_data = list(
        io.read_corpus_file(os.path.join('data', 'corpora', corpus)))

    model_tokens = morfessor.BaselineModel(use_skips=True)

    model_tokens.load_data(train_data, init_rand_split=split_prob)

    model_tokens.train_batch()

    # Strip the 4-character file extension (e.g. ".txt") from the corpus name.
    io.write_binary_model_file(
        os.path.join('data', 'models', corpus[:-4] + '_morph'), model_tokens)

    return model_tokens
Example #10
import argparse
import pickle

import morfessor


def create_args():
    # The original snippet was truncated here; the arguments below are
    # reconstructed from how `args` is used in __main__.
    parser = argparse.ArgumentParser()
    parser.add_argument("--train", action='store_true', help='training mode')
    parser.add_argument("--train_data", help='training corpus path')
    parser.add_argument("--test_data", help='test corpus path')
    parser.add_argument("--output", help='output segmentation path')
    parser.add_argument("--save_model", help='saving model path')
    parser.add_argument("--load_model", help='loading model path')
    parser.add_argument("--min_count",
                        type=int,
                        default=5,
                        help='minimum count')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = create_args()
    if args.train:
        # training
        io = morfessor.MorfessorIO()
        train_data = list(io.read_corpus_file(args.train_data))
        model_tokens = morfessor.BaselineModel()
        model_tokens.load_data(train_data, freqthreshold=args.min_count)
        model_tokens.train_batch()
        with open(args.save_model, 'wb') as fout:
            pickle.dump(model_tokens, fout)
    else:
        # inference
        with open(args.load_model, 'rb') as fin:
            model_tokens = pickle.load(fin)
        # test file and training file are different
        with open(args.output, 'w', encoding='utf-8') as fout, \
                open(args.test_data, 'r', encoding='utf-8') as fin:
            for line in fin:
                line = line.strip()
                morph_list, score = model_tokens.viterbi_segment(line)
                morphs = ' '.join(morph_list)
                fout.write('{} {}\n'.format(line, morphs))