def da_trainer(datapath):
    """Train three Morfessor Baseline models with different frequency
    dampening schemes and write each to disk.

    Models trained, in order:
      0. types     — every distinct word counts once
      1. logtokens — counts dampened to log2(count + 1)
      2. tokens    — raw token counts

    Each trained model is written as a binary file "model0", "model1",
    "model2" in the current working directory.

    :param datapath: path to the corpus file read by MorfessorIO
    """
    io = morfessor.MorfessorIO()
    train_data = list(io.read_corpus_file(datapath))

    model_types = morfessor.BaselineModel()
    model_logtokens = morfessor.BaselineModel()
    model_tokens = morfessor.BaselineModel()

    # Type-based dampening: each word contributes a count of 1.
    model_types.load_data(train_data, count_modifier=lambda x: 1)

    def log_func(x):
        # Log-token dampening: log2(count + 1), rounded to nearest int.
        return int(round(math.log(x + 1, 2)))

    model_logtokens.load_data(train_data, count_modifier=log_func)
    # Token counts left as-is (default).
    model_tokens.load_data(train_data)

    models = [model_types, model_logtokens, model_tokens]
    # enumerate replaces the original manual `i = 0 ... i += 1` counter.
    for i, model in enumerate(models):
        model.train_batch()
        io.write_binary_model_file("model" + str(i), model)
def Base_SegModel(data, corpusweight):
    """Train two Morfessor Baseline models on the corpus at *data*.

    The first model uses type-based dampening (each distinct word counts
    once) with the supplied corpus weight; the second uses raw token
    counts with default settings.

    :returns: (types_model, tokens_model)
    """
    reader = morfessor.MorfessorIO()
    corpus = list(reader.read_corpus_file(data))

    # Type dampening at the requested corpus weight.
    types_model = morfessor.BaselineModel(corpusweight=corpusweight)
    types_model.load_data(corpus, count_modifier=lambda _count: 1)
    types_model.train_batch()

    # Plain token counts, default corpus weight.
    tokens_model = morfessor.BaselineModel()
    tokens_model.load_data(corpus)
    tokens_model.train_batch()

    return types_model, tokens_model
def _load_baseline():
    """Load the reference baseline segmentation into a fresh BaselineModel.

    Reads REFERENCE_BASELINE_SEGMENTATION (module-level constant) with
    latin-1 encoding and returns the populated model.
    """
    reader = morfessor.MorfessorIO(encoding='latin-1')
    model = morfessor.BaselineModel()
    segmentations = reader.read_segmentation_file(
        REFERENCE_BASELINE_SEGMENTATION)
    model.load_segmentations(segmentations)
    return model
def Base_SegModel(data, average_morph_length):
    """Train a type-dampened Baseline model with an adaptive corpus weight.

    The corpus weight starts at 1.0 and is adjusted during training by a
    MorphLengthCorpusWeight updater so that the average morph length
    approaches *average_morph_length*.

    :returns: the trained BaselineModel
    """
    reader = morfessor.MorfessorIO()
    corpus = list(reader.read_corpus_file(data))

    model = morfessor.BaselineModel(corpusweight=1.0)
    # Let Morfessor tune the corpus weight toward the target morph length.
    model.set_corpus_weight_updater(
        morfessor.baseline.MorphLengthCorpusWeight(average_morph_length))

    # Type dampening: each distinct word counts once.
    model.load_data(corpus, count_modifier=lambda _count: 1)
    model.train_batch()
    return model
def main(d):
    """Train a Morfessor Baseline model for experiment directory *d*.

    The directory name encodes (frequency threshold, corpus weight alpha,
    dampening scheme) via parse_name().  Word counts come from every
    ``*.xz`` corpus next to *d* whose name does not start with "dev",
    "eval" or "test".  Words containing characters outside the alphabet
    listed in ``<parent>/allowed_chars`` are dropped before training.

    Outputs written into *d*:
      - model.bin : binary (pickled) model
      - model.txt : segmentation file
      - wordmap   : word -> space-joined morphs (<UNK> for bad morphs)
      - vocab     : set of emitted morphs, one per line

    Fixes vs. original: compressed corpora and the allowed_chars file are
    now closed via ``with`` (they were leaked), and the dampening function
    is a named ``def`` instead of an assigned lambda.
    """
    freq, alpha, damp = parse_name(d)
    parent_dir = os.path.dirname(d)

    word_count = _count_words(parent_dir)
    print("Corpora read", file=sys.stderr)

    allowed_chars = _read_allowed_chars(parent_dir)

    assert damp in {'types', 'tokens', 'logtokens'}
    if damp == 'types':
        def damp_func(x):
            # Each distinct word counts once.
            return 1
    elif damp == 'logtokens':
        def damp_func(x):
            # Dampen counts to log2(count + 1).
            return int(round(math.log(x + 1, 2)))
    else:
        # 'tokens': leave raw counts untouched.
        damp_func = None

    model = morfessor.BaselineModel(corpusweight=alpha)
    # Keep only words made entirely of allowed characters.
    data = [(v, k) for k, v in word_count.items()
            if all(c in allowed_chars for c in k)]
    model.load_data(data, freq, damp_func)
    model.train_batch()

    io = morfessor.MorfessorIO()
    io.write_binary_model_file(os.path.join(d, 'model.bin'), model)
    io.write_segmentation_file(os.path.join(d, 'model.txt'),
                               model.get_segmentations())

    emitted_morphs = set()
    with open(os.path.join(d, 'wordmap'), 'w', encoding='utf-8') as outf:
        for word in word_count.keys():
            parts = model.viterbi_segment(word)[0]
            rparts = []
            for p in parts:
                # Morphs containing disallowed characters map to <UNK>.
                if not all(c in allowed_chars for c in p):
                    p = '<UNK>'
                emitted_morphs.add(p)
                rparts.append(p)
            print("{}\t{}".format(word, " ".join(rparts)), file=outf)

    with open(os.path.join(d, 'vocab'), 'w', encoding='utf-8') as outf:
        for morph in emitted_morphs:
            print(morph, file=outf)


def _count_words(parent_dir):
    """Count word tokens over every training .xz corpus in *parent_dir*."""
    word_count = collections.Counter()
    for f in os.listdir(parent_dir):
        # Skip non-corpus files and held-out splits.
        if not f.endswith(".xz") or f.startswith(("dev", "eval", "test")):
            continue
        print("Read {}".format(f), file=sys.stderr)
        # `with` closes each compressed stream (previously leaked).
        with lzma.open(os.path.join(parent_dir, f), 'rt',
                       encoding='utf-8') as fin:
            for line in fin:
                word_count.update(line.strip().split())
    return word_count


def _read_allowed_chars(parent_dir):
    """Read the single-character alphabet from parent_dir/allowed_chars."""
    with open(os.path.join(parent_dir, 'allowed_chars'),
              encoding='utf-8') as fin:
        return {line.strip() for line in fin if len(line.strip()) == 1}
def __init__(self,
             corpus_weight: float = 1.0,
             ppl_threshold: float = 100) -> None:
    """Build paired Morfessor Baseline and FlatCat models.

    Both models share the same corpus weight and force splits at "-".
    The FlatCat model gets a heuristic postprocessor appended.

    :param corpus_weight: corpus likelihood weight for both models
    :param ppl_threshold: perplexity threshold for FlatCat morph usage
    """
    self.morfessor_model = morfessor.BaselineModel(
        corpusweight=corpus_weight,
        forcesplit_list=["-"])

    usage_props = flatcat.MorphUsageProperties(ppl_threshold=ppl_threshold)
    self.flatcat_model = flatcat.FlatcatModel(
        usage_props,
        corpusweight=corpus_weight,
        forcesplit=["-"],
        ml_emissions_epoch=0)
    self.flatcat_model.postprocessing.append(
        flatcat.HeuristicPostprocessor())
def train_seg(infile, outfile):
    """Train a type-dampened Morfessor Baseline model and save it.

    Reads the corpus from *infile*, trains with type-based dampening
    (each distinct word counts once), and writes the binary model to
    *outfile*.

    Fix vs. original: removed the dead inner ``log_func`` — it was
    defined but never passed to ``load_data``.
    """
    io = morfessor.MorfessorIO()
    print("Open corpus file")
    train_data = list(io.read_corpus_file(infile))

    model_types = morfessor.BaselineModel()
    # Type dampening: every word contributes a count of 1.
    model_types.load_data(train_data, count_modifier=lambda x: 1)

    print("Training data...")
    model_types.train_batch()

    print("Write bin file")
    io.write_binary_model_file(outfile, model_types)
def train_model(input_file, output_file=None):
    """Train a Morfessor Baseline model on *input_file*.

    Tokens count at raw frequency (no dampening — the optional
    ``count_modifier`` of ``load_data`` is left at its default).  If
    *output_file* is given the trained model is also pickled there.

    :returns: the trained BaselineModel
    """
    morf_io = morfessor.MorfessorIO()
    morf_model = morfessor.BaselineModel()

    # Feed the corpus reader straight into the model.
    morf_model.load_data(morf_io.read_corpus_file(input_file))

    # Batch training; Morfessor also supports online training.
    morf_model.train_batch()

    # Optionally persist the model as a binary file.
    if output_file is not None:
        morf_io.write_binary_model_file(output_file, morf_model)

    return morf_model
def train_morfessor(corpus, split_prob):
    """Train a Morfessor Baseline model on a named corpus file.

    The input text is lowercased; random skips of frequently seen
    compounds speed up training; new words are initialised by random
    splitting with probability *split_prob*.  The trained model is
    written to data/models/<corpus-stem>_morph and returned.
    """
    reader = morfessor.MorfessorIO(compound_separator=r"[^-\w]+",
                                   lowercase=True)
    corpus_path = os.path.join('data', 'corpora', corpus)
    training_words = list(reader.read_corpus_file(corpus_path))

    model = morfessor.BaselineModel(use_skips=True)
    model.load_data(training_words, init_rand_split=split_prob)
    model.train_batch()

    # Drop the 4-character extension from the corpus name for the model file.
    model_path = os.path.join('data', 'models', corpus[:-4] + '_morph')
    reader.write_binary_model_file(model_path, model)
    return model
help='loading model path')
    parser.add_argument("--min_count", type=int, default=5,
                        help='minimum count')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = create_args()
    # NOTE(review): `== True` is redundant; prefer `if args.train:` — confirm
    # the flag's type before changing.
    if args.train == True:
        # training: build a Baseline model from the corpus and pickle it
        io = morfessor.MorfessorIO()
        train_data = list(io.read_corpus_file(args.train_data))
        model_tokens = morfessor.BaselineModel()
        # Words rarer than --min_count are dropped before training.
        model_tokens.load_data(train_data, freqthreshold=args.min_count)
        model_tokens.train_batch()
        with open(args.save_model, 'wb') as fout:
            pickle.dump(model_tokens, fout)
    else:
        # inference: load a pickled model and segment each input line
        with open(args.load_model, 'rb') as fin:
            model_tokens = pickle.load(fin)
        # test file and training file are different
        with open(args.output, 'w') as fout, open(args.test_data, 'r') as fin:
            for line in fin:
                line = line.strip()
                # viterbi_segment returns (morph_list, score); score is unused.
                morph_list, score = model_tokens.viterbi_segment(line)
                morphs = ' '.join(morph_list)
                fout.write('{} {}\n'.format(line, morphs))