def test_single_sent():
    """Segment one hard-coded sample sentence and print every step.

    Loads the scoring model and the lexicon from fixed paths under
    ``../working_data``, builds the lattices for the sample sentence with
    ``gen_lattice``, decodes with ``viterbi_search`` and prints each decoded
    word followed by the whole segmented sentence.

    Every ``print`` is given a single pre-formatted string, so the call
    emits the same text whether it is parsed as a Python 2 print statement
    or as the Python 3 print function.
    """
    # parameters
    max_word_len = 15
    dummy_start, dummy_end = u'$START#', u'$END#'

    print('\nRunning test for exp_l2s_predict....')

    path_me_model = "../working_data/train.set1.i80.model"
    path_to_lexicon = "../working_data/train_testPredict.dict"

    # Another sample that can be swapped in: u"材 料 利 用 率 高".split()
    sent = u"下 雨 天 留 客 天 天 留 我 不 留".split()
    print(u"sample sentence is: " + u" - ".join(sent))

    #
    # loading maximum entropy model as the score function
    #
    print('\nInitializing maximum entropy model as the scoring model')
    model = ScoreModel(path_me_model)
    print('done')

    #
    # loading lexicon: every whitespace-separated token is one entry
    #
    print("\nLoading lexicion file...")
    # Mode is 'r', not 'rU': codecs.open always opens the file in binary mode
    # when an encoding is given, so 'U' never had an effect, and Python 3.11+
    # rejects it.
    with codecs.open(path_to_lexicon, 'r', 'utf-8') as f:
        lexicon = [word for line in f for word in line.split()]
    print(u"lexicion size= %d example word in lexicion: %s"
          % (len(lexicon), u" ".join(lexicon[:5])))
    lexicon = set(lexicon)

    print('\n====1 Bui latttice for the sample sentence=====')
    # Only the backward bigram lattice is passed on to the decoder below.
    forward_unigram_lattice, backward_bigram_lattice = gen_lattice(
        lexicon, sent, max_word_len, dummy_start)

    print('\n====2 Runing Viterbi search to decode====')
    best_index_seq = viterbi_search(model.score_it, backward_bigram_lattice,
                                    sent, dummy_end)

    # Debugging aids, disabled by default:
    #   b_lattic_display(backward_bigram_lattice)
    #   f_lattice_display(forward_unigram_lattice)

    # Each pair of neighbouring entries in best_index_seq is used as the
    # [start, end) slice bounds of one word in the sentence.
    segmented = []
    for start, end in zip(best_index_seq[:-1], best_index_seq[1:]):
        word = u"".join(sent[start:end])
        print(word)
        segmented.append(word)

    print(u'\nSegmented sent= ' + u" ".join(segmented))
def test_single_sent():
    """Segment one hard-coded sample sentence and print every step.

    Loads the scoring model and the lexicon from fixed paths under
    ``../working_data``, builds the lattices for the sample sentence with
    ``gen_lattice``, decodes with ``viterbi_search`` and prints each decoded
    word followed by the whole segmented sentence.

    Every ``print`` is given a single pre-formatted string, so the call
    emits the same text whether it is parsed as a Python 2 print statement
    or as the Python 3 print function.
    """
    # parameters
    max_word_len = 15
    dummy_start, dummy_end = u'$START#', u'$END#'

    print('\nRunning test for exp_l2s_predict....')

    path_me_model = "../working_data/train.set1.i80.model"
    path_to_lexicon = "../working_data/train_testPredict.dict"

    # Another sample that can be swapped in: u"材 料 利 用 率 高".split()
    sent = u"下 雨 天 留 客 天 天 留 我 不 留".split()
    print(u"sample sentence is: " + u" - ".join(sent))

    #
    # loading maximum entropy model as the score function
    #
    print('\nInitializing maximum entropy model as the scoring model')
    model = ScoreModel(path_me_model)
    print('done')

    #
    # loading lexicon: every whitespace-separated token is one entry
    #
    print("\nLoading lexicion file...")
    # Mode is 'r', not 'rU': codecs.open always opens the file in binary mode
    # when an encoding is given, so 'U' never had an effect, and Python 3.11+
    # rejects it.
    with codecs.open(path_to_lexicon, 'r', 'utf-8') as f:
        lexicon = [word for line in f for word in line.split()]
    print(u"lexicion size= %d example word in lexicion: %s"
          % (len(lexicon), u" ".join(lexicon[:5])))
    lexicon = set(lexicon)

    print('\n====1 Bui latttice for the sample sentence=====')
    # Only the backward bigram lattice is passed on to the decoder below.
    forward_unigram_lattice, backward_bigram_lattice = gen_lattice(
        lexicon, sent, max_word_len, dummy_start)

    print('\n====2 Runing Viterbi search to decode====')
    best_index_seq = viterbi_search(model.score_it, backward_bigram_lattice,
                                    sent, dummy_end)

    # Debugging aids, disabled by default:
    #   b_lattic_display(backward_bigram_lattice)
    #   f_lattice_display(forward_unigram_lattice)

    # Each pair of neighbouring entries in best_index_seq is used as the
    # [start, end) slice bounds of one word in the sentence.
    segmented = []
    for start, end in zip(best_index_seq[:-1], best_index_seq[1:]):
        word = u"".join(sent[start:end])
        print(word)
        segmented.append(word)

    print(u'\nSegmented sent= ' + u" ".join(segmented))
def main(path_corpus, path_me_model, path_to_lexicon, path_to_output):
    """Segment every line of a corpus file and write the result to a file.

    Args:
        path_corpus: path of the UTF-8 text file to segment. All whitespace
            inside a line is removed before the line is segmented.
        path_me_model: path handed to ``ScoreModel``; its ``score_it``
            method is passed to ``viterbi_search`` as the scoring function.
        path_to_lexicon: path of the UTF-8 lexicon file. Every
            whitespace-separated token becomes one lexicon entry.
        path_to_output: path of the UTF-8 file to write. It receives one
            segmented sentence per line, words separated by single spaces.

    Every ``print`` is given a single pre-formatted string, so the call
    emits the same text whether it is parsed as a Python 2 print statement
    or as the Python 3 print function.
    """
    max_word_len = 12
    dummy_start, dummy_end = u'$START#', u'$END#'

    #
    # loading maximum entropy model as the score function
    #
    print('\nInitializing maximum entropy model as the scoring model')
    model = ScoreModel(path_me_model)
    print('done')

    #
    # loading lexicon: every whitespace-separated token is one entry
    #
    print("\nLoading lexicion file...")
    # Mode is 'r', not 'rU': codecs.open always opens the file in binary mode
    # when an encoding is given, so 'U' never had an effect, and Python 3.11+
    # rejects it.
    with codecs.open(path_to_lexicon, 'r', 'utf-8') as f:
        lexicon = [word for line in f for word in line.split()]
    print(u"lexicion size= %d example word in lexicion: %s"
          % (len(lexicon), u" ".join(lexicon[:5])))
    lexicon = set(lexicon)

    print("\nLoading corpus to be segmented...")
    with codecs.open(path_corpus, 'r', 'utf-8') as f:
        # Drop all whitespace so each line becomes one unbroken string.
        raw_corpus = [u"".join(line.split()) for line in f]
    print('line count of raw_corpus= %d' % len(raw_corpus))
    # An empty corpus file has no first line; indexing it would raise
    # IndexError before anything is written.
    if raw_corpus:
        # The double space matches what the original print statement emitted.
        print(u'the first line is  ' + raw_corpus[0])

    print("\n\n====Segmenting the corpus======")
    segmented_corpus = []
    for sent in raw_corpus:
        # Only the backward bigram lattice is passed on to the decoder.
        forward_unigram_lattice, backward_bigram_lattice = gen_lattice(
            lexicon, sent, max_word_len, dummy_start)
        best_index_seq = viterbi_search(model.score_it,
                                        backward_bigram_lattice,
                                        sent, dummy_end)

        # Debugging aids, disabled by default:
        #   b_lattic_display(backward_bigram_lattice)
        #   f_lattice_display(forward_unigram_lattice)

        # Each pair of neighbouring entries in best_index_seq is used as
        # the [start, end) slice bounds of one word in the sentence.
        segmented = [u"".join(sent[start:end])
                     for start, end in zip(best_index_seq[:-1],
                                           best_index_seq[1:])]
        segmented_line = u" ".join(segmented)
        print(u'\nSegmented sent= ' + segmented_line)
        segmented_corpus.append(segmented_line)

    print("\nSegmentation done, writing it to file %s ..." % path_to_output)
    with codecs.open(path_to_output, 'w', 'utf-8') as f:
        for sent in segmented_corpus:
            f.write(sent + u'\n')
    print('done')

    print("Program exit.")
def main(path_corpus, path_me_model, path_to_lexicon, path_to_output):
    """Segment every line of a corpus file and write the result to a file.

    Args:
        path_corpus: path of the UTF-8 text file to segment. All whitespace
            inside a line is removed before the line is segmented.
        path_me_model: path handed to ``ScoreModel``; its ``score_it``
            method is passed to ``viterbi_search`` as the scoring function.
        path_to_lexicon: path of the UTF-8 lexicon file. Every
            whitespace-separated token becomes one lexicon entry.
        path_to_output: path of the UTF-8 file to write. It receives one
            segmented sentence per line, words separated by single spaces.

    Every ``print`` is given a single pre-formatted string, so the call
    emits the same text whether it is parsed as a Python 2 print statement
    or as the Python 3 print function.
    """
    max_word_len = 12
    dummy_start, dummy_end = u'$START#', u'$END#'

    #
    # loading maximum entropy model as the score function
    #
    print('\nInitializing maximum entropy model as the scoring model')
    model = ScoreModel(path_me_model)
    print('done')

    #
    # loading lexicon: every whitespace-separated token is one entry
    #
    print("\nLoading lexicion file...")
    # Mode is 'r', not 'rU': codecs.open always opens the file in binary mode
    # when an encoding is given, so 'U' never had an effect, and Python 3.11+
    # rejects it.
    with codecs.open(path_to_lexicon, 'r', 'utf-8') as f:
        lexicon = [word for line in f for word in line.split()]
    print(u"lexicion size= %d example word in lexicion: %s"
          % (len(lexicon), u" ".join(lexicon[:5])))
    lexicon = set(lexicon)

    print("\nLoading corpus to be segmented...")
    with codecs.open(path_corpus, 'r', 'utf-8') as f:
        # Drop all whitespace so each line becomes one unbroken string.
        raw_corpus = [u"".join(line.split()) for line in f]
    print('line count of raw_corpus= %d' % len(raw_corpus))
    # An empty corpus file has no first line; indexing it would raise
    # IndexError before anything is written.
    if raw_corpus:
        # The double space matches what the original print statement emitted.
        print(u'the first line is  ' + raw_corpus[0])

    print("\n\n====Segmenting the corpus======")
    segmented_corpus = []
    for sent in raw_corpus:
        # Only the backward bigram lattice is passed on to the decoder.
        forward_unigram_lattice, backward_bigram_lattice = gen_lattice(
            lexicon, sent, max_word_len, dummy_start)
        best_index_seq = viterbi_search(model.score_it,
                                        backward_bigram_lattice,
                                        sent, dummy_end)

        # Debugging aids, disabled by default:
        #   b_lattic_display(backward_bigram_lattice)
        #   f_lattice_display(forward_unigram_lattice)

        # Each pair of neighbouring entries in best_index_seq is used as
        # the [start, end) slice bounds of one word in the sentence.
        segmented = [u"".join(sent[start:end])
                     for start, end in zip(best_index_seq[:-1],
                                           best_index_seq[1:])]
        segmented_line = u" ".join(segmented)
        print(u'\nSegmented sent= ' + segmented_line)
        segmented_corpus.append(segmented_line)

    print("\nSegmentation done, writing it to file %s ..." % path_to_output)
    with codecs.open(path_to_output, 'w', 'utf-8') as f:
        for sent in segmented_corpus:
            f.write(sent + u'\n')
    print('done')

    print("Program exit.")