from soynlp.word import WordExtractor


def compute_soy_word_score(corpus_fname, model_fname):
    # Read the corpus: one sentence (title) per line.
    sentences = [sent.strip() for sent in open(corpus_fname, 'r').readlines()]
    # Train soynlp's unsupervised word extractor and save its statistics.
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0,
    )
    word_extractor.train(sentences)
    word_extractor.save(model_fname)
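# A minimal usage sketch (kept commented out, like the LTokenizer lines below),
# assuming WordExtractor.load() is the counterpart of the save() call above and
# using the same score formula as the main script: reload the saved extractor,
# combine cohesion and branching entropy into one score per word, and wrap the
# scores in an LTokenizer. The sample sentence is purely illustrative.
# import math
# from soynlp.tokenizer import LTokenizer
#
# loaded_extractor = WordExtractor()
# loaded_extractor.load(model_fname)
# word_scores = loaded_extractor.word_scores()
# scores = {w: s.cohesion_forward * math.exp(s.right_branching_entropy)
#           for w, s in word_scores.items()}
# tokenizer = LTokenizer(scores=scores)
# tokenizer.tokenize("예시 제목 문장")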
import math

import pandas as pd
from tensorflow.keras.preprocessing.text import text_to_word_sequence

data = pd.read_pickle('./backend/textengines/data/dc_data.pkl')
soynlp_model_fname = './backend/textengines/data/tokenizer_model/soyword.model'

sentences = data["title"].values

# Train the soynlp word extractor on the titles and save the model.
word_extractor = WordExtractor(
    min_frequency=100,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0
)
word_extractor.train(sentences)
word_extractor.save(soynlp_model_fname)

# Combine cohesion and branching entropy into a single word score:
# score(w) = cohesion_forward(w) * exp(right_branching_entropy(w)).
scores = word_extractor.word_scores()
scores = {
    key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy))
    for key in scores.keys()
}

# soyToken = LTokenizer(scores=scores)
# soyToken.tokenize(data["title"].values[0])

#############################################################################

# Dump one title per line so it can serve as the SentencePiece training corpus.
file = open("./backend/textengines/data/dc_title.txt", "w", encoding="utf-8")
for title in data["title"].values:
    file.write(title)
    file.write("\n")
file.close()

spm_train = """--input=./backend/textengines/data/dc_title.txt \
--model_prefix=sentencepice \
--vocab_size=32000 \