def compute_soy_word_score(corpus_fname, model_fname):
    """Train a ``WordExtractor`` on a line-per-sentence corpus and save it.

    Args:
        corpus_fname: Path to a text file; every line (with surrounding
            whitespace stripped) is used as one training sentence.
        model_fname: Destination path handed to ``WordExtractor.save``.

    Returns:
        None. The only result is the model file written by ``save``.
    """
    # Use a context manager so the corpus handle is closed deterministically,
    # even if reading fails part-way through.
    # NOTE(review): the file is still opened with the platform default
    # encoding, as before -- confirm the corpus encoding and pass `encoding=`
    # explicitly if it is known (e.g. UTF-8).
    with open(corpus_fname, 'r') as corpus_file:
        sentences = [line.strip() for line in corpus_file]

    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.train(sentences)
    word_extractor.save(model_fname)
# Example #2
# 0
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# Destination of the trained word-extractor model.
soynlp_model_fname = './backend/textengines/data/tokenizer_model/soyword.model'

# Load the pickled dataset; its "title" column supplies the training sentences.
data = pd.read_pickle('./backend/textengines/data/dc_data.pkl')
sentences = data["title"].values

word_extractor = WordExtractor(min_frequency=100, min_cohesion_forward=0.05, min_right_branching_entropy=0.0)
word_extractor.train(sentences)
word_extractor.save(soynlp_model_fname)

# Reduce each word's statistics to a single number:
# cohesion_forward * exp(right_branching_entropy).
scores = {
    word: stats.cohesion_forward * math.exp(stats.right_branching_entropy)
    for word, stats in word_extractor.word_scores().items()
}
# soyToken = LTokenizer(scores=scores)
# soyToken.tokenize(data["title"].values[0])
#############################################################################
# Dump every title to a plain-text file, one title per line.
# The `with` block guarantees the handle is flushed and closed even when a
# write raises (e.g. a non-string title), which the manual open()/close()
# pair did not.
with open("./backend/textengines/data/dc_title.txt", "w", encoding="utf-8") as file:
    for title in data["title"].values:
        file.write(title)
        file.write("\n")

spm_train = """--input=./backend/textengines/data/dc_title.txt \
               --model_prefix=sentencepice \
               --vocab_size=32000 \