Beispiel #1
0
def build_tokenizer():
    """
    Train soynlp word scores on the whole Korean corpus and pickle them.

    Reads ``data/train_soynlp.csv`` (relative to the current working
    directory), trains a ``WordExtractor`` on the text cells of its
    ``korean`` column and writes a ``{word: cohesion_forward}`` dict to
    ``pickles/tokenizer.pickle``. The ``pickles`` directory must already
    exist.

    Returns:
        None
    """
    print('Now building soy-nlp tokenizer . . .')

    train_file = Path().cwd() / 'data' / 'train_soynlp.csv'
    df = pd.read_csv(train_file, encoding='utf-8')

    # Non-text cells (e.g. NaN read from an empty field) are skipped.
    kor_lines = [text for text in df['korean'] if isinstance(text, str)]

    word_extractor = WordExtractor(min_frequency=5)
    word_extractor.train(kor_lines)

    word_scores = word_extractor.extract()
    # Only the forward cohesion of every extracted word is persisted.
    cohesion_scores = {
        word: score.cohesion_forward
        for word, score in word_scores.items()
    }

    with open('pickles/tokenizer.pickle', 'wb') as pickle_out:
        pickle.dump(cohesion_scores, pickle_out)
def compute_soy_word_score(corpus_fname, model_fname):
    """
    Train a soynlp WordExtractor on a text corpus and save the model.

    Args:
        corpus_fname: path of a text file with one sentence per line
        model_fname: path the trained extractor is saved to

    Returns:
        None
    """
    # The context manager closes the corpus file even if reading fails.
    with open(corpus_fname, 'r') as corpus_file:
        sentences = [sent.strip() for sent in corpus_file]
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.train(sentences)
    word_extractor.save(model_fname)
Beispiel #3
0
def pmi_test(corpus_path):
    """
    Smoke-test the soynlp PMI computation on a corpus file.

    Trains a WordExtractor, builds an LTokenizer from the first component
    of every cohesion score, computes a word-context matrix and its PMI,
    and prints the entries ranked 100..109 by descending PMI.

    Args:
        corpus_path: path handed to DoublespaceLineCorpus
    """
    print('PMI test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    sentences = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(sentences)))

    extractor = WordExtractor()
    extractor.train(sentences)

    first_cohesions = {}
    for word, score in extractor.all_cohesion_scores().items():
        first_cohesions[word] = score[0]
    l_tokenizer = LTokenizer(first_cohesions)
    print('trained l tokenizer')

    matrix, idx2vocab = sent_to_word_contexts_matrix(
        sentences,
        windows=3,
        min_tf=10,
        tokenizer=l_tokenizer,  # (default) lambda x:x.split(),
        dynamic_weight=False,
        verbose=True)

    pmi_dok = pmi(matrix, min_pmi=0, alpha=0.0001, verbose=True)

    # Highest PMI first; only ranks 100..109 are shown.
    ranked = sorted(pmi_dok.items(), key=lambda item: -item[1])
    for (row, col), value in ranked[100:110]:
        words = (idx2vocab[row], idx2vocab[col])
        print('pmi {} = {:.3f}'.format(words, value))
    print('computed PMI')
def data_tokenize(news_title, tdm_vocab):
    """
    Tokenize news titles and keep only tokens found in a vocabulary.

    A WordExtractor is trained on ``news_title`` and its forward cohesion
    scores drive an LTokenizer. Every title is cleaned with the ``test``
    helper (defined elsewhere in the project) and tokenized with
    ``flatten=False``; the first element of each resulting item is kept
    when it is in ``tdm_vocab``.

    Args:
        news_title: re-iterable collection of title strings (it is
            traversed once for training and once for tokenizing)
        tdm_vocab: container of accepted tokens

    Returns:
        list of token lists, one per title, in input order
    """
    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0)

    word_extractor.train(news_title)
    words = word_extractor.extract()

    cohesion_score = {
        word: score.cohesion_forward
        for word, score in words.items()
    }
    tokenizer = LTokenizer(scores=cohesion_score)

    cluster_data = []
    for title in news_title:
        pairs = tokenizer.tokenize(test(title), flatten=False)
        cluster_data.append(
            [pair[0] for pair in pairs if pair[0] in tdm_vocab])

    return cluster_data
Beispiel #5
0
class SoyNLPTokenizer(BaseTokenizer):
    """
    Tokenize text using MaxScoreTokenizer of SoyNLP.

    ``fit`` (or ``load_state_dict``) must be called before ``tokenize``;
    until then ``self.tokenizer`` is ``None``.

    ``self.scores`` is kept as a list of ``(word, score)`` pairs so the
    state-dict format stays unchanged; it is converted to a mapping only
    when the MaxScoreTokenizer is built.
    """
    def __init__(self):
        self.tokenizer = None
        self.scores = list()
        # NOTE(review): other soynlp call sites name this keyword
        # `min_frequency`; confirm `min_count` matches the installed version.
        self.word_extractor = WordExtractor(min_count=100,
                                            min_cohesion_forward=0.05,
                                            min_right_branching_entropy=0.0)

    def _build_tokenizer(self):
        """Create the MaxScoreTokenizer from the stored score pairs."""
        # MaxScoreTokenizer looks scores up by word, so it needs a mapping
        # rather than the list of pairs that is stored as state.
        return MaxScoreTokenizer(scores=dict(self.scores))

    def fit(self, sentences):
        """
        Train the word extractor and rebuild the tokenizer.

        Each word is scored as (forward + backward cohesion) multiplied by
        (left + right branching entropy).
        """
        self.word_extractor.train(sentences)
        extracted = self.word_extractor.extract()
        self.scores = [
            (word,
             (score.cohesion_forward + score.cohesion_backward) *
             (score.left_branching_entropy + score.right_branching_entropy))
            for word, score in extracted.items()
        ]
        self.tokenizer = self._build_tokenizer()

    def state_dict(self):
        """Return the serializable state: ``{'scores': [(word, score), ...]}``."""
        return {'scores': self.scores}

    def load_state_dict(self, state_dict):
        """Restore scores from ``state_dict['scores']`` and rebuild the tokenizer."""
        self.scores = state_dict['scores']
        self.tokenizer = self._build_tokenizer()

    def tokenize(self, sentence):
        """Return the tokens MaxScoreTokenizer produces for ``sentence``."""
        return self.tokenizer.tokenize(sentence)
Beispiel #6
0
 def getTokenizer(self, contents):
     """
     Build an LTokenizer scored by forward cohesion learned from contents.

     Args:
         contents: raw input wrapped in a SentiCorpus (iter_sent=True)
     Returns:
         LTokenizer whose scores map each extracted word to its
         cohesion_forward value
     """
     corpus = SentiCorpus(contents, iter_sent=True)
     # The corpus is only training data; WordExtractor's constructor takes
     # numeric thresholds, so it is created with its defaults.
     word_extractor = WordExtractor()
     word_extractor.train(corpus)
     words_scores = word_extractor.extract()
     scores = {w: s.cohesion_forward for w, s in words_scores.items()}
     return LTokenizer(scores=scores)
def soynlp_tokenizer(corpus):
    """
    Build an LTokenizer from combined noun and cohesion scores.

    Every extracted noun is scored as its noun score plus its forward
    cohesion (0 when the noun has none); every remaining extracted word
    is scored by its forward cohesion alone.

    Args:
        corpus: sentences used to train both extractors

    Returns:
        LTokenizer built from the combined scores
    """
    from soynlp.tokenizer import LTokenizer
    from soynlp.word import WordExtractor
    from soynlp.noun import LRNounExtractor_v2

    # Forward cohesion of every extracted word.
    extractor = WordExtractor(min_frequency=100,  # example
                              min_cohesion_forward=0.05,
                              min_right_branching_entropy=0.0)
    extractor.train(corpus)
    cohesion_by_word = {}
    for word, score in extractor.extract().items():
        cohesion_by_word[word] = score.cohesion_forward

    # Noun scores, boosted by the cohesion of the same string.
    extracted_nouns = LRNounExtractor_v2().train_extract(corpus)
    merged = {}
    for noun, noun_score in extracted_nouns.items():
        merged[noun] = noun_score.score + cohesion_by_word.get(noun, 0)

    # Words that are not nouns keep their plain cohesion score.
    for subword, cohesion in cohesion_by_word.items():
        merged.setdefault(subword, cohesion)

    return LTokenizer(scores=merged)
def build_tokenizer():
    """
    Train soynlp word scores on ``data/train.txt`` and pickle them.

    The file is read from the ``data`` directory under the current
    working directory. The resulting ``{word: cohesion_forward}`` dict is
    written to ``pickles/tokenizer.pickle``.

    Returns:
        None
    """
    print(f'Now building soynlp tokenizer . . .')

    corpus_path = os.path.join(Path().cwd() / 'data', 'train.txt')
    with open(corpus_path, encoding='utf-8') as corpus_file:
        sentences = corpus_file.readlines()

    extractor = WordExtractor(min_frequency=5)
    extractor.train(sentences)

    forward_cohesion = {}
    for word, score in extractor.extract().items():
        forward_cohesion[word] = score.cohesion_forward

    with open('pickles/tokenizer.pickle', 'wb') as sink:
        pickle.dump(forward_cohesion, sink)
Beispiel #9
0
def data_tokenize(news_title):
    """
    Train a WordExtractor on news titles and return an LTokenizer.

    Args:
        news_title: title strings used as training sentences

    Returns:
        LTokenizer scored by each extracted word's forward cohesion
    """
    extractor = WordExtractor(min_frequency=100,  # example
                              min_cohesion_forward=0.05,
                              min_right_branching_entropy=0.0)
    extractor.train(news_title)

    forward_cohesion = {}
    for word, score in extractor.extract().items():
        forward_cohesion[word] = score.cohesion_forward

    return LTokenizer(scores=forward_cohesion)
Beispiel #10
0
def word_extractor_test(corpus_path):
    """
    Smoke-test WordExtractor on a corpus opened with num_doc=1000.

    Prints the 20 words with the largest product of forward cohesion and
    left-side frequency, together with their forward cohesion.

    Args:
        corpus_path: path handed to DoublespaceLineCorpus
    """
    print('WordExtractor test')
    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor

    documents = DoublespaceLineCorpus(corpus_path, num_doc=1000)
    extractor = WordExtractor()
    extractor.train(documents)
    scores = extractor.extract()

    def rank_key(word):
        # Negated so that an ascending sort puts the largest product first.
        entry = scores[word]
        return -entry.cohesion_forward * entry.leftside_frequency

    print('top 20 left frequency * forward cohesion words')
    for word in sorted(scores, key=rank_key)[:20]:
        cohesion = scores[word].cohesion_forward
        print('word = {}, cohesion = {}'.format(word, cohesion))
    print('word extractor test has been done\n\n')
Beispiel #11
0
def word_extract(datas):
    """
    Train a WordExtractor on ``datas`` and print its ten best words.

    Words are ranked in descending order of ``word_score`` (a helper
    defined elsewhere in the project) and printed with their left-side
    frequency, forward cohesion and right branching entropy.

    Args:
        datas: sentences used to train the extractor

    Returns:
        None
    """
    extractor = WordExtractor(min_frequency=10,
                              min_cohesion_forward=0.05,
                              min_right_branching_entropy=0.0)
    extractor.train(datas)
    extracted = extractor.extract()

    print('단어   (빈도수, cohesion, branching entropy)\n')
    ranked = sorted(extracted.items(),
                    key=lambda item: word_score(item[1]),
                    reverse=True)
    for word, score in ranked[:10]:
        row = (word,
               score.leftside_frequency,
               score.cohesion_forward,
               score.right_branching_entropy)
        print('%s     (%d, %.3f, %.3f)' % row)
Beispiel #12
0
 def _get_tokenizer(self, df):
     """
     Generate a tokenizer by extracting words from a corpus.

     Args:
         df: data corpus of one language, passed as-is to
             WordExtractor.train
     Returns:
         LTokenizer scored by each extracted word's forward cohesion
     """
     extractor = WordExtractor()
     extractor.train(df)
     extracted = extractor.extract()
     print(f'length of words is {len(extracted)}')

     forward_cohesion = {}
     for word, score in extracted.items():
         forward_cohesion[word] = score.cohesion_forward
     return LTokenizer(scores=forward_cohesion)
Beispiel #13
0
def word_extractor_test(corpus_path):
    """
    Run WordExtractor on a corpus (num_doc=1000) and print its top words.

    The 20 printed words are those with the largest
    ``cohesion_forward * leftside_frequency``.

    Args:
        corpus_path: path handed to DoublespaceLineCorpus
    """
    print('WordExtractor test')
    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor

    documents = DoublespaceLineCorpus(corpus_path, num_doc=1000)
    extractor = WordExtractor()
    extractor.train(documents)
    scores = extractor.extract()

    print('top 20 left frequency * forward cohesion words')
    # Negated product: ascending sort order puts the best words first.
    ranked = sorted(
        scores,
        key=lambda w: -scores[w].cohesion_forward * scores[w].leftside_frequency)
    for word in ranked[:20]:
        message = 'word = {}, cohesion = {}'.format(
            word, scores[word].cohesion_forward)
        print(message)
    print('word extractor test has been done\n\n')
Beispiel #14
0
def pmi_test(corpus_path):
    """
    Smoke-test the soynlp PMI computation that returns a sparse matrix.

    Trains a WordExtractor, tokenizes with an LTokenizer built from the
    first component of every cohesion score, computes the word-context
    matrix and its PMI, then prints the shapes of the non-zero entries
    and the entries at positions -150..-101 of the ascending value order.

    Args:
        corpus_path: path handed to DoublespaceLineCorpus
    """
    print('pmi test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    sentences = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(sentences)))

    extractor = WordExtractor()
    extractor.train(sentences)

    first_cohesions = {}
    for word, score in extractor.all_cohesion_scores().items():
        first_cohesions[word] = score[0]
    l_tokenizer = LTokenizer(first_cohesions)
    print('trained l tokenizer')

    cooccurrence, idx2vocab = sent_to_word_contexts_matrix(
        sentences,
        windows=3,
        min_tf=10,
        tokenizer=l_tokenizer,  # (default) lambda x:x.split(),
        dynamic_weight=False,
        verbose=True)

    # Only the PMI matrix is used; the two other return values are ignored.
    pmi_matrix, _, _ = pmi(cooccurrence, min_pmi=0, alpha=0.0001)

    rows, cols = pmi_matrix.nonzero()
    values = pmi_matrix.data

    for label, array in (('row ', rows), ('col ', cols), ('data', values)):
        print('{} shape = {}'.format(label, array.shape))

    for position in values.argsort()[-150:-100]:
        words = (idx2vocab[rows[position]], idx2vocab[cols[position]])
        print('pmi {} = {:.3f}'.format(words, values[position]))
    print('computed pmi')
def main(args):
    """
    Train a MaxScoreTokenizer from word and noun scores and pickle it.

    Steps: extract words from the corpus and print the 30 best by
    ``word_score`` (a helper defined elsewhere in the project); extract
    nouns; combine noun scores with forward cohesion; build the
    tokenizer; pickle it.

    Args:
        args: namespace with ``corpus_fname`` (input corpus path) and
            ``tokenizer_path`` (output pickle path)

    Returns:
        None
    """
    # Find patterns and extract words from a given set of documents
    sentences = DoublespaceLineCorpus(args.corpus_fname, iter_sent=True)
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)

    # word extractor
    word_extractor.train(sentences)
    words = word_extractor.extract()
    cohesion_score = {
        word: score.cohesion_forward
        for word, score in words.items()
    }
    print('Word   (Freq, cohesion, branching entropy)\n')
    for word, score in sorted(words.items(),
                              key=lambda x: word_score(x[1]),
                              reverse=True)[:30]:
        print('%s     (%d, %.3f, %.3f)' %
              (word, score.leftside_frequency, score.cohesion_forward,
               score.right_branching_entropy))

    # noun extractor
    noun_extractor = LRNounExtractor_v2()
    # Train on the sentences themselves: a bare path string would be
    # iterated character by character instead of sentence by sentence.
    nouns = noun_extractor.train_extract(sentences)
    noun_scores = {noun: score.score for noun, score in nouns.items()}

    # combined score: noun score plus cohesion for nouns, plain cohesion
    # for every other extracted word
    combined_scores = {
        noun: score + cohesion_score.get(noun, 0)
        for noun, score in noun_scores.items()
    }
    for subword, cohesion in cohesion_score.items():
        combined_scores.setdefault(subword, cohesion)

    # maxScore tokenizer
    tokenizer = MaxScoreTokenizer(scores=combined_scores)

    # save tokenizer
    with open(args.tokenizer_path, 'wb') as f:
        pickle.dump(tokenizer, f, pickle.HIGHEST_PROTOCOL)
Beispiel #16
0
    def soynlp_tokenizer(self):
        """
        Build a MaxScoreTokenizer for the configured mode.

        * ``'serve'``: word scores are loaded from the JSON file at
          ``self.data_path``.
        * ``'train'``: a WordExtractor is trained on ``self.train_corpus``,
          each word is scored as
          ``cohesion_forward * exp(right_branching_entropy)`` and the
          scores are written to ``./models/word_dict.json``.

        Returns:
            MaxScoreTokenizer built from the word scores

        Raises:
            ValueError: if ``self.mode`` is neither ``'serve'`` nor
                ``'train'``
        """
        # Fail early and explicitly: any other mode would leave the score
        # dict undefined.
        if self.mode not in ('serve', 'train'):
            raise ValueError(
                "mode must be 'serve' or 'train', got {!r}".format(self.mode))

        def word_score(score):
            return score.cohesion_forward * math.exp(score.right_branching_entropy)

        if self.mode == 'serve':
            with open(self.data_path, 'r') as file:
                word_score_dict = json.load(file)
        else:
            word_extractor = WordExtractor()
            word_extractor.train(self.train_corpus)
            words = word_extractor.extract()
            word_score_dict = {
                word: word_score(score) for word, score in words.items()
            }

            with open('./models/word_dict.json', 'w') as file:
                json.dump(word_score_dict, file)

        return MaxScoreTokenizer(scores=word_score_dict)
Beispiel #17
0
def build_tokenizer():
    """
    Train the soynlp tokenizer scores used to tokenize Korean sentences.

    Reads ``data/corpus.csv`` (relative to the current working
    directory), trains a ``WordExtractor`` on the text cells of its
    ``korean`` column and writes a ``{word: cohesion_forward}`` dict to
    ``pickles/tokenizer.pickle``. The ``pickles`` directory must already
    exist.

    Returns:
        None
    """
    print('Now building soy-nlp tokenizer . . .')

    # Locate and load the training data.
    train_file = Path().cwd() / 'data' / 'corpus.csv'
    df = pd.read_csv(train_file, encoding='utf-8')

    # Only cells that actually hold text are analysed.
    kor_lines = [text for text in df['korean'] if isinstance(text, str)]

    # WordExtractor yields several word scores (branching entropy,
    # accessor variety, cohesion), each of which locates token boundaries
    # in a different way. Only the cohesion score, i.e. how strongly the
    # characters of a word occur together, is kept here.
    # Formulas and code for the scores:
    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    word_extractor = WordExtractor(min_frequency=5)
    word_extractor.train(kor_lines)

    word_scores = word_extractor.extract()
    cohesion_scores = {
        word: score.cohesion_forward
        for word, score in word_scores.items()
    }

    # Persist the scores with pickle.
    with open('pickles/tokenizer.pickle', 'wb') as pickle_out:
        pickle.dump(cohesion_scores, pickle_out)
Beispiel #18
0
from soynlp.tokenizer import LTokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence

# Load the pickled dataset; its "title" column supplies the sentences.
data = pd.read_pickle('./backend/textengines/data/dc_data.pkl')

soynlp_model_fname = './backend/textengines/data/tokenizer_model/soyword.model'

sentences = data["title"].values

word_extractor = WordExtractor(
    min_frequency=100,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0
)

word_extractor.train(sentences)
word_extractor.save(soynlp_model_fname)

# Score every word as forward cohesion * exp(right branching entropy).
scores = word_extractor.word_scores()
scores = {
    word: score.cohesion_forward * math.exp(score.right_branching_entropy)
    for word, score in scores.items()
}
# soyToken = LTokenizer(scores=scores)
# soyToken.tokenize(data["title"].values[0])
#############################################################################
# Dump one title per line; the context manager closes the file even when
# a write fails.
with open("./backend/textengines/data/dc_title.txt", "w", encoding="utf-8") as file:
    for title in data["title"].values:
        file.write(title)
        file.write("\n")

spm_train = """--input=./backend/textengines/data/dc_title.txt \
               --model_prefix=sentencepice \
    lines = f.read().splitlines()
re.sub(r"[\[\]<>~]", ' ', lines[0])
re.sub(r"['~]", ' ', lines[0])
re.sub(r'"', ' ', lines[0])

# Replace bracket/quote characters, then every remaining non-word
# character, with a space. The patterns are applied in this exact order.
text = []
for line in lines:
    for pattern in (r"[\[\]<>~]", r"['~]", r'"', '\\W'):
        line = re.sub(pattern, ' ', line)
    text.append(line)

# word_score
word_extractor = WordExtractor(min_frequency=5)
word_extractor.train(text)
print("train word_extractor complete")
words_scores = word_extractor.extract()
print('complete to extract words_scores')

# Bundle the scores with the cleaned text; noun scores start out empty.
scores_dictionary = {'words_scores': words_scores, 'noun_scores': [], 'text': text}

with open('scores_dictionary.pickle', 'wb') as fw:
    pickle.dump(scores_dictionary, fw)
    print("dumping complete")

with open('scores_dictionary.pickle', 'rb') as fr:
Beispiel #20
0
class SoyTokenizer:
    """
    soynlp L-tokenizer with an integer vocabulary.

    Ids 0-3 are reserved for ``<unk>``, ``<pad>``, ``<sos>`` and
    ``<eos>``; learned words get ids from 4 upwards in the iteration
    order of ``cohesion_score``. ``len()`` counts only the learned words,
    not the four reserved tokens.
    """

    # Digits and a few punctuation marks are replaced by a space.
    _N_STR_PATTERN = re.compile(pattern='[\\d\\-?/_!\\.,]')
    # Runs of two or more whitespace characters collapse to one space.
    _DOUBLESPACING = re.compile(pattern='\\s\\s+')

    def __init__(self, model_path: str = None):
        """
        Args:
            model_path: optional path of a file written by ``save_model``;
                when omitted the tokenizer starts without any scores.
        """
        self.word_extractor = WordExtractor(min_frequency=5,
                                            min_cohesion_forward=0.05,
                                            min_right_branching_entropy=0.0)
        self.unk = 0
        self.pad = 1
        self.sos = 2
        self.eos = 3

        if model_path:
            # NOTE: dill.load can execute arbitrary code; only load model
            # files from a trusted source.
            with open(model_path, 'rb') as readFile:
                self.cohesion_score = dill.load(readFile)
        else:
            self.cohesion_score = {}
        self._refresh()

    def _refresh(self):
        """Rebuild the tokenizer and both id tables from ``cohesion_score``."""
        self.tokenizer = LTokenizer(scores=self.cohesion_score)
        self.tok_to_id, self.id_to_tok = self._build_dict()

    def tokenize(self, sent: str):
        """Return the tokens the L-tokenizer produces for ``sent``."""
        return self.tokenizer.tokenize(sent)

    def text_to_id(self, sent: str):
        """Tokenize ``sent`` and map each token to its id (unknown -> ``unk``)."""
        return [self.tok_to_id.get(tok, self.unk) for tok in self.tokenize(sent)]

    def id_to_text(self, idxs: list):
        """Map ids back to tokens; raises KeyError for an unknown id."""
        return [self.id_to_tok[i] for i in idxs]

    def train(self, sentences, add_whitespace: bool = False):
        """
        Learn cohesion scores from ``sentences`` and rebuild the vocabulary.

        Args:
            sentences: list of raw sentence strings
            add_whitespace: when True, every space-separated chunk of the
                preprocessed sentences is added as a word with score 1.0
        """
        sentences = self.preprocess(sentences)
        self.word_extractor.train(sentences)
        words = self.word_extractor.extract()
        self.cohesion_score = {
            word: score.cohesion_forward
            for word, score in words.items()
        }

        # add whitespace tokens (first-seen order keeps the ids reproducible)
        if add_whitespace:
            for sentence in sentences:
                for chunk in sentence.split(' '):
                    self.cohesion_score[chunk] = 1.0

        # The tokenizer must be rebuilt too, otherwise it keeps using the
        # scores it was created with.
        self._refresh()

    def save_model(self, model_path: str, model_prefix: str):
        """Write the scores to ``<model_path>/<model_prefix>.model`` with dill."""
        with open(os.path.join(model_path, model_prefix + '.model'),
                  'wb') as saveFile:
            dill.dump(self.cohesion_score, saveFile)

    def _build_dict(self):
        """Return ``(tok_to_id, id_to_tok)`` for reserved and learned tokens."""
        tok_to_id = {'<unk>': 0, '<pad>': 1, '<sos>': 2, '<eos>': 3}
        id_to_tok = {0: '<unk>', 1: '<pad>', 2: '<sos>', 3: '<eos>'}
        for idx, key in enumerate(self.cohesion_score, start=4):
            tok_to_id[key] = idx
            id_to_tok[idx] = key
        return tok_to_id, id_to_tok

    def preprocess(self, sents: list):
        """
        Normalize sentences before training.

        Digits and the characters ``- ? / _ ! . ,`` become spaces, runs of
        whitespace collapse to one space, the result is stripped and
        lower-cased.
        """
        cleaned = []
        for sent in sents:
            sent = self._N_STR_PATTERN.sub(repl=' ', string=sent)
            sent = self._DOUBLESPACING.sub(repl=' ', string=sent).strip()
            cleaned.append(sent.lower())
        return cleaned

    def __len__(self):
        return len(self.cohesion_score)
Beispiel #21
0
from soynlp.noun import NewsNounExtractor
from soynlp import DoublespaceLineCorpus
from soynlp.noun import LRNounExtractor
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

# Corpus file the word extractor is trained on.
corpus_path = "text/news/articles.txt"
#corpus_path = "text/news/input5-1.txt"

# iter_sent=True presumably makes the corpus yield one sentence at a time
# -- TODO confirm against the soynlp documentation.
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

#for n_sent, sent in enumerate(corpus):
#    print('sent %d: %s %s\n'%(n_sent, sent, '' ))

# Train with default thresholds and print every scored word.
we = WordExtractor()
we.train(corpus)
scores = we.word_scores()
print(scores.keys())
'''
sentences = DoublespaceLineCorpus(corpus_path, iter_sent=False)
noun_extractor = LRNounExtractor()
nouns = noun_extractor.train_extract(sentences)
n = nouns.keys()
lists=""
for a in n:
	lists+=a
	lists+=" "
print(lists)
'''
#top = sorted(nouns.items(), key=lambda x:-x[1].frequency)[:1]
#print(top)
Beispiel #22
0
class KoreanTokenizer:
    '''
    A class to tokenize a Korean sentence.

    Attributes
    ----------
    pre_trained : bool
        | If True, the KoNLPy analyzer selected by 'analyzer' is used (default : True)
        | If False, tokenizing relies on a soynlp L-Tokenizer that has to be trained first.
        | The analyzer named by 'analyzer' is still instantiated in that case, but
        | tokenize and extract_noun do not use it.
    analyzer : str
        | Type of KoNLPy analyzer (default : Hannanum)
        | Available analyzers are: Hannanum, Kkma, Komoran, Mecab, Okt
        | Note: Mecab needs to be installed separately before being used.

    Methods
    -------
    train
        | Trains KoreanTokenizer on a corpus, only when 'pre_trained' argument is False.
    tokenize
        | Tokenizes the input sentence and returns its tokens.
    extract_noun
        | Extracts nouns from the input sentence.

    '''

    def __init__(self, pre_trained=True, analyzer='Hannanum'):
        self.pre_trained = pre_trained

        if analyzer in ('Hannanum', 'Kkma', 'Komoran', 'Mecab', 'Okt'):
            # The analyzer class carries the same name as the option value.
            self.analyzer = getattr(tag, analyzer)()
        elif pre_trained != False:
            # An unknown name only matters when an analyzer is needed.
            print('Enter a valid KoNLPy analyzer name.\n\tavailable: Hannanum, Kkma, Komoran, Mecab, Okt')

        self.WordExtractor = WordExtractor(min_frequency=0)
        self.noun_extractor = LRNounExtractor(verbose=False)
        self.word_score = {}

    def train(self, text):
        '''
        A method to train the KoreanTokenizer on a corpus.
        If KoreanTokenizer.pre_trained == True, this method only prints a notice.

        Attributes
        ----------
        text : training input handed to WordExtractor.train
        '''

        if self.pre_trained == True:
            print('A pre-trained KoreanTokenizer is being used. No need to train it.')
            return

        self.WordExtractor.train(text)
        self.words = self.WordExtractor.extract()

        # Word score = forward cohesion * exp(right branching entropy).
        self.word_score = {
            word: score.cohesion_forward * exp(score.right_branching_entropy)
            for word, score in self.words.items()
        }

    def tokenize(self, text):
        '''
        A method to tokenize input text.

        Attributes
        ----------
        text : str
            | An input text to be tokenized

        Output
        ------
        tokens : list
            | List of tokens (in str) that make up the input text
            | None when the unsupervised tokenizer has not been trained yet

        '''

        if self.pre_trained == True:
            return self.analyzer.morphs(text)

        if not self.word_score:
            print('An unsupervised KoreanTokenizer should be trained first, before tokenizing.')
            return

        # The L-Tokenizer is rebuilt from the current scores on every call.
        self.tokenizer = LTokenizer(scores=self.word_score)
        return self.tokenizer.tokenize(text)

    def extract_noun(self, text):
        '''
        A method to extract nouns from input text

        Attributes
        ----------
        text : str
            | An input text from which nouns will be extracted

        Output
        ------
        nouns : list
            | List of noun tokens (in str) in the input text
            | None when 'pre_trained' is not True
        '''

        if self.pre_trained == True:
            return self.analyzer.nouns(text)
Beispiel #23
0
class Embedding:
    """
    FastText embedding pipeline on top of a soynlp L-tokenizer.

    The dataset needs the text columns ``TITLE`` and ``TEXTCONTENT``.
    Depending on ``word_train`` the tokenizer and FastText model are
    either trained and saved, or loaded from ``TOKENIZER_SAVED_DIR`` /
    ``MODEL_SAVED_DIR``.

    Vocabulary indices 0-3 are reserved for ``<PAD>``, ``<STA>``,
    ``<EOS>`` and ``<UNK>``; FastText words start at index 4. The special
    tokens are embedded as the one-hot vectors 0-3 of length 100.
    """

    MODEL_SAVED_DIR = "saved_model/fasttext.model"
    # Forward slash so the path resolves inside saved_model/ on every OS.
    TOKENIZER_SAVED_DIR = "saved_model/tokenizer.pkl"

    # Position i is both the vocabulary index and the hot component of the
    # token's one-hot vector.
    _SPECIAL_TOKENS = ('<PAD>', '<STA>', '<EOS>', '<UNK>')
    _VECTOR_SIZE = 100

    def __init__(self, dataset: pd.DataFrame, word_train: bool):
        """
        Args:
            dataset: frame with ``TITLE`` and ``TEXTCONTENT`` columns; both
                columns are replaced by their tokenized form
            word_train: True to train and save tokenizer and FastText
                model, False to load the previously saved ones
        """
        self.dataset = dataset
        self.corpus = dataset["TITLE"] + dataset["TEXTCONTENT"]

        if word_train:
            self._extracte()
            self._tokenize()
            self._save_tokenizer()
            self._train()
        else:
            self.fasttext = FastText.load(self.MODEL_SAVED_DIR)
            self._load_tokenizer()
            self._tokenize()

        self.idx_word_dict = dict(
            zip(np.arange(4,
                          len(self.fasttext.wv.vectors) + 4),
                self.fasttext.wv.index2word))
        for idx, token in enumerate(self._SPECIAL_TOKENS):
            self.idx_word_dict[idx] = token

    def _extracte(self) -> None:
        """Train the word extractor on the corpus and build the tokenizer."""
        self.extractor = WordExtractor()
        self.extractor.train(self.corpus)
        self.words = self.extractor.extract()
        self.cohesion_score = {
            word: score.cohesion_forward
            for word, score in self.words.items()
        }
        self.tokenizer = LTokenizer(scores=self.cohesion_score)

    def _tokenize(self) -> None:
        """Tokenize the corpus and both text columns in place."""
        self.corpus = self.corpus.apply(self.tokenizer.tokenize)
        for column in ("TITLE", "TEXTCONTENT"):
            self.dataset[column] = self.dataset[column].apply(
                self.tokenizer.tokenize)

    def _save_tokenizer(self) -> None:
        """Pickle the tokenizer to ``TOKENIZER_SAVED_DIR``."""
        with open(self.TOKENIZER_SAVED_DIR, "wb") as f:
            pickle.dump(self.tokenizer, f, pickle.HIGHEST_PROTOCOL)

    def _load_tokenizer(self) -> None:
        """Load the pickled tokenizer from ``TOKENIZER_SAVED_DIR``."""
        # NOTE: pickle.load can execute arbitrary code; the file must come
        # from a trusted source.
        with open(self.TOKENIZER_SAVED_DIR, "rb") as f:
            self.tokenizer = pickle.load(f)

    def _train(self) -> None:
        """Train FastText on the tokenized corpus and save the model."""
        # NOTE(review): `size`, `iter` and `wv.index2word` look like the
        # gensim 3.x API -- confirm against the installed gensim version.
        self.fasttext = FastText(sentences=self.corpus,
                                 size=100,
                                 window=5,
                                 min_count=1,
                                 iter=100)
        self.fasttext.save(self.MODEL_SAVED_DIR)

    def dataset_to_embedding(self) -> pd.DataFrame:
        """
        Pad or cut the token columns and convert them to arrays.

        ``TITLE_IDX`` (vocabulary indices) and ``TITLE`` (vectors) get
        length 10, ``TEXTCONTENT`` (vectors) gets length 32.

        Returns:
            the modified dataset
        """
        self.dataset["TITLE_IDX"] = self.dataset["TITLE"].apply(
            self._sentence_length_fix, args=[10])
        self.dataset["TITLE"] = self.dataset["TITLE"].apply(
            self._sentence_length_fix, args=[10])
        self.dataset["TEXTCONTENT"] = self.dataset["TEXTCONTENT"].apply(
            self._sentence_length_fix, args=[32])

        # Sanity check of the fixed lengths before vectorizing.
        for column, expected in (("TITLE_IDX", 10), ("TITLE", 10),
                                 ("TEXTCONTENT", 32)):
            for _, value in self.dataset[column].items():
                assert len(value) == expected

        self.dataset["TITLE_IDX"] = self.dataset["TITLE_IDX"].apply(
            lambda tokenized: np.array(
                [self._word_to_idx(token) for token in tokenized]))
        for column in ("TITLE", "TEXTCONTENT"):
            self.dataset[column] = self.dataset[column].apply(
                lambda tokenized: np.array(
                    [self._word_to_vec(token) for token in tokenized]))

        return self.dataset

    def embedding_to_sentence(self, target: list) -> list:
        """Map each vector in ``target`` (list or array) to a token."""
        return [self._vec_to_word(vector) for vector in target]

    def _sentence_length_fix(self, sentence: list, length: int) -> list:
        """
        Return ``sentence`` cut or padded with ``<PAD>`` to ``length``.

        The input is never modified; a sentence that already has the
        requested length is returned unchanged.
        """
        missing = length - len(sentence)
        if missing > 0:
            return list(sentence) + ['<PAD>'] * missing
        if missing < 0:
            return sentence[:length]
        return sentence

    def _special_vector(self, index: int) -> np.ndarray:
        """Return the float32 one-hot vector with component ``index`` set."""
        vector = np.zeros(self._VECTOR_SIZE, dtype=np.float32)
        vector[index] = 1.0
        return vector

    def _vec_to_word(self, vector) -> str:
        """Return the special token for a one-hot vector, else the nearest word."""
        for idx, token in enumerate(self._SPECIAL_TOKENS):
            if np.array_equal(vector, self._special_vector(idx)):
                return token
        return self.fasttext.wv.most_similar(positive=[vector], topn=1)[0][0]

    def _word_to_vec(self, word) -> np.ndarray:
        """Return the vector of ``word``; unknown words map to ``<UNK>``."""
        if word in self._SPECIAL_TOKENS:
            return self._special_vector(self._SPECIAL_TOKENS.index(word))
        try:
            return self.fasttext.wv.word_vec(word)
        except KeyError:
            # The model has no vector for this word.
            return self._special_vector(3)

    def _word_to_idx(self, word) -> int:
        """Return the vocabulary index of ``word``, or 3 (``<UNK>``) if absent."""
        for idx, token in self.idx_word_dict.items():
            if token == word:
                return idx
        return 3

    def _idx_to_word(self, idx) -> str:
        """Return the token stored at vocabulary index ``idx``."""
        return self.idx_word_dict[idx]
    #print("ㅋ이 들어간 문장 중 ㅋ의 길이의 표준편차: ", np.std(np_single))
    f.write("ㅋ이 들어간 chat 중 ㅋ의 평균 길이: " + str(round(np.mean(np_single),3)) + '\n') #얘가 신뢰할 수 없는 정보인게 히스토그램, 분산, 표준편차 확인해보면 값이 몰려 있음 -> 중앙값도 한 번 봐보자
    f.write("ㅋ이 들어간 chat 중 ㅋ의 길이의 중앙값: " + str(np.median(np_single)) + '\n') #얘도 신뢰할 수 없는 정보인게 자료분포가 중심지향적이지 않음 -> 최빈값도 고려
    f.write("ㅋ이 들어간 chat 중 ㅋ의 길이의 최빈값(상위3개): " + str(Counter(np_single).most_common()[:3])+'\n') #상위 3개 확인
    n, bins, patches = plt.hist(np_single, bins=sentence_cnt)  # ㅋ이 들어간 문장 중 ㅋ의 평균 출현 횟수에 대한 히스토그램
    plt.savefig(result_path+"/"+file_num+".png")
    f.close()

# Load the chat log and run the laughter statistics on the chat lines.
# `read_data`, `laugh_check` and `file_name` are defined elsewhere in the script.
raw_time, raw_chat = read_data(file_name)
laugh_check(raw_chat)

# The bare string below is an inert note listing three statistics-based,
# unsupervised ways of finding words: accessor variety, branching entropy
# and cohesion score.
'''
통계에 기반하여 단어를 찾아내는 비지도 학습법
1. Accessor Variety
2. Branching Entropy
3. Cohesion score
'''
word_extractor = WordExtractor(
    min_frequency=20,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0
) # the cohesion score is the one used here

# Disabled code kept as a string: training on the chat lines, printing the
# extracted words and building an LTokenizer from the forward cohesion.
'''
word_extractor.train(raw_chat)
words = word_extractor.extract()
print("word extraction 길이: ",len(words), " \n결과: ")
print(words)
#words_score = {word : score.cohesion_forward for word, score in words.items()}
#tokenizer = LTokenizer(scores=words_score)
'''
    f.write("ㅋ이 들어간 chat 중 ㅋ의 길이의 최빈값(상위3개): " + str(Counter(np_single).most_common()[:3])+'\n') #상위 3개 확인
    n, bins, patches = plt.hist(np_single, bins=sentence_cnt)  # ㅋ이 들어간 문장 중 ㅋ의 평균 출현 횟수에 대한 히스토그램
    plt.savefig(result_path+"/"+file_num+".png")
    f.close()

# Load the chat log and run the laughter statistics on the chat lines.
# `read_data`, `laugh_check` and `file_name` are defined elsewhere in the script.
raw_time, raw_chat = read_data(file_name)
laugh_check(raw_chat)

# The bare string below is an inert note listing three statistics-based,
# unsupervised ways of finding words: accessor variety, branching entropy
# and cohesion score.
'''
통계에 기반하여 단어를 찾아내는 비지도 학습법
1. Accessor Variety
2. Branching Entropy
3. Cohesion score
'''

word_extractor = WordExtractor(
    min_frequency=20,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0
) # the cohesion score is the one used here

# Learn the word statistics from the chat lines and extract the scored words.
word_extractor.train(raw_chat)
words = word_extractor.extract()

# Disabled code kept as a string: printing the extracted words and building
# an LTokenizer from the forward cohesion.
'''
print("word extraction 길이: ",len(words), " \n결과: ")
print(words)
#words_score = {word : score.cohesion_forward for word, score in words.items()}
#tokenizer = LTokenizer(scores=words_score)
'''
Beispiel #26
0
    raw_data.append(sent)

# --------------------------토크나이저 로드--------------------

import numpy as np

from soynlp.word import WordExtractor
from soynlp.utils import DoublespaceLineCorpus
from soynlp.tokenizer import LTokenizer

# Learn word statistics from the news titles; `news_title` has to be defined
# by the surrounding script.
word_extractor = WordExtractor(
    min_frequency=100,  # example
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0)

word_extractor.train(news_title)
words = word_extractor.extract()

# The L-tokenizer is driven by the forward cohesion of every extracted word.
cohesion_score = {
    word: score.cohesion_forward
    for word, score in words.items()
}
tokenizer = LTokenizer(scores=cohesion_score)

# # --------------------------word2vec 데이터 전처리--------------------

cluster_data = []

for k, title in enumerate(news_title):
    title = test(title)
    sent = tokenizer.tokenize(title, flatten=False)
    y_train.append(i[2])

#x_data = []
#_data = []

#for i in raw_data:
#    x_data.append(i[1])
#    y_data.append(i[2])

#x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3)

# Fit the word extractor on the training sentences and derive an L-tokenizer
# from the forward cohesion of every extracted word.
word_extractor = WordExtractor(
    min_frequency=150,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0,
)
word_extractor.train(x_train)
train_words = word_extractor.extract()
train_score = dict(
    (token, stats.cohesion_forward) for token, stats in train_words.items())
tokenizer = LTokenizer(scores=train_score)

# Pair each tokenized training sentence with its label.  `cnt` ends up as the
# number of sentences processed (0 when x_train is empty).
train_list = []
cnt = 0
for cnt, sent in enumerate(x_train, start=1):
    train_list.append([tokenizer.tokenize(sent), y_train[cnt - 1]])

word_extractor.train(x_test)
test_words = word_extractor.extract()
test_score = {
Beispiel #28
0
def Makegraph_Wordcloud_Soynlp(target):
    """Render a word cloud of ``data_origin[target]`` and save it as a PNG.

    Every character outside the Hangul syllable range is replaced by a space,
    the text is tokenized with a soynlp ``LTokenizer`` whose scores are the
    forward cohesion learned from that same text, and the 100 most frequent
    tokens (minus the entries of ``stopwords.csv``) are drawn with
    ``WordCloud``.

    Reads the module-level names ``flag_login``, ``data_origin``,
    ``resultdir`` and ``filename_dateflag``.  When ``flag_login`` is ``0``,
    ``None`` or ``''`` only ``Login()`` is called and nothing is generated.

    Args:
        target: Key into ``data_origin`` selecting the text to analyse; it is
            also embedded in the output file name.

    Returns:
        None.  Success and failure are reported through ``messagebox``
        dialogs; any exception is logged with ``Log`` and shown to the user
        instead of being propagated.
    """
    try:
        if flag_login is None or flag_login in (0, ''):
            # Not logged in: ask for credentials and stop here, so the
            # "completed" dialog below is not shown for work never done.
            Login()
            return

        # Keep Hangul syllables only; everything else becomes a space.
        contents = data_origin[target].apply(
            lambda text: re.sub('[^가-힣]', ' ', text))

        word_extractor = WordExtractor(
            min_frequency=10,  # TODO: scale with corpus size (e.g. len(data_origin))
            min_cohesion_forward=0.05,
            min_right_branching_entropy=0.0)
        word_extractor.train(contents.values)
        extracted = word_extractor.extract()

        cohesion_score = {
            word: score.cohesion_forward
            for word, score in extracted.items()
        }
        # Force-join these compound terms by giving each a score of 1.
        forced_words = (
            '숙소제공',
            '교통비지급',
            '인센티브',
            '초과근무시간확대',
            '복지포인트',
            '인사우대',
            '근평가점',
            '주거이전수당',
        )
        cohesion_score.update(dict.fromkeys(forced_words, 1))

        tokenizer = LTokenizer(scores=cohesion_score)
        tokenized = contents.apply(
            lambda text: tokenizer.tokenize(text, remove_r=True))

        token_counts = Counter(
            token for tokens in tokenized.values for token in tokens)
        # Top 100 tokens by frequency.
        words_dict_soynlp = dict(token_counts.most_common(100))

        # TODO: read the stop-word file with a plain `with open(...)`.
        csv_stopwords = pd.read_csv('stopwords.csv',
                                    encoding='cp949',
                                    skiprows=0)
        for row in csv_stopwords.values:
            for stopword in row:
                words_dict_soynlp.pop(stopword, None)

        wordcloud = WordCloud(
            font_path='NanumGothic.ttf',
            width=500,
            height=500,
            background_color='white').generate_from_frequencies(
                words_dict_soynlp)

        plt.clf()
        figure = plt.figure(figsize=(20, 20))
        try:
            plt.imshow(wordcloud)
            plt.axis('off')
            plt.savefig(resultdir + filename_dateflag + target +
                        ' - wordcloud_soynlp.png',
                        dpi=100)
        finally:
            # pyplot keeps every figure alive until it is closed; without
            # this each call would leave another 20x20 figure in memory.
            plt.close(figure)

        # A temporary bar chart of the most frequent words used to be drawn
        # here; it is currently disabled.

        messagebox.showinfo(
            '작업', '워드클라우드(Soynlp) 생성이 완료되었습니다.\n\nresult폴더에 결과물이 저장되었습니다.')
    except Exception as e:
        Log(desc=e)
        messagebox.showerror('경고', str(e) + ' 열을 찾을 수 없습니다.')
Beispiel #29
0
class KoreanTokenizer:
    '''
    A class to tokenize a Korean sentence.

    Parameters
    ----------
    **kwargs
        | Keyword arguments for WordExtractor object (see soynlp.word.WordExtractor)
        | A 'sents' keyword is dropped with a warning.

    Attributes
    ----------
    WordExtractor
        | The soynlp WordExtractor instance created in __init__
    words
        | Result of WordExtractor.extract(); None until train() is called
    word_score
        | Mapping word -> cohesion_forward * exp(right_branching_entropy);
        | None until train() is called
    tokenizer
        | The LTokenizer built by the most recent tokenize() call

    Methods
    -------
    train
        | Trains KoreanTokenizer on a corpus
    tokenize
        | Tokenizes the input sentence and returns its tokens

    '''

    # Class-level defaults so that tokenize() can detect an untrained
    # instance instead of failing with AttributeError.
    words = None
    word_score = None

    def __init__(self, **kwargs):
        # Imported inside the method: names bound in a class body are not
        # visible from method bodies, so a class-level import cannot be used
        # here.
        from soynlp.word import WordExtractor

        if 'sents' in kwargs:
            del kwargs['sents']
            print("WARNING: 'sents' argument is ignored.")

        self.WordExtractor = WordExtractor(**kwargs)

    def train(self, text, **kwargs):
        '''
        A method to train the KoreanTokenizer object.

        Parameters
        ----------
        text : iterable or DoublespaceLineCorpus
            | A input text in any iterable type (e.g. list)
            | or a DoublespaceLineCorpus object (see soynlp.utils.DoublespaceLineCorpus)
        **kwargs
            | Keyword arguments for WordExtractor.train() method (see soynlp.word.WordExtractor.train)
            | A 'sents' keyword is dropped with a warning.

        Notes
        -----
        Sets ``self.words`` to the extractor output and ``self.word_score``
        to ``cohesion_forward * exp(right_branching_entropy)`` per word.
        '''
        from math import exp

        if 'sents' in kwargs:
            del kwargs['sents']
            print("WARNING: 'sents' argument is ignored; WordExtractor is trained on 'text' argument only.")

        self.WordExtractor.train(text, **kwargs)
        self.words = self.WordExtractor.extract()

        self.word_score = {
            word: score.cohesion_forward * exp(score.right_branching_entropy)
            for word, score in self.words.items()
        }

    def tokenize(self, text, **kwargs):
        '''
        A method to tokenize the input text

        Parameters
        ----------
        text : str
            | An input text in str type
        **kwargs
            | Keyword arguments for LTokenizer.tokenize() method (see soynlp.tokenizer.LTokenizer.tokenize)
            | A 'sentence' keyword is dropped with a warning.

        Returns
        -------
        The value returned by LTokenizer.tokenize(), or None (after printing
        a message) when the tokenizer has no word scores yet.
        '''

        if 'sentence' in kwargs:
            del kwargs['sentence']
            print("WARNING: 'sentence' argument is ignored; word_tokenizer tokenizes 'text' argument only.")

        if not self.word_score:
            print('KoreanTokenizer should be trained first, before tokenizing.')
            return None

        # Imported after the guard so the untrained path has no dependency
        # on soynlp being importable.
        from soynlp.tokenizer import LTokenizer

        self.tokenizer = LTokenizer(scores=self.word_score)

        return self.tokenizer.tokenize(text, **kwargs)
Beispiel #30
0
    temp['score'] = score
    temp['freq'] = freq

    nouns_list.append(temp)

# Put the collected noun records into a DataFrame sorted by descending score,
# then list the nouns selected by the NOUNS_THRESHOLD mask.
df_nouns = pd.DataFrame(nouns_list)
df_nouns = df_nouns.sort_values(by=['score'], ascending=False)
# NOTE(review): the mask is built from `df.score`, not `df_nouns.score`; `df`
# is not defined in the visible code -- looks like a typo for `df_nouns`,
# confirm before changing.
nouns_candidates_list = df_nouns.loc[df.score > NOUNS_THRESHOLD].noun.tolist()
print('nouns_candidates_list : {}\n'.format(len(nouns_candidates_list)))

print(''' words extractor ''')
word_extractor = WordExtractor(min_frequency=100,
                               min_cohesion_forward=0.05,
                               min_right_branching_entropy=0.0)

word_extractor.train(corpus)
words = word_extractor.extract()
# Drop single-character entries.
words = {k: v for k, v in words.items() if len(k) > 1}
words_list = list()
# Build one record per extracted word; its score is the forward cohesion
# multiplied by the left branching entropy.
# NOTE(review): in the visible code `temp` is never appended to `words_list`
# and `right_freq` / `score` are never stored -- the loop body appears to be
# cut off; confirm against the full source.
for k, v in words.items():
    temp = dict()
    cohesion = v.cohesion_forward
    branching_entropy = v.left_branching_entropy
    left_freq = v.leftside_frequency
    right_freq = v.rightside_frequency
    score = cohesion * branching_entropy

    temp['word'] = k.lower()
    temp['cohesion'] = cohesion
    temp['branching_entropy'] = branching_entropy
    temp['left_freq'] = left_freq
# Toggle: True rebuilds the word statistics from the raw corpus and pickles
# them; False loads the previously pickled result and exports it.
isSave = False

if isSave:
    txtreader = txt_reader("../Data/Text/Joins/Sasul.txt", False)
    list_words = []
    list_sents = []
    # Original author's note: this will introduce selection limitations.
    for doc in txtreader:
        # Field 4 of each tab-separated record is taken as the document text
        # and split naively on '.' into sentences.
        list_sents.extend(doc.split("\t")[4].split('.'))

    # Fill the placeholder; passing the count as a second print() argument
    # left the literal "{}" in the output.
    print("length of list_sents = {}".format(len(list_sents)))
    # NOTE(review): recent soynlp releases name this parameter
    # `min_frequency`; `min_count` is kept as written -- confirm against the
    # installed soynlp version.
    word_extractor = WordExtractor(min_count=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.train(list_sents)  # list of str or like
    words = word_extractor.extract()
    with open("words.pkl", "wb") as f:
        pickle.dump(words, f)
else:
    # Statistical methods are techniques that need a large number of
    # repetitions.
    print("Load")
    # words.pkl is the file written by the isSave branch; only unpickle files
    # from a trusted source.
    with open("words.pkl", "rb") as f:
        words_dic = pickle.load(f)
    print(type(words_dic))
    nlphelper = nlp_helper()
    nlphelper.cvtWordDicToExcel(words_dic, "output")