def data_tokenize(news_title, tdm_vocab):
    """Tokenize every title and keep only leading parts present in ``tdm_vocab``.

    A ``WordExtractor`` is trained on ``news_title``; the forward cohesion of
    each extracted word is used as the score table of an ``LTokenizer``.
    Every title is passed through ``test`` (defined outside this block)
    before it is tokenized.

    Args:
        news_title: Iterable of titles. It is iterated twice: once for
            training and once for tokenizing.
        tdm_vocab: Container queried with ``in`` to decide which tokens stay.

    Returns:
        A list holding one list of kept tokens per title, in title order.
    """
    extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0,
    )
    extractor.train(news_title)

    forward_cohesion = {}
    for term, stats in extractor.extract().items():
        forward_cohesion[term] = stats.cohesion_forward
    l_tokenizer = LTokenizer(scores=forward_cohesion)

    def _kept_tokens(raw_title):
        # With flatten=False each item is indexable; only element 0 is used.
        pieces = l_tokenizer.tokenize(test(raw_title), flatten=False)
        return [piece[0] for piece in pieces if piece[0] in tdm_vocab]

    return [_kept_tokens(raw_title) for raw_title in news_title]
Example #2
0
class SoyNLPTokenizer(BaseTokenizer):
    """
    Tokenizer backed by SoyNLP's MaxScoreTokenizer.

    Word scores are learned with a WordExtractor in ``fit`` and can be
    exported / restored through ``state_dict`` / ``load_state_dict``.
    """

    def __init__(self):
        # No tokenizer exists until fit() or load_state_dict() is called.
        self.tokenizer = None
        self.scores = []
        # NOTE(review): other snippets call this keyword `min_frequency`;
        # confirm `min_count` matches the installed soynlp version.
        self.word_extractor = WordExtractor(
            min_count=100,
            min_cohesion_forward=0.05,
            min_right_branching_entropy=0.0,
        )

    def fit(self, sentences):
        """Train the extractor on ``sentences`` and rebuild the tokenizer."""
        self.word_extractor.train(sentences)
        extracted = self.word_extractor.extract()

        combined = []
        for term, stats in extracted.items():
            cohesion = stats.cohesion_forward + stats.cohesion_backward
            entropy = (stats.left_branching_entropy
                       + stats.right_branching_entropy)
            combined.append((term, cohesion * entropy))

        # NOTE(review): scores are kept as a list of (word, score) pairs;
        # confirm MaxScoreTokenizer accepts that shape.
        self.scores = combined
        self.tokenizer = MaxScoreTokenizer(scores=self.scores)

    def state_dict(self):
        """Return the learned scores wrapped in a dict under ``'scores'``."""
        return dict(scores=self.scores)

    def load_state_dict(self, state_dict):
        """Restore scores from ``state_dict['scores']`` and rebuild the tokenizer."""
        self.scores = state_dict['scores']
        self.tokenizer = MaxScoreTokenizer(scores=self.scores)

    def tokenize(self, sentence):
        """Return the tokenizer's output for ``sentence``."""
        return self.tokenizer.tokenize(sentence)
Example #3
0
 def getTokenizer(self, contents):
     """Build an LTokenizer scored by forward cohesion learned from ``contents``.

     Args:
         contents: Raw input handed to ``SentiCorpus`` (defined elsewhere),
             which is iterated sentence by sentence (``iter_sent=True``).

     Returns:
         An ``LTokenizer`` whose score table maps each extracted word to its
         forward cohesion.
     """
     corpus = SentiCorpus(contents, iter_sent=True)
     # The WordExtractor constructor takes numeric thresholds; the corpus is
     # supplied to train(). Passing it to the constructor would bind it to
     # the first positional threshold parameter.
     word_extractor = WordExtractor()
     word_extractor.train(corpus)
     words_scores = word_extractor.extract()
     scores = {w: s.cohesion_forward for w, s in words_scores.items()}
     return LTokenizer(scores=scores)
def soynlp_tokenizer(corpus):
    """Build an LTokenizer from combined noun and cohesion scores.

    Args:
        corpus: Sentences handed to both ``WordExtractor.train`` and
            ``LRNounExtractor_v2.train_extract``.

    Returns:
        An ``LTokenizer`` whose score for a noun is its noun score plus its
        forward cohesion (0 when absent), and whose score for any other
        extracted word is its forward cohesion alone.
    """
    from soynlp.tokenizer import LTokenizer
    from soynlp.word import WordExtractor
    from soynlp.noun import LRNounExtractor_v2

    # Forward cohesion of every extracted word.
    extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0,
    )
    extractor.train(corpus)
    cohesion = {}
    for term, stats in extractor.extract().items():
        cohesion[term] = stats.cohesion_forward

    # Noun scores.
    nouns = LRNounExtractor_v2().train_extract(corpus)

    # Nouns go in first, then the remaining words that are not nouns.
    merged = {}
    for noun, noun_stats in nouns.items():
        merged[noun] = noun_stats.score + cohesion.get(noun, 0)
    for subword, value in cohesion.items():
        merged.setdefault(subword, value)

    return LTokenizer(scores=merged)
Example #5
0
def build_tokenizer():
    """
    Learn soynlp cohesion scores from ``data/train_soynlp.csv`` and pickle them.

    The CSV is read from the ``data`` directory under the current working
    directory; only rows whose ``korean`` value is exactly a ``str`` are used.
    The resulting word -> forward-cohesion dict is written to
    ``pickles/tokenizer.pickle``.

    Returns:
        None
    """
    print('Now building soy-nlp tokenizer . . .')

    train_file = os.path.join(Path().cwd() / 'data', 'train_soynlp.csv')
    frame = pd.read_csv(train_file, encoding='utf-8')

    # Non-text rows are skipped.
    kor_lines = []
    for _, record in frame.iterrows():
        text = record.korean
        if type(text) == str:
            kor_lines.append(text)

    extractor = WordExtractor(min_frequency=5)
    extractor.train(kor_lines)

    cohesion_scores = {}
    for term, stats in extractor.extract().items():
        cohesion_scores[term] = stats.cohesion_forward

    with open('pickles/tokenizer.pickle', 'wb') as sink:
        pickle.dump(cohesion_scores, sink)
def build_tokenizer():
    """
    Learn soynlp cohesion scores from ``data/train.txt`` and pickle them.

    Every line of the text file (read as UTF-8 from the ``data`` directory
    under the current working directory) is used as a training sentence.
    The word -> forward-cohesion dict is written to
    ``pickles/tokenizer.pickle``.

    Returns:
        None
    """
    print('Now building soynlp tokenizer . . .')

    train_txt = os.path.join(Path().cwd() / 'data', 'train.txt')
    with open(train_txt, encoding='utf-8') as source:
        lines = source.readlines()

    extractor = WordExtractor(min_frequency=5)
    extractor.train(lines)

    cohesion_scores = {}
    for term, stats in extractor.extract().items():
        cohesion_scores[term] = stats.cohesion_forward

    with open('pickles/tokenizer.pickle', 'wb') as sink:
        pickle.dump(cohesion_scores, sink)
Example #7
0
def data_tokenize(news_title):
    """Return an LTokenizer scored by forward cohesion learned from ``news_title``.

    Args:
        news_title: Sentences used to train the ``WordExtractor``.

    Returns:
        An ``LTokenizer`` whose scores map each extracted word to its
        forward cohesion.
    """
    extractor = WordExtractor(min_frequency=100,  # example
                              min_cohesion_forward=0.05,
                              min_right_branching_entropy=0.0)
    extractor.train(news_title)

    forward_cohesion = {}
    for term, stats in extractor.extract().items():
        forward_cohesion[term] = stats.cohesion_forward

    return LTokenizer(scores=forward_cohesion)
Example #8
0
def word_extractor_test(corpus_path):
    """Smoke-test WordExtractor and print the 20 best-ranked words.

    Words are ranked by forward cohesion multiplied by left-side frequency,
    highest first.

    Args:
        corpus_path: Path given to ``DoublespaceLineCorpus`` (``num_doc=1000``).
    """
    print('WordExtractor test')
    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor

    documents = DoublespaceLineCorpus(corpus_path, num_doc=1000)
    extractor = WordExtractor()
    extractor.train(documents)
    scores = extractor.extract()

    def _rank(term):
        # Negated so that an ascending sort puts the largest product first.
        stats = scores[term]
        return -stats.cohesion_forward * stats.leftside_frequency

    print('top 20 left frequency * forward cohesion words')
    for term in sorted(scores, key=_rank)[:20]:
        print('word = {}, cohesion = {}'.format(
            term, scores[term].cohesion_forward))
    print('word extractor test has been done\n\n')
Example #9
0
 def _get_tokenizer(self, df):
     """
     Generate a tokenizer by extracting words.

     Prints the number of extracted words as a side effect.

     Args:
         df: data corpus of one language, passed to ``WordExtractor.train``
     Returns:
         an ``LTokenizer`` scored by forward cohesion
     """
     extractor = WordExtractor()
     extractor.train(df)
     extracted = extractor.extract()
     print(f'length of words is {len(extracted)}')

     forward_cohesion = {}
     for term, stats in extracted.items():
         forward_cohesion[term] = stats.cohesion_forward
     return LTokenizer(scores=forward_cohesion)
Example #10
0
def word_extract(datas):
    """Train a WordExtractor on ``datas`` and print its ten best words.

    Words are ordered by ``word_score`` (defined outside this block),
    highest first. Each printed row shows the word, its left-side frequency,
    forward cohesion and right branching entropy.

    Args:
        datas: Sentences passed to ``WordExtractor.train``.

    Returns:
        None
    """
    extractor = WordExtractor(min_frequency=10,
                              min_cohesion_forward=0.05,
                              min_right_branching_entropy=0.0)
    extractor.train(datas)
    extracted = extractor.extract()

    print('단어   (빈도수, cohesion, branching entropy)\n')
    ranked = sorted(extracted.items(),
                    key=lambda item: word_score(item[1]),
                    reverse=True)
    for term, stats in ranked[:10]:
        row = (term,
               stats.leftside_frequency,
               stats.cohesion_forward,
               stats.right_branching_entropy)
        print('%s     (%d, %.3f, %.3f)' % row)
    return None
Example #11
0
def word_extractor_test(corpus_path):
    """Run WordExtractor on a corpus file and print its top 20 words.

    The ranking key is forward cohesion times left-side frequency, largest
    first.

    Args:
        corpus_path: Path given to ``DoublespaceLineCorpus`` (``num_doc=1000``).
    """
    print('WordExtractor test')
    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor

    documents = DoublespaceLineCorpus(corpus_path, num_doc=1000)
    extractor = WordExtractor()
    extractor.train(documents)
    scores = extractor.extract()

    print('top 20 left frequency * forward cohesion words')
    # The key is negated so the default ascending sort yields the largest
    # products first.
    ordered = sorted(
        scores,
        key=lambda term: (-scores[term].cohesion_forward
                          * scores[term].leftside_frequency),
    )
    for term in ordered[:20]:
        print(f'word = {term}, cohesion = {scores[term].cohesion_forward}')
    print('word extractor test has been done\n\n')
def main(args):
    """Train a MaxScoreTokenizer on a corpus file and pickle it.

    Word cohesion scores (``WordExtractor``) and noun scores
    (``LRNounExtractor_v2``) are learned from the same sentence corpus and
    merged: a noun gets its noun score plus its forward cohesion (0 when
    absent); every other extracted word gets its forward cohesion alone.
    The 30 best words according to ``word_score`` (defined outside this
    block) are printed along the way.

    Args:
        args: Object exposing ``corpus_fname`` (corpus path read through
            ``DoublespaceLineCorpus``) and ``tokenizer_path`` (where the
            pickled tokenizer is written).
    """
    # Find patterns and extract words from a given set of documents
    sentences = DoublespaceLineCorpus(args.corpus_fname, iter_sent=True)
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)

    # word extractor
    word_extractor.train(sentences)
    words = word_extractor.extract()
    cohesion_score = {
        word: score.cohesion_forward
        for word, score in words.items()
    }
    print('Word   (Freq, cohesion, branching entropy)\n')
    top_words = sorted(words.items(),
                       key=lambda x: word_score(x[1]),
                       reverse=True)[:30]
    for word, score in top_words:
        print('%s     (%d, %.3f, %.3f)' %
              (word, score.leftside_frequency, score.cohesion_forward,
               score.right_branching_entropy))

    # noun extractor
    noun_extractor = LRNounExtractor_v2()
    # Train on the sentence corpus, not on the file name: iterating a path
    # string would yield single characters instead of sentences.
    nouns = noun_extractor.train_extract(sentences)
    noun_scores = {noun: score.score for noun, score in nouns.items()}

    # combined score
    combined_scores = {
        noun: score + cohesion_score.get(noun, 0)
        for noun, score in noun_scores.items()
    }
    combined_scores.update({
        subword: cohesion
        for subword, cohesion in cohesion_score.items()
        if subword not in combined_scores
    })

    # maxScore tokenizer
    tokenizer = MaxScoreTokenizer(scores=combined_scores)

    # save tokenizer
    with open(args.tokenizer_path, 'wb') as f:
        pickle.dump(tokenizer, f, pickle.HIGHEST_PROTOCOL)
Example #13
0
    def soynlp_tokenizer(self):
        """Return a MaxScoreTokenizer for the current ``self.mode``.

        * ``'serve'``: word scores are loaded as JSON from ``self.data_path``.
        * ``'train'``: a ``WordExtractor`` is trained on ``self.train_corpus``;
          each word is scored as forward cohesion times
          ``exp(right branching entropy)`` and the table is written to
          ``./models/word_dict.json``.

        Returns:
            A ``MaxScoreTokenizer`` built from the word-score dict.

        Raises:
            ValueError: If ``self.mode`` is neither ``'serve'`` nor
                ``'train'``. Previously this path fell through and failed
                later with an unbound-variable error.
        """
        def word_score(score):
            return score.cohesion_forward * math.exp(score.right_branching_entropy)

        if self.mode == 'serve':
            with open(self.data_path, 'r') as file:
                word_score_dict = json.load(file)
        elif self.mode == 'train':
            word_extractor = WordExtractor()
            word_extractor.train(self.train_corpus)
            words = word_extractor.extract()
            word_score_dict = {
                word: word_score(score) for word, score in words.items()
            }

            # NOTE(review): training writes to a fixed path while serving
            # reads self.data_path -- confirm both point at the same file.
            with open('./models/word_dict.json', 'w') as file:
                json.dump(word_score_dict, file)
        else:
            raise ValueError(
                "mode must be 'serve' or 'train', got {!r}".format(self.mode))

        return MaxScoreTokenizer(scores=word_score_dict)
Example #14
0
def build_tokenizer():
    """
    Train the soynlp tokenizer that will tokenize incoming Korean sentences.

    Reads ``data/corpus.csv`` under the current working directory and writes
    the learned word -> forward-cohesion dict to ``pickles/tokenizer.pickle``.
    """
    print('Now building soy-nlp tokenizer . . .')

    # Point at the training data and load the file.
    train_file = os.path.join(Path().cwd() / 'data', 'corpus.csv')
    frame = pd.read_csv(train_file, encoding='utf-8')

    # Only rows whose `korean` value is text are analysed.
    kor_lines = []
    for _, record in frame.iterrows():
        text = record.korean
        if type(text) == str:
            kor_lines.append(text)

    # soynlp's WordExtractor derives several word scores (branching entropy,
    # accessor variety, cohesion score); each finds token boundaries in a
    # different way. Only the cohesion score (how often the characters that
    # make up a word appear together) is kept here.
    # Formulas and code for the scores:
    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    extractor = WordExtractor(min_frequency=5)
    extractor.train(kor_lines)

    cohesion_scores = {}
    for term, stats in extractor.extract().items():
        cohesion_scores[term] = stats.cohesion_forward

    # Persist the scores with pickle.
    with open('pickles/tokenizer.pickle', 'wb') as sink:
        pickle.dump(cohesion_scores, sink)
#x_data = []
#y_data = []

#for i in raw_data:
#    x_data.append(i[1])
#    y_data.append(i[2])

#x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3)

# Learn forward-cohesion scores from the training sentences and tokenize each
# of them, pairing every tokenized sentence with its label.
# NOTE(review): x_train / y_train are only assigned in the commented-out code
# above -- presumably they are defined elsewhere; confirm.
word_extractor = WordExtractor(min_frequency=150,
                               min_cohesion_forward=0.05,
                               min_right_branching_entropy=0.0)

word_extractor.train(x_train)
train_words = word_extractor.extract()
# word -> forward cohesion, used as the LTokenizer score table
train_score = {
    word: score.cohesion_forward
    for word, score in train_words.items()
}
tokenizer = LTokenizer(scores=train_score)
# Each entry is [tokens, label]; `cnt` indexes y_train in step with x_train.
train_list = []
cnt = 0
for sent in x_train:
    train_list.append([tokenizer.tokenize(sent), y_train[cnt]])
    cnt += 1

word_extractor.train(x_test)
test_words = word_extractor.extract()
test_score = {
    word: score.cohesion_forward
from soynlp import DoublespaceLineCorpus
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

from gensim.test.utils import common_texts, get_tmpfile

if __name__ == '__main__':
    # Load Data
    file_name = 'wikiPlotText_m.txt'

    # Korean tokenizer driven by cohesion scores
    corpus = DoublespaceLineCorpus(file_name, iter_sent=True)
    # The WordExtractor constructor takes numeric thresholds; the corpus is
    # supplied to train() only.
    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    words_scores = word_extractor.extract()
    # The score attribute is `cohesion_forward` (the original spelling
    # `cohesion_forword` does not match the attribute used everywhere else).
    scores = {w: s.cohesion_forward for w, s in words_scores.items()}
    tokenizer = LTokenizer(scores=scores)

    # {'games':games, 'corpus':corpus, 'titles':titles}
    # NOTE(review): the corpus above is opened from `file_name` directly while
    # this call prefixes the storage path -- confirm both resolve to the same
    # file.
    games_data = fh.getGamesData(fh.getStoragePath()+file_name, tokenizer=tokenizer)
    tokenized_contents = games_data['corpus_words']

    # Vectorizing
    # sg=1(skip-gram), 0(CBOW)
    model_path = 'models/word2vec_ko.model'
    if os.path.isfile(model_path):
        # Reuse a previously trained model when one is on disk.
        word2vec_model = Word2Vec.load(model_path)
    else:
        word2vec_model = Word2Vec(tokenized_contents, size=100, window=10, min_count=50, workers=8, iter=1000, sg=1)
        word2vec_model.save(model_path)
Example #17
0
class SoyTokenizer:
    """LTokenizer wrapper with a token <-> id vocabulary.

    The vocabulary is derived from the keys of ``cohesion_score`` (word ->
    forward cohesion). Ids 0-3 are reserved for ``<unk>``, ``<pad>``,
    ``<sos>`` and ``<eos>``; learned words start at id 4.
    """

    def __init__(self, model_path: str = None):
        """Create the tokenizer, optionally restoring scores from ``model_path``.

        Args:
            model_path: Path of a file written by ``save_model``. When falsy,
                the tokenizer starts with an empty score table.
        """
        self.word_extractor = WordExtractor(min_frequency=5,
                                            min_cohesion_forward=0.05,
                                            min_right_branching_entropy=0.0)
        self.unk = 0
        self.pad = 1
        self.sos = 2
        self.eos = 3

        if model_path:
            # NOTE: dill deserialization can execute arbitrary code; only
            # load model files from a trusted source.
            with open(model_path, 'rb') as readFile:
                self.cohesion_score = dill.load(readFile)
        else:
            self.cohesion_score = {}
        self.tokenizer = LTokenizer(scores=self.cohesion_score)
        self.tok_to_id, self.id_to_tok = self._build_dict()

    def tokenize(self, sent: str):
        """Return the LTokenizer output for ``sent``."""
        return self.tokenizer.tokenize(sent)

    def text_to_id(self, sent: str):
        """Tokenize ``sent`` and map each token to its id (``unk`` if unseen)."""
        return [self.tok_to_id.get(tok, self.unk) for tok in self.tokenize(sent)]

    def id_to_text(self, idxs: list):
        """Map each id in ``idxs`` back to its token.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        return [self.id_to_tok[i] for i in idxs]

    def train(self, sentences, add_whitespace: bool = False):
        """Learn cohesion scores from ``sentences`` and refresh all state.

        Args:
            sentences: List of raw strings; they are run through
                ``preprocess`` before training.
            add_whitespace: When true, every space-separated token of the
                preprocessed sentences is added with score 1.0.
        """
        sentences = self.preprocess(sentences)
        self.word_extractor.train(sentences)
        words = self.word_extractor.extract()
        self.cohesion_score = {
            word: score.cohesion_forward
            for word, score in words.items()
        }

        # add whitespace tokens
        if add_whitespace:
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the resulting ids do not depend on set iteration order.
            whitetokens = dict.fromkeys(
                tok for s in sentences for tok in s.split(' '))
            for t in whitetokens:
                self.cohesion_score[t] = 1.0

        # The tokenizer was built from the previous score table; rebuild it so
        # tokenize() uses what was just learned.
        self.tokenizer = LTokenizer(scores=self.cohesion_score)
        self.tok_to_id, self.id_to_tok = self._build_dict()

    def save_model(self, model_path: str, model_prefix: str):
        """Write the score table to ``<model_path>/<model_prefix>.model``."""
        with open(os.path.join(model_path, model_prefix + '.model'),
                  'wb') as saveFile:
            dill.dump(self.cohesion_score, saveFile)

    def _build_dict(self):
        """Return ``(tok_to_id, id_to_tok)`` built from ``cohesion_score``."""
        tok_to_id = {'<unk>': 0, '<pad>': 1, '<sos>': 2, '<eos>': 3}
        id_to_tok = {0: '<unk>', 1: '<pad>', 2: '<sos>', 3: '<eos>'}
        # Learned words are numbered after the four reserved ids.
        for i, key in enumerate(self.cohesion_score, start=4):
            tok_to_id[key] = i
            id_to_tok[i] = key
        return tok_to_id, id_to_tok

    def preprocess(self, sents: list):
        """Normalize sentences before training.

        Digits and the characters ``- ? / _ ! . ,`` become spaces, runs of
        whitespace collapse to one space, the ends are stripped and the text
        is lower-cased.
        """
        n_str_pattern = re.compile(pattern='[\\d\\-?/_!\\.,]')
        doublespacing = re.compile(pattern='\\s\\s+')

        cleaned = []
        for sent in sents:
            sent = n_str_pattern.sub(repl=' ', string=sent)
            sent = doublespacing.sub(repl=' ', string=sent).strip()
            cleaned.append(sent.lower())
        return cleaned

    def __len__(self):
        """Number of learned words (reserved tokens are not counted)."""
        return len(self.cohesion_score)
Example #18
0
class Embedding:
    """Tokenize a dataset and convert it to FastText vectors / vocabulary ids.

    The ``TITLE`` and ``TEXTCONTENT`` columns of ``dataset`` are tokenized
    with an ``LTokenizer``. Four special tokens occupy ids 0-3 (``<PAD>``,
    ``<STA>``, ``<EOS>``, ``<UNK>``) and are represented by the first four
    rows of a 100x100 identity matrix; FastText vocabulary words start at
    id 4.
    """

    MODEL_SAVED_DIR = "saved_model/fasttext.model"
    # NOTE(review): unlike MODEL_SAVED_DIR this path uses a backslash
    # separator -- confirm it is intended for non-Windows hosts too.
    TOKENIZER_SAVED_DIR = "saved_model\\tokenizer.pkl"

    def __init__(self, dataset: pd.DataFrame, word_train: bool):
        """Load or train the tokenizer and FastText model, then build the id map.

        Args:
            dataset: Frame with ``TITLE`` and ``TEXTCONTENT`` columns; both
                columns are replaced by their tokenized form.
            word_train: When equal to ``False`` the saved model and tokenizer
                are loaded; otherwise both are trained and saved.
        """
        self.dataset = dataset
        self.corpus = dataset["TITLE"] + dataset["TEXTCONTENT"]

        # Kept as an equality test so that values such as None still select
        # the training branch, exactly as before.
        if word_train == False:
            self.fasttext = FastText.load(self.MODEL_SAVED_DIR)
            self._load_tokenizer()
            self._tokenize()
        else:
            self._extracte()
            self._tokenize()
            self._save_tokenizer()
            self._train()

        # Vocabulary words are numbered from 4; 0-3 are the special tokens.
        self.idx_word_dict = dict(
            zip(np.arange(4,
                          len(self.fasttext.wv.vectors) + 4),
                self.fasttext.wv.index2word))
        self.idx_word_dict[0] = '<PAD>'
        self.idx_word_dict[1] = '<STA>'
        self.idx_word_dict[2] = '<EOS>'
        self.idx_word_dict[3] = '<UNK>'

    def _extracte(self) -> None:
        """Learn forward-cohesion scores from the corpus and build the tokenizer."""
        self.extractor = WordExtractor()
        self.extractor.train(self.corpus)
        self.words = self.extractor.extract()
        self.cohesion_score = {
            word: score.cohesion_forward
            for word, score in self.words.items()
        }
        self.tokenizer = LTokenizer(scores=self.cohesion_score)

    def _tokenize(self) -> None:
        """Replace the corpus and both text columns by their tokenized form."""
        self.corpus = self.corpus.apply(
            lambda text: self.tokenizer.tokenize(text))
        self.dataset["TITLE"] = self.dataset["TITLE"].apply(
            lambda text: self.tokenizer.tokenize(text))
        self.dataset["TEXTCONTENT"] = self.dataset["TEXTCONTENT"].apply(
            lambda text: self.tokenizer.tokenize(text))

    def _save_tokenizer(self) -> None:
        """Pickle the tokenizer to ``TOKENIZER_SAVED_DIR``."""
        with open(self.TOKENIZER_SAVED_DIR, "wb") as f:
            pickle.dump(self.tokenizer, f, pickle.HIGHEST_PROTOCOL)

    def _load_tokenizer(self) -> None:
        """Unpickle the tokenizer from ``TOKENIZER_SAVED_DIR``.

        NOTE: unpickling can execute arbitrary code; the file must come from
        a trusted source.
        """
        with open(self.TOKENIZER_SAVED_DIR, "rb") as f:
            self.tokenizer = pickle.load(f)

    def _train(self) -> None:
        """Train FastText on the tokenized corpus and save the model."""
        self.fasttext = FastText(sentences=self.corpus,
                                 size=100,
                                 window=5,
                                 min_count=1,
                                 iter=100)
        self.fasttext.save(self.MODEL_SAVED_DIR)

    def dataset_to_embedding(self) -> pd.DataFrame:
        """Pad/truncate the token columns and convert them to arrays.

        ``TITLE`` is fixed to 10 tokens and ``TEXTCONTENT`` to 32. A new
        ``TITLE_IDX`` column holds vocabulary ids for the title; ``TITLE``
        and ``TEXTCONTENT`` end up holding one vector per token.

        Returns:
            The modified dataset.
        """
        self.dataset["TITLE_IDX"] = self.dataset["TITLE"].apply(
            self._sentence_length_fix, args=[10])
        self.dataset["TITLE"] = self.dataset["TITLE"].apply(
            self._sentence_length_fix, args=[10])
        self.dataset["TEXTCONTENT"] = self.dataset["TEXTCONTENT"].apply(
            self._sentence_length_fix, args=[32])

        # Series.items() replaces iteritems(), which newer pandas no longer
        # provides.
        for index, value in self.dataset["TITLE_IDX"].items():
            assert len(value) == 10

        for index, value in self.dataset["TITLE"].items():
            assert len(value) == 10

        for index, value in self.dataset["TEXTCONTENT"].items():
            assert len(value) == 32

        self.dataset["TITLE_IDX"] = self.dataset["TITLE_IDX"].apply(
            lambda tokenized: np.array(
                [self._word_to_idx(token) for token in tokenized]))
        self.dataset["TITLE"] = self.dataset["TITLE"].apply(
            lambda tokenized: np.array(
                [self._word_to_vec(token) for token in tokenized]))
        self.dataset["TEXTCONTENT"] = self.dataset["TEXTCONTENT"].apply(
            lambda tokenized: np.array(
                [self._word_to_vec(token) for token in tokenized]))

        return self.dataset

    def embedding_to_sentence(self, target: list or np.array) -> list:
        """Map each vector in ``target`` back to a word."""
        return [self._vec_to_word(vector) for vector in target]

    def _sentence_length_fix(self, sentence: list or np.array,
                             length: int) -> list or np.array:
        """Return ``sentence`` padded with ``'<PAD>'`` or cut to ``length``.

        Padding extends the given list in place (and returns it); truncation
        returns a slice and leaves the input untouched.
        """
        missing = length - len(sentence)
        if missing > 0:
            sentence.extend(['<PAD>'] * missing)
        elif missing < 0:
            sentence = sentence[:length]
        return sentence

    def _vec_to_word(self, vector) -> str:
        """Return the special token matching ``vector`` or the nearest word."""
        basis = np.eye(100, dtype=np.float32)
        for row, token in enumerate(('<PAD>', '<STA>', '<EOS>', '<UNK>')):
            if np.array_equal(vector, basis[row]):
                return token
        return self.fasttext.wv.most_similar(positive=[vector], topn=1)[0][0]

    def _word_to_vec(self, word) -> np.array:
        """Return the vector for ``word``; any lookup failure yields ``<UNK>``."""
        basis = np.eye(100, dtype=np.float32)
        special_rows = {'<PAD>': 0, '<STA>': 1, '<EOS>': 2, '<UNK>': 3}
        try:
            if word in special_rows:
                return basis[special_rows[word]]
            return self.fasttext.wv.word_vec(word)
        except Exception:
            # Best effort: anything the model cannot embed becomes <UNK>.
            return basis[3]

    def _word_to_idx(self, word) -> int:
        """Return the id of ``word`` (first match), or 3 (``<UNK>``) if unknown."""
        try:
            for idx, known in self.idx_word_dict.items():
                if known == word:
                    return idx
        except Exception:
            # Best effort, as before: a failed lookup maps to <UNK>.
            pass
        return 3

    def _idx_to_word(self, idx) -> str:
        """Return the word stored for ``idx``.

        Raises:
            KeyError: If ``idx`` is not a known id.
        """
        return self.idx_word_dict[idx]
Example #19
0
class KoreanTokenizer:
    '''
    Tokenizer for Korean sentences.

    Attributes
    ----------
    pre_trained : bool
        | If True, a pre-trained KoNLPy analyzer is used (default : True)
        | If False, an unsupervised tokenizer based on the soynlp L-Tokenizer
        | is used and must be trained first.
    analyzer : str
        | Name of the KoNLPy analyzer (default : Hannanum)
        | Recognised names: Hannanum, Kkma, Komoran, Mecab, Okt
        | Note: Mecab needs to be installed separately before being used.

    Methods
    -------
    train
        | Trains the tokenizer on a corpus when 'pre_trained' is False.
    tokenize
        | Tokenizes the input sentence and returns its tokens.
    extract_noun
        | Extracts nouns from the input sentence.

    '''

    def __init__(self, pre_trained=True, analyzer='Hannanum'):
        self.pre_trained = pre_trained

        if analyzer in ('Hannanum', 'Kkma', 'Komoran', 'Mecab', 'Okt'):
            # The analyzer class is looked up on the KoNLPy `tag` module by name.
            self.analyzer = getattr(tag, analyzer)()
        elif not (pre_trained == False):
            # Unknown name while a pre-trained analyzer was requested: only a
            # message is printed and no analyzer attribute is set.
            print('Enter a valid KoNLPy analyzer name.\n\tavailable: Hannanum, Kkma, Komoran, Mecab, Okt')

        self.WordExtractor = WordExtractor(min_frequency=0)
        self.noun_extractor = LRNounExtractor(verbose=False)
        self.word_score = {}

    def train(self, text):
        '''
        Train the unsupervised tokenizer on a corpus.
        When ``pre_trained`` is True this only prints a notice and returns.

        Attributes
        ----------
        text : str
            | Input text handed to the word extractor
        '''
        if self.pre_trained == True:
            print('A pre-trained KoreanTokenizer is being used. No need to train it.')
            return

        self.WordExtractor.train(text)
        self.words = self.WordExtractor.extract()

        # Score of a word = forward cohesion * exp(right branching entropy).
        scored = {}
        for term, stats in self.words.items():
            scored[term] = stats.cohesion_forward * exp(stats.right_branching_entropy)
        self.word_score = scored

    def tokenize(self, text):
        '''
        Tokenize the input text.

        Attributes
        ----------
        text : str
            | Input text to be tokenized

        Output
        ------
        tokens : list
            | Tokens of the input text; None (after a printed notice) when the
            | unsupervised tokenizer has no scores yet

        '''
        if self.pre_trained == True:
            return self.analyzer.morphs(text)

        if not self.word_score:
            print('An unsupervised KoreanTokenizer should be trained first, before tokenizing.')
            return

        # A fresh LTokenizer is built from the current scores on every call.
        self.tokenizer = LTokenizer(scores=self.word_score)
        return self.tokenizer.tokenize(text)

    def extract_noun(self, text):
        '''
        Extract nouns from the input text.

        Attributes
        ----------
        text : str
            | Input text from which nouns will be extracted

        Output
        ------
        nouns : list
            | Noun tokens found by the KoNLPy analyzer; None when
            | ``pre_trained`` is not True
        '''
        if not (self.pre_trained == True):
            return None
        return self.analyzer.nouns(text)
Example #20
0
def Makegraph_Wordcloud_Soynlp(target):
    """Render a soynlp-based word cloud for column ``target`` and save it as PNG.

    Relies on module-level state defined outside this block (``flag_login``,
    ``data_origin``, ``resultdir``, ``filename_dateflag``) and on ``Login``,
    ``Log`` and ``messagebox``. When the login flag is 0, None or '' only
    ``Login()`` is called. In either case the completion dialog is shown
    unless an exception occurred; any exception is logged and shown in an
    error dialog.

    Args:
        target: Column name in ``data_origin``; also part of the output
            file name.
    """
    try:
        if flag_login == 0 or flag_login == None or flag_login == '':
            Login()
        # A "run preprocessing first" guard on flag_prepro was drafted here
        # but is disabled.
        else:
            # Keep Hangul syllables only; everything else becomes a space.
            frame = pd.DataFrame(data_origin[target], columns=['contents'])
            frame['contents'] = data_origin[target].apply(
                lambda text: re.sub('[^가-힣]', ' ', text))

            extractor = WordExtractor(
                min_frequency=10,  # TODO: make configurable (e.g. proportional to the data size)
                min_cohesion_forward=0.05,
                min_right_branching_entropy=0.0)
            extractor.train(frame['contents'].values)

            cohesion_score = {}
            for term, stats in extractor.extract().items():
                cohesion_score[term] = stats.cohesion_forward

            # Force these compounds to stay joined by giving them score 1.
            # (Original author's note: unsure whether this is the right place.)
            for forced in ('숙소제공', '교통비지급', '인센티브', '초과근무시간확대',
                           '복지포인트', '인사우대', '근평가점', '주거이전수당'):
                cohesion_score[forced] = 1

            tokenizer = LTokenizer(scores=cohesion_score)
            frame['tokenizer'] = frame['contents'].apply(
                lambda text: tokenizer.tokenize(text, remove_r=True))

            all_tokens = [token
                          for tokens in frame['tokenizer'].values
                          for token in tokens]

            # Top-100 tokens by frequency.
            frequencies = dict(Counter(all_tokens).most_common(100))

            # TODO: read the stopword file with `with open`.
            stopword_table = pd.read_csv('stopwords.csv',
                                         encoding='cp949',
                                         skiprows=0)
            for row in stopword_table.values:
                for stopword in row:
                    frequencies.pop(stopword, None)

            cloud_builder = WordCloud(font_path='NanumGothic.ttf',
                                      width=500,
                                      height=500,
                                      background_color='white')
            wordcloud = cloud_builder.generate_from_frequencies(frequencies)

            plt.clf()
            plt.figure(figsize=(20, 20))
            plt.imshow(wordcloud)
            plt.axis('off')
            # plt.show()
            output_path = (resultdir + filename_dateflag + target +
                           ' - wordcloud_soynlp.png')
            plt.savefig(output_path, dpi=100)

            # Disabled draft: frequency bar chart (temp)
            # plt.clf()
            # plt.style.use('ggplot')
            # plt.figure(figsize = (len(list(words_dict_soynlp.keys())[:20])*0.6, 10))  # variable grid size
            # plt.title('상위 10개 빈출단어')
            # plt.bar(list(words_dict_soynlp.keys())[:20], list(words_dict_soynlp.values())[:20])
            # plt.xticks(rotation = 45, ha = 'right')  # rotate x-axis labels
            # plt.savefig(resultdir + filename_dateflag + target + ' - wordfrequency.png', dpi = 200)

        messagebox.showinfo(
            '작업', '워드클라우드(Soynlp) 생성이 완료되었습니다.\n\nresult폴더에 결과물이 저장되었습니다.')
    except Exception as e:
        Log(desc=e)
        messagebox.showerror('경고', str(e) + ' 열을 찾을 수 없습니다.')
Example #21
0
    temp['freq'] = freq

    nouns_list.append(temp)

# Rank the collected noun records by score (highest first) and keep the nouns
# above NOUNS_THRESHOLD.
df_nouns = pd.DataFrame(nouns_list)
df_nouns = df_nouns.sort_values(by=['score'], ascending=False)
# NOTE(review): the mask is built from `df`, not `df_nouns` -- `df` is not
# defined in this part of the file; confirm which frame is intended.
nouns_candidates_list = df_nouns.loc[df.score > NOUNS_THRESHOLD].noun.tolist()
print('nouns_candidates_list : {}\n'.format(len(nouns_candidates_list)))

print(''' words extractor ''')
word_extractor = WordExtractor(min_frequency=100,
                               min_cohesion_forward=0.05,
                               min_right_branching_entropy=0.0)

# `corpus` is expected to be defined earlier in the script.
word_extractor.train(corpus)
words = word_extractor.extract()
# Single-character entries are dropped.
words = {k: v for k, v in words.items() if len(k) > 1}
# Filled with one dict per word by the loop that follows.
words_list = list()
for k, v in words.items():
    temp = dict()
    cohesion = v.cohesion_forward
    branching_entropy = v.left_branching_entropy
    left_freq = v.leftside_frequency
    right_freq = v.rightside_frequency
    score = cohesion * branching_entropy

    temp['word'] = k.lower()
    temp['cohesion'] = cohesion
    temp['branching_entropy'] = branching_entropy
    temp['left_freq'] = left_freq
    temp['right_freq'] = right_freq
Example #22
0
class KoreanTokenizer:
    '''
    A class to tokenize a Korean sentence.

    Attributes
    ----------
    **kwargs
        | Keyword arguments for WordExtractor object (see soynlp.word.WordExtractor)

    Methods
    -------
    train
        | Trains KoreanTokenizer on a corpus
    tokenize
        | Tokenizes the input sentence and returns its tokens

    '''

    def __init__(self, **kwargs):
        # Imported inside the method: a name imported in the class body
        # becomes a class attribute and is not visible as a bare name here.
        from soynlp.word import WordExtractor

        if 'sents' in kwargs:
            del kwargs['sents']
            print("WARNING: 'sents' argument is ignored.")

        self.WordExtractor = WordExtractor(**kwargs)
        # Initialised up front so tokenize() can report "not trained yet"
        # instead of failing on a missing attribute.
        self.words = {}
        self.word_score = {}

    def train(self, text, **kwargs):
        '''
        A method to train the KoreanTokenizer object.

        Each extracted word is scored as
        forward cohesion * exp(right branching entropy).

        Attributes
        ----------
        text : iterable or DoublespaceLineCorpus
            | A input text in any iterable type (e.g. list)
            | or a DoublespaceLineCorpus object (see soynlp.utils.DoublespaceLineCorpus)
        **kwargs
            | Keyword arguments for WordExtractor.train() method (see soynlp.word.WordExtractor.train)
        '''
        from math import exp

        if 'sents' in kwargs:
            del kwargs['sents']
            print("WARNING: 'sents' argument is ignored; WordExtractor is trained on 'text' argument only.")

        self.WordExtractor.train(text, **kwargs)
        self.words = self.WordExtractor.extract()

        def calculate_word_score(score):
            return score.cohesion_forward * exp(score.right_branching_entropy)

        # Iterate the stored extraction result (self.words); a bare `words`
        # name does not exist in this scope.
        self.word_score = {
            word: calculate_word_score(score)
            for word, score in self.words.items()
        }

    def tokenize(self, text, **kwargs):
        '''
        A method to tokenize the input text

        Returns the list of tokens, or None (after printing a notice) when
        the tokenizer has not been trained yet.

        Attributes
        ----------
        text : str
            | An input text in str type

        **kwargs
            | Keyword arguments for LTokenizer.tokenize() method (see soynlp.tokenizer.LTokenizer.tokenize)
        '''
        if 'sentence' in kwargs:
            del kwargs['sentence']
            print("WARNING: 'sentence' argument is ignored; word_tokenizer tokenizes 'text' argument only.")

        if not self.word_score:
            print('KoreanTokenizer should be trained first, before tokenizing.')
            return

        from soynlp.tokenizer import LTokenizer

        self.tokenizer = LTokenizer(scores=self.word_score)
        return self.tokenizer.tokenize(text, **kwargs)