Example #1
    def __init__(self):
        # load noun cohesion score
        with open('utils/words.p', 'rb') as rf:
            words = pickle.load(rf)
            cohesion_score = {
                word: score.cohesion_forward
                for word, score in words.items()
            }
            cohesion_score = {
                k: v
                for k, v in sorted(cohesion_score.items(),
                                   key=lambda item: item[1],
                                   reverse=True) if v > 0
            }
        with open('utils/nouns.p', 'rb') as rf:
            nouns = pickle.load(rf)
            noun_score = {noun: score.score for noun, score in nouns.items()}
            noun_cohesion_score = {
                noun: score + cohesion_score.get(noun, 0)
                for noun, score in noun_score.items()
            }
            self._noun_cohesion_score = {
                k: v
                for k, v in sorted(noun_cohesion_score.items(),
                                   key=lambda item: item[1],
                                   reverse=True) if v > 0
            }

        self._soy = LTokenizer(scores=self._noun_cohesion_score)
        self._is_flatten = False  # no_flatten
        self._is_remove_r = False  # no_remove
        self._emo = get_emoji_regexp()  # re compiled
Example #2
def tokenizer_test():
    from soynlp.tokenizer import LTokenizer
    from soynlp.tokenizer import MaxScoreTokenizer
    from soynlp.tokenizer import RegexTokenizer

    regex_tokenizer = RegexTokenizer()
    if not (regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') 
            == ['아라랄랄', '111', '이히힝', 'ㅇㅇ', 'ㅠㅠ', '우유우유', 'ab', '!']):
        raise ValueError("regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!') == {}".format(
            regex_tokenizer.tokenize('아라랄랄111이히힝ㅇㅇㅠㅠ우유우유ab!')))

    ltokenizer = LTokenizer({'데이터':0.4, '데이':0.35, '데이터센터':0.38})
    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이') 
            == ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError("ltokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
            ltokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    if not (ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)
            == ['데이터', '는', '데이터센터', '의', '데이', '데이']):
        raise ValueError("ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05) == {}".format(
            ltokenizer.tokenize('데이터는 데이터센터의 데이데이', tolerance=0.05)))

    maxscore_tokenizer = MaxScoreTokenizer({'데이터':0.4, '데이':0.35, '데이터센터':0.38})
    if not (maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') 
            == ['데이터', '는', '데이터', '센터의', '데이', '데이']):
        raise ValueError("maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이') == {}".format(
            maxscore_tokenizer.tokenize('데이터는 데이터센터의 데이데이')))

    print('all tokenizer tests passed\n\n')
Example #3
    def tokenize(self, text):
        '''
        A method to tokenize input text.

        Attributes
        -----------
        text : str
            | An input text to be tokenized

        Output
        ------
        tokens : list
            | List of tokens (str) that make up the input text

        '''

        if self.pre_trained:
            return self.analyzer.morphs(text)

        else:
            if not self.word_score:
                print('An unsupervised KoreanTokenizer should be trained first, before tokenizing.')
                return
            
            self.tokenizer = LTokenizer(scores=self.word_score)

            result = self.tokenizer.tokenize(text)

            return result
Example #4
    def tokenize(self, text, **kwargs):
        '''
        A method to tokenize the input text

        Attributes
        ----------
        text : str
            | An input text in str type

        **kwargs
            | Keyword arguments for LTokenizer.tokenize() method (see soynlp.tokenizer.LTokenizer.tokenize)
        '''
        
        if 'sentence' in kwargs.keys():
            del kwargs['sentence']
            print("WARNING: 'sentence' argument is ignored; word_tokenizer tokenizes 'text' argument only.")

        if not self.word_score:
            print('KoreanTokenizer should be trained first, before tokenizing.')
            return
        
        self.tokenizer = LTokenizer(scores=self.word_score)
        
        result = self.tokenizer.tokenize(text, **kwargs)

        return result
Example #5
def data_tokenize(news_title, tdm_vocab):

    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0)

    word_extractor.train(news_title)
    words = word_extractor.extract()

    cohesion_score = {
        word: score.cohesion_forward
        for word, score in words.items()
    }
    tokenizer = LTokenizer(scores=cohesion_score)

    cluster_data = []
    bert_null_list = []

    for title in news_title:
        title = test(title)
        sent = tokenizer.tokenize(title, flatten=False)
        sentence = []
        for i in sent:
            if i[0] in tdm_vocab:
                sentence.append(i[0])

        cluster_data.append(sentence)

    return cluster_data
Example #6
def prediction(text):
    params = Params('config/params.json')

    # load tokenizer and torchtext Fields
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    pickle_kor = open('pickles/kor.pickle', 'rb')
    kor = pickle.load(pickle_kor)
    pickle_eng = open('pickles/eng.pickle', 'rb')
    eng = pickle.load(pickle_eng)
    eos_idx = eng.vocab.stoi['<eos>']

    # select model and load trained model
    model = Transformer(params)
    model.load_state_dict(torch.load(params.save_model))
    model.to(params.device)
    model.eval()

    # convert input into tensor and forward it through selected model
    tokenized = tokenizer.tokenize(text)
    indexed = [kor.vocab.stoi[token] for token in tokenized]


    source = torch.LongTensor(indexed).unsqueeze(0).to(params.device)  # [1, source_len]: unsqueeze to add batch size
    target = torch.zeros(1, params.max_len).type_as(source.data)       # [1, max_len]

    encoder_output = model.encoder(source)
    next_symbol = eng.vocab.stoi['<sos>']

    decoded_len = params.max_len
    for i in range(0, params.max_len):
        if next_symbol == eos_idx:
            decoded_len = i
            break
        target[0][i] = next_symbol
        decoder_output, _ = model.decoder(target, source, encoder_output)  # [1, target length, output dim]
        prob = decoder_output.squeeze(0).max(dim=-1, keepdim=False)[1]
        next_word = prob.data[i]
        next_symbol = next_word.item()

    # keep only the tokens generated before <eos>
    target = target[0][:decoded_len].unsqueeze(0)

    # translation_tensor = [target length] filed with word indices
    target, attention_map = model(source, target)
    target = target.squeeze(0).max(dim=-1)[1]

    reply_token = [eng.vocab.itos[token] for token in target if token != eos_idx]  # drop <eos> tokens
    #translation = translated_token[:translated_token.index('<eos>')]
    #translation = ''.join(translation)
    reply = ' '.join(reply_token)
    #print(reply)

    #display_attention(tokenized, reply_token, attention_map[4].squeeze(0)[:-1])
    return reply 
Example #7
    def _extracte(self) -> None:
        self.extractor = WordExtractor()
        self.extractor.train(self.corpus)
        self.words = self.extractor.extract()
        self.cohesion_score = {
            word: score.cohesion_forward
            for word, score in self.words.items()
        }
        self.tokenizer = LTokenizer(scores=self.cohesion_score)
Example #8
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {key:(scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)) for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item

            if cnt < 10 or len(word) == 1:
                continue

            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            soy_tokens = soy_tokenizer.tokenize(word)
            if ' '.join(tokens) == ' '.join(soy_tokens):
                continue

            if is_all_nng(mcab.pos(word)):
                #print("nouns only : {}".format(word))
                #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))
                continue

            if len(soy_tokens) > 1:
                continue

            #print("{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt))

            words = re.findall(' '.join(tokens), sentences)
            if len(words) < (cnt * 0.05):
                # If the morpheme-split form appears less than 5% as often as the unsplit word,
                # treat it as a morpheme-splitting error (see the numeric sketch after this example).
                (cho, jung, jong) = hgtk.letter.decompose(word[-1])
                if 'ㄱ' <= jong <= 'ㅎ':
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
                else:
                    dic_line = "{},,,1000,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
                print("{}\t{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words), jong))
                f2.writelines(dic_line + '\n')
                f3.writelines("{}\t{}\t{}\t{}\t{}".format(word, ' '.join(tokens), ' '.join(soy_tokens), cnt, len(words)) + '\n')
Example #9
def predict(config):
    params = Params('config/params.json')

    # load tokenizer and torchtext Fields
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    pickle_kor = open('pickles/kor.pickle', 'rb')
    kor = pickle.load(pickle_kor)

    pickle_eng = open('pickles/eng.pickle', 'rb')
    eng = pickle.load(pickle_eng)

    # select model and load trained model
    model = Transformer(params)

    model.load_state_dict(torch.load(params.save_model))
    model.to(params.device)
    model.eval()

    input = clean_text(config.input)

    # convert input into tensor and forward it through selected model
    tokenized = tokenizer.tokenize(input)
    indexed = [kor.vocab.stoi[token] for token in tokenized]

    source = torch.LongTensor(indexed).unsqueeze(0).to(
        params.device)  # [1, source length]: unsqueeze to add batch size
    target = torch.zeros(1, params.max_len).type_as(source.data)

    encoder_output = model.encoder(source)
    next_symbol = eng.vocab.stoi['<sos>']

    for i in range(0, params.max_len):
        target[0][i] = next_symbol
        dec_output = model.decoder(target, source, encoder_output)
        # dec_output = [1, target length, output dim]
        prob = dec_output.squeeze(0).max(dim=-1, keepdim=False)[1]
        next_word = prob.data[i]
        next_symbol = next_word.item()

    # translation_tensor = [target length] filed with word indices
    target = model(source, target)
    target = torch.argmax(target.squeeze(0), -1)
    # target = target.squeeze(0).max(dim=-1, keepdim=False)
    translation = [eng.vocab.itos[token] for token in target][1:]

    translation = ' '.join(translation)
    print(f'kor> {config.input}')
    print(f'eng> {translation.capitalize()}')
Example #10
    def __init__(self, model_path: str = None):
        self.word_extractor = WordExtractor(min_frequency=5,
                                            min_cohesion_forward=0.05,
                                            min_right_branching_entropy=0.0)
        self.unk = 0
        self.pad = 1
        self.sos = 2
        self.eos = 3

        if model_path:
            with open(model_path, 'rb') as readFile:
                self.cohesion_score = dill.load(readFile)
        else:
            self.cohesion_score = {}
        self.tokenizer = LTokenizer(scores=self.cohesion_score)
        self.tok_to_id, self.id_to_tok = self._build_dict()
Example #11
def soynlp_tokenizer(corpus):
    from soynlp.tokenizer import LTokenizer
    from soynlp.word import WordExtractor
    from soynlp.noun import LRNounExtractor_v2

    # word extractor
    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0)
    word_extractor.train(corpus)
    words = word_extractor.extract()

    cohesion_score = {
        word: score.cohesion_forward
        for word, score in words.items()
    }

    # noun extractor
    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(corpus)  # list of str like

    noun_scores = {noun: score.score for noun, score in nouns.items()}
    combined_scores = {
        noun: score + cohesion_score.get(noun, 0)
        for noun, score in noun_scores.items()
    }
    combined_scores.update({
        subword: cohesion
        for subword, cohesion in cohesion_score.items()
        if not (subword in combined_scores)
    })

    tokenizer = LTokenizer(scores=combined_scores)
    return tokenizer
Example #12
def pmi_test(corpus_path):
    print('PMI test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained l tokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x:x.split(),
        dynamic_weight=False,
        verbose=True)

    pmi_dok = pmi(x, min_pmi=0, alpha=0.0001, verbose=True)

    for pair, pmi_value in sorted(pmi_dok.items(), key=lambda x: -x[1])[100:110]:
        pair_ = (idx2vocab[pair[0]], idx2vocab[pair[1]])
        print('pmi {} = {:.3f}'.format(pair_, pmi_value))
    print('computed PMI')
Example #13
    def getTokenizer(self, contents):
        corpus = SentiCorpus(contents, iter_sent=True)
        # pass the corpus to train() rather than to the constructor
        word_extractor = WordExtractor()
        word_extractor.train(corpus)
        words_scores = word_extractor.extract()
        scores = {w: s.cohesion_forward for w, s in words_scores.items()}
        return LTokenizer(scores=scores)
Example #14
class LTokenizerKorean(SpecialTokenizer):  # L-part (stem)-oriented tokenizer
    def __init__(self, scores=None):
        from soynlp.tokenizer import LTokenizer
        self.inst = LTokenizer(scores=scores)  # scores can be given explicitly; if omitted, cohesion scores are computed automatically
        self.OUT_TYPE = [list, str]

    def __call__(self, *args, **kwargs):
        tokens = self.inst.tokenize(args[0])
        return tokens
Example #15
def select_tokenizer(model):
    if model == Okt:
        tokenizer = Okt()
        tokenized = tr['document'].apply(tokenizer.morphs).tolist()
    elif model == LTokenizer:
        tokenizer = LTokenizer()
        # LTokenizer has no morphs() method; use tokenize() instead
        tokenized = tr['document'].apply(tokenizer.tokenize).tolist()
    else:
        raise ValueError('model must be Okt or LTokenizer')

    return tokenized
Example #16
class LTokenizerKorean(SpecialTokenizer):
    def __init__(self, scores=None):
        from soynlp.tokenizer import LTokenizer
        self.inst = LTokenizer(scores=scores)

        self.OUT_TYPE = [list, str]

    def __call__(self, *args, **kwargs):
        tokens = self.inst.tokenize(args[0])
        return tokens
Example #17
def soy_tokenize(model_fname, input_sentence):
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # A word scores highly when (1) its characters cohesively appear together, and
    # (2) diverse particles, endings, or other words follow it, i.e. the branching entropy on its right side is high
    # (a small numeric sketch of this score follows the example).
    scores = {
        key: (scores[key].cohesion_forward *
              math.exp(scores[key].right_branching_entropy))
        for key in scores.keys()
    }
    tokenizer = LTokenizer(scores=scores)
    tokens = tokenizer.tokenize(input_sentence)
    tokenized_sent = ' '.join(tokens)

    return tokenized_sent
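A minimal numeric sketch of the score used above, with invented values for a single word; the real values come from the trained WordExtractor:

import math

cohesion_forward = 0.48          # hypothetical cohesion of the word's characters
right_branching_entropy = 1.5    # hypothetical diversity of what follows the word

# Words that are both internally cohesive and followed by many different
# particles/endings get a higher L-part score.
score = cohesion_forward * math.exp(right_branching_entropy)
print(round(score, 2))  # ~2.15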
Example #18
def predict_cnn(config):
    # load tokenizer and torchtext Field
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    pickle_vocab = open('pickles/text.pickle', 'rb')
    text = pickle.load(pickle_vocab)

    model = CNN(config)

    model.load_state_dict(torch.load(config.save_model))
    model.to(device)
    model.eval()

    tokenized = tokenizer.tokenize(config.input)

    min_len = config.filter_sizes[-1]

    # if user's input sentence is shorter than the largest filter size, add pad tokens to input sentence
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))

    indexed = [text.vocab.stoi[token] for token in tokenized]
    length = [len(indexed)]

    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)

    prediction = torch.sigmoid(model(tensor, length_tensor))
    label = torch.round(prediction)

    if label == 1:
        label = 'Positive'
    else:
        label = 'Negative'

    sentiment_percent = prediction.item()
    print(f'[in]  >> {config.input}')
    print(f'[out] >> {sentiment_percent*100:.2f} % : {label}')
Example #19
def predict_sequential(config):
    # load tokenizer and torchtext Field
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    pickle_vocab = open('pickles/text.pickle', 'rb')
    text = pickle.load(pickle_vocab)
    pad_idx = text.vocab.stoi[text.pad_token]

    model_type = {
        'vanilla_rnn': RNN(config, pad_idx),
        'bidirectional_lstm': BidirectionalLSTM(config, pad_idx),
    }

    # select model and load trained model
    model = model_type[config.model]
    model.load_state_dict(torch.load(config.save_model))
    model.eval()

    # convert input into tensor and forward it through selected model
    tokenized = tokenizer.tokenize(config.input)
    indexed = [text.vocab.stoi[token] for token in tokenized]
    length = [len(indexed)]

    tensor = torch.LongTensor(indexed).to(device)  # [input length]
    tensor = tensor.unsqueeze(
        1)  # [input length, 1] for adding batch dimension
    length_tensor = torch.LongTensor(length)

    prediction = torch.sigmoid(model(tensor, length_tensor))
    label = torch.round(prediction)

    if label == 1:
        label = 'Positive'
    else:
        label = 'Negative'

    sentiment_percent = prediction.item()
    print(f'[in]  >> {config.input}')
    print(f'[out] >> {sentiment_percent*100:.2f} % : {label}')
Example #20
def soy_tokenizer(ext_type = 'noun'):
    # load the saved noun/word extraction results
    with open(r'.\Model\Extractor\nouns.bin', 'rb') as f:
        nouns = pickle.load(f)
    with open(r'.\Model\Extractor\words.bin', 'rb') as f:
        words = pickle.load(f)

    noun_scores = {noun:score.score for noun, score in nouns.items()}
    cohesion_score = {word:score.cohesion_forward for word, score in words.items()}
    combined_scores = {noun:score + cohesion_score.get(noun, 0)
        for noun, score in noun_scores.items()}
    combined_scores.update(
        {subword:cohesion for subword, cohesion in cohesion_score.items()
        if not (subword in combined_scores)}
    )
    if ext_type == 'noun':
        return LTokenizer(scores = noun_scores)
    elif ext_type == 'word':
        return LTokenizer(scores = cohesion_score)
    elif ext_type == 'comb':
        return LTokenizer(scores = combined_scores)
Example #21
def content_to_token(text_file_name):
    print("opening file " + text_file_name)
    with open(text_file_name, 'r', encoding="utf-8") as f:
        lines = f.read().splitlines()
    re.sub(r"[\[\]<>~]", ' ', lines[0])
    re.sub(r"['~]", ' ', lines[0])
    re.sub(r'"', ' ', lines[0])

    text = []
    for line in lines:
        line = re.sub(r"[\[\]<>~]", ' ', line)
        line = re.sub(r"['~]", ' ', line)
        line = re.sub(r'"', ' ', line)
        line = re.sub('\\W', ' ', line)
        text.append(line)

    ltokenizer = LTokenizer(scores=scores_dictionary)

    print("making list of words")
    words = []
    for sent in text:
        conclude_sent = []
        # flatten=False makes the tokenizer return [L noun, R particle] pairs (see the sketch after this example).
        pre_list = ltokenizer.tokenize(sent, flatten=False)
        for LR_list in pre_list:
            word = LR_list[0]
            if word in word_dict:
                word = word_dict[word]
            if word not in exception_list:
                conclude_sent.append(word)
        words.append(conclude_sent)

    token_file_name = text_file_name[:-4] + '.csv'

    f = open(token_file_name, 'w', newline="")
    wr = csv.writer(f)
    for word in words:
        wr.writerow(word)
    f.close()
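A minimal sketch of the flatten=False behaviour used above; the score table here is made up, whereas the original scores_dictionary is defined elsewhere:

from soynlp.tokenizer import LTokenizer

ltokenizer = LTokenizer(scores={'예시': 0.5, '단어': 0.5})  # illustrative scores only
pairs = ltokenizer.tokenize('예시의 단어들', flatten=False)
# With flatten=False each eojeol comes back as an (L, R) pair, e.g. ('예시', '의'), ('단어', '들');
# keeping pair[0] retains only the L parts, as in the loop above.
l_parts = [pair[0] for pair in pairs]
print(l_parts)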
Example #22
def pad_sentence(dataframe, min_len):
    """
    to use CNN, all the inputs has the minimum length as same as the largest filter size
    if the input is shorter than the largest CNN filter size, we should pad that input using pad_sentence method
    Args:
        dataframe: (DataFrame) dataframe used to train and validate the model
        min_len: (integer) the largest CNN filter size used to set the minimum length of the model

    Returns:

    """
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    for i, row in dataframe.iterrows():
        tokenized = tokenizer.tokenize(row.document)
        if len(tokenized) < min_len:
            tokenized += ['<pad>'] * (min_len - len(tokenized))
        padded_sent = ' '.join(tokenized)
        dataframe.at[i, 'document'] = padded_sent

    return dataframe
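A minimal sketch of the padding rule described in the docstring, using a plain token list and an invented filter size:

tokens = ['재미', '있는', '영화']
min_len = 5  # hypothetical largest CNN filter size
if len(tokens) < min_len:
    tokens += ['<pad>'] * (min_len - len(tokens))
print(tokens)  # ['재미', '있는', '영화', '<pad>', '<pad>']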
Example #23
def soy_tokenize(corpus_fname, model_fname, output_fname):
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0
                                   )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()

    # https://github.com/lovit/soynlp/blob/master/tutorials/wordextractor_lecture.ipynb
    # A word scores highly when (1) its characters cohesively appear together, and
    # (2) diverse particles, endings, or other words follow it, i.e. the branching entropy on its right side is high
    scores = {
        key: (scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)) for key in scores.keys()
    }
    tokenizer = LTokenizer(scores=scores)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.replace('\n', '').strip()
            normalized_sent = emoticon_normalize(sentence, num_repeats=3)
            tokens = tokenizer.tokenize(normalized_sent)
            tokenized_sent = ' '.join(tokens)
            f2.writelines(tokenized_sent + '\n')
Example #24
def build_vocab(config):
    """
    Build the vocabulary using the scores obtained above.
    """
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)
    """
    tokenizer 중 cohesion score를 기준으로 단어를 구분하는 LTokenizer을 사용한다.
    한국어 어절을 '명사/동사/형용사/부사'(L part) + '조사 등'(R part)으로 보고, 의미가 핵심적인 L part의 점수를 도출한다.
    """
    # Field를 통해 단어를 tokenize하고 tensor로 바꾼다.
    # Field에 대한 다양한 parameter에 대한 정보는 https://torchtext.readthedocs.io/en/latest/data.html 에서 얻을 수 있다.
    kor = ttd.Field(tokenize=tokenizer.tokenize, lower=True, batch_first=True)

    # English is tokenized with spacy; the first token is always <sos> and the last is <eos>.
    eng = ttd.Field(tokenize='spacy',
                    init_token='<sos>',
                    eos_token='<eos>',
                    lower=True,
                    batch_first=True)

    data_dir = Path().cwd() / 'data'
    train_file = os.path.join(data_dir, 'train.csv')
    train_data = pd.read_csv(train_file, encoding='utf-8')
    train_data = convert_to_dataset(train_data, kor, eng)

    print(f'Build vocabulary using torchtext . . .')

    # Build the vocabularies from the loaded data: Korean tokens for kor, English tokens for eng.
    kor.build_vocab(train_data, max_size=config.kor_vocab)
    eng.build_vocab(train_data, max_size=config.eng_vocab)

    # Print the number of unique tokens.
    print(f'Unique tokens in Korean vocabulary: {len(kor.vocab)}')
    print(f'Unique tokens in English vocabulary: {len(eng.vocab)}')

    # Print the most frequently used Korean/English words.
    print(f'Most commonly used Korean words are as follows:')
    print(kor.vocab.freqs.most_common(20))
    print(f'Most commonly used English words are as follows:')
    print(eng.vocab.freqs.most_common(20))

    # Save the generated Korean/English vocab with pickle.
    with open('pickles/kor.pickle', 'wb') as kor_file:
        pickle.dump(kor, kor_file)

    with open('pickles/eng.pickle', 'wb') as eng_file:
        pickle.dump(eng, eng_file)
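A minimal sketch of the L + R decomposition described in the comments above, reusing the illustrative scores from Example #2 rather than the pickled cohesion scores:

from soynlp.tokenizer import LTokenizer

ltokenizer = LTokenizer(scores={'데이터': 0.4, '데이': 0.35, '데이터센터': 0.38})
print(ltokenizer.tokenize('데이터는 데이터센터의 데이데이'))
# -> ['데이터', '는', '데이터', '센터의', '데이', '데이']
# Each eojeol is split into a high-scoring L part and the remaining R part (particles, etc.).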
Example #25
def data_tokenize(news_title):

    word_extractor = WordExtractor(
        min_frequency=100, # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )

    word_extractor.train(news_title)
    words = word_extractor.extract()

    cohesion_score = {word:score.cohesion_forward for word, score in words.items()}
    tokenizer = LTokenizer(scores=cohesion_score)

    return tokenizer
Example #26
def build_vocab(config):
    """
    Build vocabulary used to convert input sentence into word indices using soynlp and spacy tokenizer
    Args:
        config: configuration containing various options

    Returns:

    """
    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    # include lengths of the source sentences to use pack pad sequence
    kor = ttd.Field(tokenize=tokenizer.tokenize,
                    lower=True,
                    include_lengths=True)

    eng = ttd.Field(tokenize='spacy',
                    init_token='<sos>',
                    eos_token='<eos>',
                    lower=True)

    data_dir = Path().cwd() / 'data'
    train_file = os.path.join(data_dir, 'train.csv')
    train_data = pd.read_csv(train_file, encoding='utf-8')
    train_data = convert_to_dataset(train_data, kor, eng)

    print(f'Build vocabulary using torchtext . . .')

    kor.build_vocab(train_data, max_size=config.kor_vocab)
    eng.build_vocab(train_data, max_size=config.eng_vocab)

    print(f'Unique tokens in Korean vocabulary: {len(kor.vocab)}')
    print(f'Unique tokens in English vocabulary: {len(eng.vocab)}')

    print(f'Most commonly used Korean words are as follows:')
    print(kor.vocab.freqs.most_common(20))

    print(f'Most commonly used English words are as follows:')
    print(eng.vocab.freqs.most_common(20))

    with open('pickles/kor.pickle', 'wb') as kor_file:
        pickle.dump(kor, kor_file)

    with open('pickles/eng.pickle', 'wb') as eng_file:
        pickle.dump(eng, eng_file)
Example #27
    def _get_tokenizer(self, df):
        """
        Generate a tokenizer by extracting words
        Args:
            df: data corpus of one language
        Returns:
            tokenizer
        """
        word_extractor = WordExtractor()
        word_extractor.train(df)
        words = word_extractor.extract()
        print(f'length of words is {len(words)}')
        cohesion_scores = {
            word: score.cohesion_forward
            for word, score in words.items()
        }
        tokenizer = LTokenizer(scores=cohesion_scores)
        return tokenizer
Example #28
def pmi_test(corpus_path):
    print('pmi test\n{}'.format('-' * 40))

    from soynlp import DoublespaceLineCorpus
    from soynlp.word import WordExtractor
    from soynlp.tokenizer import LTokenizer
    from soynlp.vectorizer import sent_to_word_contexts_matrix
    from soynlp.word import pmi

    corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
    print('num sents = {}'.format(len(corpus)))

    word_extractor = WordExtractor()
    word_extractor.train(corpus)
    cohesions = word_extractor.all_cohesion_scores()

    l_cohesions = {word: score[0] for word, score in cohesions.items()}
    tokenizer = LTokenizer(l_cohesions)
    print('trained l tokenizer')

    x, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        tokenizer=tokenizer,  # (default) lambda x:x.split(),
        dynamic_weight=False,
        verbose=True)

    x_pmi, x, y = pmi(x, min_pmi=0, alpha=0.0001)

    rows, cols = x_pmi.nonzero()
    data = x_pmi.data

    print('row  shape = {}'.format(rows.shape))
    print('col  shape = {}'.format(cols.shape))
    print('data shape = {}'.format(data.shape))

    for indpt in data.argsort()[-150:-100]:
        i = rows[indpt]
        j = cols[indpt]
        pair = (idx2vocab[i], idx2vocab[j])
        value = data[indpt]
        print('pmi {} = {:.3f}'.format(pair, value))
    print('computed pmi')
Example #29
def build_vocab(config):
    """
    Build vocab used to convert Korean input sentence into word indices using soynlp tokenizer
    Args:
        config: configuration object containing various options

    Returns:

    """

    pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb')
    cohesion_scores = pickle.load(pickle_tokenizer)
    tokenizer = LTokenizer(scores=cohesion_scores)

    # To use packed padded sequences, tell the model how long the actual sequences are by 'include_lengths=True'
    text = ttd.Field(tokenize=tokenizer.tokenize, include_lengths=True)
    label = ttd.LabelField(dtype=torch.float)

    data_dir = Path().cwd() / 'data'
    train_txt = os.path.join(data_dir, 'train.txt')

    train_data = pd.read_csv(train_txt, sep='\t')
    train_data, valid_data = train_test_split(train_data,
                                              test_size=0.3,
                                              random_state=32)
    train_data = convert_to_dataset(train_data, text, label)

    print(f'Building vocabulary using torchtext . . .')
    text.build_vocab(train_data, max_size=config.vocab_size)
    label.build_vocab(train_data)

    print(f'Unique tokens in TEXT vocabulary: {len(text.vocab)}')
    print(f'Unique tokens in LABEL vocabulary: {len(label.vocab)}')

    print(f'Most commonly used words are as follows:')
    print(text.vocab.freqs.most_common(20))

    file_text = open('pickles/text.pickle', 'wb')
    pickle.dump(text, file_text)

    file_label = open('pickles/label.pickle', 'wb')
    pickle.dump(label, file_label)
Example #30
def train_extractor(begin_d=None,
                    end_d=None,
                    sections: list = None,
                    base_dir='./out',
                    tokenizer=None):
    _, sentences, corpus_class = make_corpus(begin_d=begin_d,
                                             end_d=end_d,
                                             sections=sections,
                                             base_dir=base_dir)
    # nouns = get_noun_words(begin_d='20201101', end_d='20201130')

    noun_extractor = LRNounExtractor()
    nouns = noun_extractor.train_extract(sentences)  # list of str like
    noun_score = dict([(key, val.score) for key, val in nouns.items()])
    if tokenizer is None:
        tokenize = lambda x: x.strip().split()
    elif tokenizer == 'max_score_tokenizer':
        tokenize = MaxScoreTokenizer(noun_score)
    elif tokenizer == 'ltokenizer':
        tokenize = LTokenizer(noun_score)
    else:
        raise NotImplementedError

    if sections is not None and len(sections) >= 1:
        min_tf = 10
        min_df = 2
    else:
        min_tf = 20
        min_df = 2

    keyword_extractor = CorpusbasedKeywordExtractor(
        min_tf=min_tf,
        min_df=min_df,
        # tokenize=lambda x: x.strip().split(),
        tokenize=tokenize,
        verbose=True)
    # docs: list of str like
    keyword_extractor.train(sentences)
    return keyword_extractor, nouns, corpus_class
Example #31
def check_morphs(lst, corpus_fname, output_fname, log_fname):
    mcab = mecab.MeCab()

    model_fname = 'soyword.model'
    word_extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0
    )
    word_extractor.load(model_fname)
    scores = word_extractor.word_scores()
    scores = {key:(scores[key].cohesion_forward * math.exp(scores[key].right_branching_entropy)) for key in scores.keys()}
    soy_tokenizer = LTokenizer(scores=scores)

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
         open(output_fname, 'w', encoding='utf-8') as f2, \
         open(log_fname, 'w', encoding='utf-8') as f3:
        sentences = f1.read()

        for item in lst:
            cnt, word = item

            if cnt < 100 or len(word) == 1:
                continue

            tokens = mcab.morphs(word)
            if len(tokens) == 1:
                continue

            (cho, jung, jong) = hgtk.letter.decompose(word[-1])
            if 'ㄱ' <= jong <= 'ㅎ':
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'T', word)
            else:
                dic_line = "{},,,,NNP,*,{},{},*,*,*,*,*".format(word, 'F', word)
            f2.writelines(dic_line + '\n')
            f3.writelines("{}\t{}\t{}".format(word, ' '.join(tokens), cnt) + '\n')