Example #1
    def _train_noun_extractor(
            self,
            sents,
            min_num_of_features=1,
            max_frequency_when_noun_is_eojeol=30,  # noun init
            min_noun_score=0.3,
            min_noun_frequency=1,
            min_eojeol_frequency=1):  # noun extraction

        self.noun_extractor = LRNounExtractor_v2(
            extract_pos_feature=False,
            extract_determiner=False,
            extract_compound=False,
            ensure_normalized=self._ensure_normalized,
            verbose=self._verbose,
            min_num_of_features=min_num_of_features,
            max_frequency_when_noun_is_eojeol=max_frequency_when_noun_is_eojeol
        )

        self.noun_extractor.train(sents, min_eojeol_frequency)
        nouns = self.noun_extractor.extract(min_noun_score,
                                            min_noun_frequency,
                                            reset_lrgraph=False)

        return nouns
def soynlp_tokenizer(corpus):
    from soynlp.tokenizer import LTokenizer
    from soynlp.word import WordExtractor
    from soynlp.noun import LRNounExtractor_v2

    # word extractor
    word_extractor = WordExtractor(
        min_frequency=100,  # example
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0)
    word_extractor.train(corpus)
    words = word_extractor.extract()

    cohesion_score = {
        word: score.cohesion_forward
        for word, score in words.items()
    }

    # noun extractor
    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(corpus)  # {noun: NounScore}

    noun_scores = {noun: score.score for noun, score in nouns.items()}
    combined_scores = {
        noun: score + cohesion_score.get(noun, 0)
        for noun, score in noun_scores.items()
    }
    combined_scores.update({
        subword: cohesion
        for subword, cohesion in cohesion_score.items()
        if subword not in combined_scores
    })

    tokenizer = LTokenizer(scores=combined_scores)
    return tokenizer
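# A minimal usage sketch (not part of the original example). 'news_corpus.txt'
# is a hypothetical path; any iterable of Korean sentences works as `corpus`.
from soynlp import DoublespaceLineCorpus

sents = DoublespaceLineCorpus('news_corpus.txt', iter_sent=True)
tokenizer = soynlp_tokenizer(sents)

# LTokenizer splits each eojeol into its highest-scoring left substring (L)
# and the remaining right part (R).
print(tokenizer.tokenize('하루 동안의 뉴스 기사를 토크나이징합니다'))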
def get_noun_words(begin_d=None, end_d=None):
    _, sentences, corpus_class = make_corpus(begin_d=begin_d, end_d=end_d)

    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(sentences)  # {noun: NounScore}

    # noun_words = [(-stat.score, word, stat.frequency) for word, stat in nouns.items()]
    return nouns
Example #4
def noun_corpus(sents):
    noun_extractor = LRNounExtractor_v2(verbose=True, extract_compound=True)
    noun_extractor.train(sents)
    nouns = noun_extractor.extract()

    noun_scores = {noun: score[0] for noun, score in nouns.items() if len(noun) > 1}
    tokenizer = NounLMatchTokenizer(noun_scores)
    corpus = [tokenizer.tokenize(sent) for sent in sents]
    return corpus
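# Note (not part of the original examples): LRNounExtractor_v2 returns
# NounScore namedtuples; in recent soynlp versions the fields are
# (frequency, score), which is why the snippet above reads score[0] for the
# frequency. A self-contained sketch of accessing the fields by name:
from soynlp.noun import LRNounExtractor_v2

def noun_frequencies_and_scores(sents):
    extractor = LRNounExtractor_v2(verbose=False)
    nouns = extractor.train_extract(sents)  # {noun: NounScore}
    freqs = {noun: s.frequency for noun, s in nouns.items()}
    scores = {noun: s.score for noun, s in nouns.items()}
    return freqs, scores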
Example #5
def noun_extractor_test(corpus_path):
    from soynlp import DoublespaceLineCorpus
    from soynlp.noun import LRNounExtractor
    from soynlp.noun import LRNounExtractor_v2
    from soynlp.noun import NewsNounExtractor
    corpus = DoublespaceLineCorpus(corpus_path, num_doc=1000)

    # LRNounExtractor
    print('LRNounExtractor test\n{}'.format('-' * 40))
    noun_extractor = LRNounExtractor()
    noun_scores = noun_extractor.train_extract(corpus)

    print('{}\n{} words are extracted\ntop 20 frequency * score'.format(
        '-' * 30, len(noun_scores)))
    topwords = sorted(
        noun_scores,
        key=lambda x: -noun_scores[x].score * noun_scores[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word, noun_scores[word].score))

    # NewsNounExtractor
    print('\nNewsNounExtractor test\n{}'.format('-' * 40))
    newsnoun_extractor = NewsNounExtractor()
    newsnoun_scores = newsnoun_extractor.train_extract(corpus)

    print('\n{}\n{} words are extracted\ntop 20 frequency * score'.format(
        '-' * 30, len(newsnoun_scores)))
    topwords = sorted(
        newsnoun_scores,
        key=lambda x: -newsnoun_scores[x].score * newsnoun_scores[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word,
                                             newsnoun_scores[word].score))
    print('noun extractor test has been done\n\n')

    # LRNounExtractor_v2
    print('\nNounExtractor_v2 test\n{}'.format('-' * 40))
    noun_extractor_v2 = LRNounExtractor_v2()
    noun_scores_v2 = noun_extractor_v2.train_extract(corpus)
    noun_scores_v2 = {
        noun: score
        for noun, score in noun_scores_v2.items() if len(noun) > 1
    }

    print('\n{}\n{} words are extracted\ntop 20 frequency * score'.format(
        '-' * 30, len(noun_scores_v2)))
    topwords = sorted(
        noun_scores_v2,
        key=lambda x: -noun_scores_v2[x].score * noun_scores_v2[x].frequency)[:20]
    for word in topwords:
        print('word = {}, score = {}'.format(word, noun_scores_v2[word].score))
    print('noun extractor test has been done\n\n')
    def train_lexicon(self, document_path):
        sentence_list = DoublespaceLineCorpus(document_path, iter_sent=True)
        compound_extractor = LRNounExtractor_v2(verbose=True)
        compounds = compound_extractor.train_extract(sentence_list)
        p = re.compile("[^a-zA-Z0-9가-힣_]+")
        # keep alphanumeric/Hangul nouns; score is a NounScore namedtuple,
        # so score[0] + score[1] is frequency + noun score
        compound_list = [n for n, score in compounds.items()
                         if len(p.findall(n)) == 0 and score[0] + score[1] > 5 and len(n) > 2]
        train_ner_lexicon = []
        for compound in compound_list:
            train_ner_lexicon.append((compound, "UNK"))
        for word, ner_tag in train_ner_lexicon:
            if word not in self.ner_lexicon:
                self.ner_lexicon[word] = [ner_tag]
def main(args):
    # Find patterns and extract words from a given set of documents
    sentences = DoublespaceLineCorpus(args.corpus_fname, iter_sent=True)
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0)

    # word extractor
    word_extractor.train(sentences)
    words = word_extractor.extract()
    cohesion_score = {
        word: score.cohesion_forward
        for word, score in words.items()
    }
    print('Word   (Freq, cohesion, branching entropy)\n')
    for word, score in sorted(words.items(),
                              key=lambda x: word_score(x[1]),
                              reverse=True)[:30]:
        print('%s     (%d, %.3f, %.3f)' %
              (word, score.leftside_frequency, score.cohesion_forward,
               score.right_branching_entropy))

    # noun extractor
    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(sentences)  # {noun: NounScore}
    noun_scores = {noun: score.score for noun, score in nouns.items()}

    # combined score
    combined_scores = {
        noun: score + cohesion_score.get(noun, 0)
        for noun, score in noun_scores.items()
    }
    combined_scores.update({
        subword: cohesion
        for subword, cohesion in cohesion_score.items()
        if subword not in combined_scores
    })

    # maxScore tokenizer
    tokenizer = MaxScoreTokenizer(scores=combined_scores)

    # save tokenizer
    with open(args.tokenizer_path, 'wb') as f:
        pickle.dump(tokenizer, f, pickle.HIGHEST_PROTOCOL)
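# A hedged follow-up sketch (not part of the original): reload the pickled
# tokenizer and use it. 'tokenizer.pkl' stands in for args.tokenizer_path.
import pickle

with open('tokenizer.pkl', 'rb') as f:
    loaded_tokenizer = pickle.load(f)

# MaxScoreTokenizer repeatedly splits out the highest-scoring substring of
# each eojeol, so substrings with higher combined scores win.
print(loaded_tokenizer.tokenize('하루 동안의 뉴스 기사를 토크나이징합니다'))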
def get_data(text):
    delli = [
        '등', '것', '위', '대', '뒤', '오', '통', '또', '수', '말', '더', '못', '새', '인',
        '있', '점', '올', '많', '때', '측', '기자', '종목', '수익률', 'https'
    ]
    noun_extractor = LRNounExtractor_v2(verbose=False)
    nouns = noun_extractor.train_extract(text.split(' '))
    nouns_data = Counter()

    for word in delli:
        if word in nouns:
            del nouns[word]

    for word, data in nouns.items():
        # data is a NounScore namedtuple; data[0] is the noun frequency
        nouns_data += Counter({word: int(data[0])})

    return nouns_data
Example #9
def build_vocab(config, data=None):
    if data is not None:
        sents = MyIterator(data)
    else:
        sents = MyIterator(config.data_path)

    noun_extractor = LRNounExtractor_v2(verbose=False)
    nouns = noun_extractor.train_extract(sents)

    noun_dict = {}
    for noun, score in nouns.items():
        if (score.frequency >= config.min_frequency
                and score.score >= config.min_score
                and len(noun) > config.min_length):
            noun_dict[noun] = score.score

    vocab_path = os.path.join(config.save_path,'vocab.pkl')
    config.vocab_path = vocab_path
    #save_pickle(vocab_path, noun_dict)

    tokenizer = MaxScoreTokenizer(noun_dict)

    if data is not None:
        word2vec_corpus = Word2VecCorpus(data, tokenizer)
    else:
        word2vec_corpus = Word2VecCorpus(config.data_path, tokenizer)

    # NOTE: `size=` is the gensim < 4.0 keyword; gensim >= 4.0 renamed it to `vector_size=`
    word2vec_model = Word2Vec(
        word2vec_corpus,
        size=config.word_hidden_size,
        alpha=0.025,
        window=5,
        min_count=config.min_frequency,
        sg=0,
        negative=5)

    word2vec_path = os.path.join(config.save_path, 'word2vec{}.model'.format(config.word_hidden_size))
    config.word2vec_path = word2vec_path
    #word2vec_model.save(word2vec_path)

    return noun_dict, word2vec_model
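# A hedged usage sketch (not part of the original). `config` is assumed to be
# a simple attribute container exposing the fields referenced in build_vocab;
# every value below is an illustrative assumption, and MyIterator /
# Word2VecCorpus come from the original module.
from types import SimpleNamespace

config = SimpleNamespace(
    data_path='corpus.txt',      # hypothetical corpus path
    save_path='./model',         # hypothetical output directory
    min_frequency=100,
    min_score=0.3,
    min_length=1,
    word_hidden_size=100,
)

noun_dict, w2v_model = build_vocab(config)
print(len(noun_dict), 'nouns kept as vocabulary')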
Example #10
    def _extract_nouns(self, sentences):

        noun_extractor = LRNounExtractor_v2(l_max_length=self.l_max_length,
                                            r_max_length=self.r_max_length,
                                            min_eojeol_count=2,
                                            min_num_of_features=2,
                                            max_count_when_noun_is_eojeol=15,
                                            extract_compound=False,
                                            logpath=self.logpath,
                                            extract_pos_feature=True,
                                            verbose=self.verbose)

        noun_extractor.train(sentences)

        nouns = noun_extractor.extract(
            reset_lrgraph=False,
            min_count=10,
            minimum_noun_score=0.4,
        )

        self._lrgraph = LRGraph({
            l: {r: v
                for r, v in rdict.items()}
            for l, rdict in noun_extractor.lrgraph._lr.items()
        })
        self._num_of_eojeols = noun_extractor._num_of_eojeols
        self._num_of_covered_eojeols = noun_extractor._num_of_covered_eojeols

        self.noun_extractor = noun_extractor

        if self.verbose:
            message = 'noun extraction was done. {:.2f} % eojeols are covered'.format(
                100 * self._num_of_covered_eojeols / self._num_of_eojeols)
            self._print(message, replace=True, newline=True)

        return nouns
Example #11
def noun_extract(datas):
    ne = LRNounExtractor_v2(verbose=True)
    nouns = ne.train_extract(datas)
    # peek at a few extracted compound nouns and their component nouns
    print(list(ne._compounds_components.items())[:5])
    return nouns
data_path = company_name + '_labeled_data.csv'  # load from the csv file

# contents is a list with each article converted to a string; points is the class label (0 or 1)
contents, points = tool.loading_rdata(data_path)
# build the dictionary file
if not os.path.isfile('preprocessed_' + company_name + '.csv'):
    print("\n")
    print('"preprocessed_' + company_name + '.csv" does not EXIST!')
    print('MAKE "preprocessed_' + company_name + '.csv" FILE... let\'s go~!!')
    print("\n")
    doc = pd.read_csv(data_path, index_col='datetime')
    contents = []
    for i in range(len(doc['text'])):
        if len(doc.iloc[i]['text']) > 100:
            contents.append(doc.iloc[i]['text'])
    noun_extractor = LRNounExtractor_v2(verbose=True)
    nouns = noun_extractor.train_extract(contents, min_noun_frequency=20)

    match_tokenizer = NounLMatchTokenizer(nouns)
    f = open('preprocessed_' + company_name + '.csv',
             'w',
             newline='',
             encoding='utf-8')
    fieldnames = ['text', 'num']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    test = []
    for j in range(len(contents)):
        temp_list = match_tokenizer.tokenize(contents[j])
        del_list2 = []
        for i in range(len(temp_list)):
# %%
# noun extraction using soynlp
language_list = ["python", "java", "c", "etc"]
# language_list = ["java", "etc"]

from soynlp.noun import LRNounExtractor_v2

for language_name in language_list:
    title_list = []
    df = pd.read_csv(
        f"../../analyze_data/{language_name}/{language_name}_team.csv")
    titles = df["title"]
    for title in titles:
        title_list.append(title)

    noun_extractor = LRNounExtractor_v2()
    nouns = noun_extractor.train_extract(title_list)
    if language_name == "python":
        del nouns["Python"]
    if language_name == "java":
        del nouns["Java"]

    # for noun, lank in nouns.items():
    #     print(noun, lank)
    displayWordCloud(language_name, " ".join(nouns))
    # noun_extractor._compounds_components.get(title_list, None)

Example #14
def analyzeSentence(sentences):
    noun_extractor = LRNounExtractor_v2(verbose=True)
    nouns = noun_extractor.train_extract(sentences)
    return nouns
Example #15
    def __init__(self, sents):
        self.inst = LRNounExtractor_v2(verbose=False, extract_compound=True)
        self.inst.train(sents)
        self.inst.extract()
Example #16
import pandas as pd
import re
from collections import Counter
from itertools import chain

WORDS_THRESHOLD = 2
NOUNS_THRESHOLD = 0.9
TOKENS_THRESHOLD = 0.15
USERS_THRESHOLD = 0.15

df = pd.read_pickle('./data/df_raw.pkl')

corpus = df.text.tolist()

print(''' nouns extractor ''')
noun_extractor = LRNounExtractor_v2(verbose=True, extract_compound=True)

noun_extractor.train(corpus)
nouns = noun_extractor.extract(min_noun_frequency=100, min_noun_score=0.3)
nouns_list = list()
for k, v in nouns.items():
    word = k
    score = v.score
    freq = v.frequency

    temp = dict()
    temp['noun'] = word.lower()
    temp['score'] = score
    temp['freq'] = freq

    nouns_list.append(temp)
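# A hedged continuation sketch (not part of the original): the nouns_list of
# dicts built above converts directly into a DataFrame for quick inspection.
nouns_df = pd.DataFrame(nouns_list).sort_values('freq', ascending=False)
print(nouns_df.head(20))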
scores_dictionary = {
    'noun_scores': [],
    'text': text
}

with open('scores_dictionary.pickle', 'wb') as fw:
    pickle.dump(scores_dictionary, fw)
    print("dumping complete")

with open('scores_dictionary.pickle', 'rb') as fr:
    scores_dictionary = pickle.load(fr)
    print("loading complete")

## noun score

# nouns only
noun_extractor = LRNounExtractor_v2(verbose=True,
                                    extract_compound=False)  # do not extract compound nouns
nouns = noun_extractor.train_extract(text)  # {noun: NounScore}
noun_scores = {noun: score.score for noun, score in nouns.items()}
print("extracting noun")

#print(list(noun_extractor._compounds_components.items())[:5])

scores_dictionary['noun_scores'] = noun_scores

with open('scores_dictionary.pickle', 'wb') as fw:
    pickle.dump(scores_dictionary, fw)
    print("dumping complete")
"""

Noun = []
for noun, score in nouns.items():