Example #1
class KoreaHelper(object):
    def __init__(self):
        from konlpy.tag import Mecab
        self.mecab = Mecab()

    def pos(self, phrase: Text):
        """
        $ python -m sagas.ko.ko_helper pos '계획이'
        :param phrase:
        :return:
        """
        return self.mecab.pos(phrase)

    def nouns(self, phrase: Text):
        """
        $ python -m sagas.ko.ko_helper nouns '피자와 스파게티가'
        $ python -m sagas.ko.ko_helper nouns '계획이'
        :param phrase:
        :return:
        """
        from sagas.nlu.transliterations import translits
        from sagas.ko.kwn_procs import kwn
        ns = self.mecab.nouns(phrase)
        rs = []
        for w in ns:
            # ws = get_word_sets(w, 'ko')
            ws = kwn.get_synsets(w, first=True)
            if ws:
                rs.append({
                    'spec': ws[0].name(),
                    'text': w,
                    'translit': translits.translit(w, 'ko'),
                    'definition': ws[0].definition()
                })
            else:
                rs.append({
                    'text': w,
                    'translit': translits.translit(w, 'ko'),
                })
        return rs

    def translit(self, word):
        """
        $ python -m sagas.ko.ko_helper translit '피자와 스파게티가'

        See also: procs-ko-konlpy.ipynb
        :param word:
        :return:
        """
        from sagas.nlu.transliterations import translits
        for w, p in self.mecab.pos(word):
            expl = '_'
            if p in ('NNG', 'VV'):
                ws = get_word_sets(w, 'ko')
                if ws:
                    expl = f"{ws['name']}({ws['definition']})"
            print(w, translits.translit(w, 'ko'), p, expl)
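
Every example on this page ultimately goes through konlpy's Mecab.pos(), which returns a list of (morpheme, POS-tag) tuples based on the mecab-ko-dic tag set. A minimal sketch of the two calls the helper above wraps (the output shown is only indicative, since the exact tags depend on the installed dictionary):

from konlpy.tag import Mecab

mecab = Mecab()
# pos() yields (morpheme, tag) pairs, roughly:
# [('피자', 'NNG'), ('와', 'JC'), ('스파게티', 'NNG'), ('가', 'JKS')]
print(mecab.pos('피자와 스파게티가'))
# nouns() keeps only the nominal morphemes, e.g. ['피자', '스파게티']
print(mecab.nouns('피자와 스파게티가'))
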
def text_analysis():

    res = Response("block")
    res.headers["Access-Control-Allow-Origin"]="*"

    jsonData = request.get_json()

    resultData = dict()
    tokenizer = Mecab()
    pos_result = tokenizer.pos(jsonData['text'])
    print(pos_result)
    resultData['result'] = pos_result
    return json.dumps(resultData)
def load_data_and_labels2(file_name):
    positive_exams = []
    negative_exams = []
    positive_count = 0
    negative_count = 0

    exams = list(open(file_name, "r").readlines())
    for s in exams:
        splited = s.split('\t')
        if splited[2] == '0\n':
            negative_exams.append(splited[1])
            negative_count = negative_count + 1
        elif splited[2] == '1\n':
            positive_exams.append(splited[1])
            positive_count = positive_count + 1
        else:
            print(splited[0], splited[1], splited[2])

    mecab = Mecab()

    positive_result = []
    for pp in positive_exams:
        one_str = mecab.pos(pp)
        str_result = ''
        for p in one_str:
            if p[1] in {
                    'NNG', 'NNP', 'NNB', 'NNBC', 'VA', 'VV', 'SL', 'SN', 'SY'
            }:
                str_result = p[0] + ' ' + str_result
        positive_result.append(str_result)

    positive_labels = [[0, 1] for _ in positive_result]

    negative_result = []
    for pp in negative_exams:
        one_str = mecab.pos(pp)
        str_result = ''
        for p in one_str:
            if p[1] in {
                    'NNG', 'NNP', 'NNB', 'NNBC', 'VA', 'VV', 'SL', 'SN', 'SY'
            }:
                str_result = p[0] + ' ' + str_result
        negative_result.append(str_result)

    negative_labels = [[1, 0] for _ in negative_result]

    y = np.concatenate([positive_labels, negative_labels], 0)

    x_text = positive_result + negative_result

    return [x_text, y]
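
The parser above assumes a tab-separated file with an id, the review text, and a trailing 0/1 label on every line (Naver movie-review style). A hedged usage sketch; the file name and the two sample lines are made up for illustration:

# hypothetical ratings.txt, tab-separated as id<TAB>text<TAB>label:
#   1<TAB>재미있어요<TAB>1
#   2<TAB>별로였어요<TAB>0
x_text, y = load_data_and_labels2("ratings.txt")
# x_text: the content-word morphemes of each review joined into one string
# y: one-hot labels, [0, 1] for positive and [1, 0] for negative examples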
Example #4
def make_question_mecab_tokens(question):
    """
    입력받은 질문에 대한 형태소 분석 진행
    1. mecab
    2. 조사 제거
    3. 한 문장으로 다시 결합
    """
    # load mecab
    mecab = Mecab()

    # run mecab
    que_mecab = mecab.pos(question[0])

    # drop particles (josa): keep only these tags
    morpheme = [
        'NNG', 'NNP', 'NNB', 'NNBC', 'NR', 'NP', 'VV', 'VA', 'VX', 'VCP',
        'VCN', 'MM', 'MAG', 'MAJ', 'IC', 'SN'
    ]
    que_tokens = []

    for t in que_mecab:
        if t[1] in morpheme:
            que_tokens.append(t[0])

    if len(que_tokens) == 0:
        que_tokens.append('')

    # join back into a single sentence
    que_tokens_str = [' '.join(que_tokens)]

    return que_tokens_str
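
A quick usage sketch of the helper above; the argument is wrapped in a list because the function reads question[0], and the output is only indicative since the exact tokens depend on the mecab-ko dictionary:

print(make_question_mecab_tokens(['계획이 있나요']))
# -> a single-element list such as ['계획 있'] once particles and endings are dropped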
Example #5
    def pre_process(self, json, istrain):
        mecab = Mecab()

        data = []

        for cnt, article in enumerate(json):
            if cnt % 10000 == 0:
                print(cnt)
                
            text = bs(article["text"], "html.parser").text
            #title_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["title"])]
            #author_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["author"])]
            text_pos = ["%s_%s" % (first, second) for first, second in mecab.pos(text)]

            data.append({
                #"title_pos": title_pos,
                #"title_pos_sentences" : " ".join(title_pos),
                #"author_pos": author_pos,
                #"author_pos_sentences" : " ".join(author_pos),
                "text":article["text"],
                "text_pos": text_pos,
                "text_pos_sentences" : " ".join(text_pos),
                #"forumid": article["forumid"],                    
                "pk": article["pk"]
            })

            if istrain == True:
                data[cnt]["istroll"] = article["is_troll"]

        data = pd.DataFrame.from_dict(data)
        data = data.set_index('pk')

        return data
Example #6
    def voice2Text():

        fname = r'/home/ubuntu/handypotter/v2t.txt'

        with open(fname, mode='r', buffering=-1, encoding="UTF-8") as fp:
            text = fp.read()

        mecab = Mecab()

        lemmatizer = Lemmatizer(dictionary_name='default')

        # tag parts of speech; keep proper nouns, common nouns, verbs, adjectives, etc.
        tagged_list = mecab.pos(text)
        print(tagged_list)

        tags = ['NNP', 'NNG', 'NP', 'VV', 'VA', 'MAG', 'XR']
        stoptags = [
            'JKS', 'SF', 'XSN', 'EC', 'EP', 'VX', 'NNB', 'EF', 'JX', 'EP+EF',
            'XSV', 'XSA', 'XSN'
        ]

        sentence_token = [t[0] for t in tagged_list if t[1] in tags]
        print(sentence_token)

        return sentence_token
Example #7
def analyzing_morphem(content_list):
    mecab = Mecab()
    for idx, doc in enumerate(content_list):
        if idx % 5000 == 0:
            print('Morphem Analysis on %d' % idx)
        yield ' '.join([part for part, pos in mecab.pos(doc.decode('utf-8'))
                        ]).encode('utf-8')
Example #8
def predict(estimator, data_file, voca):
    with open(data_file) as f:
        contents = f.read()

    mecab = Mecab()
    morps = mecab.pos(contents)
    morps = [morp[0] for morp in morps if morp[1] in TAGS]

    input_fn = build_input_fn([' '.join(morps)],
                              labels=None,
                              voca=voca,
                              batch_size=1,
                              num_epochs=1,
                              shuffle=False)
    predict = estimator.predict(input_fn)

    cate_names = {v: k for k, v in CATES.items()}

    def second_cls(probs):
        tup = [(i, prob) for i, prob in enumerate(probs)]
        tup = sorted(tup, key=lambda x: x[1], reverse=True)
        return tup[1]

    for i, p in enumerate(predict):
        cls, probs = p['class'], p['prob']
        name, prob = cate_names[cls], probs[cls]

        cls2, prob2 = second_cls(probs)
        name2 = cate_names[cls2]
        tf.logging.info("Prediction %s: %s(%.4f), %s(%.4f)"\
            % (i+1, name, prob, name2, prob2 ))
Example #9
def pos():
    x = request.json  # incoming JSON payload
    print(x)
    requestText = x['text']  # text to run morphological analysis on

    # ------------------------ morphological analysis ---------------------------
    m = Mecab()
    checked_sent = requestText

    # spelling/typo preprocessing
    non_blank_checked_sent = checked_sent.replace(" ", "")  # strip whitespace
    temp_sent = pnu_spell_check(non_blank_checked_sent)

    # Running the spell check on whitespace-stripped input generally performed better,
    # but occasionally no correction comes back at all; in that case fall back to
    # the original input with its whitespace intact.
    if temp_sent != '':
        checked_sent = temp_sent
    else:
        checked_sent = pnu_spell_check(checked_sent)

    r = m.pos(checked_sent)
    print(r)
    # result = ''.join(r)
    # ------------------------ end of morphological analysis ---------------------------

    return jsonify(result=r)  # send the tagged result back
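
The route above expects a JSON body with a text field and answers with the tag pairs under result. A hedged sketch of the exchange (the route path is an assumption, since the snippet omits the Flask decorator, and the tags are only indicative):

# hypothetical request:   POST /pos   with body {"text": "계획이"}
# hypothetical response:  {"result": [["계획", "NNG"], ["이", "JKS"]]}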
Example #10
def view_post(request, pk):
    the_post = get_object_or_404(Post, pk=pk)
    the_comment = Comment.objects.filter(post=the_post)
    mecab = Mecab()
    morph = mecab.pos(the_post.content)
    the_morph = ' '.join(str(e) for e in morph)



    if request.method == 'GET':
        pass
    elif request.method =='POST':
        new_comment = Comment()
        new_comment.content = request.POST.get('content')
        new_comment.post = the_post
        new_comment.save()




    return render(request, 'view_post.html',{
        'post' : the_post,
        'comments' : the_comment,
        'morph' : the_morph,
    })
Example #11
def mL(temp, temp1):
    model = load_model('./news_lstm_usev3.model')
    with open('./tokenizer_usev3.pickle', 'rb') as handle:
        tok = pickle.load(handle)
    tag_classes = ['NNG', 'NNP']
    category = {0: '세계', 1: '코로나', 2: '사회', 3: '문화', 4: '정치', 5: 'IT과학', 6: '경제'}
    m = Mecab()
    data = crawler(temp, temp1)

    ind = len(data.index)
    json_list = {}
    while (ind):
        element = {}
        result_ml =""
        element['title'] = str(data.loc[len(data.index) - ind]['title'])
        element['date'] = str(data.loc[len(data.index) - ind]['date'])
        element['contents'] = str(data.loc[len(data.index) - ind]['contents'])
        element['link'] = str(data.loc[len(data.index) - ind]['link'])
        value = m.pos((str(data.loc[len(data.index) - ind]['title']) + str(data.loc[len(data.index) - ind]['contents'])).strip())
        for i in value:
            if i[1] in tag_classes and i[0] != '*':
                result_ml += i[0] + " "
        x = [result_ml.split()]
        sequence_data = tok.texts_to_sequences(x)
        pad_sequence_data = sequence.pad_sequences(sequence_data)
        element['probability'] = {}
        for idx, i in enumerate(model.predict(pad_sequence_data)[0]):
            element['probability'][category[idx]] = round((i * 100), 2)
        ind = ind - 1
        json_list[len(data.index) - ind] = element
    return json_list
Example #12
def getentity_slot(intent_idx, strbuf):
    mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
    for intent_str, idx in intent_dic.items():
        if idx == intent_idx:
            intent = intent_str
            break
    slot_value = story_slot_entity.get(intent)
    added = 0
    M = mecab.pos(strbuf)
    ## When no particle follows, words such as '피자' are not recognized as nouns,
    ## so as a workaround the raw text is also split on spaces further below;
    ## proper nouns should also be added to the mecab user dictionary.
    # print(M)
    for pos_tag in M:
        if (pos_tag[1] in ['NNG', 'NNP', 'SL', 'MAG']):  # nouns and English tokens only
            for key in slot_value:

                if pos_tag[0] in entity_list[key]:  # look up in the menu list
                    added = 1
                    if slot_value[key] is None:
                        slot_value[key] = [pos_tag[0]]
                    else:
                        slot_value[key].append(pos_tag[0])

    # Workaround from the comment above: also look up whole space-separated tokens.
    M = strbuf.split(' ')
    for word in M:
        for key in slot_value:
            if word in entity_list[key]:  # look up in the menu list
                added = 1
                if slot_value[key] is None:
                    slot_value[key] = [word]
                else:
                    slot_value[key].append(word)

    return added, slot_value
Example #13
def tokenize(talk_dic):
    SW = define_stopwords("./stopwords-ko.txt")
    mecab = Mecab()
    total = {}
    for k, v in talk_dic.items():
        total_sub = []
        for idx, talk_set in enumerate(v):
            time, talk = talk_set
            clean_talk = message_cleaning(talk)
            tokenized_talk = []
            for word, tag in mecab.pos(clean_talk):
                if len(word) == 1 and tag in [
                        'EC', 'JX', 'ETM', 'JKS', 'JKB', 'XSV', 'JKO',
                        'XSV+EC', 'XSN', 'NNB', 'EP', 'JKG', 'VCP', 'NNB+JKS',
                        'JKG'
                ]:
                    continue
                if word in SW and tag in [
                        'EC', 'JX', 'ETM', 'JKS', 'JKB', 'XSV', 'JKO',
                        'XSV+EC', 'XSN', 'NNB', 'EP', 'JKG', 'VCP', 'NNB+JKS',
                        'JKG'
                ]:
                    continue
                tokenized_talk.append((word, tag))
            #talk_dic[k][idx] = (time, talk, tokenized_talk)
            total_sub.extend(tokenized_talk)
        total[k] = total_sub
    return total
Example #14
class KorPreprocessor(PreprocessorBase):
    def __init__(self):
        super(KorPreprocessor, self).__init__()

        self.tagger = Mecab()

    def _to_morphs(self, s):
        return self.tagger.pos(s)

    @staticmethod
    def _clean(s):
        s = re.sub(r"[^가-힣ㄱ-ㅎ?.!,]", " ", s)
        s = s.strip()
        return s

    def preprocess(self, s):
        s = self._basic_nmt(s)
        s = self._clean(s)

        tagged = self._to_morphs(s)

        _s = []
        for w, _ in tagged:
            _s.append(w)
        s = " ".join(_s)
        s = self._add_token(s)
        return s
Example #15
class SearchCluster():
    def __init__(self, app):
        self.app = app
        self.mecab = Mecab()
        self.load_models()

    def load_models(self):
        self.word2vec = gensim.models.Word2Vec.load_word2vec_format(
            WORD2VEC_MODEL, binary=True)
        self.cluster_pipe = joblib.load(PIPE_DUMPING)

    def __task_to_vector(self, task):
        words = [key for key, pos in self.mecab.pos(task)]
        # aggregation word vectors
        vector = np.mean(np.array(
            [self.word2vec[word] for word in words if word in self.word2vec]),
                         axis=0)
        return vector

    def __predict_label(self, task):
        vector = self.__task_to_vector(task)
        return self.cluster_pipe.predict(vector)[0]

    def get_articles(self, user_id, task, topn=3):
        label = self.__predict_label(task)
        article_id_list = list(
            self.app.query_pool2.get_same_cluster_articles(
                user_id, label, topn))
        return list(
            self.app.query_pool2.get_article_list_by_id(article_id_list))
Example #16
    def preprocessing(self):
        #2018038092 안준

        print("\ndata preprocessing...\n")
        mecab = Mecab()
        stopwords = []  # list for collecting stopwords
        reader = csv.reader(self.stopwords_csv)

        for row in reader:
            stopwords.append(row)
        stopwords = sum(stopwords, [])  # flatten the 2-D list into a 1-D list
        self.stopwords_csv.close()

        for i in self.data.index:
            word_token = mecab.pos(self.data.at[i, 'content'])  # split the plot summary by part of speech
            filtering = [
                x for x, y in word_token if y in ['NNG', 'NNP', 'VV', 'VA']
            ]  # common nouns, proper nouns, verbs, adjectives

            # strip spaces inside the extracted tokens
            new_filtering = [i.replace(' ', '') for i in filtering]

            # stopword filtering
            result = []  # the preprocessed plot summary
            for word in new_filtering:
                if word not in stopwords:  # loaded from stopwords.csv
                    result.append(word)

            # store the result back into the dataframe
            self.data.at[i, 'content'] = result  # assign the preprocessed result to row i's 'content' column
            self.data.at[i, 'content'] = ' '.join(
                self.data.at[i, 'content'])  # join into a single string, since a list cannot be tokenized

        print('\ncomplete data preprocessing.')
Example #17
    def doc_to_stemmed_words(self):
        '''
        Return the word stems extracted from each sentence of the news article.
        :param text: news article text (string)
        :return: nested list whose elements are the lists of word stems extracted from each sentence
        '''

        sentences = (self.text).split(".")

        #kkma = Kkma()
        #remove_pos = "[(?P<조사>JK.*)(?P<접속조사>JC.*)(?P<전성어미>ET.*)(?P<종결어미>EF.*)(?P<연결어미>EC.*)(?P<접미사>XS.*)(?P<마침표물음표느낌표>SF.*)(?P<쉼표가운뎃점콜론빗금>SP.*)]" #kkma
        mecab = Mecab()
        remove_pos = "[(?P<조사>JK.*)(?P<접속조사>JC.*)(?P<전성어미>ET.*)(?P<종결어미>EF.*)(?P<연결어미>EC.*)(?P<접미사>XS.*)(?P<마침표물음표느낌표>SF.*)(?P<쉼표가운뎃점콜론빗금>SC.*)]"  # mecab

        stemmed_sentences = []

        for sentence in sentences:
            # stemmed_words = kkma.pos(sentence)
            stemmed_words = mecab.pos(sentence)
            stemmed_words = [
                x[0] for x in stemmed_words
                if not bool(re.match(remove_pos, x[1]))
            ]
            stemmed_sentences.append(stemmed_words)

        return stemmed_sentences
Example #18
    def convert_data(self) :
        """
        augment data with entity list and pattern
        :return: None
        """
        with codecs.open( self.pattern_data_path, "r", "utf-8" ) as fileObj :
            document = fileObj.readlines()
            return_arr = []

            for i, line in enumerate(document) :
                words = []
                if(self.use_mecab) :
                    words = str(line).split(' ')
                else :
                    mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
                    pos = mecab.pos(line)
                    for word, tag in pos:
                        words.append(word)
                print("===={0} line job start".format(i))
                match_keys = self._check_all_match(words)
                if(self.out_format_type == 'plain') :
                    aug_data = self._aug_sent(match_keys, words, [])
                    self._plain_formatter(aug_data)
                elif(self.out_format_type == 'iob') :
                    aug_data = self._aug_sent(match_keys, words, [])
                    self._iob_formatter(aug_data)
                else :
                    raise Exception (' '.join(['not', 'plain', 'or iob']))
                print("===={0} line job done".format(i))
Example #19
def tag_pos(sentences, tagger='kkma'):
    """
    Predict Part-of-Speech tag of input sentences
    PoS tagger: KKMA
    :param sentences: list of input sentences
    :return: tagged sentences
    """
    if tagger == 'kkma':
        kkma = Kkma()
    elif tagger == 'mecab':
        mecab = Mecab()

    morph_lists = []
    for sent in sentences:
        morph_list = []
        if tagger == 'kkma':
            pos_tagged_sentences = kkma.pos(sent)
        elif tagger == 'mecab':
            pos_tagged_sentences = mecab.pos(sent)

        for (key, value) in pos_tagged_sentences:
            value = transform_pos(value, tagger)
            morph_list.append([key, value])
        morph_lists.append(morph_list)

    return morph_lists
Example #20
class Preprocess:
    def __init__(self, word2idx_dic="", userdic=None) -> None:
        if word2idx_dic != "":
            f = open(word2idx_dic, "rb")
            self.word_index = pickle.load(f)
            f.close()
        else:
            self.word_index = None

        if userdic is None:
            self.mecab = Mecab()
        else:
            self.mecab = Mecab(dicpath=userdic)
        self.exclusion_tags = [
            "JKS",
            "JKC",
            "JKG",
            "JKO",
            "JKB",
            "JKV",
            "JKQ",
            "JX",
            "JC",
            "SF",
            "SP",
            "SS",
            "SE",
            "SO",
            "EP",
            "EF",
            "EC",
            "ETN",
            "ETM",
            "XSN",
            "XSV",
            "XSA",
        ]

    def pos(self, sent):
        return self.mecab.pos(sent)

    def get_keywords(self, pos, without_tag=False):
        f = lambda x: x in self.exclusion_tags
        word_list = []
        for p in pos:
            if not f(p[1]):
                word_list.append(p if not without_tag else p[0])
        return word_list

    def get_wordidx_sequence(self, keywords):
        if self.word_index is None:
            return []
        w2i = []
        for word in keywords:
            try:
                w2i.append(self.word_index[word])
            except KeyError:
                w2i.append(self.word_index["OOV"])
        return w2i
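
A minimal sketch of how the Preprocess class above is typically chained together (no word-index pickle is loaded here, so get_wordidx_sequence returns an empty list; with a real word2idx_dic it maps each keyword to its index, falling back to the 'OOV' entry):

p = Preprocess(word2idx_dic="", userdic=None)
pos = p.pos("오늘 날씨는 어때요")
keywords = p.get_keywords(pos, without_tag=True)  # particles and endings filtered out
seq = p.get_wordidx_sequence(keywords)            # [] here, because no word index was loaded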
Example #21
    def nlp_function(self, str):
        mecab = Mecab(dicpath="C:\\mecab\\mecab-ko-dic")
        pos_data = mecab.pos(str)
        res_str = ""
        for i in range(len(pos_data)):
            if pos_data[i][1][:2] == 'NN' or pos_data[i][1][:2] == 'VV':
                res_str += pos_data[i][0] + " "
        return res_str
Example #22
def filter_by_pos(text, accepts=["NNG", "NNP", "NNB", "VA", "VV", "VX", "VCP", "SL", "SH"]):
    mecab = Mecab()
    temp = []
    for pos in mecab.pos(text):
        if pos[1] in accepts:
            temp.append(pos[0])
    del mecab
    return " ".join(temp)
Example #23
    def files_to_map(self, folder, visited_file):
        """
        Get http list and make dictionary files
        dictionary take key and values, key is refering word and values is indicating numbers of word in files
        
        Args:
            foler is the directory which have a files
        """
        m = Mecab()
        answer = {}
        # skip files that have already been processed
        visited = self.get_visited_file(visited_file)
        check = False
        try:
            with open('word_idf.json','r') as f:
                html = f.read()
        except (IOError, OSError):
            # word_idf.json does not exist yet; start from an empty map
            pass
        #print(visited)
        for item in self.get_file_list(folder):
            #print(item)
            if not os.path.isfile(item):
                continue
            if item in visited:
                continue

            encoding = ['utf-8', 'cp949']
            for encode in encoding:
                try:
                    with open(item,'r', encoding = encode) as f:
                        html = f.read()
                        html = re.sub('[^가-힣ㄱ-ㅎ ]',' ',html)
                        html = re.sub(r'(.)\1+',r'\1\1',html)
                        for word in m.pos(html):
                            if word[1] not in ['NNP','NNG','NNB','VA']:
                                continue
                            answer.setdefault(word[0], 0)
                            answer[word[0]] = answer[word[0]] + 1
                        visited.append(item)
                        check = True
                        break
                except Exception as e:
                    log.error('Files_to_map() Line = '+str(inspect.currentframe().f_lineno)+" Error: "+str(e))
        if answer:
            try:
                # TODO: this should merge with the existing file instead of overwriting it
                json.dump(answer,open('word_idf.json','w'))
            except Exception as e:
                log.error("Files_to_map() Line = " +str(inspect.currentframe().f_lineno)+" Error: "+str(e))

        if check:
            try:
                with open(visited_file,'w',encoding='utf-8') as f:
                    writer = csv.writer(f)
                    writer.writerow(visited)
            except Exception as e:
                log.error("Files_to_map Line = " +str(inspect.currentframe().f_lineno)+" Error: "+str(e))

        return True
Example #24
def ismenu(msg):
    mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
    M = mecab.pos(msg)
    menu = []
    for pos_tag in M:
        if (pos_tag[1] in ['NNG', 'NNP', 'SL', 'MAG']):  # nouns and English tokens only
            if pos_tag[0] in menu_list:  # look up in the menu list
                menu.append(pos_tag[0])
    return menu
Example #25
    def pos_by_ISBN(self, contents):
        mecab = Mecab()
        pos_list = []

        for col in self.collection_review.find({"ISBN" : contents},{"_id" : 0, "review_text" : 1}):
            pos = mecab.pos(col['review_text'])
            pos_list.append(pos)

        return pos_list
Example #26
    def _pos_tagger(self, input, type='mecab'):
        """

        :param input:
        :return:
        """
        if (type == 'mecab'):
            mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
            return mecab.pos(str(input))
Example #27
def tokenize_ko(lyrics_file_ko):
    print("\n-------- K-POP LYRICS --------")

    # lyrics_file_ko = "../crawl_data/lyrics_kr/kr_lyrics_verbose.csv"
    df = pd.read_csv(lyrics_file_ko)
    print(df.shape, "# as_is_kpop")
    df = df.dropna()
    print(df.shape, "# dropna()")
    df = df.drop_duplicates()
    print(df.shape, "# drop_duplicates()")

    data = list(df['Lyrics'].values)
    print("num_lyrics_kpop:", len(data))

    # Load Korean stopwords.

    stopwords = ["하:VV", "있:VV", "되:VV", "있:VA", "이러:VV"]

    # Load Korean morphological analyzer.

    mecab = Mecab()

    word_list = []
    for lyric in data:
        lyric = re.sub('[a-zA-Z]', '', lyric)
        parsed = mecab.pos(lyric)
        tmp = []
        for w, pos in parsed:
            # We look for four parts of speech.
            # See below URL for POS tags (Mecab-ko).
            # *** KoNLPy Korean POS Tag Comparison Chart ***
            # https://docs.google.com/spreadsheets/d/1OGAjUvalBuX-oZvZ_-9tEfYD2gQe7hTGsgUpiiBSXI8/edit#gid=0
            if (pos == 'NNG') | (pos == 'NNP') | (pos == 'VV') | (pos == 'VA'):
                wpos = "{}:{}".format(w, pos)
                if wpos not in stopwords:
                    tmp.append(wpos)
        word_list.append(tmp)

    # Save tokenized lyrics, which contains nouns, verbs, and adjectives, to a pickle file.

    with open("../data/tokeninzed_kpop.p", 'wb') as f:
        pickle.dump(word_list, f)

    print("word list kpop sample:", word_list[0])

    flat_list = [item for sublist in word_list for item in sublist]
    print("total_kpop_words:", len(flat_list))

    counts = Counter(flat_list)
    print("uniq_words_kpop:", len(counts))

    # Save unique word list with frequency to file.

    with open("../data/uniq_words_freq_kpop.txt", 'w') as f:
        for k, v in counts.most_common():
            f.write("{}\t{}\n".format(k, v))
Example #28
def tokenize_ko(lyrics_file_ko):
    print("\n-------- K-POP LYRICS --------")

    # lyrics_file_ko = "../crawl_data/lyrics_kr/kr_lyrics_verbose.csv"
    df = pd.read_csv(lyrics_file_ko)
    print(df.shape, "# as_is_ko")
    df = df.dropna()
    print(df.shape, "# dropna()")
    df = df.drop_duplicates()
    print(df.shape, "# drop_duplicates()")

    data = list(df['Lyrics'].values)
    print("ko num of lyrics:", len(data))

    # Load Korean stopwords.

    stopwords = ["하:VV", "있:VV", "되:VV", "있:VA", "이러:VV"]

    # Load Korean morphological analyzer.

    mecab = Mecab()

    morphs = []
    for lyric in data:
        lyric = re.sub('[a-zA-Z]', '', lyric)
        parsed = mecab.pos(lyric)
        tmp = []
        for w, pos in parsed:
            # We look for four parts of speech
            # See below URL for POS tags (Mecab-ko)
            # *** KoNLPy Korean POS Tag Comparison Chart ***
            # https://docs.google.com/spreadsheets/d/1OGAjUvalBuX-oZvZ_-9tEfYD2gQe7hTGsgUpiiBSXI8/edit#gid=0
            if (pos == 'NNG') | (pos == 'NNP') | (pos == 'VV') | (pos == 'VA'):
                wpos = "{}:{}".format(w, pos)
                if wpos not in stopwords:
                    tmp.append(wpos)
        morphs.append(tmp)

    # Create 'processed' directory if there isn't any.

    processed_dir = "processed"
    if not os.path.exists(processed_dir):
        os.makedirs(processed_dir)

    with open("processed/word_list_ko.p", 'wb') as f:
        pickle.dump(morphs, f)

    flat_list = [item for sublist in morphs for item in sublist]
    print("total_ko_words:", len(flat_list))

    counts = Counter(flat_list)
    print("uniq_words_ko:", len(counts))

    with open("processed/uniq_word_ko.txt", 'w') as f:
        for k, v in counts.most_common():
            f.write("{}\t{}\n".format(k, v))
Example #29
def hello():
    app = Flask(__name__)
    app.config['JSONIFY_PRETTYPRINT_REGULAR'] = True
    contents = "ndllocvcv"

    from konlpy.tag import Mecab
    tagger = Mecab()
    t = tagger.pos("고양이는 양옹뉴턴야옹")
    print("========================================")
    return json.dumps(t, ensure_ascii=False)
Example #30
def tokenize_n(doc):
    pos_tagger = Mecab()

    a = []
    for t in pos_tagger.pos(doc):
        if (re.search(nnpattern, t[1]) != None):
            a.append('/'.join(t))
        else:
            continue
    return a
Example #31
    def _pos_tagger(self, input, type='mecab'):

        if (type == 'mecab'):
            osx_path = '/usr/local/lib/mecab/dic/mecab-ko-dic'
            tumbleweed_path = '/usr/local/lib64/mecab/dic/mecab-ko-dic'
            mecab = Mecab(osx_path)
            return mecab.pos(str(input))
        elif (type == 'twitter'):
            twitter = Twitter()
            return twitter.pos(str(input))
Example #32
def getNVM(text: str):
    tokenizer = Mecab()
    parsed = tokenizer.pos(text)
    pos = []
    tags = ['NNG', 'NNP']
    for word in parsed:
        tag = word[1]
        if tag in tags:
            pos.append(word[0])
    return pos
Example #33
def main():
	mecab = Mecab()
	if len(sys.argv) < 2:
		result = {'result':'none'}
		print(json.dumps(result))
		sys.exit(0)

	morphem_list = mecab.pos(sys.argv[1].decode('utf-8'))
	result_dict = {}
	result_dict['result'] = [x[0].encode('utf-8') for x in morphem_list]
	print(json.dumps(result_dict))
Example #34
    def _mecab_parse(self, str_arr, tag_combine=True):
        """

        :param str_arr:
        :return:
        """
        mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
        return_arr = []
        for data in str_arr:
            return_arr = return_arr + self._flat(mecab.pos(str(data)), tag_combine=tag_combine)
        return return_arr
    def _pos_raw_data(self, lt):
        """

        :param lt: list type value
        :return:
        """
        mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
        return_arr= []
        for raw in lt :
            pos = mecab.pos(raw)
            for word, tag in pos:
                return_arr.append("{0}/{1}".format(word, tag))
        return return_arr
Example #36
    def _pos_tag_predict_data(self, x_input, word_len):
        """

        :param x_input:
        :return:
        """
        word_list = []
        mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
        for word_tuple in self._pad_predict_input(mecab.pos(x_input), word_len):
            if (len(word_tuple[1]) > 0):
                word = ''.join([word_tuple[0], "/", word_tuple[1]])
            else:
                word = word_tuple[0]
            word_list.append(word)
        return word_list
Example #37
	def parse(self, data_path = "data"):
		file_list = glob.glob("%s/*.json" % data_path)
		json_list=[]

		shuffle(file_list)
		for json_file_name in file_list:
			json_file = json.loads(open(json_file_name).read())
			json_list += json_file["articles"]

		mecab = Mecab()

		dataframe = []

		for article in json_list:
			text = bs(article["text"], "html.parser").text
			title_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["title"])]
			author_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["author"])]
			text_pos = ["%s_%s" % (first, second) for first, second in mecab.pos(text)]

			dataframe.append({
				"title_pos": title_pos,
				"title_pos_sentences" : " ".join(title_pos),
				"author_pos": author_pos,
				"author_pos_sentences" : " ".join(author_pos),
				"text":article["text"],
				"text_pos": text_pos,
				"text_pos_sentences" : " ".join(text_pos),
				"forumid": article["forumid"],                    
				"istroll": article["is_troll"],
				"pk": article["pk"]
			})

		dataframe = pd.DataFrame.from_dict(dataframe)
		dataframe = dataframe.set_index("pk")

		return dataframe
Example #38
def learning(request, pk):
    the_post = get_object_or_404(Post, pk=pk)
    mecab = Mecab()
    morph = mecab.pos(the_post.content)


    if request.method=="GET":
        pass
    elif request.method=="POST" and the_post.sentiword_set.exists()==False:
        for m in range(len(morph)):
            the_word = Sentiword()
            the_word.word = str(morph[m])
            the_word.post = the_post
            the_post.senti = request.POST.get('senti')
            the_post.save()
            the_word.save()
        return redirect('view_post', pk=pk)
    else:
        return redirect('view_post', pk=pk)

    return render(request, 'learning.html',{
        'post':the_post,
    })
Example #39
class SearchCluster:
    def __init__(self, app):
        self.app = app
        self.mecab = Mecab()
        self.load_models()

    def load_models(self):
        self.word2vec = gensim.models.Word2Vec.load_word2vec_format(WORD2VEC_MODEL, binary=True)
        self.cluster_pipe = joblib.load(PIPE_DUMPING)

    def __task_to_vector(self, task):
        words = [key for key, pos in self.mecab.pos(task)]
        # aggregation word vectors
        vector = np.mean(np.array([self.word2vec[word] for word in words if word in self.word2vec]), axis=0)
        return vector

    def __predict_label(self, task):
        vector = self.__task_to_vector(task)
        return self.cluster_pipe.predict(vector)[0]

    def get_articles(self, user_id, task, topn=3):
        label = self.__predict_label(task)
        article_id_list = list(self.app.query_pool2.get_same_cluster_articles(user_id, label, topn))
        return list(self.app.query_pool2.get_article_list_by_id(article_id_list))
Example #40
def preprocess(args):
	"""
	Description

	Return
	- word2idx: Sequence of word index. It is 2-dim like [# of laws, # of words in each law].
	- word_dict: Word to index mapping table. { word: idx } (Only contain VOCA_SIZE words)
	- word_inv_dict: Inverted version of word_dict. { idx: word } (Only contain VOCA_SIZE words)
	- word_count: Word counter of each laws. Only contain VOCA_SIZE words.
	"""
	tagger = Mecab()
	
	with open(args.input, "r") as reader:
		data = reader.read()

	# Sequence of words in each law. [num_laws, num_words]
	word_list     = list()
	# Sequence of idx. [num_laws, num_words]
	word2idx      = list()
	# Mapping table of word - idx.
	word_dict     = dict()
	# Inversed mapping table of word - idx (for fast access).
	word_inv_dict = dict()
	# Word counter.
	word_count    = list()

	""" Tag part-of-speech and remove unimportant words (like josa..). """
	# Split each laws by <END> symbol.
	law_list = data.split("<END>")
	for law in law_list:
		# Eliminate special chars
		law = re.sub("[^a-zA-Z0-9가-힣 \n]", " ", law)
		# 1. Eliminate newline, tab and strange char.
		# 2. Split words by space.
		word_list.append(law.replace("\n", " ").replace("\t", " ").replace("\xa0" ,"").split(" "))

	for i, v in enumerate(word_list):
		for j, word in enumerate(v):
			# Tag laws using Mecab tagger. and exclude some tags.
			tag = tagger.pos(word)
			excluded = [ t[0] for t in tag if not re.search("NN|XR", t[1]) ]
		
			# Exclude the word if it contains a number (e.g. 제1조, 제1항 are dropped).
			for t in tag:
				if t[1] == "SN": word_list[i][j] = ""
			
			# Reconstruct word_list by using excluded tag list.
			for e in excluded:
				word_list[i][j] = word_list[i][j].replace(e, "")

		word_list[i] = [ w for w in word_list[i] if len(w) > 1 or w == "법" ]
	
	# If last element of word_list is empty, remove it.
	if not word_list[-1]:
		word_list.pop()
	
	# Construct word counter. 1st element in counter is UNKNOWN_WORD (simply UNK).
	word_count.append(["UNK", 0])
	merged = list(itertools.chain.from_iterable(word_list))
	word_count.extend(collections.Counter(merged).most_common(args.voca_size-1))

	# Construct word mapping table.
	word_dict = { v[0] : i for v, i in zip(word_count, itertools.count(0)) }
	word_inv_dict = { i : v for v, i in word_dict.items() }

	# Make sequence of word-idx.
	for v in word_list:
		row = list()
		for word in v:
			idx = word_dict.get(word)
			if idx != None: 
				row.append(idx)
			else: 			
				row.append(word_dict.get("UNK"))
				word_count[0][1] += 1
		word2idx.append(row)

	word_list = None # no longer used
	word_dict = None # no longer used
	word_count = None # no longer used
	return np.array(word2idx), word_inv_dict
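
A hedged sketch of driving preprocess (the argparse wiring is an assumption; the snippet only requires that args expose input and voca_size):

import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--input", default="laws.txt")          # text file with laws separated by <END>
parser.add_argument("--voca_size", type=int, default=10000)
args = parser.parse_args([])  # use the defaults for this sketch
word2idx, word_inv_dict = preprocess(args)  # note: only two values are returned, despite the docstring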
Example #41
def analyzing_morphem(content_list):
    mecab = Mecab()
    for idx, doc in enumerate(content_list):
        if idx % 5000 == 0 :
            print('Morphem Analysis on %d' % idx)
        yield ' '.join([part for part, pos in mecab.pos(doc.decode('utf-8'))]).encode('utf-8')
Example #42
class DataAugmentation :
    """
    Data Augmentation Class for nlp
    mainly for creating iob data from a pattern file and a dict
    test = DataAugmentation()
    test.load_dict()
    test.convert_data()
    """

    class ThreadCls(threading.Thread) :
        def __init__(self, obj, idx):
            threading.Thread.__init__(self)
            self.obj = obj
            self.idx = idx

        def run(self):
            for _ in range(self.obj.dict_sample_iter):
                self.obj.load_dict()
                self.obj.convert_data(self.idx)

        def join(self):
            threading.Thread.join(self)
            return True

    def __init__(self, conf):
        """
        init params; these params need to be managed in a DB
        """
        self.aug_file_cnt = 0
        self.use_mecab = conf.get("use_mecab")
        self.max_file_size = conf.get("max_file_size")  #10M
        self.pattern_data_path = conf.get("pattern_data_path")
        self.augmented_out_path = conf.get("augmented_out_path")
        self.dict_path = conf.get("dict_path")
        self.out_format_type = conf.get("out_format_type")
        self.ner_dicts = {}
        self.gpu_use = True
        self.dict_sample_size = int(conf.get("dict_sample_size"))
        self.dict_sample_iter = int(conf.get("dict_sample_iter"))
        self.thread_num = int(conf.get("thread_num"))

    def run(self):
        """
        run 
        :return: 
        """
        job_list = []
        for idx, _ in enumerate(range(self.thread_num)) :
            job_list.append(self.ThreadCls(self, idx))

        for job in job_list:
            job.start()

        for job in job_list:
            job.join()


    def load_dict(self):
        """
        load dict list from csv file
        :return:
        """
        self.ner_dicts = {}
        df_csv_read = pd.read_csv(self.dict_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')
        df_csv_read = df_csv_read.sample(n=self.dict_sample_size)
        for col in df_csv_read.keys() :
            self.ner_dicts[col] = []
            for val in list(set(df_csv_read[col])) :
                if (val == val and val != None) :
                    self.ner_dicts[col].append(val)

    def _check_all_match(self, words) :
        """
        check which of the given words match dict keys,
        in other words entity keys
        :param words: sentence str
        :return: list containing the matched keys
        """
        match_keys = []
        for word in words :
            word = word.replace('\n', '')
            if(word in list(self.ner_dicts.keys())) :
                match_keys.append(word)
        return match_keys

    #@autojit
    def _aug_sent(self, keys, pattern, return_aug_sent=[]) :
        """
        function which actually augment sentences
        with given pattern and keys
        :param keys: entity keys
        :param pattern: sentence pattern
        :return: list of augmented sentence
        """
        try :
            if (len(keys) > 0):
                key = keys[0]
                del keys[0]
            else :
                return return_aug_sent

            if (len(return_aug_sent) == 0):
                for word in self.ner_dicts[key] :
                    line = []
                    for slot in pattern:
                        for rep in ['\n', 'NaN'] :
                            slot = slot.replace(rep, '')
                        if(key in slot) :
                            for wd in self.mecab.morphs(word):
                                wd = wd.replace(' ', '')
                                line.append((wd, key))
                        else :
                            line.append((slot, 'O'))
                    return_aug_sent.append(line)
            else :
                del_idx = []
                for i, line in enumerate(return_aug_sent):
                    for j, slot in enumerate(line):
                        if (slot[0] == key):
                            for word in self.ner_dicts[key]:
                                line = return_aug_sent[i].copy()
                                for z, slot in enumerate(line):
                                    if(slot[0] == key) :
                                        buffer = ""
                                        for wd in self.mecab.morphs(word) :
                                            wd = wd.replace(' ', '')
                                            if(len(buffer) > 0 ) :
                                                buffer = ''.join([buffer,' ', wd])
                                            else :
                                                buffer = wd
                                        if (len(buffer) > 1 ):
                                            line[z] = (buffer, key)
                                return_aug_sent.append(line)
                            del_idx.append(i)

                for _ in del_idx:
                    del return_aug_sent[0]
            return self._aug_sent(keys, pattern, return_aug_sent)
        except Exception as e :
            print("error on nlp data augmentation :{0}".format(e))

    def _iob_formatter(self, aug_data, idx) :
        """
        save aug list as iob file format
        :param aug_data: augmented list of sentence
        :return: None
        """
        if aug_data == None :
            return
        path = ''.join([self.augmented_out_path, '/'+str(idx),'Test' , str(self.aug_file_cnt) , '.iob'])
        if(os.path.exists(path) == False or os.path.getsize(path) < self.max_file_size) :
            with open(path, "a")  as f :
                for line in aug_data :
                    for word in line :
                        related_words =  word[0].split(' ')
                        for tocken in related_words :
                            f.write(''.join([tocken, ' ', word[1]]))
                            f.write('\n')
                    f.write('\n')
        else :
            self.aug_file_cnt = self.aug_file_cnt + 1
            path = ''.join([self.augmented_out_path, '/'+str(idx),'Test', str(self.aug_file_cnt), '.iob'])
            with open(path, "w")  as f :
                for line in aug_data :
                    for word in line :
                        related_words =  word[0].split(' ')
                        for tocken in related_words :
                            f.write(''.join([tocken, ' ', word[1]]))
                            f.write('\n')
                    f.write('\n')

    def _plain_formatter(self, aug_data, idx) :
        """
        save aug list in plain text format
        :param aug_data: augmented list of sentence
        :return: None
        """
        if aug_data == None :
            return
        path = ''.join([self.augmented_out_path, '/'+str(idx),'Test', str(self.aug_file_cnt), '.out'])
        if (os.path.exists(path) == False or os.path.getsize(path) < self.max_file_size):
            with open(path, "a")  as f :
                for line in aug_data :
                    for word in line :
                        f.write(''.join([word[0], ' ']))
                    f.write('\n')
        else :
            self.aug_file_cnt = self.aug_file_cnt + 1
            path = ''.join([self.augmented_out_path, '/'+str(idx),'Test', str(self.aug_file_cnt), '.out'])
            with open(path, "w")  as f :
                for line in aug_data :
                    for word in line :
                        f.write(''.join([word[0], ' ']))
                    f.write('\n')

    def _intent_formatter(self, aug_data, key, idx) :
        """
        save aug list in intent csv format
        :param aug_data: augmented list of sentence
        :return: None
        """
        if aug_data == None :
            return
        path = ''.join([self.augmented_out_path, '/'+str(idx),'Test', str(self.aug_file_cnt), '.csv'])

        if (os.path.exists(path) == False) :
            with open(path, "w")  as f :
                f.write('encode,decode\n')

        if (os.path.exists(path) == False or os.path.getsize(path) < self.max_file_size):
            with open(path, "a")  as f :
                for line in aug_data :
                    for word in line :
                        f.write(''.join([word[0], ' ']))
                    f.write(',')
                    f.write(str(key))
                    f.write('\n')
        else :
            self.aug_file_cnt = self.aug_file_cnt + 1
            path = ''.join([self.augmented_out_path, '/'+str(idx),'Test', str(self.aug_file_cnt), '.csv'])
            with open(path, "a")  as f :
                for line in aug_data :
                    for word in line :
                        f.write(''.join([word[0], ' ']))
                    f.write(',')
                    f.write(str(key))
                    f.write('\n')

    def convert_data(self, idx) :
        """
        augment data with entity list and pattern
        :return: None
        """
        try :
            if (self.out_format_type == 'intent'):
                self._conv_type_b(idx)
            else :
                self._conv_type_a(idx)
        except Exception as e :
            print("error log : {0}".format(e))

    def _conv_type_b(self, idx):
        """
        
        :return: 
        """
        df_csv_read = pd.read_csv(self.pattern_data_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')

        i = 0
        for key, line in zip(df_csv_read['decode'].values, df_csv_read['encode'].values) :
            words = []
            if (self.use_mecab):
                self.mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
                pos = self.mecab.pos(line)
                for word, tag in pos:
                    words.append(word)
            else:
                words = str(line).split(' ')
            match_keys = self._check_all_match(words)
            aug_data = self._aug_sent(match_keys, words, [])
            self._intent_formatter(aug_data, key, idx)

            if(i%100 == 0) :
                print("====Therad{0} : {1} line job done".format(idx, i))
            i = i + 1

    def _conv_type_a(self, idx):
        """
        
        :return: 
        """
        df_csv_read = pd.read_csv(self.pattern_data_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')
        i = 0
        for line in df_csv_read['encode'].values:

            words = []
            if(self.use_mecab) :
                self.mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
                pos = self.mecab.pos(line)
                for word, tag in pos:
                    words.append(word)
            else :
                words = str(line).split(' ')

            match_keys = self._check_all_match(words)
            if(self.out_format_type == 'plain') :
                aug_data = self._aug_sent(match_keys, words, [])
                self._plain_formatter(aug_data,idx)
            elif(self.out_format_type == 'iob') :
                aug_data = self._aug_sent(match_keys, words, [])
                self._iob_formatter(aug_data,idx)
            else :
                raise Exception (' '.join(['not', 'plain', 'or iob']))
            if (i % 100 == 0):
                print("====Therad{0} : {1} line job done".format(idx, i))
            i = i + 1

# da = DataAugmentation({
#                      "use_mecab": True,
#                      "max_file_size": 100000000,
#                      "pattern_data_path": "/hoya_model_root/aug/pattern.csv",
#                      "augmented_out_path": "/hoya_model_root/aug/aug_0810/",
#                      "dict_path": "/hoya_model_root/aug/dict.csv",
#                      "out_format_type": "iob",
#                      "dict_sample_size" : 3,
#                      "dict_sample_iter" : 500,
#                      "thread_num" : 8
#                  })
# da.run()
Example #43
class crawl_community():
	def __init__( self ):
		self.driver = webdriver.Firefox()
		self.classifier = cf.classifier()
		self.URLs = []
		self.contexts = []

		self.bag = utils.load_dictionary()
		self.tagger = Mecab()

	
	def __del__( self ):
		self.driver.quit()
	
		
	def _crawl_URL( self ):
		titles = []

		# dynamic scrolling
		more_count = 0
		while True:
			time.sleep(0.5)
			more = self.driver.find_element_by_id("real_more_page")

			if more.is_displayed():
				if more.text == "더보기":
					more.click()
					more_count += 1
				else: 
					break
			else:
				self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
			if more_count >= self.scroll: break

		# get html source
		html = self.driver.page_source
		soup = BeautifulSoup(html)

		# crawl URL
		for c in soup.find_all("li"):
			# if items are from community 
			if c.get("class") == ['realtimeitem', 'community']:
				href = c.find("a")["href"]
				self.URLs.append(href)
				title = c.find("a").get_text().strip()
				titles.append(title)
			# if items are from twitter
			elif c.get("class") == ['realtimeitem', 'twitter']:
				for s in c.find_all("span"):
					if s.get("class") == ['text', 'snsbody']:
						href = s['href']
						self.URLs.append(href)
						titles.append("twitter")

		return titles


	def _exclude_short( self, text ):
		pos = self.tagger.pos(text)
		words = [ p[0] for p in pos ]

		is_in = False
		for b in self.bag[0]:
			if b[0] in words: is_in = True

		for b in self.bag[1]:
			if b[0] in words: is_in = True

		return not is_in


	def _crawl_dcinside( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		for c in soup.find_all("div"):
			if c.get("class") == ["s_write"]:
				text = c.find_all("td")[0].get_text()
				text = text.strip().replace("\n", " ")
				
				exclude = self._exclude_short(text)
				if not exclude: self.contexts.append(["dcinside", title, text])

	"""
	def _crawl_mlbpark( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		for c in soup.find_all("td"):
			if c.get("class") == ["G13"] and c.find_all("div"):
				div = c.find_all("div")[0]
				text = div.get_text()
				text = text.strip().replace("\n", " ")
				
				exclude = self._exclude_short(text)
				if not exclude: self.contexts.append(["mlbpark", title, text])
				break
	"""


	def _crawl_twitter( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)
	
		for c in soup.find_all("p"):
			tag = c.get("class")
			if tag and "tweet-text" in tag:
				text = c.get_text().strip().replace("\n", " ")

				exclude = self._exclude_short(text)
				if not exclude : self.contexts.append(["twitter", title, text])


	def _crawl_todayhumor( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		for c in soup.find_all("div"):
			if c.get("class") == ["viewContent"]:
				text = c.get_text().strip().replace("\n", " ")

				exclude = self._exclude_short(text)
				if not exclude: self.contexts.append(["todayhumor", title, text])


	"""
	def _crawl_clien( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		c = soup.find(id="writeContents")
		if c: 
			text = c.get_text().strip().replace("\n", " ")
			if self._exclude_short: self.contexts.append(["clien", title, text])


	def _crawl_bobaedream( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		for c in soup.find_all("div"):
			if c.get("class") == ["bodyCont"]:
				text = c.get_text().strip().replace("\n", " ")
				if self._exclude_short: self.contexts.append(["bobaedream", title, text])
	"""

	def _crawl_fomos( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		for c in soup.find_all("div"):
			if c.get("class") == ["view_text"]:
				text = c.get_text().strip().replace("\n", " ")

				exclude = self._exclude_short(text)
				if not exclude: self.contexts.append(["fomos", title, text])
				break


	def _crawl_inven( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		for c in soup.find_all("div"):
			if c.get("class") == ["powerbbsContent"]:
				text = c.get_text().strip().replace("\n", " ")

				exclude = self._exclude_short(text)
				if not exclude: self.contexts.append(["inven", title, text])


	def _crawl_instiz( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		c = soup.find(id="memo_content_1")
		if c:
			text = c.get_text().strip().replace("\n", " ")

			exclude = self._exclude_short(text)
			if not exclude: self.contexts.append(["instiz", title, text])


	def _crawl_ppomppu( self, url, title ):
		ret = requests.get(url)
		soup = BeautifulSoup(ret.text)

		for c in soup.find_all("td"):
			if c.get("class") == ["han"]:
				text = c.get_text().strip().replace("\n", " ")

				exclude = self._exclude_short(text)
				if not exclude: self.contexts.append(["ppomppu", title, text])


	# determine which URL comes from
	def _crawl_context( self, titles ):
		for i, url in enumerate(self.URLs):
			if   "dcinside"   in url: self._crawl_dcinside(url, titles[i])
			#elif "mlbpark"    in url: self._crawl_mlbpark(url, titles[i])
			elif "todayhumor" in url: self._crawl_todayhumor(url, titles[i])
			#elif "clien"      in url: self._crawl_clien(url, titles[i])
			elif "twitter"    in url: self._crawl_twitter(url, titles[i])
			#elif "bobaedream" in url: self._crawl_bobaedream(url, titles[i])
			elif "fomos"	  in url: self._crawl_fomos(url, titles[i])
			elif "inven"	  in url: self._crawl_inven(url, titles[i])
			elif "instiz"	  in url: self._crawl_instiz(url, titles[i])
			elif "ppomppu"	  in url: self._crawl_ppomppu(url, titles[i])
			else: print(url)

		# classify sentiment
		for i, v in enumerate(self.contexts):
			vector = self.classifier.features(v[1]+v[2])
			predict = self.classifier.predict(vector).tolist()[0]
			self.contexts[i].insert(0, predict)


	def crawl( self, query, scroll = 5 ):
		self.scroll = scroll
		self.query = query
		self.url = "http://search.zum.com/search.zum?method=realtime&option=accu&query="+query+"&cm=more"
		self.driver.get(self.url)

		titles = self._crawl_URL()
		self._crawl_context(titles)	

		return self.contexts	
Example #44
class classifier():
	# include POS, MAG, VX to handle negation
	POS = "NN|XR|VA|VV|MAG|VX"

	POS_IDX = ["NN", "VA", "VV", "XR"]
	# "못"은 따로 처리
	NEG_PREV = [("아니하", "VX"), ("않", "VX"), ("없", "VA"), ("없이", "MAG")]
	NEG_NEXT = [("안", "MAG")]


	def __init__(self):
		# initialize Mecab tagger
		self.tagger = Mecab()
	
		# initialize regular expression
		self.exp = re.compile(self.POS, re.IGNORECASE)
		
		# load sentiment dictionary
		self.bag = utils.load_dictionary()
	
		# load model if exist
		with open("../Resources/models/model", "rb") as model_file:
			self.model = pickle.load(model_file)


	def handle_negation(self, words, counter):	
		# construct index to negate word except "못"
		neg_idx = []
		for neg in self.NEG_PREV:
			find = utils.find_dup_idx(words, neg)
			for item in find:
				if item-1 > -1: neg_idx.append(item-1)
		for neg in self.NEG_NEXT:
			find = utils.find_dup_idx(words, neg)
			for item in find:
				if item+1 < len(words): neg_idx.append(item+1)
	
		# handle "못~"
		for w in words:
			loc = w[0].find("못")
			if loc > 0 and w[1].find("VX"): neg_idx.append(loc-1)
		# handle "못"
		for w in words:
			loc = w[0].find("못")
			if loc > -1 and w[1].find("MAG"):
				# long-form negation (e.g. 못했다, 못 했다 ...)
				if loc > 1 and words[loc-1][1].find("VV"): neg_idx.append(loc-1)
				# short-form negation
				elif loc < len(words)-1: neg_idx.append(loc+1)
				# limitation: cases like 못 생겼다 still come out oddly
	
		# negate word
		for i in neg_idx:
			if words[i] in self.bag[0]:
				try: idx = self.POS_IDX.index(words[i][1])
				except ValueError: pass
				else:	
					counter[idx]   -= 1
					counter[idx+4] += 1
			elif words[i] in self.bag[1]:
				try: idx = self.POS_IDX.index(words[i][1])
				except ValueError: pass
				else:
					counter[idx]   += 1
					counter[idx+4] -= 1
	
		return counter	
	
	def make_features(self, sentence, words):	
		# feature vector:
		# [ pos_noun, pos_adj, pos_verb, pos_root,
		#   neg_noun, neg_adj, neg_verb, neg_root ]
		counter = [0, 0, 0, 0, 0, 0, 0, 0]
	
		if not words: return counter
		
		for i, w in enumerate(words):
			# map the POS tag onto the sentiment dictionary's tag set
			words[i] = list(words[i])
			if   words[i][1].find("NN") >= 0: words[i][1] = "NN"
			elif words[i][1].find("VA") >= 0: words[i][1] = "VA"
			elif words[i][1].find("VV") >= 0: words[i][1] = "VV"
			elif words[i][1].find("XR") >= 0: words[i][1] = "XR"
			elif words[i][1].find("VX") >= 0: words[i][1] = "VX"
			elif words[i][1].find("MAG") >= 0: words[i][1] = "MAG"
			words[i] = tuple(words[i])
	
			# count frequency of sentiment words
			if words[i] in self.bag[0]: # positive
				try:
					idx = self.POS_IDX.index(words[i][1])
					counter[idx] += 1
				except ValueError: pass
			elif words[i] in self.bag[1]: # negative	
				try:
					idx = self.POS_IDX.index(words[i][1])
					counter[idx+4] += 1
				except ValueError: pass
	
		counter = self.handle_negation(words, counter)
		return counter
	
			
	def features(self, article):
		# POS-tag the article and keep only the relevant tags
		pos = self.tagger.pos(article)
		words = [ p for p in pos if self.exp.search(p[1]) ]
	
		# build the feature vector
		data = self.make_features(article, words)
	
		# normalize features
		arr = np.array(data, dtype=float)
		scaled = preprocessing.scale(arr).tolist()
		data = scaled

		return data


	def predict(self, vector):
		return self.model.predict(vector)
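A minimal usage sketch (hedged): it assumes the sentiment dictionary and the pickled model at ../Resources/models/model exist, and the sample sentence is purely illustrative.
# Hedged usage sketch for the classifier defined above.
clf = classifier()
vec = clf.features("이 게임 정말 재미있고 좋다")   # illustrative sentence
label = clf.predict(vec).tolist()[0]              # newer scikit-learn may require clf.predict([vec])
print(label)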
Beispiel #45
0
class keyword_anaylze():
	def __init__( self, date, news_limit = 5, net_limit = 50 ):
		self.section = util.load_file("section.txt")
		self.date = date
		self.news_limit = news_limit
		self.net_limit = net_limit
		self.refer = 0

		self.mecab = Mecab()
		self.exp = re.compile("NN|XR|VA|VV|MAG|VX")
		
		self.temp_net = {}
		self.temp_list = {}
		self.word_net = []	   # related words and their frequencies
		self.word_list = []	   # all words and their frequencies (used for PMI)
		self.news = []		   # longest articles, up to news_limit
		self.sentiment = [0, 0] # [neg, pos]
		self.counter = [ 0 for i in range(16) ]


	def _add_news( self, context, url, title ):
		# keep the self.news_limit longest articles, sorted by length
		if len(self.news) < self.news_limit:
			self.news.append([len(context), url, title])
			self.news.sort()
		elif len(context) > self.news[0][0]:
			self.news[0] = [len(context), url, title]
			self.news.sort()


	def _add_word( self, words, word_list, senti ):
		for w in words:
			if len(w) < 2: continue

			if w in word_list:
				word_list[w][0] += 1
				word_list[w][int(senti)+1] += 1
			else:
				word_list[w] = [1, 0, 0]
				word_list[w][int(senti)+1] += 1


	def _make_morp( self, context ):
		context = re.sub(r"(\"|\')", "", context)
		words = re.findall(r"[\w']+", context)
			
		for i, v in enumerate(words):
			pos = self.mecab.pos(v)
			w = [ p[0] for p in pos if not re.search("NN|XR|VA|VV|MAG|VX|SL|SN", p[1]) ]
			for x in w:
				words[i] = words[i].replace(x, "")

		# drop empty strings from words
		return [ w for w in words if not w == "" ]
	

	def _arrange_word_list( self, dictionary ):
		words = sorted(dictionary.items(), key=itemgetter(1), reverse=True)
		word_list = []
		for w in words:
			pos = self.mecab.pos(w[0])
			if re.search("NN|XR", pos[0][1]):
				word_list.append(w)

		return word_list


	def _traverse_news( self, keyword ):
		global news_loc

		keyword_list = keyword.split(" ")
		for s in self.section:
			idx = 0
			loc = news_loc+self.date+"/"+s

			print(loc+"/")
			while os.path.isfile(loc+"/"+str(idx)):
				f = open(loc+"/"+str(idx), "r")
				senti   = f.readline().replace("\n", "")
				url     = f.readline().replace("\n", "")
				title   = f.readline().replace("\n", "")
				context = f.read().replace("\n", "")
				words   = self._make_morp(context)
				f.close()

				self._add_word(words, self.temp_list, senti)
			
				is_key = True
				for key in keyword_list:
					have_word = False
					for w in words:
						if key in w:
							have_word = True
					if not have_word: is_key = False
				
				if is_key:
					self.counter[0+int(senti)] += 1
					self.refer += 1
					self.sentiment[int(senti)] += 1
					self._add_news(context, url, title)
					self._add_word(words, self.temp_net, senti)

				idx += 1
			

	def _traverse_community( self, keyword ):
		global community_loc
		
		base_loc = community_loc+keyword+"/"
		idx = 0

		print(base_loc)
		while True:
			loc = base_loc+str(idx)
			idx += 1
			if not os.path.isfile(loc): break

			f = open(loc, "r")
			senti   = f.readline().replace("\n", "")
			comm    = f.readline().replace("\n", "")
			title   = f.readline().replace("\n", "")
			context = f.read().replace("\n", "") 
			words   = self._make_morp(context)
			f.close()

			self.sentiment[int(senti)] += 1
			self._add_word(words, self.temp_list, senti)
			self._add_word(words, self.temp_net, senti)

			# determine community
			if 	 comm == "dcinside":   self.counter[2+int(senti)] += 1
			elif comm == "todayhumor": self.counter[4+int(senti)] += 1
			elif comm == "twitter":    self.counter[6+int(senti)] += 1
			elif comm == "fomos": 	   self.counter[8+int(senti)] += 1
			elif comm == "inven":      self.counter[10+int(senti)] += 1
			elif comm == "instiz":     self.counter[12+int(senti)] += 1
			elif comm == "ppomppu":    self.counter[14+int(senti)] += 1


	def _make_word_net( self ):
		network = []

		words = []
		count = []
		for v in self.word_net:
			words.append(v[0])
			count.append(v[1][0])

		for i, v in enumerate(self.word_list):
			for j, w in enumerate(words):
				if v[0] == w and v[1][0] > 10:
					senti = v[1][2] / v[1][0]
					pmi   = count[j] / v[1][0]
					network.append([w, senti, v[1][0], pmi])

		return network

			
	def anaylze( self, keyword ):
		self._traverse_news(keyword)
		self._traverse_community(keyword)

		# sort word_net
		self.word_net = self._arrange_word_list(self.temp_net)

		if len(self.word_net) > self.net_limit:
			self.word_net = [ self.word_net[i] for i in range(self.net_limit) ]

		# sort word_list
		self.word_list = self._arrange_word_list(self.temp_list)

		# network = [ [word, senti, frequency, PMI] .. ] 
		network = self._make_word_net()

		return self.sentiment, self.news, network, self.counter
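A hedged usage sketch: the date string and keyword are illustrative, and the crawled news/community files, section.txt, and the news_loc/community_loc globals must already exist as this module expects.
# Hedged usage sketch for keyword_anaylze; inputs are illustrative.
analyzer = keyword_anaylze("20180201", news_limit=5, net_limit=50)
sentiment, news, network, counter = analyzer.anaylze("평창 올림픽")
print(sentiment)       # [negative, positive] document counts
print(network[:3])     # entries of the form [word, senti, frequency, PMI]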
Beispiel #46
0
def language_processing(input_data):
    mecab = Mecab()

    # store a yes/no (yn) value for each noun
    # e.g. if wings are present, check_data['날개'] == 1
    check_data = dict()
    for name in [input_neuron.name for input_neuron in InputLayer.all_neuron]:
        # start with every entry of check_data set to "unknown" (0)
        check_data[name] = 0

    # zip(*...) unpacks the list of (word, pos) pairs into two parallel tuples
    word_list, pos_list = zip(*[(word, pos)
                                for word, pos in mecab.pos(input_data)
                                if pos in ['VV', 'VA', 'NNG', 'JC', 'SC', 'MAG', 'VX']])

    # convert to a mutable list (zip yields tuples) so that words that have
    # already been handled can later be replaced with False
    word_list = list(word_list)

    # same reason
    pos_list = list(pos_list)

    # rewrite adjectives that are preceded by a negating adverb,
    # e.g. 날개가 안 보인다 (wings are not visible) --> 날개가 없다 (has no wings)

    yn_dict = {
        '있': 1,
        '들리': 1,
        '보이': 1,
        '없': -1,
        '모르': 0
    }

    """
    for index in range(len(pos_list)):
        if pos_list[index] == 'MAG' and word_list[index] == '안':  # 성분 부사 이면서 부정 부사 일 경우
            word_list[index] = '없'  # 부정으로 치환


        for i in range(len(pos_list[index:])):  # 부정 부사 뒷 부분 탐색
            if pos_list[i] in ['VV', 'VA']:  # '있', '없' 등의 데이터가 나올 경우
                try:
                    word_list[i] = yn_change[word_list[i]]  # yn_change 를 이용해 반전시킨다
                except KeyError:
                    word_list
                    pass
    """

    # approach: find the descriptive words first, then group the surrounding nouns with them

    # pick out modifiers such as 있/없 from the POS data
    for index in range(len(pos_list)):
        if pos_list[index] == 'MAG' and word_list[index] == '안':  # a constituent adverb used as a negator
            word_list[index] = '없'  # replace it with the negative form
            pos_list[index] = 'VA'  # update the POS tag to match

        if pos_list[index] in ['VA', 'VV']:  # if pos is yn data

            # feed the InputLayer neuron according to what is stated about the noun
            try:
                yn = yn_dict[word_list[index]]
            except KeyError:
                yn = 0
            finally:
                # if a negating auxiliary verb follows,
                # e.g. ~하지 '않'는다

                # scan from the next index onward
                tmp_index = index + 1
                while tmp_index < len(pos_list):
                    if pos_list[tmp_index] == 'VX':
                        if word_list[tmp_index] == '않':
                            yn *= -1
                            break
                    elif pos_list[tmp_index] == 'NNG':
                        break  # stop when the next noun appears
                    tmp_index += 1

            # assign the yn value above to every noun seen so far
            for nng in [word_list[i] for i in range(index) if pos_list[i] == 'NNG']:
                # skip words that were already handled (replaced with False)
                if nng is False:
                    continue
                else:
                    try:
                        check_data[nng]
                    except KeyError:
                        pass
                    else:
                        check_data[nng] = yn

            # mark the handled words as False
            word_list[:index] = ([False] * index)

    return check_data
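A hedged usage sketch: it assumes InputLayer.all_neuron already contains neurons named after features such as '날개' and '부리', and the sentence is illustrative.
# Hedged usage sketch for language_processing.
answers = language_processing("날개가 있고 부리가 없다")
print(answers)   # e.g. {'날개': 1, '부리': -1, ...} for the sentence above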
Beispiel #47
0
from bs4 import BeautifulSoup
from gensim.models import Word2Vec   # gensim < 4.0 API (size=...)
from konlpy.tag import Mecab

learning_rate = 0.001
dim_embed = 200
n_epochs = 20
window_size = 5
min_count = 3

mecab = Mecab()

wiki_file = '../text/wiki_all'
with open(wiki_file) as f:
    wiki_contents = f.read()
    wiki_docs = [[line for line in doc.text.split('\n') if line != '']
                 for doc in BeautifulSoup(wiki_contents, 'html.parser').find_all('doc')]
    wiki_paragraphs = [item for sublist in wiki_docs for item in sublist]

paragraph_list = []
for wiki_paragraph in wiki_paragraphs:
    # represent each token as "word^/POS"
    wiki_paragraph_pos = [w + '^/' + p for w, p in mecab.pos(wiki_paragraph)]
    if len(wiki_paragraph_pos) > 2:
        paragraph_list.append(wiki_paragraph_pos)

del wiki_paragraphs
word2vec_model = Word2Vec(size=dim_embed, alpha=learning_rate, window=window_size,
                          min_count=min_count, workers=4)   # workers must be a positive thread count
word2vec_model.build_vocab(paragraph_list)

for epoch in range(n_epochs):
    print("Training Epoch:", epoch)
    word2vec_model.train(paragraph_list, total_examples=word2vec_model.corpus_count, epochs=1)
    word2vec_model.alpha *= 0.99

word2vec_model.save('../../models/word2vec')
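A hedged follow-up sketch: it reloads the saved model and queries neighbours of a "word^/POS" token, matching the token format built above; the query token is illustrative and must be in the vocabulary (min_count=3), and a gensim version before 4.0 is assumed.
# Hedged sketch: reload the model and look up nearest neighbours.
from gensim.models import Word2Vec

model = Word2Vec.load('../../models/word2vec')
print(model.wv.most_similar('한국^/NNP', topn=5))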