Example #1
 def splitEojeol(self, eojeol):
     out = []
     m = Mecab()
     # m = Mecab(dicpath='C:/mecab/mecab-ko-dic') # (not usable, disabled)
     # m = Mecab(dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic') # (not usable, disabled)
     if eojeol[-1] == '.': eojeol = eojeol[:-1]
     for i in range(len(eojeol)):
         for j in range(len(eojeol) - i):
             k = eojeol[i:j + i + 1]
             # if k is two or more characters and either has multiple morpheme tags or is not a noun, register it as a UM candidate
             if len(k) > 1 and (len(m.pos(k)) > 1
                                or m.pos(k)[0][1][0:2] != 'NN'):
                 out.append([eojeol[0:i], k, eojeol[i + j + 1:]])
     return out
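# A minimal illustration (not from the original) of the [LP, UM, RP] slicing that
# splitEojeol enumerates, with the Mecab-based filter left out: every substring of
# the eojeol becomes a UM candidate flanked by its left and right remainders.
eojeol = "abcd"
triples = [[eojeol[:i], eojeol[i:i + j + 1], eojeol[i + j + 1:]]
           for i in range(len(eojeol)) for j in range(len(eojeol) - i)]
# e.g. ['a', 'bc', 'd'] and ['ab', 'cd', ''] are both in triples.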
Example #2
def main():
    # Load Mecab
    mecab = Mecab()

    # Load the corpus:
    with open('corpus/Control_Engine_Start_Temp.txt', 'r',
              encoding='utf-8') as f:
        raws = f.read().split('\n')

    first_sentence = pos2word(mecab.pos(raws[0]))
    print('Mecab Test: ', first_sentence)

    # Get the bigrams
    print('bigrams: ', list(bigrams(first_sentence)))

    # Get the padded bigrams
    print('bigrams (padded): ',
          list(bigrams(first_sentence, pad_left=True, pad_right=True)))

    # Get the trigrams
    print('trigrams: ', list(trigrams(first_sentence)))

    # Get the padded trigrams
    print('trigrams (padded): ',
          list(trigrams(first_sentence, pad_left=True, pad_right=True)))

    model = defaultdict(lambda: defaultdict(lambda: 0))

    for raw in raws:
        sentence = pos2word(mecab.pos(raw))
        for w1, w2, w3 in trigrams(sentence, pad_left=True, pad_right=True):
            model[(w1, w2)][w3] += 1

    test_sentence = "내 차 공조 23도로 시동 켜줄래"
    test_tokens = pos2word(mecab.pos(test_sentence))
    print('test sentence: ', test_tokens)

    print(model[(test_tokens[0], test_tokens[1])][test_tokens[2]])
    print(model[(test_tokens[0], test_tokens[1])][test_tokens[4]])
    print(model[(None, None)][test_tokens[0]])

    for w1_w2 in model:
        total_count = float(sum(model[w1_w2].values()))
        for w3 in model[w1_w2]:
            model[w1_w2][w3] /= total_count

    print(model[(test_tokens[0], test_tokens[1])][test_tokens[2]])
    print(model[(test_tokens[0], test_tokens[1])][test_tokens[4]])
    print(model[(None, None)][test_tokens[0]])
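# A minimal sketch (not part of the original) of putting the normalized trigram
# model above to use: start from the (None, None) left-padding context and keep
# sampling the next token in proportion to its estimated probability until the
# right-padding marker (None) comes up.
import random


def generate(model, max_len=20):
    text = [None, None]
    while len(text) < max_len + 2:
        candidates = model.get(tuple(text[-2:]), {})
        if not candidates:
            break
        tokens, probs = zip(*candidates.items())
        next_token = random.choices(tokens, weights=probs)[0]
        if next_token is None:  # right padding marks the end of the sentence
            break
        text.append(next_token)
    return [t for t in text if t is not None]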
Example #3
def write_corpora(sentences, output_file_handle, tagger=None):
    target_tags = None

    if tagger is None:
        tagger = Mecab()

    if isinstance(tagger, konlpy.tag._okt.Okt):
        target_tags = ['Alpha', 'Noun', 'Adjective']
    elif isinstance(tagger, konlpy.tag._kkma.Kkma):
        target_tags = ['NN', 'NNG', 'NNB', 'NNM', 'NNP', 'NP', 'NR', 'OH', 'OL', 'ON', 'VA', 'VXA']
    elif isinstance(tagger, konlpy.tag._komoran.Komoran):
        target_tags = ['NNG', 'NNB', 'NNP', 'NP', 'NR', 'SH', 'SL', 'SN', 'VA']
    elif isinstance(tagger, eunjeon._mecab.Mecab):
        target_tags = ['VA', 'NNG', 'NNB', 'NNBC', 'NNP', 'NP', 'NR', 'SH', 'SL', 'SN']
    else:
        raise ValueError(f'invalid tagger {tagger.__class__}')
    
    for i, s in enumerate(sentences):
        try:
            pos_tagged = tagger.pos(s)               
        except ValueError:
            print(f'could not parse sentence {i}: {s}')
            continue

        tokenized = [t[0].strip() for t in pos_tagged if t[1] in target_tags]
        output_file_handle.write(' '.join(tokenized) + '\n')    
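# A hypothetical usage sketch (file name and sentence are assumptions, not from
# the original): the tagger instance passed in determines which tag whitelist is
# applied before the tokens are written out.
from konlpy.tag import Okt

with open('corpus_okt.txt', 'w', encoding='utf-8') as out:
    write_corpora(["오늘 서울 날씨가 맑다"], out, tagger=Okt())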
        
Example #4
def validate(msg):
    # print(len(msg.split()))
    # if len(msg.split()) == 1:
    #     return ['PASS', 'ONEWORD', ""]
    """
    Using eunjeon.Mecab, run the wordpop() function when the nouns and verbs
    that form the backbone of the input sentence outnumber the remaining
    morphemes.

    Having the bot answer only selected inputs keeps the chat from being flooded.
    """
    tagger = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
    Pos = tagger.pos(msg)
    numWord = 0  # number of meaningful morphemes (counted via the target tag list below)
    realword = []  # words that carry actual meaning
    target = [
        "NNG", "NNP", "NNB", "NNBC", "NP", "VV", "VA", "VX", "XSV", "XR",
        "MAG", "IC"
    ]  # POS tag chart : https://bit.ly/2KOA1ua
    for i in Pos:
        if i[1] in target:
            print(i[0])
            numWord += 1
            realword.append(i[0])  # keep only words that carry actual meaning
    if Pos[-1][1] == "SF":  # append sentence-final punctuation only once, at the end
        realword.append(Pos[-1][0])

    numPos = len(Pos)  # total number of morphemes
    print(Pos)
    print(numWord, numPos - numWord)
    if numWord >= (numPos - numWord):  # if meaningful elements make up at least half of the sentence (the ratio is adjustable)
        return wordpop(realword)
    else:
        return ["PASS", numWord, numPos]
Example #5
def each_sentence_division(script_string_array):  # analyze the script one sentence at a time
    global element_table
    mecab = Mecab()
    string_table = []
    count = 0

    for each in script_string_array:  # run morpheme analysis on one sentence (each) at a time from the pre-split array
        complete_sentence = []
        sentence = str(each)
        sentence = remove_marks(sentence)  # remove special characters
        sentence = add_space_after_mot(sentence)  # add a space after '못' (negation handling)
        mecab_result = mecab.pos(sentence)

        for i in range(len(mecab_result)):
            complete_sentence.append(mecab_result[i])

        string_table.append(complete_sentence)  # append the analyzed sentence to string_table
        # build element_table and modifier_table
        one_line_temp = make_element_table(complete_sentence,
                                           script_string_array[count])
        element_table = np.append(element_table,
                                  np.array([one_line_temp], dtype=list),
                                  axis=0)
        modifier_table.append(
            make_modifier_table(script_string_array[count], complete_sentence))
        count += 1
Example #6
def sentence_division(input_string):
    global element_table

    mecab = Mecab()

    input_string = add_space_after_mot(input_string)  # add a space after '못'

    string_table = []  # table holding one sentence per entry
    mecab_result = mecab.pos(
        input_string)  # ex) [('안녕', 'NNG'), ('하', 'XSV'), ('세요', 'EP+EF')]

    string_start = 0  # index of the first element of the current sentence
    cnt = 0  # counter for locating, by index, sentences whose morpheme analysis failed
    for i in range(len(mecab_result)):
        if is_sentence_End(mecab_result[i]):  # check whether this is the end of a sentence
            sentence = []
            for j in range(string_start, i + 1):  # store every element of the sentence, first to last
                # if is_MAG_except_neg(mecab_result[j]):  # skip general adverbs (MAG) other than '못' and '안'
                #     continue
                if is_mark(mecab_result[j]):  # do not store punctuation
                    continue
                sentence.append(mecab_result[j])  # add each element to the current sentence
            string_table.append(sentence)  # add the completed sentence to the table

            cnt += 1
            string_start = i + 1  # point to the first element of the next sentence
    return string_table
Example #7
 def isAfterNoun(self, rp):
     if rp == '': return True
     m = Mecab()
     # m = Mecab(dicpath='C:/mecab/mecab-ko-dic') # (not usable, disabled)
     # m = Mecab(dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic') # (not usable, disabled)
     if m.pos(rp)[0][1][0] in ['N', 'V', 'J', 'E', 'X']: return True
     return False
Example #8
def write_corpora(sentences,
                  output_file_handle,
                  min_token_count=5,
                  tagger=None,
                  isAllTag=False):
    if tagger is None:
        tagger = Mecab()
    target_tags = get_tags(tagger, isAllTag)

    for i, s in enumerate(sentences):
        try:
            pos_tagged = tagger.pos(s)
        except ValueError:
            print(f'could not parse sentence {i}: {s}')
            continue

        if len(target_tags) == 0:
            tokenized = [t[0].strip() for t in pos_tagged]
        else:
            tokenized = [
                t[0].strip() for t in pos_tagged if t[1] in target_tags
            ]

        if len(tokenized) < min_token_count:
            continue
        output_file_handle.write(' '.join(tokenized) + '\n')
Example #9
def get_corpora(sentences, tagger=None, isAllTag=False):
    if tagger is None:
        tagger = Mecab()

    # an empty list means every morpheme is targeted
    target_tags = get_tags(tagger, isAllTag)

    corporas = []
    for i, s in enumerate(sentences):
        try:
            pos_tagged = tagger.pos(s)
        except ValueError:
            print(f'could not parse sentence {i}: {s}')
            continue

        if len(target_tags) == 0:
            tokenized = [t[0].strip() for t in pos_tagged]
        else:
            tokenized = [
                t[0].strip() for t in pos_tagged if t[1] in target_tags
            ]

        corporas.append(' '.join(tokenized))

    return corporas
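# get_tags() is not shown in Examples #8 and #9; the following is a minimal sketch
# of what it presumably does, inferred from the whitelist logic in Example #3
# (an assumption, not the original helper): return an empty list when isAllTag is
# set, otherwise a tag whitelist chosen by tagger type.
import konlpy.tag
import eunjeon


def get_tags(tagger, isAllTag=False):
    if isAllTag:
        return []  # an empty list keeps every morpheme
    if isinstance(tagger, konlpy.tag._okt.Okt):
        return ['Alpha', 'Noun', 'Adjective']
    if isinstance(tagger, eunjeon._mecab.Mecab):
        return ['VA', 'NNG', 'NNB', 'NNBC', 'NNP', 'NP', 'NR', 'SH', 'SL', 'SN']
    raise ValueError(f'invalid tagger {tagger.__class__}')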
Example #10
def krword_tokenize(sent):
    result = []
    mecab = Mecab()
    sample = mecab.pos(sent)
    for x in sample:
        if 'NNG' in x or 'NNP' in x or 'NNB' in x or 'NP' in x:
            result.append(x)
    return result
Example #11
def onlyNouns(article):
    result = ""
    mecab = Mecab()
    sample = mecab.pos(article)
    for x in sample:
        if 'NNG' in x or 'NNP' in x or 'NNB' in x or 'NP' in x:
            result += (x[0]) + ' '
            # print(result)
    return result
Example #12
def concat_text_with_pos(sentence):
    tag = Mecab()
    pos = tag.pos(sentence)
    temp = []
    for p in pos:
        temp.append(p[0] + "/" + p[1])
    
    s = ' '.join(temp)
    return s
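# Usage sketch (the tags shown are indicative only; actual output depends on the
# mecab-ko-dic version installed):
# concat_text_with_pos("형태소 분석")  ->  e.g. "형태소/NNG 분석/NNG"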
Example #13
def hello():
    app = Flask(__name__)
    app.config['JSONIFY_PRETTYPRINT_REGULAR'] = True
    contents = "ndllocvcv"

    from konlpy.tag import Mecab
    tagger = Mecab()
    t = tagger.pos("고양이는 양옹뉴턴야옹")
    print("========================================")
    return json.dumps(t, ensure_ascii=False)
Example #14
def do_mecab(text):
    mecab = Mecab()

    me_list = mecab.pos(text)
    after = []
    for t in me_list:
        if t[1][0] == 'N' or t[1][0] == 'V':  # also add S and M tags
            after.append(t)
    print(after)
    return me_list
Example #15
def tokenize_mecab(docs_list):
    from eunjeon import Mecab
    m = Mecab(dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic')
    token_list = []
    for i in tqdm(range(len(docs_list)), desc='Tokenizing docs'):
        nouns = [
            t[0] for t in m.pos(docs_list[i])
            if (t[1] in ['NNG', 'NNP']) and (len(t[0]) > 1)
        ]
        token_list.append(nouns)

    return token_list
Example #16
def extract_nouns_v2(news: str) -> dict:
    """Extract nouns from news.

    :param news: contents of news.
    :return: dict(). Extracted keyword and its count. {keyword: count, }
    """
    mecab = Mecab()

    news_lines = kss.split_sentences(news)

    nouns = []

    for line in news_lines:
        nn = 0
        pos = 0

        for token in mecab.pos(line):
            pos = pos + line[pos:].find(token[0])

            if token[1] == 'NNG':
                # common noun (NNG)
                if nn > 0:
                    if line[pos - 1] == ' ':
                        nouns.append(
                            (f'{nouns[-1][0]} {token[0]}', nouns[-1][1]))
                        nouns.append(token[0])
                    else:
                        nouns[-1] = (f'{nouns[-1][0]}{token[0]}', nouns[-1][1])

                    nn += 1
                else:
                    nn = 1
                    nouns.append(token)

            elif token[1] == 'NNP':
                # proper noun (NNP)
                if nn > 0:
                    if line[pos - 1] == ' ':
                        nouns.append((f'{nouns[-1][0]} {token[0]}', 'NNP'))
                        nouns.append(token[0])
                    else:
                        nouns[-1] = (f'{nouns[-1][0]}{token[0]}', 'NNP')

                    nn += 2
                else:
                    nn = 2
                    nouns.append(token)
            else:
                nn = 0

            pos += len(token[0])

    return dict(Counter(nouns))
Example #17
 def isKnown(self, text):
     if len(text) == 0: return True
     m = Mecab()
     # m = Mecab(dicpath='C:/mecab/mecab-ko-dic') # (not usable, disabled)
     # m = Mecab(dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic') # (not usable, disabled)
     for i in m.morphs(text):
         if m.pos(i)[0][1] == 'UNKNOWN':  # or maybe include when first letter is 'S' too?
             # print(i)
             # it is not RP
             return False
     return True
Example #18
def train_vector_model(str_buf):

    mecab = Mecab()
    str_buf = train_data_list['encode']
    pos1 = mecab.pos(''.join(str_buf))
    pos2 = ' '.join(list(map(lambda x: '\n'
                             if x[1] in ['SF'] else x[0], pos1))).split('\n')
    morphs = list(map(lambda x: mecab.morphs(x), pos2))
    print(str_buf)
    model = word2vec.Word2Vec(size=vector_size, window=2, min_count=1)
    model.build_vocab(morphs)
    model.train(morphs, epochs=model.epochs, total_examples=model.corpus_count)
    return model
Example #19
 def isKRP(self, spacedrplist):
     # by default, takes self.splitWays(RP) as input
     m = Mecab()
     # m = Mecab(dicpath='C:/mecab/mecab-ko-dic') # (not usable, disabled)
     # m = Mecab(dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic') # (not usable, disabled)
     if len(spacedrplist) == 0: return True
     # check every combination
     for i in spacedrplist:
         # split one candidate combination into segments and check that every segment is a dictionary word
         for j in range(len(i.split())):
             # refer to the j-th segment as morph
             morph = i.split()[j]
             # if the j-th segment yields multiple morphemes, is not Hangul, is UNKNOWN, is an interjection, or carries multiple tags
             # if len(m.pos(morph))>1 or m.pos(morph)[0][1][0] == 'S' or m.pos(morph)[0][1] == "UNKNOWN" or m.pos(morph)[0][1] == "IC" or m.pos(morph)[0][1].count("+")>0:
             # if the j-th segment yields multiple morphemes, is not Hangul, or is UNKNOWN
             if (len(m.pos(morph)) > 1 or m.pos(morph)[0][1][0] == 'S'
                     or m.pos(morph)[0][1] == "UNKNOWN"):
                 # the j-th segment is not a known morpheme (KM)
                 break
             # if j is the last segment (every segment passed the checks above), accept the combination as a KRP
             if j == len(i.split()) - 1:
                 return True
     return False
Example #20
def train_vector_model(str_buf):
    mecab = Mecab()
    str_buf = train_data_list['encode']
    # POS tagging with mecab
    pos1 = mecab.pos(''.join(str_buf))
    # split into a per-sentence list: break at sentence-final punctuation (for long text)
    pos2 = ' '.join(list(map(lambda x: '\n'
                             if x[1] in ['SF'] else x[0], pos1))).split('\n')
    # split each sentence into morphemes to build the vocabulary
    morphs = list(map(lambda x: mecab.morphs(x), pos2))
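    # Note: `size=` and `model.iter` below are gensim < 4.0 parameter names;
    # gensim 4.x renamed them to `vector_size=` and `model.epochs` (cf. Example #18).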
    model = word2vec.Word2Vec(size=vector_size, window=2, min_count=1)
    model.build_vocab(morphs)
    model.train(morphs, total_examples=model.corpus_count, epochs=model.iter)
    return model
Example #21
def sentence_without_part(text):  # split into sentences, also returning a trailing part that does not end with a connective ending
    mecab = Mecab()
    sentences = []
    mecab_text = mecab.pos(text)
    for i in range(0, len(mecab_text)):
        sentence = text
        if is_sentence_End(mecab_text[i]):
            index = text.find(mecab_text[i][0])
            index += len(mecab_text[i][0])
            if (i < len(mecab_text) - 1):
                if is_mark(mecab_text[i + 1]):
                    index += 1
            sentence = text[:index]
            text = text[index:]
            sentences.append(sentence)

    if not (text.isspace()) and text != '':
        sentences.append(text)
    return sentences
Example #22
    def ex_stopword(self, bool, allow_type="nv"):

        if allow_type == "n":
            allow_vv = ["NNG", "NNP", "NNB", "NR", "NP"]
        elif allow_type == "nv":
            allow_vv = [
                "NNG", "NNP", "NNB", "NR", "NP", "VV", "VA", "VX", "VCP",
                "VCN", "MM", "MAG", "MAJ"
            ]

        tqdm.pandas()
        if bool is True:
            mecab = Mecab()
            self.data['stopped_text'] = self.data['text'].progress_apply(
                lambda x: " ".join([
                    word[0] for word in mecab.pos(str(x))
                    if word[1] in allow_vv
                ]))
        else:
            self.data['stopped_text'] = self.data['text']
Example #23
class Predict:
    def __init__(self, pred_config:Pred_config,keyword=None,contents_id=None):
        self.pred_config = pred_config
        self.engine = create_engine(("mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4").format('root','robot369',
                                                                                                '10.96.5.179',3306,'datacast2'))
        self.args = self.pred_config.get_args()

        ## choose the device to use (CUDA or CPU)
        self.device = self.pred_config.get_device()

        ## batch size (how many inputs the model processes at once)
        self.batch_size = self.pred_config.batch_size

        ## load the model
        self.model = self.pred_config.load_model(self.args, self.device)

        ## load the tokenizer
        self.tokenizer = self.pred_config.load_tokenizer()
        self.nlp = Mecab()
        self.keyword = keyword
        self.contents_id = contents_id
        self.db = Sql("datacast2")
    def verbs(self,phrase):
        """Verbs extractor."""
        verbs = ['VV']
        tagged = self.nlp.pos(phrase)
        return [s for s, t in tagged if t in verbs]

    def adjs(self,phrase):

        """Adjs extractor."""
        adjs = ['VA','IC']
        tagged = self.nlp.pos(phrase)
        return [s for s, t in tagged if t in adjs]

    def read(self):
        # conn = pymysql.connect(host='1.221.75.76', user='******', password='******', database='datacast')
        # curs = conn.cursor(pymysql.cursors.DictCursor)
        # sql_select_sentence = 'select * from analysis_sentence'
        # curs.execute(sql_select_sentence)
        # rows = curs.fetchall()
        ## read the sentence table into a pandas DataFrame


        print('sql:',"SELECT ct.channel,cc.contents_id,cs.text from crawl_task as ct join crawl_contents as cc on ct.task_id=cc.task_id JOIN crawl_sentence AS cs ON cs.contents_id = cc.contents_id "
            "WHERE cc.contents_id=\'%s\' and ct.keyword=\'%s\'" % (self.contents_id ,self.keyword))
        # df_sentence_rows = pd.read_sql("SELECT ct.task_id,ct.channel,cc.contents_id,cc.text,cc.url from crawl_task as ct join crawl_contents as cc on ct.task_id=cc.task_id WHERE ct.keyword=\'%s\' limit %d,%d;"%(self.keyword,start_num,chunk_size),self.engine)
        df_sentence_rows = pd.read_sql(
            "SELECT ct.keyword,ct.channel,cc.contents_id as contents_id,cs.sentence_id as sentence_id, cs.text as sentence from crawl_task as ct join crawl_contents as cc on ct.task_id=cc.task_id JOIN crawl_sentence AS cs ON cs.contents_id = cc.contents_id "
            "WHERE cc.contents_id=\'%s\' and ct.keyword=\'%s\'" % (
            self.contents_id,self.keyword),
            self.engine)

        return df_sentence_rows

    def convert_input_sentence_to_tensor_dataset(self,df_sentence_rows,cls_token_segment_id=0,
                                             pad_token_segment_id=0,
                                             sequence_a_segment_id=0,
                                             mask_padding_with_zero=True):
        tokenizer = self.tokenizer
        args = self.args


        cls_token = tokenizer.cls_token
        sep_token = tokenizer.sep_token
        pad_token_id = tokenizer.pad_token_id

        all_input_ids = []
        all_attention_mask = []
        all_token_type_ids = []

        ### read the input rows
        ### and convert them into a TensorDataset
        for index in df_sentence_rows.index:
            sentence = df_sentence_rows.at[index, 'sentence']

            tokens = tokenizer.tokenize(sentence)

            # Account for [CLS] and [SEP]
            special_tokens_count = 2
            # truncate sentences longer than the maximum sequence length
            if len(tokens) > args.max_seq_len - special_tokens_count:
                tokens = tokens[:(args.max_seq_len - special_tokens_count)]

            # Add [SEP] token
            tokens += [sep_token]
            token_type_ids = [sequence_a_segment_id] *len(tokens)

            # Add [CLS] token
            tokens = [cls_token] + tokens
            token_type_ids = [cls_token_segment_id] + token_type_ids
            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask is 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
            # Zero-pad up to the sequence length.
            padding_length = args.max_seq_len - len(input_ids)
            input_ids = input_ids+([pad_token_id] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

            all_input_ids.append(input_ids)
            all_attention_mask.append(attention_mask)
            all_token_type_ids.append(token_type_ids)

        # Change to Tensor
        all_input_ids = torch.tensor(all_input_ids, dtype=torch.long)
        all_attention_mask = torch.tensor(all_attention_mask, dtype=torch.long)
        all_token_type_ids = torch.tensor(all_token_type_ids, dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids)
        return dataset

    def predict(self):
        ## parameters saved during fine-tuning (training_args.bin)
        args = self.args

        ## choose the device to use (CUDA or CPU)
        device = self.device

        ## batch size (how many inputs the model processes at once)
        batch_size = self.batch_size

        ## load the model
        model = self.model
        logger.info(args)

        ## fetch the data to run sentiment analysis on
        df_sentence_data_rows = self.read()

        dataset = self.convert_input_sentence_to_tensor_dataset(df_sentence_data_rows)

        # run the model over the dataset to produce outputs
        # Predict
        sampler = SequentialSampler(dataset)
        data_loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
        preds = None
        probs = None
        print(type(data_loader),len(data_loader))
        for index,batch in enumerate(tqdm(data_loader, desc="Prediction")):
            batch = tuple(t.to(device) for t in batch)
            with torch.no_grad():
                inputs = {"input_ids": batch[0],
                          "attention_mask": batch[1],
                          "labels": None}
                if args.model_type != "distilkobert":
                    inputs["token_type_ids"] = batch[2]
                outputs = model(**inputs)
                logits = outputs[0]

                if preds is None:
                    preds = logits.detach().cpu().numpy()
                    probs = np.exp(logits.detach().cpu().numpy())/ (1 + np.exp(logits.detach().cpu().numpy()))
                else:
                    preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                    probs = np.append(probs, np.exp(logits.detach().cpu().numpy())/ (1 + np.exp(logits.detach().cpu().numpy())), axis=0)
        preds = np.argmax(preds, axis=1).tolist()
        prob_max_index = np.argmax(probs, axis=-1)
        maximum_probs = probs[np.arange(probs.shape[0]), prob_max_index]
        # maximum_probs = maximum_probs.tolist()
        # maximum_probs = list([round(maximum_prob,2) if pred==1 else round(maximum_prob,2)*(-1) for pred,maximum_prob in zip(preds,maximum_probs)])
        df_sentence_data_rows['positiveness'] = preds

        #update
        for idx in tqdm(df_sentence_data_rows.index, desc="sentence_analysis&db_update"):
            try:
                sentence_id = df_sentence_data_rows.at[idx,'sentence_id']
                sentence = df_sentence_data_rows.at[idx,'sentence']
                positiveness = df_sentence_data_rows.at[idx,'positiveness']
                nouns = list(set(self.nlp.nouns(sentence)))
                nouns = json.dumps(nouns,ensure_ascii=False)

                verbs = list(set(self.verbs(sentence)))
                verbs = json.dumps(verbs,ensure_ascii=False)

                adjs = list(set(self.adjs(sentence)))
                adjs = json.dumps(adjs,ensure_ascii=False)
                self.db.update_multi_column("crawl_sentence",
                                     update_dict={"nouns":nouns,"verbs":verbs,"adjs":adjs,"positiveness":float(positiveness)},
                                     where_dict={"sentence_id":float(sentence_id)})
            except Exception as e:
                print(e)
                continue
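# A hypothetical usage sketch (the Pred_config setup, keyword, and contents_id
# values are assumptions, not from the original):
# pred = Predict(Pred_config(), keyword='노트북', contents_id='some-id')
# pred.predict()  # read sentences from MySQL, score sentiment, update crawl_sentence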
Example #24
# # 4. Natural Language Processing

# * Load the morphological analyzer Mecab().

# In[13]:

mecab = Mecab()

# ## 4.1 Part-of-Speech Extraction

# In[14]:

data_pos = []
for sentence in movie_data['new_text']:
    data_pos.append(mecab.pos(sentence))
data_pos[:3]

# ## 4.2 Morpheme Analysis

# In[15]:

tokenized_data = []
for sentence in movie_data['new_text']:
    for text in mecab.morphs(sentence):
        tokenized_data.append(text)
tokenized_data[:10]

# * The 100 most frequent words are as follows.

# In[16]:
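# The In[16] cell itself is not shown above; a minimal sketch of what it
# presumably computes (the 100 most frequent morphemes), assuming only the
# tokenized_data list built in In[15]:
from collections import Counter

print(Counter(tokenized_data).most_common(100))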
Example #25
train_data = pd.read_csv(BASE_DIR + "/data/test/sample_train_data_" +
                         str(seed) + ".csv")
test_data = pd.read_csv(BASE_DIR + "/data/test/sample_test_data_" + str(seed) +
                        ".csv")

submission_data = pd.read_csv(BASE_DIR + "/data/submission_제출양식.csv")

random.seed(seed)

test_data['prediction'] = 2  # add the smishing prediction column

tokenizer = Mecab()

print("tokenizing train data")
train_doc = [(tokenizer.pos(x), y)
             for x, y in tqdm(zip(train_data['text'], train_data['smishing']))]

print("tokenizing test data")
test_doc = [(tokenizer.pos(x), y)
            for x, y in tqdm(zip(test_data['text'], test_data['smishing']))]

stopwords = [
    'XXX', '.', '을', '를', '이', '가', '-', '(', ')', ':', '!', '?', ')-', '.-',
    'ㅡ', 'XXXXXX', '..', '.(', '은', '는'
]  # list of tokens to drop


def get_couple(_words):
    global stopwords
    _words = [x for x in _words if x[0] not in stopwords]
Example #26
class Predict:
    def __init__(self,
                 pred_config: Pred_config,
                 task_id=None,
                 keyword=None,
                 channel=None):
        self.pred_config = pred_config
        self.engine = create_engine(
            ("mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4").format(
                'root', 'robot369', '10.96.5.179', 3306, 'datacast2'))
        self.args = self.pred_config.get_args()

        ## choose the device to use (CUDA or CPU)
        self.device = self.pred_config.get_device()

        ## batch size (how many inputs the model processes at once)
        self.batch_size = self.pred_config.batch_size

        ## load the model
        self.model = self.pred_config.load_model(self.args, self.device)

        ## load the tokenizer
        self.tokenizer = self.pred_config.load_tokenizer()
        self.nlp = Mecab()
        self.task_id = task_id
        self.keyword = keyword
        self.channel = channel

    def verbs(self, phrase):
        """Verbs extractor."""
        verbs = ['VV']
        tagged = self.nlp.pos(phrase)
        return [s for s, t in tagged if t in verbs]

    def adjs(self, phrase):
        """Adjs extractor."""
        adjs = ['VA', 'IC']
        tagged = self.nlp.pos(phrase)
        return [s for s, t in tagged if t in adjs]

    def read(self):
        # conn = pymysql.connect(host='1.221.75.76', user='******', password='******', database='datacast')
        # curs = conn.cursor(pymysql.cursors.DictCursor)
        # sql_select_sentence = 'select * from analysis_sentence'
        # curs.execute(sql_select_sentence)
        # rows = curs.fetchall()
        ## read the sentence table into a pandas DataFrame

        print(
            'sql:',
            "SELECT ct.task_id,ct.channel,cc.contents_id,cc.text,cc.url from crawl_task as ct join crawl_contents as cc on ct.task_id=cc.task_id WHERE ct.task_id=%s and ct.keyword=\'%s\'"
            % (self.task_id, self.keyword))
        df_sentence_rows = pd.read_sql(
            "SELECT ct.task_id,ct.channel,cc.contents_id,cc.text,cc.url from crawl_task as ct join crawl_contents as cc on ct.task_id=cc.task_id WHERE ct.task_id=%s and ct.keyword=\'%s\'"
            % (self.task_id, self.keyword), self.engine)
        df_sentence_rows = df_sentence_rows[df_sentence_rows['text'].apply(
            lambda x: len(x) < 5000)]
        print('read finish')
        return df_sentence_rows

    def convert_input_sentence_to_tensor_dataset(self,
                                                 df_sentence_rows,
                                                 cls_token_segment_id=0,
                                                 pad_token_segment_id=0,
                                                 sequence_a_segment_id=0,
                                                 mask_padding_with_zero=True):
        tokenizer = self.tokenizer
        args = self.args

        cls_token = tokenizer.cls_token
        sep_token = tokenizer.sep_token
        pad_token_id = tokenizer.pad_token_id

        all_input_ids = []
        all_attention_mask = []
        all_token_type_ids = []

        ### read the input rows
        ### and convert them into a TensorDataset
        for index in df_sentence_rows.index:
            sentence = df_sentence_rows.at[index, 'text']

            tokens = tokenizer.tokenize(sentence)

            # Account for [CLS] and [SEP]
            special_tokens_count = 2
            # truncate sentences longer than the maximum sequence length
            if len(tokens) > args.max_seq_len - special_tokens_count:
                tokens = tokens[:(args.max_seq_len - special_tokens_count)]

            # Add [SEP] token
            tokens += [sep_token]
            token_type_ids = [sequence_a_segment_id] * len(tokens)

            # Add [CLS] token
            tokens = [cls_token] + tokens
            token_type_ids = [cls_token_segment_id] + token_type_ids
            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask is 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
            attention_mask = [1 if mask_padding_with_zero else 0
                              ] * len(input_ids)
            # Zero-pad up to the sequence length.
            padding_length = args.max_seq_len - len(input_ids)
            input_ids = input_ids + ([pad_token_id] * padding_length)
            attention_mask = attention_mask + (
                [0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] *
                                               padding_length)

            all_input_ids.append(input_ids)
            all_attention_mask.append(attention_mask)
            all_token_type_ids.append(token_type_ids)

        # Change to Tensor
        all_input_ids = torch.tensor(all_input_ids, dtype=torch.long)
        all_attention_mask = torch.tensor(all_attention_mask, dtype=torch.long)
        all_token_type_ids = torch.tensor(all_token_type_ids, dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_attention_mask,
                                all_token_type_ids)
        return dataset

    def predict(self):
        ## parameters saved during fine-tuning (training_args.bin)
        args = self.args

        ## choose the device to use (CUDA or CPU)
        device = self.device

        ## batch size (how many inputs the model processes at once)
        batch_size = self.batch_size

        ## load the model
        model = self.model
        logger.info(args)

        ## fetch the data to run sentiment analysis on
        chunck_size = 100
        df_cdata_rows = self.read()
        for idx, i in enumerate(
                tqdm(range(0, len(df_cdata_rows), chunck_size),
                     desc="Nlp&Prediction")):
            try:
                df_cdata_chuncked_rows = df_cdata_rows[idx * chunck_size +
                                                       1:(idx + 1) *
                                                       chunck_size]
                ## split crawl_contents into sentences and insert them into crawl_sentence
                df_sentence_rows = pd.DataFrame()
                for index in df_cdata_chuncked_rows.index:
                    df_sentence_row = pd.DataFrame()
                    crawl_contents_id = df_cdata_chuncked_rows.at[
                        index, 'contents_id']
                    text = df_cdata_chuncked_rows.at[index, 'text']
                    sentences = kss.split_sentences(text)
                    seq = [i for i in range(0, len(sentences))]
                    df_sentence_row['contents_id'] = [crawl_contents_id
                                                      ] * len(sentences)
                    df_sentence_row['text'] = sentences
                    df_sentence_row['seq'] = seq
                    df_sentence_rows = df_sentence_rows.append(
                        df_sentence_row, ignore_index=True)
                # chunk_size = 10000
                # list_df_sentence_rows = [df_sentence_rows[i:i+chunk_size] for i in range(0,df_sentence_rows.shape[0],chunk_size)]
                # for df_sentence_rows_to_read in list_df_sentence_rows:
                ## convert the data so the model can read it (to TensorDataset)
                dataset = self.convert_input_sentence_to_tensor_dataset(
                    df_sentence_rows)

                # run the model over the dataset to produce outputs
                # Predict
                sampler = SequentialSampler(dataset)
                data_loader = DataLoader(dataset,
                                         sampler=sampler,
                                         batch_size=batch_size)
                preds = None
                probs = None
                print(type(data_loader), len(data_loader))
                for index, batch in enumerate(data_loader):
                    batch = tuple(t.to(device) for t in batch)
                    with torch.no_grad():
                        inputs = {
                            "input_ids": batch[0],
                            "attention_mask": batch[1],
                            "labels": None
                        }
                        if args.model_type != "distilkobert":
                            inputs["token_type_ids"] = batch[2]
                        outputs = model(**inputs)
                        logits = outputs[0]

                        if preds is None:
                            preds = logits.detach().cpu().numpy()
                            probs = np.exp(logits.detach().cpu().numpy()) / (
                                1 + np.exp(logits.detach().cpu().numpy()))
                        else:
                            preds = np.append(preds,
                                              logits.detach().cpu().numpy(),
                                              axis=0)
                            probs = np.append(
                                probs,
                                np.exp(logits.detach().cpu().numpy()) /
                                (1 + np.exp(logits.detach().cpu().numpy())),
                                axis=0)
                print(len(preds), len(probs))
                preds = np.argmax(preds, axis=1).tolist()
                prob_max_index = np.argmax(probs, axis=-1)
                maximum_probs = probs[np.arange(probs.shape[0]),
                                      prob_max_index]
                # maximum_probs = maximum_probs.tolist()
                # maximum_probs = list([round(maximum_prob,2) if pred==1 else round(maximum_prob,2)*(-1) for pred,maximum_prob in zip(preds,maximum_probs)])
                for idx in df_sentence_rows.index:
                    sentence = df_sentence_rows.at[idx, 'text']
                    nouns = list(set(self.nlp.nouns(sentence)))
                    nouns = json.dumps(nouns, ensure_ascii=False)

                    verbs = list(set(self.verbs(sentence)))
                    verbs = json.dumps(verbs, ensure_ascii=False)

                    adjs = list(set(self.adjs(sentence)))
                    adjs = json.dumps(adjs, ensure_ascii=False)

                    df_sentence_rows.at[idx, 'nouns'] = nouns
                    df_sentence_rows.at[idx, 'verbs'] = verbs
                    df_sentence_rows.at[idx, 'adjs'] = adjs
                df_sentence_rows['positiveness'] = preds
                # df_sentence_rows['sentiment_point'] = maximum_probs
                # df_sentence_rows.set_index('sentence_id',inplace=True)
                print(df_sentence_rows)
                ## update the sentiment values in the analysis_sentence table
                ## use u as the primary key of the fetched data
                # chunk_size = 10000 #chunk row size
                # list_df_sentence_rows = [df_sentence_rows[i:i+chunk_size] for i in range(0,df_sentence_rows.shape[0],chunk_size)]
                # for index,df_sentence_rows in enumerate(list_df_sentence_rows):
                # print('chunk #{}'.format(index))
                # df_sentence_rows.to_sql('analysis_sentence_tmp',self.engine.connect(),if_exists='replace',index=False,chunksize=1)

                conn = self.engine.connect()
                trans = conn.begin()

                try:
                    # #delete those rows that we are going to "upsert"
                    # self.engine.execute("DELETE anal_s FROM analysis_sentence AS anal_s, analysis_sentence_tmp AS anal_st WHERE anal_s.sentence_id = anal_st.sentence_id")
                    # print('delete...ing')
                    # trans.commit()

                    #insert changed rows
                    df_sentence_rows.to_sql('crawl_sentence',
                                            self.engine,
                                            if_exists='append',
                                            index=False)
                    print('insert...ing')

                except Exception as e:
                    print(e)
                    trans.rollback()
                    raise

                logger.info("Prediction Done!")
                conn.close()
            except Exception as e:
                print(e)
                continue
Example #27
    'purpose': ['게임', '사무', '영상편집', '방송'],
    'price': ['가성비', '고가', '저가', '최저가', '저렴'],
    'type': ['컴퓨터', '노트북', '데스크탑', '랩탑'],
    'problem': ['전원', '소리', '키보드', '모니터'],
}

length = 1
for key in list(dict_entity.keys()):
    length = length * len(dict_entity[key])
# print("Augmentation length is {0}".format(length))
#-> Augmentation length is 320

from eunjeon import Mecab

mecab = Mecab()
morpphed_text = mecab.pos(get_data_list)
# print(morpphed_text)
#-> [('저렴', 'XR'), ('한', 'XSA+ETM'), ('게임', 'NNG'), ('용', 'XSN'), ('컴퓨터', 'NNG'), ('추천', 'NNG'), ('해', 'XSV+EC'), ('줘', 'VX+EC')]

tagged_text = ''
for pos_tags in morpphed_text:
    if (pos_tags[1] in ['NNG', 'MAG', 'NNP', 'SL', 'XR']
            and len(pos_tags[0]) > 1):  #Check only Noun
        feature_value = pos_tags[0]
        tagged_text = tagged_text + pos_tags[0] + ' '
print(tagged_text)
#-> 저렴 게임 컴퓨터 추천

pattern = ''
for word in tagged_text.split(' '):
    entity = list(
Example #28
def test(documents):
    # clean the text (extract morphemes)
    eunjeon = Mecab()
    testdata = eunjeon.pos(documents[0])

    return testdata
Example #29
    def __init__(self, inputText, inputCorpus=None):

        # Extract neologisms (newly coined words) from the source document
        # Python version 3.6
        # Packages to install: eunjeon, pandas
        # Planned migration from eunjeon to konlpy later

        # Installing mecab-ko-dic on Linux
        # wget -c https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/최신버전-mecab-ko-dic.tar.gz
        # tar zxfv  최신버전-mecab-ko-dic.tar.gz
        # cd 최신버전-mecab-ko-dic
        # ./configure
        # make
        # make check
        # sudo make install
        # After the steps above, mecab-ko-dic is installed under /usr/local/lib/mecab/dic/mecab-ko-dic

        # Known issue: a corpus input is not accepted
        # Need to implement Corpus input and put it together into one string, or find other way to search the corpus

        # doc = the entire source text as a string
        self.doc = self.clean_str(inputText)
        # corpus = reference corpus; data format undecided (not usable, disabled)
        if inputCorpus == None: self.corpus = ' ' + self.doc
        else: self.corpus = self.clean_str(' ' + inputCorpus)

        # wTotal = total number of eojeols (space-separated words) in the corpus
        self.wTotal = self.corpus.count(' ')

        l = self.doc.split(' ')
        self.eoList = [i for i in l if len(re.sub(r'[0-9]+', '-', i)) >= 3]

        # collect eojeols that contain parentheses
        missed = []
        for i in self.eoList:
            if i.count("(") > 0 and i.count(")") > 0:
                missed.append(i[i.find("(") + 1:i.find(")")])
                continue
            if i.count("(") > 0:
                missed.append(i.split("(", 1)[1])
            if i.count(")") > 0:
                missed.append(i[:-1])
        parenthesisless = [
            x for x in self.eoList if not '(' in x and not ')' in x
        ] + [x for x in self.eoList if '(' in x and ')' in x]
        parenthesisless += missed
        self.eoList = parenthesisless  # drop eojeols with an unmatched parenthesis and include the text found inside parentheses

        ############################################################################################################################################
        # missing part
        # build a list of every possible [LP, UM, RP] combination
        self.posUMpairList = []
        for i in range(len(self.eoList)):
            for j in self.splitEojeol(self.eoList[i]):
                # register after checking that RP consists of known words (if so, it is called a KRP)
                # if self.isAfterNoun(j[2]) and len(j[1]) > 1:
                if self.isKnown(j[2]):
                    self.posUMpairList.append(j)

        # partialEoList: list of every partial eojeol: ["eojeol1-part1", "eojeol1-part2", ...] # (usable, disabled)
        # self.partialEoList = []
        # for i in self.eoList:
        #     for j in self.eojeolPart(i):
        #         self.partialEoList.append(j)

############################################################################################################################################

# lplist: list of each eojeol's LP parts of two or more characters: [["eojeol1-LP1", "eojeol1-LP2", ...], ["eojeol2-LP1", "eojeol2-LP2", ...], ...]
        self.lplist = []
        iter = self.eoList[:]
        iter = list(dict.fromkeys(iter))
        for i in iter:
            if len(i) > 1: self.lplist.append(self.genLP(i))

        # extract the list of strings presumed to be nouns -> extnouns
        self.extnouns = []
        for i in self.lplist:
            scores = []
            finalscore = 0
            chosen = ''
            for j in range(len(i)):
                # currently this only counts occurrences of ' ' + word in the corpus, but it should really search only the left part of the original eojeol
                # issue 1: the corpus has not been cleaned
                # issue 2: the search should exclude nouns that have already been found
                scores.append(self.corpus.count(' ' + i[j]) / self.wTotal)
            for j in range(len(scores)):
                if j >= len(scores) - 1:
                    chosen = i[j]
                    finalscore = scores[j]
                    break
                # e.g. 마스터투자운 -> 마스터투자운용: the frequencies differ little, so keep extending,
                # but 마스터투자운용 -> 마스터투자운용은 differs sharply, so adopt the string just before the drop as the noun
                if scores[j] > scores[j + 1] * 1.1:
                    chosen = i[j]
                    finalscore = scores[j]
                    break
                finalscore = scores[j]
            # adopt if the frequency ratio is at least 2 / (number of eojeols)
            if finalscore >= 2 / self.wTotal: self.extnouns.append(chosen)
        self.extnouns = list(dict.fromkeys(self.extnouns))

        ############################################################################################################################################
        # missing part

        # here Mecab is used to determine what kind of character a single character is
        m = Mecab()
        # m = Mecab(dicpath='C:/mecab/mecab-ko-dic') # (not usable, disabled)
        # m = Mecab(dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic') # (not usable, disabled)
        # exclude cases where non-Hangul characters are split apart
        # e.g. ['신한BN', 'P파리', '바자산운용으로부터'], ['', '320', '0억원에'], etc.
        temp = self.posUMpairList[:]  # copy so temp does not share the same reference
        for i in self.posUMpairList:
            # remove the candidate if LP is non-empty and both LP's last character and UM's first character are non-Hangul
            if len(i[0]) > 0 and m.pos(i[0][-1])[0][1][0] == 'S' and m.pos(
                    i[1][0])[0][1][0] == 'S':
                temp.remove(i)
                # remove the candidate if RP is non-empty and both UM's last character and RP's first character are non-Hangul
            elif len(i[2]) > 0 and m.pos(i[1][-1])[0][1][0] == 'S' and m.pos(
                    i[2][0])[0][1][0] == 'S':
                temp.remove(i)
                # remove the candidate if UM contains an unmatched parenthesis
            elif '(' in i[1] and ')' not in i[1]:
                temp.remove(i)
            elif ')' in i[1] and '(' not in i[1]:
                temp.remove(i)
        # the result is a list of LP+UM+KRP triples
        self.posUMpairList = temp

        # candidates: final list of neologism candidates
        self.candidates = []
        for i in self.posUMpairList:
            # if KRP is empty: search the corpus for UM and register LP+UM when it appears at least twice
            if i[2] == '' and self.corpus.count(i[1]) >= 2:
                self.candidates.append(i[0] + i[1])
            # if KRP is not empty: search the corpus for UM plus the first morpheme of KRP and register LP+UM when it appears at least twice
            elif i[2] != '' and self.corpus.count(i[1] +
                                                  m.morphs(i[2])[0]) >= 2:
                self.candidates.append(i[0] + i[1])

        # prune candidates that contain one another, based on eojeol frequency
        temp = []
        for i in range(len(self.candidates) - 1):
            if self.candidates[i] in self.candidates[i + 1]:
                if self.wordFreq(
                        self.candidates[i], self.corpus) > self.wordFreq(
                            self.candidates[i + 1], self.corpus) * 1.1:
                    temp.append(self.candidates[i])
            elif self.candidates[i - 1] in self.candidates[i]:
                if self.wordFreq(self.candidates[i - 1],
                                 self.corpus) * 0.9 < self.wordFreq(
                                     self.candidates[i], self.corpus):
                    temp.append(self.candidates[i])
            else:
                temp.append(self.candidates[i])
        if self.wordFreq(self.candidates[-2],
                         self.corpus) * 0.9 < self.wordFreq(
                             self.candidates[-1], self.corpus):
            temp.append(self.candidates[-1])
        self.candidates = temp
        self.candidates = list(dict.fromkeys(self.candidates))

        # here Mecab is used to check whether a candidate is an already-registered noun
        # exclude nouns that are already registered in the dictionary
        temp = []
        for i in self.candidates:
            if len(m.pos(i)) > 1 or m.pos(i)[0][1][0] != 'N':
                temp.append(i)
        self.candidates = temp
Example #30
# repeat until all of the train data has been used for training
while len(train_list) > 0:

    if len(train_list) > 37403:
        selected_list = random.sample(train_list, 37403)   # could probably be raised a bit, given memory usage
        train_list = [index for index in train_list if index not in selected_list]
    else:
        selected_list = random.sample(train_list, len(train_list))
        train_list = [index for index in train_list if index not in selected_list]

    sampling_train_data = train_data.iloc[selected_list, :].reset_index(drop=True)

    tokenizer = Mecab()

    print("tokenizing train data")
    train_doc = [(tokenizer.pos(x), y) for x, y in tqdm(zip(sampling_train_data['text'], sampling_train_data['smishing']))]

    X_train = []  # list of text values
    Y_train = []  # list of smishing labels for each text

    print("removing stopword tokens from train data and preprocessing for the model")
    for lwords in train_doc:
        Y_train.append(lwords[1])

        temp = []
        for x, y in get_couple(lwords[0]):
            temp.append("{}.{}".format(x, y))

        X_train.append(" ".join(temp))

    vec_x_train = v.fit_transform(X_train)