Python Preprocess 예제들, chatbot.utils.Preprocess.Preprocess Python 예제들

예제 #1

0

파일 보기

파일: chatbot_dic_test.py 프로젝트: NormalbutGenuine/Chatbot1

import pickle
from chatbot.utils.Preprocess import Preprocess

# Load the pickled word dictionary; `with` closes the file even if loading fails.
with open("C:/Users/obybk/OneDrive/바탕 화면/인공지능/deepChat/chatbot/train_tools/dict/chatbot_dict.bin", "rb") as f:
    word_index = pickle.load(f)

sent = "내일 오전 10시에 탕수육 주문하고 싶어 ㅋㅋ"
p = Preprocess(userdic="C:/Users/obybk/OneDrive/바탕 화면/인공지능/deepChat/Tokenizing/user_dic.txt")

# Run the morphological analyzer. The result goes into `pos` so that `p`
# keeps referring to the Preprocess object needed by get_keywords() below.
pos = p.pos(sent)

# Print each keyword (without POS tags) next to its dictionary entry
keywords = p.get_keywords(pos, without_tag=True)
for word in keywords:
    try:
        print(word, word_index[word])
    except KeyError:
        # The word is not in the dictionary: fall back to the 'OOV' entry
        print(word, word_index['OOV'])

예제 #2

0

파일 보기

import pickle
from chatbot.utils.Preprocess import Preprocess

# Load the pickled word dictionary; `with` closes the file even if loading fails.
with open("../train_tools/dict/chatbot_dict.bin", "rb") as f:
    word_index = pickle.load(f)

sent = "내일 오전 10시에 탕수육 주문하고 싶어 ㅋㅋ"

# Create the preprocessing object
p = Preprocess(userdic='../utils/user_dic.tsv')

# Run the morphological analyzer
pos = p.pos(sent)

# Print each keyword (without POS tags) next to its dictionary entry
keywords = p.get_keywords(pos, without_tag=True)
for word in keywords:
    try:
        print(word, word_index[word])
    except KeyError:
        # The word is not in the dictionary: fall back to the 'OOV' entry
        print(word, word_index['OOV'])

예제 #3

0

파일 보기

파일: model_intent_test.py 프로젝트: NormalbutGenuine/Chatbot1

from chatbot.utils.Preprocess import Preprocess
from chatbot.models.intent.IntentModel import IntentModel

# File locations handed to Preprocess and IntentModel below
word2index_path = 'C:/Users/obybk/OneDrive/바탕 화면/인공지능/deepChat/chatbot/train_tools/dict/chatbot_dict.bin'
userdic_path = 'C:/Users/obybk/OneDrive/바탕 화면/인공지능/deepChat/Tokenizing/user_dic.txt'
model_path = 'C:/Users/obybk/OneDrive/바탕 화면/인공지능/deepChat/intent_model.h5'

p = Preprocess(word2index_dic=word2index_path, userdic=userdic_path)
# NOTE(review): the keyword really is spelled `proprocess` here; presumably it
# matches the IntentModel constructor - confirm before renaming.
intent = IntentModel(model_name=model_path, proprocess=p)

# Predict a class for one query and look up its label in intent.labels
query = "오늘 탕수육 주문 가능한가요?"
predict = intent.predict_class(query)
predict_label = intent.labels[predict]

print(query)
print("의도 예측 클래스: ", predict)
print("의도 예측 레이블: ", predict_label)

예제 #4

0

파일 보기

파일: preprocess_test.py 프로젝트: NormalbutGenuine/Chatbot1

from chatbot.utils.Preprocess import Preprocess

sent = "내일 오전 10시에 탕수육 주문하고 싶어"

# Create the preprocessing object
p = Preprocess(userdic='C:/Users/obybk/OneDrive/바탕 화면/인공지능/deepChat/Tokenizing/user_dic.txt')

# Run morphological analysis on the sentence
pos = p.pos(sent)

# Print the keywords twice: first with without_tag=False, then with without_tag=True
for without_tag in (False, True):
    ret = p.get_keywords(pos, without_tag=without_tag)
    print(ret)

예제 #5

0

파일 보기

파일: train_model.py 프로젝트: NormalbutGenuine/Chatbot1

import pandas as pd
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate

# Load the training data: read the CSV and pull the 'query' and 'intent'
# columns out as plain Python lists.
train_file = "C:/Users/obybk/OneDrive/바탕 화면/AI/deepChat/chatbot/models/intent/total_train_data.csv"
data = pd.read_csv(train_file, delimiter=',')
queries = data['query'].tolist()
intents = data['intent'].tolist()
from chatbot.utils.Preprocess import Preprocess
# Preprocessing object configured with the word-index dictionary file and the
# user dictionary file.
p = Preprocess(
    word2index_dic=
    'C:/Users/obybk/OneDrive/바탕 화면/AI/deepChat/chatbot/train_tools/dict/chatbot_dict.bin',
    userdic='C:/Users/obybk/OneDrive/바탕 화면/AI/deepChat/Tokenizing/user_dic.txt'
)

# Build the word sequences: one entry per query, in the same order as `queries`.
sequences = []
for sentence in queries:
    # analyze -> keep keywords without tags -> convert via get_wordidx_sequence
    # (presumably a list of word indices - confirm against Preprocess)
    pos = p.pos(sentence)
    keywords = p.get_keywords(pos, without_tag=True)
    seq = p.get_wordidx_sequence(keywords)
    sequences.append(seq)

# 단어 인덱스 시퀀스 벡터 생성
# 단어 시퀀스 벡터 크기
from chatbot.config.GlobalParams import MAX_SEQ_LEN
padded_seqs = preprocessing.sequence.pad_sequences(sequences,
                                                   maxlen=MAX_SEQ_LEN,

예제 #6

0

파일 보기

파일: train_model.py 프로젝트: NormalbutGenuine/Chatbot1

        for idx, l in enumerate(lines):
            if l[0] == ';' and lines[idx + 1][0] == '$':
                this_sent = []
            elif l[0] == '$' and lines[idx - 1][0] == ';':
                continue
            elif l[0] == '\n':
                sents.append(this_sent)
            else:
                this_sent.append(tuple(l.split()))
    return sents


# File locations used by this script
word2index_path = 'C:/Users/obybk/OneDrive/바탕 화면/인공지능/deepChat/chatbot/train_tools/dict/chatbot_dict.bin'
userdic_path = 'C:/Users/obybk/OneDrive/바탕 화면/인공지능/deepChat/Tokenizing/user_dic.txt'
ner_corpus_path = 'C:/Users/obybk/OneDrive/바탕 화면/인공지능/deepChat/chatbot/models/ner/ner_train.txt'

# Create the preprocessing object
p = Preprocess(word2index_dic=word2index_path, userdic=userdic_path)

# Load the training corpus
corpus = read_file(ner_corpus_path)

# 말뭉치 데이터에서 단어와 BIO태그만 불러와 학습용 데이터셋 생성
sentences, tags = [], []
for t in corpus:
    tagged_sentence = []
    sentence, bio_tag = [], []
    for w in t:
        tagged_sentence.append((w[1], w[3]))

예제 #7

0

파일 보기

from tensorflow.keras import preprocessing
import pickle


def read_corpus_data(filename):
    """Read a tab-separated UTF-8 text file and return its rows without the first line.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one entry per line after the first; each entry is the
        list of fields obtained by splitting that line on tab characters.
    """
    with open(filename, 'r', encoding="utf-8") as corpus_file:
        raw_lines = corpus_file.read().splitlines()

    rows = []
    # Skip index 0: the first line is dropped from the result.
    for raw in raw_lines[1:]:
        rows.append(raw.split('\t'))

    return rows


# Load the corpus rows (first line of the file already removed by read_corpus_data)
corpus_data = read_corpus_data(
    'C:\\Users\\obybk\\OneDrive\\바탕 화면\\인공지능\\deepChat\\chatbot\\train_tools\\dict\\corpus.txt'
)
p = Preprocess()

# Collect the first element of every item p.pos() returns for column 1 of each
# corpus row. The list is called `word_list` rather than `dict` so that the
# builtin `dict` type is not shadowed for the rest of the module.
word_list = []
for c in corpus_data:
    pos = p.pos(c[1])
    word_list.extend(k[0] for k in pos)

# Fit a Keras tokenizer on the collected words; 'OOV' is registered as the
# out-of-vocabulary token. `word_index` is the tokenizer's word -> index mapping.
tokenizer = preprocessing.text.Tokenizer(oov_token='OOV')
tokenizer.fit_on_texts(word_list)
word_index = tokenizer.word_index

f = open("chatbot_dict.bin", "wb")
try:
    pickle.dump(word_index, f)
    print("성공")
except Exception as e:

예제 #8

0

파일 보기

# 필요한 모듈 임포트
import pandas as pd
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate

# Load the training data: read the CSV and pull the 'query' and 'intent'
# columns out as plain Python lists.
train_file = "total_train_data.csv"
data = pd.read_csv(train_file, delimiter=',')
queries = data['query'].tolist()
intents = data['intent'].tolist()

from chatbot.utils.Preprocess import Preprocess
# Preprocessing object configured with the word-index dictionary file and the
# user dictionary file.
p = Preprocess(word2index_dic='../../train_tools/dict/chatbot_dict.bin',
               userdic='../../utils/user_dic.tsv')

# Build the word sequences: one entry per query, in the same order as `queries`.
sequences = []
for sentence in queries:
    # analyze -> keep keywords without tags -> convert via get_wordidx_sequence
    # (presumably a list of word indices - confirm against Preprocess)
    pos = p.pos(sentence)
    keywords = p.get_keywords(pos, without_tag=True)
    seq = p.get_wordidx_sequence(keywords)
    sequences.append(seq)

# Create the word-index sequence vectors.
# MAX_SEQ_LEN (from the project's GlobalParams) is the sequence vector size;
# padding='post' pads at the end of each sequence.
from chatbot.config.GlobalParams import MAX_SEQ_LEN
padded_seqs = preprocessing.sequence.pad_sequences(sequences,
                                                   maxlen=MAX_SEQ_LEN,
                                                   padding='post')

예제 #9

0

파일 보기

파일: model_intent_test.py 프로젝트: chan1031/firstChatbot

from chatbot.utils.Preprocess import Preprocess
from chatbot.models.intent.IntentModel import IntentModel

# Preprocessing object configured with the word-index dictionary file and the
# user dictionary file.
p = Preprocess(word2index_dic='../train_tools/dict/chatbot_dict.bin',
               userdic='../utils/user_dic.tsv')

# NOTE(review): the keyword really is spelled `proprocess` here; presumably it
# matches the IntentModel constructor - confirm before renaming.
intent = IntentModel(model_name='../models/intent/intent_model.h5',
                     proprocess=p)
# Predict a class for one query and look up its label in intent.labels
query = "안녕하세요"
predict = intent.predict_class(query)
predict_label = intent.labels[predict]

# Show the query, the predicted class and the predicted label
print(query)
print("의도 예측 클래스 : ", predict)
print("의도 예측 레이블 : ", predict_label)

예제 #10

0

파일 보기

파일: create_dict.py 프로젝트: chan1031/firstChatbot

import pickle


# Read the corpus data
def read_corpus_data(filename):
    """Return the tab-split rows of a UTF-8 text file, leaving out the first line.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of lists: one inner list of tab-separated fields for every
        line after the first (the first line is treated as a header).
    """
    with open(filename, 'r', encoding='UTF8') as corpus:
        all_lines = corpus.read().splitlines()
    # Slice off the header line, then split each remaining line on tabs.
    return [record.split('\t') for record in all_lines[1:]]


# Load the corpus rows (header line already removed by read_corpus_data)
corpus_data = read_corpus_data('corpus.txt')

# Build the word list from the corpus: for each row, analyze column 1 with
# p.pos() and keep the first element of every returned item.
# The list is called `word_list` rather than `dict` so that the builtin `dict`
# type is not shadowed for the rest of the module.
p = Preprocess()
word_list = []
for c in corpus_data:
    pos = p.pos(c[1])
    word_list.extend(k[0] for k in pos)

# Fit a Keras tokenizer on the collected words; 'OOV' is registered as the
# out-of-vocabulary token. `word_index` is the tokenizer's word -> index mapping.
tokenizer = preprocessing.text.Tokenizer(oov_token='OOV')
tokenizer.fit_on_texts(word_list)
word_index = tokenizer.word_index

#사전 파일 생성
f = open("chatbot_dict.bin", "wb")
try:
    pickle.dump(word_index, f)  #dump(넣을 내용,파일 이름)

예제 #11

0

파일 보기

파일: train_model.py 프로젝트: chan1031/firstChatbot

def read_file(file_name):
    """Parse a blank-line-delimited corpus file into a list of sentences.

    Line handling, in order of precedence:
      * a line starting with ';' that is directly followed by a line starting
        with '$' begins a new sentence;
      * a line starting with '$' that directly follows a ';' line is skipped;
      * an empty line ('\\n') appends the current sentence to the result;
      * any other line is split on whitespace and stored as a tuple in the
        current sentence.

    Compared with a naive neighbour lookup, this version does not index past
    either end of the file, tolerates rows that appear before any header, and
    keeps the final sentence when the file does not end with a blank line.

    Args:
        file_name: Path of the UTF-8 encoded corpus file.

    Returns:
        A list of sentences, each a list of tuples of whitespace-separated
        fields.
    """
    sents = []
    # Start with an empty sentence so rows before the first header do not
    # hit an unbound variable.
    this_sent = []
    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    last_idx = len(lines) - 1
    for idx, l in enumerate(lines):
        # Guard the neighbour lookups: idx + 1 would raise IndexError on the
        # last line, and idx - 1 would silently wrap to the last line at idx 0.
        next_is_dollar = idx < last_idx and lines[idx + 1][0] == '$'
        prev_is_semicolon = idx > 0 and lines[idx - 1][0] == ';'

        if l[0] == ';' and next_is_dollar:
            this_sent = []
        elif l[0] == '$' and prev_is_semicolon:
            continue
        elif l[0] == '\n':
            sents.append(this_sent)
        else:
            this_sent.append(tuple(l.split()))

    # Flush a trailing sentence that was never closed by a blank line, unless
    # that very list object has already been appended.
    if this_sent and (not sents or sents[-1] is not this_sent):
        sents.append(this_sent)
    return sents

# Preprocessing object configured with the word-index dictionary file and the
# user dictionary file.
p = Preprocess(word2index_dic='../../train_tools/dict/chatbot_dict.bin',
               userdic='../../utils/user_dic.tsv')

# Load the training corpus
corpus = read_file('ner_train.txt')

# 말뭉치 데이터에서 단어와 BIO 태그만 불러와 학습용 데이터셋 생성
sentences, tags = [], []
for t in corpus:
    tagged_sentence = []
    sentence, bio_tag = [], []
    for w in t:
        tagged_sentence.append((w[1], w[3]))
        sentence.append(w[1])
        bio_tag.append(w[3])

    sentences.append(sentence)