Code example #1
from sklearn.feature_extraction.text import TfidfVectorizer


def tfidf_word_feats(word_matrix, ngrams=(2, )):
    docs_by_word = [' '.join(pred_list) for pred_list in word_matrix]

    # by individual word
    tfidf_word_results = []
    tfidf_wrd = TfidfVectorizer(tokenizer=tokenize)  # pass the tokenizer callable
    word_scores = tfidf_wrd.fit_transform(docs_by_word).toarray()
    word_feat_names = tfidf_wrd.get_feature_names()  # get_feature_names_out() in scikit-learn >= 1.0
    for i, row in enumerate(word_scores):
        tfidf_word_results.append({
            pred_name: score
            for score, pred_name in zip(row, word_feat_names) if score > 0
        })

    # by n-gram, for multiple n
    tfidf_ng_results = []

    # FIXME need to merge dicts over all n
    for n in ngrams:
        tfidf_char = TfidfVectorizer(analyzer='char', ngram_range=(n, n))
        char_scores = tfidf_char.fit_transform(docs_by_word).toarray()
        char_feat_names = tfidf_char.get_feature_names()
        for i, row in enumerate(char_scores):
            tfidf_ng_results.append({
                ng: score
                for score, ng in zip(row, char_feat_names)
                if score > 0 and ' ' not in ng
            })

    return list(zip(tfidf_word_results, tfidf_ng_results))
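A minimal invocation sketch; the word_matrix contents and the whitespace tokenize below are stand-ins for illustration, not the original project's code:

def tokenize(text):
    # stand-in tokenizer; the real project supplies its own
    return text.split()

word_matrix = [['red', 'blue', 'green'], ['blue', 'yellow']]
results = tfidf_word_feats(word_matrix)          # uses the default ngrams=(2, )
per_word, per_ngram = zip(*results)
print(per_word[0])    # TF-IDF score per word in the first document
print(per_ngram[0])   # TF-IDF score per character 2-gram (spaces excluded)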
Code example #2
File: graphs.py    Project: xianyt/text-gcn-chainer
def load_20newsgroups(validation_ratio, normalization):
    """Load text network (20 news group)

    Arguments:
        validation_ratio (float): Ratio of validation split
        normalization (str): Variant of normalization method to use.

    Returns:
        adj (chainer.utils.sparse.CooMatrix): (Node, Node) shape
            normalized adjacency matrix.
        labels (np.ndarray): (Node, ) shape labels array
        idx_train (np.ndarray): Indices of the training split
        idx_val (np.ndarray): Indices of the validation split
        idx_test (np.ndarray): Indices of the test split
    """
    train = fetch_20newsgroups(subset='train')
    test = fetch_20newsgroups(subset='test')
    adj = create_text_adjacency_matrix(
        [tokenize(t) for t in (train['data'] + test['data'])])
    if normalization == 'gcn':
        adj = normalize(adj)
    else:
        adj = normalize_pygcn(adj)
    n_train = int(len(train['data']) * (1.0 - validation_ratio))
    n_all = len(train['data']) + len(test['data'])
    idx_train = np.array(list(range(n_train)), np.int32)
    idx_val = np.array(list(range(n_train, len(train['data']))), np.int32)
    idx_test = np.array(list(range(len(train['data']), n_all)), np.int32)

    labels = np.concatenate(
        (train['target'], test['target'], np.full([adj.shape[0] - n_all], -1)))
    labels = labels.astype(np.int32)
    adj = to_chainer_sparse_variable(adj)

    return adj, labels, idx_train, idx_val, idx_test
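A hypothetical call, assuming the project's own helpers (create_text_adjacency_matrix, normalize, normalize_pygcn, to_chainer_sparse_variable) are importable; word nodes beyond the documents are padded with the label -1:

adj, labels, idx_train, idx_val, idx_test = load_20newsgroups(
    validation_ratio=0.1, normalization='gcn')
# documents come first in the node ordering, so the index arrays slice
# into the document portion of `labels`
print(labels[idx_train].shape, labels[idx_val].shape, labels[idx_test].shape)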
Code example #3
File: imojify.py    Project: owo/jitalk
def imojify_input(line, src_lang="en"):
	line = line.lower()
	sents = nlp_utils.tokenize(line)
	imojified = []

	for s in sents:
		imojified.append(imojify_sentence(nlp_utils.stem_tokens(s, src_lang),
						 src_lang))
	return imojified
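A usage sketch with a made-up input sentence; nlp_utils and imojify_sentence are project-internal names assumed to be importable:

emoji_sents = imojify_input("I love pizza. Music makes me happy!", src_lang="en")
# one imojified result per unit returned by nlp_utils.tokenize
print(emoji_sents)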
Code example #4
def read_seq_dataset_from_file(filename,
                               max_vocab_size=1000000,
                               min_count=0,
                               unk_tk=UNK_TK,
                               start_tk=START_TK,
                               decode_tk=DECODE_TK,
                               end_tk=END_TK,
                               tokenize=True):
    """Get the sequences and vocab from a file.

  Args:
    filename: name of file.
    max_vocab_size: the maximum number of tokens in the vocab.
    min_count: the minimum number of appearance for a token to
    be added into the vocab.
    unk_tk: the unknown token.
    start_tk: the start of sentence token.
    decode_tk: the start of decoding token.
    end_tk: the end of decoding token.
    tokenize: Whether to tokenize the text in the file.

  Returns:
    seqs: a list of lists of tokens.
    vocab: a Vocab object created from the file.
  """
    vocab = generate_vocab_from_file(filename,
                                     tokenize=tokenize,
                                     max_vocab_size=max_vocab_size,
                                     min_count=min_count,
                                     unk_tk=unk_tk,
                                     start_tk=start_tk,
                                     decode_tk=decode_tk,
                                     end_tk=end_tk)
    seqs = []
    with open(filename, 'r') as f:
        for line in f:
            if tokenize:
                tokens = nlp_utils.tokenize(line)
            else:
                tokens = line.strip().split()
            seqs.append(tokens)
    return seqs, vocab
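A hypothetical invocation; 'corpus.txt' is a placeholder for a plain-text file with one sequence per line, and generate_vocab_from_file / nlp_utils come from the same project:

seqs, vocab = read_seq_dataset_from_file(
    'corpus.txt', max_vocab_size=50000, min_count=2, tokenize=True)
print(len(seqs), len(seqs[0]))  # number of sequences, tokens in the first one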
Code example #5
def generate_vocab_from_stream(text_stream,
                               max_vocab_size=1000000,
                               min_count=0,
                               unk_tk=UNK_TK,
                               start_tk=START_TK,
                               decode_tk=DECODE_TK,
                               end_tk=END_TK,
                               tokenize=True):
    """Create a vocab from a given text stream."""
    token_list = []
    for line in text_stream:
        if tokenize:
            new_list = nlp_utils.tokenize(line)
        else:
            new_list = line.strip().split()
        token_list += new_list
    return generate_vocab_from_list(token_list,
                                    max_vocab_size=max_vocab_size,
                                    min_count=min_count,
                                    unk_tk=unk_tk,
                                    start_tk=start_tk,
                                    decode_tk=decode_tk,
                                    end_tk=end_tk)
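A tiny self-contained check using an in-memory stream; io.StringIO stands in for a real file handle, and generate_vocab_from_list is project-internal:

import io

vocab = generate_vocab_from_stream(io.StringIO("the cat sat\nthe dog ran\n"),
                                   min_count=1, tokenize=False)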
Code example #6
File: data_utils.py    Project: dangwn/imdb_nlp_lstm
        vector[ind - 1] = 1
    return np.array(vector)


def word_vectorize_doc(doc: str, vector_size=10_000, final_doc_len=100):
    '''
    Returns an array of word vectors for a given document
    =============================================
    Inputs:
      - doc          : The document to vectorize
      - vector_size  : The size of the vectors for each word
      - final_doc_len: The length of the array of vectors
    Returns:
      - A numpy array of vectors
    '''
    tokenized_doc = tokenize(doc)

    # Truncate the tokenized document to the final length
    if len(tokenized_doc) > final_doc_len:
        tokenized_doc = tokenized_doc[:final_doc_len]

    # Create an ordered sequence of word vectors, padding with zero
    # vectors so the output always has final_doc_len rows
    vectors = []
    for word in tokenized_doc:
        vectors.append(word_vectorize(word, vector_size=vector_size))
    while len(vectors) < final_doc_len:
        vectors.append(np.zeros(vector_size))

    return np.array(vectors)
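A quick shape check, assuming the module's tokenize splits the document into words and word_vectorize (whose tail is the truncated helper above) returns a one-hot vector of length vector_size:

doc_array = word_vectorize_doc("a short example review", vector_size=10_000, final_doc_len=100)
print(doc_array.shape)  # (100, 10000): final_doc_len rows, each of length vector_size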

Code example #7
File: chat.py    Project: yukti99/Chatbot
tags = data["tags"]

model = NeuralNet(input_size, hidden_size, output_size)
model.load_state_dict(model_state)
# Call model.eval() to put dropout and batch-normalization layers into
# evaluation mode before running inference; skipping this yields
# inconsistent results because those layers behave differently in training.
model.eval()

bot_name = "Yukti's Bot"
print("Let's chat! Type 'quit' to exit..")
while True:
    sentence = input('You: ')
    if sentence == "quit":
        break
    sentence = tokenize(sentence)
    # bagOfWords returns a numpy array of bag-of-words features
    X = bagOfWords(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)
    output = model(X)
    _, predicted = torch.max(output, dim=1)
    # predicted.item() gives the index of the predicted class/tag
    tag = tags[predicted.item()]

    # checking if the probability of the tag is high enough
    # applying softmax to get the actual probabilities
    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]
    if prob.item() > 0.70:
        # finding corresponding intent for this tag
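The confidence gate described in the comments above, isolated as a standalone sketch with made-up logits (not the project's code):

import torch

logits = torch.tensor([[1.2, 0.3, 2.5]])    # raw scores for one input
probs = torch.softmax(logits, dim=1)        # convert scores to probabilities
prob, predicted = torch.max(probs, dim=1)   # best class and its probability
if prob.item() > 0.70:
    print("confident enough, predicted class", predicted.item())
else:
    print("below the threshold, fall back to a default reply")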
Code example #8
File: chat.py    Project: YanisaW/CE-27
def question(sentence, userName, userID):
    sentence = tokenize(sentence)
    ignore_words = [
        '?', '!', '.', '"', '@', '#', '^', '=', '-', ',', '/', '*', '$', '&',
        '(', ')', ' '
    ]
    sentence = [w for w in sentence if w not in ignore_words]
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    output = model(X)
    _, predicted = torch.max(output, dim=1)

    tag = tags[predicted.item()]

    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]
    if prob.item() > 0.9:
        print(prob.item())
        for intent in intents['intents']:
            if tag == intent["tag"]:
                # print(tag)
                answer = random.choice(intent['responses'])
                result = None
                if tag == 'cancel' or tag == 'postpone':
                    result = fb_Data.get('/members',
                                         '/' + userID + '/appointment')
                    if result is None:
                        answer = 'คุณ(customer_name) ยังไม่ได้ทำการจองนัดหมายกรุณาจองนัดก่อนค่ะ \nคลิกที่เมนูเพื่อจองนัด'
                        tag = 'appointment'
                if '(name)' in answer:
                    name = ""
                    for dent in dentist['Dentist']:
                        for dayWeek in dent['OnDuty']:
                            if dayWeek == datetime.datetime.now().strftime(
                                    "%a"):  # day of week matches
                                name = name + "คุณหมอ" + dent['Name'] + " "
                    answer = answer.replace('(name)', name)
                if '(customer_name)' in answer:
                    answer = answer.replace('(customer_name)', userName)
                if '(date)' in answer or '(time)' in answer:
                    if result is not None:
                        for i, day in enumerate(result):
                            if i + 1 == (len(result)):
                                time_str = result[day]['time']
                                date_time_str = result[day]['date']
                                date_time_obj = datetime.datetime.strptime(
                                    date_time_str, '%Y-%m-%d')
                                date_time_format = date_time_obj.strftime(
                                    '%d-%m-%Y')
                        answer = answer.replace('(date)',
                                                date_time_format).replace(
                                                    '(time)', time_str)
                    else:
                        time = datetime.datetime.now()
                        answer = answer.replace('(date)',
                                                time.strftime("%x")).replace(
                                                    '(time)',
                                                    time.strftime("%X"))
                if '(list)' in answer or '(price)' in answer:
                    for dental in dental_lists["dental_lists"]:
                        for a in dental["homonyms"]:
                            if a in sentence:
                                list1 = (dental["homonyms"][0])
                                price = (dental["cost"])
                                print(list1, price)
                                answer = answer.replace('(list)',
                                                        list1).replace(
                                                            '(price)',
                                                            str(price))
                                break
                    if '(list)' in answer or '(price)' in answer:
                        answer = 'ราคา'

                return answer, tag
    else:
        return "ยิ้มสวยไม่เข้าใจค่ะ ลองถามใหม่อีกครั้งค่ะ", ""


# # test case
# test1 = ['สวัสดี', 'สวัสดีครับผม', "สบายดีไหม"]
# test2 = ['นัดทำฟัน', 'ขอทำฟัน', 'ขอไม่นัดหมอนะ']
## test3 = ['นัดอุดฟัน', 'อยากทำวีเนียร์', 'อยากจัดฟัน', 'ถอนฟัน', 'นัดเอ็กซเรย์', 'ทำฟันปลอมค่ะ', 'จะฟอกฟัน', 'ทำแอร์โฟลว', 'นัดทำเลเซอร์','นัดทั่วไป', 'ทำรากฟันเทียม', 'ฟันคุดค่ะ', 'พิมพ์ปากถ่ายรูป', 'ไม่นัดทำเอกซเรย์แล้ว']
# test4 = ['สอบถามราคา', 'จัดฟันราคาเท่าไหร่', 'ขัดหินปูนราคาเท่าไหร่', 'ตกแต่งเหงือกแพงมั้ย', 'ทำฟันปลอมกี่บาท']
# test5 = ['จัดฟันต้องทำยังไง', 'ขอคำปรึกษาหน่อยค่า', 'ไม่ปรึกษา']
# test6 = ['ขอยกเลิกการนัด','อยากยกเลิกนัด','ไม่ทำแล้ว','ไม่อยากทำ']
# test7 = ['ไม่ว่างไปทำฟันในวันนัด', 'ย้ายวันนัด', 'ไม่ว่าง จะยกเลิก']
# test8 = ['ผ่อนชำระได้ไหมคะ', 'จ่ายบัตรเครดิตได้มั้ย']
# test9 = ['คลีนิกเปิดกี่โมง', 'คลีนิกเปิดวันไหนบ้าง']
# test10 = ['อาเระ', 'skfdsokf','ทำไมตอบได้แล้ว','วันนี้วันอะไร']
# for i in test10:
#     print('You : ' + i)
#     print('Bot : ' + question(i))
#     print()
Code example #9
File: train.py    Project: yukti99/Chatbot
from model import NeuralNet

# opening our intents.json file
with open('intents.json', 'r') as f:
    intents = json.load(f)

all_words = []
tags = []
pr = []

for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    # loop over all the patterns
    for pattern in intent["patterns"]:
        w = tokenize(pattern)
        # don't append the whole list; extend all_words with its tokens
        all_words.extend(w)
        pr.append((w, tag))

Stop_words = stopwords.words('english')
# stemming and removing unnecessary words
all_words = [stem(w) for w in all_words if w not in Stop_words]
all_words = sorted(set(all_words))
tags = sorted(set(tags))
#print("T = ",tags)

# bag-of-words feature vectors will go in X_train, class labels in y_train
X_train = []
y_train = []
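The snippet cuts off before the arrays are filled; a plausible continuation, assuming the bagOfWords(tokens, all_words) helper seen in code example #7 and numpy imported as np (the original training script is not shown past this point):

# hypothetical continuation, not the project's verbatim code
for (pattern_tokens, tag) in pr:
    X_train.append(bagOfWords(pattern_tokens, all_words))  # bag-of-words features
    y_train.append(tags.index(tag))                        # tag index as class label

X_train = np.array(X_train)
y_train = np.array(y_train)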