def reply(chat_id):
    chat = Chat.query.filter_by(id=chat_id).first_or_404()
    request_text = request.form['msg']
    request_msg = Message(chat_id=chat_id,
                          text=request_text,
                          author=Message.AUTHOR_USER,
                          order=chat.messages_count + 1)

    if app.config['NEURAL_LOGIC']:
        from app import sess, model, enc_vocab, rev_dec_vocab
        response_text = execute.decode_line(sess, model, enc_vocab,
                                            rev_dec_vocab, request_text)
    else:
        # Echo mode: without the neural model, just repeat the user's message.
        response_text = request.form['msg']

    response_msg = Message(chat_id=chat_id,
                           text=response_text,
                           author=Message.AUTHOR_BOT,
                           order=chat.messages_count + 2)

    # Record the vocabulary used by the user and by the bot.
    for word in basic_tokenizer(request_text.encode()):
        db.session.add(UserWord(word=word))
    for word in basic_tokenizer(response_text.encode()):
        db.session.add(BotWord(word=word))

    chat.messages_count = chat.messages_count + 2
    db.session.add(request_msg)
    db.session.add(response_msg)
    db.session.commit()
    return jsonify(response_msg.as_dict())
def init_database_vocabulary():
    max_vocabulary_size = 60000
    data_path = 'data/data.a'
    vocab = {}
    words = 0
    with gfile.GFile(data_path, mode="rb") as f:
        counter = 0
        for line in f:
            counter += 1
            if counter % 100000 == 0:
                print(" processing line %d" % counter)
            tokens = data_utils.basic_tokenizer(line)
            for w in tokens:
                # Normalize digits so e.g. "1999" and "2017" map to the same token.
                word = re.sub(data_utils._DIGIT_RE, b"0", w)
                words += 1
                if word in vocab:
                    vocab[word] += 1
                else:
                    vocab[word] = 1
    vocab_list = sorted(vocab, key=vocab.get, reverse=True)
    print('>> Full Vocabulary Size :', len(vocab_list))
    if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
    # Frequency is stored as occurrences per million tokens.
    db.session.add_all(
        VocabularyWord(word=w, frequency=1_000_000 * vocab[w] / words)
        for w in vocab_list)
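# For reference: every snippet here calls basic_tokenizer (and several use
# _DIGIT_RE) from a data_utils module. Below is a minimal sketch of those
# helpers in the style of the TensorFlow translate-tutorial data_utils these
# projects appear to follow -- an assumption, not necessarily the exact module
# used in each snippet.
import re

_WORD_SPLIT = re.compile(b"([.,!?\"':;)(])")  # split off common punctuation
_DIGIT_RE = re.compile(br"\d")                # digits get normalized to b"0"

def basic_tokenizer(sentence):
    """Very basic tokenizer: split a byte string on whitespace, then punctuation."""
    words = []
    for space_separated_fragment in sentence.strip().split():
        words.extend(_WORD_SPLIT.split(space_separated_fragment))
    return [w for w in words if w]

# Example: basic_tokenizer(b"Hello, world 42!")
#   -> [b'Hello', b',', b'world', b'42', b'!']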
def reply():
    print(request.form['msg'])
    res = list()
    for fir in request.form['msg'].split(","):
        res.append(fir)
    res_set = set(res)  # remove some duplicated lyrics
    for i in range(10):
        n = 4
        # Shrink the context window until it fits within 65 tokens.
        while len(basic_tokenizer(",".join(res[-n:]))) > 65 and n > 1:
            n -= 1
        cand = execute.decode_line(sess, model, enc_vocab, rev_dec_vocab,
                                   u','.join(res[-n:]))
        # If the candidate repeats an earlier line, retry with a shorter context.
        while cand in res_set and n > 1:
            n -= 1
            cand = execute.decode_line(sess, model, enc_vocab, rev_dec_vocab,
                                       u','.join(res[-n:]))
        if cand != res[-1]:
            while len(res) > 1 and res[-1] == res[-2]:
                res.pop()
            res.append(cand)
            res_set.add(cand)
    while len(res) > 1 and res[-1] == res[-2]:
        res.pop()
    return jsonify({'text': "\n".join(res)})
def build_input_var(self, user_input):
    words = data_utils.basic_tokenizer(user_input)
    words_index = []
    unknown_words = []
    for word in words:
        if word in self.vocab.word2index:  # keep known words
            words_index.append(self.vocab.word2index[word])
        else:
            unknown_words.append(word)
    if len(unknown_words) > 0:
        print('unknown_words: ' + str(unknown_words))
    # append EOS token
    words_index.append(EOS_token)
    if config.reverse_input:
        words_index = words_index[::-1]
    if len(words_index) > 0:
        input_var = Variable(torch.LongTensor([words_index])).transpose(0, 1)
        if config.use_cuda:
            input_var = input_var.cuda()
        # input_var size (length, 1)
        return input_var
    return None
def normal_extract(dialogues, id2sentence):
    questions, answers = [], []
    for ids in dialogues:
        join_func = lambda id: ' '.join(
            data_utils.basic_tokenizer(id2sentence[id]))
        sentences = [join_func(id) for id in ids]
        questions.extend(sentences[:-1])
        answers.extend(sentences[1:])
    return questions, answers
def augment_extract(dialogues, id2sentence):
    questions, answers = [], []
    augment_q, augment_a = [], []
    for ids in dialogues:
        join_func = lambda id: ' '.join(
            data_utils.basic_tokenizer(id2sentence[id]))
        sentences = [join_func(id) for id in ids]
        augment_sentence(sentences[:-1], sentences[1:], augment_q, augment_a)
    assert len(augment_q) == len(augment_a)
    return augment_q, augment_a
def sample(FLAGS):
    # Load the data needed to convert your sentence
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/en.p', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/sp.p', max_vocab_size=5000, target_lang=True)

    # Change FLAGS parameters
    FLAGS.batch_size = 1
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)
    FLAGS.sp_max_len = max(sp_seq_lens) + 1  # GO token

    # Process sample sentence
    inference_sentence = ["I like to play tennis and eat sandwiches."]

    # Split into tokens
    tokenized = []
    for i in range(len(inference_sentence)):
        tokenized.append(basic_tokenizer(inference_sentence[i]))

    # Convert data to token ids
    data_as_tokens, sample_en_seq_lens = data_to_token_ids(
        tokenized, en_vocab_dict, target_lang=False, normalize_digits=True)

    # make dummy_sp_inputs
    dummy_sp_inputs = np.array([[GO_ID] * FLAGS.sp_max_len])
    sample_sp_seq_lens = np.array([len(dummy_sp_inputs)])

    print(data_as_tokens)
    print(sample_en_seq_lens)
    print(dummy_sp_inputs)
    print(sample_sp_seq_lens)

    with tf.Session() as sess:
        # Load trained model
        model = create_model(sess, FLAGS, forward_only=True)
        y_pred = model.step(sess, FLAGS,
                            batch_encoder_inputs=data_as_tokens,
                            batch_decoder_inputs=dummy_sp_inputs,
                            batch_targets=None,
                            batch_en_seq_lens=sample_en_seq_lens,
                            batch_sp_seq_lens=sample_sp_seq_lens,
                            dropout=0.0,
                            forward_only=True,
                            sampling=True)

        # compose the predicted sp sentence
        sp_sentence = []
        for idx in y_pred[0]:
            sp_sentence.append(sp_rev_vocab_dict[idx])
        print(" ".join([word for word in sp_sentence]))
def create_vocab(self, key, vocabfile):
    print("creating vocab %s" % key)
    vocab = {}
    cursor = self.client[self.corpus][self.col].find()
    for pair in cursor:
        line = tf.compat.as_bytes(pair[key].lower())
        tokens = data_utils.basic_tokenizer(line)
        for w in tokens:
            word = data_utils._DIGIT_RE.sub(b"0", w)
            if word in vocab:
                vocab[word] += 1
            else:
                vocab[word] = 1
    vocab_list = data_utils._START_VOCAB + sorted(
        vocab, key=vocab.get, reverse=True)
    with gfile.GFile(vocabfile, mode="wb") as v_file:
        for w in vocab_list:
            v_file.write(w + b"\n")
def export_dialogue_corpus():
    dialogue_ids_list = load_conversations(
        os.path.join(os.path.abspath(DATA_PATH), MOVIE_CONVERSATIONS))
    id2sentence = load_movie_lines(
        os.path.join(os.path.abspath(DATA_PATH), MOVIE_LINES))
    questions, answers = [], []
    for ids in dialogue_ids_list:
        # Only keep an even number of turns so every question has an answer.
        length = len(ids) if len(ids) % 2 == 0 else len(ids) - 1
        for i in range(length):
            sentence = ' '.join(data_utils.basic_tokenizer(id2sentence[ids[i]]))
            if i % 2 == 0:
                questions.append(sentence)
            else:
                answers.append(sentence)
    dialogue_couples = list(zip(questions, answers))
    print('Dialogue couples: %d' % len(dialogue_couples))
    # random.shuffle(dialogue_corpus)
    with open(os.path.join(os.path.abspath(DATA_PATH), DIALOGUE_CORPUS), 'w') as dialogue_file:
        for question, answer in dialogue_couples:
            dialogue_file.write('%s +++$+++ %s\n' % (question, answer))
def export_dialogue_corpus():
    dialogues = load_conversations(DATA_PATH + MOVIE_CONVERSATIONS)
    id2sentence = load_movie_lines(DATA_PATH + MOVIE_LINES)
    questions, answers = [], []
    for ids in dialogues:
        length = len(ids) if len(ids) % 2 == 0 else len(ids) - 1
        for i in range(length):
            sentence = ' '.join(data_utils.basic_tokenizer(
                id2sentence[ids[i]]))
            if i % 2 == 0:
                questions.append(sentence)
            else:
                answers.append(sentence)
    # zip() returns an iterator in Python 3; materialize it so len() works
    # and it can be iterated again below.
    dialogue_groups = list(zip(questions, answers))
    print('Dialogue pairs: %d' % len(dialogue_groups))
    # random.shuffle(dialogue_corpus)
    with open(DATA_PATH + DIALOGUE_CORPUS, 'w') as my_file:
        for a, b in dialogue_groups:
            my_file.write('%s +++$+++ %s\n' % (a, b))
def build_input_var(self, user_input):
    words = data_utils.basic_tokenizer(user_input)
    words_index = []
    unknown_words = []
    for word in words:
        if word in self.vocab.word2index:  # keep known words
            words_index.append(self.vocab.word2index[word])
        else:
            unknown_words.append(word)
    if len(unknown_words) > 0:
        print('unknown_words: ' + str(unknown_words))
    # append EOS token
    words_index.append(EOS_token)
    if len(words_index) > 0:
        input_var = torch.tensor([words_index]).transpose(0, 1)
        if USE_CUDA:
            input_var = input_var.cuda()
        # input_var size (length, 1)
        return input_var
    return None