def reply(chat_id):
    chat = Chat.query.filter_by(id=chat_id).first_or_404()
    request_text = request.form['msg']
    request_msg = Message(chat_id=chat_id,
                          text=request_text,
                          author=Message.AUTHOR_USER,
                          order=chat.messages_count + 1)

    if app.config['NEURAL_LOGIC']:
        from app import sess, model, enc_vocab, rev_dec_vocab
        response_text = execute.decode_line(sess, model, enc_vocab,
                                            rev_dec_vocab, request_text)
    else:
        # Echo mode: without the neural model, just repeat the user's message.
        response_text = request.form['msg']

    response_msg = Message(chat_id=chat_id,
                           text=response_text,
                           author=Message.AUTHOR_BOT,
                           order=chat.messages_count + 2)

    # Record the vocabulary used by the user and by the bot.
    for word in basic_tokenizer(request_text.encode()):
        db.session.add(UserWord(word=word))
    for word in basic_tokenizer(response_text.encode()):
        db.session.add(BotWord(word=word))

    chat.messages_count = chat.messages_count + 2
    db.session.add(request_msg)
    db.session.add(response_msg)
    db.session.commit()
    return jsonify(response_msg.as_dict())
def init_database_vocabulary():
    max_vocabulary_size = 60000
    data_path = 'data/data.a'
    vocab = {}
    words = 0
    with gfile.GFile(data_path, mode="rb") as f:
        counter = 0
        for line in f:
            counter += 1
            if counter % 100000 == 0:
                print(" processing line %d" % counter)
            tokens = data_utils.basic_tokenizer(line)
            for w in tokens:
                # Normalize digits so e.g. "1999" and "2017" map to the same token.
                word = re.sub(data_utils._DIGIT_RE, b"0", w)
                words += 1
                if word in vocab:
                    vocab[word] += 1
                else:
                    vocab[word] = 1
    vocab_list = sorted(vocab, key=vocab.get, reverse=True)
    print('>> Full Vocabulary Size :', len(vocab_list))
    if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
    # Frequency is stored as occurrences per million tokens.
    db.session.add_all(
        VocabularyWord(word=w, frequency=1_000_000 * vocab[w] / words)
        for w in vocab_list)
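# For reference: every snippet here calls basic_tokenizer (and several use
# _DIGIT_RE) from a data_utils module. Below is a minimal sketch of those
# helpers in the style of the TensorFlow translate-tutorial data_utils these
# projects appear to follow -- an assumption, not necessarily the exact module
# used in each snippet.
import re

_WORD_SPLIT = re.compile(b"([.,!?\"':;)(])")  # split off common punctuation
_DIGIT_RE = re.compile(br"\d")                # digits get normalized to b"0"

def basic_tokenizer(sentence):
    """Very basic tokenizer: split a byte string on whitespace, then punctuation."""
    words = []
    for space_separated_fragment in sentence.strip().split():
        words.extend(_WORD_SPLIT.split(space_separated_fragment))
    return [w for w in words if w]

# Example: basic_tokenizer(b"Hello, world 42!")
#   -> [b'Hello', b',', b'world', b'42', b'!']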
def reply():
    print(request.form['msg'])
    res = list()
    for fir in request.form['msg'].split(","):
        res.append(fir)
    res_set = set(res)  # remove some duplicated lyrics
    for i in range(10):
        n = 4
        # Shrink the context window until it fits within 65 tokens.
        while len(basic_tokenizer(",".join(res[-n:]))) > 65 and n > 1:
            n -= 1
        cand = execute.decode_line(sess, model, enc_vocab, rev_dec_vocab,
                                   u','.join(res[-n:]))
        # If the candidate repeats an earlier line, retry with a shorter context.
        while cand in res_set and n > 1:
            n -= 1
            cand = execute.decode_line(sess, model, enc_vocab, rev_dec_vocab,
                                       u','.join(res[-n:]))
        if cand != res[-1]:
            while len(res) > 1 and res[-1] == res[-2]:
                res.pop()
            res.append(cand)
            res_set.add(cand)
    while len(res) > 1 and res[-1] == res[-2]:
        res.pop()
    return jsonify({'text': "\n".join(res)})
def build_input_var(self, user_input):
    words = data_utils.basic_tokenizer(user_input)
    words_index = []
    unknown_words = []
    for word in words:
        if word in self.vocab.word2index:  # keep known words
            words_index.append(self.vocab.word2index[word])
        else:
            unknown_words.append(word)
    if len(unknown_words) > 0:
        print('unknown_words: ' + str(unknown_words))
    # append EOS token
    words_index.append(EOS_token)
    if config.reverse_input:
        words_index = words_index[::-1]
    if len(words_index) > 0:
        input_var = Variable(torch.LongTensor([words_index])).transpose(0, 1)
        if config.use_cuda:
            input_var = input_var.cuda()
        # input_var size (length, 1)
        return input_var
    return None
def normal_extract(dialogues, id2sentence):
    questions, answers = [], []
    for ids in dialogues:
        join_func = lambda id: ' '.join(
            data_utils.basic_tokenizer(id2sentence[id]))
        sentences = [join_func(id) for id in ids]
        questions.extend(sentences[:-1])
        answers.extend(sentences[1:])
    return questions, answers
def augment_extract(dialogues, id2sentence):
    questions, answers = [], []
    augment_q, augment_a = [], []
    for ids in dialogues:
        join_func = lambda id: ' '.join(
            data_utils.basic_tokenizer(id2sentence[id]))
        sentences = [join_func(id) for id in ids]
        augment_sentence(sentences[:-1], sentences[1:], augment_q, augment_a)
    assert len(augment_q) == len(augment_a)
    return augment_q, augment_a
def sample(FLAGS):
    # Load the data needed to convert your sentence
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/en.p', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/sp.p', max_vocab_size=5000, target_lang=True)

    # Change FLAGS parameters
    FLAGS.batch_size = 1
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)
    FLAGS.sp_max_len = max(sp_seq_lens) + 1  # GO token

    # Process sample sentence
    inference_sentence = ["I like to play tennis and eat sandwiches."]

    # Split into tokens
    tokenized = []
    for i in range(len(inference_sentence)):
        tokenized.append(basic_tokenizer(inference_sentence[i]))

    # Convert data to token ids
    data_as_tokens, sample_en_seq_lens = data_to_token_ids(
        tokenized, en_vocab_dict, target_lang=False, normalize_digits=True)

    # make dummy_sp_inputs
    dummy_sp_inputs = np.array([[GO_ID] * FLAGS.sp_max_len])
    sample_sp_seq_lens = np.array([len(dummy_sp_inputs)])

    print(data_as_tokens)
    print(sample_en_seq_lens)
    print(dummy_sp_inputs)
    print(sample_sp_seq_lens)

    with tf.Session() as sess:
        # Load trained model
        model = create_model(sess, FLAGS, forward_only=True)
        y_pred = model.step(sess, FLAGS,
                            batch_encoder_inputs=data_as_tokens,
                            batch_decoder_inputs=dummy_sp_inputs,
                            batch_targets=None,
                            batch_en_seq_lens=sample_en_seq_lens,
                            batch_sp_seq_lens=sample_sp_seq_lens,
                            dropout=0.0,
                            forward_only=True,
                            sampling=True)

        # compose the predicted sp sentence
        sp_sentence = []
        for idx in y_pred[0]:
            sp_sentence.append(sp_rev_vocab_dict[idx])
        print(" ".join([word for word in sp_sentence]))
def create_vocab(self, key, vocabfile):
    print("creating vocab %s" % key)
    vocab = {}
    cursor = self.client[self.corpus][self.col].find()
    for pair in cursor:
        line = tf.compat.as_bytes(pair[key].lower())
        tokens = data_utils.basic_tokenizer(line)
        for w in tokens:
            word = data_utils._DIGIT_RE.sub(b"0", w)
            if word in vocab:
                vocab[word] += 1
            else:
                vocab[word] = 1
    vocab_list = data_utils._START_VOCAB + sorted(
        vocab, key=vocab.get, reverse=True)
    with gfile.GFile(vocabfile, mode="wb") as v_file:
        for w in vocab_list:
            v_file.write(w + b"\n")
def export_dialogue_corpus():
    dialogue_ids_list = load_conversations(
        os.path.join(os.path.abspath(DATA_PATH), MOVIE_CONVERSATIONS))
    id2sentence = load_movie_lines(
        os.path.join(os.path.abspath(DATA_PATH), MOVIE_LINES))
    questions, answers = [], []
    for ids in dialogue_ids_list:
        # Only keep an even number of turns so every question has an answer.
        length = len(ids) if len(ids) % 2 == 0 else len(ids) - 1
        for i in range(length):
            sentence = ' '.join(data_utils.basic_tokenizer(id2sentence[ids[i]]))
            if i % 2 == 0:
                questions.append(sentence)
            else:
                answers.append(sentence)
    dialogue_couples = list(zip(questions, answers))
    print('Dialogue couples: %d' % len(dialogue_couples))
    # random.shuffle(dialogue_corpus)
    with open(os.path.join(os.path.abspath(DATA_PATH), DIALOGUE_CORPUS), 'w') as dialogue_file:
        for question, answer in dialogue_couples:
            dialogue_file.write('%s +++$+++ %s\n' % (question, answer))
def export_dialogue_corpus():
    dialogues = load_conversations(DATA_PATH + MOVIE_CONVERSATIONS)
    id2sentence = load_movie_lines(DATA_PATH + MOVIE_LINES)
    questions, answers = [], []
    for ids in dialogues:
        length = len(ids) if len(ids) % 2 == 0 else len(ids) - 1
        for i in range(length):
            sentence = ' '.join(data_utils.basic_tokenizer(
                id2sentence[ids[i]]))
            if i % 2 == 0:
                questions.append(sentence)
            else:
                answers.append(sentence)
    # zip() returns an iterator in Python 3; materialize it so len() works
    # and it can be iterated again below.
    dialogue_groups = list(zip(questions, answers))
    print('Dialogue pairs: %d' % len(dialogue_groups))
    # random.shuffle(dialogue_corpus)
    with open(DATA_PATH + DIALOGUE_CORPUS, 'w') as my_file:
        for a, b in dialogue_groups:
            my_file.write('%s +++$+++ %s\n' % (a, b))
def build_input_var(self, user_input):
    words = data_utils.basic_tokenizer(user_input)
    words_index = []
    unknown_words = []
    for word in words:
        if word in self.vocab.word2index:  # keep known words
            words_index.append(self.vocab.word2index[word])
        else:
            unknown_words.append(word)
    if len(unknown_words) > 0:
        print('unknown_words: ' + str(unknown_words))
    # append EOS token
    words_index.append(EOS_token)
    if len(words_index) > 0:
        input_var = torch.tensor([words_index]).transpose(0, 1)
        if USE_CUDA:
            input_var = input_var.cuda()
        # input_var size (length, 1)
        return input_var
    return None