import re
import string

import pandas as pd
from nltk.tokenize.toktok import ToktokTokenizer


def read_movie():
    """Read movie-dialogue pairs, filter them by length, and clean the text.

    Returns (input_texts, target_texts). Each target is wrapped with "\t"
    as the start-of-sequence character and "\n" as the end-of-sequence
    character, following the Keras NMT tutorial:
    https://github.com/keras-team/keras/blob/master/examples/lstm_seq2seq.py
    """
    dir_data = "/home/llu/HardDisk/LiangqunLuGitHub/DLForChatbot/MS_DL/data/"
    mydata = dir_data + "movie_dialogue.txt"

    with open(mydata, 'r', encoding="ISO-8859-1") as f:
        lines = f.readlines()

    # Each line holds a tab-separated (input, response) pair.
    input_lines, target_lines = [], []
    for one in lines:
        fields = one.split("\t")
        input_lines.append(fields[0])
        target_lines.append(fields[1])
    print("Input and output lines: %d and %d" % (len(input_lines), len(target_lines)))

    # Tokenize and keep only pairs whose longer side is 3-5 tokens.
    tokenizer = ToktokTokenizer()  # create once instead of once per line
    input_texts, target_texts = [], []
    for ll in range(len(input_lines)):
        input_txt = tokenizer.tokenize(input_lines[ll], return_str=True)
        output_txt = tokenizer.tokenize(target_lines[ll], return_str=True)
        longest = max(len(input_txt.split()), len(output_txt.split()))
        if 2 < longest <= 5:
            input_texts.append(input_txt.lower().replace('\n', ''))
            target_texts.append(output_txt.lower().replace('\n', ''))

    # Clean the pairs: drop apostrophes, replace commas with spaces, and
    # strip all remaining punctuation.
    lines = pd.DataFrame({'eng': input_texts, 'fr': target_texts})
    lines.eng = lines.eng.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", ' ', x))
    lines.fr = lines.fr.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", ' ', x))
    exclude = set(string.punctuation)
    lines.eng = lines.eng.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
    lines.fr = lines.fr.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
    print(lines.head(n=10))

    # "\t" marks the start of a target sequence and "\n" marks the end.
    lines.fr = lines.fr.apply(lambda x: '\t' + x + '\n')

    input_texts = lines['eng'].tolist()
    target_texts = lines['fr'].tolist()
    max_encoder_seq_length = max(len(txt.split()) for txt in input_texts)
    max_decoder_seq_length = max(len(txt.split()) for txt in target_texts)
    print('Number of sample input:', len(input_texts))
    print('Number of sample target:', len(target_texts))
    print('Max sequence length for inputs:', max_encoder_seq_length)
    print('Max sequence length for outputs:', max_decoder_seq_length)

    return input_texts, target_texts
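
# --- Assumed module-level settings and helper ---
# read_reddit() below references n_samples, datatype, and is_ascii, none of
# which are defined in this section. A minimal sketch of what they might look
# like; the values and the helper's exact behavior are assumptions, not the
# original definitions.

n_samples = 100000   # hypothetical: number of Reddit pairs to read
datatype = "reddit"  # hypothetical: filename prefix for the sample CSV


def is_ascii(text):
    # True when every character is ASCII; used to drop non-English pairs.
    return all(ord(ch) < 128 for ch in text)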

def read_reddit():
    """Read Reddit (comment, reply) pairs from parallel files, filter them by
    length and ASCII-only content, and clean the text.

    Relies on the module-level globals n_samples and datatype and the helper
    is_ascii. Returns (input_texts, target_texts) with the same "\t"/"\n"
    target wrapping as read_movie(), following the Keras NMT tutorial:
    https://github.com/keras-team/keras/blob/master/examples/lstm_seq2seq.py
    """
    dir_data = "/home/llu/HardDisk/LiangqunLuGitHub/DLForChatbot/NMT_sentx/"
    train_from = dir_data + "train.from"
    train_to = dir_data + "train.to"

    # Read only the first n_samples lines of each parallel file.
    N = n_samples
    with open(train_from, 'r', encoding='utf-8') as f:
        input_lines = [next(f).strip() for _ in range(N)]
    with open(train_to, 'r', encoding='utf-8') as f:
        target_lines = [next(f).strip() for _ in range(N)]
    print("Input and output lines: %d and %d" % (len(input_lines), len(target_lines)))

    # Tokenize and keep only ASCII pairs whose longer side is 3-5 tokens.
    tokenizer = ToktokTokenizer()
    input_texts, target_texts = [], []
    for ll in range(len(input_lines)):
        input_txt = tokenizer.tokenize(input_lines[ll], return_str=True)
        output_txt = tokenizer.tokenize(target_lines[ll], return_str=True)
        longest = max(len(input_txt.split()), len(output_txt.split()))
        if 2 < longest <= 5 and is_ascii(input_txt) and is_ascii(output_txt):
            input_texts.append(input_txt.lower().replace('\n', ''))
            target_texts.append(output_txt.lower().replace('\n', ''))

    # Clean the pairs the same way as read_movie(): drop apostrophes, replace
    # commas with spaces, and strip all remaining punctuation.
    lines = pd.DataFrame({'eng': input_texts, 'fr': target_texts})
    lines.eng = lines.eng.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", ' ', x))
    lines.fr = lines.fr.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", ' ', x))
    exclude = set(string.punctuation)
    lines.eng = lines.eng.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
    lines.fr = lines.fr.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
    print(lines.head(n=10))
    lines.head(n=20).to_csv(datatype + "_first_pairs.csv")

    # "\t" marks the start of a target sequence and "\n" marks the end.
    lines.fr = lines.fr.apply(lambda x: '\t' + x + '\n')

    input_texts = lines['eng'].tolist()
    target_texts = lines['fr'].tolist()
    max_encoder_seq_length = max(len(txt.split()) for txt in input_texts)
    max_decoder_seq_length = max(len(txt.split()) for txt in target_texts)
    print('Number of sample input:', len(input_texts))
    print('Number of sample target:', len(target_texts))
    print('Max sequence length for inputs:', max_encoder_seq_length)
    print('Max sequence length for outputs:', max_decoder_seq_length)

    return input_texts, target_texts
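
# --- Usage sketch ---
# A minimal sketch of how these readers could feed the next step of the
# character-level seq2seq pipeline from the Keras lstm_seq2seq example linked
# above. The __main__ guard and variable names here are illustrative
# assumptions, not code from the original script.

if __name__ == "__main__":
    input_texts, target_texts = read_movie()  # or read_reddit()

    # Character vocabularies, as in the Keras tutorial; the "\t"/"\n"
    # wrapping added above becomes the start/end characters of each target.
    input_characters = sorted(set(''.join(input_texts)))
    target_characters = sorted(set(''.join(target_texts)))
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    print('Unique input tokens:', num_encoder_tokens)
    print('Unique output tokens:', num_decoder_tokens)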