Example #1
import re
import string

import pandas as pd
from nltk.tokenize.toktok import ToktokTokenizer


def read_movie():

    dir_data = "/home/llu/HardDisk/LiangqunLuGitHub/DLForChatbot/MS_DL/data/"
    mydata = dir_data + "movie_dialogue.txt"
    with open(mydata, 'r', encoding="ISO-8859-1") as f:
        lines = f.readlines()

    input_lines, target_lines = list(), list()
    for one in lines:
        pair = one.split("\t")
        if len(pair) < 2:  # skip malformed lines without a tab separator
            continue
        input_lines.append(pair[0])
        target_lines.append(pair[1])
    print("Input and output lines: %d and %d" % (len(input_lines), len(target_lines)))
    
    # Tokenize and keep only short pairs (3 to 5 tokens on the longer side).
    input_texts = []
    target_texts = []
    tokenizer = ToktokTokenizer()

    for ll in range(len(input_lines)):

        input_txt = tokenizer.tokenize(input_lines[ll], return_str=True)
        output_txt = tokenizer.tokenize(target_lines[ll], return_str=True)

        if 2 < max(len(input_txt.split()), len(output_txt.split())) <= 5:

            input_text = input_txt.lower()
            target_text = output_txt.lower()

            # Strip embedded newlines here; "\t" (start-of-sequence) and
            # "\n" (end-of-sequence) markers are added to the targets below.
            input_text = input_text.replace('\n', '')
            target_text = target_text.replace('\n', '')
            input_texts.append(input_text)
            target_texts.append(target_text)

    # Build a DataFrame of pairs; column names follow the Keras NMT tutorial:
    # https://github.com/keras-team/keras/blob/master/examples/lstm_seq2seq.py
    lines = pd.DataFrame({'eng': input_texts, 'fr': target_texts})

    # Normalize: lowercase, drop apostrophes, turn commas into spaces,
    # then strip all remaining punctuation.
    lines.eng = lines.eng.apply(lambda x: x.lower())
    lines.fr = lines.fr.apply(lambda x: x.lower())
    lines.eng = lines.eng.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", ' ', x))
    lines.fr = lines.fr.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", ' ', x))

    exclude = set(string.punctuation)
    lines.eng = lines.eng.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
    lines.fr = lines.fr.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
    print(lines.head(n=10))
    
    # Mark targets: "\t" is the start-of-sequence token, "\n" the end token.
    lines.fr = lines.fr.apply(lambda x: '\t' + x + '\n')
    
    input_texts = lines['eng'].tolist()
    target_texts = lines['fr'].tolist()
    
    max_encoder_seq_length = max(len(txt.split()) for txt in input_texts)
    max_decoder_seq_length = max(len(txt.split()) for txt in target_texts)

    print('Number of input samples:', len(input_texts))
    print('Number of target samples:', len(target_texts))
    print('Max sequence length for inputs:', max_encoder_seq_length)
    print('Max sequence length for outputs:', max_decoder_seq_length)
    
    return input_texts, target_texts
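
# Usage sketch (not part of the original example): the returned pairs can be
# one-hot encoded at the character level following the Keras lstm_seq2seq
# tutorial cited above. Everything below is a minimal sketch of that
# tutorial's approach; numpy is the only extra dependency.
import numpy as np

def vectorize_pairs(input_texts, target_texts):
    input_characters = sorted(set(''.join(input_texts)))
    target_characters = sorted(set(''.join(target_texts)))
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    max_encoder_seq_length = max(len(txt) for txt in input_texts)
    max_decoder_seq_length = max(len(txt) for txt in target_texts)

    input_token_index = {ch: i for i, ch in enumerate(input_characters)}
    target_token_index = {ch: i for i, ch in enumerate(target_characters)}

    encoder_input_data = np.zeros(
        (len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype='float32')
    decoder_input_data = np.zeros(
        (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')
    decoder_target_data = np.zeros(
        (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')

    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, ch in enumerate(input_text):
            encoder_input_data[i, t, input_token_index[ch]] = 1.0
        for t, ch in enumerate(target_text):
            decoder_input_data[i, t, target_token_index[ch]] = 1.0
            if t > 0:
                # decoder_target_data is ahead of decoder_input_data by one step
                decoder_target_data[i, t - 1, target_token_index[ch]] = 1.0

    return encoder_input_data, decoder_input_data, decoder_target_data

# Example call: enc_in, dec_in, dec_out = vectorize_pairs(*read_movie())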
Example #2
def read_reddit():

    # Assumes the same imports as Example #1, plus the module-level names
    # n_samples, datatype, and is_ascii (see the sketch after this function).
    dir_data = "/home/llu/HardDisk/LiangqunLuGitHub/DLForChatbot/NMT_sentx/"
    train_from = dir_data + "train.from"
    train_to = dir_data + "train.to"

    # Read only the first n_samples lines from each file.
    N = n_samples
    with open(train_from, 'r', encoding='utf-8') as f:
        input_lines = [next(f).strip() for _ in range(N)]

    with open(train_to, 'r', encoding='utf-8') as f:
        target_lines = [next(f).strip() for _ in range(N)]

    print("Input and output lines: %d and %d"%(len(input_lines), len(target_lines)) ) 
    # Vectorize the data.
    input_texts = []
    target_texts = []  
    #check english words
    #d = enchant.Dict("en_US")
    for ll in range(len(input_lines)):
        
        input_txt = ToktokTokenizer().tokenize(input_lines[ll], return_str=True)
        output_txt = ToktokTokenizer().tokenize(target_lines[ll], return_str=True)
        
        #if min(len(input_txt.split()), len(output_txt.split())) > 5:
            #continue
        #else:
        if max(len(input_txt.split()), len(output_txt.split())) <= 5 and max(len(input_txt.split()), len(output_txt.split())) > 2 and is_ascii(input_txt) and is_ascii(output_txt):
        
            #input_text, target_text = line.split('\t')
            #input_text = input_lines[ll].lower()
            #target_text = target_lines[ll].lower()
            input_text = input_txt.lower()
            target_text = output_txt.lower()            
            
            # We use "tab" as the "start sequence" character
            # for the targets, and "\n" as "end sequence" character.
            input_text = input_text.replace('\n', '')            
            target_text = target_text.replace('\n', '')
            input_texts.append(input_text)
            target_texts.append(target_text)
                                  
    # NMT concepts and parameters
    # Keras NMT tutorial https://github.com/keras-team/keras/blob/master/examples/lstm_seq2seq.py           
    #clean input texts and target texts
    
    
    #input_texts, target_texts = clean_input_pairs(input_texts, target_texts)
    lines = pd.DataFrame({'eng': input_texts,'fr': target_texts} ) 

    lines.eng=lines.eng.apply(lambda x: x.lower())
    lines.fr=lines.fr.apply(lambda x: x.lower())
    lines.eng=lines.eng.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", ' ', x))
    lines.fr=lines.fr.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", ' ', x))
    
    exclude = set(string.punctuation)
    lines.eng=lines.eng.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
    lines.fr=lines.fr.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
    print(lines.head(n = 10))
    
    # Dump the first pairs for inspection, then mark targets: "\t" is the
    # start-of-sequence token, "\n" the end token.
    lines.head(n=20).to_csv(datatype + "_first_pairs.csv")

    lines.fr = lines.fr.apply(lambda x: '\t' + x + '\n')
    
    input_texts = lines['eng'].tolist()
    target_texts = lines['fr'].tolist()
    
    max_encoder_seq_length = max(len(txt.split()) for txt in input_texts)
    max_decoder_seq_length = max(len(txt.split()) for txt in target_texts)

    print('Number of input samples:', len(input_texts))
    print('Number of target samples:', len(target_texts))
    print('Max sequence length for inputs:', max_encoder_seq_length)
    print('Max sequence length for outputs:', max_decoder_seq_length)
    
    return input_texts, target_texts
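
# Note: the example above relies on three module-level names defined elsewhere
# in the source file. A minimal sketch of plausible definitions follows; the
# exact values and the is_ascii implementation are assumptions, not taken from
# the original code.
n_samples = 10000        # how many lines to read from train.from / train.to
datatype = "reddit"      # prefix for the "<datatype>_first_pairs.csv" dump

def is_ascii(text):
    # Heuristic English filter: True if every character is 7-bit ASCII.
    return all(ord(ch) < 128 for ch in text)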