def load_text_pairs(num_pairs=50):
    """Load text pairs through the ``data_getter`` helper.

    Each text pair corresponds to a single line in the underlying text
    file; the left and right texts in a pair are separated by a tab
    character, and the file is assumed to use UTF-8 encoding.

    :param num_pairs: number of text pairs to load. Defaults to 50,
        the value that was previously hard-coded, so existing callers
        are unaffected.
    :return: a 2-element tuple — the first element is the list of left
        texts, the second the corresponding list of right texts.
    """
    # data_getter.get_files is expected to return the two parallel
    # lists already split — TODO confirm against data_getter's API,
    # since other call sites in this file unpack it differently.
    input_texts, target_texts = data_getter.get_files(num_pairs)
    return input_texts, target_texts
# %matplotlib inline
import re
import string
from math import ceil

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
from rouge import Rouge

import data_getter

batch_size = 20
epochs = 10
num_textos = 100  # number of source documents to load

# Each element returned by get_files is a (source_text, target_text)
# pair — presumably; verify against data_getter.
raw_data = data_getter.get_files(num_textos)
input_texts = [pair[0] for pair in raw_data]
target_texts = [pair[1] for pair in raw_data]


def _normalize(text):
    """Lower-case *text* and strip single/double quote characters.

    Equivalent to the previous three-pass cleanup (lower-casing, then
    removing apostrophes, then removing double quotes) in one step.
    """
    return re.sub('[\'"]', '', text.lower())


input_texts = [_normalize(t) for t in input_texts]
target_texts = [_normalize(t) for t in target_texts]

# `import string` was missing in the original, making this line raise
# NameError; fixed above.
exclude = set(string.punctuation)  # Set of all special characters
from __future__ import print_function from keras.models import Model from keras.layers import Input, LSTM, Dense import numpy as np import data_getter batch_size = 64 # Batch size for training. epochs = 10 # Number of epochs to train for. latent_dim = 10 # Latent dimensionality of the encoding space. num_samples = 100 # Number of samples to train on. # Path to the data txt file on disk. data_path = 'fra-eng/fra.txt' # Vectorize the data. input_texts, target_texts = data_getter.get_files(num_samples) input_characters = set() target_characters = set() # with open(data_path, 'r', encoding='utf-8') as f: # lines = f.read().split('\n') for text, tag in zip(input_texts, target_texts): for char in text: if char not in input_characters: input_characters.add(char) for char in tag: if char not in target_characters: target_characters.add(char) input_characters = sorted(list(input_characters)) target_characters = sorted(list(target_characters)) num_encoder_tokens = len(input_characters) num_decoder_tokens = len(target_characters)
batch_size = 64 # Batch size for training. epochs = 100 # Number of epochs to train for. latent_dim = 256 # Latent dimensionality of the encoding space. num_samples = 7000 # Number of samples to train on. # Path to the data txt file on disk. data_path = '/home/maxtelll/Downloads/fra.txt' # Vectorize the data. input_texts = [] target_texts = [] input_characters = set() target_characters = set() # with open(data_path, 'r', encoding='utf-8') as f: # lines = f.read().split('\n') lines = get_files(num_samples) for line in lines[:min(num_samples, len(lines) - 1)]: # input_text, target_text = line.split('\t') # We use "tab" as the "start sequence" character # for the targets, and "\n" as "end sequence" character. input_text, target_text = line target_text = '\t' + target_text + '\n' input_texts.append(input_text) target_texts.append(target_text) for char in input_text: if char not in input_characters: input_characters.add(char) for char in target_text: