Example #1
def load_text_pairs():
    """ Load text pairs from the specified file.

    Each text pair corresponds to a single line in the text file. Both texts (left and right one) in such pair are
    separated by the tab character. It is assumed that the text file has the UTF-8 encoding.

    :param file_name: name of file containing required text pairs.

    :return a 2-element tuple: the 1st contains list of left texts, the 2nd contains corresponding list of right texts.

    """
    input_texts, target_texts = data_getter.get_files(50)

    return input_texts, target_texts
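The data_getter module itself is not shown; a minimal file-based loader matching the docstring above might look like the following sketch (the function name and file handling are illustrative, not the actual data_getter code):

# Hypothetical sketch of a file-based loader matching the docstring above;
# data_getter.get_files() presumably wraps something similar.
def load_text_pairs_from_file(file_name):
    input_texts, target_texts = [], []
    with open(file_name, 'r', encoding='utf-8') as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue
            left, right = line.split('\t', 1)  # left and right texts are tab-separated
            input_texts.append(left)
            target_texts.append(right)
    return input_texts, target_texts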
Example #2
# %matplotlib inline
import re
import string  # needed for string.punctuation below
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
import data_getter
from math import ceil
from rouge import Rouge

batch_size = 20
epochs = 10
num_textos = 100


raw_data = data_getter.get_files(num_textos)
input_texts, target_texts = [], []
for input_text, target_text in raw_data:
    input_texts.append(input_text)
    target_texts.append(target_text)

# lines= pd.read_table('./conversa/mar.txt', names=['eng', 'mar'])
# print(lines.mar)
def normalize(texts):
    """Lowercase each text and strip single and double quote characters."""
    return [re.sub('["\']', '', text.lower()) for text in texts]

input_texts = normalize(input_texts)
target_texts = normalize(target_texts)

exclude = set(string.punctuation)  # set of all punctuation characters
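The excerpt ends before exclude is used; in this preprocessing style it would typically strip punctuation from both lists, roughly as follows (a sketch of the assumed next step, not code from the excerpt):

# Assumed continuation: drop every punctuation character from both text lists.
input_texts = [''.join(ch for ch in text if ch not in exclude) for text in input_texts]
target_texts = [''.join(ch for ch in text if ch not in exclude) for text in target_texts]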
Example #3
from __future__ import print_function

from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np
import data_getter
batch_size = 64  # Batch size for training.
epochs = 10  # Number of epochs to train for.
latent_dim = 10  # Latent dimensionality of the encoding space.
num_samples = 100  # Number of samples to train on.
# Path to the data txt file on disk.
data_path = 'fra-eng/fra.txt'

# Vectorize the data.
input_texts, target_texts = data_getter.get_files(num_samples)
input_characters = set()
target_characters = set()
# with open(data_path, 'r', encoding='utf-8') as f:
#     lines = f.read().split('\n')
for input_text, target_text in zip(input_texts, target_texts):
    input_characters.update(input_text)    # collect every distinct input character
    target_characters.update(target_text)  # collect every distinct target character

input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
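This excerpt follows the classic Keras character-level seq2seq recipe; the usual next step is to build character-to-index lookups and sequence-length bounds for one-hot vectorization (a sketch of that assumed continuation, not code shown in the excerpt):

# Assumed continuation: index each character for one-hot vectorization.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}
max_encoder_seq_length = max(len(text) for text in input_texts)
max_decoder_seq_length = max(len(text) for text in target_texts)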
Example #4
from data_getter import get_files  # needed for get_files() below, as in the examples above

batch_size = 64  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 7000  # Number of samples to train on.
# Path to the data txt file on disk.
data_path = '/home/maxtelll/Downloads/fra.txt'

# Vectorize the data.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

# with open(data_path, 'r', encoding='utf-8') as f:
#     lines = f.read().split('\n')
lines = get_files(num_samples)

for line in lines[:num_samples]:
    # get_files() already limits the number of samples, and unlike the
    # file-splitting variant there is no trailing empty line to drop.
    input_text, target_text = line
    # We use "tab" as the "start sequence" character for the targets,
    # and "\n" as the "end sequence" character.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text: