import argparse
import os

import datasets


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset",
                        choices=["cornell", "opensubs"],
                        help="Name of the dataset.")
    parser.add_argument("--max_len",
                        type=int,
                        default=10,
                        help="Max length of sentences to consider.")
    args = parser.parse_args()

    dataset_path = os.path.join("data", args.dataset)
    if args.dataset == "cornell":
        data = datasets.readCornellData(dataset_path, max_len=args.max_len)
    elif args.dataset == "opensubs":
        data = datasets.readOpensubsData(dataset_path, max_len=args.max_len)
    else:
        raise ValueError("Unrecognized dataset: {!r}".format(args.dataset))

    print("Size of dataset: {}".format(len(data)))
    print("First 10 training pairs:")
    for item in data[:10]:
        print(item)

    print("Writing to a .txt file")
    with open("data/req-res.txt", "w") as f:
        for item in data:
            f.write(item[0])
            f.write('\t')
            f.write(item[1])
            f.write('\n')

    print("Done...")
Example 2
    def make_dataset(self, test_size=0.05, seed=12345):
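        # assumes module-level imports: numpy as np, tensorflow as tf,
        # sklearn.model_selection.train_test_split, and the project's datasets readers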
        # read data
        cornell = datasets.readCornellData('cornell/',
                                           max_len=self.sent_len,
                                           kind=self.kind)

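        # wrap each input and reply with explicit <start> / <eos> markers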
        sentences = ['<start>' + ' ' + i[0] + ' ' + '<eos>' for i in cornell]
        replies = ['<start>' + ' ' + i[1] + ' ' + '<eos>' for i in cornell]

        # filter sentences by length
        sent_mask = [
            self.min_len <= len(i.split(' ')) <= self.max_len
            for i in sentences
        ]
        replies_mask = [
            self.min_len <= len(i.split(' ')) <= self.max_len for i in replies
        ]
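        # keep a pair only if both the input and the reply satisfy the length bounds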
        full_mask = [i and j for (i, j) in zip(sent_mask, replies_mask)]

        sentences = np.array(sentences)[full_mask].tolist()
        replies = np.array(replies)[full_mask].tolist()

        # tokenize
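        # filters='' keeps the <start>/<eos> markers intact; words outside the
        # num_words most frequent ones are mapped to the <unk> token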
        tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=self.num_words, filters='', oov_token='<unk>')
        tokenizer.fit_on_texts(sentences + replies)

        sentences_en = tokenizer.texts_to_sequences(sentences)
        replies_en = tokenizer.texts_to_sequences(replies)

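        # pad every sequence with trailing zeros ('post') up to the longest sequence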
        sentences_en = tf.keras.preprocessing.sequence.pad_sequences(
            sentences_en, maxlen=None, padding='post', value=0)
        replies_en = tf.keras.preprocessing.sequence.pad_sequences(
            replies_en, maxlen=None, padding='post', value=0)

        # train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            sentences_en, replies_en, test_size=test_size, random_state=seed)

        buffer_size = len(X_train)
        steps_per_epoch = buffer_size // self.batch_size

        # dataset preparation
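        # a shuffle buffer as large as the training set gives a full shuffle;
        # drop_remainder=True discards the final partial batch so every batch has batch_size pairs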
        dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
        dataset = dataset.shuffle(buffer_size,
                                  seed=seed).batch(self.batch_size,
                                                   drop_remainder=True)

        val_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
        val_dataset = val_dataset.shuffle(buffer_size,
                                          seed=seed).batch(self.batch_size,
                                                           drop_remainder=True)

        self.tokenizer = tokenizer
        self.dataset = dataset
        self.val_dataset = val_dataset
        self.steps_per_epoch = steps_per_epoch
Example 3
import argparse
import os

import datasets


def read_dataset(dataset_name, max_sentence_length):
    dataset_path = 'data/{}'.format(dataset_name)

    if dataset_name == "cornell":
        data = datasets.readCornellData(dataset_path, max_len=max_sentence_length)
    elif dataset_name == "opensubs":
        data = datasets.readOpensubsData(dataset_path, max_len=max_sentence_length)
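    # readTwitterData is assumed to be provided elsewhere in the project (not shown here)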
    elif dataset_name == 'twitter':
        data = readTwitterData()
    else:
        raise ValueError("Unrecognized dataset: {!r}".format(dataset_name))
    
    return data


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset", choices=["cornell", "opensubs"], help="Name of the dataset.")
    parser.add_argument("--max_len", type=int, default=10, help="Max length of sentences to consider.")
    args = parser.parse_args()

    dataset_path = os.path.join("data", args.dataset)
    if args.dataset == "cornell":
        data = datasets.readCornellData(dataset_path, max_len=args.max_len)
    elif args.dataset == "opensubs":
        data = datasets.readOpensubsData(dataset_path, max_len=args.max_len)
    else:
        raise ValueError("Unrecognized dataset: {!r}".format(args.dataset))

    print("Size of dataset: {}".format(len(data)))
    print("First 10 training pairs:")
    for item in data[:10]:
        print(item)
Example 5
    def __init__(self, paths):
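        # assumes unpickle_file and the project's datasets module are imported at module level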
        self.embeddings, self.embeddings_dim = self._load_embeddings(
            paths['DIALOGUE_EMBEDDINGS'])
        self.question_vectors = unpickle_file(paths['QUESTION_VECTORS'])
        self.dialogues = datasets.readCornellData(paths['DIALOGUE_FOLDER'],
                                                  max_len=100)