Example #1
def get_list_of_wav_paths(data_version: str, n_augmentations: Union[int, str] = 0) -> tuple:
    """
    Retrieves the list of filepaths that belong to train, validation and test
    :param n_augmentations: specify the number of augmentations to use or specify "all" to load all the available ones
    (int|str)
    :param data_version: specifies the version of the data to use (str {"0.01", "0.02"})
    :return: list of training paths, list of validation paths and list of test paths (list of lists)
    """
    folders = [get_training_data_path(data_version=data_version)]
    if isinstance(n_augmentations, int):
        folders += [get_augmented_data_folder(data_version=data_version, folder=str(f)) for f in range(n_augmentations)]
    elif n_augmentations == "all":
        base = get_augmented_data_path(data_version=data_version)
        folders += [os.path.join(base, f) for f in os.listdir(base)]
    else:
        raise ValueError(f"'n_augmentations' parameter value not recognized as a valid argument ('all'|int): {n_augmentations}")

    for path in folders:
        if len(os.listdir(path)) == 0:
            warnings.warn(f"Attempting to load files from an empty folder: {path}")

    base_path = get_training_data_path(data_version=data_version)
    with open(os.path.join(base_path, "testing_list.txt")) as f:
        list_test = [os.path.normpath(os.path.join(base_path, line.strip())) for line in f]

    with open(os.path.join(base_path, "validation_list.txt")) as f:
        list_val = [os.path.normpath(os.path.join(base_path, line.strip())) for line in f]

    list_train = flatten([list(recursive_listdir(os.path.normpath(folder))) for folder in folders])
    list_train = list(filter(lambda p: "background_noise" not in p and p.endswith("wav"), list_train))
    list_train = np.setdiff1d(list_train, list_test + list_val).tolist()
    return list_train, list_val, list_test
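A hedged usage sketch (hypothetical call, assuming the wav data for version "0.02" is already on disk where the library's get_training_data_path helper expects it):

# No augmented folders requested; data version "0.02" assumed to exist locally.
train_paths, val_paths, test_paths = get_list_of_wav_paths(data_version="0.02", n_augmentations=0)
print(len(train_paths), len(val_paths), len(test_paths))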
def load_cornell_dialogs(max_length=150):
    path = os.path.join(get_data_path(), "cornell movie-dialogs corpus")
    if not os.path.exists(path):
        url = 'http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip'
        response = urllib.request.urlopen(url)
        data = response.read()  # a `bytes` object
        with zipfile.ZipFile(io.BytesIO(data)) as zip_ref:
            zip_ref.extractall(get_data_path())

    movie_lines = codecs.open(os.path.join(path, "movie_lines.txt"), "r",
                              "Windows-1252").readlines()
    movie_lines = list(map(lambda x: x.strip().split(" +++$+++ "),
                           movie_lines))
    movie_lines_dict = dict(list(map(lambda x: (x[0], x[-1]), movie_lines)))

    movie_conversations = open(os.path.join(path, "movie_conversations.txt"),
                               "r").readlines()
    movie_conversations = list(
        map(lambda x: x.strip().split(" +++$+++ "), movie_conversations))

    # ast.literal_eval (stdlib) parses the stringified list of line ids more safely than eval
    for conversation in movie_conversations:
        conversation[-1] = [
            movie_lines_dict[line]
            for line in ast.literal_eval(conversation[-1])
        ]

    dialogs = flatten(
        list(
            map(lambda x: list(zip(x[-1][:-1], x[-1][1:])),
                movie_conversations)))
    dialogs_filtered = list(
        filter(lambda x: max([len(s) for s in x]) <= max_length, dialogs))
    return dialogs_filtered
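A minimal usage sketch, assuming the corpus can be downloaded (or is already extracted) under get_data_path(); dialogs come back as consecutive (prompt, reply) sentence pairs:

# Hypothetical usage.
pairs = load_cornell_dialogs(max_length=150)
prompt, reply = pairs[0]
print(len(pairs), repr(prompt), repr(reply))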
Example #3
 def fit(self, list_of_real_sentences):
     lists_of_tokens = list(
         map(lambda x: nltk.word_tokenize(x, "english"),
             list_of_real_sentences))
     lists_of_tokens = map(lambda s: [w.lower() for w in s],
                           lists_of_tokens)
     fdist = Counter(flatten(self.calculate_ngrams(lists_of_tokens)))
     items = list(
         map(lambda x: x[0], filter(lambda x: x[1] >= 2,
                                    dict(fdist).items())))  # Remove hapaxes
     self.unique_items = set(items)
 def test_batching(self):
     dataset_1 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     dataset_2 = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]
     dataset_3 = [
         "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"
     ]
     # Case 1
     batcher = batching(list_of_iterables=[dataset_1, dataset_2, dataset_3],
                        n=2,
                        infinite=False,
                        return_incomplete_batches=False)
     batches_1, batches_2, batches_3 = zip(*list(batcher))
     self.assertEqual(5, len(batches_1))
     self.assertEqual(5, len(batches_2))
     self.assertEqual(5, len(batches_3))
     self.assertListEqual(dataset_1, flatten(batches_1))
     self.assertListEqual(dataset_2, flatten(batches_2))
     self.assertListEqual(dataset_3, flatten(batches_3))
     # Case 2
     batcher = batching(list_of_iterables=[dataset_1, dataset_2, dataset_3],
                        n=3,
                        infinite=False,
                        return_incomplete_batches=True)
     batches_1, batches_2, batches_3 = zip(*list(batcher))
     self.assertEqual(4, len(batches_1))
     self.assertEqual(4, len(batches_2))
     self.assertEqual(4, len(batches_3))
     self.assertListEqual(dataset_1, flatten(batches_1))
     self.assertListEqual(dataset_2, flatten(batches_2))
     self.assertListEqual(dataset_3, flatten(batches_3))
     # Case 3
     batcher = batching(list_of_iterables=[dataset_1, dataset_2, dataset_3],
                        n=3,
                        infinite=False,
                        return_incomplete_batches=False)
     batches_1, batches_2, batches_3 = zip(*list(batcher))
     self.assertEqual(3, len(batches_1))
     self.assertEqual(3, len(batches_2))
     self.assertEqual(3, len(batches_3))
     self.assertListEqual(dataset_1[:-1], flatten(batches_1))
     self.assertListEqual(dataset_2[:-1], flatten(batches_2))
     self.assertListEqual(dataset_3[:-1], flatten(batches_3))
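For reference, a minimal sketch of the batching semantics the three cases above assume (one n-sized slice per parallel iterable per step, optionally skipping the trailing incomplete batch); the library's actual batching generator may differ:

def batching_sketch(list_of_iterables, n, infinite=False, return_incomplete_batches=False):
    # Yield, per step, one n-sized slice from each of the parallel iterables.
    lists = [list(iterable) for iterable in list_of_iterables]
    length = len(lists[0])
    while True:
        for start in range(0, length, n):
            slices = [l[start:start + n] for l in lists]
            if not return_incomplete_batches and len(slices[0]) < n:
                continue
            yield slices
        if not infinite:
            break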
def generate_rank_reports(y_true, y_pred, k_range=None):
    """
    Given the true values and the predicted ones, it generates a dataframe containing 
    the map@k, the topKcategoricalAccuracy and the hitsRatio@K, the precision and recall
    @k by product.
    :y_true: list of actual values to be predicted (list)
    :y_pred: list of predicted values (ordered by propensity) (list)
    :k_range: range number of predictions to consider. If not specified, it will be
    considered as the whole range, by counting the total number of unique labels
    by using y_true (int|None)
    :return: a table with all the metrics for all the K values (pd.Dataframe)
    """
    if k_range is None:
        k_range = list(range(1, len(set(flatten(y_true))) + 1))
    # Compute general metrics (dependent on all the leads generated)
    _map, _acc, _hit = [], [], []
    for k in k_range:
        _map.append(mapk(y_true, y_pred, k))
        _acc.append(top_k_categorical_accuracy(y_true, y_pred, k))
        _hit.append(top_k_hit_ratio(y_true, y_pred, k))
    metrics_at_k = pd.DataFrame({
        "k": k_range,
        "Map@k": _map,
        "TopAcc@k": _acc,
        "TopHit@k": _hit
    })[["k", "Map@k", "TopAcc@k", "TopHit@k"]]

    # Compute product metrics (based on each product's performance)
    product, k, precision, recall, fscore, support = [], [], [], [], [], []
    for k_ in k_range:
        precision_, recall_, fscore_, support_, products_ = rank_precision_recall_fscore_support_at_k(
            y_true=y_true, y_pred=y_pred, k=k_)
        precision.extend(precision_)
        recall.extend(recall_)
        fscore.extend(fscore_)
        support.extend(support_)
        k.extend([k_] * len(products_))
        product.extend(products_)

    product_metrics_at_k = pd.DataFrame({
        "product": product,
        "k": k,
        "precision": precision,
        "recall": recall,
        "fscore": fscore,
        "support": support
    })[["product", "k", "precision", "recall", "fscore", "support"]]
    return metrics_at_k, product_metrics_at_k
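A hedged usage sketch with hypothetical labels (mapk, top_k_categorical_accuracy and top_k_hit_ratio are assumed to be helpers from the same library):

y_true = [["a"], ["b", "c"]]
y_pred = [["a", "c", "b"], ["c", "a", "b"]]
metrics_at_k, product_metrics_at_k = generate_rank_reports(y_true, y_pred, k_range=[1, 2, 3])
print(metrics_at_k)          # k, Map@k, TopAcc@k, TopHit@k
print(product_metrics_at_k)  # product, k, precision, recall, fscore, support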
def rank_precision_recall_fscore_support_at_k(y_true, y_pred, k=5):
    """
    Trims the y_pred to a length of k by the right and calculates the average
    accuracies to the y_true.
    :y_true: list of actual values to be predicted (list)
    :y_pred: list of predicted values (ordered by propensity) (list)
    :k: number of predictions to consider (int)
    :return: the average of the accuracies (float)
    """
    precision, recall, fscore, support = [], [], [], []
    categories = set(flatten(y_true))
    for category in categories:
        y_x = [category in x for x in y_true]
        l_x = [category in x[0:k] for x in y_pred]
        precision_, recall_, fscore_, support_ = np.array(
            precision_recall_fscore_support(y_true=y_x, y_pred=l_x))[:, 1]
        precision.append(precision_)
        recall.append(recall_)
        fscore.append(fscore_)
        support.append(support_)
    return precision, recall, fscore, support, categories
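The per-category reduction above is just a binary classification problem; a worked toy case with hypothetical labels:

# With k=2 and category "a":
y_true = [["a"], ["b", "c"]]
y_pred = [["a", "c", "b"], ["c", "a", "b"]]
y_x = ["a" in x for x in y_true]       # [True, False]
l_x = ["a" in x[0:2] for x in y_pred]  # [True, True]
# np.array(precision_recall_fscore_support(y_true=y_x, y_pred=l_x))[:, 1]
# then keeps only the positive-class column of each returned array.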
Example #7
import sys

from tqdm import tqdm
import numpy as np
import tensorflow as tf

# Parameters
BATCH_SIZE = 256
project_id = "chatbot"
version_id = "v06"

# Data processing
dialogs = load_cornell_dialogs()
charset = list(set("".join(list(map(lambda x: x[0] + x[1], dialogs)))))
charset_size = len(charset) + 2
max_length = max(map(len, flatten(dialogs)))
go_symbol = len(charset)
unk_symbol = len(charset) + 1
character_to_code = dict(
    list(zip(charset + ["$GO$", "$UNK$"], range(len(charset) + 2))))
code_to_character = {code: char for (char, code) in character_to_code.items()}

def process_dialog(dialog):
    return [
        tuple(
            pad(x=[character_to_code[ch] for ch in sentence],
                max_length=max_length,
                mode="right",
                symbol=unk_symbol)) for sentence in dialog
    ]

dialogs_codes = list(map(process_dialog, dialogs))
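A hedged decode sketch for the codes produced above: padding uses unk_symbol, so stripping it (and go_symbol) recovers the original characters as long as the sentence contains no genuinely unknown ones.

# Hypothetical round-trip on the first encoded prompt/reply pair.
prompt_codes, reply_codes = dialogs_codes[0]
decoded_prompt = "".join(code_to_character[c] for c in prompt_codes
                         if c not in (go_symbol, unk_symbol))
print(decoded_prompt)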
Example #8
    def define_core_model(self):
        with tf.variable_scope("Core_Model"):
            # Embeddings
            emb_mat_store_nbr = tf.get_variable(
                shape=self.get_emb_shape("store_nbr"),
                dtype=tf.float32,
                name="emb_mat_store_nbr")
            emb_mat_city = tf.get_variable(shape=self.get_emb_shape("city"),
                                           dtype=tf.float32,
                                           name="emb_mat_city")
            emb_mat_state = tf.get_variable(shape=self.get_emb_shape("state"),
                                            dtype=tf.float32,
                                            name="emb_mat_state")
            emb_mat_store_type = tf.get_variable(
                shape=self.get_emb_shape("store_type"),
                dtype=tf.float32,
                name="emb_mat_store_type")
            emb_mat_store_cluster = tf.get_variable(
                shape=self.get_emb_shape("store_cluster"),
                dtype=tf.float32,
                name="emb_mat_store_cluster")
            emb_mat_item_family = tf.get_variable(
                shape=self.get_emb_shape("item_family"),
                dtype=tf.float32,
                name="emb_mat_item_family")
            emb_mat_item_class = tf.get_variable(
                shape=self.get_emb_shape("item_class"),
                dtype=tf.float32,
                name="emb_mat_item_class")
            emb_mat_item_nbr = tf.get_variable(
                shape=self.get_emb_shape("item_nbr"),
                dtype=tf.float32,
                name="emb_mat_item_nbr")
            emb_mat_holiday_type = tf.get_variable(
                shape=self.get_emb_shape("holiday_type"),
                dtype=tf.float32,
                name="emb_mat_holiday_type")

            emb_store_nbr = tf.nn.embedding_lookup(emb_mat_store_nbr,
                                                   self.placeholders.store_nbr,
                                                   name="emb_lookup_store_nbr")
            emb_city = tf.nn.embedding_lookup(emb_mat_city,
                                              self.placeholders.city,
                                              name="emb_lookup_city")
            emb_state = tf.nn.embedding_lookup(emb_mat_state,
                                               self.placeholders.state,
                                               name="emb_lookup_state")
            emb_store_type = tf.nn.embedding_lookup(
                emb_mat_store_type,
                self.placeholders.store_type,
                name="emb_lookup_store_type")
            emb_store_cluster = tf.nn.embedding_lookup(
                emb_mat_store_cluster,
                self.placeholders.store_cluster,
                name="emb_lookup_store_cluster")
            emb_item_family = tf.nn.embedding_lookup(
                emb_mat_item_family,
                self.placeholders.item_family,
                name="emb_lookup_item_family")
            emb_item_class = tf.nn.embedding_lookup(
                emb_mat_item_class,
                self.placeholders.item_class,
                name="emb_lookup_item_class")
            emb_item_nbr = tf.nn.embedding_lookup(emb_mat_item_nbr,
                                                  self.placeholders.item_nbr,
                                                  name="emb_lookup_item_nbr")
            emb_national_holiday_type = tf.nn.embedding_lookup(
                emb_mat_holiday_type,
                self.placeholders.national_holiday_type[:, :, 0],
                name="emb_lookup_national_holiday_type")
            emb_local_holiday_type = tf.nn.embedding_lookup(
                emb_mat_holiday_type,
                self.placeholders.local_holiday_type[:, :, 0],
                name="emb_lookup_local_holiday_type")

            future_data_norm = BatchNorm(name="bn_future")(
                tf.contrib.layers.flatten(
                    tf.concat([
                        self.placeholders.local_holiday_fut,
                        self.placeholders.national_holiday_fut,
                        self.placeholders.regional_holiday_fut,
                        self.placeholders.year_fut,
                        self.placeholders.month_fut, self.placeholders.day_fut,
                        self.placeholders.dow_fut,
                        self.placeholders.onpromotion_fut
                    ],
                              axis=2)),
                train=self.placeholders.is_train)

            # Data preparation
            static_data_norm = BatchNorm(name="bn_static")(
                tf.expand_dims(self.placeholders.item_perishable, 1),
                train=self.placeholders.is_train)

            temporal_data_norm = BatchNorm(name="bn_temporal")(
                tf.concat([
                    self.placeholders.onpromotion,
                    self.placeholders.national_holiday_transferred,
                    self.placeholders.national_holiday,
                    self.placeholders.regional_holiday,
                    self.placeholders.local_holiday_transferred,
                    self.placeholders.local_holiday,
                    self.placeholders.dcoilwtico,
                    self.placeholders.transactions, self.placeholders.year,
                    self.placeholders.month, self.placeholders.day,
                    self.placeholders.dow
                ],
                          axis=2,
                          name="temporal_data_norm"),
                train=self.placeholders.is_train)

            static_data = tf.concat([
                static_data_norm, emb_store_nbr, emb_item_nbr, emb_item_family,
                emb_item_class, emb_city, emb_state, emb_store_type,
                emb_store_cluster, future_data_norm
            ],
                                    axis=1)

            temporal_data = tf.concat([
                self.placeholders.unit_sales, temporal_data_norm,
                emb_national_holiday_type, emb_local_holiday_type
            ],
                                      axis=2,
                                      name="temporal_data")

            # Encoder
            recurrent_cell_encoder = tf.contrib.rnn.CompiledWrapper(
                tf.nn.rnn_cell.MultiRNNCell([
                    tf.nn.rnn_cell.LSTMCell(self.n_recurrent_cells),
                    tf.nn.rnn_cell.LSTMCell(self.n_recurrent_cells),
                    tf.nn.rnn_cell.LSTMCell(self.n_recurrent_cells)
                ]))
            _, states = tf.nn.dynamic_rnn(recurrent_cell_encoder,
                                          temporal_data,
                                          dtype=tf.float32)

            # Thought treatment
            states = tf.concat(flatten([[s.c for s in states],
                                        [s.h for s in states], [static_data]]),
                               axis=1)
            states = BatchNorm(name="thought_1")(
                states, train=self.placeholders.is_train)
            states = tf.layers.dense(
                inputs=states,
                units=1024,
                activation=tf.nn.relu,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                name="d_thought_1")
            states = BatchNorm(name="thought_2")(
                states, train=self.placeholders.is_train)
            states = tf.layers.dense(
                inputs=states,
                units=1024,
                activation=tf.nn.relu,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                name="d_thought_2")
            states = BatchNorm(name="thought_3")(
                states, train=self.placeholders.is_train)
            states = tf.layers.dense(
                inputs=states,
                units=self.n_recurrent_cells * self.n_recurrent_layers * 2,
                activation=None,
                kernel_initializer=tf.contrib.layers.xavier_initializer())
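            # Split the dense projection into per-layer LSTM states: the first
            # n_recurrent_layers * n_recurrent_cells columns hold the cell states (c),
            # the remaining columns hold the hidden states (h).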
            thought_vector = []
            for i in range(self.n_recurrent_layers):
                c = states[:, i * self.n_recurrent_cells:(i + 1) *
                           self.n_recurrent_cells]
                h = states[:, (i + self.n_recurrent_layers) *
                           self.n_recurrent_cells:
                           (i + self.n_recurrent_layers + 1) *
                           self.n_recurrent_cells]
                thought_vector.append(tf.nn.rnn_cell.LSTMStateTuple(c, h))
            thought_vector = tuple(thought_vector)

            # Decoder
            recurrent_cell_decoder = tf.contrib.rnn.CompiledWrapper(
                tf.nn.rnn_cell.MultiRNNCell([
                    tf.nn.rnn_cell.LSTMCell(self.n_recurrent_cells),
                    tf.nn.rnn_cell.LSTMCell(self.n_recurrent_cells),
                    tf.nn.rnn_cell.LSTMCell(self.n_recurrent_cells)
                ]))

            go = tf.ones([
                tf.shape(self.placeholders.unit_sales)[0],
                self.n_timesteps_future, self.n_recurrent_cells
            ])
            outputs, states = decoder(
                inputs=go,
                thought_states=thought_vector,
                cell=recurrent_cell_decoder,
                max_ouput_sequence_length=self.n_timesteps_future,
                name="decoder")

            lstm_stacked_output = tf.reshape(
                outputs, shape=[-1, outputs.shape[2].value], name="stack_LSTM")
            d = tf.layers.dense(
                lstm_stacked_output,
                64,
                activation=tf.nn.relu,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                name="dense_1")
            d = tf.contrib.layers.layer_norm(d)
            d = tf.layers.dense(
                d,
                32,
                activation=tf.nn.relu,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                name="dense_2")
            d = tf.contrib.layers.layer_norm(d)
            d = tf.layers.dense(
                d,
                1,
                activation=None,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                name="dense_3")
            unstacked_output = tf.reshape(
                d, shape=[-1, self.n_timesteps_future, 1], name="unstack_LSTM")

            return {"output": unstacked_output}
 def test_flatten(self):
     test_list = [[1], [2, 3], [4], [5], [6]]
     self.assertListEqual([1, 2, 3, 4, 5, 6], flatten(test_list))
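All of the examples above rely on the library's flatten helper; a minimal sketch consistent with this test (one level of un-nesting; the real implementation may differ):

def flatten(list_of_lists):
    # Concatenate one level of nesting: [[1], [2, 3]] -> [1, 2, 3].
    return [item for sublist in list_of_lists for item in sublist]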