def get_list_of_wav_paths(data_version: str, n_augmentations: [int, str] = 0) -> tuple:
    """
    Retrieves the lists of filepaths that belong to train, validation and test.
    :param data_version: version of the data to use (str {"0.01", "0.02"})
    :param n_augmentations: number of augmentation folders to use, or "all" to load every
    available one (int|str)
    :return: list of training paths, list of validation paths and list of test paths
    (tuple of lists)
    """
    # Collect the folders to scan: the base training data plus the requested augmentations
    folders = [get_training_data_path(data_version=data_version)]
    if isinstance(n_augmentations, int):
        folders += [get_augmented_data_folder(data_version=data_version, folder=str(f))
                    for f in range(n_augmentations)]
    elif n_augmentations == "all":
        base = get_augmented_data_path(data_version=data_version)
        folders += [os.path.join(base, f) for f in os.listdir(base)]
    else:
        raise ValueError(f"'n_augmentations' parameter value not recognized as a valid argument "
                         f"('all'|int): {n_augmentations}")
    for path in folders:
        if len(os.listdir(path)) == 0:
            warnings.warn(f"Attempting to load files from an empty folder: {path}")
    # The official validation/testing splits are listed in text files shipped with the dataset
    with open(os.path.join(get_training_data_path(data_version=data_version), "testing_list.txt")) as f:
        list_test = [os.path.normpath(os.path.join(get_training_data_path(data_version=data_version),
                                                   line.strip())) for line in f]
    with open(os.path.join(get_training_data_path(data_version=data_version), "validation_list.txt")) as f:
        list_val = [os.path.normpath(os.path.join(get_training_data_path(data_version=data_version),
                                                  line.strip())) for line in f]
    # Everything else (excluding background noise clips) is used for training
    list_train = flatten([list(recursive_listdir(os.path.normpath(folder))) for folder in folders])
    list_train = list(filter(lambda p: "background_noise" not in p and p.endswith("wav"), list_train))
    list_train = np.setdiff1d(list_train, list_test + list_val).tolist()
    return list_train, list_val, list_test
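# Hedged usage sketch for get_list_of_wav_paths (hypothetical values; it assumes the v0.02
# dataset is already on disk and that at least two augmented folders exist):
#
#   train, val, test = get_list_of_wav_paths(data_version="0.02", n_augmentations=2)
#   # train holds every remaining .wav (background noise excluded); val/test come from the
#   # dataset's validation_list.txt / testing_list.txt and never overlap with train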
def load_cornell_dialogs(max_length=150):
    path = os.path.join(get_data_path(), "cornell movie-dialogs corpus")
    # Download and extract the corpus the first time it is requested
    if not os.path.exists(path):
        url = 'http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip'
        response = urllib.request.urlopen(url)
        data = response.read()  # a `bytes` object
        zip_ref = zipfile.ZipFile(io.BytesIO(data))
        zip_ref.extractall(get_data_path())
    # movie_lines.txt maps a line id to its text; fields are separated by " +++$+++ "
    movie_lines = codecs.open(os.path.join(path, "movie_lines.txt"), "r", "Windows-1252").readlines()
    movie_lines = list(map(lambda x: x.strip().split(" +++$+++ "), movie_lines))
    movie_lines_dict = dict(list(map(lambda x: (x[0], x[-1]), movie_lines)))
    # movie_conversations.txt stores, per conversation, the ordered list of line ids
    movie_conversations = open(os.path.join(path, "movie_conversations.txt"), "r").readlines()
    movie_conversations = list(map(lambda x: x.strip().split(" +++$+++ "), movie_conversations))
    for element in range(len(movie_conversations)):
        # The last field is a Python-style list literal of line ids; replace it with the actual texts
        movie_conversations[element][-1] = [movie_lines_dict[line]
                                            for line in eval(movie_conversations[element][-1])]
    # Turn each conversation into consecutive (utterance, reply) pairs and drop overly long ones
    dialogs = flatten(list(map(lambda x: list(zip(x[-1][:-1], x[-1][1:])), movie_conversations)))
    dialogs_filtered = list(filter(lambda x: max([len(s) for s in x]) <= max_length, dialogs))
    return dialogs_filtered
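# Minimal sketch (plain Python, toy data) of the pairing step used by load_cornell_dialogs:
# each conversation becomes consecutive (utterance, reply) pairs before length filtering.
toy_conversation = ["Hi.", "Hello, how are you?", "Fine, thanks."]
toy_pairs = list(zip(toy_conversation[:-1], toy_conversation[1:]))
print(toy_pairs)  # [('Hi.', 'Hello, how are you?'), ('Hello, how are you?', 'Fine, thanks.')]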
def fit(self, list_of_real_sentences):
    # Tokenize and lower-case every sentence
    lists_of_tokens = list(map(lambda x: nltk.word_tokenize(x, "english"), list_of_real_sentences))
    lists_of_tokens = map(lambda s: [w.lower() for w in s], lists_of_tokens)
    # Count n-gram frequencies and remove hapaxes (n-grams that only appear once)
    fdist = Counter(flatten(self.calculate_ngrams(lists_of_tokens)))
    items = list(map(lambda x: x[0], filter(lambda x: x[1] >= 2, fdist.items())))
    self.unique_items = set(items)
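# Minimal sketch (standard library only, toy data) of the hapax filtering performed in fit:
# n-grams seen only once are discarded before building the vocabulary of unique items.
from collections import Counter

toy_ngrams = [("the", "cat"), ("the", "cat"), ("a", "dog")]
toy_fdist = Counter(toy_ngrams)
kept = {item for item, count in toy_fdist.items() if count >= 2}
print(kept)  # {('the', 'cat')} -- ('a', 'dog') is a hapax and gets removed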
def test_batching(self):
    dataset_1 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    dataset_2 = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]
    dataset_3 = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"]

    # Case 1: the batch size divides the datasets evenly, so 5 complete batches cover everything
    batcher = batching(list_of_iterables=[dataset_1, dataset_2, dataset_3],
                       n=2, infinite=False, return_incomplete_batches=False)
    batches_1, batches_2, batches_3 = zip(*list(batcher))
    self.assertEqual(5, len(batches_1))
    self.assertEqual(5, len(batches_2))
    self.assertEqual(5, len(batches_3))
    self.assertListEqual(dataset_1, flatten(batches_1))
    self.assertListEqual(dataset_2, flatten(batches_2))
    self.assertListEqual(dataset_3, flatten(batches_3))

    # Case 2: incomplete batches are returned, so the trailing element forms a fourth, shorter batch
    batcher = batching(list_of_iterables=[dataset_1, dataset_2, dataset_3],
                       n=3, infinite=False, return_incomplete_batches=True)
    batches_1, batches_2, batches_3 = zip(*list(batcher))
    self.assertEqual(4, len(batches_1))
    self.assertEqual(4, len(batches_2))
    self.assertEqual(4, len(batches_3))
    self.assertListEqual(dataset_1, flatten(batches_1))
    self.assertListEqual(dataset_2, flatten(batches_2))
    self.assertListEqual(dataset_3, flatten(batches_3))

    # Case 3: incomplete batches are dropped, so the last element of each dataset is left out
    batcher = batching(list_of_iterables=[dataset_1, dataset_2, dataset_3],
                       n=3, infinite=False, return_incomplete_batches=False)
    batches_1, batches_2, batches_3 = zip(*list(batcher))
    self.assertEqual(3, len(batches_1))
    self.assertEqual(3, len(batches_2))
    self.assertEqual(3, len(batches_3))
    self.assertListEqual(dataset_1[:-1], flatten(batches_1))
    self.assertListEqual(dataset_2[:-1], flatten(batches_2))
    self.assertListEqual(dataset_3[:-1], flatten(batches_3))
def generate_rank_reports(y_true, y_pred, k_range=None):
    """
    Given the true values and the predicted ones, generates dataframes containing the map@k,
    the top-k categorical accuracy and the hit-ratio@k, plus the precision and recall @k per product.
    :y_true: list of actual values to be predicted (list)
    :y_pred: list of predicted values (ordered by propensity) (list)
    :k_range: range of numbers of predictions to consider. If not specified, it defaults to
    1..N, where N is the number of unique labels found in y_true (list of int|None)
    :return: a table with the global metrics and a table with the per-product metrics, for all
    the K values (pd.DataFrame, pd.DataFrame)
    """
    k_range = [x + 1 for x in range(len(set(flatten(y_true))))] if k_range is None else k_range
    # Compute general metrics (dependent on all the leads generated)
    _map, _acc, _hit = [], [], []
    for k in k_range:
        _map.append(mapk(y_true, y_pred, k))
        _acc.append(top_k_categorical_accuracy(y_true, y_pred, k))
        _hit.append(top_k_hit_ratio(y_true, y_pred, k))
    metrics_at_k = pd.DataFrame({"k": k_range,
                                 "Map@k": _map,
                                 "TopAcc@k": _acc,
                                 "TopHit@k": _hit})[["k", "Map@k", "TopAcc@k", "TopHit@k"]]
    # Compute product metrics (based on each product's individual performance)
    product, k, precision, recall, fscore, support = [], [], [], [], [], []
    for k_ in k_range:
        precision_, recall_, fscore_, support_, products_ = rank_precision_recall_fscore_support_at_k(
            y_true=y_true, y_pred=y_pred, k=k_)
        precision.extend(precision_)
        recall.extend(recall_)
        fscore.extend(fscore_)
        support.extend(support_)
        k.extend([k_] * len(products_))
        product.extend(products_)
    product_metrics_at_k = pd.DataFrame({"product": product,
                                         "k": k,
                                         "precision": precision,
                                         "recall": recall,
                                         "fscore": fscore,
                                         "support": support})[["product", "k", "precision",
                                                               "recall", "fscore", "support"]]
    return metrics_at_k, product_metrics_at_k
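# Hedged usage sketch for generate_rank_reports (toy data; it assumes the metric helpers used
# inside the function -- mapk, top_k_categorical_accuracy, top_k_hit_ratio -- are importable):
#
#   y_true = [["a"], ["b", "c"], ["a"]]                            # relevant products per customer
#   y_pred = [["a", "c", "b"], ["c", "a", "b"], ["b", "c", "a"]]   # ranked recommendations
#   metrics_at_k, product_metrics_at_k = generate_rank_reports(y_true, y_pred)
#   # k_range defaults to [1, 2, 3] here: one value per unique label found in y_true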
def rank_precision_recall_fscore_support_at_k(y_true, y_pred, k=5):
    """
    Trims y_pred to its first k elements and computes, for every category found in y_true, the
    binary precision, recall, f-score and support of "category appears among the top-k predictions".
    :y_true: list of actual values to be predicted (list)
    :y_pred: list of predicted values (ordered by propensity) (list)
    :k: number of predictions to consider (int)
    :return: lists of precision, recall, fscore and support (one value per category) and the
    list of categories they refer to (list, list, list, list, list)
    """
    precision, recall, fscore, support = [], [], [], []
    categories = sorted(set(flatten(y_true)))  # sorted so the output order is deterministic
    for category in categories:
        y_x = [category in x for x in y_true]       # category actually relevant for each sample
        l_x = [category in x[0:k] for x in y_pred]  # category predicted within the top k
        precision_, recall_, fscore_, support_ = np.array(
            precision_recall_fscore_support(y_true=y_x, y_pred=l_x))[:, 1]
        precision.append(precision_)
        recall.append(recall_)
        fscore.append(fscore_)
        support.append(support_)
    return precision, recall, fscore, support, categories
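# Worked toy example (scikit-learn only) of the per-category binary framing used above: y_x marks
# whether the category is relevant for each sample, l_x whether it appears among the top-k
# predictions, and column 1 of the stacked result holds the positive-class metrics.
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

y_x = [True, False, True]    # category relevant in samples 1 and 3
l_x = [True, False, False]   # category predicted within the top k only for sample 1
precision, recall, fscore, support = np.array(
    precision_recall_fscore_support(y_true=y_x, y_pred=l_x))[:, 1]
print(precision, recall, fscore, support)  # 1.0, 0.5, ~0.67, 2.0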
import sys

from tqdm import tqdm
import numpy as np
import tensorflow as tf

# Parameters
BATCH_SIZE = 256
project_id = "chatbot"
version_id = "v06"

# Data processing
dialogs = load_cornell_dialogs()
# Character vocabulary: every character seen in the dialogs plus the $GO$ and $UNK$ special symbols
charset = list(set("".join(list(map(lambda x: x[0] + x[1], dialogs)))))
charset_size = len(charset) + 2
max_length = max(map(len, flatten(dialogs)))
go_symbol = len(charset)
unk_symbol = len(charset) + 1
character_to_code = dict(list(zip(charset + ["$GO$", "$UNK$"], range(len(charset) + 2))))
code_to_character = {code: ch for (ch, code) in character_to_code.items()}
# Encode each dialog as fixed-length sequences of character codes, right-padded with $UNK$
process_dialog = lambda dialog: [
    tuple(pad(x=[character_to_code[ch] for ch in sentence],
              max_length=max_length, mode="right", symbol=unk_symbol))
    for sentence in dialog
]
dialogs_codes = list(map(process_dialog, dialogs))
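# Minimal sketch (plain Python, toy vocabulary) of the encoding above, assuming `pad` right-pads
# a code sequence with the given symbol up to max_length:
toy_charset = ["a", "b", "h", "i"]
toy_char_to_code = {ch: i for i, ch in enumerate(toy_charset + ["$GO$", "$UNK$"])}
toy_unk = toy_char_to_code["$UNK$"]
encoded = [toy_char_to_code[ch] for ch in "hi"]
padded = encoded + [toy_unk] * (6 - len(encoded))  # right-pad to a toy max_length of 6
print(padded)  # [2, 3, 5, 5, 5, 5]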
def define_core_model(self):
    with tf.variable_scope("Core_Model"):
        # Embeddings
        emb_mat_store_nbr = tf.get_variable(shape=self.get_emb_shape("store_nbr"),
                                            dtype=tf.float32, name="emb_mat_store_nbr")
        emb_mat_city = tf.get_variable(shape=self.get_emb_shape("city"),
                                       dtype=tf.float32, name="emb_mat_city")
        emb_mat_state = tf.get_variable(shape=self.get_emb_shape("state"),
                                        dtype=tf.float32, name="emb_mat_state")
        emb_mat_store_type = tf.get_variable(shape=self.get_emb_shape("store_type"),
                                             dtype=tf.float32, name="emb_mat_store_type")
        emb_mat_store_cluster = tf.get_variable(shape=self.get_emb_shape("store_cluster"),
                                                dtype=tf.float32, name="emb_mat_store_cluster")
        emb_mat_item_family = tf.get_variable(shape=self.get_emb_shape("item_family"),
                                              dtype=tf.float32, name="emb_mat_item_family")
        emb_mat_item_class = tf.get_variable(shape=self.get_emb_shape("item_class"),
                                             dtype=tf.float32, name="emb_mat_item_class")
        emb_mat_item_nbr = tf.get_variable(shape=self.get_emb_shape("item_nbr"),
                                           dtype=tf.float32, name="emb_mat_item_nbr")
        emb_mat_holiday_type = tf.get_variable(shape=self.get_emb_shape("holiday_type"),
                                               dtype=tf.float32, name="emb_mat_holiday_type")

        emb_store_nbr = tf.nn.embedding_lookup(emb_mat_store_nbr, self.placeholders.store_nbr,
                                               name="emb_lookup_store_nbr")
        emb_city = tf.nn.embedding_lookup(emb_mat_city, self.placeholders.city,
                                          name="emb_lookup_city")
        emb_state = tf.nn.embedding_lookup(emb_mat_state, self.placeholders.state,
                                           name="emb_lookup_state")
        emb_store_type = tf.nn.embedding_lookup(emb_mat_store_type, self.placeholders.store_type,
                                                name="emb_lookup_store_type")
        emb_store_cluster = tf.nn.embedding_lookup(emb_mat_store_cluster, self.placeholders.store_cluster,
                                                   name="emb_lookup_store_cluster")
        emb_item_family = tf.nn.embedding_lookup(emb_mat_item_family, self.placeholders.item_family,
                                                 name="emb_lookup_item_family")
        emb_item_class = tf.nn.embedding_lookup(emb_mat_item_class, self.placeholders.item_class,
                                                name="emb_lookup_item_class")
        emb_item_nbr = tf.nn.embedding_lookup(emb_mat_item_nbr, self.placeholders.item_nbr,
                                              name="emb_lookup_item_nbr")
        emb_national_holiday_type = tf.nn.embedding_lookup(
            emb_mat_holiday_type, self.placeholders.national_holiday_type[:, :, 0],
            name="emb_lookup_national_holiday_type")
        emb_local_holiday_type = tf.nn.embedding_lookup(
            emb_mat_holiday_type, self.placeholders.local_holiday_type[:, :, 0],
            name="emb_lookup_local_holiday_type")

        future_data_norm = BatchNorm(name="bn_future")(
            tf.contrib.layers.flatten(
                tf.concat([self.placeholders.local_holiday_fut,
                           self.placeholders.national_holiday_fut,
                           self.placeholders.regional_holiday_fut,
                           self.placeholders.year_fut,
                           self.placeholders.month_fut,
                           self.placeholders.day_fut,
                           self.placeholders.dow_fut,
                           self.placeholders.onpromotion_fut], axis=2)),
            train=self.placeholders.is_train)

        # Data preparation
        static_data_norm = BatchNorm(name="bn_static")(
            tf.expand_dims(self.placeholders.item_perishable, 1),
            train=self.placeholders.is_train)
        temporal_data_norm = BatchNorm(name="bn_temporal")(
            tf.concat([self.placeholders.onpromotion,
                       self.placeholders.national_holiday_transferred,
                       self.placeholders.national_holiday,
                       self.placeholders.regional_holiday,
                       self.placeholders.local_holiday_transferred,
                       self.placeholders.local_holiday,
                       self.placeholders.dcoilwtico,
                       self.placeholders.transactions,
                       self.placeholders.year,
                       self.placeholders.month,
                       self.placeholders.day,
                       self.placeholders.dow], axis=2, name="temporal_data_norm"),
            train=self.placeholders.is_train)

        static_data = tf.concat([
            static_data_norm, emb_store_nbr, emb_item_nbr,
            emb_item_family, emb_item_class, emb_city, emb_state,
            emb_store_type, emb_store_cluster, future_data_norm
        ], axis=1)
        temporal_data = tf.concat([
            self.placeholders.unit_sales, temporal_data_norm,
            emb_national_holiday_type, emb_local_holiday_type
        ], axis=2, name="temporal_data")

        # Encoder
        recurrent_cell_encoder = tf.contrib.rnn.CompiledWrapper(
            tf.nn.rnn_cell.MultiRNNCell([
                tf.nn.rnn_cell.LSTMCell(self.n_recurrent_cells)
                for _ in range(self.n_recurrent_layers)  # one cell per layer, in sync with the reshaping below
            ]))
        _, states = tf.nn.dynamic_rnn(recurrent_cell_encoder, temporal_data, dtype=tf.float32)

        # Thought treatment
        states = tf.concat(flatten([[s.c for s in states],
                                    [s.h for s in states],
                                    [static_data]]), axis=1)
        states = BatchNorm(name="thought_1")(states, train=self.placeholders.is_train)
        states = tf.layers.dense(inputs=states, units=1024, activation=tf.nn.relu,
                                 kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                 name="d_thought_1")
        states = BatchNorm(name="thought_2")(states, train=self.placeholders.is_train)
        states = tf.layers.dense(inputs=states, units=1024, activation=tf.nn.relu,
                                 kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                 name="d_thought_2")
        states = BatchNorm(name="thought_3")(states, train=self.placeholders.is_train)
        states = tf.layers.dense(inputs=states,
                                 units=self.n_recurrent_cells * self.n_recurrent_layers * 2,
                                 activation=None,
                                 kernel_initializer=tf.contrib.layers.xavier_initializer())

        # Rebuild the per-layer LSTM (c, h) state tuples from the flat dense output
        thought_vector = []
        for i in range(self.n_recurrent_layers):
            c = states[:, i * self.n_recurrent_cells:(i + 1) * self.n_recurrent_cells]
            h = states[:, (i + self.n_recurrent_layers) * self.n_recurrent_cells:
                       (i + self.n_recurrent_layers + 1) * self.n_recurrent_cells]
            thought_vector.append(tf.nn.rnn_cell.LSTMStateTuple(c, h))
        thought_vector = tuple(thought_vector)

        # Decoder
        recurrent_cell_decoder = tf.contrib.rnn.CompiledWrapper(
            tf.nn.rnn_cell.MultiRNNCell([
                tf.nn.rnn_cell.LSTMCell(self.n_recurrent_cells)
                for _ in range(self.n_recurrent_layers)
            ]))
        go = tf.ones([tf.shape(self.placeholders.unit_sales)[0],
                      self.n_timesteps_future, self.n_recurrent_cells])
        outputs, states = decoder(inputs=go,
                                  thought_states=thought_vector,
                                  cell=recurrent_cell_decoder,
                                  max_ouput_sequence_length=self.n_timesteps_future,
                                  name="decoder")

        lstm_stacked_output = tf.reshape(outputs, shape=[-1, outputs.shape[2].value],
                                         name="stack_LSTM")
        d = tf.layers.dense(lstm_stacked_output, 64, activation=tf.nn.relu,
                            kernel_initializer=tf.contrib.layers.xavier_initializer(),
                            name="dense_1")
        d = tf.contrib.layers.layer_norm(d)
        d = tf.layers.dense(d, 32, activation=tf.nn.relu,
                            kernel_initializer=tf.contrib.layers.xavier_initializer(),
                            name="dense_2")
        d = tf.contrib.layers.layer_norm(d)
        d = tf.layers.dense(d, 1, activation=None,
                            kernel_initializer=tf.contrib.layers.xavier_initializer(),
                            name="dense_3")
        unstacked_output = tf.reshape(d, shape=[-1, self.n_timesteps_future, 1],
                                      name="unstack_LSTM")
        return {"output": unstacked_output}
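# Minimal sketch (pure numpy, hypothetical sizes) of how the flat dense "thought" output above is
# split back into per-layer LSTM (c, h) state pairs: the first n_layers * n_cells columns hold the
# c states and the following n_layers * n_cells columns hold the h states.
import numpy as np

n_cells, n_layers, batch = 4, 3, 2
states_np = np.arange(batch * n_cells * n_layers * 2).reshape(batch, -1)
for i in range(n_layers):
    c = states_np[:, i * n_cells:(i + 1) * n_cells]
    h = states_np[:, (i + n_layers) * n_cells:(i + n_layers + 1) * n_cells]
    assert c.shape == h.shape == (batch, n_cells)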
def test_flatten(self):
    test_list = [[1], [2, 3], [4], [5], [6]]
    self.assertListEqual([1, 2, 3, 4, 5, 6], flatten(test_list))