def _compute_likelihood(self, training_set_path):
    log_likelihood = 0
    data_reader = DatasetHandler(training_set_path)
    sentences_generator = data_reader.generate_sentences()
    k = 10
    for sent in sentences_generator:
        z_values = [0 for i in range(self.num_clusters)]
        for word in sent:
            if word in self.model.frequent_words_set:
                for i in range(self.num_clusters):
                    z_values[i] += math.log(self.model.get_p_w_given_xi(word, i))
        # add alpha_i for each cluster
        for i in range(self.num_clusters):
            z_values[i] += math.log(self.model.get_p_xi(i))
        m = max(z_values)
        z_minus_m_values = [z - m for z in z_values]
        # add log of only the numerically stable components
        sent_stable_comp_sum = 0
        for z_m in z_minus_m_values:
            if z_m >= -k:
                sent_stable_comp_sum += math.exp(z_m)
        log_likelihood += m + math.log(sent_stable_comp_sum)
    return log_likelihood
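# The method above relies on the log-sum-exp trick: log(sum_i exp(z_i)) is
# computed as m + log(sum_i exp(z_i - m)) with m = max_i z_i, so the largest
# exponent becomes 0 and math.exp never overflows; terms with z_i - m < -k are
# dropped as numerically negligible. A minimal standalone sketch of the trick
# (the helper name is ours, not part of the original code):

def _stable_log_sum_exp(z_values, k=10):
    """Compute log(sum(exp(z))) without overflow/underflow."""
    m = max(z_values)
    # after shifting, every exponent is <= 0; terms below exp(-k) are negligible
    stable_sum = sum(math.exp(z - m) for z in z_values if z - m >= -k)
    return m + math.log(stable_sum)

# Example: math.exp(-1000) underflows to 0.0, so a naive computation of
# log(exp(-1000) + exp(-1001)) raises a math domain error, while
# _stable_log_sum_exp([-1000, -1001]) returns roughly -999.69.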
def initiate_word_and_cluster_probs(self, dataset_path):
    # init reader
    dataset_reader = DatasetHandler(dataset_path)
    # init cluster and word-in-cluster count data structures
    raw_word_counts = {}
    cluster_counts = [0 for i in range(self.num_clusters)]
    cluster_word_counts = [{} for i in range(self.num_clusters)]
    current_cluster = 0
    # first pass over the dataset: count every word occurrence
    sent_generator = dataset_reader.generate_sentences()
    for sent in sent_generator:
        for word in sent:
            raw_word_counts[word] = raw_word_counts.get(word, 0) + 1
    # second pass: count words for each cluster separately, applying a
    # frequency threshold to reduce memory usage
    sent_generator = dataset_reader.generate_sentences()
    for sent in sent_generator:
        cluster_counts[current_cluster] += 1
        for word in sent:
            # apply the frequency threshold
            if raw_word_counts[word] > self.frequent_word_threshold:
                cluster_word_counts[current_cluster][word] = \
                    cluster_word_counts[current_cluster].get(word, 0) + 1
                self.frequent_words_set.add(word)
        # advance to the next cluster (round-robin initialization)
        current_cluster = (current_cluster + 1) % self.num_clusters
    # compute cluster probs from cluster counts and
    # cluster word probs from cluster word counts
    num_sents = sum(cluster_counts)
    self.cluster_probs = [c / num_sents for c in cluster_counts]
    self.num_words_per_cluster = [sum(wc.values()) for wc in cluster_word_counts]
    for i in range(len(cluster_word_counts)):
        current_cluster_word_counts = cluster_word_counts[i]
        num_words_current_cluster = self.num_words_per_cluster[i]
        for word in current_cluster_word_counts:
            # smooth the counts here using the Lidstone smoothing method
            self.cluster_word_probs[i][word] = \
                (current_cluster_word_counts[word] + self.lambda_) / \
                (num_words_current_cluster + self.lambda_ * self.estimated_vocab_size)
    # smooth the cluster probs
    self.cluster_probs = self.smooth_cluster_probs(self.cluster_probs)
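# The Lidstone estimate used above is
#     P(w | x_i) = (count_i(w) + lambda) / (N_i + lambda * |V|)
# where N_i is the number of tokens counted for cluster i and |V| is the
# estimated vocabulary size. A small check with made-up numbers (all values
# below are illustrative, not from the original project):

def _lidstone_example(lambda_=0.1, vocab_size=300000):
    cluster_word_count = 5      # occurrences of the word in the cluster
    cluster_total_count = 1000  # all tokens counted for the cluster
    # a seen word is pulled slightly toward the uniform distribution
    p_seen = (cluster_word_count + lambda_) / (cluster_total_count + lambda_ * vocab_size)
    # an unseen word still receives non-zero mass, keeping log-probs finite
    p_unseen = lambda_ / (cluster_total_count + lambda_ * vocab_size)
    return p_seen, p_unseen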
def query_to_dataloader(data: List[TimeSeriesData]):
    # note: returns a dataloader, not a DataFrame (the original annotation
    # `-> pd.DataFrame` was incorrect)
    df = pd.DataFrame(jsonable_encoder(data))
    df = df.pivot_table(values="consumption", index="timestamp",
                        columns="household_name")
    df.reset_index(inplace=True)
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    dataset_handler = DatasetHandler(data_path=None,
                                     num_samples=None,
                                     batch_size=len(df),
                                     val_split_ratio=0,
                                     pred_horizon=24,
                                     hist_hours=len(df) // 4,
                                     forking_total_seq_length=None)
    dataloader = dataset_handler.load_dataset(df, False)
    return dataloader
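# Illustrative call (a hypothetical example: the timestamps, household names,
# and values below are invented, and TimeSeriesData's exact fields are assumed
# from the pivot above):
#
#   records = [
#       TimeSeriesData(timestamp="2021-01-01T00:00:00", household_name="h1", consumption=0.42),
#       TimeSeriesData(timestamp="2021-01-01T01:00:00", household_name="h1", consumption=0.39),
#   ]
#   dataloader = query_to_dataloader(records)
#
# Each household becomes one column after the pivot, so all series should share
# the same timestamps for the resulting frame to be dense.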
def main(args):
    forking = args.use_forking_sequences
    forking_total_seq_length = 500 if forking else None
    train_el_dataloader, val_el_dataloader = DatasetHandler(
        data_path,
        num_samples=args.dataset_num_samples,
        hist_hours=args.max_sequence_len,
        pred_horizon=args.forcast_horizons,
        batch_size=args.batch_size,  # with forking, use a lower batch size!
        forking_total_seq_length=forking_total_seq_length).load_dataset()

    # save dataloaders for predictions
    os.makedirs(DATALOADERS_PATH, exist_ok=True)
    train_dl_path = os.path.join(DATALOADERS_PATH, "train_dl.pkl")
    test_dl_path = os.path.join(DATALOADERS_PATH, "test_dl.pkl")
    with open(train_dl_path, "wb") as fp:
        pickle.dump(train_el_dataloader, fp)
    with open(test_dl_path, "wb") as fp:
        pickle.dump(val_el_dataloader, fp)

    # the original left both quantile lists commented out, which made the
    # ForecasterQR call below fail on an undefined name; the finer grid is
    # enabled here
    quantiles = [.1, .2, .3, .4, .5, .6, .7, .8, .9, .95]
    # quantiles = [.2, .4, .5, .6, .8]  # coarser alternative

    model = ForecasterQR(
        x_dim=3,
        y_dim=4,
        input_max_squence_len=args.max_sequence_len,
        encoder_hidden_dim=args.encoder_hidden_dim,
        encoder_num_layers=args.encoder_layer_count,
        decoder_context_dim=args.decoder_context_dim,
        quantiles=quantiles,
        horizons=args.forcast_horizons,
        device="gpu",
        init_learning_rate=args.learning_rate,
        init_weight_decay=args.weight_decay,
        # `forking is not None` was always True for a boolean flag
        sequence_forking=forking)

    # model checkpoint callback
    checkpoint_cb = pl.callbacks.ModelCheckpoint(
        dirpath=TRAINED_MODEL_PATH,
        monitor="val_loss",
        filename="model-{epoch:02d}-{val_loss:.2f}")

    trainer = pl.Trainer(
        gpus=args.gpus,
        max_epochs=args.epochs,
        checkpoint_callback=checkpoint_cb,
        num_sanity_val_steps=0)
    trainer.fit(model, train_el_dataloader, val_el_dataloader)

    val_loss = trainer.callback_metrics["val_loss"].item()
    nni.report_final_result({"default": val_loss})
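# ForecasterQR produces one prediction per quantile; quantile-regression models
# of this kind are typically trained with the pinball (quantile) loss. A minimal
# standalone sketch of that loss for a single prediction (our illustration;
# the actual loss lives inside ForecasterQR):

def pinball_loss(y_true, y_pred, quantile):
    """Penalize under-prediction by `quantile` and over-prediction by `1 - quantile`."""
    error = y_true - y_pred
    return max(quantile * error, (quantile - 1) * error)

# For quantile 0.9, under-predicting by 1.0 costs 0.9 while over-predicting by
# 1.0 costs only 0.1, which pushes the fitted prediction toward the 90th percentile.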
def save_dataloaders():
    # pickle the train/validation dataloaders for later prediction runs;
    # relies on the module-level train_dataloader/val_dataloader set in __main__
    os.makedirs(DATALOADERS_PATH, exist_ok=True)
    train_dl_path = os.path.join(DATALOADERS_PATH, "train_dl.pkl")
    test_dl_path = os.path.join(DATALOADERS_PATH, "test_dl.pkl")
    with open(train_dl_path, "wb") as fp:
        pickle.dump(train_dataloader, fp)
    with open(test_dl_path, "wb") as fp:
        pickle.dump(val_dataloader, fp)


if __name__ == '__main__':
    df_train = load_data(TRAIN_DATA_PATH)
    df_test = load_data(TEST_DATA_PATH)
    kernel_seed = 42
    batch_size = 256
    max_sequence_len = 500
    train_dataloader, val_dataloader = DatasetHandler(
        df_train, df_test, batch_size).load_dataset()
    save_dataloaders()
    model = RocketNet(x_dim=1,
                      n_classes=N_CLASSES,
                      kernel_seed=kernel_seed,
                      kernel_count=KERNEL_COUNT,
                      max_sequence_len=max_sequence_len)
    checkpoint_cb = pl.callbacks.ModelCheckpoint(
        dirpath=TRAINED_MODEL_PATH,
        monitor="val_loss",
        filename="model-{epoch:02d}-{val_loss:.2f}")
def run_algorithm(self, training_set_path):
    # initiate the values of theta - the model parameters
    self.model.initiate_word_and_cluster_probs(training_set_path)
    # initiate the data reader and bookkeeping variables
    data_reader = DatasetHandler(training_set_path)
    num_total_training_tokens = data_reader.count_number_of_total_tokens(
        frequent_threshold=self.model.frequent_word_threshold)
    iteration_num = 0
    num_consecutive_decreasing = 0
    current_likelihood = self._compute_likelihood(training_set_path)
    current_perplexity = self._compute_perplexity(current_likelihood, num_total_training_tokens)
    self.iterations_likelihood = [current_likelihood]
    self.iterations_perplexity = [current_perplexity]
    print("likelihood value for iteration {0} is: {1}".format(iteration_num, current_likelihood))
    print("perplexity value for iteration {0} is: {1}".format(iteration_num, current_perplexity))
    # iterate until a stopping criterion is met
    while num_consecutive_decreasing < self.stop_threshold and iteration_num < self.max_num_iterations:
        iteration_num += 1
        # the E step is implemented within the model class;
        # perform the M step to update the model parameters

        # update cluster probs
        new_clusters_mass = [0 for i in range(self.num_clusters)]
        sentences = data_reader.generate_sentences()
        for sent in sentences:
            for i in range(self.num_clusters):
                new_clusters_mass[i] += self.model.get_p_xi_given_sent(i, sent)
        # divide by N and normalize/smooth
        total_mass_all_clusters = sum(new_clusters_mass)
        new_cluster_probs = [cluster_mass / total_mass_all_clusters
                             for cluster_mass in new_clusters_mass]
        new_cluster_probs = self.model.smooth_cluster_probs(new_cluster_probs)

        # update word-cluster probs: sum the "mass" of every sufficiently
        # frequent word under the current model theta
        cluster_word_mass = [{} for i in range(self.num_clusters)]
        sentences = data_reader.generate_sentences()
        for sent in sentences:
            for i in range(self.num_clusters):
                current_model_p_xi_given_sent = self.model.get_p_xi_given_sent(i, sent)
                for word in sent:
                    if word in self.model.frequent_words_set:  # word is frequent enough
                        cluster_word_mass[i][word] = \
                            cluster_word_mass[i].get(word, 0) + current_model_p_xi_given_sent
        clusters_total_mass = [sum(cluster_word_mass[i].values())
                               for i in range(self.num_clusters)]
        # compute probs from the mass, applying Lidstone smoothing to the
        # calculated probabilities
        cluster_word_probs = [{} for i in range(self.num_clusters)]
        lambda_ = self.model.lambda_
        vocab_size = self.model.estimated_vocab_size
        for word in self.model.frequent_words_set:
            for i in range(self.num_clusters):
                # .get guards against a frequent word carrying no mass in a cluster
                current_cluster_word_mass = cluster_word_mass[i].get(word, 0)
                current_cluster_total_mass = clusters_total_mass[i]
                cluster_word_probs[i][word] = \
                    (current_cluster_word_mass + lambda_) / \
                    (current_cluster_total_mass + vocab_size * lambda_)

        # assign the new cluster probs and word-cluster probs to the model
        self.model.cluster_probs = new_cluster_probs
        self.model.cluster_word_probs = cluster_word_probs
        self.model.em_clusters_total_mass = clusters_total_mass

        # print and record the new likelihood and perplexity
        prev_likelihood = current_likelihood
        current_likelihood = self._compute_likelihood(training_set_path)
        current_perplexity = self._compute_perplexity(current_likelihood, num_total_training_tokens)
        self.iterations_likelihood.append(current_likelihood)
        self.iterations_perplexity.append(current_perplexity)
        print("likelihood value for iteration {0} is: {1}".format(iteration_num, current_likelihood))
        print("perplexity value for iteration {0} is: {1}".format(iteration_num, current_perplexity))
        if current_likelihood - prev_likelihood < 0:
            # the likelihood didn't improve in this iteration
            num_consecutive_decreasing += 1
        else:
            # reset the streak so the counter tracks *consecutive* decreases
            num_consecutive_decreasing = 0

    # at the end of the algorithm, write the model parameters (theta)
    # and the per-iteration information
    self.model.save_object_as_pickle()
    DatasetHandler.write_results_to_file(self.iterations_likelihood, "iterations_likelihood.txt")
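# _compute_perplexity is referenced above but not shown in this section. Under
# the usual definition it is the exponentiated negative average log-likelihood
# per token; a minimal sketch, assuming natural logs as in _compute_likelihood
# (math is already imported in this module):

def _perplexity_sketch(log_likelihood, num_tokens):
    return math.exp(-log_likelihood / num_tokens)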
import sys
from os import path

sys.path.append(path.join(path.dirname(__file__), '..'))

from utils.JsonHandler import JsonHandler
from DatasetHandler import DatasetHandler

if __name__ == '__main__':
    # read the config file
    configFile = JsonHandler.read_json("../../conf/create-data-config.json")
    inputDataset = configFile["datasetInfo"]["inputDataset"]
    outputDataset = configFile["datasetInfo"]["outputDataset"]
    classesList = configFile["datasetInfo"]["classes"]
    chunkSize = int(configFile["datasetInfo"]["chunkSize"])
    imageSize = (configFile["datasetInfo"]["imageSize"],
                 configFile["datasetInfo"]["imageSize"])
    featureExtractorName = configFile["featureExtractor"]["name"]

    dataset = DatasetHandler(inputDataset, outputDataset, classesList,
                             chunkSize, imageSize, featureExtractorName)
    dataset.create_data()
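# For reference, an illustrative create-data-config.json matching the keys read
# above (all values are placeholders, not taken from the original project):
#
#   {
#       "datasetInfo": {
#           "inputDataset": "path/to/input_dataset",
#           "outputDataset": "path/to/output_dataset",
#           "classes": ["class_a", "class_b"],
#           "chunkSize": "1000",
#           "imageSize": 224
#       },
#       "featureExtractor": {
#           "name": "resnet50"
#       }
#   }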
class Report(object):
    def __init__(self, trained_model, test_path, likelihood_file, perplexity_file):
        self.model = trained_model
        self.data_handler = DatasetHandler(test_path)
        self.likelihood_file = likelihood_file
        self.perplexity_file = perplexity_file
        self.clusters_topics = []
        self.ordered_topics = ["acq", "money-fx", "grain", "crude", "trade",
                               "interest", "ship", "wheat", "corn"]
        self.clusters_predicted_labels = []

    @staticmethod
    def load_list_from_file(input_file):
        with open(input_file, "r") as f:
            values_as_string = f.readline()
        list_of_strings = values_as_string.split(",")[:-1]
        return [float(v) for v in list_of_strings]

    def plot_iterations_graphs(self, type):
        if type == "p":
            data_type = "Perplexity"
            data = self.load_list_from_file(self.perplexity_file)
            color = 'red'
        else:
            data_type = "Log Likelihood"
            data = self.load_list_from_file(self.likelihood_file)
            color = 'blue'
        title = '{0} as a function of iteration number'.format(data_type)
        iterations = range(1, len(data) + 1)
        plt.style.use('bmh')
        plt.plot(iterations, data, color=color, alpha=0.4)
        plt.xlabel('Iteration number')
        plt.ylabel(data_type)
        plt.title(title)
        plt.grid(True)
        plt.savefig("{0}.png".format(data_type))
        plt.close()

    def create_confusion_mat(self):
        clusters_topics = [{} for i in range(self.model.num_clusters)]
        labeled_sents_generator = self.data_handler.generate_labeled_sentences()
        for sent, labels in labeled_sents_generator:
            predicted_cluster = self.model.classify_sent(sent)
            for label in labels:
                clusters_topics[predicted_cluster][label] = \
                    clusters_topics[predicted_cluster].get(label, 0) + 1
        df = pd.DataFrame(clusters_topics)
        df = df[self.ordered_topics]
        df = df.fillna(0)
        df["total_in_cluster"] = df.sum(axis=1)
        self.clusters_topics = clusters_topics
        df.to_csv("conf_matrix.csv")
        print(df)

    def label_prediction_clusters(self):
        if len(self.clusters_topics) == 0:
            self.create_confusion_mat()
        for cluster_topics_dict in self.clusters_topics:
            sorted_cluster_topics = sorted(cluster_topics_dict.items(),
                                           key=lambda x: x[1], reverse=True)
            self.clusters_predicted_labels.append(sorted_cluster_topics[0])
        print(self.clusters_predicted_labels)

    def plot_cluster_histogram(self, cluster_num):
        topics = self.ordered_topics
        counts = [self.clusters_topics[cluster_num].get(topic, 0) for topic in topics]
        cluster_topic_id = np.argmax(counts)
        title = 'Cluster {0} - "{1}"'.format(cluster_num, topics[cluster_topic_id])
        plt.style.use('bmh')
        plt.bar(topics, counts, color='green', alpha=0.3)
        plt.xlabel('Topics')
        plt.title(title)
        plt.grid(True)
        plt.savefig("cluster_{0}_histogram.png".format(cluster_num))
        plt.close()

    def compute_model_accuracy(self):
        if len(self.clusters_predicted_labels) == 0:
            self.label_prediction_clusters()
        num_correct_predicted = 0
        total_sents = 0
        labeled_sents_generator = self.data_handler.generate_labeled_sentences()
        for sent, labels in labeled_sents_generator:
            predicted_cluster = self.model.classify_sent(sent)
            cluster_label = self.clusters_predicted_labels[predicted_cluster][0]
            if cluster_label in labels:
                num_correct_predicted += 1
            total_sents += 1
        accuracy = num_correct_predicted / total_sents
        print("The accuracy of the model is {0}".format(accuracy))
        return accuracy
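# Illustrative end-to-end use of Report (a sketch; the file names other than
# iterations_likelihood.txt, which run_algorithm writes, are placeholders):
#
#   report = Report(trained_model=model,
#                   test_path="develop.txt",
#                   likelihood_file="iterations_likelihood.txt",
#                   perplexity_file="iterations_perplexity.txt")
#   report.plot_iterations_graphs("p")  # perplexity curve
#   report.plot_iterations_graphs("l")  # log-likelihood curve
#   report.create_confusion_mat()
#   report.label_prediction_clusters()
#   for cluster_num in range(model.num_clusters):
#       report.plot_cluster_histogram(cluster_num)
#   report.compute_model_accuracy()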
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from DatasetHandler import DatasetHandler
from ModelHandler import ModelHandler
from ImageHandler import ImageHandler
from sklearn.model_selection import train_test_split as tts
import cv2
from keras.preprocessing.image import array_to_img

im_handler = ImageHandler()
ds_handler = DatasetHandler()
ds_handler.init_apollo_data_source()
ds_handler.read_frame_big_batch('Apollo')
print(ds_handler.data_paths_apollo_leftcam)

model_train = False
new_model = False

image_paths_l = ds_handler.data_paths_apollo_leftcam    # [100]
image_paths_r = ds_handler.data_paths_apollo_rightcam
output_paths_l = ds_handler.label_paths_apollo_leftcam  # [100]
output_paths_r = ds_handler.label_paths_apollo_rightcam

# TODO: uncomment to visualize augmentations
# im_handler.augment_batch_visualize(image_paths, valid_paths)