def get_data_labels_matrices(self, workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, old_data_dictionary=None):
    """
    Convert the training and test workflow paths into padded numpy matrices.

    Parameters
    ----------
    workflow_paths : raw workflow paths to be processed into tool sequences.
    tool_usage_path : path to the tool-usage log consumed by
        ``predict_tool_usage.ToolPopularity``.
    cutoff_date : date limit used when extracting tool usage.
    compatible_next_tools : mapping of each tool to its compatible next tools,
        used when building the multilabel path dictionary.
    old_data_dictionary : optional pre-existing tool dictionary to extend;
        defaults to an empty dict.

    Returns
    -------
    tuple of (train_data, train_labels, test_data, test_labels, dictionary,
    reverse_dictionary, class_weights, tool_predicted_usage).
    """
    # NOTE(review): was a mutable default argument ({}); use None sentinel
    # so the default dict is not shared across calls.
    if old_data_dictionary is None:
        old_data_dictionary = {}
    processed_data, raw_paths = self.process_workflow_paths(workflow_paths)
    dictionary, reverse_dictionary = self.create_data_dictionary(
        processed_data, old_data_dictionary)
    num_classes = len(dictionary)

    print("Raw paths: %d" % len(raw_paths))
    random.shuffle(raw_paths)

    print("Decomposing paths...")
    all_unique_paths = self.decompose_paths(raw_paths, dictionary)
    random.shuffle(all_unique_paths)

    print("Creating dictionaries...")
    multilabels_paths = self.prepare_paths_labels_dictionary(
        dictionary, reverse_dictionary, all_unique_paths, compatible_next_tools)

    print("Complete data: %d" % len(multilabels_paths))
    train_paths_dict, test_paths_dict = self.split_test_train_data(
        multilabels_paths)

    print("Train data: %d" % len(train_paths_dict))
    print("Test data: %d" % len(test_paths_dict))

    # Pad paths with zeros so every sample has the same length.
    test_data, test_labels = self.pad_paths(test_paths_dict, num_classes)
    train_data, train_labels = self.pad_paths(train_paths_dict, num_classes)

    # Predict tools usage
    print("Predicting tools' usage...")
    usage_pred = predict_tool_usage.ToolPopularity()
    usage = usage_pred.extract_tool_usage(tool_usage_path, cutoff_date, dictionary)
    # NOTE(review): "pupularity" is a typo in the upstream API name; kept
    # as-is because renaming would break the ToolPopularity contract.
    tool_usage_prediction = usage_pred.get_pupularity_prediction(usage)
    tool_predicted_usage = self.get_predicted_usage(
        dictionary, tool_usage_prediction)

    # get class weights using the predicted usage for each tool
    class_weights = self.assign_class_weights(train_labels.shape[1], tool_predicted_usage)

    return train_data, train_labels, test_data, test_labels, dictionary, reverse_dictionary, class_weights, tool_predicted_usage
def get_data_labels_matrices(
    self,
    workflow_paths,
    tool_usage_path,
    cutoff_date,
    compatible_next_tools,
    standard_connections,
    old_data_dictionary=None,
):
    """
    Convert the training and test workflow paths into padded numpy matrices.

    Parameters
    ----------
    workflow_paths : raw workflow paths to be processed into tool sequences.
    tool_usage_path : path to the tool-usage log consumed by
        ``predict_tool_usage.ToolPopularity``.
    cutoff_date : date limit used when extracting tool usage.
    compatible_next_tools : mapping of each tool to its compatible next tools,
        used when building the multilabel path dictionary.
    standard_connections : standard tool connections passed through to
        ``pad_paths``.
    old_data_dictionary : optional pre-existing tool dictionary to extend;
        defaults to an empty dict.

    Returns
    -------
    tuple of (train_data, train_labels, test_data, test_labels, dictionary,
    reverse_dictionary, class_weights, predicted_usage, last_tool_frequencies,
    last_tool_train_samples).
    """
    # NOTE(review): was a mutable default argument ({}); use None sentinel
    # so the default dict is not shared across calls.
    if old_data_dictionary is None:
        old_data_dictionary = {}
    processed_data, raw_paths = self.process_workflow_paths(workflow_paths)
    dictionary, rev_dict = self.create_data_dictionary(
        processed_data, old_data_dictionary
    )
    num_classes = len(dictionary)

    print("Raw paths: %d" % len(raw_paths))
    random.shuffle(raw_paths)

    print("Decomposing paths...")
    all_unique_paths = self.decompose_paths(raw_paths, dictionary)
    random.shuffle(all_unique_paths)

    print("Creating dictionaries...")
    multilabels_paths = self.prepare_paths_labels_dictionary(
        dictionary, rev_dict, all_unique_paths, compatible_next_tools
    )

    print("Complete data: %d" % len(multilabels_paths))
    train_paths_dict, test_paths_dict = self.split_test_train_data(
        multilabels_paths
    )

    print("Train data: %d" % len(train_paths_dict))
    print("Test data: %d" % len(test_paths_dict))

    print("Padding train and test data...")
    # pad training and test data with leading zeros
    test_data, test_labels = self.pad_paths(
        test_paths_dict, num_classes, standard_connections, rev_dict
    )
    train_data, train_labels = self.pad_paths(
        train_paths_dict, num_classes, standard_connections, rev_dict
    )

    print("Estimating sample frequency...")
    # frequency of the last tool of each training path, then per-tool samples
    l_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict)
    l_tool_tr_samples = self.get_toolid_samples(train_data, l_tool_freq)

    # Predict tools usage
    print("Predicting tools' usage...")
    usage_pred = predict_tool_usage.ToolPopularity()
    usage = usage_pred.extract_tool_usage(tool_usage_path, cutoff_date, dictionary)
    # NOTE(review): "pupularity" is a typo in the upstream API name; kept
    # as-is because renaming would break the ToolPopularity contract.
    tool_usage_prediction = usage_pred.get_pupularity_prediction(usage)
    t_pred_usage = self.get_predicted_usage(dictionary, tool_usage_prediction)

    # get class weights using the predicted usage for each tool
    class_weights = self.assign_class_weights(num_classes, t_pred_usage)

    return (
        train_data,
        train_labels,
        test_data,
        test_labels,
        dictionary,
        rev_dict,
        class_weights,
        t_pred_usage,
        l_tool_freq,
        l_tool_tr_samples,
    )