Ejemplo n.º 1
0
    def get_data_labels_matrices(self,
                                 workflow_paths,
                                 tool_usage_path,
                                 cutoff_date,
                                 compatible_next_tools,
                                 old_data_dictionary=None):
        """
        Convert the training and test paths into corresponding numpy matrices

        The steps, in order: process the workflow paths, build the tool
        dictionary, shuffle and decompose the raw paths, attach labels to
        each path, split into train/test sets, pad both sets, predict the
        usage of each tool and derive class weights from that prediction.

        Args:
            workflow_paths: forwarded to ``self.process_workflow_paths``.
            tool_usage_path: forwarded to
                ``ToolPopularity.extract_tool_usage``.
            cutoff_date: forwarded to ``ToolPopularity.extract_tool_usage``.
            compatible_next_tools: forwarded to
                ``self.prepare_paths_labels_dictionary``.
            old_data_dictionary: forwarded to
                ``self.create_data_dictionary``. When omitted (``None``) a
                fresh empty dict is used for this call.

        Returns:
            An 8-tuple ``(train_data, train_labels, test_data, test_labels,
            dictionary, reverse_dictionary, class_weights,
            tool_predicted_usage)``.
        """
        # A literal ``{}`` default would be one dict object shared by every
        # call; create a new one per call instead.
        if old_data_dictionary is None:
            old_data_dictionary = {}

        processed_data, raw_paths = self.process_workflow_paths(workflow_paths)
        dictionary, reverse_dictionary = self.create_data_dictionary(
            processed_data, old_data_dictionary)
        num_classes = len(dictionary)

        print("Raw paths: %d" % len(raw_paths))
        # random.shuffle reorders the list in place
        random.shuffle(raw_paths)

        print("Decomposing paths...")
        all_unique_paths = self.decompose_paths(raw_paths, dictionary)
        random.shuffle(all_unique_paths)

        print("Creating dictionaries...")
        multilabels_paths = self.prepare_paths_labels_dictionary(
            dictionary, reverse_dictionary, all_unique_paths,
            compatible_next_tools)

        print("Complete data: %d" % len(multilabels_paths))
        train_paths_dict, test_paths_dict = self.split_test_train_data(
            multilabels_paths)

        print("Train data: %d" % len(train_paths_dict))
        print("Test data: %d" % len(test_paths_dict))

        test_data, test_labels = self.pad_paths(test_paths_dict, num_classes)
        train_data, train_labels = self.pad_paths(train_paths_dict,
                                                  num_classes)

        # Predict tools usage
        print("Predicting tools' usage...")
        usage_pred = predict_tool_usage.ToolPopularity()
        usage = usage_pred.extract_tool_usage(tool_usage_path, cutoff_date,
                                              dictionary)
        # NOTE: "pupularity" is the spelling of the method on ToolPopularity
        tool_usage_prediction = usage_pred.get_pupularity_prediction(usage)
        tool_predicted_usage = self.get_predicted_usage(
            dictionary, tool_usage_prediction)

        # get class weights using the predicted usage for each tool; the
        # number of classes is taken from the second axis of the train labels
        class_weights = self.assign_class_weights(train_labels.shape[1],
                                                  tool_predicted_usage)

        return (train_data, train_labels, test_data, test_labels,
                dictionary, reverse_dictionary, class_weights,
                tool_predicted_usage)
Ejemplo n.º 2
0
    def get_data_labels_matrices(
        self,
        workflow_paths,
        tool_usage_path,
        cutoff_date,
        compatible_next_tools,
        standard_connections,
        old_data_dictionary=None,
    ):
        """
        Convert the training and test paths into corresponding numpy matrices

        The steps, in order: process the workflow paths, build the tool
        dictionary, shuffle and decompose the raw paths, attach labels to
        each path, split into train/test sets, pad both sets, estimate the
        last-tool frequency of the training paths, predict the usage of
        each tool and derive class weights from that prediction.

        Args:
            workflow_paths: forwarded to ``self.process_workflow_paths``.
            tool_usage_path: forwarded to
                ``ToolPopularity.extract_tool_usage``.
            cutoff_date: forwarded to ``ToolPopularity.extract_tool_usage``.
            compatible_next_tools: forwarded to
                ``self.prepare_paths_labels_dictionary``.
            standard_connections: forwarded to ``self.pad_paths`` for both
                the train and the test set.
            old_data_dictionary: forwarded to
                ``self.create_data_dictionary``. When omitted (``None``) a
                fresh empty dict is used for this call.

        Returns:
            A 10-tuple ``(train_data, train_labels, test_data, test_labels,
            dictionary, rev_dict, class_weights, t_pred_usage, l_tool_freq,
            l_tool_tr_samples)``.
        """
        # A literal ``{}`` default would be one dict object shared by every
        # call; create a new one per call instead.
        if old_data_dictionary is None:
            old_data_dictionary = {}

        processed_data, raw_paths = self.process_workflow_paths(workflow_paths)
        dictionary, rev_dict = self.create_data_dictionary(
            processed_data, old_data_dictionary
        )
        num_classes = len(dictionary)

        print("Raw paths: %d" % len(raw_paths))
        # random.shuffle reorders the list in place
        random.shuffle(raw_paths)

        print("Decomposing paths...")
        all_unique_paths = self.decompose_paths(raw_paths, dictionary)
        random.shuffle(all_unique_paths)

        print("Creating dictionaries...")
        multilabels_paths = self.prepare_paths_labels_dictionary(
            dictionary, rev_dict, all_unique_paths, compatible_next_tools
        )

        print("Complete data: %d" % len(multilabels_paths))
        train_paths_dict, test_paths_dict = self.split_test_train_data(
            multilabels_paths
        )

        print("Train data: %d" % len(train_paths_dict))
        print("Test data: %d" % len(test_paths_dict))

        print("Padding train and test data...")
        # pad training and test data with leading zeros
        test_data, test_labels = self.pad_paths(
            test_paths_dict, num_classes, standard_connections, rev_dict
        )
        train_data, train_labels = self.pad_paths(
            train_paths_dict, num_classes, standard_connections, rev_dict
        )

        print("Estimating sample frequency...")
        l_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict)
        l_tool_tr_samples = self.get_toolid_samples(train_data, l_tool_freq)

        # Predict tools usage
        print("Predicting tools' usage...")
        usage_pred = predict_tool_usage.ToolPopularity()
        usage = usage_pred.extract_tool_usage(tool_usage_path, cutoff_date, dictionary)
        # NOTE: "pupularity" is the spelling of the method on ToolPopularity
        tool_usage_prediction = usage_pred.get_pupularity_prediction(usage)
        t_pred_usage = self.get_predicted_usage(dictionary, tool_usage_prediction)

        # get class weights using the predicted usage for each tool
        class_weights = self.assign_class_weights(num_classes, t_pred_usage)

        return (
            train_data,
            train_labels,
            test_data,
            test_labels,
            dictionary,
            rev_dict,
            class_weights,
            t_pred_usage,
            l_tool_freq,
            l_tool_tr_samples,
        )