def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        dataframe_resource_id, dataframe = base_utils.get_tabular_resource(
            inputs, self.hyperparams["dataframe_resource"])

        base_file_path = "/".join(
            inputs.metadata._current_metadata.metadata["location_uris"]
            [0].split("/")[:-1])
        graph1 = os.path.join(base_file_path, "graphs",
                              inputs["0"].values[0][0])
        graph1 = nx.read_gml(graph1[7:])
        # relabel integer node ids to strings so downstream lookups are consistent
        int2str_map = dict(zip(graph1.nodes, [str(n) for n in graph1.nodes]))
        graph = nx.relabel_nodes(graph1, mapping=int2str_map)

        dataframe.metadata = self._update_metadata(inputs.metadata,
                                                   dataframe_resource_id)

        assert isinstance(dataframe, container.DataFrame), type(dataframe)

        U_train = {"graph": graph}
        y_train = self.produce_target(inputs=inputs).value
        X_train = dataframe

        X_train = self._typify_dataframe(X_train)

        return base.CallResult([X_train, y_train, U_train])
    def produce(self,
                *,
                inputs: container.List,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[container.Dataset]:
        # build the list of dataframes from the list of inputs
        dataframes = []
        metadata = None
        for input in inputs:
            if isinstance(input, container.DataFrame):
                dataframes.append(input)
                continue
            try:
                _, main_dr = d3m_base_utils.get_tabular_resource(input, None)
                dataframes.append(main_dr)
                metadata = input.metadata
            except ValueError as error:
                raise exceptions.InvalidArgumentValueError(
                    "Failure to find tabular resource in dataset") from error

        if self.hyperparams["column_overlap"] == "exact":
            columns_to_handle = dataframes[0].columns
            if not all(df.columns.equals(columns_to_handle) for df in dataframes):
                raise exceptions.InvalidArgumentValueError(
                    "Dataframes don't have the same columns, cannot perform exact concat")
            concated = pd.concat(dataframes, ignore_index=True)
        elif self.hyperparams["column_overlap"] == "union":
            concated = pd.concat(dataframes, ignore_index=True)
        elif self.hyperparams["column_overlap"] == "intersection":
            concated = pd.concat(dataframes, join="inner", ignore_index=True)

        if self.hyperparams["remove_duplicate_rows"]:
            concated.drop_duplicates(subset="d3mIndex",
                                     keep="first",
                                     inplace=True,
                                     ignore_index=True)

        if metadata is None:
            metadata = container.Dataset({
                "learningData": concated.head(1)
            },
                                         generate_metadata=True).metadata
        outputs = container.Dataset({"learningData": concated}, metadata)
        outputs.metadata = outputs.metadata.update(
            (metadata_base.ALL_ELEMENTS, ),
            {"dimension": {
                "length": concated.shape[0]
            }})

        return base.CallResult(outputs)
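
The three "column_overlap" modes above map directly onto plain pandas concat behavior. A minimal standalone sketch (hypothetical frames, no d3m containers involved):

import pandas as pd

a = pd.DataFrame({"d3mIndex": [0, 1], "x": [1.0, 2.0], "y": ["a", "b"]})
b = pd.DataFrame({"d3mIndex": [2, 3], "x": [3.0, 4.0], "z": [True, False]})

# "union": keep every column, filling missing cells with NaN
union = pd.concat([a, b], ignore_index=True)

# "intersection": keep only the columns present in all frames
inner = pd.concat([a, b], join="inner", ignore_index=True)

# "exact" would first assert that all frames share identical columns, then concat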
Example 3
def get_dataframe(dataset: container.Dataset, resource_id: str) -> container.DataFrame:
    """ extracts a dataframe from a dataset and ensures its metadata is transferred over """

    # grab the resource and its metadata out of the dataset
    dataframe_resource_id, dataframe = base_utils.get_tabular_resource(dataset, resource_id)
    resource_metadata = dict(dataset.metadata.query((dataframe_resource_id,)))
    # copy the resource metadata from the dataset into the resource
    new_metadata = metadata_base.DataMetadata(resource_metadata)
    new_metadata = dataset.metadata.copy_to(new_metadata, (resource_id,))
    new_metadata = new_metadata.remove_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint')
    dataframe.metadata = new_metadata

    return dataframe
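
A usage sketch for this helper, assuming a locally stored D3M dataset (the path and the "learningData" resource id are illustrative):

from d3m import container

dataset = container.Dataset.load("file:///path/to/datasetDoc.json")
df = get_dataframe(dataset, "learningData")
print(df.shape, df.metadata.query_column(0))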
Example 4
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        dataframe_resource_id, dataframe = base_utils.get_tabular_resource(
            inputs, self.hyperparams['dataframe_resource'])

        dataframe.metadata = self._update_metadata(inputs.metadata,
                                                   dataframe_resource_id)

        assert isinstance(dataframe, container.DataFrame), type(dataframe)

        return base.CallResult(dataframe)
    def produce_target(self,
                       *,
                       inputs: Inputs,
                       timeout: float = None,
                       iterations: int = None
                       ) -> base.CallResult[container.DataFrame]:
        logger.debug(f"Running {__name__} produce_target")

        _, dataframe = base_utils.get_tabular_resource(
            inputs, self.hyperparams["dataframe_resource"])
        outputs = dataframe.copy()

        # find the target column and remove all others
        num_cols = outputs.metadata.query(
            (metadata_base.ALL_ELEMENTS, ))["dimension"]["length"]
        target_idx = -1
        suggested_target_idx = -1
        for i in range(num_cols):
            semantic_types = outputs.metadata.query(
                (metadata_base.ALL_ELEMENTS, i))["semantic_types"]
            if ("https://metadata.datadrivendiscovery.org/types/Target"
                    in semantic_types or
                    "https://metadata.datadrivendiscovery.org/types/TrueTarget"
                    in semantic_types):
                target_idx = i
                outputs = self._update_type_info(semantic_types, outputs, i)
            elif ("https://metadata.datadrivendiscovery.org/types/SuggestedTarget"
                  in semantic_types):
                suggested_target_idx = i
            elif ("https://metadata.datadrivendiscovery.org/types/PrimaryKey"
                  in semantic_types):
                outputs = self._update_type_info(semantic_types, outputs, i)
        # fall back on suggested target
        if target_idx == -1:
            target_idx = suggested_target_idx

        # flip the d3mIndex to be the df index as well
        outputs = outputs.set_index("d3mIndex", drop=False)

        remove_indices = set(range(num_cols))
        remove_indices.remove(target_idx)
        outputs = outputs.remove_columns(remove_indices)

        logger.debug(f"\n{outputs.dtypes}")
        logger.debug(f"\n{outputs}")

        return base.CallResult(outputs)
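
The target scan above keys off per-column semantic types stored in the d3m metadata. A minimal sketch of the same check for a single column (assuming outputs is any d3m DataFrame):

from d3m.metadata import base as metadata_base

semantic_types = outputs.metadata.query(
    (metadata_base.ALL_ELEMENTS, 0)).get("semantic_types", ())
is_true_target = (
    "https://metadata.datadrivendiscovery.org/types/TrueTarget" in semantic_types)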
Example 6
def get_dataframe(dataset: container.Dataset, resource_id: str, target_col: int) -> container.DataFrame:
    """ extracts a dataframe from a dataset and ensures its metadata is transferred over """

    # grab the resource and its metadata out of the dataset
    dataframe_resource_id, dataframe = base_utils.get_tabular_resource(dataset, resource_id)
    resource_metadata = dict(dataset.metadata.query((dataframe_resource_id,)))

    # copy the resource metadata from the dataset into the resource
    new_metadata = metadata_base.DataMetadata(resource_metadata)
    new_metadata = dataset.metadata.copy_to(new_metadata, (resource_id,))
    new_metadata = new_metadata.remove_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint')

    # add target metadata to specified column
    new_metadata = new_metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, target_col),
        'https://metadata.datadrivendiscovery.org/types/TrueTarget'
    )
    dataframe.metadata = new_metadata
    return dataframe
    def produce(
        self,
        *,
        inputs: container.Dataset,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        logger.debug(f"Running {__name__}")

        # get the learning data (the dataset entry point)
        learning_id, learning_df = base_utils.get_tabular_resource(
            inputs, None, pick_entry_point=True)
        learning_df = learning_df.head(
            int(learning_df.shape[0] * self.hyperparams["sample"]))
        learning_df.metadata = self._update_metadata(inputs.metadata,
                                                     learning_id, learning_df)

        logger.debug(f"\n{learning_df}")

        return base.CallResult(learning_df)
Example 8
    def set_training_data(self, *, inputs: Inputs) -> None:
        self._target_resource_id, _ = d3m_utils.get_tabular_resource(
            inputs, self.hyperparams["target_resource"])
        self._inputs = inputs
        self._fitted = False
Example 9
    def set_training_data(self, *, inputs: Input) -> None:
        self._training_inputs = inputs
        main_resource_id, main_resource = d3m_utils.get_tabular_resource(
            inputs, None, has_hyperparameter=False)
        self._main_resource_id = main_resource_id
        self._fitted = False
    def produce_collection(
        self,
        *,
        inputs: container.Dataset,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        logger.debug(f"Running {__name__}")

        # get the learning data (the dataset entry point)
        learning_id, learning_df = base_utils.get_tabular_resource(
            inputs, None, pick_entry_point=True)

        learning_df = learning_df.head(
            int(learning_df.shape[0] * self.hyperparams["sample"]))
        learning_df.metadata = self._update_metadata(inputs.metadata,
                                                     learning_id, learning_df)

        # find the column that is acting as the foreign key and extract the resource + column it references
        resource_id = None
        file_column_idx = None
        for i in range(
                learning_df.metadata.query(
                    (metadata_base.ALL_ELEMENTS, ))["dimension"]["length"]):
            column_metadata = learning_df.metadata.query_column(i)
            if ("foreign_key" in column_metadata
                    and column_metadata["foreign_key"]["type"] == "COLUMN"):
                resource_id = column_metadata["foreign_key"]["resource_id"]
                file_column_idx = column_metadata["foreign_key"]["column_index"]
        if resource_id is None:
            raise ValueError(
                "no foreign key column referencing the file collection was found")

        # get the collection resource referenced by the foreign key
        collection_id, collection_df = base_utils.get_tabular_resource(
            inputs, resource_id)

        collection_df = collection_df.head(learning_df.shape[0])
        collection_df.metadata = self._update_metadata(inputs.metadata,
                                                       collection_id,
                                                       collection_df)

        # get the base path
        base_path = collection_df.metadata.query(
            (metadata_base.ALL_ELEMENTS,
             file_column_idx))["location_base_uris"][0]

        # create fully resolved paths and load

        file_paths = []
        for i, row in learning_df.iterrows():
            if i % 100 == 0:
                logger.debug(f"Resolved {i} / {len(learning_df.index)} file paths")
            try:
                start_end = row["start-end-time-slice-of-recording"]
                start, end = [float(x) for x in start_end.split(",")]
                file_paths.append((os.path.join(base_path,
                                                row["filename"]), start, end))
            except AttributeError:
                logger.warning(f"no start/end time slice for {row}")
                file_paths.append((os.path.join(base_path,
                                                row["filename"]), None, None))

        outputs = self._audio_load(self.hyperparams["n_jobs"], file_paths)

        logger.debug(f"\n{outputs}")

        result_df = pd.DataFrame({"audio":
                                  outputs})  # d3m container takes for_ever_
        return base.CallResult(
            container.DataFrame(result_df, generate_metadata=False))
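
_audio_load is not shown in this snippet. A plausible sketch of what it might do, assuming librosa for decoding and joblib for the n_jobs fan-out (both are assumptions, not part of the original code):

import librosa
from joblib import Parallel, delayed

def _load_one(path, start, end):
    # librosa decodes [offset, offset + duration) when both bounds are given
    if start is not None and end is not None:
        audio, _sr = librosa.load(path, offset=start, duration=end - start)
    else:
        audio, _sr = librosa.load(path)
    return audio

def _audio_load(n_jobs, file_paths):
    return Parallel(n_jobs=n_jobs)(
        delayed(_load_one)(path, start, end) for path, start, end in file_paths)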
    def _evaluate(self,
                  configuration: ConfigurationPoint,
                  cache: PrimitivesCache,
                  dump2disk: bool = True) -> typing.Dict:

        start_time = time.time()
        pipeline = self.template.to_pipeline(configuration)
        # TODO: update ResourceManager to run the pipeline: ResourceManager.add_pipeline(pipeline)
        # initialize the repeat_times levels
        self._repeat_times_level_2 = 1
        self._repeat_times_level_1 = 1

        # for time series forecasting, we can't compare directly
        if self.problem['problem'][
                'task_type'] == TaskType.TIME_SERIES_FORECASTING:
            # just skip for now
            # TODO: add a way to evaluate time series forecasting pipeline quality
            # (something like a sliding window)
            fitted_pipeline = FittedPipeline(
                pipeline=pipeline,
                dataset_id=self.train_dataset1.metadata.query(())['id'],
                metric_descriptions=self.performance_metrics,
                template=self.template,
                problem=self.problem,
                extra_primitive=self.extra_primitive,
                random_seed=self.random_seed)
            fitted_pipeline.fit(cache=cache, inputs=[self.train_dataset1])
            fitted_pipeline.save(self.output_directory)

            training_ground_truth = get_target_columns(self.train_dataset1)

            # fake_metric = calculate_score(training_ground_truth, training_ground_truth,
            #     self.performance_metrics, self.task_type, SpecialMetric().regression_metric)

            fake_metric = score_prediction(training_ground_truth,
                                           [self.train_dataset1], self.problem,
                                           self.performance_metrics,
                                           self.random_seed)

            # HACK: if this is the mean baseline template, make its score slightly worse
            if fitted_pipeline.template_name == 'SRI_Mean_Baseline_Template':
                result = fake_metric[0]
                if result['metric'].best_value(
                ) < result['metric'].worst_value():
                    result['value'] = result['value'] + 0.1
                    fake_metric[0].normalize(result['value'])
                else:
                    result['value'] = result['value'] - 0.1
                    fake_metric[0].normalize(result['value'])

            fitted_pipeline.set_metric(fake_metric[0])

            # [{'column_name': 'Class', 'metric': 'f1', 'value': 0.1}]
            data = {
                # 2019-7-10: return pipeline.id as id to make debugging easier
                'id': fitted_pipeline.pipeline.id,
                'fid': fitted_pipeline.id,
                'fitted_pipeline': fitted_pipeline,
                'training_metrics': fake_metric,
                'cross_validation_metrics': None,
                'test_metrics': fake_metric,
                'total_runtime': time.time() - start_time,
                'configuration': configuration,
                'ensemble_tuning_result': None,
                'ensemble_tuning_metrics': None,
            }

            fitted_pipeline.auxiliary = dict(data)
            fitted_pipeline.save(self.output_directory)
            return data

        # the following code should only run in normal validation mode, where the data can be split and tested
        # if in cross validation mode
        if self.testing_mode == Mode.CROSS_VALIDATION_MODE:
            self._repeat_times_level_2 = int(
                self.validation_config['cross_validation'])
            # start training and testing
            fitted_pipeline = FittedPipeline(
                pipeline=pipeline,
                dataset_id=self.train_dataset1.metadata.query(())['id'],
                metric_descriptions=self.performance_metrics,
                template=self.template,
                problem=self.problem,
                extra_primitive=self.extra_primitive,
                random_seed=self.random_seed)

            fitted_pipeline.fit(cache=cache, inputs=[self.train_dataset1])

            training_prediction = fitted_pipeline.get_fit_step_output(
                self.template.get_output_step_number())
            # training_ground_truth = get_target_columns(self.train_dataset1)
            # training_metrics = calculate_score(training_ground_truth, training_prediction,
            #     self.performance_metrics, self.task_type, SpecialMetric().regression_metric)
            training_metrics = score_prediction(training_prediction,
                                                [self.train_dataset1],
                                                self.problem,
                                                self.performance_metrics,
                                                self.random_seed)

            cv_metrics = fitted_pipeline.get_cross_validation_metrics()
            test_metrics = copy.deepcopy(training_metrics)

            # use cross validation's avg value as the test score
            for i in range(len(test_metrics)):
                test_metrics[i]["value"] = cv_metrics[i]["value"]

            _logger.info("CV finish")

        # if in normal testing mode (including the default mode with one train/test split each)
        else:
            # update: 2019.3.19
            # no need to run the inner (level 2) split; run based on the level 1 split now
            if self.testing_mode == Mode.TRAIN_TEST_MODE:
                self._repeat_times_level_1 = int(
                    self.validation_config['test_validation'])

            _logger.info(
                "Will use normal train-test mode (n={}) to choose best primitives."
                .format(self._repeat_times_level_2))

            training_metrics = []
            test_metrics = []

            for each_repeat in range(self._repeat_times_level_2):
                # start training and testing
                fitted_pipeline = FittedPipeline(
                    pipeline=pipeline,
                    dataset_id=self.train_dataset2[each_repeat].metadata.query(
                        ())['id'],
                    metric_descriptions=self.performance_metrics,
                    template=self.template,
                    problem=self.problem,
                    extra_primitive=self.extra_primitive,
                    random_seed=self.random_seed)

                fitted_pipeline.fit(cache=cache,
                                    inputs=[self.train_dataset2[each_repeat]])
                # fitted_pipeline.fit(inputs=[self.train_dataset2[each_repeat]])
                training_prediction = fitted_pipeline.get_fit_step_output(
                    self.template.get_output_step_number())

                # training_ground_truth = get_target_columns(self.train_dataset2[each_repeat])
                # training_metrics_each = calculate_score(
                #     training_ground_truth, training_prediction,
                #     self.performance_metrics, self.task_type, SpecialMetric().regression_metric)
                training_metrics_each = score_prediction(
                    training_prediction, [self.train_dataset2[each_repeat]],
                    self.problem, self.performance_metrics, self.random_seed)

                # only do test if the test_dataset exist
                if self.test_dataset2[each_repeat] is not None:
                    results = fitted_pipeline.produce(
                        inputs=[self.test_dataset2[each_repeat]])
                    # Note: results == test_prediction
                    test_prediction = fitted_pipeline.get_produce_step_output(
                        self.template.get_output_step_number())

                    # test_ground_truth = get_target_columns(self.test_dataset2[each_repeat])
                    # test_metrics_each = calculate_score(test_ground_truth, test_prediction,
                    #     self.performance_metrics, self.task_type, SpecialMetric().regression_metric)
                    test_metrics_each = score_prediction(
                        test_prediction, [self.test_dataset2[each_repeat]],
                        self.problem, self.performance_metrics,
                        self.random_seed)

                else:
                    # test_ground_truth = None
                    test_prediction = None
                    test_metrics_each = copy.deepcopy(training_metrics_each)
                    for each in test_metrics_each:
                        each["value"] = each['metric'].worst_value()

                training_metrics.append(training_metrics_each)
                test_metrics.append(test_metrics_each)
            # END for TRAIN_TEST_MODES
            # sample format of the output:
            # [{'metric': 'f1Macro', 'value': 0.48418535913661614, 'values': [0.4841025641025641,
            #  0.4841025641025641, 0.4843509492047203]}]
            # modify the test_metrics and training_metrics format to fit the requirements

            if len(training_metrics) > 1:
                training_metrics = self.conclude_k_fold_metrics(
                    training_metrics)
            else:
                if type(training_metrics[0]) is list:
                    training_metrics = training_metrics[0]

            if len(test_metrics) > 1:
                test_metrics = self.conclude_k_fold_metrics(test_metrics)
            else:
                if type(test_metrics[0]) is list:
                    test_metrics = test_metrics[0]
        # END evaluation part

        # Save results
        ensemble_tuning_result = None
        ensemble_tuning_metrics = None
        if self.test_dataset1 is None:
            # print("The dataset no need to split of split failed, will not train again.")
            fitted_pipeline2 = fitted_pipeline
            # set the metric for calculating the rank
            fitted_pipeline2.set_metric(training_metrics[0])
            cv = fitted_pipeline2.get_cross_validation_metrics()
            if not cv:
                # CandidateCache asserts cv must be a list
                cv = []

            data = {
                # 2019-7-10: return pipeline.id as id to make debugging easier
                'id': fitted_pipeline2.pipeline.id,
                'fid': fitted_pipeline2.id,
                'fitted_pipeline': fitted_pipeline2,
                'training_metrics': training_metrics,
                'cross_validation_metrics': cv,
                'test_metrics': training_metrics,
                'total_runtime': time.time() - start_time,
                'configuration': configuration,
                'ensemble_tuning_result': ensemble_tuning_result,
                'ensemble_tuning_metrics': ensemble_tuning_metrics,
            }
            fitted_pipeline.auxiliary = dict(data)

            # print("!!!! No test_dataset1")
            # pprint(data)
            # print("!!!!")

            if _logger.getEffectiveLevel() <= 10:  # logging.DEBUG
                data_to_logger_info = []
                # test_metrics is a list of metric dicts; report the first entry
                first_metric = data['test_metrics'][0] if data['test_metrics'] else {}
                if 'metric' in first_metric:
                    data_to_logger_info.append(first_metric['metric'])
                else:
                    data_to_logger_info.append("No test metrics metric found")
                if 'value' in first_metric:
                    data_to_logger_info.append(first_metric['value'])
                else:
                    data_to_logger_info.append("No test metrics value found")
                _logger.info(
                    'fitted id: %(fitted_pipeline_id)s, metric: %(metric)s, value: %(value)s',
                    {
                        'fitted_pipeline_id': fitted_pipeline2.id,
                        'metric': data_to_logger_info[0],
                        'value': data_to_logger_info[1]
                    })

            # Save fitted pipeline
            pickled = False
            if self.output_directory is not None and dump2disk:
                try:
                    fitted_pipeline2.save(self.output_directory)
                    pickled = True
                except Exception as e:
                    _logger.warning(
                        f'SKIPPING Pickle test. Saving pipeline failed: {e}'
                    )

            # Pickle test
            try:
                if pickled and self.output_directory is not None and dump2disk:
                    _logger.debug("Test pickled pipeline. id: {}".format(
                        fitted_pipeline2.id))
                    self.test_pickled_pipeline(
                        folder_loc=self.output_directory,
                        pipeline_id=fitted_pipeline2.id,
                        test_dataset=self.train_dataset2[0],
                        test_metrics=training_metrics
                        # test_ground_truth=get_target_columns(self.train_dataset2[0], self.problem)
                    )
            except Exception as e:
                _logger.exception('Pickle test Failed', exc_info=True)
        else:
            # update v2019.3.17, running k-fold cross validation on the level 1 split
            if self.quick_mode:
                _logger.info(
                    "[INFO] Now in quick mode, will skip training with train_dataset1"
                )
                # in quick mode, we do not fit the model with train_dataset1 again;
                # just generate predictions on test_dataset1 directly and get the rank
                fitted_pipeline2 = fitted_pipeline
                fitted_pipeline2.produce(inputs=[self.test_dataset1])
                test_prediction = fitted_pipeline2.get_produce_step_output(
                    self.template.get_output_step_number())

                # test_ground_truth = get_target_columns(self.test_dataset1)
                # test_metrics2 = calculate_score(test_ground_truth, test_prediction,
                #     self.performance_metrics, self.task_type, SpecialMetric().regression_metric)
                test_metrics2 = score_prediction(test_prediction,
                                                 [self.test_dataset1],
                                                 self.problem,
                                                 self.performance_metrics,
                                                 self.random_seed)

            else:
                _logger.info(
                    "[INFO] Now in normal mode, will add extra train with train_dataset1"
                )
                # otherwise train again with dataset_train1 and get the rank

                if self._repeat_times_level_1 > 1:
                    # generate splits based on level 1 (all-dataset k-fold cross validation)
                    from common_primitives.kfold_split import KFoldDatasetSplitPrimitive, Hyperparams as hyper_k_fold
                    hyperparams_split = hyper_k_fold.defaults()
                    hyperparams_split = hyperparams_split.replace({
                        "number_of_folds":
                        self._repeat_times_level_1,
                        "shuffle":
                        True
                    })
                    if self.task_type == 'CLASSIFICATION':
                        hyperparams_split = hyperparams_split.replace(
                            {"stratified": True})
                    else:  # REGRESSION and any other task type
                        hyperparams_split = hyperparams_split.replace(
                            {"stratified": False})
                    split_primitive = KFoldDatasetSplitPrimitive(
                        hyperparams=hyperparams_split)
                    split_primitive.set_training_data(dataset=self.all_dataset)
                    split_primitive.fit()
                    query_dataset_list = list(range(
                        self._repeat_times_level_1))
                    train_return = split_primitive.produce(
                        inputs=query_dataset_list).value  #['learningData']
                    test_return = split_primitive.produce_score_data(
                        inputs=query_dataset_list).value

                    all_test_metrics = []
                    for i in range(self._repeat_times_level_1):
                        current_train_dataset = train_return[i]
                        current_test_dataset = test_return[i]
                        fitted_pipeline2 = FittedPipeline(
                            pipeline=pipeline,
                            dataset_id=current_train_dataset.metadata.query(
                                ())['id'],
                            metric_descriptions=self.performance_metrics,
                            template=self.template,
                            problem=self.problem,
                            extra_primitive=self.extra_primitive,
                            random_seed=self.random_seed)
                        # retrain and compute ranking/metric using self.train_dataset
                        # fitted_pipeline2.fit(inputs = [self.train_dataset1])
                        fitted_pipeline2.fit(cache=cache,
                                             inputs=[current_train_dataset])
                        fitted_pipeline2.produce(inputs=[current_test_dataset])
                        test_prediction = fitted_pipeline2.get_produce_step_output(
                            self.template.get_output_step_number())

                        # test_ground_truth = get_target_columns(current_test_dataset)
                        # test_metrics_temp = calculate_score(test_ground_truth, test_prediction,
                        #     self.performance_metrics, self.task_type, SpecialMetric().regression_metric)
                        test_metrics_temp = score_prediction(
                            test_prediction, [current_test_dataset],
                            self.problem, self.performance_metrics,
                            self.random_seed)

                        all_test_metrics.append(test_metrics_temp)

                    results = self.conclude_k_fold_metrics(all_test_metrics)
                    test_metrics2 = results[0]
                else:
                    # otherwise still do as previously
                    fitted_pipeline2 = FittedPipeline(
                        pipeline=pipeline,
                        dataset_id=self.train_dataset1.metadata.query(
                            ())['id'],
                        metric_descriptions=self.performance_metrics,
                        template=self.template,
                        problem=self.problem,
                        extra_primitive=self.extra_primitive,
                        random_seed=self.random_seed)
                    # retrain and compute ranking/metric using self.train_dataset
                    # fitted_pipeline2.fit(inputs = [self.train_dataset1])
                    fitted_pipeline2.fit(cache=cache,
                                         inputs=[self.train_dataset1])
                    fitted_pipeline2.produce(inputs=[self.test_dataset1])
                    test_prediction = fitted_pipeline2.get_produce_step_output(
                        self.template.get_output_step_number())

                    # test_ground_truth = get_target_columns(self.test_dataset1)
                    # test_metrics2 = calculate_score(test_ground_truth, test_prediction,
                    #     self.performance_metrics, self.task_type, SpecialMetric().regression_metric)
                    test_metrics2 = score_prediction(test_prediction,
                                                     [self.test_dataset1],
                                                     self.problem,
                                                     self.performance_metrics,
                                                     self.random_seed)
            # update:
            # newer versions of the d3m runtime do not allow calling ".fit()" a second time
            # on a given runtime object, so we need to create a new FittedPipeline object to
            # run runtime.fit() again

            fitted_pipeline_final = FittedPipeline(
                pipeline=pipeline,
                dataset_id=self.all_dataset.metadata.query(())['id'],
                metric_descriptions=self.performance_metrics,
                template=self.template,
                problem=self.problem,
                extra_primitive=self.extra_primitive,
                random_seed=self.random_seed)
            # set the metric for calculating the rank
            fitted_pipeline_final.set_metric(test_metrics2[0])
            # end update v2019.3.17

            # finally, fit the model with all data and save it
            _logger.info(
                "[INFO] Now training the pipeline with the full dataset and saving it."
            )
            fitted_pipeline_final.fit(cache=cache, inputs=[self.all_dataset])

            if self.ensemble_tuning_dataset:
                fitted_pipeline_final.produce(
                    inputs=[self.ensemble_tuning_dataset])
                ensemble_tuning_result = fitted_pipeline_final.get_produce_step_output(
                    self.template.get_output_step_number())

                # ensemble_tuning_result_ground_truth = get_target_columns(self.ensemble_tuning_dataset)
                # ensemble_tuning_metrics = calculate_score(ensemble_tuning_result_ground_truth, ensemble_tuning_result,
                #                                           self.performance_metrics, self.task_type, SpecialMetric().regression_metric)
                ensemble_tuning_metrics = score_prediction(
                    ensemble_tuning_result, [self.ensemble_tuning_dataset],
                    self.problem, self.performance_metrics, self.random_seed)

            cv = fitted_pipeline_final.get_cross_validation_metrics()
            if not cv:
                # CandidateCache asserts cv must be a list
                cv = []
            data = {
                # 2019-7-10: return pipeline.id as id to make debugging easier
                'id': fitted_pipeline_final.pipeline.id,
                'fid': fitted_pipeline_final.id,
                'fitted_pipeline': fitted_pipeline_final,
                'training_metrics': training_metrics,
                'cross_validation_metrics': cv,
                'test_metrics': test_metrics2,
                'total_runtime': time.time() - start_time,
                'configuration': configuration,
                'ensemble_tuning_result': ensemble_tuning_result,
                'ensemble_tuning_metrics': ensemble_tuning_metrics,
            }
            fitted_pipeline.auxiliary = dict(data)

            # Save fitted pipeline
            pickled = False
            if self.output_directory is not None and dump2disk:
                try:
                    fitted_pipeline_final.save(self.output_directory)
                    pickled = True
                except Exception as e:
                    _logger.warning(
                        f'SKIPPING Pickle test. Saving pipeline failed: {e}'
                    )

            # Pickle test
            if pickled and self.output_directory is not None and dump2disk:
                try:
                    # remove the augmented columns in self.test_dataset1 to ensure we can pass the pickling test
                    res_id, test_dataset1_df = d3m_utils.get_tabular_resource(
                        dataset=self.test_dataset1, resource_id=None)

                    original_columns = []
                    remained_columns_number = 0
                    for i in range(test_dataset1_df.shape[1]):
                        current_selector = (res_id, ALL_ELEMENTS, i)
                        meta = self.test_dataset1.metadata.query(
                            current_selector)

                        if AUGMENTED_COLUMN_SEMANTIC_TYPE in meta[
                                'semantic_types'] or Q_NODE_SEMANTIC_TYPE in meta[
                                    'semantic_types']:
                            self.test_dataset1.metadata = self.test_dataset1.metadata.remove(
                                selector=current_selector)
                        else:
                            original_columns.append(i)
                            if remained_columns_number != i:
                                self.test_dataset1.metadata = self.test_dataset1.metadata.remove(
                                    selector=current_selector)
                                updated_selector = (res_id, ALL_ELEMENTS,
                                                    remained_columns_number)
                                self.test_dataset1.metadata = self.test_dataset1.metadata.update(
                                    selector=updated_selector, metadata=meta)
                            remained_columns_number += 1

                    self.test_dataset1[res_id] = self.test_dataset1[
                        res_id].iloc[:, original_columns]
                    meta = dict(
                        self.test_dataset1.metadata.query(
                            (res_id, ALL_ELEMENTS)))
                    dimension = dict(meta['dimension'])
                    dimension['length'] = remained_columns_number
                    meta['dimension'] = frozendict.FrozenOrderedDict(dimension)
                    self.test_dataset1.metadata = self.test_dataset1.metadata.update(
                        (res_id, ALL_ELEMENTS),
                        frozendict.FrozenOrderedDict(meta))
                    # end removing augmented columns

                    _ = fitted_pipeline_final.produce(
                        inputs=[self.test_dataset1])
                    test_prediction3 = fitted_pipeline_final.get_produce_step_output(
                        self.template.get_output_step_number())

                    # test_ground_truth_for_test_pickle = get_target_columns(self.test_dataset1)
                    # test_metrics3 = calculate_score(test_ground_truth_for_test_pickle, test_prediction3,
                    #     self.performance_metrics, self.task_type, SpecialMetric().regression_metric)
                    test_metrics3 = score_prediction(test_prediction3,
                                                     [self.test_dataset1],
                                                     self.problem,
                                                     self.performance_metrics,
                                                     self.random_seed)

                    _logger.info("Test pickled pipeline. id: {}".format(
                        fitted_pipeline_final.id))
                    self.test_pickled_pipeline(
                        folder_loc=self.output_directory,
                        pipeline_id=fitted_pipeline_final.id,
                        test_dataset=self.test_dataset1,
                        test_metrics=test_metrics3
                        # test_ground_truth=test_ground_truth_for_test_pickle
                    )
                except Exception as e:
                    _logger.exception('Pickle test Failed', exc_info=True)

        # still return the original fitted_pipeline with relation to train_dataset1
        return data
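
conclude_k_fold_metrics is referenced above but not defined in this snippet. A hedged sketch of a reducer matching the sample output format noted earlier, averaging each metric's value across folds (its actual contract is an assumption):

import copy

def conclude_k_fold_metrics(metrics_per_fold):
    # metrics_per_fold: one list of metric dicts per fold,
    # e.g. [[{'metric': 'f1Macro', 'value': 0.48}, ...], ...]
    concluded = copy.deepcopy(metrics_per_fold[0])
    for i, metric in enumerate(concluded):
        fold_values = [fold[i]['value'] for fold in metrics_per_fold]
        metric['values'] = fold_values
        metric['value'] = sum(fold_values) / len(fold_values)
    return concluded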
Example 12
    def set_training_data(self, *, dataset: container.Dataset) -> None:  # type: ignore
        main_resource_id, main_resource = base_utils.get_tabular_resource(dataset, None, has_hyperparameter=False)

        self._main_resource_id = main_resource_id
        self._dataset = dataset
        self._fitted = False
Example 13
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:

        dataframe_resource_id, dataframe = base_utils.get_tabular_resource(
            inputs,
            self.hyperparams["dataframe_resource"])  # get attribute columns

        hyperparams_class = (
            dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.query()
            ["primitive_code"]["class_type_arguments"]["Hyperparams"])
        primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(
            hyperparams=hyperparams_class.defaults())

        dataframe_meta = primitive.produce(inputs=inputs).value

        attributes = list_columns_with_semantic_types(
            metadata=dataframe_meta.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/Attribute"
            ],
        )

        base_file_path = "/".join(
            inputs.metadata._current_metadata.metadata["location_uris"]
            [0].split("/")[:-1])
        edge_list = pd.read_csv(os.path.join(base_file_path, "graphs",
                                             "edgeList.csv"),
                                index_col=0)
        if len(edge_list.columns) > 2:
            graph = nx.from_pandas_edgelist(
                edge_list,
                source=edge_list.columns[0],
                target=edge_list.columns[1],
                edge_attr=edge_list.columns[2],
            )
        else:
            graph = nx.from_pandas_edgelist(edge_list,
                                            source=edge_list.columns[0],
                                            target=edge_list.columns[1])

        if len(attributes) > 1:
            # add attributes to nodes
            attribute_node_map = dataframe_meta[
                dataframe_meta.columns[attributes]]
            attribute_node_map["nodeID"] = attribute_node_map["nodeID"].astype(
                int)
            attribute_node_map.index = attribute_node_map["nodeID"]
            attribute_cols = attribute_node_map.columns
            attribute_node_map.drop(["nodeID"], axis=1)
            attribute_node_map = attribute_node_map.to_dict(orient="index")

            for i in graph.nodes:
                default = {attribute: 0 for attribute in attribute_cols}
                default["nodeID"] = i
                graph.nodes[i].update(attribute_node_map.get(i, default))

        else:
            # featurizer expects at a minimum nodeids to be present
            for i in graph.nodes:
                default = {}
                default["nodeID"] = i
                graph.nodes[i].update(default)
        # int2str_map = dict(zip(graph.nodes, [str(n) for n in graph.nodes]))
        # graph = nx.relabel_nodes(graph, mapping=int2str_map)

        dataframe.metadata = self._update_metadata(inputs.metadata,
                                                   dataframe_resource_id)

        assert isinstance(dataframe, container.DataFrame), type(dataframe)

        U_train = {"graph": graph}
        y_train = self.produce_target(inputs=inputs).value
        X_train = dataframe  # TODO use attribute in vertex classification

        X_train = self._typify_dataframe(X_train)
        X_train.value = pd.DataFrame(X_train.value["nodeID"])
        return base.CallResult([X_train, y_train, U_train])
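
The per-node attribute loop above can also be written with networkx's bulk setter. A minimal equivalent sketch for the nodes present in the map (defaults still handled separately):

import networkx as nx

# attribute_node_map: {node_id: {attr_name: value, ...}, ...}
nx.set_node_attributes(graph, attribute_node_map)

# nodes missing from the map still need at least a nodeID attribute
for n in graph.nodes:
    graph.nodes[n].setdefault("nodeID", n)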
Example 14
    def _get_truth(self, score_dataset: container.Dataset) -> typing.Tuple[pandas.DataFrame, typing.Dict[str, typing.Any]]:
        """
        Extracts true targets from the Dataset's entry point, or the only tabular resource.
        It requires that there is only one primary index column, which it makes the first
        column, named ``d3mIndex``. Then true target columns follow.

        We return a regular Pandas DataFrame with column names matching those in the metadata,
        and a dict mapping target columns to all label values in those columns, if available in metadata.
        We convert all columns to strings to match what would be loaded from a ``predictions.csv``
        file, and encode any float vectors as strings.
        """

        main_resource_id, main_resource = base_utils.get_tabular_resource(score_dataset, None, has_hyperparameter=False)

        # We first copy before modifying in-place.
        main_resource = container.DataFrame(main_resource, copy=True)
        main_resource = self._encode_columns(main_resource)

        dataframe = self._to_dataframe(main_resource)

        indices = list(score_dataset.metadata.get_index_columns(at=(main_resource_id,)))
        targets = list(score_dataset.metadata.list_columns_with_semantic_types(
            ['https://metadata.datadrivendiscovery.org/types/TrueTarget'],
            at=(main_resource_id,),
        ))

        if not indices:
            raise exceptions.InvalidArgumentValueError("No primary index column.")
        elif len(indices) > 1:
            raise exceptions.InvalidArgumentValueError("More than one primary index column.")
        if not targets:
            raise ValueError("No true target columns.")

        dataframe = dataframe.iloc[:, indices + targets]

        dataframe = dataframe.rename(columns={dataframe.columns[0]: metrics.INDEX_COLUMN})

        if metrics.SCORE_COLUMN in dataframe.columns[1:]:
            raise ValueError("True target column cannot be named \"confidence\". It is a reserved name.")
        if metrics.RANK_COLUMN in dataframe.columns[1:]:
            raise ValueError("True target column cannot be named \"rank\". It is a reserved name.")
        if metrics.INDEX_COLUMN in dataframe.columns[1:]:
            raise ValueError("True target column cannot be named \"d3mIndex\". It is a reserved name.")

        if d3m_utils.has_duplicates(dataframe.columns):
            duplicate_names = list(dataframe.columns)
            for name in set(dataframe.columns):
                duplicate_names.remove(name)
            raise exceptions.InvalidArgumentValueError(
                "True target columns have duplicate names: {duplicate_names}".format(
                    duplicate_names=sorted(set(duplicate_names)),
                ),
            )

        all_labels = {}

        for target_column_name, main_resource_column_index in zip(dataframe.columns[1:], targets):
            try:
                column_labels = score_dataset.metadata.query_column_field(main_resource_column_index, 'all_distinct_values', at=(main_resource_id,))
            except KeyError:
                continue

            all_labels[target_column_name] = [str(label) for label in column_labels]

        return dataframe, all_labels
Example 15
def download():
    try:
        logger.debug("Start datamart downloading...")
        search_result = read_file(request.files, 'task', 'json')
        # if the json was not sent via a file
        if not search_result and request.form.get('task'):
            search_result = json.loads(request.form.get('task'))
        if search_result is None:
            return wrap_response(
                code='1000',
                msg=
                'FAIL SEARCH - Unable to get search result or input is a bad format!',
                data=None)

        # if data is CSV content
        data = read_file(request.files, 'data', 'csv')
        # if data is not CSV content but a path string
        if data is not None:
            loaded_dataset = load_csv_data(data)
        elif request.values.get('data'):
            path = request.values.get('data')
            if path.lower().endswith("csv"):
                loaded_dataset = load_csv_data(path)
            else:
                loaded_dataset = load_d3m_dataset(path)
        else:
            loaded_dataset = None

        return_format = request.values.get('format')
        if not return_format or return_format.lower() == "csv":
            return_format = "csv"
        elif return_format.lower() == "d3m":
            return_format = "d3m"
        else:
            return wrap_response(code='1000',
                                 msg='FAIL SEARCH - Unknown return format: ' +
                                 str(return_format),
                                 data=None)

        # search without supplied data: not implemented yet
        # TODO: implement this part!
        if loaded_dataset is None:
            return wrap_response(
                code='1000',
                msg='FAIL SEARCH - Unable to load input supplied data',
                data=None)
        # search with supplied data
        else:
            # preprocess on loaded_dataset
            logger.debug("Start running wikifier...")
            search_result_wikifier = DatamartSearchResult(
                search_result={},
                supplied_data=None,
                query_json={},
                search_type="wikifier")
            logger.debug("Wikifier finished, start running download...")
            loaded_dataset = search_result_wikifier.augment(
                supplied_data=loaded_dataset)
            search_result = DatamartSearchResult.deserialize(
                search_result['materialize_info'])
            download_result = search_result.download(
                supplied_data=loaded_dataset)
            logger.debug("Download finished.")
            res_id, result_df = d3m_utils.get_tabular_resource(
                dataset=download_result, resource_id=None)

            # print("--------------")
            # print(loaded_dataset['learningData'])
            # print("--------------")
            # print(result_df)
            # print("--------------")
            # sys.stdout.flush()

            non_empty_rows = []
            for i, v in result_df.iterrows():
                if len(v["joining_pairs"]) != 0:
                    non_empty_rows.append(i)

            if len(non_empty_rows) == 0:
                return wrap_response(
                    code='1000',
                    msg='FAIL DOWNLOAD - No joinable rows found!',
                    data=None)
            logger.debug("Start saving the download results...")
            result_df = result_df.iloc[non_empty_rows, :]
            result_df = result_df.reset_index(drop=True)
            # set all cells to be str so that we can save correctly
            download_result[res_id] = result_df.astype(str)
            # update structural type
            update_part = {"structural_type": str}
            for i in range(result_df.shape[1]):
                download_result.metadata = download_result.metadata.update(
                    metadata=update_part, selector=(res_id, ALL_ELEMENTS, i))

            # update row length
            update_part = {"length": result_df.shape[0]}
            download_result.metadata = download_result.metadata.update(
                metadata=update_part, selector=(res_id, ))

            result_id = str(hash(result_df.values.tobytes()))
            # save_dir = "/tmp/download_result" + result_id
            # if os.path.isdir(save_dir) or os.path.exists(save_dir):
            #     shutil.rmtree(save_dir)
            if return_format == "d3m":
                # save dataset
                with tempfile.TemporaryDirectory() as tmpdir:
                    absolute_path_part_length = len(str(tmpdir))
                    save_dir = os.path.join(str(tmpdir), result_id)
                    download_result.save("file://" + save_dir +
                                         "/datasetDoc.json")
                    # zip and send to client
                    base_path = pathlib.Path(save_dir + '/')
                    data = io.BytesIO()
                    filePaths = retrieve_file_paths(save_dir)

                    zip_file = zipfile.ZipFile(data, 'w')
                    with zip_file:
                        # write each file separately
                        for fileName in filePaths:
                            shorter_path = fileName[absolute_path_part_length:]
                            zip_file.write(fileName, shorter_path)
                    data.seek(0)

                    return send_file(data,
                                     mimetype='application/zip',
                                     as_attachment=True,
                                     attachment_filename='download_result' +
                                     result_id + '.zip')

            else:
                data = io.StringIO()
                result_df.to_csv(data, index=False)
                return Response(data.getvalue(), mimetype="text/csv")

    except Exception as e:
        return wrap_response(code='1000',
                             msg="FAIL SEARCH - %s \n %s" %
                             (str(e), str(traceback.format_exc())))
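
retrieve_file_paths is used above but not shown. A likely implementation that walks the saved dataset directory recursively (an assumption, not the original helper):

import os

def retrieve_file_paths(dir_name):
    # collect the absolute path of every file under dir_name, recursively
    file_paths = []
    for root, _dirs, files in os.walk(dir_name):
        for filename in files:
            file_paths.append(os.path.join(root, filename))
    return file_paths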
Example 16
def get_resource(inputs, resource_name):
    _id, _df = base_utils.get_tabular_resource(inputs, resource_name)
    _df.metadata = _update_metadata(inputs.metadata, _id)
    return _id, _df
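
A usage sketch for this helper (the dataset path is hypothetical; "learningData" is the usual entry-point resource id):

from d3m import container

dataset = container.Dataset.load("file:///path/to/datasetDoc.json")
resource_id, df = get_resource(dataset, "learningData")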
Example 17
    def produce(
            self,
            *,
            left: Inputs,  # type: ignore
            right: Inputs,  # type: ignore
            timeout: float = None,
            iterations: int = None) -> base.CallResult[Outputs]:

        # attempt to extract the main table
        try:
            left_resource_id, left_df = d3m_base_utils.get_tabular_resource(
                left, None)
        except ValueError as error:
            raise exceptions.InvalidArgumentValueError(
                "Failure to find tabular resource in left dataset") from error

        try:
            right_resource_id, right_df = d3m_base_utils.get_tabular_resource(
                right, None)
        except ValueError as error:
            raise exceptions.InvalidArgumentValueError(
                "Failure to find tabular resource in right dataset") from error

        accuracy = self.hyperparams['accuracy']
        if accuracy <= 0.0 or accuracy > 1.0:
            raise exceptions.InvalidArgumentValueError('accuracy of ' +
                                                       str(accuracy) +
                                                       ' is out of range')

        left_col = self.hyperparams['left_col']
        right_col = self.hyperparams['right_col']

        # perform join based on semantic type
        join_type = self._get_join_semantic_type(left, left_resource_id,
                                                 left_col, right,
                                                 right_resource_id, right_col)
        joined: pd.DataFrame = None
        if join_type in self._STRING_JOIN_TYPES:
            joined = self._join_string_col(left_df, left_col, right_df,
                                           right_col, accuracy)
        elif join_type in self._NUMERIC_JOIN_TYPES:
            joined = self._join_numeric_col(left_df, left_col, right_df,
                                            right_col, accuracy)
        elif join_type in self._DATETIME_JOIN_TYPES:
            joined = self._join_datetime_col(left_df, left_col, right_df,
                                             right_col, accuracy)
        else:
            raise exceptions.InvalidArgumentValueError(
                'join not supported on type ' + str(join_type))

        # create a new dataset to hold the joined data
        resource_map = {}
        for resource_id, resource in left.items():  # type: ignore
            if resource_id == left_resource_id:
                resource_map[resource_id] = joined
            else:
                resource_map[resource_id] = resource
        result_dataset = container.Dataset(resource_map,
                                           generate_metadata=True)

        return base.CallResult(result_dataset)
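
One way to realize the numeric fuzzy join that _join_numeric_col suggests is pandas.merge_asof with a tolerance. A standalone sketch on hypothetical data (not necessarily the primitive's actual implementation):

import pandas as pd

left = pd.DataFrame({"key": [1.0, 2.0, 3.05]}).sort_values("key")
right = pd.DataFrame({"key": [1.01, 3.0], "val": ["a", "b"]}).sort_values("key")

# nearest-match join, accepting keys within +/- 0.1 of each other
joined = pd.merge_asof(left, right, on="key",
                       direction="nearest", tolerance=0.1)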
Example 18
def augment():
    try:
        logger.debug("Start running augment...")
        search_result = read_file(request.files, 'task', 'json')
        # if the json was not sent via a file
        if not search_result and request.form.get('task'):
            search_result = json.loads(request.form.get('task'))
        if search_result is None:
            return wrap_response(
                code='1000',
                msg='FAIL SEARCH - Unable to get search result',
                data=None)

        # if data is CSV content
        data = read_file(request.files, 'data', 'csv')
        # if data is not CSV content but a path string
        if data is not None:
            loaded_dataset = load_csv_data(data)
        elif request.values.get('data'):
            path = request.values.get('data')
            if path.lower().endswith("csv"):
                loaded_dataset = load_csv_data(path)
            else:
                loaded_dataset = load_d3m_dataset(data)
        else:
            loaded_dataset = None

        return_format = request.values.get('format')
        if not return_format or return_format.lower() == "csv":
            return_format = "csv"
        elif return_format.lower() == "d3m":
            return_format = "d3m"
        else:
            return wrap_response(code='1000',
                                 msg='FAIL SEARCH - Unknown return format: ' +
                                 str(return_format),
                                 data=None)

        # search without supplied data, not implemented yet
        # TODO: implement this part!
        if loaded_dataset is None:
            return wrap_response(
                code='1000',
                msg='FAIL SEARCH - Unable to load input supplied data',
                data=None)
        # search with supplied data
        else:
            columns = request.values.get('columns')
            if columns and type(columns) is not list:
                columns = columns.split(", ")
                logger.info("Required columns found as: " + str(columns))
            columns_formatted = []
            if columns:
                for each in columns:
                    columns_formatted.append(
                        DatasetColumn(resource_id=AUGMENT_RESOURCE_ID,
                                      column_index=int(each)))
            logger.debug("Start running wikifier...")
            # preprocess on loaded_dataset
            search_result_wikifier = DatamartSearchResult(
                search_result={},
                supplied_data=None,
                query_json={},
                search_type="wikifier")
            loaded_dataset = search_result_wikifier.augment(
                supplied_data=loaded_dataset)
            logger.debug("Wikifier running finished, start running augment...")
            search_result = DatamartSearchResult.deserialize(
                search_result['materialize_info'])
            augment_result = search_result.augment(
                supplied_data=loaded_dataset, augment_columns=columns_formatted)
            res_id, result_df = d3m_utils.get_tabular_resource(
                dataset=augment_result, resource_id=None)
            augment_result[res_id] = result_df.astype(str)

            # update structural type
            update_part = {"structural_type": str}
            for i in range(result_df.shape[1]):
                augment_result.metadata = augment_result.metadata.update(
                    metadata=update_part, selector=(res_id, ALL_ELEMENTS, i))

            result_id = str(hash(result_df.values.tobytes()))
            # if required to store in disk and return the path
            if request.values.get('destination'):
                logger.info("Saving to a given destination required.")
                save_dir = os.path.join(request.values.get('destination'),
                                        "augment_result" + result_id)
                if os.path.exists(save_dir):
                    shutil.rmtree(save_dir)
                # save dataset
                augment_result.save("file://" + save_dir + "/datasetDoc.json")
                # zip and send to client
                data = io.BytesIO()
                filePaths = retrieve_file_paths(save_dir)
                # open the archive once, then write each file separately
                with zipfile.ZipFile(data, 'w') as zip_file:
                    for file in filePaths:
                        zip_file.write(file)
                data.seek(0)

                return wrap_response(code='0000', msg='Success', data=save_dir)
            else:
                # save dataset in temp directory
                logger.info("Return the augment result directly required.")
                with tempfile.TemporaryDirectory() as tmpdir:
                    absolute_path_part_length = len(str(tmpdir))
                    save_dir = os.path.join(str(tmpdir), result_id)
                    augment_result.save("file://" + save_dir +
                                        "/datasetDoc.json")
                    # zip and send to client
                    data = io.BytesIO()
                    filePaths = retrieve_file_paths(save_dir)

                    # write each file separately, with paths relative to tmpdir
                    with zipfile.ZipFile(data, 'w') as zip_file:
                        for fileName in filePaths:
                            shorter_path = fileName[absolute_path_part_length:]
                            zip_file.write(fileName, shorter_path)
                    data.seek(0)

                    return send_file(data,
                                     mimetype='application/zip',
                                     as_attachment=True,
                                     attachment_filename='download_result' +
                                     result_id + '.zip')

    except Exception as e:
        return wrap_response(code='1000',
                             msg="FAIL SEARCH - %s \n %s" %
                             (str(e), str(traceback.format_exc())))
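
A hedged client-side sketch of calling this endpoint. The host, port, and route are assumptions; only the multipart field names ('task', 'data') and form values ('columns', 'format') are taken from the keys the handler reads above.

import requests

# hypothetical endpoint URL; field names mirror the handler
with open("search_result.json", "rb") as task_fp, \
        open("supplied_data.csv", "rb") as data_fp:
    resp = requests.post(
        "http://localhost:9000/augment",
        files={"task": task_fp, "data": data_fp},
        data={"columns": "1, 2", "format": "csv"},
    )
print(resp.status_code)

Example 19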
    def produce(self,
                *,
                inputs: container.Dataset,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[container.Dataset]:

        # if this is a single resource dataset we don't need to reformat it
        if len(inputs) < 2:
            return base.CallResult(inputs)

        # find the main resource if supplied, infer if not
        main_resource_id, main_resource = base_utils.get_tabular_resource(
            inputs, self.hyperparams["main_resource_id"])
        if main_resource_id is None:
            raise exceptions.InvalidArgumentValueError(
                "no main resource specified")

        # find the csv file column resource if supplied, infer if not
        file_index = self.hyperparams["file_col_index"]
        if file_index is not None:
            if not self._is_csv_file_column(inputs.metadata, main_resource_id,
                                            file_index):
                raise exceptions.InvalidArgumentValueError(
                    "column idx=" + str(file_index) +
                    " from does not contain csv file names")
        else:
            file_index = self._find_csv_file_column(inputs.metadata,
                                                    main_resource_id)
            if file_index is None:
                raise exceptions.InvalidArgumentValueError(
                    "no column from contains csv file names")

        # generate the long form timeseries data
        base_path = self._get_base_path(inputs.metadata, main_resource_id,
                                        file_index)
        csv_paths = [
            os.path.join(base_path, local_path)
            for local_path in inputs[main_resource_id].iloc[:, file_index]
        ]
        new_dfs = [pd.read_csv(path) for path in csv_paths]
        original_dfs = [
            pd.DataFrame(
                np.tile(row, (df.shape[0], 1)),
                columns=inputs[main_resource_id].columns,
                index=df.index,
            ) for row, df in zip(inputs[main_resource_id].values, new_dfs)
        ]
        combined_dfs = [
            original_df.join(new_df)
            for original_df, new_df in zip(original_dfs, new_dfs)
        ]
        output_data = pd.concat(combined_dfs)
        timeseries_dataframe = container.DataFrame(output_data)
        timeseries_dataframe.reset_index(drop=True, inplace=True)

        # make sure that all timeseries have the same length; most downstream tasks will appreciate this
        if self.hyperparams["equal_length"]:
            min_length = (timeseries_dataframe.groupby(
                timeseries_dataframe.columns[file_index]).count().min().
                          values[0])
            group_count = timeseries_dataframe.groupby(
                timeseries_dataframe.columns[file_index]).cumcount()
            timeseries_dataframe = timeseries_dataframe.assign(
                group_count=group_count)
            timeseries_dataframe = timeseries_dataframe[
                timeseries_dataframe["group_count"] < min_length]
            timeseries_dataframe = timeseries_dataframe.drop(["group_count"],
                                                             axis=1)

        # create a dataset to hold the result
        timeseries_dataset = container.Dataset(
            {self._resource_id: timeseries_dataframe}, generate_metadata=True)
        timeseries_dataset.metadata = timeseries_dataset.metadata.update(
            (), {"id": inputs.metadata.query(())["id"]})
        timeseries_dataset.metadata = timeseries_dataset.metadata.update(
            (), {"digest": inputs.metadata.query(())["digest"]})

        # copy main resource column metadata to timeseries dataframe
        num_main_resource_cols = inputs.metadata.query(
            (main_resource_id,
             metadata_base.ALL_ELEMENTS))["dimension"]["length"]
        for i in range(num_main_resource_cols):
            source = inputs.metadata.query(
                (main_resource_id, metadata_base.ALL_ELEMENTS, i))
            timeseries_dataset.metadata = timeseries_dataset.metadata.update_column(
                i, source, at=(self._resource_id, ))

        # remove the foreign key entry from the filename column if it exists
        metadata = dict(
            timeseries_dataset.metadata.query(
                (self._resource_id, metadata_base.ALL_ELEMENTS, file_index)))
        metadata["foreign_key"] = metadata_base.NO_VALUE
        timeseries_dataset.metadata = timeseries_dataset.metadata.update(
            (self._resource_id, metadata_base.ALL_ELEMENTS, file_index),
            metadata)

        # copy timeseries column metadata to the timeseries if it's available in the metadata (which is not necessarily true anymore)
        source = self._find_timeseries_metadata(inputs)
        i = 0
        start_idx = 0
        if source is not None:
            for col_info in source["file_columns"]:
                timeseries_dataset.metadata = timeseries_dataset.metadata.update_column(
                    i + num_main_resource_cols,
                    col_info,
                    at=(self._resource_id, ))
                i += 1
            # flag all other columns as attributes
            start_idx = i + num_main_resource_cols
        else:
            # loop over the appended time series columns
            start_idx = original_dfs[0].shape[1]

        for i in range(start_idx, timeseries_dataframe.shape[1]):
            timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
                (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                "https://metadata.datadrivendiscovery.org/types/Attribute",
            )
            struct_type = timeseries_dataset.metadata.query(
                (self._resource_id, metadata_base.ALL_ELEMENTS,
                 i))["structural_type"]
            if struct_type == np.float64:
                timeseries_dataset.metadata = (
                    timeseries_dataset.metadata.add_semantic_type(
                        (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                        "http://schema.org/Float",
                    ))
            elif struct_type == np.int64:
                timeseries_dataset.metadata = (
                    timeseries_dataset.metadata.add_semantic_type(
                        (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                        "http://schema.org/Integer",
                    ))
            else:
                timeseries_dataset.metadata = (
                    timeseries_dataset.metadata.add_semantic_type(
                        (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                        "http://schema.org/Text",
                    ))

        # mark the filename column as a grouping key
        timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
            (self._resource_id, metadata_base.ALL_ELEMENTS, file_index),
            "https://metadata.datadrivendiscovery.org/types/GroupingKey",
        )

        # mark the d3mIndex as a primary multi-key since there are now multiple instances of the value present
        primary_index_col = (
            timeseries_dataset.metadata.list_columns_with_semantic_types(
                ("https://metadata.datadrivendiscovery.org/types/PrimaryKey",
                 ),
                at=(self._resource_id, ),
            ))
        timeseries_dataset.metadata = timeseries_dataset.metadata.remove_semantic_type(
            (self._resource_id, metadata_base.ALL_ELEMENTS,
             primary_index_col[0]),
            "https://metadata.datadrivendiscovery.org/types/PrimaryKey",
        )
        timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
            (self._resource_id, metadata_base.ALL_ELEMENTS,
             primary_index_col[0]),
            "https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey",
        )

        return base.CallResult(timeseries_dataset)
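
The core reshaping step above tiles each main-table row once per timestep of its referenced CSV, then joins the two side by side. A standalone toy sketch of that pattern, with made-up column names:

import numpy as np
import pandas as pd

main = pd.DataFrame({"d3mIndex": [0, 1], "file": ["a.csv", "b.csv"]})
# stand-ins for the pd.read_csv(path) results
series = [pd.DataFrame({"value": [1.0, 2.0]}), pd.DataFrame({"value": [3.0]})]

long_form = pd.concat(
    [
        pd.DataFrame(np.tile(row, (df.shape[0], 1)),
                     columns=main.columns, index=df.index).join(df)
        for row, df in zip(main.values, series)
    ],
    ignore_index=True,
)
# each d3mIndex/file pair is now repeated once per timestep, next to "value"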
Example 20
    def produce(
        self,
        *,
        left: Inputs,  # type: ignore
        right: Inputs,  # type: ignore
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[Outputs]:

        # attempt to extract the main table
        try:
            left_resource_id, left_df = d3m_base_utils.get_tabular_resource(left, None)
        except ValueError as error:
            raise exceptions.InvalidArgumentValueError(
                "Failure to find tabular resource in left dataset"
            ) from error

        try:
            right_resource_id, right_df = d3m_base_utils.get_tabular_resource(
                right, None
            )
        except ValueError as error:
            raise exceptions.InvalidArgumentValueError(
                "Failure to find tabular resource in right dataset"
            ) from error

        accuracy = self.hyperparams["accuracy"]
        absolute_accuracy = self.hyperparams["absolute_accuracy"]

        # hyperparams may be parsed as tuples
        # floats could be integers if a round number is passed in
        # (collections.abc.Iterable requires "import collections.abc" at module level)
        if isinstance(accuracy, collections.abc.Iterable):
            accuracy = [float(a) for a in accuracy]
        else:
            accuracy = float(accuracy)
        if isinstance(absolute_accuracy, collections.abc.Iterable):
            absolute_accuracy = list(absolute_accuracy)

        if type(accuracy) == float and not type(absolute_accuracy) == bool:
            raise exceptions.InvalidArgumentValueError(
                "only 1 value of accuracy provided, but multiple values for absolute accuracy provided"
            )
        if (not type(accuracy) == float) and type(absolute_accuracy) == bool:
            raise exceptions.InvalidArgumentValueError(
                "only 1 for absolute accuracy provided, but multiple values of accuracy provided"
            )
        if type(accuracy) == float and not absolute_accuracy:
            if accuracy <= 0.0 or accuracy > 1.0:
                raise exceptions.InvalidArgumentValueError(
                    "accuracy of " + str(accuracy) + " is out of range"
                )
        elif type(accuracy) == list and type(absolute_accuracy) == list:
            if not len(accuracy) == len(absolute_accuracy):
                raise exceptions.InvalidArgumentValueError(
                    "the count of accuracy hyperparams does not match the count of absolute_accuracy hyperparams"
                )
            for i in range(len(accuracy)):
                if (accuracy[i] <= 0.0 or accuracy[i] > 1.0) and not absolute_accuracy[i]:
                    raise exceptions.InvalidArgumentValueError(
                        "accuracy of " + str(acc) + " is out of range"
                    )

        left_col = self.hyperparams["left_col"]
        right_col = self.hyperparams["right_col"]

        if type(left_col) != type(right_col) or (
            type(left_col) == list
            and (len(left_col) != len(right_col)
                 or (type(accuracy) == list and len(accuracy) != len(left_col)))
        ):
            raise exceptions.InvalidArgumentTypeError(
                "both left_col and right_col need to have the same data type and, "
                "if they are lists, the same length (matching any accuracy list)"
            )
        if type(left_col) == str:
            left_col = [left_col]
            right_col = [right_col]
            accuracy = [accuracy]
            absolute_accuracy = [absolute_accuracy]

        join_types = [
            self._get_join_semantic_type(
                left,
                left_resource_id,
                left_col[i],
                right,
                right_resource_id,
                right_col[i],
            )
            for i in range(len(left_col))
        ]

        num_splits = 32
        joined_split = [None for i in range(num_splits)]
        left_df_split = np.array_split(left_df, num_splits)
        jobs = [delayed(self._produce_threaded)(
            index=i,
            left_df_full=left_df,
            left_dfs=left_df_split,
            right_df=right_df,
            join_types=join_types,
            left_col=left_col,
            right_col=right_col,
            accuracy=accuracy,
            absolute_accuracy=absolute_accuracy,
        ) for i in range(num_splits)]
        joined_data = Parallel(n_jobs=self.hyperparams["n_jobs"],
                               backend="loky",
                               verbose=10)(jobs)

        # joined data needs to maintain order to mimic non-split joining
        for i, d in joined_data:
            joined_split[i] = d
        joined = pd.concat(joined_split, ignore_index=True)

        # create a new dataset to hold the joined data
        resource_map = {}
        float_vector_columns = {}
        for resource_id, resource in left.items():  # type: ignore
            if resource_id == left_resource_id:
                for column in joined.columns:
                    # need to avoid bug in container.Dataset, it doesn't like vector columns
                    if type(joined[column].iloc[0]) == np.ndarray:
                        float_vector_columns[column] = joined[column]
                        joined[column] = np.NAN
                resource_map[resource_id] = joined
            else:
                resource_map[resource_id] = resource

        # Generate metadata for the dataset using only the first row of the resource for speed -
        # metadata generation runs over each cell in the dataframe, but we only care about column
        # level generation.  Once that's done, set the actual dataframe value.
        result_dataset = container.Dataset(
            {k: v.head(1) for k, v in resource_map.items()}, generate_metadata=True
        )
        for k, v in resource_map.items():
            result_dataset[k] = v
            result_dataset.metadata = result_dataset.metadata.update(
                (k,), {"dimension": {"length": v.shape[0]}}
            )

        for key in float_vector_columns.keys():
            df = result_dataset[left_resource_id]
            df[key] = float_vector_columns[key]
            float_vec_loc = df.columns.get_loc(key)
            float_vec_col_indices = df.metadata.list_columns_with_semantic_types(
                ("https://metadata.datadrivendiscovery.org/types/FloatVector",)
            )
            if float_vec_loc not in float_vec_col_indices:
                df.metadata = df.metadata.add_semantic_type(
                    (metadata_base.ALL_ELEMENTS, float_vec_loc),
                    "https://metadata.datadrivendiscovery.org/types/FloatVector",
                )

        return base.CallResult(result_dataset)
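
The split-and-reassemble pattern used above (np.array_split, joblib jobs tagged with their index, results restored to order before concat) in isolation, as a toy sketch independent of the primitive:

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

def work(index, chunk):
    # return the index alongside the result so order can be restored,
    # since parallel backends may complete jobs out of order
    return index, chunk.assign(doubled=chunk["x"] * 2)

df = pd.DataFrame({"x": range(10)})
num_splits = 4
chunks = np.array_split(df, num_splits)
results = Parallel(n_jobs=2, backend="loky")(
    delayed(work)(i, chunk) for i, chunk in enumerate(chunks))

ordered = [None] * num_splits
for i, part in results:
    ordered[i] = part
out = pd.concat(ordered, ignore_index=True)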
Example 21
def get_target_columns(dataset: container.Dataset):
    """
    Extracts true targets from the Dataset's entry point, or the only tabular resource.
    It requires that there is only one primary index column, which it makes the
    first column, named ``d3mIndex``. The true target columns follow.

    We return a regular Pandas DataFrame with column names matching those in the metadata.
    We convert all columns to strings to match what would be loaded from a ``predictions.csv`` file.
    It encodes any float vectors as strings.

    From: d3m/contrib/primitives/compute_scores.py:ComputeScoresPrimitive._get_truth
    """

    main_resource_id, main_resource = base_utils.get_tabular_resource(
        dataset, None, has_hyperparameter=False)

    # We first copy before modifying in-place.
    main_resource = container.DataFrame(main_resource, copy=True)
    main_resource = _encode_columns(main_resource)

    dataframe = _to_dataframe(main_resource)

    indices = list(dataset.metadata.get_index_columns(at=(main_resource_id, )))
    targets = list(
        dataset.metadata.list_columns_with_semantic_types(
            ['https://metadata.datadrivendiscovery.org/types/TrueTarget'],
            at=(main_resource_id, ),
        ))

    if not indices:
        raise exceptions.InvalidArgumentValueError("No primary index column.")
    elif len(indices) > 1:
        raise exceptions.InvalidArgumentValueError(
            "More than one primary index column.")
    if not targets:
        raise ValueError("No true target columns.")

    dataframe = dataframe.iloc[:, indices + targets]

    dataframe = dataframe.rename(columns={dataframe.columns[0]: 'd3mIndex'})

    if 'confidence' in dataframe.columns[1:]:
        raise ValueError(
            "True target column cannot be named \"confidence\". It is a reserved name."
        )
    if 'd3mIndex' in dataframe.columns[1:]:
        raise ValueError(
            "True target column cannot be named \"d3mIndex\". It is a reserved name."
        )

    if d3m_utils.has_duplicates(dataframe.columns):
        duplicate_names = list(dataframe.columns)
        for name in set(dataframe.columns):
            duplicate_names.remove(name)
        raise exceptions.InvalidArgumentValueError(
            "True target columns have duplicate names: {duplicate_names}".format(
                duplicate_names=sorted(set(duplicate_names))))

    dataframe = container.DataFrame(dataframe)
    dataframe.metadata = dataframe.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
    dataframe.metadata = dataframe.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0), 'http://schema.org/Integer')
    dataframe.metadata = dataframe.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 1),
        'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    return dataframe
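
A minimal call sketch for the helper above. The dataset path is a placeholder, and the dataset is assumed to carry one PrimaryKey column plus at least one TrueTarget column in its metadata:

from d3m import container

dataset = container.Dataset.load("file:///path/to/datasetDoc.json")  # placeholder path
truth = get_target_columns(dataset)
print(list(truth.columns))  # ['d3mIndex', <target column name(s)>]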