def get_primitive(primitive_path: str) -> typing.Type[base.PrimitiveBase]:
    """
    Loads (if not already) a primitive class and returns it.

    Parameters
    ----------
    primitive_path:
        A Python path under the ``d3m.primitives`` namespace of a primitive.

    Returns
    -------
    A primitive class.
    """

    if not primitive_path:
        raise exceptions.InvalidArgumentValueError("Primitive path is required.")

    if not primitive_path.startswith('d3m.primitives.'):
        raise exceptions.InvalidArgumentValueError("Primitive path does not start with \"d3m.primitives\".")

    path, name = primitive_path.rsplit('.', 1)

    module = importlib.import_module(path)

    return getattr(module, name)
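# A minimal usage sketch, assuming the d3m core package and a primitive
# distribution providing this path are installed; the exact primitive path
# below is illustrative and may not exist in every installation.
from d3m import index

primitive_class = index.get_primitive('d3m.primitives.data_transformation.denormalize.Common')
print(primitive_class.metadata.query()['id'])  # prints the primitive's unique ID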
def _get_predictions(self, inputs: Inputs) -> pandas.DataFrame:
    """
    Predictions must already have the right structure (one ``d3mIndex`` column,
    at most one ``confidence`` column, at most one ``rank`` column, no duplicate
    column names).

    We return a regular Pandas DataFrame with column names matching those in the
    metadata. We convert all columns to strings to match what would be loaded from
    a ``predictions.csv`` file. The predictions DataFrame should already have float
    vectors encoded as strings.
    """

    dataframe = self._to_dataframe(inputs)

    if metrics.INDEX_COLUMN not in dataframe.columns:
        raise exceptions.InvalidArgumentValueError("No primary index column.")

    if d3m_utils.has_duplicates(dataframe.columns):
        duplicate_names = list(dataframe.columns)
        for name in set(dataframe.columns):
            duplicate_names.remove(name)
        raise exceptions.InvalidArgumentValueError(
            "Predicted target columns have duplicate names: {duplicate_names}".format(
                duplicate_names=sorted(set(duplicate_names)),
            ),
        )

    return dataframe
def __init__(self, dimension_values: typing.Dict[DimensionName, typing.List[T]], *,
             dimension_ordering: typing.List[DimensionName] = None,
             value_weights: typing.Dict[DimensionName, typing.List[float]] = None) -> None:
    if dimension_ordering is not None and set(dimension_values.keys()) != set(dimension_ordering):
        raise exceptions.InvalidArgumentValueError(
            'The keys of dimension_values and dimension_ordering must be the same')

    if value_weights is not None:
        if set(dimension_values.keys()) != set(value_weights.keys()):
            raise exceptions.InvalidArgumentValueError(
                'The set of keys of dimension_values and value_weights must be the same')
        for key in dimension_values.keys():
            if len(dimension_values[key]) != len(value_weights[key]):
                raise exceptions.InvalidArgumentValueError(
                    'The length of dimension_values[{}] and value_weights[{}] must be the same'.format(key, key))
    else:
        # Default to uniform weights for every value of every dimension.
        value_weights = {}
        for key in dimension_values.keys():
            value_weights[key] = [1.0] * len(dimension_values[key])

    if dimension_ordering is None:
        dimension_ordering = list(dimension_values.keys())

    self._dimension_values: typing.Dict[DimensionName, typing.List[T]] = dimension_values
    self._value_weights: typing.Dict[DimensionName, typing.List[float]] = value_weights
    self._dimension_ordering = dimension_ordering
def insert_columns(self: D, columns: 'DataFrame', at_column_index: int) -> D:
    """
    Inserts all columns from ``columns`` before the ``at_column_index`` column in this
    DataFrame, pushing all existing columns to the right.

    E.g., ``at_column_index == 0`` means inserting ``columns`` at the beginning of this
    DataFrame.

    Top-level metadata of ``columns`` is ignored.
    """

    columns_length = self.shape[1]

    if at_column_index < 0:
        raise exceptions.InvalidArgumentValueError("\"at_column_index\" is smaller than 0.")
    if at_column_index > columns_length:
        raise exceptions.InvalidArgumentValueError("\"at_column_index\" is larger than the range of existing columns.")

    if at_column_index == 0:
        return columns.append_columns(self, use_right_metadata=True)

    if at_column_index == columns_length:
        return self.append_columns(columns)

    # TODO: This could probably be optimized without all the slicing and joining.

    before = self.select_columns(list(range(0, at_column_index)))
    after = self.select_columns(list(range(at_column_index, columns_length)))

    return before.append_columns(columns).append_columns(after)
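# A minimal usage sketch, assuming the d3m core package; the column names and
# data are illustrative.
from d3m import container

left = container.DataFrame({'a': [1, 2]}, generate_metadata=True)
extra = container.DataFrame({'b': ['x', 'y']}, generate_metadata=True)

# Insert "extra" at index 1, i.e. after existing column "a".
combined = left.insert_columns(extra, 1)
print(list(combined.columns))  # ['a', 'b']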
def __init__(self, other: typing.Dict[str, typing.Any] = None, **values: typing.Any) -> None:
    if other is None:
        other = {}

    values = dict(other, **values)

    params_keys = set(self.__params_items__.keys())  # type: ignore
    values_keys = set(values.keys())

    missing = params_keys - values_keys
    if missing:
        raise exceptions.InvalidArgumentValueError(
            "Not all parameters are specified: {missing}".format(missing=missing))

    extra = values_keys - params_keys
    if extra:
        raise exceptions.InvalidArgumentValueError(
            "Additional parameters are specified: {extra}".format(extra=extra))

    for name, value in values.items():
        value_type = self.__params_items__[name]  # type: ignore
        if not utils.is_instance(value, value_type):
            raise exceptions.InvalidArgumentTypeError(
                "Value '{value}' for parameter '{name}' is not an instance of the type: {value_type}".format(
                    value=value, name=name, value_type=value_type))

    super().__init__(values)
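# A minimal usage sketch, assuming d3m's typed Params container
# (d3m.metadata.params); the subclass name and fields are illustrative.
import typing

from d3m.metadata import params


class MyParams(params.Params):
    coefficients: typing.Optional[list]
    n_iterations: int


# All declared parameters must be provided; extra or missing keys raise.
p = MyParams(coefficients=[0.1, 0.2], n_iterations=10)
print(p['n_iterations'])  # Params behaves like a dict with checked keys.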
def produce(self, *,
            inputs: container.Dataset,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.Dataset]:
    main_resource_index = self.hyperparams['main_resource_index']
    if main_resource_index is None:
        raise exceptions.InvalidArgumentValueError('no main resource specified')

    file_index = self.hyperparams['file_col_index']
    if file_index is not None:
        if not self._is_csv_file_column(inputs.metadata, main_resource_index, file_index):
            raise exceptions.InvalidArgumentValueError(
                'column idx=' + str(file_index) + ' of the main resource does not contain csv file names')
    else:
        file_index = self._find_csv_file_column(inputs.metadata)
        if file_index is None:
            raise exceptions.InvalidArgumentValueError(
                'no column of the main resource contains csv file names')

    # generate the long form timeseries data
    base_path = self._get_base_path(inputs.metadata, main_resource_index, file_index)

    output_data = []
    timeseries_dataframe = pd.DataFrame()
    for idx, tRow in inputs[main_resource_index].iterrows():
        # read the timeseries data
        csv_path = os.path.join(base_path, tRow[file_index])
        timeseries_row = pd.read_csv(csv_path)

        # add the timeseries id
        tRow = tRow.append(pd.Series({'series_id': int(idx)}))

        # combine the timeseries data with the value row
        output_data.extend([
            pd.concat([tRow, vRow])
            for vIdx, vRow in timeseries_row.iterrows()
        ])

    # add the timeseries index
    timeseries_dataframe = timeseries_dataframe.append(output_data, ignore_index=True)

    # join the metadata from the 2 data resources and wrap as a D3M container
    timeseries_dataframe = container.DataFrame(timeseries_dataframe)

    return base.CallResult(
        container.Dataset({'0': timeseries_dataframe}, generate_metadata=True))
def produce(self, *,
            inputs: container.List,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.Dataset]:
    # build the list of dataframes from the list of inputs
    dataframes = []
    metadata = None
    for input in inputs:
        if isinstance(input, container.DataFrame):
            dataframes.append(input)
        else:
            try:
                _, main_dr = d3m_base_utils.get_tabular_resource(input, None)
                dataframes.append(main_dr)
                metadata = input.metadata
            except ValueError as error:
                raise exceptions.InvalidArgumentValueError(
                    "Failure to find tabular resource in dataset") from error

    if self.hyperparams["column_overlap"] == "exact":
        columns_to_handle = dataframes[0].columns
        if not all(np.all(df.columns == columns_to_handle) for df in dataframes):
            raise exceptions.InvalidArgumentValueError(
                "Dataframes do not have the same columns, cannot exact concat")
        concated = pd.concat(dataframes, ignore_index=True)
    elif self.hyperparams["column_overlap"] == "union":
        concated = pd.concat(dataframes, ignore_index=True)
    elif self.hyperparams["column_overlap"] == "intersection":
        concated = pd.concat(dataframes, join="inner", ignore_index=True)
    else:
        raise exceptions.InvalidArgumentValueError(
            "Unsupported column_overlap value: {}".format(self.hyperparams["column_overlap"]))

    if self.hyperparams["remove_duplicate_rows"]:
        concated.drop_duplicates(
            subset="d3mIndex", keep="first", inplace=True, ignore_index=True)

    if metadata is None:
        metadata = container.Dataset(
            {"learningData": concated.head(1)}, generate_metadata=True).metadata
    outputs = container.Dataset({"learningData": concated}, metadata)
    outputs.metadata = outputs.metadata.update(
        (metadata_base.ALL_ELEMENTS,),
        {"dimension": {"length": concated.shape[0]}})

    return base.CallResult(outputs)
def multi_produce(self, *, inputs1: Inputs, inputs2: Inputs, produce_methods: typing.Sequence[str],
                  timeout: float = None, iterations: int = None) -> MultiCallResult:
    results = []
    for method_name in produce_methods:
        if method_name != 'produce' and not method_name.startswith('produce_'):
            raise exceptions.InvalidArgumentValueError(
                "Invalid produce method name '{method_name}'.".format(method_name=method_name))

        if not hasattr(self, method_name):
            raise exceptions.InvalidArgumentValueError(
                "Unknown produce method name '{method_name}'.".format(method_name=method_name))

        try:
            expected_arguments = set(self.metadata.query()['primitive_code'].get(
                'instance_methods', {})[method_name]['arguments'])
        except KeyError as error:
            raise exceptions.InvalidArgumentValueError(
                "Unknown produce method name '{method_name}'.".format(method_name=method_name)) from error

        arguments = {'inputs1': inputs1, 'inputs2': inputs2}

        start = time.perf_counter()
        results.append(getattr(self, method_name)(timeout=timeout, **arguments))
        delta = time.perf_counter() - start

        # Decrease the amount of time available to other calls. This delegates responsibility
        # of raising a "TimeoutError" exception to produce methods themselves. It also assumes
        # that if one passes a negative timeout value to a produce method, it raises a
        # "TimeoutError" exception correctly.
        if timeout is not None:
            timeout -= delta

    # We return the maximum number of iterations done by any produce method we called.
    iterations_done = None
    for result in results:
        if result.iterations_done is not None:
            if iterations_done is None:
                iterations_done = result.iterations_done
            else:
                iterations_done = max(iterations_done, result.iterations_done)

    return MultiCallResult(
        values={name: result.value for name, result in zip(produce_methods, results)},
        has_finished=all(result.has_finished for result in results),
        iterations_done=iterations_done,
    )
def set_target_column(dataset):
    """
    Function used for unit tests.
    """

    # TODO: Cannot assume resource_id '0' exists.
    resource_id = '0'
    for index in range(
            dataset.metadata.query((resource_id, ALL_ELEMENTS))['dimension']['length'] - 1, -1, -1):
        column_semantic_types = dataset.metadata.query(
            (resource_id, ALL_ELEMENTS, index))['semantic_types']
        if 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' in column_semantic_types:
            column_semantic_types = list(column_semantic_types) + [
                'https://metadata.datadrivendiscovery.org/types/Target',
                'https://metadata.datadrivendiscovery.org/types/TrueTarget',
            ]
            dataset.metadata = dataset.metadata.update(
                (resource_id, ALL_ELEMENTS, index),
                {'semantic_types': column_semantic_types})
            return

    raise exceptions.InvalidArgumentValueError(
        'At least one column should have semantic type SuggestedTarget')
def select_columns(self: D, columns: typing.Sequence[metadata_base.SimpleSelectorSegment], *,
                   allow_empty_columns: bool = False) -> D:
    """
    Returns a new DataFrame with data and metadata only for given ``columns``.
    Moreover, columns are renumbered based on their position in the ``columns`` list.

    Top-level metadata stays unchanged, except for updating the length of the columns
    dimension to the number of columns.

    So if ``columns`` is ``[3, 6, 5]``, the output DataFrame will have three columns,
    ``[0, 1, 2]``, mapping data and metadata for column ``3`` to ``0``, ``6`` to ``1``,
    and ``5`` to ``2``.

    This also allows duplication of columns.
    """

    if not columns and not allow_empty_columns:
        raise exceptions.InvalidArgumentValueError("No columns selected.")

    output = self.iloc[:, list(columns)]

    # We want to make sure it is a true copy.
    if output._is_view:
        output = output.copy()
    else:
        output._set_is_copy(copy=False)

    output.metadata = self.metadata.select_columns(columns, allow_empty_columns=allow_empty_columns)

    return output
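# A minimal usage sketch, assuming the d3m core package; data is illustrative.
# It shows the renumbering and the column duplication the docstring describes.
from d3m import container

df = container.DataFrame({'a': [1], 'b': [2], 'c': [3]}, generate_metadata=True)

# Select columns 2, 0, 0: output columns are renumbered [0, 1, 2] and
# input column 0 appears twice.
subset = df.select_columns([2, 0, 0])
print(list(subset.columns))  # ['c', 'a', 'a']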
def crawl_openml_handler(
    arguments: argparse.Namespace, *,
    pipeline_resolver: typing.Callable = None,
    dataset_resolver: typing.Callable = None,
    problem_resolver: typing.Callable = None,
) -> None:
    if pipeline_resolver is None:
        pipeline_resolver = pipeline_module.get_pipeline
    if dataset_resolver is None:
        dataset_resolver = dataset_module.get_dataset
    if problem_resolver is None:
        problem_resolver = problem_module.get_problem

    context = metadata_base.Context[arguments.context]
    compute_digest = dataset_module.ComputeDigest[getattr(
        arguments, 'compute_digest', dataset_module.ComputeDigest.ONLY_IF_MISSING.name)]
    runtime_environment = pipeline_run_module.RuntimeEnvironment(
        worker_id=getattr(arguments, 'worker_id', None),
    )

    task_types = [problem_module.OpenMLTaskType[task_type] for task_type in arguments.task_types]
    if utils.has_duplicates(task_types):
        raise exceptions.InvalidArgumentValueError("Same task type listed multiple times.")
    assert task_types

    inputs_config = runtime._get_inputs_config_from_arguments(
        arguments=arguments,
        pipeline_resolver=pipeline_resolver,
        dataset_resolver=dataset_resolver,
    )
    assert inputs_config.data_pipeline

    has_errored = crawl_openml(
        save_dir=arguments.save_dir,
        task_types=task_types,
        data_pipeline=inputs_config.data_pipeline,
        data_params=inputs_config.data_params,
        context=context,
        random_seed=inputs_config.data_random_seed,
        volumes_dir=getattr(arguments, 'volumes_dir', None),
        scratch_dir=getattr(arguments, 'scratch_dir', None),
        runtime_environment=runtime_environment,
        max_tasks=arguments.max_tasks,
        ignore_tasks=arguments.ignore_tasks or [],
        ignore_datasets=arguments.ignore_datasets or [],
        dataset_resolver=dataset_resolver,
        problem_resolver=problem_resolver,
        compute_digest=compute_digest,
        strict_digest=getattr(arguments, 'strict_digest', False),
    )

    if has_errored:
        sys.exit(1)
def produce(self, *,
            left: Inputs,  # type: ignore
            right: Inputs,  # type: ignore
            timeout: float = None,
            iterations: int = None) -> base.CallResult[Outputs]:
    # attempt to extract the main table
    try:
        left_resource_id, left_df = utils.get_tabular_resource(left, None)
    except ValueError as error:
        raise exceptions.InvalidArgumentValueError(
            "Failure to find tabular resource in left dataset") from error

    try:
        right_resource_id, right_df = utils.get_tabular_resource(right, None)
    except ValueError as error:
        raise exceptions.InvalidArgumentValueError(
            "Failure to find tabular resource in right dataset") from error

    accuracy = self.hyperparams['accuracy']
    if accuracy <= 0.0 or accuracy > 1.0:
        raise exceptions.InvalidArgumentValueError(
            'accuracy of ' + str(accuracy) + ' is out of range')

    left_col = self.hyperparams['left_col']
    right_col = self.hyperparams['right_col']

    # perform join based on semantic type
    join_type = self._get_join_semantic_type(
        left, left_resource_id, left_col, right, right_resource_id, right_col)

    joined: typing.Optional[pd.DataFrame] = None
    if join_type in self._STRING_JOIN_TYPES:
        joined = self._join_string_col(left_df, left_col, right_df, right_col, accuracy)
    elif join_type in self._NUMERIC_JOIN_TYPES:
        joined = self._join_numeric_col(left_df, left_col, right_df, right_col, accuracy)
    elif join_type in self._DATETIME_JOIN_TYPES:
        joined = self._join_datetime_col(left_df, left_col, right_df, right_col, accuracy)
    else:
        raise exceptions.InvalidArgumentValueError(
            'join not supported on type ' + str(join_type))

    # create a new dataset to hold the joined data
    resource_map = {}
    for resource_id, resource in left.items():  # type: ignore
        if resource_id == left_resource_id:
            resource_map[resource_id] = joined
        else:
            resource_map[resource_id] = resource
    result_dataset = container.Dataset(resource_map)

    return base.CallResult(result_dataset)
def _get_value_indices(self, inputs_metadata):
    value_indices = self.hyperparams["value_cols"]
    if value_indices:
        return value_indices

    value_indices = inputs_metadata.list_columns_with_semantic_types(self._target_semantic)
    if len(value_indices) > 0:
        return value_indices

    raise exceptions.InvalidArgumentValueError("no columns with the target semantic type")
def __init__(self, other: typing.Dict[str, typing.Any] = None, **values: typing.Any) -> None:
    if other is None:
        other = {}

    values = dict(other, **values)

    params_keys = set(self.__params_items__.keys())  # type: ignore
    values_keys = set(values.keys())

    missing = params_keys - values_keys
    if missing:
        raise exceptions.InvalidArgumentValueError(
            "Not all parameters are specified: {missing}".format(missing=missing))

    extra = values_keys - params_keys
    if extra:
        raise exceptions.InvalidArgumentValueError(
            "Additional parameters are specified: {extra}".format(extra=extra))

    super().__init__(values)
def _get_time_index(self, inputs_metadata):
    # Compare against None explicitly so that column index 0 is accepted.
    time_index = self.hyperparams["time_col"]
    if time_index is not None:
        return time_index

    time_indices = inputs_metadata.list_columns_with_semantic_types(self._time_semantic)
    if len(time_indices) > 0:
        return time_indices[0]

    raise exceptions.InvalidArgumentValueError("no column with the time semantic type")
def produce(self, *,
            inputs: container.DataFrame,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:
    file_index = self.hyperparams['file_col_index']
    if file_index is not None:
        if not self._is_csv_file_column(inputs.metadata, file_index):
            raise exceptions.InvalidArgumentValueError(
                'column idx=' + str(file_index) + ' from ' + str(inputs.columns) +
                ' does not contain csv file names')
    else:
        file_index = self._find_csv_file_column(inputs.metadata)
        if file_index is None:
            raise exceptions.InvalidArgumentValueError(
                'no column from ' + str(inputs.columns) + ' contains csv file names')

    value_index = self.hyperparams['value_col_index']
    time_index = self.hyperparams['time_col_index']

    # load each time series file, transpose, and append
    base_path = inputs.metadata.query(
        (metadata_base.ALL_ELEMENTS, file_index))['location_base_uris'][0]
    timeseries_dataframe: pd.DataFrame
    for idx, file_path in enumerate(inputs.iloc[:, file_index]):
        csv_path = os.path.join(base_path, file_path)
        timeseries_row = pd.read_csv(csv_path).transpose()

        # use the time values as the column headers
        if idx == 0:
            timeseries_dataframe = pd.DataFrame(columns=timeseries_row.iloc[time_index])

        timeseries_dataframe = timeseries_dataframe.append(timeseries_row.iloc[value_index])

    # reset the index to use a range of ints rather than the value col name
    timeseries_dataframe = timeseries_dataframe.reset_index(drop=True)

    # wrap as a D3M container - metadata should be auto generated
    return base.CallResult(container.DataFrame(data=timeseries_dataframe))
def _get_grouping_key_index(self, inputs_metadata):
    # Compare against None explicitly so that column index 0 is accepted.
    group_key_index = self.hyperparams["grouping_key_col"]
    if group_key_index is not None:
        return group_key_index

    grouping_key_indices = inputs_metadata.list_columns_with_semantic_types(self._grouping_key_semantic)
    if len(grouping_key_indices) > 0:
        return grouping_key_indices[0]

    raise exceptions.InvalidArgumentValueError("no column with the grouping key semantic type")
def log_likelihoods(self, *,
                    outputs: Outputs,
                    inputs: Inputs,
                    timeout: float = None,
                    iterations: int = None) -> CallResult[Sequence[float]]:
    inputs = inputs.iloc[:, self._training_indices]  # Get ndarray
    outputs = outputs.iloc[:, self._target_column_indices]

    if len(inputs.columns) and len(outputs.columns):
        if outputs.shape[1] != self._n_classes:
            raise exceptions.InvalidArgumentValueError(
                "\"outputs\" argument does not have the correct number of target columns.")

        log_proba = self._predict_log_proba(inputs, self._weights)

        # Making it always a list, even when only one target.
        if self._n_classes == 1:
            log_proba = [log_proba]
            classes = [self._classes_]
        else:
            classes = self._classes_

        samples_length = inputs.shape[0]

        log_likelihoods = []
        for k in range(self._n_classes):
            # We have to map each class to its internal (numerical) index used in the learner.
            # This allows "outputs" to contain string classes.
            outputs_column = outputs.iloc[:, k]
            classes_map = pandas.Series(np.arange(len(classes[k])), index=classes[k])
            mapped_outputs_column = outputs_column.map(classes_map)

            # For each target column (column in "outputs"), for each sample (row) we pick the log
            # likelihood for a given class.
            log_likelihoods.append(log_proba[k][np.arange(samples_length), mapped_outputs_column])

        results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True)
        results.columns = outputs.columns

        for k in range(self._n_classes):
            column_metadata = outputs.metadata.query_column(k)
            if 'name' in column_metadata:
                results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']})
    else:
        results = d3m_dataframe(generate_metadata=True)

    return CallResult(results)
def produce(self, *,
            inputs: container.Dataset,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.Dataset]:
    main_resource_index = self.hyperparams['main_resource_index']
    if main_resource_index is None:
        raise exceptions.InvalidArgumentValueError('no main resource specified')

    file_index = self.hyperparams['file_col_index']
    if file_index is not None:
        if not self._is_csv_file_column(inputs.metadata, main_resource_index, file_index):
            raise exceptions.InvalidArgumentValueError(
                'column idx=' + str(file_index) + ' of the main resource does not contain csv file names')
    else:
        file_index = self._find_csv_file_column(inputs.metadata)
        if file_index is None:
            raise exceptions.InvalidArgumentValueError(
                'no column of the main resource contains csv file names')

    # generate the long form timeseries data
    base_path = self._get_base_path(inputs.metadata, main_resource_index, file_index)
    csv_paths = [
        os.path.join(base_path, f)
        for f in inputs[main_resource_index].iloc[:, file_index]
    ]
    ts_values = [pd.read_csv(path) for path in csv_paths]
    for ts, val in zip(ts_values, inputs[main_resource_index].values):
        ts[list(inputs[main_resource_index])] = pd.DataFrame([list(val)], index=ts.index)
    timeseries_dataframe = pd.concat(ts_values)
    timeseries_dataframe = container.DataFrame(timeseries_dataframe)

    return base.CallResult(
        container.Dataset({'0': timeseries_dataframe}, generate_metadata=True))
def fit_multi_produce(self, *, produce_methods: Sequence[str], inputs: Inputs,
                      timeout: float = None, iterations: int = None) -> MultiCallResult:
    self.set_training_data(inputs=inputs)  # type: ignore

    # Only a single "produce" method is supported; reject anything else instead
    # of silently ignoring additional requested methods.
    if len(produce_methods) != 1:
        raise exceptions.InvalidArgumentValueError(
            "Exactly one produce method is supported, got: {produce_methods}.".format(
                produce_methods=list(produce_methods)))

    method_name = produce_methods[0]
    if method_name != 'produce':
        raise exceptions.InvalidArgumentValueError(
            "Invalid produce method name '{method_name}'.".format(method_name=method_name))

    result = self.fit(timeout=timeout, iterations=iterations)

    return MultiCallResult(values={method_name: result.value})
def _granularityToRule(self):
    # Map the granularity hyper-parameter to a pandas offset alias.
    granularity = self.hyperparams["granularity"]
    rules = {
        "seconds": "S",
        "minutes": "T",
        "hours": "H",
        "days": "D",
        "weeks": "W",
        "months": "M",
        "years": "A",
    }
    try:
        return rules[granularity]
    except KeyError:
        raise exceptions.InvalidArgumentValueError(
            "Given granularity argument not supported: {}".format(granularity))
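# A minimal sketch of how such an offset alias is typically consumed downstream;
# the series and rule here are illustrative, not taken from this primitive.
import pandas as pd

index = pd.date_range("2021-01-01", periods=6, freq="H")
series = pd.Series(range(6), index=index)

# "D" is the alias the helper above returns for "days".
print(series.resample("D").mean())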
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
    primitive_1 = self.hyperparams['primitive_1']
    primitive_2 = self.hyperparams['primitive_2']

    results = []

    if primitive_1 is not None:
        start = time.perf_counter()
        results.append(primitive_1.produce(inputs=inputs, timeout=timeout, iterations=iterations))
        delta = time.perf_counter() - start

        # Decrease the amount of time available to other calls. This delegates responsibility
        # of raising a "TimeoutError" exception to produce methods themselves. It also assumes
        # that if one passes a negative timeout value to a produce method, it raises a
        # "TimeoutError" exception correctly.
        if timeout is not None:
            timeout -= delta

    if primitive_2 is not None:
        results.append(primitive_2.produce(inputs=inputs, timeout=timeout, iterations=iterations))

    if not results:
        raise exceptions.InvalidArgumentValueError("No primitives provided as hyper-parameters.")

    # Even if the structure of outputs is the same as inputs, conceptually, outputs are different,
    # they are new data. So we do not reuse metadata from inputs but generate new metadata.
    outputs = container.List([sum(x) for x in zip(*[result.value for result in results])], generate_metadata=True)

    # We return the maximum number of iterations done by any produce method we called.
    iterations_done = None
    for result in results:
        if result.iterations_done is not None:
            if iterations_done is None:
                iterations_done = result.iterations_done
            else:
                iterations_done = max(iterations_done, result.iterations_done)

    return base.CallResult(
        value=outputs,
        has_finished=all(result.has_finished for result in results),
        iterations_done=iterations_done,
    )
def bind_primitive_IO(self, primitive: PrimitiveStep, templateIO):
    if len(templateIO) > 0:
        primitive.add_argument(
            name="inputs",
            argument_type=metadata_base.ArgumentType.CONTAINER,
            data_reference=templateIO[0])

    if len(templateIO) > 1:
        arguments = primitive.primitive.metadata.query()['primitive_code'][
            'instance_methods']['set_training_data']['arguments']
        if "outputs" in arguments:
            # Some primitives (e.g. GreedyImputer) require "outputs",
            # while others do not (e.g. MeanImputer).
            primitive.add_argument(
                "outputs", metadata_base.ArgumentType.CONTAINER, templateIO[1])

    if len(templateIO) > 2:
        raise exceptions.InvalidArgumentValueError(
            "Expected at most two arguments (inputs and outputs), got {count}.".format(
                count=len(templateIO)))
def get_primitive_by_id(primitive_id: str) -> typing.Type[base.PrimitiveBase]:
    """
    Returns a primitive class based on its ID from all currently loaded primitives.

    Parameters
    ----------
    primitive_id:
        An ID of a primitive.

    Returns
    -------
    A primitive class.
    """

    for primitive in get_loaded_primitives():
        if primitive.metadata.query()['id'] == primitive_id:
            return primitive

    raise exceptions.InvalidArgumentValueError(
        "Unable to get primitive '{primitive_id}'.".format(primitive_id=primitive_id))
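# A minimal usage sketch, assuming primitives have been loaded first (e.g. via
# load_all() from this module); we take the ID from an already-loaded primitive
# so the lookup is guaranteed to succeed.
load_all()
first = get_loaded_primitives()[0]
primitive_id = first.metadata.query()['id']
assert get_primitive_by_id(primitive_id) is first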
def _produce(
    self,
    *,
    left_df_full: container.DataFrame,  # type: ignore
    left_df: container.DataFrame,  # type: ignore
    right_df: container.DataFrame,  # type: ignore
    join_types: typing.Sequence[str],
    left_col: typing.Sequence[int],
    right_col: typing.Sequence[int],
    accuracy: typing.Sequence[float],
    absolute_accuracy: typing.Sequence[bool],
) -> base.CallResult[Outputs]:
    # cycle through the columns to join the dataframes
    right_cols_to_drop = []
    new_left_cols = []
    new_right_cols = []
    for col_index in range(len(left_col)):
        # depending on the joining type, make a new dataframe that has the columns we will
        # want to merge on; keep track of which columns we will want to drop later on
        if len(self._STRING_JOIN_TYPES.intersection(join_types[col_index])) > 0:
            new_left_df = self._create_string_merge_cols(
                left_df,
                left_col[col_index],
                right_df,
                right_col[col_index],
                accuracy[col_index],
                col_index,
            )
            left_df[new_left_df.columns] = new_left_df
            right_name = "righty_string" + str(col_index)
            right_df.rename(columns={right_col[col_index]: right_name}, inplace=True)
            new_left_cols += list(new_left_df.columns)
            new_right_cols.append(right_name)
        elif len(self._NUMERIC_JOIN_TYPES.intersection(join_types[col_index])) > 0:
            new_left_df = self._create_numeric_merge_cols(
                left_df,
                left_col[col_index],
                right_df,
                right_col[col_index],
                accuracy[col_index],
                col_index,
                absolute_accuracy[col_index],
            )
            left_df[new_left_df.columns] = new_left_df
            right_name = "righty_numeric" + str(col_index)
            right_df.rename(columns={right_col[col_index]: right_name}, inplace=True)
            new_left_cols += list(new_left_df.columns)
            new_right_cols.append(right_name)
        elif len(self._GEO_JOIN_TYPES.intersection(join_types[col_index])) > 0:
            new_left_df, new_right_df = self._create_geo_vector_merging_cols(
                left_df,
                left_col[col_index],
                right_df,
                right_col[col_index],
                accuracy[col_index],
                col_index,
                absolute_accuracy[col_index],
            )
            left_df[new_left_df.columns] = new_left_df
            right_df[new_right_df.columns] = new_right_df
            new_left_cols += list(new_left_df.columns)
            new_right_cols += list(new_right_df.columns)
            right_cols_to_drop.append(right_col[col_index])
        elif len(self._VECTOR_JOIN_TYPES.intersection(join_types[col_index])) > 0:
            new_left_df, new_right_df = self._create_vector_merging_cols(
                left_df,
                left_col[col_index],
                right_df,
                right_col[col_index],
                accuracy[col_index],
                col_index,
                absolute_accuracy[col_index],
            )
            left_df[new_left_df.columns] = new_left_df
            right_df[new_right_df.columns] = new_right_df
            new_left_cols += list(new_left_df.columns)
            new_right_cols += list(new_right_df.columns)
            right_cols_to_drop.append(right_col[col_index])
        elif len(self._DATETIME_JOIN_TYPES.intersection(join_types[col_index])) > 0:
            tolerance = self._compute_datetime_tolerance(
                left_df_full,
                left_col[col_index],
                right_df,
                right_col[col_index],
                accuracy[col_index],
            )
            new_left_df, new_right_df = self._create_datetime_merge_cols(
                left_df,
                left_col[col_index],
                right_df,
                right_col[col_index],
                tolerance,
                col_index,
            )
            left_df[new_left_df.columns] = new_left_df
            right_df[new_right_df.columns] = new_right_df
            new_left_cols += list(new_left_df.columns)
            new_right_cols += list(new_right_df.columns)
            right_cols_to_drop.append(right_col[col_index])
        else:
            raise exceptions.InvalidArgumentValueError(
                "join not supported on type " + str(join_types[col_index])
            )

    if "d3mIndex" in right_df.columns:
        right_cols_to_drop.append("d3mIndex")
    right_df.drop(columns=right_cols_to_drop, inplace=True)

    joined = pd.merge(
        left_df,
        right_df,
        how=self.hyperparams["join_type"],
        left_on=new_left_cols,
        right_on=new_right_cols,
        suffixes=["_left", "_right"],
    )

    # we don't want to keep columns that were created specifically for merging;
    # also, an inner merge keeps the right column we merge on, so we remove it
    joined.drop(columns=new_left_cols + new_right_cols, inplace=True)

    return joined
def produce(
    self,
    *,
    left: Inputs,  # type: ignore
    right: Inputs,  # type: ignore
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[Outputs]:
    # attempt to extract the main table
    try:
        left_resource_id, left_df = d3m_base_utils.get_tabular_resource(left, None)
    except ValueError as error:
        raise exceptions.InvalidArgumentValueError(
            "Failure to find tabular resource in left dataset"
        ) from error

    try:
        right_resource_id, right_df = d3m_base_utils.get_tabular_resource(right, None)
    except ValueError as error:
        raise exceptions.InvalidArgumentValueError(
            "Failure to find tabular resource in right dataset"
        ) from error

    accuracy = self.hyperparams["accuracy"]
    absolute_accuracy = self.hyperparams["absolute_accuracy"]

    # hyperparams may be parsed as tuples, and
    # floats could be integers if a round number is passed in
    if isinstance(accuracy, collections.abc.Iterable):
        accuracy = [float(a) for a in accuracy]
    else:
        accuracy = float(accuracy)
    if isinstance(absolute_accuracy, collections.abc.Iterable):
        absolute_accuracy = list(absolute_accuracy)

    if type(accuracy) == float and not type(absolute_accuracy) == bool:
        raise exceptions.InvalidArgumentValueError(
            "only 1 value of accuracy provided, but multiple values for absolute accuracy provided"
        )
    if (not type(accuracy) == float) and type(absolute_accuracy) == bool:
        raise exceptions.InvalidArgumentValueError(
            "only 1 value of absolute accuracy provided, but multiple values of accuracy provided"
        )
    if type(accuracy) == float and not absolute_accuracy:
        if accuracy <= 0.0 or accuracy > 1.0:
            raise exceptions.InvalidArgumentValueError(
                "accuracy of " + str(accuracy) + " is out of range"
            )
    elif type(accuracy) == list and type(absolute_accuracy) == list:
        if not len(accuracy) == len(absolute_accuracy):
            raise exceptions.InvalidArgumentValueError(
                "the count of accuracy hyperparams does not match the count of absolute_accuracy hyperparams"
            )
        for i in range(len(accuracy)):
            if (accuracy[i] <= 0.0 or accuracy[i] > 1.0) and not absolute_accuracy[i]:
                raise exceptions.InvalidArgumentValueError(
                    "accuracy of " + str(accuracy[i]) + " is out of range"
                )

    left_col = self.hyperparams["left_col"]
    right_col = self.hyperparams["right_col"]

    if type(left_col) != type(right_col) or (
        type(left_col) == list
        and len(left_col) != len(right_col)
        and type(accuracy) != list
        and len(accuracy) != len(left_col)
    ):
        raise exceptions.InvalidArgumentTypeError(
            "both left_col and right_col need to have the same data type and, "
            "if they are lists, the same list lengths"
        )
    if type(left_col) == str:
        left_col = [left_col]
        right_col = [right_col]
        accuracy = [accuracy]
        absolute_accuracy = [absolute_accuracy]

    join_types = [
        self._get_join_semantic_type(
            left,
            left_resource_id,
            left_col[i],
            right,
            right_resource_id,
            right_col[i],
        )
        for i in range(len(left_col))
    ]

    num_splits = 32
    joined_split = [None for i in range(num_splits)]
    left_df_split = np.array_split(left_df, num_splits)
    jobs = [
        delayed(self._produce_threaded)(
            index=i,
            left_df_full=left_df,
            left_dfs=left_df_split,
            right_df=right_df,
            join_types=join_types,
            left_col=left_col,
            right_col=right_col,
            accuracy=accuracy,
            absolute_accuracy=absolute_accuracy,
        )
        for i in range(num_splits)
    ]
    joined_data = Parallel(n_jobs=self.hyperparams["n_jobs"], backend="loky", verbose=10)(jobs)

    # joined data needs to maintain order to mimic non-split joining
    for i, d in joined_data:
        joined_split[i] = d
    joined = pd.concat(joined_split, ignore_index=True)

    # create a new dataset to hold the joined data
    resource_map = {}
    float_vector_columns = {}
    for resource_id, resource in left.items():  # type: ignore
        if resource_id == left_resource_id:
            for column in joined.columns:
                # need to avoid a bug in container.Dataset, it doesn't like vector columns
                if type(joined[column].iloc[0]) == np.ndarray:
                    float_vector_columns[column] = joined[column]
                    joined[column] = np.NAN
            resource_map[resource_id] = joined
        else:
            resource_map[resource_id] = resource

    # Generate metadata for the dataset using only the first row of each resource for speed -
    # metadata generation runs over each cell in the dataframe, but we only care about column
    # level generation. Once that's done, set the actual dataframe value.
    result_dataset = container.Dataset(
        {k: v.head(1) for k, v in resource_map.items()}, generate_metadata=True
    )
    for k, v in resource_map.items():
        result_dataset[k] = v
        result_dataset.metadata = result_dataset.metadata.update(
            (k,), {"dimension": {"length": v.shape[0]}}
        )

    for key in float_vector_columns.keys():
        df = result_dataset[left_resource_id]
        df[key] = float_vector_columns[key]
        float_vec_loc = df.columns.get_loc(key)
        float_vec_col_indices = df.metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/FloatVector",)
        )
        if float_vec_loc not in float_vec_col_indices:
            df.metadata = df.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, float_vec_loc),
                "https://metadata.datadrivendiscovery.org/types/FloatVector",
            )

    return base.CallResult(result_dataset)
def _get_truth(self, score_dataset: container.Dataset) -> typing.Tuple[pandas.DataFrame, typing.Dict[str, typing.Any]]:
    """
    Extracts true targets from the Dataset's entry point, or the only tabular resource.
    It requires that there is only one primary index column, which it makes the first
    column, named ``d3mIndex``. Then true target columns follow.

    We return a regular Pandas DataFrame with column names matching those in the metadata,
    and a dict mapping target columns to all label values in those columns, if available
    in metadata. We convert all columns to strings to match what would be loaded from a
    ``predictions.csv`` file. It encodes any float vectors as strings.
    """

    main_resource_id, main_resource = base_utils.get_tabular_resource(score_dataset, None, has_hyperparameter=False)

    # We first copy before modifying in-place.
    main_resource = container.DataFrame(main_resource, copy=True)
    main_resource = self._encode_columns(main_resource)
    dataframe = self._to_dataframe(main_resource)

    indices = list(score_dataset.metadata.get_index_columns(at=(main_resource_id,)))
    targets = list(score_dataset.metadata.list_columns_with_semantic_types(
        ['https://metadata.datadrivendiscovery.org/types/TrueTarget'],
        at=(main_resource_id,),
    ))

    if not indices:
        raise exceptions.InvalidArgumentValueError("No primary index column.")
    elif len(indices) > 1:
        raise exceptions.InvalidArgumentValueError("More than one primary index column.")
    if not targets:
        raise ValueError("No true target columns.")

    dataframe = dataframe.iloc[:, indices + targets]

    dataframe = dataframe.rename(columns={dataframe.columns[0]: metrics.INDEX_COLUMN})

    if metrics.SCORE_COLUMN in dataframe.columns[1:]:
        raise ValueError("True target column cannot be named \"confidence\". It is a reserved name.")
    if metrics.RANK_COLUMN in dataframe.columns[1:]:
        raise ValueError("True target column cannot be named \"rank\". It is a reserved name.")
    if metrics.INDEX_COLUMN in dataframe.columns[1:]:
        raise ValueError("True target column cannot be named \"d3mIndex\". It is a reserved name.")

    if d3m_utils.has_duplicates(dataframe.columns):
        duplicate_names = list(dataframe.columns)
        for name in set(dataframe.columns):
            duplicate_names.remove(name)
        raise exceptions.InvalidArgumentValueError(
            "True target columns have duplicate names: {duplicate_names}".format(
                duplicate_names=sorted(set(duplicate_names)),
            ),
        )

    all_labels = {}

    for target_column_name, main_resource_column_index in zip(dataframe.columns[1:], targets):
        try:
            column_labels = score_dataset.metadata.query_column_field(
                main_resource_column_index, 'all_distinct_values', at=(main_resource_id,))
        except KeyError:
            continue

        all_labels[target_column_name] = [str(label) for label in column_labels]

    return dataframe, all_labels
def produce(self, *,  # type: ignore
            inputs: Inputs,
            score_dataset: container.Dataset,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[Outputs]:
    if not self.hyperparams['metrics']:
        raise ValueError("\"metrics\" hyper-parameter cannot be empty.")

    truth, all_labels = self._get_truth(score_dataset)
    predictions = self._get_predictions(inputs)

    for target_column in self.hyperparams['all_labels']:
        all_labels[target_column['column_name']] = list(target_column['labels'])

    outputs: typing.Dict[str, typing.List] = {
        'metric': [],
        'value': [],
    }

    if self.hyperparams['add_normalized_scores']:
        outputs['normalized'] = []

    for metric_configuration in self.hyperparams['metrics']:
        metric = problem.PerformanceMetric[metric_configuration['metric']]
        metric_class = metric.get_class()

        params = {}

        if 'all_labels' in inspect.signature(metric_class).parameters and all_labels:
            params['all_labels'] = all_labels

        for param_name, param_value in metric_configuration.items():
            if param_name == 'metric':
                continue
            if param_value is None:
                continue
            params[param_name] = param_value

        if metric.requires_score() and metrics.SCORE_COLUMN not in predictions.columns:
            raise exceptions.InvalidArgumentValueError(
                f"Metric {metric.name} requires score column in predictions, but it is not available.",
            )
        if metric.requires_rank() and metrics.RANK_COLUMN not in predictions.columns:
            raise exceptions.InvalidArgumentValueError(
                f"Metric {metric.name} requires rank column in predictions, but it is not available.",
            )

        score = metric_class(**params).score(truth, predictions)

        outputs['metric'].append(metric.name)
        outputs['value'].append(score)

        if self.hyperparams['add_normalized_scores']:
            outputs['normalized'].append(metric.normalize(score))

    # Dictionary key order is preserved in Python 3.6+, which makes column order as we want it.
    results = container.DataFrame(data=outputs, columns=list(outputs.keys()), generate_metadata=True)

    # Not really necessary, but it does not hurt. In theory somebody could list the same metric
    # multiple times (maybe with different params), so we use "PrimaryMultiKey" here.
    results.metadata = results.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey',
    )
    results.metadata = results.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 1),
        'https://metadata.datadrivendiscovery.org/types/Score',
    )
    if self.hyperparams['add_normalized_scores']:
        results.metadata = results.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 2),
            'https://metadata.datadrivendiscovery.org/types/Score',
        )

    return base.CallResult(results)
def combine_columns(
    inputs: container.DataFrame,
    column_indices: typing.Sequence[int],
    columns_list: typing.Sequence[container.DataFrame],
    *,
    return_result: str,
    add_index_columns: bool,
) -> container.DataFrame:
    """
    Method which appends existing columns, replaces them, or creates a new result from them,
    based on the ``return_result`` argument, which can be ``append``, ``replace``, or ``new``.

    ``add_index_columns`` controls whether, when creating a new result, primary index columns
    should be added if they are not already among the columns.

    ``inputs`` is a DataFrame to which we are appending columns or in which we are replacing
    columns, or, if we are creating a new result, from where a primary index column can be taken.

    ``column_indices`` controls which columns in ``inputs`` were used to create ``columns_list``,
    and which columns should be replaced when replacing them.

    ``columns_list`` is a list of DataFrames representing, all together, the new columns. The
    reason it is a list is to make it easier to operate per-column when preparing ``columns_list``
    and not have to concat them all together unnecessarily.

    Top-level metadata in ``columns_list`` is ignored, except when creating a new result.
    In that case, top-level metadata from the first element in the list is used.

    When ``column_indices`` columns are being replaced with ``columns_list``, existing metadata
    in ``column_indices`` columns is not preserved but replaced with metadata in ``columns_list``.
    Ideally, metadata for ``columns_list`` has been constructed by copying source metadata from
    ``column_indices`` columns and modifying it as necessary to adapt it to the new columns. But
    ``columns_list`` can also have completely new metadata, if this is more reasonable; it should
    be understood, though, that in this case, when replacing ``column_indices`` columns, any
    custom additional metadata on those columns will be lost.

    ``column_indices`` and ``columns_list`` do not have to match in number of columns. Columns
    are first replaced in order for matching indices and columns. If there are then more
    ``column_indices`` than ``columns_list`` columns, the additional ``column_indices`` columns
    are removed. If there are more ``columns_list`` than ``column_indices`` columns, the
    additional ``columns_list`` columns are inserted after the last replaced column.

    If ``column_indices`` is empty, the replacing behavior is equivalent to appending.
    """

    if return_result == 'append':
        outputs = inputs
        for columns in columns_list:
            outputs = outputs.append_columns(columns)

    elif return_result == 'replace':
        if not column_indices:
            return combine_columns(inputs, column_indices, columns_list,
                                   return_result='append', add_index_columns=add_index_columns)

        # Compute the difference in columns.
        to_be_added = list(numpy.setdiff1d(numpy.arange(len(inputs.columns)), column_indices))
        columns_replaced = 0

        if len(to_be_added) < len(column_indices):
            # More efficient to concatenate than to replace one-by-one.
            outputs = pandas.concat(columns_list, axis=1)
            outputs = container.DataFrame(data=outputs, generate_metadata=False)
            indices = range(columns_list[0].shape[1])
            outputs.metadata = inputs.metadata.select_columns(columns=list(indices))

            c = 0
            for columns in columns_list:
                columns_length = columns.shape[1]
                if c == 0:
                    outputs.metadata = outputs.metadata.replace_columns(columns.metadata, list(indices))
                else:
                    outputs.metadata = outputs.metadata.append_columns(columns.metadata)
                c += 1

            for col in to_be_added:
                insert_index = col.item()
                if insert_index > outputs.shape[1]:
                    insert_index = outputs.shape[1]
                outputs = outputs.insert_columns(inputs.select_columns([col.item()]), insert_index)

            outputs.metadata = outputs.metadata.compact(['structural_type'])
        else:
            # We copy here and disable copying inside "replace_columns" to copy only once.
            # We have to copy because "replace_columns" is modifying data in-place.
            outputs = copy.copy(inputs)

            for columns in columns_list:
                columns_length = columns.shape[1]
                if columns_replaced < len(column_indices):
                    # It is OK if the slice of "column_indices" is shorter than "columns". Only those columns
                    # listed in the slice will be replaced; others are appended after the last replaced column.
                    outputs = outputs.replace_columns(
                        columns, column_indices[columns_replaced:columns_replaced + columns_length], copy=False)
                else:
                    # We insert the rest of columns after the last columns we replaced. We know that "column_indices"
                    # is non-empty and that the last item of "column_indices" points to the last column we replaced
                    # for those listed in "column_indices". We replaced more columns though, so we have to add the
                    # difference, and then add 1 to insert after the last column.
                    outputs = outputs.insert_columns(
                        columns, column_indices[-1] + (columns_replaced - len(column_indices)) + 1)
                columns_replaced += columns_length

            if columns_replaced < len(column_indices):
                outputs = outputs.remove_columns(column_indices[columns_replaced:len(column_indices)])

    elif return_result == 'new':
        if not any(columns.shape[1] for columns in columns_list):
            raise ValueError("No columns produced.")

        outputs = columns_list[0]
        for columns in columns_list[1:]:
            outputs = outputs.append_columns(columns)

        if add_index_columns:
            inputs_index_columns = inputs.metadata.get_index_columns()
            outputs_index_columns = outputs.metadata.get_index_columns()

            if inputs_index_columns and not outputs_index_columns:
                # Add index columns at the beginning.
                outputs = inputs.select_columns(inputs_index_columns).append_columns(
                    outputs, use_right_metadata=True)

    else:
        raise exceptions.InvalidArgumentValueError(
            "\"return_result\" has an invalid value: {return_result}".format(return_result=return_result))

    return outputs
def combine_columns_metadata(
    inputs: metadata_base.DataMetadata,
    column_indices: typing.Sequence[int],
    columns_list: typing.Sequence[metadata_base.DataMetadata],
    *,
    return_result: str,
    add_index_columns: bool,
) -> metadata_base.DataMetadata:
    """
    Analogous to ``combine_columns`` but operates only on metadata.
    """

    if return_result == 'append':
        outputs = inputs
        for columns in columns_list:
            outputs = outputs.append_columns(columns)

    elif return_result == 'replace':
        if not column_indices:
            return combine_columns_metadata(inputs, column_indices, columns_list,
                                            return_result='append', add_index_columns=add_index_columns)

        outputs = inputs

        columns_replaced = 0
        for columns in columns_list:
            columns_length = columns.query_field((metadata_base.ALL_ELEMENTS,), 'dimension')['length']
            if columns_replaced < len(column_indices):
                # It is OK if the slice of "column_indices" is shorter than "columns". Only those columns
                # listed in the slice will be replaced; others are appended after the last replaced column.
                outputs = outputs.replace_columns(
                    columns, column_indices[columns_replaced:columns_replaced + columns_length])
            else:
                # We insert the rest of columns after the last columns we replaced. We know that "column_indices"
                # is non-empty and that the last item of "column_indices" points to the last column we replaced
                # for those listed in "column_indices". We replaced more columns though, so we have to add the
                # difference, and then add 1 to insert after the last column.
                outputs = outputs.insert_columns(
                    columns, column_indices[-1] + (columns_replaced - len(column_indices)) + 1)
            columns_replaced += columns_length

        if columns_replaced < len(column_indices):
            outputs = outputs.remove_columns(column_indices[columns_replaced:len(column_indices)])

    elif return_result == 'new':
        if not any(
                columns_metadata.query_field((metadata_base.ALL_ELEMENTS,), 'dimension')['length']
                for columns_metadata in columns_list):
            raise ValueError("No columns produced.")

        outputs = columns_list[0]
        for columns in columns_list[1:]:
            outputs = outputs.append_columns(columns)

        if add_index_columns:
            inputs_index_columns = inputs.get_index_columns()
            outputs_index_columns = outputs.get_index_columns()

            if inputs_index_columns and not outputs_index_columns:
                # Add index columns at the beginning.
                outputs = inputs.select_columns(inputs_index_columns).append_columns(
                    outputs, use_right_metadata=True)

    else:
        raise exceptions.InvalidArgumentValueError(
            "\"return_result\" has an invalid value: {return_result}".format(return_result=return_result))

    return outputs