def produce(self, *, inputs: container.List, timeout: float = None,
            iterations: int = None) -> base.CallResult[container.Dataset]:
    # build the list of dataframes from the list of inputs
    dataframes = []
    metadata = None
    for input in inputs:
        if isinstance(input, container.DataFrame):
            dataframes.append(input)
        else:
            try:
                _, main_dr = d3m_base_utils.get_tabular_resource(input, None)
                dataframes.append(main_dr)
                metadata = input.metadata
            except ValueError as error:
                raise exceptions.InvalidArgumentValueError(
                    "Failure to find tabular resource in dataset") from error

    if self.hyperparams["column_overlap"] == "exact":
        columns_to_handle = dataframes[0].columns
        if np.sum(
                np.array([
                    np.all(df.columns == columns_to_handle)
                    for df in dataframes
                ])) != len(dataframes):
            raise exceptions.InvalidArgumentValueError(
                "Dataframes don't have the same columns, cannot exact concat")
        concated = pd.concat(dataframes, ignore_index=True)
    elif self.hyperparams["column_overlap"] == "union":
        concated = pd.concat(dataframes, ignore_index=True)
    elif self.hyperparams["column_overlap"] == "intersection":
        concated = pd.concat(dataframes, join="inner", ignore_index=True)

    if self.hyperparams["remove_duplicate_rows"]:
        concated.drop_duplicates(subset="d3mIndex",
                                 keep="first",
                                 inplace=True,
                                 ignore_index=True)

    if metadata is None:
        metadata = container.Dataset(
            {"learningData": concated.head(1)},
            generate_metadata=True).metadata

    outputs = container.Dataset({"learningData": concated}, metadata)
    outputs.metadata = outputs.metadata.update(
        (metadata_base.ALL_ELEMENTS, ),
        {"dimension": {
            "length": concated.shape[0]
        }})

    return base.CallResult(outputs)
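# A minimal standalone sketch (not part of the primitive) illustrating the three
# column_overlap behaviours above: pandas' default concat keeps the union of
# columns (filling missing cells with NaN), join="inner" keeps only the
# intersection, and "exact" is the union case after asserting identical columns.
# The frames below are toy data, not from the primitive.
import pandas as pd

a = pd.DataFrame({"d3mIndex": [0, 1], "x": [1.0, 2.0]})
b = pd.DataFrame({"d3mIndex": [2, 3], "y": [3.0, 4.0]})

union = pd.concat([a, b], ignore_index=True)                        # columns: d3mIndex, x, y
intersection = pd.concat([a, b], join="inner", ignore_index=True)   # columns: d3mIndex only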
def produce(self, *, inputs: container.Dataset, timeout: float = None,
            iterations: int = None) -> base.CallResult[container.Dataset]:
    main_resource_index = self.hyperparams['main_resource_index']
    if main_resource_index is None:
        raise exceptions.InvalidArgumentValueError('no main resource specified')

    file_index = self.hyperparams['file_col_index']
    if file_index is not None:
        if not self._is_csv_file_column(inputs.metadata, main_resource_index, file_index):
            raise exceptions.InvalidArgumentValueError(
                'column idx=' + str(file_index) + ' does not contain csv file names')
    else:
        file_index = self._find_csv_file_column(inputs.metadata)
        if file_index is None:
            raise exceptions.InvalidArgumentValueError(
                'no column contains csv file names')

    # generate the long form timeseries data
    base_path = self._get_base_path(inputs.metadata, main_resource_index, file_index)

    output_data = []
    timeseries_dataframe = pd.DataFrame()
    for idx, tRow in inputs[main_resource_index].iterrows():
        # read the timeseries data
        csv_path = os.path.join(base_path, tRow[file_index])
        timeseries_row = pd.read_csv(csv_path)

        # add the timeseries id (Series.append targets pre-2.0 pandas)
        tRow = tRow.append(pd.Series({'series_id': int(idx)}))

        # combine the timeseries data with the value row
        output_data.extend([
            pd.concat([tRow, vRow])
            for vIdx, vRow in timeseries_row.iterrows()
        ])

    # add the timeseries index (DataFrame.append targets pre-2.0 pandas)
    timeseries_dataframe = timeseries_dataframe.append(output_data, ignore_index=True)

    # join the metadata from the 2 data resources
    timeseries_dataframe = container.DataFrame(timeseries_dataframe)

    # wrap as a D3M container
    # return base.CallResult(container.Dataset({'0': timeseries_dataframe}, metadata))
    return base.CallResult(
        container.Dataset({'0': timeseries_dataframe}, generate_metadata=True))
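# The loop above relies on Series.append and DataFrame.append, which were
# removed in pandas 2.0. A hedged sketch of the equivalent build-up with
# pd.concat / the DataFrame constructor (the names row, rows, frame are
# illustrative, not from the primitive):
import pandas as pd

row = pd.Series({"a": 1})
row = pd.concat([row, pd.Series({"series_id": 0})])  # replaces Series.append

rows = [pd.Series({"a": 1, "b": 2}), pd.Series({"a": 3, "b": 4})]
frame = pd.DataFrame(rows)  # replaces DataFrame.append over a list of Series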
def time_classification_scores(self, rows):
    # This has been cut-and-pasted from test_compute_scores.py.
    truth = container.DataFrame({
        'd3mIndex': range(rows),
        'col0': (1, ) * rows,
    })

    truth_dataset = container.Dataset({'learningData': truth}, generate_metadata=True)

    truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(
        ('learningData', metadata_base.ALL_ELEMENTS, 0),
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
    truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(
        ('learningData', metadata_base.ALL_ELEMENTS, 1),
        'https://metadata.datadrivendiscovery.org/types/Target')
    truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(
        ('learningData', metadata_base.ALL_ELEMENTS, 1),
        'https://metadata.datadrivendiscovery.org/types/TrueTarget')

    # predictions are identical to truth, so they should have no impact on performance
    predictions = truth

    # configure the primitive
    hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams()
    metrics_class = hyperparams_class.configuration['metrics'].elements
    primitive = compute_scores.ComputeScoresPrimitive(
        hyperparams=hyperparams_class.defaults().replace({
            'metrics': [
                metrics_class({
                    'metric': 'ACCURACY',
                    'pos_label': None,
                    'k': None,
                }),
                metrics_class({
                    'metric': 'F1_MICRO',
                    'pos_label': None,
                    'k': None,
                }),
                metrics_class({
                    'metric': 'F1_MACRO',
                    'pos_label': None,
                    'k': None,
                }),
            ],
        }))

    # run scoring
    scores = primitive.produce(inputs=predictions, score_dataset=truth_dataset).value
def produce(self, *,
            left: Inputs,  # type: ignore
            right: Inputs,  # type: ignore
            timeout: float = None,
            iterations: int = None) -> base.CallResult[Outputs]:
    # attempt to extract the main table
    try:
        left_resource_id, left_df = utils.get_tabular_resource(left, None)
    except ValueError as error:
        raise exceptions.InvalidArgumentValueError(
            "Failure to find tabular resource in left dataset") from error
    try:
        right_resource_id, right_df = utils.get_tabular_resource(right, None)
    except ValueError as error:
        raise exceptions.InvalidArgumentValueError(
            "Failure to find tabular resource in right dataset") from error

    accuracy = self.hyperparams['accuracy']
    if accuracy <= 0.0 or accuracy > 1.0:
        raise exceptions.InvalidArgumentValueError(
            'accuracy of ' + str(accuracy) + ' is out of range')

    left_col = self.hyperparams['left_col']
    right_col = self.hyperparams['right_col']

    # perform the join based on the column's semantic type
    join_type = self._get_join_semantic_type(left, left_resource_id, left_col,
                                             right, right_resource_id, right_col)
    joined: pd.DataFrame = None
    if join_type in self._STRING_JOIN_TYPES:
        joined = self._join_string_col(left_df, left_col, right_df, right_col, accuracy)
    elif join_type in self._NUMERIC_JOIN_TYPES:
        joined = self._join_numeric_col(left_df, left_col, right_df, right_col, accuracy)
    elif join_type in self._DATETIME_JOIN_TYPES:
        joined = self._join_datetime_col(left_df, left_col, right_df, right_col, accuracy)
    else:
        raise exceptions.InvalidArgumentValueError(
            'join not supported on type ' + str(join_type))

    # create a new dataset to hold the joined data
    resource_map = {}
    for resource_id, resource in left.items():  # type: ignore
        if resource_id == left_resource_id:
            resource_map[resource_id] = joined
        else:
            resource_map[resource_id] = resource
    result_dataset = container.Dataset(resource_map)

    return base.CallResult(result_dataset)
def produce(self, *, inputs: container.Dataset, timeout: float = None,
            iterations: int = None) -> base.CallResult[container.Dataset]:
    main_resource_index = self.hyperparams['main_resource_index']
    if main_resource_index is None:
        raise exceptions.InvalidArgumentValueError('no main resource specified')

    file_index = self.hyperparams['file_col_index']
    if file_index is not None:
        if not self._is_csv_file_column(inputs.metadata, main_resource_index, file_index):
            raise exceptions.InvalidArgumentValueError(
                'column idx=' + str(file_index) + ' does not contain csv file names')
    else:
        file_index = self._find_csv_file_column(inputs.metadata)
        if file_index is None:
            raise exceptions.InvalidArgumentValueError(
                'no column contains csv file names')

    # generate the long form timeseries data
    base_path = self._get_base_path(inputs.metadata, main_resource_index, file_index)
    csv_paths = [
        os.path.join(base_path, f)
        for f in inputs[main_resource_index].iloc[:, file_index]
    ]
    ts_values = [pd.read_csv(path) for path in csv_paths]
    # copy each main-resource row onto every row of its timeseries file;
    # the row has to be tiled to the length of the timeseries, since a
    # one-row DataFrame does not broadcast against a longer index
    main_columns = list(inputs[main_resource_index])
    for ts, val in zip(ts_values, inputs[main_resource_index].values):
        ts[main_columns] = pd.DataFrame(
            np.tile(val, (ts.shape[0], 1)), columns=main_columns, index=ts.index)
    timeseries_dataframe = pd.concat(ts_values)
    timeseries_dataframe = container.DataFrame(timeseries_dataframe)

    return base.CallResult(
        container.Dataset({'0': timeseries_dataframe}, generate_metadata=True))
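# A toy sketch of the np.tile row broadcast used above: one main-resource row is
# repeated once per timeseries row and attached as new columns. The column and
# variable names here are illustrative, not from the primitive.
import numpy as np
import pandas as pd

ts = pd.DataFrame({"time": [0, 1, 2], "value": [1.0, 2.0, 3.0]})
row = np.array([42, "a.csv"], dtype=object)
ts[["d3mIndex", "file"]] = pd.DataFrame(
    np.tile(row, (ts.shape[0], 1)), columns=["d3mIndex", "file"], index=ts.index)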
def test_classification_non_d3mindex(self):
    # The score dataset has a non-d3mIndex primary key column.
    truth = container.DataFrame([
        [1, 'happy-pleased'],
        [2, 'amazed-suprised'],
        [3, 'sad-lonely'],
        [4, 'relaxing-calm'],
    ], columns=['non_d3mIndex', 'class_label'])

    truth_dataset = container.Dataset({'learningData': truth}, generate_metadata=True)

    truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
    truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Target')
    truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')

    predictions = container.DataFrame([
        [1, 'happy-pleased'],
        [2, 'amazed-suprised'],
        [3, 'relaxing-calm'],
        [4, 'sad-lonely'],
    ], columns=['d3mIndex', 'class_label'], generate_metadata=True)

    hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams()
    metrics_class = hyperparams_class.configuration['metrics'].elements
    primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
        'metrics': [metrics_class({
            'metric': 'ACCURACY',
            'pos_label': None,
            'k': None,
        })],
    }))

    scores = primitive.produce(inputs=predictions, score_dataset=truth_dataset).value

    self.assertEqual(scores.values.tolist(), [
        ['ACCURACY', 0.5, 0.5],
    ])
def produce(
    self,
    *,
    left: Inputs,  # type: ignore
    right: Inputs,  # type: ignore
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[Outputs]:
    # attempt to extract the main table
    try:
        left_resource_id, left_df = d3m_base_utils.get_tabular_resource(left, None)
    except ValueError as error:
        raise exceptions.InvalidArgumentValueError(
            "Failure to find tabular resource in left dataset"
        ) from error
    try:
        right_resource_id, right_df = d3m_base_utils.get_tabular_resource(right, None)
    except ValueError as error:
        raise exceptions.InvalidArgumentValueError(
            "Failure to find tabular resource in right dataset"
        ) from error

    accuracy = self.hyperparams["accuracy"]
    absolute_accuracy = self.hyperparams["absolute_accuracy"]

    # hyperparams may be parsed as tuples, and floats could arrive as
    # integers if a round number is passed in
    if isinstance(accuracy, collections.abc.Iterable):
        accuracy = [float(a) for a in accuracy]
    else:
        accuracy = float(accuracy)
    if isinstance(absolute_accuracy, collections.abc.Iterable):
        absolute_accuracy = list(absolute_accuracy)

    if type(accuracy) == float and not type(absolute_accuracy) == bool:
        raise exceptions.InvalidArgumentValueError(
            "only 1 value of accuracy provided, but multiple values for absolute accuracy provided"
        )
    if (not type(accuracy) == float) and type(absolute_accuracy) == bool:
        raise exceptions.InvalidArgumentValueError(
            "only 1 value for absolute accuracy provided, but multiple values of accuracy provided"
        )
    if type(accuracy) == float and not absolute_accuracy:
        if accuracy <= 0.0 or accuracy > 1.0:
            raise exceptions.InvalidArgumentValueError(
                "accuracy of " + str(accuracy) + " is out of range"
            )
    elif type(accuracy) == list and type(absolute_accuracy) == list:
        if not len(accuracy) == len(absolute_accuracy):
            raise exceptions.InvalidArgumentValueError(
                "the count of accuracy hyperparams does not match the count of absolute_accuracy hyperparams"
            )
        for i in range(len(accuracy)):
            if (accuracy[i] <= 0.0 or accuracy[i] > 1.0) and not absolute_accuracy[i]:
                raise exceptions.InvalidArgumentValueError(
                    "accuracy of " + str(accuracy[i]) + " is out of range"
                )

    left_col = self.hyperparams["left_col"]
    right_col = self.hyperparams["right_col"]

    if type(left_col) != type(right_col) or (
        type(left_col) == list
        and len(left_col) != len(right_col)
        and type(accuracy) != list
        and len(accuracy) != len(left_col)
    ):
        raise exceptions.InvalidArgumentTypeError(
            "both left_col and right_col need to have the same data type and, if they are lists, the same length"
        )
    if type(left_col) == str:
        left_col = [left_col]
        right_col = [right_col]
        accuracy = [accuracy]
        absolute_accuracy = [absolute_accuracy]

    join_types = [
        self._get_join_semantic_type(
            left,
            left_resource_id,
            left_col[i],
            right,
            right_resource_id,
            right_col[i],
        )
        for i in range(len(left_col))
    ]

    num_splits = 32
    joined_split = [None for i in range(num_splits)]
    left_df_split = np.array_split(left_df, num_splits)
    jobs = [
        delayed(self._produce_threaded)(
            index=i,
            left_df_full=left_df,
            left_dfs=left_df_split,
            right_df=right_df,
            join_types=join_types,
            left_col=left_col,
            right_col=right_col,
            accuracy=accuracy,
            absolute_accuracy=absolute_accuracy,
        )
        for i in range(num_splits)
    ]
    joined_data = Parallel(n_jobs=self.hyperparams["n_jobs"], backend="loky", verbose=10)(jobs)

    # joined data needs to maintain order to mimic non-split joining
    for i, d in joined_data:
        joined_split[i] = d
    joined = pd.concat(joined_split, ignore_index=True)

    # create a new dataset to hold the joined data
    resource_map = {}
    float_vector_columns = {}
    for resource_id, resource in left.items():  # type: ignore
        if resource_id == left_resource_id:
            for column in joined.columns:
                # need to avoid a bug in container.Dataset -- it doesn't like vector columns
                if type(joined[column].iloc[0]) == np.ndarray:
                    float_vector_columns[column] = joined[column]
                    joined[column] = np.nan
            resource_map[resource_id] = joined
        else:
            resource_map[resource_id] = resource

    # Generate metadata for the dataset using only the first row of each resource for
    # speed -- metadata generation runs over each cell in the dataframe, but we only care
    # about column level generation. Once that's done, set the actual dataframe value.
    result_dataset = container.Dataset(
        {k: v.head(1) for k, v in resource_map.items()}, generate_metadata=True
    )
    for k, v in resource_map.items():
        result_dataset[k] = v
        result_dataset.metadata = result_dataset.metadata.update(
            (k,), {"dimension": {"length": v.shape[0]}}
        )

    for key in float_vector_columns.keys():
        df = result_dataset[left_resource_id]
        df[key] = float_vector_columns[key]
        float_vec_loc = df.columns.get_loc(key)
        float_vec_col_indices = df.metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/FloatVector",)
        )
        if float_vec_loc not in float_vec_col_indices:
            df.metadata = df.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, float_vec_loc),
                "https://metadata.datadrivendiscovery.org/types/FloatVector",
            )

    return base.CallResult(result_dataset)
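# A self-contained sketch of the split / parallelise / reassemble pattern used
# above, with a trivial stand-in for _produce_threaded (the real join logic is
# elided; work and num_splits are illustrative names). Each worker returns its
# split index so results can be put back in order after the parallel run.
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

def work(index, frames):
    return index, frames[index].assign(doubled=frames[index]["x"] * 2)

df = pd.DataFrame({"x": range(100)})
num_splits = 4
splits = np.array_split(df, num_splits)
results = Parallel(n_jobs=2, backend="loky")(
    delayed(work)(i, splits) for i in range(num_splits))

ordered = [None] * num_splits
for i, part in results:
    ordered[i] = part
combined = pd.concat(ordered, ignore_index=True)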
jsonCall = json.load(inputFile)
inputFile.close()

# Load the problem description schema
with open(path.join(jsonCall['train_data'], 'problem_TRAIN', 'problemDoc.json'), 'r') as inputFile:
    problemSchema = json.load(inputFile)

# Load the json dataset description file
with open(path.join(jsonCall['train_data'], 'dataset_TRAIN', 'datasetDoc.json'), 'r') as inputFile:
    datasetSchema = json.load(inputFile)

# Load the dataset; Dataset.load is a classmethod, so no placeholder instance is needed
ds_uri = 'file://' + path.join(jsonCall['train_data'], 'dataset_TRAIN', 'datasetDoc.json')
ds = container.Dataset.load(ds_uri)

# Profile the dataset; produce returns a CallResult, the profiled dataset is in ds2.value
param = Hyperparams.sample()
prof = Profiler(hyperparams=param)
ds2 = prof.produce(inputs=ds)

# Get resource ids; returns ['0'] for this dataset
print(ds.metadata.get_elements(()))

# Get available columns; returns [0, 1, 2, ..., 30] for the 38_sick dataset
print(ds.metadata.get_elements(('0', ALL_ELEMENTS)))

# Metadata for column 1
column_one_metadata = ds.metadata.query(('0', ALL_ELEMENTS, 1))
def handler(arguments):
    random_state = numpy.random.RandomState(arguments.random_seed)

    resources = {}

    generate_main_resources(random_state, resources, arguments.size)

    if arguments.dataset_type == DatasetType.COUNTS_PER_USER:
        generate_learning_data_counts_per_user(random_state, resources)
    elif arguments.dataset_type == DatasetType.COMMENTS_PER_POST:
        generate_learning_data_comments_per_post(random_state, resources)
    elif arguments.dataset_type == DatasetType.HAS_USER_MADE_COMMENT_ON_POST:
        generate_learning_data_has_user_made_comment_on_post(random_state, resources)
    else:
        raise ValueError(f"Unknown dataset type: {arguments.dataset_type.name}")

    dataset = container.Dataset(resources, generate_metadata=True)

    update_metadata_main_resources(dataset, arguments.dataset_id,
                                   arguments.dataset_type.name, arguments.size,
                                   arguments.random_seed)

    if arguments.dataset_type == DatasetType.COUNTS_PER_USER:
        update_metadata_counts_per_user(dataset)
    elif arguments.dataset_type == DatasetType.COMMENTS_PER_POST:
        update_metadata_comments_per_post(dataset)
    elif arguments.dataset_type == DatasetType.HAS_USER_MADE_COMMENT_ON_POST:
        update_metadata_has_user_made_comment_on_post(dataset)
    else:
        raise ValueError(f"Unknown dataset type: {arguments.dataset_type.name}")

    dataset_output_uri = 'file://' + os.path.join(
        os.path.abspath(arguments.output_dir), arguments.dataset_id, 'datasetDoc.json')

    dataset.save(dataset_output_uri)

    os.makedirs(os.path.join(os.path.abspath(arguments.output_dir), arguments.problem_id))

    with open(os.path.join(os.path.abspath(arguments.output_dir),
                           arguments.problem_id, 'problemDoc.json'),
              'x', encoding='utf8') as problem_file:
        if arguments.dataset_type == DatasetType.COUNTS_PER_USER:
            task_keywords = ['regression', 'multivariate']
            metric = {
                'metric': 'rootMeanSquaredError',
            }
            targets = [
                {
                    'targetIndex': 0,
                    'resID': 'learningData',
                    'colIndex': 2,
                    'colName': 'posts_count',
                },
                {
                    'targetIndex': 1,
                    'resID': 'learningData',
                    'colIndex': 3,
                    'colName': 'comments_count',
                },
            ]
        elif arguments.dataset_type == DatasetType.COMMENTS_PER_POST:
            task_keywords = ['regression', 'univariate']
            metric = {
                'metric': 'rootMeanSquaredError',
            }
            targets = [
                {
                    'targetIndex': 0,
                    'resID': 'learningData',
                    'colIndex': 2,
                    'colName': 'comments_count',
                },
            ]
        elif arguments.dataset_type == DatasetType.HAS_USER_MADE_COMMENT_ON_POST:
            task_keywords = ['classification', 'binary']
            metric = {
                'metric': 'f1',
                'posLabel': 'yes',
            }
            targets = [
                {
                    'targetIndex': 0,
                    'resID': 'learningData',
                    'colIndex': 3,
                    'colName': 'made_comment',
                },
            ]

        json.dump({
            'about': {
                'problemID': arguments.problem_id,
                'problemName': f"Database problem of type {arguments.dataset_type.name}",
                'taskKeywords': task_keywords,
                'problemSchemaVersion': '4.0.0',
            },
            'inputs': {
                'data': [
                    {
                        'datasetID': arguments.dataset_id,
                        'targets': targets,
                    },
                ],
                'performanceMetrics': [
                    metric,
                ],
            },
            'expectedOutputs': {
                'predictionsFile': 'predictions.csv',
                'scoresFile': 'scores.csv',
            },
        }, problem_file, indent=2)
def produce(self, *, inputs: Inputs, timeout: float = None,
            iterations: int = None) -> base.CallResult[Outputs]:
    # If only one resource is in the dataset, we do not have anything to do.
    if inputs.metadata.query(())['dimension']['length'] == 1:
        return base.CallResult(inputs)

    main_resource_id = self.hyperparams['starting_resource']

    if main_resource_id is None:
        for resource_id in inputs.keys():
            if 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint' in inputs.metadata.query(
                    (resource_id, )).get('semantic_types', []):
                main_resource_id = resource_id
                break

    if main_resource_id is None:
        raise ValueError(
            "A Dataset with multiple resources without an entry point and no starting resource specified as a hyper-parameter."
        )

    main_data = inputs[main_resource_id]
    main_columns_length = inputs.metadata.query(
        (main_resource_id, metadata_base.ALL_ELEMENTS))['dimension']['length']

    # There is only one resource now.
    top_level_metadata = dict(inputs.metadata.query(()))
    top_level_metadata['dimension'] = dict(top_level_metadata['dimension'])
    top_level_metadata['dimension']['length'] = 1

    # !!! changed part: remove unloaded metadata to pass the check function
    metadata = inputs.metadata.clear(
        top_level_metadata, source=self).set_for_value(None, source=self)
    other_keys = [*inputs]
    other_keys.remove(main_resource_id)
    for each_key in other_keys:
        metadata = metadata.remove(selector=(each_key, ), recursive=True)
    # changed part finished
    # metadata = inputs.metadata.clear(top_level_metadata, source=self).set_for_value(None, source=self)

    # The resource is not an entry point anymore.
    entry_point_metadata = dict(inputs.metadata.query((main_resource_id, )))
    entry_point_metadata['semantic_types'] = [
        semantic_type
        for semantic_type in entry_point_metadata['semantic_types']
        if semantic_type != 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'
    ]
    metadata = metadata.update((main_resource_id, ), entry_point_metadata, source=self)

    data = None

    for column_index in range(main_columns_length):
        column_metadata = inputs.metadata.query(
            (main_resource_id, metadata_base.ALL_ELEMENTS, column_index))

        if 'foreign_key' not in column_metadata:
            # We just copy over data and metadata.
            data, metadata = self._add_column(
                main_resource_id, data, metadata,
                self._get_column(main_data, column_index), column_metadata)
        else:
            assert column_metadata['foreign_key']['type'] == 'COLUMN', column_metadata

            if 'column_index' in column_metadata['foreign_key']:
                data, metadata = self._join_by_index(
                    main_resource_id,
                    inputs,
                    column_index,
                    data,
                    metadata,
                    column_metadata['foreign_key']['resource_id'],
                    column_metadata['foreign_key']['column_index'],
                )
            elif 'column_name' in column_metadata['foreign_key']:
                data, metadata = self._join_by_name(
                    main_resource_id,
                    inputs,
                    column_index,
                    data,
                    metadata,
                    column_metadata['foreign_key']['resource_id'],
                    column_metadata['foreign_key']['column_name'],
                )
            else:
                assert False, column_metadata

    resources = {}
    resources[main_resource_id] = data

    # The number of columns has changed.
    all_rows_metadata = dict(inputs.metadata.query(
        (main_resource_id, metadata_base.ALL_ELEMENTS)))
    all_rows_metadata['dimension'] = dict(all_rows_metadata['dimension'])
    all_rows_metadata['dimension']['length'] = data.shape[1]

    metadata = metadata.update(
        (main_resource_id, metadata_base.ALL_ELEMENTS),
        all_rows_metadata,
        for_value=resources,
        source=self)

    # !!! changed part: the earlier "remove unloaded metadata" approach only works for
    # d3m v2018.6.5. In v2018.7.10 even metadata.remove checks the resources/metadata
    # relationship, so all data would have to be loaded into resources before the
    # check/remove:
    #
    #     other_keys = [*inputs]
    #     other_keys.remove(main_resource_id)
    #     for each_key in other_keys:
    #         metadata = metadata.remove(selector=(each_key,), recursive=True, source=resources)
    # changed part finished

    metadata.check(resources)

    dataset = container.Dataset(resources, metadata)

    return base.CallResult(dataset)
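# A rough pandas-only sketch of the denormalisation idea above: a column of the
# main resource that carries a foreign_key reference is resolved by joining in
# the matching columns of the referenced resource. Resource and column names
# here are made up for illustration.
import pandas as pd

main = pd.DataFrame({"d3mIndex": [0, 1, 2], "author_id": [10, 10, 11]})
authors = pd.DataFrame({"id": [10, 11], "name": ["ann", "bob"]})

denormalized = main.merge(
    authors, left_on="author_id", right_on="id", how="left").drop(columns=["id"])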
def produce(self, *, inputs: container.Dataset, timeout: float = None,
            iterations: int = None) -> base.CallResult[container.Dataset]:
    # if this is a single resource dataset we don't need to reformat it
    if len(inputs) < 2:
        return base.CallResult(inputs)

    # find the main resource if supplied, infer if not
    main_resource_id, main_resource = base_utils.get_tabular_resource(
        inputs, self.hyperparams["main_resource_id"])
    if main_resource_id is None:
        raise exceptions.InvalidArgumentValueError("no main resource specified")

    # find the csv file column resource if supplied, infer if not
    file_index = self.hyperparams["file_col_index"]
    if file_index is not None:
        if not self._is_csv_file_column(inputs.metadata, main_resource_id, file_index):
            raise exceptions.InvalidArgumentValueError(
                "column idx=" + str(file_index) + " does not contain csv file names")
    else:
        file_index = self._find_csv_file_column(inputs.metadata, main_resource_id)
        if file_index is None:
            raise exceptions.InvalidArgumentValueError(
                "no column contains csv file names")

    # generate the long form timeseries data
    base_path = self._get_base_path(inputs.metadata, main_resource_id, file_index)
    csv_paths = [
        os.path.join(base_path, local_path)
        for local_path in inputs[main_resource_id].iloc[:, file_index]
    ]
    new_dfs = [pd.read_csv(path) for path in csv_paths]
    original_dfs = [
        pd.DataFrame(
            np.tile(row, (df.shape[0], 1)),
            columns=inputs[main_resource_id].columns,
            index=df.index,
        )
        for row, df in zip(inputs[main_resource_id].values, new_dfs)
    ]
    combined_dfs = [
        original_df.join(new_df)
        for original_df, new_df in zip(original_dfs, new_dfs)
    ]
    output_data = pd.concat(combined_dfs)
    timeseries_dataframe = container.DataFrame(output_data)
    timeseries_dataframe.reset_index(drop=True, inplace=True)

    # make sure that all timeseries have the same length -- most downstream tasks will appreciate this
    if self.hyperparams["equal_length"]:
        min_length = (timeseries_dataframe.groupby(
            timeseries_dataframe.columns[file_index]).count().min().values[0])
        group_count = timeseries_dataframe.groupby(
            timeseries_dataframe.columns[file_index]).cumcount()
        timeseries_dataframe = timeseries_dataframe.assign(group_count=group_count)
        timeseries_dataframe = timeseries_dataframe[
            timeseries_dataframe["group_count"] < min_length]
        timeseries_dataframe = timeseries_dataframe.drop(["group_count"], axis=1)

    # create a dataset to hold the result
    timeseries_dataset = container.Dataset(
        {self._resource_id: timeseries_dataframe}, generate_metadata=True)
    timeseries_dataset.metadata = timeseries_dataset.metadata.update(
        (), {"id": inputs.metadata.query(())["id"]})
    timeseries_dataset.metadata = timeseries_dataset.metadata.update(
        (), {"digest": inputs.metadata.query(())["digest"]})

    # copy main resource column metadata to the timeseries dataframe
    num_main_resource_cols = inputs.metadata.query(
        (main_resource_id, metadata_base.ALL_ELEMENTS))["dimension"]["length"]
    for i in range(num_main_resource_cols):
        source = inputs.metadata.query(
            (main_resource_id, metadata_base.ALL_ELEMENTS, i))
        timeseries_dataset.metadata = timeseries_dataset.metadata.update_column(
            i, source, at=(self._resource_id, ))

    # remove the foreign key entry from the filename column if it exists
    metadata = dict(timeseries_dataset.metadata.query(
        (self._resource_id, metadata_base.ALL_ELEMENTS, file_index)))
    metadata["foreign_key"] = metadata_base.NO_VALUE
    timeseries_dataset.metadata = timeseries_dataset.metadata.update(
        (self._resource_id, metadata_base.ALL_ELEMENTS, file_index), metadata)

    # copy timeseries column metadata over if it is available (which is not necessarily true anymore)
    source = self._find_timeseries_metadata(inputs)
    i = 0
    start_idx = 0
    if source is not None:
        for col_info in source["file_columns"]:
            timeseries_dataset.metadata = timeseries_dataset.metadata.update_column(
                i + num_main_resource_cols, col_info, at=(self._resource_id, ))
            i += 1
        # flag all other columns as attributes
        start_idx = i + num_main_resource_cols
    else:
        # loop over the appended time series columns
        start_idx = original_dfs[0].shape[1]

    for i in range(start_idx, timeseries_dataframe.shape[1]):
        timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
            (self._resource_id, metadata_base.ALL_ELEMENTS, i),
            "https://metadata.datadrivendiscovery.org/types/Attribute",
        )
        struct_type = timeseries_dataset.metadata.query(
            (self._resource_id, metadata_base.ALL_ELEMENTS, i))["structural_type"]
        if struct_type == np.float64:
            timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
                (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                "http://schema.org/Float",
            )
        elif struct_type == np.int64:
            timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
                (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                "http://schema.org/Integer",
            )
        else:
            timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
                (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                "http://schema.org/Text",
            )

    # mark the filename column as a grouping key
    timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
        (self._resource_id, metadata_base.ALL_ELEMENTS, file_index),
        "https://metadata.datadrivendiscovery.org/types/GroupingKey",
    )

    # mark the d3mIndex as a primary multi-key since there are now multiple instances of each value
    primary_index_col = timeseries_dataset.metadata.list_columns_with_semantic_types(
        ("https://metadata.datadrivendiscovery.org/types/PrimaryKey", ),
        at=(self._resource_id, ),
    )
    timeseries_dataset.metadata = timeseries_dataset.metadata.remove_semantic_type(
        (self._resource_id, metadata_base.ALL_ELEMENTS, primary_index_col[0]),
        "https://metadata.datadrivendiscovery.org/types/PrimaryKey",
    )
    timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
        (self._resource_id, metadata_base.ALL_ELEMENTS, primary_index_col[0]),
        "https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey",
    )

    return base.CallResult(timeseries_dataset)
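# A standalone sketch of the equal_length truncation used above: cumcount
# numbers rows within each group, so keeping only rows numbered below the
# smallest group size truncates every series to the same length. The toy
# columns "file" and "value" are illustrative.
import pandas as pd

df = pd.DataFrame({"file": ["a", "a", "a", "b", "b"], "value": [1, 2, 3, 4, 5]})
min_length = df.groupby("file").count().min().values[0]     # 2
truncated = df[df.groupby("file").cumcount() < min_length]  # first 2 rows of each file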
def test_complex_value(self):
    self.maxDiff = None

    dataset = container.Dataset({
        '0': container.DataFrame({
            'A': [
                container.ndarray(numpy.array(['a', 'b', 'c'])),
                container.ndarray(numpy.array([1, 2, 3], dtype=numpy.int64)),
                container.ndarray(numpy.array([1.0, 2.0, 3.0])),
            ],
            'B': [
                container.List(['a', 'b', 'c']),
                container.List([1, 2, 3]),
                container.List([1.0, 2.0, 3.0]),
            ],
        }),
    }, generate_metadata=False)

    dataset_metadata = dataset.metadata.generate(dataset, compact=True)

    self.assertEqual(utils.to_json_structure(dataset_metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'schema': base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.dataset.Dataset',
            'dimension': {
                'name': 'resources',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'],
                'length': 1,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'd3m.container.pandas.DataFrame',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
            'dimension': {
                'name': 'rows',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                'length': 3,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'name': 'columns',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                'length': 2,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'length': 3,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 0],
        'metadata': {
            'structural_type': 'd3m.container.numpy.ndarray',
            'name': 'A',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 1],
        'metadata': {
            'structural_type': 'd3m.container.list.List',
            'name': 'B',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 0, 0, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.str_',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 0, 1, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'str',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 1, 0, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 1, 1, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'int',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 2, 0, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.float64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 2, 1, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'float',
        },
    }])

    dataset_metadata = dataset.metadata.generate(dataset, compact=False)

    self.assertEqual(utils.to_json_structure(dataset_metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'dimension': {
                'length': 1,
                'name': 'resources',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'],
            },
            'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json',
            'structural_type': 'd3m.container.dataset.Dataset',
        },
    }, {
        'selector': ['0'],
        'metadata': {
            'dimension': {
                'length': 3,
                'name': 'rows',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
            },
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
            'structural_type': 'd3m.container.pandas.DataFrame',
        },
    }, {
        'selector': ['0', '__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'length': 2,
                'name': 'columns',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
            },
        },
    }, {
        'selector': ['0', '__ALL_ELEMENTS__', 0],
        'metadata': {
            'name': 'A',
        },
    }, {
        'selector': ['0', '__ALL_ELEMENTS__', 1],
        'metadata': {
            'name': 'B',
        },
    }, {
        'selector': ['0', 0, 0],
        'metadata': {
            'dimension': {
                'length': 3,
            },
            'structural_type': 'd3m.container.numpy.ndarray',
        },
    }, {
        'selector': ['0', 0, 0, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.str_',
        },
    }, {
        'selector': ['0', 0, 1],
        'metadata': {
            'dimension': {
                'length': 3,
            },
            'structural_type': 'd3m.container.list.List',
        },
    }, {
        'selector': ['0', 0, 1, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'str',
        },
    }, {
        'selector': ['0', 1, 0],
        'metadata': {
            'dimension': {
                'length': 3,
            },
            'structural_type': 'd3m.container.numpy.ndarray',
        },
    }, {
        'selector': ['0', 1, 0, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['0', 1, 1],
        'metadata': {
            'dimension': {
                'length': 3,
            },
            'structural_type': 'd3m.container.list.List',
        },
    }, {
        'selector': ['0', 1, 1, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'int',
        },
    }, {
        'selector': ['0', 2, 0],
        'metadata': {
            'dimension': {
                'length': 3,
            },
            'structural_type': 'd3m.container.numpy.ndarray',
        },
    }, {
        'selector': ['0', 2, 0, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.float64',
        },
    }, {
        'selector': ['0', 2, 1],
        'metadata': {
            'dimension': {
                'length': 3,
            },
            'structural_type': 'd3m.container.list.List',
        },
    }, {
        'selector': ['0', 2, 1, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'float',
        },
    }])
def test_dataset(self):
    dataframe = container.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})
    dataframe.A = dataframe.A.astype(numpy.int64, copy=False)

    dataset = container.Dataset({'0': dataframe}, generate_metadata=False)

    compact_metadata = dataset.metadata.generate(dataset, compact=True)
    noncompact_metadata = dataset.metadata.generate(dataset, compact=False)

    self.assertEqual(utils.to_json_structure(compact_metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'schema': base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.dataset.Dataset',
            'dimension': {
                'name': 'resources',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'],
                'length': 1,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'd3m.container.pandas.DataFrame',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
            'dimension': {
                'name': 'rows',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                'length': 3,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'name': 'columns',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                'length': 2,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 0],
        'metadata': {
            'name': 'A',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 1],
        'metadata': {
            'name': 'B',
            'structural_type': 'str',
        },
    }])

    self.assertEqual(utils.to_json_structure(noncompact_metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'schema': base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.dataset.Dataset',
            'dimension': {
                'name': 'resources',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'],
                'length': 1,
            },
        },
    }, {
        'selector': ['0'],
        'metadata': {
            'structural_type': 'd3m.container.pandas.DataFrame',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
            'dimension': {
                'name': 'rows',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                'length': 3,
            },
        },
    }, {
        'selector': ['0', '__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'name': 'columns',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                'length': 2,
            },
        },
    }, {
        'selector': ['0', '__ALL_ELEMENTS__', 0],
        'metadata': {
            'name': 'A',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['0', '__ALL_ELEMENTS__', 1],
        'metadata': {
            'name': 'B',
            'structural_type': 'str',
        },
    }])
def test_all_labels(self):
    truth = container.DataFrame([
        [3, 'happy-pleased'],
        [3, 'relaxing-calm'],
        [7, 'amazed-suprised'],
        [7, 'happy-pleased'],
        [13, 'quiet-still'],
        [13, 'sad-lonely'],
    ], columns=['d3mIndex', 'class_label'])

    truth_dataset = container.Dataset({'learningData': truth}, generate_metadata=True)

    truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
    truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Target')
    truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')

    predictions = container.DataFrame([
        [3, 'happy-pleased'],
        [3, 'sad-lonely'],
        [7, 'amazed-suprised'],
        [7, 'happy-pleased'],
        [13, 'quiet-still'],
        [13, 'happy-pleased'],
    ], columns=['d3mIndex', 'class_label'], generate_metadata=True)

    hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams()
    metrics_class = hyperparams_class.configuration['metrics'].elements
    all_labels_class = hyperparams_class.configuration['all_labels'].elements
    primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
        'metrics': [metrics_class({
            'metric': 'HAMMING_LOSS',
            'pos_label': None,
            'k': None,
        })],
    }))

    scores = primitive.produce(inputs=predictions, score_dataset=truth_dataset).value

    self.assertEqual(scores.values.tolist(), [
        ['HAMMING_LOSS', 0.26666666666666666, 0.7333333333333334],
    ])

    self.assertEqual(scores.metadata.query_column(0)['name'], 'metric')
    self.assertEqual(scores.metadata.query_column(1)['name'], 'value')
    self.assertEqual(scores.metadata.query_column(2)['name'], 'normalized')

    # providing the full label set through the all_labels hyper-parameter changes the loss
    primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
        'metrics': [metrics_class({
            'metric': 'HAMMING_LOSS',
            'pos_label': None,
            'k': None,
        })],
        'all_labels': [all_labels_class({
            'column_name': 'class_label',
            'labels': ['happy-pleased', 'relaxing-calm', 'amazed-suprised', 'quiet-still', 'sad-lonely', 'foobar'],
        })],
    }))

    scores = primitive.produce(inputs=predictions, score_dataset=truth_dataset).value

    self.assertEqual(scores.values.tolist(), [
        ['HAMMING_LOSS', 0.2222222222222222, 0.7777777777777778],
    ])

    # an incomplete label set raises an error
    primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
        'metrics': [metrics_class({
            'metric': 'HAMMING_LOSS',
            'pos_label': None,
            'k': None,
        })],
        'all_labels': [all_labels_class({
            'column_name': 'class_label',
            'labels': ['happy-pleased', 'relaxing-calm', 'amazed-suprised'],
        })],
    }))

    with self.assertRaisesRegex(exceptions.InvalidArgumentValueError, 'Truth contains extra labels'):
        primitive.produce(inputs=predictions, score_dataset=truth_dataset)

    # the label set can also come from the all_distinct_values column metadata
    truth_dataset.metadata = truth_dataset.metadata.update_column(1, {
        'all_distinct_values': ['happy-pleased', 'relaxing-calm', 'amazed-suprised', 'quiet-still', 'sad-lonely', 'foobar'],
    }, at=('learningData',))

    primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
        'metrics': [metrics_class({
            'metric': 'HAMMING_LOSS',
            'pos_label': None,
            'k': None,
        })],
    }))

    scores = primitive.produce(inputs=predictions, score_dataset=truth_dataset).value

    self.assertEqual(scores.values.tolist(), [
        ['HAMMING_LOSS', 0.2222222222222222, 0.7777777777777778],
    ])

    truth_dataset.metadata = truth_dataset.metadata.update_column(1, {
        'all_distinct_values': ['happy-pleased', 'relaxing-calm', 'amazed-suprised'],
    }, at=('learningData',))

    primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({
        'metrics': [metrics_class({
            'metric': 'HAMMING_LOSS',
            'pos_label': None,
            'k': None,
        })],
    }))

    with self.assertRaisesRegex(exceptions.InvalidArgumentValueError, 'Truth contains extra labels'):
        primitive.produce(inputs=predictions, score_dataset=truth_dataset)