def _join_numeric_col(cls, left_df: container.DataFrame, left_col: str,
                      right_df: container.DataFrame, right_col: str,
                      accuracy: float) -> pd.DataFrame:
    # keep the d3mIndex from the left frame; drop the right frame's copy if present
    right_df = right_df.drop(columns='d3mIndex', errors='ignore')

    # fuzzy match each of the left join col values against the right join col values and save the
    # results as the left dataframe index
    right_df[right_col] = pd.to_numeric(right_df[right_col])
    choices = right_df[right_col].unique()
    left_df[left_col] = pd.to_numeric(left_df[left_col])
    left_df.index = left_df[left_col].map(
        lambda x: cls._numeric_fuzzy_match(x, choices, accuracy))

    # make the right col the right dataframe index
    right_df = right_df.set_index(right_col)

    # inner join on the left / right indices
    joined = container.DataFrame(
        left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

    # sort on the d3m index if there, otherwise use the joined column
    if 'd3mIndex' in joined:
        joined = joined.sort_values(by=['d3mIndex'])
    else:
        joined = joined.sort_values(by=[left_col])
    joined = joined.reset_index(drop=True)

    return joined
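# Illustrative sketch (not part of the primitive): the `_numeric_fuzzy_match` helper is not shown
# above, so this standalone example approximates the same idea with plain pandas/numpy -- map each
# left value to the closest right value within a relative tolerance, then inner-join on that index.
# `_nearest_within_tolerance` is a hypothetical stand-in, not the real helper.
import numpy as np
import pandas as pd

def _nearest_within_tolerance(value, choices, accuracy):
    # closest choice, or NaN if it falls outside the relative tolerance (assumed behaviour)
    distances = np.abs(choices - value)
    best = np.argmin(distances)
    tolerance = abs(value) * (1.0 - accuracy)
    return choices[best] if distances[best] <= tolerance else np.nan

left = pd.DataFrame({"price": [10.0, 20.2, 35.0]})
right = pd.DataFrame({"cost": [10.1, 20.0, 50.0], "label": ["a", "b", "c"]})
choices = right["cost"].unique()
left.index = left["price"].map(lambda x: _nearest_within_tolerance(x, choices, accuracy=0.95))
joined = left.join(right.set_index("cost"), how="inner")  # 10.0 and 20.2 match; 35.0 is dropped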
def _join_string_col(cls, left_df: container.DataFrame, left_col: str,
                     right_df: container.DataFrame, right_col: str,
                     accuracy: float) -> pd.DataFrame:
    # keep the d3mIndex from the left frame; drop the right frame's copy if present
    right_df = right_df.drop(columns='d3mIndex', errors='ignore')

    # pre-compute fuzzy matches
    left_keys = left_df[left_col].unique()
    right_keys = right_df[right_col].unique()
    matches: typing.Dict[str, typing.Optional[str]] = {}
    for left_key in left_keys:
        matches[left_key] = cls._string_fuzzy_match(
            left_key, right_keys, accuracy * 100)

    # look up the pre-computed fuzzy match for each element in the left column
    left_df.index = left_df[left_col].map(lambda key: matches[key])

    # make the right col the right dataframe index
    right_df = right_df.set_index(right_col)

    # inner join on the left / right indices
    joined = container.DataFrame(
        left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

    # sort on the d3m index if there, otherwise use the joined column
    if 'd3mIndex' in joined:
        joined = joined.sort_values(by=['d3mIndex'])
    else:
        joined = joined.sort_values(by=[left_col])
    joined = joined.reset_index(drop=True)

    return joined
def _join_datetime_col(cls, left_df: container.DataFrame, left_col: str,
                       right_df: container.DataFrame, right_col: str,
                       accuracy: float) -> pd.DataFrame:
    # keep the d3mIndex from the left frame; drop the right frame's copy if present
    right_df = right_df.drop(columns='d3mIndex', errors='ignore')

    # compute a tolerance delta for time matching based on a percentage of the minimum left/right
    # time range
    choices = np.array([np.datetime64(parser.parse(dt)) for dt in right_df[right_col].unique()])
    left_keys = np.array([np.datetime64(parser.parse(dt)) for dt in left_df[left_col].values])
    time_tolerance = (1.0 - accuracy) * cls._compute_time_range(left_keys, choices)

    left_df.index = np.array(
        [cls._datetime_fuzzy_match(dt, choices, time_tolerance) for dt in left_keys])

    # make the right col the right dataframe index
    right_df = right_df.set_index(right_col)

    # inner join on the left / right indices
    joined = container.DataFrame(
        left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

    # sort on the d3m index if there, otherwise use the joined column
    if 'd3mIndex' in joined:
        joined = joined.sort_values(by=['d3mIndex'])
    else:
        joined = joined.sort_values(by=[left_col])
    joined = joined.reset_index(drop=True)

    return joined
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
    if not hyperparams['use_semantic_types']:
        return data, list(data.columns), list(range(len(data.columns)))

    metadata = data.metadata

    def can_produce_column(column_index: int) -> bool:
        accepted_semantic_types = set()
        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
        column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
        semantic_types = set(column_metadata.get('semantic_types', []))
        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False
        # make sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True
        return False

    target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
        metadata,
        use_columns=hyperparams['use_outputs_columns'],
        exclude_columns=hyperparams['exclude_outputs_columns'],
        can_use_column=can_produce_column)

    targets = []
    if target_column_indices:
        targets = data.select_columns(target_column_indices)
    target_column_names = []
    for idx in target_column_indices:
        target_column_names.append(data.columns[idx])
    return targets, target_column_names, target_column_indices
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    logger.debug(f"Running {__name__}")

    # set values that only occur once to a special token
    outputs = inputs.copy()

    # determine columns to operate on
    cols = distil_utils.get_operating_columns(
        inputs, self.hyperparams["use_columns"], CATEGORICALS)

    for c in cols:
        vcs = pd.value_counts(list(inputs.iloc[:, c]))
        singletons = set(vcs[vcs == 1].index)
        if singletons:
            mask = outputs.iloc[:, c].isin(singletons)
            outputs.loc[mask, outputs.columns[c]] = SINGLETON_INDICATOR

    logger.debug(f"\n{outputs}")

    return base.CallResult(outputs)
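# Illustrative sketch (assumed toy data): shows the singleton-masking behaviour of the produce()
# above using plain pandas -- values that occur exactly once in a categorical column are replaced
# with a sentinel token ("<singleton>" here stands in for SINGLETON_INDICATOR).
import pandas as pd

df = pd.DataFrame({"colour": ["red", "red", "blue", "green"]})
vcs = df["colour"].value_counts()
singletons = set(vcs[vcs == 1].index)   # {"blue", "green"}
mask = df["colour"].isin(singletons)
df.loc[mask, "colour"] = "<singleton>"  # "red" rows untouched, "blue"/"green" collapsed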
def _update_metadata_dimension(df: container.DataFrame) -> container.DataFrame:
    old_metadata = dict(df.metadata.query(()))
    old_metadata["dimension"] = dict(old_metadata["dimension"])
    old_metadata["dimension"]["length"] = df.shape[0]
    df.metadata = df.metadata.update((), old_metadata)
    return df
def _split_aggregated(self, df: container.DataFrame,
                      split_col_names: list) -> container.DataFrame:
    lengths = [len(df.loc[0, col_name]) for col_name in split_col_names]

    for idx, col_name in enumerate(split_col_names):
        if self._sorted_pipe_ids:
            if len(self._sorted_pipe_ids) == lengths[idx]:
                extend_col_names = [
                    "{}_{}".format(col_name, i) for i in self._sorted_pipe_ids
                ]
            else:
                raise ValueError(
                    "Unique number of pipeline ids not equal to the number of aggregated values"
                )
        else:
            extend_col_names = [
                "{}_{}".format(col_name, i) for i in range(lengths[idx])
            ]

        extends = container.DataFrame(df.loc[:, col_name].values.tolist(),
                                      columns=extend_col_names)
        df = common_utils.horizontal_concat(left=df, right=extends)

        origin_metadata = dict(
            df.metadata.query((mbase.ALL_ELEMENTS, df.columns.get_loc(col_name))))
        for name in extend_col_names:
            col_idx = df.columns.get_loc(name)
            origin_metadata["name"] = name
            df.metadata = df.metadata.update((mbase.ALL_ELEMENTS, col_idx),
                                             origin_metadata)
    return df
def _produce_threaded(
    self,
    *,
    index: int,
    left_df_full: container.DataFrame,  # type: ignore
    left_dfs: typing.Sequence[container.DataFrame],  # type: ignore
    right_df: container.DataFrame,  # type: ignore
    join_types: typing.Sequence[str],
    left_col: typing.Sequence[int],
    right_col: typing.Sequence[int],
    accuracy: typing.Sequence[float],
    absolute_accuracy: typing.Sequence[bool]
) -> typing.Tuple[int, base.CallResult[Outputs]]:
    if left_dfs[index].empty:
        return (index, None)
    output = self._produce(
        left_df_full=left_df_full,
        left_df=left_dfs[index].reset_index(drop=True),
        right_df=right_df.copy(),
        join_types=join_types,
        left_col=left_col,
        right_col=right_col,
        accuracy=accuracy,
        absolute_accuracy=absolute_accuracy
    )
    return (index, output)
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    logger.debug(f"Running {__name__}")

    # determine columns to operate on
    cols = distil_utils.get_operating_columns(
        inputs, self.hyperparams["use_columns"], CATEGORICALS)
    logger.debug(f"Found {len(cols)} categorical columns to evaluate")

    if len(cols) == 0:
        return base.CallResult(inputs)

    imputer = CategoricalImputer(
        strategy=self.hyperparams["strategy"],
        fill_value=self.hyperparams["fill_value"],
        missing_values="",
        tie_breaking="first",
    )

    outputs = inputs.copy()
    failures: List[int] = []
    for c in cols:
        input_col = inputs.iloc[:, c]
        try:
            imputer.fit(input_col)
            result = imputer.transform(input_col)
            outputs.iloc[:, c] = result
        except ValueError as e:
            # a ValueError gets thrown when all data is missing
            if not self.hyperparams["error_on_empty"]:
                failures.append(c)
            else:
                raise e

    # for columns that failed using 'most_frequent', try again using 'constant'
    if not self.hyperparams["error_on_empty"]:
        imputer = CategoricalImputer(
            strategy="constant",
            fill_value=self.hyperparams["fill_value"],
            missing_values="",
            tie_breaking="first",
        )
        for f in failures:
            outputs_col = outputs.iloc[:, f]
            imputer.fit(outputs_col)
            result = imputer.transform(outputs_col)
            outputs.iloc[:, f] = result

    logger.debug(f"\n{outputs}")

    return base.CallResult(outputs)
def _generate_labels(self, inputs: container.DataFrame) -> None:
    self._labels = {}
    for col_idx, (_, col) in enumerate(inputs.iteritems()):
        # Get all the unique data in the column and assign each element an int representation.
        # We reserve 0 for unseen labels, so we increment the encodings by one.
        unique_data = col.unique()
        self._labels[col_idx] = {
            value: encoded + 1 for encoded, value in enumerate(unique_data)
        }
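# Illustrative sketch (assumed toy data): the mapping built by _generate_labels reserves 0 for
# values never seen at fit time, so each unique value gets a code starting at 1 and lookups for
# unseen values fall back to 0 (as _encode_labels further below does with .get(value, 0)).
import pandas as pd

col = pd.Series(["cat", "dog", "cat", "bird"])
labels = {value: encoded + 1 for encoded, value in enumerate(col.unique())}
# labels == {"cat": 1, "dog": 2, "bird": 3}; an unseen value maps to labels.get("fish", 0) == 0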
def _remap_graphs(cls, data: container.DataFrame) -> Tuple[container.DataFrame, int, int]:
    assert data.shape[1] == 2

    data = data.copy()
    data.columns = ("user", "item")

    uusers = np.unique(data.user)
    user_lookup = dict(zip(uusers, range(len(uusers))))
    data.user = data.user.apply(user_lookup.get)

    uitems = np.unique(data.item)
    item_lookup = dict(zip(uitems, range(len(uitems))))
    data.item = data.item.apply(item_lookup.get)

    n_users = len(uusers)
    n_items = len(uitems)

    return data, n_users, n_items
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    df = inputs.select_columns(
        inputs.metadata.list_columns_with_semantic_types(
            ("http://schema.org/Float",)
        )
    )
    df = df.to_numpy().reshape(
        df.shape[0], 2048, self.hyperparams["height"], self.hyperparams["width"]
    )

    all_img_features = []
    batch_size = self.hyperparams["batch_size"]
    spatial_a = 2.0
    spatial_b = 2.0
    for i in range(math.ceil(df.shape[0] / batch_size)):
        features = df[i * batch_size : (i + 1) * batch_size]
        spatial_weight = features.sum(axis=1, keepdims=True)
        z = (spatial_weight ** spatial_a).sum(axis=(2, 3), keepdims=True)
        z = z ** (1.0 / spatial_a)
        spatial_weight = (spatial_weight / z) ** (1.0 / spatial_b)

        _, c, w, h = features.shape
        nonzeros = (features != 0).astype(float).sum(axis=(2, 3)) / 1.0 / (w * h) + 1e-6
        channel_weight = np.log(nonzeros.sum(axis=1, keepdims=True) / nonzeros)

        features = features * spatial_weight
        features = features.sum(axis=(2, 3))
        features = features * channel_weight
        all_img_features.append(features)

    all_img_features = np.vstack(all_img_features)
    col_names = [f"feat_{i}" for i in range(0, all_img_features.shape[1])]
    feature_df = pd.DataFrame(all_img_features, columns=col_names)

    outputs = container.DataFrame(feature_df.head(1), generate_metadata=True)
    outputs.metadata = outputs.metadata.update(
        (metadata_base.ALL_ELEMENTS,),
        {"dimension": {"length": feature_df.shape[0]}},
    )
    outputs = outputs.append(feature_df.iloc[1:])

    for idx in range(outputs.shape[1]):
        outputs.metadata = outputs.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, idx), "http://schema.org/Float"
        )

    return base.CallResult(outputs)
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    logger.debug(f"Producing {__name__}")

    if len(self._cols) == 0:
        return base.CallResult(inputs)

    # add the binary encoded columns and remove the source columns
    outputs = inputs.copy()
    encoded_cols = container.DataFrame()
    encoded_cols_source = []
    bin_idx = 0
    for i, c in enumerate(self._cols):
        categorical_inputs = outputs.iloc[:, c]
        result = self._encoders[i].transform(categorical_inputs)
        for j in range(result.shape[1]):
            encoded_cols[(f"__binary_{bin_idx}")] = result[:, j]
            encoded_cols_source.append(c)
            bin_idx += 1

    encoded_cols.metadata = encoded_cols.metadata.generate(encoded_cols)

    for c in range(encoded_cols.shape[1]):
        encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Integer"
        )
        encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, c), self._attribute_semantic
        )
        col_dict = dict(
            encoded_cols.metadata.query((metadata_base.ALL_ELEMENTS, c))
        )
        col_dict["source_column"] = outputs.metadata.query(
            (metadata_base.ALL_ELEMENTS, encoded_cols_source[c])
        )["name"]
        encoded_cols.metadata = encoded_cols.metadata.update(
            (metadata_base.ALL_ELEMENTS, c), col_dict
        )

    outputs = outputs.append_columns(encoded_cols)
    outputs = outputs.remove_columns(self._cols)

    logger.debug(f"\n{outputs}")

    return base.CallResult(outputs)
def _is_unique_key(self, input_column: container.DataFrame) -> bool:
    column_values = input_column.iloc[:, 0]

    # There should be at least one row. This prevents a degenerate case
    # where we would mark a column of no rows as a unique key column.
    # (Otherwise we also get division by zero below.)
    if not len(column_values):
        return False

    # Here we look at every value as-is. Even empty strings and other missing/nan values.
    if any(input_column.duplicated()):
        return False

    return True
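# Illustrative sketch (assumed toy data): duplicated() flags every repeat of an earlier row, so a
# column qualifies as a unique key only when it is non-empty and no value repeats (empty strings
# and NaNs count as ordinary values here, matching the comment above).
import pandas as pd

pd.DataFrame({"id": [1, 2, 3]}).duplicated().any()  # False -> unique key
pd.DataFrame({"id": [1, 2, 2]}).duplicated().any()  # True  -> not a unique key
pd.DataFrame({"id": []}).duplicated().any()         # False, but the row-count guard rejects it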
def combine(self, prediction_groups: typing.Dict, inputs: container.DataFrame):
    all_results = []
    only_one = 'only_one_time_series' in prediction_groups
    for i, row in inputs.iterrows():
        # date = pd.Timestamp(et.time_indicator.get_datetime(row))
        date = self.time_indicator.get_datetime(row)
        if only_one:
            key = 'only_one_time_series'
        else:
            key = []
            for x in self.categorical_indices:
                key.append(row.iloc[x])
            key = tuple(key)
        predictions = prediction_groups[key]
        all_results.append(predictions.loc[date, 0])
    return np.array(all_results).T
def produce(self, *, inputs: Input, timeout: float = None,
            iterations: int = None) -> CallResult[Output]:
    columns_list_to_fold = self._mapping.get('foldable_columns', [])
    if len(columns_list_to_fold) == 0:
        return CallResult(inputs, True, 1)
    if inputs.shape[0] > 20000:
        return CallResult(inputs, True, 1)

    self._column_names = list(inputs) if inputs is not None else []
    df = None
    for columns_to_fold in columns_list_to_fold:
        df = self._fold_columns(inputs, columns_to_fold)

    cols_to_drop = list()
    for col_idx, col_name in enumerate(inputs.columns):
        if col_name not in df.columns:
            cols_to_drop.append(col_idx)
    inputs = utils.remove_columns(inputs, cols_to_drop)

    new_df = inputs[0:0]
    for col_name in new_df.columns:
        new_df.loc[:, col_name] = df.loc[:, col_name]

    extends = {}
    for col_name in df.columns:
        if col_name not in new_df.columns:
            extends[col_name] = df.loc[:, col_name].tolist()

    if extends:
        extends_df = d3m_DataFrame.from_dict(extends)
        extends_df.index = new_df.index.copy()
        new_df = utils.append_columns(new_df, extends_df)
        new_df = self._update_type(new_df, list(extends.keys()))

    old_metadata = dict(new_df.metadata.query(()))
    old_metadata["dimension"] = dict(old_metadata["dimension"])
    old_metadata["dimension"]["length"] = new_df.shape[0]
    new_df.metadata = new_df.metadata.update((), old_metadata)

    return CallResult(new_df, True, 1) if new_df is not None else CallResult(inputs, True, 1)
def update_type(extends, df_origin):
    extends_df = pd.DataFrame.from_dict(extends)
    extends_df = d3m_DataFrame(extends_df, generate_metadata=True)
    if extends != {}:
        extends_df.index = df_origin.index.copy()

    new_df = d3m_DataFrame.append_columns(df_origin, extends_df)

    indices = list()
    for key in extends:
        indices.append(new_df.columns.get_loc(key))

    for idx in indices:
        old_metadata = dict(new_df.metadata.query((mbase.ALL_ELEMENTS, idx)))

        numerics = pd.to_numeric(new_df.iloc[:, idx], errors='coerce')
        length = numerics.shape[0]
        nans = numerics.isnull().sum()

        if nans / length > 0.9:
            if HelperFunction.is_categorical(new_df.iloc[:, idx]):
                old_metadata['semantic_types'] = (
                    "https://metadata.datadrivendiscovery.org/types/CategoricalData",)
            else:
                old_metadata['semantic_types'] = ("http://schema.org/Text",)
        else:
            intcheck = (numerics % 1) == 0
            if np.sum(intcheck) / length > 0.9:
                old_metadata['semantic_types'] = ("http://schema.org/Integer",)
            else:
                old_metadata['semantic_types'] = ("http://schema.org/Float",)

        old_metadata['semantic_types'] += (
            "https://metadata.datadrivendiscovery.org/types/Attribute",)

        new_df.metadata = new_df.metadata.update((mbase.ALL_ELEMENTS, idx), old_metadata)

    return new_df
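# Illustrative sketch (assumed toy data): the 90% thresholds used above -- coerce to numeric, then
# decide Text/Categorical vs Integer vs Float from the NaN ratio and the share of whole numbers.
import pandas as pd

ids = pd.to_numeric(pd.Series(["1", "2", "3", "4"]), errors='coerce')
prices = pd.to_numeric(pd.Series(["1.5", "2.25", "3.0", "4.75"]), errors='coerce')
words = pd.to_numeric(pd.Series(["red", "blue", "green", "red"]), errors='coerce')

words.isnull().sum() / len(words)        # 1.0  > 0.9 -> text or categorical branch
((ids % 1) == 0).sum() / len(ids)        # 1.0  > 0.9 -> http://schema.org/Integer
((prices % 1) == 0).sum() / len(prices)  # 0.25 <= 0.9 -> http://schema.org/Float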
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    logger.debug(f"Producing {__name__}")

    if len(self._cols) == 0:
        return base.CallResult(inputs)

    # encode using the previously identified categorical columns
    input_cols = inputs.iloc[:, self._cols]
    from itertools import zip_longest
    encoded_cols = container.DataFrame()
    for i in self._cols:
        col_name = inputs.columns[i]
        col = container.DataFrame.from_records(
            zip_longest(*inputs[col_name].values)).T
        col.columns = [f"{col_name}_{x}" for x in range(len(col.columns))]
        encoded_cols = pd.concat([encoded_cols, col], axis=1)

    # append the encoding columns and generate metadata
    outputs = inputs.copy()
    encoded_cols.metadata = encoded_cols.metadata.generate(encoded_cols)

    for c in range(encoded_cols.shape[1]):
        encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Float")
    outputs = outputs.append_columns(encoded_cols)

    # drop the source columns
    outputs = outputs.remove_columns(self._cols)

    logger.debug(f"\n{outputs}")

    return base.CallResult(outputs)
def _update_type_info(self, semantic_types: Sequence[str],
                      outputs: container.DataFrame, i: int) -> container.DataFrame:
    # update the structural / df type from the semantic type
    if "http://schema.org/Integer" in semantic_types:
        outputs.metadata = outputs.metadata.update_column(
            i, {"structural_type": int})
        outputs.iloc[:, i] = pd.to_numeric(outputs.iloc[:, i])
    elif "http://schema.org/Float" in semantic_types:
        outputs.metadata = outputs.metadata.update_column(
            i, {"structural_type": float})
        outputs.iloc[:, i] = pd.to_numeric(outputs.iloc[:, i])
    elif "http://schema.org/Boolean" in semantic_types:
        outputs.metadata = outputs.metadata.update_column(
            i, {"structural_type": bool})
        outputs.iloc[:, i] = outputs.iloc[:, i].astype("bool")
    return outputs
def calculate_score(ground_truth: container.DataFrame, prediction: container.DataFrame,
                    performance_metrics: typing.List[typing.Dict],
                    task_type, regression_metric: set):
    """
    Static method used to calculate the score based on given predictions and metric types.

    Parameters
    ----------
    ground_truth: the ground truth of the target
    prediction: the predicted results for the target
    performance_metrics: the methods to use to calculate the score
    task_type: the task type of the problem
    regression_metric: the set of metrics that require regression-style (float) targets
    """
    result_metrics = []
    target_amount = 0
    if prediction is not None:
        prediction = graph_problem_conversion(task_type, prediction)

    for metric_description in performance_metrics:
        metricDesc = metric_description['metric']
        params: typing.Dict = metric_description.get('params', {})
        if params:
            metric: problem.PerformanceMetric = metricDesc.get_class()(**params)
        else:
            metric = metricDesc.get_class()
        # updated for d3m v2019.5.8: we need to instantiate the metric class first if it was not done yet
        if type(metric) is AbstractMetaclass:
            metric = metric()

        # special design for objectDetectionAP
        if metric_description["metric"] == problem.PerformanceMetric.OBJECT_DETECTION_AVERAGE_PRECISION:
            if ground_truth is not None and prediction is not None:
                # training_image_name_column = ground_truth.iloc[:,
                #                                                ground_truth.shape[1] - 2]
                # prediction.insert(loc=0, column='image_name',
                #                   value=training_image_name_column)
                ground_truth_to_send = ground_truth.iloc[:, ground_truth.shape[1] - 2: ground_truth.shape[1]]
                prediction_to_send = prediction  # .iloc[:, prediction.shape[1] - 2: prediction.shape[1]]
                if prediction_to_send['d3mIndex'].dtype.name != ground_truth_to_send['d3mIndex'].dtype.name:
                    ground_truth_to_send['d3mIndex'] = ground_truth_to_send['d3mIndex'].astype(str)
                    prediction_to_send['d3mIndex'] = prediction_to_send['d3mIndex'].astype(str)
                # truth = ground_truth_to_send.astype(str).values.tolist()
                # predictions = prediction_to_send.astype(str).values.tolist()
                value = metric.score(ground_truth_to_send, prediction_to_send)
                result_metrics.append({
                    'column_name': ground_truth.columns[-1],
                    'metric': metric_description['metric'],
                    'value': value
                })
                return result_metrics
            # END special design for objectDetectionAP

        do_regression_mode = metric_description["metric"] in regression_metric

        try:
            # generate the metrics for training results
            if ground_truth is not None and prediction is not None:
                if "d3mIndex" not in ground_truth.columns:
                    raise NotSupportedError("No d3mIndex found for ground truth!")
                else:
                    ground_truth_amount = len(ground_truth.columns) - 1

                if "d3mIndex" not in prediction.columns:
                    # for the condition where ground_truth has an index but prediction does not
                    target_amount = len(prediction.columns)
                    prediction.insert(0, 'd3mIndex', ground_truth['d3mIndex'].copy())
                else:
                    target_amount = len(prediction.columns) - 1

                if prediction['d3mIndex'].dtype.name != ground_truth['d3mIndex'].dtype.name:
                    ground_truth['d3mIndex'] = ground_truth['d3mIndex'].astype(str).copy()
                    prediction['d3mIndex'] = prediction['d3mIndex'].astype(str).copy()

                if not (ground_truth_amount == target_amount):
                    _logger.error("Ground truth's target amount and prediction's target amount do not match")
                    _logger.error('prediction columns  : ' + str(prediction.columns))
                    _logger.error('ground truth columns: ' + str(ground_truth.columns))
                    raise ValueError("Ground truth's target amount and prediction's target amount do not match")

                # from runtime import ForkedPdb
                # ForkedPdb().set_trace()

                if do_regression_mode:
                    # regression mode requires the targets to be float
                    for each_column in range(-target_amount, 0, 1):
                        prediction.iloc[:, each_column] = prediction.iloc[:, each_column].astype(float).copy()

                # update 2019.4.12: d3m v2019.4.4 has a new metric function, so we have to call it like this
                ground_truth_d3m_index_column_index = ground_truth.columns.tolist().index("d3mIndex")
                prediction_d3m_index_column_index = prediction.columns.tolist().index("d3mIndex")
                for each_column in range(-target_amount, 0, 1):
                    result_metrics.append({
                        'column_name': ground_truth.columns[each_column],
                        'metric': metric_description['metric'],
                        'value': metric.score(
                            truth=ground_truth.iloc[:, [ground_truth_d3m_index_column_index, each_column]],
                            predictions=prediction.iloc[:, [prediction_d3m_index_column_index, each_column]])
                    })
            elif ground_truth is None:
                raise NotSupportedError("Metric calculation failed because ground truth is None!")
            elif prediction is None:
                raise NotSupportedError("Metric calculation failed because prediction is None!")
        except Exception:
            traceback.print_exc()
            raise NotSupportedError('[ERROR] metric calculation failed')
    # END for loop

    if len(result_metrics) > target_amount:
        _logger.warning("[WARN] The amount of training metrics is larger than the target amount.")

    # return the training and test metrics
    return result_metrics
def _produce(
    self,
    *,
    left_df_full: container.DataFrame,  # type: ignore
    left_df: container.DataFrame,  # type: ignore
    right_df: container.DataFrame,  # type: ignore
    join_types: typing.Sequence[str],
    left_col: typing.Sequence[int],
    right_col: typing.Sequence[int],
    accuracy: typing.Sequence[float],
    absolute_accuracy: typing.Sequence[bool]
) -> base.CallResult[Outputs]:
    # cycle through the columns to join the dataframes
    right_cols_to_drop = []
    new_left_cols = []
    new_right_cols = []
    for col_index in range(len(left_col)):
        # depending on the joining type, make a new dataframe that has columns we will want to merge on;
        # keep track of which columns we will want to drop later on
        if len(self._STRING_JOIN_TYPES.intersection(join_types[col_index])) > 0:
            new_left_df = self._create_string_merge_cols(
                left_df,
                left_col[col_index],
                right_df,
                right_col[col_index],
                accuracy[col_index],
                col_index,
            )
            left_df[new_left_df.columns] = new_left_df
            right_name = "righty_string" + str(col_index)
            right_df.rename(
                columns={right_col[col_index]: right_name}, inplace=True
            )
            new_left_cols += list(new_left_df.columns)
            new_right_cols.append(right_name)
        elif len(self._NUMERIC_JOIN_TYPES.intersection(join_types[col_index])) > 0:
            new_left_df = self._create_numeric_merge_cols(
                left_df,
                left_col[col_index],
                right_df,
                right_col[col_index],
                accuracy[col_index],
                col_index,
                absolute_accuracy[col_index],
            )
            left_df[new_left_df.columns] = new_left_df
            right_name = "righty_numeric" + str(col_index)
            right_df.rename(
                columns={right_col[col_index]: right_name}, inplace=True
            )
            new_left_cols += list(new_left_df.columns)
            new_right_cols.append(right_name)
        elif len(self._GEO_JOIN_TYPES.intersection(join_types[col_index])) > 0:
            new_left_df, new_right_df = self._create_geo_vector_merging_cols(
                left_df,
                left_col[col_index],
                right_df,
                right_col[col_index],
                accuracy[col_index],
                col_index,
                absolute_accuracy[col_index],
            )
            left_df[new_left_df.columns] = new_left_df
            right_df[new_right_df.columns] = new_right_df
            new_left_cols += list(new_left_df.columns)
            new_right_cols += list(new_right_df.columns)
            right_cols_to_drop.append(right_col[col_index])
        elif len(self._VECTOR_JOIN_TYPES.intersection(join_types[col_index])) > 0:
            new_left_df, new_right_df = self._create_vector_merging_cols(
                left_df,
                left_col[col_index],
                right_df,
                right_col[col_index],
                accuracy[col_index],
                col_index,
                absolute_accuracy[col_index],
            )
            left_df[new_left_df.columns] = new_left_df
            right_df[new_right_df.columns] = new_right_df
            new_left_cols += list(new_left_df.columns)
            new_right_cols += list(new_right_df.columns)
            right_cols_to_drop.append(right_col[col_index])
        elif len(self._DATETIME_JOIN_TYPES.intersection(join_types[col_index])) > 0:
            tolerance = self._compute_datetime_tolerance(
                left_df_full,
                left_col[col_index],
                right_df,
                right_col[col_index],
                accuracy[col_index])
            new_left_df, new_right_df = self._create_datetime_merge_cols(
                left_df,
                left_col[col_index],
                right_df,
                right_col[col_index],
                tolerance,
                col_index,
            )
            left_df[new_left_df.columns] = new_left_df
            right_df[new_right_df.columns] = new_right_df
            new_left_cols += list(new_left_df.columns)
            new_right_cols += list(new_right_df.columns)
            right_cols_to_drop.append(right_col[col_index])
        else:
            raise exceptions.InvalidArgumentValueError(
                "join not supported on type " + str(join_types[col_index])
            )

    if "d3mIndex" in right_df.columns:
        right_cols_to_drop.append("d3mIndex")
    right_df.drop(columns=right_cols_to_drop, inplace=True)

    joined = pd.merge(
        left_df,
        right_df,
        how=self.hyperparams["join_type"],
        left_on=new_left_cols,
        right_on=new_right_cols,
        suffixes=["_left", "_right"],
    )

    # we don't want to keep columns that were created specifically for merging;
    # also, an inner merge keeps the right column we merge on, so we want to remove it
    joined.drop(columns=new_left_cols + new_right_cols, inplace=True)

    return joined
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    start = time.time()
    logger.debug(f"Producing {__name__}")

    cols = self._get_columns(inputs.metadata)

    # outputs = container.DataFrame(generate_metadata=False)
    outputs = [None] * inputs.shape[1]

    parsing_semantics = self.hyperparams["parsing_semantics"]

    def fromstring(x: str) -> np.ndarray:
        # if the value isn't a string, we'll just pass it through assuming it doesn't need to be parsed
        if type(x) is not str:
            return x
        return np.fromstring(x, dtype=float, sep=",")

    for col_index in range(len(inputs.columns)):
        if col_index in cols:
            column_metadata = inputs.metadata.query(
                (metadata_base.ALL_ELEMENTS, col_index)
            )
            semantic_types = column_metadata.get("semantic_types", [])
            desired_semantics = set(semantic_types).intersection(parsing_semantics)
            if desired_semantics:
                if (
                    "https://metadata.datadrivendiscovery.org/types/FloatVector"
                    in desired_semantics
                ):
                    outputs[col_index] = inputs.iloc[:, col_index].apply(
                        fromstring, convert_dtype=False
                    )
                    if outputs[col_index].shape[0] > 0:
                        inputs.metadata = inputs.metadata.update_column(
                            col_index,
                            {"structural_type": type(outputs[col_index][0])},
                        )
                elif "http://schema.org/DateTime" in desired_semantics:
                    outputs[col_index] = inputs.iloc[:, col_index].apply(
                        utils.parse_datetime_to_float,
                        fuzzy=self.hyperparams["fuzzy_time_parsing"],
                        convert_dtype=False,
                    )
                    inputs.metadata = inputs.metadata.update_column(
                        col_index, {"structural_type": float}
                    )
                elif (
                    "https://metadata.datadrivendiscovery.org/types/CategoricalData"
                    in desired_semantics
                ):
                    # if a categorical type is a numeric string, we need to convert it
                    if inputs[inputs.columns[col_index]][0].isnumeric():
                        outputs[col_index] = pd.to_numeric(
                            inputs.iloc[:, col_index],
                            errors=self.hyperparams["error_handling"],
                        )
                        if outputs[col_index].shape[0] > 0:
                            updated_type = type(outputs[col_index][0].item())
                            inputs.metadata = inputs.metadata.update_column(
                                col_index, {"structural_type": updated_type}
                            )
                    else:
                        # if it's categorical but not numerical, ensure the string stays
                        outputs[col_index] = inputs.iloc[:, col_index]
                else:
                    outputs[col_index] = pd.to_numeric(
                        inputs.iloc[:, col_index],
                        errors=self.hyperparams["error_handling"],
                    )
                    # Update the structural type to reflect the results of the to_numeric call. We can't
                    # rely on the semantic type because error coercion may result in a type becoming a
                    # float due to the presence of NaN.
                    if outputs[col_index].shape[0] > 0:
                        updated_type = type(outputs[col_index][0].item())
                        inputs.metadata = inputs.metadata.update_column(
                            col_index, {"structural_type": updated_type}
                        )
            else:
                # columns without the specified semantics still need to be concatenated
                outputs[col_index] = inputs.iloc[:, col_index]
        else:
            # columns not specified still need to be concatenated
            outputs[col_index] = inputs.iloc[:, col_index]

    outputs = container.DataFrame(pd.concat(outputs, axis=1))
    outputs.metadata = inputs.metadata

    end = time.time()
    logger.debug(f"Produce {__name__} completed in {end - start} seconds")

    return base.CallResult(outputs)
B = np.random.randn(r, n)
X_org = np.append(X_org, np.dot(A, B), axis=1)

# mask a fraction of entries randomly
X_incomplete = X_org.copy()
m, n = X_org.shape
for i in range(n):
    idx = np.random.choice(m, int(np.round(m * missing_rate)), replace=False)
    X_incomplete[idx, i] = np.nan

# recover the missing entries
# hp = hrmc_sf.Hyperparams.defaults()
hp = high_rank_imputer.Hyperparams({
    'd': 0,
    'alpha': 1,
    'beta': 1,
    'tol': 1e-4,
    'maxiter': 500
})
# if d=0, d will be estimated automatically; otherwise (d>=1), the given value of d will be used
sf = high_rank_imputer.HighRankImputer(hyperparams=hp)

df_incomplete = DataFrame(X_incomplete.T)  # the missing entries in the matrix must be marked as NaN
df_recovered = sf.produce(inputs=df_incomplete).value
X_recovered = df_recovered.values.T

# compute the recovery error (relative mean squared error, within [0,1]; the smaller the better)
RMSE = np.square(X_recovered - X_org).sum() / np.square(X_org).sum()
print("RMSE:", RMSE)
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    logger.debug(f"Producing {__name__}")

    # fallthrough if there's nothing to do
    if len(self._cols) == 0 or self._encoder is None:
        return base.CallResult(inputs)

    # map encoded cols to source column names
    feature_names = self._encoder.get_feature_names()
    encoded_cols_source = []
    # feature names are xA_YY where A is the source column index and YY is the value
    for name in feature_names:
        # take the first part of the name (xA) and remove the x
        encoded_feature_index = int(name.split("_")[0][1:])
        feature_index = self._cols[encoded_feature_index]
        encoded_cols_source.append(
            inputs.metadata.query((metadata_base.ALL_ELEMENTS, feature_index))["name"]
        )

    # encode using the previously identified categorical columns
    input_cols = inputs.iloc[:, self._cols]
    result = self._encoder.transform(input_cols)

    # append the encoding columns and generate metadata
    outputs = inputs.copy()
    encoded_cols: container.DataFrame = container.DataFrame()
    for i in range(result.shape[1]):
        encoded_cols[f"__onehot_{str(i)}"] = result[:, i]
    encoded_cols.metadata = encoded_cols.metadata.generate(encoded_cols)

    for c in range(encoded_cols.shape[1]):
        encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Float"
        )
        encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, c), self._attribute_semantic
        )
        col_dict = dict(
            encoded_cols.metadata.query((metadata_base.ALL_ELEMENTS, c))
        )
        col_dict["source_column"] = encoded_cols_source[c]
        encoded_cols.metadata = encoded_cols.metadata.update(
            (metadata_base.ALL_ELEMENTS, c), col_dict
        )

    outputs = outputs.append_columns(encoded_cols)

    # drop the source columns
    outputs = outputs.remove_columns(self._cols)

    logger.debug(f"\n{outputs}")

    return base.CallResult(outputs)
def _detect_text(cls, X: container.DataFrame, thresh: int = 8) -> bool:
    """Returns True if the median entry has at least `thresh` tokens."""
    X = X[X.notnull()]
    n_toks = X.apply(lambda xx: len(str(xx).split(" "))).values
    return np.median(n_toks) >= thresh
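# Illustrative sketch (assumed toy data): the median token count decides whether a column is
# treated as free text -- short categorical-looking values fall below the default threshold of 8.
import numpy as np
import pandas as pd

short = pd.Series(["red", "blue", "green"])
long = pd.Series(["this sentence clearly has more than eight whitespace separated tokens in it"] * 3)
np.median(short.apply(lambda s: len(str(s).split(" "))))  # 1.0  -> not text
np.median(long.apply(lambda s: len(str(s).split(" "))))   # 12.0 -> text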
def _convert_lists(dataframe: container.DataFrame) -> container.DataFrame:
    # convert column contents to a numpy array of values, similar to what extract semantic types would do
    # (write back via .at because modifying the row returned by iterrows() does not update the dataframe)
    for index in dataframe.index:
        dataframe.at[index, "bravo"] = container.ndarray(
            [int(i) for i in dataframe.at[index, "bravo"].split(",")])
    return dataframe
def _encode_labels(self, inputs: container.DataFrame) -> container.DataFrame:
    for col_idx, (_, col) in enumerate(inputs.iteritems()):
        # values unseen at fit time map to the reserved 0 label
        encodes = [self._labels[col_idx].get(value, 0) for value in col]
        inputs.iloc[:, col_idx] = encodes
    return inputs
def combine_columns(
    inputs: container.DataFrame,
    column_indices: typing.Sequence[int],
    columns_list: typing.Sequence[container.DataFrame],
    *,
    return_result: str,
    add_index_columns: bool,
) -> container.DataFrame:
    """
    Method which appends existing columns, replaces them, or creates a new result from them, based on
    the ``return_result`` argument, which can be ``append``, ``replace``, or ``new``.

    ``add_index_columns`` controls whether, when creating a new result, primary index columns should
    be added if they are not already among the columns.

    ``inputs`` is a DataFrame to which we are appending columns or in which we are replacing columns,
    or, if we are creating a new result, from which a primary index column can be taken.

    ``column_indices`` controls which columns in ``inputs`` were used to create ``columns_list``,
    and which columns should be replaced when replacing them.

    ``columns_list`` is a list of DataFrames which together represent the new columns. The reason it
    is a list is to make it easier to operate per-column when preparing ``columns_list`` and not have
    to concat them all together unnecessarily.

    Top-level metadata in ``columns_list`` is ignored, except when creating a new result.
    In that case top-level metadata from the first element in the list is used.

    When ``column_indices`` columns are being replaced with ``columns_list``, existing metadata in
    ``column_indices`` columns is not preserved but replaced with the metadata in ``columns_list``.
    Ideally, metadata for ``columns_list`` has been constructed by copying source metadata from
    ``column_indices`` columns and modifying it as necessary to adapt it to the new columns. But
    ``columns_list`` can also have completely new metadata, if this is more reasonable; it should be
    understood that in this case, when replacing ``column_indices`` columns, any custom additional
    metadata on those columns will be lost.

    ``column_indices`` and ``columns_list`` do not have to match in number of columns. Columns are
    first replaced in order, matching indices to columns. If there are then more ``column_indices``
    than ``columns_list`` columns, the additional ``column_indices`` columns are removed. If there
    are more ``columns_list`` than ``column_indices`` columns, the additional ``columns_list``
    columns are inserted after the last replaced column.

    If ``column_indices`` is empty, then the replacing behavior is equivalent to appending.
    """

    if return_result == 'append':
        outputs = inputs
        for columns in columns_list:
            outputs = outputs.append_columns(columns)

    elif return_result == 'replace':
        if not column_indices:
            return combine_columns(inputs, column_indices, columns_list,
                                   return_result='append',
                                   add_index_columns=add_index_columns)

        # compute the difference in columns
        to_be_added = list(
            numpy.setdiff1d(numpy.arange(len(inputs.columns)), column_indices))
        columns_replaced = 0
        if len(to_be_added) < len(column_indices):
            # more efficient to concatenate than to replace one-by-one
            outputs = pandas.concat(columns_list, axis=1)
            outputs = container.DataFrame(data=outputs, generate_metadata=False)
            indices = range(columns_list[0].shape[1])
            outputs.metadata = inputs.metadata.select_columns(columns=list(indices))

            c = 0
            for columns in columns_list:
                columns_length = columns.shape[1]
                if c == 0:
                    outputs.metadata = outputs.metadata.replace_columns(
                        columns.metadata, list(indices))
                else:
                    outputs.metadata = outputs.metadata.append_columns(columns.metadata)
                c += 1

            for col in to_be_added:
                insert_index = col.item()
                if insert_index > outputs.shape[1]:
                    insert_index = outputs.shape[1]
                outputs = outputs.insert_columns(
                    inputs.select_columns([col.item()]), insert_index)
            outputs.metadata = outputs.metadata.compact(['structural_type'])
        else:
            # We copy here and disable copying inside "replace_columns" to copy only once.
            # We have to copy because "replace_columns" is modifying data in-place.
            outputs = copy.copy(inputs)

            for columns in columns_list:
                columns_length = columns.shape[1]
                if columns_replaced < len(column_indices):
                    # It is OK if the slice of "column_indices" is shorter than "columns". Only those
                    # columns listed in the slice will be replaced and the others appended after the
                    # last replaced column.
                    outputs = outputs.replace_columns(
                        columns,
                        column_indices[columns_replaced:columns_replaced + columns_length],
                        copy=False)
                else:
                    # We insert the rest of the columns after the last column we replaced. We know that
                    # "column_indices" is non-empty and that the last item of "column_indices" points to
                    # the last column we replaced for those listed in "column_indices". We replaced more
                    # columns though, so we have to add the difference, and then add 1 to insert after
                    # the last column.
                    outputs = outputs.insert_columns(
                        columns,
                        column_indices[-1] + (columns_replaced - len(column_indices)) + 1)
                columns_replaced += columns_length

            if columns_replaced < len(column_indices):
                outputs = outputs.remove_columns(
                    column_indices[columns_replaced:len(column_indices)])

    elif return_result == 'new':
        if not any(columns.shape[1] for columns in columns_list):
            raise ValueError("No columns produced.")

        outputs = columns_list[0]
        for columns in columns_list[1:]:
            outputs = outputs.append_columns(columns)

        if add_index_columns:
            inputs_index_columns = inputs.metadata.get_index_columns()
            outputs_index_columns = outputs.metadata.get_index_columns()

            if inputs_index_columns and not outputs_index_columns:
                # Add index columns at the beginning.
                outputs = inputs.select_columns(inputs_index_columns).append_columns(
                    outputs, use_right_metadata=True)

    else:
        raise exceptions.InvalidArgumentValueError(
            "\"return_result\" has an invalid value: {return_result}".format(
                return_result=return_result))

    return outputs
from d3m.container import DataFrame
from pyglrm_d3m.huber_pca import HuberPCA

A = DataFrame([[1, 2, 3, 4], [2, 4, 6, 8], [4, 5, 6, 7]])

model = HuberPCA(hyperparams={'k': 2})  # create a class for Huber PCA
model.set_training_data(inputs=A)
model.fit()

# get parameter
parameter = model.get_params()
print("Initial parameter (Y): {}".format(parameter['Y'].values))

# modify parameter
print("Now we change the (0,0) entry of Y to 0, and set the modified Y as the parameter of the Huber PCA class.")
parameter['Y'].values[0, 0] = 0
model.set_params(params={'Y': parameter['Y']})

# check if the parameter has been modified
parameter = model.get_params()
print("Modified parameter (Y): {}".format(parameter['Y'].values))
from pyglrm_d3m.huber_pca import HuberPCA
from d3m.container import DataFrame

A = DataFrame([[1, 2, 3, 4], [2, 4, 6, 8], [4, 5, 6, 7]], generate_metadata=True)

model = HuberPCA(hyperparams={'k': 2})  # create a class for Huber PCA
model.set_training_data(inputs=A)
model.fit()

a_new = DataFrame([[6, 7, 8, 9]])  # initialize a new row to be tested
x = model.produce(inputs=a_new).value.values  # get the latent representation of a_new
print(x)