def split_mnist(X, y):
    """ prepare mnist data as d3m dataframe, fit linear SVC as baseline """
    # train/test split
    n_class = len(set(y))
    n_samples_per_class = 2
    idxs = np.arange(X.shape[0])
    X_train, X_test, y_train, y_test, _, idx_test = train_test_split(
        X, y, idxs, train_size=n_class * n_samples_per_class, stratify=y)
    y[idx_test] = ""

    # Linear SVC
    global svc
    svc = LinearSVC().fit(X_train, y_train)
    svc_preds = svc.predict(X_test)
    svc_acc = (y_test == svc_preds).mean()

    features_df = pd.DataFrame(X)
    labels_df = pd.DataFrame({"target": y})
    features_df = d3m_DataFrame(features_df)
    labels_df = d3m_DataFrame(labels_df)
    return features_df, labels_df, svc_acc, idx_test, y_test
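# A minimal usage sketch for split_mnist, assuming scikit-learn's fetch_openml
# is available; the loader and its arguments are illustrative, not part of this
# module. Labels are cast to object dtype so the "" blanking above round-trips.
def _example_split_mnist():
    from sklearn.datasets import fetch_openml

    X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
    features_df, labels_df, svc_acc, idx_test, y_test = split_mnist(X, y.astype(object))
    print(f'baseline LinearSVC accuracy: {svc_acc:.3f}')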
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    produce_data, learning_df, nodes_df, edges_df = self._parse_inputs(
        inputs, return_all=True)

    if self.fitted:
        result = self._sdne._Y
    else:
        dim = self.hyperparams['dimension']
        alpha = self.hyperparams['alpha']
        beta = self.hyperparams['beta']
        self._sdne = sdne.SDNE(d=dim, alpha=alpha, beta=beta)
        produce_data = networkx.from_scipy_sparse_matrix(produce_data)
        self._sdne.learn_embedding(graph=produce_data)
        self._model = self._sdne._model
        result = self._sdne._Y

    target_types = [
        'https://metadata.datadrivendiscovery.org/types/TrueTarget',
        'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
    ]
    if self.hyperparams['return_list']:
        result_np = container.ndarray(result, generate_metadata=True)
        return_list = d3m_List([result_np, inputs[1], inputs[2]],
                               generate_metadata=True)
        return CallResult(return_list, True, 1)
    else:
        learn_df = d3m_DataFrame(learning_df, generate_metadata=True)
        learn_df = get_columns_not_of_type(learn_df, target_types)
        learn_df = learn_df.remove_columns([learn_df.columns.get_loc('nodeID')])

        result_df = d3m_DataFrame(result, generate_metadata=True)
        result_df = result_df.loc[result_df.index.isin(
            learning_df['d3mIndex'].values)]

        for column_index in range(result_df.shape[1]):
            col_dict = dict(result_df.metadata.query((ALL_ELEMENTS, column_index)))
            col_dict['structural_type'] = float
            # should just be the column index, continuing after learn_df's columns
            col_dict['name'] = str(learn_df.shape[1] + column_index)
            col_dict['semantic_types'] = (
                'http://schema.org/Float',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
            result_df.metadata = result_df.metadata.update(
                (ALL_ELEMENTS, column_index), col_dict)

        result_df.index = learn_df.index.copy()
        output = utils.append_columns(learn_df, result_df)
        return CallResult(output, True, 1)
def load_inputs():
    fnames = sorted(glob('/test_data/bigearth-100-single/*/*.tif'))
    imnames = sorted(list(set(['_'.join(f.split('_')[:-1]) for f in fnames])))
    imgs = [load_patch(img_path).astype(np.float32) for img_path in imnames]
    imgs_df = pd.DataFrame({'image_col': imgs, 'dummy_idx': range(len(imgs))})
    y = [i.split('/')[3] for i in imnames]
    tgts_df = pd.DataFrame({'target': y})
    return d3m_DataFrame(imgs_df), d3m_DataFrame(tgts_df)
def load_inputs(): fnames = sorted(glob("/test_data/bigearth-100-single-2c/*/*.tif")) imnames = sorted(list(set(["_".join(f.split("_")[:-1]) for f in fnames]))) imgs = [load_patch(img_path).astype(np.float32) for img_path in imnames] imgs_df = pd.DataFrame({"image_col": imgs, "dummy_idx": range(len(imgs))}) y = [i.split("/")[3] for i in imnames] tgts_df = pd.DataFrame({"target": y}) return (d3m_DataFrame(imgs_df), d3m_DataFrame(tgts_df))
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    if not self._fitted:
        return CallResult(inputs, True, 1)

    # NB: transforms the cached training data, not `inputs`
    temp = pd.DataFrame(self._training_data.iloc[:, self._s_cols].apply(
        lambda x: self._d[x.name].transform(x)))
    outputs = self._training_data.copy()
    for id_index, od_index in zip(self._s_cols, range(temp.shape[1])):
        outputs.iloc[:, id_index] = temp.iloc[:, od_index]

    lookup = {
        "int": ('http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
    }
    for index in self._s_cols:
        old_metadata = dict(outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
        old_metadata["semantic_types"] = lookup["int"]
        old_metadata["structural_type"] = int
        outputs.metadata = outputs.metadata.update(
            (mbase.ALL_ELEMENTS, index), old_metadata)

    if outputs.shape == inputs.shape:
        return CallResult(d3m_DataFrame(outputs), True, 1)
    return CallResult(inputs, True, 1)
def test_new_moons():
    X, y = load_moons(labeled_sample=0)
    features_df = pd.DataFrame(X)
    features_df = d3m_DataFrame(features_df)

    accs = {}
    for algorithm in ["PseudoLabel", "VAT", "ICT"]:
        tss = TabularSemiSupervisedPrimitive(
            hyperparams=tss_hp(
                tss_hp.defaults(),
                algorithm=algorithm,
                weights_filepath=f"{algorithm}.pth",
            ),
            random_seed=5,
        )
        tss.set_params(params=tss_params[algorithm])
        preds = tss.produce(inputs=features_df).value
        acc = (y == preds["target"].astype(float)).mean()
        print(f"{algorithm}: {acc}")
        accs[algorithm] = acc

    assert accs["VAT"] > accs["PseudoLabel"]
    assert accs["VAT"] > accs["ICT"]
    assert accs["PseudoLabel"] > accs["ICT"]
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    if not self._fitted:
        return CallResult(inputs, True, 1)

    temp = pd.DataFrame(self._model.transform(inputs.iloc[:, self._s_cols]))
    outputs = self._training_data.copy()
    for id_index, od_index in zip(self._s_cols, range(temp.shape[1])):
        outputs.iloc[:, id_index] = temp.iloc[:, od_index]

    new_dtype = temp.dtypes
    lookup = {
        "float": ('http://schema.org/Float',
                  'https://metadata.datadrivendiscovery.org/types/Attribute'),
        "int": ('http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
    }
    for d, index in zip(new_dtype, self._s_cols):
        old_metadata = dict(outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
        # covers float16/32/64/128 without enumerating each dtype
        if np.issubdtype(d, np.floating):
            old_metadata["semantic_types"] = lookup["float"]
            old_metadata["structural_type"] = float
        else:
            old_metadata["semantic_types"] = lookup["int"]
            old_metadata["structural_type"] = int
        outputs.metadata = outputs.metadata.update(
            (mbase.ALL_ELEMENTS, index), old_metadata)

    if outputs.shape == inputs.shape:
        return CallResult(d3m_DataFrame(outputs), True, 1)
    return CallResult(inputs, True, 1)
def iterative_labeling(features, labels, seed_idx=2, n_rounds=5):
    # initial query image
    y = (labels == labels[seed_idx]).astype(int)
    annotations = np.zeros(features.shape[0]) - 1
    annotations[seed_idx] = 1
    n_pos, n_neg = 1, 0

    for i in range(n_rounds):
        print(f'round {i}')

        # generate ranking by similarity
        sampler = ImageRetrievalPrimitive(
            hyperparams=ir_hp(ir_hp.defaults(), reduce_dimension=256))
        sampler.set_training_data(
            inputs=features,
            outputs=d3m_DataFrame(pd.DataFrame({'annotations': annotations})))
        sampler.fit()
        ranking_df = sampler.produce(inputs=features).value
        assert ranking_df.shape[0] == features.shape[0] - i - 1

        exc_labeled = ranking_df['index'].values
        inc_labeled = np.concatenate((sampler.pos_idxs, exc_labeled))

        # simulate human labeling
        next_idx = exc_labeled[0]
        next_label = y[next_idx]
        annotations[next_idx] = next_label
        if next_label == 1:
            n_pos += 1
        else:
            n_neg += 1

        # evaluate ranking against ground truth
        results = {
            'round': i + 1,
            'next_idx': int(next_idx),
            'next_label': next_label,
            'n_pos': n_pos,
            'n_neg': n_neg,
            # precision, including labeled
            'a_p': [float(y[inc_labeled[:k]].mean()) for k in 2**np.arange(11)],
            # precision, excluding labeled
            'u_p': [float(y[exc_labeled[:k]].mean()) for k in 2**np.arange(11)],
            # recall, including labeled
            'r_p': [float(y[inc_labeled[:k]].sum() / y.sum())
                    for k in 2**np.arange(11)],
        }
        print()
        print(results)
def load_frame(compress_data=False):
    img_paths = [
        os.path.join(dataset_path, filename)
        for filename in os.listdir(dataset_path)
    ]
    imgs = [load_patch(img_path).astype(np.float32) for img_path in img_paths]

    if compress_data:
        compressed_imgs = []
        for img in imgs:
            # header: dtype char, len(img), and img.shape[1] recorded twice
            output_bytes = bytearray(struct.pack(
                'cIII',
                bytes(img.dtype.char.encode()),
                len(img),
                img.shape[1],
                img.shape[1]
            ))
            output_bytes.extend(img.tobytes())
            compressed_bytes = lzo.compress(bytes(output_bytes))
            compressed_img = np.frombuffer(
                compressed_bytes, dtype='uint8', count=len(compressed_bytes))
            compressed_imgs.append(compressed_img)
        imgs = compressed_imgs

    df = pd.DataFrame({'dummy_idx': range(len(imgs)), 'image_col': imgs})
    return d3m_DataFrame(df)
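# A hedged sketch of inverting load_frame's packing: `decompress_patch` is a
# hypothetical helper, assuming python-lzo's default framed compress/decompress
# and the 'cIII' header written above (dtype char, len(img), img.shape[1] twice).
import struct

import lzo
import numpy as np


def decompress_patch(compressed_img: np.ndarray) -> np.ndarray:
    raw = lzo.decompress(compressed_img.tobytes())
    header_size = struct.calcsize('cIII')
    dtype_char, n0, n1, _ = struct.unpack('cIII', raw[:header_size])
    flat = np.frombuffer(raw[header_size:], dtype=np.dtype(dtype_char.decode()))
    # the header repeats img.shape[1], so any trailing channel dimension has to
    # be inferred from the buffer size
    if flat.size == n0 * n1:
        return flat.reshape(n0, n1)
    return flat.reshape(n0, n1, -1)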
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    preds = self.model.produce(inputs.values)
    # fall back to generic column names if fit never set them
    # (assumes 2-D predictions)
    if not hasattr(self, '_output_columns'):
        self._output_columns = ['output'] * preds.shape[1]
    output = d3m_DataFrame(preds,
                           columns=self._output_columns,
                           source=self,
                           generate_metadata=True)

    self._training_indices = [
        c for c in inputs.columns
        if isinstance(c, str) and 'index' in c.lower()
    ]
    outputs = common_utils.combine_columns(
        return_result='new',
        add_index_columns=True,
        inputs=inputs,
        columns_list=[output],
        source=self,
        column_indices=self._training_indices)
    return CallResult(outputs, True, 1)
def produce_metafeatures(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Produce primitive's best guess for the structural type of each input column.

    Parameters
    ----------
    inputs : D3M Dataset object

    Returns
    -------
    Outputs
        A dataframe with two columns, each with one row per column of the
        input frame: the first holds each column's multi-label semantic type
        classifications (a list of strings), the second the corresponding
        prediction probabilities (a list of floats).
    """
    out_df = self._produce_annotations(inputs=inputs)

    # add metadata to output data frame
    simon_df = d3m_DataFrame(out_df)

    # first column ('semantic types')
    col_dict = dict(simon_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
    col_dict['structural_type'] = str
    col_dict['name'] = 'semantic types'
    col_dict['semantic_types'] = (
        'http://schema.org/Text',
        'https://metadata.datadrivendiscovery.org/types/Attribute')
    simon_df.metadata = simon_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 0), col_dict)

    # second column ('probabilities')
    col_dict = dict(simon_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
    col_dict['structural_type'] = str
    col_dict['name'] = 'probabilities'
    col_dict['semantic_types'] = (
        'http://schema.org/Text',
        'https://metadata.datadrivendiscovery.org/types/Attribute')
    simon_df.metadata = simon_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 1), col_dict)

    return CallResult(simon_df)
def produce_metafeatures(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Parameters
    ----------
    inputs: D3M dataframe, NOTE: the target column MUST be the last column

    Returns
    ----------
    Outputs: D3M dataframe with an ordered list of the original features in
    the first column
    """
    # add metadata to output dataframe
    rff_df = d3m_DataFrame(
        RFFeatures().rank_features(
            inputs=inputs.iloc[:, :-1],
            targets=pandas.DataFrame(inputs.iloc[:, -1])),
        columns=["features"],
    )
    # first column ('features')
    col_dict = dict(rff_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
    col_dict["structural_type"] = str
    col_dict["name"] = "features"
    col_dict["semantic_types"] = (
        "http://schema.org/Text",
        "https://metadata.datadrivendiscovery.org/types/Attribute",
    )
    rff_df.metadata = rff_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 0), col_dict)

    return CallResult(rff_df)
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
    '''
    Sets primitive's training data

    Parameters
    ----------
    inputs: numpy ndarray of size (number_of_time_series, time_series_length,
        dimension) containing training time series
    outputs: numpy ndarray of size (number_of_time_series,) containing classes
        of training time series
    '''
    if not self.hyperparams['long_format']:
        inputs = TimeSeriesFormatterPrimitive(
            hyperparams=self._hp).produce(inputs=inputs).value['0']
    else:
        hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query()[
            'primitive_code']['class_type_arguments']['Hyperparams']
        ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(
            hyperparams=hyperparams_class.defaults().replace(
                {"dataframe_resource": "learningData"}))
        inputs = d3m_DataFrame(ds2df_client.produce(inputs=inputs).value)

    # load and reshape training data
    # 'series_id' and 'value' should be set by metadata
    n_ts = len(inputs.d3mIndex.unique())
    ts_sz = int(inputs.shape[0] / n_ts)
    self._X_train = np.array(inputs.value).reshape(n_ts, ts_sz, 1)
    self._y_train = np.array(inputs.label.iloc[::ts_sz]).reshape(-1,)
def produce_metafeatures(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Parameters
    -------
    inputs : Input pandas frame

    Returns
    -------
    Outputs : pandas frame with a list of the original features in the first
        column, ordered by their contribution to the first principal
        component, and scores in the second column.
    """
    # add metadata to output data frame
    pca_df = d3m_DataFrame(PCAFeatures().rank_features(inputs=inputs))

    # first column ('features')
    col_dict = dict(pca_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
    col_dict['structural_type'] = str
    col_dict['name'] = 'features'
    col_dict['semantic_types'] = (
        'http://schema.org/Integer',
        'https://metadata.datadrivendiscovery.org/types/Attribute')
    pca_df.metadata = pca_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 0), col_dict)

    # second column ('scores')
    col_dict = dict(pca_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
    col_dict['structural_type'] = str
    col_dict['name'] = 'scores'
    col_dict['semantic_types'] = (
        'http://schema.org/Float',
        'https://metadata.datadrivendiscovery.org/types/Attribute')
    pca_df.metadata = pca_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 1), col_dict)

    return CallResult(pca_df)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Produce predictions using fit adversarial debiasing algorithm

    Parameters
    ----------
    inputs : D3M dataframe

    Returns
    ----------
    Outputs : D3M dataframe -> predictions from fit debiasing algorithm
    """
    # transform test dataframe to IBM AIF360 compliant dataset
    inputs[self.label_names] = self.train_dataset.convert_to_dataframe()[0][
        self.label_names].values[:inputs.shape[0]].astype(int)
    test_dataset = datasets.BinaryLabelDataset(
        df=inputs[self.attribute_names + self.label_names],
        label_names=self.label_names,
        protected_attribute_names=self.protected_attributes,
        favorable_label=self.hyperparams['favorable_label'],
        unfavorable_label=self.unfavorable_label)

    transformed_dataset = self.clf.predict(test_dataset)

    # transform IBM dataset back to D3M dataset
    df = transformed_dataset.convert_to_dataframe()[0][self.label_names].astype(int)
    df = d3m_DataFrame(pandas.concat(
        [inputs[self.idx].reset_index(drop=True), df.reset_index(drop=True)],
        axis=1))
    df.metadata = df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 0), inputs.metadata.query_column(0))
    df.metadata = df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 1), inputs.metadata.query_column(1))
    return CallResult(df)
def produce_metafeatures(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Perform supervised recursive feature elimination using random forests to
    generate an ordered list of features

    Parameters
    ----------
    inputs : Input pandas frame, NOTE: the target column MUST be the last column

    Returns
    -------
    Outputs : pandas frame with an ordered list of the original features in
        the first column
    """
    # add metadata to output dataframe
    rff_df = d3m_DataFrame(
        RFFeatures().rank_features(
            inputs=inputs.iloc[:, :-1],
            targets=pandas.DataFrame(inputs.iloc[:, -1])),
        columns=['features'])

    # first column ('features')
    col_dict = dict(rff_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
    col_dict['structural_type'] = str
    col_dict['name'] = 'features'
    col_dict['semantic_types'] = (
        'http://schema.org/Text',
        'https://metadata.datadrivendiscovery.org/types/Attribute')
    rff_df.metadata = rff_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 0), col_dict)

    return CallResult(rff_df)
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    self.columns = list(inputs)
    X_ = inputs[self.columns].values
    self.max_iter = iterations if iterations is not None else 10000

    if not self.fitted:
        raise ValueError('Please fit before calling produce')

    self.latent_factors = self.model.transform(X_)

    out_df = d3m_DataFrame(inputs, generate_metadata=True)
    corex_df = d3m_DataFrame(self.latent_factors, generate_metadata=True)
    for column_index in range(corex_df.shape[1]):
        col_dict = dict(corex_df.metadata.query((ALL_ELEMENTS, column_index)))
        col_dict['structural_type'] = float
        # FIXME: assumes corex is applied only once per template,
        # otherwise column names might duplicate
        col_dict['name'] = str(out_df.shape[1] + column_index)
        col_dict['semantic_types'] = (
            'http://schema.org/Float',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        corex_df.metadata = corex_df.metadata.update(
            (ALL_ELEMENTS, column_index), col_dict)

    corex_df.index = out_df.index.copy()
    out_df = utils.append_columns(out_df, corex_df)
    return CallResult(out_df, True, self.max_iter)
def produce_metafeatures(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Produce primitive's best guess for the structural type of each input column.

    Arguments:
        inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

    Keyword Arguments:
        timeout {float} -- timeout, not considered (default: {None})
        iterations {int} -- iterations, not considered (default: {None})

    Raises:
        PrimitiveNotFittedError: if primitive not fit

    Returns:
        CallResult[Outputs] -- dataframe with two columns: "semantic type
            classifications" and "probabilities". Each row represents a column
            in the original dataframe. The column "semantic type
            classifications" contains a list of all semantic type labels and
            the column "probabilities" contains a list of the model's
            confidence in assigning each respective semantic type label
    """
    if not self._is_fit:
        raise PrimitiveNotFittedError("Primitive not fitted.")

    out_df = self._produce_annotations(inputs=inputs)

    # add metadata to output data frame
    simon_df = d3m_DataFrame(out_df)

    # first column ('semantic types')
    col_dict = dict(simon_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
    col_dict["structural_type"] = typing.List[str]
    col_dict["name"] = "semantic types"
    col_dict["semantic_types"] = (
        "http://schema.org/Text",
        "https://metadata.datadrivendiscovery.org/types/Attribute",
    )
    simon_df.metadata = simon_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 0), col_dict)

    # second column ('probabilities')
    col_dict = dict(simon_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
    col_dict["structural_type"] = typing.List[float]
    col_dict["name"] = "probabilities"
    col_dict["semantic_types"] = (
        "http://schema.org/Text",
        "https://metadata.datadrivendiscovery.org/types/Attribute",
    )
    simon_df.metadata = simon_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 1), col_dict)

    return CallResult(simon_df, has_finished=self._is_fit)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    return ranking of unlabeled instances based on similarity to positively
    and negatively labeled instances

    Ex.
        d3mIndex    score
        1130        0.586983
        11          0.469862
        1077        0.394225
        1125        0.355335
        21          0.353363

    Arguments:
        inputs {Inputs} -- ignores these `inputs`, uses `inputs` from `set_training_data()`

    Keyword Arguments:
        timeout {float} -- timeout, not considered (default: {None})
        iterations {int} -- iterations, not considered (default: {None})
    """
    pos_scores = np.row_stack(self.pos_scores)
    pos_scores = gem(pos_scores, p=self.hyperparams['gem_p'])

    if len(self.neg_scores) >= self.hyperparams['denominator_min']:
        print('rank by negative')
        neg_scores = np.row_stack(self.neg_scores)
        neg_scores = gem(neg_scores, p=self.hyperparams['gem_p'])
        scores = pos_scores / (neg_scores + 1e-12)
    else:
        print('rank by positive')
        scores = pos_scores

    mis_scores = scores[self.mis_idxs]
    mis_ranks = self.mis_idxs[np.argsort(-mis_scores)]
    mis_ranks = self.d3m_idxs[mis_ranks]

    ranking_df = pd.DataFrame({
        self.idx_name: mis_ranks,
        'score': np.flip(np.sort(mis_scores)),
    })
    ranking_df = d3m_DataFrame(ranking_df, generate_metadata=True)
    ranking_df.metadata = ranking_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0), 'http://schema.org/Integer')
    ranking_df.metadata = ranking_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
    ranking_df.metadata = ranking_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 1), 'http://schema.org/Float')
    return CallResult(ranking_df)
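# A hedged sketch of the `gem` pooling used above: a generalized (power) mean
# over exemplar rows, so p=1 recovers the arithmetic mean and large `gem_p`
# approaches max-pooling. Illustrative only; not necessarily the imported gem.
import numpy as np


def gem(scores: np.ndarray, p: float = 1.0) -> np.ndarray:
    # scores: (n_exemplars, n_items) similarity matrix, assumed non-negative
    return np.power(np.power(scores, p).mean(axis=0), 1.0 / p)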
def _prepare_d3m_df(self, all_preds):
    """ prepare d3m dataframe with appropriate metadata """
    all_preds = [preds.tolist() for preds in all_preds]
    preds_df = pd.DataFrame({f"{self._positive_class}_mask": all_preds})
    preds_df = d3m_DataFrame(preds_df, generate_metadata=False)

    preds_df.metadata = preds_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        "https://metadata.datadrivendiscovery.org/types/FloatVector",
    )
    return preds_df
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Parameters
    ----------
    inputs : dataframe

    Returns
    ----------
    Outputs
        The output is a transformed dataframe of X fit into an embedded space;
        the number of feature columns will equal the n_components hyperparameter.
        For timeseries datasets the output is the dimensions concatenated to the
        timeseries filename dataframe.
    """
    targets = inputs.metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/TrueTarget')
    if not len(targets):
        targets = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
    target_names = [list(inputs)[t] for t in targets]
    index = inputs.metadata.get_columns_with_semantic_type(
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
    index_names = [list(inputs)[i] for i in index]

    X_test = inputs.drop(columns=list(inputs)[index[0]])
    X_test = X_test.drop(columns=target_names).values

    # special semi-supervised case - during training, only produce rows with labels
    series = inputs[target_names] != ''
    if series.any().any():
        inputs = dataframe_utils.select_rows(inputs, np.flatnonzero(series))
        X_test = X_test[np.flatnonzero(series)]

    sc_df = d3m_DataFrame(
        pandas.DataFrame(self.sc.fit_predict(X_test), columns=['cluster_labels']))

    # add metadata for the new 'cluster_labels' column
    col_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
    col_dict['structural_type'] = int
    if self.hyperparams['task_type'] == 'classification':
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        col_dict['name'] = 'cluster_labels'
    else:
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PredictedTarget')
        col_dict['name'] = target_names[0]
    sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, 0), col_dict)

    df_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS,)))
    df_dict_1 = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS,)))
    df_dict['dimension'] = df_dict_1
    df_dict_1['name'] = 'columns'
    df_dict_1['semantic_types'] = (
        'https://metadata.datadrivendiscovery.org/types/TabularColumn',)
    df_dict_1['length'] = 1
    sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS,), df_dict)

    return CallResult(utils_cp.append_columns(inputs, sc_df))
def test_moons(labeled_sample=10):
    X_l, y_l, X_u, y_u = load_moons(labeled_sample)
    X = np.vstack((X_l, X_u)).astype(str)
    y = np.concatenate((y_l, y_u)).astype(str)
    y[labeled_sample:] = ""

    features_df = pd.DataFrame(X)
    labels_df = pd.DataFrame({"target": y})
    features_df = d3m_DataFrame(features_df)
    labels_df = d3m_DataFrame(labels_df)

    global tss_params
    tss_params = {}
    accs = {}
    for algorithm in ["PseudoLabel", "VAT", "ICT"]:
        tss = TabularSemiSupervisedPrimitive(
            hyperparams=tss_hp(
                tss_hp.defaults(),
                epochs=50,
                algorithm=algorithm,
                weights_filepath=f"{algorithm}.pth",
            ),
            random_seed=5,
        )
        tss.set_training_data(inputs=features_df, outputs=labels_df)
        tss.fit()
        tss_params[algorithm] = tss.get_params()
        preds = tss.produce(inputs=features_df).value
        acc = (y_u == preds["target"][labeled_sample:].astype(float)).mean()
        print(f"{algorithm}: {acc}")
        accs[algorithm] = acc

    assert accs["VAT"] > accs["PseudoLabel"]
    assert accs["VAT"] > accs["ICT"]
    assert accs["PseudoLabel"] > accs["ICT"]
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    if not self._fitted:
        return CallResult(inputs, self._has_finished, self._iterations_done)

    assert isinstance(self._model, dict), \
        "self._model type must be dict not defaultdict!"

    temp = pd.DataFrame(inputs.iloc[:, self._s_cols].apply(
        lambda x: self._model[x.name].transform(x) if x.name in self._model else None))
    outputs = inputs.copy()
    for id_index, od_index in zip(self._s_cols, range(temp.shape[1])):
        outputs.iloc[:, id_index] = temp.iloc[:, od_index]

    lookup = {
        "int": ('http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
    }
    for index in self._s_cols:
        old_metadata = dict(outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
        old_metadata["semantic_types"] = lookup["int"]
        old_metadata["structural_type"] = int
        outputs.metadata = outputs.metadata.update(
            (mbase.ALL_ELEMENTS, index), old_metadata)

    # remove the columns that appeared in produce method but were not in fitted data
    drop_names = set(outputs.columns[self._s_cols]).difference(set(self._model.keys()))
    drop_indices = sorted(map(lambda a: outputs.columns.get_loc(a), drop_names))
    outputs = common_utils.remove_columns(outputs, drop_indices,
                                          source='ISI DSBox Data Labler')

    # sanity check and report the results
    if outputs.shape[0] == inputs.shape[0] and \
            outputs.shape[1] == inputs.shape[1] - len(drop_names):
        self._has_finished = True
        self._iterations_done = True
        return CallResult(d3m_DataFrame(outputs), self._has_finished,
                          self._iterations_done)
    return CallResult(inputs, self._has_finished, self._iterations_done)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Produce primitive's prediction for future time series data

    Parameters
    ----------
    inputs : D3M dataframe containing the d3m index of the test set

    Returns
    ----------
    Outputs
        The output is a data frame containing the d3m index and a forecast for
        each of the 'n_periods' future time periods
    """
    # add metadata to output
    # just take d3m index from input test set
    output_df = inputs['d3mIndex']

    # produce future forecast using arima
    future_forecast = pandas.DataFrame(
        self._sloth.PredictSeriesARIMA(self._arima, self.hyperparams['n_periods']))
    output_df = pandas.concat([output_df, future_forecast], axis=1)
    parrot_df = d3m_DataFrame(output_df)

    # first column ('d3mIndex')
    col_dict = dict(parrot_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
    col_dict['structural_type'] = str
    col_dict['name'] = 'd3mIndex'
    col_dict['semantic_types'] = (
        'http://schema.org/Integer',
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
    )
    parrot_df.metadata = parrot_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 0), col_dict)

    # second column ('predictions')
    col_dict = dict(parrot_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
    col_dict['structural_type'] = str
    col_dict['name'] = list(inputs)[self.hyperparams['index']]
    col_dict['semantic_types'] = (
        'http://schema.org/Integer',
        'https://metadata.datadrivendiscovery.org/types/Attribute',
    )
    parrot_df.metadata = parrot_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 1), col_dict)

    return CallResult(parrot_df)
def load_big_earthnet():
    fnames = sorted(glob('/test_data/bigearth-100-single/*/*.tif'))
    imnames = sorted(list(set(['_'.join(f.split('_')[:-1]) for f in fnames])))
    imgs = [load_patch(img_path).astype(np.float32) for img_path in imnames]
    imgs_df = pd.DataFrame({'image_col': imgs, 'index': range(len(imgs))})
    imgs_df = d3m_DataFrame(imgs_df)
    imgs_df.metadata = imgs_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 1),
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
    y = [i.split('/')[3] for i in imnames]
    return imgs_df, np.array(y)
def produce_explanations(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Produce explanation masks for primitive's predictions

    Arguments:
        inputs {Inputs} -- D3M dataframe containing attributes

    Keyword Arguments:
        timeout {float} -- timeout, not considered (default: {None})
        iterations {int} -- iterations, not considered (default: {None})
    """
    clf_model, test_loader = self._prepare_test_inputs(inputs)
    if self.hyperparams['explain_all_classes']:
        all_class_masks = [[] for _ in range(self._nclasses)]
    else:
        all_class_masks = [[]]

    all_outputs = []
    for test_inputs in tqdm(test_loader):
        test_inputs = test_inputs[0].to(self._device)
        test_inputs.requires_grad = True
        test_outputs = clf_model(test_inputs)
        all_outputs.append(test_outputs)
        one_hots = self._get_one_hots(test_outputs)
        for i, one_hot in enumerate(one_hots):
            masks = self._get_masks(clf_model, test_inputs, test_outputs, one_hot)
            masks = self._resize_masks(masks, self.hyperparams['image_dim'])
            all_class_masks[i].append(masks)

    all_class_masks = [
        list(np.concatenate(masks).tolist()) for masks in all_class_masks
    ]
    self._all_outputs = torch.cat(all_outputs)

    explain_df = pd.DataFrame()
    for i, masks in enumerate(all_class_masks):
        explain_df[f'class_{i}'] = masks
    if not self.hyperparams['explain_all_classes']:
        explain_df.columns = ['class_argmax']

    explain_df = d3m_DataFrame(explain_df, generate_metadata=False)
    return CallResult(explain_df)
def load_csv_data(data) -> d3m_Dataset:
    """
    Function used to load a general csv file
    :param data: a str path or a pd.DataFrame
    :return: a d3m style Dataset
    """
    logger.debug("Trying to load csv data with first 100 characters as:")
    logger.debug(str(data[:100]))
    if isinstance(data, str):
        data = pd.read_csv(data, dtype=str)
    elif isinstance(data, pd.DataFrame):
        data = data.astype(str)
    else:
        raise ValueError("Unknown input type.")

    # transform pd.DataFrame to d3m.Dataset
    d3m_df = d3m_DataFrame(data, generate_metadata=False)
    resources = {AUGMENT_RESOURCE_ID: d3m_df}
    return_ds = d3m_Dataset(resources=resources, generate_metadata=False)
    return_ds.metadata = return_ds.metadata.clear(
        source="", for_value=return_ds, generate_metadata=True)

    for i, each_column in enumerate(return_ds[AUGMENT_RESOURCE_ID]):
        metadata_selector = (AUGMENT_RESOURCE_ID, ALL_ELEMENTS, i)
        metadata_each_column = {
            "name": each_column,
            "structural_type": str,
            "semantic_types": (
                'https://metadata.datadrivendiscovery.org/types/Attribute',
                "http://schema.org/Text")
        }
        return_ds.metadata = return_ds.metadata.update(
            metadata=metadata_each_column, selector=metadata_selector)

    metadata_all_level = {
        "id": "datamart_search_" + str(hash(data.values.tobytes())),
        "version": "2.0",
        "name": "user given input from datamart userend",
        "location_uris": ('file:///tmp/datasetDoc.json',),
        "digest": "",
        "description": "",
        "source": {'license': 'Other'},
    }
    return_ds.metadata = return_ds.metadata.update(
        metadata=metadata_all_level, selector=())
    logger.debug("Loading csv and transforming to d3m dataset format succeeded!")
    return return_ds
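# A minimal usage sketch for load_csv_data; 'example.csv' is a placeholder path.
def _example_load_csv_data():
    ds = load_csv_data('example.csv')
    # the returned Dataset wraps a single resource keyed by AUGMENT_RESOURCE_ID
    print(ds.metadata.query(()))
    print(ds[AUGMENT_RESOURCE_ID].head())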
def load_nwpu(data_dir: str = '/NWPU-RESISC45', n_imgs=200):
    paths = sorted(glob(os.path.join(data_dir, '*/*')))
    paths = [os.path.abspath(p) for p in paths]
    imgs = [Image.open(p) for p in paths[:n_imgs]]
    labels = [os.path.basename(os.path.dirname(p)) for p in paths[:n_imgs]]

    transform = tv.transforms.Compose([
        tv.transforms.ToTensor(),
        tv.transforms.Normalize(
            mean=(0.3680, 0.3810, 0.3436),
            std=(0.2034, 0.1854, 0.1848),
        )
    ])
    imgs = [transform(img) for img in imgs]
    imgs = d3m_DataFrame(pd.DataFrame({'imgs': imgs}))
    labels = np.array(labels)
    return imgs, labels
def _prepare_d3m_df(self, Z_smoothed, n_class):
    """ prepare d3m dataframe with appropriate metadata """
    if self.test_dataset:
        Z_smoothed = Z_smoothed[len(self.idx_train):]

    if self.hyperparams["all_scores"]:
        index = np.repeat(range(len(Z_smoothed)), n_class)
        labels = np.tile(range(n_class), len(Z_smoothed))
        scores = Z_smoothed.flatten()
    else:
        index = None
        labels = np.argmax(Z_smoothed, -1)
        scores = Z_smoothed[range(len(labels)), labels]

    labels = self.label_encoder.inverse_transform(labels)

    preds_df = d3m_DataFrame(
        pd.DataFrame(
            {
                self.output_column: labels,
                "confidence": scores
            },
            index=index,
        ),
        generate_metadata=True,
    )

    preds_df.metadata = preds_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
    )
    preds_df.metadata = preds_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 1),
        "https://metadata.datadrivendiscovery.org/types/Score",
    )
    preds_df.metadata = preds_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 1),
        "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
    )
    preds_df.metadata = preds_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 1), "http://schema.org/Float")

    return preds_df
def make_annotations_dataset(self, n_rows, round_num=0, num_bands=12):
    if self.annotations is None:
        annotationsDoc = {
            "dataResources": [
                {
                    "resID": "annotationsData",
                    "resPath": "/scratch_dir/annotationsData.csv",
                    "resType": "table",
                    "resFormat": {"text/csv": ["csv"]},
                    "columns": [
                        {
                            "colIndex": 0,
                            "colName": "annotations",
                            "colType": "integer",
                            "role": ["attribute"],
                        }
                    ],
                }
            ]
        }
        with open("/scratch_dir/annotationsDoc.json", "w") as json_file:
            json.dump(annotationsDoc, json_file)

    if round_num == 0:
        annotations = np.zeros(n_rows) - 1
        annotations[0] = 1
        annotations = pd.DataFrame(
            {"d3mIndex": np.arange(n_rows), "annotations": annotations.astype(int)}
        )
    else:
        annotations = pd.read_csv("/scratch_dir/annotationsData.csv")
        ranking = pd.read_csv("/scratch_dir/rankings.csv")
        test_index = pd.read_csv(
            f"/datasets/seed_datasets_current/{self.dataset}/TEST/dataset_TEST/tables/learningData.csv"
        )["d3mIndex"].values
        top_idx = np.where(test_index == ranking.iloc[0, 0])[0][0] // num_bands
        human_annotation = np.random.randint(2)
        annotations.iloc[top_idx, 1] = human_annotation

    annotations.to_csv("/scratch_dir/annotationsData.csv", index=False)
    return d3m_DataFrame(annotations)