Example #1
def split_mnist(X, y):
    """ prepare mnist data as d3m dataframe, fit linear SVC as baseline"""

    # train/test split
    n_class = len(set(y))
    n_samples_per_class = 2

    idxs = np.arange(X.shape[0])
    X_train, X_test, y_train, y_test, _, idx_test = train_test_split(
        X, y, idxs, train_size=n_class * n_samples_per_class, stratify=y)
    y[idx_test] = ""  # blank the test labels to simulate the semi-supervised setting

    # Linear SVC
    global svc
    svc = LinearSVC().fit(X_train, y_train)
    svc_preds = svc.predict(X_test)
    svc_acc = (y_test == svc_preds).mean()

    features_df = pd.DataFrame(X)
    labels_df = pd.DataFrame({"target": y})

    features_df = d3m_DataFrame(features_df)
    labels_df = d3m_DataFrame(labels_df)

    return features_df, labels_df, svc_acc, idx_test, y_test
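
# A usage sketch, assuming the module-level imports used by split_mnist above
# (np, pd, train_test_split, LinearSVC, d3m_DataFrame) are in scope. The digits
# set stands in for MNIST; labels are cast to strings so the unlabeled test rows
# can be blanked with "".
from sklearn.datasets import load_digits

X, y = load_digits(return_X_y=True)
features_df, labels_df, svc_acc, idx_test, y_test = split_mnist(X, y.astype(str))
print(f"linear SVC baseline accuracy: {svc_acc:.3f}")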
Example #2
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        produce_data, learning_df, nodes_df, edges_df = self._parse_inputs(
            inputs, return_all=True)
        if self.fitted:
            result = self._sdne._Y
        else:
            dim = self.hyperparams['dimension']
            alpha = self.hyperparams['alpha']
            beta = self.hyperparams['beta']
            # `args` is assumed to hold the remaining SDNE keyword arguments, built elsewhere in the class
            self._sdne = sdne.SDNE(d=dim, alpha=alpha, beta=beta, **args)

            produce_data = networkx.from_scipy_sparse_matrix(produce_data)
            self._sdne.learn_embedding(graph=produce_data)
            self._model = self._sdne._model
            result = self._sdne._Y

        target_types = [
            'https://metadata.datadrivendiscovery.org/types/TrueTarget',
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
        ]
        if self.hyperparams['return_list']:
            result_np = container.ndarray(result, generate_metadata=True)
            return_list = d3m_List([result_np, inputs[1], inputs[2]],
                                   generate_metadata=True)
            return CallResult(return_list, True, 1)
        else:
            learn_df = d3m_DataFrame(learning_df, generate_metadata=True)
            learn_df = get_columns_not_of_type(learn_df, target_types)

            learn_df = learn_df.remove_columns(
                [learn_df.columns.get_loc('nodeID')])
            #learn_df = learn_df.drop('nodeID', axis = 'columns')

            result_df = d3m_DataFrame(result, generate_metadata=True)
            result_df = result_df.loc[result_df.index.isin(
                learning_df['d3mIndex'].values)]

            for column_index in range(result_df.shape[1]):
                col_dict = dict(
                    result_df.metadata.query((ALL_ELEMENTS, column_index)))
                col_dict['structural_type'] = type(1.0)
                col_dict['name'] = str(learn_df.shape[1] + column_index)
                col_dict['semantic_types'] = (
                    'http://schema.org/Float',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')

                result_df.metadata = result_df.metadata.update(
                    (ALL_ELEMENTS, column_index), col_dict)
            result_df.index = learn_df.index.copy()

            output = utils.append_columns(learn_df, result_df)
            #output.set_index('d3mIndex', inplace=True)
            return CallResult(output, True, 1)
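
# The per-column metadata loop above recurs in several of these examples. A minimal
# standalone sketch of the pattern, assuming only the d3m container package and a
# toy two-column embedding result:
import pandas as pd
from d3m import container
from d3m.metadata.base import ALL_ELEMENTS

result_df = container.DataFrame(
    pd.DataFrame({"0": [0.1, 0.2], "1": [0.3, 0.4]}), generate_metadata=True)

for column_index in range(result_df.shape[1]):
    # query the existing column metadata, overwrite the relevant keys, write it back
    col_dict = dict(result_df.metadata.query((ALL_ELEMENTS, column_index)))
    col_dict['structural_type'] = type(1.0)
    col_dict['name'] = str(column_index)
    col_dict['semantic_types'] = (
        'http://schema.org/Float',
        'https://metadata.datadrivendiscovery.org/types/Attribute')
    result_df.metadata = result_df.metadata.update(
        (ALL_ELEMENTS, column_index), col_dict)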
Example #3
def load_inputs():
    fnames = sorted(glob('/test_data/bigearth-100-single/*/*.tif'))
    imnames = sorted(list(set(['_'.join(f.split('_')[:-1]) for f in fnames])))
    imgs = [load_patch(img_path).astype(np.float32) for img_path in imnames]
    imgs_df = pd.DataFrame({'image_col': imgs, 'dummy_idx': range(len(imgs))})

    y = [i.split('/')[3] for i in imnames]
    tgts_df = pd.DataFrame({'target': y})

    return (d3m_DataFrame(imgs_df), d3m_DataFrame(tgts_df))
Example #4
def load_inputs():
    fnames = sorted(glob("/test_data/bigearth-100-single-2c/*/*.tif"))
    imnames = sorted(list(set(["_".join(f.split("_")[:-1]) for f in fnames])))
    imgs = [load_patch(img_path).astype(np.float32) for img_path in imnames]
    imgs_df = pd.DataFrame({"image_col": imgs, "dummy_idx": range(len(imgs))})

    y = [i.split("/")[3] for i in imnames]
    tgts_df = pd.DataFrame({"target": y})

    return (d3m_DataFrame(imgs_df), d3m_DataFrame(tgts_df))
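
# load_patch is not defined in these snippets; a hypothetical stand-in (band-suffix
# naming and equal band resolutions assumed) that stacks all per-band GeoTIFFs
# sharing a prefix into a single array:
from glob import glob

import numpy as np
from tifffile import imread  # any GeoTIFF reader would do here


def load_patch(img_prefix):
    """Hypothetical helper: stack 'prefix_*.tif' band files into one (bands, H, W) array."""
    band_paths = sorted(glob(f"{img_prefix}_*.tif"))
    return np.stack([imread(p) for p in band_paths])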
Example #5
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        if not self._fitted:
            return CallResult(inputs, True, 1)
        temp = pd.DataFrame(self._training_data.iloc[:, self._s_cols].apply(
            lambda x: self._d[x.name].transform(x)))
        outputs = self._training_data.copy()

        for id_index, od_index in zip(self._s_cols, range(temp.shape[1])):
            outputs.iloc[:, id_index] = temp.iloc[:, od_index]

        lookup = {
            "int": ('http://schema.org/Integer',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')
        }

        #new_dtype = temp.dtypes

        for index in self._s_cols:
            old_metadata = dict(
                outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
            old_metadata["semantic_types"] = lookup["int"]
            old_metadata["structural_type"] = type(10)
            outputs.metadata = outputs.metadata.update(
                (mbase.ALL_ELEMENTS, index), old_metadata)

        if outputs.shape == inputs.shape:
            print("output:", outputs.head(5))
            return CallResult(d3m_DataFrame(outputs), True, 1)
        else:
            return CallResult(inputs, True, 1)
Example #6
def test_new_moons():

    X, y = load_moons(labeled_sample=0)

    features_df = pd.DataFrame(X)
    features_df = d3m_DataFrame(features_df)

    accs = {}
    for algorithm in ["PseudoLabel", "VAT", "ICT"]:
        tss = TabularSemiSupervisedPrimitive(
            hyperparams=tss_hp(
                tss_hp.defaults(),
                algorithm=algorithm,
                weights_filepath=f"{algorithm}.pth",
            ),
            random_seed=5,
        )
        tss.set_params(params=tss_params[algorithm])

        preds = tss.produce(inputs=features_df).value
        acc = (y == preds["target"].astype(float)).mean()
        print(f"{algorithm}: {acc}")
        accs[algorithm] = acc

    assert accs["VAT"] > accs["PseudoLabel"]
    assert accs["VAT"] > accs["ICT"]
    assert accs["PseudoLabel"] > accs["ICT"]
Example #7
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        if not self._fitted:
            return CallResult(inputs, True, 1)
        temp = pd.DataFrame(self._model.transform(inputs.iloc[:, self._s_cols]))
        outputs = self._training_data.copy()
        for id_index, od_index in zip(self._s_cols, range(temp.shape[1])):
            outputs.iloc[:, id_index] = temp.iloc[:, od_index]

        new_dtype = temp.dtypes
        lookup = {"float": ('http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'),
                  "int": ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute')}

        for d, index in zip(new_dtype, self._s_cols):
            print("old metadata : ", outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
            old_metadata = dict(outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
            if d in (np.dtype(np.float16), np.dtype(np.float32),
                     np.dtype(np.float64), np.dtype(np.float128)):
                old_metadata["semantic_types"] = lookup["float"]
                old_metadata["structural_type"] = type(10.0)
            else:
                old_metadata["semantic_types"] = lookup["int"]
                old_metadata["structural_type"] = type(10)
            outputs.metadata = outputs.metadata.update((mbase.ALL_ELEMENTS, index),old_metadata)
            print("updated dict : ",old_metadata)
            print("check again : ", outputs.metadata.query((mbase.ALL_ELEMENTS, index)))

        if outputs.shape == inputs.shape:
            return CallResult(d3m_DataFrame(outputs), True, 1)
        else:
            return CallResult(inputs, True, 1)
Example #8
def iterative_labeling(features, labels, seed_idx=2, n_rounds=5):

    # initial query image
    y = (labels == labels[seed_idx]).astype(int)
    annotations = np.zeros(features.shape[0]) - 1
    annotations[seed_idx] = 1

    n_pos, n_neg = 1, 0
    for i in range(n_rounds):

        print(f'round {i}')

        # generate ranking by similarity
        sampler = ImageRetrievalPrimitive(
            hyperparams=ir_hp(ir_hp.defaults(), reduce_dimension=256))
        sampler.set_training_data(
            inputs=features,
            outputs=d3m_DataFrame(pd.DataFrame({'annotations': annotations})))
        sampler.fit()
        ranking_df = sampler.produce(inputs=features).value
        assert ranking_df.shape[0] == features.shape[0] - i - 1

        exc_labeled = ranking_df['index'].values
        inc_labeled = np.concatenate((sampler.pos_idxs, exc_labeled))

        # simulate human labeling
        next_idx = exc_labeled[0]
        next_label = y[next_idx]
        annotations[next_idx] = next_label

        if next_label == 1:
            n_pos += 1
        else:
            n_neg += 1

        # evaluate ranking against ground truth
        results = {
            'round': i + 1,
            'next_idx': int(next_idx),
            'next_label': next_label,
            'n_pos': n_pos,
            'n_neg': n_neg,
            # precision at k, including already-labeled instances
            'a_p': [float(y[inc_labeled[:k]].mean()) for k in 2**np.arange(11)],
            # precision at k, excluding already-labeled instances
            'u_p': [float(y[exc_labeled[:k]].mean()) for k in 2**np.arange(11)],
            # recall at k, including already-labeled instances
            'r_p': [float(y[inc_labeled[:k]].sum() / y.sum()) for k in 2**np.arange(11)],
        }
        print()
        print(results)
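
# A small worked illustration (toy data) of the precision@k / recall@k lists computed
# in iterative_labeling above: y is the binary ground truth and ranking is an array of
# indices ordered from most to least similar to the query.
import numpy as np

y = np.array([1, 0, 1, 1, 0, 0, 1, 0])
ranking = np.array([2, 0, 6, 4, 3, 1, 7, 5])

for k in 2 ** np.arange(3):  # k = 1, 2, 4
    topk = ranking[:k]
    precision_at_k = y[topk].mean()        # fraction of the top k that are positives
    recall_at_k = y[topk].sum() / y.sum()  # fraction of all positives found in the top k
    print(k, precision_at_k, recall_at_k)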
Example #9
def load_frame(compress_data=False):
    img_paths = [
        os.path.join(dataset_path, filename) 
        for filename in os.listdir(dataset_path)
    ]
    imgs = [
        load_patch(img_path).astype(np.float32) 
        for img_path in img_paths
    ]
    if compress_data:
        compressed_imgs = []
        for img in imgs:
            # header: dtype character followed by the three array dimensions
            output_bytes = bytearray(struct.pack(
                'cIII',
                bytes(img.dtype.char.encode()),
                img.shape[0],
                img.shape[1],
                img.shape[2],
            ))
            output_bytes.extend(img.tobytes())
            compressed_bytes = lzo.compress(bytes(output_bytes))
            compressed_img = np.frombuffer(
                compressed_bytes, 
                dtype='uint8', 
                count=len(compressed_bytes)
            )
            compressed_imgs.append(compressed_img)
        imgs = compressed_imgs

    df = pd.DataFrame({'dummy_idx': range(len(imgs)), 'image_col': imgs})
    return d3m_DataFrame(df)
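
# A minimal sketch of the inverse operation, assuming the 'cIII' header layout written
# by load_frame above (dtype character followed by the three array dimensions):
import struct

import lzo
import numpy as np


def decompress_img(compressed_img: np.ndarray) -> np.ndarray:
    raw = lzo.decompress(bytes(compressed_img))
    header_size = struct.calcsize('cIII')
    dtype_char, dim0, dim1, dim2 = struct.unpack('cIII', raw[:header_size])
    return np.frombuffer(
        raw[header_size:], dtype=np.dtype(dtype_char.decode())
    ).reshape(dim0, dim1, dim2)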
Example #10
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        preds = self.model.produce(inputs.values)
        if not hasattr(self, '_output_columns'):
            # fall back to one generic column name per prediction column (preds assumed 2-d)
            self._output_columns = ['output'] * len(preds[0])

        output = d3m_DataFrame(preds,
                               columns=self._output_columns,
                               source=self,
                               generate_metadata=True)
        output.metadata = inputs.metadata.clear(source=self,
                                                for_value=output,
                                                generate_metadata=True)
        #output.metadata = self._add_target_semantic_types(metadata=output.metadata, target_names=self._output_columns, source=self)

        self._training_indices = [
            c for c in inputs.columns
            if isinstance(c, str) and 'index' in c.lower()
        ]
        outputs = common_utils.combine_columns(
            return_result='new',  #self.hyperparams['return_result'],
            add_index_columns=True,  #self.hyperparams['add_index_columns'],
            inputs=inputs,
            columns_list=[output],
            source=self,
            column_indices=self._training_indices)
        return CallResult(outputs, True, 1)
Example #11
    def produce_metafeatures(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Produce primitive's best guess for the structural type of each input column.

        Parameters
        ----------
        inputs : D3M Dataset object

        Returns
        -------
        Outputs
            The output is two lists of lists, each with length equal to the number of columns in the input pandas frame.
            Each entry of the first list is a list of strings giving that column's multi-label semantic type classification.
            Each entry of the second list is a list of floats giving the corresponding prediction probabilities.
        """

        out_df = self._produce_annotations(inputs = inputs)

        # add metadata to output data frame
        simon_df = d3m_DataFrame(out_df)
        # first column ('semantic types')
        col_dict = dict(simon_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type("this is text")
        col_dict['name'] = 'semantic types'
        col_dict['semantic_types'] = ('http://schema.org/Text', 'https://metadata.datadrivendiscovery.org/types/Attribute')
        simon_df.metadata = simon_df.metadata.update((metadata_base.ALL_ELEMENTS, 0), col_dict)
        # second column ('probabilities')
        col_dict = dict(simon_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict['structural_type'] = type("this is text")
        col_dict['name'] = 'probabilities'
        col_dict['semantic_types'] = ('http://schema.org/Text', 'https://metadata.datadrivendiscovery.org/types/Attribute')
        simon_df.metadata = simon_df.metadata.update((metadata_base.ALL_ELEMENTS, 1), col_dict)
        
        return CallResult(simon_df)
Example #12
    def produce_metafeatures(self,
                             *,
                             inputs: Inputs,
                             timeout: float = None,
                             iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs: D3M dataframe, NOTE: Target column MUST be the last column

        Returns
        ----------
        Outputs: D3M dataframe with ordered list of original features in first column
        """
        # add metadata to output dataframe
        rff_df = d3m_DataFrame(
            RFFeatures().rank_features(inputs=inputs.iloc[:, :-1],
                                       targets=pandas.DataFrame(
                                           inputs.iloc[:, -1])),
            columns=["features"],
        )
        # first column ('features')
        col_dict = dict(rff_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict["structural_type"] = type("it is a string")
        col_dict["name"] = "features"
        col_dict["semantic_types"] = (
            "http://schema.org/Text",
            "https://metadata.datadrivendiscovery.org/types/Attribute",
        )
        rff_df.metadata = rff_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)

        return CallResult(rff_df)
Example #13
    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        '''
        Sets primitive's training data

        Parameters
        ----------
        inputs: numpy ndarray of size (number_of_time_series, time_series_length, dimension) containing training time series

        outputs: numpy ndarray of size (number_time_series,) containing classes of training time series
        '''
        if not self.hyperparams['long_format']:
            inputs = TimeSeriesFormatterPrimitive(
                hyperparams=self._hp).produce(inputs=inputs).value['0']
        else:
            hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query(
            )['primitive_code']['class_type_arguments']['Hyperparams']
            ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(
                hyperparams=hyperparams_class.defaults().replace(
                    {"dataframe_resource": "learningData"}))
            inputs = d3m_DataFrame(ds2df_client.produce(inputs=inputs).value)

        # load and reshape training data
        # 'series_id' and 'value' should be set by metadata
        n_ts = len(inputs.d3mIndex.unique())
        ts_sz = int(inputs.shape[0] / n_ts)
        self._X_train = np.array(inputs.value).reshape(n_ts, ts_sz, 1)
        self._y_train = np.array(inputs.label.iloc[::ts_sz]).reshape(-1)

    def produce_metafeatures(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        -------
        inputs : Input pandas frame

        Returns
        -------
        Outputs : pandas frame with list of original features in first column, ordered
            by their contribution to the first principal component, and scores in
            the second column.
        """

        # add metadata to output data frame
        pca_df = d3m_DataFrame(PCAFeatures().rank_features(inputs = inputs))
        # first column ('features')
        col_dict = dict(pca_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = 'features'
        col_dict['semantic_types'] = ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute')
        pca_df.metadata = pca_df.metadata.update((metadata_base.ALL_ELEMENTS, 0), col_dict)
        # second column ('scores')
        col_dict = dict(pca_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict['structural_type'] = type("1.0")
        col_dict['name'] = 'scores'
        col_dict['semantic_types'] = ('http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute')
        pca_df.metadata = pca_df.metadata.update((metadata_base.ALL_ELEMENTS, 1), col_dict)

        return CallResult(pca_df)
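
# PCAFeatures is not shown here; a rough, hypothetical stand-in for the ranking the
# docstring above describes (columns ordered by the magnitude of their loading on the
# first principal component), using scikit-learn directly:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA


def rank_by_first_component(df: pd.DataFrame) -> pd.DataFrame:
    loadings = PCA(n_components=1).fit(df.values).components_[0]
    order = np.argsort(-np.abs(loadings))
    return pd.DataFrame({
        'features': df.columns[order],
        'scores': np.abs(loadings)[order],
    })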
Example #15
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Produce predictions using the fitted adversarial debiasing algorithm

        Parameters
        ----------
        inputs : D3M dataframe

        Returns
        ----------
        Outputs : D3M dataframe of predictions from the fitted debiasing algorithm
        """
        # transform the test dataframe into an IBM AIF360-compliant dataset
        inputs[self.label_names] = self.train_dataset.convert_to_dataframe()[0][self.label_names].values[:inputs.shape[0]].astype(int)
        test_dataset = datasets.BinaryLabelDataset(df = inputs[self.attribute_names + self.label_names],
                                                label_names = self.label_names,
                                                protected_attribute_names = self.protected_attributes,
                                                favorable_label=self.hyperparams['favorable_label'],
                                                unfavorable_label=self.unfavorable_label)

        transformed_dataset = self.clf.predict(test_dataset)
        
        # transform IBM dataset back to D3M dataset
        df = transformed_dataset.convert_to_dataframe()[0][self.label_names].astype(int)
        df = d3m_DataFrame(pandas.concat([inputs[self.idx].reset_index(drop=True), df.reset_index(drop=True)], axis = 1))
        df.metadata = df.metadata.update((metadata_base.ALL_ELEMENTS, 0), inputs.metadata.query_column(0))
        df.metadata = df.metadata.update((metadata_base.ALL_ELEMENTS, 1), inputs.metadata.query_column(1))
        print(df.head(), file = sys.__stdout__)
        return CallResult(df)
Example #16
    def produce_metafeatures(self,
                             *,
                             inputs: Inputs,
                             timeout: float = None,
                             iterations: int = None) -> CallResult[Outputs]:
        """
        Perform supervised recursive feature elimination using random forests to generate an
        ordered list of features.

        Parameters
        ----------
        inputs : Input pandas frame, NOTE: Target column MUST be the last column

        Returns
        -------
        Outputs : pandas frame with ordered list of original features in first column
        """
        # add metadata to output dataframe
        rff_df = d3m_DataFrame(RFFeatures().rank_features(
            inputs=inputs.iloc[:, :-1],
            targets=pandas.DataFrame(inputs.iloc[:, -1])),
                               columns=['features'])
        # first column ('features')
        col_dict = dict(rff_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type("it is a string")
        col_dict['name'] = 'features'
        col_dict['semantic_types'] = (
            'http://schema.org/Text',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        rff_df.metadata = rff_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)

        return CallResult(rff_df)
Example #17
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:

        self.columns = list(inputs)
        X_ = inputs[self.columns].values

        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 10000

        if not self.fitted:
            raise ValueError('Please fit before calling produce')

        self.latent_factors = self.model.transform(X_)

        out_df = d3m_DataFrame(inputs, generate_metadata=True)
        corex_df = d3m_DataFrame(self.latent_factors, generate_metadata=True)

        for column_index in range(corex_df.shape[1]):
            col_dict = dict(
                corex_df.metadata.query((ALL_ELEMENTS, column_index)))
            col_dict['structural_type'] = type(1.0)
            # FIXME: assume we apply corex only once per template, otherwise column names might duplicate
            col_dict['name'] = str(out_df.shape[1] + column_index)  # just the running column index, no 'corex_' prefix
            col_dict['semantic_types'] = (
                'http://schema.org/Float',
                'https://metadata.datadrivendiscovery.org/types/Attribute')

            corex_df.metadata = corex_df.metadata.update(
                (ALL_ELEMENTS, column_index), col_dict)
        corex_df.index = out_df.index.copy()

        out_df = utils.append_columns(out_df, corex_df)

        return CallResult(out_df, True, self.max_iter)
Example #18
    def produce_metafeatures(self,
                             *,
                             inputs: Inputs,
                             timeout: float = None,
                             iterations: int = None) -> CallResult[Outputs]:
        """ Produce primitive's best guess for the structural type of each input column.

            Arguments:
                inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target
            
            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})

            Raises:
                PrimitiveNotFittedError: if primitive not fit
            
            Returns:
                CallResult[Outputs] -- dataframe with two columns: "semantic type classifications" and "probabilities"
                    Each row represents a column in the original dataframe. The column "semantic type 
                    classifications" contains a list of all semantic type labels and the column
                    "probabilities" contains a list of the model's confidence in assigning each 
                    respective semantic type label  
        """

        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        out_df = self._produce_annotations(inputs=inputs)

        # add metadata to output data frame
        simon_df = d3m_DataFrame(out_df)
        # first column ('semantic types')
        col_dict = dict(
            simon_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict["structural_type"] = typing.List[str]
        col_dict["name"] = "semantic types"
        col_dict["semantic_types"] = (
            "http://schema.org/Text",
            "https://metadata.datadrivendiscovery.org/types/Attribute",
        )
        simon_df.metadata = simon_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)
        # second column ('probabilities')
        col_dict = dict(
            simon_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict["structural_type"] = typing.List[float]
        col_dict["name"] = "probabilities"
        col_dict["semantic_types"] = (
            "http://schema.org/Text",
            "https://metadata.datadrivendiscovery.org/types/Attribute",
        )
        simon_df.metadata = simon_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 1), col_dict)

        return CallResult(simon_df, has_finished=self._is_fit)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """ return ranking of unlabeled instances based on similarity to positively and negatively
            labeled instances

            Ex. 
                d3mIndex       score
                    1130    0.586983
                    11      0.469862
                    1077    0.394225
                    1125    0.355335
                    21      0.353363
                    
            Arguments:
                inputs {Inputs} -- ignores these `inputs`, uses `inputs` from `set_training_data()`
            
            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})
        """

        pos_scores = np.row_stack(self.pos_scores)
        pos_scores = gem(pos_scores, p = self.hyperparams['gem_p'])

        if len(self.neg_scores) >= self.hyperparams['denominator_min']:
            print('rank by negative')
            neg_scores = np.row_stack(self.neg_scores)
            neg_scores = gem(neg_scores, p = self.hyperparams['gem_p'])
            scores = pos_scores / (neg_scores + 1e-12)
        else:
            print('rank by positive')
            scores = pos_scores

        mis_scores = scores[self.mis_idxs]
        mis_ranks = self.mis_idxs[np.argsort(-mis_scores)]
        mis_ranks = self.d3m_idxs[mis_ranks]
        
        ranking_df = pd.DataFrame({
            self.idx_name: mis_ranks,
            'score': np.flip(np.sort(mis_scores)),
        })
        ranking_df = d3m_DataFrame(ranking_df, generate_metadata = True)

        ranking_df.metadata = ranking_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            "http://schema.org/Integer"
        )
        ranking_df.metadata = ranking_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey'
        )
        ranking_df.metadata = ranking_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 1),
            "http://schema.org/Float"
        )
        return CallResult(ranking_df)

    def _prepare_d3m_df(self, all_preds):
        """ prepare d3m dataframe with appropriate metadata """

        all_preds = [preds.tolist() for preds in all_preds]
        preds_df = pd.DataFrame({f"{self._positive_class}_mask": all_preds})
        preds_df = d3m_DataFrame(preds_df, generate_metadata=False)
        preds_df.metadata = preds_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            "https://metadata.datadrivendiscovery.org/types/FloatVector",
        )
        return preds_df

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : dataframe 

        Returns
        ----------
        Outputs
            The output is a transformed dataframe of X fit into an embedded space; the number of feature columns equals the n_components hyperparameter.
            For time series datasets, the output is these dimensions concatenated to the time series filename dataframe.
        """ 
    
        targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/Target')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
        target_names = [list(inputs)[t] for t in targets]
        index = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(inputs)[i] for i in index]
 
        X_test = inputs.drop(columns = list(inputs)[index[0]])
        X_test = X_test.drop(columns = target_names).values
        
        # special semi-supervised case - during training, only produce rows with labels
        series = inputs[target_names] != ''
        if series.any().any():
            inputs = dataframe_utils.select_rows(inputs, np.flatnonzero(series))
            X_test = X_test[np.flatnonzero(series)]

        sc_df = d3m_DataFrame(pandas.DataFrame(self.sc.fit_predict(X_test), columns=['cluster_labels']))

        # add metadata to the new 'cluster_labels' column
        col_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type(1)
        if self.hyperparams['task_type'] == 'classification':
            col_dict['semantic_types'] = ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute')
            col_dict['name'] = 'cluster_labels'
        else:
            col_dict['semantic_types'] = ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')
            col_dict['name'] = target_names[0]
        sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, 0), col_dict)
        
        df_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, ))) 
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = ('https://metadata.datadrivendiscovery.org/types/TabularColumn',)
        df_dict_1['length'] = 1        
        sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS,), df_dict)
                
        return CallResult(utils_cp.append_columns(inputs, sc_df))
                  
Example #22
def test_moons(labeled_sample=10):

    X_l, y_l, X_u, y_u = load_moons(labeled_sample)

    X = np.vstack((X_l, X_u)).astype(str)
    y = np.concatenate((y_l, y_u)).astype(str)
    y[labeled_sample:] = ""

    features_df = pd.DataFrame(X)
    labels_df = pd.DataFrame({"target": y})

    features_df = d3m_DataFrame(features_df)
    labels_df = d3m_DataFrame(labels_df)

    global tss_params
    tss_params = {}

    accs = {}
    for algorithm in ["PseudoLabel", "VAT", "ICT"]:
        tss = TabularSemiSupervisedPrimitive(
            hyperparams=tss_hp(
                tss_hp.defaults(),
                epochs=50,
                algorithm=algorithm,
                weights_filepath=f"{algorithm}.pth",
            ),
            random_seed=5,
        )
        tss.set_training_data(inputs=features_df, outputs=labels_df)
        tss.fit()

        tss_params[algorithm] = tss.get_params()

        preds = tss.produce(inputs=features_df).value
        acc = (y_u == preds["target"][labeled_sample:].astype(float)).mean()
        print(f"{algorithm}: {acc}")
        accs[algorithm] = acc

    assert accs["VAT"] > accs["PseudoLabel"]
    assert accs["VAT"] > accs["ICT"]
    assert accs["PseudoLabel"] > accs["ICT"]
Example #23
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        if not self._fitted:
            return CallResult(inputs, self._has_finished,
                              self._iterations_done)

        assert isinstance(
            self._model,
            dict), "self._model type must be dict not defaultdict!"

        temp = pd.DataFrame(
            inputs.iloc[:, self._s_cols].apply(lambda x: self._model[
                x.name].transform(x) if x.name in self._model else None))

        outputs = inputs.copy()
        for id_index, od_index in zip(self._s_cols, range(temp.shape[1])):
            outputs.iloc[:, id_index] = temp.iloc[:, od_index]
        lookup = {
            "int": ('http://schema.org/Integer',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')
        }

        for index in self._s_cols:
            old_metadata = dict(
                outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
            old_metadata["semantic_types"] = lookup["int"]
            old_metadata["structural_type"] = type(10)
            outputs.metadata = outputs.metadata.update(
                (mbase.ALL_ELEMENTS, index), old_metadata)

        # remove columns that appear in the produce data but were not present in the fitted data
        drop_names = set(outputs.columns[self._s_cols]).difference(
            set(self._model.keys()))
        drop_indices = map(lambda a: outputs.columns.get_loc(a), drop_names)
        drop_indices = sorted(drop_indices)
        outputs = common_utils.remove_columns(outputs,
                                              drop_indices,
                                              source='ISI DSBox Data Labler')

        # sanity check and report the results
        if outputs.shape[0] == inputs.shape[0] and \
           outputs.shape[1] == inputs.shape[1] - len(drop_names):
            self._has_finished = True
            self._iterations_done = True
            # print("output:",outputs.head(5))
            return CallResult(d3m_DataFrame(outputs), self._has_finished,
                              self._iterations_done)
        else:
            return CallResult(inputs, self._has_finished,
                              self._iterations_done)
Example #24
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Produce primitive's prediction for future time series data

        Parameters
        ----------
        inputs : D3M dataframe supplying the d3mIndex values for the forecast horizon

        Returns
        ----------
        Outputs
            The output is a data frame containing the d3m index and a forecast for each of the 'n_periods' future time periods
        """

        # add metadata to output
        # just take d3m index from input test set
        output_df = inputs['d3mIndex']
        # produce future forecast using ARIMA
        future_forecast = pandas.DataFrame(
            self._sloth.PredictSeriesARIMA(self._arima,
                                           self.hyperparams['n_periods']))
        output_df = pandas.concat([output_df, future_forecast], axis=1)
        parrot_df = d3m_DataFrame(output_df)

        # first column ('d3mIndex')
        col_dict = dict(
            parrot_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = 'd3mIndex'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
        )
        parrot_df.metadata = parrot_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)
        # second column ('predictions')
        col_dict = dict(
            parrot_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = list(inputs)[self.hyperparams['index']]
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/Attribute',
        )
        parrot_df.metadata = parrot_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 1), col_dict)

        return CallResult(parrot_df)
Example #25
def load_big_earthnet():
    fnames = sorted(glob('/test_data/bigearth-100-single/*/*.tif'))
    imnames = sorted(list(set(['_'.join(f.split('_')[:-1]) for f in fnames])))
    imgs = [load_patch(img_path).astype(np.float32) for img_path in imnames]
    imgs_df = pd.DataFrame({'image_col': imgs, 'index': range(len(imgs))})
    imgs_df = d3m_DataFrame(imgs_df)
    imgs_df.metadata = imgs_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 1),
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey')

    y = [i.split('/')[3] for i in imnames]

    return imgs_df, np.array(y)
Example #26
    def produce_explanations(self,
                             *,
                             inputs: Inputs,
                             timeout: float = None,
                             iterations: int = None) -> CallResult[Outputs]:
        """ Produce explanation masks for primitive's predictions

            Arguments:
                inputs {Inputs} -- D3M dataframe containing attributes

            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})
        """

        clf_model, test_loader = self._prepare_test_inputs(inputs)

        if self.hyperparams['explain_all_classes']:
            all_class_masks = [[] for _ in range(self._nclasses)]
        else:
            all_class_masks = [[]]

        all_outputs = []
        for test_inputs in tqdm(test_loader):
            test_inputs = test_inputs[0].to(self._device)
            test_inputs.requires_grad = True
            test_outputs = clf_model(test_inputs)
            all_outputs.append(test_outputs)

            one_hots = self._get_one_hots(test_outputs)
            for i, one_hot in enumerate(one_hots):
                masks = self._get_masks(clf_model, test_inputs, test_outputs,
                                        one_hot)
                masks = self._resize_masks(masks,
                                           self.hyperparams['image_dim'])
                all_class_masks[i].append(masks)

        all_class_masks = [
            list(np.concatenate(masks).tolist()) for masks in all_class_masks
        ]
        self._all_outputs = torch.cat(all_outputs)

        explain_df = pd.DataFrame()
        for i, masks in enumerate(all_class_masks):
            explain_df[f'class_{i}'] = masks

        if not self.hyperparams['explain_all_classes']:
            explain_df.columns = ['class_argmax']

        explain_df = d3m_DataFrame(explain_df, generate_metadata=False)
        return CallResult(explain_df)
Example #27
def load_csv_data(data) -> d3m_Dataset:
    """
    Function used to load general csv file
    :param data: a str or a pd.DataFrame
    :return: a d3m style Dataset
    """
    logger.debug("Trying to load csv data with first 100 characters as:")
    logger.debug(str(data[:100]))
    if type(data) is str:
        data = pd.read_csv(data, dtype=str)
    elif type(data) is pd.DataFrame:
        data = data.astype(str)
    else:
        raise ValueError("Unknown input type.")

    # transform pd.DataFrame to d3m.Dataset
    d3m_df = d3m_DataFrame(data, generate_metadata=False)
    resources = {AUGMENT_RESOURCE_ID: d3m_df}
    return_ds = d3m_Dataset(resources=resources, generate_metadata=False)
    return_ds.metadata = return_ds.metadata.clear(source="",
                                                  for_value=return_ds,
                                                  generate_metadata=True)
    for i, each_column in enumerate(return_ds[AUGMENT_RESOURCE_ID]):
        metadata_selector = (AUGMENT_RESOURCE_ID, ALL_ELEMENTS, i)
        structural_type = str
        metadata_each_column = {
            "name": each_column,
            "structural_type": structural_type,
            "semantic_types": (
                "https://metadata.datadrivendiscovery.org/types/Attribute",
                "http://schema.org/Text",
            ),
        }
        return_ds.metadata = return_ds.metadata.update(
            metadata=metadata_each_column, selector=metadata_selector)
    metadata_all_level = {
        "id": "datamart_search_" + str(hash(data.values.tobytes())),
        "version": "2.0",
        "name": "user given input from datamart userend",
        "location_uris": ('file:///tmp/datasetDoc.json', ),
        "digest": "",
        "description": "",
        "source": {
            'license': 'Other'
        },
    }
    return_ds.metadata = return_ds.metadata.update(metadata=metadata_all_level,
                                                   selector=())
    logger.debug("Loading csv and transform to d3m dataset format success!")
    return return_ds
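
# A quick usage sketch (hypothetical data; AUGMENT_RESOURCE_ID and ALL_ELEMENTS are
# the module-level names used above):
import pandas as pd

df = pd.DataFrame({"city": ["Paris", "Oslo"], "population": [2148000, 697000]})
ds = load_csv_data(df)  # a path to a csv file works the same way

# every column is loaded as text with Attribute/Text semantic types
print(ds.metadata.query((AUGMENT_RESOURCE_ID, ALL_ELEMENTS, 0)))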
Example #28
def load_nwpu(data_dir: str = '/NWPU-RESISC45', n_imgs=200):
    paths = sorted(glob(os.path.join(data_dir, '*/*')))
    paths = [os.path.abspath(p) for p in paths]
    imgs = [Image.open(p) for p in paths[:n_imgs]]
    labels = [os.path.basename(os.path.dirname(p)) for p in paths[:n_imgs]]

    transform = tv.transforms.Compose([
        tv.transforms.ToTensor(),
        tv.transforms.Normalize(
            mean=(0.3680, 0.3810, 0.3436),
            std=(0.2034, 0.1854, 0.1848),
        )
    ])
    imgs = [transform(img) for img in imgs]

    imgs = d3m_DataFrame(pd.DataFrame({'imgs': imgs}))
    labels = np.array(labels)
    return imgs, labels
Example #29
    def _prepare_d3m_df(self, Z_smoothed, n_class):
        """ prepare d3m dataframe with appropriate metadata """

        if self.test_dataset:
            Z_smoothed = Z_smoothed[len(self.idx_train):]

        if self.hyperparams["all_scores"]:
            index = np.repeat(range(len(Z_smoothed)), n_class)
            labels = np.tile(range(n_class), len(Z_smoothed))
            scores = Z_smoothed.flatten()
        else:
            index = None
            labels = np.argmax(Z_smoothed, -1)
            scores = Z_smoothed[range(len(labels)), labels]

        labels = self.label_encoder.inverse_transform(labels)

        preds_df = d3m_DataFrame(
            pd.DataFrame(
                {
                    self.output_column: labels,
                    "confidence": scores
                },
                index=index,
            ),
            generate_metadata=True,
        )

        preds_df.metadata = preds_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )
        preds_df.metadata = preds_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 1),
            "https://metadata.datadrivendiscovery.org/types/Score",
        )
        preds_df.metadata = preds_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 1),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )
        preds_df.metadata = preds_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 1), "http://schema.org/Float")

        return preds_df

    def make_annotations_dataset(self, n_rows, round_num=0, num_bands=12):

        if self.annotations is None:
            annotationsDoc = {
                "dataResources": [
                    {
                        "resID": "annotationsData",
                        "resPath": "/scratch_dir/annotationsData.csv",
                        "resType": "table",
                        "resFormat": {"text/csv": ["csv"]},
                        "columns": [
                            {
                                "colIndex": 0,
                                "colName": "annotations",
                                "colType": "integer",
                                "role": ["attribute"],
                            }
                        ],
                    }
                ]
            }
            with open("/scratch_dir/annotationsDoc.json", "w") as json_file:
                json.dump(annotationsDoc, json_file)

        if round_num == 0:
            annotations = np.zeros(n_rows) - 1
            annotations[0] = 1
            annotations = pd.DataFrame(
                {"d3mIndex": np.arange(n_rows), "annotations": annotations.astype(int)}
            )
        else:
            annotations = pd.read_csv("/scratch_dir/annotationsData.csv")
            ranking = pd.read_csv("/scratch_dir/rankings.csv")
            test_index = pd.read_csv(
                f"/datasets/seed_datasets_current/{self.dataset}/TEST/dataset_TEST/tables/learningData.csv"
            )["d3mIndex"].values

            top_idx = np.where(test_index == ranking.iloc[0, 0])[0][0] // num_bands
            human_annotation = np.random.randint(2)
            annotations.iloc[top_idx, 1] = human_annotation

        annotations.to_csv("/scratch_dir/annotationsData.csv", index=False)
        return d3m_DataFrame(annotations)