Example 1
class SequentialModel(SupervisedLearnerPrimitiveBase[Input, Output, SM_Params,
                                                     SM_Hyperparams]):
    metadata = PrimitiveMetadata({
        "schema": "v0",
        "id": "4d6cbfca-5ac4-4e92-a3de-dc4a47008649",
        "version": "1.0.2",
        "name": "SequentialModel",
        "description":
        "Uses Sequential from Keras to do predictions with previously finely tuned hyperparams.",
        "python_path": "d3m.primitives.dsbox.SequentialModel",
        "original_python_path": "sequential_model.SequentialModel",
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/serbanstan/dsbox_sm"]
        },
        "installation": [cfg_.INSTALLATION],
        "algorithm_types": ['MULTILAYER_PERCEPTRON'],
        "primitive_family": "CLASSIFICATION",
        "hyperparams_to_tune": ["reg_val"]
    })

    def __init__(self, *, hyperparams: SM_Hyperparams) -> None:
        super().__init__(hyperparams=hyperparams)

    def set_training_data(self, *, inputs: Input, outputs: Output) -> None:
        # initialize the default parameters
        self.validateSplitRate = 0.2
        self.epochs = 30
        self.batchSize = 10

        # the inputs arrive as a DataFrame; record the number of feature columns
        self.inputDim = inputs.shape[1]
        self.kindOfcrossEntropy = 'categorical_crossentropy'

        # turn data to ndarray format
        self.training_inputs = inputs.values
        self.training_outputs = to_categorical(self._create_mapping(outputs))
        self.fitted = False

    def fit(self) -> CallResult[None]:
        make_keras_picklable()

        modelSub = Sequential()
        modelSub.add(
            Dense(100,
                  input_dim=self.inputDim,
                  kernel_regularizer=regularizers.l2(
                      self.hyperparams['reg_val']),
                  activation='tanh',
                  kernel_constraint=maxnorm(2)))
        modelSub.add(
            Dense(self.training_outputs.shape[1],
                  kernel_regularizer=regularizers.l2(
                      self.hyperparams['reg_val']),
                  activation='sigmoid'))

        #inp = InputK(shape = (self.inputDim,))
        #x = Dense(100, kernel_regularizer = regularizers.l2(self.hyperparams['reg_val']), activation = 'tanh', kernel_constraint = maxnorm(2))(inp)
        #x = Dense(self.training_outputs.shape[1], kernel_regularizer = regularizers.l2(self.hyperparams['reg_val']), activation = 'sigmoid')(x)
        optimizer = Adam(lr=0.001)

        #modelSub = keras.models.Model(inputs = inp, outputs = x)
        modelSub.compile(loss=self.kindOfcrossEntropy,
                         optimizer=optimizer,
                         metrics=['accuracy'])

        self.model = modelSub
        self.model.fit(self.training_inputs,
                       self.training_outputs,
                       validation_split=self.validateSplitRate,
                       epochs=self.epochs,
                       batch_size=self.batchSize)

        self.fitted = True

        return CallResult(None, True, 1)

    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        if not self.fitted:
            return CallResult(inputs, True, 1)

        prediction = container.DataFrame(
            self._inverse_mapping(self.model.predict_classes(inputs.values)))
        prediction.index = copy.deepcopy(inputs.index)

        return CallResult(prediction, True, 1)

    def _create_mapping(self, vec):
        # create a mapping from type to float
        self.mapping = dict()
        self.inverse_map = dict()

        res = []

        mapping_index = 0
        for v in vec.values.ravel():
            if v in self.mapping:
                res.append(self.mapping[v])
            else:
                # assign zero-based indices so to_categorical does not create an unused class 0
                self.mapping[v] = mapping_index
                self.inverse_map[mapping_index] = v
                res.append(mapping_index)

                mapping_index = mapping_index + 1

        return res

    def _inverse_mapping(self, vec):
        return [self.inverse_map[x] for x in vec]

    def get_params(self) -> SM_Params:
        if not self.fitted:
            raise ValueError("Fit not performed")
        return SM_Params(model_=self.model, inverse_map_=self.inverse_map)

    def set_params(self, *, params: SM_Params) -> None:
        self.model = params["model_"]
        self.inverse_map = params["inverse_map_"]

    def _annotation(self):
        # cache the constructed annotation under a separate attribute so it does not
        # shadow this method (self._annotation is the bound method itself)
        if getattr(self, '_annotation_obj', None) is not None:
            return self._annotation_obj
        annotation = Primitive()
        annotation.name = 'SequentialModel'
        annotation.task = 'Classification'
        annotation.learning_type = 'SupervisedLearning'
        annotation.ml_algorithm = ['Keras Sequential']
        annotation.tags = ['multilayer_perceptron']
        self._annotation_obj = annotation
        return annotation
Example 2
class CorexSAE(SupervisedLearnerPrimitiveBase[Input, Output, CorexSAE_Params,
                                              CorexSAE_Hyperparams]):

    metadata = PrimitiveMetadata({
        "schema": "v0",
        "id": "6c95166f-434a-435d-a3d7-bce8d7238061",
        "version": "1.0.0",
        "name": "CorexSupervised",
        "description": "Autoencoder implementation of Corex / Information Bottleneck",
        "python_path": "d3m.primitives.dsbox.CorexSupervised",
        "original_python_path": "corexsae.corex_sae.CorexSAE",
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/brekelma/dsbox_corex"]
        },
        # git+https://github.com/brekelma/corex_continuous#egg=corex_continuous
        "installation": [{
            'type': 'PIP',
            'package_uri': 'git+https://github.com/brekelma/dsbox_corex.git@7381c3ed2d41a8dbe96bbf267a915a0ec48ee397#egg=dsbox-corex'  # '+ str(git.Repo(search_parent_directories = True).head.object.hexsha) + '#egg=dsbox-corex'
        }],
        "algorithm_types": ["EXPECTATION_MAXIMIZATION_ALGORITHM"],
        "primitive_family": "CLASSIFICATION",  # "FEATURE_CONSTRUCTION",
        "hyperparams_to_tune": ["label_beta", "epochs"]
    })

    def __init__(self, *, hyperparams: CorexSAE_Hyperparams) -> None:
        # random_seed : int = 0, docker_containers: typing.Dict[str, DockerContainer] = None
        super().__init__(hyperparams=hyperparams)  # random_seed = random_seed, docker_containers = docker_containers

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:

        # create keras architecture
        self._latent_dims = [100, 100, 20]
        self._decoder_dims = list(reversed(self._latent_dims[:-1]))

        # TRAINING ARGS... what to do?
        self._activation = 'softplus'
        self._optimizer = Adam(.001)
        self._batch = 100
        self._epochs = None  # HYPERPARAM?
        self._noise = 'add'
        self._anneal_sched = None

        if iterations is not None:
            self.hyperparams["epochs"] = iterations

        x = Input(shape=(self.training_inputs.shape[-1], ))
        t = x

        for i in range(len(self._latent_dims[:-1])):
            t = Dense(self._latent_dims[i], activation=self._activation)(t)

        if self._noise == 'add' or self._noise == 'vae':
            final_enc_act = 'linear'
            sample_function = vae_sample
        else:
            #final_enc_act = 'softplus'
            final_enc_act = 'linear'
            sample_function = ido_sample

        z_mean = Dense(self._latent_dims[-1],
                       activation=final_enc_act,
                       name='z_mean')(t)
        z_noise = Dense(self._latent_dims[-1],
                        activation=final_enc_act,
                        name='z_noise')(t)
        z_act = Lambda(sample_function,
                       output_shape=(self._latent_dims[-1], ))([z_mean, z_noise])

        t = z_act
        for i in range(len(self._decoder_dims)):
            t = Dense(self._decoder_dims[i], activation=self._activation)(t)

        label_act = 'softmax' if self._label_unique > 1 else 'linear'
        y_pred = Dense(self._label_unique, activation=label_act, name='y_pred')(t)

        if not self._input_types:
            print("Purely Supervised Bottleneck")
            # no reconstruction layers

        outputs = []
        loss_functions = []
        loss_weights = []

        beta = Beta(name='beta', beta=self.hyperparams["label_beta"])(x)

        outputs.append(y_pred)
        if label_act == 'softmax':
            loss_functions.append(objectives.categorical_crossentropy)
        else:
            loss_functions.append(objectives.mean_squared_error)  #mse
        loss_weights.append(beta)

        self.model = Model(inputs=x, outputs=outputs)
        self.model.compile(optimizer=self._optimizer,
                           loss=loss_functions,
                           loss_weights=loss_weights)

        # anneal?
        if self._anneal_sched:
            raise NotImplementedError
        else:
            self.model.fit(self.training_inputs,
                           [self.training_outputs] * len(outputs),
                           shuffle=True,
                           epochs=self.hyperparams["epochs"],
                           batch_size=self._batch
                           )  # validation_data = [] early stopping?

        #Lambda(ido_sample)
        #Lambda(vae_sample, output_shape = (d,))([z_mean, z_var])

        return CallResult(None, True, self.hyperparams["epochs"])

    def produce(
        self,
        *,
        inputs: Input,
        timeout: float = None,
        iterations: int = None
    ) -> CallResult[Output]:  # TAKES IN DF with index column
        return CallResult(self.model.predict(inputs), True, 0)

    def set_training_data(self, *, inputs: Input, outputs: Output) -> None:
        self.training_inputs = inputs
        self.training_outputs = to_categorical(
            outputs, num_classes=np.unique(outputs).shape[0])
        self.fitted = False

        # DATA PROFILING? softmax categorical (encoded) X or labels Y
        # binary data? np.logical_and(self.training_inputs >= 0, self.training_inputs )
        self._input_types = []
        self._label_unique = np.unique(outputs).shape[0]
        # collapse to a single (regression-style) output when there are too many discrete labels
        if hasattr(self, 'max_discrete_labels') and self._label_unique > self.max_discrete_labels:
            self._label_unique = 1

    def get_params(self) -> CorexSAE_Params:
        return CorexSAE_Params()  #args)

    def set_params(self, *, params: CorexSAE_Params) -> None:
        self.max_discrete_labels = params["max_discrete_labels"]
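
The Lambda sampling layer in fit() relies on a vae_sample helper defined elsewhere in the package. As an illustration only (not the package's actual code), a reparameterization-style sampler of that shape typically looks like this:

from keras import backend as K

def vae_sample_sketch(args):
    # args = [z_mean, z_noise]; treat z_noise as a log-variance branch and apply
    # the usual reparameterization trick: z = mean + sigma * eps
    z_mean, z_noise = args
    eps = K.random_normal(shape=K.shape(z_mean))
    return z_mean + K.exp(0.5 * z_noise) * eps
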
Example 3
class CorexText(UnsupervisedLearnerPrimitiveBase[Input, Output,
                                                 CorexText_Params,
                                                 CorexText_Hyperparams]
                ):  #(Primitive):
    """
    Learns latent factors / topics which explain the most multivariate information in bag of words representations of documents. Returns learned topic scores for each document. Also supports hierarchical models and 'anchoring' to encourage topics to concentrate around desired words.
    """
    metadata = PrimitiveMetadata({
        "schema": "v0",
        "id": "0c64ffd6-cb9e-49f0-b7cb-abd70a5a8261",
        "version": "1.0.0",
        "name": "CorexText",
        "description": "Learns latent factors / topics which explain the most multivariate information in bag of words representations of documents. Returns learned topic scores for each document. Also supports hierarchical models and 'anchoring' to encourage topics to concentrate around desired words.",
        #"python_path": "d3m.primitives.dsbox.corex_text.CorexText",
        "python_path": "d3m.primitives.feature_construction.corex_text.DSBOX",
        "original_python_path": "corextext.corex_text.CorexText",
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/brekelma/dsbox_corex"]
        },
        "installation": [cfg_.INSTALLATION],
        "algorithm_types": ["EXPECTATION_MAXIMIZATION_ALGORITHM", "LATENT_DIRICHLET_ALLOCATION"],
        "primitive_family": "FEATURE_CONSTRUCTION",
        "hyperparams_to_tune": ["n_hidden", "threshold", "n_grams", "max_df", "min_df"]
    })

    def __init__(self, *, hyperparams: CorexText_Hyperparams) -> None:
        super(CorexText, self).__init__(hyperparams=hyperparams)

    # instantiate data and create model and bag of words
    def set_training_data(self, *, inputs: Input) -> None:
        self.training_data = inputs
        self.fitted = False

    # assumes input as data-frame and do prediction on the 'text' labeled columns
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        # if already fitted, do nothing
        if self.fitted:
            return CallResult(None, True, 1)

        self.training_data = self._process_files(self.training_data)

        text_attributes = DataMetadata.list_columns_with_semantic_types(self=self.training_data.metadata,\
            semantic_types=["http://schema.org/Text"])
        all_attributes = DataMetadata.list_columns_with_semantic_types(self=self.training_data.metadata,\
            semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])
        categorical_attributes = DataMetadata.list_columns_with_semantic_types(self=self.training_data.metadata,\
            semantic_types=["https://metadata.datadrivendiscovery.org/types/CategoricalData"])

        # want text columns that are attributes
        self.text_columns = set(all_attributes).intersection(text_attributes)

        # but, don't want to edit categorical columns
        self.text_columns = set(
            self.text_columns) - set(categorical_attributes)

        # and, we want the text columns as a list
        self.text_columns = list(self.text_columns)

        # if no text columns are present don't do anything
        self.do_nothing = False
        if len(self.text_columns) == 0:
            self.fitted = True

            self.model = None
            self.bow = None
            self.do_nothing = True
            self.text_columns = None
            self.latent_factors = None
            self.max_iter = None

            return CallResult(None, True, 1)

        # instantiate a corex model and a bag of words model
        self.model = Corex(n_hidden=self.hyperparams['n_hidden'],
                           max_iter=iterations,
                           seed=self.random_seed)
        self.bow = TfidfVectorizer(decode_error='ignore',
                                   max_df=self.hyperparams['max_df'],
                                   min_df=self.hyperparams['min_df'])

        # set the number of iterations (for wrapper and underlying Corex model)
        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 250
        self.model.max_iter = self.max_iter

        # concatenate the columns row-wise
        concat_cols = None
        for column_index in self.text_columns:
            if concat_cols is not None:
                concat_cols = concat_cols.str.cat(
                    self.training_data.iloc[:, column_index], sep=" ")
            else:
                concat_cols = copy.deepcopy(
                    self.training_data.iloc[:, column_index])

        try:
            bow = self.bow.fit_transform(
                map(self._get_ngrams, concat_cols.ravel()))
        except ValueError:
            self.bow = TfidfVectorizer(decode_error='ignore',
                                       max_df=self.hyperparams['max_df'],
                                       min_df=0)
            bow = self.bow.fit_transform(
                map(self._get_ngrams, concat_cols.ravel()))

            print("[WARNING] Setting min_df to 0 to avoid ValueError")

        # choose between CorEx and the TfIdf matrix
        if bow.shape[1] > self.hyperparams['threshold']:
            # use CorEx
            self.latent_factors = self.model.fit_transform(bow)
        else:
            # just use the bag of words representation
            self.latent_factors = pd.DataFrame(bow.todense())

        self.fitted = True

        return CallResult(None, True, 1)

    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        # if corex didn't run for any reason, just return the given dataset
        if self.do_nothing:
            return CallResult(inputs, True, 1)

        inputs = self._process_files(inputs)

        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 250
        self.model.max_iter = self.max_iter

        # concatenate the columns row-wise
        concat_cols = None
        for column_index in self.text_columns:
            if concat_cols is not None:
                concat_cols = concat_cols.str.cat(inputs.iloc[:, column_index],
                                                  sep=" ")
            else:
                concat_cols = copy.deepcopy(inputs.iloc[:, column_index])
        bow = self.bow.transform(map(self._get_ngrams, concat_cols.ravel()))

        # choose between CorEx and the TfIdf matrix
        if bow.shape[1] > self.hyperparams['threshold']:
            # use CorEx
            self.latent_factors = self.model.transform(bow).astype(float)
        else:
            # just use the bag of words representation
            self.latent_factors = pd.DataFrame(bow.todense())
        # make the columns corex adds distinguishable from other columns

        # remove the selected columns from input and add the latent factors given by corex
        out_df = d3m_DataFrame(inputs, generate_metadata=True)

        self.latent_factors.columns = [
            str(out_df.shape[-1] + i)
            for i in range(self.latent_factors.shape[-1])
        ]

        # create metadata for the corex columns
        corex_df = d3m_DataFrame(self.latent_factors, generate_metadata=True)
        for column_index in range(corex_df.shape[1]):
            col_dict = dict(
                corex_df.metadata.query((ALL_ELEMENTS, column_index)))
            col_dict['structural_type'] = type(1.0)
            # FIXME: assume we apply corex only once per template, otherwise column names might duplicate
            col_dict['name'] = 'corex_' + str(out_df.shape[1] + column_index)
            col_dict['semantic_types'] = (
                'http://schema.org/Float',
                'https://metadata.datadrivendiscovery.org/types/Attribute')

            corex_df.metadata = corex_df.metadata.update(
                (ALL_ELEMENTS, column_index), col_dict)

        # concatenate is --VERY-- slow without this next line
        corex_df.index = out_df.index.copy()

        out_df = utils.append_columns(out_df, corex_df)

        # remove the initial text columns from the df, if we do this before CorEx we can get an empty dataset error
        out_df = utils.remove_columns(out_df, self.text_columns)

        # TO DO : Incorporate timeout, max_iter
        # return CallResult(d3m_DataFrame(self.latent_factors))
        return CallResult(out_df, True, 1)

    #def fit_multi_produce(self, ):

    def _get_ngrams(self, text: str = None) -> str:
        punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
        try:
            words = text.translate(punctuation_table).lower().rsplit(" ")
        except:
            words = text.str.translate(punctuation_table).lower().rsplit(" ")

        new_text = ""
        for i in range(len(words)):
            new_text += "".join(
                words[i:i + int(self.hyperparams['n_grams'])]) + " "

        return new_text

    # remove the FileName columns from the data frame and replace them with text
    def _process_files(self, inputs: Input):
        fn_attributes = DataMetadata.list_columns_with_semantic_types(self=inputs.metadata, \
            semantic_types=["https://metadata.datadrivendiscovery.org/types/FileName"])
        all_attributes = DataMetadata.list_columns_with_semantic_types(self=inputs.metadata, \
            semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])
        fn_columns = list(set(all_attributes).intersection(fn_attributes))

        # if no file name columns are detected, default to regular behavior
        if len(fn_columns) == 0:
            return inputs

        # create an empty DataFrame of the required size
        processed_cols = pd.DataFrame("", index = copy.deepcopy(inputs.index), \
            columns = ['text_files_' + str(i) for i in range(len(fn_columns))])

        # for column_index in range(len(fn_columns)):
        for column_index in fn_columns:
            curr_column = copy.deepcopy(inputs.iloc[:, column_index])

            file_loc = inputs.metadata.query(
                (ALL_ELEMENTS, column_index))['location_base_uris']
            file_loc = file_loc[0]  # take the first elem of the tuple
            file_loc = file_loc[7:]  # get rid of 'file://' prefix

            for row_index in range(curr_column.shape[0]):
                text_file = curr_column.iloc[row_index]
                file_path = file_loc + text_file

                with open(file_path, 'rb') as file:
                    doc = file.read()
                doc = "".join(map(chr, doc))
                doc_tokens = re.compile(r"(?u)\b\w\w+\b").findall(
                    doc)  # list of strings

                processed_cols.iloc[row_index,
                                    fn_columns.index(column_index)] = " ".join(
                                        doc_tokens)

        # construct metadata for the newly generated columns
        processed_cols = d3m_DataFrame(processed_cols, generate_metadata=True)

        for column_index in range(processed_cols.shape[1]):
            col_dict = dict(
                processed_cols.metadata.query((ALL_ELEMENTS, column_index)))
            col_dict['structural_type'] = type("text")
            # FIXME: assume we apply corex only once per template, otherwise column names might duplicate
            col_dict['name'] = 'processed_file_' + str(inputs.shape[1] +
                                                       column_index)
            col_dict['semantic_types'] = (
                'http://schema.org/Text',
                'https://metadata.datadrivendiscovery.org/types/Attribute')

            processed_cols.metadata = processed_cols.metadata.update(
                (ALL_ELEMENTS, column_index), col_dict)

        # concatenate the input with the newly created columns
        updated_inputs = utils.append_columns(inputs, processed_cols)

        # remove the initial FileName columns from the df, if we do this before concatenating we might get an empty dataset error
        updated_inputs = utils.remove_columns(updated_inputs, fn_columns)

        return updated_inputs

    def get_params(self) -> CorexText_Params:
        return CorexText_Params(fitted_=self.fitted,
                                model_=self.model,
                                bow_=self.bow,
                                do_nothing_=self.do_nothing,
                                text_columns_=self.text_columns,
                                latent_factors_=self.latent_factors,
                                max_iter_=self.max_iter)

    def set_params(self, *, params: CorexText_Params) -> None:
        self.fitted = params['fitted_']
        self.model = params['model_']
        self.bow = params['bow_']
        self.do_nothing = params['do_nothing_']
        self.text_columns = params['text_columns_']
        self.latent_factors = params['latent_factors_']
        self.max_iter = params['max_iter_']

    def _annotation(self):
        # cache the constructed annotation under a separate attribute so it does not
        # shadow this method (self._annotation is the bound method itself)
        if getattr(self, '_annotation_obj', None) is not None:
            return self._annotation_obj
        annotation = Primitive()
        annotation.name = 'CorexText'
        annotation.task = 'FeatureExtraction'
        annotation.learning_type = 'UnsupervisedLearning'
        annotation.ml_algorithm = ['Dimension Reduction']
        annotation.tags = ['feature_extraction', 'text']
        self._annotation_obj = annotation
        return annotation
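
A minimal usage sketch for CorexText. The hyperparameter values are hypothetical, and it assumes CorexText_Hyperparams exposes the keys listed in hyperparams_to_tune and the standard d3m Hyperparams API; note that fit() falls back to the raw TF-IDF matrix whenever its width does not exceed the 'threshold' hyperparameter.

hp = CorexText_Hyperparams.defaults().replace({'n_hidden': 10, 'threshold': 500,
                                               'n_grams': 1, 'max_df': 0.9, 'min_df': 2})
ct = CorexText(hyperparams=hp)
ct.set_training_data(inputs=train_df)          # train_df: a d3m DataFrame with text attribute columns
ct.fit()
out_df = ct.produce(inputs=train_df).value     # original columns minus text, plus corex_* topic features
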
Example 4
class EchoLinearRegression(
        SupervisedLearnerPrimitiveBase[Input, Output, EchoRegressor_Params,
                                       EchoRegressor_Hyperparams]
):  #(Primitive):
    """
    Least squares regression with information capacity constraint from echo noise. Minimizes the objective function::
    E(y - y_hat)^2 + alpha * I(X,y)
    where, X_bar = X + S * echo noise, y_hat = X_bar w + w_0,
    so that I(X,y) <= -log det S,
    with w the learned weights / coefficients.
    The objective simplifies and has an analytic solution.
    """
    metadata = PrimitiveMetadata({
        "schema": "v0",
        "id": "18e63b10-c5b7-34bc-a670-f2c831d6b4bf",
        "version": "1.0.0",
        "name": "EchoLinearRegression",
        "description":
        "Learns latent factors / topics which explain the most multivariate information in bag of words representations of documents. Returns learned topic scores for each document. Also supports hierarchical models and 'anchoring' to encourage topics to concentrate around desired words.",
        #"python_path": "d3m.primitives.dsbox.echo.EchoRegressor",
        "python_path": "d3m.primitives.regression.echo_linear.DSBOX",
        "original_python_path": "echo_regressor.EchoLinearRegression",
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/brekelma/dsbox_corex"]
        },
        "installation": [cfg_.INSTALLATION],
        "algorithm_types": ["LINEAR_REGRESSION"],
        "primitive_family": "REGRESSION",
        "hyperparams_to_tune": ["alpha"]
    })

    def __init__(self, *, hyperparams: EchoRegressor_Hyperparams) -> None:
        super().__init__(hyperparams=hyperparams)

    # instantiate data and create model and bag of words
    def set_training_data(self, *, inputs: Input, outputs: Output) -> None:
        self.training_data = inputs
        self.labels = outputs
        self._output_columns = outputs.columns
        self.fitted = False

    # assumes input as data-frame and do prediction on the 'text' labeled columns
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        # if already fitted, do nothing
        if self.fitted:
            return CallResult(None, True, 1)

        self.model = EchoRegression(
            alpha=self.hyperparams['alpha'],
            assume_diagonal=self.hyperparams['diagonal'])

        self.model.fit(self.training_data, self.labels)
        self.fitted = True

        return CallResult(None, True, 1)

    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        preds = self.model.produce(inputs.values)

        # fall back to generic column names if set_training_data was never called
        if getattr(self, '_output_columns', None) is None:
            n_out = preds.shape[1] if getattr(preds, 'ndim', 1) > 1 else 1
            self._output_columns = ['output'] * n_out

        output = d3m_DataFrame(preds,
                               columns=self._output_columns,
                               source=self,
                               generate_metadata=True)  #
        output.metadata = inputs.metadata.clear(source=self,
                                                for_value=output,
                                                generate_metadata=True)
        #output.metadata = self._add_target_semantic_types(metadata=output.metadata, target_names=self._output_columns, source=self)

        self._training_indices = [
            c for c in inputs.columns
            if isinstance(c, str) and 'index' in c.lower()
        ]
        outputs = common_utils.combine_columns(
            return_result='new',  #self.hyperparams['return_result'],
            add_index_columns=True,  #self.hyperparams['add_index_columns'],
            inputs=inputs,
            columns_list=[output],
            source=self,
            column_indices=self._training_indices)
        return CallResult(outputs, True, 1)

    def get_params(self) -> EchoRegressor_Params:
        return EchoRegressor_Params(fitted_=self.fitted,
                                    model_=self.model,
                                    output_columns_=self._output_columns)
        """
        Sets all the search parameters from a Params object
        :param is_classifier: True for discrete-class output. False for numeric output.
        :type: boolean
        :type: Double
        """

    def set_params(self, *, params: EchoRegressor_Params) -> None:
        self.fitted = params['fitted_']
        self.model = params['model_']
        self._output_columns = params['output_columns_']

    def _add_target_semantic_types(
        cls,
        metadata: DataMetadata,
        source: typing.Any,
        target_names: List = None,
    ) -> DataMetadata:
        for column_index in range(
                metadata.query((ALL_ELEMENTS, ))['dimension']['length']):
            metadata = metadata.add_semantic_type(
                (ALL_ELEMENTS, column_index),
                'https://metadata.datadrivendiscovery.org/types/Target',
                source=source)
            metadata = metadata.add_semantic_type(
                (ALL_ELEMENTS, column_index),
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget',
                source=source)
            if target_names:
                metadata = metadata.update((ALL_ELEMENTS, column_index), {
                    'name': target_names[column_index],
                },
                                           source=source)
        return metadata
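
A minimal usage sketch for EchoLinearRegression. The values are hypothetical, and it assumes EchoRegressor_Hyperparams exposes the 'alpha' and 'diagonal' keys referenced in fit() and the standard d3m Hyperparams API.

hp = EchoRegressor_Hyperparams.defaults().replace({'alpha': 0.1, 'diagonal': True})
reg = EchoLinearRegression(hyperparams=hp)
reg.set_training_data(inputs=train_X, outputs=train_y)   # d3m DataFrames
reg.fit()
pred_df = reg.produce(inputs=test_X).value               # index columns plus predicted targets
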
Example 5
class EchoIB(SupervisedLearnerPrimitiveBase[Input, Output, EchoIB_Params,
                                            EchoIB_Hyperparams]):
    """
    Keras NN implementing the information bottleneck method with Echo Noise to calculate I(X:Z), where Z is also trained to maximize I(Z:Y) for label Y.  Control the tradeoff using the 'beta' hyperparameter.
    """
    metadata = PrimitiveMetadata({
        "schema": "v0",
        "id": "393f9de8-a5b9-4d92-aaff-8808d563b6c4",
        "version": "1.0.0",
        "name": "Echo",
        "description": "Autoencoder implementation of Information Bottleneck using Echo Noise: https://arxiv.org/abs/1904.07199.  Can be used for feature construction with the task of classification or regression.  Image featurization and collaborative filtering in prep.  Returns embedding of size n_hidden, alongside predictions (which can be used with downstream modeling primitive).  Beta hyperparam controls regularization: Loss = task_loss - beta * I(X:Z).  Returns learned features (# = n_hidden) and predictions of training classifier if use_as_modeling = False.",
        "python_path": "d3m.primitives.feature_construction.echo_ib.DSBOX",
        "original_python_path": "echo_ib.EchoIB",
        "can_use_gpus": True,
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/brekelma/dsbox_corex"]
        },
        # git+https://github.com/brekelma/corex_continuous#egg=corex_continuous
        "installation": [cfg_.INSTALLATION],
        #{'type': 'PIP',
        #'package_uri': 'git+https://github.com/brekelma/dsbox_corex.git@7381c3ed2d41a8dbe96bbf267a915a0ec48ee397#egg=dsbox-corex'#'+ str(git.Repo(search_parent_directories = True).head.object.hexsha) + '#egg=dsbox-corex'
        #}
        #]
        "algorithm_types": ["STOCHASTIC_NEURAL_NETWORK"],  #"EXPECTATION_MAXIMIZATION_ALGORITHM"],
        "primitive_family": "FEATURE_CONSTRUCTION",
        "hyperparams_to_tune": ["n_hidden", "beta", "epochs"]
    })

    def __init__(self, *, hyperparams: EchoIB_Hyperparams) -> None:
        # random_seed : int = 0, docker_containers: typing.Dict[str, DockerContainer] = None
        super().__init__(hyperparams=hyperparams)  # random_seed = random_seed, docker_containers = docker_containers

    def _extra_params(self,
                      latent_dims=None,
                      activation=None,
                      lr=None,
                      batch=None,
                      epochs=None,
                      noise=None):
        self._latent_dims = [
            self.hyperparams['units'], self.hyperparams['units'],
            self.hyperparams['n_hidden']
        ]
        self._decoder_dims = list(reversed(self._latent_dims[:-1]))

        # TRAINING ARGS... what to do?
        self._activation = self.hyperparams['activation']  #'tanh' #'softplus'
        self._lr = self.hyperparams['lr']
        self._optimizer = Adam(self._lr)
        self._batch = int(self.hyperparams['batch'])  #20
        self._epochs = None  # HYPERPARAM?
        self._noise = 'echo'
        self._kl_warmup = 0  # .1 * kl reg for first _ epochs
        self._anneal_sched = None  # not supported
        self._echo_args = {
            'batch': self._batch,
            'd_max': self._batch,
            'nomc': True,
            'calc_log': True,
            'plus_sx': True
        }
        self._label_unique = 0
        # keep any attributes already set by set_training_data / set_params
        if not hasattr(self, 'label_encode'):
            self.label_encode = None
        # self.output_columns is set in set_training_data when available  # ['Hall of Fame']

    # def build_encoder(self):

    # def build_decoder(self):

    # def build_model(self):

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        make_keras_pickleable()
        # create keras architecture
        # TODO : Architecture / layers as input hyperparameter

        self._extra_params()

        if iterations is not None:
            self.hyperparams["epochs"] = iterations

        if self.hyperparams['convolutional']:
            encoder = build_convolutional_encoder(self.hyperparams['n_hidden'])
            # reuse the encoder's own input tensor so the full model below can be built on it;
            # note that z_mean/z_noise are only defined in the non-convolutional branch
            x = encoder.inputs[0]
            z_act = encoder.outputs[0]
        else:
            x = keras_layers.Input(shape=(self.training_inputs.shape[-1], ))
            t = x

            for i in range(len(self._latent_dims[:-1])):
                t = Dense(self._latent_dims[i], activation=self._activation)(t)

            if self._noise == 'add' or self._noise == 'vae':
                z_mean_act = 'linear'
                z_var_act = 'linear'
                sample_function = vae_sample
                latent_loss = gaussian_kl_prior
            elif self._noise == 'ido' or self._noise == 'mult':
                #final_enc_act = 'softplus'
                z_mean_act = 'linear'
                z_var_act = 'linear'
                sample_function = ido_sample
                latent_loss = gaussian_kl_prior
            elif self._noise == 'echo':
                z_mean_act = tanh64
                z_var_act = tf.math.log_sigmoid
                sample_function = echo_sample
                latent_loss = echo_loss
            else:
                z_mean_act = tanh64
                z_var_act = tf.math.log_sigmoid
                sample_function = echo_sample
                latent_loss = echo_loss

            #z_var_act = log_sigmoid_64

            z_mean = Dense(self._latent_dims[-1],
                           activation=z_mean_act,
                           name='z_mean')(t)
            z_noise = Dense(self._latent_dims[-1],
                            activation=z_var_act,
                            name='z_noise',
                            bias_initializer='ones')(t)
            z_act = Lambda(echo_sample,
                           arguments=self._echo_args,
                           output_shape=(self._latent_dims[-1], ),
                           name='z_act')([z_mean, z_noise])

        z_inp = keras_layers.Input(shape=(self._latent_dims[-1], ))
        t = z_act
        dt = z_inp
        for i in range(len(self._decoder_dims)):
            lyr = Dense(self._decoder_dims[i],
                        name='decoder_' + str(i),
                        activation=self._activation)
            t = lyr(t)
            dt = lyr(dt)

        if 'classification' in self.hyperparams['task'].lower() and self._label_unique > 0:
            label_act = 'softmax' if self._label_unique > 1 else 'sigmoid'
            lyr = Dense(self._label_unique,
                        activation=label_act,
                        name='y_pred')
            y_pred = lyr(t)
            y_p = lyr(dt)
        elif 'regression' in self.hyperparams['task'].lower() or self._label_unique == 0:
            label_act = 'linear'
            lyr = Dense(self.training_outputs.shape[-1],
                        activation=label_act,
                        name='y_pred')
            y_pred = lyr(t)
            y_p = lyr(dt)
        else:
            raise NotImplementedError(
                "TASK TYPE SHOULD BE CLASSIFICATION OR REGRESSION")

        # TO DO : Add reconstruction layers and additional representation as in https://arxiv.org/abs/1912.00646

        outputs = []
        dec_outputs = []
        loss_functions = []
        loss_weights = []

        outputs.append(y_pred)
        dec_outputs.append(y_p)
        if label_act == 'softmax':
            loss_functions.append(K.categorical_crossentropy)
        elif label_act == 'sigmoid':
            loss_functions.append(K.binary_crossentropy)
        else:
            loss_functions.append(tf.keras.losses.mean_squared_error)  #mse
        loss_weights.append(1)

        loss_tensor = Lambda(latent_loss)([z_mean, z_noise])
        outputs.append(loss_tensor)
        loss_functions.append(dim_sum)
        loss_weights.append(
            tf.Variable(self.hyperparams["beta"],
                        dtype=tf.float32,
                        trainable=False))

        #if self._kl_warmup is not None and self._kl_warmup > 0:
        #    my_callbacks = [ZeroAnneal(lw = self.hyperparams['beta'], index = -1, epochs = self._kl_warmup)]
        #else:
        my_callbacks = []
        my_callbacks.append(keras.callbacks.TerminateOnNaN())
        self.model = keras.models.Model(inputs=x, outputs=outputs)
        self.enc_model = keras.models.Model(inputs=x, outputs=z_act)
        self.dec_model = keras.models.Model(inputs=z_inp,
                                            outputs=dec_outputs[0])
        self.model.compile(optimizer=self._optimizer,
                           loss=loss_functions,
                           loss_weights=loss_weights)
        #get_session().run(tf.global_variables_initializer())

        # anneal?
        if self._anneal_sched:
            raise NotImplementedError
        else:
            self.model.fit_generator(
                generator(self.training_inputs,
                          self.training_outputs,
                          target_len=len(outputs),
                          batch=self._batch),
                verbose=1,  #callbacks = my_callbacks,
                steps_per_epoch=int(self.training_inputs.shape[0] /
                                    self._batch),
                epochs=int(self.hyperparams["epochs"]))

        self.fitted = True

        return CallResult(None, True, self.hyperparams["epochs"])

    def produce(
        self,
        *,
        inputs: Input,
        timeout: float = None,
        iterations: int = None
    ) -> CallResult[Output]:  # TAKES IN DF with index column
        self._extra_params()

        modeling = self.hyperparams['use_as_modeling']
        inp = self.model.input

        # outputs = [layer.output for layer in self.model.layers if 'z_mean' in layer.name or 'z_noise' in layer.name]
        # functors = [K.function([inp, K.learning_phase()], [out]) for out in outputs]
        # dec_inp = [layer.input for layer in self.model.layers if 'decoder_0' in layer.name][0]
        # # directly output sampled latent?
        # output_z = [layer.output for layer in self.model.layers if 'z_act' in layer.name or 'latent_act' in layer.name]
        # functors_z = [K.function([inp, K.learning_phase()], [out]) for out in output_z]

        # preds = [layer.output for layer in self.model.layers if 'y_pred' in layer.name]
        # pred_function = K.function([dec_inp, K.learning_phase()], [preds[0]])

        inps = inputs.remove_columns([inputs.columns.get_loc('d3mIndex')])
        #predictions = []
        #eatures = []

        features = self.enc_model.predict(inps, batch_size=self._batch)
        predictions = self.dec_model.predict(features, batch_size=self._batch)
        # for i in range(0, inps.shape[0], self._batch):
        #     data = inps.values[i:i+self._batch]
        #     z_stats = [func([data, 1.])[0] for func in functors]
        #     z_out = [func([data, 1.])[0] for func in functors_z]

        #     z_act = self.enc_model(data)
        #     y_pred = self.dec_model(z_act)
        #     _echo_args = copy.copy(self._echo_args)
        #     _echo_args['batch'] = data.shape[0]
        #     _echo_args['d_max'] = data.shape[0]

        #     #z_act = echo_sample(z_stats, **_echo_args).eval(session=get_session())

        #     y_pred= pred_function([z_act, 1.])[0]#.eval(session=K.get_session())
        #     features.extend([z_act[yp] for yp in range(z_act.shape[0])])

        #     y_pred = np.argmax(y_pred, axis = -1)
        #     predictions.extend([y_pred[yp] for yp in range(y_pred.shape[0])])
        # predictions = np.array(predictions)

        if self.label_encode is not None:
            predictions = np.argmax(predictions, axis=-1)
            predictions = self.label_encode.inverse_transform(predictions)

        if modeling:
            output = d3m_DataFrame(predictions,
                                   columns=self.output_columns,
                                   generate_metadata=True,
                                   source=self)
        else:
            out_df = d3m_DataFrame(inputs, generate_metadata=True)

            # create metadata for the corex columns
            features = np.array(features)

            if len(predictions.shape) < len(features.shape):
                predictions = np.expand_dims(predictions, axis=-1)

            constructed = np.concatenate([features, predictions], axis=-1)
            corex_df = d3m_DataFrame(constructed, generate_metadata=True)

            for column_index in range(corex_df.shape[1]):
                col_dict = dict(
                    corex_df.metadata.query((ALL_ELEMENTS, column_index)))
                col_dict['structural_type'] = type(1.0)
                # FIXME: assume we apply corex only once per template, otherwise column names might duplicate
                col_dict['name'] = str(
                    out_df.shape[1] + column_index
                )  #'echoib_'+('pred_' if column_index < self.hyperparams['n_hidden'] else 'feature_') +
                col_dict['semantic_types'] = (
                    'http://schema.org/Float',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')

                corex_df.metadata = corex_df.metadata.update(
                    (ALL_ELEMENTS, column_index), col_dict)

            # concatenate is --VERY-- slow without this next line
            corex_df.index = out_df.index.copy()

            outputs = common_utils.append_columns(out_df, corex_df)

        if modeling:
            self._training_indices = [
                c for c in inputs.columns
                if isinstance(c, str) and 'index' in c.lower()
            ]

            outputs = common_utils.combine_columns(
                return_result='new',  #self.hyperparams['return_result'],
                add_index_columns=True,  #self.hyperparams['add_index_columns'],
                inputs=inputs,
                columns_list=[output],
                source=self,
                column_indices=self._training_indices)

        #predictions = d3m_DataFrame(predictions, index = inputs.index.copy())# columns = self.output_columns

        return CallResult(outputs, True, 1)
        #return CallResult(d3m_DataFrame(self.model.predict(inputs)), True, 0)

    def set_training_data(self, *, inputs: Input, outputs: Output) -> None:
        inps = inputs.remove_columns([inputs.columns.get_loc('d3mIndex')])

        self.training_inputs = inps.values

        self.output_columns = outputs.columns

        if 'classification' in self.hyperparams['task'].lower():
            self._label_unique = np.unique(outputs.values).shape[0]
            if self._label_unique >= outputs.values.shape[0] - 1:
                self._label_unique = 0
                self.training_outputs = outputs.values
            else:
                self.label_encode = LabelEncoder()
                self.training_outputs = to_categorical(
                    self.label_encode.fit_transform(outputs.values),
                    num_classes=np.unique(outputs.values).shape[0])
        else:
            self.training_outputs = outputs.values

        #self.training_outputs = to_categorical(outputs, num_classes = np.unique(outputs.values).shape[0])
        self.fitted = False

        # DATA PROFILING? softmax categorical (encoded) X or labels Y
        # binary data? np.logical_and(self.training_inputs >= 0, self.training_inputs )

        # CHECK unique values for determining discrete / continuous
        #self._input_types = []
        #self._label_unique = np.unique(outputs).shape[0]
        #self._label_unique = 1 if self._label_unique > self.max_discrete_labels else self._label_unique

    def get_params(self) -> EchoIB_Params:
        return EchoIB_Params(model=self.model,
                             model_weights=self.model.get_weights(),
                             fitted=self.fitted,
                             label_encode=self.label_encode,
                             output_columns=self.output_columns,
                             enc_model=self.enc_model,
                             dec_model=self.dec_model)  # max_discrete_labels = self.max_discrete_labels

    def set_params(self, *, params: EchoIB_Params) -> None:
        #self.max_discrete_labels = params["max_discrete_labels"]
        self._extra_params()
        self.model = params['model']
        self.model.set_weights(params['model_weights'])
        self.enc_model = params['enc_model']
        self.dec_model = params['dec_model']
        self.fitted = params['fitted']
        self.label_encode = params['label_encode']
        self.output_columns = params['output_columns']

    def _add_target_semantic_types(
        cls,
        metadata: DataMetadata,
        source: typing.Any,
        target_names: List = None,
    ) -> DataMetadata:
        for column_index in range(
                metadata.query((ALL_ELEMENTS, ))['dimension']['length']):
            metadata = metadata.add_semantic_type(
                (ALL_ELEMENTS, column_index),
                'https://metadata.datadrivendiscovery.org/types/Target',
                source=source)
            metadata = metadata.add_semantic_type(
                (ALL_ELEMENTS, column_index),
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget',
                source=source)
            if target_names:
                metadata = metadata.update((ALL_ELEMENTS, column_index), {
                    'name': target_names[column_index],
                },
                                           source=source)
        return metadata
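
A minimal usage sketch for EchoIB used as a feature constructor. The values are hypothetical, and it assumes EchoIB_Hyperparams exposes the keys referenced in fit()/produce() (e.g. 'n_hidden', 'beta', 'epochs', 'task', 'use_as_modeling') and the standard d3m Hyperparams API.

hp = EchoIB_Hyperparams.defaults().replace({'n_hidden': 20, 'beta': 0.1, 'epochs': 50,
                                            'task': 'classification', 'use_as_modeling': False})
eib = EchoIB(hyperparams=hp)
eib.set_training_data(inputs=train_df, outputs=train_labels)  # train_df must contain a 'd3mIndex' column
eib.fit()
feat_df = eib.produce(inputs=train_df).value   # inputs plus learned features and predictions as new columns
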
Example 6
class CorexContinuous(
        UnsupervisedLearnerPrimitiveBase[Input, Output, CorexContinuous_Params,
                                         CorexContinuous_Hyperparams]
):  #(Primitive):
    """
    Return components/latent factors that explain the most multivariate mutual information in the data under Linear Gaussian model. For comparison, PCA returns components explaining the most variance in the data.  Serves as DSBox 'wrapper' for https://github.com/gregversteeg/linearcorex
    """
    metadata = PrimitiveMetadata({
        "schema": "v0",
        "id": "d2d4fefc-0859-3522-91df-7e445f61a69b",
        "version": "1.0.0",
        "name": "CorexContinuous",
        "description": "Return components/latent factors that explain the most multivariate mutual information in the data under Linear Gaussian model. For comparison, PCA returns components explaining the most variance in the data.",
        #"python_path": "d3m.primitives.dsbox.corex_continuous.CorexContinuous",
        "python_path": "d3m.primitives.feature_construction.corex_continuous.CorexContinuous",
        "original_python_path": "corexcontinuous.corex_continuous.CorexContinuous",
        "source": {
            "name": "ISI",
            "contact": 'mailto:[email protected]',
            "uris": ['https://github.com/brekelma/dsbox_corex']
        },
        "installation": [cfg_.INSTALLATION],
        #[ {
        # 'type': 'PIP',
        # 'package_uri': 'git+https://github.com/brekelma/dsbox_corex.git@7381c3ed2d41a8dbe96bbf267a915a0ec48ee397#egg=dsbox-corex'#+ str(git.Repo(search_parent_directories = True).head.object.hexsha) + '#egg=dsbox-corex'
        #}
        #],
        "algorithm_types": ["EXPECTATION_MAXIMIZATION_ALGORITHM"],
        "primitive_family": "FEATURE_CONSTRUCTION",
        "preconditions": ["NO_MISSING_VALUES", "NO_CATEGORICAL_VALUES"],
        "hyperparams_to_tune": ["n_hidden"]
    })

    #  "effects": [],

    #def __init__(self, n_hidden : Any = None, max_iter : int = 10000,
    def __init__(self, *, hyperparams: CorexContinuous_Hyperparams) -> None:
        # random_seed : int = 0, docker_containers: typing.Dict[str, DockerContainer] = None
        # Additional Corex Parameters set to defaults:  see github.com/gregversteeg/LinearCorex
        # tol : float = 1e-5, anneal : bool = True, discourage_overlap : bool = True, gaussianize : str = 'standard',
        # gpu : bool = False, verbose : bool = False, seed : int = None, **kwargs) -> None:
        super(CorexContinuous, self).__init__(hyperparams=hyperparams)  # random_seed = random_seed, docker_containers = docker_containers

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        if self.fitted:
            return CallResult(None, True, self.max_iter)
        if not hasattr(self, 'training_inputs'):
            raise ValueError("Missing training data.")

        self._fit_transform(self.training_inputs, timeout, iterations)
        self.fitted = True
        # add support for max_iter / incomplete
        return CallResult(None, True, self.max_iter)

    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:

        self.columns = list(inputs)
        X_ = inputs[self.columns].values

        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 10000

        if not self.fitted:
            raise ValueError('Please fit before calling produce')

        self.latent_factors = self.model.transform(X_)

        out_df = d3m_DataFrame(inputs)
        corex_df = d3m_DataFrame(self.latent_factors)

        for column_index in range(corex_df.shape[1]):
            col_dict = dict(
                corex_df.metadata.query((mbase.ALL_ELEMENTS, column_index)))
            col_dict['structural_type'] = type(1.0)
            # FIXME: assume we apply corex only once per template, otherwise column names might duplicate
            col_dict['name'] = 'corex_' + str(out_df.shape[1] + column_index)
            col_dict['semantic_types'] = (
                'http://schema.org/Float',
                'https://metadata.datadrivendiscovery.org/types/Attribute')

            corex_df.metadata = corex_df.metadata.update(
                (mbase.ALL_ELEMENTS, column_index), col_dict)
        corex_df.index = out_df.index.copy()

        out_df = utils.append_columns(out_df, corex_df)
        return CallResult(out_df, True, self.max_iter)

    def _fit_transform(self,
                       inputs: Input,
                       timeout: float = None,
                       iterations: int = None) -> Sequence[Output]:

        self.columns = list(inputs)
        X_ = inputs[self.columns].values

        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 10000

        if isinstance(self.hyperparams['n_hidden'], int):
            self.n_hidden = self.hyperparams['n_hidden']
        elif isinstance(self.hyperparams['n_hidden'], float):
            self.n_hidden = max(
                1, int(self.hyperparams['n_hidden'] * len(self.columns)))

        if not hasattr(self, 'model') or self.model is None:
            _stdout = sys.stdout
            null = open(os.devnull, 'w')  # text mode, since the model writes str output
            sys.stdout = null
            self.model = corex_cont.Corex(n_hidden=self.n_hidden,
                                          max_iter=self.max_iter)
            sys.stdout = _stdout

        self.latent_factors = self.model.fit_transform(X_)
        self.fitted = True
        return self.latent_factors

    def set_training_data(self, *, inputs: Input) -> None:
        self.training_inputs = inputs
        self.fitted = False

    def get_params(self) -> CorexContinuous_Params:
        return CorexContinuous_Params(model=self.model)

    def set_params(self, *, params: CorexContinuous_Params) -> None:
        self.model = params['model']
        #self.fitted = params.fitted
        #self.training_inputs = params.training_inputs

    def _annotation(self):
        # cache the constructed annotation under a separate attribute so it does not
        # shadow this method (self._annotation is the bound method itself)
        if getattr(self, '_annotation_obj', None) is not None:
            return self._annotation_obj
        annotation = Primitive()
        annotation.name = 'CorexContinuous'
        annotation.task = 'FeatureExtraction'
        annotation.learning_type = 'UnsupervisedLearning'
        annotation.ml_algorithm = ['Dimension Reduction']
        annotation.tags = ['feature_extraction', 'continuous']
        self._annotation_obj = annotation
        return annotation

    def _get_feature_names(self):
        return [
            'CorexContinuous_' + str(i)
            for i in range(self.hyperparams['n_hidden'])
        ]
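
As _fit_transform above shows, n_hidden is interpreted either as an absolute number of latent factors (int) or as a fraction of the input columns (float). A small standalone sketch of that resolution logic with hypothetical values:

n_columns = 40  # hypothetical number of input columns

for n_hidden in (5, 0.25):
    if isinstance(n_hidden, int):
        resolved = n_hidden                            # 5 -> 5 latent factors
    elif isinstance(n_hidden, float):
        resolved = max(1, int(n_hidden * n_columns))   # 0.25 * 40 -> 10 latent factors
    print(n_hidden, '->', resolved)
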
Example 7
class CorexContinuous(UnsupervisedLearnerPrimitiveBase[Input, Output, CorexContinuous_Params, CorexContinuous_Hyperparams]):  #(Primitive):
    
    """
    Return components/latent factors that explain the most multivariate mutual information in the data under Linear Gaussian model. For comparison, PCA returns components explaining the most variance in the data.  Serves as DSBox 'wrapper' for https://github.com/gregversteeg/linearcorex
    """
    metadata = PrimitiveMetadata({
      "schema": "v0",
      "id": "d2d4fefc-0859-3522-91df-7e445f61a69b",
      "version": "1.0.0",
      "name": "CorexContinuous",
      "description": "Return components/latent factors that explain the most multivariate mutual information in the data under Linear Gaussian model. For comparison, PCA returns components explaining the most variance in the data.",
      "python_path": "d3m.primitives.dsbox.CorexContinuous",
      "original_python_path": "corexcontinuous.corex_continuous.CorexContinuous",
      "source": {
            "name": "ISI",
            "contact": 'mailto:[email protected]',
            "uris": [ 'https://github.com/brekelma/dsbox_corex' ]
            },
      "installation": [
            {
             'type': 'PIP', 
             'package_uri': 'git+https://github.com/brekelma/dsbox_corex.git@8672da14a7f2e00ea488da460ad68ef0799a9532#egg=dsbox-corex'
            }
        ],
      "algorithm_types": ["EXPECTATION_MAXIMIZATION_ALGORITHM"],
      "primitive_family": "FEATURE_CONSTRUCTION",
      "preconditions": ["NO_MISSING_VALUES", "NO_CATEGORICAL_VALUES"],
      "hyperparams_to_tune": ["n_hidden"]
    })
    #  "effects": [],

    #def __init__(self, n_hidden : Any = None, max_iter : int = 10000, 
    def __init__(self, *, hyperparams : CorexContinuous_Hyperparams) -> None: #, random_seed : int =  0, docker_containers: typing.Dict[str, DockerContainer] = None
        # Additional Corex Parameters set to defaults:  see github.com/gregversteeg/LinearCorex
        
        #tol : float = 1e-5, anneal : bool = True, discourage_overlap : bool = True, gaussianize : str = 'standard',  
        #gpu : bool = False, verbose : bool = False, seed : int = None, **kwargs) -> None:
        
        super().__init__(hyperparams = hyperparams)# random_seed = random_seed, docker_containers = docker_containers)
        


    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self.fitted:
            return CallResult(None, True, 0)
        if not hasattr(self, 'training_inputs'):
            raise ValueError("Missing training data.")

        self._fit_transform(self.training_inputs, timeout, iterations)
        self.fitted = True
        # add support for max_iter / incomplete
        return CallResult(None, True, self.max_iter)

    def produce(self, *, inputs : Input, timeout : float = None, iterations : int = None) -> CallResult[Output]: 

        self.columns = list(inputs)
        X_ = inputs[self.columns].values

        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 10000

        if not self.fitted:
            raise ValueError('Please fit before calling produce')

        self.latent_factors = self.model.transform(X_)

        return CallResult(self.latent_factors, True, self.max_iter)

    def _fit_transform(self, inputs : Input, timeout: float = None, iterations : int = None) -> Sequence[Output]:
        
        self.columns = list(inputs)
        X_ = inputs[self.columns].values

        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 10000

        # n_hidden may be given as an absolute count (int) or as a fraction of the input columns (float)
        if isinstance(self.hyperparams['n_hidden'], int):
            self.n_hidden = self.hyperparams['n_hidden']
        elif isinstance(self.hyperparams['n_hidden'], float):
            self.n_hidden = max(
                1, int(self.hyperparams['n_hidden'] * len(self.columns)))

        if not hasattr(self, 'model') or self.model is None:
            # build the model once, muting anything the Corex constructor writes to stdout
            _stdout = sys.stdout
            null = open(os.devnull, 'wb')
            sys.stdout = null
            self.model = corex_cont.Corex(n_hidden=self.n_hidden,
                                          max_iter=self.max_iter)
            sys.stdout = _stdout

        self.latent_factors = self.model.fit_transform(X_)
        self.fitted = True
        return self.latent_factors

    def set_training_data(self, *, inputs : Input, outputs : Output) -> None:
        self.training_inputs = inputs
        self.fitted = False

    def get_params(self) -> CorexContinuous_Params:
        return CorexContinuous_Params(model = self.model)

    def set_params(self, *, params: CorexContinuous_Params) -> None:
        self.model = params['model']
        #self.fitted = params.fitted
        #self.training_inputs = params.training_inputs


    def _annotation(self):
        # cache on a separate attribute: assigning to self._annotation would
        # shadow this method, so the original check could never succeed
        if getattr(self, '_annotation_cache', None) is not None:
            return self._annotation_cache
        annotation = Primitive()
        annotation.name = 'CorexContinuous'
        annotation.task = 'FeatureExtraction'
        annotation.learning_type = 'UnsupervisedLearning'
        annotation.ml_algorithm = ['Dimension Reduction']
        annotation.tags = ['feature_extraction', 'continuous']
        self._annotation_cache = annotation
        return annotation

    def _get_feature_names(self):
        return [
            'CorexContinuous_' + str(i)
            for i in range(self.hyperparams['n_hidden'])
        ]
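
A minimal driver sketch for the CorexContinuous listing above. Everything the listing itself does not show is an assumption: the toy DataFrame, the n_hidden value, and the use of defaults()/replace(), the usual helpers on d3m Hyperparams subclasses.

import numpy as np
import pandas as pd

# toy continuous data: 100 samples, 8 numeric columns (illustrative only)
df = pd.DataFrame(np.random.randn(100, 8))

hp = CorexContinuous_Hyperparams.defaults().replace({'n_hidden': 2})
primitive = CorexContinuous(hyperparams=hp)
primitive.set_training_data(inputs=df, outputs=None)  # outputs is ignored by this unsupervised primitive
primitive.fit()
latent = primitive.produce(inputs=df).value  # ndarray of shape (100, 2): one row of latent factors per sample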
Example n. 8
class SDNE(UnsupervisedLearnerPrimitiveBase[Input, Output, SDNE_Params,
                                            SDNE_Hyperparams]):
    """
    Graph embedding method
    """

    metadata = PrimitiveMetadata({
        "schema":
        "v0",
        "id":
        "7d61e488-b5bb-4c79-bad6-f1dc07292bf4",
        "version":
        "1.0.0",
        "name":
        "SDNE",
        "description":
        "Structural Deep Network Embedding (Wang et al 2016): unsupervised network embedding using autoencoders to preserve first order proximity (i.e. connected nodes have similar embeddings) and second order proximity (i.e. nodes with similar neighbors have similar embeddings).  Hyperparam alpha controls weight of 1st order proximity loss (L2 norm of embedding difference), beta controls second-order loss (reconstruction of adjacency matrix row, matrix B in Wang et al).  Expects list of [learning_df, nodes_df, edges_df] as input (e.g. by running common_primitives.normalize_graphs + data_tranformation.graph_to_edge_list.DSBOX)",
        "python_path":
        "d3m.primitives.feature_construction.sdne.DSBOX",
        "original_python_path":
        "sdne.SDNE",
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/brekelma/dsbox_graphs"]
        },
        "installation": [cfg_.INSTALLATION],
        "algorithm_types": ["AUTOENCODER"],
        "primitive_family":
        "FEATURE_CONSTRUCTION",
        "hyperparams_to_tune": ["dimension", "beta", "alpha"]
    })

    def __init__(self, *, hyperparams: SDNE_Hyperparams) -> None:
        super(SDNE, self).__init__(hyperparams=hyperparams)
        # nu1 = 1e-6, nu2=1e-6, K=3,n_units=[500, 300,], rho=0.3, n_iter=30, xeta=0.001,n_batch=500

    def _make_adjacency(self, sources, dests, num_nodes=None, tensor=True):
        if num_nodes is None:
            num_nodes = len(self.node_encode.classes_)
        if tensor:
            try:
                adj = tf.SparseTensor(
                    [[sources.values[i, 0], dests.values[i, 0]]
                     for i in range(sources.values.shape[0])],
                    [1.0 for i in range(sources.values.shape[0])],
                    dense_shape=(num_nodes, num_nodes))
            except:
                adj = tf.SparseTensor([[sources[i], dests[i]]
                                       for i in range(sources.shape[0])],
                                      [1.0 for i in range(sources.shape[0])],
                                      dense_shape=(num_nodes, num_nodes))
        else:
            try:
                adj = csr_matrix(
                    ([1.0 for i in range(sources.values.shape[0])], ([
                        sources.values[i, 0]
                        for i in range(sources.values.shape[0])
                    ], [
                        dests.values[i, 0]
                        for i in range(sources.values.shape[0])
                    ])),
                    shape=(num_nodes, num_nodes))
            except:
                adj = csr_matrix(
                    ([1.0 for i in range(sources.shape[0])],
                     ([sources[i] for i in range(sources.shape[0])
                       ], [dests[i] for i in range(sources.shape[0])])),
                    shape=(num_nodes, num_nodes))
        return adj

    def _get_source_dest(self, edges_df, source_types=None, dest_types=None):

        if source_types is None:
            source_types = (
                'https://metadata.datadrivendiscovery.org/types/EdgeSource',
                'https://metadata.datadrivendiscovery.org/types/DirectedEdgeSource',
                'https://metadata.datadrivendiscovery.org/types/UndirectedEdgeSource',
                'https://metadata.datadrivendiscovery.org/types/SimpleEdgeSource',
                'https://metadata.datadrivendiscovery.org/types/MultiEdgeSource'
            )

        sources = get_columns_of_type(edges_df, source_types)

        if dest_types is None:
            dest_types = (
                'https://metadata.datadrivendiscovery.org/types/EdgeTarget',
                'https://metadata.datadrivendiscovery.org/types/DirectedEdgeTarget',
                'https://metadata.datadrivendiscovery.org/types/UndirectedEdgeTarget',
                'https://metadata.datadrivendiscovery.org/types/SimpleEdgeTarget',
                'https://metadata.datadrivendiscovery.org/types/MultiEdgeTarget'
            )
        dests = get_columns_of_type(edges_df, dest_types)

        return sources, dests

    def _parse_inputs(self, inputs: Input, return_all=False):
        try:
            learning_id, learning_df = get_resource(inputs, 'learningData')
        except:
            pass
        try:  # resource id, resource
            nodes_id, nodes_df = get_resource(inputs, '0_nodes')
        except:
            try:
                nodes_id, nodes_df = get_resource(inputs, 'nodes')
            except:
                nodes_df = learning_df
        try:
            edges_id, edges_df = get_resource(inputs, '0_edges')
        except:
            try:
                edges_id, edges_df = get_resource(inputs, 'edges')
            except:
                edges_id, edges_df = get_resource(inputs, '1')

        try:
            print("LEANRING DF ", learning_df)
            print("NODES DF ", nodes_df)
            print("EDGES DF ", edges_df)
        except:
            pass

        self.node_encode = LabelEncoder()
        sources, dests = self._get_source_dest(edges_df)
        sources = sources.astype(np.int32)
        dests = dests.astype(np.int32)
        to_fit = np.sort(
            np.concatenate([sources.values, dests.values],
                           axis=-1).astype(np.int32).ravel())

        self.node_encode.fit(to_fit)  #nodes_df[id_col].values)

        sources[sources.columns[0]] = self.node_encode.transform(
            sources.values.astype(np.int32))
        dests[dests.columns[0]] = self.node_encode.transform(
            dests.values.astype(np.int32))

        other_training_data = self._make_adjacency(sources,
                                                   dests,
                                                   tensor=False)
        # parenthesize the tuple: without the parentheses the trailing comma made this always return a 4-tuple
        return other_training_data if not return_all else (
            other_training_data, learning_df, nodes_df, edges_df)

    def set_training_data(self, *, inputs: Input) -> None:

        training_data = self._parse_inputs(inputs)
        if isinstance(training_data, tuple):
            training_data = training_data[0]

        self.training_data = networkx.from_scipy_sparse_matrix(training_data)

        self.fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:

        if self.fitted:
            return CallResult(None, True, 1)

        args = {}
        args['nu1'] = 1e-6
        args['nu2'] = 1e-6
        args['K'] = self.hyperparams['depth']
        args['n_units'] = [
            500,
            300,
        ]
        args['rho'] = 0.3
        args['n_iter'] = self.hyperparams['epochs']
        args['xeta'] = self.hyperparams['lr']  #0.0005
        args['n_batch'] = 100  #500
        self._args = args

        dim = self.hyperparams['dimension']
        alpha = self.hyperparams['alpha']
        beta = self.hyperparams['beta']
        #self._model = sdne.SDNE(d = dim,
        self._sdne = sdne.SDNE(d=dim, alpha=alpha, beta=beta, **args)
        #self._model.learn_embedding(graph = self.training_data)
        self._sdne.learn_embedding(graph=self.training_data)
        self._model = self._sdne._model

        make_keras_pickleable()
        self.fitted = True
        return CallResult(None, True, 1)

    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        #make_keras_pickleable()
        produce_data, learning_df, nodes_df, edges_df = self._parse_inputs(
            inputs, return_all=True)
        if self.fitted:
            result = self._sdne._Y  #produce( )#_Y
        else:
            dim = self.hyperparams['dimension']
            alpha = self.hyperparams['alpha']
            beta = self.hyperparams['beta']
            # `args` was previously undefined on this path (produce before fit);
            # reuse the architecture arguments built in fit(), or rebuild the same defaults
            args = getattr(self, '_args', None) or {
                'nu1': 1e-6, 'nu2': 1e-6, 'K': self.hyperparams['depth'],
                'n_units': [500, 300], 'rho': 0.3,
                'n_iter': self.hyperparams['epochs'],
                'xeta': self.hyperparams['lr'], 'n_batch': 100}
            self._sdne = sdne.SDNE(d=dim, alpha=alpha, beta=beta, **args)

            produce_data = networkx.from_scipy_sparse_matrix(produce_data)
            self._sdne.learn_embedding(graph=produce_data)
            self._model = self._sdne._model
            result = self._sdne._Y

        target_types = [
            'https://metadata.datadrivendiscovery.org/types/TrueTarget',
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
        ]
        if self.hyperparams['return_list']:
            result_np = container.ndarray(result, generate_metadata=True)
            return_list = d3m_List([result_np, inputs[1], inputs[2]],
                                   generate_metadata=True)
            return CallResult(return_list, True, 1)
        else:
            learn_df = d3m_DataFrame(learning_df, generate_metadata=True)
            learn_df = get_columns_not_of_type(learn_df, target_types)

            learn_df = learn_df.remove_columns(
                [learn_df.columns.get_loc('nodeID')])
            #learn_df = learn_df.drop('nodeID', axis = 'columns')

            result_df = d3m_DataFrame(result, generate_metadata=True)
            result_df = result_df.loc[result_df.index.isin(
                learning_df['d3mIndex'].values)]

            for column_index in range(result_df.shape[1]):
                col_dict = dict(
                    result_df.metadata.query((ALL_ELEMENTS, column_index)))
                col_dict['structural_type'] = type(1.0)
                col_dict['name'] = str(learn_df.shape[1] + column_index)
                col_dict['semantic_types'] = (
                    'http://schema.org/Float',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')

                result_df.metadata = result_df.metadata.update(
                    (ALL_ELEMENTS, column_index), col_dict)
            result_df.index = learn_df.index.copy()

            output = utils.append_columns(learn_df, result_df)
            #output.set_index('d3mIndex', inplace=True)
            return CallResult(output, True, 1)

    def multi_produce(self,
                      *,
                      produce_methods: typing.Sequence[str],
                      inputs: Input,
                      timeout: float = None,
                      iterations: int = None) -> MultiCallResult:
        return self._multi_produce(produce_methods=produce_methods,
                                   timeout=timeout,
                                   iterations=iterations,
                                   inputs=inputs)

    def fit_multi_produce(self,
                          *,
                          produce_methods: typing.Sequence[str],
                          inputs: Input,
                          timeout: float = None,
                          iterations: int = None) -> MultiCallResult:
        return self._fit_multi_produce(produce_methods=produce_methods,
                                       timeout=timeout,
                                       iterations=iterations,
                                       inputs=inputs)

    def get_params(self) -> SDNE_Params:
        return SDNE_Params(fitted=self.fitted,
                           model=self._sdne,
                           node_encode=self.node_encode)

    def set_params(self, *, params: SDNE_Params) -> None:
        self.fitted = params['fitted']
        self._sdne = params['model']
        self.node_encode = params['node_encode']
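
For intuition, here is the non-tensor branch of _make_adjacency reduced to a standalone sketch: encoded edge endpoints become the row/column indices of a sparse num_nodes x num_nodes adjacency matrix with one weight-1.0 entry per edge. The tiny edge list is illustrative only; the DataFrame unwrapping and LabelEncoder re-indexing done in _parse_inputs are omitted.

import numpy as np
from scipy.sparse import csr_matrix

sources = np.array([0, 0, 1, 2])  # encoded edge sources
dests = np.array([1, 2, 2, 3])    # encoded edge targets
num_nodes = 4

# one 1.0 entry per edge at position (source, dest)
adj = csr_matrix((np.ones(len(sources)), (sources, dests)),
                 shape=(num_nodes, num_nodes))
print(adj.toarray())
# set_training_data then turns such a matrix into a graph via networkx.from_scipy_sparse_matrix(adj)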
Example n. 9
class CorexText(UnsupervisedLearnerPrimitiveBase[Input, Output,
                                                 CorexText_Params,
                                                 CorexText_Hyperparams]
                ):  #(Primitive):
    """
    Learns latent factors / topics which explain the most multivariate information in bag of words representations of documents. Returns learned topic scores for each document. Also supports hierarchical models and 'anchoring' to encourage topics to concentrate around desired words.
    """
    metadata = PrimitiveMetadata({
        "schema":
        "v0",
        "id":
        "18e63b10-c5b7-34bc-a670-f2c831d6b4bf",
        "version":
        "1.0.0",
        "name":
        "CorexText",
        "description":
        "Learns latent factors / topics which explain the most multivariate information in bag of words representations of documents. Returns learned topic scores for each document. Also supports hierarchical models and 'anchoring' to encourage topics to concentrate around desired words.",
        "python_path":
        "d3m.primitives.dsbox.CorexText",
        "original_python_path":
        "corextext.corex_text.CorexText",
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/brekelma/dsbox_corex"]
        },
        "installation": [{
            'type':
            'PIP',
            'package_uri':
            'git+https://github.com/brekelma/dsbox_corex.git@8672da14a7f2e00ea488da460ad68ef0799a9532#egg=dsbox-corex'
        }],
        "algorithm_types":
        ["EXPECTATION_MAXIMIZATION_ALGORITHM", "LATENT_DIRICHLET_ALLOCATION"],
        "primitive_family":
        "FEATURE_CONSTRUCTION",
        "hyperparams_to_tune": ["n_hidden", "chunking", "max_df", "min_df"]
    })

    #"preconditions": [],
    #      "effects": [],

    def __init__(
        self, *, hyperparams: CorexText_Hyperparams
    ) -> None:  #, random_seed : int =  0, docker_containers: typing.Dict[str, DockerContainer] = None)

        super().__init__(
            hyperparams=hyperparams
        )  #, random_seed = random_seed, docker_containers = docker_containers)

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None
            ) -> CallResult[None]:  #X : Sequence[Input]):
        #self.columns = list(X)
        #X_ = X[self.columns].values # useless if only desired columns are passed
        if self.fitted:
            return CallResult(None, True, 0)

        if not hasattr(self, 'model') or self.model is None:
            self.model = Corex(n_hidden=self.hyperparams['n_hidden'],
                               max_iter=iterations,
                               seed=self.random_seed)  #, **kwargs)

        if not hasattr(self, 'training_inputs'):
            raise ValueError("Missing training data.")

        if not hasattr(self, 'get_text'):
            raise ValueError("Missing get_text parameter")
        else:
            if not self.get_text or self.hyperparams['chunking'] > 0:
                self.bow = TfidfVectorizer(
                    input='content',
                    decode_error='ignore',
                    max_df=self.hyperparams['max_df'],
                    min_df=self.hyperparams['min_df'],
                    max_features=self.hyperparams['max_features'])
            else:
                self.bow = TfidfVectorizer(
                    input='filename',
                    max_df=self.hyperparams['max_df'],
                    min_df=self.hyperparams['min_df'],
                    max_features=self.hyperparams['max_features'])

        if iterations is not None:
            self.max_iter = iterations
            self.model.max_iter = self.max_iter
        else:
            self.max_iter = 250
            self.model.max_iter = self.max_iter

        if self.hyperparams['chunking'] == 0:
            bow = self.bow.fit_transform(self.training_inputs.values.ravel(
            )) if not self.get_text else self.bow.fit_transform(
                self._get_raw_inputs())
        else:
            inp, self.chunks = self._read_and_chunk(
                self.training_inputs.values.ravel(), read=self.get_text)
            bow = self.bow.fit_transform(inp)

        self.latent_factors = self.model.fit_transform(bow)

        if self.hyperparams['chunking'] > 0:
            self.latent_factors = self._unchunk(self.latent_factors,
                                                self.chunks)

        self.fitted = True
        return CallResult(None, True, self.max_iter)

    def produce(
        self,
        *,
        inputs: Input,
        timeout: float = None,
        iterations: int = None
    ) -> CallResult[Output]:  # TAKES IN DF with index column
        #self.columns = list(X)
        #X_ = X[self.columns].values # useless if only desired columns are passed
        if iterations is not None:
            self.max_iter = iterations
            self.model.max_iter = self.max_iter
        else:
            self.max_iter = 250
            self.model.max_iter = self.max_iter

        if not self.fitted:
            if self.hyperparams['chunking'] == 0:
                bow = self.bow.fit_transform(inputs.values.ravel(
                )) if not self.get_text else self.bow.fit_transform(
                    self._get_raw_inputs(inputs=inputs,
                                         data_path=self.data_path))
            else:
                inp, self.chunks = self._read_and_chunk(
                    inputs.values.ravel(),
                    data_path=self.data_path,
                    read=self.get_text)
                bow = self.bow.fit_transform(inp)
            self.latent_factors = self.model.fit_transform(bow).astype(float)

            self.fitted = True
        else:
            if self.hyperparams['chunking'] == 0:
                bow = self.bow.transform(inputs.values.ravel(
                )) if not self.get_text else self.bow.transform(
                    self._get_raw_inputs(inputs=inputs,
                                         data_path=self.data_path))
            else:
                inp, self.chunks = self._read_and_chunk(
                    inputs.values.ravel(),
                    data_path=self.data_path,
                    read=self.get_text)
                bow = self.bow.transform(inp)
                #print('bow shape ', bow.shape)

            self.latent_factors = self.model.transform(bow).astype(float)

        if self.hyperparams['chunking'] > 0:
            self.latent_factors = self._unchunk(self.latent_factors,
                                                self.chunks)

        # TO DO : Incorporate timeout, max_iter
        return CallResult(self.latent_factors, True, self.max_iter)

    def _fit_transform(
        self,
        inputs: Input,
        timeout: float = None,
        iterations: int = None
    ) -> Sequence[Output]:  # TAKES IN DF with index column
        #self.columns = list(X)
        #X_ = X[self.columns].values # useless if only desired columns are passed

        if iterations is not None:
            self.max_iter = iterations
            self.model.max_iter = self.max_iter

        if self.hyperparams['chunking'] == 0:
            bow = self.bow.fit_transform(inputs.values.ravel(
            )) if not self.get_text else self.bow.fit_transform(
                self._get_raw_inputs(inputs=inputs))
        else:
            inp, self.chunks = self._read_and_chunk(inputs.values.ravel(),
                                                    read=self.get_text)
            bow = self.bow.fit_transform(inp)
        self.latent_factors = self.model.fit_transform(bow)

        if self.hyperparams['chunking'] > 0:
            self.latent_factors = self._unchunk(self.latent_factors,
                                                self.chunks)

        self.fitted = True
        return self.latent_factors

    def _get_raw_inputs(self,
                        inputs: Input = None,
                        data_path=None) -> np.ndarray:
        print_ = True
        raw_inputs = self.training_inputs.values if inputs is None else inputs.values
        inp = self.training_inputs.values if inputs is None else inputs.values
        if data_path is not None:
            for idx, val in np.ndenumerate(inp):
                raw_inputs[idx] = os.path.join(data_path, val)
        elif self.data_path is not None:
            for idx, val in np.ndenumerate(inp):
                raw_inputs[idx] = os.path.join(self.data_path, val)
        else:
            warn('Data_path param not passed.')

        return raw_inputs.ravel()

    def _read_and_chunk(self,
                        inputs: Input = None,
                        data_path: str = None,
                        read: bool = True) -> Tuple[np.ndarray, np.ndarray]:
        # read data into documents / text
        # chunk_array = np.zeros((inputs.shape[0],))
        chunked_docs = []
        chunk_list = []
        overall_j = 0
        for i in range(inputs.shape[0]):
            if read:
                if data_path is None:
                    file_path = os.path.join(self.data_path, inputs[i])
                else:
                    file_path = os.path.join(data_path, inputs[i])
                with open(file_path, 'rb') as fn:
                    doc = fn.read()
                doc = "".join(map(chr, doc))
                doc_tokens = re.compile(r"(?u)\b\w\w+\b").findall(
                    doc)  # list of strings
            else:
                doc_tokens = inputs[i]

            j = 0

            while (j + 2) * self.hyperparams['chunking'] <= len(doc_tokens):
                new_chunked_str = " ".join(
                    doc_tokens[j * self.hyperparams['chunking']:(j + 1) *
                               self.hyperparams['chunking']])
                chunked_docs.append(new_chunked_str)
                j = j + 1

            new_chunked_str = " ".join(
                doc_tokens[j * self.hyperparams['chunking']:])
            chunked_docs.append(new_chunked_str)
            overall_j += (j + 1)
            chunk_list.append(overall_j)
        # all docs in 1 array, list indicating changepoints of documents
        return np.array(chunked_docs), np.array(chunk_list)

    def _unchunk(self, transformed: np.ndarray, chunk_array: np.ndarray):
        # transformed is samples x topics
        j = 0
        return_val = None
        # hacky?

        chunk_array = np.append(chunk_array,
                                np.array([transformed.shape[0] - 1]),
                                axis=0)

        temp = np.zeros((transformed.shape[1], ))
        for i in range(transformed.shape[0]):
            #print('row ', i, '/', transformed.shape[0], ' chunk: ', j, ' chunk_array[j]: ', chunk_array[j])
            if i < chunk_array[j] and i < transformed.shape[0] - 1:
                #temp = np.maximum(temp, transformed[i,:])
                temp = temp + transformed[i, :]
            else:
                divisor = (chunk_array[j] - chunk_array[j - 1] +
                           1) if j > 0 else chunk_array[j]

                temp = temp / float(divisor)
                temp = temp[np.newaxis, :]  # 1 x features

                if return_val is None:
                    return_val = temp
                else:
                    return_val = np.concatenate([return_val, temp], axis=0)

                j = j + 1
                temp = np.zeros((transformed.shape[1], ))
                #print(i, return_val.shape)
        return return_val

    def set_training_data(self, *, inputs: Input, outputs: Output) -> None:
        self.training_inputs = inputs
        self.fitted = False

    def get_params(self) -> CorexText_Params:
        return CorexText_Params(model=self.model,
                                bow=self.bow,
                                get_text=self.get_text,
                                data_path=self.data_path)
        #fitted = self.fitted, training_inputs = self.training_inputs)

    def set_params(self, *, params: CorexText_Params) -> None:
        self.model = params['model']
        self.bow = params['bow']
        self.get_text = params['get_text']
        self.data_path = params['data_path']
        #self.fitted = params.fitted
        #self.training_inputs = params.training_inputs

    def _annotation(self):
        # cache on a separate attribute: assigning to self._annotation would
        # shadow this method, so the original check could never succeed
        if getattr(self, '_annotation_cache', None) is not None:
            return self._annotation_cache
        annotation = Primitive()
        annotation.name = 'CorexText'
        annotation.task = 'FeatureExtraction'
        annotation.learning_type = 'UnsupervisedLearning'
        annotation.ml_algorithm = ['Dimension Reduction']
        annotation.tags = ['feature_extraction', 'text']
        self._annotation_cache = annotation
        return annotation

    def _get_feature_names(self):
        return [
            'CorexText_' + str(i) for i in range(self.hyperparams['n_hidden'])
        ]
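
The chunking path above splits each document into fixed-size token chunks, scores every chunk, and then collapses the chunk scores back to one row per document. Below is a self-contained sketch of that collapse step, assuming chunk boundaries are recorded the way _read_and_chunk records them (a cumulative chunk count after each document); the toy scores are illustrative, and a plain mean stands in for _unchunk's incremental averaging.

import numpy as np

chunk_scores = np.array([[1.0, 0.0],
                         [3.0, 2.0],
                         [2.0, 2.0],
                         [4.0, 6.0]])  # 4 chunks x 2 topics
chunk_ends = np.array([3, 4])          # document 0 owns chunks 0-2, document 1 owns chunk 3

starts = np.r_[0, chunk_ends[:-1]]
doc_scores = np.stack([chunk_scores[s:e].mean(axis=0)
                       for s, e in zip(starts, chunk_ends)])
print(doc_scores)  # one averaged topic row per document: [[2., 1.33...], [4., 6.]]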
Example n. 10
class Learner(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,
                                             Hyperparams]):

    metadata = PrimitiveMetadata({
        'algorithm_types': [
            PrimitiveAlgorithmType.ADAPTIVE_ALGORITHM,
        ],
        'id':
        'c1c54b03-717d-4e6b-b043-8fc93364b92e',
        'keywords': ['learner'],
        'name':
        "Learner",
        'primitive_family':
        PrimitiveFamily.LEARNER,
        'python_path':
        'd3m.primitives.mit_primitives.Learner',
        'source': {
            'name': 'MIT_FeatureLabs',
        },
        'version':
        '0.0.3-dev',
        'installation': [{
            'type':
            PrimitiveInstallationType.PIP,
            'package_uri':
            ('git+https://github.com/HDI-Project/mit-primitives.git@'
             '{git_commit}#egg=mit-primitives').format(
                 git_commit=utils.current_git_commit(os.path.dirname(
                     __file__)))
        }],
    })

    def get_params(self) -> Params:
        return self.params

    def set_params(self, *, params: Params) -> None:
        if not hasattr(self, 'params'):
            self.params = params

        else:
            self.params.update(params)

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self.inputs = inputs
        self.outputs = outputs

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        learner_params = self.hyperparams['learner_params']
        learner = mlpipeline.MLPipeline(**learner_params)

        fit_params = self.params['fit_params']
        predict_params = self.params['predict_params']
        learner.fit(
            self.inputs,
            self.outputs,
            fit_params=fit_params,
            predict_params=predict_params,
        )

        self.params['learner'] = learner

        return CallResult(None)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        predict_params = self.params['predict_params']
        results = self.params['learner'].predict(inputs,
                                                 predict_params=predict_params)

        return CallResult(results)

    def to_dict(self) -> dict:
        return self.params['learner'].to_dict()