class SequentialModel(SupervisedLearnerPrimitiveBase[Input, Output, SM_Params, SM_Hyperparams]):
    metadata = PrimitiveMetadata({
        "schema": "v0",
        "id": "4d6cbfca-5ac4-4e92-a3de-dc4a47008649",
        "version": "1.0.2",
        "name": "SequentialModel",
        "description": "Uses Sequential from Keras to do predictions with previously fine-tuned hyperparams.",
        "python_path": "d3m.primitives.dsbox.SequentialModel",
        "original_python_path": "sequential_model.SequentialModel",
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/serbanstan/dsbox_sm"]
        },
        "installation": [cfg_.INSTALLATION],
        "algorithm_types": ["MULTILAYER_PERCEPTRON"],
        "primitive_family": "CLASSIFICATION",
        "hyperparams_to_tune": ["reg_val"]
    })

    def __init__(self, *, hyperparams: SM_Hyperparams) -> None:
        super().__init__(hyperparams=hyperparams)

    def set_training_data(self, *, inputs: Input, outputs: Output) -> None:
        # initialize the default training parameters
        self.validateSplitRate = 0.2
        self.epochs = 30
        self.batchSize = 10

        # inputs arrive in DataFrame format
        self.inputDim = inputs.shape[1]
        self.kindOfcrossEntropy = 'categorical_crossentropy'

        # convert the data to ndarray format
        self.training_inputs = inputs.values
        self.training_outputs = to_categorical(self._create_mapping(outputs))

        self.fitted = False

    def fit(self) -> CallResult[None]:
        make_keras_picklable()

        modelSub = Sequential()
        modelSub.add(Dense(100,
                           input_dim=self.inputDim,
                           kernel_regularizer=regularizers.l2(self.hyperparams['reg_val']),
                           activation='tanh',
                           kernel_constraint=maxnorm(2)))
        modelSub.add(Dense(self.training_outputs.shape[1],
                           kernel_regularizer=regularizers.l2(self.hyperparams['reg_val']),
                           activation='sigmoid'))

        # functional-API equivalent, kept for reference:
        # inp = InputK(shape=(self.inputDim,))
        # x = Dense(100, kernel_regularizer=regularizers.l2(self.hyperparams['reg_val']), activation='tanh', kernel_constraint=maxnorm(2))(inp)
        # x = Dense(self.training_outputs.shape[1], kernel_regularizer=regularizers.l2(self.hyperparams['reg_val']), activation='sigmoid')(x)
        # modelSub = keras.models.Model(inputs=inp, outputs=x)

        optimizer = Adam(lr=0.001)
        modelSub.compile(loss=self.kindOfcrossEntropy, optimizer=optimizer, metrics=['accuracy'])

        self.model = modelSub
        self.model.fit(self.training_inputs,
                       self.training_outputs,
                       validation_split=self.validateSplitRate,
                       epochs=self.epochs,
                       batch_size=self.batchSize)

        self.fitted = True
        return CallResult(None, True, 1)

    def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
        # if fit was never called, pass the inputs through unchanged
        if not self.fitted:
            return CallResult(inputs, True, 1)

        prediction = container.DataFrame(self._inverse_mapping(self.model.predict_classes(inputs.values)))
        prediction.index = copy.deepcopy(inputs.index)
        return CallResult(prediction, True, 1)

    def _create_mapping(self, vec):
        # create a mapping from label value to integer index (0-based, so
        # to_categorical does not emit an unused leading class column)
        self.mapping = dict()
        self.inverse_map = dict()

        res = []
        mapping_index = 0
        for v in vec.values.ravel():
            if v in self.mapping:
                res.append(self.mapping[v])
            else:
                self.mapping[v] = mapping_index
                self.inverse_map[mapping_index] = v
                res.append(mapping_index)
                mapping_index += 1
        return res

    def _inverse_mapping(self, vec):
        return [self.inverse_map[x] for x in vec]

    def get_params(self) -> SM_Params:
        if not self.fitted:
            raise ValueError("Fit not performed")
        return SM_Params(model_=self.model, inverse_map_=self.inverse_map)

    def set_params(self, *, params: SM_Params) -> None:
        self.model = params["model_"]
        self.inverse_map = params["inverse_map_"]

    def _annotation(self):
        # cache on a separate attribute; assigning to self._annotation would
        # shadow this method
        if getattr(self, '_annotation_obj', None) is not None:
            return self._annotation_obj
        annotation = Primitive()
        annotation.name = 'SequentialModel'
        annotation.task = 'Classification'
        annotation.learning_type = 'SupervisedLearning'
        annotation.ml_algorithm = ['Keras Sequential']
        annotation.tags = ['multilayer_perceptron']
        self._annotation_obj = annotation
        return annotation
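
# Hedged example (not part of the primitive API): a self-contained sketch of
# the label round-trip that _create_mapping / _inverse_mapping implement.
# `to_categorical` matches the import this module already uses; the toy
# labels are assumptions for illustration.
def _example_label_round_trip():
    import numpy as np
    labels = ['cat', 'dog', 'cat', 'bird']
    mapping, inverse, indices = {}, {}, []
    for v in labels:  # 0-based indexing, mirroring _create_mapping above
        if v not in mapping:
            mapping[v] = len(mapping)
            inverse[mapping[v]] = v
        indices.append(mapping[v])
    one_hot = to_categorical(indices)  # shape (4, 3): one column per class
    recovered = [inverse[i] for i in np.argmax(one_hot, axis=1)]
    assert recovered == labels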
class CorexSAE(SupervisedLearnerPrimitiveBase[Input, Output, CorexSAE_Params, CorexSAE_Hyperparams]):
    metadata = PrimitiveMetadata({
        "schema": "v0",
        "id": "6c95166f-434a-435d-a3d7-bce8d7238061",
        "version": "1.0.0",
        "name": "CorexSupervised",
        "description": "Autoencoder implementation of Corex / Information Bottleneck",
        "python_path": "d3m.primitives.dsbox.CorexSupervised",
        "original_python_path": "corexsae.corex_sae.CorexSAE",
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/brekelma/dsbox_corex"]
        },
        "installation": [{
            'type': 'PIP',
            'package_uri': 'git+https://github.com/brekelma/dsbox_corex.git@7381c3ed2d41a8dbe96bbf267a915a0ec48ee397#egg=dsbox-corex'
        }],
        "algorithm_types": ["EXPECTATION_MAXIMIZATION_ALGORITHM"],
        "primitive_family": "CLASSIFICATION",
        "hyperparams_to_tune": ["label_beta", "epochs"]
    })

    def __init__(self, *, hyperparams: CorexSAE_Hyperparams) -> None:
        super().__init__(hyperparams=hyperparams)

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        # create the Keras architecture
        self._latent_dims = [100, 100, 20]
        self._decoder_dims = list(reversed(self._latent_dims[:-1]))

        # training arguments
        self._activation = 'softplus'
        self._optimizer = Adam(.001)
        self._batch = 100
        self._noise = 'add'
        self._anneal_sched = None

        # iterations overrides the epochs hyperparam when given
        self._epochs = iterations if iterations is not None else self.hyperparams["epochs"]

        x = Input(shape=(self.training_inputs.shape[-1],))
        t = x
        for i in range(len(self._latent_dims[:-1])):
            t = Dense(self._latent_dims[i], activation=self._activation)(t)

        if self._noise == 'add' or self._noise == 'vae':
            final_enc_act = 'linear'
            sample_function = vae_sample
        else:
            final_enc_act = 'linear'
            sample_function = ido_sample

        z_mean = Dense(self._latent_dims[-1], activation=final_enc_act, name='z_mean')(t)
        z_noise = Dense(self._latent_dims[-1], activation=final_enc_act, name='z_noise')(t)
        z_act = Lambda(sample_function, output_shape=(self._latent_dims[-1],))([z_mean, z_noise])

        t = z_act
        for i in range(len(self._decoder_dims)):
            t = Dense(self._decoder_dims[i], activation=self._activation)(t)

        label_act = 'softmax' if self._label_unique > 1 else 'linear'
        y_pred = Dense(self._label_unique, activation=label_act, name='y_pred')(t)

        if self._input_types:
            pass
        else:
            print("Purely Supervised Bottleneck")
            # no reconstruction layers

        outputs = []
        loss_functions = []
        loss_weights = []

        beta = Beta(name='beta', beta=self.hyperparams["label_beta"])(x)

        outputs.append(y_pred)
        if label_act == 'softmax':
            loss_functions.append(objectives.categorical_crossentropy)
        else:
            loss_functions.append(objectives.mean_squared_error)  # mse
        loss_weights.append(beta)

        self.model = Model(inputs=x, outputs=outputs)
        self.model.compile(optimizer=self._optimizer,
                           loss=loss_functions,
                           loss_weights=loss_weights)

        if self._anneal_sched:
            raise NotImplementedError
        else:
            self.model.fit(self.training_inputs,
                           [self.training_outputs] * len(outputs),
                           shuffle=True,
                           epochs=self._epochs,
                           batch_size=self._batch)
            # validation_data / early stopping not yet supported

        return CallResult(None, True, self._epochs)

    def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
        # takes in a DataFrame with an index column
        return CallResult(self.model.predict(inputs), True, 0)

    def set_training_data(self, *, inputs: Input, outputs: Output) -> None:
        self.training_inputs = inputs
        self.training_outputs = to_categorical(outputs, num_classes=np.unique(outputs).shape[0])
        self.fitted = False

        # TODO: data profiling to detect softmax / categorical (encoded) X or
        # labels Y, and binary data
        self._input_types = []
        self._label_unique = np.unique(outputs).shape[0]
        self._label_unique = 1 if self._label_unique > self.max_discrete_labels else self._label_unique

    def get_params(self) -> CorexSAE_Params:
        return CorexSAE_Params()

    def set_params(self, *, params: CorexSAE_Params) -> None:
        self.max_discrete_labels = params["max_discrete_labels"]
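
# Hedged sketch: `vae_sample` above is imported from elsewhere in this
# package; this is a minimal numpy illustration of the reparameterization
# trick such a function typically implements (assumption: the second tensor
# holds a log-variance).
def _example_vae_sample_np(z_mean, z_log_var, rng=None):
    import numpy as np
    rng = rng or np.random.default_rng(0)
    eps = rng.standard_normal(z_mean.shape)  # eps ~ N(0, I)
    # z = mu + sigma * eps keeps sampling differentiable w.r.t. mu and sigma
    return z_mean + np.exp(0.5 * z_log_var) * eps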
class CorexText(UnsupervisedLearnerPrimitiveBase[Input, Output, CorexText_Params, CorexText_Hyperparams]):
    """
    Learns latent factors / topics which explain the most multivariate information in bag of words
    representations of documents. Returns learned topic scores for each document. Also supports
    hierarchical models and 'anchoring' to encourage topics to concentrate around desired words.
    """

    metadata = PrimitiveMetadata({
        "schema": "v0",
        "id": "0c64ffd6-cb9e-49f0-b7cb-abd70a5a8261",
        "version": "1.0.0",
        "name": "CorexText",
        "description": "Learns latent factors / topics which explain the most multivariate information in bag of words representations of documents. Returns learned topic scores for each document. Also supports hierarchical models and 'anchoring' to encourage topics to concentrate around desired words.",
        #"python_path": "d3m.primitives.dsbox.corex_text.CorexText",
        "python_path": "d3m.primitives.feature_construction.corex_text.DSBOX",
        "original_python_path": "corextext.corex_text.CorexText",
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/brekelma/dsbox_corex"]
        },
        "installation": [cfg_.INSTALLATION],
        "algorithm_types": ["EXPECTATION_MAXIMIZATION_ALGORITHM", "LATENT_DIRICHLET_ALLOCATION"],
        "primitive_family": "FEATURE_CONSTRUCTION",
        "hyperparams_to_tune": ["n_hidden", "threshold", "n_grams", "max_df", "min_df"]
    })

    def __init__(self, *, hyperparams: CorexText_Hyperparams) -> None:
        super(CorexText, self).__init__(hyperparams=hyperparams)

    # store the data; the model and bag of words are created in fit
    def set_training_data(self, *, inputs: Input) -> None:
        self.training_data = inputs
        self.fitted = False

    # assumes input is a DataFrame and runs on the 'text' labeled columns
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        # if already fitted, do nothing
        if self.fitted:
            return CallResult(None, True, 1)

        self.training_data = self._process_files(self.training_data)

        text_attributes = DataMetadata.list_columns_with_semantic_types(
            self=self.training_data.metadata,
            semantic_types=["http://schema.org/Text"])
        all_attributes = DataMetadata.list_columns_with_semantic_types(
            self=self.training_data.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])
        categorical_attributes = DataMetadata.list_columns_with_semantic_types(
            self=self.training_data.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/CategoricalData"])

        # we want text columns that are attributes...
        self.text_columns = set(all_attributes).intersection(text_attributes)
        # ...but don't want to edit categorical columns...
        self.text_columns = set(self.text_columns) - set(categorical_attributes)
        # ...and we want the text columns as a list
        self.text_columns = list(self.text_columns)

        # if no text columns are present, don't do anything
        self.do_nothing = False
        if len(self.text_columns) == 0:
            self.fitted = True
            self.model = None
            self.bow = None
            self.do_nothing = True
            self.text_columns = None
            self.latent_factors = None
            self.max_iter = None
            return CallResult(None, True, 1)

        # instantiate a CorEx model and a bag-of-words model
        self.model = Corex(n_hidden=self.hyperparams['n_hidden'],
                           max_iter=iterations,
                           seed=self.random_seed)
        self.bow = TfidfVectorizer(decode_error='ignore',
                                   max_df=self.hyperparams['max_df'],
                                   min_df=self.hyperparams['min_df'])

        # set the number of iterations (for wrapper and underlying CorEx model)
        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 250
        self.model.max_iter = self.max_iter

        # concatenate the text columns row-wise
        concat_cols = None
        for column_index in self.text_columns:
            if concat_cols is not None:
                concat_cols = concat_cols.str.cat(self.training_data.iloc[:, column_index], sep=" ")
            else:
                concat_cols = copy.deepcopy(self.training_data.iloc[:, column_index])

        try:
            bow = self.bow.fit_transform(map(self._get_ngrams, concat_cols.ravel()))
        except ValueError:
            # min_df can exceed max_df on small corpora; retry with min_df=0
            self.bow = TfidfVectorizer(decode_error='ignore',
                                       max_df=self.hyperparams['max_df'],
                                       min_df=0)
            bow = self.bow.fit_transform(map(self._get_ngrams, concat_cols.ravel()))
            print("[WARNING] Setting min_df to 0 to avoid ValueError")

        # choose between CorEx and the raw TF-IDF matrix
        if bow.shape[1] > self.hyperparams['threshold']:
            # use CorEx
            self.latent_factors = self.model.fit_transform(bow)
        else:
            # just use the bag-of-words representation
            self.latent_factors = pd.DataFrame(bow.todense())

        self.fitted = True
        return CallResult(None, True, 1)

    def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
        # if CorEx didn't run for any reason, just return the given dataset
        if self.do_nothing:
            return CallResult(inputs, True, 1)

        inputs = self._process_files(inputs)

        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 250
        self.model.max_iter = self.max_iter

        # concatenate the text columns row-wise
        concat_cols = None
        for column_index in self.text_columns:
            if concat_cols is not None:
                concat_cols = concat_cols.str.cat(inputs.iloc[:, column_index], sep=" ")
            else:
                concat_cols = copy.deepcopy(inputs.iloc[:, column_index])
        bow = self.bow.transform(map(self._get_ngrams, concat_cols.ravel()))

        # choose between CorEx and the raw TF-IDF matrix
        if bow.shape[1] > self.hyperparams['threshold']:
            # use CorEx
            self.latent_factors = self.model.transform(bow).astype(float)
        else:
            # just use the bag-of-words representation
            self.latent_factors = pd.DataFrame(bow.todense())

        # make the columns CorEx adds distinguishable from other columns:
        # remove the selected columns from the input and add the latent factors given by CorEx
        out_df = d3m_DataFrame(inputs, generate_metadata=True)
        self.latent_factors.columns = [str(out_df.shape[-1] + i) for i in range(self.latent_factors.shape[-1])]

        # create metadata for the CorEx columns
        corex_df = d3m_DataFrame(self.latent_factors, generate_metadata=True)
        for column_index in range(corex_df.shape[1]):
            col_dict = dict(corex_df.metadata.query((ALL_ELEMENTS, column_index)))
            col_dict['structural_type'] = type(1.0)
            # FIXME: assumes we apply CorEx only once per template, otherwise column names might duplicate
            col_dict['name'] = 'corex_' + str(out_df.shape[1] + column_index)
            col_dict['semantic_types'] = ('http://schema.org/Float',
                                          'https://metadata.datadrivendiscovery.org/types/Attribute')
            corex_df.metadata = corex_df.metadata.update((ALL_ELEMENTS, column_index), col_dict)

        # concatenate is --VERY-- slow without this next line
        corex_df.index = out_df.index.copy()
        out_df = utils.append_columns(out_df, corex_df)
        # remove the initial text columns from the df; doing this before CorEx can raise an empty-dataset error
        out_df = utils.remove_columns(out_df, self.text_columns)

        # TODO: incorporate timeout, max_iter
        return CallResult(out_df, True, 1)

    def _get_ngrams(self, text: str = None) -> str:
        punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
        try:
            words = text.translate(punctuation_table).lower().rsplit(" ")
        except AttributeError:
            # pandas Series fall back to the vectorized .str accessor
            words = text.str.translate(punctuation_table).lower().rsplit(" ")

        new_text = ""
        for i in range(len(words)):
            new_text += "".join(words[i:i + int(self.hyperparams['n_grams'])]) + " "
        return new_text

    # remove the FileName columns from the data frame and replace them with text
    def _process_files(self, inputs: Input):
        fn_attributes = DataMetadata.list_columns_with_semantic_types(
            self=inputs.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/FileName"])
        all_attributes = DataMetadata.list_columns_with_semantic_types(
            self=inputs.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])
        fn_columns = list(set(all_attributes).intersection(fn_attributes))

        # if no file name columns are detected, default to regular behavior
        if len(fn_columns) == 0:
            return inputs

        # create an empty DataFrame of the required size
        processed_cols = pd.DataFrame("",
                                      index=copy.deepcopy(inputs.index),
                                      columns=['text_files_' + str(i) for i in range(len(fn_columns))])

        for column_index in fn_columns:
            curr_column = copy.deepcopy(inputs.iloc[:, column_index])
            file_loc = inputs.metadata.query((ALL_ELEMENTS, column_index))['location_base_uris']
            file_loc = file_loc[0]   # take the first elem of the tuple
            file_loc = file_loc[7:]  # get rid of 'file://' prefix

            for row_index in range(curr_column.shape[0]):
                text_file = curr_column.iloc[row_index]
                file_path = file_loc + text_file

                with open(file_path, 'rb') as file:
                    doc = file.read()
                doc = "".join(map(chr, doc))
                doc_tokens = re.compile(r"(?u)\b\w\w+\b").findall(doc)  # list of strings

                processed_cols.iloc[row_index, fn_columns.index(column_index)] = " ".join(doc_tokens)

        # construct metadata for the newly generated columns
        processed_cols = d3m_DataFrame(processed_cols, generate_metadata=True)
        for column_index in range(processed_cols.shape[1]):
            col_dict = dict(processed_cols.metadata.query((ALL_ELEMENTS, column_index)))
            col_dict['structural_type'] = type("text")
            # FIXME: assumes we apply CorEx only once per template, otherwise column names might duplicate
            col_dict['name'] = 'processed_file_' + str(inputs.shape[1] + column_index)
            col_dict['semantic_types'] = ('http://schema.org/Text',
                                          'https://metadata.datadrivendiscovery.org/types/Attribute')
            processed_cols.metadata = processed_cols.metadata.update((ALL_ELEMENTS, column_index), col_dict)

        # concatenate the input with the newly created columns
        updated_inputs = utils.append_columns(inputs, processed_cols)
        # remove the initial FileName columns from the df; doing this before concatenating might raise an empty-dataset error
        updated_inputs = utils.remove_columns(updated_inputs, fn_columns)
        return updated_inputs

    def get_params(self) -> CorexText_Params:
        return CorexText_Params(fitted_=self.fitted,
                                model_=self.model,
                                bow_=self.bow,
                                do_nothing_=self.do_nothing,
                                text_columns_=self.text_columns,
                                latent_factors_=self.latent_factors,
                                max_iter_=self.max_iter)

    def set_params(self, *, params: CorexText_Params) -> None:
        self.fitted = params['fitted_']
        self.model = params['model_']
        self.bow = params['bow_']
        self.do_nothing = params['do_nothing_']
        self.text_columns = params['text_columns_']
        self.latent_factors = params['latent_factors_']
        self.max_iter = params['max_iter_']

    def _annotation(self):
        # cache on a separate attribute; assigning to self._annotation would shadow this method
        if getattr(self, '_annotation_obj', None) is not None:
            return self._annotation_obj
        annotation = Primitive()
        annotation.name = 'CorexText'
        annotation.task = 'FeatureExtraction'
        annotation.learning_type = 'UnsupervisedLearning'
        annotation.ml_algorithm = ['Dimension Reduction']
        annotation.tags = ['feature_extraction', 'text']
        self._annotation_obj = annotation
        return annotation
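
# Hedged sketch of the fit-time branch above: TF-IDF the concatenated text
# columns, then run CorEx only when the vocabulary exceeds `threshold`;
# otherwise the TF-IDF matrix itself is used as the feature representation.
# The toy corpus and threshold value are assumptions for illustration.
def _example_threshold_branch():
    from sklearn.feature_extraction.text import TfidfVectorizer
    docs = ["the cat sat", "the dog ran", "cats chase dogs"]
    bow = TfidfVectorizer(decode_error='ignore').fit_transform(docs)
    threshold = 5  # stands in for self.hyperparams['threshold']
    use_corex = bow.shape[1] > threshold  # mirrors the branch in fit()
    return 'corex' if use_corex else 'tfidf'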
class EchoLinearRegression(SupervisedLearnerPrimitiveBase[Input, Output, EchoRegressor_Params, EchoRegressor_Hyperparams]):
    """
    Least squares regression with an information capacity constraint from echo noise.

    Minimizes the objective function::

        E(y - y_hat)^2 + alpha * I(X, y)

    where X_bar = X + S * echo noise and y_hat = X_bar w + w_0, so that
    I(X, y) <= -log det S, with w the learned weights / coefficients.
    The objective simplifies and has an analytic solution.
    """

    metadata = PrimitiveMetadata({
        "schema": "v0",
        "id": "18e63b10-c5b7-34bc-a670-f2c831d6b4bf",
        "version": "1.0.0",
        "name": "EchoLinearRegression",
        "description": "Least squares regression with an information capacity constraint from echo noise.",
        #"python_path": "d3m.primitives.dsbox.echo.EchoRegressor",
        "python_path": "d3m.primitives.regression.echo_linear.DSBOX",
        "original_python_path": "echo_regressor.EchoLinearRegression",
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/brekelma/dsbox_corex"]
        },
        "installation": [cfg_.INSTALLATION],
        "algorithm_types": ["LINEAR_REGRESSION"],
        "primitive_family": "REGRESSION",
        "hyperparams_to_tune": ["alpha"]
    })

    def __init__(self, *, hyperparams: EchoRegressor_Hyperparams) -> None:
        super().__init__(hyperparams=hyperparams)

    # store the data; the model is created in fit
    def set_training_data(self, *, inputs: Input, outputs: Output) -> None:
        self.training_data = inputs
        self.labels = outputs
        self._output_columns = outputs.columns
        self.fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        # if already fitted, do nothing
        if self.fitted:
            return CallResult(None, True, 1)

        self.model = EchoRegression(alpha=self.hyperparams['alpha'],
                                    assume_diagonal=self.hyperparams['diagonal'])
        self.model.fit(self.training_data, self.labels)
        return CallResult(None, True, 1)

    def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
        preds = self.model.produce(inputs.values)

        # default the output column names if set_training_data never ran
        if not hasattr(self, '_output_columns'):
            self._output_columns = ['output'] * (preds.shape[1] if preds.ndim > 1 else 1)

        output = d3m_DataFrame(preds,
                               columns=self._output_columns,
                               source=self,
                               generate_metadata=True)
        # output.metadata = inputs.metadata.clear(source=self, for_value=output, generate_metadata=True)
        # output.metadata = self._add_target_semantic_types(metadata=output.metadata, target_names=self._output_columns, source=self)

        self._training_indices = [c for c in inputs.columns
                                  if isinstance(c, str) and 'index' in c.lower()]

        outputs = common_utils.combine_columns(return_result='new',
                                               add_index_columns=True,
                                               inputs=inputs,
                                               columns_list=[output],
                                               source=self,
                                               column_indices=self._training_indices)
        return CallResult(outputs, True, 1)

    def get_params(self) -> EchoRegressor_Params:
        return EchoRegressor_Params(fitted_=self.fitted,
                                    model_=self.model,
                                    output_columns_=self._output_columns)

    def set_params(self, *, params: EchoRegressor_Params) -> None:
        """Sets all learned parameters from a Params object."""
        self.fitted = params['fitted_']
        self.model = params['model_']
        self._output_columns = params['output_columns_']

    @classmethod
    def _add_target_semantic_types(cls,
                                   metadata: DataMetadata,
                                   source: typing.Any,
                                   target_names: List = None) -> DataMetadata:
        for column_index in range(metadata.query((ALL_ELEMENTS,))['dimension']['length']):
            metadata = metadata.add_semantic_type(
                (ALL_ELEMENTS, column_index),
                'https://metadata.datadrivendiscovery.org/types/Target',
                source=source)
            metadata = metadata.add_semantic_type(
                (ALL_ELEMENTS, column_index),
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget',
                source=source)
            if target_names:
                metadata = metadata.update((ALL_ELEMENTS, column_index),
                                           {'name': target_names[column_index]},
                                           source=source)
        return metadata
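
# For intuition only (an analogy, not the EchoRegression internals): the
# docstring above notes the objective has an analytic solution; with an
# isotropic capacity penalty, such objectives loosely resemble ridge
# regression, whose closed form is sketched here with assumed toy arguments.
def _example_ridge_analogy(X, y, alpha):
    import numpy as np
    d = X.shape[1]
    # solve (X^T X + alpha I) w = X^T y for the shrunken weights w
    return np.linalg.solve(X.T @ X + alpha * np.eye(d), X.T @ y)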
class EchoIB(SupervisedLearnerPrimitiveBase[Input, Output, EchoIB_Params, EchoIB_Hyperparams]):
    """
    Keras NN implementing the information bottleneck method with Echo Noise to calculate I(X:Z),
    where Z is also trained to maximize I(X:Y) for label Y. Control the tradeoff using the
    'beta' hyperparam.
    """

    metadata = PrimitiveMetadata({
        "schema": "v0",
        "id": "393f9de8-a5b9-4d92-aaff-8808d563b6c4",
        "version": "1.0.0",
        "name": "Echo",
        "description": "Autoencoder implementation of Information Bottleneck using Echo Noise: https://arxiv.org/abs/1904.07199. Can be used for feature construction with the task of classification or regression. Image featurization and collaborative filtering in prep. Returns embedding of size n_hidden, alongside predictions (which can be used with a downstream modeling primitive). Beta hyperparam controls regularization: Loss = task_loss - beta * I(X:Z). Returns learned features (# = n_hidden) and predictions of the training classifier if use_as_modeling = False.",
        "python_path": "d3m.primitives.feature_construction.echo_ib.DSBOX",
        "original_python_path": "echo_ib.EchoIB",
        "can_use_gpus": True,
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/brekelma/dsbox_corex"]
        },
        "installation": [cfg_.INSTALLATION],
        "algorithm_types": ["STOCHASTIC_NEURAL_NETWORK"],
        "primitive_family": "FEATURE_CONSTRUCTION",
        "hyperparams_to_tune": ["n_hidden", "beta", "epochs"]
    })

    def __init__(self, *, hyperparams: EchoIB_Hyperparams) -> None:
        super().__init__(hyperparams=hyperparams)

    def _extra_params(self, latent_dims=None, activation=None, lr=None, batch=None, epochs=None, noise=None):
        self._latent_dims = [self.hyperparams['units'],
                             self.hyperparams['units'],
                             self.hyperparams['n_hidden']]
        self._decoder_dims = list(reversed(self._latent_dims[:-1]))

        # training arguments
        self._activation = self.hyperparams['activation']
        self._lr = self.hyperparams['lr']
        self._optimizer = Adam(self._lr)
        self._batch = int(self.hyperparams['batch'])
        self._noise = 'echo'
        self._kl_warmup = 0        # .1 * kl reg for first _ epochs
        self._anneal_sched = None  # not supported
        self._echo_args = {
            'batch': self._batch,
            'd_max': self._batch,
            'nomc': True,
            'calc_log': True,
            'plus_sx': True
        }
        self._label_unique = 0
        # preserve attributes that set_training_data / set_params may have set already
        self.label_encode = getattr(self, 'label_encode', None)
        self.output_columns = getattr(self, 'output_columns', None)

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        make_keras_pickleable()

        # create the Keras architecture
        # TODO: architecture / layers as input hyperparameter
        self._extra_params()

        # iterations overrides the epochs hyperparam when given
        self._epochs = iterations if iterations is not None else self.hyperparams["epochs"]

        if self.hyperparams['convolutional']:
            # TODO: this branch leaves x / z_mean / z_noise undefined for the code below
            encoder = build_convolutional_encoder(self.hyperparams['n_hidden'])
            z_act = encoder.outputs[0]
        else:
            x = keras_layers.Input(shape=(self.training_inputs.shape[-1],))
            t = x
            for i in range(len(self._latent_dims[:-1])):
                t = Dense(self._latent_dims[i], activation=self._activation)(t)

            if self._noise == 'add' or self._noise == 'vae':
                z_mean_act = 'linear'
                z_var_act = 'linear'
                sample_function = vae_sample
                latent_loss = gaussian_kl_prior
            elif self._noise == 'ido' or self._noise == 'mult':
                z_mean_act = 'linear'
                z_var_act = 'linear'
                sample_function = ido_sample
                latent_loss = gaussian_kl_prior
            else:  # 'echo' and default
                z_mean_act = tanh64
                z_var_act = tf.math.log_sigmoid
                sample_function = echo_sample
                latent_loss = echo_loss

            z_mean = Dense(self._latent_dims[-1], activation=z_mean_act, name='z_mean')(t)
            z_noise = Dense(self._latent_dims[-1], activation=z_var_act,
                            name='z_noise', bias_initializer='ones')(t)
            z_act = Lambda(echo_sample,
                           arguments=self._echo_args,
                           output_shape=(self._latent_dims[-1],),
                           name='z_act')([z_mean, z_noise])

        z_inp = keras_layers.Input(shape=(self._latent_dims[-1],))
        t = z_act
        dt = z_inp
        for i in range(len(self._decoder_dims)):
            # share decoder layers between the training graph and the standalone decoder
            lyr = Dense(self._decoder_dims[i], name='decoder_' + str(i), activation=self._activation)
            t = lyr(t)
            dt = lyr(dt)

        if 'classification' in self.hyperparams['task'].lower() and self._label_unique > 0:
            label_act = 'softmax' if self._label_unique > 1 else 'sigmoid'
            lyr = Dense(self._label_unique, activation=label_act, name='y_pred')
            y_pred = lyr(t)
            y_p = lyr(dt)
        elif 'regression' in self.hyperparams['task'].lower() or self._label_unique == 0:
            label_act = 'linear'
            lyr = Dense(self.training_outputs.shape[-1], activation=label_act, name='y_pred')
            y_pred = lyr(t)
            y_p = lyr(dt)
        else:
            raise NotImplementedError("TASK TYPE SHOULD BE CLASSIFICATION OR REGRESSION")

        # TODO: add reconstruction layers and additional representation as in
        # https://arxiv.org/abs/1912.00646
        outputs = []
        dec_outputs = []
        loss_functions = []
        loss_weights = []

        outputs.append(y_pred)
        dec_outputs.append(y_p)
        if label_act == 'softmax':
            loss_functions.append(K.categorical_crossentropy)
        elif label_act == 'sigmoid':
            loss_functions.append(K.binary_crossentropy)
        else:
            loss_functions.append(tf.keras.losses.mean_squared_error)  # mse
        loss_weights.append(1)

        loss_tensor = Lambda(latent_loss)([z_mean, z_noise])
        outputs.append(loss_tensor)
        loss_functions.append(dim_sum)
        loss_weights.append(tf.Variable(self.hyperparams["beta"], dtype=tf.float32, trainable=False))

        # if self._kl_warmup is not None and self._kl_warmup > 0:
        #     my_callbacks = [ZeroAnneal(lw=self.hyperparams['beta'], index=-1, epochs=self._kl_warmup)]
        # else:
        my_callbacks = []
        my_callbacks.append(keras.callbacks.TerminateOnNaN())

        self.model = keras.models.Model(inputs=x, outputs=outputs)
        self.enc_model = keras.models.Model(inputs=x, outputs=z_act)
        self.dec_model = keras.models.Model(inputs=z_inp, outputs=dec_outputs[0])
        self.model.compile(optimizer=self._optimizer,
                           loss=loss_functions,
                           loss_weights=loss_weights)
        # get_session().run(tf.global_variables_initializer())

        if self._anneal_sched:
            raise NotImplementedError
        else:
            self.model.fit_generator(
                generator(self.training_inputs,
                          self.training_outputs,
                          target_len=len(outputs),
                          batch=self._batch),
                verbose=1,
                # callbacks=my_callbacks,
                steps_per_epoch=int(self.training_inputs.shape[0] / self._batch),
                epochs=int(self._epochs))

        self.fitted = True
        return CallResult(None, True, self._epochs)

    def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
        # takes in a DataFrame with an index column
        self._extra_params()
        modeling = self.hyperparams['use_as_modeling']

        # earlier versions sampled z and ran the decoder per batch via
        # K.function handles on self.model; enc_model / dec_model now do this directly
        inps = inputs.remove_columns([inputs.columns.get_loc('d3mIndex')])
        features = self.enc_model.predict(inps, batch_size=self._batch)
        predictions = self.dec_model.predict(features, batch_size=self._batch)

        if self.label_encode is not None:
            predictions = np.argmax(predictions, axis=-1)
            predictions = self.label_encode.inverse_transform(predictions)

        if modeling:
            output = d3m_DataFrame(predictions,
                                   columns=self.output_columns,
                                   generate_metadata=True,
                                   source=self)
            self._training_indices = [c for c in inputs.columns
                                      if isinstance(c, str) and 'index' in c.lower()]
            outputs = common_utils.combine_columns(return_result='new',
                                                   add_index_columns=True,
                                                   inputs=inputs,
                                                   columns_list=[output],
                                                   source=self,
                                                   column_indices=self._training_indices)
        else:
            out_df = d3m_DataFrame(inputs, generate_metadata=True)

            # concatenate learned features with predictions and create metadata
            features = np.array(features)
            if len(predictions.shape) < len(features.shape):
                predictions = np.expand_dims(predictions, axis=-1)
            constructed = np.concatenate([features, predictions], axis=-1)

            corex_df = d3m_DataFrame(constructed, generate_metadata=True)
            for column_index in range(corex_df.shape[1]):
                col_dict = dict(corex_df.metadata.query((ALL_ELEMENTS, column_index)))
                col_dict['structural_type'] = type(1.0)
                # FIXME: assumes we apply this primitive only once per template,
                # otherwise column names might duplicate
                col_dict['name'] = str(out_df.shape[1] + column_index)
                col_dict['semantic_types'] = ('http://schema.org/Float',
                                              'https://metadata.datadrivendiscovery.org/types/Attribute')
                corex_df.metadata = corex_df.metadata.update((ALL_ELEMENTS, column_index), col_dict)

            # concatenate is --VERY-- slow without this next line
            corex_df.index = out_df.index.copy()
            outputs = common_utils.append_columns(out_df, corex_df)

        return CallResult(outputs, True, 1)

    def set_training_data(self, *, inputs: Input, outputs: Output) -> None:
        inps = inputs.remove_columns([inputs.columns.get_loc('d3mIndex')])
        self.training_inputs = inps.values
        self.output_columns = outputs.columns

        if 'classification' in self.hyperparams['task'].lower():
            self._label_unique = np.unique(outputs.values).shape[0]
            if self._label_unique >= outputs.values.shape[0] - 1:
                # labels look continuous; treat as regression targets
                self._label_unique = 0
                self.training_outputs = outputs.values
            else:
                self.label_encode = LabelEncoder()
                self.training_outputs = to_categorical(
                    self.label_encode.fit_transform(outputs.values),
                    num_classes=np.unique(outputs.values).shape[0])
        else:
            self.training_outputs = outputs.values

        self.fitted = False

        # TODO: data profiling to detect softmax / categorical (encoded) X or labels Y,
        # binary data (e.g. np.logical_and(self.training_inputs >= 0, ...)), and
        # unique-value checks for determining discrete vs. continuous
        # self._input_types = []
        # self._label_unique = np.unique(outputs).shape[0]
        # self._label_unique = 1 if self._label_unique > self.max_discrete_labels else self._label_unique

    def get_params(self) -> EchoIB_Params:
        return EchoIB_Params(model=self.model,
                             model_weights=self.model.get_weights(),
                             fitted=self.fitted,
                             label_encode=self.label_encode,
                             output_columns=self.output_columns,
                             enc_model=self.enc_model,
                             dec_model=self.dec_model)

    def set_params(self, *, params: EchoIB_Params) -> None:
        self._extra_params()
        self.model = params['model']
        self.model.set_weights(params['model_weights'])
        self.enc_model = params['enc_model']
        self.dec_model = params['dec_model']
        self.fitted = params['fitted']
        self.label_encode = params['label_encode']
        self.output_columns = params['output_columns']

    @classmethod
    def _add_target_semantic_types(cls,
                                   metadata: DataMetadata,
                                   source: typing.Any,
                                   target_names: List = None) -> DataMetadata:
        for column_index in range(metadata.query((ALL_ELEMENTS,))['dimension']['length']):
            metadata = metadata.add_semantic_type(
                (ALL_ELEMENTS, column_index),
                'https://metadata.datadrivendiscovery.org/types/Target',
                source=source)
            metadata = metadata.add_semantic_type(
                (ALL_ELEMENTS, column_index),
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget',
                source=source)
            if target_names:
                metadata = metadata.update((ALL_ELEMENTS, column_index),
                                           {'name': target_names[column_index]},
                                           source=source)
        return metadata
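
# Hedged sketch: `generator` used in fit() above is defined elsewhere in this
# package; a compatible batch generator plausibly looks like the following,
# feeding the same labels to each of the model's `target_len` outputs (the
# latent-loss output's target is ignored by its dim_sum loss). Names and
# defaults here are assumptions.
def _example_generator(x, y, target_len=2, batch=100):
    import numpy as np
    n = x.shape[0]
    while True:
        idx = np.random.randint(0, n, size=batch)  # sample a random minibatch
        yield x[idx], [y[idx]] * target_len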
class CorexContinuous(UnsupervisedLearnerPrimitiveBase[Input, Output, CorexContinuous_Params, CorexContinuous_Hyperparams]):
    """
    Return components/latent factors that explain the most multivariate mutual information
    in the data under a Linear Gaussian model. For comparison, PCA returns components
    explaining the most variance in the data. Serves as DSBox 'wrapper' for
    https://github.com/gregversteeg/linearcorex
    """

    metadata = PrimitiveMetadata({
        "schema": "v0",
        "id": "d2d4fefc-0859-3522-91df-7e445f61a69b",
        "version": "1.0.0",
        "name": "CorexContinuous",
        "description": "Return components/latent factors that explain the most multivariate mutual information in the data under a Linear Gaussian model. For comparison, PCA returns components explaining the most variance in the data.",
        #"python_path": "d3m.primitives.dsbox.corex_continuous.CorexContinuous",
        "python_path": "d3m.primitives.feature_construction.corex_continuous.CorexContinuous",
        "original_python_path": "corexcontinuous.corex_continuous.CorexContinuous",
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/brekelma/dsbox_corex"]
        },
        "installation": [cfg_.INSTALLATION],
        "algorithm_types": ["EXPECTATION_MAXIMIZATION_ALGORITHM"],
        "primitive_family": "FEATURE_CONSTRUCTION",
        "preconditions": ["NO_MISSING_VALUES", "NO_CATEGORICAL_VALUES"],
        "hyperparams_to_tune": ["n_hidden"]
    })

    def __init__(self, *, hyperparams: CorexContinuous_Hyperparams) -> None:
        # additional Corex parameters are left at their defaults: see
        # github.com/gregversteeg/LinearCorex (tol=1e-5, anneal=True,
        # discourage_overlap=True, gaussianize='standard', gpu=False,
        # verbose=False, seed=None)
        super(CorexContinuous, self).__init__(hyperparams=hyperparams)

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self.fitted:
            return CallResult(None, True, 1)
        if not hasattr(self, 'training_inputs'):
            raise ValueError("Missing training data.")

        self._fit_transform(self.training_inputs, timeout, iterations)
        self.fitted = True
        # TODO: add support for max_iter / incomplete
        return CallResult(None, True, self.max_iter)

    def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
        self.columns = list(inputs)
        X_ = inputs[self.columns].values

        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 10000

        if not self.fitted:
            raise ValueError('Please fit before calling produce')

        self.latent_factors = self.model.transform(X_)

        out_df = d3m_DataFrame(inputs)
        corex_df = d3m_DataFrame(self.latent_factors)
        for column_index in range(corex_df.shape[1]):
            col_dict = dict(corex_df.metadata.query((mbase.ALL_ELEMENTS, column_index)))
            col_dict['structural_type'] = type(1.0)
            # FIXME: assumes we apply CorEx only once per template, otherwise column names might duplicate
            col_dict['name'] = 'corex_' + str(out_df.shape[1] + column_index)
            col_dict['semantic_types'] = ('http://schema.org/Float',
                                          'https://metadata.datadrivendiscovery.org/types/Attribute')
            corex_df.metadata = corex_df.metadata.update((mbase.ALL_ELEMENTS, column_index), col_dict)

        corex_df.index = out_df.index.copy()
        out_df = utils.append_columns(out_df, corex_df)
        return CallResult(out_df, True, self.max_iter)

    def _fit_transform(self, inputs: Input, timeout: float = None, iterations: int = None) -> Sequence[Output]:
        self.columns = list(inputs)
        X_ = inputs[self.columns].values

        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 10000

        if isinstance(self.hyperparams['n_hidden'], int):
            self.n_hidden = self.hyperparams['n_hidden']
        elif isinstance(self.hyperparams['n_hidden'], float):
            # interpret a float as a fraction of the input columns
            self.n_hidden = max(1, int(self.hyperparams['n_hidden'] * len(self.columns)))

        if not hasattr(self, 'model') or self.model is None:
            # silence the underlying model's console output while constructing it
            _stdout = sys.stdout
            null = open(os.devnull, 'w')
            sys.stdout = null
            self.model = corex_cont.Corex(n_hidden=self.n_hidden, max_iter=self.max_iter)
            sys.stdout = _stdout
            null.close()

        self.latent_factors = self.model.fit_transform(X_)
        self.fitted = True
        return self.latent_factors

    def set_training_data(self, *, inputs: Input) -> None:
        self.training_inputs = inputs
        self.fitted = False

    def get_params(self) -> CorexContinuous_Params:
        return CorexContinuous_Params(model=self.model)

    def set_params(self, *, params: CorexContinuous_Params) -> None:
        self.model = params['model']

    def _annotation(self):
        # cache on a separate attribute; assigning to self._annotation would shadow this method
        if getattr(self, '_annotation_obj', None) is not None:
            return self._annotation_obj
        annotation = Primitive()
        annotation.name = 'CorexContinuous'
        annotation.task = 'FeatureExtraction'
        annotation.learning_type = 'UnsupervisedLearning'
        annotation.ml_algorithm = ['Dimension Reduction']
        annotation.tags = ['feature_extraction', 'continuous']
        self._annotation_obj = annotation
        return annotation

    def _get_feature_names(self):
        return ['CorexContinuous_' + str(i) for i in range(self.hyperparams['n_hidden'])]
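
# Hedged sketch of the n_hidden resolution rule in _fit_transform above: an
# int is an absolute number of latent factors, while a float is a fraction of
# the input columns (floored, with a minimum of one factor).
def _example_resolve_n_hidden(n_hidden, n_columns):
    if isinstance(n_hidden, int):
        return n_hidden
    return max(1, int(n_hidden * n_columns))

# _example_resolve_n_hidden(5, 100)   -> 5
# _example_resolve_n_hidden(0.1, 100) -> 10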
class CorexContinuous(UnsupervisedLearnerPrimitiveBase[Input, Output, CorexContinuous_Params, CorexContinuous_Hyperparams]):
    """
    Return components/latent factors that explain the most multivariate mutual information
    in the data under a Linear Gaussian model. For comparison, PCA returns components
    explaining the most variance in the data. Serves as DSBox 'wrapper' for
    https://github.com/gregversteeg/linearcorex
    """

    metadata = PrimitiveMetadata({
        "schema": "v0",
        "id": "d2d4fefc-0859-3522-91df-7e445f61a69b",
        "version": "1.0.0",
        "name": "CorexContinuous",
        "description": "Return components/latent factors that explain the most multivariate mutual information in the data under a Linear Gaussian model. For comparison, PCA returns components explaining the most variance in the data.",
        "python_path": "d3m.primitives.dsbox.CorexContinuous",
        "original_python_path": "corexcontinuous.corex_continuous.CorexContinuous",
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/brekelma/dsbox_corex"]
        },
        "installation": [{
            'type': 'PIP',
            'package_uri': 'git+https://github.com/brekelma/dsbox_corex.git@8672da14a7f2e00ea488da460ad68ef0799a9532#egg=dsbox-corex'
        }],
        "algorithm_types": ["EXPECTATION_MAXIMIZATION_ALGORITHM"],
        "primitive_family": "FEATURE_CONSTRUCTION",
        "preconditions": ["NO_MISSING_VALUES", "NO_CATEGORICAL_VALUES"],
        "hyperparams_to_tune": ["n_hidden"]
    })

    def __init__(self, *, hyperparams: CorexContinuous_Hyperparams) -> None:
        # additional Corex parameters are left at their defaults: see
        # github.com/gregversteeg/LinearCorex
        super().__init__(hyperparams=hyperparams)

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self.fitted:
            return CallResult(None, True, 1)
        if not hasattr(self, 'training_inputs'):
            raise ValueError("Missing training data.")

        self._fit_transform(self.training_inputs, timeout, iterations)
        self.fitted = True
        # TODO: add support for max_iter / incomplete
        return CallResult(None, True, self.max_iter)

    def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
        self.columns = list(inputs)
        X_ = inputs[self.columns].values

        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 10000

        if not self.fitted:
            raise ValueError('Please fit before calling produce')

        self.latent_factors = self.model.transform(X_)
        return CallResult(self.latent_factors, True, self.max_iter)

    def _fit_transform(self, inputs: Input, timeout: float = None, iterations: int = None) -> Sequence[Output]:
        self.columns = list(inputs)
        X_ = inputs[self.columns].values

        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 10000

        if isinstance(self.hyperparams['n_hidden'], int):
            self.n_hidden = self.hyperparams['n_hidden']
        elif isinstance(self.hyperparams['n_hidden'], float):
            # interpret a float as a fraction of the input columns
            self.n_hidden = max(1, int(self.hyperparams['n_hidden'] * len(self.columns)))

        if not hasattr(self, 'model') or self.model is None:
            # silence the underlying model's console output while constructing it
            _stdout = sys.stdout
            null = open(os.devnull, 'w')
            sys.stdout = null
            self.model = corex_cont.Corex(n_hidden=self.n_hidden, max_iter=self.max_iter)
            sys.stdout = _stdout
            null.close()

        self.latent_factors = self.model.fit_transform(X_)
        self.fitted = True
        return self.latent_factors

    def set_training_data(self, *, inputs: Input, outputs: Output) -> None:
        self.training_inputs = inputs
        self.fitted = False

    def get_params(self) -> CorexContinuous_Params:
        return CorexContinuous_Params(model=self.model)

    def set_params(self, *, params: CorexContinuous_Params) -> None:
        self.model = params['model']

    def _annotation(self):
        # cache on a separate attribute; assigning to self._annotation would shadow this method
        if getattr(self, '_annotation_obj', None) is not None:
            return self._annotation_obj
        annotation = Primitive()
        annotation.name = 'CorexContinuous'
        annotation.task = 'FeatureExtraction'
        annotation.learning_type = 'UnsupervisedLearning'
        annotation.ml_algorithm = ['Dimension Reduction']
        annotation.tags = ['feature_extraction', 'continuous']
        self._annotation_obj = annotation
        return annotation

    def _get_feature_names(self):
        return ['CorexContinuous_' + str(i) for i in range(self.hyperparams['n_hidden'])]
class SDNE(UnsupervisedLearnerPrimitiveBase[Input, Output, SDNE_Params, SDNE_Hyperparams]):
    """
    Graph embedding method
    """

    metadata = PrimitiveMetadata({
        "schema": "v0",
        "id": "7d61e488-b5bb-4c79-bad6-f1dc07292bf4",
        "version": "1.0.0",
        "name": "SDNE",
        "description": "Structural Deep Network Embedding (Wang et al 2016): unsupervised network embedding using autoencoders to preserve first order proximity (i.e. connected nodes have similar embeddings) and second order proximity (i.e. nodes with similar neighbors have similar embeddings). Hyperparam alpha controls weight of 1st order proximity loss (L2 norm of embedding difference), beta controls second-order loss (reconstruction of adjacency matrix row, matrix B in Wang et al). Expects list of [learning_df, nodes_df, edges_df] as input (e.g. by running common_primitives.normalize_graphs + data_tranformation.graph_to_edge_list.DSBOX)",
        "python_path": "d3m.primitives.feature_construction.sdne.DSBOX",
        "original_python_path": "sdne.SDNE",
        "source": {
            "name": "ISI",
            "contact": "mailto:[email protected]",
            "uris": ["https://github.com/brekelma/dsbox_graphs"]
        },
        "installation": [cfg_.INSTALLATION],
        "algorithm_types": ["AUTOENCODER"],
        "primitive_family": "FEATURE_CONSTRUCTION",
        "hyperparams_to_tune": ["dimension", "beta", "alpha"]
    })

    def __init__(self, *, hyperparams: SDNE_Hyperparams) -> None:
        super(SDNE, self).__init__(hyperparams=hyperparams)
        # defaults used elsewhere: nu1=1e-6, nu2=1e-6, K=3, n_units=[500, 300],
        # rho=0.3, n_iter=30, xeta=0.001, n_batch=500

    def _make_args(self):
        # training arguments passed through to the underlying sdne.SDNE model;
        # shared by fit and (when unfitted) produce
        args = {}
        args['nu1'] = 1e-6
        args['nu2'] = 1e-6
        args['K'] = self.hyperparams['depth']
        args['n_units'] = [500, 300]
        args['rho'] = 0.3
        args['n_iter'] = self.hyperparams['epochs']
        args['xeta'] = self.hyperparams['lr']  # 0.0005
        args['n_batch'] = 100  # 500
        self._args = args
        return args

    def _make_adjacency(self, sources, dests, num_nodes=None, tensor=True):
        if num_nodes is None:
            num_nodes = len(self.node_encode.classes_)

        if tensor:
            try:
                adj = tf.SparseTensor(
                    [[sources.values[i, 0], dests.values[i, 0]] for i in range(sources.values.shape[0])],
                    [1.0 for i in range(sources.values.shape[0])],
                    dense_shape=(num_nodes, num_nodes))
            except Exception:
                adj = tf.SparseTensor(
                    [[sources[i], dests[i]] for i in range(sources.shape[0])],
                    [1.0 for i in range(sources.shape[0])],
                    dense_shape=(num_nodes, num_nodes))
        else:
            try:
                adj = csr_matrix(
                    ([1.0 for i in range(sources.values.shape[0])],
                     ([sources.values[i, 0] for i in range(sources.values.shape[0])],
                      [dests.values[i, 0] for i in range(sources.values.shape[0])])),
                    shape=(num_nodes, num_nodes))
            except Exception:
                adj = csr_matrix(
                    ([1.0 for i in range(sources.shape[0])],
                     ([sources[i] for i in range(sources.shape[0])],
                      [dests[i] for i in range(sources.shape[0])])),
                    shape=(num_nodes, num_nodes))
        return adj

    def _get_source_dest(self, edges_df, source_types=None, dest_types=None):
        if source_types is None:
            source_types = (
                'https://metadata.datadrivendiscovery.org/types/EdgeSource',
                'https://metadata.datadrivendiscovery.org/types/DirectedEdgeSource',
                'https://metadata.datadrivendiscovery.org/types/UndirectedEdgeSource',
                'https://metadata.datadrivendiscovery.org/types/SimpleEdgeSource',
                'https://metadata.datadrivendiscovery.org/types/MultiEdgeSource')
        sources = get_columns_of_type(edges_df, source_types)

        if dest_types is None:
            dest_types = (
                'https://metadata.datadrivendiscovery.org/types/EdgeTarget',
                'https://metadata.datadrivendiscovery.org/types/DirectedEdgeTarget',
                'https://metadata.datadrivendiscovery.org/types/UndirectedEdgeTarget',
                'https://metadata.datadrivendiscovery.org/types/SimpleEdgeTarget',
                'https://metadata.datadrivendiscovery.org/types/MultiEdgeTarget')
        dests = get_columns_of_type(edges_df, dest_types)

        return sources, dests

    def _parse_inputs(self, inputs: Input, return_all=False):
        try:
            learning_id, learning_df = get_resource(inputs, 'learningData')
        except Exception:
            pass
        try:  # resource id, resource
            nodes_id, nodes_df = get_resource(inputs, '0_nodes')
        except Exception:
            try:
                nodes_id, nodes_df = get_resource(inputs, 'nodes')
            except Exception:
                nodes_df = learning_df
        try:
            edges_id, edges_df = get_resource(inputs, '0_edges')
        except Exception:
            try:
                edges_id, edges_df = get_resource(inputs, 'edges')
            except Exception:
                edges_id, edges_df = get_resource(inputs, '1')

        try:
            print("LEARNING DF ", learning_df)
            print("NODES DF ", nodes_df)
            print("EDGES DF ", edges_df)
        except Exception:
            pass

        self.node_encode = LabelEncoder()
        sources, dests = self._get_source_dest(edges_df)
        sources = sources.astype(np.int32)
        dests = dests.astype(np.int32)
        to_fit = np.sort(np.concatenate([sources.values, dests.values], axis=-1).astype(np.int32).ravel())
        self.node_encode.fit(to_fit)  # nodes_df[id_col].values
        sources[sources.columns[0]] = self.node_encode.transform(sources.values.astype(np.int32))
        dests[dests.columns[0]] = self.node_encode.transform(dests.values.astype(np.int32))

        other_training_data = self._make_adjacency(sources, dests, tensor=False)

        if return_all:
            return other_training_data, learning_df, nodes_df, edges_df
        return other_training_data

    def set_training_data(self, *, inputs: Input) -> None:
        training_data = self._parse_inputs(inputs)
        if isinstance(training_data, tuple):
            training_data = training_data[0]
        self.training_data = networkx.from_scipy_sparse_matrix(training_data)
        self.fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self.fitted:
            return CallResult(None, True, 1)

        args = self._make_args()
        dim = self.hyperparams['dimension']
        alpha = self.hyperparams['alpha']
        beta = self.hyperparams['beta']

        self._sdne = sdne.SDNE(d=dim, alpha=alpha, beta=beta, **args)
        self._sdne.learn_embedding(graph=self.training_data)
        self._model = self._sdne._model

        make_keras_pickleable()
        self.fitted = True
        return CallResult(None, True, 1)

    def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
        produce_data, learning_df, nodes_df, edges_df = self._parse_inputs(inputs, return_all=True)

        if self.fitted:
            result = self._sdne._Y
        else:
            args = self._make_args()
            dim = self.hyperparams['dimension']
            alpha = self.hyperparams['alpha']
            beta = self.hyperparams['beta']
            self._sdne = sdne.SDNE(d=dim, alpha=alpha, beta=beta, **args)
            produce_data = networkx.from_scipy_sparse_matrix(produce_data)
            self._sdne.learn_embedding(graph=produce_data)
            self._model = self._sdne._model
            result = self._sdne._Y

        target_types = ['https://metadata.datadrivendiscovery.org/types/TrueTarget',
                        'https://metadata.datadrivendiscovery.org/types/SuggestedTarget']

        if self.hyperparams['return_list']:
            result_np = container.ndarray(result, generate_metadata=True)
            return_list = d3m_List([result_np, inputs[1], inputs[2]], generate_metadata=True)
            return CallResult(return_list, True, 1)
        else:
            learn_df = d3m_DataFrame(learning_df, generate_metadata=True)
            learn_df = get_columns_not_of_type(learn_df, target_types)
            learn_df = learn_df.remove_columns([learn_df.columns.get_loc('nodeID')])

            result_df = d3m_DataFrame(result, generate_metadata=True)
            result_df = result_df.loc[result_df.index.isin(learning_df['d3mIndex'].values)]

            for column_index in range(result_df.shape[1]):
                col_dict = dict(result_df.metadata.query((ALL_ELEMENTS, column_index)))
                col_dict['structural_type'] = type(1.0)
                col_dict['name'] = str(learn_df.shape[1] + column_index)
                col_dict['semantic_types'] = ('http://schema.org/Float',
                                              'https://metadata.datadrivendiscovery.org/types/Attribute')
                result_df.metadata = result_df.metadata.update((ALL_ELEMENTS, column_index), col_dict)

            result_df.index = learn_df.index.copy()
            output = utils.append_columns(learn_df, result_df)
            # output.set_index('d3mIndex', inplace=True)
            return CallResult(output, True, 1)

    def multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Input,
                      timeout: float = None, iterations: int = None) -> MultiCallResult:
        return self._multi_produce(produce_methods=produce_methods,
                                   timeout=timeout,
                                   iterations=iterations,
                                   inputs=inputs)

    def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Input,
                          timeout: float = None, iterations: int = None) -> MultiCallResult:
        return self._fit_multi_produce(produce_methods=produce_methods,
                                       timeout=timeout,
                                       iterations=iterations,
                                       inputs=inputs)

    def get_params(self) -> SDNE_Params:
        return SDNE_Params(fitted=self.fitted, model=self._sdne, node_encode=self.node_encode)

    def set_params(self, *, params: SDNE_Params) -> None:
        self.fitted = params['fitted']
        self._sdne = params['model']
        self.node_encode = params['node_encode']
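
# Self-contained sketch of the non-tensor branch of _make_adjacency above:
# an edge list becomes a sparse CSR adjacency matrix (toy graph assumed).
def _example_make_adjacency():
    import numpy as np
    from scipy.sparse import csr_matrix
    sources = np.array([0, 1, 2])  # edge sources, already label-encoded
    dests = np.array([1, 2, 0])    # edge targets
    num_nodes = 3
    # csr_matrix((data, (rows, cols)), shape) places a 1.0 at each edge
    return csr_matrix((np.ones(len(sources)), (sources, dests)),
                      shape=(num_nodes, num_nodes))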
class CorexText(UnsupervisedLearnerPrimitiveBase[Input, Output, CorexText_Params, CorexText_Hyperparams] ): #(Primitive): """ Learns latent factors / topics which explain the most multivariate information in bag of words representations of documents. Returns learned topic scores for each document. Also supports hierarchical models and 'anchoring' to encourage topics to concentrate around desired words. """ metadata = PrimitiveMetadata({ "schema": "v0", "id": "18e63b10-c5b7-34bc-a670-f2c831d6b4bf", "version": "1.0.0", "name": "CorexText", "description": "Learns latent factors / topics which explain the most multivariate information in bag of words representations of documents. Returns learned topic scores for each document. Also supports hierarchical models and 'anchoring' to encourage topics to concentrate around desired words.", "python_path": "d3m.primitives.dsbox.CorexText", "original_python_path": "corextext.corex_text.CorexText", "source": { "name": "ISI", "contact": "mailto:[email protected]", "uris": ["https://github.com/brekelma/dsbox_corex"] }, "installation": [{ 'type': 'PIP', 'package_uri': 'git+https://github.com/brekelma/dsbox_corex.git@8672da14a7f2e00ea488da460ad68ef0799a9532#egg=dsbox-corex' }], "algorithm_types": ["EXPECTATION_MAXIMIZATION_ALGORITHM", "LATENT_DIRICHLET_ALLOCATION"], "primitive_family": "FEATURE_CONSTRUCTION", "hyperparams_to_tune": ["n_hidden", "chunking", "max_df", "min_df"] }) #"preconditions": [], # "effects": [], def __init__( self, *, hyperparams: CorexText_Hyperparams ) -> None: #, random_seed : int = 0, docker_containers: typing.Dict[str, DockerContainer] = None) super().__init__( hyperparams=hyperparams ) #, random_seed = random_seed, docker_containers = docker_containers) def fit(self, *, timeout: float = None, iterations: int = None ) -> CallResult[None]: #X : Sequence[Input]): #self.columns = list(X) #X_ = X[self.columns].values # useless if only desired columns are passed if self.fitted: return if not hasattr(self, 'model') or self.model is None: self.model = Corex(n_hidden=self.hyperparams['n_hidden'], max_iter=iterations, seed=self.random_seed) #, **kwargs) if not hasattr(self, 'training_inputs'): raise ValueError("Missing training data.") if not hasattr(self, 'get_text'): raise ValueError("Missing get_text parameter") else: if not self.get_text or self.hyperparams['chunking'] > 0: self.bow = TfidfVectorizer( input='content', decode_error='ignore', max_df=self.hyperparams['max_df'], min_df=self.hyperparams['min_df'], max_features=self.hyperparams['max_features']) else: self.bow = TfidfVectorizer( input='filename', max_df=self.hyperparams['max_df'], min_df=self.hyperparams['min_df'], max_features=self.hyperparams['max_features']) if iterations is not None: self.max_iter = iterations self.model.max_iter = self.max_iter else: self.max_iter = 250 self.model.max_iter = self.max_iter if self.hyperparams['chunking'] == 0: bow = self.bow.fit_transform(self.training_inputs.values.ravel( )) if not self.get_text else self.bow.fit_transform( self._get_raw_inputs()) else: inp, self.chunks = self._read_and_chunk( self.training_inputs.values.ravel(), read=self.get_text) bow = self.bow.fit_transform(inp) self.latent_factors = self.model.fit_transform(bow) if self.hyperparams['chunking'] > 0: self.latent_factors = self._unchunk(self.latent_factors, self.chunks) self.fitted = True return CallResult(None, True, self.max_iter) def produce( self, *, inputs: Input, timeout: float = None, iterations: int = None ) -> CallResult[Output]: # TAKES IN DF with index 
        self.max_iter = iterations if iterations is not None else 250
        self.model.max_iter = self.max_iter

        if not self.fitted:
            if self.hyperparams['chunking'] == 0:
                bow = (self.bow.fit_transform(inputs.values.ravel())
                       if not self.get_text
                       else self.bow.fit_transform(self._get_raw_inputs(
                           inputs=inputs, data_path=self.data_path)))
            else:
                inp, self.chunks = self._read_and_chunk(
                    inputs.values.ravel(), data_path=self.data_path,
                    read=self.get_text)
                bow = self.bow.fit_transform(inp)
            self.latent_factors = self.model.fit_transform(bow).astype(float)
            self.fitted = True
        else:
            if self.hyperparams['chunking'] == 0:
                bow = (self.bow.transform(inputs.values.ravel())
                       if not self.get_text
                       else self.bow.transform(self._get_raw_inputs(
                           inputs=inputs, data_path=self.data_path)))
            else:
                inp, self.chunks = self._read_and_chunk(
                    inputs.values.ravel(), data_path=self.data_path,
                    read=self.get_text)
                bow = self.bow.transform(inp)
            self.latent_factors = self.model.transform(bow).astype(float)

        if self.hyperparams['chunking'] > 0:
            self.latent_factors = self._unchunk(self.latent_factors,
                                                self.chunks)

        # TODO: incorporate timeout, max_iter
        return CallResult(self.latent_factors, True, self.max_iter)

    def _fit_transform(self, inputs: Input, timeout: float = None,
                       iterations: int = None) -> Sequence[Output]:
        # Takes in a DataFrame with an index column.
        if iterations is not None:
            self.max_iter = iterations
            self.model.max_iter = self.max_iter

        if self.hyperparams['chunking'] == 0:
            bow = (self.bow.fit_transform(inputs.values.ravel())
                   if not self.get_text
                   else self.bow.fit_transform(
                       self._get_raw_inputs(inputs=inputs)))
        else:
            inp, self.chunks = self._read_and_chunk(inputs.values.ravel(),
                                                    read=self.get_text)
            bow = self.bow.fit_transform(inp)

        self.latent_factors = self.model.fit_transform(bow)
        if self.hyperparams['chunking'] > 0:
            self.latent_factors = self._unchunk(self.latent_factors,
                                                self.chunks)

        self.fitted = True
        return self.latent_factors

    def _get_raw_inputs(self, inputs: Input = None,
                        data_path=None) -> np.ndarray:
        # Prefix each file name with the data path so the vectorizer can
        # open the files directly.
        raw_inputs = (self.training_inputs.values if inputs is None
                      else inputs.values)
        inp = (self.training_inputs.values if inputs is None
               else inputs.values)
        if data_path is not None:
            for idx, val in np.ndenumerate(inp):
                raw_inputs[idx] = os.path.join(data_path, val)
        elif self.data_path is not None:
            for idx, val in np.ndenumerate(inp):
                raw_inputs[idx] = os.path.join(self.data_path, val)
        else:
            warn('Data_path param not passed.')
        return raw_inputs.ravel()

    def _read_and_chunk(self, inputs: Input = None, data_path: str = None,
                        read: bool = True) -> Tuple[np.ndarray, np.ndarray]:
        # Read the data into documents / text, then split each document
        # into fixed-size token chunks.
        chunked_docs = []
        chunk_list = []
        overall_j = 0
        for i in range(inputs.shape[0]):
            if read:
                if data_path is None:
                    file_path = os.path.join(self.data_path, inputs[i])
                else:
                    file_path = os.path.join(data_path, inputs[i])
                with open(file_path, 'rb') as fn:
                    doc = fn.read()
                doc = "".join(map(chr, doc))
                # Tokenize into a list of word strings.
                doc_tokens = re.compile(r"(?u)\b\w\w+\b").findall(doc)
            else:
                doc_tokens = inputs[i]
            j = 0
            while (j + 2) * self.hyperparams['chunking'] <= len(doc_tokens):
                new_chunked_str = " ".join(
                    doc_tokens[j * self.hyperparams['chunking']:
                               (j + 1) * self.hyperparams['chunking']])
                chunked_docs.append(new_chunked_str)
                j = j + 1
            # The remaining tokens form the final (possibly shorter) chunk.
            new_chunked_str = " ".join(
                doc_tokens[j * self.hyperparams['chunking']:])
            chunked_docs.append(new_chunked_str)
            overall_j += (j + 1)
            chunk_list.append(overall_j)
        # All chunks in one array, plus a list marking the changepoints
        # between documents.
        return np.array(chunked_docs), np.array(chunk_list)

    def _unchunk(self, transformed: np.ndarray, chunk_array: np.ndarray):
        # transformed is samples x topics; average each document's chunk
        # rows back into a single row per document.
        j = 0
        return_val = None
        # Append a sentinel changepoint so the final document is flushed.
        chunk_array = np.append(chunk_array,
                                np.array([transformed.shape[0] - 1]), axis=0)
        temp = np.zeros((transformed.shape[1],))
        for i in range(transformed.shape[0]):
            if i < chunk_array[j] and i < transformed.shape[0] - 1:
                temp = temp + transformed[i, :]
            else:
                divisor = ((chunk_array[j] - chunk_array[j - 1] + 1)
                           if j > 0 else chunk_array[j])
                temp = temp / float(divisor)
                temp = temp[np.newaxis, :]  # 1 x features
                if return_val is None:
                    return_val = temp
                else:
                    return_val = np.concatenate([return_val, temp], axis=0)
                j = j + 1
                temp = np.zeros((transformed.shape[1],))
        return return_val

    def set_training_data(self, *, inputs: Input, outputs: Output) -> None:
        self.training_inputs = inputs
        self.fitted = False

    def get_params(self) -> CorexText_Params:
        return CorexText_Params(model=self.model,
                                bow=self.bow,
                                get_text=self.get_text,
                                data_path=self.data_path)

    def set_params(self, *, params: CorexText_Params) -> None:
        self.model = params['model']
        self.bow = params['bow']
        self.get_text = params['get_text']
        self.data_path = params['data_path']

    def _annotation(self):
        # Cache the annotation object on first use. Stored under a
        # separate attribute name so it does not shadow this method.
        if getattr(self, '_annotation_obj', None) is not None:
            return self._annotation_obj
        annotation = Primitive()
        annotation.name = 'CorexText'
        annotation.task = 'FeatureExtraction'
        annotation.learning_type = 'UnsupervisedLearning'
        annotation.ml_algorithm = ['Dimension Reduction']
        annotation.tags = ['feature_extraction', 'text']
        self._annotation_obj = annotation
        return annotation

    def _get_feature_names(self):
        return ['CorexText_' + str(i)
                for i in range(self.hyperparams['n_hidden'])]
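# Illustrative sketch (not part of the primitive) of what _unchunk is
# doing: per-chunk topic scores are collapsed back to one row per
# original document, using the changepoint list that _read_and_chunk
# returns. This version uses a straightforward per-document mean via
# np.split; all data below is made up.
def _unchunk_demo():
    scores = np.arange(15, dtype=float).reshape(5, 3)  # 5 chunks x 3 topics
    chunk_list = np.array([3, 5])  # doc 1 -> chunks 0-2, doc 2 -> chunks 3-4
    doc_scores = np.vstack([part.mean(axis=0)
                            for part in np.split(scores, chunk_list[:-1])])
    return doc_scores  # shape (2, 3): one averaged row per document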
class Learner(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,
                                             Hyperparams]):
    metadata = PrimitiveMetadata({
        'algorithm_types': [
            PrimitiveAlgorithmType.ADAPTIVE_ALGORITHM,
        ],
        'id': 'c1c54b03-717d-4e6b-b043-8fc93364b92e',
        'keywords': ['learner'],
        'name': "Learner",
        'primitive_family': PrimitiveFamily.LEARNER,
        'python_path': 'd3m.primitives.mit_primitives.Learner',
        'source': {
            'name': 'MIT_FeatureLabs',
        },
        'version': '0.0.3-dev',
        'installation': [{
            'type': PrimitiveInstallationType.PIP,
            'package_uri': (
                'git+https://github.com/HDI-Project/mit-primitives.git@'
                '{git_commit}#egg=mit-primitives').format(
                    git_commit=utils.current_git_commit(
                        os.path.dirname(__file__)))
        }],
    })

    def get_params(self) -> Params:
        return self.params

    def set_params(self, *, params: Params) -> None:
        if not hasattr(self, 'params'):
            self.params = params
        else:
            self.params.update(params)

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self.inputs = inputs
        self.outputs = outputs

    def fit(self, *, timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        learner_params = self.hyperparams['learner_params']
        learner = mlpipeline.MLPipeline(**learner_params)

        fit_params = self.params['fit_params']
        predict_params = self.params['predict_params']

        learner.fit(
            self.inputs,
            self.outputs,
            fit_params=fit_params,
            predict_params=predict_params,
        )

        self.params['learner'] = learner
        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        predict_params = self.params['predict_params']
        results = self.params['learner'].predict(
            inputs, predict_params=predict_params)
        return CallResult(results)

    def to_dict(self) -> dict:
        return self.params['learner'].to_dict()
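# Illustrative driver for the Learner primitive above (not a documented
# recipe). The contents of `hyperparams['learner_params']`, and of the
# `fit_params`/`predict_params` entries in `params`, depend on the
# underlying mlpipeline.MLPipeline spec and are left as placeholders;
# X_train, y_train, and X_test are assumed data. Only methods defined on
# the class above are used.
def _learner_demo(hyperparams, params, X_train, y_train, X_test):
    learner = Learner(hyperparams=hyperparams)   # hyperparams carries 'learner_params'
    learner.set_params(params=params)            # params carries fit/predict kwargs
    learner.set_training_data(inputs=X_train, outputs=y_train)
    learner.fit()                                # builds and fits the MLPipeline
    return learner.produce(inputs=X_test).value  # CallResult wraps the predictions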