def generate_main_resources(random_state, resources, size):
    users_count = size
    posts_count = size * 10
    comments_count = size * 10

    user_ids = numpy.arange(users_count)
    post_ids = numpy.arange(posts_count)
    comment_ids = numpy.arange(comments_count)

    users = container.DataFrame({
        'id': user_ids,
        'name': [f'User {i}' for i in range(users_count)],
    })
    posts = container.DataFrame({
        'id': post_ids,
        'author_id': pareto_choice(random_state, user_ids, posts_count),
        'post': [f'Post {i}' for i in range(posts_count)],
    })
    comments = container.DataFrame({
        'id': comment_ids,
        'post_id': pareto_choice(random_state, post_ids, comments_count),
        'author_id': pareto_choice(random_state, user_ids, comments_count),
        'comment': [f'Comment {i}' for i in range(comments_count)],
    })

    resources.update({'users': users, 'posts': posts, 'comments': comments})
def _create_string_merge_cols(
    cls,
    left_df: container.DataFrame,
    left_col: str,
    right_df: container.DataFrame,
    right_col: str,
    accuracy: float,
    index: int,
) -> pd.DataFrame:
    if accuracy < 1:
        # pre-compute the best fuzzy match for each unique left key
        left_keys = left_df[left_col].unique()
        right_keys = right_df[right_col].unique()
        matches: typing.Dict[str, typing.Optional[str]] = {}
        for left_key in left_keys:
            matches[left_key] = cls._string_fuzzy_match(left_key, right_keys, accuracy * 100)
        new_left_df = container.DataFrame({
            "lefty_string" + str(index): left_df[left_col].map(lambda key: matches[key])
        })
    else:
        new_left_df = container.DataFrame({"lefty_string" + str(index): left_df[left_col]})
    return new_left_df
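# The helper above precomputes one fuzzy match per unique left key and then maps
# the whole column through the resulting dictionary. A minimal self-contained
# sketch of that idea, using stdlib difflib as a stand-in for the class's own
# _string_fuzzy_match (whose scoring, on a 0-100 scale, differs from difflib's
# 0-1 cutoff):
import difflib

def fuzzy_match_map(left_keys, right_keys, cutoff=0.9):
    # Compute each unique left key's best right-side match once, then reuse.
    matches = {}
    for key in left_keys:
        candidates = difflib.get_close_matches(key, right_keys, n=1, cutoff=cutoff)
        matches[key] = candidates[0] if candidates else None
    return matches

# Usage: map a whole column through the precomputed dictionary.
# df_left['name_matched'] = df_left['name'].map(
#     fuzzy_match_map(df_left['name'].unique(), df_right['name'].unique()))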
def test_basic(self):
    main = container.DataFrame(
        {'timestamp': [1, 2, 3, 4], 'value': [0.32, 0.32, 0.31, 0.33]},
        {'top_level': 'main'},
        generate_metadata=True,
    )

    self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'top_level': 'main',
            'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.pandas.DataFrame',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
            'dimension': {
                'name': 'rows',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                'length': 4,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'name': 'columns',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                'length': 2,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 0],
        'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'},
    }, {
        'selector': ['__ALL_ELEMENTS__', 1],
        'metadata': {'structural_type': 'numpy.float64', 'name': 'value'},
    }])

    hyperparams_class = HoltWintersExponentialSmoothing.HoltWintersExponentialSmoothing.metadata.get_hyperparams()
    primitive = HoltWintersExponentialSmoothing.HoltWintersExponentialSmoothing(hyperparams=hyperparams_class.defaults())
    output_main = primitive.produce(inputs=main).value
    output_main = round(output_main, 2)

    expected_result = container.DataFrame(
        data={'timestamp': [1, 2, 3, 4], 'value': [0.32, 0.32, 0.31, 0.32]})

    self.assertEqual(
        output_main[['timestamp', 'value_holt_winters_smoothing']].values.tolist(),
        expected_result[['timestamp', 'value']].values.tolist(),
    )

    params = primitive.get_params()
    primitive.set_params(params=params)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Gaussian classification (i.e. seeded Gaussian "clustering").

    Inputs
        D - An n x d feature numpy array
    Returns
        labels - Class labels for each unlabeled vertex
    """
    if not self._fitted:
        raise ValueError("Not fitted")

    unique_labels = np.unique(self._labels)
    K = len(unique_labels)

    testing = inputs[2]
    try:
        testing_nodeIDs = np.asarray(testing['G1.nodeID'])
    except KeyError:
        testing_nodeIDs = np.asarray(testing['nodeID'])
    final_labels = np.zeros(len(testing))

    if self._PD and self._ENOUGH_SEEDS:
        for i in range(len(testing_nodeIDs)):
            temp = np.where(self._nodeIDs == int(testing_nodeIDs[i]))[0][0]
            weighted_pdfs = np.array([
                self._pis[j] * MVN.pdf(self._embedding[temp, :], self._means[j], self._covariances[j, :, :])
                for j in range(K)
            ])
            label = np.argmax(weighted_pdfs)
            final_labels[i] = int(label)
    else:
        for i in range(len(testing_nodeIDs)):
            temp = np.where(self._nodeIDs == int(testing_nodeIDs[i]))[0][0]
            try:
                weighted_pdfs = np.array([
                    self._pis[j] * MVN.pdf(self._embedding[temp, :], self._means[j], self._covariances)
                    for j in range(K)
                ])
            except np.linalg.LinAlgError:
                # regularize a singular covariance matrix and retry
                self._covariances += np.ones(self._covariances.shape) * 0.00001
                weighted_pdfs = np.array([
                    self._pis[j] * MVN.pdf(self._embedding[temp, :], self._means[j], self._covariances)
                    for j in range(K)
                ])
            label = np.argmax(weighted_pdfs)
            final_labels[i] = int(label)

    if self._problem == "VN":
        testing['classLabel'] = final_labels
        outputs = container.DataFrame(testing[['d3mIndex', 'classLabel']])
        outputs[['d3mIndex', 'classLabel']] = outputs[['d3mIndex', 'classLabel']].astype(int)
    else:
        testing['community'] = final_labels
        outputs = container.DataFrame(testing[['d3mIndex', 'community']])
        outputs[['d3mIndex', 'community']] = outputs[['d3mIndex', 'community']].astype(int)

    return base.CallResult(outputs)
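# A minimal sketch of the weighted-density rule used above: each embedding row
# goes to the component with the highest prior-weighted Gaussian density. The
# names (pis, means, covariances) mirror the attributes above, but the data
# here is synthetic, for illustration only.
import numpy as np
from scipy.stats import multivariate_normal as MVN

rng = np.random.default_rng(0)
embedding = rng.normal(size=(5, 2))             # 5 points in 2-D
pis = np.array([0.5, 0.5])                      # mixture weights
means = np.array([[0.0, 0.0], [3.0, 3.0]])      # component means
covariances = np.stack([np.eye(2), np.eye(2)])  # per-component covariances

labels = np.array([
    np.argmax([pis[j] * MVN.pdf(x, means[j], covariances[j]) for j in range(len(pis))])
    for x in embedding
])
print(labels)  # component index per row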
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    index_col = common_utils.list_columns_with_semantic_types(
        metadata=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/PrimaryKey"])
    if not index_col:
        warnings.warn("Did not find primary key column. Cannot vote, returning input unchanged.")
        return CallResult(inputs)

    predict_target_col = common_utils.list_columns_with_semantic_types(
        metadata=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/PredictedTarget"])
    if not predict_target_col:
        warnings.warn("Did not find PredictedTarget column. Cannot vote, returning input unchanged.")
        return CallResult(inputs)

    df = inputs.copy()
    new_df = self._get_index_and_target_df(inputs=df, use_cols=index_col + predict_target_col)

    if self.hyperparams["ensemble_method"] == 'majority':
        groupby_df = new_df.groupby(
            [new_df.columns[pos] for pos in index_col]
        ).agg(lambda x: x.value_counts().index[0]).reset_index(drop=False)
    elif self.hyperparams["ensemble_method"] == 'max':
        groupby_df = new_df.groupby(
            [new_df.columns[pos] for pos in index_col]
        ).max().reset_index(drop=False)
    elif self.hyperparams["ensemble_method"] == 'min':
        groupby_df = new_df.groupby(
            [new_df.columns[pos] for pos in index_col]
        ).min().reset_index(drop=False)
    ret_df = container.DataFrame(groupby_df)
    ret_df.metadata = new_df.metadata

    return CallResult(self._update_metadata(df=ret_df))
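# A minimal pandas sketch of the 'majority' branch above: rows sharing a
# primary key are grouped and the most frequent prediction wins. Plain pandas
# stands in for the d3m container types; column names are illustrative.
import pandas as pd

df = pd.DataFrame({
    'd3mIndex': [0, 0, 0, 1, 1, 1],
    'prediction': ['a', 'a', 'b', 'b', 'b', 'a'],
})
majority = (df.groupby('d3mIndex')
              .agg(lambda x: x.value_counts().index[0])
              .reset_index(drop=False))
print(majority)  # d3mIndex 0 -> 'a', d3mIndex 1 -> 'b'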
def setup(self, columns):
    self.large_dataframe_with_many_columns = container.DataFrame(
        {str(i): [j for j in range(5)] for i in range(columns)},
        columns=[str(i) for i in range(columns)],
        generate_metadata=True,
    )
    self.list_of_many_dataframe_columns = [
        container.DataFrame(
            {str(i): [j for j in range(5, 10)]},
            columns=[str(i)],
            generate_metadata=True,
        )
        for i in range(int(columns / 2))
    ]
def generate_learning_data_has_user_made_comment_on_post(random_state, resources):
    user_ids = resources['users'].loc[:, 'id']
    post_ids = resources['posts'].loc[:, 'id']
    users_count = len(user_ids)
    comments = resources['comments']

    authors_and_posts = comments.loc[:, ['author_id', 'post_id']]
    authors_and_posts_set = set(authors_and_posts.itertuples(index=False, name=None))

    data = {
        'user_id': [],
        'post_id': [],
        'made_comment': [],
    }

    # positive examples: (user, post) pairs where the user did comment
    for author_id, post_id in authors_and_posts.sample(
            n=users_count, random_state=random_state).itertuples(index=False, name=None):
        data['user_id'].append(author_id)
        data['post_id'].append(post_id)
        data['made_comment'].append('yes')

    # negative examples: (user, post) pairs where the user did not comment
    for user_id in random_state.permutation(user_ids):
        for post_id in random_state.permutation(post_ids):
            if (user_id, post_id) in authors_and_posts_set:
                continue
            data['user_id'].append(user_id)
            data['post_id'].append(post_id)
            data['made_comment'].append('no')
            if len(data['user_id']) == 2 * users_count:
                break
        if len(data['user_id']) == 2 * users_count:
            break

    assert len(data['user_id']) == 2 * users_count

    data = container.DataFrame(data)
    data = data.sample(frac=1.0, random_state=random_state).reset_index(drop=True)

    index = container.DataFrame({
        'd3mIndex': numpy.arange(len(data)),
    })

    resources['learningData'] = container.DataFrame(pandas.concat([index, data], axis=1))
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    :param inputs: assume the first column is the filename
    :param timeout:
    :param iterations:
    :return:
    """
    features = []
    # TODO: consider a more robust means to get location_base_uris and strip the file:// scheme
    media_root_dir = inputs.metadata.query((0, 0))['location_base_uris'][0][len('file://'):]
    for filename in inputs.iloc[:, 0]:
        file_path = os.path.join(media_root_dir, filename)
        if os.path.isfile(file_path):
            # video is an ndarray of F x H x W x C, e.g. (408, 240, 320, 3)
            video = self._read_fileuri(file_path)
            feature = self._generate_vid_feature(video)
        else:
            self.logger.warning(
                "No such file {}. Feature vector will be set to all zeros.".format(file_path))
            feature = np.zeros(2048)
        features.append(feature)
    results = container.DataFrame(features, generate_metadata=True)
    return base.CallResult(results)
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> CallResult[container.DataFrame]:
    logger.debug(f"Producing {__name__}")

    # create dataframe to hold d3mIndex and result
    result = self._model.predict(self._format_text(inputs))
    df = pd.DataFrame(result)

    # pipeline run saving is now fussy about the prediction names matching the
    # original target column name
    df.columns = self._target_col_names

    # if we mapped values earlier, map them back
    if self._label_map:
        df.replace(self._label_map, inplace=True)

    result_df = container.DataFrame(df, generate_metadata=True)

    # mark the semantic types on the dataframe
    result_df.metadata = result_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
    )
    return base.CallResult(result_df)
def _join_datetime_col(cls, left_df: container.DataFrame, left_col: str,
                       right_df: container.DataFrame, right_col: str,
                       accuracy: float) -> pd.DataFrame:
    # use d3mIndex from the left frame if present
    right_df = right_df.drop(columns='d3mIndex')

    # compute a tolerance delta for time matching based on a percentage of the
    # minimum left/right time range
    choices = np.array([np.datetime64(parser.parse(dt)) for dt in right_df[right_col].unique()])
    left_keys = np.array([np.datetime64(parser.parse(dt)) for dt in left_df[left_col].values])
    time_tolerance = (1.0 - accuracy) * cls._compute_time_range(left_keys, choices)

    left_df.index = np.array([cls._datetime_fuzzy_match(dt, choices, time_tolerance) for dt in left_keys])

    # make the right col the right dataframe index
    right_df = right_df.set_index(right_col)

    # inner join on the left / right indices
    joined = container.DataFrame(left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

    # sort on the d3m index if there, otherwise use the join column
    if 'd3mIndex' in joined:
        joined = joined.sort_values(by=['d3mIndex'])
    else:
        joined = joined.sort_values(by=[left_col])
    joined = joined.reset_index(drop=True)

    return joined
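# _datetime_fuzzy_match itself is not shown above; a minimal sketch of the
# likely contract, assuming it returns the nearest candidate within the
# tolerance (illustration only, not the library's implementation):
import numpy as np

def nearest_datetime(dt, choices, tolerance):
    # Return the closest candidate if it lies within tolerance, else NaT.
    deltas = np.abs(choices - dt)
    best = int(np.argmin(deltas))
    return choices[best] if deltas[best] <= tolerance else np.datetime64('NaT')

choices = np.array(['2020-01-01', '2020-06-01'], dtype='datetime64[D]')
print(nearest_datetime(np.datetime64('2020-01-03'), choices, np.timedelta64(7, 'D')))
# -> 2020-01-01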
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """Apply neural network-based feature extraction to image_tensor"""
    self._lazy_init()

    image_tensor = inputs[1]
    image_d3mIndex = inputs[0]

    if not len(image_tensor.shape) == 4:
        raise ValueError('Expect shape to have 4 dimensions')

    resized = False
    if self._resize_data:
        if not (image_tensor.shape[1] == 224 and image_tensor.shape[2] == 224):
            resized = True
            y = np.empty((image_tensor.shape[0], 224, 224, 3))
            for index in range(image_tensor.shape[0]):
                y[index] = imresize(image_tensor[index], (224, 224))
            image_tensor = y

    # preprocess() modifies the data, so copy it first unless we already own it
    if self._preprocess_data:
        if resized:
            # okay to modify image_tensor in place, since it is not the input data
            data = image_tensor
        else:
            data = image_tensor.copy()
        self._preprocess(data)
    else:
        data = image_tensor

    # use the stored graph so a TA3 system can call this primitive multiple times
    with self._graph.as_default():
        output_ndarray = self._model.predict(data)

    output_ndarray = output_ndarray.reshape(output_ndarray.shape[0], -1)
    output_dataFrame = container.DataFrame(container.ndarray(output_ndarray))

    # if generate_metadata is true, update the metadata
    if self.hyperparams["generate_metadata"]:
        for each_column in range(output_ndarray.shape[1]):
            metadata_selector = (mbase.ALL_ELEMENTS, each_column)
            metadata_each_column = {
                'semantic_types': (
                    'https://metadata.datadrivendiscovery.org/types/TabularColumn',
                    'https://metadata.datadrivendiscovery.org/types/Attribute',
                )
            }
            output_dataFrame.metadata = output_dataFrame.metadata.update(
                metadata=metadata_each_column, selector=metadata_selector)

    # update the original index to be d3mIndex
    output_dataFrame = output_dataFrame.set_index(image_d3mIndex)

    self._has_finished = True
    self._iterations_done = True
    return CallResult(output_dataFrame, self._has_finished, self._iterations_done)
def _get_predictions(self, *, permutation_matrix: np.matrix, inputs: Inputs):
    testing = inputs['2']
    threshold = self.hyperparams['threshold']

    for i in range(testing.shape[0]):
        testing.loc[i, 'match'] = 0
        v1 = testing['G1.nodeID'][i]
        v2 = testing['G2.nodeID'][i]

        # map the node IDs to their row/column indices in the permutation matrix
        found = False
        j = 0
        while not found:
            if self._g1_node_attributes[j] == int(v1):
                found = True
                v1 = j
            j += 1
        found = False
        j = 0
        while not found:
            if self._g2_node_attributes[j] == int(v2):
                found = True
                v2 = j
            j += 1

        testing.loc[i, 'match'] = 1 if permutation_matrix[v1, v2] > threshold else 0

    df = container.DataFrame({"d3mIndex": testing['d3mIndex'], "match": testing['match']})
    return df
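# The linear while-loop scans above can be replaced by a dictionary built once
# per graph; a minimal sketch of that refactor, assuming node IDs are unique
# integers (names here are hypothetical, for illustration only):
import numpy as np

g1_node_ids = np.array([10, 42, 7])
index_of = {int(v): i for i, v in enumerate(g1_node_ids)}
print(index_of[42])  # -> 1, the row index of node 42 in the permutation matrix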
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    logger.debug(f"Producing {__name__}")

    if len(self._cols) == 0:
        return base.CallResult(inputs)

    numerical_inputs = inputs.iloc[:, self._cols]
    k_means = KMeans(n_clusters=self.hyperparams["n_clusters"], random_state=self.random_seed)
    result = k_means.fit_predict(numerical_inputs)

    result_df = container.DataFrame(
        {self.hyperparams["cluster_col_name"]: result}, generate_metadata=True)
    result_df.metadata = result_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
    )
    return base.CallResult(result_df)
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    if self._needs_fit:
        self.fit()

    result = self._model.predict(inputs)

    result_df = container.DataFrame(
        {"outlier_label": result},
        generate_metadata=True,
    )
    result_df.metadata = result_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
    )
    return base.CallResult(result_df)
def _get_inputs(self, problem, rinputs):
    inputs = []
    for ip in rinputs:
        dataset = None
        if ip.HasField("dataset_uri"):
            dataset = D3MDatasetLoader().load(ip.dataset_uri)
        elif ip.HasField("csv_uri"):
            data = pd.read_csv(
                ip.csv_uri,
                dtype=str,
                header=0,
                na_filter=False,
                encoding='utf8',
                low_memory=False,
            )
            dataset = container.DataFrame(data)

        logging.critical("Problem %s", problem)
        if len(problem.inputs) > 0:
            targets = problem.inputs[0].targets
            dataset = util.add_target_metadata(dataset, targets)
            dataset = util.add_privileged_metadata(dataset, problem.inputs[0].privileged_data)
        inputs.append(dataset)
    return inputs
def _join_string_col(cls, left_df: container.DataFrame, left_col: str,
                     right_df: container.DataFrame, right_col: str,
                     accuracy: float) -> pd.DataFrame:
    # use d3mIndex from the left frame if present
    right_df = right_df.drop(columns='d3mIndex')

    # pre-compute fuzzy matches
    left_keys = left_df[left_col].unique()
    right_keys = right_df[right_col].unique()
    matches: typing.Dict[str, typing.Optional[str]] = {}
    for left_key in left_keys:
        matches[left_key] = cls._string_fuzzy_match(left_key, right_keys, accuracy * 100)

    # look up the pre-computed fuzzy match for each element in the left column
    left_df.index = left_df[left_col].map(lambda key: matches[key])

    # make the right col the right dataframe index
    right_df = right_df.set_index(right_col)

    # inner join on the left / right indices
    joined = container.DataFrame(left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

    # sort on the d3m index if there, otherwise use the join column
    if 'd3mIndex' in joined:
        joined = joined.sort_values(by=['d3mIndex'])
    else:
        joined = joined.sort_values(by=[left_col])
    joined = joined.reset_index(drop=True)

    return joined
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """Produce primitive's predictions for specific time series at specific future time instances
        * these specific timesteps / series are specified implicitly by input dataset

    Arguments:
        inputs {Inputs} -- D3M dataframe containing attributes

    Keyword Arguments:
        timeout {float} -- timeout, not considered (default: {None})
        iterations {int} -- iterations, not considered (default: {None})

    Raises:
        PrimitiveNotFittedError: if primitive not fit

    Returns:
        CallResult[Outputs] -- (N, 2) dataframe with d3m_index and value for each prediction slice requested.
            prediction slice = specific horizon idx for specific series in specific regression
    """
    all_preds, pred_intervals = self._produce(inputs)

    if self.hyperparams["interpretable"]:
        all_components = [[] for _ in range(3)]
        for series, idxs in zip(all_preds, pred_intervals):
            for i, component in enumerate(series):
                all_components[i].append(component[idxs])
        all_components = [np.concatenate(component) for component in all_components]
        col_names = (
            self._output_column,
            "trend-component",
            "seasonality-component",
        )
        df_data = {
            col_name: component
            for col_name, component in zip(col_names, all_components)
        }
    else:
        point_estimates = np.concatenate(
            [series[0][idxs] for series, idxs in zip(all_preds, pred_intervals)])
        df_data = {self._output_column: point_estimates}

    result_df = container.DataFrame(df_data, generate_metadata=True)
    result_df.metadata = result_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
    )
    return CallResult(result_df, has_finished=self._is_fit)
def _join_numeric_col(cls, left_df: container.DataFrame, left_col: str,
                      right_df: container.DataFrame, right_col: str,
                      accuracy: float) -> pd.DataFrame:
    # use d3mIndex from the left frame if present
    right_df = right_df.drop(columns='d3mIndex')

    # fuzzy match each left join col value against the right join col values and
    # save the results as the left dataframe index
    right_df[right_col] = pd.to_numeric(right_df[right_col])
    choices = right_df[right_col].unique()
    left_df[left_col] = pd.to_numeric(left_df[left_col])
    left_df.index = left_df[left_col].map(
        lambda x: cls._numeric_fuzzy_match(x, choices, accuracy))

    # make the right col the right dataframe index
    right_df = right_df.set_index(right_col)

    # inner join on the left / right indices
    joined = container.DataFrame(left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

    # sort on the d3m index if there, otherwise use the join column
    if 'd3mIndex' in joined:
        joined = joined.sort_values(by=['d3mIndex'])
    else:
        joined = joined.sort_values(by=[left_col])
    joined = joined.reset_index(drop=True)

    return joined
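# _numeric_fuzzy_match is not shown above; a minimal sketch of one plausible
# contract, mapping a value to the closest choice within a relative tolerance
# derived from `accuracy` (an assumption, for illustration only):
import numpy as np

def numeric_fuzzy_match(value, choices, accuracy):
    tolerance = abs(value) * (1.0 - accuracy)
    deltas = np.abs(np.asarray(choices, dtype=float) - value)
    best = int(np.argmin(deltas))
    return choices[best] if deltas[best] <= tolerance else np.nan

print(numeric_fuzzy_match(100.0, [90, 98, 150], accuracy=0.95))  # -> 98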
def produce(  # type: ignore
    self,
    *,
    inputs: Inputs,
    score_dataset: container.Dataset,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[Outputs]:
    outputs: typing.Dict[str, typing.List] = {
        'metric': [problem.PerformanceMetric.ACCURACY.name],
        'value': [1.0],
        'normalized': [1.0],
    }
    results = container.DataFrame(data=outputs, columns=list(outputs.keys()), generate_metadata=True)

    results.metadata = results.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey',
    )
    results.metadata = results.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 1),
        'https://metadata.datadrivendiscovery.org/types/Score',
    )
    results.metadata = results.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 2),
        'https://metadata.datadrivendiscovery.org/types/Score',
    )
    return base.CallResult(results)
def _split_aggregated(self, df: container.DataFrame, split_col_names: list) -> container.DataFrame:
    lengths = [len(df.loc[0, col_name]) for col_name in split_col_names]

    for idx, col_name in enumerate(split_col_names):
        if self._sorted_pipe_ids:
            if len(self._sorted_pipe_ids) == lengths[idx]:
                extend_col_names = [
                    "{}_{}".format(col_name, i) for i in self._sorted_pipe_ids
                ]
            else:
                raise ValueError(
                    "Unique number of pipeline ids not equal to the number of aggregated values")
        else:
            extend_col_names = [
                "{}_{}".format(col_name, i) for i in range(lengths[idx])
            ]

        extends = container.DataFrame(df.loc[:, col_name].values.tolist(), columns=extend_col_names)
        df = common_utils.horizontal_concat(left=df, right=extends)

        origin_metadata = dict(df.metadata.query((mbase.ALL_ELEMENTS, df.columns.get_loc(col_name))))
        for name in extend_col_names:
            col_idx = df.columns.get_loc(name)
            origin_metadata["name"] = name
            df.metadata = df.metadata.update((mbase.ALL_ELEMENTS, col_idx), origin_metadata)

    return df
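# A minimal pandas sketch of the list-column expansion performed above: a
# column whose cells hold equal-length lists becomes one column per element.
# Column names are illustrative.
import pandas as pd

df = pd.DataFrame({'scores': [[0.1, 0.9], [0.4, 0.6]]})
expanded = pd.DataFrame(df['scores'].values.tolist(),
                        columns=['scores_0', 'scores_1'])
df = pd.concat([df.drop(columns='scores'), expanded], axis=1)
print(df)  # columns: scores_0, scores_1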
class Hyperparams(hyperparams.Hyperparams):
    n_components = hyperparams.Hyperparameter[typing.Optional[int]](
        default=None,
        description='Number of components (< n_classes - 1) for dimensionality reduction.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
    )
    learning_rate = hyperparams.Uniform(
        lower=0.01,
        upper=2,
        default=0.1,
        description='Learning rate shrinks the contribution of each classifier by ``learning_rate``. '
                    'There is a trade-off between ``learning_rate`` and ``n_estimators``.',
        semantic_types=[
            'https://metadata.datadrivendiscovery.org/types/TuningParameter',
            'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter',
        ],
    )
    array1 = hyperparams.Hyperparameter[container.ndarray](
        default=container.ndarray(numpy.array([[1, 2], [3, 4]]), generate_metadata=True),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
    )
    array2 = hyperparams.Hyperparameter[container.DataFrame](
        default=container.DataFrame([[1, 2], [3, 4]], generate_metadata=True),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
    )
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    logger.debug(f"Producing {__name__}")

    if self._model is None:
        raise ValueError("No model available for primitive")

    result = self._model.predict(inputs)

    # create dataframe to hold the result, using the original saved target column name
    result_df = container.DataFrame({self._target_col: result}, generate_metadata=True)

    # mark the semantic types on the dataframe
    result_df.metadata = result_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
    )
    logger.debug(f"\n{result_df}")
    return base.CallResult(result_df)
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    logger.debug(f"Producing {__name__}")

    # extract and encode user and item columns
    inputs = inputs.iloc[:, [self.hyperparams["user_col"], self.hyperparams["item_col"]]]
    inputs = self._encode_labels(inputs)

    # predict ratings
    result = self._model.predict(inputs)

    # create dataframe to hold result
    result_df = container.DataFrame({self._target_col: result}, generate_metadata=True)

    # mark the semantic types on the dataframe
    result_df.metadata = result_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
    )
    logger.debug(f"\n{result_df}")
    return base.CallResult(result_df)
def produce(self, *, inputs_1: Inputs, inputs_2: Inputs, reference: Inputs,
            timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    xhat = inputs_1
    yhat = inputs_2

    matches = np.zeros(len(reference), dtype=int)
    for i in range(len(reference)):
        e_id = xhat.index[xhat['e_nodeID'] == reference['e_nodeID'].iloc[i]]
        g_id = yhat.index[yhat['g_nodeID'] == reference['g_nodeID'].iloc[i]]
        matches[i] = 1 if g_id == self._match[e_id] else 0
    reference['match'] = matches

    predictions = {
        "d3mIndex": reference['d3mIndex'],
        "match": reference['match'],
    }
    return base.CallResult(container.DataFrame(predictions), has_finished=True, iterations_done=1)
def time_columns(self, columns):
    [
        container.DataFrame(
            {str(i): [j for j in range(5, 10)]},
            columns=[str(i)],
            generate_metadata=False,
        )
        for i in range(int(columns / 2))
    ]
def time_dataframe(self, compact, columns):
    df = container.DataFrame(
        {str(i): [j for j in range(5)] for i in range(columns)},
        columns=[str(i) for i in range(columns)],
        generate_metadata=False,
    )
    df.metadata.generate(df, compact=compact)
def produce(self, *, inputs: container.List, timeout: float = None,
            iterations: int = None) -> CallResult[container.DataFrame]:
    logger.debug(f"Producing {__name__}")

    X_train, _, _ = inputs
    X_train = X_train.value

    result = self._model.predict(X_train)

    # create dataframe to hold d3mIndex and result
    result_df = container.DataFrame({
        X_train.index.name: X_train.index,
        self._target_col: result,
    })

    # mark the semantic types on the dataframe
    result_df.metadata = result_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        "https://metadata.datadrivendiscovery.org/types/PrimaryKey",
    )
    result_df.metadata = result_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 1),
        "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
    )
    return base.CallResult(result_df)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """Produce primitive's classifications for new time series data

    Arguments:
        inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

    Keyword Arguments:
        timeout {float} -- timeout, not considered (default: {None})
        iterations {int} -- iterations, not considered (default: {None})

    Raises:
        PrimitiveNotFittedError: if primitive not fit

    Returns:
        CallResult[Outputs] -- dataframe with a column containing a predicted class for each input time series
    """
    if not self._is_fit:
        raise PrimitiveNotFittedError("Primitive not fitted.")

    # instantiate classifier and load saved weights
    clf = generate_lstmfcn(
        self._ts_sz,
        self._n_classes,
        lstm_dim=self.hyperparams["lstm_dim"],
        attention=self.hyperparams["attention_lstm"],
        dropout=self.hyperparams["dropout_rate"],
    )
    clf.load_weights(self.hyperparams["weights_filepath"])

    # find column with ts value through metadata
    grouping_column = self._get_cols(inputs.metadata)
    n_ts = inputs.iloc[:, grouping_column[0]].nunique()
    ts_sz = inputs.shape[0] // n_ts
    attribute_col = self._get_value_col(inputs.metadata)
    x_vals = inputs.iloc[:, attribute_col].values.reshape(n_ts, 1, ts_sz)
    x_vals = tf.cast(x_vals, tf.float32)
    test_dataset = LSTMSequenceTest(x_vals, self.hyperparams["batch_size"])

    # make predictions
    preds = clf.predict(test_dataset)
    preds = self._label_encoder.inverse_transform(np.argmax(preds, axis=1))

    # create output frame
    result_df = container.DataFrame({self._output_columns[0]: preds}, generate_metadata=True)
    result_df.metadata = result_df.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
    )

    # okay to set to True because we have checked that the primitive has been fit
    return CallResult(result_df, has_finished=True)
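# A minimal sketch of the reshape above: a long attribute column holding n_ts
# series of length ts_sz, stacked row-wise, becomes an (n_ts, 1, ts_sz) array
# ready for the classifier. Values here are synthetic.
import numpy as np

n_ts, ts_sz = 3, 4
values = np.arange(n_ts * ts_sz, dtype=float)  # stand-in attribute column
x_vals = values.reshape(n_ts, 1, ts_sz)
print(x_vals.shape)  # (3, 1, 4)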
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> base.CallResult[Output]:
    output = self._prim_instance.produce(inputs=inputs)
    output = container.DataFrame(output.value, generate_metadata=True)
    return base.CallResult(output)
def produce_clusters(self, *, inputs: Inputs, timeout: float = None,
                     iterations: int = None) -> CallResult[container.pandas.DataFrame]:
    # generate the clusters
    self._get_columns(inputs)
    clusters = self._get_clusters(inputs)

    # generate the response df
    cluster_df = container.DataFrame(
        clusters,
        columns=('key', self.hyperparams['output_col_name']),
        generate_metadata=True,
    )
    return CallResult(cluster_df)