def generate_main_resources(random_state, resources, size):
    users_count = size
    posts_count = size * 10
    comments_count = size * 10

    user_ids = numpy.array(range(users_count))
    post_ids = numpy.array(range(posts_count))
    comment_ids = numpy.array(range(comments_count))

    users = container.DataFrame({
        'id': user_ids,
        'name': [f'User {i}' for i in range(users_count)],
    })

    posts = container.DataFrame({
        'id': post_ids,
        'author_id': pareto_choice(random_state, user_ids, posts_count),
        'post': [f'Post {i}' for i in range(posts_count)],
    })

    comments = container.DataFrame({
        'id': comment_ids,
        'post_id': pareto_choice(random_state, post_ids, comments_count),
        'author_id': pareto_choice(random_state, user_ids, comments_count),
        'comment': [f'Comment {i}' for i in range(comments_count)],
    })

    resources.update({'users': users, 'posts': posts, 'comments': comments})
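The pareto_choice helper used above is not part of this snippet. A minimal sketch, assuming it simply samples values with heavy-tailed (Pareto) weights so that a few users end up authoring most posts and comments (the shape parameter is an assumption, not taken from the original):

def pareto_choice(random_state, values, size, shape=1.0):
    # Draw heavy-tailed (Pareto) weights, normalize them into a probability
    # vector, and sample with replacement so a few values dominate.
    weights = random_state.pareto(shape, len(values)) + 1
    probabilities = weights / weights.sum()
    return random_state.choice(values, size=size, p=probabilities)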
Example #2
    def _create_string_merge_cols(
        cls,
        left_df: container.DataFrame,
        left_col: str,
        right_df: container.DataFrame,
        right_col: str,
        accuracy: float,
        index: int,
    ) -> pd.DataFrame:

        if accuracy < 1:
            left_keys = left_df[left_col].unique()
            right_keys = right_df[right_col].unique()
            matches: typing.Dict[str, typing.Optional[str]] = {}
            for left_key in left_keys:
                matches[left_key] = cls._string_fuzzy_match(
                    left_key, right_keys, accuracy * 100
                )
            new_left_df = container.DataFrame(
                {
                    "lefty_string"
                    + str(index): left_df[left_col].map(lambda key: matches[key])
                }
            )
        else:
            new_left_df = container.DataFrame(
                {"lefty_string" + str(index): left_df[left_col]}
            )
        return new_left_df
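The _string_fuzzy_match helper referenced above (and in the join snippets further down) is not included here. A minimal sketch, written as a plain function and using difflib as a stand-in for whatever matcher the original primitive uses:

import difflib
import typing

def _string_fuzzy_match(match: str, choices: typing.Sequence[str],
                        min_score: float) -> typing.Optional[str]:
    # Score every candidate (0-100) and keep the best one at or above the threshold.
    best_choice = None
    best_score = min_score
    for choice in choices:
        score = difflib.SequenceMatcher(None, match, choice).ratio() * 100
        if score >= best_score:
            best_score = score
            best_choice = choice
    return best_choice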
    def test_basic(self):
        main = container.DataFrame({
            'timestamp': [1, 2, 3, 4],
            'value': [0.32, 0.32, 0.31, 0.33],
        }, {
            'top_level': 'main',
        }, generate_metadata=True)

        self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{
            'selector': [],
            'metadata': {
                'top_level': 'main',
                'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
                'structural_type': 'd3m.container.pandas.DataFrame',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
                'dimension': {
                    'name': 'rows',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                    'length': 4,
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__'],
            'metadata': {
                'dimension': {
                    'name': 'columns',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                    'length': 2,
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', 0],
            'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'},
        }, {
            'selector': ['__ALL_ELEMENTS__', 1],
            'metadata': {'structural_type': 'numpy.float64', 'name': 'value'},
        }])

        hyperparams_class = HoltWintersExponentialSmoothing.HoltWintersExponentialSmoothing.metadata.get_hyperparams()
        primitive = HoltWintersExponentialSmoothing.HoltWintersExponentialSmoothing(hyperparams=hyperparams_class.defaults())
        output_main = primitive.produce(inputs=main).value
        output_main = round(output_main, 2)

        expected_result = container.DataFrame(data={'timestamp': [1, 2, 3, 4], 'value': [0.32, 0.32, 0.31, 0.32]})

        self.assertEqual(
            output_main[['timestamp', 'value_holt_winters_smoothing']].values.tolist(),
            expected_result[['timestamp', 'value']].values.tolist(),
        )

        params = primitive.get_params()
        primitive.set_params(params=params)
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Gaussian classification (i.e. seeded gaussian "clustering").

        Inputs
            D - An n x d feature numpy array
        Returns
            labels - Class labels for each unlabeled vertex
        """

        if not self._fitted:
            raise ValueError("Not fitted")

        n = self._embedding.shape[0]

        unique_labels = np.unique(self._labels)
        K = len(unique_labels)

        testing = inputs[2]

        try:
            testing_nodeIDs = np.asarray(testing['G1.nodeID'])
        except KeyError:
            testing_nodeIDs = np.asarray(testing['nodeID'])
        final_labels = np.zeros(len(testing))

        if self._PD and self._ENOUGH_SEEDS:
            for i in range(len(testing_nodeIDs)):
                temp = np.where(self._nodeIDs == int(testing_nodeIDs[i]))[0][0]
                weighted_pdfs = np.array([self._pis[j]*MVN.pdf(self._embedding[temp,:], self._means[j], self._covariances[j, :, :]) for j in range(K)])
                label = np.argmax(weighted_pdfs)
                final_labels[i] = int(label)
        else:

            for i in range(len(testing_nodeIDs)):
                temp = np.where(self._nodeIDs == int(testing_nodeIDs[i]))[0][0]
                try:
                    weighted_pdfs = np.array([self._pis[j]*MVN.pdf(self._embedding[temp,:], self._means[j], self._covariances) for j in range(K)])
                except (ValueError, np.linalg.LinAlgError):
                    # regularize a (near-)singular shared covariance and retry
                    self._covariances = self._covariances + np.ones(self._covariances.shape)*0.00001
                    weighted_pdfs = np.array([self._pis[j]*MVN.pdf(self._embedding[temp,:], self._means[j], self._covariances) for j in range(K)])
                label = np.argmax(weighted_pdfs)
                final_labels[i] = int(label)

        if self._problem == "VN":
            testing['classLabel'] = final_labels
            outputs = container.DataFrame(testing[['d3mIndex','classLabel']])
            outputs[['d3mIndex', 'classLabel']] = outputs[['d3mIndex', 'classLabel']].astype(int)
        else:
            testing['community'] = final_labels
            outputs = container.DataFrame(testing[['d3mIndex', 'community']])
            outputs[['d3mIndex', 'community']] = outputs[['d3mIndex', 'community']].astype(int)

        return base.CallResult(outputs)
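As a standalone illustration of the classification rule applied above (pick the component that maximizes the prior-weighted Gaussian density), using scipy's multivariate_normal as MVN on toy data:

import numpy as np
from scipy.stats import multivariate_normal as MVN

# Toy mixture: two 2-D Gaussian components with equal priors.
pis = np.array([0.5, 0.5])
means = np.array([[0.0, 0.0], [5.0, 5.0]])
covariances = np.array([np.eye(2), np.eye(2)])

x = np.array([4.5, 5.2])  # embedding vector to classify
weighted_pdfs = np.array([
    pis[j] * MVN.pdf(x, means[j], covariances[j]) for j in range(len(pis))
])
label = int(np.argmax(weighted_pdfs))  # -> 1, the component centred at (5, 5)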
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:

        index_col = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/PrimaryKey"
            ])
        if not index_col:
            warnings.warn(
                "Did not find primary key column. Cannot vote; returning original input.")
            return CallResult(inputs)

        predict_target_col = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/PredictedTarget"
            ])
        if not predict_target_col:
            warnings.warn(
                "Did not find PredictedTarget column. Cannot vote; returning original input."
            )
            return CallResult(inputs)

        df = inputs.copy()
        new_df = self._get_index_and_target_df(inputs=df,
                                               use_cols=index_col +
                                               predict_target_col)

        if self.hyperparams["ensemble_method"] == 'majority':
            groupby_df = new_df.groupby([
                new_df.columns[pos] for pos in index_col
            ]).agg(lambda x: x.value_counts().index[0]).reset_index(drop=False)
            ret_df = container.DataFrame(groupby_df)
            ret_df.metadata = new_df.metadata

        if self.hyperparams["ensemble_method"] == 'max':
            groupby_df = new_df.groupby([
                new_df.columns[pos] for pos in index_col
            ]).max().reset_index(drop=False)
            ret_df = container.DataFrame(groupby_df)
            ret_df.metadata = new_df.metadata

        if self.hyperparams["ensemble_method"] == 'min':
            groupby_df = new_df.groupby([
                new_df.columns[pos] for pos in index_col
            ]).min().reset_index(drop=False)
            ret_df = container.DataFrame(groupby_df)
            ret_df.metadata = new_df.metadata

        return CallResult(self._update_metadata(df=ret_df))
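As a small standalone example of the 'majority' aggregation used above: the lambda keeps the most frequent prediction per primary key because value_counts() sorts by frequency (plain pandas is used here to keep the example self-contained):

import pandas as pd

df = pd.DataFrame({
    'd3mIndex': [0, 0, 0, 1, 1, 1],
    'prediction': ['a', 'a', 'b', 'c', 'b', 'c'],
})
majority = (df.groupby(['d3mIndex'])
              .agg(lambda x: x.value_counts().index[0])
              .reset_index(drop=False))
# d3mIndex 0 -> 'a', d3mIndex 1 -> 'c'
print(majority)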
Example #6
 def setup(self, columns):
     self.large_dataframe_with_many_columns = container.DataFrame(
         {str(i): [j for j in range(5)]
          for i in range(columns)},
         columns=[str(i) for i in range(columns)],
         generate_metadata=True)
     self.list_of_many_dataframe_columns = [
         container.DataFrame({str(i): [j for j in range(5, 10)]},
                             columns=[str(i)],
                             generate_metadata=True)
         for i in range(int(columns / 2))
     ]
def generate_learning_data_has_user_made_comment_on_post(
        random_state, resources):
    user_ids = resources['users'].loc[:, 'id']
    post_ids = resources['posts'].loc[:, 'id']
    users_count = len(user_ids)
    comments = resources['comments']

    authors_and_posts = comments.loc[:, ['author_id', 'post_id']]

    authors_and_posts_set = set(
        authors_and_posts.itertuples(index=False, name=None))

    data = {
        'user_id': [],
        'post_id': [],
        'made_comment': [],
    }

    for author_id, post_id in authors_and_posts.sample(
            n=users_count, random_state=random_state).itertuples(index=False,
                                                                 name=None):
        data['user_id'].append(author_id)
        data['post_id'].append(post_id)
        data['made_comment'].append('yes')

    for user_id in random_state.permutation(user_ids):
        for post_id in random_state.permutation(post_ids):
            if (user_id, post_id) in authors_and_posts_set:
                continue

            data['user_id'].append(user_id)
            data['post_id'].append(post_id)
            data['made_comment'].append('no')

            if len(data['user_id']) == 2 * users_count:
                break

        if len(data['user_id']) == 2 * users_count:
            break

    assert len(data['user_id']) == 2 * users_count

    data = container.DataFrame(data)
    data = data.sample(frac=1.0,
                       random_state=random_state).reset_index(drop=True)

    index = container.DataFrame({
        'd3mIndex': numpy.array(range(len(data))),
    })

    resources['learningData'] = container.DataFrame(
        pandas.concat([index, data], axis=1))
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        :param inputs: assume the first column is the filename
        :param timeout:
        :param iterations:
        :return:
        """
        features = []
        # TODO consider a more robust means to 1) get location_base_uris and remove file://
        media_root_dir = inputs.metadata.query(
            (0, 0))['location_base_uris'][0][len('file://'):]  # remove file://
        for filename in inputs.iloc[:, 0]:
            file_path = os.path.join(media_root_dir, filename)
            if os.path.isfile(file_path):
                video = self._read_fileuri(
                    file_path
                )  # video is a ndarray of F x H x W x C, e.g. (408, 240, 320, 3)
                feature = self._generate_vid_feature(video)
            else:
                self.logger.warning(
                    "No such file {}. Feature vector will be set to all zeros."
                    .format(file_path))
                feature = np.zeros(2048)
            features.append(feature)

        results = container.DataFrame(features, generate_metadata=True)

        return base.CallResult(results)
Example #9
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> CallResult[container.DataFrame]:
        logger.debug(f"Producing {__name__}")

        # create dataframe to hold d3mIndex and result
        result = self._model.predict(self._format_text(inputs))
        df = pd.DataFrame(result)

        # pipeline run saving is now getting fussy about the prediction names
        # matching the original target column name
        df.columns = self._target_col_names

        # if we mapped values earlier map them back.
        if self._label_map:
            df.replace(self._label_map, inplace=True)
        result_df = container.DataFrame(df, generate_metadata=True)

        # mark the semantic types on the dataframe
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )

        return base.CallResult(result_df)
Example #10
    def _join_datetime_col(cls,
                           left_df: container.DataFrame,
                           left_col: str,
                           right_df: container.DataFrame,
                           right_col: str,
                           accuracy: float) -> pd.DataFrame:
        # use d3mIndex from left col if present
        right_df = right_df.drop(columns='d3mIndex')

        # compute a tolerance delta for time matching based on a percentage of the minimum left/right time
        # range
        choices = np.array([np.datetime64(parser.parse(dt)) for dt in right_df[right_col].unique()])
        left_keys = np.array([np.datetime64(parser.parse(dt)) for dt in left_df[left_col].values])
        time_tolerance = (1.0 - accuracy) * cls._compute_time_range(left_keys, choices)
        
        left_df.index = np.array([cls._datetime_fuzzy_match(dt, choices, time_tolerance) for dt in left_keys])

        # make the right col the right dataframe index
        right_df = right_df.set_index(right_col)

        # inner join on the left / right indices
        joined = container.DataFrame(left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

        # sort on the d3m index if there, otherwise use the joined column
        if 'd3mIndex' in joined:
            joined = joined.sort_values(by=['d3mIndex'])
        else:
            joined = joined.sort_values(by=[left_col])
        joined = joined.reset_index(drop=True)

        return joined
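Neither _compute_time_range nor _datetime_fuzzy_match appears in this snippet. A minimal sketch, assuming the former returns the smaller of the two time spans (matching the comment above) and the latter snaps each timestamp to the nearest candidate within the tolerance, or None when nothing is close enough:

import typing
import numpy as np

def _compute_time_range(left_keys: np.ndarray, choices: np.ndarray) -> np.timedelta64:
    # Use the smaller of the two spans so the derived tolerance stays conservative.
    left_span = left_keys.max() - left_keys.min()
    right_span = choices.max() - choices.min()
    return min(left_span, right_span)

def _datetime_fuzzy_match(dt: np.datetime64, choices: np.ndarray,
                          tolerance) -> typing.Optional[np.datetime64]:
    # Pick the closest candidate; only accept it if it falls within the tolerance.
    deltas = np.abs(choices - dt)
    best = int(np.argmin(deltas))
    return choices[best] if deltas[best] <= tolerance else None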
Example #11
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Apply neural network-based feature extraction to image_tensor"""

        self._lazy_init()

        image_tensor = inputs[1]
        image_d3mIndex = inputs[0]

        if len(image_tensor.shape) != 4:
            raise ValueError('Expected image tensor to have 4 dimensions')

        resized = False
        if self._resize_data:
            # resize anything that is not already 224 x 224, the input size the network expects
            if not (image_tensor.shape[1] == 224
                    and image_tensor.shape[2] == 224):
                resized = True
                y = np.empty((image_tensor.shape[0], 224, 224, 3))
                for index in range(image_tensor.shape[0]):
                    y[index] = imresize(image_tensor[index], (224, 224))
                image_tensor = y

        # preprocess() modifies the data. For now just copy the data.
        if self._preprocess_data:
            if resized:
                # Okay to modify image_tensor in place, since it is already a
                # copy created by the resizing step above.
                data = image_tensor
            else:
                data = image_tensor.copy()
            self._preprocess(data)
        else:
            data = image_tensor
        # Bug fix: run prediction inside the stored graph so a TA3 system can call this primitive multiple times
        with self._graph.as_default():
            output_ndarray = self._model.predict(data)
        output_ndarray = output_ndarray.reshape(output_ndarray.shape[0], -1)
        output_dataFrame = container.DataFrame(
            container.ndarray(output_ndarray))

        # if generate_metadata is true, update the metadata
        if self.hyperparams["generate_metadata"]:
            for each_column in range(output_ndarray.shape[1]):
                metadata_selector = (mbase.ALL_ELEMENTS, each_column)
                metadata_each_column = {
                    'semantic_types':
                    ('https://metadata.datadrivendiscovery.org/types/TabularColumn',
                     'https://metadata.datadrivendiscovery.org/types/Attribute'
                     )
                }
                output_dataFrame.metadata = output_dataFrame.metadata.update(
                    metadata=metadata_each_column, selector=metadata_selector)
        # update the original index to be d3mIndex
        output_dataFrame = output_dataFrame.set_index(image_d3mIndex)
        self._has_finished = True
        self._iterations_done = True
        return CallResult(output_dataFrame, self._has_finished,
                          self._iterations_done)
Example #12
    def _get_predictions(self, *, permutation_matrix: np.matrix, inputs: Inputs):
        testing = inputs['2']

        threshold = self.hyperparams['threshold']

        for i in range(testing.shape[0]):
            v1 = testing['G1.nodeID'][i]
            v2 = testing['G2.nodeID'][i]

            # map each node ID to its row / column index in the permutation matrix
            found = False
            j = 0
            while not found:
                if self._g1_node_attributes[j] == int(v1):
                    found = True
                    v1 = j
                j += 1

            found = False
            j = 0
            while not found:
                if self._g2_node_attributes[j] == int(v2):
                    found = True
                    v2 = j
                j += 1

            # mark the pair as a match when its permutation score clears the threshold
            testing.loc[i, 'match'] = 1 if permutation_matrix[v1, v2] > threshold else 0

        df = container.DataFrame({"d3mIndex": testing['d3mIndex'], "match": testing['match']})
        return df
Example #13
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        logger.debug(f"Producing {__name__}")

        if len(self._cols) == 0:
            return base.CallResult(inputs)

        numerical_inputs = inputs.iloc[:, self._cols]
        k_means = KMeans(n_clusters=self.hyperparams["n_clusters"],
                         random_state=self.random_seed)
        result = k_means.fit_predict(numerical_inputs)
        result_df = container.DataFrame(
            {self.hyperparams["cluster_col_name"]: result},
            generate_metadata=True)
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )

        return base.CallResult(result_df)
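A tiny standalone run of the same clustering pattern, assuming scikit-learn's KMeans: fit_predict returns one integer label per row, which is then wrapped as a single-column frame (plain pandas here for self-containment):

import pandas as pd
from sklearn.cluster import KMeans

numerical_inputs = pd.DataFrame({'x': [0.0, 0.1, 5.0, 5.2], 'y': [0.0, 0.2, 5.1, 4.9]})
k_means = KMeans(n_clusters=2, random_state=0, n_init=10)
labels = k_means.fit_predict(numerical_inputs)
result_df = pd.DataFrame({'cluster': labels})
print(result_df)  # two rows land in one cluster, two in the other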
Example #14
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:

        if self._needs_fit:
            self.fit()

        result = self._model.predict(inputs)

        result_df = container.DataFrame(
            {
                "outlier_label": result,
            },
            generate_metadata=True,
        )
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )

        return base.CallResult(result_df)
Example #15
    def _get_inputs(self, problem, rinputs):
        inputs = []

        for ip in rinputs:
            dataset = None
            if ip.HasField("dataset_uri"):
                dataset = D3MDatasetLoader().load(ip.dataset_uri)
            elif ip.HasField("csv_uri"):
                data = pd.read_csv(
                    ip.csv_uri,
                    dtype=str,
                    header=0,
                    na_filter=False,
                    encoding='utf8',
                    low_memory=False,
                )
                dataset = container.DataFrame(data)

            logging.critical("Problem %s", problem)
            if len(problem.inputs) > 0:
                targets = problem.inputs[0].targets
                dataset = util.add_target_metadata(dataset, targets)
                dataset = util.add_privileged_metadata(
                    dataset, problem.inputs[0].privileged_data)
            inputs.append(dataset)

        return inputs
Example #16
    def _join_string_col(cls, left_df: container.DataFrame, left_col: str,
                         right_df: container.DataFrame, right_col: str,
                         accuracy: float) -> pd.DataFrame:
        # use d3mIndex from left col if present
        right_df = right_df.drop(columns='d3mIndex')

        # pre-compute fuzzy matches
        left_keys = left_df[left_col].unique()
        right_keys = right_df[right_col].unique()
        matches: typing.Dict[str, typing.Optional[str]] = {}
        for left_key in left_keys:
            matches[left_key] = cls._string_fuzzy_match(
                left_key, right_keys, accuracy * 100)

        # look up pre-computed fuzzy match for each element in the left column
        left_df.index = left_df[left_col].map(lambda key: matches[key])

        # make the right col the right dataframe index
        right_df = right_df.set_index(right_col)

        # inner join on the left / right indices
        joined = container.DataFrame(
            left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

        # sort on the d3m index if there, otherwise use the joined column
        if 'd3mIndex' in joined:
            joined = joined.sort_values(by=['d3mIndex'])
        else:
            joined = joined.sort_values(by=[left_col])
        joined = joined.reset_index(drop=True)

        return joined
Example #17
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Produce primitive's predictions for specific time series at specific future time instances
        * these specific timesteps / series are specified implicitly by input dataset

        Arguments:
            inputs {Inputs} -- D3M dataframe containing attributes

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- (N, 2) dataframe with d3m_index and value for each prediction slice requested.
                prediction slice = specific horizon idx for specific series in specific regression
        """
        all_preds, pred_intervals = self._produce(inputs)

        if self.hyperparams["interpretable"]:
            all_components = [[] for c in range(3)]
            for series, idxs in zip(all_preds, pred_intervals):
                for i, component in enumerate(series):
                    all_components[i].append(component[idxs])
            all_components = [
                np.concatenate(component) for component in all_components
            ]

            col_names = (
                self._output_column,
                "trend-component",
                "seasonality-component",
            )
            df_data = {
                col_name: component
                for col_name, component in zip(col_names, all_components)
            }

        else:
            point_estimates = np.concatenate([
                series[0][idxs]
                for series, idxs in zip(all_preds, pred_intervals)
            ])
            df_data = {self._output_column: point_estimates}

        result_df = container.DataFrame(
            df_data,
            generate_metadata=True,
        )

        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )
        return CallResult(result_df, has_finished=self._is_fit)
Example #18
    def _join_numeric_col(cls, left_df: container.DataFrame, left_col: str,
                          right_df: container.DataFrame, right_col: str,
                          accuracy: float) -> pd.DataFrame:
        # use d3mIndex from left col if present
        right_df = right_df.drop(columns='d3mIndex')

        # fuzzy match each of the left join col against the right join col value and save the results as the left
        # dataframe index
        right_df[right_col] = pd.to_numeric(right_df[right_col])
        choices = right_df[right_col].unique()
        left_df[left_col] = pd.to_numeric(left_df[left_col])
        left_df.index = left_df[left_col]. \
            map(lambda x: cls._numeric_fuzzy_match(x, choices, accuracy))

        # make the right col the right dataframe index
        right_df = right_df.set_index(right_col)

        # inner join on the left / right indices
        joined = container.DataFrame(
            left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

        # sort on the d3m index if there, otherwise use the joined column
        if 'd3mIndex' in joined:
            joined = joined.sort_values(by=['d3mIndex'])
        else:
            joined = joined.sort_values(by=[left_col])
        joined = joined.reset_index(drop=True)

        return joined
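A minimal sketch of the _numeric_fuzzy_match helper used above, assuming accuracy is treated as the fraction of the value's magnitude within which the closest candidate still counts as a match (the exact tolerance rule in the original may differ):

import typing
import numpy as np

def _numeric_fuzzy_match(value: float, choices: np.ndarray,
                         accuracy: float) -> typing.Optional[float]:
    # Accept the closest candidate whose distance from the value lies within the
    # (1 - accuracy) tolerance band; otherwise report no match.
    tolerance = abs(value) * (1.0 - accuracy)
    deltas = np.abs(choices - value)
    best = int(np.argmin(deltas))
    return float(choices[best]) if deltas[best] <= tolerance else None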
Example #19
    def produce(  # type: ignore
        self,
        *,
        inputs: Inputs,
        score_dataset: container.Dataset,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[Outputs]:
        outputs: typing.Dict[str, typing.List] = {
            'metric': [problem.PerformanceMetric.ACCURACY.name],
            'value': [1.0],
            'normalized': [1.0],
        }

        results = container.DataFrame(data=outputs,
                                      columns=list(outputs.keys()),
                                      generate_metadata=True)

        results.metadata = results.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey',
        )
        results.metadata = results.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 1),
            'https://metadata.datadrivendiscovery.org/types/Score',
        )
        results.metadata = results.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 2),
            'https://metadata.datadrivendiscovery.org/types/Score',
        )

        return base.CallResult(results)
Example #20
    def _split_aggregated(self, df: container.DataFrame,
                          split_col_names: list) -> container.DataFrame:
        lengths = [len(df.loc[0, col_name]) for col_name in split_col_names]

        for idx, col_name in enumerate(split_col_names):
            if self._sorted_pipe_ids:
                if len(self._sorted_pipe_ids) == lengths[idx]:
                    extend_col_names = [
                        "{}_{}".format(col_name, i)
                        for i in self._sorted_pipe_ids
                    ]
                else:
                    raise ValueError(
                        "Unique number of pipeline ids not equal to the number of aggregated values"
                    )
            else:
                extend_col_names = [
                    "{}_{}".format(col_name, i) for i in range(lengths[idx])
                ]

            extends = container.DataFrame(df.loc[:, col_name].values.tolist(),
                                          columns=extend_col_names)

            df = common_utils.horizontal_concat(left=df, right=extends)
            origin_metadata = dict(
                df.metadata.query(
                    (mbase.ALL_ELEMENTS, df.columns.get_loc(col_name))))

            for name in extend_col_names:
                col_idx = df.columns.get_loc(name)
                origin_metadata["name"] = name
                df.metadata = df.metadata.update((mbase.ALL_ELEMENTS, col_idx),
                                                 origin_metadata)

        return df
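As a standalone illustration of the expansion step above: feeding the list-valued column's values.tolist() back into a DataFrame constructor spreads each element into its own column (plain pandas used here for brevity):

import pandas as pd

df = pd.DataFrame({'preds': [[0.1, 0.9], [0.7, 0.3]]})
extended = pd.DataFrame(df['preds'].values.tolist(),
                        columns=['preds_0', 'preds_1'])
#    preds_0  preds_1
# 0      0.1      0.9
# 1      0.7      0.3
print(extended)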
 class Hyperparams(hyperparams.Hyperparams):
     n_components = hyperparams.Hyperparameter[typing.Optional[int]](
         default=None,
         description=
         'Number of components (< n_classes - 1) for dimensionality reduction.',
         semantic_types=[
             'https://metadata.datadrivendiscovery.org/types/TuningParameter'
         ],
     )
     learning_rate = hyperparams.Uniform(
         lower=0.01,
         upper=2,
         default=0.1,
         description=
         'Learning rate shrinks the contribution of each classifier by ``learning_rate``. There is a trade-off between ``learning_rate`` and ``n_estimators``.',
         semantic_types=[
             'https://metadata.datadrivendiscovery.org/types/TuningParameter',
             'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter',
         ],
     )
     array1 = hyperparams.Hyperparameter[container.ndarray](
         default=container.ndarray(numpy.array([[1, 2], [3, 4]]),
                                   generate_metadata=True),
         semantic_types=[
             'https://metadata.datadrivendiscovery.org/types/TuningParameter'
         ],
     )
     array2 = hyperparams.Hyperparameter[container.DataFrame](
         default=container.DataFrame([[1, 2], [3, 4]],
                                     generate_metadata=True),
         semantic_types=[
             'https://metadata.datadrivendiscovery.org/types/TuningParameter'
         ],
     )
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        logger.debug(f"Producing {__name__}")

        # create dataframe to hold result
        if self._model is None:
            raise ValueError("No model available for primitive")
        result = self._model.predict(inputs)

        # use the original saved target column name
        result_df = container.DataFrame({self._target_col: result},
                                        generate_metadata=True)

        # mark the semantic types on the dataframe
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )

        logger.debug(f"\n{result_df}")

        return base.CallResult(result_df)
Example #23
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        logger.debug(f"Producing {__name__}")
        # extract and encode user and item columns
        inputs = inputs.iloc[:, [
            self.hyperparams["user_col"], self.hyperparams["item_col"]
        ]]
        inputs = self._encode_labels(inputs)

        # predict ratings
        result = self._model.predict(inputs)
        # create dataframe to hold result
        result_df = container.DataFrame({self._target_col: result},
                                        generate_metadata=True)

        # mark the semantic types on the dataframe
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )

        logger.debug(f"\n{result_df}")
        return base.CallResult(result_df)
    def produce(self,
                *,
                inputs_1: Inputs,
                inputs_2: Inputs,
                reference: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        xhat = inputs_1
        yhat = inputs_2

        matches = np.zeros(len(reference), dtype=int)
        for i in range(len(reference)):
            e_id = xhat.index[xhat['e_nodeID'] ==
                              reference['e_nodeID'].iloc[i]]
            g_id = yhat.index[yhat['g_nodeID'] ==
                              reference['g_nodeID'].iloc[i]]
            matches[i] = 1 if g_id == self._match[e_id] else 0

        reference['match'] = matches

        predictions = {
            "d3mIndex": reference['d3mIndex'],
            "match": reference['match']
        }
        return base.CallResult(container.DataFrame(predictions),
                               has_finished=True,
                               iterations_done=1)
Example #25
 def time_columns(self, columns):
     [
         container.DataFrame({str(i): [j for j in range(5, 10)]},
                             columns=[str(i)],
                             generate_metadata=False)
         for i in range(int(columns / 2))
     ]
Example #26
 def time_dataframe(self, compact, columns):
     df = container.DataFrame(
         {str(i): [j for j in range(5)]
          for i in range(columns)},
         columns=[str(i) for i in range(columns)],
         generate_metadata=False)
     df.metadata.generate(df, compact=compact)
Example #27
    def produce(self,
                *,
                inputs: container.List,
                timeout: float = None,
                iterations: int = None) -> CallResult[container.DataFrame]:
        logger.debug(f"Producing {__name__}")

        X_train, _, _ = inputs
        X_train = X_train.value
        result = self._model.predict(X_train)

        # create dataframe to hold d3mIndex and result
        result_df = container.DataFrame({
            X_train.index.name: X_train.index,
            self._target_col: result
        })

        # mark the semantic types on the dataframe
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            "https://metadata.datadrivendiscovery.org/types/PrimaryKey",
        )
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 1),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )

        return base.CallResult(result_df)
Example #28
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Produce primitive's classifications for new time series data

        Arguments:
            inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- dataframe with a column containing a predicted class
                for each input time series
        """

        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # instantiate classifier and load saved weights
        clf = generate_lstmfcn(
            self._ts_sz,
            self._n_classes,
            lstm_dim=self.hyperparams["lstm_dim"],
            attention=self.hyperparams["attention_lstm"],
            dropout=self.hyperparams["dropout_rate"],
        )
        clf.load_weights(self.hyperparams["weights_filepath"])

        # find column with ts value through metadata
        grouping_column = self._get_cols(inputs.metadata)

        n_ts = inputs.iloc[:, grouping_column[0]].nunique()
        ts_sz = inputs.shape[0] // n_ts
        attribute_col = self._get_value_col(inputs.metadata)
        x_vals = inputs.iloc[:, attribute_col].values.reshape(n_ts, 1, ts_sz)
        x_vals = tf.cast(x_vals, tf.float32)
        test_dataset = LSTMSequenceTest(x_vals, self.hyperparams["batch_size"])

        # make predictions
        preds = clf.predict(test_dataset)
        preds = self._label_encoder.inverse_transform(np.argmax(preds, axis=1))

        # create output frame
        result_df = container.DataFrame({self._output_columns[0]: preds},
                                        generate_metadata=True)
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )

        # ok to set to True because we have checked that primitive has been fit
        return CallResult(result_df, has_finished=True)
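As a standalone illustration of the reshape above: assuming the input rows are ordered so that each series' values are contiguous, the single attribute column reshapes into (n_series, 1, series_length):

import numpy as np

# 2 series of length 4, stored long-format in one attribute column (grouped by series)
values = np.arange(8, dtype=float)
n_ts, ts_sz = 2, 4
x_vals = values.reshape(n_ts, 1, ts_sz)
print(x_vals.shape)  # (2, 1, 4)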
 def produce(self,
             *,
             inputs: Input,
             timeout: float = None,
             iterations: int = None) -> base.CallResult[Output]:
     output = self._prim_instance.produce(inputs=inputs)
     output = container.DataFrame(output.value, generate_metadata=True)
     return base.CallResult(output)
Example #30
    def produce_clusters(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[container.pandas.DataFrame]:
        # generate the clusters
        self._get_columns(inputs)
        clusters = self._get_clusters(inputs)

        # generate the response df
        cluster_df = container.DataFrame(clusters, columns=('key', self.hyperparams['output_col_name']), generate_metadata=True)
        return CallResult(cluster_df)
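A small self-contained illustration of the construction above: passing a list of (key, label) pairs together with a columns tuple yields the two-column response frame (plain pandas in place of container.DataFrame):

import pandas as pd

clusters = [('a', 0), ('b', 0), ('c', 1)]
cluster_df = pd.DataFrame(clusters, columns=('key', 'cluster_labels'))
print(cluster_df)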