Example #1
    def test_pickle(self):
        # This test is not very useful anymore: the primitive no longer keeps random state,
        # so outputs depend only on the inputs and the random seed, not on previous calls to "produce".

        hyperparams_class = RandomPrimitive.metadata.get_hyperparams()

        primitive = RandomPrimitive(random_seed=42, hyperparams=hyperparams_class.defaults())

        inputs = container.List(list(range(4)), generate_metadata=True)

        call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)

        self.assertTrue(numpy.allclose(call_metadata.value.values, container.ndarray([0.496714153011, -0.138264301171, 0.647688538101, 1.52302985641]).reshape(4, 1)))

        pickled_primitive = pickle.dumps(primitive)

        inputs = container.List(list(range(4, 8)), generate_metadata=True)

        call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)

        self.assertTrue(numpy.allclose(call_metadata.value.values, container.ndarray([-0.23415337, -0.23413696, 1.57921282, 0.76743473]).reshape(4, 1)))

        unpickled_primitive = pickle.loads(pickled_primitive)

        call_metadata = self.call_primitive(unpickled_primitive, 'produce', inputs=inputs)

        self.assertTrue(numpy.allclose(call_metadata.value.values, container.ndarray([-0.23415337, -0.23413696, 1.57921282, 0.76743473]).reshape(4, 1)))
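The property this test relies on, outputs determined entirely by the random seed and the inputs rather than by call history, can be sketched with a toy class (a hypothetical StatelessRandom, not part of d3m):

import pickle

import numpy as np


class StatelessRandom:
    """Toy stand-in: every call re-seeds, so results depend only on seed and inputs."""

    def __init__(self, random_seed):
        self.random_seed = random_seed

    def produce(self, inputs):
        inputs = list(inputs)
        # Regenerate the stream from scratch and index it by input position,
        # so call history (and therefore pickling) cannot change the answer.
        draws = np.random.default_rng(self.random_seed).standard_normal(max(inputs) + 1)
        return draws[inputs]


primitive = StatelessRandom(42)
before = primitive.produce(range(4, 8))
restored = pickle.loads(pickle.dumps(primitive))
assert np.allclose(before, restored.produce(range(4, 8)))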
Example #2
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        np.random.seed(1234)

        G = inputs[0].copy()

        # "inputs" may optionally carry a link-prediction flag at index 3.
        try:
            link_prediction = inputs[3]
            if not isinstance(link_prediction, bool):
                link_prediction = False
        except (IndexError, KeyError):
            link_prediction = False

        if link_prediction:
            g = np.array(G.copy())
        else:
            g = graspyPTR(G)

        n = g.shape[0]

        max_dimension = self.hyperparams['max_dimension']

        if max_dimension > n:
            max_dimension = n

        n_elbows = self.hyperparams['which_elbow']

        if self.hyperparams['use_attributes']:
            adj = [g]
            MORE_ATTR = True
            attr_number = 1
            while MORE_ATTR:
                temp_attr = np.array(list(networkx.get_node_attributes(G, 'attr' + str(attr_number)).values()))
                if len(temp_attr) == 0:
                    MORE_ATTR = False
                else:
                    # Pairwise squared distances between scalar node attributes.
                    K = np.sum((temp_attr[:, np.newaxis][:, np.newaxis, :] - temp_attr[:, np.newaxis][np.newaxis, :, :])**2, axis=-1)
                    adj.append(graspyPTR(K))
                    attr_number += 1
            M = len(adj)
            
            if M > 1:
                omni_object = graspyOMNI(n_components=max_dimension, n_elbows=n_elbows)
                X_hats = omni_object.fit_transform(adj)
                X_hat = np.mean(X_hats, axis=0)

                embedding = X_hat.copy()

                inputs[0] = container.ndarray(embedding)

                return base.CallResult(inputs)

        ase_object = graspyASE(n_components=max_dimension, n_elbows=n_elbows)
        X_hat = ase_object.fit_transform(g)

        inputs[0] = container.ndarray(X_hat)

        return base.CallResult(inputs)
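A minimal standalone sketch of the same pass-to-ranks plus adjacency spectral embedding pipeline, assuming the graspyPTR/graspyASE aliases come from graspy (now graspologic):

import numpy as np
from graspy.embed import AdjacencySpectralEmbed as graspyASE
from graspy.utils import pass_to_ranks as graspyPTR

rng = np.random.default_rng(0)
A = (rng.random((50, 50)) < 0.2).astype(float)
A = np.triu(A, 1)
A = A + A.T  # symmetric, hollow adjacency matrix

g = graspyPTR(A)  # pass-to-ranks regularization of the edge weights
X_hat = graspyASE(n_components=4).fit_transform(g)
print(X_hat.shape)  # (50, 4)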
Example #3
    def test_regularization(self):
        # Generate data, well-posed problem with nSamples > nFeatures
        np.random.seed(0)
        nSamples = 10
        nFeatures = 5
        true_coef, inputs, outputs = generate_linear_data(nSamples, nFeatures)

        # Test fitting with default hyperparams
        hp = OWLHyperparams(OWLHyperparams.defaults())
        primitive = OWLRegression(hyperparams=hp)
        primitive.set_training_data(inputs=inputs, outputs=outputs)
        primitive.fit()
        ps = primitive.get_params()
        self.assertTrue(np.all(ps['coef'] == primitive._coef))
        self.assertTrue(ps['intercept'] == primitive._intercept)
        self.assertTrue(ps['fitted'])

        relative_error = np.linalg.norm(ps['coef'] -
                                        true_coef) / np.linalg.norm(true_coef)
        #print("relative_error = {}".format(relative_error))
        self.assertTrue(relative_error < 0.2)
        self.assertTrue(np.abs(ps['intercept']) < 0.1)

        # Test fitting with customized hyperparams: OSCAR
        hp = OWLHyperparams(OWLHyperparams.defaults(),
                            weight_type='linear',
                            weight_max_val=0.01,
                            weight_max_off=0,
                            weight_min_val=0.005,
                            weight_min_off=nFeatures - 1,
                            learning_rate=0.001)
        primitive = OWLRegression(hyperparams=hp)
        primitive.set_training_data(inputs=inputs, outputs=outputs)
        primitive.fit()
        ps = primitive.get_params()
        self.assertTrue(np.all(ps['coef'] == primitive._coef))
        self.assertTrue(ps['intercept'] == primitive._intercept)
        self.assertTrue(ps['fitted'])
        relative_error = np.linalg.norm(ps['coef'] -
                                        true_coef) / np.linalg.norm(true_coef)
        #print("relative_error = {}".format(relative_error))
        self.assertTrue(relative_error < 0.2)
        self.assertTrue(np.abs(ps['intercept']) < 0.1)

        # Test single / multiple produce
        inputs_produce = container.ndarray(np.random.randn(1, nFeatures))
        outputs_produce = primitive.produce(inputs=inputs_produce).value
        self.assertEqual(outputs_produce.shape, (1, ))

        inputs_produce = container.ndarray(np.random.randn(2, nFeatures))
        outputs_produce = primitive.produce(inputs=inputs_produce).value
        self.assertEqual(outputs_produce.shape, (2, ))
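The customized hyperparameters above describe OSCAR-style weights decaying linearly from weight_max_val at offset 0 to weight_min_val at offset nFeatures - 1. A sketch of how such a weight vector is plausibly built (an assumption about OWLRegression's internals, shown in plain NumPy):

import numpy as np

def linear_owl_weights(n_features, w_max=0.01, w_min=0.005):
    # OWL/OSCAR needs monotone non-increasing weights: w_1 >= ... >= w_p >= 0.
    return np.linspace(w_max, w_min, n_features)

print(linear_owl_weights(5))  # [0.01    0.00875 0.0075  0.00625 0.005  ]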
Example #4
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Input
            G: an n x n matrix or a networkx Graph
        Return
            The largest connected component of g

        """
        G = inputs['0']
        csv = inputs['learningData']


        if len(csv) != 0:
            if len(list(nx.get_node_attributes(G, 'nodeID').values())) == 0:
                # Assign sequential nodeIDs when the graph does not provide them.
                nx.set_node_attributes(G, -1, 'nodeID')
                for i in range(len(G)):
                    G.nodes[i]['nodeID'] = i

            nodeIDs = list(nx.get_node_attributes(G, 'nodeID').values())
            nodeIDs = container.ndarray(np.array([int(i) for i in nodeIDs]))

            return base.CallResult(container.List([G.copy(), nodeIDs, csv]))

        if isinstance(G, np.ndarray):
            if G.ndim == 2:
                if G.shape[0] == G.shape[1]:  # n x n adjacency matrix
                    G = nx.Graph(G)
                else:
                    raise TypeError(
                        "Networkx graphs or n x n numpy arrays only")

        subgraphs = [G.subgraph(i).copy() for i in nx.connected_components(G)]

        # Keep the largest connected component (wrapped in a list to match the
        # indexing below).
        G_connected = [max(subgraphs, key=len)]

        nodeIDs = list(
            nx.get_node_attributes(G_connected[0], 'nodeID').values())
        nodeIDs = container.ndarray(np.array([int(i) for i in nodeIDs]))

        return base.CallResult(
            container.List([G_connected[0].copy(), nodeIDs, csv]))
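The subgraph scan above can be collapsed into a single max(..., key=len) call; a self-contained sketch:

import networkx as nx

G = nx.Graph([(0, 1), (1, 2), (3, 4)])  # two components
largest = max(nx.connected_components(G), key=len)
G_connected = G.subgraph(largest).copy()
print(sorted(G_connected.nodes()))  # [0, 1, 2]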
Example #5
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        _X = inputs.T
        d, numVectors = _X.shape
        Uhat = self._U

        self._grastaOPTIONS.subsampling = self._subsampling
        Lhat = np.zeros(_X.shape)
        for i in range(0, numVectors):
            _x = _X[:, i]
            if (self._grastaOPTIONS.subsampling < 1):
                _xidx = self._random_state.choice(
                    self._dim,
                    int(np.ceil(self._grastaOPTIONS.subsampling * self._dim)),
                    replace=False)
            else:
                _xidx = np.where(~np.isnan(_x))[0]

            U, w, s, STATUS_new, admm_OPTS = self._grasta_stream(
                Uhat, _x, _xidx)
            Lhat[:, i] = U @ w

        return base.CallResult(
            container.ndarray(Lhat.T, generate_metadata=True))
Example #6
    def test_ndarray(self):
        with self.assertLogs(SumPrimitive.metadata.query()['python_path'],
                             level='DEBUG') as cm:
            hyperparams_class = SumPrimitive.metadata.get_hyperparams()

            primitive = SumPrimitive(
                hyperparams=hyperparams_class.defaults(),
                docker_containers=self.get_docker_containers())

            inputs = container.ndarray([[1, 2, 3, 4], [5, 6, 7, 8]],
                                       generate_metadata=True)

            call_metadata = self.call_primitive(primitive,
                                                'produce',
                                                inputs=inputs)

            # Because it is a singleton produce method we can know that there is exactly one value in outputs.
            result = call_metadata.value[0]

            self.assertEqual(result, 36)
            self.assertTrue(call_metadata.has_finished)
            self.assertIsNone(call_metadata.iterations_done)

            self.assertEqual(
                call_metadata.value.metadata.query(
                    (metadata_base.ALL_ELEMENTS, ))['structural_type'], float)

        self.assertEqual(len(cm.records), 2)
        self.assertEqual(cm.records[0].name,
                         SumPrimitive.metadata.query()['python_path'])
        self.assertEqual(cm.records[1].name,
                         SumPrimitive.metadata.query()['python_path'])

        self.assertIsInstance(cm.records[0].data, numpy.ndarray)
        self.assertEqual(cm.records[1].response.status, 200)
Example #7
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Pass to ranks

        **Positional Arguments:**

        inputs:
            - JHUGraph adjacency matrix
        """

        path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                            "ptr.interface.R")
        cmd = """
        source("%s")
        fn <- function(inputs) {
            ptr.interface(inputs)
        }
        """ % path
        #print(cmd)

        result = robjects.r(cmd)(inputs)
        #print(result)

        outputs = container.ndarray(result)

        return base.CallResult(outputs)
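The source(...)/%s pattern above works because rpy2 evaluates the R snippet and returns its last value, here the function fn, as a Python callable. A minimal sketch of the mechanism (requires an R installation):

import rpy2.robjects as robjects

double = robjects.r("""
fn <- function(x) {
    x * 2
}
""")
print(list(double(robjects.FloatVector([1.0, 2.0]))))  # [2.0, 4.0]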
Example #8
class Hyperparams(hyperparams.Hyperparams):
    n_components = hyperparams.Hyperparameter[typing.Optional[int]](
        default=None,
        description='Number of components (< n_classes - 1) for dimensionality reduction.',
        semantic_types=[
            'https://metadata.datadrivendiscovery.org/types/TuningParameter'
        ],
    )
    learning_rate = hyperparams.Uniform(
        lower=0.01,
        upper=2,
        default=0.1,
        description='Learning rate shrinks the contribution of each classifier by ``learning_rate``. There is a trade-off between ``learning_rate`` and ``n_estimators``.',
        semantic_types=[
            'https://metadata.datadrivendiscovery.org/types/TuningParameter',
            'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter',
        ],
    )
    array1 = hyperparams.Hyperparameter[container.ndarray](
        default=container.ndarray(numpy.array([[1, 2], [3, 4]]),
                                  generate_metadata=True),
        semantic_types=[
            'https://metadata.datadrivendiscovery.org/types/TuningParameter'
        ],
    )
    array2 = hyperparams.Hyperparameter[container.DataFrame](
        default=container.DataFrame([[1, 2], [3, 4]],
                                    generate_metadata=True),
        semantic_types=[
            'https://metadata.datadrivendiscovery.org/types/TuningParameter'
        ],
    )
Example #9
    def test_basic(self):
        hyperparams_class = RandomPrimitive.metadata.get_hyperparams()

        primitive = RandomPrimitive(random_seed=42,
                                    hyperparams=hyperparams_class.defaults())

        inputs = container.List(list(range(4)), generate_metadata=True)

        call_metadata = self.call_primitive(primitive,
                                            'produce',
                                            inputs=inputs)

        self.assertTrue(
            numpy.allclose(
                call_metadata.value.values,
                container.ndarray([
                    0.496714153011, -0.138264301171, 0.647688538101,
                    1.52302985641
                ]).reshape((4, 1))))
        self.assertTrue(call_metadata.has_finished)
        self.assertIsNone(call_metadata.iterations_done)

        self.assertEqual(
            call_metadata.value.metadata.query(
                (base.ALL_ELEMENTS, 0))['structural_type'], numpy.float64)
Example #10
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return base.CallResult(None)

        xhat = self._inputs_1
        yhat = self._inputs_2

        seeds = self._reference['match'].astype(int).astype(bool)

        xhat_seed_names = self._reference[self._reference.columns[1]][seeds].values
        yhat_seed_names = self._reference[self._reference.columns[2]][seeds].values

        n_seeds = len(xhat_seed_names)

        # Seed positions are indices, so allocate them as integers.
        x_seeds = np.zeros(n_seeds, dtype=int)
        y_seeds = np.zeros(n_seeds, dtype=int)
        for i in range(n_seeds):
            x_seeds[i] = np.where(xhat[xhat.columns[0]] == xhat_seed_names[i])[0][0]

            y_seeds[i] = np.where(yhat[yhat.columns[0]] == yhat_seed_names[i])[0][0]

        # do this more carefully TODO
        xhat_embedding = xhat.values[:,1:].astype(np.float32)
        yhat_embedding = yhat.values[:,1:].astype(np.float32)

        S_xx = np.exp(-cdist(xhat_embedding, xhat_embedding))
        S_yy = np.exp(-cdist(yhat_embedding, yhat_embedding))

        gmp = GraphMatch(shuffle_input=False)
        match = gmp.fit_predict(S_xx, S_yy, x_seeds, y_seeds)
        self._match = container.ndarray(match)
        self._fitted = True

        return CallResult(None)
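GraphMatch here is graspy's seeded graph matching (the FAQ algorithm). A standalone sketch of the same call on a synthetic pair where the second graph is a relabeled copy of the first:

import numpy as np
from graspy.match import GraphMatch

rng = np.random.default_rng(1)
A = (rng.random((10, 10)) < 0.5).astype(float)
A = np.triu(A, 1)
A = A + A.T
perm = rng.permutation(10)
B = A[perm][:, perm]  # B is A with shuffled node labels

gmp = GraphMatch(shuffle_input=False)
match = gmp.fit_predict(A, B)  # match[i]: B-node matched to A-node i
# FAQ is a heuristic, so report agreement instead of asserting exact recovery.
print((A == B[match][:, match]).mean())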
Example #11
    def _read_fileuri(self, fileuri: str) -> container.ndarray:
        """
        @see https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/video_reader.py#L65
        :param fileuri:
        :return:
        """
        capture = cv2.VideoCapture(fileuri)
        frames = []

        try:
            while capture.isOpened():
                ret, frame = capture.read()
                if not ret:
                    break
                else:
                    assert frame.dtype == np.uint8, frame.dtype

                    if frame.ndim == 2:
                        # Make sure there are always three dimensions.
                        frame = frame.reshape(list(frame.shape) + [1])

                    assert frame.ndim == 3, frame.ndim

                    frames.append(frame)
        finally:
            capture.release()

        return container.ndarray(np.array(frames), generate_metadata=False)
Example #12
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Apply neural network-based feature extraction to image_tensor"""

        self._lazy_init()

        image_tensor = inputs[1]
        image_d3mIndex = inputs[0]

        if not len(image_tensor.shape) == 4:
            raise ValueError('Expect shape to have 4 dimension')

        resized = False
        if self._resize_data:
            # The model expects 224 x 224 input (the original guard checked 244,
            # which looks like a typo), so resample anything of a different size.
            if not (image_tensor.shape[1] == 224
                    and image_tensor.shape[2] == 224):
                resized = True
                y = np.empty((image_tensor.shape[0], 224, 224, 3))
                for index in range(image_tensor.shape[0]):
                    y[index] = imresize(image_tensor[index], (224, 224))
                image_tensor = y

        # preprocess() modifies the data in place, so work on a copy unless we
        # already own a freshly resized array.
        if self._preprocess_data:
            if resized:
                # Okay to modify image_tensor, since it's not the caller's input.
                data = image_tensor
            else:
                data = image_tensor.copy()
            self._preprocess(data)
        else:
            data = image_tensor
        # Run prediction inside the stored TensorFlow graph so this primitive
        # can be called multiple times from a TA3 system.
        with self._graph.as_default():
            output_ndarray = self._model.predict(data)
        output_ndarray = output_ndarray.reshape(output_ndarray.shape[0], -1)
        output_dataFrame = container.DataFrame(
            container.ndarray(output_ndarray))

        # if generate_metadata is true, update the metadata
        if self.hyperparams["generate_metadata"]:
            for each_column in range(output_ndarray.shape[1]):
                metadata_selector = (mbase.ALL_ELEMENTS, each_column)
                metadata_each_column = {
                    'semantic_types':
                    ('https://metadata.datadrivendiscovery.org/types/TabularColumn',
                     'https://metadata.datadrivendiscovery.org/types/Attribute'
                     )
                }
                output_dataFrame.metadata = output_dataFrame.metadata.update(
                    metadata=metadata_each_column, selector=metadata_selector)
        # update the original index to be d3mIndex
        output_dataFrame = output_dataFrame.set_index(image_d3mIndex)
        self._has_finished = True
        self._iterations_done = True
        return CallResult(output_dataFrame, self._has_finished,
                          self._iterations_done)
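imresize came from scipy.misc and was removed in SciPy 1.3; on a modern stack, a small Pillow-based stand-in with the same call shape is one option:

import numpy as np
from PIL import Image

def imresize(arr, size):
    # size is (height, width); PIL's resize expects (width, height).
    img = Image.fromarray(arr.astype(np.uint8))
    return np.array(img.resize((size[1], size[0])))

frame = np.zeros((480, 640, 3), dtype=np.uint8)
print(imresize(frame, (224, 224)).shape)  # (224, 224, 3)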
Example #13
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        #make_keras_pickleable()
        produce_data, learning_df, nodes_df, edges_df = self._parse_inputs(
            inputs, return_all=True)
        if self.fitted:
            result = self._sdne._Y
        else:
            dim = self.hyperparams['dimension']
            alpha = self.hyperparams['alpha']
            beta = self.hyperparams['beta']
            self._sdne = sdne.SDNE(d=dim, alpha=alpha, beta=beta, **args)

            produce_data = networkx.from_scipy_sparse_matrix(produce_data)
            self._sdne.learn_embedding(graph=produce_data)
            self._model = self._sdne._model
            result = self._sdne._Y

        target_types = [
            'https://metadata.datadrivendiscovery.org/types/TrueTarget',
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
        ]
        if self.hyperparams['return_list']:
            result_np = container.ndarray(result, generate_metadata=True)
            return_list = d3m_List([result_np, inputs[1], inputs[2]],
                                   generate_metadata=True)
            return CallResult(return_list, True, 1)
        else:
            learn_df = d3m_DataFrame(learning_df, generate_metadata=True)
            learn_df = get_columns_not_of_type(learn_df, target_types)

            learn_df = learn_df.remove_columns(
                [learn_df.columns.get_loc('nodeID')])
            #learn_df = learn_df.drop('nodeID', axis = 'columns')

            result_df = d3m_DataFrame(result, generate_metadata=True)
            result_df = result_df.loc[result_df.index.isin(
                learning_df['d3mIndex'].values)]

            for column_index in range(result_df.shape[1]):
                col_dict = dict(
                    result_df.metadata.query((ALL_ELEMENTS, column_index)))
                col_dict['structural_type'] = float
                col_dict['name'] = str(learn_df.shape[1] + column_index)
                col_dict['semantic_types'] = (
                    'http://schema.org/Float',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')

                result_df.metadata = result_df.metadata.update(
                    (ALL_ELEMENTS, column_index), col_dict)
            result_df.index = learn_df.index.copy()

            output = utils.append_columns(learn_df, result_df)
            #output.set_index('d3mIndex', inplace=True)
            return CallResult(output, True, 1)
Example #14
    def produce_subspace(self,
                         *,
                         inputs: Inputs,
                         timeout: float = None,
                         iterations: int = None) -> base.CallResult[Outputs]:
        U = self._U.copy()

        return base.CallResult(container.ndarray(U, generate_metadata=True))
Example #15
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        np.random.seed(1234)

        G = inputs[0].copy()

        g = graspyPTR(G)

        n = g.shape[0]

        max_dimension = self.hyperparams['max_dimension']

        if max_dimension > n:
            max_dimension = n

        n_elbows = self.hyperparams['which_elbow']

        """
        What does Omni(DAD) even look like? 

        if self.hyperparams['use_attributes']:
            adj = [g]
            MORE_ATTR = True
            attr_number = 1

            while MORE_ATTR:
                attr = 'attr'
                temp_attr = np.array(list(networkx.get_node_attributes(G, 'attr' + str(attr_number)).values()))
                if len(temp_attr) == 0:
                    MORE_ATTR = False
                else:
                    K = np.sum((temp_attr[:, np.newaxis][:, np.newaxis, :] - temp_attr[:, np.newaxis][np.newaxis, :, :])**2, axis = -1)
                    adj.append(graspyPTR(K))
                    attr_number += 1
            M = len(adj)
            if M > 1:
                g = self._omni(adj)
                lse_object = graspyLSE(n_components = max_dimension, n_elbows=n_elbows)
                X_hats = lse_object.fit_transform(g)

                d = X_hats.shape[1]

                X_hats_reshaped = X_hats.reshape((M, n, d))
                X_hat = np.mean(X_hats_reshaped, axis = 0)

                embedding = X_hat.copy()

                inputs[0] = container.ndarray(embedding)

                return base.CallResult(inputs)
        """

        lse_object = graspyLSE(n_components=max_dimension, n_elbows=n_elbows)
        X_hat = lse_object.fit_transform(g)

        inputs[0] = container.ndarray(X_hat)

        return base.CallResult(inputs)
Example #16
    def produce_sparse(self,
                       *,
                       inputs: Inputs,
                       timeout: float = None,
                       iterations: int = None) -> base.CallResult[Outputs]:

        Lhat = self.produce(inputs=inputs).value
        Shat = inputs - Lhat

        return base.CallResult(container.ndarray(Shat, generate_metadata=True))
Example #17
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:

        # if self._training_data is None or self._y_dim==0:
        inputs_timeseries = inputs[1]
        inputs_d3mIndex = inputs[0]
        if not self._fitted:
            return CallResult(None, True, 0)
        if isinstance(inputs_timeseries, np.ndarray):
            X = np.zeros((inputs_timeseries.shape[0], self._y_dim))
        else:
            X = np.zeros((len(inputs_timeseries), self._y_dim))

        for i, series in enumerate(inputs_timeseries):
            if series.shape[1] > 1 and not self._value_found:
                series_output = pd.DataFrame()
                for j in range(series.shape[1]):
                    series_output = pd.concat(
                        [series_output, series.iloc[:, j]])
            else:
                series_output = series
            if (series_output.shape[0] < self._y_dim):
                # pad with zeros
                X[i, :series_output.
                  shape[0]] = series_output.iloc[:series_output.shape[0],
                                                 self._value_dimension]
            else:
                # Truncate or just fit in
                X[i, :] = series_output.iloc[:self._y_dim,
                                             self._value_dimension]

        # save the result to DataFrame format
        output_ndarray = self._model.transform(X)
        output_dataFrame = container.DataFrame(
            container.ndarray(output_ndarray))

        if self.hyperparams["generate_metadata"]:
            # add metadata if required
            for each_column in range(output_ndarray.shape[1]):
                metadata_selector = (mbase.ALL_ELEMENTS, each_column)
                metadata_each_column = {
                    'semantic_types':
                    ('https://metadata.datadrivendiscovery.org/types/TabularColumn',
                     'https://metadata.datadrivendiscovery.org/types/Attribute'
                     )
                }
                output_dataFrame.metadata = output_dataFrame.metadata.update(
                    metadata=metadata_each_column, selector=metadata_selector)

        # update the original index to be d3mIndex
        output_dataFrame = output_dataFrame.set_index(inputs_d3mIndex)
        return CallResult(output_dataFrame, True, 1)
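Stripped of the DataFrame handling, the pad-or-truncate step above reduces to:

import numpy as np

def fit_length(values, target_len):
    out = np.zeros(target_len)
    n = min(len(values), target_len)
    out[:n] = values[:n]  # shorter series are zero-padded, longer ones truncated
    return out

print(fit_length([1.0, 2.0], 4))       # [1. 2. 0. 0.]
print(fit_length([1.0, 2.0, 3.0], 2))  # [1. 2.]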
Example #18
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Perform Out of Sample Adjacency Spectral Embedding on a graph.
        """
        np.random.seed(1234)

        g = inputs[0].copy()
        if type(g) == networkx.classes.graph.Graph:
            g = networkx.to_numpy_array(g)

        n = g.shape[0]
        D = np.linalg.pinv(np.diag(g.sum(axis=1))**(1/2))
        L = D @ g @ D

        # Hyperparams are immutable in d3m, so clamp a local copy instead of
        # assigning back into self.hyperparams.
        d_max = self.hyperparams['max_dimension']
        if d_max >= n:
            d_max = n - 1

        in_sample_n = self.hyperparams['n_in_sample']

        if in_sample_n > n:
            in_sample_n = n
            # TODO ASE HERE
        # Sample distinct in-sample vertices (replace=False keeps them unique).
        in_sample_idx = np.random.choice(n, in_sample_n, replace=False)
        out_sample_idx = np.setdiff1d(list(range(n)), in_sample_idx)

        in_sample_A = L[np.ix_(in_sample_idx, in_sample_idx)]
        out_sample_A = L[np.ix_(out_sample_idx, in_sample_idx)]

        # hp_ase = ase_hyperparameters({'max_dimension': dim, 'use_attributes': False, 'which_elbow': self.hyperparams['which_elbow']})
        # ASE = ase(hyperparams = hp_ase)
        # embedding = ASE.produce(inputs = [g]).value[0]

        tsvd = TruncatedSVD(n_components=d_max)
        tsvd.fit(in_sample_A)

        eig_vectors = tsvd.components_.T
        eig_values = tsvd.singular_values_
        elbow = self._profile_likelihood_maximization(eig_values, self.hyperparams['which_elbow'])[-1]
        
        eig_vectors = eig_vectors[:, :elbow + 1].copy()
        eig_values = eig_values[:elbow + 1].copy()
        d = len(eig_values)

        in_sample_embedding = eig_vectors.dot(np.diag(eig_values**0.5))

        out_sample_embedding = out_sample_A @ eig_vectors @ np.diag(1/np.sqrt(eig_values))
        embedding = np.zeros((n,d))
        embedding[in_sample_idx] = in_sample_embedding
        embedding[out_sample_idx] = out_sample_embedding

        inputs[0] = container.ndarray(embedding)

        return base.CallResult(inputs)
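The out-of-sample rows are mapped through out_sample_A @ eig_vectors @ np.diag(1/np.sqrt(eig_values)). A quick numerical check that this formula reproduces the in-sample embedding when the matrix is exactly low rank:

import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((6, 2))
L = X @ X.T  # rank-2 PSD matrix

vals, vecs = np.linalg.eigh(L)
vals, vecs = vals[-2:], vecs[:, -2:]       # top-2 eigenpairs
in_sample = vecs @ np.diag(np.sqrt(vals))  # in-sample embedding
out = L @ vecs @ np.diag(1 / np.sqrt(vals))
assert np.allclose(out, in_sample)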
Example #19
def generate_linear_data(nSamples, nFeatures):
    """
    y = X * coef + noise
    noise = 0, for simplicity of unittest
    """
    # design matrix
    X = np.random.randn(nSamples, nFeatures)
    X = X - np.mean(X, 0)  # centered
    X = X / np.linalg.norm(X, ord=2, axis=0)  # normalized

    # noise with variance 0.01
    #noise = np.random.randn(nSamples) * 0.1
    noise = np.zeros(nSamples)

    # coefficients
    coef = np.random.randn(nFeatures)

    y = X.dot(coef) + noise
    return coef, container.ndarray(
        X, generate_metadata=True), container.ndarray(y,
                                                      generate_metadata=True)
Example #20
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        graph = inputs['0']
        csv = inputs['1']

        linktypes = np.array(csv['linkType'], dtype='int32')
        uniq_linktypes, n_i = np.unique(linktypes, return_counts=True)
        n_linktypes = len(uniq_linktypes)

        sources = np.array(csv['source_nodeID'], dtype='int32')
        targets = np.array(csv['target_nodeID'], dtype='int32')
        nodes = set(np.concatenate((sources, targets)))
        n_nodes = len(nodes)

        info = np.array(csv['linkExists'], dtype='int32')
        n_info = len(info)

        edge_counts = np.zeros(n_linktypes)
        for i in range(n_info):
            temp_link_type = linktypes[i]
            edge_counts[temp_link_type] += info[i]

        p_hats = edge_counts / n_i

        graphs = [
            p_hats[i] * np.ones(shape=(n_nodes, n_nodes))
            for i in range(n_linktypes)
        ]  # initialize each link type's graph at its marginal edge probability

        for i in range(n_info):
            temp_link_type = int(linktypes[i])
            graphs[temp_link_type][sources[i], targets[i]] = info[i]
            graphs[temp_link_type][targets[i], sources[i]] = info[i]

        big_graph = np.zeros(shape=(n_nodes * int(n_linktypes),
                                    n_nodes * int(n_linktypes)))

        for i in range(n_linktypes):
            big_graph[i * n_nodes:(i + 1) * n_nodes,
                      i * n_nodes:(i + 1) * n_nodes] = graphs[i]

        for i in range(n_linktypes):
            for j in range(i + 1, n_linktypes):
                big_graph[i * n_nodes:(i + 1) * n_nodes, j * n_nodes:(j + 1) *
                          n_nodes] = (graphs[i] + graphs[j]) / 2
                big_graph[j * n_nodes:(j + 1) * n_nodes, i * n_nodes:(i + 1) *
                          n_nodes] = (graphs[i] + graphs[j]) / 2

        return base.CallResult(container.List([container.ndarray(big_graph)]))
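The nested loops assemble an omnibus-style block matrix: each link type's graph on the diagonal, pairwise averages off the diagonal. For two link types the same structure is a single np.block call:

import numpy as np

g0 = np.eye(3)
g1 = np.ones((3, 3))
avg = (g0 + g1) / 2
big_graph = np.block([[g0, avg],
                      [avg, g1]])
print(big_graph.shape)  # (6, 6)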
Example #21
    def _load_image_group(self, uris: List[str], bands: List[str],
                          base_uri: str,
                          max_dimension: int) -> container.ndarray:

        zipped = zip(bands, uris)
        images = list(
            map(lambda image: self._load_image(image[0], image[1], base_uri),
                zipped))

        # reshape images (upsample) to have it all fit within an array
        if self.hyperparams["compress_data"]:
            # Store a header consisting of the dtype character and the data shape as unsigned integers.
            # Given c struct alignment, will occupy 16 bytes (1 + 4 + 4 + 4 + 3 padding)
            output_bytes = bytearray(
                struct.pack(
                    "cIII",
                    bytes(images[0][1].dtype.char.encode()),
                    len(images),
                    max_dimension,
                    max_dimension,
                ))
            for band, image in images:
                output_bytes.extend(
                    self._bilinear_resample(image, max_dimension).tobytes())
            output_compressed_bytes = lz4.frame.compress(bytes(output_bytes))
            output = np.frombuffer(
                output_compressed_bytes,
                dtype="uint8",
                count=len(output_compressed_bytes),
            )
        else:
            output = np.ndarray((
                len(DataFrameSatelliteImageLoaderPrimitive._BAND_ORDER),
                max_dimension,
                max_dimension,
            ))
            for band, image in images:
                band_idx = DataFrameSatelliteImageLoaderPrimitive._BAND_ORDER[
                    self._normalized_band_id(band)]
                output[band_idx] = self._bilinear_resample(
                    image, max_dimension)

        output = container.ndarray(
            output,
            {
                "schema": metadata_base.CONTAINER_SCHEMA_VERSION,
                "structural_type": container.ndarray,
            },
            generate_metadata=True,
        )

        return output
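The compressed branch stores a 16-byte cIII header followed by the raw resampled bands. A decode-side sketch mirroring that layout (an assumption; only the header format is taken from the code above):

import struct

import lz4.frame
import numpy as np

def decode_image_group(buffer: bytes) -> np.ndarray:
    raw = lz4.frame.decompress(buffer)
    # "cIII" occupies 16 bytes under C struct alignment (1 + 3 padding + 3 * 4).
    dtype_char, n_bands, height, width = struct.unpack("cIII", raw[:16])
    dtype = np.dtype(dtype_char.decode())
    return np.frombuffer(raw[16:], dtype=dtype).reshape(n_bands, height, width)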
Example #22
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Perform Laplacian Spectral Embedding on a graph
        TODO: YP description

        **Positional Arguments:**

        g:
            - Graph in JHUGraph format

        **Optional Arguments:**

        dim:
            - The number of dimensions in which to embed the data
        """

        max_dimension = self.hyperparams['max_dimension']

        path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                            "lse.interface.R")
        cmd = """
        source("%s")
        fn <- function(inputs, max_dimension) {
            lse.interface(inputs, max_dimension)
        }
        """ % path
        #print(cmd)

        result = robjects.r(cmd)(inputs, max_dimension)

        vectors = container.ndarray(result[0])
        eig_values = container.ndarray(result[1])

        return base.CallResult([vectors, eig_values])
Example #23
    def test_columns_sum(self):
        dataframe = container.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, generate_metadata=True)

        dataframe_sum = utils.columns_sum(dataframe)

        self.assertEqual(dataframe_sum.values.tolist(), [[6, 15]])
        self.assertEqual(dataframe_sum.metadata.query((metadata_base.ALL_ELEMENTS, 0))['name'], 'a')
        self.assertEqual(dataframe_sum.metadata.query((metadata_base.ALL_ELEMENTS, 1))['name'], 'b')

        array = container.ndarray(dataframe, generate_metadata=True)

        array_sum = utils.columns_sum(array)

        self.assertEqual(array_sum.tolist(), [[6, 15]])
        self.assertEqual(array_sum.metadata.query((metadata_base.ALL_ELEMENTS, 0))['name'], 'a')
        self.assertEqual(array_sum.metadata.query((metadata_base.ALL_ELEMENTS, 1))['name'], 'b')
Example #24
    def _read_fileuri(self, metadata: frozendict.FrozenOrderedDict,
                      fileuri: str) -> container.ndarray:
        image_array = container.ndarray(
            numpy.array([[fileuri.split('/')[-1]]], dtype=object),
            {
                'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
                'structural_type': container.ndarray,
            },
            generate_metadata=False)

        image_array.metadata = image_array.metadata.update((), {
            'image_reader_metadata': {
                'foobar': 42,
            },
        })

        return image_array
Example #25
    def test_ndarray(self):
        array = container.ndarray(numpy.array([1, 2, 3], dtype=numpy.int64), generate_metadata=True)

        self.assertEqual(utils.to_json_structure(array.metadata.to_internal_simple_structure()), [{
            'selector': [],
            'metadata': {
                'schema': base.CONTAINER_SCHEMA_VERSION,
                'structural_type': 'd3m.container.numpy.ndarray',
                'dimension': {
                    'length': 3,
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'numpy.int64',
            },
        }])
Example #26
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Non-parametric clustering

        **Positional Arguments:**

        xhat1:
            - A numpy.ndarray type "matrix"
        xhat2:
            - A numpy.ndarray type "matrix"

        **Optional Arguments:**

        sigma:
            - a sigma for the Gaussian kernel
        """

        #xhat1 = inputs[0,:,:]
        #xhat2 = inputs[1,:,:]

        xhat1 = inputs[0]
        xhat2 = inputs[1]

        sigma = self.hyperparams['sigma']

        path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                            "nonpar.interface.R")

        cmd = """
        source("%s")
        fn <- function(xhat1, xhat2, sigma) {
            nonpar.interface(xhat1, xhat2, sigma)
        }
        """ % path

        result = np.array(robjects.r(cmd)(xhat1, xhat2, sigma))

        outputs = container.ndarray(result)

        return base.CallResult(outputs)
Example #27
    def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]:
        if self._fitted:
            return base.CallResult(None)

        embeddings = self._training_inputs[1][0]
        csv = self._training_inputs[0]
        n_nodes, n_links = self._training_inputs[3]

        n_info = csv.shape[0]
        ranks = [[[], []] for i in range(n_links + 1)]

        # If linkType is missing or non-numeric, treat everything as one link type.
        try:
            int(np.array(csv['linkType'])[0])
        except (KeyError, ValueError):
            csv['linkType'] = np.zeros(n_info)

        # print(csv, file=sys.stderr)
        csv_headers = csv.columns
        for header in csv_headers:
            if header[:6] == "source":
                SOURCE = header
            elif header[:6] == "target":
                TARGET = header

        for i in range(n_info):
            temp_link = int(np.array(csv['linkType'])[i])
            temp_exists = int(np.array(csv['linkExists'])[i])
            temp_source = int(np.array(csv[SOURCE])[i])
            temp_target = int(np.array(csv[TARGET])[i])
            temp_dot = embeddings[temp_link*n_nodes + temp_source - 1] @ embeddings[temp_link*n_nodes + temp_target - 1]
            ranks[temp_link][temp_exists].append(temp_dot)
            ranks[-1][temp_exists].append(temp_dot)

        for i in range(len(ranks)):
            ranks[i][0] = np.sort(ranks[i][0])
            ranks[i][1] = np.sort(ranks[i][1])

        self._embeddings = container.ndarray(embeddings)
        self._inner_products = container.List(ranks)

        self._fitted = True

        return base.CallResult(None)
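With each ranks[...] list sorted, a natural produce-time score for a new pair is the empirical percentile of its inner product, computable with np.searchsorted (a sketch of the idea, not the primitive's actual produce method):

import numpy as np

edges = np.sort(np.array([0.3, 0.5, 0.9]))  # inner products of known links
score = 0.45
# Fraction of known links whose inner product falls below the new score.
print(np.searchsorted(edges, score) / len(edges))  # 0.333...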
Example #28
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        """
        Compute the predictions given inputs with shape n by m,
        yielding an array of size n.

        Inputs must match the dimensionality of the training data.
        """
        # First do assorted error checking and initialization
        if not self._fitted:
            raise ValueError("Calling produce before fitting.")

        if inputs.shape[1] != self._coef.shape[0]:
            raise ValueError('Input dimension is wrong.')

        outputs: container.ndarray = container.ndarray(
            inputs.dot(self._coef) + self._intercept)
        outputs.metadata = inputs.metadata.clear(for_value=outputs, source=self)
        return base.CallResult(outputs)
Example #29
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return base.CallResult(None)

        xhat = self._inputs_1
        yhat = self._inputs_2

        temp_train = self._reference.merge(xhat, how='left', on='e_nodeID')
        temp_train = temp_train.merge(yhat, how='left', on='g_nodeID')

        temp_train = temp_train[temp_train['match'].astype(int).astype(bool)]

        xhat_train = temp_train.values[:, 4:-300].astype(np.float32)
        yhat_train = temp_train.values[:, -300:].astype(np.float32)

        w, _ = orthogonal_procrustes(yhat_train, xhat_train)
        self._w = container.ndarray(w)
        self._fitted = True

        return CallResult(None)
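orthogonal_procrustes(yhat_train, xhat_train) returns the orthogonal w minimizing ||yhat_train @ w - xhat_train||_F, so the second embedding can later be rotated into the first's coordinate system. A noise-free sketch:

import numpy as np
from scipy.linalg import orthogonal_procrustes

rng = np.random.default_rng(0)
X = rng.standard_normal((20, 5))
Q, _ = np.linalg.qr(rng.standard_normal((5, 5)))  # a random orthogonal matrix
Y = X @ Q

w, _ = orthogonal_procrustes(Y, X)  # rotation taking Y onto X
assert np.allclose(Y @ w, X)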
Example #30
    def setup(self):
        self.large_dataframe = container.DataFrame(
            pandas.DataFrame(
                {str(i): [str(j) for j in range(10000)] for i in range(50)},
                columns=[str(i) for i in range(50)]),
            generate_metadata=True)
        self.large_list = container.List(
            [container.List([str(j) for i in range(50)]) for j in range(10000)],
            generate_metadata=True)
        self.large_ndarray = container.ndarray(
            numpy.array(
                [[[str(k) for k in range(5)] for i in range(10)]
                 for j in range(10000)],
                dtype=object),
            generate_metadata=True)
        self.large_dict_list = container.List(
            {str(i): {str(j): j for j in range(10000)} for i in range(50)},
            generate_metadata=True)