Code example #1
    def test_list_with_objects(self):
        l = container.List([container.List([str(j) for i in range(5)]) for j in range(10)], generate_metadata=True)

        self.assertEqual(utils.to_json_structure(l.metadata.to_internal_simple_structure()), [
            {
                'selector': [],
                'metadata': {
                    'schema': base.CONTAINER_SCHEMA_VERSION,
                    'structural_type': 'd3m.container.list.List',
                    'dimension': {
                        'length': 10,
                    },
                },
            },
            {
                'selector': ['__ALL_ELEMENTS__'],
                'metadata': {
                    'structural_type': 'd3m.container.list.List',
                    'dimension': {
                        'length': 5,
                    },
                }
            },
            {
                'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
                'metadata': {
                    'structural_type': 'str',
                },
            },
        ])
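
As a reference for what example #1 asserts, here is a minimal sketch (assuming only the d3m core package, with `container` and the metadata `base` module imported as in the test) that queries the same generated metadata directly instead of serializing it:

from d3m import container
from d3m.metadata import base as metadata_base

# Ten outer elements, each an inner List of five identical strings.
l = container.List(
    [container.List([str(j) for i in range(5)]) for j in range(10)],
    generate_metadata=True,
)

# The empty selector addresses the outer List itself.
assert l.metadata.query(())['dimension']['length'] == 10
# ALL_ELEMENTS addresses every inner List at once.
assert l.metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] == 5
# Two levels down, the structural type of the leaf elements.
assert l.metadata.query((metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS))['structural_type'] is str
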
Code example #2
    def test_pickle(self):
        # This test is not really useful anymore because the primitive no longer keeps random
        # state; outputs depend only on inputs, and not on previous calls to the "produce" method.

        hyperparams_class = RandomPrimitive.metadata.get_hyperparams()

        primitive = RandomPrimitive(random_seed=42, hyperparams=hyperparams_class.defaults())

        inputs = container.List(list(range(4)), generate_metadata=True)

        call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)

        self.assertTrue(numpy.allclose(call_metadata.value.values, container.ndarray([0.496714153011, -0.138264301171, 0.647688538101, 1.52302985641]).reshape(4, 1)))

        pickled_primitive = pickle.dumps(primitive)

        inputs = container.List(list(range(4, 8)), generate_metadata=True)

        call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)

        self.assertTrue(numpy.allclose(call_metadata.value.values, container.ndarray([-0.23415337, -0.23413696, 1.57921282, 0.76743473]).reshape(4, 1)))

        unpickled_primitive = pickle.loads(pickled_primitive)

        call_metadata = self.call_primitive(unpickled_primitive, 'produce', inputs=inputs)

        self.assertTrue(numpy.allclose(call_metadata.value.values, container.ndarray([-0.23415337, -0.23413696, 1.57921282, 0.76743473]).reshape(4, 1)))
Code example #3
    def test_hyperparameter(self):
        hyperparams_class = MonomialPrimitive.metadata.get_hyperparams()

        primitive = MonomialPrimitive(hyperparams=hyperparams_class(bias=1))

        inputs = container.List([1, 2, 3, 4, 5, 6], generate_metadata=True)

        outputs = container.List([2, 4, 6, 8, 10, 12], generate_metadata=True)

        self.call_primitive(primitive,
                            'set_training_data',
                            inputs=inputs,
                            outputs=outputs)
        call_metadata = self.call_primitive(primitive, 'fit')

        self.assertEqual(call_metadata.has_finished, True)
        self.assertEqual(call_metadata.iterations_done, None)

        inputs = container.List([10, 20, 30], generate_metadata=True)

        call_metadata = self.call_primitive(primitive,
                                            'produce',
                                            inputs=inputs)

        self.assertSequenceEqual(call_metadata.value, [21, 41, 61])
        self.assertEqual(call_metadata.has_finished, True)
        self.assertEqual(call_metadata.iterations_done, None)

        self.assertEqual(
            call_metadata.value.metadata.query(())['dimension']['length'], 3)
        self.assertEqual(
            call_metadata.value.metadata.query(
                (base.ALL_ELEMENTS, ))['structural_type'], float)
Code example #4
    def test_lists(self):
        hyperparams_class = SumPrimitive.metadata.get_hyperparams()

        primitive = SumPrimitive(
            hyperparams=hyperparams_class.defaults(),
            docker_containers=self.get_docker_containers())

        inputs = container.List(
            [container.List([1, 2, 3, 4]),
             container.List([5, 6, 7, 8])],
            generate_metadata=True)

        call_metadata = self.call_primitive(primitive,
                                            'produce',
                                            inputs=inputs)

        # Because it is a singleton produce method we can know that there is exactly one value in outputs.
        result = call_metadata.value[0]

        self.assertEqual(result, 36)
        self.assertEqual(call_metadata.has_finished, True)
        self.assertEqual(call_metadata.iterations_done, None)

        self.assertEqual(
            call_metadata.value.metadata.query(
                (metadata_base.ALL_ELEMENTS, ))['structural_type'], float)
Code example #5
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Input
            G: an n x n matrix or a networkx Graph
        Return
            The largest connected component of G

        """
        G = inputs['0']
        csv = inputs['learningData']

        #if len(list(nx.get_node_attributes(G, 'nodeID').values())) == 0:
        #    nx.set_node_attributes(G,'nodeID',-1)
        #    for i in range(len(G)):
        #        G.node[i]['nodeID'] = i

        if len(csv) != 0:
            if len(list(nx.get_node_attributes(G, 'nodeID').values())) == 0:
                nx.set_node_attributes(G, 'nodeID', -1)
                for i in range(len(G)):
                    G.node[i]['nodeID'] = i

            nodeIDs = list(nx.get_node_attributes(G, 'nodeID').values())
            nodeIDs = container.ndarray(np.array([int(i) for i in nodeIDs]))

            return base.CallResult(container.List([G.copy(), nodeIDs, csv]))

        if type(G) == np.ndarray:
            if G.ndim == 2:
                if G.shape[0] == G.shape[1]:  # n x n matrix
                    G = Graph(G)
                else:
                    raise TypeError(
                        "Networkx graphs or n x n numpy arrays only")

        subgraphs = [G.subgraph(i).copy() for i in nx.connected_components(G)]

        G_connected = [[0]]
        for i in subgraphs:
            if len(i) > len(G_connected[0]):
                G_connected = [i]

        nodeIDs = list(
            nx.get_node_attributes(G_connected[0], 'nodeID').values())
        nodeIDs = container.ndarray(np.array([int(i) for i in nodeIDs]))

        return base.CallResult(
            container.List([G_connected[0].copy(), nodeIDs, csv]))
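
As an aside, the manual loop over subgraphs above can be condensed with networkx itself; a minimal sketch, assuming networkx 2.x where `connected_components` yields node sets:

import networkx as nx

def largest_connected_subgraph(G: nx.Graph) -> nx.Graph:
    # connected_components yields sets of nodes; take the biggest one by node count.
    largest_nodes = max(nx.connected_components(G), key=len)
    return G.subgraph(largest_nodes).copy()
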
Code example #6
File: test_random.py  Project: tods-doc/tamu_d3m
    def test_basic(self):
        hyperparams_class = RandomPrimitive.metadata.get_hyperparams()

        primitive = RandomPrimitive(random_seed=42,
                                    hyperparams=hyperparams_class.defaults())

        inputs = container.List(list(range(4)), generate_metadata=True)

        call_metadata = self.call_primitive(primitive,
                                            'produce',
                                            inputs=inputs)

        self.assertTrue(
            numpy.allclose(
                call_metadata.value.values,
                container.ndarray([
                    0.496714153011, -0.138264301171, 0.647688538101,
                    1.52302985641
                ]).reshape((4, 1))))
        self.assertEqual(call_metadata.has_finished, True)
        self.assertEqual(call_metadata.iterations_done, None)

        self.assertEqual(
            call_metadata.value.metadata.query(
                (base.ALL_ELEMENTS, 0))['structural_type'], numpy.float64)
Code example #7
    def test_basic(self):
        hyperparam_primitive1 = NullTransformerPrimitive(
            hyperparams=NullTransformerPrimitive.metadata.get_hyperparams(
            ).defaults())
        hyperparam_primitive2 = NullTransformerPrimitive(
            hyperparams=NullTransformerPrimitive.metadata.get_hyperparams(
            ).defaults())

        primitive = PrimitiveSumPrimitive(
            hyperparams={
                'primitive_1': hyperparam_primitive1,
                'primitive_2': hyperparam_primitive2
            })
        inputs = container.List([10, 20, 30], generate_metadata=True)
        call_metadata = self.call_primitive(primitive,
                                            'produce',
                                            inputs=inputs)

        self.assertSequenceEqual(call_metadata.value, [20, 40, 60])
        self.assertEqual(call_metadata.has_finished, True)
        self.assertEqual(call_metadata.iterations_done, None)

        self.assertEqual(
            call_metadata.value.metadata.query(())['dimension']['length'], 3)
        self.assertEqual(
            call_metadata.value.metadata.query(
                (base.ALL_ELEMENTS, ))['structural_type'], int)
Code example #8
File: labler.py  Project: usc-isi-i2/dsbox-cleaning
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        self._fitted = True
        categorical_attributes = common_utils.list_columns_with_semantic_types(
            metadata=self._training_data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/OrdinalData",
                "https://metadata.datadrivendiscovery.org/types/CategoricalData"
            ])

        all_attributes = common_utils.list_columns_with_semantic_types(
            metadata=self._training_data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/Attribute"
            ])

        self._s_cols = container.List(
            set(all_attributes).intersection(categorical_attributes))
        _logger.debug("%d of categorical attributes found." %
                      (len(self._s_cols)))

        if len(self._s_cols) > 0:
            # temp_model = defaultdict(LabelEncoder)
            # self._training_data.iloc[:, self._s_cols].apply(lambda x: temp_model[x.name].fit(x))
            # self._model = dict(temp_model)
            self._model = {}
            for col_index in self._s_cols:
                self._model[
                    col_index] = self._training_data.iloc[:, col_index].dropna(
                    ).unique()

        return CallResult(None, has_finished=True)
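
The per-column model built in the loop above is just the distinct non-null values of each selected column; a standalone pandas sketch of the same idea (the column names are made up for illustration):

import pandas as pd

df = pd.DataFrame({
    'color': ['red', 'blue', None, 'red'],
    'size': ['S', 'M', 'L', 'M'],
})

# One entry per column: the unique non-null values observed during fit.
model = {column: df[column].dropna().unique() for column in df.columns}
print(model['color'])  # ['red' 'blue']
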
Code example #9
    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        categorical_attributes = common_utils.list_columns_with_semantic_types(
            metadata=self._training_data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/OrdinalData",
                "https://metadata.datadrivendiscovery.org/types/CategoricalData"
            ])

        all_attributes = common_utils.list_columns_with_semantic_types(
            metadata=self._training_data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/Attribute"
            ])

        self._s_cols = container.List(
            set(all_attributes).intersection(categorical_attributes))
        print("[INFO] %d of categorical attributes found." %
              (len(self._s_cols)))

        if len(self._s_cols) > 0:
            temp_model = defaultdict(LabelEncoder)
            self._training_data.iloc[:, self._s_cols].apply(
                lambda x: temp_model[x.name].fit(x))
            self._model = dict(temp_model)
            self._fitted = True
        else:
            self._fitted = False
Code example #10
File: abs_sum.py  Project: tods-doc/axolotl
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        result = np.abs(self._convert_value(inputs)).sum()
        outputs = container.List((result, ), generate_metadata=True)
        return base.CallResult(outputs)
Code example #11
File: primitives.py  Project: tods-doc/tamu_d3m
    def _produce(self, inputs: DatasetSplitInputs, is_train: bool) -> base.CallResult[DatasetSplitOutputs]:
        """
        This function splits the fitted Dataset.

        Parameters
        ----------
        inputs:
            A list of 0-based indices which specify which splits to be used as test split in output.
        is_train:
            Whether we are producing train or test data.

        Returns
        -------
        Returns a list of Datasets.
        """

        if not self._fitted or self._splits is None or self._dataset is None or self._main_resource_id is None or self._graph is None:
            raise exceptions.PrimitiveNotFittedError("Primitive not fitted.")

        output_datasets = container.List(generate_metadata=True)

        for index in inputs:
            train_indices, test_indices = self._splits[index]

            if is_train:
                output_dataset = base_utils.sample_rows(
                    self._dataset,
                    self._main_resource_id,
                    set(train_indices),
                    self._graph,
                    delete_recursive=self.hyperparams.get('delete_recursive', False),
                )
            else:
                output_dataset = base_utils.sample_rows(
                    self._dataset,
                    self._main_resource_id,
                    set(test_indices),
                    self._graph,
                    delete_recursive=self.hyperparams.get('delete_recursive', False),
                )

            output_datasets.append(output_dataset)

        output_datasets.metadata = metadata_base.DataMetadata({
            'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
            'structural_type': container.List,
            'dimension': {
                'length': len(output_datasets),
            },
        })

        # We update metadata based on metadata of each dataset.
        # TODO: In the future this might be done automatically by generate_metadata.
        #       See: https://gitlab.com/datadrivendiscovery/d3m/issues/119
        for index, dataset in enumerate(output_datasets):
            output_datasets.metadata = dataset.metadata.copy_to(output_datasets.metadata, (), (index,))

        return base.CallResult(output_datasets)
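
The example above builds the top-level metadata by hand and then copies each dataset's metadata into the list; the TODO notes that this may eventually be automatic. A hedged sketch of that alternative, assuming `DataMetadata.generate` (used in other examples on this page) walks the container and fills in dimensions and structural types:

from d3m import container
from d3m.metadata import base as metadata_base

output_datasets = container.List()
# ... append each output_dataset exactly as in the loop above ...

# Assumption: generate() recomputes the container-level metadata from the value
# itself, replacing the hand-written DataMetadata block shown above.
output_datasets.metadata = metadata_base.DataMetadata().generate(output_datasets)
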
Code example #12
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        with self._connection.cursor() as cursor:
            cursor.execute("SELECT 42;")
            return base.CallResult(
                container.List([cursor.fetchone()[0]], generate_metadata=True))
Code example #13
File: unfold.py  Project: byu-dml/dsbox-primitives
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        primary_key_cols = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/PrimaryKey"]
        )

        unfold_cols = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=self.hyperparams["unfold_semantic_types"]
        )

        if not primary_key_cols:
            warnings.warn("Did not find primary key column for grouping. Will not unfold")
            return CallResult(inputs)

        if not unfold_cols:
            warnings.warn("Did not find any column to unfold. Will not unfold")
            return CallResult(inputs)

        primary_key_col_names = [inputs.columns[pos] for pos in primary_key_cols]
        unfold_col_names = [inputs.columns[pos] for pos in unfold_cols]

        if self.hyperparams["use_pipeline_id_semantic_type"]:
            pipeline_id_cols = common_utils.list_columns_with_semantic_types(
                metadata=inputs.metadata,
                semantic_types=["https://metadata.datadrivendiscovery.org/types/PipelineId"]
            )

            if len(pipeline_id_cols) >= 2:
                warnings.warn("Multiple pipeline id columns found. Will use first.")

            if pipeline_id_cols:
                inputs = inputs.sort_values(primary_key_col_names + [inputs.columns[pos] for pos in pipeline_id_cols])
                self._sorted_pipe_ids = sorted(inputs.iloc[:, pipeline_id_cols[0]].unique())
            else:
                warnings.warn(
                    "No pipeline id column found by 'https://metadata.datadrivendiscovery.org/types/PipelineId'")

        new_df = self._get_new_df(inputs=inputs, use_cols=primary_key_cols + unfold_cols)

        groupby_df = inputs.groupby(primary_key_col_names)[unfold_col_names].aggregate(
            lambda x: container.List(x)).reset_index(drop=False)

        ret_df = container.DataFrame(groupby_df)
        ret_df.metadata = new_df.metadata
        ret_df = self._update_metadata_dimension(df=ret_df)

        split_col_names = [inputs.columns[pos] for pos in unfold_cols]

        ret_df = self._split_aggregated(df=ret_df, split_col_names=split_col_names)
        ret_df = common_utils.remove_columns(
            inputs=ret_df,
            column_indices=[ret_df.columns.get_loc(name) for name in split_col_names]
        )

        return CallResult(ret_df)
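
The heart of the unfold step is the groupby-and-collect call in the middle of the method; a standalone sketch with plain pandas (column names here are hypothetical, chosen only to mirror a primary key plus one unfoldable column):

import pandas as pd

df = pd.DataFrame({
    'd3mIndex': [0, 0, 1, 1],            # hypothetical primary key column
    'prediction': ['a', 'b', 'c', 'd'],  # hypothetical column to unfold
})

# Collect every value that shares a primary key into a single list per group.
grouped = df.groupby(['d3mIndex'])[['prediction']].aggregate(list).reset_index(drop=False)
print(grouped)
#    d3mIndex prediction
# 0         0     [a, b]
# 1         1     [c, d]
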
Code example #14
    def setup(self):
        self.large_dataframe = container.DataFrame(
            pandas.DataFrame(
                {str(i): [str(j) for j in range(10000)] for i in range(50)},
                columns=[str(i) for i in range(50)],
            ),
            generate_metadata=True,
        )
        self.large_list = container.List(
            [container.List([str(j) for i in range(50)]) for j in range(10000)],
            generate_metadata=True,
        )
        self.large_ndarray = container.ndarray(
            numpy.array(
                [[[str(k) for k in range(5)] for i in range(10)] for j in range(10000)],
                dtype=object,
            ),
            generate_metadata=True,
        )
        self.large_dict_list = container.List(
            {str(i): {str(j): j for j in range(10000)} for i in range(50)},
            generate_metadata=True,
        )
Code example #15
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        graph = inputs['0']
        csv = inputs['1']

        linktypes = np.array(csv['linkType'], dtype='int32')
        uniq_linktypes, n_i = np.unique(linktypes, return_counts=True)
        n_linktypes = len(uniq_linktypes)

        sources = np.array(csv['source_nodeID'], dtype='int32')
        targets = np.array(csv['target_nodeID'], dtype='int32')
        nodes = set(np.concatenate((sources, targets)))
        n_nodes = len(nodes)

        info = np.array(csv['linkExists'], dtype='int32')
        n_info = len(info)

        edge_counts = np.zeros(n_linktypes)
        for i in range(n_info):
            temp_link_type = linktypes[i]
            edge_counts[temp_link_type] += info[i]

        p_hats = edge_counts / n_i

        graphs = [
            p_hats[i] * np.ones(shape=(n_nodes, n_nodes))
            for i in range(n_linktypes)
        ]  # set up a bunch of empty graphs

        for i in range(n_info):
            temp_link_type = int(linktypes[i])
            graphs[temp_link_type][sources[i], targets[i]] = info[i]
            graphs[temp_link_type][targets[i], sources[i]] = info[i]

        big_graph = np.zeros(shape=(n_nodes * int(n_linktypes),
                                    n_nodes * int(n_linktypes)))

        for i in range(n_linktypes):
            big_graph[i * n_nodes:(i + 1) * n_nodes,
                      i * n_nodes:(i + 1) * n_nodes] = graphs[i]

        for i in range(n_linktypes):
            for j in range(i + 1, n_linktypes):
                big_graph[i * n_nodes:(i + 1) * n_nodes, j * n_nodes:(j + 1) *
                          n_nodes] = (graphs[i] + graphs[j]) / 2
                big_graph[j * n_nodes:(j + 1) * n_nodes, i * n_nodes:(i + 1) *
                          n_nodes] = (graphs[i] + graphs[j]) / 2

        return base.CallResult(container.List([container.ndarray(big_graph)]))
Code example #16
class Hyperparams(hyperparams.Hyperparams):
    search_result = hyperparams.Hyperparameter[bytes](
        default=b'',
        semantic_types=[
            'https://metadata.datadrivendiscovery.org/types/ControlParameter',
        ],
        description="Pickled search result provided by Datamart",
    )
    augment_columns = hyperparams.Hyperparameter[list](
        default=container.List(),
        semantic_types=[
            'https://metadata.datadrivendiscovery.org/types/ControlParameter'
        ],
        description="Optional list of columns from the Datamart dataset that will be added"
    )
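
For completeness, a hedged sketch of overriding the `augment_columns` default when instantiating the `Hyperparams` class defined just above; the `defaults().replace(...)` pattern is assumed from the wider d3m hyper-parameters API rather than shown in these examples:

from d3m import container

# Assumption: Hyperparams.defaults() returns a full set of values and
# .replace() returns a copy with the given entries overridden.
hyperparams = Hyperparams.defaults().replace({
    'augment_columns': container.List([0, 2, 5]),
})
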
Code example #17
File: test_plasma.py  Project: tods-doc/tamu_d3m
    def test_list(self):
        l = container.List([1, 2, 3], generate_metadata=True)

        l.metadata = l.metadata.update((), {
            'test': 'foobar',
        })

        object_id = self.client.put(l)
        l_copy = self.client.get(object_id)

        self.assertIsInstance(l_copy, container.List)
        self.assertTrue(hasattr(l_copy, 'metadata'))

        self.assertSequenceEqual(l, l_copy)
        self.assertEqual(l.metadata.to_internal_json_structure(), l_copy.metadata.to_internal_json_structure())
        self.assertEqual(l_copy.metadata.query(()).get('test'), 'foobar')
Code example #18
    def setup(self, compact):
        self.large_dataframe_with_objects = pandas.DataFrame(
            {str(i): [str(j) for j in range(10000)] for i in range(50)},
            columns=[str(i) for i in range(50)])
        self.large_list_with_objects = [
            container.List([str(j) for i in range(50)]) for j in range(10000)
        ]
        self.large_ndarray_with_objects = numpy.array(
            [[[str(k) for k in range(5)] for i in range(10)] for j in range(10000)],
            dtype=object)
        self.large_dict_with_objects = {
            str(i): {str(j): j for j in range(10000)} for i in range(50)
        }
Code example #19
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        # In the future, we should store here data in Arrow format into
        # Plasma store and just pass an ObjectId of data over HTTP.
        value = self._convert_value(inputs)
        data = pickle.dumps(value)

        # TODO: Retry if connection fails.
        #       This connection can sometimes fail because the service inside a Docker container
        #       is not yet ready, despite container itself already running. Primitive should retry
        #       a few times before aborting.

        # Primitive knows the port the container is listening on.
        connection = client.HTTPConnection(
            self.docker_containers[DOCKER_KEY].address,
            port=self.docker_containers[DOCKER_KEY].ports['8000/tcp'])
        # This simple primitive does not keep any state in the Docker container.
        # But if your primitive does have to associate requests with a primitive, consider
        # using Python's "id(self)" call to get an identifier of a primitive's instance.
        self.logger.debug("HTTP request: container=%(container)s",
                          {'container': self.docker_containers[DOCKER_KEY]},
                          extra={'data': value})
        connection.request('POST', '/', data, {
            'Content-Type': 'multipart/form-data',
        })
        response = connection.getresponse()
        self.logger.debug("HTTP response: status=%(status)s",
                          {'status': response.status},
                          extra={'response': response})

        if response.status != 200:
            raise ValueError("Invalid HTTP response status: {status}".format(
                status=response.status))

        result = float(response.read())

        # Outputs are different from inputs, so we do not reuse metadata from inputs but generate new metadata.
        outputs = container.List((result, ), generate_metadata=True)

        # Wrap it into default "CallResult" object: we are not doing any iterations.
        return base.CallResult(outputs)
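
The TODO above asks for retries while the Docker container is still starting up; a minimal sketch of one way the request could be wrapped (the retry count and delay are arbitrary assumptions, and `post_with_retries` is a hypothetical helper, not part of the primitive's API):

import time
from http import client

def post_with_retries(address, port, data, retries=5, delay=1.0):
    last_error = None
    for _ in range(retries):
        try:
            connection = client.HTTPConnection(address, port=port)
            connection.request('POST', '/', data, {'Content-Type': 'multipart/form-data'})
            return connection.getresponse()
        except ConnectionError as error:
            # The service inside the container may not be listening yet; wait and retry.
            last_error = error
            time.sleep(delay)
    raise last_error
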
Code example #20
    def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]:
        if self._fitted:
            return base.CallResult(None)

        embeddings = self._training_inputs[1][0]
        csv = self._training_inputs[0]
        n_nodes, n_links = self._training_inputs[3]

        n_info = csv.shape[0]
        ranks = [[[], []] for i in range(n_links + 1)]

        try:
            int(np.array(csv['linkType'])[0])
        except:
            csv['linkType'] = np.zeros(n_info)

        # print(csv, file=sys.stderr)
        csv_headers = csv.columns
        for header in csv_headers:
            if header[:6] == "source":
                SOURCE = header
            elif header[:6] == "target":
                TARGET = header

        for i in range(n_info):
            temp_link = int(np.array(csv['linkType'])[i])
            temp_exists = int(np.array(csv['linkExists'])[i])
            temp_source = int(np.array(csv[SOURCE])[i])
            temp_target = int(np.array(csv[TARGET])[i])
            temp_dot = embeddings[temp_link*n_nodes + temp_source - 1] @ embeddings[temp_link*n_nodes + temp_target - 1]
            ranks[temp_link][temp_exists].append(temp_dot)
            ranks[-1][temp_exists].append(temp_dot)

        for i in range(len(ranks)):
            ranks[i][0] = np.sort(ranks[i][0])
            ranks[i][1] = np.sort(ranks[i][1])

        self._embeddings = container.ndarray(embeddings)
        self._inner_products = container.List(ranks)

        self._fitted = True

        return base.CallResult(None)
Code example #21
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        primitive_1 = self.hyperparams['primitive_1']
        primitive_2 = self.hyperparams['primitive_2']

        results = []

        if primitive_1 is not None:
            start = time.perf_counter()
            results.append(primitive_1.produce(inputs=inputs, timeout=timeout, iterations=iterations))
            delta = time.perf_counter() - start

            # Decrease the amount of time available to other calls. This delegates responsibility
            # of raising a "TimeoutError" exception to produce methods themselves. It also assumes
            # that if one passes a negative timeout value to a produce method, it raises a
            # "TimeoutError" exception correctly.
            if timeout is not None:
                timeout -= delta

        if primitive_2 is not None:
            results.append(primitive_2.produce(inputs=inputs, timeout=timeout, iterations=iterations))

        if not results:
            raise exceptions.InvalidArgumentValueError("No primitives provided as hyper-parameters.")

        # Even if the structure of outputs is the same as inputs, conceptually, outputs are different,
        # they are new data. So we do not reuse metadata from inputs but generate new metadata.
        outputs = container.List([sum(x) for x in zip(*[result.value for result in results])], generate_metadata=True)

        # We return the maximum number of iterations done by any produce method we called.
        iterations_done = None
        for result in results:
            if result.iterations_done is not None:
                if iterations_done is None:
                    iterations_done = result.iterations_done
                else:
                    iterations_done = max(iterations_done, result.iterations_done)

        return base.CallResult(
            value=outputs,
            has_finished=all(result.has_finished for result in results),
            iterations_done=iterations_done,
        )
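
The timeout bookkeeping in this example (subtract the elapsed time before the next call) generalizes to any sequence of produce calls; a small hedged helper illustrating the same budget idea, with `call_with_budget` being a hypothetical name:

import time

def call_with_budget(calls, timeout=None):
    # Each callable is expected to accept a `timeout` keyword and to raise
    # TimeoutError itself when the remaining budget runs out, mirroring the
    # convention described in the comments above.
    results = []
    for call in calls:
        start = time.perf_counter()
        results.append(call(timeout=timeout))
        if timeout is not None:
            timeout -= time.perf_counter() - start
    return results
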
Code example #22
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        np.random.seed(self.random_seed)
        #print('lcc, baby!', file=sys.stderr)

        csv = inputs[0]
        G = inputs[1][0]
        nodeIDs = inputs[2]
        TASK = inputs[3]

        # print(len(G), file=sys.stderr)
        subgraphs = [G.subgraph(i).copy() for i in nx.connected_components(G)]

        components = np.zeros(len(G), dtype=int)
        for i, connected_component in enumerate(nx.connected_components(G)):
            #print(np.array(list(connected_component), dtype=int), file=sys.stderr)
            components[np.array(list(connected_component), dtype=int)] = i + 1

        # NODEID = ""
        # for header in csv.columns:
        #     if "nodeID" in header:
        #         NODEID = header
        # nodeIDs = list(csv[NODEID].values)

        # if TASK == "vertexClassification":
        #     csv['components'] = components[np.array(csv[NODEID], dtype=int)]
        if TASK == "communityDetection":
            csv['components'] = components

        G_connected = [0]
        for i in subgraphs:
            if len(i) > len(G_connected):
                G_connected = i

        return base.CallResult(
            container.List([csv, [G_connected.copy()], nodeIDs]))
Code example #23
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        # read in graph and training csv
        np.random.seed(self.random_seed)
        
        graph = inputs['0']
        csv = inputs['learningData']

        n = len(graph)

        # grab link types (values) and edge list (keys)
        values = np.array(list(nx.get_edge_attributes(graph, 'linkType').values()))
        keys = np.array(list(nx.get_edge_attributes(graph, 'linkType').keys()))

        # grab the unique link types
        uniq_linktypes = np.unique(values)
        M = len(uniq_linktypes)

        n_edges = np.zeros(M) # imputation
        n_choose_2 = (n**2 - n)/2

        for i in range(len(values)):
            temp_linktype = values[i]
            n_edges[temp_linktype] += 1 # imputation

        A_imps = [0.5*(0.5 + n_edges[i]/n_choose_2)*np.ones((n, n)) for i in range(M)]

        for i in range(len(values)):
            temp_linktype = values[i]
            A_imps[temp_linktype][keys[i][0], keys[i][1]] = 1
            A_imps[temp_linktype][keys[i][1], keys[i][0]] = 1

        
        for i in range(M):
            imputations = 0
            while imputations < n_edges[i]:
                v1 = np.random.randint(n)
                v2 = np.random.randint(n)
                if v1 == v2 or A_imps[i][v1, v2] == 1:
                    pass
                else:
                    A_imps[i][v1, v2] = 0
                    A_imps[i][v2, v1] = 0
                    imputations += 1

        A = -1*np.zeros(shape = (M*n, M*n))

        for i in range(M):
            for j in range(i, M):
                A[i*n: (i + 1)*n, j*n: (j + 1)*n] = (A_imps[i] + A_imps[j])/2
                A[j*n: (j + 1)*n, i*n: (i + 1)*n] = (A_imps[i] + A_imps[j])/2 

        info = container.List([n, M])
        link_prediction = True

        # # initialize a list of graphs to pass around
        # list_of_graphs = [nx.Graph() for i in range(M)]

        # # each graph is on the same node set
        # for i in range(M):
        #     list_of_graphs[i].add_nodes_from(graph) 

        # # populate the graphs with edges
        # for i in range(len(values)):
        #     temp_G = list_of_graphs[values[i]]
        #     temp_G.add_edge(keys[i][0], keys[i][1])
        #     temp_G.add_edge(keys[i][1], keys[i][0])

        return base.CallResult(container.List([container.ndarray(A), csv, info, link_prediction]))
Code example #24
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """
        Need training data from set_training_data first.
        The encoder would record specified columns to encode and column values to
        unary encode later in the produce step.
        """
        if self._fitted:
            return CallResult(None, has_finished=True, iterations_done=1)

        if self._training_inputs is None:
            raise ValueError('Missing training(fitting) data.')

        data = self._training_inputs.copy()
        all_attributes = utils.list_columns_with_semantic_types(metadata=data.metadata, semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Attribute"])

        # Remove columns with all empty values, structural type str
        numeric = utils.list_columns_with_semantic_types(
            data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
        numeric = [x for x in numeric if x in all_attributes]
        for element in numeric:
            if data.metadata.query((mbase.ALL_ELEMENTS, element)).get('structural_type', ())==str:
                if pd.isnull(pd.to_numeric(data.iloc[:,element], errors='coerce')).sum() == data.shape[0]:
                    self._empty_columns.append(element)

        # Remove columns with all empty values, structural numeric
        is_empty = pd.isnull(data).sum(axis=0) == data.shape[0]
        for i in all_attributes:
            if is_empty.iloc[i]:
                self._empty_columns.append(i)
        self._empty_columns = list(set(self._empty_columns))
        self._empty_columns.reverse()
        self._empty_columns = container.List(self._empty_columns)
        data = utils.remove_columns(data, self._empty_columns)
        # print('fit', data.shape)

        categorical_attributes = utils.list_columns_with_semantic_types(
            metadata=data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/OrdinalData",
                "https://metadata.datadrivendiscovery.org/types/CategoricalData"
                ]
            )
        all_attributes = utils.list_columns_with_semantic_types(
            metadata=data.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"]
            )
        self._cat_col_index = container.List(set(all_attributes).intersection(numeric))
        self._cat_columns = container.List(data.columns[self._cat_col_index].tolist())
        #import pdb
        #pdb.set_trace()
        numerical_values = data.iloc[:, self._cat_col_index].apply(
            lambda col: pd.to_numeric(col, errors='coerce'))

        self._all_columns = set(data.columns)

        # mapping
        idict = {}
        for name in self._cat_columns:
            col = numerical_values[name]
            idict[name] = sorted(col.unique())
        self._mapping = idict

        if self._text2int:
            texts = data.drop(self._mapping.keys(),axis=1)
            texts = texts.select_dtypes(include=[object])
            le = Label_encoder()
            le.fit_pd(texts)
            self._textmapping = le.get_params()

        # determine whether to run unary encoder on the given column or not
        data_enc = data.iloc[:, self._cat_col_index].apply(lambda col: pd.to_numeric(col, errors='coerce'))
        for column_name in data_enc:
            col = data_enc[column_name]
            col.is_copy = False
            # only apply unary encoder when the amount of the numerical data is less than 12
            if col.unique().shape[0] < 13:
                self._requirement[column_name] = True
            else:
                self._requirement[column_name] = False

        self._fitted = True

        return CallResult(None, has_finished=True, iterations_done=1)
Code example #25
    def test_list(self):
        lst = container.List(['a', 'b', 'c'], generate_metadata=True)

        self.assertEqual(utils.to_json_structure(lst.metadata.to_internal_simple_structure()), [{
            'selector': [],
            'metadata': {
                'schema': base.CONTAINER_SCHEMA_VERSION,
                'structural_type': 'd3m.container.list.List',
                'dimension': {
                    'length': 3,
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'str',
            },
        }])

        lst = container.List([1, 'a', 2.0], generate_metadata=True)

        self.assertEqual(utils.to_json_structure(lst.metadata.to_internal_simple_structure()), [{
            'selector': [],
            'metadata': {
                'schema': base.CONTAINER_SCHEMA_VERSION,
                'structural_type': 'd3m.container.list.List',
                'dimension': {
                    'length': 3,
                },
            },
        }, {
            'selector': [0],
            'metadata': {
                'structural_type': 'int',
            },
        }, {
            'selector': [1],
            'metadata': {
                'structural_type': 'str',
            },
        }, {
            'selector': [2],
            'metadata': {
                'structural_type': 'float',
            },
        }])

        dataframe = container.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})
        dataframe.A = dataframe.A.astype(numpy.int64)
        lst = container.List([dataframe], generate_metadata=True)

        self.assertEqual(utils.to_json_structure(lst.metadata.to_internal_simple_structure()), [{
            'selector': [],
            'metadata': {
                'schema': base.CONTAINER_SCHEMA_VERSION,
                'structural_type': 'd3m.container.list.List',
                'dimension': {
                    'length': 1,
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'd3m.container.pandas.DataFrame',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
                'dimension': {
                    'name': 'rows',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                    'length': 3,
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
            'metadata': {
                'dimension': {
                    'name': 'columns',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                    'length': 2,
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 0],
            'metadata': {
                'name': 'A',
                'structural_type': 'numpy.int64',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 1],
            'metadata': {
                'name': 'B',
                'structural_type': 'str',
            },
        }])
Code example #26
    def time_large_dict_with_objects(self, compact):
        l = container.List([self.large_dict_with_objects],
                           generate_metadata=False)
        l.metadata.generate(l, compact=compact)
Code example #27
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        """
        generate features for the input.
        Input:
            typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
        Output:
            typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
        """
        # Wrap as container, if needed
        inputs = inputs.copy()
        if not pytypes.is_of_type(inputs, types.Container):
            if isinstance(inputs, pd.DataFrame):
                inputs = container.DataFrame(inputs)
            elif isinstance(inputs, np.matrix):
                inputs = container.matrix(inputs)
            elif isinstance(inputs, np.ndarray):
                inputs = container.ndarray(inputs)
            elif isinstance(inputs, list):
                inputs = container.List(inputs)
            else:
                # Inputs is not a container, and cannot be converted to a container.
                # Nothing to do, since cannot store the computed metadata.
                return CallResult(inputs)

        # calling the utility to detect integer and float datatype columns
        # inputs = dtype_detector.detector(inputs)

        # calling the utility to detect categorical datatype columns
        metadata = self._produce(inputs, inputs.metadata, [])
        # I guess they are updating the metadata here
        inputs.metadata = metadata

        if inputs.shape[0] > 100:
            self._sample_df = inputs.dropna().iloc[0:100, :]
        else:
            self._sample_df = inputs

        # calling date detector

        self._DateFeaturizer = DateFeaturizerOrg(inputs)
        try:
            cols = self._DateFeaturizer.detect_date_columns(self._sample_df)
        except Exception:
            _logger.error(traceback.format_exc())
            cols = list()
        if cols:
            indices = [
                inputs.columns.get_loc(c) for c in cols if c in inputs.columns
            ]
            for i in indices:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                temp_value = list(old_metadata["semantic_types"])
                if len(temp_value) >= 1:
                    # if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' not in old_metadata.get(
                    #         "semantic_types", []):
                    #     old_metadata["semantic_types"] = (
                    #         'https://metadata.datadrivendiscovery.org/types/CategoricalData',
                    #         'https://metadata.datadrivendiscovery.org/types/Attribute')
                    if 'https://metadata.datadrivendiscovery.org/types/Time' not in old_metadata.get(
                            "semantic_types", []):
                        old_metadata["semantic_types"] += (
                            'https://metadata.datadrivendiscovery.org/types/Time',
                        )
                # if isinstance(self._sample_df.iloc[:, i].head(1).values[0], str):
                #     old_metadata["structural_type"] = type("str")
                # elif isinstance(self._sample_df.iloc[:, i].head(1).values[0], int):
                #     old_metadata["structural_type"] = type(10)
                # else:
                #     old_metadata["structural_type"] = type(10.2)

                _logger.info(
                    "Date detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index':
                        i,
                        'old_metadata':
                        dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata':
                        old_metadata,
                    },
                )

                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        # calling the PhoneParser detector

        try:
            PhoneParser_indices = PhoneParser.detect(df=self._sample_df)
        except Exception:
            _logger.error(traceback.format_exc())
            PhoneParser_indices = dict()
        if PhoneParser_indices.get("columns_to_perform"):
            for i in PhoneParser_indices["columns_to_perform"]:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                # print("old metadata", old_metadata)
                if 'https://metadata.datadrivendiscovery.org/types/isAmericanPhoneNumber' not in old_metadata.get(
                        "semantic_types", []):
                    old_metadata["semantic_types"] += (
                        'https://metadata.datadrivendiscovery.org/types/isAmericanPhoneNumber',
                    )

                # if isinstance(self._sample_df.iloc[:, i].head(1).values[0], str):
                #     old_metadata["structural_type"] = type("str")
                # elif isinstance(self._sample_df.iloc[:, i].head(1).values[0], int):
                #     old_metadata["structural_type"] = type(10)
                # else:
                #     old_metadata["structural_type"] = type(10.2)

                _logger.info(
                    "Phone detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index':
                        i,
                        'old_metadata':
                        dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata':
                        old_metadata,
                    },
                )
                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        # calling the PunctuationSplitter detector

        try:
            PunctuationSplitter_indices = PunctuationParser.detect(
                df=self._sample_df,
                max_avg_length=self.hyperparams['split_on_column_with_avg_len']
            )
        except Exception:
            _logger.error(traceback.format_exc())
            PunctuationSplitter_indices = dict()
        if PunctuationSplitter_indices.get("columns_to_perform"):
            for i in PunctuationSplitter_indices["columns_to_perform"]:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                if 'https://metadata.datadrivendiscovery.org/types/TokenizableByPunctuation' not in old_metadata.get(
                        "semantic_types", []):
                    old_metadata["semantic_types"] += (
                        'https://metadata.datadrivendiscovery.org/types/TokenizableByPunctuation',
                    )

                # if isinstance(self._sample_df.iloc[:, i].head(1).values[0], str):
                #     old_metadata["structural_type"] = type("str")
                # elif isinstance(self._sample_df.iloc[:, i].head(1).values[0], int):
                #     old_metadata["structural_type"] = type(10)
                # else:
                #     old_metadata["structural_type"] = type(10.2)

                _logger.info(
                    "Punctuation detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index':
                        i,
                        'old_metadata':
                        dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata':
                        old_metadata,
                    },
                )
                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        # calling the NumAlphaSplitter detector

        try:
            NumAlphaSplitter_indices = NumAlphaParser.detect(
                df=self._sample_df,
                max_avg_length=self.hyperparams['split_on_column_with_avg_len'],
            )
        except Exception:
            _logger.error(traceback.format_exc())
            NumAlphaSplitter_indices = dict()

        if NumAlphaSplitter_indices.get("columns_to_perform"):
            for i in NumAlphaSplitter_indices["columns_to_perform"]:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                if 'https://metadata.datadrivendiscovery.org/types/TokenizableIntoNumericAndAlphaTokens' not in old_metadata.get(
                        "semantic_types", []):
                    old_metadata["semantic_types"] += (
                        'https://metadata.datadrivendiscovery.org/types/TokenizableIntoNumericAndAlphaTokens',
                    )

                # if isinstance(self._sample_df.iloc[:, i].head(1).values[0], str):
                #     old_metadata["structural_type"] = type("str")
                # elif isinstance(self._sample_df.iloc[:, i].head(1).values[0], int):
                #     old_metadata["structural_type"] = type(10)
                # else:
                #     old_metadata["structural_type"] = type(10.2)

                _logger.info(
                    "NumAlpha detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index':
                        i,
                        'old_metadata':
                        dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata':
                        old_metadata,
                    },
                )
                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        inputs = self._relabel_categorical(inputs)
        return CallResult(inputs)
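
Each detector block above appends a semantic type by copying the column's metadata dict and updating it. The d3m metadata API also exposes `add_semantic_type`; assuming it is available in the installed version, the same effect can be sketched as:

from d3m import container
from d3m.metadata import base as mbase

# Hypothetical one-column DataFrame standing in for `inputs` above.
df = container.DataFrame({'date': ['2001-01-01', '2002-02-02']}, generate_metadata=True)

column_index = 0
semantic_type = 'https://metadata.datadrivendiscovery.org/types/Time'

if semantic_type not in df.metadata.query((mbase.ALL_ELEMENTS, column_index)).get('semantic_types', ()):
    df.metadata = df.metadata.add_semantic_type((mbase.ALL_ELEMENTS, column_index), semantic_type)
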
Code example #28
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        # print('lcc produce started', file=sys.stderr)

        # unpack the data from the graph to list reader
        learning_data, graphs_full_all, nodeIDs_full_all, task_type = inputs

        # initialize lists for connected components and associated nodeids
        graphs_largest_all = []
        nodeIDs_largest_all = []

        for graph_index in range(len(graphs_full_all)):
            # select the graph and node ids for the current graph
            graph_full = graphs_full_all[graph_index]
            nodeIDs_full = nodeIDs_full_all[graph_index]

            # split the current graph into connected components
            subgraphs = [graph_full.subgraph(i).copy()
                        for i in sorted(nx.connected_components(graph_full),
                                        key=len, reverse=True)]

            # pick the largest connected component of the current graph
            graph_largest = [0]
            components = np.zeros(len(graph_full), dtype=int) # only for CD
            for i, connected_component in enumerate(subgraphs):
                # obtain indices associated with the node_ids in this component
                temp_indices = [j for j, x in enumerate(nodeIDs_full)
                                if x in [str(c) for c in list(connected_component)]]
                components[temp_indices] = i
                # check if the component is largest
                if len(connected_component) > len(graph_largest):
                    # if it is largest - flag as such
                    graph_largest = connected_component.copy()
                    # and subselect the appropriate nodeIDs
                    nodeIDs_largest = nodeIDs_full[temp_indices]

            # append the largest_connected component and nodeIDs
            graphs_largest_all.append(graph_largest)
            nodeIDs_largest_all.append(nodeIDs_largest)

            # for communityDetection the component needs to be specified in
            # the dataframe; in this problem there is always only one graph
            # TODO: consider avoiding the specification of the problem
            #       likely can be achieved by handling nodeIDs data smartly
            if task_type == "communityDetection":
                learning_data['components'] = components

        outputs = container.List([
            learning_data, graphs_largest_all, nodeIDs_largest_all])

        debugging = False
        if debugging:
            # GRAPH STUFF
            print("length of the first graph: {}".format(
                len(list(graphs_largest_all[0].nodes()))), file=sys.stderr)
            print("first 20 nodes of the first graph", file=sys.stderr)
            print(list(graphs_largest_all[0].nodes())[:20], file=sys.stderr)
            # NODE IDS STUFF
            print("type of a nodeID: {}".format(
                type(nodeIDs_largest_all[0][0])), file=sys.stderr)
            print("length of the nodeIds: {}".format(
                len(nodeIDs_largest_all[0])), file=sys.stderr)
            print("first 20 nodesIDs", file=sys.stderr)
            print(nodeIDs_largest_all[0][:20], file=sys.stderr)
            # TASK STUFF
            print("task: {}". format(task_type), file=sys.stderr)
            # LCC stuff
            print("unique components: {}".format(np.unique(components)),
                  file=sys.stderr)
        # print('lcc produce ended', file=sys.stderr)

        return base.CallResult(outputs)
Code example #29
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        # read in graph and training csv
        np.random.seed(self.random_seed)
        graph_dataframe = inputs['0']
        csv = inputs['learningData']

        # antons debugging feel free to delete
        # print("start of anton debugging", file=sys.stderr)

        # print(dir(inputs), file=sys.stderr)
        # for i in inputs:
        #     print(i, file=sys.stderr)
        #     print(type(i), file=sys.stderr)
        # print(type(inputs['0']), file=sys.stderr)
        #        print(inputs['0'].edges.data(), file=sys.stderr)
        # print(type(graph_dataframe.at[0, 'filename']), file=sys.stderr)
        # print(graph_dataframe.at[0, 'filename'], file=sys.stderr)

        # print("end of anton debugging", file=sys.stderr)

        temp_json = inputs.to_json_structure()
        location_uri = temp_json['location_uris'][0]
        path_to_graph = location_uri[:-15] + "graphs/" + graph_dataframe.at[
            0, 'filename']
        graph = nx.read_gml(path=path_to_graph[7:])
        n = len(graph)

        # grab link types (values) and edge list (keys)
        values = np.array(list(
            nx.get_edge_attributes(graph, 'linkType').values()),
                          dtype=int)
        keys = np.array(list(nx.get_edge_attributes(graph, 'linkType').keys()),
                        dtype=int)

        # grab the unique link types
        uniq_linktypes = np.unique(values)
        M = len(uniq_linktypes)

        if M == 0:
            M = 1
            n_edges = np.array([len(list(graph.edges))])
            values = np.zeros(n_edges[0])
            keys = np.array(list(graph.edges), dtype=int)
        else:
            n_edges = np.zeros(M)  # imputation

            for i in range(len(values)):
                temp_linktype = int(values[i])
                n_edges[temp_linktype] += 1  # imputation

        n_choose_2 = (n**2 - n) / 2
        A_imps = [
            0.5 * (0.5 + n_edges[i] / n_choose_2) * np.ones((n, n))
            for i in range(M)
        ]

        for i in range(len(values)):
            temp_linktype = int(values[i])
            A_imps[temp_linktype][keys[i][0] - 1, keys[i][1] - 1] = 1
            A_imps[temp_linktype][keys[i][1] - 1, keys[i][0] - 1] = 1

        for i in range(M):
            imputations = 0
            while imputations < n_edges[i]:
                v1 = np.random.randint(n)
                v2 = np.random.randint(n)
                if v1 == v2 or A_imps[i][v1, v2] == 1:
                    pass
                else:
                    A_imps[i][v1, v2] = 0
                    A_imps[i][v2, v1] = 0
                    imputations += 1

        A = -1 * np.zeros(shape=(M * n, M * n))

        for i in range(M):
            for j in range(i, M):
                A[i * n:(i + 1) * n,
                  j * n:(j + 1) * n] = (A_imps[i] + A_imps[j]) / 2
                A[j * n:(j + 1) * n,
                  i * n:(i + 1) * n] = (A_imps[i] + A_imps[j]) / 2

        info = container.List([n, M])
        link_prediction = True

        # # initialize a list of graphs to pass around
        # list_of_graphs = [nx.Graph() for i in range(M)]

        # # each graph is on the same node set
        # for i in range(M):
        #     list_of_graphs[i].add_nodes_from(graph)

        # # populate the graphs with edges
        # for i in range(len(values)):
        #     temp_G = list_of_graphs[values[i]]
        #     temp_G.add_edge(keys[i][0], keys[i][1])
        #     temp_G.add_edge(keys[i][1], keys[i][0])

        return base.CallResult(
            container.List([container.ndarray(A), csv, info, link_prediction]))
コード例 #30
0
    def test_complex_value(self):
        self.maxDiff = None

        dataset = container.Dataset({
            '0': container.DataFrame({
                'A': [
                    container.ndarray(numpy.array(['a', 'b', 'c'])),
                    container.ndarray(numpy.array([1, 2, 3], dtype=numpy.int64)),
                    container.ndarray(numpy.array([1.0, 2.0, 3.0])),
                ],
                'B': [
                    container.List(['a', 'b', 'c']),
                    container.List([1, 2, 3]),
                    container.List([1.0, 2.0, 3.0]),
                ],
            }),
        }, generate_metadata=False)
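        # The dataset is built with generate_metadata=False; metadata is generated
        # explicitly below, once with compact=True and once with compact=False.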

        dataset_metadata = dataset.metadata.generate(dataset, compact=True)

        self.assertEqual(utils.to_json_structure(dataset_metadata.to_internal_simple_structure()), [{
            'selector': [],
            'metadata': {
                'schema': base.CONTAINER_SCHEMA_VERSION,
                'structural_type': 'd3m.container.dataset.Dataset',
                'dimension': {
                    'name': 'resources',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'],
                    'length': 1,
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'd3m.container.pandas.DataFrame',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
                'dimension': {
                    'name': 'rows',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                    'length': 3,
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
            'metadata': {
                'dimension': {
                    'name': 'columns',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                    'length': 2,
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
            'metadata': {
                'dimension': {
                    'length': 3
                },
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 0],
            'metadata': {
                'structural_type': 'd3m.container.numpy.ndarray',
                'name': 'A',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 1],
            'metadata': {
                'structural_type': 'd3m.container.list.List',
                'name': 'B',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', 0, 0, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'numpy.str_',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', 0, 1, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'str',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', 1, 0, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'numpy.int64',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', 1, 1, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'int',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', 2, 0, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'numpy.float64',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__', 2, 1, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'float',
            }
        }])
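        # With compact=False, equal metadata is no longer merged under
        # '__ALL_ELEMENTS__' selectors; each resource, row, and cell gets its own
        # entry (e.g. ['0', 0, 0]).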

        dataset_metadata = dataset.metadata.generate(dataset, compact=False)

        self.assertEqual(utils.to_json_structure(dataset_metadata.to_internal_simple_structure()), [{
            'selector': [],
            'metadata': {
                'dimension': {
                    'length': 1,
                    'name': 'resources',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'],
                },
                'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json',
                'structural_type': 'd3m.container.dataset.Dataset',
            },
        }, {
            'selector': ['0'],
            'metadata': {
                'dimension': {
                    'length': 3,
                    'name': 'rows',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                },
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
                'structural_type': 'd3m.container.pandas.DataFrame',
            },
        },
        {
            'selector': ['0', '__ALL_ELEMENTS__'],
            'metadata': {
                'dimension': {
                    'length': 2,
                    'name': 'columns',
                    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                },
            },
        },
        {
            'selector': ['0', '__ALL_ELEMENTS__', 0],
            'metadata': {
                'name': 'A',
            },
        },
        {
            'selector': ['0', '__ALL_ELEMENTS__', 1],
            'metadata': {
                'name': 'B',
            },
        },
        {
            'selector': ['0', 0, 0],
            'metadata': {
                'dimension': {
                    'length': 3,
                },
                'structural_type': 'd3m.container.numpy.ndarray',
            },
        },
        {
            'selector': ['0', 0, 0, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'numpy.str_'
            },
        },
        {
            'selector': ['0', 0, 1],
            'metadata': {
                'dimension': {
                    'length': 3,
                },
                'structural_type': 'd3m.container.list.List',
            },
        }, {
            'selector': ['0', 0, 1, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'str',
            },
        }, {
            'selector': ['0', 1, 0],
            'metadata': {
                'dimension': {
                    'length': 3,
                },
                'structural_type': 'd3m.container.numpy.ndarray',
            },
        }, {
            'selector': ['0', 1, 0, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'numpy.int64',
            },
        }, {
            'selector': ['0', 1, 1],
            'metadata': {
                'dimension': {
                    'length': 3,
                },
                'structural_type': 'd3m.container.list.List',
            },
        }, {
            'selector': ['0', 1, 1, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'int',
            },
        }, {
            'selector': ['0', 2, 0],
            'metadata': {
                'dimension': {
                    'length': 3,
                },
                'structural_type': 'd3m.container.numpy.ndarray',
            },
        }, {
            'selector': ['0', 2, 0, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'numpy.float64',
            },
        },
        {
            'selector': ['0', 2, 1],
            'metadata': {
                'dimension': {
                    'length': 3,
                },
                'structural_type': 'd3m.container.list.List',
            },
        }, {
            'selector': ['0', 2, 1, '__ALL_ELEMENTS__'],
            'metadata': {
                'structural_type': 'float',
            },
        }])