def test_list_with_objects(self):
    l = container.List([container.List([str(j) for i in range(5)]) for j in range(10)], generate_metadata=True)

    self.assertEqual(utils.to_json_structure(l.metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'schema': base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.list.List',
            'dimension': {
                'length': 10,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'd3m.container.list.List',
            'dimension': {
                'length': 5,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'str',
        },
    }])
def test_pickle(self):
    # This test is not really useful anymore: the primitive no longer keeps random state,
    # so outputs depend only on inputs and not on previous calls to the "produce" method.
    hyperparams_class = RandomPrimitive.metadata.get_hyperparams()
    primitive = RandomPrimitive(random_seed=42, hyperparams=hyperparams_class.defaults())

    inputs = container.List(list(range(4)), generate_metadata=True)
    call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)
    self.assertTrue(numpy.allclose(call_metadata.value.values, container.ndarray([0.496714153011, -0.138264301171, 0.647688538101, 1.52302985641]).reshape(4, 1)))

    pickled_primitive = pickle.dumps(primitive)

    inputs = container.List(list(range(4, 8)), generate_metadata=True)
    call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)
    self.assertTrue(numpy.allclose(call_metadata.value.values, container.ndarray([-0.23415337, -0.23413696, 1.57921282, 0.76743473]).reshape(4, 1)))

    unpickled_primitive = pickle.loads(pickled_primitive)
    call_metadata = self.call_primitive(unpickled_primitive, 'produce', inputs=inputs)
    self.assertTrue(numpy.allclose(call_metadata.value.values, container.ndarray([-0.23415337, -0.23413696, 1.57921282, 0.76743473]).reshape(4, 1)))
def test_hyperparameter(self):
    hyperparams_class = MonomialPrimitive.metadata.get_hyperparams()
    primitive = MonomialPrimitive(hyperparams=hyperparams_class(bias=1))

    inputs = container.List([1, 2, 3, 4, 5, 6], generate_metadata=True)
    outputs = container.List([2, 4, 6, 8, 10, 12], generate_metadata=True)

    self.call_primitive(primitive, 'set_training_data', inputs=inputs, outputs=outputs)
    call_metadata = self.call_primitive(primitive, 'fit')

    self.assertEqual(call_metadata.has_finished, True)
    self.assertEqual(call_metadata.iterations_done, None)

    inputs = container.List([10, 20, 30], generate_metadata=True)
    call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)

    self.assertSequenceEqual(call_metadata.value, [21, 41, 61])
    self.assertEqual(call_metadata.has_finished, True)
    self.assertEqual(call_metadata.iterations_done, None)

    self.assertEqual(call_metadata.value.metadata.query(())['dimension']['length'], 3)
    self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS,))['structural_type'], float)
def test_lists(self):
    hyperparams_class = SumPrimitive.metadata.get_hyperparams()
    primitive = SumPrimitive(hyperparams=hyperparams_class.defaults(), docker_containers=self.get_docker_containers())

    inputs = container.List([container.List([1, 2, 3, 4]), container.List([5, 6, 7, 8])], generate_metadata=True)

    call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)

    # Because it is a singleton produce method we know there is exactly one value in the outputs.
    result = call_metadata.value[0]

    self.assertEqual(result, 36)
    self.assertEqual(call_metadata.has_finished, True)
    self.assertEqual(call_metadata.iterations_done, None)

    self.assertEqual(call_metadata.value.metadata.query((metadata_base.ALL_ELEMENTS,))['structural_type'], float)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    """
    Input: G, an n x n matrix or a networkx Graph.

    Returns the largest connected component of G.
    """
    G = inputs['0']
    csv = inputs['learningData']

    if len(csv) != 0:
        if len(list(nx.get_node_attributes(G, 'nodeID').values())) == 0:
            nx.set_node_attributes(G, 'nodeID', -1)
            for i in range(len(G)):
                G.node[i]['nodeID'] = i
        nodeIDs = list(nx.get_node_attributes(G, 'nodeID').values())
        nodeIDs = container.ndarray(np.array([int(i) for i in nodeIDs]))
        return base.CallResult(container.List([G.copy(), nodeIDs, csv]))

    if type(G) == np.ndarray:
        if G.ndim == 2 and G.shape[0] == G.shape[1]:  # n x n matrix
            G = nx.Graph(G)
        else:
            raise TypeError("Networkx graphs or n x n numpy arrays only")

    subgraphs = [G.subgraph(i).copy() for i in nx.connected_components(G)]

    G_connected = [[0]]
    for i in subgraphs:
        if len(i) > len(G_connected[0]):
            G_connected = [i]

    nodeIDs = list(nx.get_node_attributes(G_connected[0], 'nodeID').values())
    nodeIDs = container.ndarray(np.array([int(i) for i in nodeIDs]))

    return base.CallResult(container.List([G_connected[0].copy(), nodeIDs, csv]))
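# A minimal standalone sketch (not part of the primitive above) of the same
# largest-component extraction using networkx directly; `max` with `key=len`
# replaces the manual loop over subgraphs. Assumes an undirected graph.
import networkx as nx

def largest_connected_component(G: nx.Graph) -> nx.Graph:
    # nx.connected_components yields node sets; pick the biggest one by size.
    largest_nodes = max(nx.connected_components(G), key=len)
    return G.subgraph(largest_nodes).copy()

# Example: a graph with a 3-node component and a 2-node component.
g = nx.Graph([(0, 1), (1, 2), (3, 4)])
assert sorted(largest_connected_component(g).nodes()) == [0, 1, 2]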
def test_basic(self):
    hyperparams_class = RandomPrimitive.metadata.get_hyperparams()
    primitive = RandomPrimitive(random_seed=42, hyperparams=hyperparams_class.defaults())

    inputs = container.List(list(range(4)), generate_metadata=True)

    call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)

    self.assertTrue(numpy.allclose(call_metadata.value.values, container.ndarray([0.496714153011, -0.138264301171, 0.647688538101, 1.52302985641]).reshape((4, 1))))
    self.assertEqual(call_metadata.has_finished, True)
    self.assertEqual(call_metadata.iterations_done, None)

    self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS, 0))['structural_type'], numpy.float64)
def test_basic(self):
    hyperparam_primitive1 = NullTransformerPrimitive(hyperparams=NullTransformerPrimitive.metadata.get_hyperparams().defaults())
    hyperparam_primitive2 = NullTransformerPrimitive(hyperparams=NullTransformerPrimitive.metadata.get_hyperparams().defaults())

    primitive = PrimitiveSumPrimitive(hyperparams={
        'primitive_1': hyperparam_primitive1,
        'primitive_2': hyperparam_primitive2,
    })

    inputs = container.List([10, 20, 30], generate_metadata=True)

    call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)

    self.assertSequenceEqual(call_metadata.value, [20, 40, 60])
    self.assertEqual(call_metadata.has_finished, True)
    self.assertEqual(call_metadata.iterations_done, None)

    self.assertEqual(call_metadata.value.metadata.query(())['dimension']['length'], 3)
    self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS,))['structural_type'], int)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    self._fitted = True

    categorical_attributes = common_utils.list_columns_with_semantic_types(
        metadata=self._training_data.metadata,
        semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
            "https://metadata.datadrivendiscovery.org/types/CategoricalData",
        ])
    all_attributes = common_utils.list_columns_with_semantic_types(
        metadata=self._training_data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    self._s_cols = container.List(set(all_attributes).intersection(categorical_attributes))
    _logger.debug("Found %d categorical attributes." % (len(self._s_cols)))

    if len(self._s_cols) > 0:
        # Record the unique (non-null) values of each categorical column.
        self._model = {}
        for col_index in self._s_cols:
            self._model[col_index] = self._training_data.iloc[:, col_index].dropna().unique()

    return CallResult(None, has_finished=True)
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    categorical_attributes = common_utils.list_columns_with_semantic_types(
        metadata=self._training_data.metadata,
        semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
            "https://metadata.datadrivendiscovery.org/types/CategoricalData",
        ])
    all_attributes = common_utils.list_columns_with_semantic_types(
        metadata=self._training_data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    self._s_cols = container.List(set(all_attributes).intersection(categorical_attributes))
    print("[INFO] Found %d categorical attributes." % (len(self._s_cols)))

    if len(self._s_cols) > 0:
        # One lazily-created LabelEncoder per column; a single apply fits them all.
        temp_model = defaultdict(LabelEncoder)
        self._training_data.iloc[:, self._s_cols].apply(lambda x: temp_model[x.name].fit(x))
        self._model = dict(temp_model)
        self._fitted = True
    else:
        self._fitted = False

    # The declared return type is CallResult[None], so return one explicitly.
    return CallResult(None, has_finished=self._fitted)
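# Standalone sketch of the defaultdict(LabelEncoder) idiom used above: an encoder is
# created per column name the first time it is accessed, so one DataFrame.apply fits
# an encoder for every selected column. Plain pandas/sklearn; the column names here
# are illustrative only.
from collections import defaultdict

import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': ['S', 'M', 'S']})

encoders = defaultdict(LabelEncoder)
df.apply(lambda col: encoders[col.name].fit(col))  # fits one encoder per column

# Transform later, column by column, with the fitted encoders.
encoded = df.apply(lambda col: encoders[col.name].transform(col))
print(encoded)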
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
    result = np.abs(self._convert_value(inputs)).sum()

    outputs = container.List((result,), generate_metadata=True)

    return base.CallResult(outputs)
def _produce(self, inputs: DatasetSplitInputs, is_train: bool) -> base.CallResult[DatasetSplitOutputs]:
    """
    This function splits the fitted Dataset.

    Parameters
    ----------
    inputs:
        A list of 0-based indices which specify which splits are to be used as the test split in the output.
    is_train:
        Whether we are producing train or test data.

    Returns
    -------
    Returns a list of Datasets.
    """
    if not self._fitted or self._splits is None or self._dataset is None or self._main_resource_id is None or self._graph is None:
        raise exceptions.PrimitiveNotFittedError("Primitive not fitted.")

    output_datasets = container.List(generate_metadata=True)

    for index in inputs:
        train_indices, test_indices = self._splits[index]

        if is_train:
            output_dataset = base_utils.sample_rows(
                self._dataset,
                self._main_resource_id,
                set(train_indices),
                self._graph,
                delete_recursive=self.hyperparams.get('delete_recursive', False),
            )
        else:
            output_dataset = base_utils.sample_rows(
                self._dataset,
                self._main_resource_id,
                set(test_indices),
                self._graph,
                delete_recursive=self.hyperparams.get('delete_recursive', False),
            )

        output_datasets.append(output_dataset)

    output_datasets.metadata = metadata_base.DataMetadata({
        'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
        'structural_type': container.List,
        'dimension': {
            'length': len(output_datasets),
        },
    })

    # We update metadata based on metadata of each dataset.
    # TODO: In the future this might be done automatically by generate_metadata.
    #  See: https://gitlab.com/datadrivendiscovery/d3m/issues/119
    for index, dataset in enumerate(output_datasets):
        output_datasets.metadata = dataset.metadata.copy_to(output_datasets.metadata, (), (index,))

    return base.CallResult(output_datasets)
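# Hedged usage sketch for the split primitive above. The variable names and the
# set_training_data/produce/produce_score_data call pattern follow the d3m tabular
# split convention; treat them as assumptions, not as this primitive's documented API.
fold_indices = container.List([0], generate_metadata=True)  # produce the first split only

split_primitive.set_training_data(dataset=dataset)
split_primitive.fit()

train_datasets = split_primitive.produce(inputs=fold_indices).value             # train folds
score_datasets = split_primitive.produce_score_data(inputs=fold_indices).value  # test folds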
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
    with self._connection.cursor() as cursor:
        cursor.execute("SELECT 42;")
        return base.CallResult(container.List([cursor.fetchone()[0]], generate_metadata=True))
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    primary_key_cols = common_utils.list_columns_with_semantic_types(
        metadata=inputs.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/PrimaryKey"],
    )
    unfold_cols = common_utils.list_columns_with_semantic_types(
        metadata=inputs.metadata,
        semantic_types=self.hyperparams["unfold_semantic_types"],
    )

    if not primary_key_cols:
        warnings.warn("Did not find primary key column for grouping. Will not unfold.")
        return CallResult(inputs)

    if not unfold_cols:
        warnings.warn("Did not find any column to unfold. Will not unfold.")
        return CallResult(inputs)

    primary_key_col_names = [inputs.columns[pos] for pos in primary_key_cols]
    unfold_col_names = [inputs.columns[pos] for pos in unfold_cols]

    if self.hyperparams["use_pipeline_id_semantic_type"]:
        pipeline_id_cols = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/PipelineId"],
        )

        if len(pipeline_id_cols) >= 2:
            warnings.warn("Multiple pipeline id columns found. Will use the first.")

        if pipeline_id_cols:
            inputs = inputs.sort_values(primary_key_col_names + [inputs.columns[pos] for pos in pipeline_id_cols])
            self._sorted_pipe_ids = sorted(inputs.iloc[:, pipeline_id_cols[0]].unique())
        else:
            warnings.warn("No pipeline id column found by 'https://metadata.datadrivendiscovery.org/types/PipelineId'.")

    new_df = self._get_new_df(inputs=inputs, use_cols=primary_key_cols + unfold_cols)

    groupby_df = inputs.groupby(primary_key_col_names)[unfold_col_names].aggregate(lambda x: container.List(x)).reset_index(drop=False)

    ret_df = container.DataFrame(groupby_df)
    ret_df.metadata = new_df.metadata
    ret_df = self._update_metadata_dimension(df=ret_df)

    split_col_names = [inputs.columns[pos] for pos in unfold_cols]

    ret_df = self._split_aggregated(df=ret_df, split_col_names=split_col_names)
    ret_df = common_utils.remove_columns(
        inputs=ret_df,
        column_indices=[ret_df.columns.get_loc(name) for name in split_col_names],
    )

    return CallResult(ret_df)
def setup(self):
    self.large_dataframe = container.DataFrame(
        pandas.DataFrame({str(i): [str(j) for j in range(10000)] for i in range(50)}, columns=[str(i) for i in range(50)]),
        generate_metadata=True,
    )
    self.large_list = container.List(
        [container.List([str(j) for i in range(50)]) for j in range(10000)],
        generate_metadata=True,
    )
    self.large_ndarray = container.ndarray(
        numpy.array([[[str(k) for k in range(5)] for i in range(10)] for j in range(10000)], dtype=object),
        generate_metadata=True,
    )
    self.large_dict_list = container.List(
        {str(i): {str(j): j for j in range(10000)} for i in range(50)},
        generate_metadata=True,
    )
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    graph = inputs['0']
    csv = inputs['1']

    linktypes = np.array(csv['linkType'], dtype='int32')
    uniq_linktypes, n_i = np.unique(linktypes, return_counts=True)
    n_linktypes = len(uniq_linktypes)

    sources = np.array(csv['source_nodeID'], dtype='int32')
    targets = np.array(csv['target_nodeID'], dtype='int32')
    nodes = set(np.concatenate((sources, targets)))
    n_nodes = len(nodes)

    info = np.array(csv['linkExists'], dtype='int32')
    n_info = len(info)

    edge_counts = np.zeros(n_linktypes)
    for i in range(n_info):
        temp_link_type = linktypes[i]
        edge_counts[temp_link_type] += info[i]
    p_hats = edge_counts / n_i

    # Set up one graph per link type, initialized to the estimated edge probability.
    graphs = [p_hats[i] * np.ones(shape=(n_nodes, n_nodes)) for i in range(n_linktypes)]

    # Fill in the known edges.
    for i in range(n_info):
        temp_link_type = int(linktypes[i])
        graphs[temp_link_type][sources[i], targets[i]] = info[i]
        graphs[temp_link_type][targets[i], sources[i]] = info[i]

    big_graph = np.zeros(shape=(n_nodes * int(n_linktypes), n_nodes * int(n_linktypes)))

    # Diagonal blocks hold each link type's graph.
    for i in range(n_linktypes):
        big_graph[i * n_nodes:(i + 1) * n_nodes, i * n_nodes:(i + 1) * n_nodes] = graphs[i]

    # Off-diagonal blocks hold pairwise averages of the graphs.
    for i in range(n_linktypes):
        for j in range(i + 1, n_linktypes):
            big_graph[i * n_nodes:(i + 1) * n_nodes, j * n_nodes:(j + 1) * n_nodes] = (graphs[i] + graphs[j]) / 2
            big_graph[j * n_nodes:(j + 1) * n_nodes, i * n_nodes:(i + 1) * n_nodes] = (graphs[i] + graphs[j]) / 2

    return base.CallResult(container.List([container.ndarray(big_graph)]))
class Hyperparams(hyperparams.Hyperparams):
    search_result = hyperparams.Hyperparameter[bytes](
        default=b'',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Pickled search result provided by Datamart",
    )
    augment_columns = hyperparams.Hyperparameter[list](
        default=container.List(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Optional list of columns from the Datamart dataset that will be added",
    )
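# Hedged sketch of instantiating the Hyperparams class above, mirroring the
# hyperparams_class(bias=1) pattern used in the tests in this section. The pickled
# payload is an illustrative stand-in for a real Datamart search result.
import pickle

from d3m import container

hp = Hyperparams(
    search_result=pickle.dumps({'id': 'datamart.dataset.1'}),  # illustrative payload
    augment_columns=container.List([1, 3]),  # only add these columns from the Datamart dataset
)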
def test_list(self):
    l = container.List([1, 2, 3], generate_metadata=True)
    l.metadata = l.metadata.update((), {
        'test': 'foobar',
    })

    object_id = self.client.put(l)
    l_copy = self.client.get(object_id)

    self.assertIsInstance(l_copy, container.List)
    self.assertTrue(hasattr(l_copy, 'metadata'))

    self.assertSequenceEqual(l, l_copy)
    self.assertEqual(l.metadata.to_internal_json_structure(), l_copy.metadata.to_internal_json_structure())
    self.assertEqual(l_copy.metadata.query(()).get('test'), 'foobar')
def setup(self, compact):
    self.large_dataframe_with_objects = pandas.DataFrame(
        {str(i): [str(j) for j in range(10000)] for i in range(50)},
        columns=[str(i) for i in range(50)],
    )
    self.large_list_with_objects = [container.List([str(j) for i in range(50)]) for j in range(10000)]
    self.large_ndarray_with_objects = numpy.array(
        [[[str(k) for k in range(5)] for i in range(10)] for j in range(10000)],
        dtype=object,
    )
    self.large_dict_with_objects = {str(i): {str(j): j for j in range(10000)} for i in range(50)}
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
    # In the future, we should store data here in Arrow format in the Plasma store
    # and just pass an ObjectId of the data over HTTP.
    value = self._convert_value(inputs)
    data = pickle.dumps(value)

    # TODO: Retry if the connection fails.
    #  This connection can sometimes fail because the service inside the Docker container
    #  is not yet ready, despite the container itself already running. The primitive should
    #  retry a few times before aborting.

    # The primitive knows the port the container is listening on.
    connection = client.HTTPConnection(self.docker_containers[DOCKER_KEY].address, port=self.docker_containers[DOCKER_KEY].ports['8000/tcp'])

    # This simple primitive does not keep any state in the Docker container.
    # But if your primitive does have to associate requests with a primitive instance,
    # consider using Python's "id(self)" call to get an identifier of the instance.
    self.logger.debug("HTTP request: container=%(container)s", {'container': self.docker_containers[DOCKER_KEY]}, extra={'data': value})

    connection.request('POST', '/', data, {
        'Content-Type': 'multipart/form-data',
    })

    response = connection.getresponse()

    self.logger.debug("HTTP response: status=%(status)s", {'status': response.status}, extra={'response': response})

    if response.status != 200:
        raise ValueError("Invalid HTTP response status: {status}".format(status=response.status))

    result = float(response.read())

    # Outputs are different from inputs, so we do not reuse metadata from inputs but generate new metadata.
    outputs = container.List((result,), generate_metadata=True)

    # Wrap it into the default "CallResult" object: we are not doing any iterations.
    return base.CallResult(outputs)
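# A minimal sketch of the retry suggested in the TODO above: attempt the HTTP call
# a few times with a short delay before giving up. The attempt count and delay are
# assumptions, not values from the source.
import time
from http import client

def request_with_retry(address: str, port: int, data: bytes, attempts: int = 5, delay: float = 1.0) -> bytes:
    last_error = None
    for _ in range(attempts):
        try:
            connection = client.HTTPConnection(address, port=port)
            connection.request('POST', '/', data, {'Content-Type': 'multipart/form-data'})
            response = connection.getresponse()
            if response.status == 200:
                return response.read()
            last_error = ValueError("Invalid HTTP response status: {status}".format(status=response.status))
        except (ConnectionError, OSError) as error:
            # The service inside the container may not be ready yet.
            last_error = error
        time.sleep(delay)
    raise last_error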
def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]:
    if self._fitted:
        return base.CallResult(None)

    embeddings = self._training_inputs[1][0]
    csv = self._training_inputs[0]
    n_nodes, n_links = self._training_inputs[3]

    n_info = csv.shape[0]

    ranks = [[[], []] for i in range(n_links + 1)]

    try:
        int(np.array(csv['linkType'])[0])
    except Exception:
        csv['linkType'] = np.zeros(n_info)

    csv_headers = csv.columns
    for header in csv_headers:
        if header[:6] == "source":
            SOURCE = header
        elif header[:6] == "target":
            TARGET = header

    for i in range(n_info):
        temp_link = int(np.array(csv['linkType'])[i])
        temp_exists = int(np.array(csv['linkExists'])[i])
        temp_source = int(np.array(csv[SOURCE])[i])
        temp_target = int(np.array(csv[TARGET])[i])
        temp_dot = embeddings[temp_link * n_nodes + temp_source - 1] @ embeddings[temp_link * n_nodes + temp_target - 1]
        ranks[temp_link][temp_exists].append(temp_dot)
        ranks[-1][temp_exists].append(temp_dot)

    for i in range(len(ranks)):
        ranks[i][0] = np.sort(ranks[i][0])
        ranks[i][1] = np.sort(ranks[i][1])

    self._embeddings = container.ndarray(embeddings)
    self._inner_products = container.List(ranks)

    self._fitted = True

    return base.CallResult(None)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
    primitive_1 = self.hyperparams['primitive_1']
    primitive_2 = self.hyperparams['primitive_2']

    results = []

    if primitive_1 is not None:
        start = time.perf_counter()
        results.append(primitive_1.produce(inputs=inputs, timeout=timeout, iterations=iterations))
        delta = time.perf_counter() - start

        # Decrease the amount of time available to other calls. This delegates responsibility
        # of raising a "TimeoutError" exception to produce methods themselves. It also assumes
        # that if one passes a negative timeout value to a produce method, it raises a
        # "TimeoutError" exception correctly.
        if timeout is not None:
            timeout -= delta

    if primitive_2 is not None:
        results.append(primitive_2.produce(inputs=inputs, timeout=timeout, iterations=iterations))

    if not results:
        raise exceptions.InvalidArgumentValueError("No primitives provided as hyper-parameters.")

    # Even if the structure of outputs is the same as inputs, conceptually, outputs are different,
    # they are new data. So we do not reuse metadata from inputs but generate new metadata.
    outputs = container.List([sum(x) for x in zip(*[result.value for result in results])], generate_metadata=True)

    # We return the maximum number of iterations done by any produce method we called.
    iterations_done = None
    for result in results:
        if result.iterations_done is not None:
            if iterations_done is None:
                iterations_done = result.iterations_done
            else:
                iterations_done = max(iterations_done, result.iterations_done)

    return base.CallResult(
        value=outputs,
        has_finished=all(result.has_finished for result in results),
        iterations_done=iterations_done,
    )
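# The timeout bookkeeping above generalizes to any number of wrapped primitives:
# measure each call and shrink the remaining budget. A minimal sketch, assuming each
# callable honors the d3m convention of raising TimeoutError itself when its timeout
# is exceeded.
import time

def call_all_within_budget(calls, inputs, timeout=None):
    results = []
    for call in calls:
        start = time.perf_counter()
        results.append(call(inputs=inputs, timeout=timeout))
        if timeout is not None:
            # Whatever this call used is no longer available to the remaining calls.
            timeout -= time.perf_counter() - start
    return results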
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    np.random.seed(self.random_seed)

    csv = inputs[0]
    G = inputs[1][0]
    nodeIDs = inputs[2]
    TASK = inputs[3]

    subgraphs = [G.subgraph(i).copy() for i in nx.connected_components(G)]

    # Label each node with the index of its connected component.
    components = np.zeros(len(G), dtype=int)
    for i, connected_component in enumerate(nx.connected_components(G)):
        components[np.array(list(connected_component), dtype=int)] = i + 1

    if TASK == "communityDetection":
        csv['components'] = components

    # Pick the largest connected component.
    G_connected = [0]
    for i in subgraphs:
        if len(i) > len(G_connected):
            G_connected = i

    return base.CallResult(container.List([csv, [G_connected.copy()], nodeIDs]))
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    # Read in the graph and the training csv.
    np.random.seed(self.random_seed)

    graph = inputs['0']
    csv = inputs['learningData']

    n = len(graph)

    # Grab link types (values) and the edge list (keys).
    values = np.array(list(nx.get_edge_attributes(graph, 'linkType').values()))
    keys = np.array(list(nx.get_edge_attributes(graph, 'linkType').keys()))

    # Grab the unique link types.
    uniq_linktypes = np.unique(values)
    M = len(uniq_linktypes)

    n_edges = np.zeros(M)
    n_choose_2 = (n**2 - n) / 2

    for i in range(len(values)):
        temp_linktype = values[i]
        n_edges[temp_linktype] += 1

    # Imputation: initialize each adjacency matrix to a smoothed edge probability.
    A_imps = [0.5 * (0.5 + n_edges[i] / n_choose_2) * np.ones((n, n)) for i in range(M)]

    for i in range(len(values)):
        temp_linktype = values[i]
        A_imps[temp_linktype][keys[i][0], keys[i][1]] = 1
        A_imps[temp_linktype][keys[i][1], keys[i][0]] = 1

    # For each link type, mark as many random non-edges as there are edges.
    for i in range(M):
        imputations = 0
        while imputations < n_edges[i]:
            v1 = np.random.randint(n)
            v2 = np.random.randint(n)
            if v1 == v2 or A_imps[i][v1, v2] == 1:
                pass
            else:
                A_imps[i][v1, v2] = 0
                A_imps[i][v2, v1] = 0
                imputations += 1

    A = -1 * np.zeros(shape=(M * n, M * n))
    for i in range(M):
        for j in range(i, M):
            A[i * n:(i + 1) * n, j * n:(j + 1) * n] = (A_imps[i] + A_imps[j]) / 2
            A[j * n:(j + 1) * n, i * n:(i + 1) * n] = (A_imps[i] + A_imps[j]) / 2

    info = container.List([n, M])
    link_prediction = True

    return base.CallResult(container.List([container.ndarray(A), csv, info, link_prediction]))
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    """
    Requires training data from set_training_data first. The encoder records which
    columns to encode and the column values to unary encode later in the produce step.
    """
    if self._fitted:
        return CallResult(None, has_finished=True)

    if self._training_inputs is None:
        raise ValueError('Missing training (fitting) data.')

    data = self._training_inputs.copy()
    all_attributes = utils.list_columns_with_semantic_types(
        metadata=data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    # Remove columns with all empty values, structural type str.
    numeric = utils.list_columns_with_semantic_types(
        data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
    numeric = [x for x in numeric if x in all_attributes]

    for element in numeric:
        if data.metadata.query((mbase.ALL_ELEMENTS, element)).get('structural_type', ()) == str:
            if pd.isnull(pd.to_numeric(data.iloc[:, element], errors='coerce')).sum() == data.shape[0]:
                self._empty_columns.append(element)

    # Remove columns with all empty values, structural type numeric.
    is_empty = pd.isnull(data).sum(axis=0) == data.shape[0]
    for i in all_attributes:
        if is_empty.iloc[i]:
            self._empty_columns.append(i)

    self._empty_columns = list(set(self._empty_columns))
    self._empty_columns.reverse()
    self._empty_columns = container.List(self._empty_columns)
    data = utils.remove_columns(data, self._empty_columns)

    categorical_attributes = utils.list_columns_with_semantic_types(
        metadata=data.metadata,
        semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
            "https://metadata.datadrivendiscovery.org/types/CategoricalData",
        ])
    all_attributes = utils.list_columns_with_semantic_types(
        metadata=data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    self._cat_col_index = container.List(set(all_attributes).intersection(numeric))
    self._cat_columns = container.List(data.columns[self._cat_col_index].tolist())

    numerical_values = data.iloc[:, self._cat_col_index].apply(lambda col: pd.to_numeric(col, errors='coerce'))

    self._all_columns = set(data.columns)

    # Mapping from column name to its sorted unique values.
    idict = {}
    for name in self._cat_columns:
        col = numerical_values[name]
        idict[name] = sorted(col.unique())
    self._mapping = idict

    if self._text2int:
        texts = data.drop(self._mapping.keys(), axis=1)
        texts = texts.select_dtypes(include=[object])
        le = Label_encoder()
        le.fit_pd(texts)
        self._textmapping = le.get_params()

    # Determine whether to run the unary encoder on a given column or not.
    data_enc = data.iloc[:, self._cat_col_index].apply(lambda col: pd.to_numeric(col, errors='coerce'))
    for column_name in data_enc:
        col = data_enc[column_name]
        col.is_copy = False
        # Only apply the unary encoder when a column has fewer than 13 unique numerical values.
        if col.unique().shape[0] < 13:
            self._requirement[column_name] = True
        else:
            self._requirement[column_name] = False

    self._fitted = True

    return CallResult(None, has_finished=True, iterations_done=1)
def test_list(self):
    lst = container.List(['a', 'b', 'c'], generate_metadata=True)

    self.assertEqual(utils.to_json_structure(lst.metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'schema': base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.list.List',
            'dimension': {
                'length': 3,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'str',
        },
    }])

    lst = container.List([1, 'a', 2.0], generate_metadata=True)

    self.assertEqual(utils.to_json_structure(lst.metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'schema': base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.list.List',
            'dimension': {
                'length': 3,
            },
        },
    }, {
        'selector': [0],
        'metadata': {
            'structural_type': 'int',
        },
    }, {
        'selector': [1],
        'metadata': {
            'structural_type': 'str',
        },
    }, {
        'selector': [2],
        'metadata': {
            'structural_type': 'float',
        },
    }])

    dataframe = container.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})
    dataframe.A = dataframe.A.astype(numpy.int64)

    lst = container.List([dataframe], generate_metadata=True)

    self.assertEqual(utils.to_json_structure(lst.metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'schema': base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.list.List',
            'dimension': {
                'length': 1,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'd3m.container.pandas.DataFrame',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
            'dimension': {
                'name': 'rows',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                'length': 3,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'name': 'columns',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                'length': 2,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 0],
        'metadata': {
            'name': 'A',
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 1],
        'metadata': {
            'name': 'B',
            'structural_type': 'str',
        },
    }])
def time_large_dict_with_objects(self, compact):
    l = container.List([self.large_dict_with_objects], generate_metadata=False)
    l.metadata.generate(l, compact=compact)
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    """
    Generate features for the input.

    Input: typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
    Output: typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
    """
    # Wrap as container, if needed.
    inputs = inputs.copy()
    if not pytypes.is_of_type(inputs, types.Container):
        if isinstance(inputs, pd.DataFrame):
            inputs = container.DataFrame(inputs)
        elif isinstance(inputs, np.matrix):
            inputs = container.matrix(inputs)
        elif isinstance(inputs, np.ndarray):
            inputs = container.ndarray(inputs)
        elif isinstance(inputs, list):
            inputs = container.List(inputs)
        else:
            # Inputs is not a container and cannot be converted to one.
            # Nothing to do, since we cannot store the computed metadata.
            return CallResult(inputs)

    # Detect categorical datatype columns and update the metadata accordingly.
    metadata = self._produce(inputs, inputs.metadata, [])
    inputs.metadata = metadata

    if inputs.shape[0] > 100:
        self._sample_df = inputs.dropna().iloc[0:100, :]
    else:
        self._sample_df = inputs

    # Calling the date detector.
    self._DateFeaturizer = DateFeaturizerOrg(inputs)
    try:
        cols = self._DateFeaturizer.detect_date_columns(self._sample_df)
    except Exception:
        _logger.error(traceback.format_exc())
        cols = list()
    if cols:
        indices = [inputs.columns.get_loc(c) for c in cols if c in inputs.columns]
        for i in indices:
            old_metadata = dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
            temp_value = list(old_metadata["semantic_types"])
            if len(temp_value) >= 1:
                if 'https://metadata.datadrivendiscovery.org/types/Time' not in old_metadata.get("semantic_types", []):
                    old_metadata["semantic_types"] += ('https://metadata.datadrivendiscovery.org/types/Time',)
            _logger.info(
                "Date detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                {
                    'column_index': i,
                    'old_metadata': dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                    'new_metadata': old_metadata,
                },
            )
            inputs.metadata = inputs.metadata.update((mbase.ALL_ELEMENTS, i), old_metadata)

    # Calling the PhoneParser detector.
    try:
        PhoneParser_indices = PhoneParser.detect(df=self._sample_df)
    except Exception:
        _logger.error(traceback.format_exc())
        PhoneParser_indices = dict()
    if PhoneParser_indices.get("columns_to_perform"):
        for i in PhoneParser_indices["columns_to_perform"]:
            old_metadata = dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
            if 'https://metadata.datadrivendiscovery.org/types/isAmericanPhoneNumber' not in old_metadata.get("semantic_types", []):
                old_metadata["semantic_types"] += ('https://metadata.datadrivendiscovery.org/types/isAmericanPhoneNumber',)
            _logger.info(
                "Phone detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                {
                    'column_index': i,
                    'old_metadata': dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                    'new_metadata': old_metadata,
                },
            )
            inputs.metadata = inputs.metadata.update((mbase.ALL_ELEMENTS, i), old_metadata)

    # Calling the PunctuationSplitter detector.
    try:
        PunctuationSplitter_indices = PunctuationParser.detect(df=self._sample_df, max_avg_length=self.hyperparams['split_on_column_with_avg_len'])
    except Exception:
        _logger.error(traceback.format_exc())
        PunctuationSplitter_indices = dict()
    if PunctuationSplitter_indices.get("columns_to_perform"):
        for i in PunctuationSplitter_indices["columns_to_perform"]:
            old_metadata = dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
            if 'https://metadata.datadrivendiscovery.org/types/TokenizableByPunctuation' not in old_metadata.get("semantic_types", []):
                old_metadata["semantic_types"] += ('https://metadata.datadrivendiscovery.org/types/TokenizableByPunctuation',)
            _logger.info(
                "Punctuation detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                {
                    'column_index': i,
                    'old_metadata': dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                    'new_metadata': old_metadata,
                },
            )
            inputs.metadata = inputs.metadata.update((mbase.ALL_ELEMENTS, i), old_metadata)

    # Calling the NumAlphaSplitter detector.
    try:
        NumAlphaSplitter_indices = NumAlphaParser.detect(df=self._sample_df, max_avg_length=self.hyperparams['split_on_column_with_avg_len'])
    except Exception:
        _logger.error(traceback.format_exc())
        NumAlphaSplitter_indices = dict()
    if NumAlphaSplitter_indices.get("columns_to_perform"):
        for i in NumAlphaSplitter_indices["columns_to_perform"]:
            old_metadata = dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
            if 'https://metadata.datadrivendiscovery.org/types/TokenizableIntoNumericAndAlphaTokens' not in old_metadata.get("semantic_types", []):
                old_metadata["semantic_types"] += ('https://metadata.datadrivendiscovery.org/types/TokenizableIntoNumericAndAlphaTokens',)
            _logger.info(
                "NumAlpha detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                {
                    'column_index': i,
                    'old_metadata': dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                    'new_metadata': old_metadata,
                },
            )
            inputs.metadata = inputs.metadata.update((mbase.ALL_ELEMENTS, i), old_metadata)

    inputs = self._relabel_categorical(inputs)

    return CallResult(inputs)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    # Unpack the data from the graph-to-list reader.
    learning_data, graphs_full_all, nodeIDs_full_all, task_type = inputs

    # Initialize lists for connected components and associated nodeIDs.
    graphs_largest_all = []
    nodeIDs_largest_all = []

    for graph_index in range(len(graphs_full_all)):
        # Select the graph and node ids for the current graph.
        graph_full = graphs_full_all[graph_index]
        nodeIDs_full = nodeIDs_full_all[graph_index]

        # Split the current graph into connected components.
        subgraphs = [graph_full.subgraph(i).copy() for i in sorted(nx.connected_components(graph_full), key=len, reverse=True)]

        # Pick the largest connected component of the current graph.
        graph_largest = [0]
        components = np.zeros(len(graph_full), dtype=int)  # only for CD
        for i, connected_component in enumerate(subgraphs):
            # Obtain indices associated with the node ids in this component.
            temp_indices = [j for j, x in enumerate(nodeIDs_full) if x in [str(c) for c in list(connected_component)]]
            components[temp_indices] = i
            # Check if the component is the largest so far.
            if len(connected_component) > len(graph_largest):
                # If it is, flag it as such.
                graph_largest = connected_component.copy()
                # And subselect the appropriate nodeIDs.
                nodeIDs_largest = nodeIDs_full[temp_indices]

        # Append the largest connected component and nodeIDs.
        graphs_largest_all.append(graph_largest)
        nodeIDs_largest_all.append(nodeIDs_largest)

    # For communityDetection the component needs to be specified in the dataframe;
    # in this problem there is always only one graph.
    # TODO: Consider avoiding the specification of the problem;
    #  this can likely be achieved by handling nodeIDs data smartly.
    if task_type == "communityDetection":
        learning_data['components'] = components

    outputs = container.List([learning_data, graphs_largest_all, nodeIDs_largest_all])

    debugging = False
    if debugging:
        # Graph stuff.
        print("length of the first graph: {}".format(len(list(graphs_largest_all[0].nodes()))), file=sys.stderr)
        print("first 20 nodes of the first graph", file=sys.stderr)
        print(list(graphs_largest_all[0].nodes())[:20], file=sys.stderr)
        # Node IDs stuff.
        print("type of a nodeID: {}".format(type(nodeIDs_largest_all[0][0])), file=sys.stderr)
        print("length of the nodeIDs: {}".format(len(nodeIDs_largest_all[0])), file=sys.stderr)
        print("first 20 nodeIDs", file=sys.stderr)
        print(nodeIDs_largest_all[0][:20], file=sys.stderr)
        # Task stuff.
        print("task: {}".format(task_type), file=sys.stderr)
        # LCC stuff.
        print("unique components: {}".format(np.unique(components)), file=sys.stderr)

    return base.CallResult(outputs)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
    # Read in the graph and the training csv.
    np.random.seed(self.random_seed)

    graph_dataframe = inputs['0']
    csv = inputs['learningData']

    temp_json = inputs.to_json_structure()
    location_uri = temp_json['location_uris'][0]
    path_to_graph = location_uri[:-15] + "graphs/" + graph_dataframe.at[0, 'filename']
    graph = nx.read_gml(path=path_to_graph[7:])

    n = len(graph)

    # Grab link types (values) and the edge list (keys).
    values = np.array(list(nx.get_edge_attributes(graph, 'linkType').values()), dtype=int)
    keys = np.array(list(nx.get_edge_attributes(graph, 'linkType').keys()), dtype=int)

    # Grab the unique link types.
    uniq_linktypes = np.unique(values)
    M = len(uniq_linktypes)

    if M == 0:
        # No link type attribute: treat all edges as a single link type.
        M = 1
        n_edges = np.array([len(list(graph.edges))])
        values = np.zeros(n_edges[0])
        keys = np.array(list(graph.edges), dtype=int)
    else:
        n_edges = np.zeros(M)
        for i in range(len(values)):
            temp_linktype = int(values[i])
            n_edges[temp_linktype] += 1

    # Imputation: initialize each adjacency matrix to a smoothed edge probability.
    n_choose_2 = (n**2 - n) / 2
    A_imps = [0.5 * (0.5 + n_edges[i] / n_choose_2) * np.ones((n, n)) for i in range(M)]

    for i in range(len(values)):
        temp_linktype = int(values[i])
        A_imps[temp_linktype][keys[i][0] - 1, keys[i][1] - 1] = 1
        A_imps[temp_linktype][keys[i][1] - 1, keys[i][0] - 1] = 1

    # For each link type, mark as many random non-edges as there are edges.
    for i in range(M):
        imputations = 0
        while imputations < n_edges[i]:
            v1 = np.random.randint(n)
            v2 = np.random.randint(n)
            if v1 == v2 or A_imps[i][v1, v2] == 1:
                pass
            else:
                A_imps[i][v1, v2] = 0
                A_imps[i][v2, v1] = 0
                imputations += 1

    A = -1 * np.zeros(shape=(M * n, M * n))
    for i in range(M):
        for j in range(i, M):
            A[i * n:(i + 1) * n, j * n:(j + 1) * n] = (A_imps[i] + A_imps[j]) / 2
            A[j * n:(j + 1) * n, i * n:(i + 1) * n] = (A_imps[i] + A_imps[j]) / 2

    info = container.List([n, M])
    link_prediction = True

    return base.CallResult(container.List([container.ndarray(A), csv, info, link_prediction]))
def test_complex_value(self):
    self.maxDiff = None

    dataset = container.Dataset({
        '0': container.DataFrame({
            'A': [
                container.ndarray(numpy.array(['a', 'b', 'c'])),
                container.ndarray(numpy.array([1, 2, 3], dtype=numpy.int64)),
                container.ndarray(numpy.array([1.0, 2.0, 3.0])),
            ],
            'B': [
                container.List(['a', 'b', 'c']),
                container.List([1, 2, 3]),
                container.List([1.0, 2.0, 3.0]),
            ],
        }),
    }, generate_metadata=False)

    dataset_metadata = dataset.metadata.generate(dataset, compact=True)

    self.assertEqual(utils.to_json_structure(dataset_metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'schema': base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.dataset.Dataset',
            'dimension': {
                'name': 'resources',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'],
                'length': 1,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'd3m.container.pandas.DataFrame',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
            'dimension': {
                'name': 'rows',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
                'length': 3,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'name': 'columns',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
                'length': 2,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'length': 3,
            },
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 0],
        'metadata': {
            'structural_type': 'd3m.container.numpy.ndarray',
            'name': 'A',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 1],
        'metadata': {
            'structural_type': 'd3m.container.list.List',
            'name': 'B',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 0, 0, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.str_',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 0, 1, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'str',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 1, 0, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 1, 1, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'int',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 2, 0, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.float64',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__', 2, 1, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'float',
        },
    }])

    dataset_metadata = dataset.metadata.generate(dataset, compact=False)

    self.assertEqual(utils.to_json_structure(dataset_metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'dimension': {
                'length': 1,
                'name': 'resources',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'],
            },
            'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json',
            'structural_type': 'd3m.container.dataset.Dataset',
        },
    }, {
        'selector': ['0'],
        'metadata': {
            'dimension': {
                'length': 3,
                'name': 'rows',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
            },
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'],
            'structural_type': 'd3m.container.pandas.DataFrame',
        },
    }, {
        'selector': ['0', '__ALL_ELEMENTS__'],
        'metadata': {
            'dimension': {
                'length': 2,
                'name': 'columns',
                'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
            },
        },
    }, {
        'selector': ['0', '__ALL_ELEMENTS__', 0],
        'metadata': {
            'name': 'A',
        },
    }, {
        'selector': ['0', '__ALL_ELEMENTS__', 1],
        'metadata': {
            'name': 'B',
        },
    }, {
        'selector': ['0', 0, 0],
        'metadata': {
            'dimension': {
                'length': 3,
            },
            'structural_type': 'd3m.container.numpy.ndarray',
        },
    }, {
        'selector': ['0', 0, 0, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.str_',
        },
    }, {
        'selector': ['0', 0, 1],
        'metadata': {
            'dimension': {
                'length': 3,
            },
            'structural_type': 'd3m.container.list.List',
        },
    }, {
        'selector': ['0', 0, 1, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'str',
        },
    }, {
        'selector': ['0', 1, 0],
        'metadata': {
            'dimension': {
                'length': 3,
            },
            'structural_type': 'd3m.container.numpy.ndarray',
        },
    }, {
        'selector': ['0', 1, 0, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.int64',
        },
    }, {
        'selector': ['0', 1, 1],
        'metadata': {
            'dimension': {
                'length': 3,
            },
            'structural_type': 'd3m.container.list.List',
        },
    }, {
        'selector': ['0', 1, 1, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'int',
        },
    }, {
        'selector': ['0', 2, 0],
        'metadata': {
            'dimension': {
                'length': 3,
            },
            'structural_type': 'd3m.container.numpy.ndarray',
        },
    }, {
        'selector': ['0', 2, 0, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'numpy.float64',
        },
    }, {
        'selector': ['0', 2, 1],
        'metadata': {
            'dimension': {
                'length': 3,
            },
            'structural_type': 'd3m.container.list.List',
        },
    }, {
        'selector': ['0', 2, 1, '__ALL_ELEMENTS__'],
        'metadata': {
            'structural_type': 'float',
        },
    }])