def _add_column(self, main_resource_id: str, data: pandas.DataFrame, metadata: metadata_base.DataMetadata,
                column_data: pandas.DataFrame, column_metadata: typing.Dict) -> typing.Tuple[pandas.DataFrame, metadata_base.DataMetadata]:
    """
    Append the single-column ``column_data`` to ``data`` and record ``column_metadata``
    for the newly added (last) column of resource ``main_resource_id``.

    Parameters
    ----------
    main_resource_id : str
        Resource ID the metadata update is recorded under.
    data : pandas.DataFrame
        Existing data, or ``None`` when this is the first column.
    metadata : metadata_base.DataMetadata
        Metadata to record the new column's metadata into.
    column_data : pandas.DataFrame
        Exactly one column of data to append.
    column_metadata : typing.Dict
        Metadata for the appended column.

    Returns
    -------
    A tuple of the combined data and the updated metadata.
    """
    assert column_data.shape[1] == 1

    if data is None:
        data = column_data
    else:
        # Align the new column's index with the existing data so that "concat"
        # joins rows positionally instead of producing NaN-padded rows for
        # mismatched index labels.
        column_data = column_data.set_index(data.index)
        data = pandas.concat([data, column_data], axis=1)

    # The appended column is always the last one.
    metadata = metadata.update((main_resource_id, metadata_base.ALL_ELEMENTS, data.shape[1] - 1), column_metadata, source=self)

    return data, metadata
def _copy_elements_metadata(cls, source_metadata: metadata_base.DataMetadata, selector_prefix: metadata_base.Selector, selector: metadata_base.Selector, target_metadata: metadata_base.DataMetadata, *, source: typing.Any = None) -> typing.DataMetadata if False else metadata_base.DataMetadata:
    """
    Recursively copy metadata of all elements found under ``selector_prefix + selector``
    in ``source_metadata`` into ``target_metadata``, stored under ``selector`` (i.e.,
    with the prefix stripped).
    """
    if source is None:
        source = cls

    prefix = list(selector_prefix)

    for child in source_metadata.get_elements(prefix + list(selector)):
        child_selector = list(selector) + [child]
        child_metadata = source_metadata.query(prefix + child_selector)

        target_metadata = target_metadata.update(child_selector, child_metadata, source=source)

        # Descend so nested elements are copied as well.
        target_metadata = cls._copy_elements_metadata(source_metadata, selector_prefix, child_selector, target_metadata, source=source)

    return target_metadata
def _join_by_index(self, main_resource_id: str, inputs: Inputs, inputs_column_index: int,
                   data: typing.Optional[pandas.DataFrame], metadata: metadata_base.DataMetadata,
                   foreign_resource_id: str, foreign_column_index: int) -> typing.Tuple[pandas.DataFrame, metadata_base.DataMetadata]:
    """
    For every row of the main resource, look up the matching row in the foreign
    resource (by equality between the main column's value and the foreign column's
    value) and append all foreign columns to ``data``, copying over their metadata.

    Parameters
    ----------
    main_resource_id : str
        Resource ID of the main (referencing) resource.
    inputs : Inputs
        The input dataset containing both resources.
    inputs_column_index : int
        Index of the foreign-key column in the main resource.
    data : typing.Optional[pandas.DataFrame]
        Data accumulated so far, or ``None``.
    metadata : metadata_base.DataMetadata
        Metadata accumulated so far; updated with the joined columns' metadata.
    foreign_resource_id : str
        Resource ID of the referenced (foreign) resource.
    foreign_column_index : int
        Index of the referenced column in the foreign resource.

    Returns
    -------
    A tuple of the combined data and the updated metadata.
    """
    main_column_metadata = inputs.metadata.query((main_resource_id, metadata_base.ALL_ELEMENTS, inputs_column_index))

    main_data = inputs[main_resource_id]
    foreign_data = inputs[foreign_resource_id]

    # Map each foreign-column value to its row index for O(1) lookups.
    value_to_index = {}
    for value_index, value in enumerate(foreign_data.iloc[:, foreign_column_index]):
        # TODO: Check if values are not unique.
        rows = None  # placeholder removed below; see note
        value_to_index[value] = value_index

    # Materialize, for every main row, the full matching foreign row.
    # NOTE(review): a main value with no foreign match raises KeyError here —
    # confirm whether that is the intended contract.
    rows = []
    for value in main_data.iloc[:, inputs_column_index]:
        rows.append([foreign_data.iloc[value_to_index[value], j] for j in range(len(foreign_data.columns))])

    if data is None:
        data_columns_length = 0
    else:
        data_columns_length = data.shape[1]

    # Copy over metadata.
    foreign_data_columns_length = inputs.metadata.query((foreign_resource_id, metadata_base.ALL_ELEMENTS))['dimension']['length']
    for column_index in range(foreign_data_columns_length):
        column_metadata = dict(inputs.metadata.query((foreign_resource_id, metadata_base.ALL_ELEMENTS, column_index)))

        # Work on a fresh list: "dict(...)" above is a shallow copy, so mutating
        # the queried "semantic_types" list in place could corrupt shared metadata.
        semantic_types = list(column_metadata.get('semantic_types', []))

        # Foreign keys can reference same foreign row multiple times, so values in this column might not be even
        # unique anymore, nor they are a primary key at all. So we remove the semantic type marking a column as such.
        if 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' in semantic_types:
            semantic_types = [
                semantic_type for semantic_type in semantic_types
                if semantic_type != 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'
            ]

        # If the original index column was an attribute, make sure the new index column is as well.
        if 'https://metadata.datadrivendiscovery.org/types/Attribute' in main_column_metadata.get('semantic_types', []):
            if 'https://metadata.datadrivendiscovery.org/types/Attribute' not in semantic_types:
                semantic_types.append('https://metadata.datadrivendiscovery.org/types/Attribute')

        # If the original index column was a suggested target, make sure the new index column is as well.
        if 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' in main_column_metadata.get('semantic_types', []):
            if 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' not in semantic_types:
                semantic_types.append('https://metadata.datadrivendiscovery.org/types/SuggestedTarget')

        # Only write the key back when it existed before or we have something to
        # record, so the emitted metadata matches the original structure.
        if semantic_types or 'semantic_types' in column_metadata:
            column_metadata['semantic_types'] = semantic_types

        metadata = metadata.update((main_resource_id, metadata_base.ALL_ELEMENTS, data_columns_length + column_index), column_metadata, source=self)

    selected_data = pandas.DataFrame(rows)

    if data is None:
        data = selected_data
    else:
        # Align indices so "concat" joins rows positionally.
        selected_data = selected_data.set_index(data.index)
        data = pandas.concat([data, selected_data], axis=1, ignore_index=True)

    return data, metadata
def _add_target_semantic_types(cls, metadata: metadata_base.DataMetadata, source: typing.Any,
                               target_names: typing.Optional[typing.List] = None) -> metadata_base.DataMetadata:
    """
    Mark every column in ``metadata`` as a target and a predicted target,
    optionally renaming columns from ``target_names``.

    Parameters
    ----------
    metadata : metadata_base.DataMetadata
        Metadata to update.
    source : typing.Any
        Source recorded in the metadata log for these updates.
    target_names : typing.Optional[typing.List]
        When provided, the column at index ``i`` is renamed to ``target_names[i]``.
        NOTE(review): assumes the list covers all columns; an ``IndexError`` is
        raised otherwise — confirm against callers.

    Returns
    -------
    The updated metadata.
    """
    columns_length = metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

    for column_index in range(columns_length):
        metadata = metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, column_index),
                                              'https://metadata.datadrivendiscovery.org/types/Target', source=source)
        metadata = metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, column_index),
                                              'https://metadata.datadrivendiscovery.org/types/PredictedTarget', source=source)
        if target_names:
            metadata = metadata.update((metadata_base.ALL_ELEMENTS, column_index), {
                'name': target_names[column_index],
            }, source=source)

    return metadata
def _select_columns_metadata(cls, inputs_metadata: metadata_base.DataMetadata, resource_id: metadata_base.SelectorSegment, columns: typing.Sequence[int]) -> typing.DataMetadata if False else metadata_base.DataMetadata:
    """
    This is similar to ``select_columns_metadata`` but operates on a Dataset.

    Keeps only metadata for the given ``columns`` of resource ``resource_id``,
    re-numbering integer column indices to their new positions.
    """
    if not columns:
        raise exceptions.InvalidArgumentValueError("No columns selected.")

    # This makes a copy so that we can modify metadata in-place.
    output_metadata = inputs_metadata.update(
        (
            resource_id,
            metadata_base.ALL_ELEMENTS,
        ),
        {
            'dimension': {
                'length': len(columns),
            },
        },
    )

    # Collect the internal metadata entries whose per-column "elements" dicts
    # need to be filtered: the ALL_ELEMENTS entry's own ALL_ELEMENTS child and
    # its concrete elements, and — for a concrete resource — that resource's
    # ALL_ELEMENTS child and concrete elements as well.
    # NOTE(review): this reaches into private "_current_metadata" internals;
    # verify against the metadata package version in use.
    if resource_id is metadata_base.ALL_ELEMENTS:
        metadata_chain = itertools.chain(
            [
                output_metadata._current_metadata.all_elements.all_elements
                if output_metadata._current_metadata.all_elements is not None else None
            ],
            output_metadata._current_metadata.all_elements.elements.values(
            ) if output_metadata._current_metadata.all_elements is not None else iter([None]),
        )
    else:
        resource_id = typing.cast(metadata_base.SimpleSelectorSegment, resource_id)

        metadata_chain = itertools.chain(
            [
                output_metadata._current_metadata.all_elements.all_elements
                if output_metadata._current_metadata.all_elements is not None else None
            ],
            output_metadata._current_metadata.all_elements.elements.values(
            ) if output_metadata._current_metadata.all_elements is not None else iter([None]),
            [
                output_metadata._current_metadata.elements[resource_id].
                all_elements
            ],
            output_metadata._current_metadata.elements[resource_id].
            elements.values(),
        )

    # TODO: Do this better. This change is missing an entry in metadata log.
    for element_metadata_entry in metadata_chain:
        if element_metadata_entry is None:
            continue

        # Swap in a fresh dict and copy over only the selected columns.
        elements = element_metadata_entry.elements
        element_metadata_entry.elements = {}
        for i, column_index in enumerate(columns):
            if column_index in elements:
                # If "column_index" is really numeric, we re-enumerate it.
                if isinstance(column_index, int):
                    element_metadata_entry.elements[i] = elements[
                        column_index]
                else:
                    # Non-numeric selector segments (e.g. ALL_ELEMENTS) keep
                    # their original key.
                    element_metadata_entry.elements[
                        column_index] = elements[column_index]

    return output_metadata