def _update_type(df, added_cols):
    """Infer and write metadata for freshly appended columns of *df*.

    For each column named in *added_cols*, classify it as categorical/text
    (when >90% of values fail numeric coercion), integer (>90% of the
    numeric values are whole numbers), or float, then record the matching
    semantic and structural types plus the Attribute marker in df.metadata.

    Returns the same DataFrame object with its metadata refreshed.
    """
    # Resolve the positions of the newly added columns once, up front.
    col_positions = [df.columns.get_loc(name) for name in added_cols]
    for pos in col_positions:
        meta = dict(df.metadata.query((mbase.ALL_ELEMENTS, pos)))
        as_numeric = pd.to_numeric(df.iloc[:, pos], errors='coerce')
        total = as_numeric.shape[0]
        non_numeric = as_numeric.isnull().sum()
        if non_numeric / total > 0.9:
            # Mostly non-numeric: decide between categorical and free text.
            if HelperFunction.is_categorical(df.iloc[:, pos]):
                meta['semantic_types'] = (
                    "https://metadata.datadrivendiscovery.org/types/CategoricalData",)
            else:
                meta['semantic_types'] = ("http://schema.org/Text",)
            meta['structural_type'] = type("type")  # i.e. str
        else:
            # Mostly numeric: whole numbers vs. fractional values.
            whole_numbers = (as_numeric % 1) == 0
            if np.sum(whole_numbers) / total > 0.9:
                meta['semantic_types'] = ("http://schema.org/Integer",)
                meta['structural_type'] = type(10)  # i.e. int
            else:
                meta['semantic_types'] = ("http://schema.org/Float",)
                meta['structural_type'] = type(10.1)  # i.e. float
        meta['semantic_types'] += (
            "https://metadata.datadrivendiscovery.org/types/Attribute",)
        df.metadata = df.metadata.update((mbase.ALL_ELEMENTS, pos), meta)
    return df
def update_types(self, col_name):
    """Re-infer and store the semantic type metadata for one column.

    Classifies self.df[col_name] as categorical/text (when >90% of its
    values fail numeric coercion), integer (>90% of the coerced values are
    whole numbers), or float, appends the Attribute marker, and writes the
    result back into self.df.metadata.
    """
    col_idx = self.df.columns.get_loc(col_name)
    meta = dict(self.df.metadata.query((mbase.ALL_ELEMENTS, col_idx)))
    as_numeric = pd.to_numeric(self.df[col_name], errors='coerce')
    total = as_numeric.shape[0]
    missing = as_numeric.isnull().sum()
    if missing / total > 0.9:
        # Column is overwhelmingly non-numeric: categorical vs. free text.
        if HelperFunction.is_categorical(self.df[col_name]):
            meta['semantic_types'] = (
                "https://metadata.datadrivendiscovery.org/types/CategoricalData",)
        else:
            meta['semantic_types'] = ("http://schema.org/Text",)
    else:
        # Mostly numeric: whole numbers vs. fractional values.
        whole = (as_numeric % 1) == 0
        if np.sum(whole) / total > 0.9:
            meta['semantic_types'] = ("http://schema.org/Integer",)
        else:
            meta['semantic_types'] = ("http://schema.org/Float",)
    meta['semantic_types'] += (
        "https://metadata.datadrivendiscovery.org/types/Attribute",)
    self.df.metadata = self.df.metadata.update(
        (mbase.ALL_ELEMENTS, col_idx), meta)
def detect(df, max_avg_length=30, columns_ignore=None):
    """Find short text columns that can be split into number+alpha parts.

    Scans the Text-typed columns of *df* (minus *columns_ignore*) and, for
    each column whose non-empty cells average fewer than *max_avg_length*
    characters, asks NumAlphaParser whether the values mix numeric and
    alphabetic segments.

    Parameters:
        df: DataFrame whose metadata marks Text columns.
        max_avg_length: skip columns whose average cell length is >= this.
        columns_ignore: optional list of column indices to exclude.

    Returns:
        dict with "columns_to_perform" (column indices to split) and
        "split_to" (number of resulting parts per column).
    """
    # Avoid the mutable-default-argument pitfall of the original signature.
    if columns_ignore is None:
        columns_ignore = []
    positive_semantic_types = {"http://schema.org/Text"}
    cols_to_detect = HelperFunction.cols_to_clean(df, positive_semantic_types)
    require_checking = list(set(cols_to_detect).difference(columns_ignore))
    extends = {"columns_to_perform": [], "split_to": []}
    for one_column in require_checking:
        rows = df.iloc[:, one_column]
        lengths = [len(str(row)) for row in rows if len(str(row)) > 0]
        # Guard clauses keep the nesting flat; behavior is unchanged.
        if not lengths:
            continue
        if sum(lengths) / len(lengths) >= max_avg_length:
            continue
        column = df.iloc[:, one_column]
        # num_check presumably rejects purely-numeric columns — confirm
        # against NumAlphaParser; only mixed columns are split.
        if not NumAlphaParser.num_check(column):
            if NumAlphaParser.is_num_alpha(column):
                result = NumAlphaParser.num_alpha_splitter(column)
                extends["columns_to_perform"].append(one_column)
                extends["split_to"].append(len(result))
    return extends
def _relabel_categorical(inputs: Input) -> Output:
    """Demote mislabeled CategoricalData columns to Text/Integer/Float.

    For every column tagged CategoricalData whose values no longer look
    categorical (per HelperFunction.is_categorical), strip the categorical
    tag and add a type inferred from numeric coercion: Text when >90% of
    values fail coercion, Integer when >90% of coerced values are whole
    numbers, Float otherwise.  Returns the same object with updated
    metadata; cell values are NOT modified (the conversions are commented
    out below).
    """
    for col in range(inputs.shape[1]):
        old_metadata = dict(inputs.metadata.query((mbase.ALL_ELEMENTS, col)))
        semantic_type = old_metadata.get('semantic_types', [])
        if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_type:
            if not HelperFunction.is_categorical(inputs.iloc[:, col]):
                # Remove the stale categorical tag, keep the rest.
                old_metadata['semantic_types'] = tuple(
                    i for i in old_metadata['semantic_types']
                    if i != 'https://metadata.datadrivendiscovery.org/types/CategoricalData')
                # Re-infer the type from how well the values coerce to numbers.
                numerics = pd.to_numeric(inputs.iloc[:, col], errors='coerce')
                length = numerics.shape[0]
                nans = numerics.isnull().sum()
                if nans / length > 0.9:
                    # Mostly non-numeric -> plain text.
                    if "http://schema.org/Text" not in old_metadata['semantic_types']:
                        old_metadata['semantic_types'] += ("http://schema.org/Text",)
                else:
                    intcheck = (numerics % 1) == 0
                    if np.sum(intcheck) / length > 0.9:
                        if "http://schema.org/Integer" not in old_metadata['semantic_types']:
                            old_metadata['semantic_types'] += ("http://schema.org/Integer",)
                        # old_metadata['structural_type'] = type(10)
                        # inputs.iloc[:, col] = numerics
                    else:
                        if "http://schema.org/Float" not in old_metadata['semantic_types']:
                            old_metadata['semantic_types'] += ("http://schema.org/Float",)
                        # old_metadata['structural_type'] = type(10.2)
                        # inputs.iloc[:, col] = numerics
                # NOTE(review): metadata is written back only for relabeled
                # columns — confirm untouched columns need no update.
                inputs.metadata = inputs.metadata.update((mbase.ALL_ELEMENTS, col), old_metadata)
    return inputs
def detect(df, columns_ignore=None):
    """Find the Text-typed columns of *df* that contain phone numbers.

    Parameters:
        df: DataFrame whose metadata marks Text columns.
        columns_ignore: optional list of column indices to exclude.

    Returns:
        dict with "columns_to_perform" (indices of phone-number columns,
        per PhoneParser.is_phone) and "split_to" (always left empty here).
    """
    # Avoid the mutable-default-argument pitfall of the original signature.
    if columns_ignore is None:
        columns_ignore = []
    positive_semantic_types = {"http://schema.org/Text"}
    cols_to_detect = HelperFunction.cols_to_clean(df, positive_semantic_types)
    require_checking = list(set(cols_to_detect).difference(columns_ignore))
    extends = {"columns_to_perform": [], "split_to": []}
    for one_column in require_checking:
        if PhoneParser.is_phone(df.iloc[:, one_column]):
            extends["columns_to_perform"].append(one_column)
    return extends
def _update_structural_type(self):
    """Infer missing semantic types and align structural types, in place.

    Walks every column of self._input_data_copy.  Columns with no
    'semantic_types' metadata get a type inferred from numeric coercion
    (CategoricalData/Text when >90% of values fail coercion, otherwise
    Integer or Float — in which case the column VALUES are replaced by the
    coerced numerics) plus the Attribute marker.  Columns already tagged
    Integer or Float are coerced to numeric and get the matching
    structural_type.  Metadata is written back for every column.
    """
    for col in range(self._input_data_copy.shape[1]):
        old_metadata = dict(
            self._input_data_copy.metadata.query(
                (mbase.ALL_ELEMENTS, col)))
        semantic_type = old_metadata.get('semantic_types', None)
        if not semantic_type:
            # No semantic type recorded yet: infer one from the data.
            numerics = pd.to_numeric(self._input_data_copy.iloc[:, col],
                                     errors='coerce')
            length = numerics.shape[0]
            nans = numerics.isnull().sum()
            if nans / length > 0.9:
                # Mostly non-numeric: categorical vs. free text.
                if HelperFunction.is_categorical(
                        self._input_data_copy.iloc[:, col]):
                    old_metadata['semantic_types'] = (
                        "https://metadata.datadrivendiscovery.org/types/CategoricalData",
                    )
                else:
                    old_metadata['semantic_types'] = (
                        "http://schema.org/Text", )
            else:
                # Mostly numeric: whole numbers vs. fractional values.
                intcheck = (numerics % 1) == 0
                if np.sum(intcheck) / length > 0.9:
                    old_metadata['semantic_types'] = (
                        "http://schema.org/Integer", )
                    old_metadata['structural_type'] = type(10)  # int
                    # Replace the raw values with their numeric coercion.
                    self._input_data_copy.iloc[:, col] = numerics
                else:
                    old_metadata['semantic_types'] = (
                        "http://schema.org/Float", )
                    old_metadata['structural_type'] = type(10.2)  # float
                    self._input_data_copy.iloc[:, col] = numerics
            old_metadata['semantic_types'] += (
                "https://metadata.datadrivendiscovery.org/types/Attribute", )
        else:
            # Semantic type already present: only align values and
            # structural_type for numeric columns.
            if "http://schema.org/Integer" in semantic_type:
                self._input_data_copy.iloc[:, col] = pd.to_numeric(
                    self._input_data_copy.iloc[:, col], errors='coerce')
                old_metadata['structural_type'] = type(10)  # int
            elif "http://schema.org/Float" in semantic_type:
                self._input_data_copy.iloc[:, col] = pd.to_numeric(
                    self._input_data_copy.iloc[:, col], errors='coerce')
                old_metadata['structural_type'] = type(10.2)  # float
        self._input_data_copy.metadata = self._input_data_copy.metadata.update(
            (mbase.ALL_ELEMENTS, col), old_metadata)
def compute_lang(column, feature):
    """Detect which natural language(s) a text column uses.

    Stores the result in *feature* under "natural_language_of_feature" as a
    list of {'name', 'count'} dicts, most frequent language first.  Numeric
    cells are skipped.  Does nothing when the column is empty after
    dropping missing values.

    PROBLEMS (inherited from the heuristic detector):
        1. not accurate when string contains digits
        2. not accurate when string is too short
    """
    column = column.dropna()  # ignore all missing values
    if column.size == 0:
        # Empty column: leave *feature* untouched.
        return
    feature["natural_language_of_feature"] = list()
    counts = {}
    for cell in column:
        # Skip pure numbers — they carry no language signal.
        if cell.isdigit() or HelperFunction.is_Decimal_Number(cell):
            continue
        try:
            lang_code = detect(cell)
        except Exception:
            # Detector could not classify this cell; report and move on.
            print(
                "there is something may not be any language nor number: {}"
                .format(cell))
            continue
        counts[lang_code] = counts.get(lang_code, 0) + 1
    # Emit languages ordered by descending frequency.
    feature["natural_language_of_feature"] = [
        {'name': code, 'count': counts[code]}
        for code in sorted(counts, key=counts.get, reverse=True)
    ]
def update_type(extends, df_origin):
    """Append the columns in *extends* to *df_origin* and infer their metadata.

    Parameters:
        extends: dict mapping new column names to column data.
        df_origin: d3m DataFrame the new columns are appended to.

    Returns:
        A new DataFrame with the appended columns and inferred semantic
        types, or *df_origin* unchanged when *extends* is empty.

    Bug fix: the original built ``new_df`` only inside ``if extends != {}``
    but unconditionally executed ``return new_df``, raising NameError for an
    empty *extends*.  We now return *df_origin* unchanged in that case.
    """
    if not extends:
        return df_origin
    extends_df = pd.DataFrame.from_dict(extends)
    extends_df = d3m_DataFrame(extends_df, generate_metadata=True)
    # Align indices so append_columns matches rows positionally.
    extends_df.index = df_origin.index.copy()
    new_df = d3m_DataFrame.append_columns(df_origin, extends_df)
    indices = [new_df.columns.get_loc(key) for key in extends]
    for idx in indices:
        old_metadata = dict(new_df.metadata.query((mbase.ALL_ELEMENTS, idx)))
        numerics = pd.to_numeric(new_df.iloc[:, idx], errors='coerce')
        length = numerics.shape[0]
        nans = numerics.isnull().sum()
        if nans / length > 0.9:
            # Mostly non-numeric: categorical vs. free text.
            if HelperFunction.is_categorical(new_df.iloc[:, idx]):
                old_metadata['semantic_types'] = (
                    "https://metadata.datadrivendiscovery.org/types/CategoricalData",
                )
            else:
                old_metadata['semantic_types'] = ("http://schema.org/Text", )
        else:
            # Mostly numeric: whole numbers vs. fractional values.
            intcheck = (numerics % 1) == 0
            if np.sum(intcheck) / length > 0.9:
                old_metadata['semantic_types'] = (
                    "http://schema.org/Integer", )
            else:
                old_metadata['semantic_types'] = ("http://schema.org/Float", )
        old_metadata['semantic_types'] += (
            "https://metadata.datadrivendiscovery.org/types/Attribute", )
        new_df.metadata = new_df.metadata.update((mbase.ALL_ELEMENTS, idx),
                                                 old_metadata)
    return new_df
def detect_date_columns(self, sampled_df, except_list=None):
    """
    Detects date columns in the sampled_df and returns a list of column
    indices which have dates.

    params:
    - sampled_df [DataFrame]: a sample of rows from the original dataframe
      for detecting dates
    - except_list [List]: optional list of column indices to be ignored
    """
    # Avoid the mutable-default-argument pitfall of the original signature.
    if except_list is None:
        except_list = []
    positive_semantic_types = {
        "https://metadata.datadrivendiscovery.org/types/Time",
        "http://schema.org/Text",
    }
    cols_to_detect = HelperFunction.cols_to_clean(sampled_df,
                                                  positive_semantic_types)
    # A column counts as a date column when _parse_column succeeds on it.
    return [
        idx for idx in cols_to_detect
        if idx not in except_list
        and self._parse_column(sampled_df, idx) is not None
    ]
def compute_punctuation(column, feature, weight_outlier):
    """
    compute the statistical values related to punctuations, for details, see
    the format section of README. not apply for numbers (eg: for number 1.23,
    "." does not count as a punctuation)

    weight_outlier: = number_of_sigma in function "helper_outlier_calcu"

    Writes feature["most_common_punctuations"] — a list of per-punctuation
    dicts (character, count, ratio of all chars, mean per-cell density, and
    outlier count), sorted by descending count — only when the column
    contains at least one punctuation character.
    """
    column = column.dropna()  # get rid of all missing value
    if (column.size == 0):  # if the column is empty, do nothing
        return
    number_of_chars = sum(column.apply(len))  # number of all chars in column
    num_chars_cell = np.zeros(column.size)  # number of chars for each cell
    puncs_cell = np.zeros(
        [column.size, len(string.punctuation)],
        dtype=int)  # (number_of_cell * number_of_puncs) sized array

    # step 1: pre-calculations
    cell_id = -1
    for cell in column:
        cell_id += 1
        num_chars_cell[cell_id] = len(cell)
        # only counts puncs for non-number cell
        if cell.isdigit() or HelperFunction.is_Decimal_Number(cell):
            continue
        else:
            # One count per punctuation character, column-aligned with
            # string.punctuation.
            counts_cell_punc = np.asarray(
                list(cell.count(c) for c in string.punctuation))
            puncs_cell[cell_id] = counts_cell_punc
    counts_column_punc = puncs_cell.sum(
        axis=0)  # number of possible puncs in this column
    # NOTE(review): a zero-length cell makes num_chars_cell 0 here, so this
    # division yields nan/inf with a numpy warning — confirm upstream
    # guarantees non-empty strings.
    cell_density_array = puncs_cell / num_chars_cell.reshape([column.size, 1])
    puncs_density_average = cell_density_array.sum(axis=0) / column.size

    # step 2: extract from pre-calculated data
    # only create this feature when punctuations exist
    if (sum(counts_column_punc) > 0):
        feature["most_common_punctuations"] = list()  # list of dict
        # extract the counts to feature, for each punctuation
        for i in range(len(string.punctuation)):
            if (counts_column_punc[i] == 0
                ):  # if no this punctuation occur in the whole column, ignore
                continue
            else:
                punc_obj = {}
                punc_obj["punctuation"] = string.punctuation[i]
                punc_obj["count"] = counts_column_punc[i]
                punc_obj["ratio"] = counts_column_punc[i] / float(
                    number_of_chars)
                punc_obj["punctuation_density_aggregate"] = {
                    "mean": puncs_density_average[i]
                }
                # calculate outlier
                outlier_array = helper_outlier_calcu(cell_density_array[:, i],
                                                     weight_outlier)
                # only one element in outlier
                punc_obj["punctuation_density_outliers"] = [{
                    "n": weight_outlier,
                    "count": sum(outlier_array)
                }]
                feature["most_common_punctuations"].append(punc_obj)
        # step 3: sort
        # NOTE(review): kept inside the punctuation-exists guard — outside it
        # this would KeyError on punctuation-free columns; confirm intent.
        feature["most_common_punctuations"] = sorted(
            feature["most_common_punctuations"],
            key=lambda k: k['count'],
            reverse=True)