def test_simplify(self):
    """TypeDetector.simplify must fold detailed per-type counts into simple-type totals."""
    detailed_counts = {
        TypeDetector.GEO_GPS: 1,
        TypeDetector.GEO_ZIP: 2,
        TypeDetector.GEO_BOROUGH: 3,
        TypeDetector.TEMPORAL_DATE: 4,
        TypeDetector.TEMPORAL_TIME: 5,
        TypeDetector.NUMERIC_INT: 6,
        TypeDetector.NUMERIC_DOUBLE: 7,
        TypeDetector.TEXTUAL: 8,
        TypeDetector.NULL: 0,
    }
    # Expected aggregation: GEO = 1+2+3, TEMPORAL = 4+5, NUMERIC = 6+7;
    # TEXTUAL and NULL carry over unchanged.
    expected = {
        TypeDetector.GEO: 6,
        TypeDetector.TEMPORAL: 9,
        TypeDetector.NUMERIC: 13,
        TypeDetector.TEXTUAL: 8,
        TypeDetector.NULL: 0,
    }
    self.assertEqual(expected, TypeDetector.simplify(detailed_counts))
def do_profile(self, database, file_name, skip_rows, n_rows, metadata): printv = self.printv self.db_name = metadata['db_name'] self.last_sumary = None self.column_metadata = pd.DataFrame( columns=['Database-id', 'Column-name', 'Group', 'Key', 'Value']) readable_time_start = TimeUtils.current_time_formated() time_start = time.time() total_rows = len(database.index) total_missing_values = 0 text_cols = [] text_cols_names = [] num_cols = [] num_cols_names = [] geo_cols = [] geo_cols_names = [] temp_cols = [] temp_cols_names = [] null_cols = [] null_cols_names = [] self.gps_cols = [] self.zip_cols = [] self.types_summary = pd.DataFrame(columns=[ 'database-id', 'column-name', 'socrata-type', 'profiler-type', 'profiler-most-detected' ]) str_cols_types_percent = '' str_cols_types_percent_simple = '' printv('\nMetadata Types (Socrata Only): ') metadata_types = {} if metadata is not None and MetadataConstants.TYPES in metadata.keys(): metadata_types = metadata[MetadataConstants.TYPES] for col in metadata_types.keys(): printv(' ' + col + ': ' + metadata_types[col]) SocrataUtils.prepare_location_columns(database, metadata_types) col_names = database.dtypes.keys() printv('\nProfiling {0} columns.'.format(col_names.size)) # print 'col_names=', col_names for i in range(0, col_names.size): # reset col description unique = counts = vmin = vmax = std = mean = length_min = length_max = length_std = length_mean = missing = None col = database[col_names[i]] col_name = col_names[i] App.debug('Profiling column: ', col_name) App.debug('Pandas DType: ', col.dtype) if col_name in metadata_types: metadata_type = metadata_types[col_name] else: metadata_type = '' App.debug('Metadata type: ', metadata_type) unique_data = pd.DataFrame() temp = col.dropna() temp = temp.unique( ) # <= gives this error probably is a pandas error: TypeError: unhashable type: 'dict' unique_data[col_name] = temp unique_data = unique_data[col_name] col_types_percent_detailed = TypeDetector.types_of(unique_data) 
App.debug('[var] col_types_percent_detailed: ', col_types_percent_detailed) processed_col_name = col_name.replace("'", "`") str_cols_types_percent += "'{0}': {1}\n".format( processed_col_name, col_types_percent_detailed) col_types_percent = TypeDetector.simplify( col_types_percent_detailed) printv('{0}: {1}'.format(col_name, col_types_percent)) str_cols_types_percent_simple += "'{0}': {1}\n".format( processed_col_name, col_types_percent) data_type = TypeDetector.simple_type_of_considering_all( col_types_percent_detailed, metadata_type, col_name) most_detected, precision = TypeDetector.most_detected( col_types_percent_detailed) valid_col_values = TypeDetector.valid_values_of_type( most_detected, col) ########## Geographic ########## if data_type is TypeDetector.GEO: printv( "Processing Column {0}: {1} - Geo".format( i + 1, col_names[i]), str(i + 1)) # printv(' Processing Count', '.') count = col.count() missing = total_rows - count total_missing_values += missing # printv(' Processing Value Counts', '.') value_counts = valid_col_values.value_counts() top = value_counts.first_valid_index() freq = 0 if top is not None: freq = str(value_counts[top]) # improve readability if is zip code if type(top) is not str and isinstance(top, int): top = '%i' % top unique = len(value_counts) geo_cols_names.append(col_name) geo_cols.append({ 'Count': count, 'Missing': missing, 'Unique Values': unique, 'Most Frequent': top, 'Top Frequency': freq, 'Min': col.min(), 'Max': col.max(), 'Types Percent': str(col_types_percent_detailed) }) if most_detected == TypeDetector.GEO_GPS: self.gps_cols.append(col_name) # Save column metadata # Only makes sense to save these numeric descriptors if is a lat, lon if most_detected == TypeDetector.GEO_GPS_LATLON: self.add_column_metadata(col_name, 'Type Details', 'min', vmin) self.add_column_metadata(col_name, 'Type Details', 'max', vmax) self.add_column_metadata(col_name, 'Type Details', 'std', std) self.add_column_metadata(col_name, 'Type Details', 
'mean', mean) ########## Numeric ########## elif data_type is TypeDetector.NUMERIC: printv( "Processing Column {0}: {1} - Numeric".format( i + 1, col_names[i]), str(i + 1)) # printv(' Get valid Numeric gps_data', '.') col_data_numeric = TypeDetector.get_numeric_data(col) printv(' Processing Count', '.') count = col_data_numeric.count() missing = total_rows - count total_missing_values += missing if count > 0: # printv(' Processing Mean', '.') mean = np.mean(col_data_numeric) # printv(' Processing Std', '.') std = np.std(col_data_numeric) # printv(' Processing Min', '.') vmin = np.min(col_data_numeric) # printv(' Processing Max', '.') vmax = np.max(col_data_numeric) # printv(' Processing Unique', '.') unique = col_data_numeric.nunique() value_counts = col_data_numeric.value_counts() top = value_counts.keys()[0] freq = value_counts[top] # Histogram Data, default bins=10 num_bins = min([Profiler.MAX_BINS, unique]) hist_counts, bins = np.histogram(col_data_numeric, bins=num_bins) hist = pd.Series(hist_counts, index=bins[:-1]) hist_json = hist.to_json() else: printv(' All NaN values') hist_json = mean = std = vmin = vmax = unique = freq = None num_cols_names.append(col_name) num_cols.append({ 'Count': count, 'Missing': missing, 'Mean': mean, 'Std': std, 'Min': vmin, 'Max': vmax, 'Unique Values': unique, 'Most Frequent': top, 'Top Frequency': freq, 'Types Percent': str(col_types_percent_detailed), 'Histogram Data JSON': hist_json, }) # Save column metadata self.add_column_metadata(col_name, 'Type Details', 'min', vmin) self.add_column_metadata(col_name, 'Type Details', 'max', vmax) self.add_column_metadata(col_name, 'Type Details', 'std', std) self.add_column_metadata(col_name, 'Type Details', 'mean', mean) self.add_column_metadata(col_name, 'Type Details', 'Histogram Data JSON', hist_json) ########## Temporal ########## elif data_type is TypeDetector.TEMPORAL: # data_type is PandasUtils.Temporal: printv( "Processing Column {0}: {1} - Temporal".format( i + 1, 
col_names[i]), str(i + 1)) # printv(' Processing Info', '.') info = col.astype(str).describe() temp_cols_names.append(col_name) printv(' Processing Count', '.') count = info['count'] missing = total_rows - count total_missing_values += missing # printv(' Processing Lenght', '.') lenghts = col.str.len() len_min = lenghts.min() len_max = lenghts.max() len_mean = lenghts.mean() len_std = lenghts.std() top = None if 'top' in info.keys(): top = info['top'].__repr__().strip() freq = None if 'freq' in info.keys(): freq = info['freq'] # printv(' Processing Min', '.') vmin = col.min() # printv(' Processing Max', '.') vmax = col.max() unique = info['unique'] value_counts = valid_col_values.value_counts() temp_cols.append({ 'Count': count, 'Missing': missing, 'Unique Values': unique, 'Most Frequent': top, 'Top Frequency': freq, 'Min': vmin, 'Max': vmax, # 'Lenght Min': '{0:.0f}'.format(len_min), # 'Lenght Max': '{0:.0f}'.format(len_max), # 'Lenght Mean': '{0:.2f}'.format(len_mean), # 'Lenght Std': '{0:.2f}'.format(len_std), 'Types Percent': str(col_types_percent) }) # Save column metadata self.add_column_metadata(col_name, 'Type Details', 'min', vmin) self.add_column_metadata(col_name, 'Type Details', 'max', vmax) self.add_column_metadata(col_name, 'Type Details', 'std', std) self.add_column_metadata(col_name, 'Type Details', 'mean', mean) # ########## Textual ########## elif data_type is TypeDetector.TEXTUAL: printv( "Processing Column {0}: {1} - Text".format( i + 1, col_names[i]), str(i + 1)) #printv(' Processing Info', '.') info = col.astype(str).describe() text_cols_names.append(col_name) #printv(' Processing Count', '.') count = info['count'] missing = total_rows - count total_missing_values += missing #printv(' Processing Lenght', '.') lenghts = col.astype(str).str.len() length_min = lenghts.min() length_max = lenghts.max() length_mean = lenghts.mean() length_std = lenghts.std() App.debug('Counting words...') word_counts = col.astype(str).apply(lambda x: len(x.split()) 
if x is not None else 0) word_count_min = word_counts.min() word_count_max = word_counts.max() word_count_std = word_counts.std() word_count_mean = word_counts.mean() top = None if 'top' in info.keys(): top = info['top'].__repr__().strip() freq = None if 'freq' in info.keys(): freq = info['freq'] unique = info['unique'] text_cols.append({ 'Count': count, 'Missing': missing, 'Unique Values': unique, 'Most Frequent': top, 'Top Frequency': freq, 'Lenght Min': '{0:.0f}'.format(length_min), 'Lenght Max': '{0:.0f}'.format(length_max), 'Lenght Mean': '{0:.2f}'.format(length_mean), 'Lenght Std': '{0:.2f}'.format(length_std), 'Word Count Min': word_count_min, 'Word Count Max': word_count_max, 'Word Count Std': word_count_std, 'Word Count Mean': word_count_mean, 'Types Percent': str(col_types_percent_detailed) }) # Save Column Metadata self.add_column_metadata(col_name, 'Type Details', 'length-min', length_min) self.add_column_metadata(col_name, 'Type Details', 'length-max', length_max) self.add_column_metadata(col_name, 'Type Details', 'length-std', length_std) self.add_column_metadata(col_name, 'Type Details', 'length-mean', length_mean) self.add_column_metadata(col_name, 'Type Details', 'words-min', word_count_min) self.add_column_metadata(col_name, 'Type Details', 'words-max', word_count_max) self.add_column_metadata(col_name, 'Type Details', 'words-mean', word_count_mean) self.add_column_metadata(col_name, 'Type Details', 'words-std', word_count_std) value_counts = valid_col_values.value_counts() else: # data_type is TypeDetector.NULL: printv( "Processing Column {0}: {1} - {2}".format( i + 1, col_names[i], data_type), str(i + 1)) # printv(' Processing Info', '.') info = col.astype(str).describe() null_cols_names.append(col_name) # printv(' Processing Count', '.') count = info['count'] missing = len(col) - col.count() total_missing_values += missing # printv(' Processing Lenght', '.') lenghts = col.astype(str).apply(lambda x: len(x)) length_min = lenghts.min() length_max 
= lenghts.max() length_mean = lenghts.mean() length_std = lenghts.std() top = None if 'top' in info.keys(): top = info['top'].__repr__().strip() freq = None if 'freq' in info.keys(): freq = info['freq'] unique = info['unique'] null_cols.append({ 'Count': count, 'Missing': missing, 'Unique Values': unique, 'Most Frequent': top, 'Top Frequency': freq, 'Lenght Min': '{0:.0f}'.format(length_min), 'Lenght Max': '{0:.0f}'.format(length_max), 'Lenght Mean': '{0:.2f}'.format(length_mean), 'Lenght Std': '{0:.2f}'.format(length_std), 'Types Percent': str(col_types_percent_detailed) }) # Save Column Metadata self.add_column_metadata(col_name, 'Type Details', 'length-min', length_min) self.add_column_metadata(col_name, 'Type Details', 'length-max', length_max) self.add_column_metadata(col_name, 'Type Details', 'length-std', length_std) self.add_column_metadata(col_name, 'Type Details', 'length-mean', length_mean) # print 'valid_col_values[:10]=', valid_col_values[:10] # if len(valid_col_values) > 0: value_counts = valid_col_values.value_counts() # else: # value_counts = {} # #Add column info column_data = { 'database-id': self.db_name, 'column-name': col_name, 'socrata-type': metadata_type, 'profiler-type': data_type, 'profiler-most-detected_%': precision, 'profiler-most-detected': most_detected, 'unique': unique, 'missing': missing, 'values': count, } # General self.add_column_metadata(col_name, 'General', 'top-value', top) self.add_column_metadata(col_name, 'General', 'top-freq', freq) # self.add_column_metadata(col_name, 'General', 'profiler-most-detected_%', precision) # self.add_column_metadata(col_name, 'General', 'profiler-most-detected', most_detected) # Add column index to column metadata if ProfilerUtils.COLUMN_INDEXES in metadata and col_name in metadata[ ProfilerUtils.COLUMN_INDEXES]: self.add_column_metadata( col_name, 'General', 'index', metadata[ProfilerUtils.COLUMN_INDEXES][col_name]) if value_counts is not None: top_k = {} unique_values = len(value_counts) 
limit = min(Profiler.MAX_TOP_K, unique_values) if self.part: limit = unique_values for k in value_counts.keys()[:limit]: # top_k[str(k)] = value_counts[k] top_k[str(k)] = value_counts[k] # TODO: Fix a bug here self.add_column_metadata(col_name, 'Type Details', 'top-k', top_k) # Simple type info for k in col_types_percent.keys(): self.add_column_metadata(col_name, 'Simple Type', k, col_types_percent[k]) # Complete type info for k in col_types_percent_detailed.keys(): self.add_column_metadata(col_name, 'Detailed Type', k, col_types_percent_detailed[k]) self.types_summary = self.types_summary.append(column_data, ignore_index=True) # ============================================================================== # ============================ SUMMARYZE gps_data ================================== # ============================================================================== # Sumary DataFrames self.numeric_DataFrame = pd.DataFrame(num_cols, index=num_cols_names) self.geo_DataFrame = pd.DataFrame(geo_cols, index=geo_cols_names) self.textual_DataFrame = pd.DataFrame(text_cols, index=text_cols_names) self.null_DataFrame = pd.DataFrame(null_cols, index=null_cols_names) self.temporal_DataFrame = pd.DataFrame(temp_cols, index=temp_cols_names) time_end = time.time() readable_time_end = TimeUtils.current_time_formated() # Database Sumary total_values = total_rows * col_names.size missing_percent = (total_missing_values * 100.0) / (total_values) missing_percent = '{0:.2f}'.format(missing_percent) processing_time = '{0:.2f}'.format(time_end - time_start) printv('\n\n=============== DATABASE SUMARY ===============') printv('File: {0}'.format(self.db_name)) printv('Rows: {0:n}'.format(total_rows)) printv('Columns: {0:n}'.format(col_names.size)) printv(' - Geo: {0:n}'.format(len(geo_cols_names))) printv(' - Temporal: {0:n}'.format(len(temp_cols_names))) printv(' - Numeric: {0:n}'.format(len(num_cols_names))) printv(' - Textual: {0:n}'.format(len(text_cols_names))) printv('Values') 
printv(' - Total: {0:n} (Rows x Columns)'.format(total_values)) printv(' - Missing: {0:n} ({1}%)'.format(total_missing_values, missing_percent)) printv('Processing time: {0} sec'.format(processing_time)) used_memory = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000 null_cols = col_names.size - len(num_cols_names) - len( geo_cols_names) - len(temp_cols_names) - len(text_cols_names) self.last_sumary = pd.DataFrame( [{ 'Name': self.db_name, 'Rows': total_rows, 'Columns': col_names.size, 'Columns Numeric': len(num_cols_names), 'Columns Temporal': len(temp_cols_names), 'Columns Geo': len(geo_cols_names), 'Columns Text': len(text_cols_names), 'Columns Null': len(null_cols_names), 'Column Names Numeric': str(num_cols_names), 'Column Names Geo': str(geo_cols_names), 'Column Names Text': str(text_cols_names), 'Column Names Temporal': str(temp_cols_names), 'Values': total_values, 'Values Missing': total_missing_values, 'Values Missing Percent': missing_percent, 'ETL-Profiler Processing Time (sec)': processing_time, 'ETL-Profiler Time Begin': readable_time_start, 'ETL-Profiler Time End': readable_time_end, 'ETL-Profiler Input File': file_name, 'ETL-Profiler Input File Size (KB)': metadata['file_size'], 'ETL-Profiler Total Memory (MB)': used_memory, Profiler.STATUS: Profiler.STATUS_SUCCESS }], columns=Profiler.SUMARY_COLUMNS, index=[self.db_name]) if self.ignore_metadata: printv('=============== Metadata Ignored ===============') elif metadata is not None and MetadataUtils.has_success(metadata): try: printv('=============== PROVIDED METADATA ===============') for key in metadata.keys(): if key == MetadataConstants.TYPES: continue #Ignore Provided Types in dataset metadata value = metadata[key] App.debug(key, '=', value, ' - type:', type(value)) self.last_sumary[key] = value.__str__() self.last_sumary[MetadataConstants. 
PRIMARY] = MetadataUtils.is_primary(metadata) printv('') except (UnicodeEncodeError) as ex: #do nothing printv('UnicodeEncodeError with socrata metadata.') if self.stop_on_error: raise ex # ============================================================================== # ============================ Process Geo gps_data ==================================== # ============================================================================== if self.ignore_index: printv('Ignoring geo-temp index') else: printv('Processing Geographic gps_data') self.generate_index(database) # ============================================================================== # ============================ SHOW RESULTS ==================================== # ============================================================================== if self.show_details: if self.show_all_columns: numeric_info_to_show = self.numeric_DataFrame.columns.tolist() text_info_to_show = self.textual_DataFrame.columns.tolist() if 'Value Counts' in text_info_to_show: text_info_to_show.remove('Value Counts') geo_info_to_show = self.geo_DataFrame.columns.tolist() if 'Value Counts' in geo_info_to_show: geo_info_to_show.remove('Value Counts') temporal_info_to_show = self.temporal_DataFrame.columns.tolist( ) if 'Value Counts' in temporal_info_to_show: temporal_info_to_show.remove('Value Counts') else: numeric_info_to_show = Profiler.NUMERIC_INFO_SMALL text_info_to_show = Profiler.TEXT_INFO_SMALL geo_info_to_show = Profiler.GEO_INFO_SMALL temporal_info_to_show = Profiler.TEMPORAL_INFO_SMALL null_info_to_show = Profiler.TEMPORAL_INFO_SMALL if len(geo_cols) > 0: print '\n=============== Geo gps_data Sumary:' # print self.geo_DataFrame[geo_info_to_show] # for count, row in self.geo_DataFrame[geo_info_to_show].iterrows(): # print '>>>>>>>>>>>>>>>>>>>>', row print self.geo_DataFrame[geo_info_to_show] if len(num_cols) > 0: print '=============== Numeric Sumary:' print self.numeric_DataFrame[numeric_info_to_show] if len(temp_cols) > 
0: print '\n=============== Temporal Sumary:' print self.temporal_DataFrame[temporal_info_to_show] if len(text_cols) > 0: print '\n=============== Textual Sumary:' print self.textual_DataFrame[text_info_to_show] if len(null_cols_names) > 0: print '\n=============== Null Sumary:' print self.null_DataFrame[null_info_to_show] printv( '\n========================================================= Types Information:' ) printv(' --- Complete ---') printv(str_cols_types_percent.rstrip('\n')) printv( '\n ------------------- Types Summary ------------------- ' ) printv(self.types_summary) printv( '===============================================================' ) printv(self.last_sumary.T) # printv('\n ------------------- Column Metadata ------------------- ') # printv( '===============================================================') # printv (self.column_metadata) printv( '===============================================================') # ============================================================================== # ================================ SAVE FILES ================================== # ============================================================================== if self.save_output_files: print 'Generated Files:' # self.save_dataframe_to_csv(self.numeric_DataFrame, '_profiled_numeric') # self.save_dataframe_to_csv(self.geo_DataFrame, '_profiled_geo') # self.save_dataframe_to_csv(self.textual_DataFrame, '_profiled_textual') # self.save_last_sumary_to_json() # filename = self.to_folder + self.db_name + '_types.json' # self.types_summary.to_csv() if self.last_zip_rows is None: print ' No Zip file to save.' else: filename = self.to_folder + self.db_name + '_zip.csv' self.last_zip_rows.to_csv(filename) print ' ' + filename if self.last_gps_rows is None: print ' No GPS file to save.' else: