def data_summary(table_schema, _table, fname, sample_size=1.0, feature_colname='column',
                 dtype_colname='type', output_root='', keep_images=False, n_jobs=1):
    """Summarize basic information of all columns in a data table based on the provided data schema

    Parameters
    ----------
    table_schema: pandas DataFrame
        schema of the table, should contain data types of each column
    _table: pandas DataFrame
        the data table
    fname: string
        the output file name
    sample_size: integer or float(<=1.0), default=1.0
        int: number of sample rows to do the summary (useful for large tables)
        float: sample size in percentage
    feature_colname: string
        name of the column for feature
    dtype_colname: string
        name of the column for data type
    output_root: string
        the root directory for the output file
    keep_images: boolean
        whether to keep all generated images
    n_jobs: int
        the number of jobs to run in parallel
    """
    # work on a copy so the caller's table is never mutated
    table = _table.copy()

    # translate a fractional sample_size into an absolute row count
    if sample_size <= 1.0:
        sample_size = int(table.shape[0] * sample_size)
    if sample_size < table.shape[0]:
        table = table.sample(sample_size).reset_index(drop=True)

    # classify features based on the data type declared in the schema
    key_features = table_schema[table_schema[dtype_colname] == 'key'][feature_colname].values
    numeric_features = table_schema[table_schema[dtype_colname] == 'numeric'][feature_colname].values
    string_features = table_schema[table_schema[dtype_colname] == 'str'][feature_colname].values
    date_features = table_schema[table_schema[dtype_colname] == 'date'][feature_colname].values

    # features whose declared type is not one of the recognized types
    type_correct_features = (list(key_features) + list(numeric_features) +
                             list(string_features) + list(date_features))
    type_error_features = list(set(table_schema[feature_colname].values) - set(type_correct_features))

    # features declared in the schema but missing from the table
    null_features = []

    # temp dir to store all the images generated
    img_dir = 'img_temp'
    if os.path.isdir(img_dir):
        shutil.rmtree(img_dir)
    os.mkdir(img_dir)

    # create a new workbook to store everything
    wb = openpyxl.Workbook()

    # for key features
    # BUG FIX: collect the missing features BEFORE filtering the list down to
    # in-table features; the original filtered first, so the "missing"
    # comprehension could never find anything
    null_features += [feat for feat in key_features if feat not in table.columns.values]
    key_features = [feat for feat in key_features if feat in table.columns.values]
    if len(key_features) > 0:
        # get the check result
        _n_jobs = np.min([n_jobs, len(key_features)])
        key_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_string)(col, table[[col]]) for col in key_features)
        ws = wb.create_sheet(title='key')
        # write the final result to work sheet
        _insert_string_results(key_results, ws, 25)

    # for numeric features (same missing-before-filter fix as above)
    null_features += [feat for feat in numeric_features if feat not in table.columns.values]
    numeric_features = [feat for feat in numeric_features if feat in table.columns.values]
    if len(numeric_features) > 0:
        # get the check result
        _n_jobs = np.min([n_jobs, len(numeric_features)])
        numeric_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_numeric)(col, table[[col]], img_dir) for col in numeric_features)
        ws = wb.create_sheet(title='numeric')
        # write the final result to work sheet
        _insert_numeric_results(numeric_results, ws, 35, img_dir)

    # for string features (same missing-before-filter fix as above)
    null_features += [feat for feat in string_features if feat not in table.columns.values]
    string_features = [feat for feat in string_features if feat in table.columns.values]
    if len(string_features) > 0:
        _n_jobs = np.min([n_jobs, len(string_features)])
        string_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_string)(col, table[[col]]) for col in string_features)
        ws = wb.create_sheet(title='string')
        # write the final result to work sheet
        _insert_string_results(string_results, ws, 25)

    # for date features (same missing-before-filter fix as above)
    null_features += [feat for feat in date_features if feat not in table.columns.values]
    date_features = [feat for feat in date_features if feat in table.columns.values]
    if len(date_features) > 0:
        # convert each date column into "months before now" so it can be
        # summarized like a numeric column
        snapshot_date_now = str(datetime.datetime.now().date())
        for col in date_features:
            table['%s_numeric' % (col)] = (
                pd.to_datetime(snapshot_date_now) -
                pd.to_datetime(table[col], errors='coerce')).astype('timedelta64[M]', errors='ignore')
        _n_jobs = np.min([n_jobs, len(date_features)])
        date_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_date)('%s_numeric' % (col), table[['%s_numeric' % (col), col]], img_dir)
            for col in date_features)
        ws = wb.create_sheet(title='date')
        # write the final result to work sheet
        _insert_numeric_results(date_results, ws, 35, img_dir, date_flag=True)

    # write schema into the workbook's default sheet
    ws = wb['Sheet']
    ws.title = 'schema'
    # copy() so assigning the 'check' column cannot touch the caller's schema
    out_schema = table_schema[[feature_colname, dtype_colname]].copy()
    out_schema['check'] = 'Ok'

    error_indices = []
    if len(type_error_features) > 0:
        # BUG FIX: mark only the offending rows via .loc; the original apply()
        # replaced every non-error 'Ok' cell with the feature name itself
        error_mask = out_schema[feature_colname].isin(type_error_features)
        out_schema.loc[error_mask, 'check'] = 'type error'
        error_indices += list(out_schema[error_mask].index.values)
    if len(null_features) > 0:
        # BUG FIX: 'not exits' -> 'not exists'; .loc also preserves the
        # 'type error' marks written by the previous step
        null_mask = out_schema[feature_colname].isin(null_features)
        out_schema.loc[null_mask, 'check'] = 'not exists'
        error_indices += list(out_schema[null_mask].index.values)

    _ = _insert_df(out_schema, ws, header=True)
    # highlight every row that failed the schema check (+2: 1-based rows plus header)
    for idx in error_indices:
        ws['C%d' % (idx + 2)].style = 'Bad'
    _adjust_ws(ws=ws, row_height=25)

    wb.save(filename=os.path.join(output_root, 'data_summary_%s.xlsx' % (fname)))

    # remove all temp images
    if not keep_images:
        shutil.rmtree(img_dir)
def data_consist(table1, table2, key1, key2, schema1, schema2, fname, sample_size=1.0,
                 output_root='', keep_images=False, n_jobs=1):
    """Check consistency between two tables

    Parameters
    ----------
    table1: pandas DataFrame
        one of the two tables to compare
    table2: pandas DataFrame
        one of the two tables to compare
    key1: string
        key for table1
    key2: string
        key for table2
    schema1: pandas DataFrame
        data schema (contains column names and corresponding data types) for _table1
    schema2: pandas DataFrame
        data schema (contains column names and corresponding data types) for _table2
    fname: string
        the output file name
    sample_size: integer or float(<=1.0), default=1.0
        int: number of sample rows to do the comparison (useful for large tables)
        float: sample size in percentage
    output_root: string, default=''
        the root directory for the output file
    keep_images: boolean, default=False
        whether to keep all generated images
    n_jobs: int, default=1
        the number of jobs to run in parallel
    """
    # check whether keys are valid
    if key1 not in table1.columns.values:
        raise ValueError('key1: does not exist in table1')
    if key2 not in table2.columns.values:
        raise ValueError('key2: does not exist in table2')

    # check whether two tables are unique in key level
    if table1[key1].nunique() != table1.shape[0]:
        raise ValueError('table1: should be unique in %s level' % (key1))
    if table2[key2].nunique() != table2.shape[0]:
        raise ValueError('table2: should be unique in %s level' % (key2))

    # check sample_size
    if sample_size > 1:
        if int(sample_size) != sample_size:
            raise ValueError('sample_size: only accept integer when it is > 1.0')
        if (sample_size > table1.shape[0]) or (sample_size > table2.shape[0]):
            # BUG FIX: the message claimed "smaller" while the condition tests "larger"
            print('sample_size: %d is larger than %d or %d...'
                  % (sample_size, table1.shape[0], table2.shape[0]))

    # check output_root
    if output_root != '':
        if not os.path.isdir(output_root):
            raise ValueError('output_root: root not exists')

    # create a new workbook to store everything
    wb = openpyxl.Workbook()

    # prepare directory for generated images
    img_dir = 'img_temp'
    if os.path.isdir(img_dir):
        shutil.rmtree(img_dir)
    os.mkdir(img_dir)

    # sampling is only meaningful over the keys present in both tables
    both_keys = list(set(table1[key1].values).intersection(set(table2[key2].values)))
    if sample_size <= 1.0:
        sample_size = np.min([int(table1.shape[0] * sample_size),
                              int(table2.shape[0] * sample_size), len(both_keys)])
    else:
        # BUG FIX: an integer sample_size was validated but then silently
        # ignored (sampling only ran in the <=1.0 branch); cap it at the
        # number of shared keys so np.random.choice cannot raise
        sample_size = int(np.min([sample_size, len(both_keys)]))
    sample_keys = np.random.choice(both_keys, sample_size, replace=False)
    # the filter + reset_index always produces fresh frames, so the date
    # conversion below never mutates the caller's tables
    table1 = table1[table1[key1].isin(sample_keys)].reset_index(drop=True)
    table2 = table2[table2[key2].isin(sample_keys)].reset_index(drop=True)

    schema, check_features = _check_features(schema1, schema2)

    corr_results = []

    # key features
    key_features = check_features['key']
    if len(key_features) > 0:
        _n_jobs = np.min([n_jobs, len(key_features)])
        key_results = Parallel(n_jobs=_n_jobs)(
            delayed(_compare_key)(col, table1[[col]], table2[[col]], img_dir)
            for col in key_features)
        for key_result in key_results:
            if 'corr' in key_result.keys():
                corr_results.append(key_result['corr'])
        # write all results to worksheet
        ws = wb.create_sheet(title=u'key')
        _insert_numeric_results(key_results, ws, 40, img_dir)

    # numeric features
    numeric_features = check_features['numeric']
    if len(numeric_features) > 0:
        _n_jobs = np.min([n_jobs, len(numeric_features)])
        numeric_results = Parallel(n_jobs=_n_jobs)(
            delayed(_consist_numeric)(col, table1[[key1, col]], table2[[key2, col]],
                                      key1, key2, img_dir)
            for col in numeric_features)
        for numeric_result in numeric_results:
            if 'corr' in numeric_result.keys():
                corr_results.append(numeric_result['corr'])
        # write all results to worksheet
        ws = wb.create_sheet(title=u'numeric')
        _insert_numeric_results(numeric_results, ws, 45, img_dir)

    # string features
    string_features = check_features['str']
    if len(string_features) > 0:
        _n_jobs = np.min([n_jobs, len(string_features)])
        string_results = Parallel(n_jobs=_n_jobs)(
            delayed(_consist_string)(col, table1[[key1, col]], table2[[key2, col]], key1, key2)
            for col in string_features)
        for string_result in string_results:
            if 'corr' in string_result.keys():
                corr_results.append(string_result['corr'])
        # write all results to worksheet
        ws = wb.create_sheet(title=u'string')
        _insert_string_results(string_results, ws, 25)

    # date features
    date_features = check_features['date']
    if len(date_features) > 0:
        # convert each date column into "months before now" so it can be
        # compared like a numeric column
        snapshot_date_now = str(datetime.datetime.now().date())
        for col in date_features:
            table1[col] = (pd.to_datetime(snapshot_date_now) - pd.to_datetime(
                table1[col], errors='coerce')).astype('timedelta64[M]', errors='ignore')
            table2[col] = (pd.to_datetime(snapshot_date_now) - pd.to_datetime(
                table2[col], errors='coerce')).astype('timedelta64[M]', errors='ignore')
        _n_jobs = np.min([n_jobs, len(date_features)])
        date_results = Parallel(n_jobs=_n_jobs)(
            delayed(_consist_numeric)(col, table1[[key1, col]], table2[[key2, col]],
                                      key1, key2, img_dir, date_flag=True)
            for col in date_features)
        for date_result in date_results:
            if 'corr' in date_result.keys():
                corr_results.append(date_result['corr'])
        # write all results to worksheet
        ws = wb.create_sheet(title=u'date')
        _insert_numeric_results(date_results, ws, 45, img_dir, date_flag=True)

    # insert the summary
    _insert_summary(wb, schema, corr_results)

    wb.save(filename=os.path.join(output_root, 'data_consist_%s.xlsx' % (fname)))
    if not keep_images:
        shutil.rmtree(img_dir)
def data_compare(_table1, _table2, _schema1, _schema2, fname, sample_size=1.0,
                 feature_colname1='column', feature_colname2='column',
                 dtype_colname1='type', dtype_colname2='type',
                 output_root='', keep_images=False, n_jobs=1):
    """Compare columns between two tables

    Parameters
    ----------
    _table1: pandas DataFrame
        one of the two tables to compare
    _table2: pandas DataFrame
        one of the two tables to compare
    _schema1: pandas DataFrame
        data schema (contains column names and corresponding data types) for _table1
    _schema2: pandas DataFrame
        data schema (contains column names and corresponding data types) for _table2
    fname: string
        the output file name
    sample_size: integer or float(<=1.0), default=1.0
        int: number of sample rows to do the comparison (useful for large tables)
        float: sample size in percentage
    feature_colname1: string, default='column'
        name of the column for feature of _table1
    feature_colname2: string, default='column'
        name of the column for feature of _table2
    dtype_colname1: string, default='type'
        name of the column for data type of _table1
    dtype_colname2: string, default='type'
        name of the column for data type of _table2
    output_root: string, default=''
        the root directory for the output file
    keep_images: boolean, default=False
        whether to keep all generated images
    n_jobs: int, default=1
        the number of jobs to run in parallel
    """
    # CONSISTENCY FIX: validate inputs the same way the sibling functions
    # (data_summary / data_consist) do, instead of failing later
    if sample_size > 1:
        if int(sample_size) != sample_size:
            raise ValueError('sample_size: only accept integer when it is > 1.0')
    if output_root != '':
        if not os.path.isdir(output_root):
            raise ValueError('output_root: root not exists')

    # create a new workbook to store everything
    wb = openpyxl.Workbook()

    # prepare directory for generated images
    img_dir = 'img_temp'
    if os.path.isdir(img_dir):
        shutil.rmtree(img_dir)
    os.mkdir(img_dir)

    # work on copies so the callers' tables are never mutated
    table1 = _table1.copy()
    table2 = _table2.copy()

    # translate a fractional sample_size into an absolute row count
    if sample_size <= 1.0:
        sample_size1 = int(table1.shape[0] * sample_size)
        sample_size2 = int(table2.shape[0] * sample_size)
        sample_size = np.min([sample_size1, sample_size2])

    # normalize both schemas to a shared (column, type) layout
    schema1 = _schema1.copy()[[feature_colname1, dtype_colname1]].rename(columns={
        feature_colname1: 'column_1',
        dtype_colname1: 'type_1'
    })
    schema2 = _schema2.copy()[[feature_colname2, dtype_colname2]].rename(columns={
        feature_colname2: 'column_2',
        dtype_colname2: 'type_2'
    })

    # merge two schemas; outer join keeps columns present in only one table
    schema = schema1.merge(schema2, left_on='column_1', right_on='column_2', how='outer')

    # if data types are different in schema1 and schema2, move to error
    # (NaN != NaN also routes one-sided columns here)
    schema_error = schema[schema['type_1'] != schema['type_2']].reset_index(drop=True)
    schema_error['error'] = "inconsistent data types"
    schema_error.loc[schema_error['column_1'].isnull(), 'error'] = "column not in table1"
    schema_error.loc[schema_error['column_2'].isnull(), 'error'] = "column not in table2"
    schema_correct = schema[schema['type_1'] == schema['type_2']].reset_index(drop=True)

    # classify the features to compare
    key_features = schema_correct[schema_correct['type_1'] == 'key']['column_1'].values
    numeric_features = schema_correct[schema_correct['type_1'] == 'numeric']['column_1'].values
    string_features = schema_correct[schema_correct['type_1'] == 'str']['column_1'].values
    date_features = schema_correct[schema_correct['type_1'] == 'date']['column_1'].values

    corr_results = []

    # for key features: compared on the FULL tables (before sampling),
    # only check features present in both tables
    key_features = [feat for feat in key_features
                    if (feat in table1.columns.values) and (feat in table2.columns.values)]
    if len(key_features) > 0:
        _n_jobs = np.min([n_jobs, len(key_features)])
        key_results = Parallel(n_jobs=_n_jobs)(
            delayed(_compare_key)(col, table1[[col]], table2[[col]], img_dir)
            for col in key_features)
        for key_result in key_results:
            if 'corr' in key_result.keys():
                corr_results.append(key_result['corr'])
        # write all results to worksheet
        ws = wb.create_sheet(title='key')
        _insert_numeric_results(key_results, ws, 40, img_dir)

    # do sampling here (after the key comparison)
    if sample_size < table1.shape[0]:
        table1 = table1.sample(sample_size).reset_index(drop=True)
    if sample_size < table2.shape[0]:
        table2 = table2.sample(sample_size).reset_index(drop=True)

    # for numeric features: only check features present in both tables
    numeric_features = [feat for feat in numeric_features
                        if (feat in table1.columns.values) and (feat in table2.columns.values)]
    if len(numeric_features) > 0:
        _n_jobs = np.min([n_jobs, len(numeric_features)])
        numeric_results = Parallel(n_jobs=_n_jobs)(
            delayed(_compare_numeric)(col, table1[[col]], table2[[col]], img_dir)
            for col in numeric_features)
        for numeric_result in numeric_results:
            if 'corr' in numeric_result.keys():
                corr_results.append(numeric_result['corr'])
        # write all results to worksheet
        ws = wb.create_sheet(title='numeric')
        _insert_numeric_results(numeric_results, ws, 40, img_dir)

    # for string features: only check features present in both tables
    string_features = [feat for feat in string_features
                       if (feat in table1.columns.values) and (feat in table2.columns.values)]
    if len(string_features) > 0:
        _n_jobs = np.min([n_jobs, len(string_features)])
        string_results = Parallel(n_jobs=_n_jobs)(
            delayed(_compare_string)(col, table1[[col]], table2[[col]])
            for col in string_features)
        for string_result in string_results:
            if 'corr' in string_result.keys():
                corr_results.append(string_result['corr'])
        # write all results to worksheet
        ws = wb.create_sheet(title='string')
        _insert_compare_string_results(string_results, ws, 40)

    # for date features: only check features present in both tables
    date_features = [feat for feat in date_features
                     if (feat in table1.columns.values) and (feat in table2.columns.values)]
    if len(date_features) > 0:
        # convert each date column into "months before now" so it can be
        # compared like a numeric column
        snapshot_date_now = str(datetime.datetime.now().date())
        for col in date_features:
            table1['%s_numeric' % (col)] = (
                pd.to_datetime(snapshot_date_now) -
                pd.to_datetime(table1[col], errors='coerce')).astype('timedelta64[M]', errors='ignore')
            table2['%s_numeric' % (col)] = (
                pd.to_datetime(snapshot_date_now) -
                pd.to_datetime(table2[col], errors='coerce')).astype('timedelta64[M]', errors='ignore')
        _n_jobs = np.min([n_jobs, len(date_features)])
        date_results = Parallel(n_jobs=_n_jobs)(
            delayed(_compare_date)('%s_numeric' % (col),
                                   table1[['%s_numeric' % (col), col]],
                                   table2[['%s_numeric' % (col), col]], img_dir)
            for col in date_features)
        for date_result in date_results:
            if 'corr' in date_result.keys():
                corr_results.append(date_result['corr'])
        # write all results to worksheet
        ws = wb.create_sheet(title='date')
        _insert_numeric_results(date_results, ws, 40, img_dir, date_flag=True)

    # insert the summary into the workbook's default sheet
    ws = wb['Sheet']
    ws.title = 'summary'
    summary_df = schema_correct[['column_1', 'type_1']].rename(columns={
        'column_1': 'column',
        'type_1': 'type'
    })
    corr_df = pd.DataFrame(corr_results)
    summary_df = summary_df.merge(corr_df, on='column', how='left')
    # columns that produced no correlation result are flagged as errors
    summary_df['corr'] = summary_df['corr'].fillna('error')
    summary_df['error_flg'] = summary_df['corr'].apply(lambda x: 1 if x == 'error' else 0)
    error_rows = summary_df[summary_df['error_flg'] == 1].index.values

    _ = _insert_df(summary_df[['column', 'type', 'corr']], ws, header=True)
    # highlight error rows (+2: 1-based rows plus header)
    for r_idx in error_rows:
        ws['C%d' % (r_idx + 2)].style = 'Bad'
    _adjust_ws(ws=ws, row_height=25)

    # if there are some errors, report them in a dedicated sheet
    if len(schema_error) > 0:
        ws = wb.create_sheet(title='error')
        _ = _insert_df(schema_error, ws, header=True)
        _adjust_ws(ws=ws, row_height=25)

    wb.save(filename=os.path.join(output_root, 'data_compare_%s.xlsx' % (fname)))
    if not keep_images:
        shutil.rmtree(img_dir)
def data_summary(table_schema, table, fname, sample_size=1.0, sample_rows=100,
                 output_root='', keep_images=False, n_jobs=1):
    """Summarize basic information of all columns in a data table based on the provided data schema

    Parameters
    ----------
    table_schema: pandas DataFrame
        schema of the table, should contain data types of each column
    table: pandas DataFrame
        the data table
    fname: string
        the output file name
    sample_size: integer or float(<=1.0), default=1.0
        int: number of sample rows to do the summary (useful for large tables)
        float: sample size in percentage
    sample_rows: integer
        number of rows to get data samples
    output_root: string
        the root directory for the output file
    keep_images: boolean
        whether to keep all generated images
    n_jobs: int
        the number of jobs to run in parallel
    """
    # NOTE(review): this module defines data_summary twice; this later
    # definition shadows the earlier one — confirm which is intended

    # check sample_size
    if sample_size > 1:
        if int(sample_size) != sample_size:
            raise ValueError('sample_size: only accept integer when it is > 1.0')
        if sample_size > table.shape[0]:
            print("sample_size: %d is larger than the data size: %d"
                  % (sample_size, table.shape[0]))

    # check output_root
    if output_root != '':
        if not os.path.isdir(output_root):
            raise ValueError('output_root: root not exists')

    # get data samples before sampling the table itself
    # BUG FIX: cap sample_rows at the table size, otherwise sample() raises
    data_sample = table.sample(min(sample_rows, table.shape[0])).reset_index(drop=True)

    # translate a fractional sample_size into an absolute row count
    if sample_size <= 1.0:
        sample_size = int(table.shape[0] * sample_size)
    if sample_size < table.shape[0]:
        table = table.sample(sample_size).reset_index(drop=True)
    else:
        # BUG FIX: without sampling, `table` is still the caller's frame and
        # the date conversion below would add `_numeric` columns to it
        table = table.copy()

    exclude_features, check_features = _check_features(table_schema)

    # temp dir to store all the images generated
    img_dir = 'img_temp'
    if os.path.isdir(img_dir):
        shutil.rmtree(img_dir)
    os.mkdir(img_dir)

    # create a new workbook to store everything
    wb = openpyxl.Workbook()

    # key features
    key_features = check_features['key']
    if len(key_features) > 0:
        # get the check result
        _n_jobs = np.min([n_jobs, len(key_features)])
        key_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_string)(col, table[[col]]) for col in key_features)
        ws = wb.create_sheet(title=u'key')
        # write the final result to work sheet
        _insert_string_results(key_results, ws, 25)

    # numeric features
    numeric_features = check_features['numeric']
    if len(numeric_features) > 0:
        # get the check result
        _n_jobs = np.min([n_jobs, len(numeric_features)])
        numeric_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_numeric)(col, table[[col]], img_dir) for col in numeric_features)
        ws = wb.create_sheet(title=u'numeric')
        # write the final result to work sheet
        _insert_numeric_results(numeric_results, ws, 35, img_dir)

    # string features
    string_features = check_features['str']
    if len(string_features) > 0:
        _n_jobs = np.min([n_jobs, len(string_features)])
        string_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_string)(col, table[[col]]) for col in string_features)
        ws = wb.create_sheet(title=u'string')
        # write the final result to work sheet
        _insert_string_results(string_results, ws, 25)

    # date features
    date_features = check_features['date']
    if len(date_features) > 0:
        # convert each date column into "months before now" so it can be
        # summarized like a numeric column
        snapshot_date_now = str(datetime.datetime.now().date())
        for col in date_features:
            table['%s_numeric' % (col)] = (
                pd.to_datetime(snapshot_date_now) -
                pd.to_datetime(table[col], errors='coerce')).astype('timedelta64[M]', errors='ignore')
        _n_jobs = np.min([n_jobs, len(date_features)])
        date_results = Parallel(n_jobs=_n_jobs)(
            delayed(_check_date)('%s_numeric' % (col), table[['%s_numeric' % (col), col]], img_dir)
            for col in date_features)
        ws = wb.create_sheet(title=u'date')
        # write the final result to work sheet
        _insert_numeric_results(date_results, ws, 35, img_dir, date_flag=True)

    # write schema into the workbook's default sheet
    ws = wb['Sheet']
    ws.title = 'schema'
    # copy() so assigning the 'check' column cannot touch the caller's schema
    out_schema = table_schema[['column', 'type']].copy()
    out_schema['check'] = 'Ok'

    # output error features
    error_indices = []
    if len(exclude_features) > 0:
        out_schema['check'] = out_schema['column'].apply(
            lambda x: 'exclude' if x in exclude_features else 'Ok')
        error_indices += list(out_schema[out_schema['column'].isin(exclude_features)].index.values)

    _ = _insert_df(out_schema, ws, header=True)
    # highlight excluded rows (+2: 1-based rows plus header)
    for idx in error_indices:
        ws['C%d' % (idx + 2)].style = 'Bad'
    _adjust_ws(ws=ws, row_height=25)

    # write data samples
    ws = wb.create_sheet(title=u'sample')
    _ = _insert_df(data_sample, ws, header=True, head_color=False)

    wb.save(filename=os.path.join(output_root, 'data_summary_%s.xlsx' % (fname)))

    # remove all temp images
    if not keep_images:
        shutil.rmtree(img_dir)