def get_filters(args, groups, column_types):
    """Create lists of group IDs that will be used to filter group rows."""
    filters = {
        '__select__': ['Show All', 'Show All Problems'],
        'Show All': groups.keys(),
        'Show All Problems': []}

    # Get the remaining filters. They are the columns in the explanations row.
    group = next(iter(groups.values()))
    columns = util.sort_columns(
        args, group['explanations'].keys(), column_types)
    filters['__select__'] += ['Show problems with: ' + c for c in columns
                              if c in group['explanations'].keys()]

    # Initialize the per-column problem filters
    for name in filters['__select__'][2:]:
        filters[name] = []

    # Get the problems for each group
    all_problems = {}
    for group_by, group in groups.items():
        for column, value in group['explanations'].items():
            if re.search(PROBLEM_PATTERN, value):
                key = 'Show problems with: ' + column
                all_problems[group_by] = 1
                filters[key].append(group_by)

    # Sort every filter list by the grouping column
    filters['Show All Problems'] = all_problems.keys()
    for name in filters['__select__']:
        filters[name] = sorted(filters[name])

    return filters

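# A hedged usage sketch for get_filters(). The dictionary shape follows
# directly from the code above; the subject IDs and column names below
# (1234, 'Country', 'Collector') are made up for illustration only.
#
# filters = get_filters(args, groups, column_types)
# filters might look like:
# {
#     '__select__': ['Show All', 'Show All Problems',
#                    'Show problems with: Collector',
#                    'Show problems with: Country'],
#     'Show All': [1234, 1235, 1236],
#     'Show All Problems': [1235],
#     'Show problems with: Collector': [1235],
#     'Show problems with: Country': [],
# }
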
def reconciled_output(
        args, unreconciled, reconciled, explanations, column_types):
    """
    Output the reconciled data based upon the given arguments.

    1) Split any mmr columns into individual columns for Mean, Mode, and
       Range.
    2) If the --explanations option is selected then add an explanations
       column for every output column just after the reconciled output.
    3) If the --transcribers option is selected then add two columns for
       every user: one for the user name and one for the value entered.
    """
    columns = util.sort_columns(args, reconciled.columns, column_types)
    del columns[:3]  # Drop the first three sorted columns
    reconciled = reconciled.reindex(columns, axis='columns').fillna('')

    plugins = util.get_plugins('column_types')
    for _, plugin in plugins.items():
        if hasattr(plugin, 'adjust_reconciled_columns'):
            reconciled = plugin.adjust_reconciled_columns(
                reconciled, column_types)

    if args.explanations:
        reconciled = add_explanations(reconciled, explanations, column_types)

    if args.transcribers:
        reconciled = add_transcribers(reconciled, unreconciled, column_types)

    reconciled.to_csv(args.reconciled)

    return reconciled

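# The loop above gives any 'column_types' plugin a chance to rewrite the
# reconciled dataframe before it is written out. Only the hook name,
# adjust_reconciled_columns, comes from the code above; the plugin body
# below is a hypothetical sketch showing the expected signature (take the
# dataframe, return the adjusted dataframe).
def adjust_reconciled_columns(reconciled, column_types):
    """Hypothetical example hook: strip stray whitespace from every value."""
    return reconciled.applymap(
        lambda value: value.strip() if isinstance(value, str) else value)
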
def merge_df(args, unreconciled, reconciled, explanations, column_types):
    """
    Combine dataframes.

    Make sure they are grouped by subject ID. Also sort them within each
    subject ID group.
    """
    # Make the index a column
    rec = reconciled.reset_index()
    exp = explanations.reset_index()
    unr = unreconciled.astype(object).copy()

    # Sort by the group-by column, then by row_type, and then by key column
    rec['row_type'] = '1-reconciled'
    exp['row_type'] = '2-explanations'
    unr['row_type'] = '3-unreconciled'

    # Merge and format the dataframes
    merged = pd.concat([rec, exp, unr], sort=True)
    columns = util.sort_columns(args, merged.columns, column_types)
    return (merged.reindex(columns, axis=1)
            .fillna('')
            .sort_values([args.group_by, 'row_type', args.key_column]))

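# With the row_type values above, each subject-ID group in the merged frame
# comes out in a fixed order: the reconciled row first ('1-reconciled'),
# then the explanations row ('2-explanations'), then every raw transcription
# ('3-unreconciled'). A minimal sketch, assuming args.group_by is
# 'subject_id'; the IDs are made up:
#
# merged = merge_df(args, unreconciled, reconciled, explanations, column_types)
# merged[['subject_id', 'row_type']].head() might look like:
#
#   subject_id  row_type
#   1234        1-reconciled
#   1234        2-explanations
#   1234        3-unreconciled
#   1234        3-unreconciled
#   1235        1-reconciled
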
def read(args):
    """Read and convert the input CSV data."""
    df = pd.read_csv(args.input_file, dtype=str)

    # Workflows must be processed individually
    workflow_id = get_workflow_id(df, args)
    df = remove_rows_not_in_workflow(df, str(workflow_id))
    get_nfn_only_defaults(df, args, workflow_id)

    # A hack to work around crap coming back from Zooniverse
    workflow_strings = get_workflow_strings(args.workflow_csv, workflow_id)

    # Extract the various json blobs
    column_types = {}
    df = (extract_annotations(df, column_types, workflow_strings)
          .pipe(extract_subject_data, column_types)
          .pipe(extract_metadata))

    # Get the subject_id from the subject_ids list, use the first one
    df[args.group_by] = df.subject_ids.map(
        lambda x: int(str(x).split(';')[0]))

    # Remove unwanted columns
    unwanted_columns = [c for c in df.columns
                        if c.lower() in ['user_id',
                                         'user_ip',
                                         'subject_ids',
                                         'subject_data',
                                         'subject_retired']]
    df = df.drop(unwanted_columns, axis=1)
    column_types = {k: v for k, v in column_types.items()
                    if k not in unwanted_columns}

    columns = util.sort_columns(args, df.columns, column_types)
    df = df.loc[:, ~df.columns.duplicated()]
    df = df.reindex(columns, axis='columns').fillna('')
    df = df.sort_values([args.group_by, STARTED_AT])
    df = df.drop_duplicates([args.group_by, USER_NAME], keep='first')
    df = df.groupby(args.group_by).head(args.keep_count)

    return df, column_types

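# A minimal usage sketch for read(). The attribute names below are the ones
# this function references on args (input_file, workflow_csv, group_by,
# keep_count); workflow_id is assumed to be used by get_workflow_id(), and
# every example value is made up.
#
# from argparse import Namespace
# args = Namespace(
#     input_file='classifications.csv',  # Zooniverse classification dump
#     workflow_csv='workflows.csv',      # passed to get_workflow_strings()
#     workflow_id=1234,                  # assumed: read by get_workflow_id()
#     group_by='subject_id',             # column used to group rows
#     keep_count=3)                      # max transcripts kept per subject
# df, column_types = read(args)
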
def main():
    """Reconcile the data."""
    args = parse_command_line()

    formats = util.get_plugins('formats')
    unreconciled, column_types = formats[args.format].read(args)

    if unreconciled.shape[0] == 0:
        sys.exit('Workflow {} has no data.'.format(args.workflow_id))

    plugins = util.get_plugins('column_types')
    column_types = get_column_types(args, column_types)
    validate_columns(args, column_types, unreconciled, plugins=plugins)

    if args.unreconciled:
        unreconciled.to_csv(args.unreconciled, index=False)

    if args.reconciled or args.summary or args.merged:
        reconciled, explanations = reconciler.build(
            args, unreconciled, column_types, plugins=plugins)

        if args.reconciled:
            columns = util.sort_columns(
                args, reconciled.columns, column_types)
            del columns[:3]  # Drop the first three sorted columns
            reconciled = reconciled.reindex(columns, axis=1).fillna('')
            reconciled.to_csv(args.reconciled)

        if args.summary:
            summary.report(
                args, unreconciled, reconciled, explanations, column_types)

        if args.merged:
            smerged = merged.merge(
                args, unreconciled, reconciled, explanations, column_types)
            smerged.to_csv(args.merged, index=False)

    if args.zip:
        zip_files(args)

def report(args, unreconciled, reconciled, explanations, column_types):
    """Generate the report."""
    # Everything as strings
    reconciled = reconciled.applymap(str)
    unreconciled = unreconciled.applymap(str)

    # Convert links into anchor elements
    reconciled = reconciled.applymap(create_link)
    unreconciled = unreconciled.applymap(create_link)

    # Get the report template
    env = Environment(loader=PackageLoader('reconcile', '.'))
    template = env.get_template('lib/summary/template.html')

    # Create the group dataset
    groups = get_groups(args, unreconciled, reconciled, explanations)

    # Create filter lists
    filters = get_filters(args, groups, column_types)

    # Get transcriber summary data
    transcribers = user_summary(args, unreconciled)

    # Build the summary report
    summary = template.render(
        args=vars(args),
        header=header_data(args, unreconciled, reconciled, transcribers),
        groups=groups,
        filters=filters,
        columns=util.sort_columns(args, unreconciled, column_types),
        transcribers=transcribers,
        reconciled=reconciled_summary(explanations, column_types),
        problem_pattern=PROBLEM_PATTERN)

    # Output the report
    with open(args.summary, 'w', encoding='utf-8') as out_file:
        out_file.write(summary)

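# create_link() is applied cell-by-cell above to "convert links into anchor
# elements" for the HTML summary. Its real body is not shown in this
# excerpt; the function below is a hypothetical sketch of that behavior,
# under a different name so it is not mistaken for the actual helper.
def create_link_sketch(value):
    """Hypothetical: wrap a value in an <a> tag when it looks like a URL."""
    value = str(value)
    if value.startswith('http://') or value.startswith('https://'):
        return '<a href="{}" target="_blank">{}</a>'.format(value, value)
    return value
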