def get_filters(args, groups, column_types):
    """Create the mapping of filter names to the group IDs they select.

    Returns a dict where '__select__' holds the filter names in display
    order and every other key maps a filter name to a sorted list of
    group IDs matching that filter.
    """
    filters = {
        '__select__': ['Show All', 'Show All Problems'],
        'Show All': groups.keys(),
        'Show All Problems': []
    }

    # Get the remaining filters. They are the columns in the explanations row.
    group = next(iter(groups.values()))
    columns = util.sort_columns(args, group['explanations'].keys(),
                                column_types)
    # Membership test goes against the dict itself; no need for .keys()
    filters['__select__'] += [
        'Show problems with: ' + c for c in columns
        if c in group['explanations']
    ]
    # Initialize the per-column filters (skip the two fixed entries)
    for name in filters['__select__'][2:]:
        filters[name] = []

    # Get the problems for each group
    all_problems = {}
    for group_by, group in groups.items():
        for column, value in group['explanations'].items():
            if re.search(PROBLEM_PATTERN, value):
                all_problems[group_by] = 1
                filters['Show problems with: ' + column].append(group_by)

    # Sort every filter's IDs by the grouping column
    filters['Show All Problems'] = all_problems.keys()
    for name in filters['__select__']:
        filters[name] = sorted(filters[name])

    return filters
# Example #2
def reconciled_output(
        args, unreconciled, reconciled, explanations, column_types):
    """
    Output the reconciled based upon the given arguments.

    1) Split any mmr columns into individual columns for Mean, Mode, and Range.

    2) If the --explanations option is selected then add an
       explanations column for every output column just after the reconciled
       output.

    3) If the --transcribers option is selected then add two columns
       for every user. One for the user name and one for the value entered.
    """
    columns = util.sort_columns(args, reconciled.columns, column_types)
    # Drop the first three sorted columns before output; presumably the
    # grouping/key bookkeeping columns -- TODO confirm against sort_columns
    del columns[:3]
    reconciled = reconciled.reindex(columns, axis='columns').fillna('')

    # Let column-type plugins rewrite their columns (e.g. split mmr columns)
    plugins = util.get_plugins('column_types')
    for plugin in plugins.values():
        if hasattr(plugin, 'adjust_reconciled_columns'):
            reconciled = plugin.adjust_reconciled_columns(
                reconciled, column_types)

    if args.explanations:
        reconciled = add_explanations(reconciled, explanations, column_types)

    if args.transcribers:
        reconciled = add_transcribers(reconciled, unreconciled, column_types)

    reconciled.to_csv(args.reconciled)

    return reconciled
def merge_df(args, unreconciled, reconciled, explanations, column_types):
    """
    Combine the reconciled, explanations, and unreconciled dataframes.

    Make sure they are grouped by subject ID. Also sort them within each
    subject ID group: reconciled rows first, then explanations, then
    unreconciled (via the synthetic 'row_type' column), then key column.
    """
    # Make the index a column
    rec = reconciled.reset_index()
    exp = explanations.reset_index()
    unr = unreconciled.astype(object).copy()

    # The '1-'/'2-'/'3-' prefixes order the frames within each group
    rec['row_type'] = '1-reconciled'
    exp['row_type'] = '2-explanations'
    unr['row_type'] = '3-unreconciled'

    # Merge and format the dataframes.
    # The dead "return merged" that followed the return below was removed.
    merged = pd.concat([rec, exp, unr], sort=True)
    columns = util.sort_columns(args, merged.columns, column_types)
    return (merged.reindex(columns, axis=1)
                  .fillna('')
                  .sort_values([args.group_by, 'row_type', args.key_column]))
def get_filters(args, groups, column_types):
    """Create list of group IDs that will be used to filter group rows.

    Returns a dict: '__select__' lists the filter names in display order;
    each other key maps a filter name to a sorted list of group IDs.
    """
    filters = {
        '__select__': ['Show All', 'Show All Problems'],
        'Show All': groups.keys(),
        'Show All Problems': []}

    # Get the remaining filters. They are the columns in the explanations row.
    group = next(iter(groups.values()))
    columns = util.sort_columns(
        args, group['explanations'].keys(), column_types)
    # Test membership against the dict directly instead of .keys()
    filters['__select__'] += ['Show problems with: ' + c
                              for c in columns
                              if c in group['explanations']]
    # Initialize the filters (skip the two fixed entries)
    for name in filters['__select__'][2:]:
        filters[name] = []

    # Get the problems for each group
    all_problems = {}
    for group_by, group in groups.items():
        for column, value in group['explanations'].items():
            if re.search(PROBLEM_PATTERN, value):
                all_problems[group_by] = 1
                filters['Show problems with: ' + column].append(group_by)

    # Sort by the grouping column
    filters['Show All Problems'] = all_problems.keys()
    for name in filters['__select__']:
        filters[name] = sorted(filters[name])

    return filters
# Example #5
def read(args):
    """Read and convert the input CSV data."""
    raw = pd.read_csv(args.input_file, dtype=str)

    # Workflows must be processed individually
    workflow_id = get_workflow_id(raw, args)
    raw = remove_rows_not_in_workflow(raw, str(workflow_id))
    get_nfn_only_defaults(raw, args, workflow_id)

    # A hack to workaround crap coming back from Zooniverse
    workflow_strings = get_workflow_strings(args.workflow_csv, workflow_id)

    # Extract the various json blobs
    column_types = {}
    df = (extract_annotations(raw, column_types, workflow_strings)
          .pipe(extract_subject_data, column_types)
          .pipe(extract_metadata))

    # Get the subject_id from the subject_ids list, use the first one
    df[args.group_by] = df.subject_ids.map(
        lambda ids: int(str(ids).split(';')[0]))

    # Remove unwanted columns (case-insensitive match)
    unwanted = {'user_id', 'user_ip', 'subject_ids', 'subject_data',
                'subject_retired'}
    unwanted_columns = [c for c in df.columns if c.lower() in unwanted]
    df = df.drop(unwanted_columns, axis=1)
    column_types = {name: kind for name, kind in column_types.items()
                    if name not in unwanted_columns}

    # Order the columns, de-duplicate, and trim to the keep count
    columns = util.sort_columns(args, df.columns, column_types)
    df = df.loc[:, ~df.columns.duplicated()]
    df = (df.reindex(columns, axis='columns')
            .fillna('')
            .sort_values([args.group_by, STARTED_AT])
            .drop_duplicates([args.group_by, USER_NAME], keep='first')
            .groupby(args.group_by)
            .head(args.keep_count))

    return df, column_types
# Example #6
def main():
    """Reconcile the data."""
    args = parse_command_line()

    formats = util.get_plugins('formats')
    unreconciled, column_types = formats[args.format].read(args)

    # Nothing to reconcile for this workflow
    if not unreconciled.shape[0]:
        sys.exit('Workflow {} has no data.'.format(args.workflow_id))

    plugins = util.get_plugins('column_types')
    column_types = get_column_types(args, column_types)
    validate_columns(args, column_types, unreconciled, plugins=plugins)

    if args.unreconciled:
        unreconciled.to_csv(args.unreconciled, index=False)

    if args.reconciled or args.summary or args.merged:
        reconciled, explanations = reconciler.build(
            args, unreconciled, column_types, plugins=plugins)

        if args.reconciled:
            # Skip the first three sorted columns when writing the CSV
            columns = util.sort_columns(
                args, reconciled.columns, column_types)[3:]
            reconciled = reconciled.reindex(columns, axis=1).fillna('')
            reconciled.to_csv(args.reconciled)

        if args.summary:
            summary.report(
                args, unreconciled, reconciled, explanations, column_types)

        if args.merged:
            smerged = merged.merge(
                args, unreconciled, reconciled, explanations, column_types)
            smerged.to_csv(args.merged, index=False)

    if args.zip:
        zip_files(args)
def report(args, unreconciled, reconciled, explanations, column_types):
    """Generate the report."""
    # Stringify every cell, then convert links into anchor elements
    for transform in (str, create_link):
        reconciled = reconciled.applymap(transform)
        unreconciled = unreconciled.applymap(transform)

    # Get the report template
    loader = PackageLoader('reconcile', '.')
    template = Environment(loader=loader).get_template(
        'lib/summary/template.html')

    # Create the group dataset and its filter lists
    groups = get_groups(args, unreconciled, reconciled, explanations)
    filters = get_filters(args, groups, column_types)

    # Get transcriber summary data
    transcribers = user_summary(args, unreconciled)

    # Build the summary report
    report_html = template.render(
        args=vars(args),
        header=header_data(args, unreconciled, reconciled, transcribers),
        groups=groups,
        filters=filters,
        columns=util.sort_columns(args, unreconciled, column_types),
        transcribers=transcribers,
        reconciled=reconciled_summary(explanations, column_types),
        problem_pattern=PROBLEM_PATTERN)

    # Output the report
    with open(args.summary, 'w', encoding='utf-8') as out_file:
        out_file.write(report_html)
def report(args, unreconciled, reconciled, explanations, column_types):
    """Generate the report."""
    # Everything as strings, with links converted to anchor elements
    reconciled = reconciled.applymap(str).applymap(create_link)
    unreconciled = unreconciled.applymap(str).applymap(create_link)

    # Get the report template
    env = Environment(loader=PackageLoader('reconcile', '.'))
    template = env.get_template('lib/summary/template.html')

    # Create the group dataset
    groups = get_groups(args, unreconciled, reconciled, explanations)

    # Create filter lists
    filters = get_filters(args, groups, column_types)

    # Get transcriber summary data
    transcribers = user_summary(args, unreconciled)

    # Build the summary report
    rendered = template.render(
        args=vars(args),
        header=header_data(args, unreconciled, reconciled, transcribers),
        groups=groups,
        filters=filters,
        columns=util.sort_columns(args, unreconciled, column_types),
        transcribers=transcribers,
        reconciled=reconciled_summary(explanations, column_types),
        problem_pattern=PROBLEM_PATTERN)

    # Output the report
    with open(args.summary, 'w', encoding='utf-8') as out_file:
        out_file.write(rendered)
def merge_df(args, unreconciled, reconciled, explanations, column_types):
    """
    Combine dataframes.

    Make sure they are grouped by subject ID. Also sort them within each
    subject ID group: reconciled first, then explanations, then
    unreconciled rows (ordered by the synthetic 'row_type' column).
    """
    # Make the index a column
    rec = reconciled.reset_index()
    exp = explanations.reset_index()
    unr = unreconciled.astype(object).copy()

    # Sort by group-by then by row_type and then key-column
    rec['row_type'] = '1-reconciled'
    exp['row_type'] = '2-explanations'
    unr['row_type'] = '3-unreconciled'

    # Merge and format the dataframes.
    # Removed the unreachable "return merged" that followed this return.
    merged = pd.concat([rec, exp, unr], sort=True)
    columns = util.sort_columns(args, merged.columns, column_types)
    return (merged.reindex(columns, axis=1).fillna('').sort_values(
        [args.group_by, 'row_type', args.key_column]))