def get_collapsed_relations(dict_list, mapping='levels', answer_name='answer'):
    """
    Collapse per-relation annotations into one record per
    (worker, property-concept pair, level).

    Relations are mapped to levels via load_rel_level_mapping; relations
    without a mapping keep their own name as the level. Within each
    (worker, pair, level) group the collapsed answer is True if any of
    the grouped answers is 'true' (case-insensitive), else False.

    :param dict_list: list of annotation dicts
    :param mapping: name of the relation-to-level mapping to load
    :param answer_name: key under which the answer is stored
    :return: list of collapsed annotation dicts
    """
    collapsed_dicts = []
    level_rel_dict = load_rel_level_mapping(mapping=mapping)
    dict_list_by_worker = sort_by_key(dict_list, ['workerid'])
    for w, w_dicts in dict_list_by_worker.items():
        dicts_by_pair = sort_by_key(w_dicts, ['property', 'concept'])
        for p, p_dicts in dicts_by_pair.items():
            dicts_by_level = defaultdict(list)
            for d in p_dicts:
                rel = d['relation']
                # fall back to the relation itself when it has no level mapping
                dicts_by_level[level_rel_dict.get(rel, rel)].append(d)
            for level, l_dicts in dicts_by_level.items():
                answers = [str(d[answer_name]).lower() for d in l_dicts]
                new_d = dict()
                new_d['quid'] = f'{p}-{level}'
                new_d['workerid'] = w
                new_d['completionurl'] = l_dicts[0]['completionurl']
                # take property/concept from the group itself instead of the
                # leaked loop variable `d` the original relied on
                new_d['property'] = l_dicts[0]['property']
                new_d['concept'] = l_dicts[0]['concept']
                new_d['relation'] = level
                new_d[answer_name] = 'true' in answers
                collapsed_dicts.append(new_d)
    return collapsed_dicts
def aggregate_binary_labels(data_dict_list, ct_units, ct_thresholds):
    """
    Aggregate crowd answers into one labelled dict per (pair, relation).

    Adds a majority vote, a top vote, and one crowd-truth vote per
    threshold (stored under 'uas-<threshold>') to each aggregated dict.

    :param data_dict_list: list of annotation dicts
    :param ct_units: crowd-truth unit score dicts
    :param ct_thresholds: iterable of crowd-truth score thresholds
    :return: list of aggregated label dicts
    """
    aggregated_binary_labels = []
    data_by_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    units_by_quid = sort_by_key(ct_units, ['unit'])
    for pair, pair_dicts in data_by_pair.items():
        # skip meta/check pairs
        if pair.startswith('_'):
            continue
        data_by_rel = sort_by_key(pair_dicts, ['relation'])
        # collect scores/proportions per relation
        rel_prop_dict = get_props(data_by_rel)
        rel_ct_score_dict = get_cts(data_by_rel, units_by_quid)
        rel_majority_vote = get_majority_vote(rel_prop_dict)
        rel_top_vote = get_top_vote(rel_prop_dict)
        ct_thresh_votes = {
            thresh: get_ct_vote(rel_ct_score_dict, thresh)
            for thresh in ct_thresholds
        }
        for rel, data in data_by_rel.items():
            triple_dict = get_agg_dict(data, pair, rel)
            triple_dict['majority_vote'] = rel_majority_vote[rel]
            triple_dict['top_vote'] = rel_top_vote[rel]
            for thresh, ct_vote in ct_thresh_votes.items():
                triple_dict[f'uas-{thresh}'] = ct_vote[rel]
            aggregated_binary_labels.append(triple_dict)
    return aggregated_binary_labels
# Example #3
def remove_low_quality_workers_ct(all_annotations, unit, n_stds):
    """
    Remove annotations from workers whose crowd-truth worker quality
    score (wqs) is filtered out by filter_with_stdv at n_stds standard
    deviations, computed per unit.

    :param all_annotations: list of annotation dicts
    :param unit: granularity of filtering: 'batch', 'pair', or 'total'
    :param n_stds: number of standard deviations for the cut-off
    :return: list of annotation dicts from retained workers
    :raises ValueError: if unit is not one of the supported values
    """
    ct_workers = load_ct('*', 'experiment*', '*', 'workers', as_dict=True)
    ct_by_workers = sort_by_key(ct_workers, ['worker'])

    if unit == 'batch':
        annotations_by_unit = sort_by_key(all_annotations,
                                          ['filename', 'completionurl'])
    elif unit == 'pair':
        annotations_by_unit = sort_by_key(all_annotations,
                                          ['property', 'concept'])
    elif unit == 'total':
        annotations_by_unit = dict()
        annotations_by_unit['total'] = all_annotations
    else:
        # previously an unknown unit left annotations_by_unit unbound and
        # raised a NameError below
        raise ValueError(f"Unknown unit: {unit!r} "
                         "(expected 'batch', 'pair' or 'total')")

    clean_annotations = []
    for unit_id, annotations in annotations_by_unit.items():
        # one wqs entry per distinct worker in this unit
        worker_dicts = []
        for w in set(d['workerid'] for d in annotations):
            worker_dicts.append({'workerid': w,
                                 'wqs': ct_by_workers[w][0]['wqs']})
        workers_to_remove = filter_with_stdv(worker_dicts,
                                             measure='wqs',
                                             n_stds=n_stds)
        clean_annotations.extend(d for d in annotations
                                 if d['workerid'] not in workers_to_remove)

    return clean_annotations
# Example #4
def evaluate(gold, crowd, verbose=False):
    """
    Score crowd labels against gold answers.

    Only triples with exactly one crowd entry are scored; other triples
    are reported when verbose is True. Returns weighted precision,
    recall and f1 plus coverage (scored triples / number of gold items).
    """
    keys = ['relation', 'property', 'concept']
    crowd_by_triple = sort_by_key(crowd, keys, key_type='tuple')
    gold_by_triple = sort_by_key(gold, keys, key_type='tuple')
    labels_gold = []
    labels_crowd = []
    cov = 0
    for t, gold_data in gold_by_triple.items():
        crowd_data = crowd_by_triple[t]
        if len(crowd_data) != 1:
            if verbose == True:
                print('irregularities in data:', t, len(crowd_data))
            continue
        cov += 1
        labels_gold.append(str(gold_data[0]['answer']).lower().strip())
        labels_crowd.append(str(crowd_data[0]['label']).lower().strip())

    p, r, f1, support = p_r_f1(labels_gold, labels_crowd, average='weighted')
    return {
        'f1': f1,
        'p': p,
        'r': r,
        'coverage': cov / len(gold),
    }
# Example #5
def time_filter(
    all_annotations,
    n_std,
    direction='both',
):
    """
    Remove annotations of workers whose total annotation time per batch
    deviates more than n_std standard deviations from the batch mean.

    :param all_annotations: list of annotation dicts (must carry
        'completionurl', 'f_name_full', 'workerid', 'duration_in_seconds')
    :param n_std: number of standard deviations for the cut-off
    :param direction: 'below' (too fast), 'above' (too slow) or 'both'
    :return: list of annotation dicts of retained workers
    """
    clean_annotations = []
    annotations_by_batch = sort_by_key(all_annotations,
                                       ['completionurl', 'f_name_full'])
    for batch, annotations in annotations_by_batch.items():
        annotations_by_worker = sort_by_key(annotations, ['workerid'])
        # total time spent per worker; items without a duration are skipped
        worker_seconds = dict()
        for worker, worker_annotations in annotations_by_worker.items():
            seconds = [
                float(a['duration_in_seconds']) for a in worker_annotations
                if a['duration_in_seconds'] is not None
            ]
            worker_seconds[worker] = sum(seconds)
        mean_seconds = sum(worker_seconds.values()) / len(worker_seconds)
        # BUG FIX: stdev raises StatisticsError with fewer than two workers;
        # a single worker has no deviation to measure, so use 0.
        if len(worker_seconds) > 1:
            std = stdev(worker_seconds.values())
        else:
            std = 0.0
        lower = mean_seconds - n_std * std
        upper = mean_seconds + n_std * std
        for worker, seconds in worker_seconds.items():
            if direction == 'below' and seconds < lower:
                print('worker took too little time', worker)
            elif direction == 'above' and seconds > upper:
                print('worker took too much time', worker)
            elif direction == 'both' and (seconds > upper or seconds < lower):
                # BUG FIX: message hard-coded '2 stdevs' regardless of n_std
                print(f'worker outside {n_std} stdevs', worker)
            else:
                clean_annotations.extend(annotations_by_worker[worker])
    return clean_annotations
def get_pair_analysis(data_dict_list, name):
    """
    Build per-(property, concept)-pair contradiction statistics and write
    them to ../analyses/pairs/<name>.csv.

    For every pair this collects, per worker, the contradictory relation
    judgements (collect_contradictions), counts how many contradictions
    were possible given the relations the worker annotated, and
    aggregates counts, ratios and average annotation time into one row
    per pair. The resulting DataFrame is sorted by the ratio of workers
    that contradict themselves, descending.

    :param data_dict_list: list of annotation dicts
    :param name: base name of the output csv file
    :return: tuple (sorted pandas DataFrame, path of the written csv)
    """
    pair_data_dicts = []
    data_by_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    contradictions = load_contradiction_pairs()
    for pair, dl_pair in data_by_pair.items():
        d = dict()
        n_annotations = len(dl_pair)
        data_by_worker = sort_by_key(dl_pair, ['workerid'])
        # counts per contradiction type, accumulated over all workers
        cont_cnt = Counter()
        av_time_all_workers = []
        d['pair'] = pair
        workers_with_contradictions = []
        d['n_annotations'] = n_annotations
        n_workers = len(data_by_worker)
        d['n_workers'] = n_workers
        annoation_ids_with_contradictions = []
        n_possible_contradictions = 0
        for worker, dl_worker in data_by_worker.items():
            av_time_all_workers.append(get_average_time_worker(dl_worker))
            pair_worker_cont = collect_contradictions(dl_worker, contradictions, threshold = 0)
            relations = [d['relation'] for d in dl_worker]
            # a contradiction is only possible when the worker annotated
            # both relations of a contradictory pair
            for r1, r2 in contradictions:
                if r1 in relations and r2 in relations:
                    n_possible_contradictions += 1
            if len(pair_worker_cont) > 0:
                workers_with_contradictions.append(worker)
                annoation_ids_with_contradictions.extend(get_annotation_ids(dl_worker))
            cont_cnt.update(pair_worker_cont)
        n_contradictions = sum(cont_cnt.values())
        d['n_contradictions'] = n_contradictions
        d['n_workers_contradicting'] = len(workers_with_contradictions)
        d['ratio_workers_contradicting'] = len(workers_with_contradictions)/n_workers
        d['contradiction_annotation_ratio'] = n_contradictions/n_annotations
        d['n_possible_contradictions'] = n_possible_contradictions
        # guard against division by zero when no contradiction was possible
        if n_possible_contradictions != 0:
            d['contradiction_poss_contradiction_ratio'] = n_contradictions/n_possible_contradictions
        else:
            d['contradiction_poss_contradiction_ratio'] = 0
        d['average_time_pair'] = sum(av_time_all_workers)/len(av_time_all_workers)
        d['workers_contradicting'] = ' '.join(workers_with_contradictions)
        workers_not_contradicting = [w for w in data_by_worker if w \
                                     not in workers_with_contradictions]
        d['workers_not_contradicting'] = ' '.join(workers_not_contradicting)
        # add one column per contradiction type with its count
        d.update(cont_cnt)
        d['annotations_with_contradiction'] = ' '.join(annoation_ids_with_contradictions)
        pair_data_dicts.append(d)

    pair_df = pd.DataFrame(pair_data_dicts)
    # sort by the ratio of workers contradicting themselves (descending)
    pair_df.sort_values('ratio_workers_contradicting', axis=0, ascending=False, inplace=True)
    out_dir = '../analyses/pairs/'
    os.makedirs(out_dir, exist_ok=True)
    filepath = f'{out_dir}{name}.csv'
    pair_df.to_csv(filepath, index=False)
    return pair_df, filepath
def get_worker_analysis_by_pair(data_dict_list, contradictions):
    """
    Compute per-worker statistics separately for each (property, concept)
    pair; every result dict is tagged with its pair under 'pair'.
    """
    analysis_data_dicts = []
    data_by_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    for pair, pair_data in data_by_pair.items():
        by_worker = sort_by_key(pair_data, ['workerid'])
        for worker_dict in get_worker_data(by_worker, contradictions):
            worker_dict['pair'] = pair
            analysis_data_dicts.append(worker_dict)
    return analysis_data_dicts
# Example #8
def get_evaluation_instances(crowd, gold, verbose=False):
    """
    Collect the crowd entries for every triple present in the gold data.
    Prints the gold/crowd/instance counts; reports empty triples when
    verbose is True.
    """
    keys = ['relation', 'property', 'concept']
    triples_gold = sort_by_key(gold, keys, key_type='tuple')
    triples_crowd = sort_by_key(crowd, keys, key_type='tuple')
    evaluation_instances_crowd = []
    for t in triples_gold:
        crowd_entries = triples_crowd[t]
        evaluation_instances_crowd.extend(crowd_entries)
        if verbose == True and len(crowd_entries) == 0:
            print(t, 'no data')
    print(len(triples_gold), len(triples_crowd), len(evaluation_instances_crowd))
    return evaluation_instances_crowd
def get_worker_analysis_by_batch(data_dict_list, contradictions):
    """
    Compute per-worker statistics separately for each (filename,
    completionurl) batch; every result dict is tagged with its batch id
    under 'filename-url'.
    """
    analysis_data_dicts = []
    data_by_batch = sort_by_key(data_dict_list, ['filename', 'completionurl'])
    for f_url, batch_data in data_by_batch.items():
        by_worker = sort_by_key(batch_data, ['workerid'])
        worker_dicts = get_worker_data(by_worker, contradictions)
        for wd in worker_dicts:
            wd['filename-url'] = f_url
        analysis_data_dicts += worker_dicts
    return analysis_data_dicts
# Example #10
def remove_contradictory_annotations(all_annotations):
    """
    Keep only annotations from (pair, worker) groups that contain no
    contradictory relation judgements.

    :param all_annotations: list of annotation dicts
    :return: list of annotation dicts without contradictions
    """
    clean_annotations = []
    contradictions = load_contradiction_pairs()
    annotations_by_unit = sort_by_key(all_annotations, ['property', 'concept'])
    for pair, annotations in annotations_by_unit.items():
        # BUG FIX: group the current pair's annotations by worker; the
        # original grouped `all_annotations` here, ignoring the pair split
        # and re-appending the whole data set once per pair (duplicates).
        annotations_per_worker = sort_by_key(annotations, ['workerid'])
        for w, w_annotations in annotations_per_worker.items():
            pair_contradictions = collect_contradictions(w_annotations,
                                                         contradictions,
                                                         threshold=0)
            if len(pair_contradictions) == 0:
                clean_annotations.extend(w_annotations)

    return clean_annotations
def get_agreement(dict_list_out, collapse_relations = False, v=True, disable_kappa=False):
    """
    Compute inter-annotator agreement scores.

    Reports Krippendorff's alpha, pairwise proportional agreement and
    the average pairwise Cohen's kappa. Kappa is averaged per file
    (completionurl), not over the entire set; it is '-' when disabled or
    when there are no files.

    :param dict_list_out: list of annotation dicts
    :param collapse_relations: mapping name passed to
        get_collapsed_relations, or False to keep relations as-is
    :param v: print the scores when True
    :param disable_kappa: skip the kappa computation when True
    :return: dict with keys 'Krippendorff', 'Proportional',
        'Av_Cohens_kappa'
    """
    agreement_dict = dict()
    if collapse_relations != False:
        dict_list_out = get_collapsed_relations(dict_list_out, collapse_relations)
    matrix = create_matrix(dict_list_out)
    ratingtask = agreement.AnnotationTask(data=matrix)
    alpha = ratingtask.alpha()
    prop = proportional_agreement_pairs(matrix)

    # Calculate kappa by file (not over entire set)
    data_by_file = sort_by_key(dict_list_out, ['completionurl'])
    if disable_kappa == False and len(data_by_file) != 0:
        total_kappa = 0.0
        for f, d_list in data_by_file.items():
            file_matrix = create_matrix(d_list)
            kappa = get_average_kappa(file_matrix)
            # treat undefined (nan) kappas as 0 so they don't poison the sum
            if np.isnan(kappa):
                kappa = 0.0
            total_kappa += kappa
        # BUG FIX: the original reported '-' when the average happened to be
        # exactly 0.0, and raised a NameError when data_by_file was empty.
        average_kappa = total_kappa / len(data_by_file)
    else:
        average_kappa = '-'

    if v == True:
        print(f"Krippendorff's alpha: {alpha}")
        print(f"Average Cohen's Kappa (pairwise): {average_kappa}")
        print(f"Proportional agreement (pairwise): {prop}")
        print()
    agreement_dict['Krippendorff'] = alpha
    agreement_dict['Proportional'] = prop
    agreement_dict['Av_Cohens_kappa'] = average_kappa
    return agreement_dict
# Example #12
def remove_contradicting_workers(all_annotations, dict_list_workers, unit,
                                 n_stds):
    """
    Remove annotations from workers whose contradiction ratio
    ('contradiction_poss_contradiction_ratio') is filtered out by
    filter_with_stdv at n_stds standard deviations, computed per unit.

    :param all_annotations: list of annotation dicts
    :param dict_list_workers: per-worker analysis dicts (as produced by
        the get_worker_analysis_* functions)
    :param unit: granularity of filtering: 'batch', 'pair', or 'total'
    :param n_stds: number of standard deviations for the cut-off
    :return: list of annotation dicts from retained workers

    NOTE(review): an unsupported unit value leaves annotations_by_unit /
    workers_by_unit unbound and raises a NameError below — confirm
    callers only ever pass the three supported values.
    """
    print('all runs before cleaning')
    # run id is assumed to be the 4th path component of f_name_full
    all_runs = set([d['f_name_full'].split('/')[3] for d in all_annotations])
    print(all_runs)
    if unit == 'batch':
        annotations_by_unit = sort_by_key(all_annotations,
                                          ['filename', 'completionurl'])
        workers_by_unit = sort_by_key(dict_list_workers, ['filename-url'])
        print('annotations_by_unit', len(annotations_by_unit))
        print('workers by unit', len(workers_by_unit))
        # diagnostic: annotation unit ids should all appear in the worker data
        print('comparing unit ids')
        for u, annotations in annotations_by_unit.items():
            if u in workers_by_unit:
                continue
            else:
                print('not found', u)
    elif unit == 'pair':
        annotations_by_unit = sort_by_key(all_annotations,
                                          ['property', 'concept'])
        workers_by_unit = sort_by_key(dict_list_workers, ['pair'])

    elif unit == 'total':
        annotations_by_unit = dict()
        annotations_by_unit['total'] = all_annotations
        workers_by_unit = dict()
        workers_by_unit['total'] = dict_list_workers
    clean_annotations = []
    print('removing workers')
    for unit_id, workers in workers_by_unit.items():
        workers_to_remove = filter_with_stdv(
            workers,
            measure='contradiction_poss_contradiction_ratio',
            n_stds=n_stds)
        #print(unit_id, len(workers_to_remove))

        # keep only annotations from workers that survived the filter
        annotations = annotations_by_unit[unit_id]
        for d in annotations:
            worker = d['workerid']

            if worker not in workers_to_remove:
                clean_annotations.append(d)
            #if unit_id == 'dangerous-scalpel':
            #    print('remove:', worker in workers_to_remove, worker)
    return clean_annotations
# Example #13
def add_duration_info(dict_list_out_batch):
    """
    Add duration info per item after time
    info has been added from
    prolific summary data.
    """
    # group the batch output per worker and derive durations in place
    data_by_worker = sort_by_key(dict_list_out_batch, ['workerid'])
    for worker_data in data_by_worker.values():
        get_duration(worker_data)
# Example #14
def remove_singletons(data_dict_list, v=False):
    """
    Drop (property, concept) pairs that were annotated with only a single
    relation, unless that single relation is a check/test item.
    """
    clean_data = []
    data_by_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    for pair, dl in data_by_pair.items():
        data_by_relation = sort_by_key(dl, ['relation'])
        if len(data_by_relation) > 1:
            clean_data.extend(dl)
            continue
        relation, rel_dicts = next(iter(data_by_relation.items()))
        quid = rel_dicts[0]['quid']
        # keep singleton pairs that are checks or test items
        is_check = (relation == '_check'
                    or relation.startswith('test_')
                    or quid.startswith('test'))
        if is_check:
            clean_data.extend(dl)
        elif v == True:
            print('Filter true')
    if v == True:
        print(f'number of questions: {len(data_dict_list)}')
        print(f'number of questions without singletons: {len(clean_data)}')
    return clean_data
def get_worker_data(data_by_worker, contradictions):
    """
    Build one statistics dict per worker from their annotations.

    For each worker this counts failed tests/checks, contradictory
    relation judgements per (property, concept) pair, the number of
    contradictions that were possible given the relations the worker
    annotated, and several derived ratios plus the average time per
    question.

    :param data_by_worker: dict mapping workerid -> list of annotation dicts
    :param contradictions: iterable of (relation1, relation2) pairs that
        contradict each other
    :return: list of per-worker statistics dicts
    """
    worker_data_dicts = []
    for worker, dl_worker in data_by_worker.items():
        d = dict()
        n_annotations = len(dl_worker)
        fails = get_tests_and_checks(dl_worker)
        d['workerid'] = worker
        d['n_annotations'] = n_annotations
        # counts per contradiction type, accumulated over all pairs
        cont_cnt = Counter()
        data_by_pair = sort_by_key(dl_worker, ['property', 'concept'])
        n_possible_contradictions = 0
        pairs_with_cont = 0
        for pair, dl_pair in data_by_pair.items():
            pair_contradictions = collect_contradictions(dl_pair,
                                                         contradictions,
                                                         threshold=0)
            relations = [d['relation'] for d in dl_pair]
            # a contradiction is only possible when the worker annotated
            # both relations of a contradictory pair
            for r1, r2 in contradictions:
                if r1 in relations and r2 in relations:
                    n_possible_contradictions += 1
            cont_cnt.update(pair_contradictions)
            if len(pair_contradictions) != 0:
                pairs_with_cont += 1
        n_contradictions = sum(cont_cnt.values())
        d['n_contradictions'] = n_contradictions
        d['n_fails'] = len(fails)
        d['contradiction_annotation_ratio'] = n_contradictions / n_annotations
        d['n_possible_contradictions'] = n_possible_contradictions
        # guard against division by zero when no contradiction was possible
        if n_possible_contradictions != 0:
            d['contradiction_poss_contradiction_ratio'] = n_contradictions / n_possible_contradictions
        else:
            d['contradiction_poss_contradiction_ratio'] = 0
        d['fail_annotation_ratio'] = len(fails) / n_annotations
        d['contradictory_pairs_ratio'] = pairs_with_cont / len(data_by_pair)
        d['average_time_question'] = get_average_time_worker(dl_worker)
        d['annotations'] = ' '.join(get_annotation_ids(dl_worker))
        # normalize number of contradictions per type by total number of possible contradictions
        for cont, cnt in cont_cnt.items():
            if n_possible_contradictions != 0:
                cnt_norm = cnt / n_possible_contradictions
            else:
                cnt_norm = 0
            d[cont] = cnt_norm
        worker_data_dicts.append(d)
    return worker_data_dicts
# Example #16
def main():
    """
    Evaluate crowd aggregation configurations against gold labels,
    both over the full gold set and split by expected (dis)agreement
    behaviour, writing one csv per evaluation.
    """
    # evaluate total:
    gold = load_gold_data()
    print(gold[0].keys())
    print(len(gold))
    # remove no gold:
    gold = [d for d in gold if d['answer'] != 'NOGOLD']
    df = pd.DataFrame(evaluate_configs(gold)).sort_values('f1', ascending=False)
    df.to_csv('../evaluation/evaluation_accuracy_full_update.csv')

    # evaluate expectation sets, split by expected agreement category
    gold = load_gold_data()
    gold_by_agreement = sort_by_key(gold, ['expected_agreement'])
    gold_agree = gold_by_agreement['agreement']
    gold_poss_disagree = gold_by_agreement['possible_disagreement']
    gold_disagree = gold_by_agreement['disagreement']
    print('gold agree', len(gold_agree))
    print('gold poss disagree', len(gold_poss_disagree))
    print('gold disagree', len(gold_disagree))
    # merge possible with certain disagreement
    gold_disagree_all = []
    gold_disagree_all.extend(gold_poss_disagree)
    gold_disagree_all.extend(gold_disagree)

    overview_dicts_total = []
    for gold_subset, behaviour in ((gold_agree, 'agree'),
                                   (gold_disagree_all, 'disagree')):
        subset_results = evaluate_configs(gold_subset)
        for d in subset_results:
            d['behav.'] = behaviour
        overview_dicts_total.extend(subset_results)

    df = pd.DataFrame(overview_dicts_total).sort_values('f1', ascending=False)
    df.to_csv('../evaluation/evaluation_accuracy_agree_disagree_update.csv')
# Example #17
def get_duration(worker_data):
    """
    Derive per-item durations (in seconds) for one worker's annotations
    and store them under 'duration_in_seconds' on each dict (in place).

    Item boundaries come from the annotation timestamps; the first item
    starts at the task start time and the last ends at the task
    completion time (both from the prolific summary data, which may be
    None). Start/end times are shifted by 1 or 2 hours to compensate for
    the UK vs server time-zone offset, inferred from the gap between the
    task start and the first annotation timestamp.
    """
    # sort data by time stamp to create sequence
    data_by_timestamp = sort_by_key(worker_data, ['timestamp_datetime'])
    time_series = sorted(data_by_timestamp.keys())
    first_timestamp = time_series[0]

    # get task start and end time from any of the dicts
    start_time_str = worker_data[0]['started_datetime']
    end_time_str = worker_data[0]['completed_datetime']

    # get start and end as datetime objects
    if start_time_str is not None:
        start_uk = datetime.fromisoformat(start_time_str)
        diff_start_1_seconds = (first_timestamp - start_uk).total_seconds()
    else:
        start_uk = None
        diff_start_1_seconds = None
    if end_time_str is not None:
        end_uk = datetime.fromisoformat(end_time_str)
    else:
        end_uk = None

    # time zone update: UK time differs from server time by 1 or 2 hours.
    # BUG FIX: default to the 1-hour offset; previously `to_add` was only
    # assigned when a start time existed, so an end time without a start
    # time raised a NameError below.
    hours_seconds = 60 * 60
    hours2 = 2 * hours_seconds
    to_add = timedelta(hours=1)
    if diff_start_1_seconds is not None and diff_start_1_seconds > hours2:
        to_add = timedelta(hours=2)

    start = start_uk + to_add if start_uk is not None else None
    end = end_uk + to_add if end_uk is not None else None

    for n, timestamp in enumerate(time_series):
        ds = data_by_timestamp[timestamp]
        for nd, d in enumerate(ds):
            if nd + 1 < len(ds):
                # several items share this timestamp: use the next item's
                # timestamp as the submit time
                submit_time = ds[nd + 1]['timestamp_datetime']
            elif n == len(time_series) - 1:
                # final time step: item runs until task completion (if known)
                submit_time = end
            else:
                submit_time = timestamp

            if n == 0:
                # first item starts at the (shifted) task start time (if known)
                start_time = start
            else:
                start_time = time_series[n - 1]
            if None in [start_time, submit_time]:
                duration = None
            else:
                duration = (submit_time - start_time).total_seconds()
            d['duration_in_seconds'] = duration
def get_worker_analysis_total(data_dict_list, contradictions):
    """Compute per-worker statistics over the full data set."""
    by_worker = sort_by_key(data_dict_list, ['workerid'])
    return get_worker_data(by_worker, contradictions)
# Example #19
def main():
    """
    Clean crowd annotation data by removing outlier workers (per metric,
    unit and cut-off) and write the cleaned annotations back to disk,
    one csv per original source file.
    """
    config_dict = load_config()

    parser = argparse.ArgumentParser()
    parser.add_argument("--metric", default='contradictions', type=str)
    # BUG FIX: nargs='+' already collects a list; `type` applies per
    # element. The original `type=list` split each argument into characters.
    parser.add_argument("--units",
                        default=['total', 'batch', 'pair'],
                        type=str,
                        nargs='+')
    parser.add_argument("--stds",
                        default=[0.5, 1, 1.5, 2],
                        type=float,
                        nargs='+')
    args = parser.parse_args()

    batch = config_dict['batch']
    n_q = config_dict['number_questions']
    metric = args.metric
    units = args.units
    stds = args.stds

    # hard-coded experiment configuration (overrides the CLI arguments)
    units = ['batch', 'pair', 'total']
    # BUG FIX: was 'time-below', which never matched the underscore
    # spelling checked below, so the batch-only restriction never applied.
    metric = 'time_below'
    n_stds = [0.5, 1, 1.5, 2]
    if metric == 'time_below':
        units = ['batch']
    clean = True

    configs = [
        ('3', 'experiment1'),
        ('4', 'experiment2')
    ]
    batch = '*'
    n_q = '*'
    n_lists = '*'

    runs = [conf[0] for conf in configs]
    groups = [conf[1] for conf in configs]

    all_data = []
    for run, group in configs:
        data = load_experiment_data(run,
                                    group,
                                    n_q,
                                    n_lists,
                                    batch,
                                    verbose=False)
        print(data[0].keys())
        print(data[0]['f_name_full'])
        all_data.extend(data)

    if clean:
        # clean all data for every (unit, n_std) combination
        for unit in units:
            for n_std in n_stds:
                print(len(all_data), type(all_data))
                data_clean = clean_workers(all_data, runs, groups, batch,
                                           metric, unit, n_std)
                _report_and_write(data_clean,
                                  f'clean_{metric}_{unit}_{n_std}')
    else:
        _report_and_write(all_data, 'data_processed')


def _report_and_write(data_clean, name):
    """
    Print run info for data_clean and write it back to disk, one csv per
    original source file path (with 'prolific_output' replaced by a
    directory derived from name).
    """
    print(type(data_clean), len(data_clean))
    print(data_clean[0].keys())
    # run id is assumed to be the 4th path component of f_name_full
    all_runs = set([d['f_name_full'].split('/')[3] for d in data_clean])
    print('all runs after cleaning')
    print(all_runs)
    name_dir = f'annotations_{name}'
    data_by_filepath = sort_by_key(data_clean, ['f_name_full'])
    for f, data in data_by_filepath.items():
        new_f = f.replace('prolific_output', name_dir)
        fbase = os.path.basename(new_f)
        # NOTE(review): rstrip strips a character *set*, not a suffix —
        # kept for compatibility, but os.path.dirname would be safer.
        dir_path = new_f.rstrip(fbase)
        os.makedirs(dir_path, exist_ok=True)
        header = data[0].keys()
        with open(new_f, 'w') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=header)
            writer.writeheader()
            for d in data:
                writer.writerow(d)