def get_collapsed_relations(dict_list, mapping='levels', answer_name='answer'):
    """Collapse relation annotations into one boolean answer per
    (worker, property-concept pair, level).

    Relations are mapped to levels via load_rel_level_mapping; a relation
    without a mapping keeps its own name as its level. Within each
    (worker, pair, level) group the answers are OR-ed: the collapsed
    answer is True iff any annotation's answer stringifies to 'true'
    (case-insensitive).

    Parameters
    ----------
    dict_list : list[dict]
        Annotation dicts with at least 'workerid', 'property', 'concept',
        'relation', 'completionurl' and the answer field.
    mapping : str
        Name of the relation-to-level mapping to load.
    answer_name : str
        Key of the answer field in the input and output dicts.

    Returns
    -------
    list[dict]
        One dict per (worker, pair, level) carrying the collapsed answer.
    """
    collapsed_dicts = []
    level_rel_dict = load_rel_level_mapping(mapping=mapping)
    dict_list_by_worker = sort_by_key(dict_list, ['workerid'])
    for w, worker_dicts in dict_list_by_worker.items():
        dicts_by_pair = sort_by_key(worker_dicts, ['property', 'concept'])
        for p, pair_dicts in dicts_by_pair.items():
            dicts_by_level = defaultdict(list)
            for d in pair_dicts:
                rel = d['relation']
                # fall back to the relation itself when it has no level mapping
                level = level_rel_dict.get(rel, rel)
                dicts_by_level[level].append(d)
            for level, level_dicts in dicts_by_level.items():
                answers = [str(d[answer_name]).lower() for d in level_dicts]
                answer = 'true' in answers
                new_d = dict()
                new_d['quid'] = f'{p}-{level}'
                new_d['workerid'] = w
                new_d['completionurl'] = level_dicts[0]['completionurl']
                # BUG FIX: the original read 'property'/'concept' from the
                # leftover loop variable `d` of an earlier loop; it only worked
                # because all dicts of a pair share these fields. Read them
                # from this level's group explicitly.
                new_d['property'] = level_dicts[0]['property']
                new_d['concept'] = level_dicts[0]['concept']
                new_d['relation'] = level
                new_d[answer_name] = answer
                collapsed_dicts.append(new_d)
    return collapsed_dicts
def aggregate_binary_labels(data_dict_list, ct_units, ct_thresholds):
    """Aggregate crowd answers into binary labels per (pair, relation).

    For every property-concept pair (internal pairs whose key starts with
    '_' are skipped), collects proportion-based and CrowdTruth-based
    scores and attaches majority, top and per-threshold votes to each
    aggregated triple dict.

    Parameters
    ----------
    data_dict_list : list[dict]
        Annotation dicts with 'property', 'concept', 'relation' keys.
    ct_units : list[dict]
        CrowdTruth unit score dicts keyed by 'unit'.
    ct_thresholds : iterable
        Thresholds at which to compute CrowdTruth votes.

    Returns
    -------
    list[dict]
        One aggregated dict per (pair, relation).
    """
    aggregated_binary_labels = []
    data_by_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    units_by_quid = sort_by_key(ct_units, ['unit'])
    for pair, pair_dicts in data_by_pair.items():
        # internal/meta pairs are marked with a leading underscore
        if pair.startswith('_'):
            continue
        data_by_rel = sort_by_key(pair_dicts, ['relation'])
        # collect scores/propertions:
        rel_prop_dict = get_props(data_by_rel)
        rel_ct_score_dict = get_cts(data_by_rel, units_by_quid)
        rel_majority_vote = get_majority_vote(rel_prop_dict)
        rel_top_vote = get_top_vote(rel_prop_dict)
        ct_thresh_votes = {
            thresh: get_ct_vote(rel_ct_score_dict, thresh)
            for thresh in ct_thresholds
        }
        for rel, rel_data in data_by_rel.items():
            triple_dict = get_agg_dict(rel_data, pair, rel)
            triple_dict['majority_vote'] = rel_majority_vote[rel]
            triple_dict['top_vote'] = rel_top_vote[rel]
            for thresh, ct_vote in ct_thresh_votes.items():
                triple_dict[f'uas-{thresh}'] = ct_vote[rel]
            aggregated_binary_labels.append(triple_dict)
    return aggregated_binary_labels
def remove_low_quality_workers_ct(all_annotations, unit, n_stds):
    """Filter out annotations from workers with outlying CrowdTruth quality.

    Worker quality scores (wqs) are loaded from the CrowdTruth results;
    within each unit (batch, pair, or the whole set) workers whose wqs
    deviates more than `n_stds` standard deviations are removed.

    Parameters
    ----------
    all_annotations : list[dict]
        Annotation dicts with 'workerid' (plus the keys used for grouping).
    unit : str
        One of 'batch', 'pair', 'total'.
    n_stds : float
        Cut-off in standard deviations passed to filter_with_stdv.

    Returns
    -------
    list[dict]
        Annotations from workers that survived the filter.

    Raises
    ------
    ValueError
        If `unit` is not one of the supported values.
    """
    ct_workers = load_ct('*', 'experiment*', '*', 'workers', as_dict=True)
    ct_by_workers = sort_by_key(ct_workers, ['worker'])
    if unit == 'batch':
        annotations_by_unit = sort_by_key(all_annotations, ['filename', 'completionurl'])
    elif unit == 'pair':
        annotations_by_unit = sort_by_key(all_annotations, ['property', 'concept'])
    elif unit == 'total':
        annotations_by_unit = {'total': all_annotations}
    else:
        # BUG FIX: the original fell through with annotations_by_unit
        # undefined, raising an opaque NameError below.
        raise ValueError(f"unknown unit {unit!r}; expected 'batch', 'pair' or 'total'")
    clean_annotations = []
    for unit_id, annotations in annotations_by_unit.items():
        worker_dicts = []
        workers = {d['workerid'] for d in annotations}
        for w in workers:
            # NOTE(review): assumes every annotating worker appears in the
            # CrowdTruth results -- a missing worker raises KeyError here
            worker_dicts.append({'workerid': w, 'wqs': ct_by_workers[w][0]['wqs']})
        workers_to_remove = filter_with_stdv(worker_dicts, measure='wqs', n_stds=n_stds)
        for d in annotations:
            if d['workerid'] not in workers_to_remove:
                clean_annotations.append(d)
    return clean_annotations
def evaluate(gold, crowd, verbose=False):
    """Score crowd labels against gold answers per (relation, property,
    concept) triple.

    Only triples with exactly one crowd annotation are scored; others are
    reported as irregularities when `verbose` is True.

    Returns
    -------
    dict
        Keys 'f1', 'p', 'r' (weighted-average scores) and 'coverage'
        (fraction of gold items with exactly one crowd label).
    """
    key_fields = ['relation', 'property', 'concept']
    crowd_by_triple = sort_by_key(crowd, key_fields, key_type='tuple')
    gold_by_triple = sort_by_key(gold, key_fields, key_type='tuple')
    labels_gold = []
    labels_crowd = []
    cov = 0
    for t, gold_data in gold_by_triple.items():
        crowd_data = crowd_by_triple[t]
        if len(crowd_data) == 1:
            cov += 1
            labels_gold.append(str(gold_data[0]['answer']).lower().strip())
            labels_crowd.append(str(crowd_data[0]['label']).lower().strip())
        elif verbose == True:
            print('irregularities in data:', t, len(crowd_data))
    p, r, f1, support = p_r_f1(labels_gold, labels_crowd, average='weighted')
    return {
        'f1': f1,
        'p': p,
        'r': r,
        'coverage': cov / len(gold),
    }
def time_filter(
        all_annotations,
        n_std,
        direction='both',
):
    """Remove annotations from workers whose total working time per batch
    is an outlier.

    Per batch (completionurl + file), each worker's durations are summed;
    workers more than `n_std` standard deviations below ('below'),
    above ('above'), or on either side ('both') of the batch mean are
    dropped (their ids are printed).

    Parameters
    ----------
    all_annotations : list[dict]
        Annotation dicts with 'completionurl', 'f_name_full', 'workerid'
        and 'duration_in_seconds' (may be None).
    n_std : float
        Cut-off in standard deviations.
    direction : str
        'below', 'above' or 'both'.

    Returns
    -------
    list[dict]
        Annotations of the workers that were kept.
    """
    clean_annotations = []
    annotations_by_batch = sort_by_key(all_annotations, ['completionurl', 'f_name_full'])
    for batch, batch_annotations in annotations_by_batch.items():
        annotations_by_worker = sort_by_key(batch_annotations, ['workerid'])
        worker_seconds = dict()
        for worker, worker_annotations in annotations_by_worker.items():
            seconds = [
                float(a['duration_in_seconds'])
                for a in worker_annotations
                if a['duration_in_seconds'] is not None
            ]
            worker_seconds[worker] = sum(seconds)
        # BUG FIX: statistics.stdev() raises StatisticsError for fewer than
        # two data points; keep all annotations of such batches instead of
        # crashing.
        if len(worker_seconds) < 2:
            clean_annotations.extend(batch_annotations)
            continue
        mean_seconds = sum(worker_seconds.values()) / len(worker_seconds)
        std = stdev(worker_seconds.values())
        lower = mean_seconds - n_std * std
        upper = mean_seconds + n_std * std
        for worker, seconds in worker_seconds.items():
            if (direction == 'below') and (seconds < lower):
                print('worker took too little time', worker)
            elif (direction == 'above') and (seconds > upper):
                print('worker took too much time', worker)
            elif (direction == 'both') and (seconds > upper or seconds < lower):
                # BUG FIX: message hard-coded "2 stdevs" regardless of n_std
                print(f'worker outside {n_std} stdevs', worker)
            else:
                clean_annotations.extend(annotations_by_worker[worker])
    return clean_annotations
def get_pair_analysis(data_dict_list, name):
    """Compute contradiction and timing statistics per property-concept
    pair and write them to ../analyses/pairs/<name>.csv.

    Parameters
    ----------
    data_dict_list : list[dict]
        Annotation dicts with 'property', 'concept', 'workerid',
        'relation' keys.
    name : str
        Basename (without extension) of the output CSV.

    Returns
    -------
    (pandas.DataFrame, str)
        The analysis table, sorted by ratio of contradicting workers
        (descending), and the path of the written CSV file.
    """
    pair_data_dicts = []
    data_by_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    contradictions = load_contradiction_pairs()
    for pair, dl_pair in data_by_pair.items():
        d = dict()
        n_annotations = len(dl_pair)
        data_by_worker = sort_by_key(dl_pair, ['workerid'])
        cont_cnt = Counter()
        av_time_all_workers = []
        d['pair'] = pair
        workers_with_contradictions = []
        d['n_annotations'] = n_annotations
        n_workers = len(data_by_worker)
        d['n_workers'] = n_workers
        # BUG FIX: renamed from the original misspelled local
        # 'annoation_ids_with_contradictions'
        annotation_ids_with_contradictions = []
        n_possible_contradictions = 0
        for worker, dl_worker in data_by_worker.items():
            av_time_all_workers.append(get_average_time_worker(dl_worker))
            pair_worker_cont = collect_contradictions(dl_worker, contradictions, threshold=0)
            relations = [wd['relation'] for wd in dl_worker]
            # count how many contradiction pairs could occur at all for this
            # worker's set of relations
            for r1, r2 in contradictions:
                if r1 in relations and r2 in relations:
                    n_possible_contradictions += 1
            if len(pair_worker_cont) > 0:
                workers_with_contradictions.append(worker)
                annotation_ids_with_contradictions.extend(get_annotation_ids(dl_worker))
            cont_cnt.update(pair_worker_cont)
        n_contradictions = sum(cont_cnt.values())
        d['n_contradictions'] = n_contradictions
        d['n_workers_contradicting'] = len(workers_with_contradictions)
        d['ratio_workers_contradicting'] = len(workers_with_contradictions) / n_workers
        d['contradiction_annotation_ratio'] = n_contradictions / n_annotations
        d['n_possible_contradictions'] = n_possible_contradictions
        if n_possible_contradictions != 0:
            d['contradiction_poss_contradiction_ratio'] = n_contradictions / n_possible_contradictions
        else:
            d['contradiction_poss_contradiction_ratio'] = 0
        d['average_time_pair'] = sum(av_time_all_workers) / len(av_time_all_workers)
        d['workers_contradicting'] = ' '.join(workers_with_contradictions)
        workers_not_contradicting = [w for w in data_by_worker
                                     if w not in workers_with_contradictions]
        d['workers_not_contradicting'] = ' '.join(workers_not_contradicting)
        d.update(cont_cnt)
        # NOTE(review): the join separator here was a string literal broken
        # across physical lines in the source (syntactically invalid as
        # shown); reconstructed as a single space to match the other joined
        # fields above -- TODO confirm against the original file
        d['annotations_with_contradiction'] = ' '.join(annotation_ids_with_contradictions)
        pair_data_dicts.append(d)
    pair_df = pd.DataFrame(pair_data_dicts)
    # sort by contradiction to annotation ratio
    pair_df.sort_values('ratio_workers_contradicting', axis=0, ascending=False, inplace=True)
    out_dir = '../analyses/pairs/'
    os.makedirs(out_dir, exist_ok=True)
    filepath = f'{out_dir}{name}.csv'
    pair_df.to_csv(filepath, index=False)
    return pair_df, filepath
def get_worker_analysis_by_pair(data_dict_list, contradictions):
    """Compute per-worker statistics separately for each property-concept
    pair; each result dict is tagged with its pair under the 'pair' key."""
    analysis_data_dicts = []
    data_by_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    for pair, pair_data in data_by_pair.items():
        grouped = sort_by_key(pair_data, ['workerid'])
        worker_dicts = get_worker_data(grouped, contradictions)
        for wd in worker_dicts:
            wd['pair'] = pair
        analysis_data_dicts.extend(worker_dicts)
    return analysis_data_dicts
def get_evaluation_instances(crowd, gold, verbose=False):
    """Collect crowd annotations for every (relation, property, concept)
    triple that occurs in the gold data.

    Prints the number of gold triples, crowd triples and collected
    instances before returning.
    """
    key_fields = ['relation', 'property', 'concept']
    triples_gold = sort_by_key(gold, key_fields, key_type='tuple')
    triples_crowd = sort_by_key(crowd, key_fields, key_type='tuple')
    evaluation_instances_crowd = []
    for t in triples_gold:
        crowd_instances = triples_crowd[t]
        evaluation_instances_crowd.extend(crowd_instances)
        # report gold triples with no crowd coverage
        if len(crowd_instances) == 0 and verbose == True:
            print(t, 'no data')
    print(len(triples_gold), len(triples_crowd), len(evaluation_instances_crowd))
    return evaluation_instances_crowd
def get_worker_analysis_by_batch(data_dict_list, contradictions):
    """Compute per-worker statistics separately for each batch (file +
    completion url); each result dict is tagged under 'filename-url'."""
    analysis_data_dicts = []
    data_by_batch = sort_by_key(data_dict_list, ['filename', 'completionurl'])
    for f_url, batch_data in data_by_batch.items():
        grouped = sort_by_key(batch_data, ['workerid'])
        worker_dicts = get_worker_data(grouped, contradictions)
        for wd in worker_dicts:
            wd['filename-url'] = f_url
        analysis_data_dicts.extend(worker_dicts)
    return analysis_data_dicts
def remove_contradictory_annotations(all_annotations):
    """Keep only annotations from (pair, worker) groups that contain no
    contradictory relation judgments.

    Parameters
    ----------
    all_annotations : list[dict]
        Annotation dicts with 'property', 'concept', 'workerid' keys.

    Returns
    -------
    list[dict]
        Annotations of contradiction-free (pair, worker) groups.
    """
    clean_annotations = []
    contradictions = load_contradiction_pairs()
    annotations_by_unit = sort_by_key(all_annotations, ['property', 'concept'])
    for pair, pair_annotations in annotations_by_unit.items():
        # BUG FIX: the original grouped `all_annotations` by worker here, so
        # each worker's *entire* annotation set was re-checked -- and re-added
        # to the output -- once per pair, producing duplicates. Group only
        # this pair's annotations.
        annotations_per_worker = sort_by_key(pair_annotations, ['workerid'])
        for w, worker_annotations in annotations_per_worker.items():
            pair_contradictions = collect_contradictions(worker_annotations, contradictions, threshold=0)
            if len(pair_contradictions) == 0:
                clean_annotations.extend(worker_annotations)
    return clean_annotations
def get_agreement(dict_list_out, collapse_relations=False, v=True, disable_kappa=False):
    """Compute inter-annotator agreement statistics.

    Parameters
    ----------
    dict_list_out : list[dict]
        Annotation dicts (consumed by create_matrix / sort_by_key).
    collapse_relations : bool or str
        If truthy, relations are first collapsed via
        get_collapsed_relations with this value as the mapping name.
    v : bool
        Print the scores when True.
    disable_kappa : bool
        Skip the (expensive) pairwise Cohen's kappa computation.

    Returns
    -------
    dict
        Keys 'Krippendorff', 'Proportional' and 'Av_Cohens_kappa'
        ('-' when kappa is disabled or no per-file data exists).
    """
    agreement_dict = dict()
    if collapse_relations != False:
        dict_list_out = get_collapsed_relations(dict_list_out, collapse_relations)
    matrix = create_matrix(dict_list_out)
    ratingtask = agreement.AnnotationTask(data=matrix)
    alpha = ratingtask.alpha()
    prop = proportional_agreement_pairs(matrix)
    # Calculate kappa by file (not over entire set)
    data_by_file = sort_by_key(dict_list_out, ['completionurl'])
    # BUG FIX: the original guard read the loop variable `kappa` after the
    # loop, raising NameError when data_by_file was empty, and reported '-'
    # instead of 0.0 when every per-file kappa happened to be zero.
    if disable_kappa == False and len(data_by_file) != 0:
        total_kappa = 0.0
        for f, d_list in data_by_file.items():
            kappa = get_average_kappa(create_matrix(d_list))
            # treat undefined (NaN) kappas as 0 so they do not poison the mean
            if np.isnan(kappa):
                kappa = 0.0
            total_kappa += kappa
        average_kappa = total_kappa / len(data_by_file)
    else:
        average_kappa = '-'
    if v == True:
        print(f"Krippendorff's alpha: {alpha}")
        print(f"Average Cohen's Kappa (pairwise): {average_kappa}")
        print(f"Proportional agreement (pairwise): {prop}")
        print()
    agreement_dict['Krippendorff'] = alpha
    agreement_dict['Proportional'] = prop
    agreement_dict['Av_Cohens_kappa'] = average_kappa
    return agreement_dict
def remove_contradicting_workers(all_annotations, dict_list_workers, unit, n_stds):
    """Remove annotations from workers whose contradiction ratio is an
    outlier within their unit.

    Parameters
    ----------
    all_annotations : list[dict]
        Annotation dicts with 'workerid', 'f_name_full' and the keys used
        for grouping per unit.
    dict_list_workers : list[dict]
        Per-worker statistics dicts (as produced by the worker-analysis
        functions), carrying 'contradiction_poss_contradiction_ratio'.
    unit : str
        'batch', 'pair' or 'total'.
        NOTE(review): any other value leaves the grouping variables
        undefined and raises NameError below -- confirm callers only pass
        these three values.
    n_stds : float
        Cut-off in standard deviations passed to filter_with_stdv.

    Returns
    -------
    list[dict]
        Annotations from workers that were not filtered out.
    """
    # debugging output: which runs are present before cleaning
    print('all runs before cleaning')
    all_runs = set([d['f_name_full'].split('/')[3] for d in all_annotations])
    print(all_runs)
    if unit == 'batch':
        annotations_by_unit = sort_by_key(all_annotations, ['filename', 'completionurl'])
        workers_by_unit = sort_by_key(dict_list_workers, ['filename-url'])
        print('annotations_by_unit', len(annotations_by_unit))
        print('workers by unit', len(workers_by_unit))
        # sanity check: every annotation unit id should also appear in the
        # worker statistics
        print('comparing unit ids')
        for u, annotations in annotations_by_unit.items():
            if u in workers_by_unit:
                continue
            else:
                print('not found', u)
    elif unit == 'pair':
        annotations_by_unit = sort_by_key(all_annotations, ['property', 'concept'])
        workers_by_unit = sort_by_key(dict_list_workers, ['pair'])
    elif unit == 'total':
        # single pseudo-unit covering the whole data set
        annotations_by_unit = dict()
        annotations_by_unit['total'] = all_annotations
        workers_by_unit = dict()
        workers_by_unit['total'] = dict_list_workers
    clean_annotations = []
    print('removing workers')
    for unit_id, workers in workers_by_unit.items():
        # outlier workers by contradiction-to-possible-contradiction ratio
        workers_to_remove = filter_with_stdv(
            workers,
            measure='contradiction_poss_contradiction_ratio',
            n_stds=n_stds)
        #print(unit_id, len(workers_to_remove))
        annotations = annotations_by_unit[unit_id]
        for d in annotations:
            worker = d['workerid']
            if worker not in workers_to_remove:
                clean_annotations.append(d)
            #if unit_id == 'dangerous-scalpel':
            #    print('remove:', worker in workers_to_remove, worker)
    return clean_annotations
def add_duration_info(dict_list_out_batch):
    """
    Add duration info per item after time info has been added from
    prolific summary data.
    """
    # divide data by worker; get_duration mutates each worker's dicts in place
    data_by_worker = sort_by_key(dict_list_out_batch, ['workerid'])
    for worker_data in data_by_worker.values():
        get_duration(worker_data)
def remove_singletons(data_dict_list, v=False):
    """Drop property-concept pairs that were annotated with only a single
    relation, unless that singleton is a test/check item.

    Parameters
    ----------
    data_dict_list : list[dict]
        Annotation dicts with 'property', 'concept', 'relation', 'quid'.
    v : bool
        Print filter decisions and before/after counts when True.

    Returns
    -------
    list[dict]
        Annotations with non-test singleton pairs removed.
    """
    clean_data = []
    data_by_pair = sort_by_key(data_dict_list, ['property', 'concept'])
    for pair, dl in data_by_pair.items():
        data_by_relation = sort_by_key(dl, ['relation'])
        if len(data_by_relation) == 1:
            relation = next(iter(data_by_relation.keys()))
            quid = next(iter(data_by_relation.values()))[0]['quid']
            # test/check items are allowed to be singletons
            is_test_item = (relation == '_check'
                            or relation.startswith('test_')
                            or quid.startswith('test'))
            if not is_test_item:
                if v == True:
                    print('Filter true')
                continue
        clean_data.extend(dl)
    if v == True:
        print(f'number of questions: {len(data_dict_list)}')
        print(f'number of questions without singletons: {len(clean_data)}')
    return clean_data
def get_worker_data(data_by_worker, contradictions):
    """Build one statistics dict per worker: contradiction counts and
    ratios, test/check failures, timing and annotation ids.

    Parameters
    ----------
    data_by_worker : dict[str, list[dict]]
        Annotations grouped by worker id.
    contradictions : list[tuple]
        Pairs of mutually contradictory relations.

    Returns
    -------
    list[dict]
        One statistics dict per worker.
    """
    worker_data_dicts = []
    for worker, dl_worker in data_by_worker.items():
        n_annotations = len(dl_worker)
        fails = get_tests_and_checks(dl_worker)
        stats = dict()
        stats['workerid'] = worker
        stats['n_annotations'] = n_annotations
        cont_cnt = Counter()
        data_by_pair = sort_by_key(dl_worker, ['property', 'concept'])
        n_possible_contradictions = 0
        pairs_with_cont = 0
        for pair, dl_pair in data_by_pair.items():
            pair_contradictions = collect_contradictions(dl_pair, contradictions, threshold=0)
            relations = [a['relation'] for a in dl_pair]
            # count contradiction pairs that could occur given the relations
            # this worker annotated for the pair
            for r1, r2 in contradictions:
                if r1 in relations and r2 in relations:
                    n_possible_contradictions += 1
            cont_cnt.update(pair_contradictions)
            if len(pair_contradictions) != 0:
                pairs_with_cont += 1
        n_contradictions = sum(cont_cnt.values())
        stats['n_contradictions'] = n_contradictions
        stats['n_fails'] = len(fails)
        stats['contradiction_annotation_ratio'] = n_contradictions / n_annotations
        stats['n_possible_contradictions'] = n_possible_contradictions
        if n_possible_contradictions != 0:
            stats['contradiction_poss_contradiction_ratio'] = n_contradictions / n_possible_contradictions
        else:
            stats['contradiction_poss_contradiction_ratio'] = 0
        stats['fail_annotation_ratio'] = len(fails) / n_annotations
        stats['contradictory_pairs_ratio'] = pairs_with_cont / len(data_by_pair)
        stats['average_time_question'] = get_average_time_worker(dl_worker)
        stats['annotations'] = ' '.join(get_annotation_ids(dl_worker))
        # normalize number of contradictions per type by total number of
        # possible contradictions
        for cont, cnt in cont_cnt.items():
            if n_possible_contradictions != 0:
                stats[cont] = cnt / n_possible_contradictions
            else:
                stats[cont] = 0
        worker_data_dicts.append(stats)
    return worker_data_dicts
def main():
    """Evaluate crowd configurations against gold data -- overall and split
    by expected (dis)agreement -- and write results to CSV files."""
    # evaluate total:
    gold = load_gold_data()
    print(gold[0].keys())
    print(len(gold))
    # remove no gold:
    gold = [d for d in gold if d['answer'] != 'NOGOLD']
    results_dicts = evaluate_configs(gold)
    df = pd.DataFrame(results_dicts).sort_values('f1', ascending=False)
    df.to_csv('../evaluation/evaluation_accuracy_full_update.csv')

    # evaluate expectation sets:
    # evaluate agree category:
    gold = load_gold_data()
    gold_by_agreement = sort_by_key(gold, ['expected_agreement'])
    gold_agree = gold_by_agreement['agreement']
    gold_poss_disagree = gold_by_agreement['possible_disagreement']
    gold_disagree = gold_by_agreement['disagreement']
    print('gold agree', len(gold_agree))
    print('gold poss disagree', len(gold_poss_disagree))
    print('gold disagree', len(gold_disagree))
    # merge possible with certain disagreement
    gold_disagree_all = gold_poss_disagree + gold_disagree
    results_dicts_agree = evaluate_configs(gold_agree)
    results_dicts_disagree = evaluate_configs(gold_disagree_all)
    for d in results_dicts_agree:
        d['behav.'] = 'agree'
    for d in results_dicts_disagree:
        d['behav.'] = 'disagree'
    overview_dicts_total = results_dicts_agree + results_dicts_disagree
    df = pd.DataFrame(overview_dicts_total).sort_values('f1', ascending=False)
    df.to_csv('../evaluation/evaluation_accuracy_agree_disagree_update.csv')
def get_duration(worker_data):
    """Reconstruct and store a per-item duration (in seconds) for one
    worker's annotations, mutating each dict's 'duration_in_seconds'.

    Timestamps of the individual items are combined with the Prolific
    task-level 'started_datetime'/'completed_datetime' to derive per-item
    intervals. The task-level times are shifted by 1 or 2 hours to align
    time zones, choosing the offset from the gap between the task start
    and the first item timestamp.

    Parameters
    ----------
    worker_data : list[dict]
        One worker's annotation dicts; must contain 'timestamp_datetime'
        (datetime), 'started_datetime' and 'completed_datetime'
        (ISO-format strings or None).
        NOTE(review): if 'started_datetime' is None while
        'completed_datetime' is set, `to_add` below is never bound and a
        NameError is raised -- confirm both fields are always present or
        absent together.
    """
    # sort data by time stamp to create sequence
    data_by_timestamp = sort_by_key(worker_data, ['timestamp_datetime'])
    time_series = sorted(data_by_timestamp.keys())
    first_timestamp = time_series[0]
    # get task start and end time from any of the dicts
    start_time_str = worker_data[0]['started_datetime']
    end_time_str = worker_data[0]['completed_datetime']
    # get start and end as datetime objects
    if not start_time_str is None:
        start_uk = datetime.fromisoformat(start_time_str)
        # gap between task start and first item timestamp, used below to
        # pick the timezone offset
        diff_start_1 = first_timestamp - start_uk
        diff_start_1_seconds = diff_start_1.total_seconds()
    else:
        start_uk = None
        diff_start_1_seconds = None
    if not end_time_str is None:
        end_uk = datetime.fromisoformat(end_time_str)
        # adapt time zone from uk to our server
    else:
        end_uk = None
    #print('-----time zone update ----')
    hours_seconds = 60 * 60
    hours1 = hours_seconds
    hours2 = 2 * hours_seconds
    if not diff_start_1_seconds is None:
        # a gap of more than two hours implies a 2-hour timezone offset,
        # otherwise assume 1 hour
        if diff_start_1_seconds > hours2:
            #print('2 hour difference')
            to_add = timedelta(hours=2)
        else:
            #print('1 hour difference')
            to_add = timedelta(hours=1)
    if not start_uk is None:
        start = start_uk + to_add
    else:
        start = None
    if not end_uk is None:
        end = end_uk + to_add
    else:
        end = None
    #print(start, end)
    #print('---------------------------')
    # walk the timestamp sequence and derive a duration for each item
    for n, timestamp in enumerate(time_series):
        ds = data_by_timestamp[timestamp]
        for nd, d in enumerate(ds):
            if nd + 1 < len(ds):
                # several items share this timestamp: use the next item's
                # timestamp as the submit time
                # print('submitted at same time?')
                submit_time = ds[nd + 1]['timestamp_datetime']
            elif n == len(time_series) - 1:
                # last timestamp: item runs until task completion
                # print('final time step: time stamp to submit time')
                if not end is None:
                    submit_time = end
                else:
                    submit_time = None
            else:
                submit_time = timestamp
            if n == 0:
                # first timestamp: item starts at (shifted) task start
                if not start is None:
                    # print('assign start to start')
                    start_time = start
                else:
                    start_time = None
            else:
                start_time = time_series[n - 1]
            if None in [start_time, submit_time]:
                duration = None
            else:
                duration = (submit_time - start_time).total_seconds()
            #print(start_time, submit_time, duration)
            d['duration_in_seconds'] = duration
def get_worker_analysis_total(data_dict_list, contradictions):
    """Compute per-worker statistics over the entire data set (no split by
    batch or pair)."""
    grouped = sort_by_key(data_dict_list, ['workerid'])
    return get_worker_data(grouped, contradictions)
def _write_annotation_files(data_clean, name_dir):
    """Write annotation dicts back to per-file CSVs, mirroring the source
    directory layout with 'prolific_output' replaced by `name_dir`."""
    data_by_filepath = sort_by_key(data_clean, ['f_name_full'])
    for f, data in data_by_filepath.items():
        new_f = f.replace('prolific_output', name_dir)
        # BUG FIX: the original used new_f.rstrip(fbase), but str.rstrip
        # strips a character *set*, not a suffix, and could eat trailing
        # path characters; use os.path.dirname instead.
        dir_path = os.path.dirname(new_f)
        if not os.path.isdir(dir_path):
            os.makedirs(dir_path)
        header = data[0].keys()
        with open(new_f, 'w') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=header)
            writer.writeheader()
            for d in data:
                writer.writerow(d)


def main():
    """Load the configured experiment data, optionally remove low-quality
    workers per (metric, unit, n_std) combination, and write the cleaned
    annotations back to CSV files."""
    config_dict = load_config()
    parser = argparse.ArgumentParser()
    parser.add_argument("--metric", default='contradictions', type=str)
    # BUG FIX: the original passed type=list together with nargs='+', which
    # splits each argv token into single characters; nargs='+' already
    # collects the tokens into a list.
    parser.add_argument("--units", default=['total', 'batch', 'pair'], type=str, nargs='+')
    parser.add_argument("--stds", default=[0.5, 1, 1.5, 2], type=float, nargs='+')
    args = parser.parse_args()
    batch = config_dict['batch']
    n_q = config_dict['number_questions']
    metric = args.metric
    units = args.units
    stds = args.stds
    # NOTE(review): the CLI values above are immediately overridden by the
    # hard-coded settings below (behavior kept from the original).
    units = ['batch', 'pair', 'total']
    metric = 'time-below'
    n_stds = [0.5, 1, 1.5, 2]
    # BUG FIX: the original compared against 'time_below' (underscore),
    # which can never match the hyphenated value assigned above, so the
    # unit restriction never took effect -- TODO confirm intended spelling.
    if metric == 'time-below':
        units = ['batch']
    clean = True
    configs = [
        ('3', 'experiment1'),
        ('4', 'experiment2'),
    ]
    batch = '*'
    n_q = '*'
    n_lists = '*'
    runs = [conf[0] for conf in configs]
    groups = [conf[1] for conf in configs]
    all_data = []
    for run, group in configs:
        data = load_experiment_data(run, group, n_q, n_lists, batch, verbose=False)
        print(data[0].keys())
        print(data[0]['f_name_full'])
        all_data.extend(data)
    if clean:
        # clean all data for every (unit, n_std) combination
        for unit in units:
            for n_std in n_stds:
                print(len(all_data), type(all_data))
                data_clean = clean_workers(all_data, runs, groups, batch, metric, unit, n_std)
                print(type(data_clean), len(data_clean))
                print(data_clean[0].keys())
                all_runs = set(
                    [d['f_name_full'].split('/')[3] for d in data_clean])
                print('all runs after cleaning')
                print(all_runs)
                name = f'clean_{metric}_{unit}_{n_std}'
                _write_annotation_files(data_clean, f'annotations_{name}')
    else:
        data_clean = all_data
        print(type(data_clean), len(data_clean))
        print(data_clean[0].keys())
        all_runs = set([d['f_name_full'].split('/')[3] for d in data_clean])
        print('all runs after cleaning')
        print(all_runs)
        _write_annotation_files(data_clean, 'annotations_data_processed')