turk_label = turk[vid][turk_lid] if len(truth_sorted_lidlist) <= turk_to_truth[index]: pass # print "[fp]", turk_label["time"] # false_positive += 1 # for t in turk_label["points_turk"]: # print " Turk:", t["time"], t["desc"].encode("utf-8") else: true_lid = truth_sorted_lidlist[turk_to_truth[index]] true_label = truth[vid][true_lid] distance = math.fabs( float(turk_label["time"]) - float(true_label["time"])) signed_distance = float(turk_label["time"]) - float( true_label["time"]) if found is not 1 and "matched_new" not in true_label and distance <= window_size: match_result = is_same_label(true_label["desc"], turk_label["label"]) if match_result[0]: # if True: true_label["matched_new"] = True # print "[m ]", '{:3.2f}'.format(float(match_result[3])), '{:3.2f}'.format(float(turk_label["time"])), true_label["time"], distance, signed_distance"" print "[m ]", '{:3.2f}'.format( float(turk_label["time"]) ), true_label["time"], distance, signed_distance # print " True:", true_label["desc"] # print " Turk:", turk_label["label"].encode("utf-8") found += 1 # else: # print match_result[3] # print " True:", true_label["desc"] # print " Turk:", turk_label["label"].encode("utf-8")
l3 = cluster[2] final_label = "" # print l1["vid"], cid if l1["answer"] == l2["answer"] == l3["answer"]: # print "[unanimous]", getLabel(l1) final_label = getLabel(l1) count_unanimous += 1 if l1["answer"] == "noop": count_unanimous_noop_included += 1 if l1["answer"] == "@": count_unanimous_custom_included += 1 elif (l1["answer"] == l2["answer"]) or (l2["answer"] == l3["answer"]) or (l3["answer"] == l1["answer"]): if l1["answer"] == l2["answer"]: if l1["answer"] == "@": count_majority_custom_included += 1 if is_same_label(getLabel(l1), getLabel(l3))[0]: count_majority_dup_string += 1 answer_label = l1 if l1["answer"] == "noop": count_majority_noop_included += 1 elif l2["answer"] == l3["answer"]: if l2["answer"] == "@": count_majority_custom_included += 1 if is_same_label(getLabel(l2), getLabel(l1))[0]: count_majority_dup_string += 1 answer_label = l2 if l2["answer"] == "noop": count_majority_noop_included += 1 elif l3["answer"] == l1["answer"]: if l3["answer"] == "@": count_majority_custom_included += 1
def merge_neighbors(clusters, label_turk_list, label_turk_list_desc, sorted_turk, threshold=10):
    """Merge neighboring labels that sit in different clusters but match.

    Walks the labels in order and, for every index ``i >= 1`` whose cluster
    differs from that of ``i - 1``, decides whether the two should share a
    cluster. A merge happens only when all of the following hold:

    * the two labels are at most ``threshold`` apart,
    * ``is_same_label`` reports them as the same (element ``[0]`` truthy),
    * ``i`` is closer to ``i - 1`` than to ``i + 1``, both in distance and in
      the value at index ``[3]`` of the ``is_same_label`` result
      (presumably a similarity score -- confirm against ``is_same_label``),
    * no label at index ``<= i`` in the cluster of ``i - 1`` was produced by
      the same worker as label ``i``.

    Args:
        clusters: per-label cluster ids, parallel to ``label_turk_list``.
            Not modified; a copy is edited and returned.
        label_turk_list: per-label numeric positions (subtracted from each
            other to get distances).
        label_turk_list_desc: per-label descriptions passed to
            ``is_same_label``.
        sorted_turk: per-label dicts; only the ``"workerid"`` key is read.
        threshold: maximum distance between ``i`` and ``i - 1`` for a merge
            to be considered. Defaults to 10, the previously hard-coded value.

    Returns:
        A new list of cluster ids with the merges applied.
    """
    import math  # function-scope so the block does not rely on a file-level import

    clone_clusters = list(clusters)
    last_index = len(label_turk_list) - 1

    def cluster_size(cluster_key):
        # Cluster ids are compared through str() everywhere else in this
        # function, so count the same way; list.count(cluster_key) would
        # never match ids that are not already strings.
        return sum(1 for c in clone_clusters if str(c) == cluster_key)

    # Start at 1 because each step looks at the pair (i - 1, i).
    for i in range(1, len(label_turk_list)):
        key = str(clone_clusters[i])
        try:
            prev_key = str(clone_clusters[i - 1])
            # Neighbors already in the same cluster need no merging.
            if key == prev_key:
                continue

            dist = math.fabs(label_turk_list[i] - label_turk_list[i - 1])
            if i == last_index:
                # No following label: use a sentinel gap so i - 1 always wins.
                next_dist = 10000
            else:
                next_dist = math.fabs(label_turk_list[i] - label_turk_list[i + 1])
            if dist > threshold:
                continue

            similarity = is_same_label(label_turk_list_desc[i], label_turk_list_desc[i - 1])
            # Compare against i + 1 as well, so that i is merged backwards
            # only when i - 1 is the better match in both distance and label.
            if i == last_index:
                next_similarity = [False, "", "", 0]
            else:
                next_similarity = is_same_label(label_turk_list_desc[i], label_turk_list_desc[i + 1])

            closer_to_prev = (
                similarity[0]
                and similarity[3] > next_similarity[3]
                and dist < next_dist
            )
            if not closer_to_prev:
                continue

            # A cluster must not hold two labels from the same worker.
            # Only labels up to index i in the cluster of i - 1 are inspected.
            tid = sorted_turk[i]["workerid"]
            prev_cluster = clone_clusters[i - 1]
            tid_exists = any(
                clone_clusters[j] == prev_cluster and sorted_turk[j]["workerid"] == tid
                for j in range(i + 1)
            )
            if tid_exists:
                continue

            print("[merge] %s %s %s === %s %s %s" % (
                i, i - 1, label_turk_list_desc[i], label_turk_list_desc[i - 1],
                dist, next_dist))

            is_singleton = cluster_size(key) == 1
            prev_is_singleton = cluster_size(prev_key) == 1
            if is_singleton:
                # i is alone: it joins the cluster of i - 1 (when both are
                # singletons either direction gives the same grouping).
                clone_clusters[i] = clone_clusters[i - 1]
            elif prev_is_singleton:
                # i - 1 is alone: it joins the cluster of i.
                clone_clusters[i - 1] = clone_clusters[i]
            else:
                # Real clusters on both sides: move one label into whichever
                # cluster has its centroid closer to label i.
                print("[clusters both sides] %s %s" % (
                    clone_clusters[i], clone_clusters[i - 1]))
                cent = get_cluster_centroid(key, clone_clusters, label_turk_list)
                prev_cent = get_cluster_centroid(prev_key, clone_clusters, label_turk_list)
                if math.fabs(label_turk_list[i] - cent) >= math.fabs(label_turk_list[i] - prev_cent):
                    clone_clusters[i] = clone_clusters[i - 1]
                else:
                    clone_clusters[i - 1] = clone_clusters[i]
        except IndexError:
            # Best effort: a parallel list shorter than label_turk_list only
            # skips this pair instead of aborting the whole pass.
            print("index error %s" % (i,))
    return clone_clusters
valid_turk_count += 1 turk_lid = turk_sorted_lidlist[index] turk_label = turk[vid][turk_lid] if len(truth_sorted_lidlist) <= turk_to_truth[index]: pass # print "[fp]", turk_label["time"] # false_positive += 1 # for t in turk_label["points_turk"]: # print " Turk:", t["time"], t["desc"].encode("utf-8") else: true_lid = truth_sorted_lidlist[turk_to_truth[index]] true_label = truth[vid][true_lid] distance = math.fabs(float(turk_label["time"]) - float(true_label["time"])) signed_distance = float(turk_label["time"]) - float(true_label["time"]) if found is not 1 and "matched_new" not in true_label and distance <= window_size: match_result = is_same_label(true_label["desc"], turk_label["label"]) if match_result[0]: # if True: true_label["matched_new"] = True # print "[m ]", '{:3.2f}'.format(float(match_result[3])), '{:3.2f}'.format(float(turk_label["time"])), true_label["time"], distance, signed_distance"" print "[m ]", '{:3.2f}'.format(float(turk_label["time"])), true_label["time"], distance, signed_distance # print " True:", true_label["desc"] # print " Turk:", turk_label["label"].encode("utf-8") found += 1 # else: # print match_result[3] # print " True:", true_label["desc"] # print " Turk:", turk_label["label"].encode("utf-8") if found == 1: match += 1
def merge_neighbors(clusters, label_turk_list, label_turk_list_desc, sorted_turk, threshold=10):
    """Merge neighboring labels that sit in different clusters but match.

    For every index ``i >= 1`` whose cluster differs from that of ``i - 1``,
    the pair is merged into one cluster only when:

    * the labels are at most ``threshold`` apart,
    * ``is_same_label`` reports them as the same (element ``[0]`` truthy),
    * ``i`` is closer to ``i - 1`` than to ``i + 1``, both in distance and in
      the value at index ``[3]`` of the ``is_same_label`` result
      (presumably a similarity score -- confirm against ``is_same_label``),
    * no label at index ``<= i`` in the cluster of ``i - 1`` comes from the
      same worker as label ``i``.

    Args:
        clusters: per-label cluster ids, parallel to ``label_turk_list``.
            Left untouched; the function edits and returns a copy.
        label_turk_list: per-label numeric positions used for distances.
        label_turk_list_desc: per-label descriptions given to
            ``is_same_label``.
        sorted_turk: per-label dicts; only ``"workerid"`` is read.
        threshold: largest distance between ``i`` and ``i - 1`` at which a
            merge is still considered. Defaults to 10 (the former constant).

    Returns:
        A new list of cluster ids with the merges applied.
    """
    import math  # local import keeps the block independent of file-level imports

    clone_clusters = list(clusters)
    total = len(label_turk_list)

    def members(cluster_key):
        # Count by string form: ids are compared via str() throughout, and
        # list.count(cluster_key) silently returns 0 for non-string ids.
        return sum(1 for c in clone_clusters if str(c) == cluster_key)

    # Index 0 has no predecessor, so pairs (i - 1, i) start at i == 1.
    for i in range(1, total):
        key = str(clone_clusters[i])
        is_last = i == total - 1
        try:
            prev_key = str(clone_clusters[i - 1])
            if key == prev_key:
                # Already clustered together.
                continue

            dist = math.fabs(label_turk_list[i] - label_turk_list[i - 1])
            # With no following label, a sentinel gap lets i - 1 win.
            next_dist = 10000 if is_last else math.fabs(
                label_turk_list[i] - label_turk_list[i + 1])
            if dist > threshold:
                continue

            similarity = is_same_label(label_turk_list_desc[i], label_turk_list_desc[i - 1])
            # The following label (i + 1) is the competitor: merge backwards
            # only if i - 1 beats it on both label similarity and distance.
            if is_last:
                next_similarity = [False, "", "", 0]
            else:
                next_similarity = is_same_label(label_turk_list_desc[i], label_turk_list_desc[i + 1])

            if not (similarity[0]
                    and similarity[3] > next_similarity[3]
                    and dist < next_dist):
                continue

            # Reject the merge if it would put two labels by one worker in
            # the same cluster. Only indices up to i are inspected.
            tid = sorted_turk[i]["workerid"]
            target_cluster = clone_clusters[i - 1]
            if any(clone_clusters[j] == target_cluster
                   and sorted_turk[j]["workerid"] == tid
                   for j in range(i + 1)):
                continue

            print("[merge] %s %s %s === %s %s %s" % (
                i, i - 1, label_turk_list_desc[i], label_turk_list_desc[i - 1],
                dist, next_dist))

            is_singleton = members(key) == 1
            prev_is_singleton = members(prev_key) == 1
            if is_singleton:
                # A lone label i joins its predecessor's cluster (for two
                # singletons the direction does not matter).
                clone_clusters[i] = clone_clusters[i - 1]
            elif prev_is_singleton:
                # A lone predecessor joins the cluster of i.
                clone_clusters[i - 1] = clone_clusters[i]
            else:
                # Both belong to multi-label clusters: reassign one label to
                # the cluster whose centroid lies closer to label i.
                print("[clusters both sides] %s %s" % (
                    clone_clusters[i], clone_clusters[i - 1]))
                cent = get_cluster_centroid(key, clone_clusters, label_turk_list)
                prev_cent = get_cluster_centroid(prev_key, clone_clusters, label_turk_list)
                own_gap = math.fabs(label_turk_list[i] - cent)
                prev_gap = math.fabs(label_turk_list[i] - prev_cent)
                if own_gap >= prev_gap:
                    clone_clusters[i] = clone_clusters[i - 1]
                else:
                    clone_clusters[i - 1] = clone_clusters[i]
        except IndexError:
            # Best effort: skip this pair when a parallel list is too short.
            print("index error %s" % (i,))
    return clone_clusters