turk_label = turk[vid][turk_lid]
                if len(truth_sorted_lidlist) <= turk_to_truth[index]:
                    pass
                    # print "[fp]", turk_label["time"]
                    # false_positive += 1
                    # for t in turk_label["points_turk"]:
                    #     print "    Turk:", t["time"], t["desc"].encode("utf-8")
                else:
                    true_lid = truth_sorted_lidlist[turk_to_truth[index]]
                    true_label = truth[vid][true_lid]
                    distance = math.fabs(
                        float(turk_label["time"]) - float(true_label["time"]))
                    signed_distance = float(turk_label["time"]) - float(
                        true_label["time"])
                    if found is not 1 and "matched_new" not in true_label and distance <= window_size:
                        match_result = is_same_label(true_label["desc"],
                                                     turk_label["label"])
                        if match_result[0]:
                            # if True:
                            true_label["matched_new"] = True
                            # print "[m ]", '{:3.2f}'.format(float(match_result[3])),  '{:3.2f}'.format(float(turk_label["time"])), true_label["time"], distance, signed_distance""
                            print "[m ]", '{:3.2f}'.format(
                                float(turk_label["time"])
                            ), true_label["time"], distance, signed_distance
                            # print "    True:", true_label["desc"]
                            # print "    Turk:", turk_label["label"].encode("utf-8")
                            found += 1
                        # else:
                        #     print match_result[3]
                        #     print "    True:", true_label["desc"]
                        #     print "    Turk:", turk_label["label"].encode("utf-8")
 l3 = cluster[2]
 final_label = ""
 # print l1["vid"], cid
 if l1["answer"] == l2["answer"] == l3["answer"]:
     # print "[unanimous]", getLabel(l1)
     final_label = getLabel(l1)
     count_unanimous += 1
     if l1["answer"] == "noop":
         count_unanimous_noop_included += 1
     if l1["answer"] == "@":
         count_unanimous_custom_included += 1
 elif (l1["answer"] == l2["answer"]) or (l2["answer"] == l3["answer"]) or (l3["answer"] == l1["answer"]):
     if l1["answer"] == l2["answer"]:
         if l1["answer"] == "@":
             count_majority_custom_included += 1
         if is_same_label(getLabel(l1), getLabel(l3))[0]:
             count_majority_dup_string += 1
         answer_label = l1
         if l1["answer"] == "noop":
             count_majority_noop_included += 1    
     elif l2["answer"] == l3["answer"]:
         if l2["answer"] == "@":
             count_majority_custom_included += 1
         if is_same_label(getLabel(l2), getLabel(l1))[0]:
             count_majority_dup_string += 1            
         answer_label = l2
         if l2["answer"] == "noop":
             count_majority_noop_included += 1    
     elif l3["answer"] == l1["answer"]:
         if l3["answer"] == "@":
             count_majority_custom_included += 1
Esempio n. 3
0
def merge_neighbors(clusters, label_turk_list, label_turk_list_desc, sorted_turk):
    threshold = 10 # only consider neighbor labels within 10 second distance
    clone_clusters = list(clusters)
    current_cluster = "no cluster yet"
    for (i, label) in enumerate(label_turk_list):
        if i == 0:  # starting from index 1, because it's about merging i and i-1
            continue
        key = str(clone_clusters[i])

        if current_cluster != key:  # new cluster beginning
            new_cluster_count = 0
            current_cluster = key                
        # if key != "-1.0": # looking at only valid clusters (this will now include _R and _T)
        try:
            # skip if i-1 is in the same cluster as i.
            # an existing cluster doesn't need merging.
            if key == str(clone_clusters[i-1]):
                continue
            import math
            # print "\n", i, (i-1), "in cluster", clone_clusters[i-1], label_turk_list[i], label_turk_list[i-1]                        
            dist = math.fabs(label_turk_list[i] - label_turk_list[i-1])
            if i == len(label_turk_list) - 1:
                next_dist = 10000
            else:
                next_dist = math.fabs(label_turk_list[i] - label_turk_list[i+1])

            if dist > threshold:
                continue

            similarity = is_same_label(label_turk_list_desc[i], label_turk_list_desc[i-1])
            # to make sure i-1 is more similar to i than to i-2, both in terms of time and label
            if i == len(label_turk_list) - 1:
                next_similarity = [False, "", "", 0]
            else:
                next_similarity = is_same_label(label_turk_list_desc[i], label_turk_list_desc[i+1])
            # check distance between i and i-1, and between i and i+1
            # in order for merge between i and i-1 to happen, i and i-1 should be closer
            # print "[testing...]", similarity[3], next_similarity[3], dist, next_dist
            similarity_cond = similarity[0] and similarity[3] > next_similarity[3] and dist < next_dist
            if not similarity_cond:
                # print "[not similar]", similarity[1]
                continue
            
            # check if i-1's cluster contains labels by i's Turker, which violates reclustering
            tid = sorted_turk[i]["workerid"]
            tid_exists = False
            for (j, label_inner) in enumerate(label_turk_list):
                # only checking i-1's cluster
                if i >= j and clone_clusters[j] == clone_clusters[i-1]:
                    if sorted_turk[j]["workerid"] == tid:
                        tid_exists = True
                        break
            if tid_exists:
                # print "[cannot merge]", "multiple labels from", tid
                continue
            
            print "[merge]", i, (i-1), label_turk_list_desc[i], "===", label_turk_list_desc[i-1], dist, next_dist
            # case by case based on whether items are singletons or not
            prev_key = str(clone_clusters[i-1])
            is_singleton = clone_clusters.count(key) == 1
            prev_is_singleton = clone_clusters.count(prev_key) == 1
            if is_singleton and prev_is_singleton:
                clone_clusters[i] = clone_clusters[i-1] # either way works
            elif not is_singleton and prev_is_singleton:
                clone_clusters[i-1] = clone_clusters[i]
            elif is_singleton and not prev_is_singleton:
                clone_clusters[i] = clone_clusters[i-1]
            else: 
            # dealing with clusters on both sides.
            # merge to the cluster whose centroid is closer to i
                print "[clusters both sides]", clone_clusters[i], clone_clusters[i-1]
                cent = get_cluster_centroid(key, clone_clusters, label_turk_list)
                prev_cent = get_cluster_centroid(prev_key, clone_clusters, label_turk_list)
                if math.fabs(label_turk_list[i] - cent) >= math.fabs(label_turk_list[i] - prev_cent):
                    clone_clusters[i] = clone_clusters[i-1]
                else:
                    clone_clusters[i-1] = clone_clusters[i]         

        except IndexError: # ignore
            print "index error", i

    return clone_clusters
Esempio n. 4
0
                valid_turk_count += 1
                turk_lid = turk_sorted_lidlist[index]
                turk_label = turk[vid][turk_lid]        
                if len(truth_sorted_lidlist) <= turk_to_truth[index]:
                    pass
                    # print "[fp]", turk_label["time"]
                    # false_positive += 1
                    # for t in turk_label["points_turk"]: 
                    #     print "    Turk:", t["time"], t["desc"].encode("utf-8")
                else:
                    true_lid = truth_sorted_lidlist[turk_to_truth[index]]
                    true_label = truth[vid][true_lid]
                    distance = math.fabs(float(turk_label["time"]) - float(true_label["time"]))
                    signed_distance = float(turk_label["time"]) - float(true_label["time"])
                    if found is not 1 and "matched_new" not in true_label and distance <= window_size: 
                        match_result = is_same_label(true_label["desc"], turk_label["label"])
                        if match_result[0]:
                        # if True:
                            true_label["matched_new"] = True
                            # print "[m ]", '{:3.2f}'.format(float(match_result[3])),  '{:3.2f}'.format(float(turk_label["time"])), true_label["time"], distance, signed_distance""
                            print "[m ]", '{:3.2f}'.format(float(turk_label["time"])), true_label["time"], distance, signed_distance
                            # print "    True:", true_label["desc"]
                            # print "    Turk:", turk_label["label"].encode("utf-8")
                            found += 1                        
                        # else:
                        #     print match_result[3] 
                        #     print "    True:", true_label["desc"]
                        #     print "    Turk:", turk_label["label"].encode("utf-8")

            if found == 1:
                match += 1
def merge_neighbors(clusters, label_turk_list, label_turk_list_desc,
                    sorted_turk):
    threshold = 10  # only consider neighbor labels within 10 second distance
    clone_clusters = list(clusters)
    current_cluster = "no cluster yet"
    for (i, label) in enumerate(label_turk_list):
        if i == 0:  # starting from index 1, because it's about merging i and i-1
            continue
        key = str(clone_clusters[i])

        if current_cluster != key:  # new cluster beginning
            new_cluster_count = 0
            current_cluster = key
        # if key != "-1.0": # looking at only valid clusters (this will now include _R and _T)
        try:
            # skip if i-1 is in the same cluster as i.
            # an existing cluster doesn't need merging.
            if key == str(clone_clusters[i - 1]):
                continue
            import math
            # print "\n", i, (i-1), "in cluster", clone_clusters[i-1], label_turk_list[i], label_turk_list[i-1]
            dist = math.fabs(label_turk_list[i] - label_turk_list[i - 1])
            if i == len(label_turk_list) - 1:
                next_dist = 10000
            else:
                next_dist = math.fabs(label_turk_list[i] -
                                      label_turk_list[i + 1])

            if dist > threshold:
                continue

            similarity = is_same_label(label_turk_list_desc[i],
                                       label_turk_list_desc[i - 1])
            # to make sure i-1 is more similar to i than to i-2, both in terms of time and label
            if i == len(label_turk_list) - 1:
                next_similarity = [False, "", "", 0]
            else:
                next_similarity = is_same_label(label_turk_list_desc[i],
                                                label_turk_list_desc[i + 1])
            # check distance between i and i-1, and between i and i+1
            # in order for merge between i and i-1 to happen, i and i-1 should be closer
            # print "[testing...]", similarity[3], next_similarity[3], dist, next_dist
            similarity_cond = similarity[
                0] and similarity[3] > next_similarity[3] and dist < next_dist
            if not similarity_cond:
                # print "[not similar]", similarity[1]
                continue

            # check if i-1's cluster contains labels by i's Turker, which violates reclustering
            tid = sorted_turk[i]["workerid"]
            tid_exists = False
            for (j, label_inner) in enumerate(label_turk_list):
                # only checking i-1's cluster
                if i >= j and clone_clusters[j] == clone_clusters[i - 1]:
                    if sorted_turk[j]["workerid"] == tid:
                        tid_exists = True
                        break
            if tid_exists:
                # print "[cannot merge]", "multiple labels from", tid
                continue

            print "[merge]", i, (
                i - 1), label_turk_list_desc[i], "===", label_turk_list_desc[
                    i - 1], dist, next_dist
            # case by case based on whether items are singletons or not
            prev_key = str(clone_clusters[i - 1])
            is_singleton = clone_clusters.count(key) == 1
            prev_is_singleton = clone_clusters.count(prev_key) == 1
            if is_singleton and prev_is_singleton:
                clone_clusters[i] = clone_clusters[i - 1]  # either way works
            elif not is_singleton and prev_is_singleton:
                clone_clusters[i - 1] = clone_clusters[i]
            elif is_singleton and not prev_is_singleton:
                clone_clusters[i] = clone_clusters[i - 1]
            else:
                # dealing with clusters on both sides.
                # merge to the cluster whose centroid is closer to i
                print "[clusters both sides]", clone_clusters[
                    i], clone_clusters[i - 1]
                cent = get_cluster_centroid(key, clone_clusters,
                                            label_turk_list)
                prev_cent = get_cluster_centroid(prev_key, clone_clusters,
                                                 label_turk_list)
                if math.fabs(label_turk_list[i] -
                             cent) >= math.fabs(label_turk_list[i] -
                                                prev_cent):
                    clone_clusters[i] = clone_clusters[i - 1]
                else:
                    clone_clusters[i - 1] = clone_clusters[i]

        except IndexError:  # ignore
            print "index error", i

    return clone_clusters