def _clustroid_and_radius(emails):
    """Find the clustroid of *emails* and its root-mean-square distance.

    The clustroid is the member whose summed squared distance to the other
    members (``Distance.distance`` in "extreme" mode) is smallest; the first
    such member wins ties. Members that compare equal to the candidate are
    skipped when summing. Progress is printed for every candidate.

    Args:
        emails: Sized, re-iterable collection of email objects.

    Returns:
        A ``(clustroid, radius)`` tuple, where ``radius`` is the square root
        of the clustroid's summed squared distance divided by
        ``len(emails) - 1``. An empty group yields ``(None, 0)``; a group
        with a single member yields that member with radius ``0.0``.
    """
    total = len(emails)
    others = total - 1
    # Infinity (rather than sys.maxint) so any finite row sum can win.
    best_rowsum = float("inf")
    clustroid = None
    radius = 0
    for position, email in enumerate(emails, 1):
        print("Calculating on email " + str(position) + " of " + str(total))
        rowsum = 0
        for other in emails:
            if email == other:
                continue
            rowsum += Distance.distance(email, other, "extreme") ** 2
        if rowsum < best_rowsum:
            best_rowsum = rowsum
            clustroid = email
            # A lone member has no neighbours, so avoid dividing by zero.
            # float() stops Python 2 from flooring integer distances.
            radius = sqrt(float(rowsum) / others) if others else 0.0
    return clustroid, radius


def v_correlation(cluster_list, dicts):
    """Measure how much the clustered emails and the dictionaries overlap.

    For each group the clustroid and its root-mean-square distance to the
    rest of the group are computed. An email counts as overlapping when its
    "extreme" distance to the *other* group's clustroid is strictly smaller
    than that other group's RMS distance. While scanning ``dicts``, the clues
    of the four known dictionary files are collected (matched by tag) and
    printed with the other diagnostics.

    Args:
        cluster_list: Sized collection of clustered email objects.
        dicts: Sized collection of dictionary entries exposing ``tag`` and
            ``clues`` attributes.

    Returns:
        float: ``(cluster overlap + dictionary overlap) / total emails``,
        or ``0.0`` when both groups are empty.
    """
    # Single-argument print(...) reads the same as a Python 2 statement and
    # as a Python 3 call, so the output is unchanged either way.
    print("Calculating Clustered Data Clustroid...")
    p_clustroid, p_avgdistance = _clustroid_and_radius(cluster_list)

    print("Calculating Dictionary Data Clustroid...")
    # One slot per known dictionary file, filled from the matching entry.
    dict_list = [[], [], [], []]
    for email in dicts:
        if "dictionary3.spam.txt" in email.tag:
            dict_list[0] = email.clues
            # Sanity check only: stripped when Python runs with -O.
            assert len(email.clues) > 0
        elif "wordlist3.spam.txt" in email.tag:
            dict_list[1] = email.clues
        elif "words3.spam.txt" in email.tag:
            dict_list[2] = email.clues
        elif "wordsEn3.spam.txt" in email.tag:
            dict_list[3] = email.clues
    m_clustroid, m_avgdistance = _clustroid_and_radius(dicts)

    print("Calculating Overlap...")
    p_size = 0
    # An empty dictionary group has no clustroid to measure against.
    if m_clustroid is not None:
        for position, email in enumerate(cluster_list, 1):
            distance = Distance.distance(email, m_clustroid, "extreme")
            print("Scanning Clustered Email " + str(position) + " of "
                  + str(len(cluster_list)) + " with distance "
                  + str(distance))
            if distance < m_avgdistance:
                p_size += 1

    m_size = 0
    # Likewise, an empty cluster group has no clustroid.
    if p_clustroid is not None:
        for position, email in enumerate(dicts, 1):
            distance = Distance.distance(email, p_clustroid, "extreme")
            print("Scanning Dictionary Email " + str(position) + " of "
                  + str(len(dicts)) + " with distance " + str(distance))
            if distance < p_avgdistance:
                m_size += 1

    total_size = len(cluster_list) + len(dicts)
    print("Total Size: " + str(total_size))
    print("Size of Cluster Overlap: " + str(p_size))
    print("Size of Dictionary Overlap: " + str(m_size))
    print("Cluster average distance: " + str(p_avgdistance))
    print("Dictionary average distance: " + str(m_avgdistance))
    print("Dictionary Clues: " + str(dict_list))

    if not total_size:
        # Nothing to compare; avoid dividing by zero.
        return 0.0
    return (float(p_size) + float(m_size)) / float(total_size)
def _clustroid_and_radius(emails):
    """Find the clustroid of *emails* and its root-mean-square distance.

    The clustroid is the member whose summed squared distance to the other
    members (``Distance.distance`` in "extreme" mode) is smallest; the first
    such member wins ties. Members that compare equal to the candidate are
    skipped when summing. Progress is printed for every candidate.

    Args:
        emails: Sized, re-iterable collection of email objects.

    Returns:
        A ``(clustroid, radius)`` tuple, where ``radius`` is the square root
        of the clustroid's summed squared distance divided by
        ``len(emails) - 1``. An empty group yields ``(None, 0)``; a group
        with a single member yields that member with radius ``0.0``.
    """
    total = len(emails)
    others = total - 1
    # Infinity (rather than sys.maxint) so any finite row sum can win.
    best_rowsum = float("inf")
    clustroid = None
    radius = 0
    for position, email in enumerate(emails, 1):
        print("Calculating on email " + str(position) + " of " + str(total))
        rowsum = 0
        for other in emails:
            if email == other:
                continue
            rowsum += Distance.distance(email, other, "extreme") ** 2
        if rowsum < best_rowsum:
            best_rowsum = rowsum
            clustroid = email
            # A lone member has no neighbours, so avoid dividing by zero.
            # float() stops Python 2 from flooring integer distances.
            radius = sqrt(float(rowsum) / others) if others else 0.0
    return clustroid, radius


def v_correlation(polluted, mislabeled):
    """Measure how much the polluted and mislabeled email groups overlap.

    For each group the clustroid and its root-mean-square distance to the
    rest of the group are computed. An email counts as overlapping when its
    "extreme" distance to the *other* group's clustroid is strictly smaller
    than that other group's RMS distance. Each group's RMS distance is
    averaged over that group's own size.

    Args:
        polluted: Sized collection of polluted email objects.
        mislabeled: Sized collection of mislabeled email objects.

    Returns:
        float: ``(polluted overlap + mislabeled overlap) / total emails``,
        or ``0.0`` when both groups are empty.
    """
    # Single-argument print(...) reads the same as a Python 2 statement and
    # as a Python 3 call, so the output is unchanged either way.
    print("Calculating Polluted Data Clustroid...")
    p_clustroid, p_avgdistance = _clustroid_and_radius(polluted)

    print("Calculating Mislabeled Data Clustroid...")
    m_clustroid, m_avgdistance = _clustroid_and_radius(mislabeled)

    print("Calculating Overlap...")
    p_size = 0
    # An empty mislabeled group has no clustroid to measure against.
    if m_clustroid is not None:
        for position, email in enumerate(polluted, 1):
            print("Scanning Polluted Email " + str(position) + " of "
                  + str(len(polluted)))
            distance = Distance.distance(email, m_clustroid, "extreme")
            if distance < m_avgdistance:
                p_size += 1

    m_size = 0
    # Likewise, an empty polluted group has no clustroid.
    if p_clustroid is not None:
        for position, email in enumerate(mislabeled, 1):
            print("Scanning Mislabeled Email " + str(position) + " of "
                  + str(len(mislabeled)))
            distance = Distance.distance(email, p_clustroid, "extreme")
            if distance < p_avgdistance:
                m_size += 1

    total_size = len(polluted) + len(mislabeled)
    print("Total Size: " + str(total_size))
    print("Size of Polluted Overlap: " + str(p_size))
    print("Size of Mislabeled Overlap: " + str(m_size))

    if not total_size:
        # Nothing to compare; avoid dividing by zero.
        return 0.0
    return (float(p_size) + float(m_size)) / float(total_size)
def _clustroid_and_radius(emails):
    """Find the clustroid of *emails* and its root-mean-square distance.

    The clustroid is the member whose summed squared distance to the other
    members (``Distance.distance`` in "extreme" mode) is smallest; the first
    such member wins ties. Members that compare equal to the candidate are
    skipped when summing. Progress is printed for every candidate.

    Args:
        emails: Sized, re-iterable collection of email objects.

    Returns:
        A ``(clustroid, radius)`` tuple, where ``radius`` is the square root
        of the clustroid's summed squared distance divided by
        ``len(emails) - 1``. An empty group yields ``(None, 0)``; a group
        with a single member yields that member with radius ``0.0``.
    """
    total = len(emails)
    others = total - 1
    # Infinity (rather than sys.maxint) so any finite row sum can win.
    best_rowsum = float("inf")
    clustroid = None
    radius = 0
    for position, email in enumerate(emails, 1):
        print("Calculating on email " + str(position) + " of " + str(total))
        rowsum = 0
        for other in emails:
            if email == other:
                continue
            rowsum += Distance.distance(email, other, "extreme") ** 2
        if rowsum < best_rowsum:
            best_rowsum = rowsum
            clustroid = email
            # A lone member has no neighbours, so avoid dividing by zero.
            # float() stops Python 2 from flooring integer distances.
            radius = sqrt(float(rowsum) / others) if others else 0.0
    return clustroid, radius


def v_correlation(polluted, mislabeled):
    """Measure how much the polluted and mislabeled email groups overlap.

    For each group the clustroid and its root-mean-square distance to the
    rest of the group are computed. An email counts as overlapping when its
    "extreme" distance to the *other* group's clustroid is strictly smaller
    than that other group's RMS distance. Each group's RMS distance is
    averaged over that group's own size.

    Args:
        polluted: Sized collection of polluted email objects.
        mislabeled: Sized collection of mislabeled email objects.

    Returns:
        float: ``(polluted overlap + mislabeled overlap) / total emails``,
        or ``0.0`` when both groups are empty.
    """
    # Single-argument print(...) reads the same as a Python 2 statement and
    # as a Python 3 call, so the output is unchanged either way.
    print("Calculating Polluted Data Clustroid...")
    p_clustroid, p_avgdistance = _clustroid_and_radius(polluted)

    print("Calculating Mislabeled Data Clustroid...")
    m_clustroid, m_avgdistance = _clustroid_and_radius(mislabeled)

    print("Calculating Overlap...")
    p_size = 0
    # An empty mislabeled group has no clustroid to measure against.
    if m_clustroid is not None:
        for position, email in enumerate(polluted, 1):
            print("Scanning Polluted Email " + str(position) + " of "
                  + str(len(polluted)))
            distance = Distance.distance(email, m_clustroid, "extreme")
            if distance < m_avgdistance:
                p_size += 1

    m_size = 0
    # Likewise, an empty polluted group has no clustroid.
    if p_clustroid is not None:
        for position, email in enumerate(mislabeled, 1):
            print("Scanning Mislabeled Email " + str(position) + " of "
                  + str(len(mislabeled)))
            distance = Distance.distance(email, p_clustroid, "extreme")
            if distance < p_avgdistance:
                m_size += 1

    total_size = len(polluted) + len(mislabeled)
    print("Total Size: " + str(total_size))
    print("Size of Polluted Overlap: " + str(p_size))
    print("Size of Mislabeled Overlap: " + str(m_size))

    if not total_size:
        # Nothing to compare; avoid dividing by zero.
        return 0.0
    return (float(p_size) + float(m_size)) / float(total_size)
def _clustroid_and_radius(emails):
    """Find the clustroid of *emails* and its root-mean-square distance.

    The clustroid is the member whose summed squared distance to the other
    members (``Distance.distance`` in "extreme" mode) is smallest; the first
    such member wins ties. Members that compare equal to the candidate are
    skipped when summing. Progress is printed for every candidate.

    Args:
        emails: Sized, re-iterable collection of email objects.

    Returns:
        A ``(clustroid, radius)`` tuple, where ``radius`` is the square root
        of the clustroid's summed squared distance divided by
        ``len(emails) - 1``. An empty group yields ``(None, 0)``; a group
        with a single member yields that member with radius ``0.0``.
    """
    total = len(emails)
    others = total - 1
    # Infinity (rather than sys.maxint) so any finite row sum can win.
    best_rowsum = float("inf")
    clustroid = None
    radius = 0
    for position, email in enumerate(emails, 1):
        print("Calculating on email " + str(position) + " of " + str(total))
        rowsum = 0
        for other in emails:
            if email == other:
                continue
            rowsum += Distance.distance(email, other, "extreme") ** 2
        if rowsum < best_rowsum:
            best_rowsum = rowsum
            clustroid = email
            # A lone member has no neighbours, so avoid dividing by zero.
            # float() stops Python 2 from flooring integer distances.
            radius = sqrt(float(rowsum) / others) if others else 0.0
    return clustroid, radius


def v_correlation(cluster_list, dicts):
    """Measure how much the clustered emails and the dictionaries overlap.

    For each group the clustroid and its root-mean-square distance to the
    rest of the group are computed. An email counts as overlapping when its
    "extreme" distance to the *other* group's clustroid is strictly smaller
    than that other group's RMS distance. While scanning ``dicts``, the clues
    of the four known dictionary files are collected (matched by tag) and
    printed with the other diagnostics.

    Args:
        cluster_list: Sized collection of clustered email objects.
        dicts: Sized collection of dictionary entries exposing ``tag`` and
            ``clues`` attributes.

    Returns:
        float: ``(cluster overlap + dictionary overlap) / total emails``,
        or ``0.0`` when both groups are empty.
    """
    # Single-argument print(...) reads the same as a Python 2 statement and
    # as a Python 3 call, so the output is unchanged either way.
    print("Calculating Clustered Data Clustroid...")
    p_clustroid, p_avgdistance = _clustroid_and_radius(cluster_list)

    print("Calculating Dictionary Data Clustroid...")
    # One slot per known dictionary file, filled from the matching entry.
    dict_list = [[], [], [], []]
    for email in dicts:
        if "dictionary3.spam.txt" in email.tag:
            dict_list[0] = email.clues
            # Sanity check only: stripped when Python runs with -O.
            assert len(email.clues) > 0
        elif "wordlist3.spam.txt" in email.tag:
            dict_list[1] = email.clues
        elif "words3.spam.txt" in email.tag:
            dict_list[2] = email.clues
        elif "wordsEn3.spam.txt" in email.tag:
            dict_list[3] = email.clues
    m_clustroid, m_avgdistance = _clustroid_and_radius(dicts)

    print("Calculating Overlap...")
    p_size = 0
    # An empty dictionary group has no clustroid to measure against.
    if m_clustroid is not None:
        for position, email in enumerate(cluster_list, 1):
            distance = Distance.distance(email, m_clustroid, "extreme")
            print("Scanning Clustered Email " + str(position) + " of "
                  + str(len(cluster_list)) + " with distance "
                  + str(distance))
            if distance < m_avgdistance:
                p_size += 1

    m_size = 0
    # Likewise, an empty cluster group has no clustroid.
    if p_clustroid is not None:
        for position, email in enumerate(dicts, 1):
            distance = Distance.distance(email, p_clustroid, "extreme")
            print("Scanning Dictionary Email " + str(position) + " of "
                  + str(len(dicts)) + " with distance " + str(distance))
            if distance < p_avgdistance:
                m_size += 1

    total_size = len(cluster_list) + len(dicts)
    print("Total Size: " + str(total_size))
    print("Size of Cluster Overlap: " + str(p_size))
    print("Size of Dictionary Overlap: " + str(m_size))
    print("Cluster average distance: " + str(p_avgdistance))
    print("Dictionary average distance: " + str(m_avgdistance))
    print("Dictionary Clues: " + str(dict_list))

    if not total_size:
        # Nothing to compare; avoid dividing by zero.
        return 0.0
    return (float(p_size) + float(m_size)) / float(total_size)