def _find_clustroid(emails):
    """Pick the clustroid of ``emails`` and measure its spread.

    The clustroid is the member whose summed squared distance to the other
    members is smallest; ties keep the earliest member. One progress line is
    printed per member examined.

    Args:
        emails: sized, re-iterable collection of objects accepted by
            ``Distance.distance``.

    Returns:
        A ``(clustroid, rms_distance)`` pair. ``rms_distance`` is the square
        root of the clustroid's summed squared distance divided by
        ``len(emails) - 1``. An empty collection yields ``(None, 0)`` and a
        single-member collection yields ``(member, 0.0)`` because there is
        nothing else to measure against.
    """
    total = len(emails)
    best_email = None
    best_rowsum = float("inf")
    for position, candidate in enumerate(emails, 1):
        print("Calculating on email " + str(position) + " of " + str(total))
        rowsum = 0
        for other in emails:
            # Skips the candidate itself (and anything comparing equal to it).
            if candidate == other:
                continue
            # "extreme" is forwarded untouched; its meaning is defined by
            # Distance.distance, which is not visible from here.
            rowsum += Distance.distance(candidate, other, "extreme") ** 2
        if rowsum < best_rowsum:
            best_rowsum = rowsum
            best_email = candidate

    if best_email is None:
        return None, 0
    if total < 2:
        # A lone member has no neighbours; avoid dividing by zero.
        return best_email, 0.0
    # float() keeps Python 2 from flooring the mean when the squared
    # distances happen to be integers.
    return best_email, sqrt(float(best_rowsum) / (total - 1))


def v_correlation(cluster_list, dicts):
    """Return the fraction of emails lying inside the other group's radius.

    A clustroid and RMS radius are computed for ``cluster_list`` and for
    ``dicts``. An email overlaps when its distance to the *other* group's
    clustroid is strictly smaller than that group's radius. Progress and a
    summary are printed along the way.

    Args:
        cluster_list: collection of clustered emails.
        dicts: collection of dictionary emails; each must expose ``tag`` and
            ``clues``. Clues of emails whose tag contains one of four known
            file names are collected purely for the printed summary.

    Returns:
        ``(cluster overlap + dictionary overlap) / total emails`` as a float,
        or ``0.0`` when both collections are empty.

    Raises:
        AssertionError: if an email tagged ``dictionary3.spam.txt`` has no
            clues (only when assertions are enabled).
    """
    print("Calculating Clustered Data Clustroid...")
    p_clustroid, p_avgdistance = _find_clustroid(cluster_list)

    print("Calculating Dictionary Data Clustroid...")

    # Order matters: the first file name found inside the tag wins.
    tag_markers = (
        "dictionary3.spam.txt",
        "wordlist3.spam.txt",
        "words3.spam.txt",
        "wordsEn3.spam.txt",
    )
    dict_list = [[], [], [], []]
    for email in dicts:
        for slot, marker in enumerate(tag_markers):
            if marker in email.tag:
                dict_list[slot] = email.clues
                if slot == 0:
                    assert len(email.clues) > 0
                break

    m_clustroid, m_avgdistance = _find_clustroid(dicts)

    print("Calculating Overlap...")

    cluster_total = len(cluster_list)
    dicts_total = len(dicts)

    # An empty opposing group has no clustroid to measure against, so nothing
    # can overlap with it and the scan is skipped.
    p_size = 0
    if m_clustroid is not None:
        for position, email in enumerate(cluster_list, 1):
            distance = Distance.distance(email, m_clustroid, "extreme")
            print("Scanning Clustered Email " + str(position) + " of "
                  + str(cluster_total) + " with distance " + str(distance))
            if distance < m_avgdistance:
                p_size += 1

    m_size = 0
    if p_clustroid is not None:
        for position, email in enumerate(dicts, 1):
            distance = Distance.distance(email, p_clustroid, "extreme")
            print("Scanning Dictionary Email " + str(position) + " of "
                  + str(dicts_total) + " with distance " + str(distance))
            if distance < p_avgdistance:
                m_size += 1

    total_size = cluster_total + dicts_total

    print("Total Size: " + str(total_size))
    print("Size of Cluster Overlap: " + str(p_size))
    print("Size of Dictionary Overlap: " + str(m_size))
    print("Cluster average distance: " + str(p_avgdistance))
    print("Dictionary average distance: " + str(m_avgdistance))
    print("Dictionary Clues: " + str(dict_list))

    if total_size == 0:
        return 0.0
    return float(p_size + m_size) / total_size
def _find_clustroid(emails):
    """Pick the clustroid of ``emails`` and measure its spread.

    The clustroid is the member whose summed squared distance to the other
    members is smallest; ties keep the earliest member. One progress line is
    printed per member examined.

    Args:
        emails: sized, re-iterable collection of objects accepted by
            ``Distance.distance``.

    Returns:
        A ``(clustroid, rms_distance)`` pair. ``rms_distance`` is the square
        root of the clustroid's summed squared distance divided by
        ``len(emails) - 1``. An empty collection yields ``(None, 0)`` and a
        single-member collection yields ``(member, 0.0)`` because there is
        nothing else to measure against.
    """
    total = len(emails)
    best_email = None
    best_rowsum = float("inf")
    for position, candidate in enumerate(emails, 1):
        print("Calculating on email " + str(position) + " of " + str(total))
        rowsum = 0
        for other in emails:
            # Skips the candidate itself (and anything comparing equal to it).
            if candidate == other:
                continue
            # "extreme" is forwarded untouched; its meaning is defined by
            # Distance.distance, which is not visible from here.
            rowsum += Distance.distance(candidate, other, "extreme") ** 2
        if rowsum < best_rowsum:
            best_rowsum = rowsum
            best_email = candidate

    if best_email is None:
        return None, 0
    if total < 2:
        # A lone member has no neighbours; avoid dividing by zero.
        return best_email, 0.0
    # float() keeps Python 2 from flooring the mean when the squared
    # distances happen to be integers.
    return best_email, sqrt(float(best_rowsum) / (total - 1))


def v_correlation(polluted, mislabeled):
    """Return the fraction of emails lying inside the other group's radius.

    A clustroid and RMS radius are computed for ``polluted`` and for
    ``mislabeled``, each normalised by its *own* group size. An email overlaps
    when its distance to the other group's clustroid is strictly smaller than
    that group's radius. Progress and a summary are printed along the way.

    Args:
        polluted: collection of polluted emails.
        mislabeled: collection of mislabeled emails.

    Returns:
        ``(polluted overlap + mislabeled overlap) / total emails`` as a float,
        or ``0.0`` when both collections are empty.
    """
    print("Calculating Polluted Data Clustroid...")
    p_clustroid, p_avgdistance = _find_clustroid(polluted)

    print("Calculating Mislabeled Data Clustroid...")
    # The mislabeled radius is averaged over the mislabeled group's own size
    # (it was previously divided by the polluted group's size).
    m_clustroid, m_avgdistance = _find_clustroid(mislabeled)

    print("Calculating Overlap...")

    polluted_total = len(polluted)
    mislabeled_total = len(mislabeled)

    # An empty opposing group has no clustroid to measure against, so nothing
    # can overlap with it and the scan is skipped.
    p_size = 0
    if m_clustroid is not None:
        for position, email in enumerate(polluted, 1):
            print("Scanning Polluted Email " + str(position) + " of "
                  + str(polluted_total))
            if Distance.distance(email, m_clustroid, "extreme") < m_avgdistance:
                p_size += 1

    m_size = 0
    if p_clustroid is not None:
        for position, email in enumerate(mislabeled, 1):
            print("Scanning Mislabeled Email " + str(position) + " of "
                  + str(mislabeled_total))
            if Distance.distance(email, p_clustroid, "extreme") < p_avgdistance:
                m_size += 1

    total_size = polluted_total + mislabeled_total

    print("Total Size: " + str(total_size))
    print("Size of Polluted Overlap: " + str(p_size))
    print("Size of Mislabeled Overlap: " + str(m_size))

    if total_size == 0:
        return 0.0
    return float(p_size + m_size) / total_size
Example #3
0
def _find_clustroid(emails):
    """Pick the clustroid of ``emails`` and measure its spread.

    The clustroid is the member whose summed squared distance to the other
    members is smallest; ties keep the earliest member. One progress line is
    printed per member examined.

    Args:
        emails: sized, re-iterable collection of objects accepted by
            ``Distance.distance``.

    Returns:
        A ``(clustroid, rms_distance)`` pair. ``rms_distance`` is the square
        root of the clustroid's summed squared distance divided by
        ``len(emails) - 1``. An empty collection yields ``(None, 0)`` and a
        single-member collection yields ``(member, 0.0)`` because there is
        nothing else to measure against.
    """
    total = len(emails)
    best_email = None
    best_rowsum = float("inf")
    for position, candidate in enumerate(emails, 1):
        print("Calculating on email " + str(position) + " of " + str(total))
        rowsum = 0
        for other in emails:
            # Skips the candidate itself (and anything comparing equal to it).
            if candidate == other:
                continue
            # "extreme" is forwarded untouched; its meaning is defined by
            # Distance.distance, which is not visible from here.
            rowsum += Distance.distance(candidate, other, "extreme") ** 2
        if rowsum < best_rowsum:
            best_rowsum = rowsum
            best_email = candidate

    if best_email is None:
        return None, 0
    if total < 2:
        # A lone member has no neighbours; avoid dividing by zero.
        return best_email, 0.0
    # float() keeps Python 2 from flooring the mean when the squared
    # distances happen to be integers.
    return best_email, sqrt(float(best_rowsum) / (total - 1))


def v_correlation(polluted, mislabeled):
    """Return the fraction of emails lying inside the other group's radius.

    A clustroid and RMS radius are computed for ``polluted`` and for
    ``mislabeled``, each normalised by its *own* group size. An email overlaps
    when its distance to the other group's clustroid is strictly smaller than
    that group's radius. Progress and a summary are printed along the way.

    Args:
        polluted: collection of polluted emails.
        mislabeled: collection of mislabeled emails.

    Returns:
        ``(polluted overlap + mislabeled overlap) / total emails`` as a float,
        or ``0.0`` when both collections are empty.
    """
    print("Calculating Polluted Data Clustroid...")
    p_clustroid, p_avgdistance = _find_clustroid(polluted)

    print("Calculating Mislabeled Data Clustroid...")
    # The mislabeled radius is averaged over the mislabeled group's own size
    # (it was previously divided by the polluted group's size).
    m_clustroid, m_avgdistance = _find_clustroid(mislabeled)

    print("Calculating Overlap...")

    polluted_total = len(polluted)
    mislabeled_total = len(mislabeled)

    # An empty opposing group has no clustroid to measure against, so nothing
    # can overlap with it and the scan is skipped.
    p_size = 0
    if m_clustroid is not None:
        for position, email in enumerate(polluted, 1):
            print("Scanning Polluted Email " + str(position) + " of "
                  + str(polluted_total))
            if Distance.distance(email, m_clustroid, "extreme") < m_avgdistance:
                p_size += 1

    m_size = 0
    if p_clustroid is not None:
        for position, email in enumerate(mislabeled, 1):
            print("Scanning Mislabeled Email " + str(position) + " of "
                  + str(mislabeled_total))
            if Distance.distance(email, p_clustroid, "extreme") < p_avgdistance:
                m_size += 1

    total_size = polluted_total + mislabeled_total

    print("Total Size: " + str(total_size))
    print("Size of Polluted Overlap: " + str(p_size))
    print("Size of Mislabeled Overlap: " + str(m_size))

    if total_size == 0:
        return 0.0
    return float(p_size + m_size) / total_size
Example #4
0
def _find_clustroid(emails):
    """Pick the clustroid of ``emails`` and measure its spread.

    The clustroid is the member whose summed squared distance to the other
    members is smallest; ties keep the earliest member. One progress line is
    printed per member examined.

    Args:
        emails: sized, re-iterable collection of objects accepted by
            ``Distance.distance``.

    Returns:
        A ``(clustroid, rms_distance)`` pair. ``rms_distance`` is the square
        root of the clustroid's summed squared distance divided by
        ``len(emails) - 1``. An empty collection yields ``(None, 0)`` and a
        single-member collection yields ``(member, 0.0)`` because there is
        nothing else to measure against.
    """
    total = len(emails)
    best_email = None
    best_rowsum = float("inf")
    for position, candidate in enumerate(emails, 1):
        print("Calculating on email " + str(position) + " of " + str(total))
        rowsum = 0
        for other in emails:
            # Skips the candidate itself (and anything comparing equal to it).
            if candidate == other:
                continue
            # "extreme" is forwarded untouched; its meaning is defined by
            # Distance.distance, which is not visible from here.
            rowsum += Distance.distance(candidate, other, "extreme") ** 2
        if rowsum < best_rowsum:
            best_rowsum = rowsum
            best_email = candidate

    if best_email is None:
        return None, 0
    if total < 2:
        # A lone member has no neighbours; avoid dividing by zero.
        return best_email, 0.0
    # float() keeps Python 2 from flooring the mean when the squared
    # distances happen to be integers.
    return best_email, sqrt(float(best_rowsum) / (total - 1))


def v_correlation(cluster_list, dicts):
    """Return the fraction of emails lying inside the other group's radius.

    A clustroid and RMS radius are computed for ``cluster_list`` and for
    ``dicts``. An email overlaps when its distance to the *other* group's
    clustroid is strictly smaller than that group's radius. Progress and a
    summary are printed along the way.

    Args:
        cluster_list: collection of clustered emails.
        dicts: collection of dictionary emails; each must expose ``tag`` and
            ``clues``. Clues of emails whose tag contains one of four known
            file names are collected purely for the printed summary.

    Returns:
        ``(cluster overlap + dictionary overlap) / total emails`` as a float,
        or ``0.0`` when both collections are empty.

    Raises:
        AssertionError: if an email tagged ``dictionary3.spam.txt`` has no
            clues (only when assertions are enabled).
    """
    print("Calculating Clustered Data Clustroid...")
    p_clustroid, p_avgdistance = _find_clustroid(cluster_list)

    print("Calculating Dictionary Data Clustroid...")

    # Order matters: the first file name found inside the tag wins.
    tag_markers = (
        "dictionary3.spam.txt",
        "wordlist3.spam.txt",
        "words3.spam.txt",
        "wordsEn3.spam.txt",
    )
    dict_list = [[], [], [], []]
    for email in dicts:
        for slot, marker in enumerate(tag_markers):
            if marker in email.tag:
                dict_list[slot] = email.clues
                if slot == 0:
                    assert len(email.clues) > 0
                break

    m_clustroid, m_avgdistance = _find_clustroid(dicts)

    print("Calculating Overlap...")

    cluster_total = len(cluster_list)
    dicts_total = len(dicts)

    # An empty opposing group has no clustroid to measure against, so nothing
    # can overlap with it and the scan is skipped.
    p_size = 0
    if m_clustroid is not None:
        for position, email in enumerate(cluster_list, 1):
            distance = Distance.distance(email, m_clustroid, "extreme")
            print("Scanning Clustered Email " + str(position) + " of "
                  + str(cluster_total) + " with distance " + str(distance))
            if distance < m_avgdistance:
                p_size += 1

    m_size = 0
    if p_clustroid is not None:
        for position, email in enumerate(dicts, 1):
            distance = Distance.distance(email, p_clustroid, "extreme")
            print("Scanning Dictionary Email " + str(position) + " of "
                  + str(dicts_total) + " with distance " + str(distance))
            if distance < p_avgdistance:
                m_size += 1

    total_size = cluster_total + dicts_total

    print("Total Size: " + str(total_size))
    print("Size of Cluster Overlap: " + str(p_size))
    print("Size of Dictionary Overlap: " + str(m_size))
    print("Cluster average distance: " + str(p_avgdistance))
    print("Dictionary average distance: " + str(m_avgdistance))
    print("Dictionary Clues: " + str(dict_list))

    if total_size == 0:
        return 0.0
    return float(p_size + m_size) / total_size