import heapq

import numpy as np
import scipy.spatial.distance as spd

import ld     # local helper: argument parsing and data loading
import space  # local helper: vector-space transforms


def main():
    parser = ld.parse_arguments()

    print "#### LEAVE ONE OUT "
    print "# KNN Classifier", parser.k

    # stopwords
    stopwords = None
    if parser.stopwords_path:
        stopwords = ld.load_stopwords(parser.stopwords_path)

    # loading the necessary data
    (vocabulary, neigh_classes) = ld.load_train(parser.train_path, stopwords)

    # transform each item into a |V|-dimensional vector (V = vocabulary)
    train = space.train_transform(vocabulary, parser.train_path)

    print "# Classifying"
    acc = 0

    for x in xrange(len(train)):
        dist_heap = []
        item = train[x]
        for i in xrange(len(train)):
            # skip the element itself (leave-one-out)
            if x == i:
                continue
            point = train[i]
            distance = spd.euclidean(item, point)

            tup = (distance, i)
            heapq.heappush(dist_heap, tup)

        # take the k nearest points (smallest distances)
        top_k = heapq.nsmallest(parser.k, dist_heap)

        # classifying: majority vote over the k nearest neighbors
        classification = np.zeros(2)
        for (_, idi) in top_k:
            classe = neigh_classes[idi]
            classification[int(classe)] += 1

        # ties go to class 0
        out_class = 0 if classification[0] >= classification[1] else 1

        print x, " -> ", out_class, neigh_classes[x]

        # count a correct prediction (the labels are cast to int in the
        # vote count above, so compare as ints here as well)
        if out_class == int(neigh_classes[x]):
            acc += 1

    print "# Acurácia para ", parser.k, ": ",
    print acc, float(acc) / float(len(train))
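If scikit-learn is available, the same leave-one-out evaluation can be expressed much more compactly. A minimal sketch, assuming `train` is a 2-D numpy array and `labels` an integer array (hypothetical stand-ins for the data loaded above):

from sklearn.model_selection import LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier

def loo_accuracy(train, labels, k):
    # hold out one point at a time, fit on the rest, and score it
    correct = 0
    for fit_idx, test_idx in LeaveOneOut().split(train):
        clf = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
        clf.fit(train[fit_idx], labels[fit_idx])
        correct += int(clf.predict(train[test_idx])[0] == labels[test_idx][0])
    return float(correct) / len(train)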
Example #2
import collections

# load_stopwords is a local helper from this project.


def __init__(self, data, voca):
    # complementary reset/transition probabilities (they sum to 1)
    self.p_reset = 0.1
    self.p_transition = 1 - self.p_reset
    self.max_repeat = 500
    self.window_size = 10

    # document frequency: in how many documents each word appears
    # (stored under the name idf; see the sketch below for true idf)
    self.idf = collections.Counter()
    self.def_idf = 2  # default frequency for unseen words
    for document in data:
        for word in set(document):
            self.idf[word] += 1

    # vocabulary ids of the stopwords that occur in voca
    self.stopword = set()
    for word in load_stopwords():
        if word in voca:
            self.stopword.add(voca[word])
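Note that `self.idf` holds raw document frequencies, not inverse document frequencies. A minimal sketch of turning such a counter into true idf weights, idf(w) = log(N / df(w)), with `data` the same list of tokenized documents the constructor receives:

import collections
import math

def idf_weights(data):
    # document frequency: in how many documents each word occurs
    df = collections.Counter()
    for document in data:
        for word in set(document):  # count each word once per document
            df[word] += 1
    n_docs = len(data)
    # idf(w) = log(N / df(w)); larger for rarer words
    return dict((word, math.log(float(n_docs) / count))
                for word, count in df.items())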
Example #3
from sys import stderr

import heapq

import numpy as np
import scipy.spatial.distance as spd

import ld      # local helper: argument parsing and data loading
import result  # local helper: result evaluation
# load_vocabulary, load_answers and transform are local helpers as well.


def main():
    print "# KNN Classifier"
    parser = ld.parse_arguments()

    # printing args
    print '\t-k = ' + str(parser.k)
    print '\t-d = ' + parser.distance

    stopwords = None
    if parser.stopwords_path:
        stopwords = ld.load_stopwords(parser.stopwords_path)

    voc = load_vocabulary(parser.train_path, stopwords)
    answers = load_answers(parser.train_path)

    train = transform(voc, parser.train_path)
    test = transform(voc, parser.test_path)

    # output file
    out_path = '../results/' + parser.distance + '_' + str(parser.k)
    out_path += '.txt'
    out_file = open(out_path, 'w')

    for point in test:
        neighbors = []
        for i in xrange(len(train)):
            neigh = train[i]
            distance = 0.0

            if parser.distance == 'cosine':
                distance = spd.cosine(neigh, point)
            elif parser.distance == 'jaccard':
                distance = spd.jaccard(neigh, point)
            elif parser.distance == 'euclidean':
                distance = spd.euclidean(neigh, point)
            elif parser.distance == 'dice':
                distance = spd.dice(neigh, point)
            elif parser.distance == 'correlation':
                distance = spd.correlation(neigh, point)
            elif parser.distance == 'manhattan':
                distance = spd.cityblock(neigh, point)
            else:
                print >> stderr, "ERROR! Invalid distance specified."
                exit()

            tup = (distance, i)
            heapq.heappush(neighbors, tup)

        # take the k nearest points (smallest distances)
        top_k = heapq.nsmallest(parser.k, neighbors)

        # classifying: majority vote over the k nearest neighbors
        classification = np.zeros(2)
        for (_, idi) in top_k:
            classe = answers[idi]
            classification[int(classe)] += 1

        # output the classification (ties go to class 0)
        if classification[0] >= classification[1]:
            print >> out_file, '0'
            print '0'
        else:
            print >> out_file, '1'
            print '1'

    # outputting the results
    print
    print "# Results saved to file: " + out_path
    out_file.close()
    result.result("../data/imdb_test", out_path)
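The if/elif chain above that picks the metric can be collapsed into a table lookup. A minimal sketch; the keys mirror the -d values accepted above, and raising instead of printing to stderr keeps the sketch version-agnostic:

import scipy.spatial.distance as spd

# maps each accepted -d value to its scipy.spatial.distance function
DISTANCES = {
    'cosine': spd.cosine,
    'jaccard': spd.jaccard,
    'euclidean': spd.euclidean,
    'dice': spd.dice,
    'correlation': spd.correlation,
    'manhattan': spd.cityblock,
}

def distance_for(name):
    if name not in DISTANCES:
        raise ValueError("unknown distance: " + name)
    return DISTANCES[name]

The inner loop then reduces to `distance = distance_for(parser.distance)(neigh, point)`.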
Example #4
from sys import stderr

import heapq

import numpy as np
import scipy.spatial.distance as spd

import ld      # local helper: argument parsing and data loading
import space   # local helper: vector-space transforms
import result  # local helper: result evaluation


def main():
    print "# KNN Classifier"
    parser = ld.parse_arguments()

    stopwords = None
    if parser.stopwords_path:
        stopwords = ld.load_stopwords(parser.stopwords_path)

    # printing args
    print '\t-k = ' + str(parser.k)
    print '\t-d = ' + parser.distance

    # loading the necessary data
    (vocabulary, neigh_classes) = ld.load_train(parser.train_path, stopwords)

    print "# Tamanho do vocabulário:", len(vocabulary)

    # transform each item into a |V|-dimensional vector (V = vocabulary)
    (train, test) = space.transform(vocabulary, parser.train_path,
                                    parser.test_path)

    # output file
    out_path = parser.distance + '_' + str(parser.k)
    out_path += '.txt'
    out_file = open(out_path, 'w')

    # knn classification over a fraction of the training set
    print "# Classifying", int(len(train) * parser.percentage)
    for item in test:
        dist_heap = []

        # distance to the first (percentage * len(train)) training points
        for i in xrange(int(len(train) * parser.percentage)):
            point = train[i]
            distance = 0.0

            if parser.distance == 'cosine':
                distance = spd.cosine(item, point)
            elif parser.distance == 'jaccard':
                distance = spd.jaccard(item, point)
            elif parser.distance == 'euclidean':
                distance = spd.euclidean(item, point)
            elif parser.distance == 'hamming':
                distance = spd.hamming(item, point)
            elif parser.distance == 'correlation':
                distance = spd.correlation(item, point)
            elif parser.distance == 'manhattan':
                distance = spd.cityblock(item, point)
            else:
                print >> stderr, "ERROR! Invalid distance specified."
                exit()

            tup = (distance, i)
            heapq.heappush(dist_heap, tup)

        # take the k nearest points (smallest distances)
        top_k = heapq.nsmallest(parser.k, dist_heap)

        # classifying: majority vote over the k nearest neighbors
        classification = np.zeros(2)
        for (_, idi) in top_k:
            classe = neigh_classes[idi]
            classification[int(classe)] += 1

        # DEBUG: show the vote counts for this test point
        print classification,

        # output the classification (ties go to class 0)
        if classification[0] >= classification[1]:
            print >> out_file, '0'
            print '0'
        else:
            print >> out_file, '1'
            print '1'

    print
    print "# Resultados salvos no arquivo: " + out_path
    out_file.close()
    result.result("../data/imdb_test", out_path)
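The per-point inner loop above can also be vectorized with scipy's `cdist`, which computes the whole test-by-train distance matrix in one call. A minimal sketch, assuming `train` and `test` are 2-D numpy arrays and `neigh_classes` a numpy integer array (hypothetical stand-ins for the loaded data):

import numpy as np
from scipy.spatial.distance import cdist

def knn_predict(train, neigh_classes, test, k, metric='euclidean'):
    # one row of distances per test point, one column per training point
    dists = cdist(test, train, metric=metric)
    # indices of the k nearest training points for each test point
    top_k = np.argsort(dists, axis=1)[:, :k]
    votes = neigh_classes[top_k]
    # majority vote; ties go to class 0, matching the >= test above
    return (votes.mean(axis=1) > 0.5).astype(int)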