import heapq

import numpy as np
import scipy.spatial.distance as spd

# local project modules (assumed imports; referenced as ld and space below)
import ld
import space


def main():
    parser = ld.parse_arguments()

    print "#### LEAVE ONE OUT"
    print "# KNN Classifier", parser.k

    # stopwords
    stopwords = None
    if parser.stopwords_path:
        stopwords = ld.load_stopwords(parser.stopwords_path)

    # loading the necessary data
    (vocabulary, neigh_classes) = ld.load_train(parser.train_path, stopwords)

    # transforming each item to a v-dimensional space
    train = space.train_transform(vocabulary, parser.train_path)

    print "# Classifying"
    acc = 0
    for x in xrange(len(train)):
        dist_heap = []
        item = train[x]
        for i in xrange(len(train)):
            # skipping the element itself
            if x == i:
                continue
            point = train[i]
            distance = spd.euclidean(item, point)
            heapq.heappush(dist_heap, (distance, i))

        # take the k nearest points (smallest distances)
        top_k = heapq.nsmallest(parser.k, dist_heap)

        # classifying by majority vote over the k neighbors
        classification = np.zeros(2)
        for (_, idi) in top_k:
            classe = neigh_classes[idi]
            classification[int(classe)] += 1

        # ties favor class 0
        out_class = 0 if classification[0] >= classification[1] else 1

        print x, " -> ", out_class, neigh_classes[x]

        # count a hit when the prediction matches the true class
        if out_class == neigh_classes[x]:
            acc += 1

    print "# Accuracy for", parser.k, ":",
    print acc, float(acc) / float(len(train))
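# --- Hedged sketch, not part of the original file ---------------------------
# The leave-one-out loop above computes every pairwise distance in pure
# Python. If `train` fits in memory as a dense matrix, the same evaluation
# can be vectorized with scipy.spatial.distance.cdist. The names `train`,
# `neigh_classes` and `k` are assumed to mean the same as in main() above;
# `leave_one_out_accuracy` is a hypothetical helper, not original code.
def leave_one_out_accuracy(train, neigh_classes, k):
    X = np.asarray(train, dtype=float)
    labels = np.asarray(neigh_classes, dtype=int)
    dist = spd.cdist(X, X, 'euclidean')       # all pairwise distances at once
    np.fill_diagonal(dist, np.inf)            # never match an item to itself
    hits = 0
    for x in xrange(len(X)):
        top_k = np.argsort(dist[x])[:k]       # indices of the k nearest
        votes = np.bincount(labels[top_k], minlength=2)
        hits += int((0 if votes[0] >= votes[1] else 1) == labels[x])
    return float(hits) / len(X)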
import collections


# NOTE: method of a class whose definition is not shown in this excerpt;
# load_stopwords() is assumed to be available at module level.
def __init__(self, data, voca):
    # random-walk parameters: reset (teleport) probability and its complement
    self.p_reset = 0.1
    self.p_transition = 1 - self.p_reset
    self.max_repeat = 500
    self.window_size = 10

    # document frequency of each word (basis for the IDF weight);
    # def_idf is the fallback value for words never seen in `data`
    self.idf = collections.Counter()
    self.def_idf = 2
    for document in data:
        for word in set(document):
            self.idf[word] += 1

    # map each stopword to its id in the vocabulary
    self.stopword = set()
    for word in load_stopwords():
        if word in voca:
            self.stopword.add(voca[word])
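# --- Hedged sketch, not part of the original class --------------------------
# __init__ above only stores raw document frequencies in self.idf, with
# self.def_idf as a fallback for unseen words. A lookup consistent with those
# fields could turn the counts into a standard IDF weight; `idf_weight` and
# its `n_documents` parameter (len(data) at construction time) are
# hypothetical names, not part of the original code.
import math


def idf_weight(self, word, n_documents):
    df = self.idf[word]                       # Counter returns 0 when absent
    if df == 0:
        return self.def_idf                   # fallback for unseen words
    return math.log(float(n_documents) / df)  # classic IDF: log(N / df)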
import heapq
from sys import stderr

import numpy as np
import scipy.spatial.distance as spd

# local project modules (assumed imports); load_vocabulary, load_answers
# and transform are also assumed to be imported at module level
import ld
import result


def main():
    print "# KNN Classifier"
    parser = ld.parse_arguments()

    # printing args
    print '\t-k = ' + str(parser.k)
    print '\t-d = ' + parser.distance

    stopwords = None
    if parser.stopwords_path:
        stopwords = ld.load_stopwords(parser.stopwords_path)

    voc = load_vocabulary(parser.train_path, stopwords)
    answers = load_answers(parser.train_path)
    train = transform(voc, parser.train_path)
    test = transform(voc, parser.test_path)

    # output file
    out_path = '../results/' + parser.distance + '_' + str(parser.k)
    out_path += '.txt'
    out_file = open(out_path, 'w')

    for point in test:
        neighbors = []
        for i in xrange(len(train)):
            neigh = train[i]
            distance = 0.0
            if parser.distance == 'cosine':
                distance = spd.cosine(neigh, point)
            elif parser.distance == 'jaccard':
                distance = spd.jaccard(neigh, point)
            elif parser.distance == 'euclidean':
                distance = spd.euclidean(neigh, point)
            elif parser.distance == 'dice':
                distance = spd.dice(neigh, point)
            elif parser.distance == 'correlation':
                distance = spd.correlation(neigh, point)
            elif parser.distance == 'manhattan':
                distance = spd.cityblock(neigh, point)
            else:
                print >> stderr, "ERROR! - Invalid distance metric."
                exit()
            heapq.heappush(neighbors, (distance, i))

        # take the k nearest points (smallest distances)
        top_k = heapq.nsmallest(parser.k, neighbors)

        # classifying by majority vote; ties favor class 0
        classification = np.zeros(2)
        for (_, idi) in top_k:
            classe = answers[idi]
            classification[int(classe)] += 1

        # writing out the classification
        if classification[0] >= classification[1]:
            print >> out_file, '0'
            print '0'
        else:
            print >> out_file, '1'
            print '1'

    # reporting the results
    print
    print "# Results saved to file: " + out_path
    out_file.close()
    result.result("../data/imdb_test", out_path)
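# --- Hedged sketch, not part of the original file ---------------------------
# The if/elif chain in main() can be collapsed into a dispatch table keyed by
# the metric name; `spd` is scipy.spatial.distance as imported above, and
# DISTANCES / compute_distance are hypothetical names.
DISTANCES = {
    'cosine': spd.cosine,
    'jaccard': spd.jaccard,
    'euclidean': spd.euclidean,
    'dice': spd.dice,
    'correlation': spd.correlation,
    'manhattan': spd.cityblock,
}


def compute_distance(name, neigh, point):
    try:
        return DISTANCES[name](neigh, point)
    except KeyError:
        print >> stderr, "ERROR! - Invalid distance metric."
        exit()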
import heapq
from sys import stderr

import numpy as np
import scipy.spatial.distance as spd

# local project modules (assumed imports; referenced as ld, space and result)
import ld
import result
import space


def main():
    print "# KNN Classifier"
    parser = ld.parse_arguments()

    stopwords = None
    if parser.stopwords_path:
        stopwords = ld.load_stopwords(parser.stopwords_path)

    # printing args
    print '\t-k = ' + str(parser.k)
    print '\t-d = ' + parser.distance

    # loading the necessary data
    (vocabulary, neigh_classes) = ld.load_train(parser.train_path, stopwords)
    print "# Vocabulary size:", len(vocabulary)

    # transforming each item to a v-dimensional space
    (train, test) = space.transform(vocabulary, parser.train_path,
                                    parser.test_path)

    # output file
    out_path = parser.distance + '_' + str(parser.k)
    out_path += '.txt'
    out_file = open(out_path, 'w')

    # knn classification
    print "# Classifying", len(train) * parser.percentage
    for item in test:
        dist_heap = []

        # distance from the test item to a fraction of the training set
        for i in xrange(int(len(train) * parser.percentage)):
            point = train[i]
            distance = 0.0
            if parser.distance == 'cosine':
                distance = spd.cosine(item, point)
            elif parser.distance == 'jaccard':
                distance = spd.jaccard(item, point)
            elif parser.distance == 'euclidean':
                distance = spd.euclidean(item, point)
            elif parser.distance == 'hamming':
                distance = spd.hamming(item, point)
            elif parser.distance == 'correlation':
                distance = spd.correlation(item, point)
            elif parser.distance == 'manhattan':
                distance = spd.cityblock(item, point)
            else:
                print >> stderr, "ERROR! - Invalid distance metric."
                exit()
            heapq.heappush(dist_heap, (distance, i))

        # take the k nearest points (smallest distances)
        top_k = heapq.nsmallest(parser.k, dist_heap)

        # classifying by majority vote; ties favor class 0
        classification = np.zeros(2)
        for (_, idi) in top_k:
            classe = neigh_classes[idi]
            classification[int(classe)] += 1

        # DEBUG
        print classification,

        # writing out the classification
        if classification[0] >= classification[1]:
            print >> out_file, '0'
            print '0'
        else:
            print >> out_file, '1'
            print '1'

    print
    print "# Results saved to file: " + out_path
    out_file.close()
    result.result("../data/imdb_test", out_path)
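# --- Hedged sketch, not part of the original file ---------------------------
# main() pushes every (distance, index) pair onto a heap and then calls
# heapq.nsmallest, which keeps all examined entries in memory. A bounded
# max-heap of size k (negated distances, since heapq is a min-heap) yields the
# same k nearest neighbors in O(k) space; `k_nearest` is a hypothetical name.
def k_nearest(distances, k):
    """distances: iterable of (distance, index) tuples."""
    heap = []
    for dist, idx in distances:
        if len(heap) < k:
            heapq.heappush(heap, (-dist, idx))
        elif -dist > heap[0][0]:              # closer than the current worst
            heapq.heapreplace(heap, (-dist, idx))
    # restore ascending-distance order and the original sign
    return [(-d, i) for (d, i) in sorted(heap, reverse=True)]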