def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)
    print "Loading classifier"
    classifier_path = os.path.join(
        settings.BASE_DIR,
        "lda_stats/classifiers/naive_bayes.classifier.cpickle")
    nb = cPickle.load(open(classifier_path, 'rb'))

    filters = {}
    if options['only_newsitems']:
        filters['newsitem__isnull'] = False
    elif options['only_comments']:
        filters['comment__isnull'] = False

    for database in self.selected_dbs:
        print "Processing database " + database
        algorithm_naive_bayes, created = Algorithm.objects.using(database)\
            .get_or_create(name="naive_bayes")

        print "Removing previous results"
        query = """
            DELETE FROM result
            WHERE algorithm_id IN (
                SELECT id FROM algorithm WHERE name = 'naive_bayes'
            )"""
        cursor = connections[database].cursor()
        cursor.execute(query)

        print "Querying database"
        queryset = Text.objects.using(database).filter(**filters)

        results = []
        print "Calculating..."
        self.pbar_setup(maxval=queryset.count())
        for text in queryset_iterator(queryset, chunksize=2000):
            estimate = nb.predict([
                utils.TFIDF_to_list(utils.TFIDF(utils.tokenize(text.text)))
            ])
            results.append(
                Result(algorithm=algorithm_naive_bayes,
                       text=text,
                       value=str(estimate[0])))
            self.pbar_increment()
            # Flush periodically so the pending list does not exhaust memory.
            if len(results) > 100000:
                print "\nSaving partial results..."
                Result.objects.using(database).bulk_create(results)
                results = []
        self.pbar_destroy()

        print "Saving results"
        Result.objects.using(database).bulk_create(results)

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
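
# `queryset_iterator` is used throughout these commands but is not defined
# in this file. A minimal sketch of such a helper, assuming the usual
# chunked-iteration recipe: it walks a queryset in primary-key order,
# `chunksize` rows at a time, so large tables are never loaded into memory
# at once. The exact implementation in this project may differ.
import gc

def queryset_iterator(queryset, chunksize=1000):
    last_pk = queryset.order_by('-pk').values_list('pk', flat=True).first()
    if last_pk is None:
        return
    pk = 0
    queryset = queryset.order_by('pk')
    while pk < last_pk:
        for row in queryset.filter(pk__gt=pk)[:chunksize]:
            pk = row.pk
            yield row
        # Release Django's per-chunk query caches between chunks.
        gc.collect()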
def estimate(text):
    # Pairs a text with the classifier's prediction for it.
    return (text, nb.predict(
        [utils.TFIDF_to_list(utils.TFIDF(utils.tokenize(text.text)))]))
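
# A hedged usage sketch for the helper above; its call site is not shown in
# this file, and `classify_all` is a hypothetical name. Mapping `estimate`
# over a chunked queryset yields (text, prediction) pairs lazily.
def classify_all(queryset):
    for text in queryset_iterator(queryset, chunksize=2000):
        yield estimate(text)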
def handle(self, *args, **options):
    print "Reading data to memory"
    TFIDF_list = []
    label = []
    queryset = Newsitem.objects.all()
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset, chunksize=100)
    for newsitem in newsitems:
        TFIDF_list.append(utils.TFIDF(utils.tokenize(newsitem.text.text)))
        # Categories 1 and 2 are treated as the positive class.
        if newsitem.cat1 in [1, 2]:
            label.append(1)
        else:
            label.append(0)
        self.pbar_increment()
    self.pbar_destroy()

    print "Creating training and test data..."
    # TFIDF_svm is the input matrix of the SVM.
    TFIDF_svm = []
    for i in TFIDF_list:
        TFIDF_svm.append(utils.TFIDF_to_list(i))

    # Reads train_len from the command line
    #train_len = int(sys.argv[1])
    train_len = 200
    # Indices of training samples from class 0
    indexZero = [i for i in range(len(label)) if label[i] == 0][:train_len]
    # Indices of training samples from class 1
    indexOne = [i for i in range(len(label)) if label[i] == 1][:train_len]

    # The training set holds train_len positive and train_len negative
    # samples; every remaining sample is used for testing.
    train = []
    train_label = []
    for i in indexZero + indexOne:
        train.append(TFIDF_svm[i])
        train_label.append(label[i])

    test = [
        TFIDF_svm[i] for i in range(len(TFIDF_svm))
        if i not in indexZero + indexOne
    ]
    test_label = [
        label[i] for i in range(len(label))
        if i not in indexZero + indexOne
    ]

    print "Fitting..."
    clf = svm.SVC(probability=True)
    # Train the model
    clf.fit(train, train_label)
    #print "Score: " + str(clf.score(train, train_label))

    print "Generating probabilities"
    pred_probas = clf.predict_proba(test)[:, 1]

    fpr, tpr, _ = roc_curve(test_label, pred_probas)
    roc_auc = auc(fpr, tpr)

    print "Plotting..."
    plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc='lower right')

    print "Saving!"
    plt.savefig('out.png')

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
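
# The index-based split above takes the first train_len items of each class.
# A minimal alternative sketch using scikit-learn's train_test_split over
# the same TFIDF_svm/label lists; note it draws a stratified *random*
# sample (class proportions preserved) rather than exactly 200 per class.
from sklearn.model_selection import train_test_split

train, test, train_label, test_label = train_test_split(
    TFIDF_svm, label,
    train_size=400,
    stratify=label,
    random_state=0)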
def handle(self, *args, **options):
    k = 5  # number of nearest neighbours to vote over
    print "Reading data to memory"
    TFIDF_list = []
    label = []
    queryset = Newsitem.objects
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset.all(), chunksize=100)
    for newsitem in newsitems:
        TFIDF_list.append(self.tokenize(newsitem.text.text))
        # Categories 1 and 2 are treated as the positive class.
        if newsitem.cat1 in [1, 2]:
            label.append(1)
        else:
            label.append(0)
        self.pbar_increment()
    self.pbar_destroy()

    self.train()

    print "Estimating..."
    self.pbar_setup(maxval=len(TFIDF_list))
    counter1 = 0
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    while counter1 < len(TFIDF_list):
        # Leave-one-out: compare against every other sample, keeping the
        # original index so neighbours map back to the right labels.
        distance_list = []
        for counter2 in range(len(TFIDF_list)):
            if counter1 != counter2:
                distance_list.append(
                    (utils.TFIDF_distance(TFIDF_list[counter1],
                                          TFIDF_list[counter2]), counter2))
        nearest_list = [idx for _, idx in sorted(distance_list)[:k]]
        repeat_dic = {}
        for i in nearest_list:
            if label[i] in repeat_dic:
                repeat_dic[label[i]] += 1
            else:
                repeat_dic[label[i]] = 1
        estimate_label = max(repeat_dic, key=repeat_dic.get)
        if estimate_label == 1 and label[counter1] == 1:
            TP += 1
        elif estimate_label == 1 and label[counter1] == 0:
            FP += 1  # predicted positive, actually negative
        elif estimate_label == 0 and label[counter1] == 0:
            TN += 1
        else:
            FN += 1  # predicted negative, actually positive
        counter1 += 1
        self.pbar_increment()
    self.pbar_destroy()

    data = [
        ('algo_knn_tp', TP),
        ('algo_knn_fn', FN),
        ('algo_knn_fp', FP),
        ('algo_knn_tn', TN),
    ]
    print 'TP=>', TP, 'FN=>', FN, 'FP=>', FP, 'TN=>', TN
    #print 'Accuracy: ', float(TP + TN) / (TP + FN + FP + TN)

    print "Saving algorithm results"
    for algorithm_name, value in data:
        algorithm, created = Algorithm.objects.get_or_create(
            name=algorithm_name)
        result, created = Result.objects.get_or_create(algorithm=algorithm)
        result.value = str(value)
        result.save()

    algo_knn_uniform_estimative, created = Algorithm.objects.get_or_create(
        name="algo_knn_uniform_estimative")

    print "Calculating estimates and saving results"
    queryset = Newsitem.objects
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset.all(), chunksize=100)
    for newsitem in newsitems:
        data = utils.TFIDF(utils.tokenize(newsitem.text.text))
        distance_list = []
        for i in range(len(TFIDF_list)):
            distance_list.append(utils.TFIDF_distance(data, TFIDF_list[i]))
        nearest_list = sorted(range(len(distance_list)),
                              key=lambda i: distance_list[i])[:k]
        repeat_dic = {}
        for i in nearest_list:
            # Skip the item itself (distance zero) when voting.
            if distance_list[i] != 0:
                if label[i] in repeat_dic:
                    repeat_dic[label[i]] += 1
                else:
                    repeat_dic[label[i]] = 1
        estimate = max(repeat_dic, key=repeat_dic.get)
        Result.objects.create(algorithm=algo_knn_uniform_estimative,
                              text=newsitem.text,
                              value=str(estimate))
        self.pbar_increment()
    self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
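
# `utils.TFIDF_distance` is project code whose implementation is not shown
# here. A minimal sketch of one plausible definition, assuming TF-IDF
# vectors are stored as {term: weight} dicts: cosine distance between two
# sparse vectors. The name below is a hypothetical stand-in.
import math

def tfidf_cosine_distance(a, b):
    dot = sum(w * b.get(t, 0.0) for t, w in a.items())
    norm_a = math.sqrt(sum(w * w for w in a.values()))
    norm_b = math.sqrt(sum(w * w for w in b.values()))
    if norm_a == 0 or norm_b == 0:
        return 1.0  # maximal distance for empty documents
    return 1.0 - dot / (norm_a * norm_b)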
def tokenize(self, text):
    # Despite the name, this returns the TF-IDF representation of the
    # tokenized text, not the raw token list.
    return utils.TFIDF(utils.tokenize(text))
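
# The `utils` helpers used throughout these commands are defined elsewhere
# in the project. A hedged sketch of the assumed contracts (the bodies
# below are illustrative assumptions, not the project's actual code):
from collections import Counter

vocabulary = []  # assumed corpus-wide term list, built elsewhere

def tokenize(text):
    # Lowercase whitespace tokenization; the real helper may also strip
    # punctuation and stopwords.
    return text.lower().split()

def TFIDF(tokens):
    # Term weights as a {term: weight} dict; sketched here as raw term
    # frequency, whereas the real helper presumably applies IDF weights.
    total = float(len(tokens))
    if total == 0:
        return {}
    counts = Counter(tokens)
    return {term: n / total for term, n in counts.items()}

def TFIDF_to_list(tfidf):
    # Dense fixed-order feature vector for scikit-learn, keyed on the
    # shared vocabulary so all documents align column-wise.
    return [tfidf.get(term, 0.0) for term in vocabulary]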
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)
    print "Reading data to memory"
    TFIDF_list = []
    label = []
    queryset = Newsitem.objects.all()
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset, chunksize=100)
    for newsitem in newsitems:
        TFIDF_list.append(utils.TFIDF(utils.tokenize(newsitem.text.text)))
        # Categories 1 and 2 are treated as the positive class.
        if newsitem.cat1 in [1, 2]:
            label.append(1)
        else:
            label.append(0)
        self.pbar_increment()
    self.pbar_destroy()

    print "Training..."
    # TFIDF_svm is the input matrix of the SVM.
    TFIDF_svm = []
    for i in TFIDF_list:
        TFIDF_svm.append(utils.TFIDF_to_list(i))

    # Reads train_len from the command line
    #train_len = int(sys.argv[1])
    train_len = 200
    # Indices of training samples from class 0
    indexZero = [i for i in range(len(label)) if label[i] == 0][:train_len]
    # Indices of training samples from class 1
    indexOne = [i for i in range(len(label)) if label[i] == 1][:train_len]

    # The training set holds train_len positive and train_len negative
    # samples; every remaining sample is used for testing.
    train = []
    train_label = []
    for i in indexZero + indexOne:
        train.append(TFIDF_svm[i])
        train_label.append(label[i])

    test = [
        TFIDF_svm[i] for i in range(len(TFIDF_svm))
        if i not in indexZero + indexOne
    ]
    test_label = [
        label[i] for i in range(len(label))
        if i not in indexZero + indexOne
    ]

    clf = svm.SVC()
    # Train the model
    clf.fit(train, train_label)

    counter1 = 0
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    print "Estimating..."
    self.pbar_setup(maxval=len(test))
    for i in test:
        estimate_label = clf.predict([i])[0]
        # Compare against test_label, which runs parallel to `test`.
        if estimate_label == 1 and test_label[counter1] == 1:
            TP += 1
        elif estimate_label == 1 and test_label[counter1] == 0:
            FP += 1  # predicted positive, actually negative
        elif estimate_label == 0 and test_label[counter1] == 0:
            TN += 1
        else:
            FN += 1  # predicted negative, actually positive
        counter1 += 1
        self.pbar_increment()
    self.pbar_destroy()

    print 'TP=>', TP, 'FN=>', FN, 'FP=>', FP, 'TN=>', TN
    #print 'Accuracy: ', float(TP + TN) / (TP + FN + FP + TN)

    data = [
        ('algo_svm_tp', TP),
        ('algo_svm_fn', FN),
        ('algo_svm_fp', FP),
        ('algo_svm_tn', TN),
        ('algo_svm_score', clf.score(train, train_label)),
    ]
    print "Saving algorithm results"
    for algorithm_name, value in data:
        algorithm, created = Algorithm.objects.get_or_create(
            name=algorithm_name)
        result, created = Result.objects.get_or_create(algorithm=algorithm)
        result.value = str(value)
        result.save()

    algo_svm_estimative, created = Algorithm.objects.get_or_create(
        name="algo_svm_estimative")

    print "Calculating estimates and saving results"
    queryset = Newsitem.objects
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset.all(), chunksize=100)
    for newsitem in newsitems:
        estimate = clf.predict([
            utils.TFIDF_to_list(
                utils.TFIDF(utils.tokenize(newsitem.text.text)))
        ])
        Result.objects.create(algorithm=algo_svm_estimative,
                              text=newsitem.text,
                              value=str(estimate[0]))
        self.pbar_increment()
    self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
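
# The manual TP/FN/FP/TN bookkeeping above can be cross-checked with
# scikit-learn's confusion_matrix; a minimal sketch for binary labels,
# where `confusion_counts` is a hypothetical helper name:
from sklearn.metrics import confusion_matrix

def confusion_counts(y_true, y_pred):
    # With labels=[0, 1] the matrix is [[tn, fp], [fn, tp]], so ravel()
    # returns the four counters in that order.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return tn, fp, fn, tp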