        self.start_time = time.time()

    def Train(self, training_data):
        training_list = []
        label_list = []
        for question in training_data:
            training_list.append(question.raw_words)
            # label each question with the index of its first popular tag
            label_list.append(core.popular_tags.index(iter(question.tag_list).next()))
        # bag-of-words counts -> tf-idf -> truncated SVD, then fit the one-vs-all SVM
        decomposed_data = self.svd.fit_transform(self.tfidf.fit_transform(self.cv.fit_transform(training_list)))
        self.svm.fit(decomposed_data, label_list)

    def Classify(self, eval_question):
        # use transform (not fit_transform) so the vocabulary and weights learned in Train are reused
        eval_doc = self.svd.transform(self.tfidf.transform(self.cv.transform([eval_question.raw_words])))
        predicted_class = core.popular_tags[self.svm.predict(eval_doc)[0]]
        print "DEBUG: Classified question, svm. %f" % (time.time() - self.start_time)
        return [predicted_class]


def knn_factory():
    return KNNClassifier()


def svm_factory():
    return OVASVMClassifier()


setup(sys.argv)
#ev.leave_one_out(knn_factory, ev.eval_tp1, './knn_eval', questions, set(truncated_popular_tags), threads=1)
ev.leave_one_out(svm_factory, ev.eval_tp1, './ova_svm_eval', questions, set(truncated_popular_tags), threads=1)
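For reference, the chained fit_transform/transform calls above are the usual bag-of-words to tf-idf to truncated-SVD to linear-SVM stack. Below is a minimal sketch of the same pipeline expressed as a single sklearn Pipeline; the concrete classes (CountVectorizer, TfidfTransformer, TruncatedSVD, LinearSVC) are assumptions, since the constructor that sets self.cv, self.tfidf, self.svd and self.svm is not shown above.

# Illustrative sketch only, not part of the original script.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC

def build_ova_svm_pipeline(n_components=100):
    # a Pipeline reuses the fitted vocabulary and weights at predict time,
    # which is the fit_transform-vs-transform distinction fixed in Classify above
    return Pipeline([
        ('cv', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('svd', TruncatedSVD(n_components=n_components)),
        ('svm', LinearSVC()),  # LinearSVC trains one-vs-rest by default
    ])

# usage sketch: pipe = build_ova_svm_pipeline(); pipe.fit(training_list, label_list)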
import os

import eval_classifier as ev
import naive_bayse as nb  # assumed module name; nb is used below but was never imported
import preprocess_to_questions as pq

# comment out if you don't want dependency on joblib
#from joblib import Parallel, delayed

PROJECT_PATH = '/home/nclimer/autotagger/'

tags_to_consider = ['javascript', 'java', 'android', 'php', 'c#', 'python', 'jquery', 'html', 'ios']

questions = pq.read_questions(PROJECT_PATH)
questions = pq.tp1_filter(pq.filter_tags(questions, tags_to_consider))
print 'Evaluating {} questions'.format(len(questions))

all_tags = set()
for q in questions:
    all_tags.update(q.tag_list)
print all_tags

try:
    os.mkdir('./nieve_bayse_eval')
except OSError:
    pass


class nieve_bayse_factory:
    def __call__(self):
        return nb.NaiveBayseClassifier(1)


ev.leave_one_out(nieve_bayse_factory(), ev.eval_tp1, './nieve_bayse_eval', questions, all_tags, threads=4)
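As a rough illustration of what the evaluation driver does: leave-one-out holds out each question in turn, retrains the classifier on the remaining questions, and scores the prediction for the held-out question. The sketch below only outlines that loop; it is not the project's ev.leave_one_out, which also writes results to the given output directory and takes a threads argument.

# Illustrative sketch only, assuming classifiers expose the Train/Classify interface above.
def leave_one_out_accuracy(classifier_factory, questions):
    correct = 0
    for i, held_out in enumerate(questions):
        classifier = classifier_factory()
        # train on every question except the held-out one
        classifier.Train(questions[:i] + questions[i + 1:])
        predicted = classifier.Classify(held_out)
        if predicted and predicted[0] in held_out.tag_list:
            correct += 1
    return float(correct) / len(questions)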
import os

import eval_classifier as ev
import binary_relevance as br
import naive_bayse as nb  # assumed module name; nb is used below but was never imported
import preprocess_to_questions as pq

# comment out if you don't want dependency on joblib
#from joblib import Parallel, delayed

PROJECT_PATH = '/home/njclimer/source/csc522/'

tags_to_consider = ['javascript', 'java', 'android', 'php', 'c#', 'python', 'jquery', 'html', 'ios']

questions = pq.read_questions(PROJECT_PATH)
questions = pq.filter_tags(questions, tags_to_consider)
print 'Processing {} questions.'.format(len(questions))
# questions = questions[:1000]

all_tags = set()
for q in questions:
    all_tags.update(q.tag_list)
print all_tags

try:
    os.mkdir('./nieve_bayse_br_eval_doc')
except OSError:
    pass


def nieve_bayse_factory():
    return nb.NaiveBayseClassifier(word_frequency=False)


def binary_relevance_factory():
    return br.BinaryRelevanceClassifier(nieve_bayse_factory)


ev.leave_one_out(binary_relevance_factory, ev.eval_tp1, './nieve_bayse_br_eval_doc', questions, all_tags, threads=3)
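Binary relevance turns multi-label tagging into one independent binary problem per tag: each base classifier learns "has this tag" versus "does not have this tag", and a question receives every tag whose classifier fires. The sketch below illustrates that strategy only; br.BinaryRelevanceClassifier's real interface is not shown here, and the base learner is assumed to expose sklearn-style fit/predict on raw word lists.

# Illustrative sketch only, not the project's implementation.
class BinaryRelevanceSketch(object):
    def __init__(self, base_factory, tags):
        # one independent binary classifier per tag
        self.classifiers = {tag: base_factory() for tag in tags}

    def train(self, questions):
        docs = [q.raw_words for q in questions]
        for tag, clf in self.classifiers.items():
            # binary target: does this question carry the tag?
            clf.fit(docs, [tag in q.tag_list for q in questions])

    def classify(self, question):
        # return every tag whose binary classifier predicts True
        return [tag for tag, clf in self.classifiers.items()
                if clf.predict([question.raw_words])[0]]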
import os  # used by os.mkdir below; missing from the original imports
import eval_classifier as ev  # used by ev.leave_one_out below; missing from the original imports
import naive_bayse as nb  # assumed module name; nb is used below but was never imported
import preprocess_to_questions as pq

# comment out if you don't want dependency on joblib
# from joblib import Parallel, delayed

PROJECT_PATH = "/home/njclimer/source/csc522/"

tags_to_consider = ["c++", "sql", "angularjs"]

questions = pq.read_questions(PROJECT_PATH)
questions = pq.tp1_filter(pq.filter_tags(questions, tags_to_consider))
questions = questions[:100]
print "Evaluating {} questions".format(len(questions))

all_tags = set()
for q in questions:
    all_tags.update(q.tag_list)
print all_tags

try:
    os.mkdir("./nieve_bayse_doc_counts_eval")
except OSError:
    pass


class nieve_bayse_factory:
    def __call__(self):
        return nb.NaiveBayseClassifier(1, word_frequency=False)


ev.leave_one_out(nieve_bayse_factory(), ev.eval_tp1, "./nieve_bayse_doc_counts_eval", questions, all_tags, threads=1)