def q3(self):
    """Plot the cumulative explained-variance ratio of the SVD, then
    compare KMeans clustering quality for SVD vs. NMF over several r values."""
    print_question('3')
    # --- explained variance vs. number of components ---
    svd = self.svd
    svd.fit(self.tfidf)
    variances = svd.explained_variance_ratio_.cumsum().tolist()
    # x-axis follows however many components the fitted SVD actually has,
    # instead of a hard-coded 1000 (which breaks when n_components != 1000).
    plt.plot(range(1, len(variances) + 1), variances, label="SVD")
    plt.xlabel('r')
    plt.ylabel('variance ratio')
    plt.title('variance ratio & r value relation')
    plt.show()
    # --- compare clustering for different r under SVD and NMF ---
    nmf_res = []
    svd_res = []
    all_r = [1, 2, 3, 5, 10, 20, 50, 100, 300]
    for r in all_r:
        km_svd = KMeans(n_clusters=2, max_iter=100, n_init=3)
        svd = TruncatedSVD(n_components=r, random_state=0)
        km_svd.fit(svd.fit_transform(self.tfidf))
        msg = 'svd with r=%s' % str(r)
        res_svd = self.show_result(km_svd.labels_, msg)
        km_nmf = KMeans(n_clusters=2, max_iter=100, n_init=3)
        nmf = NMF(n_components=r, init='random', random_state=0)
        km_nmf.fit(nmf.fit_transform(self.tfidf))
        msg = 'nmf with r=%s' % str(r)
        res_nmf = self.show_result(km_nmf.labels_, msg)
        nmf_res.append(res_nmf)
        svd_res.append(res_svd)
    plot_res(svd_res, all_r, 'SVD Result')
    plot_res(nmf_res, all_r, 'NMF Result')
def e(self):
    """Evaluate hard- and soft-margin linear SVMs on both SVD and NMF features."""
    print_question('e')
    hard_classifier = svm.LinearSVC(C=1000, random_state=42)
    soft_classifier = svm.LinearSVC(C=0.001, random_state=42)
    runs = [(hard_classifier, 'Hard Margin SVM'),
            (soft_classifier, 'Soft Margin SVM')]
    # SVD features first (hard then soft), then NMF features — same
    # order as the original straight-line calls.
    for clf, label in runs:
        self.show_result(*self.svm_classify_SVD(clf, label))
    for clf, label in runs:
        self.show_result(*self.svm_classify_NMF(clf, label))
def i(self):
    """Sweep regularization strength C and penalty type for logistic regression
    on both NMF and SVD features."""
    print_question('i')
    c_values = [0.001, 0.1, 1, 10, 1000]
    for penalty in ['l1', 'l2']:
        for c in c_values:
            lg = LogisticRegression(C=c, penalty=penalty)
            msg = 'Logistic Regression Classifier with c=%s, penalty=%s' % (str(c), penalty)
            self.show_result(*self.prob_classify_NMF(lg, msg))
            self.show_result(*self.prob_classify_SVD(lg, msg))
def a(self):
    """Plot the number of training documents in each category."""
    print_question('a')
    # Count documents per class index.
    count = defaultdict(int)
    for d in self.train_data.target:
        count[d] += 1
    # Align counts explicitly with target_names: relying on dict iteration
    # order (count.values()) could pair a bar with the wrong class label.
    names = self.train_data.target_names
    counts = [count[i] for i in range(len(names))]
    colors = rand_color_arr(8)
    plt.barh(names, counts, alpha=0.8, color=colors)
    # barh draws horizontal bars: the count lies on the x axis and the
    # class name on the y axis (the original labels were swapped).
    plt.xlabel('Number')
    plt.ylabel('Class')
    plt.title('the Number of Documents per Class')
    plt.show()
def c(self): print_question('c') allDoc = [] for cat in allCat: data = fetch_data([cat], 'train').data poke = "" for doc in data: poke = poke + " " + doc allDoc.append(poke) vectors_full = self.to_vec(allDoc) tficf_train = self.to_tfidf(vectors_full) tficf_train_copy = tficf_train.copy() features = self.vectorizer.get_feature_names() for i in range(4): words = [] for j in range(10): doc = tficf_train_copy[i] max_index = np.argmax(doc) words.append(features[max_index]) tficf_train_copy[i, max_index] = 0 print allCat[i], words
def f(self): print_question('f') best_score = 0 best_gamma = 0 for gamma in [0.001, 0.01, 0.1, 1, 10, 100, 1000]: classifier = svm.LinearSVC(C=gamma, random_state=42) classifier.fit(self.tfidf_SVD, self.train_labels) scores = (cross_val_score(classifier, self.tfidf_SVD, self.train_labels, cv=5)) score = scores.mean() if score > best_score: best_score = score best_gamma = gamma print "Accuracy: %.5f | gamma: " % score, gamma print "Best Accuracy: %.5f | gamma: %d" % (best_score, best_gamma) classifier = svm.LinearSVC(C=best_gamma, random_state=42) self.show_result(*self.svm_classify_SVD(classifier, 'Best SVM')) best_score = 0 best_gamma = 0 for gamma in [0.001, 0.01, 0.1, 1, 10, 100, 1000]: classifier = svm.LinearSVC(C=gamma, random_state=42) classifier.fit(self.tfidf_NMF, self.train_labels) scores = (cross_val_score(classifier, self.tfidf_NMF, self.train_labels, cv=5)) score = scores.mean() if score > best_score: best_score = score best_gamma = gamma print "Accuracy: %.5f | gamma: " % score, gamma print "Best Accuracy: %.5f | gamma: %d" % (best_score, best_gamma) classifier = svm.LinearSVC(C=best_gamma, random_state=42) self.show_result(*self.svm_classify_NMF(classifier, 'Best SVM'))
def j(self):
    """Train and compare multiclass classifiers on the 4-category subset."""
    print_question('j')
    # training features
    train = fetch_data(cat_4, 'train')
    vectors = self.to_vec(train.data)
    tfidf = self.to_tfidf(vectors)
    nmf = self.to_NMF(tfidf)
    # testing features (transform only — fitted on the training data)
    test = fetch_data(cat_4, 'test')
    vectors_test = self.vectorizer.transform(test.data)
    tfidf_test = self.tfidf_transformer.transform(vectors_test)
    nmf_test = self.nmf.transform(tfidf_test)
    # classifiers under comparison
    svc = svm.LinearSVC(C=1, random_state=42)
    contenders = [(MultinomialNB(), 'naive bayes'),
                  (OneVsOneClassifier(svc), 'one vs one'),
                  (OneVsRestClassifier(svc), 'one vs rest')]
    for model, label in contenders:
        self.multi_classify(model, nmf, nmf_test, train.target, test.target, label)
#!/usr/bin/env python3 import DHUPythonDev from utils import print_question print_question(""" 問1. 下記の要件を満たすクラスを実装せよ 1-2. 【定義したモジュールを利用する】 上記constに定義した数値tauをprintを用いて出力するプログラムを実装せよ。 """) print(DHUPythonDev.const.τ) print_question(""" 1-5. 【定義したモジュールを利用する】 ここまでのテストとして、inputを用いて受けた数値を半径として、円の面積を出力するプログラムを実装せよ """) print(DHUPythonDev.math.Math.circleArea(10))
from sqlalchemy.orm import sessionmaker
from model import engine
from utils import print_questions, select_question, print_question

if __name__ == '__main__':
    # Explicit flush/commit control: changes are only persisted on demand.
    session_factory = sessionmaker(bind=engine, autoflush=False, autocommit=False)
    session = session_factory()
    # Keep prompting for questions until the process is interrupted;
    # a falsy selection is simply skipped.
    while True:
        chosen = select_question(session)
        if chosen:
            print_question(chosen)
def q2(self):
    """Cluster the TF-IDF matrix into two groups with KMeans and report results."""
    print_question('2')
    km = KMeans(n_clusters=2, max_iter=200, n_init=5)
    km.fit(self.tfidf)
    # fixed typo in the reported label: 'quesiton 2' -> 'question 2'
    self.show_result(km.labels_, 'question 2')
def q1(self): print_question('1') print "dimensions: ", self.tfidf.shape
# Interactive editor: walk through the questions of a chosen topic and
# attach/detach answers via short commands.
if __name__ == '__main__':
    Session = sessionmaker(bind=engine)
    session = Session()
    subject = select_subject(session)
    selected_topic = select_topic(session, subject)
    questions = list()
    try:
        # Optional CLI argument: resume from a given question id.
        start_id = sys.argv[1]
        questions = session.query(Question).filter(Question.topic == selected_topic).filter(Question.id >= start_id)
    except:
        # NOTE(review): bare except — presumably meant to catch the missing
        # argv entry (IndexError), but it also hides any query error; confirm.
        questions = session.query(Question).filter(Question.topic == selected_topic).all()
    for question in questions:
        last_id = print_question(question)
        if safe_prompt(session, "Edit this question? ") in ("Y",'y','Yes','yes'):
            # Inner edit loop: show available answers, read a command,
            # leave the loop on any input the regex does not match.
            while True:
                print_available_answers(session, last_id)
                command = re.match(match_command, safe_prompt(session, "Command: "))
                if not command:
                    break
                if command.group('command') == 'u':
                    # 'u': unlink answer <ans_id> from its question.
                    id_selected = int(command.group('ans_id'))
                    answer = session.query(Answer).filter(Answer.id == id_selected).first()
                    answer.question = None
                    session.add(answer)
                    session.commit()
                if command.group('command') == 'a':
                    # 'a': attach answer <ans_id> to question <q_id>.
                    ans_id = int(command.group('ans_id'))
                    q_id = int(command.group('q_id'))
                    answer = session.query(Answer).filter(Answer.id == ans_id).first()
                    # NOTE(review): the 'a' branch ends here in this chunk — the
                    # fetched answer is never linked to q_id; the remainder of
                    # the handler appears to be outside this view or missing.
def h(self):
    """Evaluate a default logistic regression on NMF and SVD features."""
    print_question('h')
    model = LogisticRegression()
    title = 'Logistic Regression Classifier'
    self.show_result(*self.prob_classify_NMF(model, title))
    self.show_result(*self.prob_classify_SVD(model, title))
def g(self):
    """Evaluate a multinomial naive Bayes classifier on NMF and SVD features."""
    print_question('g')
    nb = MultinomialNB()
    # fixed typo in the reported label: 'Naive Beyes' -> 'Naive Bayes'
    self.show_result(*self.prob_classify_NMF(nb, 'Naive Bayes Classifier'))
    self.show_result(*self.prob_classify_SVD(nb, 'Naive Bayes Classifier'))
def d(self): print_question('d') print 'SVD shape: %s' % str(self.tfidf_SVD.shape) print 'NMF shape: %s' % str(self.tfidf_NMF.shape)
def b(self): print_question('b') vectorizer_2 = CountVectorizer(analyzer='word', stop_words=stop_words, min_df=2, tokenizer=stemTokenizer) vectors_2 = vectorizer_2.fit_transform(self.train_data.data) print "terms num when mid_df = 2: %d" % vectors_2.shape[1] print "terms num when mid_df = 5: %d" % self.vectors.shape[1]