def do_training(select_k=20000):
    """Vectorize the module-level corpora, select features, train a LinearSVC
    and persist the fitted artifacts with joblib.

    Reads the module-level ``data_train_data``, ``data_test_data``, ``y_train``
    and ``opts``, and rebinds the globals ``X_train``, ``X_test``,
    ``feature_names`` and ``ch2`` as side effects.

    Parameters
    ----------
    select_k : int, optional
        Number of features kept by the chi-squared selection step (default
        20000 — the value that was previously hard-coded behind a dead
        ``if True:`` guard). Pass a falsy value to skip selection.
    """
    global X_train, X_test, feature_names, ch2

    print("Extracting features from the training data using a sparse vectorizer")
    if opts.use_hashing:
        # Hashing is stateless, so a plain transform() is enough.
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train_data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.25,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train_data)
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test data using the same vectorizer")
    X_test = vectorizer.transform(data_test_data)
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None  # hashed features have no readable names
    else:
        feature_names = vectorizer.get_feature_names()

    ch2 = None
    if select_k:
        # Never ask SelectKBest for more features than the vectorizer produced.
        k = min(select_k, X_train.shape[1])
        print("Extracting %d best features by a chi-squared test" % k)
        t0 = time()
        ch2 = SelectKBest(chi2, k=k)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [feature_names[i]
                             for i in ch2.get_support(indices=True)]
        print("done in %fs" % (time() - t0))
        print()
    if feature_names:
        feature_names = np.asarray(feature_names)

    results = []
    penalty = 'l2'
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model.  NOTE(review): loss='l2' is the legacy spelling
    # ('squared_hinge' in modern sklearn) — this file consistently targets the
    # old API (non_negative, n_iter elsewhere), so it is kept as-is.
    clf = LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)
    results.append(benchmark(clf))

    # Persist every fitted stage so prediction can be replayed later.
    joblib.dump(vectorizer, 'vectorizer.pkl', compress=9)
    if ch2 is not None:
        joblib.dump(ch2, 'feature_selector.pkl', compress=9)
    joblib.dump(clf, 'linearsvc_classifier.pkl', compress=9)
def main_function():
    """Parse CLI options, load the XML-backed question corpus, vectorize it
    and run an endless self-training LinearSVC loop.

    Each iteration trains on the current training pool, predicts the test
    set, reports accuracy on the labeled test items, then folds the test
    set (with its predictions' ground truth) back into the training pool.
    Runs until interrupted.

    Fix: the per-loop report used Python 2 ``print`` statements
    (``print cnt,cnt2`` / ``print "efficiency..."``), a SyntaxError under
    Python 3 which the rest of this function targets; converted to calls.
    """
    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    # parse commandline arguments
    op = OptionParser()
    op.add_option("--report", action="store_true", dest="print_report",
                  help="Print a detailed classification report.")
    op.add_option("--chi2_select", action="store", type="int",
                  dest="select_chi2",
                  help="Select some number of features using a chi-squared test")
    op.add_option("--confusion_matrix", action="store_true", dest="print_cm",
                  help="Print the confusion matrix.")
    op.add_option("--top10", action="store_true", dest="print_top10",
                  help="Print ten most discriminative terms per class"
                       " for every classifier.")
    op.add_option("--all_categories", action="store_true",
                  dest="all_categories",
                  help="Whether to use all categories or not.")
    op.add_option("--use_hashing", action="store_true",
                  help="Use a hashing vectorizer.")
    op.add_option("--n_features", action="store", type=int, default=2 ** 16,
                  help="n_features when using the hashing vectorizer.")
    op.add_option("--filtered", action="store_true",
                  help="Remove newsgroup information that is easily overfit: "
                       "headers, signatures, and quoting.")

    (opts, args) = op.parse_args()
    if len(args) > 0:
        op.error("this script takes no arguments.")
        sys.exit(1)  # defensive; op.error() already raises SystemExit

    print(__doc__)
    op.print_help()
    print()

    ###############################################################################
    # Load some categories from the training set
    if opts.all_categories:
        categories = None
    else:
        obj = input_from_xml()
        domains, questions = obj.fetch_input_from_xml_questions()
        categories = domains

    print("Loading Data from different categories....")
    print(categories if categories else "all")

    obj = extract_data()
    train_data, train_target, train_target_names = obj.extract_training_data()
    test_data, test_target = obj.extract_testing_data()
    print('data loaded')

    def size_mb(docs):
        # corpus size in megabytes of UTF-8 text
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(train_data)
    data_test_size_mb = size_mb(test_data)

    print("%d documents - %0.3fMB (training set)" % (
        len(train_data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(test_data), data_test_size_mb))
    print("%d categories" % len(categories))
    print()

    categories = train_target_names
    y_train, y_test = train_target, test_target

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(train_data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(train_data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(test_data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    if opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" %
              opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        print("done in %fs" % (time() - t0))
        print()

    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())

    loop = 1
    C = 1.0
    # Self-training loop: runs until interrupted (no stopping condition).
    while True:
        print('_' * 80)
        print("Training the data: ")
        t0 = time()
        clf = LinearSVC(C=C)
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(X_test)
        # scores kept for the (disabled) return below; also included in timing
        scores = clf.decision_function(X_test)
        test_time = time() - t0
        print("test time: %0.3fs" % test_time)
        # return test_data, pred, y_test, scores, train_target_names

        # Fold the test set back into the training pool for the next round.
        train_data = train_data + test_data
        train_target = train_target + test_target
        X_train = vectorizer.transform(train_data)
        y_train = train_target

        # cnt  = correct predictions among labeled test items
        # cnt2 = number of labeled test items (label -1 means "unlabeled";
        #        unlabeled items count as correct in the efficiency formula)
        cnt = 0
        cnt2 = 0
        for predicted, actual in zip(pred, y_test):
            if actual != -1:
                if predicted == actual:
                    cnt = cnt + 1
                cnt2 = cnt2 + 1
        print(cnt, cnt2)
        print("efficiency of classification in loop %d is: %f" %
              (loop, (cnt + len(y_test) - cnt2) / (1.0 * len(y_test)) * 100))
        loop = loop + 1
def plot():
    """Run the classic 20-newsgroups benchmark: fetch the corpus, vectorize,
    optionally chi2-select features, benchmark a battery of classifiers and
    plot score / train-time / test-time as horizontal bars.

    NOTE(review): written against a legacy scikit-learn/pylab API
    (``non_negative``, ``n_iter``, ``loss='l2'``, positional f1_score,
    the ``pl`` pylab alias) — confirm the pinned sklearn version before
    upgrading.
    """
    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    # parse commandline arguments
    op = OptionParser()
    op.add_option("--report", action="store_true", dest="print_report",
                  help="Print a detailed classification report.")
    op.add_option("--chi2_select", action="store", type="int",
                  dest="select_chi2",
                  help="Select some number of features using a chi-squared test")
    op.add_option("--confusion_matrix", action="store_true", dest="print_cm",
                  help="Print the confusion matrix.")
    op.add_option("--top10", action="store_true", dest="print_top10",
                  help="Print ten most discriminative terms per class"
                       " for every classifier.")
    op.add_option("--all_categories", action="store_true",
                  dest="all_categories",
                  help="Whether to use all categories or not.")
    op.add_option("--use_hashing", action="store_true",
                  help="Use a hashing vectorizer.")
    op.add_option("--n_features", action="store", type=int, default=2 ** 16,
                  help="n_features when using the hashing vectorizer.")
    op.add_option("--filtered", action="store_true",
                  help="Remove newsgroup information that is easily overfit: "
                       "headers, signatures, and quoting.")

    (opts, args) = op.parse_args()
    if len(args) > 0:
        op.error("this script takes no arguments.")
        sys.exit(1)  # defensive; op.error() already raises SystemExit

    print(__doc__)
    op.print_help()
    print()

    ###############################################################################
    # Load some categories from the training set
    if opts.all_categories:
        categories = None
    else:
        categories = [
            'alt.atheism',
            'talk.religion.misc',
            'comp.graphics',
            'sci.space',
        ]

    if opts.filtered:
        # strip metadata that makes the task artificially easy
        remove = ('headers', 'footers', 'quotes')
    else:
        remove = ()

    print("Loading 20 newsgroups dataset for categories:")
    print(categories if categories else "all")

    data_train = fetch_20newsgroups(subset='train', categories=categories,
                                    shuffle=True, random_state=42,
                                    remove=remove)
    data_test = fetch_20newsgroups(subset='test', categories=categories,
                                   shuffle=True, random_state=42,
                                   remove=remove)
    print('data loaded')

    categories = data_train.target_names    # for case categories == None

    def size_mb(docs):
        # corpus size in megabytes of UTF-8 text
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)

    print("%d documents - %0.3fMB (training set)" % (
        len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    print()

    # split a training set and a test set
    y_train, y_test = data_train.target, data_test.target

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    if opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" %
              opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        print("done in %fs" % (time() - t0))
        print()

    def trim(s):
        """Trim string to fit on terminal (assuming 80-column display)"""
        return s if len(s) <= 80 else s[:77] + "..."

    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())

    ###############################################################################
    # Benchmark classifiers
    def benchmark(clf):
        # Fit + predict one classifier; returns (name, f1, train_t, test_t).
        print('_' * 80)
        print("Training: ")
        print(clf)
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time: %0.3fs" % test_time)

        score = metrics.f1_score(y_test, pred)
        print("f1-score: %0.3f" % score)

        if hasattr(clf, 'coef_'):
            # linear models expose per-class weights we can introspect
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

            if opts.print_top10 and feature_names is not None:
                print("top 10 keywords per class:")
                for i, category in enumerate(categories):
                    top10 = np.argsort(clf.coef_[i])[-10:]
                    print(trim("%s: %s" % (category,
                                           " ".join(feature_names[top10]))))
            print()

        if opts.print_report:
            print("classification report:")
            print(metrics.classification_report(y_test, pred,
                                                target_names=categories))

        if opts.print_cm:
            print("confusion matrix:")
            print(metrics.confusion_matrix(y_test, pred))

        print()
        # classifier name = repr up to the first '('
        clf_descr = str(clf).split('(')[0]
        return clf_descr, score, train_time, test_time

    results = []
    for clf, name in (
            (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
            (Perceptron(n_iter=50), "Perceptron"),
            (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
            (KNeighborsClassifier(n_neighbors=10), "kNN")):
        print('=' * 80)
        print(name)
        results.append(benchmark(clf))

    for penalty in ["l2", "l1"]:
        print('=' * 80)
        print("%s penalty" % penalty.upper())
        # Train Liblinear model
        results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,
                                           dual=False, tol=1e-3)))
        # Train SGD model
        results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                               penalty=penalty)))

    # Train SGD with Elastic Net penalty
    print('=' * 80)
    print("Elastic-Net penalty")
    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                           penalty="elasticnet")))

    # Train NearestCentroid without threshold
    print('=' * 80)
    print("NearestCentroid (aka Rocchio classifier)")
    results.append(benchmark(NearestCentroid()))

    # Train sparse Naive Bayes classifiers
    print('=' * 80)
    print("Naive Bayes")
    results.append(benchmark(MultinomialNB(alpha=.01)))
    results.append(benchmark(BernoulliNB(alpha=.01)))

    # LinearSVC with an L1 feature-selection pass before the final L2 fit.
    class L1LinearSVC(LinearSVC):

        def fit(self, X, y):
            # The smaller C, the stronger the regularization.
            # The more regularization, the more sparsity.
            self.transformer_ = LinearSVC(penalty="l1", dual=False, tol=1e-3)
            X = self.transformer_.fit_transform(X, y)
            return LinearSVC.fit(self, X, y)

        def predict(self, X):
            X = self.transformer_.transform(X)
            return LinearSVC.predict(self, X)

    print('=' * 80)
    print("LinearSVC with L1-based feature selection")
    results.append(benchmark(L1LinearSVC()))

    # make some plots
    indices = np.arange(len(results))

    # transpose results into parallel lists of names/scores/times
    results = [[x[i] for x in results] for i in range(4)]

    clf_names, score, training_time, test_time = results
    # normalize times to [0, 1] so they share an axis with the score
    training_time = np.array(training_time) / np.max(training_time)
    test_time = np.array(test_time) / np.max(test_time)

    pl.figure(figsize=(12, 8))
    pl.title("Score")
    pl.barh(indices, score, .2, label="score", color='r')
    pl.barh(indices + .3, training_time, .2, label="training time", color='g')
    pl.barh(indices + .6, test_time, .2, label="test time", color='b')
    pl.yticks(())
    pl.legend(loc='best')
    pl.subplots_adjust(left=.25)
    pl.subplots_adjust(top=.95)
    pl.subplots_adjust(bottom=.05)

    for i, c in zip(indices, clf_names):
        pl.text(-.3, i, c)

    pl.show()
def bench(texts, y, opts):
    """Benchmark a battery of classifiers on a binary alert corpus and save
    the comparison chart as timestamped PNG + PDF files.

    Parameters
    ----------
    texts : sequence of str — raw documents.
    y : sequence — labels aligned with ``texts``.
    opts : options object with ``use_hashing``, ``n_features`` and
        ``select_chi2`` attributes (presumably from OptionParser — verify).

    NOTE(review): depends on module-level helpers defined elsewhere:
    ``logger``, ``size_mb``, ``stif_classifier_dataset.preprocessor`` and a
    9-argument ``benchmark``; also on the legacy sklearn API (``non_negative``,
    ``n_iter``, ``loss='l2'``).
    """
    # fixed split so repeated runs are comparable
    data_train, data_test, y_train, y_test = train_test_split(
        texts, y, test_size=0.33, random_state=42)
    logger.info('data loaded')

    data_train_size_mb = size_mb(data_train)
    data_test_size_mb = size_mb(data_test)

    logger.info("%d documents - %0.3fMB (training set)" % (
        len(data_train), data_train_size_mb))
    logger.info("%d documents - %0.3fMB (test set)" % (
        len(data_test), data_test_size_mb))
    logger.info("\n")

    logger.info("Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        logger.info("Use hashing")
        # stateless hashing: transform() only, no fit needed
        vectorizer = HashingVectorizer(
            preprocessor=stif_classifier_dataset.preprocessor,
            non_negative=True, n_features=opts.n_features)
        X_train = vectorizer.transform(data_train)
    else:
        vectorizer = TfidfVectorizer(
            sublinear_tf=True, max_df=0.5,
            preprocessor=stif_classifier_dataset.preprocessor)
        X_train = vectorizer.fit_transform(data_train)
    duration = time() - t0
    logger.info("done in %fs at %0.3fMB/s" % (duration,
                                              data_train_size_mb / duration))
    logger.info("n_samples: %d, n_features: %d" % X_train.shape)
    logger.info("\n")

    logger.info("Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test)
    duration = time() - t0
    logger.info("done in %fs at %0.3fMB/s" % (duration,
                                              data_test_size_mb / duration))
    logger.info("n_samples: %d, n_features: %d" % X_test.shape)
    logger.info("\n")

    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()

    if opts.select_chi2:
        logger.info("Extracting %d best features by a chi-squared test" %
                    opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [feature_names[i]
                             for i in ch2.get_support(indices=True)]
        logger.info("done in %fs" % (time() - t0))
        logger.info("\n")

    if feature_names:
        feature_names = np.asarray(feature_names)

    # binary task: index 0 = "no alert", index 1 = "alert"
    categories = ["no alert", "alert"]

    results = []
    for clf, name in (
            (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
            (Perceptron(n_iter=50), "Perceptron"),
            (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
            (KNeighborsClassifier(n_neighbors=10), "kNN"),
            (RandomForestClassifier(n_estimators=100), "Random forest")):
        logger.info('=' * 80)
        logger.info(name)
        clf_descr = str(clf).split('(')[0]
        results.append(benchmark(clf, clf_descr, X_train, y_train, X_test,
                                 y_test, categories, feature_names, opts))

    for penalty in ["l2", "l1"]:
        logger.info('=' * 80)
        logger.info("%s penalty" % penalty.upper())
        # Train Liblinear model
        clf_descr = 'LinearSVC ' + penalty
        results.append(benchmark(
            LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3),
            clf_descr, X_train, y_train, X_test, y_test, categories,
            feature_names, opts))
        # Train SGD model
        clf_descr = 'SGDClassifier ' + penalty
        results.append(benchmark(
            SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty),
            clf_descr, X_train, y_train, X_test, y_test, categories,
            feature_names, opts))

    # Train SGD with Elastic Net penalty
    logger.info('=' * 80)
    logger.info("Elastic-Net penalty")
    clf_descr = 'SGDClassifier Elastic-Net'
    results.append(benchmark(
        SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"),
        clf_descr, X_train, y_train, X_test, y_test, categories,
        feature_names, opts))

    # Train NearestCentroid without threshold
    logger.info('=' * 80)
    logger.info("NearestCentroid (aka Rocchio classifier)")
    results.append(benchmark(NearestCentroid(), 'NearestCentroid', X_train,
                             y_train, X_test, y_test, categories,
                             feature_names, opts))

    # Train sparse Naive Bayes classifiers
    logger.info('=' * 80)
    logger.info("Naive Bayes")
    results.append(benchmark(MultinomialNB(alpha=.01), 'MultinomialNB',
                             X_train, y_train, X_test, y_test, categories,
                             feature_names, opts))
    results.append(benchmark(BernoulliNB(alpha=.01), 'BernoulliNB', X_train,
                             y_train, X_test, y_test, categories,
                             feature_names, opts))

    logger.info('=' * 80)
    logger.info("LinearSVC with L1-based feature selection")
    # The smaller C, the stronger the regularization.
    # The more regularization, the more sparsity.
    results.append(benchmark(Pipeline([
        ('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
        ('classification', LinearSVC())
    ]), 'LinearSVC with feature_selection', X_train, y_train, X_test, y_test,
        categories, feature_names, opts))

    # make some plots
    indices = np.arange(len(results))

    # transpose results into parallel lists of names/scores/times
    results = [[x[i] for x in results] for i in range(4)]

    clf_names, score, training_time, test_time = results
    # normalize times to [0, 1] so they share an axis with the score
    training_time = np.array(training_time) / np.max(training_time)
    test_time = np.array(test_time) / np.max(test_time)

    # timestamped output filenames so runs never overwrite each other
    ts = time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%dT%H-%M-%S')
    pdf = PdfPages('stif_classification_' + st + '.pdf')

    fig = plt.figure(figsize=(12, 8))
    ax = plt.subplot(111)
    plt.title("Benchmark de classifieurs")
    ax.barh(indices, score, .2, label="score", color='r')
    ax.barh(indices + .3, training_time, .2, label="training time", color='g')
    ax.barh(indices + .6, test_time, .2, label="test time", color='b')
    plt.yticks(())
    plt.xticks(np.linspace(0.0, 1.0, 11, endpoint=True))
    plt.subplots_adjust(left=.25)
    plt.subplots_adjust(top=.95)
    plt.subplots_adjust(bottom=.05)

    for i, c in zip(indices, clf_names):
        ax.text(-.4, i, c)

    # Shrink current axis by 20%
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

    # Put a legend to the right of the current axis
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    # plt.show()
    fig.savefig('stif_classification_' + st + '.png')
    pdf.savefig()
    pdf.close()
def predict_and_cluster(opts, mode):
    """Vectorize a jieba-tokenized corpus, KMeans-cluster it against a
    hard-coded 25-document ground truth, then build a cosine-similarity
    graph over the documents and run GMM / best-partition analyses.

    Fixes: removed ``print(self)`` — ``self`` is undefined in this free
    function and raised NameError at runtime; dropped unused locals
    (``n_digits``, ``n_samples``, ``n_features``) and dead commented code.

    Parameters
    ----------
    opts : options object with ``use_hashing``, ``use_idf``, ``n_features``
        and ``n_components`` attributes.
    mode : unused here; kept for interface compatibility with callers.
    """
    # Ground-truth cluster labels for the fixed 25-document corpus
    # returned by jieba_tokenizer() — TODO confirm corpus ordering is stable.
    labels = np.array([0, 1, 2, 1, 1, 2, 2, 1, 2, 0, 0, 0, 1,
                       1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1])
    true_k = np.unique(labels).shape[0]
    corpus, news = jieba_tokenizer()

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        if opts.use_idf:
            # Perform an IDF normalization on the output of HashingVectorizer
            hasher = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english', non_negative=True,
                                       norm=None, binary=False)
            vectorizer = make_pipeline(hasher, TfidfTransformer())
        else:
            vectorizer = HashingVectorizer(n_features=opts.n_features,
                                           stop_words='english',
                                           non_negative=False, norm='l2',
                                           binary=False)
    else:
        vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                     min_df=2, stop_words='english',
                                     use_idf=opts.use_idf)
    X = vectorizer.fit_transform(corpus)

    print("done in %fs" % (time() - t0))
    # n_samples: how many articles are there
    # n_features: how many different words in all articles are there
    print("n_samples: %d, n_features: %d" % X.shape)
    print()

    if opts.n_components:
        print("Performing dimensionality reduction using LSA")
        t0 = time()
        # Vectorizer results are normalized, which makes KMeans behave as
        # spherical k-means for better results. Since LSA/SVD results are
        # not normalized, we have to redo the normalization.
        svd = TruncatedSVD(opts.n_components)
        lsa = make_pipeline(svd, Normalizer(copy=False))
        X = lsa.fit_transform(X)
        print("done in %fs" % (time() - t0))

        svd = TruncatedSVD().fit(X)
        X_proj = svd.transform(X)
        explained_variances = np.var(X_proj, axis=0) / np.var(X, axis=0).sum()
        print("Explained variance of the SVD step: {}%".format(
            int(explained_variances[0] * 100)))
        print()

    # =================================================
    # KMeans clustering
    print('*' * 80)
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=True)  # always better

    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(labels, km.labels_))
    # NOTE(review): silhouette is computed against the ground-truth labels,
    # not km.labels_ — looks intentional (separation of the true classes),
    # but confirm.
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, sample_size=None))

    print("labels ", labels)
    print("my_labels ", km.labels_)

    if not (opts.n_components or opts.use_hashing):
        # only the plain TF-IDF path keeps interpretable feature names
        print("Top terms per cluster:")
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        terms = vectorizer.get_feature_names()
        for i in range(true_k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()

    # attach ground-truth category to each article
    for i in range(len(news)):
        news[i].category = labels[i]

    from sklearn.metrics.pairwise import cosine_similarity

    # Fully connected graph weighted by pairwise cosine similarity.
    FG = nx.Graph()
    for i in range(len(news)):
        news[i].similarity = cosine_similarity(X[i:i + 1], X)[0]
        cs = news[i].similarity
        for j in range(len(news)):
            if i != j:
                FG.add_weighted_edges_from([(i, j, cs[j])])

    print()
    print('*' * 80)
    print(X.shape[0])
    print(X.shape)
    gmm(X)
    print()
    print('*' * 80)
    best_part(FG)
    print()
    print('*' * 80)
def main():
    """Load the configured corpus, vectorize train/test, then classify and
    plot the benchmark results."""
    # The module-level DATASET constant picks the corpus.
    if DATASET == 'SPAM':
        data_train = get_spam_dataset('corpus/english_big.txt')
        data_test = get_spam_dataset('corpus/english.txt', cleaned=False)
        categories = ['spam', 'ham']
    elif DATASET == 'NEWS':
        data_train, data_test, categories = get_news_data()
    else:
        raise ValueError('Unknown DATASET')
    print('data loaded')

    # order of labels in `target_names` can be different from `categories`
    target_names = data_train.target_names

    train_mb = size_mb(data_train.data)
    test_mb = size_mb(data_test.data)
    print("%d documents - %0.3fMB (training set)"
          % (len(data_train.data), train_mb))
    print("%d documents - %0.3fMB (test set)"
          % (len(data_test.data), test_mb))
    print("{0} categories".format(len(categories) if categories else 'all'))
    print('=' * 80 + '\n')

    # split a training set and a test set
    y_train, y_test = data_train.target, data_test.target

    print(
        "Extracting features from the training data using a sparse vectorizer")
    tic = time.time()
    if VECTORIZER == 'HASH':
        # hashing is stateless, so transform() alone suffices
        vectorizer = HashingVectorizer(stop_words='english',
                                       alternate_sign=False,
                                       n_features=N_FEATURES)
        X_train = vectorizer.transform(data_train.data)
    else:
        # both remaining vectorizers need a fit on the training corpus
        if VECTORIZER == 'TFIDF':
            vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                         stop_words='english')
        else:
            vectorizer = CountVectorizer(stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
    print("done in %fs" % (time.time() - tic))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test data using the same vectorizer")
    tic = time.time()
    X_test = vectorizer.transform(data_test.data)
    print("done in %fs" % (time.time() - tic))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # mapping from integer feature name to original token string
    # (hashed features have no recoverable names)
    feature_names = (None if VECTORIZER == 'HASH'
                     else vectorizer.get_feature_names())

    results = classify(X_train, X_test, y_train, y_test, feature_names,
                       target_names)
    plot_results(results)
def svm(num_cats, train_data, test_data):
    """Benchmark L1- and L2-penalized LinearSVC on pre-formatted text lines
    and plot score / train-time / test-time bars.

    Parameters
    ----------
    num_cats : int — number of target classes; class names are "0".."num_cats-1".
    train_data, test_data : iterables of str — one example per line.
        Assumes a fixed-width line format: the class digit at index 9 and
        the document text from index 12 onward — TODO confirm against the
        producer of these lines.

    NOTE(review): relies on module-level names defined elsewhere: ``opts``
    (OptionParser results), ``ap`` (presumably ``argparse``), plus the
    legacy sklearn API (``non_negative``, ``loss='l2'``).
    """
    # train_path = os.path.join(data_path, "train")
    # test_path = os.path.join(data_path, "test")
    data_train = {}
    data_train["data"] = []
    data_train["target"] = []
    data_train["target_names"] = [str(i) for i in range(num_cats)]
    data_test = {}
    data_test["data"] = []
    data_test["target"] = []
    # with open(train_path, 'r') as train_f, open(test_path) as test_f:
    #     train_data = train_f.readlines()
    #     test_data = test_f.readlines()
    for exam in train_data:
        # text starts at column 12; label digit sits at column 9
        data_train["data"].append(exam[12:])
        # print(exam)
        data_train["target"].append(int(exam[9]))
    for exam in test_data:
        data_test["data"].append(exam[12:])
        data_test["target"].append(int(exam[9]))

    # wrap the dicts so fields are attribute-accessible like sklearn bunches
    data_train = ap.Namespace(**data_train)
    data_test = ap.Namespace(**data_test)

    print('data loaded')

    categories = data_train.target_names    # for case categories == None

    def size_mb(docs):
        # corpus size in megabytes of UTF-8 text
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)

    print("%d documents - %0.3fMB (training set)" % (
        len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    print(data_train.data[:10])

    # split a training set and a test set
    y_train, y_test = data_train.target, data_test.target

    print("Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()

    if opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" %
              opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [feature_names[i]
                             for i in ch2.get_support(indices=True)]
        print("done in %fs" % (time() - t0))
        print()

    if feature_names:
        feature_names = np.asarray(feature_names)

    def trim(s):
        """Trim string to fit on terminal (assuming 80-column display)"""
        return s if len(s) <= 80 else s[:77] + "..."

    ###############################################################################
    # Benchmark classifiers
    def benchmark(clf):
        # Fit + predict one classifier; returns (name, accuracy, times).
        print('_' * 80)
        print("Training: ")
        print(clf)
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time: %0.3fs" % test_time)

        score = metrics.accuracy_score(y_test, pred)
        print("accuracy: %0.3f" % score)

        if hasattr(clf, 'coef_'):
            # linear models expose per-class weights we can introspect
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

            if opts.print_top10 and feature_names is not None:
                print("top 10 keywords per class:")
                for i, category in enumerate(categories):
                    top10 = np.argsort(clf.coef_[i])[-10:]
                    print(trim("%s: %s" % (category,
                                           " ".join(feature_names[top10]))))
            print()

        if opts.print_report:
            print("classification report:")
            print(metrics.classification_report(y_test, pred,
                                                target_names=categories))

        if opts.print_cm:
            print("confusion matrix:")
            print(metrics.confusion_matrix(y_test, pred))

        print()
        # classifier name = repr up to the first '('
        clf_descr = str(clf).split('(')[0]
        return clf_descr, score, train_time, test_time

    results = []
    for penalty in ["l2", "l1"]:
        print('=' * 80)
        print("%s penalty" % penalty.upper())
        # Train Liblinear model
        results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,
                                           dual=False, tol=1e-3)))

    # make some plots
    indices = np.arange(len(results))

    # transpose results into parallel lists of names/scores/times
    results = [[x[i] for x in results] for i in range(4)]

    clf_names, score, training_time, test_time = results
    # normalize times to [0, 1] so they share an axis with the score
    training_time = np.array(training_time) / np.max(training_time)
    test_time = np.array(test_time) / np.max(test_time)

    plt.figure(figsize=(12, 8))
    plt.title("Score")
    plt.barh(indices, score, .2, label="score", color='r')
    plt.barh(indices + .3, training_time, .2, label="training time", color='g')
    plt.barh(indices + .6, test_time, .2, label="test time", color='b')
    plt.yticks(())
    plt.legend(loc='best')
    plt.subplots_adjust(left=.25)
    plt.subplots_adjust(top=.95)
    plt.subplots_adjust(bottom=.05)

    for i, c in zip(indices, clf_names):
        plt.text(-.3, i, c)

    plt.show()
def main_function():
    """Command-line driver: parse options, load the XML/category data,
    vectorize it, optionally select features by chi-squared, then repeatedly
    benchmark on a training set that grows by the test set each iteration.

    NOTE(review): relies on module-level names not defined here
    (``input_from_xml``, ``extract_data``, ``benchmark``, ``pl``, ``np``,
    sklearn imports) — presumably provided at file top; verify.
    """
    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    # parse commandline arguments
    op = OptionParser()
    op.add_option("--report",
                  action="store_true", dest="print_report",
                  help="Print a detailed classification report.")
    op.add_option("--chi2_select",
                  action="store", type="int", dest="select_chi2",
                  help="Select some number of features using a chi-squared test")
    op.add_option("--confusion_matrix",
                  action="store_true", dest="print_cm",
                  help="Print the confusion matrix.")
    op.add_option("--top10",
                  action="store_true", dest="print_top10",
                  help="Print ten most discriminative terms per class"
                       " for every classifier.")
    op.add_option("--all_categories",
                  action="store_true", dest="all_categories",
                  help="Whether to use all categories or not.")
    op.add_option("--use_hashing",
                  action="store_true",
                  help="Use a hashing vectorizer.")
    op.add_option("--n_features",
                  action="store", type=int, default=2 ** 16,
                  help="n_features when using the hashing vectorizer.")
    op.add_option("--filtered",
                  action="store_true",
                  help="Remove newsgroup information that is easily overfit: "
                       "headers, signatures, and quoting.")
    (opts, args) = op.parse_args()
    if len(args) > 0:
        # NOTE(review): OptionParser.error() already exits the process, so
        # the sys.exit(1) below is unreachable in practice.
        op.error("this script takes no arguments.")
        sys.exit(1)
    print(__doc__)
    op.print_help()
    print()
    ###############################################################################
    # Load some categories from the training set
    if opts.all_categories:
        categories = None
    else:
        # Category names come from the project's XML question file.
        obj = input_from_xml()
        domains,questions = obj.fetch_input_from_xml_questions()
        categories = domains
    print("Loading Data from different categories....")
    print(categories if categories else "all")
    obj = extract_data()
    train_data,train_target,train_target_names = obj.extract_training_data()
    test_data,test_target = obj.extract_testing_data()
    print('data loaded')
    # print train_target_names

    def size_mb(docs):
        # Total size of the documents in megabytes (UTF-8 encoded).
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(train_data)
    data_test_size_mb = size_mb(test_data)
    print("%d documents - %0.3fMB (training set)" % (
        len(train_data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(test_data), data_test_size_mb))
    print("%d categories" % len(categories))
    print()
    # From here on `categories` means the label names as reported by the
    # training extractor, not the XML domains loaded above.
    categories = train_target_names
    y_train,y_test = train_target,test_target
    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        # Stateless hashing: no fit step, fixed feature space of n_features.
        vectorizer = HashingVectorizer(stop_words='english',
                                       non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(train_data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(train_data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()
    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(test_data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()
    if opts.select_chi2:
        # Keep only the k features most correlated with the labels.
        print("Extracting %d best features by a chi-squared test" %
              opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        print("done in %fs" % (time() - t0))
        print()

    def trim(s):
        """Trim string to fit on terminal (assuming 80-column display)"""
        return s if len(s) <= 80 else s[:77] + "..."

    # mapping from integer feature index to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())
    loop = 1
    # NOTE(review): this loop has no break/exit condition — it runs forever,
    # folding the test set into the training set on every pass and only
    # plotting on the first one. Presumably intentional for a live demo;
    # TODO confirm.
    while True:
        results = []
        # `benchmark` here takes the data matrices directly — defined
        # elsewhere in this file with that signature (not visible in this
        # chunk); verify against its definition.
        results.append(benchmark(X_train,y_train,X_test,y_test))
        # Grow the training pool with the test data for the next round.
        train_data = train_data + test_data
        train_target = train_target + test_target
        X_train = vectorizer.transform(train_data)
        y_train = train_target
        if(loop == 1):
            print('=' * 80)
            # make some plots (first iteration only)
            indices = np.arange(len(results))
            # Transpose [(name, score, train_t, test_t), ...] into 4 lists.
            results = [[x[i] for x in results] for i in range(4)]
            clf_names, score, training_time, test_time = results
            # Normalize times to [0, 1] for plotting on a shared axis.
            training_time = np.array(training_time) / np.max(training_time)
            test_time = np.array(test_time) / np.max(test_time)
            pl.figure(figsize=(8,4))
            pl.title("Score")
            pl.barh(indices, score, .2, label="score", color='r')
            pl.barh(indices + .3, training_time, .2, label="training time",
                    color='g')
            pl.barh(indices + .6, test_time, .2, label="test time", color='b')
            pl.yticks(())
            pl.legend(loc='best')
            pl.subplots_adjust(left=.25)
            pl.subplots_adjust(top=.95)
            pl.subplots_adjust(bottom=.05)
            for i, c in zip(indices, clf_names):
                pl.text(-.3, i, c)
            pl.show()
        loop = loop + 1
X_train = ch2.fit_transform(X_train, y_train) X_test = ch2.transform(X_test) print("done in %fs" % (time() - t0)) print() def trim(s): """Trim string to fit on terminal (assuming 80-column display)""" return s if len(s) <= 80 else s[:77] + "..." # mapping from integer feature name to original token string if opts.use_hashing: feature_names = None else: feature_names = np.asarray(vectorizer.get_feature_names()) ############################################################################### # Benchmark classifiers def benchmark(clf): print('_' * 80) print("Training: ") print(clf) t0 = time() clf.fit(X_train, y_train) train_time = time() - t0 print("train time: %0.3fs" % train_time) t0 = time() pred = clf.predict(X_test)
def extract_features(self, df, max_df=0.5, stop_words='english',
                     ngram_range=(1, 1), store_vect=True, random_state=None):
    """Extract vectorized train/test feature matrices from a dataframe.

    :param df: dataframe with column keys 'input' and 'label', where
        'input' are the contents of articles and 'label' are the desired
        results/targets
    :param max_df: See TFiDF definition: default 0.5
    :param stop_words: list of stop words for feature extraction.
        Default is english.
    :param ngram_range: range of gram to be used in type tuple.
        Default is (1, 1)
    :param store_vect: store vectorizer to pickle
    :param random_state: train_test_split param, RandomState instance or
        None, optional (default=None)
    :return x_train, x_test, y_train, y_test: vectorized sets of train and
        test with .8:.2 ratio, respectively

    Side effects: sets ``self.X_train``, ``self.X_test``, ``self.y_train``,
    ``self.y_test`` and (when feature names are available)
    ``self.feature_names``; optionally persists the fitted vectorizer.
    """
    # SPLIT TRAIN AND TEST
    print(
        "Extracting features from the training data using a sparse vectorizer"
    )
    t0 = time()
    # Split before vectorizing so the vectorizer is fitted on train data only.
    x_train, x_test, y_train, y_test = train_test_split(
        df['input'], df['label'], random_state=random_state)
    print('Training set: ', len(y_train))
    print('Testing set: ', len(y_test))
    if self.use_hashing and self.n_features:
        # Stateless hashing vectorizer: transform only, no fit.
        vectorizer = HashingVectorizer(stop_words=stop_words,
                                       alternate_sign=False,
                                       n_features=self.n_features,
                                       ngram_range=ngram_range)
        x_train = vectorizer.transform(x_train)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     max_df=max_df,
                                     stop_words=stop_words,
                                     ngram_range=ngram_range)
        x_train = vectorizer.fit_transform(x_train)
    try:
        duration = time() - t0
        print("done in %fs at %0.3fMB/s" %
              (duration,
               df['input'][:int(len(df['input']) * .8)].size / duration))
        print("n_samples: %d, n_features: %d" % x_train.shape)
        print()
    except ZeroDivisionError as err:
        # duration can be 0 on tiny inputs; report and keep going.
        print(err)
    print(
        "Extracting features from the test data using the same vectorizer")
    t0 = time()
    x_test = vectorizer.transform(x_test)
    duration = time() - t0
    try:
        print("done in %fs at %0.3fMB/s" %
              (duration,
               df['input'][int(len(df['input']) * .8):].size / duration))
        print("n_samples: %d, n_features: %d" % x_test.shape)
        print()
    except ZeroDivisionError as err:
        print(err)
    # mapping from integer feature name to original token string
    if self.use_hashing:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()
    if self.select_chi2:
        print("Extracting %d best features by a chi-squared test" %
              self.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=self.select_chi2)
        x_train = ch2.fit_transform(x_train, y_train)
        x_test = ch2.transform(x_test)
        if feature_names:
            # BUGFIX: rebind the *local* list to the selected names. The
            # original assigned the selection to self.feature_names but left
            # the local untouched, so the unconditional assignment below
            # overwrote self.feature_names with the full, unselected
            # vocabulary.
            feature_names = [
                feature_names[i] for i in ch2.get_support(indices=True)
            ]
        print("done in %fs" % (time() - t0))
        print()
    if feature_names:
        # feature_names is now the selected subset when chi2 ran, else the
        # full vocabulary; either way expose it as an array.
        self.feature_names = np.asarray(feature_names)
    self.X_train = x_train  # sparse matrix train (train_samples, n_features)
    self.X_test = x_test  # sparse matrix test (test_samples, n_features)
    self.y_train = y_train  # (train_n_tagets,)
    self.y_test = y_test  # (test_n_targets, )
    if store_vect:
        # Persist the fitted vectorizer next to article_classifier.py so the
        # exact same feature mapping can be reused at prediction time.
        base, f_name = os.path.split(
            os.path.abspath('article_classifier.py'))
        filename = base + '/' + 'vectorizer.pkl'
        joblib.dump(vectorizer, filename)
        print('Saved ', os.path.abspath(filename))
    return x_train, x_test, y_train, y_test
def wordVec():
    """Vectorize the 20-newsgroups corpus, optionally reduce it with LSA,
    cluster it with (MiniBatch)KMeans and report clustering metrics.

    Reads module-level ``opts`` (command-line options), ``categories`` and
    ``true_k``; prints all results to stdout.
    """
    dataset = fetch_20newsgroups(subset='all',
                                 categories=categories,
                                 shuffle=True,
                                 random_state=42)
    print("%d documents" % len(dataset.data))
    print("%d categories" % len(dataset.target_names))
    print()
    labels = dataset.target
    print("Extracting features from the training dataset "
          "using a sparse vectorizer")
    # BUGFIX: time.clock() was removed in Python 3.8, and the original mixed
    # time.clock(), time.time() and bare time() (a TypeError, since `time`
    # is the module here). Use time.time() consistently throughout.
    t0 = time.time()
    # BUGFIX: the original guarded these branches with argument-less calls
    # that raise at runtime (sklearn.naive_bayes.check_X_y(),
    # safe_sparse_dot(), use_idf=sklearn.metrics.roc_curve()). Restore the
    # option-driven logic of the scikit-learn document-clustering example
    # this function follows.
    if opts.use_hashing:
        if opts.use_idf:  # assumes an --use-idf option exists — TODO confirm
            # Perform an IDF normalization on the output of HashingVectorizer
            hasher = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       alternate_sign=False,
                                       norm=None)
            vectorizer = make_pipeline(hasher, TfidfTransformer())
        else:
            vectorizer = HashingVectorizer(n_features=opts.n_features,
                                           stop_words='english',
                                           alternate_sign=False,
                                           norm='l2')
    else:
        vectorizer = TfidfVectorizer(max_df=0.5,
                                     max_features=opts.n_features,
                                     min_df=2,
                                     stop_words='english',
                                     use_idf=opts.use_idf)
    X = vectorizer.fit_transform(dataset.data)
    print("done in %fs" % (time.time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)
    print()
    # BUGFIX: was `if True:` with TruncatedSVD(SGDClassifier.predict()) —
    # an unbound method passed as n_components. Gate on opts.n_components
    # (the top-terms report below already branches on it) and pass it as
    # the target dimensionality.
    if opts.n_components:
        print("Performing dimensionality reduction using LSA")
        t0 = time.time()
        svd = TruncatedSVD(opts.n_components)
        # KMeans assumes normalized samples; renormalize after the SVD.
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)
        print("done in %fs" % (time.time() - t0))
        explained_variance = svd.explained_variance_ratio_.sum()
        print("Explained variance of the SVD step: {}%".format(
            int(explained_variance * 100)))
        print()
    if opts.minibatch:
        km = MiniBatchKMeans(n_clusters=true_k,
                             init='k-means++',
                             n_init=1,
                             init_size=1000,
                             batch_size=1000,
                             verbose=opts.verbose)
    else:
        km = KMeans(n_clusters=true_k,
                    init='k-means++',
                    max_iter=100,
                    n_init=1,
                    verbose=opts.verbose)
    print("Clustering sparse data with %s" % km)
    t0 = time.time()
    km.fit(X)
    print("done in %0.3fs" % (time.time() - t0))
    print()
    # Supervised cluster-quality metrics against the true newsgroup labels.
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels,
                                                           km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels,
                                                             km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f" %
          metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(X, km.labels_, sample_size=1000))
    print()
    if not opts.use_hashing:
        # Only the Tfidf path has an inverse vocabulary to report.
        print("Top terms per cluster:")
        if opts.n_components:
            # Map centroids back from LSA space to term space first.
            original_space_centroids = svd.inverse_transform(
                km.cluster_centers_)
            order_centroids = original_space_centroids.argsort()[:, ::-1]
        else:
            order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        terms = vectorizer.get_feature_names()
        for i in range(true_k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()
def bench(texts, y, opts):
    """Vectorize ``texts``, optionally chi2-select features, benchmark a suite
    of classifiers via the module-level ``benchmark`` helper, and save a bar
    chart of scores/timings as a timestamped PNG + PDF.

    :param texts: iterable of raw document strings
    :param y: corresponding labels (binary: "no alert" / "alert")
    :param opts: parsed command-line options (use_hashing, n_features,
        select_chi2, report flags)
    """
    data_train, data_test, y_train, y_test = train_test_split(
        texts, y, test_size=0.33, random_state=42)
    logger.info('data loaded')
    data_train_size_mb = size_mb(data_train)
    data_test_size_mb = size_mb(data_test)
    logger.info("%d documents - %0.3fMB (training set)" %
                (len(data_train), data_train_size_mb))
    logger.info("%d documents - %0.3fMB (test set)" %
                (len(data_test), data_test_size_mb))
    logger.info("\n")
    logger.info(
        "Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        # Stateless hashing vectorizer: fixed feature space, no fit step.
        logger.info("Use hashing")
        vectorizer = HashingVectorizer(
            preprocessor=stif_classifier_dataset.preprocessor,
            non_negative=True,
            n_features=opts.n_features)
        X_train = vectorizer.transform(data_train)
    else:
        vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            max_df=0.5,
            preprocessor=stif_classifier_dataset.preprocessor)
        X_train = vectorizer.fit_transform(data_train)
    duration = time() - t0
    logger.info("done in %fs at %0.3fMB/s" %
                (duration, data_train_size_mb / duration))
    logger.info("n_samples: %d, n_features: %d" % X_train.shape)
    logger.info("\n")
    logger.info(
        "Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test)
    duration = time() - t0
    logger.info("done in %fs at %0.3fMB/s" %
                (duration, data_test_size_mb / duration))
    logger.info("n_samples: %d, n_features: %d" % X_test.shape)
    logger.info("\n")
    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()
    if opts.select_chi2:
        logger.info("Extracting %d best features by a chi-squared test" %
                    opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [
                feature_names[i] for i in ch2.get_support(indices=True)
            ]
        logger.info("done in %fs" % (time() - t0))
        logger.info("\n")
    if feature_names:
        feature_names = np.asarray(feature_names)
    categories = ["no alert", "alert"]
    results = []
    # First wave: assorted non-linear / margin classifiers.
    # NOTE(review): Perceptron/PassiveAggressive use the deprecated n_iter
    # keyword (pre-0.21 scikit-learn); keep as-is for this codebase's
    # sklearn version.
    for clf, name in ((RidgeClassifier(tol=1e-2, solver="lsqr"),
                       "Ridge Classifier"), (Perceptron(n_iter=50),
                                             "Perceptron"),
                      (PassiveAggressiveClassifier(n_iter=50),
                       "Passive-Aggressive"),
                      (KNeighborsClassifier(n_neighbors=10), "kNN"),
                      (RandomForestClassifier(n_estimators=100),
                       "Random forest")):
        logger.info('=' * 80)
        logger.info(name)
        clf_descr = str(clf).split('(')[0]
        # `benchmark` is the module-level helper (defined elsewhere in this
        # file) that fits, predicts and returns (descr, score, times).
        results.append(
            benchmark(clf, clf_descr, X_train, y_train, X_test, y_test,
                      categories, feature_names, opts))
    for penalty in ["l2", "l1"]:
        logger.info('=' * 80)
        logger.info("%s penalty" % penalty.upper())
        # Train Liblinear model
        clf_descr = 'LinearSVC ' + penalty
        results.append(
            benchmark(
                LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3),
                clf_descr, X_train, y_train, X_test, y_test, categories,
                feature_names, opts))
        # Train SGD model
        clf_descr = 'SGDClassifier ' + penalty
        results.append(
            benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty),
                      clf_descr, X_train, y_train, X_test, y_test, categories,
                      feature_names, opts))
    # Train SGD with Elastic Net penalty
    logger.info('=' * 80)
    logger.info("Elastic-Net penalty")
    clf_descr = 'SGDClassifier Elastic-Net'
    results.append(
        benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"),
                  clf_descr, X_train, y_train, X_test, y_test, categories,
                  feature_names, opts))
    # Train NearestCentroid without threshold
    logger.info('=' * 80)
    logger.info("NearestCentroid (aka Rocchio classifier)")
    results.append(
        benchmark(NearestCentroid(), 'NearestCentroid', X_train, y_train,
                  X_test, y_test, categories, feature_names, opts))
    # Train sparse Naive Bayes classifiers
    logger.info('=' * 80)
    logger.info("Naive Bayes")
    results.append(
        benchmark(MultinomialNB(alpha=.01), 'MultinomialNB', X_train, y_train,
                  X_test, y_test, categories, feature_names, opts))
    results.append(
        benchmark(BernoulliNB(alpha=.01), 'BernoulliNB', X_train, y_train,
                  X_test, y_test, categories, feature_names, opts))
    logger.info('=' * 80)
    logger.info("LinearSVC with L1-based feature selection")
    # The smaller C, the stronger the regularization.
    # The more regularization, the more sparsity.
    results.append(
        benchmark(
            Pipeline([('feature_selection',
                       LinearSVC(penalty="l1", dual=False, tol=1e-3)),
                      ('classification', LinearSVC())]),
            'LinearSVC with feature_selection', X_train, y_train, X_test,
            y_test, categories, feature_names, opts))
    # make some plots
    indices = np.arange(len(results))
    # Transpose [(descr, score, train_t, test_t), ...] into 4 parallel lists.
    results = [[x[i] for x in results] for i in range(4)]
    clf_names, score, training_time, test_time = results
    # Normalize times to [0, 1] so they share an axis with the scores.
    training_time = np.array(training_time) / np.max(training_time)
    test_time = np.array(test_time) / np.max(test_time)
    ts = time()
    # Timestamped output names so repeated runs don't overwrite each other.
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%dT%H-%M-%S')
    pdf = PdfPages('stif_classification_' + st + '.pdf')
    fig = plt.figure(figsize=(12, 8))
    ax = plt.subplot(111)
    plt.title("Benchmark de classifieurs")
    ax.barh(indices, score, .2, label="score", color='r')
    ax.barh(indices + .3, training_time, .2, label="training time", color='g')
    ax.barh(indices + .6, test_time, .2, label="test time", color='b')
    plt.yticks(())
    plt.xticks(np.linspace(0.0, 1.0, 11, endpoint=True))
    plt.subplots_adjust(left=.25)
    plt.subplots_adjust(top=.95)
    plt.subplots_adjust(bottom=.05)
    for i, c in zip(indices, clf_names):
        ax.text(-.4, i, c)
    # Shrink current axis by 20%
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    # Put a legend to the right of the current axis
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    # plt.show()
    fig.savefig('stif_classification_' + st + '.png')
    pdf.savefig()
    pdf.close()
def train(self):
    """Load the dataset via ``self.load()``, vectorize it, benchmark a suite
    of classifiers with full reports, plot the comparison, and return
    ``self._container``.

    NOTE(review): ``self._container`` is never assigned in this method —
    presumably populated by ``self.load()`` or elsewhere; verify.
    """
    # TODO not relevant for paper but important
    X = self.load()
    x_train, x_test, y_train, y_test, indices_train, indices_test = \
        train_test_split(
            X['data'],
            X['target'],
            range(0, len(X['data'])),
            test_size=0.2,
            random_state=42)
    print('data loaded')
    # order of labels in `target_names` can be different from `categories`
    target_names = X['target_names']

    def size_mb(docs):
        # Total size of the documents in megabytes (UTF-8 encoded).
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(x_train)
    data_test_size_mb = size_mb(x_test)
    print("%d documents - %0.3fMB (training set)" % (len(x_train),
                                                     data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (len(x_test),
                                                 data_test_size_mb))
    print("%d categories" % len(target_names))
    print()
    print(
        "Extracting features from the training data using a sparse vectorizer"
    )
    t0 = time()
    # NOTE(review): dead branch — hashing is hard-disabled here; only the
    # TfidfVectorizer path ever runs.
    if False:
        vectorizer = HashingVectorizer(stop_words='english',
                                       alternate_sign=False,
                                       n_features=2**16)
        X_train = vectorizer.transform(x_train)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(x_train)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration,
                                        data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()
    print(
        "Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(x_test)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration,
                                        data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()
    # mapping from integer feature name to original token string
    # NOTE(review): dead branch again — feature_names is always the Tfidf
    # vocabulary here.
    if False:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()
    if feature_names:
        feature_names = np.asarray(feature_names)

    def trim(s):
        """Trim string to fit on terminal (assuming 80-column display)"""
        return s if len(s) <= 80 else s[:77] + "..."

    # #############################################################################
    # Benchmark classifiers
    def benchmark(clf):
        # Fit/predict one classifier on the closed-over train/test matrices
        # and report timing, accuracy, classification report and confusion
        # matrix; returns (descr, accuracy, train_time, test_time).
        print('_' * 80)
        print("Training: ")
        print(clf)
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)
        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time: %0.3fs" % test_time)
        score = metrics.accuracy_score(y_test, pred)
        print("accuracy: %0.3f" % score)
        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))
            # NOTE(review): top-10 keyword report is hard-disabled here.
            if False and feature_names is not None:
                print("top 10 keywords per class:")
                for i, label in enumerate(target_names):
                    top10 = np.argsort(clf.coef_[i])[-10:]
                    print(
                        trim("%s: %s" % (label,
                                         " ".join(feature_names[top10]))))
            print()
        print("classification report:")
        print(
            metrics.classification_report(y_test, pred,
                                          target_names=target_names))
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))
        print()
        clf_descr = str(clf).split('(')[0]
        return clf_descr, score, train_time, test_time

    results = []
    # NOTE(review): n_iter is the deprecated (pre-0.21) scikit-learn
    # keyword; matches this file's sklearn era.
    for clf, name in ((RidgeClassifier(tol=1e-2, solver="lsqr"),
                       "Ridge Classifier"), (Perceptron(n_iter=50),
                                             "Perceptron"),
                      (PassiveAggressiveClassifier(n_iter=50),
                       "Passive-Aggressive"),
                      (KNeighborsClassifier(n_neighbors=10), "kNN"),
                      (RandomForestClassifier(n_estimators=100),
                       "Random forest")):
        print('=' * 80)
        print(name)
        results.append(benchmark(clf))
    for penalty in ["l2", "l1"]:
        print('=' * 80)
        print("%s penalty" % penalty.upper())
        # Train Liblinear model
        results.append(
            benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))
        # Train SGD model
        results.append(
            benchmark(
                SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty)))
    # Train SGD with Elastic Net penalty
    print('=' * 80)
    print("Elastic-Net penalty")
    results.append(
        benchmark(
            SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")))
    # Train NearestCentroid without threshold
    print('=' * 80)
    print("NearestCentroid (aka Rocchio classifier)")
    results.append(benchmark(NearestCentroid()))
    # Train sparse Naive Bayes classifiers
    print('=' * 80)
    print("Naive Bayes")
    results.append(benchmark(MultinomialNB(alpha=.01)))
    results.append(benchmark(BernoulliNB(alpha=.01)))
    print('=' * 80)
    print("LinearSVC with L1-based feature selection")
    # The smaller C, the stronger the regularization.
    # The more regularization, the more sparsity.
    results.append(
        benchmark(
            Pipeline([('feature_selection',
                       SelectFromModel(
                           LinearSVC(penalty="l1", dual=False, tol=1e-3))),
                      ('classification', LinearSVC(penalty="l2"))])))
    # make some plots
    indices = np.arange(len(results))
    # Transpose [(descr, score, train_t, test_t), ...] into 4 parallel lists.
    results = [[x[i] for x in results] for i in range(4)]
    clf_names, score, training_time, test_time = results
    # Normalize times to [0, 1] so they share an axis with the scores.
    training_time = np.array(training_time) / np.max(training_time)
    test_time = np.array(test_time) / np.max(test_time)
    plt.figure(figsize=(12, 8))
    plt.title("Score")
    plt.barh(indices, score, .2, label="score", color='navy')
    plt.barh(indices + .3, training_time, .2, label="training time",
             color='c')
    plt.barh(indices + .6, test_time, .2, label="test time",
             color='darkorange')
    plt.yticks(())
    plt.legend(loc='best')
    plt.subplots_adjust(left=.25)
    plt.subplots_adjust(top=.95)
    plt.subplots_adjust(bottom=.05)
    for i, c in zip(indices, clf_names):
        plt.text(-.3, i, c)
    plt.show()
    return self._container
def train(opts):
    """Train a LinearSVC on recipe-property data prepared by ``utils``,
    optionally chi2-select features, benchmark it, and pickle the model and
    vectorizer to the paths given in ``opts``.

    :param opts: parsed options (prop_name, index_file, use_hashing,
        n_features, select_chi2, model_file, vect_file)
    """
    print("Loading data...")
    # ratio of negative to positive examples; set to 1 here (the original
    # comment claimed "twice" — TODO confirm which is intended)
    neg2pos_ratio = 1
    train2all_ratio = 0.5  #20
    core_prop_names = ["recipeInstructions", "ingredients"]
    data_train, data_test = utils.prepare_data(opts.prop_name,
                                               core_prop_names,
                                               opts.index_file,
                                               neg2pos_ratio,
                                               train2all_ratio)
    categories = ['negative', 'positive']

    def size_mb(docs):
        # Total size of the documents in megabytes (UTF-8 encoded).
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)
    print("%d documents - %0.3fMB (training set)" % (len(data_train.data),
                                                     data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (len(data_test.data),
                                                 data_test_size_mb))
    print("%d categories" % len(categories))
    print()
    # split a training set and a test set
    y_train, y_test = data_train.target, data_test.target
    if opts.use_hashing:
        # Stateless hashing vectorizer: fixed feature space, no fit step.
        vectorizer = HashingVectorizer(stop_words='english',
                                       non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print("Extracting features from the test data using the same vectorizer")
    X_test = vectorizer.transform(data_test.data)
    print("n_samples: %d, n_features: %d \n" % X_test.shape)
    '''
    #Print training data for test purpose
    trainf = open("train_test.txt", "w")
    for item in data_train.data:
        trainf.write(item + "\n")
    trainf.close()
    #end test
    '''
    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()
    if opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" %
              opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [
                feature_names[i] for i in ch2.get_support(indices=True)
            ]
        print("done in %fs" % (time() - t0))
        print()
    if feature_names:
        feature_names = np.asarray(feature_names)
    clf = LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3)
    # `benchmark` is the module-level helper (defined elsewhere in this file)
    # that fits `clf` and reports its scores.
    benchmark(clf, X_train, y_train, X_test, y_test, opts)
    #Saving the model and vectorizer
    if opts.model_file:
        with open(opts.model_file, "wb") as f:
            pickle.dump(clf, f)
    if opts.vect_file:
        with open(opts.vect_file, "wb") as f:
            pickle.dump(vectorizer, f)
""" print() cluster_colors = dict() cluster_name_map = dict() if not opts.use_hashing: print("Top terms per cluster:") if opts.n_components: original_space_centroids = svd.inverse_transform(km.cluster_centers_) order_centroids = original_space_centroids.argsort()[:, ::-1] else: order_centroids = km.cluster_centers_.argsort()[:, ::-1] cluster_name = [] terms = vectorizer.get_feature_names() for i in range(num_cluster): print("Cluster %d:" % i, end='') cluster_colors[i] = randomcolor() name = ' ' for ind in order_centroids[i, :10]: name += ' ' + terms[ind] print(' %s' % terms[ind], end='') cluster_name.append(name) cluster_name_map[i] = name.split(",") print() cluster_predicit = km.predict(X) # ================ Update columns cluster in database ============================ for index, row in news_df.iterrows():
print("n_samples: %d, n_features: %d" % X_train.shape) print() print("Extracting features from the test data using the same vectorizer") t0 = time() X_test = vectorizer.transform(data_test_data) duration = time() - t0 print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration)) print("n_samples: %d, n_features: %d" % X_test.shape) print() # mapping from integer feature name to original token string if opts.use_hashing: feature_names = None else: feature_names = vectorizer.get_feature_names() if opts.select_chi2: print("Extracting %d best features by a chi-squared test" % opts.select_chi2) t0 = time() ch2 = SelectKBest(chi2, k=opts.select_chi2) X_train = ch2.fit_transform(X_train, y_train) X_test = ch2.transform(X_test) if feature_names: # keep selected feature names feature_names = [feature_names[i] for i in ch2.get_support(indices=True)] print("done in %fs" % (time() - t0)) print()
def svm(num_cats, train_data, test_data):
    """Parse pre-formatted example lines into train/test sets, vectorize,
    optionally chi2-select features, benchmark LinearSVC with L2 and L1
    penalties and plot the comparison.

    :param num_cats: number of label categories (names are "0".."num_cats-1")
    :param train_data: iterable of example lines; the label digit is at
        fixed offset 9 and the text starts at offset 12 — assumes a fixed
        line layout, TODO confirm against the data writer
    :param test_data: iterable of test example lines, same layout
    """
    # train_path = os.path.join(data_path, "train")
    # test_path = os.path.join(data_path, "test")
    data_train = {}
    data_train["data"] = []
    data_train["target"] = []
    data_train["target_names"] = [str(i) for i in range(num_cats)]
    data_test = {}
    data_test["data"] = []
    data_test["target"] = []
    # with open(train_path, 'r') as train_f, open(test_path) as test_f:
    #     train_data = train_f.readlines()
    #     test_data = test_f.readlines()
    for exam in train_data:
        data_train["data"].append(exam[12:])
        # print(exam)
        data_train["target"].append(int(exam[9]))
    for exam in test_data:
        data_test["data"].append(exam[12:])
        data_test["target"].append(int(exam[9]))
    # Wrap the dicts so fields read as attributes (data_train.data, ...),
    # mimicking sklearn's Bunch objects.
    data_train = ap.Namespace(**data_train)
    data_test = ap.Namespace(**data_test)
    print('data loaded')
    categories = data_train.target_names  # for case categories == None

    def size_mb(docs):
        # Total size of the documents in megabytes (UTF-8 encoded).
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)
    print("%d documents - %0.3fMB (training set)" % (len(data_train.data),
                                                     data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (len(data_test.data),
                                                 data_test_size_mb))
    print("%d categories" % len(categories))
    # NOTE(review): debug dump of the first 10 training documents.
    print(data_train.data[:10])
    # split a training set and a test set
    y_train, y_test = data_train.target, data_test.target
    print(
        "Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        # Stateless hashing vectorizer: fixed feature space, no fit step.
        vectorizer = HashingVectorizer(stop_words='english',
                                       non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration,
                                        data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()
    print("Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration,
                                        data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()
    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()
    if opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" %
              opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [
                feature_names[i] for i in ch2.get_support(indices=True)
            ]
        print("done in %fs" % (time() - t0))
        print()
    if feature_names:
        feature_names = np.asarray(feature_names)

    def trim(s):
        """Trim string to fit on terminal (assuming 80-column display)"""
        return s if len(s) <= 80 else s[:77] + "..."

    ###############################################################################
    # Benchmark classifiers
    def benchmark(clf):
        # Fit/predict one classifier on the closed-over train/test matrices,
        # print the reports requested via opts, and return
        # (descr, accuracy, train_time, test_time).
        print('_' * 80)
        print("Training: ")
        print(clf)
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)
        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time: %0.3fs" % test_time)
        score = metrics.accuracy_score(y_test, pred)
        print("accuracy: %0.3f" % score)
        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))
            if opts.print_top10 and feature_names is not None:
                print("top 10 keywords per class:")
                for i, category in enumerate(categories):
                    top10 = np.argsort(clf.coef_[i])[-10:]
                    print(
                        trim("%s: %s" % (category,
                                         " ".join(feature_names[top10]))))
            print()
        if opts.print_report:
            print("classification report:")
            print(
                metrics.classification_report(y_test, pred,
                                              target_names=categories))
        if opts.print_cm:
            print("confusion matrix:")
            print(metrics.confusion_matrix(y_test, pred))
        print()
        clf_descr = str(clf).split('(')[0]
        return clf_descr, score, train_time, test_time

    results = []
    for penalty in ["l2", "l1"]:
        print('=' * 80)
        print("%s penalty" % penalty.upper())
        # Train Liblinear model
        results.append(
            benchmark(
                LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)))
    # make some plots
    indices = np.arange(len(results))
    # Transpose [(descr, score, train_t, test_t), ...] into 4 parallel lists.
    results = [[x[i] for x in results] for i in range(4)]
    clf_names, score, training_time, test_time = results
    # Normalize times to [0, 1] so they share an axis with the scores.
    training_time = np.array(training_time) / np.max(training_time)
    test_time = np.array(test_time) / np.max(test_time)
    plt.figure(figsize=(12, 8))
    plt.title("Score")
    plt.barh(indices, score, .2, label="score", color='r')
    plt.barh(indices + .3, training_time, .2, label="training time",
             color='g')
    plt.barh(indices + .6, test_time, .2, label="test time", color='b')
    plt.yticks(())
    plt.legend(loc='best')
    plt.subplots_adjust(left=.25)
    plt.subplots_adjust(top=.95)
    plt.subplots_adjust(bottom=.05)
    for i, c in zip(indices, clf_names):
        plt.text(-.3, i, c)
    plt.show()