def run(analyzer, ngrams, lowercase, stop_words, max_df, min_df, norm,
        use_idf, smooth_idf, sublinear_tf):
    """Train an SVC on TF-IDF features of the News data set and score it.

    All ten arguments are forwarded, positionally and in this order, to the
    ``TFIDF`` featurizer constructor; this function does not interpret them.
    NOTE(review): the names presumably mirror scikit-learn's
    ``TfidfVectorizer`` options -- confirm against ``TFIDF``.

    Side effects: prints the test accuracy computed with ``accuracy_score``
    and the wall-clock time spent on training plus testing (featurization is
    not included in the timing).

    Returns:
        Whatever ``stat_model.model.score(X_test, y_test)`` returns for the
        trained model on the featurized test split.
    """

    def train_model(X_train, y_train, stat_model):
        """Fit ``stat_model`` on the given split with fixed hyperparameters."""
        # The hyperparameter search (optimize_hyperparameters / best_params)
        # was disabled by the original author; these values are hard-coded.
        params = {'kernel': 'rbf', 'C': 10, 'probability': True, 'gamma': 0.1}
        stat_model.train(X_train, y_train, params)

    def test_model(stat_model, X_test, y_test, fscore_list):
        """Append the accuracy on the test split to ``fscore_list``; print it."""
        y_true, y_pred = y_test, stat_model.predict(X_test)
        fscore_list.append(accuracy_score(y_true, y_pred))
        print(fscore_list[-1])

    # Load the raw train/test splits (Spam() was the commented-out alternative).
    data = News()
    data_train_x, data_train_y = data.get_train()
    data_test_x, data_test_y = data.get_test()

    # Turn both splits into feature matrices using the caller's settings.
    featurizer = TFIDF(analyzer, ngrams, lowercase, stop_words, max_df,
                       min_df, norm, use_idf, smooth_idf, sublinear_tf)
    (x, y, X_test, y_test) = featurizer.featurize(
        data_train_x, data_train_y, data_test_x, data_test_y)

    # NaiveBayes() was the commented-out alternative model.
    stat_model = SVC_Model()
    # Seeded with 0 as in the original; only the appended entry is printed.
    accuracy_list = [0]

    # The timer starts here, so it covers a single train + test pass only.
    start_time = time.time()
    train_model(x, y, stat_model)
    test_model(stat_model, X_test, y_test, accuracy_list)
    print("--- %s seconds ---" % (time.time() - start_time))

    return stat_model.model.score(X_test, y_test)
#params = {'kernel': 'rbf', 'C': 1, 'probability': True, 'gamma': 0.001} #with scale params = {'alpha': 0.01} print params stat_model.train(X_train, y_train, params) def test_model(stat_model, X_test, y_test, fscore_list): y_true, y_pred = y_test, stat_model.predict(X_test) fscore_list.append(accuracy_score(y_true, y_pred)) print(fscore_list[-1]) # data data = News() #data = Spam() data_train_x, data_train_y = data.get_train() data_test_x, data_test_y = data.get_test() random_seed = 42 # specify TF-IDF featurizer = TFIDF() (x, y, X_test, y_test) = featurizer.featurize(data_train_x, data_train_y, data_test_x, data_test_y) print x.shape[0] ''' y_encoder = LabelBinarizer() y_encoder.fit(y)
train_start_time = time.time() stat_model.partial_train(X_train, y_train, params) traintime.append((time.time() - train_start_time)) def test_model(stat_model, X_test, y_test, fscore_list): y_true, y_pred = y_test, stat_model.predict(X_test) predictive_accuracy = np.mean(y_pred == y_true) fscore_list.append(predictive_accuracy) print(predictive_accuracy) # load data # data data = News() #data = Spam() data_train_x, data_train_y = data.get_train() data_test_x, data_test_y = data.get_test() random_seed = 42 #print newsgroups_train featurizer = TFIDF() (x, y, X_test, y_test) = featurizer.featurize(data_train_x, data_train_y, data_test_x, data_test_y) all_list = [] time_list_all = []