import pickle

import numpy as np
from sklearn import metrics, svm

from projet_sentiment_analysis.code.utilities import extract_data


def unigram_selection(i, path_to_training_set, path_to_pickle):
    # Load the pre-computed unigram feature matrices for split i.
    (X_train, y_train, X_test, y_test,
     number_training, number_testing) = extract_data.extract_training_and_testing_set(
        path_to_training_set + 'metrics_training_set_%d.data' % i,
        path_to_training_set + 'metrics_testing_set_%d.data' % i)
    print(len(X_train[0]))

    # Linear-kernel SVM; degree, gamma and coef0 are ignored by this kernel
    # but are kept as explicit defaults.
    clf = svm.SVC(C=1, cache_size=2000, class_weight=None, coef0=0.0,
                  degree=3, gamma=0.1, kernel='linear', max_iter=-1,
                  probability=False, shrinking=True, tol=0.001, verbose=False)
    clf.fit(X_train, y_train)

    print()
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set: %d" % number_training)
    print("The scores are computed on the full evaluation set: %d" % number_testing)
    print()

    y_true = y_test
    y_prediction = clf.predict(X_test)
    print(metrics.classification_report(y_true, y_prediction))

    # Persist the true/predicted label pairs so scores can be recomputed later.
    clf_metrics = np.vstack((y_true, y_prediction))
    with open(path_to_pickle + '60000_all_features_%d.pkl' % i, 'wb') as fid:
        pickle.dump(clf_metrics, fid)
    print()
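# A minimal driver sketch (hypothetical): neither the split indices nor the
# entry point appear in the original file. The paths and the 1000/7000 file
# suffixes are taken from the companion scripts below; adjust to whichever
# splits actually exist under training_set_60000.
if __name__ == "__main__":
    general_path = '/Users/pierregagliardi/DossierTravail/Programmation/PythonPath/projet_sentiment_analysis/'
    path_to_training_set = general_path + 'training_set_60000/training_set_unigram_all_features/'
    path_to_pickle = general_path + 'pickle_hyper_parameters/'
    for i in (1000, 7000):  # assumed split indices
        unigram_selection(i, path_to_training_set, path_to_pickle)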
__author__ = 'pierregagliardi'

import pickle

from sklearn.feature_selection import VarianceThreshold

from projet_sentiment_analysis.code.utilities import extract_data

if __name__ == "__main__":
    general_path = '/Users/pierregagliardi/DossierTravail/Programmation/PythonPath/projet_sentiment_analysis/'
    path_to_training_set = general_path + 'training_set_60000/training_set_unigram_all_features/'
    path_to_pickle = general_path + 'pickle_hyper_parameters/'

    (X_train, y_train, X_test, y_test,
     number_training, number_testing) = extract_data.extract_training_and_testing_set(
        path_to_training_set + 'metrics_training_set_7000.data',
        path_to_training_set + 'metrics_testing_set_7000.data')

    # Drop near-constant features: a 0/1 feature has variance p * (1 - p), so
    # this threshold removes any feature that takes the same value in more
    # than 99.9% of the samples.
    sel = VarianceThreshold(threshold=(.999 * (1 - .999)))
    X_train = sel.fit_transform(X_train)
    X_test = sel.transform(X_test)  # reuse the mask fitted on the training set

    with open(path_to_pickle + 'metrics_60000_all_features_7000.pkl', 'wb') as fid:
        pickle.dump((X_train, y_train, X_test, y_test), fid)
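# Toy illustration (not project data) of the threshold above: with
# threshold = .999 * (1 - .999) ~= 0.000999, a binary feature survives only
# if its minority value occurs in more than roughly 0.1% of the samples.
import numpy as np
from sklearn.feature_selection import VarianceThreshold

X_toy = np.zeros((2000, 3))
X_toy[:100, 0] = 1   # 5% minority  -> variance 0.0475,   kept
X_toy[:, 1] = 1      # constant     -> variance 0,        removed
X_toy[0, 2] = 1      # 1 in 2000    -> variance ~0.0005,  removed
print(VarianceThreshold(threshold=.999 * (1 - .999)).fit_transform(X_toy).shape)
# -> (2000, 1)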
    print(os.path.split(__file__)[1] +
          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    print("Parameters are %s, %s with a score of %0.2f" % (C, gamma, score))
    return metrics.accuracy_score(y_test, y_prediction)


if __name__ == "__main__":
    ####### home computer path ######
    general_path = '/Users/pierregagliardi/DossierTravail/Programmation/PythonPath/projet_sentiment_analysis/'
    path_to_training_set = general_path + 'training_set_60000/training_set_unigram_all_features/'
    path_to_pickle = general_path + 'pickle_hyper_parameters/'

    (X_train, y_train, X_test, y_test,
     number_training, number_testing) = extract_data.extract_training_and_testing_set(
        path_to_training_set + 'metrics_training_set_1000.data',
        path_to_training_set + 'metrics_testing_set_1000.data')

    # Check how many CPUs are available to parallelise the task.
    print(multiprocessing.cpu_count())

    # For an initial search, a logarithmic grid with basis 10 is often
    # helpful. Using a basis of 2, a finer tuning can be achieved but at a
    # much higher cost.
    C_range = np.logspace(0, 3, 4)        # C in {1, 10, 100, 1000}
    gamma_range = np.logspace(-3, -1, 3)  # gamma in {0.001, 0.01, 0.1}
    param_grid = dict(gamma=gamma_range, C=C_range)
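    # The original script is cut off after param_grid is built; what follows
    # is a minimal sketch of how the grid could be consumed. Assumptions not
    # in the original: an RBF-kernel SVC (gamma only matters for non-linear
    # kernels), 5-fold CV, and sklearn.model_selection.GridSearchCV.
    from sklearn import svm
    from sklearn.model_selection import GridSearchCV

    grid = GridSearchCV(svm.SVC(kernel='rbf'), param_grid=param_grid,
                        cv=5, n_jobs=multiprocessing.cpu_count())
    grid.fit(X_train, y_train)
    print("Parameters are %s with a score of %0.2f"
          % (grid.best_params_, grid.best_score_))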