def mlp_classifier(self):
    def get_hidden_layers():
        x = [64, 128, 256]
        hl = []
        for i in range(1, len(x)):
            hl.extend([p for p in itertools.product(x, repeat=i + 1)])
        return hl

    clf = MLPClassifier(solver='adam', alpha=1e-5, early_stopping=True,
                        random_state=self.random_state)
    hidden_layer_sizes = get_hidden_layers()
    params = {'hidden_layer_sizes': hidden_layer_sizes}
    mdl = GridSearch(model=clf, param_grid=params)
    mdl.fit(self.x_train, self.y_train, self.x_val, self.y_val)
    self.reports['mlp'] = self.get_results(mdl)
    model_path = os.path.join(MODELS, dir_name, file_name + '.mlp')
    joblib.dump(mdl, model_path)
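# For reference, get_hidden_layers() above enumerates every 2- and 3-layer
# architecture over the widths (64, 128, 256): 9 two-layer shapes such as
# (64, 128) plus 27 three-layer shapes such as (128, 64, 256), i.e. 36
# candidate hidden_layer_sizes values in total.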
def test_gridsearch_crossval(
    model=SVC(random_state=0),
    return_model=False,
    param_grid=None,
    opt_score=0.9298,
    assertions=True,
    scoring=None,
    verbose=False,
):
    data = load_breast_cancer()

    # Create test and train sets from one dataset
    X_train, X_test, y_train, y_test = train_test_split(
        data["data"],
        data["target"],
        test_size=0.3,
        random_state=0,
        stratify=data["target"],
    )

    # List the parameters to search across
    if param_grid is None:
        param_grid = {
            'C': [1, 10, 100, 120, 150],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf'],
        }

    # Grid-search all parameter combinations WITHOUT a validation set.
    gs = GridSearch(
        model=model,
        param_grid=param_grid,
    )
    gs.fit(X_train, y_train, scoring=scoring, verbose=False)

    # Compare with default model without hyperopt
    default = SVC(random_state=0)
    default.fit(X_train, y_train)
    default_score = round(default.score(X_test, y_test), 4)
    gs_score = round(gs.score(X_test, y_test), 4)
    if verbose:
        print('Default score:', default_score, '| GridSearch Score:', gs_score)
    if assertions:
        assert (gs_score == opt_score)
    if return_model:
        return gs
def rf_regressor(self):
    clf = RandomForestRegressor(random_state=self.random_state)
    params = {
        'n_estimators': [int(x) for x in np.linspace(start=100, stop=800, num=8)],
        'max_features': ['auto', 'sqrt', 'log2'],
    }
    mdl = GridSearch(model=clf, param_grid=params)
    print('Fitting Rf')
    mdl.fit(self.x_train, self.y_train, self.x_val, self.y_val)
    self.reports['rf'] = self.get_results(mdl)
    model_path = os.path.join(MODELS, dir_name, file_name + '.rf')
    joblib.dump(mdl, model_path)
def xgb_classifier(self):
    clf = xgb.XGBClassifier(random_state=self.random_state)
    params = {
        'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.3, 0.5],
        'n_estimators': [int(x) for x in np.linspace(start=50, stop=800, num=16)],
        'colsample_bytree': [i / 10.0 for i in range(3, 11)],
    }
    mdl = GridSearch(model=clf, param_grid=params)
    print('Fitting XGBoost')
    # Fit once, using the held-out validation set for hyperparameter selection.
    mdl.fit(self.x_train, self.y_train, self.x_val, self.y_val)
    self.reports['xgb'] = self.get_results(mdl)
    model_path = os.path.join(MODELS, dir_name, file_name + '.xgb')
    joblib.dump(mdl, model_path)
def test_prob_methods():
    data = load_breast_cancer()

    # Create test and train sets from one dataset
    X_train, X_test, y_train, y_test = train_test_split(
        data["data"],
        data["target"],
        test_size=0.3,
        random_state=0,
        stratify=data["target"],
    )

    # List the parameters to search across
    param_grid = {'C': [1, 10, 100, 120, 150]}

    # Grid-search all parameter combinations without a validation set.
    model = GridSearch(
        model=LogisticRegression(),
        param_grid=param_grid,
    )
    model.fit(X_train, y_train, verbose=False)

    assert (model.predict(X_test) is not None)
    assert (model.predict_proba(X_test) is not None)
class Prediction:
    def __init__(self, data, model, prefix, param_grid=[]):
        self.train_df, self.test_df = data
        self.model = model
        self.param_grid = param_grid
        self.prefix = prefix + datetime.now().strftime('%m-%d-%H:%M')
        self.X = self.train_df.loc[:, self.train_df.columns != 'precio']
        self.y = self.train_df['precio'].values
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.X, self.y, test_size=0.1, random_state=1)

    def manualGridSearch(self):
        self.best_score = math.inf
        for g in self.param_grid:
            print(g)
            self.model.set_params(**g)
            self.model.fit(self.X_train, self.y_train)
            score = mean_absolute_error(self.model.predict(self.X_val), self.y_val)
            print(score)
            # save if best
            if score < self.best_score:
                self.best_score = score
                self.best_grid = g

    def gridSearchTrain(self):
        print('Training...')
        self.gscv = GridSearchCV(self.model, self.param_grid,
                                 scoring='neg_mean_absolute_error', verbose=10)
        self.gscv.fit(self.X_train, self.y_train)
        self.best_params = self.gscv.best_params_
        self.score = self.gscv.best_score_
        self.predicted = self.gscv.predict(self.test_df)
        print(self.best_params)
        print(self.score)

    def HypOptTrain(self):
        print('Training...')
        self.opt = GridSearch(model=self.model, param_grid=self.param_grid)
        self.opt.fit(self.X_train, self.y_train, self.X_val, self.y_val,
                     scoring='neg_mean_squared_error')
        self.best_params = self.opt.best_params_
        self.score = self.opt.score(self.X_val, self.y_val)
        self.predicted = self.opt.predict(self.test_df)
        print(self.best_params)
        print(self.score)

    def train(self):
        print('Training...')
        self.model.fit(self.X_train, self.y_train)
        self.score = mean_absolute_error(self.model.predict(self.X_val), self.y_val)
        print(self.score)
        self.predicted = self.model.predict(self.test_df)

    def crossValidation(self, cv=5):
        cv_scores = cross_val_score(
            self.model, self.X, self.y, cv=cv,
            scoring='neg_mean_absolute_error'
        )
        # print each cv score (accuracy) and average them
        self.score = np.mean(cv_scores)
        print(self.score)

    def save(self):
        if self.param_grid == []:
            with open('{}.model'.format(self.prefix), 'wb') as f:
                pickle.dump(self.model, f)
        else:
            with open('{}.model'.format(self.prefix), 'wb') as f:
                pickle.dump(self.gscv, f)

    def submit(self):
        self.test_ids = pd.read_csv('data/test.csv')['id']
        answer = pd.DataFrame(list(zip(self.test_ids, self.predicted)),
                              columns=['id', 'target'])
        answer.to_csv('{}-{}.csv'.format(self.prefix, int(round(self.score))),
                      sep=',', index=False)
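# A minimal usage sketch for the Prediction class above. Hedged assumptions:
# the 'data/train.csv' path, the RandomForestRegressor choice, and the grid
# values are illustrative and not part of the original code; only the 'precio'
# target column (and an 'id' column in the test file) is required by the class.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

train_df = pd.read_csv('data/train.csv')                     # assumed training file
test_df = pd.read_csv('data/test.csv').drop(columns=['id'])  # assumed test file
pred = Prediction((train_df, test_df), RandomForestRegressor(), 'rf-',
                  param_grid={'n_estimators': [100, 300], 'max_depth': [None, 10]})
pred.gridSearchTrain()   # scikit-learn GridSearchCV path
pred.save()              # pickles the fitted search object
pred.submit()            # writes '<prefix>-<score>.csv'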
# for feature in features:
#     print(feature)
# print('feature counts: {0}'.format(len(features)))

X_train = vect_val.transform(X_train)
X_val = vect_val.transform(X_val)

print('******** GridSearch ********')
param_grid = {
    'n_estimators': [40, 60, 80, 100, 120],
    'learning_rate': [0.1, 0.15, 0.2],
    'max_depth': [6, 7, 8, 9, 10]
}
scorer = make_scorer(f2)
gs = GridSearch(model=GradientBoostingClassifier())
gs.fit(X_train, y_train, param_grid, X_val, y_val, scoring=scorer)
print('params: ', gs.get_best_params())
print('Test Score for Optimized Parameters:', gs.score(X_val, y_val))

# print('******** GradientBoostingClassifier ********')
# gb = GradientBoostingClassifier(learning_rate=0.1, n_estimators=80, max_depth=7)
# gb, preds_train, preds = train_and_predict(gb, X_train, y_train, X_val)
# print_scores(y_val, preds, y_train, preds_train)
#
# print('******** AdamBoostingClassifier ********')
# ada = AdaBoostClassifier()
# ada, preds_train, preds = train_and_predict(ada, X_train, y_train, X_val)
# print_scores(y_val, preds, y_train, preds_train)
#
# print('******** XgBoostClassifier ********')
# xgb = XGBClassifier()
# Create a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=0,
    stratify=y_train,
)

# List the parameters to search across
param_grid = {
    'C': [1, 10, 100, 120, 150],
    'gamma': [0.001, 0.0001],
    'kernel': ['rbf'],
}

# Grid-search all parameter combinations using a validation set.
gs = GridSearch(model=SVC(random_state=0), param_grid=param_grid, parallelize=False)
# You can choose the metric to optimize (f1, auc_roc, accuracy, etc.)
# scoring = None will default to optimizing model.score()
_ = gs.fit(X_train, y_train, X_val, y_val, scoring='f1')

# Compare with default model without hyperopt
default = SVC(random_state=0)
_ = default.fit(X_train, y_train)

print('\nTest score comparison (larger is better):')
print('Non-optimized Parameters:', round(default.score(X_test, y_test), 4))
print('Optimized Parameters:', round(gs.score(X_test, y_test), 4))
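# To also inspect which settings won, the accessors used elsewhere in these
# snippets can be printed directly (a small optional follow-up, not part of
# the original example):
print('Best params found:', gs.get_best_params())
print('Best validation score:', gs.get_best_score())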
print("Length of Train/Test:",len(X_train),len(X_dev)) params = { 'lr' : np.random.uniform(0,1,[5]).tolist() + [1.0], 'ada' : np.random.uniform(0,1,[5]).tolist() + [1.0], 'et' : np.random.uniform(0,1,[5]).tolist() + [1.0], 'xgb' : np.random.uniform(0,1,5).tolist() + [1.0], 'gb' : np.random.uniform(0,1,[5]).tolist() + [1.0], 'rf' : np.random.uniform(0,1,[5]).tolist() + [1.0], 'br' : np.random.uniform(0,1,5).tolist() + [1.0], 'lasso' : np.random.uniform(0,1,[5]).tolist() + [1.0], 'lrf' : [math.floor,math.ceil,avg], 'adaf' : [math.floor,math.ceil,avg], 'etf' : [math.floor,math.ceil,avg], 'xgbf' : [math.floor,math.ceil,avg], 'gbf' : [math.floor,math.ceil,avg], 'rff' :[math.floor,math.ceil,avg], 'brf' : [math.floor,math.ceil,avg], 'lassof' : [math.floor,math.ceil,avg], } model = GridSearch(N2C2Classifier(),param_grid=params) model.fit(X_train,y_train,X_dev,y_dev,verbose=True) print(model.get_best_params()) print(model.get_best_score())
    random_state=0,
)

# Create a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=0,
)

# List the parameters to search across
param_grid = {
    'C': [1, 10, 100, 120, 150],
    'gamma': [0.001, 0.0001],
    'kernel': ['rbf'],
}

# Grid-search all parameter combinations using a validation set.
gs = GridSearch(model=SVR())
# Choose the metric to optimize (r2, explained_variance, etc.)
# scoring = None will default to optimizing model.score()
_ = gs.fit(X_train, y_train, param_grid, X_val, y_val, scoring='r2')

# Compare with default model without hyperopt
default = SVR()
_ = default.fit(X_train, y_train)

print('\nTest score comparison (larger is better):')
print('Non-optimized Parameters:', round(default.score(X_test, y_test), 4))
print('Optimized Parameters:', round(gs.score(X_test, y_test), 4))
class MachineLearning:
    """Machine learning class to run an sklearn-like pipeline on MethylationArray data.
    Initialize the object with a scikit-learn model, and optionally supply a
    hyperparameter search grid.

    model
        Scikit-learn-like model: classification, regression, dimensionality
        reduction, clustering, etc.
    options
        Options to supply to the model, in the form of a dictionary.
    grid
        Alternatively, supply a search grid to search for the best hyperparameters.
    labelencode
        T/F encode string labels.
    n_eval
        Number of evaluations for randomized grid search; if set to 0, perform
        an exhaustive grid search.
    """

    def __init__(self, model, options, grid={}, labelencode=False, n_eval=0):
        if grid:
            self.model = GridSearch(
                model=model(),
                param_grid=grid,
                num_random_search=None if not n_eval else n_eval)
            self.param_grid_exists = True
            self.grid = grid
        else:
            self.model = model(**options)
            self.param_grid_exists = False
        if labelencode:
            self.encoder = LabelEncoder()
        else:
            self.encoder = None

    def fit(self, train_methyl_array, val_methyl_array=None, outcome_cols=None):
        """Fit data to model.

        Parameters
        ----------
        train_methyl_array
            Training MethylationArray.
        val_methyl_array
            Validation MethylationArray. Can be set to None.
        outcome_cols
            Set to None if not needed; otherwise the phenotype column(s) to train on.
        """
        if outcome_cols is not None:
            if self.encoder is not None:
                self.encoder.fit(train_methyl_array.pheno[outcome_cols])
            if self.param_grid_exists:
                self.model.fit(
                    train_methyl_array.beta,
                    self.encoder.transform(
                        train_methyl_array.pheno[outcome_cols])
                    if self.encoder is not None
                    else train_methyl_array.pheno[outcome_cols],
                    val_methyl_array.beta,
                    self.encoder.transform(
                        val_methyl_array.pheno[outcome_cols])
                    if self.encoder is not None
                    else val_methyl_array.pheno[outcome_cols],
                    scoring='accuracy' if self.encoder is not None else 'r2')
            else:
                self.model.fit(
                    train_methyl_array.beta,
                    self.encoder.transform(
                        train_methyl_array.pheno[outcome_cols])
                    if self.encoder is not None
                    else train_methyl_array.pheno[outcome_cols])
        else:
            self.model.fit(train_methyl_array.beta)
        return self.model

    def transform(self, test_methyl_array):
        """Transform test methylation array.

        Parameters
        ----------
        test_methyl_array
            Testing MethylationArray.
        """
        self.results = self.model.transform(test_methyl_array.beta)
        return self.results

    def fit_transform(self, train_methyl_array, outcome_cols=None):
        """Fit and transform to training data.

        Parameters
        ----------
        train_methyl_array
            Training MethylationArray.
        outcome_cols
            Set to None if not needed; otherwise the phenotype column(s) to train on.
        """
        self.results = self.fit(
            train_methyl_array,
            outcome_cols=outcome_cols).transform(train_methyl_array)
        return self.results

    def predict(self, test_methyl_array):
        """Make new predictions on test methylation array.

        Parameters
        ----------
        test_methyl_array
            Testing MethylationArray.
        """
        self.results = self.model.predict(test_methyl_array.beta)
        if self.encoder is not None:
            self.results = self.encoder.inverse_transform(self.results)
        return self.results

    def fit_predict(self, train_methyl_array, outcome_cols=None):
        """Fit and predict training data.

        Parameters
        ----------
        train_methyl_array
            Training MethylationArray.
        outcome_cols
            Set to None if not needed; otherwise the phenotype column(s) to train on.
        """
        self.results = self.fit(train_methyl_array,
                                outcome_cols).predict(train_methyl_array)
        return self.results

    def store_results(self, output_pkl, results_dict={}):
        """Store results in pickle file.

        Parameters
        ----------
        output_pkl
            Output pickle to dump results to.
        results_dict
            Supply own results dict to be dumped.
""" if not results_dict: results_dict = dict(results=self.results) pickle.dump(results_dict, open(results_dict, 'wb')) def assign_results_to_pheno_col(self, methyl_array, new_col, output_pkl): """Assign results to new phenotype column. Parameters ---------- methyl_array MethylationArray. new_col New column name. output_pkl Output pickle to dump MethylationArray to. """ methyl_array.pheno[new_col] = self.results methyl_array.write_pickle(output_pkl) def transform_results_to_beta(self, methyl_array, output_pkl): """Transform beta matrix into reduced beta matrix and store. Parameters ---------- methyl_array MethylationArray. output_pkl Output pickle to dump MethylationArray to. """ methyl_array.beta = pd.DataFrame(self.results, index=self.beta.index) methyl_array.write_pickle(output_pkl) def return_outcome_metric(self, methyl_array, outcome_cols, metric, run_bootstrap=False): """Supply metric to evaluate results. Parameters ---------- methyl_array MethylationArray to evaluate. outcome_cols Outcome phenotype columns. metric Sklearn evaluation metric. run_bootstrap Make 95% CI from 1k bootstraps. """ y_true = methyl_array.pheno[outcome_cols] y_pred = self.results if not bootstrap: return metric(y_true, y_pred) else: from sklearn.utils import resample boot_results = np.array([ metric(*resample(y_true, y_pred, random_state=123)) for i in range(n_bootstrap) ]) original = metric(y_true, y_pred) std_err = np.std(boot_results) boot_results = np.sort(boot_results) ci = 0.95 bound = (1 - ci) / 2. # BORROWED FROM MLXTEND def quantile(x, q): rank = round(q * x.shape[0]) - 1 if rank >= x.shape[0]: rank = x.shape[0] elif rank <= 0: rank = 0 rank = int(round(rank)) return x[rank] high_ci = quantile(boot_results, q=(ci + bound)) low_ci = quantile(boot_results, q=bound) return original, std_err, (low_ci, high_ci)
def fit(self, X, Y, X_val, Y_val):
    """Fit the model to data matrix X and targets Y.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The input data.
    Y : array-like, shape (n_samples, n_classes)
        The target values.

    Returns
    -------
    self : object
        Returns self.
    """
    param_grid = []
    val_mode = False
    if X_val is not None and Y_val is not None:
        val_mode = True
        if isinstance(self.base_estimator, sklearn.svm.SVC):
            self.base_estimator.set_params(probability=True)
            param_grid = [
                {
                    'C': [1, 10, 100],
                    'kernel': ['linear']
                },
                {
                    'C': [1, 10, 100],
                    'gamma': [0.001, 0.0001],
                    'kernel': ['rbf']
                },
            ]
        elif isinstance(self.base_estimator,
                        sklearn.linear_model.LogisticRegression):
            param_grid = [{
                'solver': ['liblinear'],
                'penalty': ['l1', 'l2'],
                'C': [0.0001, 0.001, 0.01, 1, 100]
            }]
    else:
        print('not in validation mode')
        if isinstance(self.base_estimator, sklearn.svm.SVC):
            self.base_estimator.set_params(gamma='auto')
            self.base_estimator.set_params(kernel='linear')
        elif isinstance(self.base_estimator,
                        sklearn.linear_model.LogisticRegression):
            self.base_estimator.set_params(solver='liblinear')

    X, Y = check_X_y(X, Y, multi_output=True, accept_sparse=True)
    random_state = check_random_state(self.random_state)
    check_array(X, accept_sparse=True)
    self.order_ = self.order
    if self.order_ is None:
        self.order_ = np.array(range(Y.shape[1]))
    elif isinstance(self.order_, str):
        if self.order_ == 'random':
            self.order_ = random_state.permutation(Y.shape[1])
    elif sorted(self.order_) != list(range(Y.shape[1])):
        raise ValueError("invalid order")

    self.estimators_ = [
        clone(self.base_estimator) for _ in range(Y.shape[1])
    ]
    self.classes_ = []

    if self.cv is None:
        Y_pred_chain = sp.lil_matrix((X.shape[0], Y.shape[1]))
        X_aug = sp.hstack((X, Y_pred_chain), format='lil')
        if val_mode:
            Y_pred_chain_val = sp.lil_matrix((X_val.shape[0], Y.shape[1]))
            X_val_aug = sp.hstack((X_val, Y_pred_chain_val.copy()),
                                  format='lil')
    elif sp.issparse(X):
        Y_pred_chain = sp.lil_matrix((X.shape[0], Y.shape[1]))
        X_aug = sp.hstack((X, Y_pred_chain), format='lil')
        if val_mode:
            Y_pred_chain_val = sp.lil_matrix((X_val.shape[0], Y.shape[1]))
            X_val_aug = sp.hstack((X_val, Y_pred_chain_val.copy()),
                                  format='lil')
    else:
        Y_pred_chain = np.zeros((X.shape[0], Y.shape[1]))
        X_aug = np.hstack((X, Y_pred_chain))
        if val_mode:
            Y_pred_chain_val = np.zeros((X_val.shape[0], Y.shape[1]))
            # keep the validation features dense to match X_aug above
            X_val_aug = np.hstack((X_val, Y_pred_chain_val.copy()))

    del Y_pred_chain
    if val_mode:
        del Y_pred_chain_val

    class_1 = 1
    class_2 = 0
    if -1 in Y:
        class_2 = -1

    for chain_idx, estimator in enumerate(self.estimators_):
        y = Y[:, self.order_[chain_idx]]
        # class_1_counter = np.count_nonzero(y[:, 0] == class_1)
        # class_2_counter = np.count_nonzero(y[:, 0] == class_2)
        class_1_counter = y.flatten().tolist().count(class_1)
        class_2_counter = y.flatten().tolist().count(class_2)
        if class_1_counter <= class_2_counter:
            minority_index = 1
            minority_counter = class_1_counter
            majority_index = 0
        else:
            minority_index = 0
            minority_counter = class_2_counter
            majority_index = 1

        # get all the minority samples
        sampled_index = [
            index for index, label in enumerate(y)
            if label == minority_index
        ]
        sampled_y = [minority_index] * minority_counter
        # print('m' + str(len(sampled_y)) + ' ' + str(minority_index) + 's')
        sampled_index.extend(sampled_index)
        sampled_y.extend(sampled_y)

        # sample the majority samples
        temp_sampled_index = [
            index for index, label in enumerate(y)
            if label == majority_index
        ]
        # print(str(len(temp_sampled_index)) + ' ' + str(majority_index) + 's')
        sampled_index.extend(
            random.sample(temp_sampled_index, minority_counter))
        sampled_y.extend([majority_index] * minority_counter)

        print(
            str(chain_idx) + '___' + str(self.order_[chain_idx]) +
            ') training label:' + str(chain_idx) + ' with ' +
            str(len(sampled_y)) + ' instances ')
        # print('X_aug[np.array(sampled_index), :(X.shape[1] + chain_idx)]: ' + str(X_aug[np.array(sampled_index), :(X.shape[1] + chain_idx)].shape))
        # print('np.array(sampled_y): ' + str(np.array(sampled_y).shape))

        if val_mode:
            # for the grid search version
            gs = GridSearch(model=estimator, param_grid=param_grid)
            temp_estimator = gs.fit(
                X_aug[np.array(sampled_index), :(X.shape[1] + chain_idx)],
                np.array(sampled_y),
                X_val_aug[:, :(X.shape[1] + chain_idx)],
                Y_val[:, self.order_[chain_idx]],
                scoring='roc_auc')
        else:
            estimator.fit(
                X_aug[np.array(sampled_index), :(X.shape[1] + chain_idx)],
                np.array(sampled_y))

        if chain_idx < len(self.estimators_) - 1:
            # produce the predictions and add them as features for the next
            # classifier in the chain
            col_idx = X.shape[1] + chain_idx
            # use all the available features (from X_train and all the
            # predictions thus far)
            if val_mode:
                previous_predictions_val = temp_estimator.predict(
                    X_val_aug[:, :col_idx])
                previous_predictions = temp_estimator.predict(
                    X_aug[:, :col_idx])
            else:
                previous_predictions = estimator.predict(X_aug[:, :col_idx])
            # insert the predictions as features to be used for the next
            # classifier in the chain
            if sp.issparse(X_aug):
                X_aug[:, col_idx] = np.expand_dims(previous_predictions, 1)
                if val_mode:
                    X_val_aug[:, col_idx] = np.expand_dims(
                        previous_predictions_val, 1)
            else:
                X_aug[:, col_idx] = previous_predictions
                if val_mode:
                    X_val_aug[:, col_idx] = previous_predictions_val

        # ---------------------------------------------------------------
        if self.cv is not None and chain_idx < len(self.estimators_) - 1:
            col_idx = X.shape[1] + chain_idx
            cv_result = cross_val_predict(self.base_estimator,
                                          X_aug[:, :col_idx],
                                          y=y,
                                          cv=self.cv)
            if sp.issparse(X_aug):
                X_aug[:, col_idx] = np.expand_dims(cv_result, 1)
            else:
                X_aug[:, col_idx] = cv_result

        if val_mode:
            self.classes_.append(temp_estimator.classes_)
            self.estimators_[chain_idx] = temp_estimator
        else:
            self.classes_.append(estimator.classes_)

    return self
def test_regression(
    model=SVR(),
    return_model=False,
    param_grid=None,
    gs_score=.4532,
    assertions=True,
    scoring=None,
    verbose=False,
):
    from sklearn.datasets import load_boston
    data = load_boston()

    # Create test and train sets from one dataset
    X_train, X_test, y_train, y_test = train_test_split(
        data["data"],
        data["target"],
        test_size=0.1,
        random_state=0,
    )

    # Create a validation set.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train,
        y_train,
        test_size=0.1,
        random_state=0,
    )

    # List the parameters to search across
    if param_grid is None:
        param_grid = {
            'C': [1, 10, 100, 120, 150],
            'gamma': [0.001, 0.0001],
            'kernel': ['rbf'],
        }

    # Grid-search all parameter combinations using a validation set.
    gs = GridSearch(
        model=model,
        param_grid=param_grid,
    )
    gs.fit(
        X_train,
        y_train,
        X_val,
        y_val,
        scoring=scoring,
        verbose=True,
    )

    # Compare with default model without hyperopt
    default = model
    default.fit(X_train, y_train)
    default_score = round(default.score(X_test, y_test), 4)
    gridsearch_score = round(gs.score(X_test, y_test), 4)
    if verbose:
        print('Default score:', default_score,
              '| GridSearch Score:', gridsearch_score)
    if assertions:
        assert (default_score == .0175)
        assert (gridsearch_score is not None)
    if return_model:
        return gs
    'verbose': [True],
    'silent': [False],
    'min_child_weight': list(np.arange(1, X_test.shape[1], 5)),
    'n_estimators': [10, 100, 200, 300, 1000]
}]

xg_reg = xgb.XGBRegressor()

# Applying Grid Search to find the best model and the best parameters
from hypopt import GridSearch
from sklearn.model_selection import GridSearchCV

grid_search = GridSearch(model=xg_reg, param_grid=parameters)
grid_search = grid_search.fit(X_train, y_train,
                              X_val=X_test, y_val=y_test,
                              scoring='neg_mean_squared_error')
best_parameters = grid_search.get_params()
# best_mse = (-grid_search.best_score_)**(1/2)
# best_parameters = grid_search.best_params_
'''best_parameters['n_estimators'] = 10000
best_parameters['gamma'] = 0
best_parameters['min_child_weight'] = 2'''

xg_reg = xgb.XGBRegressor(**best_parameters)
xg_reg.fit(X_train, y_train)
preds = xg_reg.predict(X_test)
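# A small hedged follow-up: report test RMSE for the refit model. The
# mean_squared_error import is an assumption and is not part of the snippet above.
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(y_test, preds) ** 0.5
print('Test RMSE:', round(rmse, 4))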
def classifier(classifier, train, truth, validate, validate_truth, test,
               test_truth, datatype):
    np.random.seed(0)
    rng = np.random.permutation(1)[0]
    train = pd.DataFrame(train)
    validate = pd.DataFrame(validate)
    test = pd.DataFrame(test)

    logger = logging.getLogger('myapp')
    hdlr = logging.FileHandler('classifiers.log')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.setLevel(logging.WARN)

    if classifier.lower() == 'svm':  # best: C = 50, gamma = 0.0001, kernel = rbf
        model = svm.SVC(random_state=rng)
        hyperparameter = {
            'kernel': ('linear', 'rbf'),
            'C': [1, 1.5, 10, 50, 100, 200],
            'gamma': [1e-7, 1e-4]
        }
    elif classifier.lower() == 'randomforest':  # 120
        model = RandomForestClassifier(random_state=rng)
        hyperparameter = {'n_estimators': np.arange(10, 300, 10)}
    elif classifier.lower() == 'adaboost':
        model = AdaBoostClassifier(random_state=rng)
        hyperparameter = {
            'n_estimators': np.arange(10, 300, 10),
            'algorithm': ('SAMME', 'SAMME.R')
        }
    elif classifier.lower() == 'knn':  # 120
        model = KNeighborsClassifier()
        hyperparameter = dict(n_neighbors=list(range(1, 100)))
    else:
        # assume it's asking for a neural network (multi-layer perceptron)
        # best: activation=tanh, hidden_layer_sizes=(20, 20),
        # learning_rate=adaptive, solver=lbfgs
        model = MLPClassifier(max_iter=100)
        hyperparameter = {
            'hidden_layer_sizes': [(20, 20), (80, 20), (80, 20, 20),
                                   (80, 40, 40, 20),
                                   (40, 40, 20, 20, 20, 10)],
            'learning_rate': ['adaptive'],
            'activation': ['tanh', 'relu', 'logistic'],
            'solver': ['lbfgs', 'sgd', 'adam']
        }

    tuned_model = GridSearch(model=model, param_grid=hyperparameter)
    tuned_model.fit(train, truth)
    prediction = tuned_model.score(test, test_truth)
    logger.warn(classifier + ' ' + datatype + ' validate ' + str(prediction))

    tuned_model.fit(train, truth, validate, validate_truth)
    prediction = tuned_model.score(test, test_truth)
    target_names = [
        'c-CS-s', 'c-CS-m', 'c-SC-s', 'c-SC-m', 't-CS-s', 't-CS-m', 't-SC-s',
        't-SC-m'
    ]
    prediction = tuned_model.predict(test)
    print(
        classification_report(test_truth, prediction,
                              target_names=target_names))
    logger.warn(classifier + ' ' + datatype + ' ' + str(prediction))
    return
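# Hedged example call (the arrays stand in for whatever train/validate/test
# splits the surrounding pipeline produces; 'protein' is an illustrative tag):
# classifier('svm', X_train, y_train, X_val, y_val, X_test, y_test, 'protein')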
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--vectors_file', required=True, type=str)
    parser.add_argument('--data_path', default='stanfordSentimentTreebank', type=str)
    parser.add_argument('--output_file', required=True, type=str)
    args = parser.parse_args()

    try:
        vectors
    except NameError:
        print('Reading vectors file ... ', end='')
        t = time.time()
        with codecs.open(args.vectors_file, 'r', "UTF-8") as f:
            vocab_size = sum(1 for line in f)
        with codecs.open(args.vectors_file, 'r', "UTF-8") as f:
            line = f.readline()
            val = line.rstrip().split(' ')
            check = False
            # Check if the vectors file has vocab size and dimensionality in
            # the first line
            if len(val) == 2:
                val = f.readline().rstrip().split(' ')
                vocab_size -= 1
                check = True
            vector_dim = len(list(map(float, val[1:])))
        vectors = np.zeros((vocab_size, vector_dim))
        words = [""] * vocab_size
        vocab_dict = dict()
        with codecs.open(args.vectors_file, 'r', "UTF-8") as f:
            if check:
                next(f)
            for idx, line in enumerate(f):
                vals = line.rstrip().split(' ')
                words[idx] = vals[0]
                vocab_dict[vals[0]] = idx  # indices start from 0
                vec = list(map(float, vals[1:]))
                try:
                    vectors[idx, :] = vec
                except IndexError:
                    if vals[0] == '<unk>':  # ignore the <unk> vector
                        pass
                    else:
                        raise Exception('IncompatibleInputs')
        print("done in " + str(int(time.time() - t)) + " seconds")

    print('Reading train and test data ... ', end='')
    t = time.time()
    dictionary = dict()
    with codecs.open(args.data_path + "/dictionary.txt", 'r', "UTF-8") as f:
        for line in f.read().splitlines():
            tmp = line.split("|")
            dictionary[tmp[0]] = int(tmp[1])
    with codecs.open(args.data_path + "/datasetSentences.txt", "r", "UTF-8") as f:
        sentences = []
        for sentence in f.read().splitlines()[1:]:
            sentences.append(sentence.split("\t")[1])
    all_labels = []
    with open(args.data_path + "/sentiment_labels.txt") as f:
        for label in f.read().splitlines()[1:]:
            all_labels.append(float(label.split("|")[1]))
    split_classes = []
    with open(args.data_path + "/datasetSplit.txt") as f:
        for line in f.read().splitlines()[1:]:
            split_classes.append(int(line.split(",")[1]))
    print("done in " + str(int(time.time() - t)) + " seconds")

    print('Generating train and test samples from the data for selected classes ... ',
          end='')
    t = time.time()
    train_size = sum([1 for label in split_classes if label == 1])
    val_size = sum([1 for label in split_classes if label == 3])
    test_size = sum([1 for label in split_classes if label == 2])
    train_samples = np.zeros([train_size, vector_dim])
    train_labels = []
    val_samples = np.zeros([val_size, vector_dim])
    val_labels = []
    test_samples = np.zeros([test_size, vector_dim])
    test_labels = []
    train_no = 0
    val_no = 0
    test_no = 0
    not_in_dict_count = 0
    for sample_no, sentence in enumerate(sentences):
        try:
            score = all_labels[dictionary[sentence]]
        except:
            not_in_dict_count += 1
            continue
        if score <= 0.4 or score > 0.6:  # Eliminate neutral sentences
            inds = process_sentence(sentence, vocab_dict)
            if len(inds) > 0:
                if split_classes[sample_no] == 1:
                    for ind in inds:
                        train_samples[train_no, :] += vectors[ind, :]
                    train_samples[train_no, :] = train_samples[train_no, :] / len(inds)
                    if score <= 0.4:
                        train_labels.append(0)
                    elif score > 0.6:
                        train_labels.append(1)
                    train_no += 1
                elif split_classes[sample_no] == 3:
                    for ind in inds:
                        val_samples[val_no, :] += vectors[ind, :]
                    val_samples[val_no, :] = val_samples[val_no, :] / len(inds)
                    if score <= 0.4:
                        val_labels.append(0)
                    elif score > 0.6:
                        val_labels.append(1)
                    val_no += 1
                elif split_classes[sample_no] == 2:
                    for ind in inds:
                        test_samples[test_no, :] += vectors[ind, :]
                    test_samples[test_no, :] = test_samples[test_no, :] / len(inds)
                    if score <= 0.4:
                        test_labels.append(0)
                    elif score > 0.6:
                        test_labels.append(1)
                    test_no += 1
    train_samples = train_samples[:train_no, :]
    val_samples = val_samples[:val_no, :]
    test_samples = test_samples[:test_no, :]
    print("done in " + str(int(time.time() - t)) + " seconds")

    print('Training linear SVM for parameter optimization ... ', end='')
    tuned_parameters = [{
        'kernel': ['linear'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    }]
    clf = GridSearch(model=SVC(), param_grid=tuned_parameters)
    clf.fit(train_samples, train_labels, val_samples, val_labels)
    print("done in " + str(int(time.time() - t)) + " seconds")

    predicted_labels = clf.predict(test_samples)
    accuracy = sum([
        true == predicted
        for true, predicted in zip(test_labels, predicted_labels)
    ]) / len(test_samples) * 100
    print("Accuracy for sentiment classification of sentences is: " +
          str(round(accuracy, 2)) + "% (" +
          str(int(accuracy / 100 * len(predicted_labels))) + "/" +
          str(len(predicted_labels)) + ")")
    f_out = open(args.output_file, "w")
    f_out.write("Accuracy for sentiment classification is: " +
                str(round(accuracy, 2)) + "% (" +
                str(int(accuracy / 100 * len(predicted_labels))) + "/" +
                str(len(predicted_labels)) + ")\n")
    f_out.close()