def test_prob_methods():
    """Smoke-test that a grid-searched LogisticRegression exposes working
    predict and predict_proba methods on held-out data."""
    dataset = load_breast_cancer()
    features, labels = dataset["data"], dataset["target"]
    # Stratified 70/30 split so both classes appear in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.3,
        random_state=0,
        stratify=labels,
    )
    # Candidate regularization strengths to search over.
    search_space = {'C': [1, 10, 100, 120, 150]}
    # Grid-search all parameter combinations using a validation set.
    searcher = GridSearch(
        model=LogisticRegression(),
        param_grid=search_space,
    )
    searcher.fit(X_train, y_train, verbose=False)
    assert searcher.predict(X_test) is not None
    assert searcher.predict_proba(X_test) is not None
class Prediction:
    """Train, tune and persist a regression model for the 'precio' target.

    Parameters
    ----------
    data : tuple
        (train_df, test_df) pandas DataFrames; train_df must contain a
        'precio' column used as the regression target.
    model : estimator
        Scikit-learn-like regressor.
    prefix : str
        Prefix for output files; a timestamp is appended.
    param_grid : list of dict, optional
        Hyperparameter grid for the grid-search helpers.
    """

    def __init__(self, data, model, prefix, param_grid=None):
        self.train_df, self.test_df = data
        self.model = model
        # BUG FIX: default was a shared mutable list ([]).
        self.param_grid = [] if param_grid is None else param_grid
        self.prefix = prefix + datetime.now().strftime('%m-%d-%H:%M')
        self.X = self.train_df.loc[:, self.train_df.columns != 'precio']
        self.y = self.train_df['precio'].values
        # Hold out 10% of the training data as a fixed validation split.
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.X, self.y, test_size=0.1, random_state=1)

    def manualGridSearch(self):
        """Try every grid entry and remember the lowest-MAE configuration."""
        best_score = math.inf
        for g in self.param_grid:
            print(g)
            self.model.set_params(**g)
            self.model.fit(self.X_train, self.y_train)
            score = mean_absolute_error(self.model.predict(self.X_val),
                                        self.y_val)
            print(score)
            if score < best_score:
                # BUG FIX: the running best was never updated, so every grid
                # entry compared against math.inf and overwrote best_grid.
                best_score = score
                self.best_score = score
                self.best_grid = g

    def gridSearchTrain(self):
        """Tune with sklearn's GridSearchCV and predict on the test frame."""
        print('Training...')
        self.gscv = GridSearchCV(self.model, self.param_grid,
                                 scoring='neg_mean_absolute_error',
                                 verbose=10)
        self.gscv.fit(self.X_train, self.y_train)
        self.best_params = self.gscv.best_params_
        self.score = self.gscv.best_score_
        self.predicted = self.gscv.predict(self.test_df)
        print(self.best_params)
        print(self.score)

    def HypOptTrain(self):
        """Tune with hypopt's GridSearch using the held-out validation split."""
        print('Training...')
        self.opt = GridSearch(model=self.model, param_grid=self.param_grid)
        self.opt.fit(self.X_train, self.y_train, self.X_val, self.y_val,
                     scoring='neg_mean_squared_error')
        self.best_params = self.opt.best_params_
        # BUG FIX: X_val/y_val were referenced without `self.` (NameError).
        self.score = self.opt.score(self.X_val, self.y_val)
        self.predicted = self.opt.predict(self.test_df)
        print(self.best_params)
        print(self.score)

    def train(self):
        """Fit the model as-is and report validation MAE."""
        print('Training...')
        self.model.fit(self.X_train, self.y_train)
        self.score = mean_absolute_error(self.model.predict(self.X_val),
                                         self.y_val)
        print(self.score)
        self.predicted = self.model.predict(self.test_df)

    def crossValidation(self, cv=5):
        """Store and print the mean cross-validated negative MAE."""
        cv_scores = cross_val_score(
            self.model, self.X, self.y, cv=cv,
            scoring='neg_mean_absolute_error')
        # Average the per-fold scores into a single number.
        self.score = np.mean(cv_scores)
        print(self.score)

    def save(self):
        """Pickle the plain model, or the fitted grid-search if one was used."""
        # An empty grid means the plain model was trained directly.
        if not self.param_grid:
            with open('{}.model'.format(self.prefix), 'wb') as f:
                pickle.dump(self.model, f)
        else:
            with open('{}.model'.format(self.prefix), 'wb') as f:
                pickle.dump(self.gscv, f)

    def submit(self):
        """Write an (id, target) submission CSV named with prefix and score."""
        self.test_ids = pd.read_csv('data/test.csv')['id']
        answer = pd.DataFrame(list(zip(self.test_ids, self.predicted)),
                              columns=['id', 'target'])
        answer.to_csv('{}-{}.csv'.format(self.prefix, int(round(self.score))),
                      sep=',', index=False)
class MachineLearning:
    """Machine learning class to run sklearn-like pipeline on MethylationArray data.

    Initialize object with scikit-learn model, and optionally supply a
    hyperparameter search grid.

    model
        Scikit-learn-like model, classification, regression, dimensionality
        reduction, clustering etc.
    options
        Options to supply model in form of dictionary.
    grid
        Alternatively, supply search grid to search for best hyperparameters.
    labelencode
        T/F encode string labels.
    n_eval
        Number of evaluations for randomized grid search, if set to 0,
        perform exhaustive grid search.
    """

    def __init__(self, model, options, grid=None, labelencode=False, n_eval=0):
        # BUG FIX: default for `grid` was a shared mutable dict ({}).
        if grid:
            self.model = GridSearch(
                model=model(),
                param_grid=grid,
                num_random_search=None if not n_eval else n_eval)
            self.param_grid_exists = True
            self.grid = grid
        else:
            self.model = model(**options)
            self.param_grid_exists = False
        self.encoder = LabelEncoder() if labelencode else None

    def fit(self, train_methyl_array, val_methyl_array=None, outcome_cols=None):
        """Fit data to model.

        Parameters
        ----------
        train_methyl_array
            Training MethylationArray.
        val_methyl_array
            Validation MethylationArray. Can set to None.
        outcome_cols
            Set to none if not needed, but phenotype column to train on,
            can be multiple.
        """
        if outcome_cols is not None:
            if self.encoder is not None:
                self.encoder.fit(train_methyl_array.pheno[outcome_cols])

            def _targets(arr):
                # Label-encode phenotype targets when an encoder is set.
                y = arr.pheno[outcome_cols]
                return self.encoder.transform(y) if self.encoder is not None else y

            if self.param_grid_exists:
                self.model.fit(
                    train_methyl_array.beta,
                    _targets(train_methyl_array),
                    val_methyl_array.beta,
                    _targets(val_methyl_array),
                    scoring='accuracy' if self.encoder is not None else 'r2')
            else:
                self.model.fit(train_methyl_array.beta,
                               _targets(train_methyl_array))
        else:
            # Unsupervised fit (clustering / decomposition).
            self.model.fit(train_methyl_array.beta)
        return self.model

    def transform(self, test_methyl_array):
        """Transform test methylation array.

        Parameters
        ----------
        test_methyl_array
            Testing MethylationArray.
        """
        self.results = self.model.transform(test_methyl_array.beta)
        return self.results

    def fit_transform(self, train_methyl_array, outcome_cols=None):
        """Fit and transform to training data.

        Parameters
        ----------
        train_methyl_array
            Training MethylationArray.
        outcome_cols
            Set to none if not needed, but phenotype column to train on,
            can be multiple.
        """
        # BUG FIX: outcome_cols was hard-coded to None, and transform was
        # chained on the raw model (which expects a beta matrix, not a
        # MethylationArray). Fit first, then use this class's transform.
        self.fit(train_methyl_array, outcome_cols=outcome_cols)
        self.results = self.transform(train_methyl_array)
        return self.results

    def predict(self, test_methyl_array):
        """Make new predictions on test methylation array.

        Parameters
        ----------
        test_methyl_array
            Testing MethylationArray.
        """
        self.results = self.model.predict(test_methyl_array.beta)
        if self.encoder is not None:
            # Map encoded integer labels back to the original label strings.
            self.results = self.encoder.inverse_transform(self.results)
        return self.results

    def fit_predict(self, train_methyl_array, outcome_cols=None):
        """Fit and predict training data.

        Parameters
        ----------
        train_methyl_array
            Training MethylationArray.
        outcome_cols
            Set to none if not needed, but phenotype column to train on,
            can be multiple.
        """
        # BUG FIX: same chaining bug as fit_transform — predict via this
        # class so the beta matrix (not the MethylationArray) is passed.
        self.fit(train_methyl_array, outcome_cols=outcome_cols)
        self.results = self.predict(train_methyl_array)
        return self.results

    def store_results(self, output_pkl, results_dict=None):
        """Store results in pickle file.

        Parameters
        ----------
        output_pkl
            Output pickle to dump results to.
        results_dict
            Supply own results dict to be dumped.
        """
        if not results_dict:
            results_dict = dict(results=self.results)
        # BUG FIX: previously dumped to open(results_dict, 'wb') — the dict
        # itself was used as the file name. Write to output_pkl and close
        # the handle deterministically.
        with open(output_pkl, 'wb') as f:
            pickle.dump(results_dict, f)

    def assign_results_to_pheno_col(self, methyl_array, new_col, output_pkl):
        """Assign results to new phenotype column.

        Parameters
        ----------
        methyl_array
            MethylationArray.
        new_col
            New column name.
        output_pkl
            Output pickle to dump MethylationArray to.
        """
        methyl_array.pheno[new_col] = self.results
        methyl_array.write_pickle(output_pkl)

    def transform_results_to_beta(self, methyl_array, output_pkl):
        """Transform beta matrix into reduced beta matrix and store.

        Parameters
        ----------
        methyl_array
            MethylationArray.
        output_pkl
            Output pickle to dump MethylationArray to.
        """
        # BUG FIX: indexed self.beta (nonexistent attribute) instead of the
        # passed array's beta.
        methyl_array.beta = pd.DataFrame(self.results,
                                         index=methyl_array.beta.index)
        methyl_array.write_pickle(output_pkl)

    def return_outcome_metric(self, methyl_array, outcome_cols, metric,
                              run_bootstrap=False):
        """Supply metric to evaluate results.

        Parameters
        ----------
        methyl_array
            MethylationArray to evaluate.
        outcome_cols
            Outcome phenotype columns.
        metric
            Sklearn evaluation metric.
        run_bootstrap
            Make 95% CI from 1k bootstraps.
        """
        y_true = methyl_array.pheno[outcome_cols]
        y_pred = self.results
        # BUG FIX: tested an undefined name `bootstrap` instead of the
        # run_bootstrap parameter.
        if not run_bootstrap:
            return metric(y_true, y_pred)
        from sklearn.utils import resample
        n_bootstrap = 1000  # 1k resamples, per the docstring
        # BUG FIX: a constant random_state=123 made every bootstrap draw
        # identical; vary the seed per replicate (still deterministic).
        boot_results = np.array([
            metric(*resample(y_true, y_pred, random_state=i))
            for i in range(n_bootstrap)
        ])
        original = metric(y_true, y_pred)
        std_err = np.std(boot_results)
        boot_results = np.sort(boot_results)
        ci = 0.95
        bound = (1 - ci) / 2.

        # BORROWED FROM MLXTEND
        def quantile(x, q):
            rank = round(q * x.shape[0]) - 1
            if rank >= x.shape[0]:
                # BUG FIX: clamp to the last valid index, not len(x).
                rank = x.shape[0] - 1
            elif rank <= 0:
                rank = 0
            rank = int(round(rank))
            return x[rank]

        high_ci = quantile(boot_results, q=(ci + bound))
        low_ci = quantile(boot_results, q=bound)
        return original, std_err, (low_ci, high_ci)
def classifier(classifier, train, truth, validate, validate_truth, test,
               test_truth, datatype):
    """Grid-search-tune the named classifier and log its test-set results.

    Parameters
    ----------
    classifier : str
        One of 'svm', 'randomforest', 'adaboost', 'knn'; anything else
        selects a multi-layer perceptron.
    train, truth / validate, validate_truth / test, test_truth
        Feature matrices and label vectors for each split.
    datatype : str
        Label used only in the log messages.
    """
    np.random.seed(0)
    rng = np.random.permutation(1)[0]  # NOTE: permutation(1) is always [0]
    train = pd.DataFrame(train)
    validate = pd.DataFrame(validate)
    test = pd.DataFrame(test)

    logger = logging.getLogger('myapp')
    # BUG FIX: attach the file handler only once; previously every call
    # stacked a new handler and duplicated each subsequent log line.
    if not logger.handlers:
        hdlr = logging.FileHandler('classifiers.log')
        hdlr.setFormatter(
            logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
        logger.addHandler(hdlr)
    logger.setLevel(logging.WARN)

    name = classifier.lower()
    if name == 'svm':  # best: C = 50, gamma = 0.0001, kernel = rbf
        model = svm.SVC(random_state=rng)
        hyperparameter = {
            'kernel': ('linear', 'rbf'),
            'C': [1, 1.5, 10, 50, 100, 200],
            'gamma': [1e-7, 1e-4]
        }
    elif name == 'randomforest':  # 120
        model = RandomForestClassifier(random_state=rng)
        hyperparameter = {'n_estimators': np.arange(10, 300, 10)}
    elif name == 'adaboost':
        model = AdaBoostClassifier(random_state=rng)
        hyperparameter = {
            'n_estimators': np.arange(10, 300, 10),
            'algorithm': ('SAMME', 'SAMME.R')
        }
    elif name == 'knn':  # 120
        model = KNeighborsClassifier()
        hyperparameter = dict(n_neighbors=list(range(1, 100)))
    else:
        # Assume multi-layer perceptron.
        # best: activation=tanh, hidden_layer_sizes=(20, 20),
        # learning_rate=adaptive, solver=lbfgs
        model = MLPClassifier(max_iter=100)
        hyperparameter = {
            'hidden_layer_sizes': [(20, 20), (80, 20), (80, 20, 20),
                                   (80, 40, 40, 20),
                                   (40, 40, 20, 20, 20, 10)],
            'learning_rate': ['adaptive'],
            'activation': ['tanh', 'relu', 'logistic'],
            'solver': ['lbfgs', 'sgd', 'adam']
        }

    tuned_model = GridSearch(model=model, param_grid=hyperparameter)

    # First tune on the training data alone and log the test score.
    tuned_model.fit(train, truth)
    prediction = tuned_model.score(test, test_truth)
    # BUG FIX: logger.warn is a deprecated alias of logger.warning.
    logger.warning(classifier + ' ' + datatype + ' validate ' +
                   str(prediction))

    # Re-tune using the explicit validation split, then report per-class
    # metrics on the test set.
    tuned_model.fit(train, truth, validate, validate_truth)
    prediction = tuned_model.score(test, test_truth)
    target_names = [
        'c-CS-s', 'c-CS-m', 'c-SC-s', 'c-SC-m', 't-CS-s', 't-CS-m',
        't-SC-s', 't-SC-m'
    ]
    prediction = tuned_model.predict(test)
    print(classification_report(test_truth, prediction,
                                target_names=target_names))
    logger.warning(classifier + ' ' + datatype + ' ' + str(prediction))
    return
def main():
    """Train a linear-SVM sentiment classifier on averaged word vectors.

    Reads a word-vector file and the Stanford Sentiment Treebank splits,
    builds mean-of-word-vector sentence features, tunes C on the validation
    split and writes the test accuracy to the output file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--vectors_file', required=True, type=str)
    parser.add_argument('--data_path', default='stanfordSentimentTreebank',
                        type=str)
    parser.add_argument('--output_file', required=True, type=str)
    args = parser.parse_args()

    # BUG FIX: removed the dead `try: vectors / except NameError` guard —
    # `vectors` is a local name and can never be bound before this point.
    print('Reading vectors file ... ', end='')
    t = time.time()
    with codecs.open(args.vectors_file, 'r', "UTF-8") as f:
        vocab_size = sum(1 for line in f)
    with codecs.open(args.vectors_file, 'r', "UTF-8") as f:
        val = f.readline().rstrip().split(' ')
        check = False
        # Check if the vectors file has vocab size and dimensionality in
        # the first line (word2vec-style header).
        if len(val) == 2:
            val = f.readline().rstrip().split(' ')
            vocab_size -= 1
            check = True
        vector_dim = len(list(map(float, val[1:])))
    vectors = np.zeros((vocab_size, vector_dim))
    words = [""] * vocab_size
    vocab_dict = dict()
    with codecs.open(args.vectors_file, 'r', "UTF-8") as f:
        if check:
            next(f)  # skip the header line
        for idx, line in enumerate(f):
            vals = line.rstrip().split(' ')
            words[idx] = vals[0]
            vocab_dict[vals[0]] = idx  # indices start from 0
            vec = list(map(float, vals[1:]))
            try:
                vectors[idx, :] = vec
            except IndexError:
                if vals[0] == '<unk>':  # ignore the <unk> vector
                    pass
                else:
                    raise Exception('IncompatibleInputs')
    print("done in " + str(int(time.time() - t)) + " seconds")

    print('Reading train and test data ... ', end='')
    t = time.time()
    dictionary = dict()
    with codecs.open(args.data_path + "/dictionary.txt", 'r', "UTF-8") as f:
        for line in f.read().splitlines():
            tmp = line.split("|")
            dictionary[tmp[0]] = int(tmp[1])
    with codecs.open(args.data_path + "/datasetSentences.txt", "r",
                     "UTF-8") as f:
        sentences = [s.split("\t")[1] for s in f.read().splitlines()[1:]]
    all_labels = []
    with open(args.data_path + "/sentiment_labels.txt") as f:
        for label in f.read().splitlines()[1:]:
            all_labels.append(float(label.split("|")[1]))
    split_classes = []
    with open(args.data_path + "/datasetSplit.txt") as f:
        for line in f.read().splitlines()[1:]:
            split_classes.append(int(line.split(",")[1]))
    print("done in " + str(int(time.time() - t)) + " seconds")

    print('Generating train and test samples from the data for selected '
          'classes ... ', end='')
    t = time.time()
    train_size = sum(1 for label in split_classes if label == 1)
    val_size = sum(1 for label in split_classes if label == 3)
    test_size = sum(1 for label in split_classes if label == 2)
    train_samples = np.zeros([train_size, vector_dim])
    train_labels = []
    val_samples = np.zeros([val_size, vector_dim])
    val_labels = []
    test_samples = np.zeros([test_size, vector_dim])
    test_labels = []
    # Map split id -> (sample matrix, label list); 1=train, 3=val, 2=test.
    buckets = {
        1: (train_samples, train_labels),
        3: (val_samples, val_labels),
        2: (test_samples, test_labels),
    }
    fill_counts = {1: 0, 2: 0, 3: 0}
    not_in_dict_count = 0
    for sample_no, sentence in enumerate(sentences):
        try:
            score = all_labels[dictionary[sentence]]
        except KeyError:  # BUG FIX: bare except also hid real errors
            not_in_dict_count += 1
            continue
        if 0.4 < score <= 0.6:
            continue  # eliminate neutral sentences
        inds = process_sentence(sentence, vocab_dict)
        if len(inds) == 0:
            continue
        split = split_classes[sample_no]
        if split not in buckets:
            continue
        samples, labels = buckets[split]
        row = fill_counts[split]
        # Sentence feature = mean of its word vectors.
        for ind in inds:
            samples[row, :] += vectors[ind, :]
        samples[row, :] = samples[row, :] / len(inds)
        labels.append(0 if score <= 0.4 else 1)
        fill_counts[split] += 1
    train_samples = train_samples[:fill_counts[1], :]
    val_samples = val_samples[:fill_counts[3], :]
    test_samples = test_samples[:fill_counts[2], :]
    print("done in " + str(int(time.time() - t)) + " seconds")

    print('Training linear SVM for parameter optimization ... ', end='')
    tuned_parameters = [{
        'kernel': ['linear'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    }]
    clf = GridSearch(model=SVC(), param_grid=tuned_parameters)
    clf.fit(train_samples, train_labels, val_samples, val_labels)
    print("done in " + str(int(time.time() - t)) + " seconds")

    predicted_labels = clf.predict(test_samples)
    accuracy = sum(
        true == predicted
        for true, predicted in zip(test_labels, predicted_labels)
    ) / len(test_samples) * 100
    print("Accuracy for sentiment classification of sentences is: " +
          str(round(accuracy, 2)) + "% (" +
          str(int(accuracy / 100 * len(predicted_labels))) + "/" +
          str(len(predicted_labels)) + ")")
    # BUG FIX: use a context manager so the file is closed even on error.
    with open(args.output_file, "w") as f_out:
        f_out.write("Accuracy for sentiment classification is: " +
                    str(round(accuracy, 2)) + "% (" +
                    str(int(accuracy / 100 * len(predicted_labels))) + "/" +
                    str(len(predicted_labels)) + ")\n")