def deserialize_gradient_boosting(model_dict):
    model = GradientBoostingClassifier(**model_dict['params'])
    estimators = [
        regression.deserialize_decision_tree_regressor(tree)
        for tree in model_dict['estimators_']
    ]
    model.estimators_ = np.array(estimators).reshape(
        model_dict['estimators_shape'])

    if 'init_' in model_dict and model_dict['init_']['meta'] == 'dummy':
        model.init_ = dummy.DummyClassifier()
        model.init_.__dict__ = model_dict['init_']
        model.init_.__dict__.pop('meta')

    model.classes_ = np.array(model_dict['classes_'])
    model.train_score_ = np.array(model_dict['train_score_'])
    model.max_features_ = model_dict['max_features_']
    model.n_classes_ = model_dict['n_classes_']
    model.n_features_ = model_dict['n_features_']

    if model_dict['loss_'] == 'deviance':
        model.loss_ = _gb_losses.BinomialDeviance(model.n_classes_)
    elif model_dict['loss_'] == 'exponential':
        model.loss_ = _gb_losses.ExponentialLoss(model.n_classes_)
    elif model_dict['loss_'] == 'multinomial':
        model.loss_ = _gb_losses.MultinomialDeviance(model.n_classes_)

    if 'priors' in model_dict:
        model.init_.priors = np.array(model_dict['priors'])

    return model
def getF1_SAF_allrows(allEntries):
    y_pred = {}
    for algo in ALGOS:
        algoStr = str(algo).split('.')[1].upper()
        y_pred[algoStr] = []
        y_pred[algoStr].append([])
        y_pred[algoStr].append([])
    y_actual = []
    print(len(allEntries))

    threshold_sets = {
        'St_c_DS': getThreshold(THRESHOLD_SETS.HUMANCLONE_QUART3,
                                DB_SETS.GT10_DB_DATA, 'all'),
        'St_n_DS': getThreshold(THRESHOLD_SETS.HUMANND_MEDIAN,
                                DB_SETS.GT10_DB_DATA, 'all'),
        'O_s_DS': getThreshold(THRESHOLD_SETS.OPTIMAL,
                               DB_SETS.GT10_DB_DATA, 'all'),
        'O_c_DS': getThreshold(THRESHOLD_SETS.OPTIMAL_CLASSIFICATION_CLONE,
                               DB_SETS.GT10_DB_DATA, 'all'),
        'O_n_DS': getThreshold(THRESHOLD_SETS.OPTIMAL_CLASSIFICATION_ND,
                               DB_SETS.GT10_DB_DATA, 'all')
    }
    # threshold_sets["proportion_based"] = [getThreshold(THRESHOLD_SETS.FULLDB_QUART1, DB_SETS.GT10_DB_DATA, 'all'), getThreshold(THRESHOLD_SETS.FULLDB_MEDIAN, DB_SETS.GT10_DB_DATA, 'all')]
    # threshold_sets["sample_based"] = [getThreshold(THRESHOLD_SETS.HUMANCLONE_QUART3, DB_SETS.GT10_DB_DATA, 'all'), getThreshold(THRESHOLD_SETS.HUMANND_MEDIAN.FULLDB_MEDIAN, DB_SETS.GT10_DB_DATA, 'all')]

    algoScoreRows = []
    fieldNames = [
        'thresholdSet', 'algoName', 'threshold', 'precision', 'recall', 'f1'
    ]
    dummyClassifier = dummy.DummyClassifier(strategy="stratified")
    print(threshold_sets)

    for threshold_set_name in threshold_sets:
        threshold_set = threshold_sets[threshold_set_name]
        for algo in ALGOS:
            algoStr = str(algo).split('.')[1].upper()
            threshold = threshold_set[algoStr]
            precision, recall, f1 = getF1_SAF(threshold, allEntries, algoStr)
            algoScoreRows.append({
                'thresholdSet': threshold_set_name,
                'algoName': algoStr,
                'threshold': threshold,
                'precision': precision,
                'recall': recall,
                'f1': f1
            })

    writeCSV(fieldNames, algoScoreRows,
             "rq2_1_" + str(datetime.now().strftime("%Y%m%d-%H%M%S")) + ".csv")
def fit_baseline(self, x, y):
    '''
    Fit the baseline for the MetaEstimator. That is, depending on the loss
    function, determine the optimal constant predictor, based on the training
    data on the output.
    '''
    # Determine if regression or classification problem
    if self.method_type is None:
        is_above = len(np.unique(y, axis=0)) > self.cutoff_categorical
        self.method_type = ('classif', 'regr')[is_above]

    # Fit a Dummy (constant) estimator
    if self.method_type == 'regr':
        self.fitted = dummy.DummyRegressor().fit(x, y)
    else:
        self.fitted = dummy.DummyClassifier().fit(x, y)
        self.classes = dummy.DummyClassifier().fit(x, y).classes_
def build_model(model_type, num_targets=1):
    if model_type == 'gradient_boosting':
        base = ensemble.GradientBoostingClassifier(n_estimators=100,
                                                   verbose=True)
    elif model_type == 'random_forest':
        base = ensemble.RandomForestClassifier()
    elif model_type == 'dummy_stratified':
        base = dummy.DummyClassifier(strategy='stratified')
    elif model_type == 'dummy_most_frequent':
        base = dummy.DummyClassifier(strategy='most_frequent')
    else:
        raise ValueError('invalid model type: {}'.format(model_type))

    # multiple outputs in the dataset => fit a separate classifier to each
    if num_targets > 1:
        return multioutput.MultiOutputClassifier(base)
    else:
        return base
def build_sklearn(self, model_id, model_params):
    """Method that builds models implemented in sklearn"""
    if model_id == 'sklearn_LogisticRegressionCV':
        return linear_model.LogisticRegressionCV(**model_params)
    if model_id == 'sklearn_LogisticRegression':
        return linear_model.LogisticRegression(**model_params)
    elif model_id == 'sklearn_MLPClassifier':
        return neural_network.MLPClassifier(**model_params)
    elif model_id == 'sklearn_GaussianNB':
        return naive_bayes.GaussianNB(**model_params)
    elif model_id == 'sklearn_MultinomialNB':
        return naive_bayes.MultinomialNB(**model_params)
    elif model_id == 'sklearn_BernoulliNB':
        return naive_bayes.BernoulliNB(**model_params)
    elif model_id == 'sklearn_RandomForestClassifier':
        return ensemble.RandomForestClassifier(**model_params)
    elif model_id == 'sklearn_SVC':
        return svm.SVC(**model_params)
    elif model_id == 'sklearn_AdaBoostClassifier':
        return ensemble.AdaBoostClassifier(**model_params)
    elif model_id == 'sklearn_SGDClassifier':
        return linear_model.SGDClassifier(**model_params)
    elif model_id == 'sklearn_PassiveAggressiveClassifier':
        return linear_model.PassiveAggressiveClassifier(**model_params)
    elif model_id == 'sklearn_RidgeClassifier':
        return linear_model.RidgeClassifier(**model_params)
    elif model_id == 'sklearn_DummyClassifier':
        return dummy.DummyClassifier(**model_params)
    elif model_id == 'sklearn_KNeighborsClassifier':
        return neighbors.KNeighborsClassifier(**model_params)
    elif model_id == 'sklearn_DecisionTreeClassifier':
        return tree.DecisionTreeClassifier(**model_params)
    elif model_id == 'sklearn_LinearRegression':
        return linear_model.LinearRegression(**model_params)
    elif model_id == 'sklearn_LassoCV':
        return linear_model.LassoCV(**model_params)
    elif model_id == 'sklearn_RidgeCV':
        return linear_model.RidgeCV(**model_params)
    elif model_id == 'sklearn_Ridge':
        return linear_model.Ridge(**model_params)
    elif model_id == 'sklearn_DummyRegressor':
        return dummy.DummyRegressor(**model_params)
    elif model_id == 'sklearn_RandomForestRegressor':
        return ensemble.RandomForestRegressor(**model_params)
    elif model_id == 'sklearn_GradientBoostingRegressor':
        return ensemble.GradientBoostingRegressor(**model_params)
    elif model_id == 'sklearn_MLPRegressor':
        return neural_network.MLPRegressor(**model_params)
    elif model_id == 'sklearn_KNeighborsRegressor':
        return neighbors.KNeighborsRegressor(**model_params)
    elif model_id == 'sklearn_SVR':
        return svm.SVR(**model_params)
    elif model_id == 'sklearn_SGDRegressor':
        return linear_model.SGDRegressor(**model_params)
    elif model_id == 'sklearn_DecisionTreeRegressor':
        return tree.DecisionTreeRegressor(**model_params)
    return None
def train(excel_file, text_column, labels_column, train_test_idxs_file,
          n_jobs, model_file, n_accepted_probs, output_file):
    execution_info = pd.DataFrame()
    execution_info['Start date'] = [get_local_time_str()]

    torch.manual_seed(RANDOM_STATE)
    device = torch.device(f'cuda:{torch.cuda.current_device()}'
                          if torch.cuda.is_available()
                          else 'cpu')
    device_str = f'{device.type}:{device.index} ({torch.cuda.get_device_name(device.index)})' \
        if device.type == 'cuda' \
        else device.type
    print(f'Device: {device_str}')

    df = pd.read_excel(excel_file)
    df = df.fillna('NaN')
    corpus = df[text_column].tolist()
    labels = df[labels_column].tolist()

    train_test_idxs = load_json(train_test_idxs_file)
    train_idxs = train_test_idxs['train_idxs']
    test_idxs = train_test_idxs['test_idxs']

    corpus_train = utils.safe_indexing(corpus, train_idxs)
    corpus_test = utils.safe_indexing(corpus, test_idxs)
    y_train = utils.safe_indexing(labels, train_idxs)
    y_test = utils.safe_indexing(labels, test_idxs)

    train_set = BERTTokenizedDataset(corpus_train, y_train)
    val_set = BERTTokenizedDataset(corpus_test, y_test)
    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE,
                              num_workers=n_jobs - 1)
    val_loader = DataLoader(val_set, batch_size=BATCH_SIZE,
                            num_workers=n_jobs - 1)
    assert train_loader.dataset.classes_ == val_loader.dataset.classes_

    net = BERTNeuralNet(len(val_loader.dataset.classes_),
                        freeze_bert=FREEZE_BERT)
    net.load_state_dict(
        torch.load(model_file, map_location=device)['model_state_dict'])
    net.additional_layers = nn.Sequential(
        *list(net.additional_layers.children())[0:-1])

    ft = FeatureExtractor(device, net)
    X_train = ft.extract_features(train_loader, 'X_train.pkl', 'X_train.dat')
    X_test = ft.extract_features(val_loader, 'X_test.pkl', 'X_test.dat')

    clfs = [
        ensemble.RandomForestClassifier(n_estimators=100, n_jobs=n_jobs,
                                        random_state=RANDOM_STATE),
        LinearSVC(random_state=RANDOM_STATE),
        dummy.DummyClassifier(strategy='stratified',
                              random_state=RANDOM_STATE,
                              constant=None),
        linear_model.SGDClassifier(loss='modified_huber', max_iter=1000,
                                   tol=1e-3, n_jobs=n_jobs,
                                   random_state=RANDOM_STATE)
    ]

    predictions = {'y_true': y_test}

    for clf in tqdm(iterable=clfs, desc='Fitting classifiers', unit='clf'):
        clf.fit(X_train, y_train)
        dump_pickle(clf, '%s.pkl' % (clf.__class__.__name__))

    for clf in tqdm(iterable=clfs, desc='Obtaining probabilities', unit='clf'):
        y_predict_proba = clf.predict_proba(X_test)
        dicts = predict_proba_to_dicts(clf.classes_, y_predict_proba)
        predictions[clf.__class__.__name__] = dicts

    dump_json(predictions, 'predictions.json')

    execution_info['End date'] = [get_local_time_str()]
    execution_info['Excel file'] = [excel_file]
    execution_info['Text column'] = [text_column]
    execution_info['Label column'] = [labels_column]
    execution_info['Accepted probabilities'] = [n_accepted_probs]
    execution_info['Device'] = [device_str]
    execution_info['Base model'] = [model_file]
    execution_info['Batch size'] = [BATCH_SIZE]

    generate_report(execution_info, predictions, output_file)
def train(self):
    """
    Train the family level classifiers
    """
    print("Training ARO classifiers for each family")
    family_level_classifiers = {}
    for family in tqdm(self.card.gene_family_to_aro.keys()):
        family_name = family.replace(' ', '_').replace('/', '_')

        # get all the aros relevant to the family
        family_aros = self.card.gene_family_to_aro[family]

        # filter input to just the columns containing similarity to aros
        # within the family
        X_train = self.X[family_aros]

        # get the indices where the label is one of the AROs belonging to
        # the current family
        label_indices = [
            ix for ix, x in enumerate(self.y) if x in family_aros
        ]
        y_train = np.array(self.y)[label_indices]

        # grab only the reads where the label index is an ARO belonging to the
        # current family being trained
        X_train = X_train.iloc[label_indices]

        if os.path.exists('models/{}.pkl'.format(family_name)):
            family_clf = joblib.load('models/{}.pkl'.format(family_name))
            family_level_classifiers.update(
                {family: [family_clf, family_aros]})
            continue

        # i.e. if family only has a single member
        if len(family_aros) == 1:
            family_clf = dummy.DummyClassifier(strategy='constant',
                                               constant=family_aros[0])
            family_clf.fit(X_train, y_train)
            joblib.dump(family_clf, 'models/{}.pkl'.format(family_name))
            family_level_classifiers.update(
                {family: [family_clf, family_aros]})
        else:
            # rebalance using SMOTE
            X_resampled, y_resampled = SMOTE(
                kind='borderline1').fit_sample(X_train, y_train)
            family_clf = ensemble.RandomForestClassifier()
            family_clf.fit(X_resampled, y_resampled)
            joblib.dump(family_clf, 'models/{}.pkl'.format(family_name))
            family_level_classifiers.update(
                {family: [family_clf, family_aros]})

    self.family_level_classifiers = family_level_classifiers
def __init__(self, use_stacked_prob=False, stacked_classifier="decision_tree",
             estimators_to_remove=[], include_original_input=False):
    """Setup a SuperLearner classifier"""
    self.decision_tree = tree.DecisionTreeClassifier(criterion="entropy",
                                                     max_depth=7,
                                                     min_samples_split=11)
    self.random_forest = ensemble.RandomForestClassifier(
        n_estimators=500, max_features=4)  # change_max_features
    self.bagging = ensemble.BaggingClassifier(
        base_estimator=tree.DecisionTreeClassifier(criterion="entropy"),
        n_estimators=10)
    self.logistic_model = linear_model.LogisticRegression(multi_class='auto')
    self.k_nearest_neighbours = neighbors.KNeighborsClassifier(n_neighbors=5)
    self.linear_svc = svm.SVC(kernel="linear", C=1.0, probability=True)

    self.include_original_input = include_original_input
    self.use_stacked_prob = use_stacked_prob

    self.estimators = {
        "decision_tree": self.decision_tree,
        "random_forest": self.random_forest,
        "bagging": self.bagging,
        "logistic_regression": self.logistic_model,
        "k_nearest_neighbours": self.k_nearest_neighbours,
        "linear_svc": self.linear_svc
    }

    # can use any subset of the available estimators
    self.estimators = {
        key: value
        for key, value in self.estimators.items()
        if key not in estimators_to_remove
    }

    # stacked layer classifier
    if stacked_classifier == "decision_tree" or stacked_classifier is None:
        self.Z_classifier = tree.DecisionTreeClassifier(criterion="entropy")
    elif stacked_classifier == "logistic_regression":
        self.Z_classifier = linear_model.LogisticRegression()
    elif stacked_classifier == "k_nearest_neighbours":
        self.Z_classifier = neighbors.KNeighborsClassifier(n_neighbors=5)
    elif stacked_classifier == "random_forest":
        self.Z_classifier = ensemble.RandomForestClassifier(n_estimators=500)
    elif stacked_classifier == "most_frequent":
        self.Z_classifier = dummy.DummyClassifier(strategy="most_frequent")
    else:
        raise ValueError(
            'Unknown classifier for the stacked layer, check spelling')
def zr(data_for_algos):
    model = dummy.DummyClassifier(strategy="most_frequent")
    X = np.asarray(list(map(lambda row: row[:-1], data_for_algos)))
    y = np.asarray(list(map(lambda row: row[-1], data_for_algos)))
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=0)
    model.fit(X_train, y_train)
    return model, model.score(X_test, y_test)
def baseline(data):
    strategies = ['stratified', 'most_frequent', 'prior', 'uniform']
    baseDict = {}
    X, y, features = data.get_data(target=data.default_target_attribute,
                                   return_attribute_names=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    for strat in strategies:
        clf = dummy.DummyClassifier(strategy=strat, random_state=0)
        clf.fit(X_train, y_train)
        baseDict[strat] = clf.score(X_test, y_test)
    return baseDict
def calculate_classification_metrics(data, features):
    if len(features) == 0:
        clsfr = dummy.DummyClassifier(strategy="most_frequent")
    else:
        clsfr = linear_model.LogisticRegression()

    data_permutation = generate_permutation(len(data["class_label"]))

    # prepare data for classification
    X, y, race = generate_X_y_for_feature_set(data, features)

    # splitting data into train and test and preparing vector for calculating
    # disparate mistreatment
    y = y[data_permutation]
    X = X[data_permutation]
    race = race[data_permutation]
    total_entries = len(y)
    cut_index = int(total_entries / 2.0)
    X_train, X_test = X[:cut_index], X[cut_index:]
    y_train, y_test = y[:cut_index], y[cut_index:]
    race_train, race_test = race[:cut_index], race[cut_index:]

    # calculating accuracy, disparate mistreatment and auc
    clsfr.fit(X_train, y_train)
    results = dict()
    predicted_labels = clsfr.predict(X_test)
    results["predicted_labels"] = predicted_labels
    results["accuracy"] = sum(y_test == predicted_labels) / len(y_test)
    results["auc"] = roc_auc_score(y_test, predicted_labels)

    # disparate mistreatment: false positives and false negatives for
    # Caucasian and non-Caucasian groups
    results["fp_C"] = sum(
        np.logical_and(predicted_labels == 1,
                       np.logical_and(race_test == 1, y_test == -1))) / max(
                           sum(np.logical_and(race_test == 1, y_test == -1)), 1)
    results["fn_C"] = sum(
        np.logical_and(predicted_labels == -1,
                       np.logical_and(race_test == 1, y_test == 1))) / max(
                           sum(np.logical_and(race_test == 1, y_test == 1)), 1)
    results["fp_nC"] = sum(
        np.logical_and(predicted_labels == 1,
                       np.logical_and(race_test == 0, y_test == -1))) / max(
                           sum(np.logical_and(race_test == 0, y_test == -1)), 1)
    results["fn_nC"] = sum(
        np.logical_and(predicted_labels == -1,
                       np.logical_and(race_test == 0, y_test == 1))) / max(
                           sum(np.logical_and(race_test == 0, y_test == 1)), 1)
    results["disparate_mistreatment"] = abs(
        results["fp_C"] - results["fp_nC"]) + abs(results["fn_C"] -
                                                  results["fn_nC"])
    return results
def train(excel_file, text_column, labels_column, train_test_idxs_file,
          n_jobs, n_accepted_probs, output_file):
    execution_info = pd.DataFrame()
    execution_info['Start date'] = [get_local_time_str()]

    df = pd.read_excel(excel_file)
    df = df.fillna('NaN')

    preprocessor = Preprocessor()
    corpus = preprocessor.preprocess(df[text_column])
    dump_json(corpus, 'preprocessed_corpus_ELMo.json')
    labels = df[labels_column].tolist()

    train_test_idxs = load_json(train_test_idxs_file)
    train_idxs = train_test_idxs['train_idxs']
    test_idxs = train_test_idxs['test_idxs']

    corpus_train = utils.safe_indexing(corpus, train_idxs)
    corpus_test = utils.safe_indexing(corpus, test_idxs)
    y_train = utils.safe_indexing(labels, train_idxs)
    y_test = utils.safe_indexing(labels, test_idxs)

    ft = FeatureExtractor()
    X_train = ft.extract_features(corpus_train, 'X_train_ELMo.pkl',
                                  'X_train_ELMo.dat')
    X_test = ft.extract_features(corpus_test, 'X_test_ELMo.pkl',
                                 'X_test_ELMo.dat')

    clfs = [
        ensemble.RandomForestClassifier(n_estimators=100, n_jobs=n_jobs,
                                        random_state=RANDOM_STATE),
        LinearSVC(random_state=RANDOM_STATE),
        dummy.DummyClassifier(strategy='stratified',
                              random_state=RANDOM_STATE,
                              constant=None),
        linear_model.SGDClassifier(loss='modified_huber', max_iter=1000,
                                   tol=1e-3, n_jobs=n_jobs,
                                   random_state=RANDOM_STATE)
    ]

    predictions = {'y_true': y_test}

    for clf in tqdm(iterable=clfs, desc='Fitting classifiers', unit='clf'):
        clf.fit(X_train, y_train)
        dump_pickle(clf, '%s.pkl' % (clf.__class__.__name__))

    for clf in tqdm(iterable=clfs, desc='Obtaining probabilities', unit='clf'):
        y_predict_proba = clf.predict_proba(X_test)
        dicts = predict_proba_to_dicts(clf.classes_, y_predict_proba)
        predictions[clf.__class__.__name__] = dicts

    dump_json(predictions, 'predictions.json')

    execution_info['End date'] = [get_local_time_str()]
    execution_info['Excel file'] = excel_file
    execution_info['Text column'] = text_column
    execution_info['Label column'] = labels_column
    execution_info['n_jobs'] = n_jobs
    execution_info['Accepted probabilities'] = n_accepted_probs

    generate_report(execution_info, predictions, output_file)
def testZeroHour(STOCK, future_day, data_for_algos, data_to_predict_for_algos, test_classes): try: model = dummy.DummyClassifier(strategy="most_frequent") X = np.asarray(list(map(lambda row: row[:-1], data_for_algos))) y = np.asarray(list(map(lambda row: row[-1], data_for_algos))) model.fit(X, y) predictions = model.predict(data_to_predict_for_algos) our_test_score = get_test_score(predictions, test_classes) result = result_in_csv(STOCK, 'ZR', Future_day=future_day, Our_test_score=our_test_score) except: result = result_in_csv(STOCK, 'ZR', Future_day=future_day, Our_test_score=-1) print(result) return result
def classification(data_matrix, target_matrix, test_matrix,
                   strategy='most_frequent'):
    print("data detected", datetime.now().time())
    model = dummy.DummyClassifier(strategy=strategy, random_state=None,
                                  constant=None)
    print("model made", datetime.now().time())
    model.fit(data_matrix, target_matrix)
    print("model fitted", datetime.now().time())
    results = model.predict(test_matrix)
    print(results)
def fit(self, x, y):
    '''
    Fit method for the MetaEstimator. The output is a fitted estimator that
    can then be used for prediction.
    '''
    # Determine if regression or classification problem by comparing the
    # number of unique values in the output against a threshold
    if self.method_type is None:
        is_above = len(np.unique(y, axis=0)) > self.cutoff_categorical
        self.method_type = ('classif', 'regr')[is_above]

    # Fetch the appropriate list of estimators
    if self.estimators is None:
        if self.method is not None:
            self.get_estim(y)
        else:
            if self.method_type == 'regr':
                self.estimators = linear_model.LassoCV(normalize=True)
            elif self.method_type == 'classif':
                self.estimators = ensemble.RandomForestClassifier(
                    random_state=1)
    else:
        if self.method_type == 'regr':
            self.estimators = self.estimators[0]
        elif self.method_type == 'classif':
            self.estimators = self.estimators[1]

    # Collect information on classes in training set (needed later)
    if self.method_type == 'classif':
        self.classes = dummy.DummyClassifier().fit(x, y).classes_

    # Fit according to respective ensembling method
    if self.method == 'stacking':
        if self.method_type == 'regr':
            self.fitted = regressor.StackingRegressor(
                regressors=self.estimators,
                meta_regressor=linear_model.LinearRegression()).fit(x, y)
        elif self.method_type == 'classif':
            self.fitted = classifier.StackingClassifier(
                classifiers=self.estimators,
                meta_classifier=linear_model.LogisticRegression(
                    random_state=1)).fit(x, y)
    elif self.method == 'multiplexing':
        # For multiplexing, cross validation scores determine which
        # estimator is chosen
        for i in self.estimators:
            self.losses.append(np.mean(cross_val_score(i, x, y)))
        self.fitted = self.estimators[np.argmin(self.losses)].fit(x, y)
    else:
        self.fitted = self.estimators.fit(x, y)
    return self
def zero_cost_model(self, X, y, add_to_model=False):
    if self.base_model._estimator_type == 'classifier':
        model = dummy.DummyClassifier(strategy="prior")
    elif self.base_model._estimator_type == 'regressor':
        model = dummy.DummyRegressor(strategy="mean")
    else:
        raise TypeError("sklearn Classifier or Regressor required!")
    cost = 0
    features = []
    model.fit(self.selectfeats(X, features), y)
    if add_to_model:
        self.model_costs.insert(0, cost)
        self.model_features.insert(0, features)
        self.models.insert(0, model)
    return (model, cost, features)
def run_cv(data, features, labels, folds):
    baseline = dummy.DummyClassifier(strategy='most_frequent')
    predictions = cross_val_predict(baseline, features, labels, cv=folds,
                                    n_jobs=-1)
    print('Cross-validated baseline most frequent:',
          accuracy_score(labels, predictions), file=stderr)

    nb = naive_bayes.GaussianNB()
    predictions = cross_val_predict(nb, features, labels, cv=folds, n_jobs=-1)
    print('Cross-validated naive Bayes:',
          accuracy_score(labels, predictions), file=stderr)

    knn_uni = neighbors.KNeighborsClassifier()
    predictions = cross_val_predict(knn_uni, features, labels, cv=folds,
                                    n_jobs=-1)
    print('Cross-validated KNN (uniform):',
          accuracy_score(labels, predictions), file=stderr)

    knn_dist = neighbors.KNeighborsClassifier(weights='distance')
    predictions = cross_val_predict(knn_dist, features, labels, cv=folds,
                                    n_jobs=-1)
    print('Cross-validated KNN (distance):',
          accuracy_score(labels, predictions), file=stderr)

    logreg = linear_model.LogisticRegression()
    # rfecv = RFECV(logreg, cv=10)
    predictions = cross_val_predict(logreg, features, labels, cv=folds,
                                    n_jobs=-1)
    print('Cross-validated logistic regression:',
          accuracy_score(labels, predictions), file=stderr)

    for i, prediction in enumerate(predictions):
        print(data['docno'][i], data['query'][i], prediction)
def __pred_randomly(self, X_train, y_train, X_test):
    dummyX_train = [[0] for x in X_train]
    dummyX_test = [[0] for x in X_test]
    clf = None
    if self.dataset.type == 'c':
        clf = dummy.DummyClassifier(strategy=self.dummy_strategy)
    else:
        clf = dummy.DummyRegressor()
    clf.fit(dummyX_train, y_train)
    return clf.predict(dummyX_test)
def testZeroHour(STOCK, future_day, data_for_algos):
    test_size = future_day
    X = np.asarray(list(map(lambda row: row[:-1], data_for_algos)))
    y = np.asarray(list(map(lambda row: row[-1], data_for_algos)))
    try:
        model = dummy.DummyClassifier(strategy="most_frequent")
        model.fit(X[:-test_size], y[:-test_size])
    except:
        return result_in_csv(STOCK, 'ZR', Future_day=future_day,
                             Our_test_score=-1)
    our_score = model.score(X[-test_size:], y[-test_size:])
    return result_in_csv(STOCK, 'ZR', Future_day=future_day,
                         Our_test_score=our_score)
def zr(data, future_day):
    scores = []
    X = np.asarray(list(map(lambda row: row[:-1], data)))
    y = np.asarray(list(map(lambda row: row[-1], data)))
    train_indices, test_indices = k_splits.get_max_k_splits(
        X, k=10, size_of_each_split=future_day)
    model = dummy.DummyClassifier(strategy="most_frequent")
    predict_score = -1
    for train_index, test_index in zip(train_indices, test_indices):
        X_train, y_train, X_test, y_test = k_splits.get_train_test_set(
            X, y, train_index, test_index)
        model.fit(X_train, y_train)
        predict_score = score.get_score(model, X_test, y_test)
        scores.append(predict_score)
    mean_score = np.mean(scores[:-1])
    mean_score = -1 if np.isnan(mean_score) else mean_score
    return model, mean_score, predict_score
def testZeroHour(STOCK, future_day, data_for_algos, data_to_predict_for_algos, test_classes): try: model = dummy.DummyClassifier(strategy="most_frequent") X = np.asarray(list(map(lambda row: row[:-1], data_for_algos))) y = np.asarray(list(map(lambda row: row[-1], data_for_algos))) model.fit(X, y) predictions = model.predict(data_to_predict_for_algos) our_test_score = collections.Counter(predictions[0:future_day] * test_classes[0:future_day]).get(1) our_test_score = 0 if our_test_score is None else our_test_score result = f"{STOCK},ZR,0,0,0,0,{future_day},{our_test_score}\n" except: result = f"{STOCK},ZR,0,0,0,0,{future_day},error\n" print(result) return result
def run_baseline(config, datasets):
    # model = dummy.DummyClassifier(strategy='stratified')
    model = dummy.DummyClassifier(strategy='most_frequent')
    model.fit(datasets.train_vectors, datasets.train_labels)

    # test_acc = model.score(datasets.test_vectors, datasets.test_labels)
    preds = model.predict(datasets.test_vectors)
    test_acc = accuracy_score(datasets.test_labels, preds)
    print("baseline(most_freq) test_acc: %.4f" % (test_acc))

    test_acc_by_tag = {}
    test_acc_by_freqbin = {}
    if config.test_tags:
        test_acc_by_tag = test_breakdown_by_tag(datasets.test_labels, preds,
                                                datasets.test_words, '__')
    if config.freq_dict:
        test_acc_by_freqbin = test_breakdown_by_freqbin(
            datasets.test_labels, preds, datasets.test_words, '__',
            datasets.freqbin_dict)
    return test_acc, test_acc_by_tag, test_acc_by_freqbin
def __create_models(self) -> list[tuple[Any, ModelType]]:
    n_jobs = self.config.n_jobs
    model_type = self.config.model_type
    models = []
    if model_type in [ModelType.ALL, ModelType.DUMMY]:
        models.append((dummy.DummyClassifier(strategy="stratified"),
                       ModelType.DUMMY))
    if model_type in [ModelType.ALL, ModelType.RANDOM_FOREST]:
        models.append((
            ensemble.RandomForestClassifier(n_jobs=n_jobs),
            ModelType.RANDOM_FOREST,
        ))
    if model_type in [ModelType.ALL, ModelType.EXTRA_TREES]:
        models.append((
            ensemble.ExtraTreesClassifier(n_jobs=n_jobs),
            ModelType.EXTRA_TREES,
        ))
    if model_type in [ModelType.ALL, ModelType.LGBM]:
        n_labels = self.dataset_train.n_labels
        models.append((
            lgb.LGBMClassifier(objective="multiclass", num_class=n_labels,
                               n_jobs=n_jobs),
            ModelType.LGBM,
        ))
    if model_type in [ModelType.ALL, ModelType.SVM]:
        models.append((svm.SVC(), ModelType.SVM))
    if model_type in [ModelType.ALL, ModelType.KNN]:
        models.append(
            (neighbors.KNeighborsClassifier(n_jobs=n_jobs), ModelType.KNN))
    return models
def test_gridsearch():
    doctor = strategyGame.strategyGameDoctor()
    ratingCounts = [10, 30, 50, 80, 100, 150, 200, 500, 1000, 2000, 3000]
    ratingCounts = [100]
    scores = []
    for minRatingCount in ratingCounts:
        Xdata, ydata = getData(minRatingCount)
        X = doctor.readXdata(Xdata)
        y = doctor.readydata(ydata)
        # X, y = preprocess.balanceSample(X, y)

        pipe = doctor.getPipe()
        model1 = ordinalClassifier.OrdinalClassifier(
            linear_model.LogisticRegression())
        model2 = tree.DecisionTreeClassifier(max_depth=5)
        model3 = ensemble.RandomForestClassifier(max_depth=10)
        model4 = dummy.DummyClassifier(strategy="most_frequent")
        transformer1 = pipeline.Pipeline([("pac", decomposition.PCA())])
        transformer2 = "passthrough"
        # model5 = naive_bayes.GaussianNB()

        paramGrid = {
            "tranformer": [transformer2],
            "model": [model3, model4],
            # "model__max_depth": [3, 5, 7, 10, 15],
        }
        gscv = model_selection.GridSearchCV(pipe, paramGrid, cv=5,
                                            scoring="accuracy")
        gscv.fit(X, y)

        print("\nGrid Search Report")
        for key in ["mean_test_score", "std_test_score", "rank_test_score"]:
            print(f"{key}:{[round(x,2) for x in gscv.cv_results_[key]]}")
        scores.append(
            (minRatingCount, list(gscv.cv_results_["mean_test_score"])))
    print(scores)
    ensemble.RandomForestClassifier(max_depth=2, random_state=0),
    'Adaboost': ensemble.AdaBoostClassifier(random_state=0),
    'MultinomialNB': naive_bayes.MultinomialNB(),
    # 'GaussianNB': gnb_predict,
    'BernoulliNB': naive_bayes.BernoulliNB(),
    'KNN': neighbors.KNeighborsClassifier(n_neighbors=10),
    'SVM': svm.SVC(kernel='rbf', gamma=0.7, C=1, probability=True),
    # 'Random':
    # dummy.DummyClassifier(strategy='stratified'),
    'Most Frequent': dummy.DummyClassifier(strategy='most_frequent')
}

#----------------------------------------------preprocessing--------------------------------------------------------#
file = pd.read_csv('rnn3.csv')
dim = np.arange(length).astype(str)

# train
label = file[str(length)].values.astype(int)
data = np.asarray(file[dim].values / 101)
X_train, X_test, y_train, y_test = train_test_split(data, label,
                                                    test_size=0.2,
                                                    random_state=294967295)
import pandas as pd
import os
from sklearn import dummy

dir = 'E:/'
titanic_train = pd.read_csv(os.path.join(dir, 'train.csv'))
print(titanic_train.info())
print(titanic_train.columns)
titanic_train.groupby('Survived').size()

X_train = titanic_train[['SibSp', 'Parch']]
y_train = titanic_train['Survived']

dummy_estimator = dummy.DummyClassifier(strategy="stratified", random_state=10)
dummy_estimator.fit(X_train, y_train)

titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))
print(titanic_test.info())
X_test = titanic_test[['SibSp', 'Parch']]
titanic_test['Survived'] = dummy_estimator.predict(X_test)
titanic_test.groupby('Survived').size()
titanic_test.to_csv(os.path.join(dir, 'submission.csv'),
                    columns=['PassengerId', 'Survived'],
                    index=False)
def testDummyClassifier():
    dummyClassifier = dummy.DummyClassifier(strategy="stratified")
    X = [[0]] * 10
    y = [0, 1, 2, 0, 1, 2, 0, 0, 1, 2]
    dummyClassifier.fit(X, y)

    dbName = "/Test/gt10/gt10_last500Responses.db"
    connectToDB(dbName)
    allEntries = fetchAllNearDuplicates("where human_classification>=0")
    closeDBConnection()

    y_actual = []
    fieldNames = [
        'thresholdSet', 'algoName', 'c-thre', 'n-thre', 'precision', 'recall',
        'f1'
    ]
    for entry in allEntries:
        index = 4
        for algo in ALGOS:
            index = index + 1
        y_actual.append(entry[index])

    y_pred = dummyClassifier.predict(y_actual)
    precision, recall, f1, support = metrics.precision_recall_fscore_support(
        y_actual, y_pred, average="macro")
    # precision = metrics.precision(y_actual, y_pred[algoStr], average="macro")
    # recall = metrics.recall(y_actual, y_pred[algoStr], average="macro")
    # f1 = metrics.f1_score(y_actual, y_pred[algoStr], average="macro")
    row1 = {
        'thresholdSet': None,
        'algoName': "dummy",
        'c-thre': None,
        'n-thre': None,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    print(row1)

    dbName = "/comparator/src/main/resources/GoldStandards/SS.db"
    connectToDB(dbName)
    allEntries = fetchAllNearDuplicates("where human_classification>=0")
    closeDBConnection()

    y_actual = []
    for entry in allEntries:
        index = 4
        for algo in ALGOS:
            index = index + 1
        y_actual.append(entry[index])

    y_pred = dummyClassifier.predict(y_actual)
    precision, recall, f1, support = metrics.precision_recall_fscore_support(
        y_actual, y_pred, average="macro")
    row2 = {
        'thresholdSet': None,
        'algoName': "dummy",
        'c-thre': None,
        'n-thre': None,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    print(row2)

    writeCSV(fieldNames, [row1, row2], "rq1_dummy.csv")
def getF1_Classifier(allEntries):
    y_pred = {}
    for algo in ALGOS:
        algoStr = str(algo).split('.')[1].upper()
        y_pred[algoStr] = []
    y_actual = []
    print(len(allEntries))

    threshold_sets = {}
    # threshold_sets["proportion_based"] = [getThreshold(THRESHOLD_SETS.FULLDB_QUART1, DB_SETS.GT10_DB_DATA, 'all'), getThreshold(THRESHOLD_SETS.FULLDB_MEDIAN, DB_SETS.GT10_DB_DATA, 'all')]
    threshold_sets["statistical"] = [
        getThreshold(THRESHOLD_SETS.HUMANCLONE_QUART3, DB_SETS.GT10_DB_DATA,
                     'all'),
        getThreshold(THRESHOLD_SETS.HUMANND_MEDIAN, DB_SETS.GT10_DB_DATA,
                     'all')
    ]
    threshold_sets["optimal"] = [
        getThreshold(THRESHOLD_SETS.OPTIMAL_CLASSIFICATION_CLONE,
                     DB_SETS.GT10_DB_DATA, 'all'),
        getThreshold(THRESHOLD_SETS.OPTIMAL_CLASSIFICATION_ND,
                     DB_SETS.GT10_DB_DATA, 'all')
    ]

    algoScoreRows = []
    fieldNames = [
        'thresholdSet', 'algoName', 'c-thre', 'n-thre', 'precision', 'recall',
        'f1'
    ]
    dummyClassifier = dummy.DummyClassifier(strategy="stratified")
    print(threshold_sets)

    for threshold_set_name in threshold_sets:
        threshold_set = threshold_sets[threshold_set_name]
        cloneThresholds = threshold_set[0]
        ndThresholds = threshold_set[1]
        # print(cloneThresholds)
        for entry in allEntries:
            index = 4
            for algo in ALGOS:
                algoStr = str(algo).split('.')[1].upper()
                value = float(entry[index])
                pred = -1
                if algo.value[2] == "lt":
                    if value <= cloneThresholds[algoStr]:
                        pred = 0
                    if value > cloneThresholds[algoStr]:
                        if value <= ndThresholds[algoStr]:
                            pred = 1
                        else:
                            pred = 2
                else:
                    if value >= cloneThresholds[algoStr]:
                        pred = 0
                    if value < cloneThresholds[algoStr]:
                        if value >= ndThresholds[algoStr]:
                            pred = 1
                        else:
                            pred = 2
                y_pred[algoStr].append(pred)
                index = index + 1
            y_actual.append(entry[index])

        for algo in ALGOS:
            algoStr = str(algo).split('.')[1].upper()
            cm = metrics.confusion_matrix(y_actual, y_pred[algoStr])
            # print(cm)
            precision, recall, f1, support = metrics.precision_recall_fscore_support(
                y_actual, y_pred[algoStr], average="macro")
            # precision = metrics.precision(y_actual, y_pred[algoStr], average="macro")
            # recall = metrics.recall(y_actual, y_pred[algoStr], average="macro")
            # f1 = metrics.f1_score(y_actual, y_pred[algoStr], average="macro")
            row = {
                'thresholdSet': threshold_set_name,
                'algoName': algoStr,
                'c-thre': cloneThresholds[algoStr],
                'n-thre': ndThresholds[algoStr],
                'precision': precision,
                'recall': recall,
                'f1': f1
            }
            algoScoreRows.append(row)

    X = [[0]] * len(y_actual)
    dummyClassifier.fit(X, y_actual)
    y_pred_dummy = dummyClassifier.predict(y_actual)
    precision, recall, f1, support = metrics.precision_recall_fscore_support(
        y_actual, y_pred_dummy, average="macro")
    row2 = {
        'thresholdSet': None,
        'algoName': "dummy",
        'c-thre': None,
        'n-thre': None,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    algoScoreRows.append(row2)

    writeCSV(
        fieldNames, algoScoreRows,
        os.path.join(
            os.path.abspath(".."), RESULTS_FOLDER,
            "rq1_" + str(datetime.now().strftime("%Y%m%d-%H%M%S")) + ".csv"))
    ensemble.RandomForestClassifier(max_depth=2, random_state=0),
    'Adaboost': ensemble.AdaBoostClassifier(random_state=0),
    'MultinomialNB': naive_bayes.MultinomialNB(),
    # 'GaussianNB': gnb_predict,
    'BernoulliNB': naive_bayes.BernoulliNB(),
    'KNN': neighbors.KNeighborsClassifier(n_neighbors=10),
    'SVM': svm.SVC(kernel='rbf', gamma=0.7, C=1, probability=True),
    # 'Random':
    # dummy.DummyClassifier(strategy='stratified'),
    'Most Frequent': dummy.DummyClassifier(strategy='most_frequent'),
    'Uniform': dummy.DummyClassifier(strategy='uniform')
}

#----------------------------------------------preprocessing--------------------------------------------------------#
file = pd.read_csv('rnn3.csv')
dim = np.arange(length).astype(str)

# train
label = file[str(length)].values.astype(int)
data = np.asarray(file[dim].values / 101)
X_train, X_test, y_train, y_test = train_test_split(data, label,
import pandas as pd
import os
from sklearn import dummy

dir = 'E:/'
titanic_train = pd.read_csv(os.path.join(dir, 'train.csv'))
print(titanic_train.info())
print(titanic_train.columns)

X_train = titanic_train[['SibSp', 'Parch']]
y_train = titanic_train['Survived']

dummy_estimator = dummy.DummyClassifier(strategy="uniform", random_state=10)
dummy_estimator.fit(X_train, y_train)

titanic_test = pd.read_csv(os.path.join(dir, 'test.csv'))
print(titanic_test.info())
X_test = titanic_test[['SibSp', 'Parch']]
titanic_test['Survived'] = dummy_estimator.predict(X_test)
titanic_test.to_csv(os.path.join(dir, 'submission.csv'),
                    columns=['PassengerId', 'Survived'],
                    index=False)