def deserialize_gradient_boosting(model_dict):
    """Rebuild a fitted ``GradientBoostingClassifier`` from its dict form.

    Args:
        model_dict: Mapping holding the serialized model. The keys read here
            are ``params``, ``estimators_``, ``estimators_shape``,
            ``classes_``, ``train_score_``, ``max_features_``,
            ``n_classes_``, ``n_features_`` and ``loss_``; ``init_`` and
            ``priors`` are optional.

    Returns:
        The classifier with its fitted attributes restored.

    Note:
        ``model_dict`` is not modified, so the same dict can be deserialized
        more than once.
    """
    model = GradientBoostingClassifier(**model_dict['params'])
    estimators = [
        regression.deserialize_decision_tree_regressor(tree)
        for tree in model_dict['estimators_']
    ]
    model.estimators_ = np.array(estimators).reshape(
        model_dict['estimators_shape'])
    if 'init_' in model_dict and model_dict['init_']['meta'] == 'dummy':
        model.init_ = dummy.DummyClassifier()
        # Give the estimator its own state dict. Adopting the caller's dict
        # as ``__dict__`` and popping 'meta' from it would strip the key from
        # ``model_dict`` and break a second deserialization.
        model.init_.__dict__ = {
            key: value
            for key, value in model_dict['init_'].items() if key != 'meta'
        }

    model.classes_ = np.array(model_dict['classes_'])
    model.train_score_ = np.array(model_dict['train_score_'])
    model.max_features_ = model_dict['max_features_']
    model.n_classes_ = model_dict['n_classes_']
    model.n_features_ = model_dict['n_features_']
    # Any other ``loss_`` value leaves ``model.loss_`` unset.
    if model_dict['loss_'] == 'deviance':
        model.loss_ = _gb_losses.BinomialDeviance(model.n_classes_)
    elif model_dict['loss_'] == 'exponential':
        model.loss_ = _gb_losses.ExponentialLoss(model.n_classes_)
    elif model_dict['loss_'] == 'multinomial':
        model.loss_ = _gb_losses.MultinomialDeviance(model.n_classes_)

    if 'priors' in model_dict:
        model.init_.priors = np.array(model_dict['priors'])
    return model
Esempio n. 2
0
def getF1_SAF_allrows(allEntries):
    """Score every algorithm under each threshold set and write the CSV.

    For each named threshold set and each member of ``ALGOS``, the
    algorithm's threshold is looked up and passed to ``getF1_SAF`` together
    with ``allEntries``. The resulting precision/recall/F1 rows are handed
    to ``writeCSV`` under a timestamped ``rq2_1_*.csv`` file name.

    Args:
        allEntries: Entries forwarded unchanged to ``getF1_SAF``; only their
            count is inspected here.
    """
    print(len(allEntries))

    threshold_sets = {
        'St_c_DS':
        getThreshold(THRESHOLD_SETS.HUMANCLONE_QUART3, DB_SETS.GT10_DB_DATA,
                     'all'),
        'St_n_DS':
        getThreshold(THRESHOLD_SETS.HUMANND_MEDIAN, DB_SETS.GT10_DB_DATA,
                     'all'),
        'O_s_DS':
        getThreshold(THRESHOLD_SETS.OPTIMAL, DB_SETS.GT10_DB_DATA, 'all'),
        'O_c_DS':
        getThreshold(THRESHOLD_SETS.OPTIMAL_CLASSIFICATION_CLONE,
                     DB_SETS.GT10_DB_DATA, 'all'),
        'O_n_DS':
        getThreshold(THRESHOLD_SETS.OPTIMAL_CLASSIFICATION_ND,
                     DB_SETS.GT10_DB_DATA, 'all')
    }

    fieldNames = [
        'thresholdSet', 'algoName', 'threshold', 'precision', 'recall', 'f1'
    ]

    print(threshold_sets)

    algoScoreRows = []
    for threshold_set_name, threshold_set in threshold_sets.items():
        for algo in ALGOS:
            # Thresholds are keyed by the upper-cased part of str(algo)
            # that follows the first dot.
            algoStr = str(algo).split('.')[1].upper()
            threshold = threshold_set[algoStr]
            precision, recall, f1 = getF1_SAF(threshold, allEntries, algoStr)
            algoScoreRows.append({
                'thresholdSet': threshold_set_name,
                'algoName': algoStr,
                'threshold': threshold,
                'precision': precision,
                'recall': recall,
                'f1': f1
            })

    writeCSV(fieldNames, algoScoreRows,
             "rq2_1_" + datetime.now().strftime("%Y%m%d-%H%M%S") + ".csv")
Esempio n. 3
0
 def fit_baseline(self, x, y):
     '''
     Fit the baseline for the MetaEstimator: a dummy (constant) estimator
     trained on the output.

     When ``self.method_type`` is None it is set to 'regr' if ``y`` has more
     than ``self.cutoff_categorical`` unique rows, otherwise to 'classif'.
     The fitted dummy estimator is stored in ``self.fitted``; for
     classification its classes are stored in ``self.classes``.
     '''

     # Determine if regression or classification problem
     if self.method_type is None:
         is_above = len(np.unique(y, axis=0)) > self.cutoff_categorical
         self.method_type = 'regr' if is_above else 'classif'

     # Fit a Dummy (constant) estimator
     if self.method_type == 'regr':
         self.fitted = dummy.DummyRegressor().fit(x, y)
     else:
         self.fitted = dummy.DummyClassifier().fit(x, y)
         # Read the classes off the estimator fitted above instead of
         # fitting a second, identical DummyClassifier.
         self.classes = self.fitted.classes_
Esempio n. 4
0
def build_model(model_type, num_targets=1):
    """Create the classifier selected by ``model_type``.

    Args:
        model_type: One of 'gradient_boosting', 'random_forest',
            'dummy_stratified' or 'dummy_most_frequent'.
        num_targets: Number of outputs in the dataset. When greater than 1
            the base classifier is wrapped in a ``MultiOutputClassifier``.

    Returns:
        The unfitted estimator.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    if model_type == 'gradient_boosting':
        base = ensemble.GradientBoostingClassifier(n_estimators=100,
                                                   verbose=True)
    elif model_type == 'random_forest':
        base = ensemble.RandomForestClassifier()
    elif model_type == 'dummy_stratified':
        # ``strategy`` is keyword-only in current scikit-learn releases.
        base = dummy.DummyClassifier(strategy='stratified')
    elif model_type == 'dummy_most_frequent':
        base = dummy.DummyClassifier(strategy='most_frequent')
    else:
        raise ValueError('invalid model type: {}'.format(model_type))

    # multiple outputs in the dataset => fit a separate classifier to each
    if num_targets > 1:
        return multioutput.MultiOutputClassifier(base)
    return base
Esempio n. 5
0
 def build_sklearn(self, model_id, model_params):
     """Instantiate the scikit-learn estimator registered under ``model_id``.

     ``model_params`` is unpacked as keyword arguments into the estimator's
     constructor. An unrecognised ``model_id`` yields ``None``.
     """
     # Each entry defers the attribute lookup until its id is requested.
     class_lookups = {
         'sklearn_LogisticRegressionCV': lambda: linear_model.LogisticRegressionCV,
         'sklearn_LogisticRegression': lambda: linear_model.LogisticRegression,
         'sklearn_MLPClassifier': lambda: neural_network.MLPClassifier,
         'sklearn_GaussianNB': lambda: naive_bayes.GaussianNB,
         'sklearn_MultinomialNB': lambda: naive_bayes.MultinomialNB,
         'sklearn_BernoulliNB': lambda: naive_bayes.BernoulliNB,
         'sklearn_RandomForestClassifier': lambda: ensemble.RandomForestClassifier,
         'sklearn_SVC': lambda: svm.SVC,
         'sklearn_AdaBoostClassifier': lambda: ensemble.AdaBoostClassifier,
         'sklearn_SGDClassifier': lambda: linear_model.SGDClassifier,
         'sklearn_PassiveAggressiveClassifier': lambda: linear_model.PassiveAggressiveClassifier,
         'sklearn_RidgeClassifier': lambda: linear_model.RidgeClassifier,
         'sklearn_DummyClassifier': lambda: dummy.DummyClassifier,
         'sklearn_KNeighborsClassifier': lambda: neighbors.KNeighborsClassifier,
         'sklearn_DecisionTreeClassifier': lambda: tree.DecisionTreeClassifier,
         'sklearn_LinearRegression': lambda: linear_model.LinearRegression,
         'sklearn_LassoCV': lambda: linear_model.LassoCV,
         'sklearn_RidgeCV': lambda: linear_model.RidgeCV,
         'sklearn_Ridge': lambda: linear_model.Ridge,
         'sklearn_DummyRegressor': lambda: dummy.DummyRegressor,
         'sklearn_RandomForestRegressor': lambda: ensemble.RandomForestRegressor,
         'sklearn_GradientBoostingRegressor': lambda: ensemble.GradientBoostingRegressor,
         'sklearn_MLPRegressor': lambda: neural_network.MLPRegressor,
         'sklearn_KNeighborsRegressor': lambda: neighbors.KNeighborsRegressor,
         'sklearn_SVR': lambda: svm.SVR,
         'sklearn_SGDRegressor': lambda: linear_model.SGDRegressor,
         'sklearn_DecisionTreeRegressor': lambda: tree.DecisionTreeRegressor,
     }
     resolve_class = class_lookups.get(model_id)
     if resolve_class is None:
         return None
     estimator_class = resolve_class()
     return estimator_class(**model_params)
Esempio n. 6
0
def train(excel_file, text_column, labels_column, train_test_idxs_file, n_jobs, model_file, n_accepted_probs, output_file):
    """Fit classical classifiers on features from a saved BERT network.

    Reads the text and label columns from ``excel_file``, splits them with
    the indices stored in ``train_test_idxs_file``, loads the network
    weights from ``model_file``, extracts features for both splits, fits and
    pickles each classifier, dumps the test-set class probabilities to
    'predictions.json' and finally calls ``generate_report`` with the run
    metadata, the predictions and ``output_file``.
    """
    execution_info = pd.DataFrame()
    execution_info['Start date'] = [get_local_time_str()]
    torch.manual_seed(RANDOM_STATE)

    # Use the current CUDA device when one is available, else the CPU.
    if torch.cuda.is_available():
        device = torch.device(f'cuda:{torch.cuda.current_device()}')
    else:
        device = torch.device('cpu')
    if device.type == 'cuda':
        device_str = f'{device.type}:{device.index} ({torch.cuda.get_device_name(device.index)})'
    else:
        device_str = device.type
    print(f'Device: {device_str}')

    # Load the corpus; missing cells become the literal string 'NaN'.
    frame = pd.read_excel(excel_file).fillna('NaN')
    corpus = frame[text_column].tolist()
    labels = frame[labels_column].tolist()

    split = load_json(train_test_idxs_file)
    train_idxs, test_idxs = split['train_idxs'], split['test_idxs']
    corpus_train = utils.safe_indexing(corpus, train_idxs)
    corpus_test = utils.safe_indexing(corpus, test_idxs)
    y_train = utils.safe_indexing(labels, train_idxs)
    y_test = utils.safe_indexing(labels, test_idxs)

    train_set = BERTTokenizedDataset(corpus_train, y_train)
    val_set = BERTTokenizedDataset(corpus_test, y_test)
    n_workers = n_jobs - 1
    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, num_workers=n_workers)
    val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, num_workers=n_workers)
    assert train_loader.dataset.classes_ == val_loader.dataset.classes_

    # Restore the trained network, then drop the last child of
    # ``additional_layers`` before using it as a feature extractor.
    n_classes = len(val_loader.dataset.classes_)
    net = BERTNeuralNet(n_classes, freeze_bert=FREEZE_BERT)
    checkpoint = torch.load(model_file, map_location=device)
    net.load_state_dict(checkpoint['model_state_dict'])
    kept_layers = list(net.additional_layers.children())[0:-1]
    net.additional_layers = nn.Sequential(*kept_layers)

    extractor = FeatureExtractor(device, net)
    X_train = extractor.extract_features(train_loader, 'X_train.pkl', 'X_train.dat')
    X_test = extractor.extract_features(val_loader, 'X_test.pkl', 'X_test.dat')

    clfs = [
        ensemble.RandomForestClassifier(n_estimators=100, n_jobs=n_jobs, random_state=RANDOM_STATE),
        LinearSVC(random_state=RANDOM_STATE),
        dummy.DummyClassifier(strategy='stratified', random_state=RANDOM_STATE, constant=None),
        linear_model.SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3, n_jobs=n_jobs, random_state=RANDOM_STATE)
    ]
    predictions = {'y_true': y_test}

    # First pass: fit and persist every classifier under its class name.
    for clf in tqdm(iterable=clfs, desc='Fitting classifiers', unit='clf'):
        clf.fit(X_train, y_train)
        dump_pickle(clf, clf.__class__.__name__ + '.pkl')

    # Second pass: collect per-class probabilities on the test split.
    for clf in tqdm(iterable=clfs, desc='Obtaining probabilities', unit='clf'):
        probabilities = clf.predict_proba(X_test)
        predictions[clf.__class__.__name__] = predict_proba_to_dicts(clf.classes_, probabilities)
    dump_json(predictions, 'predictions.json')

    report_columns = (
        ('End date', get_local_time_str()),
        ('Excel file', excel_file),
        ('Text column', text_column),
        ('Label column', labels_column),
        ('Accepted probabilities', n_accepted_probs),
        ('Device', device_str),
        ('Base model', model_file),
        ('Batch size', BATCH_SIZE),
    )
    for column, value in report_columns:
        execution_info[column] = [value]
    generate_report(execution_info, predictions, output_file)
Esempio n. 7
0
    def train(self):
        """
        Train the family level classifiers

        For every gene family in ``self.card.gene_family_to_aro`` a
        classifier is either loaded from 'models/<family_name>.pkl' or
        trained on the rows of ``self.X`` whose label in ``self.y`` belongs
        to that family, and then saved to that path. The result is stored in
        ``self.family_level_classifiers`` as
        ``{family: [classifier, family_aros]}``.
        """
        print("Training ARO classifiers for each family")
        family_level_classifiers = {}
        for family in tqdm(self.card.gene_family_to_aro.keys()):

            family_name = family.replace(' ', '_').replace('/', '_')
            model_path = 'models/{}.pkl'.format(family_name)
            # get all the aros relevant to the family
            family_aros = self.card.gene_family_to_aro[family]

            # A previously saved model needs none of the training data, so
            # check for it before doing any slicing work.
            if os.path.exists(model_path):
                family_clf = joblib.load(model_path)
                family_level_classifiers[family] = [family_clf, family_aros]
                continue

            # filter input to just the columns containing similarity to aros
            # within the family
            X_train = self.X[family_aros]

            # get the indices where the label is one of the AROs belonging to
            # the current family (set lookup instead of scanning the list
            # once per label)
            family_aro_set = set(family_aros)
            label_indices = [
                ix for ix, x in enumerate(self.y) if x in family_aro_set
            ]

            y_train = np.array(self.y)[label_indices]

            # grab only the reads where the label index is an ARO belonging to the
            # current family being trained
            X_train = X_train.iloc[label_indices]

            # i.e. if family only has a single member
            if len(family_aros) == 1:
                family_clf = dummy.DummyClassifier(strategy='constant',
                                                   constant=family_aros[0])
                family_clf.fit(X_train, y_train)
            else:
                # rebalance using SMOTE
                X_resampled, y_resampled = SMOTE(
                    kind='borderline1').fit_sample(X_train, y_train)

                family_clf = ensemble.RandomForestClassifier()
                family_clf.fit(X_resampled, y_resampled)

            joblib.dump(family_clf, model_path)
            family_level_classifiers[family] = [family_clf, family_aros]

        self.family_level_classifiers = family_level_classifiers
Esempio n. 8
0
    def __init__(self,
                 use_stacked_prob=False,
                 stacked_classifier="decision_tree",
                 estimators_to_remove=None,
                 include_original_input=False):
        """Setup a SuperLearner classifier

        Args:
            use_stacked_prob: Stored as ``self.use_stacked_prob``.
            stacked_classifier: Name of the stacked-layer classifier:
                "decision_tree" (also used for None), "logistic_regression",
                "k_nearest_neighbours", "random_forest" or "most_frequent".
            estimators_to_remove: Optional container of base-estimator names
                to leave out of ``self.estimators``. None removes nothing.
            include_original_input: Stored as
                ``self.include_original_input``.

        Raises:
            ValueError: If ``stacked_classifier`` is not a known name.
        """
        # None stands in for "remove nothing" so that no mutable default
        # object is shared between instances.
        if estimators_to_remove is None:
            estimators_to_remove = ()

        self.decision_tree = tree.DecisionTreeClassifier(criterion="entropy",
                                                         max_depth=7,
                                                         min_samples_split=11)
        self.random_forest = ensemble.RandomForestClassifier(
            n_estimators=500, max_features=4)  #change_max_features
        self.bagging = ensemble.BaggingClassifier(
            base_estimator=tree.DecisionTreeClassifier(criterion="entropy"),
            n_estimators=10)
        self.logistic_model = linear_model.LogisticRegression(
            multi_class='auto')
        self.k_nearest_neighbours = neighbors.KNeighborsClassifier(
            n_neighbors=5)
        self.linear_svc = svm.SVC(kernel="linear", C=1.0, probability=True)

        self.include_original_input = include_original_input
        self.use_stacked_prob = use_stacked_prob

        self.estimators = {
            "decision_tree": self.decision_tree,
            "random_forest": self.random_forest,
            "bagging": self.bagging,
            "logistic_regression": self.logistic_model,
            "k_nearest_neighbours": self.k_nearest_neighbours,
            "linear_svc": self.linear_svc
        }

        # can use any subset of the available estimators
        self.estimators = {
            key: value
            for key, value in self.estimators.items()
            if key not in estimators_to_remove
        }

        # stacked layer classifier
        if stacked_classifier == "decision_tree" or stacked_classifier is None:
            self.Z_classifier = tree.DecisionTreeClassifier(
                criterion="entropy")
        elif stacked_classifier == "logistic_regression":
            self.Z_classifier = linear_model.LogisticRegression()
        elif stacked_classifier == "k_nearest_neighbours":
            self.Z_classifier = neighbors.KNeighborsClassifier(n_neighbors=5)
        elif stacked_classifier == "random_forest":
            self.Z_classifier = ensemble.RandomForestClassifier(
                n_estimators=500)
        elif stacked_classifier == "most_frequent":
            self.Z_classifier = dummy.DummyClassifier(strategy="most_frequent")
        else:
            raise ValueError(
                'Error: Not known classifier for stacked layer classifier, check spelling'
            )
def zr(data_for_algos):
    """Fit a most-frequent-class baseline and score it on a 20% hold-out.

    Each row of ``data_for_algos`` is split into its leading values (the
    features) and its last value (the target).

    Returns:
        ``(model, score)``: the fitted DummyClassifier and the value of its
        ``score`` method on the held-out split.
    """
    baseline_clf = dummy.DummyClassifier(strategy="most_frequent")
    features = np.asarray([row[:-1] for row in data_for_algos])
    targets = np.asarray([row[-1] for row in data_for_algos])
    split = train_test_split(features, targets, test_size=0.2, random_state=0)
    X_train, X_test, y_train, y_test = split
    baseline_clf.fit(X_train, y_train)
    holdout_score = baseline_clf.score(X_test, y_test)
    return baseline_clf, holdout_score
Esempio n. 10
0
def baseline(data):
    """Score four DummyClassifier strategies on one train/test split.

    The data comes from ``data.get_data`` using the default target
    attribute. The split is made with ``random_state=0``.

    Returns:
        Dict mapping each strategy name ('stratified', 'most_frequent',
        'prior', 'uniform') to the classifier's score on the test split.
    """
    X, y, _ = data.get_data(target=data.default_target_attribute,
                            return_attribute_names=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    def _holdout_score(strategy):
        # Fit one dummy classifier and evaluate it on the test split.
        clf = dummy.DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X_train, y_train)
        return clf.score(X_test, y_test)

    return {
        strategy: _holdout_score(strategy)
        for strategy in ('stratified', 'most_frequent', 'prior', 'uniform')
    }
def calculate_classification_metrics(data, features):
    """Train on a shuffled half of the data and report metrics on the rest.

    A most-frequent DummyClassifier is used when ``features`` is empty,
    otherwise a LogisticRegression. The returned dict holds the predicted
    labels, accuracy, AUC (computed from the predicted labels), the
    false-positive / false-negative rates for ``race == 1`` (``*_C``) and
    ``race == 0`` (``*_nC``), and their summed absolute gaps under
    "disparate_mistreatment". Labels are compared against 1 and -1.
    """
    clsfr = (linear_model.LogisticRegression() if len(features) != 0 else
             dummy.DummyClassifier(strategy="most_frequent"))
    shuffle_order = generate_permutation(len(data["class_label"]))

    # prepare data for classification, applying the same shuffle to all three
    X, y, race = generate_X_y_for_feature_set(data, features)
    y = y[shuffle_order]
    X = X[shuffle_order]
    race = race[shuffle_order]

    # first half trains, second half tests
    half = int(len(y) / 2.0)
    X_train, X_test = X[:half], X[half:]
    y_train, y_test = y[:half], y[half:]
    race_test = race[half:]

    clsfr.fit(X_train, y_train)
    predicted_labels = clsfr.predict(X_test)

    def _group_error_rate(predicted_value, race_value, true_value):
        # Share of test rows with the given race and true label that were
        # predicted as ``predicted_value``; the denominator is floored at 1.
        in_group = np.logical_and(race_test == race_value,
                                  y_test == true_value)
        wrong_in_group = np.logical_and(predicted_labels == predicted_value,
                                        in_group)
        return sum(wrong_in_group) / max(sum(in_group), 1)

    results = dict()
    results["predicted_labels"] = predicted_labels
    results["accuracy"] = sum(y_test == predicted_labels) / len(y_test)
    results["auc"] = roc_auc_score(y_test, predicted_labels)

    # disparate mistreatment: false positives / negatives per race group
    results["fp_C"] = _group_error_rate(1, 1, -1)
    results["fn_C"] = _group_error_rate(-1, 1, 1)
    results["fp_nC"] = _group_error_rate(1, 0, -1)
    results["fn_nC"] = _group_error_rate(-1, 0, 1)

    fp_gap = abs(results["fp_C"] - results["fp_nC"])
    fn_gap = abs(results["fn_C"] - results["fn_nC"])
    results["disparate_mistreatment"] = fp_gap + fn_gap

    return results
def train(excel_file, text_column, labels_column, train_test_idxs_file, n_jobs,
          n_accepted_probs, output_file):
    """Fit classical classifiers on features of the preprocessed corpus.

    Reads the text and label columns from ``excel_file``, preprocesses the
    text (dumped to 'preprocessed_corpus_ELMo.json'), splits it with the
    indices stored in ``train_test_idxs_file``, extracts features for both
    splits, fits and pickles each classifier, dumps the test-set class
    probabilities to 'predictions.json' and finally calls
    ``generate_report`` with the run metadata, the predictions and
    ``output_file``.
    """
    execution_info = pd.DataFrame()
    execution_info['Start date'] = [get_local_time_str()]

    # Load the sheet; missing cells become the literal string 'NaN'.
    frame = pd.read_excel(excel_file).fillna('NaN')
    corpus = Preprocessor().preprocess(frame[text_column])
    dump_json(corpus, 'preprocessed_corpus_ELMo.json')
    labels = frame[labels_column].tolist()

    split = load_json(train_test_idxs_file)
    train_idxs, test_idxs = split['train_idxs'], split['test_idxs']
    corpus_train = utils.safe_indexing(corpus, train_idxs)
    corpus_test = utils.safe_indexing(corpus, test_idxs)
    y_train = utils.safe_indexing(labels, train_idxs)
    y_test = utils.safe_indexing(labels, test_idxs)

    extractor = FeatureExtractor()
    X_train = extractor.extract_features(corpus_train, 'X_train_ELMo.pkl',
                                         'X_train_ELMo.dat')
    X_test = extractor.extract_features(corpus_test, 'X_test_ELMo.pkl',
                                        'X_test_ELMo.dat')

    clfs = [
        ensemble.RandomForestClassifier(n_estimators=100,
                                        n_jobs=n_jobs,
                                        random_state=RANDOM_STATE),
        LinearSVC(random_state=RANDOM_STATE),
        dummy.DummyClassifier(strategy='stratified',
                              random_state=RANDOM_STATE,
                              constant=None),
        linear_model.SGDClassifier(loss='modified_huber',
                                   max_iter=1000,
                                   tol=1e-3,
                                   n_jobs=n_jobs,
                                   random_state=RANDOM_STATE)
    ]
    predictions = {'y_true': y_test}

    # First pass: fit and persist every classifier under its class name.
    for clf in tqdm(iterable=clfs, desc='Fitting classifiers', unit='clf'):
        clf.fit(X_train, y_train)
        dump_pickle(clf, clf.__class__.__name__ + '.pkl')

    # Second pass: collect per-class probabilities on the test split.
    for clf in tqdm(iterable=clfs, desc='Obtaining probabilities', unit='clf'):
        probabilities = clf.predict_proba(X_test)
        predictions[clf.__class__.__name__] = predict_proba_to_dicts(
            clf.classes_, probabilities)
    dump_json(predictions, 'predictions.json')

    execution_info['End date'] = [get_local_time_str()]
    # The remaining run parameters are assigned as plain values.
    run_parameters = (
        ('Excel file', excel_file),
        ('Text column', text_column),
        ('Label column', labels_column),
        ('n_jobs', n_jobs),
        ('Accepted probabilities', n_accepted_probs),
    )
    for column, value in run_parameters:
        execution_info[column] = value
    generate_report(execution_info, predictions, output_file)
Esempio n. 13
0
def testZeroHour(STOCK, future_day, data_for_algos, data_to_predict_for_algos, test_classes):
    """Evaluate a most-frequent-class ("ZR") baseline and report it.

    Each row of ``data_for_algos`` is split into its leading values (the
    features) and its last value (the target). The fitted model predicts
    ``data_to_predict_for_algos`` and the predictions are scored against
    ``test_classes`` with ``get_test_score``.

    Returns:
        The value produced by ``result_in_csv`` (also printed). If anything
        in the fit/predict/score path fails, the row is reported with
        ``Our_test_score=-1`` instead.
    """
    try:
        model = dummy.DummyClassifier(strategy="most_frequent")
        X = np.asarray([row[:-1] for row in data_for_algos])
        y = np.asarray([row[-1] for row in data_for_algos])
        model.fit(X, y)
        predictions = model.predict(data_to_predict_for_algos)
        our_test_score = get_test_score(predictions, test_classes)
        result = result_in_csv(STOCK, 'ZR', Future_day=future_day, Our_test_score=our_test_score)
    except Exception:
        # Best effort: report a sentinel score instead of aborting the run,
        # while still letting KeyboardInterrupt / SystemExit propagate.
        result = result_in_csv(STOCK, 'ZR', Future_day=future_day, Our_test_score=-1)
    print(result)
    return result
Esempio n. 14
0
def classification(data_matrix,
                   target_matrix,
                   test_matrix,
                   strategy='most_frequent'):
    """Fit a DummyClassifier and print its predictions for ``test_matrix``.

    Progress messages with the current time are printed before and after
    model creation and fitting. Nothing is returned.

    Args:
        data_matrix: Training samples passed to ``fit``.
        target_matrix: Training targets passed to ``fit``.
        test_matrix: Samples passed to ``predict``.
        strategy: DummyClassifier strategy name.
    """
    # print() calls replace the Python 2 print statements, which are a
    # SyntaxError under Python 3.
    print("data detected", datetime.now().time())
    model = dummy.DummyClassifier(strategy=strategy,
                                  random_state=None,
                                  constant=None)
    print("model made", datetime.now().time())
    model.fit(data_matrix, target_matrix)
    print("model fitted", datetime.now().time())
    results = model.predict(test_matrix)
    print(results)
Esempio n. 15
0
    def fit(self, x, y):
        '''
        Fit method for the MetaEstimator. Output is a fitted estimator, that can then be used
        for prediction.

        The fitted estimator is stored in ``self.fitted`` and ``self`` is
        returned. For classification problems the classes seen in training
        are stored in ``self.classes``.
        '''

        # Determine if regression or classification problem, by comparing number of
        # unique values in output against threshold
        if self.method_type is None:
            is_above = len(np.unique(y, axis=0)) > self.cutoff_categorical
            self.method_type = ('classif','regr')[is_above]

        # Fetch the appropriate list of estimators
        if self.estimators is None:
            if self.method is not None:
                self.get_estim(y)
            else:
                if self.method_type == 'regr':
                    self.estimators = linear_model.LassoCV(normalize=True)
                elif self.method_type == 'classif':
                    self.estimators = ensemble.RandomForestClassifier(random_state=1)
        else:
            # User-supplied estimators: index 0 is used for regression,
            # index 1 for classification.
            if self.method_type == 'regr':
                self.estimators = self.estimators[0]
            elif self.method_type == 'classif':
                self.estimators = self.estimators[1]

        # Collect information on classes in training set (needed later)
        if self.method_type == 'classif':
            self.classes = dummy.DummyClassifier().fit(x, y).classes_

        # Fit according to respective ensembling method
        if self.method == 'stacking':
            if self.method_type == 'regr':
                self.fitted = regressor.StackingRegressor(regressors=self.estimators,
                                            meta_regressor=linear_model.LinearRegression()).fit(x, y)

            elif self.method_type == 'classif':
                self.fitted = classifier.StackingClassifier(classifiers=self.estimators,
                            meta_classifier=linear_model.LogisticRegression(random_state = 1)).fit(x, y)

        elif self.method == 'multiplexing':
            # For multiplexing, cross validation scores determine which estimator is chosen.
            # cross_val_score returns scores where larger is better, so the
            # estimator with the highest mean score is selected. Only the
            # scores computed in this call are compared, so entries already
            # present in self.losses cannot shift the index.
            cv_means = [np.mean(cross_val_score(i, x, y)) for i in self.estimators]
            self.losses.extend(cv_means)
            self.fitted = self.estimators[int(np.argmax(cv_means))].fit(x, y)

        else:
            self.fitted = self.estimators.fit(x, y)

        return self
Esempio n. 16
0
 def zero_cost_model(self,X,y,add_to_model=False):
     """Fit the feature-free, zero-cost dummy model.

     A prior-strategy DummyClassifier or mean-strategy DummyRegressor is
     chosen from ``self.base_model._estimator_type`` and fitted on
     ``self.selectfeats(X, [])`` and ``y``.

     Args:
         X: Training samples, passed through ``self.selectfeats``.
         y: Training targets.
         add_to_model: When true, the model, its cost (0) and its empty
             feature list are inserted at position 0 of ``self.models``,
             ``self.model_costs`` and ``self.model_features``.

     Returns:
         Tuple ``(model, cost, features)``.

     Raises:
         TypeError: If the base model is neither classifier nor regressor.
     """
     estimator_type = self.base_model._estimator_type
     # ``strategy`` is keyword-only in current scikit-learn releases.
     if estimator_type == 'classifier':
         model = dummy.DummyClassifier(strategy="prior")
     elif estimator_type == 'regressor':
         model = dummy.DummyRegressor(strategy="mean")
     else:
         raise TypeError("sklearn Classifier or Regressor required!")
     cost = 0
     features = []
     model.fit(self.selectfeats(X,features),y)
     if add_to_model:
         self.model_costs.insert(0,cost)
         self.model_features.insert(0,features)
         self.models.insert(0,model)
     return (model, cost, features)
def run_cv(data, features, labels, folds):
    """Cross-validate several classifiers and print the final predictions.

    Each classifier's cross-validated accuracy is written to stderr. The
    logistic-regression predictions are then printed to stdout, one line
    per sample, next to ``data['docno'][i]`` and ``data['query'][i]``.
    """

    def _report(title, estimator):
        # Run cross-validated prediction, log the accuracy, return the
        # predictions.
        fold_predictions = cross_val_predict(estimator,
                                             features,
                                             labels,
                                             cv=folds,
                                             n_jobs=-1)
        print(title, accuracy_score(labels, fold_predictions), file=stderr)
        return fold_predictions

    _report('Cross-validated baseline most frequent:',
            dummy.DummyClassifier(strategy='most_frequent'))
    _report('Cross-validated naive Bayes:', naive_bayes.GaussianNB())
    _report('Cross-validated KNN (uniform):',
            neighbors.KNeighborsClassifier())
    _report('Cross-validated KNN (distance):',
            neighbors.KNeighborsClassifier(weights='distance'))
    predictions = _report('Cross-validated logistic regression:',
                          linear_model.LogisticRegression())

    for i, prediction in enumerate(predictions):
        print(data['docno'][i], data['query'][i], prediction)
Esempio n. 18
0
    def __pred_randomly(self, X_train, y_train, X_test):
        """Predict with a dummy estimator that ignores the real features.

        Every sample is replaced by the single constant feature ``[0]``.
        A DummyClassifier with ``self.dummy_strategy`` is used when
        ``self.dataset.type == 'c'``, otherwise a DummyRegressor.
        """
        if self.dataset.type == 'c':
            estimator = dummy.DummyClassifier(strategy=self.dummy_strategy)
        else:
            estimator = dummy.DummyRegressor()

        # One placeholder row per original sample keeps the lengths aligned.
        placeholder_train = [[0] for _ in X_train]
        placeholder_test = [[0] for _ in X_test]

        estimator.fit(placeholder_train, y_train)
        predicted = estimator.predict(placeholder_test)
        return predicted
Esempio n. 19
0
def testZeroHour(STOCK, future_day, data_for_algos):
    """Score a most-frequent-class ("ZR") baseline on the last rows.

    Each row of ``data_for_algos`` is split into its leading values (the
    features) and its last value (the target). The model is fitted on all
    but the last ``future_day`` rows and scored on those last rows.

    Returns:
        The value produced by ``result_in_csv``. If fitting fails, the row
        is reported with ``Our_test_score=-1``.
    """
    test_size = future_day
    X = np.asarray([row[:-1] for row in data_for_algos])
    y = np.asarray([row[-1] for row in data_for_algos])
    try:
        model = dummy.DummyClassifier(strategy="most_frequent")
        model.fit(X[:-test_size], y[:-test_size])
    except Exception:
        # Best effort: report a sentinel score instead of aborting, while
        # still letting KeyboardInterrupt / SystemExit propagate.
        return result_in_csv(STOCK,
                             'ZR',
                             Future_day=future_day,
                             Our_test_score=-1)
    our_score = model.score(X[-test_size:], y[-test_size:])
    return result_in_csv(STOCK,
                         'ZR',
                         Future_day=future_day,
                         Our_test_score=our_score)
def zr(data, future_day):
    """Evaluate a most-frequent-class baseline over successive splits.

    Each row of ``data`` is split into its leading values (the features)
    and its last value (the target). The splits come from
    ``k_splits.get_max_k_splits`` with ``k=10`` and
    ``size_of_each_split=future_day``.

    Returns:
        ``(model, mean_score, predict_score)``: the model as fitted on the
        last split, the mean of all scores except the last (-1 when that
        mean is NaN), and the score of the last split (-1 when there were
        no splits).
    """
    X = np.asarray([row[:-1] for row in data])
    y = np.asarray([row[-1] for row in data])

    train_indices, test_indices = k_splits.get_max_k_splits(
        X, k=10, size_of_each_split=future_day)
    model = dummy.DummyClassifier(strategy="most_frequent")

    scores = []
    for train_index, test_index in zip(train_indices, test_indices):
        X_train, y_train, X_test, y_test = k_splits.get_train_test_set(
            X, y, train_index, test_index)
        model.fit(X_train, y_train)
        scores.append(score.get_score(model, X_test, y_test))

    # The last split's score is reported separately and kept out of the mean.
    predict_score = scores[-1] if scores else -1
    mean_score = np.mean(scores[:-1])
    if np.isnan(mean_score):
        mean_score = -1
    return model, mean_score, predict_score
def testZeroHour(STOCK, future_day, data_for_algos, data_to_predict_for_algos,
                 test_classes):
    try:
        model = dummy.DummyClassifier(strategy="most_frequent")
        X = np.asarray(list(map(lambda row: row[:-1], data_for_algos)))
        y = np.asarray(list(map(lambda row: row[-1], data_for_algos)))
        model.fit(X, y)
        predictions = model.predict(data_to_predict_for_algos)
        our_test_score = collections.Counter(predictions[0:future_day] *
                                             test_classes[0:future_day]).get(1)
        our_test_score = 0 if our_test_score is None else our_test_score

        result = f"{STOCK},ZR,0,0,0,0,{future_day},{our_test_score}\n"

    except:
        result = f"{STOCK},ZR,0,0,0,0,{future_day},error\n"
    print(result)
    return result
Esempio n. 22
0
def run_baseline(config, datasets):
    """Score a most-frequent-label baseline on the test split.

    The classifier is fitted on ``datasets.train_vectors`` /
    ``datasets.train_labels`` and evaluated on ``datasets.test_vectors``
    against ``datasets.test_labels``; the overall accuracy is printed.

    Returns:
        ``(test_acc, test_acc_by_tag, test_acc_by_freqbin)``. The per-tag
        breakdown is computed only when ``config.test_tags`` is truthy and the
        per-frequency-bin breakdown only when ``config.freq_dict`` is truthy;
        otherwise the respective value is an empty dict.
    """
    baseline = dummy.DummyClassifier(strategy='most_frequent')
    baseline.fit(datasets.train_vectors, datasets.train_labels)

    preds = baseline.predict(datasets.test_vectors)
    test_acc = accuracy_score(datasets.test_labels, preds)
    print("baseline(most_freq) test_acc: %.4f" % (test_acc))

    test_acc_by_tag = (
        test_breakdown_by_tag(datasets.test_labels, preds,
                              datasets.test_words, '__')
        if config.test_tags else {}
    )
    test_acc_by_freqbin = (
        test_breakdown_by_freqbin(datasets.test_labels, preds,
                                  datasets.test_words, '__',
                                  datasets.freqbin_dict)
        if config.freq_dict else {}
    )

    return test_acc, test_acc_by_tag, test_acc_by_freqbin
Esempio n. 23
0
    def __create_models(self) -> list[tuple[Any, ModelType]]:
        """Instantiate the estimators selected by ``self.config.model_type``.

        ``ModelType.ALL`` selects every estimator; any other value selects
        only the matching one. Estimators that accept it are given
        ``self.config.n_jobs``. ``self.dataset_train.n_labels`` is read only
        when the LightGBM model is selected.

        Returns:
            ``(estimator, ModelType)`` pairs in the fixed order DUMMY,
            RANDOM_FOREST, EXTRA_TREES, LGBM, SVM, KNN.
        """
        n_jobs = self.config.n_jobs
        requested = self.config.model_type

        def wanted(kind):
            # A model is built when explicitly requested or when ALL is set.
            return requested in (ModelType.ALL, kind)

        models = []

        if wanted(ModelType.DUMMY):
            baseline = dummy.DummyClassifier(strategy="stratified")
            models.append((baseline, ModelType.DUMMY))

        if wanted(ModelType.RANDOM_FOREST):
            forest = ensemble.RandomForestClassifier(n_jobs=n_jobs)
            models.append((forest, ModelType.RANDOM_FOREST))

        if wanted(ModelType.EXTRA_TREES):
            extra_trees = ensemble.ExtraTreesClassifier(n_jobs=n_jobs)
            models.append((extra_trees, ModelType.EXTRA_TREES))

        if wanted(ModelType.LGBM):
            n_labels = self.dataset_train.n_labels
            booster = lgb.LGBMClassifier(objective="multiclass",
                                         num_class=n_labels,
                                         n_jobs=n_jobs)
            models.append((booster, ModelType.LGBM))

        if wanted(ModelType.SVM):
            models.append((svm.SVC(), ModelType.SVM))

        if wanted(ModelType.KNN):
            knn = neighbors.KNeighborsClassifier(n_jobs=n_jobs)
            models.append((knn, ModelType.KNN))

        return models
Esempio n. 24
0
def test_gridsearch():
    """Grid-search candidate models per minimum rating count and print scores.

    For each minimum rating count the data is loaded with ``getData``, read
    through the ``strategyGameDoctor`` helpers, and a 5-fold accuracy
    ``GridSearchCV`` is run on the doctor's pipeline. The mean, standard
    deviation and rank of the test scores are printed per search, and the
    collected ``(minRatingCount, mean_test_scores)`` pairs are printed at the
    end.
    """
    doctor = strategyGame.strategyGameDoctor()
    # Full sweep kept for reference; only the narrowed list below is used.
    full_sweep = [10, 30, 50, 80, 100, 150, 200, 500, 1000, 2000, 3000]
    rating_counts = [100]
    report_keys = ("mean_test_score", "std_test_score", "rank_test_score")

    scores = []
    for minRatingCount in rating_counts:
        Xdata, ydata = getData(minRatingCount)
        X = doctor.readXdata(Xdata)
        y = doctor.readydata(ydata)
        pipe = doctor.getPipe()

        # Candidate estimators; only the forest and the dummy baseline are
        # currently placed in the grid.
        ordinal_logit = ordinalClassifier.OrdinalClassifier(
            linear_model.LogisticRegression())
        shallow_tree = tree.DecisionTreeClassifier(max_depth=5)
        forest = ensemble.RandomForestClassifier(max_depth=10)
        baseline = dummy.DummyClassifier(strategy="most_frequent")

        # Candidate transformers; only the pass-through is in the grid.
        pca_step = pipeline.Pipeline([("pac", decomposition.PCA())])
        passthrough = "passthrough"

        paramGrid = {
            "tranformer": [passthrough],
            "model": [forest, baseline],
        }
        gscv = model_selection.GridSearchCV(
            pipe, paramGrid, cv=5, scoring="accuracy")
        gscv.fit(X, y)

        print("\nGrid Search Report")
        for key in report_keys:
            rounded = [round(x, 2) for x in gscv.cv_results_[key]]
            print(f"{key}:{rounded}")

        mean_scores = list(gscv.cv_results_["mean_test_score"])
        scores.append((minRatingCount, mean_scores))
    print(scores)
Esempio n. 25
0
        ensemble.RandomForestClassifier(max_depth=2, random_state=0),
        'Adaboost':
        ensemble.AdaBoostClassifier(random_state=0),
        'MultinomialNB':
        naive_bayes.MultinomialNB(),
        # 'GaussianNB': gnb_predict,
        'BernoulliNB':
        naive_bayes.BernoulliNB(),
        'KNN':
        neighbors.KNeighborsClassifier(n_neighbors=10),
        'SVM':
        svm.SVC(kernel='rbf', gamma=0.7, C=1, probability=True),
        # 'Random':
        #     dummy.DummyClassifier(strategy='stratified'),
        'Most Frequent':
        dummy.DummyClassifier(strategy='most_frequent')
    }

    #----------------------------------------------preprocessing--------------------------------------------------------#

    file = pd.read_csv('rnn3.csv')
    dim = np.arange(length).astype(str)

    #train
    label = file[str(length)].values.astype(int)
    data = np.asarray(file[dim].values / 101)

    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        label,
                                                        test_size=0.2,
                                                        random_state=294967295)
import pandas as pd
import os
from sklearn import dummy

# Baseline Titanic submission: 'Survived' is predicted by a stratified dummy
# classifier fitted on the 'SibSp' and 'Parch' columns of the training file.
dir = 'E:/'
train_path = os.path.join(dir, 'train.csv')
test_path = os.path.join(dir, 'test.csv')
submission_path = os.path.join(dir, 'submission.csv')
feature_columns = ['SibSp', 'Parch']

titanic_train = pd.read_csv(train_path)
print(titanic_train.info())
print(titanic_train.columns)

# Class balance of the target; the result is neither stored nor printed.
titanic_train.groupby('Survived').size()

X_train = titanic_train[feature_columns]
y_train = titanic_train['Survived']
dummy_estimator = dummy.DummyClassifier(strategy="stratified",
                                        random_state=10)
dummy_estimator.fit(X_train, y_train)

titanic_test = pd.read_csv(test_path)
print(titanic_test.info())
X_test = titanic_test[feature_columns]
titanic_test['Survived'] = dummy_estimator.predict(X_test)

# Balance of the predicted classes; again only evaluated, not reported.
titanic_test.groupby('Survived').size()
titanic_test.to_csv(submission_path,
                    columns=['PassengerId', 'Survived'],
                    index=False)
Esempio n. 27
0
def testDummyClassifier():
    """Score a stratified dummy classifier on two databases and save a CSV.

    The classifier is fitted once on a fixed ten-sample, three-class toy set.
    For each database the human-classified entries are fetched, the label is
    read from the column that follows the per-algorithm columns (index
    ``4 + number of ALGOS``), and macro-averaged precision / recall / F1 of
    the dummy predictions are computed. Both result rows are printed and then
    written to ``rq1_dummy.csv``.
    """
    dummyClassifier = dummy.DummyClassifier(strategy="stratified")
    dummyClassifier.fit([[0]] * 10, [0, 1, 2, 0, 1, 2, 0, 0, 1, 2])

    fieldNames = [
        'thresholdSet', 'algoName', 'c-thre', 'n-thre', 'precision', 'recall',
        'f1'
    ]

    def score_database(db_path):
        # Fetch the labelled entries of one database and score the dummy.
        connectToDB(db_path)
        entries = fetchAllNearDuplicates("where human_classification>=0")
        closeDBConnection()

        # The label sits right after one column per algorithm, starting at 4.
        label_index = 4 + sum(1 for _ in ALGOS)
        y_actual = [entry[label_index] for entry in entries]

        # The labels are passed as X; only one prediction per label is needed.
        y_pred = dummyClassifier.predict(y_actual)
        precision, recall, f1, _ = metrics.precision_recall_fscore_support(
            y_actual, y_pred, average="macro")
        return {
            'thresholdSet': None,
            'algoName': "dummy",
            'c-thre': None,
            'n-thre': None,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

    row1 = score_database("/Test/gt10/gt10_last500Responses.db")
    print(row1)

    row2 = score_database("/comparator/src/main/resources/GoldStandards/SS.db")
    print(row2)

    writeCSV(fieldNames, [row1, row2], "rq1_dummy.csv")
Esempio n. 28
0
def getF1_Classifier(allEntries):
    """Score every algorithm in ``ALGOS`` as a three-way classifier.

    For each threshold set ("statistical", then "optimal") the per-algorithm
    value of every entry (columns ``4 .. 4 + len(ALGOS) - 1``) is mapped to
    class 0, 1 or 2 using that set's clone and near-duplicate thresholds, and
    macro-averaged precision / recall / F1 are computed against the label
    stored in the column right after the algorithm values. After each
    threshold set a stratified ``DummyClassifier`` baseline row is appended.
    All rows are written to a timestamped ``rq1_<YYYYmmdd-HHMMSS>.csv`` under
    ``RESULTS_FOLDER`` (relative to the parent of the working directory).

    Predictions and labels are rebuilt for every threshold set. Previously
    they were accumulated across sets, so the metrics reported for the second
    set were computed over the first set's predictions as well.

    Args:
        allEntries: Sized, iterable collection of indexable rows laid out as
            described above.
    """
    # Resolve each algorithm's display name once, in iteration order.
    algos = [(algo, str(algo).split('.')[1].upper()) for algo in ALGOS]
    first_value_index = 4
    label_index = first_value_index + len(algos)

    print(len(allEntries))

    threshold_sets = {
        "statistical": [
            getThreshold(THRESHOLD_SETS.HUMANCLONE_QUART3,
                         DB_SETS.GT10_DB_DATA, 'all'),
            getThreshold(THRESHOLD_SETS.HUMANND_MEDIAN, DB_SETS.GT10_DB_DATA,
                         'all'),
        ],
        "optimal": [
            getThreshold(THRESHOLD_SETS.OPTIMAL_CLASSIFICATION_CLONE,
                         DB_SETS.GT10_DB_DATA, 'all'),
            getThreshold(THRESHOLD_SETS.OPTIMAL_CLASSIFICATION_ND,
                         DB_SETS.GT10_DB_DATA, 'all'),
        ],
    }

    algoScoreRows = []
    fieldNames = [
        'thresholdSet', 'algoName', 'c-thre', 'n-thre', 'precision', 'recall',
        'f1'
    ]

    dummyClassifier = dummy.DummyClassifier(strategy="stratified")

    print(threshold_sets)
    for threshold_set_name, threshold_set in threshold_sets.items():
        cloneThresholds, ndThresholds = threshold_set

        # Fresh labels per threshold set (see docstring).
        y_actual = [entry[label_index] for entry in allEntries]

        for offset, (algo, algoStr) in enumerate(algos):
            clone_threshold = cloneThresholds[algoStr]
            nd_threshold = ndThresholds[algoStr]
            lower_is_closer = algo.value[2] == "lt"
            value_index = first_value_index + offset

            # Fresh predictions per threshold set and algorithm.
            y_pred = [
                _classify_by_thresholds(float(entry[value_index]),
                                        clone_threshold, nd_threshold,
                                        lower_is_closer)
                for entry in allEntries
            ]
            precision, recall, f1, _ = metrics.precision_recall_fscore_support(
                y_actual, y_pred, average="macro")
            algoScoreRows.append({
                'thresholdSet': threshold_set_name,
                'algoName': algoStr,
                'c-thre': clone_threshold,
                'n-thre': nd_threshold,
                'precision': precision,
                'recall': recall,
                'f1': f1
            })

        # Baseline: the dummy ignores feature values, so constant features
        # are used for fitting and the labels double as X for predict.
        X = [[0]] * len(y_actual)
        dummyClassifier.fit(X, y_actual)
        y_pred_dummy = dummyClassifier.predict(y_actual)
        precision, recall, f1, _ = metrics.precision_recall_fscore_support(
            y_actual, y_pred_dummy, average="macro")
        algoScoreRows.append({
            'thresholdSet': None,
            'algoName': "dummy",
            'c-thre': None,
            'n-thre': None,
            'precision': precision,
            'recall': recall,
            'f1': f1
        })

    writeCSV(
        fieldNames, algoScoreRows,
        os.path.join(
            os.path.abspath(".."), RESULTS_FOLDER,
            "rq1_" + datetime.now().strftime("%Y%m%d-%H%M%S") + ".csv"))


def _classify_by_thresholds(value, clone_threshold, nd_threshold,
                            lower_is_closer):
    """Map a score to class 0, 1 or 2 using two thresholds.

    With ``lower_is_closer`` the score is class 0 when ``<= clone_threshold``,
    class 1 when above it but ``<= nd_threshold``, otherwise class 2. Without
    it the comparisons are mirrored (``>=`` / ``<``). Returns ``-1`` when the
    score compares neither way against ``clone_threshold`` (e.g. NaN).
    Presumably 0 / 1 / 2 stand for clone / near-duplicate / distinct - confirm
    against the labelling scheme.
    """
    if lower_is_closer:
        if value <= clone_threshold:
            return 0
        if value > clone_threshold:
            return 1 if value <= nd_threshold else 2
    else:
        if value >= clone_threshold:
            return 0
        if value < clone_threshold:
            return 1 if value >= nd_threshold else 2
    return -1
Esempio n. 29
0
        ensemble.RandomForestClassifier(max_depth=2, random_state=0),
        'Adaboost':
        ensemble.AdaBoostClassifier(random_state=0),
        'MultinomialNB':
        naive_bayes.MultinomialNB(),
        # 'GaussianNB': gnb_predict,
        'BernoulliNB':
        naive_bayes.BernoulliNB(),
        'KNN':
        neighbors.KNeighborsClassifier(n_neighbors=10),
        'SVM':
        svm.SVC(kernel='rbf', gamma=0.7, C=1, probability=True),
        # 'Random':
        #     dummy.DummyClassifier(strategy='stratified'),
        'Most Frequent':
        dummy.DummyClassifier(strategy='most_frequent'),
        'Uniform':
        dummy.DummyClassifier(strategy='uniform')
    }

    #----------------------------------------------preprocessing--------------------------------------------------------#

    file = pd.read_csv('rnn3.csv')
    dim = np.arange(length).astype(str)

    #train
    label = file[str(length)].values.astype(int)
    data = np.asarray(file[dim].values / 101)

    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        label,
import pandas as pd
import os
from sklearn import dummy

# Baseline Titanic submission: 'Survived' is predicted by a uniform-random
# dummy classifier fitted on the 'SibSp' and 'Parch' columns.
dir = 'E:/'
train_path = os.path.join(dir, 'train.csv')
test_path = os.path.join(dir, 'test.csv')
submission_path = os.path.join(dir, 'submission.csv')
feature_columns = ['SibSp', 'Parch']

titanic_train = pd.read_csv(train_path)
print(titanic_train.info())
print(titanic_train.columns)

X_train = titanic_train[feature_columns]
y_train = titanic_train['Survived']
dummy_estimator = dummy.DummyClassifier(strategy="uniform", random_state=10)
dummy_estimator.fit(X_train, y_train)

titanic_test = pd.read_csv(test_path)
print(titanic_test.info())
X_test = titanic_test[feature_columns]
titanic_test['Survived'] = dummy_estimator.predict(X_test)
titanic_test.to_csv(submission_path,
                    columns=['PassengerId', 'Survived'],
                    index=False)