Example 1
    def __init__(
        self,
        path2dataset,
        result_filename,
        neighbor_suffix=None,
        expression_suffix=None,
        showHyperparameters=False,
    ):
        self.path2dataset = Path(path2dataset)
        self.result_filename = self.path2dataset / 'results' / result_filename
        print(f'Result file = {self.result_filename}')

        with openH5File(self.result_filename, 'r') as f:
            for k in f['hyperparameters']:
                v = f[f'hyperparameters/{k}']
                if isinstance(v, h5py.Group): continue
                v = v[()]
                if k == 'repli_list':
                    v = np.array([name.decode('utf-8') for name in v])
                setattr(self, k, v)
                if showHyperparameters:
                    print(f'{k} \t= {v}')
        self.num_repli = len(self.repli_list)
        self.use_spatial = [True] * self.num_repli
        loadDataset(self,
                    neighbor_suffix=neighbor_suffix,
                    expression_suffix=expression_suffix)
        self.columns_latent_states = np.array(
            [f'latent state {i}' for i in range(self.K)])
        self.columns_exprs = np.array([f'expr {gene}' for gene in self.genes[0]])
        self.data = pd.DataFrame(index=range(sum(self.Ns)))
        self.data[['coor X', 'coor Y']] = np.concatenate([
            loadExpression(
                self.path2dataset / 'files' / f'coordinates_{repli}.txt')
            for repli in self.repli_list
        ], axis=0)
        self.data['cell type'] = np.concatenate([
            np.loadtxt(self.path2dataset / 'files' / f'celltypes_{repli}.txt',
                       dtype=str)
            for repli in self.repli_list
        ], axis=0)
        self.data['repli'] = [
            repli for repli, N in zip(self.repli_list, self.Ns)
            for _ in range(N)
        ]
        self.data[self.columns_exprs] = np.concatenate(self.YTs, axis=0)
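        # Per-replicate scaling: (G / GG) * K divided by the mean per-cell
        # total expression; GG appears to be the global gene count.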
        self.scaling = [
            G / self.GG * self.K / YT.sum(1).mean()
            for YT, G in zip(self.YTs, self.Gs)
        ]
        self.colors = {}
        self.orders = {}

        self.metagene_order = np.arange(self.K)
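# A hypothetical usage sketch (the class name `Result` and the file layout are
# assumptions; only the __init__ above is shown):
#
#     analysis = Result('path/to/dataset', 'result.h5',
#                       showHyperparameters=True)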
Example 2
    def __init__(
        self,
        path2dataset,
        repli_list,
        use_spatial,
        neighbor_suffix,
        expression_suffix,
        K,
        lambda_SigmaXInv,
        betas,
        prior_x_modes,
        result_filename=None,
        PyTorch_device='cpu',
        num_processes=1,
    ):
        self.PyTorch_device = PyTorch_device
        self.num_processes = num_processes

        self.path2dataset = Path(path2dataset)
        self.repli_list = repli_list
        self.use_spatial = use_spatial
        self.num_repli = len(self.repli_list)
        assert len(self.repli_list) == len(self.use_spatial)
        loadDataset(self,
                    neighbor_suffix=neighbor_suffix,
                    expression_suffix=expression_suffix)

        self.K = K
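        # Normalize each expression matrix by the same per-replicate factor:
        # (G / GG) * K divided by its mean per-cell total expression.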
        self.YTs = [
            G / self.GG * self.K * YT / YT.sum(1).mean()
            for YT, G in zip(self.YTs, self.Gs)
        ]
        self.lambda_SigmaXInv = lambda_SigmaXInv
        self.betas = betas
        self.prior_x_modes = prior_x_modes
        self.M_constraint = 'sum2one'
        self.X_constraint = 'none'
        self.dropout_mode = 'raw'
        self.sigma_yx_inv_mode = 'average'
        self.pairwise_potential_mode = 'normalized'

        if result_filename is not None:
            os.makedirs(self.path2dataset / 'results', exist_ok=True)
            self.result_filename = self.path2dataset / 'results' / result_filename
            logging.info(
                f'{print_datetime()}result file = {self.result_filename}')
        else:
            self.result_filename = None
        self.saveHyperparameters()
Example 3
def loadModelForDataset(model_class,
                        dataset_string,
                        experiment_folder_name=None):

    log_file = sys.stdout if experiment_folder_name is None else open(
        f'{experiment_folder_name}/log_training.txt', 'w')

    if model_class not in {'lr', 'mlp', 'tree', 'forest'}:
        raise Exception(f'{model_class} not supported.')

    if dataset_string not in {
            'random', 'mortgage', 'twomoon', 'german', 'credit', 'compass',
            'adult'
    }:
        raise Exception(f'{dataset_string} not supported.')

    dataset_obj = loadData.loadDataset(dataset_string,
                                       return_one_hot=True,
                                       load_from_cache=True,
                                       debug_flag=False)
    X_train, X_test, y_train, y_test = dataset_obj.getTrainTestSplit()
    feature_names = dataset_obj.getInputAttributeNames(
        'kurz')  # easier to read (nothing to do with one-hot vs non-one-hot!)

    if model_class == 'tree':
        model_pretrain = DecisionTreeClassifier()
    elif model_class == 'forest':
        model_pretrain = RandomForestClassifier()
    elif model_class == 'lr':
        # IMPORTANT: The default solver changed from ‘liblinear’ to ‘lbfgs’ in 0.22;
        #            therefore, results may differ slightly from paper.
        model_pretrain = LogisticRegression()  # default penalty='l2', i.e., ridge
    elif model_class == 'mlp':
        model_pretrain = MLPClassifier(hidden_layer_sizes=(10, 10))

    print(
        f'[INFO] Training `{model_class}` on {X_train.shape[0]:,} samples ' +
        f'(%{100 * X_train.shape[0] / (X_train.shape[0] + X_test.shape[0]):.2f} '
        + f'of {X_train.shape[0] + X_test.shape[0]:,} samples)...',
        file=log_file,
    )
    model_trained = model_pretrain.fit(X_train, y_train)
    print(
        f'\tTraining accuracy: %{accuracy_score(y_train, model_trained.predict(X_train)) * 100:.2f}',
        file=log_file)
    print(
        f'\tTesting accuracy: %{accuracy_score(y_test, model_trained.predict(X_test)) * 100:.2f}',
        file=log_file)
    print('[INFO] done.\n', file=log_file)

    if model_class == 'tree':
        if SIMPLIFY_TREES:
            print('[INFO] Simplifying decision tree...', end='', file=log_file)
            model_trained.tree_ = treeUtils.simplifyDecisionTree(
                model_trained, False)
            print('\tdone.', file=log_file)
        treeUtils.saveTreeVisualization(model_trained, model_class, '', X_test,
                                        feature_names, experiment_folder_name)
    elif model_class == 'forest':
        for tree_idx in range(len(model_trained.estimators_)):
            if SIMPLIFY_TREES:
                print(
                    f'[INFO] Simplifying decision tree (#{tree_idx + 1}/{len(model_trained.estimators_)})...',
                    end='',
                    file=log_file)
                model_trained.estimators_[
                    tree_idx].tree_ = treeUtils.simplifyDecisionTree(
                        model_trained.estimators_[tree_idx], False)
                print('\tdone.', file=log_file)
            treeUtils.saveTreeVisualization(
                model_trained.estimators_[tree_idx], model_class,
                f'tree{tree_idx}', X_test, feature_names,
                experiment_folder_name)

    if experiment_folder_name:
        with open(f'{experiment_folder_name}/_model_trained', 'wb') as f:
            pickle.dump(model_trained, f)

    return model_trained
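# A hypothetical invocation (the experiment folder path is an assumption;
# loadData, treeUtils, and SIMPLIFY_TREES come from the surrounding module):
#
#     model = loadModelForDataset('forest', 'adult',
#                                 experiment_folder_name='_experiments/run1')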
Example 4
def main():
    model = myConvNet()

    if cuda:
        model = model.cuda()
    model.apply(weights_init)
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    #optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    loss_fn = nn.CrossEntropyLoss()

    mean_train_losses = []
    epochs = 10000

    trainref = 'bigtrain'
    testref = 'bigtest'
    print(f"Trainref: {trainref}, testref: {testref}")
    trainset = loadData.loadDataset(trainref,
                                    flipHorizontal=True,
                                    flipVertical=True,
                                    meanNorm=True,
                                    stdNorm=False)
    train_loader = torch.utils.data.DataLoader(trainset,
                                               batch_size=500,
                                               shuffle=True,
                                               num_workers=2)
    testset = loadData.loadDataset(testref,
                                   flipHorizontal=False,
                                   flipVertical=False,
                                   meanNorm=True,
                                   stdNorm=False)
    test_loader = torch.utils.data.DataLoader(testset,
                                              batch_size=12,
                                              shuffle=True,
                                              num_workers=2)

    train = True

    for epoch in range(epochs):
        if train:
            train_losses = []
            for images, labels in train_loader:

                optimizer.zero_grad()
                inputs = images.float()

                if cuda:
                    inputs = inputs.cuda()
                    labels = labels.cuda()

                outputs = model(inputs)

                loss = loss_fn(outputs, labels)
                loss.backward()
                optimizer.step()

                train_losses.append(loss.item())

            mean_train_losses.append(np.mean(train_losses))
            print("Train losses: ", np.mean(train_losses))

        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in test_loader:
                inputs = images.float()
                if cuda:
                    inputs = inputs.cuda()
                    labels = labels.cuda()
                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

        if epoch % 10 == 0:
            # Save model
            torch.save(model.state_dict(), f"models/model_{epoch}.pth")
        accuracy = 100 * correct / total
        # NOTE: np.mean(train_losses) below assumes the training branch ran
        # this epoch (train is hard-coded to True above).
        print(f'epoch : {epoch + 1}, train loss : {np.mean(train_losses):.4f} '
              f'accuracy: {accuracy:.4f}')
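
# `cuda` and `weights_init` are used above but not defined in this snippet.
# A minimal sketch of what they might look like (an assumption, not the
# original code; `myConvNet` is likewise defined elsewhere):
import torch
import torch.nn as nn

cuda = torch.cuda.is_available()

def weights_init(m):
    # Xavier-initialize conv/linear weights and zero the biases.
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)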
Example 5
def runExperiments(dataset_values, model_class_values, norm_values,
                   approaches_values, batch_number, sample_count, gen_cf_for,
                   process_id):

    for dataset_string in dataset_values:

        print(f'\n\nExperimenting with dataset_string = `{dataset_string}`')

        for model_class_string in model_class_values:

            print(
                f'\tExperimenting with model_class_string = `{model_class_string}`'
            )

            for norm_type_string in norm_values:

                print(
                    f'\t\tExperimenting with norm_type_string = `{norm_type_string}`'
                )

                for approach_string in approaches_values:

                    print(
                        f'\t\t\tExperimenting with approach_string = `{approach_string}`'
                    )

                    # if norm_type_string == 'two_norm':
                    #   raise Exception(f'{norm_type_string} not supported.')

                    if model_class_string in {'tree', 'forest'}:
                        one_hot = False
                    elif model_class_string in {'lr', 'mlp'}:
                        one_hot = True
                    else:
                        raise Exception(
                            f'{model_class_string} not recognized as a valid `model_class_string`.'
                        )

                    # prepare experiment folder
                    experiment_name = f'{dataset_string}__{model_class_string}__{norm_type_string}__{approach_string}__batch{batch_number}__samples{sample_count}__pid{process_id}'
                    experiment_folder_name = f"_experiments/{datetime.now().strftime('%Y.%m.%d_%H.%M.%S')}__{experiment_name}"
                    explanation_folder_name = f'{experiment_folder_name}/__explanation_log'
                    minimum_distance_folder_name = f'{experiment_folder_name}/__minimum_distances'
                    os.mkdir(experiment_folder_name)
                    os.mkdir(explanation_folder_name)
                    os.mkdir(minimum_distance_folder_name)
                    log_file = open(
                        f'{experiment_folder_name}/log_experiment.txt', 'w')

                    # save some files
                    dataset_obj = loadData.loadDataset(dataset_string,
                                                       return_one_hot=one_hot,
                                                       load_from_cache=False,
                                                       debug_flag=False)
                    with open(f'{experiment_folder_name}/_dataset_obj',
                              'wb') as f:
                        pickle.dump(dataset_obj, f)
                    #     training portion used to train models
                    #     testing portion used to compute counterfactuals
                    X_train, X_test, y_train, y_test = \
                        dataset_obj.getTrainTestSplit()

                    standard_deviations = list(X_train.std())

                    # train the model
                    # model_trained = modelTraining.trainAndSaveModels(
                    #   model_class_string,
                    #   dataset_string,
                    #   experiment_folder_name,
                    # )
                    model_trained = loadModel.loadModelForDataset(
                        model_class_string,
                        dataset_string,
                        experiment_folder_name=experiment_folder_name)

                    # get the predicted labels (only test set)
                    # X_test = pd.concat([X_train, X_test]) # ONLY ACTIVATE THIS WHEN TEST SET IS NOT LARGE ENOUGH TO GEN' MODEL RECON DATASET
                    X_test_pred_labels = model_trained.predict(X_test)

                    # copy to avoid silently mutating X_test when adding 'y'
                    all_pred_data_df = X_test.copy()
                    # IMPORTANT: note that 'y' is actually 'pred_y', not 'true_y'
                    all_pred_data_df['y'] = X_test_pred_labels
                    neg_pred_data_df = all_pred_data_df.where(
                        all_pred_data_df['y'] == 0).dropna()
                    pos_pred_data_df = all_pred_data_df.where(
                        all_pred_data_df['y'] == 1).dropna()

                    batch_start_index = batch_number * sample_count
                    batch_end_index = (batch_number + 1) * sample_count
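                    # e.g. batch_number=2, sample_count=50 selects rows 100:150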

                    # generate counterfactuals for {only negative, negative & positive} samples
                    if gen_cf_for == 'neg_only':
                        # choose only a subset to compare
                        iterate_over_data_df = neg_pred_data_df[
                            batch_start_index:batch_end_index]
                        observable_data_df = pos_pred_data_df
                    elif gen_cf_for == 'pos_only':
                        # choose only a subset to compare
                        iterate_over_data_df = pos_pred_data_df[
                            batch_start_index:batch_end_index]
                        observable_data_df = neg_pred_data_df
                    elif gen_cf_for == 'neg_and_pos':
                        # choose only a subset to compare
                        iterate_over_data_df = all_pred_data_df[
                            batch_start_index:batch_end_index]
                        observable_data_df = all_pred_data_df
                    else:
                        raise Exception(
                            f'{gen_cf_for} not recognized as a valid `gen_cf_for`.'
                        )

                    # convert to dictionary for easier enumeration (iteration)
                    iterate_over_data_dict = iterate_over_data_df.T.to_dict()
                    observable_data_dict = observable_data_df.T.to_dict()

                    # loop through samples for which we desire a counterfactual,
                    # (to be saved as part of the same file of minimum distances)
                    explanation_counter = 1
                    all_minimum_distances = {}
                    for factual_sample_index, factual_sample in \
                            iterate_over_data_dict.items():

                        factual_sample['y'] = bool(factual_sample['y'])

                        print(
                            '\t\t\t\t'
                            f'Generating explanation for\t'
                            f'batch #{batch_number}\t'
                            f'sample #{explanation_counter}/{len(iterate_over_data_dict)}\t'
                            f'(sample index {factual_sample_index}): ',
                            end='')  # , file=log_file)
                        explanation_counter += 1
                        explanation_file_name = f'{explanation_folder_name}/sample_{factual_sample_index}.txt'

                        explanation_object = generateExplanations(
                            approach_string,
                            explanation_file_name,
                            model_trained,
                            dataset_obj,
                            factual_sample,
                            norm_type_string,
                            observable_data_dict,  # used solely for minimum_observable method
                            standard_deviations,  # used solely for feature_tweaking method
                        )

                        if 'MINT' in approach_string:
                            print(
                                f'\t- scf_found: {explanation_object["scf_found"]} -'
                                f'\t- scf_plaus: {explanation_object["scf_plausible"]} -'
                                f'\t- scf_time: {explanation_object["scf_time"]:.4f} -'
                                f'\t- int_cost: {explanation_object["int_cost"]:.4f} -'
                                f'\t- scf_dist: {explanation_object["scf_distance"]:.4f} -'
                            )  # , file=log_file)
                        else:  # 'MACE' or other..
                            print(
                                f'\t- cfe_found: {explanation_object["cfe_found"]} -'
                                f'\t- cfe_plaus: {explanation_object["cfe_plausible"]} -'
                                f'\t- cfe_time: {explanation_object["cfe_time"]:.4f} -'
                                f'\t- int_cost: N/A -'
                                f'\t- cfe_dist: {explanation_object["cfe_distance"]:.4f} -'
                            )  # , file=log_file)

                        all_minimum_distances[
                            f'sample_{factual_sample_index}'] = explanation_object

                    with open(f'{experiment_folder_name}/_minimum_distances',
                              'wb') as f:
                        pickle.dump(all_minimum_distances, f)
                    with open(f'{experiment_folder_name}/minimum_distances.txt',
                              'w') as f:
                        pprint(all_minimum_distances, f)
Example 6
def loadModelForDataset(model_class,
                        dataset_class,
                        scm_class=None,
                        num_train_samples=int(1e5),  # int: used to slice below
                        fair_nodes=None,
                        fair_kernel_type=None,
                        experiment_folder_name=None):

    log_file = sys.stdout if experiment_folder_name is None else open(
        f'{experiment_folder_name}/log_training.txt', 'w')

    if model_class not in {'lr', 'mlp', 'tree', 'forest'} and \
            model_class not in fairRecourse.FAIR_MODELS:
        raise Exception(f'{model_class} not supported.')

    if dataset_class not in {
            'synthetic', 'mortgage', 'twomoon', 'german', 'credit', 'compass',
            'adult', 'test'
    }:
        raise Exception(f'{dataset_class} not supported.')

    if dataset_class == 'adult':
        dataset_obj = loadData.loadDataset(dataset_class,
                                           return_one_hot=False,
                                           load_from_cache=False,
                                           index_offset=1)
    else:
        dataset_obj = loadData.loadDataset(dataset_class,
                                           return_one_hot=True,
                                           load_from_cache=False,
                                           meta_param=scm_class)

    if model_class not in fairRecourse.FAIR_MODELS:
        X_train, X_test, y_train, y_test = dataset_obj.getTrainTestSplit()
        y_all = pd.concat([y_train, y_test], axis=0)
        assert sum(y_all) / len(y_all) == 0.5, \
            'Expected class balance to be 50/50.'
    else:
        if dataset_class == 'adult':
            X_train, X_test, y_train, y_test = dataset_obj.getTrainTestSplit(
                with_meta=False, balanced=False)
            X_train = X_train[fair_nodes]
            X_test = X_test[fair_nodes]
        else:
            X_train, X_test, U_train, U_test, y_train, y_test = dataset_obj.getTrainTestSplit(
                with_meta=True, balanced=False)
            X_train = pd.concat([X_train, U_train], axis=1)[fair_nodes]
            X_test = pd.concat([X_test, U_test], axis=1)[fair_nodes]

    if model_class == 'tree':
        model_pretrain = DecisionTreeClassifier()
    elif model_class == 'forest':
        model_pretrain = RandomForestClassifier()
    elif model_class == 'lr':
        # IMPORTANT: The default solver changed from ‘liblinear’ to ‘lbfgs’ in 0.22;
        #            therefore, results may differ slightly from paper.
        model_pretrain = LogisticRegression()  # default penalty='l2', i.e., ridge
    elif model_class == 'mlp':
        model_pretrain = MLPClassifier(hidden_layer_sizes=(10, 10))
    else:
        model_pretrain = trainFairClassifier(model_class, fair_kernel_type)
        X_train = np.array(X_train)
        X_test = np.array(X_test)
        y_train = np.array(y_train)
        y_test = np.array(y_test)

    X_train = X_train[:num_train_samples]
    y_train = y_train[:num_train_samples]

    training_setup_string = f'[INFO] Training `{model_class}` on {X_train.shape[0]:,} samples ' + \
      f'(%{100 * X_train.shape[0] / (X_train.shape[0] + X_test.shape[0]):.2f} ' + \
      f'of {X_train.shape[0] + X_test.shape[0]:,} samples)...'
    print(training_setup_string, file=log_file)
    print(training_setup_string)

    model_trained = model_pretrain.fit(X_train, y_train)

    train_accuracy_string = f'\t[INFO] Training accuracy: %{accuracy_score(y_train, model_trained.predict(X_train)) * 100:.2f}.'
    test_accuracy_string = f'\t[INFO] Testing accuracy: %{accuracy_score(y_test, model_trained.predict(X_test)) * 100:.2f}.'

    print(train_accuracy_string, file=log_file)
    print(test_accuracy_string, file=log_file)
    print(train_accuracy_string)
    print(test_accuracy_string)

    if hasattr(model_trained, 'best_estimator_'):
        hyperparams_string = f'\t[INFO] Hyper-parameters of best classifier selected by CV:\n\t{model_trained.best_estimator_}'
        print(hyperparams_string, file=log_file)
        print(hyperparams_string)

    # guard against a badly trained model; the 70% accuracy threshold is arbitrary
    tmp = accuracy_score(y_train, model_trained.predict(X_train))

    # TODO (fair): try/except added so nonlinear classifiers in the fairness
    # experiments can proceed even when the accuracy threshold is not met
    try:
        assert tmp > 0.70, f'Model accuracy only {tmp}'
    except AssertionError:
        print('[INFO] model training accuracy may be low (<70%)')

    classifier_obj = model_trained
    visualizeDatasetAndFixedModel(dataset_obj, classifier_obj,
                                  experiment_folder_name)

    feature_names = dataset_obj.getInputAttributeNames(
        'kurz')  # easier to read (nothing to do with one-hot vs non-one-hot!)
    if model_class == 'tree':
        if SIMPLIFY_TREES:
            print('[INFO] Simplifying decision tree...', end='', file=log_file)
            model_trained.tree_ = treeUtils.simplifyDecisionTree(
                model_trained, False)
            print('\tdone.', file=log_file)
        # treeUtils.saveTreeVisualization(model_trained, model_class, '', X_test, feature_names, experiment_folder_name)
    elif model_class == 'forest':
        for tree_idx in range(len(model_trained.estimators_)):
            if SIMPLIFY_TREES:
                print(
                    f'[INFO] Simplifying decision tree (#{tree_idx + 1}/{len(model_trained.estimators_)})...',
                    end='',
                    file=log_file)
                model_trained.estimators_[
                    tree_idx].tree_ = treeUtils.simplifyDecisionTree(
                        model_trained.estimators_[tree_idx], False)
                print('\tdone.', file=log_file)
            # treeUtils.saveTreeVisualization(model_trained.estimators_[tree_idx], model_class, f'tree{tree_idx}', X_test, feature_names, experiment_folder_name)

    if experiment_folder_name:
        with open(f'{experiment_folder_name}/_model_trained', 'wb') as f:
            pickle.dump(model_trained, f)

    return model_trained
Example 7
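# Fragment of a K-nearest-neighbours evaluation loop: the enclosing function,
# the iteration over test samples, and the construction of `distances` (a list
# of (label, distance) pairs) are not shown.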
        distances = sorted(distances, key=lambda x: x[1])
        labels = [label for label, _ in distances[:K]]
        counter = Counter(labels)

        prediction = counter.most_common(1)[0][0]
        if prediction == testY:
            correct += 1
    total = len(y_test)
    print(f"Accuracy = {correct / total}")


K = 3

X, Y, imgPaths = loadDataset("HiraganaGit", loadAgain=False)

indices = np.arange(len(X))
# np.random.seed(3)
np.random.shuffle(indices)
X = X[indices]
Y = Y[indices]

N = X.shape[0]

Ntrain = int(N * 80 / 100)
Ntest = int(N * 20 / 100)

x_train = X[:Ntrain].reshape((Ntrain, 1, 84, 83))
y_train = Y[:Ntrain]
Example 8
def loadModelForDataset(model_class,
                        dataset_string,
                        scm_class=None,
                        experiment_folder_name=None):

    log_file = sys.stdout if experiment_folder_name is None else open(
        f'{experiment_folder_name}/log_training.txt', 'w')

    if model_class not in {'lr', 'mlp', 'tree', 'forest'}:
        raise Exception(f'{model_class} not supported.')

    if dataset_string not in {
            'synthetic', 'mortgage', 'twomoon', 'german', 'credit', 'compass',
            'adult', 'test', 'iris', 'housing', 'wine', 'poker'
    }:
        raise Exception(f'{dataset_string} not supported.')

    if model_class in {'tree', 'forest'}:
        one_hot = False
    elif model_class in {'lr', 'mlp'}:
        one_hot = True
    dataset_obj = loadData.loadDataset(dataset_string,
                                       return_one_hot=one_hot,
                                       load_from_cache=False,
                                       meta_param=scm_class)
    X_train, X_test, y_train, y_test = dataset_obj.getTrainTestSplit()
    X_all = pd.concat([X_train, X_test], axis=0)
    y_all = pd.concat([y_train, y_test], axis=0)
    if dataset_obj.problem_type == 'classification':
        # expected class balance: every class equally represented
        assert y_all.value_counts().nunique() == 1
    feature_names = dataset_obj.getInputAttributeNames(
        'kurz')  # easier to read (nothing to do with one-hot vs non-one-hot!)

    # Define model type
    if model_class == 'tree':
        if dataset_obj.problem_type == 'classification':
            model_pretrain = DecisionTreeClassifier()
        elif dataset_obj.problem_type == 'regression':
            model_pretrain = DecisionTreeRegressor()
    elif model_class == 'forest':
        if dataset_obj.problem_type == 'classification':
            model_pretrain = RandomForestClassifier(n_estimators=100)
        elif dataset_obj.problem_type == 'regression':
            model_pretrain = RandomForestRegressor(n_estimators=100)
    elif model_class == 'lr':
        # IMPORTANT: The default solver changed from ‘liblinear’ to ‘lbfgs’ in 0.22;
        #            therefore, results may differ slightly from paper.
        model_pretrain = LogisticRegression()  # default penalty='l2', i.e., ridge
    elif model_class == 'mlp':
        if dataset_obj.problem_type == 'classification':
            model_pretrain = MLPClassifier(hidden_layer_sizes=(10, 10))
        elif dataset_obj.problem_type == 'regression':
            model_pretrain = MLPRegressor(hidden_layer_sizes=(10, 10))

    tmp_text = f'[INFO] Training `{model_class}` on {X_train.shape[0]:,} samples ' + \
      f'(%{100 * X_train.shape[0] / (X_train.shape[0] + X_test.shape[0]):.2f} ' + \
      f'of {X_train.shape[0] + X_test.shape[0]:,} samples)...'
    print(tmp_text)
    print(tmp_text, file=log_file)
    model_trained = model_pretrain.fit(X_train, y_train)
    if dataset_obj.problem_type == 'classification':
        print(
            f'\tTraining accuracy: %{accuracy_score(y_train, model_trained.predict(X_train)) * 100:.2f}',
            file=log_file)
        print(
            f'\tTesting accuracy: %{accuracy_score(y_test, model_trained.predict(X_test)) * 100:.2f}',
            file=log_file)
        print(
            f'\tTraining accuracy: %{accuracy_score(y_train, model_trained.predict(X_train)) * 100:.2f}'
        )
        print(
            f'\tTesting accuracy: %{accuracy_score(y_test, model_trained.predict(X_test)) * 100:.2f}'
        )
    else:
        print(
            f'\tTraining MAE: {mean_absolute_error(y_train, model_trained.predict(X_train)):.2f}',
            file=log_file)
        print(
            f'\tTesting MAE: {mean_absolute_error(y_test, model_trained.predict(X_test)):.2f}',
            file=log_file)
        print(
            f'\tTraining MAE: {mean_absolute_error(y_train, model_trained.predict(X_train)):.2f}'
        )
        print(
            f'\tTesting MAE: {mean_absolute_error(y_test, model_trained.predict(X_test)):.2f}'
        )
    print('[INFO] done.\n', file=log_file)
    print('[INFO] done.\n')
    #assert accuracy_score(y_train, model_trained.predict(X_train)) > 0.70  # TODO uncomment

    classifier_obj = model_trained
    visualizeDatasetAndFixedModel(dataset_obj, classifier_obj,
                                  experiment_folder_name)

    if model_class == 'tree':
        if SIMPLIFY_TREES:
            print('[INFO] Simplifying decision tree...', end='', file=log_file)
            model_trained.tree_ = treeUtils.simplifyDecisionTree(
                model_trained, False)
            print('\tdone.', file=log_file)
        # treeUtils.saveTreeVisualization(model_trained, model_class, '', X_test, feature_names, experiment_folder_name)
    elif model_class == 'forest':
        for tree_idx in range(len(model_trained.estimators_)):
            if SIMPLIFY_TREES:
                print(
                    f'[INFO] Simplifying decision tree (#{tree_idx + 1}/{len(model_trained.estimators_)})...',
                    end='',
                    file=log_file)
                model_trained.estimators_[
                    tree_idx].tree_ = treeUtils.simplifyDecisionTree(
                        model_trained.estimators_[tree_idx], False)
                print('\tdone.', file=log_file)
            # treeUtils.saveTreeVisualization(model_trained.estimators_[tree_idx], model_class, f'tree{tree_idx}', X_test, feature_names, experiment_folder_name)

    if experiment_folder_name:
        with open(f'{experiment_folder_name}/_model_trained', 'wb') as f:
            pickle.dump(model_trained, f)

    return model_trained
Example 9
# The enclosing function header is not shown; a plausible reconstruction
# (the name and signature are assumptions):
def buildModel(num_classes):
	model = Sequential()
	model.add(Conv2D(50, (5, 5), input_shape=(1, 84, 83), data_format='channels_first', activation='relu'))
	model.add(MaxPooling2D(pool_size=(2, 2)))
	model.add(Flatten())
	model.add(Dense(128, activation='relu'))
	model.add(Dense(num_classes, activation='softmax'))
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
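	# Note: categorical_crossentropy expects one-hot labels; the surrounding
	# pipeline presumably one-hot encodes Y (e.g. keras.utils.to_categorical).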
	return model


# DATASET = "Hiragana73"
DATASET = "HiraganaGit"

#SWITCH THE LINES BELOW IF YOU NEED TO LOAD ALL THE DATA FROM THE DATASET AGAIN
X, Y, imgPaths = loadDataset(DATASET, loadAgain=True)
#X, Y, imgPaths = loadDataset(DATASET, loadAgain=False)

X /= 255

# X has format (N, height, width); axis 0 indexes samples, as the shuffling below assumes

indices = np.arange(X.shape[0])
np.random.seed(3)
np.random.shuffle(indices)
X = X[indices]
Y = Y[indices]
imgPaths = imgPaths[indices]

N = X.shape[0]