Example #1
0
def test_prob_methods():
    """Smoke test: a grid-searched LogisticRegression yields non-None
    predictions and class probabilities on held-out data."""
    dataset = load_breast_cancer()
    features, labels = dataset["data"], dataset["target"]

    # Hold out 30% of the data, stratified on the label.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.3,
        random_state=0,
        stratify=labels,
    )

    # Grid-search regularization strengths using a validation set.
    searcher = GridSearch(
        model=LogisticRegression(),
        param_grid={'C': [1, 10, 100, 120, 150]},
    )
    searcher.fit(X_train, y_train, verbose=False)

    assert searcher.predict(X_test) is not None
    assert searcher.predict_proba(X_test) is not None
Example #2
0
class Prediction:
    """Train a regression model on the 'precio' target and build a submission.

    Parameters
    ----------
    data
        Tuple of (train_df, test_df) pandas DataFrames; train_df must
        contain the target column 'precio'.
    model
        Scikit-learn-like estimator.
    prefix
        Filename prefix for saved artifacts (a timestamp is appended).
    param_grid
        Hyperparameter grid (list of dicts for manualGridSearch, or a
        grid dict for the search helpers). None/empty means no search.
    """

    def __init__(self, data, model, prefix, param_grid=None):
        self.train_df, self.test_df = data
        self.model = model
        # Avoid the shared-mutable-default pitfall; falling back to []
        # preserves the original `param_grid == []` semantics in save().
        self.param_grid = [] if param_grid is None else param_grid
        self.prefix = prefix + datetime.now().strftime('%m-%d-%H:%M')
        self.X = self.train_df.loc[:, self.train_df.columns != 'precio']
        self.y = self.train_df['precio'].values
        # Hold out 10% of the training rows for validation.
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.X, self.y, test_size=0.1, random_state=1)

    def manualGridSearch(self):
        """Try every grid entry; keep the one with the lowest validation MAE."""
        best_score = math.inf
        for g in self.param_grid:
            print(g)
            self.model.set_params(**g)
            self.model.fit(self.X_train, self.y_train)
            score = mean_absolute_error(self.model.predict(self.X_val),
                                        self.y_val)
            print(score)
            # save if best
            # BUG FIX: the local running minimum was never updated, so every
            # candidate with a finite score overwrote best_grid and the
            # *last* grid always won.
            if score < best_score:
                best_score = score
                self.best_score = score
                self.best_grid = g

    def gridSearchTrain(self):
        """Cross-validated grid search (GridSearchCV); stores best params,
        best score, and predictions on the test frame."""
        print('Training...')
        self.gscv = GridSearchCV(self.model,
                                 self.param_grid,
                                 scoring='neg_mean_absolute_error',
                                 verbose=10)
        self.gscv.fit(self.X_train, self.y_train)
        self.best_params = self.gscv.best_params_
        self.score = self.gscv.best_score_
        self.predicted = self.gscv.predict(self.test_df)
        print(self.best_params)
        print(self.score)

    def HypOptTrain(self):
        """Validation-set grid search via hypopt's GridSearch."""
        print('Training...')
        self.opt = GridSearch(model=self.model, param_grid=self.param_grid)
        self.opt.fit(self.X_train,
                     self.y_train,
                     self.X_val,
                     self.y_val,
                     scoring='neg_mean_squared_error')
        self.best_params = self.opt.best_params_
        # BUG FIX: X_val/y_val were referenced as bare names (NameError);
        # they live on the instance.
        self.score = self.opt.score(self.X_val, self.y_val)
        self.predicted = self.opt.predict(self.test_df)
        print(self.best_params)
        print(self.score)

    def train(self):
        """Plain fit on the training split; score is MAE on the validation split."""
        print('Training...')
        self.model.fit(self.X_train, self.y_train)
        self.score = mean_absolute_error(self.model.predict(self.X_val),
                                         self.y_val)
        print(self.score)
        self.predicted = self.model.predict(self.test_df)

    def crossValidation(self, cv=5):
        """Mean negative-MAE over `cv` folds on the full training data."""
        cv_scores = cross_val_score(
            self.model,
            self.X,
            self.y,
            cv=cv,
            scoring='neg_mean_absolute_error'
        )  # print each cv score (accuracy) and average them
        self.score = np.mean(cv_scores)
        print(self.score)

    def save(self):
        """Pickle the fitted model (or the GridSearchCV object).

        NOTE(review): with a non-empty grid this pickles self.gscv, which
        only exists after gridSearchTrain(); calling save() after
        HypOptTrain() would raise AttributeError — confirm intended.
        """
        if not self.param_grid:
            with open('{}.model'.format(self.prefix), 'wb') as f:
                pickle.dump(self.model, f)
        else:
            with open('{}.model'.format(self.prefix), 'wb') as f:
                pickle.dump(self.gscv, f)

    def submit(self):
        """Write a '<prefix>-<score>.csv' submission of (id, prediction)."""
        self.test_ids = pd.read_csv('data/test.csv')['id']
        answer = pd.DataFrame(list(zip(self.test_ids, self.predicted)),
                              columns=['id', 'target'])
        answer.to_csv('{}-{}.csv'.format(self.prefix, int(round(self.score))),
                      sep=',',
                      index=False)
class MachineLearning:
    """Machine learning class to run sklearn-like pipeline on MethylationArray data.
    Initialize object with scikit-learn model, and optionally supply a hyperparameter search grid.

    model
        Scikit-learn-like model, classification, regression, dimensionality reduction, clustering etc.
    options
        Options to supply model in form of dictionary.
    grid
        Alternatively, supply search grid to search for best hyperparameters.
    labelencode
        T/F encode string labels.
    n_eval
        Number of evaluations for randomized grid search, if set to 0, perform exhaustive grid search
    """
    def __init__(self, model, options, grid=None, labelencode=False, n_eval=0):
        # grid=None replaces a shared mutable {} default; `if grid:` treats
        # None and {} identically, so behavior is unchanged for callers.
        if grid:
            self.model = GridSearch(
                model=model(),
                param_grid=grid,
                num_random_search=None if not n_eval else n_eval)
            self.param_grid_exists = True
            self.grid = grid
        else:
            self.model = model(**options)
            self.param_grid_exists = False
        if labelencode:
            self.encoder = LabelEncoder()
        else:
            self.encoder = None

    def fit(self,
            train_methyl_array,
            val_methyl_array=None,
            outcome_cols=None):
        """Fit data to model.

        Parameters
        ----------
        train_methyl_array
            Training MethylationArray.
        val_methyl_array
            Validation MethylationArray. Can set to None, but is accessed
            unconditionally when a hyperparameter grid was supplied.
        outcome_cols
            Set to none if not needed, but phenotype column to train on, can be multiple.
        """
        if outcome_cols is not None:
            if self.encoder is not None:
                self.encoder.fit(train_methyl_array.pheno[outcome_cols])
            # Encode labels once; identical branch logic to the original,
            # hoisted out for readability.
            y_train = (self.encoder.transform(
                train_methyl_array.pheno[outcome_cols])
                       if self.encoder is not None else
                       train_methyl_array.pheno[outcome_cols])
            if self.param_grid_exists:
                y_val = (self.encoder.transform(
                    val_methyl_array.pheno[outcome_cols])
                         if self.encoder is not None else
                         val_methyl_array.pheno[outcome_cols])
                # Classification metric when labels are encoded, r2 otherwise.
                self.model.fit(
                    train_methyl_array.beta,
                    y_train,
                    val_methyl_array.beta,
                    y_val,
                    scoring='accuracy' if self.encoder is not None else 'r2')
            else:
                self.model.fit(train_methyl_array.beta, y_train)
        else:
            # Unsupervised fit (clustering / dimensionality reduction).
            self.model.fit(train_methyl_array.beta)
        return self.model

    def transform(self, test_methyl_array):
        """Transform test methylation array.

        Parameters
        ----------
        test_methyl_array
            Testing MethylationArray.
        """
        self.results = self.model.transform(test_methyl_array.beta)
        return self.results

    def fit_transform(self, train_methyl_array, outcome_cols=None):
        """Fit and transform to training data.

        Parameters
        ----------
        train_methyl_array
            Training MethylationArray.
        outcome_cols
            Set to none if not needed, but phenotype column to train on, can be multiple.
        """
        # BUG FIX: outcome_cols was hard-coded to None, and .transform was
        # called on the raw model with the MethylationArray object instead
        # of its beta matrix; route through the wrapper methods instead.
        self.fit(train_methyl_array, outcome_cols=outcome_cols)
        self.results = self.transform(train_methyl_array)
        return self.results

    def predict(self, test_methyl_array):
        """Make new predictions on test methylation array.

        Parameters
        ----------
        test_methyl_array
            Testing MethylationArray.
        """
        self.results = self.model.predict(test_methyl_array.beta)
        if self.encoder is not None:
            # Map encoded labels back to their original string values.
            self.results = self.encoder.inverse_transform(self.results)
        return self.results

    def fit_predict(self, train_methyl_array, outcome_cols=None):
        """Fit and predict training data.

        Parameters
        ----------
        train_methyl_array
            Training MethylationArray.
        outcome_cols
            Set to none if not needed, but phenotype column to train on, can be multiple.
        """
        # BUG FIX: outcome_cols was passed positionally into fit()'s
        # val_methyl_array slot, and predict() was called on the raw model
        # with the MethylationArray; use the wrappers with keywords.
        self.fit(train_methyl_array, outcome_cols=outcome_cols)
        self.results = self.predict(train_methyl_array)
        return self.results

    def store_results(self, output_pkl, results_dict=None):
        """Store results in pickle file.

        Parameters
        ----------
        output_pkl
            Output pickle to dump results to.
        results_dict
            Supply own results dict to be dumped.
        """
        if not results_dict:
            results_dict = dict(results=self.results)
        # BUG FIX: originally dumped to open(results_dict, 'wb') — the dict
        # itself instead of the output path — and leaked the file handle.
        with open(output_pkl, 'wb') as f:
            pickle.dump(results_dict, f)

    def assign_results_to_pheno_col(self, methyl_array, new_col, output_pkl):
        """Assign results to new phenotype column.

        Parameters
        ----------
        methyl_array
            MethylationArray.
        new_col
            New column name.
        output_pkl
            Output pickle to dump MethylationArray to.
        """
        methyl_array.pheno[new_col] = self.results
        methyl_array.write_pickle(output_pkl)

    def transform_results_to_beta(self, methyl_array, output_pkl):
        """Transform beta matrix into reduced beta matrix and store.

        Parameters
        ----------
        methyl_array
            MethylationArray.
        output_pkl
            Output pickle to dump MethylationArray to.
        """
        # BUG FIX: self.beta does not exist on this class; the row index
        # comes from the array being transformed.
        methyl_array.beta = pd.DataFrame(self.results,
                                         index=methyl_array.beta.index)
        methyl_array.write_pickle(output_pkl)

    def return_outcome_metric(self,
                              methyl_array,
                              outcome_cols,
                              metric,
                              run_bootstrap=False,
                              n_bootstrap=1000):
        """Supply metric to evaluate results.

        Parameters
        ----------
        methyl_array
            MethylationArray to evaluate.
        outcome_cols
            Outcome phenotype columns.
        metric
            Sklearn evaluation metric.
        run_bootstrap
            Make 95% CI from n_bootstrap bootstraps.
        n_bootstrap
            Number of bootstrap resamples (default 1000).
        """
        y_true = methyl_array.pheno[outcome_cols]
        y_pred = self.results
        # BUG FIX: referenced undefined names `bootstrap`/`n_bootstrap`;
        # use the run_bootstrap flag and a real n_bootstrap parameter.
        if not run_bootstrap:
            return metric(y_true, y_pred)
        else:
            from sklearn.utils import resample
            # BUG FIX: a fixed random_state=123 made every resample
            # identical (std_err was always 0); vary the seed per draw.
            boot_results = np.array([
                metric(*resample(y_true, y_pred, random_state=i))
                for i in range(n_bootstrap)
            ])
            original = metric(y_true, y_pred)
            std_err = np.std(boot_results)
            boot_results = np.sort(boot_results)
            ci = 0.95
            bound = (1 - ci) / 2.

            # BORROWED FROM MLXTEND
            def quantile(x, q):
                """Return the element of sorted array x at quantile q."""
                rank = round(q * x.shape[0]) - 1
                if rank >= x.shape[0]:
                    # BUG FIX: clamp to a valid index (was x.shape[0],
                    # which would be out of bounds).
                    rank = x.shape[0] - 1
                elif rank <= 0:
                    rank = 0
                rank = int(round(rank))
                return x[rank]

            high_ci = quantile(boot_results, q=(ci + bound))
            low_ci = quantile(boot_results, q=bound)
            return original, std_err, (low_ci, high_ci)
def classifier(classifier, train, truth, validate, validate_truth, test,
               test_truth, datatype):
    """Grid-search the named classifier and report test-set performance.

    Parameters
    ----------
    classifier : str
        One of 'svm', 'randomforest', 'adaboost', 'knn'; anything else
        selects a multi-layer perceptron.
    train, truth : array-like
        Training samples and labels.
    validate, validate_truth : array-like
        Validation samples and labels (used by the second GridSearch fit).
    test, test_truth : array-like
        Test samples and labels, scored and reported.
    datatype : str
        Tag included in the log lines.
    """
    np.random.seed(0)
    # NOTE(review): permutation(1)[0] is always 0, so rng == 0 every call.
    rng = np.random.permutation(1)[0]
    train = pd.DataFrame(train)
    validate = pd.DataFrame(validate)
    test = pd.DataFrame(test)
    logger = logging.getLogger('myapp')
    # BUG FIX: a fresh FileHandler was attached on every invocation,
    # duplicating each log line once per prior call; attach only once.
    if not logger.handlers:
        hdlr = logging.FileHandler('classifiers.log')
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)
    logger.setLevel(logging.WARN)
    if classifier.lower(
    ) == 'svm':  #best: C = 50, gamma = 0.0001, kernel = rbf
        model = svm.SVC(random_state=rng)
        hyperparameter = {
            'kernel': ('linear', 'rbf'),
            'C': [1, 1.5, 10, 50, 100, 200],
            'gamma': [1e-7, 1e-4]
        }
    elif classifier.lower() == 'randomforest':  #120
        model = RandomForestClassifier(random_state=rng)
        hyperparameter = {'n_estimators': np.arange(10, 300, 10)}
    elif classifier.lower() == 'adaboost':
        model = AdaBoostClassifier(random_state=rng)
        hyperparameter = {
            'n_estimators': np.arange(10, 300, 10),
            'algorithm': ('SAMME', 'SAMME.R')
        }
    elif classifier.lower() == 'knn':  #120
        model = KNeighborsClassifier()
        hyperparameter = dict(n_neighbors=list(range(1, 100)))
    else:  ## assume it's asking for neural network (multi-layer perceptron)
        model = MLPClassifier(
            max_iter=100
        )  #activation=tanh, hiddenlayersize=(20,20), 'learning_rate'=adaptive,solver=lbfgs
        hyperparameter = {
            'hidden_layer_sizes': [(20, 20), (80, 20), (80, 20, 20),
                                   (80, 40, 40, 20), (40, 40, 20, 20, 20, 10)],
            'learning_rate': ['adaptive'],
            'activation': ['tanh', 'relu', 'logistic'],
            'solver': ['lbfgs', 'sgd', 'adam']
        }
    tuned_model = GridSearch(model=model, param_grid=hyperparameter)
    # First fit: no validation set supplied (internal CV split).
    tuned_model.fit(train, truth)
    prediction = tuned_model.score(test, test_truth)
    # logger.warn() is deprecated; logger.warning() is the supported API.
    logger.warning(classifier + ' ' + datatype + ' validate    ' +
                   str(prediction))
    # Second fit: explicit validation set for hyperparameter selection.
    tuned_model.fit(train, truth, validate, validate_truth)
    prediction = tuned_model.score(test, test_truth)
    target_names = [
        'c-CS-s', 'c-CS-m', 'c-SC-s', 'c-SC-m', 't-CS-s', 't-CS-m', 't-SC-s',
        't-SC-m'
    ]
    prediction = tuned_model.predict(test)
    print(
        classification_report(test_truth,
                              prediction,
                              target_names=target_names))
    logger.warning(classifier + ' ' + datatype + '    ' + str(prediction))
    return
def main():
    """Average pre-trained word vectors per sentence of the Stanford
    Sentiment Treebank, tune a linear SVM on the validation split, and
    report binary sentiment accuracy on the test split (stdout + file).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--vectors_file', required=True, type=str)
    parser.add_argument('--data_path',
                        default='stanfordSentimentTreebank',
                        type=str)
    parser.add_argument('--output_file', required=True, type=str)
    args = parser.parse_args()

    # NOTE(review): `vectors` is a local name, so this guard always raises
    # NameError and the vectors are always loaded; kept for parity with
    # the original notebook-style code.
    try:
        vectors
    except NameError:
        print('Reading vectors file ...  ', end='')
        t = time.time()
        # First pass: count lines to pre-size the vector matrix.
        with codecs.open(args.vectors_file, 'r', "UTF-8") as f:
            vocab_size = sum(1 for line in f)

        # Second pass: infer dimensionality from the first data line.
        with codecs.open(args.vectors_file, 'r', "UTF-8") as f:
            line = f.readline()
            val = line.rstrip().split(' ')
            check = False
            if len(
                    val
            ) == 2:  # Check if the vectors file has vocab size and dimensionality in the first line
                val = f.readline().rstrip().split(' ')
                vocab_size -= 1
                check = True
            vector_dim = len(list(map(float, val[1:])))

        vectors = np.zeros((vocab_size, vector_dim))

        words = [""] * vocab_size
        vocab_dict = dict()
        # Third pass: fill the matrix and the word -> row-index mapping.
        with codecs.open(args.vectors_file, 'r', "UTF-8") as f:
            if check:
                next(f)  # skip the header line detected above
            for idx, line in enumerate(f):
                vals = line.rstrip().split(' ')

                words[idx] = vals[0]
                vocab_dict[vals[0]] = idx  # indices start from 0
                vec = list(map(float, vals[1:]))
                try:
                    vectors[idx, :] = vec
                except IndexError:
                    if vals[0] == '<unk>':  # ignore the <unk> vector
                        pass
                    else:
                        raise Exception('IncompatibleInputs')

        print("done in " + str(int(time.time() - t)) + " seconds")

    print('Reading train and test data ...  ', end='')
    t = time.time()
    # phrase text -> phrase id
    dictionary = dict()
    with codecs.open(args.data_path + "/dictionary.txt", 'r', "UTF-8") as f:
        for line in f.read().splitlines():
            tmp = line.split("|")
            dictionary[tmp[0]] = int(tmp[1])

    with codecs.open(args.data_path + "/datasetSentences.txt", "r",
                     "UTF-8") as f:
        sentences = []
        for sentence in f.read().splitlines()[1:]:
            sentences.append(sentence.split("\t")[1])

    # sentiment score in [0, 1] per phrase id
    all_labels = []
    with open(args.data_path + "/sentiment_labels.txt") as f:
        for label in f.read().splitlines()[1:]:
            all_labels.append(float(label.split("|")[1]))

    # split class per sentence: 1=train, 2=test, 3=validation
    split_classes = []
    with open(args.data_path + "/datasetSplit.txt") as f:
        for line in f.read().splitlines()[1:]:
            split_classes.append(int(line.split(",")[1]))

    print("done in " + str(int(time.time() - t)) + " seconds")

    print(
        'Generating train and test samples from the data for selected classes ...  ',
        end='')
    t = time.time()

    train_size = sum(1 for label in split_classes if label == 1)
    val_size = sum(1 for label in split_classes if label == 3)
    test_size = sum(1 for label in split_classes if label == 2)

    train_samples = np.zeros([train_size, vector_dim])
    train_labels = []

    val_samples = np.zeros([val_size, vector_dim])
    val_labels = []

    test_samples = np.zeros([test_size, vector_dim])
    test_labels = []

    train_no = 0
    val_no = 0
    test_no = 0
    not_in_dict_count = 0
    for sample_no, sentence in enumerate(sentences):
        # BUG FIX: narrowed the bare `except:` — only a missing dictionary
        # entry (KeyError) or an out-of-range phrase id (IndexError)
        # should skip the sentence.
        try:
            score = all_labels[dictionary[sentence]]
        except (KeyError, IndexError):
            not_in_dict_count += 1
            continue

        if score <= 0.4 or score > 0.6:  # Eliminate neutral sentences
            inds = process_sentence(sentence, vocab_dict)
            if len(inds) > 0:
                if split_classes[sample_no] == 1:
                    # Sentence embedding = mean of its word vectors.
                    for ind in inds:
                        train_samples[train_no, :] += vectors[ind, :]

                    train_samples[
                        train_no, :] = train_samples[train_no, :] / len(inds)

                    if score <= 0.4:
                        train_labels.append(0)
                    elif score > 0.6:
                        train_labels.append(1)

                    train_no += 1

                elif split_classes[sample_no] == 3:
                    for ind in inds:
                        val_samples[val_no, :] += vectors[ind, :]

                    val_samples[val_no, :] = val_samples[val_no, :] / len(inds)

                    if score <= 0.4:
                        val_labels.append(0)
                    elif score > 0.6:
                        val_labels.append(1)

                    val_no += 1

                elif split_classes[sample_no] == 2:
                    for ind in inds:
                        test_samples[test_no, :] += vectors[ind, :]

                    test_samples[
                        test_no, :] = test_samples[test_no, :] / len(inds)

                    if score <= 0.4:
                        test_labels.append(0)
                    elif score > 0.6:
                        test_labels.append(1)

                    test_no += 1

    # Trim the pre-sized matrices to the rows actually filled.
    train_samples = train_samples[:train_no, :]
    val_samples = val_samples[:val_no, :]
    test_samples = test_samples[:test_no, :]

    print("done in " + str(int(time.time() - t)) + " seconds")

    print('Training linear SVM for parameter optimization ... ', end='')

    tuned_parameters = [{
        'kernel': ['linear'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    }]
    clf = GridSearch(model=SVC(), param_grid=tuned_parameters)
    clf.fit(train_samples, train_labels, val_samples, val_labels)

    print("done in " + str(int(time.time() - t)) + " seconds")

    predicted_labels = clf.predict(test_samples)
    accuracy = sum([
        true == predicted
        for true, predicted in zip(test_labels, predicted_labels)
    ]) / len(test_samples) * 100

    print("Accuracy for sentiment classification of sentences is: " +
          str(round(accuracy, 2)) + "% (" +
          str(int(accuracy / 100 * len(predicted_labels))) + "/" +
          str(len(predicted_labels)) + ")")

    # BUG FIX: the output file was opened without a context manager and
    # would leak on an exception before close().
    with open(args.output_file, "w") as f_out:
        f_out.write("Accuracy for sentiment classification is: " +
                    str(round(accuracy, 2)) + "% (" +
                    str(int(accuracy / 100 * len(predicted_labels))) + "/" +
                    str(len(predicted_labels)) + ")\n")