Example #1
n_classes = len(y.unique())
n_samples = len(y)
n_samples0 = y.value_counts()[0]
n_samples1 = y.value_counts()[1]

# balanced class weights: n_samples / (n_classes * class_count)
w0 = n_samples / (n_classes * n_samples0)
w1 = n_samples / (n_classes * n_samples1)

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

weights = y_train.map(lambda label: w0 if label == 0 else w1)

from sklearn.neighbors import NearestCentroid
from sklearn.metrics import classification_report
# Creating the Nearest Centroid classifier
model = NearestCentroid()

# Training the classifier
model.fit(X_train, y_train.values.ravel())

# NearestCentroid.fit() has no sample_weight parameter, but score() accepts
# one; weighting by inverse class frequency gives a class-balanced accuracy.
print(f"Balanced Training Score : {model.score(X_train, y_train, sample_weight=weights) * 100} %")

# Printing Accuracy on Training and Test sets
print(f"Training Set Score : {model.score(X_train, y_train) * 100} %")
print(f"Test Set Score : {model.score(X_test, y_test) * 100} %")

# Printing classification report of classifier on the test set data
print(
    f"Model Classification Report : \n{classification_report(y_test, model.predict(X_test))}"
)
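
# The manual weights above reproduce scikit-learn's "balanced" heuristic; an
# equivalent sketch using the library helper (same binary y_train assumed):
from sklearn.utils.class_weight import compute_sample_weight

balanced_weights = compute_sample_weight(class_weight='balanced', y=y_train)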
Example #2
    #Scoring
    score = metrics.accuracy_score(targets, predictions)
    print("accuracy:   %0.3f" % score)
    totalPredictions.append(predictions)
    return name, score, train_time, test_time

results = []

#Main Code

#Classifiers
clf1 = LogisticRegression()
clf2 = PassiveAggressiveClassifier()
clf3 = MultinomialNB(alpha=.01)
clf4 = BernoulliNB(alpha=.01)
clf5 = NearestCentroid()
clf6 = RidgeClassifier(tol=1e-2, solver="sag")
clf7 = Perceptron(max_iter=50)
clf8 = SGDClassifier(loss='hinge', alpha=.0001, max_iter=50, shuffle=True, penalty="l2", n_jobs=-1)
clf9 = SGDClassifier(loss='hinge', alpha=.0001, max_iter=50, shuffle=True, penalty="l1", n_jobs=-1)
clf10 = SGDClassifier(loss='hinge', alpha=.0001, max_iter=50, shuffle=True, penalty="elasticnet", n_jobs=-1)
clf11 = SGDClassifier(loss='log_loss', alpha=.0001, max_iter=50, shuffle=True, penalty="l2", n_jobs=-1)
clf12 = SGDClassifier(loss='log_loss', alpha=.0001, max_iter=50, shuffle=True, penalty="l1", n_jobs=-1)
clf13 = SGDClassifier(loss='log_loss', alpha=.0001, max_iter=50, shuffle=True, penalty="elasticnet", n_jobs=-1)
clf14 = SGDClassifier(loss='modified_huber', alpha=.0001, max_iter=50, shuffle=True, penalty="l2", n_jobs=-1)
clf15 = SGDClassifier(loss='modified_huber', alpha=.0001, max_iter=50, shuffle=True, penalty="l1", n_jobs=-1)
clf16 = SGDClassifier(loss='modified_huber', alpha=.0001, max_iter=50, shuffle=True, penalty="elasticnet", n_jobs=-1)
clf17 = SGDClassifier(loss='squared_hinge', alpha=.0001, max_iter=50, shuffle=True, penalty="l2", n_jobs=-1)
clf18 = SGDClassifier(loss='squared_hinge', alpha=.0001, max_iter=50, shuffle=True, penalty="l1", n_jobs=-1)
clf19 = SGDClassifier(loss='squared_hinge', alpha=.0001, max_iter=50, shuffle=True, penalty="elasticnet", n_jobs=-1)
clf20 = SGDClassifier(loss='perceptron', alpha=.0001, max_iter=50, shuffle=True, penalty="l2", n_jobs=-1)
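
# The driver loop is not shown in this excerpt; a minimal sketch, assuming
# benchmark(clf) trains/evaluates and returns (name, score, train_time,
# test_time) as the fragment above suggests:
for clf in (clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9, clf10,
            clf11, clf12, clf13, clf14, clf15, clf16, clf17, clf18, clf19,
            clf20):
    results.append(benchmark(clf))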
Example #3
    def update_species(self,
                       new_individuals,
                       historical_marker,
                       individual_type,
                       delta=0.0005):
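        # Speciation sketch of what follows: old and new individuals'
        # genotype/phenotype features are scaled together, each new
        # individual is assigned to its nearest existing centroid, and a
        # new species is created when the individual's relative distance
        # exceeds the cluster's maximum intra-cluster distance by `delta`.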

        if individual_type == "modules":
            species_list = self.module_species_list
            individuals = self.modules
        elif individual_type == "blueprints":
            species_list = self.blueprint_species_list
            individuals = self.blueprints
        else:
            raise ValueError("type must be one of blueprints or modules")

        create_new_species = False
        old_features = [
            individual.genotype_phenotype_features(historical_marker)
            for individual in individuals
        ]
        old_labels = [individual.species for individual in individuals]

        new_features = [
            new_individual.genotype_phenotype_features(historical_marker)
            for new_individual in new_individuals
        ]
        scaled_features = scale(old_features + new_features)

        old_features = scaled_features[:len(old_features)]
        new_features = scaled_features[len(old_features):]

        print("Old Labels", individual_type)
        print(old_labels)
        if len(set(old_labels)) < 2:
            dominant_species_id = 0
            for species in species_list:
                if species.members:
                    dominant_species_id = species.id

            kmeans = KMeans(n_clusters=1, random_state=0)
            kmeans.fit(old_features)
            centroids = kmeans.cluster_centers_

            cluster_distances_map = {}

            unique_labels = list(set(old_labels))

            unique_labels.sort()
            centroid_map = {}
            for i in range(len(unique_labels)):
                centroid_map[unique_labels[i]] = centroids[i]

            for i in unique_labels:
                cluster_distances_map[i] = []
            for i in range(len(old_features)):
                cluster_distances_map[old_labels[i]].append(
                    np.sum(
                        pairwise_distances(
                            old_features[i].reshape(1, -1),
                            np.array(centroid_map[old_labels[i]]).reshape(
                                1, -1),
                            force_all_finite=True)))

            max_point_distance_clusters = {}

            for cluster, distances in cluster_distances_map.items():
                if np.max(distances) == 0:
                    max_point_distance_clusters[cluster] = 1
                else:
                    max_point_distance_clusters[cluster] = np.max(distances)

            new_labels = kmeans.predict(new_features)
            dominant_labels = [dominant_species_id] * len(new_features)
            print(dominant_labels)
            adjusted_labels = []

            for i in range(len(new_features)):
                print(i)
                d1 = np.sum(
                    pairwise_distances(
                        new_features[i].reshape(1, -1),
                        np.array(centroid_map[new_labels[i]]).reshape(1, -1),
                        force_all_finite=True))
                print("Print Distance")
                print(max_point_distance_clusters[new_labels[i]])
                print(((d1 - max_point_distance_clusters[new_labels[i]]) /
                       max_point_distance_clusters[new_labels[i]]))
                print(delta)
                if ((d1 - max_point_distance_clusters[new_labels[i]]) /
                        max_point_distance_clusters[new_labels[i]]) > delta:
                    adjusted_labels.append(unique_labels[-1] + 1)
                    print("Found unique individual")
                    create_new_species = True
                else:
                    adjusted_labels.append(dominant_labels[i])

            if create_new_species:
                species_list.append(
                    Species(
                        unique_labels[-1] + 1, [], individual_type,
                        "Species_" + individual_type + "_" +
                        str(unique_labels[-1] + 1)))
                print("Created new Species")

            for i in range(len(new_individuals)):
                new_individuals[i].species = adjusted_labels[i]

            species_exclusion_list = []

            for species in species_list:
                members = []
                for new_individual in new_individuals:
                    if new_individual.species == species.id:
                        members.append(new_individual)
                if members:
                    species.members = species.members + members
                if not species.members:
                    species_exclusion_list.append(species)

            for species in species_exclusion_list:
                species_list.remove(species)
        else:
            if not new_individuals:
                raise Exception("No new offsprings")

            nc_clf = NearestCentroid(metric="euclidean")
            nc_clf.fit(old_features, old_labels)
            centroids = nc_clf.centroids_
            print("centroids")
            print(nc_clf.centroids_)

            cluster_distances_map = {}

            unique_labels = list(set(old_labels))

            unique_labels.sort()
            centroid_map = {}
            for i in range(len(unique_labels)):
                centroid_map[unique_labels[i]] = centroids[i]

            for i in unique_labels:
                cluster_distances_map[i] = []
            for i in range(len(old_features)):
                cluster_distances_map[old_labels[i]].append(
                    np.sum(
                        pairwise_distances(
                            old_features[i].reshape(1, -1),
                            np.array(centroid_map[old_labels[i]]).reshape(
                                1, -1),
                            force_all_finite=True)))

            max_point_distance_clusters = {}

            for cluster, distances in cluster_distances_map.items():
                if np.max(distances) == 0:
                    max_point_distance_clusters[cluster] = 1
                else:
                    max_point_distance_clusters[cluster] = np.max(distances)

            new_labels = nc_clf.predict(new_features)
            adjusted_labels = []

            for i in range(len(new_features)):
                d1 = np.sum(
                    pairwise_distances(
                        new_features[i].reshape(1, -1),
                        np.array(centroid_map[new_labels[i]]).reshape(1, -1),
                        force_all_finite=True))
                print("Print Distance")
                print(max_point_distance_clusters[new_labels[i]])
                print(((d1 - max_point_distance_clusters[new_labels[i]]) /
                       max_point_distance_clusters[new_labels[i]]))
                if ((d1 - max_point_distance_clusters[new_labels[i]]) /
                        max_point_distance_clusters[new_labels[i]]) > delta:
                    adjusted_labels.append(unique_labels[-1] + 1)
                    create_new_species = True
                else:
                    adjusted_labels.append(new_labels[i])
            if create_new_species:
                for species in species_list:
                    if species.id == unique_labels[-1] + 1:
                        raise Exception("Creating Duplicate species")
                species_list.append(
                    Species(
                        unique_labels[-1] + 1, [], individual_type,
                        "Species_" + individual_type + "_" +
                        str(unique_labels[-1] + 1)))
                print("Created new Species")

            for i in range(len(new_individuals)):
                new_individuals[i].species = adjusted_labels[i]

            species_exclusion_list = []

            for species in species_list:
                members = []
                for new_individual in new_individuals:
                    if new_individual.species == species.id:
                        members.append(new_individual)
                if members:
                    species.members = species.members + members
                if not species.members:
                    species_exclusion_list.append(species)

            for species in species_exclusion_list:
                species_list.remove(species)
Example #4
 def nearest_centroid(X, Y, parameters):
     method_params = parameters["static"]["params"]["strategies"]["ml_model"]["params"]
     clf = NearestCentroid(**method_params)
     clf.fit(X, Y)
     return clf
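
# A hedged usage sketch; the nested dict layout is assumed from the lookup
# path inside the function, and "euclidean" is just an illustrative metric:
params = {"static": {"params": {"strategies": {"ml_model": {"params": {"metric": "euclidean"}}}}}}
clf = nearest_centroid(X, Y, params)
print(clf.predict(X[:5]))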
Example #5
 def construct_model(X_train, y_train, params, sample_weights=None):
     model_name = params['name']
     model_params = params['params']
     base_model_params = params.get('base_model_params', {})
     train_data = (X_train, y_train,
                   sample_weights) if sample_weights is not None else (
                       X_train, y_train)
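     # fit(*train_data) passes sample_weights positionally as the third
     # fit() argument (sample_weight) for estimators that support it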
     if model_name == 'DecisionTreeClassifier':
         from sklearn.tree import DecisionTreeClassifier
         model = DecisionTreeClassifier(**model_params).fit(*train_data)
     elif model_name == 'SVC':
         from sklearn.svm import SVC
         model = SVC(**model_params).fit(*train_data)
     elif model_name == 'KNN':
         from sklearn.neighbors import KNeighborsClassifier
         model = KNeighborsClassifier(**model_params).fit(X_train, y_train)
     elif model_name == 'GaussianNB':
         from sklearn.naive_bayes import GaussianNB
         model = GaussianNB(**model_params).fit(X_train, y_train)
     elif model_name == 'RandomForestClassifier':
         from sklearn.ensemble import RandomForestClassifier
         model = RandomForestClassifier(**model_params).fit(*train_data)
     elif model_name == 'GradientBoostingClassifier':
         from sklearn.ensemble import GradientBoostingClassifier
         model = GradientBoostingClassifier(**model_params).fit(*train_data)
     elif model_name == 'BaggedKNN':
         from sklearn.ensemble import BaggingClassifier
         from sklearn.neighbors import KNeighborsClassifier
         model = BaggingClassifier(
             KNeighborsClassifier(**base_model_params),
             **model_params).fit(*train_data)
     elif model_name == 'AdaBoostedTree':
         from sklearn.ensemble import AdaBoostClassifier
         from sklearn.tree import DecisionTreeClassifier
         model = AdaBoostClassifier(
             DecisionTreeClassifier(**base_model_params),
             **model_params).fit(*train_data)
     elif model_name == 'XGBoostClassifier':
         from xgboost import XGBClassifier
         model = XGBClassifier(**model_params).fit(*train_data)
     elif model_name == 'RidgeClassifier':
         from sklearn.linear_model import RidgeClassifier
         model = RidgeClassifier(**model_params).fit(*train_data)
     elif model_name == 'LogisticRegression':
         from sklearn.linear_model import LogisticRegression
         model = LogisticRegression(**model_params).fit(*train_data)
     elif model_name == 'LDA':
         from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
         model = LinearDiscriminantAnalysis(**model_params).fit(
             X_train, y_train)
     elif model_name == 'QDA':
         from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
         model = QuadraticDiscriminantAnalysis(**model_params).fit(
             X_train, y_train)
     elif model_name == 'MLP':
         from sklearn.neural_network import MLPClassifier
         model = MLPClassifier(**model_params).fit(X_train, y_train)
     elif model_name == 'NearestCentroid':
         from sklearn.neighbors import NearestCentroid
         model = NearestCentroid(**model_params).fit(X_train, y_train)
     elif model_name == 'RadiusNeighborsClassifier':
         from sklearn.neighbors import RadiusNeighborsClassifier
         model = RadiusNeighborsClassifier(**model_params).fit(
             X_train, y_train)
     else:
         raise ValueError("unknown ML model passed in model_name")
     return model
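
# A hedged usage sketch; the params dict shape follows the keys read at the
# top of construct_model, and the metric value is illustrative:
params = {'name': 'NearestCentroid', 'params': {'metric': 'euclidean'}}
model = construct_model(X_train, y_train, params)
print(model.predict(X_train[:5]))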
Example #6
def get_estimators_list(dataset,
                        options,
                        use_imdb_multi_class_labels,
                        is_soft_voting=False,
                        is_stacking_classifier=False,
                        final_estimator=None):

    if is_stacking_classifier:
        ml_algorithm_list = [
            Classifier.COMPLEMENT_NB.name, Classifier.RIDGE_CLASSIFIER.name,
            Classifier.LINEAR_SVC.name, Classifier.LOGISTIC_REGRESSION.name,
            Classifier.PASSIVE_AGGRESSIVE_CLASSIFIER.name,
            Classifier.RANDOM_FOREST_CLASSIFIER.name
        ]
    else:  # is VotingClassifier
        if is_soft_voting:
            ml_algorithm_list = [
                Classifier.COMPLEMENT_NB.name,
                Classifier.LOGISTIC_REGRESSION.name,
                Classifier.MULTINOMIAL_NB.name,
                Classifier.RANDOM_FOREST_CLASSIFIER.name
            ]
        else:
            ml_algorithm_list = [
                Classifier.COMPLEMENT_NB.name,
                Classifier.RIDGE_CLASSIFIER.name, Classifier.LINEAR_SVC.name,
                Classifier.LOGISTIC_REGRESSION.name,
                Classifier.PASSIVE_AGGRESSIVE_CLASSIFIER.name,
                Classifier.RANDOM_FOREST_CLASSIFIER.name
            ]

    estimators_list = []

    if Classifier.ADA_BOOST_CLASSIFIER.name in ml_algorithm_list:
        json_with_best_parameters = get_json_with_best_parameters(
            dataset, Classifier.ADA_BOOST_CLASSIFIER,
            use_imdb_multi_class_labels)
        # adding options.random_state in the map
        json_with_best_parameters['random_state'] = options.random_state
        # create classifier with best parameters
        ada_boost_classifier = AdaBoostClassifier(**json_with_best_parameters)
        print('\t', ada_boost_classifier)
        estimators_list.append(('ada_boost_classifier', ada_boost_classifier))

    if Classifier.BERNOULLI_NB.name in ml_algorithm_list:
        json_with_best_parameters = get_json_with_best_parameters(
            dataset, Classifier.BERNOULLI_NB, use_imdb_multi_class_labels)
        # create classifier with best parameters
        bernoulli_nb = BernoulliNB(**json_with_best_parameters)
        print('\t', bernoulli_nb)
        estimators_list.append(('bernoulli_nb', bernoulli_nb))

    if Classifier.COMPLEMENT_NB.name in ml_algorithm_list:
        json_with_best_parameters = get_json_with_best_parameters(
            dataset, Classifier.COMPLEMENT_NB, use_imdb_multi_class_labels)
        # create classifier with best parameters
        complement_nb = ComplementNB(**json_with_best_parameters)
        print('\t', complement_nb)
        estimators_list.append(('complement_nb', complement_nb))

    if Classifier.DECISION_TREE_CLASSIFIER.name in ml_algorithm_list:
        json_with_best_parameters = get_json_with_best_parameters(
            dataset, Classifier.DECISION_TREE_CLASSIFIER,
            use_imdb_multi_class_labels)
        # adding options.random_state in the map
        json_with_best_parameters['random_state'] = options.random_state
        # create classifier with best parameters
        decision_tree_classifier = DecisionTreeClassifier(
            **json_with_best_parameters)
        print('\t', decision_tree_classifier)
        estimators_list.append(
            ('decision_tree_classifier', decision_tree_classifier))

    if Classifier.GRADIENT_BOOSTING_CLASSIFIER.name in ml_algorithm_list:
        json_with_best_parameters = get_json_with_best_parameters(
            dataset, Classifier.GRADIENT_BOOSTING_CLASSIFIER,
            use_imdb_multi_class_labels)
        # adding options.verbose in the map
        json_with_best_parameters['verbose'] = options.verbose
        # adding options.random_state in the map
        json_with_best_parameters['random_state'] = options.random_state
        # create classifier with best parameters
        gradient_boosting_classifier = GradientBoostingClassifier(
            **json_with_best_parameters)
        print('\t', gradient_boosting_classifier)
        estimators_list.append(
            ('gradient_boosting_classifier', gradient_boosting_classifier))

    if Classifier.K_NEIGHBORS_CLASSIFIER.name in ml_algorithm_list:
        json_with_best_parameters = get_json_with_best_parameters(
            dataset, Classifier.K_NEIGHBORS_CLASSIFIER,
            use_imdb_multi_class_labels)
        # adding options.n_jobs in the map
        json_with_best_parameters['n_jobs'] = options.n_jobs
        # create classifier with best parameters
        k_neighbors_classifier = KNeighborsClassifier(
            **json_with_best_parameters)
        print('\t', k_neighbors_classifier)
        estimators_list.append(
            ('k_neighbors_classifier', k_neighbors_classifier))

    if Classifier.LINEAR_SVC.name in ml_algorithm_list:
        json_with_best_parameters = get_json_with_best_parameters(
            dataset, Classifier.LINEAR_SVC, use_imdb_multi_class_labels)
        # adding options.verbose in the map
        json_with_best_parameters['verbose'] = options.verbose
        # adding options.random_state in the map
        json_with_best_parameters['random_state'] = options.random_state
        # create classifier with best parameters
        linear_svc = LinearSVC(**json_with_best_parameters)
        print('\t', linear_svc)
        estimators_list.append(('linear_svc', linear_svc))

    if Classifier.LOGISTIC_REGRESSION.name in ml_algorithm_list:
        json_with_best_parameters = get_json_with_best_parameters(
            dataset, Classifier.LOGISTIC_REGRESSION,
            use_imdb_multi_class_labels)
        # adding options.n_jobs in the map
        json_with_best_parameters['n_jobs'] = options.n_jobs
        # adding options.verbose in the map
        json_with_best_parameters['verbose'] = options.verbose
        # adding options.random_state in the map
        json_with_best_parameters['random_state'] = options.random_state
        # create classifier with best parameters
        logistic_regression = LogisticRegression(**json_with_best_parameters)
        print('\t', logistic_regression)
        estimators_list.append(('logistic_regression', logistic_regression))

    if Classifier.MULTINOMIAL_NB.name in ml_algorithm_list:
        json_with_best_parameters = get_json_with_best_parameters(
            dataset, Classifier.MULTINOMIAL_NB, use_imdb_multi_class_labels)
        # create classifier with best parameters
        multinomial_nb = MultinomialNB(**json_with_best_parameters)
        print('\t', multinomial_nb)
        estimators_list.append(('multinomial_nb', multinomial_nb))

    if Classifier.NEAREST_CENTROID.name in ml_algorithm_list:
        json_with_best_parameters = get_json_with_best_parameters(
            dataset, Classifier.NEAREST_CENTROID, use_imdb_multi_class_labels)
        # create classifier with best parameters
        nearest_centroid = NearestCentroid(**json_with_best_parameters)
        print('\t', nearest_centroid)
        estimators_list.append(('nearest_centroid', nearest_centroid))

    if Classifier.PASSIVE_AGGRESSIVE_CLASSIFIER.name in ml_algorithm_list:
        json_with_best_parameters = get_json_with_best_parameters(
            dataset, Classifier.PASSIVE_AGGRESSIVE_CLASSIFIER,
            use_imdb_multi_class_labels)
        # adding options.n_jobs in the map
        json_with_best_parameters['n_jobs'] = options.n_jobs
        # adding options.verbose in the map
        json_with_best_parameters['verbose'] = options.verbose
        # adding options.random_state in the map
        json_with_best_parameters['random_state'] = options.random_state
        # create classifier with best parameters
        passive_aggressive_classifier = PassiveAggressiveClassifier(
            **json_with_best_parameters)
        print('\t', passive_aggressive_classifier)
        estimators_list.append(
            ('passive_aggressive_classifier', passive_aggressive_classifier))

    if Classifier.PERCEPTRON.name in ml_algorithm_list:
        json_with_best_parameters = get_json_with_best_parameters(
            dataset, Classifier.PERCEPTRON, use_imdb_multi_class_labels)
        # adding options.n_jobs in the map
        json_with_best_parameters['n_jobs'] = options.n_jobs
        # adding options.verbose in the map
        json_with_best_parameters['verbose'] = options.verbose
        # adding options.random_state in the map
        json_with_best_parameters['random_state'] = options.random_state
        # create classifier with best parameters
        perceptron = Perceptron(**json_with_best_parameters)
        print('\t', perceptron)
        estimators_list.append(('perceptron', perceptron))

    if Classifier.RANDOM_FOREST_CLASSIFIER.name in ml_algorithm_list:
        json_with_best_parameters = get_json_with_best_parameters(
            dataset, Classifier.RANDOM_FOREST_CLASSIFIER,
            use_imdb_multi_class_labels)
        # adding options.n_jobs in the map
        json_with_best_parameters['n_jobs'] = options.n_jobs
        # adding options.verbose in the map
        json_with_best_parameters['verbose'] = options.verbose
        # adding options.random_state in the map
        json_with_best_parameters['random_state'] = options.random_state
        # create classifier with best parameters
        random_forest_classifier = RandomForestClassifier(
            **json_with_best_parameters)
        print('\t', random_forest_classifier)
        estimators_list.append(
            ('random_forest_classifier', random_forest_classifier))

    if Classifier.RIDGE_CLASSIFIER.name in ml_algorithm_list:
        json_with_best_parameters = get_json_with_best_parameters(
            dataset, Classifier.RIDGE_CLASSIFIER, use_imdb_multi_class_labels)
        # adding options.random_state in the map
        json_with_best_parameters['random_state'] = options.random_state
        # create classifier with best parameters
        ridge_classifier = RidgeClassifier(**json_with_best_parameters)
        print('\t', ridge_classifier)
        estimators_list.append(('ridge_classifier', ridge_classifier))

    if is_stacking_classifier:
        if final_estimator == Classifier.LINEAR_SVC.name:
            return estimators_list, linear_svc
        elif final_estimator == Classifier.LOGISTIC_REGRESSION.name:
            return estimators_list, logistic_regression
        elif final_estimator == Classifier.PASSIVE_AGGRESSIVE_CLASSIFIER.name:
            return estimators_list, passive_aggressive_classifier
        elif final_estimator == Classifier.RIDGE_CLASSIFIER.name:
            return estimators_list, ridge_classifier
        else:
            # Default
            return estimators_list, LinearSVC()

    return estimators_list
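
# A hedged consumption sketch: VotingClassifier is the natural consumer of
# the (name, estimator) pairs; dataset/options come from the surrounding
# project and are assumed here:
from sklearn.ensemble import VotingClassifier

estimators = get_estimators_list(dataset, options, use_imdb_multi_class_labels=False)
voting_clf = VotingClassifier(estimators=estimators, voting='hard', n_jobs=options.n_jobs)
voting_clf.fit(X_train, y_train)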
Example #7
def featureEng(train,
               name='NB1',
               n_comp=50,
               ngram_cv=3,
               ngram_tfidf=3,
               training=True):
    eng_stopwords = set(stopwords.words('english'))

    cls = [(RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
           (Perceptron(max_iter=50), "Perceptron"),
           (KNeighborsClassifier(n_neighbors=10), "kNN"),
           (RandomForestClassifier(n_estimators=10), "Random forest"),
           (LinearSVC(loss='squared_hinge', penalty='l1', dual=False,
                      tol=1e-3), "SVC-L1"),
           (LinearSVC(loss='squared_hinge', penalty='l2', dual=False,
                      tol=1e-3), "SVC-L2"),
           (SGDClassifier(alpha=.01, max_iter=50, penalty='l1'), 'SGD-L1'),
           (SGDClassifier(alpha=.01, max_iter=50, penalty='l2'), 'SGD-L2'),
           (SGDClassifier(alpha=.01, max_iter=50,
                          penalty='elasticnet'), 'SGD-ElasticNet'),
           (NearestCentroid(), 'Nearest centroid'),
           (MultinomialNB(alpha=.1), 'NB1'), (BernoulliNB(alpha=.1), 'NB2')]

    train['num_words'] = train['text'].apply(lambda x: len(str(x).split()))
    train['num_unique_words'] = train['text'].apply(
        lambda x: len(set(str(x).split())))
    train['num_chars'] = train['text'].apply(lambda x: len(str(x)))
    train['num_stopwords'] = train['text'].apply(lambda x: len(
        [w for w in str(x).lower().split() if w in eng_stopwords]))
    train['num_punctuations'] = train['text'].apply(
        lambda x: len([w for w in str(x) if w in string.punctuation]))
    train["num_words_upper"] = train["text"].apply(
        lambda x: len([w for w in str(x).split() if w.isupper()]))
    ## Number of title case words in the text ##
    train["num_words_title"] = train["text"].apply(
        lambda x: len([w for w in str(x).split() if w.istitle()]))
    ## Average length of the words in the text ##
    train["mean_word_len"] = train["text"].apply(
        lambda x: np.mean([len(w) for w in str(x).split()]))
    all_text_without_sw = ''
    for i in train.itertuples():
        all_text_without_sw = all_text_without_sw + str(i.text)
    #getting counts of each words:
    counts = Counter(re.findall(r"[\w']+", all_text_without_sw))
    #deleting ' from counts
    del counts["'"]
    #getting top 50 used words:
    sorted_x = dict(
        sorted(counts.items(), key=operator.itemgetter(1), reverse=True)[:300])

    #Feature-5: The count of top used words.
    train['num_top'] = train['text'].apply(
        lambda x: len([w for w in str(x).lower().split() if w in sorted_x]))

    #Similarly lets identify the least used words:
    reverted_x = dict(
        sorted(counts.items(), key=operator.itemgetter(1))[:10000])
    #Feature-6: The count of least used words.
    train['num_least'] = train['text'].apply(
        lambda x: len([w for w in str(x).lower().split() if w in reverted_x]))
    train['unique_word_fraction'] = train['text'].apply(
        lambda row: unique_word_fraction(row))
    train['stopwords_count'] = train['text'].apply(
        lambda row: stopwords_count(row))
    train['punctuations_fraction'] = train['text'].apply(
        lambda row: punctuations_fraction(row))
    train['char_count'] = train['text'].apply(lambda row: char_count(row))
    train['fraction_noun'] = train['text'].apply(
        lambda row: fraction_noun(row))
    train['fraction_adj'] = train['text'].apply(lambda row: fraction_adj(row))
    train['fraction_verbs'] = train['text'].apply(
        lambda row: fraction_verbs(row))
    train['sentiment_id'] = train['sentiment'].apply(
        lambda row: sentiment_mapping[row])
    if training:
        train['y1'] = train.apply(
            lambda row: process_data(row.text, row.selected_text)[0], axis=1)
        train['y2'] = train.apply(
            lambda row: process_data(row.text, row.selected_text)[1], axis=1)
    tfidf_vec = TfidfVectorizer(stop_words='english',
                                ngram_range=(1, ngram_tfidf))
    train_tfidf = tfidf_vec.fit_transform(train['text'].values.tolist())

    ###SVD on word TFIDF
    svd_obj = TruncatedSVD(n_components=n_comp, algorithm='randomized')
    svd_obj.fit(train_tfidf)
    train_svd = pd.DataFrame(svd_obj.transform(train_tfidf))

    train_svd.columns = ['svd_wordtfidf_' + str(i) for i in range(n_comp)]
    train = pd.concat([train, train_svd], axis=1)
    del train_tfidf, train_svd

    ### Fit transform the count vectorizer ###
    wordcv_vec = CountVectorizer(stop_words='english',
                                 ngram_range=(1, ngram_cv))
    train_vec = wordcv_vec.fit_transform(train['text'].values.tolist())

    ###SVD on word CountVectorizer
    svd_obj = TruncatedSVD(n_components=n_comp, algorithm='randomized')
    svd_obj.fit(train_vec)
    train_svd = pd.DataFrame(svd_obj.transform(train_vec))

    train_svd.columns = ['svd_wordcv_' + str(i) for i in range(n_comp)]
    train = pd.concat([train, train_svd], axis=1)
    del train_vec, train_svd

    charcv_vec = CountVectorizer(ngram_range=(1, ngram_cv), analyzer='char')
    train_vec = charcv_vec.fit_transform(train['text'].values.tolist())

    ###SVD on character CountVectorizer
    svd_obj = TruncatedSVD(n_components=n_comp, algorithm='randomized')
    svd_obj.fit(train_vec)
    train_svd = pd.DataFrame(svd_obj.transform(train_vec))

    train_svd.columns = ['svd_charcv_' + str(i) for i in range(n_comp)]
    train = pd.concat([train, train_svd], axis=1)
    del train_vec, train_svd

    return train
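
# A hedged usage sketch; the helper functions referenced above
# (unique_word_fraction, stopwords_count, ..., process_data) and
# sentiment_mapping are assumed to be defined elsewhere in the source script:
train_df = pd.read_csv('train.csv')  # expects 'text' and 'sentiment' columns (+ 'selected_text' when training)
train_df = featureEng(train_df, n_comp=50, training=True)
print(train_df.filter(like='svd_').shape)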
Example #8
def compare_models(X, Y):
    # (name, pipeline) pairs to evaluate; `preprocessor` and `seed` are
    # assumed to be defined at module level in the original script
    models = []
    models.append(('NCC',
                   Pipeline(steps=[
                       ('preprocessor', preprocessor),
                       ('clf', NearestCentroid()),
                   ], )))

    models.append(('PC',
                   Pipeline(steps=[
                       ('preprocessor', preprocessor),
                       ('clf', Perceptron()),
                   ], )))

    models.append(('NB',
                   Pipeline(steps=[
                       ('preprocessor', preprocessor),
                       ('clf', BernoulliNB(alpha=.001)),
                   ], )))

    models.append(
        ('SGD',
         Pipeline(steps=[
             ('preprocessor', preprocessor),
             ('clf', SGDClassifier(loss='modified_huber', max_iter=10000)),
         ], )))
    models.append(('KNN',
                   Pipeline(steps=[
                       ('preprocessor', preprocessor),
                       ('clf', KNeighborsClassifier()),
                   ], )))
    models.append(('SVM',
                   Pipeline(steps=[
                       ('preprocessor', preprocessor),
                       ('clf', LinearSVC(max_iter=10000)),
                   ], )))

    models.append(('LR',
                   Pipeline(steps=[
                       ('preprocessor', preprocessor),
                       ('clf', SGDClassifier(loss='log_loss', max_iter=10000)),
                   ], )))

    models.append(('DT/CART',
                   Pipeline(steps=[
                       ('preprocessor', preprocessor),
                       ('clf', DecisionTreeClassifier(max_depth=500)),
                   ], )))

    models.append(('RF',
                   Pipeline(steps=[
                       ('preprocessor', preprocessor),
                       ('clf', RandomForestClassifier(max_depth=500)),
                   ], )))

    # evaluate each model in turn
    results = []
    names = []
    scoring = 'accuracy'
    for name, model in models:
        kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
        cv_results = model_selection.cross_val_score(model,
                                                     X,
                                                     Y,
                                                     cv=kfold,
                                                     scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
    grad_boost = np.array(
        [0.73, 0.72, 0.69, 0.75, 0.78, 0.79, 0.80, 0.74, 0.76, 0.74])
    names.append('GB')
    results.append(grad_boost)
    # boxplot algorithm comparison
    fig = plt.figure()
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()
Example #9
    kernels = ['rbf', 'poly', 'linear', 'sigmoid']
    iterations = 3

    # fraction of correct predictions (equivalent to sklearn's accuracy_score)
    success_rate = lambda x, y: sum(x == y) * 1.0 / len(x)

    svc_kernels = {kernel: [] for kernel in kernels}
    svc_linear = []
    nn_all = []
    cn_all = []

    for i in range(iterations):

        for kernel in kernels:
            svc = SVC(kernel=kernel).fit(X_train, y_train).predict(X_test)
            svc_kernels[kernel].append(success_rate(svc, y_test))

        nn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train).predict(X_test)
        nn_all.append(success_rate(nn, y_test))

        cn = NearestCentroid().fit(X_train, y_train).predict(X_test)
        cn_all.append(success_rate(cn, y_test))

    for kernel in kernels:
        print(mean(svc_kernels[kernel]))

    print()

    print(mean(nn_all))
    print(mean(cn_all))

    # visualise(X_train + X_test, y_train + y_test)
Example #10
def main():

    # Checks for correct number of arguments
    if len(sys.argv) != 3:
        print(
            'usage: ./troll_identifier.py [TRAIN DATASET] [TEST/DEV DATASET]')
        sys.exit()

    # set up dataset
    data_train = pd.read_csv(sys.argv[1])
    data_test = pd.read_csv(sys.argv[2])

    print('train: {}'.format(sys.argv[1]))
    print('test: {}'.format(sys.argv[2]))

    x_train = data_train.drop(
        [data_train.columns[0], data_train.columns[1], data_train.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    y_train = pd.Series(data_train.iloc[:, -1])
    x_test = data_test.drop(
        [data_test.columns[0], data_test.columns[1], data_test.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    y_test = pd.Series(data_test.iloc[:, -1])

    type = int(input('type: [1: supervised, 2: semi-supervised, 3: unsupervised] '))
    if type == 1:
        method = int(input('method: [1: classification, 2: regression] '))
        if method == 1:
            classifier = int(input(
                'classifier: [1: decision tree, 2: extra tree, 3: extra trees, 4: k nearest neighbor, 5: naive bayes, 6: radius neighbors, 7: random forest, 8: support vector machine, 9: gradient boosting, 10: gaussian process, 11: stochastic gradient descent, 12: passive aggressive, 13: nearest centroid, 14: perceptron, 15: multi-layer perceptron, 16: ada boost] '
            ))
            if classifier == 1:
                criterion = int(input('criterion: [1: gini, 2: entropy] '))
                if criterion == 1:
                    print(type, method, classifier, criterion)
                    model = DecisionTreeClassifier(criterion='gini')
                elif criterion == 2:
                    print(type, method, classifier, criterion)
                    model = DecisionTreeClassifier(criterion='entropy')
                else:
                    print('no criterion chosen')
                    exit()
            elif classifier == 2:
                print(type, method, classifier)
                model = ExtraTreeClassifier()
            elif classifier == 3:
                print(type, method, classifier)
                model = ExtraTreesClassifier()
            elif classifier == 4:
                n = int(input('n: [1: 1, 2: 3, 3: 5] '))
                if n == 1:
                    print(type, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=1)
                elif n == 2:
                    print(type, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=3)
                elif n == 3:
                    print(type, method, classifier, n)
                    model = KNeighborsClassifier(n_neighbors=5)
                else:
                    print('no n chosen')
                    exit()
            elif classifier == 5:
                version = int(input(
                    'version: [1: gaussian, 2: bernoulli, 3: multinomial, 4: complement] '
                ))
                if version == 1:
                    print(type, method, classifier, version)
                    model = GaussianNB()
                elif version == 2:
                    print(type, method, classifier, version)
                    model = BernoulliNB()
                elif version == 3:
                    print(type, method, classifier, version)
                    model = MultinomialNB()
                elif version == 4:
                    print(type, method, classifier, version)
                    model = ComplementNB()
                else:
                    print('no version chosen')
                    exit()
            elif classifier == 6:
                print(type, method, classifier)
                model = RadiusNeighborsClassifier(radius=1.0)
            elif classifier == 7:
                print(type, method, classifier)
                model = RandomForestClassifier(n_estimators=50, random_state=1)
            elif classifier == 8:
                print(type, method, classifier)
                model = LinearSVC(
                    multi_class='crammer_singer')  #multi_class='ovr'
            elif classifier == 9:
                print(type, method, classifier)
                model = GradientBoostingClassifier()
            elif classifier == 10:
                print(type, method, classifier)
                model = GaussianProcessClassifier(multi_class='one_vs_one')
                # model = GaussianProcessClassifier(multi_class='one_vs_rest')
            elif classifier == 11:
                print(type, method, classifier)
                model = SGDClassifier()
            elif classifier == 12:
                print(type, method, classifier)
                model = PassiveAggressiveClassifier()
            elif classifier == 13:
                print(type, method, classifier)
                model = NearestCentroid()
            elif classifier == 14:
                print(type, method, classifier)
                model = Perceptron(tol=1e-3, random_state=0)
            elif classifier == 15:
                print(type, method, classifier)
                model = MLPClassifier()
            elif classifier == 16:
                print(type, method, classifier)
                model = AdaBoostClassifier(n_estimators=100)
            else:
                print('no classifier chosen')
                exit()
            # train the model using the training sets and check score
            model.fit(x_train, y_train)
            model.score(x_train, y_train)

            # predict output
            predictions = pd.Series(model.predict(x_test))

            filename = '{},{},{}.txt'.format(type, method, classifier)
            with open(filename, 'w') as output:
                output.write('{:10}\t{:10}\t{:10}\n'.format(
                    'actual', 'predict', 'match?'))
                for i in range(len(predictions)):
                    match = y_test[i] == predictions[i]
                    output.write('{:10}\t{:10}\t{:10}\n'.format(
                        y_test[i], predictions[i], match))
                output.write('accuracy: {:7.2f}%\n'.format(
                    100 * accuracy_score(y_test, predictions)))

            print('accuracy: {:7.2f}%'.format(
                100 * accuracy_score(y_test, predictions)))
            print(
                classification_report(
                    y_test,
                    predictions,
                    target_names=['RightTroll', 'LeftTroll', 'Other']))
            print(
                confusion_matrix(y_test,
                                 predictions,
                                 labels=["RightTroll", "LeftTroll", "Other"]))
        elif method == 2:
            # transform into binary classification problem
            # y_train = y_train.apply(lambda x: 0 if x == 'Other' else 1)
            # y_test = y_test.apply(lambda x: 0 if x == 'Other' else 1)

            # transform string labels into integers
            # le = LabelEncoder()
            # le.fit(y_train) # print(le.transform(['LeftTroll', 'Other', 'Other', 'RightTroll'])), print(le.inverse_transform([0, 1, 2, 1]))
            # print(le.classes_)
            #
            # y_train = le.transform(y_train)
            # y_test = le.transform(y_test)

            regressor = int(input(
                'regressor: [1: linear discriminant analysis, 2: logistic regression, 3: ridge regression, 4: quadratic discriminant analysis, 5: linear regression, 6: decision tree regression, 7: pls regression, 8: pls canonical, 9: canonical correlation analysis, 10: lasso, 11: multi-task lasso, 12: elastic net, 13: multi-task elastic net, 14: least angle regression, 15: least angle regression lasso, 16: orthogonal matching pursuit, 17: bayesian ridge, 18: automatic relevance determination, 19: theil sen regression, 20: huber regressor, 21: random sample consensus] '
            ))
            if regressor == 1:
                print(type, method, regressor)
                model = LinearDiscriminantAnalysis()
            elif regressor == 2:
                print(type, method, regressor)
                model = LogisticRegression(
                    solver='lbfgs', multi_class='multinomial')  #'newton-cg'
            elif regressor == 3:
                print(type, method, regressor)
                model = RidgeClassifier()
            elif regressor == 4:
                print(type, method, regressor)
                model = QuadraticDiscriminantAnalysis()
            elif regressor == 5:
                strategy = int(input('strategy: [1: one vs rest, 2: one vs one] '))
                if strategy == 1:
                    print(type, method, strategy, regressor)
                    model = OneVsRestClassifier(LinearRegression())
                elif strategy == 2:
                    print(type, method, strategy, regressor)
                    model = OneVsOneClassifier(LinearRegression())
                else:
                    print('no strategy selected')
                    exit()
            elif regressor == 6:
                strategy = int(input('strategy: [1: one vs rest, 2: one vs one] '))
                if strategy == 1:
                    print(type, method, strategy, regressor)
                    model = OneVsRestClassifier(DecisionTreeRegressor())
                elif strategy == 2:
                    print(type, method, strategy, regressor)
                    model = OneVsOneClassifier(DecisionTreeRegressor())
                else:
                    print('no strategy selected')
                    exit()
            elif regressor == 7:
                print(type, method, regressor)
                model = PLSRegression(n_components=2)
            elif regressor == 8:
                print(type, method, regressor)
                model = PLSCanonical(n_components=2)
            elif regressor == 9:
                print(type, method, regressor)
                model = CCA(n_components=1)
            elif regressor == 10:
                print(type, method, regressor)
                model = Lasso(alpha=0.1)
            elif regressor == 11:
                print(type, method, regressor)
                model = MultiTaskLasso(alpha=0.1)
            elif regressor == 12:
                print(type, method, regressor)
                model = ElasticNet(random_state=0)
            elif regressor == 13:
                print(type, method, regressor)
                model = MultiTaskElasticNet(random_state=0)
            elif regressor == 14:
                print(type, method, regressor)
                model = Lars(n_nonzero_coefs=1)
            elif regressor == 15:
                print(type, method, regressor)
                model = LassoLars(alpha=.1)
            elif regressor == 16:
                print(type, method, regressor)
                model = OrthogonalMatchingPursuit()
            elif regressor == 17:
                print(type, method, regressor)
                model = BayesianRidge()
            elif regressor == 18:
                print(type, method, regressor)
                model = ARDRegression()
            elif regressor == 19:
                print(type, method, regressor)
                model = TheilSenRegressor(random_state=0)
            elif regressor == 20:
                print(type, method, regressor)
                model = HuberRegressor()
            elif regressor == 21:
                print(type, method, regressor)
                model = RANSACRegressor(random_state=0)
            else:
                print('no regressor chosen')
                exit()

            # train the model using the training sets and check score
            model.fit(x_train, y_train)
            model.score(x_train, y_train)

            # print('coefficient:', model.coef_)
            # print('intercept:', model.intercept_)

            # predict output
            predictions = pd.Series(model.predict(x_test))
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))

            # calculate accuracy
            numerator = 0.0
            denominator = float(len(predictions))
            for i in range(len(predictions)):
                match = y_test[i] == predictions[i]
                numerator += 1 if match else 0
                print('{:10}\t{:10}\t{:10}'.format(y_test[i], predictions[i],
                                                   match))
            print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))

        else:
            print('no method chosen')
            exit()
    elif type == 2:
        classifier = int(input(
            'classifier: [1: label propagation, 2: label spreading] '))
        if classifier == 1:
            print(type, classifier)
            model = LabelPropagation()
        elif classifier == 2:
            print(type, classifier)
            model = LabelSpreading()
        else:
            print('no classifier chosen')
            exit()
        # train the model using the training sets and check score
        model.fit(x_train, y_train)
        model.score(x_train, y_train)

        # predict output
        predictions = pd.Series(model.predict(x_test))
        print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))

        # calculate accuracy
        numerator = 0.0
        denominator = float(len(predictions))
        for i in range(len(predictions)):
            match = y_test[i] == predictions[i]
            numerator += 1 if match else 0
            print('{:10}\t{:10}\t{:10}'.format(y_test[i], predictions[i],
                                               match))
        print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))
    elif type == 3:
        method = int(input(
            'method: [1: clustering, 2: random trees embedding, 3: nearest neighbors] '
        ))
        if method == 1:
            clusterer = int(input('clusterer: [1: k means] '))
            if clusterer == 1:
                clusters = int(input('clusters: [1: 1, 2: 2, 3: 3] '))
                if clusters == 1:
                    print(type, method, clusters)
                    model = KMeans(n_clusters=1, random_state=0)
                elif clusters == 2:
                    print(type, method, clusters)
                    model = KMeans(n_clusters=2, random_state=0)
                elif clusters == 3:
                    print(type, method, clusters)
                    model = KMeans(n_clusters=3, random_state=0)
                else:
                    print('no clusters chosen')
                    exit()
            else:
                print('no clusterer chosen')
                exit()
            # train the model using the training sets and check score
            model.fit(x_train)

            # predict output
            predictions = model.predict(x_test)
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))

            # check details
            print('centroids:', model.cluster_centers_)
            # print('labels: ' + model.labels_)
        elif method == 2:
            model = RandomTreesEmbedding()
            # train the model using the training sets and check score
            model.fit(x_train)

            # predict output
            predictions = model.apply(x_test)
            print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))
        elif method == 3:
            model = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')
            # train the model using the training sets and check score
            model.fit(x_train)
            distances, indices = model.kneighbors(x_test)

        else:
            print('no method chosen')
            exit()

        # calculate accuracy
        numerator = 0.0
        denominator = float(len(predictions))
        for i in range(len(predictions)):
            match = y_test[i] == predictions[i]
            numerator += 1 if match else 0
            print('{:10}\t{:10}\t{:10}'.format(y_test[i], predictions[i],
                                               match))
        print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))
    else:
        print('no type chosen')
        exit()
Example #11
def create_nearest_centroid(preprocessor, x_train, y_train):
    model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', NearestCentroid())])
    model.fit(x_train, y_train)
    return model
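
# A hedged usage sketch; any sklearn transformer can serve as the
# preprocessor (StandardScaler is used here purely for illustration):
from sklearn.preprocessing import StandardScaler

model = create_nearest_centroid(StandardScaler(), x_train, y_train)
print(model.score(x_test, y_test))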
              'Jump sideways', 'Jump leg/arms open/closed', 'Jump rope',
              'Trunk twist (arms outstretched)', 'Trunk twist (elbows bent)',
              'Waist bends forward', 'Waist rotation',
              'Waist bends (reach foot with opposite hand)', 'Reach heels backwards',
              'Lateral bend', 'Lateral bend with arm up', 'Repetitive forward stretching',
              'Upper trunk and lower body opposite twist', 'Lateral elevation of arms',
              'Frontal elevation of arms', 'Frontal hand claps', 'Frontal crossing of arms',
              'Shoulders high-amplitude rotation', 'Shoulders low-amplitude rotation',
              'Arms inner rotation', 'Knees (alternating) to the breast',
              'Heels (alternatively) to the backside', 'Knees bending (crouching)',
              'Knees (alternating) bending forward', 'Rotation on the knees',
              'Rowing', 'Elliptical bike', 'Cycling']

models = {'DT': DecisionTreeClassifier(criterion='entropy'), 'NB': GaussianNB(),
          'NCC': NearestCentroid(), "KNN": KNeighborsClassifier(n_neighbors=3)}
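
# A hedged evaluation sketch for the models dict above; a feature matrix X
# and label vector y are assumed from the surrounding script:
from sklearn.model_selection import cross_val_score

for model_name, clf in models.items():
    scores = cross_val_score(clf, X, y, cv=5)
    print(f"{model_name}: {scores.mean():.3f} (+/- {scores.std():.3f})")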


def per_class_classification(file, cv_type='iid', overlap=False):
    if overlap:
        overlap_path = 'overlap'
    else:
        overlap_path = 'nonoverlap'

    file_name = os.path.basename(os.path.splitext(file)[0])
    fs = os.path.basename(os.path.dirname(file))
    print(fs)
    win_size = (file_name[7:])
    print(str(win_size))
    dataset = pd.read_csv(file, sep='\t')
    groups = dataset.iloc[:, 1]
        benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty),
                  use_tfidf=False))

# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(
    benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet")))
results_count.append(
    benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet"),
              use_tfidf=False))

# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))
results_count.append(benchmark(NearestCentroid(), use_tfidf=False))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))
results.append(benchmark(ComplementNB(alpha=.1)))

results_count.append(benchmark(MultinomialNB(alpha=.01), use_tfidf=False))
results_count.append(benchmark(BernoulliNB(alpha=.01), use_tfidf=False))
results_count.append(benchmark(ComplementNB(alpha=.1), use_tfidf=False))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
def get_ml_algorithm_pair_list(options, ml_algorithm_list,
                               use_classifiers_with_default_parameters,
                               use_imdb_multi_class_labels, dataset):
    ml_final_list = []

    if Classifier.ADA_BOOST_CLASSIFIER.name in ml_algorithm_list:
        if use_classifiers_with_default_parameters:
            ml_final_list.append(
                (AdaBoostClassifier(random_state=options.random_state),
                 Classifier.ADA_BOOST_CLASSIFIER))
        else:
            json_with_best_parameters = get_json_with_best_parameters(
                dataset, Classifier.ADA_BOOST_CLASSIFIER,
                use_imdb_multi_class_labels)
            # adding options.random_state in the map
            json_with_best_parameters['random_state'] = options.random_state

            # create classifier with best parameters
            classifier_with_best_parameters = AdaBoostClassifier(
                **json_with_best_parameters)
            print('\t', classifier_with_best_parameters)

            ml_final_list.append((classifier_with_best_parameters,
                                  Classifier.ADA_BOOST_CLASSIFIER))

    if Classifier.BERNOULLI_NB.name in ml_algorithm_list:
        if use_classifiers_with_default_parameters:
            ml_final_list.append((BernoulliNB(), Classifier.BERNOULLI_NB))
        else:
            json_with_best_parameters = get_json_with_best_parameters(
                dataset, Classifier.BERNOULLI_NB, use_imdb_multi_class_labels)

            # create classifier with best parameters
            classifier_with_best_parameters = BernoulliNB(
                **json_with_best_parameters)
            print('\t', classifier_with_best_parameters)

            ml_final_list.append(
                (classifier_with_best_parameters, Classifier.BERNOULLI_NB))

    if Classifier.COMPLEMENT_NB.name in ml_algorithm_list:
        if use_classifiers_with_default_parameters:
            ml_final_list.append((ComplementNB(), Classifier.COMPLEMENT_NB))
        else:
            json_with_best_parameters = get_json_with_best_parameters(
                dataset, Classifier.COMPLEMENT_NB, use_imdb_multi_class_labels)

            # create classifier with best parameters
            classifier_with_best_parameters = ComplementNB(
                **json_with_best_parameters)
            print('\t', classifier_with_best_parameters)

            ml_final_list.append(
                (classifier_with_best_parameters, Classifier.COMPLEMENT_NB))

    if Classifier.DECISION_TREE_CLASSIFIER.name in ml_algorithm_list:
        if use_classifiers_with_default_parameters:
            ml_final_list.append(
                (DecisionTreeClassifier(random_state=options.random_state),
                 Classifier.DECISION_TREE_CLASSIFIER))
        else:
            json_with_best_parameters = get_json_with_best_parameters(
                dataset, Classifier.DECISION_TREE_CLASSIFIER,
                use_imdb_multi_class_labels)
            # adding options.random_state in the map
            json_with_best_parameters['random_state'] = options.random_state

            # create classifier with best parameters
            classifier_with_best_parameters = DecisionTreeClassifier(
                **json_with_best_parameters)
            print('\t', classifier_with_best_parameters)

            ml_final_list.append((classifier_with_best_parameters,
                                  Classifier.DECISION_TREE_CLASSIFIER))

    if Classifier.GRADIENT_BOOSTING_CLASSIFIER.name in ml_algorithm_list:
        if use_classifiers_with_default_parameters:
            ml_final_list.append(
                (GradientBoostingClassifier(verbose=options.verbose,
                                            random_state=options.random_state),
                 Classifier.GRADIENT_BOOSTING_CLASSIFIER))
        else:
            json_with_best_parameters = get_json_with_best_parameters(
                dataset, Classifier.GRADIENT_BOOSTING_CLASSIFIER,
                use_imdb_multi_class_labels)
            # adding options.verbose in the map
            json_with_best_parameters['verbose'] = options.verbose
            # adding options.random_state in the map
            json_with_best_parameters['random_state'] = options.random_state

            # create classifier with best parameters
            classifier_with_best_parameters = GradientBoostingClassifier(
                **json_with_best_parameters)
            print('\t', classifier_with_best_parameters)

            ml_final_list.append((classifier_with_best_parameters,
                                  Classifier.GRADIENT_BOOSTING_CLASSIFIER))

    if Classifier.K_NEIGHBORS_CLASSIFIER.name in ml_algorithm_list:
        if use_classifiers_with_default_parameters:
            ml_final_list.append((KNeighborsClassifier(n_jobs=options.n_jobs),
                                  Classifier.K_NEIGHBORS_CLASSIFIER))
        else:
            json_with_best_parameters = get_json_with_best_parameters(
                dataset, Classifier.K_NEIGHBORS_CLASSIFIER,
                use_imdb_multi_class_labels)
            # adding options.n_jobs in the map
            json_with_best_parameters['n_jobs'] = options.n_jobs

            # create classifier with best parameters
            classifier_with_best_parameters = KNeighborsClassifier(
                **json_with_best_parameters)
            print('\t', classifier_with_best_parameters)

            ml_final_list.append((classifier_with_best_parameters,
                                  Classifier.K_NEIGHBORS_CLASSIFIER))

    if Classifier.LINEAR_SVC.name in ml_algorithm_list:
        if use_classifiers_with_default_parameters:
            ml_final_list.append((LinearSVC(verbose=options.verbose,
                                            random_state=options.random_state),
                                  Classifier.LINEAR_SVC))
        else:
            json_with_best_parameters = get_json_with_best_parameters(
                dataset, Classifier.LINEAR_SVC, use_imdb_multi_class_labels)
            # adding options.verbose in the map
            json_with_best_parameters['verbose'] = options.verbose
            # adding options.random_state in the map
            json_with_best_parameters['random_state'] = options.random_state

            # create classifier with best parameters
            classifier_with_best_parameters = LinearSVC(
                **json_with_best_parameters)
            print('\t', classifier_with_best_parameters)

            ml_final_list.append(
                (classifier_with_best_parameters, Classifier.LINEAR_SVC))

    if Classifier.LOGISTIC_REGRESSION.name in ml_algorithm_list:
        if use_classifiers_with_default_parameters:
            ml_final_list.append(
                (LogisticRegression(n_jobs=options.n_jobs,
                                    verbose=options.verbose,
                                    random_state=options.random_state),
                 Classifier.LOGISTIC_REGRESSION))
        else:
            json_with_best_parameters = get_json_with_best_parameters(
                dataset, Classifier.LOGISTIC_REGRESSION,
                use_imdb_multi_class_labels)
            # adding options.n_jobs in the map
            json_with_best_parameters['n_jobs'] = options.n_jobs
            # adding options.verbose in the map
            json_with_best_parameters['verbose'] = options.verbose
            # adding options.random_state in the map
            json_with_best_parameters['random_state'] = options.random_state

            # create classifier with best parameters
            classifier_with_best_parameters = LogisticRegression(
                **json_with_best_parameters)
            print('\t', classifier_with_best_parameters)

            ml_final_list.append((classifier_with_best_parameters,
                                  Classifier.LOGISTIC_REGRESSION))

    if Classifier.MULTINOMIAL_NB.name in ml_algorithm_list:
        if use_classifiers_with_default_parameters:
            ml_final_list.append((MultinomialNB(), Classifier.MULTINOMIAL_NB))
        else:
            json_with_best_parameters = get_json_with_best_parameters(
                dataset, Classifier.MULTINOMIAL_NB,
                use_imdb_multi_class_labels)

            # create classifier with best parameters
            classifier_with_best_parameters = MultinomialNB(
                **json_with_best_parameters)
            print('\t', classifier_with_best_parameters)

            ml_final_list.append(
                (classifier_with_best_parameters, Classifier.MULTINOMIAL_NB))

    if Classifier.NEAREST_CENTROID.name in ml_algorithm_list:
        if use_classifiers_with_default_parameters:
            ml_final_list.append(
                (NearestCentroid(), Classifier.NEAREST_CENTROID))
        else:
            json_with_best_parameters = get_json_with_best_parameters(
                dataset, Classifier.NEAREST_CENTROID,
                use_imdb_multi_class_labels)

            # create classifier with best parameters
            classifier_with_best_parameters = NearestCentroid(
                **json_with_best_parameters)
            print('\t', classifier_with_best_parameters)

            ml_final_list.append(
                (classifier_with_best_parameters, Classifier.NEAREST_CENTROID))

    if Classifier.PASSIVE_AGGRESSIVE_CLASSIFIER.name in ml_algorithm_list:
        if use_classifiers_with_default_parameters:
            ml_final_list.append((PassiveAggressiveClassifier(
                n_jobs=options.n_jobs,
                verbose=options.verbose,
                random_state=options.random_state),
                                  Classifier.PASSIVE_AGGRESSIVE_CLASSIFIER))
        else:
            json_with_best_parameters = get_json_with_best_parameters(
                dataset, Classifier.PASSIVE_AGGRESSIVE_CLASSIFIER,
                use_imdb_multi_class_labels)
            # adding options.n_jobs in the map
            json_with_best_parameters['n_jobs'] = options.n_jobs
            # adding options.verbose in the map
            json_with_best_parameters['verbose'] = options.verbose
            # adding options.random_state in the map
            json_with_best_parameters['random_state'] = options.random_state

            # create classifier with best parameters
            classifier_with_best_parameters = PassiveAggressiveClassifier(
                **json_with_best_parameters)
            print('\t', classifier_with_best_parameters)

            ml_final_list.append((classifier_with_best_parameters,
                                  Classifier.PASSIVE_AGGRESSIVE_CLASSIFIER))

    if Classifier.PERCEPTRON.name in ml_algorithm_list:
        if use_classifiers_with_default_parameters:
            ml_final_list.append(
                (Perceptron(n_jobs=options.n_jobs,
                            verbose=options.verbose,
                            random_state=options.random_state),
                 Classifier.PERCEPTRON))
        else:
            json_with_best_parameters = get_json_with_best_parameters(
                dataset, Classifier.PERCEPTRON, use_imdb_multi_class_labels)
            # adding options.n_jobs in the map
            json_with_best_parameters['n_jobs'] = options.n_jobs
            # adding options.verbose in the map
            json_with_best_parameters['verbose'] = options.verbose
            # adding options.random_state in the map
            json_with_best_parameters['random_state'] = options.random_state

            # create classifier with best parameters
            classifier_with_best_parameters = Perceptron(
                **json_with_best_parameters)
            print('\t', classifier_with_best_parameters)

            ml_final_list.append(
                (classifier_with_best_parameters, Classifier.PERCEPTRON))

    if Classifier.RANDOM_FOREST_CLASSIFIER.name in ml_algorithm_list:
        if use_classifiers_with_default_parameters:
            ml_final_list.append(
                (RandomForestClassifier(n_jobs=options.n_jobs,
                                        verbose=options.verbose,
                                        random_state=options.random_state),
                 Classifier.RANDOM_FOREST_CLASSIFIER))
        else:
            json_with_best_parameters = get_json_with_best_parameters(
                dataset, Classifier.RANDOM_FOREST_CLASSIFIER,
                use_imdb_multi_class_labels)
            # adding options.n_jobs in the map
            json_with_best_parameters['n_jobs'] = options.n_jobs
            # adding options.verbose in the map
            json_with_best_parameters['verbose'] = options.verbose
            # adding options.random_state in the map
            json_with_best_parameters['random_state'] = options.random_state

            # create classifier with best parameters
            classifier_with_best_parameters = RandomForestClassifier(
                **json_with_best_parameters)
            print('\t', classifier_with_best_parameters)

            ml_final_list.append((classifier_with_best_parameters,
                                  Classifier.RANDOM_FOREST_CLASSIFIER))

    if Classifier.RIDGE_CLASSIFIER.name in ml_algorithm_list:
        if use_classifiers_with_default_parameters:
            ml_final_list.append(
                (RidgeClassifier(random_state=options.random_state),
                 Classifier.RIDGE_CLASSIFIER))
        else:
            json_with_best_parameters = get_json_with_best_parameters(
                dataset, Classifier.RIDGE_CLASSIFIER,
                use_imdb_multi_class_labels)
            # adding options.random_state in the map
            json_with_best_parameters['random_state'] = options.random_state

            # create classifier with best parameters
            classifier_with_best_parameters = RidgeClassifier(
                **json_with_best_parameters)
            print('\t', classifier_with_best_parameters)

            ml_final_list.append(
                (classifier_with_best_parameters, Classifier.RIDGE_CLASSIFIER))

    if Classifier.MAJORITY_VOTING_CLASSIFIER.name in ml_algorithm_list:
        estimators_list = get_estimators_list(dataset,
                                              options,
                                              use_imdb_multi_class_labels,
                                              is_soft_voting=False,
                                              is_stacking_classifier=False)

        classifier_with_best_parameters = VotingClassifier(
            estimators=estimators_list,
            voting='hard',  # voting='hard' means majority voting
            n_jobs=options.n_jobs)
        print('\t', classifier_with_best_parameters)

        ml_final_list.append((classifier_with_best_parameters,
                              Classifier.MAJORITY_VOTING_CLASSIFIER))

    if Classifier.SOFT_VOTING_CLASSIFIER.name in ml_algorithm_list:
        estimators_list = get_estimators_list(dataset,
                                              options,
                                              use_imdb_multi_class_labels,
                                              is_soft_voting=True,
                                              is_stacking_classifier=False)

        classifier_with_best_parameters = VotingClassifier(
            estimators=estimators_list,
            voting='soft',
            # voting='soft' predicts the class label based on the argmax of the sums of the predicted probabilities
            n_jobs=options.n_jobs)
        print('\t', classifier_with_best_parameters)

        ml_final_list.append((classifier_with_best_parameters,
                              Classifier.SOFT_VOTING_CLASSIFIER))

    if Classifier.STACKING_CLASSIFIER.name in ml_algorithm_list:
        estimators_list, final_estimator = get_estimators_list(
            dataset,
            options,
            use_imdb_multi_class_labels,
            is_stacking_classifier=True,
            final_estimator=Classifier.LINEAR_SVC.name)

        classifier_with_best_parameters = StackingClassifier(
            estimators=estimators_list,
            final_estimator=final_estimator,
            verbose=options.verbose,
            n_jobs=options.n_jobs)
        print('\t', classifier_with_best_parameters)

        ml_final_list.append(
            (classifier_with_best_parameters, Classifier.STACKING_CLASSIFIER))

    return ml_final_list
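
# get_json_with_best_parameters() and get_estimators_list() are
# project-local helpers defined elsewhere. As a rough orientation only,
# here is a hypothetical sketch of the JSON-loading helper; the directory
# layout and file naming are assumptions, not this project's actual code:
import json
import os

def get_json_with_best_parameters_sketch(dataset, classifier,
                                         use_imdb_multi_class_labels):
    # one JSON file of tuned hyper-parameters per (dataset, classifier)
    # pair, e.g. produced by a prior grid search
    suffix = '_multi_class' if use_imdb_multi_class_labels else ''
    path = os.path.join('best_parameters', str(dataset),
                        classifier.name.lower() + suffix + '.json')
    with open(path) as json_file:
        return json.load(json_file)

# A hypothetical call of get_ml_algorithm_pair_list (options is assumed to
# be an argparse-style namespace with n_jobs, verbose and random_state):
# ml_list = get_ml_algorithm_pair_list(options, [Classifier.LINEAR_SVC.name],
#                                      True, False, dataset='IMDB')
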
Example #15

import time as tm
from pathlib import Path

import numpy as np
import pandas as pd
import rpy2.robjects as robjects
from sklearn.neighbors import NearestCentroid
def run_NMC(DataPath,
            LabelsPath,
            CV_RDataPath,
            OutputDir,
            GeneOrderPath="",
            NumGenes=0):
    '''
    run baseline classifier: NMC
    Wrapper script to run an NMC classifier on a benchmark dataset with 5-fold cross-validation;
    outputs lists of true and predicted cell labels as csv files, as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with unique cell barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the gene order for each cross-validation fold; default is an empty string.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''

    # read the RData file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int')
    col = col - 1  # R indices are 1-based; shift to Python's 0-based indexing
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath,
                         header=0,
                         index_col=None,
                         sep=',',
                         usecols=col)

    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file
    if NumGenes > 0:
        features = pd.read_csv(GeneOrderPath,
                               header=0,
                               index_col=None,
                               sep=',')

    # log-transform the data: log(1 + x)
    data = np.log1p(data)

    Classifier = NearestCentroid()

    tr_time = []
    ts_time = []
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        # fold indices come from R and are 1-based
        test_ind_i = np.array(test_ind[i], dtype='int') - 1
        train_ind_i = np.array(train_ind[i], dtype='int') - 1

        train = data.iloc[train_ind_i]
        test = data.iloc[test_ind_i]
        y_train = labels.iloc[train_ind_i]
        y_test = labels.iloc[test_ind_i]

        if NumGenes > 0:
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]

        start = tm.time()
        Classifier.fit(train, y_train)
        tr_time.append(tm.time() - start)

        start = tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time() - start)

        truelab.extend(y_test.values)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    OutputDir = Path(OutputDir)

    truelab.to_csv(OutputDir / "NMC_true.csv", index=False)
    pred.to_csv(OutputDir / "NMC_pred.csv", index=False)
    tr_time.to_csv(OutputDir / "NMC_training_time.csv", index=False)
    ts_time.to_csv(OutputDir / "NMC_test_time.csv", index=False)
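
# A hypothetical invocation of run_NMC; every path below is a placeholder:
# run_NMC(DataPath="data/counts.csv",
#         LabelsPath="data/labels.csv",
#         CV_RDataPath="data/CV_folds.RData",
#         OutputDir="results/NMC",
#         GeneOrderPath="data/rank_genes.csv",
#         NumGenes=500)
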
Example #16

import csv
import datetime
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import (KNeighborsClassifier, NearestCentroid,
                               RadiusNeighborsClassifier)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

# do_cross_validation is a project-local helper defined elsewhere.
def do_classification(clm, data_fname, clm_type):
    d_name = "output"
    if not os.path.isdir(d_name):
        os.mkdir(d_name)

    fname = os.path.basename(data_fname).replace('.csv', '')
    fn = 'result_' + fname + "_type" + str(clm_type) + "_" +\
        datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '.csv'
    csv_out_fname = os.path.join(d_name, fn)
    fi = open(csv_out_fname, 'w')
    csv_out = csv.writer(fi, delimiter=',')

    # Create dataframe for training
    base_df = pd.read_csv(data_fname)
    df = base_df[clm]

    df = df[df['heartRate'] > 40]
    df = df[df['skinTemperature'] > 10]
    df = df[df['met'] > 0.4]

    X_train = df[clm[:-2]]
    Y_train = [df[clm[-2]], df[clm[-1]]]

    # Model: Decision Tree
    ML_NAME = 'Decision Tree'
    depth_list = np.concatenate(
        (np.arange(1, 10), np.arange(10, 20, 2), np.arange(20, 50, 5),
         np.arange(50, 100, 10), np.arange(150, 1000, 50)))
    for t in [0, 1]:
        for depth in depth_list:
            # keep only the non-default parameters; presort and
            # min_impurity_split were removed in recent scikit-learn versions
            clf = DecisionTreeClassifier(criterion='entropy',
                                         max_depth=depth)
            do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t, csv_out,
                                fname, depth)

    # Model: Extra Tree Classifier
    ML_NAME = 'Extremely randomized tree classifier'
    for t in [0, 1]:
        clf = ExtraTreeClassifier()
        do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t, csv_out,
                            fname, 0)

    # Model: Gaussian
    ML_NAME = 'Gaussian Naive Bayes'

    for t in [0, 1]:
        clf = GaussianNB(priors=None)
        do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t, csv_out,
                            fname, 0)

    # Model: Multivariate Bernoulli Model
    ML_NAME = 'Multivariate Bernoulli Model'
    alphas = np.concatenate(
        (np.arange(0.1, 1, 0.2), np.arange(1, 10), np.arange(10, 20, 2),
         np.arange(20, 50, 5), np.arange(50, 150, 10)))

    for t in [0, 1]:
        for a in alphas:
            clf = BernoulliNB(alpha=a)
            do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t, csv_out,
                                fname, a)

    # Model: AdaBoost Classifier
    ML_NAME = 'AdaBoost classifier'
    noestimator = np.arange(5, 1000, 20)

    for t in [0, 1]:
        for n in noestimator:
            clf = AdaBoostClassifier(algorithm='SAMME.R',
                                     base_estimator=None,
                                     learning_rate=0.1,
                                     n_estimators=n,
                                     random_state=None)
            do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t, csv_out,
                                fname, n)

    # Model: Gradient Boosting Classifier
    ML_NAME = 'Gradient Boosting Classifier'
    noestimator = np.arange(5, 1000, 20)

    for t in [0, 1]:
        for n in noestimator:
            # keep only the non-default parameters; presort,
            # min_impurity_split and loss='deviance' were removed in
            # recent scikit-learn versions (the old 'deviance' loss is
            # now the default 'log_loss')
            clf = GradientBoostingClassifier(learning_rate=0.1,
                                             max_depth=4,
                                             n_estimators=n)
            do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t, csv_out,
                                fname, n)

    # Model: Random Forest Classifier
    ML_NAME = 'Random Forest Classifier'
    noestimator = np.concatenate(
        (np.arange(1, 10), np.arange(10, 20, 2), np.arange(20, 50, 5),
         np.arange(50, 150, 10), np.arange(150, 1000, 50)))

    for t in [0, 1]:
        for n in noestimator:
            clf = RandomForestClassifier(n_estimators=n)
            do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t, csv_out,
                                fname, n)

    # Model: Support Vector Machines - RBF
    ML_NAME = 'Support Vector Machines - RBF'
    c_values = np.concatenate(
        (np.arange(0.1, 1, 0.2), np.arange(1, 10), np.arange(10, 20, 2),
         np.arange(20, 50, 5), np.arange(50, 150, 10)))

    for t in [0, 1]:
        for c in c_values:
            clf = SVC(C=c, kernel='rbf')
            do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t, csv_out,
                                fname, c)

    # Model: Support Vector Machines - poly
    ML_NAME = 'Support Vector Machines - poly'
    c_values = np.concatenate(
        (np.arange(0.1, 1, 0.2), np.arange(1, 10), np.arange(10, 20, 2),
         np.arange(20, 50, 5), np.arange(50, 150, 10)))

    for t in [0, 1]:
        for c in c_values:
            clf = SVC(C=c, kernel='poly')
            do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t, csv_out,
                                fname, c)

    # Model: Support Vector Machines - Sigmoid
    ML_NAME = 'Support Vector Machines - Sigmoid'
    c_values = np.concatenate(
        (np.arange(0.1, 1, 0.2), np.arange(1, 10), np.arange(10, 20, 2),
         np.arange(20, 50, 5), np.arange(50, 150, 10)))

    for t in [0, 1]:
        for c in c_values:
            clf = SVC(C=c, kernel='sigmoid')
            do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t, csv_out,
                                fname, c)

    # Model: Support Vector Machines - Linear
    ML_NAME = 'Support Vector Machines - Linear'
    c_values = np.concatenate(
        (np.arange(0.1, 1, 0.2), np.arange(1, 10), np.arange(10, 20, 2),
         np.arange(20, 50, 5), np.arange(50, 150, 10)))

    for t in [0, 1]:
        for c in c_values:
            clf = SVC(C=c, kernel='linear')
            do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t, csv_out,
                                fname, c)

    # Model: KNeighborsClassifier
    ML_NAME = 'KNeighborsClassifier'
    n_neighbors = np.concatenate(
        (np.arange(1, 10), np.arange(10, 20,
                                     2), np.arange(20, 50,
                                                   5), np.arange(50, 150, 10)))

    for t in [0, 1]:
        for n in n_neighbors:
            try:
                clf = KNeighborsClassifier(n_neighbors=n)
                do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t,
                                    csv_out, fname, n)
            except Exception:
                # n may exceed the number of training samples for some folds
                pass

    # Model: Radius Neighbors Classifier
    ML_NAME = 'Radius Neighbors Classifier'
    n_neighbors = np.concatenate(
        (np.arange(1, 10), np.arange(10, 20,
                                     2), np.arange(20, 50,
                                                   5), np.arange(50, 150, 10)))

    for t in [0, 1]:
        for n in n_neighbors:
            try:
                clf = RadiusNeighborsClassifier(radius=n)
                do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t,
                                    csv_out, fname, n)
            except Exception:
                # a small radius can leave some test points with no neighbors
                pass

    # Model: NearestCentroid
    ML_NAME = 'Nearest Centroid Classifier'

    for t in [0, 1]:
        clf = NearestCentroid()
        do_cross_validation(ML_NAME, clf, X_train, Y_train[t], t, csv_out,
                            fname, 0)

    fi.close()
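
# A hypothetical invocation of do_classification; apart from heartRate,
# skinTemperature and met (which the function filters on), the column
# names and the file name below are placeholders:
# clm = ['heartRate', 'skinTemperature', 'met',
#        'thermal_sensation', 'thermal_comfort']  # last two are the labels
# do_classification(clm, 'sensor_log.csv', clm_type=1)
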
Example #17

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestCentroid

# X and df are assumed to have been prepared earlier in this example.

# y Data
y = df["default.payment.next.month"]

# ----------------------------------------------------
# Splitting data

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=44, shuffle=True
)


# ----------------------------------------------------
# Applying NearestCentroid Model


NearestCentroidModel = NearestCentroid()
NearestCentroidModel.fit(X_train, y_train)

# Calculating Details
print("NearestCentroid Train Score is : ", NearestCentroidModel.score(X_train, y_train))
print("NearestCentroid Test Score is : ", NearestCentroidModel.score(X_test, y_test))
print("NearestCentroid Classes are : ", NearestCentroidModel.classes_)
print("----------------------------------------------------")

# Calculating Prediction
y_pred = NearestCentroidModel.predict(X_test)
print("Predicted Value for NearestCentroid is : ", y_pred[:10])

# ----------------------------------------------------
# Calculating Confusion Matrix
CM = confusion_matrix(y_test, y_pred)
print("Confusion Matrix is : \n", CM)
Example #18

from sklearn import metrics
from sklearn.calibration import CalibratedClassifierCV
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis,
                                           QuadraticDiscriminantAnalysis)
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier,
                              GradientBoostingClassifier,
                              HistGradientBoostingClassifier,
                              RandomForestClassifier, StackingClassifier,
                              VotingClassifier)
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import (LogisticRegression, LogisticRegressionCV,
                                  PassiveAggressiveClassifier, Perceptron,
                                  RidgeClassifier, RidgeClassifierCV,
                                  SGDClassifier)
from sklearn.metrics import classification_report
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.multiclass import (OneVsOneClassifier, OneVsRestClassifier,
                                OutputCodeClassifier)
from sklearn.naive_bayes import (BernoulliNB, CategoricalNB, ComplementNB,
                                 GaussianNB, MultinomialNB)
from sklearn.neighbors import (KNeighborsClassifier, NearestCentroid,
                               RadiusNeighborsClassifier)
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.svm import SVC, LinearSVC, NuSVC, OneClassSVM
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from xgboost import XGBClassifier

# X_train, X_test, y_train and y_test are assumed to be in scope.


def all_classifier_models():
    models = []
    metrix = []
    c_report = []
    train_accuracy = []
    test_accuracy = []
    
    models.append(('LogisticRegression', LogisticRegression(solver='liblinear', multi_class='ovr')))
    models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('RandomForestClassifier', RandomForestClassifier(n_estimators=100)))
    models.append(('SVM', SVC(gamma='auto')))
    models.append(('Linear_SVM', LinearSVC()))
    models.append(('XGB', XGBClassifier()))
    models.append(('SGD', SGDClassifier()))
    models.append(('Perceptron', Perceptron()))
    models.append(('ExtraTreeClassifier', ExtraTreeClassifier()))
    models.append(('OneClassSVM', OneClassSVM(gamma='auto')))
    models.append(('NuSVC', NuSVC()))
    models.append(('MLPClassifier', MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)))
    models.append(('RadiusNeighborsClassifier', RadiusNeighborsClassifier(radius=2.0)))
    models.append(('OutputCodeClassifier', OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),random_state=0)))
    models.append(('OneVsOneClassifier', OneVsOneClassifier(estimator=RandomForestClassifier(random_state=1))))
    models.append(('OneVsRestClassifier', OneVsRestClassifier(estimator=RandomForestClassifier(random_state=1))))
    models.append(('LogisticRegressionCV', LogisticRegressionCV()))
    models.append(('RidgeClassifierCV', RidgeClassifierCV()))
    models.append(('RidgeClassifier', RidgeClassifier()))
    models.append(('PassiveAggressiveClassifier', PassiveAggressiveClassifier()))
    models.append(('GaussianProcessClassifier', GaussianProcessClassifier()))
    models.append(('HistGradientBoostingClassifier', HistGradientBoostingClassifier()))
    estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),('svr', make_pipeline(StandardScaler(),LinearSVC(random_state=42)))]
    models.append(('StackingClassifier', StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())))
    clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf3 = GaussianNB()
    models.append(('VotingClassifier', VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')))
    models.append(('AdaBoostClassifier', AdaBoostClassifier()))
    models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    models.append(('BaggingClassifier', BaggingClassifier()))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier()))
    models.append(('CategoricalNB', CategoricalNB()))
    models.append(('ComplementNB', ComplementNB()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('MultinomialNB', MultinomialNB()))
    models.append(('CalibratedClassifierCV', CalibratedClassifierCV()))
    models.append(('LabelPropagation', LabelPropagation()))
    models.append(('LabelSpreading', LabelSpreading()))
    models.append(('NearestCentroid', NearestCentroid()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('GaussianMixture', GaussianMixture()))
    models.append(('BayesianGaussianMixture', BayesianGaussianMixture()))
    
    names = []
    for name, model in models:
        try:
            m = model
            m.fit(X_train, y_train)
            y_pred = m.predict(X_test)
            train_acc = round(m.score(X_train, y_train) * 100, 2)
            test_acc = metrics.accuracy_score(y_test, y_pred) * 100
            c_report.append(classification_report(y_test, y_pred))
            test_accuracy.append(test_acc)
            names.append(name)
            metrix.append([name, train_acc, test_acc])
        except Exception:
            print("Exception occurred :", name)
    return metrix, test_accuracy, names
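
# A hypothetical usage, assuming X_train, X_test, y_train and y_test are
# already in scope:
# metrix, test_accuracy, names = all_classifier_models()
# for name, train_acc, test_acc in sorted(metrix, key=lambda r: r[2],
#                                         reverse=True):
#     print("%-35s train: %6.2f%%  test: %6.2f%%" % (name, train_acc, test_acc))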