n_classes = len(y.unique()) n_samples0 = y.value_counts()[0] n_samples1 = y.value_counts()[1] w0 = n_samples / (n_classes * n_samples0) w1 = n_samples / (n_classes * n_samples1) X_train = X_train.reset_index(drop=True) y_train = y_train.reset_index(drop=True) weights = y_train.map(lambda y: w0 if y == 0 else w1) from sklearn.neighbors import NearestCentroid from sklearn.metrics import classification_report # Creating the Nearest Centroid Clissifier model = NearestCentroid() # Training the classifier model.fit(X_train, y_train.values.ravel()) model.score(X_train, y_train, sample_weight=weights) # Printing Accuracy on Training and Test sets print(f"Training Set Score : {model.score(X_train, y_train) * 100} %") print(f"Test Set Score : {model.score(X_test, y_test) * 100} %") # Printing classification report of classifier on the test set set data print( f"Model Classification Report : \n{classification_report(y_test, model.predict(X_test))}" ) '''
#Scoring score = metrics.accuracy_score(targets, predictions) print("accuracy: %0.3f" % score) totalPredictions.append(predictions) return name, score, train_time, test_time results = [] #Main Code #Classifiers clf1 = LogisticRegression() clf2 = PassiveAggressiveClassifier() clf3 = MultinomialNB(alpha=.01) clf4 = BernoulliNB(alpha=.01) clf5 = NearestCentroid() clf6 = RidgeClassifier(tol=1e-2, solver="sag") clf7 = Perceptron(n_iter=50) clf8 = SGDClassifier(loss='hinge',alpha=.0001, n_iter=50,shuffle=True,penalty="l2",n_jobs=-1) clf9 = SGDClassifier(loss='hinge',alpha=.0001, n_iter=50,shuffle=True,penalty="l1",n_jobs=-1) clf10 = SGDClassifier(loss='hinge',alpha=.0001, n_iter=50,shuffle=True,penalty="elasticnet",n_jobs=-1) clf11 = SGDClassifier(loss='log',alpha=.0001, n_iter=50,shuffle=True,penalty="l2",n_jobs=-1) clf12 = SGDClassifier(loss='log',alpha=.0001, n_iter=50,shuffle=True,penalty="l1",n_jobs=-1) clf13 = SGDClassifier(loss='log',alpha=.0001, n_iter=50,shuffle=True,penalty="elasticnet",n_jobs=-1) clf14 = SGDClassifier(loss='modified_huber',alpha=.0001, n_iter=50,shuffle=True,penalty="l2",n_jobs=-1) clf15 = SGDClassifier(loss='modified_huber',alpha=.0001, n_iter=50,shuffle=True,penalty="l1",n_jobs=-1) clf16 = SGDClassifier(loss='modified_huber',alpha=.0001, n_iter=50,shuffle=True,penalty="elasticnet",n_jobs=-1) clf17 = SGDClassifier(loss='squared_hinge',alpha=.0001, n_iter=50,shuffle=True,penalty="l2",n_jobs=-1) clf18 = SGDClassifier(loss='squared_hinge',alpha=.0001, n_iter=50,shuffle=True,penalty="l1",n_jobs=-1) clf19 = SGDClassifier(loss='squared_hinge',alpha=.0001, n_iter=50,shuffle=True,penalty="elasticnet",n_jobs=-1) clf20 = SGDClassifier(loss='perceptron',alpha=.0001, n_iter=50,shuffle=True,penalty="l2",n_jobs=-1)
def update_species(self, new_individuals, historical_marker, individual_type, delta=0.0005):
    """Assign every new individual to a species, spawning a new species for outliers.

    The features of the current population and of ``new_individuals`` are
    scaled together. Each new individual is matched to a species centroid; if
    its distance to that centroid exceeds the largest distance of any existing
    member of that species by more than the relative margin ``delta``, it is
    labelled with a new species id (highest existing label + 1). Afterwards the
    new individuals are appended to their species' ``members`` and species
    without members are removed from the species list.

    Args:
        new_individuals: sequence of offspring; each must provide
            ``genotype_phenotype_features(historical_marker)`` and a writable
            ``species`` attribute.
        historical_marker: forwarded to ``genotype_phenotype_features``.
        individual_type: ``"modules"`` or ``"blueprints"``; selects the
            population and species list on ``self``.
        delta: relative overshoot beyond a species' radius above which a new
            individual is treated as belonging to a new species.

    Raises:
        ValueError: ``individual_type`` is neither ``"modules"`` nor
            ``"blueprints"``.
        Exception: ``"No new offsprings"`` when at least two species exist and
            ``new_individuals`` is empty; ``"Creating Duplicate species"`` when
            (in that same multi-species case) the new species id is already
            present in the species list.
    """

    def centroid_distance(point, centroid):
        # Finite-value validation is the default of pairwise_distances, so the
        # explicit force_all_finite=True keyword is not needed.
        return np.sum(
            pairwise_distances(point.reshape(1, -1),
                               np.array(centroid).reshape(1, -1)))

    def species_radii(features, labels, centroid_map):
        # Largest member-to-centroid distance per species label.
        member_distances = {label: [] for label in centroid_map}
        for point, label in zip(features, labels):
            member_distances[label].append(
                centroid_distance(point, centroid_map[label]))
        radii = {}
        for label, distances in member_distances.items():
            farthest = np.max(distances)
            # A zero radius would make the relative overshoot undefined.
            radii[label] = 1 if farthest == 0 else farthest
        return radii

    def label_offspring(features, nearest_labels, accepted_labels,
                        centroid_map, radii, outlier_label, verbose):
        # Returns (labels, found_outlier). An individual keeps
        # accepted_labels[i] unless it lies too far outside its nearest
        # species, in which case it receives outlier_label.
        labels = []
        found_outlier = False
        for i, point in enumerate(features):
            if verbose:
                print(i)
            nearest = nearest_labels[i]
            radius = radii[nearest]
            overshoot = (centroid_distance(point, centroid_map[nearest]) -
                         radius) / radius
            print("Print Distance")
            print(radius)
            print(overshoot)
            if verbose:
                print(delta)
            if overshoot > delta:
                labels.append(outlier_label)
                if verbose:
                    print("Found unique individual")
                found_outlier = True
            else:
                labels.append(accepted_labels[i])
        return labels, found_outlier

    if individual_type == "modules":
        species_list = self.module_species_list
        individuals = self.modules
    elif individual_type == "blueprints":
        species_list = self.blueprint_species_list
        individuals = self.blueprints
    else:
        raise ValueError("type must be one of blueprints or modules")

    raw_old = [
        individual.genotype_phenotype_features(historical_marker)
        for individual in individuals
    ]
    old_labels = [individual.species for individual in individuals]
    raw_new = [
        new_individual.genotype_phenotype_features(historical_marker)
        for new_individual in new_individuals
    ]

    # Old and new individuals are scaled together so they share one feature space.
    scaled_features = scale(raw_old + raw_new)
    old_features = scaled_features[:len(raw_old)]
    new_features = scaled_features[len(raw_old):]

    print("Old Labels", individual_type)
    print(old_labels)

    unique_labels = sorted(set(old_labels))

    if len(unique_labels) < 2:
        # Single-species case: every non-outlier joins the dominant species
        # (the last species in the list that still has members).
        dominant_species_id = 0
        for species in species_list:
            if species.members:
                dominant_species_id = species.id

        kmeans = KMeans(n_clusters=1, random_state=0)
        kmeans.fit(old_features)
        centroid_map = dict(zip(unique_labels, kmeans.cluster_centers_))
        radii = species_radii(old_features, old_labels, centroid_map)

        # KMeans predicts cluster *indices* (always 0 here); translate them to
        # species labels, because centroid_map and radii are keyed by label.
        # Indexing with the raw index raised KeyError whenever the surviving
        # species id was not 0.
        nearest_labels = [
            unique_labels[cluster_index]
            for cluster_index in kmeans.predict(new_features)
        ]
        dominant_labels = [dominant_species_id] * len(new_features)
        print(dominant_labels)

        new_species_id = unique_labels[-1] + 1
        adjusted_labels, create_new_species = label_offspring(
            new_features, nearest_labels, dominant_labels, centroid_map,
            radii, new_species_id, verbose=True)
        guard_duplicates = False
    else:
        if not new_individuals:
            raise Exception("No new offsprings")

        NCcLf = NearestCentroid(metric="euclidean")
        NCcLf.fit(old_features, old_labels)
        print("centroids")
        print(NCcLf.centroids_)
        centroid_map = dict(zip(unique_labels, NCcLf.centroids_))
        radii = species_radii(old_features, old_labels, centroid_map)

        nearest_labels = NCcLf.predict(new_features)
        new_species_id = unique_labels[-1] + 1
        adjusted_labels, create_new_species = label_offspring(
            new_features, nearest_labels, nearest_labels, centroid_map, radii,
            new_species_id, verbose=False)
        guard_duplicates = True

    if create_new_species:
        if guard_duplicates:
            for species in species_list:
                if species.id == new_species_id:
                    raise Exception("Creating Duplicate species")
        species_list.append(
            Species(new_species_id, [], individual_type,
                    "Species_" + individual_type + "_" + str(new_species_id)))
        print("Created new Species")

    for new_individual, label in zip(new_individuals, adjusted_labels):
        new_individual.species = label

    # Attach offspring to their species and drop species left without members.
    species_exclusion_list = []
    for species in species_list:
        members = [
            new_individual for new_individual in new_individuals
            if new_individual.species == species.id
        ]
        if members:
            species.members = species.members + members
        if not species.members:
            species_exclusion_list.append(species)
    for species in species_exclusion_list:
        species_list.remove(species)
def nearest_centroid(X, Y, parameters):
    """Fit a NearestCentroid classifier on ``X``/``Y``.

    The constructor keyword arguments are read from
    ``parameters["static"]["params"]["strategies"]["ml_model"]["params"]``.
    Returns the fitted classifier.
    """
    strategies = parameters["static"]["params"]["strategies"]
    centroid_kwargs = strategies["ml_model"]["params"]
    classifier = NearestCentroid(**centroid_kwargs)
    classifier.fit(X, Y)
    return classifier
def construct_model(X_train, y_train, params, sample_weights=None):
    """Build and fit the classifier described by ``params``.

    Args:
        X_train: training features, handed to the estimator's ``fit``.
        y_train: training targets.
        params: mapping with ``'name'`` (one of the keys of the catalogue
            below), ``'params'`` (constructor keyword arguments) and,
            optionally, ``'base_model_params'`` (keyword arguments for the
            base estimator wrapped by ``'BaggedKNN'`` / ``'AdaBoostedTree'``).
        sample_weights: optional per-sample weights. They are forwarded as
            ``sample_weight`` only to the models flagged as weight-aware in the
            catalogue; for every other model they are ignored.

    Returns:
        The value returned by the estimator's ``fit`` call.

    Raises:
        KeyError: ``params`` lacks ``'name'`` or ``'params'``.
        ValueError: ``params['name']`` is not a known model.
    """
    from importlib import import_module

    model_name = params['name']
    model_params = params['params']
    base_model_params = params.get('base_model_params', {})

    # model key -> (module, class name, forward sample weights to fit?)
    # Modules are imported lazily so only the selected backend has to be
    # installed (e.g. xgboost).
    catalogue = {
        'DecisionTreeClassifier':
            ('sklearn.tree', 'DecisionTreeClassifier', True),
        'SVC': ('sklearn.svm', 'SVC', True),
        'KNN': ('sklearn.neighbors', 'KNeighborsClassifier', False),
        # NOTE(review): GaussianNB is fitted without weights, as before.
        'GaussianNB': ('sklearn.naive_bayes', 'GaussianNB', False),
        'RandomForestClassifier':
            ('sklearn.ensemble', 'RandomForestClassifier', True),
        'GradientBoostingClassifier':
            ('sklearn.ensemble', 'GradientBoostingClassifier', True),
        # The KNN base estimator has no sample_weight support, so passing
        # weights to the bagging ensemble made fit() fail; weights are
        # ignored here just like for plain 'KNN'.
        'BaggedKNN': ('sklearn.ensemble', 'BaggingClassifier', False),
        'AdaBoostedTree': ('sklearn.ensemble', 'AdaBoostClassifier', True),
        'XGBoostClassifier': ('xgboost', 'XGBClassifier', True),
        'RidgeClassifier': ('sklearn.linear_model', 'RidgeClassifier', True),
        'LogisticRegression':
            ('sklearn.linear_model', 'LogisticRegression', True),
        'LDA': ('sklearn.discriminant_analysis',
                'LinearDiscriminantAnalysis', False),
        'QDA': ('sklearn.discriminant_analysis',
                'QuadraticDiscriminantAnalysis', False),
        'MLP': ('sklearn.neural_network', 'MLPClassifier', False),
        'NearestCentroid': ('sklearn.neighbors', 'NearestCentroid', False),
        'RadiusNeighborsClassifier':
            ('sklearn.neighbors', 'RadiusNeighborsClassifier', False),
    }
    # Ensembles that wrap a base estimator built from base_model_params.
    wrapped_base = {
        'BaggedKNN': ('sklearn.neighbors', 'KNeighborsClassifier'),
        'AdaBoostedTree': ('sklearn.tree', 'DecisionTreeClassifier'),
    }

    if model_name not in catalogue:
        raise ValueError("unknown ML model passed in model_name")

    module_path, class_name, takes_weights = catalogue[model_name]
    estimator_cls = getattr(import_module(module_path), class_name)

    if model_name in wrapped_base:
        base_module, base_class = wrapped_base[model_name]
        base_estimator = getattr(import_module(base_module),
                                 base_class)(**base_model_params)
        estimator = estimator_cls(base_estimator, **model_params)
    else:
        estimator = estimator_cls(**model_params)

    if takes_weights and sample_weights is not None:
        # Keyword form: not every backend accepts the weights positionally.
        return estimator.fit(X_train, y_train, sample_weight=sample_weights)
    return estimator.fit(X_train, y_train)
def get_estimators_list(dataset, options, use_imdb_multi_class_labels, is_soft_voting=False, is_stacking_classifier=False, final_estimator=None):
    """Build the named base estimators for a voting or stacking ensemble.

    Each selected classifier is created from the parameters returned by
    ``get_json_with_best_parameters`` plus the run-level settings taken from
    ``options`` (``n_jobs`` / ``verbose`` / ``random_state``, depending on the
    classifier), and is printed as it is created.

    Args:
        dataset: forwarded to ``get_json_with_best_parameters``.
        options: object providing ``n_jobs``, ``verbose`` and ``random_state``.
        use_imdb_multi_class_labels: forwarded to
            ``get_json_with_best_parameters``.
        is_soft_voting: when not stacking, selects the four-model soft-voting
            set instead of the six-model hard-voting set.
        is_stacking_classifier: use the stacking set (same six models as hard
            voting) and also return a final estimator.
        final_estimator: ``Classifier`` member *name* of the base estimator to
            reuse as final estimator; any other value yields a default
            ``LinearSVC()``.

    Returns:
        ``estimators_list`` (a list of ``(name, estimator)`` tuples), or
        ``(estimators_list, final_estimator)`` when ``is_stacking_classifier``.
    """
    if is_stacking_classifier or not is_soft_voting:
        # Stacking and hard voting share the same six base models.
        ml_algorithm_list = [
            Classifier.COMPLEMENT_NB.name,
            Classifier.RIDGE_CLASSIFIER.name,
            Classifier.LINEAR_SVC.name,
            Classifier.LOGISTIC_REGRESSION.name,
            Classifier.PASSIVE_AGGRESSIVE_CLASSIFIER.name,
            Classifier.RANDOM_FOREST_CLASSIFIER.name,
        ]
    else:
        ml_algorithm_list = [
            Classifier.COMPLEMENT_NB.name,
            Classifier.LOGISTIC_REGRESSION.name,
            Classifier.MULTINOMIAL_NB.name,
            Classifier.RANDOM_FOREST_CLASSIFIER.name,
        ]

    # (enum member, estimator name, factory, option attributes to inject).
    # Factories are lambdas so an estimator class is only looked up when its
    # entry is actually selected. The tuple order fixes the order of the
    # returned list.
    catalogue = (
        (Classifier.ADA_BOOST_CLASSIFIER, 'ada_boost_classifier',
         lambda **kw: AdaBoostClassifier(**kw), ('random_state',)),
        (Classifier.BERNOULLI_NB, 'bernoulli_nb',
         lambda **kw: BernoulliNB(**kw), ()),
        (Classifier.COMPLEMENT_NB, 'complement_nb',
         lambda **kw: ComplementNB(**kw), ()),
        (Classifier.DECISION_TREE_CLASSIFIER, 'decision_tree_classifier',
         lambda **kw: DecisionTreeClassifier(**kw), ('random_state',)),
        (Classifier.GRADIENT_BOOSTING_CLASSIFIER,
         'gradient_boosting_classifier',
         lambda **kw: GradientBoostingClassifier(**kw),
         ('verbose', 'random_state')),
        (Classifier.K_NEIGHBORS_CLASSIFIER, 'k_neighbors_classifier',
         lambda **kw: KNeighborsClassifier(**kw), ('n_jobs',)),
        (Classifier.LINEAR_SVC, 'linear_svc',
         lambda **kw: LinearSVC(**kw), ('verbose', 'random_state')),
        (Classifier.LOGISTIC_REGRESSION, 'logistic_regression',
         lambda **kw: LogisticRegression(**kw),
         ('n_jobs', 'verbose', 'random_state')),
        (Classifier.MULTINOMIAL_NB, 'multinomial_nb',
         lambda **kw: MultinomialNB(**kw), ()),
        (Classifier.NEAREST_CENTROID, 'nearest_centroid',
         lambda **kw: NearestCentroid(**kw), ()),
        (Classifier.PASSIVE_AGGRESSIVE_CLASSIFIER,
         'passive_aggressive_classifier',
         lambda **kw: PassiveAggressiveClassifier(**kw),
         ('n_jobs', 'verbose', 'random_state')),
        (Classifier.PERCEPTRON, 'perceptron',
         lambda **kw: Perceptron(**kw),
         ('n_jobs', 'verbose', 'random_state')),
        (Classifier.RANDOM_FOREST_CLASSIFIER, 'random_forest_classifier',
         lambda **kw: RandomForestClassifier(**kw),
         ('n_jobs', 'verbose', 'random_state')),
        (Classifier.RIDGE_CLASSIFIER, 'ridge_classifier',
         lambda **kw: RidgeClassifier(**kw), ('random_state',)),
    )

    estimators_list = []
    built = {}  # Classifier member name -> estimator instance
    for member, estimator_name, factory, option_names in catalogue:
        if member.name not in ml_algorithm_list:
            continue
        # Copy so the run-level options are not written into whatever object
        # get_json_with_best_parameters handed out.
        best_parameters = dict(
            get_json_with_best_parameters(dataset, member,
                                          use_imdb_multi_class_labels))
        for option_name in option_names:
            best_parameters[option_name] = getattr(options, option_name)
        estimator = factory(**best_parameters)
        print('\t', estimator)
        estimators_list.append((estimator_name, estimator))
        built[member.name] = estimator

    if not is_stacking_classifier:
        return estimators_list

    # The final estimator is the already-built base estimator of that name.
    for member in (Classifier.LINEAR_SVC, Classifier.LOGISTIC_REGRESSION,
                   Classifier.PASSIVE_AGGRESSIVE_CLASSIFIER,
                   Classifier.RIDGE_CLASSIFIER):
        if final_estimator == member.name:
            return estimators_list, built[member.name]
    # Default
    return estimators_list, LinearSVC()
def featureEng(train, name='NB1', n_comp=50, ngram_cv=3, ngram_tfidf=3, training=True):
    """Add hand-crafted text statistics and SVD-compressed n-gram features.

    Columns are added to ``train`` in place (counts of words, characters,
    stop words, punctuation, frequent/rare words, the helper-based fractions
    and ``sentiment_id``; plus ``y1``/``y2`` when ``training``). Three blocks
    of ``n_comp`` SVD components are then concatenated: word TF-IDF
    (``svd_wordtfidf_*``), word counts (``svd_wordcv_*``) and character
    counts (``svd_charcv_*``).

    Args:
        train: DataFrame with ``text`` and ``sentiment`` columns, and
            ``selected_text`` when ``training`` is true.
        name: unused; kept for interface compatibility.
        n_comp: number of SVD components per vectorizer.
        ngram_cv: upper n-gram bound for both count vectorizers.
        ngram_tfidf: upper n-gram bound for the TF-IDF vectorizer.
        training: when true, also derive ``y1``/``y2`` via ``process_data``.

    Returns:
        A new DataFrame holding the augmented ``train`` columns followed by
        the SVD feature columns.
    """
    eng_stopwords = set(stopwords.words('english'))
    text = train['text']

    train['num_words'] = text.apply(lambda x: len(str(x).split()))
    train['num_unique_words'] = text.apply(lambda x: len(set(str(x).split())))
    train['num_chars'] = text.apply(lambda x: len(str(x)))
    train['num_stopwords'] = text.apply(lambda x: len(
        [w for w in str(x).lower().split() if w in eng_stopwords]))
    train['num_punctions'] = text.apply(
        lambda x: len([w for w in str(x) if w in string.punctuation]))
    train["num_words_upper"] = text.apply(
        lambda x: len([w for w in str(x).split() if w.isupper()]))
    # Number of title-case words in the text.
    train["num_words_title"] = text.apply(
        lambda x: len([w for w in str(x).split() if w.istitle()]))
    # Average length of the words in the text.
    train["mean_word_len"] = text.apply(
        lambda x: np.mean([len(w) for w in str(x).split()]))

    # Corpus-wide word frequencies. The texts are joined without a separator,
    # exactly as before, but in a single linear-time join.
    # NOTE(review): without a separator the last word of one text fuses with
    # the first word of the next - confirm whether that is intended.
    all_text = ''.join(str(entry) for entry in text)
    counts = Counter(re.findall(r"[\w']+", all_text))
    # Drop the bare apostrophe token.
    del counts["'"]

    by_frequency = operator.itemgetter(1)
    # The 300 most frequent tokens.
    top_words = dict(
        sorted(counts.items(), key=by_frequency, reverse=True)[:300])
    # Feature: number of words that are among the most used tokens.
    train['num_top'] = text.apply(
        lambda x: len([w for w in str(x).lower().split() if w in top_words]))
    # The 10000 least frequent tokens.
    rare_words = dict(sorted(counts.items(), key=by_frequency)[:10000])
    # Feature: number of words that are among the least used tokens.
    train['num_least'] = text.apply(
        lambda x: len([w for w in str(x).lower().split() if w in rare_words]))

    train['unique_word_fraction'] = text.apply(unique_word_fraction)
    train['stopwords_count'] = text.apply(stopwords_count)
    train['punctuations_fraction'] = text.apply(punctuations_fraction)
    train['char_count'] = text.apply(char_count)
    train['fraction_noun'] = text.apply(fraction_noun)
    train['fraction_adj'] = text.apply(fraction_adj)
    train['fraction_verbs'] = text.apply(fraction_verbs)
    train['sentiment_id'] = train['sentiment'].apply(
        lambda row: sentiment_mapping[row])

    if training:
        train['y1'] = train.apply(
            lambda row: process_data(row.text, row.selected_text)[0], axis=1)
        train['y2'] = train.apply(
            lambda row: process_data(row.text, row.selected_text)[1], axis=1)

    documents = text.values.tolist()
    row_index = train.index

    def svd_block(vectorizer, prefix):
        # Vectorize the documents and keep n_comp SVD components as columns.
        matrix = vectorizer.fit_transform(documents)
        svd_obj = TruncatedSVD(n_components=n_comp, algorithm='randomized')
        svd_obj.fit(matrix)
        # Reuse train's index so the column-wise concat below lines rows up
        # even when train does not carry a default 0..n-1 index.
        block = pd.DataFrame(svd_obj.transform(matrix), index=row_index)
        block.columns = [prefix + str(i) for i in range(n_comp)]
        return block

    feature_blocks = (
        # SVD on word TF-IDF
        (TfidfVectorizer(stop_words='english',
                         ngram_range=(1, ngram_tfidf)), 'svd_wordtfidf_'),
        # SVD on word counts
        (CountVectorizer(stop_words='english',
                         ngram_range=(1, ngram_cv)), 'svd_wordcv_'),
        # SVD on character counts
        (CountVectorizer(ngram_range=(1, ngram_cv),
                         analyzer='char'), 'svd_charcv_'),
    )
    for vectorizer, prefix in feature_blocks:
        train = pd.concat([train, svd_block(vectorizer, prefix)], axis=1)
    return train
def compare_models(X, Y):
    """Cross-validate a set of classifier pipelines and box-plot their accuracy.

    Nine ``preprocessor`` + classifier pipelines are appended to the
    module-level ``models`` list, every entry of ``models`` is scored with
    10-fold cross-validation (accuracy), the mean and standard deviation are
    printed per model, and a box plot of the fold scores is shown. A fixed set
    of scores labelled ``'GB'`` is added to the plot.

    Args:
        X: feature data handed to ``cross_val_score``.
        Y: target data handed to ``cross_val_score``.

    Relies on the module-level names ``models``, ``preprocessor`` and ``seed``.
    """
    # NOTE(review): loss='log' is the spelling used by the scikit-learn version
    # this file was written for; newer releases call it 'log_loss'.
    candidates = [
        ('NCC', NearestCentroid()),
        ('PC', Perceptron()),
        ('NB', BernoulliNB(alpha=.001)),
        ('SGD', SGDClassifier(loss='modified_huber', max_iter=10000)),
        ('KNN', KNeighborsClassifier()),
        ('SVM', LinearSVC(max_iter=10000)),
        ('LR', SGDClassifier(loss='log', max_iter=10000)),
        ('DT/CART', DecisionTreeClassifier(max_depth=500)),
        ('RF', RandomForestClassifier(max_depth=500)),
    ]
    for label, classifier in candidates:
        models.append((label,
                       Pipeline(steps=[
                           ('preprocessor', preprocessor),
                           ('clf', classifier),
                       ])))

    # evaluate each model in turn
    results = []
    names = []
    scoring = 'accuracy'
    # random_state only has an effect when shuffling is enabled; without
    # shuffle=True the seed is ignored by old scikit-learn releases and
    # rejected with a ValueError by current ones. The splitter is the same
    # for every model, so it is created once.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    for name, model in models:
        cv_results = model_selection.cross_val_score(model,
                                                     X,
                                                     Y,
                                                     cv=kfold,
                                                     scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)

    # Hard-coded fold scores added to the plot under the label 'GB'; they are
    # not computed by this function.
    grad_boost = np.array(
        [0.73, 0.72, 0.69, 0.75, 0.78, 0.79, 0.80, 0.74, 0.76, 0.74])
    names.append('GB')
    results.append(grad_boost)

    # boxplot algorithm comparison
    fig = plt.figure()
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()
# Compare SVC kernels against 1-nearest-neighbour and nearest-centroid
# classifiers: each model is refitted `iterations` times on the training split
# and the fraction of correct test predictions is recorded per run.
kernels = ['rbf', 'poly', 'linear', 'sigmoid']
iterations = 3


def success_rate(x, y):
    # Fraction of positions where prediction and truth agree.
    return sum(x == y) * 1.0 / len(x)


svc_kernels = dict((kernel, []) for kernel in kernels)
svc_linear = []
nn_all = []
cn_all = []

for i in range(iterations):
    for kernel in kernels:
        svc_model = SVC(kernel=kernel).fit(X_train, y_train)
        svc = svc_model.predict(X_test)
        svc_kernels[kernel].append(success_rate(svc, y_test))

    nn_model = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
    nn = nn_model.predict(X_test)
    nn_all.append(success_rate(nn, y_test))

    cn_model = NearestCentroid().fit(X_train, y_train)
    cn = cn_model.predict(X_test)
    cn_all.append(success_rate(cn, y_test))

# Mean success rate per kernel, a blank line, then 1-NN and nearest centroid.
# Single-argument print(...) produces the same output under Python 2 and 3.
for kernel in kernels:
    print(mean(svc_kernels[kernel]))
print("")
print(mean(nn_all))
print(mean(cn_all))

# visualise(X_train + X_test, y_train + y_test)
def _ask_int(prompt):
    """Show ``prompt`` and return the reply as an int, or None if it is not one."""
    try:
        # input() returns text on Python 3; the menus below compare numbers.
        return int(input(prompt))
    except (TypeError, ValueError):
        return None


def _abort(message):
    """Print ``message`` and terminate the program."""
    print(message)
    sys.exit()


def _to_numeric_if_possible(column):
    """Return ``column`` converted by ``pd.to_numeric``, or unchanged on failure."""
    try:
        return pd.to_numeric(column)
    except (TypeError, ValueError):
        return column


def _split_features_labels(data):
    """Split a frame into features and labels.

    The first two columns and the last column are dropped from the features;
    the last column is returned as the label series.
    """
    features = data.drop(
        [data.columns[0], data.columns[1], data.columns[-1]],
        axis=1).apply(_to_numeric_if_possible)
    labels = pd.Series(data.iloc[:, -1])
    return features, labels


def _print_matches(y_test, predictions):
    """Print actual/predicted/match rows followed by the overall accuracy."""
    print('{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?'))
    numerator = 0.0
    denominator = float(len(predictions))
    for actual, predicted in zip(y_test, predictions):
        # bool() keeps the value formattable even for numpy comparison results
        match = bool(actual == predicted)
        numerator += 1 if match else 0
        print('{:10}\t{:10}\t{:10}'.format(actual, predicted, match))
    print('accuracy = {:7.2f}%'.format(100 * numerator / denominator))


def _choose_classifier(kind, method):
    """Ask which classifier to use; return ``(menu number, unfitted model)``."""
    classifier = _ask_int(
        'classifier: [1: decision tree, 2: extra tree, 3: extra trees, '
        '4: k nearest neighbor, 5: naive bayes, 6: radius neighbors, '
        '7: random forest, 8: support vector machine, 9: gradient boosting, '
        '10: gaussian process, 11: stochastic gradient descent, '
        '12: passive aggressive, 13: nearest centroid, 14: perceptron, '
        '15: multi-layer perceptron, 16: ada boost] ')

    if classifier == 1:
        criteria = {1: 'gini', 2: 'entropy'}
        criterion = _ask_int('criterion: [1: gini, 2: entropy] ')
        if criterion not in criteria:
            _abort('no criterion chosen')
        print(kind, method, classifier, criterion)
        return classifier, DecisionTreeClassifier(
            criterion=criteria[criterion])

    if classifier == 4:
        neighbours = {1: 1, 2: 3, 3: 5}
        n = _ask_int('n: [1: 1, 2: 3, 3: 5] ')
        if n not in neighbours:
            _abort('no n chosen')
        print(kind, method, classifier, n)
        return classifier, KNeighborsClassifier(n_neighbors=neighbours[n])

    if classifier == 5:
        # Factories are wrapped in lambdas so that only the chosen estimator
        # name is looked up.
        versions = {
            1: lambda: GaussianNB(),
            2: lambda: BernoulliNB(),
            3: lambda: MultinomialNB(),
            4: lambda: ComplementNB(),
        }
        version = _ask_int(
            'version: [1: gaussian, 2: bernoulli, 3: multinomial, 4: complement] '
        )
        if version not in versions:
            _abort('no version chosen')
        print(kind, method, classifier, version)
        return classifier, versions[version]()

    factories = {
        2: lambda: ExtraTreeClassifier(),
        3: lambda: ExtraTreesClassifier(),
        6: lambda: RadiusNeighborsClassifier(radius=1.0),
        7: lambda: RandomForestClassifier(n_estimators=50, random_state=1),
        8: lambda: LinearSVC(multi_class='crammer_singer'),
        9: lambda: GradientBoostingClassifier(),
        10: lambda: GaussianProcessClassifier(multi_class='one_vs_one'),
        11: lambda: SGDClassifier(),
        12: lambda: PassiveAggressiveClassifier(),
        13: lambda: NearestCentroid(),
        14: lambda: Perceptron(tol=1e-3, random_state=0),
        15: lambda: MLPClassifier(),
        16: lambda: AdaBoostClassifier(n_estimators=100),
    }
    if classifier not in factories:
        _abort('no classifier chosen')
    print(kind, method, classifier)
    return classifier, factories[classifier]()


def _choose_regressor(kind, method):
    """Ask which model of the "regression" menu to use; return it unfitted."""
    regressor = _ask_int(
        'regressor: [1: linear discriminant analysis, 2: logistic regression, '
        '3: ridge regression, 4: quadratic discriminant analysis, '
        '5: linear regression, 6: decision tree regression, '
        '7: pls regression, 8: pls canonical, '
        '9: canonical correlation analysis, 10: lasso, 11: multi-task lasso, '
        '12: elastic net, 13: multi-task elastic net, '
        '14: least angle regression, 15: least angle regression lasso, '
        '16: orthogonal matching pursuit, 17: bayesian ridge, '
        '18: automatic relevence determination, 19: theil sen regression, '
        '20: huber regressor, 21: random sample consensus] ')

    # Options 5 and 6 wrap a plain regressor in a multi-class strategy.
    wrapped = {
        5: lambda: LinearRegression(),
        6: lambda: DecisionTreeRegressor(),
    }
    if regressor in wrapped:
        strategies = {
            1: lambda base: OneVsRestClassifier(base),
            2: lambda base: OneVsOneClassifier(base),
        }
        strategy = _ask_int('strategy: [1: one vs rest, 2: one vs one] ')
        if strategy not in strategies:
            _abort('no strategy selected')
        print(kind, method, strategy, regressor)
        return strategies[strategy](wrapped[regressor]())

    factories = {
        1: lambda: LinearDiscriminantAnalysis(),
        2: lambda: LogisticRegression(solver='lbfgs',
                                      multi_class='multinomial'),
        3: lambda: RidgeClassifier(),
        4: lambda: QuadraticDiscriminantAnalysis(),
        7: lambda: PLSRegression(n_components=2),
        8: lambda: PLSCanonical(n_components=2),
        9: lambda: CCA(n_components=1),
        10: lambda: Lasso(alpha=0.1),
        11: lambda: MultiTaskLasso(alpha=0.1),
        12: lambda: ElasticNet(random_state=0),
        13: lambda: MultiTaskElasticNet(random_state=0),
        14: lambda: Lars(n_nonzero_coefs=1),
        15: lambda: LassoLars(alpha=.1),
        16: lambda: OrthogonalMatchingPursuit(),
        17: lambda: BayesianRidge(),
        18: lambda: ARDRegression(),
        19: lambda: TheilSenRegressor(random_state=0),
        20: lambda: HuberRegressor(),
        21: lambda: RANSACRegressor(random_state=0),
    }
    if regressor not in factories:
        _abort('no regressor chosen')
    print(kind, method, regressor)
    return factories[regressor]()


def _run_classification(kind, method, x_train, y_train, x_test, y_test):
    """Fit the chosen classifier, write per-row results to a file, print metrics."""
    classifier, model = _choose_classifier(kind, method)
    model.fit(x_train, y_train)
    predictions = pd.Series(model.predict(x_test))

    accuracy_line = 'accuracy: {:7.2f}%'.format(
        100 * accuracy_score(y_test, predictions))
    filename = '{},{},{}.txt'.format(kind, method, classifier)
    with open(filename, 'w') as output:
        # One line per record; the header has the same three columns as the rows.
        output.write('{:10}\t{:10}\t{:10}\n'.format('actual', 'predict',
                                                    'match?'))
        for actual, predicted in zip(y_test, predictions):
            match = bool(actual == predicted)
            output.write('{:10}\t{:10}\t{:10}\n'.format(
                actual, predicted, match))
        output.write(accuracy_line + '\n')
    print(accuracy_line)

    # labels= fixes the row order; without it target_names would be attached
    # to the alphabetically sorted classes and mislabel the rows.
    class_names = ['RightTroll', 'LeftTroll', 'Other']
    print(
        classification_report(y_test,
                              predictions,
                              labels=class_names,
                              target_names=class_names))
    print(confusion_matrix(y_test, predictions, labels=class_names))


def _run_supervised(kind, x_train, y_train, x_test, y_test):
    """Handle the supervised menu (classification or regression)."""
    method = _ask_int('method: [1: classification, 2: regression] ')
    if method == 1:
        _run_classification(kind, method, x_train, y_train, x_test, y_test)
    elif method == 2:
        # NOTE(review): the labels are passed to these models exactly as read
        # from the CSV; presumably they need encoding first for the pure
        # regressors - confirm before relying on this branch.
        model = _choose_regressor(kind, method)
        model.fit(x_train, y_train)
        predictions = pd.Series(model.predict(x_test))
        _print_matches(y_test, predictions)
    else:
        _abort('no method chosen')


def _run_semi_supervised(kind, x_train, y_train, x_test, y_test):
    """Handle the semi-supervised menu (label propagation / spreading)."""
    factories = {
        1: lambda: LabelPropagation(),
        2: lambda: LabelSpreading(),
    }
    classifier = _ask_int(
        'classifier: [1: label propagation, 2: label spreading] ')
    if classifier not in factories:
        _abort('no classifier chosen')
    print(kind, classifier)
    model = factories[classifier]()
    model.fit(x_train, y_train)
    predictions = pd.Series(model.predict(x_test))
    _print_matches(y_test, predictions)


def _run_unsupervised(kind, x_train, x_test, y_test):
    """Handle the unsupervised menu (k-means, tree embedding, neighbours)."""
    method = _ask_int(
        'method: [1: clustering, 2: random trees embedding, 3: nearest neighbors] '
    )
    if method == 1:
        clusterer = _ask_int('clustere: [1: k means]')
        if clusterer != 1:
            _abort('no clusterer chosen')
        clusters = _ask_int('clusters: [1: 1, 2: 2, 3: 3] ')
        if clusters not in (1, 2, 3):
            _abort('no clusters chosen')
        print(kind, method, clusters)
        model = KMeans(n_clusters=clusters, random_state=0)
        model.fit(x_train)
        predictions = model.predict(x_test)
        # format() instead of '+': a str cannot be added to an ndarray
        print('centroids: {}'.format(model.cluster_centers_))
        _print_matches(y_test, predictions)
    elif method == 2:
        model = RandomTreesEmbedding()
        model.fit(x_train)
        # apply() yields one leaf index per tree for every sample, so there is
        # no single predicted label to compare against y_test.
        leaves = model.apply(x_test)
        print('leaf indices:\n{}'.format(leaves))
    elif method == 3:
        model = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')
        model.fit(x_train)
        # Neighbour queries give distances and indices, not labels, so no
        # accuracy is reported for this option.
        distances, indices = model.kneighbors(x_test)
        print('distances:\n{}'.format(distances))
        print('indices:\n{}'.format(indices))
    else:
        _abort('no method chosen')


def main():
    """Interactively pick a model, train it and report results on a test set.

    Expects two command-line arguments: the training CSV and the test/dev CSV.
    In both files the first two columns are ignored, the last column is the
    label and the remaining columns are the features. The model family and
    its options are chosen through numbered prompts on standard input; an
    answer that is not one of the offered numbers ends the program with a
    short message.
    """
    # Checks for correct number of arguments
    if len(sys.argv) != 3:
        print(
            'usage: ./troll_identifier.py [TRAIN DATASET] [TEST/DEV DATASET]')
        sys.exit()

    # set up dataset
    data_train = pd.read_csv(sys.argv[1])
    data_test = pd.read_csv(sys.argv[2])
    print('train: {}'.format(sys.argv[1]))
    print('test: {}'.format(sys.argv[2]))
    x_train, y_train = _split_features_labels(data_train)
    x_test, y_test = _split_features_labels(data_test)

    kind = _ask_int(
        'type: [1: supervised, 2: semi-supervised, 3: unsupervised] ')
    if kind == 1:
        _run_supervised(kind, x_train, y_train, x_test, y_test)
    elif kind == 2:
        _run_semi_supervised(kind, x_train, y_train, x_test, y_test)
    elif kind == 3:
        _run_unsupervised(kind, x_train, x_test, y_test)
    else:
        _abort('no type chosen')
def create_nearest_centroid(preprocessor, x_train, y_train):
    """Return a fitted pipeline of ``preprocessor`` followed by NearestCentroid.

    Parameters
    ----------
    preprocessor : transformer used as the first pipeline step.
    x_train : training features passed to ``Pipeline.fit``.
    y_train : training labels passed to ``Pipeline.fit``.
    """
    stages = [
        ('preprocessor', preprocessor),
        ('classifier', NearestCentroid()),
    ]
    pipeline = Pipeline(steps=stages)
    pipeline.fit(x_train, y_train)
    return pipeline
'Jump sideways', 'Jump leg/arms open/closed ', 'Jump rope' , 'Trunk twist (arms outstretched)', 'Trunk twist (elbows bent)', 'Waist bends forward', ' Waist rotation', 'Waist bends (reach foot with opposite hand)', 'Reach heels backwards' , 'Lateral bend', 'Lateral bend with arm up', 'Repetitive forward stretching', 'Upper trunk and lower body opposite twist', 'Lateral elevation of arms', 'Frontal elevation of arms' , 'Frontal hand claps', 'Frontal crossing of arms', 'Shoulders high-amplitude rotation', 'Shoulders low-amplitude rotation', 'Arms inner rotation', 'Knees (alternating) to the breast', 'Heels (alternatively) to the backside', 'Knees bending (crouching)', 'Knees (alternating) bending forward', 'Rotation on the knees', 'Rowing', 'Elliptical bike', 'Cycling'] models = {'DT': DecisionTreeClassifier(criterion='entropy'), 'NB': GaussianNB(), 'NCC': NearestCentroid(), "KNN": KNeighborsClassifier(n_neighbors=3)} def per_class_classification(file, cv_type='iid', overlap=False): if overlap: overlap_path = 'overlap' else: overlap_path = 'nonoverlap' file_name = os.path.basename(os.path.splitext(file)[0]) fs = os.path.basename(os.path.dirname(file)) print(fs) win_size = (file_name[7:]) print(str(win_size)) dataset = pd.read_csv(file, sep='\t') groups = dataset.iloc[:, 1]
benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty), use_tfidf=False)) # Train SGD with Elastic Net penalty print('=' * 80) print("Elastic-Net penalty") results.append( benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet"))) results_count.append( benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet"), use_tfidf=False)) # Train NearestCentroid without threshold print('=' * 80) print("NearestCentroid (aka Rocchio classifier)") results.append(benchmark(NearestCentroid())) results_count.append(benchmark(NearestCentroid(), use_tfidf=False)) # Train sparse Naive Bayes classifiers print('=' * 80) print("Naive Bayes") results.append(benchmark(MultinomialNB(alpha=.01))) results.append(benchmark(BernoulliNB(alpha=.01))) results.append(benchmark(ComplementNB(alpha=.1))) results_count.append(benchmark(MultinomialNB(alpha=.01), use_tfidf=False)) results_count.append(benchmark(BernoulliNB(alpha=.01), use_tfidf=False)) results_count.append(benchmark(ComplementNB(alpha=.1), use_tfidf=False)) print('=' * 80) print("LinearSVC with L1-based feature selection")
def get_ml_algorithm_pair_list(options, ml_algorithm_list,
                               use_classifiers_with_default_parameters,
                               use_imdb_multi_class_labels, dataset):
    """Build the ``(estimator, Classifier member)`` pairs for the requested algorithms.

    Parameters
    ----------
    options : object whose ``random_state``, ``verbose`` and ``n_jobs``
        attributes are forwarded to the estimators that take them.
    ml_algorithm_list : collection of ``Classifier`` member names to build.
    use_classifiers_with_default_parameters : when true, single estimators are
        created with only the forwarded options; otherwise their parameters
        come from ``get_json_with_best_parameters`` and the resulting
        estimator is printed.
    use_imdb_multi_class_labels : passed through to the parameter and
        estimator-list helpers.
    dataset : passed through to the parameter and estimator-list helpers.

    Returns
    -------
    list of ``(estimator, Classifier member)`` tuples, in a fixed order that
    does not depend on the order of ``ml_algorithm_list``.
    """
    # (enum member, estimator class, ``options`` attributes forwarded as kwargs)
    single_estimators = (
        (Classifier.ADA_BOOST_CLASSIFIER, AdaBoostClassifier,
         ('random_state',)),
        (Classifier.BERNOULLI_NB, BernoulliNB, ()),
        (Classifier.COMPLEMENT_NB, ComplementNB, ()),
        (Classifier.DECISION_TREE_CLASSIFIER, DecisionTreeClassifier,
         ('random_state',)),
        (Classifier.GRADIENT_BOOSTING_CLASSIFIER, GradientBoostingClassifier,
         ('verbose', 'random_state')),
        (Classifier.K_NEIGHBORS_CLASSIFIER, KNeighborsClassifier,
         ('n_jobs',)),
        (Classifier.LINEAR_SVC, LinearSVC, ('verbose', 'random_state')),
        (Classifier.LOGISTIC_REGRESSION, LogisticRegression,
         ('n_jobs', 'verbose', 'random_state')),
        (Classifier.MULTINOMIAL_NB, MultinomialNB, ()),
        (Classifier.NEAREST_CENTROID, NearestCentroid, ()),
        (Classifier.PASSIVE_AGGRESSIVE_CLASSIFIER,
         PassiveAggressiveClassifier, ('n_jobs', 'verbose', 'random_state')),
        (Classifier.PERCEPTRON, Perceptron,
         ('n_jobs', 'verbose', 'random_state')),
        (Classifier.RANDOM_FOREST_CLASSIFIER, RandomForestClassifier,
         ('n_jobs', 'verbose', 'random_state')),
        (Classifier.RIDGE_CLASSIFIER, RidgeClassifier, ('random_state',)),
    )

    ml_final_list = []

    for member, estimator_cls, option_names in single_estimators:
        if member.name not in ml_algorithm_list:
            continue

        if use_classifiers_with_default_parameters:
            forwarded = {
                option: getattr(options, option) for option in option_names
            }
            ml_final_list.append((estimator_cls(**forwarded), member))
            continue

        best_parameters = get_json_with_best_parameters(
            dataset, member, use_imdb_multi_class_labels)
        # run-time options are written into the tuned-parameter map
        for option in option_names:
            best_parameters[option] = getattr(options, option)
        tuned = estimator_cls(**best_parameters)
        print('\t', tuned)
        ml_final_list.append((tuned, member))

    # 'hard' is majority voting; 'soft' takes the argmax of the summed
    # predicted probabilities.
    voting_ensembles = (
        (Classifier.MAJORITY_VOTING_CLASSIFIER, 'hard', False),
        (Classifier.SOFT_VOTING_CLASSIFIER, 'soft', True),
    )
    for member, voting, soft in voting_ensembles:
        if member.name not in ml_algorithm_list:
            continue
        estimators_list = get_estimators_list(dataset,
                                              options,
                                              use_imdb_multi_class_labels,
                                              is_soft_voting=soft,
                                              is_stacking_classifier=False)
        ensemble = VotingClassifier(estimators=estimators_list,
                                    voting=voting,
                                    n_jobs=options.n_jobs)
        print('\t', ensemble)
        ml_final_list.append((ensemble, member))

    if Classifier.STACKING_CLASSIFIER.name in ml_algorithm_list:
        estimators_list, final_estimator = get_estimators_list(
            dataset,
            options,
            use_imdb_multi_class_labels,
            is_stacking_classifier=True,
            final_estimator=Classifier.LINEAR_SVC.name)
        stack = StackingClassifier(estimators=estimators_list,
                                   final_estimator=final_estimator,
                                   verbose=options.verbose,
                                   n_jobs=options.n_jobs)
        print('\t', stack)
        ml_final_list.append((stack, Classifier.STACKING_CLASSIFIER))

    return ml_final_list
def run_NMC(DataPath,
            LabelsPath,
            CV_RDataPath,
            OutputDir,
            GeneOrderPath="",
            NumGenes=0):
    '''
    Run the baseline nearest-centroid (NMC) classifier on a benchmark dataset.

    The cross-validation folds are taken from an RData file. For every fold
    the classifier is fitted on the training cells and used to predict the
    test cells; true labels, predicted labels, training times and test times
    are written to ``OutputDir`` as ``NMC_true.csv``, ``NMC_pred.csv``,
    ``NMC_training_time.csv`` and ``NMC_test_time.csv``.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique
        barcodes as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData) providing
        ``n_folds``, ``Cells_to_Keep``, ``col_Index``, ``Test_Idx`` and
        ``Train_Idx``.
    OutputDir : Directory the four result files are written to.
    GeneOrderPath : Gene order file path (.csv) with one column of gene
        indices per fold; only read when ``NumGenes`` is positive. Default is
        an empty string.
    NumGenes : Number of leading rows of the gene order file to use as
        features in each fold (integer); 0 (the default) keeps all genes.
    '''
    # Load the R workspace and pull out the fold definitions.
    robjects.r['load'](CV_RDataPath)
    r_env = robjects.r
    fold_count = np.array(r_env['n_folds'], dtype='int')
    keep_mask = np.array(r_env['Cells_to_Keep'], dtype='bool')
    # the indices stored in the RData file are shifted down by one before use
    label_cols = np.array(r_env['col_Index'], dtype='int') - 1
    test_folds = np.array(r_env['Test_Idx'])
    train_folds = np.array(r_env['Train_Idx'])

    # Expression matrix and annotations, restricted to the kept cells.
    expression = pd.read_csv(DataPath, index_col=0, sep=',')
    annotations = pd.read_csv(LabelsPath,
                              header=0,
                              index_col=None,
                              sep=',',
                              usecols=label_cols)
    annotations = annotations.iloc[keep_mask]
    expression = expression.iloc[keep_mask]

    select_genes = NumGenes > 0
    if select_genes:
        gene_order = pd.read_csv(GeneOrderPath,
                                 header=0,
                                 index_col=None,
                                 sep=',')

    # log(1 + x) transform of the expression values
    expression = np.log1p(expression)

    centroid_model = NearestCentroid()

    fit_seconds = []
    predict_seconds = []
    true_labels = []
    predicted_labels = []

    for fold in range(np.squeeze(fold_count)):
        test_rows = np.array(test_folds[fold], dtype='int') - 1
        train_rows = np.array(train_folds[fold], dtype='int') - 1

        x_fit = expression.iloc[train_rows]
        x_eval = expression.iloc[test_rows]
        y_fit = annotations.iloc[train_rows]
        y_eval = annotations.iloc[test_rows]

        if select_genes:
            chosen = gene_order.iloc[0:NumGenes, fold]
            x_fit = x_fit.iloc[:, chosen]
            x_eval = x_eval.iloc[:, chosen]

        started = tm.time()
        centroid_model.fit(x_fit, y_fit)
        fit_seconds.append(tm.time() - started)

        started = tm.time()
        fold_prediction = centroid_model.predict(x_eval)
        predict_seconds.append(tm.time() - started)

        true_labels.extend(y_eval.values)
        predicted_labels.extend(fold_prediction)

    # Build every frame before writing anything.
    frames = [
        ("NMC_true.csv", pd.DataFrame(true_labels)),
        ("NMC_pred.csv", pd.DataFrame(predicted_labels)),
        ("NMC_training_time.csv", pd.DataFrame(fit_seconds)),
        ("NMC_test_time.csv", pd.DataFrame(predict_seconds)),
    ]

    out_dir = Path(OutputDir)
    for file_name, frame in frames:
        frame.to_csv(str(out_dir / Path(file_name)), index=False)
def _sweep_classifier(ml_name, build_clf, params, X_train, Y_train, csv_out,
                      fname, skip_failures=False):
    """Cross-validate ``build_clf(p)`` on both targets for every ``p`` in *params*.

    For each target index ``t`` in ``[0, 1]`` and each parameter value ``p``
    a fresh estimator is built and handed to ``do_cross_validation`` together
    with ``Y_train[t]``. When *skip_failures* is true, a configuration that
    raises is reported on stdout and skipped instead of aborting the sweep.
    """
    for t in [0, 1]:
        for p in params:
            try:
                clf = build_clf(p)
                do_cross_validation(ml_name, clf, X_train, Y_train[t], t,
                                    csv_out, fname, p)
            except Exception as exc:
                # ``Exception`` (not a bare except) so Ctrl-C still stops a
                # long benchmark run.
                if not skip_failures:
                    raise
                print("Skipping", ml_name, "with parameter", p, ":", exc)


def do_classification(clm, data_fname, clm_type):
    """Benchmark a fixed set of classifiers on one csv data file.

    The results are written by ``do_cross_validation`` through a csv writer
    bound to ``output/result_<data file name>_type<clm_type>_<timestamp>.csv``.

    Parameters
    ----------
    clm : sequence of column names to read from the data file. The last two
        entries are the two target columns, all preceding entries are the
        features. It must contain 'heartRate', 'skinTemperature' and 'met',
        which are used to filter rows.
    data_fname : path of the input csv file.
    clm_type : value embedded in the result file name (converted with str()).
    """
    d_name = "output"
    if not os.path.isdir(d_name):
        os.mkdir(d_name)
    fname = os.path.basename(data_fname).replace('.csv', '')
    fn = ('result_' + fname + "_type" + str(clm_type) + "_" +
          datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '.csv')
    csv_out_fnamee = os.path.join(d_name, fn)

    # ``with`` closes the file even when a model raises; ``newline=''`` is
    # what the csv module requires to avoid doubled line endings on Windows.
    with open(csv_out_fnamee, 'w', newline='') as fi:
        csv_out = csv.writer(fi, delimiter=',')

        # Create dataframe for training
        base_df = pd.read_csv(data_fname)
        df = base_df[clm]
        df = df[df['heartRate'] > 40]
        df = df[df['skinTemperature'] > 10]
        df = df[df['met'] > 0.4]
        X_train = df[clm[:-2]]
        Y_train = [df[clm[-2]], df[clm[-1]]]

        # Parameter grids shared by several models.
        int_grid = np.concatenate(
            (np.arange(1, 10), np.arange(10, 20, 2), np.arange(20, 50, 5),
             np.arange(50, 150, 10)))
        float_grid = np.concatenate(
            (np.arange(0.1, 1, 0.2), np.arange(1, 10), np.arange(10, 20, 2),
             np.arange(20, 50, 5), np.arange(50, 150, 10)))
        boosting_grid = np.arange(5, 1000, 20)

        def sweep(ml_name, build_clf, params, skip_failures=False):
            _sweep_classifier(ml_name, build_clf, params, X_train, Y_train,
                              csv_out, fname, skip_failures)

        # NOTE: keywords that merely restated old scikit-learn defaults and
        # have since been removed or renamed (presort, min_impurity_split,
        # base_estimator, algorithm='SAMME.R', loss='deviance') are not
        # passed, so the constructors work on old and current releases alike.

        # Model: Decision Tree
        # NOTE(review): unlike the other grids this one jumps from 90 to 150;
        # kept as is - confirm whether the gap is intended.
        depth_list = np.concatenate(
            (np.arange(1, 10), np.arange(10, 20, 2), np.arange(20, 50, 5),
             np.arange(50, 100, 10), np.arange(150, 1000, 50)))
        sweep('Decision Tree',
              lambda depth: DecisionTreeClassifier(
                  class_weight=None, criterion='entropy', max_depth=depth,
                  max_features=None, max_leaf_nodes=None,
                  min_impurity_decrease=0.0, min_samples_leaf=1,
                  min_samples_split=2, min_weight_fraction_leaf=0.0,
                  random_state=None, splitter='best'),
              depth_list)

        # Model: Extra Tree Classifier (single configuration, reported as 0)
        sweep('Extremely randomized tree classifier',
              lambda _: ExtraTreeClassifier(), [0])

        # Model: Gaussian
        sweep('Gaussian Naive Bayes', lambda _: GaussianNB(priors=None), [0])

        # Model: Multivariate Bernoulli Model
        sweep('Multivariate Bernoulli Model',
              lambda a: BernoulliNB(alpha=a), float_grid)

        # Model: AdaBoost Classifier
        sweep('AdaBoost classifier',
              lambda n: AdaBoostClassifier(learning_rate=0.1, n_estimators=n,
                                           random_state=None),
              boosting_grid)

        # Model: Gradient Boosting Classifier
        sweep('Gradient Boosting Classifier',
              lambda n: GradientBoostingClassifier(
                  criterion='friedman_mse', init=None, learning_rate=0.1,
                  max_depth=4, max_features=None, max_leaf_nodes=None,
                  min_impurity_decrease=0.0, min_samples_leaf=1,
                  min_samples_split=2, min_weight_fraction_leaf=0.0,
                  n_estimators=n, random_state=None, subsample=1.0,
                  verbose=0, warm_start=False),
              boosting_grid)

        # Model: Random Forest Classifier
        sweep('Random Forest Classifier',
              lambda n: RandomForestClassifier(n_estimators=n),
              np.concatenate((int_grid, np.arange(150, 1000, 50))))

        # Model: Support Vector Machines, one sweep per kernel
        sweep('Support Vector Machines - RBF',
              lambda c: SVC(C=c, kernel='rbf'), float_grid)
        sweep('Support Vector Machines - poly',
              lambda c: SVC(C=c, kernel='poly'), float_grid)
        sweep('Support Vector Machines - Sigmoid',
              lambda c: SVC(C=c, kernel='sigmoid'), float_grid)
        sweep('Support Vector Machines - Linear',
              lambda c: SVC(C=c, kernel='linear'), float_grid)

        # Model: KNeighborsClassifier (best effort: failing settings are skipped)
        sweep('KNeighborsClassifier',
              lambda n: KNeighborsClassifier(n_neighbors=n), int_grid,
              skip_failures=True)

        # Model: Radius Neighbors Classifier (best effort as well)
        sweep('Radius Neighbors Classifier',
              lambda n: RadiusNeighborsClassifier(radius=n), int_grid,
              skip_failures=True)

        # Model: NearestCentroid
        sweep('Nearest Centroid Classifier', lambda _: NearestCentroid(), [0])
# Target column
y = df["default.payment.next.month"]

# ----------------------------------------------------
# Hold out 33 % of the rows for testing (shuffled, fixed seed)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=44,
    test_size=0.33,
)

# ----------------------------------------------------
# Fit a NearestCentroid model
# (the variable is named "NearestNeighbors", but the estimator is NearestCentroid)

NearestNeighbors = NearestCentroid().fit(X_train, y_train)

# Report accuracy on both splits and the classes seen during fit
print("NearestNeighbors Train Score is : ", NearestNeighbors.score(X_train, y_train))
print("NearestNeighbors Test Score is : ", NearestNeighbors.score(X_test, y_test))
print("NearestNeighbors Classes are : ", NearestNeighbors.classes_)
print("----------------------------------------------------")

# Predict the held-out rows and show the first ten predictions
y_pred = NearestNeighbors.predict(X_test)
print("Predicted Value for NearestNeighbors is : ", y_pred[:10])

# ----------------------------------------------------
# Confusion matrix of true vs. predicted test labels

CM = confusion_matrix(y_test, y_pred)
def all_classifier_models():
    """Fit a catalogue of scikit-learn style estimators and collect their scores.

    The function reads the module-level ``X_train``, ``y_train``, ``X_test``
    and ``y_test``. Every estimator is fitted on the training split; an
    estimator that raises at any step is reported on stdout and left out of
    the results.

    Returns
    -------
    metrix : list of ``[name, train_acc, test_acc]`` rows, where ``train_acc``
        is ``model.score`` on the training split in percent rounded to two
        decimals and ``test_acc`` is the unrounded test accuracy in percent.
    test_accuracy : list of the ``test_acc`` values, same order as ``metrix``.
    names : list of the estimator names, same order as ``metrix``.

    NOTE(review): some entries (OneClassSVM, GaussianMixture,
    BayesianGaussianMixture) are not classifiers, so their ``score`` is
    presumably not an accuracy - confirm they belong in this list.
    """
    estimators = [
        ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
        ('svr', make_pipeline(StandardScaler(), LinearSVC(random_state=42))),
    ]
    clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf3 = GaussianNB()

    models = [
        ('LogisticRegression', LogisticRegression(solver='liblinear', multi_class='ovr')),
        ('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()),
        ('KNeighborsClassifier', KNeighborsClassifier()),
        ('DecisionTreeClassifier', DecisionTreeClassifier()),
        ('GaussianNB', GaussianNB()),
        ('RandomForestClassifier', RandomForestClassifier(n_estimators=100)),
        ('SVM', SVC(gamma='auto')),
        ('Linear_SVM', LinearSVC()),
        ('XGB', XGBClassifier()),
        ('SGD', SGDClassifier()),
        ('Perceptron', Perceptron()),
        ('ExtraTreeClassifier', ExtraTreeClassifier()),
        ('OneClassSVM', OneClassSVM(gamma='auto')),
        ('NuSVC', NuSVC()),
        ('MLPClassifier', MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)),
        ('RadiusNeighborsClassifier', RadiusNeighborsClassifier(radius=2.0)),
        ('OutputCodeClassifier',
         OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),
                              random_state=0)),
        ('OneVsOneClassifier',
         OneVsOneClassifier(estimator=RandomForestClassifier(random_state=1))),
        ('OneVsRestClassifier',
         OneVsRestClassifier(estimator=RandomForestClassifier(random_state=1))),
        ('LogisticRegressionCV', LogisticRegressionCV()),
        ('RidgeClassifierCV', RidgeClassifierCV()),
        ('RidgeClassifier', RidgeClassifier()),
        ('PassiveAggressiveClassifier', PassiveAggressiveClassifier()),
        ('GaussianProcessClassifier', GaussianProcessClassifier()),
        ('HistGradientBoostingClassifier', HistGradientBoostingClassifier()),
        ('StackingClassifier',
         StackingClassifier(estimators=estimators,
                            final_estimator=LogisticRegression())),
        ('VotingClassifier',
         VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                          voting='hard')),
        ('AdaBoostClassifier', AdaBoostClassifier()),
        ('GradientBoostingClassifier', GradientBoostingClassifier()),
        ('BaggingClassifier', BaggingClassifier()),
        ('ExtraTreesClassifier', ExtraTreesClassifier()),
        ('CategoricalNB', CategoricalNB()),
        ('ComplementNB', ComplementNB()),
        ('BernoulliNB', BernoulliNB()),
        ('MultinomialNB', MultinomialNB()),
        ('CalibratedClassifierCV', CalibratedClassifierCV()),
        ('LabelPropagation', LabelPropagation()),
        ('LabelSpreading', LabelSpreading()),
        ('NearestCentroid', NearestCentroid()),
        ('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()),
        ('GaussianMixture', GaussianMixture()),
        ('BayesianGaussianMixture', BayesianGaussianMixture()),
    ]

    metrix = []
    test_accuracy = []
    names = []
    for name, model in models:
        try:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            train_acc = round(model.score(X_train, y_train) * 100, 2)
            test_acc = metrics.accuracy_score(y_test, y_pred) * 100
        except Exception as exc:
            # Best effort: report why the estimator was dropped and move on.
            # ``Exception`` rather than a bare except so Ctrl-C still aborts.
            print("Exception Occurred :", name, "-", repr(exc))
            continue
        # Results are recorded only after every step succeeded, so the three
        # lists always stay aligned.
        test_accuracy.append(test_acc)
        names.append(name)
        metrix.append([name, train_acc, test_acc])
    return metrix, test_accuracy, names