def get_user_feature(feature_type, behavior, num_feature=800):
    """Select the most discriminative columns of one feature family and save them.

    The training features returned by ``get_features(feature_type, behavior)``
    are reduced with ``SelectPercentile(f_classif)`` fitted against the ``type``
    column of ``data/train_Y_<behavior>.csv``; the same fitted selector is then
    applied to the test features. Both reduced matrices are written to CSV
    under ``data/``, indexed like the matrices they came from.

    ``num_feature`` is accepted for compatibility but is not used.
    """
    # Percentage of columns kept per feature family; any other family gets 0.
    keep_percent = {'cat_id': 60, 'brand_id': 15, 'seller_id': 20}

    train_matrix = get_features(feature_type, behavior)
    train_ids = train_matrix.index

    # Reduce the dimensionality of X.
    labels = pd.read_csv('data/train_Y_%d.csv' % behavior, index_col='user_id')['type']
    print('start selectKbest...')
    select = SelectPercentile(f_classif, percentile=keep_percent.get(feature_type, 0))
    select.fit(train_matrix, labels)
    reduced_train = select.transform(train_matrix)
    print('end select...')

    print('write %s features to train file' % feature_type)
    train_feature_file_name = 'data/train_feature_%s_%d.csv' % (feature_type, behavior)
    DataFrame(reduced_train, index=train_ids).to_csv(train_feature_file_name)

    # Reduce the matching test-set data using the same selected columns.
    test_matrix = get_features(feature_type, behavior, is_train=False)
    test_ids = test_matrix.index
    reduced_test = select.transform(test_matrix)

    # Write to file.
    print('write %s features to test file' % feature_type)
    test_feature_file_name = 'data/test_feature_%s_%d.csv' % (feature_type, behavior)
    DataFrame(reduced_test, index=test_ids).to_csv(test_feature_file_name)
    print('end....')
def preprocess(words_file = "word_data.pkl", authors_file="email_authors.pkl"):
    """Load pickled texts and labels and return selected tf-idf train/test arrays.

    Steps:
      -- unpickle the labels from ``authors_file`` and the texts from ``words_file``
      -- hold out 10% of the samples as a test set (random_state=42)
      -- tf-idf vectorize, fitting on the training texts only
      -- keep the top 10% of features ranked by ``f_classif``

    Returns ``(features_train, features_test, labels_train, labels_test)``;
    both feature matrices are converted to dense arrays.
    """
    # `with` closes the handles even when unpickling raises.
    # NOTE(review): the files are still opened in text mode ("r") as before;
    # binary pickles would need "rb" -- confirm against the data files.
    with open(authors_file, "r") as authors_file_handler:
        authors = pickle.load(authors_file_handler)
    with open(words_file, "r") as words_file_handler:
        word_data = cPickle.load(words_file_handler)

    # test_size is the fraction of samples assigned to the test set.
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        word_data, authors, test_size=0.1, random_state=42)

    # Text vectorization: strings -> tf-idf weights.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    # Feature selection, fitted on the training split only.
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    # Info on the data.
    print("no. of Enrique training emails: %s" % sum(labels_train))
    print("no. of Juan training emails: %s" % (len(labels_train) - sum(labels_train)))

    return features_train_transformed, features_test_transformed, labels_train, labels_test
def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """
        this function takes a pre-made list of email texts (by default word_data.pkl)
        and the corresponding authors (by default email_authors.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features

        after this, the features and labels are put into numpy arrays, which play nice with sklearn functions

        4 objects are returned:
            -- training/testing features
            -- training/testing labels
    """
    # `with` closes the handles even when unpickling raises.
    # NOTE(review): the files are still opened in text mode ("r") as before;
    # binary pickles would need "rb" -- confirm against the data files.
    with open(authors_file, "r") as authors_file_handler:
        authors = pickle.load(authors_file_handler)
    with open(words_file, "r") as words_file_handler:
        word_data = cPickle.load(words_file_handler)

    # test_size is the fraction of samples assigned to the test set.
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        word_data, authors, test_size=0.1, random_state=42)

    # Text vectorization: strings -> tf-idf weights.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    # Feature selection. The original 10% setting was replaced by 1%
    # ("Temporary hack for Lesson 3" in the source).
    selector = SelectPercentile(f_classif, percentile=1)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    # Info on the data.
    print("no. of Chris training emails: %s" % sum(labels_train))
    print("no. of Sara training emails: %s" % (len(labels_train) - sum(labels_train)))

    return features_train_transformed, features_test_transformed, labels_train, labels_test
def predict(classifier_type="tree",selection="Univariate", f="1"):
    """Train a classifier on the training matrix and predict the imputed test data.

    classifier_type -- "SVM" (linear-kernel SVC) or "NB" (GaussianNB). The
        default "tree" has no implementation in this function and is rejected.
    selection -- "Univariate" (SelectPercentile with f_classif) or "kclusters"
        (column indices loaded from a pickled feature list).
    f -- preset "1", "2" or "3"; picks the feature-list pickle and the
        univariate percentile (1, 5 or 25).

    Returns the predictions for the rows of the imputed test data pickle.

    Raises ValueError for an unsupported option. Previously these cases failed
    later with NameError/AttributeError, after the data had been loaded.
    """
    # preset -> (k-means feature-list pickle, univariate percentile)
    presets = {
        "1": (r"GS_pickles\kmeans_Genes_87_1x_v3.pkl", 1),
        "2": (r"GS_pickles\kmeans_Genes_433_50x_v2.pkl", 5),
        "3": (r"GS_pickles\kmeans_Genes_2163_20x_v1.pkl", 25),
    }
    if f not in presets:
        raise ValueError("unsupported preset f=%r (expected '1', '2' or '3')" % (f,))
    if selection not in ("Univariate", "kclusters"):
        raise ValueError("unsupported selection %r (expected 'Univariate' or 'kclusters')" % (selection,))
    if classifier_type not in ("SVM", "NB"):
        raise ValueError("unsupported classifier_type %r (expected 'SVM' or 'NB')" % (classifier_type,))
    kc_fn, p = presets[f]
    kernel_type = "linear"

    (data_matrix, _features, _samples) = readData()
    x = data_matrix.data
    y = data_matrix.target
    (m, n) = x.shape

    test = joblib.load(r"GS_pickles\imputed_test_data.pkl")
    test_x = np.array(test)
    (i, j) = test_x.shape

    print("Training matrix shape: %s,%s" % (m, n))
    print("Test matrix shape: %s,%s" % (i, j))

    if selection == "Univariate":
        selector = SelectPercentile(f_classif, percentile=p)
        selector.fit(x, y)
        # Trim both matrices to the selected percentage of the features.
        trimmed_x = selector.transform(x)
        trimmed_test_x = selector.transform(test_x)
    else:
        # "kclusters": keep the columns listed in the pickled feature list.
        kcluster_flist = joblib.load(kc_fn)
        trimmed_x = np.take(x, kcluster_flist, axis=1)
        trimmed_test_x = np.take(test_x, kcluster_flist, axis=1)

    if classifier_type == "SVM":
        # Linear SVM classifier.
        clf = svm.SVC(kernel=kernel_type, degree=3, probability=True)
    else:
        # Gaussian Naive Bayes classifier.
        clf = GaussianNB()

    clf.fit(trimmed_x, y)
    return clf.predict(trimmed_test_x)
def eval(ds, testNum, p, splitProportion=0.2):
    """Compare AdaBoost with and without univariate feature selection.

    For ``testNum`` random splits of ``ds`` (``splitProportion`` goes to the
    test part) every label column that is not all zeros in the training part
    is scored twice: once on the ``p`` percent of features chosen by
    ``SelectPercentile(chooser)`` and once on all features.

    Returns the 7-tuple
    ``(mean F1 all, mean F1 selected, mean recall all, mean recall selected,
    mean precision all, mean precision selected, classifier name)``.
    When no label column was evaluated the means are NaN and the name is
    ``None`` (this case used to fail with an unbound ``name``).
    """
    allFeaturesF1, allFeaturesRecall, allFeaturesPrecision = [], [], []
    featureSelectedF1, featureSelectedRecall, featureSelectedPrecision = [], [], []
    name = None  # stays None if the loops below never evaluate a column

    for _ in range(testNum):
        tstdata, trndata = ds.splitWithProportion(splitProportion)
        X, Y = labanUtil.fromDStoXY(trndata)
        X_test, Y_test = labanUtil.fromDStoXY(tstdata)
        for y, y_test in zip(Y, Y_test):
            # Skip label columns with no positive training example.
            if all(v == 0 for v in y):
                continue
            clf = AdaBoostClassifier()
            selector = SelectPercentile(chooser, percentile=p)
            selector.fit(X, y)
            # Class name taken from the estimator's string form.
            name = str(clf).split()[0].split('(')[0]

            # Score on the selected features.
            clf.fit(selector.transform(X), y)
            pred = clf.predict(selector.transform(X_test))
            featureSelectedF1.append(metrics.f1_score(y_test, pred))
            featureSelectedRecall.append(metrics.recall_score(y_test, pred))
            featureSelectedPrecision.append(metrics.precision_score(y_test, pred))

            # Score on all features with the same estimator, refitted.
            clf.fit(X, y)
            pred = clf.predict(X_test)
            allFeaturesF1.append(metrics.f1_score(y_test, pred))
            allFeaturesRecall.append(metrics.recall_score(y_test, pred))
            allFeaturesPrecision.append(metrics.precision_score(y_test, pred))

    return (np.mean(allFeaturesF1), np.mean(featureSelectedF1),
            np.mean(allFeaturesRecall), np.mean(featureSelectedRecall),
            np.mean(allFeaturesPrecision), np.mean(featureSelectedPrecision),
            name)
def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """
        this function takes a pre-made list of email texts (by default word_data.pkl)
        and the corresponding authors (by default email_authors.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features

        after this, the features and labels are put into numpy arrays, which play nice with sklearn functions

        4 objects are returned:
            -- training/testing features
            -- training/testing labels
    """
    # `with` closes the handles even when unpickling raises.
    # NOTE(review): the files are still opened in text mode ("r") as before;
    # binary pickles would need "rb" -- confirm against the data files.

    # Read the vector of labels/authors from file.
    with open(authors_file, "r") as authors_file_handler:
        authors = pickle.load(authors_file_handler)

    # Read the vector of documents from file.
    with open(words_file, "r") as words_file_handler:
        word_data = cPickle.load(words_file_handler)

    # test_size is the fraction of samples assigned to the test set;
    # features_train / features_test are vectors of sentences.
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        word_data, authors, test_size=0.1, random_state=42)

    # Text vectorization: strings -> tf-idf weights. The test texts are only
    # transformed, so they reuse the idf computed on the training texts.
    # Each result is a sparse documents-by-terms matrix.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    # Feature selection: keep the top 10% of columns by univariate F-test,
    # then take the same columns from both matrices.
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    return features_train_transformed, features_test_transformed, labels_train, labels_test
def reduce(self,percent_taken):
    """Keep the best ``percent_taken`` percent of features in both data sets.

    A chi2-scored ``SelectPercentile`` is fitted on ``self.train_set`` and
    ``self.Y``; if chi2 rejects the data with ValueError (it requires
    non-negative features) the ANOVA F-value scorer is used instead. The
    fitted selector is stored in ``self.fitted_reductor`` and applied to
    ``self.train_set`` and ``self.test_set`` in place.
    """
    try:
        fited = SelectPercentile(chi2, percentile=percent_taken).fit(self.train_set, self.Y)
    except ValueError:
        # Only the "invalid input for chi2" case falls back; anything else
        # (including KeyboardInterrupt) now propagates instead of being hidden.
        fited = SelectPercentile(f_classif, percentile=percent_taken).fit(self.train_set, self.Y)
    self.fitted_reductor = fited
    self.train_set = fited.transform(self.train_set)
    self.test_set = fited.transform(self.test_set)
    # shape[1] works for dense and sparse results and for an empty test set.
    print('number of featute(s) selected: {0}\n'.format(self.test_set.shape[1]))
def make_train_test(df_train, df_test):
    """Turn the 'Phrase' columns into selected bag-of-words matrices.

    The count vectorizer and the 50% ``f_classif`` selector are both fitted on
    the training frame only ('Phrase' texts, 'Sentiment' labels) and then
    applied to the test frame.

    Returns ``(selected train features, train labels, selected test features)``.
    """
    bag_of_words = CountVectorizer()
    train_counts = bag_of_words.fit_transform(df_train['Phrase'].values)
    train_labels = df_train['Sentiment'].values
    test_counts = bag_of_words.transform(df_test['Phrase'].values)

    top_half = SelectPercentile(f_classif, percentile=50)
    top_half.fit(train_counts, train_labels)
    train_selected = top_half.transform(train_counts)
    test_selected = top_half.transform(test_counts)
    return train_selected, train_labels, test_selected
def preprocess_4(article_file, lable_file):
    """Load pickled articles and labels and return selected tf-idf train/test arrays.

    The labels are encoded to integers with ``LabelEncoder``, 10% of the
    samples are held out as a test set (random_state=42), the texts are tf-idf
    vectorized (fitted on the training part) and the top 30% of features by
    ``f_classif`` are kept.

    Returns ``(features_train, features_test, labels_train, labels_test)``;
    both feature matrices are converted to dense arrays.
    """
    # `with` closes the files; the original left both handles open.
    with open(article_file) as article_handle:
        features = np.array(pickle.load(article_handle))

    # Transform non-numerical labels (as long as they are hashable and
    # comparable) to numerical labels.
    with open(lable_file) as lable_handle:
        lables = pickle.load(lable_handle)
    le = preprocessing.LabelEncoder()
    le.fit(lables)
    lables = le.transform(lables)

    # test_size is the fraction of samples assigned to the test set.
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        features, lables, test_size=0.1, random_state=42)

    # Text vectorization: strings -> tf-idf weights.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    # Feature selection, fitted on the training split only.
    selector = SelectPercentile(f_classif, percentile=30)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    return features_train_transformed, features_test_transformed, labels_train, labels_test
def main():
    """Run the KernelPCA -> SelectPercentile -> KNN experiment on the .npy data.

    Loads the four train/test arrays, saves the image of training sample 14
    before and after a KernelPCA round trip, keeps the top 5% of the KPCA
    components by ``f_classif``, grid-searches ``n_neighbors`` in 1..15 for a
    distance-weighted KNN and prints accuracy, confusion matrix, counts and
    the elapsed time in minutes.
    """
    # Set the timer.
    started = time.time()

    # Load the data.
    trainX = np.load('trainX.npy')
    testX = np.load('testX.npy')
    trainY = np.load('trainY.npy')
    testY = np.load('testY.npy')
    print('\n!!! Data Loading Completed !!!\n')

    # Save the sample image as loaded; its label becomes part of the file name.
    sample = 14
    label_tag = str(trainY[sample])
    plt.imshow(trainX[sample].reshape(28, 28), cmap=cm.Greys_r)
    plt.savefig("original" + label_tag + ".png")

    # Apply kernel PCA, fitted on the first 3000 training rows.
    kpca = KernelPCA(kernel='rbf', gamma=1, fit_inverse_transform=True)
    kpca.fit(trainX[0:3000])
    trainX_kpca = kpca.transform(trainX)
    testX_kpca = kpca.transform(testX)

    # Inverse transform and save the reconstruction of the same sample.
    reconstructed = kpca.inverse_transform(trainX_kpca)
    plt.imshow(reconstructed[sample].reshape(28, 28), cmap=cm.Greys_r)
    plt.savefig("reconstructed" + label_tag + ".png")

    # Keep the most discriminative KPCA components.
    selector = SelectPercentile(f_classif, percentile=5)
    selector.fit(trainX_kpca, trainY)
    trainX = selector.transform(trainX_kpca)
    testX = selector.transform(testX_kpca)

    # Fit a classifier.
    search_space = {'n_neighbors': list(np.arange(1, 16))}
    clf = GridSearchCV(KNeighborsClassifier(weights='distance', n_jobs=-1), search_space)
    clf.fit(trainX, trainY)

    pred = clf.predict(testX)
    print(accuracy_score(testY, pred))
    print(confusion_matrix(testY, pred))
    n_correct = np.sum(pred == testY)
    n_incorrect = np.sum(pred != testY)
    print('total : %d, correct : %d, incorrect : %d\n' % (len(pred), n_correct, n_incorrect))

    print('Test Time : %f Minutes\n' % ((time.time() - started) / 60))
def trainingPreprocess(words_file, authors_file):
    """
        this function takes a pickled list of email texts (words_file)
        and the corresponding pickled authors (authors_file) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features

        after this, the features and labels are put into numpy arrays, which play nice with sklearn functions

        6 objects are returned:
            -- training/testing features
            -- training/testing labels
            -- a fitted vectorizer
            -- a fitted selector
    """
    # `with` closes the handles even when unpickling raises.
    # NOTE(review): the files are still opened in text mode ("r") as before;
    # binary pickles would need "rb" -- confirm against the data files.
    with open(authors_file, "r") as authors_file_handler:
        authors = pickle.load(authors_file_handler)
    with open(words_file, "r") as words_file_handler:
        word_data = cPickle.load(words_file_handler)

    # test_size is the fraction of samples assigned to the test set.
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        word_data, authors, test_size=0.1, random_state=42)

    # Text vectorization: strings -> tf-idf weights.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    # Feature selection, fitted on the training split only.
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    return features_train_transformed, features_test_transformed, labels_train, labels_test, vectorizer, selector
def selectFeatures(features, labels, features_list):
    '''
    Select the top 20 percent of the features by ANOVA F-score. Return
    a list of the selected feature names and a dataframe ranking every feature
    by its normalised -log10(p-value)
    features: numpy array with the features to be used to test sklearn models
    labels: numpy array with the real output
    features_list: a list of names of each feature
    '''
    # Feature selection.
    selector = SelectPercentile(f_classif, percentile=20)
    selector.fit(features, labels)
    # Names of the columns the selector keeps.
    l_rtn = [name for name, kept in zip(features_list, selector.get_support()) if kept]
    # Scores: -log10(p), scaled so the best feature gets 1.
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    df_rtn = pd.DataFrame(pd.Series(dict(zip(features_list, scores))))
    df_rtn.columns = ["pValue_Max"]
    # DataFrame.sort no longer exists in pandas; sort_values replaces it.
    df_rtn = df_rtn.sort_values("pValue_Max", ascending=False)
    return l_rtn, df_rtn
def preprocess(article_file, lable_file, k):
    """Vectorize all pickled articles and keep the top ``k`` percent of features.

    No train/test split is made here: the label encoder, the tf-idf vectorizer
    and the ``f_classif`` selector are all fitted on the full data.

    Returns ``(selected dense features, encoded labels, vectorizer, selector,
    label encoder, raw article array)``.
    """
    # `with` closes the files; the original left both handles open.
    with open(article_file) as article_handle:
        features = np.array(pickle.load(article_handle))

    # Transform non-numerical labels (as long as they are hashable and
    # comparable) to numerical labels.
    with open(lable_file) as lable_handle:
        lables = pickle.load(lable_handle)
    le = preprocessing.LabelEncoder()
    le.fit(lables)
    lables = le.transform(lables)

    # Text vectorization: strings -> tf-idf weights.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features)

    # Univariate selection with the ANOVA F-test.
    selector = SelectPercentile(f_classif, percentile=k)
    selector.fit(features_train_transformed, lables)
    features_train_transformed = selector.transform(features_train_transformed).toarray()

    return features_train_transformed, lables, vectorizer, selector, le, features
def main():
    """Write correlation statistics and the feature-selected train/test CSVs.

    Reads ``../data/train.csv``, writes per-variable variance and correlation
    with TARGET to ``../presentationDocs/corrs.csv``, keeps the top 25% of the
    predictors by ``f_classif`` and writes the reduced training data, the
    target column and the equally reduced ``../data/test.csv`` rows to CSV.
    """
    main_data = pd.read_csv('../data/train.csv', index_col='ID')

    output = []
    for x in main_data.columns:
        # .loc replaces the deprecated .ix indexer; the correlation is
        # computed once and reused for the absolute value.
        column = main_data.loc[:, x]
        corr = round(column.corr(main_data.TARGET), 4)
        output.append({
            'variable': x,
            'variance': column.var(),
            'corr_w_target': corr,
            'abs_corr': abs(corr),
        })

    # Print csv for later in the presentation docs.
    variable_selector = pd.DataFrame(output)
    variable_selector = variable_selector.set_index('variable')
    variable_selector = variable_selector.drop('TARGET')
    variable_selector.sort_values('abs_corr', ascending=False).to_csv('../presentationDocs/corrs.csv')

    selector = SelectPercentile(f_classif, percentile=25)
    subset = pd.DataFrame(selector.fit_transform(main_data.drop('TARGET', axis=1), main_data['TARGET']))
    subset.to_csv('../data/main_data.csv', index=False)
    # `columns` is the current name of the keyword formerly spelled `cols`.
    main_data[['TARGET']].to_csv('../data/target.csv', columns=['TARGET'], index=False)

    # Print transformed test data to csv.
    test_data = pd.read_csv('../data/test.csv', index_col='ID')
    test_data = pd.DataFrame(selector.transform(test_data), index=test_data.index)
    test_data.to_csv('../data/test_transform.csv', index=True, index_label='ID')
def select_features(X,y):
    """Fit a 10% ``f_classif`` selector on (X, y).

    Returns ``(reduced X, fitted selector)``; progress is printed before the
    fit and before the transform.
    """
    top_decile = SelectPercentile(f_classif, percentile=10)
    print("fit selector")
    top_decile.fit(X, y)
    print("transform features")
    return top_decile.transform(X), top_decile
def train_type_model():
    """Build (n-gram, notable type) training pairs and fit the type scorer.

    For every evaluated query of each dataset one example is created per
    notable type seen among the gold answers or the candidate predictions;
    the label is 1 when the type belongs to a gold answer entity, else 0.
    The raw (features, labels) pair is pickled to "type_model_data.pickle"
    and the fitted (vectorizer, classifier) pair to "type-model.pickle".
    """
    # NOTE(review): `globals` here is used as a project module (it has
    # read_configuration/get_parser), which shadows the builtin of that name.
    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()
    datasets = ["webquestions_split_train", ]
    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False
    feature_extractor = FeatureExtractor(False, False, n_gram_types_features=True)
    features = []
    labels = []
    for dataset in datasets:
        queries = get_evaluated_queries(dataset, True, parameters)
        # `index` is not used in the loop body.
        for index, query in enumerate(queries):
            # Lemmas of the parsed utterance feed the n-gram features.
            tokens = [token.lemma for token in parser.parse(query.utterance).tokens]
            n_grams = get_grams_feats(tokens)
            # Entity ids for every gold answer string.
            answer_entities = [mid for answer in query.target_result for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
            # Notable types of the gold entities; falsy results are dropped.
            correct_notable_types = set(filter(lambda x: x, [KBEntity.get_notable_type(entity_mid) for entity_mid in answer_entities]))
            other_notable_types = set()
            for candidate in query.eval_candidates:
                entities = [mid for entity_name in candidate.prediction for mid in KBEntity.get_entityid_by_name(entity_name, keep_most_triples=True)]
                # Unlike the gold types above, falsy results are not filtered here.
                other_notable_types.update(set([KBEntity.get_notable_type(entity_mid) for entity_mid in entities]))
            incorrect_notable_types = other_notable_types.difference(correct_notable_types)
            # One example per type; the loop variable shadows the builtin `type`.
            for type in correct_notable_types.union(incorrect_notable_types):
                if type in correct_notable_types:
                    labels.append(1)
                else:
                    labels.append(0)
                features.append(feature_extractor.extract_ngram_features(n_grams, [type, ], "type"))
    # Keep the raw examples so the model can be refitted without re-extraction.
    with open("type_model_data.pickle", 'wb') as out:
        pickle.dump((features, labels), out)
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(features)
    # Keep the top 5% of features by chi2 and restrict the vectorizer to the
    # same columns so that it matches the reduced matrix.
    feature_selector = SelectPercentile(chi2, percentile=5).fit(X, labels)
    vec.restrict(feature_selector.get_support())
    X = feature_selector.transform(X)
    # NOTE(review): class_weight='auto' and n_iter look like options of an
    # older scikit-learn release -- confirm against the installed version.
    type_scorer = SGDClassifier(loss='log', class_weight='auto', n_iter=1000, alpha=1.0, random_state=999, verbose=5)
    type_scorer.fit(X, labels)
    with open("type-model.pickle", 'wb') as out:
        pickle.dump((vec, type_scorer), out)
def preprocess(words_file="../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """
    Take a pre-made list of email texts (by default word_data.pkl) and the corresponding authors
    (by default email_authors.pkl) and preprocesses them.

    Preprocessor steps:
        - split into training/testing sets (10% testing)
        - vectorize into tfidf matrix
        - select/keep most helpful features

    After this, the features and labels are put into numpy arrays, which play nice with sklearn functions

    A tfidf matrix is defined as TF(t)*IDF(t) where
        TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).
        IDF(t) = log_e(Total number of documents / Number of documents with term t in it).

    4 objects are returned:
        -- training/testing features
        -- training/testing labels
    """
    print('words_file = {}'.format(words_file))
    # `with` closes the files; the original left both handles open.
    with open(words_file, "rb") as words_handle:
        word_data = pickle.load(words_handle)
    with open(authors_file, "rb") as authors_handle:
        authors = pickle.load(authors_handle)

    # test_size is the fraction of samples assigned to the test set.
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        word_data, authors, test_size=0.1, random_state=42)

    # Text vectorization: strings -> tf-idf weights.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    # Feature selection, fitted on the training split only.
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    # Info on the data.
    print("no. of Chris training emails:", sum(labels_train))
    print("no. of Sara training emails:", len(labels_train) - sum(labels_train))

    return (numpy.array(features_train_transformed), numpy.array(features_test_transformed),
            numpy.array(labels_train), numpy.array(labels_test))
def feature_transform(features_train, features_test, top_percent=1, labels=None):
    """
    Apply a bag-of-words feature creator with tf-idf normalisation.

    features_train, features_test -- train and test texts.
    top_percent -- percentage of the high dimensional text feature space to
        keep (default is 1%).
    labels -- training labels aligned with features_train, used to score the
        features. When omitted, the module-level name ``labels_train`` is
        used, which is what this function silently depended on before the
        parameter existed.

    Returns the transformed train and test feature arrays, suitable for use
    with sklearn classifiers.
    """
    # Falls back to the global for backward compatibility (NameError if absent).
    train_labels = labels_train if labels is None else labels

    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    # Feature selection, because text is super high dimensional.
    selector = SelectPercentile(f_classif, percentile=top_percent)
    selector.fit(features_train_transformed, train_labels)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    return features_train_transformed, features_test_transformed
def univariate_feature_selection(dataset, features):
    """Plot univariate F-test scores next to linear-SVM weights per feature.

    ``dataset`` is the feature matrix; the targets come from the project
    spreadsheet. Three bar series are drawn: normalised -log10(p-value) of the
    F-test, normalised squared SVM weights on all features, and the same
    weights for an SVM trained on the 10% selected features. ``features``
    supplies the x tick labels.
    """
    def _normalised_sq_weights(fitted_svm):
        # Squared coefficients summed over classes, scaled to a maximum of 1.
        weights = (fitted_svm.coef_ ** 2).sum(axis=0)
        weights /= weights.max()
        return weights

    # Load the dataset.
    spreadsheet = Spreadsheet('../../Downloads/ip/project data.xlsx')
    data = Data(spreadsheet)
    X = dataset
    y = data.targets

    plt.figure(1)
    plt.clf()
    X_indices = np.arange(X.shape[-1])

    # Univariate feature selection with F-test for feature scoring;
    # the 10% most significant features are kept.
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(X, y)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    plt.bar(X_indices - .45, scores, width=.2,
            label=r'Univariate score ($-Log(p_{value})$)', color='g')

    # Compare to the weights of an SVM.
    clf = svm.SVC(kernel='linear')
    clf.fit(X, y)
    plt.bar(X_indices - .25, _normalised_sq_weights(clf), width=.2,
            label='SVM weight', color='r')

    clf_selected = svm.SVC(kernel='linear')
    clf_selected.fit(selector.transform(X), y)
    plt.bar(X_indices[selector.get_support()] - .05, _normalised_sq_weights(clf_selected),
            width=.2, label='SVM weights after selection', color='b')

    tick_positions = np.arange(len(features))
    plt.title("Comparing feature selection")
    plt.xlabel('Feature number')
    plt.xticks(tick_positions, features, rotation=45)
    plt.yticks(())
    plt.legend(loc='upper right')
    plt.show()
def preprocess(X,y):
    """Split, tf-idf vectorize and feature-select the texts ``X`` with labels ``y``.

    20% of the samples are held out (random_state=42). The vectorizer and the
    10% ``f_classif`` selector are fitted on the training part and dumped with
    joblib to 'vectorizer_intent.pkl' and 'selector_intent.pkl'.

    Returns ``(features_train, features_test, labels_train, labels_test)`` with
    dense feature arrays.
    """
    split = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)
    texts_train, texts_test, labels_train, labels_test = split

    # Text vectorization: strings -> tf-idf weights.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    train_tfidf = vectorizer.fit_transform(texts_train)
    test_tfidf = vectorizer.transform(texts_test)
    joblib.dump(vectorizer, 'vectorizer_intent.pkl')

    # Feature selection, because text is super high dimensional.
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(train_tfidf, labels_train)
    joblib.dump(selector, 'selector_intent.pkl')

    train_dense = selector.transform(train_tfidf).toarray()
    test_dense = selector.transform(test_tfidf).toarray()
    return train_dense, test_dense, labels_train, labels_test
def Preprocess(words_file="/home/mohamed/python/sherlok-tools/helpers/word_data.pkl", labels_file="/home/mohamed/python/sherlok-tools/helpers/label_data.pkl"):
    """
        this function takes a pre-made list of data texts (by default word_data.pkl)
        and the corresponding labels (by default label_data.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features

        after this, the features and labels are put into numpy arrays, which play nice with sklearn functions

        4 objects are returned:
            -- training/testing features
            -- training/testing labels
    """
    # The words (features) and labels (positive or negative).
    # `with` closes the files; the original left both handles open.
    # NOTE(review): the files are still opened in text mode ("r") as before;
    # binary pickles would need "rb" -- confirm against the data files.
    with open(words_file, "r") as words_handle:
        word_data = pickle.load(words_handle)
    with open(labels_file, "r") as labels_handle:
        labels = pickle.load(labels_handle)

    # test_size is the fraction of samples assigned to the test set.
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        word_data, labels, test_size=0.1, random_state=42)

    # Text vectorization: strings -> tf-idf weights.
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, encoding='windows-1256')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    # Feature selection, fitted on the training split only.
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    # Info on the data.
    print("no. of positive training files: %s" % sum(labels_train))
    print("no. of negative training files: %s" % (len(labels_train) - sum(labels_train)))

    return features_train_transformed, features_test_transformed, labels_train, labels_test
def preprocess(words_file = "../data/data.pkl", authors_file="../data/datalabels.pkl"):
    """Load pickled texts and labels and return raw and selected tf-idf features.

    10% of the samples are held out as a test set (random_state=42), the texts
    are tf-idf vectorized (fitted on the training part) and the top 1% of
    features by ``f_classif`` are kept.

    Returns ``(raw training texts, features_train, features_test,
    labels_train, labels_test)``; the two feature matrices are dense arrays.
    """
    # `with` closes the handles even when unpickling raises.
    # NOTE(review): the files are still opened in text mode ("r") as before;
    # binary pickles would need "rb" -- confirm against the data files.
    with open(authors_file, "r") as authors_file_handler:
        authors = pickle.load(authors_file_handler)
    with open(words_file, "r") as words_file_handler:
        word_data = cPickle.load(words_file_handler)

    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        word_data, authors, test_size=0.1, random_state=42)
    # The untransformed training texts are returned alongside the matrices.
    features_train_vect = features_train

    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    selector = SelectPercentile(f_classif, percentile=1)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    return features_train_vect, features_train_transformed, features_test_transformed, labels_train, labels_test
def convertText(trainData, trainLabel, testData, testLabel, reduceDimensionality=0):
    '''
    trainData: training data
    trainLabel: training labels
    testData: test data
    testLabel: test labels
    reduceDimensionality: when truthy, keep only the features chosen by a
        SelectPercentile(f_classif) fitted on the training data
    return numerical arrays of data from text vectors, with the labels passed
    through unchanged: (train array, trainLabel, test array, testLabel)
    '''
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    trainDataTransformed = vectorizer.fit_transform(trainData).toarray()
    testDataTransformed = vectorizer.transform(testData).toarray()

    if reduceDimensionality:
        # NOTE(review): percentile is expressed in percent, so 0.10 keeps 0.1%
        # of the features -- confirm that 10 was not intended.
        selector = SelectPercentile(f_classif, percentile=0.10)
        selector.fit(trainDataTransformed, trainLabel)
        # The inputs are already dense arrays, so the selector returns dense
        # arrays as well; calling .toarray() on them raised AttributeError.
        trainDataTransformed = selector.transform(trainDataTransformed)
        testDataTransformed = selector.transform(testDataTransformed)

    return trainDataTransformed, trainLabel, testDataTransformed, testLabel
def eval(ds, clf, splitProportion=0.2, p=4):
    """Score ``clf`` on each label vector of ``ds`` after feature selection.

    ``ds.splitWithProportion(splitProportion)`` yields two parts; the first
    is used as test data and the second as training data. For every pair of
    (training labels, test labels) a ``SelectPercentile(chooser,
    percentile=p)`` selector is fitted on the training features, ``clf`` is
    fitted on the selected features and evaluated on the selected test
    features. Label vectors that are all zero in training are skipped.

    Returns three lists (f1 scores, precisions, recalls), one entry per
    evaluated label vector.

    NOTE: the name shadows the builtin ``eval``; it is kept because callers
    depend on it.
    """
    tstdata, trndata = ds.splitWithProportion(splitProportion)
    X, Y = labanUtil.fromDStoXY(trndata)
    X_test, Y_test = labanUtil.fromDStoXY(tstdata)

    f1s = []
    ps = []
    rs = []
    # presumably Y / Y_test hold one label vector per target -- verify
    for y, y_test in zip(Y, Y_test):
        if all(v == 0 for v in y):
            # nothing to learn from a label that never fires in training
            continue
        selector = SelectPercentile(chooser, percentile=p)
        selector.fit(X, y)
        clf.fit(selector.transform(X), y)
        pred = clf.predict(selector.transform(X_test))
        f1s.append(metrics.f1_score(y_test, pred))
        ps.append(metrics.precision_score(y_test, pred))
        rs.append(metrics.recall_score(y_test, pred))
    return f1s, ps, rs
class AnovaPercentileStep(SklearnStep):
    """Pipeline step that keeps the top ``percentile`` of features by f_classif."""

    def __init__(self, percentile):
        super(AnovaPercentileStep, self).__init__()
        self._percentile = percentile

    def fit_transform(self):
        """Fit the selector on the step's input file and save the reduced data."""
        # percentile is passed by keyword: scikit-learn >= 1.0 rejects it as
        # a positional argument.
        self._model = SelectPercentile(f_classif, percentile=self._percentile)
        # NOTE(review): this reads `self.input_path` while the other paths
        # used here are underscore-prefixed -- confirm the base class
        # exposes it under that name.
        x, y = load_svmlight(self.input_path)
        x = self._model.fit_transform(x, y)
        save_svmlight(x, y, self._output_path)

    def transform(self, x=None):
        """Apply the fitted selector.

        With ``x`` given, return the transformed ``x``. Without it, load the
        test input file, transform it and save it to the test output path
        (nothing is returned in that case). ``fit_transform`` must have run
        first, since it creates ``self._model``.
        """
        if x is not None:
            return self._model.transform(x)
        x, y = load_svmlight(self._test_input_path)
        x = self._model.transform(x)
        save_svmlight(x, y, self._test_output_path)

    def get_param(self):
        """Return the step's configuration as a dict."""
        return {'percentile': self._percentile}
def train(config, model_data, data, record):
    """Cross-validate and fit one model, writing the results into ``record``.

    ``model_data`` is a ``(model_class_name, percentile)`` pair. The model is
    wrapped in a pipeline of ``SelectPercentile(f_classif)`` ->
    ``StandardScaler`` -> model. ``record`` is mutated in place and receives
    the keys ``model``, ``parameters``, ``feats_percentile``, ``mean_acc``
    (5-fold accuracy on the training data) and ``test_preds`` (a list of
    ``(test id, prediction)`` pairs). Nothing is returned.
    """
    model_class_name, percentile = model_data
    model = instantiate_from_class_string(model_class_name)
    try:
        model.n_jobs = config['n_jobs']
    except Exception:
        # best effort only; narrowed from a bare `except:` so that
        # KeyboardInterrupt/SystemExit are no longer swallowed
        log.info('Cannot set n_jobs for this model...')

    record['model'] = model_name(model)
    record['parameters'] = model.get_params()
    record['feats_percentile'] = percentile

    train_x = data['train_x']
    train_y = data['train_y']
    test_x = data['test_x']

    # estimate accuracy using cross-validation
    pipeline = make_pipeline(SelectPercentile(f_classif, percentile=percentile),
                             StandardScaler(),
                             model)
    scores = cross_validation.cross_val_score(pipeline, train_x, train_y,
                                              cv=5, scoring='accuracy')
    record['mean_acc'] = scores.mean()

    # predict on the test set
    # The pipeline already performs selection and scaling. The original
    # applied both by hand and then fitted the *pipeline* on the result, so
    # each step ran twice (a percentile of a percentile of the features).
    pipeline.fit(train_x, train_y)
    preds = pipeline.predict(test_x)
    record['test_preds'] = list(zip(data['test_ids'], preds))
class FeatureSelection:
    """
    Feature selection.

    percentile: percentage of features to keep
    """

    def __init__(self, percentile=70):
        self.percentile = percentile

    def fit(self, x, y):
        # chi-squared scoring is used to rank the features
        chi_selector = SelectPercentile(score_func=chi2, percentile=self.percentile)
        self.sepChi = chi_selector
        chi_selector.fit(x, y)

    def transform(self, x, y):
        # y is passed through untouched so the pair can be consumed together
        reduced = self.sepChi.transform(x)
        return (reduced, y)
def preprocess_input(feature_test,words_file="/home/mohamed/python/sherlok-tools/helpers/word_data.pkl", labels_file="/home/mohamed/python/sherlok-tools/helpers/label_data.pkl"):
    """Fit tf-idf + feature selection on the pickled corpus and apply it to new texts.

    The whole pickled corpus is used for fitting (``test_size=0.0``);
    ``feature_test`` is the collection of texts to transform with the fitted
    vectorizer and selector.

    Returns (dense selected training features, dense selected features of
    ``feature_test``, training labels).
    """
    # Context managers guarantee the handles are closed, even if unpickling
    # raises; the original left both files open.
    # NOTE(review): pickle.load can execute arbitrary code -- only point this
    # at trusted files.
    with open(words_file, "r") as words_handle:
        word_data = pickle.load(words_handle)
    with open(labels_file, "r") as labels_handle:
        labels = pickle.load(labels_handle)

    ### test_size is the percentage of events assigned to the test set (remainder go into training)
    ### split training & testing
    # With test_size=0.0 nothing is held out; features_test/labels_test from
    # this call are not used below.
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, labels, test_size=0.0, random_state=42)

    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, encoding='windows-1256')
    features_train_transformed = vectorizer.fit_transform(features_train)
    # the caller-supplied texts, not the split above, are what gets transformed
    features_test_transformed = vectorizer.transform(feature_test)

    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print ("no. of positive training files:", sum(labels_train))
    print ("no. of negative training files:", len(labels_train)-sum(labels_train))

    return features_train_transformed, features_test_transformed, labels_train
def cross_val_score(clf, data, target, k):
    """Evaluate ``clf`` on ``k`` random train/test splits of a text corpus.

    Each round reshuffles the sample indices and holds out the last
    ``len(data) // k`` of them, so the rounds are independent random splits
    rather than disjoint folds. Per round the texts are passed through
    ``data_process``, turned into term frequencies (counts ->
    ``TfidfTransformer(use_idf=False)``), reduced to the top 10% of features
    by chi2, and ``clf`` is fitted and evaluated.

    Returns a list with one ``GetPrecisionRecallF1(predictions, truth)``
    result per round.
    """
    size = len(data)
    shuffle_arr = list(range(size))
    scores = []
    for _ in range(k):
        # separate shuffled train and test dataset
        random.shuffle(shuffle_arr)
        # Floor division keeps the slice bounds integral on Python 3 and
        # matches the Python 2 result of `size/k`.
        n_test = size // k
        shuffle_train = shuffle_arr[:size - n_test]
        shuffle_test = shuffle_arr[size - n_test:]

        # Index the `data` argument; the original read a global `data_total`
        # here while sizing the split from `data`.
        data_train_raw = [data[j] for j in shuffle_train]
        target_train = [target[j] for j in shuffle_train]
        data_test_raw = [data[r] for r in shuffle_test]
        target_test = [target[r] for r in shuffle_test]

        data_train = data_process(data_train_raw)
        data_test = data_process(data_test_raw)

        # transform array of string to counts
        count_vect = CountVectorizer()
        X_train_counts = count_vect.fit_transform(data_train)
        # transform counts to frequencies
        tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
        X_train_tf = tf_transformer.transform(X_train_counts)
        # feature selection
        select = SelectPercentile(chi2, percentile = 10)
        X_train_fs = select.fit_transform(X_train_tf, target_train)
        # train the model
        clf_train = clf.fit(X_train_fs, target_train)

        # test the model, reusing the transformers fitted on the training split
        X_new_counts = count_vect.transform(data_test)
        X_new_tfidf = tf_transformer.transform(X_new_counts)
        X_new_fs = select.transform(X_new_tfidf)
        test_result = clf_train.predict(X_new_fs)
        scores.append(GetPrecisionRecallF1(test_result, target_test))
    return scores
def get_combined_separate_fsets(feature_sets, fs_fn='pct', ptile=10, nFeatures=5, score_fn=f_classif):
    """Run feature selection on each feature set separately and join the results.

    ``feature_sets`` maps a set name to a DataFrame containing the columns
    ``partition`` and ``fatality_ind`` plus the features. For each set a
    selector (``SelectPercentile`` when ``fs_fn == 'pct'``, otherwise
    ``SelectKBest`` with ``k=nFeatures``) is fitted on the rows whose
    ``partition`` is ``'train'`` and applied to all rows. Surviving columns
    are renamed to ``"<set name> <column>"``.

    Returns the ``partition``/``fatality_ind`` columns of the last feature
    set iterated, joined with the selected columns of every set.
    """
    meta_cols = ['partition', 'fatality_ind']
    selected_frames = []
    for fset_name, df in feature_sets.items():
        train_rows = df[df.partition == 'train']
        X_train = train_rows.drop(meta_cols, axis=1)
        y_train = train_rows.fatality_ind
        df_X = df.drop(meta_cols, axis=1)

        if fs_fn == 'pct':
            featureSelector = SelectPercentile(score_func=score_fn, percentile=ptile)
        else:
            featureSelector = SelectKBest(score_func=score_fn, k=nFeatures)
        # fitted on the training partition only, applied to every row
        featureSelector.fit(X_train, y_train)
        reduced = featureSelector.transform(df_X)

        kept_idx = list(featureSelector.get_support(indices=True))
        prefixed = [fset_name + ' ' + col for col in df_X.columns[kept_idx]]
        selected_frames.append(pd.DataFrame(reduced, index=df_X.index, columns=prefixed))

    # `df` is whatever frame the loop saw last
    combined_features = pd.concat(selected_frames, axis=1)
    return df[meta_cols].join(combined_features)
features_train = pre_process(features_train) features_test = pre_process(features_test) vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english') features_train_transformed = vectorizer.fit_transform(features_train) features_test_transformed = vectorizer.transform(features_test) #print features_train_transformed #print features_train_transformed.shape selector = SelectPercentile(f_classif, percentile=10) selector.fit(features_train_transformed, labels_train) features_train_transformed = selector.transform(features_train_transformed).toarray() '''print '--------------------------------------------------------------------' print features_train_transformed print features_train_transformed.shape print labels_train.shape''' features_test_transformed = selector.transform(features_test_transformed).toarray() '''print features_test_transformed.shape print labels_test.shape print type(features_train_transformed) print type(labels_train)''' clf = GaussianNB() t0 = time() clf.fit(features_train_transformed, labels_train) print "training time:", round(time()-t0, 3), "s"
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Demonstration of univariate feature selection: 50 columns of pure noise
# are appended to the breast-cancer data and SelectPercentile is asked to
# keep half of the resulting columns.
cancer = load_breast_cancer()

# fixed seed so the noise columns are reproducible
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))
# original features come first, the noise columns after them
X_w_noise = np.hstack([cancer.data, noise])
X_train, X_test, y_train, y_test = train_test_split(
    X_w_noise, cancer.target, random_state=0, test_size=.5)

# the selector is fitted on the training half only
select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
X_train_selected = select.transform(X_train)

# boolean mask with one entry per input column (True = kept)
mask = select.get_support()
print(mask)
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
# the mask is indexed by feature, not by sample
plt.xlabel('Feature index')
plt.show()

X_test_selected = select.transform(X_test)

# compare a logistic regression on all columns vs. the selected ones
lr = LogisticRegression()
lr.fit(X_train, y_train)
print('Score with all features: {:.3f}'.format(lr.score(X_test, y_test)))
lr.fit(X_train_selected, y_train)
print('Score with selected features: {:.3f}'.format(lr.score(X_test_selected, y_test)))
features_cv_transformed=selector.transform(features_cv_transformed) selected_word_indices = selector.get_support(indices=True) vocab = vectorizer.get_feature_names() trimmed_vocab = [vocab[i] for i in selected_word_indices] print[trimmed_vocab] ''' from sklearn.feature_selection import SelectPercentile, f_classif from sklearn.feature_extraction.text import TfidfVectorizer vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english") features_transformed = vectorizer.fit_transform(word_data) features_transformed = features_transformed.toarray() selector = SelectPercentile(f_classif, percentile=20) selector.fit(features_transformed, sentiid) features_transformed = selector.transform(features_transformed) ################################################################################################################################################## from sklearn.svm import SVC from sklearn.datasets import load_digits from sklearn.model_selection import learning_curve from sklearn.model_selection import ShuffleSplit def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
tokenizer=stem_comment, max_features=5000) dtm = vect.fit_transform(Xtrain) words = vect.get_feature_names() print('dtm matrix') from sklearn.feature_selection import SelectPercentile, mutual_info_classif selector = SelectPercentile(mutual_info_classif, percentile=20) dtm_reduced = selector.fit_transform(dtm, ytrain) selector_scores = selector.scores_ print('selected') dtm_test = vect.transform(Xtest) dtm_selected = selector.transform(dtm_test) from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import classification_report from sklearn.metrics import roc_curve, auc, confusion_matrix from sklearn.preprocessing import label_binarize model_random_forest = RandomForestClassifier() model_random_forest.fit(dtm_reduced, ytrain) prob_pred = model_random_forest.predict_proba(dtm_selected) pred = model_random_forest.predict(dtm_selected) cm_random_forest = confusion_matrix(ytest, pred) cr_random_forest = classification_report(ytest, pred) #%%
train = data[:train.shape[0]] test = data[train.shape[0]:] train_y = train['click'] cntv = CountVectorizer() cntv.fit(train['user_tags']) train_a = cntv.transform(train['user_tags']) test_a = cntv.transform(test['user_tags']) train_new = sparse.hstack( (train_new, train_a), 'csr' ) #hstack : 将矩阵按照列进行拼接,对应的列数必须相等,hstack(blocks, format=None, dtype=None) test_new = sparse.hstack((test_new, test_a), 'csr') SKB = SelectPercentile(chi2, percentile=95).fit( train_new, train_y) #区别:SelectKBest选择排名排在前n个的变量 SelectPercentile 选择排名排在前n%的变量 train_new = SKB.transform(train_new) test_new = SKB.transform(test_new) ''' 在稀疏矩阵存储格式中: # - COO 格式在构建矩阵时比较高效 # - CSC 和 CSR 格式在乘法计算时比较高效 A.todense() # 可以转化为普通矩阵: ''' #%% #统计特征构造 #adid统计特征,不同种类数量(已经创建了记录数的统计,现在是一个特征对应另外一个特征的种类) ## 由于adid是次样本层级的粒度,是聚集到点击率的层面所以是重要的特征,adid基本与广告信息表一一对应,我们象征性的选择广告id与其他挑选出来的id进行特征nunique统计 adid_nuq = [ 'model', 'make', 'os', 'city', 'province', 'user_tags', 'app_id', 'carrier', 'nnt', 'devtype', 'app_cate_id', 'inner_slot_id'
def train_classifier_use_feature_selection(self):
    """Train a multinomial naive Bayes text classifier with feature selection.

    Pipeline: CountVectorizer -> TfidfTransformer -> SelectPercentile ->
    MultinomialNB, all configured from module-level settings and fitted on
    ``docs_train``/``y_train``. The boolean support mask of the selector is
    written, one value per line, to
    ``path_to_store_feature_selection_boolean_file``. 10-fold
    cross-validation scores and test-set metrics are printed.

    Returns ``(clf, count_vect)``: the fitted classifier and the fitted
    count vectorizer.
    """
    # Get list of features
    count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90, ngram_range=_ngram_range)
    X_CV = count_vect.fit_transform(docs_train)

    # print number of unique words (n_features)
    print ("Shape of train data is "+str(X_CV.shape))

    # tfidf transformation
    tfidf_transformer = TfidfTransformer(use_idf=_use_idf)
    X_tfidf = tfidf_transformer.fit_transform(X_CV)

    #################
    # feature selection
    #################
    selector = SelectPercentile(score_func=_score_func, percentile=_percentile)
    print ("Fitting data with feature selection ...")
    selector.fit(X_tfidf, y_train)

    # get how many features are left after feature selection
    X_features = selector.transform(X_tfidf)
    print ("Shape of array after feature selection is "+str(X_features.shape))

    clf = MultinomialNB(alpha=_alpha).fit(X_features, y_train)

    # get the features which are selected and write to file
    feature_boolean = selector.get_support(indices=False)
    # `with` closes the file even if a write fails (the original relied on
    # reaching an explicit close()).
    with open(path_to_store_feature_selection_boolean_file, 'w') as f:
        for fb in feature_boolean:
            f.write(str(fb)+'\n')

    ##################
    # get cross validation score
    ##################
    # NOTE(review): the selector above was fitted on all of y_train, so these
    # folds have already seen the held-out labels during selection.
    scores = cross_val_score(clf, X_features, y_train, cv=10, scoring='f1_weighted')
    print ("Cross validation score: "+str(scores))

    # Average performance on training data using 10-fold CV, along with the
    # spread. The values are f1_weighted scores even though the message
    # below says "accuracy".
    print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    ####################
    # test clf on test data
    ####################
    X_test_CV = count_vect.transform(docs_test)
    print ("Shape of test data is "+str(X_test_CV.shape))
    X_test_tfidf = tfidf_transformer.transform(X_test_CV)

    # apply feature selection on test data too
    X_test_selector = selector.transform(X_test_tfidf)
    print ("Shape of array for test data after feature selection is "+str(X_test_selector.shape))

    y_predicted = clf.predict(X_test_selector)

    # print the mean accuracy on the given test data and labels
    print ("Classifier score on test data is: %0.2f " % clf.score(X_test_selector,y_test))

    print(metrics.classification_report(y_test, y_predicted))
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)

    return clf, count_vect
def benchmark(X_train, X_test, y5_train, y5_test, y3_train, y3_test, y2_train,
              y2_test, exp_folder, ds_folder, perc_f):
    """Run the regression and classification experiments for one percentile.

    Features are standardised (scaler fitted on ``X_train``) and then reduced
    to the top ``perc_f`` percent separately per target: ``f_regression``
    for the 5-class target, ``f_classif`` for the 3-class and 2-class
    targets. Every estimator in ``CONFIGS_REGRESSION`` /
    ``CONFIGS_CLASSIFICATION`` is passed to
    ``train_test_export_save_per_exp_type``; results are logged to
    ``<path>/<experiment>/log/results.txt`` and a bar chart is exported for
    each classification experiment.

    Any exception is logged through ``config.logger`` and re-raised.
    """
    config.logger.info('benchmark_text: ' + str(perc_f))

    try:
        subfolder = 'all/best_k/' + str(perc_f) + '/'
        path = OUTPUT_FOLDER + exp_folder + ds_folder + 'benchmark/' + subfolder

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        config.logger.debug('OK. feature selection')

        # feature selection
        # percentile is passed by keyword: scikit-learn >= 1.0 rejects it as
        # a positional argument.
        best5 = SelectPercentile(f_regression, percentile=perc_f)
        best3 = SelectPercentile(f_classif, percentile=perc_f)
        best2 = SelectPercentile(f_classif, percentile=perc_f)

        X_train_best5 = best5.fit_transform(X_train, y5_train)
        X_test_best5 = best5.transform(X_test)

        X_train_best3 = best3.fit_transform(X_train, y3_train)
        X_test_best3 = best3.transform(X_test)

        X_train_best2 = best2.fit_transform(X_train, y2_train)
        X_test_best2 = best2.transform(X_test)

        # Collected per estimator; a VotingClassifier ensemble built from
        # this list used to follow (disabled upstream), so it is currently
        # only filled, not read.
        best_estimators = []
        x_axis_2 = []
        x_axis_3 = []
        y_axis_2 = []
        y_axis_3 = []

        title = 'Webpage Text Features'
        x_axis_label = 'Classifiers'
        y_axis_label = 'F1-measure'

        # --------------------------------------------------------------------------------------------------------------
        # regression experiment
        # --------------------------------------------------------------------------------------------------------------
        config.logger.info('starting experiments regression (5-classes)')
        with open(path + EXP_5_CLASSES_LABEL + '/log/results.txt',
                  "w") as file_log_regression:
            file_log_regression.write(HEADER_CLASSIFICATION)
            for estimator, hyperparam, grid_method in CONFIGS_REGRESSION:
                out = []
                label = estimator.__class__.__name__ + '.' + str(
                    perc_f) + '.' + EXP_5_CLASSES_LABEL
                out, best_estimator = train_test_export_save_per_exp_type(
                    estimator, label, hyperparam, grid_method, X_train_best5,
                    X_test_best5, y5_train, y5_test, EXP_5_CLASSES_LABEL, 0,
                    out, file_log_regression, subfolder, exp_folder,
                    ds_folder)
                file_log_regression.flush()

        # --------------------------------------------------------------------------------------------------------------
        # classification experiment
        # --------------------------------------------------------------------------------------------------------------
        config.logger.info(
            'starting experiments classification (2-classes and 3-classes)')
        for exp_type in (EXP_2_CLASSES_LABEL, EXP_3_CLASSES_LABEL):
            with open(path + exp_type + '/log/results.txt',
                      "w") as file_log_classification:
                file_log_classification.write(HEADER_CLASSIFICATION)
                # pick the data, accumulators and chart settings of this
                # experiment
                if exp_type == EXP_2_CLASSES_LABEL:
                    _X_train = X_train_best2
                    _X_test = X_test_best2
                    _y_train = y2_train
                    _y_test = y2_test
                    y_axis = y_axis_2
                    x_axis = x_axis_2
                    graph_file = 'graph.' + str(perc_f) + '.2-class.png'
                    threshold = THRESHOLD_LABEL_2class
                elif exp_type == EXP_3_CLASSES_LABEL:
                    _X_train = X_train_best3
                    _X_test = X_test_best3
                    _y_train = y3_train
                    _y_test = y3_test
                    y_axis = y_axis_3
                    x_axis = x_axis_3
                    graph_file = 'graph.' + str(perc_f) + '.3-class.png'
                    threshold = THRESHOLD_LABEL_3class
                else:
                    raise Exception('blah! error')

                for estimator, hyperparam, grid_method in CONFIGS_CLASSIFICATION:
                    out = []
                    label = estimator.__class__.__name__ + '.' + str(
                        perc_f) + '.' + exp_type
                    out, best_estimator = train_test_export_save_per_exp_type(
                        estimator, label, hyperparam, grid_method, _X_train,
                        _X_test, _y_train, _y_test, exp_type, 0, out,
                        file_log_classification, subfolder, exp_folder,
                        ds_folder)
                    best_estimators.append(
                        (estimator.__class__.__name__, best_estimator))
                    # column 2 of each `out` row feeds the chart's y axis
                    y_axis.extend(np.array(out)[:, 2])
                    x_axis.append(
                        best_estimator.__class__.__name__.replace(
                            'Classifier', ''))

                config.logger.info(
                    'experiments classification done! exporting charts...')
                export_chart_bar(x_axis, y_axis, graph_file, ds_folder,
                                 exp_folder, perc_f, exp_type, title,
                                 x_axis_label, y_axis_label, threshold)
                config.logger.info('charts exported!')
                file_log_classification.flush()

    except Exception as e:
        config.logger.error(repr(e))
        raise
# Script fragment: load targets and device ids, align the index table with
# the training devices, reduce the feature matrices with chi2 selection and
# one-hot encode the targets for the model defined below.
# NOTE(review): the two handles passed to pickle.load are never closed
# explicitly, and pickle.load must only be used on trusted files.
target = pickle.load(open("../generated/group.p", "r"))
device_id = pickle.load(open("../generated/device_id.p", "r"))
trainDevices = pd.read_csv("../../../data/gender_age_train.csv", usecols=["device_id"])
indexes = pd.read_csv("../generated/raddarIndices.csv")
# left join keeps one row per training device
# NOTE(review): `on=` is combined with `left_index=True`; confirm the pandas
# version in use accepts that combination.
indexes = pd.merge(trainDevices, indexes, how="left", on="device_id", left_index=True).reset_index().drop(["index"], axis=1)
######################
#  Feature Selection
######################
# `train` and `test` are defined outside this fragment; the selector is
# fitted on the training matrix and applied to both.
fs = SelectPercentile(chi2, percentile=23).fit(train, target)
train = fs.transform(train)
test = fs.transform(test)
##################
#  Pre Processing
##################
# labels -> integer codes -> one-hot matrix
targetEncoder = LabelEncoder()
target = targetEncoder.fit_transform(target)
target = np_utils.to_categorical(target)
##################
#  Build Model
##################
# NOTE(review): only the first statement of this function is visible in the
# fragment.
def modelBuilder():
    model = Sequential()
def main():
    """Compare multinomial naive Bayes variants on a text corpus.

    ``sys.argv[1]`` is the data directory; it must contain ``Training`` and
    ``Test`` sub-directories readable by ``load_files_correctly``. Twelve
    configurations are evaluated (count vs. tf-idf unigrams, with/without
    stemming and stop-word filtering, with/without feature selection) and a
    table of macro-averaged precision, recall, their ratio and F1 is
    printed, followed by the name of the model designated as best (the last
    configuration).
    """
    data_loc = sys.argv[1]
    stemmer = PorterStemmer()

    def load_corpus(stem, stop):
        """Return the (train, test) bunches for one preprocessing combination."""
        active_stemmer = stemmer if stem else None
        train = load_files_correctly(os.path.join(data_loc, 'Training'),
                                     stemmer=active_stemmer, stop=stop)
        test = load_files_correctly(os.path.join(data_loc, 'Test'),
                                    stemmer=active_stemmer, stop=stop)
        return train, test

    def vectorize(vectorizer_cls, corpus):
        """Fit a unigram vectorizer on the train texts; return (X_train, X_test)."""
        train, test = corpus
        vectorizer = vectorizer_cls(ngram_range=(1, 1), decode_error='ignore')
        return vectorizer.fit_transform(train.data), vectorizer.transform(test.data)

    # Corpora keyed by (stemmed, stop-word filtered).
    corpora = {
        (True, True): load_corpus(True, True),
        (False, True): load_corpus(False, True),
        (True, False): load_corpus(True, False),
        (False, False): load_corpus(False, False),
    }

    # Vectorize data. Only the combinations used by an experiment are built;
    # the original also vectorized tf-idf/stemmed/unfiltered but never used it.
    vectorizers = {'count': CountVectorizer, 'tfidf': TfidfVectorizer}
    features = {}
    for kind, key in (('tfidf', (True, True)), ('tfidf', (False, True)),
                      ('tfidf', (False, False)), ('count', (True, True)),
                      ('count', (True, False)), ('count', (False, True)),
                      ('count', (False, False))):
        features[kind, key] = vectorize(vectorizers[kind], corpora[key])

    # (name, vectorizer kind, (stemmed, stop-filtered), fit_prior, selection)
    # selection is None, 'percentile' (chi2, top 80%) or 'model'
    # (SelectFromModel on a prefit naive Bayes, threshold=30).
    # fit_prior=True is the MultinomialNB default, i.e. what the original
    # got by not passing the argument.
    experiments = [
        ('Naive Bayes alpha=0.01; unigram; countvectorizer; no stemmer; no stopper; no feature selection',
         'count', (False, False), True, None),
        ('Naive Bayes alpha=0.01; unigram; tfidfvectorizer; no stemmer; no stopper; no feature selection',
         'tfidf', (False, False), True, None),
        ('Naive Bayes alpha=0.01 fit_prior=False; unigram; tfidfvectorizer; no stemmer; no stopper; no feature selection',
         'tfidf', (False, False), False, None),
        ('Naive Bayes alpha=0.01; unigram; countvectorizer; stemmer; no stopper; no feature selection',
         'count', (True, False), True, None),
        ('Naive Bayes alpha=0.01; unigram; countvectorizer; no stemmer; stopper; no feature selection',
         'count', (False, True), True, None),
        ('Naive Bayes alpha=0.01; unigram; countvectorizer; stemmer; stopper; no feature selection',
         'count', (True, True), True, None),
        ('Naive Bayes alpha=0.01 fit_prior=False; unigram; tfidfvectorizer; no stemmer; stopper; no feature selection',
         'tfidf', (False, True), False, None),
        ('Naive Bayes alpha=0.01 fit_prior=False; unigram; tfidfvectorizer; stemmer; stopper; no feature selection',
         'tfidf', (True, True), False, None),
        ('Naive Bayes alpha=0.01; unigram; countvectorizer; no stemmer; stopper; feature selection - SelectPercentile=80',
         'count', (False, True), True, 'percentile'),
        ('Naive Bayes alpha=0.01 fit_prior=False; unigram; tfifdfvectorizer; stemmer; stopper; feature selection - SelectPercentile=80',
         'tfidf', (True, True), False, 'percentile'),
        # the label used to read threshold=39 although 30 is what is applied
        ('Naive Bayes alpha=0.01; unigram; countvectorizer; no stemmer; stopper; feature selection - SelectFromModel threshold=30',
         'count', (False, True), True, 'model'),
        ('Naive Bayes alpha=0.01 fit_prior=False; unigram; tfidfvectorizer; stemmer; stopper; feature selection - SelectFromModel threshold=30',
         'tfidf', (True, True), False, 'model'),
    ]

    res = []
    for name, kind, key, fit_prior, selection in experiments:
        train, test = corpora[key]
        X_train, X_test = features[kind, key]

        if selection == 'percentile':
            ch2 = SelectPercentile(chi2, percentile=80)
            X_train = ch2.fit_transform(X_train, train.target)
            X_test = ch2.transform(X_test)
        elif selection == 'model':
            # a first naive Bayes fit supplies the weights used for selection
            prefit = MultinomialNB(alpha=0.01, fit_prior=fit_prior)
            prefit.fit(X_train, train.target)
            model = SelectFromModel(prefit, threshold=30, prefit=True)
            X_train = model.transform(X_train)
            X_test = model.transform(X_test)

        clf = MultinomialNB(alpha=0.01, fit_prior=fit_prior)
        clf.fit(X_train, train.target)
        pred = clf.predict(X_test)
        res.append({
            'name': name,
            'clf': clf,
            'pred': pred,
            'target': test.target
        })

    # This is the best model (by convention the last configuration)
    best_model = res[-1]['name']

    rows = []
    headers = [
        'No.', 'Name', 'Precision', 'Recall', 'Precision/Recall', 'F1 Score'
    ]
    for i, val in enumerate(res):
        precision = metrics.precision_score(val['target'],
                                            val['pred'],
                                            average='macro')
        recall = metrics.recall_score(val['target'],
                                      val['pred'],
                                      average='macro')
        f1_score = metrics.f1_score(val['target'],
                                    val['pred'],
                                    average='macro')
        # avoid dividing by a zero recall
        ratio = precision / recall if recall else float('nan')
        rows.append([i + 1, val['name'], precision, recall, ratio, f1_score])

    # print(...) with a single argument produces the same output on
    # Python 2 and Python 3; the original mixed both print forms.
    print('\nResults - ')
    print(tabulate(rows, headers, tablefmt='orgtbl'))
    print('\n')
    print('Best model is -> %s' % best_model)
def feature_select(self):
    """Keep the top 95% of features by chi2 score.

    The selector is fitted on ``self.train_x`` / ``self.train_y`` and then
    applied to both ``self.train_x`` and ``self.test_x``.

    Returns ``(selected training matrix, selected test matrix)``.
    """
    selector = SelectPercentile(chi2, percentile=95)
    selector.fit(self.train_x, self.train_y)
    # the tuple is evaluated left to right: train first, then test
    return selector.transform(self.train_x), selector.transform(self.test_x)
def process(self, df):
    """Build chi2-selected character tf-idf features for train and test rows.

    ``df`` needs the columns ``type`` ('train' / 'test'), ``text`` and
    ``label``. A character 1-2-gram tf-idf vectorizer is fitted on all
    texts; the top 5% of features by chi2 (scored on the first ``n_train``
    rows) is kept. The ranked feature list and the pickled train/test
    matrices are written under ``config.output_dir``.

    Returns ``(train features, test features, train labels)`` as dense
    arrays; the test features are ``None`` when ``df`` has no test rows.

    NOTE(review): rows are split positionally (``[:n_train]`` /
    ``[n_train:]``), which assumes all 'train' rows precede the others in
    ``df`` -- confirm with the caller.
    """
    print('process:', self.name)

    # 1). split the frame by row type
    train = df[df['type'] == 'train']
    print(train.head())
    test = df[df['type'] == 'test']
    print(test.head())
    print('train.shape:', train.shape)
    n_train = train.shape[0]
    print('n_train:', n_train)
    n_test = test.shape[0]
    print('n_test:', n_test)

    # 2). fit a TfidfVectorizer on text (train and test rows together)
    vec_text = TfidfVectorizer(analyzer='char', ngram_range=(1, 2),
                               max_df=0.8, min_df=2, sublinear_tf=True)
    text_tfidf = vec_text.fit_transform(df['text'].tolist())
    print('text Tfidf.shape:', text_tfidf.shape)
    vocabulary = vec_text.vocabulary_
    print('vocabulary size:%d' % len(vocabulary))
    print('vocabulary list:')
    # show at most ten entries as a sample
    for shown, (k, v) in enumerate(vocabulary.items()):
        if shown >= 10:
            break
        print("%s\t%s" % (k, v))
    print("feature set nums: ", len(vocabulary))

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when the installed version provides it.
    get_names = getattr(vec_text, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vec_text.get_feature_names
    feature_names = get_names()

    ch2_precent = SelectPercentile(chi2, percentile=5)
    # fit() returns the selector itself, so ch2 and ch2_precent are one object
    ch2 = ch2_precent.fit(text_tfidf[:n_train], df.iloc[:n_train]['label'])
    text_tfidf = ch2_precent.transform(text_tfidf)

    kept = ch2.get_support(indices=True)
    features = [feature_names[i] for i in kept]
    feature_scores = [ch2.scores_[i] for i in kept]
    sorted_feature = sorted(zip(features, feature_scores),
                            key=lambda x: x[1], reverse=True)
    feature_output_file = config.output_dir + 'char_tfidf_feature.txt'
    with open(feature_output_file, "w", encoding="utf-8") as f:
        # one line per feature: rank, feature, score
        for rank, item in enumerate(sorted_feature, 1):
            f.write("\t".join([str(rank), item[0], str(item[1])]) + "\n")
    print("feature select done, new feature set num: ", len(feature_scores))

    # save train and test into separate files
    tfidf_train = text_tfidf[:n_train, :]
    tfidf_train_feature_path = config.output_dir + "train.text.char.tfidf.pkl"
    with open(tfidf_train_feature_path, "wb") as f:
        pickle.dump(tfidf_train, f)
    print('text tfidf features of training set saved in %s' % tfidf_train_feature_path)

    tfidf_test = None
    if n_test > 0:
        # test set is available
        tfidf_test = text_tfidf[n_train:, :]
        tfidf_test_feature_path = config.output_dir + "test.text.char.tfidf.pkl"
        with open(tfidf_test_feature_path, "wb") as f:
            pickle.dump(tfidf_test, f)
        print('text tfidf features of test set saved in %s' % tfidf_test_feature_path)

    # Without test rows there is nothing to densify; the original called
    # .toarray() on None here and crashed.
    dense_test = tfidf_test.toarray() if tfidf_test is not None else None
    return tfidf_train.toarray(), dense_test, train['label'].values
] data_frame = data_frame.fillna(0) # Store to my_dataset for easy export below. my_dataset = data_frame.to_dict('index') print("") print("New data_dict:\n", my_dataset) # Extract features and labels from dataset for local testing data = featureFormat(my_dataset, features_list, sort_keys=True) labels, features = targetFeatureSplit(data) from sklearn.feature_selection import SelectPercentile, f_classif selector = SelectPercentile(f_classif, percentile=50) selector.fit(features, labels) features_train = selector.transform(features) features_test = selector.transform(features) SelectPercentile_features = zip(selector.get_support(), features_list[1:], selector.scores_) SelectPercentile_features = sorted(SelectPercentile_features, key=lambda x: x[2], reverse=True) print ("(Features marked with 'True' are used in the final algorithm.):") for feature in SelectPercentile_features: print(feature) # Task 4: Try a variety of classifiers # Provided to give you a starting point. Try a variety of classifiers. # from sklearn.naive_bayes import GaussianNB # from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import GradientBoostingClassifier # Task 5: Tune your classifier to achieve better than .3 precision and recall using our testing script. clf = GradientBoostingClassifier(init=None,
def lightgbm_make_submission():
    """Train a LightGBM regressor and write test-set predictions to CSV.

    Steps: load the train/test frames with ``read_csv()``, build feature and
    target frames with ``make_train_set()``, keep the top 50 percent of
    features by ``f_regression`` score (selector fitted on the training data
    only), hold out 20 percent of the training rows for validation, train for
    300 boosting rounds, then predict on the test features and write the
    result to ``path_test_out``.

    Returns:
        None.  The predictions are written to ``path_test_out`` with columns
        ``Id`` (index) and ``Pred``.
    """
    train, test = read_csv()
    x_train, y_train = make_train_set(train)
    x_test, y_test = make_train_set(test)
    y_train = y_train['Y']

    # feature selection
    # `percentile` is keyword-only in scikit-learn >= 1.0; passing it
    # positionally raises TypeError there.
    sel = SelectPercentile(f_regression, percentile=50)
    x_train = sel.fit_transform(x_train, y_train)
    x_test = sel.transform(x_test)

    train_x, valid_x, train_y, valid_y = train_test_split(
        x_train, y_train, test_size=0.2, random_state=0)

    params = {
        # 'boosting': 'dart',
        'learning_rate': 0.01,
        'application': 'regression',
        'max_depth': -1,
        'num_leaves': 5,
        'verbosity': -1,
        'feature_fraction': 0.8,
        'feature_fraction_seed': 9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'bagging_seed': 9,
        'min_data_in_leaf': 6,
        'min_sum_hessian_in_leaf': 11,
        'metric': 'mae',
    }

    d_train = lgb.Dataset(train_x, label=train_y)
    d_valid = lgb.Dataset(valid_x, label=valid_y)
    watchlist = [d_train, d_valid]

    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=300,
                      valid_sets=watchlist,
                      verbose_eval=20)

    print(
        "*******************************start predict***************************"
    )
    preds = model.predict(x_test)
    # The test-side target frame is reused as the output frame: its 'Y'
    # column is overwritten with the predictions.
    y_test['Y'] = preds
    print(y_test['Y'].var())
    y_test.columns = ['TERMINALNO', 'Pred']
    y_test.set_index('TERMINALNO', inplace=True)
    y_test.to_csv(path_test_out,
                  columns=['Pred'],
                  index=True,
                  index_label=['Id'])
print(df.dtypes) #datatype of the columns ##instantiate the predictor variables & the target variable (with known classes encoded in numbers) dataset = df.values #returns a numpy array format of the dataset x = dataset[:, 0:7] #each column from 1 - 7 contains the predictor variables (x) y = dataset[:, 7] #the last column 8 contains the target variable which is the "class" of the localisation site (y) #(Optional) feature engineering: automatic feature selection to reduce dimensionality #(I)Univeriate statistics method by SelectPercentile select = SelectPercentile( percentile=50 ) #(B) this automatically selected half of the features: lip, alm1, alm2 select.fit(x, y) x_selected_bypercent = select.transform(x) x_chosen1 = select.get_support( ) #this shows the 3 features selected from the 7 possible print(x_chosen1) print(x_selected_bypercent ) #the 3 automatically selected features to be used for the train data #(II)Recursive Feature Elimination - to keep the most important features, important for some Machine learning algorithms. print( df.corr(method='pearson') ) #other options: 'kendall', 'spearman'. highest correlation in descending order: mcg, gvh, alm1, lip, aac, chg, alm2. estimator = SVR( kernel="linear" ) #Choose the model it is appropriate for: ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’. select = RFE(estimator, n_features_to_select=3,
features_train = vectorizer.fit_transform(features_train) features_test = vectorizer.transform(features_test) feature_names = vectorizer.get_feature_names() print(features_train.shape) print(features_test.shape) #Feature selection selector = SelectPercentile(f_classif, percentile=10) selector.fit(features_train, labels_train) features_train_transformed = selector.transform(features_train) features_test_transformed = selector.transform(features_test) features_train_transformed.shape print("No of features after selection :", features_train_transformed.shape[1]) #Using MultinomialNB clf = MultinomialNB() grid_param = {'alpha': [0.001, 0.01, 0.1, 0.5, 1, 10]} grid_search = GridSearchCV(estimator=clf, param_grid=grid_param, cv=5,
optimum_complexity = complexities[np.argmax(uar_scores)] print('\nOptimum complexity: {0:.6f}, maximum UAR on Devel {1:.1f}\n'.format( optimum_complexity, np.max(uar_scores) * 100)) if not feat_selection: clf = svm.LinearSVC(C=optimum_complexity, random_state=0) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) else: uar = [] percentile = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] for p in percentile: selection = SelectPercentile(f_classif, percentile=p) feat_selected = selection.fit_transform(X_train, y_train) feat_devel = selection.transform(X_devel) print('\nComplexity {0:.6f}'.format(optimum_complexity)) #clf = svm.LinearSVC(C=optimum_complexity, random_state=0) clf = svm.SVC(C=optimum_complexity, kernel='linear', random_state=0) clf.fit(feat_selected, y_train) y_pred = clf.predict(feat_devel) uar.append( recall_score(y_devel, y_pred, labels=classes, average='macro')) print('UAR on Devel {0:.1f}'.format(uar[-1] * 100)) if show_confusion: print('Confusion matrix (Devel):') print(classes) print(confusion_matrix(y_devel, y_pred, labels=classes)) optimum_percentile = percentile[np.argmax(uar)] print( '\nOptimum percentile: {0:.6f}, maximum UAR on Devel {1:.1f}\n'.format(
def col_filter(mtx_train, y_train, mtx_test, func = chi2, percentile = 90):
    """Reduce the columns of a train/test matrix pair with SelectPercentile.

    The selector is fitted on ``mtx_train`` / ``y_train`` only and the same
    column subset is then applied to ``mtx_test``.

    Args:
        mtx_train: training feature matrix.
        y_train: training targets used for scoring the columns.
        mtx_test: test feature matrix with the same columns as ``mtx_train``.
        func: univariate scoring function handed to ``SelectPercentile``.
        percentile: percentage of columns to keep.

    Returns:
        Tuple ``(reduced_train, reduced_test)``.
    """
    selector = SelectPercentile(func, percentile = percentile)
    reduced_train = selector.fit(mtx_train, y_train).transform(mtx_train)
    reduced_test = selector.transform(mtx_test)
    return reduced_train, reduced_test
"_train.npy") X_test = load_numpy_matrix(feature_set_path + "Google_TfidfFeatures" + tag + "_test.npy") print "\nFeatures", FEATURES[featureV] print '\nTotal:', X_train.shape[0] + X_test.shape[0] print 'Features:', X_train.shape[1] print "\nClass distribution", Counter(y_train) print "\nClass distribution", Counter(y_test) # FEATURE SELECT if featureV == 0: selector = SelectPercentile(score_func=f_classif, percentile=perc).fit(X_train, y_train) X_train = selector.transform(X_train) X_test = selector.transform(X_test) elif featureV < 2: selector = SelectKBest( score_func=chi2, k=min(200000, int(X_train.shape[1] * (perc / 100.0)))).fit( X_train, y_train) X_train = selector.transform(X_train) X_test = selector.transform(X_test) print X_train.shape print X_test.shape # FEATURE SCALING if featureV == 0: scaler = preprocessing.StandardScaler().fit(X_train)
'csr', 'bool') print('cv prepared !') sparse.save_npz(path + '/feature/base_train_csr.npz', base_train_csr) sparse.save_npz(path + '/feature/base_predict_csr.npz', base_predict_csr) train_csr = sparse.hstack( (sparse.csr_matrix(train_x[num_feature]), base_train_csr), 'csr').astype('float32') predict_csr = sparse.hstack( (sparse.csr_matrix(predict_x[num_feature]), base_predict_csr), 'csr').astype('float32') print(train_csr.shape) feature_select = SelectPercentile(chi2, percentile=50) feature_select.fit(train_csr, train_y) train_csr = feature_select.transform(train_csr) predict_csr = feature_select.transform(predict_csr) print('feature select') print(train_csr.shape) n = 1500 data_col = pd.read_csv('col_sort_one11.csv', header=None) col = data_col[0].values.copy() lgb_model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=60, max_depth=-1, learning_rate=0.1, n_estimators=n, max_bin=425, subsample_for_bin=50000,
def cross(train_path, test_path, select_feat):
    """Run an XGBoost experiment on pickled feature matrices.

    Labels come from the 'group' column of ``./data/train_no_events.csv``
    (label-encoded).  Features are unpickled from ``train_path`` and
    ``test_path``; the training matrix is densified with ``toarray()``.

    Args:
        train_path: path of the pickled training feature matrix.
        test_path: path of the pickled test feature matrix.
        select_feat: when equal to the string ``"1"``, train a gbtree model on
            a 90/10 train/validation split with early stopping; otherwise run
            5-fold ``xgb.cv`` with the gblinear parameters and print the
            result.

    Returns:
        None.

    NOTE: ``pickle.load`` executes arbitrary code from the file; only pass
    paths to trusted, locally produced pickles.
    """
    print('Read train and test')
    # `np.str` was only an alias of the builtin and has been removed from
    # NumPy, so the builtin is used directly.
    train = pd.read_csv("./data/train_no_events.csv", dtype={'device_id': str})
    train.drop(['age', 'gender'], axis=1, inplace=True)
    train_label = train["group"]
    lable_group = LabelEncoder()
    train_label = lable_group.fit_transform(train_label)
    test = pd.read_csv("./data/test_no_events.csv", dtype={'device_id': str})
    test["group"] = np.nan

    # Context managers close the files even if unpickling fails.
    with open(train_path, 'rb') as trf:
        train_sp = pickle.load(trf)
    with open(test_path, 'rb') as ttf:
        test_sp = pickle.load(ttf)
    train_sp = train_sp.toarray()

    if select_feat == "1":
        X_train, X_val, y_train, y_val = train_test_split(
            train_sp, train_label, train_size=.90, random_state=10)
        print(X_train.shape)
        print(train.shape)
        print(X_val.shape)

        print("# Feature Selection")
        selector = SelectPercentile(f_classif, percentile=100)
        selector.fit(X_train, y_train)
        X_train = selector.transform(X_train)
        X_val = selector.transform(X_val)
        print(X_train.shape)
        print(X_val.shape)
        train_sp = selector.transform(train_sp)
        test_sp = selector.transform(test_sp)

        dtrain = xgb.DMatrix(X_train, y_train)
        dvalid = xgb.DMatrix(X_val, y_val)

        # Alternative (linear booster) configuration; not passed to xgb.train.
        params = {
            "objective": "multi:softprob",
            "num_class": 12,
            "booster": "gblinear",
            "eval_metric": "mlogloss",
            "eta": 0.05,
            "silent": 1,
            "lambda": 3,
            "alpha": 2,
        }
        # Tree booster configuration actually used below.
        params2 = {
            "objective": "multi:softprob",
            "num_class": 12,
            "booster": "gbtree",
            "eval_metric": "mlogloss",
            "eta": 0.05,
            "max_depth": 6,
            "subsample": 0.7,
            "colsample_bytree": 0.7,
            "num_parallel_tree": 1,
            "seed": 114,
            "silent": 1,
        }
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        gbm = xgb.train(params2, dtrain, 1000, evals=watchlist,
                        early_stopping_rounds=10, verbose_eval=True)
    else:
        selector = SelectPercentile(f_classif, percentile=100)
        selector.fit(train_sp, train_label)
        train_sp = selector.transform(train_sp)
        test_sp = selector.transform(test_sp)

        dtrain = xgb.DMatrix(train_sp, train_label)
        dtest = xgb.DMatrix(test_sp)

        # Alternative (tree booster) configuration; not passed to xgb.cv.
        params = {
            "objective": "multi:softprob",
            "num_class": 12,
            "booster": "gbtree",
            "eval_metric": "mlogloss",
            "eta": 0.05,
            "max_depth": 8,
            "subsample": 0.7,
            "colsample_bytree": 0.7,
            "num_parallel_tree": 1,
            "seed": 114,
            "silent": 1,
        }
        # Linear booster configuration actually used below.
        params2 = {
            "objective": "multi:softprob",
            "num_class": 12,
            "booster": "gblinear",
            "max_depth": 6,
            "eval_metric": "mlogloss",
            "eta": 0.05,
            "silent": 1,
            "lambda": 3,
            "alpha": 2,
        }

        res = xgb.cv(params2,
                     dtrain,
                     num_boost_round=700,
                     nfold=5,
                     callbacks=[
                         xgb.callback.print_evaluation(show_stdv=False),
                         xgb.callback.early_stop(3)
                     ])
        print(res)
def train_classifier_use_feature_selection(self):
    """Fit an SGD classifier on percentile-selected features and report on it.

    Reads the module-level training/test data (``x_train``, ``y_train``,
    ``x_test``, ``y_test``) and hyper-parameters (``_score_func``,
    ``_percentile``, ``_loss``, ``_penalty``, ``_alpha``, ``_n_iter``).
    Prints the selected shapes, the 10-fold ``f1_weighted`` cross-validation
    scores on the training data, and the score, classification report and
    confusion matrix on the test data.

    Returns:
        The fitted ``SGDClassifier``.
    """
    # ---- feature selection ----
    feature_selector = SelectPercentile(score_func=_score_func,
                                        percentile=_percentile)
    print("Fitting data with feature selection ...")
    feature_selector.fit(x_train, y_train)

    # how many features survive the selection
    train_selected = feature_selector.transform(x_train)
    print("Shape of array after feature selection is " + str(train_selected.shape))

    clf = SGDClassifier(loss=_loss, penalty=_penalty, alpha=_alpha,
                        n_iter=_n_iter, random_state=42)
    clf.fit(train_selected, y_train)

    # boolean mask of the kept features (computed but not used further here)
    feature_boolean = feature_selector.get_support(indices=False)

    # ---- cross-validation on the training data ----
    scores = cross_val_score(clf, train_selected, y_train, cv=10,
                             scoring='f1_weighted')
    print("Cross validation score: " + str(scores))
    # mean of the 10 fold scores, with two standard deviations as the spread
    cv_mean = scores.mean()
    cv_spread = scores.std() * 2
    print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (cv_mean, cv_spread))

    # ---- evaluation on the test data ----
    # the test data goes through the same fitted selector
    x_test_selector = feature_selector.transform(x_test)
    print("Shape of array for test data after feature selection is " + str(x_test_selector.shape))

    y_predicted = clf.predict(x_test_selector)
    test_score = clf.score(x_test_selector, y_test)
    print("Classifier score on test data is: %0.2f " % test_score)

    print(metrics.classification_report(y_test, y_predicted))
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)

    return clf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

cancer = load_breast_cancer()

# Generate 50 noise features and append them to the original data
rng = np.random.RandomState(42)
noise = rng.normal(size=(cancer.data.shape[0], 50))
X_w_noise = np.concatenate((cancer.data, noise), axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X_w_noise, cancer.target, test_size=0.5, random_state=0)

# Keep only 50% of the features
select = SelectPercentile(percentile=50)
X_train_selected = select.fit(X_train, y_train).transform(X_train)

print("X_train.shape : {}".format(X_train.shape))
print("X_train_selected.shape : {}".format(X_train_selected.shape))

# Boolean mask marking which features were kept
mask = select.get_support()
print(mask)  # mostly the original (non-noise) features are selected

# One-row image of the mask: white = selected, black = not selected
plt.matshow(mask.reshape(1, -1), cmap="gray")
plt.xlabel("feature num")

# Test data reduced with the same selector, for comparing performance with
# all features against the selected ones
X_test_selected = select.transform(X_test)
class LocalRegression(object):
    """Regression fitted on the nearest geographic neighbours of each sample.

    For every row to predict, the ``k`` nearest training rows (by location)
    are looked up and the regressor(s) are fitted on just those rows.  When a
    row's location contains a missing value, the mean of the training
    response is used instead.

    methods:
        fit( data, response, location_data )
        predict( data, location_data)
    """

    def __init__(self,
                 k=200,
                 feature_selection=False,
                 regressor=LinearRegression,
                 verbose=False,
                 params=None,
                 response_f=identity,
                 inv_response_f=identity):
        """Configure the model.

        Args:
            k: number of neighbours requested from the neighbour finder.
            feature_selection: when true, keep the top 50 percent of features
                (by ``f_regression``) of each neighbourhood before fitting.
            regressor: a regressor class, or a list of regressor classes whose
                predictions are averaged.
            verbose: print progress every 100 predicted rows.
            params: keyword arguments for the regressor; when ``regressor`` is
                a list, a sequence with one kwargs dict per regressor.
                ``None`` means no keyword arguments.
            response_f: transform applied to the neighbourhood response
                before fitting.
            inv_response_f: transform applied to the predictions.
        """
        self.k = k
        self.response_f = response_f
        self.inv_response_f = inv_response_f
        # counts, per coefficient index, how often a fitted coefficient was ~0
        self.zero_coef_ = np.zeros(10000)
        self.verbose = verbose
        self.feature_selection = feature_selection
        # `percentile` is keyword-only in scikit-learn >= 1.0
        self.selector = SelectPercentile(f_regression, percentile=50)

        # if regressor is a list of regressors, all of them are instantiated
        if isinstance(regressor, list):
            if params is None:
                params = [{} for _ in regressor]
            # Previously this read `self.regressor[i]` before the attribute
            # existed, so the list form always raised AttributeError.
            self.regressor = [
                reg_cls(**params[i]) for i, reg_cls in enumerate(regressor)
            ]
        else:
            self.regressor = [regressor(**(params or {}))]

    def __str__(self):
        # self.regressor is always a list of instances, so the name is built
        # from their class names.
        names = "+".join(type(reg).__name__ for reg in self.regressor)
        return "%dK%sLocalRegression" % (self.k, names.replace(" ", ""))

    @staticmethod
    def _values(obj):
        """Return ``obj.values`` when present (pandas objects), else ``obj``."""
        return getattr(obj, "values", obj)

    def fit(self, data, response, location_data):
        """Store the training data and build the neighbour finder.

        Each argument may be a numpy array or a pandas object.

        Returns:
            self
        """
        self.data_ = self._values(data)
        self.response_ = self._values(response)
        self.location_data_ = self._values(location_data)

        self.gnn = geoNN.GeoNNFinder(self.location_data_)
        return self

    def predict(self, data, location_data, weights=None):
        """Predict one value per row of ``data``.

        Args:
            data: features of the rows to predict (array or pandas object).
            location_data: locations of those rows; the first two columns are
                passed to the neighbour finder.
            weights: unused.

        Returns:
            1-D numpy array of predictions after ``inv_response_f`` and the
            final clamping step.

        Raises:
            Exception: when ``data`` and ``location_data`` differ in length.
        """
        if location_data.shape[0] != data.shape[0]:
            raise Exception(
                "length of first argument does not equal length of second argument."
            )

        n = location_data.shape[0]
        data = self._values(data)
        location_data = self._values(location_data)
        prediction = np.zeros(n)

        for i in range(n):
            if self.verbose and i % 100 == 0:
                print(i)

            location = location_data[i, :]
            if np.any(pandas.isnull(location)):
                # no usable location: fall back to the global mean response
                prediction[i] = self.response_.mean()
                continue

            inx = self.gnn.find(location[0], location[1], self.k)
            sub_data = self.data_[inx, :]
            sub_response = self.response_f(self.response_[inx, :])
            # Keep the sample as a one-row 2-D slice: scikit-learn's
            # transform/predict reject 1-D input.
            to_predict = data[i:i + 1, :]

            if self.feature_selection:
                sub_data = self.selector.fit_transform(sub_data, sub_response)
                to_predict = self.selector.transform(to_predict)

            for reg in self.regressor:
                reg.fit(sub_data, sub_response)
                try:
                    self.zero_coef_[np.nonzero(
                        abs(reg.coef_) < .000001)[0]] += 1
                except Exception:
                    # best-effort bookkeeping only (e.g. the regressor has no
                    # coef_); never let it break prediction
                    pass

            # take the average over all regressors
            prediction[i] = np.array(
                [reg.predict(to_predict) for reg in self.regressor]).mean()

        prediction = self.inv_response_f(prediction)
        # make sure everything is inside [0-100]
        # NOTE(review): out-of-range values are replaced by 99 and 4 rather
        # than by the bounds themselves; kept as in the original.
        prediction[prediction > 100] = 99
        prediction[prediction < 0] = 4
        return prediction

    def naive_n_estimators(self, i, argweights_sorted):
        """returns the deci-percentile plus 1, i.e. if the weight is the 86th percentile, return 9"""
        n = int(float(argweights_sorted[i]) / len(argweights_sorted) * 10) + 1
        return n

    def trivial_n_estimators(self, i, argweights_sorted):
        """Always use a single estimator."""
        return 1
print(data_frame)
# Silence numpy divide-by-zero / invalid-value warnings for the rest of the run
numpy.seterr(divide='ignore', invalid='ignore')

# Train/Test Split
x_train, x_test, y_train, y_test = train_test_split(data_frame, target, random_state=0)
print(f'\nTrain data shape: {x_train.shape}')
print(f'Test data shape{x_test.shape}')
print(f'Target shape {y_test.shape}')

# Feature Selection: keep 50% of the features, fitted on the training split only
selection = SelectPercentile(percentile=50)
selection.fit(x_train, y_train)
x_train_compressed = selection.transform(x_train)
print(f'\nTrain shape after selection: {x_train_compressed.shape}')
selection_status = list(selection.get_support())
print(f'Selection Status: {selection_status} Length: {len(selection_status)}')
x_test_compressed = selection.transform(x_test)

# Printing Selected Column Names
# Pair each support flag with its column position instead of maintaining a
# manual counter.
selected_columns = [
    data_column_names[position]
    for position, status in enumerate(selection_status)
    if status
]
print(f'Selected Columns: {selected_columns} Length: {len(selected_columns)}')

# Applying Linear Regression
# In[40]: #Accuracy over test set after training for all the features accuracy = clf.score(test.toarray(), np.asarray(y_test.flatten(), dtype=np.int64)) print("Accuracy of test set: ", accuracy) # In[56]: #Selecting the top-p percentile features p = 0.1 select = SelectPercentile(f_classif, percentile=p) select.fit(train, y_train) train_select = select.transform(train) # In[59]: #Batch wise training of top-p percentile features n = len(corpus) batch = 1000 clf2 = GaussianNB() test_select = select.transform(test) start = time.time() for i in range(int(n/batch)): s = i*batch e = (i+1)*batch
label=r'Univariate score ($-Log(p_{value})$)', color='darkorange', edgecolor='black') # ############################################################################# # Compare to the weights of an SVM clf = svm.SVC(kernel='linear') clf.fit(X, y) svm_weights = (clf.coef_ ** 2).sum(axis=0) svm_weights /= svm_weights.max() plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight', color='navy', edgecolor='black') clf_selected = svm.SVC(kernel='linear') clf_selected.fit(selector.transform(X), y) svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0) svm_weights_selected /= svm_weights_selected.max() plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected, width=.2, label='SVM weights after selection', color='c', edgecolor='black') plt.title("Comparing feature selection") plt.xlabel('Feature number') plt.yticks(()) plt.axis('tight') plt.legend(loc='upper right') plt.show()
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import cross_val_score
import pylab as pl

# Use 5-fold CV on the training set to find a suitable percentage of
# features to keep.
# NOTE(review): the selector is fitted on the whole training set before the
# CV split, so the CV scores may be optimistic -- confirm whether that matters.
percentiles = range(1, 100, 2)
results = []
for percentile in percentiles:
    fs = SelectPercentile(score_func=chi2, percentile=percentile)
    x_train_fs = fs.fit(x_train, y_train).transform(x_train)
    # 5-fold validation, so five scores come back; keep their mean
    scores = cross_val_score(dt, x_train_fs, y_train, cv=5)
    results.append(scores.mean())

results = np.array([round(mean_score, 4) for mean_score in results])
print(results)

best_index = results.argmax()
print("the Optimal Number of Features is %d" % (percentiles[best_index]))

pl.plot(percentiles, results)
pl.xlabel("percentile of features")
pl.ylabel("acc")
pl.show()

# Retrain with the best percentile found above and evaluate on the test set
fs = SelectPercentile(score_func=chi2, percentile=percentiles[best_index])
x_train_fs = fs.fit(x_train, y_train).transform(x_train)
selectedFeatures = np.array(vec.feature_names_)[fs.get_support()]
dt.fit(x_train_fs, y_train)
x_test_fs = fs.transform(x_test)
print("the score of DT with filtering features is ", dt.score(x_test_fs, y_test))
# Summary of the grid search fitted above
print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))

#. ILLUSTRATING INFORMATION LEAKAGE
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

# Generate independent random features and targets (no real relationship)
rnd = np.random.RandomState(seed=0)
X = rnd.normal(size=(100, 10000))
y = rnd.normal(size=(100, ))

# Selecting features on ALL samples, outside cross-validation
select = SelectPercentile(score_func=f_regression, percentile=5)
select.fit(X, y)
X_selected = select.transform(X)
print("X_selected.shape: {}".format(X_selected.shape))

# Cross-validating only the ridge model on the pre-selected features
ridge_only_scores = cross_val_score(Ridge(), X_selected, y, cv=5)
print("Cross-validation accuracy (cv only on ridge): {:.2f}".format(
    np.mean(ridge_only_scores)))

# Selection inside the pipeline is refitted on each training fold
pipe = Pipeline([
    ("select", SelectPercentile(score_func=f_regression, percentile=5)),
    ("ridge", Ridge()),
])
pipeline_scores = cross_val_score(pipe, X, y, cv=5)
print("Cross-validation accuracy (pipeline): {:.2f}".format(
    np.mean(pipeline_scores)))

#CONVENIENT PIPELINE INTERFACE WITH MAKE_PIPELINE
import numpy as np
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile
from sklearn.linear_model import LogisticRegression

cancer = load_breast_cancer()

# Append 50 random noise columns to the original features
rng = np.random.RandomState(42)
noise = rng.normal(size=(cancer.data.shape[0], 50))
X_w_noise = np.concatenate((cancer.data, noise), axis=1)

# Half of the samples for training, half for testing
X_train, X_test, y_train, y_test = train_test_split(
    X_w_noise, cancer.target, test_size=0.5, random_state=0)

# Keep 50% of the features, chosen on the training split
select = SelectPercentile(percentile=50)
X_train_selected = select.fit(X_train, y_train).transform(X_train)

print("X_train.shape: {}".format(X_train.shape))
print("X_train_selected.shape: {}".format(X_train_selected.shape))

# Boolean mask of the kept features, printed and shown as a one-row image
mask = select.get_support()
print(mask)
plt.matshow(mask.reshape(1, -1), cmap="gray_r")
plt.show()
# Standardize: the scaler is fitted on the training data only, then the same
# scaling is applied to both splits
scaler = StandardScaler()
X_train_standardized = scaler.fit(X_train_raw).transform(X_train_raw)
X_test_standardized = scaler.transform(X_test_raw)

# Univariate selection by mutual information: keep the top 60% of features
# (i.e. remove the lower 40%), fitted on the training data only
MI_selector = SelectPercentile(mutual_info_classif, percentile=60)
MI_selector.fit(X_train_standardized, y_train.values.ravel())

# Apply the fitted selector to both splits
X_train_MI = MI_selector.transform(X_train_standardized)
X_test_MI = MI_selector.transform(X_test_standardized)

# Summary
print("Feature Selection Results - Univariate Feature Selection")
print("Filter Result:")
print("Number of features: ",X_train_MI.shape[1])

# Horizontal bar chart of the 10 highest-scoring features
plt.figure(figsize=(10, 8), dpi= 60)
feat_scores = pd.Series(MI_selector.scores_, index=X_train_raw.columns)
top_feat = feat_scores.nlargest(10)
top_feat.plot(kind='barh')
plt.title("Feature Ranking by Mutual Information Score")