def main(idir, odir, featuredir, N=50, generatetxt=False, selectfeatures=False):
    if generatetxt:
        start = [2017, 1, 1]
        end = [2017, 9, 1]
        xls2txt(idir, odir, start, end)
    if selectfeatures:
        feature_selection(odir, featuredir)

    date = readdate(os.path.join(odir, 'datetime.txt'))
    length = len(date) - 1

    # Load one time series per selected feature.
    features = {}
    with open(os.path.join(featuredir, 'features.txt'), 'r') as f:
        for line in f:
            features[line.strip()] = readdata(os.path.join(odir, line.strip()))

    # Load the reference (ground-truth) abnormity timestamps.
    r_abnormities = []
    with open(os.path.join(featuredir, 'abnormities.txt'), 'r') as f:
        for line in f:
            r_abnormities.append(line.strip())
    r_abnormities_ts = str2timestamp(r_abnormities)

    abnormities = gaussian_detection(features, phi=1.96)

    # Score each feature's detections against the reference abnormities.
    f1_scores = {}
    for feature in abnormities:
        abnormity = abnormities[feature]
        f1_scores[feature] = adj_f1(
            str2timestamp([date[i] for i in abnormity]), r_abnormities_ts)

    count = weighted_count(abnormities, f1_scores, length)
    print(get_abnormity(count, date, N))
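# [Hedged sketch] gaussian_detection is called above with phi=1.96 but not
# shown. A minimal assumed implementation: flag any point lying more than phi
# standard deviations from a feature's mean (phi=1.96 is the two-sided 95%
# Gaussian interval). Only the dict-in/dict-of-indices-out contract is taken
# from the call site; everything else is an assumption.
import numpy as np

def gaussian_detection(features, phi=1.96):
    abnormities = {}
    for name, series in features.items():
        x = np.asarray(series, dtype=float)
        mu, sigma = x.mean(), x.std()
        if sigma == 0:
            abnormities[name] = []
            continue
        z = np.abs(x - mu) / sigma
        abnormities[name] = np.where(z > phi)[0].tolist()  # abnormal indices
    return abnormities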
import random

import pandas as pd
from numpy import concatenate, float64


def read_dataset(size_training):
    full_train = pd.read_csv("../../input/train.csv")
    selected_train = feature_selection(full_train)

    # Split rows by target class; column 1 is the target, columns 2+ are features.
    zero_train = selected_train.loc[selected_train['target'] == 0].values
    one_train = selected_train.loc[selected_train['target'] == 1].values
    zero_features_train = zero_train[:, 2:]
    zero_targets_train = zero_train[:, 1]
    one_features_train = one_train[:, 2:]
    one_targets_train = one_train[:, 1]

    # Downsample the target == 0 class to size_training rows.
    random_training_zero = list(range(zero_targets_train.shape[0]))
    random.shuffle(random_training_zero)
    zero_features_train = zero_features_train[random_training_zero[0:size_training], :]
    zero_targets_train = zero_targets_train[random_training_zero[0:size_training]]

    features_train = concatenate((zero_features_train, one_features_train),
                                 axis=0).astype(float64)
    targets_train = concatenate((zero_targets_train, one_targets_train),
                                axis=0).astype(int)

    full_test = pd.read_csv("../../input/test.csv")
    test = feature_selection(full_test).values
    features_test = test[:, 1:].astype(float64)

    return features_train, targets_train, features_test
def main():
    feature_selection()
    linear_regression_predict()
    polynomial_regression_predict()
    randomforest_predict()
    bp_predict()
    svr_predict()
def feature_selection_topn(self):
    module_path = dirname(__file__)
    f = open(join(module_path, 'sj_names.txt'))
    class_list = []
    term_str = []
    try:
        for line in f:
            lt = line.split(',')
            if lt[1] == '全部':  # skip the catch-all "All" category
                continue
            class_list.append(lt[0])
            temstr = lt[2].split('(')
            seg_list = jieba.cut_for_search(temstr[0])  # jieba search-engine mode
            terlist = ", ".join(seg_list)  # join the segments into a string
            try:
                # Strip whitespace and drop single-character terms.
                term_str.append([
                    term.strip() for term in terlist.split(',')
                    if len(term.strip()) > 1
                ])
            except UnicodeEncodeError:
                print('err')
    finally:
        f.close()
    print(len(term_str))

    # Keep the 2000 terms with the highest information gain.
    term_set_fs = fs.feature_selection(term_str, class_list, 'IG')[:2000]
    # Build the term -> index dictionary.
    self.term_set_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    f2 = open('feature_term_result.txt', 'w+', encoding='utf-8')
    for term in term_set_fs:
        f2.write(term + '\n')
    f2.close()
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print 'Loading dataset, 80% for training, 20% for testing...'
    movie_reviews = load_files(dataset_dir_name)
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = \
        train_test_split(movie_reviews.data, movie_reviews.target,
                         test_size=0.2, random_state=0)

    print 'Feature selection...'
    print 'fs method:' + fs_method, 'fs num:' + str(fs_num)
    vectorizer = CountVectorizer(binary=True)
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [word_tokenizer(doc_str)
                            for doc_str in doc_str_list_train]
    term_set_fs = feature_selection.feature_selection(
        doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]

    print 'Building VSM model...'
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)
    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print 'Accuracy: ', acc
    return acc
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print('Loading dataset, 80% for training, 20% for testing...')
    movie_reviews = load_files(dataset_dir_name)
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(
        movie_reviews.data, movie_reviews.target, test_size=0.2, random_state=0)

    print('Feature selection...')
    print('fs method:' + fs_method, 'fs num:' + str(fs_num))
    vectorizer = CountVectorizer(binary=True)
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [
        word_tokenizer(doc_str) for doc_str in doc_str_list_train
    ]
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train,
                                                      doc_class_list_train,
                                                      fs_method)[:fs_num]

    print('Building VSM model...')
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # invoke the MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)
    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print('Accuracy: ', acc)
    return acc
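# [Hedged sketch] a minimal driver for text_classifly_twang, assuming the
# directory layout expected by sklearn's load_files (one subfolder per class).
# The method names 'IG', 'MI', 'WLLR' and the fs_num sweep are assumptions
# about what this feature_selection module accepts.
if __name__ == '__main__':
    dataset_dir_name = 'dataset'  # hypothetical path: one subdirectory per class
    for fs_method in ['IG', 'MI', 'WLLR']:
        accs = [text_classifly_twang(dataset_dir_name, fs_method, fs_num)
                for fs_num in range(1000, 5000, 1000)]
        print(fs_method, accs)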
def cross_validation(y, x, k_indices, k, lambda_):
    """k-th fold cross-validation: fold k is the test set, the rest train."""
    train_idxs = [
        n for (i, idxs) in enumerate(k_indices) for n in idxs if i != k
    ]
    test_idxs = k_indices[k]
    x_train, y_train = x[train_idxs], y[train_idxs]
    x_test, y_test = x[test_idxs], y[test_idxs]

    # Prepend a bias column of ones to both splits.
    x_train = np.c_[np.ones(len(y_train)), x_train]
    x_test = np.c_[np.ones(len(y_test)), x_test]

    x_train, x_test, w, indices = feature_selection(x_train, y_train,
                                                    x_test, y_test, lambda_)
    loss_tr = compute_mse(y_train, x_train, w)
    loss_te = compute_mse(y_test, x_test, w)
    return loss_tr, loss_te, w, indices
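# [Hedged sketch] cross_validation above expects k_indices from a fold builder
# and a compute_mse helper, neither of which is shown. These are conventional
# numpy implementations, offered as assumptions rather than the original code.
import numpy as np

def build_k_indices(y, k_fold, seed=1):
    """Shuffle the row indices and split them into k equal folds."""
    num_row = y.shape[0]
    interval = num_row // k_fold
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    return np.array([indices[k * interval:(k + 1) * interval]
                     for k in range(k_fold)])

def compute_mse(y, tx, w):
    """Mean squared error of the linear model tx @ w (1/2N convention)."""
    e = y - tx.dot(w)
    return e.dot(e) / (2 * len(y))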
def fit(self, Xs, y, time_remain):
    Xs, self.y = data_sample(Xs, y)
    self.Xs = copy.deepcopy(Xs)
    X, y, feature_names, cat_feature_map, stampcol = baseline_features(
        Xs, y, self.config)

    # Select roughly a fifth of the baseline columns.
    features_from_base, self.feature_selection_models = feature_selection(
        X, y, int(len(X.columns) / 5), feature_names, cat_feature_map)

    X, self.cat_dict_counts = cat_value_counts(X, list(cat_feature_map.keys()))
    X = pd.concat([X, features_from_base], axis=1)

    # Disabled experiments: one-hot feature selection, timestamp features,
    # polynomial features, and XGBClassifier feature importances.
    # one_hot_feature, models = onehot_feature_selection(
    #     X, y, cat_feature_map.keys(), feature_num_everyiter=1)
    # timestamp_features(X, y, features_from_base, cat_feature_map,
    #                    self.config, stampcol)
    # X = polyfeatures(X)

    train(X, y, self.config)
def load_dataset(split):
    df = pd.read_csv('data/text_emotion.csv')
    df.columns = ['id', 'class', 'author', 'tweet']

    if os.path.exists('data_ml/text_emotion_features.npy'):
        X = np.load('data_ml/text_emotion_features.npy')
    else:
        print('Fix encoding...')
        df = fix_encoding(df)
        print('Split sentences...')
        df = split_tweet_sentences(df)
        print('Tokenize tweets...')
        df = tokenize_tweets(df)
        print('Lemmatize tweets...')
        df = get_lemmas(df)

        # Valence/arousal lexicon (Warriner et al. ratings).
        lexicon = pd.read_csv('lexicons/Ratings_Warriner_et_al.csv',
                              usecols=[0, 1, 2, 5], index_col=0)
        lexicon.columns = ['word', 'valence', 'arousal']

        path_to_jar = 'stanford_parser/stanford-parser.jar'
        path_to_models_jar = 'stanford_parser/stanford-parser-3.9.1-models.jar'
        valence_shifter = FeatureExtractionContextValenceShifting(
            path_to_jar, path_to_models_jar, lexicon)
        df = valence_shifter.get_initial_valences(df)

        featured_dataset, vocab = generate_initial_features(df)
        X = featured_dataset['valences'].values.tolist()[:split]
        y = featured_dataset['class'].values.tolist()[:split]
        selected, mask = feature_selection(X, y, vocab)

        # Keep only the selected valence features for every row.
        for index, row in featured_dataset.iterrows():
            valences = np.array(row.valences)[mask]
            featured_dataset.at[index, 'valences'] = valences
        X = np.vstack(featured_dataset.valences.values)
        np.save('data_ml/text_emotion_features', X)

    # Map class labels to integer ids.
    classes = df['class'].values.tolist()
    c = np.unique(classes).tolist()
    d = dict([(y, x) for x, y in enumerate(c)])
    classes = np.array([d[x] for x in classes])
    return X, classes, len(c)
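# [Hedged sketch] load_dataset above unpacks `selected, mask = feature_selection(X, y, vocab)`
# but the helper itself is not shown. A hypothetical SelectKBest-style filter
# with the same (selected, mask) contract; the scoring function and k are
# assumptions.
from sklearn.feature_selection import SelectKBest, f_classif

def feature_selection(X, y, vocab, k=1000):
    selector = SelectKBest(f_classif, k=min(k, len(vocab)))
    selector.fit(X, y)
    mask = selector.get_support()  # boolean mask over the feature columns
    selected = [v for v, keep in zip(vocab, mask) if keep]
    return selected, mask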
models = ['Linear', 'Ridge', 'AdaBoost', 'RandomForest', 'SVM']
for imp_method in ['Mode', 'KNN']:
    for outcome in outcomes:
        print("Generating results for {}".format(outcome))
        features_path = 'features_{}_{}.csv'.format(outcome, imp_method)
        labels_path = 'labels_{}_{}.csv'.format(outcome, imp_method)

        print("Loading dataset...")
        X, Y = get_data(args.data_dir, features_path, labels_path)
        is_classf = Y.dtype == np.int8
        print("Successfully loaded dataset.")

        for fs_method in fs_methods:
            if fs_method:
                print("Performing feature selection using {}...".format(fs_method))
                print(X.shape)
                X_subset = feature_selection(X, Y, outcome, fs_method,
                                             imp_method, args.data_dir,
                                             verbose=1)
                print(X_subset.shape)
                # Training over all models is currently disabled:
                # for model in models:
                #     train(model, X_subset, Y, is_classf, outcome, fs_method,
                #           imp_method, args.data_dir, args.results_dir,
                #           verbose=1)
def main():
    """Wrapper function which calls all the other functions"""
    rolled_df = deriving_features.create_dataframe_with_features()
    print(rolled_df.columns)
    target = input("Enter the column name of y variable:")
    e = eda(rolled_df, target)
    event = input("Enter the event:")
    er = e.eventRatio(event)
    print(er)

    # Descriptive statistics.
    stat = e.impstat()
    rng = e.range().rename('Range')
    print("Size is")
    print(rng.size)
    iq = e.iqr().rename('IQR')
    cor = e.corr()
    print(cor)
    ske = e.skew()
    print(ske)
    ske = ske.rename('Skewness')
    kur = e.kurt()
    print(kur)
    kur = kur.rename('Kurtosis')
    [mi, mi1] = e.missinginfo()
    print("missing value is")
    print(pd.Series(mi1[1:]))
    mi1 = mi1.rename('Missing values')
    e.missingplot()
    b = e.bin('woe', 'y', 'yes', 'pdays', 'day')
    print(b)

    # Write the EDA report to Excel; remove a stale report first if present.
    try:
        os.remove('D:/Other projects/python modules/Report.xlsx')
        writer = pd.ExcelWriter('Report.xlsx', engine='xlsxwriter')
        stat1 = pd.DataFrame(stat.T)
        print(stat1)
        stat1.to_excel(writer, startcol=0, startrow=5)
        ws = writer.sheets['Sheet1']
        ws.write_string(1, 4, 'DataDescription')
        rng.to_excel(writer, startcol=9, startrow=5, index=False)
        iq.to_excel(writer, startcol=10, startrow=5, index=False)
        ske.to_excel(writer, startcol=11, startrow=5, index=False)
        kur.to_excel(writer, startcol=12, startrow=5, index=False)
        ws.write_string(5 + rng.size + 2, 5, 'Correlation')
        cor.to_excel(writer, startcol=0, startrow=5 + rng.size + 4)
        b.to_excel(writer, startcol=12, startrow=5 + rng.size + 4)
        writer.close()
    except OSError:
        writer = pd.ExcelWriter('Report.xlsx', engine='xlsxwriter')
        stat1 = pd.DataFrame(stat.T)
        print(stat1)
        stat1.to_excel(writer, startcol=0, startrow=5)
        ws = writer.sheets['Sheet1']
        ws.write_string(1, 4, 'DataDescription')
        rng.to_excel(writer, startcol=9, startrow=5, index=False)
        iq.to_excel(writer, startcol=10, startrow=5, index=False)
        ske.to_excel(writer, startcol=11, startrow=5, index=False)
        kur.to_excel(writer, startcol=12, startrow=5, index=False)
        ws.write_string(5 + rng.size + 2, 5, 'Correlation')
        cor.to_excel(writer, startcol=0, startrow=5 + rng.size + 4)
        mi1[1:].to_excel(writer, startcol=rng.size + 2,
                         startrow=5 + rng.size + 4)
        writer.close()

    # Feature transformation.
    ft = feature_transformation(rolled_df, target)
    p = True
    while p:
        degree = input("Enter the degree of the polynomial features you want to derive:")
        try:
            degree = int(degree)
            p = False
        except ValueError:
            print("You did not enter a correct value. Try again")
            p = True

    poly_feature_set = ft.poly_features()
    feature_transformed_df = ft.transformation()
    cols_to_use = poly_feature_set.columns.difference(
        feature_transformed_df.columns)
    final_df = pd.merge(feature_transformed_df, poly_feature_set[cols_to_use],
                        left_index=True, right_index=True, how='outer')
    print(final_df.columns)

    # Encode categorical columns.
    cat = [x for x in final_df.columns if final_df[x].dtypes == 'object']
    label_encod = ft.label_encoding(final_df)
    one_hot = ft.one_hot_encoding(label_encod, cat)
    cols_use = one_hot.columns.difference(final_df.columns)
    final_f = pd.merge(final_df, one_hot[cols_use],
                       left_index=True, right_index=True, how='outer')

    # Kernel transformation.
    nystroem_rbf_dataframe = ft.kernel_transformation_using_nystroem_rbf(
        final_f, cat)
    cols_needed = nystroem_rbf_dataframe.columns.difference(final_df.columns)
    final_data = pd.merge(final_df, nystroem_rbf_dataframe[cols_needed],
                          left_index=True, right_index=True, how='outer')

    # Optionally persist the engineered dataframe.
    p1 = True
    while p1:
        try:
            p1 = False
            f = input("Enter 1 if you want to write the dataframe into a csv file else enter 0:")
            if int(f) == 1:
                path = input("Enter the path where you want to save:")
                final_data.to_csv(path, index=False)
        except ValueError:
            p1 = True
            print("You have entered a wrong value. Please try again.")

    # Feature selection on numeric, non-date, non-target columns.
    datecol = [x for x in final_data.columns
               if final_data[x].dtypes == 'datetime64[ns]']
    X1 = [x for x in final_data.columns
          if final_data[x].dtypes != 'object' and x not in datecol and x != target]
    X = [x for x in X1 if x not in cat]
    fs = feature_selection(final_data, target)
    fs.recursive_feature_elimination(X)
# pre-training/initialization of the parameters of the 1-layer NN:
w_h1, w_o, b1, bo, t1, accuracy1[t], F1[t] = NN_pretraining_one(
    trX, trY, teX, teY, K)
time1[t] = t1 / 10000

# Candidate feature subsets for the first stage:
# f_subset = np.arange(19)         # respiration features
# f_subset = np.arange(19, 32, 1)  # wrist features
# f_subset = np.arange(37)         # all the features
f_subset = [25, 29]  # median of (roll, pitch)

# number of features used in the first stage
f = 5
# indicator whether to use z-normalization; set z to 1 when we have 5 features
z = 1
# training and testing data in the first stage, trX2 and teX2:
trX2, teX2 = feature_selection(trX, teX, f_subset, f, z)

# firm cascade:
plambda = [0.25]
t3, accuracy3[t], F3[t], nnz3[t] = cascade_two_stage(
    trX, trY, teX, teY, trX2, teX2, w_h1, w_o, b1, bo, plambda, a)
time3[t] = t3 / 10000

# soft cascade:
beta = [0.0001]
t2, accuracy2[t], F2[t], nnz2[t] = soft_cascade_LR_1LNN(
    trX, trY, teX, teY, trX2, teX2, beta, K)
time2[t] = t2 / 10000

t += 1
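# [Hedged sketch] the feature_selection(trX, teX, f_subset, f, z) helper used
# here (and in the other cascade snippets below) is not shown. Inferred from
# the call sites: keep the f columns listed in f_subset and z-normalize with
# training-set statistics when z == 1; the implementation is an assumption.
import numpy as np

def feature_selection(trX, teX, f_subset, f, z):
    f_subset = list(f_subset)[:f]
    trX2 = np.asarray(trX, dtype=float)[:, f_subset]
    teX2 = np.asarray(teX, dtype=float)[:, f_subset]
    if z == 1:
        mu = trX2.mean(axis=0)
        sigma = trX2.std(axis=0)
        sigma[sigma == 0] = 1.0  # guard against constant columns
        trX2 = (trX2 - mu) / sigma
        teX2 = (teX2 - mu) / sigma  # normalize test data with train statistics
    return trX2, teX2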
def text_classifly_twang(vectorizer, doc_str_list_train, doc_str_list_test,
                         doc_class_list_train, doc_class_list_test,
                         doc_terms_list_train, fs_method, fs_num):
    # The dataset directory holds one subfolder per class: the folder name is
    # the class label and it contains the txt files belonging to that class.
    # fs_method is the feature selection method.
    # fs_num is the number of top-ranked features kept after selection.
    print('Loading dataset, 80% for training, 20% for testing...')
    print('Feature selection...')
    print('fs method:' + fs_method, 'fs num:' + str(fs_num))

    # Note: an earlier encoding problem was fixed by building the vectorizer
    # with decode_error='ignore' (the default is 'strict'); see
    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
    selectedFeatures = feature_selection.feature_selection(
        doc_terms_list_train, doc_class_list_train, fs_method)
    print(len(selectedFeatures))
    term_set_fs = selectedFeatures[:fs_num]
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))

    # Pin the vocabulary so only the selected terms are counted.
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    # Alternative classifiers (MultinomialNB, linear SVC, MLPClassifier) were
    # tried and left disabled; BernoulliRBM cannot be used here because it has
    # no predict method.
    print('Building KNN model...')
    knnclf = KNeighborsClassifier()
    knnclf.fit(doc_train_vec, doc_class_list_train)
    doc_test_predicted = knnclf.predict(doc_test_vec)

    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print('Accuracy:', acc)
    return acc
def predict_results(model, df, save_csv=True):
    # NOTE: this def line is reconstructed from the call at the bottom of the
    # script; the save_csv default is an assumption.
    predictions = model.predict(df)
    result = load_submission()
    result['SalePrice'] = predictions
    if save_csv:
        result.to_csv('../data/processed/test_results.csv', index=False)
    return result


def load_submission(path='../data/raw/sample_submission.csv'):
    return pd.read_csv(path)


def preprocessing(df):
    df = pd.get_dummies(df, drop_first=True)
    return df


def load_model(filename="../models/2-gradient-boosting.sav"):
    return joblib.load(filename)


if __name__ == "__main__":
    os.chdir(os.path.dirname(sys.argv[0]))
    model = load_model()
    test = pd.read_csv('../data/raw/test.csv', index_col=0)
    test = clean(test, to_test=True)
    test = feature_selection(test)
    test = preprocessing(test)
    print(predict_results(model, test))
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Label-encode categorical columns consistently across train and test.
for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))

trainX = train.drop(["ID", "y"], axis=1).values
trainY = train['y'].values

model = feature_selection.feature_selection(
    trainX, trainY, chi2, method="SelectKBest", k=150)
# trainX_new = model.transform(trainX)

# LightGBM parameters.
params = {}
params['max_bin'] = 10
params['learning_rate'] = 0.003     # shrinkage_rate
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'l1'             # or 'mae'
params['sub_feature'] = 0.95        # feature_fraction
params['bagging_fraction'] = 0.85   # sub_row
params['bagging_freq'] = 20
params['num_leaves'] = 512          # num_leaf
params['min_data'] = 500            # min_data_in_leaf
params['min_hessian'] = 0.05        # min_sum_hessian_in_leaf
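# [Hedged sketch] how these params would typically feed LightGBM's native API.
# num_boost_round and applying the selected 150-column subset via
# model.transform (suggested by the commented line above) are assumptions.
import lightgbm as lgb

trainX_new = model.transform(trainX)  # keep the 150 selected columns
testX_new = model.transform(test.drop(["ID"], axis=1).values)
d_train = lgb.Dataset(trainX_new, label=trainY)
booster = lgb.train(params, d_train, num_boost_round=1000)
preds = booster.predict(testX_new)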
def run_model_recursive(apt_fname, tr_te_split, res_file_pref, freq,
                        is_feat_select, is_draw):
    ############
    # load data
    ############
    print('========================================' * 2)
    print(apt_fname, res_file_pref)
    print(tr_te_split)
    df = load_data(apt_fname, freq)
    train = df[tr_te_split['trb']:tr_te_split['tre']]
    test = df[tr_te_split['teb']:tr_te_split['tee']]
    print(test)
    print('train/test:', train.shape, test.shape)

    feat = list(train.columns.values)
    feat.remove('energy')
    feat.remove('raw_energy')
    print('features (%d):' % len(feat), feat)
    print('index of energy-1:', feat.index('energy-1'))

    X_train = train[feat].values
    y_train = train['energy'].values
    X_test = test[feat].values
    y_test = test['raw_energy'].values
    print('train/test (after converting to matrix):', X_train.shape, X_test.shape)

    ####################
    # feature selection
    ####################
    if is_feat_select:
        print('feature selection ...')
        selected = feature_selection(X_train, y_train, 12)
        print(len(selected))
        print('selected features (%d):' % sum(selected),
              [feat[i] for i in range(len(selected)) if selected[i]])
        X_train = X_train[:, selected]
        X_test = X_test[:, selected]
        print('train/test (after feature selection):', X_train.shape, X_test.shape)
        res_file_pref += '_feature'

    ########
    # train
    ########
    print('training ...')
    parameters = {
        'n_estimators': (50, 100, 150, 200, 250, 300, 350, 400, 450, 500),
        'max_depth': [1, 2, 3],
        'learning_rate': [0.001, 0.01, 0.1],
        'random_state': [42],
        'loss': ['ls']
    }
    # smaller debug grid:
    # parameters = {'n_estimators': (50,), 'max_depth': [1],
    #               'learning_rate': [0.001], 'random_state': [42],
    #               'loss': ['ls']}
    clf = GridSearchCV(GradientBoostingRegressor(), param_grid=parameters,
                       cv=TimeSeriesSplit(n_splits=3),
                       scoring='neg_mean_squared_error')
    clf.fit(X_train, y_train)
    print(clf.best_params_)

    #######
    # test
    #######
    print('testing (recursive) ...')
    y_pred = []
    for i in range(len(X_test)):
        # Overwrite up to 49 lag columns (starting at 'energy-1') with the
        # model's own log-transformed predictions for the preceding steps.
        for j in range(min(i, 49)):
            X_test[i][j + feat.index('energy-1')] = np.log(y_pred[-j - 1] + 1)
        y_p = clf.predict([X_test[i]])[0]
        y_p = np.exp(y_p) - 1  # invert the log1p target transform
        y_pred.append(y_p)

    #############
    # evaluation
    #############
    mse = mean_squared_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print('MSE:', mse)
    print('MAPE:', mape)
    print('save result to file ...')
    pickle.dump({'y_test': y_test, 'y_pred': y_pred},
                open(res_file_pref + '_mse%.4f_mape%.4f.pkl' % (mse, mape), 'wb'))
    print('saved.')

    if is_draw:
        pyplot.plot(y_test)
        pyplot.plot(y_pred, color='red')
        pyplot.show()
    # all the work is done here
    def transform(self, X):
        X = pd.Series(X)
        X_tagged = X.apply(self.custom_feat).apply(pd.Series).fillna(0)
        X_tagged['n_tokens'] = X_tagged.apply(sum, axis=1)
        if self.normalize:
            # Convert raw counts to per-token proportions.
            X_tagged = X_tagged.divide(X_tagged['n_tokens'], axis=0).fillna(0)
        return X_tagged


if __name__ == '__main__':
    qd = question_detector()
    feature_sel = feature_selection()

    data = pd.read_csv(qd.input_file)
    X_train_1 = data['post_text']
    y_train_1 = data['category']
    print(X_train_1.shape, "\t", y_train_1.shape)

    data = pd.read_csv(qd.input_file_2)
    X_train_2 = data['post_text']
    y_train_2 = data['category']
    print(X_train_2.shape, "\t", y_train_2.shape)

    uniques, count = np.unique(y_train_2, return_counts=True)
    print(dict(zip(uniques, count)))
    sys.exit()
import os


def preprocessing_train(df):
    df = pd.get_dummies(df, drop_first=True)
    return df


def train_model(df):
    # Use a 1-D Series for the target to avoid sklearn's column-vector warning.
    X_train, y_train = df.drop(columns=['SalePrice']), df['SalePrice']
    X_train = preprocessing_train(X_train)
    model = GradientBoostingRegressor(n_estimators=3500,
                                      learning_rate=0.01,
                                      max_depth=4,
                                      max_features='sqrt',
                                      min_samples_leaf=15,
                                      min_samples_split=10,
                                      loss='huber',
                                      random_state=42)
    model.fit(X_train, y_train)
    return model


if __name__ == "__main__":
    os.chdir(os.path.dirname(sys.argv[0]))
    df = pd.read_csv('../data/raw/train.csv', index_col=0)
    df = clean(df)
    df = feature_selection(df)
    model = train_model(df)
    joblib.dump(model, '../models/2-gradient-boosting.sav')
def text_classifly_twang(vectorizer, doc_str_list_train, doc_str_list_test,
                         doc_class_list_train, doc_class_list_test,
                         doc_terms_list_train, fs_method, fs_num, cf_method):
    print('Loading dataset, 80% for training, 20% for testing...')
    print('Feature selection...')
    print('fs method:' + fs_method, 'fs num:' + str(fs_num))
    selectedFeatures = feature_selection.feature_selection(
        doc_terms_list_train, doc_class_list_train, fs_method)
    print('-------select feature_selection')

    # Optionally print the first few hundred feature terms:
    # for count, term in enumerate(selectedFeatures[1:500]):
    #     print(count, '\t', term)
    print('number of feature terms:')
    print(len(selectedFeatures))
    term_set_fs = selectedFeatures[:fs_num]  # keep the top fs_num feature terms
    # Term dictionary: key is the feature term, value is its column index.
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))

    # Term-frequency matrix: pin the vocabulary so only terms in the
    # dictionary are counted.
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    if cf_method == 'nb':
        print('Building Naive Bayes model...')
        clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)
        doc_test_predicted = clf.predict(doc_test_vec)
    elif cf_method == 'svm':
        print('Building SVM model...')
        svclf = SVC(kernel='linear')  # default kernel is 'rbf'
        svclf.fit(doc_train_vec, doc_class_list_train)
        doc_test_predicted = svclf.predict(doc_test_vec)
    elif cf_method == 'knn':
        print('Building KNN model...')
        knnclf = KNeighborsClassifier(5)  # k value
        knnclf.fit(doc_train_vec, doc_class_list_train)
        doc_test_predicted = knnclf.predict(doc_test_vec)
    elif cf_method == 'bp':
        print('Building Multilayer perceptron classifier model...')
        mlpclf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                               hidden_layer_sizes=(50, 20), random_state=1)
        mlpclf.fit(doc_train_vec, doc_class_list_train)
        doc_test_predicted = mlpclf.predict(doc_test_vec)
    # BernoulliRBM cannot be used here: it has no predict method.

    acc = np.mean(doc_test_predicted == doc_class_list_test)
    printOption = True
    if printOption:
        print('Accuracy:', acc)
        f1_micro = metrics.f1_score(doc_class_list_test, doc_test_predicted,
                                    average='micro')
        # average='macro' ignores class imbalance; the three classes used in
        # this experiment are of similar size, so the unweighted mean of the
        # per-class f1 values is acceptable.
        f1_macro = metrics.f1_score(doc_class_list_test, doc_test_predicted,
                                    average='macro')
        print('f1_micro:\t', f1_micro, '\tf1_macro:\t', f1_macro)
        print(metrics.f1_score(doc_class_list_test, doc_test_predicted,
                               average=None))
        # Only articles from three classes were selected.
        print(metrics.classification_report(
            doc_class_list_test, doc_test_predicted,
            target_names=['sport', 'economy', 'computer']))
        print(metrics.confusion_matrix(doc_class_list_test, doc_test_predicted))
    else:
        f1_macro = 0
        acc = 0
    return f1_macro, acc
trX1 = trX
teX1 = teX
plambda2 = [0.425]
K = 3
v_h1, v_o, c1, co, time22, accuracy22, F22 = second_stage_pretraining(
    trX, trY, teX, teY, trX1, teX1, K, w_h1, w_h2, w_o, b1, b2, bo, plambda2, a)

f_subset1 = np.arange(19)         # respiration features
f_subset2 = np.arange(19, 32, 1)  # wrist features
f1 = 19
z1 = 1
f2 = 13
z2 = 1
trX1, teX1 = feature_selection(trX, teX, f_subset1, f1, z1)
trX2, teX2 = feature_selection(trX, teX, f_subset2, f2, z2)

# firm cascade:
plambda = [0.05]
t3, accuracy3[t], F3[t], nnz = tree_cascade_v1(
    trX, trY, teX, teY, trX1, teX1, trX2, teX2, w_h1, w_h2, w_o, b1, b2, bo,
    v_h1, v_o, c1, co, plambda, a)
time3[t] = t3 / 10000

# soft cascade:
beta = [0.001]
t2, accuracy2[t], F2[t], nnz_soft = tree_soft_cascade_v1(
    trX, trY, teX, teY, trX1, teX1, trX2, teX2, beta, K, K1, K2)
time2[t] = t2 / 10000
def run_model(apt_fname, tr_te_split, res_file_pref, freq, is_feat_select,
              is_draw):
    ############
    # load data
    ############
    print('========================================' * 2)
    # print(apt_fname, res_file_pref)
    print(tr_te_split)
    df = load_data(apt_fname, freq)
    train = df[tr_te_split['trb']:tr_te_split['tre']]
    test = df[tr_te_split['teb']:tr_te_split['tee']]
    # print(test)
    # print('train/test:', train.shape, test.shape)

    feat = list(train.columns.values)
    feat.remove('energy')
    feat.remove('raw_energy')
    # print('raw features (%d):' % len(feat), feat)

    X_train = train[feat].values
    y_train = train['energy'].values
    X_test = test[feat].values
    y_test = test['raw_energy'].values
    # print('train/test (after converting to matrix):', X_train.shape, X_test.shape)

    ####################
    # feature selection
    ####################
    if is_feat_select:
        print('feature selection ...')
        selected = feature_selection(X_train, y_train, 12)
        print(len(selected))
        print('selected features (%d):' % sum(selected),
              [feat[i] for i in range(len(selected)) if selected[i]])
        X_train = X_train[:, selected]
        X_test = X_test[:, selected]
        print('train/test (after feature selection):', X_train.shape, X_test.shape)
        res_file_pref += '_feature'

    ########
    # train
    ########
    print('training ...')
    parameters = {'C': (0.001, 0.01, 0.1, 1),
                  'kernel': ['rbf', 'linear', 'poly', 'sigmoid']}
    clf = GridSearchCV(svm.SVR(), param_grid=parameters,
                       cv=TimeSeriesSplit(n_splits=3),
                       scoring='neg_mean_squared_error')
    clf.fit(X_train, y_train)
    print(clf.best_params_)

    #######
    # test
    #######
    print('testing ...')
    y_pred = clf.predict(X_test)
    # y_pred = np.exp(np.cumsum(np.concatenate(([np.log(y_test[0])], y_pred))))
    y_pred = np.exp(y_pred) - 1  # invert the log1p target transform

    #############
    # evaluation
    #############
    mse = mean_squared_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print('MSE:', mse)
    print('MAPE:', mape)
    print('save result to file ...')
    pickle.dump({'y_test': y_test, 'y_pred': y_pred},
                open(res_file_pref + '_mse%.4f_mape%.4f.pkl' % (mse, mape), 'wb'))
    print('saved.')

    if is_draw:
        pyplot.plot(y_test)
        pyplot.plot(y_pred, color='red')
        pyplot.show()
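# [Hedged sketch] both run_model variants call mean_absolute_percentage_error,
# which is not a sklearn builtin in the versions this code targets. The
# conventional definition, offered as an assumption:
import numpy as np

def mean_absolute_percentage_error(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100  # assumes no zeros in y_true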
        with open(results_path, 'w') as f:
            json.dump(scores, f)
        if verbose:
            print("Successfully saved scores.")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Fragile Families Train Script")
    parser.add_argument('model', help="model")
    parser.add_argument('outcome', help="outcome")
    parser.add_argument('-i', dest='imp_method', help="imputation method",
                        default='KNN')
    parser.add_argument('-m', dest='fs_method', help="feature selection method",
                        default='ElasticNet')
    parser.add_argument('-d', dest='data_dir', help='data directory',
                        default='data')
    parser.add_argument('-s', dest='results_dir', help='results directory',
                        default='results')
    args = parser.parse_args()

    features_path = 'features_{}_{}.csv'.format(args.outcome, args.imp_method)
    labels_path = 'labels_{}_{}.csv'.format(args.outcome, args.imp_method)
    print("Loading dataset...")
    X, Y = get_data(args.data_dir, features_path, labels_path)
    is_classf = Y.dtype == np.int8
    print("Successfully loaded dataset.")

    if args.fs_method:
        print("Performing feature selection using {}...".format(args.fs_method))
        X = feature_selection(X, Y, args.outcome, args.fs_method,
                              args.imp_method, args.data_dir, verbose=1)
        print("X dim: {}".format(X.shape))

    train(args.model, X, Y, is_classf, args.outcome, args.fs_method,
          args.imp_method, args.data_dir, args.results_dir, verbose=1)
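# [Hedged sketch] the default fs_method is 'ElasticNet'; one plausible shape
# for that branch of feature_selection is an embedded selector built on a
# fitted ElasticNet. Hypothetical: the real function also takes outcome,
# imp_method and data_dir (e.g. for caching), omitted here.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import ElasticNet

def elasticnet_select(X, Y, alpha=0.01, l1_ratio=0.5):
    selector = SelectFromModel(ElasticNet(alpha=alpha, l1_ratio=l1_ratio))
    return selector.fit_transform(X, Y)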
def create_dictionary(posfile, negfile, dicfile, use_stopwords, stopwordsfile,
                      use_chi2_select, local_fun, global_fun):
    stopwords = []
    if use_stopwords:
        stopwords = read_keywords_file(stopwordsfile)

    fwrite = open(dicfile, 'w')  # raises IOError if dicfile cannot be opened

    # Read the words from both input files.
    words = []
    words.extend(read_words(posfile))
    words.extend(read_words(negfile))

    # Count word frequencies.
    cnts = collections.Counter(words).most_common()
    print('total vocab:%d' % len(cnts))

    # Drop stopwords and low-frequency words.
    dictionary = {}
    idx = 0
    reverse_dic = []
    for cnt in cnts:
        if cnt[1] >= min_freq and cnt[0] not in stopwords:
            dictionary[cnt[0]] = idx
            reverse_dic.append(cnt[0])
            idx += 1
    print('total vocab after stop and min_req :%d' % idx)

    # Stop here if chi-square selection is disabled.
    if not use_chi2_select:
        return

    # Chi-square selection: extract the selected words and compute global_fun.
    posdata, poslabel, posidf = read_data_with_label(posfile, 1, dictionary,
                                                     local_fun, global_fun)
    negdata, neglabel, negidf = read_data_with_label(negfile, -1, dictionary,
                                                     local_fun, global_fun)
    # Number of documents.
    D = len(posdata) + len(negdata)
    # Inverse document frequency.
    idf = np.log(D / (posidf + negidf))

    datas = posdata
    labels = poslabel
    datas.extend(negdata)
    labels.extend(neglabel)

    global C
    global kernel
    global gamma
    dim_k, C, kernel, gamma, scores, pvals = feature_selection.feature_selection(
        datas, labels)

    # Merge chi2 scores, p-values, words, and idf, sorted by chi2 score.
    chi2info = zip(scores, pvals, reverse_dic, idf)
    chi2info = sorted(chi2info, key=itemgetter(0), reverse=True)
    vocab_size = dim_k
    print('total vocab after chi2:%d' % vocab_size)
    for i in range(vocab_size):
        fwrite.write('%lf\t%lf\t%s\t%lf\n' %
                     (chi2info[i][0], chi2info[i][1],
                      chi2info[i][2], chi2info[i][3]))
    fwrite.close()
    # fe_stats
    x_train, x_test = fe_stats(x_train, x_test, genes_features, cells_features)
    x_train.head()

    # group the drugs using kmeans
    if runty == 'traineval':
        x_train, x_test = fe_cluster(x_train, x_test, genes_features,
                                     cells_features,
                                     n_cluster_g=cfg_fe.n_clusters_g,
                                     n_cluster_c=cfg_fe.n_clusters_c,
                                     seed=cfg_fe.seed, runty=runty,
                                     path=save_path)
    elif runty == 'eval':
        x_train, x_test = fe_cluster(x_train, x_test, genes_features,
                                     cells_features,
                                     n_cluster_g=cfg_fe.n_clusters_g,
                                     n_cluster_c=cfg_fe.n_clusters_c,
                                     seed=cfg_fe.seed, runty=runty,
                                     path=load_path)

    # select features with VarianceThreshold
    x_train, x_test = feature_selection(
        x_train, x_test, feature_select=cfg_fe.feature_select,
        variancethreshold_for_FS=cfg_fe.variancethreshold_for_FS)

    # one-hot encoding
    x_train = onehot_encoding(x_train)
    x_test = onehot_encoding(x_test)

    feature_cols = [c for c in x_train.columns
                    if (str(c)[0:5] != 'kfold' and c not in
                        ['sig_id', 'drug_id', 'cp_type', 'cp_time', 'cp_dose'])]
    target_cols = [x for x in y_train.columns if x != 'sig_id']

    # label smoothing
    if cfg_fe.regularization_ls:
        y_train = ls_manual(y_train, ls_rate=cfg_fe.ls_rate)
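# [Hedged sketch] the feature_selection helper used above (and in the main()
# below) filters low-variance columns. A minimal reconstruction with sklearn's
# VarianceThreshold; the exact column handling of the real helper is assumed.
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

def feature_selection(x_train, x_test, feature_select=True,
                      variancethreshold_for_FS=0.8):
    if not feature_select:
        return x_train, x_test
    cols = [c for c in x_train.columns if str(c).startswith(('g-', 'c-'))]
    vt = VarianceThreshold(variancethreshold_for_FS)
    vt.fit(pd.concat([x_train[cols], x_test[cols]], axis=0))
    drop = [c for c, keep in zip(cols, vt.get_support()) if not keep]
    return x_train.drop(columns=drop), x_test.drop(columns=drop)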
from feature_engineering import feature_engineering
from feature_selection import feature_selection
from Models import linear_model, xgb_model
import argparse
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_dataset', help='address of train dataset')
    parser.add_argument('--test_dataset', help='address of test dataset')
    parser.add_argument('--model', help='model')
    args = parser.parse_args()

    train_dataset = args.train_dataset
    test_dataset = args.test_dataset
    model = args.model

    feature_engineering(train_dataset, test_dataset)
    feature_selection()

    if model == 'linear':
        linear_model()
    elif model == 'xgb':
        xgb_model()
    else:
        # No (or unknown) model specified: run both.
        linear_model()
        xgb_model()
def compute_regression_results(datasets, cities_dict, city, window, setup,
                               baseline, fs_method, fs_feature_num, features,
                               regressor, weights=None):
    """Compute regression results.

    args:
        datasets -- dict containing pandas dataframes for each city and window
        cities_dict -- dict with the country code as key (e.g. UK) and the
            list of cities in this country as value
        city -- the string name of the city
        window -- the number of aggregated timesteps (hours)
            (valid values: 6, 12, 24)
        setup -- the regression setup (valid values: 'cross city'
            (i.e. all to one), 'within city' (i.e. same city))
        baseline -- string indicating whether this experiment is a baseline,
            by defining the prediction metric (valid values: 'idw', 'mean',
            'NULL')
        fs_method -- the feature selection method:
            'Conly': features with higher correlation with PM2.5 in all
                cities (used in paper)
            'Sonly': features with lowest correlation variance with PM2.5 in
                all cities
            'S&C': combination of the previous methods
            'None': no feature selection
        fs_feature_num -- number of best features to keep after performing
            feature selection, or 'None'
        features -- list of features for one-step regression
            (e.g. ['#aqs', 'bow_10k_unigrams_normalized']) or list of lists
            of features for two-step regression
            (e.g. [['bow_10k_unigrams_normalized'], ['nearby_ground_truth_pm25']])
        regressor -- an sklearn regressor for one-step regression, or a list
            of two sklearn regressors in the two-step regression setup
        weights -- the inverse distance weight matrix from all cities (used
            only in cross-city setups) to weight each training sample, or None
    """
    print(city + ' -> ' + str(window) + ' in ' + setup +
          ' setup with features-> ' + str(features))
    for code in cities_dict:
        if city in cities_dict[code]:
            country_code = code
            country = cities_dict[code]

    weights_flag = weights is not None

    if setup == 'within city':
        dataset = datasets[city + '_' + str(window)]
        train, test = split_dataset_even_odd_months(dataset)
        if weights is not None:
            raise Exception('Need to use cross city setup when using weights')
    elif setup == 'cross city':
        train, test = create_all_vs_one_datasets(datasets, cities_dict, city,
                                                 window, weights=weights)

    # Check if it is a two-step regression. Feature selection is applied only
    # to the first feature (currently implemented to work only for a single
    # bag-of-words feature).
    if isinstance(features[0], list):
        two_step = True
        first_feature = features[0]
        feature_list = features[0] + features[1]
    else:
        two_step = False
        first_feature = features
        feature_list = features

    # Check if the experiment is a baseline error computation.
    if baseline != 'NULL':
        copy_test = test.copy().dropna()
        if baseline == 'idw':
            predictions = copy_test['idw_pm25']
        elif baseline == 'mean':
            copy_test['mean'] = test.pm25.mean()
            predictions = copy_test['mean']
        else:
            raise Exception('invalid baseline parameter')
        copy_test['pm25_cat'] = copy_test.pm25.apply(to_labels)
        return [country_code, city, window, setup, baseline,
                'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL',
                np.sqrt(sm.mean_squared_error(predictions, copy_test.pm25)),
                sm.mean_absolute_error(predictions, copy_test.pm25),
                precision_recall_fscore_support(copy_test.pm25_cat,
                                                predictions.apply(to_labels),
                                                labels=['good', 'bad'])[0][1],
                precision_recall_fscore_support(copy_test.pm25_cat,
                                                predictions.apply(to_labels),
                                                labels=['good', 'bad'])[1][1],
                precision_recall_fscore_support(copy_test.pm25_cat,
                                                predictions.apply(to_labels),
                                                labels=['good', 'bad'])[2][1]]

    # Derive the feature-type summary string.
    types = ['BOW', 'IDW_PM25', 'Twitter']
    type_mask = [False, False, False]
    representation = 'None'
    for i in feature_list:
        if 'bow' in i:
            type_mask[0] = True
            representation = 'uni_tf'  # only this is supported currently
        elif 'idw' in i:
            type_mask[1] = True
        else:
            type_mask[2] = True
    feature_types = '+'.join(list(np.array(types)[type_mask]))
    feature_details = '+'.join(feature_list)

    if two_step:
        feature_details = feature_details + '_2step'
        regressor_name = [get_regressor_name(regressor[0]),
                          get_regressor_name(regressor[1])]
    else:
        regressor_name = get_regressor_name(regressor)

    # Feature selection.
    if fs_method != 'NULL' and fs_feature_num != 'NULL':
        if len(first_feature) > 1:
            # Currently implemented to work only for one bag-of-words feature.
            raise Exception('You have to use only one bow feature for feature selection')
        _, mask = feature_selection(datasets, country, first_feature[0],
                                    'pm25', window, method=fs_method)
        mask = mask[:fs_feature_num]  # keep the fs_feature_num top features
    else:
        mask = None

    if two_step:
        # Compute the training predictions with k-fold cross-validation to
        # train the second-step regression model.
        train_dataset, bow_predictions = create_cv_bow_model(
            regressor[0], features[0], 'pm25', train, mask=mask,
            keep=features[0] + features[1], weights=weights_flag, cv=3)
        # Train and test a regressor with the first-step features.
        _, test_predictions = first_step_regression(
            train, test, regressor[0], features[0], mask=mask,
            keep=features[0] + features[1], weights=weights_flag)
        # Train the second-step regressor on the second-step features plus
        # the bow predictions.
        model = create_second_step_bow_model(regressor[1], train_dataset,
                                             features[1], bow_predictions,
                                             weights=weights_flag)
        # Use the model above to test the second-step features together with
        # the test predictions from the first-step features.
        rmse_res, mae_res, test_prediction, precision, recall, fscore = testing(
            model, features[1], 'pm25', test, mask=None,
            additional_features=test_predictions,
            keep=features[0] + features[1], classification=True)
        return [country_code, city, window, setup, baseline, weights_flag,
                fs_method, fs_feature_num, feature_types, feature_details,
                representation, regressor_name[0], regressor_name[1],
                rmse_res, mae_res, precision[1], recall[1], fscore[1]]
    else:
        # Training (mask enables feature selection).
        model, train_prediction = training(regressor, features, 'pm25', train,
                                           mask=mask, weights=weights_flag)
        # Testing.
        rmse_res, mae_res, test_prediction, precision, recall, fscore = testing(
            model, features, 'pm25', test, mask=mask, classification=True,
            verbose=False)
        return [country_code, city, window, setup, baseline, weights_flag,
                fs_method, fs_feature_num, feature_types, feature_details,
                representation, regressor_name, 'NULL', rmse_res, mae_res,
                precision[1], recall[1], fscore[1]]
teY = Y[test_idxs]

# parameter alpha of the gating function:
a = 10
# number of hidden units in the 1-layer NNs:
K1 = 10
K2 = 20
# pre-training/initialization of the parameters of the 1-layer NN:
w_h1, w_h2, w_o, b1, b2, bo, t1, accuracy1[t], F1[t] = NN_pretraining(
    trX, trY, teX, teY, K1, K2)
time1[t] = t1 / 10000

f_subset1 = np.arange(37)  # all the features
f = 37
z = 0
trX2, teX2 = feature_selection(trX, teX, f_subset1, f, z)

# firm cascade:
plambda1 = [0.43]
K = 3
v_h1, v_o, c1, co, t22, a22, f22 = second_stage_pretraining(
    trX, trY, teX, teY, trX2, teX2, K, w_h1, w_h2, w_o, b1, b2, bo, plambda1, a)

f_subset2 = [25, 29]  # median of (roll, pitch)
f = 5
z = 1
trX3, teX3 = feature_selection(trX, teX, f_subset2, f, z)

plambda2 = [0.26]
t3, accuracy3[t], F3[t], nnz3_firt, nnz3_second = cascade_three_stage(
def main():
    cfg_fe = Config_FeatureEngineer()
    seed_everything(seed_value=cfg_fe.seed)

    data_dir = '/kaggle/input/lish-moa/'
    save_path = './'
    load_path = '/kaggle/input/moatabnetmultimodekfold/'
    runty = 'eval'

    train = pd.read_csv(os.path.join(data_dir, 'train_features.csv'))
    targets_scored = pd.read_csv(
        os.path.join(data_dir, 'train_targets_scored.csv'))
    test = pd.read_csv(os.path.join(data_dir, 'test_features.csv'))
    train_drug = pd.read_csv(os.path.join(data_dir, 'train_drug.csv'))
    submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))

    x_train = train.copy()
    x_test = test.copy()
    y_train = targets_scored.copy()

    genes_features = [column for column in x_train.columns if 'g-' in column]
    cells_features = [column for column in x_train.columns if 'c-' in column]

    # scale the data, like RankGauss
    x_train, x_test = scaling(x_train, x_test, scale=cfg_fe.scale,
                              n_quantiles=cfg_fe.scale_n_quantiles,
                              seed=cfg_fe.seed)

    # decompose data, like PCA
    if runty == 'traineval':
        x_train, x_test = decompo_process(
            x_train, x_test, decompo=cfg_fe.decompo,
            genes_variance=cfg_fe.genes_variance,
            cells_variance=cfg_fe.cells_variance, seed=cfg_fe.seed,
            pca_drop_orig=cfg_fe.pca_drop_orig, runty=runty, path=save_path)
    elif runty == 'eval':
        x_train, x_test = decompo_process(
            x_train, x_test, decompo=cfg_fe.decompo,
            genes_variance=cfg_fe.genes_variance,
            cells_variance=cfg_fe.cells_variance, seed=cfg_fe.seed,
            pca_drop_orig=cfg_fe.pca_drop_orig, runty=runty, path=load_path)

    # select features with VarianceThreshold
    x_train, x_test = feature_selection(
        x_train, x_test, feature_select=cfg_fe.feature_select,
        variancethreshold_for_FS=cfg_fe.variancethreshold_for_FS)

    # fe_stats
    x_train, x_test = fe_stats(x_train, x_test, genes_features, cells_features)

    # group the drugs using kmeans
    if runty == 'traineval':
        x_train, x_test = fe_cluster(x_train, x_test, genes_features,
                                     cells_features,
                                     n_cluster_g=cfg_fe.n_clusters_g,
                                     n_cluster_c=cfg_fe.n_clusters_c,
                                     seed=cfg_fe.seed, runty=runty,
                                     path=save_path)
    elif runty == 'eval':
        x_train, x_test = fe_cluster(x_train, x_test, genes_features,
                                     cells_features,
                                     n_cluster_g=cfg_fe.n_clusters_g,
                                     n_cluster_c=cfg_fe.n_clusters_c,
                                     seed=cfg_fe.seed, runty=runty,
                                     path=load_path)

    # one-hot encoding
    x_train = onehot_encoding(x_train)
    x_test = onehot_encoding(x_test)

    feature_cols = [c for c in x_train.columns
                    if (str(c)[0:5] != 'kfold' and c not in
                        ['sig_id', 'drug_id', 'cp_type', 'cp_time', 'cp_dose'])]
    target_cols = [x for x in y_train.columns if x != 'sig_id']

    # label smoothing
    if cfg_fe.regularization_ls:
        y_train = ls_manual(y_train, ls_rate=cfg_fe.ls_rate)

    # merge drug_id and labels
    x_train = x_train.merge(y_train, on='sig_id')
    x_train = x_train.merge(train_drug, on='sig_id')

    # remove sig_id
    # x_train, x_test, y_train = remove_ctl(x_train, x_test, y_train)

    # make CVs
    target_cols = [x for x in targets_scored.columns if x != 'sig_id']
    x_train = make_cv_folds(x_train, cfg_fe.seeds, cfg_fe.nfolds,
                            cfg_fe.drug_thresh, target_cols)

    begin_time = datetime.datetime.now()

    if runty == 'traineval':
        test_preds_all = train_tabnet(x_train, y_train, x_test, submission,
                                      feature_cols, target_cols, cfg_fe.seeds,
                                      cfg_fe.nfolds, save_path)
        y_train = targets_scored[
            train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
        test_pred_final = pred_tabnet(x_train, y_train, x_test, submission,
                                      feature_cols, target_cols, cfg_fe.seeds,
                                      cfg_fe.nfolds, load_path='./',
                                      stacking=False)
    elif runty == 'eval':
        y_train = targets_scored[
            train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
        test_pred_final = pred_tabnet(x_train, y_train, x_test, submission,
                                      feature_cols, target_cols, cfg_fe.seeds,
                                      cfg_fe.nfolds, load_path,
                                      stacking=False)

    time_diff = datetime.datetime.now() - begin_time
    print(f'Total time is {time_diff}')

    # make submission
    all_feat = [col for col in submission.columns if col not in ["sig_id"]]
    # To obtain the same length for test_preds_all and submission:
    # sig_id = test[test["cp_type"] != "ctl_vehicle"].sig_id.reset_index(drop=True)
    sig_id = test.sig_id
    tmp = pd.DataFrame(test_pred_final, columns=all_feat)
    tmp["sig_id"] = sig_id
    submission = pd.merge(test[["sig_id"]], tmp, on="sig_id", how="left")
    submission.fillna(0, inplace=True)
    submission[test["cp_type"] == "ctl_vehicle"] = 0.
    submission.to_csv("submission_tabnet.csv", index=None)