def variance(self, inputs, epsilon=1e-4, data_format='channels_last'):
    assert self.use_variance
    result = FeatureEngineering.variance(
        inputs,
        variance_mode=self.variance_mode,
        relative_variance=self.relative_variance,
        compress_to_one_channel=self.compress_to_one_channel,
        epsilon=epsilon,
        data_format=data_format)
    return result
def featureFitWide(self, df, desc_dict, version, split_type=None):
    logging.debug("inside featureFitWide Module of Data Dictionary class.")
    from FeatureEngineering import FeatureEngineering
    if split_type == "dev":
        self.feature[version] = FeatureEngineering(desc_dict, version)
        if self.basic_dict['data_struct'] == 'widef':
            merged_df = self.feature[version].createFeaturesWide(
                df, self.basic_dict['seqvaronly'],
                self.basic_dict['seqvarstart'],
                self.basic_dict['seqvarend'])
        self.feature[version].saveFeatureIterationsDev(merged_df)
        return merged_df
def featureFit(self, df, desc_dict, version, cohort_period_type=None,
               feature_type=['c', 's', 'v'], centrality_period=None,
               centrality_order=None, n_month=12, split_type=None):
    logging.debug("inside featureFit Module of Data Dictionary class.")
    from FeatureEngineering import FeatureEngineering
    if split_type == "dev":
        self.feature[version] = FeatureEngineering(desc_dict, version)
        if self.basic_dict['data_struct'] == 'widef':
            merged_df = self.feature[version].createFeaturesWide(
                df, self.basic_dict['seqvaronly'],
                self.basic_dict['seqvarstart'],
                self.basic_dict['seqvarend'])
        elif self.basic_dict['data_struct'] == 'longf':
            if len(feature_type) > 0:
                self.feature[version].cohort_period_type = cohort_period_type
                self.feature[version].feature_type = feature_type
                self.feature[version].centrality_period = centrality_period
                self.feature[version].centrality_order = centrality_order
                self.feature[version].n_month = n_month
                merged_df = self.Features(df, desc_dict, version,
                                          cohort_period_type, feature_type,
                                          centrality_period, centrality_order,
                                          n_month, split_type)
            else:
                merged_df = df.loc[df[self.basic_dict['performance'][0]] ==
                                   self.basic_dict['cohort_df']['cohort']['dev']]
        self.feature[version].saveFeatureIterationsDev(merged_df)
        return merged_df
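# Hedged usage sketch (names below are assumptions, not from the original
# source): `dd` stands for an instance of the data-dictionary class these
# methods belong to, per the logging messages above. Both methods build and
# persist feature iterations only for the development split
# (split_type == "dev"):
# merged_dev = dd.featureFit(df, desc_dict, version='v1',
#                            cohort_period_type='month', split_type='dev')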
    del traini
    del testi
else:
    train = pd.read_csv('train_transaction.csv')
    traini = pd.read_csv('train_identity.csv')
    train = pd.merge(train, traini, on='TransactionID', how='left')
    test = train.sample(frac=0.7, random_state=99)
    train = train[~train.index.isin(test.index)]
    del traini
print("Done!")
print("Feature engineering...")
train = FE.reduce_mem_usage(train)
test = FE.reduce_mem_usage(test)
FE.make_ymdhd_feature(train)
FE.make_ymdhd_feature(test)
if use_sampling == 0:
    train, _ = granularity_to_use[granularity_key](train)
elif use_sampling == 1:
    train_copy = train.copy()
    train_month, _ = DS.per_month_down_sampling(train)
    train = train_copy.copy()
    train_dow, _ = DS.per_week_down_sampling(train)
    train = train_copy.copy()
    train_day, _ = DS.per_day_down_sampling(train)
    train = train_copy.copy()
    if verbal == True:
        print('The total feature number is ' + str(sum(index == True)))
        print('The selected feature name is ' + str(getSelectedName))
    if not returnCoef:
        return (X_train, X_test)
    else:
        return (X_train, X_test, coef)


if __name__ == '__main__':
    from FeatureEngineering import FeatureEngineering
    ROOT = '/Users/mac/Desktop/ML_Quant/data'
    rawDf = pd.read_pickle(os.path.join(ROOT, 'cleanedFactor.pkl'))
    getFeatures = FeatureEngineering(ROOT)
    features = getFeatures.combine_feature()
    rawDf = pd.merge(features, rawDf, on='date')
    # rawDf = rawDf.fillna(method='ffill')
    rawXs, rawYs = rawDf.iloc[:, :-4], rawDf.iloc[:, -1].astype(bool)

    def split_train_test_data(X, y, test_size):
        num_train = int(len(X) - len(X) * test_size)
        X_train = X.iloc[:num_train, :]
        X_test = X.iloc[num_train:, :]
        y_train = y[:num_train]
        y_test = y[num_train:]
        return X_train, y_train, X_test, y_test

    X_train, y_train, X_test, y_test = split_train_test_data(rawXs, rawYs,
from FeatureEngineering import FeatureEngineering

ROOT = '../'
DATA_PATH = os.path.join(ROOT, '00 data')
CLEANED_FACTOR_PATH = os.path.join(ROOT, '02 data process')
rawDf = pd.read_pickle(
    os.path.join(CLEANED_FACTOR_PATH, 'cleanedFactor.pkl'))
INDEX_FACTOR_PATH = os.path.join(ROOT, '02 data process')
indexDf = pd.read_pickle(
    os.path.join(INDEX_FACTOR_PATH, 'newIndexFactor.pkl'))
rawDf = pd.merge(indexDf, rawDf, on='date', how='right')
# rawDf = pd.concat([indexDf, rawDf], axis=1)

# %%
# sys.path.append(os.path.join(ROOT, '04 select feature and build model'))
from FeatureEngineering import FeatureEngineering

getFeatures = FeatureEngineering(DATA_PATH)
features = getFeatures.combine_feature()
rawDf = pd.merge(features, rawDf, on='date', how='right')
# rawDf = rawDf.iloc[58:, :]
rawXs, rawYs = rawDf.iloc[:, :-4], rawDf.iloc[:, -1]


def split_train_test_data(X, y, test_size):
    num_train = int(len(X) - len(X) * test_size)
    X_train = X.iloc[:num_train, :]
    X_test = X.iloc[num_train:, :]
    y_train = y[:num_train]
    y_test = y[num_train:]
    return X_train, y_train, X_test, y_test


X_train, y_train, X_test, y_test = split_train_test_data(rawXs, rawYs,
         label='ROC curve (area = %0.2f)' % roc_auc[2])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('KNN 5000 Samples')
plt.legend(loc="lower right")
plt.show()


if __name__ == "__main__":
    fullpath = lambda path: os.path.join(find_project_dir(), path)

    # Feature Engineering part
    fe = FeatureEngineering()
    wv = WordVectorizer()
    x_ser = fe.read_x_train_features().head(5000)
    y_mat = fe.read_y_train_features()
    # x_mat = fe.calc_count_matrix(x_ser)
    # x_mat = fe.calc_tfid_matrix(x_ser)
    x_mat = wv.transform(x_ser)
    X_train, X_test, y_train, y_test = train_test_split(x_mat, y_mat,
                                                        test_size=0.2,
                                                        random_state=1)

    # PCA Stuff below
    # pca = decomposition.PCA(n_components=50)
@author: zhang_000
"""
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
# sklearn.cross_validation was removed in newer scikit-learn releases;
# cross_val_score and train_test_split now live in sklearn.model_selection.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from FeatureEngineering import FeatureEngineering
from operator import itemgetter
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

rawDataFile = './ds_challenge_v2_1_data (1) (1).csv'
FE = FeatureEngineering(rawDataFile)
rawData = FE.loadRawData()
features = FE.generateFeatures()
labels = FE.generateLabels()


# test function
def performTest(X_test, Y_test, classifier):
    predictions = classifier.predict(X_test)
    FA = 0
    Accu = 0
    MD = 0
    # .as_matrix() was removed in newer pandas; .to_numpy() is the replacement.
    Y_test = list(Y_test.to_numpy())
    for i in range(len(predictions)):
        if Y_test[i] == predictions[i]:
            Accu += 1
log.info(Constants.INITIAL_MSG)

# Start calculating execution time
start_time = time.time()
log.info(Constants.START_MSG)

# Data Preprocessing Phase
log.info(Constants.DATA_PREPROCESSING_MSG)
dp = DataPreprocessing()
time_series = dp.preprocessing()

# Feature Engineering Phase
log.info(Constants.FEATURE_ENGINEERING_MSG)
fe = FeatureEngineering()
new_time_series = fe.execute_feature_engineering(time_series)
# new_time_series = dp.delete_column(time_series, ['Confirmed Cases', 'Deaths', 'Recovered Cases', 'Active Cases'])

# Truncate zero values from the time series
# new_time_series = dp.truncate_time_series(new_time_series, '26/02/2020')

# Preliminary Analysis: Stationarity Check
pa = PreliminaryAnalysis()
pa.execute_preliminary_analysis(new_time_series)

# Data Visualization Phase
log.info(Constants.DATA_VISUALIZATION_MSG)
def sweep(loss, csv=True, cln=["Clean "], ngrams=[1, 4, 5, 6, 7],
          min_df=[0.00001], max_df=[0.5, 0.6, 0.7], K=[]):
    fullpath = lambda path: os.path.join(find_project_dir(), path)
    nsamp = [1000, 500, 200, 200, 200, 100, 100, 100, 50, 30, 20, 10, 10]
    fe = FeatureEngineering()
    x_ser = fe.read_x_train_features()
    x_ser_clean = fe.read_clean_x_train_features()
    y_mat = fe.read_y_train_features()
    x_ser_test = fe.read_clean_x_test_features()

    def get_maker(csv):
        desc_print = "{}TF-IDF Data; min_ngrams:{}, max_ngrams:{}, min_df: {}, max_df: {}"
        desc_csv = "{}TF-IDF Data, {}, {}, {}, {}, {}"
        desc = desc_csv if csv else desc_print

        def make_tup(x):
            x = list(x)
            min_ngrams, max_ngrams = x[1]
            x[1] = min_ngrams
            x.insert(2, max_ngrams)
            return (x_ser_clean, desc.format(*x), *x)

        return make_tup

    if csv:
        print("data, min_ngrams, max_ngrams, min_df, max_df, model, predictor, k, accuracy, precision, recall, f1, boot_acc")

    # ngrams = itertools.combinations(ngrams, 2)
    ngrams = [(1, i) for i in ngrams]
    params = itertools.product(cln, ngrams, min_df, max_df,
                               range(len(K) - 1, -1, -1))
    tups = list(map(get_maker(csv), params))
    n = len(tups)
    start = time.time()
    i = 1
    for tup in tups:
        x, dat, cln, min_ngrams, max_ngrams, min_df, max_df, j = tup
        k = K[j]
        n_samp = nsamp[j]
        x_ser = x_ser.head(k)
        x_ser_clean = x_ser_clean.head(k)
        y_mat = y_mat[:k, :]
        print("Starting {}...".format(dat), file=sys.stderr, flush=True, end='')
        tfidf = TfidfVectorizer(analyzer='word',
                                ngram_range=(min_ngrams, max_ngrams),
                                min_df=min_df, max_df=max_df, norm='l2')
        # count = CountVectorizer(analyzer='word', ngram_range=(min_ngrams, max_ngrams), min_df=min_df, max_df=max_df)
        try:
            x_mat = tfidf.fit_transform(x_ser_clean)
            # count.fit(x_ser_clean)
            # x_mat_train = count.transform(x_ser_clean)
            # x_mat_test = tfidf.transform(x_ser_test)
        except ValueError as e:
            continue
        if not csv:
            print("\n{}:".format(dat))
        models = [WCNB(preproc=None)]
        bootstrap = Bootstrap(x_mat, y_mat, models, num_samples=n_samp)
        bootstrap.run()

        def prepend(x):
            typ = 'M'
            return [dat, x.name, typ]

        if csv:
            bootstrap.comma_separated_metrics(prepend=prepend)
        else:
            bootstrap.print_summary()
        finish = time.time()
        print("Done tup {}/{} in {}".format(i, n, finish - start),
              file=sys.stderr, flush=True)
        i += 1
        start = finish
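# Example invocation, taken from the main() driver below: hold the n-gram
# ceiling and max_df fixed and sweep over the training-set sizes in K.
# sweep(loss, csv=True,
#       K=[50, 100, 200, 400, 800, 1600, 3200, 6400, 12800, 25600,
#          51200, 102400, 165000],
#       ngrams=[5], max_df=[0.5])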
def main():
    fullpath = lambda path: os.path.join(find_project_dir(), path)
    loss = lambda y_hat, y: np.vectorize(int)(y_hat == y)
    if False:
        sweep(loss, csv=True,
              K=[50, 100, 200, 400, 800, 1600, 3200, 6400, 12800, 25600,
                 51200, 102400, 165000],
              ngrams=[5], max_df=[0.5])
    else:
        fe = FeatureEngineering()
        # x_ser = fe.read_x_train_features()
        x_ser_clean = fe.read_clean_x_train_features()
        y_mat = fe.read_y_train_features()
        # k = 5000
        # k_tfidf = 500
        # nsamp = 10
        # x_ser = x_ser.head(k)
        # x_ser_clean = x_ser_clean.head(k)
        # y_mat = y_mat[:k,:]
        # y_mat_tfidf = y_mat[:k_tfidf,:]
        # x_mat = fe.calc_count_matrix(x_ser)
        # x_mat_clean = fe.calc_count_matrix(x_ser_clean)
        # x_tfidf_clean = fe.calc_tfid_matrix(x_ser_clean, max_ngrams=3, min_df=0.0001)
        # count = CountVectorizer()
        # tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0.0001)
        # x_mat_clean = fe.calc_count_matrix(x_ser_clean)
        print("done preproc A")

        """ Run bootstrap. """
        # bootstrap = Bootstrap(x_ser_clean.head(500), y_mat_tfidf, [WCNB(preproc=tfidf)], num_samples=nsamp)
        # bootstrap.run()
        # bootstrap.print_summary()
        # bootstrap.models[0].save(fullpath('models/wcnb3'))
        # bootstrap = Bootstrap(x_ser_clean, y_mat, [NaiveBayes()], num_samples=nsamp)
        # bootstrap.run()
        # bootstrap.print_summary()
        # bootstrap.models[0].save(fullpath('models/nb'))

        """ Run fe.cv.transform() or fe.tf.transform() to get features after
        learning a model. Right now you have to run fe.calc_XXX_matrix on the
        data that was used to train the model first, then fe.XX.transform(x),
        where x is a Pandas series. """
        preproc = TfidfVectorizer(analyzer='word', ngram_range=(1, 5),
                                  min_df=0.00001, max_df=0.5, norm='l2')
        # count = CountVectorizer(analyzer='word', ngram_range=(1, 5), min_df=0.00001, max_df=0.5)
        k = "1.5.-5.5"
        x_mat = preproc.fit_transform(x_ser_clean)
        # print(type(x_mat))
        # pd.DataFrame(x_mat.toarray()).to_csv(fullpath("models/xmat.csv"), index_label=False)
        # count.fit(x_ser_clean)
        print("done preproc B")
        # model = WCNB()
        # model.fit(x_mat, y_mat)
        # model.save(fullpath('models/wcnb{}'.format(k)))
        model = WCNB.load(fullpath('models/wcnb{}'.format(k)))
        # model.fit(x_mat, y_mat)
        # x_test = preproc.transform(fe.read_clean_x_test_features())
        y_hat = model.predict(x_mat)
        # pd.DataFrame(y_hat).to_csv(fullpath("models/wcnb{}_output.csv".format(k)), header=['category'], index_label='id')
        # .as_matrix() was removed in newer pandas; .to_numpy() is the replacement.
        y = pd.get_dummies(pd.DataFrame(y_mat)).to_numpy()
        y_hat = pd.DataFrame(y_hat)
        y_hat = pd.get_dummies(y_hat).to_numpy()
        classes = y_hat.shape[1]
        plot_roc_curve(classes, y_hat, y)
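# Minimal, hedged sketch of the "fit on training text, then transform new text"
# workflow the docstring inside main() describes. The fe.calc_XXX_matrix /
# fe.XX.transform helpers are project-specific; the same pattern is shown here
# with scikit-learn's TfidfVectorizer directly (toy data, hypothetical names):
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

train_text = pd.Series(["first training document", "second training document"])
new_text = pd.Series(["an unseen document"])

vec = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), min_df=1, norm='l2')
x_train_mat = vec.fit_transform(train_text)  # learn vocabulary and idf weights on training data
x_new_mat = vec.transform(new_text)          # reuse the fitted vocabulary for new data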
    del traini
    del testi
else:
    train = pd.read_csv('train_transaction.csv')
    traini = pd.read_csv('train_identity.csv')
    train = pd.merge(train, traini, on='TransactionID', how='left')
    test = train.sample(frac=0.7, random_state=99)
    train = train[~train.index.isin(test.index)]
    del traini
print("Done!")
print("Feature engineering...")
train = FE.reduce_mem_usage(train)
test = FE.reduce_mem_usage(test)
FE.make_ymdhd_feature(train)
FE.make_ymdhd_feature(test)
train = train.sort_values('day')
test = test.sort_values('day')


def ecdf(data):
    """Compute ECDF for a one-dimensional array of measurements."""
    # Number of data points: n
    n = len(data)
    # x-data for the ECDF: x
    x = np.sort(data)
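    # The snippet is truncated here. The standard ECDF continuation (an
    # assumption, not recovered from the original source) would compute evenly
    # spaced y-values and return both coordinates:
    # y = np.arange(1, n + 1) / n  # y-data for the ECDF: y
    # return x, y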