def variance(self, inputs, epsilon=1e-4, data_format='channels_last'):
    assert self.use_variance
    result = FeatureEngineering.variance(
        inputs,
        variance_mode=self.variance_mode,
        relative_variance=self.relative_variance,
        compress_to_one_channel=self.compress_to_one_channel,
        epsilon=epsilon,
        data_format=data_format)
    return result
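The implementation of FeatureEngineering.variance is not shown in this listing, and the exact semantics of variance_mode are project-specific. As a hedged NumPy sketch of what a per-channel variance feature with relative normalisation and channel compression might compute (names, shapes and axis choices are assumptions, not the library's actual code):

import numpy as np

def variance_feature(inputs, relative_variance=False, compress_to_one_channel=True,
                     epsilon=1e-4, data_format='channels_last'):
    # inputs: batch of images, shape (N, H, W, C) or (N, C, H, W).
    spatial_axes = (1, 2) if data_format == 'channels_last' else (2, 3)
    mean = inputs.mean(axis=spatial_axes, keepdims=True)
    var = ((inputs - mean) ** 2).mean(axis=spatial_axes, keepdims=True)
    if relative_variance:
        # Normalise by the squared mean; epsilon guards against division by zero.
        var = var / (mean ** 2 + epsilon)
    if compress_to_one_channel:
        channel_axis = -1 if data_format == 'channels_last' else 1
        var = var.mean(axis=channel_axis, keepdims=True)
    return var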
Example #2
    def featureFitWide(self, df, desc_dict, version, split_type=None):
        logging.debug("inside featureFitWide Module of Data Dictionary class.")
        from FeatureEngineering import FeatureEngineering
        if split_type == "dev":
            self.feature[version] = FeatureEngineering(desc_dict, version)
            if self.basic_dict['data_struct'] == 'widef':
                merged_df = self.feature[version].createFeaturesWide(
                    df, self.basic_dict['seqvaronly'],
                    self.basic_dict['seqvarstart'],
                    self.basic_dict['seqvarend'])
            self.feature[version].saveFeatureIterationsDev(merged_df)
            return merged_df
Example #3
    def featureFit(self,
                   df,
                   desc_dict,
                   version,
                   cohort_period_type=None,
                   feature_type=['c', 's', 'v'],
                   centrality_period=None,
                   centrality_order=None,
                   n_month=12,
                   split_type=None):
        logging.debug("inside featureFit Module of Data Dictionary class.")
        from FeatureEngineering import FeatureEngineering

        if split_type == "dev":
            self.feature[version] = FeatureEngineering(desc_dict, version)
            if self.basic_dict['data_struct'] == 'widef':
                merged_df = self.feature[version].createFeaturesWide(
                    df, self.basic_dict['seqvaronly'],
                    self.basic_dict['seqvarstart'],
                    self.basic_dict['seqvarend'])
            elif self.basic_dict['data_struct'] == 'longf':
                if len(feature_type) > 0:
                    self.feature[
                        version].cohort_period_type = cohort_period_type
                    self.feature[version].feature_type = feature_type
                    self.feature[version].centrality_period = centrality_period
                    self.feature[version].centrality_order = centrality_order
                    self.feature[version].n_month = n_month
                    merged_df = self.Features(df, desc_dict, version,
                                              cohort_period_type, feature_type,
                                              centrality_period,
                                              centrality_order, n_month,
                                              split_type)
                else:
                    merged_df = df.loc[df[self.basic_dict['performance']
                                          [0]] == self.basic_dict['cohort_df']
                                       ['cohort']['dev']]
            self.feature[version].saveFeatureIterationsDev(merged_df)
            return merged_df
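createFeaturesWide and the long-format branch are not reproduced in this listing. For context, a minimal pandas sketch of the wide/long distinction that the data_struct flag refers to (column names here are hypothetical, not the project's schema):

import pandas as pd

# Hypothetical long-format panel: one row per (id, period) pair.
long_df = pd.DataFrame({
    'id': [1, 1, 2, 2],
    'period': [1, 2, 1, 2],
    'balance': [100, 120, 80, 75],
})

# Wide format: one row per id, one column per period of the sequential
# variable, which is what a 'widef' data_struct with seqvarstart/seqvarend implies.
wide_df = long_df.pivot(index='id', columns='period', values='balance')
wide_df.columns = [f'balance_{p}' for p in wide_df.columns]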
Example #4
    del traini
    del testi

else:

    train = pd.read_csv('train_transaction.csv')
    traini = pd.read_csv('train_identity.csv')
    train = pd.merge(train, traini, on='TransactionID', how='left')
    test = train.sample(frac=0.7, random_state=99)
    train = train[~train.index.isin(test.index)]
    del traini

print("Done!")

print("Feature engineering...")
train = FE.reduce_mem_usage(train)
test = FE.reduce_mem_usage(test)

FE.make_ymdhd_feature(train)
FE.make_ymdhd_feature(test)

if use_sampling == 0:
    train, _ = granularity_to_use[granularity_key](train)
elif use_sampling == 1:
    train_copy = train.copy()
    train_month, _ = DS.per_month_down_sampling(train)
    train = train_copy.copy()
    train_dow, _ = DS.per_week_down_sampling(train)
    train = train_copy.copy()
    train_day, _ = DS.per_day_down_sampling(train)
    train = train_copy.copy()
    if verbal == True:
        print('The total feature number is ' + str(sum(index == True)))
        print('The selected feature name is ' + str(getSelectedName))

    if not returnCoef:
        return (X_train, X_test)
    else:
        return (X_train, X_test, coef)


if __name__ == '__main__':
    from FeatureEngineering import FeatureEngineering
    ROOT = '/Users/mac/Desktop/ML_Quant/data'
    rawDf = pd.read_pickle(os.path.join(ROOT, 'cleanedFactor.pkl'))
    getFeatures = FeatureEngineering(ROOT)
    features = getFeatures.combine_feature()
    rawDf = pd.merge(features, rawDf, on='date')
    # rawDf = rawDf.fillna(method = 'ffill')
    rawXs, rawYs = rawDf.iloc[:, :-4], rawDf.iloc[:, -1].astype(bool)

    def split_train_test_data(X, y, test_size):
        num_train = int(len(X) - len(X) * test_size)
        X_train = X.iloc[:num_train, :]
        X_test = X.iloc[num_train:, :]
        y_train = y[:num_train]
        y_test = y[num_train:]
        return X_train, y_train, X_test, y_test

    X_train, y_train, X_test, y_test = split_train_test_data(rawXs,
                                                             rawYs,
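Note that split_train_test_data splits chronologically rather than shuffling, which is the appropriate choice for time-ordered factor data. The same split can be expressed with scikit-learn by disabling shuffling (a sketch, not part of the original script):

from sklearn.model_selection import train_test_split

# Equivalent chronological split: shuffle=False keeps the rows in time order.
X_train, X_test, y_train, y_test = train_test_split(
    rawXs, rawYs, test_size=0.2, shuffle=False)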
Example #6
    from FeatureEngineering import FeatureEngineering
    ROOT = '../'
    DATA_PATH = os.path.join(ROOT, '00 data')
    CLEANED_FACTOR_PATH = os.path.join(ROOT, '02 data process')
    rawDf = pd.read_pickle(
        os.path.join(CLEANED_FACTOR_PATH, 'cleanedFactor.pkl'))
    INDEX_FACTOR_PATH = os.path.join(ROOT, '02 data process')
    indexDf = pd.read_pickle(
        os.path.join(INDEX_FACTOR_PATH, 'newIndexFactor.pkl'))
    rawDf = pd.merge(indexDf, rawDf, on='date', how='right')
    # rawDf = pd.concat([indexDf,rawDf],axis = 1)

    #%%
    # sys.path.append(os.path.join(ROOT, '04 select feature and build model'))
    from FeatureEngineering import FeatureEngineering
    getFeatures = FeatureEngineering(DATA_PATH)
    features = getFeatures.combine_feature()
    rawDf = pd.merge(features, rawDf, on='date', how='right')
    # rawDf = rawDf.iloc[58:,:]
    rawXs, rawYs = rawDf.iloc[:, :-4], rawDf.iloc[:, -1]

    def split_train_test_data(X, y, test_size):
        num_train = int(len(X) - len(X) * test_size)
        X_train = X.iloc[:num_train, :]
        X_test = X.iloc[num_train:, :]
        y_train = y[:num_train]
        y_test = y[num_train:]
        return X_train, y_train, X_test, y_test

    X_train, y_train, X_test, y_test = split_train_test_data(rawXs,
                                                             rawYs,
Example #7
                 label='ROC curve (area = %0.2f)' % roc_auc[2])
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('KNN 5000 Samples')
        plt.legend(loc="lower right")
        plt.show()


if __name__ == "__main__":
    fullpath = lambda path: os.path.join(find_project_dir(), path)

    # Feature Engineering part
    fe = FeatureEngineering()
    wv = WordVectorizer()

    x_ser = fe.read_x_train_features().head(5000)
    y_mat = fe.read_y_train_features()
    # x_mat = fe.calc_count_matrix(x_ser)
    # x_mat = fe.calc_tfid_matrix(x_ser)
    x_mat = wv.transform(x_ser)

    X_train, X_test, y_train, y_test = train_test_split(x_mat,
                                                        y_mat,
                                                        test_size=0.2,
                                                        random_state=1)

    # PCA Stuff below
    # pca = decomposition.PCA(n_components=50)
Example #8
@author: zhang_000
"""
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from FeatureEngineering import FeatureEngineering
from operator import itemgetter
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

rawDataFile = './ds_challenge_v2_1_data (1) (1).csv'
FE = FeatureEngineering(rawDataFile)
rawData = FE.loadRawData()
features = FE.generateFeatures()
labels = FE.generateLabels()


#test function
def performTest(X_test, Y_test, classifier):
    predictions = classifier.predict(X_test)
    FA = 0
    Accu = 0
    MD = 0
    Y_test = list(Y_test.to_numpy())
    for i in range(len(predictions)):
        if Y_test[i] == predictions[i]:
            Accu += 1
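The listing cuts off before the false-alarm and missed-detection counts are updated. Assuming FA and MD stand for false alarms and missed detections on a binary label (an assumption, not stated in the source), the same counts can be read off a scikit-learn confusion matrix:

from sklearn.metrics import accuracy_score, confusion_matrix

def performance_summary(y_true, y_pred):
    # For a binary problem, ravel() yields tn, fp, fn, tp in that order.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return {'accuracy': accuracy_score(y_true, y_pred),
            'false_alarms': fp,
            'missed_detections': fn}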
Example #9
log.info(Constants.INITIAL_MSG)

# Start calculating execution time
start_time = time.time()

log.info(Constants.START_MSG)

# Data Preprocessing Phase
log.info(Constants.DATA_PREPROCESSING_MSG)
dp = DataPreprocessing()
time_series = dp.preprocessing()

# Feature Engineering Phase
log.info(Constants.FEATURE_ENGINEERING_MSG)
fe = FeatureEngineering()

new_time_series = fe.execute_feature_engineering(time_series)

# new_time_series = dp.delete_column(time_series, ['Confirmed Cases', 'Deaths', 'Recovered Cases', 'Active Cases'])

# Truncate zero values from the time series
# new_time_series = dp.truncate_time_series(new_time_series, '26/02/2020')

# Preliminary Analysis: Stationarity Check
pa = PreliminaryAnalysis()

pa.execute_preliminary_analysis(new_time_series)

# Data Visualization Phase
log.info(Constants.DATA_VISUALIZATION_MSG)
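PreliminaryAnalysis is not shown here. A stationarity check of the kind referenced above is commonly an augmented Dickey-Fuller test; a minimal sketch with statsmodels (an assumption about the method, not the project's actual implementation):

from statsmodels.tsa.stattools import adfuller

def check_stationarity(series, alpha=0.05):
    # Augmented Dickey-Fuller test: a small p-value rejects the unit-root
    # hypothesis, i.e. the series looks stationary.
    adf_stat, p_value = adfuller(series.dropna())[:2]
    return {'adf_statistic': adf_stat, 'p_value': p_value,
            'stationary': p_value < alpha}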
Example #10
def sweep(loss,
          csv=True,
          cln=["Clean "],
          ngrams=[1, 4, 5, 6, 7],
          min_df=[0.00001],
          max_df=[0.5, 0.6, 0.7],
          K=[]):

    fullpath = lambda path: os.path.join(find_project_dir(), path)
    nsamp = [1000, 500, 200, 200, 200, 100, 100, 100, 50, 30, 20, 10, 10]
    fe = FeatureEngineering()
    x_ser = fe.read_x_train_features()
    x_ser_clean = fe.read_clean_x_train_features()
    y_mat = fe.read_y_train_features()
    x_ser_test = fe.read_clean_x_test_features()

    def get_maker(csv):
        desc_print = "{}TF-IDF Data; min_ngrams:{}, max_ngrams:{}, min_df: {}, max_df: {}"
        desc_csv = "{}TF-IDF Data, {}, {}, {}, {}, {}"
        desc = desc_csv if csv else desc_print

        def make_tup(x):
            x = list(x)
            min_ngrams, max_ngrams = x[1]
            x[1] = min_ngrams
            x.insert(2, max_ngrams)
            return (x_ser_clean, desc.format(*x), *x)

        return make_tup

    if csv:
        print(
            "data, min_ngrams, max_ngrams, min_df, max_df, model, predictor, k, accuracy, precision, recall, f1, boot_acc"
        )

    # ngrams = itertools.combinations(ngrams, 2)
    ngrams = [(1, i) for i in ngrams]
    params = itertools.product(cln, ngrams, min_df, max_df,
                               range(len(K) - 1, -1, -1))

    tups = list(map(get_maker(csv), params))
    n = len(tups)
    start = time.time()
    i = 1
    for tup in tups:
        x, dat, cln, min_ngrams, max_ngrams, min_df, max_df, j = tup

        k = K[j]
        n_samp = nsamp[j]
        x_ser = x_ser.head(k)
        x_ser_clean = x_ser_clean.head(k)
        y_mat = y_mat[:k, :]
        print("Starting {}...".format(dat),
              file=sys.stderr,
              flush=True,
              end='')

        tfidf = TfidfVectorizer(analyzer='word',
                                ngram_range=(min_ngrams, max_ngrams),
                                min_df=min_df,
                                max_df=max_df,
                                norm='l2')
        # count = CountVectorizer(analyzer='word', ngram_range=(min_ngrams, max_ngrams), min_df=min_df, max_df=max_df)
        try:
            x_mat = tfidf.fit_transform(x_ser_clean)
            # count.fit(x_ser_clean)
            # x_mat_train = count.transform(x_ser_clean)
            # x_mat_test = tfidf.transform(x_ser_test)
        except ValueError as e:
            continue

        if not csv: print("\n{}:".format(dat))

        models = [WCNB(preproc=None)]
        bootstrap = Bootstrap(x_mat, y_mat, models, num_samples=n_samp)
        bootstrap.run()

        def prepend(x):
            typ = 'M'
            return [dat, x.name, typ]

        if csv:
            bootstrap.comma_separated_metrics(prepend=prepend)
        else:
            bootstrap.print_summary()

        finish = time.time()
        print("Done tup {}/{} in {}".format(i, n, finish - start),
              file=sys.stderr,
              flush=True)
        i += 1
        start = finish
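The sweep varies the TfidfVectorizer hyperparameters. As a quick standalone illustration of what those knobs control, on toy documents rather than the project's data:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat", "the cat sat on the mat", "dogs bark"]

# ngram_range=(1, 2) keeps unigrams and bigrams; min_df and max_df prune
# terms that appear in too few or too many documents respectively.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2),
                        min_df=1, max_df=0.9, norm='l2')
x_mat = tfidf.fit_transform(docs)
print(x_mat.shape, len(tfidf.get_feature_names_out()))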
Example #11
def main():

    fullpath = lambda path: os.path.join(find_project_dir(), path)
    loss = lambda y_hat, y: np.vectorize(int)(y_hat == y)

    if False:
        sweep(loss,
              csv=True,
              K=[
                  50, 100, 200, 400, 800, 1600, 3200, 6400, 12800, 25600,
                  51200, 102400, 165000
              ],
              ngrams=[5],
              max_df=[0.5])
    else:
        fe = FeatureEngineering()
        # x_ser = fe.read_x_train_features()
        x_ser_clean = fe.read_clean_x_train_features()
        y_mat = fe.read_y_train_features()

        # k = 5000
        # k_tfidf = 500
        # nsamp = 10
        # x_ser = x_ser.head(k)
        # x_ser_clean = x_ser_clean.head(k)
        # y_mat = y_mat[:k,:]
        # y_mat_tfidf = y_mat[:k_tfidf,:]

        # x_mat = fe.calc_count_matrix(x_ser)
        # x_mat_clean = fe.calc_count_matrix(x_ser_clean)
        # x_tfidf_clean = fe.calc_tfid_matrix(x_ser_clean, max_ngrams=3, min_df=0.0001)
        # count = CountVectorizer()
        # tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0.0001)
        # x_mat_clean = fe.calc_count_matrix(x_ser_clean)

        print("done preproc A")
        """
        Run bootstrap.
        """
        # bootstrap = Bootstrap(x_ser_clean.head(500), y_mat_tfidf, [WCNB(preproc=tfidf)], num_samples=nsamp)
        # bootstrap.run()
        # bootstrap.print_summary()
        # bootstrap.models[0].save(fullpath('models/wcnb3'))

        # bootstrap = Bootstrap(x_ser_clean, y_mat, [NaiveBayes()], num_samples=nsamp)
        # bootstrap.run()
        # bootstrap.print_summary()
        # bootstrap.models[0].save(fullpath('models/nb'))
        """
        Run fe.cv.transform() or fe.tf.transform() to get features
        after learning a model. Right now you have to run fe.calc_XXX_matrix
        on the data that was used to train the model first, then
        fe.XX.transform(x), where x is a Pandas series.
        """
        preproc = TfidfVectorizer(analyzer='word',
                                  ngram_range=(1, 5),
                                  min_df=0.00001,
                                  max_df=0.5,
                                  norm='l2')
        # count = CountVectorizer(analyzer='word', ngram_range=(1, 5), min_df=0.00001, max_df=0.5)
        k = "1.5.-5.5"
        x_mat = preproc.fit_transform(x_ser_clean)
        # print(type(x_mat))
        # pd.DataFrame(x_mat.toarray()).to_csv(fullpath("models/xmat.csv"), index_label=False)
        # count.fit(x_ser_clean)
        print("done preproc B")
        # model = WCNB()
        # model.fit(x_mat, y_mat)
        # model.save(fullpath('models/wcnb{}'.format(k)))
        model = WCNB.load(fullpath('models/wcnb{}'.format(k)))
        # model.fit(x_mat, y_mat)
        # x_test = preproc.transform(fe.read_clean_x_test_features())
        y_hat = model.predict(x_mat)
        # pd.DataFrame(y_hat).to_csv(fullpath("models/wcnb{}_output.csv".format(k)), header=['category'], index_label='id')
        y = pd.get_dummies(pd.DataFrame(y_mat)).to_numpy()

        y_hat = pd.DataFrame(y_hat)
        y_hat = pd.get_dummies(y_hat).to_numpy()

        classes = y_hat.shape[1]

        plot_roc_curve(classes, y_hat, y)
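plot_roc_curve is defined elsewhere in that project. For reference, per-class one-vs-rest AUC values can be computed from the one-hot matrices built above with the roc_curve/auc helpers already used in these examples (a sketch assuming y and y_hat are shaped (n_samples, n_classes)):

from sklearn.metrics import roc_curve, auc

def per_class_auc(y_true, y_score):
    # One-vs-rest AUC for each class column of a one-hot encoded matrix.
    aucs = {}
    for c in range(y_true.shape[1]):
        fpr, tpr, _ = roc_curve(y_true[:, c], y_score[:, c])
        aucs[c] = auc(fpr, tpr)
    return aucs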
Example #12
    del traini
    del testi

else:

    train = pd.read_csv('train_transaction.csv')
    traini = pd.read_csv('train_identity.csv')
    train = pd.merge(train, traini, on='TransactionID', how='left')
    test = train.sample(frac=0.7, random_state=99)
    train = train[~train.index.isin(test.index)]
    del traini

print("Done!")

print("Feature engineering...")
train = FE.reduce_mem_usage(train)
test = FE.reduce_mem_usage(test)

FE.make_ymdhd_feature(train)
FE.make_ymdhd_feature(test)

train = train.sort_values('day')
test = test.sort_values('day')


def ecdf(data):
    """Compute ECDF for a one-dimensional array of measurements."""
    # Number of data points: n
    n = len(data)
    # x-data for the ECDF: x
    x = np.sort(data)
    # y-data for the ECDF: evenly spaced fractions from 1/n to 1
    y = np.arange(1, n + 1) / n
    return x, y
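A hypothetical usage of the helper, plotting the ECDF of the transaction amount column from the training split (column name assumed from the transaction data loaded above):

import matplotlib.pyplot as plt

x, y = ecdf(train['TransactionAmt'].dropna())
plt.plot(x, y, marker='.', linestyle='none')
plt.xlabel('TransactionAmt')
plt.ylabel('ECDF')
plt.show()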