Example no. 1
0
    def fit(self,
            X,
            Y,
            learning_rate=1e-8,
            reg=1e-12,
            epochs=10000,
            show_fig=False):

        D = X.shape[1]  # number of features
        K = len(set(Y))  # number of classes

        X, Y = shuffle(X, Y)
        X_valid, Y_valid = X[-1000:], Y[-1000:]
        T_valid = one_hot_encoder(Y_valid)
        X, Y = X[:-1000], Y[:-1000]

        T = one_hot_encoder(Y)

        self.W1 = np.random.randn(D, self.M) / np.sqrt(D)
        self.b1 = np.zeros(self.M)
        self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M)
        self.b2 = np.zeros(K)

        costs = []
        best_validation_error = 1
        for epoch in range(epochs):
            Y_hat, Z = self.forward(X)

            # Weight updates ----------------------
            Y_hat_T = Y_hat - T
            self.W2 -= learning_rate * (Z.T.dot(Y_hat_T) + reg * self.W2)
            self.b2 -= learning_rate * (Y_hat_T.sum(axis=0) + reg * self.b2)

            val = Y_hat_T.dot(self.W2.T) * (1 - Z * Z)  # tanh derivative
            self.W1 -= learning_rate * (X.T.dot(val) + reg * self.W1)
            self.b1 -= learning_rate * (val.sum(axis=0) + reg * self.b1)
            # -------------------------------------

            if epoch % 10 == 0:
                Y_hat_valid, _ = self.forward(X_valid)
                c = cross_entropy(T_valid, Y_hat_valid)
                costs.append(c)
                e = error_rate(Y_valid, np.argmax(Y_hat_valid, axis=1))
                print("epoch:", epoch, "cost:", c, "error:", e)
                if e < best_validation_error:
                    best_validation_error = e
        print("best_validation_error:", best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.title('Validation cost')
            plt.show()

        print("Final train classification_rate:", self.score(X, Y))
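These fit() examples rely on a one_hot_encoder helper that turns a vector of integer class labels into an N x K indicator matrix used as the target T. A minimal sketch of such a helper, assuming labels in 0..K-1 (not the implementation used by the snippets themselves), could look like this:

import numpy as np

def one_hot_encoder(Y, K=None):
    # Map N integer labels in 0..K-1 to an N x K indicator matrix.
    Y = np.asarray(Y, dtype=np.int64)
    if K is None:
        K = Y.max() + 1
    T = np.zeros((len(Y), K))
    T[np.arange(len(Y)), Y] = 1.0
    return T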
 def generate_batch_hot(self):
     start = self.start
     end = self.end
     self.texts_train = []
     self.labels_train = []
     data_split = self.ids[start:end]
     for i in range(0, len(data_split)):
         ids_index = data_split[i][0].split(" ")
         id = int(ids_index[0])
         index = int(ids_index[1])
         labels = self.labels[index][0]
         split_labels = labels.split(" ")
         labels_temp = np.zeros(config.label_size)
         for j in range(1, len(split_labels)):
             try:
                 label_index = utils.find_label_index(split_labels[j])
                 labels_temp[label_index] = 1.0
             except ValueError:
                 print("Not have label: ", split_labels[j])
         self.labels_train.append(labels_temp)
         text_name = str(id) + "text.txt"
         temp_text = ""
         with open('data/bibtex/over200/train/' + text_name, 'r') as f:
             temp_text = f.read()
         temp_text = temp_text.replace(" ", "").replace("\t", "")
         matrix = utils.one_hot_encoder(temp_text)
         self.texts_train.append(matrix)
Example no. 3
0
    def generic_visit(self, node):
        '''
        Is called upon visit to every node.
        '''
        if not hasattr(node, 'visited'):
            if not ast_utils.should_filter(node):
                self.collect_metadata(node)
                self.nodes_stack.append(node)

                token_id = ast_utils.get_token_id(node)
                if token_id == -1:
                    print("[WARNING] --- Found unkown token", node)

                if self.include_vectorized_tokens:
                    ft = feature_utils.token2vec(node, slot=self.slot)

                    if np.count_nonzero(np.isnan(ft)) > 0:
                        print("[WARNING] Found nan feature for node", node)
                        ft = np.zeros(64)
                one_hot_token_type = utils.one_hot_encoder(
                    token_id, 1, min=0, max=max(AST_SYMBOL_DICT.values()))
                if self.include_vectorized_tokens:
                    self.feature_list.append(
                        np.concatenate([ft, one_hot_token_type[0]]))
                else:
                    self.feature_list.append(one_hot_token_type[0])
                self.classes_list.append(ast_utils.get_token_class_id(node))
                node.visited = True

        ast.NodeVisitor.generic_visit(self, node)
Example no. 4
0
def kmeans_for_img(kmeans, img):
    h, w, ch = img.shape
    img = np.reshape(img, (h*w, ch))
    img = kmeans.predict(img)
    img = one_hot_encoder(img, 64)
    img = np.reshape(img, (h, w, 64))
    return img
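A possible usage of kmeans_for_img, assuming a scikit-learn KMeans model fitted on pixel colors with 64 clusters (matching the hard-coded 64 classes above) and the array-style one_hot_encoder(labels, n_classes) helper; the dummy image is only for illustration:

import numpy as np
from sklearn.cluster import KMeans

img = np.random.randint(0, 256, size=(32, 32, 3)).astype(np.float64)  # dummy RGB image
pixels = img.reshape(-1, 3)                       # one row per pixel
kmeans = KMeans(n_clusters=64, n_init=10).fit(pixels)
encoded = kmeans_for_img(kmeans, img)             # shape (32, 32, 64), one-hot per pixel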
def installments_payments(num_rows=None, nan_as_category=True):
    ins = pd.read_csv('../input/installments_payments.csv', nrows=num_rows)
    ins, cat_cols = utils.one_hot_encoder(ins, nan_as_category=nan_as_category)
    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    # Days past due and days before due (no negative values)
    ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = ins['DPD'].apply(lambda x: x if x > 0 else 0)
    ins['DBD'] = ins['DBD'].apply(lambda x: x if x > 0 else 0)
    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(
        ['INS_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts
    ins_agg['INS_COUNT'] = ins.groupby('SK_ID_CURR').size()
    del ins
    gc.collect()
    return ins_agg
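The Home Credit style feature functions above and below rely on a utils.one_hot_encoder(df, nan_as_category=...) helper that dummy-encodes the object columns of a DataFrame and also returns the list of newly created columns, so that they can later be aggregated with 'mean'. A sketch consistent with that usage (the actual utils module may differ):

import pandas as pd

def one_hot_encoder(df, nan_as_category=True):
    # Dummy-encode object (categorical) columns and return the new column names as well.
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [col for col in df.columns if col not in original_columns]
    return df, new_columns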
def credit_card_balance(num_rows=None, nan_as_category=True):
    """
    load the data from


    """

    cc = pd.read_csv('../input/credit_card_balance.csv', nrows=num_rows)

    # NEW CLEANING
    #cc['AMT_DRAWINGS_ATM_CURRENT'][cc['AMT_DRAWINGS_ATM_CURRENT'] < 0] = np.nan
    #cc['AMT_DRAWINGS_CURRENT'][cc['AMT_DRAWINGS_CURRENT'] < 0] = np.nan
    cc['AMT_DRAWINGS_ATM_CURRENT'] = cc['AMT_DRAWINGS_ATM_CURRENT'].apply(
        lambda x: np.nan if x < 0 else x)
    cc['AMT_DRAWINGS_CURRENT'] = cc['AMT_DRAWINGS_CURRENT'].apply(
        lambda x: np.nan if x < 0 else x)

    cc, cat_cols = utils.one_hot_encoder(cc, nan_as_category=nan_as_category)
    # General aggregations
    cc.drop(columns=['SK_ID_PREV'], inplace=True)

    # Aggregate all remaining columns by SK_ID_CURR
    cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(
        ['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg
def get_feature_from_pre_app(prev):

    prev, cat_cols = utils.one_hot_encoder(prev, nan_as_category=True)
    # Days 365.243 values -> nan
    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True)
    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True)
    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True)

    # Add feature: value received / value asked
    prev['APP_CREDIT_PERC'] = prev['AMT_CREDIT'] / prev['AMT_APPLICATION']

    # Previous applications' numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['max', 'mean'],
        'AMT_APPLICATION': ['max', 'mean'],
        'AMT_CREDIT': ['max', 'mean'],
        'APP_CREDIT_PERC': ['max', 'mean'],
        'AMT_DOWN_PAYMENT': ['max', 'mean'],
        'AMT_GOODS_PRICE': ['max', 'mean'],
        'RATE_DOWN_PAYMENT': ['max', 'mean'],
        'RATE_INTEREST_PRIMARY': ['max', 'mean'],
        'RATE_INTEREST_PRIVILEGED': ['max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }

    # Previous applications categorical features
    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ['mean']
    prev_agg = prev.groupby('SK_ID_CURR').agg({
        **num_aggregations,
        **cat_aggregations
    })
    prev_agg.columns = pd.Index([
        'PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()
    ])

    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index([
        'APPROVED_' + e[0] + "_" + e[1].upper()
        for e in approved_agg.columns.tolist()
    ])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index([
        'REFUSED_' + e[0] + "_" + e[1].upper()
        for e in refused_agg.columns.tolist()
    ])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()
    return prev_agg
Example no. 8
0
    def fit(self,
            X,
            Y,
            learning_rate=1e-8,
            reg=1e-12,
            epochs=10000,
            show_fig=False):

        D = X.shape[1]  # number of features
        K = len(set(Y))  # number of classes

        X, Y = shuffle(X, Y)
        X_valid, Y_valid = X[-1000:], Y[-1000:]
        T_valid = one_hot_encoder(Y_valid)
        X, Y = X[:-1000], Y[:-1000]

        T = one_hot_encoder(Y)

        self.W = np.random.randn(D, K) / np.sqrt(D)
        self.b = np.zeros(K)

        costs = []
        best_validation_error = 1
        for epoch in range(epochs):
            Y_hat = self.forward(X)

            self.W -= learning_rate * (self.dJ_dw(T, Y_hat, X) + reg * self.W)
            self.b -= learning_rate * (self.dJ_db(T, Y_hat) + reg * self.b)

            if epoch % 100 == 0:
                Y_hat_valid = self.forward(X_valid)
                c = cross_entropy(T_valid, Y_hat_valid)
                costs.append(c)
                e = error_rate(Y_valid, np.argmax(Y_hat_valid, axis=1))
                print("epoch:", epoch, "cost:", c, "error:", e)
                if e < best_validation_error:
                    best_validation_error = e
        print("best_validation_error:", best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.title('Validation cost')
            plt.show()
        print("Final train classification_rate:", self.score(X, Y))
def get_pos_cash(path, num_rows= None):
    """Preprocess and extract features from POS_CASH_balance file.

    Arguments:
        path: Path to the folder where files are saved (string).
        num_rows: Number of rows to read; None reads all rows (int, default: None).

    Returns:
        df: DataFrame with processed data.
    """
    pos = pd.read_csv(os.path.join(path, 'POS_CASH_balance.csv'), nrows=num_rows)
    pos, categorical_cols = utils.one_hot_encoder(pos, nan_as_category=False)
    # Flag months with late payment
    pos['LATE_PAYMENT'] = pos['SK_DPD'].apply(lambda x: 1 if x > 0 else 0)
    # Aggregate by SK_ID_CURR
    categorical_agg = {key: ['mean'] for key in categorical_cols}
    pos_agg = utils.group(pos, 'POS_', {**config.POS_CASH_AGG, **categorical_agg})
    # Sort and group by SK_ID_PREV
    sort_pos = pos.sort_values(by=['SK_ID_PREV', 'MONTHS_BALANCE'])
    gp = sort_pos.groupby('SK_ID_PREV')
    df = pd.DataFrame()
    df['SK_ID_CURR'] = gp['SK_ID_CURR'].first()
    df['MONTHS_BALANCE_MAX'] = gp['MONTHS_BALANCE'].max()
    # Percentage of previous loans completed and completed before initial term
    df['POS_LOAN_COMPLETED_MEAN'] = gp['NAME_CONTRACT_STATUS_Completed'].mean()
    df['POS_COMPLETED_BEFORE_MEAN'] = gp['CNT_INSTALMENT'].first() - gp['CNT_INSTALMENT'].last()
    df['POS_COMPLETED_BEFORE_MEAN'] = df.apply(lambda x: 1 if x['POS_COMPLETED_BEFORE_MEAN'] > 0
                                                and x['POS_LOAN_COMPLETED_MEAN'] > 0 else 0, axis=1)
    # Number of remaining installments (future installments) and percentage from total
    df['POS_REMAINING_INSTALMENTS'] = gp['CNT_INSTALMENT_FUTURE'].last()
    df['POS_REMAINING_INSTALMENTS_RATIO'] = gp['CNT_INSTALMENT_FUTURE'].last()/gp['CNT_INSTALMENT'].last()
    # Group by SK_ID_CURR and merge
    df_gp = df.groupby('SK_ID_CURR').sum().reset_index()
    df_gp.drop(['MONTHS_BALANCE_MAX'], axis=1, inplace= True)
    pos_agg = pd.merge(pos_agg, df_gp, on= 'SK_ID_CURR', how= 'left')
    del df, gp, df_gp, sort_pos
    gc.collect()

    # Percentage of late payments for the 3 most recent applications
    pos = utils.do_sum(pos, ['SK_ID_PREV'], 'LATE_PAYMENT', 'LATE_PAYMENT_SUM')
    # Last month of each application
    last_month_df = pos.groupby('SK_ID_PREV')['MONTHS_BALANCE'].idxmax()
    # Most recent applications (3)
    sort_pos = pos.sort_values(by=['SK_ID_PREV', 'MONTHS_BALANCE'])
    gp = sort_pos.iloc[last_month_df].groupby('SK_ID_CURR').tail(3)
    gp_mean = gp.groupby('SK_ID_CURR').mean().reset_index()
    pos_agg = pd.merge(pos_agg, gp_mean[['SK_ID_CURR','LATE_PAYMENT_SUM']], on='SK_ID_CURR', how='left')

    # Drop some useless categorical features
    drop_features = [
        'POS_NAME_CONTRACT_STATUS_Canceled_MEAN', 'POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN',
        'POS_NAME_CONTRACT_STATUS_XNA_MEAN']
    pos_agg.drop(drop_features, axis=1, inplace=True)
    return pos_agg
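get_pos_cash and get_bureau_balance (below) also assume utils.group and utils.group_and_merge helpers that aggregate a DataFrame by a key, flatten and prefix the resulting column names, and optionally merge the result back into another frame. A sketch consistent with how they are called here, not the actual utils module:

import pandas as pd

def group(df_to_agg, prefix, aggregations, aggregate_by='SK_ID_CURR'):
    # Aggregate by a key and flatten the resulting MultiIndex columns with a prefix.
    agg_df = df_to_agg.groupby(aggregate_by).agg(aggregations)
    agg_df.columns = pd.Index(['{}{}_{}'.format(prefix, e[0], e[1].upper())
                               for e in agg_df.columns.tolist()])
    return agg_df.reset_index()

def group_and_merge(df_to_agg, df_to_merge, prefix, aggregations, aggregate_by='SK_ID_CURR'):
    # Aggregate one frame and left-merge the result into another on the same key.
    agg_df = group(df_to_agg, prefix, aggregations, aggregate_by=aggregate_by)
    return df_to_merge.merge(agg_df, how='left', on=aggregate_by)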
def get_feature_from_credit_card_balance(cc):
    cc, cat_cols = utils.one_hot_encoder(cc, nan_as_category=True)
    # General aggregations
    cc.drop(['SK_ID_PREV'], axis=1, inplace=True)
    cc_agg = cc.groupby('SK_ID_CURR').agg(['max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(
        ['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg
def previous_applications(num_rows=None, nan_as_category=True):
    prev = pd.read_csv('../input/previous_application.csv', nrows=num_rows)
    prev, cat_cols = utils.one_hot_encoder(prev,
                                           nan_as_category=nan_as_category)
    # Days 365.243 values -> nan
    keys = [
        'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
        'DAYS_LAST_DUE', 'DAYS_TERMINATION'
    ]
    prev[keys] = prev[keys].replace(365243, np.nan)

    # Add feature: value asked / value received ratio
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']

    # Previous applications categorical features
    cat_aggregations = {cat: 'mean' for cat in cat_cols}
    prev_agg = aggregate_data(prev, 'PA', cat_aggregations)

    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }

    # Previous Applications: Total Applications - only numerical features
    prev_subset = aggregate_data(prev,
                                 prefix='PREV',
                                 aggregates=num_aggregations)
    prev_agg = prev_agg.join(prev_subset, how='left', on='SK_ID_CURR')
    # Previous Applications: Approved Applications - only numerical features
    prev_subset = aggregate_data(
        prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1],
        prefix='APR',
        aggregates=num_aggregations)
    prev_agg = prev_agg.join(prev_subset, how='left', on='SK_ID_CURR')
    # Previous Applications: Refused Applications - only numerical features
    prev_subset = aggregate_data(
        prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1],
        prefix='REF',
        aggregates=num_aggregations)
    prev_agg = prev_agg.join(prev_subset, how='left', on='SK_ID_CURR')

    return prev_agg
Example no. 12
0
 def cache(self, node_attribute, num_nodes):
     path_len_2 = pickle.load(open(self.dataset + "/path_len_2.p", "rb"))
     path_len_3 = pickle.load(open(self.dataset + "/path_len_3.p", "rb"))
     path_len_4 = pickle.load(open(self.dataset + "/path_len_4.p", "rb"))
     paths = np.array(path_len_2 + path_len_3 + path_len_4)
     g = np.random.Generator(np.random.PCG64())
     sampled_paths = paths[g.choice(len(paths), 16400, replace=False)]
     real_walk_data = []
     for path in sampled_paths:
         temp_walk = utils.one_hot_encoder(np.array(path), num_nodes + 1)
         temp_type = utils.one_hot_encoder(
             np.array([node_attribute[i] for i in path]), self.num_classes)
         # Padding random walks to the max length
         if temp_type.shape[0] < self.max_path_len:
             temp_walk = utils.pad_along_axis(temp_walk,
                                              self.max_path_len,
                                              axis=0)
             temp_type = utils.pad_along_axis(temp_type,
                                              self.max_path_len,
                                              axis=0)
         real_walk_data.append((temp_type, temp_walk))
     print("Done!")
     return real_walk_data
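The cache() example pads each walk's one-hot matrices up to max_path_len through utils.pad_along_axis; a zero-padding sketch along a single axis, consistent with that call but only an assumption about the real helper:

import numpy as np

def pad_along_axis(array, target_length, axis=0):
    # Zero-pad `array` along `axis` until its size reaches `target_length`.
    pad_size = target_length - array.shape[axis]
    if pad_size <= 0:
        return array
    pad_width = [(0, 0)] * array.ndim
    pad_width[axis] = (0, pad_size)
    return np.pad(array, pad_width, mode='constant', constant_values=0)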
 def generate_batch_hot(self):
     start = self.start
     end = self.end
     self.texts_train = []
     self.labels_train = []
     #print(self.data[start:end, 2])
     titles = list(self.texts[start:end, 1])
     texts = list(self.texts[start:end, 2])
     for i in range(0, len(texts)):
         text = titles[i] + texts[i]
         text = text.replace(" ", "")
         matrix = utils.one_hot_encoder(text)
         self.texts_train.append(matrix)
     labels = list(self.texts[start:end, 0])
     for i in range(0, len(labels)):
         temp = np.zeros(config.label_size)
         temp[int(labels[i]) - 1] = 1
         self.labels_train.append(temp)
def get_feature_from_pos_cash(pos):
    pos, cat_cols = utils.one_hot_encoder(pos, nan_as_category=True)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']

    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(
        ['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg
def pos_cash(num_rows=None, nan_as_category=True):
    pos = pd.read_csv('../input/POS_CASH_balance.csv', nrows=num_rows)
    pos, cat_cols = utils.one_hot_encoder(pos, nan_as_category=nan_as_category)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']

    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(
        ['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg
Example no. 16
0
def get_bureau_balance(path, num_rows=None):
    """Preprocess and extract features from bureau balance.

    Aggregations are done by SK_ID_BUREAU.

    Arguments:
        path: Path to the folder where files are saved (string).
        num_rows: Number of rows to read; None reads all rows (int, default: None).

    Returns:
        df: DataFrame with processed data.
    """
    bb = pd.read_csv(os.path.join(path, 'bureau_balance.csv'), nrows=num_rows)
    bb, categorical_cols = utils.one_hot_encoder(bb, nan_as_category=False)
    bb_processed = bb.groupby('SK_ID_BUREAU')[categorical_cols].mean().reset_index()
    # Min, Max, Count and mean duration of payments (months)
    agg = {'MONTHS_BALANCE': ['min', 'max', 'mean', 'size']}
    bb_processed = utils.group_and_merge(bb, bb_processed, '', agg, 'SK_ID_BUREAU')
    del bb
    gc.collect()
    return bb_processed
Example no. 17
0
def build_data_sets(file_name,
                    name="no_name",
                    avg_group_size=None,
                    derivation=None,
                    random_state=42,
                    test_proportion=0.2):
    eeg = EEG(data_reader=matlab_data_reader).read(file_name)
    n_channels = eeg.n_channels
    if avg_group_size:
        eeg.average_trials(avg_group_size, inplace=True)
    derivation = derivation or 'potential'
    if derivation.lower() == "electric_field":
        eeg.get_electric_field(inplace=True)
        eeg.data = eeg.data.reshape(eeg.n_channels, eeg.trial_size, -1,
                                    3).transpose((2, 0, 1, 3))
    elif derivation.lower() == 'laplacian':
        eeg.get_laplacian(inplace=True)
    n_classes = len(np.unique(eeg.trial_labels))
    labels = one_hot_encoder(eeg.trial_labels)
    X_train, X_test, y_train, y_test = train_test_split(
        eeg.data, labels, test_size=test_proportion, random_state=random_state)
    return type(
        'DataSet', (), {
            'train': EEGDataSetBatch(X_train, y_train),
            'test': type('Dataset', (), {
                'samples': X_test,
                'labels': y_test
            }),
            'trial_size': eeg.trial_size,
            'name': name,
            'derivation': derivation,
            'avg_group_size': avg_group_size,
            'random_state': random_state,
            'test_proportion': test_proportion,
            'n_channels': n_channels,
            'n_comps': eeg.n_comps,
            'n_classes': n_classes
        })
def get_credit_card(path, num_rows= None):
    """Preprocess and extract features from credit_card_balance.

    Arguments:
        path: Path to the folder where files are saved (string).
        num_rows: Number of rows to read; None reads all rows (int, default: None).

    Returns:
        df: DataFrame with processed data.
    """
    cc = pd.read_csv(os.path.join(path, 'credit_card_balance.csv'), nrows= num_rows)
    cc, _ = utils.one_hot_encoder(cc, nan_as_category=False)
    cc.rename(columns={'AMT_RECIVABLE': 'AMT_RECEIVABLE'}, inplace=True)
    # Amount used from limit
    cc['LIMIT_USE'] = cc['AMT_BALANCE'] / cc['AMT_CREDIT_LIMIT_ACTUAL']
    # Current payment / Min payment
    cc['PAYMENT_DIV_MIN'] = cc['AMT_PAYMENT_CURRENT'] / cc['AMT_INST_MIN_REGULARITY']
    # Late payment
    cc['LATE_PAYMENT'] = cc['SK_DPD'].apply(lambda x: 1 if x > 0 else 0)
    # How much drawing of limit
    cc['DRAWING_LIMIT_RATIO'] = cc['AMT_DRAWINGS_ATM_CURRENT'] / cc['AMT_CREDIT_LIMIT_ACTUAL']
    # Aggregations by SK_ID_CURR
    cc_agg = cc.groupby('SK_ID_CURR').agg(config.CREDIT_CARD_AGG)
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    cc_agg.reset_index(inplace= True)

    # Last month balance of each credit card application
    last_ids = cc.groupby('SK_ID_PREV')['MONTHS_BALANCE'].idxmax()
    last_months_df = cc[cc.index.isin(last_ids)]
    cc_agg = utils.group_and_merge(last_months_df,cc_agg,'CC_LAST_', {'AMT_BALANCE': ['mean', 'max']})

    # Aggregations for last x months
    for months in [12, 24, 48]:
        cc_prev_id = cc[cc['MONTHS_BALANCE'] >= -months]['SK_ID_PREV'].unique()
        cc_recent = cc[cc['SK_ID_PREV'].isin(cc_prev_id)]
        prefix = 'CC_{}M_'.format(months)
        cc_agg = utils.group_and_merge(cc_recent, cc_agg, prefix, config.CREDIT_CARD_TIME_AGG)
    return cc_agg
Example no. 19
0
 def generate_batch_hot(self):
     start = self.start
     end = self.end
     self.texts_train = []
     self.labels_train = []
     #print(self.data[start:end, 2])
     titles = list(self.data[start:end, 1])
     texts = list(self.data[start:end, 2])
     for i in range(0, len(texts)):
         text = titles[i] + texts[i]
         text = text.replace(" ", "")
         matrix = utils.one_hot_encoder(text)
         self.texts_train.append(matrix)
     labels = list(self.data[start:end, 0])
     for i in range(0, len(labels)):
         if labels[i] == '1':
             self.labels_train.append([1, 0, 0, 0])
         if labels[i] == '2':
             self.labels_train.append([0, 1, 0, 0])
         if labels[i] == '3':
             self.labels_train.append([0, 0, 1, 0])
         if labels[i] == '4':
             self.labels_train.append([0, 0, 0, 1])
Example no. 20
0
def oneHotGenerate(dataSet, labSet, batchSize):

    dataSet = list(dataSet)
    labSet = list(labSet)

    data_list = np.zeros((batchSize, 500, 25, 1), dtype=np.float32)
    label_list = np.zeros((batchSize))

    setNum = len(dataSet)
    batchFlag = 0
    setFlag = 0

    while True:

        data = dataSet[setFlag]
        label = labSet[setFlag]

        data = one_hot_encoder(data)
        data = normalization_processing(data)

        data_list[batchFlag, :, :, 0] = data
        label_list[batchFlag] = label

        batchFlag += 1
        setFlag += 1

        if setFlag >= setNum:
            setFlag = 0

        if batchFlag >= batchSize:

            oneHotLab = to_categorical(label_list, num_classes=2)
            yield [data_list], [oneHotLab]

            batchFlag = 0
            data_list = np.zeros((batchSize, 500, 25, 1), dtype=np.float32)
            label_list = np.zeros((batchSize))
Example no. 21
0
 def generate_batch_hot(self):
     start = self.start
     end = self.end
     self.texts_train = []
     self.labels_train = []
     data_split = self.ids[start:end]
     for i in range(0, len(data_split)):
         #print(data_split[i])
         ids_index = data_split[i][0].split(" ")
         id = int(ids_index[0])
         index = int(ids_index[1])
         labels = self.labels[index][0]
         split_labels = labels.split(" ")
         labels_temp = np.zeros(config.label_size)
         for j in range(1, len(split_labels)):
             try:
                 label_index = utils.find_label_index(split_labels[j])
                 labels_temp[label_index] = 1.0
             except ValueError:
                 print("Not have label: ", split_labels[j])
         self.labels_train.append(labels_temp)
         text_name = str(id) + "newsML.xml"
         #reuters = et.parse("data/rcv1-2/train-text/" + text_name, et.XMLParser(encoding='ISO-8859-1')).getroot()
         reuters = et.parse("data/rcv1-2/test-text0/" + text_name,
                            et.XMLParser(encoding='ISO-8859-1')).getroot()
         temp_text = ""
         for text in reuters.findall("title"):
             #print(text.text)
             temp_text = temp_text + text.text.replace(" ", "")
         for text in reuters.findall("text"):
             for p in text.findall("p"):
                 temp_text = temp_text + p.text.replace(" ", "").replace(
                     "\t", "")
         #print("ID TExt: ", id)
         #print(temp_text)
         matrix = utils.one_hot_encoder(temp_text)
         self.texts_train.append(matrix)
Example no. 22
0
    def fit(self,
            X,
            Y,
            activation=th.nnet.relu,
            learning_rate=1e-8,
            reg=1e-12,
            epochs=10000,
            n_batches=10,
            decay_rate=0.9,
            show_fig=False):
        X = X.astype(np.float32)
        Y = Y.astype(np.int32)

        X, Y = shuffle(X, Y)
        X_valid, Y_valid = X[-1000:], Y[-1000:]
        T_valid = one_hot_encoder(Y_valid)
        X, Y = X[:-1000], Y[:-1000]
        T = one_hot_encoder(Y)

        self.rng = theano.tensor.shared_randomstreams.RandomStreams()

        eps = 1e-10
        D = X.shape[1]  # number of features
        K = len(set(Y))  # number of classes
        batch_size = X.shape[0] // n_batches
        print_time = n_batches // 1

        M1 = D
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, activation_fn=activation)
            self.layers.append(h)
            M1 = M2

        # the final layer
        h = HiddenLayer(M1, K, activation_fn=th.nnet.softmax)
        self.layers.append(h)

        for layer in self.layers:
            self.params += layer.params

        dparams = [
            theano.shared(np.zeros_like(p.get_value())) for p in self.params
        ]
        cache = [
            theano.shared(np.zeros_like(p.get_value())) for p in self.params
        ]

        thX = th.matrix('X')
        thT = th.matrix('T')
        thY_train = self.forward_train(thX)

        # Cost
        regularization_cost = reg * th.mean([(p * p).sum()
                                             for p in self.params])
        #cost = -th.mean(th.log(thY[th.arange(thT.shape[0]), thT])) #+ regularization_cost
        cost_train = -th.mean(thT * th.log(thY_train)) + regularization_cost

        # Gradient
        grads = th.grad(cost_train, self.params)

        update_params = [(p, p - learning_rate *
                          (decay_rate * v + (1 - decay_rate) * g + reg * p))
                         for g, v, p in zip(grads, dparams, self.params)]
        update_velocity = [(v, decay_rate * v + (1 - decay_rate) * g)
                           for g, v in zip(grads, dparams)]
        # updates = [(p, p - learning_rate*g) for g, p in zip(grads, self.params)]
        updates = update_params + update_velocity

        train_op = theano.function(inputs=[thX, thT], updates=updates)

        thY_predict = self.forward_predict(thX)
        cost_predict = -th.mean(
            thT * th.log(thY_predict)) + regularization_cost

        # Predictions
        prediction = th.argmax(thY_predict, axis=1)

        cost_predict_op = theano.function(inputs=[thX, thT],
                                          outputs=[cost_predict, prediction])

        costs = []
        for epoch in range(epochs):
            X_shuffled, T_shuffled = shuffle(X, T)
            for batch in range(n_batches):
                # Get the batch
                X_batch = X_shuffled[batch * batch_size:(batch + 1) *
                                     batch_size, :]
                Y_batch = T_shuffled[batch * batch_size:(batch + 1) *
                                     batch_size, :]

                train_op(X_batch, Y_batch)

                if batch % print_time == 0:
                    test_cost, prediction = cost_predict_op(X_valid, T_valid)
                    err = error_rate(Y_valid, prediction)
                    # print(prediction.shape)
                    print(
                        "epoch [%d], batch [%d] : cost=[%.3f], error=[%.3f]" %
                        (epoch, batch, test_cost, err))
                    costs.append(test_cost)

        plt.plot(costs)
        plt.title('Validation cost')
        plt.show()
Example no. 23
0
def get_bureau(path, num_rows=None):
    """Preprocess and extract features from bureau and bureau balance.

    Get bureau balance features grouped by SK_ID_BUREAU and append them to
    the bureau data. After that, aggregations are performed for each customer
    (unique SK_ID_CURR) and a DataFrame is returned.

    Arguments:
        path: Path to the folder where files are saved (string).
        num_rows: Number of rows to read; None reads all rows (int, default: None).

    Returns:
        df: DataFrame with processed data.
    """
    bureau = pd.read_csv(os.path.join(path, 'bureau.csv'), nrows= num_rows)
    # Credit duration and credit/account end date difference
    bureau['CREDIT_DURATION'] = -bureau['DAYS_CREDIT'] + bureau['DAYS_CREDIT_ENDDATE']
    bureau['ENDDATE_DIF'] = bureau['DAYS_CREDIT_ENDDATE'] - bureau['DAYS_ENDDATE_FACT']
    # Credit to debt ratio and difference
    bureau['DEBT_PERCENTAGE'] = bureau['AMT_CREDIT_SUM'] / bureau['AMT_CREDIT_SUM_DEBT']
    bureau['DEBT_CREDIT_DIFF'] = bureau['AMT_CREDIT_SUM'] - bureau['AMT_CREDIT_SUM_DEBT']
    bureau['CREDIT_TO_ANNUITY_RATIO'] = bureau['AMT_CREDIT_SUM'] / bureau['AMT_ANNUITY']

    # One-hot encoder
    bureau, _ = utils.one_hot_encoder(bureau, nan_as_category= False)
    # Join bureau balance features
    bureau = bureau.merge(get_bureau_balance(path, num_rows), how='left', on='SK_ID_BUREAU')
    # Flag months with late payments (days past due)
    bureau['STATUS_12345'] = 0
    for i in range(1,6):
        bureau['STATUS_12345'] += bureau['STATUS_{}'.format(i)]

    # Aggregate by number of months in balance and merge with bureau (loan length agg)
    features = ['AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM_OVERDUE', 'AMT_CREDIT_SUM',
        'AMT_CREDIT_SUM_DEBT', 'DEBT_PERCENTAGE', 'DEBT_CREDIT_DIFF', 'STATUS_0', 'STATUS_12345']
    agg_length = bureau.groupby('MONTHS_BALANCE_SIZE')[features].mean().reset_index()
    agg_length.rename({feat: 'LL_' + feat for feat in features}, axis=1, inplace=True)
    bureau = bureau.merge(agg_length, how='left', on='MONTHS_BALANCE_SIZE')
    del agg_length
    gc.collect()

    # General loans aggregations
    agg_bureau = utils.group(bureau, 'BUREAU_', config.BUREAU_AGG)
    # Active and closed loans aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    agg_bureau = utils.group_and_merge(active,agg_bureau,'BUREAU_ACTIVE_',config.BUREAU_ACTIVE_AGG)
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    agg_bureau = utils.group_and_merge(closed,agg_bureau,'BUREAU_CLOSED_',config.BUREAU_CLOSED_AGG)
    del active, closed
    gc.collect()
    # Aggregations for the main loan types
    for credit_type in ['Consumer credit', 'Credit card', 'Mortgage', 'Car loan', 'Microloan']:
        type_df = bureau[bureau['CREDIT_TYPE_' + credit_type] == 1]
        prefix = 'BUREAU_' + credit_type.split(' ')[0].upper() + '_'
        agg_bureau = utils.group_and_merge(type_df, agg_bureau, prefix, config.BUREAU_LOAN_TYPE_AGG)
        del type_df
        gc.collect()
    # Time based aggregations: last x months
    for time_frame in [6, 12, 24, 36]:
        prefix = "BUREAU_LAST{}M_".format(time_frame)
        time_frame_df = bureau[bureau['DAYS_CREDIT'] >= -30*time_frame]
        agg_bureau = utils.group_and_merge(time_frame_df,agg_bureau,prefix,config.BUREAU_TIME_AGG)
        del time_frame_df
        gc.collect()

    # Last loan max overdue
    sort_bureau = bureau.sort_values(by=['DAYS_CREDIT'])
    gr = sort_bureau.groupby('SK_ID_CURR')['AMT_CREDIT_MAX_OVERDUE'].last().reset_index()
    gr.rename(columns={'AMT_CREDIT_MAX_OVERDUE': 'BUREAU_LAST_LOAN_MAX_OVERDUE'}, inplace=True)
    agg_bureau = agg_bureau.merge(gr, on='SK_ID_CURR', how='left')
    # Ratios: total debt/total credit and active loans debt/ active loans credit
    agg_bureau['BUREAU_DEBT_OVER_CREDIT'] = \
        agg_bureau['BUREAU_AMT_CREDIT_SUM_DEBT_SUM']/agg_bureau['BUREAU_AMT_CREDIT_SUM_SUM']
    agg_bureau['BUREAU_ACTIVE_DEBT_OVER_CREDIT'] = \
        agg_bureau['BUREAU_ACTIVE_AMT_CREDIT_SUM_DEBT_SUM']/agg_bureau['BUREAU_ACTIVE_AMT_CREDIT_SUM_SUM']
    return agg_bureau
Example no. 24
0
    # test set
    x_test1 = feature_all[i]
    y_test = target_all[i]['reason']
    
    # remove foursquare data
#     x_train = x_train.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7'],axis=1)
#     x_test = x_test.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7'],axis=1)
    
    # train (layer 1)
    #eta_list = np.array([0.05]*200+[0.02]*200+[0.01]*200)
    gbm1 = xgb.XGBClassifier(max_depth=3, n_estimators=20, learning_rate=0.01, nthread=12,
                             subsample=1, max_delta_step=0).fit(x_train1, y_train1)
    y_pred1 = gbm1.predict(x_train1)
    # train (layer 2)
    y_pred1_code = pd.DataFrame(columns=['loc {}'.format(j) for j in range(len(location_top))])
    for j in range(x_train1.shape[0]):
        y_pred1_code.loc[j,:] = one_hot_encoder(y_pred1[j], np.array(location_top))
    x_train2 = pd.concat([x_train1, y_pred1_code], axis=1)
    gbm2 = xgb.XGBClassifier(max_depth=3, n_estimators=20, learning_rate=0.01, nthread=12,
                             subsample=1, max_delta_step=0).fit(x_train2, y_train2)
    
    # train performance
#     y_pred = gbm.predict(x_train)
#     conf_train, roc_auc_train = calculate_confusion_matrix(y_pred, y_train)

    # test (layer 1)
    y_pred1 = gbm1.predict(x_test1)
    y_pred1_code = pd.DataFrame(columns=['loc {}'.format(j) for j in range(len(location_top))])
    
    # test (layer 2)
    for j in range(x_test1.shape[0]):
        y_pred1_code.loc[j,:] = one_hot_encoder(y_pred1[j], np.array(location_top))
    x_test2 = pd.concat([x_test1, y_pred1_code], axis=1)
Example no. 25
0
    #reading the dataset
    df = utils.read_dataset(path, separator)
    #global analysis of the dataset
    utils.broad_analysis(df)
    utils.missing_values_table(df)
    features_list = df.columns
    #dropping duplicates so that repeated data points do not carry more weight than they should
    df = df.drop_duplicates()
    #removing columns containing only one value, which only bring useless noise to the dataset
    df = utils.remove_unique_feature(df)
    #removing proper nouns from the dataset
    name_features = utils.remove_name(nlp, df)
    df = df.drop(name_features, axis=1)
    #one hot encoding of categorical data
    df = utils.one_hot_encoder(df)
    #processing of NaNs values
    df = utils.missing_values(df, 'drop')
    #visualizing the correlation matrix (linear correlation only)
    utils.visualise_correlation(df)
    features = df.columns.tolist()
    del features[features.index(str(target))]
    #converting skewed float features with a log(x - min(x) + 1) transform
    #this helps the distribution get closer to gaussian for better outlier removal
    for feature in features:
        if df[feature].dtypes == 'float64':
            if df[feature].skew() == 0:
                pass
            else:
                print(df.columns)
                df[feature] = df[feature].apply(
                    lambda x: np.log(x - df[feature].min() + 1))
def main():
    #file_loc = '/media/avemuri/DEV/Data/deeplearning/mnist/train.csv'
    file_loc = 'D:/dev/data/face_emotion_recognizer/fer2013.csv'
    X_train, Y_train, X_test, Y_test = get_data(file_name=file_loc)
    
    pca = PCA(n_components=400)
    pca.fit(X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)
    T_train = one_hot_encoder(Y_train)
    T_test = one_hot_encoder(Y_test)

    D = X_train.shape[1] # number of features
    K = len(set(Y_train)) # number of classes
    decay_rate = 0.999
    eps = 1e-10
    epochs = 100
    n_batches = 10
    batch_size = X_train.shape[0]//n_batches
    print_time = n_batches
    M = 300
    learning_rate=1e-6
    reg=1e-8
    

    W1_init = np.random.randn(D, M) / np.sqrt(D)
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)
    

    thX = th.matrix('X')
    thT = th.matrix('Y')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')
    cache_W1 = theano.shared(np.ones_like(W1_init), 'cache_w1')
    cache_b1 = theano.shared(np.ones_like(b1_init), 'cache_b1')
    cache_W2 = theano.shared(np.ones_like(W2_init), 'cache_w2')
    cache_b2 = theano.shared(np.ones_like(b2_init), 'cache_b2')


    # forward model
    thZ = th.nnet.relu(thX.dot(W1) + b1)
    #thZ[thZ < 0] = 0
    # Z = np.tanh(X.dot(self.W1) + self.b1)
    thY = th.nnet.softmax(thZ.dot(W2) + b2)

    # Cost
    cost = -(thT*th.log(thY)).sum() + reg*((W1*W1).sum() + (b1*b1).sum() + (W2*W2).sum() + (b2*b2).sum())

    # Prediction
    prediction = th.argmax(thY, axis=1)

    # Updates
    dJ_dW1 = th.grad(cost, W1)
    dJ_db1 = th.grad(cost, b1)
    dJ_dW2 = th.grad(cost, W2)
    dJ_db2 = th.grad(cost, b2)

    # The RMSProp caches are shared variables, so they must be updated through the
    # theano.function updates list; rebinding the Python names would leave them fixed.
    new_cache_W1 = decay_rate*cache_W1 + (1-decay_rate)*dJ_dW1*dJ_dW1
    new_cache_b1 = decay_rate*cache_b1 + (1-decay_rate)*dJ_db1*dJ_db1
    new_cache_W2 = decay_rate*cache_W2 + (1-decay_rate)*dJ_dW2*dJ_dW2
    new_cache_b2 = decay_rate*cache_b2 + (1-decay_rate)*dJ_db2*dJ_db2

    update_W1 = W1 - learning_rate*dJ_dW1/(th.sqrt(new_cache_W1)+eps)
    update_b1 = b1 - learning_rate*dJ_db1/(th.sqrt(new_cache_b1)+eps)
    update_W2 = W2 - learning_rate*dJ_dW2/(th.sqrt(new_cache_W2)+eps)
    update_b2 = b2 - learning_rate*dJ_db2/(th.sqrt(new_cache_b2)+eps)

    train = theano.function(inputs=[thX, thT],
                            updates=[(cache_W1, new_cache_W1), (cache_b1, new_cache_b1),
                                     (cache_W2, new_cache_W2), (cache_b2, new_cache_b2),
                                     (W1, update_W1), (b1, update_b1),
                                     (W2, update_W2), (b2, update_b2)])

    get_prediction = theano.function(inputs=[thX, thT], outputs=[cost, prediction])
    
    costs = []
    for epoch in range(epochs):
        X_shuffled, T_shuffled = shuffle(X_train, T_train)
        for batch in range(n_batches):
            # Get the batch
            X_batch = X_shuffled[batch*batch_size:(batch+1)*batch_size,:]
            Y_batch = T_shuffled[batch*batch_size:(batch+1)*batch_size,:]

            train(X_batch, Y_batch)
            
            if batch % print_time == 0:
                c, pred = get_prediction(X_test, T_test)
                err = error_rate(Y_test, pred)
                print("epoch [%d], batch [%d] : cost=[%.3f], error=[%.3f]" %(epoch, batch, c, err))
                costs.append(c)

    plt.plot(costs)
    plt.title('Validation cost')
    plt.show()
Example no. 27
0
    def fit(self,
            X,
            Y,
            learning_rate=1e-8,
            reg=1e-12,
            epochs=10000,
            n_batches=10,
            show_fig=False):

        D = X.shape[1]  # number of features
        K = len(set(Y))  # number of classes

        X, Y = shuffle(X, Y)
        X_valid, Y_valid = X[-1000:], Y[-1000:]
        T_valid = one_hot_encoder(Y_valid)
        X, Y = X[:-1000], Y[:-1000]

        batch_size = X.shape[0] // n_batches

        T = one_hot_encoder(Y)

        self.W1 = np.random.randn(D, self.M) / np.sqrt(D)
        self.b1 = np.zeros(self.M)
        self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M)
        self.b2 = np.zeros(K)

        # 1st moment
        mW1 = 0
        mb1 = 0
        mW2 = 0
        mb2 = 0

        # 2nd moment
        vW1 = 0
        vb1 = 0
        vW2 = 0
        vb2 = 0

        # hyperparams
        beta1 = 0.9
        beta2 = 0.999
        eps = 1e-8

        costs = []
        t = 1
        for epoch in range(epochs):
            X_shuffled, T_shuffled = shuffle(X, T)
            for ibatch in range(n_batches):
                # Get the batch
                X_batch = X_shuffled[ibatch * batch_size:(ibatch + 1) *
                                     batch_size, :]
                Y_batch = T_shuffled[ibatch * batch_size:(ibatch + 1) *
                                     batch_size, :]

                Y_hat, Z = self.forward(X_batch)

                # Weight updates ----------------------
                Y_hat_T = Y_hat - Y_batch
                dJ_dW2 = Z.T.dot(Y_hat_T) + reg * self.W2
                dJ_db2 = Y_hat_T.sum(axis=0) + reg * self.b2

                val = (Y_hat - Y_batch).dot(self.W2.T) * (Z > 0)  # Relu
                #val = Y_hat_T.dot(self.W2.T) * (1-Z*Z) # tanh
                dJ_dW1 = X_batch.T.dot(val) + reg * self.W1
                dJ_db1 = val.sum(axis=0) + reg * self.b1

                # Mean
                mW2 = beta1 * mW2 + (1 - beta1) * dJ_dW2
                mb2 = beta1 * mb2 + (1 - beta1) * dJ_db2
                mW1 = beta1 * mW1 + (1 - beta1) * dJ_dW1
                mb1 = beta1 * mb1 + (1 - beta1) * dJ_db1

                # Velocity terms
                vW2 = beta2 * vW2 + (1 - beta2) * dJ_dW2 * dJ_dW2
                vb2 = beta2 * vb2 + (1 - beta2) * dJ_db2 * dJ_db2
                vW1 = beta2 * vW1 + (1 - beta2) * dJ_dW1 * dJ_dW1
                vb1 = beta2 * vb1 + (1 - beta2) * dJ_db1 * dJ_db1

                correction1 = 1 - beta1**t
                hat_mW2 = mW2 / correction1
                hat_mb2 = mb2 / correction1
                hat_mW1 = mW1 / correction1
                hat_mb1 = mb1 / correction1

                correction2 = 1 - beta2**t
                hat_vW2 = vW2 / correction2
                hat_vb2 = vb2 / correction2
                hat_vW1 = vW1 / correction2
                hat_vb1 = vb1 / correction2

                self.W2 -= learning_rate * hat_mW2 / (np.sqrt(hat_vW2) + eps)
                self.b2 -= learning_rate * hat_mb2 / (np.sqrt(hat_vb2) + eps)
                self.W1 -= learning_rate * hat_mW1 / (np.sqrt(hat_vW1) + eps)
                self.b1 -= learning_rate * hat_mb1 / (np.sqrt(hat_vb1) + eps)
                # -------------------------------------

                Y_hat_valid, _ = self.forward(X_valid)
                c = cross_entropy(T_valid, Y_hat_valid)
                costs.append(c)

                if ibatch % (n_batches) == 0:
                    e = error_rate(Y_valid, np.argmax(Y_hat_valid, axis=1))
                    print("epoch:", epoch, " cost:", c, " error:", e)

                t += 1

        if show_fig:
            plt.plot(costs)
            plt.title('Validation cost')
            plt.show()

        print("Final train classification_rate:", self.score(X, Y))
Example no. 28
0
def LogReg2D_classification(dataset, filename):
    """
    Classification of data with 2D logistic regression,
    followed by plotting of ROC and PR curves.

    Parameters
    ---
    dataset: the input dataset, containing training and
       test split data, and the corresponding labels
       for binding- and non-binding sequences.

    filename: an identifier to distinguish different
       plots from each other.

    Returns
    ---
    stats: array containing classification accuracy, precision
        and recall
    """

    # Import training/test set
    X_train = dataset.train.loc[:, 'AASeq'].values
    X_test = dataset.test.loc[:, 'AASeq'].values

    # One hot encode the sequences in 2D
    X_train = [one_hot_encoder(s=x, alphabet=IUPAC.protein) for x in X_train]
    X_train_2D_list = []
    for x in range(0, len(X_train)):
        X_train_2D = np.empty([20, 0])
        for y in range(0, X_train[x].shape[1] - 1):
            for z in range(0, X_train[x].shape[0]):
                X_train_2D = np.concatenate(
                    (X_train_2D, X_train[x][z, y] * X_train[x][:, y + 1:]),
                    axis=1)
        X_train_2D_list.append(X_train_2D)
    X_train = [x.flatten('F') for x in X_train_2D_list]

    X_test = [one_hot_encoder(s=x, alphabet=IUPAC.protein) for x in X_test]
    X_test_2D_list = []
    for x in range(0, len(X_test)):
        X_test_2D = np.empty([20, 0])
        for y in range(0, X_test[x].shape[1] - 1):
            for z in range(0, X_test[x].shape[0]):
                X_test_2D = np.concatenate(
                    (X_test_2D, X_test[x][z, y] * X_test[x][:, y + 1:]),
                    axis=1)
        X_test_2D_list.append(X_test_2D)
    X_test = [x.flatten('F') for x in X_test_2D_list]

    # Extract labels of training/test set
    y_train = dataset.train.loc[:, 'AgClass'].values
    y_test = dataset.test.loc[:, 'AgClass'].values

    # Fitting Logistic Regression to the training set
    LR_classifier = LogisticRegression(random_state=0)
    LR_classifier.fit(X_train, y_train)

    # Predicting the test set results
    y_pred = LR_classifier.predict(X_test)
    y_score = LR_classifier.predict_proba(X_test)

    # ROC curve
    title = '2D Logistic Regression ROC curve (Train={})'.format(filename)
    plot_ROC_curve(y_test,
                   y_score[:, 1],
                   plot_title=title,
                   plot_dir='figures/2DLR_ROC_Test_{}.png'.format(filename))

    # Precision-recall curve
    title = '2D Logistic Regression Precision-Recall curve (Train={})'.format(
        filename)
    plot_PR_curve(y_test,
                  y_score[:, 1],
                  plot_title=title,
                  plot_dir='figures/2DLR_P-R_Test_{}.png'.format(filename))

    # Calculate statistics
    stats = calc_stat(y_test, y_pred)

    # Return statistics
    return stats
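LogReg2D_classification assumes a sequence-level one_hot_encoder(s, alphabet) that returns a len(alphabet) x len(s) binary matrix, with one row per amino acid and one column per sequence position. A sketch under that assumption, accepting either a Biopython-style alphabet object (with a .letters attribute) or a plain string of the 20 standard amino acids; this is not the encoder used by the snippet itself:

import numpy as np

AA_LETTERS = 'ACDEFGHIKLMNPQRSTVWY'  # 20 standard amino acids

def one_hot_encoder(s, alphabet=AA_LETTERS):
    # Encode a protein sequence as a (len(alphabet), len(s)) binary matrix.
    letters = getattr(alphabet, 'letters', alphabet)
    index = {aa: i for i, aa in enumerate(letters)}
    x = np.zeros((len(letters), len(s)))
    for pos, aa in enumerate(s):
        x[index[aa], pos] = 1.0
    return x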
Example no. 29
0
    def fit(self,
            X,
            Y,
            activation=tf.nn.relu,
            learning_rate=1e-8,
            reg=1e-12,
            epochs=10000,
            n_batches=10,
            decay_rate=0.9,
            show_fig=False):
        X = X.astype(np.float32)
        Y = Y.astype(np.int32)

        X, Y = shuffle(X, Y)
        X_valid, Y_valid = X[-1000:], Y[-1000:]
        T_valid = one_hot_encoder(Y_valid)
        X, Y = X[:-1000], Y[:-1000]
        T = one_hot_encoder(Y)

        eps = 1e-10
        D = X.shape[1]  # number of features
        K = len(set(Y))  # number of classes
        batch_size = X.shape[0] // n_batches
        print_time = n_batches // 1

        M1 = D
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, activation_fn=activation)
            self.layers.append(h)
            M1 = M2

        # the final layer
        h = HiddenLayer(M1, K, activation_fn=tf.identity)  # keep logits; softmax is applied inside the cost
        self.layers.append(h)

        for layer in self.layers:
            self.params += layer.params

        tfX = tf.placeholder(tf.float32, shape=(None, D), name='tfX')
        tfT = tf.placeholder(tf.float32, shape=(None, K), name='tfT')
        tfY = self.forward(tfX)

        predict_op = tf.argmax(tfY, axis=1)

        cost = tf.reduce_sum(
            tf.nn.softmax_cross_entropy_with_logits_v2(logits=tfY, labels=tfT))
        train_op = tf.train.RMSPropOptimizer(learning_rate,
                                             decay=0.99,
                                             momentum=0.9).minimize(cost)

        costs = []
        init = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init)
            for epoch in range(epochs):
                X_shuffled, T_shuffled = shuffle(X, T)
                for batch in range(n_batches):
                    # Get the batch
                    X_batch = X_shuffled[batch * batch_size:(batch + 1) *
                                         batch_size, :]
                    Y_batch = T_shuffled[batch * batch_size:(batch + 1) *
                                         batch_size, :]

                    session.run(train_op,
                                feed_dict={
                                    tfX: X_batch,
                                    tfT: Y_batch
                                })

                    if batch % print_time == 0:
                        test_cost = session.run(cost,
                                                feed_dict={
                                                    tfX: X_valid,
                                                    tfT: T_valid
                                                })
                        prediction = session.run(predict_op,
                                                 feed_dict={tfX: X_valid})
                        err = error_rate(Y_valid, prediction)
                        # print(prediction.shape)
                        print(
                            "epoch [%d], batch [%d] : cost=[%.3f], error=[%.3f]"
                            % (epoch, batch, test_cost, err))
                        costs.append(test_cost)

        plt.plot(costs)
        plt.title('Validation cost')
        plt.show()
Example no. 30
0
    #     x_test = x_test.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7'],axis=1)

    # train (layer 1)
    #eta_list = np.array([0.05]*200+[0.02]*200+[0.01]*200)
    gbm1 = xgb.XGBClassifier(max_depth=3,
                             n_estimators=20,
                             learning_rate=0.05,
                             nthread=12,
                             subsample=1,
                             max_delta_step=0).fit(x_train1, y_train1)
    y_pred1 = gbm1.predict(x_train1)
    # train (layer 2)
    y_pred1_code = pd.DataFrame(
        columns=['loc {}'.format(j) for j in range(len(location_top))])
    for j in range(x_train1.shape[0]):
        y_pred1_code.loc[j, :] = one_hot_encoder(y_pred1[j],
                                                 np.array(location_top))
    x_train2 = pd.concat([x_train1, y_pred1_code], axis=1)
    gbm2 = xgb.XGBClassifier(max_depth=3,
                             n_estimators=20,
                             learning_rate=0.05,
                             nthread=12,
                             subsample=1,
                             max_delta_step=0).fit(x_train2, y_train2)

    # train performance
    #     y_pred = gbm.predict(x_train)
    #     conf_train, roc_auc_train = calculate_confusion_matrix(y_pred, y_train)

    # test (layer 1)
    y_pred1 = gbm1.predict(x_test1)
Example no. 31
0
def main():
    #file_loc = '/media/avemuri/DEV/Data/deeplearning/mnist/train.csv'
    file_loc = 'D:/dev/data/mnist/train.csv'
    X_train, Y_train, X_test, Y_test = get_data(file_name=file_loc,
                                                split_train_test=True)

    pca = PCA(n_components=400)
    pca.fit(X_train)
    X_train = pca.transform(X_train)
    #Y = Y_train
    T_train = one_hot_encoder(Y_train)
    X_test = pca.transform(X_test)
    T_test = one_hot_encoder(Y_test)

    #######################################################

    D = X_train.shape[1]  # number of features
    K = len(set(Y_train))  # number of classes
    M = 300
    reg = 0.00001
    batch_size = 500
    n_batches = X_train.shape[0] // batch_size
    learning_rate = 0.00004
    epochs = 10

    W1_init = np.random.randn(D, M) / np.sqrt(D)
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    # Define all variables
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='Y')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))

    # Model definition
    Z = tf.nn.relu(tf.matmul(X, W1) + b1)
    Y_hat = tf.matmul(Z, W2) + b2

    # Cost
    cost = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=Y_hat, labels=T))

    # Optimization
    train = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                      decay=0.99,
                                      momentum=0.9).minimize(cost)

    # Predictions
    predic_op = tf.argmax(Y_hat, axis=1)

    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        for epoch in range(epochs):
            X_shuffled, T_shuffled = shuffle(X_train, T_train)
            for batch in range(n_batches):
                # Get the batch
                X_batch = X_shuffled[batch * batch_size:(batch + 1) *
                                     batch_size, :]
                Y_batch = T_shuffled[batch * batch_size:(batch + 1) *
                                     batch_size, :]

                session.run(train, feed_dict={X: X_batch, T: Y_batch})

                if batch % 10 == 0:
                    c = session.run(cost, feed_dict={X: X_test, T: T_test})
                    Y_test_predictions = session.run(predic_op,
                                                     feed_dict={X: X_test})
                    err = error_rate(Y_test, Y_test_predictions)
                    print(
                        "epoch [%d], batch [%d] : cost=[%.3f], error=[%.3f]" %
                        (epoch, batch, c, err))
                    costs.append(c)

    plt.plot(costs)
    plt.title('Validation cost')
    plt.show()