Example #1
0
def train_card_merchant_embeddings(df, path, iter=1):
    """Train a card/merchant embedding model on `df` and return the learner.

    `merchant_id` is converted to an ordered categorical so the columnar
    pipeline treats it as a category. Training resumes from a saved
    checkpoint when one exists, runs `iter` rounds of one epoch each, and
    checkpoints after every round.
    """
    df['merchant_id'] = df['merchant_id'].astype('category').cat.as_ordered()

    validation_idx = get_validation_index(df, frac=0.2, random=True)
    features, target, nas = proc_df(df, 'percent', do_scale=False)

    categorical = ['card_id', 'merchant_id']
    model_data = ColumnarModelData.from_data_frame(
        path, validation_idx, features, target.astype(np.float32),
        cat_flds=categorical, is_reg=True, is_multi=False, bs=128,
        test_df=None)
    emb_szs = get_embedding_sizes(categorical, df)
    learner = model_data.get_learner(
        emb_szs, 0, 0.04, 1, [1000, 500], [0.001, 0.01], y_range=(0.0, 1.0))
    # lr_find(learner)

    # Resume from an earlier checkpoint when available.
    try:
        learner.load('embedding_model')
    except FileNotFoundError:
        pass

    for epoch in range(iter):
        print(f'training iter {epoch}')
        learner.fit(1e-4, 1)
        learner.save('embedding_model')

    return learner
Example #2
0
def train_card_embeddings(df, iter=1):
    """Train the card embedding model on the transaction frame `df`.

    Rows with extreme purchase/lag values are filtered out, all
    categoricals except card_id are transformed (card_id is handled by the
    embedding itself), and training resumes from a saved checkpoint when
    one exists. Returns the fitted learner.
    """
    # We may use a smaller set of data to get a sense of the performance of
    # the model, comment out before final training
    # df = df.sample(frac=0.1)

    df = df[cat_vars + cont_vars + ['purchase_amount']]
    in_range = ((df.purchase_amount < 5)
                & (df.avg_purchases_lag12 < 5)
                & (df.avg_sales_lag12 < 5))
    df = df[in_range]
    df.reset_index(inplace=True, drop=True)
    validation_idx = get_validation_index(df, frac=0.25, random=False)

    # card_id gets its own embedding treatment, so transform the rest only.
    cats_without_card = [c for c in cat_vars if c != 'card_id']
    transform_columns(df, cats_without_card, cont_vars)

    features, target, nas, mapper = proc_df(df, 'purchase_amount',
                                            do_scale=True)

    model_data = ColumnarModelData.from_data_frame(
        PATH, validation_idx, features, target.astype(np.float32),
        cat_flds=cat_vars, is_reg=True, is_multi=False, bs=128, test_df=None)
    emb_szs = get_embedding_sizes(cat_vars, df)
    n_continuous = len(features.columns) - len(cat_vars)
    learner = model_data.get_learner(
        emb_szs, n_continuous, 0.5, 1, [50, 7], [0.5, 0.5],
        y_range=(-1.0, 0.0))

    # Resume from an earlier checkpoint when available.
    try:
        learner.load(MODEL)
    except FileNotFoundError:
        pass

    for epoch in range(iter):
        print(f'training iter {epoch}')
        learner.fit(1e-2, 1)
        learner.save(MODEL)

    return learner
Example #3
0
def train_with_card_embedding(debug):
    """Train the target model, seeding it with pretrained card embeddings.

    Loads the card/merchant frame plus train/test sets, obtains the card
    embedding learner (iter=0, so it only builds/loads a checkpoint), then
    trains a columnar model on card_id + first_active_month with the
    card_id embedding weights copied from the pretrained model and frozen.
    """
    c_m_df, train, test = load_all_category(PATH, debug)
    # iter=0: no additional training, just build/load the embedding model.
    card_learner = train_card_merchant_embeddings(c_m_df, PATH, 0)

    train_cat_flds = ['card_id', 'first_active_month']
    # Give train and test identical category codes for this column.
    set_common_categorical([train, test], 'first_active_month')
    # Placeholder target so proc_df can process the test frame.
    test['target'] = 0

    # Infinities break scaling; turn them into NaN so proc_df imputes them.
    train.replace([np.inf, -np.inf], np.nan, inplace=True)
    test.replace([np.inf, -np.inf], np.nan, inplace=True)
    train.reset_index(inplace=True, drop=True)
    train_x, train_y, nas, mapper = proc_df(train, 'target', do_scale=True)
    # Reuse train's mapper/na_dict so test is transformed identically.
    test_x, _, nas, mapper = proc_df(test,
                                     'target',
                                     do_scale=True,
                                     mapper=mapper,
                                     na_dict=nas)
    train_val_idx = get_validation_index(train, frac=0.25)
    md = ColumnarModelData.from_data_frame(PATH,
                                           train_val_idx,
                                           train_x,
                                           train_y.astype(np.float32),
                                           cat_flds=train_cat_flds,
                                           is_reg=True,
                                           bs=128,
                                           test_df=test_x)
    embedding_sizes = get_embedding_sizes(train_cat_flds, train)
    learner = md.get_learner(embedding_sizes,
                             len(train_x.columns) - len(train_cat_flds),
                             0.5,
                             1, [20, 5], [0.5, 0.5],
                             y_range=(-35.0, 20.0))

    # learner.lr_find()
    # learner.sched.plot(100)
    # Copy the pretrained card embedding weights and freeze them so the
    # fit below does not overwrite them. NOTE(review): assumes embs[0] is
    # the card_id embedding in both models (card_id is listed first in
    # both cat-field lists) — confirm against the model's emb ordering.
    learner.model.embs[0].weight = Parameter(
        card_learner.model.embs[0].weight.data.clone())
    learner.model.embs[0].weight.requires_grad = False

    learner.fit(1e-3, 10)
    predict_and_save(learner, test, 'base')

    print('done')

    return
Example #4
0
    def mutate(self, info, users, books, ratings):
        """Retrain the embedding-dot recommender from raw rating triples.

        Rejects the request when the three lists differ in length or there
        are fewer than 10 ratings; otherwise reindexes users/books to
        dense ids, trains in two LR phases, persists the weights and the
        table sizes, and reports success.
        """
        if len({len(users), len(books), len(ratings)}) != 1:
            return Retrain(ok=False)
        if len(users) < 10:
            return Retrain(ok=False)

        data = pd.DataFrame.from_dict(
            {'userID': users, 'bookID': books, 'rating': ratings})

        # Map raw ids onto dense 0..n-1 indices for the embedding tables.
        user2idx = {uid: idx for idx, uid in enumerate(data.userID.unique())}
        data.userID = data.userID.apply(lambda uid: user2idx[uid])

        book2idx = {bid: idx for idx, bid in enumerate(data.bookID.unique())}
        data.bookID = data.bookID.apply(lambda bid: book2idx[bid])

        n_users = int(data.userID.nunique())
        n_books = int(data.bookID.nunique())

        X = data.drop(['rating'], axis=1)
        y = data['rating'].astype(np.float32)

        val_idxs = get_cv_idxs(len(data))
        model_data = ColumnarModelData.from_data_frame(
            path, val_idxs, X, y, ['userID', 'bookID'], 64)

        N_FACTORS = 50
        WD = 1e-5
        model = EmbeddingDot(n_users, n_books, N_FACTORS)
        opt = optim.SGD(
            model.parameters(), 1e-1, weight_decay=WD, momentum=0.9)

        # Two training phases: start at 0.1, then drop to 0.01 to settle.
        fit(model, model_data, 20, opt, F.mse_loss)
        set_lrs(opt, 0.01)
        fit(model, model_data, 20, opt, F.mse_loss)

        # Persist weights plus the table sizes needed to rebuild the model.
        torch.save(model.state_dict(), 'bookweb-embed-dot.pth')
        with open('model-params.conf', 'w') as conf_file:
            conf_file.write(f'{n_users}\n{n_books}\n')

        return Retrain(ok=True)
def calculateTorchEmbeddingMatrix(emb_size, embedding_names, df, batch_size):
    """Learn a weekday embedding of width `emb_size` and return it as a frame.

    Trains a small MLP that predicts df['scaled_users'] from an embedded
    df['weekday'], then extracts the learned embedding matrix. The result
    has one row per weekday (0-6) with columns `embedding_names` plus a
    'weekday' column.

    Fix: the `batch_size` argument was previously ignored (batch size was
    hard-coded to 2); it is now passed through to the model data loader.
    """
    n_days = 7  # weekday cardinality

    val_idx = get_cv_idxs(len(df))
    data = ColumnarModelData.from_data_frame('', val_idx, df[['weekday']],
                                             df['scaled_users'], ['weekday'],
                                             batch_size)

    def get_emb(num_cat, num_emb):
        # Embedding initialised with small uniform weights.
        e = nn.Embedding(num_cat, num_emb)
        e.weight.data.uniform_(-0.01, 0.01)
        return e

    class weekdayEmbedding(nn.Module):
        def __init__(self, n_days):
            super().__init__()
            self.weekdays = get_emb(n_days, emb_size)
            self.lin1 = nn.Linear(emb_size, 40)
            self.lin2 = nn.Linear(40, 10)
            self.lin3 = nn.Linear(10, 1)
            #self.drop1 = nn.Dropout(0.5)

        def forward(self, cats, conts):
            weekdays = cats[:, 0]
            x = self.weekdays(weekdays)
            x = F.relu(self.lin1(x))
            x = F.relu(self.lin2(x))
            return self.lin3(x)

    model = weekdayEmbedding(n_days).cuda()
    opt = optim.Adam(model.parameters(), 1e-3)
    fit(model, data, 30, opt, F.mse_loss)
    fit(model, data, 30, opt, F.mse_loss)

    # Rows correspond to weekdays, columns to embedding dimensions.
    emb_matrix = model.weekdays.weight.data.cpu().numpy()

    emp_df = pd.DataFrame(emb_matrix, columns=embedding_names)
    emp_df['weekday'] = np.arange(n_days)
    # list(model.parameters())

    return emp_df
def calculateTorchManualEmbeddingMatrix(emb_size, embedding_names, df,
                                        batch_size):
    """Learn a weekday embedding via a plain Linear layer on one-hot input.

    Same idea as calculateTorchEmbeddingMatrix, but the embedding is
    implemented "manually": the weekday name is one-hot encoded with
    get_dummies and fed through nn.Linear(n_days, emb_size), whose
    transposed weight matrix is the learned embedding. Returns a frame
    with one row per weekday (0-6), columns `embedding_names` plus a
    'weekday' column.

    Fixes: the `batch_size` argument was previously ignored (batch size
    was hard-coded to 2) and an unused `cols` local has been removed.
    """
    n_days = 7  # weekday cardinality

    val_idx = get_cv_idxs(len(df))
    dummy_X = pd.get_dummies(df['weekday_name'])

    # No categorical fields: the one-hot columns go in as continuous input.
    data = ColumnarModelData.from_data_frame('', val_idx, dummy_X,
                                             df['scaled_users'], [],
                                             batch_size)

    class weekdayEmbeddingManual(nn.Module):
        def __init__(self, n_days):
            super().__init__()
            # A Linear layer applied to a one-hot vector acts as an
            # embedding lookup (plus bias).
            self.emb = nn.Linear(n_days, emb_size)
            self.lin1 = nn.Linear(emb_size, 40)
            self.lin2 = nn.Linear(40, 10)
            self.lin3 = nn.Linear(10, 1)
            #self.drop1 = nn.Dropout(0.5)

        def forward(self, cats, conts):
            x = self.emb(conts)
            x = F.relu(self.lin1(x))
            x = F.relu(self.lin2(x))
            return self.lin3(x)

    model = weekdayEmbeddingManual(n_days).cuda()
    opt = optim.Adam(model.parameters(), 1e-3)
    fit(model, data, 30, opt, F.mse_loss)
    fit(model, data, 30, opt, F.mse_loss)

    # Transpose so rows correspond to weekdays, columns to embedding dims.
    emb_matrix = np.transpose(model.emb.weight.data.cpu().numpy())

    emp_df = pd.DataFrame(emb_matrix, columns=embedding_names)
    emp_df['weekday'] = np.arange(n_days)
    # list(model.parameters())

    return emp_df
Example #7
0
def kfold_fc(train_df,
             test_df,
             num_folds,
             params,
             path,
             label_col,
             target_col,
             feats_excluded=None,
             out_cols=None,
             stratified=False,
             cat_cols=None,
             name=None):
    """Train a fully-connected columnar model with k-fold CV; save preds.

    Parameters:
        train_df, test_df: data frames; test_df[target_col] is overwritten
            with the fold-averaged predictions (both frames are mutated).
        num_folds: number of CV folds.
        params: hyperparameter dict (keys used: lr, epochs, layers,
            layers_drop, emb_drop, out_sz, metrics, early_stopping,
            lr_find, binary).
        path: working directory for model data and the prediction csv.
        label_col, target_col: id column and regression target column.
        feats_excluded: columns excluded from features; defaults to
            [label_col, target_col].
        out_cols: columns written to the csv; defaults to
            [label_col, target_col].
        stratified: use StratifiedKFold instead of KFold.
        cat_cols: categorical feature columns (embedded). Fix: was a
            mutable default argument ([]); now uses a None sentinel so no
            list object is shared across calls.
        name: checkpoint name prefix; when given, existing per-fold
            checkpoints are loaded before training.

    Side effects: writes {path}/fc_pred.csv and per-fold model files.
    """
    if cat_cols is None:
        cat_cols = []

    print("Starting FC. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))

    train_df[target_col] = train_df[target_col].astype(float)
    # Cross validation model
    if stratified:
        kf = StratifiedKFold(n_splits=num_folds,
                             shuffle=True,
                             random_state=326)
    else:
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    test_df[target_col] = 0

    if feats_excluded is None:
        feats_excluded = [label_col, target_col]
    feat_cols = [f for f in train_df.columns if f not in feats_excluded]
    if out_cols is None:
        out_cols = [label_col, target_col]
    print(f'features {feat_cols}')

    # Scale/impute once on train, then apply the same mapper to test.
    train_x, train_y, nas, mapper = proc_df(train_df,
                                            target_col,
                                            do_scale=True,
                                            skip_flds=[label_col])
    test_x, _, nas, mapper = proc_df(test_df,
                                     target_col,
                                     do_scale=True,
                                     mapper=mapper,
                                     na_dict=nas,
                                     skip_flds=[label_col])
    embedding_sizes = get_embedding_sizes(cat_cols, train_df)
    # Total width of all embedding outputs (each entry is (n_cat, width)).
    embedding_inputs = sum(embs[1] for embs in embedding_sizes)

    # Heuristic default layer width: cube root of the total input width.
    default_layer_size = max(
        2,
        int((embedding_inputs + len(train_x.columns) - len(cat_cols))**(1 /
                                                                        3)))
    y_range = [train_df[target_col].min(), train_df[target_col].max()]

    lr = params.get('lr', 1e-3)
    train_metrics = None
    param_metrics = params.get('metrics')
    if param_metrics is not None:
        train_metrics = [metrics_map[metric] for metric in param_metrics]

    for fold, (train_idx, valid_idx) in enumerate(
            kf.split(train_df[feat_cols], train_df[target_col])):
        print("Fold {}".format(fold))
        model_name = f'{name}-{fold}'

        md = ColumnarModelData.from_data_frame(path,
                                               valid_idx,
                                               train_x,
                                               train_y.astype(np.float32),
                                               cat_flds=cat_cols,
                                               is_reg=True,
                                               bs=128,
                                               test_df=test_x)

        learner = md.get_learner(
            embedding_sizes,
            len(train_x.columns) - len(cat_cols),
            params.get('emb_drop', 0.1),
            params.get('out_sz', 1),
            params.get('layers', [default_layer_size**2, default_layer_size]),
            params.get('layers_drop'),
            metrics=train_metrics,
            y_range=y_range)
        # LR finder plot only once, on the first fold, when requested.
        if fold == 0 and params.get('lr_find'):
            plt.figure(figsize=(8, 10))
            learner.lr_find()
            learner.sched.plot(100)
            plt.savefig('fc_lr_find.png')

        callback = SaveBestModel(learner, lr, model_name,
                                 params.get('early_stopping', 0))

        if params.get('binary', False):
            learner.crit = F.binary_cross_entropy

        # Resume from an earlier per-fold checkpoint when available.
        if name is not None:
            try:
                learner.load(model_name)
            except FileNotFoundError:
                pass

        learner.fit(lr, params.get('epochs', 20), callbacks=[callback])

        # load the best model
        print(
            f'Best epoch is {callback.best_epoch} loss {callback.best_loss} metric {callback.best_metric}'
        )
        learner.load(model_name)
        # Accumulate the per-fold prediction average on test.
        test_df.loc[:, target_col] += (
            learner.predict(is_test=True).reshape(len(test_df)) / kf.n_splits)

        # save submission file
    test_df.reset_index(inplace=True)
    test_df[out_cols].to_csv(f'{path}/fc_pred.csv', index=False)
Example #8
0
def main():
    """Train a Titanic-style survival model with entity embeddings.

    Builds a family-survival feature (avoiding target leakage on the
    training rows), converts categoricals, trains a fastai columnar model
    for 200 epochs, and writes predictions via predict_and_save.
    """
    tables = get_tables(PATH, ['train', 'test'])
    train, test = tables
    # Random 50% of training rows held out for validation.
    val_idx = train.sample(frac=0.5).index

    # Family survival totals from train are safe to apply to test as-is.
    family_survived = train[['LastName', 'Survived']].groupby('LastName').sum()
    add_family_survived(family_survived, test)

    # We can't train using the same family survived info in the training set because we would be cheating
    # by training on the result itself.
    family_count = train[['LastName', 'Survived']].groupby('LastName').count()
    # Last names with exactly one row carry no family signal beyond the
    # passenger's own label, so those rows are excluded below.
    remove_names = list(family_count[family_count['Survived'] == 1].index)
    remove_names_tuple = set([(x, ) for x in remove_names])
    train_index = train[~train[['LastName']].apply(tuple, 1).
                        isin(remove_names_tuple)].index
    add_family_survived_self(train, train_index, val_idx)

    # Not using last name or cabin directly right now - cardinality is too high
    cat_vars = ['Pclass', 'Sex', 'Embarked', 'Title', 'FamilySurvived']
    cont_vars = ['Age', 'SibSp', 'Parch', 'Fare']

    for v in cat_vars:
        train[v] = train[v].astype('category').cat.as_ordered()
    # Give test the same category codes as train.
    apply_cats(test, train)

    # Placeholder target so proc_df can process the test frame.
    test['Survived'] = 0
    train = train[cat_vars + cont_vars + ['Survived']]
    test = test[cat_vars + cont_vars + ['Survived', 'PassengerId']]

    df, y, nas, mapper = proc_df(train, 'Survived', do_scale=True)
    # Reuse train's mapper/na_dict so test is transformed identically.
    df_test, _, nas, mapper = proc_df(test,
                                      'Survived',
                                      do_scale=True,
                                      skip_flds=['PassengerId'],
                                      mapper=mapper,
                                      na_dict=nas)

    md = ColumnarModelData.from_data_frame(PATH,
                                           val_idx,
                                           df,
                                           y.astype(np.float32),
                                           cat_flds=cat_vars,
                                           is_reg=True,
                                           is_multi=False,
                                           bs=128,
                                           test_df=df_test)
    embedding_sizes = get_embedding_sizes(cat_vars, train)

    # Regression head squashed to (0, 1): output read as survival score.
    model = md.get_learner(embedding_sizes,
                           len(df.columns) - len(cat_vars),
                           0.5,
                           1, [10, 5], [0.5, 0.5],
                           y_range=(0, 1))
    model.summary()

    lr = 1e-3
    model.fit(lr, 200)

    # model.load('m-1')
    # model.fit(lr, 10)
    #
    predict_and_save(model, test, PATH, 'base')

    print('done')
Example #9
0
def train_with_card_embedding_inline():
    """Train the target model with word2vec merchant embeddings merged in.

    Joins the pretrained card embedding columns onto the feature frame,
    drops the usual excluded columns, and trains a fully-connected
    columnar model in ten rounds of 20 epochs, checkpointing and writing
    a prediction file after each round.
    """
    features_df = get_train_test_with_features()
    w2v_embeddings = load_word2vec_merchant_embeddings()
    features_df = features_df.merge(w2v_embeddings, on=['card_id'],
                                    how='left')

    # Infinities break scaling; turn them into NaN so proc_df imputes them.
    features_df.replace([np.inf, -np.inf], np.nan, inplace=True)
    with timer("split train & test"):
        # Rows with a target are train; rows without are test.
        train = features_df[features_df['target'].notnull()]
        test = features_df[features_df['target'].isnull()]
        del features_df
        gc.collect()

    set_to_float32(train)
    set_to_float32(test)
    FEATS_EXCLUDED = [
        'first_active_month', 'card_id', 'outliers', 'hist_purchase_date_max',
        'hist_purchase_date_min', 'hist_card_id_size', 'new_purchase_date_max',
        'new_purchase_date_min', 'new_card_id_size', 'OOF_PRED', 'month_0'
    ]
    keep_cols = [col for col in train.columns if col not in FEATS_EXCLUDED]
    train = train[keep_cols].reset_index()
    test = test[keep_cols].reset_index()

    # training and testing with the real train/test set
    train_cat_flds = []

    train_x, train_y, nas, mapper = proc_df(train,
                                            'target',
                                            do_scale=True,
                                            skip_flds=['card_id'])
    # Reuse train's mapper/na_dict so test is transformed identically.
    test_x, _, nas, mapper = proc_df(test,
                                     'target',
                                     do_scale=True,
                                     mapper=mapper,
                                     na_dict=nas,
                                     skip_flds=['card_id'])
    train_val_idx = get_validation_index(train, frac=0.25)
    model_data = ColumnarModelData.from_data_frame(
        PATH, train_val_idx, train_x, train_y.astype(np.float32),
        cat_flds=train_cat_flds, is_reg=True, bs=128, test_df=test_x)
    emb_szs = get_embedding_sizes(train_cat_flds, train)
    learner = model_data.get_learner(
        emb_szs, len(train_x.columns) - len(train_cat_flds), 0.1,
        1, [64, 8], [0.5, 0.5], y_range=(-35.0, 20.0))

    # learner.lr_find()
    # learner.sched.plot(100)
    # Resume from an earlier checkpoint when available.
    try:
        learner.load('w2v_card_embedding')
    except FileNotFoundError:
        pass

    for round_idx in range(10):
        learner.fit(1e-3, 20)
        learner.save(f'w2v_card_embedding_{round_idx}')

        predict_and_save(learner, test, f'base_{round_idx}')

    print('done')
Example #10
0
# Hyperparameters. Fix: these were previously assigned AFTER being used to
# build the model and data loader, which raises NameError at runtime; they
# must come first.
input_size, output_size, batch_size, epochs = 1, 1, 1, 3


def lin(a, b, x):
    """Affine function a * x + b."""
    return a * x + b


def gen_fake_data(n, a, b):
    """Generate n noisy samples of y = a*x + b with x ~ U(0, 1)."""
    # (Removed an unused alias binding `s` that shadowed nothing useful.)
    x = np.random.uniform(0, 1, n)
    y = lin(a, b, x) + 0.1 * np.random.normal(0, 3, n)
    return x, y


# Reshape the (x, y) arrays into column vectors for the model.
# NOTE(review): x and y are expected to exist already at this point
# (presumably produced by gen_fake_data earlier in the full file) —
# confirm against the surrounding script.
x = np.array(x).reshape((-1, 1))
y = np.array(y).reshape((-1, 1))


# Linear Regression Model
class model(nn.Module):
    def __init__(self, i, o):
        super().__init__()
        self.l = nn.Linear(i, o)

    def forward(self, x):
        return self.l(x)


m = model(input_size, output_size).cuda()
md = ColumnarModelData.from_arrays('.', [-1], x, y, bs=batch_size)
opt = optim.SGD(m.parameters(), 1e-4)
fit(m, md, epochs, opt, F.mse_loss)