def main():
    # load pkls
    df = read_pickles('../feats/sales_diff')
    df_calendar = loadpkl('../feats/calendar.pkl')
    df_sell_prices = loadpkl('../feats/sell_prices.pkl')

    # merge
    df = df.merge(df_calendar, on='d',how='left')
    df = df.merge(df_sell_prices, on=['store_id','item_id','wm_yr_wk'],how='left')

    del df_calendar, df_sell_prices
    gc.collect()

    # drop pre-release rows
    df = df[df['wm_yr_wk']>=df['release']]

    # make lag features
    df = make_lags(df,28)

    # label encoding
    cols_string = ['item_id','dept_id','cat_id','store_id','state_id']
    for c in cols_string:
        df[c], _ = pd.factorize(df[c])
        df[c] = df[c].replace(-1, np.nan)

    # add price features
    df_grouped = df[['id','sell_price']].groupby('id')['sell_price']
    df['shift_price_t1'] = df_grouped.transform(lambda x: x.shift(1))
    df['price_change_t1'] = (df['shift_price_t1'] - df['sell_price']) / (df['shift_price_t1'])
    df['rolling_price_max_t365'] = df_grouped.transform(lambda x: x.shift(1).rolling(365).max())
    df['price_change_t365'] = (df['rolling_price_max_t365'] - df['sell_price']) / (df['rolling_price_max_t365'])
    df['rolling_price_std_t7'] = df_grouped.transform(lambda x: x.rolling(7).std())
    df['rolling_price_std_t30'] = df_grouped.transform(lambda x: x.rolling(30).std())

    # features release date
    df['release'] = df['release'] - df['release'].min()

    # price momentum by month & year
    df['price_momentum_m'] = df['sell_price']/df.groupby(['store_id','item_id','month'])['sell_price'].transform('mean')
    df['price_momentum_y'] = df['sell_price']/df.groupby(['store_id','item_id','year'])['sell_price'].transform('mean')

    # days for CustomTimeSeriesSplitter
    df['d_numeric'] = df['d'].apply(lambda x: str(x)[2:]).astype(int)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # save as feather
    to_feature(df, '../feats/f105')

    # save feature name list
    features_json = {'features':df.columns.tolist()}
    to_json(features_json,'../configs/105_all_features_diff.json')

    # LINE notify
    line_notify('{} done.'.format(sys.argv[0]))
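# Toy illustration (not part of the original pipeline) of the groupby/transform
# pattern used for the price features above: shift(1) returns the previous
# price within each id, so lag and rolling windows never cross id boundaries.
import pandas as pd

toy = pd.DataFrame({'id': ['a', 'a', 'a', 'b', 'b'],
                    'sell_price': [1.0, 2.0, 4.0, 3.0, 3.0]})
toy_grouped = toy.groupby('id')['sell_price']
toy['shift_price_t1'] = toy_grouped.transform(lambda x: x.shift(1))
toy['price_change_t1'] = (toy['shift_price_t1'] - toy['sell_price']) / toy['shift_price_t1']
print(toy)  # the first row of each id gets NaN for the lagged columns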
Example #2
def main(debug=False, use_pkl=False):
    num_rows = 10000 if debug else None
    if use_pkl:
        df = loadpkl('../output/df.pkl')
    else:
        with timer("train & test"):
            df = train_test(num_rows)
        with timer("nightley"):
            df = pd.merge(df, nightley(num_rows), on=['datetime', 'park'], how='outer')
        with timer("hotlink"):
            df = pd.merge(df, hotlink(num_rows), on='datetime', how='outer')
        with timer("colopl"):
            df = pd.merge(df, colopl(num_rows), on=['park', 'year', 'month'], how='outer')
        with timer("weather"):
            df = pd.merge(df, weather(num_rows), on=['datetime', 'park'], how='outer')
        with timer("nied_oyama"):
            df = pd.merge(df, nied_oyama(num_rows), on=['datetime', 'park'], how='outer')
        with timer("agoop"):
            df = pd.merge(df, agoop(num_rows), on=['park', 'year','month'], how='outer')
        with timer("jorudan"):
            df = pd.merge(df, jorudan(num_rows), on=['datetime', 'park'], how='outer')
        with timer("save pkl"):
            save2pkl('../output/df.pkl', df)
    with timer("Run XGBoost with kfold"):
        print("df shape:", df.shape)
        feat_importance = kfold_xgboost(df, num_folds=NUM_FOLDS, stratified=True, debug=debug)
        display_importances(feat_importance, '../output/xgb_importances.png', '../output/feature_importance_xgb.csv')
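# Minimal sketch of the `timer` context manager used in the `with timer("...")`
# blocks above. This is an assumed implementation for illustration only; the
# project's own utils.timer may differ (e.g. in its log format).
import time
from contextlib import contextmanager

@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print("[{}] done in {:.0f} s".format(name, time.time() - t0))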
Example #3
    @classmethod
    def from_dict(cls, path: Union[str, Path], gz=False, mode=None):
        """Load a pickled dictionary from a file and construct a Vocab from it.
        There is no restriction on size or frequency: the given file is loaded
        as-is and used as the mapping.

        Parameters:
            path (str): Path to the file.
            gz (bool): Whether the file is gzip-compressed.
            mode (str or None): If 'i2w' or 'w2i', the loaded dict is used as the
                mapping of the given type and the other mapping is derived from it.
                If None, the file must contain both 'i2w' and 'w2i' mappings.
        """
        valid_modes = ['i2w', 'w2i']
        dic: dict = loadpkl(path, gz=gz)

        vocab = Vocab(symbols=[])
        if mode is None:
            vocab.w2i = dic['w2i']
            vocab.i2w = dic['i2w']
        elif mode in valid_modes:
            setattr(vocab, mode, dic)
            setattr(vocab, next(m for m in valid_modes if mode != m),
                    {v: k
                     for k, v in dic.items()})
        else:
            raise ValueError(f"Invalid mode '{mode}'")
        return vocab
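# Hedged usage sketch for Vocab.from_dict (file name and pickle contents are
# assumed): given a pickle that stores only an id-to-word (i2w) mapping,
# mode='i2w' installs it directly and derives the reverse w2i mapping.
#
#     rel_vocab = Vocab.from_dict('rel_names.pkl', mode='i2w')
#     some_id = next(iter(rel_vocab.i2w))
#     assert rel_vocab.w2i[rel_vocab.i2w[some_id]] == some_id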
def main(debug=False):
    with timer("Load Datasets"):
        # load pkl
        df = loadpkl('../features/queries_profiles.pkl')

        # use selected features
        df = df[configs['features']]

        # set sid as index
        df.set_index('sid', inplace=True)

        # split train & test
        train_df = df[df['click_mode'].notnull()]
        test_df = df[df['click_mode'].isnull()]
        del df
        gc.collect()

        if debug:
            train_df = train_df.iloc[:1000]

    with timer("Run LightGBM with kfold"):
        kfold_lightgbm(train_df,
                       test_df,
                       num_folds=NUM_FOLDS,
                       stratified=True,
                       debug=debug)
Example #5
    def _try_load_cache(self, path: Path) -> bool:
        r"""Try loading a cached dataset from the data directory.

        :param path: The path to data directory.
        :return: Whether loading was successful.
        """
        cache_dir = path / '_cached'
        if not cache_dir.exists():
            return False
        params_index_path = cache_dir / 'params_index.pkl'
        params_index: List[Dict[str, Any]] = loadpkl(params_index_path)
        params = self._get_params()
        index = next(
            (idx for idx, p in enumerate(params_index)
             if (cache_dir /
                 f'{idx}.pkl').exists() and self._compare_params(p, params)),
            -1)
        if index != -1:
            load_path = cache_dir / f'{index}.pkl'
            self.batches = loadpkl(load_path)
            self.ntokens = {
                split: sum(batch.ntokens for _, batches in dataset
                           for batch in batches)
                for split, dataset in self.batches.items()
            }
            LOGGER.info(
                f"Cached dataset loaded from {load_path}, with settings: {params}"
            )
            # check for excluded keys and warn in case of mismatch
            load_params = params_index[index]
            for key in self.EXCLUDE_KEYS:
                if key in params or key in load_params:
                    current = params.get(key, "<does not exist>")
                    loaded = load_params.get(key, "<does not exist>")
                    if current != loaded:
                        LOGGER.info(
                            Logging.color(
                                'red', f"Ignored data param '{key}' mismatch "
                                f"(current: {current}, loaded: {loaded})"))
            return True
        return False
def main():
    # load the submission files
    sub = pd.read_csv("../input/sample_submit.tsv", sep='\t', header=None)
    sub_lgbm = pd.read_csv("../output/submission_lgbm.tsv",
                           sep='\t',
                           header=None)
    sub_xgb = pd.read_csv("../output/submission_xgb.tsv",
                          sep='\t',
                          header=None)

    # rename columns
    sub.columns = ['index', 'visitors']
    sub_lgbm.columns = ['index', 'visitors']
    sub_xgb.columns = ['index', 'visitors']

    # merge
    sub.loc[:, 'visitors'] = 0.5 * sub_lgbm['visitors'] + 0.5 * sub_xgb['visitors']

    del sub_lgbm, sub_xgb
    gc.collect()

    # load out-of-fold predictions
    oof_lgbm = pd.read_csv("../output/oof_lgbm.csv")
    oof_xgb = pd.read_csv("../output/oof_xgb.csv")
    oof_preds = 0.5 * oof_lgbm['OOF_PRED'] + 0.5 * oof_xgb['OOF_PRED']

    # load train_df
    train_df = loadpkl('../output/train_df.pkl')
    train_df = train_df.sort_values('index')

    # compute local CV score
    local_mae = mean_absolute_error(train_df['visitors'], oof_preds)

    # LINE notification
    line_notify('Blend Local MAE score %.6f' % local_mae)

    del oof_lgbm, oof_xgb
    gc.collect()

    # save submit file
    sub[['index',
         'visitors']].sort_values('index').to_csv(submission_file_name,
                                                  index=False,
                                                  header=False,
                                                  sep='\t')
Example #7
    def _save_cache(self, path: Path):
        r"""Save the dataset to cache in the data directory.

        :param path: The path to data directory.
        """
        cache_dir = path / '_cached'
        params_index_path = cache_dir / 'params_index.pkl'
        params = self._get_params()
        if not cache_dir.exists():
            cache_dir.mkdir()
            index = 0
            params_index = [params]
        else:
            params_index = loadpkl(params_index_path)
            index = next((idx for idx in range(len(params_index))
                          if not (cache_dir / f'{idx}.pkl').exists()),
                         len(params_index))
            params_index[index:(index + 1)] = [params]  # replace or append
        savepkl(params_index, params_index_path)
        load_path = cache_dir / f'{index}.pkl'
        savepkl(self.batches, load_path)
        LOGGER.info(f"Dataset cached to {load_path}, with settings: {params}")
Example #8
    def __init__(self, args: LMArguments, *, vocab_size: int,
                 rel_vocab_size: int, max_unkrel: int):
        self._rel_vocab_size = rel_vocab_size
        self._max_unkrel = max_unkrel
        self._kb_embed_dim = args.kb_embed_dim
        self._fact_embed_dim = args.kb_embed_dim * 2

        pred_input_dim = args.hidden_size + (self._fact_embed_dim if
                                             args.use_knowledge_embed else 0)
        super().__init__(args, vocab_size, pred_input_dim=pred_input_dim)

        self._num_layers = args.num_layers
        self._use_knowledge_embed = args.use_knowledge_embed
        self._train_relation_vec = args.train_relation_vec
        self._alias_disamb = args.alias_disamb_strategy

        self.selector = nn.Linear(self._pred_input_dim, 2)

        if args.alias_disamb_strategy is AliasDisamb.FastText:
            # Alias disambiguation using FastText.
            def _alias_path(name):
                path = Path(args.fasttext_model_path)
                return path.parent / (path.name + f'.{name}')

            self.alias_vec = torch.load(_alias_path('alias_vectors.pt')).to(
                self.device)
            if args.normalize_fasttext_embeds:
                self.alias_vec = self.alias_vec / torch.norm(
                    self.alias_vec, dim=1).unsqueeze(0)
            self.alias_list: List[str] = reverse_map(
                loadpkl(_alias_path('alias_dict.pkl')))
            self._alias_vec_dim = self.alias_vec.size(1)
            self.hid_to_alias = nn.Linear(self._pred_input_dim,
                                          self._alias_vec_dim)

        # Entity disambiguation using entity embeddings.
        self.entity_vec, relation_vec = utils.load_kb_embed(
            args.path, self.device)
        if self._train_relation_vec:
            rel_vocab_size, rel_embed_dim = relation_vec.size()
            self.relation_embed = nn.Embedding(rel_vocab_size, rel_embed_dim)
        else:
            self.relation_vec = relation_vec

        # (added 1) -2: anchor, -3: topic_itself, < -4: UNKs
        self.special_rel_vecs = nn.ParameterList([
            nn.Parameter(torch.Tensor(self._kb_embed_dim))
            for _ in range(max_unkrel + 2)
        ])
        self.unk_entity_vec = nn.Parameter(torch.Tensor(
            self._kb_embed_dim))  # -1

        def _(a, b):
            return a if a != -1 else b

        # Entity prediction
        if args.use_rel_mlp:
            self.hid_to_fact: nn.Module = utils.MLP(
                self._pred_input_dim,
                _(args.fact_key_mlp_hidden_dim, self._fact_embed_dim * 2),
                self._fact_embed_dim,
                dropout=args.dropout)
        else:
            self.hid_to_fact = nn.Linear(self._pred_input_dim,
                                         self._fact_embed_dim)

        self.reset_parameters()
Example #9
    def __init__(self, args: LMArguments, *, vocab_size: int,
                 rel_vocab_size: int, max_unkrel: int):
        self._word_embed_dim: int = args.embed_size
        self._hidden_dim: int = args.hidden_size
        self._rel_vocab_size = rel_vocab_size
        self._position_embed_dim: int = args.pos_embed_dim
        self._position_size: int = args.pos_embed_count
        self._kb_embed_dim: int = args.kb_embed_dim
        self._num_layers: int = args.num_layers
        self._dropout: float = args.dropout
        self._use_anchor_rels: bool = args.use_anchor
        self._alias_disamb = args.alias_disamb_strategy

        self._fact_embed_dim: int = self._kb_embed_dim * 2  # concat of TransE embeds for relation & object

        input_dim = self._fact_embed_dim + self._word_embed_dim + self._position_embed_dim
        pred_input_dim = self._hidden_dim + self._fact_embed_dim
        super().__init__(args,
                         vocab_size,
                         input_dim=input_dim,
                         pred_input_dim=pred_input_dim,
                         embed_dim=self._word_embed_dim)

        self._fact_sel_strategy = args.fact_sel_strategy
        self._mask_invalid_pos = args.mask_invalid_pos
        self._use_knowledge_embed = args.use_knowledge_embed

        def _(a, b):
            return a if a != -1 else b

        # All MLP hidden dim specs are copied from the original code
        # Sec 3.2.2 Fact Extraction
        self.fact_key_mlp = utils.MLP(
            self._hidden_dim +
            (self._fact_embed_dim if self._use_knowledge_embed else 0),
            _(args.fact_key_mlp_hidden_dim, self._fact_embed_dim * 2),
            self._fact_embed_dim,
            dropout=self._dropout)
        # Sec 3.2.3 Selecting Word Generation Source
        self.copy_predictor = utils.MLP(self._pred_input_dim,
                                        _(args.copy_mlp_hidden_dim,
                                          self._hidden_dim),
                                        1,
                                        dropout=self._dropout)
        self.pos_predictor = utils.MLP(self._pred_input_dim,
                                       _(args.pos_mlp_hidden_dim,
                                         self._position_embed_dim * 2),
                                       self._position_size,
                                       dropout=self._dropout)

        if args.alias_disamb_strategy is AliasDisamb.FastText:

            def _alias_path(name):
                path = Path(args.fasttext_model_path)
                return path.parent / (path.name + f'.{name}')

            self.alias_vec = torch.load(_alias_path('alias_vectors.pt')).to(
                self.device)
            if args.normalize_fasttext_embeds:
                self.alias_vec = self.alias_vec / torch.norm(
                    self.alias_vec, dim=1).unsqueeze(0)
            self.alias_list: List[str] = reverse_map(
                loadpkl(_alias_path('alias_dict.pkl')))
            self._alias_vec_dim = self.alias_vec.size(1)
            self.hid_to_alias = nn.Linear(self._pred_input_dim,
                                          self._alias_vec_dim)

        # Embeddings
        self.position_embed = nn.Embedding(self._position_size,
                                           self._position_embed_dim)

        # KB related
        self.entity_vec, self.relation_vec = utils.load_kb_embed(
            args.path, self.device)
        self.naf_vec = nn.Parameter(torch.Tensor(self._fact_embed_dim))  # -1
        # (added 1) -2: anchor, -3: topic_itself, < -4: UNKs
        self.special_rel_vecs = nn.ParameterList([
            nn.Parameter(torch.Tensor(self._kb_embed_dim))
            for _ in range(max_unkrel + 2)
        ])
        self.unk_entity_vec = nn.Parameter(torch.Tensor(
            self._kb_embed_dim))  # -1

        self.reset_parameters()
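# Minimal sketch of an MLP matching the utils.MLP call signature used above,
# i.e. MLP(input_dim, hidden_dim, output_dim, dropout=...). The hidden
# activation and layer count are assumptions for illustration; the project's
# actual utils.MLP may differ.
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.0):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.Tanh(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        return self.net(x)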
    rows = []
    for i in list(set(tables[0])):
        rows.append(len([k for k in tables[0] if k == i]))
    print(f"rows distribution: {rows}\n")
    plt.plot(rows)
    # plt.plot(range(20, 50), rows[20:50])
    plt.savefig('rows.png')

    unique_cells = get_cell_stats(all_tables)
    total_cells_count = 0
    print(f"unique_cells: {len(unique_cells)}\n")
    cell_len_over_20 = len([i for i in unique_cells if len(i.split(' ')) > 20])
    print(f"cell_len_over_20: {cell_len_over_20}\n")
    total_cells = reduce(
        lambda x, y: x+y, list(map(lambda x: x[0]*x[1], list(zip(tables[0], tables[1])))))
    print(f"total_cells: {total_cells}\n")


if __name__ == "__main__":
    # baseline_f = pd.read_csv('../global_data/features.csv')
    # tables_subset_3k = list(baseline_f['table_id'])
    # tables_subset = list(
    #     set(tables_subset_3k+random.sample(all_tables, 20000)))

    tables_subset = loadpkl('./data/postive_tables_set.pkl')
    read_all_tables = [read_table(js)['data'] for js in tables_subset]
    dataset_stats(read_all_tables)

    vocab = loadpkl('./data/vocab_2D_10-50_complete.pkl')
    print(f'vocab: {len(vocab)}\n')
Example #11
from sklearn.model_selection import KFold, StratifiedKFold

from preprocess import train_test, nightley, hotlink, colopl, weather, nied_oyama, jorudan, agoop
from utils import FEATS_EXCLUDED, NUM_FOLDS, loadpkl, line_notify

################################################################################
# hyperparameter optimization with Optuna
# reference: https://github.com/pfnet/optuna/blob/master/examples/lightgbm_simple.py
################################################################################

NUM_ROWS = None
USE_PKL = True

if USE_PKL:
    DF = loadpkl('../output/df.pkl')
else:
    DF = train_test(NUM_ROWS)
    DF = pd.merge(DF, nightley(NUM_ROWS), on=['datetime', 'park'], how='outer')
    DF = pd.merge(DF, hotlink(NUM_ROWS), on='datetime', how='outer')
    DF = pd.merge(DF, colopl(NUM_ROWS), on=['year', 'month'], how='outer')
    DF = pd.merge(DF, weather(NUM_ROWS), on=['datetime', 'park'], how='outer')
    DF = pd.merge(DF,
                  nied_oyama(NUM_ROWS),
                  on=['datetime', 'park'],
                  how='outer')
    DF = pd.merge(DF,
                  agoop(NUM_ROWS),
                  on=['park', 'year', 'month'],
                  how='outer')
    DF = pd.merge(DF, jorudan(NUM_ROWS), on=['datetime', 'park'], how='outer')
import numpy as np
import pandas as pd
import optuna
import lightgbm
import gc

from sklearn.model_selection import KFold, StratifiedKFold

from utils import FEATS_EXCLUDED, NUM_FOLDS, loadpkl, line_notify

################################################################################
# hyperparameter optimization with Optuna
# reference: https://github.com/pfnet/optuna/blob/master/examples/lightgbm_simple.py
################################################################################

# load datasets
TRAIN_DF = loadpkl('../output/train_df.pkl')
FEATS = [f for f in TRAIN_DF.columns if f not in FEATS_EXCLUDED]


def objective(trial):
    lgbm_train = lightgbm.Dataset(TRAIN_DF[FEATS],
                                  TRAIN_DF['target'],
                                  free_raw_data=False)

    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
Example #13
 savepkl(f'./data/yp_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', y_p)
 '''
 Generating Negative X, y dataset from the tables (1.3 x X_p)
 '''
 size = int(len(X_p) * 1.3)
 with Pool(40) as p:
     X_n = list(tqdm(p.imap(generate_neg_table, range(size)), total=size))
 p.close()
 p.join()
 X_n, y_n = data_prep_pipeline(X_n, '-')
 savepkl(f'./data/xn_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', X_n)
 savepkl(f'./data/yn_2D_{MAX_COL_LEN}-{MAX_ROW_LEN}.pkl', y_n)
 '''
 Generating word distribution dataframe
 '''
 X_n = loadpkl('./data/xn_2D_10-50.pkl')
 X_p = loadpkl('./data/xp_2D_10-50.pkl')
 X = np.hstack((X_n, X_p))
 print(X.shape, X_n.shape, X_p.shape)
 result = flatten_1_deg(flatten_1_deg(flatten_1_deg(X)))
 print('Adding queries...')
 query_l = [tokenize_str(i) for i in list(baseline_f['query'].unique())]
 query_l = flatten_1_deg(query_l)
 result += query_l
 print(result[:10])
 count = Counter(result)
 c = [[i, count[i]] for i in count.keys()]
 df = pd.DataFrame(c)
 df.sort_values(by=[1], ascending=False, inplace=True)
 df.to_csv('./data/word_distr_2D_complete.csv', index=False, columns=None)
 '''
Example #14
        print('Data preparation happening...')
        baseline_f = pd.read_csv('../global_data/features.csv')

        def t(baseline_f):
            baseline_f['table_tkn'] = baseline_f.table_id.apply(
                lambda x: tokenize_table(read_table(x)['data']))
            baseline_f['query_tkn'] = baseline_f['query'].apply(
                lambda x: tokenize_str(x))
            return baseline_f

        baseline_f = mp(baseline_f, t, 20)
        baseline_f.to_csv('./data/baseline_f_tq-tkn.csv', index=False)

    if args.path:
        config = Config()
        vocab = loadpkl(config['input_files']['vocab_path'])
        output_dir = f'./output/{args.path}'
        model_load = torch.load(os.path.join(output_dir, 'model.pt'))
        baseline_f = pd.read_csv(config['input_files']['baseline_f'])

        trec = TREC_data_prep(model=model_load, vocab=vocab)
        baseline_f = mp(df=baseline_f, func=trec.pipeline, num_partitions=20)
        baseline_f.drop(columns=['table_emb', 'query_emb'], inplace=True)
        # baseline_f.to_csv('./baseline_f_tq-emb_temp.csv', index=False)
        # baseline_f = pd.read_csv('./baseline_f_tq-emb_temp.csv')

        trec_path = os.path.join(output_dir, config['trec']['folder_name'])
        trec_model = TREC_model(data=baseline_f,
                                output_dir=trec_path,
                                config=config)
        trec_model.train()
def main(num_rows=None):
    # load pkls
    df = read_pickles('../features/plans')
    queries = loadpkl('../features/queries.pkl')
    profiles = loadpkl('../features/profiles.pkl')
    queries_pred = loadpkl('../features/queries_pred.pkl')
    queries_profiles_pred = loadpkl('../features/queries_profiles_pred.pkl')

    # merge
    df = pd.merge(df, queries, on=['sid', 'click_mode'], how='left')
    df = pd.merge(df, profiles, on='pid', how='left')
    df = pd.merge(df, queries_pred, on='sid', how='left')
    df = pd.merge(df, queries_profiles_pred, on='sid', how='left')

    del queries, profiles, queries_pred, queries_profiles_pred
    gc.collect()

    # reduce memory usage
    df = reduce_mem_usage(df)

    # count features
    df['pid_count'] = df['pid'].map(df['pid'].value_counts())

    # time diff
    df['plan_req_time_diff'] = (df['plan_time'] - df['req_time']).astype(int)

    # distance ratio
    cols_plan_distance = ['plan_{}_distance'.format(i) for i in range(0, 7)]

    for i, c in enumerate(cols_plan_distance):
        df['plan_queries_distance_ratio{}'.format(
            i)] = df[c] / df['queries_distance']
        df['plan_queries_distance_diff{}'.format(
            i)] = df[c] - df['queries_distance']

    # stats features for preds
    cols_pred_queries = ['pred_queries{}'.format(i) for i in range(0, 12)]
    cols_pred_queries_profiles = [
        'pred_queries_profiles{}'.format(i) for i in range(0, 12)
    ]

    df['pred_queries_mean'] = df[cols_pred_queries].mean(axis=1)
    df['pred_queries_sum'] = df[cols_pred_queries].sum(axis=1)
    df['pred_queries_max'] = df[cols_pred_queries].max(axis=1)
    df['pred_queries_min'] = df[cols_pred_queries].min(axis=1)
    df['pred_queries_var'] = df[cols_pred_queries].var(axis=1)
    df['pred_queries_skew'] = df[cols_pred_queries].skew(axis=1)

    df['pred_queries_profiles_mean'] = df[cols_pred_queries_profiles].mean(
        axis=1)
    df['pred_queries_profiles_sum'] = df[cols_pred_queries_profiles].sum(
        axis=1)
    df['pred_queries_profiles_max'] = df[cols_pred_queries_profiles].max(
        axis=1)
    df['pred_queries_profiles_min'] = df[cols_pred_queries_profiles].min(
        axis=1)
    df['pred_queries_profiles_var'] = df[cols_pred_queries_profiles].var(
        axis=1)
    df['pred_queries_profiles_skew'] = df[cols_pred_queries_profiles].skew(
        axis=1)

    # stats features for each classes
    print('stats features...')
    for i in tqdm(range(0, 12)):
        cols = [
            'pred_queries{}'.format(i), 'pred_queries_profiles{}'.format(i)
        ]
        df['pred_mean{}'.format(i)] = df[cols].mean(axis=1)
        df['pred_sum{}'.format(i)] = df[cols].sum(axis=1)
        df['pred_max{}'.format(i)] = df[cols].max(axis=1)
        df['pred_min{}'.format(i)] = df[cols].min(axis=1)
        df['pred_var{}'.format(i)] = df[cols].var(axis=1)
        df['pred_skew{}'.format(i)] = df[cols].skew(axis=1)

        cols_target = [c for c in df.columns if '_target_{}'.format(i) in c]
        df['target_mean{}'.format(i)] = df[cols_target].mean(axis=1)
        df['target_sum{}'.format(i)] = df[cols_target].sum(axis=1)
        df['target_max{}'.format(i)] = df[cols_target].max(axis=1)
        df['target_min{}'.format(i)] = df[cols_target].min(axis=1)
        df['target_var{}'.format(i)] = df[cols_target].var(axis=1)
        df['target_skew{}'.format(i)] = df[cols_target].skew(axis=1)

    # post processing
    cols_transport_mode = [
        'plan_{}_transport_mode'.format(i) for i in range(0, 7)
    ]
    print('post processing...')
    for i in tqdm(range(1, 12)):
        tmp = np.zeros(len(df))
        for c in cols_transport_mode:
            tmp += (df[c] == i).astype(int)

        cols_target = [c for c in df.columns if '_target_{}'.format(i) in c]
        for c in cols_target + [
                'pred_queries{}'.format(i), 'pred_queries_profiles{}'.format(i)
        ]:
            df[c] = df[c] * (tmp > 0)

    # reduce memory usage
    df = reduce_mem_usage(df)

    # split data by city
    df1 = df[df['y_o'] > 37.5]
    df2 = df[df['y_o'] < 27.5]
    df3 = df[df['x_o'] > 120.0]

    del df
    gc.collect()

    # cols for target encoding
    cols_target_encoding = [
        'plan_weekday', 'plan_hour', 'plan_is_holiday', 'plan_weekday_hour',
        'plan_is_holiday_hour', 'plan_num_plans', 'plan_num_free_plans',
        'x_o_round', 'y_o_round', 'x_d_round', 'y_d_round',
        'queries_distance_round'
    ]

    cols_ratio_plan = [
        'plan_price_distance_ratio_max_plan',
        'plan_price_distance_ratio_min_plan', 'plan_price_eta_ratio_max_plan',
        'plan_price_eta_ratio_min_plan', 'plan_distance_eta_ratio_max_plan',
        'plan_distance_eta_ratio_min_plan',
        'plan_price_distance_prod_max_plan', 'plan_price_eta_prod_max_plan',
        'plan_price_distance_prod_min_plan', 'plan_price_eta_prod_min_plan',
        'plan_distance_eta_prod_max_plan', 'plan_distance_eta_prod_min_plan',
        'plan_price_distance_eta_prod_max_plan',
        'plan_price_distance_eta_prod_min_plan',
        'plan_distance_ratio_0_max_plan', 'plan_distance_ratio_0_min_plan',
        'plan_price_ratio_0_max_plan', 'plan_price_ratio_0_min_plan',
        'plan_eta_ratio_0_max_plan', 'plan_eta_ratio_0_min_plan',
        'plan_price_distance_prod_ratio_0_max_plan',
        'plan_price_distance_prod_ratio_0_min_plan',
        'plan_price_eta_prod_ratio_0_max_plan',
        'plan_price_eta_prod_ratio_0_min_plan',
        'plan_distance_eta_prod_ratio_0_max_plan',
        'plan_distance_eta_prod_ratio_0_min_plan',
        'plan_price_distance_eta_prod_ratio_0_max_plan',
        'plan_price_distance_eta_prod_ratio_0_min_plan'
    ]

    cols_min_max_plan = [
        'plan_distance_max_plan', 'plan_distance_min_plan',
        'plan_price_max_plan', 'plan_price_min_plan', 'plan_eta_max_plan',
        'plan_eta_min_plan'
    ]

    cols_transport_mode = [
        'plan_{}_transport_mode'.format(i) for i in range(0, 7)
    ]

    cols_target_encoding = cols_target_encoding + cols_ratio_plan + cols_min_max_plan + cols_transport_mode + [
        'profile_k_means'
    ]

    # target encoding for each cities
    print('target encoding...')
    for i, df in tqdm(enumerate([df1, df2, df3])):

        # target encoding
        df = targetEncodingMultiClass(df, 'click_mode', cols_target_encoding)

        # change dtype
        for col in df.columns.tolist():
            if df[col].dtypes == 'float16':
                df[col] = df[col].astype(np.float32)

        # remove missing variables
        col_missing = removeMissingVariables(df, 0.75)
        df.drop(col_missing, axis=1, inplace=True)

        # remove correlated variables
        col_drop = removeCorrelatedVariables(df, 0.95)
        df.drop(col_drop, axis=1, inplace=True)

        # save as feather
        to_feature(df, '../features/feats{}'.format(i + 1))

        # save feature name list
        features_json = {'features': df.columns.tolist()}
        to_json(features_json,
                '../features/00{}_all_features.json'.format(i + 1))

        del df
        gc.collect()

    line_notify('{} finished.'.format(sys.argv[0]))
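# Generic sketch of the idea behind targetEncodingMultiClass used above: for
# each categorical column and each class k, encode the per-category mean of
# (click_mode == k). The project's real implementation is not shown here and
# may differ (for instance it may compute the means out-of-fold to avoid
# leakage); this is an assumed, simplified version for illustration.
import pandas as pd

def target_encode_multiclass(df, target, cols, n_classes=12):
    out = df.copy()
    for col in cols:
        for k in range(n_classes):
            out['{}_target_{}'.format(col, k)] = (
                (df[target] == k).groupby(df[col]).transform('mean'))
    return out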
Example #16
def main():
    # load predictions
    pred_lgbm = loadpkl('../features/lgbm_pred.pkl')
    pred_xgb = loadpkl('../features/xgb_pred.pkl')
    plans = loadpkl('../features/plans.pkl')

    # define columns name list
    cols_pred_lgbm = ['pred_lgbm_plans{}'.format(i) for i in range(0, 12)]
    cols_pred_xgb = ['pred_xgb_plans{}'.format(i) for i in range(0, 12)]
    cols_transport_mode = [
        'plan_{}_transport_mode'.format(i) for i in range(0, 7)
    ]

    # merge plans & pred
    pred = pred_lgbm[['sid', 'click_mode']]
    pred = pd.merge(pred,
                    plans[cols_transport_mode + ['sid', 'plan_num_plans']],
                    on='sid',
                    how='left')

    del plans
    gc.collect()

    # scaling predictions
    pred_lgbm[cols_pred_lgbm] = scalingPredictions(pred_lgbm[cols_pred_lgbm])
    pred_xgb[cols_pred_xgb] = scalingPredictions(pred_xgb[cols_pred_xgb])

    # reset index
    pred_lgbm.reset_index(inplace=True, drop=True)
    pred_xgb.reset_index(inplace=True, drop=True)

    # fill predictions for non-exist plans as zero
    for i in range(1, 12):
        tmp = np.zeros(len(pred))
        for c in cols_transport_mode:
            tmp += (pred[c] == i).astype(int)
        pred_lgbm['pred_lgbm_plans{}'.format(
            i)] = pred_lgbm['pred_lgbm_plans{}'.format(i)] * (tmp > 0)
        pred_xgb['pred_xgb_plans{}'.format(
            i)] = pred_xgb['pred_xgb_plans{}'.format(i)] * (tmp > 0)

    # get best weight for lgbm & xgboost
    oof_pred_lgbm = pred_lgbm[pred_lgbm['click_mode'].notnull()]
    oof_pred_xgb = pred_xgb[pred_xgb['click_mode'].notnull()]

    w = getBestWeights(oof_pred_lgbm.click_mode, oof_pred_lgbm, oof_pred_xgb,
                       '../imp/weight.png')

    # calc prediction for each class
    cols_pred = []
    for i in range(0, 12):
        pred['pred_{}'.format(i)] = w * pred_lgbm['pred_lgbm_plans{}'.format(
            i)] + (1.0 - w) * pred_xgb['pred_xgb_plans{}'.format(i)]
        cols_pred.append('pred_{}'.format(i))

    # get out of fold values
    oof_pred = pred[pred['click_mode'].notnull()]

    # get best multiples
    m4 = getBestMultiple(oof_pred, 'pred_4', cols_pred, '../imp/multiple4.png')
    pred['pred_4'] *= m4
    oof_pred['pred_4'] *= m4

    m0 = getBestMultiple(oof_pred, 'pred_0', cols_pred, '../imp/multiple0.png')
    pred['pred_0'] *= m0
    oof_pred['pred_0'] *= m0

    m3 = getBestMultiple(oof_pred, 'pred_3', cols_pred, '../imp/multiple3.png')
    pred['pred_3'] *= m3
    oof_pred['pred_3'] *= m3

    m6 = getBestMultiple(oof_pred, 'pred_6', cols_pred, '../imp/multiple6.png')
    pred['pred_6'] *= m6
    oof_pred['pred_6'] *= m6

    # get recommend mode
    pred['recommend_mode'] = np.argmax(pred[cols_pred].values, axis=1)

    # if number of plans == 1 and recommend mode != 0, set recommend mode to plan 0's mode.
    mask = (pred['plan_num_plans'] == 1) & (pred['recommend_mode'] != 0)
    pred.loc[mask, 'recommend_mode'] = pred.loc[mask, 'plan_0_transport_mode']

    # split train & test
    sub_pred = pred[pred['click_mode'].isnull()]
    oof_pred = pred[pred['click_mode'].notnull()]

    # out of fold score
    oof_f1_score = f1_score(oof_pred['click_mode'],
                            oof_pred['recommend_mode'],
                            average='weighted')

    # save csv
    oof_pred[['sid', 'click_mode', 'recommend_mode']].to_csv(oof_file_name,
                                                             index=False)
    sub_pred[['sid', 'recommend_mode']].to_csv(submission_file_name,
                                               index=False)

    # line notify
    line_notify('{} finished. f1 score: {}'.format(sys.argv[0], oof_f1_score))
Example #17
from glob import glob
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm

from utils import FEATS_EXCLUDED, loadpkl, line_notify, to_json

#==============================================================================
# hyper parameter optimization by optuna
# https://github.com/pfnet/optuna/blob/master/examples/lightgbm_simple.py
#==============================================================================

# load datasets
CONFIGS = json.load(open('../configs/101_lgbm_queries.json'))

# load feathers
DF = loadpkl('../features/queries.pkl')

# split train & test
TRAIN_DF = DF[DF['click_mode'].notnull()]
del DF
gc.collect()

# use selected features
TRAIN_DF = TRAIN_DF[CONFIGS['features']]

# set sid as index
TRAIN_DF.set_index('sid', inplace=True)

FEATS = [f for f in TRAIN_DF.columns if f not in FEATS_EXCLUDED]

def objective(trial):
Example #18
    def _extra_init(self, loaded_batches: bool):
        self.rel_vocab = Vocab.from_dict(self._path / 'rel_names.pkl',
                                         mode='i2w')
        self.vocab: Dict[str, Vocab] = {
            "word": self.word_vocab,
            "rel": self.rel_vocab
        }

        self.max_unkrel = max(
            (-rel_typ - 3 for rel_typ in self.rel_vocab.i2w if rel_typ < -3),
            default=0)

        if self._use_fasttext:

            def _alias_path(name):
                path = Path(self._fasttext_model_path)
                return path.parent / (path.name + f'.{name}')

            # gather all entity aliases and compute fastText embeddings
            alias_dict_path = _alias_path('alias_dict.pkl')
            if alias_dict_path.exists():
                alias_dict: Dict[str, int] = loadpkl(alias_dict_path)
                loaded = True
            else:
                alias_dict = defaultdict(lambda: len(alias_dict))
                loaded = False
            if not loaded_batches:
                for dataset in self.data.values():
                    for example in dataset:
                        for idx, rel in enumerate(
                                example.relations):  # type: ignore
                            example.relations[
                                idx] = rel._replace(  # type: ignore
                                    obj_alias=[
                                        alias_dict[s] for s in rel.obj_alias
                                    ])
            if not alias_dict_path.exists():
                alias_dict = dict(alias_dict)
                savepkl(alias_dict, alias_dict_path)

            alias_vectors_path = _alias_path('alias_vectors.pt')
            if not alias_vectors_path.exists() or not loaded:
                import fastText
                ft_model = fastText.load_model(self._fasttext_model_path)
                alias_vectors = []
                alias_list = utils.reverse_map(alias_dict)
                for alias in utils.progress(alias_list,
                                            desc="Building fastText vectors",
                                            ascii=True,
                                            ncols=80):
                    vectors = [
                        ft_model.get_word_vector(w) for w in alias.split()
                    ]
                    vectors = np.sum(vectors, axis=0).tolist()
                    alias_vectors.append(vectors)
                alias_vectors = torch.tensor(alias_vectors)
                torch.save(alias_vectors, alias_vectors_path)

        if not loaded_batches and (self._exclude_entity_disamb
                                   or self._exclude_alias_disamb):
            # no need to do this if batches are loaded
            if self._exclude_entity_disamb:
                # gather training set stats
                self.entity_count_per_type = self.gather_entity_stats(
                    self.data['train'])

            for dataset in self.data.values():
                for idx in range(len(dataset)):
                    dataset[idx] = self.remove_ambiguity(
                        dataset[idx], self._exclude_entity_disamb,
                        self._exclude_alias_disamb)
    parser.add_argument(
        "--comment", help="additional comments for simulation to be run."
    )
    return parser.parse_args()


if __name__ == '__main__':
    args = get_args()
    output_dir, config = utils.setup_simulation(args)
    model_params = config['model_params']
    input_files = config['input_files']
    trec_config = config['trec']

    torch.manual_seed(model_params['seed'])

    Xp = loadpkl(input_files['Xp_path'])
    yp = loadpkl(input_files['yp_path'])
    logger.info(f"Xp.shape: {Xp.shape}, yp.shape: {yp.shape}")
    # Xn = loadpkl(input_files['Xn_path'])
    # yn = loadpkl(input_files['yn_path'])
    # logger.info(f"Xn.shape: {Xn.shape}, yn.shape: {yn.shape}")
    vocab = loadpkl(input_files['vocab_path'])
    logger.info(f"len(vocab): {len(vocab)}")

    train_writer = make_writer(output_dir, 'train', config)
    test_writer = make_writer(output_dir, 'test', config)

    device = torch.device(
        f"cuda:{args.cuda_no}" if torch.cuda.is_available() else 'cpu')

    model = models.create_model(config['model_props']['type'], params=(
Example #20
from glob import glob
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm

from utils import FEATS_EXCLUDED, loadpkl, line_notify, to_json

#==============================================================================
# hyper parameter optimization by optuna
# https://github.com/pfnet/optuna/blob/master/examples/lightgbm_simple.py
#==============================================================================

# load datasets
CONFIGS = json.load(open('../configs/102_lgbm_queries_profiles.json'))

# load feathers
DF = loadpkl('../features/queries_profiles.pkl')

# split train & test
TRAIN_DF = DF[DF['click_mode'].notnull()]
del DF
gc.collect()

# use selected features
TRAIN_DF = TRAIN_DF[CONFIGS['features']]

# set sid as index
TRAIN_DF.set_index('sid', inplace=True)

FEATS = [f for f in TRAIN_DF.columns if f not in FEATS_EXCLUDED]

def objective(trial):
            tables[i] = np.vectorize(lambda y: w2i[y])(np.array(t)).tolist()
        return tables


if __name__ == '__main__':
    start = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument("-p",
                        "--pad_data_prep",
                        help="path for the scores",
                        action='store_true')
    args = parser.parse_args()

    config = Config()

    X = loadpkl("./data/xp_2D_10-50.pkl")
    y = loadpkl("./data/yp_2D_10-50.pkl")
    vocab = loadpkl(config['input_files']['vocab_path'])
    table_prep_params = config['table_prep_params']
    print(X.shape, y.shape, len(vocab))

    if args.pad_data_prep:

        def pad_table(table):
            rows = len(table)
            for row in table:
                for cell in row:
                    for i in range(
                            0,
                            table_prep_params['LENGTH_PER_CELL'] - len(cell)):
                        cell.append('<PAD>')
def main():
    # load predictions
    pred_lgbm1 = loadpkl('../features/lgbm_pred_1.pkl')
    pred_lgbm2 = loadpkl('../features/lgbm_pred_2.pkl')
    pred_lgbm3 = loadpkl('../features/lgbm_pred_3.pkl')
    plans = read_pickles('../features/plans')
    preds = [pred_lgbm1, pred_lgbm2, pred_lgbm3]

    # define columns name list
    cols_pred_lgbm = ['pred_lgbm_plans{}'.format(i) for i in range(0, 12)]
    cols_transport_mode = [
        'plan_{}_transport_mode'.format(i) for i in range(0, 7)
    ]

    # remove columns
    cols_drop = [
        c for c in plans.columns if c not in cols_transport_mode +
        ['sid', 'plan_num_plans', 'click_mode']
    ]
    plans.drop(cols_drop, axis=1, inplace=True)

    # postprocessing
    sub_preds = []
    oof_preds = []
    for i, pred_lgbm in enumerate(preds):

        # merge plans & pred
        pred = pred_lgbm[['sid', 'click_mode']]
        pred = pd.merge(pred,
                        plans[cols_transport_mode + ['sid', 'plan_num_plans']],
                        on='sid',
                        how='left')

        # scaling predictions
        pred_lgbm[cols_pred_lgbm] = scalingPredictions(
            pred_lgbm[cols_pred_lgbm])

        # reset index
        pred_lgbm.reset_index(inplace=True, drop=True)

        # fill predictions for non-exist plans as zero
        for j in range(1, 12):
            tmp = np.zeros(len(pred))
            for c in cols_transport_mode:
                tmp += (pred[c] == j).astype(int)
            pred_lgbm['pred_lgbm_plans{}'.format(
                j)] = pred_lgbm['pred_lgbm_plans{}'.format(j)] * (tmp > 0)

        # get best weight for lgbm & xgboost
        oof_pred_lgbm = pred_lgbm[pred_lgbm['click_mode'].notnull()]

        # calc prediction for each class
        cols_pred = []
        for j in range(0, 12):
            pred['pred_{}'.format(j)] = pred_lgbm['pred_lgbm_plans{}'.format(
                j)]
            cols_pred.append('pred_{}'.format(j))

        # get out of fold values
        oof_pred = pred[pred['click_mode'].notnull()]

        # get best multiples
        m0 = getBestMultiple(oof_pred, 'pred_0', cols_pred,
                             '../imp/multiple0_{}.png'.format(i + 1))
        pred['pred_0'] *= m0
        oof_pred['pred_0'] *= m0

        m3 = getBestMultiple(oof_pred, 'pred_3', cols_pred,
                             '../imp/multiple3_{}.png'.format(i + 1))
        pred['pred_3'] *= m3
        oof_pred['pred_3'] *= m3

        m4 = getBestMultiple(oof_pred, 'pred_4', cols_pred,
                             '../imp/multiple4_{}.png'.format(i + 1))
        pred['pred_4'] *= m4
        oof_pred['pred_4'] *= m4

        # get recommend mode
        pred['recommend_mode'] = np.argmax(pred[cols_pred].values, axis=1)

        # if number of plans == 1 and recommend mode != 0, fill recommend mode with plan 0's mode.
        mask = (pred['plan_num_plans'] == 1) & (pred['recommend_mode'] != 0)
        pred.loc[mask, 'recommend_mode'] = pred.loc[mask, 'plan_0_transport_mode']

        # split train & test
        _sub_pred = pred[pred['click_mode'].isnull()]
        _oof_pred = pred[pred['click_mode'].notnull()]

        sub_preds.append(_sub_pred)
        oof_preds.append(_oof_pred)

        del pred, _sub_pred, _oof_pred
        gc.collect()

    # merge preds
    sub_pred = sub_preds[0].append(sub_preds[1])
    sub_pred = sub_pred.append(sub_preds[2])
    sub_pred = pd.merge(
        plans[plans['click_mode'].isnull()][['sid', 'click_mode']],
        sub_pred[['sid', 'recommend_mode']],
        on='sid',
        how='left')

    oof_pred = oof_preds[0].append(oof_preds[1])
    oof_pred = oof_pred.append(oof_preds[2])
    oof_pred = pd.merge(
        plans[plans['click_mode'].notnull()][['sid', 'click_mode']],
        oof_pred[['sid', 'recommend_mode']],
        on='sid',
        how='left')

    del sub_preds, oof_preds, plans

    # out of fold score
    oof_f1_score = f1_score(oof_pred['click_mode'],
                            oof_pred['recommend_mode'],
                            average='weighted')

    # save csv
    oof_pred[['sid', 'click_mode', 'recommend_mode']].to_csv(oof_file_name,
                                                             index=False)
    sub_pred[['sid', 'recommend_mode']].to_csv(submission_file_name,
                                               index=False)

    # line notify
    line_notify('{} finished. f1 score: {}'.format(sys.argv[0], oof_f1_score))