Example 1
File: io.py Project: adgirish/ray
def read_feather(path,
                 nthreads=1):

    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    port_frame = pd.read_feather(path)
    ray_frame = from_pandas(port_frame, get_npartitions())

    return ray_frame
Example 2
def read():
    """Read a data chunk from SciDB. Returns a Pandas DataFrame or None.

    """
    sz = struct.unpack('<Q', stdin.read(8))[0]

    if sz:
        df = pandas.read_feather(io.BytesIO(stdin.read(sz)))
        return df

    else:                       # Last Chunk
        return None
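
Example 2 reads a length-prefixed Feather stream: an 8-byte little-endian unsigned size, then that many bytes of Feather data, with a zero size marking the last chunk. Below is a minimal sketch of the matching writer side; the function names, the use of sys.stdout.buffer, and a pandas version whose to_feather accepts a file-like object are all assumptions, not part of the SciDB interface.

import io
import struct
import sys

import pandas as pd


def write_chunk(df, out=sys.stdout.buffer):
    # Hypothetical writer for the framing read() expects: serialize the
    # DataFrame to Feather in memory, then emit an 8-byte little-endian
    # size prefix followed by the payload.
    buf = io.BytesIO()
    df.reset_index(drop=True).to_feather(buf)  # Feather needs a default index
    payload = buf.getvalue()
    out.write(struct.pack('<Q', len(payload)))
    out.write(payload)


def write_last_chunk(out=sys.stdout.buffer):
    # A zero-length size prefix tells the reader this was the last chunk.
    out.write(struct.pack('<Q', 0))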
Example 3
def scan_to_table(input_table, genome, scoring, pwmfile=None, ncpus=None):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Can be either a tab-separated text file or a
        feather file.
    
    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a 
        genomepy genome name.
    
    scoring : str
        "count" or "score"
    
    pwmfile : str, optional
        Specify a PFM file for scanning.
    
    ncpus : int, optional
        If defined, this specifies the number of cores to use.
    
    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index. Values
        are either counts or scores, depending on the 'scoring' parameter.
    """
    config = MotifConfig()
    
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:,0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index
    
    regions = list(idx)
    s = Scanner(ncpus=ncpus)
    s.set_motifs(pwmfile)
    s.set_genome(genome)
    s.set_background(genome=genome)
    
    nregions = len(regions)

    scores = []
    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR)
        logger.info("creating count table")
        for row in s.count(regions):
            scores.append(row)
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        logger.info("creating score table")
        for row in s.best_score(regions, normalize=True):
            scores.append(row)
        logger.info("done")
   
    motif_names = [m.id for m in read_motifs(pwmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
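Example 4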
                files_.append(f1)
                break

    files = sorted(set(files) - set(files_))

if len(use_files) > 0:
    files_ = []
    for f1 in files:
        for f2 in use_files:
            if f2 in f1:
                files_.append(f1)
                break

    files = sorted(files_[:])

X = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)],
              axis=1)
y = utils.read_pickles('../data/label').TARGET

if X.columns.duplicated().sum() > 0:
    raise Exception(f'duplicated!: { X.columns[X.columns.duplicated()] }')
print('no dup :) ')
print(f'X.shape {X.shape}')

# =============================================================================
# lgb
# =============================================================================

dtrain = lgb.Dataset(X, y)
model = lgb.train(param, dtrain, 500)
Example 5
 def deserialize(self, value:bytes) -> pd.DataFrame:
     # recover a Python object from bytes
     df_bytes_io = BytesIO(value)
     return pd.read_feather(df_bytes_io)
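
Example 5's deserialize has an obvious counterpart. A minimal sketch of what the serialize side could look like, assuming a pandas/pyarrow version whose to_feather accepts a file-like object (the free-standing function is an illustration, not the project's actual API):

from io import BytesIO

import pandas as pd


def serialize(df: pd.DataFrame) -> bytes:
    # Write the frame into an in-memory buffer in Feather format.
    # Feather only supports a default RangeIndex, hence reset_index.
    buf = BytesIO()
    df.reset_index(drop=True).to_feather(buf)
    return buf.getvalue()

Example 6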
import sklearn.metrics as metrics

# files = glob.glob('data/processed/*.feather')
# files
# list_ = []
# for file in files:
#     df = pd.read_feather(file)
#     list_.append(df)
#     mdat = pd.concat(list_, sort=False)

# mdat = mdat.reset_index(drop=True)

# dat = mdat

# Full processed data
dat = pd.read_feather(
    'data/full_gfw_10d_effort_model_data_8DAY_2012-01-01_2016-12-26.feather')

dat = dat[dat.geartype == 'drifting_longlines']
dat = dat.sort_values('date')

# ~50% of obs are zero (remove?)
len(dat[dat.fishing_hours > 0]) / len(dat)
dat = dat[dat.fishing_hours > 0]

# If illegally operating inside EEZ (!= ARG)
dat.loc[:, 'illegal'] = np.where(
    ((dat['eez'] == True) & (dat['flag'] != 'ARG')), 1, 0)

# Convert true/false eez to 0/1
dat.loc[:, 'eez'] = dat.eez.astype('uint8')
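
The 'illegal' column above is a vectorized conditional. A tiny self-contained illustration of the same np.where pattern on made-up rows (not real GFW data):

import numpy as np
import pandas as pd

toy = pd.DataFrame({'eez': [True, True, False], 'flag': ['ARG', 'CHN', 'CHN']})
# 1 only when the vessel is inside an EEZ and is not Argentine-flagged:
# the three rows evaluate to 0, 1, 0 respectively.
toy['illegal'] = np.where((toy['eez'] == True) & (toy['flag'] != 'ARG'), 1, 0)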
Example 7
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 13 09:41:38 2019

@author: Kazuki
"""

import pandas as pd


sub = pd.read_csv('../input/sample_submission.csv.zip')
te_dt = pd.read_feather('../data/test_datetime.f')

pri = sub[te_dt.AvSigVersion>="2018-10-26"]

pri[['MachineIdentifier']].to_pickle('../data/pri_id.pkl')
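Example 8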
    #         'nthread': 32,
    'nthread': cpu_count(),
    'bagging_freq': 1,
    'verbose': -1,
    'seed': SEED
}

use_files = ['train_f0', 'train_f1']

# =============================================================================
# load
# =============================================================================

files = utils.get_use_files(use_files, True)

X = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)],
              axis=1)
y = utils.read_pickles('../data/label').TARGET

if X.columns.duplicated().sum() > 0:
    raise Exception(f'duplicated!: { X.columns[X.columns.duplicated()] }')
print('no dup :) ')
print(f'X.shape {X.shape}')

#X = X.rank(method='dense')
gc.collect()

CAT = list(set(X.columns) & set(utils_cat.ALL))
# =============================================================================
# cv
# =============================================================================
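Example 9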
# -*- coding: utf-8 -*-
"""
Created on Thu Dec  6 10:45:53 2018

@author: jiaxx
"""

import pandas as pd
import numpy as np

feat = pd.read_feather('../data/sparkling_meteo.feather')
glm = pd.read_feather('../data/Sparkling_temperatures.feather')

feat.columns
feat.values
feat.values.shape

## import previous data
#x_full_o = np.load('../../../../2017-2018/TF_test/TGDS/processed_features.npy')
#x_raw_full_o = np.load('../../../../2017-2018/TF_test/TGDS/features.npy')
#diag_full_o = np.load('../../../../2017-2018/TF_test/TGDS/diag.npy')
#obs_o = np.load('../../../../2017-2018/TF_test/TGDS/Obs_temp.npy')
#label_o = np.load('../../../../2017-2018/TF_test/TGDS/labels.npy')
#dates_o = np.load('../../../../2017-2018/TF_test/TGDS/dates.npy') # 10592

## SKIP THIS ------------ match with previous data
#new_date = feat.values[:,0]
#for i in range(dates.shape[0]):
#    if dates[i]==new_date[0]:
#        start_idx = i
#    elif dates[i]==new_date[-1]:
Example 10
def get_huc12_results(id, state):
    """Get results for CHAT Rank for a given HUC12.

    Parameters
    ----------
    id : str
        HUC12 ID
    state : str
        CHAT state

    Returns
    -------
    dict
        {
            "priorities": [...],
            "legend": [...],
            "analysis_notes": <analysis_notes>,
            "remainder": <acres outside of input>,
            "remainder_percent" <percent of total acres outside input>
        }
    """

    # 0 values not present for top-level rank
    values = INPUTS[f"{state}chat"]["values"][1:]
    columns = ["id", "total_acres", "analysis_acres", "chat_acres"] + [
        f'chatrank_{v["value"]}' for v in values
    ]

    df = pd.read_feather(out_dir / f"{state}chat.feather", columns=columns).set_index(
        "id"
    )
    if id not in df.index:
        return dict()

    row = df.loc[id]

    se_remainder = max(row.total_acres - row.analysis_acres, 0)
    se_remainder = se_remainder if se_remainder >= 1 else 0

    remainder = max(row.analysis_acres - row.chat_acres, 0)
    remainder = remainder if remainder >= 1 else 0

    priorities = []
    legend = []
    for value in values:
        acres = row[f'chatrank_{value["value"]}']
        priorities.append(
            {
                "value": value["value"],
                "label": value["label"],
                "blueprint": value["blueprint"],
                "acres": acres,
                "percent": 100 * acres / row.total_acres,
            }
        )

        legend.append({"label": value["label"], "color": value["color"]})

    return {
        "priorities": priorities,
        "legend": legend,
        "analysis_notes": get_analysis_notes(),
        "analysis_acres": row.analysis_acres,
        "total_acres": row.total_acres,
        "remainder": remainder,
        "remainder_percent": 100 * remainder / row.total_acres,
        "se_remainder": se_remainder,
        "se_remainder_percent": 100 * se_remainder / row.total_acres,
    }
Example 11
def merge_annual_stats(input_pd_files,
                       country_names_lut_file,
                       out_feather=None,
                       out_excel=None,
                       excel_sheet=None,
                       out_csv=None):
    rsgis_utils = rsgislib.RSGISPyUtils()
    country_names_luts = rsgis_utils.readJSON2Dict(country_names_lut_file)
    years = [
        '1996', '2007', '2008', '2009', '2010', '2015', '2016', '2017', '2018',
        '2019', '2020'
    ]
    year_info = dict()
    comb_df = None
    for year in years:
        year_info[year] = dict()
        for in_file in input_pd_files:
            if year in in_file:
                year_info[year]['year_file'] = in_file

        if 'year_file' in year_info[year]:
            yr_df = pandas.read_feather(year_info[year]['year_file'])
            yr_df = yr_df.rename(columns={
                'count': '{}_count'.format(year),
                'area': '{}_area'.format(year)
            })
            yr_df = yr_df.drop(['uid'], axis=1)
            if year == '1996':
                comb_df = yr_df
            else:
                comb_df = pandas.merge(left=comb_df,
                                       right=yr_df,
                                       left_on='region',
                                       right_on='region')

    if comb_df is not None:
        cnty_lst = list()
        for region in comb_df['region']:
            cnty_lst.append(country_names_luts['gid'][region])
        comb_df['name'] = cnty_lst

        comb_df = comb_df[[
            'region', 'name', '1996_count', '2007_count', '2008_count',
            '2009_count', '2010_count', '2015_count', '2016_count',
            '2017_count', '2018_count', '2019_count', '2020_count',
            '1996_area', '2007_area', '2008_area', '2009_area', '2010_area',
            '2015_area', '2016_area', '2017_area', '2018_area', '2019_area',
            '2020_area'
        ]]

        comb_df = comb_df.sort_values(by=['name']).reset_index()
        comb_df = comb_df.drop(['index'], axis=1)
        print(comb_df)

        if out_feather is not None:
            comb_df.to_feather(out_feather)
        if out_csv is not None:
            comb_df.to_csv(out_csv)
        if out_excel is not None:
            if excel_sheet is None:
                excel_sheet = 'gmw_stats'
            comb_df.to_excel(out_excel, sheet_name=excel_sheet)
Example 12
import torch.optim as optim
import torch
import torchvision as tv
import fastai.vision as faiv
import fastai.train as fait
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from tqdm import tqdm
import pandas as pd

import icae.toy.lone_point_models as models

#%%
path = "data/toy/1024/"
data_df = pd.read_feather(path + "desc.feather")
label_columns = list(data_df.columns[data_df.columns.str.contains("label_*")])
data = (faiv.ImageList.from_df(data_df,
                               path).split_by_rand_pct(0.01).label_from_df(
                                   cols=label_columns,
                                   label_cls=faiv.FloatList).databunch(bs=10))

shape = list(data.one_batch()[0].size())
model = models.SimpleClassifier(
    shape[2:],
    50,
    2,
    # 2,
    kernel=[3, 3],
    channel_progression=lambda x: x + 1,
    batch_normalization=True,
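Example 13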
        df = pd.DataFrame()
        df['SK_ID_CURR'] = g.SK_ID_CURR.max()
        df['first_nonzero_SK_DPD'] = g.SK_DPD.apply(first_nonzero)
        df['first_nonzero_SK_DPD_DEF'] = g.SK_DPD_DEF.apply(first_nonzero)
        df['first_nonzero_diff'] = df.first_nonzero_SK_DPD_DEF - df.first_nonzero_SK_DPD
        g = df.reset_index(drop=True).groupby('SK_ID_CURR')
        self.df = pd.concat([
            g.mean().rename(columns=lambda x: x+'_mean'),
            g.max().rename(columns=lambda x: x+'_max'),
        ], axis=1)


if __name__ == '__main__':
    args = get_arguments('POS CASH')
    with timer('load dataset'):
        train = pd.read_feather(TRAIN)[['SK_ID_CURR']]
        test = pd.read_feather(TEST)[['SK_ID_CURR']]
        pos = pd.read_feather(POS)
    
    with timer('preprocessing'):
        pos = pos.sort_values(['SK_ID_CURR', 'MONTHS_BALANCE']).reset_index(drop=True)
        pos.loc[:, pos.columns.str.startswith('SK_DPD')] = np.log1p(pos.filter(regex='^SK_DPD'))
    
    with timer('create dataset'):
        generate_features([
            PosNullCount('pos_null_count'),
            PosLatest('pos', 'latest'),
            PosCount(),
            PosDecay('pos'),
            PosMonthDuplicate('pos'),
            PosCompleteDecay('pos'),
Example 14
def load_dataframe(file_path):
    """Load a dataframe from a parquet file"""
    with open(file_path, 'rb') as file_obj:
        df = pd.read_feather(file_obj)
    return df
Example 15
def get_removed_dams():
    return pd.read_feather(data_dir / "removed_dams.feather")
Example 16
from pathlib import Path

import pandas as pd

from api.logger import log

data_dir = Path("data/api")

### Read source data into memory
# we can do this because the data do not consume much memory

try:
    dams = pd.read_feather(data_dir / "dams.feather").set_index("id")
    ranked_dams = dams.loc[dams.Ranked]

    barriers = pd.read_feather(data_dir /
                               "small_barriers.feather").set_index("id")
    ranked_barriers = barriers.loc[barriers.Ranked]

    print(
        f"Loaded {len(dams):,} dams ({len(ranked_dams):,} ranked), {len(barriers):,} barriers ({len(ranked_barriers):,} ranked) "
    )

except Exception as e:
    print("ERROR: not able to load data")
    log.error(e)


# on demand instead of in-memory
def get_removed_dams():
    return pd.read_feather(data_dir / "removed_dams.feather")
Example 17
from functools import partial

from glob import glob
from fastai.collab import *
from fastai.tabular import *

from scipy.spatial.distance import cosine
from server.constants import max_slice, min_slice, random_seed, batch_size, device
from server.model import build_model, build_learner, load_pretrained_embeddings
from server.preprocessor import AnimeRatingsDataset, load_dataset, build_databunch

np.random.seed(random_seed)


anime_df = pd.read_feather("model_resources/animes.feather").set_index(
    "anime_monotonic_id"
)

train_df, test_df = load_dataset()
databunch = build_databunch(train_df=train_df, test_df=test_df)

all_embeddings = load_pretrained_embeddings()
model = build_model(
    anime_genre_embeddings=all_embeddings["anime_with_genre_embeddings"]
)
learn = build_learner(model=model, databunch=databunch)


def sort_by_distance(record, anime_monotonic_id_embeddings, reverse=True):
    target_embedding = anime_monotonic_id_embeddings[
        record["target_anime_monotonic_id"]
Example 18
import os
import sys
import pandas as pd

from validate_utils import FoldValidation

import warnings
warnings.filterwarnings('ignore')

# ===============
# Settings
# ===============
fname = os.path.basename(sys.argv[0])
TRAIN_PATH = f'../data/input/train_data.feather'
OUTPUT_PATH = f'../folds/{fname.split(".")[0]}.feather'
N_FOLD = 5

# ===============
# Main
# ===============
df = pd.read_feather(TRAIN_PATH)
fold_validation = FoldValidation(df,
                                 stratify_arr=df['position'],
                                 fold_num=N_FOLD)
folds = fold_validation.make_split(valid_type='StratifiedKFold')
folds.to_feather(OUTPUT_PATH)
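Example 19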
def read_data(folder: Path):
    df_train = pd.read_feather(folder / "train.feather").set_index("id")
    df_test = pd.read_feather(folder / "test.feather").set_index("id")

    return df_train, df_test
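Example 20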
        df_train = pd.DataFrame(sc.fit_transform(X_train))
        df_test = pd.DataFrame(sc.transform(X_test))
        res_train, res_test = neighbors(df_train,
                                        df_test,
                                        train.TARGET,
                                        cv,
                                        k=5,
                                        n_trees=10)
        self.train = pd.DataFrame(res_train, columns=['neg', 'pos'])
        self.test = pd.DataFrame(res_test, columns=['neg', 'pos'])


if __name__ == '__main__':
    args = get_arguments('main')
    with timer('load dataset'):
        train = pd.read_feather(TRAIN)[['TARGET']]
        test = pd.read_feather(TEST)
        cv_id = pd.read_feather('../input/cv_id.ftr')
        cv = PredefinedSplit(cv_id)

        dfs = [
            pd.read_feather(str(f))
            for f in sorted(Path('../working/').glob('buro_*_train.ftr'))
        ]
        X_train = pd.concat(dfs, axis=1)  # type: pd.DataFrame
        dfs = [
            pd.read_feather(str(f))
            for f in sorted(Path('../working/').glob('buro_*_test.ftr'))
        ]
        X_test = pd.concat(dfs, axis=1)  # type: pd.DataFrame
Example 21
            self.test[f'area_{f}_mean_diff'] = test[f] - self.test[f'area_{f}_mean']
            self.test[f'area_{f}_median_diff'] = test[f] - self.test[f'area_{f}_median']
            self.test[f'area_{f}_mean_ratio'] = test[f] / self.test[f'area_{f}_mean']
            self.test[f'area_{f}_median_ratio'] = test[f] / self.test[f'area_{f}_median']


class MainRegionAsCategory(Feature):
    def create_features(self):
        self.train = train['REGION_POPULATION_RELATIVE'].astype(str).to_frame('region_as_category')
        self.test = test['REGION_POPULATION_RELATIVE'].astype(str).to_frame('region_as_category')


if __name__ == '__main__':
    args = get_arguments('main')
    with timer('load dataset'):
        train = pd.read_feather(TRAIN)
        test = pd.read_feather(TEST)
        X = pd.concat([
            train.drop('TARGET', axis=1),
            test
        ])  # type: pd.DataFrame
    
    with timer('preprocessing'):
        train.AMT_INCOME_TOTAL.replace(117000000.0, 1170000, inplace=True)
        train.replace({'Yes': 1, 'No': 0, 'Y': 1, 'N': 0, 'XAP': np.nan, 'XAN': np.nan}, inplace=True)
        test.replace({'Yes': 1, 'No': 0, 'Y': 1, 'N': 0, 'XAP': np.nan, 'XAN': np.nan}, inplace=True)
        day_cols = train.filter(regex='DAYS_').columns
        train[day_cols] = train[day_cols].replace(365243, np.nan)
        test[day_cols] = test[day_cols].replace(365243, np.nan)
    
    with timer('create dataset'):
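Example 22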
files_tr = sorted(glob('../data/train_f*.f'))

# USE_PREF
li = []
for i in files_tr:
    for j in USE_PREF:
        if j in i:
            li.append(i)
            break
files_tr = li

[print(i,f) for i,f in enumerate(files_tr)]

X_train = pd.concat([
                pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)
               ], axis=1)
y_train = utils.load_target()['HasDetections']

#X.drop(DROP, axis=1, inplace=True)

if X_train.columns.duplicated().sum()>0:
    raise Exception(f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
print('no dup :) ')
print(f'X_train.shape {X_train.shape}')

gc.collect()

CAT = list( set(X_train.columns)&set(utils_cat.ALL) )
print(f'CAT: {CAT}')
Example 23
from Dashboard.functions.load_figures import load_piechart
from Dashboard.functions.map import load_figure
from Dashboard.translations import layout_elements, feature_translations, level_options
from config import DASH_CACHE_DIR

log_ = logging.getLogger(__name__)

log_.info('Reading geo data...')
geolayers_all = {}
centers_all = {}
for level in ['city', 'neighborhood']:
    with open(os.path.join(DASH_CACHE_DIR,
                           level + '_geolayers.json')) as json_file:
        geolayers_all[level] = json.load(json_file)
    centers_all[level] = pd.read_feather(
        os.path.join(DASH_CACHE_DIR, level + '_centers.feather'))

log_.info('Reading finished...')


@app.callback(
    Output("choropleth", "figure"),
    [
        Input("level_selector", "value"),
        Input("color_selector", "value"),
        Input('date_selector', 'date'),
        Input('language_tab', 'value'),
    ],
)
def reload_graph(level, colorby, selected_date, lang):
    return load_figure(geolayers=geolayers_all[level],
Example 24
def getErrorTicks():
    file = "db/error.file"
    return (pd.read_feather(file) if os.path.exists(file) else pd.DataFrame(
        columns=["ErrorTicks"]))
def load_df_feather(filename, columns=None):
    if columns is None:
        df = pd.read_feather(filename, use_threads=True)
    else:
        df = pd.read_feather(filename, columns=columns, use_threads=True)
    return df
Example 26
def readFilteredTicks(file="filteredTicks"):
    file = f"db/{file}.file"
    return (pd.read_feather(file) if os.path.exists(file) else pd.DataFrame(
        columns=["Symbol", "Type"]))
Example 27
    """
    m = sp.sparse.load_npz(inpath)
    graph = m.dot(m.T)

    if outpath:
        sp.sparse.save_npz(outpath, graph)

    return graph


if __name__ == '__main__':

    # this is a pre-computed file of artist information
    ## this will be replaced in future releases with API queries to the Spotify Artist API from JavaScript
    rich_artists = pd.read_csv("data/2019-07-30/ranked_rich_artists.bsv",
                               sep="|",
                               index_col="id")

    # output from query.py
    index_id_map = pd.read_feather("data/2019-07-30/artist_indices.feather")
    index_id_map.columns = ["id", "index"]
    index_id_map.set_index("index", inplace=True)

    # output from query.py
    graph = artists_graph("data/2019-07-30/artists_to_playlists.npz")
    load_recommendations(rich_artists, graph, index_id_map, 0.0001)
    load_recommendations(rich_artists, graph, index_id_map, 0.0005)
    load_recommendations(rich_artists, graph, index_id_map, 0.0025)
    load_recommendations(rich_artists, graph, index_id_map, 0.01)
    load_recommendations(rich_artists, graph, index_id_map, 0.02)
Example 28
smoothing = '1400'

# %%
signal_path = \
    f'/faststorage/project/reprator/Andrej/reprator/data'\
    f'/dfs_TCGA_S_B/*.{smoothing}.fitted.fth'

paths = glob(signal_path)
print(f'There are {len(paths)} paths.')

# %%
main_df = pd.DataFrame()
labels = pd.Series()
for path in paths:
    # load data
    df = pd.read_feather(path)
    tcga_id = path.split('/')[-1].split('.')[0]
    gdc_id = tcga_id2gdc_id[tcga_id]
    ctype = tcga_id2ctype[tcga_id]
    # check if cor > 0.3
    cor = df.loc[:, ['bw_signal', 'tcga_signal']].corr().iloc[0, 1]
    print(f'{gdc_id}\t{cor}')
    if cor > 0.3:
        # subsample
        subsampled_signal = get_subsampled_signal(df)
        main_df[gdc_id] = subsampled_signal
        labels[gdc_id] = ctype

# %%
main_df = main_df.dropna()
main_df = main_df.T
Example 29
def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    model_id = "train_0"
    logger = get_logger()
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/split10/train_0.pickle"
    ).sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df[
        "prior_question_had_explanation"].fillna(-1)
    df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
    column_config = {
        ("content_id", "content_type_id"): {
            "type": "category"
        },
        "user_answer": {
            "type": "leakage_feature"
        },
        "answered_correctly": {
            "type": "leakage_feature"
        },
        "part": {
            "type": "category"
        },
        "prior_question_elapsed_time_bin300": {
            "type": "category"
        },
        "duration_previous_content_bin300": {
            "type": "category"
        },
        "prior_question_had_explanation": {
            "type": "category"
        },
        "rating_diff_content_user_id": {
            "type": "numeric"
        },
        "task_container_id_bin300": {
            "type": "category"
        },
        "previous_answer_index_content_id": {
            "type": "category"
        },
        "previous_answer_content_id": {
            "type": "category"
        },
        "timediff-elapsedtime_bin500": {
            "type": "category"
        }
    }

    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"][
            "UserContentRateEncoder"] = UserContentRateEncoder(
                rate_func="elo", column="user_id")
        feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2(
            groupby="user_id",
            column="content_id",
            is_debug=is_debug,
            model_id=model_id,
            n=300)
        feature_factory_dict["user_id"][
            "StudyTermEncoder2"] = StudyTermEncoder2(is_partial_fit=True)
        feature_factory_dict["user_id"][
            f"MeanAggregatorStudyTimebyUserId"] = MeanAggregator(
                column="user_id", agg_column="study_time", remove_now=False)

        feature_factory_dict["user_id"][
            "ElapsedTimeMeanByContentIdEncoder"] = ElapsedTimeMeanByContentIdEncoder(
            )
        feature_factory_dict["post"] = {
            "DurationFeaturePostProcess": DurationFeaturePostProcess()
        }

        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id=model_id,
            load_feature=not is_debug,
            save_feature=not is_debug)
        print("all_predict")
        df = feature_factory_manager.all_predict(df)

        def f(x):
            x = x // 1000
            if x < -100:
                return -100
            if x > 400:
                return 400
            return x

        df["task_container_id_bin300"] = [
            x if x < 300 else 300 for x in df["task_container_id"]
        ]
        df["timediff-elapsedtime_bin500"] = [
            f(x) for x in df["timediff-elapsedtime"].values
        ]
        df = df[[
            "user_id", "content_id", "content_type_id", "part", "user_answer",
            "answered_correctly", "prior_question_elapsed_time_bin300",
            "duration_previous_content_bin300",
            "prior_question_had_explanation", "rating_diff_content_user_id",
            "task_container_id_bin300", "previous_answer_index_content_id",
            "previous_answer_content_id", "row_id",
            "timediff-elapsedtime_bin500"
        ]]
        print(df.head(10))

        print("data preprocess")

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    ff_for_transformer.make_dict(df=df)
    n_skill = len(ff_for_transformer.embbed_dict[("content_id",
                                                  "content_type_id")])

    if not load_pickle or is_debug:
        df_val_row = pd.read_feather(
            "../../riiid_takoi/notebook/fe/validation_row_id.feather").head(
                len(df))
        if is_debug:
            df_val_row = df_val_row.head(3000)
        df_val_row["is_val"] = 1

        df = pd.merge(df, df_val_row, how="left", on="row_id")
        df["is_val"] = df["is_val"].fillna(0)

        print(df["is_val"].value_counts())

        w_df = df[df["is_val"] == 0]
        w_df["group"] = (
            w_df.groupby("user_id")["user_id"].transform("count") -
            w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(
            str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)

        dataset_train = SAKTDataset(group,
                                    n_skill=n_skill,
                                    max_seq=params["max_seq"])

        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model218", exist_ok=True)
    if not is_debug and not load_pickle:
        with open(f"../input/feature_engineering/model218/train.pickle",
                  "wb") as f:
            pickle.dump(dataset_train, f)
        with open(f"../input/feature_engineering/model218/val.pickle",
                  "wb") as f:
            pickle.dump(dataset_val, f)

    if not is_debug and load_pickle:
        with open(f"../input/feature_engineering/model218/train.pickle",
                  "rb") as f:
            dataset_train = pickle.load(f)
        with open(f"../input/feature_engineering/model218/val.pickle",
                  "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")
    dataloader_train = DataLoader(dataset_train,
                                  batch_size=params["batch_size"],
                                  shuffle=True)
    dataloader_val = DataLoader(dataset_val,
                                batch_size=params["batch_size"],
                                shuffle=False)

    model = SAKTModel(n_skill,
                      embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"],
                      dropout=dropout,
                      cont_emb=params["cont_emb"])

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=params["lr"],
        weight_decay=0.01,
    )
    num_train_optimization_steps = int(len(dataloader_train) * 20)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=params["num_warmup_steps"],
        num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train,
                                              dataloader_val, optimizer,
                                              criterion, scheduler, epoch,
                                              device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".
              format(epoch, loss, auc, auc_val))

    preds = []
    labels = []
    with torch.no_grad():
        for item in tqdm(dataloader_val):
            label = item["label"].to(device).float()
            output = model(item, device)

            preds.extend(torch.nn.Sigmoid()(
                output[:, -1]).view(-1).data.cpu().numpy().tolist())
            labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))
    df_oof = pd.DataFrame()
    # df_oof["row_id"] = df.loc[val_idx].index
    print(len(dataloader_val))
    print(len(preds))
    df_oof["predict"] = preds
    df_oof["target"] = labels

    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))

        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.end_run()
    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    torch.cuda.empty_cache()
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)
    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="all",
            load_feature=not is_debug,
            save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        df = pd.read_pickle(
            "../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(
                f"{output_dir}/feature_factory_manager_for_transformer.pickle",
                "wb") as f:
            pickle.dump(ff_for_transformer, f)
Example 30
 def load_frame(self, key, persist_feather=None):
     if persist_feather:
         self.cfg['data']['frames'][key] = persist_feather
     df = pd.read_feather(self.cfg['data']['frames'][key])
     self.data[key] = df
     return df
Example 31
def run_maelstrom(infile, genome, outdir, pwmfile=None, plot=True, cluster=False, 
        score_table=None, count_table=None, methods=None, ncpus=None):
    """Run maelstrom on an input table.
    
    Parameters
    ----------
    infile : str
        Filename of input table. Can be either a tab-separated text file or a
        feather file.
    
    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a 
        genomepy genome name.
    
    outdir : str
        Output directory for all results.

    pwmfile : str, optional
        Specify a PFM file for scanning.

    plot : bool, optional
        Create heatmaps.
    
    cluster : bool, optional
        If True and if the input table has more than one column, the data is
        clustered and the cluster activity methods are also run. Not 
        well-tested.
    
    score_table : str, optional
        Filename of pre-calculated table with motif scores.

    count_table : str, optional
        Filename of pre-calculated table with motif counts.

    methods : list, optional
        Activity methods to use. By default are all used.

    ncpus : int, optional
        If defined, this specifies the number of cores to use.
    """
    logger.info("Starting maelstrom")
    if infile.endswith("feather"):
        df = pd.read_feather(infile)
        df = df.set_index(df.columns[0])
    else:
        df = pd.read_table(infile, index_col=0, comment="#")
    
    # Check for duplicates
    if df.index.duplicated(keep=False).any():
        raise ValueError("Input file contains duplicate regions! "
                         "Please remove them.")
    
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    if methods is None:
        methods = Moap.list_predictors() 
    methods = [m.lower() for m in methods]

    shutil.copyfile(infile, os.path.join(outdir, "input.table.txt"))
    
    # Copy the motif information
    pwmfile = pwmfile_location(pwmfile) 
    if pwmfile:
        shutil.copy2(pwmfile, outdir)
        mapfile = re.sub(".p[fw]m$", ".motif2factors.txt", pwmfile)
        if os.path.exists(mapfile):
            shutil.copy2(mapfile, outdir)
    
    # Create a file with the number of motif matches
    if not count_table:
        count_table = os.path.join(outdir, "motif.count.txt.gz")
        if not os.path.exists(count_table):
            logger.info("Motif scanning (counts)")
            counts = scan_to_table(infile, genome, "count",
                pwmfile=pwmfile, ncpus=ncpus)
            counts.to_csv(count_table, sep="\t", compression="gzip")
        else:
            logger.info("Counts, using: %s", count_table)

    # Create a file with the score of the best motif match
    if not score_table:
        score_table = os.path.join(outdir, "motif.score.txt.gz")
        if not os.path.exists(score_table):
            logger.info("Motif scanning (scores)")
            scores = scan_to_table(infile, genome, "score",
                pwmfile=pwmfile, ncpus=ncpus)
            scores.to_csv(score_table, sep="\t", float_format="%.3f", 
                compression="gzip")
        else:
            logger.info("Scores, using: %s", score_table)

    if cluster:
        cluster = False
        for method in methods:
            m = Moap.create(method, ncpus=ncpus)
            if m.ptype == "classification":
                cluster = True
                break
        if not cluster:
            logger.info("Skipping clustering, no classification methods")
    
    exps = []
    clusterfile = infile
    if df.shape[1] != 1:
        # More than one column
        for method in Moap.list_regression_predictors():
            if method in methods:
                m = Moap.create(method, ncpus=ncpus)
                exps.append([method, m.pref_table, infile])
                logger.debug("Adding %s", method)

        if cluster:
            clusterfile = os.path.join(outdir,
                    os.path.basename(infile) + ".cluster.txt")
            
            df[:] = scale(df, axis=0)
            names = df.columns
            df_changed = pd.DataFrame(index=df.index)
            df_changed["cluster"] = np.nan
            for name in names:
                df_changed.loc[(df[name] - df.loc[:,df.columns != name].max(1)) > 0.5, "cluster"] = name
            df_changed.dropna().to_csv(clusterfile, sep="\t")
    if df.shape[1] == 1 or cluster:
        for method in Moap.list_classification_predictors():
            if method in methods:
                m = Moap.create(method, ncpus=ncpus)
                exps.append([method, m.pref_table, clusterfile])

    if len(exps) == 0:
        logger.error("No method to run.")
        sys.exit(1)

    for method, scoring, fname in exps:
        try:
            if scoring == "count" and count_table:
                moap_with_table(fname, count_table, outdir, method, scoring, ncpus=ncpus)
            elif scoring == "score" and score_table:
                moap_with_table(fname, score_table, outdir, method, scoring, ncpus=ncpus)
            else:
                moap_with_bg(fname, genome, outdir, method, scoring, pwmfile=pwmfile, ncpus=ncpus)
        
        except Exception as e:
            logger.warn("Method %s with scoring %s failed", method, scoring)
            logger.warn(e)
            logger.warn("Skipping")
            raise 
    dfs = {}
    for method, scoring,fname  in exps:
        t = "{}.{}".format(method,scoring)
        fname = os.path.join(outdir, "activity.{}.{}.out.txt".format(
                           method, scoring))
        try:
            dfs[t] = pd.read_table(fname, index_col=0, comment="#")
        except:
            logging.warn("Activity file for {} not found!\n".format(t))
   
    if len(methods) > 1:
        logger.info("Rank aggregation")
        df_p = df_rank_aggregation(df, dfs, exps)
        df_p.to_csv(os.path.join(outdir, "final.out.csv"), sep="\t")
    #df_p = df_p.join(m2f)

    # Write motif frequency table
    
    if df.shape[1] == 1:
        mcount = df.join(pd.read_table(count_table, index_col=0, comment="#"))
        m_group = mcount.groupby(df.columns[0])
        freq = m_group.sum() / m_group.count()
        freq.to_csv(os.path.join(outdir, "motif.freq.txt"), sep="\t")

    if plot and len(methods) > 1:
        logger.info("html report")
        maelstrom_html_report(
                outdir, 
                os.path.join(outdir, "final.out.csv"),
                pwmfile
                )
        logger.info(os.path.join(outdir, "gimme.maelstrom.report.html"))
Example 32
                     index_col=False,
                     names=headers,
                     dtype=dtypes,
                     parse_dates=['Date'])
    elapsed_read = perf_counter() - tstart
    df_bad = df[df.tFile.isnull()]
    if not df_bad.empty:
        print("Invalid lines, stop processing")
        print(df_bad)
        sys.exit()
    tstart = perf_counter()
    print("Saving cache")
    df.to_feather(cache_file_fp)
    elapsed_write = perf_counter() - tstart
    print(f'Done.  Time read={elapsed_read:.3f}, write={elapsed_write:.3f}')
else:
    print('Loading data from cache.')
    tstart = perf_counter()
    df = pd.read_feather(cache_file_fp)
    elapsed_read = perf_counter() - tstart
    print(f'Done.  Time read={elapsed_read:.3f}')

print(df.info())
print()

date_from = np.datetime64('2021-02-18 21:00:00')
dg: DataFrame = df[(df.Server != 'EU50TSVP412') & (df.Date >= date_from)]
dg.reset_index(inplace=True, drop=True)
print(dg.info())
print(dg.shape)
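
Example 32 is the tail of a parse-once, cache-as-Feather pattern: the CSV is parsed only when the Feather cache is missing, and later runs load the much faster cache. A minimal sketch of the full pattern, with assumed file names (log.csv, log.feather) standing in for the truncated setup above:

import os
from time import perf_counter

import pandas as pd

source_csv = 'log.csv'            # assumed raw input
cache_file_fp = 'log.feather'     # assumed cache location

if not os.path.exists(cache_file_fp):
    print('Parsing CSV and building cache.')
    tstart = perf_counter()
    df = pd.read_csv(source_csv, parse_dates=['Date'])
    df.to_feather(cache_file_fp)
    print(f'Done.  Time read={perf_counter() - tstart:.3f}')
else:
    print('Loading data from cache.')
    tstart = perf_counter()
    df = pd.read_feather(cache_file_fp)
    print(f'Done.  Time read={perf_counter() - tstart:.3f}')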
Example 33
warnings.filterwarnings('ignore')

with open('./configs/default.yaml', 'r') as yf:
    config = yaml.load(yf)

# ===============
# Settings
# ===============
INPUT_PATH = './data/input/train_transaction.feather'
OUTPUT_PATH = './folds/folds7.feather'
SEED = config['seed']
TARGET = 'isFraud'
START_DATE = config['start_date']

# ===============
# Main
# ===============
df = pd.read_feather(INPUT_PATH)
df['datetime_'] = df['TransactionDT'].apply(
    lambda x: (START_DATE + datetime.timedelta(seconds=x)))
df['weekofyear'] = (df['TransactionDT'] - 86400) // (3600 * 24 * 7)

split_groups = df['weekofyear']
week_unique = split_groups.unique()

df['fold_id'] = np.nan
for week in week_unique:
    val_idx = df[df['weekofyear'] == week].index
    df.loc[val_idx, 'fold_id'] = week

df[['fold_id']].astype('int').to_feather(OUTPUT_PATH)
Example 34
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import numpy as np
import pandas as pd
from tqdm import tqdm

# In[2]:

paper = pd.read_feather("../../../input/paper_input_final.ftr")

# In[3]:

paper['abst'] = paper['abst'].apply(lambda s: s.replace('no_content', ''))
paper['corp'] = paper['titl'] + ' ' + paper['keywords'].fillna('').replace(
    ';', ' ') + paper['abst']

# In[4]:

df_train = pd.read_feather("../../../input/tr_input_final.ftr")

# In[5]:

df_train.head()

# In[6]:

df_test = pd.read_feather("../../../input/te_input_final.ftr")
Example 35
    def test_read_feather(self):
        data = pd.read_feather("/input/tests/data/feather-0_3_1.feather")

        self.assertEqual(10, data.size)
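Example 36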
files_tr = sorted(glob('../data/train_f*.f'))

# USE_PREF
li = []
for i in files_tr:
    for j in USE_PREF:
        if j in i:
            li.append(i)
            break
files_tr = li

[print(i, f) for i, f in enumerate(files_tr)]

X_train = pd.concat(
    [pd.read_feather(f) for f in tqdm(files_tr, mininterval=30)] +
    [joblib.load('../external/X_train_nejumi.pkl.gz')],
    axis=1)

y_train = utils.load_target()['HasDetections']

# drop
if len(col_drop) > 0:
    X_train.drop(col_drop, axis=1, inplace=True)

if X_train.columns.duplicated().sum() > 0:
    raise Exception(
        f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
print('no dup :) ')
print(f'X_train.shape {X_train.shape}')