def read_feather(path, nthreads=1):
    warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
    port_frame = pd.read_feather(path)
    ray_frame = from_pandas(port_frame, get_npartitions())
    return ray_frame
def read():
    """Read a data chunk from SciDB.

    Returns a Pandas DataFrame or None.
    """
    sz = struct.unpack('<Q', stdin.read(8))[0]
    if sz:
        df = pandas.read_feather(io.BytesIO(stdin.read(sz)))
        return df
    else:
        # Last Chunk
        return None
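# Hedged sketch (not from the original source): a write() counterpart for the same
# streaming convention that read() decodes: an 8-byte little-endian length prefix
# followed by a feather-serialized chunk, with a zero length marking the last chunk.
# Assumptions: `stdout` is the matching binary stream (e.g. sys.stdout.buffer) and the
# installed pandas version lets DataFrame.to_feather write to a file-like object.
def write(df=None):
    """Write a Pandas DataFrame chunk (or the end-of-stream marker) to SciDB."""
    if df is None:
        # Zero-length header signals the last chunk.
        stdout.write(struct.pack('<Q', 0))
        stdout.flush()
        return
    buf = io.BytesIO()
    df.to_feather(buf)                           # serialize the chunk as feather bytes
    data = buf.getvalue()
    stdout.write(struct.pack('<Q', len(data)))   # little-endian uint64 length prefix
    stdout.write(data)
    stdout.flush()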
def scan_to_table(input_table, genome, scoring, pwmfile=None, ncpus=None):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Can be either a tab-separated text file or a
        feather file.
    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.
    scoring : str
        "count" or "score"
    pwmfile : str, optional
        Specify a PFM file for scanning.
    ncpus : int, optional
        If defined this specifies the number of cores to use.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index. Values
        are either counts or scores depending on the 'scoring' parameter.
    """
    config = MotifConfig()

    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:, 0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index

    regions = list(idx)
    s = Scanner(ncpus=ncpus)
    s.set_motifs(pwmfile)
    s.set_genome(genome)
    s.set_background(genome=genome)

    nregions = len(regions)

    scores = []
    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR)
        logger.info("creating count table")
        for row in s.count(regions):
            scores.append(row)
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        logger.info("creating score table")
        for row in s.best_score(regions, normalize=True):
            scores.append(row)
        logger.info("done")

    motif_names = [m.id for m in read_motifs(pwmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
            files_.append(f1)
            break
files = sorted(set(files) - set(files_))

if len(use_files) > 0:
    files_ = []
    for f1 in files:
        for f2 in use_files:
            if f2 in f1:
                files_.append(f1)
                break
    files = sorted(files_[:])

X = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)], axis=1)
y = utils.read_pickles('../data/label').TARGET

if X.columns.duplicated().sum() > 0:
    raise Exception(f'duplicated!: { X.columns[X.columns.duplicated()] }')
print('no dup :) ')
print(f'X.shape {X.shape}')

# =============================================================================
# lgb
# =============================================================================

dtrain = lgb.Dataset(X, y)
model = lgb.train(param, dtrain, 500)
def deserialize(self, value: bytes) -> pd.DataFrame:
    # recover a Python object from bytes
    df_bytes_io = BytesIO(value)
    return pd.read_feather(df_bytes_io)
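# Hedged sketch (not part of the original class): the matching serialize() method,
# assuming a pandas version whose DataFrame.to_feather accepts a file-like object.
def serialize(self, df: pd.DataFrame) -> bytes:
    # encode a DataFrame as feather bytes; the inverse of deserialize()
    buf = BytesIO()
    # feather cannot store a non-default index, so reset it defensively
    df.reset_index(drop=True).to_feather(buf)
    return buf.getvalue()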
import sklearn.metrics as metrics

# files = glob.glob('data/processed/*.feather')
# files
# list_ = []
# for file in files:
#     df = pd.read_feather(file)
#     list_.append(df)
# mdat = pd.concat(list_, sort=False)
# mdat = mdat.reset_index(drop=True)
# dat = mdat

# Full processed data
dat = pd.read_feather(
    'data/full_gfw_10d_effort_model_data_8DAY_2012-01-01_2016-12-26.feather')

dat = dat[dat.geartype == 'drifting_longlines']
dat = dat.sort_values('date')

# ~50% of obs are zero (remove?)
len(dat[dat.fishing_hours > 0]) / len(dat)
dat = dat[dat.fishing_hours > 0]

# If illegally operating inside EEZ (!= ARG)
dat.loc[:, 'illegal'] = np.where(
    ((dat['eez'] == True) & (dat['flag'] != 'ARG')), 1, 0)

# Convert true/false eez to 0/1
dat.loc[:, 'eez'] = dat.eez.astype('uint8')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 13 09:41:38 2019

@author: Kazuki
"""

import pandas as pd

sub = pd.read_csv('../input/sample_submission.csv.zip')
te_dt = pd.read_feather('../data/test_datetime.f')

pri = sub[te_dt.AvSigVersion >= "2018-10-26"]

pri[['MachineIdentifier']].to_pickle('../data/pri_id.pkl')
#   'nthread': 32,
    'nthread': cpu_count(),
    'bagging_freq': 1,
    'verbose': -1,
    'seed': SEED
}

use_files = ['train_f0', 'train_f1']

# =============================================================================
# load
# =============================================================================
files = utils.get_use_files(use_files, True)

X = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)], axis=1)
y = utils.read_pickles('../data/label').TARGET

if X.columns.duplicated().sum() > 0:
    raise Exception(f'duplicated!: { X.columns[X.columns.duplicated()] }')
print('no dup :) ')
print(f'X.shape {X.shape}')

#X = X.rank(method='dense')
gc.collect()

CAT = list(set(X.columns) & set(utils_cat.ALL))

# =============================================================================
# cv
# =============================================================================
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 6 10:45:53 2018

@author: jiaxx
"""

import pandas as pd
import numpy as np

feat = pd.read_feather('../data/sparkling_meteo.feather')
glm = pd.read_feather('../data/Sparkling_temperatures.feather')

feat.columns
feat.values
feat.values.shape

## import previous data
#x_full_o = np.load('../../../../2017-2018/TF_test/TGDS/processed_features.npy')
#x_raw_full_o = np.load('../../../../2017-2018/TF_test/TGDS/features.npy')
#diag_full_o = np.load('../../../../2017-2018/TF_test/TGDS/diag.npy')
#obs_o = np.load('../../../../2017-2018/TF_test/TGDS/Obs_temp.npy')
#label_o = np.load('../../../../2017-2018/TF_test/TGDS/labels.npy')
#dates_o = np.load('../../../../2017-2018/TF_test/TGDS/dates.npy')  # 10592

## SKIP THIS ------------ match with previous data
#new_date = feat.values[:,0]
#for i in range(dates.shape[0]):
#    if dates[i]==new_date[0]:
#        start_idx = i
#    elif dates[i]==new_date[-1]:
def get_huc12_results(id, state):
    """Get results for CHAT Rank for a given HUC12.

    Parameters
    ----------
    id : str
        HUC12 ID
    state : str
        CHAT state

    Returns
    -------
    dict
        {
            "priorities": [...],
            "legend": [...],
            "analysis_notes": <analysis_notes>,
            "remainder": <acres outside of input>,
            "remainder_percent": <percent of total acres outside input>
        }
    """
    # 0 values not present for top-level rank
    values = INPUTS[f"{state}chat"]["values"][1:]

    columns = ["id", "total_acres", "analysis_acres", "chat_acres"] + [
        f'chatrank_{v["value"]}' for v in values
    ]

    df = pd.read_feather(out_dir / f"{state}chat.feather", columns=columns).set_index(
        "id"
    )

    if id not in df.index:
        return dict()

    row = df.loc[id]

    se_remainder = max(row.total_acres - row.analysis_acres, 0)
    se_remainder = se_remainder if se_remainder >= 1 else 0

    remainder = max(row.analysis_acres - row.chat_acres, 0)
    remainder = remainder if remainder >= 1 else 0

    priorities = []
    legend = []
    for value in values:
        acres = row[f'chatrank_{value["value"]}']
        priorities.append(
            {
                "value": value["value"],
                "label": value["label"],
                "blueprint": value["blueprint"],
                "acres": acres,
                "percent": 100 * acres / row.total_acres,
            }
        )
        legend.append({"label": value["label"], "color": value["color"]})

    return {
        "priorities": priorities,
        "legend": legend,
        "analysis_notes": get_analysis_notes(),
        "analysis_acres": row.analysis_acres,
        "total_acres": row.total_acres,
        "remainder": remainder,
        "remainder_percent": 100 * remainder / row.total_acres,
        "se_remainder": se_remainder,
        "se_remainder_percent": 100 * se_remainder / row.total_acres,
    }
def merge_annual_stats(input_pd_files, country_names_lut_file, out_feather=None,
                       out_excel=None, excel_sheet=None, out_csv=None):
    rsgis_utils = rsgislib.RSGISPyUtils()
    country_names_luts = rsgis_utils.readJSON2Dict(country_names_lut_file)

    years = ['1996', '2007', '2008', '2009', '2010', '2015', '2016', '2017',
             '2018', '2019', '2020']
    year_info = dict()
    comb_df = None
    for year in years:
        year_info[year] = dict()
        for in_file in input_pd_files:
            if year in in_file:
                year_info[year]['year_file'] = in_file
        if 'year_file' in year_info[year]:
            yr_df = pandas.read_feather(year_info[year]['year_file'])
            yr_df = yr_df.rename(columns={'count': '{}_count'.format(year),
                                          'area': '{}_area'.format(year)})
            yr_df = yr_df.drop(['uid'], axis=1)
            if year == '1996':
                comb_df = yr_df
            else:
                comb_df = pandas.merge(left=comb_df, right=yr_df,
                                       left_on='region', right_on='region')

    if comb_df is not None:
        cnty_lst = list()
        for region in comb_df['region']:
            cnty_lst.append(country_names_luts['gid'][region])
        comb_df['name'] = cnty_lst

        comb_df = comb_df[['region', 'name',
                           '1996_count', '2007_count', '2008_count', '2009_count',
                           '2010_count', '2015_count', '2016_count', '2017_count',
                           '2018_count', '2019_count', '2020_count',
                           '1996_area', '2007_area', '2008_area', '2009_area',
                           '2010_area', '2015_area', '2016_area', '2017_area',
                           '2018_area', '2019_area', '2020_area']]

        comb_df = comb_df.sort_values(by=['name']).reset_index()
        comb_df = comb_df.drop(['index'], axis=1)
        print(comb_df)
        if out_feather is not None:
            comb_df.to_feather(out_feather)
        if out_csv is not None:
            comb_df.to_csv(out_csv)
        if out_excel is not None:
            if excel_sheet is None:
                excel_sheet = 'gmw_stats'
            comb_df.to_excel(out_excel, sheet_name=excel_sheet)
import torch.optim as optim
import torch
import torchvision as tv
import fastai.vision as faiv
import fastai.train as fait
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from tqdm import tqdm
import pandas as pd

import icae.toy.lone_point_models as models

#%%
path = "data/toy/1024/"
data_df = pd.read_feather(path + "desc.feather")
label_columns = list(data_df.columns[data_df.columns.str.contains("label_*")])

data = (faiv.ImageList.from_df(data_df, path).split_by_rand_pct(0.01).label_from_df(
    cols=label_columns, label_cls=faiv.FloatList).databunch(bs=10))

shape = list(data.one_batch()[0].size())

model = models.SimpleClassifier(
    shape[2:],
    50,
    2,  # 2,
    kernel=[3, 3],
    channel_progression=lambda x: x + 1,
    batch_normalization=True,
        df = pd.DataFrame()
        df['SK_ID_CURR'] = g.SK_ID_CURR.max()
        df['first_nonzero_SK_DPD'] = g.SK_DPD.apply(first_nonzero)
        df['first_nonzero_SK_DPD_DEF'] = g.SK_DPD_DEF.apply(first_nonzero)
        df['first_nonzero_diff'] = df.first_nonzero_SK_DPD_DEF - df.first_nonzero_SK_DPD
        g = df.reset_index(drop=True).groupby('SK_ID_CURR')
        self.df = pd.concat([
            g.mean().rename(columns=lambda x: x + '_mean'),
            g.max().rename(columns=lambda x: x + '_max'),
        ], axis=1)


if __name__ == '__main__':
    args = get_arguments('POS CASH')
    with timer('load dataset'):
        train = pd.read_feather(TRAIN)[['SK_ID_CURR']]
        test = pd.read_feather(TEST)[['SK_ID_CURR']]
        pos = pd.read_feather(POS)

    with timer('preprocessing'):
        pos = pos.sort_values(['SK_ID_CURR', 'MONTHS_BALANCE']).reset_index(drop=True)
        pos.loc[:, pos.columns.str.startswith('SK_DPD')] = np.log1p(pos.filter(regex='^SK_DPD'))

    with timer('create dataset'):
        generate_features([
            PosNullCount('pos_null_count'),
            PosLatest('pos', 'latest'),
            PosCount(),
            PosDecay('pos'),
            PosMonthDuplicate('pos'),
            PosCompleteDecay('pos'),
def load_dataframe(file_path):
    """Load a dataframe from a feather file"""
    with open(file_path, 'rb') as file_obj:
        df = pd.read_feather(file_obj)
    return df
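# Hedged usage note (the file name below is hypothetical): pandas.read_feather also
# accepts a path directly, so the open() wrapper above is only needed when a binary
# file object is already at hand.
#
#     df = load_dataframe("example.feather")      # via the helper
#     df = pd.read_feather("example.feather")     # equivalent for a plain path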
def get_removed_dams():
    return pd.read_feather(data_dir / "removed_dams.feather")
from pathlib import Path

import pandas as pd

from api.logger import log

data_dir = Path("data/api")

### Read source data into memory
# we can do this because data do not consume much memory
try:
    dams = pd.read_feather(data_dir / "dams.feather").set_index("id")
    ranked_dams = dams.loc[dams.Ranked]

    barriers = pd.read_feather(data_dir / "small_barriers.feather").set_index("id")
    ranked_barriers = barriers.loc[barriers.Ranked]

    print(
        f"Loaded {len(dams):,} dams ({len(ranked_dams):,} ranked), {len(barriers):,} barriers ({len(ranked_barriers):,} ranked)"
    )
except Exception as e:
    print("ERROR: not able to load data")
    log.error(e)


# on demand instead of in-memory
def get_removed_dams():
    return pd.read_feather(data_dir / "removed_dams.feather")
from functools import partial
from glob import glob

from fastai.collab import *
from fastai.tabular import *
from scipy.spatial.distance import cosine

from server.constants import max_slice, min_slice, random_seed, batch_size, device
from server.model import build_model, build_learner, load_pretrained_embeddings
from server.preprocessor import AnimeRatingsDataset, load_dataset, build_databunch

np.random.seed(random_seed)

anime_df = pd.read_feather("model_resources/animes.feather").set_index(
    "anime_monotonic_id"
)
train_df, test_df = load_dataset()
databunch = build_databunch(train_df=train_df, test_df=test_df)
all_embeddings = load_pretrained_embeddings()
model = build_model(
    anime_genre_embeddings=all_embeddings["anime_with_genre_embeddings"]
)
learn = build_learner(model=model, databunch=databunch)


def sort_by_distance(record, anime_monotonic_id_embeddings, reverse=True):
    target_embedding = anime_monotonic_id_embeddings[
        record["target_anime_monotonic_id"]
import os
import sys

import pandas as pd

from validate_utils import FoldValidation

import warnings
warnings.filterwarnings('ignore')

# ===============
# Settings
# ===============
fname = os.path.basename(sys.argv[0])
TRAIN_PATH = f'../data/input/train_data.feather'
OUTPUT_PATH = f'../folds/{fname.split(".")[0]}.feather'
N_FOLD = 5

# ===============
# Main
# ===============
df = pd.read_feather(TRAIN_PATH)
fold_validation = FoldValidation(df, stratify_arr=df['position'], fold_num=N_FOLD)
folds = fold_validation.make_split(valid_type='StratifiedKFold')
folds.to_feather(OUTPUT_PATH)
def read_data(folder: Path):
    df_train = pd.read_feather(folder / "train.feather").set_index("id")
    df_test = pd.read_feather(folder / "test.feather").set_index("id")
    return df_train, df_test
        df_train = pd.DataFrame(sc.fit_transform(X_train))
        df_test = pd.DataFrame(sc.transform(X_test))
        res_train, res_test = neighbors(df_train, df_test, train.TARGET, cv, k=5, n_trees=10)
        self.train = pd.DataFrame(res_train, columns=['neg', 'pos'])
        self.test = pd.DataFrame(res_test, columns=['neg', 'pos'])


if __name__ == '__main__':
    args = get_arguments('main')
    with timer('load dataset'):
        train = pd.read_feather(TRAIN)[['TARGET']]
        test = pd.read_feather(TEST)
        cv_id = pd.read_feather('../input/cv_id.ftr')
        cv = PredefinedSplit(cv_id)

        dfs = [
            pd.read_feather(str(f))
            for f in sorted(Path('../working/').glob('buro_*_train.ftr'))
        ]
        X_train = pd.concat(dfs, axis=1)  # type: pd.DataFrame

        dfs = [
            pd.read_feather(str(f))
            for f in sorted(Path('../working/').glob('buro_*_test.ftr'))
        ]
        X_test = pd.concat(dfs, axis=1)  # type: pd.DataFrame
        self.test[f'area_{f}_mean_diff'] = test[f] - self.test[f'area_{f}_mean']
        self.test[f'area_{f}_median_diff'] = test[f] - self.test[f'area_{f}_median']
        self.test[f'area_{f}_mean_ratio'] = test[f] / self.test[f'area_{f}_mean']
        self.test[f'area_{f}_median_ratio'] = test[f] / self.test[f'area_{f}_median']


class MainRegionAsCategory(Feature):
    def create_features(self):
        self.train = train['REGION_POPULATION_RELATIVE'].astype(str).to_frame('region_as_category')
        self.test = test['REGION_POPULATION_RELATIVE'].astype(str).to_frame('region_as_category')


if __name__ == '__main__':
    args = get_arguments('main')
    with timer('load dataset'):
        train = pd.read_feather(TRAIN)
        test = pd.read_feather(TEST)
        X = pd.concat([
            train.drop('TARGET', axis=1),
            test
        ])  # type: pd.DataFrame

    with timer('preprocessing'):
        train.AMT_INCOME_TOTAL.replace(117000000.0, 1170000, inplace=True)
        train.replace({'Yes': 1, 'No': 0, 'Y': 1, 'N': 0, 'XAP': np.nan, 'XAN': np.nan}, inplace=True)
        test.replace({'Yes': 1, 'No': 0, 'Y': 1, 'N': 0, 'XAP': np.nan, 'XAN': np.nan}, inplace=True)
        day_cols = train.filter(regex='DAYS_').columns
        train[day_cols] = train[day_cols].replace(365243, np.nan)
        test[day_cols] = test[day_cols].replace(365243, np.nan)

    with timer('create dataset'):
files_tr = sorted(glob('../data/train_f*.f'))

# USE_PREF
li = []
for i in files_tr:
    for j in USE_PREF:
        if j in i:
            li.append(i)
            break
files_tr = li

[print(i, f) for i, f in enumerate(files_tr)]

X_train = pd.concat([
    pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)
], axis=1)
y_train = utils.load_target()['HasDetections']

#X.drop(DROP, axis=1, inplace=True)

if X_train.columns.duplicated().sum() > 0:
    raise Exception(f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
print('no dup :) ')
print(f'X_train.shape {X_train.shape}')

gc.collect()

CAT = list(set(X_train.columns) & set(utils_cat.ALL))
print(f'CAT: {CAT}')
from Dashboard.functions.load_figures import load_piechart
from Dashboard.functions.map import load_figure
from Dashboard.translations import layout_elements, feature_translations, level_options
from config import DASH_CACHE_DIR

log_ = logging.getLogger(__name__)

log_.info('Reading geo data...')
geolayers_all = {}
centers_all = {}
for level in ['city', 'neighborhood']:
    with open(os.path.join(DASH_CACHE_DIR, level + '_geolayers.json')) as json_file:
        geolayers_all[level] = json.load(json_file)
    centers_all[level] = pd.read_feather(
        os.path.join(DASH_CACHE_DIR, level + '_centers.feather'))
log_.info('Reading finished...')


@app.callback(
    Output("choropleth", "figure"),
    [
        Input("level_selector", "value"),
        Input("color_selector", "value"),
        Input('date_selector', 'date'),
        Input('language_tab', 'value'),
    ],
)
def reload_graph(level, colorby, selected_date, lang):
    return load_figure(geolayers=geolayers_all[level],
def getErrorTicks():
    file = "db/error.file"
    return (pd.read_feather(file) if os.path.exists(file)
            else pd.DataFrame(columns=["ErrorTicks"]))
def load_df_feather(filename, columns=None):
    if columns is None:
        df = pd.read_feather(filename, use_threads=True)
    else:
        df = pd.read_feather(filename, columns=columns, use_threads=True)
    return df
def readFilteredTicks(file="filteredTicks"):
    file = f"db/{file}.file"
    return (pd.read_feather(file) if os.path.exists(file)
            else pd.DataFrame(columns=["Symbol", "Type"]))
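# Hedged refactor sketch (not in the original source): getErrorTicks() and
# readFilteredTicks() share the same "read the feather file if it exists, otherwise
# return an empty frame with the expected columns" pattern, so both could delegate to a
# shared helper like the one below; the helper name is hypothetical.
def read_feather_or_empty(path, columns):
    # return the cached frame when present, else an empty frame with the expected schema
    return pd.read_feather(path) if os.path.exists(path) else pd.DataFrame(columns=columns)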
""" m = sp.sparse.load_npz(inpath) graph = m.dot(m.T) if outpath: sp.sparse.save_npz(outpath, graph) return graph if __name__ == '__main__': # this is a pre computed file of artist information ## this will be replaced in future realeases with API queries to the Spotify Artist API from JavaScript rich_artists = pd.read_csv("data/2019-07-30/ranked_rich_artists.bsv", sep="|", index_col="id") # output from query.py index_id_map = pd.read_feather("data/2019-07-30/artist_indices.feather") index_id_map.columns = ["id", "index"] index_id_map.set_index("index", inplace=True) # output from query.py graph = artists_graph("data/2019-07-30/artists_to_playlists.npz") load_recommendations(rich_artists, graph, index_id_map, 0.0001) load_recommendations(rich_artists, graph, index_id_map, 0.0005) load_recommendations(rich_artists, graph, index_id_map, 0.0025) load_recommendations(rich_artists, graph, index_id_map, 0.01) load_recommendations(rich_artists, graph, index_id_map, 0.02)
smoothing = '1400'

# %%
signal_path = \
    f'/faststorage/project/reprator/Andrej/reprator/data'\
    f'/dfs_TCGA_S_B/*.{smoothing}.fitted.fth'
paths = glob(signal_path)
print(f'There are {len(paths)} paths.')

# %%
main_df = pd.DataFrame()
labels = pd.Series()
for path in paths:
    # load data
    df = pd.read_feather(path)
    tcga_id = path.split('/')[-1].split('.')[0]
    gdc_id = tcga_id2gdc_id[tcga_id]
    ctype = tcga_id2ctype[tcga_id]
    # check if cor > 0.3
    cor = df.loc[:, ['bw_signal', 'tcga_signal']].corr().iloc[0, 1]
    print(f'{gdc_id}\t{cor}')
    if cor > 0.3:
        # subsample
        subsampled_signal = get_subsampled_signal(df)
        main_df[gdc_id] = subsampled_signal
        labels[gdc_id] = ctype

# %%
main_df = main_df.dropna()
main_df = main_df.T
def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    model_id = "train_0"
    logger = get_logger()
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/split10/train_0.pickle"
    ).sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df["prior_question_had_explanation"].fillna(-1)
    df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)

    column_config = {
        ("content_id", "content_type_id"): {"type": "category"},
        "user_answer": {"type": "leakage_feature"},
        "answered_correctly": {"type": "leakage_feature"},
        "part": {"type": "category"},
        "prior_question_elapsed_time_bin300": {"type": "category"},
        "duration_previous_content_bin300": {"type": "category"},
        "prior_question_had_explanation": {"type": "category"},
        "rating_diff_content_user_id": {"type": "numeric"},
        "task_container_id_bin300": {"type": "category"},
        "previous_answer_index_content_id": {"type": "category"},
        "previous_answer_content_id": {"type": "category"},
        "timediff-elapsedtime_bin500": {"type": "category"}
    }

    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = \
            DurationPreviousContent(is_partial_fit=True)
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = \
            ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"]["UserContentRateEncoder"] = \
            UserContentRateEncoder(rate_func="elo", column="user_id")
        feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2(
            groupby="user_id",
            column="content_id",
            is_debug=is_debug,
            model_id=model_id,
            n=300)
        feature_factory_dict["user_id"]["StudyTermEncoder2"] = \
            StudyTermEncoder2(is_partial_fit=True)
        feature_factory_dict["user_id"][f"MeanAggregatorStudyTimebyUserId"] = \
            MeanAggregator(column="user_id", agg_column="study_time", remove_now=False)
        feature_factory_dict["user_id"]["ElapsedTimeMeanByContentIdEncoder"] = \
            ElapsedTimeMeanByContentIdEncoder()
        feature_factory_dict["post"] = {
            "DurationFeaturePostProcess": DurationFeaturePostProcess()
        }

        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id=model_id,
            load_feature=not is_debug,
            save_feature=not is_debug)
        print("all_predict")
        df = feature_factory_manager.all_predict(df)

        def f(x):
            x = x // 1000
            if x < -100:
                return -100
            if x > 400:
                return 400
            return x

        df["task_container_id_bin300"] = [
            x if x < 300 else 300 for x in df["task_container_id"]
        ]
        df["timediff-elapsedtime_bin500"] = [
            f(x) for x in df["timediff-elapsedtime"].values
        ]
        df = df[[
            "user_id", "content_id", "content_type_id", "part", "user_answer",
            "answered_correctly", "prior_question_elapsed_time_bin300",
            "duration_previous_content_bin300", "prior_question_had_explanation",
            "rating_diff_content_user_id", "task_container_id_bin300",
            "previous_answer_index_content_id", "previous_answer_content_id",
            "row_id", "timediff-elapsedtime_bin500"
        ]]
        print(df.head(10))

    print("data preprocess")

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    ff_for_transformer.make_dict(df=df)
    n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")])

    if not load_pickle or is_debug:
        df_val_row = pd.read_feather(
            "../../riiid_takoi/notebook/fe/validation_row_id.feather").head(len(df))
        if is_debug:
            df_val_row = df_val_row.head(3000)
        df_val_row["is_val"] = 1

        df = pd.merge(df, df_val_row, how="left", on="row_id")
        df["is_val"] = df["is_val"].fillna(0)

        print(df["is_val"].value_counts())

        w_df = df[df["is_val"] == 0]
        w_df["group"] = (w_df.groupby("user_id")["user_id"].transform("count") -
                         w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)

        dataset_train = SAKTDataset(group,
                                    n_skill=n_skill,
                                    max_seq=params["max_seq"])

        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model218", exist_ok=True)
    if not is_debug and not load_pickle:
        with open(f"../input/feature_engineering/model218/train.pickle", "wb") as f:
            pickle.dump(dataset_train, f)
        with open(f"../input/feature_engineering/model218/val.pickle", "wb") as f:
            pickle.dump(dataset_val, f)

    if not is_debug and load_pickle:
        with open(f"../input/feature_engineering/model218/train.pickle", "rb") as f:
            dataset_train = pickle.load(f)
        with open(f"../input/feature_engineering/model218/val.pickle", "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")

    dataloader_train = DataLoader(dataset_train,
                                  batch_size=params["batch_size"],
                                  shuffle=True)
    dataloader_val = DataLoader(dataset_val,
                                batch_size=params["batch_size"],
                                shuffle=False)

    model = SAKTModel(n_skill,
                      embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"],
                      dropout=dropout,
                      cont_emb=params["cont_emb"])

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=params["lr"],
        weight_decay=0.01,
    )
    num_train_optimization_steps = int(len(dataloader_train) * 20)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=params["num_warmup_steps"],
        num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val,
                                              optimizer, criterion, scheduler, epoch,
                                              device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".format(
            epoch, loss, auc, auc_val))

    preds = []
    labels = []
    with torch.no_grad():
        for item in tqdm(dataloader_val):
            label = item["label"].to(device).float()
            output = model(item, device)
            preds.extend(torch.nn.Sigmoid()(output[:, -1]).view(-1).data.cpu().numpy().tolist())
            labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))

    df_oof = pd.DataFrame()
    # df_oof["row_id"] = df.loc[val_idx].index
    print(len(dataloader_val))
    print(len(preds))
    df_oof["predict"] = preds
    df_oof["target"] = labels

    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values,
                            df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """

    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))

        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.end_run()

    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    torch.cuda.empty_cache()
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)

    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"]["DurationPreviousContent"] = \
            DurationPreviousContent(is_partial_fit=True)
        feature_factory_dict["user_id"]["ElapsedTimeBinningEncoder"] = \
            ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="all",
            load_feature=not is_debug,
            save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f:
            pickle.dump(ff_for_transformer, f)
def load_frame(self, key, persist_feather=None):
    if persist_feather:
        self.cfg['data']['frames'][key] = persist_feather
    df = pd.read_feather(self.cfg['data']['frames'][key])
    self.data[key] = df
    return df
def run_maelstrom(infile, genome, outdir, pwmfile=None, plot=True, cluster=False,
                  score_table=None, count_table=None, methods=None, ncpus=None):
    """Run maelstrom on an input table.

    Parameters
    ----------
    infile : str
        Filename of input table. Can be either a tab-separated text file or a
        feather file.
    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.
    outdir : str
        Output directory for all results.
    pwmfile : str, optional
        Specify a PFM file for scanning.
    plot : bool, optional
        Create heatmaps.
    cluster : bool, optional
        If True and if the input table has more than one column, the data is
        clustered and the cluster activity methods are also run. Not
        well-tested.
    score_table : str, optional
        Filename of pre-calculated table with motif scores.
    count_table : str, optional
        Filename of pre-calculated table with motif counts.
    methods : list, optional
        Activity methods to use. By default all are used.
    ncpus : int, optional
        If defined this specifies the number of cores to use.
    """
    logger.info("Starting maelstrom")
    if infile.endswith("feather"):
        df = pd.read_feather(infile)
        df = df.set_index(df.columns[0])
    else:
        df = pd.read_table(infile, index_col=0, comment="#")

    # Check for duplicates
    if df.index.duplicated(keep=False).any():
        raise ValueError("Input file contains duplicate regions! "
                         "Please remove them.")

    if not os.path.exists(outdir):
        os.mkdir(outdir)

    if methods is None:
        methods = Moap.list_predictors()
    methods = [m.lower() for m in methods]

    shutil.copyfile(infile, os.path.join(outdir, "input.table.txt"))

    # Copy the motif information
    pwmfile = pwmfile_location(pwmfile)
    if pwmfile:
        shutil.copy2(pwmfile, outdir)
        mapfile = re.sub(".p[fw]m$", ".motif2factors.txt", pwmfile)
        if os.path.exists(mapfile):
            shutil.copy2(mapfile, outdir)

    # Create a file with the number of motif matches
    if not count_table:
        count_table = os.path.join(outdir, "motif.count.txt.gz")
        if not os.path.exists(count_table):
            logger.info("Motif scanning (counts)")
            counts = scan_to_table(infile, genome, "count",
                                   pwmfile=pwmfile, ncpus=ncpus)
            counts.to_csv(count_table, sep="\t", compression="gzip")
        else:
            logger.info("Counts, using: %s", count_table)

    # Create a file with the score of the best motif match
    if not score_table:
        score_table = os.path.join(outdir, "motif.score.txt.gz")
        if not os.path.exists(score_table):
            logger.info("Motif scanning (scores)")
            scores = scan_to_table(infile, genome, "score",
                                   pwmfile=pwmfile, ncpus=ncpus)
            scores.to_csv(score_table, sep="\t", float_format="%.3f",
                          compression="gzip")
        else:
            logger.info("Scores, using: %s", score_table)

    if cluster:
        cluster = False
        for method in methods:
            m = Moap.create(method, ncpus=ncpus)
            if m.ptype == "classification":
                cluster = True
                break
        if not cluster:
            logger.info("Skipping clustering, no classification methods")

    exps = []
    clusterfile = infile
    if df.shape[1] != 1:
        # More than one column
        for method in Moap.list_regression_predictors():
            if method in methods:
                m = Moap.create(method, ncpus=ncpus)
                exps.append([method, m.pref_table, infile])
                logger.debug("Adding %s", method)

        if cluster:
            clusterfile = os.path.join(outdir,
                                       os.path.basename(infile) + ".cluster.txt")
            df[:] = scale(df, axis=0)
            names = df.columns
            df_changed = pd.DataFrame(index=df.index)
            df_changed["cluster"] = np.nan
            for name in names:
                df_changed.loc[
                    (df[name] - df.loc[:, df.columns != name].max(1)) > 0.5,
                    "cluster"] = name
            df_changed.dropna().to_csv(clusterfile, sep="\t")

    if df.shape[1] == 1 or cluster:
        for method in Moap.list_classification_predictors():
            if method in methods:
                m = Moap.create(method, ncpus=ncpus)
                exps.append([method, m.pref_table, clusterfile])

    if len(exps) == 0:
        logger.error("No method to run.")
        sys.exit(1)

    for method, scoring, fname in exps:
        try:
            if scoring == "count" and count_table:
                moap_with_table(fname, count_table, outdir, method, scoring,
                                ncpus=ncpus)
            elif scoring == "score" and score_table:
                moap_with_table(fname, score_table, outdir, method, scoring,
                                ncpus=ncpus)
            else:
                moap_with_bg(fname, genome, outdir, method, scoring,
                             pwmfile=pwmfile, ncpus=ncpus)
        except Exception as e:
            logger.warn("Method %s with scoring %s failed", method, scoring)
            logger.warn(e)
            logger.warn("Skipping")
            raise

    dfs = {}
    for method, scoring, fname in exps:
        t = "{}.{}".format(method, scoring)
        fname = os.path.join(outdir, "activity.{}.{}.out.txt".format(
            method, scoring))
        try:
            dfs[t] = pd.read_table(fname, index_col=0, comment="#")
        except:
            logging.warn("Activity file for {} not found!\n".format(t))

    if len(methods) > 1:
        logger.info("Rank aggregation")
        df_p = df_rank_aggregation(df, dfs, exps)
        df_p.to_csv(os.path.join(outdir, "final.out.csv"), sep="\t")
        #df_p = df_p.join(m2f)

    # Write motif frequency table
    if df.shape[1] == 1:
        mcount = df.join(pd.read_table(count_table, index_col=0, comment="#"))
        m_group = mcount.groupby(df.columns[0])
        freq = m_group.sum() / m_group.count()
        freq.to_csv(os.path.join(outdir, "motif.freq.txt"), sep="\t")

    if plot and len(methods) > 1:
        logger.info("html report")
        maelstrom_html_report(
            outdir, os.path.join(outdir, "final.out.csv"), pwmfile
        )
        logger.info(os.path.join(outdir, "gimme.maelstrom.report.html"))
                     index_col=False, names=headers, dtype=dtypes, parse_dates=['Date'])
    elapsed_read = perf_counter() - tstart

    df_bad = df[df.tFile.isnull()]
    if not df_bad.empty:
        print("Invalid lines, stop processing")
        print(df_bad)
        sys.exit()

    tstart = perf_counter()
    print("Saving cache")
    df.to_feather(cache_file_fp)
    elapsed_write = perf_counter() - tstart
    print(f'Done. Time read={elapsed_read:.3f}, write={elapsed_write:.3f}')
else:
    print('Loading data from cache.')
    tstart = perf_counter()
    df = pd.read_feather(cache_file_fp)
    elapsed_read = perf_counter() - tstart
    print(f'Done. Time read={elapsed_read:.3f}')

print(df.info())
print()

date_from = np.datetime64('2021-02-18 21:00:00')
dg: DataFrame = df[(df.Server != 'EU50TSVP412') & (df.Date >= date_from)]
dg.reset_index(inplace=True, drop=True)
print(dg.info())
print(dg.shape)
warnings.filterwarnings('ignore')

with open('./configs/default.yaml', 'r') as yf:
    config = yaml.load(yf)

# ===============
# Settings
# ===============
INPUT_PATH = './data/input/train_transaction.feather'
OUTPUT_PATH = './folds/folds7.feather'
SEED = config['seed']
TARGET = 'isFraud'
START_DATE = config['start_date']

# ===============
# Main
# ===============
df = pd.read_feather(INPUT_PATH)
df['datetime_'] = df['TransactionDT'].apply(
    lambda x: (START_DATE + datetime.timedelta(seconds=x)))
df['weekofyear'] = (df['TransactionDT'] - 86400) // (3600 * 24 * 7)

split_groups = df['weekofyear']
week_unique = split_groups.unique()

df['fold_id'] = np.nan
for week in week_unique:
    val_idx = df[df['weekofyear'] == week].index
    df.loc[val_idx, 'fold_id'] = week

df[['fold_id']].astype('int').to_feather(OUTPUT_PATH)
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import numpy as np
import pandas as pd
from tqdm import tqdm

# In[2]:

paper = pd.read_feather("../../../input/paper_input_final.ftr")

# In[3]:

paper['abst'] = paper['abst'].apply(lambda s: s.replace('no_content', ''))
paper['corp'] = paper['titl'] + ' ' + paper['keywords'].fillna('').replace(
    ';', ' ') + paper['abst']

# In[4]:

df_train = pd.read_feather("../../../input/tr_input_final.ftr")

# In[5]:

df_train.head()

# In[6]:

df_test = pd.read_feather("../../../input/te_input_final.ftr")
def test_read_feather(self):
    data = pd.read_feather("/input/tests/data/feather-0_3_1.feather")
    self.assertEqual(10, data.size)
files_tr = sorted(glob('../data/train_f*.f'))

# USE_PREF
li = []
for i in files_tr:
    for j in USE_PREF:
        if j in i:
            li.append(i)
            break
files_tr = li

[print(i, f) for i, f in enumerate(files_tr)]

X_train = pd.concat(
    [pd.read_feather(f) for f in tqdm(files_tr, mininterval=30)] +
    [joblib.load('../external/X_train_nejumi.pkl.gz')],
    axis=1)
y_train = utils.load_target()['HasDetections']

# drop
if len(col_drop) > 0:
    X_train.drop(col_drop, axis=1, inplace=True)

if X_train.columns.duplicated().sum() > 0:
    raise Exception(
        f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
print('no dup :) ')
print(f'X_train.shape {X_train.shape}')