def read_feather(path, use_threads=True):
    """
    Load a feather-format object from the file path.

    .. versionadded:: 0.20.0

    Parameters
    ----------
    path : string file path, or file-like object
    nthreads : int, default 1
        Number of CPU threads to use when reading to pandas.DataFrame.

        .. versionadded:: 0.21.0
        .. deprecated:: 0.24.0
    use_threads : bool, default True
        Whether to parallelize reading using multiple threads.

        .. versionadded:: 0.24.0

    Returns
    -------
    type of object stored in file
    """
    feather, pyarrow = _try_import()
    path = _stringify_path(path)

    if LooseVersion(pyarrow.__version__) < LooseVersion('0.11.0'):
        int_use_threads = int(use_threads)
        if int_use_threads < 1:
            int_use_threads = 1
        return feather.read_feather(path, nthreads=int_use_threads)

    return feather.read_feather(path, use_threads=bool(use_threads))
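# Not part of the original sources: a minimal, hedged usage sketch of the
# pandas wrapper above. It assumes only that pandas with feather support is
# installed; 'example.feather' is an illustrative throwaway path.
import pandas as pd

df_example = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
df_example.to_feather('example.feather')
roundtripped = pd.read_feather('example.feather', use_threads=True)
assert roundtripped.equals(df_example)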
def test_integer_with_nulls(self):
    # pandas requires upcast to float dtype
    path = random_path()
    self.test_files.append(path)

    int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
    num_values = 100

    writer = FeatherWriter()
    writer.open(path)

    null_mask = np.random.randint(0, 10, size=num_values) < 3
    expected_cols = []

    for name in int_dtypes:
        values = np.random.randint(0, 100, size=num_values)
        writer.write_array(name, values, null_mask)

        expected = values.astype('f8')
        expected[null_mask] = np.nan
        expected_cols.append(expected)

    ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)),
                            columns=int_dtypes)

    writer.close()

    result = read_feather(path)
    assert_frame_equal(result, ex_frame)
def test_float_nulls(self):
    num_values = 100

    path = random_path()
    self.test_files.append(path)
    writer = FeatherWriter()
    writer.open(path)

    null_mask = np.random.randint(0, 10, size=num_values) < 3
    dtypes = ['f4', 'f8']
    expected_cols = []
    null_counts = []
    for name in dtypes:
        values = np.random.randn(num_values).astype(name)
        writer.write_array(name, values, null_mask)

        values[null_mask] = np.nan

        expected_cols.append(values)
        null_counts.append(null_mask.sum())
    writer.close()

    ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)),
                            columns=dtypes)

    result = read_feather(path)
    assert_frame_equal(result, ex_frame)
    assert_array_equal(self._get_null_counts(path), null_counts)
def test_filelike_objects(self):
    from io import BytesIO

    buf = BytesIO()

    # the copy makes it non-strided
    df = pd.DataFrame(np.arange(12).reshape(4, 3),
                      columns=['a', 'b', 'c']).copy()
    write_feather(df, buf)

    buf.seek(0)

    result = read_feather(buf)
    assert_frame_equal(result, df)
def test_buffer_bounds_error(self):
    # ARROW-1676
    path = random_path()
    self.test_files.append(path)

    for i in range(16, 256):
        values = pa.array([None] + list(range(i)), type=pa.float64())

        writer = FeatherWriter()
        writer.open(path)

        writer.write_array('arr', values)
        writer.close()

        result = read_feather(path)
        expected = pd.DataFrame({'arr': values.to_pandas()})
        assert_frame_equal(result, expected)

        self._check_pandas_roundtrip(expected, null_counts=[1])
def _check_pandas_roundtrip(self, df, expected=None, path=None,
                            columns=None, null_counts=None):
    if path is None:
        path = random_path()

    self.test_files.append(path)
    write_feather(df, path)
    if not os.path.exists(path):
        raise Exception('file not written')

    result = read_feather(path, columns)
    if expected is None:
        expected = df

    assert_frame_equal(result, expected)

    if null_counts is None:
        null_counts = np.zeros(len(expected.columns))

    np.testing.assert_array_equal(self._get_null_counts(path, columns),
                                  null_counts)
def test_boolean_nulls(self):
    # pandas requires upcast to object dtype
    path = random_path()
    self.test_files.append(path)

    num_values = 100
    np.random.seed(0)

    writer = FeatherWriter()
    writer.open(path)

    mask = np.random.randint(0, 10, size=num_values) < 3
    values = np.random.randint(0, 10, size=num_values) < 5
    writer.write_array('bools', values, mask)

    expected = values.astype(object)
    expected[mask] = None

    writer.close()

    ex_frame = pd.DataFrame({'bools': expected})

    result = read_feather(path)
    assert_frame_equal(result, ex_frame)
def read_covidactnow(fname):
    return feather.read_feather(fname)
#!/usr/bin/env python
# coding: utf-8

# In[270]:

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import math
import pyarrow.feather as feather
import json
import os
import sys

players = feather.read_feather(
    '/data/p_dsi/nba_player_movement/team_player_data/players.file')
teams = feather.read_feather(
    '/data/p_dsi/nba_player_movement/team_player_data/teams.file')
outcomes = feather.read_feather(
    '/data/p_dsi/nba_player_movement/outcome_shots_data/outcomes.file')
shot_distance_percentage = pd.read_csv(
    '/data/p_dsi/nba_player_movement/outcome_shots_data/shot_distance_percentage.csv'
)

data_path = '/data/p_dsi/nba_player_movement/data_feather_file/group1'
feather_path = '/data/p_dsi/nba_player_movement/data_merged_files'
files = os.listdir(data_path)

if not os.path.exists(feather_path):
    os.makedirs(feather_path)
from lightgbm import LGBMClassifier

full_dataset = True

if full_dataset:
    filePathTRAIN = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                                 'TRAIN_sample.feather')
    filePathTEST = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                                'TEST_sample.feather')
else:
    filePathTRAIN = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                                 'TRAIN_sample_afterFS.feather')
    filePathTEST = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                                'TEST_sample_afterFS.feather')

train = feather.read_feather(filePathTRAIN)
test = feather.read_feather(filePathTEST)

train = train.fillna(0)

y_train = train["TARGET"]
#y_train = y_train.values
x_train = train.drop(["TARGET"], axis=1)

#check if columns contain Inf values and drop them
inf_sum = np.isinf(x_train).sum()
inf_sum = inf_sum.index[np.where(inf_sum > 0)]
x_train.drop(columns=list(inf_sum), inplace=True)
test.drop(columns=list(inf_sum), inplace=True)
def convert_to_arrow(my_cccc, in_file_list, out_dir, out_list_file, conf_df,
                     write_location, debug):
    warno = 189
    out_arrows = []
    now = datetime.utcnow()
    create_datetime_list = ['C_', my_cccc, '_',
                            str(now.year).zfill(4), str(now.month).zfill(2),
                            str(now.day).zfill(2), str(now.hour).zfill(2),
                            str(now.minute).zfill(2), str(now.second).zfill(2)]
    create_datetime = ''.join(create_datetime_list)
    cccc_set = set([re.sub('^.*/', '', re.sub('/grib/.*$', '', in_file))
                    for in_file in in_file_list])
    cat_subcat_set = set([re.search(r'^[^/]*/[^/]*/',
                                    re.sub('^.*/grib/', '', in_file)).group().rstrip('/')
                          for in_file in in_file_list])
    for cccc in cccc_set:
        for cat_subcat in cat_subcat_set:
            keys = ['stepRange', 'typeOfLevel', 'level', 'shortName']
            missingValue = -3.402823e+38
            for in_file in in_file_list:
                property_dict = {}
                ft_list = []
                match = re.search(r'^.*/' + cccc + '/grib/' + cat_subcat + '/.*$', in_file)
                if not match:
                    continue
                if not os.access(in_file, os.F_OK):
                    print('Warning', warno, ':', in_file, 'does not exist.', file=sys.stderr)
                    continue
                elif not os.path.isfile(in_file):
                    print('Warning', warno, ':', in_file, 'is not file.', file=sys.stderr)
                    continue
                elif not os.access(in_file, os.R_OK):
                    print('Warning', warno, ':', in_file, 'is not readable.', file=sys.stderr)
                    continue
                dt_str = re.sub('/.*$', '',
                                re.sub('^.*/' + cccc + '/grib/' + cat_subcat + '/', '', in_file))
                with open(in_file, 'r') as in_file_stream:
                    if debug:
                        print('Debug', ':', in_file, file=sys.stderr)
                    try:
                        codes_grib_multi_support_on()
                        iid = codes_index_new_from_file(in_file, keys)
                        key_values_list = []
                        for key in keys:
                            key_values = codes_index_get(iid, key)
                            key_values_list.append(key_values)
                        products = [[]]
                        for key_values in key_values_list:
                            products = [x + [y] for x in products for y in key_values]
                        for product in products:
                            for key_count in range(len(keys)):
                                codes_index_select(iid, keys[key_count], product[key_count])
                            while True:
                                gid = codes_new_from_index(iid)
                                if gid is None:
                                    break
                                codes_set(gid, 'missingValue', missingValue)
                                iterid = codes_keys_iterator_new(gid, 'ls')
                                step_range = None
                                type_of_level = None
                                level = None
                                short_name = None
                                cat = re.sub('/.*$', '', cat_subcat)
                                subcat = re.sub('^.*/', '', cat_subcat)
                                target_conf_df = conf_df[(conf_df['category'] == cat) &
                                                         (conf_df['subcategory'] == subcat)]
                                while codes_keys_iterator_next(iterid):
                                    key = codes_keys_iterator_get_name(iterid)
                                    if key in keys:
                                        value = codes_get_string(gid, key)
                                        if key == 'stepRange' or key == 'level':
                                            target_conf_df = target_conf_df[(target_conf_df[key] == int(value))]
                                        else:
                                            target_conf_df = target_conf_df[(target_conf_df[key] == value)]
                                codes_keys_iterator_delete(iterid)
                                message_np = np.array([])
                                for conf_row in target_conf_df.itertuples():
                                    ft = codes_get(gid, 'stepRange')
                                    if ft not in ft_list:
                                        ft_list.append(ft)
                                    property_dict[(conf_row.category, conf_row.subcategory,
                                                   conf_row.stepRange, conf_row.typeOfLevel,
                                                   conf_row.level, conf_row.shortName,
                                                   ft)] = np.array(codes_get_values(gid))
                                    if write_location:
                                        iterid = codes_grib_iterator_new(gid, 0)
                                        lat_list = []
                                        lon_list = []
                                        while True:
                                            latitude_longitude_value = codes_grib_iterator_next(iterid)
                                            if not latitude_longitude_value:
                                                break
                                            else:
                                                lat_list.append(latitude_longitude_value[0])
                                                if latitude_longitude_value[1] < 180.0:
                                                    lon_list.append(latitude_longitude_value[1])
                                                else:
                                                    lon_list.append(latitude_longitude_value[1] - 360.0)
                                        codes_grib_iterator_delete(iterid)
                                        out_directory_list = [out_dir, cccc, 'grib_to_arrow',
                                                              conf_row.category, conf_row.subcategory]
                                        out_directory = '/'.join(out_directory_list)
                                        os.makedirs(out_directory, exist_ok=True)
                                        out_file_list = [out_directory, '/location.feather']
                                        out_file = ''.join(out_file_list)
                                        with open(out_file, 'bw') as out_f:
                                            location_batch = pa.record_batch(
                                                [pa.array(lat_list, 'float32'),
                                                 pa.array(lon_list, 'float32')],
                                                names=['latitude [degree]', 'longitude [degree]'])
                                            location_table = pa.Table.from_batches([location_batch])
                                            feather.write_feather(location_table, out_f, compression='zstd')
                                codes_release(gid)
                    except:
                        print('Warning', warno, ':', in_file, 'is invalid grib.', file=sys.stderr)
                if len(property_dict) > 0:
                    out_directory_list = [out_dir, cccc, 'grib_to_arrow',
                                          conf_row.category, conf_row.subcategory]
                    out_directory = '/'.join(out_directory_list)
                    os.makedirs(out_directory, exist_ok=True)
                    out_file_list = [out_directory, '/location.feather']
                    out_file = ''.join(out_file_list)
                    location_df = feather.read_feather(out_file)
                    dt = datetime(int(dt_str[0:4]), int(dt_str[4:6]), int(dt_str[6:8]),
                                  int(dt_str[8:10]), 0, 0, 0, tzinfo=timezone.utc)
                    dt_list = [dt for i in range(0, len(location_df.index))]
                    for ft in ft_list:
                        name_list = ['latitude [degree]', 'longitude [degree]', 'datetime']
                        data_list = [pa.array(location_df['latitude [degree]'].values.tolist(), 'float32'),
                                     pa.array(location_df['longitude [degree]'].values.tolist(), 'float32')]
                        data_list.append(pa.array(dt_list, pa.timestamp('ms', tz='utc')))
                        for conf_row in conf_df[(conf_df['category'] == cat) &
                                                (conf_df['subcategory'] == subcat)].itertuples():
                            if len(property_dict[(conf_row.category, conf_row.subcategory,
                                                  conf_row.stepRange, conf_row.typeOfLevel,
                                                  conf_row.level, conf_row.shortName, ft)]) > 0:
                                if re.match(r'^.*U wind component.*$', conf_row.name):
                                    u_value_np = property_dict[(conf_row.category, conf_row.subcategory,
                                                                conf_row.stepRange, conf_row.typeOfLevel,
                                                                conf_row.level, conf_row.shortName, ft)]
                                    v_value_np = property_dict[(conf_row.category, conf_row.subcategory,
                                                                conf_row.stepRange, conf_row.typeOfLevel,
                                                                conf_row.level,
                                                                conf_row.shortName.replace('u', 'v'), ft)]
                                    wind_speed_np = np.sqrt(np.power(u_value_np, 2) + np.power(v_value_np, 2))
                                    wind_direction_np = np.degrees(np.arctan2(v_value_np, u_value_np))
                                    wind_direction_np = np.array([value + 360.0 if value < 0 else value
                                                                  for value in wind_direction_np])
                                    name_list.append(ft + '/' + re.sub(r'U wind component',
                                                                       'wind speed [m/s]', conf_row.name))
                                    data_list.append(pa.array(np.array(wind_speed_np, dtype=conf_row.datatype)))
                                    name_list.append(ft + '/' + re.sub(r'U wind component',
                                                                       'wind direction [degree]', conf_row.name))
                                    data_list.append(pa.array(np.array(wind_direction_np, dtype=conf_row.datatype)))
                                elif not re.match(r'^.*V wind component.*$', conf_row.name):
                                    value_list = property_dict[(conf_row.category, conf_row.subcategory,
                                                                conf_row.stepRange, conf_row.typeOfLevel,
                                                                conf_row.level, conf_row.shortName, ft)]
                                    name_list.append(ft + '/' + conf_row.name)
                                    data_list.append(pa.array(np.array(value_list, dtype=conf_row.datatype)))
                        out_directory_list = [out_dir, cccc, 'grib_to_arrow',
                                              conf_row.category, conf_row.subcategory]
                        out_directory = '/'.join(out_directory_list)
                        os.makedirs(out_directory, exist_ok=True)
                        out_file_list = [out_directory, '/', dt_str, '_', create_datetime, '.feather']
                        out_file = ''.join(out_file_list)
                        with open(out_file, 'bw') as out_f:
                            property_batch = pa.record_batch(data_list, names=name_list)
                            property_table = pa.Table.from_batches([property_batch])
                            feather.write_feather(property_table, out_f, compression='zstd')
                            print(out_file, file=out_list_file)
def main(
        args,
        n_timesteps=200,
        prob_willing_min=0.1,
        prob_willing_max=0.9,
        turns_per_day_min=3.5,
        turns_per_day_max=3.5,
        available_machine_time_min=0.5,
        available_machine_time_max=0.5,
        machines_pm_min=4,
        machines_pm_max=55,
        recruitment_per_machine_min=0.2,
        recruitment_per_machine_max=2,
        return_scaling_factor_min=0.5,
        return_scaling_factor_max=2.25,
        return_3rd_scaling_factor_max=1.42,
        return_4th_scaling_factor_max=1.23,
        K=0.44068065575293935,
        lambd=0.02564293675387818,
        K_3rd=0.7004050311562576,
        lambd_3rd=0.03913344618984413,
        K_4th=0.8076846503792819,
        lambd_4th=0.05034356712490996,
        population_sizes={
            'NJ': 8882190,
            'MA': 6892503,
            'IN': 6732219,
            'LA': 4648794,
            'CT': 3565287,
            'MS': 2976149,
            'VA': 8535519,
            'MD': 6045680,
            'NY': 19453561,
            'IL': 12671821,
            'CA': 39512223
        }):

    state, dist, n_sims, chunk_size = parse_args(args)

    run_id = str(np.random.randint(100000, high=999999, size=1)[0])

    state_data = feather.read_feather("./covidactnow_" + state +
                                      "_2020-09-08.feather")
    start_date = min(state_data.date[state_data.discharges > 0])
    end_date = start_date + timedelta(days=n_timesteps - 1)
    discharges = [
        int(i) for i in state_data[(state_data.date >= start_date)
                                   & (state_data.date <= end_date)].discharges.tolist()
    ]
    n_iterations = len(discharges)
    #print(state_data.head())

    if dist == "empiric":
        donret_2_dist = feather.read_feather(
            "./data/donor_return_2nd_cumprob.feather")
        donret_2_dist_list = [donret_2_dist.copy() for _ in range(n_sims)]
        donret_3_dist = feather.read_feather(
            "./data/donor_return_3rd_cumprob.feather")
        donret_3_dist_list = [donret_3_dist.copy() for _ in range(n_sims)]
        donret_4_dist = feather.read_feather(
            "./data/donor_return_4th_cumprob.feather")
        donret_4_dist_list = [donret_4_dist.copy() for _ in range(n_sims)]
    elif dist == "parametric":
        scaling_factors = np.random.uniform(return_scaling_factor_min,
                                            return_scaling_factor_max,
                                            size=n_sims)
        t = np.linspace(0, 126, 127)
        donret_2_dist_list = []
        for i in range(n_sims):
            p = exp_model_func(t, K, lambd, t0=7, scale=scaling_factors[i])
            donret_2_dist_list.append(
                pd.DataFrame({
                    'time': np.append(t, 9999.0),
                    'prob': np.append(p, 1.0)
                }))
        donret_3_dist = feather.read_feather(
            "./data/donor_return_3rd_cumprob.feather")
        donret_3_dist_list = [donret_3_dist.copy() for _ in range(n_sims)]
        donret_4_dist = feather.read_feather(
            "./data/donor_return_4th_cumprob.feather")
        donret_4_dist_list = [donret_4_dist.copy() for _ in range(n_sims)]
    elif dist == "parametric_all":
        scaling_factors = np.random.uniform(return_scaling_factor_min,
                                            return_scaling_factor_max,
                                            size=n_sims)
        scaling_factos_propmax = (scaling_factors - return_scaling_factor_min
                                  ) / (return_scaling_factor_max -
                                       return_scaling_factor_min)
        scaling_factors_3rd = scaling_factos_propmax * (
            return_3rd_scaling_factor_max -
            return_scaling_factor_min) + return_scaling_factor_min
        #print("Scaling 3rd:", min(scaling_factors_3rd), max(scaling_factors_3rd))
        scaling_factors_4th = scaling_factos_propmax * (
            return_4th_scaling_factor_max -
            return_scaling_factor_min) + return_scaling_factor_min
        #print("Scaling 4th:", min(scaling_factors_4th), max(scaling_factors_4th))
        t = np.linspace(0, 126, 127)
        donret_2_dist_list = []
        donret_3_dist_list = []
        donret_4_dist_list = []
        for i in range(n_sims):
            p = exp_model_func(t, K, lambd, t0=7, scale=scaling_factors[i])
            donret_2_dist_list.append(
                pd.DataFrame({
                    'time': np.append(t, 9999.0),
                    'prob': np.append(p, 1.0)
                }))
            p = exp_model_func(t, K_3rd, lambd_3rd, t0=7,
                               scale=scaling_factors_3rd[i])
            donret_3_dist_list.append(
                pd.DataFrame({
                    'time': np.append(t, 9999.0),
                    'prob': np.append(p, 1.0)
                }))
            p = exp_model_func(t, K_4th, lambd_4th, t0=7,
                               scale=scaling_factors_4th[i])
            donret_4_dist_list.append(
                pd.DataFrame({
                    'time': np.append(t, 9999.0),
                    'prob': np.append(p, 1.0)
                }))

    population = population_sizes[state]

    prob_willing = np.random.uniform(prob_willing_min, prob_willing_max,
                                     size=n_sims)
    macines_pm = np.random.uniform(machines_pm_min, machines_pm_max,
                                   size=n_sims)
    machines = np.round(macines_pm * (population / 1e6))
    turns_per_day = np.random.uniform(turns_per_day_min, turns_per_day_max,
                                      size=n_sims)
    available_machine_time = np.random.uniform(available_machine_time_min,
                                               available_machine_time_max,
                                               size=n_sims)
    max_collections = np.round(machines * turns_per_day *
                               available_machine_time).astype(int)
    recruitment_per_machine = np.random.uniform(recruitment_per_machine_min,
                                                recruitment_per_machine_max,
                                                size=n_sims)
    max_recruitment = np.round(machines * recruitment_per_machine).astype(int)

    parameter_list = []
    for i in range(n_sims):
        parameter_list.append({
            "run_id": run_id,
            "iterations": n_iterations,
            "report_level": 3,
            "num_agents_init": 0,
            "recovered": discharges,
            "historical_collections": None,
            "prob_eligible": 1,
            "delay_eligibility": 14,
            "duration_eligibility": 180,
            "temp_ineligibility_period": 7,
            "donor_return_dist_type": dist,
            "donor_return_second": donret_2_dist_list[i],
            "donor_return_second_scale": scaling_factors[i]
            if dist == "parametric" or dist == "parametric_all" else None,
            "donor_return_third": donret_3_dist_list[i],
            "donor_return_third_scale": scaling_factors_3rd[i]
            if dist == "parametric_all" else None,
            "donor_return_later": donret_4_dist_list[i],
            "donor_return_later_scale": scaling_factors_4th[i]
            if dist == "parametric_all" else None,
            "donor_return_prob_col": "prob",
            "delay_recruitment": 0,
            "prob_willing": prob_willing[i],
            "prob_male": 0.5,
            "male_relative_propensity": 1,
            "max_recruitment": int(max_recruitment[i]),
            "recruitment_pm": recruitment_per_machine[i],
            "prob_other_deferral": 0.02,
            "prob_failed_donation": 0.01,
            "prob_hla_female": 0.09,
            "prob_tti": 0.002,
            "prob_Ab_pos": 0.93,
            "qualify_neut_titers": False,
            "prob_neut_above_360": None,
            "max_collections": int(max_collections[i]),
            "mean_units_per_collection": turns_per_day[i],
            "max_collection_growth": 9999,  # essentially infinite
            "recruitment_start": 0,
            "collection_start": 0,
            "machines_pm": int(macines_pm[i]),
            "available_machine_time": available_machine_time[i]
        })

    n_chunks = n_sims // chunk_size
    iter_start = 0
    for j in range(n_chunks):
        simset = "statesim_" + state + "_" + dist + "_" + run_id + "_" + str(
            j).zfill(3)
        [
            p.update({"chunk_id": j})
            for p in parameter_list[iter_start:(iter_start + chunk_size)]
        ]
        csim.multi_threaded_run(
            parameter_list=parameter_list[iter_start:(iter_start + chunk_size)],
            simulations=chunk_size,
            root_seed=None,
            processes=20,
            simset_name=simset,
            output_report=True,
            output_agents=True,
            output_parameters=True,
            report_dir="./outputs/",
            return_results=False)
        gc.collect()
        iter_start += chunk_size
import pickle as pkl
import pyarrow.feather as feather
import pandas as pd

# with open('chars60_raw_imputed.feather', 'rb') as f:
#     chars = feather.read_feather(f)

with open('chars60_rank_imputed.feather', 'rb') as f:
    chars = feather.read_feather(f)

print(chars.columns.values)

chars['date'] = pd.to_datetime(chars['date'])
chars['year'] = chars['date'].dt.year
chars_1970s = chars[chars['year'] < 1980]
chars_1980s = chars[(chars['year'] >= 1980) & (chars['year'] < 1990)]
chars_1990s = chars[(chars['year'] >= 1990) & (chars['year'] < 2000)]
chars_2000s = chars[(chars['year'] >= 2000) & (chars['year'] < 2010)]
chars_2010s = chars[(chars['year'] >= 2010) & (chars['year'] < 2020)]
chars_2020s = chars[(chars['year'] >= 2020) & (chars['year'] < 2030)]

# raw
# chars_1970s.to_csv('chars60_raw_1970s.csv', index=0)
# chars_1980s.to_csv('chars60_raw_1980s.csv', index=0)
# chars_1990s.to_csv('chars60_raw_1990s.csv', index=0)
# chars_2000s.to_csv('chars60_raw_2000s.csv', index=0)
# chars_2010s.to_csv('chars60_raw_2010s.csv', index=0)

# rank
chars_1970s.to_csv('chars60_rank_1970s.csv', index=0)
chars_1980s.to_csv('chars60_rank_1980s.csv', index=0)
def updateEMA(Client, symbol, emas, interval):
    First = True

    # check files and directories
    filename = f"{Client.MAIN_PATH}/data/candles/{interval}"
    filename2 = f"{Client.MAIN_PATH}/data/ema/{interval}"
    if not os.path.exists(filename):
        os.makedirs(filename)
    if not os.path.exists(filename2):
        os.makedirs(filename2)

    fn = f"{filename}/{symbol}.feather"
    fn2 = f"{filename2}/{symbol}.feather"

    if os.path.isfile(fn):
        df_cdl = feather.read_feather(fn)
        if not df_cdl["OpenTime"].empty:
            if os.path.isfile(fn2):
                df_ema = feather.read_feather(fn2)
                if not df_ema.empty:
                    First = False
                    ema_lastDate = df_ema.iloc[-1]
                    df_add = df_cdl.loc[df_cdl["OpenTime"] > ema_lastDate["OpenTime"],
                                        ["OpenTime", "ClosePrice"]]
                    if not df_add.empty:
                        new_lst = []
                        for num in emas:
                            name = f"EMA{num}"
                            prevMA = ema_lastDate[name]
                            if not math.isnan(prevMA):
                                temp_lst = []
                                for close in df_add["ClosePrice"]:
                                    prevMA = close * (2 / (num + 1)) + prevMA * (1 - (2 / (num + 1)))
                                    temp_lst.append(prevMA)
                                df_add[name] = temp_lst
                            else:
                                df_ema2 = df_cdl.loc[:, ["OpenTime", "ClosePrice"]]
                                new_lst.append((name, abstract.EMA(df_ema2["ClosePrice"], timeperiod=num)))
                        df_ema = df_ema.append(df_add, ignore_index=True)
                        for name, value in new_lst:
                            df_ema[name] = value
                        feather.write_feather(df_ema, fn2)
            if First:
                df_ema2 = df_cdl.loc[:, ["OpenTime", "ClosePrice"]]
                if not df_cdl.empty:
                    for num in emas:
                        name = f"EMA{num}"
                        df_ema2[name] = abstract.EMA(df_ema2["ClosePrice"], timeperiod=num)
                    feather.write_feather(df_ema2, fn2)
        else:
            print("file is empty")
# Feature Importance with Extra Trees Classifier
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
import os
import pyarrow.feather as feather

filePathTRAIN = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                             'TRAIN_sample.feather')
filePathTEST = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                            'TEST_sample.feather')

df = feather.read_feather(filePathTRAIN)
df1 = df.drop("index", axis=1)
df1 = df1.fillna(0)
colnames = df1.columns.values

dfY = df1["TARGET"]
Y = dfY.values
dfX = df1.drop(["TARGET"], axis=1)

#check if columns contain Inf values and drop them
inf_sum = np.isinf(dfX).sum()
inf_sum = inf_sum.index[np.where(inf_sum > 0)]
dfX.drop(columns=list(inf_sum), inplace=True)
X = dfX.values

model = ExtraTreesClassifier()
model.fit(X, Y)
# print(model.feature_importances_)
list_imp = model.feature_importances_
import pandas as pd
import pickle as pkl
import pyarrow.feather as feather
import numpy as np
from tqdm import tqdm
from functions import *

####################
#    All Stocks    #
####################
with open('chars_q_raw.feather', 'rb') as f:
    chars_q = feather.read_feather(f)

chars_q = chars_q.dropna(subset=['permno'])
chars_q[['permno', 'gvkey']] = chars_q[['permno', 'gvkey']].astype(int)
chars_q['jdate'] = pd.to_datetime(chars_q['jdate'])
chars_q = chars_q.drop_duplicates(['permno', 'jdate'])

with open('chars_a_raw.feather', 'rb') as f:
    chars_a = feather.read_feather(f)

chars_a = chars_a.dropna(subset=['permno'])
chars_a[['permno', 'gvkey']] = chars_a[['permno', 'gvkey']].astype(int)
chars_a['jdate'] = pd.to_datetime(chars_a['jdate'])
chars_a = chars_a.drop_duplicates(['permno', 'jdate'])

# information list
obs_var_list = [
    'gvkey', 'permno', 'jdate', 'sic', 'ret', 'retx', 'retadj', 'exchcd',
    'shrcd'
]
DPNI = 1  # How much DPNI to add

###################################################################################################
# first import information from the j5 spreadsheet in order to perform appropriate steps

#import feather
import pyarrow.feather as ft
import pandas as pd
import numpy as np

####################################################################################################
# path for my imac
#path = '/Users/jbryant2/Google Drive File Stream/Shared drives/PlantSynBioLab/Cloning/AFB_epistasis_muts/oligo.feather'
# note: mac doesn't like to have "C:/" in its paths

# path for my windows machine
path = 'C:/Users/jonbr/Documents/GitHub/opentrons/j5_Files/j5__pGP8A-ARF19 domain delete/oligo.feather'

oligos = ft.read_feather(path)
oligos

oligos['ID Number'] = oligos['ID Number'].astype(int)
oligos

######################################################################################################
if len(oligos.columns) < 9:
    oligos['well'] = ''
    oligos['stock primer concentration'] = ''
    oligos['volume of stock primer to add'] = ''
    oligos['concentration of diluted primer'] = ''
    oligos['volume of diluted primer'] = ''  # this is a calculated value
    oligos['how much of the diluted primer is left'] = ''  # also a calculated value
oligos
######################################################################################################
def df_from_molchunk(molchunk_path):
    df = feather.read_feather(molchunk_path)
    # PandasTools.AddMoleculeColumnToFrame(df, smilesCol='smiles')
    return df
def seasonal_refund_func(startDate, endDate):
    # Parameters
    # bin capacity
    max_bin_qty = 20
    # highest bin number used so far
    last_bin_no = 0
    # daily inventory
    daily_invnt_df = pd.DataFrame()
    # daily output data
    daily_results_df = pd.DataFrame()
    # monitored SKU inventory (piece counts)
    monitor_sku_invnt = pd.DataFrame()

    # open the database connection used to store the results
    db = pymysql.connect(host='0.0.0.0', user='******', passwd='root', db='JNBY')
    cursor = db.cursor()
    pymysql.converters.encoders[np.float64] = pymysql.converters.escape_float
    # engine = create_engine(
    #     'mysql+pymysql://root:[email protected]:3306/JY', encoding='utf8')
    engine = create_engine('mysql+pymysql://root:[email protected]:3306/JNBY',
                           encoding='utf8')

    # load the daily orders and inventory
    normal_orders = feather.read_feather(
        '/home/liqi/PythonProjects/JY/B2B_seasonal/normal_sale.feather')
    seasonal_refund = feather.read_feather(
        '/home/liqi/PythonProjects/JY/B2B_seasonal/season_refund_split.feather'
    )
    seasonal_refund.set_index('date', inplace=True)
    normal_orders.set_index('date', inplace=True)
    # print(seasonal_refund.head())

    for date in tqdm.tqdm(pd.date_range(start=startDate, end=endDate)):
        # collected output for this day
        results = [date.strftime('%Y-%m-%d')]
        # unfinished orders
        unsold_df = pd.DataFrame()

        # read the sales and refund rows for the day
        refund = seasonal_refund[seasonal_refund.index == date].copy()
        sales_df = normal_orders[normal_orders.index == date].copy()

        # drop rows that are no longer needed
        if not refund.empty:
            seasonal_refund.drop(index=date, inplace=True)
        if not sales_df.empty:
            normal_orders.drop(index=date, inplace=True)
        refund.reset_index(inplace=True)
        sales_df.reset_index(inplace=True)
        # print(refund)
        # print(sales_df)

        # ==================== inbound (putaway)
        if not refund.empty:
            data = CreateBin(refund, last_bin_no, max_bin_qty)
            refund_df = data.refund
            # print(refund_df)
            last_bin_no = data.last_bin_no
            '''
            Inbound output:
            pieces received: refund_qty
            SKUs received: refund_sku
            bins received: refund_bin
            '''
            refund_qty = refund_df.qty.sum()
            refund_sku = len(set(refund_df['sku']))
            refund_bin = data.bin_num
            results.extend([refund_qty, refund_sku, refund_bin])
            # update inventory
            daily_invnt_df = daily_invnt_df.append(refund_df, ignore_index=True)
            # print(daily_invnt_df.groupby(['sku', 'binNo']).sum())
        else:
            results.extend([0, 0, 0])
        # print(results)

        # ==================== outbound (picking)
        '''
        Outbound output:
        pieces shipped: deleted_qty
        SKUs shipped: deleted_sku
        order lines shipped: deleted_order_line
        orders shipped: deleted_order_num
        bins moved: deleted_bin_num
        average SKUs hit per moved bin: avg_deleted_sku_in_bin
        average pieces hit per moved bin: avg_deleted_qty_in_bin
        bins remaining after shipping: bins_after_sale
        '''
        if daily_invnt_df.empty:
            unsold_df.to_sql(name='unsold orders after seasonal',
                             con=engine,
                             if_exists='append',
                             chunksize=1000,
                             index=None)
            continue
        data = InventoryMatchSales(daily_invnt_df, sales_df)
        daily_invnt_df = data.invnt
        unsold_df = data.unsold
        # output data
        deleted_bin_num = data.move_bin_num
        deleted_order_num = data.sold_orders
        deleted_qty = data.deleted_qty
        deleted_sku = data.deleted_sku
        deleted_order_line = data.deleted_order_line
        bins_after_sale = len(set(daily_invnt_df['binNo']))
        avg_deleted_sku_in_bin = round(data.avg_deleted_sku_in_bin, 2)
        avg_deleted_qty_in_bin = round(data.avg_deleted_qty_in_bin, 2)
        # deleted_invnt_df = data.deleted_invnt
        # store the results
        results.extend([
            deleted_qty, deleted_sku, deleted_order_line, deleted_order_num,
            deleted_bin_num, avg_deleted_sku_in_bin, avg_deleted_qty_in_bin,
            bins_after_sale
        ])
        # print(results)
        # print(daily_invnt_df.groupby(['sku', 'binNo']).sum())

        # ==================== consolidate by style
        '''
        Style consolidation output:
        bins moved out for consolidation: to_merge_type_bin_num
        average bins per style: avg_type_bin_num
        average pieces per style: avg_type_qty_num
        bins returned to stock after consolidation: merged_type_bin_num
        '''
        data = MergeSku(daily_invnt_df, last_bin_no, max_bin_qty, 1)
        to_merge_type_bin_num = data.to_merge_type_bin_num
        avg_bin_per_type_num = round(data.avg_bin_per_type_num, 2)
        avg_qty_per_type_num = round(data.avg_qty_per_type_num, 2)
        merged_type_bin_num = data.merged_type_bin_num
        daily_invnt_df = data.invnt
        results.extend([
            to_merge_type_bin_num, avg_bin_per_type_num, avg_qty_per_type_num,
            merged_type_bin_num
        ])
        # print(results)
        # print(daily_invnt_df.groupby(['sku', 'binNo']).sum())

        # ==================== consolidate by SKU
        '''
        SKU consolidation output:
        number of SKUs consolidated: to_merge_sku_num
        average bins per consolidated SKU: avg_bin_per_sku_num
        average pieces per consolidated SKU: avg_qty_per_sku_num
        bins moved out for SKU consolidation: to_merge_sku_bin_num
        bins returned to stock after SKU consolidation: merge_sku_bin_num
        '''
        # Logic: first drop SKUs whose monitored inventory is zero, then add
        # SKUs with 20 or more pieces to the monitoring list.
        data = MergeSku(daily_invnt_df, last_bin_no, max_bin_qty, 2,
                        monitor_sku_invnt)
        last_bin_no = data.last_bin_no
        daily_invnt_df = data.invnt
        # output data
        to_merge_sku_num = data.to_merge_sku_num
        to_merge_sku_bin_num = data.to_merge_sku_bin_num
        merged_sku_bin_num = data.merged_sku_bin_num
        avg_bin_per_sku_num = round(data.avg_bin_per_sku_num, 2)
        avg_qty_per_sku_num = round(data.avg_qty_per_sku_num, 2)
        monitor_sku_invnt = data.monitor_sku_invnt
        results.extend([
            to_merge_sku_num, avg_bin_per_sku_num, avg_qty_per_sku_num,
            to_merge_sku_bin_num, merged_sku_bin_num
        ])
        # print(results)
        # print(unsold_df)

        daily_results_df = daily_results_df.append([results], ignore_index=True)
        # write unfinished orders to the database
        # unsold_df.to_sql(name='unsold orders after seasonal',
        #                  con=engine,
        #                  if_exists='append',
        #                  chunksize=1000,
        #                  index=None)

    '''
    Output columns:
    pieces received: refund_qty
    SKUs received: refund_sku
    bins received: refund_bin
    pieces shipped: deleted_qty
    SKUs shipped: deleted_sku
    order lines shipped: deleted_order_line
    orders shipped: deleted_order_num
    bins moved: deleted_bin_num
    average SKUs hit per moved bin: avg_deleted_sku_in_bin
    average pieces hit per moved bin: avg_deleted_qty_in_bin
    bins remaining after shipping: bins_after_sale
    bins moved out for style consolidation: to_merge_type_bin_num
    average bins per style: avg_type_bin_num
    average pieces per style: avg_type_qty_num
    bins returned after style consolidation: merged_type_bin_num
    number of SKUs consolidated: to_merge_sku_num
    average bins per consolidated SKU: avg_bin_per_sku_num
    average pieces per consolidated SKU: avg_qty_per_sku_num
    bins moved out for SKU consolidation: to_merge_sku_bin_num
    bins returned after SKU consolidation: merge_sku_bin_num
    '''
    col = [
        'date', 'refund_qty', 'refund_sku', 'refund_bin', 'deleted_qty',
        'deleted_sku', 'deleted_order_line', 'deleted_order_num',
        'deleted_bin_num', 'avg_deleted_sku_in_bin', 'avg_deleted_qty_in_bin',
        'bins_after_sale', 'to_merge_type_bin_num', 'avg_type_bin_num',
        'avg_type_qty_num', 'merged_type_bin_num', 'to_merge_sku_num',
        'avg_bin_per_sku_num', 'avg_qty_per_sku_num', 'to_merge_sku_bin_num',
        'merge_sku_bin_num'
    ]
    daily_results_df.columns = col
    # print(daily_results_df)
    # print(daily_invnt_df)
    daily_results_df.to_sql(name='daily results for B2B seasonal',
                            con=engine,
                            if_exists='append',
                            chunksize=1000,
                            index=None)
    # write the daily inventory detail to the database
    daily_invnt_df.to_sql(name='invnt_after_seasonal_3_5',
                          con=engine,
                          if_exists='append',
                          chunksize=1000,
                          index=None)
def __init__(self, datafile):
    self.datafile = datafile
    self.data = feather.read_feather(source=datafile, nthreads=16)
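# Note on the nthreads= keyword above: per the pandas wrapper at the top of
# this section, pyarrow < 0.11.0 takes nthreads= while newer releases take
# use_threads=. A small compatibility sketch (read_feather_compat is a
# hypothetical helper name, not part of the original code):
import pyarrow.feather as _feather


def read_feather_compat(path, n_threads=16):
    # Try the modern keyword first, then fall back to the legacy one.
    try:
        return _feather.read_feather(path, use_threads=True)
    except TypeError:
        return _feather.read_feather(path, nthreads=n_threads)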
def read_file(self, file_name, has_header=True):
    """Read a matrix of data from file.

    Parameters
    ----------
    file_name : str
        Full path and name of the file.
    has_header : bool, optional
        True if the file has a header; false otherwise. The default is True.

    Returns
    -------
    dict
        Dictionary object containing the matrix ('x') and header ('header').
    """
    arr = None
    header = []
    if not isfile(file_name):
        return None
    if self.get_file_type(file_name) == 'npy':
        arr = np.load(file_name)
        if has_header:
            header = arr[0, :]
            arr = arr[1:len(arr), :]
    elif self.get_file_type(file_name) == 'pkl':
        pd_df = pd.read_pickle(file_name)
        arr = pd_df.to_numpy()
        if has_header:
            header = pd_df.columns
    elif self.get_file_type(file_name) == 'feather':
        pd_df = feather.read_feather(file_name)
        arr = pd_df.to_numpy()
        if has_header:
            header = pd_df.columns
    elif self.get_file_type(file_name) == 'csv' or self.get_file_type(
            file_name) == 'tsv':
        arr = []
        # pick the delimiter before branching so the no-header case can use it too
        delimiter = ','
        if self.get_file_type(file_name) == 'tsv':
            delimiter = '\t'
        if has_header:
            arr = np.loadtxt(file_name, dtype=str, delimiter=delimiter,
                             skiprows=1)
            header = np.loadtxt(file_name, dtype=str, delimiter=delimiter,
                                comments=None, skiprows=0, max_rows=1)
            header[0] = header[0].replace('# ', '')
        else:
            arr = np.loadtxt(file_name, dtype=str, delimiter=delimiter)
    else:
        return None
    if len(header) == 0:
        header = self.get_default_header(arr.shape[1])
    return {'x': arr, 'header': header}
# Since some firms only have annual recordings before the 80s, we need to use annual
# data as the merging benchmark in case some recordings are missing
import pandas as pd
import pickle as pkl
import pyarrow.feather as feather
from pandas.tseries.offsets import *

with open('chars_a_60.feather', 'rb') as f:
    chars_a = feather.read_feather(f)

chars_a = chars_a.dropna(subset=['permno'])
chars_a[['permno', 'gvkey']] = chars_a[['permno', 'gvkey']].astype(int)
chars_a['jdate'] = pd.to_datetime(chars_a['jdate'])
chars_a = chars_a.drop_duplicates(['permno', 'jdate'])

with open('beta.feather', 'rb') as f:
    beta = feather.read_feather(f)

beta['permno'] = beta['permno'].astype(int)
beta['jdate'] = pd.to_datetime(beta['date']) + MonthEnd(0)
beta = beta[['permno', 'jdate', 'beta']]
beta = beta.drop_duplicates(['permno', 'jdate'])
chars_a = pd.merge(chars_a, beta, how='left', on=['permno', 'jdate'])

with open('rvar_capm.feather', 'rb') as f:
    rvar_capm = feather.read_feather(f)

rvar_capm['permno'] = rvar_capm['permno'].astype(int)
rvar_capm['jdate'] = pd.to_datetime(rvar_capm['date']) + MonthEnd(0)
def test_dump_arrow_bytes(df):
    tbl = pa.Table.from_pandas(df)
    out = utils.dump_arrow_bytes(tbl)
    assert isinstance(out, bytes)
    new = feather.read_feather(BytesIO(out))
    pd.testing.assert_frame_equal(df, new)
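# The test above relies on a utils.dump_arrow_bytes helper that is not shown
# here. A minimal sketch of what such a helper could look like (hypothetical
# name and implementation, assuming only that pyarrow's Feather writer accepts
# file-like objects):
from io import BytesIO

import pyarrow.feather as feather


def dump_arrow_bytes_sketch(table):
    # Serialize a pyarrow.Table to Feather bytes via an in-memory buffer.
    buf = BytesIO()
    feather.write_feather(table, buf)
    return buf.getvalue()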
print("vars and data type: ") train.info() return train features = ['click_id', 'ip', 'app', 'device', 'os', 'channel', 'click_time'] int_features = [ 'click_id', 'ip', 'app', 'device', 'os', 'channel', 'hour', 'ip_n', 'app_n', 'ip_hour_count', 'ip_app_hour_count', 'app_channel_hour_count', 'ip_app_os_hour_count' ] time_features = ['click_time', 'attributed_time'] bool_features = ['is_attributed'] ####### READ THE DATA ####### df = pyfa.read_feather(source='data.feather', nthreads=8) #df = read_data('test.csv') df['click_time'] = df.click_time.astype('int64').floordiv(ONE_SECOND).astype( 'int32') ####### GENERATE COMBINED CATEGORY FOR GROUPING ####### # Collapse all categorical features into a single feature imax = df.ip.max() amax = df.app.max() dmax = df.device.max() omax = df.os.max() cmax = df.channel.max() print(imax, amax, dmax, omax, cmax) df['category'] = df.ip.astype('int64') df.drop(['ip'], axis=1, inplace=True) df['category'] *= amax
import sys

import pandas
import pyarrow.feather as feather

feather_file = sys.argv[1]
read_df = feather.read_feather(feather_file)
#import movement.config as CONFIG

data_path = '/data/p_dsi/nba_player_movement/concat_dataframe'
feather_path = '/data/p_dsi/nba_player_movement/concat_dataframe/agg_files'
files = os.listdir(data_path)

if not os.path.exists(feather_path):
    os.makedirs(feather_path)

count = 0
for file in files:
    if '.file' not in file:
        continue
    try:
        df = feather.read_feather('%s/%s' % (data_path, file))
        df_agg = df[(df.EVENTMSGTYPE == 1) | (df.EVENTMSGTYPE == 2)].groupby(
            ['game_id', 'position', 'player_name', 'team_name', 'EVENTMSGTYPE',
             'player_dist_bin']).agg({
                 'spacing_1': 'mean',
                 'spacing_2': 'mean'
             }).reset_index()
        count += 1
        feather.write_feather(df_agg,
                              '%s/%s.file' % (feather_path, 'agg_file' + str(count)))
        print('finished agg_file' + str(count))
    except Exception as e:
        print('Error in loading: ' + str(file) + ' file, Error: ' + str(e))

print('\n')
print('\n')
print('Finished concatenating dataframes for all games.')
print(str(count) + ' games counted')
fingerprint_files_list_train = [
    (dataset_train + '{:01d}'.format(x) + fingerprint_file_ext)
    for x in range(len(fingerprint_file_names_list_train))
]
scores_files_list_train = [
    (dataset_train + '{:01d}'.format(y) + scores_file_ext)
    for y in range(len(fingerprint_file_names_list_train))
]

npz_list_train = []
scores_list_train = []
names_list_train = []
smiles_list_train = []

for count, batch in enumerate(fingerprint_file_names_list_train):
    fingerprints = sparse.load_npz(fingerprint_files_list_train[count])
    df = feather.read_feather(scores_files_list_train[count])
    scores = list(df['scores'])
    smiles = list(df['smiles'])
    names = list(df['names'])
    npz_list_train.append(fingerprints)
    scores_list_train.append(scores)
    names_list_train.append(names)
    smiles_list_train.append(smiles)

flat_sparse_fingerprints_train = sparse.vstack(npz_list_train)
flat_scores_list_train = [
    item for sublist in scores_list_train for item in sublist
]
flat_names_list_train = [
    item for sublist in names_list_train for item in sublist
import numpy as np
import pandas as pd
import lightgbm as lgb
import pyarrow.feather as pyfa
import gc
from sklearn.externals import joblib

train_data = pyfa.read_feather('train_data.feather', nthreads=8)

target = 'is_attributed'
predictors = list(train_data.columns)
predictors.remove(target)
categorical = ['app', 'device', 'os', 'channel', 'hour']

print('Train test split')
y_test = train_data[(train_data.shape[0] - 30000000):train_data.shape[0]][target].values
x_test = train_data[(train_data.shape[0] - 30000000):train_data.shape[0]][predictors].values
y_train = train_data[0:(train_data.shape[0] - 30000000)][target].values
train_data = train_data[0:(train_data.shape[0] - 30000000)][predictors].values
gc.collect()

print('Training')
lgb_estimator = lgb.LGBMClassifier(n_estimators=2000, boosting_type='gbdt',
                                   learning_rate=0.2, num_leaves=31,
                                   max_depth=5, min_child_samples=10000,
                                   objective='binary', scale_pos_weight=200,
                                   subsample=0.5, colsample_bytree=0.7,
                                   min_child_weight=0, subsample_for_bin=200000,
                                   max_bin=100, silent=False)
lgb_estimator.fit(train_data, y_train, eval_set=[(x_test, y_test)],
                  eval_metric='auc', early_stopping_rounds=25, verbose=True,
                  feature_name=predictors,
                  categorical_feature=categorical)

print(lgb_estimator.best_iteration_, lgb_estimator.best_score_)
joblib.dump(lgb_estimator, 'lgb.pkl')
import numpy as np
#from sklearn.model_selection import train_test_split
import pandas as pd
import pyarrow.feather as feather
import os

# os.chdir("C:/Users/ppitera002/Documents/hcdr/hcdr")
filePath = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                        'sample.feather')
df = feather.read_feather(filePath)

# WRONG!!!
#X = df.drop(['TARGET'], axis=1)
#y = df["TARGET"]
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)
#TRAIN = pd.concat([X_train, y_train], axis=1)
#TEST = pd.concat([X_test, y_test], axis=1)

TRAIN = df[df.TARGET.notnull()]
TEST = df[df.TARGET.isnull()]

filePathTRAIN = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                             'TRAIN_sample.feather')
filePathTEST = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                            'TEST_sample.feather')
feather.write_feather(TRAIN, filePathTRAIN)
feather.write_feather(TEST, filePathTEST)
import dash
import dash_html_components as html
import dash_table_experiments as dt
import plotly.graph_objs as go
from textwrap import dedent
from pyarrow import feather
import numpy as np
import pandas as pd

app = dash.Dash()
app.config.supress_callback_exceptions = True

#####load and process data#####
data = feather.read_feather(source='data/aeolus_top5drugs.feather',
                            nthreads=16,
                            columns=['drug_concept_name', 'age_category'])

uniq_drugs = data.drug_concept_name.unique()
uniq_drugs = uniq_drugs[np.argsort(uniq_drugs)]

all_age_cat_counts = (data.groupby(['age_category'])
                      .apply(lambda x: x.shape[0]))
all_age_cat_counts_x = all_age_cat_counts.index.tolist()
all_age_cat_counts_y = all_age_cat_counts.values
all_age_cat_counts_y_norm = np.round(
    (all_age_cat_counts_y / all_age_cat_counts.sum()) * 100, 0)

##########
app.layout = html.Div(children=[
sql1 = '''
CREATE TABLE `kubot_log`.`part_sale_non_sku_B2B_2` (
    `date` varchar(255) NULL,
    `order_no` varchar(255) NULL,
    `sku` varchar(255) NULL,
    `qty` varchar(255) NULL,
    `order_type` varchar(255) NULL
);
'''
cursor.execute(sql1)

# initial inventory
inv = []
inventory = Inventory(inv)

all_orders = feather.read_feather('B2B_orders.feather')
all_refund = feather.read_feather('B2B_normal_refund.feather')

# iterate over each day's sales and refunds
for date in tqdm.tqdm(pd.date_range(start=startDate, end=endDate)):
    non_fund_sale = []
    # outbound order lines from the returns warehouse
    refund_sale = pd.DataFrame()
    # remaining inventory detail in the returns warehouse
    refund_inv = pd.DataFrame()
    # order = all_orders[all_orders.date == date]
    a = date.strftime('%Y-%m-%d')
    order = all_orders[all_orders["date"] == a]
    nowadays_refund = all_refund[all_refund.date == date]
# libs = [l['libraryName'] for l in stats['statistics']]

# use list of specific library names; best if processing local libraries
libs = [
    # INSERT LIBRARIES HERE
]

# ARCHS4 co-expression dataset can be downloaded from the ARCHS4 site
# (https://maayanlab.cloud/archs4/download.html) under the section
# "Gene Correlation"

# extract list of genes that have co-expression data in ARCHS4
with open('archs4_data/archs4_genes.txt', 'r') as f_in:
    archs4_genes = [g.strip() for g in f_in.readlines()]

archs4_df = feather.read_feather('archs4_data/human_correlation_archs4.f')
archs4_df.index = archs4_df.columns


def augment_archs4(geneset):
    '''
    Augment a list of unique genes {geneset} with ARCHS4 co-expression data.
    Sum the Pearson correlation scores of each gene in the ARCHS4 co-expression
    matrix for the genes in {geneset}, excluding the genes already in {geneset},
    and append the top co-expressed genes to {geneset}. Returns new list.
    '''
    # only augment to ~500 genes for efficiency's sake
    if len(geneset) >= 500:
        return geneset
    add_len = 500 - len(geneset)
    os.path.join(dataset + "*" + fingerprint_file_ext))
fingerprint_files_list = [
    (dataset + '{:01d}'.format(x) + fingerprint_file_ext)
    for x in range(len(fingerprint_file_names_list))
]
scores_files_list = [(dataset + '{:01d}'.format(y) + scores_file_ext)
                     for y in range(len(fingerprint_file_names_list))]

npz_list = []
scores_list = []
names_list = []
smiles_list = []

for batch_num in range(300):
    fingerprints = sparse.load_npz(fingerprint_files_list[batch_num])
    df = feather.read_feather(scores_files_list[batch_num])
    scores = list(df['scores'])
    smiles = list(df['smiles'])
    names = list(df['names'])
    npz_list.append(fingerprints)
    scores_list.append(scores)
    names_list.append(names)
    smiles_list.append(smiles)

flat_sparse_fingerprints = sparse.vstack(npz_list)
flat_scores_list = [item for sublist in scores_list for item in sublist]
flat_names_list = [item for sublist in names_list for item in sublist]
flat_smiles_list = [item for sublist in smiles_list for item in sublist]

scores_arry = np.array(scores_list, dtype=np.float16)
np_scores = np.concatenate(scores_arry)
def test_file_not_exist():
    with pytest.raises(pa.ArrowIOError):
        read_feather('test_invalid_file')
def do_var(df, group_cols, counted, agg_type='float32', show_agg=True):
    if show_agg:
        print("Calculating variance of ", counted, " by ", group_cols, '...')
    gp = df.groupby(group_cols)[counted].var().reset_index().rename(
        columns={counted: 'new_var'})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    df.new_var = df.new_var.astype(agg_type)
    gc.collect()
    return df.new_var


test = True
if test == False:
    train_data = pyfa.read_feather(source='data.feather', nthreads=8)
else:
    train_data = pd.read_csv('test.csv')

print('Extracting new features...')
train_data['hour'] = pd.to_datetime(
    train_data.click_time).dt.hour.astype('uint8')
train_data['day'] = pd.to_datetime(
    train_data.click_time).dt.day.astype('uint8')
train_data['minute'] = pd.to_datetime(
    train_data.click_time).dt.minute.astype('uint8')
gc.collect()

print('Calculating unique count...')
train_data['uniq_chan_by_ip'] = do_countuniq(train_data[['ip', 'channel']],
                                             ['ip'], 'channel', 'uint8')
#     `index` varchar(255) NULL,
#     `date` varchar(255) NULL,
#     `order` varchar(255) NULL,
#     `sku` varchar(255) NULL,
#     `qty` varchar(255) NULL,
#     `order_type` varchar(255) NULL
# );
# '''
# cursor.execute(sql1)

# initial inventory
inv = []
inventory = Inventory(inv)
remove_cold_sku_inv = []

all_orders = feather.read_feather('all_orders.feather')
all_refund = feather.read_feather('all_refund.feather')

# iterate over each day's sales and refunds
for date in tqdm.tqdm(pd.date_range(start=startDate, end=endDate)):
    # outbound order lines from the returns warehouse
    refund_sale = pd.DataFrame()
    # remaining inventory detail in the returns warehouse
    refund_inv = pd.DataFrame()
    # cold-SKU relocation detail
    remove_cold_inv = pd.DataFrame()
    # comparison of deduplicated bin counts before and after re-slotting
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pyarrow.feather as pyfa
import lightgbm as lgb
import gc

train_data = pyfa.read_feather('train_data.feather')
test_data = train_data[(train_data.shape[0] - 5000000):train_data.shape[0]]
train_data = train_data[0:(train_data.shape[0] - 5000000)]
gc.collect()

target = 'is_attributed'
predictors = train_data.columns
categorical = ['app', 'device', 'os', 'channel', 'hour']

xgtrain = lgb.Dataset(train_data[predictors].values,
                      label=train_data[target].values,
                      feature_name=predictors,
                      categorical_feature=categorical,
                      free_raw_data=False)
xgtrain.save_binary('train_data.bin')
del train_data
gc.collect()

xgtest = lgb.Dataset(test_data[predictors].values,
                     label=test_data[target].values,
                     feature_name=predictors,
                     categorical_feature=categorical,
                     free_raw_data=False,
                     reference=xgtrain)
xgtest.save_binary('test_data.bin')
del test_data
gc.collect()

lgb_params = {
    'learning_rate': 0.1,
    'boosting_type': 'gbdt',
    'objective': 'binary',