def read_feather(path, use_threads=True):
    """
    Load a feather-format object from the file path

    .. versionadded 0.20.0

    Parameters
    ----------
    path : string file path, or file-like object
    nthreads : int, default 1
        Number of CPU threads to use when reading to pandas.DataFrame

       .. versionadded 0.21.0
       .. deprecated 0.24.0
    use_threads: bool, default True
        Whether to parallelize reading using multiple threads

       .. versionadded 0.24.0

    Returns
    -------
    type of object stored in file

    """

    feather, pyarrow = _try_import()
    path = _stringify_path(path)

    if LooseVersion(pyarrow.__version__) < LooseVersion('0.11.0'):
        int_use_threads = int(use_threads)
        if int_use_threads < 1:
            int_use_threads = 1
        return feather.read_feather(path, nthreads=int_use_threads)

    return feather.read_feather(path, use_threads=bool(use_threads))
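
# A minimal usage sketch of the wrapper above, not part of the original source;
# it assumes pyarrow is installed and uses a placeholder path 'example.feather'.
import pandas as pd

df_demo = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
df_demo.to_feather('example.feather')              # write with pandas
result_demo = read_feather('example.feather', use_threads=True)
assert result_demo.equals(df_demo)                 # the frame round-trips unchanged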
Example #2
    def test_integer_with_nulls(self):
        # pandas requires upcast to float dtype
        path = random_path()
        self.test_files.append(path)

        int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
        num_values = 100

        writer = FeatherWriter()
        writer.open(path)

        null_mask = np.random.randint(0, 10, size=num_values) < 3
        expected_cols = []
        for name in int_dtypes:
            values = np.random.randint(0, 100, size=num_values)
            writer.write_array(name, values, null_mask)

            expected = values.astype('f8')
            expected[null_mask] = np.nan

            expected_cols.append(expected)

        ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)),
                                columns=int_dtypes)

        writer.close()

        result = read_feather(path)
        assert_frame_equal(result, ex_frame)
Example #3
    def test_float_nulls(self):
        num_values = 100

        path = random_path()
        self.test_files.append(path)
        writer = FeatherWriter()
        writer.open(path)

        null_mask = np.random.randint(0, 10, size=num_values) < 3
        dtypes = ['f4', 'f8']
        expected_cols = []
        null_counts = []
        for name in dtypes:
            values = np.random.randn(num_values).astype(name)
            writer.write_array(name, values, null_mask)

            values[null_mask] = np.nan

            expected_cols.append(values)
            null_counts.append(null_mask.sum())

        writer.close()

        ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)),
                                columns=dtypes)

        result = read_feather(path)
        assert_frame_equal(result, ex_frame)
        assert_array_equal(self._get_null_counts(path), null_counts)
Example #4
    def test_filelike_objects(self):
        from io import BytesIO

        buf = BytesIO()

        # the copy makes it non-strided
        df = pd.DataFrame(np.arange(12).reshape(4, 3),
                          columns=['a', 'b', 'c']).copy()
        write_feather(df, buf)

        buf.seek(0)

        result = read_feather(buf)
        assert_frame_equal(result, df)
Example #5
    def test_buffer_bounds_error(self):
        # ARROW-1676
        path = random_path()
        self.test_files.append(path)

        for i in range(16, 256):
            values = pa.array([None] + list(range(i)), type=pa.float64())

            writer = FeatherWriter()
            writer.open(path)

            writer.write_array('arr', values)
            writer.close()

            result = read_feather(path)
            expected = pd.DataFrame({'arr': values.to_pandas()})
            assert_frame_equal(result, expected)

            self._check_pandas_roundtrip(expected, null_counts=[1])
    def _check_pandas_roundtrip(self, df, expected=None, path=None,
                                columns=None, null_counts=None):
        if path is None:
            path = random_path()

        self.test_files.append(path)
        write_feather(df, path)
        if not os.path.exists(path):
            raise Exception('file not written')

        result = read_feather(path, columns)
        if expected is None:
            expected = df

        assert_frame_equal(result, expected)

        if null_counts is None:
            null_counts = np.zeros(len(expected.columns))

        np.testing.assert_array_equal(self._get_null_counts(path, columns),
                                      null_counts)
Example #7
    def test_boolean_nulls(self):
        # pandas requires upcast to object dtype
        path = random_path()
        self.test_files.append(path)

        num_values = 100
        np.random.seed(0)

        writer = FeatherWriter()
        writer.open(path)

        mask = np.random.randint(0, 10, size=num_values) < 3
        values = np.random.randint(0, 10, size=num_values) < 5
        writer.write_array('bools', values, mask)

        expected = values.astype(object)
        expected[mask] = None

        writer.close()

        ex_frame = pd.DataFrame({'bools': expected})

        result = read_feather(path)
        assert_frame_equal(result, ex_frame)
def read_covidactnow(fname):
    return feather.read_feather(fname)
Example #9
#!/usr/bin/env python
# coding: utf-8

# In[270]:

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import math
import pyarrow.feather as feather
import json
import os
import sys

players = feather.read_feather(
    '/data/p_dsi/nba_player_movement/team_player_data/players.file')
teams = feather.read_feather(
    '/data/p_dsi/nba_player_movement/team_player_data/teams.file')
outcomes = feather.read_feather(
    '/data/p_dsi/nba_player_movement/outcome_shots_data/outcomes.file')
shot_distance_percentage = pd.read_csv(
    '/data/p_dsi/nba_player_movement/outcome_shots_data/shot_distance_percentage.csv'
)

data_path = '/data/p_dsi/nba_player_movement/data_feather_file/group1'
feather_path = '/data/p_dsi/nba_player_movement/data_merged_files'
files = os.listdir(data_path)

if not os.path.exists(feather_path):
    os.makedirs(feather_path)
Example #10
import os

import numpy as np
import pyarrow.feather as feather
from lightgbm import LGBMClassifier

full_dataset = True

if full_dataset:
    filePathTRAIN = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                                 'TRAIN_sample.feather')
    filePathTEST = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                                'TEST_sample.feather')
else:
    filePathTRAIN = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                                 'TRAIN_sample_afterFS.feather')
    filePathTEST = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                                'TEST_sample_afterFS.feather')

train = feather.read_feather(filePathTRAIN)
test = feather.read_feather(filePathTEST)

train = train.fillna(0)

y_train = train["TARGET"]

#y_train = y_train.values
x_train = train.drop(["TARGET"], axis=1)

#check if columns contain Inf values and drop them
inf_sum = np.isinf(x_train).sum()
inf_sum = inf_sum.index[np.where(inf_sum > 0)]

x_train.drop(columns=list(inf_sum), inplace=True)
test.drop(columns=list(inf_sum), inplace=True)
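
# The snippet above stops before a model is fit; a hypothetical continuation with
# illustrative (not the author's) parameters might look like this:
model = LGBMClassifier(n_estimators=100, learning_rate=0.1)
model.fit(x_train, y_train)
x_test = test.drop(["TARGET"], axis=1).fillna(0)   # mirror the train-side fillna
pred = model.predict_proba(x_test)[:, 1]           # probability of TARGET == 1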
def convert_to_arrow(my_cccc, in_file_list, out_dir, out_list_file, conf_df, write_location, debug):
    warno = 189
    out_arrows = []
    now = datetime.utcnow()
    create_datetime_list = ['C_', my_cccc, '_', str(now.year).zfill(4), str(now.month).zfill(2), str(now.day).zfill(2), str(now.hour).zfill(2), str(now.minute).zfill(2), str(now.second).zfill(2)]
    create_datetime = ''.join(create_datetime_list)
    cccc_set = set([re.sub('^.*/', '', re.sub('/grib/.*$', '', in_file)) for in_file in in_file_list])
    cat_subcat_set = set([re.search(r'^[^/]*/[^/]*/', re.sub('^.*/grib/', '', in_file)).group().rstrip('/') for in_file in in_file_list])
    for cccc in cccc_set:
        for cat_subcat in cat_subcat_set:
            keys = ['stepRange', 'typeOfLevel', 'level', 'shortName']
            missingValue = -3.402823e+38
            for in_file in in_file_list:
                property_dict = {}
                ft_list = []
                match = re.search(r'^.*/' + cccc + '/grib/' + cat_subcat + '/.*$', in_file)
                if not match:
                    continue
                if not os.access(in_file, os.F_OK):
                    print('Warning', warno, ':', in_file, 'does not exist.', file=sys.stderr)
                    continue
                elif not os.path.isfile(in_file):
                    print('Warning', warno, ':', in_file, 'is not file.', file=sys.stderr)
                    continue
                elif not os.access(in_file, os.R_OK):
                    print('Warning', warno, ':', in_file, 'is not readable.', file=sys.stderr)
                    continue
                dt_str = re.sub('/.*$', '', re.sub('^.*/' + cccc + '/grib/' + cat_subcat + '/', '', in_file))
                with open(in_file, 'r') as in_file_stream:
                    if debug:
                        print('Debug', ':', in_file, file=sys.stderr)
                    try:
                        codes_grib_multi_support_on()
                        iid = codes_index_new_from_file(in_file, keys)
                        key_values_list = []
                        for key in keys:
                            key_values = codes_index_get(iid, key)
                            key_values_list.append(key_values)
                        products = [[]]
                        for key_values in key_values_list:
                            products = [x + [y] for x in products for y in key_values]
                        for product in products:
                            for key_count in range(len(keys)):
                                codes_index_select(iid, keys[key_count], product[key_count])
                            while True:
                                gid = codes_new_from_index(iid)
                                if gid is None:
                                    break
                                codes_set(gid, 'missingValue', missingValue)
                                iterid = codes_keys_iterator_new(gid, 'ls')
                                step_range = None
                                type_of_level = None
                                level = None
                                short_name = None
                                cat = re.sub('/.*$', '', cat_subcat)
                                subcat = re.sub('^.*/', '', cat_subcat)
                                target_conf_df = conf_df[(conf_df['category'] == cat) & (conf_df['subcategory'] == subcat)]
                                while codes_keys_iterator_next(iterid):
                                    key = codes_keys_iterator_get_name(iterid)
                                    if key in keys:
                                        value = codes_get_string(gid, key)
                                        if key == 'stepRange' or key == 'level':
                                            target_conf_df = target_conf_df[(target_conf_df[key] == int(value))]
                                        else:
                                            target_conf_df = target_conf_df[(target_conf_df[key] == value)]
                                codes_keys_iterator_delete(iterid)
                                message_np = np.array([])
                                for conf_row in target_conf_df.itertuples():
                                    ft = codes_get(gid, 'stepRange')
                                    if ft not in ft_list:
                                        ft_list.append(ft)
                                    property_dict[(conf_row.category, conf_row.subcategory, conf_row.stepRange, conf_row.typeOfLevel, conf_row.level, conf_row.shortName, ft)] = np.array(codes_get_values(gid))
                                if write_location:
                                    iterid = codes_grib_iterator_new(gid, 0)
                                    lat_list = []
                                    lon_list = []
                                    while True:
                                        latitude_longitude_value = codes_grib_iterator_next(iterid)
                                        if not latitude_longitude_value:
                                            break
                                        else:
                                            lat_list.append(latitude_longitude_value[0])
                                            if latitude_longitude_value[1] < 180.0:
                                                lon_list.append(latitude_longitude_value[1])
                                            else:
                                                lon_list.append(latitude_longitude_value[1] - 360.0)
                                    codes_grib_iterator_delete(iterid)
                                    out_directory_list = [out_dir, cccc, 'grib_to_arrow', conf_row.category, conf_row.subcategory]
                                    out_directory = '/'.join(out_directory_list)
                                    os.makedirs(out_directory, exist_ok=True)
                                    out_file_list = [out_directory, '/location.feather']
                                    out_file = ''.join(out_file_list)
                                    with open(out_file, 'bw') as out_f:
                                        location_batch = pa.record_batch([pa.array(lat_list, 'float32'), pa.array(lon_list, 'float32')], names=['latitude [degree]', 'longitude [degree]'])
                                        location_table = pa.Table.from_batches([location_batch])
                                        feather.write_feather(location_table, out_f, compression='zstd')
                                codes_release(gid)
                    except Exception:
                        print('Warning', warno, ':', in_file, 'is invalid grib.', file=sys.stderr)
                if len(property_dict) > 0:
                    out_directory_list = [out_dir, cccc, 'grib_to_arrow', conf_row.category, conf_row.subcategory]
                    out_directory = '/'.join(out_directory_list)
                    os.makedirs(out_directory, exist_ok=True)
                    out_file_list = [out_directory, '/location.feather']
                    out_file = ''.join(out_file_list)
                    location_df = feather.read_feather(out_file)
                    dt = datetime(int(dt_str[0:4]), int(dt_str[4:6]), int(dt_str[6:8]), int(dt_str[8:10]), 0, 0, 0, tzinfo=timezone.utc)
                    dt_list = [dt for i in range(0, len(location_df.index))]
                    for ft in ft_list:
                        name_list = ['latitude [degree]', 'longitude [degree]', 'datetime']
                        data_list = [pa.array(location_df['latitude [degree]'].values.tolist(), 'float32'), pa.array(location_df['longitude [degree]'].values.tolist(), 'float32')]
                        data_list.append(pa.array(dt_list, pa.timestamp('ms', tz='utc')))
                        for conf_row in conf_df[(conf_df['category'] == cat) & (conf_df['subcategory'] == subcat)].itertuples():
                            if len(property_dict[(conf_row.category, conf_row.subcategory, conf_row.stepRange, conf_row.typeOfLevel, conf_row.level, conf_row.shortName, ft)]) > 0:
                                if re.match(r'^.*U wind component.*$', conf_row.name):
                                    u_value_np = property_dict[(conf_row.category, conf_row.subcategory, conf_row.stepRange, conf_row.typeOfLevel, conf_row.level, conf_row.shortName, ft)]
                                    v_value_np = property_dict[(conf_row.category, conf_row.subcategory, conf_row.stepRange, conf_row.typeOfLevel, conf_row.level, conf_row.shortName.replace('u', 'v'), ft)]
                                    wind_speed_np = np.sqrt(np.power(u_value_np, 2) + np.power(v_value_np, 2))
                                    wind_direction_np = np.degrees(np.arctan2(v_value_np, u_value_np))
                                    wind_direction_np = np.array([value + 360.0 if value < 0 else value for value in wind_direction_np])
                                    name_list.append(ft + '/' + re.sub(r'U wind component', 'wind speed [m/s]', conf_row.name))
                                    data_list.append(pa.array(np.array(wind_speed_np, dtype=conf_row.datatype)))
                                    name_list.append(ft + '/' + re.sub(r'U wind component', 'wind direction [degree]', conf_row.name))
                                    data_list.append(pa.array(np.array(wind_direction_np, dtype=conf_row.datatype)))
                                elif not re.match(r'^.*V wind component.*$', conf_row.name):
                                    value_list = property_dict[(conf_row.category, conf_row.subcategory, conf_row.stepRange, conf_row.typeOfLevel, conf_row.level, conf_row.shortName, ft)]
                                    name_list.append(ft + '/' + conf_row.name)
                                    data_list.append(pa.array(np.array(value_list, dtype=conf_row.datatype)))
                        out_directory_list = [out_dir, cccc, 'grib_to_arrow', conf_row.category, conf_row.subcategory]
                        out_directory = '/'.join(out_directory_list)
                        os.makedirs(out_directory, exist_ok=True)
                        out_file_list = [out_directory, '/', dt_str, '_', create_datetime, '.feather']
                        out_file = ''.join(out_file_list)
                        with open(out_file, 'bw') as out_f:
                            property_batch = pa.record_batch(data_list, names=name_list)
                            property_table = pa.Table.from_batches([property_batch])
                            feather.write_feather(property_table, out_f, compression='zstd')
                            print(out_file, file=out_list_file)
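
# A hypothetical invocation of convert_to_arrow, not from the original source. The
# CCCC code, paths and configuration values are made up; the conf_df columns simply
# mirror the fields the function filters on.
# conf_df = pd.DataFrame([{
#     'category': 'surface', 'subcategory': 'forecast',
#     'stepRange': 0, 'typeOfLevel': 'surface', 'level': 0,
#     'shortName': '2t', 'name': '2 metre temperature [K]', 'datatype': 'float32'}])
# with open('arrow_file_list.txt', 'w') as out_list_file:
#     convert_to_arrow('RJTD', ['RJTD/grib/surface/forecast/2020010100/file.grib'],
#                      'out', out_list_file, conf_df,
#                      write_location=True, debug=False)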
Example #12
def main(
    args,
    n_timesteps=200,
    prob_willing_min=0.1,
    prob_willing_max=0.9,
    turns_per_day_min=3.5,
    turns_per_day_max=3.5,
    available_machine_time_min=0.5,
    available_machine_time_max=0.5,
    machines_pm_min=4,
    machines_pm_max=55,
    recruitment_per_machine_min=0.2,
    recruitment_per_machine_max=2,
    return_scaling_factor_min=0.5,
    return_scaling_factor_max=2.25,
    return_3rd_scaling_factor_max=1.42,
    return_4th_scaling_factor_max=1.23,
    K=0.44068065575293935,
    lambd=0.02564293675387818,
    K_3rd=0.7004050311562576,
    lambd_3rd=0.03913344618984413,
    K_4th=0.8076846503792819,
    lambd_4th=0.05034356712490996,
    population_sizes={
        'NJ': 8882190,
        'MA': 6892503,
        'IN': 6732219,
        'LA': 4648794,
        'CT': 3565287,
        'MS': 2976149,
        'VA': 8535519,
        'MD': 6045680,
        'NY': 19453561,
        'IL': 12671821,
        'CA': 39512223
    }):

    state, dist, n_sims, chunk_size = parse_args(args)

    run_id = str(np.random.randint(100000, high=999999, size=1)[0])
    state_data = feather.read_feather("./covidactnow_" + state +
                                      "_2020-09-08.feather")
    start_date = min(state_data.date[state_data.discharges > 0])
    end_date = start_date + timedelta(days=n_timesteps - 1)
    discharges = [
        int(i)
        for i in state_data[(state_data.date >= start_date) &
                            (state_data.date <= end_date)].discharges.tolist()
    ]
    n_iterations = len(discharges)
    #print(state_data.head())

    if dist == "empiric":
        donret_2_dist = feather.read_feather(
            "./data/donor_return_2nd_cumprob.feather")
        donret_2_dist_list = [donret_2_dist.copy() for _ in range(n_sims)]
        donret_3_dist = feather.read_feather(
            "./data/donor_return_3rd_cumprob.feather")
        donret_3_dist_list = [donret_3_dist.copy() for _ in range(n_sims)]
        donret_4_dist = feather.read_feather(
            "./data/donor_return_4th_cumprob.feather")
        donret_4_dist_list = [donret_4_dist.copy() for _ in range(n_sims)]

    elif dist == "parametric":
        scaling_factors = np.random.uniform(return_scaling_factor_min,
                                            return_scaling_factor_max,
                                            size=n_sims)
        t = np.linspace(0, 126, 127)
        donret_2_dist_list = []
        for i in range(n_sims):
            p = exp_model_func(t, K, lambd, t0=7, scale=scaling_factors[i])
            donret_2_dist_list.append(
                pd.DataFrame({
                    'time': np.append(t, 9999.0),
                    'prob': np.append(p, 1.0)
                }))
        donret_3_dist = feather.read_feather(
            "./data/donor_return_3rd_cumprob.feather")
        donret_3_dist_list = [donret_3_dist.copy() for _ in range(n_sims)]
        donret_4_dist = feather.read_feather(
            "./data/donor_return_4th_cumprob.feather")
        donret_4_dist_list = [donret_4_dist.copy() for _ in range(n_sims)]
    elif dist == "parametric_all":
        scaling_factors = np.random.uniform(return_scaling_factor_min,
                                            return_scaling_factor_max,
                                            size=n_sims)
        scaling_factors_propmax = (scaling_factors - return_scaling_factor_min) / (
            return_scaling_factor_max - return_scaling_factor_min)
        scaling_factors_3rd = scaling_factors_propmax * (
            return_3rd_scaling_factor_max -
            return_scaling_factor_min) + return_scaling_factor_min
        #print("Scaling 3rd:", min(scaling_factors_3rd), max(scaling_factors_3rd))
        scaling_factors_4th = scaling_factors_propmax * (
            return_4th_scaling_factor_max -
            return_scaling_factor_min) + return_scaling_factor_min
        #print("Scaling 4th:", min(scaling_factors_4th), max(scaling_factors_4th))
        t = np.linspace(0, 126, 127)
        donret_2_dist_list = []
        donret_3_dist_list = []
        donret_4_dist_list = []
        for i in range(n_sims):
            p = exp_model_func(t, K, lambd, t0=7, scale=scaling_factors[i])
            donret_2_dist_list.append(
                pd.DataFrame({
                    'time': np.append(t, 9999.0),
                    'prob': np.append(p, 1.0)
                }))
            p = exp_model_func(t,
                               K_3rd,
                               lambd_3rd,
                               t0=7,
                               scale=scaling_factors_3rd[i])
            donret_3_dist_list.append(
                pd.DataFrame({
                    'time': np.append(t, 9999.0),
                    'prob': np.append(p, 1.0)
                }))
            p = exp_model_func(t,
                               K_4th,
                               lambd_4th,
                               t0=7,
                               scale=scaling_factors_4th[i])
            donret_4_dist_list.append(
                pd.DataFrame({
                    'time': np.append(t, 9999.0),
                    'prob': np.append(p, 1.0)
                }))

    population = population_sizes[state]
    prob_willing = np.random.uniform(prob_willing_min,
                                     prob_willing_max,
                                     size=n_sims)
    machines_pm = np.random.uniform(machines_pm_min,
                                    machines_pm_max,
                                    size=n_sims)
    machines = np.round(machines_pm * (population / 1e6))
    turns_per_day = np.random.uniform(turns_per_day_min,
                                      turns_per_day_max,
                                      size=n_sims)
    available_machine_time = np.random.uniform(available_machine_time_min,
                                               available_machine_time_max,
                                               size=n_sims)
    max_collections = np.round(machines * turns_per_day *
                               available_machine_time).astype(int)
    recruitment_per_machine = np.random.uniform(recruitment_per_machine_min,
                                                recruitment_per_machine_max,
                                                size=n_sims)
    max_recruitment = np.round(machines * recruitment_per_machine).astype(int)

    parameter_list = []
    for i in range(n_sims):
        parameter_list.append({
            "run_id":
            run_id,
            "iterations":
            n_iterations,
            "report_level":
            3,
            "num_agents_init":
            0,
            "recovered":
            discharges,
            "historical_collections":
            None,
            "prob_eligible":
            1,
            "delay_eligibility":
            14,
            "duration_eligibility":
            180,
            "temp_ineligibility_period":
            7,
            "donor_return_dist_type":
            dist,
            "donor_return_second":
            donret_2_dist_list[i],
            "donor_return_second_scale":
            scaling_factors[i]
            if dist == "parametric" or dist == "parametric_all" else None,
            "donor_return_third":
            donret_3_dist_list[i],
            "donor_return_third_scale":
            scaling_factors_3rd[i] if dist == "parametric_all" else None,
            "donor_return_later":
            donret_4_dist_list[i],
            "donor_return_later_scale":
            scaling_factors_4th[i] if dist == "parametric_all" else None,
            "donor_return_prob_col":
            "prob",
            "delay_recruitment":
            0,
            "prob_willing":
            prob_willing[i],
            "prob_male":
            0.5,
            "male_relative_propensity":
            1,
            "max_recruitment":
            int(max_recruitment[i]),
            "recruitment_pm":
            recruitment_per_machine[i],
            "prob_other_deferral":
            0.02,
            "prob_failed_donation":
            0.01,
            "prob_hla_female":
            0.09,
            "prob_tti":
            0.002,
            "prob_Ab_pos":
            0.93,
            "qualify_neut_titers":
            False,
            "prob_neut_above_360":
            None,
            "max_collections":
            int(max_collections[i]),
            "mean_units_per_collection":
            turns_per_day[i],
            "max_collection_growth":
            9999,  # essentially infinite
            "recruitment_start":
            0,
            "collection_start":
            0,
            "machines_pm":
            int(machines_pm[i]),
            "available_machine_time":
            available_machine_time[i]
        })

    n_chunks = n_sims // chunk_size
    iter_start = 0
    for j in range(n_chunks):
        simset = "statesim_" + state + "_" + dist + "_" + run_id + "_" + str(
            j).zfill(3)
        [
            p.update({"chunk_id": j})
            for p in parameter_list[iter_start:(iter_start + chunk_size)]
        ]
        csim.multi_threaded_run(
            parameter_list=parameter_list[iter_start:(iter_start +
                                                      chunk_size)],
            simulations=chunk_size,
            root_seed=None,
            processes=20,
            simset_name=simset,
            output_report=True,
            output_agents=True,
            output_parameters=True,
            report_dir="./outputs/",
            return_results=False)
        gc.collect()
        iter_start += chunk_size
import pickle as pkl
import pyarrow.feather as feather
import pandas as pd

# with open('chars60_raw_imputed.feather', 'rb') as f:
#     chars = feather.read_feather(f)

with open('chars60_rank_imputed.feather', 'rb') as f:
    chars = feather.read_feather(f)

print(chars.columns.values)

chars['date'] = pd.to_datetime(chars['date'])
chars['year'] = chars['date'].dt.year
chars_1970s = chars[chars['year'] < 1980]
chars_1980s = chars[(chars['year'] >= 1980) & (chars['year'] < 1990)]
chars_1990s = chars[(chars['year'] >= 1990) & (chars['year'] < 2000)]
chars_2000s = chars[(chars['year'] >= 2000) & (chars['year'] < 2010)]
chars_2010s = chars[(chars['year'] >= 2010) & (chars['year'] < 2020)]
chars_2020s = chars[(chars['year'] >= 2020) & (chars['year'] < 2030)]

# raw
# chars_1970s.to_csv('chars60_raw_1970s.csv', index=0)
# chars_1980s.to_csv('chars60_raw_1980s.csv', index=0)
# chars_1990s.to_csv('chars60_raw_1990s.csv', index=0)
# chars_2000s.to_csv('chars60_raw_2000s.csv', index=0)
# chars_2010s.to_csv('chars60_raw_2010s.csv', index=0)

# rank
chars_1970s.to_csv('chars60_rank_1970s.csv', index=0)
chars_1980s.to_csv('chars60_rank_1980s.csv', index=0)
Example #14
def updateEMA(Client, symbol, emas, interval):

    First = True

    #check files and directories
    filename = f"{Client.MAIN_PATH}/data/candles/{interval}"
    filename2 = f"{Client.MAIN_PATH}/data/ema/{interval}"

    if not os.path.exists(filename):
        os.makedirs(filename)
    
    if not os.path.exists(filename2):
        os.makedirs(filename2)
    
    fn = f"{filename}/{symbol}.feather"
    fn2 = f"{filename2}/{symbol}.feather"

    if os.path.isfile(fn):
        df_cdl = feather.read_feather(fn)

        if not df_cdl["OpenTime"].empty:
        
            if os.path.isfile(fn2):
                df_ema = feather.read_feather(fn2)

                if not df_ema.empty:
                    First = False
                    ema_lastDate = df_ema.iloc[-1]
                    df_add = df_cdl.loc[df_cdl["OpenTime"] > ema_lastDate["OpenTime"], ["OpenTime", "ClosePrice"]]

                    if not df_add.empty:
                        new_lst = []
                        for num in emas:
                            name = f"EMA{num}"
                            prevMA = ema_lastDate[name]
                            
                            if not math.isnan(prevMA):
                                temp_lst = []

                                for close in df_add["ClosePrice"]:
                                    prevMA = close * (2/(num+1)) + prevMA * (1 - (2/(num+1)))
                                    temp_lst.append(prevMA)

                                df_add[name] = temp_lst
                            else:
                                df_ema2 = df_cdl.loc[:,["OpenTime", "ClosePrice"]]

                                new_lst.append((name, abstract.EMA(df_ema2["ClosePrice"], timeperiod=num)))

                        df_ema = df_ema.append(df_add, ignore_index=True)

                        for name, value in new_lst:
                            df_ema[name] = value

                        feather.write_feather(df_ema, fn2)
            
            if First:
                df_ema2 = df_cdl.loc[:,["OpenTime", "ClosePrice"]]
            
                if not df_cdl.empty:
                    for num in emas:
                        name = f"EMA{num}"
                        df_ema2[name] = abstract.EMA(df_ema2["ClosePrice"], timeperiod=num)

                feather.write_feather(df_ema2, fn2)
        else:
            print("file is empty")
Example #15
# Feature Importance with Extra Trees Classifier
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
import os
import pyarrow.feather as feather

filePathTRAIN = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                             'TRAIN_sample.feather')
filePathTEST = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                            'TEST_sample.feather')
df = feather.read_feather(filePathTRAIN)
df1 = df.drop("index", axis=1)
df1 = df1.fillna(0)
colnames = df1.columns.values
dfY = df1["TARGET"]
Y = dfY.values
dfX = df1.drop(["TARGET"], axis=1)

#check if columns contain Inf values and drop them
inf_sum = np.isinf(dfX).sum()
inf_sum = inf_sum.index[np.where(inf_sum > 0)]
dfX.drop(columns=list(inf_sum), inplace=True)

X = dfX.values

model = ExtraTreesClassifier()
model.fit(X, Y)

# print(model.feature_importances_)
list_imp = model.feature_importances_
Example #16
import pandas as pd
import pickle as pkl
import pyarrow.feather as feather
import numpy as np
from tqdm import tqdm
from functions import *

####################
#    All Stocks    #
####################
with open('chars_q_raw.feather', 'rb') as f:
    chars_q = feather.read_feather(f)

chars_q = chars_q.dropna(subset=['permno'])
chars_q[['permno', 'gvkey']] = chars_q[['permno', 'gvkey']].astype(int)
chars_q['jdate'] = pd.to_datetime(chars_q['jdate'])
chars_q = chars_q.drop_duplicates(['permno', 'jdate'])

with open('chars_a_raw.feather', 'rb') as f:
    chars_a = feather.read_feather(f)

chars_a = chars_a.dropna(subset=['permno'])
chars_a[['permno', 'gvkey']] = chars_a[['permno', 'gvkey']].astype(int)
chars_a['jdate'] = pd.to_datetime(chars_a['jdate'])
chars_a = chars_a.drop_duplicates(['permno', 'jdate'])

# information list
obs_var_list = [
    'gvkey', 'permno', 'jdate', 'sic', 'ret', 'retx', 'retadj', 'exchcd',
    'shrcd'
]
DPNI = 1  #How much DPNI to add
###################################################################################################
#first import information from the j5 spreadsheet in order to perform appropriate steps
#import feather
import pyarrow.feather as ft
import pandas as pd
import numpy as np
####################################################################################################
#path for my imac
#path = '/Users/jbryant2/Google Drive File Stream/Shared drives/PlantSynBioLab/Cloning/AFB_epistasis_muts/oligo.feather'
#note: mac doesn't like to have "C:/" in its paths

#path for my windows machine
path = 'C:/Users/jonbr/Documents/GitHub/opentrons/j5_Files/j5__pGP8A-ARF19 domain delete/oligo.feather'

oligos = ft.read_feather(path)
oligos

oligos['ID Number'] = oligos['ID Number'].astype(int)
oligos
######################################################################################################
if len(oligos.columns) < 9:
    oligos['well'] = ''
    oligos['stock primer concentration'] = ''
    oligos['volume of stock primer to add'] = ''
    oligos['concentration of diluted primer'] = ''
    oligos['volume of diluted primer'] = ''  #this is a calculated value
    oligos[
        'how much of the diluted primer is left'] = ''  #also a calculated value
oligos
######################################################################################################
def df_from_molchunk(molchunk_path):
    df = feather.read_feather(molchunk_path)
    # PandasTools.AddMoleculeColumnToFrame(df, smilesCol='smiles')
    return df
Example #19
def seasonal_refund_func(startDate, endDate):
    # Set parameters
    # Bin capacity
    max_bin_qty = 20
    # Highest bin number assigned so far
    last_bin_no = 0
    # Daily inventory
    daily_invnt_df = pd.DataFrame()
    # Daily output data
    daily_results_df = pd.DataFrame()
    # Inventory of monitored SKUs
    monitor_sku_invnt = pd.DataFrame()

    # Open a database connection to store the results
    db = pymysql.connect(host='0.0.0.0', user='******', passwd='root', db='JNBY')
    cursor = db.cursor()
    pymysql.converters.encoders[np.float64] = pymysql.converters.escape_float
    # engine = create_engine(
    #     'mysql+pymysql://root:[email protected]:3306/JY', encoding='utf8')
    engine = create_engine('mysql+pymysql://root:[email protected]:3306/JNBY',
                           encoding='utf8')

    # Query daily orders and inventory from the database
    normal_orders = feather.read_feather(
        '/home/liqi/PythonProjects/JY/B2B_seasonal/normal_sale.feather')
    seasonal_refund = feather.read_feather(
        '/home/liqi/PythonProjects/JY/B2B_seasonal/season_refund_split.feather'
    )
    seasonal_refund.set_index('date', inplace=True)
    normal_orders.set_index('date', inplace=True)
    # print(seasonal_refund.head())

    for date in tqdm.tqdm(pd.date_range(start=startDate, end=endDate)):
        # Record the output results
        results = [date.strftime('%Y-%m-%d')]
        # Unfulfilled orders
        unsold_df = pd.DataFrame()
        # Read the current day's sales and refund data
        refund = seasonal_refund[seasonal_refund.index == date].copy()
        sales_df = normal_orders[normal_orders.index == date].copy()
        # Drop records that are no longer needed
        if not refund.empty:
            seasonal_refund.drop(index=date, inplace=True)
        if not sales_df.empty:
            normal_orders.drop(index=date, inplace=True)
        refund.reset_index(inplace=True)
        sales_df.reset_index(inplace=True)
        # print(refund)
        # print(sales_df)
        #==================== Inbound (receiving)
        if not refund.empty:
            data = CreateBin(refund, last_bin_no, max_bin_qty)
            refund_df = data.refund
            # print(refund_df)
            last_bin_no = data.last_bin_no
            ''' Inbound outputs:
            units received: refund_qty
            SKUs received: refund_sku
            bins received: refund_bin
            '''
            refund_qty = refund_df.qty.sum()
            refund_sku = len(set(refund_df['sku']))
            refund_bin = data.bin_num
            results.extend([refund_qty, refund_sku, refund_bin])
            # Update inventory
            daily_invnt_df = daily_invnt_df.append(refund_df, ignore_index=True)
            # print(daily_invnt_df.groupby(['sku', 'binNo']).sum())
        else:
            results.extend([0, 0, 0])
        # print(results)

        #==================== Outbound (picking)
        ''' Outbound outputs:
        units shipped: deleted_qty
        SKUs shipped: deleted_sku
        order lines shipped: deleted_order_line
        orders shipped: deleted_order_num
        bins moved for shipping: deleted_bin_num
        average SKUs hit per outbound bin: avg_deleted_sku_in_bin
        average units hit per outbound bin: avg_deleted_qty_in_bin
        bins remaining after shipping: bins_after_sale
        '''
        if daily_invnt_df.empty:
            unsold_df.to_sql(name='unsold orders after seasonal',
                         con=engine,
                         if_exists='append',
                         chunksize=1000,
                         index=None)
            continue
        data = InventoryMatchSales(daily_invnt_df, sales_df)
        daily_invnt_df = data.invnt
        unsold_df = data.unsold
        # Output data
        deleted_bin_num = data.move_bin_num
        deleted_order_num = data.sold_orders
        deleted_qty = data.deleted_qty
        deleted_sku = data.deleted_sku
        deleted_order_line = data.deleted_order_line
        bins_after_sale = len(set(daily_invnt_df['binNo']))
        avg_deleted_sku_in_bin = round(data.avg_deleted_sku_in_bin, 2)
        avg_deleted_qty_in_bin = round(data.avg_deleted_qty_in_bin, 2)
        # deleted_invnt_df = data.deleted_invnt

        # Store data
        results.extend([
            deleted_qty, deleted_sku, deleted_order_line,
            deleted_order_num, deleted_bin_num, avg_deleted_sku_in_bin,
            avg_deleted_qty_in_bin, bins_after_sale
        ])
        # print(results)
        # print(daily_invnt_df.groupby(['sku', 'binNo']).sum())

        #==================== Consolidation by style
        ''' Style-consolidation outputs:
        bins moved out for style consolidation: to_merge_type_bin_num
        average bins occupied per style: avg_type_bin_num
        average units per style: avg_type_qty_num
        bins returned to stock after style consolidation: merged_type_bin_num
        '''
        data = MergeSku(daily_invnt_df, last_bin_no, max_bin_qty, 1)
        to_merge_type_bin_num = data.to_merge_type_bin_num
        avg_bin_per_type_num = round(data.avg_bin_per_type_num, 2)
        avg_qty_per_type_num = round(data.avg_qty_per_type_num, 2)
        merged_type_bin_num = data.merged_type_bin_num
        daily_invnt_df = data.invnt
        results.extend([
            to_merge_type_bin_num, avg_bin_per_type_num, avg_qty_per_type_num,
            merged_type_bin_num
        ])
        # print(results)
        # print(daily_invnt_df.groupby(['sku', 'binNo']).sum())

        #==================== Consolidation by SKU
        ''' SKU-consolidation outputs:
        SKUs consolidated: to_merge_sku_num
        average bins occupied per consolidated SKU: avg_bin_per_sku_num
        average units per consolidated SKU: avg_qty_per_sku_num
        bins moved out for SKU consolidation: to_merge_sku_bin_num
        bins returned to stock after SKU consolidation: merge_sku_bin_num
        '''

        # Logic: first remove SKUs whose inventory has dropped to 0 from the monitor
        # list, then add SKUs with 20 or more units to the monitor list
        data = MergeSku(daily_invnt_df, last_bin_no, max_bin_qty, 2, monitor_sku_invnt)
        last_bin_no = data.last_bin_no
        daily_invnt_df = data.invnt
        # Data needed for the output
        to_merge_sku_num = data.to_merge_sku_num
        to_merge_sku_bin_num = data.to_merge_sku_bin_num
        merged_sku_bin_num = data.merged_sku_bin_num
        avg_bin_per_sku_num = round(data.avg_bin_per_sku_num, 2)
        avg_qty_per_sku_num = round(data.avg_qty_per_sku_num, 2)
        monitor_sku_invnt = data.monitor_sku_invnt

        results.extend([
            to_merge_sku_num, avg_bin_per_sku_num, avg_qty_per_sku_num,
            to_merge_sku_bin_num, merged_sku_bin_num
        ])
        # print(results)
        # print(unsold_df)
        daily_results_df = daily_results_df.append([results],
                                                   ignore_index=True)
        
        

        # Write unfulfilled orders to the database
        # unsold_df.to_sql(name='unsold orders after seasonal',
        #                  con=engine,
        #                  if_exists='append',
        #                  chunksize=1000,
        #                  index=None)
    '''Output columns:
    units received: refund_qty
    SKUs received: refund_sku
    bins received: refund_bin
    units shipped: deleted_qty
    SKUs shipped: deleted_sku
    order lines shipped: deleted_order_line
    orders shipped: deleted_order_num
    bins moved for shipping: deleted_bin_num
    average SKUs hit per outbound bin: avg_deleted_sku_in_bin
    average units hit per outbound bin: avg_deleted_qty_in_bin
    bins remaining after shipping: bins_after_sale
    bins moved out for style consolidation: to_merge_type_bin_num
    average bins occupied per style: avg_type_bin_num
    average units per style: avg_type_qty_num
    bins returned to stock after style consolidation: merged_type_bin_num
    SKUs consolidated: to_merge_sku_num
    average bins occupied per consolidated SKU: avg_bin_per_sku_num
    average units per consolidated SKU: avg_qty_per_sku_num
    bins moved out for SKU consolidation: to_merge_sku_bin_num
    bins returned to stock after SKU consolidation: merge_sku_bin_num
    '''
    col = [
        'date', 'refund_qty', 'refund_sku', 'refund_bin', 'deleted_qty',
        'deleted_sku', 'deleted_order_line', 'deleted_order_num',
        'deleted_bin_num', 'avg_deleted_sku_in_bin', 'avg_deleted_qty_in_bin',
        'bins_after_sale', 'to_merge_type_bin_num', 'avg_type_bin_num',
        'avg_type_qty_num', 'merged_type_bin_num', 'to_merge_sku_num',
        'avg_bin_per_sku_num', 'avg_qty_per_sku_num', 'to_merge_sku_bin_num',
        'merge_sku_bin_num'
    ]
    daily_results_df.columns = col
    # print(daily_results_df)
    # print(daily_invnt_df)
    daily_results_df.to_sql(name='daily results for B2B seasonal',
                            con=engine,
                            if_exists='append',
                            chunksize=1000,
                            index=None)
    # Write the daily inventory detail to the database
    daily_invnt_df.to_sql(name='invnt_after_seasonal_3_5',
                            con=engine,
                            if_exists='append',
                            chunksize=1000,
                            index=None)
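
# Hypothetical invocation (not in the original source); the database connection and
# feather paths hard-coded inside the function are environment-specific.
# seasonal_refund_func('2019-03-01', '2019-05-31')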
Example #20
    def __init__(self, datafile):
        self.datafile = datafile
        self.data = feather.read_feather(source=datafile, nthreads=16)
Example #21
    def read_file(self, file_name, has_header=True):
        """Read a matrix of data from file.

        Parameters
        ----------
        file_name : str
            Full path and name of the file.
        has_header : bool, optional
            True if the file has a header; false otherwise. The default is True.

        Returns
        -------
        dict
            Dictionary object containing the matrix ('x') and header ('header').

        """

        arr = None
        header = []

        if not isfile(file_name):
            return None

        if self.get_file_type(file_name) == 'npy':
            arr = np.load(file_name)
            if has_header:
                header = arr[0, :]
                arr = arr[1:len(arr), :]

        elif self.get_file_type(file_name) == 'pkl':
            pd_df = pd.read_pickle(file_name)
            arr = pd_df.to_numpy()
            if has_header:
                header = pd_df.columns

        elif self.get_file_type(file_name) == 'feather':
            pd_df = feather.read_feather(file_name)
            arr = pd_df.to_numpy()
            if has_header:
                header = pd_df.columns

        elif self.get_file_type(file_name) == 'csv' or self.get_file_type(
                file_name) == 'tsv':

            # determine the delimiter up front so the no-header branch can use it too
            delimiter = ','
            if self.get_file_type(file_name) == 'tsv':
                delimiter = '\t'

            arr = []
            if has_header:
                arr = np.loadtxt(file_name,
                                 dtype=str,
                                 delimiter=delimiter,
                                 skiprows=1)

                header = np.loadtxt(file_name,
                                    dtype=str,
                                    delimiter=delimiter,
                                    comments=None,
                                    skiprows=0,
                                    max_rows=1)
                header[0] = header[0].replace('# ', '')
            else:
                arr = np.loadtxt(file_name, dtype=str, delimiter=delimiter)

        else:
            return None

        if len(header) == 0:
            header = self.get_default_header(arr.shape[1])

        return {'x': arr, 'header': header}
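
# A hypothetical usage sketch, not from the original source: 'DataReader' stands in
# for whatever class the method above belongs to, and 'matrix.feather' is a
# placeholder path.
# reader = DataReader()
# out = reader.read_file('matrix.feather', has_header=True)
# if out is not None:
#     arr, header = out['x'], out['header']
#     print(arr.shape, list(header))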
# Since some firms only have annual records before the 1980s, we need to use annual data
# as the merging benchmark in case some records are missing

import pandas as pd
import pickle as pkl
import pyarrow.feather as feather
from pandas.tseries.offsets import *

with open('chars_a_60.feather', 'rb') as f:
    chars_a = feather.read_feather(f)

chars_a = chars_a.dropna(subset=['permno'])
chars_a[['permno', 'gvkey']] = chars_a[['permno', 'gvkey']].astype(int)
chars_a['jdate'] = pd.to_datetime(chars_a['jdate'])
chars_a = chars_a.drop_duplicates(['permno', 'jdate'])

with open('beta.feather', 'rb') as f:
    beta = feather.read_feather(f)

beta['permno'] = beta['permno'].astype(int)
beta['jdate'] = pd.to_datetime(beta['date']) + MonthEnd(0)
beta = beta[['permno', 'jdate', 'beta']]
beta = beta.drop_duplicates(['permno', 'jdate'])

chars_a = pd.merge(chars_a, beta, how='left', on=['permno', 'jdate'])

with open('rvar_capm.feather', 'rb') as f:
    rvar_capm = feather.read_feather(f)

rvar_capm['permno'] = rvar_capm['permno'].astype(int)
rvar_capm['jdate'] = pd.to_datetime(rvar_capm['date']) + MonthEnd(0)
Example #23
def test_dump_arrow_bytes(df):
    tbl = pa.Table.from_pandas(df)
    out = utils.dump_arrow_bytes(tbl)
    assert isinstance(out, bytes)
    new = feather.read_feather(BytesIO(out))
    pd.testing.assert_frame_equal(df, new)
    print("vars and data type: ")
    train.info()
    return train


features = ['click_id', 'ip', 'app', 'device', 'os', 'channel', 'click_time']
int_features = [
    'click_id', 'ip', 'app', 'device', 'os', 'channel', 'hour', 'ip_n',
    'app_n', 'ip_hour_count', 'ip_app_hour_count', 'app_channel_hour_count',
    'ip_app_os_hour_count'
]
time_features = ['click_time', 'attributed_time']
bool_features = ['is_attributed']

#######  READ THE DATA  #######
df = pyfa.read_feather(source='data.feather', nthreads=8)
#df = read_data('test.csv')
df['click_time'] = df.click_time.astype('int64').floordiv(ONE_SECOND).astype(
    'int32')

#######  GENERATE COMBINED CATEGORY FOR GROUPING  #######
# Collapse all categorical features into a single feature
imax = df.ip.max()
amax = df.app.max()
dmax = df.device.max()
omax = df.os.max()
cmax = df.channel.max()
print(imax, amax, dmax, omax, cmax)
df['category'] = df.ip.astype('int64')
df.drop(['ip'], axis=1, inplace=True)
df['category'] *= amax
Example #25
import sys
import pandas
import pyarrow.feather as feather

feather_file = sys.argv[1]
read_df = feather.read_feather(feather_file)
Example #26
import os

import pyarrow.feather as feather

#import movement.config as CONFIG

data_path = '/data/p_dsi/nba_player_movement/concat_dataframe'
feather_path = '/data/p_dsi/nba_player_movement/concat_dataframe/agg_files'
files = os.listdir(data_path)

if not os.path.exists(feather_path):
    os.makedirs(feather_path)

count = 0

for file in files:
    if '.file' not in file:
        continue
    try:
        df = feather.read_feather('%s/%s' % (data_path, file))
        df_agg = df[(df.EVENTMSGTYPE == 1) | (df.EVENTMSGTYPE == 2)].groupby(['game_id','position','player_name','team_name','EVENTMSGTYPE','player_dist_bin']).agg({'spacing_1':'mean', 'spacing_2':'mean'}).reset_index()
        
        count += 1

        feather.write_feather(df_agg, '%s/%s.file' % (feather_path, 'agg_file'+str(count)))

        print('finished agg_file' + str(count))

    except Exception as e:
        print('Error in loading: ' + str(file) + ' file, Error: ' + str(e))

print('\n')
print('\n')
print('Finished concatenating dataframes for all games.')
print(str(count) + ' games counted')
Example #27
    fingerprint_files_list_train = [
        (dataset_train + '{:01d}'.format(x) + fingerprint_file_ext)
        for x in range(len(fingerprint_file_names_list_train))
    ]
    scores_files_list_train = [
        (dataset_train + '{:01d}'.format(y) + scores_file_ext)
        for y in range(len(fingerprint_file_names_list_train))
    ]

    npz_list_train = []
    scores_list_train = []
    names_list_train = []
    smiles_list_train = []
    for count, batch in enumerate(fingerprint_file_names_list_train):
        fingerprints = sparse.load_npz(fingerprint_files_list_train[count])
        df = feather.read_feather(scores_files_list_train[count])
        scores = list(df['scores'])
        smiles = list(df['smiles'])
        names = list(df['names'])
        npz_list_train.append(fingerprints)
        scores_list_train.append(scores)
        names_list_train.append(names)
        smiles_list_train.append(smiles)

    flat_sparse_fingerprints_train = sparse.vstack(npz_list_train)

    flat_scores_list_train = [
        item for sublist in scores_list_train for item in sublist
    ]
    flat_names_list_train = [
        item for sublist in names_list_train for item in sublist
Example #28
import numpy as np
import pandas as pd
import lightgbm as lgb
import pyarrow.feather as pyfa
import gc
from sklearn.externals import joblib

train_data = pyfa.read_feather('train_data.feather',nthreads=8)

target = 'is_attributed'
predictors = list(train_data.columns)
predictors.remove(target)
categorical = ['app', 'device', 'os', 'channel', 'hour']

print('Train test split')
y_test = train_data[(train_data.shape[0] - 30000000):train_data.shape[0]][target].values
x_test = train_data[(train_data.shape[0] - 30000000):train_data.shape[0]][predictors].values
y_train = train_data[0:(train_data.shape[0] - 30000000)][target].values
train_data = train_data[0:(train_data.shape[0] - 30000000)][predictors].values
gc.collect()

print('Training')
lgb_estimator = lgb.LGBMClassifier(n_estimators=2000,boosting_type='gbdt',learning_rate=0.2,num_leaves = 31, max_depth = 5,min_child_samples = 10000,
                                    objective='binary',scale_pos_weight=200,subsample=0.5,colsample_bytree=0.7,min_child_weight=0,subsample_for_bin=200000,
                                    max_bin=100,silent = False)

lgb_estimator.fit(train_data, y_train,eval_set=[(x_test,y_test)],eval_metric='auc',early_stopping_rounds=25,verbose=True,feature_name=predictors,categorical_feature=categorical)

print(lgb_estimator.best_iteration_, lgb_estimator.best_score_)

joblib.dump(lgb_estimator, 'lgb.pkl')
Example #29
import numpy as np
#from sklearn.model_selection import train_test_split
import pandas as pd
import pyarrow.feather as feather
import os

# os.chdir("C:/Users/ppitera002/Documents/hcdr/hcdr")

filePath = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                        'sample.feather')
df = feather.read_feather(filePath)

# WRONG!!!
#X = df.drop(['TARGET'], axis=1)
#y = df["TARGET"]
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)
#TRAIN = pd.concat([X_train, y_train], axis=1)
#TEST = pd.concat([X_test, y_test], axis=1)

TRAIN = df[df.TARGET.notnull()]
TEST = df[df.TARGET.isnull()]

filePathTRAIN = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                             'TRAIN_sample.feather')
filePathTEST = os.path.join(os.getcwd(), '2_data_preparation', 'features',
                            'TEST_sample.feather')

feather.write_feather(TRAIN, filePathTRAIN)
feather.write_feather(TEST, filePathTEST)
Example #30
import dash
import dash_html_components as html
import dash_table_experiments as dt
import plotly.graph_objs as go

from textwrap import dedent
from pyarrow import feather
import numpy as np
import pandas as pd

app = dash.Dash()
app.config.supress_callback_exceptions = True

#####load and process data#####

data = feather.read_feather(source='data/aeolus_top5drugs.feather',
                            nthreads=16,
                            columns=['drug_concept_name', 'age_category'])

uniq_drugs = data.drug_concept_name.unique()
uniq_drugs = uniq_drugs[np.argsort(uniq_drugs)]

all_age_cat_counts = (data.groupby(['age_category'
                                    ]).apply(lambda x: x.shape[0]))
all_age_cat_counts_x = all_age_cat_counts.index.tolist()
all_age_cat_counts_y = all_age_cat_counts.values
all_age_cat_counts_y_norm = np.round(
    (all_age_cat_counts_y / all_age_cat_counts.sum()) * 100, 0)

##########

app.layout = html.Div(children=[
sql1 = '''
CREATE TABLE `kubot_log`.`part_sale_non_sku_B2B_2`  (
  `date` varchar(255) NULL,
  `order_no` varchar(255) NULL,
  `sku` varchar(255) NULL,
  `qty` varchar(255) NULL,
  `order_type` varchar(255) NULL
);
'''
cursor.execute(sql1)

# Initial inventory
inv = []
inventory = Inventory(inv)

all_orders = feather.read_feather('B2B_orders.feather')
all_refund = feather.read_feather('B2B_normal_refund.feather')

# Iterate over each day's sales and refunds
for date in tqdm.tqdm(pd.date_range(start=startDate, end=endDate)):
    non_fund_sale = []

    # Order lines shipped from the returns warehouse
    refund_sale = pd.DataFrame()

    # Remaining inventory detail in the returns warehouse
    refund_inv = pd.DataFrame()
    # order = all_orders[all_orders.date == date]
    a = date.strftime('%Y-%m-%d')
    order = all_orders[all_orders["date"] == a]
    nowadays_refund = all_refund[all_refund.date == date]
#     libs = [l['libraryName']) for l in stats['statistics']]

# use list of specific library names; best if processing local libraries
libs = [
    # INSERT LIBRARIES HERE
]

# ARCHS4 co-expression dataset can be downloaded from the ARCHS4 site
# (https://maayanlab.cloud/archs4/download.html) under the section
# "Gene Correlation"

# extract list of genes that have co-expression data in ARCHS4
with open('archs4_data/archs4_genes.txt', 'r') as f_in:
    archs4_genes = [g.strip() for g in f_in.readlines()]

archs4_df = feather.read_feather('archs4_data/human_correlation_archs4.f')
archs4_df.index = archs4_df.columns


def augment_archs4(geneset):
    '''
    Augment a list of unique genes {geneset} with ARCHS4 co-expression data. 
    Sum the Pearson correlation scores of each gene in ARCHS4 co-expression 
    matrix for the genes in {geneset}, excluding the genes already in {geneset},
    and append the top co-expressed genes to {geneset}. Returns new list. 
    '''
    # only augment to ~500 genes for efficiency's sake
    if len(geneset) >= 500:
        return geneset
    add_len = 500 - len(geneset)
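
# The body above is cut off; below is a rough sketch (not the author's code) of the
# augmentation the docstring describes, assuming archs4_df is a square gene-by-gene
# Pearson correlation DataFrame. The helper name and corr_df argument are placeholders.
def _augment_archs4_sketch(geneset, corr_df):
    if len(geneset) >= 500:
        return list(geneset)
    add_len = 500 - len(geneset)
    present = [g for g in geneset if g in corr_df.columns]
    # sum each candidate gene's correlation against the query set, drop the query
    # genes themselves, and append the top-scoring candidates
    summed = corr_df.loc[present].sum(axis=0).drop(labels=present, errors='ignore')
    top = summed.sort_values(ascending=False).head(add_len).index.tolist()
    return list(geneset) + top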
Example #33
        os.path.join(dataset + "*" + fingerprint_file_ext))

    fingerprint_files_list = [
        (dataset + '{:01d}'.format(x) + fingerprint_file_ext)
        for x in range(len(fingerprint_file_names_list))
    ]
    scores_files_list = [(dataset + '{:01d}'.format(y) + scores_file_ext)
                         for y in range(len(fingerprint_file_names_list))]

    npz_list = []
    scores_list = []
    names_list = []
    smiles_list = []
    for batch_num in range(300):
        fingerprints = sparse.load_npz(fingerprint_files_list[batch_num])
        df = feather.read_feather(scores_files_list[batch_num])
        scores = list(df['scores'])
        smiles = list(df['smiles'])
        names = list(df['names'])
        npz_list.append(fingerprints)
        scores_list.append(scores)
        names_list.append(names)
        smiles_list.append(smiles)

    flat_sparse_fingerprints = sparse.vstack(npz_list)

    flat_scores_list = [item for sublist in scores_list for item in sublist]
    flat_names_list = [item for sublist in names_list for item in sublist]
    flat_smiles_list = [item for sublist in smiles_list for item in sublist]
    scores_array = np.array(scores_list, dtype=np.float16)
    np_scores = np.concatenate(scores_array)
Example #34
def test_file_not_exist():
    with pytest.raises(pa.ArrowIOError):
        read_feather('test_invalid_file')
def do_var(df, group_cols, counted, agg_type='float32', show_agg=True):
    if show_agg:
        print("Calculating variance of ", counted, " by ", group_cols, '...')
    gp = df.groupby(group_cols)[counted].var().reset_index().rename(
        columns={counted: 'new_var'})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    df.new_var = df.new_var.astype(agg_type)
    gc.collect()
    return df.new_var


test = True
if not test:
    train_data = pyfa.read_feather(source='data.feather', nthreads=8)
else:
    train_data = pd.read_csv('test.csv')

print('Extracting new features...')
train_data['hour'] = pd.to_datetime(
    train_data.click_time).dt.hour.astype('uint8')
train_data['day'] = pd.to_datetime(
    train_data.click_time).dt.day.astype('uint8')
train_data['minute'] = pd.to_datetime(
    train_data.click_time).dt.minute.astype('uint8')
gc.collect()

print('Calculating unique count...')
train_data['uniq_chan_by_ip'] = do_countuniq(train_data[['ip', 'channel']],
                                             ['ip'], 'channel', 'uint8')
Example #36
#   `index` varchar(255) NULL,
#   `date` varchar(255) NULL,
#   `order` varchar(255) NULL,
#   `sku` varchar(255) NULL,
#   `qty` varchar(255) NULL,
#   `order_type` varchar(255) NULL
# );
# '''
# cursor.execute(sql1)

# Initial inventory
inv = []
inventory = Inventory(inv)
remove_cold_sku_inv = []

all_orders = feather.read_feather('all_orders.feather')
all_refund = feather.read_feather('all_refund.feather')

# Iterate over each day's sales and refunds
for date in tqdm.tqdm(pd.date_range(start=startDate, end=endDate)):

    # Order lines shipped from the returns warehouse
    refund_sale = pd.DataFrame()

    # Remaining inventory detail in the returns warehouse
    refund_inv = pd.DataFrame()

    # Relocation detail for cold SKUs
    remove_cold_inv = pd.DataFrame()

    # Comparison of distinct bin counts before and after consolidation
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pyarrow.feather as pyfa
import lightgbm as lgb
import gc

train_data = pyfa.read_feather('train_data.feather')

test_data = train_data[(train_data.shape[0] - 5000000):train_data.shape[0]]
train_data = train_data[0:(train_data.shape[0] - 5000000)]
gc.collect()

target = 'is_attributed'
predictors = train_data.columns
categorical = ['app', 'device', 'os', 'channel', 'hour']

xgtrain = lgb.Dataset(train_data[predictors].values, label=train_data[target].values, feature_name=predictors,categorical_feature=categorical, free_raw_data=False)
xgtrain.save_binary('train_data.bin')
del train_data
gc.collect()

xgtest = lgb.Dataset(test_data[predictors].values, label=test_data[target].values,feature_name=predictors,categorical_feature=categorical,free_raw_data = False,reference=xgtrain)
xgtest.save_binary('test_data.bin')
del test_data
gc.collect()

lgb_params = {
    'learning_rate': 0.1,
    'boosting_type': 'gbdt',
    'objective': 'binary',