Example #1
def read_feather(path, nthreads=1):
    """
    Load a feather-format object from the file path

    .. versionadded:: 0.20.0

    Parameters
    ----------
    path : string file path, or file-like object
    nthreads : int, default 1
        Number of CPU threads to use when reading to pandas.DataFrame

        .. versionadded:: 0.21.0

    Returns
    -------
    type of object stored in file

    """

    feather = _try_import()
    path = _stringify_path(path)

    if LooseVersion(feather.__version__) < LooseVersion('0.4.0'):
        return feather.read_dataframe(path)

    return feather.read_dataframe(path, nthreads=nthreads)
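
A minimal usage sketch for the wrapper above (the file name is illustrative, and it assumes pandas with the feather-format package installed):

import pandas as pd

# write a tiny frame so the read call has something to load
pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']}).to_feather('example.feather')

# nthreads is only forwarded when feather-format >= 0.4.0 is installed
df = read_feather('example.feather', nthreads=4)
print(df)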
Example #2
    def _ft(self, tblname, dbname=None, type=None, df=None):
        if type is None:
            type = self.type
        if dbname is None:
            dbname = self.name
        if df is None:
            # return the dataframe if it exists
            df = ft.read_dataframe(
                os.path.expanduser(
                    os.path.join(cf.options.basedir, "databases", "{}.{}.{}.ft".format(type, dbname, tblname))
                )
            )
            if "idx" in df.columns.values:
                df.set_index("idx", drop=True, inplace=True)
                df.index.name = None
            return df

        else:
            if not (df.index.dtype_str == "int64") and not (df.empty):
                df = df.copy()
                df["idx"] = df.index
            ft.write_dataframe(
                df,
                os.path.expanduser(
                    os.path.join(cf.options.basedir, "databases", "{}.{}.{}.ft".format(type, dbname, tblname))
                ),
            )
            if "idx" in df.columns.values:
                del df
            return
Example #3
    def test_integer_with_nulls(self):
        # pandas requires upcast to float dtype
        path = random_path()
        self.test_files.append(path)

        int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
        num_values = 100

        writer = FeatherWriter(path)

        null_mask = np.random.randint(0, 10, size=num_values) < 3
        expected_cols = []
        for name in int_dtypes:
            values = np.random.randint(0, 100, size=num_values)
            writer.write_array(name, values, null_mask)

            expected = values.astype('f8')
            expected[null_mask] = np.nan

            expected_cols.append(expected)

        ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)),
                                columns=int_dtypes)

        writer.close()

        result = feather.read_dataframe(path)
        assert_frame_equal(result, ex_frame)
Example #4
    def test_float_nulls(self):
        num_values = 100

        path = random_path()
        self.test_files.append(path)
        writer = FeatherWriter(path)

        null_mask = np.random.randint(0, 10, size=num_values) < 3
        dtypes = ['f4', 'f8']
        expected_cols = []
        for name in dtypes:
            values = np.random.randn(num_values).astype(name)
            writer.write_array(name, values, null_mask)

            values[null_mask] = np.nan

            expected_cols.append(values)

        writer.close()

        ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)),
                                columns=dtypes)

        result = feather.read_dataframe(path)
        assert_frame_equal(result, ex_frame)
Example #5
def mergeFeathers(files, mergedFilename, writeCSV, deleteSource=True):
    data = [feather.read_dataframe(f) for f in files if not f == '']
    if len(data) > 0:
        df = pd.concat(data, sort=False, axis=0, ignore_index=True, copy=False)
    else:
        print('mergeFeathers: No files to merge!')
        return ''
    
    if writeCSV:
        df.to_csv(mergedFilename)
    else:
        try:
            feather.write_dataframe(df, mergedFilename)
        except:
            print('Error writing merged feather: Trying CSV')
            print(df.shape)
            traceback.print_exc()
            try:
                df.to_csv(mergedFilename.replace('.feather', '.csv'))
            except:
                print('Error writing merged CSV: Writing list of unmerged temp files.')
                with open(mergedFilename.replace('.feather', '.csv'), 'w') as fh:
                    for f in files:
                        fh.write(f + '\n')
                deleteSource = False
    if deleteSource:
        for f in files:
            if not f == '':
                try:
                    os.remove(f)
                except:
                    print('Could not delete merged temp file: %s' % f)
    return mergedFilename
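
A hedged usage sketch for mergeFeathers; the temp-file paths below are illustrative and assumed to have been written by earlier batch jobs:

tmp_files = ['batch01/merged_tmp_a.feather', 'batch02/merged_tmp_b.feather']
merged = mergeFeathers(tmp_files, 'all_batches.feather', writeCSV=False, deleteSource=False)
print('merged file:', merged)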
Example #6
def test_factor_rep():
    fpath1 = util.random_path()
    fpath2 = util.random_path()

    rcode = """
library(feather)

iris <- read_feather("{0}")
iris$Species <- as.factor(as.character(iris$Species))
write_feather(iris, "{1}")
""".format(fpath1, fpath2)
    tmp_paths = []

    try:
        iris = pd.read_csv('iris.csv')
        levels = ['setosa', 'versicolor', 'virginica']

        iris['Species'] = pd.Categorical(iris['Species'], categories=levels)

        feather.write_dataframe(iris, fpath1)
        util.run_rcode(rcode)

        result = feather.read_dataframe(fpath2)

        tmp_paths.extend([fpath1, fpath2])
        assert_frame_equal(result, iris)
    finally:
        util.remove_paths(tmp_paths)
Example #7
File: awj.py Project: tacaswell/awj
 def __getitem__(self, key):
     fn = self._fn_cache[key]
     ret = feather.read_dataframe(fn)
     self._heap_map[key][0] = time.time()
     # ensure the heap invariant
     heapq.heapify(self._heap)
     return ret
Example #8
def load_df(path):
    if file_format(path) != 'feather':
        return default_csv_loader(path)
    elif featherpmm and feather:
        ds = featherpmm.read_dataframe(path)
        return ds.df
    elif feather:
        return feather.read_dataframe(path)
    else:
        raise Exception('The Python feather module is not installed.\n'
                        'Use:\n    pip install feather-format\n'
                        'to add capability.\n')
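
A brief usage sketch for load_df; file_format and default_csv_loader come from the surrounding module, and the path is illustrative:

df = load_df('parsed_data/sample.feather')  # prefers featherpmm, falls back to feather
print(df.shape)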
Example #9
def main():
    path = os.path.expanduser(sys.argv[1])
    ratings_df = feather.read_dataframe(path)
    num_ratings = ratings_df.shape[0]
    ratings = np.concatenate(
        (np.array(ratings_df['user_id'], dtype=pd.Series).reshape(num_ratings, 1),
         np.array(ratings_df['item_id'], dtype=pd.Series).reshape(num_ratings, 1),
         np.array(ratings_df['rating'], dtype=pd.Series).reshape(num_ratings, 1)),
        axis=1)
    global_mean = mean(ratings[:,2])
    np.random.seed(12)
    ratings_tr, ratings_val = train_test_split(ratings, train_size=.7)
    max_iter = int(sys.argv[2])
    to_learn = sys.argv[3]
    num_users = np.unique(ratings[:,0]).shape[0]
    num_items = np.unique(ratings[:,1]).shape[0]
    if to_learn == "user_bias_lda":
        lda = learn_bias_lda(ratings_tr, 4, [2,4,6,8,10], num_users, num_items, global_mean, max_iter)
        print("Best lambda for user bias is %s" %(lda))
    elif to_learn == "item_bias_lda":
        lda = learn_bias_lda(ratings_tr, 4, [2,4,6,8,10], num_users, num_items, global_mean, max_iter, False)
        print("Best lambda for item bias is %s" %(lda))
    elif to_learn == "user_bias":
        lda = float(sys.argv[4])
        user_bias = get_user_bias(ratings_tr, ratings_val, lda, num_users, num_items, global_mean, max_iter)
        np.save("user_bias", user_bias)
    elif to_learn == "item_bias":
        lda = float(sys.argv[4])
        item_bias = get_item_bias(ratings_tr, ratings_val, lda, num_users, num_items, global_mean, max_iter)
        np.save("item_bias", item_bias)
    elif to_learn == "item_bias_fixed_user":
        lda = float(sys.argv[4])
        user_bias = np.load("user_bias.npy")
        tr, val, finalw, finalh = learn_item_bias_from_fixed_user_bias(ratings_tr, ratings_val, np.load("user_bias.npy"), num_items, lda, global_mean, max_iter)
        print("Final training RMSE %s" % (tr))
        print("Final validation RMSE %s" % (val))
        np.save("item_bias_fixed_user", finalh[1,:].reshape(num_items,))
    elif to_learn == "features":
        lda = float(sys.argv[4])
        rank = int(sys.argv[5])
        user_bias = np.load("user_bias.npy").reshape(num_users, 1)
        item_bias = np.load("item_bias.npy").reshape(1, num_items)
        W, H, reg = create_factors_with_biases(user_bias, item_bias, rank, lda)
        tr, val, finalw, finalh = mf(ratings_tr, ratings_val, W, H, reg, global_mean, max_iter, 1.0, True)
        print("Final training RMSE %s" % (tr))
        print("Final validation RMSE %s" % (val))
        np.save("final_w", finalw)
        np.save("final_h", finalh)
    elif to_learn == "features-only":
        lda = float(sys.argv[4])
        rank = int(sys.argv[5])
        W, H, reg = create_factors_without_biases(num_users, num_items, rank, lda)
        tr, val, finalw, finalh = mf(ratings_tr, ratings_val, W, H, reg, global_mean, max_iter, 1.0, True)
        print("Final training RMSE %s" % (tr))
        print("Final validation RMSE %s" % (val))
        np.save("final_w", finalw)
        np.save("final_h", finalh)
Example #10
def maybe_parse(path):
    feather_file = path + ".feather"
    if os.path.exists(feather_file):
        print("loading %s from cache" % path)
        df = feather.read_dataframe(feather_file)
        df = df.set_index("ut_ms")
        return df
    else:
        print("parsing %s" % path)
        df = parse(path)
        feather.write_dataframe(df.reset_index(), feather_file)
        return df
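
A hedged sketch of the caching pattern above; 'power.csv' is a made-up raw file and parse() is assumed to come from the surrounding project:

df = maybe_parse('power.csv')   # first call parses and writes power.csv.feather
df = maybe_parse('power.csv')   # second call is served from the feather cache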
Example #11
    def _check_pandas_roundtrip(self, df, expected=None):
        path = random_path()
        self.test_files.append(path)
        feather.write_dataframe(df, path)
        if not os.path.exists(path):
            raise Exception('file not written')

        result = feather.read_dataframe(path)
        if expected is None:
            expected = df

        assert_frame_equal(result, expected)
Example #12
def matchSamples(batchFolder, matchStr='*.feather', test=False):
    """Match each row of the metadata with each feather file (sample)
    in the batch folder"""
    mDf = pd.read_csv(opj(batchFolder, 'metadata.csv'))
    featherList = glob(opj(batchFolder, matchStr))
    if len(featherList) == 0:
        print('No feather files matching "%s" in "%s"' % (matchStr, batchFolder))
        return {}
    featherLU = {sample_name:[fn for fn in featherList if sample_name in fn] for sample_name in mDf.sample_name}
    fallback = False
    if not len(featherLU) == mDf.shape[0]:
        print('Could not match all samples in the metadata.')
        fallback = True

    L = pd.Series({k:len(v) for k,v in featherLU.items()})
    if not (L == 1).all():
        print('Some samples in metadata matched to >1 feather file:')
        for k,v in featherLU.items():
            if len(v) > 1:
                print('\t%s: %s' % (k, v[:2]))
        fallback = True

    if fallback:
        featherLU = {}
        print('Attempting to use sample order with check on total event count.')
        for i,sample_name in enumerate(mDf.sample_name):
            events = int(sample_name.split('_')[-1])
            fn = [f for f in featherList if 'gs_%d_' % (i + 1) in f][0]
            f = feather.read_dataframe(opj(batchFolder, fn))
            if events == f.shape[0]:
                featherLU.update({sample_name:fn})
                print('Matched %s to %s. (%d of %d)' % (sample_name, fn, i+1, mDf.shape[0]))
                if test and (i + 1) >= 2:
                    break
            else:
                print('Sample order strategy not working.')
                break
    else:
        featherLU = {k:v[0] for k,v in featherLU.items()}

    if not len(featherLU) == mDf.shape[0]:
        print('Could not match all samples in the metadata.')
    if test:
        out = {}
        i = 0
        for k,v in featherLU.items():
            out.update({k:v})
            i += 1
            if i >= 2:
                break
        featherLU = out
    return featherLU
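
A usage sketch, assuming the batch folder contains metadata.csv plus one feather file per sample (the folder name is illustrative):

featherLU = matchSamples('batches/batch01', matchStr='*.feather', test=True)
for sample_name, fn in featherLU.items():
    print(sample_name, '->', fn)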
Example #13
def mergeSamples(batchFolder, extractionFunc, extractionKwargs, matchStr='*.feather', test=False, metaCols=None, filters=None):
    """Go through each feather file (sample) in a batch folder,
    apply the analysis function, and merge together."""
    mDf = pd.read_csv(opj(batchFolder, 'metadata.csv'))
    featherList = glob(opj(batchFolder, matchStr))
    featherLU = matchSamples(batchFolder, matchStr=matchStr, test=test)
    
    if not metaCols is None:
        if not 'sample_name' in metaCols:
            metaCols.append('sample_name')
        mDf = mDf[metaCols]
    
    mDf = mDf.set_index('sample_name')
    feathers = []
    i = 1
    print('Extracting from batch %s (%s)' % (batchFolder, time.ctime()))
    sttime = time.time()
    for sample_name, fn in featherLU.items():
        filterOut = False
        if not filters is None:
            """Keep only samples whose meta data matches all of the filters"""
            filterOut = False
            for col, valList in filters.items():
                if not mDf.loc[sample_name, col] in valList:
                    filterOut = True
                    break
        if not filterOut:
            f = feather.read_dataframe(fn)
            # print('Extracting from sample %s (%d of %d)' % (sample_name, i, len(featherLU)))
            try:
                x = extractionFunc(f, **extractionKwargs)
                x.loc[:, 'sample_name'] = sample_name
            except:
                print('Error extracting from batch %s, sample %s (%d)' % (batchFolder, sample_name, i))
                print(x.shape)
                print(x.head())
                traceback.print_exc()
            feathers.append(x)
        i += 1
    if len(feathers) > 0:
        outDf = pd.merge(pd.concat(feathers, axis=0), mDf.reset_index(), how='left', left_on='sample_name', right_on='sample_name')
        print('Finished batch %s (%1.0f minutes)' % (batchFolder, (time.time() - sttime) / 60), flush=True)

        """Write to a temporary merge file and return filename"""
        with tempfile.NamedTemporaryFile(mode='w', suffix='.feather', prefix='merged_tmp_', dir=batchFolder, delete=False) as fh:
            tmpFilename = fh.name
        feather.write_dataframe(outDf, tmpFilename)
    else:
        tmpFilename = ''
    return tmpFilename
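
A hedged sketch of driving mergeSamples with a hypothetical extraction function; countEvents is illustrative and not part of the source:

def countEvents(df):
    # toy extraction: one row per sample holding its event count
    return pd.DataFrame({'n_events': [df.shape[0]]})

tmpFilename = mergeSamples('batches/batch01', countEvents, {}, matchStr='*.feather', test=True)
print('merged temp file:', tmpFilename)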
Example #14
File: plot.py Project: jni/prin
def main(argv):
    args = _argument_parser().parse_args(argv)
    if args.data_frame is not None and os.path.exists(args.data_frame):
        df = feather.read_dataframe(args.data_frame)
    else:
        from . import parsers
        parser = getattr(parsers, args.format).parser
        print('reading network data')
        network = parser(args.datafile, max_num_nodes=args.max_num_nodes)
        print('extracting data')
        df = network_properties(network,
                                in_degree_threshold=args.in_degree_threshold,
                                pagerank_threshold=args.pagerank_threshold,
                                damping=args.damping)
    if args.data_frame is not None:
        feather.write_dataframe(df, args.data_frame)
    print('preparing plots')
    bokeh_plot(df, output=args.output_file, loglog=args.loglog)
Example #15
def read_feather(path):
    """
    Load a feather-format object from the file path

    .. versionadded:: 0.20.0

    Parameters
    ----------
    path : string
        File path

    Returns
    -------
    type of object stored in file

    """

    feather = _try_import()
    return feather.read_dataframe(path)
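
As with Example #1, a minimal call sketch for this simpler wrapper (the file name is illustrative):

df = read_feather('example.feather')
print(df.head())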
Example #16
    def _check_pandas_roundtrip(self, df, expected=None, path=None, columns=None, null_counts=None):
        if path is None:
            path = random_path()

        self.test_files.append(path)
        feather.write_dataframe(df, path)
        if not os.path.exists(path):
            raise Exception("file not written")

        result = feather.read_dataframe(path, columns)
        if expected is None:
            expected = df

        assert_frame_equal(result, expected)

        if null_counts is None:
            null_counts = np.zeros(len(expected.columns))

        np.testing.assert_array_equal(self._get_null_counts(path, columns), null_counts)
Example #17
    def test_boolean_nulls(self):
        # pandas requires upcast to object dtype
        path = random_path()
        self.test_files.append(path)

        num_values = 100
        np.random.seed(0)

        writer = FeatherWriter(path)

        mask = np.random.randint(0, 10, size=num_values) < 3
        values = np.random.randint(0, 10, size=num_values) < 5
        writer.write_array('bools', values, mask)

        expected = values.astype(object)
        expected[mask] = None

        writer.close()

        ex_frame = pd.DataFrame({'bools': expected})

        result = feather.read_dataframe(path)
        assert_frame_equal(result, ex_frame)
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 13 15:50:38 2016

@author: Mia
"""

import pandas as pd    
import feather



# Read data file into a python array
review_df = feather.read_dataframe('../parsed_data/filtered_review_data.feather')

review_grouped = review_df.groupby(['city'], sort = True).count()
review_cities = review_grouped.sort_values('text',ascending=False)

# Read data file into a python array
tip_df = feather.read_dataframe('../parsed_data/filtered_tip_data.feather')

tip_grouped = tip_df.groupby(['city'], sort = True).count()
tip_cities = tip_grouped.sort_values('text',ascending=False)
Example #19
# checking mtv features data
'''
print('Checking mtv features data...')
ok = True
for f in features:
    for name in ['0', '1', 'test', 'rank_0', 'rank_1', 'rank_test']:
        filename = 'features/mtv/%s_pred_%s.npy' % (f, name)
        if not os.path.isfile(filename):
            print('  + Missing %s!' % filename)
            ok = False

if not ok: sys.exit(1)
'''

df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather')
df_test = feather.read_dataframe('tmp/clicks_test.feather')

df_train_0 = df_all[df_all.fold == 0].reset_index(drop=1)
df_train_1 = df_all[df_all.fold == 1].reset_index(drop=1)
del df_train_0['fold'], df_train_1['fold'], df_all
gc.collect()

# training a small model to select best features
# first, load the data

df_train = df_train_0[:2000000].copy()
df_val = df_train_1[:1000000].copy()

for f in features:
    print('loading data for %s...' % f)
import feather
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns


app_trn = feather.read_dataframe('../input/application_train.feather')


def target_bar(col, title):

    df0 = app_trn[app_trn["TARGET"] == 0]
    df1 = app_trn[app_trn["TARGET"] == 1]

    t0 = df0[col].value_counts().rename(col + '0')
    t1 = df1[col].value_counts().rename(col + '1')
    t = pd.concat([t0, t1], axis=1).fillna(0).astype(int)
    t['total'] = t.sum(axis=1)
    t.sort_values('total', inplace=True, ascending=False)
    t.drop(columns=['total'], inplace=True)

    idx = np.arange(len(t))
    width = 0.35

    fig, ax = plt.subplots(figsize=(12, 6))

    ax.bar(idx, t[col + '0'], width)
    ax.bar(idx+width, t[col + '1'], width)

    ax.set_title('Scores by group and gender')
Example #21
import os

import pandas as pd
import feather

#Convert ohio voter data from csv to feather dataframe
data_path = '/Volumes/FileStorage/Insight_data/Ohio_data/data_csv/'
output_path = '/Volumes/FileStorage/Insight_data/Ohio_data/data_feather/'
oh_f_1 = 'SWVF_1_22'
oh_f_2 = 'SWVF_23_44'
oh_f_3 = 'SWVF_45_66'
oh_f_4 = 'SWVF_67_88'

#save2feather(data_path,oh_f_1,output_path)
#save2feather(data_path,oh_f_2,output_path)
#save2feather(data_path,oh_f_3,output_path)
#save2feather(data_path,oh_f_4,output_path)

df_oh1 = feather.read_dataframe(output_path + oh_f_1 + '.feather')
df_oh2 = feather.read_dataframe(output_path + oh_f_2 + '.feather')
df_oh3 = feather.read_dataframe(output_path + oh_f_3 + '.feather')
df_oh4 = feather.read_dataframe(output_path + oh_f_4 + '.feather')
oh_df = pd.concat([df_oh1, df_oh2, df_oh3, df_oh4])

#oh_df = feather.read_dataframe(output_path+oh_f_1+'.feather')

#sub_df = df.iloc[:150]
#feather.write_dataframe(sub_df,output_path+'subset_oh.feather')

#RESIDENTIAL_ADDRESS1RESIDENTIAL_SECONDARY_ADDRRESIDENTIAL_CITYRESIDENTIAL_STATERESIDENTIAL_ZIP
oh_unique = oh_df.drop_duplicates(subset=[
    'RESIDENTIAL_ADDRESS1', 'RESIDENTIAL_CITY', 'RESIDENTIAL_STATE',
    'RESIDENTIAL_ZIP'
])
Example #22
import csv

import pandas as pd
import feather

# reading the leaked documents

docs_size = {}
leak_uuid_dict = {}

with open("tmp/leaked_docs.csv") as f:
    reader = csv.DictReader(f)
    leak_uuid_dict = {}

    for row in reader:
        doc_id = int(row['document_id'])
        uuids = row['uuids'].split(' ')
        leak_uuid_dict[doc_id] = set(uuids)
        docs_size[doc_id] = len(uuids)

df_all = feather.read_dataframe('tmp/clicks_train_50_50.feather')
df_test = feather.read_dataframe('tmp/clicks_test.feather')

# getting user ids and document ids

df_events = pd.read_csv('data/events.csv.zip', usecols=['uuid'])
df_ads = pd.read_csv('data/promoted_content.csv.zip',
                     usecols=['ad_id', 'document_id'])

# joining doc_id and ad_id

ad_to_idx = dict(zip(df_ads.ad_id, df_ads.index))

ad_idx = df_all.ad_id.apply(ad_to_idx.get)
ad_document_id = df_ads.document_id.iloc[ad_idx].reset_index(drop=1)
df_all['ad_document_id'] = ad_document_id
import os

import pandas as pd
import feather

os.getcwd()
fp = os.getcwd().replace("feature_eng", "")

train = feather.read_dataframe(fp + "data/train.feather")

df = pd.pivot_table(
    train,
    values="Demanda_uni_equil",
    index=[
        "Cliente_ID",
        "Producto_ID",
        "Agencia_ID",
        "Canal_ID",
        "Ruta_SAK",
        "Venta_uni_hoy",
        "Venta_hoy",
        "Dev_uni_proxima",
        "Dev_proxima",
    ],
    columns="Semana",
)

df = df.reset_index()

feather.write_dataframe(df, fp + "data/week_split_train.feather")
Example #24
#!/usr/bin/env python

import pandas as pd
import numpy as np
import random
import pylab as pl
from scipy import optimize
import time
import feather
import csv

df = feather.read_dataframe('../data/seguimiento_audiencias_val.feather')

# Manually fix two small errors
df[['erd3_litigios',
    'era3_litigios']] = df[['erd3_litigios',
                            'era3_litigios']].apply(pd.to_numeric,
                                                    errors='coerce')
df['id'] = df['junta'].map(str) + '_' + df['expediente'].map(
    str) + '_' + df['anio'].map(str)


# Create two validation functions. For each variable, validate it and, if it
# does not meet the criterion, tag it with the name of the variable that failed.
def valida_cat(var, rango):
    df[var][(~df[var].isin(rango)) & (~pd.isnull(df[var]))] = var + '_rango'
    return [df[var]]


def valida_na(var):
    df[var][pd.isnull(df[var])] = var + '_na'
try:
    freqs = np.array( p['freqs'] )/3600
except KeyError:
    T = (3600*23)  if (p['time_stop'] > 3600*24)  else p['time_stop']
    ff = 1/T
    fn = 1/(2*p['time_step']) if( 1/(2*p['time_step']) < (4/3600) ) else (4/3600)
    freqs = np.arange(ff,fn,ff)


"""
Wavenumber range is determined from frequency range using the
frequency range
    -Compute wave directions from dk resolution from nyquist frequency
"""
depth = np.linspace(0,p['depth_end'],p['depth_res'])
N2    = feather.read_dataframe(p['envfile'])['strat']
K = []
for i in range(len(freqs)):
    iwm = InternalWaveModes(depth,N2,freq=freqs[i])
    K.append([iwm.get_hwavenumber(m) for m in p['modes']] )

K = np.array(K).flatten()
print(K)
p['K'] = list(K.real)


#Fundamental wavenumber
dk = K[0].real

headings = []
for kmag in K:
Example #26
jobs_red = []
pq_red = mp.Queue()

jobs_blue = []
pq_blue = mp.Queue()

# MC calculation
for i in range(Nbins):
# for i in range(1):
    # simulated data
    inpathSim_whole = inDirSim + "SimCatSelec_tomo" + str(i+1) +'.feather'
    inpathSim_red = inDirSim + "SimCatSelec_tomo" + str(i+1) +'_TB9_in_less3.feather'
    inpathSim_blue = inDirSim + "SimCatSelec_tomo" + str(i+1) +'_TB9_in_greater3.feather'
    #
    dataSim_whole = feather.read_dataframe(inpathSim_whole)
    dataSim_red = feather.read_dataframe(inpathSim_red)
    dataSim_blue = feather.read_dataframe(inpathSim_blue)
    
    # real data
    inpathReal_whole = inDirReal + 'tomo/all_tomo' + str(i+1) +'.feather'
    inpathReal_red = inDirReal + 'split/all_tomo' + str(i+1) +'_T_B_less3.feather'
    inpathReal_blue = inDirReal + 'split/all_tomo' + str(i+1) +'_T_B_greater3.feather'
    #        
    dataReal_whole = feather.read_dataframe(inpathReal_whole)
    dataReal_red = feather.read_dataframe(inpathReal_red)
    dataReal_blue = feather.read_dataframe(inpathReal_blue)

    p_whole = mp.Process(target=mCalFunc, args=(i+1, dataSim_whole, dataReal_whole, Nbin1, Nbin2, pq_whole))
    p_red = mp.Process(target=mCalFunc, args=(i+1, dataSim_red, dataReal_red, Nbin1, Nbin2, pq_red))
    p_blue = mp.Process(target=mCalFunc, args=(i+1, dataSim_blue, dataReal_blue, Nbin1, Nbin2, pq_blue))
Example #27
    def __init__(self,
                 name,
                 comment=None,
                 remove_columns=None,
                 param=None,
                 xgb_seed=None,
                 n_estimators=1000,
                 log=None,
                 predict_feats=False,
                 debug=True):
        self.name = name
        self.comment = comment

        if log is None:
            self.logfile = open('../output/log/{}.txt'.format(name), 'w')
        else:
            self.logfile = open('../output/log/{}.txt'.format(log), 'w')

        if param is None:
            self.param = {
                'objective': 'reg:linear',
                'metric': 'rmse',
                'booster': 'gbtree',
                'learning_rate': 0.02,
                'max_depth': 22,
                'min_child_weight': 57,
                'gamma': 1.45,
                'alpha': 0.0,
                'lambda': 0.0,
                'subsample': 0.67,
                'colsample_bytree': 0.054,
                'colsample_bylevel': 0.50,
                'n_jobs': -1,
                'random_state': 456,
                'seed': 6
                #'verbose': 100,
            }
        else:
            self.param = param

        if xgb_seed is not None:
            self.param['seed'] = xgb_seed
        self.param['n_estimators'] = n_estimators
        self.feature_importance_df = None
        self.regressors = []

        self.x = feather.read_dataframe(BASE_X_PATH)
        self.trn_preds_feats = np.load(TRN_PRED_FEATS)
        self.tes_preds_feats = np.load(TES_PRED_FEATS)

        if remove_columns is not None:
            drop_features = [
                _f for _f in self.x.columns if _f in remove_columns
            ]
            self.x.drop(drop_features, axis=1, inplace=True)
            del drop_features
            gc.collect()

        #read & prepare datasets
        print('read & prepare datasets shape: {}'.format(self.x.shape))

        #split train & test sets
        self.x_train, self.y_train, self.x_test, self.y_train_ag = prep_and_split(
            self.x)

        #debug
        if debug:
            x_train_s = self.x_train.sample(frac=0.3)
            x_test_s = self.x_test.sample(frac=0.3)
            y_train_s = self.y_train_ag.loc[self.y_train_ag.index.isin(
                x_train_s.index)]
        else:
            x_train_s = self.x_train.sample(frac=1)
            x_test_s = self.x_test.sample(frac=1)
            y_train_s = self.y_train_ag.loc[self.y_train_ag.index.isin(
                x_train_s.index)]

        if predict_feats:
            self.x_train, trn_feats = add_pred_feats(x_train_s,
                                                     self.trn_preds_feats,
                                                     None)
            self.x_test, _ = add_pred_feats(x_test_s, self.tes_preds_feats,
                                            trn_feats)
            self.y_train = y_train_s.groupby('fullVisitorId').sum()
            del x_train_s, x_test_s, y_train_s
            gc.collect()
        else:
            self.x_train.reset_index(drop=True, inplace=True)
            self.x_test.reset_index(drop=True, inplace=True)
Example #28
# In[1]:

import pandas as pd
import numpy as np
import feather

from tqdm import tqdm

# In[2]:

from outliers import remove_outliers

# In[3]:

df_pays = feather.read_dataframe('data/df_pays_na_test.feather')

# In[4]:

shops = df_pays.shop_id.unique()
shops = sorted(shops)

# In[5]:

from fbprophet import Prophet

# In[7]:


def add_prophet_features(df_shop):
    df = df_shop[['day', 'pays_count']].rename(columns={
"""

import os
os.chdir('D:/yh_min-mfactors')
from alphaFuncs_min_240 import *
from address_data import *
import pandas as pd
import feather as ft

# NOTE: the factor results do not include '600485.SH' (trading suspended)

# First verify the time coverage of the minute-bar data (minute and daily ranges)
files = os.listdir(add_min_file)
res = []
for filename in files:
    df = ft.read_dataframe(add_min_file+filename, nthreads=100)
    ls = list(df['date'])
    res = list(set(res+ls))  # every stock's time series is complete: 468,480 minute bars in total
datetimel = sorted(res)[408000:]  # from 2017 onward: 60,480 minute bars, each stock's series still complete
datel = list(set(map(lambda x: x[:10], datetimel)))  # 252 daily records, 20170103-20180115


# CSI 300 index constituents
code_HS300 = pd.read_excel(add_gene_file + 'data_mkt.xlsx',sheetname='HS300')
stockList = list(code_HS300['code'][:])

# Minute bars: from 2017-01-03 09:31:00 to 2018-01-15 15:00:00
dateList = open(add_mintime_SerialFile).read().split('\n')
alpha_all(stockList, dateList, savepath=add_alpha_min_expand_file)

# -*- coding: utf-8 -*-
"""
Created on Sun May 29 21:34:10 2016

@author: mariaathena
"""

# Prepare environment and load data ------------------------------------------
import pandas as pd
import numpy as np
import feather


cosim_df = feather.read_dataframe('../parsed_data/event_cosine_sim.feather')


# Modify data for easy visualisation -----------------------------------------

# Set cos_sim below certain threshold equal to zero
# threshold for each topic == topic's 75th percentile cosine sim
cosim_df2 = cosim_df.copy()
# cosim_df2.ix[:,3:] = cosim_df2.ix[:,3:].applymap(lambda x: round(x, 2) if x > 0.01 else 0)
cosim_df2.benghazi = cosim_df2.benghazi.apply(lambda x: round(x, 3) if x > np.percentile(cosim_df2.benghazi, 75) else 0)
cosim_df2.wiki_leak = cosim_df2.wiki_leak.apply(lambda x: round(x, 3) if x > np.percentile(cosim_df2.wiki_leak, 75) else 0)
cosim_df2.doctrine = cosim_df2.doctrine.apply(lambda x: round(x, 3) if x > np.percentile(cosim_df2.doctrine, 75) else 0)
cosim_df2.arab_spring = cosim_df2.arab_spring.apply(lambda x: round(x, 3) if x > np.percentile(cosim_df2.arab_spring, 75) else 0)
cosim_df2.russian_reset = cosim_df2.russian_reset.apply(lambda x: round(x, 3) if x > np.percentile(cosim_df2.russian_reset, 75) else 0)
cosim_df2.cancer = cosim_df2.cancer.apply(lambda x: round(x, 3) if x > np.percentile(cosim_df2.cancer, 75) else 0)


## Set emails topic == event with the highest cosine similarity to
Example #31
out_dict = {
    "model_vars_": model_files,
    "com_formed_": com_files,  #"agent_vars_":ag_files
}

# Loop through container dictionaries
for key, val in out_dict.items():
    for input_file in glob.glob(join(data_dir, '*' + key + '.feather')):

        # Create a label from the name of the experiment that created the data
        label = "_".join(input_file.split("_")[7:8])
        print(label)

        # Read data and store it in the container dictionary
        with open(input_file, "r") as mydata:
            val[label] = feather.read_dataframe(input_file)

# List of parameters included in the scenario labels
cal_pars = [
    "w_econ", "w_swn", "w_att", "w_subplot", "threshold", "reduction",
    "awareness_mean", "awareness_stdev", "awareness_minergie"
]

pars_d = {"cal_label": cal_pars}

# Rename the unnamed index column in the model files to "sim_year"
for key, df in out_dict["model_vars_"].items():
    df.rename(columns={"Unnamed: 0": 'sim_year'}, inplace=True)

# Put all data frames into one
model_df = pd.concat(out_dict["model_vars_"])
Example #32

import numpy as np
import pandas as pd
import feather
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adadelta
from itertools import product
import datetime


# In[8]:


train=feather.read_dataframe("../data/train_set.feather")
test=feather.read_dataframe("../data/test_set.feather")
result=test[['id']].copy()
train_label=train['class'].values
lb = LabelEncoder()
lb.fit(train['class'].values)


# In[9]:


nb_classes =19
dims = x_train.shape[1]
epochs = 15
# parameter grids
param_grid = [
Example #33
### Importing required packages

import pandas as pd
import feather
from pyproj import Proj, transform
import calendar as cldr
from geopy.geocoders import Nominatim
import datetime
import numpy as np

pd.options.display.max_rows = 999
pd.options.display.max_columns = 999


# Importing feather datasets
FL_df = feather.read_dataframe(r"feather_files\FL_raw.feather")
VA_df = feather.read_dataframe(r"feather_files\VA_raw.feather")
PA_df = feather.read_dataframe(r"feather_files\PA_raw.feather")
OR_df = feather.read_dataframe(r"feather_files\OR_raw.feather")
OR_loc_df = feather.read_dataframe(r"feather_files\OR_locations_raw.feather")
NJ_df = feather.read_dataframe(r"feather_files\NJ_raw.feather")
NJ_loc_df = feather.read_dataframe(r"feather_files\NJ_locations_raw.feather")
MD_df = feather.read_dataframe(r"feather_files\MD_raw.feather")
ID_df = feather.read_dataframe(r"feather_files\ID_raw.feather")


# Importing data Scott Worland compiled for PA, VA, and FL 
PA_VA_FL_df = pd.read_csv("PA_VA_FL\public_supply_data_pa_va_fl_rev2.csv")


# Defining 'uid' column as a string to export to feather
Example #34
def import_data(data_path,
                use_pandas=False,
                intercept=True,
                valid_fraction=0.2,
                classification=True):
    """Import Data for H2O GPU Edition

    This function will read in data and prepare it for H2O4GPU's GLM solver.

    Note, the data is assumed to be all numeric, i.e.,
    categoricals are one hot encoded, etc.

    :param data_path : str
                 A path to a dataset (The dataset needs to be all numeric)
    :param use_pandas : bool
                  Indicate if Pandas should be used to parse
    :param intercept : bool
                  Indicate if intercept term is needed
    :param valid_fraction : float
                      Percentage of dataset reserved for a validation set
    :param classification : bool
                      Classification problem?
    :returns
    If valid_fraction > 0 it will return the following:
        train_x: numpy array of train input variables
        train_y: numpy array of y variable
        valid_x: numpy array of valid input variables
        valid_y: numpy array of valid y variable
        family : string that would either be "logistic" if classification is set
            to True, otherwise "elasticnet"
    If valid_fraction == 0 it will return the following:
        train_x: numpy array of train input variables
        train_y: numpy array of y variable
        family : string that would either be "logistic" if classification is set
            to True, otherwise "elasticnet"
    """
    #Can import data using pandas or feather.
    use_pandas = use_pandas

    data_file = data_path  # If importing using pandas

    if use_pandas:
        print("Reading Data with Pandas")
        data = pd.read_csv(data_file)
    else:
        print("Reading Data with Feather")
        data = feather.read_dataframe(data_file)
    print(data.shape)
    data_x = np.array(
        data.iloc[:, :data.shape[1] - 1],
        dtype='float32',
        order='C',
        copy=False)
    data_y = np.array(
        data.iloc[:, data.shape[1] - 1], dtype='float32', order='C', copy=False)

    #Setup train / validation set split
    #(assuming form of mxn where m = row count and n = col count)
    morig = data_x.shape[0]
    norig = data_x.shape[1]
    print("Original m=%d n=%d" % (morig, norig))
    sys.stdout.flush()

    #Do train / valid split
    if valid_fraction > 0:
        valid_fraction = valid_fraction
        HO = int(valid_fraction * morig)
        H = morig - HO
        print("Size of Train rows=%d & valid rows=%d" % (H, HO))
        sys.stdout.flush()
        train_x = data_x[0:H, :]
        train_y = data_y[0:H]
        valid_x = data_x[H:morig, :]
        valid_y = data_y[H:morig]
        print("Size of Train cols=%d valid cols=%d" % (train_x.shape[1],
                                                       valid_x.shape[1]))
    else:
        train_x = data_x
        train_y = data_y


    # Using intercept
    if intercept:
        train_x = np.hstack(
            [train_x,
             np.ones((train_x.shape[0], 1), dtype=train_x.dtype)])
        if valid_fraction > 0:
            valid_x = np.hstack(
                [valid_x,
                 np.ones((valid_x.shape[0], 1), dtype=valid_x.dtype)])
            print("Size of Train cols=%d & valid cols=%d after adding "
                  "intercept column" % (train_x.shape[1], valid_x.shape[1]))
        else:
            print("Size of Train cols=%d after adding intercept column" %
                  (train_x.shape[1]))

    if classification:
        family = "logistic"
    else:
        family = "elasticnet"
    if valid_fraction > 0:
        return train_x, train_y, valid_x, valid_y, family

    return train_x, train_y, family
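
A hedged usage sketch for import_data; the path is illustrative and assumes an all-numeric CSV with the target in the last column:

train_x, train_y, valid_x, valid_y, family = import_data(
    'data/creditcard.csv', use_pandas=True, intercept=True,
    valid_fraction=0.2, classification=True)
print(train_x.shape, valid_x.shape, family)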
Example #35
import os
import pandas as pd
import feather
import numpy as np
import pickle
from sklearn.impute import SimpleImputer

path_wd = os.getenv("PATH_WD")
path_input_data = os.getenv("PATH_INPUT_DATA")
path_output_data = os.getenv("PATH_OUTPUT_DATA")
path_output_artifacts = os.getenv("PATH_OUTPUT_ARTIFACTS")
path_airlines = os.path.join(path_input_data,
                             "airlines_small_target_selected.feather")

pd_airlines = feather.read_dataframe(path_airlines)

list_num = [
    line.rstrip('\n') for line in open(
        os.path.join(path_input_data, "airlines_impute_num_vars.txt"), "r")
]
list_cat = [
    line.rstrip('\n') for line in open(
        os.path.join(path_input_data, "airlines_impute_cat_vars.txt"), "r")
]

imp_mean = SimpleImputer(missing_values=np.nan, strategy="mean")
np_airlines_num = imp_mean.fit_transform(pd_airlines[list_num])
pd_airlines_num = pd.DataFrame(np_airlines_num)
pd_airlines_num.columns = list_num
print(pd_airlines_num.head())
print(pd_airlines_num.isnull().sum(axis=0))
Example #36
def backtest(y_pred, y_pred_prob, model_name):

    process = False
    dir = '../data/Basketball/Team/gamelog/'
    odds_data_path = '../data/scraped_odds_data.csv'

    teams = [
        'ATL', 'BOS', 'BRK', 'CHI', 'CHO', 'CLE', 'DAL', 'DEN', 'DET', 'GSW',
        'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOP', 'NYK',
        'OKC', 'ORL', 'PHI', 'PHO', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS'
    ]

    mapping = {
        'Philadelphia 76ers': 'PHI',
        'Denver Nuggets': 'DEN',
        'Golden State Warriors': 'GSW',
        'Milwaukee Bucks': 'MIL',
        'Toronto Raptors': 'TOR',
        'Los Angeles Clippers': 'LAC',
        'San Antonio Spurs': 'SAS',
        'Houston Rockets': 'HOU',
        'Portland Trail Blazers': 'POR',
        'Utah Jazz': 'UTA',
        'Detroit Pistons': 'DET',
        'Oklahoma City Thunder': 'OKC',
        'Orlando Magic': 'ORL',
        'Indiana Pacers': 'IND',
        'Brooklyn Nets': 'BRK',
        'Boston Celtics': 'BOS',
        'Charlotte Hornets': 'CHO',
        'Los Angeles Lakers': 'LAL',
        'Sacramento Kings': 'SAC',
        'Phoenix Suns': 'PHO',
        'Dallas Mavericks': 'DAL',
        'New Orleans Pelicans': 'NOP',
        'Atlanta Hawks': 'ATL',
        'Miami Heat': 'MIA',
        'Washington Wizards': 'WAS',
        'Minnesota Timberwolves': 'MIN',
        'New York Knicks': 'NYK',
        'Chicago Bulls': 'CHI',
        'Memphis Grizzlies': 'MEM',
        'Cleveland Cavaliers': 'CLE',
    }

    X_test = feather.read_dataframe('../data/X_test_df.feather')
    df = X_test
    y_test = feather.read_dataframe('../data/y_test_df.feather')
    X_test['predictions'] = y_pred
    X_test['pred_prob'] = y_pred_prob
    dodd = pd.read_csv('../data/scraped_odds_data.csv', header=0)
    monthmap = {
        'Jan': 1,
        'Feb': 2,
        'Mar': 3,
        'Apr': 4,
        'May': 5,
        'Jun': 6,
        'Jul': 7,
        'Aug': 8,
        'Sep': 9,
        'Oct': 10,
        'Nov': 11,
        'Dec': 12
    }

    odds_data = baseline.process_odds_data(
        dodd, X_test[[
            'Date', 'Location', 'Home', 'Away', 'target', 'predictions',
            'unique_id', 'W/L', 'Tm_score', 'Opp_score'
        ]])

    test_df = X_test[[
        'Date', 'Location', 'Home', 'Away', 'target', 'pred_prob',
        'predictions', 'unique_id', 'W/L', 'Tm_score', 'Opp_score'
    ]].copy()

    odds_data = odds_data.assign(
        unique_id=lambda x: x['team1'] + '_' + x['team2'])

    test_df = test_df.assign(home_score=lambda x: x['Tm_score'],
                             away_score=lambda x: x['Opp_score'])
    test_df.loc[(test_df['Location'] == 'Away'),
                'home_score'] = test_df.loc[(test_df['Location'] == 'Away'),
                                            'Opp_score']
    test_df.loc[(test_df['Location'] == 'Away'),
                'away_score'] = test_df.loc[(test_df['Location'] == 'Away'),
                                            'Tm_score']

    test_df['home_score'] = test_df['home_score'].astype(int).astype(str)
    test_df['away_score'] = test_df['away_score'].astype(int).astype(str)
    test_df = test_df.assign(
        score=lambda x: x['home_score'] + ':' + x['away_score'])

    odds_data['new_score'] = odds_data['score'].apply(
        lambda x: x.replace('OT', ''))

    df_rets = test_df.merge(odds_data,
                            how='left',
                            left_on=['unique_id', 'score'],
                            right_on=['unique_id', 'new_score'])

    df_rets_drop = df_rets.dropna(subset=['odds1', 'odds2'])

    # df_rets_drop = df_rets_drop.loc[(df_rets_drop['odds1'] > -500)
    #                                 & (df_rets_drop['odds2'] > -500)
    #                                 & (df_rets_drop['odds1'] != 0)
    #                                 & (df_rets_drop['odds2'] != 0), :].reset_index(drop = True)

    # df_rets_drop = df_rets_drop.loc[(df_rets_drop['pred_prob'] > 0.8) | (df_rets_drop['pred_prob'] < 0.2), :].reset_index(drop = True)

    mark = 0
    profit = []
    date = []
    for i in df_rets_drop.index:
        if (df_rets_drop.loc[i]['predictions']
                == True) and (df_rets_drop.loc[i]['pred_prob'] >
                              0.7) and (df_rets_drop.loc[i]['odds1'] > -100):
            # if (df_rets_drop.loc[i]['predictions'] == True):
            mark += 100
            if df_rets_drop.loc[i]['target'] == True:
                if df_rets_drop.loc[i]['odds1'] < 0:
                    earned = -100 * 100 / df_rets_drop.loc[i]['odds1']
                    # profit += -100 * 100 / df_rets_drop.loc[i]['odds1']
                else:
                    earned = df_rets_drop.loc[i]['odds1']
            else:
                earned = -100
                # profit -= 100
            date.append(df_rets_drop.loc[i]['Date'])
            profit.append(earned)
        if (df_rets_drop.loc[i]['predictions']
                == False) and (df_rets_drop.loc[i]['pred_prob'] <
                               0.3) and (df_rets_drop.loc[i]['odds2'] > -100):
            # if (df_rets_drop.loc[i]['predictions'] == False):
            mark += 100
            if df_rets_drop.loc[i]['target'] == True:
                earned = -100
                # profit -= 100
            else:
                if df_rets.loc[i]['odds2'] < 0:
                    earned = -100 * 100 / df_rets_drop.loc[i]['odds2']
                    # profit += -100* 100 / df_rets_drop.loc[i]['odds2']
                else:
                    earned = df_rets_drop.loc[i]['odds2']
            date.append(df_rets_drop.loc[i]['Date'])
            profit.append(earned)

    profit = np.array(profit)
    cum_profit = np.array(profit).cumsum()

    print('Model = {} : Total Profit Is {}'.format(model_name, profit.sum()))

    result = pd.DataFrame({'profit': cum_profit}, index=date)

    result.plot(y='profit',
                title='Cumulative Profit From $100',
                figsize=(8, 8))
    plt.savefig('../output/{}_prob_0.3_0.7.jpg'.format(model_name),
                format='jpg')
    plt.show()

    return profit, result
Example #37
    amount_cumsum = sort_valid_data['amount'].cumsum()
    all_amount_cumsum = amount_cumsum.values[-1]
    ax2.plot(ax.get_xticks(), amount_cumsum.values/all_amount_cumsum*100, c = 'g')

    # ax4 = ax3.twinx()
    ax2.plot(ax2.get_xticks(), [trade_cum_ratio*100]*len(ax.get_xticks()), c = 'y')
    x_ids = np.where((amount_cumsum.values/all_amount_cumsum) <= trade_cum_ratio)[0]
    ax2.plot([ax2.get_xticks()[x_ids[-1]]]*2, [0,100], 'm--')
    ax.grid(True)
    ax2.grid(True)
    plt.show()

if __name__ == '__main__':
    # read stock data
    path = './SH600000.feather'
    df = feather.read_dataframe(path)
    # feather.write_dataframe(df, output_path)

    start_date = "2016-09-30 14:30"
    end_date = "2016-09-30 15:00"
    valid_data = extract_valid_data_range(df, start_date, end_date)
    S = calculate_S(valid_data)

    # sort data by "S" -- descending
    S_sort_ids = np.argsort(S.values)
    S_arr = S.values[S_sort_ids[::-1]]
    sort_valid_data = valid_data.iloc[S_sort_ids[::-1]]
    # visualize Q
    visualize_S(sort_valid_data)

    Q = calculate_Q(valid_data, S)
Example #38
df = df.merge(bwd, 'left', ['Date', 'Store'], suffixes=['', '_bw'])
df = df.merge(fwd, 'left', ['Date', 'Store'], suffixes=['', '_fw'])

df.columns.is_unique  #True
len(df.columns)  #17
df.drop(columns, 1, inplace=True)  # drop these columns (along the column axis)
len(df.columns)  #14

# large intermediate results are best saved to disk
df.to_feather('{}df'.format(PATH))

#temp = df
#df = pd.read_feather('{}df'.format(PATH))  # pd.read_feather failed here, so use feather.read_dataframe instead
import feather

df = feather.read_dataframe('{}df'.format(PATH))

type(df.Date)  # after reading back from the file, this is a Series
df['Date'] = pd.to_datetime(df.Date)
type(df.Date)

joined = join_df(joined, df, ['Store', 'Date'])
joined_test = join_df(joined_test, df, ['Store', 'Date'])

# reset the index again after removing some rows
joined.reset_index(inplace=True)
joined_test.reset_index(inplace=True)
joined[:10]

# save again
joined.to_feather('{}joined2'.format(PATH))
Example #39
def main():
    ''' Run ARD NMF'''
    torch.multiprocessing.set_start_method('spawn')

    parser = argparse.ArgumentParser(
        description=
        'NMF with some sparsity penalty described https://arxiv.org/pdf/1111.6085.pdf'
    )
    parser.add_argument('--data', help='Data Matrix', required=True)
    parser.add_argument('--feather',
                        help='Input in feather format',
                        required=False,
                        default=False,
                        action='store_true')
    parser.add_argument('--parquet',
                        help='Input in parquet format',
                        required=False,
                        default=False,
                        action='store_true')
    parser.add_argument('--K0',
                        help='Initial K parameter',
                        required=False,
                        default=None,
                        type=int)
    parser.add_argument('--max_iter',
                        help='maximum iterations',
                        required=False,
                        default=10000,
                        type=int)
    parser.add_argument('--del_',
                        help='Early stop condition based on lambda change',
                        required=False,
                        default=1,
                        type=int)
    parser.add_argument('--tolerance',
                        help='Early stop condition based on max lambda entry',
                        required=False,
                        default=1e-6,
                        type=float)
    parser.add_argument(
        '--phi',
        help='dispersion parameter see paper for discussion of choosing phi '
        'default = 1',
        required=False,
        default=1.0,
        type=float)
    parser.add_argument(
        '--a',
        help=
        'Hyperparameter for lambda. We recommend trying various values of a. Smaller values '
        'will result in sparser results; a good starting point might be '
        'a = log(F+N)',
        required=False,
        default=10.0,
        type=float)

    parser.add_argument(
        '--b',
        help=
        'Hyperparameter for lambda. Default used is as recommended in Tan and Fevotte 2012',
        required=False,
        type=float,
        default=None)
    parser.add_argument(
        '--objective',
        help=
        'Defines the data objective. Choose between "poisson" or "gaussian". Defaults to Poisson',
        required=False,
        default='poisson',
        type=str)

    parser.add_argument(
        '--prior_on_W',
        help='Prior on W matrix "L1" (exponential) or "L2" (half-normal)',
        required=False,
        default='L1',
        type=str)
    parser.add_argument(
        '--prior_on_H',
        help='Prior on H matrix "L1" (exponential) or "L2" (half-normal)',
        required=False,
        default='L1',
        type=str)

    parser.add_argument(
        '--output_dir',
        help=
        'output_file_name if run in array mode this correspond to the output directory',
        required=True)
    parser.add_argument('--labeled',
                        help='Input has row and column labels',
                        required=False,
                        default=False,
                        action='store_true')
    parser.add_argument('--report_frequency',
                        help='Number of iterations between progress reports',
                        required=False,
                        default=100,
                        type=int)
    parser.add_argument('--dtype',
                        help='Floating point accuracy',
                        required=False,
                        default='Float32',
                        type=str)
    parser.add_argument(
        '--parameters_file',
        help=
        'allows running many different configurations of the NMF method on a multi-'
        'GPU system. To run in this mode provide this argument with a text file with '
        'the following headers:(a,phi,b,prior_on_W,prior_on_H,Beta,label) label '
        'indicates the output stem of the results from each run.',
        required=False,
        default=None)
    args = parser.parse_args()

    print('Reading data frame from ' + args.data)

    if args.dtype == 'Float32':
        args.dtype = torch.float32
    elif args.dtype == 'Float16':
        args.dtype = torch.float16

    if args.parquet:
        dataset = pd.read_parquet(args.data)
    elif args.feather:
        print('loading feather...')
        dataset = feather.read_dataframe(args.data)
    else:
        if args.labeled:
            dataset = pd.read_csv(args.data, sep='\t', header=0, index_col=0)
        else:
            dataset = pd.read_csv(args.data, sep='\t', header=None)

    if args.objective.lower() == 'poisson':
        Beta = 1
    elif args.objective.lower() == 'gaussian':
        Beta = 2
    else:
        print('objective parameter should be one of "gaussian" or "poisson"')
        sys.exit()
    data = ARD_NMF(dataset, args.objective)
    if args.parameters_file != None:
        parameters = pd.read_csv(args.parameters_file, sep='\t')
        run_parameter_sweep(parameters, data, args, Beta)
    else:
        W, H, cost = run_method_engine(data, args.a, args.phi, args.b, Beta,
                                       args.prior_on_W, args.prior_on_H,
                                       args.K0, args.tolerance, args.max_iter)
        nsig = write_output(W, H, data.channel_names, data.sample_names,
                            args.output_dir, args.output_dir)
        return intersection_cardinality / float(x_cardinality)


print(a.c.BOLD + 'Extracting set3d JSON features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set3d_json1.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads
cache_loc = config.cache_loc
debug = config.debug
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

train = df[['itemID_1', 'itemID_2', 'attrsJSON_1', 'attrsJSON_2']]
del df
gc.collect()

train = train.fillna('')

ftrs = []

print('Calculating features ...')
t0 = time.time()
for i in range(0, len(train.index)):
import collections, itertools

import pandas as pd
import feather
from elasticsearch import Elasticsearch, helpers

if __name__ == '__main__':
    import argparse, os, json
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--index', default="eurodata", help="index name")
    parser.add_argument('--meta', default="{}", help="metadata")
    parser.add_argument('paths', type=argparse.FileType())
    args = parser.parse_args()

    es = Elasticsearch()
    meta = json.loads(args.meta)

    for fpath in args.paths:
        fpath = fpath.strip()
        df = feather.read_dataframe(fpath)
        df = df.where((pd.notnull(df)), None) # convert NaN to None
        
        name = os.path.basename(fpath)
        res = es.index(index=args.index, doc_type='schema', id=name, body={
            'schema': [s.decode('utf8', 'ignore') for s in df.columns],
            'scrape_meta': meta
        })
        try:
            it = ({
                    '_index': args.index,
                    '_type': 'row',
                    '_id': '%s-%s' % (name, i),
                    '_source': {'row':[str(r).decode('utf8') for r in row]}
                } for i,row in df.iterrows())
            print (name, '\t', sum(1 for _ in helpers.streaming_bulk(es, it)))
Example #42
def main(**opt):

    # setup
    gc.enable()
    np.random.seed(123)

    # Get the optimized parameters
    n_folds = opt.pop('n_folds', 5)
    tag = opt.pop('tag', '')
    tmt = datetime.now().strftime('%Y%m%d_%H%M')
    tag += '_' + tmt + '_'
    clf_name = opt.get('model', 'GBMClassifier')
    tag += clf_name + '_'
    clf = getattr(models, clf_name)(opt)
    assert clf is not None

    # data directory
    cur_dir = op.dirname(__file__)
    data_dir = op.join(cur_dir, '../data')
    train_cache_file = op.join(data_dir, 'train_feat_cache.feather')
    test_cache_file = op.join(data_dir, 'test_feat_cache.feather')

    useless_feat_file = op.join(data_dir, '../stat/dump_feat.txt')
    useless_feat = load_useless_feat(useless_feat_file)
    # print(useless_feat)

    if op.exists(train_cache_file) and op.exists(test_cache_file):
        print("Loading train and test feathers cache file ...")
        train = feather.read_dataframe(train_cache_file)
        test = feather.read_dataframe(test_cache_file)
    else:
        train, test = create_features(data_dir, useless_feat)

    train, y = train.iloc[:, :-1], train['TARGET']
    subm = test[['SK_ID_CURR']]
    print("Feature added train shape: {}".format(train.shape))
    train = exclude_column_df(train, useless_feat)
    test = exclude_column_df(test, useless_feat)

    if clf_name in ['RFClassifier', 'ETClassifier', 'XGB_Classifier']:
        print("One hot encoding variables ...")
        train_size = train.shape[0]
        data = pd.concat([train, test])
        del train, test

        obj_cols = [
            c for c in data.columns.tolist()[1:]
            if data[c].dtype == 'object' or data[c].dtype.name == 'category'
        ]
        # print(obj_cols)
        not_obj_cols = [c for c in data.columns.tolist() if c not in obj_cols]

        one_hot_data = pd.get_dummies(data[obj_cols])
        # print(one_hot_data.shape, type(one_hot_data))
        data = pd.concat([data[not_obj_cols], one_hot_data], axis=1)
        data = exclude_column_df(data, useless_feat)
        test = data.iloc[train_size:, :].reset_index(drop=True)
        train = data.iloc[:train_size, :].reset_index(drop=True)
        del data
        print("Encoding done!")

    # may do some tweak using feature importance
    feat_selected = train.columns.tolist()[1:]
    print("Used features count: {}".format(len(feat_selected)))

    # do stacking.
    print("Begin to do cross validation to model data ...")
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=521)
    train_pred = np.zeros(train.shape[0])
    test_pred = np.zeros((test.shape[0], n_folds))
    feat_imp = pd.DataFrame(np.zeros((len(feat_selected), n_folds)))
    feat_imp['features'] = feat_selected

    for k, (trn_idx, val_idx) in enumerate(cv.split(train, y)):
        trn_x, trn_y = train[feat_selected].iloc[trn_idx], y.iloc[trn_idx]
        val_x, val_y = train[feat_selected].iloc[val_idx], y.iloc[val_idx]

        clf.fit(train_set=(trn_x, trn_y), valid_set=(val_x, val_y))

        train_pred[val_idx] = clf.predict_proba(val_x)
        test_pred[:, k] = clf.predict_proba(test[feat_selected])

        stat = roc_auc_score(val_y, train_pred[val_idx])
        print("K={}, AUC: {:.3f}".format(k + 1, stat))

        # collect importance info
        feat_imp.iloc[:, k] = clf.get_feat_imp()

    total_auc = roc_auc_score(y, train_pred)
    print("CV-{} had been done! Total train auc is: {:.4f}".format(
        n_folds, total_auc))

    feat_imp['imp_mean'] = feat_imp.iloc[:, :n_folds].mean(axis=1)
    feat_imp['imp_std'] = feat_imp.iloc[:, :n_folds].std(axis=1)
    feat_imp['imp_cv'] = feat_imp['imp_std'] / feat_imp['imp_mean']
    feat_imp = feat_imp.iloc[:, n_folds:].sort_values('imp_cv',
                                                      ascending=True,
                                                      na_position='last')
    ind1 = feat_imp['imp_cv'].isnull()
    ind2 = feat_imp['imp_cv'] > 0.5
    ind3 = feat_imp['imp_mean'] < 10
    ind = ind1 | (ind2 & ind3)
    feat_imp['should_filter'] = ind.astype('int')

    # save to files
    tag += 'kfold_{}_auc_{:.4f}_'.format(n_folds, total_auc)
    print("Begin to save statistic into files")
    stat_dir = op.join(cur_dir, '../stat')
    feat_imp_file = op.join(stat_dir, tag + 'feat_imp.csv')
    feat_imp.to_csv(feat_imp_file, index=False)

    train_pred_ = train[['SK_ID_CURR']].copy()
    train_pred_.loc[:, 'TARGET_PRED'] = train_pred
    train_pred_file = op.join(stat_dir, tag + 'train_cv_pred.csv')
    train_pred_.to_csv(train_pred_file, index=False)

    print("Saving test prediction to files ...")
    subm['TARGET'] = np.mean(test_pred, axis=1)
    subm_file = op.join(cur_dir, '../sub', tag + 'subm.csv.gz')
    subm.to_csv(subm_file, index=False, compression='gzip')

    print("All prediction done!")
import feather
import numpy as np
import pandas as pd

#Import feather files that were downloaded and saved using TCGA2STAT package in R:
gene_counts = feather.read_dataframe('Gene_counts.feather')

#Check data import:
if gene_counts.shape[0] > 0 :
    print("Gene Counts data set imported!")
else :
    print("Error in Gene_counts import")

"""Gene Counts dataframe are formatted with patient ID as the index and gene
names as columns.  Each value represents the abundance estimate for the gene in
each particular RNA-seq run.  This value is 'raw' in that it is not normalized
by the total number of reads made for the sample.  Thus the first step in data
clean-up is to transform these values to be normalized by the total number of
reads for the sample (in Millions)."""

gene_counts.set_index(['gc_index'], inplace = True) # set the index as the TCGA ID codes
#print(gene_counts.index[1:5]) [Debug]
print("\nDimension of DataFrame:", gene_counts.shape,"\n")

def transformation(dataset) :
    read_count = dataset.sum(axis = 1)  # total reads for each sample
    for r in range(0, dataset.shape[0]) :
        # scale each read abundance (rsem) to reads per million for the sample
        dataset.iloc[r] = 1000000 * dataset.iloc[r] / read_count.iloc[r]
    # each row of the transformed df should now sum to 1,000,000; report success if every row does
    if sum(round(dataset.sum(axis = 1)) == 1e6) == dataset.shape[0] :
        print("Transformation Successful!\n")
        print(dataset.shape[0], 'Gene count estimate profiles have been transformed from transcript abundance estimates to transcripts per million reads (TPM)')
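
# A minimal vectorized sketch of the same per-million scaling (an alternative to the row
# loop above; assumes the same samples-by-genes DataFrame layout):
def transformation_vectorized(dataset) :
    read_count = dataset.sum(axis = 1)              # total reads per sample
    return dataset.div(read_count, axis = 0) * 1e6  # every row rescaled to reads per million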
Example #44
0
def read_df(filename, index_col='date'):
    import feather
    return feather.read_dataframe(filename).set_index(index_col)
Example #45
0
zd1 = hos_dic.ix[0,:]

ttl_fee.head(10)

ttl_fee['hosname'] = ttl_fee['x5'].map(hos_dic15)



hos_dic = pd.read_hdf("/mnt/e/pyr/data/hdf5/R_fee_15.h5",
                      'hos_dic')

hos_dic.columns = ['code','name']

hos_dic15 = dict(zip(hos_dic.code,hos_dic.name))

ttlfee = ft.read_dataframe(
    '/mnt/e/pyr/data/y2015/2015x/2015_x229.pyr')

veri = ft.read_dataframe(
    '/mnt/e/pyr/data/y2015/2015x/2015_x262.pyr')

nm = ft.read_dataframe(
    '/mnt/e/pyr/data/y2015/2015x/2015_x32.pyr')

gender = ft.read_dataframe(
    '/mnt/e/pyr/data/y2015/2015x/2015_x33.pyr')
    
birthdate = ft.read_dataframe(
    '/mnt/e/pyr/data/y2015/2015x/2015_x34.pyr')
    

record  = ft.read_dataframe(
Example #46
0
def read_feather_dask(filepath):
    df = feather.read_dataframe(filepath, columns=p.columns)
    return dd.from_pandas(df, npartitions=p.n_workers)
Example #47
0
def export_data_set(name = None):
    if name is None:
        data = feather.read_dataframe(join(out_path, "iris.data"))
    else:
        data = feather.read_dataframe(join(out_path, name))
    return data
Example #48
0
    # input directory
    indir = "/disks/shear15/ssli/KV450/split/all_tomo"
    # input postfix
    inP_r = "_T_B_less3"
    inP_b = "_T_B_greater3"
    inPs = [inP_r, inP_b]


    area = 341.3 * 3600. # area in arcmin^2 (341.3 deg^2 * 3600), so densities come out in 1/arcmin^2

    outdir = "/disks/shear15/ssli/CosmicShear/covariance/prepare/Ndensity_sigmae"
    # output postfix
    outPs = ["_red", "_blue"]

    for k in range(len(inPs)):
        WorA = 'w' 
        for i in range(5):
            inpath = indir + str(i+1) + inPs[k] + '.feather'
            indata = feather.read_dataframe(inpath)
            outpath = outdir + outPs[k] + '.txt'

            id_zbin = i + 1
            e1 = indata['bias_corrected_e1']
            e2 = indata['bias_corrected_e2']
            wg = indata['recal_weight']

            NeffSigmaeFunc(id_zbin, e1, e2, wg, area, outpath, WorA)
            WorA = 'a'
            print("Finished in", id_zbin, inPs[k])

Example #49
0
from pandas.util.testing import assert_frame_equal

import os
import uuid

import numpy as np
import pandas as pd

import feather

nrows = 4000000
ncols = 100

data = np.random.randn(nrows)

df = pd.DataFrame({'c{0}'.format(i): data
                   for i in range(ncols)})

def guid():
    return uuid.uuid4().hex

path = 'test_{0}.feather'.format(guid())

try:
    feather.write_dataframe(df, path)
    df2 = feather.read_dataframe(path)
    assert_frame_equal(df, df2)
finally:
    try:
        os.remove(path)
    except os.error:
        pass
Example #50
0
import argparse
import feather
import magic
#import os

parser = argparse.ArgumentParser(description='wrapper for magic')

parser.add_argument('--matx', dest='matx', help='Matx path')
parser.add_argument('--out', dest='out', help='Output path')
#parser.add_argument('--maxCellSize', dest='maxCS',type=int,default=1000000,help='Max num of reads allow in a cell')
#parser.add_argument('--minCellSize', dest='minCS',type=int,default=1,help='Min num of reads allow in a cell')

args = parser.parse_args()

# Load single-cell RNA-seq data
df = feather.read_dataframe(args.matx)
scdata = magic.mg.SCData(df, 'sc-seq')

# MAGIC
scdata.run_magic(n_pca_components=15,
                 random_pca=True,
                 t=6,
                 k=30,
                 ka=10,
                 epsilon=1,
                 rescale_percent=99)

#output
feather.write_dataframe(scdata.magic.data, args.out)
Example #51
0
import gc
from datetime import timedelta

import pandas as pd
import feather


def fix_test(df):
    """
    test has missing values for ci and co.
    ci is, on average, 35 days after date_Time
    co is, on average, 2.4 days after ci
    """
    df.date_time = pd.to_datetime(df.date_time, errors='coerce')
    df.srch_ci = pd.to_datetime(df.srch_ci, errors='coerce')
    df.srch_co = pd.to_datetime(df.srch_co, errors='coerce')
    df.srch_ci = df.srch_ci.fillna(df.date_time + timedelta(days=35))
    df.srch_co = df.srch_co.fillna(df.srch_ci + timedelta(days=2))
    return df
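
# A hedged usage sketch of fix_test on a tiny hand-made frame; the column names follow
# the docstring above and the sample values are purely illustrative:
_demo = pd.DataFrame({'date_time': ['2015-06-01 10:00:00'],
                      'srch_ci': [None], 'srch_co': [None]})
_demo = fix_test(_demo)  # srch_ci -> date_time + 35 days, srch_co -> srch_ci + 2 days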

print(78*'=')
print("Reading train...")
df_train = feather.read_dataframe('../data/train_only_booked.feather')
print("Creating Features for Train...")
df_train_features = create_features(df_train, train=True)
print("Writing Feather...")
feather.write_dataframe(df_train_features, '../data/train_only_booked_features.feather')
gc.collect()

print(78*'=')
print("Reading holdout...")
df_hold = feather.read_dataframe('../data/holdout.feather')
print("Munging Holdout")
df_hold_feat = create_features(df_hold)
print("Writing Feather...")
feather.write_dataframe(df_hold_feat, '../data/holdout_features.feather')
gc.collect()
Example #52
0
 def from_uri(cls, uri: str, source: Optional[DataObject] = None, **kwargs) -> "FeatherFile":
     data = feather.read_dataframe(uri)
     result = cls(inner_data=data, uri=uri, source=source, **kwargs)
     return result
Example #53
0
]
plot_excl = ["BARW_8"]

# The palette with black:
cbbPalette = [
    "#000000", "#E69F00", "#56B4E9", "#009E73", "#0072B2", "#D55E00", "#CC79A7"
]

site_data = pd.read_excel(
    "/mnt/win/UMoncton/OneDrive - Université de Moncton/Data/sites_deployment_2018.xlsx"
)
# site_data = pd.read_excel(
#     "C:\\UMoncton\\Doctorat\\data\\datasheet\\2018\\sites_deployment_2018.xlsx")

# aci = feather.read_dataframe("src/plots/ACI.feather")
aci = feather.read_dataframe("data/ACI.feather")
print(aci)
# aci.date = aci.date.dt.tz_localize("UTC")
aci = aci.loc[aci.site.isin(sites)]
aci = aci.loc[~aci["plot"].isin(plot_excl)]
aci["julian"] = aci["date"].dt.dayofyear
aci["hour"] = aci["date"].dt.hour
aci = aci.sort_values(["site", "plot", "julian", "date"])
aci = aci.loc[(aci["julian"] > 155) & (aci["julian"] < 220)]
aci = aci.loc[(aci["ACI"] < 50000)]
aci = aci.loc[aci["denoised"] == False]

print(aci.loc[aci["plot"] == "EABA_1"])
aci = aci.reset_index()
res = aci.groupby(["plot"], as_index=False).apply(check_dates, site_data)
print(res)
def load_sample_data(sample, filename):
    tbl = pd.read_csv(filename, index_col=0)
    tbl.columns = ['{}:{}'.format(sample, c) for c in tbl.columns]
    return tbl

def column_sortkey(name):
    sample, c = name.split(':', 1)
    return '{}:{}'.format(GENELEVEL_STATS_COLUMN_ORDER.get(c, c), sample)

# Load all tables
samples = sm.params.samples_by_genome[sm.wildcards.genome]
alldatatbl = pd.concat(list(map(load_sample_data, samples, sm.input)), axis=1)

# Merge selected columns from gene annotations
genes = feather.read_dataframe(os.path.join(sm.params.genomedir, 'annotations-gene.feather'))
for alias in GENETYPE_ALIASES:
    if alias in genes.columns:
        genes['gene_type'] = genes[alias]
genes = genes.set_index('gene_id')[GENEINFO_COLUMNS]

# Determine orders of columns and rows
cols_order = GENEINFO_COLUMNS + sorted(alldatatbl.columns, key=column_sortkey)
alldatatbl['__total_tags'] = alldatatbl[[c for c in alldatatbl.columns
                                        if c.endswith(':polyA_tag_count')]].sum(axis=1)
finaltbl = pd.merge(genes, alldatatbl, how='right', left_index=True,
                    right_index=True).sort_values(
                    by='__total_tags', ascending=False)[cols_order]
del alldatatbl, genes

# Write out in the csv format
Example #55
0
# -*- coding: utf-8 -*-

import feather as ft
import pandas as pd
import pickle

daily = ft.read_dataframe(r'E:\marketData.feather')

daily.head()

daily_2017 = daily[daily['date'] >= '2017-01-01']
daily_2017 = daily_2017[['date', 'symbol', 'close', 'preClose']]

daily_2017['daily_return'] = (daily_2017['close'] - daily_2017['preClose']
                              ) * 100 / daily_2017['preClose']

output = open(r'C:\Users\wuwangchuxin\Desktop\dailyreturn.pickle', 'wb')
pickle.dump(daily_2017, output)
output.close()
Example #56
0

# imports assumed from the truncated top of this script
import numpy as np
import feather
from sklearn.feature_selection import chi2

#print(datetime.datetime.now())

#### val
#path = 'D:\\workspace_R\\thalas\\20180427\\dataset\\dataset_val_fold3_blind1_trX.feather'
#train_x = feather.read_dataframe(path)
#
#
#path = 'D:\\workspace_R\\thalas\\20180427\\dataset\\dataset_val_fold3_blind1_trY.feather'
#train_y = feather.read_dataframe(path)
#
#path_write = 'D:\\workspace_R\\thalas\\20180427\\result\\chi\\chi_val_fold3_blind1_k.csv'
### val


#### test
path = 'D:\\workspace_R\\thalas\\20180427\\dataset\\dataset_trainX_test5.feather'
train_x = feather.read_dataframe(path)


path = 'D:\\workspace_R\\thalas\\20180427\\dataset\\dataset_trainY_test5.feather'
train_y = feather.read_dataframe(path)

path_write = 'D:\\workspace_R\\thalas\\20180427\\result\\chi\\chi_test_fold5.csv'
### test

chi2_stat, pval = chi2(X=train_x, y=train_y)
x2 = np.asarray(pval)
#np.savetxt("fold1_score.csv", x2, delimiter=",")
np.savetxt(path_write, x2, delimiter=",")
Example #57
0
def read_dataframe(fn):
    df = feather.read_dataframe(fn)
    df.index = df.iloc[:, 0]
    df = df.iloc[:, 1:]
    return df
"""

# Prepare environment and load and prepare data  -----------------------------

import pandas as pd
import nltk
import feather
import re, math
from collections import Counter

#from sklearn.feature_extraction.text import TfidfVectorizer


# Feather-formatted dataframes import directly into pandas DataFrames
# New module/package collaboration for easy transfer R <--> Python; a write-back
# sketch follows the two reads below
event_dict = feather.read_dataframe('../parsed_data/parsed_dict.feather')
email_df = feather.read_dataframe('../parsed_data/simplified_email.feather')
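
# A minimal sketch of the reverse direction (pandas -> R): any of these frames can be
# written back out with feather.write_dataframe for R to read; the path below is hypothetical.
# feather.write_dataframe(email_df, '../parsed_data/simplified_email_out.feather')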

## Prepare event dictionary dataframe
event_dict.drop(['NA'], inplace=True, axis=1)
event_dict = event_dict.transpose()
event_dict.columns = event_dict.loc['event']
event_dict = event_dict.reindex(event_dict.index.drop(['event']))

# remove occurrences of "x...#" stemming from wikipedia using regular expression
event_dict = event_dict.applymap(lambda z: re.sub(r'(^|\s)x(\w+,)', r'', z))

## Prepare email dataframe
# Convert string in email_raw column to list of strings
#email_df.email_raw = email_df.email_raw.apply(lambda x: x.split(","))
#output_variables
cell_score_output = snakemake.output["cell_score"]
gene_score_output = snakemake.output["gene_score"]
maximum_overlap_output = snakemake.output["maximum_overlaps"]
ranked_genes_output = snakemake.output["ranked_genes"]
model_output = snakemake.output["model_output"]
metrics_output = snakemake.output["metrics_output"]
#params
seed = int(snakemake.wildcards["seed"])
k = int(snakemake.wildcards["k"])
n_trials = int(snakemake.params["n_trials"])
#set seed
np.random.seed(seed)

#read file
df = feather.read_dataframe(counts_input)
#start analysis
sparse_arr = scipy.sparse.coo_matrix(df.to_numpy())
model = hpf.run_trials(sparse_arr,
                       nfactors=k,
                       ntrials=n_trials,
                       validation_data=None)

metrics = pd.DataFrame()
metrics["loss"] = model.loss[-10:]
metrics["k"] = np.repeat(k, 10)
metrics["seed"] = np.repeat(seed, 10)

cell_score = model.cell_score()
gene_score = model.gene_score()
table = hpf.max_pairwise_table(gene_score,
Example #60
0
from dataprocess import data
import featureengineer
from featureengineer import persona_features
import pandas as pd
import feather
import gc

appsnum, tags_nums = featureengineer.appsnum, featureengineer.tags_nums

lgbOut_Features = persona_features.main()

afterExpand_df_path = persona_features.afterExpand_df_path

noExpand_dfPath = data.noExpand_dfPath

data = feather.read_dataframe(afterExpand_df_path)
persona_df = pd.read_pickle(noExpand_dfPath)
data = data.merge(persona_df, how='left', on='guid')
del persona_df
gc.collect()

sparse_features = [
    'app_version', 'guid', 'netmodel', 'newsid', 'geohash', 'ts_hour',
    'device_info', 'gender'
]

dense_features = [
    'pos', 'level', 'personidentification', 'followscore', 'personalscore'
]

var_features = ['applist', 'new_tag']