Example #1
    def test_sas_buffer_format(self):
        # see gh-14947
        b = StringIO("")

        msg = ("If this is a buffer object rather than a string "
               "name, you must specify a format string")
        with pytest.raises(ValueError, match=msg):
            read_sas(b)
Example #2
 def test_iterator_read_too_much(self):
     # github #14734
     k = self.test_ix[0][0]
     fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
     rdr = pd.read_sas(fname, format="sas7bdat",
                       iterator=True, encoding='utf-8')
     d1 = rdr.read(rdr.row_count + 20)
     rdr = pd.read_sas(fname, iterator=True, encoding="utf-8")
     d2 = rdr.read(rdr.row_count + 20)
     tm.assert_frame_equal(d1, d2)
Example #3
def test_12659(datapath):
    fname = datapath("io", "sas", "data", "test_12659.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "test_12659.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0)
Example #4
def test_airline(datapath):
    fname = datapath("io", "sas", "data", "airline.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "airline.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0, check_exact=False)
Example #5
def test_many_columns(datapath):
    # Test for looking for column information in more places (PR #22628)
    fname = datapath("io", "sas", "data", "many_columns.sas7bdat")
    df = pd.read_sas(fname, encoding='latin-1')
    fname = datapath("io", "sas", "data", "many_columns.csv")
    df0 = pd.read_csv(fname, encoding='latin-1')
    tm.assert_frame_equal(df, df0)
Example #6
 def test_from_file(self):
     for j in 0, 1:
         df0 = self.data[j]
         for k in self.test_ix[j]:
             fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
             df = pd.read_sas(fname, encoding='utf-8')
             tm.assert_frame_equal(df, df0)
Example #7
def test_encoding_options(datapath):
    fname = datapath("io", "sas", "data", "test1.sas7bdat")
    df1 = pd.read_sas(fname)
    df2 = pd.read_sas(fname, encoding='utf-8')
    for col in df1.columns:
        try:
            df1[col] = df1[col].str.decode('utf-8')
        except AttributeError:
            pass
    tm.assert_frame_equal(df1, df2)

    from pandas.io.sas.sas7bdat import SAS7BDATReader
    rdr = SAS7BDATReader(fname, convert_header_text=False)
    df3 = rdr.read()
    rdr.close()
    for x, y in zip(df1.columns, df3.columns):
        assert(x == y.decode())
Example #8
def test_productsales(datapath):
    fname = datapath("io", "sas", "data", "productsales.sas7bdat")
    df = pd.read_sas(fname, encoding='utf-8')
    fname = datapath("io", "sas", "data", "productsales.csv")
    df0 = pd.read_csv(fname, parse_dates=['MONTH'])
    vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
    df0[vn] = df0[vn].astype(np.float64)
    tm.assert_frame_equal(df, df0)
Example #9
def test_12659():
    dirpath = tm.get_data_path()
    fname = os.path.join(dirpath, "test_12659.sas7bdat")
    df = pd.read_sas(fname)
    fname = os.path.join(dirpath, "test_12659.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0)
Example #10
def test_airline():
    dirpath = tm.get_data_path()
    fname = os.path.join(dirpath, "airline.sas7bdat")
    df = pd.read_sas(fname)
    fname = os.path.join(dirpath, "airline.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0, check_exact=False)
Example #11
def test_productsales():
    dirpath = tm.get_data_path()
    fname = os.path.join(dirpath, "productsales.sas7bdat")
    df = pd.read_sas(fname, encoding='utf-8')
    fname = os.path.join(dirpath, "productsales.csv")
    df0 = pd.read_csv(fname, parse_dates=['MONTH'])
    vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
    df0[vn] = df0[vn].astype(np.float64)
    tm.assert_frame_equal(df, df0)
Example #12
 def test_from_buffer(self):
     for j in 0, 1:
         df0 = self.data[j]
         for k in self.test_ix[j]:
             fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
             with open(fname, 'rb') as f:
                 byts = f.read()
             buf = io.BytesIO(byts)
             df = pd.read_sas(buf, format="sas7bdat", encoding='utf-8')
             tm.assert_frame_equal(df, df0, check_exact=False)
Example #13
 def test_path_localpath(self):
     from py.path import local as LocalPath
     for j in 0, 1:
         df0 = self.data[j]
         for k in self.test_ix[j]:
             fname = LocalPath(os.path.join(
                 self.dirpath, "test{k}.sas7bdat".format(k=k)))
             df = pd.read_sas(fname, encoding='utf-8')
             tm.assert_frame_equal(df, df0)
Example #14
def test_date_time():
    # Support of different SAS date/datetime formats (PR #15871)
    dirpath = tm.get_data_path()
    fname = os.path.join(dirpath, "datetime.sas7bdat")
    df = pd.read_sas(fname)
    fname = os.path.join(dirpath, "datetime.csv")
    df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime',
                                          'DateTimeHi', 'Taiw'])
    tm.assert_frame_equal(df, df0)
Example #15
 def test_path_pathlib(self):
     tm._skip_if_no_pathlib()
     from pathlib import Path
     for j in 0, 1:
         df0 = self.data[j]
         for k in self.test_ix[j]:
             fname = Path(os.path.join(self.dirpath, "test%d.sas7bdat" % k))
             df = pd.read_sas(fname, encoding='utf-8')
             tm.assert_frame_equal(df, df0)
Example #16
 def test_from_iterator(self):
     for j in 0, 1:
         df0 = self.data[j]
         for k in self.test_ix[j]:
             fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
             rdr = pd.read_sas(fname, iterator=True, encoding='utf-8')
             df = rdr.read(2)
             tm.assert_frame_equal(df, df0.iloc[0:2, :])
             df = rdr.read(3)
             tm.assert_frame_equal(df, df0.iloc[2:5, :])
Example #17
def test_date_time(datapath):
    # Support of different SAS date/datetime formats (PR #15871)
    fname = datapath("io", "sas", "data", "datetime.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "datetime.csv")
    df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime',
                                          'DateTimeHi', 'Taiw'])
    # GH 19732: Timestamps imported from sas will incur floating point errors
    df.iloc[:, 3] = df.iloc[:, 3].dt.round('us')
    tm.assert_frame_equal(df, df0)
Example #18
 def test_iterator_loop(self):
     # github #13654
     for j in 0, 1:
         for k in self.test_ix[j]:
             for chunksize in 3, 5, 10, 11:
                 fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
                 rdr = pd.read_sas(fname, chunksize=10, encoding='utf-8')
                 y = 0
                 for x in rdr:
                     y += x.shape[0]
                 self.assertTrue(y == rdr.row_count)
Example #19
def df_fromsas(fullpath, id_lbl='ind_id'):
    ''' convert .sas7bdat to dataframe.
        unused because it fails on incorrectly formatted files. '''

    # read csv in as dataframe
    df = pd.read_sas(fullpath, format='sas7bdat')

    # convert id to str and save as new column
    df[id_lbl] = df[id_lbl].apply(int).apply(str)
    df['ID'] = df[id_lbl]

    return df
Example #20
 def test_from_buffer(self):
     for j in 0, 1:
         df0 = self.data[j]
         for k in self.test_ix[j]:
             fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
             with open(fname, 'rb') as f:
                 byts = f.read()
             buf = io.BytesIO(byts)
             rdr = pd.read_sas(buf, format="sas7bdat",
                               iterator=True, encoding='utf-8')
             df = rdr.read()
             tm.assert_frame_equal(df, df0, check_exact=False)
             rdr.close()
Example #21
 def test_from_iterator(self):
     for j in 0, 1:
         df0 = self.data[j]
         for k in self.test_ix[j]:
             fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
             with open(fname, 'rb') as f:
                 byts = f.read()
             buf = io.BytesIO(byts)
             rdr = pd.read_sas(buf, format="sas7bdat",
                               iterator=True, encoding='utf-8')
             df = rdr.read(2)
             tm.assert_frame_equal(df, df0.iloc[0:2, :])
             df = rdr.read(3)
             tm.assert_frame_equal(df, df0.iloc[2:5, :])
Example #22
 def test_iterator_loop(self):
     # github #13654
     for j in 0, 1:
         for k in self.test_ix[j]:
             for chunksize in 3, 5, 10, 11:
                 fname = os.path.join(
                     self.dirpath, "test{k}.sas7bdat".format(k=k))
                 rdr = pd.read_sas(fname, chunksize=10, encoding='utf-8')
                 y = 0
                 for x in rdr:
                     y += x.shape[0]
                 assert y == rdr.row_count
                 rdr.close()
Example #23
def test_compact_numerical_values(datapath):
    # Regression test for #21616
    fname = datapath("io", "sas", "data", "cars.sas7bdat")
    df = pd.read_sas(fname, encoding='latin-1')
    # The two columns CYL and WGT in cars.sas7bdat have column
    # width < 8 and only contain integral values.
    # Test that pandas doesn't corrupt the numbers by adding
    # decimals.
    result = df['WGT']
    expected = df['WGT'].round()
    tm.assert_series_equal(result, expected, check_exact=True)
    result = df['CYL']
    expected = df['CYL'].round()
    tm.assert_series_equal(result, expected, check_exact=True)
Example #24
File: io.py Project: adgirish/ray
def read_sas(filepath_or_buffer,
             format=None,
             index=None,
             encoding=None,
             chunksize=None,
             iterator=False):

    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)

    port_frame = pd.read_sas(filepath_or_buffer, format, index, encoding,
                             chunksize, iterator)
    ray_frame = from_pandas(port_frame, get_npartitions())

    return ray_frame
Example #25
    def reader(self,input,delimiter=None):
        # read input in the following formats: xls, xlsx, csv, txt, sas7bdat
        if input.endswith(('.xls','.xlsx','.XLS','.XLSX')):
            tmpDict = pd.read_excel(input,sheetname=None)
            dropList=[]
            for sheet in tmpDict:
                if tmpDict[sheet].empty:
                    dropList.append(sheet)
            for key in dropList:
                del tmpDict[key]
            return tmpDict.values()

        elif input.endswith('.csv'):
            tmp = pd.read_csv(input)
            return [tmp]

        elif input.endswith('.sas7bdat'):
            tmp = pd.read_sas(input)
            return [tmp]
        elif input.endswith(".jmp"):
            return False
Example #26
import numpy as np
import pandas as pd
import skimage
from KPCA_self import kPCA
from scipy.spatial.distance import pdist, squareform, cdist
from scipy.linalg import eigh
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.decomposition import KernelPCA

results = []
from sas7bdat import SAS7BDAT
with SAS7BDAT('pricedata_17.sas7bdat') as f:
    for row in f:
        pass

saledata = pd.read_sas('pricedata_17.sas7bdat')
saledata = saledata['sale']

#f = lambda x: x.sort_values(ascending=True).reset_index(drop=True)
#sale_t=saledata.groupby(['product']).sale.apply(f).unstack()
#sm=pd.DataFrame.as_matrix(sale_t)
#
#
#grouped=saledata.groupby('product').apply(np.matrix.transpose)

saledata_prod1 = saledata.iloc[:60]
saledata_prod1.plot(x='index', y='sale')

sm = pd.DataFrame.as_matrix(saledata)
sm = sm[0:60]
sm = np.reshape(sm, (60, 1))
Example #27
def test_zero_variables():
    # Check if the SAS file has zero variables (PR #18184)
    dirpath = tm.get_data_path()
    fname = os.path.join(dirpath, "zero_variables.sas7bdat")
    with pytest.raises(EmptyDataError):
        pd.read_sas(fname)
Example #28
def read_xml(location, result):
    validate_dtd = False
    if result[RDF.type:setl.DTDValidatedXML]:
        validate_dtd = True
    f = iterparse_filter.IterParseFilter(validate_dtd=validate_dtd)
    if result.value(setl.xpath) is None:
        f.iter_end("/*")
    for xp in result[setl.xpath]:
        f.iter_end(xp.value)
    for (i, (event, ele)) in enumerate(f.iterparse(get_content(location))):
        yield i, ele


extractors = {
    setl.XPORT:
    lambda location, result: pandas.read_sas(get_content(location),
                                             format='xport'),
    setl.SAS7BDAT:
    lambda location, result: pandas.read_sas(get_content(location),
                                             format='sas7bdat'),
    setl.Excel:
    read_excel,
    csvw.Table:
    read_csv,
    OWL.Ontology:
    read_graph,
    void.Dataset:
    read_graph,
    setl.JSON:
    lambda location, result: enumerate(
        ijson.items(get_content(location),
                    result.value(api_vocab.selector, default=""))),
Example #29
# In[ ]:

df = pd.DataFrame()
df.to_stata(r'C:\file.dta')


# *Note: make sure you have the latest version of Pandas for new Stata versions*

# ### SAS

# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sas.html  
# This function works in most cases but files with text can throw nasty encoding errors.

# In[ ]:

sas_file = pd.read_sas(r'C:\file.sas7bdat', format='sas7bdat')
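
# If the file's text columns use a non-UTF-8 code page, read_sas can be given an
# explicit encoding. A minimal sketch, reusing the hypothetical path above and
# assuming a latin-1 code page (the encoding choice is an assumption, not from
# the original notebook):

# In[ ]:

sas_file = pd.read_sas(r'C:\file.sas7bdat', format='sas7bdat', encoding='latin-1')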


# ## JSON files

# ### Convert JSON file to Pandas dataframe

# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html  
# 
# *Note:* The path can also be a url

# In[ ]:

json_df = pd.read_json(r'C:\file.json')

Example #30

    plt.plot([1,2,3], [1,3,2], label="col1")
    plt.plot([1,2,3],  [2,1,3], label="col2")
    plt.legend(loc='best')
    plt.grid(True)
    plt.title('Title')
    txt = 'this is an example'
    plt.text(0.05,0.95,txt, transform=fig.transFigure, size=24)
    pdf.savefig()
    plt.close()

# Read file, apply desired sampling, and create permanent file
def data_read_sampling (_file, _sample_size):
	email_end(_subject = ' data_read_sampling ' + str(_sample_size) + ' Start ')
	# Load full data 
	df = pd.read_sas(filepath_or_buffer = _file)
	# Sample 
	if len(_sample_size) > 0:
		df_s = df.sample(_sample_size)
		email_end(_subject = ' data_read_sampling ' + str(_sample_size) + ' Done ')
		return df_s
	else:
		email_end(_subject = ' data_read_sampling ' + str(_sample_size) + ' Done ')
		return df


# Convert array to list 
array.tolist()		

# Convert data frame to numpy array
data_array = data.values
Example #31
 def extract_data(self, table_name, col_names):
     data_frame = pd.read_sas(self.file_path)
     return pd.DataFrame(data_frame, columns=col_names)
Example #32
def test_zero_variables():
    # Check if the SAS file has zero variables (PR #18184)
    dirpath = tm.get_data_path()
    fname = os.path.join(dirpath, "zero_variables.sas7bdat")
    with pytest.raises(EmptyDataError):
        pd.read_sas(fname)
Example #33
def read_raw_data(write_data=False):
    """
    ITA data is stored by year in stata files that take awhile to read in.
    These were read in once, the desired columns were kept and data was
    written to the location above in /raw
    """
    years = np.arange(2005, 2017, 1)

    df_list = []
    sdict = {}
    for year in years:
        print("Starting on year {}".format(year))
        start = time.time()
        if year == 2007:
            print("2007 Stata is corrupted as of 2018-01-12")
            fpath = "FILENAME"\
                    "gbd 2005-2016 sas7bdat files/gbd_2007.sas7bdat"
            df = pd.read_sas(fpath)
        else:

            fpath = r"FILENAME"\
                    "FILENAME"\
                    "FILEPATH".format(year)
            df = pd.read_stata(fpath)
            read_time = (time.time() - start) / 60
            print("It took {} min to read in year {}".format(read_time, year))

            sdict[year] = df.columns.tolist()

        try:

            ideal_keep = [
                'sesso', 'eta', 'reg_ric', 'regric', 'gg_deg', 'mod_dim',
                'mot_dh', 'tiporic', 'dpr', 'dsec1', 'dsec2', 'dsec3', 'dsec4',
                'dsec5', 'causa_ext', 'tip_ist2', 'data_ric', 'data_dim',
                'data_ricA', 'data_dimA'
            ]
            to_keep = [n for n in ideal_keep if n in df.columns]
            print(("Missing {} for this year".format(
                [n for n in ideal_keep if n not in df.columns])))
            df = df[to_keep]
            df['year_start'] = year
            df_list.append(df)
        except:
            print("well that didn't work for {}".format(year))
        del df

    if write_data:
        df = pd.concat(df_list, ignore_index=True)

        df.loc[(df.data_ric.isnull()) & (df.data_ricA.notnull()), 'data_ric'] =\
            df.loc[(df.data_ric.isnull()) & (df.data_ricA.notnull()), 'data_ricA']
        df.loc[(df.data_dim.isnull()) & (df.data_dimA.notnull()), 'data_dim'] =\
            df.loc[(df.data_dim.isnull()) & (df.data_dimA.notnull()), 'data_dimA']

        df.drop(['data_ricA', 'data_dimA'], axis=1, inplace=True)

        df['gg_deg'] = pd.to_numeric(df.gg_deg, errors='raise')

        write_hosp_file(df, "FILEPATH")
    return df_list
Example #34
import pandas as pd

codes = pd.read_csv("./data/London_District_codes.csv")
socio = pd.read_spss("./data/London_ward_data_socioeconomic.sav")
health = pd.read_sas("./data/london_ward_data_health.sas7bdat",
                     format='sas7bdat',
                     encoding='latin1')

health = health.drop('Population2011Census', axis=1)

env = pd.read_csv("./data/London_ward_data_environment.csv")
demo = pd.read_csv("./data/London_ward_data_demographics.dat", delimiter='\t')

socio['Districtcode'] = socio['Wardcode'].str[:-2]
socio_env = pd.merge(socio, env, on='Wardcode')

codes['Districtcode'] = codes['Districtcode']\
    .replace(r'\s', '', regex=True)

health[['District', 'Ward', 'remove',
        'remove']] = health['Wardname'].str.split('-', expand=True)
health['District'] = health['District'].str[:-1]
health = health.drop(['Wardname', 'remove'], axis=1)
total_df = pd.merge(socio_env, codes, on='Districtcode')
total_df = pd.merge(total_df, health, on='District')

demo[
    ['District',
        'Ward',
        'remove',
        'remove']
Example #35
 def time_read_msgpack(self, format):
     read_sas(self.f, format=format)
Example #36
 def time_packers_read_sas7bdat(self):
     pd.read_sas(self.f, format='sas7bdat')
Example #37
import os, pandas as pd, numpy as np, gc
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from hyperopt import fmin, hp, tpe, STATUS_OK, Trials
import lightgbm as lgbm
import warnings
warnings.filterwarnings("ignore")

##import data from sasfile

analysis = pd.read_sas(
    'Dataset/combined_vars_04092019.sas7bdat', encoding='latin1').drop([
        'TRR_ID', 'TRANSPLANT_DT', 'TRANSPLANT_DISCHARGE_DT', 'READMISSION_DT'
    ],
                                                                       axis=1)
biomarker = pd.read_csv('Dataset/feature_extracted.csv').drop('CODE_REHOSP', 1)
X = analysis.merge(biomarker,
                   on='PERSON_ID').drop(['CODE_REHOSP', 'PERSON_ID'], 1)
y = analysis['CODE_REHOSP'].replace(2, 0)
cat_col = [
    X.columns.get_loc(col) for col in X.columns if 2 < X[col].nunique() <= 10
]

for col in X.columns:
    if X[col].dtype == 'O':
        X[col] = LabelEncoder().fit_transform(X[col].fillna('Unknown'))
    elif 2 < X[col].nunique() <= 10:
        X[col] = LabelEncoder().fit_transform(X[col].fillna(99))
Example #38
import pandas as pd
import plotly.graph_objects as go
from plotly.offline import plot

# Create random data with numpy
data = pd.read_sas(
    r"C:\Users\luoz14\OneDrive - Pfizer\Test Data\adpc_f001.sas7bdat",
    encoding='latin1')

data = data.drop_duplicates(['_X', 'TRTA'])

fig = go.Figure()

trt1 = data.query("TRTA=='60 mg IR'")
trt2 = data.query("TRTA=='60 mg MR1'")
trt3 = data.query("TRTA=='60 mg MR1, fed'")
# Add traces
fig.add_trace(
    go.Scatter(x=trt1._X,
               y=trt1.AVAL,
               mode='lines+markers',
               name='lines+markers'))
fig.add_trace(
    go.Scatter(x=trt2._X,
               y=trt2.AVAL,
               mode='lines+markers',
               name='lines+markers'))
fig.add_trace(
    go.Scatter(x=trt3._X,
               y=trt3.AVAL,
               mode='lines+markers',
Example #39
# In[14]:

pd.read_excel("data/sample.xls", sheet_name=0)

# ##### Read Specific Columns

# In[17]:

pd.read_excel("data/sample.xls", sheet_name=0, usecols="A:C")

# #### Statistical Softwares

# ##### SAS

# In[18]:

pd.read_sas("data/airline.sas7bdat")

# ##### STATA

# In[19]:

pd.read_stata("data/airline.dta")

# #### Summary
#
# - use read_csv for flat/delimited files
# - use read_excel for excel files
# - use read_sas for SAS files
# - use read_stata for STATA files
Example #40
```

The counter increments by 1 by default, but you can change the step with the `by` option. Also, the index variable `i` can be named whatever you want (e.g. `opportunity` would also work). This is similar to a `forval` loop in Stata.

```sas
data out.A;
    do i = 1 to 5;
        y = i*2;
        output;
    end;
run;
```

import pandas as pd

example = pd.read_sas("../data/a.sas7bdat", encoding="latin-1");
example.head(n=50)

```sas
data out.A;
    do i = 1 to 100 by 20;
        y = i*2;
        output;
    end;
run;
```

example = pd.read_sas("../data/a_by.sas7bdat", encoding="latin-1");
example.head(n=50)
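
For comparison, the same values can be produced directly in pandas. A minimal sketch (not part of the original notes; the column names simply mirror the SAS step):

import pandas as pd

# equivalent of: do i = 1 to 100 by 20; y = i*2; output; end;
df = pd.DataFrame({"i": range(1, 101, 20)})
df["y"] = df["i"] * 2
df.head(n=50)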

Or in open code
Example #41
@author: Atte
"""
import pandas as pd
import numpy as np
from extract_birth import extract_birth
from extract_mal import extract_mal
from combine_mal_birth import combine_mal_birth
from squeeze_diag import squeeze_diag
from combine_diag_ped import combine_diag_ped
from load_and_process_ped import load_and_process_ped

#load birth and malformation data and extract them 
bdate = pd.read_csv("/homes/afohr/data/bdate_sex.csv", parse_dates=[1]) # [id, bdate, sex]

mal = pd.read_sas('/homes/aliu/DSGE_LRS/input/anomalies_children_1987_2015_dg.sas7bdat', encoding='latin-1') 
li = ['TNRO','MANNER_OF_BIRTH', 'ICD9', 'ICD10']
mal = extract_mal(mal ,li) #[id, icd9, icd10]
mal2 = pd.read_sas('/homes/aliu/DSGE_LRS/input/anomalies_children_2016_dg.sas7bdat', encoding='latin-1')
mal2 = extract_mal(mal2, li)

#combine mals
mal = pd.concat([mal, mal2])

#combine birth and mal
mal = combine_mal_birth(mal, bdate) #[id, date,icd9, icd10]

#squeeze diags
mal = squeeze_diag(mal) #[id, date, diag]

#load ped 
Example #42
def main():

    readinData = pd.read_sas("Data/cust_info_model.sas7bdat")
    #print len(readinData)
    #print readinData.columns
    #dict = {}
    print(readinData)
    readinData.pop('Cust_Id')
    '''
    readinData = readinData[(readinData['Gender_Cd'] == 'M') | (readinData['Gender_Cd'] == 'F')]
    readinData['guy'] = readinData['Age'].copy(deep=True)
    readinData['girl'] = readinData['Age'].copy(deep=True)
    readinData['guy'][readinData['Gender_Cd'] == 'M'] = 1
    readinData['guy'][readinData['Gender_Cd'] == 'F'] = 0
    readinData['girl'][readinData['Gender_Cd'] == 'F'] = 1
    readinData['girl'][readinData['Gender_Cd'] == 'M'] = 0

    
    readinData['Age_under_30'] = readinData['Age'].copy(deep=True)
    readinData['Age_under_30'][readinData['Age'] < 30] = 1
    readinData['Age_under_30'][readinData['Age'] >= 30] = 0

    readinData['Age_30_65'] = readinData['Age'].copy(deep=True)
    readinData['Age_30_65'][readinData['Age'] < 30] = 0
    readinData['Age_30_65'][(readinData['Age'] >= 30) & (readinData['Age'] <= 65)] = 1
    readinData['Age_30_65'][readinData['Age'] > 65] = 0

    readinData['Age_65_above'] = readinData['Age'].copy(deep=True)
    readinData['Age_65_above'][readinData['Age'] <= 65] = 0
    readinData['Age_65_above'][readinData['Age'] > 65] = 1

    readinData['Year_use'] = readinData['Fst_ACCT_OPEN_DT'].copy(deep=True)
    readinData['Year_use'] = pd.to_numeric(readinData['Year_use'].astype(str).str[:4])

    readinData['Year_use_under_1'] = readinData['Year_use'].copy(deep=True)
    readinData['Year_use_under_1'][readinData['Year_use_under_1'] < 2017] = 0
    readinData['Year_use_under_1'][readinData['Year_use_under_1'] >= 2017] = 1

    readinData['Year_use_1_2'] = readinData['Year_use'].copy(deep=True)
    readinData['Year_use_1_2'][readinData['Year_use_1_2'] < 2016] = 0
    readinData['Year_use_1_2'][(readinData['Year_use_1_2'] >= 2016) & (readinData['Year_use_1_2'] < 2017)] = 1
    readinData['Year_use_1_2'][readinData['Year_use_1_2'] >= 2017] = 0

    readinData['Year_use_3_5'] = readinData['Year_use'].copy(deep=True)
    readinData['Year_use_3_5'][readinData['Year_use_3_5'] < 2013] = 0
    readinData['Year_use_3_5'][(readinData['Year_use_3_5'] >= 2013) & (readinData['Year_use_3_5'] < 2016)] = 1
    readinData['Year_use_3_5'][readinData['Year_use_3_5'] >= 2016] = 0

    readinData['Year_use_5_above'] = readinData['Year_use'].copy(deep=True)
    readinData['Year_use_5_above'][readinData['Year_use_5_above'] < 2013] = 1
    readinData['Year_use_5_above'][readinData['Year_use_5_above'] >= 2013] = 0

    readinData["AUM_0_5"] = readinData['clum28'].copy(deep=True)
    readinData['AUM_0_5'][readinData['AUM_0_5'] <= 0] = 0
    readinData['AUM_0_5'][(readinData['AUM_0_5'] <= 5) & (readinData['AUM_0_5'] > 0)] = 1
    readinData['AUM_0_5'][readinData['AUM_0_5'] > 5] = 0

    readinData["AUM_5_20"] = readinData['clum28'].copy(deep=True)
    readinData['AUM_5_20'][readinData['AUM_5_20'] <= 5] = 0
    readinData['AUM_5_20'][(readinData['AUM_5_20'] <= 20) & (readinData['AUM_5_20'] > 5)] = 1
    readinData['AUM_5_20'][readinData['AUM_5_20'] > 20] = 0

    readinData["AUM_20_100"] = readinData['clum28'].copy(deep=True)
    readinData['AUM_20_100'][readinData['AUM_20_100'] <= 20] = 0
    readinData['AUM_20_100'][(readinData['AUM_20_100'] <= 100) & (readinData['AUM_20_100'] > 20)] = 1
    readinData['AUM_20_100'][readinData['AUM_20_100'] > 100] = 0

    readinData["AUM_above_100"] = readinData['clum28'].copy(deep=True)
    readinData['AUM_above_100'][readinData['AUM_above_100'] <= 100] = 0
    readinData['AUM_above_100'][readinData['AUM_above_100'] > 100] = 1
    '''

    dlist = [
        "Is_NW_Cust", "Is_PB_Cust", "Is_WE_Cust", "Is_DFDK_Cust", "clu19",
        "clu20", "clu21", "clu212", "clu213", "clu214", "indicator_new"
    ]
    ldlen = len(dlist)
    for i in range(ldlen):
        dname = dlist[i]
        readinData[dname] = pd.to_numeric(readinData[dname])

    list = [
        "clu72", "clu73", "clu74", "clu75", "CB_CT_TX_NUM", "CB_PB_TX_NUM",
        "CB_PP_TX_NUM", "CB_NW_TX_NUM", "CB_WE_TX_NUM", "CB_ATM_TX_NUM",
        "CB_EP_TX_NUM", "CB_POS_TX_NUM"
    ]
    llen = len(list)
    for i in range(llen):
        name = list[i]
        #new_name = "New_" + name
        #print new_name
        median = readinData[name].quantile(0.5)
        readinData[name] = readinData[name].fillna(median)
        #print median
        #readinData[name] = readinData[name].copy(deep=True)
        readinData[name][readinData[name] <= median] = 0
        readinData[name][readinData[name] > median] = 1

    alist = [
        "clu37", "clu38", "clu39", "clu40", "clu41", "clu42", "clu43", "clu44",
        "clu45", "clu46", "clu47", "clu471", "clu48"
    ]
    alen = len(alist)
    for i in range(alen):
        aname = alist[i]
        # new_name = "New_" + name
        # print new_name
        readinData[aname] = readinData[aname].fillna(0)
        # print median
        # readinData[name] = readinData[name].copy(deep=True)
        readinData[aname][readinData[aname] <= 0] = 0
        readinData[aname][readinData[aname] > 0] = 1

    readinData.pop('Age')
    readinData.pop('Fst_ACCT_OPEN_DT')
    readinData.pop('clum28')
    #readinData.pop('Year_use')
    readinData.pop('Gender_Cd')
    readinData.pop('Is_PP_Cust')
    readinData.pop('clu73')
    readinData.pop('clu20')

    readinData = readinData.rename(
        columns={
            'clu19': '是否持有信用卡',
            'clu20': '是否持有借记卡',
            'clu21': '是否持有存折',
            'clu212': '是否持有存单',
            'clu213': '是否持有定期一本通',
            'clu214': '是否持有活期一本通',
            'AUM_0_5': 'AUM资产在0至5万之间客户数',
            'clu37': '持有定期产品数量',
            'clu38': '持有大额存单数量',
            'clu39': '理财产品数量',
            'clu40': '基金产品数量',
            'clu41': '贵金属产品数量',
            'clu42': '信托产品数量',
            'clu43': '代销储蓄国债产品数量',
            'clu44': '代理保险产品数量',
            'clu45': '银证第三方存管产品数量',
            'clu46': '个人消费贷款产品数量',
            'clu47': '个人经营贷款产品数量',
            'clu471': '个人委托贷款产品数量',
            'clu48': '信用卡数量',
            'clu72': '定期存款业务活跃度',
            'clu74': '贷款业务活跃度',
            'clu75': '理财业务活跃度',
            'Is_NW_Cust': '是否开通网上银行业务',
            'Is_PB_Cust': '是否开通手机银行业务',
            'Is_WE_Cust': '是否开通微信银行业务',
            'Is_DFDK_Cust': '是否代发客户',
            'CB_CT_TX_NUM': '核心客户柜面使用频率',
            'CB_PB_TX_NUM': '核心客户手机银行使用频率',
            'CB_PP_TX_NUM': '核心客户手机贴膜卡使用频率',
            'CB_NW_TX_NUM': '核心客户网上银行使用频率',
            'CB_WE_TX_NUM': '核心客户微信银行使用频率(非动帐)',
            'CB_ATM_TX_NUM': '核心客户ATM使用频率',
            'CB_EP_TX_NUM': '核心客户第三方支付平台使用频率',
            'CB_POS_TX_NUM': '核心客户POS/TPOS使用频率',
            'indicator_new': '是否过路资金账户'
        })
    #print readinData

    #print readinData.columns

    frequent_itemsets = apriori(readinData, min_support=0.1, use_colnames=True)
    rules = association_rules(frequent_itemsets,
                              metric="lift",
                              min_threshold=1)

    rules = rules.sort_values(['confidence'], ascending=False)
    print(rules)
    #print rules.head()

    #writer = pd.ExcelWriter("results/tianjin.xlsx")
    #fos.write(codecs.BOM_UTF8)
    #rules.write()
    rules.to_csv('results/tianjin.csv', encoding='utf_8_sig', index=False)
Example #43
        write_hosp_file(df, "FILEPATH")
    return df_list


final_list = []

years = np.arange(2005, 2017, 1)
for year in years:
    print("Starting on year {}".format(year))
    start = time.time()

    if year == 2007:
        print("2007 Stata is corrupted as of 2018-01-12")
        fpath = "FILENAME"\
                "gbd 2005-2016 sas7bdat files/gbd_2007.sas7bdat"
        df = pd.read_sas(fpath)
    else:

        fpath = r"FILENAME"\
                "FILENAME"\
                "FILEPATH".format(year)
        df = pd.read_stata(fpath)

        read_time = (time.time() - start) / 60
        print("It took {} min to read in year {}".format(read_time, year))

    try:

        ideal_keep = [
            'sesso', 'eta', 'reg_ric', 'regric', 'gg_deg', 'mod_dim', 'mot_dh',
            'tiporic', 'dpr', 'dsec1', 'dsec2', 'dsec3', 'dsec4', 'dsec5',
Example #44
import pandas as pd
import numpy as np
import simplejson as json
from statistics import mode
from decimal import Decimal

# adsldf = pd.read_excel(r"C:\Users\sasg\PycharmProjects\test\src\assets\adsl.xlsx", 'adsl')
adsldf = pd.read_sas(
    r"P:\nn7415\nn7415-4255\ctr_20190130_er\stats\data\adam\xprt\adsl.xpt",
    encoding='UTF-8')


class CatgoSummary:
    def __init__(self, df, anavars, byvars, idvars, stats, totalover=None):
        self.df = df
        self.anavars = self._assign_prop(anavars)
        self.byvars = self._assign_prop(byvars)
        self.idvars = self._assign_prop(idvars)
        self.stats = self._assign_prop(stats)
        self.totalover = self._assign_prop(totalover)

    def _assign_prop(self, prop):
        if isinstance(prop, (list, tuple)):
            return prop
        elif prop is not None:
            return (prop, )
        else:
            return tuple()

    def _dedup(self, df, *vars):
        return df.drop_duplicates(subset=vars)
Example #45
1) pull the appropriate total income and total expense fields and divide them by sqft... 
2) make determination on whether I'll include previous year determination
3) verify accuracy of newbase triple net lease field    
"""

import pymongo
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn import model_selection

#importing from SAS... filer15 data is submitted on 6/16 and applies to FY 17/18... the tentative valuation associated is 01/17 and the final roll is 6/17; and so a CAMA pull around that date corresponds to "current determination" (response) and CAMA data from 6/16 is previous yr determination (what we're) using as an independent variable
path = 'G:/Property _CommercialModeling/SAS Data Sets/'
best_15 = pd.read_sas(path + 'ie_re_best_filer15.sas7bdat')

#importing from EC2
#defaults to port 27017
client = pymongo.MongoClient("mongodb://*****:*****@18.222.226.207:27017/rpie")
db = client.rpie
#point to the appropriate collection
filings = db['filings']
#retrieve only 2015 filings
df_15 = filings.find({'filings': {'$elemMatch': {'IE_YEAR': '2015'}}})

df_15_d = filings.aggregate([{
    '$match': {
        'filings': {
            '$elemMatch': {
                'IE_YEAR': '2015'
Example #46
# repeated measures per subject for each type.

# The data files can be obtained from these links:
#
# https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/DEMO_G.XPT
# https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/BPX_G.XPT
# https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/BMX_G.XPT

import statsmodels.api as sm
import pandas as pd
import numpy as np

# First, load and merge the data sets.  These are SAS Xport format
# files, which can be read with Pandas.

demog = pd.read_sas("../data/DEMO_G.XPT")
bpx = pd.read_sas("../data/BPX_G.XPT")
bmx = pd.read_sas("../data/BMX_G.XPT")
df = pd.merge(demog, bpx, left_on="SEQN", right_on="SEQN")
df = pd.merge(df, bmx, left_on="SEQN", right_on="SEQN")

# Next we convert the data from wide to long, pivoting the four
# BP measures from columns to rows.

syvars = ["BPXSY%d" % j for j in (1, 2, 3, 4)]
divars = ["BPXDI%d" % j for j in (1, 2, 3, 4)]
vvars = syvars + divars
idvars = ['SEQN', 'RIDAGEYR', 'RIAGENDR', 'BMXBMI']
dx = pd.melt(df,
             id_vars=idvars,
             value_vars=vvars,
Example #47
 def get_col_names_by_table_name(self, table_name):
     return pd.read_sas(self.file_path, chunksize=0).columns.tolist()
Example #48
def test_from_sas():
    pandas_df = pandas.read_sas(TEST_SAS_FILENAME)
    modin_df = pd.read_sas(TEST_SAS_FILENAME)

    assert modin_df_equals_pandas(modin_df, pandas_df)
Example #49
print(reg_model.score(train_x, train_y))
# R squared = 0.7518 - percentage of variation in the target variable
# explained by the fitted equation. Higher values are preferred.
# R squared >= 0.85 - best fit
# R squared >= 0.7 - good fit
# R squared < 0.5 - poor fit
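
# Illustrative check (not from the original script): .score() on a regression
# model reports the same R squared that sklearn.metrics.r2_score computes on
# the model's predictions.
from sklearn.metrics import r2_score
pred_train = reg_model.predict(train_x)
print(r2_score(train_y, pred_train))  # matches reg_model.score(train_x, train_y)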

# Assignment
#  extract the data
# build the regression model
# target cost

locs = 'G:/spyder/Python Part 2 ClassNotes/class 11/Assignment/'

auction = pd.read_sas(locs + "auction.sas7bdat")

auction.shape
auction.columns
X = auction.iloc[:, 1:5]
Y = auction.iloc[:, 5]

train_x, test_x, train_y, test_y = train_test_split(X,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=10)

reg_model = LinearRegression().fit(train_x, train_y)

pred_y = reg_model.predict(test_x)
print(mean_squared_error(test_y, pred_y))
Example #50
This script processes the NHANES actigraphy data into a
csv file.  Note that the resulting file will be too large
to read into memory on most computers.

Documentation:
https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/PAXRAW_C.htm

Get the data:
wget https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/PAXRAW_C.ZIP
"""

import pandas as pd
import gzip
import numpy as np

rdr = pd.read_sas("paxraw_c.xpt", format="xport", chunksize=10000000)

out = gzip.open("paxraw.csv.gz", "wt")
first = True

jj = 0
while True:

    try:
        df = next(rdr)
    except StopIteration:
        break

    df = df.astype(np.int)

    out.write(df.to_csv(header=first, index=False))
Example #51
def sas_data_to_df(sas_data_path, date_cols):
    data = pd.read_sas(sas_data_path)
    data[date_cols] = data[date_cols].applymap(lambda x: datetime.datetime(1960, 1, 1) + datetime.timedelta(x))
    return data
Example #52
# -*- coding: utf-8 -*-
"""
Created on Wed Aug  5 14:54:00 2020

@author: Atte
"""
import pandas as pd
import numpy as np
from extract_birthday_sex import extract_birthday_sex

data = pd.read_sas('/homes/aliu/DSGE_LRS/input/thl2019_804_tutkhenk.sas7bdat',
                   chunksize=2650000,
                   encoding='latin-1')

columns = ['id', 'bdate', 'sex']
df = pd.DataFrame(columns=columns)
for i, chunk in enumerate(data):
    df = pd.concat([
        df,
        extract_birthday_sex(chunk,
                             id_col='SUKULAISEN_TNRO',
                             sex_col='SUKUPUOLI',
                             bdate_col='SUKULAISEN_SYNTYMAPV')
    ])
    #remove duplicates
    df = df.loc[df.duplicated() == False]

data = pd.read_sas('/homes/aliu/DSGE_LRS/input/thl2019_804_tljslv.sas7bdat',
                   chunksize=2650000,
                   encoding='latin-1')
Example #53
# Episode
df = pd.read_csv('/Users/yuchenli/Box Sync/Yuchen_project/'\
                 'Opioid_progression/Iteration_2/Data/'\
                 'amex_opioid_epis_1212285.csv', sep = ',',
                 index_col = False, header = None, encoding = 'utf-8',
                 dtype = np.str)

df.columns = [
    'year', 'enrolid', 'employer_cd', 'epi_id', 'epi_adm_cnt', 'epi_rx_cnt',
    'svcdate', 'tsvcdat', 'epi_days_cnt', 'epi_grp_cd', 'epi_incomplete_ind',
    'epi_proc_type_cd', 'epi_disease_stg_cd', 'epi_pay', 'epi_submit',
    'epi_netpay', 'epi_op_pay', 'epi_op_submit', 'epi_op_netpay',
    'Allowed Amount Med Epis', 'Charge Submitted Med Epis',
    'Net Payment Med Epis', 'epi_rx_pay', 'epi_rx_submit', 'epi_rx_netpay',
    'epi_tot_pay', 'epi_tot_submit', 'epi_tot_netpay', 'prov_id',
    'mng_phys_type', 'mng_phys_type_cd', 'prov_id', 'mdst_phys_type',
    'stdprov', 'table'
]

df.to_csv('/Users/yuchenli/Box Sync/Yuchen_project/'\
          'Opioid_progression/Iteration_2/Data/'\
          'amex_opioid_epis_1212285_with_column_names.csv', encoding = 'utf-8',
          index = False)

# Read random sas7bdat file
df = pd.read_sas('/Users/yuchenli/Box Sync/Yuchen_project/'
                 'Opioid_progression/Iteration_2/SAS/elig.sas7bdat')

df = pd.read_sas('/Users/yuchenli/Box Sync/Yuchen_project/'
                 'Opioid_progression/Iteration_2/SAS/'
                 'opioid_vs_2017.sas7bdat')
Example #54
#%%
def filterout(name,dataframe,columnname,filterdict):
    sizes={}
    sizes['original']=len(dataframe.index)
    
    for key in filterdict:
        dataframe[columnname] = dataframe[columnname].replace(to_replace=filterdict[key], value=np.nan)#dataframe[columnname].map({filterdict[key]: np.nan})
        dataframe=dataframe.dropna()
        sizes[key]=len(dataframe.index)
    
    print(name)
    print(sizes)
    return dataframe

#%%
schinfo = pd.read_sas(r"files\Schinfo.xpt",format='xport',encoding='utf-8')
#%%
#schoolsize and such
skinfo=['SCID','SIZE','METRO','REGION','GRADES','SCHTYPE']
schinfo = schinfo[skinfo].dropna()
schinfo = filterout("schoolsize",schinfo,'SIZE',{'duplicate':'!'})
#%%
skinfo.pop(0)
for d in skinfo:
    schinfo[d]=schinfo[d].astype(int)

#SCHTYPE
#1 public
#2 Catholic
#3 private
Example #55
def read_sas(data_fp):
    return pd.read_sas(data_fp)
Example #56
    def __copy__(self):
        return type(self)(self.default_factory, self)

    def __deepcopy__(self, memo):
        import copy
        return type(self)(self.default_factory, copy.deepcopy(self.items()))

    def __repr__(self):
        return 'OrderedDefaultOrderedDict(%s, %s)' % (
            self.default_factory, OrderedDict.__repr__(self))


# get NDB and NDC data from sas
#TODO: Run programs to update siteinfo with subprocess
ndb_site_data = pd.read_sas(
    r'G:\NIDADSC\spitts\SAS_Projects\NDB\siteinfo.sas7bdat', format='sas7bdat')
ndc_site_data = pd.read_sas(
    r'G:\NIDADSC\spitts\SAS_Projects\NDC\siteinfo.sas7bdat', format='sas7bdat')

platforms = [ndb_site_data, ndc_site_data]

# Containers for data you need
protocol_site_info = DefaultOrderedDict(dict)
protocol_node_info = DefaultOrderedDict(dict)
all_site_info = DefaultOrderedDict(dict)
all_node_info = DefaultOrderedDict(dict)

site_info_tuple = namedtuple('Site_Info', ['prot', 'site', 'status', 'node'])
collection_of_sites = []
for platform in platforms:
    for index, row in platform.iterrows():
Example #57
# @Filename: test_insider
# @Date: 2017-01-06
# @Author: Mark Wang
# @Email: [email protected]

import os
import datetime

import pandas as pd
import numpy as np

from constant import Constant as const

today_str = datetime.datetime.today().strftime('%Y-%m-%d')
root_path = '/home/wangzg/Documents/WangYouan/Trading/ShanghaiShenzhen'
data_path = os.path.join(root_path, 'data')
temp_path = os.path.join(root_path, 'temp')
today_path = os.path.join(temp_path, today_str)

if not os.path.exists(today_path):
    os.makedirs(today_path)

report_info = pd.read_excel(os.path.join(data_path, 'insider2007_2016.xlsx'))
stock_data = pd.read_sas(os.path.join(data_path, 'daily_0516.sas7bdat'))
stock_data.loc[:, 'Markettype'] = stock_data['Markettype'].apply(int)
stock_data.loc[:, 'Trdsta'] = stock_data['Trdsta'].apply(int)
stock_data.loc[:, const.STOCK_DATE] = stock_data[const.STOCK_DATE].apply(
    lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))
stock_data.to_pickle(os.path.join(today_path, 'daily_0516.p'))
Example #58
 def test_sas_buffer_format(self):
     # GH14947
     b = StringIO("")
     with pytest.raises(ValueError):
         read_sas(b)
Example #59
 def time_packers_read_xport(self):
     pd.read_sas(self.f, format='xport')
Example #60
#import wids data
df_wids = [
    pd.read_csv("preprocesseddata//ratio//bl_data_processed.csv",
                low_memory=False,
                index_col=0),
    pd.read_csv("preprocesseddata//ratio//cf_data_processed.csv",
                low_memory=False,
                index_col=0),
    pd.read_csv("preprocesseddata//ratio//is_data_processed.csv",
                low_memory=False,
                index_col=0)
]

#import bankruptcy label data
df_sas = pd.read_sas(r"bankrupt.sas7bdat")
df_sas["fyear"] = df_sas["BANK_END_DATE"].dt.year
df_sas = df_sas[(df_sas.fyear > 2000) & (df_sas.fyear < 2019)]

#plot of bankrupt companies by year
dataByYear = df_sas.groupby("fyear")["COMPANY_FKEY"].nunique()
print()
plt.title(r'Companies bankrupt by Year')
plt.bar(list(dataByYear.index), list(dataByYear))
plt.savefig("Descriptives/BankruptByYear.png", transparent=True, dpi=300)
plt.show()

#create disappered label data
cikGrouped = df_wids[0].groupby("cik")
df_disap = []