def test_sas_buffer_format(self):
    # see gh-14947
    b = StringIO("")
    msg = ("If this is a buffer object rather than a string "
           "name, you must specify a format string")
    with pytest.raises(ValueError, match=msg):
        read_sas(b)
def test_iterator_read_too_much(self):
    # github #14734
    k = self.test_ix[0][0]
    fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
    rdr = pd.read_sas(fname, format="sas7bdat",
                      iterator=True, encoding='utf-8')
    d1 = rdr.read(rdr.row_count + 20)
    rdr = pd.read_sas(fname, iterator=True, encoding="utf-8")
    d2 = rdr.read(rdr.row_count + 20)
    tm.assert_frame_equal(d1, d2)
def test_12659(datapath):
    fname = datapath("io", "sas", "data", "test_12659.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "test_12659.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0)
def test_airline(datapath):
    fname = datapath("io", "sas", "data", "airline.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "airline.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0, check_exact=False)
def test_many_columns(datapath):
    # Test for looking for column information in more places (PR #22628)
    fname = datapath("io", "sas", "data", "many_columns.sas7bdat")
    df = pd.read_sas(fname, encoding='latin-1')
    fname = datapath("io", "sas", "data", "many_columns.csv")
    df0 = pd.read_csv(fname, encoding='latin-1')
    tm.assert_frame_equal(df, df0)
def test_from_file(self):
    for j in 0, 1:
        df0 = self.data[j]
        for k in self.test_ix[j]:
            fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
            df = pd.read_sas(fname, encoding='utf-8')
            tm.assert_frame_equal(df, df0)
def test_encoding_options(datapath):
    fname = datapath("io", "sas", "data", "test1.sas7bdat")
    df1 = pd.read_sas(fname)
    df2 = pd.read_sas(fname, encoding='utf-8')
    for col in df1.columns:
        try:
            df1[col] = df1[col].str.decode('utf-8')
        except AttributeError:
            pass
    tm.assert_frame_equal(df1, df2)

    from pandas.io.sas.sas7bdat import SAS7BDATReader
    rdr = SAS7BDATReader(fname, convert_header_text=False)
    df3 = rdr.read()
    rdr.close()
    for x, y in zip(df1.columns, df3.columns):
        assert x == y.decode()
def test_productsales(datapath):
    fname = datapath("io", "sas", "data", "productsales.sas7bdat")
    df = pd.read_sas(fname, encoding='utf-8')
    fname = datapath("io", "sas", "data", "productsales.csv")
    df0 = pd.read_csv(fname, parse_dates=['MONTH'])
    vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
    df0[vn] = df0[vn].astype(np.float64)
    tm.assert_frame_equal(df, df0)
def test_12659():
    dirpath = tm.get_data_path()
    fname = os.path.join(dirpath, "test_12659.sas7bdat")
    df = pd.read_sas(fname)
    fname = os.path.join(dirpath, "test_12659.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0)
def test_airline():
    dirpath = tm.get_data_path()
    fname = os.path.join(dirpath, "airline.sas7bdat")
    df = pd.read_sas(fname)
    fname = os.path.join(dirpath, "airline.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0, check_exact=False)
def test_productsales():
    dirpath = tm.get_data_path()
    fname = os.path.join(dirpath, "productsales.sas7bdat")
    df = pd.read_sas(fname, encoding='utf-8')
    fname = os.path.join(dirpath, "productsales.csv")
    df0 = pd.read_csv(fname, parse_dates=['MONTH'])
    vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
    df0[vn] = df0[vn].astype(np.float64)
    tm.assert_frame_equal(df, df0)
def test_from_buffer(self):
    for j in 0, 1:
        df0 = self.data[j]
        for k in self.test_ix[j]:
            fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
            byts = open(fname, 'rb').read()
            buf = io.BytesIO(byts)
            df = pd.read_sas(buf, format="sas7bdat", encoding='utf-8')
            tm.assert_frame_equal(df, df0, check_exact=False)
def test_path_localpath(self):
    from py.path import local as LocalPath
    for j in 0, 1:
        df0 = self.data[j]
        for k in self.test_ix[j]:
            fname = LocalPath(os.path.join(
                self.dirpath, "test{k}.sas7bdat".format(k=k)))
            df = pd.read_sas(fname, encoding='utf-8')
            tm.assert_frame_equal(df, df0)
def test_date_time():
    # Support of different SAS date/datetime formats (PR #15871)
    dirpath = tm.get_data_path()
    fname = os.path.join(dirpath, "datetime.sas7bdat")
    df = pd.read_sas(fname)
    fname = os.path.join(dirpath, "datetime.csv")
    df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime',
                                          'DateTimeHi', 'Taiw'])
    tm.assert_frame_equal(df, df0)
def test_path_pathlib(self):
    tm._skip_if_no_pathlib()
    from pathlib import Path
    for j in 0, 1:
        df0 = self.data[j]
        for k in self.test_ix[j]:
            fname = Path(os.path.join(self.dirpath, "test%d.sas7bdat" % k))
            df = pd.read_sas(fname, encoding='utf-8')
            tm.assert_frame_equal(df, df0)
def test_from_iterator(self):
    for j in 0, 1:
        df0 = self.data[j]
        for k in self.test_ix[j]:
            fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
            rdr = pd.read_sas(fname, iterator=True, encoding='utf-8')
            df = rdr.read(2)
            tm.assert_frame_equal(df, df0.iloc[0:2, :])
            df = rdr.read(3)
            tm.assert_frame_equal(df, df0.iloc[2:5, :])
def test_date_time(datapath):
    # Support of different SAS date/datetime formats (PR #15871)
    fname = datapath("io", "sas", "data", "datetime.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "datetime.csv")
    df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime',
                                          'DateTimeHi', 'Taiw'])
    # GH 19732: Timestamps imported from sas will incur floating point errors
    df.iloc[:, 3] = df.iloc[:, 3].dt.round('us')
    tm.assert_frame_equal(df, df0)
def test_iterator_loop(self):
    # github #13654
    for j in 0, 1:
        for k in self.test_ix[j]:
            for chunksize in 3, 5, 10, 11:
                fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
                rdr = pd.read_sas(fname, chunksize=10, encoding='utf-8')
                y = 0
                for x in rdr:
                    y += x.shape[0]
                self.assertTrue(y == rdr.row_count)
def df_fromsas(fullpath, id_lbl='ind_id'):
    ''' convert .sas7bdat to dataframe.
        unused because fails on incorrectly formatted files. '''
    # read the SAS file in as a dataframe
    df = pd.read_sas(fullpath, format='sas7bdat')
    # convert id to str and save as new column
    df[id_lbl] = df[id_lbl].apply(int).apply(str)
    df['ID'] = df[id_lbl]
    return df
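# A hedged usage sketch for df_fromsas above: since the docstring notes it fails on
# incorrectly formatted files, one option is to guard the call and skip unreadable inputs.
# The wrapper name below is introduced here for illustration only; it is not in the source.
def df_fromsas_safe(fullpath, id_lbl='ind_id'):
    try:
        return df_fromsas(fullpath, id_lbl=id_lbl)
    except Exception as exc:  # read_sas raises on malformed or unsupported files
        print('skipping {}: {}'.format(fullpath, exc))
        return None

# example call with a placeholder path:
# df = df_fromsas_safe('/path/to/file.sas7bdat')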
def test_from_buffer(self):
    for j in 0, 1:
        df0 = self.data[j]
        for k in self.test_ix[j]:
            fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
            with open(fname, 'rb') as f:
                byts = f.read()
            buf = io.BytesIO(byts)
            rdr = pd.read_sas(buf, format="sas7bdat",
                              iterator=True, encoding='utf-8')
            df = rdr.read()
            tm.assert_frame_equal(df, df0, check_exact=False)
            rdr.close()
def test_from_iterator(self):
    for j in 0, 1:
        df0 = self.data[j]
        for k in self.test_ix[j]:
            fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
            byts = open(fname, 'rb').read()
            buf = io.BytesIO(byts)
            rdr = pd.read_sas(buf, format="sas7bdat",
                              iterator=True, encoding='utf-8')
            df = rdr.read(2)
            tm.assert_frame_equal(df, df0.iloc[0:2, :])
            df = rdr.read(3)
            tm.assert_frame_equal(df, df0.iloc[2:5, :])
def test_iterator_loop(self):
    # github #13654
    for j in 0, 1:
        for k in self.test_ix[j]:
            for chunksize in 3, 5, 10, 11:
                fname = os.path.join(
                    self.dirpath, "test{k}.sas7bdat".format(k=k))
                rdr = pd.read_sas(fname, chunksize=10, encoding='utf-8')
                y = 0
                for x in rdr:
                    y += x.shape[0]
                assert y == rdr.row_count
                rdr.close()
def test_compact_numerical_values(datapath):
    # Regression test for #21616
    fname = datapath("io", "sas", "data", "cars.sas7bdat")
    df = pd.read_sas(fname, encoding='latin-1')
    # The two columns CYL and WGT in cars.sas7bdat have column
    # width < 8 and only contain integral values.
    # Test that pandas doesn't corrupt the numbers by adding
    # decimals.
    result = df['WGT']
    expected = df['WGT'].round()
    tm.assert_series_equal(result, expected, check_exact=True)
    result = df['CYL']
    expected = df['CYL'].round()
    tm.assert_series_equal(result, expected, check_exact=True)
def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
             chunksize=None, iterator=False):
    warnings.warn("Defaulting to Pandas implementation",
                  PendingDeprecationWarning)
    port_frame = pd.read_sas(filepath_or_buffer, format, index, encoding,
                             chunksize, iterator)
    ray_frame = from_pandas(port_frame, get_npartitions())
    return ray_frame
def reader(self, input, delimiter=None):
    # read in the following formats: xls, xlsx, csv, sas7bdat
    if input.endswith(('.xls', '.xlsx', '.XLS', '.XLSX')):
        tmpDict = pd.read_excel(input, sheet_name=None)
        dropList = []
        for sheet in tmpDict:
            if tmpDict[sheet].empty:
                dropList.append(sheet)
        for key in dropList:
            del tmpDict[key]
        return tmpDict.values()
    elif input.endswith('.csv'):
        tmp = pd.read_csv(input)
        return [tmp]
    elif input.endswith('.sas7bdat'):
        tmp = pd.read_sas(input)
        return [tmp]
    elif input.endswith(".jmp"):
        return False
import numpy as np
import pandas as pd
import skimage
from KPCA_self import kPCA
from scipy.spatial.distance import pdist, squareform, cdist
from scipy.linalg import eigh
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.decomposition import KernelPCA

results = []

from sas7bdat import SAS7BDAT
with SAS7BDAT('pricedata_17.sas7bdat') as f:
    for row in f:
        pass

saledata = pd.read_sas('pricedata_17.sas7bdat')
saledata = saledata['sale']
#f = lambda x: x.sort_values(ascending=True).reset_index(drop=True)
#sale_t = saledata.groupby(['product']).sale.apply(f).unstack()
#sm = pd.DataFrame.as_matrix(sale_t)
#
#
#grouped = saledata.groupby('product').apply(np.matrix.transpose)
saledata_prod1 = saledata.iloc[:60]
saledata_prod1.plot(x='index', y='sale')
sm = pd.DataFrame.as_matrix(saledata)
sm = sm[0:60]
sm = np.reshape(sm, (60, 1))
def test_zero_variables():
    # Check if the SAS file has zero variables (PR #18184)
    dirpath = tm.get_data_path()
    fname = os.path.join(dirpath, "zero_variables.sas7bdat")
    with pytest.raises(EmptyDataError):
        pd.read_sas(fname)
def read_xml(location, result):
    validate_dtd = False
    if result[RDF.type:setl.DTDValidatedXML]:
        validate_dtd = True
    f = iterparse_filter.IterParseFilter(validate_dtd=validate_dtd)
    if result.value(setl.xpath) is None:
        f.iter_end("/*")
    for xp in result[setl.xpath]:
        f.iter_end(xp.value)
    for (i, (event, ele)) in enumerate(f.iterparse(get_content(location))):
        yield i, ele


extractors = {
    setl.XPORT: lambda location, result: pandas.read_sas(
        get_content(location), format='xport'),
    setl.SAS7BDAT: lambda location, result: pandas.read_sas(
        get_content(location), format='sas7bdat'),
    setl.Excel: read_excel,
    csvw.Table: read_csv,
    OWL.Ontology: read_graph,
    void.Dataset: read_graph,
    setl.JSON: lambda location, result: enumerate(
        ijson.items(get_content(location),
                    result.value(api_vocab.selector, default=""))),
# In[ ]:

df = pd.DataFrame()
df.to_stata(r'C:\file.dta')

# *Note: make sure you have the latest version of Pandas for new Stata versions*

# ### SAS
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sas.html
# This function works in most cases, but files with text can throw nasty encoding errors
# (a hedged workaround sketch follows after the JSON example below).

# In[ ]:

sas_file = pd.read_sas(r'C:\file.sas7bdat', format='sas7bdat')

# ## JSON files

# ### Convert JSON file to Pandas dataframe
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html
#
# *Note:* The path can also be a url

# In[ ]:

json_df = pd.read_json(r'C:\file.json')
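# If a SAS file with text columns throws those encoding errors (or returns byte strings),
# passing an explicit `encoding` to `read_sas` is the usual workaround. A minimal sketch:
# the path is the same placeholder used above and `'latin-1'` is only an assumed example encoding.

# In[ ]:

sas_file_decoded = pd.read_sas(r'C:\file.sas7bdat', format='sas7bdat', encoding='latin-1')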
plt.plot([1, 2, 3], [1, 3, 2], label="col1")
plt.plot([1, 2, 3], [2, 1, 3], label="col2")
plt.legend(loc='best')
plt.grid(True)
plt.title('Title')
txt = 'this is an example'
plt.text(0.05, 0.95, txt, transform=fig.transFigure, size=24)
pdf.savefig()
plt.close()


# Read file, apply desired sampling, and create permanent file
def data_read_sampling(_file, _sample_size):
    email_end(_subject=' data_read_sampling ' + str(_sample_size) + ' Start ')
    # Load full data
    df = pd.read_sas(filepath_or_buffer=_file)
    # Sample
    if len(_sample_size) > 0:
        df_s = df.sample(_sample_size)
        email_end(_subject=' data_read_sampling ' + str(_sample_size) + ' Done ')
        return df_s
    else:
        email_end(_subject=' data_read_sampling ' + str(_sample_size) + ' Done ')
        return df


# Convert array to list
array.tolist()

# Convert data frame to numpy array
data_array = data.values
def extract_data(self, table_name, col_names):
    data_frame = pd.read_sas(self.file_path)
    return pd.DataFrame(data_frame, columns=col_names)
def read_raw_data(write_data=False):
    """
    ITA data is stored by year in stata files that take awhile to read in.
    These were read in once, the desired columns were kept and data was
    written to the location above in /raw
    """
    years = np.arange(2005, 2017, 1)
    df_list = []
    sdict = {}
    for year in years:
        print("Starting on year {}".format(year))
        start = time.time()
        if year == 2007:
            print("2007 Stata is corrupted as of 2018-01-12")
            fpath = "FILENAME"\
                "gbd 2005-2016 sas7bdat files/gbd_2007.sas7bdat"
            df = pd.read_sas(fpath)
        else:
            fpath = r"FILENAME"\
                "FILENAME"\
                "FILEPATH".format(year)
            df = pd.read_stata(fpath)
        read_time = (time.time() - start) / 60
        print("It took {} min to read in year {}".format(read_time, year))
        sdict[year] = df.columns.tolist()
        try:
            ideal_keep = [
                'sesso', 'eta', 'reg_ric', 'regric', 'gg_deg', 'mod_dim',
                'mot_dh', 'tiporic', 'dpr', 'dsec1', 'dsec2', 'dsec3',
                'dsec4', 'dsec5', 'causa_ext', 'tip_ist2', 'data_ric',
                'data_dim', 'data_ricA', 'data_dimA'
            ]
            to_keep = [n for n in ideal_keep if n in df.columns]
            print(("Missing {} for this year".format(
                [n for n in ideal_keep if n not in df.columns])))
            df = df[to_keep]
            df['year_start'] = year
            df_list.append(df)
        except:
            print("well that didn't work for {}".format(year))
        del df

    if write_data:
        df = pd.concat(df_list, ignore_index=True)
        df.loc[(df.data_ric.isnull()) & (df.data_ricA.notnull()), 'data_ric'] =\
            df.loc[(df.data_ric.isnull()) & (df.data_ricA.notnull()), 'data_ricA']
        df.loc[(df.data_dim.isnull()) & (df.data_dimA.notnull()), 'data_dim'] =\
            df.loc[(df.data_dim.isnull()) & (df.data_dimA.notnull()), 'data_dimA']
        df.drop(['data_ricA', 'data_dimA'], axis=1, inplace=True)
        df['gg_deg'] = pd.to_numeric(df.gg_deg, errors='raise')
        write_hosp_file(df, "FILEPATH")
    return df_list
import pandas as pd

codes = pd.read_csv("./data/London_District_codes.csv")
socio = pd.read_spss("./data/London_ward_data_socioeconomic.sav")
health = pd.read_sas("./data/london_ward_data_health.sas7bdat",
                     format='sas7bdat', encoding='latin1')
health = health.drop('Population2011Census', axis=1)
env = pd.read_csv("./data/London_ward_data_environment.csv")
demo = pd.read_csv("./data/London_ward_data_demographics.dat", delimiter='\t')

socio['Districtcode'] = socio['Wardcode'].str[:-2]
socio_env = pd.merge(socio, env, on='Wardcode')
codes['Districtcode'] = codes['Districtcode']\
    .replace(r'\s', '', regex=True)
health[['District', 'Ward', 'remove', 'remove']] = \
    health['Wardname'].str.split('-', expand=True)
health['District'] = health['District'].str[:-1]
health = health.drop(['Wardname', 'remove'], axis=1)
total_df = pd.merge(socio_env, codes, on='Districtcode')
total_df = pd.merge(total_df, health, on='District')
demo[
    ['District', 'Ward', 'remove', 'remove']
def time_read_msgpack(self, format):
    read_sas(self.f, format=format)
def time_packers_read_sas7bdat(self):
    pd.read_sas(self.f, format='sas7bdat')
import os, pandas as pd, numpy as np, gc
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from hyperopt import fmin, hp, tpe, STATUS_OK, Trials
import lightgbm as lgbm
import warnings

warnings.filterwarnings("ignore")

## import data from sas file
analysis = pd.read_sas(
    'Dataset/combined_vars_04092019.sas7bdat', encoding='latin1').drop([
        'TRR_ID', 'TRANSPLANT_DT', 'TRANSPLANT_DISCHARGE_DT', 'READMISSION_DT'
    ], axis=1)
biomarker = pd.read_csv('Dataset/feature_extracted.csv').drop('CODE_REHOSP', 1)
X = analysis.merge(biomarker, on='PERSON_ID').drop(['CODE_REHOSP', 'PERSON_ID'], 1)
y = analysis['CODE_REHOSP'].replace(2, 0)

cat_col = [
    X.columns.get_loc(col) for col in X.columns if 2 < X[col].nunique() <= 10
]
for col in X.columns:
    if X[col].dtype == 'O':
        X[col] = LabelEncoder().fit_transform(X[col].fillna('Unknown'))
    elif 2 < X[col].nunique() <= 10:
        X[col] = LabelEncoder().fit_transform(X[col].fillna(99))
import pandas as pd
import plotly.graph_objects as go
from plotly.offline import plot

# Read the SAS dataset and keep one record per (_X, TRTA) pair
data = pd.read_sas(
    r"C:\Users\luoz14\OneDrive - Pfizer\Test Data\adpc_f001.sas7bdat",
    encoding='latin1')
data = data.drop_duplicates(['_X', 'TRTA'])

fig = go.Figure()
trt1 = data.query("TRTA=='60 mg IR'")
trt2 = data.query("TRTA=='60 mg MR1'")
trt3 = data.query("TRTA=='60 mg MR1, fed'")

# Add traces
fig.add_trace(
    go.Scatter(x=trt1._X, y=trt1.AVAL, mode='lines+markers',
               name='lines+markers'))
fig.add_trace(
    go.Scatter(x=trt2._X, y=trt2.AVAL, mode='lines+markers',
               name='lines+markers'))
fig.add_trace(
    go.Scatter(x=trt3._X, y=trt3.AVAL, mode='lines+markers',
# In[14]:

pd.read_excel("data/sample.xls", sheet_name=0)

# ##### Read Specific Columns

# In[17]:

pd.read_excel("data/sample.xls", sheet_name=0, usecols="A:C")

# #### Statistical Softwares

# ##### SAS

# In[18]:

pd.read_sas("data/airline.sas7bdat")

# ##### STATA

# In[19]:

pd.read_stata("data/airline.dta")

# #### Summary
#
# - use read_csv for flat/delimited files
# - use read_excel for excel files
# - use read_sas for SAS files
# - use read_stata for STATA files
```

The default value of the counter is 1 but you can modify this with the `by` option. Also, the index value `i` can be called whatever you want (i.e. `opportunity` would also work). This is similar to a `forval` loop in Stata.

```sas
data out.A;
  do i = 1 to 5;
    y = i*2;
    output;
  end;
run;
```

import pandas as pd
example = pd.read_sas("../data/a.sas7bdat", encoding="latin-1")
example.head(n=50)

```sas
data out.A;
  do i = 1 to 100 by 20;
    y = i*2;
    output;
  end;
run;
```

example = pd.read_sas("../data/a_by.sas7bdat", encoding="latin-1")
example.head(n=50)

(A Python `range` sketch of the same stepped loop follows below.)

Or in open code
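For comparison, the stepped SAS loop maps directly onto Python: `range` takes the step as its third argument, so `range(1, 101, 20)` reproduces `do i = 1 to 100 by 20`. This is an illustrative reconstruction, not one of the datasets read above.

import pandas as pd
# equivalent of: do i = 1 to 100 by 20; y = i*2; output; end;
example_py = pd.DataFrame({"i": list(range(1, 101, 20))})
example_py["y"] = example_py["i"] * 2
example_py.head(n=50)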
@author: Atte
"""
import pandas as pd
import numpy as np
from extract_birth import extract_birth
from extract_mal import extract_mal
from combine_mal_birth import combine_mal_birth
from squeeze_diag import squeeze_diag
from combine_diag_ped import combine_diag_ped
from load_and_process_ped import load_and_process_ped

# load birth and malformation data and extract them
bdate = pd.read_csv("/homes/afohr/data/bdate_sex.csv",
                    parse_dates=[1])  # [id, bdate, sex]
mal = pd.read_sas('/homes/aliu/DSGE_LRS/input/anomalies_children_1987_2015_dg.sas7bdat',
                  encoding='latin-1')
li = ['TNRO', 'MANNER_OF_BIRTH', 'ICD9', 'ICD10']
mal = extract_mal(mal, li)  # [id, icd9, icd10]
mal2 = pd.read_sas('/homes/aliu/DSGE_LRS/input/anomalies_children_2016_dg.sas7bdat',
                   encoding='latin-1')
mal2 = extract_mal(mal2, li)

# combine mals
mal = pd.concat([mal, mal2])

# combine birth and mal
mal = combine_mal_birth(mal, bdate)  # [id, date, icd9, icd10]

# squeeze diags
mal = squeeze_diag(mal)  # [id, date, diag]

# load ped
def main():
    readinData = pd.read_sas("Data/cust_info_model.sas7bdat")
    #print(len(readinData))
    #print(readinData.columns)
    #dict = {}
    print(readinData)
    readinData.pop('Cust_Id')
    '''
    readinData = readinData[(readinData['Gender_Cd'] == 'M') | (readinData['Gender_Cd'] == 'F')]

    readinData['guy'] = readinData['Age'].copy(deep=True)
    readinData['girl'] = readinData['Age'].copy(deep=True)
    readinData['guy'][readinData['Gender_Cd'] == 'M'] = 1
    readinData['guy'][readinData['Gender_Cd'] == 'F'] = 0
    readinData['girl'][readinData['Gender_Cd'] == 'F'] = 1
    readinData['girl'][readinData['Gender_Cd'] == 'M'] = 0

    readinData['Age_under_30'] = readinData['Age'].copy(deep=True)
    readinData['Age_under_30'][readinData['Age'] < 30] = 1
    readinData['Age_under_30'][readinData['Age'] >= 30] = 0
    readinData['Age_30_65'] = readinData['Age'].copy(deep=True)
    readinData['Age_30_65'][readinData['Age'] < 30] = 0
    readinData['Age_30_65'][(readinData['Age'] >= 30) & (readinData['Age'] <= 65)] = 1
    readinData['Age_30_65'][readinData['Age'] > 65] = 0
    readinData['Age_65_above'] = readinData['Age'].copy(deep=True)
    readinData['Age_65_above'][readinData['Age'] <= 65] = 0
    readinData['Age_65_above'][readinData['Age'] > 65] = 1

    readinData['Year_use'] = readinData['Fst_ACCT_OPEN_DT'].copy(deep=True)
    readinData['Year_use'] = pd.to_numeric(readinData['Year_use'].astype(str).str[:4])
    readinData['Year_use_under_1'] = readinData['Year_use'].copy(deep=True)
    readinData['Year_use_under_1'][readinData['Year_use_under_1'] < 2017] = 0
    readinData['Year_use_under_1'][readinData['Year_use_under_1'] >= 2017] = 1
    readinData['Year_use_1_2'] = readinData['Year_use'].copy(deep=True)
    readinData['Year_use_1_2'][readinData['Year_use_1_2'] < 2016] = 0
    readinData['Year_use_1_2'][(readinData['Year_use_1_2'] >= 2016) & (readinData['Year_use_1_2'] < 2017)] = 1
    readinData['Year_use_1_2'][readinData['Year_use_1_2'] >= 2017] = 0
    readinData['Year_use_3_5'] = readinData['Year_use'].copy(deep=True)
    readinData['Year_use_3_5'][readinData['Year_use_3_5'] < 2013] = 0
    readinData['Year_use_3_5'][(readinData['Year_use_3_5'] >= 2013) & (readinData['Year_use_3_5'] < 2016)] = 1
    readinData['Year_use_3_5'][readinData['Year_use_3_5'] >= 2016] = 0
    readinData['Year_use_5_above'] = readinData['Year_use'].copy(deep=True)
    readinData['Year_use_5_above'][readinData['Year_use_5_above'] < 2013] = 1
    readinData['Year_use_5_above'][readinData['Year_use_5_above'] >= 2013] = 0

    readinData["AUM_0_5"] = readinData['clum28'].copy(deep=True)
    readinData['AUM_0_5'][readinData['AUM_0_5'] <= 0] = 0
    readinData['AUM_0_5'][(readinData['AUM_0_5'] <= 5) & (readinData['AUM_0_5'] > 0)] = 1
    readinData['AUM_0_5'][readinData['AUM_0_5'] > 5] = 0
    readinData["AUM_5_20"] = readinData['clum28'].copy(deep=True)
    readinData['AUM_5_20'][readinData['AUM_5_20'] <= 5] = 0
    readinData['AUM_5_20'][(readinData['AUM_5_20'] <= 20) & (readinData['AUM_5_20'] > 5)] = 1
    readinData['AUM_5_20'][readinData['AUM_5_20'] > 20] = 0
    readinData["AUM_20_100"] = readinData['clum28'].copy(deep=True)
    readinData['AUM_20_100'][readinData['AUM_20_100'] <= 20] = 0
    readinData['AUM_20_100'][(readinData['AUM_20_100'] <= 100) & (readinData['AUM_20_100'] > 20)] = 1
    readinData['AUM_20_100'][readinData['AUM_20_100'] > 100] = 0
    readinData["AUM_above_100"] = readinData['clum28'].copy(deep=True)
    readinData['AUM_above_100'][readinData['AUM_above_100'] <= 100] = 0
    readinData['AUM_above_100'][readinData['AUM_above_100'] > 100] = 1
    '''
    dlist = [
        "Is_NW_Cust", "Is_PB_Cust", "Is_WE_Cust", "Is_DFDK_Cust", "clu19",
        "clu20", "clu21", "clu212", "clu213", "clu214", "indicator_new"
    ]
    ldlen = len(dlist)
    for i in range(ldlen):
        dname = dlist[i]
        readinData[dname] = pd.to_numeric(readinData[dname])

    list = [
        "clu72", "clu73", "clu74", "clu75", "CB_CT_TX_NUM", "CB_PB_TX_NUM",
        "CB_PP_TX_NUM", "CB_NW_TX_NUM", "CB_WE_TX_NUM", "CB_ATM_TX_NUM",
        "CB_EP_TX_NUM", "CB_POS_TX_NUM"
    ]
    llen = len(list)
    for i in range(llen):
        name = list[i]
        #new_name = "New_" + name
        #print(new_name)
        median = readinData[name].quantile(0.5)
        # fill missing values with the column median before binarizing on it
        readinData[name] = readinData[name].fillna(median)
        #print(median)
        #readinData[name] = readinData[name].copy(deep=True)
        readinData[name][readinData[name] <= median] = 0
        readinData[name][readinData[name] > median] = 1

    alist = [
        "clu37", "clu38", "clu39", "clu40", "clu41", "clu42", "clu43",
        "clu44", "clu45", "clu46", "clu47", "clu471", "clu48"
    ]
    alen = len(alist)
    for i in range(alen):
        aname = alist[i]
        # new_name = "New_" + name
        # print(new_name)
        readinData[aname] = readinData[aname].fillna(0)
        # print(median)
        # readinData[name] = readinData[name].copy(deep=True)
        readinData[aname][readinData[aname] <= 0] = 0
        readinData[aname][readinData[aname] > 0] = 1

    readinData.pop('Age')
    readinData.pop('Fst_ACCT_OPEN_DT')
    readinData.pop('clum28')
    #readinData.pop('Year_use')
    readinData.pop('Gender_Cd')
    readinData.pop('Is_PP_Cust')
    readinData.pop('clu73')
    readinData.pop('clu20')
    readinData = readinData.rename(
        columns={
            'clu19': '是否持有信用卡',
            'clu21': '是否持有存折',
            'clu212': '是否持有存单',
            'clu213': '是否持有定期一本通',
            'clu214': '是否持有活期一本通',
            'AUM_0_5': 'AUM资产在0至5万之间客户数',
            'clu37': '持有定期产品数量',
            'clu38': '持有大额存单数量',
            'clu39': '理财产品数量',
            'clu40': '基金产品数量',
            'clu41': '贵金属产品数量',
            'clu42': '信托产品数量',
            'clu43': '代销储蓄国债产品数量',
            'clu44': '代理保险产品数量',
            'clu45': '银证第三方存管产品数量',
            'clu46': '个人消费贷款产品数量',
            'clu47': '个人经营贷款产品数量',
            'clu471': '个人委托贷款产品数量',
            'clu48': '信用卡数量',
            'clu72': '定期存款业务活跃度',
            'clu74': '贷款业务活跃度',
            'clu75': '理财业务活跃度',
            'Is_NW_Cust': '是否开通网上银行业务',
            'Is_PB_Cust': '是否开通手机银行业务',
            'Is_WE_Cust': '是否开通微信银行业务',
            'Is_DFDK_Cust': '是否代发客户',
            'CB_CT_TX_NUM': '核心客户柜面使用频率',
            'CB_PB_TX_NUM': '核心客户手机银行使用频率',
            'CB_PP_TX_NUM': '核心客户手机贴膜卡使用频率',
            'CB_NW_TX_NUM': '核心客户网上银行使用频率',
            'CB_WE_TX_NUM': '核心客户微信银行使用频率(非动帐)',
            'CB_ATM_TX_NUM': '核心客户ATM使用频率',
            'CB_EP_TX_NUM': '核心客户第三方支付平台使用频率',
            'CB_POS_TX_NUM': '核心客户POS/TPOS使用频率',
            'indicator_new': '是否过路资金账户'
        })
    #print(readinData)
    #print(readinData.columns)
    frequent_itemsets = apriori(readinData, min_support=0.1, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    rules = rules.sort_values(['confidence'], ascending=False)
    print(rules)
    #print(rules.head())
    #writer = pd.ExcelWriter("results/tianjin.xlsx")
    #fos.write(codecs.BOM_UTF8)
    #rules.write()
    rules.to_csv('results/tianjin.csv', encoding='utf_8_sig', index=False)
        write_hosp_file(df, "FILEPATH")

    return df_list


final_list = []
years = np.arange(2005, 2017, 1)
for year in years:
    print("Starting on year {}".format(year))
    start = time.time()
    if year == 2007:
        print("2007 Stata is corrupted as of 2018-01-12")
        fpath = "FILENAME"\
            "gbd 2005-2016 sas7bdat files/gbd_2007.sas7bdat"
        df = pd.read_sas(fpath)
    else:
        fpath = r"FILENAME"\
            "FILENAME"\
            "FILEPATH".format(year)
        df = pd.read_stata(fpath)
    read_time = (time.time() - start) / 60
    print("It took {} min to read in year {}".format(read_time, year))
    try:
        ideal_keep = [
            'sesso', 'eta', 'reg_ric', 'regric', 'gg_deg', 'mod_dim',
            'mot_dh', 'tiporic', 'dpr', 'dsec1', 'dsec2', 'dsec3',
            'dsec4', 'dsec5',
import pandas as pd
import numpy as np
import simplejson as json
from statistics import mode
from decimal import Decimal

# adsldf = pd.read_excel(r"C:\Users\sasg\PycharmProjects\test\src\assets\adsl.xlsx", 'adsl')
adsldf = pd.read_sas(
    r"P:\nn7415\nn7415-4255\ctr_20190130_er\stats\data\adam\xprt\adsl.xpt",
    encoding='UTF-8')


class CatgoSummary:
    def __init__(self, df, anavars, byvars, idvars, stats, totalover=None):
        self.df = df
        self.anavars = self._assign_prop(anavars)
        self.byvars = self._assign_prop(byvars)
        self.idvars = self._assign_prop(idvars)
        self.stats = self._assign_prop(stats)
        self.totalover = self._assign_prop(totalover)

    def _assign_prop(self, prop):
        if isinstance(prop, (list, tuple)):
            return prop
        elif prop is not None:
            return (prop, )
        else:
            return tuple()

    def _dedup(self, df, *vars):
        return df.drop_duplicates(subset=vars)
1) pull the appropriate total income and total expense fields and divide them by sqft...
2) make a determination on whether I'll include the previous year determination
3) verify the accuracy of the newbase triple net lease field
"""
import pymongo
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn import model_selection

# importing from SAS... filer15 data is submitted on 6/16 and applies to FY 17/18;
# the associated tentative valuation is 01/17 and the final roll is 6/17, so a CAMA
# pull around that date corresponds to the "current determination" (response), and
# CAMA data from 6/16 is the previous-year determination (what we're using as an
# independent variable)
path = 'G:/Property _CommercialModeling/SAS Data Sets/'
best_15 = pd.read_sas(path + 'ie_re_best_filer15.sas7bdat')

# importing from EC2
# defaults to port 27017
client = pymongo.MongoClient("mongodb://*****:*****@18.222.226.207:27017/rpie")
db = client.rpie

# point to the appropriate collection
filings = db['filings']

# retrieve only 2015 filings
df_15 = filings.find({'filings': {'$elemMatch': {'IE_YEAR': '2015'}}})
df_15_d = filings.aggregate([{
    '$match': {
        'filings': {
            '$elemMatch': {
                'IE_YEAR': '2015'
# repeated measures per subject for each type.
# The data files can be obtained from these links:
#
# https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/DEMO_G.XPT
# https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/BPX_G.XPT
# https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/BMX_G.XPT

import statsmodels.api as sm
import pandas as pd
import numpy as np

# First, load and merge the data sets.  These are SAS Xport format
# files, which can be read with Pandas.
demog = pd.read_sas("../data/DEMO_G.XPT")
bpx = pd.read_sas("../data/BPX_G.XPT")
bmx = pd.read_sas("../data/BMX_G.XPT")
df = pd.merge(demog, bpx, left_on="SEQN", right_on="SEQN")
df = pd.merge(df, bmx, left_on="SEQN", right_on="SEQN")

# Next we convert the data from wide to long, pivoting the four
# BP measures from columns to rows.
syvars = ["BPXSY%d" % j for j in (1, 2, 3, 4)]
divars = ["BPXDI%d" % j for j in (1, 2, 3, 4)]
vvars = syvars + divars
idvars = ['SEQN', 'RIDAGEYR', 'RIAGENDR', 'BMXBMI']
dx = pd.melt(df, id_vars=idvars, value_vars=vvars,
def get_col_names_by_table_name(self, table_name):
    return pd.read_sas(self.file_path, chunksize=0).columns.tolist()
def test_from_sas():
    pandas_df = pandas.read_sas(TEST_SAS_FILENAME)
    modin_df = pd.read_sas(TEST_SAS_FILENAME)
    assert modin_df_equals_pandas(modin_df, pandas_df)
print(reg_model.score(train_x, train_y))
# R squared = 0.7518 - percentage of variation in the target variable
# explained by the derived equation. A higher value is preferred.
# R squared >= 0.85 - best fit
# R squared >= 0.7  - good fit
# R squared < 0.5   - poor fit

# Assignment
# extract the data
# build the regression model
# target: cost
locs = 'G:/spyder/Python Part 2 ClassNotes/class 11/Assignment/'
auction = pd.read_sas(locs + "auction.sas7bdat")
auction.shape
auction.columns

X = auction.iloc[:, 1:5]
Y = auction.iloc[:, 5]
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.3,
                                                    random_state=10)
reg_model = LinearRegression().fit(train_x, train_y)
pred_y = reg_model.predict(test_x)
print(mean_squared_error(test_y, pred_y))
This script processes the NHANES actigraphy data into a csv file.  Note
that the resulting file will be too large to read into memory on most
computers.

Documentation:
    https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/PAXRAW_C.htm

Get the data:
    wget https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/PAXRAW_C.ZIP
"""

import pandas as pd
import gzip
import numpy as np

rdr = pd.read_sas("paxraw_c.xpt", format="xport", chunksize=10000000)

out = gzip.open("paxraw.csv.gz", "wt")

first = True
jj = 0
while True:
    try:
        df = next(rdr)
    except StopIteration:
        break
    df = df.astype(int)
    out.write(df.to_csv(header=first, index=False))
def sas_data_to_df(sas_data_path, date_cols):
    data = pd.read_sas(sas_data_path)
    # SAS stores dates as day counts from 1960-01-01; shift the date columns onto that epoch
    data[date_cols] = data[date_cols].applymap(
        lambda x: datetime.datetime(1960, 1, 1) + datetime.timedelta(x))
    return data
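# Usage sketch for sas_data_to_df above (the file name and date columns are hypothetical
# placeholders, not from the source). It assumes the date columns come back from read_sas
# as raw day counts relative to the 1960-01-01 SAS epoch, which is what the helper converts.
visits = sas_data_to_df("visits.sas7bdat", date_cols=["ADMIT_DT", "DISCH_DT"])
print(visits[["ADMIT_DT", "DISCH_DT"]].head())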
# -*- coding: utf-8 -*-
"""
Created on Wed Aug  5 14:54:00 2020

@author: Atte
"""
import pandas as pd
import numpy as np
from extract_birthday_sex import extract_birthday_sex

data = pd.read_sas('/homes/aliu/DSGE_LRS/input/thl2019_804_tutkhenk.sas7bdat',
                   chunksize=2650000, encoding='latin-1')

columns = ['id', 'bdate', 'sex']
df = pd.DataFrame(columns=columns)
for i, chunk in enumerate(data):
    df = pd.concat([
        df,
        extract_birthday_sex(chunk,
                             id_col='SUKULAISEN_TNRO',
                             sex_col='SUKUPUOLI',
                             bdate_col='SUKULAISEN_SYNTYMAPV')
    ])

# remove duplicates
df = df.loc[df.duplicated() == False]

data = pd.read_sas('/homes/aliu/DSGE_LRS/input/thl2019_804_tljslv.sas7bdat',
                   chunksize=2650000, encoding='latin-1')
# Episode
df = pd.read_csv('/Users/yuchenli/Box Sync/Yuchen_project/'
                 'Opioid_progression/Iteration_2/Data/'
                 'amex_opioid_epis_1212285.csv',
                 sep=',',
                 index_col=False,
                 header=None,
                 encoding='utf-8',
                 dtype=str)
df.columns = [
    'year', 'enrolid', 'employer_cd', 'epi_id', 'epi_adm_cnt', 'epi_rx_cnt',
    'svcdate', 'tsvcdat', 'epi_days_cnt', 'epi_grp_cd', 'epi_incomplete_ind',
    'epi_proc_type_cd', 'epi_disease_stg_cd', 'epi_pay', 'epi_submit',
    'epi_netpay', 'epi_op_pay', 'epi_op_submit', 'epi_op_netpay',
    'Allowed Amount Med Epis', 'Charge Submitted Med Epis',
    'Net Payment Med Epis', 'epi_rx_pay', 'epi_rx_submit', 'epi_rx_netpay',
    'epi_tot_pay', 'epi_tot_submit', 'epi_tot_netpay', 'prov_id',
    'mng_phys_type', 'mng_phys_type_cd', 'prov_id', 'mdst_phys_type',
    'stdprov', 'table'
]
df.to_csv('/Users/yuchenli/Box Sync/Yuchen_project/'
          'Opioid_progression/Iteration_2/Data/'
          'amex_opioid_epis_1212285_with_column_names.csv',
          encoding='utf-8',
          index=False)

# Read random sas7bdat file
df = pd.read_sas('/Users/yuchenli/Box Sync/Yuchen_project/'
                 'Opioid_progression/Iteration_2/SAS/elig.sas7bdat')
df = pd.read_sas('/Users/yuchenli/Box Sync/Yuchen_project/'
                 'Opioid_progression/Iteration_2/SAS/'
                 'opioid_vs_2017.sas7bdat')
#%%
def filterout(name, dataframe, columnname, filterdict):
    sizes = {}
    sizes['original'] = len(dataframe.index)
    for key in filterdict:
        dataframe[columnname] = dataframe[columnname].replace(
            to_replace=filterdict[key],
            value=np.nan)  # dataframe[columnname].map({filterdict[key]: np.nan})
        dataframe = dataframe.dropna()
        sizes[key] = len(dataframe.index)
    print(name)
    print(sizes)
    return dataframe


#%%
schinfo = pd.read_sas(r"files\Schinfo.xpt", format='xport', encoding='utf-8')

#%%
# school size and such
skinfo = ['SCID', 'SIZE', 'METRO', 'REGION', 'GRADES', 'SCHTYPE']
schinfo = schinfo[skinfo].dropna()
schinfo = filterout("schoolsize", schinfo, 'SIZE', {'duplicate': '!'})

#%%
skinfo.pop(0)
for d in skinfo:
    schinfo[d] = schinfo[d].astype(int)

# SCHTYPE
# 1 public
# 2 Catholic
# 3 private
def read_sas(data_fp):
    return pd.read_sas(data_fp)
    def __copy__(self):
        return type(self)(self.default_factory, self)

    def __deepcopy__(self, memo):
        import copy
        return type(self)(self.default_factory, copy.deepcopy(self.items()))

    def __repr__(self):
        return 'OrderedDefaultOrderedDict(%s, %s)' % (
            self.default_factory, OrderedDict.__repr__(self))


# get NDB and NDC data from sas
# TODO: Run programs to update siteinfo with subprocess
ndb_site_data = pd.read_sas(
    r'G:\NIDADSC\spitts\SAS_Projects\NDB\siteinfo.sas7bdat',
    format='sas7bdat')
ndc_site_data = pd.read_sas(
    r'G:\NIDADSC\spitts\SAS_Projects\NDC\siteinfo.sas7bdat',
    format='sas7bdat')
platforms = [ndb_site_data, ndc_site_data]

# Containers for data you need
protocol_site_info = DefaultOrderedDict(dict)
protocol_node_info = DefaultOrderedDict(dict)
all_site_info = DefaultOrderedDict(dict)
all_node_info = DefaultOrderedDict(dict)
site_info_tuple = namedtuple('Site_Info', ['prot', 'site', 'status', 'node'])
collection_of_sites = []

for platform in platforms:
    for index, row in platform.iterrows():
# @Filename: test_insider
# @Date: 2017-01-06
# @Author: Mark Wang
# @Email: [email protected]

import os
import datetime

import pandas as pd
import numpy as np

from constant import Constant as const

today_str = datetime.datetime.today().strftime('%Y-%m-%d')
root_path = '/home/wangzg/Documents/WangYouan/Trading/ShanghaiShenzhen'
data_path = os.path.join(root_path, 'data')
temp_path = os.path.join(root_path, 'temp')
today_path = os.path.join(temp_path, today_str)
if not os.path.exists(today_path):
    os.makedirs(today_path)

report_info = pd.read_excel(os.path.join(data_path, 'insider2007_2016.xlsx'))
stock_data = pd.read_sas(os.path.join(data_path, 'daily_0516.sas7bdat'))
stock_data.loc[:, 'Markettype'] = stock_data['Markettype'].apply(int)
stock_data.loc[:, 'Trdsta'] = stock_data['Trdsta'].apply(int)
stock_data.loc[:, const.STOCK_DATE] = stock_data[const.STOCK_DATE].apply(
    lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))
stock_data.to_pickle(os.path.join(today_path, 'daily_0516.p'))
def test_sas_buffer_format(self):
    # GH14947
    b = StringIO("")
    with pytest.raises(ValueError):
        read_sas(b)
def time_packers_read_xport(self):
    pd.read_sas(self.f, format='xport')
# import wids data
df_wids = [
    pd.read_csv("preprocesseddata//ratio//bl_data_processed.csv",
                low_memory=False, index_col=0),
    pd.read_csv("preprocesseddata//ratio//cf_data_processed.csv",
                low_memory=False, index_col=0),
    pd.read_csv("preprocesseddata//ratio//is_data_processed.csv",
                low_memory=False, index_col=0)
]

# import bankruptcy label data
df_sas = pd.read_sas(r"bankrupt.sas7bdat")
df_sas["fyear"] = df_sas["BANK_END_DATE"].dt.year
df_sas = df_sas[(df_sas.fyear > 2000) & (df_sas.fyear < 2019)]

# plot of bankrupt companies by year
dataByYear = df_sas.groupby("fyear")["COMPANY_FKEY"].nunique()
print()
plt.title(r'Companies bankrupt by Year')
plt.bar(list(dataByYear.index), list(dataByYear))
plt.savefig("Descriptives/BankruptByYear.png", transparent=True, dpi=300)
plt.show()

# create disappeared label data
cikGrouped = df_wids[0].groupby("cik")
df_disap = []