# 2.2 Importing Delimited Text Data
import os

import settings as st
from revoscalepy import rx_get_info, rx_import

# Read the delimited sample file into an in-memory data frame.
infile = os.path.join(st.SAMPLE_DATA_DIR, 'claims.txt')
claims_data_frame = rx_import(infile)

# Show the data set summary, including per-variable information.
claims_info = rx_get_info(claims_data_frame, get_var_info=True)
print(claims_info)

# Import the same text file again, this time writing the result to an
# XDF file instead of keeping it in memory.
# NOTE: RxOptions.set_option("OutDataPath", [RESULTS_LOCATION]) was observed
# not to be taken into account (possible bug), so the full output location
# is spelled out explicitly here.
outfile = os.path.join(st.RESULTS_LOCATION, 'claims.xdf')
rx_import(input_data=infile, output_file=outfile, overwrite=True)
# 4.3 Using the Data Step to Create an .xdf File from a Data Frame
import os

import numpy as np
import pandas as pd

import settings as st
from revoscalepy import rx_data_step, rx_get_info

# Seed the generator so the sample below is reproducible.
np.random.seed(39)

# Draw order matters for reproducibility: normal first, then uniform.
x1 = np.random.normal(size=10000)
x2 = np.random.uniform(size=10000)
x3 = x1 + x2

# Stack the three vectors as rows, then flip so each becomes a column.
s = np.stack((x1, x2, x3)).T

# Build the frame and keep only the rows matching the x2 filter.
unfiltered = pd.DataFrame(s, columns=['x1', 'x2', 'x3'])
myData = unfiltered.query("x2 > .1")

# Destination of the exported .xdf file.
outFile = os.path.join(st.RESULTS_LOCATION, 'testFile.xdf')

# Write the data frame to the .xdf file, replacing any existing file.
rx_data_step(
    input_data=myData,
    output_file=outFile,
    rows_per_read=5000,
    overwrite=True,
)

file_info = rx_get_info(outFile)
print(file_info)
# 3.2 Specifying Delimiters
import os

import settings as st
from revoscalepy import RxTextData, rx_get_info, rx_import

# Describe the text file as a data source whose fields are separated by
# hyphens, then import it.
infile = os.path.join(st.SAMPLE_DATA_DIR, 'hyphens.txt')
hyphensTxt = RxTextData(infile, delimiter='-')
hyphens_data_frame = rx_import(hyphensTxt)

# Show the data set summary with per-variable information, followed by
# the first rows of the imported frame.
hyphens_info = rx_get_info(hyphens_data_frame, get_var_info=True)
print(hyphens_info)
print(hyphens_data_frame.head())
def _print_type_info(frame):
    """Print variable info for *frame*, then for UNIQUE_CARRIER alone."""
    print(rx_get_info(frame, get_var_info=True))
    print(rx_get_info(frame, vars_to_keep="UNIQUE_CARRIER", get_var_info=True))


# For this example, we assume the airOT201201.csv file was downloaded into
# the current working directory.
# Other samples are available under RxOptions.get_option("sampleDataDir").
dirpath = os.getcwd()

# CSV file (raw data for one month).
airlineDemoPathCSV = os.path.join(dirpath, "airOT201201.csv")

# Wrap the CSV file in an RxTextData data source.
ds = RxTextData(airlineDemoPathCSV)

# Import only the first 10 rows, then inspect the data types of every
# variable and of the UNIQUE_CARRIER column in particular.
airlineDemo = rx_import(input_data=ds, number_rows=10)
_print_type_info(airlineDemo)

# Import the same 10 rows again with strings_as_factors=True and repeat the
# inspection to see the difference in how UNIQUE_CARRIER is reported.
airlineDemo = rx_import(input_data=ds, strings_as_factors=True, number_rows=10)
_print_type_info(airlineDemo)

# Manually create column_info for the factor column UniqueCarrier.
# column_info can be used to specify data types and update factor levels.
def dummy_vars(df, context):
    """Return *df* with two categorical columns expanded into dummy columns.

    The 'Annual_Income_Bucket' and 'Highest_Education' columns are replaced
    by indicator columns via ``pd.get_dummies``. Every resulting column name
    is then cleaned up: spaces become '_', '>' becomes 'gt' and '<' becomes
    'lt'.

    Args:
        df: pandas DataFrame holding the columns listed above.
        context: Not used in the body; presumably part of the signature
            expected of a ``transform_function`` (confirm against the
            rx_data_step documentation).

    Returns:
        The dummy-encoded DataFrame with cleaned column names.
    """
    # Built inside the function so it does not rely on module-level state.
    substitutions = str.maketrans({' ': '_', '>': 'gt', '<': 'lt'})

    encoded = pd.get_dummies(
        df, columns=['Annual_Income_Bucket', 'Highest_Education'])

    # Rename columns (remove spaces etc.).
    encoded.columns = [label.translate(substitutions)
                       for label in encoded.columns]
    return encoded


output_file = os.path.join(file_path, "Lead_Demography.xdf")

# Use the csv file as the data source.
data_source = RxTextData(file=inputfile, delimiter=',')

# Apply the dummy-variable transform and write the result to the named xdf.
rx_data_step(
    input_data=data_source,
    output_file=output_file,
    overwrite=True,
    transform_function=dummy_vars,
)

# Query information about the xdf just written (return value is not kept).
rx_get_info(data=output_file, verbose=2)

####################################################################
# Call main function to work in SQL compute context
####################################################################
main(tablename="Lead_Demography_Tbl", inputdf=output_file, overwrite=True)