# 2.2 Importing Delimited Text Data
import os
import settings as st
from revoscalepy import rx_import, rx_get_info

# Read the delimited sample file 'claims.txt' into a data frame
infile = os.path.join(st.SAMPLE_DATA_DIR, 'claims.txt')
claims_data_frame = rx_import(infile)

# Show a summary of the imported data, including per-variable information
claims_info = rx_get_info(claims_data_frame, get_var_info=True)
print(claims_info)

# Import the same text file again, this time writing it to an XDF file.
#
# NOTE: setting RxOptions.set_option("OutDataPath", [RESULTS_LOCATION]) was
# observed not to be taken into account (possibly a bug), so the complete
# output location is spelled out here.
outfile = os.path.join(st.RESULTS_LOCATION, 'claims.xdf')
rx_import(
    input_data=infile,
    output_file=outfile,
    overwrite=True,
)
# 4.3 Using the Data Step to Create an .xdf File from a Data Frame
import os
import settings as st
import pandas as pd
import numpy as np
from revoscalepy import rx_data_step, rx_get_info

# Fixed seed so the generated sample is reproducible
np.random.seed(39)

# Number of values drawn for each variable
N_OBSERVATIONS = 10000

# Draw order matters for reproducibility: normal first, then uniform
x1 = np.random.normal(size=N_OBSERVATIONS)
x2 = np.random.uniform(size=N_OBSERVATIONS)
x3 = x1 + x2

# Arrange the three variables as the columns of one 2-D array
s = np.stack((x1, x2, x3), axis=1)

# Build the frame, then keep only the rows where x2 exceeds 0.1
all_rows = pd.DataFrame(s, columns=['x1', 'x2', 'x3'])
myData = all_rows.query("x2 > .1")

# Destination .xdf file inside the results directory
outFile = os.path.join(st.RESULTS_LOCATION, 'testFile.xdf')

# Write the data frame out via the data step, replacing any existing file
rx_data_step(
    input_data=myData,
    output_file=outFile,
    rows_per_read=5000,
    overwrite=True,
)

# Summarize the file that was just written
xdf_info = rx_get_info(outFile)
print(xdf_info)
# 3.2 Specifying Delimiters
import os
import settings as st
from revoscalepy import rx_import, rx_get_info, RxTextData

# Sample text file whose fields are separated by '-' instead of the default
infile = os.path.join(st.SAMPLE_DATA_DIR, 'hyphens.txt')

# Describe the file as a text data source with an explicit delimiter,
# then import it into a data frame
hyphensTxt = RxTextData(infile, delimiter='-')
hyphens_data_frame = rx_import(hyphensTxt)

# Show the variable information followed by the first rows of the data
hyphens_info = rx_get_info(hyphens_data_frame, get_var_info=True)
first_rows = hyphens_data_frame.head()
print(hyphens_info)
print(first_rows)
# Example no. 4
# 0
# This example expects airOT201201.csv to have been downloaded into the
# current working directory.
# (Other samples are available under RxOptions.get_option("sampleDataDir").)
dirpath = os.getcwd()

# CSV file holding one month of raw data
airlineDemoPathCSV = os.path.join(dirpath, "airOT201201.csv")

# Wrap the CSV file in an RxTextData data source
ds = RxTextData(airlineDemoPathCSV)


def _print_type_info(data):
    """Print variable info for every column, then for UNIQUE_CARRIER alone."""
    print(rx_get_info(data, get_var_info=True))
    print(rx_get_info(data, vars_to_keep="UNIQUE_CARRIER", get_var_info=True))


# First pass: import only the first 10 rows with the default string handling
# and inspect the resulting data types
airlineDemo = rx_import(input_data=ds, number_rows=10)
_print_type_info(airlineDemo)

# Second pass: import the same 10 rows with strings_as_factors=True and
# inspect again, to compare how UNIQUE_CARRIER is reported in each case
airlineDemo = rx_import(
    input_data=ds,
    strings_as_factors=True,
    number_rows=10,
)
_print_type_info(airlineDemo)

# Manually create column_info for the factor column UNIQUE_CARRIER.
# We can use column_info to specify data types and update factor levels.
    def dummy_vars(df, context):
        """Expand the income and education columns into dummy variables.

        The 'Annual_Income_Bucket' and 'Highest_Education' columns of `df`
        are replaced by indicator columns, after which every column name is
        cleaned up: spaces become '_', '>' becomes 'gt' and '<' becomes 'lt'.

        `context` is accepted as the second argument but is not used here.

        Returns the new data frame with the renamed columns.
        """
        # Single-pass character substitutions applied to each column name
        substitutions = str.maketrans({' ': '_', '>': 'gt', '<': 'lt'})

        encoded = pd.get_dummies(
            df, columns=['Annual_Income_Bucket', 'Highest_Education'])
        encoded.columns = [
            name.translate(substitutions) for name in encoded.columns
        ]
        return encoded

    # Path of the .xdf file to write, located under file_path.
    # NOTE(review): file_path is defined outside this excerpt.
    output_file = os.path.join(file_path, "Lead_Demography.xdf")

    # Set a csv as a data source (comma-delimited).
    # NOTE(review): inputfile is defined outside this excerpt.
    data_source = RxTextData(file=inputfile, delimiter=',')

    # Dummy variables and output to named xdf: dummy_vars is supplied as the
    # transform function and an existing output file is overwritten.
    rx_data_step(input_data=data_source,
                 output_file=output_file,
                 overwrite=True,
                 transform_function=dummy_vars)

    # Query info about the written file. The return value is not used here;
    # presumably verbose=2 makes the call print its report — TODO confirm.
    rx_get_info(data=output_file, verbose=2)

    ####################################################################
    # Call main function to work in SQL compute context
    ####################################################################

    # The path of the .xdf file written above is passed as inputdf.
    # NOTE(review): main is defined outside this excerpt; the meaning of
    # tablename and overwrite should be verified against its definition.
    main(tablename="Lead_Demography_Tbl", inputdf=output_file, overwrite=True)