Esempio n. 1
0
import numpy as np
import pandas as pd
import time
import sys

# Reference point for timing the script as a whole.
tic0 = time.perf_counter()
##----------------------------------------------------------------------------------------
## Logistic Regression with SGD
##----------------------------------------------------------------------------------------
# Simulation settings handed to simulate_logistic: sample size, number of
# feature columns (x0 ... x{p-1}), and the partitioning scheme.
sample_size, p = 5000, 50
partition_method, partition_num = "systematic", 20

# Simulate the data set (presumably a pandas DataFrame -- confirm against
# simulate_logistic) and convert it into a Spark DataFrame.
data_pdf = simulate_logistic(sample_size, p, partition_method, partition_num)
data_sdf = spark.createDataFrame(data_pdf)

# Size of the simulated data object as reported by sys.getsizeof.
memsize = sys.getsizeof(data_pdf)

# Collect the p feature columns into a single "features" column.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=["x{}".format(col_idx) for col_idx in range(p)],
)

# Time the assembling step on its own.
# NOTE(review): if transform() is evaluated lazily, this interval covers only
# the set-up of the transformation, not the actual computation -- confirm.
tic = time.perf_counter()
parsedData = assembler.transform(data_sdf)
time_parallelize = time.perf_counter() - tic

# Restart the timer for the stage that follows.
tic = time.perf_counter()
# Model configuration
lr = LogisticRegression(elasticNetParam=0.8, regParam=0.3, maxIter=100)
Esempio n. 2
0
    sample_size_sub = []
    memsize_sub = []

# Read or load data chunks into pandas
#-----------------------------------------------------------------------------------------
# Per-file timing records; nothing in this part of the script fills them yet
# (presumably appended to further down -- confirm).
time_2sdf_sub = []
time_repartition_sub = []

loop_counter = 0
for file_no_i in range(n_files):
    # Start of the timed interval for this file.
    tic_2sdf = time.perf_counter()

    if using_data == "simulated_pdf":
        if file_no_i == 0:
            # To test performance, we only simulate one subset of data and
            # replicate it.
            data_pdf_i = simulate_logistic(sample_size_sub[0], p,
                                           partition_method, partition_num_sub)
            memsize_sub0 = sys.getsizeof(data_pdf_i)
        else:
            # data_pdf_i is not reassigned here, so the subset simulated on
            # the first iteration is reused; only the bookkeeping lists grow.
            sample_size_sub.append(sample_size_sub[0])
            memsize_sub.append(memsize_sub0)
            partition_num_sub.append(partition_num_sub[0])

    elif using_data == "real_pdf":  # Read real data
        data_pdf_i0 = clean_airlinedata(os.path.expanduser(
            file_path[file_no_i]),
                                        fit_intercept=fit_intercept)

        # Pad the current subset to the full column set: build an empty
        # DataFrame holding the dummy columns this subset lacks and
        # concatenate it on (sort=True orders the resulting columns).
        # pd.concat is used because DataFrame.append was deprecated in
        # pandas 1.4 and removed in pandas 2.0; append delegated to concat,
        # so the result is unchanged on older pandas versions.
        edf = pd.DataFrame(
            columns=list(set(dummy_column_names) - set(data_pdf_i0.columns)))
        data_pdf_i = pd.concat([data_pdf_i0, edf], sort=True)