Exemple #1
0
def run_mixed_models(X_train_MM, X_test_MM, y_train_MM, y_test_MM):
    """Fit the two mixed models through a ProcessLoom and return their outputs.

    Every argument is indexed with the keys ``'MM_LR'`` and ``'MM_NN'``.
    ``MM_LR`` is called with the ``'MM_LR'`` train features, test features and
    train targets; ``NN`` is called with the ``'MM_NN'`` train features, test
    features, train targets and test targets.

    Returns:
        A pair: the output of the first queued job (``MM_LR``) unchanged, and
        the output of the second queued job (``NN``) flattened with
        ``reshape(-1)``.
    """
    started = time.time()
    pool = ProcessLoom(max_runner_cap=2)

    # queue the linear-regression mixed model
    lr_args = [X_train_MM['MM_LR'], X_test_MM['MM_LR'], y_train_MM['MM_LR']]
    pool.add_function(MM_LR, lr_args, {})

    # queue the neural-network mixed model (it also receives the test targets)
    nn_args = [
        X_train_MM['MM_NN'],
        X_test_MM['MM_NN'],
        y_train_MM['MM_NN'],
        y_test_MM['MM_NN'],
    ]
    pool.add_function(NN, nn_args, {})

    # run both queued jobs
    results = pool.execute()
    finished = time.time()
    print('total time - run mixed models: ', finished - started)

    lr_output = results[0]['output']
    nn_output = (results[1]['output']).reshape(-1)
    return lr_output, nn_output
def main():
    """Process the stock list in two batches via ProcessLoom, then report.

    The list returned by ``Stocks().get_list()`` is cut at index 800; ``slave``
    is queued once per part with the labels ``"Slave1"`` and ``"Slave2"``.
    After execution the stock client is closed and the elapsed wall-clock
    time is handed to ``connect_email``.
    """
    stock = Stocks()
    started_at = datetime.datetime.now()

    symbols = stock.get_list()
    # first 800 entries go to the first worker, the remainder to the second
    batches = [(symbols[:800], "Slave1"), (symbols[800:], "Slave2")]

    loom = ProcessLoom(max_runner_cap=10)
    for batch, label in batches:
        loom.add_function(slave, [batch, label], {})
    loom.execute()

    elapsed = datetime.datetime.now() - started_at

    stock.client.close()
    connect_email(elapsed)
Exemple #3
0
def preprocess(main_data, validationFlag):
    """Split ``main_data`` into train / (validation) / test features and targets.

    The ``'Target'`` column is separated from the remaining columns and
    ``splitData`` is run once per split through a ``ProcessLoom``.  Split
    sizes are derived from the module-level ``r`` (test size) and
    ``numberOfDays``: the training part is ``floor(0.75 * (numberOfDays - r))``
    and the validation part is whatever remains.

    Args:
        main_data: DataFrame that contains a ``'Target'`` column.
        validationFlag: truthy to produce a separate validation split; falsy
            to fold the validation range into the training split.

    Returns:
        With validation:
        ``(X_train_train, X_train_val, X_test, y_train_train, y_train_val, y_test)``.
        Without validation: ``(X_train, X_test, y_train, y_test)``.
        Each ``X`` is the first element of a ``splitData`` output with its
        index reset; each ``y`` is the second element as a flat numpy array.
    """
    target = pd.DataFrame(main_data['Target'])
    main_data = main_data.drop(['Target'], axis=1)

    # specify the size of train, validation and test sets
    test_offset = r
    train_offset = floor(0.75 * (numberOfDays - test_offset))
    val_offset = numberOfDays - (train_offset + test_offset)

    # (4th, 5th) arguments passed to splitData for each split, in output
    # order; presumably (split size, start offset) -- confirm against splitData
    if validationFlag:
        splits = [
            (train_offset, 0),
            (val_offset, train_offset),
            (test_offset, train_offset + val_offset),
        ]
    else:
        splits = [
            (train_offset + val_offset, 0),
            (test_offset, train_offset + val_offset),
        ]

    # produce the splits in parallel
    loom = ProcessLoom(max_runner_cap=4)
    for size, start in splits:
        loom.add_function(
            splitData,
            [numberOfSelectedCounties, main_data, target, size, start], {})
    output = loom.execute()

    # results are looked up by the order in which the jobs were queued
    features = [
        (output[i]['output'][0]).reset_index(drop=True)
        for i in range(len(splits))
    ]
    targets = [
        np.array(output[i]['output'][1]).reshape(-1)
        for i in range(len(splits))
    ]
    return tuple(features + targets)
Exemple #4
0
def main(maxHistory):
    """Pick the best history length and covariate count per method, then test.

    Validation phase: for every history length ``h`` in ``1..maxHistory`` the
    non-mixed methods (GBM, GLM, KNN, NN) are fitted through ``parallel_run``
    on a growing list of covariates, the mixed methods (MM_LR, MM_NN) are
    fitted through ``mixed_prallel_run`` on the non-mixed predictions, and for
    each method the ``(h, c)`` pair with the lowest RMSE is recorded.

    Test phase: every method is re-run with its best ``(h, c)``; error tables
    are plotted and one CSV per method is written under ``test_address``.

    Relies on module-level names that are not defined in this function
    (``r``, ``argv``, ``numberOfSelectedCounties``, ``env_address``,
    ``test_address`` and the helper functions it calls).

    Args:
        maxHistory: largest history length to try; the values
            ``1..maxHistory`` are evaluated.
    """
    history = list(range(1, maxHistory + 1))
    methods = ['GBM', 'GLM', 'KNN', 'NN', 'MM_LR', 'MM_NN']
    none_mixed_methods = ['GBM', 'GLM', 'KNN', 'NN']
    mixed_methods = ['MM_LR', 'MM_NN']
    target_name = 'confirmed'
    base_data = makeHistoricalData(0, r, target_name, str(argv[1]))
    base_data = clean_data(base_data, numberOfSelectedCounties)
    covariates_names = list(base_data.columns)
    covariates_names.remove('Target')
    numberOfCovariates = len(covariates_names)
    print('number of covariates: ', numberOfCovariates)

    # every per-method container below is keyed by method name, then (h, c)
    y_prediction = {method: {} for method in methods}
    y_test_MM = {'MM_LR': {}, 'MM_NN': {}}
    best_h = {}
    best_c = {}
    minError = {method: int(1e10) for method in methods}
    percentage_errors = {method: {} for method in methods}  # percentage of absolute errors
    mae_errors = {method: {} for method in methods}  # mean absolute errors
    rmse_errors = {method: {} for method in methods}  # root mean squared errors
    adjR2_errors = {method: {} for method in methods}  # adjusted R squared errors

    historical_X_train = {}  # X_train for best h and c
    historical_X_test = {}  # X_test for best h and c
    historical_y_train = {}  # y_train for best h and c
    historical_y_test = {}  # y_test for best h and c
    parallel_outputs = {}

    for h in history:
        # NOTE(review): base_data above uses `r` here while this call uses a
        # literal 14 -- confirm the two are meant to differ.
        data = makeHistoricalData(h, 14, target_name, str(argv[1]))
        data = clean_data(data, numberOfSelectedCounties)
        X_train_train, X_train_val, X_test, y_train_train, y_train_val, y_test = preprocess(
            data, 1)
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed
        y_train = np.array(
            pd.concat([pd.DataFrame(y_train_train),
                       pd.DataFrame(y_train_val)]).reset_index(drop=True)
        ).reshape(-1)
        covariates_list = []
        # Original author's note: covariates are sorted by their correlation
        # with Target. We start from the first important covariate and in each
        # loop we add the next important one.

        # initiate loom for parallel processing
        loom = ProcessLoom(
            max_runner_cap=len(base_data.columns) * len(none_mixed_methods) +
            5)

        for c in covariates_names:  # iterate through sorted covariates
            for covariate in data.columns:  # add all historical covariates of this covariate and create a feature
                if c.split(' ')[0] in covariate:
                    covariates_list.append(covariate)
            X_train_train_temp = X_train_train[covariates_list]
            X_train_val_temp = X_train_val[covariates_list]
            for method in none_mixed_methods:
                loom.add_function(parallel_run, [
                    method, X_train_train_temp, X_train_val_temp,
                    y_train_train, y_train_val
                ])
        # run the processes in parallel
        parallel_outputs['non_mixed'] = loom.execute()
        # outputs are read back in the order the jobs were queued:
        # covariate count outermost, method innermost
        ind = 0
        for c in range(1, numberOfCovariates + 1):
            for method in none_mixed_methods:
                y_prediction[method][(
                    h, c)] = parallel_outputs['non_mixed'][ind]['output']
                ind += 1
        # save the entire session for each h and c
        filename = env_address + 'validation.pkl'
        dill.dump_session(filename)
        # initiate loom for parallel processing
        loom = ProcessLoom(
            max_runner_cap=len(base_data.columns) * len(mixed_methods) + 5)
        for c in range(1, numberOfCovariates + 1):
            for mixed_method in mixed_methods:
                y_predictions = []
                # Construct the outputs for the training dataset of the 'MM' methods
                y_prediction['NN'][(h, c)] = np.array(
                    y_prediction['NN'][(h, c)]).ravel()
                y_predictions.extend([
                    y_prediction['GBM'][(h, c)], y_prediction['GLM'][(h, c)],
                    y_prediction['KNN'][(h, c)], y_prediction['NN'][(h, c)]
                ])
                # one column per non-mixed method after the transpose
                y_prediction_np = np.array(y_predictions).reshape(
                    len(y_predictions), -1)
                X_mixedModel = pd.DataFrame(y_prediction_np.transpose())
                X_train_MM, X_test_MM, y_train_MM, y_test_MM[mixed_method][(
                    h, c)] = train_test_split(X_mixedModel,
                                              y_train_val,
                                              test_size=0.25)
                loom.add_function(mixed_prallel_run, [
                    mixed_method, X_train_MM, X_test_MM, y_train_MM,
                    y_test_MM[mixed_method][(h, c)]
                ])
        # run the processes in parallel
        parallel_outputs['mixed'] = loom.execute()
        ind = 0
        for c in range(1, numberOfCovariates + 1):
            for mixed_method in mixed_methods:
                y_prediction[mixed_method][(h, c)] = np.array(
                    parallel_outputs['mixed'][ind]['output']).ravel()
                ind += 1
        # save the entire session for each h and c
        filename = env_address + 'validation.pkl'
        dill.dump_session(filename)

        # Restart the covariate accumulation: without this reset the list
        # still holds every covariate from the first pass, so the frames
        # stored for the best (h, c) would contain all covariates plus
        # duplicated columns instead of only the first `indx_c` covariates.
        covariates_list = []
        indx_c = 0
        for c in covariates_names:  # iterate through sorted covariates
            indx_c += 1
            for covariate in data.columns:  # add all historical covariates of this covariate and create a feature
                if c.split(' ')[0] in covariate:
                    covariates_list.append(covariate)
            X_train_train_temp = X_train_train[covariates_list]
            X_train_val_temp = X_train_val[covariates_list]
            X_test_temp = X_test[covariates_list]
            y_val = y_train_val
            for method in methods:
                # mixed methods are scored against their own held-out targets
                if method == 'MM_LR' or method == 'MM_NN':
                    y_val = y_test_MM[method][(h, indx_c)]
                mae_errors[method][(h, indx_c)], rmse_errors[method][(h, indx_c)], percentage_errors[method][(h, indx_c)], \
                adjR2_errors[method][(h, indx_c)] = get_errors(h, indx_c, method, y_prediction[method][(h, indx_c)], y_val)
                if rmse_errors[method][(h, indx_c)] < minError[method]:
                    minError[method] = rmse_errors[method][(h, indx_c)]
                    best_h[method] = h
                    best_c[method] = indx_c
                    if method != 'MM_LR' and method != 'MM_NN':
                        historical_X_train[method] = pd.concat(
                            [X_train_train_temp,
                             X_train_val_temp]).reset_index(drop=True)
                        historical_X_test[method] = X_test_temp
                        historical_y_train[method] = y_train
                        historical_y_test[method] = y_test
        # save the entire session for each h and c
        filename = env_address + 'validation.pkl'
        dill.dump_session(filename)
    # save the entire session for each h
    filename = env_address + 'validation.pkl'
    dill.dump_session(filename)
    # plot the results of methods on validation set
    plot_results(3, 2, numberOfCovariates, methods, history, percentage_errors,
                 'Percentage Of Absolute Error')
    plot_results(3, 2, numberOfCovariates, methods, history, mae_errors,
                 'Mean Absolute Error')
    plot_results(3, 2, numberOfCovariates, methods, history, rmse_errors,
                 'Root Mean Squared Error')
    plot_results(3, 2, numberOfCovariates, methods, history, adjR2_errors,
                 'Adjusted R Squared Error')
    push()
    #################################################################################################################
    columns_table = [
        'method', 'best_h', 'best_c', 'root mean squared error',
        'mean absolute error', 'percentage of absolute error',
        'adjusted R squared error'
    ]  # table columns names
    y_prediction = {}
    # run non-mixed methods on the whole training set with their best h and c
    X_train_dict, X_test_dict, y_train_dict, y_test_dict = {}, {}, {}, {}

    y_prediction['GBM'], y_prediction['GLM'], y_prediction[
        'KNN'], y_prediction['NN'] = run_algorithms(historical_X_train,
                                                    historical_X_test,
                                                    historical_y_train,
                                                    historical_y_test)

    table_data = []
    for method in none_mixed_methods:
        meanAbsoluteError, rootMeanSquaredError, percentageOfAbsoluteError, adj_r_squared = get_errors(
            best_h[method], best_c[method], method, y_prediction[method],
            historical_y_test[method])
        table_data.append([
            method, best_h[method], best_c[method],
            round(rootMeanSquaredError, 2),
            round(meanAbsoluteError, 2),
            round(percentageOfAbsoluteError, 2),
            round(adj_r_squared, 2)
        ])
        result = pd.DataFrame(historical_y_test[method], columns=['y_test'])
        result['y_prediction'] = y_prediction[method]
        result['absolute_error'] = abs(historical_y_test[method] -
                                       y_prediction[method])
        result.to_csv(test_address + method + '.csv')
    table_name = 'non-mixed methods best results'
    plot_table(table_data, columns_table, table_name)

    # generate data for non-mixed methods with the best h and c of mixed models and fit mixed models on them
    # (with the whole training set)
    y_predictions = {'MM_LR': [], 'MM_NN': []}
    y_prediction = {}
    table_data = []
    X_train_MM_dict, X_test_MM_dict, y_train_MM_dict, y_test_MM_dict = {}, {}, {}, {}
    for mixed_method in mixed_methods:
        y_test = None
        # NOTE(review): generate_data is called with identical arguments for
        # every non-mixed method; left as-is because its side effects are not
        # visible from here.
        for method in none_mixed_methods:
            X_train, X_test, y_train, y_test = generate_data(
                best_h[mixed_method], best_c[mixed_method], covariates_names)
            X_train_dict[method] = X_train
            X_test_dict[method] = X_test
            y_train_dict[method] = y_train
            y_test_dict[method] = y_test

        y_prediction['GBM'], y_prediction['GLM'], y_prediction[
            'KNN'], y_prediction['NN'] = run_algorithms(
                X_train_dict, X_test_dict, y_train_dict, y_test_dict)
        y_predictions[mixed_method].extend([
            y_prediction['GBM'], y_prediction['GLM'], y_prediction['KNN'],
            y_prediction['NN']
        ])
        y_prediction_np = np.array(y_predictions[mixed_method]).reshape(
            len(y_predictions[mixed_method]), -1)
        X_mixedModel = pd.DataFrame(y_prediction_np.transpose())
        # from here on y_test_MM is rebound to this split's held-out targets;
        # the validation-phase dict of the same name is no longer needed
        X_train_MM, X_test_MM, y_train_MM, y_test_MM = train_test_split(
            X_mixedModel, y_test, test_size=0.25)
        X_train_MM_dict[mixed_method] = X_train_MM
        X_test_MM_dict[mixed_method] = X_test_MM
        y_train_MM_dict[mixed_method] = y_train_MM
        y_test_MM_dict[mixed_method] = y_test_MM
    # save the entire session
    filename = env_address + 'test.pkl'
    dill.dump_session(filename)
    # mixed model with linear regression and neural network
    y_prediction['MM_LR'], y_prediction['MM_NN'] = run_mixed_models(
        X_train_MM_dict, X_test_MM_dict, y_train_MM_dict, y_test_MM_dict)
    for mixed_method in mixed_methods:
        meanAbsoluteError, rootMeanSquaredError, percentageOfAbsoluteError, adj_r_squared = get_errors(
            best_h[mixed_method], best_c[mixed_method], mixed_method,
            y_prediction[mixed_method], y_test_MM_dict[mixed_method])
        table_data.append([
            mixed_method, best_h[mixed_method], best_c[mixed_method],
            round(rootMeanSquaredError, 2),
            round(meanAbsoluteError, 2),
            round(percentageOfAbsoluteError, 2),
            round(adj_r_squared, 2)
        ])
        result = pd.DataFrame(y_test_MM_dict[mixed_method], columns=['y_test'])
        result['y_prediction'] = y_prediction[mixed_method]
        result['absolute_error'] = abs(y_test_MM_dict[mixed_method] -
                                       y_prediction[mixed_method])
        result.to_csv(test_address + mixed_method + '.csv')
    # save the entire session
    filename = env_address + 'test.pkl'
    dill.dump_session(filename)
    table_name = 'mixed methods best results'
    plot_table(table_data, columns_table, table_name)
    push()
Exemple #5
0
def run_algorithms(X_train_dict, X_val_dict, y_train_dict, y_val_dict):
    """Fit GBM, GLM, KNN and NN through a ProcessLoom and return their outputs.

    Every argument is indexed by method name (``'GBM'``, ``'GLM'``, ``'KNN'``,
    ``'NN'``).  ``GBM``, ``GLM`` and ``KNN`` are each called with that
    method's train features, validation features and train targets; ``NN``
    additionally receives the ``'NN'`` validation targets.

    Returns:
        A 4-tuple of job outputs in the order GBM, GLM, KNN, NN, with the NN
        output flattened by ``reshape(-1)``.
    """
    started = time.time()
    pool = ProcessLoom(max_runner_cap=4)

    # the first three models share the same three-argument call shape
    for name, model in (('GBM', GBM), ('GLM', GLM), ('KNN', KNN)):
        pool.add_function(
            model,
            [X_train_dict[name], X_val_dict[name], y_train_dict[name]], {})

    # the neural network is also given the validation targets
    nn_args = [
        X_train_dict['NN'],
        X_val_dict['NN'],
        y_train_dict['NN'],
        y_val_dict['NN'],
    ]
    pool.add_function(NN, nn_args, {})

    # run the queued jobs
    results = pool.execute()
    finished = time.time()
    print('total time - run algorithms: ', finished - started)

    gbm_out = results[0]['output']
    glm_out = results[1]['output']
    knn_out = results[2]['output']
    nn_out = (results[3]['output']).reshape(-1)
    return gbm_out, glm_out, knn_out, nn_out
Exemple #6
0
    verify = True
    if selfSignedCertificate != "" :
        certfile = open(certFileName,'w')
        os.write(certfile,selfSignedCertificate)
        verify = certFileName
    elif selfSignedCertificateS3Bucket != "" :
        s3 = boto3.client('s3')
        verify = certFileName
        with open(certFileName, 'w') as f:
            s3.download_fileobj(selfSignedCertificateS3Bucket, selfSignedCertificateS3Key, f)
        certfile = open(certFileName,'r')
        print(certfile.read())
    elif _allowInValidCerts == True:
        verify = False
    sapcred=json.loads(_get_secret())
    sapUser = sapcred["username"]
    sapPassword = sapcred["password"]
    return requests.get( url, headers=headers, auth=HTTPBasicAuth(sapUser,sapPassword), verify=verify)
    
# ------------------------------------
# Execute data extraction from SAP in parallel:
# queue one _extract call for each start value 0, 5000, 10000, ... that is
# below totalEntities, then run the whole queue through ProcessLoom.
# ------------------------------------
from pexecute.process import ProcessLoom
loom = ProcessLoom(max_runner_cap=9)

x = 0
while (x < totalEntities ):
    # presumably x is the record offset of a 5000-entity page -- confirm against _extract
    loom.add_function(_extract, [x], {})
    x += 5000

# run every queued _extract call and keep what execute() returns
output = loom.execute()
Exemple #7
0
from sys import argv
import sys
import os
import subprocess
from pexecute.process import ProcessLoom


def main():
    """Run ``prediction.py`` once for each index 0..3, one after another.

    NOTE(review): this block contained unresolved merge-conflict markers,
    which made the file unparseable.  The conflict is resolved here in favour
    of the HEAD side (``range(4)``).  The incoming side iterated ``range(7)``
    and created a ``ProcessLoom(max_runner_cap = 8)`` that the visible code
    never used.  Confirm which iteration count is intended.
    """
    for i in range(4):
        print(i)
        # subprocess.call waits for the child to finish, so runs are sequential
        subprocess.call("python ./prediction.py "+str(i), shell=True)


if __name__ == "__main__":

    main()
    output2 = resp_data2.read()
    print(output2)

    print(
        "Presigned URL to download the image is given below, link will expire in 30 mins"
    )
    resp3 = f"aws s3 presign {bucket}image_firefox.png --expires-in 1800 --profile default"
    resp_data3 = os.popen(resp3)
    output3 = resp_data3.read()
    print(output3)


#finally running the code

if status_code != 200:
    # any status other than 200: report it and run nothing
    try:
        print("Please enter correct URL, status of url ")
    except ValueError as e:
        print("Please enter correct URL, status of url ")
        print(e)
else:
    # run the Chrome and Firefox checks against url through ProcessLoom
    from pexecute.process import ProcessLoom
    loom = ProcessLoom(max_runner_cap=4)
    work = [
        (test_chrome_browser, [url]),
        (test_firefox_browser, [url]),
    ]
    loom.add_work(work)
    output = loom.execute()