Example #1
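This script reads a training set and a test set from CSV files, spawns several auto-sklearn workers that share a temporary folder, builds both an ensemble and a single best model from the workers' output, and writes the cross-validation results, hold-out predictions, and elapsed time to disk.
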
import getopt
import glob
import multiprocessing
import shutil
import sys
import time

import pandas

import autosklearn.metrics
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.constants import BINARY_CLASSIFICATION

# NOTE: get_spawn_classifier() and small_constant are assumed to be defined
# elsewhere in the accompanying code.


def main(argv):

    # reading the command line
    helpString = 'python python_script_JAD_paper -a <trainingSet> -b <testSet> -t <timeForEachWorker> -n <numWorkers>'
    try:
        opts, args = getopt.getopt(argv, "ha:b:t:n:")
    except getopt.GetoptError:
        print(helpString)
        sys.exit(2)

    # collecting the arguments
    for opt, arg in opts:
        if opt == '-h':
            print(helpString)
            sys.exit()
        elif opt == '-a':
            training_set = arg
        elif opt == '-b':
            test_set = arg
        elif opt == '-t':
            time_left_for_this_task = int(arg)
        elif opt == '-n':
            n_processes = int(arg)

    # starting counting the time
    start_time = time.time()

    # folders
    tmp_folder = './tmp/autosklearn_tmp/' + training_set
    output_folder = './tmp/autosklearn_out/' + training_set

    # removing any leftover temporary and output folders from previous runs
    for tmpDir in [tmp_folder, output_folder]:
        try:
            shutil.rmtree(tmpDir)
        except OSError:
            # nothing to remove if the folder does not exist
            pass

    # reading the training data
    trainingData = pandas.read_csv(filepath_or_buffer='./tmp/data/' +
                                   training_set + '.csv',
                                   index_col=False)
    y_train = trainingData['target']
    X_train = trainingData.drop(columns='target')

    # reading the test data
    testData = pandas.read_csv(filepath_or_buffer='./tmp/data/' + test_set +
                               '.csv',
                               index_col=False)
    y_test = testData['target']
    X_test = testData.drop(columns='target')

    # main block
    try:

        # creating the sub-process function
        processes = []
        spawn_classifier = get_spawn_classifier(X_train, y_train, training_set,
                                                time_left_for_this_task,
                                                tmp_folder, output_folder)

        # spawning the subprocesses
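        # each worker receives a distinct integer, presumably used as the
        # auto-sklearn seed inside spawn_classifier (defined elsewhere)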
        for i in range(small_constant, small_constant + n_processes):
            p = multiprocessing.Process(target=spawn_classifier, args=[i])
            p.start()
            processes.append(p)

        # waiting until all processes are done
        for p in processes:
            p.join()

        # retrieving the per-worker cvRes files and concatenating them into a single data frame
        csvFiles = glob.glob('./tmp/results/' + training_set + '/*.csv')
        cvRes = pandas.read_csv(filepath_or_buffer=csvFiles[0], index_col=0)
        for csvFile in csvFiles[1:]:
            cvRes_tmp = pandas.read_csv(filepath_or_buffer=csvFile,
                                        index_col=0)
            cvRes = pandas.concat([cvRes, cvRes_tmp], axis=0, sort=False)

        # writing the cvRes on file
        cvRes.to_csv('./tmp/results/' + training_set + '/cvRes.csv',
                     index=False)

        # building the ensemble
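        # shared_mode=True points this estimator at the same tmp/output
        # folders the workers wrote to, so fit_ensemble() can combine the
        # models they trained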
        automl_ensemble = AutoSklearnClassifier(
            time_left_for_this_task=time_left_for_this_task,  # sec., how long this seed's fit process may run
            delete_tmp_folder_after_terminate=False,
            delete_output_folder_after_terminate=False,
            seed=12345,
            shared_mode=True,
            ensemble_size=50,
            ensemble_nbest=50,
            tmp_folder=tmp_folder,
            output_folder=output_folder)
        automl_ensemble.fit_ensemble(y_train.copy(),
                                     task=BINARY_CLASSIFICATION,
                                     metric=autosklearn.metrics.roc_auc)

        # building the best model
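        # with ensemble_size=1 and ensemble_nbest=1 the "ensemble" reduces to
        # the single best model found by the workers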
        automl_bestModel = AutoSklearnClassifier(
            time_left_for_this_task=time_left_for_this_task,  # sec., how long this seed's fit process may run
            delete_tmp_folder_after_terminate=False,
            delete_output_folder_after_terminate=False,
            shared_mode=True,
            ensemble_size=1,
            ensemble_nbest=1,
            tmp_folder=tmp_folder,
            output_folder=output_folder)
        automl_bestModel.fit_ensemble(y_train.copy(),
                                      task=BINARY_CLASSIFICATION,
                                      metric=autosklearn.metrics.roc_auc)

        # refitting on the whole dataset
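        # refit() retrains the selected pipelines on the full training data,
        # since during the search they were only fitted on internal
        # train/validation splits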
        automl_bestModel.refit(X_train.copy(), y_train.copy())
        automl_ensemble.refit(X_train.copy(), y_train.copy())

        # extracting the performances on test set
        automl_bestModel.target_type = 'multilabel-indicator'
        automl_ensemble.target_type = 'multilabel-indicator'
        predictions_bestModel = automl_bestModel.predict_proba(X_test.copy())
        predictions_ensemble = automl_ensemble.predict_proba(X_test.copy())

        # saving the results on file
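        # predict_proba returns one column per class; column 0 holds the
        # probability of the first class label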
        toSave = pandas.DataFrame({'outcome': y_test})
        toSave['prob_ensemble'] = predictions_ensemble[:, 0]
        toSave['prob_bestModel'] = predictions_bestModel[:, 0]
        toSave.to_csv('./tmp/results/' + training_set + '/holdoutRes.csv')

        # stopping counting the time
        end_time = time.time()

        # saving total time
        total_time = end_time - start_time
        with open('./tmp/results/' + training_set + '/etime.txt',
                  "w+") as time_file:
            time_file.write('Total time in seconds: %d\n' % total_time)

    except Exception as e:
        print(e)

    finally:

        # removing the fitted models from the auto-sklearn tmp folder
        shutil.rmtree(tmp_folder + '/.auto-sklearn/models')


if __name__ == '__main__':
    main(sys.argv[1:])