Example 1
def main():
    """ Application main. """

    K = [10]  # folds
    N = [3]  # neighbors
    P = [4.5]  # powers
    # The time scale is the only parameter being trained here, so we
    # consider a number of options.
    C = [0.001 * i for i in range(1, 25)]
    C.extend([0.025 * i for i in range(1, 81)])
    A = [0.75]  # alphas
    M = [3]  # bags

    # Build the list of k-fold configurations under analysis.
    conf_list = [
        kfold.KFoldConf(k, n, p, None, c, a, m) for k in K for n in N
        for p in P for c in C for a in A for m in M
    ]

    # Distribute the RDD of k-fold configurations.
    conf_rdd = SC.parallelize(conf_list, 104).cache()

    # Group all the partitions that are to be examined.
    partition_files = [
        'partitions/monthly_ozone_1990-2015_partition.csv',
        'partitions/monthly_pm25_1990-2015_partition.csv'
    ]

    # Run learning tasks for each partition.
    for file_name in partition_files:

        # Note that we reuse the method from "point.py" here.
        point_list = point.load_point_file(file_name)
        point_list_brd = SC.broadcast(point_list)

        # Define a mapper that runs the statistical routines for one configuration.
        def fold(conf):
            """ Return a result tuple for the given configuration. """
            return (
                conf,  # KFoldConf object
                kfold.mare(conf, point_list_brd),  # MARE statistic
                kfold.rmspe(conf, point_list_brd))  # RMSPE statistic

        # Run the learning routines and generate the report.
        report_rdd = conf_rdd.map(fold).map(report)

        # Write the output to a file in a "results/" directory, regardless of
        # the order in which the partitions were analysed.
        if 'no2' in file_name:
            report_rdd.saveAsTextFile('results/no2_max_results')
        elif 'ozone' in file_name:
            report_rdd.saveAsTextFile('results/ozone_max_results')
        elif 'pm25' in file_name:
            report_rdd.saveAsTextFile('results/pm25_max_results')
        else:
            # Unrecognised partition file name: abort the run.
            import sys
            sys.exit(1)
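
All three examples assume module-level objects that are defined elsewhere in the application: a SparkContext bound to SC, a report mapper that turns each result tuple into an output line, and the project's kfold and point modules. A minimal sketch of that assumed driver setup, with a hypothetical report formatter for the (conf, mare, rmspe) tuples of this example (Example 2 prepends a partition id, so its formatter would differ), might look like this:

from pyspark import SparkConf, SparkContext

# Assumed driver-level setup; everything below is a sketch, not the
# application's actual code.
SC = SparkContext(conf=SparkConf().setAppName('kfold-grid-search'))


def report(result):
    """ Flatten a (conf, mare, rmspe) result tuple into a CSV-style line. """
    conf, mare, rmspe = result
    return '%s,%f,%f' % (conf, mare, rmspe)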
Example 2
def main():
    """ Application main. """

    K = [10]                                        # folds
    N = [3, 4, 5, 6, 7, 8]                          # neighbors
    P = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]         # powers
    C = [0.001 * i for i in range(1, 25)]           # time_scales
    C.extend([0.025 * i for i in range(1, 81)])

    # build the list and then RDD of KFoldConf objects under analysis
    conf_list = [kfold.KFoldConf(k, n, p, None, c)
                 for k in K
                 for n in N
                 for p in P
                 for c in C]
    # add incremental "conf_id" attribute to each KFoldConf object
    for i, conf in enumerate(conf_list):
        conf.conf_id = i
    conf_rdd = SC.parallelize(conf_list, 150).cache()

    # load radius_table and broadcast it
    with open('radius_table.pkl', 'rb') as f:
        radius_table = pickle.load(f)
    radius_table_brd = SC.broadcast(radius_table)

    # run learning tasks for each partition
    for i in range(3):
        point_list = load_partition(i)
        point_list_brd = SC.broadcast(point_list)

        def fold(conf):
            """ Return a result tuple for the given configuration. """
            return (i,                                  # partition_id
                    conf,                               # KFoldConf object
                    kfold.mare(conf,                    # MARE statistic
                               point_list_brd,
                               radius_table_brd),
                    kfold.rmspe(conf,                   # RMSPE statistic
                                point_list_brd,
                                radius_table_brd))

        report_rdd = conf_rdd.map(fold).map(report)
        report_rdd.saveAsTextFile('results/partition%02d' % i)

    # collect all results into one rdd, then into one file
    result_rdds = [SC.textFile('results/partition%02d/' % i) for i in range(3)]
    results = (result_rdds[0]
               .union(result_rdds[1])
               .union(result_rdds[2])
               .collect())
    with open('results.csv', 'w') as output:
        results = map(lambda line: line + '\n', results)
        output.writelines(results)
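
The final merge step above chains pairwise union calls. PySpark's SparkContext.union accepts a list of RDDs and merges them in one call, so the same consolidation could also be sketched as follows (assuming the same three partition result directories and the module-level SC):

# Equivalent consolidation sketch using SparkContext.union.
result_rdds = [SC.textFile('results/partition%02d/' % i) for i in range(3)]
results = SC.union(result_rdds).collect()
with open('results.csv', 'w') as output:
    output.writelines(line + '\n' for line in results)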
Example 3
def main():
    """ Application main. """

    N = [3, 4, 5, 6, 7]                      # neighbors
    P = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]  # powers

    # Build the grid of configurations: k=10 folds, fixed time scale 0.1086.
    conf_list = [kfold.KFoldConf(10, n, p, None, 0.1086) for n in N for p in P]
    conf_rdd = SC.parallelize(conf_list, 45).cache()

    point_list = point.load_pm25_file('../../data/pm25_2009_measured.csv')
    random.shuffle(point_list)
    # The following was used to test execution of this script locally.
    # point_list = point_list[:250]
    point_list_brd = SC.broadcast(point_list)

    def fold(conf):
        """ Return a result tuple for the given configuration. """
        return (conf, kfold.mare(conf, point_list_brd),
                kfold.rmspe(conf, point_list_brd))

    report_rdd = conf_rdd.map(fold).map(report)
    report_rdd.saveAsTextFile(
        "hdfs:///user/jf00936/aeolus/experiment_01/results")