Ejemplo n.º 1
0
            row.update(kwargs)
            row.update(glm)
            row.update(val)
            row.update({'wall_clock_secs': wall_clock_secs})
            row.update({'java_heap_GB': java_heap_GB})
            csvWrt.writerow(row)
        h2o.nodes[0].remove_key(k)
    finally:
        output.close()
    
def parse_file(f):
    """Import files through the first H2O node and parse the first success.

    Calls ``import_files()`` on ``h2o.nodes[0]``, takes the first entry of
    the ``'succeeded'`` list in the response, and submits its ``'key'`` to
    ``parse()`` with a one-hour timeout.

    NOTE(review): the ``f`` argument is accepted but never used in this
    function; which files get imported is decided entirely by
    ``import_files()``. Confirm whether ``f`` was meant to be passed on.

    Returns the ``'destination_key'`` field of the parse response.
    """
    succeeded = h2o.nodes[0].import_files()['succeeded']
    source_key = succeeded[0]['key']
    parse_result = h2o.nodes[0].parse(source_key, timeoutSecs=3600)
    return parse_result['destination_key']

if __name__ == '__main__':
    h2o.parse_our_args()
    files = None
    if is_ec2():
        files = ec2_files
        h2o_hosts.build_cloud_with_hosts()
    else:
        files = local_files
        h2o_hosts.build_cloud_with_hosts(use_hdfs=True,base_port=54321)

    # We want to ignore columns with missing values, since GLM throws away rows that contain them (so fewer rows would be analyzed).
    # Distance and CRSElapsedTime have some missing values too... I guess ignore them as well.
    # column Year 0 type: int
    # column Month 1 type: int
    # column DayofMonth 2 type: int
    # column DayOfWeek 3 type: int
    # column DepTime 4 type: int num_missing_values: 2302136
Ejemplo n.º 2
0
                                   timeoutSecs = 7200, 
                                   **kwargs)

        h2j.pollWaitJobs(timeoutSecs=4800, pollTimeoutSecs=4800, retryDelaySecs=2)
        pcaTime   = time.time() - pcaStart
        cmd = 'bash startloggers.sh ' + json + ' stop_'
        #stop all loggers
        os.system(cmd)
        row.update({'pcaBuildTime' : pcaTime})
        csvWrt.writerow(row)
    finally:
        output.close()

if __name__ == '__main__':
    # Script entry point: pick a dataset from the command line, build the
    # H2O cloud, run doPCA on it, and always tear the cloud down afterwards.
    #
    # The last three command-line arguments are, in order on the command
    # line: <build> <debug> <dat>. They are popped off sys.argv (last one
    # first) so that h2o.parse_our_args() only sees the remaining arguments.
    dat   = sys.argv.pop(-1)
    debug = sys.argv.pop(-1)
    build = sys.argv.pop(-1)
    # Base name (text after the final '/') of the last remaining argument.
    # NOTE(review): this module-level name shadows the stdlib module name
    # `json`; it is kept as-is because code earlier in this file reads the
    # global `json` when building a shell command.
    json  = sys.argv[-1].split('/')[-1]

    # Maps the dataset argument to (top-level key in `files`, index into
    # that entry's 'train' list).
    datasets = {
        'Air1x':    ('Airlines', 0),
        'Air10x':   ('Airlines', 1),
        'Air100x':  ('Airlines', 2),
        'AllB1x':   ('AllBedrooms', 0),
        'AllB10x':  ('AllBedrooms', 1),
        'AllB100x': ('AllBedrooms', 2),
    }
    if dat not in datasets:
        # Fail before the cloud is built: previously an unknown value left
        # `fs` unbound and surfaced as a NameError only after the cloud was
        # already up, leaving it running.
        raise SystemExit('Unknown dataset %r; expected one of: %s'
                         % (dat, ', '.join(sorted(datasets))))
    fp, train_index = datasets[dat]

    h2o.parse_our_args()
    h2o_hosts.build_cloud_with_hosts()
    try:
        fs = files[fp]['train'][train_index]
        doPCA(fs, fp)
    finally:
        # Tear the cloud down even if the run fails, so no nodes are left behind.
        h2o.tear_down_cloud()