# NOTE(review): this chunk is the tail of an enclosing GLM-benchmark function.
# Its `def`, the loop, and the matching `try:` lie outside this view, so the
# indentation below is reconstructed -- confirm against the full file.

        # Assemble one result row for the benchmark CSV: the GLM call kwargs,
        # the GLM response fields, the validation stats, and the two
        # timing/sizing measurements captured by the harness.
        row.update(kwargs)
        row.update(glm)
        row.update(val)
        row.update({'wall_clock_secs': wall_clock_secs})
        row.update({'java_heap_GB': java_heap_GB})
        csvWrt.writerow(row)
        # Drop the parsed dataset key from the H2O cloud so the next
        # iteration starts from a clean heap.
        h2o.nodes[0].remove_key(k)
    finally:
        # Always close the CSV output file, even if a benchmark run failed.
        output.close()


def parse_file(f):
    """Import files into the H2O cloud and parse the first succeeded one.

    Returns the destination key of the parsed dataset.

    NOTE(review): the parameter `f` is never used -- `import_files()` is
    called with no argument. Presumably this should be `import_files(f)`;
    confirm against the h2o client API before changing.
    """
    v = h2o.nodes[0].import_files()['succeeded'][0]
    # Generous 1-hour parse timeout: these benchmark datasets can be large.
    return h2o.nodes[0].parse(v['key'],timeoutSecs=3600)['destination_key']


if __name__ == '__main__':
    h2o.parse_our_args()
    files = None
    # Choose dataset locations and cloud configuration by environment:
    # EC2 runs use the ec2_files map; local runs use local_files and read
    # from HDFS on a fixed base port.
    if is_ec2():
        files = ec2_files
        h2o_hosts.build_cloud_with_hosts()
    else:
        files = local_files
        h2o_hosts.build_cloud_with_hosts(use_hdfs=True,base_port=54321)

    # want to ignore columns with missing values, since GLM throws away those rows, (won't analyze as many rows)
    # Distance, CRSEElapsedTime has some...I guess ignore
    # column Year 0 type: int
    # column Month 1 type: int
    # column DayofMonth 2 type: int
    # column DayOfWeek 3 type: int
    # column DepTime 4 type: int num_missing_values: 2302136
# NOTE(review): this chunk is the tail of doPCA. The `def doPCA(...)`, the
# PCA call whose argument list ends on the first line below, and the matching
# `try:` all lie outside this view; indentation is reconstructed -- confirm
# against the full file.

                timeoutSecs = 7200, **kwargs)
        # Block until all outstanding H2O jobs (the PCA build) finish, then
        # measure wall-clock build time from pcaStart (set before the call,
        # outside this view).
        h2j.pollWaitJobs(timeoutSecs=4800, pollTimeoutSecs=4800, retryDelaySecs=2)
        pcaTime = time.time() - pcaStart
        # Shell out to stop the external resource loggers started earlier.
        cmd = 'bash startloggers.sh ' + json + ' stop_' #stop all loggers
        os.system(cmd)
        row.update({'pcaBuildTime' : pcaTime})
        csvWrt.writerow(row)
    finally:
        # Always close the benchmark CSV, even if the PCA build failed.
        output.close()


if __name__ == '__main__':
    # Command line is consumed right-to-left: ... <json> <build> <debug> <dat>
    # where `dat` selects dataset and scale, and `json` (the basename of the
    # remaining last argument) names the config driving startloggers.sh.
    # NOTE(review): `debug` and `build` are popped but never used here;
    # NOTE(review): `json` shadows the stdlib `json` module name.
    dat = sys.argv.pop(-1)
    debug = sys.argv.pop(-1)
    build = sys.argv.pop(-1)
    json = sys.argv[-1].split('/')[-1]
    fp = 'Airlines' if 'Air' in dat else 'AllBedrooms'
    h2o.parse_our_args()
    h2o_hosts.build_cloud_with_hosts()
    # Map the `dat` token to a concrete training file (1x / 10x / 100x
    # scale variants of the Airlines and AllBedrooms datasets).
    if dat == 'Air1x' : fs = files['Airlines']['train'][0]
    if dat == 'Air10x' : fs = files['Airlines']['train'][1]
    if dat == 'Air100x' : fs = files['Airlines']['train'][2]
    if dat == 'AllB1x' : fs = files['AllBedrooms']['train'][0]
    if dat == 'AllB10x' : fs = files['AllBedrooms']['train'][1]
    if dat == 'AllB100x' : fs = files['AllBedrooms']['train'][2]
    doPCA(fs, fp)
    h2o.tear_down_cloud()