def main():
    import dhdt
    import argparse

    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(), dhdt.batchParser()])
    parser.add_argument('config', metavar='CFG',
                        help="name of the configuration file")
    args = parser.parse_args()

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    if batch is not None:
        batch.serial(['createOutDirs', args.config])
    else:
        createOutDirs(cfg)
def main():
    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(), dhdt.batchParser()])
    parser.add_argument('config', metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument('-B', '--display-progress-bar', default=False,
                        action='store_true', help="display a progress bar")
    args = parser.parse_args()
    dhdt.initLog(args)

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    if batch is not None:
        batch.serial(['mergeData', args.config])
    else:
        mergeData(cfg, displayProgress=args.display_progress_bar)
def main():
    import argparse

    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(), dhdt.batchParser()])
    parser.add_argument('config', metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument('-r', '--reprocess-data', action='store_true',
                        default=False,
                        help="process data even if previous run was successful")
    parser.add_argument('-n', '--num-processes', type=int, default=8,
                        help="set the number of processes to use "
                             "(either on workstation or for MPI taskfarm)")
    parser.add_argument('--monitor-memory', action='store_true', default=False,
                        help="monitor CPU and memory usage")
    parser.add_argument('-T', '--task-farm', action='store_true', default=False,
                        help="use MPI task farm")
    args = parser.parse_args()
    dhdt.initLog(args)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    if batch is not None:
        processSGE(args.config, batch,
                   reprocess=args.reprocess_data,
                   monitor=args.monitor_memory,
                   taskFarm=args.task_farm)
    else:
        processMP(args.config, args.num_processes, args.run_time,
                  reprocess=args.reprocess_data,
                  monitor=args.monitor_memory)
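# A minimal sketch (an assumption, not dhdt's processMP) of the worker-pool
# pattern a workstation driver like the one above can use: a pool of
# num_processes workers maps a per-tile function over the tile IDs. The
# worker function and tile list below are hypothetical placeholders.
from multiprocessing import Pool

def _process_one_tile(tile):
    # stand-in for the real per-tile processing; report which tile was done
    return tile

def run_tiles(tiles, num_processes=8):
    with Pool(processes=num_processes) as pool:
        return pool.map(_process_one_tile, tiles)

# usage (the guard is needed where multiprocessing spawns workers, e.g. Windows):
# if __name__ == '__main__':
#     run_tiles(list(range(16)), num_processes=4)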
def main():
    import argparse

    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(), dhdt.batchParser(taskfarm=True)])
    parser.add_argument('config', metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument('-r', '--reprocess-data', action='store_true',
                        default=False,
                        help="process data even if previous run was successful")
    parser.add_argument('--monitor-memory', action='store_true', default=False,
                        help="monitor CPU and memory usage")
    args = parser.parse_args()

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    if batch is not None:
        dhdt.initLog(args)
        cmd = ['processDataTF', args.config,
               '-l', args.log_level,
               '-L', args.log_file]
        if args.reprocess_data:
            cmd.append('-r')
        if args.monitor_memory:
            cmd.append('--monitor-memory')
        batch.mpi(cmd)
    else:
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        dhdt.initLog(args, mpi_rank=rank)
        processDataTF(cfg, reprocess=args.reprocess_data,
                      monitor=args.monitor_memory)
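# A minimal sketch (an assumption, not dhdt's processDataTF) of the rank-based
# work split that the MPI branch above relies on: each rank queries COMM_WORLD
# for its rank and size and takes every size-th task. The function name and
# task list are hypothetical. Run under MPI, e.g. "mpirun -n 4 python script.py".
from mpi4py import MPI

def tasks_for_this_rank(tasks):
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    # static round-robin split of the task list across ranks
    return tasks[rank::size]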
def main():
    import argparse

    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(), dhdt.batchParser(taskfarm=True)])
    parser.add_argument('config', metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument('-r', '--rebuild-store', action='store_true',
                        default=False,
                        help="rebuild data store even though store is newer "
                             "than the input files")
    args = parser.parse_args()
    dhdt.initLog(args)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    if batch is not None:
        cmd = ['readDataTF', args.config]
        if args.rebuild_store:
            cmd.append('-r')
        batch.mpi(cmd)
    else:
        readDataTF(cfg, rebuild=args.rebuild_store)
def main():
    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(), dhdt.batchParser()])
    parser.add_argument('config', metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument('-r', '--reprocess-data', action='store_true',
                        default=False,
                        help="process data even if previous run was successful")
    parser.add_argument('--process', '-p', metavar='N', default=0, type=int,
                        help="compute tasks for process N")
    parser.add_argument('--tile-file', '-T', metavar='TFILE',
                        help="get tile IDs from file TFILE")
    parser.add_argument('--monitor-memory', action='store_true', default=False,
                        help="monitor CPU and memory usage")
    args = parser.parse_args()
    dhdt.initLog(args)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    # look up the tile ID on (zero-based) line N of the tile file
    processTile = args.process
    if args.tile_file is not None:
        i = 0
        tfile = open(args.tile_file, 'r')
        for line in tfile.readlines():
            if i == args.process:
                processTile = int(line)
                break
            i = i + 1
        else:
            # for/else: only reached if the loop finished without a break
            parser.error("could not find tile %d in tile file %s" %
                         (args.process, args.tile_file))
        tfile.close()

    if not args.reprocess_data:
        if dhdt.checkNC(cfg['grid']['output'], processTile):
            logging.info('tile %d has already been successfully processed'
                         % processTile)
            return

    if batch is not None:
        cmd = ['processData', '-p', str(processTile), args.config,
               '-l', args.log_level, '-L', args.log_file]
        if args.monitor_memory:
            cmd.append('--monitor-memory')
        if args.reprocess_data:
            cmd.append('-r')
        batch.serial(cmd)
    else:
        processData(cfg, processTile, monitor=args.monitor_memory,
                    reprocess=args.reprocess_data)
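# The --tile-file lookup above assumes a plain-text file with one integer tile
# ID per line; "-p N" selects the ID on (zero-based) line N. A hypothetical
# example file and the equivalent lookup:
#
#   $ cat tiles.txt
#   101
#   102
#   205
#
#   with open('tiles.txt') as tfile:
#       tile_ids = [int(line) for line in tfile if line.strip()]
#   processTile = tile_ids[2]   # "-p 2" selects 205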
#gpd1 = gpd.GeoDataFrame([['John', 1, Point(1, 1)],
#                         ['Smith', 1, Point(2, 2)],
#                         ['Soap', 1, Point(0, 2)]],
#                        columns=['Name', 'ID', 'geometry'])
#gpd2 = gpd.GeoDataFrame([['Work', Point(0, 1.1)],
#                         ['Shops', Point(2.5, 2)],
#                         ['Home', Point(1, 1.1)]],
#                        columns=['Place', 'geometry'])

#pts3 = gpd2.geometry.unary_union

#def near(point, pts=pts3):
#    # find the nearest point and return the corresponding Place value
#    nearest = gpd2.geometry == nearest_points(point, pts)[1]
#    return gpd2[nearest].Place.get_values()[0]

#gpd1['Nearest'] = gpd1.apply(lambda row: near(row.geometry), axis=1)

#def main():
parser = argparse.ArgumentParser(
    parents=[dhdt.dhdtLog(), dhdt.batchParser()])
parser.add_argument('config', metavar='CFG',
                    help="name of the configuration file")
parser.add_argument('-r', '--rebuild-store', action='store_true', default=False,
                    help="rebuild data store even though store is newer "
                         "than the input files")
args = parser.parse_args()
dhdt.initLog(args)

if args.submit == 'sge':
    batch = dhdt.SGEProcess(args)
elif args.submit == 'pbs':
    batch = dhdt.PBSProcess(args)
else:
    batch = None

# read the configuration
cfg = dhdt.Config()
cfg.readCfg(args.config)
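# A self-contained, runnable version of the nearest-point lookup sketched in
# the commented-out block above (an illustrative example, not part of the
# processing chain). It assumes geopandas and shapely are installed; the
# deprecated .get_values() is replaced by .values.
import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points

gpd1 = gpd.GeoDataFrame([['John', 1, Point(1, 1)],
                         ['Smith', 1, Point(2, 2)],
                         ['Soap', 1, Point(0, 2)]],
                        columns=['Name', 'ID', 'geometry'])
gpd2 = gpd.GeoDataFrame([['Work', Point(0, 1.1)],
                         ['Shops', Point(2.5, 2)],
                         ['Home', Point(1, 1.1)]],
                        columns=['Place', 'geometry'])

# union of all candidate points, searched once per query point
pts3 = gpd2.geometry.unary_union

def near(point, pts=pts3):
    # find the nearest point and return the corresponding Place value
    nearest = gpd2.geometry == nearest_points(point, pts)[1]
    return gpd2[nearest].Place.values[0]

gpd1['Nearest'] = gpd1.apply(lambda row: near(row.geometry), axis=1)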
def main():
    ''' main entry point to code '''

    ########## Config ##########
    # Set the test flag to process just a single Matlab or CSV file and check
    # the config is OK.
    test = False
    converter = 'Matlab'  # Or 'CSV'

    # Optional bbox in wgs84 coordinates [bottom left to top right]
    #bbox = [81.4,-96.734,81.41,-96.73]
    bbox = None

    # Matlab or CSV file pattern match
    fileglob = '/media/martin/DATA/Data/MatFiles/Swath/2012/*1B_201202*.mat'
    # Location to store shards
    storeFolder = '/media/martin/DATA/Data/hdf/swath/2012/'
    # Shard prefix, so shards can easily be identified. Use 'swath_', 'poca_'
    # or 'oib_' or anything else
    filePrefix = 'swath_'

    # Set data lengths - want large for swath (500k), small for poca and oib
    recordLength = 500000
    recordBuffer = 150000  # Maximum over recordLength to allow in file
    recordMin = 400000     # Minimum to allow in file - will concatenate files if below this
    # Alternate for Poca or OIB
    #recordLength = 6000
    #recordBuffer = 2000
    #recordMin = 4000

    ######### Additional DHDT Config ###########
    # This is dhdt config parsing code. I have not modified it.
    parser = argparse.ArgumentParser(
        parents=[dhdt.dhdtLog(), dhdt.batchParser()])
    parser.add_argument('config', metavar='CFG',
                        help="name of the configuration file")
    parser.add_argument('-r', '--rebuild-store', action='store_true',
                        default=False,
                        help="rebuild data store even though store is newer "
                             "than the input files")
    args = parser.parse_args()
    dhdt.initLog(args)

    if args.submit == 'sge':
        batch = dhdt.SGEProcess(args)
    elif args.submit == 'pbs':
        batch = dhdt.PBSProcess(args)
    else:
        batch = None

    # read the configuration
    cfg = dhdt.Config()
    cfg.readCfg(args.config)

    ############# Start of code ##############
    start = time.time()

    # Define reader
    if converter == 'Matlab':
        reader = dhdt.dataReader.MatlabReader()
    else:
        reader = dhdt.dataReader.CSVReader()

    # Temporary variables
    hold = False
    holdData = None

    # Get file list in order
    fileList = glob.glob(fileglob)
    fileList.sort()

    # Set counters
    ii = 0
    fileCount = len(fileList)

    # Iterate through each file and convert to shards
    for d in fileList:
        ii += 1
        print('Processing file {} of {}: {}'.format(ii, fileCount, d))
        loadData = reader.read(d, bbox)
        if loadData.shape[0] == 0:
            print('Empty File - Skipping')
            if d != fileList[-1]:
                continue

        # Hold the data for appending the next files (if less than minimum record length)
        allData = loadData
        # Determine if data is being held; if so, concatenate
        if hold:
            if loadData.shape[0] == 0:
                allData = holdData
            else:
                allData = pd.concat([holdData, loadData])

        # Less than minimum record length, so hold data and continue loop to append the next files
        if allData.shape[0] < recordMin and d != fileList[-1]:
            hold = True
            holdData = allData
            continue
        else:
            hold = False
            holdData = None

        if allData.shape[0] == 0:
            continue

        # Must now be above minimum record length (or at end of file list)
        # Convert to geo coordinates and project to polar stereographic
        allData = GeoWrapper.convertToGeo(allData, cfg['data']['projection'], False)
        allData = GeoWrapper.project(allData, cfg['grid']['projection'])
        allData = GeoWrapper.extractXYtoCols(allData, 0)
        allData = allData.drop(['geometry'], axis=1)

        # Create dataframe
        allData = pd.DataFrame(allData)

        # Write counters
        i = 0
        dLength = allData.shape[0]
        j = 0

        # Loop over data to create files of maximum record length
        while i <= dLength:
            increment = recordLength
            if i + recordLength + recordBuffer > dLength:
                increment = recordLength + recordBuffer

            # Take slice of data up to maximum data length
            data = allData[i:i + increment]
            data = data.reset_index().drop(['index'], axis=1)

            # Only do the next steps if we have data
            if data.shape[0] > 0:
                # Create index
                indexData = DataStoreHelper.createIndex(
                    data, ['lat', 'lon', 'x', 'y', 'startTime'])

                # Create file name
                fileTime = DataStoreHelper.createFileDateTime(indexData)
                fullPath = storeFolder + filePrefix + fileTime + '_' + str(j) + '.h5'

                # Write data
                store = pd.HDFStore(fullPath, mode='w', complevel=9, complib='blosc')
                store.append('data', data, index=False, data_columns=True)
                store.append('index', indexData, index=False, data_columns=True)
                store.close()

                # remove in-memory data to keep things efficient
                del data
                del indexData

            i += increment
            j += 1

        # remove in-memory data to keep things efficient
        del loadData
        del allData
        del holdData

        # Stop early if running a test
        if test:
            if ii >= 1:
                print("Time Taken: {}".format(time.time() - start))
                return

    print("Complete")
    print("Time Taken: {}".format(time.time() - start))
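# A hedged sketch of reading one of the shards written above back in: each
# .h5 file holds a 'data' table and an 'index' table appended via
# pandas.HDFStore. The file name below is a hypothetical example.
import pandas as pd

shard = '/media/martin/DATA/Data/hdf/swath/2012/swath_20120201T000000_0.h5'
with pd.HDFStore(shard, mode='r') as store:
    data = store['data']        # full record table
    indexData = store['index']  # per-shard index (lat, lon, x, y, startTime)
print(data.shape, indexData.shape)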