import glob
import os
import pprint
import subprocess
import sys
from os.path import join as pjoin

import matplotlib.pyplot as plt
import pandas as pd

# Project-local dependencies (names taken from the code below; their modules
# belong to the wider project and are not reconstructed here):
# - fu: file/path utilities (FileName, l1adatapath,
#   get_month_sample_path_from_mode)
# - mypool: multiprocessing pool wrapper (MyPool)
# - tweet_machine: status-update helper
# - main(), usage(): defined elsewhere in the full script


def create_full_indexes(args):
    """Create a fully sorted PyTables index on 'clat' for one channel file."""
    mode, chno = args
    dirname = fu.get_month_sample_path_from_mode(mode)
    path = pjoin(dirname, 'C' + str(chno) + '.h5')
    store = pd.HDFStore(path)
    store.create_table_index('df', columns=['clat'], kind='full')
    store.close()
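# Usage sketch (assumption, not part of this module's entry point): index all
# thermal channels of one mode in parallel with a standard multiprocessing
# pool; the mode string 'night' is a hypothetical example value.
#
# from multiprocessing import Pool
# with Pool(7) as pool:
#     pool.map(create_full_indexes, [('night', ch) for ch in range(3, 10)])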
def ptrepack_all(args):
    """Repack one channel's HDF5 file so rows are physically sorted by 'clat'.

    Note: ptrepack's --sortby option requires a full ('completely sorted')
    index on the sort column, which create_full_indexes() provides.
    """
    mode, chno = args
    dirname = fu.get_month_sample_path_from_mode(mode)
    fname_root = pjoin(dirname, 'C' + str(chno))
    infile = fname_root + '.h5'
    outfile = fname_root + '_sorted.h5'
    cmd = ['ptrepack', '--chunkshape=auto', '--sortby=clat',
           '--propindexes', infile, outfile]
    subprocess.call(cmd)
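# Order-of-operations sketch (assumption; 'night' and channel 7 are example
# values): the full index must exist before the repack, because ptrepack's
# --sortby refuses columns without one.
#
# create_full_indexes(('night', 7))
# ptrepack_all(('night', 7))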
def get_percent_done(timestr, mode):
    """Return the percentage of L1A files already processed for `timestr`.

    `timestr` determines which input files are being checked. Intended to be
    polled by a status checker that tweets progress regularly, e.g.:

        status_checker <timestr> <mode> <check_interval_minutes>
    """
    fnames_out = os.listdir(fu.get_month_sample_path_from_mode(mode))
    fnames = glob.glob(os.path.join(fu.l1adatapath, timestr + '*_L1A.TAB'))
    timestrs_done = [fu.FileName(i).timestr for i in fnames_out]
    # 7 thermal channels (3..9): if fewer than 7 channel files exist for a
    # timestr, that file is not done yet.
    fnames_todo = [i for i in fnames
                   if timestrs_done.count(fu.FileName(i).timestr) < 7]
    n_all = len(fnames)  # renamed from 'all' to avoid shadowing the builtin
    left = len(fnames_todo)
    done = n_all - left
    return 100 * done / n_all
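# Polling sketch (assumption, not part of the module): re-check progress
# hourly and report via the project's tweet_machine helper.
#
# import time
# while get_percent_done(timestr, mode) < 100:
#     tweet_machine("{0} processing {1:.0f} % done."
#                   .format(mode, get_percent_done(timestr, mode)))
#     time.sleep(60 * 60)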
def store_channel_csv_to_h5(args):
    """Concatenate all CSVs of one channel into a single HDF5 store."""
    mode, ch = args
    dirname = fu.get_month_sample_path_from_mode(mode)
    searchpath = pjoin(dirname, '*_C' + str(ch) + '_*.csv')
    fnames = glob.glob(searchpath)
    if not fnames:
        print("No files found with searchpath\n", searchpath)
        return
    storepath = pjoin(dirname, 'C' + str(ch) + '.h5')
    store = pd.HDFStore(storepath, 'w')
    for i, fname in enumerate(fnames):
        print(100 * i / len(fnames))
        if i % 100 == 0:
            tweet_machine("C{0} conversion to HDF, {1:g}"
                          " % done.".format(ch, 100 * i / len(fnames)))
        df = pd.io.parsers.read_csv(fname)
        if len(df) == 0:
            continue
        # index=False defers index creation; create_full_indexes() builds
        # the full 'clat' index once, after all appends are done.
        store.append('df', df,
                     data_columns=['clat', 'clon', 'cloctime'],
                     index=False)
    store.close()
    print("C{0} done.".format(ch))
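# Querying the result (sketch; 'C7.h5' stands in for a store written by the
# function above): columns listed in data_columns can be filtered on disk
# without loading the whole table, e.g. every sample polewards of 80 deg:
#
# store = pd.HDFStore('C7.h5')
# polar = store.select('df', 'clat > 80')
# store.close()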
def plot_channel_filesizes(mode, show=False):
    """Plot the on-disk size of every channel CSV for the given mode."""
    dirname = fu.get_month_sample_path_from_mode(mode)
    fig, axes = plt.subplots(figsize=(12, 10))
    for ch in range(3, 10):
        searchpath = os.path.join(dirname, '*C' + str(ch) + '*.csv')
        fnames = glob.glob(searchpath)
        sizes = []
        fnames.sort()
        for fname in fnames:
            sizes.append(os.path.getsize(fname))
        axes.plot(sizes, label='Ch ' + str(ch))
    axes.legend(loc='best')
    axes.set_title("File sizes of {0} files.".format(mode))
    savefname = 'filesizes_' + mode + '.png'
    plt.savefig(savefname, dpi=300)
    print("Saving", savefname)
    if show:
        plt.show()
def process_one_channel(args):
    """Merge L1A brightness temperatures/radiances with RDR geometry for one
    channel and write the result out as CSV.

    Note: `mode` is read from module scope (set in the __main__ block).
    """
    # unpack argument tuple
    fn, l1a_channel, rdr_channel, tbout_all, radout_all, rdrdf, dfdate, dfutc = args
    print("Processing channel", rdr_channel)
    # filter out for channel of interest
    tbout = tbout_all.filter(regex=l1a_channel + '_')
    radout = radout_all.filter(regex=l1a_channel + '_')
    # rename detector names to rdr standard, and reverse detector numbering
    # for detectors of telescope B
    if l1a_channel.startswith('b'):
        tbout.columns = radout.columns = range(21, 0, -1)
    else:
        tbout.rename(columns=lambda x: int(x[3:]), inplace=True)
        radout.rename(columns=lambda x: int(x[3:]), inplace=True)
    rdrch = rdrdf[rdrdf.c == rdr_channel]
    rdrout = rdrch[['date', 'utc', 'jdate', 'clat', 'clon', 'sclat', 'sclon',
                    'scrad', 'orientlat', 'orientlon', 'sunlat', 'sunlon',
                    'sundist', 'orbit', 'scalt', 'af', 'c', 'det', 'cemis',
                    'cloctime', 'qca', 'qge', 'qmi']]
    # rename orientlat/orientlon to vert_lat/vert_lon (and sundist to sundst)
    # to match the column names divdata uses
    rdrout.columns = ['date', 'utc', 'jdate', 'clat', 'clon', 'sclat',
                      'sclon', 'scrad', 'vert_lat', 'vert_lon', 'sunlat',
                      'sunlon', 'sundst', 'orbit', 'scalt', 'af', 'c', 'det',
                      'cemis', 'cloctime', 'qca', 'qge', 'qmi']
    # because the index of tbout is shorter than df's, only the dates for
    # tbout's index should be picked out of df
    tbout['date'] = dfdate
    tbout['utc'] = dfutc
    radout['date'] = dfdate
    radout['utc'] = dfutc
    tbmolten = pd.melt(tbout, id_vars=['date', 'utc'],
                       value_vars=range(1, 22))
    radmolten = pd.melt(radout, id_vars=['date', 'utc'],
                        value_vars=range(1, 22))
    # the melting process left funny column names. repair.
    tbmolten.columns = ['date', 'utc', 'det', 'tb']
    radmolten.columns = ['date', 'utc', 'det', 'radiance']
    data_out = tbmolten.merge(rdrout, on=['date', 'utc', 'det'])
    data_out = radmolten.merge(data_out, on=['date', 'utc', 'det'])
    print("Merged successfully. Writing out to csv now.")
    # create filename
    basename = fn.timestr + '_C' + str(rdr_channel) + '_' + mode + '_RDR20.csv'
    dirname = fu.get_month_sample_path_from_mode(mode)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    outfname = pjoin(dirname, basename)
    # don't write out the meaningless integer index
    data_out.to_csv(outfname, index=False)
    print("Finished", os.path.basename(outfname))
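def _melt_example():
    # Illustration only (assumption, not used by the pipeline): how pd.melt
    # turns the wide per-detector layout into the long layout merged above.
    # Values and timestamps are made up.
    wide = pd.DataFrame({'date': ['2011-01-01'], 'utc': ['00:00:01'],
                         1: [210.0], 2: [211.5]})
    long = pd.melt(wide, id_vars=['date', 'utc'], value_vars=[1, 2])
    # melt yields columns ['date', 'utc', 'variable', 'value']; rename them
    # the same way process_one_channel does
    long.columns = ['date', 'utc', 'det', 'tb']
    return long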
def get_data_from_hour(mode, ch, timestr):
    """Read one hour's channel CSV back into a DataFrame.

    Only the first two characters of `ch` (e.g. 'C7') are used to build the
    filename.
    """
    ch = ch[:2]
    dirname = fu.get_month_sample_path_from_mode(mode)
    fname = '_'.join([timestr, ch, mode, 'RDR20'])
    fname += '.csv'
    return pd.io.parsers.read_csv(os.path.join(dirname, fname))
if __name__ == '__main__':
    if len(sys.argv) < 2:
        usage()
    timestr = sys.argv[1]
    fnames = glob.glob(os.path.join(fu.l1adatapath, timestr + '*_L1A.TAB'))
    fnames.sort()
    if sys.argv[2] == 'test':
        pprint.pprint(fnames)
        sys.exit()
    mode = sys.argv[2]
    cpus = sys.argv[3]
    # find outpaths that are done
    try:
        fnames_done = glob.glob(
            pjoin(fu.get_month_sample_path_from_mode(mode), '*.csv'))
        timestrs_done = [fu.FileName(i).timestr for i in fnames_done]
        fnames_todo = [i for i in fnames
                       if timestrs_done.count(fu.FileName(i).timestr) < 7]
    except OSError:
        fnames_todo = fnames
    fnames_todo.sort()
    # bundle the arguments into tuples so pool.map needs only one parameter
    list_of_input_tuples = [(i, mode) for i in fnames_todo]
    pool = mypool.MyPool(int(cpus))
    pool.map(main, list_of_input_tuples)
    pool.close()
    pool.join()
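# Command-line sketch (assumption; the script name and argument values are
# examples): process all L1A files whose names start with '2011040' in
# 'night' mode on 8 worker processes,
#
#   python this_script.py 2011040 night 8
#
# or just list the matching files without processing by passing 'test' as
# the mode argument:
#
#   python this_script.py 2011040 test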