def create_phase_index(debug = False, **kwargs):
    """
    Classify each analysis year into an ENSO phase from seasonal-mean MEI.

    kwargs is kwgroups['mei'] and must contain 'months', 'startyr',
    'n_mon' (months per season) and 'n_year' (number of seasons).

    Returns
    -------
    mei_avg : ndarray, shape (n_year,)
        Seasonal-mean MEI value for each year.
    phaseind : dict of boolean ndarrays
        Keys 'elnino', 'lanina', 'neutral', 'neutpos', 'neutneg',
        'allyears'; each flags the years whose mei_avg falls in that
        phase's cutoff interval.
    """
    from numpy import sort, where, arange, zeros, inf
    from transform import slp_tf

    mei = load_mei()
    tran = slp_tf()
    startmon = int(tran[kwargs['months'][0]])
    startyr = kwargs['startyr']

    # Position of the first month of the first season in the MEI series.
    # where() returns a tuple; adding it to arange() broadcasts, so each
    # pass appends one index array covering a full season.
    idx_start = where((mei.index.year == startyr) & (mei.index.month == startmon))
    idx = []
    for n in range(kwargs['n_year']):
        idx.extend(arange(kwargs['n_mon']) + idx_start + 12 * n)

    # Seasonal mean MEI per year.
    mei_avg = zeros((kwargs['n_year']))
    for year, mons in enumerate(idx):
        mei_avg[year] = mei.values[mons].mean()

    # Rank the seasonal means to locate phase cutoffs: roughly the top
    # 34% of positive (negative) years are El Nino (La Nina); the
    # remainder are neutral-positive / neutral-negative.
    ranked = sort(mei_avg)
    pos = ranked[ranked > 0]
    neg = ranked[ranked < 0]
    n_el = int(round(len(pos) * 0.34))
    n_la = int(round(len(neg) * 0.34))
    n_np = int(len(pos) - n_el)

    # NOTE(review): 'N' starts at neg[n_la + 1], which skips the value at
    # neg[n_la] (the lower bound of 'nn') — confirm that gap is intended.
    cutoffs = {
        'la' : (neg[0], neg[n_la - 1]),
        'nn' : (neg[n_la], neg[-1]),
        'np' : (pos[0], pos[n_np - 1]),
        'el' : (pos[-n_el], pos[-1]),
        'N'  : (neg[n_la + 1], pos[n_np - 1])
    }
    phaseind = {
        'elnino'   : (mei_avg >= cutoffs['el'][0]) & (mei_avg <= cutoffs['el'][1]),
        'lanina'   : (mei_avg >= cutoffs['la'][0]) & (mei_avg <= cutoffs['la'][1]),
        'neutral'  : (mei_avg >= cutoffs['N'][0]) & (mei_avg <= cutoffs['N'][1]),
        'neutpos'  : (mei_avg >= cutoffs['np'][0]) & (mei_avg <= cutoffs['np'][1]),
        'neutneg'  : (mei_avg >= cutoffs['nn'][0]) & (mei_avg <= cutoffs['nn'][1]),
        'allyears' : (mei_avg >= -inf)
    }
    return mei_avg, phaseind
def load_climdiv_dataframes(debug = False, **kwargs): ################## ###LOAD MODULES### ################## import numpy as np import pandas as pd #_when you call the function, you use **kwgroups['climdiv'] #_then, kwargs within the function is kwgroups['climdiv'] #_kwgroups is made by calling create_kwgroups (line 177 in data_load) fp = kwargs['filin'] #_see importStates function states = importStates() #_see importDivs function divnums = importDivs() #_look at the text file that fp points to #_we use loadtxt with dtype set to 'string', so that every element in dat is a string #_the first column is the division code (see the readme in the climdiv folder) dat = np.loadtxt(fp, dtype=str) #_now we'll split dat into an nx1 array containing division codes, #_and the data into a nx12 array containing the monthly data climcodes = dat[:,0] climdata = dat[:,1:] #_now i needed to extract the division codes from the year/month part of climcodes divcodes = [] years = [] for item in climcodes: divcodes.append(item[:4]) #_take through the 4th letter in the string years.append(item[-4:]) #_take from 4 from the end to the end. #_the middle items are left out because we know it's precipitation #_now to the lists into arrays divcodes = np.array(divcodes) #_okay this is where we loop through all our arrays, and append the data for one #_division into one long monthly time series. #_alldata is a dictionary where the key is the division name (i.e. 'Alabama-01', a string) #_use the 'next' command with pdb uncommented to see what is what at every step through the loop. 
alldata = {} divnames = [] #import pdb; pdb.set_trace() for sc in sorted(states): for dc in divnums: division = sc+dc idx = np.where(divcodes == division)[0] #_np.where returns a tuple divdata = [] if len(idx) > 0: for year in idx: yearlydata = climdata[year] for month in range(12): divdata.append(np.float(yearlydata[month])) divname = states[sc] + '-' + dc divnames.append(divname) alldata[divname] = divdata else: pass #import pdb; pdb.set_trace() #_calculate the number of months nperiods = 12*(int(years[-1]) - int(years[0]) + 1) #_start the year at the first year indstartyr = years[0] #_use pandas date_range function to form an index for the data frame. use the help command to see how it works. index = pd.date_range(indstartyr, periods = nperiods, freq = 'M') #_make a data frame from a dictionary (pandas method, which is why we put all the data in a dictinoary earlier) #_it automatically sets the column names as the dictionary keys data = pd.DataFrame.from_dict(alldata) #_now we set the index in the data frame to the one we created data = data.set_index(index) #_replace the missing values with nans to make calculations work corerctly data = data.replace(to_replace = -9.99, value = np.nan) ###################################################### #_from here on out, use pdb.set_trace() to chug through #_and figure out what's going on. It's a little hacky, #_but that's how I roll. #_data is now a data frame with all the months and years in it in order: #_we need data output as seasonal totals for the set of years we #_want to analyze. so all the if statements make new dataframes after #_combining the data the appropriate way, depending on how kwgroups['climdiv'] is set #_through the create_kwgroups function. 
n = len(index) #_transform for start date from transform import slp_tf tf = slp_tf() #_Now extract if kwargs['months'][-1] > 12: start = str(kwargs['startyr'] + 1) + '-' + tf[kwargs['months'][-1]] nperiods = kwargs['endyr'] - kwargs['startyr'] rangeyrs = range(kwargs['startyr'] + 1 , kwargs['endyr'] + 1) else: start = str(kwargs['startyr']) + '-' + tf[kwargs['months'][-1]] nperiods = kwargs['endyr'] - kwargs['startyr'] + 1 rangeyrs = range(kwargs['startyr'], kwargs['endyr'] + 1) if debug: print rangeyrs print 'Start string is %s' % (start) index = pd.date_range(start, periods = nperiods, freq = '12M') newdataframe = pd.DataFrame(columns = index) indyears = data.index.year indmonths = data.index.month for year in rangeyrs: idx = np.repeat(False, n) bools = (indyears==year) & (indmonths == month) month = kwargs['months'][0] x = len(kwargs['months']) bools = (indyears==year) & (indmonths == month) loc = np.where(bools)[0] for y in range(x): idx[loc+y] = True """ This was the old code, they should both basically do the same thing though now it can split over years (i.e. start in N) for month in kwargs['months']: bools.append((indyears==year) & (indmonths == month)) for b in bools: idx = idx | b """ newdataframe[str(year)] = data[idx].sum() newdataframe = newdataframe.T dataframes = {} for code in states: state = states[code] divlist = [] for div in divnames: if div[:-3] == state: divlist.append(div) dataframes[state] = pd.DataFrame() for div in divlist: dataframes[state][div] = newdataframe[div] regions = importRegions() regionalDF = {} alldivDF = pd.DataFrame(index = dataframes['Wisconsin'].index) #import pdb; pdb.set_trace() for region in regions: regionalDF[region] = pd.DataFrame(index = dataframes['Wisconsin'].index) for state in regions[region]: for div in dataframes[state]: alldivDF[div] = dataframes[state][div] regionalDF[region][div] = dataframes[state][div] return alldivDF, regionalDF, dataframes
def load_slp(newFormat = False, debug = False, anomalies = True, **kwargs): """ This function loads HADSLP2r data. """ from transform import slp_tf, int_to_month from netCDF4 import Dataset from sklearn.preprocessing import scale from numpy import arange, zeros, where from os.path import isfile import pandas as pd import pickle transform = slp_tf() #This is for transforming kwargs into DLargs DLargs = { 'startmon' : transform[kwargs['months'][0]], 'endmon' : transform[kwargs['months'][-1]], 'startyr' : str(kwargs['startyr']), 'endyr' : str(kwargs['endyr']), 'nbox' : str(kwargs['n_mon']) } i2m = int_to_month() #_Use in naming convention fp = EV['DATA'] + '/nipa/SLP/' + i2m[kwargs['months'][0]] + \ DLargs['startyr'] + '_' + i2m[kwargs['months'][-1]] + \ DLargs['endyr'] + '_nbox_' + DLargs['nbox'] if isfile(fp): #print 'Using pickled SLP' f = open(fp) slpdata = pickle.load(f) f.close() if newFormat: from collections import namedtuple seasonal_var = namedtuple('seasonal_var', ('data','lat','lon')) slp = seasonal_var(slpdata['grid'], slpdata['lat'], slpdata['lon']) return slp return slpdata print 'Creating new SLP pickle from netCDF file' #_Next block takes the netCDF file and extracts the time to make #_a time index. 
nc_fp = EV['DATA'] + '/netCDF/slp.mnmean.real.nc' dat = Dataset(nc_fp) t = dat.variables['time'] extractargs = { 'start' : '1850-01', 'periods' : len(t[:]), 'freq' : 'M', } timeindex = pd.date_range(**extractargs) #Need to get start and end out of time index startyr = kwargs['startyr'] startmon = int(DLargs['startmon']) idx_start = where((timeindex.year == startyr) & (timeindex.month == startmon)) idx = [] [idx.extend(arange(kwargs['n_mon']) + idx_start + 12*n) for n in range(kwargs['n_year'])] """ This is how sst open dap does it but doesn't work for this idx = ((timeindex.year >= int(DLargs['startyr'])) & \ ((timeindex.month >= int(DLargs['startmon'])) & \ (timeindex.month <= int(DLargs['endmon'])))) & \ ((timeindex.year <= int(DLargs['endyr']))) """ if debug: print timeindex[idx][:10] lat = dat.variables['lat'][:] lon = dat.variables['lon'][:] slp = dat.variables['slp'][:] nlat = len(lat) nlon = len(lon) time = timeindex[idx] slpavg = zeros((kwargs['n_year'], nlat, nlon)) for year, mons in enumerate(idx): slpavg[year] = slp[mons].mean(axis=0) if debug: print 'Averaging ', mons #WHERE TO SCALE THE DATA? for i in range(nlat): for j in range(nlon): slpavg[:,i,j] = scale(slpavg[:,i,j]) slpdata = { 'grid' : slpavg, 'lat' : lat, 'lon' : lon } f = open(fp,'w') pickle.dump(slpdata,f) print 'SLP data saved to %s' % (fp) f.close() if newFormat: from collections import namedtuple seasonal_var = namedtuple('seasonal_var', ('data','lat','lon')) slp = seasonal_var(slpdata['grid'], slpdata['lat'], slpdata['lon']) return slp return slpdata