import boto3
import botocore
import pandas as pd
from ftplib import FTP
from StringIO import StringIO  # io.StringIO on Python 3


def load_isd_inventory(bucket_name):
    """
    Load the isd_inventory into a dataframe if it already exists.
    If it doesn't exist, download it from NOAA.
    """
    s3 = boto3.resource('s3')
    try:
        inventory = StringIO(s3.Object(bucket_name, 'isd-inventory.csv')
                             .get()['Body'].read())
        is_from_NOAA = False
    except botocore.exceptions.ClientError:
        # Get the current isd-inventory from NOAA's ftp server
        inventory = robust_get_from_NOAA_ftp(
            '/pub/data/noaa/', 'isd-inventory.csv')
        is_from_NOAA = True
    inventory = pd.read_csv(
        inventory,
        dtype={col: str for col in ['ID', 'USAF', 'WBAN', 'YEAR']})
    if is_from_NOAA:
        # Add new columns & initialize download records to a date
        # before NOAA's ftp server existed
        inventory.insert(0, 'ID', inventory['USAF'] + '-' + inventory['WBAN'])
        inventory['Station-Year'] = inventory['ID'] + '-' + inventory['YEAR']
        inventory['Last_Updated'] = pd.to_datetime(0)
    else:
        inventory['Last_Updated'] = pd.to_datetime(inventory['Last_Updated'])
    inventory.set_index('Station-Year', inplace=True)
    inventory = organize_inventory_cols(inventory)
    return inventory
# Alternative version that talks to NOAA's ftp server directly via ftplib
# instead of the robust_get_from_NOAA_ftp helper:
def load_isd_inventory(bucket_name):
    """
    Load the isd_inventory into a dataframe if it already exists.
    If it doesn't exist, download it from NOAA.
    """
    s3 = boto3.resource('s3')
    try:
        # Wrap the raw text in a buffer so read_csv doesn't treat it as a path
        inventory = StringIO(s3.Object(bucket_name, 'isd-inventory.csv')
                             .get()['Body'].read())
    except botocore.exceptions.ClientError:
        # Get the current isd-inventory from NOAA's ftp server
        ftp = FTP('ftp.ncdc.noaa.gov')
        ftp.login()
        ftp.cwd('/pub/data/noaa/')
        inventory = StringIO()
        ftp.retrbinary('RETR isd-inventory.csv', inventory.write)
        inventory.seek(0)
    inventory = pd.read_csv(
        inventory, dtype={col: str for col in ['USAF', 'WBAN']})
    if 'ID' not in inventory.columns:
        inventory.insert(0, 'ID', inventory['USAF'] + '-' + inventory['WBAN'])
    if 'Last_Updated' not in inventory.columns:
        # Initialize download records to a date before NOAA's ftp server existed
        inventory['Last_Updated'] = pd.to_datetime(0)
    return inventory
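# A minimal usage sketch for the loader above, assuming AWS credentials are
# already configured in the environment; the bucket name is a hypothetical
# placeholder, not part of the original code:
if __name__ == '__main__':
    inv = load_isd_inventory('my-isd-bucket')
    print(inv.head())  # one row per station-year of the ISD inventory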
def fix_l99902(source):
    """ Insert a utf-8 coding declaration if the source lacks one """
    lines = StringIO(source).readlines()
    if len(lines) > 2:
        if ('coding' not in lines[0].lower()
                and 'coding' not in lines[1].lower()):
            # Keep a shebang line first, if present
            pos = 1 if lines[0][:2] == "#!" else 0
            lines.insert(pos, '# coding: utf-8\n')
    return ''.join(lines)
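# A quick check of fix_l99902 (the sample source below is made up): the
# shebang stays first and the coding declaration lands right after it:
sample = "#!/usr/bin/env python\nimport os\nprint(os.name)\n"
print(fix_l99902(sample))
# -> #!/usr/bin/env python
#    # coding: utf-8
#    import os
#    print(os.name)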
# numpy helpers assumed throughout these notes
import numpy as np
from numpy.random import randn

# multi-index, stack/unstack
ix1 = pd.MultiIndex.from_tuples(tp1, names=['first', 'second'])
d5 = pd.DataFrame(randn(8, 4), index=ix1, columns=['A', 'B', 'C', 'D'])
d6 = d5[:4]
d7 = d5.stack()
d7.unstack()  # multi-idx can be converted to cols ###?
result.columns.levels  # labels for multi-index; multi-ix order matters
# pd.pivot_table(d5,values='D',rows=['A','B'],cols=['C'])  # summary_table - grp by A,B,C

# concat df
d = d0.copy()
pd.concat([d[:2], d[2:5], d[5:]])  # rows
pd.concat([d.ix[:, 'A':'B'], d.ix[1:3, 'C':'D']], axis=1)  # cols; note df_single_col=TimeSeries
# o1=pd.concat([p1,p2,p3],keys=['first','second','third'],join='outer')
#   generates hierarchical multi-index (multi-ix order matters); can use multiple keys, dict etc.
d = pd.DataFrame(randn(10, 4), columns=['a', 'b', 'c', 'd'],
                 index=[pd.core.common.rands(5) for _ in xrange(10)])  # rand_strings
pd.concat([d.ix[:7, ['a', 'b']], d.ix[2:-2, ['c']], d.ix[-7:, ['d']]],
          axis=1, join_axes=[d.index])  # ix_orig (otherwise ix_sorted)
pd.concat([d.ix[:7, ['a', 'b']], d.ix[2:-2, ['c']], d.ix[-7:, ['d']]], join='inner')

# add_row/col, copy, reindex, sql-like merge, fill_nan
ts2 = pd.Series([1, 3, 5, np.nan, 6, 8], index=dt[:6])
d.append([d.ix[1, ], d.ix[0, ]])
d.append(ts2.T, ignore_index=True)  # d is NOT modified; append rows broken???
d.loc[:, 'd'] = np.array([5] * len(d))
d['g'] = ts2[0:4]  # cols; data outside of "master date list" is lost
d5 = d.copy(); d6 = d4.pop('C'); del d['g']
d.insert(1, 'bar', d['b'])  # args posn, lbl, data
d6 = d.reindex(index=dt[[0, 1, 4]], columns=list(d.columns) + ['E'])
# can modify row/col names (can extract data and construct new df)
d.rename(columns={'one': 'foo', 'two': 'bar'},
         index={'a': 'apple', 'b': 'banana', 'd': 'durian'})  # rename
# pd.DataFrame(np.asarray(d),index=new_index,columns=new_cols)  # inefficient but works
# d.index=xx; d.columns=xx; d.name=xx
d7 = pd.DataFrame({'key': ['fo', 'fo'], 'val1': [1, 2]})
d8 = pd.DataFrame({'key': ['fo', 'fo'], 'val2': [4, 5]})
pd.merge(d7, d8, on='key')  # sql-like merge, very efficient
d.combine_first(d2)  # ~fill_nan pref1,pref2; ~ d(isnan(d))=d2(isnan(d))

# process nan
d[0 < d]  # NaN's where no data / condition fails
d[0 < d.a]; d[0 < d.iloc[:, 0]]  # d(0<d(:,1),:) select rows
d.dropna(how='any'); d.fillna(value=5); pd.isnull(d)
# f=lambda x: x.fillna(x.mean()); grp=xx; d3=grp.transform(f)  # fill with grp mean (see sketch below)

# stat, grouping
d.mean(1); ts.value_counts()  # mean etc excludes missing data
d.apply(np.cumsum); d.apply(lambda x: x.max() - x.min())
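# A self-contained sketch of the grp.transform(f) note above: fill NaNs with
# per-group means (frame, column, and group names here are made up):
import numpy as np
import pandas as pd

df = pd.DataFrame({'grp': ['a', 'a', 'b', 'b'],
                   'val': [1.0, np.nan, 3.0, 5.0]})
f = lambda x: x.fillna(x.mean())
df['val'] = df.groupby('grp')['val'].transform(f)
# group 'a' has mean 1.0, so its NaN becomes 1.0; group 'b' is unchanged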