Example #1
0
def load_isd_inventory(bucket_name):
    """
    Load the isd_inventory into a dataframe if it already exists.
    If it doesn't exist, download it from NOAA.

    Parameters
    ----------
    bucket_name : str
        Name of the S3 bucket that may hold a cached 'isd-inventory.csv'.

    Returns
    -------
    pandas.DataFrame
        Inventory indexed by 'Station-Year', with string-typed station
        identifier columns and a 'Last_Updated' datetime column.
    """
    s3 = boto3.resource('s3')
    try:
        # S3 Body.read() returns bytes; decode before wrapping in StringIO
        # so pd.read_csv receives a text buffer.
        inventory = StringIO(s3.Object(bucket_name, 'isd-inventory.csv')
                             .get()['Body'].read().decode('utf-8'))
        is_from_NOAA = False
    except botocore.exceptions.ClientError:
        # Cached copy missing: get the current isd-inventory from NOAA's
        # ftp server instead.
        inventory = robust_get_from_NOAA_ftp(
            '/pub/data/noaa/', 'isd-inventory.csv')
        is_from_NOAA = True
    # Keep identifier columns as strings so leading zeros survive; dtype
    # keys absent from a given file (e.g. 'ID' on a fresh NOAA download)
    # are ignored by read_csv.
    inventory = pd.read_csv(
        inventory, dtype={col: str for col in ['ID', 'USAF', 'WBAN', 'YEAR']})
    if is_from_NOAA:
        # Add new columns & initialize download records to a date
        # before NOAA's ftp server existed (the Unix epoch).
        inventory.insert(0, 'ID', inventory['USAF']+'-'+inventory['WBAN'])
        inventory['Station-Year'] = inventory['ID']+'-'+inventory['YEAR']
        inventory['Last_Updated'] = pd.to_datetime(0)
    else:
        inventory['Last_Updated'] = pd.to_datetime(inventory['Last_Updated'])
    inventory.set_index('Station-Year', inplace=True)
    inventory = organize_inventory_cols(inventory)
    return inventory
Example #2
0
def load_isd_inventory(bucket_name):
    """
    Load the isd_inventory into a dataframe if it already exists.
    If it doesn't exist, download it from NOAA.

    Parameters
    ----------
    bucket_name : str
        Name of the S3 bucket that may hold a cached 'isd-inventory.csv'.

    Returns
    -------
    pandas.DataFrame
        Inventory with string-typed USAF/WBAN columns; 'ID' and
        'Last_Updated' columns are added when absent.
    """
    s3 = boto3.resource('s3')
    try:
        # S3 Body.read() returns bytes; decode into a StringIO so
        # pd.read_csv receives a text buffer (raw bytes are not a
        # valid read_csv input).
        inventory = StringIO(s3.Object(bucket_name, 'isd-inventory.csv')
                             .get()['Body'].read().decode('utf-8'))
    except botocore.exceptions.ClientError:
        # Cached copy missing: get the current isd-inventory from NOAA's
        # ftp server.
        ftp = FTP('ftp.ncdc.noaa.gov')
        ftp.login()
        ftp.cwd('/pub/data/noaa/')
        inventory = StringIO()
        # retrbinary hands the callback bytes, which StringIO.write
        # rejects; retrlines delivers decoded text lines (sans line
        # endings), so re-append '\n' per line.
        ftp.retrlines('RETR isd-inventory.csv',
                      lambda line: inventory.write(line + '\n'))
        inventory.seek(0)
    # Keep station identifiers as strings so leading zeros survive.
    inventory = pd.read_csv(
        inventory, dtype={col: str for col in ['USAF', 'WBAN']})
    if 'ID' not in inventory.columns:
        inventory.insert(0, 'ID', inventory['USAF']+'-'+inventory['WBAN'])
    if 'Last_Updated' not in inventory.columns:
        # initialize download records to date before NOAA ftp server existed
        inventory['Last_Updated'] = pd.to_datetime(0)
    return inventory
Example #3
0
def fix_l99902(source):
    """
    Force utf-8 encoding.

    Insert a '# coding: utf-8' declaration (after a shebang, if any)
    when neither of the first two lines already declares a coding and
    the source is longer than two lines.
    """
    lines = StringIO(source).readlines()
    if len(lines) <= 2:
        # Too short to bother — return the source unchanged.
        return ''.join(lines)
    already_declared = any('coding' in line.lower() for line in lines[:2])
    if not already_declared:
        # Keep a shebang on line 1; the coding line goes right below it.
        insert_at = 1 if lines[0].startswith("#!") else 0
        lines.insert(insert_at, '# coding: utf-8\n')
    return ''.join(lines)
Example #4
0
# NOTE(review): Python-2-era pandas cheat-sheet notes. Names such as tp1,
# d0, d2, d4, dt, ts, p1-p3 are defined elsewhere (not in this view), and
# several APIs used below (.ix, join_axes=, pd.core.common.rands, xrange)
# were removed from later pandas/Python releases — kept verbatim as
# reference material; do not expect these lines to run on modern stacks.
ix1=pd.MultiIndex.from_tuples(tp1,names=['first','second']); d5=pd.DataFrame(randn(8,4),index=ix1,columns=['A','B','C','D']); d6=d5[:4]; d7=d5.stack(); d7.unstack(); # multi-idx can be converted to cols
###? result.columns.levels # labels for multi-index; multi-ix order matters
# pd.pivot_table(d5,values='D',rows=['A','B'],cols=['C']) # summary_table - grp by A,B,C

# concat df (row-wise by default; axis=1 joins column-wise)
d=d0.copy(); pd.concat([d[:2],d[2:5],d[5:]]) # rows
pd.concat([d.ix[:,'A':'B'],d.ix[1:3,'C':'D']],axis=1) # cols; note df_single_col=TimeSeries
# o1=pd.concat([p1,p2,p3],keys=['first','second','third'],join='outer') # generates hierarchial_multi-index (multi-ix order matters); can use multiple_keys,dict etc;
d=pd.DataFrame(randn(10,4),columns=['a','b','c','d'],index=[pd.core.common.rands(5) for _ in xrange(10)]) # rand_strings as index labels
pd.concat([d.ix[:7,['a','b']],d.ix[2:-2,['c']],d.ix[-7:,['d']]],axis=1,join_axes=[d.index]) # ix_orig (othw ix_sorted)
pd.concat([d.ix[:7,['a','b']],d.ix[2:-2,['c']],d.ix[-7:,['d']]],join='inner')
# add_row/col,copy,reindex,sql-like merge,fill_nan
ts2=pd.Series([1,3,5,np.nan,6,8],index=dt[:6]); d.append([d.ix[1,],d.ix[0,]]); d.append(ts2.T,ignore_index=True); # d is NOT modified; append rows broken???
d.loc[:,'d']=np.array([5]*len(d)); d['g']=ts2[0:4] # cols; data outside of "master date list" is lost
d5=d.copy(); d6=d4.pop('C'); del d['g']
d.insert(1,'bar',d['b']) # args: position, label, data
d6=d.reindex(index=dt[[0,1,4]],columns=list(d.columns)+['E']) # can modify row/col names (can extract data and construct new df);
d.rename(columns={'one' : 'foo','two' : 'bar'},index={'a' : 'apple','b' : 'banana','d' : 'durian'}) # rename
# pd.DataFrame(np.asarray(d),index=new_index,columns=new_cols); # inefficient but works; d.index=xx; d.columns=xx; d.name=xx;
d7=pd.DataFrame({'key':['fo','fo'],'val1':[1,2]}); d8=pd.DataFrame({'key':['fo','fo'],'val2':[4,5]}); pd.merge(d7,d8,on='key') # sql-like merge,very high eff;
d.combine_first(d2) # ~fill_nan pref1,pref2, ~d(isnan(d))=d2(isnan(d));

# process nan (comparisons with NaN yield False, so masks drop NaN rows)
d[0<d]; # NaN's if no data
d[0<d.a]; d[0<d.iloc[:,0]]; # d(d(:,1)<0,:) select rows
d.dropna(how='any'); d.fillna(value=5); pd.isnull(d)
# f=lambda x:x.fillna(x.mean()); grp=xx; d3=grp.transform(f) # fill with grp mean

# stat,grouping (reductions skip missing data by default)
d.mean(1); ts.value_counts(); # mean etc excludes missing data
d.apply(np.cumsum); d.apply(lambda x:x.max()-x.min());