Example #1
0
def dataLoader(stationDict, startDate, endDate):

    # Figure out which indice we are downloading
    stationNum = int(stationDict['ID'])

    # Grab the ENSO Nino3.4 data (SST and Anom)
    if stationNum == 1 or stationNum == 2:

        # We'll get as much weekly data as we can, then backfill with monthly data
        # Here are the relevant URLs
        urlMonth = 'http://www.cpc.ncep.noaa.gov/data/indices/sstoi.indices'
        urlWeek = 'http://www.cpc.ncep.noaa.gov/data/indices/wksst8110.for'

        # Get the data
        dataMonth = requests.get(urlMonth)
        dataWeek = requests.get(urlWeek)

        # Process the monthly data
        dataMonth = StringIO(dataMonth.content.decode('utf-8'))
        dataMonth = dataMonth.readlines()
        timestamps = []
        ssts = []
        anoms = []
        for line in dataMonth[1:]:
            values = line.split()
            year = str(values[0])
            month = '0' + str(values[1])
            timestamps.append(
                pd.to_datetime(year + month[-2:] + '15', format='%Y%m%d'))
            ssts.append(float(values[8]))
            anoms.append(float(values[9]))

        dfMonth = pd.DataFrame(np.array([ssts, anoms]).T,
                               index=timestamps,
                               columns=[
                                   'Nino3.4 SST | Indice | degC',
                                   'Nino3.4 ANOM | Indice | degC'
                               ])

        # Process the weekly data
        dataWeek = StringIO(dataWeek.content.decode('utf-8'))
        dataWeek = dataWeek.readlines()
        timestamps = []
        ssts = []
        anoms = []
        for line in dataWeek[4:]:
            values = line.split('     ')
            timestamps.append(pd.to_datetime(values[0]))
            ssts.append(float(values[3][:4]))
            anoms.append(float(values[3][4:]))

        dfWeek = pd.DataFrame(np.array([ssts, anoms]).T,
                              index=timestamps,
                              columns=[
                                  'Nino3.4 SST | Indice | degC',
                                  'Nino3.4 ANOM | Indice | degC'
                              ])
        print(dfWeek)

        # Merge the 2 datasets, keeping all the weekly data and cutting some monthly
        dfMonth = dfMonth[dfMonth.index < dfWeek.index[0]]

        dfCombined = pd.concat([dfMonth, dfWeek]).resample('D').mean()
        dfCombined = dfCombined.fillna(method='ffill')
        dfCombined = dfCombined[dfCombined.index >= startDate]

        df = pd.DataFrame(index=pd.date_range(startDate, endDate))
        df = pd.concat([df, dfCombined], axis=1)
        df = df[df.index >= startDate]
        df = df[df.index <= endDate]

        # Return the correct dataset
        if str(stationNum) == '1':
            del df['Nino3.4 ANOM | Indice | degC']
            return df
        else:
            del df['Nino3.4 SST | Indice | degC']
            return df

    # Otherwise, we'll grab the PNA dataset
    elif stationNum == 3:

        url = "ftp://ftp.cpc.ncep.noaa.gov/cwlinks/norm.daily.pna.index.b500101.current.ascii"
        dataDaily = pd.read_csv(url,
                                names=['year', 'month', 'day', 'PNA'],
                                sep='\s+',
                                error_bad_lines=False,
                                converters={
                                    "year":
                                    int,
                                    "month":
                                    int,
                                    "day":
                                    lambda x: int(x[:2])
                                    if '*' in x else int(x)
                                })
        dataDaily.index = pd.to_datetime(dataDaily[['year', 'month', 'day']])
        del dataDaily['year'], dataDaily['month'], dataDaily['day']
        #url = "http://www.cpc.ncep.noaa.gov/products/precip/CWlink/pna/norm.pna.monthly.b5001.current.ascii"
        #dataMonth = pd.read_csv(url, names = ['year','month','PNA | Indice'], sep='\s+')
        #dataMonth['day'] = len(dataMonth.index)*[1]
        #datetimes = pd.to_datetime(dataMonth[['year','month','day']])
        #dataMonth.set_index(pd.DatetimeIndex(datetimes), inplace=True)
        #del dataMonth['year'], dataMonth['month'], dataMonth['day']
        #dataMonth = dataMonth.resample('D').mean()
        #lastDate = list(dataMonth.index)[-1]
        #if lastDate.month in [1,3,5,7,8,10,12]:
        #    endDay = 31
        #elif lastDate.month == 2:
        #    endDay = 28
        #else:
        #    endDay = 30
        #for day in range(lastDate.day,endDay + 1):
        #    dataMonth.loc[datetime(lastDate.year, lastDate.month, day)] = dataMonth.loc[lastDate]
        #dataMonth = dataMonth.fillna(method='ffill')
        dataDaily = dataDaily.fillna(method='ffill')
        #dataMonth = dataMonth[dataMonth.index >= startDate]
        #dataMonth = dataMonth[dataMonth.index <= endDate]
        dataDaily = dataDaily[dataDaily.index >= startDate]
        dataDaily = dataDaily[dataDaily.index <= endDate]
        df = pd.DataFrame(index=pd.date_range(startDate, endDate))
        df = pd.concat([df, dataDaily], axis=1)
        #df = pd.concat([df, dataMonth], axis = 1)

        return df

    elif stationNum == 4:
        """
        AMO Index"""
        url = 'https://www.esrl.noaa.gov/psd/data/correlation/amon.us.long.data'
        df = pd.read_csv(url,
                         skiprows=1,
                         names=[
                             'year', '1', '2', '3', '4', '5', '6', '7', '8',
                             '9', '10', '11', '12'
                         ],
                         sep='\s+')
        lastRow = df.index[df['year'] == 'AMO'].tolist()[0] - 1
        df = df[df.index < lastRow]
        df = df.melt(id_vars=['year'], var_name='month')
        dates = [
            str(df['year'][i]) + '-' + str(df['month'][i]) for i in df.index
        ]
        df.set_index(pd.DatetimeIndex(pd.to_datetime(dates, format='%Y/%m')),
                     inplace=True)
        df.sort_index(inplace=True)
        df['value'] = pd.to_numeric(df['value'])
        df.replace(to_replace=-99.990, value=np.nan, inplace=True)
        df = pd.DataFrame(df['value'])
        df = df.asfreq('D')
        lastDate = list(df.index)[-1]
        if lastDate.month in [1, 3, 5, 7, 8, 10, 12]:
            endDay = 31
        elif lastDate.month == 2:
            endDay = 28
        else:
            endDay = 30
        for day in range(lastDate.day, endDay + 1):
            df.loc[datetime(lastDate.year, lastDate.month,
                            day)] = df.loc[lastDate]
        df.fillna(method='ffill', inplace=True)
        df = df[df.index >= startDate]
        df = df[df.index <= endDate]

        return df

    # elif stationNum == 5:
    #     """
    #     Mauna Loa CO2 Trend
    #     """
    #     url = 'ftp://aftp.cmdl.noaa.gov/products/trends/co2/co2_mm_mlo.txt'
    #     df = pd.read_csv(url, index_col=False, sep='\s+', comment='#', names=['year','month','time','average_molFrac','interpolated_molFrac','trend','days'])
    #     dates = [str(df['year'][i])+'-'+str(df['month'][i]) for i in df.index]
    #     df.set_index(pd.DatetimeIndex(pd.to_datetime(dates, format='%Y/%m')), inplace=True)
    #     df = pd.DataFrame(df['trend'])
    #     df = df.asfreq('D')
    #     df.fillna(method='ffill',inplace=True)
    #     df = df[df.index>=startDate]
    #     df = df[df.index<=endDate]
    #     return df

    elif stationNum == 5:
        """
        Pacific Multidecadal Oscillation (PDO)
        """
        url = "https://www.ncdc.noaa.gov/teleconnections/pdo/data.json"
        response = requests.get(url)
        response = response.json()
        data = response['data']
        dates = [pd.to_datetime(i, format='%Y%m') for i in list(data.keys())]
        values = [float(val) for val in list(data.values())]
        values = [np.nan if x == -99.99 else x for x in values]
        df = pd.DataFrame(values, index=dates, columns=['PDO'])
        df = df.asfreq('D')
        lastDate = list(df.index)[-1]
        if lastDate.month in [1, 3, 5, 7, 8, 10, 12]:
            endDay = 31
        elif lastDate.month == 2:
            endDay = 28
        else:
            endDay = 30
        for day in range(lastDate.day, endDay + 1):
            df.loc[datetime(lastDate.year, lastDate.month,
                            day)] = df.loc[lastDate]
        df.fillna(method='ffill', inplace=True)
        df = df[df.index >= startDate]
        df = df[df.index <= endDate]
        return df

    elif stationNum == 6:
        """
        Arctic Oscillation Index (AOI) 
        """
        url = "https://www.cpc.ncep.noaa.gov/products/precip/CWlink/daily_ao_index/monthly.ao.index.b50.current.ascii"
        dataMonth = pd.read_csv(url,
                                names=['year', 'month', 'AOI | Indice'],
                                sep='\s+')
        dataMonth['day'] = len(dataMonth.index) * [1]
        datetimes = pd.to_datetime(dataMonth[['year', 'month', 'day']])
        dataMonth.set_index(pd.DatetimeIndex(datetimes), inplace=True)
        del dataMonth['year'], dataMonth['month'], dataMonth['day']
        dataMonth = dataMonth.resample('D').mean()
        dataMonth = dataMonth.fillna(method='ffill')
        dataMonth = dataMonth[dataMonth.index >= startDate]
        dataMonth = dataMonth[dataMonth.index <= endDate]
        df = pd.DataFrame(index=pd.date_range(startDate, endDate))
        df = pd.concat([df, dataMonth], axis=1)
        return df

    elif stationNum == 7:
        """
        Southern Oscillation Index (SOI) 
        """
        url = "https://www.esrl.noaa.gov/psd/data/correlation/soi.data"
        df = pd.read_csv(url,
                         skiprows=1,
                         names=[
                             'year', '1', '2', '3', '4', '5', '6', '7', '8',
                             '9', '10', '11', '12'
                         ],
                         sep='\s+')
        lastRow = df.index[df['year'] == 'SOI'].tolist()[0] - 1
        df = df[df.index < lastRow]
        df = df.melt(id_vars=['year'], var_name='month')
        dates = [
            str(df['year'][i]) + '-' + str(df['month'][i]) for i in df.index
        ]
        df.set_index(pd.DatetimeIndex(pd.to_datetime(dates, format='%Y/%m')),
                     inplace=True)
        df.sort_index(inplace=True)
        df['value'] = pd.to_numeric(df['value'])
        df.replace(to_replace=-99.990, value=np.nan, inplace=True)
        df = pd.DataFrame(df['value'])
        df = df.asfreq('D')
        lastDate = list(df.index)[-1]
        if lastDate.month in [1, 3, 5, 7, 8, 10, 12]:
            endDay = 31
        elif lastDate.month == 2:
            endDay = 28
        else:
            endDay = 30
        for day in range(lastDate.day, endDay + 1):
            df.loc[datetime(lastDate.year, lastDate.month,
                            day)] = df.loc[lastDate]
        df.fillna(method='ffill', inplace=True)
        df = df[df.index >= startDate]
        df = df[df.index <= endDate]
        return df

    elif stationNum == 8:
        """
        Multivariate ENSO Index (MEI) 
        """
        #https://psl.noaa.gov/enso/mei/data/meiv2.data
        url = "https://psl.noaa.gov/enso/mei/data/meiv2.data"
        df = pd.read_csv(url,
                         skiprows=1,
                         names=[
                             'year', '1', '2', '3', '4', '5', '6', '7', '8',
                             '9', '10', '11', '12'
                         ],
                         sep='\s+')
        lastRow = df.index[df['year'] == 'Multivariate'].tolist()[0] - 1
        df = df[df.index < lastRow]
        df = df.melt(id_vars=['year'], var_name='month')
        dates = [
            str(df['year'][i]) + '-' + str(df['month'][i]) for i in df.index
        ]
        df.set_index(pd.DatetimeIndex(pd.to_datetime(dates, format='%Y/%m')),
                     inplace=True)
        df.sort_index(inplace=True)
        df['value'] = pd.to_numeric(df['value'])
        df.replace(to_replace=-999.00, value=np.nan, inplace=True)
        df = pd.DataFrame(df['value'])
        df = df.asfreq('D')
        lastDate = list(df.index)[-1]
        if lastDate.month in [1, 3, 5, 7, 8, 10, 12]:
            endDay = 31
        elif lastDate.month == 2:
            endDay = 28
        else:
            endDay = 30
        for day in range(lastDate.day, endDay + 1):
            df.loc[datetime(lastDate.year, lastDate.month,
                            day)] = df.loc[lastDate]
        df.fillna(method='ffill', inplace=True)
        df = df[df.index >= startDate]
        df = df[df.index <= endDate]
        return df

    else:
        return pd.DataFrame()
Example #2
0
def dataLoader(stationDict, startDate, endDate):
    """
    This dataloader loads data from NOAA's Climate Prediction Center (CPC). The datasets
    are climate indices that are useful for long-range precipitation forecasting. 
    The "DatasetExternalID" option specifies which dataset should be downloaded. Valid 
    paramters are: 
    'nino3.4' - Nino 3.4 Sea Surface Temperature Anomaly (aka ENSO)
    'pna' - Pacific North American Index
    'amo' - Atlantic Multidecadal Oscillation
    'pdo' - Pacific Decadal Oscillation
    DEFAULT OPTIONS
    DatasetExternalID: nino3.4
    """

    # Figure out which indice we are downloading
    stationNum = stationDict['DatasetExternalID']

    # Grab the ENSO Nino3.4 data (SST and Anom)
    if stationNum == 'nino3.4':

        # We'll get as much weekly data as we can, then backfill with monthly data
        # Here are the relevant URLs
        urlMonth = 'http://www.cpc.ncep.noaa.gov/data/indices/sstoi.indices'
        urlWeek = 'http://www.cpc.ncep.noaa.gov/data/indices/wksst8110.for'

        # Get the data
        dataMonth = requests.get(urlMonth)
        dataWeek = requests.get(urlWeek)

        # Process the monthly data
        dataMonth = StringIO(dataMonth.content.decode('utf-8'))
        dataMonth = dataMonth.readlines()
        timestamps = []
        anoms = []
        for line in dataMonth[1:]:
            values = line.split()
            year = str(values[0])
            month = '0' + str(values[1])
            timestamps.append(
                pd.to_datetime(year + month[-2:] + '15', format='%Y%m%d'))
            anoms.append(float(values[9]))

        dfMonth = pd.DataFrame(np.array(anoms).T,
                               index=timestamps,
                               columns=['Nino3.4 ANOM | Indice | degC'])

        # Process the weekly data
        dataWeek = StringIO(dataWeek.content.decode('utf-8'))
        dataWeek = dataWeek.readlines()
        timestamps = []
        anoms = []
        for line in dataWeek[4:]:
            values = line.split('     ')
            timestamps.append(pd.to_datetime(values[0]))
            anoms.append(float(values[3][4:]))

        dfWeek = pd.DataFrame(np.array(anoms).T,
                              index=timestamps,
                              columns=['Nino3.4 ANOM | Indice | degC'])

        # Merge the 2 datasets, keeping all the weekly data and cutting some monthly
        dfMonth = dfMonth[dfMonth.index < dfWeek.index[0]]

        dfCombined = pd.concat([dfMonth, dfWeek]).resample('D').mean()
        dfCombined = dfCombined.fillna(method='ffill')
        dfCombined = dfCombined[dfCombined.index >= startDate]

        df = pd.DataFrame(index=pd.date_range(startDate, endDate))
        df = pd.concat([df, dfCombined], axis=1)
        df = df[df.index >= startDate]
        df = df[df.index <= endDate]

        # Return the correct dataset
        return df

    # Otherwise, we'll grab the PNA dataset
    elif stationNum == 'pna':
        url = "http://www.cpc.ncep.noaa.gov/products/precip/CWlink/pna/norm.pna.monthly.b5001.current.ascii"
        dataMonth = pd.read_csv(url,
                                names=['year', 'month', 'PNA | Indice'],
                                sep='\s+')
        dataMonth['day'] = len(dataMonth.index) * [1]
        datetimes = pd.to_datetime(dataMonth[['year', 'month', 'day']])
        dataMonth.set_index(pd.DatetimeIndex(datetimes), inplace=True)
        del dataMonth['year'], dataMonth['month'], dataMonth['day']
        dataMonth = dataMonth.resample('D').mean()
        dataMonth = dataMonth.fillna(method='ffill')
        dataMonth = dataMonth[dataMonth.index >= startDate]
        df = pd.DataFrame(index=pd.date_range(startDate, endDate))
        df = pd.concat([df, dataMonth], axis=1)
        return df

    elif stationNum == 'amo':
        """
        AMO Index"""
        url = 'https://www.esrl.noaa.gov/psd/data/correlation/amon.us.long.data'
        df = pd.read_csv(url,
                         skiprows=1,
                         names=[
                             'year', '1', '2', '3', '4', '5', '6', '7', '8',
                             '9', '10', '11', '12'
                         ],
                         sep='\s+')
        lastRow = df.index[df['year'] == 'AMO'].tolist()[0] - 1
        df = df[df.index < lastRow]
        df = df.melt(id_vars=['year'], var_name='month')
        dates = [
            str(df['year'][i]) + '-' + str(df['month'][i]) for i in df.index
        ]
        df.set_index(pd.DatetimeIndex(pd.to_datetime(dates, format='%Y/%m')),
                     inplace=True)
        df.sort_index(inplace=True)
        df['value'] = pd.to_numeric(df['value'])
        df.replace(to_replace=-99.990, value=np.nan, inplace=True)
        df = pd.DataFrame(df['value'])
        df = df.asfreq('D')
        df.fillna(method='ffill', inplace=True)
        df = df[df.index >= startDate]
        df = df[df.index <= endDate]

        return df

    # elif stationNum == 5:
    #     """
    #     Mauna Loa CO2 Trend
    #     """
    #     url = 'ftp://aftp.cmdl.noaa.gov/products/trends/co2/co2_mm_mlo.txt'
    #     df = pd.read_csv(url, index_col=False, sep='\s+', comment='#', names=['year','month','time','average_molFrac','interpolated_molFrac','trend','days'])
    #     dates = [str(df['year'][i])+'-'+str(df['month'][i]) for i in df.index]
    #     df.set_index(pd.DatetimeIndex(pd.to_datetime(dates, format='%Y/%m')), inplace=True)
    #     df = pd.DataFrame(df['trend'])
    #     df = df.asfreq('D')
    #     df.fillna(method='ffill',inplace=True)
    #     df = df[df.index>=startDate]
    #     df = df[df.index<=endDate]
    #     return df

    elif stationNum == 'pdo':
        """
        Pacific Multidecadal Oscillation (PDO)
        """
        url = "https://www.ncdc.noaa.gov/teleconnections/pdo/data.json"
        response = requests.get(url)
        response = response.json()
        data = response['data']
        dates = [pd.to_datetime(i, format='%Y%m') for i in list(data.keys())]
        values = [float(val) for val in list(data.values())]
        values = [np.nan if x == -99.99 else x for x in values]
        df = pd.DataFrame(values, index=dates, columns=['PDO'])
        df = df.asfreq('D')
        df.fillna(method='ffill', inplace=True)
        df = df[df.index >= startDate]
        df = df[df.index <= endDate]
        return df

    else:
        return pd.DataFrame()