Example #1
0
def hourly(location='Fresno, CA', days=1, start=None, end=None, years=1, use_cache=True, verbosity=1):
    """ Get detailed (hourly) weather data for the requested days and location

    The Weather Underground URL for Fresno, CA on 1/1/2011 is:
    http://www.wunderground.com/history/airport/KFAT/2011/1/1/DailyHistory.html?MR=1&format=1

    This will fail periodically on Travis, b/c wunderground says "No daily or hourly history data available"
    >> df = hourly('Fresno, CA', verbosity=-1)
    >> 1 <= len(df) <= 24 * 2
    True
    The time zone of the client where this is used to compose the first column label, hence the ellipsis
    >> df.columns  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    Index([u'Time...

    >> df = hourly('Fresno, CA', days=5, verbosity=-1)
    >> 24 * 4 <= len(df) <= 24 * (5 + 1) * 2
    True
    """
    airport_code = airport(location, default=location)

    if isinstance(days, int):
        start = start or None
        end = end or datetime.datetime.today().date()
        days = pd.date_range(end=end, periods=days)

    # refresh the cache each calendar month or each change in the number of days in the dataset
    cache_path = 'hourly-{}-{}-{:02d}-{:04d}.csv'.format(airport_code, days[-1].year, days[-1].month, len(days))
    cache_path = os.path.join(CACHE_PATH, cache_path)
    if use_cache:
        try:
            return pd.DataFrame.from_csv(cache_path)
        except:
            pass

    df = pd.DataFrame()
    for day in days:
        url = ('http://www.wunderground.com/history/airport/{airport_code}/{year}/{month}/{day}/DailyHistory.html?MR=1&format=1'.format(
               airport_code=airport_code,
               year=day.year,
               month=day.month,
               day=day.day))
        if verbosity > 1:
            print('GETing *.CSV using "{0}"'.format(url))
        buf = urllib.urlopen(url).read()
        if verbosity > 0:
            N = buf.count('\n')
            M = (buf.count(',') + N) / float(N)
            print('Retrieved CSV for airport code "{}" with appox. {} lines and {} columns = {} cells.'.format(
                  airport_code, N, int(round(M)), int(round(M)) * N))
        if (buf.count('\n') > 2) or ((buf.count('\n') > 1) and buf.split('\n')[1].count(',') > 0):
            table = util.read_csv(buf, format='header+values-list', numbers=True)
            columns = [s.strip() for s in table[0]]
            table = table[1:]
            tzs = [s[4:] for s in columns if (s[5:] in ['ST', 'DT'] and s[4] in 'PMCE' and s[:4].lower() == 'time')]
            if tzs:
                tz = tzs[0]
            else:
                tz = 'UTC'
            for rownum, row in enumerate(table):
                try:
                    table[rownum] = [util.make_tz_aware(row[0], tz)] + row[1:]
                except ValueError:
                    pass
            dates = [row[-1] for row in table]
            if not all(isinstance(date, (datetime.datetime, pd.Timestamp)) for date in dates):
                dates = [row[0] for row in table]
            if len(columns) == len(table[0]):
                df0 = pd.DataFrame(table, columns=columns, index=dates)
                df = df.append(df0)
            elif verbosity >= 0:
                msg = "The number of columns in the 1st row of the table:\n    {}\n    doesn't match the number of column labels:\n    {}\n".format(
                    table[0], columns)
                msg += "Wunderground.com probably can't find the airport: {} ({})\n    or the date: {}\n    in its database.\n".format(
                    airport_code, location, day)
                msg += "Attempted a GET request using the URI:\n    {0}\n".format(url)
                warnings.warn(msg)
    try:
        df.to_csv(cache_path)
    except:
        if verbosity > 0 and use_cache:
            from traceback import print_exc
            print_exc()
            warnings.warn('Unable to write weather data to cache file at {}'.format(cache_path))
    return df
Example #2
0
def daily(location='Fresno, CA', years=1, use_cache=True, verbosity=1):
    """Retrieve weather for the indicated airport code or 'City, ST' string.

    >>> df = daily('Camas, WA', verbosity=-1)
    >>> 365 <= len(df) <= 365 * 2 + 1
    True

    Sacramento data has gaps (airport KMCC):
        8/21/2013 is missing from 2013.
        Whole months are missing from 2014.
    >>> df = daily('Sacramento, CA', years=[2013], verbosity=-1)
    >>> 364 <= len(df) <= 365
    True
    >>> df.columns
    Index([u'PST', u'Max TemperatureF', u'Mean TemperatureF', u'Min TemperatureF', u'Max Dew PointF', u'MeanDew PointF', u'Min DewpointF', u'Max Humidity', u'Mean Humidity', u'Min Humidity', u'Max Sea Level PressureIn', u'Mean Sea Level PressureIn', u'Min Sea Level PressureIn', u'Max VisibilityMiles', u'Mean VisibilityMiles', u'Min VisibilityMiles', u'Max Wind SpeedMPH', u'Mean Wind SpeedMPH', u'Max Gust SpeedMPH', u'PrecipitationIn', u'CloudCover', u'Events', u'WindDirDegrees'], dtype='object')
    """
    this_year = datetime.date.today().year
    if isinstance(years, (int, float)):
        # current (incomplete) year doesn't count in total number of years
        # so 0 would return this calendar year's weather data
        years = np.arange(0, int(years) + 1)
    years = sorted(years)
    if not all(1900 <= yr <= this_year for yr in years):
        years = np.array([abs(yr) if (1900 <= abs(yr) <= this_year) else (this_year - abs(int(yr))) for yr in years])[::-1]

    airport_code = airport(location, default=location)

    # refresh the cache each time the start or end year changes
    cache_path = 'daily-{}-{}-{}.csv'.format(airport_code, years[0], years[-1])
    cache_path = os.path.join(CACHE_PATH, cache_path)
    if use_cache:
        try:
            return pd.DataFrame.from_csv(cache_path)
        except:
            pass

    df = pd.DataFrame()
    for year in years:
        url = ('http://www.wunderground.com/history/airport/{airport}/{yearstart}/1/1/'
               + 'CustomHistory.html?dayend=31&monthend=12&yearend={yearend}'
               + '&req_city=&req_state=&req_statename=&reqdb.zip=&reqdb.magic=&reqdb.wmo=&MR=1&format=1').format(
            airport=airport_code,
            yearstart=year,
            yearend=year
            )
        if verbosity > 1:
            print('GETing *.CSV using "{0}"'.format(url))
        buf = urllib.urlopen(url).read()
        if verbosity > 0:
            N = buf.count('\n')
            M = (buf.count(',') + N) / float(N)
            print('Retrieved CSV for airport code "{}" with appox. {} lines and {} columns = {} cells.'.format(
                  airport_code, N, int(round(M)), int(round(M)) * N))
            if verbosity > 2:
                print(buf)
        table = util.read_csv(buf, format='header+values-list', numbers=True)
        # # clean up the last column (if it contains <br> tags)
        table = [util.strip_br(row) if len(row) > 1 else row for row in table]
        # numcols = max(len(row) for row in table)
        # table = [row for row in table if len(row) == numcols]
        columns = table.pop(0)
        tzs = [s for s in columns if (s[1:] in ['ST', 'DT'] and s[0] in 'PMCE')]
        dates = [float('nan')] * len(table)
        for i, row in enumerate(table):
            for j, value in enumerate(row):
                if not value and value is not None:
                    value = 0
                    continue
                if columns[j] in tzs:
                    table[i][j] = util.make_tz_aware(value, tz=columns[j])
                    if isinstance(table[i][j], datetime.datetime):
                        dates[i] = table[i][j]
                        continue
                try:
                    table[i][j] = float(value)
                    if not (table[i][j] % 1):
                        table[i][j] = int(table[i][j])
                except:
                    pass
        df0 = pd.DataFrame(table, columns=columns, index=dates)
        df = df.append(df0)

    if verbosity > 1:
        print(df)

    try:
        df.to_csv(cache_path)
    except:
        if verbosity > 0 and use_cache:
            from traceback import print_exc
            print_exc()
            warnings.warn('Unable to write weather data to cache file at {}'.format(cache_path))

    return df