Exemple #1
0
def get_climate_data(place):
    """Collect monthly climate figures for *place* from its "Weather box".

    The page source is fetched with ``get_page_source``; the "Weather box"
    template is located with ``find_template`` (either directly on the page
    or inside a dedicated "<city> weatherbox" template the page refers to)
    and parsed with ``parse_infobox``.  Every parameter whose name starts
    with an entry of ``MONTHS`` is treated as a monthly value and filed
    under its category (the parameter name minus the month prefix).

    Returns a dict containing:

    * ``"page_error"`` - True when the page source could not be fetched
      (in that case the per-row lists are all empty).
    * ``"title"`` - the title returned by ``get_page_source``.
    * one list per name in ``ROWS``, filled in the order the parameters
      are yielded by the parsed infobox.  A ``None`` entry marks a month
      flagged as "no data" on the page.
    * ``"location"`` - the template's location field with square brackets
      removed, when present.
    * ``"observer"`` - whatever ``astrodata.process_location`` returned,
      only present when sun hours had to be derived from "percentsun".

    NOTE(review): assumes ``ROWS`` contains "sun"; the sun handling indexes
    ``result["sun"]`` directly, exactly as before.
    """

    def find_separate_weatherbox_template(data):
        """Return the "Weather box" text of a separately stored template.

        Returns "" when the page references no such template or when the
        template page cannot be fetched.
        """
        if data is False:
            return ""

        # {{cityname weatherbox}} seems to be the usual template name.
        # I'll just look for any template ending with weatherbox.
        # I've not seen a page this breaks on yet.

        # New York City includes its weatherbox through a reference
        # to {{New York City weatherbox/cached}}, where the /cached
        # template contains rendered HTML tables. I want to look at
        # "Template:New York City weatherbox" instead. Not sure how
        # common this is, but NYC is pretty major and handling it
        # is easy, so might as well.
        suffix = "weatherbox"
        index2 = max(
            data.find(suffix + "}}"),
            data.find(suffix + "/cached}}"),
            data.find(suffix + "|collapsed=Y}}")
        )
        if index2 == -1:
            return ""

        index1 = data.rfind("{{", 0, index2)
        if index1 == -1:
            # "weatherbox}}" without an opening "{{" before it is not a
            # template reference; slicing from -1 + 2 would build a
            # garbage template name.
            return ""

        # the name runs from just after "{{" up to and including the
        # word "weatherbox" (so "/cached" and "|collapsed=Y" are dropped)
        template_name = "Template:" + data[index1 + 2 : index2 + len(suffix)]

        weatherbox_title, template_source = get_page_source(template_name)
        if template_source is False:
            # couldn't get the template - fall back
            return ""

        return find_template(template_source, "Weather box")

    def parse(text):
        """Convert a weatherbox cell to a number.

        Returns None for the "-" no-data marker, 0 for "trace", and
        float(text) otherwise (ValueError propagates for anything else).
        """
        # pages write negative numbers with the Unicode minus sign or
        # with its HTML entity; normalise both to an ASCII hyphen
        text = text.strip().replace("−", "-").replace("&minus;", "-")
        if text == "-":
            # used on some pages to indicate a no data condition
            return None
        if text == "trace":
            # used on some pages to indicate essentially 0, I guess
            return 0

        return float(text)

    def month_number(month):
        """Convert a month name from MONTHS to its 1-based number."""
        return MONTHS.index(month) + 1

    def daily_to_monthly(daily, month):
        """Scale a per-day figure to a whole-month figure.

        A None (no data) input stays None instead of raising TypeError.
        """
        if daily is None:
            return None

        # use a non-leap year since I suspect monthly numbers are given
        # for non-leap Februarys
        days = calendar.monthrange(2013, month_number(month))[1]

        return daily * days

    result = {"page_error": False}
    for row_name in ROWS:
        result[row_name] = []

    result["title"], data = get_page_source(place)

    if data is False:
        # indicates a problem getting data - signal it so output
        # can be formatted accordingly
        result["page_error"] = True
        return result

    weatherbox = find_template(data, "Weather box")
    weatherbox_info = parse_infobox(weatherbox)

    if not weatherbox_info:
        # weatherbox not found directly on page
        # see if there's a dedicated city weather template we can look at
        weatherbox = find_separate_weatherbox_template(data).strip()
        weatherbox_info = parse_infobox(weatherbox)

    # (month, percent) pairs from "percentsun" parameters.  They are only
    # turned into sun hours after the loop, once we know whether the
    # template supplied real sun-hour data anywhere (before OR after the
    # percentsun parameters).
    percent_sun = []

    for key, value in weatherbox_info.items():
        # try to parse out location data - usually specifies a neighbourhood,
        # weather station, year range info, etc
        if key == "location":
            # trim off wikilink markers, the most common
            # wiki syntax in this field
            result["location"] = value.replace("[", "").replace("]", "")

        month = key[:3]
        if month not in MONTHS:
            continue

        category = key[3:].strip()  # take out the month to get data category
        if not category:
            # bare month name with no category - nothing to file it under
            # (and rsplit() below would return an empty list)
            continue

        value = parse(value)  # parse value as number (None means no data)

        # last token of category name is sometimes the unit
        # (C, F, mm, inch, etc)
        unit = category.rsplit(None, 1)[-1]

        if category in result:
            # straightforward putting the data in
            result[category].append(value)

        elif unit in UNIT_CONVERSIONS:
            # try to convert units to known ones
            for target_unit in UNIT_CONVERSIONS[unit]:
                # try to find a category we collect that we know how to
                # convert into; swap only the trailing unit token so the
                # same text earlier in the name is left alone
                converted_category = category[: -len(unit)] + target_unit
                if converted_category in result:
                    if value is None:
                        # keep the no-data marker rather than handing
                        # None to an arithmetic conversion
                        converted = None
                    else:
                        converted = UNIT_CONVERSIONS[unit][target_unit](value)
                    result[converted_category].append(converted)
                    break

        elif category == "d sun":
            # special handling for daily sun hours
            result["sun"].append(daily_to_monthly(value, month))

        elif category == "percentsun":
            percent_sun.append((month, value))

    # Assume a specific hour count is more precise than "% sunshine", so
    # only use percentsun if no other sun data was found.  Checking after
    # the loop (instead of testing len(result["sun"]) per parameter) lets
    # every month be converted, not just the first one.
    if percent_sun and not result["sun"]:
        if "observer" not in result:
            # will try to get lat,lng from wikipedia page if location
            # is not recognized by pyephem directly
            result["observer"] = astrodata.process_location(result["title"])

        observer = result["observer"]
        if observer is not False:
            for month, percent in percent_sun:
                if percent is None:
                    # no data for this month - keep the placeholder
                    result["sun"].append(None)
                    continue

                daylight = astrodata.month_daylight(observer, month_number(month))
                hours = (daylight.total_seconds() / 3600) * (percent / 100.0)
                result["sun"].append(round(hours, 1))

    return result
import calendar
import time
import datetime
import ephem

import astrodata
import climate

def format_comparison_row(month, exact, rough):
    """Format one line of the exact-vs-rough daylight comparison.

    *month* is the 1-based month number; *exact* and *rough* are the two
    daylight figures being compared.  The layout is
    "<month>  <exact>-<rough>  = <difference> <TAB>: <relative error>%".
    When *exact* is zero the relative error is undefined and is shown
    as "n/a" instead of raising ZeroDivisionError.
    """
    if exact:
        percent = str(round(100 * (rough - exact) / exact, 2)) + '%'
    else:
        percent = 'n/a'

    # Joined with single spaces so the line matches what the chained
    # Python 2 "print x," statements used to emit (they insert one space
    # between consecutive items).
    return ' '.join([
        str(month) + ' ',
        str(round(exact, 2)) + '-' + str(round(rough, 2)),
        ' = ' + str(round(exact - rough, 3)),
        '\t: ' + percent
    ])


def main():
    """Time exact vs. rough month_daylight() and print a comparison table.

    For every city from climate.get_cities() the twelve monthly daylight
    totals (converted from seconds to hours) are computed once with the
    third month_daylight() argument True ("exact") and once with False
    ("rough"), printing the wall-clock time each pass took.  Afterwards
    the first twelve collected values - i.e. the first city's months -
    are printed side by side.
    """
    cities = climate.get_cities()

    data = {'r': [], 'e': []}
    for city in cities:
        time1 = time.time()
        for month in range(1, 13):
            daylight = astrodata.month_daylight(city, month, True)
            data['e'].append(daylight.total_seconds() / 3600)
        print("exact: " + str(time.time() - time1))

        time2 = time.time()
        for month in range(1, 13):
            daylight = astrodata.month_daylight(city, month, False)
            data['r'].append(daylight.total_seconds() / 3600)
        print("rough: " + str(time.time() - time2))

    # Slicing instead of indexing 0..11 keeps an empty city list from
    # raising IndexError; nothing is printed in that case.
    first_city = zip(data['e'][:12], data['r'][:12])
    for month, (exact, rough) in enumerate(first_city, 1):
        print(format_comparison_row(month, exact, rough))
        # TODO: graph the differences vs actual day lengths to see where
        # i'm undershooting and try to understand why?


if __name__ == '__main__':
    main()