class WeatherComParser:
    """Scrape weather.com forecast pages into lists of ``Forecast`` objects.

    ``run`` looks up the handler registered for ``args.forecast_option`` and
    returns whatever that handler produces.
    """

    def __init__(self):
        # Handler lookup keyed by forecast option; the five- and ten-day
        # options share one handler.
        self._forecast = {
            ForecastType.TODAY: self._today_forecast,
            ForecastType.FIVEDAYS: self._five_and_ten_days_forecast,
            ForecastType.TENDAYS: self._five_and_ten_days_forecast,
            ForecastType.WEEKEND: self._weekend_forecast,
        }
        self._base_url = 'http://weather.com/weather/{forecast}/l/{area}'
        self._request = Request(self._base_url)
        # Two digit runs separated by at most two non-digit characters.
        self._temp_regex = re.compile(r'([0-9]+)\D{,2}([0-9]+)')
        self._only_digits_regex = re.compile(r'[0-9]+')
        self._unit_converter = UnitConverter(Unit.FAHRENHEIT)

    def _get_additional_info(self, content):
        """Return the text of the first two rows of ``content``'s table.

        Every child of ``content.table.tbody`` contributes
        ``child.td.span.get_text()``; only the first two values are kept.
        """
        rows = content.table.tbody.children
        data = tuple(item.td.span.get_text() for item in rows)
        return data[:2]

    def _clear_str_number(self, str_number):
        """Return the leading digits of ``str_number``, or ``'--'`` if none."""
        match = self._only_digits_regex.match(str_number)
        return '--' if match is None else match.group()

    def _parse(self, container, criteria):
        """Scrape each child of ``container`` and drop the empty results."""
        scraped = (self._get_data(item, criteria)
                   for item in container.children)
        return [result for result in scraped if result]

    def _get_data(self, container, search_items):
        """Collect the text of the elements described by ``search_items``.

        ``search_items`` maps a CSS class name to a tag name.  The returned
        dict holds one ``class name -> text`` entry per element found in
        ``container``; classes with no matching element are omitted.
        """
        scraped_data = {}
        for css_class, tag in search_items.items():
            element = container.find(tag, class_=css_class)
            if element is None:
                continue
            text = element.get_text()
            if text is not None:
                scraped_data[css_class] = text
        return scraped_data

    def _extract_high_low(self, temp_text):
        """Return ``(high, low)`` digit strings parsed from ``temp_text``.

        Falls back to ``('--', '--')`` (the same placeholder
        ``_clear_str_number`` uses) when no temperature pair is present, so
        callers never see unbound or stale values.
        """
        match = self._temp_regex.search(temp_text)
        if match is None:
            return '--', '--'
        return match.groups()

    def _prepare_data(self, results, args):
        """Turn scraped row dicts into ``Forecast`` objects.

        ``results`` is a list of dicts with ``temp``, ``humidity``, ``wind``
        and ``description`` keys, plus either ``date-time``/``day-detail`` or
        a combined ``weather-cell``.  Temperatures go through the unit
        converter with ``args.unit`` as destination unit.
        """
        self._unit_converter.dest_unit = args.unit
        convert = self._unit_converter.convert

        forecast_result = []
        for item in results:
            high_temp, low_temp = self._extract_high_low(item['temp'])

            # Rows that carry a combined 'weather-cell' value are split into
            # its first three characters and the remainder.
            try:
                dateinfo = item['weather-cell']
            except KeyError:
                pass
            else:
                item['date-time'] = dateinfo[:3]
                item['day-detail'] = dateinfo[3:]

            day_forecast = Forecast(
                convert(item['temp']),
                item['humidity'],
                item['wind'],
                high_temp=convert(high_temp),
                low_temp=convert(low_temp),
                description=item['description'].strip(),
                forecast_date=f'{item["date-time"]} {item["day-detail"]}',
                forecast_type=self._forecast_type)
            forecast_result.append(day_forecast)

        return forecast_result

    def _parse_list_forecast(self, content, args):
        """Scrape the rows of the ``twc-table`` table found in ``content``."""
        criteria = {
            'date-time': 'span',
            'day-detail': 'span',
            'description': 'td',
            'temp': 'td',
            'wind': 'td',
            'humidity': 'td',
        }
        bs = BeautifulSoup(content, 'html.parser')
        forecast_data = bs.find('table', class_='twc-table')
        container = forecast_data.tbody
        return self._parse(container, criteria)

    def _today_forecast(self, args):
        """Return a one-element list holding today's ``Forecast``.

        Raises:
            Exception: when the expected markup cannot be found on the page.
        """
        criteria = {
            'today_nowcard-temp': 'div',
            'today_nowcard-phrase': 'div',
            'today_nowcard-hilo': 'div',
        }
        parse_error = 'Could not parse weather forecast for today.'

        content = self._request.fetch_data(args.forecast_option.value,
                                           args.area_code)
        bs = BeautifulSoup(content, 'html.parser')

        container = bs.find('section', class_='today_nowcard-container')
        if container is None:
            raise Exception(parse_error)

        weather_conditions = self._parse(container, criteria)
        if len(weather_conditions) < 1:
            raise Exception(parse_error)
        weatherinfo = weather_conditions[0]

        # The hi/lo text looks like "H <n> ... L <n>"; either number may be
        # replaced by dashes.
        temp_regex = re.compile(r'H\s+([0-9]+|\-{,2}).+'
                                r'L\s+([0-9]+|\-{,2})')
        temp_info = temp_regex.search(weatherinfo['today_nowcard-hilo'])
        if temp_info is None:
            raise Exception(parse_error)
        high_temp, low_temp = temp_info.groups()

        side = container.find('div', class_='today_nowcard-sidecar')
        wind, humidity = self._get_additional_info(side)

        curr_temp = self._clear_str_number(weatherinfo['today_nowcard-temp'])

        self._unit_converter.dest_unit = args.unit
        convert = self._unit_converter.convert

        td_forecast = Forecast(
            convert(curr_temp),
            humidity,
            wind,
            high_temp=convert(high_temp),
            low_temp=convert(low_temp),
            description=weatherinfo['today_nowcard-phrase'])

        return [td_forecast]

    def _five_and_ten_days_forecast(self, args):
        """Return the ``Forecast`` list for the five- or ten-day page."""
        content = self._request.fetch_data(args.forecast_option.value,
                                           args.area_code)
        results = self._parse_list_forecast(content, args)
        return self._prepare_data(results, args)

    def _weekend_forecast(self, args):
        """Return the ``Forecast`` list for the weekend page."""
        criteria = {
            'weather-cell': 'header',
            'temp': 'p',
            'weather-phrase': 'h3',
            'wind-conditions': 'p',
            'humidity': 'p',
        }
        # Rename the weekend-specific keys to the ones _prepare_data reads.
        mapper = Mapper()
        mapper.remap_key('wind-conditions', 'wind')
        mapper.remap_key('weather-phrase', 'description')

        content = self._request.fetch_data(args.forecast_option.value,
                                           args.area_code)
        bs = BeautifulSoup(content, 'html.parser')

        forecast_data = bs.find('section', class_='ls-mod')
        container = forecast_data.div.div

        partial_results = self._parse(container, criteria)
        results = mapper.remap(partial_results)
        return self._prepare_data(results, args)

    def run(self, args):
        """Dispatch to the handler registered for ``args.forecast_option``."""
        self._forecast_type = args.forecast_option
        forecast_function = self._forecast[args.forecast_option]
        return forecast_function(args)
class WeatherComParser:
    """Scrape weather.com forecast pages into lists of ``Forecast`` objects.

    Only the "today" forecast is implemented; the other handlers raise
    ``NotImplementedError``.
    """

    def __init__(self):
        # Handler lookup keyed by forecast option.
        self._forecast = {
            ForecastType.TODAY: self._today_forecast,
            ForecastType.FIVEDAYS: self._five_and_ten_days_forecast,
            ForecastType.TENDAYS: self._five_and_ten_days_forecast,
            ForecastType.WEEKEND: self._weekend_forecast,
        }
        self._base_url = 'https://weather.com/weather/{forecast}/l/{area}'
        self._request = Request(self._base_url)
        # Two digit runs separated by at most two non-digit characters.
        self._temp_regex = re.compile(r'([0-9]+)\D{,2}([0-9]+)')
        self._only_digits_regex = re.compile(r'[0-9]+')
        self._unit_converter = UnitConverter(Unit.FAHRENHEIT)

    def _get_additional_info(self, content):
        """Return the text of the first two rows of ``content``'s table.

        Every child of ``content.table.tbody`` contributes
        ``child.td.span.get_text()``; only the first two values are kept.
        """
        rows = content.table.tbody.children
        data = tuple(item.td.span.get_text() for item in rows)
        return data[:2]

    def _clear_str_number(self, str_number):
        """Return the leading digits of ``str_number``, or ``'---'`` if none."""
        match = self._only_digits_regex.match(str_number)
        return '---' if match is None else match.group()

    def _parse(self, container, criteria):
        """Scrape each child of ``container`` and drop the empty results."""
        scraped = (self._get_data(item, criteria)
                   for item in container.children)
        return [result for result in scraped if result]

    def _get_data(self, container, search_items):
        """Collect the text of the elements described by ``search_items``.

        ``search_items`` maps a CSS class name to a tag name.  The returned
        dict holds one ``class name -> text`` entry per element found in
        ``container``; classes with no matching element are omitted.
        """
        scraped_data = {}
        for css_class, tag in search_items.items():
            element = container.find(tag, class_=css_class)
            if element is None:
                continue
            text = element.get_text()
            if text is not None:
                scraped_data[css_class] = text
        return scraped_data

    def _today_forecast(self, args):
        """Return a one-element list holding today's ``Forecast``.

        Raises:
            Exception: when the expected markup cannot be found on the page.
        """
        criteria = {
            'today_nowcard-temp': 'div',
            'today_nowcard-phrase': 'div',
            'today_nowcard-hilo': 'div',
        }
        parse_error = 'Could not parse weather forecast for today.'

        content = self._request.fetch_data(args.forecast_option.value,
                                           args.area_code)
        bs = BeautifulSoup(content, 'html.parser')

        container = bs.find('section', class_='today_nowcard-container')
        if container is None:
            raise Exception(parse_error)

        weather_conditions = self._parse(container, criteria)
        if len(weather_conditions) < 1:
            raise Exception(parse_error)
        weatherinfo = weather_conditions[0]

        # The hi/lo text looks like "H <n> ... L <n>"; either number may be
        # replaced by dashes.
        temp_regex = re.compile(r'H\s+([0-9]+|\-{,2}).+'
                                r'L\s+([0-9]+|\-{,2})')
        temp_info = temp_regex.search(weatherinfo['today_nowcard-hilo'])
        if temp_info is None:
            raise Exception(parse_error)
        high_temp, low_temp = temp_info.groups()

        side = container.find('div', class_='today_nowcard-sidecar')
        humidity, wind = self._get_additional_info(side)

        curr_temp = self._clear_str_number(weatherinfo['today_nowcard-temp'])

        self._unit_converter.dest_unit = args.unit
        convert = self._unit_converter.convert

        td_forecast = Forecast(
            convert(curr_temp),
            humidity,
            wind,
            high_temp=convert(high_temp),
            low_temp=convert(low_temp),
            description=weatherinfo['today_nowcard-phrase'])

        return [td_forecast]

    def _five_and_ten_days_forecast(self, args):
        """Placeholder: five- and ten-day forecasts are not implemented."""
        raise NotImplementedError()

    def _weekend_forecast(self, args):
        """Placeholder: the weekend forecast is not implemented."""
        raise NotImplementedError()

    def run(self, args):
        """Dispatch to the handler registered for ``args.forecast_option``."""
        self._forecast_type = args.forecast_option
        forecast_function = self._forecast[args.forecast_option]
        return forecast_function(args)
class WeatherComParser:
    """Scrape weather.com forecast pages into lists of ``Forecast`` objects.

    ``run`` is the entry point; it dispatches on ``args.forecast_option`` to
    one of the ``_*_forecast`` handlers.
    """

    # Compiled once and shared by every instance; matches a run of digits.
    _only_digits_regex = re.compile(r'[0-9]+')

    def __init__(self):
        self._forecast_function_mappings = {
            ForecastType.TODAY: self._today_forecast,
            ForecastType.FIVEDAYS: self._five_day_forecast,
            ForecastType.TENDAYS: self._ten_day_forecast,
            ForecastType.WEEKEND: self._weekend_forecast
        }
        self._request = Request(
            'http://weather.com/weather/{forecast_option}/l/{area_code}')
        self._unit_converter = UnitConverter(Unit.FAHRENHEIT)

    def run(self, args):
        """Parse weather info for the forecast option selected in ``args``."""
        self._forecast_type = args.forecast_option
        forecast_function = self._forecast_function_mappings[
            self._forecast_type]
        return forecast_function(args)

    def _today_forecast(self, args):
        """Return a one-element list holding today's ``Forecast``.

        Raises:
            Exception: when the current-conditions markup cannot be found.
        """
        parse_error = 'Could not parse weather forecast for today.'
        bs = self._make_http_request(args)

        container = bs.find(
            "div", {"data-testid": "CurrentConditionsContainer"})
        if container is None:
            raise Exception(parse_error)

        criteria = [
            ('span', 'data-testid', 'TemperatureValue'),
            ('div', 'data-testid', 'wxPhrase'),
            ('div', 'class', 'tempHiLoValue')
        ]
        weather_conditions = self._parse(container, criteria)
        if len(weather_conditions) < 1:
            raise Exception(parse_error)
        weatherinfo = weather_conditions[0]

        curr_temp = self._clear_str_number(weatherinfo['TemperatureValue'])

        # The hi/lo text is "<high>/<low>".  partition() tolerates a missing
        # '/', in which case the low temperature becomes the '--' placeholder.
        high_text, _, low_text = weatherinfo['tempHiLoValue'].partition('/')
        high_temp = self._clear_str_number(high_text)
        low_temp = self._clear_str_number(low_text)

        # The "Weather Today in {location}" card holds wind and humidity.
        details_container = bs.find(
            "section", {"data-testid": "TodaysDetailsModule"})
        wind = details_container.find(
            "span", {"data-testid": "Wind"}).get_text()
        humidity = details_container.find(
            "span", {"data-testid": "PercentageValue"}).get_text()

        # TODO: prefix ``wind`` with a cardinal direction derived from the
        # rotation angle in the ``style`` attribute of the
        # ``svg[name="wind-direction"]`` element inside the details card
        # (0 - S, 90 - W, 180 - N, 270 - E).

        self._unit_converter.dest_unit = args.unit
        convert = self._unit_converter.convert

        td_forecast = Forecast(
            convert(curr_temp),
            humidity,
            wind,
            high_temp=convert(high_temp),
            low_temp=convert(low_temp),
            description=weatherinfo['wxPhrase']
        )

        return [td_forecast]

    def _five_day_forecast(self, args):
        """Return ``Forecast`` objects for the five-day page."""
        bs = self._make_http_request(args)
        container = bs.find('table', class_='twc-table').tbody

        criteria = {
            'date-time': 'span',
            'day-detail': 'span',
            'description': 'td',
            'temp': 'td',
            'wind': 'td',
            'humidity': 'td'
        }
        results = self._parse(container, criteria)

        # The 5 day page actually lists 6 days; keep the first five.
        results = results[:5]

        return self._prepare_data(results, args)

    def _ten_day_forecast(self, args):
        """Return ``Forecast`` objects for the ten-day page."""
        bs = self._make_http_request(args)
        section = bs.find("section", {"data-testid": "DailyForecast"})
        container = section.find(
            "div", {"class": re.compile("DisclosureList")})

        criteria = [
            ("h3", "data-testid", "daypartName"),
            ("div", "data-testid", "wxIcon"),
            ("div", "data-testid", "detailsTemperature"),
            ("span", "data-testid", "Wind"),
            ("span", "data-testid", "PercentageValue")
        ]
        results = self._parse(container, criteria)

        # The 10 day page actually lists 15 days; keep the first ten.
        results = results[:10]

        # Rename the scraped keys to the ones _prepare_data reads.
        mapper = Mapper()
        mapper.remap_key('daypartName', 'date-time')
        mapper.remap_key('wxIcon', 'description')
        mapper.remap_key('detailsTemperature', 'temp')
        mapper.remap_key('Wind', 'wind')
        mapper.remap_key('PercentageValue', 'humidity')
        results = mapper.remap(results)

        return self._prepare_data(results, args)

    def _weekend_forecast(self, args):
        """Return ``Forecast`` objects for the weekend page."""
        bs = self._make_http_request(args)
        container = bs.find('section', class_='ls-mod').div.div

        criteria = {
            'weather-cell': 'header',
            'temp': 'p',
            'weather-phrase': 'h3',
            'wind-conditions': 'p',
            'humidity': 'p'
        }
        partial_results = self._parse(container, criteria)

        # Rename the weekend-specific keys to the ones _prepare_data reads.
        mapper = Mapper()
        mapper.remap_key('wind-conditions', 'wind')
        mapper.remap_key('weather-phrase', 'description')
        results = mapper.remap(partial_results)

        return self._prepare_data(results, args)

    def _make_http_request(self, args):
        """Fetch the page for the current forecast type and parse its HTML.

        Relies on ``self._forecast_type`` having been set by ``run``.
        """
        content = self._request.fetch_data(
            self._forecast_type.value,
            args.area_code
        )
        return BeautifulSoup(content, 'html.parser')

    def _parse(self, container, criteria):
        """Scrape each child of ``container`` and drop the empty results."""
        scraped = (self._get_data(item, criteria)
                   for item in container.children)
        return [result for result in scraped if result]

    def _get_data(self, container, search_items):
        """Return the text of the elements in ``container`` matching
        ``search_items``.

        ``search_items`` is either

        * a dict mapping a CSS class name to a tag name (result keyed by the
          class name), or
        * a list of ``(tag, attribute name, attribute value)`` tuples (result
          keyed by the attribute value).  When the attribute is ``class`` the
          value is used as a regular expression.

        Anything else, including an empty list, yields an empty dict.
        """
        scraped_data = {}

        if isinstance(search_items, dict):
            for css_class, tag in search_items.items():
                element = container.find(tag, class_=css_class)
                if element is None:
                    continue
                text = element.get_text()
                if text is not None:
                    scraped_data[css_class] = text

        elif (isinstance(search_items, list) and search_items
              and isinstance(search_items[0], tuple)):
            for tag, attr_name, attr_value in search_items:
                if attr_name == 'class':
                    attrs = {"class": re.compile(attr_value)}
                else:
                    attrs = {attr_name: attr_value}
                element = container.find(tag, attrs)
                if element is None:
                    continue
                text = element.get_text()
                if text is not None:
                    scraped_data[attr_value] = text

        return scraped_data

    def _clear_str_number(self, str_number):
        """Return the leading digits of ``str_number``, or ``'--'`` if none."""
        match = self._only_digits_regex.match(str_number)
        return '--' if match is None else match.group()

    @staticmethod
    def _extract_high_low(temp_text):
        """Return ``(high, low)`` parsed from ``temp_text``.

        Exactly two numbers are taken as high and low.  Any other count means
        the high is unavailable (``0``) and the first number, if any, is the
        low; with no number at all the low is the ``'--'`` placeholder.
        """
        numbers = re.findall(r'\d+', temp_text)
        if len(numbers) == 2:
            return numbers[0], numbers[1]
        return 0, (numbers[0] if numbers else '--')

    def _prepare_data(self, results, args):
        """Turn scraped row dicts into ``Forecast`` objects.

        Used by the 5-day, 10-day and weekend handlers.  Temperatures go
        through the unit converter with ``args.unit`` as destination unit.
        """
        self._unit_converter.dest_unit = args.unit
        convert = self._unit_converter.convert

        forecast_result = []
        for item in results:
            high_temp, low_temp = self._extract_high_low(item['temp'])

            # Specific to the weekend markup: split the combined
            # 'weather-cell' text into its first three characters and the
            # remainder.
            try:
                dateinfo = item['weather-cell']
            except KeyError:
                pass
            else:
                item['date-time'] = dateinfo[:3]
                item['day-detail'] = dateinfo[3:]

            if 'day-detail' not in item:
                item['day-detail'] = ''

            day_forecast = Forecast(
                convert(item['temp']),
                item['humidity'],
                item['wind'],
                high_temp=convert(high_temp),
                low_temp=convert(low_temp),
                description=item['description'].strip(),
                forecast_date=f'{item["date-time"]} {item["day-detail"]}',
                forecast_type=self._forecast_type
            )
            forecast_result.append(day_forecast)

        return forecast_result
class WeatherComParser:
    """Scrape weather.com forecast pages into lists of ``Forecast`` objects.

    Only the "today" forecast is implemented; the other handlers raise
    ``NotImplementedError``.
    """

    def __init__(self):
        # Handler lookup keyed by forecast option.
        self._forecast = {
            ForecastType.TODAY: self._today_forecast,
            ForecastType.FIVEDAYS: self._five_and_ten_days_forecast,
            ForecastType.TENDAYS: self._five_and_ten_days_forecast,
            ForecastType.WEEKEND: self._weekend_forecast
        }
        self._base_url = 'http://weather.com/weather/{forecast}/l/{area}'
        self._request = Request(self._base_url)
        # Two digit runs separated by at most two non-digit characters.
        self._temp_regex = re.compile(r'([0-9]+)\D{,2}([0-9]+)')
        self._only_digits_regex = re.compile(r'[0-9]+')
        self._unit_converter = UnitConverter(Unit.FAHRENHEIT)

    def _get_data(self, container, search_items):
        """Collect the text of the elements described by ``search_items``.

        ``container`` is a DOM element and ``search_items`` maps a CSS class
        name to a tag name.  The returned dict holds one ``class name ->
        text`` entry per element found; classes with no match are omitted.
        """
        scraped_data = {}
        for css_class, tag in search_items.items():
            element = container.find(tag, class_=css_class)
            if element is None:
                continue
            text = element.get_text()
            if text is not None:
                scraped_data[css_class] = text
        return scraped_data

    def _parse(self, container, criteria):
        """Scrape every child of ``container`` against ``criteria``.

        Returns the list of per-child result dicts, leaving out the children
        for which nothing was found.
        """
        scraped = (self._get_data(item, criteria)
                   for item in container.children)
        return [result for result in scraped if result]

    def _clear_str_number(self, str_number):
        """Return the leading digits of ``str_number``, or ``'--'`` if none."""
        match = self._only_digits_regex.match(str_number)
        return '--' if match is None else match.group()

    def _get_additional_info(self, content):
        """Return the text of the first two rows of ``content``'s table.

        Every row under ``content.table.tbody`` contributes
        ``row.td.span.get_text()``; only the first two values are kept
        (``_today_forecast`` unpacks them as wind and humidity).
        """
        rows = content.table.tbody.children
        data = tuple(item.td.span.get_text() for item in rows)
        return data[:2]

    def _today_forecast(self, args):
        """Return a one-element list holding today's ``Forecast``.

        This is the handler registered for ``ForecastType.TODAY``.

        Raises:
            Exception: when the expected markup cannot be found on the page.
        """
        criteria = {
            'today_nowcard-temp': 'div',
            'today_nowcard-phrase': 'div',
            'today_nowcard-hilo': 'div',
        }
        parse_error = 'Could not parse weather forecast for today.'

        content = self._request.fetch_data(args.forecast_option.value,
                                           args.area_code)
        bs = BeautifulSoup(content, 'html.parser')

        # The section that wraps the "now" card.
        container = bs.find('section', class_='today_nowcard-container')
        if container is None:
            raise Exception(parse_error)

        # Look through the container's children for the criteria elements.
        weather_conditions = self._parse(container, criteria)
        if len(weather_conditions) < 1:
            raise Exception(parse_error)
        weatherinfo = weather_conditions[0]

        # The hi/lo text looks like "H <n> ... L <n>"; either number may be
        # replaced by dashes.
        temp_regex = re.compile(r'H\s+(\d+|\-{,2}).+'
                                r'L\s+(\d+|\-{,2})')
        temp_info = temp_regex.search(weatherinfo['today_nowcard-hilo'])
        if temp_info is None:
            raise Exception(parse_error)
        high_temp, low_temp = temp_info.groups()

        side = container.find('div', class_='today_nowcard-sidecar')
        wind, humidity = self._get_additional_info(side)

        curr_temp = self._clear_str_number(weatherinfo['today_nowcard-temp'])

        self._unit_converter.dest_unit = args.unit
        convert = self._unit_converter.convert

        td_forecast = Forecast(
            convert(curr_temp),
            humidity,
            wind,
            high_temp=convert(high_temp),
            low_temp=convert(low_temp),
            description=weatherinfo['today_nowcard-phrase'])

        return [td_forecast]

    def _five_and_ten_days_forecast(self, args):
        """Placeholder: five- and ten-day forecasts are not implemented."""
        raise NotImplementedError()

    def _weekend_forecast(self, args):
        """Placeholder: the weekend forecast is not implemented."""
        raise NotImplementedError()

    def run(self, args):
        """Dispatch to the handler registered for ``args.forecast_option``."""
        self._forecast_type = args.forecast_option
        forecast_function = self._forecast[args.forecast_option]
        return forecast_function(args)
class WeatherComParser:
    """Scrape weather.com forecast pages into lists of ``Forecast`` objects.

    ``run`` looks up the handler registered for ``args.forecast_option`` and
    returns whatever that handler produces.
    """

    def __init__(self):
        # Handler lookup keyed by forecast option; the five- and ten-day
        # options share one handler.
        self._forecast = {
            ForecastType.TODAY: self._today_forecast,
            ForecastType.FIVEDAYS: self._five_and_ten_days_forecast,
            ForecastType.TENDAYS: self._five_and_ten_days_forecast,
            ForecastType.WEEKEND: self._weekend_forecast,
        }
        # URL template used for requests to the weather website.  The path
        # segment is the letter "l", not the digit "1".
        self._base_url = 'http://weather.com/weather/{forecast}/l/{area}'
        self._request = Request(self._base_url)
        # Two digit runs separated by at most two non-digit characters.
        self._temp_regex = re.compile(r'([0-9]+)\D{,2}([0-9]+)')
        self._only_digits_regex = re.compile(r'[0-9]+')
        # Source unit of the scraped temperatures is Fahrenheit.
        self._unit_converter = UnitConverter(Unit.FAHRENHEIT)

    def _get_data(self, container, search_items):
        """Collect the text of the elements described by ``search_items``.

        ``container`` is a DOM element and ``search_items`` maps a CSS class
        name to a tag name.  The returned dict holds one ``class name ->
        text`` entry per element found; classes with no match are omitted.
        """
        scraped_data = {}
        for css_class, tag in search_items.items():
            element = container.find(tag, class_=css_class)
            if element is None:
                continue
            text = element.get_text()
            if text is not None:
                scraped_data[css_class] = text
        return scraped_data

    def _parse(self, container, criteria):
        """Scrape every child of ``container`` against ``criteria``.

        Returns the list of per-child result dicts, leaving out the children
        for which nothing was found.
        """
        scraped = (self._get_data(item, criteria)
                   for item in container.children)
        return [result for result in scraped if result]

    def _clear_str_number(self, str_number):
        """Return the leading digits of ``str_number``, or ``'--'`` if none."""
        match = self._only_digits_regex.match(str_number)
        return '--' if match is None else match.group()

    def _get_humidity_and_weather(self, content):
        """Return the text of the first two rows of ``content``'s table.

        Every row under ``content.table.tbody`` contributes
        ``row.td.span.get_text()``; only the first two values are kept
        (``_today_forecast`` unpacks them as humidity and wind).
        """
        rows = content.table.tbody.children
        data = tuple(item.td.span.get_text() for item in rows)
        return data[:2]

    def _parse_list_forecast(self, content, args):
        """Scrape the rows of the ``twc-table`` table found in ``content``.

        The 5-day and 10-day pages share the same CSS classes and elements.
        """
        criteria = {
            'date-time': 'span',
            'day-detail': 'span',
            'description': 'td',
            'temp': 'td',
            'wind': 'td',
            'humidity': 'td',
        }
        bs = BeautifulSoup(content, 'html.parser')
        forecast_data = bs.find('table', class_='twc-table')
        container = forecast_data.tbody
        return self._parse(container, criteria)

    def _extract_high_low(self, temp_text):
        """Return ``(high, low)`` digit strings parsed from ``temp_text``.

        Falls back to ``('--', '--')`` (the same placeholder
        ``_clear_str_number`` uses) when no temperature pair is present, so
        callers never see unbound or stale values.
        """
        match = self._temp_regex.search(temp_text)
        if match is None:
            return '--', '--'
        return match.groups()

    def _prepare_data(self, results, args):
        """Turn scraped row dicts into ``Forecast`` objects.

        ``results`` is a list of dicts with ``temp``, ``humidity``, ``wind``
        and ``description`` keys, plus either ``date-time``/``day-detail`` or
        a combined ``weather-cell``.  Temperatures go through the unit
        converter with ``args.unit`` as destination unit.
        """
        self._unit_converter.dest_unit = args.unit
        convert = self._unit_converter.convert

        forecast_data = []
        for item in results:
            high_temp, low_temp = self._extract_high_low(item['temp'])

            # Rows that carry a combined 'weather-cell' value are split into
            # its first three characters and the remainder; both parts are
            # needed for the forecast date below.
            try:
                dateinfo = item['weather-cell']
            except KeyError:
                pass
            else:
                item['date-time'] = dateinfo[:3]
                item['day-detail'] = dateinfo[3:]

            # NOTE(review): keyword spelled ``forecast_date`` (was
            # ``forecast_data``) - confirm against Forecast.__init__.
            day_forecast = Forecast(
                convert(item['temp']),
                item['humidity'],
                item['wind'],
                high_temp=convert(high_temp),
                low_temp=convert(low_temp),
                description=item['description'].strip(),
                forecast_date=f'{item["date-time"]} {item["day-detail"]}',
                forecast_type=self._forecast_type)
            forecast_data.append(day_forecast)

        return forecast_data

    def _today_forecast(self, args):
        """Return a one-element list holding today's ``Forecast``.

        Raises:
            Exception: when the expected markup cannot be found on the page.
        """
        # Key is the CSS class to look for, value is the tag that carries it.
        criteria = {
            'today_nowcard-temp': 'div',
            'today_nowcard-phrase': 'div',
            'today_nowcard-hilo': 'div',
        }
        parse_error = 'Could not parse weather for today'

        # Argument order (forecast option, then area code) matches the other
        # fetch_data call sites in this class.
        content = self._request.fetch_data(args.forecast_option.value,
                                           args.area_code)
        bs = BeautifulSoup(content, 'html.parser')

        container = bs.find('section', class_='today_nowcard-container')
        if container is None:
            raise Exception(parse_error)

        weather_conditions = self._parse(container, criteria)
        if len(weather_conditions) < 1:
            raise Exception(parse_error)
        weather_info = weather_conditions[0]

        # The hi/lo text looks like "H <n> ... L <n>"; either number may be
        # replaced by dashes.
        temp_regex = re.compile(r'H\s+(\d+|\-{,2}).+'
                                r'L\s+(\d+|\-{,2})')
        temp_info = temp_regex.search(weather_info['today_nowcard-hilo'])
        if temp_info is None:
            raise Exception(parse_error)
        high_temp, low_temp = temp_info.groups()

        side = container.find('div', class_='today_nowcard-sidecar')
        humidity, wind = self._get_humidity_and_weather(side)

        current_temperature = self._clear_str_number(
            weather_info['today_nowcard-temp'])

        self._unit_converter.dest_unit = args.unit
        convert = self._unit_converter.convert

        today_forecast = Forecast(
            convert(current_temperature),
            humidity,
            wind,
            convert(high_temp),
            convert(low_temp),
            description=weather_info['today_nowcard-phrase'])

        return [today_forecast]

    def _five_and_ten_days_forecast(self, args):
        """Return the ``Forecast`` list for the five- or ten-day page."""
        content = self._request.fetch_data(args.forecast_option.value,
                                           args.area_code)
        results = self._parse_list_forecast(content, args)
        return self._prepare_data(results, args)

    # Previous (misspelled) name, kept so existing references still resolve.
    _five_and_ten_day_forecast = _five_and_ten_days_forecast

    def _weekend_forecast(self, args):
        """Return the ``Forecast`` list for the weekend page."""
        criteria = {
            'weather-cell': 'header',
            'temp': 'p',
            'weather-phrase': 'h3',
            'wind-conditions': 'p',
            'humidity': 'p'
        }
        # Rename the weekend-specific keys to the ones _prepare_data reads.
        mapper = Mapper()
        mapper.remap_key('wind-conditions', 'wind')
        mapper.remap_key('weather-phrase', 'description')

        content = self._request.fetch_data(args.forecast_option.value,
                                           args.area_code)
        bs = BeautifulSoup(content, 'html.parser')

        # NOTE(review): looks up an <article class="ls-mod"> element -
        # confirm the tag against the live page markup.
        forecast_data = bs.find('article', class_='ls-mod')
        container = forecast_data.div.div

        partial_results = self._parse(container, criteria)
        results = mapper.remap(partial_results)
        return self._prepare_data(results, args)

    def run(self, args):
        """Dispatch to the handler registered for ``args.forecast_option``."""
        self._forecast_type = args.forecast_option
        forecast_function = self._forecast[args.forecast_option]
        return forecast_function(args)