def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a ``Show`` from it.

    Performers are collected from every ``.headliners`` and ``.supports``
    element of the page (headliners first); each element's text is split
    into names by ``lang_util.parse_performers``.

    Args:
        link: URL of the event page; also used as the show's merge key
            and show URL.

    Returns:
        The populated ``Show``.
    """
    # NOTE(review): parse_500 presumably lets the fetch tolerate HTTP 500
    # responses -- confirm against html_util.fetch_and_parse.
    page = html_util.fetch_and_parse(link, parse_500=True)
    detail = html_util.get_first_element(page, ".event-detail")
    artist_boxes = html_util.get_first_element(page, ".artist-boxes")
    date_text = html_util.get_first_element(detail, ".dates").text_content()

    # Headliners are gathered before supporting acts so list order is kept.
    lineup = []
    for selector, is_headliner in (('.headliners', True), ('.supports', False)):
        for node in html_util.get_elements(page, selector):
            lineup.extend(
                Performer(name, headliner=is_headliner)
                for name in lang_util.parse_performers(node.text_content())
            )

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = lineup

    time_text = html_util.get_first_element(detail, ".times").text_content()
    show.show_time = date_util.parse_date_and_time(date_text, time_text)

    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(
        detail, artist_boxes)

    # The image is optional; image_url is left untouched when it is absent.
    image = html_util.get_first_element(detail, "img", optional=True)
    if image is not None:
        show.resources.image_url = image.get('src')

    return show
def _get_parser(self):
    """Yield the shows found on the calendar page at ``self.CALENDAR_URL``.

    This is a generator: the page is downloaded and parsed on the first
    iteration. The listing is located as the first ``<table>`` whose
    ``width`` attribute is ``'756'``; every nested table with
    ``width='700'`` and ``height='80'`` is handed to ``self._parse_show``
    and the truthy results are yielded.

    Raises:
        Exception: if no table with ``width='756'`` exists in the page.
    """
    response = urllib2.urlopen(self.CALENDAR_URL)
    try:
        # One read() replaces the former line-by-line string concatenation.
        calendar = response.read()
    finally:
        # Release the HTTP connection even if the read fails.
        response.close()

    # Europa has an errant closing html tag at the top of the document that
    # messes up lxml
    calendar = calendar.replace('</html>', '')

    doc = lxml.html.document_fromstring(calendar)
    doc.make_links_absolute(self.CALENDAR_URL)

    main_section = None
    for el in doc.iter(tag='table'):
        if el.get('width') == '756':
            main_section = el
            break

    if main_section is None:
        raise Exception('Unable to find main section')

    for el in html_util.get_elements(main_section, 'table'):
        if el.get('width') == '700' and el.get('height') == '80':
            show = self._parse_show(el)
            if show:
                yield show
def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a ``Show`` from it.

    Every non-blank ``<h1>`` inside the ``mainColumn`` element becomes a
    performer. The merge key is the ``page_id`` group of
    ``self.EVENT_URL`` matched against ``link``. The parsed date has its
    hour forced to 21.

    Args:
        link: URL of the event page; expected to match ``self.EVENT_URL``.

    Returns:
        The populated ``Show``.
    """
    LOG.debug("Fetching show: %s" % link)

    event_doc = html_util.fetch_and_parse(link)
    event_detail = event_doc.get_element_by_id("mainColumn")

    show = Show()

    for performer in html_util.get_elements(event_detail, 'h1'):
        name = performer.text_content().strip(' \n\r\t')
        if name:
            show.performers.append(Performer(name))

    date_txt = html_util.get_first_element(event_detail, '.date').text_content()

    # NOTE(review): a link that does not match EVENT_URL makes event_match
    # None and the next line raises AttributeError -- callers are assumed
    # to pass only matching links.
    event_match = self.EVENT_URL.match(link)

    show.merge_key = event_match.group('page_id')
    show.venue = self.venue()
    # Only the date is taken from the page; the hour is fixed at 21.
    show.show_time = date_util.parse_date_time(date_txt).replace(hour=21)

    # NOTE(review): show.date is presumably derived from show_time by Show
    # -- confirm the attribute exists.
    LOG.debug('Date: %s' % show.date)

    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    for img_tag in event_detail.iter(tag='img'):
        src = img_tag.get('src')
        # get() returns None for an <img> without a src attribute; skip it
        # instead of raising TypeError on the substring test.
        if src and 'main' in src:
            show.resources.image_url = src
            break

    return show
def _parse_v2(self, doc):
    """Extract resources from the ``.htmlBoxModule`` elements of ``doc``.

    The modules are looked up inside the first ``.content.contentMid``
    element and passed together to the resource extractor.

    Args:
        doc: parsed document to search.

    Returns:
        An ``ArtistProfileParserResult`` wrapping the extracted resources.
    """
    middle_column = parsing.get_first_element(doc, '.content.contentMid')
    modules = tuple(parsing.get_elements(middle_column, '.htmlBoxModule'))
    found = self.resource_extractor.extract_resources(*modules)
    return ArtistProfileParserResult(found)
def _parse_shows(self, base_date, td):
    """Parse the shows listed in one calendar cell.

    The day of the month is read from the cell's ``.day`` element and
    substituted into ``base_date``. Links under ``.lr_color`` and under
    ``.googie_color`` are each turned into at most one show via
    ``self._parse_show``.

    Args:
        base_date: date supplying the year and month; only its day is
            replaced.
        td: the calendar cell element.

    Returns:
        A list with zero, one or two shows (``.lr_color`` first).
    """
    day = int(html_util.get_first_element(td, '.day').text_content())
    show_date = base_date.replace(day=day)

    # '%Y-%m-%d' is the portable spelling of the platform-specific '%F'.
    logger.debug('Parsing shows on %s' % show_date.strftime('%Y-%m-%d'))

    lr_shows = html_util.get_elements(td, '.lr_color a')
    googie_shows = html_util.get_elements(td, '.googie_color a')

    # NOTE(review): the truthiness checks below assume get_elements returns
    # a sequence rather than a lazy iterator -- confirm.
    shows = []
    if lr_shows:
        shows.append(self._parse_show(show_date, lr_shows))
    if googie_shows:
        shows.append(self._parse_show(show_date, googie_shows))

    return shows
def _month_parser(self, request_date):
    """Yield every show on the calendar page for ``request_date``'s month.

    The page URL is ``self.BASE_URL`` followed by
    ``calendar/<year>-<month>``. Each ``td.has-events`` cell of the
    ``.month-view`` table is delegated to ``self._parse_shows``.

    Args:
        request_date: date whose year and month select the calendar page;
            it is also passed on as the base date for each cell.

    Yields:
        The shows returned by ``self._parse_shows`` for each cell, in
        page order.
    """
    url_parts = (self.BASE_URL, request_date.year, request_date.month)
    month_url = '%scalendar/%d-%d' % url_parts
    logger.debug('Parsing: %s' % month_url)

    page = html_util.fetch_and_parse(month_url)
    calendar_table = html_util.get_first_element(page, '.month-view table')

    event_cells = html_util.get_elements(calendar_table, 'td.has-events')
    for cell in event_cells:
        for show in self._parse_shows(request_date, cell):
            yield show
def _parse_show(self, link):
    """Fetch the event page at ``link`` and build a ``Show`` from it.

    The event container is selected by the class ``tfly-event-id-<id>``,
    where ``<id>`` is the integer ``event_id`` group of ``self.IS_EVENT``
    matched against ``link``. Each ``.headliners`` element contributes one
    performer using its whole text; each ``.supports`` element is split
    into names by ``lang_util.parse_performers``.

    Args:
        link: URL of the event page; expected to match ``self.IS_EVENT``.
            Also used as the show's merge key and show URL.

    Returns:
        The populated ``Show``.
    """
    event_doc = html_util.fetch_and_parse(link)

    # NOTE(review): a link that does not match IS_EVENT makes match None
    # and the next line raises AttributeError -- callers are assumed to
    # pass only matching links.
    match = self.IS_EVENT.match(link)
    event_id = int(match.group("event_id"))

    event_detail = html_util.get_first_element(event_doc, ".tfly-event-id-%d" % event_id)
    date_txt = html_util.get_first_element(event_doc, ".dates").text_content()
    time_txt = html_util.get_first_element(event_doc, ".times").text_content()

    # The image is optional: request it with optional=True so the
    # "is not None" check below is what handles an event without one.
    img = html_util.get_first_element(event_detail, "img", optional=True)

    performers = []
    for p in html_util.get_elements(event_detail, ".headliners"):
        performers.append(Performer(p.text_content(), headliner=True))
    for p in html_util.get_elements(event_detail, ".supports"):
        for pi in lang_util.parse_performers(p.text_content()):
            performers.append(Performer(pi, headliner=False))

    show = Show()
    show.merge_key = link
    show.venue = self.venue()
    show.performers = performers
    show.show_time = date_util.parse_show_time(date_txt, time_txt)
    show.door_time = date_util.parse_door_time(date_txt, time_txt)
    show.resources.show_url = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    if img is not None:
        show.resources.image_url = img.get("src")

    return show