def _find_extra(tag: Tag) -> str:
    """Return the sanitized text of the ``h4`` element with class ``extra``.

    An empty string is returned when *tag* contains no such element.
    Otherwise the element's text is passed through
    ``ParserUtil.remove_children_text_from``,
    ``VeraParser._add_sup_text_from_text`` and ``ParserUtil.sanitize_text``,
    in that order.
    """
    heading = tag.find("h4", {"class": "extra"})
    if heading is not None:
        own_text = ParserUtil.remove_children_text_from(heading, heading.text)
        with_sup = VeraParser._add_sup_text_from_text(heading, own_text)
        return ParserUtil.sanitize_text(with_sup)
    return ""
def _transform(venue: Venue, data: Dict, parsing_context: ParsingContext) -> Event:
    """Build an ``Event`` for *venue* from one event dictionary.

    *data* is recorded on ``parsing_context.currently_parsing`` before any
    field is read.  The event URL is derived from ``data["id"]``, the title
    from ``data["title"]``, and the description from ``data["subtitle"]``
    unless ``ParserUtil.not_empty`` rejects it, in which case the title is
    used as the description.

    The start time is parsed from ``data["start_date_time"]`` with
    ``dateparser`` as a timezone-aware value in the venue's timezone; when
    parsing yields nothing, the current time in that timezone is used.
    """
    parsing_context.currently_parsing = data

    source = venue.source_url
    paradiso_url = f"https://api.paradiso.nl/api/library/lists/events/{data['id']}?lang=en"
    title = data["title"]

    subtitle = data["subtitle"]
    if ParserUtil.not_empty(subtitle):
        description = subtitle
    else:
        description = title

    # format() with an empty spec is the same conversion an f-string applies.
    start_text = format(data["start_date_time"])
    parser_settings = {
        "TIMEZONE": venue.timezone,
        "RETURN_AS_TIMEZONE_AWARE": True
    }
    when = dateparser.parse(start_text, languages=["en"], settings=parser_settings)
    if not when:
        when = datetime.now(tz=pytz.timezone(venue.timezone))

    return Event(
        url=paradiso_url,
        title=title,
        description=description,
        date_published=datetime.now(),
        venue_timezone=venue.timezone,
        venue_id=venue.venue_id,
        venue_name=venue.name,
        venue_url=venue.url,
        venue_country=venue.country,
        venue_city=venue.city,
        venue_short_name=venue.short_name,
        source=source,
        when=when,
    )
def _transform(venue: Venue, article: Tag, parsing_context: ParsingContext) -> Event:
    """Build an ``Event`` for *venue* from one programme ``article`` element.

    *article* is recorded on ``parsing_context.currently_parsing`` first.

    The title comes from the ``h1`` inside the ``div`` with class
    ``program__content``; when that heading contains a ``span``, the span
    text is removed from the heading text and appended after ``" - "``.
    The description is the content paragraph, or the title when
    ``ParserUtil.stripped_text_or_default_if_empty`` finds it empty.  The
    event time is the article's ``data-datetime`` attribute read as a Unix
    timestamp in the venue's timezone.

    The image URL is the venue URL followed by the ``data-src`` attribute of
    the image inside the article's ``figure``.  When the figure, the image,
    or the attribute is missing or empty, ``image_url`` is ``None`` rather
    than a URL ending in the literal text ``"None"``.
    """
    parsing_context.currently_parsing = article
    source = venue.source_url
    base_url = venue.url
    url = article.a.get("href")
    content = article.find("div", {"class": "program__content"})

    # Resolve the image step by step so a missing figure/img/attribute
    # results in "no image" instead of an exception or a bogus URL.
    figure_tag = article.find("figure")
    image_tag = figure_tag.img if figure_tag is not None else None
    image_path = image_tag.get("data-src") if image_tag is not None else None
    image_url = f"{base_url}{image_path}" if image_path else None

    date = datetime.fromtimestamp(int(article["data-datetime"]),
                                  tz=timezone(venue.timezone))

    title = content.h1
    if title.find("span") is None:
        content_title = title.text
    else:
        # Move the span text to the end of the title, separated by " - ".
        content_title = (title.text.replace(title.span.text, "") + " - " +
                         title.span.text)
    description = ParserUtil.stripped_text_or_default_if_empty(
        content.p, content_title)

    return Event(
        url=url,
        title=content_title,
        description=description,
        venue_timezone=venue.timezone,
        venue_id=venue.venue_id,
        venue_name=venue.name,
        venue_url=venue.url,
        venue_city=venue.city,
        venue_country=venue.country,
        venue_short_name=venue.short_name,
        image_url=image_url,
        source=source,
        date_published=datetime.now(),
        when=date,
    )
def update_event_with_details(self, event: Event, additional_details: str) -> Event:
    """Update *event* with the image and start time found in a detail page.

    *additional_details* is parsed as HTML.

    Image: the ``content`` of the ``twitter:image`` meta tag is used, and is
    overridden by the ``og:image`` meta tag when that one is present.
    ``event.image_url`` is always overwritten, with ``None`` when neither
    meta tag exists.

    Start time: when the ``div`` with class ``summary`` holds exactly two
    ``summary__item`` divs, the first is taken as the date and the text
    following ``"Aanvang "`` in the second as the time; both are handed to
    ``ParserUtil.parse_date_time_to_datetime`` with the event's venue
    timezone.  ``event.when`` is left untouched when the summary block is
    missing, does not have two items, lacks the ``"Aanvang "`` marker, or the
    parser returns ``None``.

    Returns the same *event* object that was passed in.
    """
    soup = BeautifulSoup(additional_details, features="html.parser")

    image_url = None
    twitter_image = soup.find("meta", {"name": "twitter:image"})
    if twitter_image is not None:
        image_url = twitter_image["content"]
    og_image = soup.find("meta", {"property": "og:image"})
    if og_image is not None:
        image_url = og_image["content"]

    when_date = None
    summary_div = soup.find("div", {"class": "summary"})
    # A page without a summary block simply provides no start time.
    if summary_div is None:
        summary_items = []
    else:
        summary_items = summary_div.find_all("div", {"class": "summary__item"})

    if len(summary_items) == 2:
        date_text = ParserUtil.sanitize_text(summary_items[0].text)
        time_text = ParserUtil.sanitize_text(summary_items[1].text)
        # Keep only what follows the marker; without the marker there is no
        # usable time, so the event's existing start time is kept.
        _, marker, start_time = time_text.partition("Aanvang ")
        if marker:
            when_date = ParserUtil.parse_date_time_to_datetime(
                date_text, start_time, event.venue_timezone)

    if when_date is not None:
        event.when = when_date
    event.image_url = image_url
    return event
def do_parse(self, parsing_context: ParsingContext) -> List[Event]:
    """Parse the JSON text in ``parsing_context.content`` into events.

    The decoded content is iterated as a sequence of day objects, each with
    an ``"events"`` list.  Only entries whose ``"type"`` equals ``"event"``
    are converted.  Each converted entry is recorded on
    ``parsing_context.currently_parsing`` before its fields are read.

    The description comes from ``MelkwegParser._make_description``, is cut
    to its first 1400 characters and then sanitized.  The start time is the
    entry's ``"date"`` read as a Unix timestamp in the venue's timezone.
    """
    venue = parsing_context.venue
    source = venue.source_url
    agenda = json.loads(parsing_context.content)

    parsed = []
    for day in agenda:
        # Filter the whole day up front, before any entry is processed.
        concerts = [entry for entry in day["events"] if entry["type"] == "event"]
        for concert in concerts:
            parsing_context.currently_parsing = concert

            raw_description = MelkwegParser._make_description(concert)
            starts_at = datetime.fromtimestamp(
                int(concert["date"]), pytz.timezone(venue.timezone))
            name = concert["name"]
            thumbnail_url = (
                "https://s3-eu-west-1.amazonaws.com/static.melkweg.nl/uploads/images/"
                f"scaled/agenda_thumbnail/{concert['thumbnail']}"
            )
            page_url = f"https://www.melkweg.nl/nl/agenda/{concert['slug']}"
            summary = ParserUtil.sanitize_text(raw_description[:1400])

            melkweg_event = Event(
                url=page_url,
                title=name,
                description=summary,
                venue_timezone=venue.timezone,
                venue_id=venue.venue_id,
                venue_name=venue.name,
                venue_url=venue.url,
                venue_country=venue.country,
                venue_city=venue.city,
                venue_short_name=venue.short_name,
                source=source,
                date_published=datetime.now(),
                when=starts_at,
                image_url=thumbnail_url,
            )
            parsed.append(melkweg_event)
    return parsed
def _transform(venue: Venue, tag: Tag, parsing_context: ParsingContext) -> Event:
    """Build an ``Event`` for *venue* from one event element.

    *tag* is recorded on ``parsing_context.currently_parsing`` first.

    Title and description: the artist is the cleaned text of the ``h3``
    whose class matches ``artist``; when there is no such heading the event
    URL is used instead.  The text of the ``h4`` with class ``pretitle`` is
    appended to the title in parentheses.  The result of
    ``VeraParser._find_extra`` is appended to the description after
    ``" with support"``.

    Start time: the text of the ``div`` with class ``date`` and the five
    characters following ``"start: "`` in the ``div`` with class
    ``schedule`` are parsed together as Dutch text in the venue's timezone.
    A result more than 100 days in the past is moved one year forward
    (NOTE(review): presumably because the page omits the year; confirm).
    When the date element is missing or nothing can be parsed, the current
    time in the venue's timezone is used.  A missing schedule element or
    ``"start: "`` marker means only the date text is parsed.

    Image: the URL is cut out of the ``style`` attribute of the ``div`` with
    class ``artist-image``, from ``"https"`` up to the next single quote.
    ``image_url`` is ``None`` when the element, the attribute, or an
    ``"https"`` URL is missing.
    """
    parsing_context.currently_parsing = tag
    source = venue.source_url
    vera_url = tag.find("a", {"class": "event-link"})["href"]

    artist_tag = tag.find("h3", {"class": re.compile(r"artist|artist ")})
    if artist_tag is not None:
        artist = ParserUtil.remove_children_text_from(
            artist_tag, artist_tag.text)
        artist = VeraParser._add_sup_text_from_text(artist_tag, artist)
        artist = ParserUtil.sanitize_text(artist)
    else:
        artist = vera_url

    extra = VeraParser._find_extra(tag)

    pretitle_tag = tag.find("h4", {"class": "pretitle"})
    if pretitle_tag is not None:
        extra_title = f"({ParserUtil.sanitize_text(pretitle_tag.text)})"
    else:
        extra_title = ""

    now = datetime.now(pytz.timezone(venue.timezone))

    # Bound up front so the fallback below also covers a missing date element.
    when_date: Optional[datetime] = None
    when_tag = tag.find("div", {"class": "date"})
    if when_tag is not None:
        when = ParserUtil.remove_children_text_from(when_tag, when_tag.text)
        when = ParserUtil.sanitize_text(when)

        schedule_tag = tag.find("div", {"class": "schedule"})
        schedule_text = schedule_tag.text if schedule_tag is not None else ""
        # Take the five characters after the marker; without the marker,
        # parse the date alone rather than an arbitrary slice of the text.
        _, marker, after_start = schedule_text.partition("start: ")
        when_time = after_start[:5] if marker else ""

        when_date = dateparser.parse(
            f"{when} {when_time}",
            languages=["nl"],
            settings={
                "TIMEZONE": venue.timezone,
                "RETURN_AS_TIMEZONE_AWARE": True
            },
        )
        if when_date is not None and when_date < now - relativedelta(days=100):
            when_date = when_date + relativedelta(years=1)

    if when_date is None:
        when_date = now

    image_url = None
    image_tag = tag.find("div", {"class": "artist-image"})
    style = image_tag.get("style") if image_tag is not None else None
    if style:
        url_start = style.find("https")
        if url_start != -1:
            url_end = style.find("'", url_start + 4)
            image_url = style[url_start:url_end]

    return Event(
        url=vera_url,
        title=f"{artist} {extra_title}".strip(),
        description=f'{artist}{" with support" if extra != "" else ""} {extra}'.strip(),
        venue_timezone=venue.timezone,
        venue_id=venue.venue_id,
        venue_name=venue.name,
        venue_url=venue.url,
        venue_country=venue.country,
        venue_city=venue.city,
        venue_short_name=venue.short_name,
        source=source,
        date_published=datetime.now(),
        when=when_date,
        image_url=image_url,
    )
def _add_sup_text_from_text(parent_tag: Tag, text: str) -> str:
    """Append the text of *parent_tag*'s ``sup`` element to *text*.

    The ``sup`` element is looked up with ``parent_tag.find``.  When
    ``ParserUtil.has_non_empty_text`` accepts it, the result is *text*
    followed by the element's text in parentheses; otherwise *text* is
    returned unchanged.
    """
    sup = parent_tag.find("sup")
    if not ParserUtil.has_non_empty_text(sup):
        return text
    return f"{text} ({sup.text})"