def _transform(venue: Venue, tag: Tag, parsing_context: ParsingContext) -> Event:
    """Build an Event from a single agenda item tag.

    The tag's ``href`` and ``title`` attributes supply the event URL and
    title. The description is the text of the ``div.subtitle`` child, falling
    back to the text of ``div.details``. The start moment is parsed from the
    ``div.date`` text combined with the six characters that follow
    ``"Aanvang: "`` in the details text; when parsing yields nothing, the
    current time in the venue's timezone is used instead.

    Args:
        venue: Venue whose metadata is copied onto the event.
        tag: Markup element describing one event.
        parsing_context: Updated so ``currently_parsing`` points at ``tag``.

    Returns:
        The populated Event. ``image_url`` is None when the ``div.item-image``
        style holds no ``https...jpg`` URL.
    """
    parsing_context.currently_parsing = tag
    simplon_url = tag.get("href")
    title = tag.get("title")

    subtitle_tag = tag.find("div", {"class": "subtitle"})
    details_tag = tag.find("div", {"class": "details"})
    # A missing details block must not raise; treat it as empty text.
    details_text = details_tag.text if details_tag is not None else ""
    description = subtitle_tag.text if subtitle_tag is not None else details_text

    when = tag.find("div", {"class": "date"}).text

    # str.find returns -1 when the marker is absent; slicing on that result
    # would pick an unrelated piece of the details text, so only slice when
    # the marker was really found.
    marker = "Aanvang: "
    marker_at = details_text.find(marker)
    start_time = ""
    if marker_at != -1:
        time_from = marker_at + len(marker)
        start_time = details_text[time_from:time_from + 6]

    when_datetime = dateparser.parse(
        f"{when} {start_time}",
        settings={"TIMEZONE": venue.timezone, "RETURN_AS_TIMEZONE_AWARE": True},
    ) or datetime.now(tz=pytz.timezone(venue.timezone))

    # The image URL is embedded in the inline style of the item-image div.
    image_tag = tag.find("div", {"class": "item-image"})
    image_style = (image_tag.get("style") if image_tag is not None else None) or ""
    url_start = image_style.find("https")
    url_end = image_style.find(".jpg", url_start) if url_start != -1 else -1
    image_url = image_style[url_start:url_end + 4] if url_end != -1 else None

    return Event(
        url=simplon_url,
        title=f"{title}",
        description=description,
        venue_timezone=venue.timezone,
        venue_id=venue.venue_id,
        venue_name=venue.name,
        venue_url=venue.url,
        venue_country=venue.country,
        venue_city=venue.city,
        venue_short_name=venue.short_name,
        source=venue.url,
        date_published=datetime.now(),
        when=when_datetime,
        image_url=image_url,
    )
def _transform(venue: Venue, data: Dict, parsing_context: ParsingContext) -> Event:
    """Convert one event dictionary into an Event.

    The event URL is built from ``data["id"]``, the title comes from
    ``data["title"]`` and the description is ``data["subtitle"]`` unless
    ``ParserUtil.not_empty`` rejects it, in which case the title is reused.
    ``data["start_date_time"]`` is parsed as an English date; when parsing
    yields nothing, the current time in the venue's timezone is used.
    """
    parsing_context.currently_parsing = data
    origin = venue.source_url
    event_url = f"https://api.paradiso.nl/api/library/lists/events/{data['id']}?lang=en"
    event_title = data["title"]
    subtitle = data["subtitle"]
    if ParserUtil.not_empty(subtitle):
        summary = subtitle
    else:
        summary = event_title

    parser_settings = {"TIMEZONE": venue.timezone, "RETURN_AS_TIMEZONE_AWARE": True}
    starts_at = dateparser.parse(
        f'{data["start_date_time"]}',
        languages=["en"],
        settings=parser_settings,
    )
    if not starts_at:
        starts_at = datetime.now(tz=pytz.timezone(venue.timezone))

    return Event(
        url=event_url,
        title=event_title,
        description=summary,
        date_published=datetime.now(),
        venue_timezone=venue.timezone,
        venue_id=venue.venue_id,
        venue_name=venue.name,
        venue_url=venue.url,
        venue_country=venue.country,
        venue_city=venue.city,
        venue_short_name=venue.short_name,
        source=origin,
        when=starts_at,
    )
def _transform(venue: Venue, json_event: Dict, parsing_context: ParsingContext) -> Event:
    """Convert one JSON event record into an Event.

    The description is ``subTitle``, or the bare title when the subtitle is
    empty. A non-null ``supportAct`` is appended to the title after the
    description has been chosen. The event and image URLs are the venue URL
    followed by the paths found in the record. ``dates.startsAt`` is parsed
    with dateparser; when parsing yields nothing, the current time in the
    venue's timezone is used.
    """
    parsing_context.currently_parsing = json_event

    headline = json_event["title"]
    summary = json_event["subTitle"] or headline
    support_act = json_event.get("supportAct")
    if support_act is not None:
        headline = f"{headline} {support_act}"

    origin = venue.source_url
    event_url = f"{venue.url}{json_event['url']}"

    parser_settings = {"TIMEZONE": venue.timezone, "RETURN_AS_TIMEZONE_AWARE": True}
    starts_at = dateparser.parse(json_event["dates"]["startsAt"], settings=parser_settings)
    if not starts_at:
        starts_at = datetime.now(tz=pytz.timezone(venue.timezone))

    picture_url = f"{venue.url}{json_event['images']['regular']['mobile']}"

    return Event(
        url=event_url,
        title=headline,
        venue_timezone=venue.timezone,
        venue_id=venue.venue_id,
        venue_name=venue.name,
        venue_url=venue.url,
        image_url=picture_url,
        venue_city=venue.city,
        venue_country=venue.country,
        venue_short_name=venue.short_name,
        source=origin,
        description=summary,
        date_published=datetime.now(),
        when=starts_at,
    )
def _transform(venue: Venue, article: Tag, parsing_context: ParsingContext) -> Event:
    """Build an Event from one programme ``article`` element.

    The URL comes from the article's first link, the start moment from the
    ``data-datetime`` attribute (read as a Unix timestamp in the venue's
    timezone) and the title from the ``h1`` inside ``div.program__content``.
    When the heading contains a ``span``, the span text is moved to the end of
    the title behind ``" - "``. The description is the content paragraph, or
    the title when ``ParserUtil.stripped_text_or_default_if_empty`` falls back
    to its default.

    Args:
        venue: Venue whose metadata is copied onto the event.
        article: Markup element describing one event.
        parsing_context: Updated so ``currently_parsing`` points at ``article``.

    Returns:
        The populated Event. ``image_url`` is None when the article has no
        figure image with a ``data-src`` value.
    """
    parsing_context.currently_parsing = article
    source = venue.source_url
    base_url = venue.url
    url = article.a.get("href")
    content = article.find("div", {"class": "program__content"})

    # Guard both the <figure> and its <img>: either may be absent.
    figure_tag = article.find("figure")
    image_tag = figure_tag.img if figure_tag is not None else None
    figure = image_tag.get("data-src") if image_tag is not None else None
    # Only build a URL when there is a path to append; formatting None into
    # the f-string would produce a bogus "<base_url>None" address.
    image_url = f"{base_url}{figure}" if figure else None

    date = datetime.fromtimestamp(int(article["data-datetime"]), tz=timezone(venue.timezone))

    title = content.h1
    if title.find("span") is None:
        content_title = title.text
    else:
        span_text = title.span.text
        content_title = title.text.replace(span_text, "") + " - " + span_text

    description = ParserUtil.stripped_text_or_default_if_empty(content.p, content_title)

    return Event(
        url=url,
        title=content_title,
        description=description,
        venue_timezone=venue.timezone,
        venue_id=venue.venue_id,
        venue_name=venue.name,
        venue_url=venue.url,
        venue_city=venue.city,
        venue_country=venue.country,
        venue_short_name=venue.short_name,
        image_url=image_url,
        source=source,
        date_published=datetime.now(),
        when=date,
    )
def do_parse(self, parsing_context: ParsingContext) -> List[Event]:
    """Parse an XML document and return one Event per complete ``<item>``.

    An item is complete when it has ``link``, ``title`` and ``description``
    children. Incomplete items are skipped; they no longer end the loop, so
    valid items that follow a malformed one are still returned.

    Args:
        parsing_context: Supplies the XML in ``content`` and the venue whose
            metadata is copied onto every event; ``currently_parsing`` is set
            to each item as it is visited.

    Returns:
        The events in document order, with ``when`` and ``image_url`` left
        as None.
    """
    venue = parsing_context.venue
    root = etree.fromstring(parsing_context.content)
    events = []
    for item in root.iter("item"):
        parsing_context.currently_parsing = item
        maybe_url = item.find("link")
        maybe_title = item.find("title")
        maybe_description = item.find("description")
        if maybe_url is None or maybe_title is None or maybe_description is None:
            # Skip just this item rather than discarding the rest of the feed.
            continue
        events.append(
            Event(
                url=str(maybe_url.text),
                title=str(maybe_title.text),
                description=str(maybe_description.text),
                venue_timezone=venue.timezone,
                venue_id=venue.venue_id,
                venue_name=venue.name,
                venue_url=venue.url,
                venue_city=venue.city,
                venue_country=venue.country,
                venue_short_name=venue.short_name,
                source=venue.url,
                date_published=datetime.now(),
                when=None,
                image_url=None,
            )
        )
    return events
async def fetch_page_indexed(
        self, session: ClientSession, items_per_page: int) -> AsyncIterable[List[Event]]:
    """Yield parsed events page by page, starting at page 1.

    Each page is fetched from ``self.scrape_url`` formatted with the page
    number and parsed with ``self.parser``. Iteration ends after the first
    page that holds fewer than ``items_per_page`` events; that page is still
    yielded.
    """
    page_number = 1
    while True:
        page_content = await fetch(url=self.scrape_url.format(page_number), session=session)
        context = ParsingContext(venue=self.venue, content=page_content)
        batch = self.parser.parse(context)
        yield batch
        if len(batch) < items_per_page:
            break
        page_number += 1
def do_parse(self, parsing_context: ParsingContext) -> List[Event]:
    """Parse the JSON agenda and return an Event for every entry of type "event".

    The content is a list of days, each with an ``events`` list. For each
    matching entry the description comes from
    ``MelkwegParser._make_description`` (cut to 1400 characters and
    sanitized), the start from the ``date`` timestamp in the venue's timezone,
    and the page and thumbnail URLs from ``slug`` and ``thumbnail``.
    """
    venue = parsing_context.venue
    origin = venue.source_url
    parsed_events = []
    for day in json.loads(parsing_context.content):
        # Filter the whole day up front, before any entry is transformed.
        day_events = [entry for entry in day["events"] if entry["type"] == "event"]
        for entry in day_events:
            parsing_context.currently_parsing = entry
            full_description = MelkwegParser._make_description(entry)
            starts_at = datetime.fromtimestamp(
                int(entry["date"]), pytz.timezone(venue.timezone))
            name = entry["name"]
            thumbnail_url = (
                f"https://s3-eu-west-1.amazonaws.com/static.melkweg.nl/uploads/images/"
                f'scaled/agenda_thumbnail/{entry["thumbnail"]}'
            )
            page_url = f'https://www.melkweg.nl/nl/agenda/{entry["slug"]}'
            short_description = ParserUtil.sanitize_text(full_description[:1400])
            parsed_events.append(
                Event(
                    url=page_url,
                    title=name,
                    description=short_description,
                    venue_timezone=venue.timezone,
                    venue_id=venue.venue_id,
                    venue_name=venue.name,
                    venue_url=venue.url,
                    venue_country=venue.country,
                    venue_city=venue.city,
                    venue_short_name=venue.short_name,
                    source=origin,
                    date_published=datetime.now(),
                    when=starts_at,
                    image_url=thumbnail_url,
                )
            )
    return parsed_events
def _transform(venue: Venue, article: Tag, parsing_context: ParsingContext) -> Event:
    """Build an Event from one ``article`` element.

    The URL is the venue URL followed by the article's first link, and the
    ``h1`` text serves as both title and description. When a ``figure`` is
    present its image ``src`` becomes the image URL, prefixed with ``https:``
    unless it already starts with ``https://``. When a ``time`` element is
    present, the ``datetime`` attributes of ``time.date`` and ``time.time``
    are joined into an ISO timestamp and converted with
    ``Event.convert_utc_to_timezone``; otherwise ``when`` stays None.
    """
    parsing_context.currently_parsing = article
    origin = venue.source_url
    event_url = f"{venue.url}{article.a.get('href')}"
    heading = article.find("h1").text

    picture = None
    figure_tag = article.find("figure")
    if figure_tag is not None:
        picture = figure_tag.img.get("src")
        if not picture.startswith("https://"):
            picture = f"https:{picture}"

    starts_at = None
    if article.find("time"):
        date_part = article.find("time", {"class": "date"})["datetime"]
        time_part = article.find("time", {"class": "time"})["datetime"]
        naive_moment = datetime.fromisoformat(f"{date_part}T{time_part}")
        starts_at = Event.convert_utc_to_timezone(when=naive_moment, tz_str=venue.timezone)

    return Event(
        url=event_url,
        title=heading,
        description=heading,
        venue_timezone=venue.timezone,
        venue_id=venue.venue_id,
        venue_name=venue.name,
        venue_url=venue.url,
        venue_city=venue.city,
        venue_country=venue.country,
        venue_short_name=venue.short_name,
        image_url=picture,
        source=origin,
        date_published=datetime.now(),
        when=starts_at,
    )
def _transform(venue: Venue, tag: Tag, parsing_context: ParsingContext) -> Event:
    """Build an Event from one agenda item tag.

    The date is the part of the ``span.agenda-date`` text before the first
    ``"/"`` (the whole text when there is no slash), parsed as Dutch; when
    parsing yields nothing, the current time in the venue's timezone is used.
    The title is the ``h3.agenda-title`` text and the description is the
    ``span.small`` text, or the title when that span is missing.

    Args:
        venue: Venue whose metadata is copied onto the event.
        tag: Markup element describing one event.
        parsing_context: Updated so ``currently_parsing`` points at ``tag``.

    Returns:
        The populated Event.
    """
    parsing_context.currently_parsing = tag

    when_text = tag.find("span", {"class": "agenda-date"}).text
    when_text = when_text.replace("\n", "").strip()
    # partition keeps the full text when no "/" is present, whereas slicing up
    # to str.find's -1 would silently drop the last character.
    when_text = when_text.partition("/")[0].strip()
    when_datetime = (
        dateparser.parse(
            f"{when_text}",
            languages=["nl"],
            settings={"TIMEZONE": venue.timezone, "RETURN_AS_TIMEZONE_AWARE": True},
        )
        or datetime.now(tz=pytz.timezone(venue.timezone))
    )

    title = tag.find("h3", {"class": "agenda-title"}).text
    description_tag = tag.find("span", {"class": "small"})
    description = description_tag.text if description_tag is not None else title
    oost_url = tag.find("a", {"class": "item-link"}).get("href")
    image_url = f'{venue.url}/{tag.find("img").get("src")}'

    return Event(
        url=oost_url,
        title=f"{title}",
        description=description,
        venue_timezone=venue.timezone,
        venue_id=venue.venue_id,
        venue_name=venue.name,
        venue_url=venue.url,
        venue_country=venue.country,
        venue_city=venue.city,
        venue_short_name=venue.short_name,
        source=venue.source_url,
        date_published=datetime.now(),
        when=when_datetime,
        image_url=image_url,
    )
def _transform(venue: Venue, tag: Tag, parsing_context: ParsingContext) -> Event:
    """Build an Event from one event tag.

    The URL is the ``href`` of ``a.event-link``. The artist name is derived
    from the ``h3`` whose class matches ``artist`` (falling back to the URL
    when there is none), optionally followed by the parenthesised
    ``h4.pretitle`` text. The description is the artist plus whatever
    ``VeraParser._find_extra`` returns.

    The start moment combines the ``div.date`` text with the five characters
    after ``"start: "`` in the ``div.schedule`` text, parsed as Dutch. A
    parsed date more than 100 days in the past is moved one year ahead. When
    the date tag is missing or parsing yields nothing, the current time in the
    venue's timezone is used.

    Args:
        venue: Venue whose metadata is copied onto the event.
        tag: Markup element describing one event.
        parsing_context: Updated so ``currently_parsing`` points at ``tag``.

    Returns:
        The populated Event.
    """
    parsing_context.currently_parsing = tag
    source = venue.source_url
    vera_url = tag.find("a", {"class": "event-link"})["href"]

    artist_tag = tag.find("h3", {"class": re.compile(r"artist|artist ")})
    if artist_tag is not None:
        artist = ParserUtil.remove_children_text_from(artist_tag, artist_tag.text)
        artist = VeraParser._add_sup_text_from_text(artist_tag, artist)
        artist = ParserUtil.sanitize_text(artist)
    else:
        artist = vera_url

    extra = VeraParser._find_extra(tag)
    extra_title = tag.find("h4", {"class": "pretitle"})
    if extra_title is not None:
        extra_title = f"({ParserUtil.sanitize_text(extra_title.text)})"
    else:
        extra_title = ""

    # Bind when_date up front so a missing date tag falls through to the
    # "now" fallback below instead of raising UnboundLocalError.
    when_date: Optional[datetime] = None
    when_tag = tag.find("div", {"class": "date"})
    if when_tag is not None:
        when = ParserUtil.remove_children_text_from(when_tag, when_tag.text)
        when = ParserUtil.sanitize_text(when)

        schedule_tag = tag.find("div", {"class": "schedule"})
        schedule_text = schedule_tag.text if schedule_tag is not None else ""
        # str.find returns -1 when the marker is absent; only slice when it
        # was found so an unrelated piece of text is never used as the time.
        marker = "start: "
        marker_at = schedule_text.find(marker)
        when_time = ""
        if marker_at != -1:
            time_from = marker_at + len(marker)
            when_time = schedule_text[time_from:time_from + 5]

        when_date = dateparser.parse(
            f"{when} {when_time}",
            languages=["nl"],
            settings={"TIMEZONE": venue.timezone, "RETURN_AS_TIMEZONE_AWARE": True},
        )
        # NOTE(review): presumably the listing omits the year, so a date far
        # in the past really belongs to next year - confirm against the site.
        if when_date is not None and when_date < (
                datetime.now(pytz.timezone(venue.timezone)) - relativedelta(days=100)):
            when_date = when_date + relativedelta(years=1)

    # The image URL sits inside the inline style, terminated by a single quote.
    image_url = tag.find("div", {"class": "artist-image"})["style"]
    image_url_end = image_url.find("'", image_url.find("https") + 4)
    image_url = image_url[image_url.find("https"):image_url_end]

    if when_date is None:
        when_date = datetime.now(pytz.timezone(venue.timezone))

    return Event(
        url=vera_url,
        title=f"{artist} {extra_title}".strip(),
        description=f'{artist}{" with support" if extra != "" else ""} {extra}'.strip(),
        venue_timezone=venue.timezone,
        venue_id=venue.venue_id,
        venue_name=venue.name,
        venue_url=venue.url,
        venue_country=venue.country,
        venue_city=venue.city,
        venue_short_name=venue.short_name,
        source=source,
        date_published=datetime.now(),
        when=when_date,
        image_url=image_url,
    )
async def fetch_page_in_one_call(
        self, session: ClientSession) -> AsyncIterable[List[Event]]:
    """Fetch ``self.scrape_url`` once and yield the events parsed from it.

    The generator produces exactly one list: the result of running
    ``self.parser`` over the fetched content for ``self.venue``.
    """
    page_content = await fetch(url=self.scrape_url, session=session)
    context = ParsingContext(venue=self.venue, content=page_content)
    yield self.parser.parse(context)