def test_parse(self):
    """Parse the fetched OostGroningen content and verify the events.

    Checks the total number of parsed events, the exact field values of
    the first event, and that every event carries a non-None ``when``,
    ``description``, ``title`` and ``url``.
    """
    venue = OostGroningenProcessor.create_venue()
    content = fetch(venue.url)
    results = OostGroningenParser().parse(
        ParsingContext(venue=venue, content=content))
    assert_that(len(results), equal_to(8))

    first_event = results[0]
    assert_that(
        first_event.title,
        equal_to('HOMOOST • Movie Night: Party Monster the Shockumentary'))
    assert_that(first_event.description,
                equal_to('Movie Screening • Group Discussion'))
    assert_that(first_event.when, is_not(none()))
    assert_that(
        first_event.url,
        equal_to('https://www.facebook.com/events/610421539383220/'))
    assert_that(
        first_event.image_url,
        equal_to(
            'https://www.komoost.nl/media/56721601_1992667177522931_8267801960216788992_o.jpg'
        ))
    assert_that(first_event.venue, equal_to(venue))
    assert_that(first_event.source, equal_to('https://www.komoost.nl'))
    assert_that(first_event.event_id, is_not(none()))
    assert_that(first_event.date_published, is_not(none()))

    # `none()` must be *called*: passing the bare `none` function makes
    # hamcrest compare the value against the function object itself, which
    # never matches, so the assertion could never fail.
    for event in results:
        assert_that(event.when, is_not(none()))
        assert_that(event.description, is_not(none()))
        assert_that(event.title, is_not(none()))
        assert_that(event.url, is_not(none()))
def test_sample_file_page_1(self):
    """Parse page 1 of the Paradiso source and verify the events.

    Checks the number of parsed events, the exact field values of the
    first event, and that every event carries a non-None ``when``,
    ``description``, ``title`` and ``url``.
    """
    venue = ParadisoProcessor.create_venue()
    parser = ParadisoParser()
    data = fetch(f'{venue.source_url}/page=1')
    results = parser.parse(ParsingContext(venue=venue, content=data))
    assert_that(len(results), equal_to(30))

    first_event = results[0]
    assert_that(
        first_event.url,
        equal_to('https://www.paradiso.nl/en/program/giant-rooks/54827'))
    assert_that(first_event.venue, equal_to(venue))
    assert_that(first_event.title, equal_to("Giant Rooks"))
    assert_that(first_event.description,
                equal_to("Aanstormende Duitse indiepopband"))
    assert_that(first_event.when, is_not(none()))
    assert_that(first_event.image_url, none())
    assert_that(first_event.date_published, is_not(none()))
    assert_that(first_event.source, equal_to('https://www.paradiso.nl/'))

    # `none()` must be *called*: the bare `none` function would be wrapped
    # in an equality matcher against the function object, making these
    # checks pass for any value (including None).
    for event in results:
        assert_that(event.when, is_not(none()))
        assert_that(event.description, is_not(none()))
        assert_that(event.title, is_not(none()))
        assert_that(event.url, is_not(none()))
def test_sample_file(self):
    """Parse page 1 of the Tivoli source and verify the events.

    Checks the number of parsed events, the exact field values of the
    event titled "Leuk Dat Je d'r Bent Band", and that every event
    carries a non-None ``when``, ``description``, ``title`` and ``url``.
    """
    venue = TivoliProcessor.create_venue()
    parser = TivoliParser()
    data = fetch(f'{venue.url}/page=1')
    results = parser.parse(ParsingContext(venue=venue, content=data))
    assert_that(len(results), equal_to(30))

    # Select the event by title rather than by position in the results.
    selected = [
        result for result in results
        if result.title == "Leuk Dat Je d'r Bent Band"
    ][0]
    assert_that(
        selected.url,
        equal_to(
            'https://www.tivolivredenburg.nl/agenda/leuk-dat-je-dr-bent-band-27-04-2019/'
        ))
    assert_that(selected.venue, equal_to(venue))
    assert_that(selected.title, equal_to("Leuk Dat Je d'r Bent Band"))
    assert_that(selected.when, is_not(none()))
    assert_that(
        selected.image_url,
        equal_to(
            'https://www.tivolivredenburg.nl/wp-content/uploads/2019/03/dezegebruikenleuk-195x130.jpg'
        ))
    assert_that(selected.description, equal_to("met EK '88 thema!"))
    assert_that(selected.date_published, is_not(none()))
    assert_that(selected.source,
                equal_to('https://www.tivolivredenburg.nl/agenda/'))

    # `none()` must be *called*: the bare `none` function would be wrapped
    # in an equality matcher against the function object, making these
    # checks pass for any value (including None).
    for event in results:
        assert_that(event.when, is_not(none()))
        assert_that(event.description, is_not(none()))
        assert_that(event.title, is_not(none()))
        assert_that(event.url, is_not(none()))
def melkweg_observer(self, observer: Observer, _) -> Observer:
    """Emit all Melkweg events in a single notification, then complete.

    Fetches ``self.scrape_url``, parses the content for ``self.venue`` and
    pushes the parse result to ``observer`` with one ``on_next`` call
    followed by ``on_completed``. The second positional argument is unused.

    Returns the same ``observer`` that was passed in.
    """
    melkweg_parser = MelkwegParser()
    page = fetch(self.scrape_url)
    context = ParsingContext(venue=self.venue, content=page)
    events = melkweg_parser.parse(context)
    observer.on_next(events)
    observer.on_completed()
    return observer
def test_sample_file(self):
    """Parse the fetched Spot content and verify the events.

    Checks the number of parsed events, that exactly one event has the
    Kamagurka URL and that its fields have the expected values, and that
    every event carries a non-None ``when``, ``description``, ``title``
    and ``url``.
    """
    venue = SpotProcessor.create_venue()
    data = fetch(venue.url)
    results = self.parser.parse(ParsingContext(venue=venue, content=data))
    assert_that(results, is_not(none()))
    assert_that(len(results), equal_to(58))

    kamagurka = [
        item for item in results
        if item.url == 'https://www.spotgroningen.nl/programma/kamagurka/'
    ]
    assert_that(len(kamagurka), equal_to(1))
    kamagurka_event = kamagurka[0]
    assert_that(kamagurka_event.source,
                equal_to('https://www.spotgroningen.nl/programma'))
    assert_that(kamagurka_event.description,
                equal_to('De overtreffende trap van absurditeit'))
    assert_that(kamagurka_event.date_published, is_not(none()))
    assert_that(
        kamagurka_event.image_url,
        equal_to('https://www.spotgroningen.nl/wp-content/uploads/2019/02/'
                 'Kamagurka-20-20De-20grenzen-20van-20de-20ernst-20'
                 'Kamagurka-202-20300-20dpi-20RGB-150x150.jpg'))
    assert_that(kamagurka_event.title,
                equal_to('Kamagurka - De grenzen van de ernst'))
    assert_that(kamagurka_event.when, is_not(none()))
    assert_that(kamagurka_event.url,
                equal_to('https://www.spotgroningen.nl/programma/kamagurka/'))
    assert_that(kamagurka_event.event_id, is_not(none()))
    assert_that(kamagurka_event.venue, equal_to(venue))

    # `none()` must be *called*: the bare `none` function would be wrapped
    # in an equality matcher against the function object, making these
    # checks pass for any value (including None).
    for event in results:
        assert_that(event.when, is_not(none()))
        assert_that(event.description, is_not(none()))
        assert_that(event.title, is_not(none()))
        assert_that(event.url, is_not(none()))
def test_parse_sample(self):
    """Parse the fetched Simplon content and verify the events.

    Checks the number of parsed events, the exact field values of the
    first event, and that every event carries a non-None ``when``,
    ``description``, ``title`` and ``url``.
    """
    venue = SimplonProcessor.create_venue()
    parser = SimplonParser()
    data = fetch(venue.url)
    results = parser.parse(ParsingContext(venue=venue, content=data))
    assert_that(len(results), equal_to(29))

    first_event = results[0]
    assert_that(first_event.title, equal_to('Foxlane + Car Pets'))
    assert_that(first_event.venue, equal_to(venue))
    assert_that(first_event.description, equal_to('Simplon UP'))
    assert_that(first_event.url,
                equal_to('http://simplon.nl/?post_type=events&p=17602'))
    assert_that(
        first_event.image_url,
        equal_to(
            'https://simplon.nl/content/uploads/2019/03/FOXLANE-MAIN-PRESS-PHOTO-600x600.jpg'
        ))
    assert_that(first_event.when, is_not(none()))
    assert_that(first_event.source, equal_to('https://www.simplon.nl'))
    assert_that(first_event.date_published, is_not(none()))

    # `none()` must be *called*: the bare `none` function would be wrapped
    # in an equality matcher against the function object, making these
    # checks pass for any value (including None).
    for event in results:
        assert_that(event.when, is_not(none()))
        assert_that(event.description, is_not(none()))
        assert_that(event.title, is_not(none()))
        assert_that(event.url, is_not(none()))
def simplon_observer(self, observer: Observer, _) -> Disposable:
    """Emit all Simplon events in a single notification, then complete.

    Fetches ``self.scrape_url``, parses the content for ``self.venue`` and
    pushes the parse result to ``observer`` with one ``on_next`` call
    followed by ``on_completed``. The second positional argument is unused.

    Returns the same ``observer`` that was passed in.
    """
    simplon_parser = SimplonParser()
    page = fetch(self.scrape_url)
    context = ParsingContext(venue=self.venue, content=page)
    observer.on_next(simplon_parser.parse(context))
    observer.on_completed()
    return observer
def test_raw_fetches(self):
    """Parse pages 1 and 2 of the Vera source and check every event.

    For each page, every parsed event must carry a non-None ``when``,
    ``description``, ``title`` and ``url``. No event count is asserted.
    """
    venue = VeraProcessor.create_venue()
    parser = VeraParser()

    # Both pages get identical checks, so iterate instead of duplicating.
    for page in (1, 2):
        data = fetch(f'{venue.url}/page={page}')
        results = parser.parse(ParsingContext(venue=venue, content=data))
        # `none()` must be *called*: the bare `none` function would be
        # wrapped in an equality matcher against the function object,
        # making these checks pass for any value (including None).
        for event in results:
            assert_that(event.when, is_not(none()))
            assert_that(event.description, is_not(none()))
            assert_that(event.title, is_not(none()))
            assert_that(event.url, is_not(none()))
def parse_all_at_once_observable(
        observer: Observer,
        parser: Parser,
        venue: Venue,
        scrape_url: str,
) -> Observer:
    """Fetch one URL, parse it, and emit the result as a single notification.

    The content fetched from ``scrape_url`` is parsed by ``parser`` for
    ``venue``; the parse result is sent to ``observer`` through one
    ``on_next`` call, after which ``on_completed`` is signalled.

    Returns the same ``observer`` that was passed in.
    """
    page = fetch(scrape_url)
    context = ParsingContext(venue=venue, content=page)
    observer.on_next(parser.parse(context))
    observer.on_completed()
    return observer
def test_sample_file_page_2(self):
    """Parse page 2 of the Paradiso source and verify the events.

    Checks that exactly one event is parsed and that it carries a
    non-None ``when``, ``description``, ``title`` and ``url``.
    """
    venue = ParadisoProcessor.create_venue()
    parser = ParadisoParser()
    data = fetch(f'{venue.source_url}/page=2')
    results = parser.parse(ParsingContext(venue=venue, content=data))
    assert_that(len(results), equal_to(1))

    # `none()` must be *called*: the bare `none` function would be wrapped
    # in an equality matcher against the function object, making these
    # checks pass for any value (including None).
    for event in results:
        assert_that(event.when, is_not(none()))
        assert_that(event.description, is_not(none()))
        assert_that(event.title, is_not(none()))
        assert_that(event.url, is_not(none()))
def test_small_sample(self):
    """Parse the fetched Melkweg source and check every event is complete.

    Asserts the number of parsed events, then for each event that
    ``title``, ``description``, ``image_url``, ``source``, ``when`` and
    ``url`` are not None and that ``is_valid()`` returns True.
    """
    venue = MelkwegProcessor.create_venue()
    data = fetch(venue.source_url)
    context = ParsingContext(venue, data)
    events = MelkwegParser().parse(context)
    assert_that(len(events), equal_to(46))

    required_fields = (
        'title', 'description', 'image_url', 'source', 'when', 'url',
    )
    for parsed in events:
        for field in required_fields:
            assert_that(getattr(parsed, field), is_not(none()))
        assert_that(parsed.is_valid(), equal_to(True))
def vera_observer(self, observer: Observer, _) -> Disposable:
    """Emit Vera events page by page, then complete.

    Pages are requested starting at index 1; each URL is built by
    formatting ``self.scrape_url`` with the page index and the page size.
    Every page's parse result is pushed to ``observer`` via ``on_next``.
    Paging stops after the first page that yields fewer events than the
    page size, and ``on_completed`` is then signalled. The second
    positional argument is unused.

    Returns the same ``observer`` that was passed in.
    """
    vera_parser = VeraParser()
    items_per_page = 20
    page_index = 0
    while True:
        page_index += 1
        page_url = self.scrape_url.format(page_index, items_per_page)
        page = fetch(page_url)
        context = ParsingContext(venue=self.venue, content=page)
        page_events = vera_parser.parse(context)
        observer.on_next(page_events)
        # A short page means there is nothing more to request.
        if len(page_events) < items_per_page:
            break
    observer.on_completed()
    return observer
def parse_page_indexed_observable(observer: Observer,
                                  parser: Parser,
                                  venue: Venue,
                                  scrape_url_format: str,
                                  items_per_page: int) -> Observer:
    """Fetch and parse consecutive pages, emitting each page's events.

    Pages are requested starting at index 1; each URL is built by
    formatting ``scrape_url_format`` with the page index. Every page's
    parse result is pushed to ``observer`` via ``on_next``. Paging stops
    after the first page that yields fewer than ``items_per_page`` events,
    and ``on_completed`` is then signalled.

    Returns the same ``observer`` that was passed in.
    """
    page_index = 0
    while True:
        page_index += 1
        page = fetch(scrape_url_format.format(page_index))
        context = ParsingContext(venue=venue, content=page)
        page_events = parser.parse(context)
        observer.on_next(page_events)
        # A short page means there is nothing more to request.
        if len(page_events) < items_per_page:
            break
    observer.on_completed()
    return observer
def test_parse(self):
    """Parse the large Melkweg JSON content and verify the events.

    Asserts the number of parsed events, the exact field values of the
    event titled 'Inna de Yard feat. Horace Andy', and that every event
    has non-None ``title``, ``description``, ``image_url``, ``source``,
    ``when`` and ``url``.
    """
    content = fetch('https://www.melkweg.nl/large-json')
    venue = MelkwegProcessor.create_venue()
    parser = MelkwegParser()
    context = ParsingContext(venue=venue, content=content)
    results = parser.parse(context)
    assert_that(len(results), equal_to(378))

    # Select the event by title rather than by position in the results.
    matching = [
        r for r in results if r.title == 'Inna de Yard feat. Horace Andy'
    ]
    inna_event = matching[0]

    expected_description = (
        'Inna de Yard is het resultaat van een historische ontmoeting van twee generaties Jamaicaanse '
        'zangers en muzikanten tijdens traditionele akoestische jamsessies \'inna de yard\'. Levende legendes '
        'van de gouden jaren van de rootsreggae als Horace Andy, Ken Boothe en Cedric Myton werken samen met '
        'jong talent van het eiland en blazen zo de originele essentie van \'jamrock\' nieuw leven in. '
        'Na het succes van het eerste album in 2017 en enkele geweldige concerten in Parijs, is er nu een '
        'vervolg met een nieuw album en een film. De nieuwe tour is nu al legendarisch en gelukkig slaan '
        'ze Amsterdam niet over!\xa0')
    expected_image_url = (
        'https://s3-eu-west-1.amazonaws.com/static.melkweg.nl/uploads/images/scaled/agenda_thumbnail/25520'
    )
    expected_when = datetime.fromisoformat('2019-06-13T19:30:00+02:00')

    assert_that(inna_event.title, equal_to('Inna de Yard feat. Horace Andy'))
    assert_that(inna_event.description, equal_to(expected_description))
    assert_that(inna_event.image_url, equal_to(expected_image_url))
    assert_that(inna_event.source, equal_to('https://www.melkweg.nl/agenda'))
    assert_that(
        inna_event.url,
        equal_to('https://www.melkweg.nl/nl/agenda/inna-da-yard-13-06-2019'))
    assert_that(inna_event.when, equal_to(expected_when))

    required_fields = (
        'title', 'description', 'image_url', 'source', 'when', 'url',
    )
    for parsed in results:
        for field in required_fields:
            assert_that(getattr(parsed, field), is_not(none()))
def test_sample_file(self):
    """Parse page 1 of the Vera source and verify the events.

    Checks the number of parsed events, the exact field values of the
    first event, the title and description of the third event, and that
    every event carries a non-None ``when``, ``description``, ``title``
    and ``url``.
    """
    venue = VeraProcessor.create_venue()
    parser = VeraParser()
    data = fetch(f'{venue.url}/page=1')
    results = parser.parse(ParsingContext(venue=venue, content=data))
    assert_that(len(results), equal_to(20))

    first_event = results[0]
    assert_that(
        first_event.url,
        equal_to(
            'http://www.vera-groningen.nl/?post_type=events&p=98899&lang=nl'
        ))
    assert_that(first_event.venue, equal_to(venue))
    assert_that(first_event.title,
                equal_to('Beyond Hip Hop (STUDIUM GENERALE PRESENTS)'))
    assert_that(first_event.when, is_not(none()))
    assert_that(
        first_event.image_url,
        equal_to(
            'https://www.vera-groningen.nl/content/uploads/2019/03/rich-medina-website2-360x250.jpg'
        ))
    assert_that(
        first_event.description,
        equal_to('Beyond Hip Hop with support A Lecture By Rich Medina'))
    assert_that(first_event.date_published, is_not(none()))
    assert_that(first_event.source,
                equal_to('https://www.vera-groningen.nl/programma/'))

    third_event = results[2]
    assert_that(
        third_event.description,
        equal_to('Marissa Nadler (USA) with support Klaske Oenema (NL)'))
    assert_that(third_event.title, equal_to('Marissa Nadler (USA)'))

    # `none()` must be *called*: the bare `none` function would be wrapped
    # in an equality matcher against the function object, making these
    # checks pass for any value (including None).
    for event in results:
        assert_that(event.when, is_not(none()))
        assert_that(event.description, is_not(none()))
        assert_that(event.title, is_not(none()))
        assert_that(event.url, is_not(none()))