def _parse_2010(self, response):
    """Yield one loaded speaker item per ``proposal_list_summary`` div.

    The name is taken from the first ``span`` of each div, the year from
    ``response.meta['year']`` (stringified), and the conference is always
    'PyCon US'.
    """
    summaries = Selector(response).xpath(
        '//div[@class="proposal_list_summary"]')
    for summary in summaries:
        loader = SpeakerLoader(selector=summary)
        loader.add_xpath('name', './span[1]')
        loader.add_value('year', str(response.meta['year']))
        loader.add_value('conference', 'PyCon US')
        yield loader.load_item()
def parse_new(self, response):
    """Yield one loaded speaker item per ``.speaker`` node of the archive.

    Image URLs are resolved against ``response.url``; the year is read from
    ``response.meta['cookiejar']`` and the conference is 'EuroPython'.
    """
    def absolutize(urls):
        # Processor handed to the loader: make every image URL absolute.
        return [urljoin(response.url, url) for url in urls]

    page = Selector(response)
    for node in page.css('.archive .talk .speakers > .speaker'):
        loader = SpeakerLoader(selector=node)
        loader.add_value('conference', 'EuroPython')
        loader.add_css('name', "span::text")
        loader.add_css('image_urls', "a > img::attr(src)", absolutize)
        loader.add_value('year', str(response.meta['cookiejar']))
        yield loader.load_item()
def parse(self, response):
    """Yield one loaded speaker item per ``en_speaker_name`` span.

    The extracted span markup is passed as the name value; the year comes
    from ``response.meta['year']`` and the conference is 'OSCON'.
    """
    name_spans = Selector(response).xpath(
        '//span[@class="en_speaker_name"]').extract()
    for raw_name in name_spans:
        loader = SpeakerLoader(response=response)
        loader.add_value('name', raw_name)
        loader.add_value('year', str(response.meta['year']))
        loader.add_value('conference', 'OSCON')
        yield loader.load_item()
def _parse_video(self, response):
    """Yield one loaded speaker item per presenter link on a videos page.

    Conference and year are both read from ``response.meta`` and stored as
    strings.
    """
    presenter_links = Selector(response).xpath(
        "//div[@class = 'videos']//div[@class = 'presenters']/a")
    for link in presenter_links:
        loader = SpeakerLoader(selector=link)
        loader.add_xpath('name', ".")
        loader.add_value('conference', str(response.meta['conference']))
        loader.add_value('year', str(response.meta['year']))
        yield loader.load_item()
def _parse_2014(self, response):
    """Yield one loaded speaker item per ``sched-person`` div.

    The name is taken from the ``h2/a`` child of each div; conference and
    year are read from ``response.meta`` and stored as strings.
    """
    people = Selector(response).xpath("//div[@class='sched-person']")
    for person in people:
        loader = SpeakerLoader(selector=person)
        loader.add_xpath('name', "./h2/a")
        loader.add_value('conference', str(response.meta['conference']))
        loader.add_value('year', str(response.meta['year']))
        yield loader.load_item()
def _parse_2013(self, response):
    """Yield one loaded speaker item per first heading link in ``data-mid2``.

    Each matched anchor itself is used as the name source; conference and
    year are read from ``response.meta`` and stored as strings.
    """
    anchors = Selector(response).xpath(
        "//div[@class='data-mid2']/h2[1]/a[1]")
    for anchor in anchors:
        loader = SpeakerLoader(selector=anchor)
        loader.add_xpath('name', ".")
        loader.add_value('conference', str(response.meta['conference']))
        loader.add_value('year', str(response.meta['year']))
        yield loader.load_item()
def _parse_2006(self, response):
    """Yield one loaded speaker item per node following a ``p[strong]``.

    For every ``p`` containing a ``strong`` inside ``div#content``, the
    immediately following sibling element is used as the name source. The
    conference is always 'PyCon US'.
    """
    page = Selector(response)
    name_nodes = page.xpath(
        '//div[@id="content"]/p[strong]/following-sibling::*[1]')
    for node in name_nodes:
        loader = SpeakerLoader(selector=node)
        loader.add_xpath('name', '.')
        loader.add_value('year', str(response.meta['year']))
        loader.add_value('conference', 'PyCon US')
        yield loader.load_item()
def parse_2010(self, response):
    """Yield one loaded speaker item per comma-separated author.

    Author lists are read from the text of ``ul > li > em`` nodes. The year
    is passed through from ``response.meta['year']`` unchanged and the
    conference is 'SciPy'.
    """
    page = Selector(response)
    author_lists = page.css('ul > li > em::text').extract()
    for author_list in author_lists:
        for name in author_list.split(','):
            loader = SpeakerLoader(selector=page, response=response)
            loader.add_value('name', name)
            loader.add_value('year', response.meta['year'])
            loader.add_value('conference', 'SciPy')
            yield loader.load_item()
def _parse_2013(self, response):
    """Yield a loaded speaker item for each ``data-mid2`` heading link.

    The matched anchor is the name source; conference and year come from
    ``response.meta`` and are stored as strings.
    """
    heading_links_xpath = "//div[@class='data-mid2']/h2[1]/a[1]"
    for heading_link in Selector(response).xpath(heading_links_xpath):
        loader = SpeakerLoader(selector=heading_link)
        loader.add_xpath('name', ".")
        loader.add_value('conference', str(response.meta['conference']))
        loader.add_value('year', str(response.meta['year']))
        yield loader.load_item()
def parse(self, response):
    """Yield speaker items for each mini-profile, then follow pagination.

    Every ``div.mini-profile`` produces one loaded item (conference
    'EuroPython', year from ``response.meta['cookiejar']``). Afterwards a
    ``Request`` is yielded for every pagination link, reusing the current
    ``response.meta``.
    """
    page = Selector(response)
    profiles = page.css('div.mini-profile')
    for profile in profiles:
        loader = SpeakerLoader(selector=profile)
        loader.add_css('name', ".name > a::text")
        loader.add_css('image_urls', "img::attr(src)")
        loader.add_value('year', str(response.meta['cookiejar']))
        loader.add_value('conference', 'EuroPython')
        yield loader.load_item()

    # Follow every pagination link, resolved against the current URL.
    hrefs = page.css('.pagination a::attr(href)').extract()
    for href in hrefs:
        next_url = urljoin(response.url, href)
        yield Request(next_url, meta=response.meta)
def _parse_2014(self, response):
    """Yield a loaded speaker item for each ``sched-person`` div.

    Name comes from the div's ``h2/a``; conference and year come from
    ``response.meta`` and are stored as strings.
    """
    person_xpath = "//div[@class='sched-person']"
    for person_div in Selector(response).xpath(person_xpath):
        loader = SpeakerLoader(selector=person_div)
        loader.add_xpath('name', "./h2/a")
        loader.add_value('conference', str(response.meta['conference']))
        loader.add_value('year', str(response.meta['year']))
        yield loader.load_item()
def _parse_video(self, response):
    """Yield a loaded speaker item for each presenter anchor under ``videos``.

    The anchor itself is the name source; conference and year come from
    ``response.meta`` and are stored as strings.
    """
    presenters_xpath = "//div[@class = 'videos']//div[@class = 'presenters']/a"
    for presenter in Selector(response).xpath(presenters_xpath):
        loader = SpeakerLoader(selector=presenter)
        loader.add_xpath('name', ".")
        loader.add_value('conference', str(response.meta['conference']))
        loader.add_value('year', str(response.meta['year']))
        yield loader.load_item()
def parse(self, response):
    """Yield one loaded speaker item per second-column table cell text.

    The parameter __force_display allows to return all talks without
    pagination. The conference is 'EuroSciPy' and the year is passed through
    from ``response.meta['year']`` unchanged.
    """
    page = Selector(response)
    cell_texts = page.xpath('//tr/td[2]/text()').extract()
    for cell_text in cell_texts:
        loader = SpeakerLoader(selector=page, response=response)
        # TODO: handle/remove affiliation value and possibly multiple
        # authors.
        loader.add_value('conference', 'EuroSciPy')
        loader.add_value('name', cell_text)
        loader.add_value('year', response.meta['year'])
        yield loader.load_item()
def parse_2013(self, response):
    """Yield one loaded speaker item per ';'-separated author.

    Author lists are read from the text of ``.authors`` nodes. The year is
    passed through from ``response.meta['year']`` unchanged and the
    conference is 'SciPy'.
    """
    page = Selector(response)
    # Probably this is the nicest layout of all versions.
    author_lists = page.css('.authors::text').extract()
    for author_list in author_lists:
        # FIXME: a few entries lack the multiple-author separator ';'.
        for name in author_list.split(';'):
            loader = SpeakerLoader(selector=page, response=response)
            # FIXME: most author entries have the institution at the end.
            loader.add_value('name', name)
            loader.add_value('year', response.meta['year'])
            loader.add_value('conference', 'SciPy')
            yield loader.load_item()
def parse_2012(self, response):
    """Yield one loaded speaker item per author found in the registrants table.

    Authors are pulled out of ``#registrants_table`` with a regular
    expression; the placeholder value '--' (no author) is skipped. The year
    is passed through from ``response.meta['year']`` unchanged and the
    conference is 'SciPy'.
    """
    sel = Selector(response)
    # Here we take a pure-regex approach as the layout varies between the
    # entries a little and the author text has a fairly uniform pattern.
    # Raw string: `\s` in a normal literal is an invalid escape sequence.
    author_pattern = r'>\s*-\s*(.+?)\s*(?:$|<)'
    for author in sel.css('#registrants_table').re(author_pattern):
        if author == '--':
            # No author.
            continue
        sl = SpeakerLoader(selector=sel, response=response)
        sl.add_value('name', author)
        sl.add_value('year', response.meta['year'])
        sl.add_value('conference', 'SciPy')
        yield sl.load_item()
def _parse(self, response):
    """Yield one loaded speaker item per ``name`` div inside ``speakers``.

    The matched div itself is the name source; the year comes from
    ``response.meta['year']`` and is stored as a string.
    """
    name_divs = Selector(response).xpath(
        '//div[@class="speakers"]//div[@class="name"]')
    for name_div in name_divs:
        loader = SpeakerLoader(selector=name_div)
        loader.add_xpath('name', '.')
        loader.add_value('year', str(response.meta['year']))
        yield loader.load_item()
def _parse_2013(self, response):
    """Yield one loaded speaker item per div whose class contains 'speaker'.

    The name is taken from the div's ``a.name`` child; the year comes from
    ``response.meta['year']`` and is stored as a string.
    """
    speaker_divs = Selector(response).xpath(
        "//div[contains(@class,'speaker')]")
    for speaker_div in speaker_divs:
        loader = SpeakerLoader(selector=speaker_div)
        loader.add_xpath('name', "./a[@class='name']")
        loader.add_value('year', str(response.meta['year']))
        yield loader.load_item()
def parse_speakers(self, response):
    """Yield one loaded speaker item per div whose class contains 'speaker'.

    The name is the text of ``h5/a[target=_blank]`` links below each div;
    the year comes from ``response.meta['year']`` and is stored as a string.
    """
    page = Selector(response)
    blocks = page.xpath("//div[contains(@class, 'speaker')]")
    for block in blocks:
        il = SpeakerLoader(selector=block)
        il.add_xpath('name', ".//h5/a[@target='_blank']/text()")
        il.add_value('year', str(response.meta['year']))
        yield il.load_item()
def parse_2009(self, response):
    """Yield one loaded speaker item per talk paragraph, first author only.

    The parenthesised author text after each ``<strong>`` title in
    ``.section > p`` is captured by regex and cut at the first comma. The
    year is passed through from ``response.meta['year']`` unchanged and the
    conference is 'SciPy'.
    """
    sel = Selector(response)
    # Raw string: `\(` in a normal literal is an invalid escape sequence.
    author_re = r'<strong>.+</strong>.+\((.+)\)<'
    for authors in sel.css('.section > p').re(author_re):
        # There are a few multiple-author entries, some separated by '&'
        # and others by a comma. The problem comes from entries with
        # author plus institution, i.e.: "Armando Sole, ESRF, France".
        # For now, we extract only the first author.
        author = authors.partition(',')[0]
        sl = SpeakerLoader(selector=sel, response=response)
        sl.add_value('name', author)
        sl.add_value('year', response.meta['year'])
        sl.add_value('conference', 'SciPy')
        yield sl.load_item()
def _parse_workshop_2012(self, response):
    """Yield one loaded speaker item per paragraph text in each speaker div.

    Divs are those whose class contains 'speaker'; every text node of a
    descendant ``p`` becomes a name. The year comes from
    ``response.meta['year']`` and is stored as a string.
    """
    speaker_divs = Selector(response).xpath(
        "//div[contains(@class,'speaker')]")
    for speaker_div in speaker_divs:
        paragraph_texts = speaker_div.xpath(".//p/text()").extract()
        for text in paragraph_texts:
            loader = SpeakerLoader(selector=speaker_div)
            loader.add_value('name', text)
            loader.add_value('year', str(response.meta['year']))
            yield loader.load_item()
def parse_2011(self, response):
    """Yield one loaded speaker item per ``.speakers`` node.

    The name is collected with the ``.speakers::text`` CSS query on each
    node; the year is read from ``response.meta['cookiejar']`` and stored as
    a string.
    """
    page = Selector(response)
    for node in page.css('.speakers'):
        loader = SpeakerLoader(selector=node)
        loader.add_css('name', '.speakers::text')
        loader.add_value('year', str(response.meta['cookiejar']))
        yield loader.load_item()
def parse_2012(self, response):
    """Yield one loaded speaker item per author found in the registrants table.

    Authors are pulled out of ``#registrants_table`` with a regular
    expression; the placeholder value '--' (no author) is skipped. The year
    is passed through from ``response.meta['year']`` unchanged and the
    conference is 'SciPy'.
    """
    sel = Selector(response)
    # Here we take a pure-regex approach as the layout varies between the
    # entries a little and the author text has a fairly uniform pattern.
    # Raw string: `\s` in a normal literal is an invalid escape sequence.
    author_pattern = r'>\s*-\s*(.+?)\s*(?:$|<)'
    for author in sel.css('#registrants_table').re(author_pattern):
        if author == '--':
            # No author.
            continue
        sl = SpeakerLoader(selector=sel, response=response)
        sl.add_value('name', author)
        sl.add_value('year', response.meta['year'])
        sl.add_value('conference', 'SciPy')
        yield sl.load_item()
def parse_2008(self, response):
    """Yield one loaded speaker item per comma-separated author of each talk.

    Each ``.section > p`` text node is expected to look like
    ``"<title> (<authors>)"``; entries that do not match are ignored. The
    year is passed through from ``response.meta['year']`` unchanged and the
    conference is 'SciPy'.
    """
    sel = Selector(response)
    # Raw string: `\(` in a normal literal is an invalid escape sequence.
    talk_author_re = re.compile(r'^(?P<title>.+) \((?P<authors>.+?)\)$')
    for event in sel.css('.section > p::text').extract():
        # For some reason, some entries have the character '\n' between the
        # talk name/author.
        event = event.replace('\n', ' ').strip()
        match = talk_author_re.search(event)
        if not match:
            continue
        # Reuse the match object instead of running the regex a second time.
        for author in match.group('authors').split(','):
            sl = SpeakerLoader(selector=sel, response=response)
            sl.add_value('name', author)
            sl.add_value('year', response.meta['year'])
            sl.add_value('conference', 'SciPy')
            yield sl.load_item()
def _parse_workshop_2013(self, response):
    """Yield one loaded speaker item per name in each workshop heading.

    For every div whose id contains 'workshop', the first ``h2`` text node
    is split into names with ``self._split_names``. Divs without any ``h2``
    text are skipped rather than aborting the whole page with an
    IndexError. The year comes from ``response.meta['year']`` and is stored
    as a string.
    """
    for section in Selector(response).xpath(
            "//div[contains(@id,'workshop')]"):
        headings = section.xpath(".//h2/text()").extract()
        if not headings:
            # No heading text in this div: nothing to split, move on.
            continue
        for name in self._split_names(headings[0]):
            il = SpeakerLoader(selector=section)
            il.add_value('name', name)
            il.add_value('year', str(response.meta['year']))
            yield il.load_item()
def parse_speakers(self, response):
    """Yield a loaded speaker item for each div whose class contains 'speaker'.

    Names are the text of ``h5/a[target=_blank]`` links under the div; the
    year comes from ``response.meta['year']`` and is stored as a string.
    """
    speaker_divs_xpath = "//div[contains(@class, 'speaker')]"
    for div in Selector(response).xpath(speaker_divs_xpath):
        il = SpeakerLoader(selector=div)
        il.add_xpath("name", ".//h5/a[@target='_blank']/text()")
        il.add_value("year", str(response.meta["year"]))
        yield il.load_item()
def _parse_workshop_2012(self, response):
    """Yield a loaded speaker item for every ``p`` text inside speaker divs.

    Speaker divs are those whose class contains 'speaker'. The year comes
    from ``response.meta['year']`` and is stored as a string.
    """
    divs_xpath = "//div[contains(@class,'speaker')]"
    for div in Selector(response).xpath(divs_xpath):
        for speaker_name in div.xpath(".//p/text()").extract():
            loader = SpeakerLoader(selector=div)
            loader.add_value('name', speaker_name)
            loader.add_value('year', str(response.meta['year']))
            yield loader.load_item()
def _parse_workshop_2013(self, response):
    """Yield one loaded speaker item per name in each workshop heading.

    For every div whose id contains 'workshop', the first ``h2`` text node
    is split into names with ``self._split_names``. Divs without any ``h2``
    text are skipped rather than aborting the whole page with an
    IndexError. The year comes from ``response.meta['year']`` and is stored
    as a string.
    """
    for section in Selector(response).xpath("//div[contains(@id,'workshop')]"):
        headings = section.xpath(".//h2/text()").extract()
        if not headings:
            # No heading text in this div: nothing to split, move on.
            continue
        for name in self._split_names(headings[0]):
            il = SpeakerLoader(selector=section)
            il.add_value('name', name)
            il.add_value('year', str(response.meta['year']))
            yield il.load_item()
def _follow_sessions(self, response):
    """Yield one loaded speaker item per ``speakerDetail.ww`` link.

    The link text is the name; the year comes from ``response.meta['year']``
    and is stored as a string.
    """
    detail_links = Selector(response).xpath(
        "//a[contains(@href, 'speakerDetail.ww')]")
    for link in detail_links:
        loader = SpeakerLoader(selector=link)
        loader.add_xpath('name', "./text()")
        loader.add_value('year', str(response.meta['year']))
        yield loader.load_item()
def _parse(self, response):
    """Yield a loaded speaker item for each ``name`` div under ``speakers``.

    The div itself is the name source; the year comes from
    ``response.meta['year']`` and is stored as a string.
    """
    names_xpath = '//div[@class="speakers"]//div[@class="name"]'
    for div in Selector(response).xpath(names_xpath):
        loader = SpeakerLoader(selector=div)
        loader.add_xpath('name', '.')
        loader.add_value('year', str(response.meta['year']))
        yield loader.load_item()
def parse(self, response):
    """Yield speaker items for each mini-profile, then follow pagination.

    Every ``div.mini-profile`` produces one loaded item; the name is
    collected from both ``.name > a`` text and ``.name`` text. The
    conference is 'EuroPython' and the year is read from
    ``response.meta['cookiejar']``. Afterwards a ``Request`` is yielded for
    every pagination link, reusing the current ``response.meta``.
    """
    page = Selector(response)
    profiles = page.css('div.mini-profile')
    for profile in profiles:
        loader = SpeakerLoader(selector=profile)
        # Two name sources: linked names first, then plain-text names.
        loader.add_css('name', ".name > a::text")
        loader.add_css('name', ".name::text")
        loader.add_css('image_urls', "img::attr(src)")
        loader.add_value('year', str(response.meta['cookiejar']))
        loader.add_value('conference', 'EuroPython')
        yield loader.load_item()

    # Follow every pagination link, resolved against the current URL.
    hrefs = page.css('.pagination a::attr(href)').extract()
    for href in hrefs:
        next_url = urljoin(response.url, href)
        yield Request(next_url, meta=response.meta)
def _parse_2013(self, response):
    """Yield a loaded speaker item for each div whose class contains 'speaker'.

    Name comes from the div's ``a.name`` child; the year comes from
    ``response.meta['year']`` and is stored as a string.
    """
    divs_xpath = "//div[contains(@class,'speaker')]"
    for div in Selector(response).xpath(divs_xpath):
        loader = SpeakerLoader(selector=div)
        loader.add_xpath('name', "./a[@class='name']")
        loader.add_value('year', str(response.meta['year']))
        yield loader.load_item()
def parse_old_format(self, response):
    """Yield OSCON speaker items from two places in the old page layout.

    First every ``h3`` inside a ``speaker-blurb`` div is emitted, then the
    text of every ``span > a`` link whose href contains 'e_spkr' (with any
    'N/A' removed from the text). The year comes from
    ``response.meta['year']`` and is stored as a string.
    """
    page = Selector(response)

    blurb_headings = page.xpath('//div[@class="speaker-blurb"]//h3').extract()
    for heading in blurb_headings:
        loader = SpeakerLoader(response=response)
        loader.add_value('name', heading)
        loader.add_value('year', str(response.meta['year']))
        loader.add_value('conference', 'OSCON')
        yield loader.load_item()

    link_texts = page.xpath(
        '//span/a[contains(@href, "e_spkr")]//text()').extract()
    for text in link_texts:
        loader = SpeakerLoader(response=response)
        cleaned = text.replace('N/A', '')
        loader.add_value('name', cleaned)
        loader.add_value('year', str(response.meta['year']))
        loader.add_value('conference', 'OSCON')
        yield loader.load_item()
def _follow_speakers(self, response):
    """Yield a single loaded item built from all speaker-profile links.

    One loader is created for the whole response; its name field is fed by
    every anchor whose href contains '/speaker/profile/'. The year comes
    from ``response.meta['year']`` and is stored as a string.
    """
    profile_links_xpath = "//a[contains(@href, '/speaker/profile/')]"
    loader = SpeakerLoader(response=response)
    loader.add_xpath('name', profile_links_xpath)
    loader.add_value('year', str(response.meta['year']))
    yield loader.load_item()
def _follow_speakers(self, response):
    """Yield a single loaded 'PyCon US' item built from all profile links.

    One loader is created for the whole response; its name field is fed by
    every anchor whose href contains '/speaker/profile/'. The year comes
    from ``response.meta['year']`` and is stored as a string.
    """
    profile_links_xpath = "//a[contains(@href, '/speaker/profile/')]"
    loader = SpeakerLoader(response=response)
    loader.add_xpath('name', profile_links_xpath)
    loader.add_value('year', str(response.meta['year']))
    loader.add_value('conference', 'PyCon US')
    yield loader.load_item()