Example #1
0
 def _parse_2010(self, response):
     """Yield one 'PyCon US' speaker item per ``proposal_list_summary`` div.

     The name is read from the first ``span`` child of each div and the
     year comes from ``response.meta['year']``.
     """
     root = Selector(response)
     summaries = root.xpath('//div[@class="proposal_list_summary"]')
     for summary in summaries:
         loader = SpeakerLoader(selector=summary)
         loader.add_xpath('name', './span[1]')
         loader.add_value('year', str(response.meta['year']))
         loader.add_value('conference', 'PyCon US')
         yield loader.load_item()
 def parse_new(self, response):
     """Yield an 'EuroPython' speaker item per ``.speaker`` archive entry.

     The name comes from the entry's ``span`` text and image URLs are
     resolved against ``response.url``.
     """
     # NOTE(review): the year is read from the 'cookiejar' meta key --
     # presumably the spider uses the year as the cookiejar id; confirm.

     def absolute(urls):
         # Turn every (possibly relative) image src into an absolute URL.
         return [urljoin(response.url, url) for url in urls]

     root = Selector(response)
     for node in root.css('.archive .talk .speakers > .speaker'):
         loader = SpeakerLoader(selector=node)
         loader.add_value('conference', 'EuroPython')
         loader.add_css('name', "span::text")
         loader.add_css('image_urls', "a > img::attr(src)", absolute)
         loader.add_value('year', str(response.meta['cookiejar']))
         yield loader.load_item()
Example #3
0
 def parse(self, response):
     """Yield an 'OSCON' speaker item for every ``en_speaker_name`` span.

     The extracted content of each span is handed to the loader as the
     name value; the year comes from ``response.meta['year']``.
     """
     name_spans = Selector(response).xpath('//span[@class="en_speaker_name"]')
     for extracted in name_spans.extract():
         loader = SpeakerLoader(response=response)
         loader.add_value('name', extracted)
         loader.add_value('year', str(response.meta['year']))
         loader.add_value('conference', 'OSCON')
         yield loader.load_item()
Example #4
0
 def _parse_video(self, response):
     """Yield a speaker item for each presenter link in the videos section.

     Each ``a`` element under a ``presenters`` div (itself inside a
     ``videos`` div) supplies the name; conference and year are copied
     from ``response.meta``.
     """
     presenter_xpath = "//div[@class = 'videos']//div[@class = 'presenters']/a"
     root = Selector(response)
     for link in root.xpath(presenter_xpath):
         loader = SpeakerLoader(selector=link)
         loader.add_xpath('name', ".")
         loader.add_value('conference', str(response.meta['conference']))
         loader.add_value('year', str(response.meta['year']))
         yield loader.load_item()
Example #5
0
 def _parse_2014(self, response):
     """Yield a speaker item for every ``sched-person`` div.

     The name is taken from the ``h2/a`` child of the div; conference and
     year are copied from ``response.meta``.
     """
     root = Selector(response)
     people = root.xpath("//div[@class='sched-person']")
     for person in people:
         loader = SpeakerLoader(selector=person)
         loader.add_xpath('name', "./h2/a")
         loader.add_value('conference', str(response.meta['conference']))
         loader.add_value('year', str(response.meta['year']))
         yield loader.load_item()
Example #6
0
 def _parse_2013(self, response):
     """Yield a speaker item for each heading link inside ``data-mid2`` divs.

     The first ``a`` of the first ``h2`` in every ``data-mid2`` div supplies
     the name; conference and year are copied from ``response.meta``.
     """
     root = Selector(response)
     heading_links = root.xpath("//div[@class='data-mid2']/h2[1]/a[1]")
     for link in heading_links:
         loader = SpeakerLoader(selector=link)
         loader.add_xpath('name', ".")
         loader.add_value('conference', str(response.meta['conference']))
         loader.add_value('year', str(response.meta['year']))
         yield loader.load_item()
Example #7
0
 def _parse_2006(self, response):
     """Yield a 'PyCon US' speaker item per element following a bold paragraph.

     For every ``p`` with a ``strong`` child directly under the ``content``
     div, the first following sibling element is used as the name node.
     """
     name_xpath = '//div[@id="content"]/p[strong]/following-sibling::*[1]'
     root = Selector(response)
     for sibling in root.xpath(name_xpath):
         loader = SpeakerLoader(selector=sibling)
         loader.add_xpath('name', '.')
         loader.add_value('year', str(response.meta['year']))
         loader.add_value('conference', 'PyCon US')
         yield loader.load_item()
Example #8
0
 def parse_2010(self, response):
     """Yield a 'SciPy' speaker item for each comma-separated author.

     Author lists are the text nodes of ``ul > li > em``; each list is split
     on ',' and every piece becomes its own item.
     """
     root = Selector(response)
     author_lists = root.css('ul > li > em::text').extract()
     for author_list in author_lists:
         pieces = author_list.split(',')
         for piece in pieces:
             loader = SpeakerLoader(selector=root, response=response)
             loader.add_value('name', piece)
             loader.add_value('year', response.meta['year'])
             loader.add_value('conference', 'SciPy')
             yield loader.load_item()
 def _parse_2013(self, response):
     """Emit one speaker item per first-heading link of a ``data-mid2`` div.

     Conference and year are stringified copies of the matching
     ``response.meta`` entries.
     """
     link_xpath = "//div[@class='data-mid2']/h2[1]/a[1]"
     for anchor in Selector(response).xpath(link_xpath):
         speaker_loader = SpeakerLoader(selector=anchor)
         speaker_loader.add_xpath('name', ".")
         speaker_loader.add_value('conference', str(response.meta['conference']))
         speaker_loader.add_value('year', str(response.meta['year']))
         yield speaker_loader.load_item()
Example #10
0
 def parse(self, response):
     """Yield 'EuroPython' speaker items, then requests for pagination links.

     One item is produced per ``div.mini-profile``. Afterwards every href
     in the ``.pagination`` block is joined with ``response.url`` and
     requested with the same ``meta``.
     """
     # NOTE(review): the year is read from the 'cookiejar' meta key --
     # presumably the spider uses the year as the cookiejar id; confirm.
     root = Selector(response)
     for profile in root.css('div.mini-profile'):
         loader = SpeakerLoader(selector=profile)
         loader.add_css('name', ".name > a::text")
         loader.add_css('image_urls', "img::attr(src)")
         loader.add_value('year', str(response.meta['cookiejar']))
         loader.add_value('conference', 'EuroPython')
         yield loader.load_item()

     # Follow every pagination link found on the page.
     hrefs = root.css('.pagination a::attr(href)').extract()
     for href in hrefs:
         next_url = urljoin(response.url, href)
         yield Request(next_url, meta=response.meta)
 def _parse_2014(self, response):
     """Emit one speaker item per ``sched-person`` div on the page.

     The ``h2/a`` child provides the name; conference and year are
     stringified copies of the matching ``response.meta`` entries.
     """
     person_xpath = "//div[@class='sched-person']"
     for block in Selector(response).xpath(person_xpath):
         speaker_loader = SpeakerLoader(selector=block)
         speaker_loader.add_xpath('name', "./h2/a")
         speaker_loader.add_value('conference', str(response.meta['conference']))
         speaker_loader.add_value('year', str(response.meta['year']))
         yield speaker_loader.load_item()
 def _parse_video(self, response):
     """Emit one speaker item per presenter link of the videos listing.

     Conference and year are stringified copies of the matching
     ``response.meta`` entries.
     """
     root = Selector(response)
     presenters = root.xpath(
         "//div[@class = 'videos']//div[@class = 'presenters']/a")
     for presenter in presenters:
         speaker_loader = SpeakerLoader(selector=presenter)
         speaker_loader.add_xpath('name', ".")
         speaker_loader.add_value('conference', str(response.meta['conference']))
         speaker_loader.add_value('year', str(response.meta['year']))
         yield speaker_loader.load_item()
 def parse(self, response):
     """Emit one 'OSCON' speaker item per ``en_speaker_name`` span.

     Each span is extracted to a string and passed to the loader as the
     name; the year is ``str(response.meta['year'])``.
     """
     root = Selector(response)
     extracted_spans = root.xpath('//span[@class="en_speaker_name"]').extract()
     for span_text in extracted_spans:
         speaker_loader = SpeakerLoader(response=response)
         speaker_loader.add_value('name', span_text)
         speaker_loader.add_value('year', str(response.meta['year']))
         speaker_loader.add_value('conference', 'OSCON')
         yield speaker_loader.load_item()
Example #14
0
 def parse(self, response):
     """Yield an 'EuroSciPy' speaker item per second-column table cell text.

     Every text node matched by ``//tr/td[2]/text()`` is used as a name;
     the year is passed through from ``response.meta['year']`` unchanged.
     """
     # The __force_display parameter allows all talks to be returned
     # without pagination.
     root = Selector(response)
     cell_texts = root.xpath('//tr/td[2]/text()').extract()
     for cell_text in cell_texts:
         loader = SpeakerLoader(selector=root, response=response)
         # TODO: handle/remove affiliation value and possibly multiple
         # authors.
         loader.add_value('conference', 'EuroSciPy')
         loader.add_value('name', cell_text)
         loader.add_value('year', response.meta['year'])
         yield loader.load_item()
 def parse_2010(self, response):
     """Emit one 'SciPy' speaker item for every author named in the listing.

     The ``em`` text under each ``ul > li`` holds a comma-separated author
     list; every comma-delimited piece is loaded as a separate name.
     """
     page = Selector(response)
     for em_text in page.css('ul > li > em::text').extract():
         for candidate in em_text.split(','):
             speaker_loader = SpeakerLoader(selector=page, response=response)
             speaker_loader.add_value('name', candidate)
             speaker_loader.add_value('year', response.meta['year'])
             speaker_loader.add_value('conference', 'SciPy')
             yield speaker_loader.load_item()
Example #16
0
    def parse_2013(self, response):
        """Yield a 'SciPy' speaker item for each ';'-separated author.

        Author lists are the text nodes of ``.authors`` elements; each list
        is split on ';' and every piece becomes its own item.
        """
        root = Selector(response)
        # Probably this is the nicest layout of all versions.
        author_lists = root.css('.authors::text').extract()
        for author_list in author_lists:
            # FIXME: a few entries lack the multiple-author separator ';'.
            pieces = author_list.split(';')
            for piece in pieces:
                loader = SpeakerLoader(selector=root, response=response)
                # FIXME: most author entries have the institution at the end.
                loader.add_value('name', piece)
                loader.add_value('year', response.meta['year'])
                loader.add_value('conference', 'SciPy')

                yield loader.load_item()
Example #17
0
 def parse_2012(self, response):
     """Yield a 'SciPy' speaker item per author found in the registrants table.

     Authors are pulled out of the ``#registrants_table`` markup with a
     regular expression; matches equal to '--' (no author) are skipped.
     The year is passed through from ``response.meta['year']`` unchanged.
     """
     sel = Selector(response)
     # Here we take a pure-regex approach as the layout varies between the
     # entries a little and the authors text have a fair uniform pattern.
     # The pattern is a raw string so the regex escapes are not treated as
     # (invalid) Python string escapes, which newer interpreters warn about.
     author_re = r'>\s*-\s*(.+?)\s*(?:$|<)'
     for author in sel.css('#registrants_table').re(author_re):
         if author == '--':  # No author.
             continue
         sl = SpeakerLoader(selector=sel, response=response)
         sl.add_value('name', author)
         sl.add_value('year', response.meta['year'])
         sl.add_value('conference', 'SciPy')
         yield sl.load_item()
Example #18
0
 def _parse(self, response):
     """Yield a speaker item for every ``name`` div inside a ``speakers`` div.

     Only the name and the year (``str(response.meta['year'])``) are set
     here.
     """
     root = Selector(response)
     name_divs = root.xpath('//div[@class="speakers"]//div[@class="name"]')
     for name_div in name_divs:
         loader = SpeakerLoader(selector=name_div)
         loader.add_xpath('name', '.')
         loader.add_value('year', str(response.meta['year']))
         yield loader.load_item()
Example #19
0
 def _parse_2013(self, response):
     """Yield a speaker item per div whose class contains 'speaker'.

     The name comes from the div's ``a`` child with class ``name``; the
     year is ``str(response.meta['year'])``.
     """
     root = Selector(response)
     speaker_divs = root.xpath("//div[contains(@class,'speaker')]")
     for speaker_div in speaker_divs:
         loader = SpeakerLoader(selector=speaker_div)
         loader.add_xpath('name', "./a[@class='name']")
         loader.add_value('year', str(response.meta['year']))
         yield loader.load_item()
Example #20
0
 def parse_speakers(self, response):
     """Yield a speaker item per div whose class contains 'speaker'.

     The name is the text of ``h5/a`` links opening in a new tab
     (``target='_blank'``) anywhere below the div.
     """
     speaker_divs = Selector(response).xpath("//div[contains(@class, 'speaker')]")
     for div in speaker_divs:
         il = SpeakerLoader(selector=div)
         il.add_xpath('name', ".//h5/a[@target='_blank']/text()")
         il.add_value('year', str(response.meta['year']))
         yield il.load_item()
Example #21
0
 def parse_2009(self, response):
     """Yield a 'SciPy' speaker item for the first author of each talk entry.

     Entries are ``.section > p`` elements whose markup contains a
     ``<strong>`` part followed by a parenthesised author list. Only the
     text before the first comma of that list is used as the name.
     """
     sel = Selector(response)
     # Raw string so the escaped parentheses reach the regex engine without
     # being treated as (invalid) Python string escapes.
     author_re = r'<strong>.+</strong>.+\((.+)\)<'
     for authors in sel.css('.section > p').re(author_re):
         # There are a few multiple-author entries, some of them separated
         # by '&' and others with a comma. The problem comes from entries
         # with author plus institution, i.e.: "Armando Sole, ESRF, France".
         # For now, we extract only the first author.
         author = authors.partition(',')[0]
         sl = SpeakerLoader(selector=sel, response=response)
         sl.add_value('name', author)
         sl.add_value('year', response.meta['year'])
         sl.add_value('conference', 'SciPy')
         yield sl.load_item()
Example #22
0
 def _parse_workshop_2012(self, response):
     """Yield a speaker item for each paragraph text inside a speaker div.

     Every text node of every ``p`` below a div whose class contains
     'speaker' is loaded as a name.
     """
     root = Selector(response)
     for speaker_div in root.xpath("//div[contains(@class,'speaker')]"):
         paragraph_texts = speaker_div.xpath(".//p/text()").extract()
         for text in paragraph_texts:
             loader = SpeakerLoader(selector=speaker_div)
             loader.add_value('name', text)
             loader.add_value('year', str(response.meta['year']))
             yield loader.load_item()
Example #23
0
 def parse_2011(self, response):
     """Yield a speaker item for each ``.speakers`` element on the page.

     The element's own text is used as the name.
     """
     # NOTE(review): the year is read from the 'cookiejar' meta key --
     # presumably the spider uses the year as the cookiejar id; confirm.
     root = Selector(response)
     for node in root.css('.speakers'):
         loader = SpeakerLoader(selector=node)
         loader.add_css('name', '.speakers::text')
         loader.add_value('year', str(response.meta['cookiejar']))
         yield loader.load_item()
 def parse_2012(self, response):
     """Emit one 'SciPy' speaker item per author matched in the registrants table.

     A regular expression is applied to the ``#registrants_table`` markup;
     the placeholder '--' (no author) is skipped. The year is forwarded
     from ``response.meta['year']`` as-is.
     """
     sel = Selector(response)
     # Here we take a pure-regex approach as the layout varies between the
     # entries a little and the authors text have a fair uniform pattern.
     # Written as a raw string: a plain literal would contain invalid string
     # escape sequences, which newer Python versions warn about.
     pattern = r'>\s*-\s*(.+?)\s*(?:$|<)'
     for author in sel.css('#registrants_table').re(pattern):
         if author == '--':  # No author.
             continue
         sl = SpeakerLoader(selector=sel, response=response)
         sl.add_value('name', author)
         sl.add_value('year', response.meta['year'])
         sl.add_value('conference', 'SciPy')
         yield sl.load_item()
Example #25
0
 def parse_2008(self, response):
     """Yield a 'SciPy' speaker item for each author of each listed talk.

     Talk entries are the text nodes of ``.section > p`` and are expected
     to look like "Title (Author One, Author Two)". Entries that do not
     match are skipped; the parenthesised part is split on ',' and each
     piece becomes one item.
     """
     sel = Selector(response)
     # Raw string so the escaped parentheses are regex escapes rather than
     # (invalid) Python string escapes.
     talk_author_re = re.compile(r'^(?P<title>.+) \((?P<authors>.+?)\)$')
     for event in sel.css('.section > p::text').extract():
         # For some reason, some entries have the character '\n' between the
         # talk name/author.
         event = event.replace('\n', ' ').strip()
         match = talk_author_re.search(event)
         if not match:
             continue
         # Reuse the match object instead of running the regex a second time.
         for author in match.group('authors').split(','):
             sl = SpeakerLoader(selector=sel, response=response)
             sl.add_value('name', author)
             sl.add_value('year', response.meta['year'])
             sl.add_value('conference', 'SciPy')
             yield sl.load_item()
Example #26
0
 def _parse_workshop_2013(self, response):
     """Yield a speaker item for each name in a workshop's first heading text.

     For every div whose id contains 'workshop', the first ``h2`` text node
     is split with ``self._split_names`` and each resulting name is loaded.
     Divs without any ``h2`` text are skipped.
     """
     for section in Selector(response).xpath(
             "//div[contains(@id,'workshop')]"):
         headings = section.xpath(".//h2/text()").extract()
         if not headings:
             # Indexing [0] on an empty result would raise IndexError and
             # abort the whole callback; skip heading-less divs instead.
             continue
         for name in self._split_names(headings[0]):
             il = SpeakerLoader(selector=section)
             il.add_value('name', name)
             il.add_value('year', str(response.meta['year']))
             yield il.load_item()
    def parse_2013(self, response):
        """Emit one 'SciPy' speaker item per semicolon-delimited author.

        The text of every ``.authors`` element is split on ';'; each piece
        is loaded as a name together with ``response.meta['year']``.
        """
        page = Selector(response)
        # Probably this is the nicest layout of all versions.
        for authors_text in page.css('.authors::text').extract():
            # FIXME: a few entries lack the multiple-author separator ';'.
            for candidate in authors_text.split(';'):
                speaker_loader = SpeakerLoader(selector=page, response=response)
                # FIXME: most author entries have the institution at the end.
                speaker_loader.add_value('name', candidate)
                speaker_loader.add_value('year', response.meta['year'])
                speaker_loader.add_value('conference', 'SciPy')

                yield speaker_loader.load_item()
Example #28
0
 def parse_speakers(self, response):
     """Emit one speaker item for each div with 'speaker' in its class.

     Names are read from the text of ``h5/a[@target='_blank']`` links
     below the div; the year is ``str(response.meta['year'])``.
     """
     name_xpath = ".//h5/a[@target='_blank']/text()"
     root = Selector(response)
     for block in root.xpath("//div[contains(@class, 'speaker')]"):
         speaker_loader = SpeakerLoader(selector=block)
         speaker_loader.add_xpath("name", name_xpath)
         speaker_loader.add_value("year", str(response.meta["year"]))
         yield speaker_loader.load_item()
 def parse_2009(self, response):
     """Emit one 'SciPy' speaker item for the first author of every talk.

     The markup of each ``.section > p`` element is matched against a
     regular expression capturing the parenthesised author list; only the
     part before the first comma is loaded as the name.
     """
     sel = Selector(response)
     # Written as a raw string: a plain literal would contain invalid string
     # escape sequences, which newer Python versions warn about.
     author_re = r'<strong>.+</strong>.+\((.+)\)<'
     for authors in sel.css('.section > p').re(author_re):
         # There are a few multiple-author entries, some of them separated
         # by '&' and others with a comma. The problem comes from entries
         # with author plus institution, i.e.: "Armando Sole, ESRF, France".
         # For now, we extract only the first author.
         first_author = authors.partition(',')[0]
         sl = SpeakerLoader(selector=sel, response=response)
         sl.add_value('name', first_author)
         sl.add_value('year', response.meta['year'])
         sl.add_value('conference', 'SciPy')
         yield sl.load_item()
 def _parse_workshop_2012(self, response):
     """Emit one speaker item per paragraph text node in each speaker div.

     Speaker divs are those whose class attribute contains 'speaker'; the
     year is ``str(response.meta['year'])``.
     """
     speaker_divs = Selector(response).xpath("//div[contains(@class,'speaker')]")
     for block in speaker_divs:
         for paragraph in block.xpath(".//p/text()").extract():
             speaker_loader = SpeakerLoader(selector=block)
             speaker_loader.add_value('name', paragraph)
             speaker_loader.add_value('year', str(response.meta['year']))
             yield speaker_loader.load_item()
 def _parse_workshop_2013(self, response):
     """Emit one speaker item per name in each workshop div's first heading.

     Workshop divs are those whose id contains 'workshop'. The first ``h2``
     text node is passed to ``self._split_names`` and every returned name is
     loaded. A div with no ``h2`` text is skipped.
     """
     for section in Selector(response).xpath("//div[contains(@id,'workshop')]"):
         heading_texts = section.xpath(".//h2/text()").extract()
         if not heading_texts:
             # Nothing to split; taking [0] here would raise IndexError and
             # stop the remaining workshop divs from being processed.
             continue
         for name in self._split_names(heading_texts[0]):
             il = SpeakerLoader(selector=section)
             il.add_value('name', name)
             il.add_value('year', str(response.meta['year']))
             yield il.load_item()
	def parse_2011(self, response):
		"""Emit one speaker item per ``.speakers`` element.

		The element's own text provides the name.
		"""
		# NOTE(review): the year is read from the 'cookiejar' meta key --
		# presumably the spider uses the year as the cookiejar id; confirm.
		page = Selector(response)
		for entry in page.css('.speakers'):
			speaker_loader = SpeakerLoader(selector=entry)
			speaker_loader.add_css('name', '.speakers::text')
			speaker_loader.add_value('year', str(response.meta['cookiejar']))
			yield speaker_loader.load_item()
 def parse_2008(self, response):
     """Emit one 'SciPy' speaker item per author of every matching talk entry.

     Each ``.section > p`` text node is normalised (newlines replaced by
     spaces, then stripped) and matched against "Title (authors)". The
     authors part is split on ',' and each piece is loaded as a name;
     non-matching entries are ignored.
     """
     sel = Selector(response)
     # Written as a raw string: a plain literal would contain invalid string
     # escape sequences, which newer Python versions warn about.
     talk_author_re = re.compile(r'^(?P<title>.+) \((?P<authors>.+?)\)$')
     for event in sel.css('.section > p::text').extract():
         # For some reason, some entries have the character '\n' between the
         # talk name/author.
         event = event.replace('\n', ' ').strip()
         m = talk_author_re.search(event)
         if m is None:
             continue
         # The first search result is reused; the original searched twice.
         authors = m.group('authors')
         for author in authors.split(','):
             sl = SpeakerLoader(selector=sel, response=response)
             sl.add_value('name', author)
             sl.add_value('year', response.meta['year'])
             sl.add_value('conference', 'SciPy')
             yield sl.load_item()
 def _follow_sessions(self, response):
     """Yield a speaker item for each link pointing at 'speakerDetail.ww'.

     The link text is used as the name; the year is
     ``str(response.meta['year'])``.
     """
     root = Selector(response)
     detail_links = root.xpath("//a[contains(@href, 'speakerDetail.ww')]")
     for link in detail_links:
         loader = SpeakerLoader(selector=link)
         loader.add_xpath('name', "./text()")
         loader.add_value('year', str(response.meta['year']))
         yield loader.load_item()
 def _parse(self, response):
     """Emit one speaker item per ``name`` div nested in a ``speakers`` div.

     The div itself is the name source; the year is
     ``str(response.meta['year'])``.
     """
     name_xpath = '//div[@class="speakers"]//div[@class="name"]'
     for entry in Selector(response).xpath(name_xpath):
         speaker_loader = SpeakerLoader(selector=entry)
         speaker_loader.add_xpath('name', '.')
         speaker_loader.add_value('year', str(response.meta['year']))
         yield speaker_loader.load_item()
Example #36
0
 def parse(self, response):
     """Yield 'EuroPython' speaker items, then requests for pagination links.

     One item is produced per ``div.mini-profile``; the name is collected
     from both the ``.name > a`` text and the ``.name`` text. Afterwards
     every ``.pagination`` href is joined with ``response.url`` and
     requested with the same ``meta``.
     """
     # NOTE(review): the year is read from the 'cookiejar' meta key --
     # presumably the spider uses the year as the cookiejar id; confirm.
     root = Selector(response)
     for profile in root.css('div.mini-profile'):
         loader = SpeakerLoader(selector=profile)
         # Two selectors feed the same field: linked and unlinked names.
         loader.add_css('name', ".name > a::text")
         loader.add_css('name', ".name::text")
         loader.add_css('image_urls', "img::attr(src)")
         loader.add_value('year', str(response.meta['cookiejar']))
         loader.add_value('conference', 'EuroPython')
         yield loader.load_item()

     # Follow every pagination link found on the page.
     hrefs = root.css('.pagination a::attr(href)').extract()
     for href in hrefs:
         next_url = urljoin(response.url, href)
         yield Request(next_url, meta=response.meta)
Example #37
0
 def parse_new(self, response):
     """Emit one 'EuroPython' speaker item per ``.speaker`` in the archive.

     Names come from ``span`` text; image ``src`` values are joined with
     ``response.url`` before being stored as ``image_urls``.
     """
     # NOTE(review): the year is read from the 'cookiejar' meta key --
     # presumably the spider uses the year as the cookiejar id; confirm.
     base_url_joiner = lambda srcs: [urljoin(response.url, src) for src in srcs]
     page = Selector(response)
     entries = page.css('.archive .talk .speakers > .speaker')
     for entry in entries:
         speaker_loader = SpeakerLoader(selector=entry)
         speaker_loader.add_value('conference', 'EuroPython')
         speaker_loader.add_css('name', "span::text")
         speaker_loader.add_css('image_urls', "a > img::attr(src)",
                                base_url_joiner)
         speaker_loader.add_value('year', str(response.meta['cookiejar']))
         yield speaker_loader.load_item()
 def _parse_2013(self, response):
     """Emit one speaker item for each div with 'speaker' in its class.

     The ``a`` child with class ``name`` provides the name; the year is
     ``str(response.meta['year'])``.
     """
     page = Selector(response)
     for block in page.xpath("//div[contains(@class,'speaker')]"):
         speaker_loader = SpeakerLoader(selector=block)
         speaker_loader.add_xpath('name', "./a[@class='name']")
         speaker_loader.add_value('year', str(response.meta['year']))
         yield speaker_loader.load_item()
Example #39
0
 def parse_old_format(self, response):
     """Yield 'OSCON' speaker items from two different page layouts.

     First every ``h3`` inside a ``speaker-blurb`` div is loaded as a name,
     then the text of every ``span/a`` link whose href contains 'e_spkr',
     with any 'N/A' removed from it.
     """

     def build_item(name):
         # Shared item construction for both layouts.
         loader = SpeakerLoader(response=response)
         loader.add_value('name', name)
         loader.add_value('year', str(response.meta['year']))
         loader.add_value('conference', 'OSCON')
         return loader.load_item()

     root = Selector(response)
     blurb_headings = root.xpath('//div[@class="speaker-blurb"]//h3').extract()
     for heading in blurb_headings:
         yield build_item(heading)

     link_texts = root.xpath(
         '//span/a[contains(@href, "e_spkr")]//text()').extract()
     for link_text in link_texts:
         yield build_item(link_text.replace('N/A', ''))
 def parse_old_format(self, response):
     """Emit 'OSCON' speaker items from blurb headings and speaker links.

     The first pass loads each ``h3`` found in a ``speaker-blurb`` div; the
     second pass loads the text of ``span/a`` links whose href contains
     'e_spkr', after removing 'N/A' from it.
     """
     page = Selector(response)

     # Pass 1: headings inside speaker blurbs.
     for blurb_heading in page.xpath(
             '//div[@class="speaker-blurb"]//h3').extract():
         heading_loader = SpeakerLoader(response=response)
         heading_loader.add_value('name', blurb_heading)
         heading_loader.add_value('year', str(response.meta['year']))
         heading_loader.add_value('conference', 'OSCON')
         yield heading_loader.load_item()

     # Pass 2: text of links that point at speaker pages.
     for anchor_text in page.xpath(
             '//span/a[contains(@href, "e_spkr")]//text()').extract():
         cleaned = anchor_text.replace('N/A', '')
         link_loader = SpeakerLoader(response=response)
         link_loader.add_value('name', cleaned)
         link_loader.add_value('year', str(response.meta['year']))
         link_loader.add_value('conference', 'OSCON')
         yield link_loader.load_item()
Example #41
0
 def _follow_speakers(self, response):
     """Yield a single speaker item built from the whole response.

     Names are collected from every link whose href contains
     '/speaker/profile/'; the year is ``str(response.meta['year'])``.
     """
     profile_link_xpath = "//a[contains(@href, '/speaker/profile/')]"
     loader = SpeakerLoader(response=response)
     loader.add_xpath('name', profile_link_xpath)
     loader.add_value('year', str(response.meta['year']))
     yield loader.load_item()
Example #42
0
 def _follow_speakers(self, response):
     """Emit one 'PyCon US' speaker item for the response as a whole.

     All links with '/speaker/profile/' in their href feed the name field;
     the year is ``str(response.meta['year'])``.
     """
     speaker_loader = SpeakerLoader(response=response)
     speaker_loader.add_xpath(
         'name', "//a[contains(@href, '/speaker/profile/')]")
     year = str(response.meta['year'])
     speaker_loader.add_value('year', year)
     speaker_loader.add_value('conference', 'PyCon US')
     yield speaker_loader.load_item()