def parse(self, response):
    """Extract field-of-study labels from a Google Scholar author listing.

    For each author entry on the page, pull the ``label:`` fragments out of
    the profile links and yield one FOSItem per label.  Then reconstruct the
    "next page" URL from the pagination button, queue it, and follow a
    random queued URL so the access pattern against Google Scholar stays
    randomized.
    """
    # for each author ID on the page, create a new authorItem
    for ids in response.xpath(
            '//*[@id="gsc_ccl"]/div/div/div[@class="gsc_1usr_int"]'):
        full = ids.extract()
        # profile links embed the field of study as "...=label:<name>"
        fos = re.findall(r'=label:([^"]+)"', full)
        if fos:
            for f in fos:
                it = ItemLoader(item=FOSItem(), response=response)
                self.logger.debug(f)
                it.add_value('field_name', f)
                yield it.load_item()

    # generate next url: the pagination button's onclick attribute carries
    # the follow-up URL with '=' and '&' escaped as \x3d and \x26
    new1 = response.xpath(
        '//*[@id="gsc_authors_bottom_pag"]/span/button[2]').extract_first()
    if new1:
        new2 = re.search('mauthors(.*)\'"', new1)
        if new2:
            new_url = str(new2.group(1)).replace('\\x3d', '=').replace(
                '\\x26', '&')
            self.container.append(self.base_url + new_url)

    # proceed with another random url to randomize access pattern to
    # gscholar (renamed from `next`, which shadowed the builtin)
    next_url = utils.pop_random(self.container)
    if next_url:
        yield Request(url=next_url)
def next_label_from_db(self):
    """Pick a random field label from the database-backed list.

    Returns the formatted search URL for the chosen label, or None when
    no labels remain.  Side effect: remembers the chosen label's id in
    ``self.curr``.
    """
    label = utils.pop_random(self.fields)
    if label:
        quoted = urllib2.quote(label.name.encode('utf-8')).encode('ASCII')
        self.logger.debug('Choosing existing org %s.' % quoted)
        self.curr = label.id
        return self.pattern.format(quoted)
    return None
def choose_next(self):
    """Return the next URL to crawl.

    Flips a coin between the database-backed label list and the container
    of discovered URLs; whichever side is chosen falls back to the other
    when it is exhausted.
    """
    prefer_container = random.random() > 0.5

    if prefer_container:
        if not self.container:
            return self.next_label_from_db()
        url = utils.pop_random(self.container)
        self.logger.debug('Choosing existing url %s.' % url)
        return url

    url = self.next_label_from_db()
    if url:
        return url
    url = utils.pop_random(self.container)
    self.logger.debug('Choosing existing url %s.' % url)
    return url
def __init__(self, *args, **kwargs):
    """Seed the spider with search URLs built from a surname list file.

    Reads the file named by the SEED_NAME_LIST setting (one surname per
    line), URL-quotes each surname into a search URL, and starts the crawl
    at one randomly chosen seed.
    """
    # NOTE(review): super(self.__class__, ...) recurses under further
    # subclassing; should name the concrete class explicitly.
    super(self.__class__, self).__init__(*args, **kwargs)
    settings = get_project_settings()
    with open(settings['SEED_NAME_LIST'], mode='r') as f:
        # BUGFIX: readlines() keeps the trailing '\n', so the old
        # `len(i) > 0` filter never dropped blank lines and the newline
        # was percent-encoded into every URL as %0A.  Strip first.
        self.container = [(self.base_url + '={0}').format(urllib.quote(name))
                          for name in (line.strip() for line in f)
                          if name]
    self.logger.info('Starting with %d surnames.', len(self.container))
    start = utils.pop_random(self.container)
    if start:
        self.start_urls = [start]
def choose_next(self):
    """Return the next author URL to crawl.

    When ``scrape_given`` is set, only the explicitly supplied container
    is used.  Otherwise a coin flip decides between the database and the
    container of discovered URLs, each side falling back to the other
    when empty.
    """
    # do not choose from database if we only want to scrape the given authors
    if self.scrape_given:
        return utils.pop_random(self.container)

    prefer_container = random.random() > 0.5

    if prefer_container:
        if not self.container:
            return self.next_author_from_db()
        url = utils.pop_random(self.container)
        self.logger.debug('Choosing existing url %s.' % url)
        return url

    url = self.next_author_from_db()
    if url:
        return url
    url = utils.pop_random(self.container)
    self.logger.debug('Choosing existing url %s.' % url)
    return url
def __init__(self, *args, **kwargs):
    """Initialise the spider from field-of-study records in the database.

    Loads all fields, picks one at random as the starting point, and
    remembers its id in ``self.curr``.
    """
    # NOTE(review): super(self.__class__, ...) recurses under further
    # subclassing; should name the concrete class explicitly.
    super(self.__class__, self).__init__(*args, **kwargs)
    # fields from the database
    self.fields = self.all_fields()
    # select a field to start at
    if self.fields:
        start_org = utils.pop_random(self.fields)
        # use the spider's logger rather than a bare print statement,
        # consistent with the logging done in the other methods
        self.logger.info('starting with org %s ' % start_org.name)
        enc = urllib2.quote(start_org.name.encode('utf-8')).encode('ASCII')
        self.curr = start_org.id
        self.start_urls = [self.pattern.format(enc)]
def parse(self, response):
    """Extract field-of-study labels from a Google Scholar author listing.

    Yields one FOSItem per ``label:`` fragment found in the author entries,
    queues the reconstructed "next page" URL, and follows a random queued
    URL to randomize the access pattern against Google Scholar.
    """
    # for each author ID on the page, create a new authorItem
    for ids in response.xpath(
            '//*[@id="gsc_ccl"]/div/div/div[@class="gsc_1usr_int"]'):
        full = ids.extract()
        # profile links embed the field of study as "...=label:<name>"
        fos = re.findall(r'=label:([^"]+)"', full)
        if fos:
            for f in fos:
                it = ItemLoader(item=FOSItem(), response=response)
                self.logger.debug(f)
                it.add_value('field_name', f)
                yield it.load_item()

    # generate next url: the pagination button's onclick attribute carries
    # the follow-up URL with '=' and '&' escaped as \x3d and \x26
    new1 = response.xpath(
        '//*[@id="gsc_authors_bottom_pag"]/span/button[2]').extract_first()
    if new1:
        new2 = re.search('mauthors(.*)\'"', new1)
        if new2:
            new_url = str(new2.group(1)).replace('\\x3d', '=').replace(
                '\\x26', '&')
            self.container.append(self.base_url + new_url)

    # proceed with another random url to randomize access pattern to
    # gscholar (renamed from `next`, which shadowed the builtin)
    next_url = utils.pop_random(self.container)
    if next_url:
        yield Request(url=next_url)
def choose_next(self):
    """Pop and return a random URL from the pending container.

    This spider scrapes only the explicitly given authors, so it never
    consults the database for new targets; when the container is empty
    the underlying helper signals exhaustion.
    """
    return utils.pop_random(self.container)