def parse_page(self, response):
    '''
    Entry parser: for type == 'post' re-request the single post; for
    type == 'page' parse every post on the page, then recursively request
    the next page via the "more" link (or a "year" link when "more" is
    missing).
    '''
    if self.type == 'post':
        yield scrapy.Request(url=response.url,
                             callback=self.parse_post,
                             priority=10,
                             meta={'index': 1})
    elif self.type == 'page':
        # select all posts
        for post in response.xpath(
                "//div[contains(@data-ft,'top_level_post_id')]"):
            many_features = post.xpath('./@data-ft').get()
            date = []
            date.append(many_features)
            date = parse_date(date, {'lang': self.lang})
            current_date = datetime.strptime(
                date, '%Y-%m-%d %H:%M:%S') if date is not None else date

            if current_date is None:
                # fall back to the human-readable <abbr> timestamp
                date_string = post.xpath('.//abbr/text()').get()
                date = parse_date2([date_string], {'lang': self.lang})
                current_date = datetime(
                    date.year, date.month,
                    date.day) if date is not None else date
            date = str(date)

            if abs(self.count) + 1 > self.max:
                raise CloseSpider(
                    'Reached max num of post: {}. Crawling finished'.format(
                        abs(self.count)))
            self.logger.info('Parsing post n = {}, post_date = {}'.format(
                abs(self.count) + 1, date))

            # returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            self.count -= 1
            yield scrapy.Request(temp_post,
                                 self.parse_post,
                                 priority=self.count,
                                 meta={'index': 1})

        # load following page, try to click on "more";
        # after a few pages the "more" link might disappear — if not present
        # look for the highest year not parsed yet, click once on the year
        # and go back to clicking "more".
        # new_page is different for groups
        if self.group == 1:
            new_page = response.xpath(
                "//div[contains(@id,'stories_container')]/div[2]/a/@href"
            ).extract()
        else:
            new_page = response.xpath(
                "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
            ).extract()  # this is why lang is needed

        if not new_page:
            self.logger.info(
                '[!] "more" link not found, will look for a "year" link')
            # self.k is the year link that we look for
            if response.meta['flag'] == self.k and self.k >= self.year:
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                    self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    # BUGFIX: log the year actually found BEFORE decrementing
                    # self.k (the original decremented first and logged k-1)
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    self.k -= 1
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority=-1000,
                                         meta={'flag': self.k})
                else:
                    # sometimes years are skipped; this handles small gaps
                    while not new_page:
                        self.logger.info(
                            'Link not found for year {}, trying with previous year {}'
                            .format(self.k, self.k - 1))
                        self.k -= 1
                        if self.k < self.year:
                            raise CloseSpider(
                                'Reached date: {}. Crawling finished'.format(
                                    self.date))
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                            self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()

                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority=-1000,
                                         meta={'flag': self.k})
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            if 'flag' in response.meta:
                self.logger.info(
                    'Page scraped, clicking on "more"! new_page = {}'.format(
                        new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     priority=-1000,
                                     meta={'flag': response.meta['flag']})
            else:
                self.logger.info(
                    'First page scraped, clicking on "more"! new_page = {}'
                    .format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     priority=-1000,
                                     meta={'flag': self.k})
def parse_page(self, response):
    '''
    Parse the given page selecting the posts.
    Then ask recursively for another page.
    '''
    # select all posts
    for post in response.xpath(
            "//div[contains(@data-ft,'top_level_post_id')]"):
        many_features = post.xpath('./@data-ft').get()
        date = []
        date.append(many_features)
        date = parse_date(date, {'lang': self.lang})
        current_date = datetime.strptime(
            date, '%Y-%m-%d %H:%M:%S') if date is not None else date

        if current_date is None:
            # fall back to the human-readable <abbr> timestamp
            date_string = post.xpath('.//abbr/text()').get()
            date = parse_date2([date_string], {'lang': self.lang})
            current_date = datetime(
                date.year, date.month,
                date.day) if date is not None else date
        date = str(date)

        # if 'date' argument is reached stop crawling
        # BUGFIX: guard against current_date being None (both parsers
        # failed) — `datetime > None` raises TypeError on Python 3
        if current_date is not None and self.date > current_date:
            raise CloseSpider('Reached date: {}'.format(self.date))

        new = ItemLoader(item=FbcrawlItem(), selector=post)
        if abs(self.count) + 1 > self.max:
            raise CloseSpider(
                'Reached max num of post: {}. Crawling finished'.format(
                    abs(self.count)))
        self.logger.info('Parsing post n = {}, post_date = {}'.format(
            abs(self.count) + 1, date))
        new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
        new.add_value('date', date)
        new.add_xpath('post_id', './@data-ft')
        new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")

        # returns full post-link in a list
        post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
        temp_post = response.urljoin(post[0])
        self.count -= 1
        yield scrapy.Request(temp_post,
                             self.parse_post,
                             priority=self.count,
                             meta={'item': new})

    # load following page, try to click on "more";
    # after a few pages the "more" link might disappear — if not present
    # look for the highest year not parsed yet, click once on the year
    # and go back to clicking "more".
    # new_page is different for groups
    if self.group == 1:
        new_page = response.xpath(
            "//div[contains(@id,'stories_container')]/div[2]/a/@href"
        ).extract()
    else:
        new_page = response.xpath(
            "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
        ).extract()  # this is why lang is needed

    if not new_page:
        self.logger.info(
            '[!] "more" link not found, will look for a "year" link')
        # self.k is the year link that we look for
        if response.meta['flag'] == self.k and self.k >= self.year:
            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                self.k) + "')]/@href"
            new_page = response.xpath(xpath).extract()
            if new_page:
                new_page = response.urljoin(new_page[0])
                # BUGFIX: log the found year BEFORE decrementing self.k
                self.logger.info(
                    'Found a link for year "{}", new_page = {}'.format(
                        self.k, new_page))
                self.k -= 1
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': self.k})
            else:
                # sometimes years are skipped; this handles small gaps
                while not new_page:
                    self.logger.info(
                        'Link not found for year {}, trying with previous year {}'
                        .format(self.k, self.k - 1))
                    self.k -= 1
                    if self.k < self.year:
                        raise CloseSpider(
                            'Reached date: {}. Crawling finished'.format(
                                self.date))
                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                        self.k) + "')]/@href"
                    new_page = response.xpath(xpath).extract()

                self.logger.info(
                    'Found a link for year "{}", new_page = {}'.format(
                        self.k, new_page))
                new_page = response.urljoin(new_page[0])
                self.k -= 1
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': self.k})
        else:
            self.logger.info('Crawling has finished with no errors!')
    else:
        new_page = response.urljoin(new_page[0])
        if 'flag' in response.meta:
            self.logger.info(
                'Page scraped, clicking on "more"! new_page = {}'.format(
                    new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_page,
                                 meta={'flag': response.meta['flag']})
        else:
            self.logger.info(
                'First page scraped, clicking on "more"! new_page = {}'.format(
                    new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_page,
                                 meta={'flag': self.k})
def parse_page(self, response):
    '''
    Parse every <article> post on the page, forwarding post_id and
    current_date to parse_post, then recursively request the next page
    via the "more" link (or a "year" link when "more" is missing).
    '''
    # select all posts
    for post in response.xpath(
            "//article[contains(@data-ft,'top_level_post_id')]"):
        many_features = post.xpath('./@data-ft').get()
        date = []
        date.append(many_features)
        date = parse_date(date, {'lang': self.lang})
        current_date = datetime.strptime(
            date, '%Y-%m-%d %H:%M:%S') if date is not None else date

        if current_date is None:
            # fall back to the human-readable <abbr> timestamp
            date_string = post.xpath('.//abbr/text()').get()
            date = parse_date2([date_string], {'lang': self.lang})
            current_date = datetime(date.year, date.month,
                                    date.day) if date is not None else date
        date = str(date)

        if abs(self.count) + 1 > self.max:
            raise CloseSpider(
                'Reached max num of post: {}. Crawling finished'.format(
                    abs(self.count)))
        self.logger.info('Parsing post n = {}, post_date = {}'.format(
            abs(self.count) + 1, date))

        post_id = id_strip(post.xpath("./@data-ft").extract())

        # returns full post-link in a list
        post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
        temp_post = response.urljoin(post[0])
        self.count -= 1
        yield scrapy.Request(temp_post,
                             self.parse_post,
                             priority=self.count,
                             meta={
                                 'index': 1,
                                 'post_id': post_id,
                                 'current_date': current_date
                             })

    # load following page, try to click on "more";
    # if "more" link not present look for the highest year not parsed yet
    if self.group == 1:
        new_page = response.xpath(
            "//div[contains(@id,'stories_container')]/div[2]/a/@href"
        ).extract()
    else:
        new_page = response.xpath(
            "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
        ).extract()

    if not new_page:
        self.logger.info(
            '[!] "more" link not found, will look for a "year" link')
        # self.k is the year link that we look for
        if response.meta['flag'] == self.k and self.k >= self.year:
            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                self.k) + "')]/@href"
            new_page = response.xpath(xpath).extract()
            if new_page:
                new_page = response.urljoin(new_page[0])
                # BUGFIX: log the found year BEFORE decrementing self.k
                # (the original decremented first and logged k-1)
                self.logger.info(
                    'Found a link for year "{}", new_page = {}'.format(
                        self.k, new_page))
                self.k -= 1
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     priority=-1000,
                                     meta={'flag': self.k})
            else:
                # sometimes years are skipped; this handles small gaps
                while not new_page:
                    self.logger.info(
                        'Link not found for year {}, trying with previous year {}'
                        .format(self.k, self.k - 1))
                    self.k -= 1
                    if self.k < self.year:
                        raise CloseSpider(
                            'Reached date: {}. Crawling finished'.format(
                                self.date))
                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                        self.k) + "')]/@href"
                    new_page = response.xpath(xpath).extract()

                self.logger.info(
                    'Found a link for year "{}", new_page = {}'.format(
                        self.k, new_page))
                new_page = response.urljoin(new_page[0])
                self.k -= 1
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     priority=-1000,
                                     meta={'flag': self.k})
        else:
            self.logger.info('Crawling has finished with no errors!')
    else:
        new_page = response.urljoin(new_page[0])
        if 'flag' in response.meta:
            self.logger.info(
                'Page scraped, clicking on "more"! new_page = {}'.format(
                    new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_page,
                                 priority=-1000,
                                 meta={'flag': response.meta['flag']})
        else:
            self.logger.info(
                'First page scraped, clicking on "more"! new_page = {}'.format(
                    new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_page,
                                 priority=-1000,
                                 meta={'flag': self.k})
def parse_page(self, response):
    '''
    Entry parser: for type == 'post' re-request the single post; for
    type == 'page' parse every post on the page, then recursively
    request the next page.
    '''
    if self.type == 'post':
        yield scrapy.Request(url=response.url,
                             callback=self.parse_post,
                             priority=10,
                             meta={'index': 1})
    elif self.type == 'page':
        # select all posts
        for post in response.xpath(
                "//div[contains(@data-ft,'top_level_post_id')]"):
            many_features = post.xpath('./@data-ft').get()
            date = []
            date.append(many_features)
            date = parse_date(date, {'lang': self.lang})
            current_date = datetime.strptime(
                date, '%Y-%m-%d %H:%M:%S') if date is not None else date

            if current_date is None:
                # fall back to the human-readable <abbr> timestamp
                date_string = post.xpath('.//abbr/text()').get()
                date = parse_date2([date_string], {'lang': self.lang})
                current_date = datetime(
                    date.year, date.month,
                    date.day) if date is not None else date
            date = str(date)

            if abs(self.count) + 1 > self.max:
                raise CloseSpider(
                    'Reached max num of post: {}. Crawling finished'.format(
                        abs(self.count)))
            self.logger.info('Parsing post n = {}, post_date = {}'.format(
                abs(self.count) + 1, date))

            # returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            self.count -= 1
            yield scrapy.Request(temp_post,
                                 self.parse_post,
                                 priority=self.count,
                                 meta={'index': 1})

        # load following page, try to click on "more";
        # after a few pages the "more" link might disappear — if not present
        # look for the highest year not parsed yet, click once on the year
        # and go back to clicking "more".
        # new_page is different for groups
        if self.group == 1:
            new_page = response.xpath(
                "//div[contains(@id,'stories_container')]/div[2]/a/@href"
            ).extract()
        else:
            new_page = response.xpath(
                "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
            ).extract()  # this is why lang is needed

        if not new_page:
            self.logger.info(
                '[!] "more" link not found, will look for a "year" link')
            # self.k is the year link that we look for
            if response.meta['flag'] == self.k and self.k >= self.year:
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                    self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    # BUGFIX: log the found year BEFORE decrementing self.k
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    self.k -= 1
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority=-1000,
                                         meta={'flag': self.k})
                else:
                    # sometimes years are skipped; this handles small gaps
                    while not new_page:
                        self.logger.info(
                            'Link not found for year {}, trying with previous year {}'
                            .format(self.k, self.k - 1))
                        self.k -= 1
                        if self.k < self.year:
                            raise CloseSpider(
                                'Reached date: {}. Crawling finished'.format(
                                    self.date))
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                            self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()

                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         priority=-1000,
                                         meta={'flag': self.k})
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            if 'flag' in response.meta:
                self.logger.info(
                    'Page scraped, clicking on "more"! new_page = {}'.format(
                        new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     priority=-1000,
                                     meta={'flag': response.meta['flag']})
            else:
                self.logger.info(
                    'First page scraped, clicking on "more"! new_page = {}'
                    .format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     priority=-1000,
                                     meta={'flag': self.k})
def parse_page(self, response):
    '''
    Parse the given page for all the posts (comments, reactions,
    content, url), then recursively request another page.
    '''
    # select all posts
    for post in response.xpath(
            "//article[contains(@data-ft,'top_level_post_id')]"):
        many_features = post.xpath('./@data-ft').get()
        date = []
        date.append(many_features)
        date = parse_date(date, {'lang': self.lang})
        current_date = datetime.strptime(
            date, '%Y-%m-%d %H:%M:%S') if date is not None else date

        if current_date is None:
            # fall back to the human-readable <abbr> timestamp
            date_string = post.xpath('.//abbr/text()').get()
            date = parse_date2([date_string], {'lang': self.lang})
            current_date = datetime(date.year, date.month,
                                    date.day) if date is not None else date
        date = str(date)

        # NOTE(review): the 'date' stop condition is deliberately disabled
        # here — the spider keeps scraping past self.date so that all
        # required data up to the date is captured before stopping

        new = ItemLoader(item=FbcrawlItem(), selector=post)
        if abs(self.count) + 1 > self.max:
            raise CloseSpider(
                'Reached max num of post: {}. Crawling finished'.format(
                    abs(self.count)))
        self.logger.info('Parsing post n = {}, post_date = {}'.format(
            abs(self.count) + 1, date))
        new.add_xpath('comments', './footer/div[2]/a[1]/text()')
        new.add_xpath('reactions', "./footer/div[2]/span/a[1]/text()")
        new.add_xpath('content', ".//tr/td/h3/text()")
        new.add_value('date', date)
        new.add_xpath('post_id', './@data-ft')

        story_url = url_strip(
            post.xpath(".//a[contains(@href,'footer')]/@href").extract())
        post_url = response.urljoin(story_url)
        new.add_value('url', post_url)

        # returns full post-link in a list
        post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
        temp_post = response.urljoin(post[0])
        self.count -= 1
        yield scrapy.Request(temp_post,
                             self.parse_post,
                             priority=self.count,
                             meta={
                                 'item': new,
                                 'current_date': current_date
                             })

    # load following page, try to click on "more";
    # if "more" link not present look for the highest year not parsed yet.
    # new_page is different for groups
    if self.group == 1:
        new_page = response.xpath(
            "//div[contains(@id,'stories_container')]/div[2]/a/@href"
        ).extract()
    else:
        new_page = response.xpath(
            "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
        ).extract()

    if not new_page:
        self.logger.info(
            '[!] "more" link not found, will look for a "year" link')
        # self.k is the year link that we look for
        if response.meta['flag'] == self.k and self.k >= self.year:
            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                self.k) + "')]/@href"
            new_page = response.xpath(xpath).extract()
            if new_page:
                new_page = response.urljoin(new_page[0])
                # BUGFIX: log the found year BEFORE decrementing self.k
                self.logger.info(
                    'Found a link for year "{}", new_page = {}'.format(
                        self.k, new_page))
                self.k -= 1
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': self.k})
            else:
                # sometimes years are skipped; this handles small gaps
                while not new_page:
                    self.logger.info(
                        'Link not found for year {}, trying with previous year {}'
                        .format(self.k, self.k - 1))
                    self.k -= 1
                    if self.k < self.year:
                        raise CloseSpider(
                            'Reached date: {}. Crawling finished'.format(
                                self.date))
                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                        self.k) + "')]/@href"
                    new_page = response.xpath(xpath).extract()

                self.logger.info(
                    'Found a link for year "{}", new_page = {}'.format(
                        self.k, new_page))
                new_page = response.urljoin(new_page[0])
                self.k -= 1
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': self.k})
        else:
            self.logger.info('Crawling has finished with no errors!')
    else:
        new_page = response.urljoin(new_page[0])
        if 'flag' in response.meta:
            self.logger.info(
                'Page scraped, clicking on "more"! new_page = {}'.format(
                    new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_page,
                                 meta={'flag': response.meta['flag']})
        else:
            self.logger.info(
                'First page scraped, clicking on "more"! new_page = {}'.format(
                    new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_page,
                                 meta={'flag': self.k})
def parse_page(self, response):
    '''
    Parse the posts on the given page, then follow the "more" link
    (or a "year" link when "more" is missing) to the next page.
    '''
    for post in response.xpath(
            "//article[contains(@data-ft,'top_level_post_id')]"):
        try:
            many_features = post.xpath('./@data-ft').get()
            date = []
            date.append(many_features)
            date = parse_date(date, {'lang': self.lang})
            current_date = datetime.strptime(
                date, '%Y-%m-%d %H:%M:%S') if date is not None else date

            if current_date is None:
                # fall back to the human-readable <abbr> timestamp
                date_string = post.xpath('.//abbr/text()').get()
                date = parse_date2([date_string], {'lang': self.lang})
                current_date = datetime(
                    date.year, date.month,
                    date.day) if date is not None else date
            date = str(date)

            # if 'date' argument is reached stop crawling
            if self.date > current_date:
                raise CloseSpider('Reached date: {}'.format(self.date))

            new = ItemLoader(item=FbcrawlItem(), selector=post)
            if abs(self.count) + 1 > self.max:
                raise CloseSpider(
                    'Reached max num of post: {}. Crawling finished'.format(
                        abs(self.count)))
            self.logger.info('Parsing post n = {}, post_date = {}'.format(
                abs(self.count) + 1, date))
            new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
            new.add_value('date', date)
            new.add_xpath('post_id', './@data-ft')
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")

            # returns full post-link in a list
            post = post.xpath(
                ".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            self.count -= 1
            yield scrapy.Request(temp_post,
                                 self.parse_post,
                                 priority=self.count,
                                 meta={'item': new})
        except CloseSpider:
            # BUGFIX: the original bare `except: continue` also swallowed
            # CloseSpider, so the date/max-post stop conditions raised above
            # could never actually terminate the crawl
            raise
        except Exception:
            # skip a single malformed post without killing the whole page
            self.logger.exception('Failed to parse a post, skipping it')
            continue

    # load following page, try to click on "more";
    # after a few pages the "more" link might disappear — if not present
    # look for the highest year not parsed yet, click once on the year
    # and go back to clicking "more".
    # new_page is different for groups
    if self.group == 1:
        new_page = response.xpath(
            "//div[contains(@id,'stories_container')]/div[2]/a/@href"
        ).extract()
        # debug print replaced with lazy logger call
        self.logger.debug('group "more" candidates: %s', new_page)
    else:
        new_page = response.xpath(
            '//*[@id="structured_composer_async_container"]/div[2]/a/@href'
        ).extract()

    # a link containing 'Recent' means we hit the "see recent stories"
    # anchor instead of a real "more" link
    if not new_page or 'Recent' in str(
            response.xpath(
                '//*[@id="structured_composer_async_container"]/div[2]/a').
            extract()):
        self.logger.info(
            '[!] "more" link not found, will look for a "year" link')
        # self.k is the year link that we look for
        if response.meta['flag'] == self.k and self.k >= self.year:
            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                self.k) + "')]/@href"
            new_page = response.xpath(xpath).extract()
            if new_page:
                new_page = response.urljoin(new_page[0])
                # BUGFIX: log the found year BEFORE decrementing self.k
                self.logger.info(
                    'Found a link for year "{}", new_page = {}'.format(
                        self.k, new_page))
                self.k -= 1
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': self.k})
            else:
                # sometimes years are skipped; this handles small gaps
                while not new_page:
                    self.logger.info(
                        'Link not found for year {}, trying with previous year {}'
                        .format(self.k, self.k - 1))
                    self.k -= 1
                    if self.k < self.year:
                        raise CloseSpider(
                            'Reached date: {}. Crawling finished'.format(
                                self.date))
                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                        self.k) + "')]/@href"
                    new_page = response.xpath(xpath).extract()

                self.logger.info(
                    'Found a link for year "{}", new_page = {}'.format(
                        self.k, new_page))
                new_page = response.urljoin(new_page[0])
                self.k -= 1
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': self.k})
        else:
            self.logger.info('Crawling has finished with no errors!')
    else:
        new_page = response.urljoin(new_page[0])
        if 'flag' in response.meta:
            self.logger.info(
                'Page scraped, clicking on "more"! new_page = {}'.format(
                    new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_page,
                                 meta={'flag': response.meta['flag']})
        else:
            self.logger.info(
                'First page scraped, clicking on "more"! new_page = {}'.format(
                    new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_page,
                                 meta={'flag': self.k})
def parse_page(self, response):
    '''
    Parse the posts on a page or group, tolerating a few outdated posts
    before stopping (Facebook sometimes shows pinned/out-of-order posts),
    then recursively request the next page.
    '''
    # allowed maximum number of outdated posts in a page
    maximum_outdated_count = 3
    outdated_count = 0

    # groups and pages use different post containers
    if response.meta['group'] == 1:
        posts = response.xpath(
            "//div[@id='m_group_stories_container']//div[contains(@data-ft,'mf_story_key')]"
        )
    else:
        posts = response.xpath(
            "//div[contains(@data-ft,'top_level_post_id')]")

    for post in posts:
        many_features = post.xpath('./@data-ft').get()
        date = []
        date.append(many_features)
        date = parse_date(date, {'lang': self.lang})
        current_date = datetime.strptime(
            date, '%Y-%m-%d %H:%M:%S') if date is not None else date

        if current_date is None:
            # fall back to the human-readable <abbr> timestamp
            date_string = post.xpath('.//abbr/text()').get()
            if date_string is None:
                continue
            date = parse_date2([date_string], {'lang': self.lang})
            current_date = datetime(date.year, date.month,
                                    date.day) if date is not None else date
        date = str(date)

        # BUGFIX: parse_date2 may also fail, leaving current_date None;
        # the original only guarded a missing date *string*, and the
        # comparisons below raise TypeError on `datetime > None`
        if current_date is None:
            continue

        # if 'date' argument is reached stop crawling
        if self.date > current_date:
            outdated_count += 1
            if outdated_count > maximum_outdated_count:
                self.logger.info(
                    'Reached date: {} for crawling page {}. Crawling finished'
                    .format(self.date, response.url))
                return
            else:
                continue

        # if 'skipto_date' argument is not reached, skip crawling
        if self.skipto_date < current_date:
            continue

        # returns full post-link in a list
        post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
        temp_post = response.urljoin(post[0])
        yield scrapy.Request(temp_post,
                             self.parse_post,
                             meta={
                                 'index': 1,
                                 'page_url': response.url
                             })

    # load following page, try to click on "more";
    # after a few pages the "more" link might disappear — if not present
    # look for the highest year not parsed yet, click once on the year
    # and go back to clicking "more".
    # new_page is different for groups
    if response.meta['group'] == 1:
        new_page = response.xpath(
            "//div[contains(@id,'stories_container')]/div[2]/a/@href"
        ).extract()
    else:
        new_page = response.xpath(
            "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
        ).extract()  # this is why lang is needed

    if not new_page:
        self.logger.info(
            '[!] "more" link not found, will look for a "year" link')
        # self.k is the year link that we look for
        if response.meta['flag'] == self.k and self.k >= self.year:
            xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                self.k) + "')]/@href"
            new_page = response.xpath(xpath).extract()
            if new_page:
                new_page = response.urljoin(new_page[0])
                # BUGFIX: log the found year BEFORE decrementing self.k
                self.logger.info(
                    'Found a link for year "{}", new_page = {}'.format(
                        self.k, new_page))
                self.k -= 1
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     priority=-1000,
                                     meta={
                                         'flag': self.k,
                                         'group': response.meta['group']
                                     })
            else:
                # sometimes years are skipped; this handles small gaps
                while not new_page:
                    self.logger.info(
                        'Link not found for year {}, trying with previous year {}'
                        .format(self.k, self.k - 1))
                    self.k -= 1
                    if self.k < self.year:
                        self.logger.info(
                            'Reached date: {} for crawling page {}. Crawling finished'
                            .format(self.date, response.url))
                        return
                    xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                        self.k) + "')]/@href"
                    new_page = response.xpath(xpath).extract()

                self.logger.info(
                    'Found a link for year "{}", new_page = {}'.format(
                        self.k, new_page))
                new_page = response.urljoin(new_page[0])
                self.k -= 1
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     priority=-1000,
                                     meta={
                                         'flag': self.k,
                                         'group': response.meta['group']
                                     })
        else:
            self.logger.info('Crawling has finished with no errors!')
    else:
        new_page = response.urljoin(new_page[0])
        if 'flag' in response.meta:
            self.logger.info(
                'Page scraped, clicking on "more"! new_page = {}'.format(
                    new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_page,
                                 priority=-1000,
                                 meta={
                                     'flag': response.meta['flag'],
                                     'group': response.meta['group']
                                 })
        else:
            self.logger.info(
                'First page scraped, clicking on "more"! new_page = {}'.format(
                    new_page))
            yield scrapy.Request(new_page,
                                 callback=self.parse_page,
                                 priority=-1000,
                                 meta={
                                     'flag': self.k,
                                     'group': response.meta['group']
                                 })