def parse_item_from_redirect(self, response):
    # Callback for responses fetched via urllib2 after a redirect to a file.
    # Assumes urllib2, django.utils.timezone and the DataItem model are
    # imported at module level.
    self.log('Hi, this is an item page!')
    print "Scraping a new page (FROM REDIRECT)"
    print "URL is " + str(response.url)
    # `response` is a urllib2 response here, so info() exposes the HTTP headers
    assert response.info()['Content-Type'] == 'application/pdf'
    new_title = response.info()['Content-Disposition'].split('filename=')[-1]
    new_title = new_title.replace('"', '').replace("'", "")
    url = response.url
    # Save to database only if an identical item is not already stored
    try:
        item_duplicate = DataItem.objects.get(link=url, title=new_title)
    except DataItem.DoesNotExist:
        new_item = DataItem(spider=self.spidermodels,
                            session=self.session,
                            date_scraped=timezone.now(),
                            link=url,
                            referrer=response.url,
                            title=new_title)
        new_item.save()
    return
def parse_item(self, response):
    self.log('Hi, this is an item page!')
    print "Scraping a new page and searching"
    print "URL is " + str(response.url)
    try:
        sel = Selector(response)
    except AttributeError:
        # No usable body to build a Selector from: probably a redirect
        # straight to a file, so re-fetch the URL with urllib2.
        print "Possible redirect"
        response = urllib2.urlopen(response.url)
        if response.info()['Content-Type'] == 'application/pdf':
            print "Redirect encountered. Parse appropriately"
            return self.parse_item_from_redirect(response)
        else:
            print "BAD LINK - useless"
            return []

    # Collect every <a> whose href points at one of the required file types
    sites = []
    for filetype in self.required_filetypes:
        sites += sel.xpath('//a[contains(@href, ".%s")]' % filetype)

    items = []
    for site in sites:
        item = ToolkitItem()
        # Get referrer
        item['referrer'] = response.url
        # Get link, resolving relative hrefs against the current page
        try:
            item['link'] = urljoin(response.url, site.xpath('@href').extract()[0])
        except IndexError:
            item['link'] = urljoin(response.url, site.xpath('@href').extract())
        # Get title
        try:
            item['title'] = site.xpath('text()').extract()[0]
        except IndexError:
            item['title'] = site.xpath('text()').extract()
        item['date'] = timezone.now()
        items.append(item)

        # Save to database only if an identical item is not already stored.
        # Note: the original returned the item here, which would abandon the
        # remaining links after the first new item; fall through instead.
        try:
            item_duplicate = DataItem.objects.get(link=item['link'], title=item['title'])
        except DataItem.DoesNotExist:
            new_item = DataItem(spider=self.spidermodels,
                                session=self.session,
                                date_scraped=item['date'],
                                link=item['link'],
                                referrer=item['referrer'],
                                title=item['title'])
            new_item.save()
    return items
def process_item(self, item, spider):
    # Item pipeline: persist the scraped item only if an identical DataItem
    # is not already stored, then pass the item on unchanged.
    try:
        item_duplicate = DataItem.objects.get(link=item['link'], title=item['title'])
    except DataItem.DoesNotExist:
        new_item = DataItem(date_scraped=item['date'],
                            link=item['link'],
                            referrer=item['referrer'],
                            title=item['title'])
        new_item.save()
    return item
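# For context, a hedged sketch (not part of the original code) of how an item
# pipeline such as process_item() above is typically enabled in a Scrapy
# project's settings.py. The dotted path below is an assumed placeholder for
# wherever the pipeline class actually lives.
#
# ITEM_PIPELINES = {
#     'toolkit.pipelines.DataItemPipeline': 300,  # assumed module path
# }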