Example #1
0
 def parse_item_from_redirect(self, response):
     """Persist a PDF document reached through an HTTP redirect.

     `response` is a urllib2 response object (not a Scrapy response),
     so metadata is read from the response.info() headers.  A DataItem
     row is created unless one with the same (link, title) already
     exists.  Always returns None.

     Raises ValueError if the response is not a PDF.
     """
     self.log('Hi, this is an item page!')
     # print() with a single argument behaves identically under Py2 and Py3.
     print("Scraping a new page (FROM REDIRECT)")
     print("URL is " + str(response.url))

     # Explicit check instead of `assert`: asserts are stripped when
     # Python runs with -O, which would let non-PDFs slip through.
     if response.info()['Content-Type'] != 'application/pdf':
         raise ValueError('Expected application/pdf content, got %r'
                          % response.info()['Content-Type'])

     # Title = filename from the Content-Disposition header, with any
     # surrounding quote characters removed.
     new_title = response.info()['Content-Disposition'].split('filename=')[-1]
     new_title = new_title.replace('"', '').replace("'", "")

     url = response.url

     # Save to database only if this (link, title) pair is new.
     # NOTE(review): referrer is set to the same URL as link — the
     # original referring page is unavailable after the redirect.
     try:
         DataItem.objects.get(link=url, title=new_title)
     except DataItem.DoesNotExist:
         DataItem(spider=self.spidermodels,
                  session=self.session,
                  date_scraped=timezone.now(),
                  link=url,
                  referrer=response.url,
                  title=new_title).save()
             
Example #2
0
    def parse_item(self, response):
        """Extract links of the required file types from a page.

        Builds one ToolkitItem per matching <a> element, persists any
        item not already recorded in the database, and returns the full
        list of items.  If the response turns out to be a redirect that
        resolved to a PDF, delegates to parse_item_from_redirect().
        """
        self.log('Hi, this is an item page!')
        # print() with a single argument behaves the same on Py2 and Py3.
        print("Scraping a new page and searching")
        print("URL is " + str(response.url))

        try:
            sel = Selector(response)
        except AttributeError:
            # Not a regular HTML response: re-fetch and inspect headers.
            print("   Possible redirect")
            response = urllib2.urlopen(response.url)
            if response.info()['Content-Type'] == 'application/pdf':
                print("Redirect encountered. Parse appropriately")
                return self.parse_item_from_redirect(response)
            else:
                print("BAD LINK - useless")
                return []

        # Collect anchors whose href mentions one of the wanted extensions.
        sites = []
        for filetype in self.required_filetypes:
            sites += sel.xpath('//a[contains(@href, ".%s")]' % filetype)

        items = []
        for site in sites:
            item = ToolkitItem()
            item['referrer'] = response.url

            # extract() may yield an empty list when @href is missing;
            # fall back to the raw (empty) extraction like the original.
            href = site.xpath('@href').extract()
            item['link'] = urljoin(response.url, href[0] if href else href)

            # Same empty-list fallback for the anchor text.
            text = site.xpath('text()').extract()
            item['title'] = text[0] if text else text

            item['date'] = timezone.now()
            items.append(item)

            # Persist only if this (link, title) pair is new.
            # BUG FIX: the original `return item` here exited after the
            # first newly-saved item, silently discarding every remaining
            # link on the page; we now process the whole page and return
            # the accumulated list below.
            try:
                DataItem.objects.get(link=item['link'], title=item['title'])
            except DataItem.DoesNotExist:
                DataItem(spider=self.spidermodels,
                         session=self.session,
                         date_scraped=item['date'],
                         link=item['link'],
                         referrer=item['referrer'],
                         title=item['title']).save()

        return items
Example #3
0
    def parse_item_from_redirect(self, response):
        """Store a PDF that was reached via an HTTP redirect.

        Here `response` is a urllib2 response, so header access goes
        through response.info().  Inserts a DataItem row unless an
        identical (link, title) pair already exists; returns None.

        Raises ValueError when the response is not a PDF.
        """
        self.log('Hi, this is an item page!')
        # Parenthesized print works identically under Python 2 and 3.
        print("Scraping a new page (FROM REDIRECT)")
        print("URL is " + str(response.url))

        # Validate explicitly rather than with `assert`, which is
        # removed entirely when the interpreter runs with -O.
        if response.info()['Content-Type'] != 'application/pdf':
            raise ValueError('Expected application/pdf content, got %r'
                             % response.info()['Content-Type'])

        # Use the Content-Disposition filename (quotes stripped) as title.
        new_title = response.info()['Content-Disposition'].split(
            'filename=')[-1]
        new_title = new_title.replace('"', '').replace("'", "")

        url = response.url

        # Save to database if not already present.
        # NOTE(review): link and referrer are identical here; the true
        # referring page is lost once the redirect has been followed.
        try:
            DataItem.objects.get(link=url, title=new_title)
        except DataItem.DoesNotExist:
            DataItem(spider=self.spidermodels,
                     session=self.session,
                     date_scraped=timezone.now(),
                     link=url,
                     referrer=response.url,
                     title=new_title).save()
Example #4
0
 def process_item(self, item, spider):
     """Drop duplicate items; store and pass through new ones.

     An existing DataItem with the same (link, title) marks the item as
     a duplicate: it falls through and the method returns None
     implicitly.  Otherwise the item is saved and handed to the next
     pipeline stage.
     """
     try:
         DataItem.objects.get(link=item['link'], title=item['title'])
     except DataItem.DoesNotExist:
         record = DataItem(date_scraped=item['date'],
                           link=item['link'],
                           referrer=item['referrer'],
                           title=item['title'])
         record.save()
         return item
Example #5
0
    def process_item(self, item, spider):
        """De-duplicate scraped items against the DataItem table.

        A (link, title) pair already present in the database means the
        item is a duplicate and is swallowed (implicit None return).
        A new pair is persisted and the item is passed on down the
        pipeline.
        """
        key = dict(link=item['link'], title=item['title'])
        try:
            DataItem.objects.get(**key)
        except DataItem.DoesNotExist:
            DataItem(date_scraped=item['date'],
                     referrer=item['referrer'],
                     **key).save()
            return item
Example #6
0
    def parse_item(self, response):
        """Scrape links of the required file types from one page.

        Creates a ToolkitItem for each matching <a> tag, saves every
        item that is not yet in the database, and returns the complete
        item list.  A redirect that resolved to a PDF is handed off to
        parse_item_from_redirect().
        """
        self.log('Hi, this is an item page!')
        # Single-argument print() is valid and identical on Py2 and Py3.
        print("Scraping a new page and searching")
        print("URL is " + str(response.url))

        try:
            sel = Selector(response)
        except AttributeError:
            # Not an HTML response Scrapy can wrap: re-fetch directly.
            print("   Possible redirect")
            response = urllib2.urlopen(response.url)
            if response.info()['Content-Type'] == 'application/pdf':
                print("Redirect encountered. Parse appropriately")
                return self.parse_item_from_redirect(response)
            else:
                print("BAD LINK - useless")
                return []

        # Gather anchors whose href contains one of the wanted extensions.
        sites = []
        for filetype in self.required_filetypes:
            sites += sel.xpath('//a[contains(@href, ".%s")]' % filetype)

        items = []
        for site in sites:
            item = ToolkitItem()
            item['referrer'] = response.url

            # extract() returns an empty list when @href is absent;
            # keep the original's empty-extraction fallback.
            href = site.xpath('@href').extract()
            item['link'] = urljoin(response.url, href[0] if href else href)

            # Anchor text, with the same empty-list fallback.
            text = site.xpath('text()').extract()
            item['title'] = text[0] if text else text

            item['date'] = timezone.now()
            items.append(item)

            # Save when this (link, title) pair is new.
            # BUG FIX: the original returned `item` here, so scraping
            # stopped at the first fresh item and the rest of the page's
            # links were lost; the loop now runs to completion and the
            # full list is returned below.
            try:
                DataItem.objects.get(link=item['link'], title=item['title'])
            except DataItem.DoesNotExist:
                DataItem(spider=self.spidermodels,
                         session=self.session,
                         date_scraped=item['date'],
                         link=item['link'],
                         referrer=item['referrer'],
                         title=item['title']).save()

        return items