Example no. 1
    def process_item(self, item, spider):
        try:
            item['news_website'] = spider.ref_object
            item['search_term'] = spider.search_terms  # added so we can see which search term produced each item
            
            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt
            item.save()


#            p1 = Den.objects.get(title='baby')
#            a1 = Article(search_term=spider.search_terms)
#            a1.dens.add(p1)


#            busi = item.save(commit=False)
#            p1 = Den.objects.get(title='pretty')
#            busi.dens.add(p1)
#            p1 = Den(title='pretty')
#            a1 = Article(search_term=spider.search_terms)
#            a1.dens.add(p1)
#            p1 = Den(title='pretty')
#            a1 = Article(search_term=spider.search_terms)
#            a1.dens.add(p1)

            spider.action_successful = True
            spider.log("Item saved.", log.INFO)
                
        except IntegrityError as e:
            spider.log(str(e), log.ERROR)
            raise DropItem("Missing attribute.")
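The commented-out lines above hint at linking the saved article to one or more Den objects through a many-to-many field. A minimal sketch of that idea, assuming Article.dens is an M2M field and that item.save() returns the saved model instance (as scrapy-djangoitem's DjangoItem.save() does), could look like this:

    # Hypothetical follow-up to item.save() inside the try block
    article = item.save()                             # returns the saved Article instance
    den, _ = Den.objects.get_or_create(title='baby')  # look up or create the Den
    article.dens.add(den)                             # attach it via the M2M relation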
 def test_double_standard_id_field(self):
     checker_rt = SchedulerRuntime()
     checker_rt.save()
     event = Event(title=u'Event 1', event_website=self.event_website,
         description=u'Event 1 description',
         url=u'http://localhost:8010/static/site_generic/event5.html',
         checker_runtime=checker_rt)
     event.save()
     event = Event(title=u'Event 2', event_website=self.event_website,
         description=u'Event 1 description',
         url=u'http://localhost:8010/static/site_generic/event6.html',
         checker_runtime=checker_rt)
     event.save()
     event = Event(title=u'Event 1', event_website=self.event_website,
         description=u'Event 2 description',
         url=u'http://localhost:8010/static/site_generic/event7.html',
         checker_runtime=checker_rt)
     event.save()
     self.soa_url.id_field = False
     self.soa_url.save()
     self.soa_title.id_field = True
     self.soa_title.save()
     self.soa_desc.id_field = True
     self.soa_desc.save()
     self.run_event_spider(1)
     
     self.assertEqual(len(Event.objects.all()), 6)
     self.assertEqual(Event.objects.filter(description='Event 1 description').count(), 2)
Example no. 3
    def process_item(self, item, spider):
        if spider.conf['DO_ACTION']:  # Necessary since DDS v.0.9+
            try:
                item['races_website'] = spider.ref_object

                checker_rt = SchedulerRuntime(runtime_type='C')
                checker_rt.save()
                item['checker_runtime'] = checker_rt
                item['date'] = self.process_date(item['date'], spider)
                item['city'] = self.process_city(item['city'], spider)
                item['province'] = self.process_province(item['province'], spider)
                item.save()
                spider.action_successful = True
                spider.log("Item saved.", logging.INFO)

            except IntegrityError as e:
                spider.log(str(e), logging.ERROR)
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")
        else:
            if not item.is_valid():
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")

        return item
Example no. 4
    def process_item(self, item, spider):
        try:

            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt
            item['source'] = spider.ref_object

            try:
                item_model = item_to_model(item)
            except TypeError:
                return item
            
            model, created = get_or_create(item_model)
            
            update_model(model, item_model)
            
            if created:
                spider.log('==' + model.name + '== created.', log.INFO)
                
            else:
                spider.log('==' + model.name + '== updated.', log.INFO)

            spider.action_successful = True

        except IntegrityError as e:
            spider.log(str(e), log.ERROR)
            raise DropItem("Missing attribute.")
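This pipeline relies on item_to_model, get_or_create and update_model helpers that are not shown here. A minimal sketch of such helpers, assuming the item is a DjangoItem and that url is the field used for the uniqueness lookup (both assumptions, not part of the original code), might look like this:

    from django.forms.models import model_to_dict

    def item_to_model(item):
        # Assumes a DjangoItem; the bound model instance is exposed as item.instance
        model_class = getattr(item, 'django_model', None)
        if not model_class:
            raise TypeError("Item is not a DjangoItem")
        return item.instance

    def get_or_create(model):
        model_class = type(model)
        created = False
        try:
            # Assumes `url` uniquely identifies a record
            obj = model_class.objects.get(url=model.url)
        except model_class.DoesNotExist:
            created = True
            obj = model  # not saved to the DB yet
        return (obj, created)

    def update_model(destination, source, commit=True):
        # Copy field values from source onto destination, keeping destination's pk
        pk = destination.pk
        for key, value in model_to_dict(source).items():
            setattr(destination, key, value)
        destination.pk = pk
        if commit:
            destination.save()
        return destination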
    def process_item(self, item, spider):
        if spider.conf['DO_ACTION']:  # Necessary since DDS v.0.9+
            try:
                item['source'] = spider.ref_object

                checker_rt = SchedulerRuntime(runtime_type='C')
                checker_rt.save()
                item['checker_runtime'] = checker_rt

                item.save()
                spider.action_successful = True
                dds_id_str = str(item._dds_item_page) + '-' + str(
                    item._dds_item_id)
                spider.struct_log(
                    "{cs}Item {id} saved to Django DB.{ce}".format(
                        id=dds_id_str,
                        cs=spider.bcolors['OK'],
                        ce=spider.bcolors['ENDC']))

            except IntegrityError as e:
                spider.log(str(e), logging.ERROR)
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")
        else:
            if not item.is_valid():
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")

        return item
Example no. 6
 def process_item(self, item, spider):
     
     if isinstance(spider,ProductSpider):
         #spider.log("spider: " + spider.name)
         spider.log("item time is: " + item['time'])
         item['time']=process_date(item['time'])
         # to do:
         # drop item if price is null
         # drop item if time > no            
     try:
         #if (item == ArticleItem):
             #item['news_website'] = spider.ref_object
         #else:
         item['source'] = spider.ref_object
         
         checker_rt = SchedulerRuntime(runtime_type='C')
         checker_rt.save()
         item['checker_runtime'] = checker_rt
         
         item.save()
         spider.action_successful = True
         spider.log("Item saved.", log.INFO)           
             
     except IntegrityError as e:
         spider.log(str(e), log.ERROR)
         raise DropItem("Missing attribute.")
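The to-do notes in this pipeline (dropping items with a missing price or an unwanted time) are left unimplemented. A minimal sketch for the price check, assuming the item exposes a 'price' field (an assumption, not shown in the original), could sit right after the date processing:

    if isinstance(spider, ProductSpider):
        item['time'] = process_date(item['time'])
        # Drop the item early if no price was scraped (hypothetical 'price' field)
        if not item.get('price'):
            raise DropItem("Price is missing.")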
Example no. 7
    def process_item(self, item, spider):

        if spider.conf['DO_ACTION']:

            try:
                item['website'] = spider.ref_object

                checker_rt = SchedulerRuntime(runtime_type='C')
                checker_rt.save()
                item['checker_runtime'] = checker_rt

                item.save()
                spider.action_successful = True
                spider.log("Item saved.", logging.INFO)

            except IntegrityError as e:
                spider.log(str(e), logging.ERROR)
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")
        else:
            if not item.is_valid():
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")

        return item
    def process_item(self, item, spider):
        if spider.conf['DO_ACTION']:  # Necessary since DDS v.0.9+
            try:
                item['news_website'] = spider.ref_object
                if 'description' in item:
                    item['description'] = convert_Html_to_text_and_make_sumarization(item['description'])
                if 'image' in item:
                    item['image'] = change_image_size(item['image'],spider.ref_object.name)


                checker_rt = SchedulerRuntime(runtime_type='C')
                checker_rt.save()
                item['checker_runtime'] = checker_rt

                item.save()
                spider.action_successful = True
                dds_id_str = str(item._dds_item_page) + '-' + str(item._dds_item_id)
                spider.struct_log("{cs}Item {id} saved to Django DB.{ce}".format(
                    id=dds_id_str,
                    cs=spider.bcolors['OK'],
                    ce=spider.bcolors['ENDC']))

            except IntegrityError as e:
                spider.log(str(e), logging.ERROR)
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")
        else:
            if not item.is_valid():
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")

        return item
 def test_double(self):
     checker_rt = SchedulerRuntime()
     checker_rt.save()
     event = Event(title=u'Event 1', url=u'http://localhost:8010/static/site_generic/event1.html',
         checker_runtime=checker_rt)
     event.save()
     self.run_event_spider(1)
     
     self.assertEqual(len(Event.objects.all()), 4)
     self.assertEqual(len(Event.objects.filter(title='Event 1')), 1)
def add_listing_checker(listing):
  listing_source_cfg = ListingSourceScraperConfig.objects.get(pk=listing.listing_source_id)

  checker_rt = SchedulerRuntime(runtime_type='C', next_action_time=timezone.now() + timedelta(days=1))
  checker_rt.save()

  checker_config = ListingCheckerConfig(listing=listing, checker_runtime=checker_rt, scraper=listing_source_cfg.scraper)
  checker_config.save()

  return checker_config
 def test_detail_page_url_id_field(self):
     checker_rt = SchedulerRuntime()
     checker_rt.save()
     event = Event(title=u'Event 1', event_website=self.event_website, 
         url=u'http://localhost:8010/static/site_generic/event5.html',
         checker_runtime=checker_rt)
     event.save()
     self.run_event_spider(1)
     
     self.assertEqual(len(Event.objects.all()), 5)
     self.assertEqual(Event.objects.filter(title='Event 1').count(), 2)
 def test_double(self):
     checker_rt = SchedulerRuntime()
     checker_rt.save()
     event = Event(title=u'Event 1', event_website=self.event_website, 
         url=u'http://localhost:8010/static/site_generic/event1.html',
         checker_runtime=checker_rt)
     event.save()
     self.run_event_spider(1)
     
     self.assertEqual(len(Event.objects.all()), 4)
     self.assertEqual(len(Event.objects.filter(title='Event 1')), 1)
 def setUp(self):
     super(CheckerRunTest, self).setUp()
     
     self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
     self.scraper.checker_x_path_result = u'Event was deleted!'
     self.scraper.save()
     
     scheduler_rt = SchedulerRuntime()
     scheduler_rt.save()
     
     self.event = Event(title='Event 1', event_website=self.event_website,
         description='Event 1 description', 
         url='http://localhost:8010/static/site_for_checker/event1.html',
         checker_runtime=scheduler_rt)
     self.event.save()
Example no. 14
    def process_item(self, item, spider):
        try:
            item['news_website'] = spider.ref_object

            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt

            item.save()
            spider.action_successful = True
            spider.log("Item saved.", log.INFO)

        except IntegrityError as e:
            spider.log(str(e), log.ERROR)
            raise DropItem("Missing attribute.")
Example no. 16
	def process_item(self, item, spider):
		if spider.conf['DO_ACTION']:
			try:
				item['job_website'] = spider.ref_object

				checker_rt = SchedulerRuntime(runtime_type='C')
				checker_rt.save()
				item['checker_runtime'] = checker_rt
				item.save()
				spider.action_successful = True
				spider.log("Items saved in the DB", logging.INFO)

			except IntegrityError as e:
				spider.log(str(e), logging.ERROR)
				raise DropItem("missing attrib")
 def test_standard_update_field_update(self):
     checker_rt = SchedulerRuntime()
     checker_rt.save()
     event = Event(title=u'Event 1 - Old Title', event_website=self.event_website, 
         url=u'http://localhost:8010/static/site_generic/event1.html',
         checker_runtime=checker_rt)
     event.save()
     self.soa_title.attr_type = 'T'
     self.soa_title.save()
     
     self.run_event_spider(1)
     
     event_updated = Event.objects.get(pk=event.id)
     self.assertEqual(event_updated.title, 'Event 1')
     self.assertEqual(len(Event.objects.filter(title='Event 1 - Old Title')), 0)
 def extraSetUpHTMLChecker(self):
     self.scraper.checker_type = 'X'
     self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
     self.scraper.checker_x_path_result = u'Event not found!'
     self.scraper.checker_ref_url = u'http://localhost:8010/static/site_with_json_content_type/event_not_found.html'
     self.scraper.save()
     
     scheduler_rt = SchedulerRuntime()
     scheduler_rt.save()
     
     self.event = Event(title='Event 1', event_website=self.event_website,
         description='Event 1 description', 
         url='http://localhost:8010/static/site_with_json_content_type/event_not_found.html',
         checker_runtime=scheduler_rt)
     self.event.save()
Example no. 19
 def test_single_standard_id_field(self):
     checker_rt = SchedulerRuntime()
     checker_rt.save()
     event = Event(title='Event 1', event_website=self.event_website, 
         url='http://localhost:8010/static/site_generic/event5.html',
         checker_runtime=checker_rt)
     event.save()
     self.soa_url.id_field = False
     self.soa_url.save()
     self.soa_title.id_field = True
     self.soa_title.save()
     self.run_event_spider(1)
     
     self.assertEqual(len(Event.objects.all()), 4)
     self.assertEqual(Event.objects.filter(title='Event 1').count(), 1)
    def setUpScraperJSChecker(self, path):
        super(ScraperJSRunTest, self).setUp()

        self.scraper.checker_type = 'X'
        self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
        self.scraper.checker_ref_url = u'%ssite_with_js/event_not_found.html' % path
        self.scraper.save()
        
        scheduler_rt = SchedulerRuntime()
        scheduler_rt.save()
        
        self.event = Event(title='Event 1', event_website=self.event_website,
            description='Event 1 description', 
            url='%ssite_with_js/event_not_found.html' % path,
            checker_runtime=scheduler_rt)
        self.event.save()
Example no. 21
    def process_item(self, item, spider):
        try:
            # This name must match Article's source
            item["source"] = spider.ref_object

            checker_rt = SchedulerRuntime(runtime_type="C")
            checker_rt.save()
            item["checker_runtime"] = checker_rt

            item.save()
            spider.action_successful = True
            spider.log("Item saved.", log.INFO)

        except IntegrityError as e:
            spider.log(str(e), log.ERROR)
            raise DropItem("Missing attribute.")
 def extraSetUpJSONChecker(self):
     self.scraper.detail_page_content_type = 'J'
     self.scraper.checker_type = 'X'
     self.scraper.checker_x_path = u'event_not_found'
     self.scraper.checker_x_path_result = u'Event not found!'
     self.scraper.checker_ref_url = u'http://localhost:8010/static/site_with_json_content_type/event_not_found.json'
     self.scraper.save()
     
     scheduler_rt = SchedulerRuntime()
     scheduler_rt.save()
     
     self.event = Event(title='Event 1', event_website=self.event_website,
         description='Event 1 description', 
         url='http://localhost:8010/static/site_with_json_content_type/event_not_found.json',
         checker_runtime=scheduler_rt)
     self.event.save()
Example no. 23
    def setUpScraperJSChecker(self, path):
        super(ScraperJSRunTest, self).setUp()

        self.scraper.checker_type = 'X'
        self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
        self.scraper.checker_ref_url = u'%ssite_with_js/event_not_found.html' % path
        self.scraper.save()

        scheduler_rt = SchedulerRuntime()
        scheduler_rt.save()

        self.event = Event(title='Event 1',
                           event_website=self.event_website,
                           description='Event 1 description',
                           url='%ssite_with_js/event_not_found.html' % path,
                           checker_runtime=scheduler_rt)
        self.event.save()
Example no. 24
    def extraSetUpHTMLChecker(self):
        self.scraper.checker_type = 'X'
        self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
        self.scraper.checker_x_path_result = u'Event not found!'
        self.scraper.checker_ref_url = u'http://localhost:8010/static/site_with_json_content_type/event_not_found.html'
        self.scraper.save()

        scheduler_rt = SchedulerRuntime()
        scheduler_rt.save()

        self.event = Event(
            title='Event 1',
            event_website=self.event_website,
            description='Event 1 description',
            url='http://localhost:8010/static/site_with_json_content_type/event_not_found.html',
            checker_runtime=scheduler_rt)
        self.event.save()
Example no. 25
    def process_item(self, item, spider):
        if spider.conf['DO_ACTION']:  # Necessary since DDS v.0.9+
            try:
                print('HJ start saving')
                item['post_site'] = spider.ref_object

                checker_rt = SchedulerRuntime(runtime_type='C')
                checker_rt.save()
                item['checker_runtime'] = checker_rt

                print(item['foo'])

                if len(item['foo']) != 0:
                    selector = Selector(text=item['foo'])
                    options = selector.xpath('//option')
                    option_items = []
                    for option in options:
                        option_items.append(option.xpath("text()").extract())
                    print(option_items)
                    option_items.pop(0)
                    item['foo'] = option_items
                    # item['foo'] = json.dumps(option_items)

                print(item['foo'])
                item.save()
                spider.action_successful = True
                dds_id_str = str(item._dds_item_page) + '-' + str(item._dds_item_id)
                spider.struct_log("{cs}Item {id} saved to Django DB.{ce}".format(
                    id=dds_id_str,
                    cs=spider.bcolors['OK'],
                    ce=spider.bcolors['ENDC']))

            except IntegrityError as e:
                print('HJ integrity error')
                spider.log(str(e), logging.ERROR)
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")
        else:
            print('HJ not do_action')
            if not item.is_valid():
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")

        return item
Example no. 26
    def extraSetUpJSONChecker(self):
        self.scraper.detail_page_content_type = 'J'
        self.scraper.checker_type = 'X'
        self.scraper.checker_x_path = u'event_not_found'
        self.scraper.checker_x_path_result = u'Event not found!'
        self.scraper.checker_ref_url = u'http://localhost:8010/static/site_with_json_content_type/event_not_found.json'
        self.scraper.save()

        scheduler_rt = SchedulerRuntime()
        scheduler_rt.save()

        self.event = Event(
            title='Event 1',
            event_website=self.event_website,
            description='Event 1 description',
            url='http://localhost:8010/static/site_with_json_content_type/event_not_found.json',
            checker_runtime=scheduler_rt)
        self.event.save()
Example no. 27
    def setUp(self):
        super(CheckerRunTest, self).setUp()

        self.scraper.checker_type = 'X'
        self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
        self.scraper.checker_x_path_result = u'Event was deleted!'
        self.scraper.checker_ref_url = u'http://localhost:8010/static/site_for_checker/event_not_found.html'
        self.scraper.save()

        scheduler_rt = SchedulerRuntime()
        scheduler_rt.save()

        self.event = Event(
            title='Event 1',
            event_website=self.event_website,
            description='Event 1 description',
            url='http://localhost:8010/static/site_for_checker/event1.html',
            checker_runtime=scheduler_rt)
        self.event.save()
    def setUp(self):
        super(CheckerRunTest, self).setUp()

        self.scraper.checker_type = "X"
        self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
        self.scraper.checker_x_path_result = u"Event was deleted!"
        self.scraper.checker_ref_url = u"http://localhost:8010/static/site_for_checker/event_not_found.html"
        self.scraper.save()

        scheduler_rt = SchedulerRuntime()
        scheduler_rt.save()

        self.event = Event(
            title="Event 1",
            event_website=self.event_website,
            description="Event 1 description",
            url="http://localhost:8010/static/site_for_checker/event1.html",
            checker_runtime=scheduler_rt,
        )
        self.event.save()
Example no. 29
    def process_item(self, item, spider):
        try:
            if isinstance(spider.ref_object, LoanScraper):
                item["loan_scraper"] = spider.ref_object
            elif isinstance(spider.ref_object, InsuranceWebsite):
                item["insurance_website"] = spider.ref_object
            else:
                item["news_website"] = spider.ref_object

            checker_rt = SchedulerRuntime(runtime_type="C")
            checker_rt.save()
            item["checker_runtime"] = checker_rt

            item.save()
            spider.action_successful = True
            spider.log("Item saved.", log.INFO)

        except IntegrityError as e:
            spider.log(str(e), log.ERROR)
            raise DropItem("Missing attribute.")
    def extraSetUpHTMLChecker(self):
        self.checker = Checker()
        self.checker.scraped_obj_attr = self.soa_url
        self.checker.scraper = self.scraper
        self.checker.checker_type = "X"
        self.checker.checker_x_path = '//div[@class="event_not_found"]/div/text()'
        self.checker.checker_x_path_result = "Event not found!"
        self.checker.checker_ref_url = "http://localhost:8010/static/site_with_json_content_type/event_not_found.html"
        self.checker.save()

        scheduler_rt = SchedulerRuntime()
        scheduler_rt.save()

        self.event = Event(
            title="Event 1",
            event_website=self.event_website,
            description="Event 1 description",
            url="http://localhost:8010/static/site_with_json_content_type/event_not_found.html",
            checker_runtime=scheduler_rt,
        )
        self.event.save()
Example no. 31
 def process_item(self, item, spider):
     if spider.conf['DO_ACTION']:
         try:
             item['news_website'] = spider.ref_object
             
             checker_rt = SchedulerRuntime(runtime_type='C')
             checker_rt.save()
             item['checker_runtime'] = checker_rt
             
             item.save()
             spider.action_successful = True
             spider.struct_log("{cs}Item {id} saved to Django DB.{ce}".format(
                 id=item._dds_id_str,
                 cs=spider.bcolors['OK'],
                 ce=spider.bcolors['ENDC']))
                 
         except IntegrityError as e:
             spider.log(str(e), logging.ERROR)
             raise DropItem("Missing attribute.")
             
     return item
Example no. 32
    def process_item(self, item, spider):
        if spider.conf['DO_ACTION']:
            try:
                item['food_website'] = spider.ref_object

                checker_rt = SchedulerRuntime(runtime_type='C')
                checker_rt.save()
                item['checker_runtime'] = checker_rt

                item.save()
                spider.action_successful = True
                spider.struct_log(
                    "{cs}Item {id} saved to Django DB.{ce}".format(
                        id=item._dds_id_str,
                        cs=spider.bcolors['OK'],
                        ce=spider.bcolors['ENDC']))

            except IntegrityError as e:
                spider.log(str(e), logging.ERROR)
                raise DropItem("Missing attribute.")

        return item
Example no. 33
	def process_item(self, item, spider):
		'''
		Processing step from the spider into the Django ORM/database: based on the
		spider configuration, the scraped information is saved to the DB, following
		the same rules and principles as the Scrapy pipeline model. If an integrity
		error occurs, the item is dropped with a "missing attrib" error.
		'''
		
		if spider.conf['DO_ACTION']:
			try:
				item['news_website'] = spider.ref_object

				checker_rt = SchedulerRuntime(runtime_type='C')
				checker_rt.save()
				item['checker_runtime'] = checker_rt
				item.save()
				spider.action_successful = True
				spider.log("Items saved in the DB", logging.INFO)

			except IntegrityError as e:
				spider.log(str(e), logging.ERROR)
				raise DropItem("missing attrib")
Example no. 34
    def process_item(self, item, spider):
        if spider.conf['DO_ACTION']: #Necessary since DDS v.0.9+
            try:

                item['source'] = spider.ref_object

                checker_rt = SchedulerRuntime(runtime_type='C')
                checker_rt.save()
                item['checker_runtime'] = checker_rt
                if 'started_at' in item:
                    item['started_at'] = item['started_at'] + '-01-01'
                item['industy'], created = Industy.objects.get_or_create(name=item['industy'])

                try:
                    item_model = item_to_model(item)
                except TypeError:
                    return item

                model, created = get_or_create(item_model)

                model.tags_raw = ''
                model.save()
            
                tags = ast.literal_eval(item['tags_raw'].encode('utf-8'))
                tag_objs = []
                for tag in tags:
                    tag_name = tag['tag_name']
                    tag_obj, tag_created = Tag.objects.get_or_create(name=tag_name)
                    if tag:
                        tag_obj.save()
                        tag_objs.append(tag_obj)

                if tag_objs:
                    model.tags.add(*tag_objs)

                model.tags_raw = ','.join(t.name for t in tag_objs)


                # Save investment (funding) information
                invest_firm = ''
                investments = ast.literal_eval(item['investment_raw'].encode('utf-8'))                
                for i in investments:

                    invest_date = '-'.join(str(i) for i in [i['invse_year'], i['invse_month'],i['invse_month']])

                    if i['invse_detail_money'] == 0:
                        invest_amount = i['invse_similar_money']['invse_similar_money_name'] + i['invse_currency']['invse_currency_name']
                    else:
                        invest_amount = str(i['invse_detail_money']) + i['invse_currency']['invse_currency_name']

                    if i['invse_rel_invst_name']:
                        invest_firm = i['invse_rel_invst_name']
                    else:
                        invest_firm = ' '.join([org['invst_name'] for org in i['invse_orags_list']])



                    invest_round = i['invse_round']['invse_round_name']
                    investment, investment_created = Investment.objects.get_or_create(invest_date = invest_date,
                        invest_firm = invest_firm,
                        invest_round = invest_round,
                        invest_amount = invest_amount,
                        invest_to = model)

                    investment.save()

                model.investment_raw = invest_firm
                model.save()

                # backup
                # save tags_soup
                # tags_soup = Soup(item['tags_raw'], 'lxml')
                # tags = []
                # for tag_soup in select(tags_soup, 'a span'):
                #     tag = tag_soup.string
                #     tag_obj, tag_created = Tag.objects.get_or_create(name=tag)
                #     if tag:
                #         tag_obj.save()
                #         tags.append(tag_obj)

                # if tags:
                #     model.tags.add(*tags)

                # model.tags_raw = ','.join(t.name for t in tags)


                #save investment
                # soup = Soup(item['investment_raw'], 'lxml')
                # invest_firm = ''
                # for investment_soup in soup.find_all('tr'):
                #     invest_date = select(investment_soup, 'span.date')[0].string.replace('.', '-')
                #     invest_amount = select(investment_soup, 'span.finades a')[0].string

                #     tds = select(investment_soup, 'td')
                #     if tds[3]:
                #         invest_firm = ','.join(i.string for i in tds[3].find_all('a'))

                #     invest_round = select(investment_soup, 'span.round a')[0].string
                #     investment, investment_created = Investment.objects.get_or_create(invest_date = invest_date,
                #         invest_firm = invest_firm,
                #         invest_round = invest_round,
                #         invest_amount = invest_amount,
                #         invest_to = model)

                #     investment.save()

                # model.investment_raw = invest_firm
                

                if created:
                    spider.log('==' + model.name + '== created.', log.INFO)
                    
                else:
                    spider.log('==' + model.name + '== updated.', log.INFO)

                spider.action_successful = True
                
            except IntegrityError as e:
                spider.log(str(e), logging.ERROR)
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")