Code Example #1
File: pipelines.py Project: brentcappello/imagesden
    def process_item(self, item, spider):
        try:
            item['news_website'] = spider.ref_object
            item['search_term'] = spider.search_terms  # record which search term produced this item
            
            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt
            item.save()


#            p1 = Den.objects.get(title='baby')
#            a1 = Article(search_term=spider.search_terms)
#            a1.dens.add(p1)


#            busi = item.save(commit=False)
#            p1 = Den.objects.get(title='pretty')
#            busi.dens.add(p1)
#            p1 = Den(title='pretty')
#            a1 = Article(search_term=spider.search_terms)
#            a1.dens.add(p1)

            spider.action_successful = True
            spider.log("Item saved.", log.INFO)
                
        except IntegrityError as e:
            spider.log(str(e), log.ERROR)
            raise DropItem("Missing attribute.")
Code Example #2
 def test_double_standard_id_field(self):
     checker_rt = SchedulerRuntime()
     checker_rt.save()
     event = Event(title=u'Event 1', event_website=self.event_website,
         description=u'Event 1 description',
         url=u'http://localhost:8010/static/site_generic/event5.html',
         checker_runtime=checker_rt)
     event.save()
     event = Event(title=u'Event 2', event_website=self.event_website,
         description=u'Event 1 description',
         url=u'http://localhost:8010/static/site_generic/event6.html',
         checker_runtime=checker_rt)
     event.save()
     event = Event(title=u'Event 1', event_website=self.event_website,
         description=u'Event 2 description',
         url=u'http://localhost:8010/static/site_generic/event7.html',
         checker_runtime=checker_rt)
     event.save()
     self.soa_url.id_field = False
     self.soa_url.save()
     self.soa_title.id_field = True
     self.soa_title.save()
     self.soa_desc.id_field = True
     self.soa_desc.save()
     self.run_event_spider(1)
     
     self.assertEqual(len(Event.objects.all()), 6)
     self.assertEqual(Event.objects.filter(description='Event 1 description').count(), 2)
Code Example #3
File: pipelines.py Project: jlbarrera/raceye
    def process_item(self, item, spider):
        if spider.conf['DO_ACTION']:  # Necessary since DDS v.0.9+
            try:
                item['races_website'] = spider.ref_object

                checker_rt = SchedulerRuntime(runtime_type='C')
                checker_rt.save()
                item['checker_runtime'] = checker_rt
                item['date'] = self.process_date(item['date'], spider)
                item['city'] = self.process_city(item['city'], spider)
                item['province'] = self.process_province(item['province'], spider)
                item.save()
                spider.action_successful = True
                spider.log("Item saved.", logging.INFO)

            except IntegrityError as e:
                spider.log(str(e), logging.ERROR)
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")
        else:
            if not item.is_valid():
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")

        return item
Code Example #4
File: pipelines.py Project: aobo711/sometv
    def process_item(self, item, spider):
        try:

            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt
            item['source'] = spider.ref_object

            try:
                item_model = item_to_model(item)
            except TypeError:
                return item
            
            model, created = get_or_create(item_model)
            
            update_model(model, item_model)
            
            if created:
                spider.log('==' + model.name + '== created.', log.INFO)
                
            else:
                spider.log('==' + model.name + '== updated.', log.INFO)

            spider.action_successful = True

        except IntegrityError as e:
            spider.log(str(e), log.ERROR)
            raise DropItem("Missing attribute.")
Code Example #5
    def process_item(self, item, spider):
        if spider.conf['DO_ACTION']:  # Necessary since DDS v.0.9+
            try:
                item['source'] = spider.ref_object

                checker_rt = SchedulerRuntime(runtime_type='C')
                checker_rt.save()
                item['checker_runtime'] = checker_rt

                item.save()
                spider.action_successful = True
                dds_id_str = str(item._dds_item_page) + '-' + str(
                    item._dds_item_id)
                spider.struct_log(
                    "{cs}Item {id} saved to Django DB.{ce}".format(
                        id=dds_id_str,
                        cs=spider.bcolors['OK'],
                        ce=spider.bcolors['ENDC']))

            except IntegrityError as e:
                spider.log(str(e), logging.ERROR)
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")
        else:
            if not item.is_valid():
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")

        return item
Code Example #6
File: pipelines.py Project: eyelee/pingjia_repo_0.1
 def process_item(self, item, spider):
     
     if isinstance(spider, ProductSpider):
         #spider.log("spider: " + spider.name)
         spider.log("item time is: " + item['time'])
         item['time'] = process_date(item['time'])
         # TODO:
         # drop item if price is null
         # drop item if time > no
     try:
         #if (item == ArticleItem):
             #item['news_website'] = spider.ref_object
         #else:
         item['source'] = spider.ref_object
         
         checker_rt = SchedulerRuntime(runtime_type='C')
         checker_rt.save()
         item['checker_runtime'] = checker_rt
         
         item.save()
         spider.action_successful = True
         spider.log("Item saved.", log.INFO)           
             
     except IntegrityError as e:
         spider.log(str(e), log.ERROR)
         raise DropItem("Missing attribute.")
Code Example #7
    def process_item(self, item, spider):

        if spider.conf['DO_ACTION']:

            try:
                item['website'] = spider.ref_object

                checker_rt = SchedulerRuntime(runtime_type='C')
                checker_rt.save()
                item['checker_runtime'] = checker_rt

                item.save()
                spider.action_successful = True
                spider.log("Item saved.", logging.INFO)

            except IntegrityError as e:
                spider.log(str(e), logging.ERROR)
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")
        else:
            if not item.is_valid():
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")

        return item
Code Example #8
    def process_item(self, item, spider):
        if spider.conf['DO_ACTION']:  # Necessary since DDS v.0.9+
            try:
                item['news_website'] = spider.ref_object
                if 'description' in item:
                    item['description'] = convert_Html_to_text_and_make_sumarization(item['description'])
                if 'image' in item:
                    item['image'] = change_image_size(item['image'],spider.ref_object.name)


                checker_rt = SchedulerRuntime(runtime_type='C')
                checker_rt.save()
                item['checker_runtime'] = checker_rt

                item.save()
                spider.action_successful = True
                dds_id_str = str(item._dds_item_page) + '-' + str(item._dds_item_id)
                spider.struct_log("{cs}Item {id} saved to Django DB.{ce}".format(
                    id=dds_id_str,
                    cs=spider.bcolors['OK'],
                    ce=spider.bcolors['ENDC']))

            except IntegrityError as e:
                spider.log(str(e), logging.ERROR)
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")
        else:
            if not item.is_valid():
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")

        return item
Code Example #9
 def test_double(self):
     checker_rt = SchedulerRuntime()
     checker_rt.save()
     event = Event(title=u'Event 1', url=u'http://localhost:8010/static/site_generic/event1.html',
         checker_runtime=checker_rt)
     event.save()
     self.run_event_spider(1)
     
     self.assertEqual(len(Event.objects.all()), 4)
     self.assertEqual(len(Event.objects.filter(title='Event 1')), 1)
Code Example #10
def add_listing_checker(listing):
  listing_source_cfg = ListingSourceScraperConfig.objects.get(pk=listing.listing_source_id)

  checker_rt = SchedulerRuntime(runtime_type='C', next_action_time=timezone.now() + timedelta(days=1))
  checker_rt.save()

  checker_config = ListingCheckerConfig(listing=listing, checker_runtime=checker_rt, scraper=listing_source_cfg.scraper)
  checker_config.save()

  return checker_config
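
A hypothetical usage sketch for add_listing_checker: after a new listing is saved for the first time, register a checker config so the scheduler starts re-checking it roughly a day later. `Listing` and its fields are assumptions inferred from the function above, not confirmed by the source:

# Hypothetical usage; `Listing` and its fields are inferred, not confirmed.
listing = Listing.objects.create(
    title='2BR apartment',
    url='http://example.com/listings/123',
    listing_source_id=1,  # must match an existing ListingSourceScraperConfig pk
)
checker_config = add_listing_checker(listing)
print(checker_config.checker_runtime.next_action_time)  # roughly now + 1 day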
Code Example #11
 def test_detail_page_url_id_field(self):
     checker_rt = SchedulerRuntime()
     checker_rt.save()
     event = Event(title=u'Event 1', event_website=self.event_website, 
         url=u'http://localhost:8010/static/site_generic/event5.html',
         checker_runtime=checker_rt)
     event.save()
     self.run_event_spider(1)
     
     self.assertEqual(len(Event.objects.all()), 5)
     self.assertEqual(Event.objects.filter(title='Event 1').count(), 2)
Code Example #12
 def test_double(self):
     checker_rt = SchedulerRuntime()
     checker_rt.save()
     event = Event(title=u'Event 1', event_website=self.event_website, 
         url=u'http://localhost:8010/static/site_generic/event1.html',
         checker_runtime=checker_rt)
     event.save()
     self.run_event_spider(1)
     
     self.assertEqual(len(Event.objects.all()), 4)
     self.assertEqual(len(Event.objects.filter(title='Event 1')), 1)
Code Example #13
 def setUp(self):
     super(CheckerRunTest, self).setUp()
     
     self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
     self.scraper.checker_x_path_result = u'Event was deleted!'
     self.scraper.save()
     
     scheduler_rt = SchedulerRuntime()
     scheduler_rt.save()
     
     self.event = Event(title='Event 1', event_website=self.event_website,
         description='Event 1 description', 
         url='http://localhost:8010/static/site_for_checker/event1.html',
         checker_runtime=scheduler_rt)
     self.event.save()
Code Example #14
    def process_item(self, item, spider):
        try:
            item['news_website'] = spider.ref_object

            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt

            item.save()
            spider.action_successful = True
            spider.log("Item saved.", log.INFO)

        except IntegrityError as e:
            spider.log(str(e), log.ERROR)
            raise DropItem("Missing attribute.")
Code Example #15
File: pipelines.py Project: dot-Sean/spyspy
    def process_item(self, item, spider):
        try:
            item['news_website'] = spider.ref_object

            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt

            item.save()
            spider.action_successful = True
            spider.log("Item saved.", log.INFO)

        except IntegrityError as e:
            spider.log(str(e), log.ERROR)
            raise DropItem("Missing attribute.")
Code Example #16
File: pipelines.py Project: cmwaura/Final_Red_Scrap
	def process_item(self, item, spider):
		if spider.conf['DO_ACTION']:
			try:
				item['job_website'] = spider.ref_object

				checker_rt = SchedulerRuntime(runtime_type='C')
				checker_rt.save()
				item['checker_runtime'] = checker_rt
				item.save()
				spider.action_successful = True
				spider.log("Items saved in the DB", logging.INFO)

			except IntegrityError as e:
				spider.log(str(e), logging.ERROR)
				raise DropItem("missing attrib")
Code Example #17
 def test_standard_update_field_update(self):
     checker_rt = SchedulerRuntime()
     checker_rt.save()
     event = Event(title=u'Event 1 - Old Title', event_website=self.event_website, 
         url=u'http://localhost:8010/static/site_generic/event1.html',
         checker_runtime=checker_rt)
     event.save()
     self.soa_title.attr_type = 'T'
     self.soa_title.save()
     
     self.run_event_spider(1)
     
     event_updated = Event.objects.get(pk=event.id)
     self.assertEqual(event_updated.title, 'Event 1')
     self.assertEqual(len(Event.objects.filter(title='Event 1 - Old Title')), 0)
Code Example #18
 def extraSetUpHTMLChecker(self):
     self.scraper.checker_type = 'X'
     self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
     self.scraper.checker_x_path_result = u'Event not found!'
     self.scraper.checker_ref_url = u'http://localhost:8010/static/site_with_json_content_type/event_not_found.html'
     self.scraper.save()
     
     scheduler_rt = SchedulerRuntime()
     scheduler_rt.save()
     
     self.event = Event(title='Event 1', event_website=self.event_website,
         description='Event 1 description', 
         url='http://localhost:8010/static/site_with_json_content_type/event_not_found.html',
         checker_runtime=scheduler_rt)
     self.event.save()
Code Example #19
 def test_single_standard_id_field(self):
     checker_rt = SchedulerRuntime()
     checker_rt.save()
     event = Event(title='Event 1', event_website=self.event_website, 
         url='http://localhost:8010/static/site_generic/event5.html',
         checker_runtime=checker_rt)
     event.save()
     self.soa_url.id_field = False
     self.soa_url.save()
     self.soa_title.id_field = True
     self.soa_title.save()
     self.run_event_spider(1)
     
     self.assertEqual(len(Event.objects.all()), 4)
     self.assertEqual(Event.objects.filter(title='Event 1').count(), 1)
Code Example #20
    def setUpScraperJSChecker(self, path):
        super(ScraperJSRunTest, self).setUp()

        self.scraper.checker_type = 'X'
        self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
        self.scraper.checker_ref_url = u'%ssite_with_js/event_not_found.html' % path
        self.scraper.save()
        
        scheduler_rt = SchedulerRuntime()
        scheduler_rt.save()
        
        self.event = Event(title='Event 1', event_website=self.event_website,
            description='Event 1 description', 
            url='%ssite_with_js/event_not_found.html' % path,
            checker_runtime=scheduler_rt)
        self.event.save()
Code Example #21
File: pipelines.py Project: righttrack/GabyBots
    def process_item(self, item, spider):
        try:
            # This name must match Article's source
            item["source"] = spider.ref_object

            checker_rt = SchedulerRuntime(runtime_type="C")
            checker_rt.save()
            item["checker_runtime"] = checker_rt

            item.save()
            spider.action_successful = True
            spider.log("Item saved.", log.INFO)

        except IntegrityError as e:
            spider.log(str(e), log.ERROR)
            raise DropItem("Missing attribute.")
Code Example #22
 def extraSetUpJSONChecker(self):
     self.scraper.detail_page_content_type = 'J'
     self.scraper.checker_type = 'X'
     self.scraper.checker_x_path = u'event_not_found'
     self.scraper.checker_x_path_result = u'Event not found!'
     self.scraper.checker_ref_url = u'http://localhost:8010/static/site_with_json_content_type/event_not_found.json'
     self.scraper.save()
     
     scheduler_rt = SchedulerRuntime()
     scheduler_rt.save()
     
     self.event = Event(title='Event 1', event_website=self.event_website,
         description='Event 1 description', 
         url='http://localhost:8010/static/site_with_json_content_type/event_not_found.json',
         checker_runtime=scheduler_rt)
     self.event.save()
Code Example #23
    def setUpScraperJSChecker(self, path):
        super(ScraperJSRunTest, self).setUp()

        self.scraper.checker_type = 'X'
        self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
        self.scraper.checker_ref_url = u'%ssite_with_js/event_not_found.html' % path
        self.scraper.save()

        scheduler_rt = SchedulerRuntime()
        scheduler_rt.save()

        self.event = Event(title='Event 1',
                           event_website=self.event_website,
                           description='Event 1 description',
                           url='%ssite_with_js/event_not_found.html' % path,
                           checker_runtime=scheduler_rt)
        self.event.save()
Code Example #24
    def extraSetUpHTMLChecker(self):
        self.scraper.checker_type = 'X'
        self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
        self.scraper.checker_x_path_result = u'Event not found!'
        self.scraper.checker_ref_url = u'http://localhost:8010/static/site_with_json_content_type/event_not_found.html'
        self.scraper.save()

        scheduler_rt = SchedulerRuntime()
        scheduler_rt.save()

        self.event = Event(
            title='Event 1',
            event_website=self.event_website,
            description='Event 1 description',
            url='http://localhost:8010/static/site_with_json_content_type/event_not_found.html',
            checker_runtime=scheduler_rt)
        self.event.save()
Code Example #25
    def process_item(self, item, spider):
        if spider.conf['DO_ACTION']:  # Necessary since DDS v.0.9+
            try:
                print('HJ start saving')
                item['post_site'] = spider.ref_object

                checker_rt = SchedulerRuntime(runtime_type='C')
                checker_rt.save()
                item['checker_runtime'] = checker_rt

                print(item['foo'])

                if len(item['foo']) != 0:
                    selector = Selector(text=item['foo'])
                    options = selector.xpath('//option')
                    option_items = []
                    for option in options:
                        option_items.append(option.xpath("text()").extract())
                    print(option_items)
                    option_items.pop(0)
                    item['foo'] = option_items
                    # item['foo'] = json.dumps(option_items)

                print(item['foo'])
                item.save()
                spider.action_successful = True
                dds_id_str = str(item._dds_item_page) + '-' + str(item._dds_item_id)
                spider.struct_log("{cs}Item {id} saved to Django DB.{ce}".format(
                    id=dds_id_str,
                    cs=spider.bcolors['OK'],
                    ce=spider.bcolors['ENDC']))

            except IntegrityError as e:
                print('HJ integrity error')
                spider.log(str(e), logging.ERROR)
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")
        else:
            print('HJ not do_action')
            if not item.is_valid():
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")

        return item
Code Example #26
    def extraSetUpJSONChecker(self):
        self.scraper.detail_page_content_type = 'J'
        self.scraper.checker_type = 'X'
        self.scraper.checker_x_path = u'event_not_found'
        self.scraper.checker_x_path_result = u'Event not found!'
        self.scraper.checker_ref_url = u'http://localhost:8010/static/site_with_json_content_type/event_not_found.json'
        self.scraper.save()

        scheduler_rt = SchedulerRuntime()
        scheduler_rt.save()

        self.event = Event(
            title='Event 1',
            event_website=self.event_website,
            description='Event 1 description',
            url='http://localhost:8010/static/site_with_json_content_type/event_not_found.json',
            checker_runtime=scheduler_rt)
        self.event.save()
Code Example #27
    def setUp(self):
        super(CheckerRunTest, self).setUp()

        self.scraper.checker_type = 'X'
        self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
        self.scraper.checker_x_path_result = u'Event was deleted!'
        self.scraper.checker_ref_url = u'http://localhost:8010/static/site_for_checker/event_not_found.html'
        self.scraper.save()

        scheduler_rt = SchedulerRuntime()
        scheduler_rt.save()

        self.event = Event(
            title='Event 1',
            event_website=self.event_website,
            description='Event 1 description',
            url='http://localhost:8010/static/site_for_checker/event1.html',
            checker_runtime=scheduler_rt)
        self.event.save()
Code Example #28
    def setUp(self):
        super(CheckerRunTest, self).setUp()

        self.scraper.checker_type = "X"
        self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
        self.scraper.checker_x_path_result = u"Event was deleted!"
        self.scraper.checker_ref_url = u"http://localhost:8010/static/site_for_checker/event_not_found.html"
        self.scraper.save()

        scheduler_rt = SchedulerRuntime()
        scheduler_rt.save()

        self.event = Event(
            title="Event 1",
            event_website=self.event_website,
            description="Event 1 description",
            url="http://localhost:8010/static/site_for_checker/event1.html",
            checker_runtime=scheduler_rt,
        )
        self.event.save()
Code Example #29
File: pipelines.py Project: wanghao524151/scrapy_joy
    def process_item(self, item, spider):
        try:
            if isinstance(spider.ref_object, LoanScraper):
                item["loan_scraper"] = spider.ref_object
            elif isinstance(spider.ref_object, InsuranceWebsite):
                item["insurance_website"] = spider.ref_object
            else:
                item["news_website"] = spider.ref_object

            checker_rt = SchedulerRuntime(runtime_type="C")
            checker_rt.save()
            item["checker_runtime"] = checker_rt

            item.save()
            spider.action_successful = True
            spider.log("Item saved.", log.INFO)

        except IntegrityError as e:
            spider.log(str(e), log.ERROR)
            raise DropItem("Missing attribute.")
Code Example #30
    def extraSetUpHTMLChecker(self):
        self.checker = Checker()
        self.checker.scraped_obj_attr = self.soa_url
        self.checker.scraper = self.scraper
        self.checker.checker_type = "X"
        self.checker.checker_x_path = '//div[@class="event_not_found"]/div/text()'
        self.checker.checker_x_path_result = "Event not found!"
        self.checker.checker_ref_url = "http://localhost:8010/static/site_with_json_content_type/event_not_found.html"
        self.checker.save()

        scheduler_rt = SchedulerRuntime()
        scheduler_rt.save()

        self.event = Event(
            title="Event 1",
            event_website=self.event_website,
            description="Event 1 description",
            url="http://localhost:8010/static/site_with_json_content_type/event_not_found.html",
            checker_runtime=scheduler_rt,
        )
        self.event.save()
Code Example #31
 def process_item(self, item, spider):
     if spider.conf['DO_ACTION']:
         try:
             item['news_website'] = spider.ref_object
             
             checker_rt = SchedulerRuntime(runtime_type='C')
             checker_rt.save()
             item['checker_runtime'] = checker_rt
             
             item.save()
             spider.action_successful = True
             spider.struct_log("{cs}Item {id} saved to Django DB.{ce}".format(
                 id=item._dds_id_str,
                 cs=spider.bcolors['OK'],
                 ce=spider.bcolors['ENDC']))
                 
         except IntegrityError as e:
             spider.log(str(e), logging.ERROR)
             raise DropItem("Missing attribute.")
             
     return item
Code Example #32
File: pipelines.py Project: attract/eat-bot
    def process_item(self, item, spider):
        if spider.conf['DO_ACTION']:
            try:
                item['food_website'] = spider.ref_object

                checker_rt = SchedulerRuntime(runtime_type='C')
                checker_rt.save()
                item['checker_runtime'] = checker_rt

                item.save()
                spider.action_successful = True
                spider.struct_log(
                    "{cs}Item {id} saved to Django DB.{ce}".format(
                        id=item._dds_id_str,
                        cs=spider.bcolors['OK'],
                        ce=spider.bcolors['ENDC']))

            except IntegrityError as e:
                spider.log(str(e), logging.ERROR)
                raise DropItem("Missing attribute.")

        return item
Code Example #33
File: pipelines.py Project: cmwaura/Newspade
	def process_item(self, item, spider):
		'''
		This is the processing portion of the spider into the Django ORM/database. Based on
		the spider's configuration, it saves the scraped information to the DB, following the
		same rules and principles as the Scrapy pipeline model. If an integrity error occurs,
		the item is dropped with a "missing attrib" error.
		'''
		
		if spider.conf['DO_ACTION']:
			try:
				item['news_website'] = spider.ref_object

				checker_rt = SchedulerRuntime(runtime_type='C')
				checker_rt.save()
				item['checker_runtime'] = checker_rt
				item.save()
				spider.action_successful = True
				spider.log("Items saved in the DB", logging.INFO)

			except IntegrityError as e:
				spider.log(str(e), logging.ERROR)
				raise DropItem("missing attrib")
Code Example #34
File: pipelines.py Project: aobo711/Investments
    def process_item(self, item, spider):
        if spider.conf['DO_ACTION']: #Necessary since DDS v.0.9+
            try:

                item['source'] = spider.ref_object

                checker_rt = SchedulerRuntime(runtime_type='C')
                checker_rt.save()
                item['checker_runtime'] = checker_rt
                if 'started_at' in item:
                    item['started_at'] = item['started_at'] + '-01-01'
                item['industy'], created = Industy.objects.get_or_create(name=item['industy'])

                try:
                    item_model = item_to_model(item)
                except TypeError:
                    return item

                model, created = get_or_create(item_model)

                model.tags_raw = ''
                model.save()
            
                tags = ast.literal_eval(item['tags_raw'].encode('utf-8'))
                tag_objs = []
                for tag in tags:
                    tag_name = tag['tag_name']
                    tag_obj, tag_created = Tag.objects.get_or_create(name=tag_name)
                    if tag:
                        tag_obj.save()
                        tag_objs.append(tag_obj)

                if tag_objs:
                    model.tags.add(*tag_objs)

                model.tags_raw = ','.join(t.name for t in tag_objs)


                # Save the funding (investment) records
                invest_firm = ''
                investments = ast.literal_eval(item['investment_raw'].encode('utf-8'))                
                for i in investments:

                    # Note: 'invse_month' appears twice here in the source; a day field was likely intended.
                    invest_date = '-'.join(str(part) for part in [i['invse_year'], i['invse_month'], i['invse_month']])

                    if i['invse_detail_money'] == 0:
                        invest_amount = i['invse_similar_money']['invse_similar_money_name'] + i['invse_currency']['invse_currency_name']
                    else:
                        invest_amount = str(i['invse_detail_money']) + i['invse_currency']['invse_currency_name']

                    if i['invse_rel_invst_name']:
                        invest_firm = i['invse_rel_invst_name']
                    else:
                        invest_firm = ' '.join([org['invst_name'] for org in i['invse_orags_list']])



                    invest_round = i['invse_round']['invse_round_name']
                    investment, investment_created = Investment.objects.get_or_create(
                        invest_date=invest_date,
                        invest_firm=invest_firm,
                        invest_round=invest_round,
                        invest_amount=invest_amount,
                        invest_to=model)

                    investment.save()

                model.investment_raw = invest_firm
                model.save()

                # backup
                # save tags_soup
                # tags_soup = Soup(item['tags_raw'], 'lxml')
                # tags = []
                # for tag_soup in select(tags_soup, 'a span'):
                #     tag = tag_soup.string
                #     tag_obj, tag_created = Tag.objects.get_or_create(name=tag)
                #     if tag:
                #         tag_obj.save()
                #         tags.append(tag_obj)

                # if tags:
                #     model.tags.add(*tags)

                # model.tags_raw = ','.join(t.name for t in tags)


                #save investment
                # soup = Soup(item['investment_raw'], 'lxml')
                # invest_firm = ''
                # for investment_soup in soup.find_all('tr'):
                #     invest_date = select(investment_soup, 'span.date')[0].string.replace('.', '-')
                #     invest_amount = select(investment_soup, 'span.finades a')[0].string

                #     tds = select(investment_soup, 'td')
                #     if tds[3]:
                #         invest_firm = ','.join(i.string for i in tds[3].find_all('a'))

                #     invest_round = select(investment_soup, 'span.round a')[0].string
                #     investment, investment_created = Investment.objects.get_or_create(invest_date = invest_date,
                #         invest_firm = invest_firm,
                #         invest_round = invest_round,
                #         invest_amount = invest_amount,
                #         invest_to = model)

                #     investment.save()

                # model.investment_raw = invest_firm
                

                if created:
                    spider.log('==' + model.name + '== created.', log.INFO)
                    
                else:
                    spider.log('==' + model.name + '== updated.', log.INFO)

                spider.action_successful = True
                
            except IntegrityError as e:
                spider.log(str(e), logging.ERROR)
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")