Beispiel #1
0
    def process_item(self, item, spider):
        """Persist or validate a scraped item, depending on the spider config.

        When ``spider.conf['DO_ACTION']`` is falsy the item is only checked
        with ``item.is_valid()``. Otherwise the item is linked to
        ``spider.ref_object`` and to a freshly saved ``SchedulerRuntime``
        (``runtime_type='C'``) and then saved.

        Returns:
            The item that was passed in.

        Raises:
            DropItem: if validation fails, or if saving raises
                ``IntegrityError``.
        """
        if not spider.conf['DO_ACTION']:
            # Dry run: nothing is written, the item is only validated.
            if not item.is_valid():
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")
            return item

        try:
            item['website'] = spider.ref_object

            runtime = SchedulerRuntime(runtime_type='C')
            runtime.save()
            item['checker_runtime'] = runtime

            item.save()
            spider.action_successful = True
            spider.log("Item saved.", logging.INFO)
        except IntegrityError as exc:
            spider.log(str(exc), logging.ERROR)
            spider.log(str(item._errors), logging.ERROR)
            raise DropItem("Missing attribute.")

        return item
 def test_double_standard_id_field(self):
     """Run the event spider with title and description as id fields.

     Three events are stored up front, then the url attribute stops being
     an id field while title and description become id fields. After
     ``run_event_spider(1)`` the test expects 6 events overall, 2 of them
     with the description 'Event 1 description'.
     """
     runtime = SchedulerRuntime()
     runtime.save()

     # (title, description, url) of the rows stored before the spider runs.
     stored_rows = (
         (u'Event 1', u'Event 1 description',
          u'http://localhost:8010/static/site_generic/event5.html'),
         (u'Event 2', u'Event 1 description',
          u'http://localhost:8010/static/site_generic/event6.html'),
         (u'Event 1', u'Event 2 description',
          u'http://localhost:8010/static/site_generic/event7.html'),
     )
     for row_title, row_description, row_url in stored_rows:
         Event(title=row_title, event_website=self.event_website,
               description=row_description,
               url=row_url,
               checker_runtime=runtime).save()

     # Move the id-field role from the url to title + description.
     id_field_flags = (
         (self.soa_url, False),
         (self.soa_title, True),
         (self.soa_desc, True),
     )
     for scraper_attr, flag in id_field_flags:
         scraper_attr.id_field = flag
         scraper_attr.save()

     self.run_event_spider(1)

     stored_events = Event.objects.all()
     self.assertEqual(len(stored_events), 6)
     same_description = Event.objects.filter(description='Event 1 description')
     self.assertEqual(same_description.count(), 2)
    def process_item(self, item, spider):
        """Post-process and persist a scraped item, or just validate it.

        With ``spider.conf['DO_ACTION']`` falsy the item is only checked
        via ``item.is_valid()``. Otherwise the item is linked to
        ``spider.ref_object``, its optional 'description' and 'image'
        fields are run through the project helpers, a new
        ``SchedulerRuntime`` (``runtime_type='C'``) is attached, and the
        item is saved.

        Returns:
            The item that was passed in.

        Raises:
            DropItem: if validation fails, or if saving raises
                ``IntegrityError``.
        """
        # Necessary since DDS v.0.9+
        if not spider.conf['DO_ACTION']:
            if not item.is_valid():
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")
            return item

        try:
            item['news_website'] = spider.ref_object
            if 'description' in item:
                item['description'] = convert_Html_to_text_and_make_sumarization(
                    item['description'])
            if 'image' in item:
                item['image'] = change_image_size(
                    item['image'], spider.ref_object.name)

            runtime = SchedulerRuntime(runtime_type='C')
            runtime.save()
            item['checker_runtime'] = runtime

            item.save()
            spider.action_successful = True

            # "<page>-<id>" label used in the structured log line.
            item_label = '-'.join(
                (str(item._dds_item_page), str(item._dds_item_id)))
            message = "{cs}Item {id} saved to Django DB.{ce}".format(
                id=item_label,
                cs=spider.bcolors['OK'],
                ce=spider.bcolors['ENDC'])
            spider.struct_log(message)
        except IntegrityError as exc:
            spider.log(str(exc), logging.ERROR)
            spider.log(str(item._errors), logging.ERROR)
            raise DropItem("Missing attribute.")

        return item
    def process_item(self, item, spider):
        """Persist a scraped item, or only validate it on a dry run.

        With ``spider.conf['DO_ACTION']`` falsy the item is only checked
        via ``item.is_valid()``. Otherwise ``item['source']`` is set to
        ``spider.ref_object``, a new ``SchedulerRuntime``
        (``runtime_type='C'``) is saved and attached, the item is saved
        and a structured log line is emitted.

        Returns:
            The item that was passed in.

        Raises:
            DropItem: if validation fails, or if saving raises
                ``IntegrityError``.
        """
        # Necessary since DDS v.0.9+
        if not spider.conf['DO_ACTION']:
            if not item.is_valid():
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")
            return item

        try:
            item['source'] = spider.ref_object

            runtime = SchedulerRuntime(runtime_type='C')
            runtime.save()
            item['checker_runtime'] = runtime

            item.save()
            spider.action_successful = True

            # "<page>-<id>" label used in the structured log line.
            item_label = '-'.join(
                (str(item._dds_item_page), str(item._dds_item_id)))
            message = "{cs}Item {id} saved to Django DB.{ce}".format(
                id=item_label,
                cs=spider.bcolors['OK'],
                ce=spider.bcolors['ENDC'])
            spider.struct_log(message)
        except IntegrityError as exc:
            spider.log(str(exc), logging.ERROR)
            spider.log(str(item._errors), logging.ERROR)
            raise DropItem("Missing attribute.")

        return item
 def test_detail_page_url_id_field(self):
     """Run the event spider with one 'Event 1' row stored at event5.html.

     After ``run_event_spider(1)`` the test expects 5 events overall,
     2 of them titled 'Event 1'.
     """
     runtime = SchedulerRuntime()
     runtime.save()

     stored_event = Event(
         title=u'Event 1',
         event_website=self.event_website,
         url=u'http://localhost:8010/static/site_generic/event5.html',
         checker_runtime=runtime)
     stored_event.save()

     self.run_event_spider(1)

     stored_events = Event.objects.all()
     self.assertEqual(len(stored_events), 5)
     same_title = Event.objects.filter(title='Event 1')
     self.assertEqual(same_title.count(), 2)
 def test_double(self):
     """Run the event spider with one 'Event 1' row stored at event1.html.

     After ``run_event_spider(1)`` the test expects 4 events overall and
     exactly one titled 'Event 1'.
     """
     runtime = SchedulerRuntime()
     runtime.save()

     stored_event = Event(
         title=u'Event 1',
         event_website=self.event_website,
         url=u'http://localhost:8010/static/site_generic/event1.html',
         checker_runtime=runtime)
     stored_event.save()

     self.run_event_spider(1)

     stored_events = Event.objects.all()
     self.assertEqual(len(stored_events), 4)
     same_title = Event.objects.filter(title='Event 1')
     self.assertEqual(len(same_title), 1)
 def test_standard_update_field_update(self):
     """Check that a spider run replaces an outdated stored title.

     An event titled 'Event 1 - Old Title' is stored, the title scraper
     attribute gets ``attr_type = 'T'``, and the spider is run. The same
     row (looked up by primary key) must then be titled 'Event 1', and no
     row may still carry the old title.
     """
     runtime = SchedulerRuntime()
     runtime.save()

     outdated_title = u'Event 1 - Old Title'
     stored_event = Event(
         title=outdated_title,
         event_website=self.event_website,
         url=u'http://localhost:8010/static/site_generic/event1.html',
         checker_runtime=runtime)
     stored_event.save()

     title_attr = self.soa_title
     title_attr.attr_type = 'T'
     title_attr.save()

     self.run_event_spider(1)

     refreshed = Event.objects.get(pk=stored_event.id)
     self.assertEqual(refreshed.title, 'Event 1')
     leftovers = Event.objects.filter(title='Event 1 - Old Title')
     self.assertEqual(len(leftovers), 0)
Beispiel #8
0
    def process_item(self, item, spider):
        """Link the scraped item to its reference object and save it.

        The item gets ``spider.ref_object`` as 'news_website' and a freshly
        saved ``SchedulerRuntime`` (``runtime_type='C'``) as
        'checker_runtime' before it is saved.

        Returns:
            The item that was passed in, so that later pipeline stages
            receive it instead of ``None``.

        Raises:
            DropItem: if saving raises ``IntegrityError``.
        """
        try:
            item['news_website'] = spider.ref_object

            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt

            item.save()
            spider.action_successful = True
            spider.log("Item saved.", log.INFO)

        # "except X as e" is valid on Python 2.6+ and Python 3; the former
        # "except X, e" spelling is a SyntaxError on Python 3.
        except IntegrityError as e:
            spider.log(str(e), log.ERROR)
            raise DropItem("Missing attribute.")

        return item
Beispiel #9
0
 def test_single_standard_id_field(self):
     """Run the event spider with the title as the only id field.

     One 'Event 1' row is stored at event5.html, the url attribute stops
     being an id field and the title becomes one. After
     ``run_event_spider(1)`` the test expects 4 events overall and exactly
     one titled 'Event 1'.
     """
     runtime = SchedulerRuntime()
     runtime.save()

     stored_event = Event(
         title='Event 1',
         event_website=self.event_website,
         url='http://localhost:8010/static/site_generic/event5.html',
         checker_runtime=runtime)
     stored_event.save()

     # Move the id-field role from the url to the title.
     id_field_flags = (
         (self.soa_url, False),
         (self.soa_title, True),
     )
     for scraper_attr, flag in id_field_flags:
         scraper_attr.id_field = flag
         scraper_attr.save()

     self.run_event_spider(1)

     stored_events = Event.objects.all()
     self.assertEqual(len(stored_events), 4)
     same_title = Event.objects.filter(title='Event 1')
     self.assertEqual(same_title.count(), 1)
Beispiel #10
0
    def setUpScraperJSChecker(self, path):
        """Run the base ``setUp`` and prepare an XPath checker fixture.

        Configures ``self.scraper`` with checker type 'X', an XPath and a
        reference URL under ``path``, then stores ``self.event`` pointing
        at the same 'event_not_found.html' page with a new
        ``SchedulerRuntime`` attached.

        Args:
            path: URL prefix that 'site_with_js/event_not_found.html' is
                appended to.
        """
        super(ScraperJSRunTest, self).setUp()

        scraper = self.scraper
        scraper.checker_type = 'X'
        scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
        scraper.checker_ref_url = u'%ssite_with_js/event_not_found.html' % path
        scraper.save()

        runtime = SchedulerRuntime()
        runtime.save()

        event_fields = dict(
            title='Event 1',
            event_website=self.event_website,
            description='Event 1 description',
            url='%ssite_with_js/event_not_found.html' % path,
            checker_runtime=runtime,
        )
        self.event = Event(**event_fields)
        self.event.save()
Beispiel #11
0
    def extraSetUpHTMLChecker(self):
        """Prepare an XPath checker fixture against an HTML reference page.

        Configures ``self.scraper`` with checker type 'X', an XPath, the
        expected XPath result and the reference URL, then stores
        ``self.event`` pointing at the same 'event_not_found.html' page
        with a new ``SchedulerRuntime`` attached.
        """
        scraper = self.scraper
        scraper.checker_type = 'X'
        scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
        scraper.checker_x_path_result = u'Event not found!'
        scraper.checker_ref_url = u'http://localhost:8010/static/site_with_json_content_type/event_not_found.html'
        scraper.save()

        runtime = SchedulerRuntime()
        runtime.save()

        event_fields = dict(
            title='Event 1',
            event_website=self.event_website,
            description='Event 1 description',
            url='http://localhost:8010/static/site_with_json_content_type/event_not_found.html',
            checker_runtime=runtime,
        )
        self.event = Event(**event_fields)
        self.event.save()
    def process_item(self, item, spider):
        """Persist a scraped item, or only validate it on a dry run.

        With ``spider.conf['DO_ACTION']`` falsy the item is only checked
        via ``item.is_valid()``. Otherwise ``item['post_site']`` is set to
        ``spider.ref_object``, a new ``SchedulerRuntime``
        (``runtime_type='C'``) is attached, a non-empty ``item['foo']`` is
        parsed as markup and replaced by the extracted text of every
        ``<option>`` node except the first, and the item is saved.

        Returns:
            The item that was passed in.

        Raises:
            DropItem: if validation fails, or if saving raises
                ``IntegrityError``.
        """
        # Necessary since DDS v.0.9+
        if spider.conf['DO_ACTION']:
            try:
                # NOTE(review): the print() calls look like leftover debug
                # output; kept so the observable output does not change.
                print('HJ start saving')
                item['post_site'] = spider.ref_object

                checker_rt = SchedulerRuntime(runtime_type='C')
                checker_rt.save()
                item['checker_runtime'] = checker_rt

                print(item['foo'])

                if len(item['foo']) != 0:
                    selector = Selector(text=item['foo'])
                    option_items = [
                        option.xpath("text()").extract()
                        for option in selector.xpath('//option')
                    ]
                    print(option_items)
                    # Skip the first <option>. Slicing instead of pop(0)
                    # avoids an uncaught IndexError when the markup holds
                    # no <option> nodes at all.
                    item['foo'] = option_items[1:]

                print(item['foo'])
                item.save()
                spider.action_successful = True
                dds_id_str = str(item._dds_item_page) + '-' + str(item._dds_item_id)
                spider.struct_log("{cs}Item {id} saved to Django DB.{ce}".format(
                    id=dds_id_str,
                    cs=spider.bcolors['OK'],
                    ce=spider.bcolors['ENDC']))

            except IntegrityError as e:
                print('HJ integrity error')
                spider.log(str(e), logging.ERROR)
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")
        else:
            print('HJ not do_action')
            if not item.is_valid():
                spider.log(str(item._errors), logging.ERROR)
                raise DropItem("Missing attribute.")

        return item
Beispiel #13
0
    def extraSetUpJSONChecker(self):
        """Prepare a checker fixture against a JSON reference page.

        Sets the scraper's detail page content type to 'J' and configures
        checker type 'X' with the 'event_not_found' path, its expected
        result and the reference URL. Then stores ``self.event`` pointing
        at the same 'event_not_found.json' resource with a new
        ``SchedulerRuntime`` attached.
        """
        scraper = self.scraper
        scraper.detail_page_content_type = 'J'
        scraper.checker_type = 'X'
        scraper.checker_x_path = u'event_not_found'
        scraper.checker_x_path_result = u'Event not found!'
        scraper.checker_ref_url = u'http://localhost:8010/static/site_with_json_content_type/event_not_found.json'
        scraper.save()

        runtime = SchedulerRuntime()
        runtime.save()

        event_fields = dict(
            title='Event 1',
            event_website=self.event_website,
            description='Event 1 description',
            url='http://localhost:8010/static/site_with_json_content_type/event_not_found.json',
            checker_runtime=runtime,
        )
        self.event = Event(**event_fields)
        self.event.save()
Beispiel #14
0
    def setUp(self):
        """Run the base ``setUp`` and prepare the checker fixture.

        Configures ``self.scraper`` with checker type 'X', an XPath, the
        expected result 'Event was deleted!' and a reference URL, then
        stores ``self.event`` pointing at 'event1.html' with a new
        ``SchedulerRuntime`` attached.
        """
        super(CheckerRunTest, self).setUp()

        scraper = self.scraper
        scraper.checker_type = 'X'
        scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
        scraper.checker_x_path_result = u'Event was deleted!'
        scraper.checker_ref_url = u'http://localhost:8010/static/site_for_checker/event_not_found.html'
        scraper.save()

        runtime = SchedulerRuntime()
        runtime.save()

        event_fields = dict(
            title='Event 1',
            event_website=self.event_website,
            description='Event 1 description',
            url='http://localhost:8010/static/site_for_checker/event1.html',
            checker_runtime=runtime,
        )
        self.event = Event(**event_fields)
        self.event.save()
Beispiel #15
0
    def process_item(self, item, spider):
        """Persist a scraped item when the spider is configured to act.

        With ``spider.conf['DO_ACTION']`` falsy the item is passed through
        untouched. Otherwise ``item['food_website']`` is set to
        ``spider.ref_object``, a new ``SchedulerRuntime``
        (``runtime_type='C'``) is saved and attached, the item is saved
        and a structured log line is emitted.

        Returns:
            The item that was passed in.

        Raises:
            DropItem: if saving raises ``IntegrityError``.
        """
        if not spider.conf['DO_ACTION']:
            return item

        try:
            item['food_website'] = spider.ref_object

            runtime = SchedulerRuntime(runtime_type='C')
            runtime.save()
            item['checker_runtime'] = runtime

            item.save()
            spider.action_successful = True

            message = "{cs}Item {id} saved to Django DB.{ce}".format(
                id=item._dds_id_str,
                cs=spider.bcolors['OK'],
                ce=spider.bcolors['ENDC'])
            spider.struct_log(message)
        except IntegrityError as exc:
            spider.log(str(exc), logging.ERROR)
            raise DropItem("Missing attribute.")

        return item