def process_item(self, item, spider):
    try:
        item['news_website'] = spider.ref_object
        # Added: records which search term produced this item
        item['search_term'] = spider.search_terms
        checker_rt = SchedulerRuntime(runtime_type='C')
        checker_rt.save()
        item['checker_runtime'] = checker_rt
        item.save()
        # Earlier experiments with attaching Den objects to an Article
        # (many-to-many), kept for reference:
        # p1 = Den.objects.get(title='baby')
        # a1 = Article(search_term=spider.search_terms)
        # a1.dens.add(p1)
        # busi = item.save(commit=False)
        # p1 = Den.objects.get(title='pretty')
        # busi.dens.add(p1)
        # p1 = Den(title='pretty')
        # a1 = Article(search_term=spider.search_terms)
        # a1.dens.add(p1)
        spider.action_successful = True
        spider.log("Item saved.", log.INFO)
    except IntegrityError as e:
        spider.log(str(e), log.ERROR)
        raise DropItem("Missing attribute.")
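# Note on the commented-out experiments above: Django rejects .add() on a
# many-to-many field until the owning instance has a primary key, which is
# why the unsaved-Article variants could not work. A minimal sketch of the
# working order of operations, assuming hypothetical Article and Den models
# with a `dens` ManyToManyField (names taken from the comments above, not
# from a confirmed schema):
def attach_den_to_article(search_term, den_title):
    article = Article(search_term=search_term)
    article.save()  # the instance needs a PK before M2M .add()
    den, _ = Den.objects.get_or_create(title=den_title)
    article.dens.add(den)  # writes the join-table row
    return article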
def test_double_standard_id_field(self):
    checker_rt = SchedulerRuntime()
    checker_rt.save()
    event = Event(title=u'Event 1', event_website=self.event_website,
        description=u'Event 1 description',
        url=u'http://localhost:8010/static/site_generic/event5.html',
        checker_runtime=checker_rt)
    event.save()
    event = Event(title=u'Event 2', event_website=self.event_website,
        description=u'Event 1 description',
        url=u'http://localhost:8010/static/site_generic/event6.html',
        checker_runtime=checker_rt)
    event.save()
    event = Event(title=u'Event 1', event_website=self.event_website,
        description=u'Event 2 description',
        url=u'http://localhost:8010/static/site_generic/event7.html',
        checker_runtime=checker_rt)
    event.save()
    self.soa_url.id_field = False
    self.soa_url.save()
    self.soa_title.id_field = True
    self.soa_title.save()
    self.soa_desc.id_field = True
    self.soa_desc.save()
    self.run_event_spider(1)
    self.assertEqual(len(Event.objects.all()), 6)
    self.assertEqual(Event.objects.filter(description='Event 1 description').count(), 2)
def process_item(self, item, spider):
    if spider.conf['DO_ACTION']:  # Necessary since DDS v.0.9+
        try:
            item['races_website'] = spider.ref_object
            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt
            item['date'] = self.process_date(item['date'], spider)
            item['city'] = self.process_city(item['city'], spider)
            item['province'] = self.process_province(item['province'], spider)
            item.save()
            spider.action_successful = True
            spider.log("Item saved.", logging.INFO)
        except IntegrityError as e:
            spider.log(str(e), logging.ERROR)
            spider.log(str(item._errors), logging.ERROR)
            raise DropItem("Missing attribute.")
    else:
        if not item.is_valid():
            spider.log(str(item._errors), logging.ERROR)
            raise DropItem("Missing attribute.")
    return item
def process_item(self, item, spider):
    try:
        checker_rt = SchedulerRuntime(runtime_type='C')
        checker_rt.save()
        item['checker_runtime'] = checker_rt
        item['source'] = spider.ref_object
        try:
            item_model = item_to_model(item)
        except TypeError:
            return item
        model, created = get_or_create(item_model)
        update_model(model, item_model)
        if created:
            spider.log('==' + model.name + '== created.', log.INFO)
        else:
            spider.log('==' + model.name + '== updated.', log.INFO)
        spider.action_successful = True
    except IntegrityError as e:
        spider.log(str(e), log.ERROR)
        raise DropItem("Missing attribute.")
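# The item_to_model()/get_or_create()/update_model() helpers are not shown in
# this pipeline. A minimal sketch of what they typically look like in this
# DjangoItem pattern, assuming the item is a scrapy_djangoitem DjangoItem and
# the model has a unique `url` field to deduplicate on (both assumptions):
from django.forms.models import model_to_dict

def item_to_model(item):
    model_class = getattr(item, 'django_model', None)
    if not model_class:
        raise TypeError("Item is not a DjangoItem")
    return item.instance  # unsaved model instance built from the item

def get_or_create(model):
    model_class = type(model)
    try:
        return model_class.objects.get(url=model.url), False
    except model_class.DoesNotExist:
        return model, True

def update_model(destination, source, commit=True):
    pk = destination.pk  # keep the existing primary key
    # copies simple editable fields only; M2M fields would need .set()
    for key, value in model_to_dict(source).items():
        setattr(destination, key, value)
    destination.pk = pk
    if commit:
        destination.save()
    return destination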
def process_item(self, item, spider):
    if spider.conf['DO_ACTION']:  # Necessary since DDS v.0.9+
        try:
            item['source'] = spider.ref_object
            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt
            item.save()
            spider.action_successful = True
            dds_id_str = str(item._dds_item_page) + '-' + str(item._dds_item_id)
            spider.struct_log("{cs}Item {id} saved to Django DB.{ce}".format(
                id=dds_id_str,
                cs=spider.bcolors['OK'],
                ce=spider.bcolors['ENDC']))
        except IntegrityError as e:
            spider.log(str(e), logging.ERROR)
            spider.log(str(item._errors), logging.ERROR)
            raise DropItem("Missing attribute.")
    else:
        if not item.is_valid():
            spider.log(str(item._errors), logging.ERROR)
            raise DropItem("Missing attribute.")
    return item
def process_item(self, item, spider):
    if isinstance(spider, ProductSpider):
        # spider.log("spider: " + spider.name)
        spider.log("item time is: " + item['time'])
        item['time'] = process_date(item['time'])
        # To do:
        # - drop item if price is null
        # - drop item if time > now
        try:
            # if (item == ArticleItem):
            #     item['news_website'] = spider.ref_object
            # else:
            item['source'] = spider.ref_object
            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt
            item.save()
            spider.action_successful = True
            spider.log("Item saved.", log.INFO)
        except IntegrityError as e:
            spider.log(str(e), log.ERROR)
            raise DropItem("Missing attribute.")
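# A minimal sketch of the two to-do checks above, assuming the item exposes
# 'price' and 'time' keys and that process_date() yields a datetime (both
# assumptions; the exact field semantics are not given in the snippet):
from datetime import datetime
from scrapy.exceptions import DropItem

def validate_product_item(item):
    if not item.get('price'):
        raise DropItem("Price is null.")
    if item['time'] > datetime.now():
        raise DropItem("Item time lies in the future.")
    return item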
def process_item(self, item, spider):
    if spider.conf['DO_ACTION']:
        try:
            item['website'] = spider.ref_object
            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt
            item.save()
            spider.action_successful = True
            spider.log("Item saved.", logging.INFO)
        except IntegrityError as e:
            spider.log(str(e), logging.ERROR)
            spider.log(str(item._errors), logging.ERROR)
            raise DropItem("Missing attribute.")
    else:
        if not item.is_valid():
            spider.log(str(item._errors), logging.ERROR)
            raise DropItem("Missing attribute.")
    return item
def process_item(self, item, spider):
    if spider.conf['DO_ACTION']:  # Necessary since DDS v.0.9+
        try:
            item['news_website'] = spider.ref_object
            if 'description' in item:
                item['description'] = convert_Html_to_text_and_make_sumarization(item['description'])
            if 'image' in item:
                item['image'] = change_image_size(item['image'], spider.ref_object.name)
            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt
            item.save()
            spider.action_successful = True
            dds_id_str = str(item._dds_item_page) + '-' + str(item._dds_item_id)
            spider.struct_log("{cs}Item {id} saved to Django DB.{ce}".format(
                id=dds_id_str,
                cs=spider.bcolors['OK'],
                ce=spider.bcolors['ENDC']))
        except IntegrityError as e:
            spider.log(str(e), logging.ERROR)
            spider.log(str(item._errors), logging.ERROR)
            raise DropItem("Missing attribute.")
    else:
        if not item.is_valid():
            spider.log(str(item._errors), logging.ERROR)
            raise DropItem("Missing attribute.")
    return item
def test_double(self):
    checker_rt = SchedulerRuntime()
    checker_rt.save()
    event = Event(title=u'Event 1',
        url=u'http://localhost:8010/static/site_generic/event1.html',
        checker_runtime=checker_rt)
    event.save()
    self.run_event_spider(1)
    self.assertEqual(len(Event.objects.all()), 4)
    self.assertEqual(len(Event.objects.filter(title='Event 1')), 1)
def add_listing_checker(listing):
    listing_source_cfg = ListingSourceScraperConfig.objects.get(pk=listing.listing_source_id)
    checker_rt = SchedulerRuntime(runtime_type='C',
        next_action_time=timezone.now() + timedelta(days=1))
    checker_rt.save()
    checker_config = ListingCheckerConfig(listing=listing,
        checker_runtime=checker_rt,
        scraper=listing_source_cfg.scraper)
    checker_config.save()
    return checker_config
def test_detail_page_url_id_field(self):
    checker_rt = SchedulerRuntime()
    checker_rt.save()
    event = Event(title=u'Event 1', event_website=self.event_website,
        url=u'http://localhost:8010/static/site_generic/event5.html',
        checker_runtime=checker_rt)
    event.save()
    self.run_event_spider(1)
    self.assertEqual(len(Event.objects.all()), 5)
    self.assertEqual(Event.objects.filter(title='Event 1').count(), 2)
def test_double(self):
    checker_rt = SchedulerRuntime()
    checker_rt.save()
    event = Event(title=u'Event 1', event_website=self.event_website,
        url=u'http://localhost:8010/static/site_generic/event1.html',
        checker_runtime=checker_rt)
    event.save()
    self.run_event_spider(1)
    self.assertEqual(len(Event.objects.all()), 4)
    self.assertEqual(len(Event.objects.filter(title='Event 1')), 1)
def setUp(self):
    super(CheckerRunTest, self).setUp()
    self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
    self.scraper.checker_x_path_result = u'Event was deleted!'
    self.scraper.save()
    scheduler_rt = SchedulerRuntime()
    scheduler_rt.save()
    self.event = Event(title='Event 1', event_website=self.event_website,
        description='Event 1 description',
        url='http://localhost:8010/static/site_for_checker/event1.html',
        checker_runtime=scheduler_rt)
    self.event.save()
def process_item(self, item, spider):
    try:
        item['news_website'] = spider.ref_object
        checker_rt = SchedulerRuntime(runtime_type='C')
        checker_rt.save()
        item['checker_runtime'] = checker_rt
        item.save()
        spider.action_successful = True
        spider.log("Item saved.", log.INFO)
    except IntegrityError as e:
        spider.log(str(e), log.ERROR)
        raise DropItem("Missing attribute.")
def process_item(self, item, spider):
    if spider.conf['DO_ACTION']:
        try:
            item['job_website'] = spider.ref_object
            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt
            item.save()
            spider.action_successful = True
            spider.log("Items saved in the DB", logging.INFO)
        except IntegrityError as e:
            spider.log(str(e), logging.ERROR)
            raise DropItem("missing attrib")
def test_standard_update_field_update(self):
    checker_rt = SchedulerRuntime()
    checker_rt.save()
    event = Event(title=u'Event 1 - Old Title', event_website=self.event_website,
        url=u'http://localhost:8010/static/site_generic/event1.html',
        checker_runtime=checker_rt)
    event.save()
    self.soa_title.attr_type = 'T'
    self.soa_title.save()
    self.run_event_spider(1)
    event_updated = Event.objects.get(pk=event.id)
    self.assertEqual(event_updated.title, 'Event 1')
    self.assertEqual(len(Event.objects.filter(title='Event 1 - Old Title')), 0)
def extraSetUpHTMLChecker(self):
    self.scraper.checker_type = 'X'
    self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
    self.scraper.checker_x_path_result = u'Event not found!'
    self.scraper.checker_ref_url = u'http://localhost:8010/static/site_with_json_content_type/event_not_found.html'
    self.scraper.save()
    scheduler_rt = SchedulerRuntime()
    scheduler_rt.save()
    self.event = Event(title='Event 1', event_website=self.event_website,
        description='Event 1 description',
        url='http://localhost:8010/static/site_with_json_content_type/event_not_found.html',
        checker_runtime=scheduler_rt)
    self.event.save()
def test_single_standard_id_field(self):
    checker_rt = SchedulerRuntime()
    checker_rt.save()
    event = Event(title='Event 1', event_website=self.event_website,
        url='http://localhost:8010/static/site_generic/event5.html',
        checker_runtime=checker_rt)
    event.save()
    self.soa_url.id_field = False
    self.soa_url.save()
    self.soa_title.id_field = True
    self.soa_title.save()
    self.run_event_spider(1)
    self.assertEqual(len(Event.objects.all()), 4)
    self.assertEqual(Event.objects.filter(title='Event 1').count(), 1)
def setUpScraperJSChecker(self, path):
    super(ScraperJSRunTest, self).setUp()
    self.scraper.checker_type = 'X'
    self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
    self.scraper.checker_ref_url = u'%ssite_with_js/event_not_found.html' % path
    self.scraper.save()
    scheduler_rt = SchedulerRuntime()
    scheduler_rt.save()
    self.event = Event(title='Event 1', event_website=self.event_website,
        description='Event 1 description',
        url='%ssite_with_js/event_not_found.html' % path,
        checker_runtime=scheduler_rt)
    self.event.save()
def process_item(self, item, spider):
    try:
        # This key must match the Article model's `source` field
        item["source"] = spider.ref_object
        checker_rt = SchedulerRuntime(runtime_type="C")
        checker_rt.save()
        item["checker_runtime"] = checker_rt
        item.save()
        spider.action_successful = True
        spider.log("Item saved.", log.INFO)
    except IntegrityError as e:
        spider.log(str(e), log.ERROR)
        raise DropItem("Missing attribute.")
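# For reference, the "source" key above has to line up with a field on the
# Article model behind the item. A minimal sketch of such a model, with
# field names inferred from this pipeline rather than a confirmed schema
# ('NewsWebsite' is a hypothetical reference model):
from django.db import models
from dynamic_scraper.models import SchedulerRuntime

class Article(models.Model):
    title = models.CharField(max_length=200)
    source = models.ForeignKey('NewsWebsite', on_delete=models.CASCADE)
    url = models.URLField()
    checker_runtime = models.ForeignKey(SchedulerRuntime,
        blank=True, null=True, on_delete=models.SET_NULL)

    def __str__(self):
        return self.title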
def extraSetUpJSONChecker(self):
    self.scraper.detail_page_content_type = 'J'
    self.scraper.checker_type = 'X'
    self.scraper.checker_x_path = u'event_not_found'
    self.scraper.checker_x_path_result = u'Event not found!'
    self.scraper.checker_ref_url = u'http://localhost:8010/static/site_with_json_content_type/event_not_found.json'
    self.scraper.save()
    scheduler_rt = SchedulerRuntime()
    scheduler_rt.save()
    self.event = Event(title='Event 1', event_website=self.event_website,
        description='Event 1 description',
        url='http://localhost:8010/static/site_with_json_content_type/event_not_found.json',
        checker_runtime=scheduler_rt)
    self.event.save()
def extraSetUpHTMLChecker(self):
    self.scraper.checker_type = 'X'
    self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
    self.scraper.checker_x_path_result = u'Event not found!'
    self.scraper.checker_ref_url = u'http://localhost:8010/static/site_with_json_content_type/event_not_found.html'
    self.scraper.save()
    scheduler_rt = SchedulerRuntime()
    scheduler_rt.save()
    self.event = Event(
        title='Event 1',
        event_website=self.event_website,
        description='Event 1 description',
        url='http://localhost:8010/static/site_with_json_content_type/event_not_found.html',
        checker_runtime=scheduler_rt)
    self.event.save()
def process_item(self, item, spider):
    if spider.conf['DO_ACTION']:  # Necessary since DDS v.0.9+
        try:
            print('HJ start saving')
            item['post_site'] = spider.ref_object
            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt
            print(item['foo'])
            if len(item['foo']) != 0:
                selector = Selector(text=item['foo'])
                options = selector.xpath('//option')
                option_items = []
                for option in options:
                    option_items.append(option.xpath("text()").extract())
                print(option_items)
                option_items.pop(0)
                item['foo'] = option_items
                # item['foo'] = json.dumps(option_items)
                print(item['foo'])
            item.save()
            spider.action_successful = True
            dds_id_str = str(item._dds_item_page) + '-' + str(item._dds_item_id)
            spider.struct_log("{cs}Item {id} saved to Django DB.{ce}".format(
                id=dds_id_str,
                cs=spider.bcolors['OK'],
                ce=spider.bcolors['ENDC']))
        except IntegrityError as e:
            print('HJ integrity error')
            spider.log(str(e), logging.ERROR)
            spider.log(str(item._errors), logging.ERROR)
            raise DropItem("Missing attribute.")
    else:
        print('HJ not do_action')
        if not item.is_valid():
            spider.log(str(item._errors), logging.ERROR)
            raise DropItem("Missing attribute.")
    return item
def extraSetUpJSONChecker(self):
    self.scraper.detail_page_content_type = 'J'
    self.scraper.checker_type = 'X'
    self.scraper.checker_x_path = u'event_not_found'
    self.scraper.checker_x_path_result = u'Event not found!'
    self.scraper.checker_ref_url = u'http://localhost:8010/static/site_with_json_content_type/event_not_found.json'
    self.scraper.save()
    scheduler_rt = SchedulerRuntime()
    scheduler_rt.save()
    self.event = Event(
        title='Event 1',
        event_website=self.event_website,
        description='Event 1 description',
        url='http://localhost:8010/static/site_with_json_content_type/event_not_found.json',
        checker_runtime=scheduler_rt)
    self.event.save()
def setUp(self):
    super(CheckerRunTest, self).setUp()
    self.scraper.checker_type = 'X'
    self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()'
    self.scraper.checker_x_path_result = u'Event was deleted!'
    self.scraper.checker_ref_url = u'http://localhost:8010/static/site_for_checker/event_not_found.html'
    self.scraper.save()
    scheduler_rt = SchedulerRuntime()
    scheduler_rt.save()
    self.event = Event(
        title='Event 1',
        event_website=self.event_website,
        description='Event 1 description',
        url='http://localhost:8010/static/site_for_checker/event1.html',
        checker_runtime=scheduler_rt)
    self.event.save()
def setUp(self): super(CheckerRunTest, self).setUp() self.scraper.checker_type = "X" self.scraper.checker_x_path = u'//div[@class="event_not_found"]/div/text()' self.scraper.checker_x_path_result = u"Event was deleted!" self.scraper.checker_ref_url = u"http://localhost:8010/static/site_for_checker/event_not_found.html" self.scraper.save() scheduler_rt = SchedulerRuntime() scheduler_rt.save() self.event = Event( title="Event 1", event_website=self.event_website, description="Event 1 description", url="http://localhost:8010/static/site_for_checker/event1.html", checker_runtime=scheduler_rt, ) self.event.save()
def process_item(self, item, spider):
    try:
        if isinstance(spider.ref_object, LoanScraper):
            item["loan_scraper"] = spider.ref_object
        elif isinstance(spider.ref_object, InsuranceWebsite):
            item["insurance_website"] = spider.ref_object
        else:
            item["news_website"] = spider.ref_object
        checker_rt = SchedulerRuntime(runtime_type="C")
        checker_rt.save()
        item["checker_runtime"] = checker_rt
        item.save()
        spider.action_successful = True
        spider.log("Item saved.", log.INFO)
    except IntegrityError as e:
        spider.log(str(e), log.ERROR)
        raise DropItem("Missing attribute.")
def extraSetUpHTMLChecker(self):
    self.checker = Checker()
    self.checker.scraped_obj_attr = self.soa_url
    self.checker.scraper = self.scraper
    self.checker.checker_type = "X"
    self.checker.checker_x_path = '//div[@class="event_not_found"]/div/text()'
    self.checker.checker_x_path_result = "Event not found!"
    self.checker.checker_ref_url = "http://localhost:8010/static/site_with_json_content_type/event_not_found.html"
    self.checker.save()
    scheduler_rt = SchedulerRuntime()
    scheduler_rt.save()
    self.event = Event(
        title="Event 1",
        event_website=self.event_website,
        description="Event 1 description",
        url="http://localhost:8010/static/site_with_json_content_type/event_not_found.html",
        checker_runtime=scheduler_rt,
    )
    self.event.save()
def process_item(self, item, spider):
    if spider.conf['DO_ACTION']:
        try:
            item['news_website'] = spider.ref_object
            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt
            item.save()
            spider.action_successful = True
            spider.struct_log("{cs}Item {id} saved to Django DB.{ce}".format(
                id=item._dds_id_str,
                cs=spider.bcolors['OK'],
                ce=spider.bcolors['ENDC']))
        except IntegrityError as e:
            spider.log(str(e), logging.ERROR)
            raise DropItem("Missing attribute.")
    return item
def process_item(self, item, spider):
    if spider.conf['DO_ACTION']:
        try:
            item['food_website'] = spider.ref_object
            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt
            item.save()
            spider.action_successful = True
            spider.struct_log("{cs}Item {id} saved to Django DB.{ce}".format(
                id=item._dds_id_str,
                cs=spider.bcolors['OK'],
                ce=spider.bcolors['ENDC']))
        except IntegrityError as e:
            spider.log(str(e), logging.ERROR)
            raise DropItem("Missing attribute.")
    return item
def process_item(self, item, spider):
    '''
    Processing step between the spider and the Django ORM/database.
    Based on the spider's configuration, it saves the scraped item to the
    database, following the rules and principles of the Scrapy pipeline
    model. If an integrity error occurs, the item is dropped with a
    "missing attribute" error.
    '''
    if spider.conf['DO_ACTION']:
        try:
            item['news_website'] = spider.ref_object
            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt
            item.save()
            spider.action_successful = True
            spider.log("Items saved in the DB", logging.INFO)
        except IntegrityError as e:
            spider.log(str(e), logging.ERROR)
            raise DropItem("missing attrib")
def process_item(self, item, spider):
    if spider.conf['DO_ACTION']:  # Necessary since DDS v.0.9+
        try:
            item['source'] = spider.ref_object
            checker_rt = SchedulerRuntime(runtime_type='C')
            checker_rt.save()
            item['checker_runtime'] = checker_rt
            if 'started_at' in item:
                item['started_at'] = item['started_at'] + '-01-01'
            item['industy'], created = Industy.objects.get_or_create(name=item['industy'])
            try:
                item_model = item_to_model(item)
            except TypeError:
                return item
            model, created = get_or_create(item_model)
            model.tags_raw = ''
            model.save()
            tags = ast.literal_eval(item['tags_raw'].encode('utf-8'))
            tag_objs = []
            for tag in tags:
                tag_name = tag['tag_name']
                tag_obj, tag_created = Tag.objects.get_or_create(name=tag_name)
                if tag:
                    tag_obj.save()
                tag_objs.append(tag_obj)
            if tag_objs:
                model.tags.add(*tag_objs)
                model.tags_raw = ','.join(t.name for t in tag_objs)
            # Save financing info
            invest_firm = ''
            investments = ast.literal_eval(item['investment_raw'].encode('utf-8'))
            for i in investments:
                # Note: 'invse_month' appears twice here; the third date
                # component is probably meant to come from a day field.
                invest_date = '-'.join(str(x) for x in [i['invse_year'], i['invse_month'], i['invse_month']])
                if i['invse_detail_money'] == 0:
                    invest_amount = i['invse_similar_money']['invse_similar_money_name'] + i['invse_currency']['invse_currency_name']
                else:
                    invest_amount = str(i['invse_detail_money']) + i['invse_currency']['invse_currency_name']
                if i['invse_rel_invst_name']:
                    invest_firm = i['invse_rel_invst_name']
                else:
                    invest_firm = ' '.join([org['invst_name'] for org in i['invse_orags_list']])
                invest_round = i['invse_round']['invse_round_name']
                investment, investment_created = Investment.objects.get_or_create(
                    invest_date=invest_date,
                    invest_firm=invest_firm,
                    invest_round=invest_round,
                    invest_amount=invest_amount,
                    invest_to=model)
                investment.save()
            model.investment_raw = invest_firm
            model.save()
            # Backup: save tags via BeautifulSoup
            # tags_soup = Soup(item['tags_raw'], 'lxml')
            # tags = []
            # for tag_soup in select(tags_soup, 'a span'):
            #     tag = tag_soup.string
            #     tag_obj, tag_created = Tag.objects.get_or_create(name=tag)
            #     if tag:
            #         tag_obj.save()
            #     tags.append(tag_obj)
            # if tags:
            #     model.tags.add(*tags)
            #     model.tags_raw = ','.join(t.name for t in tags)
            # Backup: save investments via BeautifulSoup
            # soup = Soup(item['investment_raw'], 'lxml')
            # invest_firm = ''
            # for investment_soup in soup.find_all('tr'):
            #     invest_date = select(investment_soup, 'span.date')[0].string.replace('.', '-')
            #     invest_amount = select(investment_soup, 'span.finades a')[0].string
            #     tds = select(investment_soup, 'td')
            #     if tds[3]:
            #         invest_firm = ','.join(i.string for i in tds[3].find_all('a'))
            #     invest_round = select(investment_soup, 'span.round a')[0].string
            #     investment, investment_created = Investment.objects.get_or_create(
            #         invest_date=invest_date,
            #         invest_firm=invest_firm,
            #         invest_round=invest_round,
            #         invest_amount=invest_amount,
            #         invest_to=model)
            #     investment.save()
            #     model.investment_raw = invest_firm
            if created:
                spider.log('==' + model.name + '== created.', log.INFO)
            else:
                spider.log('==' + model.name + '== updated.', log.INFO)
            spider.action_successful = True
        except IntegrityError as e:
            spider.log(str(e), logging.ERROR)
            spider.log(str(item._errors), logging.ERROR)
            raise DropItem("Missing attribute.")