def process_item(self, item, spider):
    """Drop any article item missing one of the required fields."""
    adapter = ItemAdapter(item)
    required = ('headline', 'text', 'author', 'url')
    if any(adapter.get(field) is None for field in required):
        raise DropItem(f"Missing required fields in {item}")
    return item
def process_item(self, item, spider):
    """Drop items that lack a product id or a points score."""
    adapter = ItemAdapter(item)
    for field in ('vinmonopoletProductId', 'points'):
        if not adapter.get(field):
            raise DropItem(f"Missing {field} in {item}")
    return item
def process_item(self, item, spider):
    """Normalize scraped fields and convert prices with ``self.vat_factor``.

    Items without a ``upc`` are dropped.  Every textual field is passed
    through ``clean_data``; when a price is present, the four monetary
    fields are scaled by the VAT factor (converting to soles).

    Fix: the twelve duplicated ``clean_data`` assignments and four price
    conversions are collapsed into field loops (behavior unchanged).
    """
    adapter = ItemAdapter(item)
    if not adapter.get('upc'):
        raise DropItem("Missing upc in %s" % item)
    # 'description' is intentionally not cleaned (it was commented out
    # in the original code).
    text_fields = (
        'page_url', 'title', 'price', 'image_url', 'rating', 'upc',
        'product_type', 'price_without_tax', 'price_with_tax', 'tax',
        'availability', 'number_of_reviews',
    )
    for field in text_fields:
        adapter[field] = clean_data(adapter[field])
    if adapter.get('price'):
        # convirtiendo a soles: scale every monetary field.
        for field in ('price', 'price_without_tax', 'price_with_tax', 'tax'):
            adapter[field] = float(adapter[field]) * self.vat_factor
    return item
def process_item(self, item, spider):
    """Fetch Chinese subtitles for the video and persist one row to Postgres.

    Items whose video has no zh transcript are dropped.  Duplicate rows
    (unique-key violation) are logged and rolled back, not re-raised, so
    re-crawls of known videos are harmless.
    """
    video = ItemAdapter(item)
    try:
        # yts is presumably the youtube_transcript_api client — TODO confirm.
        subs = yts.get_transcript(
            video.get("video_id"), languages=["zh-CN", "zh-Hans"]
        )
    except (NoTranscriptFound, TranscriptsDisabled):
        raise DropItem(f"Video has no zh subs")
    with self.conn.cursor() as cur:
        try:
            # The table name cannot be a bound parameter, hence the f-string.
            # NOTE(review): assumes self.table_name is trusted configuration,
            # never user input — confirm.
            cur.execute(
                f"INSERT INTO {self.table_name} VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                (
                    video.get("video_id"),
                    video.get("url"),
                    video.get("embed_url"),
                    video.get("name"),
                    video.get("description"),
                    video.get("thumbnail_url"),
                    video.get("channel"),
                    video.get("date_published"),
                    video.get("genre"),
                    # Subtitles stored as a JSON string in one column.
                    json.dumps(subs),
                ),
            )
            self.conn.commit()
        except psycopg2.errors.UniqueViolation:
            logger.warning("duplicate video")
            self.conn.rollback()
    return item
def process_item(self, item, spider):
    """Buffer comment items per topic; flush a topic's buffer to CSV when
    its TopicCompletedItem sentinel arrives.
    """
    # Only handle CommentItems
    if not isinstance(item, BaseCommentItem):
        return item
    adapter = ItemAdapter(item)
    topic_id = adapter.get('topic_id', 0)
    if isinstance(item, TopicCompletedItem):
        # Sentinel: topic finished — write everything buffered for it.
        self.logger.debug(f'exporting CommentItems (id: {topic_id})')
        path = prepare_path(base_dir=self.base_dir_path,
                            dirname_template=self.dirname_tmplt,
                            filename_template=self.filename_tmplt,
                            item=dict(topic_id=topic_id))
        with path.open('w') as wh:
            writer = csv.writer(wh)
            for row in self.comment_item_buffers[topic_id]:
                writer.writerow(row)
        self.logger.debug(f'exported CommentItems (id: {topic_id})')
    else:
        # Remove excess spaces if comment is not Ascii Art.
        # NOTE(review): the guard tests ``not body`` — whitespace cleanup
        # only when the body is empty looks inverted; confirm intent.
        if not adapter.get('is_aa') and not adapter.get('body'):
            adapter['is_aa'] = False
            remove_excess_spaces(item, 'body')
        self.comment_item_buffers[topic_id].store(item)
    return item
def process_item(self, item, spider):
    """Upsert the scraped product into the DB for the spider's site.

    The spider name encodes the site as ``<name>_<gender>_<type>``; the
    matching Site row must already exist.  Existing products (matched by
    site + product_link) are updated in place; duplicates are deleted and
    recreated; unknown sites are logged and the item passed through.
    """
    adapter = ItemAdapter(item)
    site_name_gender_type = spider.name
    site_keys = site_name_gender_type.split('_')
    site_name = site_keys[0]
    site_gender = site_keys[1]
    site_type = site_keys[2]
    try:
        site = Site.objects.get(name=site_name, gender=site_gender, type=site_type)
        title = adapter.get('title')
        price = adapter.get('price')
        sale_price = adapter.get('sale_price')
        images = adapter.get('images')
        image_filename = None
        hq_image_filename = None
        # One downloaded image: standard only; two: standard + high-quality.
        if len(images) == 1:
            image_filename = images[0].get('path')
            hq_image_filename = None
        elif len(images) == 2:
            image_filename = images[0].get('path')
            hq_image_filename = images[1].get('path')
        product_link = adapter.get('product_link')
        try:
            # Known product: refresh the mutable fields.
            product = Product.objects.get(site=site, product_link=product_link)
            product.price = price
            product.sale_price = sale_price
            product.image_filename = image_filename
            product.hq_image_filename = hq_image_filename
            product.product_link = product_link
            product.save()
            print("Product: {} updated.".format(title))
        except Product.DoesNotExist:
            # First sighting: create it.
            Product.objects.create(title=title, price=price,
                                   sale_price=sale_price,
                                   image_filename=image_filename,
                                   hq_image_filename=hq_image_filename,
                                   product_link=product_link, site=site)
            print("Product: {} added.".format(title))
        except Product.MultipleObjectsReturned:
            # Corrupt state: wipe the duplicates and recreate a single row.
            products = Product.objects.filter(site=site,
                                              product_link=product_link)
            products.delete()
            print("Multiple object returned and deleted.")
            Product.objects.create(title=title, price=price,
                                   sale_price=sale_price,
                                   image_filename=image_filename,
                                   hq_image_filename=hq_image_filename,
                                   product_link=product_link, site=site)
            print("Product: {} added.".format(title))
    except Site.DoesNotExist:
        print("{} does not exist".format(site_name_gender_type))
    return item
def process_item(self, item, spider):
    """Apply the VAT factor when the item's price excludes VAT."""
    adapter = ItemAdapter(item)
    price = adapter.get('price')
    if not price:
        raise DropItem(f'Missing price in {item}')
    if adapter.get('price_excludes_vat'):
        adapter['price'] = price * self.vtr_Factor
    return item
def process_item(self, item, spider):
    """Keep only items that have a price within budget and are in stock.

    Raises:
        DropItem: when the price is missing, exceeds the maximum, or the
            item is unavailable.  Fix: the original only raised inside
            the inner branch, so a missing price fell through and
            returned ``None`` silently.
    """
    adapter = ItemAdapter(item)
    price = adapter.get('price')
    if price and is_cheaper_then_max(price) and is_available(
            adapter.get('availability')):
        return item
    raise DropItem("Missing price or item is not available")
def process_item(self, item, spider):
    """Convert the raw ``area`` string into a numeric square value.

    Raises:
        DropItem: when ``area`` is missing.  Fix: the original *returned*
            the DropItem instance, which passed the exception object down
            the pipeline instead of dropping the item.
    """
    itemAdapter = ItemAdapter(item=item)
    if itemAdapter.get("area"):
        area = str(itemAdapter.get("area"))
        itemAdapter.update({"area": self.getSquare(area)})
        return itemAdapter.item
    raise DropItem(
        f"Missing field in real estate at {itemAdapter.get('source')}")
def process_item(self, item, spider):
    """Write qualifying items to the JSON-lines file and pass them on.

    Raises:
        DropItem: when the price is missing, too high, or the item is
            unavailable.  Fix: a missing price previously fell through
            and returned ``None``; also reuse the existing adapter
            instead of wrapping the item twice.
    """
    adapter = ItemAdapter(item)
    price = adapter.get('price')
    if price and is_cheaper_then_max(price) and is_available(
            adapter.get('availability')):
        line = json.dumps(adapter.asdict()) + "\n"
        self.file.write(line)
        return item
    raise DropItem("Missing price or item is not available")
def process_item(self, item, spider):
    """Recompute ``value`` as a price derived from value and area.

    Raises:
        DropItem: when ``value`` is missing.  Fix: the original
            *returned* the DropItem instead of raising it, so the item
            was never actually dropped.
    """
    itemAdapter = ItemAdapter(item=item)
    if itemAdapter.get("value"):
        value = str(itemAdapter.get("value"))
        area = str(itemAdapter.get("area"))
        itemAdapter.update({"value": self.getPrice(value, area)})
        return itemAdapter.item
    raise DropItem(
        f"Missing real estate in paper at {itemAdapter.get('source')}")
def process_item(self, item, spider):
    """Validate that every required real-estate field is present.

    Raises:
        DropItem: when any field is missing.  Fixes: the original
            *returned* the DropItem instead of raising it, and the
            twelve-term boolean expression is replaced by a field loop.
    """
    itemAdapter = ItemAdapter(item=item)
    required = ('title', 'value', 'area', 'address', 'ward', 'district',
                'province', 'type', 'description', 'sellerName', 'time',
                'image')
    if all(itemAdapter.get(field) for field in required):
        return item
    raise DropItem(
        f"Missing field in real estate at {itemAdapter.get('source')}")
def process_item(self, item, spider):
    """Strip the "BY:" prefix from the author field; drop untitled items."""
    adapter = ItemAdapter(item)
    if not adapter.get("title"):
        raise DropItem(f"Missing title in {item}")
    author = adapter.get("author")
    if author:
        # self.re is the regex module/object stored on the pipeline.
        match = self.re.match(r"BY:\u00a0 ([^>]+)", author)
        if match:
            adapter["author"] = match.group(1)
        else:
            adapter["author"] = author
    return item
def process_item(self, item, spider):
    """Insert the business row unless its ``business_id`` is already stored."""
    adapter = ItemAdapter(item)
    # Existence probe: scalar() is None when no row matches the id.
    if self.session.query(Business.id).filter_by(business_id=adapter.get('business_id')).scalar() is None:
        # address and schedule are serialized to JSON strings for storage.
        business = Business(title=adapter.get('title'),
                            direct_url=adapter.get('direct_url'),
                            business_id=adapter.get('business_id'),
                            main_img_url=adapter.get('main_img_url'),
                            phone=adapter.get('phone'),
                            email=adapter.get('email'),
                            address=json.dumps(adapter.get('address')),
                            average_rating=adapter.get('average_rating'),
                            review_count=adapter.get('review_count'),
                            categories=adapter.get('categories'),
                            site=adapter.get('site'),
                            schedule=json.dumps(adapter.get('schedule')),
                            description=adapter.get('description'),
                            amenities=adapter.get('amenities'))
        self.session.add(business)
        self.session.flush()
        self.session.commit()
        # NOTE(review): closing the session after the first insert means
        # later items reuse a closed session — confirm this is intended
        # (a sessionmaker may transparently reopen it).
        self.session.close()
    return item
def process_item(self, item, spider):
    """Export undamaged cars with plausible mileage; drop the rest."""
    adapter = ItemAdapter(item)
    # Don't save item if car was damaged.
    if adapter.get('damaged') is True:
        raise DropItem(f'Dropping - {item} - damaged was True!')
    # Don't save item if car has more that 1 000 000 KMs of mileage.
    mileage = adapter.get('mileage')
    if mileage and int(mileage) > 1000000:
        raise DropItem(
            f'Dropping - {item} - mileage is over 1 000 000 KMs!')
    self.exporter.export_item(item)
    self.file.write(b'\n')
    return item
def process_item(self, item, spider):
    """Normalize the relative Vietnamese ``time`` field to dd/mm/YYYY.

    Raises:
        DropItem: when ``time`` is missing.  Fix: the original *returned*
            the DropItem instead of raising it, so the item was never
            actually dropped.
    """
    today = date.today().strftime("%d/%m/%Y")
    yesterday = (date.today() - timedelta(days=1)).strftime("%d/%m/%Y")
    itemAdapter = ItemAdapter(item=item)
    if itemAdapter.get("time"):
        time = str(itemAdapter.get("time"))
        # After space removal: "ngàyđăng:" ("posted:") is stripped,
        # "hômnay" (today) / "hômqua" (yesterday) become real dates.
        time = time.replace(" ", "").replace("ngàyđăng:", "").replace(
            "hômnay", today).replace("hômqua", yesterday)
        itemAdapter.update({"time": time})
        return itemAdapter.item
    raise DropItem(
        f"Missing field in real estate at {itemAdapter.get('source')}")
def process_item(self, item, spider):
    """Merge a validated time-series point into the DB and advance the
    per-spider last-date marker."""
    adapter = ItemAdapter(item)
    is_valid = (isinstance(item, TimeseriesItem)
                and adapter.get('date') and adapter.get('value'))
    if not is_valid:
        raise DropItem(f"Invalid data {item}")
    date = adapter['date']
    value = adapter['value']
    obj = models.MacroData(id=spider.name, date=date, value=value)
    self.ldc.update(date)
    self.sess.merge(obj)
    self.sess.commit()
    status.update_last_date(spider.name, self.ldc.get())
    return item
def process_item(self, item, spider):
    """Clean text fields, parse the price, and write the row to CSV.

    Raises:
        DropItem: when a required field is missing or malformed.
            Fix: the original built ``DropItem(...)`` without ``raise``
            (a no-op), so broken items still reached the CSV writer and
            were written half-processed.
    """
    adapter = ItemAdapter(item)
    try:
        adapter["brand"] = self.clean_text(adapter.get("brand"))
        adapter["name"] = self.clean_text(adapter.get("name"))
        # Price arrives as e.g. "Rs 1,234" — strip commas, take the number.
        adapter["price"] = int(
            float(adapter.get("price").replace(",", "").split()[1]))
        # Drop the query string from the image URL.
        adapter["image"] = adapter.get("image").split("?")[0]
    except Exception as exc:
        raise DropItem(f"Missing essential properties in {item}") from exc
    self.csv.writerow(adapter.asdict())
    return item
def process_item(self, item, spider):
    """Write the chapter content to ``<tag or 'novels'>/<title>.txt``.

    Fixes: the file handle was opened but never closed (leak), and the
    filename was built from ``title`` *before* checking it exists, so a
    missing title raised TypeError instead of being skipped.
    """
    adapter = ItemAdapter(item)
    if adapter.get('title'):
        file_name = adapter.get('title') + '.txt'
        dir_name = adapter.get('tag')
        # if not asign any specific tag, will create dir based on novels/*.txt
        if not dir_name:
            dir_name = 'novels'
        os.makedirs(dir_name, exist_ok=True)
        new_path = os.path.join(dir_name, file_name)
        # self.file kept as an attribute for backward compatibility, but
        # now explicitly closed after the write.
        self.file = open(new_path, 'w', encoding='utf-8')
        try:
            self.file.write(adapter.get('content') + '\n')
        finally:
            self.file.close()
    return item
def process_item(self, item, spider):
    """Normalize title, excerpt and image: strip whitespace noise and
    default missing values to the empty string.

    Fix: a missing ``title`` crashed with TypeError on ``.replace`` while
    ``excerpt`` and ``image`` were already guarded; title now gets the
    same empty-string default.
    """
    adapter = ItemAdapter(item)
    title = adapter.get('title') or ''
    adapter['title'] = title.replace('\n', '')
    excerpt = adapter.get('excerpt') or ''
    adapter['excerpt'] = excerpt.replace('\n', '').replace('\t', '').replace(' ', '')
    image = adapter.get('image')
    adapter['image'] = image if image is not None else ''
    return item
def process_item(self, item, spider):
    """Tokenize the title with jieba into stopword-free tags."""
    adapter = ItemAdapter(item)
    title = adapter.get('title')
    if adapter.get('source') == 'baha':
        # Titles from 'baha' are converted to simplified Chinese (zh-cn)
        # before tokenizing.
        title = convert(title, 'zh-cn')
    adapter['tags'] = [token for token in jieba.cut(title)
                       if token not in stopwords]
    return item
def get_template_kwargs(item: Any) -> Dict[str, Any]:
    """Build forum/topic/comment id kwargs for path templates.

    A missing topic id is derived from the comment id, and a missing
    forum id from the topic id, via ``get_parent_id``.
    """
    adapter = ItemAdapter(item)
    comment_id = adapter.get('comment_id')
    topic_id = adapter.get('topic_id') or get_parent_id(comment_id)
    forum_id = adapter.get('forum_id') or get_parent_id(topic_id)
    return {
        'forum_id': forum_id,
        'topic_id': topic_id,
        'comment_id': comment_id,
    }
def process_item(self, item, spider):
    """De-duplicate items by ``ads_job_code``; first occurrence wins.
    Items without a code pass through unchecked."""
    adapter = ItemAdapter(item)
    code = adapter.get('ads_job_code')
    if code:
        if code in self.ids_seen:
            raise DropItem(f"Duplicate item found: {item!r}")
        self.ids_seen.add(code)
    return item
def process_item(self, item, spider):
    """Keep only items written by one of the tracked authors."""
    adapter = ItemAdapter(item)
    if adapter.get("author") not in self.authors:
        raise DropItem("Not from wanted author")
    return item
def process_item(self, item, spider):
    """Propagate a product's HTTP status: delete on 404, else update rows."""
    adapter = ItemAdapter(item)
    product_link = adapter.get('product_link')
    status = adapter.get('status')
    try:
        products = Product.objects.filter(product_link=product_link)
        if status == 404:
            # Page is gone: remove every matching product.
            products.delete()
        else:
            for product in products:
                product.status = status
                product.save()
    except Product.DoesNotExist:
        # NOTE(review): QuerySet.filter() never raises DoesNotExist (it
        # returns an empty queryset), so this handler appears unreachable
        # — confirm whether .get() was originally intended.
        logger = logging.getLogger(__name__)
        logger.warning("Product doesn't exist: {}".format(product_link))
    return item
def process_item(self, item, spider):
    """Drop furnished properties; pass everything else through."""
    adapter = ItemAdapter(item)
    is_furnished = adapter.get('furnished', False)
    if not is_furnished:
        return item
    raise DropItem(f"Furnished property: {item!r}")
def process_item(self, item, spider):
    """Refine the listing type, falling back to an ML classifier.

    When the rule-based ``get_refine_type`` cannot decide ("UNKNOW"),
    the concatenated title + description is classified instead.

    Raises:
        DropItem: when title or description is missing.  Fix: the
            original *returned* the DropItem instead of raising it, so
            the item was never actually dropped.
    """
    itemAdapter = ItemAdapter(item=item)
    if itemAdapter.get('title') and itemAdapter.get('description'):
        type = self.get_refine_type(str(itemAdapter.get("type")))
        if type == "UNKNOW":
            text = [
                str(itemAdapter.get('title')) +
                str(itemAdapter.get('description'))
            ]
            prediction = self.classifier.predict(text)
            type = prediction[0]
        itemAdapter.update({'type': type})
        return itemAdapter.item
    raise DropItem(
        f"Missing field in paper at {itemAdapter.get('source')}")
def _exporter_for_item(self, item):
    """Return (creating if needed) a JsonItemExporter keyed by article title.

    A new title closes all previous exporters/files and opens a fresh
    JSON file under the articles_dump directory.
    """
    adapter = ItemAdapter(item)
    title = adapter.get('title')
    title = str(title)
    pubtime = adapter.get('pubtime')
    #pubtime = str(pubtime)
    # pubtime is assumed to be a datetime instance — TODO confirm.
    pubtime_m = datetime.strftime(pubtime, "%Y_%m_%d")
    source = adapter.get('source')
    source = str(source)
    url = adapter.get("url")
    url = str(url)
    if title not in self.title_to_exporter:
        if source == "artagenda":
            # artagenda titles are unreliable; use the URL slug instead.
            url = url.split("/")
            #urlname = url[-1]_{url[-2]}
            filename = f"{pubtime_m}_{url[-1]}_{source}"
        else:
            # Cap over-long titles at 55 chars to keep filenames sane.
            if len(title)<55:
                filename = f"{pubtime_m}_{title}_{source}"
            else:
                filename = f"{pubtime_m}_{title[:55]}_{source}"
        filename = filename_clean(filename)
        # NOTE(review): the next line discards the filename computed and
        # cleaned above, so every file gets the same name and exporters
        # overwrite each other — this looks like a mangled
        # f"{filename}.json"; confirm and restore.
        filename = f"(unknown).json"
        #linux
        #filepath = os.path.join(os.path.expanduser('~'),'Desktop/scrapy/articles_dump', filename)
        #win
        filepath = os.path.join(os.path.expanduser('~'),
                                'PycharmProjects/art_scraper/artscraper/articles_dump',
                                filename )
        print("Saved: " + filename)
        self.close_exporters() #shutsdowns old
        self.close_files() #ditto
        f = open(filepath, 'wb' ) #open statement
        self.files.append(f) #adds file to files bin
        exporter = JsonItemExporter(f)
        exporter.export_empty_fields=True
        exporter.start_exporting()
        self.title_to_exporter[title] = exporter
    return self.title_to_exporter[title]
def process_item(self, item, spider):
    """Pass through only items carrying a product price."""
    adapter = ItemAdapter(item)
    print('In Process Item', adapter.asdict())
    if not adapter.get('product_price'):
        raise DropItem("Missing price in %s" % item)
    return item
def process_item(self, item, spider):
    """Extract plain text from the scraped HTML and save it to ``path``."""
    adapter = ItemAdapter(item)
    content = adapter.get('content')
    url = adapter.get('url')
    if not content:
        raise DropItem(f"Missing content in {url}")
    path = adapter.get('path')
    self.log(f"Processing Item {adapter.get('url')}", logging.INFO)
    text = BeautifulSoup(content, 'html.parser').get_text()
    # Ensure the destination directory exists before writing.
    Path(path).parent.absolute().mkdir(parents=True, exist_ok=True)
    with open(path, 'w', encoding="utf-8") as f:
        f.write(text)
    self.log(f'Saved file {path}', logging.INFO)
    return item