def process_item(self, item: Item, spider: scrapy.Spider):
    if not isinstance(item, Item):
        spider.log("Invalid item type")
        return

    filename = None
    basepath = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..', 'items'))
    if isinstance(item, Tune):
        basepath = os.path.join(basepath, item.artistId)
    if not os.path.isdir(basepath):
        Path(basepath).mkdir(parents=True, exist_ok=True)

    if isinstance(item, Tune):
        nitem = item
        # Remove filesystem-unfriendly characters
        nitem.artist = validatechars(nitem.artist)
        nitem.title = validatechars(nitem.title)
        filename = f"{nitem.artist} - {nitem.title}.{nitem.format}"

    if filename is None:
        raise ValueError("No filename")

    # Save to a temporary file
    tmpf = NamedTemporaryFile("wb", prefix="amp-", suffix=".bin", delete=False)
    with tmpf as f:
        f.write(item.data)
        f.flush()
        spider.logger.info(f"saved as {f.name}")

    # Rename and move the temporary file to its final location
    newpath = move(tmpf.name, os.path.join(basepath, filename))
    spider.logger.info(f"renamed {tmpf.name} to {newpath}")

def process_item(self, item: Item, spider: scrapy.Spider):
    if not isinstance(item, Item):
        spider.log("invalid item type {0}".format(type(item)))
        return
    if isinstance(item, Memory):
        self.items.append(item)

def process_request(self, request: Request, spider: Spider):
    """
    References:
    https://www.jianshu.com/p/d64b13a2322b
    https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#writing-your-own-downloader-middleware

    :param request:
    :param spider:
    :return:
    """
    try:
        spider.log('Chrome driver begin...')
        self.driver.implicitly_wait(3)
        self.driver.set_script_timeout(5)
        self.driver.set_page_load_timeout(5)
        self.driver.get(request.url)  # Fetch the page content
        # Return the rendered page source as the HTML response
        return HtmlResponse(url=request.url,
                            body=self.driver.page_source,
                            request=request,
                            encoding='utf-8',
                            status=200)
    except TimeoutException:
        self.sqlite.update_retry_time_field(self.proxy_ip)
        retry_times: int = self.sqlite.fetch_value_of_retry_time(self.proxy_ip)
        # Mark this proxy IP as invalid once it has timed out twice
        if retry_times >= 2:
            self.sqlite.update_is_ok_field(self.proxy_ip)
        spider.log('Chrome driver end...')
        return HtmlResponse(url=request.url,
                            request=request,
                            encoding='utf-8',
                            status=500)

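# For a downloader middleware like the one above to run, it has to be enabled
# in the project settings. A minimal sketch, assuming the class lives at
# myproject.middlewares.ChromeDownloaderMiddleware (the dotted path and the
# priority value are illustrative assumptions, not from the source):
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ChromeDownloaderMiddleware': 543,
}
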
def process_item(self, item: dict, spider: Spider):
    if isinstance(spider, RssFeedBaseSpider):
        _item = item.copy()
        # Normalize the publication date to UTC before storing it
        pub_date_tz = parse_date(_item['pub_date'])
        _item['pub_date'] = pub_date_tz.astimezone(tz=UTC)
        try:
            with self.db.transaction():
                RssNews.create(**_item)
        except DatabaseError as err:
            spider.log('DatabaseError raised: {0}'.format(err), level=ERROR)
    return item

def process_item(self, item: Item, spider: scrapy.Spider):
    if not isinstance(item, Item):
        spider.log("Invalid item type")
        return

    filename = None
    basepath = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..', 'items'))

    if isinstance(item, Tune):
        if item.data is None:
            spider.logger.info("Data is none")
            return
        if len(item.data) == 0:
            spider.logger.info("No data")
            return
        nitem: Tune = item
        # Remove filesystem-unfriendly characters
        nitem.arranger = validatechars(nitem.arranger)
        nitem.title = validatechars(nitem.title)
        nitem.composer = validatechars(nitem.composer)
        filename = (f"{nitem.arranger} - {nitem.title} [{nitem.composer}] "
                    f"{nitem.added.strftime('%Y-%m-%d')}.mp3")

    if filename is None:
        raise ValueError("No filename")

    if not os.path.isdir(basepath):
        Path(basepath).mkdir(parents=True, exist_ok=True)

    # Save to a temporary file
    tmpf = NamedTemporaryFile("wb", prefix="amigaremix-", suffix=".mp3",
                              delete=False)
    with tmpf as f:
        f.write(item.data)
        f.flush()
        spider.logger.info(f"saved as {f.name}")

    # Rename and move the temporary file to its final location
    newpath = move(tmpf.name, os.path.join(basepath, filename))
    spider.logger.info(f"renamed {tmpf.name} to {newpath}")

def open(self, spider: Spider) -> None:
    self.spider = spider
    try:
        self.queue = load_object(self.queue_cls)(
            server=self.server,
            spider=spider,
            key=self.queue_key % {'spider': spider.name},
            serializer=self.serializer,
        )
    except TypeError as e:
        # ValueError does not lazy-format its arguments the way logging
        # calls do, so interpolate the message explicitly.
        raise ValueError("Failed to instantiate queue class '%s': %s"
                         % (self.queue_cls, e))

    self.df = load_object(self.dupefilter_cls).from_spider(spider)

    if self.flush_on_start:
        self.flush()
    # notice if there are requests already in the queue to resume the crawl
    if len(self.queue):
        spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

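# The open() above resolves its queue and dupefilter classes at runtime via
# load_object(), in the style of the scrapy-redis scheduler. A minimal
# settings sketch under that assumption (the mapping of setting names to the
# attributes used above is an assumption, not confirmed by the source):
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'   # -> self.queue_cls
SCHEDULER_FLUSH_ON_START = False                             # -> self.flush_on_start
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'   # -> self.dupefilter_cls
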
def open_spider(self, spider: scrapy.Spider):
    """Creates and initializes the output folders that store the comment
    items.
    """
    try:
        os.mkdir('data')
        spider.log(' Directory data/ created', level=logging.INFO)
    except FileExistsError:
        spider.log(' Directory data/ already exists', level=logging.INFO)
    os.mkdir('data/{}-{}'.format(spider.name, self.key))
    spider.log(' Directory data/{}-{} created'.format(spider.name, self.key),
               level=logging.INFO)
    filename = 'data/{0}-{1}/part-{2:05d}.jl'.format(
        spider.name, self.key, self.file_index)
    self.file = open(filename, 'a')

def process_item(self, item: scrapy.Item, spider: scrapy.Spider) -> scrapy.Item:
    # Log every field of the item, then pass it through unchanged
    for k, v in item.items():
        spider.log('{}: {}'.format(k, v), logging.INFO)
    return item

def process_item(self, item: ScraperItem, spider: Spider) -> ScraperItem:
    # Pretty-print the whole item for debugging, then pass it through
    spider.log(pformat(dict(item), indent=4))
    return item

def process_item(self, item, spider):
    self.db[self.collection_name].insert_one(dict(item))
    # Call log() on the spider instance rather than unbound via the class
    spider.log('inserted %s' % item['_id'], logging.DEBUG)
    return item

def process_item(self, item: Item, spider: Spider):
    """
    Overrides the framework hook.
    :param item: passed in automatically by the framework
    :param spider: passed in automatically by the framework
    :return:
    """
    if item.__class__ == ZhugefangDetailUrlsItem:
        insert_sql: str = """
            INSERT INTO python_detail_urls
                (building_url, city_id, building_from, is_new, commit_time)
            VALUES ('{building_url}', '{city_id}', '{building_from}',
                    '{is_new}', '{commit_time}')
        """.format(building_url=item['comm_url'],
                   city_id=item['city_id'],
                   building_from=item['comm_from'],
                   is_new=item['is_new'],
                   commit_time=item['commit_time'])
        try:
            spider.log('SQL Prepared: ' + insert_sql)
            self.cursor.execute(insert_sql)
        except MySQLError:
            import traceback
            traceback.print_exc()
            self.cursor.close()
            self.conn.close()
            spider.log('The DB connection has been closed!')
        else:
            self.conn.commit()
            return item
    elif item.__class__ == ZhugefangOldItem:
        insert_sql: str = """
            INSERT INTO python_project
                (pj_name, addr, avg_price, const_era, property_desc,
                 plot_ratio, greening, buildings_amount, houses_amount,
                 property_price, property, develop, const_area,
                 building_type, city_id, building_from, building_url,
                 is_new, commit_time)
            VALUES ('{pj_name}', '{addr}', '{avg_price}', '{const_era}',
                    '{property_desc}', '{plot_ratio}', '{greening}',
                    '{buildings_amount}', '{houses_amount}',
                    '{property_price}', '{property}', '{develop}',
                    '{const_area}', '{building_type}', '{city_id}',
                    '{building_from}', '{building_url}', '{is_new}',
                    '{commit_time}')
        """.format(pj_name=item['comm_name'],
                   addr=item['comm_addr'],
                   avg_price=item['comm_price'],
                   const_era=item['const_era'],
                   property_desc=item['property_desc'],
                   plot_ratio=item['plot_ratio'],
                   greening=item['greening_ratio'],
                   buildings_amount=item['buildings_amount'],
                   houses_amount=item['houses_amount'],
                   property_price=item['property_fee'],
                   property=item['property_comp'],
                   develop=item['dev_comp'],
                   const_area=item['const_area'],
                   building_type=item['const_type'],
                   city_id=item['city_id'],
                   building_from=item['comm_from'],
                   building_url=item['comm_url'],
                   is_new=item['is_new'],
                   commit_time=item['commit_time'])
        try:
            spider.log('SQL Prepared: ' + insert_sql)
            self.cursor.execute(insert_sql)
        except MySQLError:
            import traceback
            traceback.print_exc()
            self.cursor.close()
            self.conn.close()
            spider.log('The DB connection has been closed!')
        else:
            self.conn.commit()
            return item

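# The pipeline above interpolates item values straight into the SQL string,
# which breaks on embedded quotes and is open to SQL injection. A minimal
# sketch of the first branch's insert using a parameterized query instead
# (table and column names are copied from above; the rewrite itself is an
# assumption, not the source's code):
insert_sql = """
    INSERT INTO python_detail_urls
        (building_url, city_id, building_from, is_new, commit_time)
    VALUES (%s, %s, %s, %s, %s)
"""
# The DB driver escapes each value, so quotes in the data are safe.
self.cursor.execute(insert_sql, (item['comm_url'], item['city_id'],
                                 item['comm_from'], item['is_new'],
                                 item['commit_time']))
self.conn.commit()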