def process_item(self, item: Item, spider: Spider) -> Item:
    """Save the whole html page to a text file.

    Parameters
    ----------
    item: Item.
        The scraped item, ie the full web page + meta data.
    spider: Spider.
        The spider, one per document type.

    Returns
    -------
    out: Item.
        The input item, unscathed.
    """
    # 'provider' / 'text' arrive as lists of fragments; join into one string.
    provider = ''.join(item.get('provider', ['none']))
    text = ''.join(item.get('text', ['']))
    # One sub-directory per spider; fall back to 'default' when the
    # spider has no name attribute.
    file_path = os.path.join(
        self._path,
        getattr(spider, 'name', 'default'),
        provider + '.html')
    # Explicit encoding: scraped pages are not guaranteed to round-trip
    # through the platform default codec (e.g. cp1252 on Windows would
    # raise UnicodeEncodeError on non-Latin content).
    with open(file_path, 'w', encoding='utf-8') as html_file:
        html_file.write(text)
    return item
def process_item(self, item: Item, spider):
    """Normalise a scraped item and persist it to MongoDB.

    Comments go into ``self.collect2`` as plain inserts; reviews are
    upserted into ``self.collect`` keyed on their (truncated) title.
    Items of any other type are passed through untouched.

    Parameters
    ----------
    item: Item.
        The scraped item (CommentItem or ReviewItem expected).
    spider:
        The spider that produced the item (unused here).

    Returns
    -------
    out: Item.
        The (possibly normalised) input item.
    """
    # Keep only the first whitespace-separated token of the title.
    item['title'] = item['title'].split()[0]

    if isinstance(item, CommentItem):
        self.collect2.insert_one(dict(item))
        return item

    if isinstance(item, ReviewItem):
        raw_num = item.get('num', None)
        if raw_num:
            # e.g. "No.123" -> 123: the part after the first dot.
            item['num'] = int(raw_num.split('.')[1])
        # Strip all internal whitespace from the actor string.
        item['actor'] = ''.join(item['actor'].split())
        # Upsert so re-scraping the same title updates in place.
        self.collect.find_one_and_update(
            {'title': item['title']},
            {'$set': dict(item)},
            upsert=True,
        )
    return item