def process_item(self, item, spider):
    """
    Pluck a single value from the item's data, if the spider is plucking.

    If ``spider.pluck`` is falsy, the item is returned unchanged. Otherwise, a ``PluckedItem`` with a ``value``
    key is returned. The value is determined as follows:

    - If ``spider.package_pointer`` is set, the pointer is resolved against the result of ``_get_package(item)``.
      If ``_get_package`` raises ``NotImplementedError``, the value is ``error: <exception message>``.
    - Otherwise (``spider.release_pointer``), for release-like data types, the value is the maximum of the
      pointer resolved against each entry returned by ``_get_releases(item)``. For record-like data types, only
      the first entry returned by ``_get_records(item)`` is used: the maximum over its non-empty ``releases``,
      or else the pointer resolved against its ``compiledRelease``.
    - If none of the above yields a value, the value is ``None``.

    If the value is truthy and ``spider.truncate`` is set, the value is sliced to that length.

    :param item: the item to process; its ``data_type`` key is read when using the release pointer
    :param spider: the spider, providing ``pluck``, ``package_pointer``, ``release_pointer`` and ``truncate``
    :returns: ``item`` itself if not plucking, otherwise a ``PluckedItem``
    """
    if not spider.pluck:
        return item

    value = None
    if spider.package_pointer:
        try:
            package = _get_package(item)
        except NotImplementedError as e:
            value = f'error: {e}'
        else:
            value = _resolve_pointer(package, spider.package_pointer)
    else:  # spider.release_pointer
        if item['data_type'] in ('release_package', 'release_package_list', 'release_package_list_in_results',
                                 'release_list', 'release'):
            data = _get_releases(item)
            if data:
                value = max(_resolve_pointer(r, spider.release_pointer) for r in data)
        elif item['data_type'] in ('record_package', 'record_package_list', 'record_package_list_in_results',
                                   'record'):
            data = _get_records(item)
            if data:
                # This assumes that the first record in the record package has the desired value.
                data = data[0]
                # Require a non-empty list, like the `if data:` guard above: max() raises ValueError on an
                # empty iterable. A record with empty `releases` falls back to its compiled release, if any.
                if 'releases' in data and data['releases']:
                    value = max(_resolve_pointer(r, spider.release_pointer) for r in data['releases'])
                elif 'compiledRelease' in data:
                    value = _resolve_pointer(data['compiledRelease'], spider.release_pointer)

    if value and spider.truncate:
        value = value[:spider.truncate]

    return PluckedItem({'value': value})
def test_disabled():
    """
    A spider created without a package or release pointer leaves no ``pluck*.csv`` file in the pluck directory,
    even after an item is scraped and the spider is closed.
    """
    with TemporaryDirectory() as directory:
        crawler_settings = {'KINGFISHER_PLUCK_PATH': directory}
        spider = spider_with_crawler(settings=crawler_settings)
        extension = KingfisherPluck.from_crawler(spider.crawler)

        plucked = PluckedItem({'value': '2020-10-01'})
        extension.item_scraped(plucked, spider)
        extension.spider_closed(spider, 'itemcount')

        pattern = os.path.join(directory, 'pluck*.csv')
        matches = glob(pattern)
        assert not matches
def test_process_item_package_pointer(data_type, data):
    """
    With the ``/publishedDate`` package pointer, the pipeline returns the published date as the plucked value
    for the given ``data_type`` and ``data``.
    """
    spider = spider_with_crawler(package_pointer='/publishedDate')
    pipeline = Pluck()
    fields = {
        'file_name': 'test',
        'data': json.dumps(data),
        'data_type': data_type,
        'url': 'http://test.com',
    }
    item = File(fields)

    actual = pipeline.process_item(item, spider)
    expected = PluckedItem({'value': '2000-01-01T00:00:00Z'})

    assert actual == expected
def test_process_item_release_pointer(data_type, data):
    """
    With the ``/date`` release pointer and a truncation length of 10, the pipeline returns ``2020-10-01`` as the
    plucked value for the given ``data_type`` and ``data``.
    """
    spider = spider_with_crawler(release_pointer='/date', truncate=10)
    pipeline = Pluck()
    fields = {
        'file_name': 'test',
        'data': json.dumps(data),
        'data_type': data_type,
        'url': 'http://test.com',
    }
    item = File(fields)

    actual = pipeline.process_item(item, spider)
    expected = PluckedItem({'value': '2020-10-01'})

    assert actual == expected
def test_process_item_non_package_data_type():
    """
    With a package pointer and an item of data type ``release``, the pipeline returns a "not found" error
    message as the plucked value.
    """
    spider = spider_with_crawler(package_pointer='/publishedDate')
    pipeline = Pluck()
    fields = {
        'file_name': 'test',
        'data': json.dumps(releases[0]),
        'data_type': 'release',
        'url': 'http://test.com',
    }
    item = File(fields)

    actual = pipeline.process_item(item, spider)
    expected = PluckedItem({'value': 'error: /publishedDate not found'})

    assert actual == expected
def test_process_item_nonexistent_pointer(kwargs):
    """
    When the spider is created with the given ``kwargs``, plucking from a release package returns a
    "/nonexistent not found" error message as the plucked value.
    """
    spider = spider_with_crawler(**kwargs)
    pipeline = Pluck()
    fields = {
        'file_name': 'test',
        'data': json.dumps(release_package),
        'data_type': 'release_package',
        'url': 'http://test.com',
    }
    item = File(fields)

    actual = pipeline.process_item(item, spider)
    expected = PluckedItem({'value': 'error: /nonexistent not found'})

    assert actual == expected
def test_spider_closed_with_items():
    """
    With the ``/date`` release pointer, scraping one plucked item and closing the spider writes
    ``pluck-release-date.csv`` containing a single ``2020-10-01,test`` line.
    """
    with TemporaryDirectory() as directory:
        crawler_settings = {'KINGFISHER_PLUCK_PATH': directory}
        spider = spider_with_crawler(settings=crawler_settings, release_pointer='/date')
        extension = KingfisherPluck.from_crawler(spider.crawler)

        plucked = PluckedItem({'value': '2020-10-01'})
        extension.item_scraped(plucked, spider)
        extension.spider_closed(spider, 'itemcount')

        csv_path = os.path.join(directory, 'pluck-release-date.csv')
        with open(csv_path) as f:
            content = f.read()

        assert content == '2020-10-01,test\n'
def process_item(self, item, spider):
    """
    Pluck a single value from the item's data, if the spider is plucking.

    If ``spider.pluck`` is falsy, the item is returned unchanged. Otherwise, a ``PluckedItem`` with a ``value``
    key is returned. The value is determined as follows:

    - If ``spider.package_pointer`` is set and ``item["data"]`` is a dict, the pointer is resolved against it.
      If ``item["data"]`` is not a dict, it is parsed incrementally with ijson and the first match is used. The
      value is an error message if there is no match, or if parsing stops on one of the known truncation
      errors.
    - Otherwise (``spider.release_pointer``), ``item["data"]`` is used as-is if it is a dict, and is parsed with
      ``json.loads`` if not. For data types starting with "release", the value is the maximum of the pointer
      resolved against each of the data's ``releases``. For data types starting with "record", only the first
      of the data's ``records`` is used: the maximum over its non-empty ``releases``, or else the pointer
      resolved against its ``compiledRelease``.
    - If none of the above yields a value, the value is ``None``.

    If the value is truthy and ``spider.truncate`` is set, the value is sliced to that length.

    :param item: the item to process; its ``data`` and ``data_type`` keys are read
    :param spider: the spider, providing ``pluck``, ``package_pointer``, ``release_pointer`` and ``truncate``
    :returns: ``item`` itself if not plucking, otherwise a ``PluckedItem``
    :raises ijson.common.IncompleteJSONError: if incremental parsing fails with an unrecognized message
    """
    if not spider.pluck:
        return item

    value = None
    if spider.package_pointer:
        pointer = spider.package_pointer
        if isinstance(item["data"], dict):
            value = _resolve_pointer(item["data"], pointer)
        else:
            try:
                # Convert the pointer to an ijson prefix: drop the leading slash, and separate parts with dots.
                value = next(ijson.items(item["data"], pointer[1:].replace("/", ".")))
            except StopIteration:
                value = f"error: {pointer} not found"
            except ijson.common.IncompleteJSONError as e:
                # Only the first line of the error message is compared.
                message = str(e).split("\n", 1)[0]
                if message.endswith(
                    (
                        # The JSON text can be truncated by a `bytes_received` handler.
                        "premature EOF",
                        # These messages occur if the JSON text is truncated at `"\\u` or `"\\`.
                        r"lexical error: invalid (non-hex) character occurs after '\u' inside string.",
                        r"lexical error: inside a string, '\' occurs before a character which it may not.",
                    )
                ):
                    value = f"error: {pointer} not found within initial bytes"
                else:
                    raise
    else:  # spider.release_pointer
        if isinstance(item["data"], dict):
            data = item["data"]
        else:
            data = json.loads(item["data"])

        if item["data_type"].startswith("release"):
            releases = data["releases"]
            if releases:
                value = max(_resolve_pointer(r, spider.release_pointer) for r in releases)
        elif item["data_type"].startswith("record"):
            records = data["records"]
            if records:
                # This assumes that the first record in the record package has the desired value.
                record = records[0]
                # Require a non-empty list, like the `if releases:` guard above: max() raises ValueError on
                # an empty iterable. A record with empty `releases` falls back to its compiled release, if any.
                if "releases" in record and record["releases"]:
                    value = max(_resolve_pointer(r, spider.release_pointer) for r in record["releases"])
                elif "compiledRelease" in record:
                    value = _resolve_pointer(record["compiledRelease"], spider.release_pointer)

    if value and spider.truncate:
        value = value[: spider.truncate]

    return PluckedItem({"value": value})