def main(sys_argv):
    """
    Find the WARC filepaths for one or more collections.

    Parses the command line, resolves each (possibly truncated) collection id
    through the SFM API and gathers the paths of the WARCs that the API
    returns for those collections, optionally limited by harvest date and
    WARC creation date.

    Exits with status 1 (after printing a message) when a truncated
    collection id matches no collection or more than one collection.

    :param sys_argv: the complete argument vector, including the program name
        at index 0. Passed in explicitly so that it can be mocked for testing.
    :return: the sorted, de-duplicated WARC filepaths as a single string,
        separated by spaces, or by newlines when --newline is given.
    """
    # Arguments
    parser = argparse.ArgumentParser(description="Return WARC filepaths for passing to other commandlines.")
    parser.add_argument("--harvest-start", help="ISO8601 datetime after which harvest was performed. For example, "
                                                "2015-02-22T14:49:07Z")
    parser.add_argument("--harvest-end", help="ISO8601 datetime before which harvest was performed. For example, "
                                              "2015-02-22T14:49:07Z")
    parser.add_argument("--warc-start", help="ISO8601 datetime after which WARC was created. For example, "
                                             "2015-02-22T14:49:07Z")
    parser.add_argument("--warc-end", help="ISO8601 datetime before which WARC was created. For example, "
                                           "2015-02-22T14:49:07Z")
    default_api_base_url = "http://api:8080"
    parser.add_argument("--api-base-url", help="Base url of the SFM API. Default is {}.".format(default_api_base_url),
                        default=default_api_base_url)
    # argparse passes string defaults (and the string const used for a bare
    # --debug) through `type`, so args.debug ends up as a real boolean.
    parser.add_argument("--debug", type=lambda v: v.lower() in ("yes", "true", "t", "1"), nargs="?",
                        default="False", const="True")
    parser.add_argument("--newline", action="store_true", help="Separates WARCs by newline instead of space.")
    parser.add_argument("collection", nargs="+", help="Limit to WARCs of this collection. "
                                                      "Truncated collection ids may be used.")

    # Explicitly using sys.argv so that can mock out for testing.
    args = parser.parse_args(sys_argv[1:])

    # Logging
    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(format='%(asctime)s: %(name)s --> %(message)s', level=log_level)
    logging.getLogger("requests").setLevel(log_level)

    api_client = ApiClient(args.api_base_url)

    # Resolve every command-line collection argument to a full collection id.
    collection_ids = []
    for collection_id_part in args.collection:
        log.debug("Looking up collection id part %s", collection_id_part)
        if len(collection_id_part) == 32:
            # A 32 character argument is treated as a complete id; no lookup.
            collection_ids.append(collection_id_part)
        else:
            collections = list(api_client.collections(collection_id_startswith=collection_id_part))
            if not collections:
                print("No matching collections for {}".format(collection_id_part))
                sys.exit(1)
            elif len(collections) > 1:
                print("Multiple matching collections for {}".format(collection_id_part))
                sys.exit(1)
            else:
                collection_ids.append(collections[0]["collection_id"])

    # A set so that a path returned more than once is only reported once.
    warc_filepaths = set()
    for collection_id in collection_ids:
        log.debug("Looking up warcs for %s", collection_id)
        warcs = api_client.warcs(collection_id=collection_id, harvest_date_start=args.harvest_start,
                                 harvest_date_end=args.harvest_end, created_date_start=args.warc_start,
                                 created_date_end=args.warc_end)
        warc_filepaths.update(warc["path"] for warc in warcs)

    sep = "\n" if args.newline else " "
    return sep.join(sorted(warc_filepaths))
def main(sys_argv):
    """
    Find the WARC filepaths for one or more collections.

    Parses the command line, resolves each (possibly truncated) collection id
    through the SFM API and gathers the paths of the WARCs that the API
    returns for those collections, optionally limited by harvest date. WARCs
    for web harvests are excluded unless --include-web is given.

    Exits with status 1 (after printing a message) when a truncated
    collection id matches no collection or more than one collection.

    :param sys_argv: the complete argument vector, including the program name
        at index 0. Passed in explicitly so that it can be mocked for testing.
    :return: the sorted, de-duplicated WARC filepaths joined by single spaces.
    """
    # Arguments
    parser = argparse.ArgumentParser(description="Return WARC filepaths for passing to other commandlines.")
    parser.add_argument("--include-web", action="store_true", help="Include WARCs for web harvests.")
    parser.add_argument("--harvest-start", help="ISO8601 datetime after which harvest was performed. For example, "
                                                "2015-02-22T14:49:07Z")
    parser.add_argument("--harvest-end", help="ISO8601 datetime before which harvest was performed. For example, "
                                              "2015-02-22T14:49:07Z")
    default_api_base_url = "http://api:8080"
    parser.add_argument("--api-base-url", help="Base url of the SFM API. Default is {}.".format(default_api_base_url),
                        default=default_api_base_url)
    # argparse passes string defaults (and the string const used for a bare
    # --debug) through `type`, so args.debug ends up as a real boolean.
    parser.add_argument("--debug", type=lambda v: v.lower() in ("yes", "true", "t", "1"), nargs="?",
                        default="False", const="True")
    parser.add_argument("collection", nargs="+", help="Limit to WARCs of this collection. "
                                                      "Truncated collection ids may be used.")

    # Explicitly using sys.argv so that can mock out for testing.
    args = parser.parse_args(sys_argv[1:])

    # Logging
    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(format='%(asctime)s: %(name)s --> %(message)s', level=log_level)
    logging.getLogger("requests").setLevel(log_level)

    api_client = ApiClient(args.api_base_url)

    # Resolve every command-line collection argument to a full collection id.
    collection_ids = []
    for collection_id_part in args.collection:
        log.debug("Looking up collection id part %s", collection_id_part)
        if len(collection_id_part) == 32:
            # A 32 character argument is treated as a complete id; no lookup.
            collection_ids.append(collection_id_part)
        else:
            collections = list(api_client.collections(collection_id_startswith=collection_id_part))
            if not collections:
                # print() with a single argument behaves the same on Python 2
                # and Python 3, unlike the print statement.
                print("No matching collections for {}".format(collection_id_part))
                sys.exit(1)
                # Not reached normally, since sys.exit() raises SystemExit.
                # Kept so main() still stops here if sys.exit is replaced by
                # a stub (presumably in tests; confirm before removing).
                return
            elif len(collections) > 1:
                print("Multiple matching collections for {}".format(collection_id_part))
                sys.exit(1)
                return
            else:
                collection_ids.append(collections[0]["collection_id"])

    # A set so that a path returned more than once is only reported once.
    warc_filepaths = set()
    for collection_id in collection_ids:
        log.debug("Looking up warcs for %s", collection_id)
        warcs = api_client.warcs(collection_id=collection_id, harvest_date_start=args.harvest_start,
                                 harvest_date_end=args.harvest_end, exclude_web=not args.include_web)
        warc_filepaths.update(warc["path"] for warc in warcs)

    return " ".join(sorted(warc_filepaths))
class BaseExporter(BaseConsumer):
    """
    Base class for exporters.

    Handles an export request message by looking up the relevant WARCs
    through the SFM API, writing their items out in the requested format and
    publishing status messages describing the outcome.
    """

    def __init__(self, api_base_url, warc_iter_cls, table_cls, working_path, mq_config=None, warc_base_path=None,
                 limit_item_types=None, host=None):
        """
        :param api_base_url: base url handed to ApiClient
        :param warc_iter_cls: class used for the "json_full" export; it is
            called with (warc_paths, seed_uids) and its iter() is consumed
        :param table_cls: class used for the tabular and "dehydrate" exports;
            it is called with (warc_paths, dedupe, item_date_start,
            item_date_end, seed_uids, export_segment_size), iterated to get
            tables, and asked for id_field()
        :param working_path: working directory, passed to BaseConsumer; the
            temporary export directory "tmp" is created beneath it
        :param mq_config: messaging configuration, passed to BaseConsumer
        :param warc_base_path: optional directory that WARC paths from the API
            are joined onto. This is for unit tests only.
        :param limit_item_types: passed through to warc_iter_cls(...).iter()
        :param host: host name reported in status messages; defaults to the
            HOSTNAME environment variable, or "localhost"
        """
        BaseConsumer.__init__(self, mq_config=mq_config, working_path=working_path, persist_messages=True)
        self.api_client = ApiClient(api_base_url)
        self.warc_iter_cls = warc_iter_cls
        self.table_cls = table_cls
        self.limit_item_types = limit_item_types
        # This is for unit tests only.
        self.warc_base_path = warc_base_path
        self.host = host or os.environ.get("HOSTNAME", "localhost")

    def on_message(self):
        """
        Perform the export described by self.message.

        Sends a "running" status, performs the export into a temporary
        directory that is then moved to the requested path, and finally sends
        a success or failure status. Problems are recorded on self.result
        rather than raised.
        """
        assert self.message

        export_id = self.message["id"]
        log.info("Performing export %s", export_id)

        self.result = ExportResult()
        self.result.started = datetime_now()

        # Send status indicating that it is running
        self._send_response_message(STATUS_RUNNING, self.routing_key, export_id, self.result)

        # Get the WARCs from the API
        collection_id = self.message.get("collection", {}).get("id")
        seed_ids = []
        seed_uids = []
        for seed in self.message.get("seeds", []):
            seed_ids.append(seed["id"])
            seed_uids.append(seed["uid"])

        # Exactly one of collection or seeds must be requested.
        if (collection_id or seed_ids) and not (collection_id and seed_ids):
            harvest_date_start = self.message.get("harvest_date_start")
            harvest_date_end = self.message.get("harvest_date_end")
            # Only request seed ids if there are at most 20. If use too many,
            # will cause problems calling API. 20 is an arbitrary number.
            warc_paths = self._get_warc_paths(collection_id, seed_ids if len(seed_ids) <= 20 else None,
                                              harvest_date_start, harvest_date_end)
            export_format = self.message["format"]
            export_segment_size = self.message["segment_size"]
            export_path = self.message["path"]
            dedupe = self.message.get("dedupe", False)
            item_date_start = iso8601.parse_date(
                self.message["item_date_start"]) if "item_date_start" in self.message else None
            item_date_end = iso8601.parse_date(
                self.message["item_date_end"]) if "item_date_end" in self.message else None
            temp_path = os.path.join(self.working_path, "tmp")
            base_filepath = os.path.join(temp_path, export_id)

            if warc_paths:
                # Clean the temp directory
                if os.path.exists(temp_path):
                    shutil.rmtree(temp_path)
                os.makedirs(temp_path)

                # We get a lot of bang from PETL
                # Maps export format to (file extension, writer function).
                export_formats = {
                    "csv": ("csv", petl.tocsv),
                    "tsv": ("tsv", petl.totsv),
                    "html": ("html", petl.tohtml),
                    "xlsx": ("xlsx", to_xlsx),
                    "json": ("json", to_lineoriented_json)
                }
                # Other possibilities: XML, databases, HDFS
                if export_format == "json_full":
                    self._full_json_export(warc_paths, base_filepath, dedupe, item_date_start, item_date_end,
                                           seed_uids, export_segment_size)
                elif export_format == "dehydrate":
                    tables = self.table_cls(warc_paths, dedupe, item_date_start, item_date_end, seed_uids,
                                            export_segment_size)
                    for idx, table in enumerate(tables):
                        filepath = "{}_{}.txt".format(base_filepath, str(idx + 1).zfill(3))
                        log.info("Exporting to %s", filepath)
                        # One line per row, containing only the id field.
                        petl.totext(table, filepath, template="{{{}}}\n".format(tables.id_field()))
                elif export_format in export_formats:
                    tables = self.table_cls(warc_paths, dedupe, item_date_start, item_date_end, seed_uids,
                                            export_segment_size)
                    extension, writer = export_formats[export_format]
                    for idx, table in enumerate(tables):
                        filepath = "{}_{}.{}".format(base_filepath, str(idx + 1).zfill(3), extension)
                        log.info("Exporting to %s", filepath)
                        writer(table, filepath)
                        if export_format == 'html':
                            # Wrap the generated table in a document that
                            # declares its character set.
                            self._file_fix(filepath, prefix="<html><head><meta charset='utf-8'></head>\n",
                                           suffix="</html>")
                else:
                    self.result.errors.append(
                        Msg(CODE_UNSUPPORTED_EXPORT_FORMAT, "{} is not supported".format(export_format)))
                    self.result.success = False

                # Move files from temp path to export path
                if os.path.exists(export_path):
                    shutil.rmtree(export_path)
                shutil.move(temp_path, export_path)
            else:
                self.result.errors.append(Msg(CODE_NO_WARCS, "No WARC files from which to export"))
                self.result.success = False
        else:
            self.result.errors.append(Msg(CODE_BAD_REQUEST, "Request export of a seed or collection."))
            self.result.success = False

        self.result.ended = datetime_now()
        self._send_response_message(STATUS_SUCCESS if self.result.success else STATUS_FAILURE, self.routing_key,
                                    export_id, self.result)

    @staticmethod
    def _to_bytes(text):
        """Return text as bytes, UTF-8 encoding it if it is not bytes already."""
        return text if isinstance(text, bytes) else text.encode("utf-8")

    def _file_fix(self, filepath, prefix=None, suffix=None):
        """
        Add an optional prefix and suffix to an existing file.

        The content is streamed through a temporary file in the working path,
        so the file never has to be loaded into memory; the temporary file
        then replaces the original.

        :param filepath: path of the file to rewrite
        :param prefix: text to place before the existing content, if any
        :param suffix: text to place after the existing content, if any
        """
        # NamedTemporaryFile is opened in binary mode by default, so copy the
        # existing content as bytes and encode the prefix/suffix to match.
        with tempfile.NamedTemporaryFile(dir=self.working_path, delete=False) as outfile:
            if prefix:
                outfile.write(self._to_bytes(prefix))
            with open(filepath, 'rb') as infile:
                shutil.copyfileobj(infile, outfile)
            if suffix:
                outfile.write(self._to_bytes(suffix))
        # Only replace the original once the temporary file has been closed,
        # so that everything written is flushed before the move.
        shutil.move(outfile.name, filepath)

    def _full_json_export(self, warc_paths, base_filepath, dedupe, item_date_start, item_date_end, seed_uids,
                          export_segment_size):
        """
        Write the items from the WARCs as line-oriented JSON files.

        Files are named <base_filepath>_NNN.json, numbered from 001, each
        holding up to export_segment_size items (everything in one file when
        export_segment_size is falsy).
        """
        warcs = self.warc_iter_cls(warc_paths, seed_uids).iter(dedupe=dedupe,
                                                               item_date_start=item_date_start,
                                                               item_date_end=item_date_end,
                                                               limit_item_types=self.limit_item_types)

        for idx, statuses in enumerate(self._chunk_json(warcs, export_segment_size)):
            export_filepath = "{}_{}.json".format(base_filepath, str(idx + 1).zfill(3))
            log.info("Exporting to %s", export_filepath)
            with codecs.open(export_filepath, "w") as f:
                for status in statuses:
                    json.dump(status.item, f)
                    f.write("\n")

    @staticmethod
    def _chunk_json(warcs, chunk_size):
        """
        Lazily split an iterable into consecutive chunks.

        Yields generators that each produce up to chunk_size items. All
        chunks draw from the same underlying iterator, so a chunk has to be
        consumed before the next one is requested. A falsy chunk_size
        produces a single chunk with every item.
        """
        iterable = iter(warcs)
        # The first item of each chunk is taken by the for loop below.
        split_size = chunk_size - 1 if chunk_size else None
        for post in iterable:
            # define the chunk
            def chunk():
                # get the first
                yield post
                # get the left chunk_size
                for more in islice(iterable, split_size):
                    yield more

            yield chunk()

    def _get_warc_paths(self, collection_id, seed_ids, harvest_date_start, harvest_date_end):
        """
        Get list of WARC files and make sure they exist.

        A WARC that the API reports but that is missing on disk is left out,
        and is recorded as an error on self.result, which is marked as
        unsuccessful.

        :return: list of paths of the WARC files that exist
        """
        warc_paths = []
        log.debug("Getting warcs for collection %s", collection_id)
        for warc in self.api_client.warcs(collection_id=collection_id, seed_ids=seed_ids,
                                          harvest_date_start=harvest_date_start, harvest_date_end=harvest_date_end):
            warc_path = os.path.join(self.warc_base_path, warc["path"]) if self.warc_base_path else warc["path"]
            if os.path.exists(warc_path):
                warc_paths.append(warc_path)
            else:
                self.result.errors.append(Msg(CODE_WARC_MISSING, "{} is missing".format(warc_path)))
                self.result.success = False
        log.debug("Warcs are %s", warc_paths)
        return warc_paths

    def _send_response_message(self, status, export_request_routing_key, export_id, export_result):
        """
        Publish a status message for an export.

        The message is published with the routing key of the export request,
        with "start" replaced by "status".
        """
        # Just add additional info to job message
        message = {
            "id": export_id,
            "status": status,
            "infos": [msg.to_map() for msg in export_result.infos],
            "warnings": [msg.to_map() for msg in export_result.warnings],
            "errors": [msg.to_map() for msg in export_result.errors],
            "date_started": export_result.started.isoformat(),
            # This will add spaces before caps
            "service": re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', self.__class__.__name__),
            "host": self.host,
            "instance": str(os.getpid())
        }

        if export_result.ended:
            message["date_ended"] = export_result.ended.isoformat()

        # Routing key may be none
        # NOTE(review): a routing key of None would fail on .replace() below;
        # confirm how callers that have no routing key are meant to behave.
        response_routing_key = export_request_routing_key.replace("start", "status")
        self._publish_message(response_routing_key, message)

    @staticmethod
    def main(cls, queue, routing_keys):
        """
        A configurable main() for an exporter.

        For example:
            if __name__ == "__main__":
                FlickrExporter.main(FlickrExporter, QUEUE, [ROUTING_KEY])

        With the "service" command the exporter consumes messages from the
        messaging queue. With the "file" command it performs the export
        described by a file and exits with status 0 if the result is truthy,
        1 otherwise.

        :param cls: the exporter class
        :param queue: queue for the harvester
        :param routing_keys: list of routing keys for the exporter
        """
        # Logging
        logging.basicConfig(format='%(asctime)s: %(name)s --> %(message)s', level=logging.DEBUG)

        # Arguments
        parser = argparse.ArgumentParser()
        parser.add_argument("--debug", type=lambda v: v.lower() in ("yes", "true", "t", "1"), nargs="?",
                            default="False", const="True")

        subparsers = parser.add_subparsers(dest="command")

        service_parser = subparsers.add_parser("service", help="Run export service that consumes messages from "
                                                               "messaging queue.")
        service_parser.add_argument("host")
        service_parser.add_argument("username")
        service_parser.add_argument("password")
        service_parser.add_argument("api")
        service_parser.add_argument("working_path")
        service_parser.add_argument("--skip-resume", action="store_true")

        file_parser = subparsers.add_parser("file", help="Export based on a file.")
        file_parser.add_argument("filepath", help="Filepath of the export file.")
        file_parser.add_argument("api", help="Base url of SFM-UI API")
        file_parser.add_argument("working_path")
        file_parser.add_argument("--host")
        file_parser.add_argument("--username")
        file_parser.add_argument("--password")

        args = parser.parse_args()

        # Logging
        logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)

        if args.command == "service":
            exporter = cls(args.api, args.working_path,
                           mq_config=MqConfig(args.host, args.username, args.password, EXCHANGE,
                                              {queue: routing_keys}))
            if not args.skip_resume:
                exporter.resume_from_file()
            exporter.run()
        elif args.command == "file":
            # Messaging is only configured when all of the connection details
            # were supplied.
            mq_config = MqConfig(args.host, args.username, args.password, EXCHANGE, None) \
                if args.host and args.username and args.password else None
            exporter = cls(args.api, args.working_path, mq_config=mq_config)
            exporter.message_from_file(args.filepath)
            if exporter.result:
                log.info("Result is: %s", exporter.result)
                sys.exit(0)
            else:
                log.warning("Result is: %s", exporter.result)
                sys.exit(1)
class TestApiClient(TestCase):
    """
    Tests of the filters supported by ApiClient.warcs() and
    ApiClient.collections(). Every test runs inside vcr.use_cassette().
    """

    def setUp(self):
        self.client = ApiClient("http://192.168.99.100:8081/")

    def _warc_count(self, **filters):
        """Return how many WARCs the client yields for the given filters."""
        return len(list(self.client.warcs(**filters)))

    def _collection_count(self, **filters):
        """Return how many collections the client yields for the given filters."""
        return len(list(self.client.collections(**filters)))

    @vcr.use_cassette()
    def test_all_warcs(self):
        self.assertEqual(3, self._warc_count())

    @vcr.use_cassette()
    def test_warcs_by_collection(self):
        self.assertEqual(2, self._warc_count(collection_id="005b131f5f854402afa2b08a4b7ba960"))
        self.assertEqual(0, self._warc_count(collection_id="x005b131f5f854402afa2b08a4b7ba960"))

    @vcr.use_cassette()
    def test_warcs_by_seed(self):
        # A single seed id may be given as a plain string ...
        self.assertEqual(2, self._warc_count(seed_ids="ded3849618b04818ae100a489d67d395"))
        self.assertEqual(0, self._warc_count(seed_ids="xded3849618b04818ae100a489d67d395"))
        # ... or seed ids may be given as a list.
        self.assertEqual(2, self._warc_count(seed_ids=["ded3849618b04818ae100a489d67d395"]))
        self.assertEqual(2, self._warc_count(seed_ids=["ded3849618b04818ae100a489d67d395", "x"]))
        self.assertEqual(3, self._warc_count(seed_ids=["48722ac6154241f592fd74da775b7ab7",
                                                       "3ce76759a3ee40b894562a35359dfa54"]))

    @vcr.use_cassette()
    def test_warcs_by_harvest_date_start(self):
        self.assertEqual(3, self._warc_count(harvest_date_start="2015-02-22T14:49:07Z"))
        self.assertEqual(1, self._warc_count(harvest_date_start="2016-02-22T14:49:07Z"))
        self.assertEqual(0, self._warc_count(harvest_date_start="2017-02-22T14:48:07Z"))

    @vcr.use_cassette()
    def test_warcs_by_harvest_date_end(self):
        self.assertEqual(0, self._warc_count(harvest_date_end="2015-02-22T14:49:07Z"))
        self.assertEqual(2, self._warc_count(harvest_date_end="2016-02-22T14:49:07Z"))
        self.assertEqual(3, self._warc_count(harvest_date_end="2017-02-22T14:48:07Z"))

    @vcr.use_cassette()
    def test_exclude_web(self):
        self.assertEqual(4, self._warc_count(exclude_web=True))
        self.assertEqual(5, self._warc_count(exclude_web=False))

    @vcr.use_cassette()
    def test_all_collections(self):
        self.assertEqual(5, self._collection_count())

    @vcr.use_cassette()
    def test_collections_startswith(self):
        self.assertEqual(1, self._collection_count(collection_id_startswith="8fcb71eb883745"))
        self.assertEqual(0, self._collection_count(collection_id_startswith="x8fcb71eb883745"))
class BaseExporter(BaseConsumer):
    """
    Base class for exporters.

    Handles an export request message by looking up the relevant WARCs
    through the SFM API, writing their items out in the requested format and
    publishing status messages describing the outcome.
    """

    def __init__(self, api_base_url, warc_iter_cls, table_cls, working_path, mq_config=None, warc_base_path=None,
                 limit_item_types=None, host=None):
        """
        :param api_base_url: base url handed to ApiClient
        :param warc_iter_cls: class used for the "json_full" export; it is
            called with (warc_paths, seed_uids) and its iter() is consumed
        :param table_cls: class used for the tabular and "dehydrate" exports;
            it is called with (warc_paths, dedupe, item_date_start,
            item_date_end, seed_uids, export_segment_size), iterated to get
            tables, and asked for id_field()
        :param working_path: working directory, passed to BaseConsumer; the
            temporary export directory "tmp" is created beneath it
        :param mq_config: messaging configuration, passed to BaseConsumer
        :param warc_base_path: optional directory that WARC paths from the API
            are joined onto. This is for unit tests only.
        :param limit_item_types: passed through to warc_iter_cls(...).iter()
        :param host: host name reported in status messages; defaults to the
            HOSTNAME environment variable, or "localhost"
        """
        BaseConsumer.__init__(self, mq_config=mq_config, working_path=working_path, persist_messages=True)
        self.api_client = ApiClient(api_base_url)
        self.warc_iter_cls = warc_iter_cls
        self.table_cls = table_cls
        self.limit_item_types = limit_item_types
        # This is for unit tests only.
        self.warc_base_path = warc_base_path
        self.host = host or os.environ.get("HOSTNAME", "localhost")

    def on_message(self):
        """
        Perform the export described by self.message.

        Sends a "running" status, performs the export into a temporary
        directory that is then moved to the requested path, and finally sends
        a success or failure status. Problems are recorded on self.result
        rather than raised.
        """
        assert self.message

        export_id = self.message["id"]
        log.info("Performing export %s", export_id)

        self.result = ExportResult()
        self.result.started = datetime_now()

        # Send status indicating that it is running
        self._send_response_message(STATUS_RUNNING, self.routing_key, export_id, self.result)

        # Get the WARCs from the API
        collection_id = self.message.get("collection", {}).get("id")
        seed_ids = []
        seed_uids = []
        for seed in self.message.get("seeds", []):
            seed_ids.append(seed["id"])
            seed_uids.append(seed["uid"])

        # Exactly one of collection or seeds must be requested.
        if (collection_id or seed_ids) and not (collection_id and seed_ids):
            harvest_date_start = self.message.get("harvest_date_start")
            harvest_date_end = self.message.get("harvest_date_end")
            warc_paths = self._get_warc_paths(collection_id, seed_ids, harvest_date_start, harvest_date_end)
            export_format = self.message["format"]
            export_segment_size = self.message["segment_size"]
            export_path = self.message["path"]
            dedupe = self.message.get("dedupe", False)
            item_date_start = iso8601.parse_date(
                self.message["item_date_start"]) if "item_date_start" in self.message else None
            item_date_end = iso8601.parse_date(
                self.message["item_date_end"]) if "item_date_end" in self.message else None
            temp_path = os.path.join(self.working_path, "tmp")
            base_filepath = os.path.join(temp_path, export_id)

            if warc_paths:
                # Clean the temp directory
                if os.path.exists(temp_path):
                    shutil.rmtree(temp_path)
                os.makedirs(temp_path)

                # We get a lot of bang from PETL
                # Maps export format to (file extension, writer function).
                export_formats = {
                    "csv": ("csv", petl.tocsv),
                    "tsv": ("tsv", petl.totsv),
                    "html": ("html", petl.tohtml),
                    "xlsx": ("xlsx", to_xlsx),
                    "json": ("json", to_lineoriented_json)
                }
                # Other possibilities: XML, databases, HDFS
                if export_format == "json_full":
                    self._full_json_export(warc_paths, base_filepath, dedupe, item_date_start, item_date_end,
                                           seed_uids, export_segment_size)
                elif export_format == "dehydrate":
                    tables = self.table_cls(warc_paths, dedupe, item_date_start, item_date_end, seed_uids,
                                            export_segment_size)
                    for idx, table in enumerate(tables):
                        filepath = "{}_{}.txt".format(base_filepath, str(idx + 1).zfill(3))
                        log.info("Exporting to %s", filepath)
                        # One line per row, containing only the id field.
                        petl.totext(table, filepath, template="{{{}}}\n".format(tables.id_field()))
                elif export_format in export_formats:
                    tables = self.table_cls(warc_paths, dedupe, item_date_start, item_date_end, seed_uids,
                                            export_segment_size)
                    extension, writer = export_formats[export_format]
                    for idx, table in enumerate(tables):
                        filepath = "{}_{}.{}".format(base_filepath, str(idx + 1).zfill(3), extension)
                        log.info("Exporting to %s", filepath)
                        writer(table, filepath)
                        if export_format == 'html':
                            # Wrap the generated table in a document that
                            # declares its character set.
                            self._file_fix(filepath, prefix="<html><head><meta charset='utf-8'></head>\n",
                                           suffix="</html>")
                else:
                    self.result.errors.append(
                        Msg(CODE_UNSUPPORTED_EXPORT_FORMAT, "{} is not supported".format(export_format)))
                    self.result.success = False

                # Move files from temp path to export path
                if os.path.exists(export_path):
                    shutil.rmtree(export_path)
                shutil.move(temp_path, export_path)
            else:
                self.result.errors.append(Msg(CODE_NO_WARCS, "No WARC files from which to export"))
                self.result.success = False
        else:
            self.result.errors.append(Msg(CODE_BAD_REQUEST, "Request export of a seed or collection."))
            self.result.success = False

        self.result.ended = datetime_now()
        self._send_response_message(STATUS_SUCCESS if self.result.success else STATUS_FAILURE, self.routing_key,
                                    export_id, self.result)

    @staticmethod
    def _to_bytes(text):
        """Return text as bytes, UTF-8 encoding it if it is not bytes already."""
        return text if isinstance(text, bytes) else text.encode("utf-8")

    def _file_fix(self, filepath, prefix=None, suffix=None):
        """
        Add an optional prefix and suffix to an existing file.

        The content is streamed through a temporary file in the working path,
        so the file never has to be loaded into memory; the temporary file
        then replaces the original.

        :param filepath: path of the file to rewrite
        :param prefix: text to place before the existing content, if any
        :param suffix: text to place after the existing content, if any
        """
        # NamedTemporaryFile is opened in binary mode by default, so copy the
        # existing content as bytes and encode the prefix/suffix to match.
        # open() is used because the file() builtin only exists on Python 2.
        with tempfile.NamedTemporaryFile(dir=self.working_path, delete=False) as outfile:
            if prefix:
                outfile.write(self._to_bytes(prefix))
            with open(filepath, 'rb') as infile:
                shutil.copyfileobj(infile, outfile)
            if suffix:
                outfile.write(self._to_bytes(suffix))
        # Only replace the original once the temporary file has been closed,
        # so that everything written is flushed before the move.
        shutil.move(outfile.name, filepath)

    def _full_json_export(self, warc_paths, base_filepath, dedupe, item_date_start, item_date_end, seed_uids,
                          export_segment_size):
        """
        Write the items from the WARCs as line-oriented JSON files.

        Files are named <base_filepath>_NNN.json, numbered from 001, each
        holding up to export_segment_size items (everything in one file when
        export_segment_size is falsy).
        """
        warcs = self.warc_iter_cls(warc_paths, seed_uids).iter(dedupe=dedupe,
                                                               item_date_start=item_date_start,
                                                               item_date_end=item_date_end,
                                                               limit_item_types=self.limit_item_types)

        for idx, statuses in enumerate(self._chunk_json(warcs, export_segment_size)):
            export_filepath = "{}_{}.json".format(base_filepath, str(idx + 1).zfill(3))
            log.info("Exporting to %s", export_filepath)
            with codecs.open(export_filepath, "w") as f:
                for status in statuses:
                    json.dump(status.item, f)
                    f.write("\n")

    def _chunk_json(self, warcs, chunk_size):
        """
        Lazily split an iterable into consecutive chunks.

        Yields generators that each produce up to chunk_size items. All
        chunks draw from the same underlying iterator, so a chunk has to be
        consumed before the next one is requested. A falsy chunk_size
        produces a single chunk with every item.
        """
        iterable = iter(warcs)
        # The first item of each chunk is taken by the for loop below.
        split_size = chunk_size - 1 if chunk_size else None
        for post in iterable:
            # define the chunk
            def chunk():
                # get the first
                yield post
                # get the left chunk_size
                for more in islice(iterable, split_size):
                    yield more

            yield chunk()

    def _get_warc_paths(self, collection_id, seed_ids, harvest_date_start, harvest_date_end):
        """
        Get list of WARC files and make sure they exist.

        The API is asked to exclude web WARCs. A WARC that the API reports
        but that is missing on disk is left out, and is recorded as an error
        on self.result, which is marked as unsuccessful.

        :return: list of paths of the WARC files that exist
        """
        warc_paths = []
        log.debug("Getting warcs for collection %s", collection_id)
        for warc in self.api_client.warcs(collection_id=collection_id, seed_ids=seed_ids,
                                          harvest_date_start=harvest_date_start, harvest_date_end=harvest_date_end,
                                          exclude_web=True):
            warc_path = os.path.join(self.warc_base_path, warc["path"]) if self.warc_base_path else warc["path"]
            if os.path.exists(warc_path):
                warc_paths.append(warc_path)
            else:
                self.result.errors.append(Msg(CODE_WARC_MISSING, "{} is missing".format(warc_path)))
                self.result.success = False
        log.debug("Warcs are %s", warc_paths)
        return warc_paths

    def _send_response_message(self, status, export_request_routing_key, export_id, export_result):
        """
        Publish a status message for an export.

        The message is published with the routing key of the export request,
        with "start" replaced by "status".
        """
        # Just add additional info to job message
        message = {
            "id": export_id,
            "status": status,
            "infos": [msg.to_map() for msg in export_result.infos],
            "warnings": [msg.to_map() for msg in export_result.warnings],
            "errors": [msg.to_map() for msg in export_result.errors],
            "date_started": export_result.started.isoformat(),
            # This will add spaces before caps
            "service": re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', self.__class__.__name__),
            "host": self.host,
            "instance": str(os.getpid())
        }

        if export_result.ended:
            message["date_ended"] = export_result.ended.isoformat()

        # Routing key may be none
        # NOTE(review): a routing key of None would fail on .replace() below;
        # confirm how callers that have no routing key are meant to behave.
        response_routing_key = export_request_routing_key.replace("start", "status")
        self._publish_message(response_routing_key, message)

    @staticmethod
    def main(cls, queue, routing_keys):
        """
        A configurable main() for an exporter.

        For example:
            if __name__ == "__main__":
                FlickrExporter.main(FlickrExporter, QUEUE, [ROUTING_KEY])

        With the "service" command the exporter consumes messages from the
        messaging queue. With the "file" command it performs the export
        described by a file and exits with status 0 if the result is truthy,
        1 otherwise.

        :param cls: the exporter class
        :param queue: queue for the harvester
        :param routing_keys: list of routing keys for the exporter
        """
        # Logging
        logging.basicConfig(format='%(asctime)s: %(name)s --> %(message)s', level=logging.DEBUG)

        # Arguments
        parser = argparse.ArgumentParser()
        parser.add_argument("--debug", type=lambda v: v.lower() in ("yes", "true", "t", "1"), nargs="?",
                            default="False", const="True")

        subparsers = parser.add_subparsers(dest="command")

        service_parser = subparsers.add_parser("service", help="Run export service that consumes messages from "
                                                               "messaging queue.")
        service_parser.add_argument("host")
        service_parser.add_argument("username")
        service_parser.add_argument("password")
        service_parser.add_argument("api")
        service_parser.add_argument("working_path")
        service_parser.add_argument("--skip-resume", action="store_true")

        file_parser = subparsers.add_parser("file", help="Export based on a file.")
        file_parser.add_argument("filepath", help="Filepath of the export file.")
        file_parser.add_argument("api", help="Base url of SFM-UI API")
        file_parser.add_argument("working_path")
        file_parser.add_argument("--host")
        file_parser.add_argument("--username")
        file_parser.add_argument("--password")

        args = parser.parse_args()

        # Logging
        logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)

        if args.command == "service":
            exporter = cls(args.api, args.working_path,
                           mq_config=MqConfig(args.host, args.username, args.password, EXCHANGE,
                                              {queue: routing_keys}))
            if not args.skip_resume:
                exporter.resume_from_file()
            exporter.run()
        elif args.command == "file":
            # Messaging is only configured when all of the connection details
            # were supplied.
            mq_config = MqConfig(args.host, args.username, args.password, EXCHANGE, None) \
                if args.host and args.username and args.password else None
            exporter = cls(args.api, args.working_path, mq_config=mq_config)
            exporter.message_from_file(args.filepath)
            if exporter.result:
                log.info("Result is: %s", exporter.result)
                sys.exit(0)
            else:
                log.warning("Result is: %s", exporter.result)
                sys.exit(1)
class TestApiClient(TestCase):
    """
    Tests of the filters supported by ApiClient.warcs() and
    ApiClient.collections(). Every test runs inside vcr.use_cassette().
    """

    def setUp(self):
        self.client = ApiClient("http://localhost:8080/")

    def _warc_count(self, **filters):
        """Return how many WARCs the client yields for the given filters."""
        return len(list(self.client.warcs(**filters)))

    def _collection_count(self, **filters):
        """Return how many collections the client yields for the given filters."""
        return len(list(self.client.collections(**filters)))

    @vcr.use_cassette()
    def test_all_warcs(self):
        self.assertEqual(3, self._warc_count())

    @vcr.use_cassette()
    def test_warcs_by_collection(self):
        self.assertEqual(2, self._warc_count(collection_id="366439dbb28146a9bd439dcc3f076c70"))
        self.assertEqual(0, self._warc_count(collection_id="x366439dbb28146a9bd439dcc3f076c70"))

    @vcr.use_cassette()
    def test_warcs_by_seed(self):
        # A single seed id may be given as a plain string ...
        self.assertEqual(2, self._warc_count(seed_ids="4117a0b5c42646589f5dc81b0fa5eb0c"))
        self.assertEqual(0, self._warc_count(seed_ids="x4117a0b5c42646589f5dc81b0fa5eb0c"))
        # ... or seed ids may be given as a list.
        self.assertEqual(2, self._warc_count(seed_ids=["4117a0b5c42646589f5dc81b0fa5eb0c"]))
        self.assertEqual(2, self._warc_count(seed_ids=["4117a0b5c42646589f5dc81b0fa5eb0c", "x"]))
        self.assertEqual(3, self._warc_count(seed_ids=["4117a0b5c42646589f5dc81b0fa5eb0c",
                                                       "c07e9e180dd24abcac700d1934bda3d1"]))

    @vcr.use_cassette()
    def test_warcs_by_harvest_date_start(self):
        self.assertEqual(3, self._warc_count(harvest_date_start="2017-05-25T13:57:47.980000Z"))
        self.assertEqual(1, self._warc_count(harvest_date_start="2018-05-25T13:56:47.980000Z"))
        self.assertEqual(0, self._warc_count(harvest_date_start="2019-05-25T13:57:47.980000Z"))

    @vcr.use_cassette()
    def test_warcs_by_warc_created_date(self):
        # NOTE(review): despite the test name, these calls filter on
        # harvest_date_start, exactly like test_warcs_by_harvest_date_start.
        # Confirm whether a WARC creation date filter was intended; the
        # recorded cassette would have to match any change.
        self.assertEqual(3, self._warc_count(harvest_date_start="2017-05-25T13:57:47.980000Z"))
        self.assertEqual(1, self._warc_count(harvest_date_start="2018-05-25T13:56:47.980000Z"))
        self.assertEqual(0, self._warc_count(harvest_date_start="2019-05-25T13:57:47.980000Z"))

    @vcr.use_cassette()
    def test_all_collections(self):
        self.assertEqual(2, self._collection_count())

    @vcr.use_cassette()
    def test_collections_startswith(self):
        self.assertEqual(1, self._collection_count(collection_id_startswith="366439dbb"))
        self.assertEqual(0, self._collection_count(collection_id_startswith="x366439dbb"))