Ejemplo n.º 1
0
def main(sys_argv):
    """
    Look up the WARC filepaths for one or more collections.

    Exits with status 1, after writing a message to stderr, when a truncated
    collection id matches no collection or matches more than one.

    :param sys_argv: complete argument vector, program name first. It is
        passed in explicitly so that tests can supply their own.
    :return: the distinct WARC paths, sorted and joined by a space (or by a
        newline when --newline is given).
    """
    # Arguments
    parser = argparse.ArgumentParser(description="Return WARC filepaths for passing to other commandlines.")
    parser.add_argument("--harvest-start", help="ISO8601 datetime after which harvest was performed. For example, "
                                                "2015-02-22T14:49:07Z")
    parser.add_argument("--harvest-end", help="ISO8601 datetime before which harvest was performed. For example, "
                                              "2015-02-22T14:49:07Z")
    parser.add_argument("--warc-start", help="ISO8601 datetime after which WARC was created. For example, "
                                             "2015-02-22T14:49:07Z")
    parser.add_argument("--warc-end", help="ISO8601 datetime before which WARC was created. For example, "
                                           "2015-02-22T14:49:07Z")
    default_api_base_url = "http://api:8080"
    parser.add_argument("--api-base-url", help="Base url of the SFM API. Default is {}.".format(default_api_base_url),
                        default=default_api_base_url)
    # The string default and const are run through `type` by argparse, so args.debug ends up a real bool.
    parser.add_argument("--debug", type=lambda v: v.lower() in ("yes", "true", "t", "1"), nargs="?",
                        default="False", const="True")
    parser.add_argument("--newline", action="store_true", help="Separates WARCs by newline instead of space.")
    parser.add_argument("collection", nargs="+", help="Limit to WARCs of this collection. "
                                                      "Truncated collection ids may be used.")

    # Explicitly using sys.argv so that can mock out for testing.
    args = parser.parse_args(sys_argv[1:])

    # Logging
    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(format='%(asctime)s: %(name)s --> %(message)s', level=log_level)
    logging.getLogger("requests").setLevel(log_level)

    api_client = ApiClient(args.api_base_url)
    collection_ids = []
    for collection_id_part in args.collection:
        log.debug("Looking up collection id part %s", collection_id_part)
        # A 32 character value is taken to be a complete collection id and is not looked up.
        if len(collection_id_part) == 32:
            collection_ids.append(collection_id_part)
            continue
        collections = list(api_client.collections(collection_id_startswith=collection_id_part))
        # Errors go to stderr: the return value of this function is meant to be
        # handed to other command lines and must not be mixed with messages.
        if not collections:
            sys.stderr.write("No matching collections for {}\n".format(collection_id_part))
            sys.exit(1)
        elif len(collections) > 1:
            sys.stderr.write("Multiple matching collections for {}\n".format(collection_id_part))
            sys.exit(1)
        else:
            collection_ids.append(collections[0]["collection_id"])

    # A set, so that a WARC returned for more than one collection is listed once.
    warc_filepaths = set()
    for collection_id in collection_ids:
        log.debug("Looking up warcs for %s", collection_id)
        warcs = api_client.warcs(collection_id=collection_id, harvest_date_start=args.harvest_start,
                                 harvest_date_end=args.harvest_end,
                                 created_date_start=args.warc_start, created_date_end=args.warc_end)
        warc_filepaths.update(warc["path"] for warc in warcs)
    sep = "\n" if args.newline else " "
    return sep.join(sorted(warc_filepaths))
Ejemplo n.º 2
0
def main(sys_argv):
    """
    Look up the WARC filepaths for one or more collections.

    Exits with status 1, after writing a message to stderr, when a truncated
    collection id matches no collection or matches more than one.

    :param sys_argv: complete argument vector, program name first. It is
        passed in explicitly so that tests can supply their own.
    :return: the distinct WARC paths, sorted and joined by a space.
    """
    # Arguments
    parser = argparse.ArgumentParser(description="Return WARC filepaths for passing to other commandlines.")
    parser.add_argument("--include-web", action="store_true", help="Include WARCs for web harvests.")
    parser.add_argument("--harvest-start", help="ISO8601 datetime after which harvest was performed. For example, "
                                                "2015-02-22T14:49:07Z")
    parser.add_argument("--harvest-end", help="ISO8601 datetime before which harvest was performed. For example, "
                                              "2015-02-22T14:49:07Z")
    default_api_base_url = "http://api:8080"
    parser.add_argument("--api-base-url", help="Base url of the SFM API. Default is {}.".format(default_api_base_url),
                        default=default_api_base_url)
    # The string default and const are run through `type` by argparse, so args.debug ends up a real bool.
    parser.add_argument("--debug", type=lambda v: v.lower() in ("yes", "true", "t", "1"), nargs="?",
                        default="False", const="True")
    parser.add_argument("collection", nargs="+", help="Limit to WARCs of this collection. "
                                                      "Truncated collection ids may be used.")

    # Explicitly using sys.argv so that can mock out for testing.
    args = parser.parse_args(sys_argv[1:])

    # Logging
    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(format='%(asctime)s: %(name)s --> %(message)s', level=log_level)
    logging.getLogger("requests").setLevel(log_level)

    api_client = ApiClient(args.api_base_url)
    collection_ids = []
    for collection_id_part in args.collection:
        log.debug("Looking up collection id part %s", collection_id_part)
        # A 32 character value is taken to be a complete collection id and is not looked up.
        if len(collection_id_part) == 32:
            collection_ids.append(collection_id_part)
        else:
            collections = list(api_client.collections(collection_id_startswith=collection_id_part))
            # Errors are written with sys.stderr.write(), which behaves the same
            # on Python 2 and 3 and keeps messages out of the returned paths.
            if not collections:
                sys.stderr.write("No matching collections for {}\n".format(collection_id_part))
                sys.exit(1)
                # Only reached if sys.exit() has been stubbed out (presumably in tests).
                return
            elif len(collections) > 1:
                sys.stderr.write("Multiple matching collections for {}\n".format(collection_id_part))
                sys.exit(1)
                # Only reached if sys.exit() has been stubbed out (presumably in tests).
                return
            else:
                collection_ids.append(collections[0]["collection_id"])

    # A set, so that a WARC returned for more than one collection is listed once.
    warc_filepaths = set()
    for collection_id in collection_ids:
        log.debug("Looking up warcs for %s", collection_id)
        warcs = api_client.warcs(collection_id=collection_id, harvest_date_start=args.harvest_start,
                                 harvest_date_end=args.harvest_end, exclude_web=not args.include_web)
        warc_filepaths.update(warc["path"] for warc in warcs)
    return " ".join(sorted(warc_filepaths))
Ejemplo n.º 3
0
 def __init__(self, api_base_url, warc_iter_cls, table_cls, working_path,
              mq_config=None, warc_base_path=None, limit_item_types=None, host=None):
     """
     Initialise the consumer and keep the objects needed for exporting.

     :param api_base_url: base url handed to ApiClient
     :param warc_iter_cls: stored as self.warc_iter_cls
     :param table_cls: stored as self.table_cls
     :param working_path: passed on to BaseConsumer
     :param mq_config: passed on to BaseConsumer
     :param warc_base_path: stored as self.warc_base_path (unit tests only)
     :param limit_item_types: stored as self.limit_item_types
     :param host: stored as self.host; when falsy, the HOSTNAME environment
         variable is used, and "localhost" if that is not set either
     """
     BaseConsumer.__init__(self, working_path=working_path, mq_config=mq_config, persist_messages=True)
     self.api_client = ApiClient(api_base_url)

     # Resolve the host name before storing it.
     if not host:
         host = os.environ.get("HOSTNAME", "localhost")
     self.host = host

     # Only supplied by unit tests.
     self.warc_base_path = warc_base_path

     self.limit_item_types = limit_item_types
     self.table_cls = table_cls
     self.warc_iter_cls = warc_iter_cls
Ejemplo n.º 4
0
 def __init__(self, api_base_url, warc_iter_cls, table_cls, working_path,
              mq_config=None, warc_base_path=None,
              limit_item_types=None, host=None):
     """
     Set up the underlying consumer and record the export configuration.

     :param api_base_url: base url used to construct the ApiClient
     :param warc_iter_cls: kept on the instance as warc_iter_cls
     :param table_cls: kept on the instance as table_cls
     :param working_path: forwarded to BaseConsumer
     :param mq_config: forwarded to BaseConsumer
     :param warc_base_path: kept on the instance; intended for unit tests only
     :param limit_item_types: kept on the instance as limit_item_types
     :param host: host name to keep; a falsy value falls back to the HOSTNAME
         environment variable and then to "localhost"
     """
     consumer_kwargs = {
         "mq_config": mq_config,
         "working_path": working_path,
         "persist_messages": True,
     }
     BaseConsumer.__init__(self, **consumer_kwargs)

     self.api_client = ApiClient(api_base_url)
     self.warc_iter_cls, self.table_cls = warc_iter_cls, table_cls
     self.limit_item_types = limit_item_types
     # Unit tests are the only callers expected to pass this.
     self.warc_base_path = warc_base_path
     self.host = host if host else os.environ.get("HOSTNAME", "localhost")
Ejemplo n.º 5
0
class BaseExporter(BaseConsumer):
    """
    Message consumer that exports the contents of WARC files.

    For each export request it looks up the WARCs through the API, writes the
    export into a temporary directory under the working path and then moves
    that directory to the requested export path. A status message is
    published before the export starts and again when it has finished.
    """

    def __init__(self,
                 api_base_url,
                 warc_iter_cls,
                 table_cls,
                 working_path,
                 mq_config=None,
                 warc_base_path=None,
                 limit_item_types=None,
                 host=None):
        """
        :param api_base_url: base url handed to ApiClient
        :param warc_iter_cls: class used to iterate items for json_full exports
        :param table_cls: class used to build the tables for the other formats
        :param working_path: working directory; also passed to BaseConsumer
        :param mq_config: messaging configuration passed to BaseConsumer
        :param warc_base_path: optional directory prepended to WARC paths
            (for unit tests only)
        :param limit_item_types: passed to the WARC iterator for json_full
            exports
        :param host: host reported in status messages; when falsy, the
            HOSTNAME environment variable is used, then "localhost"
        """
        BaseConsumer.__init__(self,
                              mq_config=mq_config,
                              working_path=working_path,
                              persist_messages=True)
        self.api_client = ApiClient(api_base_url)
        self.warc_iter_cls = warc_iter_cls
        self.table_cls = table_cls
        self.limit_item_types = limit_item_types
        # This is for unit tests only.
        self.warc_base_path = warc_base_path
        self.host = host or os.environ.get("HOSTNAME", "localhost")

    def on_message(self):
        """
        Perform the export described by self.message.

        The request must name either a collection or seeds, but not both.
        Problems are recorded on self.result rather than raised, and the
        final status message reports success or failure accordingly.
        """
        assert self.message

        export_id = self.message["id"]
        log.info("Performing export %s", export_id)

        self.result = ExportResult()
        self.result.started = datetime_now()

        # Send status indicating that it is running
        self._send_response_message(STATUS_RUNNING, self.routing_key,
                                    export_id, self.result)

        # Get the WARCs from the API
        collection_id = self.message.get("collection", {}).get("id")
        seed_ids = []
        seed_uids = []
        for seed in self.message.get("seeds", []):
            seed_ids.append(seed["id"])
            seed_uids.append(seed["uid"])

        # Exactly one of collection id / seed ids must be present.
        if (collection_id or seed_ids) and not (collection_id and seed_ids):
            harvest_date_start = self.message.get("harvest_date_start")
            harvest_date_end = self.message.get("harvest_date_end")
            # Only pass seed ids to the API when there are at most 20 of them.
            # If use too many, will cause problems calling API.
            # 20 is an arbitrary number
            warc_paths = self._get_warc_paths(
                collection_id, seed_ids if len(seed_ids) <= 20 else None,
                harvest_date_start, harvest_date_end)
            export_format = self.message["format"]
            export_segment_size = self.message["segment_size"]
            export_path = self.message["path"]
            dedupe = self.message.get("dedupe", False)
            item_date_start = iso8601.parse_date(
                self.message["item_date_start"]
            ) if "item_date_start" in self.message else None
            item_date_end = iso8601.parse_date(
                self.message["item_date_end"]
            ) if "item_date_end" in self.message else None
            temp_path = os.path.join(self.working_path, "tmp")
            base_filepath = os.path.join(temp_path, export_id)

            if warc_paths:

                # Clean the temp directory
                if os.path.exists(temp_path):
                    shutil.rmtree(temp_path)
                os.makedirs(temp_path)

                # We get a lot of bang from PETL
                # Maps export format to (file extension, writer function).
                export_formats = {
                    "csv": ("csv", petl.tocsv),
                    "tsv": ("tsv", petl.totsv),
                    "html": ("html", petl.tohtml),
                    "xlsx": ("xlsx", to_xlsx),
                    "json": ("json", to_lineoriented_json)
                }
                # Other possibilities: XML, databases, HDFS
                if export_format == "json_full":
                    self._full_json_export(warc_paths, base_filepath, dedupe,
                                           item_date_start, item_date_end,
                                           seed_uids, export_segment_size)
                elif export_format == "dehydrate":
                    tables = self.table_cls(warc_paths, dedupe,
                                            item_date_start, item_date_end,
                                            seed_uids, export_segment_size)
                    for idx, table in enumerate(tables):
                        filepath = "{}_{}.txt".format(base_filepath,
                                                      str(idx + 1).zfill(3))
                        log.info("Exporting to %s", filepath)
                        # One line per row, containing only the id field.
                        petl.totext(table,
                                    filepath,
                                    template="{{{}}}\n".format(
                                        tables.id_field()))
                elif export_format in export_formats:
                    tables = self.table_cls(warc_paths, dedupe,
                                            item_date_start, item_date_end,
                                            seed_uids, export_segment_size)
                    for idx, table in enumerate(tables):
                        filepath = "{}_{}.{}".format(
                            base_filepath,
                            str(idx + 1).zfill(3),
                            export_formats[export_format][0])
                        log.info("Exporting to %s", filepath)
                        export_formats[export_format][1](table, filepath)
                        if export_format == 'html':
                            # Wrap the generated table in an html document
                            # that declares its charset.
                            self._file_fix(
                                filepath,
                                prefix="<html><head><meta charset='utf-8'></head>\n",
                                suffix="</html>")
                else:
                    self.result.errors.append(
                        Msg(CODE_UNSUPPORTED_EXPORT_FORMAT,
                            "{} is not supported".format(export_format)))
                    self.result.success = False

                # Move files from temp path to export path
                if os.path.exists(export_path):
                    shutil.rmtree(export_path)
                shutil.move(temp_path, export_path)

            else:
                self.result.errors.append(
                    Msg(CODE_NO_WARCS, "No WARC files from which to export"))
                self.result.success = False

        else:
            self.result.errors.append(
                Msg(CODE_BAD_REQUEST,
                    "Request export of a seed or collection."))
            self.result.success = False

        self.result.ended = datetime_now()
        self._send_response_message(
            STATUS_SUCCESS if self.result.success else STATUS_FAILURE,
            self.routing_key, export_id, self.result)

    def _file_fix(self, filepath, prefix=None, suffix=None):
        """
        Wrap the contents of a file with an optional prefix and suffix.

        The result is streamed into a temp file in the working path, which
        then replaces the original, so the file never has to be loaded into
        memory.

        :param filepath: file to rewrite in place
        :param prefix: text (or bytes) written before the file contents
        :param suffix: text (or bytes) written after the file contents
        """

        def as_bytes(value):
            # NamedTemporaryFile opens in binary mode by default, so text has
            # to be encoded before it can be written.
            if isinstance(value, bytes):
                return value
            return value.encode("utf-8")

        with tempfile.NamedTemporaryFile(dir=self.working_path,
                                         delete=False) as outfile:
            if prefix:
                outfile.write(as_bytes(prefix))
            # Copy raw bytes, and close the source as soon as it is copied.
            with open(filepath, 'rb') as infile:
                shutil.copyfileobj(infile, outfile)
            if suffix:
                outfile.write(as_bytes(suffix))
        shutil.move(outfile.name, filepath)

    def _full_json_export(self, warc_paths, base_filepath, dedupe,
                          item_date_start, item_date_end, seed_uids,
                          export_segment_size):
        """
        Write the items of the WARCs as line-oriented JSON files.

        Files are named <base_filepath>_001.json, _002.json, ... and each
        holds at most export_segment_size items (all items in one file when
        export_segment_size is falsy).
        """

        warcs = self.warc_iter_cls(warc_paths, seed_uids).iter(
            dedupe=dedupe,
            item_date_start=item_date_start,
            item_date_end=item_date_end,
            limit_item_types=self.limit_item_types)

        for idx, statuses in enumerate(
                self._chunk_json(warcs, export_segment_size)):
            export_filepath = "{}_{}.json".format(base_filepath,
                                                  str(idx + 1).zfill(3))
            log.info("Exporting to %s", export_filepath)
            with codecs.open(export_filepath, "w") as f:
                for status in statuses:
                    json.dump(status.item, f)
                    f.write("\n")

    @staticmethod
    def _chunk_json(warcs, chunk_size):
        """
        Lazily split an iterable into consecutive chunks.

        Each chunk is a generator over the shared underlying iterator, so a
        chunk has to be consumed before the next one is requested.

        :param warcs: iterable of items
        :param chunk_size: maximum items per chunk; a falsy value puts all
            items into a single chunk
        :return: generator of chunk generators
        """
        iterable = iter(warcs)
        # The first item of every chunk is taken by the for loop below.
        split_size = chunk_size - 1 if chunk_size else None
        for post in iterable:
            # define the chunk
            def chunk():
                # get the first
                yield post
                # get the rest of the chunk
                for more in islice(iterable, split_size):
                    yield more

            yield chunk()

    def _get_warc_paths(self, collection_id, seed_ids, harvest_date_start,
                        harvest_date_end):
        """
        Get list of WARC files and make sure they exist.

        A WARC that is missing on disk is left out of the result; an error is
        recorded on self.result and the export is marked as failed.

        :return: list of paths of the WARC files that exist
        """
        warc_paths = []
        log.debug("Getting warcs for collection %s", collection_id)
        for warc in self.api_client.warcs(
                collection_id=collection_id,
                seed_ids=seed_ids,
                harvest_date_start=harvest_date_start,
                harvest_date_end=harvest_date_end):
            warc_path = os.path.join(
                self.warc_base_path,
                warc["path"]) if self.warc_base_path else warc["path"]
            if os.path.exists(warc_path):
                warc_paths.append(warc_path)
            else:
                self.result.errors.append(
                    Msg(CODE_WARC_MISSING, "{} is missing".format(warc_path)))
                self.result.success = False
        log.debug("Warcs are %s", warc_paths)
        return warc_paths

    def _send_response_message(self, status, export_request_routing_key,
                               export_id, export_result):
        """
        Publish a status message for an export.

        :param status: status value to report
        :param export_request_routing_key: routing key of the request; the
            response key is derived by replacing "start" with "status"
        :param export_id: id of the export
        :param export_result: result supplying infos, warnings, errors and
            the start / end timestamps
        """
        # Just add additional info to job message
        message = {
            "id": export_id,
            "status": status,
            "infos": [msg.to_map() for msg in export_result.infos],
            "warnings": [msg.to_map() for msg in export_result.warnings],
            "errors": [msg.to_map() for msg in export_result.errors],
            "date_started": export_result.started.isoformat(),
            # This will add spaces before caps
            "service": re.sub(r'(?<=[a-z])(?=[A-Z])', ' ',
                              self.__class__.__name__),
            "host": self.host,
            "instance": str(os.getpid())
        }

        if export_result.ended:
            message["date_ended"] = export_result.ended.isoformat()

        # Routing key may be none, in which case it is passed on unchanged
        # instead of failing on .replace().
        response_routing_key = export_request_routing_key
        if response_routing_key is not None:
            response_routing_key = response_routing_key.replace(
                "start", "status")
        self._publish_message(response_routing_key, message)

    @staticmethod
    def main(cls, queue, routing_keys):
        """
        A configurable main() for an exporter.

        For example:
            if __name__ == "__main__":
                FlickrExporter.main(FlickrExporter, QUEUE, [ROUTING_KEY])

        :param cls: the exporter class
        :param queue: queue for the exporter
        :param routing_keys: list of routing keys for the exporter
        """

        # Logging
        logging.basicConfig(format='%(asctime)s: %(name)s --> %(message)s',
                            level=logging.DEBUG)

        # Arguments
        parser = argparse.ArgumentParser()
        # The string default and const are run through `type` by argparse,
        # so args.debug ends up a real bool.
        parser.add_argument("--debug",
                            type=lambda v: v.lower() in
                            ("yes", "true", "t", "1"),
                            nargs="?",
                            default="False",
                            const="True")

        subparsers = parser.add_subparsers(dest="command")

        service_parser = subparsers.add_parser(
            "service",
            help="Run export service that consumes messages from "
            "messaging queue.")
        service_parser.add_argument("host")
        service_parser.add_argument("username")
        service_parser.add_argument("password")
        service_parser.add_argument("api")
        service_parser.add_argument("working_path")
        service_parser.add_argument("--skip-resume", action="store_true")

        file_parser = subparsers.add_parser("file",
                                            help="Export based on a file.")
        file_parser.add_argument("filepath",
                                 help="Filepath of the export file.")
        file_parser.add_argument("api", help="Base url of SFM-UI API")
        file_parser.add_argument("working_path")
        file_parser.add_argument("--host")
        file_parser.add_argument("--username")
        file_parser.add_argument("--password")

        args = parser.parse_args()

        # Logging
        logging.getLogger().setLevel(
            logging.DEBUG if args.debug else logging.INFO)

        if args.command == "service":
            exporter = cls(args.api,
                           args.working_path,
                           mq_config=MqConfig(args.host, args.username,
                                              args.password, EXCHANGE,
                                              {queue: routing_keys}))
            if not args.skip_resume:
                exporter.resume_from_file()
            exporter.run()
        elif args.command == "file":
            # Messaging is only configured when all three credentials are given.
            mq_config = MqConfig(args.host, args.username, args.password, EXCHANGE, None) \
                if args.host and args.username and args.password else None
            exporter = cls(args.api, args.working_path, mq_config=mq_config)
            exporter.message_from_file(args.filepath)
            if exporter.result:
                log.info("Result is: %s", exporter.result)
                sys.exit(0)
            else:
                log.warning("Result is: %s", exporter.result)
                sys.exit(1)
Ejemplo n.º 6
0
 def setUp(self):
     """Create the ApiClient used by the tests."""
     api_base_url = "http://192.168.99.100:8081/"
     self.client = ApiClient(api_base_url)
Ejemplo n.º 7
0
class TestApiClient(TestCase):
    """
    Tests for ApiClient.warcs() and ApiClient.collections().

    Every test method is wrapped in vcr.use_cassette().
    """

    def setUp(self):
        self.client = ApiClient("http://192.168.99.100:8081/")

    def _assert_warc_count(self, expected, **filters):
        """Assert that warcs(**filters) yields exactly `expected` items."""
        found = list(self.client.warcs(**filters))
        self.assertEqual(expected, len(found))

    def _assert_collection_count(self, expected, **filters):
        """Assert that collections(**filters) yields exactly `expected` items."""
        found = list(self.client.collections(**filters))
        self.assertEqual(expected, len(found))

    @vcr.use_cassette()
    def test_all_warcs(self):
        self._assert_warc_count(3)

    @vcr.use_cassette()
    def test_warcs_by_collection(self):
        self._assert_warc_count(2, collection_id="005b131f5f854402afa2b08a4b7ba960")
        self._assert_warc_count(0, collection_id="x005b131f5f854402afa2b08a4b7ba960")

    @vcr.use_cassette()
    def test_warcs_by_seed(self):
        # seed_ids is given both as a single string and as a list.
        self._assert_warc_count(2, seed_ids="ded3849618b04818ae100a489d67d395")
        self._assert_warc_count(0, seed_ids="xded3849618b04818ae100a489d67d395")
        self._assert_warc_count(2, seed_ids=["ded3849618b04818ae100a489d67d395"])
        self._assert_warc_count(2, seed_ids=["ded3849618b04818ae100a489d67d395", "x"])
        self._assert_warc_count(
            3, seed_ids=["48722ac6154241f592fd74da775b7ab7", "3ce76759a3ee40b894562a35359dfa54"])

    @vcr.use_cassette()
    def test_warcs_by_harvest_date_start(self):
        self._assert_warc_count(3, harvest_date_start="2015-02-22T14:49:07Z")
        self._assert_warc_count(1, harvest_date_start="2016-02-22T14:49:07Z")
        self._assert_warc_count(0, harvest_date_start="2017-02-22T14:48:07Z")

    @vcr.use_cassette()
    def test_warcs_by_harvest_date_end(self):
        self._assert_warc_count(0, harvest_date_end="2015-02-22T14:49:07Z")
        self._assert_warc_count(2, harvest_date_end="2016-02-22T14:49:07Z")
        self._assert_warc_count(3, harvest_date_end="2017-02-22T14:48:07Z")

    @vcr.use_cassette()
    def test_exclude_web(self):
        self._assert_warc_count(4, exclude_web=True)
        self._assert_warc_count(5, exclude_web=False)

    @vcr.use_cassette()
    def test_all_collections(self):
        self._assert_collection_count(5)

    @vcr.use_cassette()
    def test_collections_startswith(self):
        self._assert_collection_count(1, collection_id_startswith="8fcb71eb883745")
        self._assert_collection_count(0, collection_id_startswith="x8fcb71eb883745")
Ejemplo n.º 8
0
class BaseExporter(BaseConsumer):
    """
    Message consumer that exports the contents of WARC files.

    For each export request it looks up the WARCs through the API, writes the
    export into a temporary directory under the working path and then moves
    that directory to the requested export path. A status message is
    published before the export starts and again when it has finished.
    """

    def __init__(self, api_base_url, warc_iter_cls, table_cls, working_path, mq_config=None, warc_base_path=None,
                 limit_item_types=None, host=None):
        """
        :param api_base_url: base url handed to ApiClient
        :param warc_iter_cls: class used to iterate items for json_full exports
        :param table_cls: class used to build the tables for the other formats
        :param working_path: working directory; also passed to BaseConsumer
        :param mq_config: messaging configuration passed to BaseConsumer
        :param warc_base_path: optional directory prepended to WARC paths (for unit tests only)
        :param limit_item_types: passed to the WARC iterator for json_full exports
        :param host: host reported in status messages; when falsy, the HOSTNAME environment
            variable is used, then "localhost"
        """
        BaseConsumer.__init__(self, mq_config=mq_config, working_path=working_path, persist_messages=True)
        self.api_client = ApiClient(api_base_url)
        self.warc_iter_cls = warc_iter_cls
        self.table_cls = table_cls
        self.limit_item_types = limit_item_types
        # This is for unit tests only.
        self.warc_base_path = warc_base_path
        self.host = host or os.environ.get("HOSTNAME", "localhost")

    def on_message(self):
        """
        Perform the export described by self.message.

        The request must name either a collection or seeds, but not both.
        Problems are recorded on self.result rather than raised, and the
        final status message reports success or failure accordingly.
        """
        assert self.message

        export_id = self.message["id"]
        log.info("Performing export %s", export_id)

        self.result = ExportResult()
        self.result.started = datetime_now()

        # Send status indicating that it is running
        self._send_response_message(STATUS_RUNNING, self.routing_key, export_id, self.result)

        # Get the WARCs from the API
        collection_id = self.message.get("collection", {}).get("id")
        seed_ids = []
        seed_uids = []
        for seed in self.message.get("seeds", []):
            seed_ids.append(seed["id"])
            seed_uids.append(seed["uid"])

        # Exactly one of collection id / seed ids must be present.
        if (collection_id or seed_ids) and not (collection_id and seed_ids):
            harvest_date_start = self.message.get("harvest_date_start")
            harvest_date_end = self.message.get("harvest_date_end")
            warc_paths = self._get_warc_paths(collection_id, seed_ids, harvest_date_start, harvest_date_end)
            export_format = self.message["format"]
            export_segment_size = self.message["segment_size"]
            export_path = self.message["path"]
            dedupe = self.message.get("dedupe", False)
            item_date_start = iso8601.parse_date(
                self.message["item_date_start"]) if "item_date_start" in self.message else None
            item_date_end = iso8601.parse_date(
                self.message["item_date_end"]) if "item_date_end" in self.message else None
            temp_path = os.path.join(self.working_path, "tmp")
            base_filepath = os.path.join(temp_path, export_id)

            if warc_paths:

                # Clean the temp directory
                if os.path.exists(temp_path):
                    shutil.rmtree(temp_path)
                os.makedirs(temp_path)

                # We get a lot of bang from PETL
                # Maps export format to (file extension, writer function).
                export_formats = {
                    "csv": ("csv", petl.tocsv),
                    "tsv": ("tsv", petl.totsv),
                    "html": ("html", petl.tohtml),
                    "xlsx": ("xlsx", to_xlsx),
                    "json": ("json", to_lineoriented_json)
                }
                # Other possibilities: XML, databases, HDFS
                if export_format == "json_full":
                    self._full_json_export(warc_paths, base_filepath, dedupe, item_date_start, item_date_end, seed_uids,
                                           export_segment_size)
                elif export_format == "dehydrate":
                    tables = self.table_cls(warc_paths, dedupe, item_date_start, item_date_end, seed_uids,
                                            export_segment_size)
                    for idx, table in enumerate(tables):
                        filepath = "{}_{}.txt".format(base_filepath, str(idx + 1).zfill(3))
                        log.info("Exporting to %s", filepath)
                        # One line per row, containing only the id field.
                        petl.totext(table, filepath, template="{{{}}}\n".format(tables.id_field()))
                elif export_format in export_formats:
                    tables = self.table_cls(warc_paths, dedupe, item_date_start, item_date_end, seed_uids,
                                            export_segment_size)
                    for idx, table in enumerate(tables):
                        filepath = "{}_{}.{}".format(base_filepath, str(idx + 1).zfill(3),
                                                     export_formats[export_format][0])
                        log.info("Exporting to %s", filepath)
                        export_formats[export_format][1](table, filepath)
                        if export_format == 'html':
                            # Wrap the generated table in an html document that declares its charset.
                            self._file_fix(filepath, prefix="<html><head><meta charset='utf-8'></head>\n",
                                           suffix="</html>")
                else:
                    self.result.errors.append(
                        Msg(CODE_UNSUPPORTED_EXPORT_FORMAT, "{} is not supported".format(export_format)))
                    self.result.success = False

                # Move files from temp path to export path
                if os.path.exists(export_path):
                    shutil.rmtree(export_path)
                shutil.move(temp_path, export_path)

            else:
                self.result.errors.append(Msg(CODE_NO_WARCS, "No WARC files from which to export"))
                self.result.success = False

        else:
            self.result.errors.append(Msg(CODE_BAD_REQUEST, "Request export of a seed or collection."))
            self.result.success = False

        self.result.ended = datetime_now()
        self._send_response_message(STATUS_SUCCESS if self.result.success else STATUS_FAILURE, self.routing_key,
                                    export_id, self.result)

    def _file_fix(self, filepath, prefix=None, suffix=None):
        """
        Wrap the contents of a file with an optional prefix and suffix.

        The result is streamed into a temp file in the working path, which
        then replaces the original, so the file never has to be loaded into
        memory.

        :param filepath: file to rewrite in place
        :param prefix: text (or bytes) written before the file contents
        :param suffix: text (or bytes) written after the file contents
        """

        def as_bytes(value):
            # NamedTemporaryFile opens in binary mode by default, so text has
            # to be encoded before it can be written.
            if isinstance(value, bytes):
                return value
            return value.encode("utf-8")

        with tempfile.NamedTemporaryFile(dir=self.working_path, delete=False) as outfile:
            if prefix:
                outfile.write(as_bytes(prefix))
            # open() replaces the file() builtin, which no longer exists on
            # Python 3; the with block also closes the source after copying.
            with open(filepath, 'rb') as infile:
                shutil.copyfileobj(infile, outfile)
            if suffix:
                outfile.write(as_bytes(suffix))
        shutil.move(outfile.name, filepath)

    def _full_json_export(self, warc_paths, base_filepath, dedupe, item_date_start, item_date_end, seed_uids,
                          export_segment_size):
        """
        Write the items of the WARCs as line-oriented JSON files.

        Files are named <base_filepath>_001.json, _002.json, ... and each
        holds at most export_segment_size items (all items in one file when
        export_segment_size is falsy).
        """

        warcs = self.warc_iter_cls(warc_paths, seed_uids).iter(dedupe=dedupe,
                                                               item_date_start=item_date_start,
                                                               item_date_end=item_date_end,
                                                               limit_item_types=self.limit_item_types)

        for idx, statuses in enumerate(self._chunk_json(warcs, export_segment_size)):
            export_filepath = "{}_{}.json".format(base_filepath, str(idx + 1).zfill(3))
            log.info("Exporting to %s", export_filepath)
            with codecs.open(export_filepath, "w") as f:
                for status in statuses:
                    json.dump(status.item, f)
                    f.write("\n")

    def _chunk_json(self, warcs, chunk_size):
        """
        Lazily split an iterable into consecutive chunks.

        Each chunk is a generator over the shared underlying iterator, so a
        chunk has to be consumed before the next one is requested.

        :param warcs: iterable of items
        :param chunk_size: maximum items per chunk; a falsy value puts all items into a single chunk
        :return: generator of chunk generators
        """
        iterable = iter(warcs)
        # The first item of every chunk is taken by the for loop below.
        split_size = chunk_size - 1 if chunk_size else None
        for post in iterable:
            # define the chunk
            def chunk():
                # get the first
                yield post
                # get the rest of the chunk
                for more in islice(iterable, split_size):
                    yield more

            yield chunk()

    def _get_warc_paths(self, collection_id, seed_ids, harvest_date_start, harvest_date_end):
        """
        Get list of WARC files and make sure they exist.

        The API is queried with exclude_web=True. A WARC that is missing on
        disk is left out of the result; an error is recorded on self.result
        and the export is marked as failed.

        :return: list of paths of the WARC files that exist
        """
        warc_paths = []
        log.debug("Getting warcs for collection %s", collection_id)
        for warc in self.api_client.warcs(collection_id=collection_id, seed_ids=seed_ids,
                                          harvest_date_start=harvest_date_start, harvest_date_end=harvest_date_end,
                                          exclude_web=True):
            warc_path = os.path.join(self.warc_base_path, warc["path"]) if self.warc_base_path else warc["path"]
            if os.path.exists(warc_path):
                warc_paths.append(warc_path)
            else:
                self.result.errors.append(Msg(CODE_WARC_MISSING, "{} is missing".format(warc_path)))
                self.result.success = False
        log.debug("Warcs are %s", warc_paths)
        return warc_paths

    def _send_response_message(self, status, export_request_routing_key, export_id, export_result):
        """
        Publish a status message for an export.

        :param status: status value to report
        :param export_request_routing_key: routing key of the request; the response key is derived
            by replacing "start" with "status"
        :param export_id: id of the export
        :param export_result: result supplying infos, warnings, errors and the start / end timestamps
        """
        # Just add additional info to job message
        message = {
            "id": export_id,
            "status": status,
            "infos": [msg.to_map() for msg in export_result.infos],
            "warnings": [msg.to_map() for msg in export_result.warnings],
            "errors": [msg.to_map() for msg in export_result.errors],
            "date_started": export_result.started.isoformat(),
            # This will add spaces before caps
            "service": re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', self.__class__.__name__),
            "host": self.host,
            "instance": str(os.getpid())
        }

        if export_result.ended:
            message["date_ended"] = export_result.ended.isoformat()

        # Routing key may be none, in which case it is passed on unchanged instead of failing on .replace().
        response_routing_key = export_request_routing_key
        if response_routing_key is not None:
            response_routing_key = response_routing_key.replace("start", "status")
        self._publish_message(response_routing_key, message)

    @staticmethod
    def main(cls, queue, routing_keys):
        """
        A configurable main() for an exporter.

        For example:
            if __name__ == "__main__":
                FlickrExporter.main(FlickrExporter, QUEUE, [ROUTING_KEY])

        :param cls: the exporter class
        :param queue: queue for the exporter
        :param routing_keys: list of routing keys for the exporter
        """

        # Logging
        logging.basicConfig(format='%(asctime)s: %(name)s --> %(message)s', level=logging.DEBUG)

        # Arguments
        parser = argparse.ArgumentParser()
        # The string default and const are run through `type` by argparse, so args.debug ends up a real bool.
        parser.add_argument("--debug", type=lambda v: v.lower() in ("yes", "true", "t", "1"), nargs="?",
                            default="False", const="True")

        subparsers = parser.add_subparsers(dest="command")

        service_parser = subparsers.add_parser("service", help="Run export service that consumes messages from "
                                                               "messaging queue.")
        service_parser.add_argument("host")
        service_parser.add_argument("username")
        service_parser.add_argument("password")
        service_parser.add_argument("api")
        service_parser.add_argument("working_path")
        service_parser.add_argument("--skip-resume", action="store_true")

        file_parser = subparsers.add_parser("file", help="Export based on a file.")
        file_parser.add_argument("filepath", help="Filepath of the export file.")
        file_parser.add_argument("api", help="Base url of SFM-UI API")
        file_parser.add_argument("working_path")
        file_parser.add_argument("--host")
        file_parser.add_argument("--username")
        file_parser.add_argument("--password")

        args = parser.parse_args()

        # Logging
        logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)

        if args.command == "service":
            exporter = cls(args.api, args.working_path,
                           mq_config=MqConfig(args.host, args.username, args.password, EXCHANGE,
                                              {queue: routing_keys}))
            if not args.skip_resume:
                exporter.resume_from_file()
            exporter.run()
        elif args.command == "file":
            # Messaging is only configured when all three credentials are given.
            mq_config = MqConfig(args.host, args.username, args.password, EXCHANGE, None) \
                if args.host and args.username and args.password else None
            exporter = cls(args.api, args.working_path, mq_config=mq_config)
            exporter.message_from_file(args.filepath)
            if exporter.result:
                log.info("Result is: %s", exporter.result)
                sys.exit(0)
            else:
                log.warning("Result is: %s", exporter.result)
                sys.exit(1)
Ejemplo n.º 9
0
 def setUp(self):
     """Create the ApiClient used by the tests."""
     api_base_url = "http://localhost:8080/"
     self.client = ApiClient(api_base_url)
Ejemplo n.º 10
0
class TestApiClient(TestCase):
    def setUp(self):
        """Build the ApiClient that the tests use, targeting http://localhost:8080/."""
        base_url = "http://localhost:8080/"
        self.client = ApiClient(base_url)

    @vcr.use_cassette()
    def test_all_warcs(self):
        """An unfiltered WARC listing is expected to contain exactly three items."""
        all_warcs = list(self.client.warcs())
        self.assertEqual(3, len(all_warcs))

    @vcr.use_cassette()
    def test_warcs_by_collection(self):
        """Filter WARCs by collection id.

        The known collection id is expected to match two WARCs; the same id
        with a leading "x" is expected to match none.
        """
        # (expected number of WARCs, collection_id passed to the client)
        cases = (
            (2, "366439dbb28146a9bd439dcc3f076c70"),
            (0, "x366439dbb28146a9bd439dcc3f076c70"),
        )
        for expected_count, collection_id in cases:
            found = list(self.client.warcs(collection_id=collection_id))
            self.assertEqual(expected_count, len(found))

    @vcr.use_cassette()
    def test_warcs_by_seed(self):
        """Filter WARCs by seed id, given as a single string or as a list.

        Expected counts: two for the first seed (bare or wrapped in a list),
        none for that id with a leading "x", still two when the list also
        holds the id "x", and three when both seed ids are given.
        """
        # (expected number of WARCs, seed_ids argument); the order of the
        # cases matches the order in which the requests are issued.
        cases = (
            (2, "4117a0b5c42646589f5dc81b0fa5eb0c"),
            (0, "x4117a0b5c42646589f5dc81b0fa5eb0c"),
            (2, ["4117a0b5c42646589f5dc81b0fa5eb0c"]),
            (2, ["4117a0b5c42646589f5dc81b0fa5eb0c", "x"]),
            (3, ["4117a0b5c42646589f5dc81b0fa5eb0c",
                 "c07e9e180dd24abcac700d1934bda3d1"]),
        )
        for expected_count, seed_ids in cases:
            found = list(self.client.warcs(seed_ids=seed_ids))
            self.assertEqual(expected_count, len(found))

    @vcr.use_cassette()
    def test_warcs_by_harvest_date_start(self):
        """Filter WARCs by harvest start date.

        The later the harvest_date_start cut-off, the fewer WARCs are
        expected: three, then one, then none.
        """
        # (expected number of WARCs, harvest_date_start passed to the client)
        cases = (
            (3, "2017-05-25T13:57:47.980000Z"),
            (1, "2018-05-25T13:56:47.980000Z"),
            (0, "2019-05-25T13:57:47.980000Z"),
        )
        for expected_count, start in cases:
            found = list(self.client.warcs(harvest_date_start=start))
            self.assertEqual(expected_count, len(found))

    @vcr.use_cassette()
    def test_warcs_by_warc_created_date(self):
        """Check WARC counts for three increasingly late start dates.

        Expected counts are three, then one, then none.
        """
        # NOTE(review): despite the test name, every call below filters on
        # harvest_date_start, exactly like test_warcs_by_harvest_date_start.
        # Presumably a WARC-created-date filter was intended -- confirm
        # against ApiClient.warcs (and any recorded cassette) before changing.
        def count_from(start):
            return len(list(self.client.warcs(harvest_date_start=start)))

        self.assertEqual(3, count_from("2017-05-25T13:57:47.980000Z"))
        self.assertEqual(1, count_from("2018-05-25T13:56:47.980000Z"))
        self.assertEqual(0, count_from("2019-05-25T13:57:47.980000Z"))

    @vcr.use_cassette()
    def test_all_collections(self):
        """An unfiltered collection listing is expected to contain exactly two items."""
        all_collections = list(self.client.collections())
        self.assertEqual(2, len(all_collections))

    @vcr.use_cassette()
    def test_collections_startswith(self):
        """Filter collections by a collection-id prefix.

        The prefix "366439dbb" is expected to match one collection; the same
        prefix with a leading "x" is expected to match none.
        """
        def count_with_prefix(prefix):
            matches = self.client.collections(collection_id_startswith=prefix)
            return len(list(matches))

        self.assertEqual(1, count_with_prefix("366439dbb"))
        self.assertEqual(0, count_with_prefix("x366439dbb"))