Example #1
def objects_manifest_publish(
    ctx,
    file,
    thread_num,
    append_urls,
    manifest_file_delimiter,
    out_manifest_file,
):
    auth = ctx.obj["auth_factory"].get()
    loop = get_or_create_event_loop_for_thread()

    if not file:
        file = click.prompt("Enter Discovery metadata file path to publish")

    click.echo(
        f"Publishing/writing object data from {file}...\n    to: {auth.endpoint}"
    )

    index_object_manifest(
        commons_url=auth.endpoint,
        manifest_file=file,
        thread_num=thread_num,
        auth=auth,
        replace_urls=not append_urls,
        manifest_file_delimiter=manifest_file_delimiter,
        output_filename=out_manifest_file,
        submit_additional_metadata_columns=True,
    )
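
Every example on this page obtains its event loop through get_or_create_event_loop_for_thread() before running async work. The helper's implementation is not shown here; the following is a minimal sketch of what such a helper typically does, assuming it simply falls back to creating and registering a new loop when the current thread does not have one:

import asyncio


def get_or_create_event_loop_for_thread():
    # Sketch only: reuse the loop already bound to this thread if possible,
    # otherwise create a fresh loop and register it for this thread
    try:
        return asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        return loop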
Example #2
def write_page_records_to_files(commons_url, pages, num_processes,
                                max_concurrent_requests):
    """
    Command line interface function for requesting a number of pages of
    records from indexd and writing to a file in that process. num_processes
    is only used to calculate how many open connections this process should request.

    Args:
        commons_url (str): root domain for commons where indexd lives
        pages (List[int/str]): List of indexd pages to request
        num_processes (int): number of concurrent processes being requested
            (including this one)
        max_concurrent_requests (int): the maximum number of concurrent requests allowed
            NOTE: This is the TOTAL number, not just for this process. Used to help
            determine how many requests a process should be making at one time

    Raises:
        AttributeError: No pages specified to get records from
    """
    if not pages:
        raise AttributeError("No pages specified to get records from.")

    pages = pages.strip().split(",")
    loop = get_or_create_event_loop_for_thread()

    result = loop.run_until_complete(
        _get_records_and_write_to_file(commons_url, pages, num_processes,
                                       max_concurrent_requests))
    return result
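
Although the docstring describes pages as a list, the body calls pages.strip().split(","), so callers effectively pass a comma-separated string (for example, straight from a CLI option). A hedged usage sketch; the commons URL is a placeholder and the indexd helpers used inside the function are assumed to be importable:

# Hypothetical call: pages arrive as a comma-separated string and are
# split inside the function before the async download starts.
result = write_page_records_to_files(
    commons_url="https://example-commons.org/",  # placeholder domain
    pages="0,1,2,3",
    num_processes=4,
    max_concurrent_requests=24,
)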
Example #3
async def _parse_from_queue(queue, lock, commons_url, output_queue,
                            metadata_source):
    """
    Keep getting items from the queue and verifying that mds contains the expected
    fields from that row. If there are any issues, log errors into a file. Return
    when nothing is left in the queue.

    Args:
        queue (asyncio.Queue): queue to read mds records from
        lock (asyncio.Semaphore): semaphore used to limit the amount of concurrent http
            connections
        commons_url (str): root domain for commons where mds lives
        output_queue (asyncio.Queue): queue for output
        metadata_source (str): the source of the metadata you are verifying, in practice
            this means the first nested section in the metadata service
    """
    loop = get_or_create_event_loop_for_thread()

    while not queue.empty():
        row = await queue.get()

        guid = manifest_row_parsers["guid"](row)
        metadata = manifest_row_parsers["metadata"](row)

        actual_record = await _get_record_from_mds(guid, commons_url, lock)
        if not actual_record:
            output = f"{guid}|no_record|expected {row}|actual None\n"
            await output_queue.put(output)
            logging.error(output)
        else:
            logging.info(f"verifying {guid}...")

            for expected_key, expected_value in metadata.items():
                actual_value_raw = actual_record.get(metadata_source,
                                                     {}).get(expected_key)
                try:
                    actual_value = json.loads(str(actual_value_raw))
                except json.decoder.JSONDecodeError as exc:
                    actual_value = actual_value_raw

                try:
                    expected_value = json.loads(str(expected_value))
                except json.decoder.JSONDecodeError as exc:
                    pass

                # compare as dicts if necessary, otherwise just compare values
                if (isinstance(expected_value, Mapping) and
                        not _are_matching_dicts(expected_value, actual_value)
                    ) or (actual_value != expected_value):
                    output = f"{guid}|{metadata_source}.{expected_key}|expected {expected_value}|actual {actual_value}\n"
                    await output_queue.put(output)
                    logging.error(output)
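
The metadata check above normalizes both the expected and actual values with json.loads before comparing, falling back to the raw value when parsing fails. A small standalone sketch of that parse-or-keep-raw pattern; the helper name is illustrative and not part of the library:

import json


def _coerce_json(value):
    # Parse JSON-looking strings so "[1, 2]" and [1, 2] compare equal;
    # keep the raw value when it is not valid JSON
    try:
        return json.loads(str(value))
    except json.decoder.JSONDecodeError:
        return value


assert _coerce_json("[1, 2]") == [1, 2]
assert _coerce_json("not json") == "not json"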
Example #4
def objects_manifest_read(ctx, output_file, num_processes, max_concurrent_requests):
    auth = ctx.obj["auth_factory"].get()
    loop = get_or_create_event_loop_for_thread()
    click.echo(f"Getting minimal object metadata from {auth.endpoint}")
    loop.run_until_complete(
        indexing.async_download_object_manifest(
            auth.endpoint,
            output_filename=output_file,
            num_processes=num_processes,
            max_concurrent_requests=max_concurrent_requests,
        )
    )
    click.echo(output_file)
Example #5
def discovery_read(ctx, limit, agg):
    """
    Download the metadata used to populate a commons' discovery page into a TSV.
    Outputs the TSV filename with format {commons-url}-discovery_metadata.tsv
    """
    auth = ctx.obj["auth_factory"].get()
    loop = get_or_create_event_loop_for_thread()
    endpoint = ctx.obj.get("endpoint")
    output_file = loop.run_until_complete(
        output_expanded_discovery_metadata(auth,
                                           endpoint=endpoint,
                                           limit=limit,
                                           use_agg_mds=agg))

    click.echo(output_file)
Example #6
def discovery_publish(ctx, file, use_default_file, omit_empty):
    """
    Run a discovery metadata ingestion on a given metadata TSV file with guid column.
    If [FILE] is omitted and --default-file not set, prompts for TSV file name.
    """
    auth = ctx.obj["auth_factory"].get()
    if not file and not use_default_file:
        file = click.prompt("Enter discovery metadata TSV file to publish")

    loop = get_or_create_event_loop_for_thread()
    endpoint = ctx.obj.get("endpoint")
    loop.run_until_complete(
        publish_discovery_metadata(auth,
                                   file,
                                   endpoint=endpoint,
                                   omit_empty_values=omit_empty))
Example #7
def objects_manifest_delete_all_guids(ctx, file):
    auth = ctx.obj["auth_factory"].get()
    loop = get_or_create_event_loop_for_thread()

    if not file:
        file = click.prompt("Enter Discovery metadata file path to delete")

    click.echo(f"        DELETING ALL GUIDS\n  from: {file}\n    in: {auth.endpoint}")
    click.confirm(
        f"Are you sure you want to DELETE ALL GUIDS in {auth.endpoint} as specified by this file: {file}?",
        abort=True,
    )
    click.confirm(
        f"Please confirm again, this is irreversible. All GUIDs specified specified in {file} will be deleted from {auth.endpoint}. You are sure?",
        abort=True,
    )

    delete_all_guids(auth, file)
Example #8
async def _parse_from_queue(queue):
    """
    Read from the queue and write to a file

    Args:
        queue (asyncio.Queue): queue to read indexd records from
    """
    loop = get_or_create_event_loop_for_thread()

    file_name = TMP_FOLDER + f"{os.getpid()}.csv"
    async with aiofiles.open(file_name, "w+", encoding="utf8") as file:
        logging.info(f"Write to {file_name}")
        csv_writer = csv.writer(file)

        records = await queue.get()
        while records != "DONE":
            if records:
                for record in list(records):
                    manifest_row = [
                        record.get("did"),
                        " ".join(
                            sorted([
                                url.replace(" ", "%20")
                                for url in record.get("urls")
                            ])),
                        " ".join(
                            sorted([
                                auth.replace(" ", "%20")
                                for auth in record.get("authz")
                            ])),
                        " ".join(
                            sorted([
                                a.replace(" ", "%20")
                                for a in record.get("acl")
                            ])),
                        record.get("hashes", {}).get("md5"),
                        record.get("size"),
                        record.get("file_name"),
                    ]
                    # csv.writer.writerow returns the result of the underlying
                    # file object's write(), which with aiofiles is awaitable
                    await csv_writer.writerow(manifest_row)

            records = await queue.get()

        await file.flush()
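
The worker above keeps pulling batches of indexd records from an asyncio.Queue until it sees the "DONE" sentinel, writing one CSV per process. A hedged driver sketch, assuming asyncio is imported and _parse_from_queue plus its dependencies (aiofiles, TMP_FOLDER) are available; the record content is purely illustrative:

async def _drive_example(records):
    # Illustrative producer: enqueue one batch of records, then the sentinel
    queue = asyncio.Queue()
    await queue.put(records)
    await queue.put("DONE")
    await _parse_from_queue(queue)


sample_batch = [
    {
        "did": "example-guid",  # placeholder GUID
        "urls": ["s3://bucket/example file.txt"],
        "authz": ["/programs/example"],
        "acl": ["*"],
        "hashes": {"md5": "d41d8cd98f00b204e9800998ecf8427e"},
        "size": 0,
        "file_name": "example file.txt",
    }
]

loop = get_or_create_event_loop_for_thread()
loop.run_until_complete(_drive_example(sample_batch))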
Example #9
def objects_manifest_verify(ctx, file, max_concurrent_requests):
    auth = ctx.obj["auth_factory"].get()
    loop = get_or_create_event_loop_for_thread()

    if not file:
        file = click.prompt("Enter Discovery metadata file path to publish")

    click.echo(f"Verifying {file}...\n    Against: {auth.endpoint}")

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    loop.run_until_complete(
        indexing.async_verify_object_manifest(
            auth.endpoint,
            manifest_file=file,
            max_concurrent_requests=max_concurrent_requests,
        )
    )
Example #10
async def _parse_from_queue(queue, lock, commons_url, output_queue):
    """
    Keep getting items from the queue and verifying that indexd contains the expected
    fields from that row. If there are any issues, log errors into a file. Return
    when nothing is left in the queue.

    Args:
        queue (asyncio.Queue): queue to read indexd records from
        lock (asyncio.Semaphore): semaphore used to limit the amount of concurrent http
            connections
        commons_url (str): root domain for commons where indexd lives
        output_queue (asyncio.Queue): queue for output
    """
    loop = get_or_create_event_loop_for_thread()

    row = await queue.get()

    while row != "DONE":
        guid = manifest_row_parsers["guid"](row)
        authz = manifest_row_parsers["authz"](row)
        acl = manifest_row_parsers["acl"](row)
        file_size = manifest_row_parsers["file_size"](row)
        md5 = manifest_row_parsers["md5"](row)
        urls = manifest_row_parsers["urls"](row)
        file_name = manifest_row_parsers["file_name"](row)

        actual_record = await _get_record_from_indexd(guid, commons_url, lock)
        if not actual_record:
            output = f"{guid}|no_record|expected {row}|actual None\n"
            await output_queue.put(output)
            logging.error(output)
        else:
            logging.info(f"verifying {guid}...")

            if sorted(authz) != sorted(actual_record["authz"]):
                output = (
                    f"{guid}|authz|expected {authz}|actual {actual_record['authz']}\n"
                )
                await output_queue.put(output)
                logging.error(output)

            if sorted(acl) != sorted(actual_record["acl"]):
                output = f"{guid}|acl|expected {acl}|actual {actual_record['acl']}\n"
                await output_queue.put(output)
                logging.error(output)

            if file_size != actual_record["size"]:
                if (not file_size and file_size != 0
                        and not actual_record["size"]
                        and actual_record["size"] != 0):
                    # actual and expected are both either empty string or None
                    # so even though they're not equal, they represent null value so
                    # we don't need to consider this an error in validation
                    pass
                else:
                    output = f"{guid}|file_size|expected {file_size}|actual {actual_record['size']}\n"
                    await output_queue.put(output)
                    logging.error(output)

            if md5 != actual_record["hashes"].get("md5"):
                if (not md5 and md5 != 0
                        and not actual_record["hashes"].get("md5")
                        and actual_record["hashes"].get("md5") != 0):
                    # actual and expected are both either empty string or None
                    # so even though they're not equal, they represent null value so
                    # we don't need to consider this an error in validation
                    pass
                else:
                    output = f"{guid}|md5|expected {md5}|actual {actual_record['hashes'].get('md5')}\n"
                    await output_queue.put(output)
                    logging.error(output)
            urls = [url.replace("%20", " ") for url in urls]
            if sorted(urls) != sorted(actual_record["urls"]):
                output = f"{guid}|urls|expected {urls}|actual {actual_record['urls']}\n"
                await output_queue.put(output)
                logging.error(output)

            if not actual_record["file_name"] and file_name:
                # if the actual record name is "" or None but something was specified
                # in the manifest, we have a problem
                output = f"{guid}|file_name|expected {file_name}|actual {actual_record['file_name']}\n"
                await output_queue.put(output)
                logging.error(output)

        row = await queue.get()
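
Both the file_size and md5 branches above treat an empty string and None as the same "no value" case before reporting a mismatch. That condition could be factored into a small predicate; a hypothetical sketch, not a helper the library defines:

def _both_null_like(expected, actual):
    # True when both sides are falsy but not the legitimate value 0,
    # i.e. each is "" or None, so the difference is not a validation error
    return (not expected and expected != 0) and (not actual and actual != 0)


assert _both_null_like("", None) is True
assert _both_null_like(0, None) is False  # 0 is a real value, not missing
assert _both_null_like("abc123", None) is False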