Code example #1
def index_client(indexd_server):
    """
    Handles getting all the docs from an
    indexing endpoint. Currently this is changing from
    signpost to indexd, so we'll use just indexd_client now.
    I.E. test to a common interface this could be multiply our
    tests:
    https://docs.pytest.org/en/latest/fixture.html#parametrizing-fixtures
    """
    setup_database()
    client = Gen3Index(indexd_server.baseurl,
                       create_user("admin", "admin"),
                       service_location="")
    yield client
    clear_database()
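A minimal sketch of a pytest test consuming this fixture, assuming indexd_server, setup_database, clear_database, and create_user are provided elsewhere in the test suite:

def test_indexd_is_healthy(index_client):
    # the fixture yields a ready Gen3Index client pointed at the test indexd server
    assert index_client.is_healthy()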
Code example #2
async def _get_record_from_indexd(guid, commons_url, lock):
    """
    Gets a semaphore then requests a record for the given guid

    Args:
        guid (str): indexd record globally unique id
        commons_url (str): root domain for commons where indexd lives
        lock (asyncio.Semaphore): semaphore used to limit the number of concurrent
            HTTP connections
    """
    index = Gen3Index(commons_url)
    async with lock:
        # default ssl handling unless it's explicitly http://
        ssl = None
        if "https" not in commons_url:
            ssl = False

        return await index.async_get_record(guid, _ssl=ssl)
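A hedged sketch of driving this helper from an event loop; the GUID, commons URL, and concurrency limit below are placeholders:

import asyncio

async def _example():
    # one semaphore shared by all tasks caps concurrent HTTP connections
    lock = asyncio.Semaphore(5)
    record = await _get_record_from_indexd(
        "dg.1234/00000000-0000-0000-0000-000000000000",  # hypothetical GUID
        "https://example-commons.org",                   # hypothetical commons URL
        lock,
    )
    print(record)

asyncio.run(_example())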
Code example #3
async def _get_with_params_from_indexd(params, commons_url, lock):
    """
    Gets a semaphore then requests a record for the given params

    Args:
        params (str): params to match
        commons_url (str): root domain for commons where mds lives
        lock (asyncio.Semaphore): semaphore used to limit the number of concurrent
            HTTP connections
    """
    index = Gen3Index(commons_url)
    async with lock:
        # default ssl handling unless it's explicitly http://
        ssl = None
        if "https" not in commons_url:
            ssl = False

        return await index.async_get_with_params(params, _ssl=ssl)
Code example #4
def delete_all_guids(auth, file):
    """
    Delete all GUIDs specified in the object manifest.

    WARNING: THIS COMPLETELY REMOVES INDEX RECORDS. USE THIS ONLY IF YOU KNOW
             THE IMPLICATIONS.
    """
    index = Gen3Index(auth.endpoint, auth_provider=auth)
    if not index.is_healthy():
        logging.debug(
            f"uh oh! The indexing service is not healthy in the commons {auth.endpoint}"
        )
        exit()

    # try to get delimiter based on file extension
    file_ext = os.path.splitext(file)
    if file_ext[-1].lower() == ".tsv":
        manifest_file_delimiter = "\t"
    else:
        # default, assume CSV
        manifest_file_delimiter = ","

    with open(file, "r", encoding="utf-8-sig") as input_file:
        csvReader = csv.DictReader(input_file,
                                   delimiter=manifest_file_delimiter)
        fieldnames = csvReader.fieldnames

        logging.debug(f"got fieldnames from {file}: {fieldnames}")

        # figure out which variant of the GUID column name is in use, then use
        # it for all subsequent rows
        guid_name = "guid"
        for name in ["guid", "GUID", "did", "DID"]:
            if name in fieldnames:
                guid_name = name

        logging.debug(f"using {guid_name} to retrieve GUID to delete...")

        for row in csvReader:
            guid = row.get(guid_name)

            if guid:
                logging.debug(f"deleting GUID record:{guid}")
                logging.debug(index.delete_record(guid=guid))
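A hedged usage sketch for this deletion helper, assuming a Gen3Auth API key file and a manifest containing a "guid" column; both file names are placeholders:

from gen3.auth import Gen3Auth

# WARNING: this permanently removes the listed index records
auth = Gen3Auth(refresh_file="credentials.json")  # hypothetical API key file
delete_all_guids(auth, "records_to_delete.csv")   # hypothetical manifest path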
Code example #5
async def _put_records_from_page_in_queue(page, commons_url, lock, queue):
    """
    Gets a semaphore then requests records for the given page and
    puts them in a queue.

    Args:
        commons_url (str): root domain for commons where indexd lives
        page (int/str): indexd page to request
        lock (asyncio.Semaphore): semaphore used to limit the number of concurrent
            HTTP connections
        queue (asyncio.Queue): queue to put indexd records in
    """
    index = Gen3Index(commons_url)
    async with lock:
        # default ssl handling unless it's explicitly http://
        ssl = None
        if "https" not in commons_url:
            ssl = False

        records = await index.async_get_records_on_page(
            page=page, limit=INDEXD_RECORD_PAGE_SIZE, _ssl=ssl)
        await queue.put(records)
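A hedged sketch of fanning this coroutine out over several pages with asyncio.gather; the page count, concurrency limit, and commons URL are placeholders:

import asyncio

async def _collect_pages():
    lock = asyncio.Semaphore(5)  # cap concurrent HTTP connections
    queue = asyncio.Queue()
    # each task requests one page and puts its records in the shared queue
    await asyncio.gather(
        *[
            _put_records_from_page_in_queue(
                page, "https://example-commons.org", lock, queue
            )
            for page in range(3)
        ]
    )
    return queue

queue_of_records = asyncio.run(_collect_pages())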
Code example #6
def _get_page_and_write_records_to_file(queue, commons):
    """
    Pops off queue until it sees a "STOP".
    Sends a request to get all records on a given popped queue page, parses the records,
    converts to manifest format, and writes to a tsv file in a tmp directory (all files
    will be combined later)

    Args:
        queue (multiprocessing.Queue): thread-safe multi-producer/consumer queue
        commons (str): root domain for commons where indexd lives
    """
    index = Gen3Index(commons)
    page = queue.get()
    process_name = multiprocessing.current_process().name
    while page != "STOP":
        records = index.get_records_on_page(page=page, limit=INDEXD_RECORD_PAGE_SIZE)

        logging.info(f"{process_name}:Read page {page} with {len(records)} records")

        if records:
            file_name = TMP_FOLDER + str(page) + ".tsv"
            with open(file_name, "w+", encoding="utf8") as file:
                logging.info(f"{process_name}:Write to {file_name}")
                tsvwriter = csv.writer(file, delimiter="\t")
                for record in records:
                    manifest_row = [
                        record.get("did"),
                        record.get("urls"),
                        record.get("authz"),
                        record.get("acl"),
                        record.get("md5"),
                        record.get("size"),
                    ]
                    tsvwriter.writerow(manifest_row)
        page = queue.get()

    logging.info(f"{process_name}:Stop")
Code example #7
async def _is_indexed_file_object(guid, commons_url, lock):
    """
    Gets a semaphore then requests a record for the given guid

    Args:
        guid (str): indexd record globally unique id
        commons_url (str): root domain for commons where mds lives
        lock (asyncio.Semaphore): semaphore used to limit the number of concurrent
            HTTP connections
    """
    index = Gen3Index(commons_url)
    async with lock:
        # default ssl handling unless it's explicitly http://
        ssl = None
        if "https" not in commons_url:
            ssl = False

        try:
            record = await index.async_get_record(guid, _ssl=ssl)
        except Exception as exc:
            # if error, assume it does not exist
            return False

        return bool(record)
Code example #8
async def _write_all_index_records_to_file(commons_url, output_filename,
                                           num_processes,
                                           max_concurrent_requests):
    """
    Spins up the number of processes provided to parse indexd records and
    eventually write to a single output manifest file.

    Args:
        commons_url (str): root domain for commons where indexd lives
        output_filename (str, optional): filename for output
        num_processes (int, optional): number of parallel python processes to use for
          hitting indexd api and processing
        max_concurrent_requests (int): the maximum number of concurrent requests allowed
            NOTE: This is the TOTAL number, not just for this process. Used to help
            determine how many requests a process should be making at one time
    """
    index = Gen3Index(commons_url)
    logging.debug(f"requesting indexd stats...")
    num_files = int(index.get_stats().get("fileCount"))
    logging.debug(f"number files: {num_files}")
    # paging is 0-based, so subtract 1 from ceiling
    # note: float() is necessary to force Python 3 to not floor the result
    max_page = int(math.ceil(float(num_files) / INDEXD_RECORD_PAGE_SIZE)) - 1
    logging.debug(f"max page: {max_page}")
    logging.debug(f"num processes: {num_processes}")

    pages = [x for x in range(max_page + 1)]

    # batch pages into subprocesses
    chunk_size = int(math.ceil(float(len(pages)) / num_processes))
    logging.debug(f"page chunk size: {chunk_size}")

    if not chunk_size:
        page_chunks = []
    else:
        page_chunks = [
            pages[i:i + chunk_size] for i in range(0, len(pages), chunk_size)
        ]

    processes = []
    for x in range(len(page_chunks)):
        pages = ",".join(map(str, page_chunks[x]))

        # call the cli function below and pass in chunks of pages for each process
        command = (
            f"python {CURRENT_DIR}/download_manifest.py --commons_url "
            f"{commons_url} --pages {pages} --num_processes {num_processes} "
            f"--max_concurrent_requests {max_concurrent_requests}")
        logging.info(command)

        process = await asyncio.create_subprocess_shell(command)

        logging.info(f"Process_{process.pid} - Started w/: {command}")
        processes.append(process)

    for process in processes:
        # wait for the subprocesses to finish
        stdout, stderr = await process.communicate()

        if process.returncode == 0:
            logging.info(f"Process_{process.pid} - Done")
        else:
            logging.info(f"Process_{process.pid} - FAILED")

    logging.info(
        f"done processing, combining outputs to single file {output_filename}")

    # remove existing output if it exists
    if os.path.isfile(output_filename):
        os.unlink(output_filename)

    with open(output_filename, "wb") as outfile:
        outfile.write(
            "guid,urls,authz,acl,md5,file_size,file_name\n".encode("utf8"))
        for filename in glob.glob(TMP_FOLDER + "*"):
            if output_filename == filename:
                # don't want to copy the output into the output
                continue
            logging.info(f"combining {filename} into {output_filename}")
            with open(filename, "rb") as readfile:
                shutil.copyfileobj(readfile, outfile)

    logging.info(f"done writing output to file {output_filename}")
Code example #9
def build_auth_indexer(cred=None):
    auth = build_auth(cred)
    return Gen3Index(GEN3_URL, auth_provider=auth)
Code example #10
def build_public_indexer():
    return Gen3Index(GEN3_URL)
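A hedged sketch of using the two builders above, assuming GEN3_URL and build_auth are defined in the surrounding module and that build_auth accepts a credential file path:

# unauthenticated client: public endpoints only
public_index = build_public_indexer()
print(public_index.is_healthy())

# authenticated client for admin operations (credential path is hypothetical)
admin_index = build_auth_indexer("credentials.json")
print(admin_index.get_stats())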
Code example #11
def _write_all_index_records_to_file(commons, output_filename, num_processes):
    """
    Spins up the number of processes provided to parse indexd records and
    eventually write to a single output manifest file.

    Args:
        commons (str): root domain for commons where indexd lives
        output_filename (str, optional): filename for output
        num_processes (int, optional): number of parallel python processes to use for
          hitting indexd api and processing

    Raises:
        IndexError: If script detects missing files in indexd after initial parsing
    """
    index = Gen3Index(commons)
    logging.debug(f"requesting indexd stats...")
    num_files = int(index.get_stats().get("fileCount"))
    # paging is 0-based, so subtract 1 from ceiling
    # note: float() is necessary to force Python 3 to not floor the result
    max_page = int(math.ceil(float(num_files) / INDEXD_RECORD_PAGE_SIZE)) - 1

    queue = Queue(max_page + num_processes)

    pages = [x for x in range(max_page)]
    _add_pages_to_queue_and_process(pages, queue, commons, num_processes)

    logging.info(f"checking if files were added since we started...")
    current_num_files = int(index.get_stats().get("fileCount"))

    # don't handle if files are actively being deleted
    if current_num_files < num_files:
        raise IndexError("Files were removed during pagination.")

    # if files were added, we can try to parse them
    if current_num_files > num_files:
        logging.warning(
            f"current files {current_num_files} is not the same as when "
            f"we started {num_files}! Will attempt to get the new files but if more "
            "are ACTIVELY being added via the API this manifest WILL NOT BE COMPLETE."
        )

        new_extra_files = current_num_files - num_files
        new_pages_to_parse = int(
            math.ceil(float(new_extra_files) / INDEXD_RECORD_PAGE_SIZE)
        )

        # NOTE: start at previous max_page so we can pick up any additional files added to
        #       that page
        _add_pages_to_queue_and_process(
            [x for x in range(max_page, max_page + new_pages_to_parse)],
            queue,
            commons,
            num_processes,
        )

    logging.info(
        f"done processing queue, combining outputs to single file {output_filename}"
    )

    # remove existing output if it exists
    if os.path.isfile(output_filename):
        os.unlink(output_filename)

    with open(output_filename, "wb") as outfile:
        outfile.write("GUID, urls, authz, acl, md5, size\n".encode("utf8"))
        for filename in glob.glob(TMP_FOLDER + "*"):
            if output_filename == filename:
                # don't want to copy the output into the output
                continue
            logging.info(f"combining {filename} into {output_filename}")
            with open(filename, "rb") as readfile:
                shutil.copyfileobj(readfile, outfile)

    logging.info(f"done writing output to file {output_filename}")