def index_client(indexd_server):
    """
    Handles getting all the docs from an indexing endpoint. Currently this is
    changing from signpost to indexd, so we'll use just indexd_client for now.
    Because tests go through a common interface, this fixture could be
    parametrized to multiply our tests:
    https://docs.pytest.org/en/latest/fixture.html#parametrizing-fixtures
    """
    setup_database()
    client = Gen3Index(
        indexd_server.baseurl, create_user("admin", "admin"), service_location=""
    )
    yield client
    clear_database()
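# Illustrative test sketch (not from the suite): assuming the function above is
# registered as a pytest fixture, pytest injects it by parameter name, so a test
# can exercise the yielded Gen3Index client directly.
def test_index_client_is_healthy(index_client):
    assert index_client.is_healthy()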
async def _get_record_from_indexd(guid, commons_url, lock):
    """
    Gets a semaphore then requests a record for the given guid

    Args:
        guid (str): indexd record globally unique id
        commons_url (str): root domain for commons where indexd lives
        lock (asyncio.Semaphore): semaphore used to limit the number of
            concurrent http connections
    """
    index = Gen3Index(commons_url)
    async with lock:
        # default ssl handling unless it's explicitly http://
        ssl = None
        if "https" not in commons_url:
            ssl = False

        return await index.async_get_record(guid, _ssl=ssl)
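# A minimal usage sketch (not part of the module): it assumes the helper above is
# importable and that "https://example-commons.org" is a reachable commons. It
# shows how a single shared asyncio.Semaphore caps concurrent requests when
# fetching several records at once; the GUID and function name are hypothetical.
import asyncio


async def _example_fetch_records(guids, commons_url, max_concurrent_requests=5):
    # one semaphore shared by every coroutine limits concurrent http connections
    lock = asyncio.Semaphore(max_concurrent_requests)
    tasks = [_get_record_from_indexd(guid, commons_url, lock) for guid in guids]
    return await asyncio.gather(*tasks)


# asyncio.run(_example_fetch_records(["dg.1234/abc"], "https://example-commons.org"))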
async def _get_with_params_from_indexd(params, commons_url, lock):
    """
    Gets a semaphore then requests a record matching the given params

    Args:
        params (str): params to match
        commons_url (str): root domain for commons where indexd lives
        lock (asyncio.Semaphore): semaphore used to limit the number of
            concurrent http connections
    """
    index = Gen3Index(commons_url)
    async with lock:
        # default ssl handling unless it's explicitly http://
        ssl = None
        if "https" not in commons_url:
            ssl = False

        return await index.async_get_with_params(params, _ssl=ssl)
def delete_all_guids(auth, file):
    """
    Delete all GUIDs specified in the object manifest.

    WARNING: THIS COMPLETELY REMOVES INDEX RECORDS. USE THIS ONLY IF YOU KNOW
             THE IMPLICATIONS.
    """
    index = Gen3Index(auth.endpoint, auth_provider=auth)
    if not index.is_healthy():
        logging.debug(
            f"uh oh! The indexing service is not healthy in the commons {auth.endpoint}"
        )
        exit()

    # try to get the delimiter based on the file extension
    file_ext = os.path.splitext(file)
    if file_ext[-1].lower() == ".tsv":
        manifest_file_delimiter = "\t"
    else:
        # default: assume CSV
        manifest_file_delimiter = ","

    with open(file, "r", encoding="utf-8-sig") as input_file:
        csvReader = csv.DictReader(input_file, delimiter=manifest_file_delimiter)
        fieldnames = csvReader.fieldnames

        logging.debug(f"got fieldnames from {file}: {fieldnames}")

        # figure out which permutation of the name "guid" is being used once,
        # then use it for all future rows
        guid_name = "guid"
        for name in ["guid", "GUID", "did", "DID"]:
            if name in fieldnames:
                guid_name = name

        logging.debug(f"using {guid_name} to retrieve GUID to delete...")

        for row in csvReader:
            guid = row.get(guid_name)
            if guid:
                logging.debug(f"deleting GUID record:{guid}")
                logging.debug(index.delete_record(guid=guid))
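# Hedged usage sketch (illustrative only): assumes an API key file at
# "credentials.json" and a manifest "delete-manifest.tsv" containing a "guid"
# column; Gen3Auth is the standard gen3 auth provider accepted by Gen3Index.
from gen3.auth import Gen3Auth


def _example_delete_from_manifest():
    auth = Gen3Auth(refresh_file="credentials.json")
    # WARNING (per the docstring above): this permanently removes index records
    delete_all_guids(auth, "delete-manifest.tsv")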
async def _put_records_from_page_in_queue(page, commons_url, lock, queue):
    """
    Gets a semaphore then requests records for the given page and puts them
    in a queue.

    Args:
        page (int/str): indexd page to request
        commons_url (str): root domain for commons where indexd lives
        lock (asyncio.Semaphore): semaphore used to limit the number of
            concurrent http connections
        queue (asyncio.Queue): queue to put indexd records in
    """
    index = Gen3Index(commons_url)
    async with lock:
        # default ssl handling unless it's explicitly http://
        ssl = None
        if "https" not in commons_url:
            ssl = False

        records = await index.async_get_records_on_page(
            page=page, limit=INDEXD_RECORD_PAGE_SIZE, _ssl=ssl
        )
        await queue.put(records)
def _get_page_and_write_records_to_file(queue, commons):
    """
    Pops pages off the queue until it sees a "STOP". For each popped page,
    sends a request to get all records on that page, parses the records,
    converts them to manifest format, and writes them to a tsv file in a tmp
    directory (all files will be combined later).

    Args:
        queue (multiprocessing.Queue): thread-safe multi-producer/consumer queue
        commons (str): root domain for commons where indexd lives
    """
    index = Gen3Index(commons)
    page = queue.get()
    process_name = multiprocessing.current_process().name

    while page != "STOP":
        records = index.get_records_on_page(page=page, limit=INDEXD_RECORD_PAGE_SIZE)
        logging.info(f"{process_name}:Read page {page} with {len(records)} records")

        if records:
            file_name = TMP_FOLDER + str(page) + ".tsv"
            with open(file_name, "w+", encoding="utf8") as file:
                logging.info(f"{process_name}:Write to {file_name}")
                tsvwriter = csv.writer(file, delimiter="\t")
                for record in records:
                    manifest_row = [
                        record.get("did"),
                        record.get("urls"),
                        record.get("authz"),
                        record.get("acl"),
                        record.get("md5"),
                        record.get("size"),
                    ]
                    tsvwriter.writerow(manifest_row)

        page = queue.get()

    logging.info(f"{process_name}:Stop")
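# Illustrative sketch of how a consumer like the function above is typically
# driven (the driver below is hypothetical, not part of the module): a producer
# puts page numbers on a multiprocessing.Queue and adds one "STOP" sentinel per
# worker so every consumer loop terminates.
import multiprocessing


def _example_run_page_workers(commons, pages, num_processes=4):
    queue = multiprocessing.Queue()
    for page in pages:
        queue.put(page)
    for _ in range(num_processes):
        queue.put("STOP")

    workers = [
        multiprocessing.Process(
            target=_get_page_and_write_records_to_file, args=(queue, commons)
        )
        for _ in range(num_processes)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()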
async def _is_indexed_file_object(guid, commons_url, lock):
    """
    Gets a semaphore then requests a record for the given guid

    Args:
        guid (str): indexd record globally unique id
        commons_url (str): root domain for commons where indexd lives
        lock (asyncio.Semaphore): semaphore used to limit the number of
            concurrent http connections
    """
    index = Gen3Index(commons_url)
    async with lock:
        # default ssl handling unless it's explicitly http://
        ssl = None
        if "https" not in commons_url:
            ssl = False

        try:
            record = await index.async_get_record(guid, _ssl=ssl)
        except Exception:
            # if there is an error, assume the record does not exist
            return False

        return bool(record)
async def _write_all_index_records_to_file(
    commons_url, output_filename, num_processes, max_concurrent_requests
):
    """
    Spins up the number of processes provided to parse indexd records and
    eventually write to a single output manifest file.

    Args:
        commons_url (str): root domain for commons where indexd lives
        output_filename (str, optional): filename for output
        num_processes (int, optional): number of parallel python processes to use
            for hitting the indexd api and processing
        max_concurrent_requests (int): the maximum number of concurrent requests
            allowed
            NOTE: This is the TOTAL number, not just for this process. Used to
            help determine how many requests a process should be making at one time.
    """
    index = Gen3Index(commons_url)
    logging.debug("requesting indexd stats...")
    num_files = int(index.get_stats().get("fileCount"))
    logging.debug(f"number of files: {num_files}")

    # paging is 0-based, so subtract 1 from the ceiling
    # note: float() is necessary to force Python 3 to not floor the result
    max_page = int(math.ceil(float(num_files) / INDEXD_RECORD_PAGE_SIZE)) - 1
    logging.debug(f"max page: {max_page}")
    logging.debug(f"num processes: {num_processes}")

    pages = [x for x in range(max_page + 1)]

    # batch pages into subprocesses
    chunk_size = int(math.ceil(float(len(pages)) / num_processes))
    logging.debug(f"page chunk size: {chunk_size}")

    if not chunk_size:
        page_chunks = []
    else:
        page_chunks = [
            pages[i : i + chunk_size] for i in range(0, len(pages), chunk_size)
        ]

    processes = []
    for x in range(len(page_chunks)):
        pages = ",".join(map(str, page_chunks[x]))

        # call the cli function below and pass in chunks of pages for each process
        command = (
            f"python {CURRENT_DIR}/download_manifest.py --commons_url "
            f"{commons_url} --pages {pages} --num_processes {num_processes} "
            f"--max_concurrent_requests {max_concurrent_requests}"
        )
        logging.info(command)

        process = await asyncio.create_subprocess_shell(command)
        logging.info(f"Process_{process.pid} - Started w/: {command}")
        processes.append(process)

    for process in processes:
        # wait for the subprocesses to finish
        stdout, stderr = await process.communicate()

        if process.returncode == 0:
            logging.info(f"Process_{process.pid} - Done")
        else:
            logging.info(f"Process_{process.pid} - FAILED")

    logging.info(f"done processing, combining outputs to single file {output_filename}")

    # remove existing output if it exists
    if os.path.isfile(output_filename):
        os.unlink(output_filename)

    with open(output_filename, "wb") as outfile:
        outfile.write("guid,urls,authz,acl,md5,file_size,file_name\n".encode("utf8"))

        for filename in glob.glob(TMP_FOLDER + "*"):
            if output_filename == filename:
                # don't want to copy the output into the output
                continue
            logging.info(f"combining {filename} into {output_filename}")
            with open(filename, "rb") as readfile:
                shutil.copyfileobj(readfile, outfile)

    logging.info(f"done writing output to file {output_filename}")
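# Minimal driver sketch (hypothetical, not part of the module): shows how the
# async orchestrator above could be kicked off from synchronous code, assuming
# the module-level constants it relies on (INDEXD_RECORD_PAGE_SIZE, TMP_FOLDER,
# CURRENT_DIR) are defined as in the surrounding module. The URL and filenames
# are placeholders.
import asyncio


def _example_download_manifest():
    asyncio.run(
        _write_all_index_records_to_file(
            commons_url="https://example-commons.org",
            output_filename="object-manifest.csv",
            num_processes=4,
            max_concurrent_requests=24,
        )
    )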
def build_auth_indexer(cred=None):
    auth = build_auth(cred)
    return Gen3Index(GEN3_URL, auth_provider=auth)
def build_public_indexer():
    return Gen3Index(GEN3_URL)
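# Usage sketch (illustrative; GEN3_URL and build_auth come from the surrounding
# script): the public indexer can read records without credentials, while the
# authenticated indexer from build_auth_indexer is needed for creates/deletes.
# The GUID below is a placeholder.
def _example_lookup_record():
    indexer = build_public_indexer()
    return indexer.get_record("dg.1234/00000000-0000-0000-0000-000000000000")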
def _write_all_index_records_to_file(commons, output_filename, num_processes):
    """
    Spins up the number of processes provided to parse indexd records and
    eventually write to a single output manifest file.

    Args:
        commons (str): root domain for commons where indexd lives
        output_filename (str, optional): filename for output
        num_processes (int, optional): number of parallel python processes to use
            for hitting the indexd api and processing

    Raises:
        IndexError: If script detects missing files in indexd after initial parsing
    """
    index = Gen3Index(commons)
    logging.debug("requesting indexd stats...")
    num_files = int(index.get_stats().get("fileCount"))

    # paging is 0-based, so subtract 1 from the ceiling
    # note: float() is necessary to force Python 3 to not floor the result
    max_page = int(math.ceil(float(num_files) / INDEXD_RECORD_PAGE_SIZE)) - 1

    queue = Queue(max_page + num_processes)

    pages = [x for x in range(max_page)]
    _add_pages_to_queue_and_process(pages, queue, commons, num_processes)

    logging.info("checking if files were added since we started...")
    current_num_files = int(index.get_stats().get("fileCount"))

    # don't handle if files are actively being deleted
    if current_num_files < num_files:
        raise IndexError("Files were removed during pagination.")

    # if files were added, we can try to parse them
    if current_num_files > num_files:
        logging.warning(
            f"current files {current_num_files} is not the same as when "
            f"we started {num_files}! Will attempt to get the new files but if more "
            "are ACTIVELY being added via the API this manifest WILL NOT BE COMPLETE."
        )
        new_extra_files = current_num_files - num_files
        new_pages_to_parse = int(
            math.ceil(float(new_extra_files) / INDEXD_RECORD_PAGE_SIZE)
        )

        # NOTE: start at the previous max_page so we can pick up any additional
        #       files added to that page
        _add_pages_to_queue_and_process(
            [x for x in range(max_page, max_page + new_pages_to_parse)],
            queue,
            commons,
            num_processes,
        )

    logging.info(
        f"done processing queue, combining outputs to single file {output_filename}"
    )

    # remove existing output if it exists
    if os.path.isfile(output_filename):
        os.unlink(output_filename)

    with open(output_filename, "wb") as outfile:
        outfile.write("GUID, urls, authz, acl, md5, size\n".encode("utf8"))

        for filename in glob.glob(TMP_FOLDER + "*"):
            if output_filename == filename:
                # don't want to copy the output into the output
                continue
            logging.info(f"combining {filename} into {output_filename}")
            with open(filename, "rb") as readfile:
                shutil.copyfileobj(readfile, outfile)

    logging.info(f"done writing output to file {output_filename}")