def build_hierarchy( contexts, root_dir, driver_name, crawler_cbs, specfile_cbs, allow_partial_failure=False, max_retries=1 ):
    """
    Crawl the dataset rooted at root_dir (spawning one thread per context)
    and accumulate a hierarchy dictionary from the entries discovered.

    The caller's include_cb (taken from crawler_cbs) decides which paths are
    kept; every kept path is materialized into the hierarchy via
    AG_specfile.add_hierarchy_element.

    Returns the hierarchy dict on success, or None if the crawl failed and
    allow_partial_failure is False.
    """

    hierarchy = {}

    # Wrap the caller's include_cb: each time the crawler accepts a path,
    # record it into the hierarchy as well (the user callback still makes
    # the include/exclude decision).
    def record_element( abs_path, is_dir ):
        return AG_specfile.add_hierarchy_element( abs_path, is_dir, driver_name, crawler_cbs.include_cb, specfile_cbs, hierarchy )

    # Same callback bundle as the caller's, but with our recording wrapper
    # substituted for include_cb.
    crawl_cbs = crawler_callbacks( include_cb=record_element,
                                   listdir_cb=crawler_cbs.listdir_cb,
                                   isdir_cb=crawler_cbs.isdir_cb )

    ok = walk_dataset( contexts, root_dir, crawl_cbs, max_retries )
    if not ok and not allow_partial_failure:
        return None

    # Fill in directory entries for every path prefix discovered above.
    AG_specfile.add_hierarchy_prefixes( root_dir, driver_name, crawler_cbs.include_cb, specfile_cbs, hierarchy )

    return hierarchy
def generate_specfile_from_global_listing( gsutil_binary_path, root_dir, include_cb, specfile_cbs, output_fd, max_retries=3, compressed_listing_path=None ):
    """
    Build up the specfile from the global dataset listing, streaming the
    result to output_fd as entries are discovered.

    If compressed_listing_path is None, the compressed global listing is
    downloaded (via gsutil) to a temporary file first; that temporary file
    is removed before returning.  Only paths under root_dir are emitted.

    Returns True on success, False on failure.
    NOTE: this can be memory intensive, if there are a lot of directories
    (every directory seen is kept in memory for prefix bookkeeping).

    NOTE(review): an identical definition of this function appears later in
    this file and shadows this one at import time -- one copy should be
    removed.
    """

    directories = {}
    created_listing = False

    if compressed_listing_path is None:
        # get the global dataset.
        # Bugfix: use mkstemp() instead of the insecure, deprecated mktemp()
        # so the path cannot be raced by another process; close the fd since
        # the downloader only needs the path.
        tmp_fd, compressed_listing_path = tempfile.mkstemp()
        os.close( tmp_fd )
        created_listing = True

        rc = gsutil_download_global_dataset_listing( gsutil_binary_path, compressed_listing_path, max_retries=max_retries )
        if not rc:
            log.error("Failed to download listing")
            os.unlink( compressed_listing_path )    # bugfix: don't leak the temp file
            return False

    # Anonymous scratch file for the decompressed listing: unlink the path
    # immediately so the storage is reclaimed when the file object closes.
    listing_fd, listing_path = tempfile.mkstemp()
    listing_file = os.fdopen( listing_fd, "r+" )
    os.unlink( listing_path )

    try:
        # extract it
        rc = gsutil_extract_global_dataset_listing( compressed_listing_path, listing_file )
        if not rc:
            log.error("Failed to extract listing")
            return False

        listing_file.seek(0)

        # make the specfile...
        AG_specfile.generate_specfile_header( output_fd )
        # NOTE(review): every sibling generate_specfile_* call here writes to
        # output_fd, but this one does not -- confirm the intended signature
        # of generate_specfile_config.
        AG_specfile.generate_specfile_config( {} )

        # iterate through each line of the listing
        for line in listing_file:

            line = line.strip()

            # extract path
            path = gsutil_parse_path( line )
            if path is None:
                log.error("Failed to parse '%s'" % line)
                continue

            # is it a child of root_dir?
            if not path.startswith(root_dir):
                continue

            # add all prefixes up to the parent directory
            new_directories = AG_specfile.add_hierarchy_prefixes( os.path.dirname( path ), DRIVER_NAME, include_cb, specfile_cbs, directories )
            new_directories.sort()

            # write all new directories (sorted, so parents precede children)
            for new_directory in new_directories:
                dir_data = directories[new_directory]
                AG_specfile.generate_specfile_pair( dir_data, output_fd )

            # add this entry
            file_data_dict = {}
            AG_specfile.add_hierarchy_element( path, False, DRIVER_NAME, include_cb, specfile_cbs, file_data_dict )
            AG_specfile.generate_specfile_pair( file_data_dict[path], output_fd )

        AG_specfile.generate_specfile_footer( output_fd )
        return True

    finally:
        # Bugfix: listing_file was leaked if an exception escaped the loop;
        # close it on every exit path.
        listing_file.close()
        if created_listing:
            # Bugfix: remove the compressed listing we downloaded ourselves
            # (a caller-supplied path is left untouched).
            try:
                os.unlink( compressed_listing_path )
            except OSError:
                pass
def generate_specfile_from_global_listing(gsutil_binary_path, root_dir, include_cb, specfile_cbs, output_fd, max_retries=3, compressed_listing_path=None):
    """
    Build up the specfile from the global dataset listing, streaming the
    result to output_fd as entries are discovered.

    If compressed_listing_path is None, the compressed global listing is
    downloaded (via gsutil) to a temporary file first; that temporary file
    is removed before returning.  Only paths under root_dir are emitted.

    Returns True on success, False on failure.
    NOTE: this can be memory intensive, if there are a lot of directories
    (every directory seen is kept in memory for prefix bookkeeping).

    NOTE(review): this is a verbatim duplicate of an earlier definition in
    this file (which it shadows at import time) -- one copy should be
    removed.
    """

    directories = {}
    created_listing = False

    if compressed_listing_path is None:
        # get the global dataset.
        # Bugfix: use mkstemp() instead of the insecure, deprecated mktemp()
        # so the path cannot be raced by another process; close the fd since
        # the downloader only needs the path.
        tmp_fd, compressed_listing_path = tempfile.mkstemp()
        os.close(tmp_fd)
        created_listing = True

        rc = gsutil_download_global_dataset_listing(gsutil_binary_path, compressed_listing_path, max_retries=max_retries)
        if not rc:
            log.error("Failed to download listing")
            os.unlink(compressed_listing_path)    # bugfix: don't leak the temp file
            return False

    # Anonymous scratch file for the decompressed listing: unlink the path
    # immediately so the storage is reclaimed when the file object closes.
    listing_fd, listing_path = tempfile.mkstemp()
    listing_file = os.fdopen(listing_fd, "r+")
    os.unlink(listing_path)

    try:
        # extract it
        rc = gsutil_extract_global_dataset_listing(compressed_listing_path, listing_file)
        if not rc:
            log.error("Failed to extract listing")
            return False

        listing_file.seek(0)

        # make the specfile...
        AG_specfile.generate_specfile_header(output_fd)
        # NOTE(review): every sibling generate_specfile_* call here writes to
        # output_fd, but this one does not -- confirm the intended signature
        # of generate_specfile_config.
        AG_specfile.generate_specfile_config({})

        # iterate through each line of the listing
        for line in listing_file:

            line = line.strip()

            # extract path
            path = gsutil_parse_path(line)
            if path is None:
                log.error("Failed to parse '%s'" % line)
                continue

            # is it a child of root_dir?
            if not path.startswith(root_dir):
                continue

            # add all prefixes up to the parent directory
            new_directories = AG_specfile.add_hierarchy_prefixes(os.path.dirname(path), DRIVER_NAME, include_cb, specfile_cbs, directories)
            new_directories.sort()

            # write all new directories (sorted, so parents precede children)
            for new_directory in new_directories:
                dir_data = directories[new_directory]
                AG_specfile.generate_specfile_pair(dir_data, output_fd)

            # add this entry
            file_data_dict = {}
            AG_specfile.add_hierarchy_element(path, False, DRIVER_NAME, include_cb, specfile_cbs, file_data_dict)
            AG_specfile.generate_specfile_pair(file_data_dict[path], output_fd)

        AG_specfile.generate_specfile_footer(output_fd)
        return True

    finally:
        # Bugfix: listing_file was leaked if an exception escaped the loop;
        # close it on every exit path.
        listing_file.close()
        if created_listing:
            # Bugfix: remove the compressed listing we downloaded ourselves
            # (a caller-supplied path is left untouched).
            try:
                os.unlink(compressed_listing_path)
            except OSError:
                pass