Exemple #1
0
def build_hierarchy( contexts, root_dir, driver_name, crawler_cbs, specfile_cbs, allow_partial_failure=False, max_retries=1 ):
   """
   Crawl the dataset rooted at root_dir and return the hierarchy dictionary that results.
   
   The caller's crawler callbacks are re-bundled so that the include callback seen by the crawler
   is a wrapper around AG_specfile.add_hierarchy_element().  That wrapper is given the caller's
   own include_cb, the specfile callbacks, and the dictionary being populated.
   The listdir and isdir callbacks are passed through unchanged.
   The crawl itself is delegated to walk_dataset() (per the original notes, one thread per context).
   
   Return the populated hierarchy dictionary.
   Return None if walk_dataset() reports failure and allow_partial_failure is False.
   """
   
   hierarchy = {}
   
   def record_element( abs_path, is_dir ):
      # hand each crawled path to the specfile layer, and relay its verdict back to the crawler
      return AG_specfile.add_hierarchy_element( abs_path, is_dir, driver_name, crawler_cbs.include_cb, specfile_cbs, hierarchy )
   
   # same listdir/isdir behavior as the caller asked for; only the include step is intercepted
   wrapped_cbs = crawler_callbacks( include_cb=record_element,
                                    listdir_cb=crawler_cbs.listdir_cb,
                                    isdir_cb=crawler_cbs.isdir_cb )
   
   crawl_ok = walk_dataset( contexts, root_dir, wrapped_cbs, max_retries )
   
   # a failed crawl is only tolerated if the caller opted in to partial results
   if not (crawl_ok or allow_partial_failure):
      return None 
   
   AG_specfile.add_hierarchy_prefixes( root_dir, driver_name, crawler_cbs.include_cb, specfile_cbs, hierarchy )
   
   return hierarchy
Exemple #2
0
def generate_specfile_from_global_listing( gsutil_binary_path, root_dir, include_cb, specfile_cbs, output_fd, max_retries=3, compressed_listing_path=None ):
   """
   Build up the specfile from the global dataset listing.
   Write the result to output_fd.
   
   If compressed_listing_path is None, the compressed listing is first downloaded with
   gsutil_download_global_dataset_listing() into a private temporary directory, which is
   removed again before this function returns.  A caller-supplied compressed_listing_path
   is used as-is and is never deleted.
   
   The listing is extracted into an anonymous temporary file and read line by line.
   Lines that cannot be parsed are logged and skipped.
   Paths that do not start with root_dir are skipped.
   For every remaining path, any newly-seen parent directories are written (in sorted order)
   before the entry for the path itself.
   
   Return True on success.
   Return False if the listing could not be downloaded or extracted.
   Exceptions raised by the helpers propagate; temporary resources are released either way.
   
   NOTE: this can be memory intensive, if there are a lot of directories
   """
   
   import shutil
   
   directories = {}
   download_dir = None     # set only if we had to fetch the listing ourselves
   listing_file = None
   
   try:
      if compressed_listing_path is None:
         
         # get the global dataset.
         # tempfile.mktemp() only returns a name, so another process could claim it first.
         # Download into a directory that only we can access instead.
         download_dir = tempfile.mkdtemp()
         compressed_listing_path = os.path.join( download_dir, "listing" )
         
         rc = gsutil_download_global_dataset_listing( gsutil_binary_path, compressed_listing_path, max_retries=max_retries )
         if not rc:
            log.error("Failed to download listing")
            return False
         
      listing_fd, listing_path = tempfile.mkstemp()
      listing_file = os.fdopen( listing_fd, "r+" )
      
      # unlink immediately: the open descriptor keeps the data reachable, and nothing is left behind on exit
      os.unlink( listing_path )
      
      # extract it
      rc = gsutil_extract_global_dataset_listing( compressed_listing_path, listing_file )
      if not rc:
         log.error("Failed to extract listing")
         return False
      
      listing_file.seek(0)
      
      # make the specfile...
      AG_specfile.generate_specfile_header( output_fd )
      
      # NOTE(review): unlike the header/pair/footer calls, no output_fd is passed here -- confirm this is intended
      AG_specfile.generate_specfile_config( {} )
      
      # iterate through each line
      for line in listing_file:
         
         line = line.strip()
         
         # extract path 
         path = gsutil_parse_path( line )
         if path is None:
            log.error("Failed to parse '%s'" % line)
            continue 
         
         # is it a child of root_dir?
         # NOTE(review): this is a plain string-prefix test, so a sibling such as root_dir + "2" also matches -- confirm callers pass a trailing separator
         if not path.startswith(root_dir):
            continue 
         
         # add all prefixes up to the parent directory
         new_directories = AG_specfile.add_hierarchy_prefixes( os.path.dirname( path ), DRIVER_NAME, include_cb, specfile_cbs, directories )
         
         new_directories.sort()
         
         # write all new directories 
         for new_directory in new_directories:
            AG_specfile.generate_specfile_pair( directories[new_directory], output_fd )
            
         # add this entry.  Use a throwaway dict so file entries do not accumulate in memory.
         file_data_dict = {}
         AG_specfile.add_hierarchy_element( path, False, DRIVER_NAME, include_cb, specfile_cbs, file_data_dict )
         
         AG_specfile.generate_specfile_pair( file_data_dict[path], output_fd )
         
      AG_specfile.generate_specfile_footer( output_fd )
      
      return True
   
   finally:
      # release temporary resources on every exit path, including exceptions
      if listing_file is not None:
         listing_file.close()
      
      if download_dir is not None:
         shutil.rmtree( download_dir, ignore_errors=True )
   
   
Exemple #3
0
def generate_specfile_from_global_listing(gsutil_binary_path,
                                          root_dir,
                                          include_cb,
                                          specfile_cbs,
                                          output_fd,
                                          max_retries=3,
                                          compressed_listing_path=None):
    """
    Build up the specfile from the global dataset listing.
    Write the result to output_fd.

    If compressed_listing_path is None, the compressed listing is first
    downloaded with gsutil_download_global_dataset_listing() into a private
    temporary directory, which is removed again before this function returns.
    A caller-supplied compressed_listing_path is used as-is and never deleted.

    The listing is extracted into an anonymous temporary file and read line
    by line.  Lines that cannot be parsed are logged and skipped; paths that
    do not start with root_dir are skipped.  For every remaining path, any
    newly-seen parent directories are written (in sorted order) before the
    entry for the path itself.

    Return True on success.
    Return False if the listing could not be downloaded or extracted.
    Exceptions raised by the helpers propagate; temporary resources are
    released either way.

    NOTE: this can be memory intensive, if there are a lot of directories
    """

    import shutil

    directories = {}
    download_dir = None  # set only if we had to fetch the listing ourselves
    listing_file = None

    try:
        if compressed_listing_path is None:

            # get the global dataset.
            # tempfile.mktemp() only returns a name, so another process could
            # claim it first.  Download into a directory that only we can
            # access instead.
            download_dir = tempfile.mkdtemp()
            compressed_listing_path = os.path.join(download_dir, "listing")

            rc = gsutil_download_global_dataset_listing(
                gsutil_binary_path,
                compressed_listing_path,
                max_retries=max_retries)
            if not rc:
                log.error("Failed to download listing")
                return False

        listing_fd, listing_path = tempfile.mkstemp()
        listing_file = os.fdopen(listing_fd, "r+")

        # unlink immediately: the open descriptor keeps the data reachable,
        # and nothing is left behind on exit
        os.unlink(listing_path)

        # extract it
        rc = gsutil_extract_global_dataset_listing(compressed_listing_path,
                                                   listing_file)
        if not rc:
            log.error("Failed to extract listing")
            return False

        listing_file.seek(0)

        # make the specfile...
        AG_specfile.generate_specfile_header(output_fd)

        # NOTE(review): unlike the header/pair/footer calls, no output_fd is
        # passed here -- confirm this is intended
        AG_specfile.generate_specfile_config({})

        # iterate through each line
        for line in listing_file:

            line = line.strip()

            # extract path
            path = gsutil_parse_path(line)
            if path is None:
                log.error("Failed to parse '%s'" % line)
                continue

            # is it a child of root_dir?
            # NOTE(review): this is a plain string-prefix test, so a sibling
            # such as root_dir + "2" also matches -- confirm callers pass a
            # trailing separator
            if not path.startswith(root_dir):
                continue

            # add all prefixes up to the parent directory
            new_directories = AG_specfile.add_hierarchy_prefixes(
                os.path.dirname(path), DRIVER_NAME, include_cb, specfile_cbs,
                directories)

            new_directories.sort()

            # write all new directories
            for new_directory in new_directories:
                AG_specfile.generate_specfile_pair(
                    directories[new_directory], output_fd)

            # add this entry.  Use a throwaway dict so file entries do not
            # accumulate in memory.
            file_data_dict = {}
            AG_specfile.add_hierarchy_element(path, False, DRIVER_NAME,
                                              include_cb, specfile_cbs,
                                              file_data_dict)

            AG_specfile.generate_specfile_pair(file_data_dict[path],
                                               output_fd)

        AG_specfile.generate_specfile_footer(output_fd)

        return True

    finally:
        # release temporary resources on every exit path, including exceptions
        if listing_file is not None:
            listing_file.close()

        if download_dir is not None:
            shutil.rmtree(download_dir, ignore_errors=True)