Ejemplo n.º 1
0
def rucio_file_exists(sample):
    """ Check if `rucio ls` returns the sample"""

    # Check if sample name contains wildcards
    if '*' in sample or '?' in sample:
        print "Attempting wildcard search with sample: ", sample
        print "\tUse exact file name"
        sys.exit()

    # Get rucio ls output
    rucio_cmd = 'rucio ls ' + sample
    rucio_output = tools.get_cmd_output(rucio_cmd)

    # Check rucio ls output
    return True if any(sample in x for x in rucio_output) else False
Ejemplo n.º 2
0
def write_file_info(sample):
    """ Build a one-line file-stats summary from `rucio list-files`.

    params:
        sample (str): Dataset Identifier (i.e. scope:name)
    returns:
        (str): '\\t\\tFILE STATS - <stat1>, <stat2>, ...\\n' where each stat
               comes from a 'Total ...' summary line of the rucio output
    """

    # Get rucio list-files output
    rucio_cmd = 'rucio list-files ' + sample
    rucio_output = tools.get_cmd_output(rucio_cmd)

    # Keep only the summary lines (e.g. 'Total files : N') and strip the
    # leading 'Total ' so just '<quantity> : <value>' remains
    stats = [line.strip()[len('Total '):]
             for line in rucio_output if 'Total' in line]

    # join() replaces the old append-then-truncate hack, which chopped two
    # characters off the prefix itself when no 'Total' lines were found
    return '\t\tFILE STATS - ' + ', '.join(stats) + '\n'
Ejemplo n.º 3
0
def write_replica_info(sample):
    """ Build dataset replica summary from rucio list-dataset-replicas.

    params:
        sample (str): Dataset Identifier (i.e. scope:name)
    returns:
        (str): one 'REPLICA - RSE (found/expected), ...' line per dataset
               fragment; entries with fewer files than expected are
               flagged with '*'
    """

    # Get rucio list-dataset-replicas output
    rucio_cmd = 'rucio list-dataset-replicas ' + sample
    rucio_output = tools.get_cmd_output(rucio_cmd)

    # A dataset split across sites shows one DATASET header per fragment
    n_samples = len([x for x in rucio_output if "DATASET" in x])
    is_split = n_samples > 1
    prefix = '\t\tREPLICA - '
    info_to_write = prefix if not is_split else ''

    # Header and separator lines to skip when extracting replica rows.
    # (This list previously contained '-' and was never used; the loop
    # hardcoded a different list with '---'. Now defined once, correctly.)
    reject_patterns = ['DATASET', 'RSE', '---']
    for line in rucio_output:
        line = line.strip()

        # Start a new output line for each fragment of a split sample
        if is_split and 'DATASET' in line:
            info_to_write += '\n' if info_to_write else ''
            # The fragment ID is whatever rucio appended after the DID
            split_id_start = line.find(sample) + len(sample)
            split_id = line[split_id_start:]
            info_to_write += prefix + '(%s) ' % split_id

        # Skip blank lines and table headers/separators
        if not line or any(x in line for x in reject_patterns):
            continue
        # Table row format: | RSE | FOUND | TOTAL | -> words at idx 1, 3, 5
        RSE_site = line.split()[1]
        files_found = int(line.split()[3])
        files_expected = int(line.split()[5])
        flag = '*' if files_found < files_expected else ''  # incomplete replica
        info_to_write += "%s (%d/%d)%s, " % (RSE_site, files_found,
                                             files_expected, flag)

    # remove trailing comma and add newline
    info_to_write = info_to_write[:-2] + '\n'
    return info_to_write
Ejemplo n.º 4
0
def get_rse_info(sample):
    """
    get dataset rse information from rucio list-dataset-replicas

    params:
        sample (str): Dataset Identification (i.e. scope:name)
    returns:
        (dict) : map from RSE sites to information about samples on that site.

                 Information includes available and total files per site and,
                 if necessary, the ID tag appended to the DID for cases where
                 the sample is split across multiple sites

                 example: {'BNL-OSG2_MCTAPE' : ['id_tag1 (500/500)', 'id_tag2 (298/298)']
                           'BNL-OSG2_DATADISK' : ['id_tag2 (500/500)']}

    """

    # Get rucio list-dataset-replicas output
    rucio_cmd = 'rucio list-dataset-replicas ' + sample
    rucio_output = tools.get_cmd_output(rucio_cmd)

    # Check if sample is split (one DATASET header per fragment)
    n_samples = sum(1 for x in rucio_output if "DATASET" in x)
    is_split = n_samples > 1

    # Initialize tools for extraction
    # Skip table headers/separators; SCRATCHDISK replicas are deliberately
    # ignored (transient storage)
    reject_patterns = ['DATASET', 'RSE', '---', 'SCRATCHDISK']
    RSE_sites = {}
    # Cumulative (found, expected) per RSE across all dataset fragments
    id_found_expected = defaultdict(lambda: [0, 0])
    info = defaultdict(list)

    def prepare_rse_info(info_map):
        # Fold the current fragment's rows into the cumulative per-RSE
        # totals, then rebuild the result map from those totals.
        # id_found_expected intentionally persists across calls so split
        # fragments accumulate; info_map is cleared for the next fragment.
        if not info_map: return
        for split_id, rse_info_list in info_map.iteritems():
            for info_list in rse_info_list:
                RSE, found, exp, flag = info_list
                id_found_expected[RSE][0] += found
                id_found_expected[RSE][1] += exp
        for RSE, (found, exp) in id_found_expected.iteritems():
            flag = "*" if found != exp else ''  # flag incomplete replicas
            RSE_sites[RSE] = '(%d/%d)%s' % (found, exp, flag)
        info.clear()

    # Extract information from rucio output
    # Defensive default: previously split_id was only ever assigned inside
    # the DATASET branch, so a replica row appearing before the first
    # DATASET header raised NameError
    split_id = ''
    for line in rucio_output:
        line = line.strip()

        # Get dataset info
        if 'DATASET' in line:
            # Save previous dataset info if any
            prepare_rse_info(info)
            # Get tag added to DID when split
            split_id_start = line.find(sample) + len(sample)
            split_id = line[split_id_start:]

        # Get replica information (skip blank/header/separator lines)
        if not line or any(x in line for x in reject_patterns):
            continue
        # Table row format: | RSE | FOUND | TOTAL | -> words at idx 1, 3, 5
        RSE_site = line.split()[1]
        files_found = int(line.split()[3])
        files_expected = int(line.split()[5])
        flag = '*' if files_found < files_expected else ''
        rse_info = [RSE_site, files_found, files_expected, flag]
        info[split_id].append(rse_info)

    # Flush the final fragment
    prepare_rse_info(info)
    return RSE_sites
Ejemplo n.º 5
0
def get_file_info(sample):
    """ 
    Get file information from rucio list-files
    
    params:
        sample (str): Dataset Identification (i.e. scope:name)
    returns:
        (int) : number of files
        (str) : size of all files (Terabytes)
        (int) : number of events in all files
        Return values are empty strings if not found in rucio output 
    """

    # Get rucio list-files output
    rucio_cmd = 'rucio list-files ' + sample
    rucio_output = tools.get_cmd_output(rucio_cmd)
     
    n_files = size = n_events = ''
    for line in rucio_output:
        if 'Total' not in line: continue
        elif 'Total files' in line: 
            n_files =  int(line.strip().split()[-1]) 
        elif 'Total size' in line:
            size = float(line.strip().split()[-2]) 
            if size == 0: 
                size = '0'
                continue

            # Convert everything to same units
            units = line.strip().split()[-1]
            if units=='TB': scale = 1
            elif units=='GB': scale = 1e-3
            elif units=='MB': scale = 1e-6
            elif units=='KB': scale = 1e-9
            elif units=='B': scale = 1e-12
            else:
                print "Unexpected units: (%s)->(%s) "%(line,units)
            size *= scale 

            # Convert to desired units
            if args.units=='TB': scale = 1
            elif args.units=='GB': scale = 1e+3
            elif args.units=='MB': scale = 1e+6
            elif args.units=='KB': scale = 1e+9
            elif args.units=='B': scale = 1e+12
            size *= scale

            # Include at least 3 significant figures 
            power = int(log10(abs(size)))
            if power <= -9: prec = 12
            elif power <= -6: prec = 9
            elif power <= -3: prec = 6
            elif power <= 2: prec = 3
            else: prec = 0

            size = str(round(size, prec))
        elif 'Total events' in line:
            n_events = int(line.strip().split()[-1])
        else:
            print "WARNING :: Unknown file info:", line
    
    return n_files, size, n_events
Ejemplo n.º 6
0
def main():
    """ Main Function

    Reads a list of DIDs from args.input_datasets, downloads each with
    `rucio get` into args.sample_dir, logs rucio output to args.output,
    and prints/writes a summary classifying each dataset's outcome.
    """

    global args

    # Sanity check (exit non-zero so calling scripts see the failure)
    if not os.path.exists(args.input_datasets):
        print "ERROR :: Input file not found:", args.input_datasets
        sys.exit(1)
    if os.path.exists(args.output) and not args.append:
        print "ERROR :: Output file already exists:", args.output
        print "\tDelete it, change output name, or use '--append' option"
        sys.exit(1)
    if not os.path.exists(args.output) and args.append:
        print "ERROR :: Cannot append. Output file doesn't exist:", args.output
        sys.exit(1)
    if not os.path.isdir(args.sample_dir):
        print "ERROR :: Cannot find sample directory", args.sample_dir
        sys.exit(1)
    check_environment()
    print "All checks cleared"

    # Initialize bookkeeping for the summary
    print "\n===== BEGIN ====="
    not_found_dids = []        # DIDs rucio could not resolve
    successful_sites = set()   # RSEs that served at least one file
    bad_sites = set()          # RSEs blacklisted for reading
    incomplete_dids = []       # some files downloaded, some failed
    local_dids = []            # everything was already on disk
    no_progress_dids = []      # nothing new (all local or failed)
    failed_dids = []           # every file failed
    successful_downloads = 0

    if not args.dry_run:
        write_or_append = 'a' if args.append else 'w'
        ofile = open(args.output, write_or_append)
        ofile.write("#" * 80 + "\n")
        ofile.write("RUCIO OUTPUT")

    # First pass: count datasets so progress can be shown as [i/N].
    # Strip before the '#' check so indented comment lines are not
    # counted (the download loop below strips first, so the two passes
    # previously disagreed and percentages could be skewed).
    ifile = open(args.input_datasets, 'r')
    n_datasets = 0
    for dataset in ifile:
        dataset = dataset.strip()
        if not dataset or dataset.startswith("#"): continue
        n_datasets += 1
    ifile.seek(0)

    # Guard: an input file with no usable datasets would otherwise hit a
    # ZeroDivisionError in the percentage math below
    if not n_datasets:
        print "No datasets found in", args.input_datasets
        ifile.close()
        if not args.dry_run:
            ofile.close()
        return

    # Download each dataset
    print "Downloading datasets:"
    count = 0
    for dataset in ifile:
        dataset = dataset.strip()
        if not dataset or dataset.startswith("#"): continue
        count += 1

        # Run rucio command
        rucio_cmd = 'rucio get %s --ndownloader 5 --dir %s' % (dataset,
                                                               args.sample_dir)
        progress_str = "[%d/%d] %s" % (count, n_datasets, dataset)
        print progress_str
        if args.dry_run: continue
        ofile.write("\n%s\n" % progress_str)
        ofile.write("Running >> %s\n" % rucio_cmd)

        rucio_output = tools.get_cmd_output(rucio_cmd, print_output=True)

        # Write output and extract relevant information
        total = n_downloaded = n_local = n_failed = 0
        for line in rucio_output:
            line = tools.strip_ansi_escape(line)

            if 'Failed to get did info' in line:
                not_found_dids.append(dataset)
                break
            elif 'successfully downloaded from' in line:
                successful_sites.add(line.split()[-1].strip())
            elif 'is blacklisted for reading' in line:
                bad_sites.add(line.split(':')[-1].split()[0].strip())
            elif 'Total files' in line:
                total = int(line.split()[-1].strip())
            elif 'Downloaded files' in line:
                n_downloaded = int(line.split()[-1].strip())
            elif 'Files already found locally' in line:
                n_local = int(line.split()[-1].strip())
            elif 'Files that cannot be downloaded' in line:
                n_failed = int(line.split()[-1].strip())

            # Unless --save-all, drop rucio's verbose INFO lines from the log
            if not args.save_all and "INFO" in line: continue
            ofile.write("\t%s\n" % line.strip())

        # Determine status of download from the extracted counters
        if not total: continue
        elif total == n_local: local_dids.append(dataset)
        elif total == n_failed: failed_dids.append(dataset)
        elif total == n_local + n_downloaded: successful_downloads += 1
        elif total == n_local + n_failed: no_progress_dids.append(dataset)
        elif n_downloaded and n_failed: incomplete_dids.append(dataset)
        else:
            print "Unexpected info from rucio (2)"
            print "%d != %d + %d + %d" % (total, n_downloaded, n_local,
                                          n_failed)

    ifile.close()
    if args.dry_run:
        print "End of dry run"
        return

    # Print summary information
    summary_str = "\n\n" + "#" * 80 + "\n"
    summary_str += "Dataset Download Summary\n"
    summary_str += " - %d total datasets\n" % (n_datasets)
    summary_str += " - %d (%6.2f%%) downloads successful\n" % (
        successful_downloads, successful_downloads / float(n_datasets) * 100)
    summary_str += " - %d (%6.2f%%) downloads incomplete\n" % (
        len(incomplete_dids), len(incomplete_dids) / float(n_datasets) * 100)
    summary_str += " - %d (%6.2f%%) downloads already local\n" % (
        len(local_dids), len(local_dids) / float(n_datasets) * 100)
    summary_str += " - %d (%6.2f%%) downloads added nothing new\n" % (
        len(no_progress_dids), len(no_progress_dids) / float(n_datasets) * 100)
    summary_str += " - %d (%6.2f%%) downloads with no success\n" % (
        len(failed_dids), len(failed_dids) / float(n_datasets) * 100)
    summary_str += " - %d (%6.2f%%) datasets not found\n" % (
        len(not_found_dids), len(not_found_dids) / float(n_datasets) * 100)

    if successful_sites: summary_str += "\nSites with successful downloads:\n"
    for rse in successful_sites:
        summary_str += " >> %s\n" % rse
    if bad_sites: summary_str += "\nSites with failed downloads:\n"
    for rse in bad_sites:
        summary_str += " >> %s\n" % rse

    if not_found_dids: summary_str += "\nDIDs not found on rucio:\n"
    for did in not_found_dids:
        summary_str += " >> %s\n" % did
    if incomplete_dids: summary_str += "\nIncomplete download DIDs:\n"
    for did in incomplete_dids:
        summary_str += " >> %s\n" % did
    if local_dids: summary_str += "\nDIDs already stored locally:\n"
    for did in local_dids:
        summary_str += " >> %s\n" % did
    if no_progress_dids: summary_str += "\nDIDs adding nothing new:\n"
    for did in no_progress_dids:
        summary_str += " >> %s\n" % did
    if failed_dids: summary_str += "\nDIDs with no success:\n"
    for did in failed_dids:
        summary_str += " >> %s\n" % did
    ofile.write(summary_str)
    ofile.close()

    print "Output written to", args.output
    print "Samples downloaded to", args.sample_dir
    print "===== COMPLETED ====="