def rucio_file_exists(sample):
    """ Check if `rucio ls` returns the sample """
    # Check if sample name contains wildcards
    if '*' in sample or '?' in sample:
        print "Attempting wildcard search with sample: ", sample
        print "\tUse exact file name"
        sys.exit()

    # Get rucio ls output
    rucio_cmd = 'rucio ls ' + sample
    rucio_output = tools.get_cmd_output(rucio_cmd)

    # Check rucio ls output
    return any(sample in x for x in rucio_output)
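# NOTE :: `tools.get_cmd_output` is the helper (defined elsewhere in this repo)
# used throughout to run shell commands. The functions here assume it returns
# the command's output as a list of lines and, as in main() below, accepts a
# `print_output` keyword. A minimal sketch of that assumed behavior:
#
#   import subprocess
#   def get_cmd_output(cmd, print_output=False):
#       """Run `cmd` in a shell; return stdout (+stderr) split into lines."""
#       output = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
#                                 stderr=subprocess.STDOUT).communicate()[0]
#       if print_output:
#           print output
#       return output.splitlines()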
def write_file_info(sample):
    """ Add file information from rucio list-files """
    # Get rucio list-files output
    rucio_cmd = 'rucio list-files ' + sample
    rucio_output = tools.get_cmd_output(rucio_cmd)

    info_to_write = '\t\tFILE STATS - '
    for line in rucio_output:
        if 'Total' not in line:
            continue
        info = line.strip()[len('Total '):]
        info_to_write += "%s, " % info
    # remove trailing comma and add newline
    info_to_write = info_to_write[:-2] + '\n'
    return info_to_write
def write_replica_info(sample):
    """ Add dataset replica information from rucio list-dataset-replicas """
    # Get rucio list-dataset-replicas output
    rucio_cmd = 'rucio list-dataset-replicas ' + sample
    rucio_output = tools.get_cmd_output(rucio_cmd)

    # Check if sample is split
    n_samples = len([x for x in rucio_output if "DATASET" in x])
    is_split = n_samples > 1

    prefix = '\t\tREPLICA - '
    info_to_write = prefix if not is_split else ''
    reject_patterns = ['DATASET', 'RSE', '---']  # don't grab from these lines
    for line in rucio_output:
        line = line.strip()
        # Add new line for each split sample
        if is_split and 'DATASET' in line:
            info_to_write += '\n' if info_to_write else ''
            split_id_start = line.find(sample) + len(sample)
            split_id = line[split_id_start:]
            info_to_write += prefix + '(%s) ' % split_id
        # Get replica information
        if not line or any(x in line for x in reject_patterns):
            continue
        RSE_site = line.split()[1]
        files_found = int(line.split()[3])
        files_expected = int(line.split()[5])
        flag = '*' if files_found < files_expected else ''
        info_to_write += "%s (%d/%d)%s, " % (RSE_site, files_found, files_expected, flag)
    # remove trailing comma and add newline
    info_to_write = info_to_write[:-2] + '\n'
    return info_to_write
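# For reference, the two write_* helpers above return single tab-indented lines.
# Illustrative (hypothetical) examples of the strings they build:
#   write_file_info    -> "\t\tFILE STATS - files : 500, size : 1.2 TB, events : 100000\n"
#   write_replica_info -> "\t\tREPLICA - SITE_DATADISK (500/500), SITE_MCTAPE (298/500)*\n"
# where the trailing '*' flags a site with fewer files found than expected.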
def get_rse_info(sample):
    """
    Get dataset RSE information from rucio list-dataset-replicas

    params:
        sample (str): Dataset Identification (i.e. scope:name)

    returns:
        (dict) : map from RSE sites to information about samples on that site.
            Information includes available and total files per site and, if
            necessary, the ID tag appended to the DID for cases where the
            sample is split across multiple sites
            example:
                {'BNL-OSG2_MCTAPE'   : ['id_tag1 (500/500)', 'id_tag2 (298/298)'],
                 'BNL-OSG2_DATADISK' : ['id_tag2 (500/500)']}
    """
    # Get rucio list-dataset-replicas output
    rucio_cmd = 'rucio list-dataset-replicas ' + sample
    rucio_output = tools.get_cmd_output(rucio_cmd)

    # Check if sample is split
    n_samples = sum(1 for x in rucio_output if "DATASET" in x)
    is_split = n_samples > 1

    # Initialize tools for extraction
    reject_patterns = ['DATASET', 'RSE', '---', 'SCRATCHDISK']  # don't grab from these lines
    RSE_sites = {}
    id_found_expected = defaultdict(lambda: [0, 0])
    info = defaultdict(list)

    def prepare_rse_info(info_map):
        # Rearrange RSE information into desired format
        if not info_map:
            return
        # Aggregate found/expected file counts per RSE across split IDs
        for split_id, rse_info_list in info_map.iteritems():
            for info_list in rse_info_list:
                RSE, found, exp, flag = info_list
                id_found_expected[RSE][0] += found
                id_found_expected[RSE][1] += exp
        for RSE, (found, exp) in id_found_expected.iteritems():
            flag = "*" if found != exp else ''
            RSE_sites[RSE] = '(%d/%d)%s' % (found, exp, flag)
        info.clear()

    # Extract information from rucio output
    for line in rucio_output:
        line = line.strip()
        # Get dataset info
        if 'DATASET' in line:
            # Save previous dataset info if any
            prepare_rse_info(info)
            # Get tag added to DID when split
            split_id_start = line.find(sample) + len(sample)
            split_id = line[split_id_start:]
        # Get replica information
        if not line or any(x in line for x in reject_patterns):
            continue
        RSE_site = line.split()[1]
        files_found = int(line.split()[3])
        files_expected = int(line.split()[5])
        flag = '*' if files_found < files_expected else ''
        rse_info = [RSE_site, files_found, files_expected, flag]
        info[split_id].append(rse_info)
    prepare_rse_info(info)

    return RSE_sites
def get_file_info(sample):
    """
    Get file information from rucio list-files

    params:
        sample (str): Dataset Identification (i.e. scope:name)

    returns:
        (int) : number of files
        (str) : size of all files, in the requested units (args.units)
        (int) : number of events in all files

        Return values are empty strings if not found in rucio output
    """
    # Get rucio list-files output
    rucio_cmd = 'rucio list-files ' + sample
    rucio_output = tools.get_cmd_output(rucio_cmd)

    n_files = size = n_events = ''
    for line in rucio_output:
        if 'Total' not in line:
            continue
        elif 'Total files' in line:
            n_files = int(line.strip().split()[-1])
        elif 'Total size' in line:
            size = float(line.strip().split()[-2])
            if size == 0:
                size = '0'
                continue
            # Convert everything to the same units (TB)
            units = line.strip().split()[-1]
            if units == 'TB':
                scale = 1
            elif units == 'GB':
                scale = 1e-3
            elif units == 'MB':
                scale = 1e-6
            elif units == 'KB':
                scale = 1e-9
            elif units == 'B':
                scale = 1e-12
            else:
                print "Unexpected units: (%s)->(%s)" % (line, units)
                scale = 1  # fall back to assuming TB
            size *= scale
            # Convert to desired units
            if args.units == 'TB':
                scale = 1
            elif args.units == 'GB':
                scale = 1e+3
            elif args.units == 'MB':
                scale = 1e+6
            elif args.units == 'KB':
                scale = 1e+9
            elif args.units == 'B':
                scale = 1e+12
            size *= scale
            # Include at least 3 significant figures
            power = int(log10(abs(size)))
            if power <= -9:
                prec = 12
            elif power <= -6:
                prec = 9
            elif power <= -3:
                prec = 6
            elif power <= 2:
                prec = 3
            else:
                prec = 0
            size = str(round(size, prec))
        elif 'Total events' in line:
            n_events = int(line.strip().split()[-1])
        else:
            print "WARNING :: Unknown file info:", line
    return n_files, size, n_events
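# Example usage of the query helpers above (a minimal sketch; the DID below is
# hypothetical and `args.units` is assumed to be set by the script's argument parser):
#
#   sample = 'mc16_13TeV:mc16_13TeV.123456.SomeSample.DAOD_EXAMPLE'
#   if rucio_file_exists(sample):
#       n_files, size, n_events = get_file_info(sample)
#       print "%s: %s files, %s %s, %s events" % (sample, n_files, size, args.units, n_events)
#       for rse, counts in get_rse_info(sample).iteritems():
#           print "\t%s %s" % (rse, counts)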
def main():
    """ Main Function """
    global args

    # Sanity checks
    if not os.path.exists(args.input_datasets):
        print "ERROR :: Input file not found:", args.input_datasets
        sys.exit()
    if os.path.exists(args.output) and not args.append:
        print "ERROR :: Output file already exists:", args.output
        print "\tDelete it, change output name, or use '--append' option"
        sys.exit()
    if not os.path.exists(args.output) and args.append:
        print "ERROR :: Cannot append. Output file doesn't exist:", args.output
        sys.exit()
    if not os.path.isdir(args.sample_dir):
        print "ERROR :: Cannot find sample directory:", args.sample_dir
        sys.exit()
    check_environment()
    print "All checks cleared"

    # Initialize
    print "\n===== BEGIN ====="
    not_found_dids = []
    successful_sites = set()
    bad_sites = set()
    incomplete_dids = []
    local_dids = []
    no_progress_dids = []
    failed_dids = []
    successful_downloads = 0
    if not args.dry_run:
        write_or_append = 'a' if args.append else 'w'
        ofile = open(args.output, write_or_append)
        ofile.write("#" * 80 + "\n")
        ofile.write("RUCIO OUTPUT")

    # Count datasets to be downloaded
    ifile = open(args.input_datasets, 'r')
    n_datasets = 0
    for dataset in ifile:
        if not dataset.strip() or dataset.startswith("#"):
            continue
        n_datasets += 1
    ifile.seek(0)

    # Download each dataset
    print "Downloading datasets:"
    count = 0
    for dataset in ifile:
        dataset = dataset.strip()
        if not dataset or dataset.startswith("#"):
            continue
        count += 1

        # Run rucio command
        rucio_cmd = 'rucio get %s --ndownloader 5 --dir %s' % (dataset, args.sample_dir)
        progress_str = "[%d/%d] %s" % (count, n_datasets, dataset)
        print progress_str
        if args.dry_run:
            continue
        ofile.write("\n%s\n" % progress_str)
        ofile.write("Running >> %s\n" % rucio_cmd)
        rucio_output = tools.get_cmd_output(rucio_cmd, print_output=True)

        # Write output and extract relevant information
        total = n_downloaded = n_local = n_failed = 0
        for line in rucio_output:
            line = tools.strip_ansi_escape(line)
            if 'Failed to get did info' in line:
                not_found_dids.append(dataset)
                break
            elif 'successfully downloaded from' in line:
                successful_sites.add(line.split()[-1].strip())
            elif 'is blacklisted for reading' in line:
                bad_sites.add(line.split(':')[-1].split()[0].strip())
            elif 'Total files' in line:
                total = int(line.split()[-1].strip())
            elif 'Downloaded files' in line:
                n_downloaded = int(line.split()[-1].strip())
            elif 'Files already found locally' in line:
                n_local = int(line.split()[-1].strip())
            elif 'Files that cannot be downloaded' in line:
                n_failed = int(line.split()[-1].strip())
            if not args.save_all and "INFO" in line:
                continue
            ofile.write("\t%s\n" % line.strip())

        # Determine status of download
        if not total:
            continue
        elif total == n_local:
            local_dids.append(dataset)
        elif total == n_failed:
            failed_dids.append(dataset)
        elif total == n_local + n_downloaded:
            successful_downloads += 1
        elif total == n_local + n_failed:
            no_progress_dids.append(dataset)
        elif n_downloaded and n_failed:
            incomplete_dids.append(dataset)
        else:
            print "Unexpected info from rucio (2)"
            print "%d != %d + %d + %d" % (total, n_downloaded, n_local, n_failed)
    ifile.close()

    if args.dry_run:
        print "End of dry run"
        return

    # Print summary information
    summary_str = "\n\n" + "#" * 80 + "\n"
    summary_str += "Dataset Download Summary\n"
    summary_str += " - %d total datasets\n" % (n_datasets)
    summary_str += " - %d (%6.2f%%) downloads successful\n" % (
        successful_downloads, successful_downloads / float(n_datasets) * 100)
    summary_str += " - %d (%6.2f%%) downloads incomplete\n" % (
        len(incomplete_dids), len(incomplete_dids) / float(n_datasets) * 100)
    summary_str += " - %d (%6.2f%%) downloads already local\n" % (
        len(local_dids), len(local_dids) / float(n_datasets) * 100)
    summary_str += " - %d (%6.2f%%) downloads added nothing new\n" % (
        len(no_progress_dids), len(no_progress_dids) / float(n_datasets) * 100)
    summary_str += " - %d (%6.2f%%) downloads with no success\n" % (
        len(failed_dids), len(failed_dids) / float(n_datasets) * 100)
    summary_str += " - %d (%6.2f%%) datasets not found\n" % (
        len(not_found_dids), len(not_found_dids) / float(n_datasets) * 100)
    if successful_sites:
        summary_str += "\nSites with successful downloads:\n"
        for rse in successful_sites:
            summary_str += " >> %s\n" % rse
    if bad_sites:
        summary_str += "\nSites with failed downloads:\n"
        for rse in bad_sites:
            summary_str += " >> %s\n" % rse
    if not_found_dids:
        summary_str += "\nDIDs not found on rucio:\n"
        for did in not_found_dids:
            summary_str += " >> %s\n" % did
    if incomplete_dids:
        summary_str += "\nIncomplete download DIDs:\n"
        for did in incomplete_dids:
            summary_str += " >> %s\n" % did
    if local_dids:
        summary_str += "\nDIDs already stored locally:\n"
        for did in local_dids:
            summary_str += " >> %s\n" % did
    if no_progress_dids:
        summary_str += "\nDIDs that added nothing new:\n"
        for did in no_progress_dids:
            summary_str += " >> %s\n" % did
    if failed_dids:
        summary_str += "\nDIDs with no success:\n"
        for did in failed_dids:
            summary_str += " >> %s\n" % did

    #ofile.seek(0)
    ofile.write(summary_str)
    ofile.close()

    print "Output written to", args.output
    print "Samples downloaded to", args.sample_dir
    print "===== COMPLETED ====="