import os

# `command` and `command_patterns` are assumed to be the project's
# shell-execution helpers (execute, execute_with_output, execute_with_retries,
# glob) and are expected to be importable in this module.


def chunk_input(self, input_files, chunksize):
    """Chunk input files into pieces for performance and parallelism."""
    part_lists = []  # Lists of partial files
    known_nlines = None
    part_suffix = ""
    chunk_nlines = chunksize * 2

    for input_file in input_files:
        # Count the number of lines in the file
        nlines = int(
            command.execute_with_output("wc -l %s" % input_file).strip().split()[0])

        # Paired files must have matching line counts
        if known_nlines is not None:
            msg = "Mismatched line counts in supposedly paired files: {}".format(
                input_files)
            assert nlines == known_nlines, msg
        known_nlines = nlines

        # Compute the number of pieces and their names
        numparts = (nlines + chunk_nlines - 1) // chunk_nlines
        ndigits = len(str(numparts - 1))
        part_suffix = "-chunksize-%d-numparts-%d-part-" % (chunksize, numparts)
        out_prefix_base = os.path.basename(input_file) + part_suffix
        out_prefix = os.path.join(self.chunks_result_dir_local, out_prefix_base)

        # Split the large file into smaller, numerically suffixed pieces
        command.execute("split -a %d --numeric-suffixes -l %d %s %s" %
                        (ndigits, chunk_nlines, input_file, out_prefix))
        command.execute_with_retries(
            f"aws s3 sync --only-show-errors {self.chunks_result_dir_local}/ "
            f"{self.chunks_result_dir_s3}/ --exclude '*' --include '{out_prefix_base}*'"
        )

        # Collect the partial file names
        partial_files = []
        paths = command.execute_with_output(
            "ls %s*" % out_prefix).rstrip().split("\n")
        for pf in paths:
            partial_files.append(os.path.basename(pf))

        # Check that the partial files match the expected chunking pattern
        pattern = "{:0%dd}" % ndigits
        expected_partial_files = [(out_prefix_base + pattern.format(i))
                                  for i in range(numparts)]
        msg = "something went wrong with chunking: {} != {}".format(
            partial_files, expected_partial_files)
        assert expected_partial_files == partial_files, msg
        part_lists.append(partial_files)

    # Ex: [["input_R1.fasta-part-1", "input_R2.fasta-part-1"],
    #      ["input_R1.fasta-part-2", "input_R2.fasta-part-2"],
    #      ["input_R1.fasta-part-3", "input_R2.fasta-part-3"], ...]
    input_chunks = [list(part) for part in zip(*part_lists)]
    return part_suffix, input_chunks
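# A minimal standalone sketch of the chunking arithmetic used above. The
# helper name is hypothetical, and the chunksize-to-lines doubling is an
# assumption here: it matches two-line FASTA records (one header line plus
# one sequence line per read).
def _expected_chunk_layout(nlines, chunksize):
    chunk_nlines = chunksize * 2
    numparts = (nlines + chunk_nlines - 1) // chunk_nlines  # ceiling division
    ndigits = len(str(numparts - 1))  # width of split's numeric suffix
    return numparts, ndigits

# e.g. a 1,000,001-line file with chunksize=25000 needs
# ceil(1000001 / 50000) = 21 parts, named with 2-digit suffixes 00..20.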
def delete_tag_with_retries(instance_id, tag_key):
    command.execute_with_retries(
        command_patterns.SingleCommand(
            cmd="aws",
            args=[
                "ec2", "delete-tags",
                "--resources", instance_id,
                "--tags", f"Key={tag_key}"
            ]))
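# Illustrative usage; the instance id and tag key below are hypothetical
# placeholders, not values taken from this codebase:
#
#   delete_tag_with_retries("i-0123456789abcdef0", "alignment-in-progress")
#
# The win over string interpolation, assuming SingleCommand passes each arg
# to the process directly, is that values containing spaces or shell
# metacharacters are never re-interpreted by a shell.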
def chunk_input(self, input_files, chunksize):
    """Chunk input files into pieces for performance and parallelism."""
    part_lists = []  # Lists of partial files
    known_nlines = None
    part_suffix = ""
    chunk_nlines = chunksize * 2

    for input_file in input_files:
        # Count the number of lines in the file
        cmd_output = command.execute_with_output(
            command_patterns.SingleCommand(cmd="wc", args=["-l", input_file]))
        nlines = int(cmd_output.strip().split()[0])

        # Paired files must have matching line counts
        if known_nlines is not None:
            msg = "Mismatched line counts in supposedly paired files: {}".format(
                input_files)
            assert nlines == known_nlines, msg
        known_nlines = nlines

        # Compute the number of pieces and their names
        numparts = (nlines + chunk_nlines - 1) // chunk_nlines
        ndigits = len(str(numparts - 1))
        part_suffix = "-chunksize-%d-numparts-%d-part-" % (chunksize, numparts)
        out_prefix_base = os.path.basename(input_file) + part_suffix
        out_prefix = os.path.join(self.chunks_result_dir_local, out_prefix_base)

        # Split the large file into smaller, numerically suffixed pieces
        command.execute(
            command_patterns.SingleCommand(
                cmd="split",
                args=[
                    "-a", ndigits, "--numeric-suffixes", "-l", chunk_nlines,
                    input_file, out_prefix
                ]))
        command.execute_with_retries(
            command_patterns.SingleCommand(
                cmd="aws",
                args=[
                    "s3", "sync", "--only-show-errors",
                    os.path.join(self.chunks_result_dir_local, ""),
                    os.path.join(self.chunks_result_dir_s3, ""),
                    "--exclude", "*", "--include", out_prefix_base + "*"
                ]))

        # Collect the partial file names
        partial_files = []
        paths = command.glob(glob_pattern=out_prefix + "*",
                             strip_folder_names=True)
        partial_files.extend(paths)

        # Check that the partial files match the expected chunking pattern
        pattern = "{:0%dd}" % ndigits
        expected_partial_files = [(out_prefix_base + pattern.format(i))
                                  for i in range(numparts)]
        msg = "something went wrong with chunking: {} != {}".format(
            partial_files, expected_partial_files)
        assert expected_partial_files == partial_files, msg
        part_lists.append(partial_files)

    # Ex: [["input_R1.fasta-part-1", "input_R2.fasta-part-1"],
    #      ["input_R1.fasta-part-2", "input_R2.fasta-part-2"],
    #      ["input_R1.fasta-part-3", "input_R2.fasta-part-3"], ...]
    input_chunks = [list(part) for part in zip(*part_lists)]
    return part_suffix, input_chunks
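# A small worked example of the final interleave step: zip(*part_lists) pairs
# the i-th chunk of each input file. The file names here are illustrative
# only, not the real suffix format produced above.
#
#   part_lists = [["R1-part-00", "R1-part-01"],
#                 ["R2-part-00", "R2-part-01"]]
#   [list(p) for p in zip(*part_lists)]
#   # -> [["R1-part-00", "R2-part-00"], ["R1-part-01", "R2-part-01"]]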
def delete_tag_with_retries(instance_id, tag_key):
    command.execute_with_retries(
        f"aws ec2 delete-tags --resources {instance_id} --tags Key={tag_key}")
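# A hedged hardening sketch for the string-based variant above, assuming
# command.execute_with_retries hands the string to a shell: quoting the
# interpolated values with shlex.quote prevents word splitting and
# metacharacter expansion. The function name is hypothetical.
import shlex


def delete_tag_with_retries_quoted(instance_id, tag_key):
    command.execute_with_retries(
        "aws ec2 delete-tags --resources %s --tags Key=%s"
        % (shlex.quote(instance_id), shlex.quote(tag_key)))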