def test_glob_with_limit(self):
    '''WHEN glob is invoked with a limit, THEN it returns folders matching the pattern'''
    # Ask for at most one match under the parent folder; exactly one of the
    # two known subfolders should come back (which one is unspecified).
    result = command.glob(f"{TMP_PARENT_FOLDER}/*", max_results=1)
    expected_candidates = {TMP_SRC_FOLDER, TMP_DEST_FOLDER}
    matched = expected_candidates.intersection(result)
    self.assertTrue(
        len(matched) == 1,
        f"result should be an array containing either {TMP_SRC_FOLDER} or {TMP_DEST_FOLDER}, but result contains {{ {result} }}"
    )
def run(self):
    """Align input FASTAs against the bowtie2 genome index and write out
    the reads that did NOT map.

    Reads step configuration from additional_files["bowtie2_genome"] and
    additional_attributes["output_sam_file"]; registers the intermediate
    SAM file as a hidden output.
    """
    input_fas = self.input_fas()
    output_fas = self.output_files_local()
    genome_dir = fetch_reference(
        self.additional_files["bowtie2_genome"],
        self.ref_dir_local,
        allow_s3mi=True,
        auto_untar=True)
    output_sam_file = os.path.join(
        self.output_dir_local,
        self.additional_attributes["output_sam_file"])
    self.additional_output_files_hidden.append(output_sam_file)

    # The untarred reference looks like
    # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2".
    # Grab any one index file, then strip two extensions to recover the
    # index basename bowtie2 expects,
    # e.g. hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
    genome_basename = command.glob(f"{genome_dir}/*.bt2*", max_results=1)[0]
    genome_basename = os.path.splitext(os.path.splitext(genome_basename)[0])[0]

    paired = len(input_fas) == 2
    bowtie2_params = [
        '-q',
        '-x', genome_basename,
        '-f',
        '--very-sensitive-local',
        '-S', output_sam_file,
    ]
    # --seed cannot be used with -p multithreading.
    # We have observed the lack of multithreading resulting in severe
    # performance degradation in some cases, so for the time being
    # multithreading is being chosen over determinism.
    # To seed bowtie2 do something similar to:
    # bowtie2_params.extend(['--seed', '4'])
    bowtie2_params += ['-p', str(multiprocessing.cpu_count())]
    if paired:
        bowtie2_params += ['-1', input_fas[0], '-2', input_fas[1]]
    else:
        bowtie2_params += ['-U', input_fas[0]]

    # Example:
    # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
    # --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
    # -p 32 \
    # -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
    command.execute(
        command_patterns.SingleCommand(
            cmd='bowtie2',
            args=bowtie2_params
        )
    )
    log.write("Finished Bowtie alignment.")

    # Extract the unmapped reads from the SAM into the step outputs.
    if paired:
        convert.generate_unmapped_pairs_from_sam(output_sam_file, output_fas)
    else:
        convert.generate_unmapped_singles_from_sam(output_sam_file, output_fas[0])
def run(self):
    """Align input FASTAs against the bowtie2 genome index and write out
    the reads that did NOT map.

    Reads step configuration from additional_files["bowtie2_genome"] and
    additional_attributes["output_sam_file"]. If additional_attributes
    contains "random_seed", bowtie2 runs single-threaded with that seed
    for deterministic output; otherwise it runs multithreaded (--seed and
    -p are mutually exclusive in bowtie2). The intermediate SAM file is
    registered as a hidden output.
    """
    input_fas = self.input_files_local[0][0:2]
    output_fas = self.output_files_local()
    genome_dir = fetch_reference(
        self.additional_files["bowtie2_genome"],
        self.ref_dir_local,
        allow_s3mi=True,
        auto_untar=True)
    output_sam_file = os.path.join(
        self.output_dir_local,
        self.additional_attributes["output_sam_file"])
    self.additional_output_files_hidden.append(output_sam_file)

    # The file structure looks like
    # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"
    genome_basename = command.glob(f"{genome_dir}/*.bt2*", max_results=1)[0]
    # remove two extensions: ex: hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
    genome_basename = os.path.splitext(os.path.splitext(genome_basename)[0])[0]
    bowtie2_params = [
        '-q',
        '-x', genome_basename,
        '-f',
        '--very-sensitive-local',
        '-S', output_sam_file
    ]

    seed = self.additional_attributes.get("random_seed")
    # BUGFIX: compare against None rather than truthiness — an explicit
    # seed of 0 is a valid bowtie2 seed and must still select the
    # deterministic (seeded, single-threaded) branch.
    if seed is not None:
        bowtie2_params.extend(['--seed', str(seed)])
    else:
        # Seed option won't work with -p threading option.
        bowtie2_params.extend(['-p', str(multiprocessing.cpu_count())])

    if len(input_fas) == 2:
        bowtie2_params.extend(['-1', input_fas[0], '-2', input_fas[1]])
    else:
        bowtie2_params.extend(['-U', input_fas[0]])

    # Example:
    # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
    # --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
    # -p 32 \
    # -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
    command.execute(
        command_patterns.SingleCommand(
            cmd='bowtie2',
            args=bowtie2_params
        )
    )
    log.write("Finished Bowtie alignment.")

    if len(input_fas) == 2:
        convert.generate_unmapped_pairs_from_sam(output_sam_file, output_fas)
    else:
        convert.generate_unmapped_singles_from_sam(output_sam_file, output_fas[0])
def run(self):
    """Run a seeded (deterministic) bowtie2 alignment of the input FASTAs
    against the genome index and write out the reads that did NOT map.

    Reads step configuration from additional_files["bowtie2_genome"] and
    additional_attributes["output_sam_file"]; registers the intermediate
    SAM file as a hidden output.
    """
    input_fas = self.input_fas()
    output_fas = self.output_files_local()
    genome_dir = fetch_reference(self.additional_files["bowtie2_genome"],
                                 self.ref_dir_local,
                                 allow_s3mi=True,
                                 auto_untar=True)
    output_sam_file = os.path.join(
        self.output_dir_local,
        self.additional_attributes["output_sam_file"])
    self.additional_output_files_hidden.append(output_sam_file)

    # The untarred reference looks like
    # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"; take any one
    # index file and strip two extensions to recover the index basename,
    # e.g. hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
    index_file = command.glob(f"{genome_dir}/*.bt2*", max_results=1)[0]
    genome_basename = os.path.splitext(os.path.splitext(index_file)[0])[0]

    # FIXME: https://jira.czi.team/browse/IDSEQ-2738
    # We want to move towards a general randomness solution in which
    # all randomness is seeded based on the content of the original input.
    # This is currently introducing non-determinism and hard coding
    # an arbitrary seed here shouldn't impact correctness.
    bowtie2_params = [
        '-q',
        '-x', genome_basename,
        '-f',
        '--very-sensitive-local',
        '-S', output_sam_file,
        '--seed', '4',  # chosen by fair dice roll, guaranteed to be random
    ]

    if len(input_fas) == 2:
        bowtie2_params += ['-1', input_fas[0], '-2', input_fas[1]]
    else:
        bowtie2_params += ['-U', input_fas[0]]

    # Example:
    # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
    # --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
    # -p 32 \
    # -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
    command.execute(
        command_patterns.SingleCommand(cmd='bowtie2', args=bowtie2_params))
    log.write("Finished Bowtie alignment.")

    # Extract the unmapped reads from the SAM into the step outputs.
    if len(input_fas) == 2:
        convert.generate_unmapped_pairs_from_sam(output_sam_file, output_fas)
    else:
        convert.generate_unmapped_singles_from_sam(output_sam_file, output_fas[0])
def chunk_input(self, input_files, chunksize):
    """Chunk input files into pieces for performance and parallelism.

    Splits each file in `input_files` into pieces of `chunksize` records
    (2 lines per record, so FASTA-style 2-line entries — TODO confirm),
    uploads the pieces to S3, and regroups them so that the i-th chunk of
    every input file travels together.

    Args:
        input_files: paths of the files to split; if more than one, they
            are assumed to be paired and must have identical line counts
            (enforced by assertion).
        chunksize: number of 2-line records per chunk.

    Returns:
        (part_suffix, input_chunks) where part_suffix is the naming suffix
        shared by all chunk files (from the LAST input file processed —
        identical across files since line counts must match), and
        input_chunks is a list of [chunk_of_file1, chunk_of_file2, ...]
        groups, one group per chunk index.
    """
    part_lists = []  # Lists of partial files
    known_nlines = None  # line count of the first file; paired files must match it
    part_suffix = ""
    chunk_nlines = chunksize * 2  # 2 lines per record

    for input_file in input_files:
        # Count number of lines in the file
        cmd_output = command.execute_with_output(
            command_patterns.SingleCommand(
                cmd="wc",
                args=["-l", input_file]))
        nlines = int(cmd_output.strip().split()[0])
        # Number of lines should be the same in paired files
        if known_nlines is not None:
            msg = "Mismatched line counts in supposedly paired files: {}".format(
                input_files)
            assert nlines == known_nlines, msg
        known_nlines = nlines

        # Set number of pieces and names
        # numparts = ceil(nlines / chunk_nlines); ndigits sized so that
        # zero-padded numeric suffixes sort in chunk order.
        numparts = (nlines + chunk_nlines - 1) // chunk_nlines
        ndigits = len(str(numparts - 1))
        part_suffix = "-chunksize-%d-numparts-%d-part-" % (chunksize, numparts)
        out_prefix_base = os.path.basename(input_file) + part_suffix
        out_prefix = os.path.join(self.chunks_result_dir_local, out_prefix_base)

        # Split large file into smaller named pieces
        command.execute(
            command_patterns.SingleCommand(
                cmd="split",
                args=["-a", ndigits, "--numeric-suffixes",
                      "-l", chunk_nlines, input_file, out_prefix]))
        # Mirror just this file's chunks to S3 (exclude everything else in
        # the chunks dir); retried because sync is network-dependent.
        command.execute_with_retries(
            command_patterns.SingleCommand(
                cmd="aws",
                args=[
                    "s3", "sync", "--only-show-errors",
                    os.path.join(self.chunks_result_dir_local, ""),
                    os.path.join(self.chunks_result_dir_s3, ""),
                    "--exclude", "*", "--include", out_prefix_base + "*"
                ]))

        # Get the partial file names
        partial_files = []
        paths = command.glob(
            glob_pattern=out_prefix + "*",
            strip_folder_names=True)
        partial_files.extend(paths)

        # Check that the partial files match our expected chunking pattern
        # (glob results are assumed lexically sorted so that the zero-padded
        # suffixes line up with range(numparts) — TODO confirm in command.glob)
        pattern = "{:0%dd}" % ndigits
        expected_partial_files = [(out_prefix_base + pattern.format(i))
                                  for i in range(numparts)]
        msg = "something went wrong with chunking: {} != {}".format(
            partial_files, expected_partial_files)
        assert expected_partial_files == partial_files, msg
        part_lists.append(partial_files)

    # Ex: [["input_R1.fasta-part-1", "input_R2.fasta-part-1"],
    # ["input_R1.fasta-part-2", "input_R2.fasta-part-2"],
    # ["input_R1.fasta-part-3", "input_R2.fasta-part-3"],...]
    input_chunks = [list(part) for part in zip(*part_lists)]
    return part_suffix, input_chunks
def test_glob(self):
    '''WHEN glob is invoked, THEN it returns folders matching the pattern'''
    folders = command.glob(f"{TMP_PARENT_FOLDER}/*")
    # assertCountEqual has a very misleading name: it actually compares two
    # sequences disregarding item order - https://bugs.python.org/issue27071
    self.assertCountEqual(folders, [TMP_SRC_FOLDER, TMP_DEST_FOLDER])