Example #1
0
    def test_glob_with_limit(self):
        '''WHEN glob is invoked with a limit, THEN it returns folders matching the pattern'''
        result = command.glob(f"{TMP_PARENT_FOLDER}/*", max_results=1)

        # Exactly one of the two candidate folders should appear in the
        # (limited) result — which one is unspecified.
        candidates = {TMP_SRC_FOLDER, TMP_DEST_FOLDER}
        matches = candidates.intersection(result)
        self.assertTrue(
            len(matches) == 1,
            f"result should be an array containing either {TMP_SRC_FOLDER} or {TMP_DEST_FOLDER}, but result contains {{ {result} }}"
        )
Example #2
0
    def run(self):
        """Align input reads against the bowtie2 genome and emit unmapped reads.

        Fetches (and untars) the reference genome, runs bowtie2 over the
        input FASTAs writing a SAM file, then converts the unmapped reads
        from the SAM output back into FASTA files.
        """
        fastas_in = self.input_fas()
        fastas_out = self.output_files_local()

        ref_dir = fetch_reference(
            self.additional_files["bowtie2_genome"],
            self.ref_dir_local,
            allow_s3mi=True,
            auto_untar=True)

        sam_path = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(sam_path)

        # Index files look like
        # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"; bowtie2's -x
        # option wants the shared prefix, so strip two extensions:
        # e.g. hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
        index_file = command.glob(f"{ref_dir}/*.bt2*", max_results=1)[0]
        index_prefix = os.path.splitext(os.path.splitext(index_file)[0])[0]

        args = [
            '-q', '-x', index_prefix, '-f',
            '--very-sensitive-local', '-S', sam_path
        ]

        # --seed cannot be used with -p multithreading.
        # We have observed the lack of multithreading resulting in
        # severe performance degradation in some cases, so for the
        # time being multithreading is being chosen over determinism.
        # To seed bowtie2 do something similar to:
        # args.extend(['--seed', '4'])
        args += ['-p', str(multiprocessing.cpu_count())]

        paired = len(fastas_in) == 2
        if paired:
            args += ['-1', fastas_in[0], '-2', fastas_in[1]]
        else:
            args += ['-U', fastas_in[0]]

        # Example:
        # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
        #         --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
        #         -p 32 \
        #         -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
        command.execute(
            command_patterns.SingleCommand(cmd='bowtie2', args=args))
        log.write("Finished Bowtie alignment.")

        if paired:
            convert.generate_unmapped_pairs_from_sam(sam_path, fastas_out)
        else:
            convert.generate_unmapped_singles_from_sam(sam_path, fastas_out[0])
Example #3
0
    def run(self):
        """Run bowtie2 against the reference genome and extract unmapped reads.

        The reference is fetched (and untarred) first; bowtie2 writes a SAM
        file, from which unmapped reads are converted into output FASTAs.
        An optional ``random_seed`` attribute makes the alignment
        deterministic at the cost of multithreading.
        """
        in_fas = self.input_files_local[0][0:2]
        out_fas = self.output_files_local()

        genome_dir = fetch_reference(
            self.additional_files["bowtie2_genome"],
            self.ref_dir_local,
            allow_s3mi=True,
            auto_untar=True)

        sam_file = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(sam_file)

        # The untarred reference looks like
        # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"; bowtie2's -x
        # takes the index prefix, so drop two extensions:
        # hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
        index_path = command.glob(f"{genome_dir}/*.bt2*", max_results=1)[0]
        index_path = os.path.splitext(os.path.splitext(index_path)[0])[0]

        params = ['-q', '-x', index_path, '-f',
                  '--very-sensitive-local', '-S', sam_file]

        seed = self.additional_attributes.get("random_seed")
        if seed:
            params.extend(['--seed', str(seed)])
        else:
            # The seed option won't work with the -p threading option.
            params.extend(['-p', str(multiprocessing.cpu_count())])

        if len(in_fas) == 2:
            params.extend(['-1', in_fas[0], '-2', in_fas[1]])
        else:
            params.extend(['-U', in_fas[0]])

        # Example:
        # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
        #         --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
        #         -p 32 \
        #         -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
        command.execute(
            command_patterns.SingleCommand(cmd='bowtie2', args=params))
        log.write("Finished Bowtie alignment.")

        if len(in_fas) == 2:
            convert.generate_unmapped_pairs_from_sam(sam_file, out_fas)
        else:
            convert.generate_unmapped_singles_from_sam(sam_file, out_fas[0])
    def run(self):
        """Align input FASTAs with bowtie2 and write out the unmapped reads.

        Fetches the (tarred) bowtie2 genome, aligns with a fixed seed for
        determinism, then converts unmapped reads from the resulting SAM
        file into the output FASTAs.
        """
        fastas = self.input_fas()
        outputs = self.output_files_local()

        genome_dir = fetch_reference(self.additional_files["bowtie2_genome"],
                                     self.ref_dir_local,
                                     allow_s3mi=True,
                                     auto_untar=True)

        sam_output = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(sam_output)

        # Index files look like
        # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"; bowtie2 -x
        # wants the common prefix, so remove two extensions:
        # e.g. hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
        bt2_file = command.glob(f"{genome_dir}/*.bt2*", max_results=1)[0]
        index_prefix = os.path.splitext(os.path.splitext(bt2_file)[0])[0]

        cmd_args = [
            '-q', '-x', index_prefix, '-f', '--very-sensitive-local', '-S',
            sam_output
        ]

        # FIXME: https://jira.czi.team/browse/IDSEQ-2738
        #  We want to move towards a general randomness solution in which
        #  all randomness is seeded based on the content of the original input.
        #  This is currently introducing non-determinism and hard coding
        #  an arbitrary seed here shouldn't impact correctness.
        # Seed chosen by fair dice roll, guaranteed to be random.
        cmd_args += ['--seed', '4']

        if len(fastas) == 2:
            cmd_args += ['-1', fastas[0], '-2', fastas[1]]
        else:
            cmd_args += ['-U', fastas[0]]

        # Example:
        # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
        #         --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
        #         -p 32 \
        #         -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
        command.execute(
            command_patterns.SingleCommand(cmd='bowtie2', args=cmd_args))
        log.write("Finished Bowtie alignment.")

        if len(fastas) == 2:
            convert.generate_unmapped_pairs_from_sam(sam_output, outputs)
        else:
            convert.generate_unmapped_singles_from_sam(sam_output, outputs[0])
Example #5
0
    def chunk_input(self, input_files, chunksize):
        """Chunk input files into pieces for performance and parallelism.

        Splits each input file into numbered parts of ``chunksize`` records
        (2 lines per record — presumably FASTA header + sequence; confirm
        against callers), uploads the parts to S3, and verifies the part
        names match the expected pattern.

        Args:
            input_files: paths of one or more (possibly paired) input files.
                Paired files must have identical line counts.
            chunksize: number of records per chunk.

        Returns:
            A tuple ``(part_suffix, input_chunks)`` where ``part_suffix`` is
            the naming suffix shared by all parts, and ``input_chunks`` is a
            list that groups the i-th part of every input file together.
        """
        part_lists = []  # Lists of partial files
        known_nlines = None
        part_suffix = ""
        chunk_nlines = chunksize * 2  # 2 lines per record

        for input_file in input_files:
            # Count number of lines in the file
            cmd_output = command.execute_with_output(
                command_patterns.SingleCommand(cmd="wc",
                                               args=["-l", input_file]))
            nlines = int(cmd_output.strip().split()[0])
            # Number of lines should be the same in paired files
            if known_nlines is not None:
                msg = "Mismatched line counts in supposedly paired files: {}".format(
                    input_files)
                assert nlines == known_nlines, msg
            known_nlines = nlines

            # Set number of pieces and names
            numparts = (nlines + chunk_nlines - 1) // chunk_nlines
            # Width of the numeric suffix, sized to the largest part index.
            ndigits = len(str(numparts - 1))
            part_suffix = "-chunksize-%d-numparts-%d-part-" % (chunksize,
                                                               numparts)
            out_prefix_base = os.path.basename(input_file) + part_suffix
            out_prefix = os.path.join(self.chunks_result_dir_local,
                                      out_prefix_base)

            # Split large file into smaller named pieces
            command.execute(
                command_patterns.SingleCommand(cmd="split",
                                               args=[
                                                   "-a", ndigits,
                                                   "--numeric-suffixes", "-l",
                                                   chunk_nlines, input_file,
                                                   out_prefix
                                               ]))
            # Upload only this file's parts; trailing "" on the join ensures
            # the directory path ends with a separator.
            command.execute_with_retries(
                command_patterns.SingleCommand(
                    cmd="aws",
                    args=[
                        "s3", "sync", "--only-show-errors",
                        os.path.join(self.chunks_result_dir_local, ""),
                        os.path.join(self.chunks_result_dir_s3, ""),
                        "--exclude", "*", "--include", out_prefix_base + "*"
                    ]))

            # Get the partial file names
            partial_files = []
            paths = command.glob(glob_pattern=out_prefix + "*",
                                 strip_folder_names=True)
            partial_files.extend(paths)

            # Check that the partial files match our expected chunking pattern
            pattern = "{:0%dd}" % ndigits
            expected_partial_files = [(out_prefix_base + pattern.format(i))
                                      for i in range(numparts)]
            msg = "something went wrong with chunking: {} != {}".format(
                partial_files, expected_partial_files)
            assert expected_partial_files == partial_files, msg
            part_lists.append(partial_files)

        # Regroup so each chunk pairs the i-th part of every input file.
        # Ex: [["input_R1.fasta-part-1", "input_R2.fasta-part-1"],
        # ["input_R1.fasta-part-2", "input_R2.fasta-part-2"],
        # ["input_R1.fasta-part-3", "input_R2.fasta-part-3"],...]
        input_chunks = [list(part) for part in zip(*part_lists)]
        return part_suffix, input_chunks
Example #6
0
    def test_glob(self):
        '''WHEN glob is invoked, THEN it returns folders matching the pattern'''
        result = command.glob(f"{TMP_PARENT_FOLDER}/*")

        expected = [TMP_SRC_FOLDER, TMP_DEST_FOLDER]
        # assertCountEqual has a very misleading name: it compares two arrays
        # disregarding item order - https://bugs.python.org/issue27071
        self.assertCountEqual(result, expected)