Beispiel #1
0
    def run(self):
        """Align the input reads with ``gsnapl`` and emit the unmapped reads.

        Up to two read files are taken from the first entry of
        ``self.input_files_local``. The alignment is written in SAM format
        to the file named by the ``output_sam_file`` attribute, which is
        also registered as a hidden additional output. Afterwards the
        unmapped reads are extracted from that SAM file into the step's
        output files: as a pair when two inputs were given, otherwise as a
        single file.
        """
        reads = self.input_files_local[0][0:2]
        unmapped_outputs = self.output_files_local()
        sam_path = os.path.join(self.output_dir_local,
                                self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(sam_path)

        # The fetched reference path is split into the directory handed to
        # -D and the index name handed to -d.
        index_path = fetch_reference(
            self.additional_files["gsnap_genome"],
            self.ref_dir_local,
            allow_s3mi=True,
            auto_untar=True)
        index_parent, index_name = os.path.split(index_path)

        # Run Gsnap
        tuning_args = [
            '-A', 'sam',
            '--batch=0',
            '--use-shared-memory=0',
            '--gmap-mode=all',
            '--npaths=1',
            '--ordered',
            '-t', 32,
            '--max-mismatches=40',
        ]
        location_args = ['-D', index_parent, '-d', index_name, '-o', sam_path]
        gsnap_command = command_patterns.SingleCommand(
            cmd='gsnapl',
            args=tuning_args + location_args + reads)
        command.execute(gsnap_command)
        log.write("Finished GSNAP alignment.")

        # Extract out unmapped files from sam
        if len(reads) != 2:
            convert.generate_unmapped_singles_from_sam(sam_path,
                                                       unmapped_outputs[0])
            return
        convert.generate_unmapped_pairs_from_sam(sam_path, unmapped_outputs)
Beispiel #2
0
    def run(self):
        """Align the input reads with bowtie2 using every CPU, then emit the unmapped reads.

        The SAM output goes to the file named by the ``output_sam_file``
        attribute and is registered as a hidden additional output. The
        unmapped reads are then extracted from it into the step's output
        files (as a pair for two inputs, a single file otherwise).
        """
        reads = self.input_fas()
        unmapped_outputs = self.output_files_local()
        genome_dir = fetch_reference(self.additional_files["bowtie2_genome"],
                                     self.ref_dir_local,
                                     allow_s3mi=True,
                                     auto_untar=True)
        sam_path = os.path.join(self.output_dir_local,
                                self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(sam_path)

        # Index files are laid out like
        # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2".
        # Any one of them yields the index prefix once its last two
        # extensions are dropped, e.g.
        # hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
        any_index_file = command.glob(f"{genome_dir}/*.bt2*", max_results=1)[0]
        without_bt2, _ = os.path.splitext(any_index_file)
        index_prefix, _ = os.path.splitext(without_bt2)

        base_args = [
            '-q',
            '-x', index_prefix,
            '-f',
            '--very-sensitive-local',
            '-S', sam_path,
        ]

        # --seed cannot be combined with -p multithreading. Running without
        # multithreading has been observed to degrade performance severely
        # in some cases, so for now throughput wins over determinism.
        # To seed bowtie2 instead, pass something like ['--seed', '4'].
        thread_args = ['-p', str(multiprocessing.cpu_count())]

        paired = len(reads) == 2
        if paired:
            read_args = ['-1', reads[0], '-2', reads[1]]
        else:
            read_args = ['-U', reads[0]]

        # Resulting command, for example:
        # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
        #         --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
        #         -p 32 \
        #         -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
        bowtie2_command = command_patterns.SingleCommand(
            cmd='bowtie2',
            args=base_args + thread_args + read_args)
        command.execute(bowtie2_command)
        log.write("Finished Bowtie alignment.")

        if not paired:
            convert.generate_unmapped_singles_from_sam(sam_path,
                                                       unmapped_outputs[0])
            return
        convert.generate_unmapped_pairs_from_sam(sam_path, unmapped_outputs)
Beispiel #3
0
    def run(self):
        """Align the input reads with bowtie2 and emit the unmapped reads.

        Up to two read files are taken from the first entry of
        ``self.input_files_local``. When the ``random_seed`` attribute is
        present and truthy, bowtie2 is run with ``--seed``; otherwise it is
        run with one thread per CPU. The SAM output is registered as a
        hidden additional output, and the unmapped reads are extracted from
        it into the step's output files.
        """
        reads = self.input_files_local[0][0:2]
        unmapped_outputs = self.output_files_local()
        genome_dir = fetch_reference(self.additional_files["bowtie2_genome"],
                                     self.ref_dir_local,
                                     allow_s3mi=True,
                                     auto_untar=True)
        sam_path = os.path.join(self.output_dir_local,
                                self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(sam_path)

        # Index files are laid out like
        # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2".
        # Dropping the last two extensions of any one of them gives the
        # index prefix, e.g.
        # hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
        any_index_file = command.glob(f"{genome_dir}/*.bt2*", max_results=1)[0]
        without_bt2, _ = os.path.splitext(any_index_file)
        index_prefix, _ = os.path.splitext(without_bt2)

        base_args = [
            '-q',
            '-x', index_prefix,
            '-f',
            '--very-sensitive-local',
            '-S', sam_path,
        ]

        # The seed option won't work together with the -p threading option,
        # so exactly one of the two is passed.
        seed = self.additional_attributes.get("random_seed")
        if not seed:
            mode_args = ['-p', str(multiprocessing.cpu_count())]
        else:
            mode_args = ['--seed', str(seed)]

        paired = len(reads) == 2
        if paired:
            read_args = ['-1', reads[0], '-2', reads[1]]
        else:
            read_args = ['-U', reads[0]]

        # Resulting command in the unseeded case, for example:
        # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
        #         --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
        #         -p 32 \
        #         -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
        bowtie2_command = command_patterns.SingleCommand(
            cmd='bowtie2',
            args=base_args + mode_args + read_args)
        command.execute(bowtie2_command)
        log.write("Finished Bowtie alignment.")

        if not paired:
            convert.generate_unmapped_singles_from_sam(sam_path,
                                                       unmapped_outputs[0])
            return
        convert.generate_unmapped_pairs_from_sam(sam_path, unmapped_outputs)
    def run(self):
        """Align the input reads with bowtie2 under a fixed seed, then emit the unmapped reads.

        bowtie2 is always invoked with ``--seed 4`` and without ``-p``. The
        SAM output goes to the file named by the ``output_sam_file``
        attribute and is registered as a hidden additional output; the
        unmapped reads are then extracted from it into the step's output
        files (as a pair for two inputs, a single file otherwise).
        """
        reads = self.input_fas()
        unmapped_outputs = self.output_files_local()
        genome_dir = fetch_reference(
            self.additional_files["bowtie2_genome"],
            self.ref_dir_local,
            allow_s3mi=True,
            auto_untar=True)
        sam_path = os.path.join(self.output_dir_local,
                                self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(sam_path)

        # Index files are laid out like
        # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2".
        # Dropping the last two extensions of any one of them gives the
        # index prefix, e.g.
        # hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
        any_index_file = command.glob(f"{genome_dir}/*.bt2*", max_results=1)[0]
        without_bt2, _ = os.path.splitext(any_index_file)
        index_prefix, _ = os.path.splitext(without_bt2)

        base_args = [
            '-q',
            '-x', index_prefix,
            '-f',
            '--very-sensitive-local',
            '-S', sam_path,
        ]

        # FIXME: https://jira.czi.team/browse/IDSEQ-2738
        #  The goal is a general randomness solution in which all
        #  randomness is seeded from the content of the original input.
        #  Until then an arbitrary hard-coded seed is used here, which
        #  shouldn't impact correctness.
        # chosen by fair dice roll, guaranteed to be random
        seed_args = ['--seed', '4']

        paired = len(reads) == 2
        if paired:
            read_args = ['-1', reads[0], '-2', reads[1]]
        else:
            read_args = ['-U', reads[0]]

        # Resulting command, for example:
        # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
        #         --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
        #         --seed 4 \
        #         -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
        bowtie2_command = command_patterns.SingleCommand(
            cmd='bowtie2',
            args=base_args + seed_args + read_args)
        command.execute(bowtie2_command)
        log.write("Finished Bowtie alignment.")

        if not paired:
            convert.generate_unmapped_singles_from_sam(sam_path,
                                                       unmapped_outputs[0])
            return
        convert.generate_unmapped_pairs_from_sam(sam_path, unmapped_outputs)
Beispiel #5
0
    def run(self):
        """Align the input reads with bowtie2 and write out the unmapped reads.

        Up to two read files are taken from the first entry of
        ``self.input_files_local``. The bowtie2 index is fetched with
        ``fetch_from_s3`` and its prefix is derived from the first
        ``*.bt2*`` file listed in the fetched directory. The SAM output is
        written to the file named by the ``output_sam_file`` attribute and
        appended to ``self.additional_files_to_upload``. Finally the
        unmapped reads are extracted from the SAM file into the step's
        output files (as a pair for two inputs, a single file otherwise).

        When the ``random_seed`` attribute is present and truthy, bowtie2
        runs with ``--seed``; otherwise it runs with one thread per CPU.

        Raises:
            RuntimeError: if the listing of the fetched genome directory
                contains no ``*.bt2*`` index file.
        """
        # Local import: this method is the only place that assembles shell
        # command strings and therefore needs quoting.
        import shlex

        input_fas = self.input_files_local[0][0:2]
        output_fas = self.output_files_local()
        genome_dir = fetch_from_s3(self.additional_files["bowtie2_genome"],
                                   self.ref_dir_local,
                                   allow_s3mi=True,
                                   auto_untar=True)
        output_sam_file = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_files_to_upload.append(output_sam_file)

        # The file structure looks like
        # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"
        # Only the directory is quoted; the wildcard has to stay bare so
        # the shell still expands it.
        cmd = "ls {genome_dir}/*.bt2*".format(
            genome_dir=shlex.quote(genome_dir))
        local_genome_dir_ls = command.execute_with_output(cmd)
        first_index_file = local_genome_dir_ls.split("\n")[0]
        if not first_index_file:
            raise RuntimeError(
                "No bowtie2 index files (*.bt2*) found in {}".format(
                    genome_dir))
        # Remove two extensions, e.g.
        # hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
        # This does not depend on how many digits the index number has.
        genome_basename = os.path.splitext(
            os.path.splitext(first_index_file)[0])[0]

        bowtie2_params = [
            'bowtie2', '-q', '-x', genome_basename, '-f',
            '--very-sensitive-local', '-S', output_sam_file
        ]

        # NOTE(review): a falsy seed (e.g. 0) is treated the same as "no
        # seed configured" - confirm that this is intended.
        seed = self.additional_attributes.get("random_seed")
        if seed:
            bowtie2_params.extend(['--seed', str(seed)])
        else:
            # Seed option won't work with -p threading option.
            bowtie2_params.extend(['-p', str(multiprocessing.cpu_count())])

        if len(input_fas) == 2:
            bowtie2_params.extend(['-1', input_fas[0], '-2', input_fas[1]])
        else:
            bowtie2_params.extend(['-U', input_fas[0]])

        # The command is executed from a single string, so every argument
        # is quoted to survive spaces and shell metacharacters in paths.
        command.execute(" ".join(shlex.quote(param)
                                 for param in bowtie2_params))
        log.write("Finished Bowtie alignment.")

        if len(input_fas) == 2:
            convert.generate_unmapped_pairs_from_sam(output_sam_file,
                                                     output_fas)
        else:
            convert.generate_unmapped_singles_from_sam(output_sam_file,
                                                       output_fas[0])
Beispiel #6
0
    def run(self):
        """Align the input reads with gsnap or gsnapl and emit the unmapped reads.

        The executable is chosen by probing ``gsnapl`` against the fetched
        index: if its stderr contains 'please run gsnap instead', ``gsnap``
        is used, otherwise ``gsnapl``. The alignment is written in SAM
        format to the file named by the ``output_sam_file`` attribute,
        which is registered as a hidden additional output. The unmapped
        reads are then extracted from it into the step's output files (as
        a pair for two inputs, a single file otherwise).
        """
        reads = self.input_fas()
        unmapped_outputs = self.output_files_local()
        sam_path = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(sam_path)

        # The fetched reference path is split into the directory handed to
        # -D and the index name handed to -d.
        index_path = fetch_reference(
            self.additional_files["gsnap_genome"],
            self.ref_dir_local,
            allow_s3mi=True,
            auto_untar=True)
        index_parent, index_name = os.path.split(index_path)

        # Hack to determine gsnap vs gsnapl: feed gsnapl a minimal input
        # and inspect what it prints on stderr.
        probe = subprocess.run(
            ['gsnapl', '-D', index_parent, '-d', index_name],
            input='>'.encode('utf-8'),
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE)
        probe_stderr = probe.stderr.decode('utf-8')
        if 'please run gsnap instead' in probe_stderr:
            gsnap_exe = "gsnap"
        else:
            gsnap_exe = "gsnapl"

        # Run Gsnap
        tuning_args = [
            '-A', 'sam',
            '--batch=0',
            '--use-shared-memory=0',
            '--gmap-mode=all',
            '--npaths=1',
            '--ordered',
            '-t', 32,
            '--max-mismatches=40',
        ]
        location_args = ['-D', index_parent, '-d', index_name, '-o', sam_path]
        gsnap_command = command_patterns.SingleCommand(
            cmd=gsnap_exe,
            args=tuning_args + location_args + reads)
        command.execute(gsnap_command)
        log.write("Finished GSNAP alignment.")

        # Extract out unmapped files from sam
        if len(reads) != 2:
            convert.generate_unmapped_singles_from_sam(sam_path,
                                                       unmapped_outputs[0])
            return
        convert.generate_unmapped_pairs_from_sam(sam_path, unmapped_outputs)