def run_blast_nt(blast_index_path, blast_m8, assembled_contig,
                  reference_fasta, blast_top_m8):
     blast_type = 'nucl'
     blast_command = 'blastn'
     min_alignment_length = NT_MIN_ALIGNMENT_LEN
     min_pident = NT_MIN_PIDENT
     max_evalue = MAX_EVALUE_THRESHOLD
     command.execute(
         command_patterns.SingleCommand(
             cmd="makeblastdb",
             args=[
                 "-in", reference_fasta, "-dbtype", blast_type, "-out",
                 blast_index_path
             ],
         ))
     command.execute(
         command_patterns.SingleCommand(
             cmd=blast_command,
             args=[
                 "-query", assembled_contig, "-db", blast_index_path,
                 "-out", blast_m8, "-outfmt",
                 '6 ' + ' '.join(m8.BLAST_OUTPUT_NT_SCHEMA.keys()),
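                 # outfmt "6 <columns>" requests tabular output with the schema's column names.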
                 '-evalue', 1e-10, '-max_target_seqs', 5000, "-num_threads",
                 16
             ],
             # We can only pass BATCH_SIZE as an env var.  The default is 100,000 for blastn; 10,000 for blastp.
             # Blast concatenates input queries until they exceed this size, then runs them together for efficiency.
             # Unfortunately, if the input contains too many short, low-complexity queries, this can greatly
             # increase the memory required.  We have found empirically that 10,000 is a better default.  It is
             # also the default value used for remote blast.
             env=dict(os.environ, BATCH_SIZE="10000")))
     # Further processing: keep only the top m8 entry for each contig.
     PipelineStepBlastContigs.get_top_m8_nt(blast_m8, blast_top_m8,
                                            min_alignment_length,
                                            min_pident, max_evalue)
Example #2
 def fetch_ncbi(accession):
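     """Look up an accession in NCBI nuccore via esearch (with usehistory=y),
     then efetch the matching GenBank record as XML. Returns the esearch and
     efetch URLs along with the raw GenBank XML."""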
     query = accession
     base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
     search_url = f"{base}/esearch.fcgi?db=nuccore&term={query}&usehistory=y"
     output = command.execute_with_output(
         command_patterns.SingleCommand(
             cmd="curl",
             args=[search_url]
         )
     )
     root = ET.fromstring(output)
     web = root.find('WebEnv').text
     key = root.find('QueryKey').text
     fetch_url = f"{base}/efetch.fcgi?db=nuccore&query_key={key}&WebEnv={web}&rettype=gb&retmode=xml"
     genbank_xml = command.execute_with_output(
         command_patterns.SingleCommand(
             cmd="curl",
             args=[fetch_url]
         )
     )
     return {
         'search_url': search_url,
         'fetch_url': fetch_url,
         'genbank_xml': genbank_xml
     }
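A minimal usage sketch (the accession below is hypothetical, for illustration only):

    # Hypothetical call: fetch the GenBank record for one nuccore accession.
    record = fetch_ncbi("NC_002549.1")  # illustrative accession, not from the source
    genbank_xml = record['genbank_xml']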
Example #3
 def execute_srst2(self, is_paired, is_fasta, is_zipped):
     """Executes srst2 with appropriate parameters based on whether input files are zipped,
        paired reads and on file type."""
     srst2_params = []
     srst2_params.extend(self.get_common_params())
     if is_fasta:
         file_ext = '.fasta.gz' if is_zipped else '.fasta'
         srst2_params.extend(['--read_type', 'f'])
     else:
         file_ext = '.fastq.gz' if is_zipped else '.fastq'
     if is_paired:
         srst2_params.extend(['--input_pe'])
     else:
         srst2_params.extend(['--input_se'])
     for i, rd in enumerate(self.input_files_local[0]):
         link_name = f"_R{i+1}_001{file_ext}"
         command.execute(
             command_patterns.SingleCommand(cmd='ln',
                                            args=['-sf', rd, link_name]))
         srst2_params.append(link_name)
     if is_paired:
         srst2_params.extend(
             ['--forward', '_R1_001', '--reverse', '_R2_001'])
     command.execute(
         command_patterns.SingleCommand(cmd='srst2', args=srst2_params))
Example #4
    def run(self):
        """
          Generate GSNAP index. To be called from idseq-infra
        """
        nt_db = self.input_files_local[0][0]
        output_nt_index_tar = self.output_files_local()[0]
        output_nt_index_parent_dir = os.path.dirname(output_nt_index_tar)
        output_tar_base = os.path.basename(output_nt_index_tar)
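        # strip the ".tar" suffix to get the index directory name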
        output_nt_index_dir_base = output_tar_base[:-4]
        k = self.additional_attributes.get("k", 16)  # k-mer size passed to gmap_build -k
        log.write(f"input: {nt_db} output: {output_nt_index_tar}")
        command.execute(
            command_patterns.SingleCommand(cmd="gmap_build",
                                           args=[
                                               "-D",
                                               output_nt_index_parent_dir,
                                               "-d", output_nt_index_dir_base,
                                               "-k", k, nt_db
                                           ]))

        output_nt_index_dir = os.path.join(output_nt_index_parent_dir,
                                           output_nt_index_dir_base)
        self.additional_output_folders_hidden.append(output_nt_index_dir)

        command.execute(
            command_patterns.SingleCommand(
                cd=output_nt_index_parent_dir,
                cmd="tar",
                args=["cvf", output_tar_base, output_nt_index_dir_base]))
Example #5
 def get_genbank_genomes(self,
                         reference_taxids,
                         destination_dir,
                         superkingdom_name,
                         n=10):
     '''
     Retrieve up to n GenBank reference genomes under the reference_taxids.
     Assumes reference_taxids are species-level or below.
     Also assumes they all belong to the same superkingdom, which is all our application requires.
     Saves the references under file names compatible with MakeKSNP3infile.
     TODO: Retrieve the genomes from S3 rather than ftp.ncbi.nih.gov (JIRA/IDSEQ-334).
     '''
     if n == 0 or not reference_taxids:
         return {}
     n_per_taxid = max(n // len(reference_taxids), 1)
     genbank_categories_by_superkingdom = {
         "Viruses": ["viral"],
         "Bacteria": ["bacteria"],
         "Eukaryota": ["fungi", "protozoa"],
         None: ["bacteria", "viral", "fungi", "protozoa"]
     }
     # additional options in genbank that we probably don't need right now:
     # ["archaea", "plant",
     # "vertebrate_mammalian", "vertebrate_other", "invertebrate",
     # "other", "metagenomes"]
     categories = genbank_categories_by_superkingdom[superkingdom_name]
     for cat in categories:
         genome_list_path_s3 = f"s3://idseq-public-references/genbank/{cat}/assembly_summary.txt"  # source: ftp://ftp.ncbi.nih.gov/genomes/genbank/{cat}/assembly_summary.txt
         genome_list_local = s3.fetch_from_s3(genome_list_path_s3,
                                              destination_dir)
         genomes = []
         for taxid in reference_taxids:
             taxid_genomes = PipelineStepGeneratePhyloTree.get_taxid_genomes(
                 genome_list_local, taxid, n_per_taxid)
             genomes += [
                 entry for entry in taxid_genomes if entry not in genomes
             ]
         genomes = genomes[:n]
         command.remove_file(genome_list_local)
         if genomes:
             genbank_fastas = {}
             for line in genomes:
                 assembly_accession, taxid, _species_taxid, _organism_name, ftp_path = line.split(
                     "\t")
                 ftp_fasta_gz = f"{ftp_path}/{os.path.basename(ftp_path)}_genomic.fna.gz"
                 tree_node_name = f"genbank_{self.clean_name_for_ksnp3(assembly_accession)}"
                 local_fasta = f"{destination_dir}/{tree_node_name}.fasta"
                 if os.path.isfile(local_fasta):
                     local_fasta = f"{local_fasta.split('.')[0]}__I.fasta"
                 command.execute(
                     command_patterns.SingleCommand(
                         cmd='wget',
                         args=["-O", f"{local_fasta}.gz", ftp_fasta_gz]))
                 command.execute(
                     command_patterns.SingleCommand(
                         cmd='gunzip', args=[f"{local_fasta}.gz"]))
                 genbank_fastas[assembly_accession] = local_fasta
             return genbank_fastas
     return {}
Example #6
 def test_cd(self):
     '''WHEN using the cd parameter, THEN it executes the command in the selected directory and resets to the previous directory before executing the next command'''
     _shared_test_cd_parameter(
         test_context=self,
         pwd_command_pattern_with_cd=command_patterns.SingleCommand(
             cd=TMP_FOLDER, cmd="pwd", args=[]),
         pwd_command_pattern_without_cd=command_patterns.SingleCommand(
             cmd="pwd", args=[]))
Example #7
 def get_total_reads(self, is_zipped, is_fasta):
     """Gets the total number of reads in the sample by counting them directly from the
         fastq or fasta files."""
     # TODO: factor out into utility function, see nonhost_fastq
     input_filenames = self.input_files_local[0]
     if is_zipped:
         unzipped_filenames = []
         for filename in input_filenames:
             if not os.path.exists(filename[:len(filename) - 3]):
                 command.execute(
                     command_patterns.SingleCommand(
                         cmd='gunzip',
                         args=[
                             '-k',
                             filename
                         ]
                     )
                 )
             unzipped_filenames.append(filename[:len(filename) - 3])
         input_filenames = unzipped_filenames
     if is_fasta:  # Number of lines per read can vary, so we use grep
         grep_output = command.execute_with_output(
             command_patterns.SingleCommand(
                 cmd='grep',
                 args=[
                     '-c',
                    '^>',  # each fasta read header line starts with ">"; "^" anchors the match to line start
                     *input_filenames
                 ]
             )
         )
         output_lines = [line for line in grep_output.split("\n") if line != '']
         if ":" in output_lines[0]:
             # for paired fastas - when run on just one file, grep outputs only
             # a number. But when this command is run on two files, grep outputs
             # a string formatted as filename:count for each file, with count being
             # what we want to add up.
             read_counts = map(lambda line: int(line.split(":")[1]), output_lines)
             return reduce(lambda x, y: x + y, list(read_counts))
         else:
             return int(output_lines[0])
     else:  # fastqs have 4 lines for every read, so we count lines and divide by 4
        wc_output = command.execute_with_output(
            command_patterns.SingleCommand(cmd='wc',
                                           args=['-l', *input_filenames]))
        # take the count from the last line: for paired reads it is the "total"
        # line; for unpaired reads it is the only line
         wc_lines = [line for line in wc_output.split("\n") if line != '']
         wc_target_line = [line for line in wc_lines[-1].split(" ") if line != '']
         total_line_count = int(wc_target_line[0])
        return total_line_count // 4  # integer division: a valid fastq has a multiple of 4 lines
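To illustrate the paired-fasta branch above (file names hypothetical): running "grep -c '^>' r1.fasta r2.fasta" prints one "filename:count" line per file, e.g. "r1.fasta:1000" and "r2.fasta:1000", and the map/reduce above sums the counts to 2000. When run on a single file, grep prints just "1000".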
Example #8
    def assemble(
            input_fasta,
            input_fasta2,
            bowtie_fasta,  # fasta file for running bowtie against contigs
            duplicate_cluster_sizes_path,
            assembled_contig,
            assembled_scaffold,
            bowtie_sam,
            contig_stats,
            read2contig,
            memory=100):
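        # "memory" is forwarded to SPAdes -m, its RAM limit in GB.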
        basedir = os.path.dirname(assembled_contig)
        assembled_dir = os.path.join(basedir, 'spades')
        command.make_dirs(assembled_dir)
        assembled_contig_tmp = os.path.join(assembled_dir, 'contigs.fasta')
        assembled_scaffold_tmp = os.path.join(assembled_dir, 'scaffolds.fasta')

        try:
            if input_fasta2:
                command.execute(
                    command_patterns.SingleCommand(cmd="spades.py",
                                                   args=[
                                                       "-1", input_fasta, "-2",
                                                       input_fasta2, "-o",
                                                       assembled_dir, "-m",
                                                       memory, "-t", 32,
                                                       "--only-assembler"
                                                   ]))
            else:
                command.execute(
                    command_patterns.SingleCommand(cmd="spades.py",
                                                   args=[
                                                       "-s", input_fasta, "-o",
                                                       assembled_dir, "-m",
                                                       memory, "-t", 32,
                                                       "--only-assembler"
                                                   ]))
            command.move_file(assembled_contig_tmp, assembled_contig)
            command.move_file(assembled_scaffold_tmp, assembled_scaffold)

            PipelineStepRunAssembly.generate_read_to_contig_mapping(
                assembled_contig, bowtie_fasta, read2contig,
                duplicate_cluster_sizes_path, bowtie_sam, contig_stats)
        except Exception:
            # Assembly failed: create dummy output files.
            command.write_text_to_file(';ASSEMBLY FAILED', assembled_contig)
            command.write_text_to_file(';ASSEMBLY FAILED', assembled_scaffold)
            command.write_text_to_file('@NO INFO', bowtie_sam)
            command.write_text_to_file('{}', contig_stats)
            traceback.print_exc()
        command.remove_rf(assembled_dir)
Example #9
def upload_with_retries(from_f, to_f, checksum=False):
    with IOSTREAM_UPLOADS:
        with IOSTREAM:
            if checksum:
                command.execute(
                    command_patterns.SingleCommand(
                        cmd="s3parcp",
                        args=["--checksum", from_f, to_f],
                        env=dict(os.environ, **refreshed_credentials())))
            else:
                command.execute(
                    command_patterns.SingleCommand(
                        cmd="aws",
                        args=["s3", "cp", "--only-show-errors", from_f, to_f],
                        env=dict(os.environ, **refreshed_credentials())))
Example #10
def list_s3_keys(s3_path_prefix):
    """Returns a list of s3 keys prefixed by s3_path_prefix."""
    with log.log_context(context_name="s3.list_s3_objects",
                         values={'s3_path_prefix': s3_path_prefix},
                         log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        parsed_url = urlparse(s3_path_prefix, allow_fragments=False)
        bucket = parsed_url.netloc
        prefix = parsed_url.path.lstrip('/')
        # Use the AWS CLI instead of boto for thread safety
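        # list-objects-v2 prints JSON of the form {"Contents": [{"Key": ...}, ...]};
        # "Contents" is omitted entirely when no keys match the prefix.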
        raw_response = command.execute(
            command_patterns.SingleCommand(
                cmd="aws",
                args=[
                    "s3api",
                    "list-objects-v2",
                    "--bucket",
                    bucket,
                    "--prefix",
                    prefix,
                ],
                env=dict(os.environ, **refreshed_credentials()),
            ),
            capture_stdout=True,
        )
        parsed_response = json.loads(raw_response)
        # "Contents" is absent when nothing matches the prefix; default to an empty list.
        return [item['Key'] for item in parsed_response.get('Contents', [])]
Example #11
    def run(self):
        input_fas = self.input_files_local[0]
        output_files = self.output_files_local()
        assert len(output_files) == len(
            input_fas) + 2, f"Context: {input_fas} -> {output_files}."
        output_fas = output_files[:len(input_fas)]
        duplicate_cluster_sizes_path = output_files[-1]
        assert duplicate_cluster_sizes_path.endswith(".tsv"), str(output_files)
        duplicate_clusters_path = output_files[-2]
        assert duplicate_clusters_path.endswith(".csv"), str(output_files)

        # See docstring above for explanation of these options.
        idseq_dedup_params = [
            '-i',
            input_fas[0],
            '-o',
            output_fas[0],
            '-l',
            '70',
            '-c',
            duplicate_clusters_path,
        ]
        if len(input_fas) == 2:
            idseq_dedup_params += ['-i', input_fas[1], '-o', output_fas[1]]
        command.execute(
            command_patterns.SingleCommand(cmd='idseq-dedup',
                                           args=idseq_dedup_params))

        # Emit cluster sizes.  One line per cluster.  Format "<cluster_size> <cluster_read_id>".
        # This info is loaded in multiple subsequent steps using m8.load_duplicate_cluster_sizes,
        # and used to convert unique read counts to original read counts, and also to compute
        # per-taxon DCRs emitted alongside taxon_counts.
        clusters_dict = parse_clusters_file(duplicate_clusters_path)
        save_duplicate_cluster_sizes(duplicate_cluster_sizes_path,
                                     clusters_dict)
Example #12
 def generate_read_to_contig_mapping(assembled_contig, fasta_file,
                                     read2contig,
                                     duplicate_cluster_sizes_path,
                                     output_bowtie_sam,
                                     output_contig_stats):
     ''' read -> contig mapping through bowtie2 alignment '''
     base_output_dir = os.path.dirname(fasta_file)
     # build bowtie index based on assembled_contig
     bowtie_index_path = os.path.join(base_output_dir, 'bowtie-contig')
     command.make_dirs(bowtie_index_path)
     command.execute(
         command_patterns.SingleCommand(
             cmd='bowtie2-build',
             args=[assembled_contig, bowtie_index_path]))
     command.execute(
         command_patterns.ShellScriptCommand(
             script=
             r'''bowtie2 -x "${bowtie_index_path}" -f -U "${fasta_file}" --very-sensitive -p 32 > "${output_bowtie_sam}";''',
             named_args={
                 'bowtie_index_path': bowtie_index_path,
                 'fasta_file': fasta_file,
                 'output_bowtie_sam': output_bowtie_sam
             }))
     contig_stats = PipelineStepRunAssembly.generate_info_from_sam(
         output_bowtie_sam, read2contig, duplicate_cluster_sizes_path)
     with open(output_contig_stats, 'w') as ocf:
         json.dump(contig_stats, ocf)
Example #13
    def run(self):
        """
          1. extract contigs.fasta and read-contig.sam
          2. run pile up
        """
        contigs, _scaffolds, read_contig_sam, _stats = self.input_files_local[
            0]
        coverage_json, coverage_summary_csv = self.output_files_local()

        if os.path.getsize(contigs) < MIN_CONTIG_FILE_SIZE:
            command.write_text_to_file('{}', coverage_json)
            command.write_text_to_file('No Contigs', coverage_summary_csv)
            return

        # generate bam files
        bam_file = read_contig_sam.replace(".sam", ".bam")
        command.execute(
            command_patterns.ShellScriptCommand(
                script=
                r'''samtools view -S -b "${read_contig_sam}" | samtools sort - -o "${bam_file}";''',
                named_args={
                    'read_contig_sam': read_contig_sam,
                    'bam_file': bam_file
                }))
        command.execute(
            command_patterns.SingleCommand(cmd="samtools",
                                           args=["index", bam_file]))
        # run coverage info
        output_csv, output_json = self.calc_contig2coverage(bam_file)
        os.rename(output_csv, coverage_summary_csv)
        os.rename(output_json, coverage_json)
Example #14
    def run(self):
        input_fas = self.input_files_local[0][0:2]
        output_fas = self.output_files_local()
        output_sam_file = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(output_sam_file)

        genome_dir = fetch_reference(self.additional_files["gsnap_genome"],
                                     self.ref_dir_local,
                                     allow_s3mi=True,
                                     auto_untar=True)
        gsnap_base_dir = os.path.dirname(genome_dir)
        gsnap_index_name = os.path.basename(genome_dir)
        # Run Gsnap
        gsnap_params = [
            '-A', 'sam', '--batch=0', '--use-shared-memory=0',
            '--gmap-mode=all', '--npaths=1', '--ordered', '-t', 32,
            '--max-mismatches=40', '-D', gsnap_base_dir, '-d',
            gsnap_index_name, '-o', output_sam_file
        ] + input_fas
        command.execute(
            command_patterns.SingleCommand(cmd='gsnapl', args=gsnap_params))
        log.write("Finished GSNAP alignment.")

        # Extract out unmapped files from sam
        if len(input_fas) == 2:
            convert.generate_unmapped_pairs_from_sam(output_sam_file,
                                                     output_fas)
        else:
            convert.generate_unmapped_singles_from_sam(output_sam_file,
                                                       output_fas[0])
Example #15
    def subsample_fastas(input_fas, output_fas, max_fragments):
        ''' In memory subsampling '''
        paired = len(input_fas) >= 2
        # count lines
        cmd_output = command.execute_with_output(
            command_patterns.SingleCommand(cmd="wc", args=["-l",
                                                           input_fas[0]]))
        lines_count = int(cmd_output.strip().split(' ')[0])
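        # Assumes an unwrapped fasta, i.e. exactly two lines per record.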
        total_records = lines_count // 2
        log.write("total reads: %d" % total_records)
        log.write("target reads: %d" % max_fragments)
        if total_records <= max_fragments:
            for infile, outfile in zip(input_fas, output_fas):
                command.copy_file(infile, outfile)
            return

        # total_records > max_fragments, sample
        randgen = random.Random(x=hash(input_fas[0]))
        records_to_keep = randgen.sample(range(total_records), max_fragments)
        PipelineStepRunSubsample.subset(input_fas[0], output_fas[0],
                                        records_to_keep)
        if paired:
            PipelineStepRunSubsample.subset(input_fas[1], output_fas[1],
                                            records_to_keep)
            if len(input_fas) == 3 and len(output_fas) == 3:
                # subset the merged fasta
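                # Assumption: the merged fasta interleaves mates, so pair r occupies records 2*r and 2*r+1.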
                records_to_keep_merged = []
                for r in records_to_keep:
                    records_to_keep_merged += [2 * r, 2 * r + 1]
                PipelineStepRunSubsample.subset(input_fas[2], output_fas[2],
                                                records_to_keep_merged)
Example #16
    def make_star_index(fasta_file, gtf_file, output_star_genome_path,
                        max_star_part_size):
        star_genome_dir_name = output_star_genome_path[:-4]

        # star genome organization
        # STAR_genome/part-${i}, parts.txt
        fasta_file_list = []
        if max_star_part_size and os.path.getsize(
                fasta_file) > max_star_part_size:
            fasta_file_list = PipelineStepGenerateHostGenome.split_fasta(
                fasta_file, max_star_part_size)
        else:
            fasta_file_list.append(fasta_file)

        for i in range(len(fasta_file_list)):
            log.write("start making STAR index part %d" % i)
            gtf_command_part = []
            if i == 0 and gtf_file:
                gtf_command_part = ["--sjdbGTFfile", gtf_file]

            star_genome_part_dir = f"{star_genome_dir_name}/part-{i}"

            command.make_dirs(star_genome_part_dir)
            star_command_params = [
                '--runThreadN',
                str(multiprocessing.cpu_count()), '--runMode',
                'genomeGenerate', *gtf_command_part, '--genomeDir',
                star_genome_part_dir, '--genomeFastaFiles', fasta_file_list[i],
                '--limitGenomeGenerateRAM',
                virtual_memory().available
            ]
            command.execute(
                command_patterns.SingleCommand(cmd='STAR',
                                               args=star_command_params))
            log.write(f"finished making STAR index part {i}")
        # record the number of parts in parts.txt
        command.write_text_to_file(
            str(len(fasta_file_list)),
            os.path.join(star_genome_dir_name, "parts.txt"))
        star_genome = os.path.basename(star_genome_dir_name)
        star_work_dir = os.path.dirname(star_genome_dir_name)
        command.execute(
            command_patterns.SingleCommand(cmd="tar",
                                           args=[
                                               "cvf", output_star_genome_path,
                                               "-C", star_work_dir, star_genome
                                           ]))
Example #17
    def test_execute_python_cmd(self):
        '''WHEN using SingleCommand to invoke a .py file, THEN it works as expected'''
        cp1 = command_patterns.SingleCommand(cmd=TESTSCRIPT_HAPPY_PY,
                                             args=["Hello!"])

        result = command.execute_with_output(cp1)

        self.assertEqual(result, "Python is happy to say: Hello!\n")
Example #18
    def run_star_part(self,
                      output_dir,
                      genome_dir,
                      input_files,
                      count_genes,
                      use_starlong):
        command.make_dirs(output_dir)

        cpus = str(multiprocessing.cpu_count())
        cd = output_dir
        cmd = 'STARlong' if use_starlong else 'STAR'
        params = [
            '--outFilterMultimapNmax', '99999',
            '--outFilterScoreMinOverLread', '0.5',
            '--outFilterMatchNminOverLread', '0.5',
            '--outReadsUnmapped', 'Fastx',
            '--outFilterMismatchNmax', '999',
            '--clip3pNbases', '0',
            '--runThreadN', cpus,
            '--genomeDir', genome_dir,
            '--readFilesIn', *input_files
        ]

        if self.collect_insert_size_metrics_for == "rna":
            params += [
                '--outSAMtype', 'BAM', 'Unsorted',
                '--outSAMmode', 'NoQS',
                # Based on experimentation we always want --quantMode TranscriptomeSAM GeneCounts
                #   for RNA to collect transcriptome-specific results to compute insert size metrics on
                #   https://czi.quip.com/4niiAhiJsFNx/2019-11-15-CollectInsertSizeMetrics-for-RNA
                '--quantMode', 'TranscriptomeSAM', 'GeneCounts',
            ]
        else:
            if self.collect_insert_size_metrics_for == "dna":
                params += ['--outSAMtype', 'BAM', 'Unsorted', '--outSAMmode', 'NoQS', ]
            else:
                params += ['--outSAMmode', 'None']

            count_file = f"{genome_dir}/sjdbList.fromGTF.out.tab"
            if count_genes and os.path.isfile(count_file):
                params += ['--quantMode', 'GeneCounts']

        if use_starlong:
            params += [
                '--seedSearchStartLmax', '20',
                '--seedPerReadNmax', '100000',
                '--seedPerWindowNmax', '1000',
                '--alignTranscriptsPerReadNmax', '100000',
                '--alignTranscriptsPerWindowNmax', '10000']

        command.execute(
            command_patterns.SingleCommand(
                cd=cd,
                cmd=cmd,
                args=params
            )
        )
Example #19
 def make_bowtie2_index(host_name, fasta_file, output_bowtie2_index):
     bowtie2_genome_dir_name = output_bowtie2_index[:-4]
     command.make_dirs(bowtie2_genome_dir_name)
     command.execute(
         command_patterns.SingleCommand(cd=bowtie2_genome_dir_name,
                                        cmd='bowtie2-build',
                                        args=[fasta_file, host_name]))
     log.write("finished making bowtie2 index")
     # archive
     bowtie_genome = os.path.basename(bowtie2_genome_dir_name)
     bowtie_work_dir = os.path.dirname(bowtie2_genome_dir_name)
     command.execute(
         command_patterns.SingleCommand(cmd="tar",
                                        args=[
                                            "cvf", output_bowtie2_index,
                                            "-C", bowtie_work_dir,
                                            bowtie_genome
                                        ]))
Example #20
def remote(base_command, key_path, remote_username, instance_ip):
    # ServerAliveInterval fixes an issue where containers kept an SSH
    # connection open even after worker machines had finished running.
    return command_patterns.SingleCommand(
        cmd="ssh",
        args=[
            "-o", "StrictHostKeyChecking no", "-o", "ConnectTimeout 15", "-o",
            "ServerAliveInterval 60", "-i", key_path,
            f"{remote_username}@{instance_ip}", base_command
        ])
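A usage sketch (key path, user name, and IP below are hypothetical): the returned pattern is executed like any other command.

    # Hypothetical values, for illustration only.
    cmd = remote("ls -l /mnt", "/path/to/key.pem", "ec2-user", "10.0.0.1")
    command.execute(cmd)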
Example #21
    def test_execute_single_command_2(self):
        '''WHEN using SingleCommand with args that have special shell characters, THEN it doesn't execute subcommands'''
        assert " " in TESTFILE_ABC_TXT

        cp1 = command_patterns.SingleCommand(cmd="cat",
                                             args=[TESTFILE_ABC_TXT])

        result = command.execute_with_output(cp1)

        self.assertEqual(result, "abc")
Example #22
    def run(self):
        input_fas = self.input_fas()
        output_fas = self.output_files_local()
        genome_dir = fetch_reference(
            self.additional_files["bowtie2_genome"],
            self.ref_dir_local,
            allow_s3mi=True,
            auto_untar=True)
        output_sam_file = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(output_sam_file)
        # The file structure looks like
        # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"
        genome_basename = command.glob(f"{genome_dir}/*.bt2*", max_results=1)[0]
        # remove two extensions: ex: hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
        genome_basename = os.path.splitext(os.path.splitext(genome_basename)[0])[0]

        bowtie2_params = [
            '-q', '-x', genome_basename, '-f',
            '--very-sensitive-local', '-S', output_sam_file
        ]

        # --seed cannot be used with -p multithreading
        # We have observed the lack of multithreading resulting in
        # severe performance degradation in some cases. So for the
        # time being multithreading is being chosen over determinism.
        # To seed bowtie2 do something similar to:
        # bowtie2_params.extend(['--seed', '4'])
        bowtie2_params.extend(['-p', str(multiprocessing.cpu_count())])

        if len(input_fas) == 2:
            bowtie2_params.extend(['-1', input_fas[0], '-2', input_fas[1]])
        else:
            bowtie2_params.extend(['-U', input_fas[0]])

        # Example:
        # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
        #         --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
        #         -p 32 \
        #         -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
        command.execute(
            command_patterns.SingleCommand(
                cmd='bowtie2',
                args=bowtie2_params
            )
        )
        log.write("Finished Bowtie alignment.")

        if len(input_fas) == 2:
            convert.generate_unmapped_pairs_from_sam(output_sam_file,
                                                     output_fas)
        else:
            convert.generate_unmapped_singles_from_sam(output_sam_file,
                                                       output_fas[0])
Example #23
    def run(self):
        input_fas = self.input_files_local[0][0:2]
        output_fas = self.output_files_local()
        genome_dir = fetch_reference(
            self.additional_files["bowtie2_genome"],
            self.ref_dir_local,
            allow_s3mi=True,
            auto_untar=True)
        output_sam_file = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(output_sam_file)
        # The file structure looks like
        # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"
        genome_basename = command.glob(f"{genome_dir}/*.bt2*", max_results=1)[0]
        # remove two extensions: ex: hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
        genome_basename = os.path.splitext(os.path.splitext(genome_basename)[0])[0]

        bowtie2_params = [
            '-q', '-x', genome_basename, '-f',
            '--very-sensitive-local', '-S', output_sam_file
        ]

        seed = self.additional_attributes.get("random_seed")
        if seed:
            bowtie2_params.extend(['--seed', str(seed)])
        else:
            # Seed option won't work with -p threading option.
            bowtie2_params.extend(['-p', str(multiprocessing.cpu_count())])

        if len(input_fas) == 2:
            bowtie2_params.extend(['-1', input_fas[0], '-2', input_fas[1]])
        else:
            bowtie2_params.extend(['-U', input_fas[0]])

        # Example:
        # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
        #         --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
        #         -p 32 \
        #         -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
        command.execute(
            command_patterns.SingleCommand(
                cmd='bowtie2',
                args=bowtie2_params
            )
        )
        log.write("Finished Bowtie alignment.")

        if len(input_fas) == 2:
            convert.generate_unmapped_pairs_from_sam(output_sam_file,
                                                     output_fas)
        else:
            convert.generate_unmapped_singles_from_sam(output_sam_file,
                                                       output_fas[0])
 def get_command(self, input_file):
     output_file = input_file + '.lz4'
     log.write(f"input: {input_file} output: {output_file}")
     return command_patterns.SingleCommand(
         cmd="lz4",
         args=[
             "-9",  # max compression
             "-f",  # force overwrite output file
             input_file,
             output_file,
         ])
Example #25
    def run(self):
        """
          Build custom blast index from an S3 location or a url
        """
        _input_files = self.input_files_local[0]  # dummy in this case  # noqa
        output_tar_file = self.output_files_local()[0]

        db_type = self.additional_attributes['db_type']
        file_source = self.additional_attributes['data_source']
        output_db_name = output_tar_file.replace(".tar", "")

        if file_source.startswith("s3://"):
            db_file = s3.fetch_from_s3(file_source, self.output_dir_local)
        else:
            # Download with wget
            db_file = os.path.join(self.output_dir_local,
                                   os.path.basename(file_source))
            urllib.request.urlretrieve(file_source, db_file)
            self.additional_output_files_hidden.append(db_file)

        # Build blast index
        if db_file.endswith(".bz2"):
            command.execute(
                command_patterns.SingleCommand(cmd='bzip2',
                                               args=["-dk", db_file]))
            db_file = db_file[:-4]
        elif db_file.endswith(".zip"):
            command.execute(
                command_patterns.SingleCommand(cmd='unzip', args=[db_file]))
            db_file = db_file[:-4]

        command.execute(
            command_patterns.SingleCommand(cmd='makeblastdb',
                                           args=[
                                               "-in", db_file, "-dbtype",
                                               db_type, "-out", output_db_name
                                           ]))
        command.execute(
            command_patterns.SingleCommand(
                cmd='tar',
                args=["cvf", output_tar_file, output_db_name + ".*"]))
Example #26
 def trim_adapters_in_place(local_file):
     local_file_trimmed = os.path.join(
         os.path.dirname(local_file),
         "trimmed_" + os.path.basename(local_file))
     command.execute(
         command_patterns.SingleCommand(cmd='cutadapt',
                                        args=[
                                            "-a", "AGATCGGAAGAGCACACGTCT",
                                            "-o", local_file_trimmed,
                                            local_file
                                        ]))
     command.move_file(local_file_trimmed, local_file)
Example #27
    def run(self):
        input_fas = self.input_fas()
        output_fas = self.output_files_local()
        genome_dir = fetch_reference(self.additional_files["bowtie2_genome"],
                                     self.ref_dir_local,
                                     allow_s3mi=True,
                                     auto_untar=True)
        output_sam_file = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(output_sam_file)
        # The file structure looks like
        # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"
        genome_basename = command.glob(f"{genome_dir}/*.bt2*",
                                       max_results=1)[0]
        # remove two extensions: ex: hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
        genome_basename = os.path.splitext(
            os.path.splitext(genome_basename)[0])[0]

        bowtie2_params = [
            '-q', '-x', genome_basename, '-f', '--very-sensitive-local', '-S',
            output_sam_file
        ]

        # FIXME: https://jira.czi.team/browse/IDSEQ-2738
        #  We want to move towards a general randomness solution in which
        #  all randomness is seeded based on the content of the original input.
        #  This is currently introducing non-determinism and hard coding
        #  an arbitrary seed here shouldn't impact correctness.
        bowtie2_params.extend(
            ['--seed',
             '4'])  # chosen by fair dice roll, guaranteed to be random

        if len(input_fas) == 2:
            bowtie2_params.extend(['-1', input_fas[0], '-2', input_fas[1]])
        else:
            bowtie2_params.extend(['-U', input_fas[0]])

        # Example:
        # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
        #         --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
        #         -p 32 \
        #         -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
        command.execute(
            command_patterns.SingleCommand(cmd='bowtie2', args=bowtie2_params))
        log.write("Finished Bowtie alignment.")

        if len(input_fas) == 2:
            convert.generate_unmapped_pairs_from_sam(output_sam_file,
                                                     output_fas)
        else:
            convert.generate_unmapped_singles_from_sam(output_sam_file,
                                                       output_fas[0])
Example #28
    def run(self):
        """
          Generate Rapsearch2 index. To be called from idseq-infra

        """
        nr_db = self.input_files_local[0][0]
        output_nr_index = self.output_files_local()[0]
        output_nr_info_file = output_nr_index + '.info'
        log.write(f"input: {nr_db} output: {output_nr_index}")
        command.execute(
            command_patterns.SingleCommand(
                cmd="prerapsearch", args=["-d", nr_db, "-n", output_nr_index]))
        self.additional_output_files_hidden.append(output_nr_info_file)
Example #29
 def normalize_bam_file(self):
     """Ensure files needed are actually present"""
     if os.path.exists(self.output_files_local()[5]):
         return
     # For unpaired fastq inputs, srst2 gives a different name to the sorted bam file that it outputs
     # We rename the bam file to what we expect (as specified in the dag)
     unpaired_bam_path = f'{self.output_dir_local}/output___R1_001.ARGannot_r2.sorted.bam'
     if os.path.exists(unpaired_bam_path):
         command.execute(
             command_patterns.SingleCommand(
                 cmd='mv',
                 args=[unpaired_bam_path,
                       self.output_files_local()[5]]))
Example #30
def scp(key_path, remote_username, instance_ip, remote_path, local_path):
    assert " " not in key_path
    assert " " not in remote_path
    assert " " not in local_path
    # ServerAliveInterval fixes an issue where containers kept an SSH
    # connection open even after worker machines had finished running.
    return command_patterns.SingleCommand(
        cmd="scp",
        args=[
            "-o", "StrictHostKeyChecking no", "-o", "ConnectTimeout 15", "-o",
            "ServerAliveInterval 60", "-i", key_path,
            f"{remote_username}@{instance_ip}:{remote_path}", local_path
        ])