def manage_reference_downloads_cache(self):
    """Report reference-cache hit statistics, then ask the s3 util to make space.

    A purge sentinel file is touched first and the method then sleeps
    briefly so that the sentinel's timestamp is older than anything the
    current stage touches afterwards.  If the roll call reports any
    references at all, a one-line summary and a structured per-file
    hit/miss report are logged.
    """
    command.make_dirs(PURGE_SENTINEL_DIR)
    command.touch(PURGE_SENTINEL)
    # 1 should be enough for the sentinel to predate all current stage downloads
    time.sleep(3)

    present_set, missing_set = self.references_roll_call()
    total_set = present_set | missing_set
    if total_set:
        hits, misses, requests = len(present_set), len(missing_set), len(total_set)
        log.write(
            f"Reference download cache: {hits} of {requests} large files ({100 * hits / requests:3.1f} percent) already exist locally."
        )
        summary = {
            "cache_requests": requests,
            "cache_hits": hits,
            "cache_misses": misses,
            "cache_hit_rate": hits / requests
        }
        # One entry per requested file, in sorted order.
        per_request = {}
        for ref in sorted(total_set):
            per_request[ref] = "hit" if ref in present_set else "miss"
        log.log_event("Reference downloads cache efficiency report",
                      values={
                          "summary_stats": summary,
                          "per_request_stats": per_request
                      })
    # make sure to touch this stage's files before deleting LRU ones
    idseq_dag.util.s3.make_space()
Beispiel #2
0
    def test_make_dirs(self):
        '''WHEN make_dirs is called on a nested path, THEN every level of it exists afterwards'''
        nested_dir = os.path.join(TMP_FOLDER, "1", "2")
        failure_message = f"folder {nested_dir} doesn't exist"

        command.make_dirs(nested_dir)

        created = os.path.exists(nested_dir)
        self.assertTrue(created, failure_message)
Beispiel #3
0
    def generate_mapped_reads_tsv(self):
        """Use bedtools to generate a table of mapped reads for each genome in the ARG ANNOT database.

        If a new resistance gene db is used, the .bed file will need to be updated manually.

        The coverage table is written to MATCHED_READS_FILE inside
        self.output_dir_local.  The temporary sort directory is removed
        whether or not the bedtools/sort pipeline succeeds, so a failed run
        does not leave (potentially large) sort spill files behind.
        """
        bed_file_path = fetch_reference(
            self.additional_files["resist_genome_bed"],
            self.ref_dir_local,
            allow_s3mi=False)
        # The sixth output file of this step is used as the BAM input.
        sample_bam_file_path = self.output_files_local()[5]

        tmp_sort_dir = os.path.join(self.output_dir_local, "tmp_sort")
        command.make_dirs(tmp_sort_dir)

        try:
            # Convert the sorted.bam output from SRST2 to the bed format, then sort the bed file.
            # This allows us to use the "sorted" mode of bedtools coverage, which is memory-efficient.
            # Otherwise, large sorted.bam files will cause our machines to run out of RAM.
            #
            # Note that despite being called "sorted.bam", the bam is not sorted the way we need it to be.
            #
            # env LC_ALL=C ensures that the sort command uses the same sort order on all machines.
            #
            # The -T flag with tmp_sort_dir ensures that we make tmp files inside /mnt, which is where our huge AWS volumes are mounted.
            # By default, the sort command creates temp files in /tmp, which has very little disk space.
            command.execute(
                command_patterns.ShellScriptCommand(
                    script='''
                    bedtools bamtobed -i "$1" |
                    env LC_ALL=C sort -T "$2" -k1,1 -k2,2n |
                    bedtools coverage -sorted -a "$3" -b stdin > "$4";''',
                    args=[
                        sample_bam_file_path, tmp_sort_dir, bed_file_path,
                        os.path.join(self.output_dir_local, MATCHED_READS_FILE)
                    ]))
        finally:
            # Always drop the scratch directory, even if the pipeline above raised.
            command.remove_rf(tmp_sort_dir)
Beispiel #4
0
 def generate_read_to_contig_mapping(assembled_contig, fasta_file,
                                     read2contig,
                                     duplicate_cluster_sizes_path,
                                     output_bowtie_sam,
                                     output_contig_stats):
     """Map reads onto assembled contigs with bowtie2 and dump contig stats as JSON.

     A bowtie2 index is built from ``assembled_contig`` next to ``fasta_file``,
     the reads in ``fasta_file`` are aligned against it into
     ``output_bowtie_sam``, and the statistics returned by
     ``generate_info_from_sam`` are written to ``output_contig_stats``.
     """
     # The same path is created as a directory and passed to bowtie2 as the index name.
     index_path = os.path.join(os.path.dirname(fasta_file), 'bowtie-contig')
     command.make_dirs(index_path)

     build_index = command_patterns.SingleCommand(
         cmd='bowtie2-build',
         args=[assembled_contig, index_path])
     command.execute(build_index)

     align_reads = command_patterns.ShellScriptCommand(
         script=
         r'''bowtie2 -x "${bowtie_index_path}" -f -U "${fasta_file}" --very-sensitive -p 32 > "${output_bowtie_sam}";''',
         named_args={
             'bowtie_index_path': index_path,
             'fasta_file': fasta_file,
             'output_bowtie_sam': output_bowtie_sam
         })
     command.execute(align_reads)

     stats = PipelineStepRunAssembly.generate_info_from_sam(
         output_bowtie_sam, read2contig, duplicate_cluster_sizes_path)
     with open(output_contig_stats, 'w') as stats_file:
         json.dump(stats, stats_file)
Beispiel #5
0
    def run(self):
        """Write one nonhost fastq per input fastq, using scratch header files.

        The first input target supplies the fastq files and the first file
        of the second input target supplies the nonhost fasta.  The header
        scratch files are deleted once the outputs have been generated.
        """
        scratch_dir = os.path.join(self.output_dir_local,
                                   "scratch_nonhost_fastq")
        command.make_dirs(scratch_dir)
        self.nonhost_headers = [
            os.path.join(scratch_dir, header_name)
            for header_name in ("nonhost_headers_r1.txt",
                                "nonhost_headers_r2.txt")
        ]

        # Assumed to be [R1.fastq, R2.fastq] if there are two read files.
        raw_fastqs = self.input_files_local[0]
        nonhost_fasta = self.input_files_local[1][0]
        output_fastqs = self.output_files_local()
        fastqs = self.unzip_files(raw_fastqs)

        self.generate_nonhost_headers(nonhost_fasta)

        for idx, fastq in enumerate(fastqs):
            self.generate_nonhost_fastq(self.nonhost_headers[idx], fastq,
                                        output_fastqs[idx])

        # Clean up scratch files.
        for header_path in self.nonhost_headers:
            os.remove(header_path)
    def run(self):
        """Generate per-level taxid locator files plus one combined JSON.

        The output list is consumed positionally: for each (level, db) pair
        a fasta followed by a json, and the final entry is the combined json.
        """
        # Setup
        input_fa = self.input_files_local[0][0]
        out_files = self.output_files_local()
        scratch = os.path.join(self.output_dir_local, "scratch_taxid_locator")
        command.make_dirs(scratch)

        # TODO: Design a way to map in/out files more robustly, e.g. by name/type
        # Generate locator files for species NT, species NR, genus NT...
        level_db_pairs = [(level, db)
                          for level in ["species", "genus", "family"]
                          for db in ("NT", "NR")]
        for pair_idx, (level, db) in enumerate(level_db_pairs):
            fa_pos = 2 * pair_idx
            PipelineStepGenerateTaxidLocator.generate_locator_work(
                input_fa, f"{level}_{db.lower()}", db, out_files[fa_pos],
                out_files[fa_pos + 1], scratch)

        # Combine the per-pair JSON files (odd 0-based positions in the output list)
        per_pair_jsons = [
            path for pos, path in enumerate(out_files) if pos % 2 == 1
        ]
        combined_json = out_files[-1]  # Last combined file
        PipelineStepGenerateTaxidLocator.combine_json(per_pair_jsons,
                                                      combined_json)

        # Cleanup
        command.remove_rf(scratch)
    def __init__(self, lazy_run, dag_json, versioned_output):
        '''
            Build the flow from a DAG json.

            See examples/example_dag.json and
                idseq_dag.main.validate_dag_json for more details.

            When versioned_output is truthy, the parsed idseq_dag version is
            appended to the S3 output directory.  The local output and
            reference directories are created as part of construction.
        '''
        self.lazy_run = lazy_run
        conf = PipelineFlow.parse_and_validate_conf(dag_json)
        self.targets = conf["targets"]
        self.steps = conf["steps"]
        self.given_targets = conf["given_targets"]
        self.output_dir_s3 = conf["output_dir_s3"]
        self.name = conf["name"]
        if versioned_output:
            version_dir = self.parse_output_version(idseq_dag.__version__)
            self.output_dir_s3 = os.path.join(self.output_dir_s3, version_dir)

        local_out = conf.get("output_dir_local", DEFAULT_OUTPUT_DIR_LOCAL)
        self.output_dir_local = local_out.rstrip('/')
        self.ref_dir_local = conf.get("ref_dir_local", DEFAULT_REF_DIR_LOCAL)

        # Share the reference dir and purge sentinel with the s3 util module.
        s3_config = idseq_dag.util.s3.config
        s3_config["REF_DIR"] = self.ref_dir_local
        s3_config["PURGE_SENTINEL"] = PURGE_SENTINEL
        # idseq_dag.util.s3.config["REF_FETCH_LOG_DIR"] = os.path.join(self.ref_dir_local, "fetch_log")
        self.large_file_list = []

        for local_dir in (self.output_dir_local, self.ref_dir_local):
            command.make_dirs(local_dir)
Beispiel #8
0
    def run_star_part(self,
                      output_dir,
                      genome_dir,
                      input_files,
                      count_genes,
                      use_starlong):
        """Run STAR (or STARlong) once against one genome part, inside output_dir.

        The SAM/BAM and quantification flags depend on
        self.collect_insert_size_metrics_for ("rna", "dna" or anything else);
        gene counting outside the "rna" mode additionally requires
        count_genes and the GTF-derived junction file in genome_dir.
        """
        command.make_dirs(output_dir)

        thread_count = str(multiprocessing.cpu_count())
        star_args = [
            '--outFilterMultimapNmax', '99999',
            '--outFilterScoreMinOverLread', '0.5',
            '--outFilterMatchNminOverLread', '0.5',
            '--outReadsUnmapped', 'Fastx',
            '--outFilterMismatchNmax', '999',
            '--clip3pNbases', '0',
            '--runThreadN', thread_count,
            '--genomeDir', genome_dir,
            '--readFilesIn', *input_files
        ]

        metrics_target = self.collect_insert_size_metrics_for
        if metrics_target == "rna":
            star_args.extend([
                '--outSAMtype', 'BAM', 'Unsorted',
                '--outSAMmode', 'NoQS',
                # Based on experimentation we always want --quantMode TranscriptomeSAM GeneCounts
                #   for RNA to collect transcriptome-specific results to compute insert size metrics on
                #   https://czi.quip.com/4niiAhiJsFNx/2019-11-15-CollectInsertSizeMetrics-for-RNA
                '--quantMode', 'TranscriptomeSAM', 'GeneCounts',
            ])
        else:
            if metrics_target == "dna":
                sam_args = ['--outSAMtype', 'BAM', 'Unsorted', '--outSAMmode', 'NoQS']
            else:
                sam_args = ['--outSAMmode', 'None']
            star_args.extend(sam_args)

            # Only count genes when asked to and the GTF junction table exists.
            junction_table = f"{genome_dir}/sjdbList.fromGTF.out.tab"
            if count_genes and os.path.isfile(junction_table):
                star_args.extend(['--quantMode', 'GeneCounts'])

        if use_starlong:
            star_args.extend([
                '--seedSearchStartLmax', '20',
                '--seedPerReadNmax', '100000',
                '--seedPerWindowNmax', '1000',
                '--alignTranscriptsPerReadNmax', '100000',
                '--alignTranscriptsPerWindowNmax', '10000'])

        star_command = command_patterns.SingleCommand(
            cd=output_dir,
            cmd='STARlong' if use_starlong else 'STAR',
            args=star_args
        )
        command.execute(star_command)
Beispiel #9
0
 def __init__(self, *args, **kwrds):
     """Initialise the step and its chunk bookkeeping.

     Sets the local and S3 "chunks" result directories, a semaphore sized
     by the 'chunks_in_flight' additional attribute, an upload semaphore,
     and creates the local chunks directory.
     """
     PipelineStep.__init__(self, *args, **kwrds)
     self.chunks_result_dir_local = os.path.join(self.output_dir_local,
                                                 "chunks")
     self.chunks_result_dir_s3 = os.path.join(self.output_dir_s3, "chunks")
     in_flight_limit = self.additional_attributes['chunks_in_flight']
     self.chunks_in_flight = threading.Semaphore(in_flight_limit)
     self.iostream_upload = multiprocessing.Semaphore(
         MAX_CONCURRENT_CHUNK_UPLOADS)
     command.make_dirs(self.chunks_result_dir_local)
 def __init__(self, *args, **kwrds):
     """Initialise the alignment step.

     The algorithm comes from the "alignment_algorithm" additional
     attribute, falling back to the legacy "service" attribute, and must be
     "gsnap" or "rapsearch2".  The local chunks directory is created.
     """
     PipelineStep.__init__(self, *args, **kwrds)
     attrs = self.additional_attributes
     # TODO: (tmorse) remove service compatibility https://jira.czi.team/browse/IDSEQ-2568
     legacy_algorithm = attrs.get("service")
     self.alignment_algorithm = attrs.get("alignment_algorithm",
                                          legacy_algorithm)
     assert self.alignment_algorithm in ("gsnap", "rapsearch2")
     self.chunks_in_flight_semaphore = threading.Semaphore(
         MAX_CHUNKS_IN_FLIGHT)
     local_chunks = os.path.join(self.output_dir_local, "chunks")
     self.chunks_result_dir_local = local_chunks
     self.chunks_result_dir_s3 = os.path.join(self.output_dir_s3, "chunks")
     self.batch_job_desc_bucket = get_batch_job_desc_bucket()
     command.make_dirs(local_chunks)
Beispiel #11
0
    def assemble(
            input_fasta,
            input_fasta2,
            bowtie_fasta,  # fasta file for running bowtie against contigs
            duplicate_cluster_sizes_path,
            assembled_contig,
            assembled_scaffold,
            bowtie_sam,
            contig_stats,
            read2contig,
            memory=100):
        """Assemble reads with SPAdes and map reads back onto the contigs.

        SPAdes runs in paired mode when ``input_fasta2`` is truthy, otherwise
        in single-read mode.  Its contigs/scaffolds are moved to
        ``assembled_contig`` / ``assembled_scaffold`` and
        ``generate_read_to_contig_mapping`` is then run on the result.

        Any ordinary failure is treated as "assembly failed": placeholder
        output files are written and the traceback is printed, but nothing is
        raised.  Interpreter-level exits (KeyboardInterrupt, SystemExit) are
        deliberately NOT swallowed.  The SPAdes work directory is removed in
        every case.
        """
        basedir = os.path.dirname(assembled_contig)
        assembled_dir = os.path.join(basedir, 'spades')
        command.make_dirs(assembled_dir)
        assembled_contig_tmp = os.path.join(assembled_dir, 'contigs.fasta')
        assembled_scaffold_tmp = os.path.join(assembled_dir, 'scaffolds.fasta')

        # Only the read-input flags differ between paired and single mode.
        if input_fasta2:
            read_args = ["-1", input_fasta, "-2", input_fasta2]
        else:
            read_args = ["-s", input_fasta]

        try:
            command.execute(
                command_patterns.SingleCommand(cmd="spades.py",
                                               args=[
                                                   *read_args, "-o",
                                                   assembled_dir, "-m",
                                                   memory, "-t", 32,
                                                   "--only-assembler"
                                               ]))
            command.move_file(assembled_contig_tmp, assembled_contig)
            command.move_file(assembled_scaffold_tmp, assembled_scaffold)

            PipelineStepRunAssembly.generate_read_to_contig_mapping(
                assembled_contig, bowtie_fasta, read2contig,
                duplicate_cluster_sizes_path, bowtie_sam, contig_stats)
        except Exception:  # not bare: let KeyboardInterrupt/SystemExit propagate
            # Assembly failed. create dummy output files
            command.write_text_to_file(';ASSEMBLY FAILED', assembled_contig)
            command.write_text_to_file(';ASSEMBLY FAILED', assembled_scaffold)
            command.write_text_to_file('@NO INFO', bowtie_sam)
            command.write_text_to_file('{}', contig_stats)
            traceback.print_exc()
        finally:
            # Remove the SPAdes scratch output regardless of how we got here.
            command.remove_rf(assembled_dir)
Beispiel #12
0
    def run(self):
        """
        Build and write the coverage viz JSON outputs.

        Reads the input files with the help of the info db, computes
        per-accession coverage data and a per-taxon summary, writes the
        summary to the first output file, and writes one JSON per accession
        into a "coverage_viz" folder that is registered as a hidden output
        folder.
        """
        attrs = self.additional_attributes
        min_contig_size = attrs.get("min_contig_size", MIN_CONTIG_SIZE)
        num_accessions_per_taxon = attrs.get("num_accessions_per_taxon",
                                             NUM_ACCESSIONS_PER_TAXON)
        max_num_bins_coverage = attrs.get("max_num_bins_coverage",
                                          MAX_NUM_BINS_COVERAGE)

        info_db = s3.fetch_reference(self.additional_files["info_db"],
                                     self.ref_dir_local,
                                     allow_s3mi=True)
        with open_file_db_by_extension(
                info_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as info_dict:
            # Extract data from input files.
            taxon_data, accession_data, contig_data, read_data = \
                self.prepare_data(self.input_files_local, info_dict,
                                  min_contig_size, num_accessions_per_taxon)

        # Generate the coverage viz data for each accession.
        coverage_viz_data = self.generate_coverage_viz_data(
            accession_data, contig_data, read_data, max_num_bins_coverage)

        # Generate the summary data, which contains a dict of all taxons for which coverage viz data is available.
        # For each taxon, summary data for the best accessions, plus the number of total accessions, is included.
        summary_data = self.generate_coverage_viz_summary_data(
            taxon_data, accession_data, coverage_viz_data)

        # Write the summary JSON file which is initially loaded on the report page.
        summary_path = self.output_files_local()[0]
        with open(summary_path, 'w') as summary_file:
            json.dump(summary_data, summary_file)

        # Create a separate coverage viz JSON file for each accession.
        # This file will be passed to the front-end when the user views that particular accession.
        coverage_viz_dir = os.path.join(self.output_dir_local, "coverage_viz")
        command.make_dirs(coverage_viz_dir)
        for accession_id in coverage_viz_data:
            accession_path = os.path.join(
                coverage_viz_dir, f"{accession_id}_coverage_viz.json")
            accession_payload = coverage_viz_data[accession_id]
            with open(accession_path, 'w') as accession_file:
                json.dump(accession_payload, accession_file)

        self.additional_output_folders_hidden.append(coverage_viz_dir)
Beispiel #13
0
    def run_with_tax_ids(self, tax_ids: Optional[Set[int]],
                         filename: Optional[str]) -> None:
        """Generate nonhost fastqs, optionally restricted to the given tax ids.

        tax_ids and filename must be given together or not at all.  With a
        filename, outputs are written next to each input fastq under a
        "<filename>__<output basename>" name and registered as hidden
        additional outputs; otherwise the step's regular output files are
        used.
        """
        both_given = tax_ids and filename
        neither_given = not (tax_ids or filename)
        assert both_given or neither_given, \
            'Must be supplied with tax_ids and filename or neither'

        scratch_dir = os.path.join(self.output_dir_local,
                                   "scratch_nonhost_fastq")
        command.make_dirs(scratch_dir)
        self.nonhost_headers = [
            os.path.join(scratch_dir, header_name)
            for header_name in ("nonhost_headers_r1.txt",
                                "nonhost_headers_r2.txt")
        ]

        # Assumed to be [R1.fastq, R2.fastq] if there are two read files.
        fastqs = self.input_files_local[0]

        nonhost_fasta = self.input_files_local[1][0]

        clusters_dict = None
        count_all = READ_COUNTING_MODE == ReadCountingMode.COUNT_ALL
        if count_all and self.additional_attributes.get("use_taxon_whitelist"):
            # TODO: (gdingle): Show all duplicate reads, not just if
            # use_taxon_whitelist. See https://jira.czi.team/browse/IDSEQ-2598.
            # NOTE: this will load the set of all original read headers, which
            # could be several GBs in the worst case.
            clusters_dict = parse_clusters_file(self.input_files_local[2][0],
                                                self.input_files_local[3][0])

        if filename is not None:
            output_fastqs = []
            for i, fastq in enumerate(fastqs):
                out_name = os.path.basename(self.output_files_local()[i])
                output_fastqs.append(
                    f"{os.path.dirname(fastq)}/{filename}__{out_name}")
            self.additional_output_files_hidden.extend(output_fastqs)
        else:
            output_fastqs = self.output_files_local()

        fastqs = self.unzip_files(fastqs)

        self.generate_nonhost_headers(nonhost_fasta, clusters_dict, tax_ids)

        for idx, fastq in enumerate(fastqs):
            self.generate_nonhost_fastq(self.nonhost_headers[idx], fastq,
                                        output_fastqs[idx])

        # Clean up scratch files.
        for header_path in self.nonhost_headers:
            os.remove(header_path)
Beispiel #14
0
 def fetch_input_files_from_s3(input_files, input_dir_s3, result_dir_local):
     """Fetch each input file from S3 and write a timestamped done-file for it.

     Raises:
         RuntimeError: when fetch_from_s3 returns a falsy result for a file.
     """
     for name in input_files:
         s3_file = os.path.join(input_dir_s3, name)
         local_file = os.path.join(result_dir_local, name)
         local_dir = os.path.dirname(local_file)
         command.make_dirs(local_dir)
         # copy the file over
         fetched = idseq_dag.util.s3.fetch_from_s3(s3_file, local_dir, allow_s3mi=True)
         if not fetched:
             raise RuntimeError(f"{s3_file} likely doesn't exist")
         # write the done_file
         timestamp = datetime.datetime.now(tz=pytz.UTC).strftime("%a %b %e %H:%M:%S %Z %Y")
         command.write_text_to_file(timestamp, PipelineStep.done_file(local_file))
 def __init__(self, *args, **kwrds):
     """Initialise the alignment step for either a local or a remote run.

     Local runs ("run_locally" attribute truthy) require an "index"
     additional file; remote runs must not be given one.
     """
     PipelineStep.__init__(self, *args, **kwrds)
     attrs = self.additional_attributes
     self.alignment_algorithm = attrs.get("alignment_algorithm")
     assert self.alignment_algorithm in ("gsnap", "rapsearch2")
     self.chunks_in_flight_semaphore = threading.Semaphore(MAX_CHUNKS_IN_FLIGHT)
     self.chunks_result_dir_local = os.path.join(self.output_dir_local, "chunks")
     self.chunks_result_dir_s3 = os.path.join(self.output_dir_s3, "chunks")
     self.batch_job_desc_bucket = get_batch_job_desc_bucket()
     self.is_local_run = bool(attrs.get("run_locally"))
     self.genome_name = attrs.get("genome_name", "nt_k16")
     self.index = self.additional_files.get("index")
     if not self.is_local_run:
         assert not self.index, "passing in an index is not supported for remote runs"
         # NOTE(review): the chunks dir is only created for remote runs here;
         # confirm local runs do not also need it.
         command.make_dirs(self.chunks_result_dir_local)
     else:
         assert self.index, "local runs require an index to be passed in"
    def get_step_object(step_class, step_name, paired=True):
        """Build a step_class instance for testing, with its inputs downloaded.

        The step whose "out" equals step_name is looked up in the paired or
        single default dag, its input targets are fetched from S3 into a
        per-process local results folder, and the step is constructed with
        the dag's files and attributes.

        Raises:
            ValueError: when no step in the dag produces step_name.
        """
        dag = IdseqStepSetup.paired_dag() if paired else IdseqStepSetup.single_dag()
        step_info = next(
            (candidate for candidate in dag["steps"]
             if candidate["out"] == step_name), None)
        if not step_info:
            raise ValueError(f"no steps correspond to {step_name}")

        # Download input data to local
        paired_flag = int(paired)
        run_dir_name = "testrun_%s_%d_%d" % (step_name, paired_flag,
                                             int(time.time()))
        output_dir_s3 = os.path.join(dag["output_dir_s3"], run_dir_name)
        result_dir_local = "/mnt/idseq/results/%s_%d/%d" % (step_name,
                                                            paired_flag,
                                                            os.getpid())
        ref_dir_local = '/mnt/idseq/ref'
        for local_dir in (result_dir_local, ref_dir_local):
            command.make_dirs(local_dir)

        input_files = []
        given_targets = dag["given_targets"]
        for target in step_info["in"]:
            # Given targets live in their own S3 dir; the rest in the dag output dir.
            if target in given_targets:
                input_dir_s3 = given_targets[target]["s3_dir"]
            else:
                input_dir_s3 = dag["output_dir_s3"]
            target_files = dag["targets"][target]
            input_files.append(target_files)
            PipelineFlow.fetch_input_files_from_s3(
                target_files, input_dir_s3, result_dir_local)

        return step_class(
            name=step_name,
            input_files=input_files,
            output_files=dag["targets"][step_info["out"]],
            output_dir_local=result_dir_local,
            output_dir_s3=output_dir_s3,
            ref_dir_local=ref_dir_local,
            additional_files=step_info["additional_files"],
            additional_attributes=step_info["additional_attributes"])
Beispiel #17
0
    def make_star_index(fasta_file, gtf_file, output_star_genome_path,
                        max_star_part_size):
        """Build a (possibly multi-part) STAR index and tar it up.

        The genome directory name is output_star_genome_path minus its last
        four characters (presumably a ".tar" suffix -- TODO confirm).  The
        fasta is split when max_star_part_size is set and exceeded; each part
        is indexed into "part-<i>", and only part 0 receives the GTF.  The
        number of parts is recorded in parts.txt before archiving.
        """
        star_genome_dir_name = output_star_genome_path[:-4]

        # star genome organization
        # STAR_genome/part-${i}, parts.txt
        needs_split = max_star_part_size and os.path.getsize(
            fasta_file) > max_star_part_size
        if needs_split:
            fasta_parts = PipelineStepGenerateHostGenome.split_fasta(
                fasta_file, max_star_part_size)
        else:
            fasta_parts = [fasta_file]

        for part_idx in range(len(fasta_parts)):
            log.write("start making STAR index part %d" % part_idx)
            gtf_args = ["--sjdbGTFfile", gtf_file] if part_idx == 0 and gtf_file else []

            part_dir = f"{star_genome_dir_name}/part-{part_idx}"
            command.make_dirs(part_dir)

            star_args = [
                '--runThreadN',
                str(multiprocessing.cpu_count()), '--runMode',
                'genomeGenerate', *gtf_args, '--genomeDir',
                part_dir, '--genomeFastaFiles', fasta_parts[part_idx],
                '--limitGenomeGenerateRAM',
                virtual_memory().available
            ]
            star_build = command_patterns.SingleCommand(cmd='STAR',
                                                        args=star_args)
            command.execute(star_build)
            log.write(f"finished making STAR index part {part_idx}")

        # record # parts into parts.txt
        parts_file = os.path.join(star_genome_dir_name, "parts.txt")
        command.write_text_to_file(len(fasta_parts), parts_file)

        # Archive the genome folder relative to its parent directory.
        star_work_dir, star_genome = os.path.split(star_genome_dir_name)
        command.execute(
            command_patterns.SingleCommand(cmd="tar",
                                           args=[
                                               "cvf", output_star_genome_path,
                                               "-C", star_work_dir, star_genome
                                           ]))
    def get_test_step_object(step_class,
                             step_name,
                             dag_file,
                             input_dir_s3=None,
                             output_dir_s3=None):
        """Build a step_class instance for testing from a dag json file.

        The step whose "out" equals step_name is located in the dag, its
        inputs are fetched from S3 (input_dir_s3 overrides the dag's own
        locations when given), and the step is constructed against a
        per-process local results folder.

        Raises:
            ValueError: when no step in the dag produces step_name.
        """
        with open(dag_file) as dag_handle:
            dag = json.load(dag_handle)
        step_info = next(
            (candidate for candidate in dag["steps"]
             if candidate["out"] == step_name), None)
        if not step_info:
            raise ValueError(f"no steps correspond to {step_name}")

        # Download input data to local
        base_output_dir = output_dir_s3 or dag["output_dir_s3"]
        output_dir_s3 = os.path.join(
            base_output_dir, f"testrun_{step_name}_{int(time.time())}")
        result_dir_local = f"/mnt/idseq/results/{step_name}/{os.getpid()}"
        ref_dir_local = '/mnt/idseq/ref'
        for local_dir in (result_dir_local, ref_dir_local):
            command.make_dirs(local_dir)

        input_files = []
        for target in step_info["in"]:
            # Explicit override first, then a given target's own dir, then the dag output dir.
            if input_dir_s3:
                source_dir = input_dir_s3
            elif target in dag["given_targets"]:
                source_dir = dag["given_targets"][target]["s3_dir"]
            else:
                source_dir = dag["output_dir_s3"]
            target_files = dag["targets"][target]
            input_files.append(target_files)
            PipelineFlow.fetch_input_files_from_s3(target_files, source_dir,
                                                   result_dir_local)

        return step_class(
            name=step_name,
            input_files=input_files,
            output_files=dag["targets"][step_info["out"]],
            output_dir_local=result_dir_local,
            output_dir_s3=output_dir_s3,
            ref_dir_local=ref_dir_local,
            additional_files=step_info["additional_files"],
            additional_attributes=step_info["additional_attributes"])
Beispiel #19
0
 def make_bowtie2_index(host_name, fasta_file, output_bowtie2_index):
     """Build a bowtie2 index named host_name and archive it with tar.

     The index directory is output_bowtie2_index minus its last four
     characters (presumably a ".tar" suffix -- TODO confirm); bowtie2-build
     is run from inside that directory.
     """
     index_dir = output_bowtie2_index[:-4]
     command.make_dirs(index_dir)
     build_index = command_patterns.SingleCommand(cd=index_dir,
                                                  cmd='bowtie2-build',
                                                  args=[fasta_file, host_name])
     command.execute(build_index)
     log.write("finished making bowtie2 index")

     # archive
     work_dir, genome_name = os.path.split(index_dir)
     archive_index = command_patterns.SingleCommand(
         cmd="tar",
         args=["cvf", output_bowtie2_index, "-C", work_dir, genome_name])
     command.execute(archive_index)
Beispiel #20
0
    def run_with_tax_ids(
        self,
        tax_ids: Optional[Set[int]],
        filename: Optional[str],
        clusters_dict: Dict[str, List] = None,
    ) -> None:
        """Generate nonhost fastqs, optionally restricted to the given tax ids.

        tax_ids and filename must be given together or not at all.  With a
        filename, outputs are written next to each input fastq under a
        "<filename>__<output basename>" name and registered as hidden
        additional outputs; otherwise the step's regular output files are
        used.  clusters_dict is passed through to generate_nonhost_headers.
        """
        both_given = tax_ids and filename
        neither_given = not (tax_ids or filename)
        assert both_given or neither_given, \
            'Must be supplied with tax_ids and filename or neither'

        scratch_dir = os.path.join(self.output_dir_local,
                                   "scratch_nonhost_fastq")
        command.make_dirs(scratch_dir)
        self.nonhost_headers = [
            os.path.join(scratch_dir, header_name)
            for header_name in ("nonhost_headers_r1.txt",
                                "nonhost_headers_r2.txt")
        ]

        # Assumed to be [R1.fastq, R2.fastq] if there are two read files.
        fastqs = self.input_files_local[0]

        nonhost_fasta = self.input_files_local[1][0]

        if filename is not None:
            output_fastqs = []
            for i, fastq in enumerate(fastqs):
                out_name = os.path.basename(self.output_files_local()[i])
                output_fastqs.append(
                    f"{os.path.dirname(fastq)}/{filename}__{out_name}")
            self.additional_output_files_hidden.extend(output_fastqs)
        else:
            output_fastqs = self.output_files_local()

        fastqs = self.unzip_files(fastqs)

        self.generate_nonhost_headers(nonhost_fasta, clusters_dict, tax_ids)

        for idx, fastq in enumerate(fastqs):
            self.generate_nonhost_fastq(self.nonhost_headers[idx], fastq,
                                        output_fastqs[idx])

        # Clean up scratch files.
        for header_path in self.nonhost_headers:
            os.remove(header_path)
Beispiel #21
0
    def dump_align_viz_json(self, output_json_dir, db_type, result_dict):
        """Write one align_viz JSON per family, genus and species in result_dict.

        result_dict is walked as family -> genus -> species nested dicts.
        Each level's dict is dumped to
        "<output_json_dir>/<db_type>.<level>.<id>.align_viz.json", and the
        folder is then registered as a hidden additional output folder.
        """
        def write_level(tag, lin_id, payload):
            # Lineage ids are normalised through int() for the file name.
            target = f"{output_json_dir}/{db_type}.{tag}.{int(lin_id)}.align_viz.json"
            with open(target, 'w') as handle:
                json.dump(payload, handle)

        # Generate JSON files for the align_viz folder
        command.make_dirs(output_json_dir)
        for family_id, genera in result_dict.items():
            write_level("family", family_id, genera)
            for genus_id, species_map in genera.items():
                write_level("genus", genus_id, species_map)
                for species_id, species_payload in species_map.items():
                    write_level("species", species_id, species_payload)
        self.additional_output_folders_hidden.append(output_json_dir)
Beispiel #22
0
    def run_star_part(self,
                      output_dir,
                      genome_dir,
                      input_files,
                      count_genes,
                      use_starlong):
        """Run STAR (or STARlong) once against one genome part, inside output_dir.

        The SAM/BAM and quantification flags depend on
        self.collect_insert_size_metrics_for ("rna", "dna" or anything else);
        gene counting outside the "rna" mode additionally requires
        count_genes and the GTF-derived junction file in genome_dir.
        """
        command.make_dirs(output_dir)

        # FIXME: https://jira.czi.team/browse/IDSEQ-2738
        #  We want to move towards a general randomness solution in which
        #  all randomness is seeded based on the content of the original input.
        #  STAR takes in a rng seed and it defaults this seed to 777. It is
        #  explicitly set here to call out this behavior so it can be updated
        #  when we update the rest of our rng seeds.
        rng_seed = '777'

        # default is 1000000, this caused a crash so it was doubled
        limit_out_sj_collapsed = '2000000'

        thread_count = str(multiprocessing.cpu_count())
        star_args = [
            '--outFilterMultimapNmax', '99999',
            '--outFilterScoreMinOverLread', '0.5',
            '--outFilterMatchNminOverLread', '0.5',
            '--outReadsUnmapped', 'Fastx',
            '--outFilterMismatchNmax', '999',
            '--clip3pNbases', '0',
            '--limitOutSJcollapsed', limit_out_sj_collapsed,
            '--runThreadN', thread_count,
            '--runRNGseed', rng_seed,
            '--genomeDir', genome_dir,
            '--readFilesIn', *input_files
        ]

        metrics_target = self.collect_insert_size_metrics_for
        if metrics_target == "rna":
            star_args.extend([
                '--outSAMtype', 'BAM', 'Unsorted',
                '--outSAMmode', 'NoQS',
                # Based on experimentation we always want --quantMode TranscriptomeSAM GeneCounts
                #   for RNA to collect transcriptome-specific results to compute insert size metrics on
                #   https://czi.quip.com/4niiAhiJsFNx/2019-11-15-CollectInsertSizeMetrics-for-RNA
                '--quantMode', 'TranscriptomeSAM', 'GeneCounts',
            ])
        else:
            if metrics_target == "dna":
                sam_args = ['--outSAMtype', 'BAM', 'Unsorted', '--outSAMmode', 'NoQS']
            else:
                sam_args = ['--outSAMmode', 'None']
            star_args.extend(sam_args)

            # Only count genes when asked to and the GTF junction table exists.
            junction_table = f"{genome_dir}/sjdbList.fromGTF.out.tab"
            if count_genes and os.path.isfile(junction_table):
                star_args.extend(['--quantMode', 'GeneCounts'])

        if use_starlong:
            star_args.extend([
                '--seedSearchStartLmax', '20',
                '--seedPerReadNmax', '100000',
                '--seedPerWindowNmax', '1000',
                '--alignTranscriptsPerReadNmax', '100000',
                '--alignTranscriptsPerWindowNmax', '10000'])

        star_command = command_patterns.SingleCommand(
            cd=output_dir,
            cmd='STARlong' if use_starlong else 'STAR',
            args=star_args
        )
        command.execute(star_command)
Beispiel #23
0
def fetch_from_s3(
        src,  # pylint: disable=dangerous-default-value
        dst,
        auto_unzip=DEFAULT_AUTO_UNZIP,
        auto_untar=DEFAULT_AUTO_UNTAR,
        allow_s3mi=DEFAULT_ALLOW_S3MI,
        okay_if_missing=False,
        is_reference=False,
        touch_only=False,
        mutex=TraceLock("fetch_from_s3", multiprocessing.RLock()),
        locks={}):
    """Fetch a file from S3 if needed, using either s3mi or aws cp.

    IT IS NOT SAFE TO CALL THIS FUNCTION FROM MULTIPLE PROCESSES.
    It is totally fine to call it from multiple threads (it is designed for that).

    Args:
        src: Source location.  Anything that does not start with "s3://" is
            treated as a local path and is never downloaded (see the
            compatibility note at the top of the function body).
        dst: Destination path.  If it is an existing directory, the basename
            of src is appended to it (and, for references, a sub-directory
            derived from the S3 path is inserted as well).
        auto_unzip: If true and the destination extension is a key of
            ZIP_EXTENSIONS, the download is piped through the matching
            decompression command and the extension is dropped from dst.
        auto_untar: If true and the (possibly unzipped) destination ends in
            ".tar", the download is piped through tar and the ".tar" suffix
            is dropped from dst.
        allow_s3mi: If true, try s3mi first (after acquiring S3MI_SEM) and
            fall back to "aws s3 cp" when s3mi cannot be used or fails.
        okay_if_missing: If true, a failed s3mi download is not retried with
            the aws cli, the aws cli runs with --quiet, and a failure is
            logged as a probably-missing file instead of a fetch failure.
        is_reference: If true, "dst" must be an existing directory and the
            global config["REF_DIR"] must already be initialized.
        touch_only: If true, never download; only touch an existing
            destination (see below).
        mutex: Lock guarding the shared "locks" dict.  The default is
            deliberately created once and shared by all calls.
        locks: Per-destination locks keyed by absolute path.  The default is
            a deliberately shared mutable dict; callers normally omit it.

    Returns:
        The local path of the file or folder on success, otherwise None.

    If src does not exist or there is a failure fetching it, the function returns None,
    without raising an exception.  If the download is successful, it returns the path
    to the downloaded file or folder.  If the download already exists, it is touched
    to update its timestamp.

    When touch_only=True, if the destination does not already exist, the function
    simply returns None (as if the download failed).  If the destination does exist,
    it is touched as usual.  This is useful in implementing an LRU cache policy.

    An exception is raised only if there is a coding error or equivalent problem,
    not if src simply doesn't exist.
    """
    # FIXME: this is a compatibility hack so we can replace this function
    #   We are removing ad-hoc s3 downloads from within steps and converting
    #   additional_files to wdl inputs. These files will be transparently
    #   downloaded by miniwdl. miniwdl will also handle the caching that
    #   is currently done here. This hack bypasses the s3 download if the
    #   source is already a local file, and returns the source (which is
    #   a local file path). This way, when we change the additional_files
    #   to inputs we can provide the local file path to the step instead
    #   of the s3 path and seamlessly transition without a coordinated
    #   change between idseq-dag and the idseq monorepo.
    if not src.startswith("s3://"):
        log.write(
            f"fetch_from_s3 is skipping download because source: {src} does not start with s3://"
        )
        if not os.path.isfile(src):
            return None
        if auto_untar and src.endswith(".tar"):
            # Local archives are extracted next to themselves: foo.tar -> foo/
            dst = src[:-4]
            if not os.path.isdir(dst):
                # Extract into a staging directory first so that dst only ever
                # appears once the extraction has completed.
                command.make_dirs(dst + ".untarring")
                script = 'tar xvf "${src}" -C "${tmp_destdir}"'
                named_args = {"src": src, "tmp_destdir": dst + ".untarring"}
                command.execute(
                    command_patterns.ShellScriptCommand(script=script,
                                                        named_args=named_args))
                # Relies on the archive containing a top-level directory named
                # after the archive itself.
                command.rename(dst + ".untarring/" + os.path.basename(dst),
                               dst)
            return dst
        return src

    # Do not be misled by the multiprocessing.RLock() above -- that just means it won't deadlock
    # if called from multiple processes but does not mean the behavior will be correct.  It will
    # be incorrect, because the locks dict (containing per-file locks) cannot be shared across
    # processes, the way it can be shared across threads.

    if is_reference:
        assert config[
            "REF_DIR"], "The is_reference code path becomes available only after initializing global config['REF_DIR']"

    if os.path.exists(dst) and os.path.isdir(dst):
        dirname, basename = os.path.split(src)
        if is_reference or os.path.abspath(dst).startswith(config["REF_DIR"]):
            # Downloads to the reference dir are persisted from job to job, so we must include
            # version information from the full s3 path.
            #
            # The final destination for s3://path/to/source.db will look like /mnt/ref/s3__path__to/source.db
            # The final destination for s3://path/to/myarchive.tar will look like /mnt/ref/s3__path__to/myarchive/...
            #
            # We considered some other alternatives, for example /mnt/ref/s3__path__to__source.db, but unfortunately,
            # some tools incorporate the base name of their database input into the output filenames, so any approach
            # that changes the basename causes problems downstream.  An example such tool is srst2.
            is_reference = True
            if dirname.startswith("s3://"):
                dirname = dirname.replace("s3://", "s3__", 1)
            # If dirname contains slashes, it has to be flattened to single level.
            dirname = dirname.replace("/", "__")
            dst = os.path.join(dst, dirname, basename)
        else:
            dst = os.path.join(dst, basename)
    else:
        assert not is_reference, f"When fetching references, dst must be an existing directory: {dst}"

    unzip = ""
    if auto_unzip:
        file_without_ext, ext = os.path.splitext(dst)
        if ext in ZIP_EXTENSIONS:
            unzip = " | " + ZIP_EXTENSIONS[
                ext]  # this command will be used to decompress stdin to stdout
            dst = file_without_ext  # remove file extension from dst
    untar = auto_untar and dst.lower().endswith(".tar")
    if untar:
        dst = dst[:-4]  # Remove .tar

    # Downloads are staged under tmp_destdir.  Only after a download completes successfully it is moved to dst.
    destdir = os.path.dirname(dst)
    tmp_destdir = os.path.join(destdir, "tmp_downloads")
    tmp_dst = os.path.join(tmp_destdir, os.path.basename(dst))

    # One lock per destination, so that concurrent requests for the same file
    # serialize while requests for different files proceed in parallel.
    abspath = os.path.abspath(dst)
    with mutex:
        if abspath not in locks:
            locks[abspath] = TraceLock(f"fetch_from_s3: {abspath}",
                                       multiprocessing.RLock())
        destination_lock = locks[abspath]

    # shouldn't happen and makes it impossible to ensure that any dst that exists is complete and correct.
    assert tmp_dst != dst, f"Problematic use of fetch_from_s3 with tmp_dst==dst=='{dst}'"

    with destination_lock:
        # This check is a bit imperfect when untarring... unless you follow the discipline that
        # all contents of file foo.tar are under directory foo/... (which we do follow in IDseq)
        if os.path.exists(dst):
            command.touch(dst)
            return dst

        if touch_only:
            return None

        for (kind, ddir) in [("destination", destdir),
                             ("temporary download", tmp_destdir)]:
            try:
                if ddir:
                    command.make_dirs(ddir)
            except OSError as e:
                # It's okay if the parent directory already exists, but all other
                # errors fail the download.
                if e.errno != errno.EEXIST:
                    log.write(f"Error in creating {kind} directory.")
                    return None

        with IOSTREAM:
            try:
                if allow_s3mi:
                    # From here on allow_s3mi also means "this call holds S3MI_SEM";
                    # the finally clause below relies on that to release it.
                    wait_start = time.time()
                    allow_s3mi = S3MI_SEM.acquire(timeout=MAX_S3MI_WAIT)
                    wait_duration = time.time() - wait_start
                    if not allow_s3mi:
                        log.write(
                            f"Failed to acquire S3MI semaphore after waiting {wait_duration} seconds for {src}."
                        )
                    elif wait_duration >= 5:
                        log.write(
                            f"Waited {wait_duration} seconds to acquire S3MI semaphore for {src}."
                        )

                # Tail of the shell pipeline: optional decompression followed by
                # either tar extraction or a plain redirect into the staging path.
                if untar:
                    write_dst = r''' | tar xvf - -C "${tmp_destdir}";'''
                    named_args = {'tmp_destdir': tmp_destdir}
                else:
                    write_dst = r''' > "${tmp_dst}";'''
                    named_args = {'tmp_dst': tmp_dst}
                command_params = f"{unzip} {write_dst}"

                named_args.update({'src': src})

                try_cli = not allow_s3mi
                if allow_s3mi:
                    if os.path.exists(tmp_dst):
                        command.remove_rf(tmp_dst)
                    try:
                        command.execute(
                            command_patterns.ShellScriptCommand(
                                script=
                                r'set -o pipefail; s3mi cat --quiet "${src}" '
                                + command_params,
                                named_args=named_args))
                    except subprocess.CalledProcessError:
                        # Retry with the aws cli unless a missing file is acceptable.
                        try_cli = not okay_if_missing
                        # Release now and clear the flag so that the finally
                        # clause does not release the semaphore a second time.
                        allow_s3mi = False
                        S3MI_SEM.release()
                        if try_cli:
                            log.write(
                                "Failed to download with s3mi. Trying with aws s3 cp..."
                            )
                        else:
                            raise
                if try_cli:
                    # Discard anything a failed s3mi attempt may have left behind.
                    if os.path.exists(tmp_dst):
                        command.remove_rf(tmp_dst)
                    if okay_if_missing:
                        script = r'set -o pipefail; aws s3 cp --quiet "${src}" - ' + command_params
                    else:
                        script = r'set -o pipefail; aws s3 cp --only-show-errors "${src}" - ' + command_params
                    command.execute(
                        command_patterns.ShellScriptCommand(
                            script=script,
                            named_args=named_args,
                            env=dict(os.environ, **refreshed_credentials())))
                # Move staged download into final location.  Leave this last, so it only happens if no exception has occurred.
                # By this point we have already asserted that tmp_dst != dst.
                command.rename(tmp_dst, dst)
                return dst
            except BaseException as e:  # Deliberately super broad to make doubly certain that dst will be removed if there has been any exception
                if os.path.exists(dst):
                    command.remove_rf(dst)
                if not isinstance(e, subprocess.CalledProcessError):
                    # Coding error of some sort.  Best not hide it.
                    raise
                if okay_if_missing:
                    # We presume.
                    log.write("File most likely does not exist in S3.")
                else:
                    log.write("Failed to fetch file from S3.")
                return None
            finally:
                if allow_s3mi:
                    S3MI_SEM.release()
                if os.path.exists(
                        tmp_dst
                ):  # by this point we have asserted that tmp_dst != dst (and that assert may have failed, but so be it)
                    command.remove_rf(tmp_dst)
 def create_local_dirs(self):
     ''' Make sure the parent directory of every local output file exists.

     Each distinct directory is created once.  Output files given as bare
     file names have an empty dirname; there is nothing to create for them,
     so they are skipped instead of being passed to command.make_dirs.
     '''
     parent_dirs = (os.path.dirname(f) for f in self.output_files_local())
     # dict.fromkeys drops duplicates while keeping first-seen order.
     for parent_dir in dict.fromkeys(parent_dirs):
         if parent_dir:
             command.make_dirs(parent_dir)
    def run(self):
        """Build a phylogenetic tree with kSNP3 from the step's input fasta files.

        The input fastas are symlinked into a dedicated directory together with
        the Genbank and accession reference sequences fetched by
        get_genbank_genomes / get_accession_sequences.  kSNP3 is then run on
        that directory.

        Outputs:
            output_files[0]: the parsimony tree (kSNP3's tree.parsimony.tre).
            output_files[1]: JSON metadata keyed by tree node.
            self.additional_output_files_hidden: every other regular file
                left in the kSNP3 output directory, plus the renamed VCF.
        """
        output_files = self.output_files_local()
        local_taxon_fasta_files = [f for input_item in self.input_files_local for f in input_item]
        taxid = self.additional_attributes["taxid"]
        reference_taxids = self.additional_attributes.get("reference_taxids", [taxid])  # Note: will only produce a result if species-level or below
        # During phylo tree creation, if the taxon is in an unknown superkingdom then the k selected
        # from k_config is supposed to be from the key None.  An empty name counts as unknown.
        superkingdom_name = self.additional_attributes.get("superkingdom_name")
        if superkingdom_name == '':
            superkingdom_name = None

        # kSNP3 has a command (MakeKSNP3infile) for making a ksnp3-compatible input file from a directory of fasta files.
        # Before we can use the command, we symlink all fasta files to a dedicated directory.
        # The command makes certain unreasonable assumptions:
        # - current directory is parent directory of the fasta file directory
        # - file names do not have dots except before extension (also no spaces)
        # - file names cannot be too long (for kSNP3 tree building).
        input_dir_for_ksnp3 = f"{self.output_dir_local}/inputs_for_ksnp3"
        command.make_dirs(input_dir_for_ksnp3)
        for local_file in local_taxon_fasta_files:
            command.execute(
                command_patterns.SingleCommand(
                    cmd="ln",
                    args=[
                        "-s",
                        local_file,
                        os.path.join(input_dir_for_ksnp3, os.path.basename(local_file))
                    ]
                )
            )

        # Retrieve Genbank references (full assembled genomes).
        genbank_fastas = self.get_genbank_genomes(reference_taxids, input_dir_for_ksnp3, superkingdom_name, 0)

        # Retrieve NCBI NT references for the accessions in the alignment viz files.
        # These are the accessions (not necessarily full genomes) that were actually matched
        # by the sample's reads during GSNAP alignment.
        accession_fastas = self.get_accession_sequences(input_dir_for_ksnp3, taxid, 10)

        # Retrieve NCBI metadata for the accessions
        metadata_by_node = self.get_metadata_by_tree_node({**accession_fastas, **genbank_fastas})
        metadata_output = output_files[1]
        with open(metadata_output, 'w') as f:
            json.dump(metadata_by_node, f)

        # Run MakeKSNP3infile.
        ksnp3_input_file = f"{self.output_dir_local}/inputs.txt"
        command.execute(
            command_patterns.SingleCommand(
                cd=self.output_dir_local,
                cmd='MakeKSNP3infile',
                args=[
                    os.path.basename(input_dir_for_ksnp3),
                    ksnp3_input_file,
                    "A"
                ]
            )
        )

        # Specify the names of finished reference genomes.
        # Used for annotation & variant-calling.
        annotated_genome_input = f"{self.output_dir_local}/annotated_genomes"
        reference_fasta_files = list(genbank_fastas.values()) + list(accession_fastas.values())
        if reference_fasta_files:
            # One "-e <path>" pair per reference, flattened into a single argument list.
            grep_options = (("-e", path) for path in reference_fasta_files)
            grep_options = list(itertools.chain.from_iterable(grep_options))  # flatmap
            command.execute(
                command_patterns.ShellScriptCommand(
                    script=r'''grep "${grep_options[@]}" "${ksnp3_input_file}" | cut -f2 > "${annotated_genome_input}";''',
                    named_args={
                        'ksnp3_input_file': ksnp3_input_file,
                        'annotated_genome_input': annotated_genome_input,
                        'grep_options': grep_options
                    }
                )
            )

        # Now build ksnp3 command:
        k_config = {
            # All entries to be revisited and benchmarked.
            # Values for viruses and bacteria come from kSNP3 recommendations (13-15 / 19-21).
            "Viruses": 13,
            "Bacteria": 19,
            "Eukaryota": 19,
            None: 13
        }
        # Any superkingdom without its own entry is treated as unknown (key None)
        # instead of aborting the step with a KeyError.
        k = k_config.get(superkingdom_name, k_config[None])
        ksnp_output_dir = f"{self.output_dir_local}/ksnp3_outputs"
        command.make_dirs(ksnp_output_dir)
        ksnp_cd = os.path.dirname(ksnp_output_dir)
        ksnp_cmd = "kSNP3"
        ksnap_args = [
            "-in",
            "inputs.txt",
            "-outdir",
            os.path.basename(ksnp_output_dir),
            "-k",
            k
        ]

        # Annotate SNPs using reference genomes, and produce a VCF file with respect
        # to the first reference genome in annotated_genome_input:
        # TODO: fix gi vs accession problem
        annotate_with_references = os.path.isfile(annotated_genome_input)
        if annotate_with_references:
            ksnap_args.extend([
                "-annotate",
                os.path.basename(annotated_genome_input),
                "-vcf"
            ])

        # Run ksnp3 command:
        command.execute(
            command_patterns.SingleCommand(
                cd=ksnp_cd,
                cmd=ksnp_cmd,
                args=ksnap_args
            )
        )

        # The annotated SNP file can only exist once kSNP3 has run, so it is
        # registered here rather than while the arguments are being assembled.
        if annotate_with_references:
            snps_all_annotated = f"{ksnp_output_dir}/SNPs_all_annotated"
            if os.path.isfile(snps_all_annotated):
                self.additional_output_files_hidden.append(snps_all_annotated)

        # Postprocess output names in preparation for upload:
        command.move_file(os.path.join(ksnp_output_dir, "tree.parsimony.tre"), output_files[0])
        ksnp_vcf_file = glob.glob(f"{ksnp_output_dir}/*.vcf")
        if ksnp_vcf_file:
            target_vcf_file = f"{ksnp_output_dir}/variants_reference1.vcf"
            self.name_samples_vcf(ksnp_vcf_file[0], target_vcf_file)
            self.additional_output_files_hidden.append(target_vcf_file)

        # Upload all kSNP3 output files for potential future reference
        supplementary_files = [f for f in glob.glob(f"{ksnp_output_dir}/*")
                               if os.path.isfile(f) and
                               f not in self.additional_output_files_hidden]
        self.additional_output_files_hidden.extend(supplementary_files)