Example #1
    def test_move_file_to_folder(self):
        '''WHEN move_file is invoked with a file path and a folder name, THEN it moves the file to the destination folder'''
        dest_file_path = os.path.join(TMP_DEST_FOLDER, TMP_FILE_NAME)

        command.move_file(TMP_SOURCE_FILE_PATH, TMP_DEST_FOLDER)

        self.assertTrue(os.path.exists(dest_file_path),
                        f"file {dest_file_path} doesn't exist")
        self.assertFalse(os.path.exists(TMP_SOURCE_FILE_PATH),
                         f"file {TMP_SOURCE_FILE_PATH} shouldn't exist")
Example #2
    def test_move_file_with_different_name(self):
        '''WHEN move_file is invoked with a file path and a full path with a different file name, THEN it moves the file using the new file name'''
        new_file_name = TMP_FILE_NAME + ".new"
        dest_file_path = os.path.join(TMP_DEST_FOLDER, new_file_name)

        command.move_file(TMP_SOURCE_FILE_PATH, dest_file_path)

        self.assertTrue(os.path.exists(dest_file_path),
                        f"file {dest_file_path} doesn't exist")
        self.assertFalse(os.path.exists(TMP_SOURCE_FILE_PATH),
                         f"file {TMP_SOURCE_FILE_PATH} shouldn't exist")
Example #3
    def trim_adapters_in_place(local_file):
        local_file_trimmed = os.path.join(
            os.path.dirname(local_file),
            "trimmed_" + os.path.basename(local_file))
        command.execute(
            command_patterns.SingleCommand(cmd='cutadapt',
                                           args=[
                                               "-a", "AGATCGGAAGAGCACACGTCT",
                                               "-o", local_file_trimmed,
                                               local_file
                                           ]))
        command.move_file(local_file_trimmed, local_file)
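For readers without the command and command_patterns helpers, a rough stand-alone equivalent of this adapter-trimming step could use subprocess directly. This is only a sketch assuming cutadapt is on PATH; the real command.execute wrapper adds logging and error reporting that is not reproduced here.

import os
import shutil
import subprocess

def trim_adapters_in_place_standalone(local_file):
    local_file_trimmed = os.path.join(
        os.path.dirname(local_file),
        "trimmed_" + os.path.basename(local_file))
    # Trim the 3' adapter and write the result next to the input file.
    subprocess.run(
        ["cutadapt",
         "-a", "AGATCGGAAGAGCACACGTCT",
         "-o", local_file_trimmed,
         local_file],
        check=True)
    # Replace the original file with the trimmed version, as
    # command.move_file does in the example above.
    shutil.move(local_file_trimmed, local_file)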
Example #4
    def assemble(
            input_fasta,
            input_fasta2,
            bowtie_fasta,  # fasta file for running bowtie against contigs
            duplicate_cluster_sizes_path,
            assembled_contig,
            assembled_scaffold,
            bowtie_sam,
            contig_stats,
            read2contig,
            memory=100):
        basedir = os.path.dirname(assembled_contig)
        assembled_dir = os.path.join(basedir, 'spades')
        command.make_dirs(assembled_dir)
        assembled_contig_tmp = os.path.join(assembled_dir, 'contigs.fasta')
        assembled_scaffold_tmp = os.path.join(assembled_dir, 'scaffolds.fasta')

        try:
            if input_fasta2:
                command.execute(
                    command_patterns.SingleCommand(cmd="spades.py",
                                                   args=[
                                                       "-1", input_fasta, "-2",
                                                       input_fasta2, "-o",
                                                       assembled_dir, "-m",
                                                       memory, "-t", 32,
                                                       "--only-assembler"
                                                   ]))
            else:
                command.execute(
                    command_patterns.SingleCommand(cmd="spades.py",
                                                   args=[
                                                       "-s", input_fasta, "-o",
                                                       assembled_dir, "-m",
                                                       memory, "-t", 32,
                                                       "--only-assembler"
                                                   ]))
            command.move_file(assembled_contig_tmp, assembled_contig)
            command.move_file(assembled_scaffold_tmp, assembled_scaffold)

            PipelineStepRunAssembly.generate_read_to_contig_mapping(
                assembled_contig, bowtie_fasta, read2contig,
                duplicate_cluster_sizes_path, bowtie_sam, contig_stats)
        except Exception:
            # Assembly failed; create dummy output files.
            command.write_text_to_file(';ASSEMBLY FAILED', assembled_contig)
            command.write_text_to_file(';ASSEMBLY FAILED', assembled_scaffold)
            command.write_text_to_file('@NO INFO', bowtie_sam)
            command.write_text_to_file('{}', contig_stats)
            traceback.print_exc()
        command.remove_rf(assembled_dir)
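Examples #3 and #4 share a pattern: run a tool into a scratch location, then use command.move_file to publish the result to its final path only after the tool has finished, and clean the scratch directory up afterwards. A generic standard-library sketch of that pattern (the function name and the "-o" flag are hypothetical, not part of the original module):

import os
import shutil
import subprocess
import tempfile

def run_tool_and_publish(cmd_args, final_output_path):
    # Run the tool into a scratch directory, then move the result into
    # place so the final path never holds a partially written file.
    scratch_dir = tempfile.mkdtemp(prefix="scratch_")
    tmp_output = os.path.join(scratch_dir, os.path.basename(final_output_path))
    try:
        # "-o" is a placeholder for whatever output flag the tool takes.
        subprocess.run(cmd_args + ["-o", tmp_output], check=True)
        shutil.move(tmp_output, final_output_path)
    finally:
        shutil.rmtree(scratch_dir, ignore_errors=True)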
Example #5
    def run(self):
        """Run STAR to filter out host reads."""
        # Setup
        if self.sequence_input_files is not None and self.validated_input_counts_file is not None:
            validated_input_counts_file = self.validated_input_counts_file
            input_files = self.sequence_input_files
        else:
            validated_input_counts_file = self.input_files_local[0][0]
            input_files = self.input_files_local[0][1:3]

        num_inputs = len(input_files)
        scratch_dir = os.path.join(self.output_dir_local, "scratch_star")

        output_files_local = self.output_files_local()
        output_gene_file = self.additional_attributes.get("output_gene_file")
        output_log_file = self.additional_attributes.get("output_log_file")

        genome_dir = s3.fetch_reference(
            self.additional_files["star_genome"],
            self.ref_dir_local,
            allow_s3mi=True,
            auto_untar=True)

        # Check parts file for the number of partitioned indexes
        parts_file = os.path.join(genome_dir, "parts.txt")
        assert os.path.isfile(parts_file)
        with open(parts_file, 'rb') as parts_f:
            num_parts = int(parts_f.read())

        # Don't compute insert size metrics if the STAR index has more than one part
        #   Logic for combining BAM output from STAR or insert size metrics not implemented
        if self.collect_insert_size_metrics_for and num_parts != 1:
            log.write("Insert size metrics were expected to be collected for sample but were not because the STAR index has more than one part")
            self.collect_insert_size_metrics_for = None

        # Run STAR on each partition and save the unmapped read info
        unmapped = input_files

        with open(validated_input_counts_file) as validated_input_counts_f:
            validated_input_counts = json.load(validated_input_counts_f)

        use_starlong = validated_input_counts[vc.BUCKET_LONG] > 1 or \
            validated_input_counts[vc.BUCKET_TOO_LONG] > 1

        for part_idx in range(num_parts):
            tmp = f"{scratch_dir}/star-part-{part_idx}"
            genome_part = f"{genome_dir}/part-{part_idx}"
            count_genes = part_idx == 0
            self.run_star_part(tmp, genome_part, unmapped, count_genes, use_starlong)

            unmapped, too_discrepant = PipelineStepRunStar.sync_pairs(
                PipelineStepRunStar.unmapped_files_in(tmp, num_inputs))

            if too_discrepant:
                raise BrokenReadPairError("Broken pairs")

            # Run part 0 in gene-counting mode:
            # (a) ERCCs are doped into part 0 and we want their counts.
            # (b) If there is only 1 part (e.g. human), the host gene counts also
            # make sense.
            if part_idx == 0:
                gene_count_file = os.path.join(tmp, "ReadsPerGene.out.tab")
                if os.path.isfile(gene_count_file) and output_gene_file:
                    moved = os.path.join(self.output_dir_local,
                                         output_gene_file)
                    command.move_file(gene_count_file, moved)
                    self.additional_output_files_hidden.append(moved)

                log_file = os.path.join(tmp, "Log.final.out")
                if os.path.isfile(log_file) and output_log_file:
                    moved = os.path.join(self.output_dir_local, output_log_file)
                    command.move_file(log_file, moved)

                # STAR names the output BAM file Aligned.out.bam without TranscriptomeSAM
                #   and Aligned.toTranscriptome.out.bam with TranscriptomeSAM; this doesn't
                #   appear to be configurable.
                is_dna = self.collect_insert_size_metrics_for == "dna"
                bam_filename = "Aligned.out.bam" if is_dna else "Aligned.toTranscriptome.out.bam"
                if self.collect_insert_size_metrics_for:
                    bam_path = os.path.join(tmp, bam_filename)

                    # If this file wasn't generated but self.collect_insert_size_metrics_for
                    #   has a value, something unexpected has gone wrong
                    assert os.path.isfile(bam_path), \
                        f"Expected STAR to generate {bam_filename} but it was not found"
                    try:
                        self.collect_insert_size_metrics(tmp, bam_path, self.output_metrics_file, self.output_histogram_file)
                        if os.path.exists(self.output_metrics_file):
                            self.additional_output_files_visible.append(self.output_metrics_file)
                        else:
                            message = "expected picard to generate a metrics file but none was found"
                            log.write(message=message, warning=True)
                        if os.path.exists(self.output_histogram_file):
                            self.additional_output_files_visible.append(self.output_histogram_file)
                        else:
                            message = "expected picard to generate a histogram file but none was found"
                            log.write(message=message, warning=True)
                    except Exception as e:
                        log.write(message=f"encountered error while running picard: {type(e).__name__}: {e}", warning=True)

        # Sort unmapped files for deterministic output
        for unmapped_file in unmapped:
            sort_fastx_by_entry_id(unmapped_file)
        # Cleanup
        for src, dst in zip(unmapped, output_files_local):
            command.move_file(src, dst)    # Move out of scratch dir
        command.remove_rf(f"{scratch_dir}/*")
Example #6
    def run(self):
        output_files = self.output_files_local()
        local_taxon_fasta_files = [f for input_item in self.input_files_local for f in input_item]
        taxid = self.additional_attributes["taxid"]
        reference_taxids = self.additional_attributes.get("reference_taxids", [taxid])  # Note: will only produce a result if species-level or below
        # During phylo tree creation, if the taxon is in an unknown superkingdom then the k selected from k_config is supposed to be from the key None.
        superkingdom_name = self.additional_attributes.get("superkingdom_name") if self.additional_attributes.get("superkingdom_name") != '' else None

        # kSNP3 has a command (MakeKSNP3infile) for making a ksnp3-compatible input file from a directory of fasta files.
        # Before we can use the command, we symlink all fasta files to a dedicated directory.
        # The command makes certain unreasonable assumptions:
        # - current directory is parent directory of the fasta file directory
        # - file names do not have dots except before extension (also no spaces)
        # - file names cannot be too long (for kSNP3 tree building).
        input_dir_for_ksnp3 = f"{self.output_dir_local}/inputs_for_ksnp3"
        command.make_dirs(input_dir_for_ksnp3)
        for local_file in local_taxon_fasta_files:
            command.execute(
                command_patterns.SingleCommand(
                    cmd="ln",
                    args=[
                        "-s",
                        local_file,
                        os.path.join(input_dir_for_ksnp3, os.path.basename(local_file))
                    ]
                )
            )

        # Retrieve Genbank references (full assembled genomes).
        genbank_fastas = self.get_genbank_genomes(reference_taxids, input_dir_for_ksnp3, superkingdom_name, 0)

        # Retrieve NCBI NT references for the accessions in the alignment viz files.
        # These are the accessions (not necessarily full genomes) that were actually matched
        # by the sample's reads during GSNAP alignment.
        accession_fastas = self.get_accession_sequences(input_dir_for_ksnp3, taxid, 10)

        # Retrieve NCBI metadata for the accessions
        metadata_by_node = self.get_metadata_by_tree_node({**accession_fastas, **genbank_fastas})
        metadata_output = output_files[1]
        with open(metadata_output, 'w') as f:
            json.dump(metadata_by_node, f)

        # Run MakeKSNP3infile.
        ksnp3_input_file = f"{self.output_dir_local}/inputs.txt"
        command.execute(
            command_patterns.SingleCommand(
                cd=self.output_dir_local,
                cmd='MakeKSNP3infile',
                args=[
                    os.path.basename(input_dir_for_ksnp3),
                    ksnp3_input_file,
                    "A"
                ]
            )
        )

        # Specify the names of finished reference genomes.
        # Used for annotation & variant-calling.
        annotated_genome_input = f"{self.output_dir_local}/annotated_genomes"
        reference_fasta_files = list(genbank_fastas.values()) + list(accession_fastas.values())
        if reference_fasta_files:
            grep_options = (("-e", path) for path in reference_fasta_files)
            grep_options = list(itertools.chain.from_iterable(grep_options))  # flatmap
            command.execute(
                command_patterns.ShellScriptCommand(
                    script=r'''grep "${grep_options[@]}" "${ksnp3_input_file}" | cut -f2 > "${annotated_genome_input}";''',
                    named_args={
                        'ksnp3_input_file': ksnp3_input_file,
                        'annotated_genome_input': annotated_genome_input,
                        'grep_options': grep_options
                    }
                )
            )

        # Now build ksnp3 command:
        k_config = {
            # All entries to be revisited and benchmarked.
            # Values for viruses and bacteria come from kSNP3 recommendations (13-15 / 19-21).
            "Viruses": 13,
            "Bacteria": 19,
            "Eukaryota": 19,
            None: 13
        }
        k = k_config[superkingdom_name]
        ksnp_output_dir = f"{self.output_dir_local}/ksnp3_outputs"
        command.make_dirs(ksnp_output_dir)
        ksnp_cd = os.path.dirname(ksnp_output_dir)
        ksnp_cmd = "kSNP3"
        ksnap_args = [
            "-in",
            "inputs.txt",
            "-outdir",
            os.path.basename(ksnp_output_dir),
            "-k",
            k
        ]

        # Annotate SNPs using reference genomes:
        # TODO: fix gi vs accession problem
        if os.path.isfile(annotated_genome_input):
            ksnap_args.extend([
                "-annotate",
                os.path.basename(annotated_genome_input)
            ])
            snps_all_annotated = f"{ksnp_output_dir}/SNPs_all_annotated"
            if os.path.isfile(snps_all_annotated):
                self.additional_output_files_hidden.append(snps_all_annotated)

        # Produce VCF file with respect to first reference genome in annotated_genome_input:
        if os.path.isfile(annotated_genome_input):
            ksnap_args.append("-vcf")

        # Run ksnp3 command:
        command.execute(
            command_patterns.SingleCommand(
                cd=ksnp_cd,
                cmd=ksnp_cmd,
                args=ksnap_args
            )
        )

        # Postprocess output names in preparation for upload:
        command.move_file(os.path.join(ksnp_output_dir, "tree.parsimony.tre"), output_files[0])
        ksnp_vcf_file = glob.glob(f"{ksnp_output_dir}/*.vcf")
        if ksnp_vcf_file:
            target_vcf_file = f"{ksnp_output_dir}/variants_reference1.vcf"
            self.name_samples_vcf(ksnp_vcf_file[0], target_vcf_file)
            self.additional_output_files_hidden.append(target_vcf_file)

        # Upload all kSNP3 output files for potential future reference
        supplementary_files = [f for f in glob.glob(f"{ksnp_output_dir}/*")
                               if os.path.isfile(f) and
                               f not in self.additional_output_files_hidden]
        self.additional_output_files_hidden.extend(supplementary_files)
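The ln -s loop near the top of this example can also be written without shelling out. A small sketch using os.symlink, assuming nothing else depends on the external ln command:

import os

def symlink_into_dir(local_files, input_dir):
    # Mirror the "ln -s" loop above: link each fasta into the dedicated
    # kSNP3 input directory under its original basename.
    for local_file in local_files:
        link_path = os.path.join(input_dir, os.path.basename(local_file))
        os.symlink(local_file, link_path)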
Example #7
    def get_accession_sequences(self, dest_dir, taxid, n=10):
        '''
        Retrieve NCBI NT references for the most-matched accession in each hitsummary2 file, up to a maximum of n references.
        Write each reference to a separate fasta file.
        '''
        if n == 0:
            return {}

        # Retrieve files
        nt_db = self.additional_attributes["nt_db"]
        nt_loc_db = s3.fetch_reference(
            self.additional_files["nt_loc_db"],
            self.ref_dir_local,
            allow_s3mi=True)

        # Choose accessions to process.
        s3_hitsummary2_files = self.additional_attributes["hitsummary2_files"].values()
        accessions = defaultdict(lambda: 0)
        # TODO: Address issue where accessions in nr can be chosen in the following code.
        # These accessions will not be found in nt_loc and will be subsequently omitted.
        for file_list in s3_hitsummary2_files:
            tally = defaultdict(lambda: 0)
            for s3_file in file_list:
                local_basename = s3_file.replace("/", "-").replace(":", "-")
                local_file = s3.fetch_from_s3(
                    s3_file,
                    os.path.join(self.output_dir_local, local_basename))
                if local_file is None:
                    continue
                with open(local_file, 'r') as f:
                    for line in f:
                        acc, species_taxid, genus_taxid, family_taxid = line.rstrip().split("\t")[3:7]
                        if any(int(hit_taxid) == taxid for hit_taxid in [species_taxid, genus_taxid, family_taxid]):
                            tally[acc] += 1
            if tally:
                best_acc, max_count = max(tally.items(), key=lambda x: x[1])
                accessions[best_acc] += max_count
        if len(accessions) > n:
            accessions = dict(sorted(accessions.items(), key=lambda x: x[1], reverse=True)[:n])
        accessions = set(accessions.keys())

        # Make map of accession to sequence file
        accession2info = dict((acc, {}) for acc in accessions)
        with open_file_db_by_extension(nt_loc_db) as nt_loc_dict:
            PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
                accession2info, nt_loc_dict, nt_db)

        # Put 1 fasta file per accession into the destination directory
        accession_fastas = {}
        for acc, info in accession2info.items():
            if 'seq_file' not in info or info['seq_file'] is None:
                log.write(f"WARNING: No sequence retrieved for {acc}")
                continue
            clean_accession = self.clean_name_for_ksnp3(acc)
            local_fasta = f"{dest_dir}/NCBI_NT_accession_{clean_accession}.fasta"
            command.execute(
                command_patterns.SingleCommand(
                    cmd="ln",
                    args=[
                        "-s",
                        info['seq_file'],
                        local_fasta
                    ]
                )
            )
            command.execute_with_output(
                command_patterns.ShellScriptCommand(
                    script=r'''echo ">${acc}" | cat - "${local_fasta}" > temp_file;''',
                    named_args={
                        'acc': acc,
                        'local_fasta': local_fasta
                    }
                )
            )
            command.move_file('temp_file', local_fasta)

            accession_fastas[acc] = local_fasta

        # Return kept accessions and paths of their fasta files
        return accession_fastas
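The echo/cat/move_file sequence above prepends a ">accession" header line to each fasta. The same effect can be had in pure Python; a sketch with a hypothetical helper name, not part of the original module:

import os

def prepend_fasta_header(acc, local_fasta):
    # Write ">acc" followed by the original contents to a temp file,
    # then swap it over the original, mirroring the shell pipeline above.
    temp_file = local_fasta + ".tmp"
    with open(temp_file, "w") as out, open(local_fasta, "r") as src:
        out.write(f">{acc}\n")
        for line in src:
            out.write(line)
    os.replace(temp_file, local_fasta)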
Example #8
    def generate_lzw_filtered(self, fasta_files, output_files, cutoff_scores,
                              threshold_readlength):
        assert len(fasta_files) == len(output_files)

        cutoff_scores.sort(
            reverse=True)  # Make sure cutoff is from high to low

        # This is the bulk of the computation.  Everything else below is just binning by cutoff score.
        coalesced_score_file = PipelineStepRunLZW.lzw_compute(
            fasta_files, threshold_readlength, cutoff_scores[0])

        readcount_list = []  # one item per cutoff
        outstreams_list = []  # one item per cutoff
        outfiles_list = []  # one item per cutoff

        for cutoff in cutoff_scores:
            readcount_list.append(0)
            outstreams = []
            outfiles = []
            for f in output_files:
                outfile_name = "%s-%f" % (f, cutoff)
                outfiles.append(outfile_name)
                outstreams.append(open(outfile_name, 'w'))

            outstreams_list.append(outstreams)
            outfiles_list.append(outfiles)

        outstreams_for_cutoff = list(zip(outstreams_list, cutoff_scores))

        def score_iterator(score_file: str) -> Iterator[float]:
            with open(score_file, "r") as sf:
                for line in sf:
                    yield float(line)

        total_reads = 0
        for reads, score in zip(fasta.synchronized_iterator(fasta_files),
                                score_iterator(coalesced_score_file)):
            total_reads += 1
            for i, (outstreams, cutoff) in enumerate(outstreams_for_cutoff):
                if score > cutoff:
                    readcount_list[i] += 1
                    for ostr, r in zip(outstreams, reads):
                        ostr.write(r.header + "\n")
                        ostr.write(r.sequence + "\n")
                    break
        os.remove(coalesced_score_file)

        # closing all the streams
        for outstreams in outstreams_list:
            for ostr in outstreams:
                ostr.close()

        # get the right output file and metrics
        kept_count = 0
        filtered = total_reads
        cutoff_frac = None
        for cutoff_frac, readcount, outfiles in zip(cutoff_scores,
                                                    readcount_list,
                                                    outfiles_list):
            if readcount > 0:
                # found the right bin
                kept_count = readcount
                filtered = total_reads - kept_count
                # move the output files over
                for outfile, output_file in zip(outfiles, output_files):
                    command.move_file(outfile, output_file)
                break

        if kept_count == 0:
            self.input_file_error = InputFileErrors.INSUFFICIENT_READS
            self.status = StepStatus.INVALID_INPUT
            return

        kept_ratio = float(kept_count) / float(total_reads)
        msg = "LZW filter: cutoff_frac: %f, total reads: %d, filtered reads: %d, " \
              "kept ratio: %f" % (cutoff_frac, total_reads, filtered, kept_ratio)
        log.write(msg)
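The heart of this example is binning each read by the highest cutoff score it clears: cutoff_scores are sorted high to low and the inner break stops at the first passing bin. A stripped-down sketch of just that binning logic, with made-up numbers:

def bin_by_cutoff(scores, cutoffs):
    # cutoffs must be sorted from high to low, as in the example above.
    counts = [0] * len(cutoffs)
    for score in scores:
        for i, cutoff in enumerate(cutoffs):
            if score > cutoff:
                counts[i] += 1
                break  # count each read only in its highest-passing bin
    return counts

# e.g. bin_by_cutoff([0.9, 0.6, 0.4], [0.75, 0.45]) -> [1, 1]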