Example #1
 def test_cd(self):
     '''WHEN using cd parameter, THEN it executes the command in the selected directory and resets it to previous dir before executing next command'''
     _shared_test_cd_parameter(
         test_context=self,
         pwd_command_pattern_with_cd=command_patterns.ShellScriptCommand(
             cd=TMP_FOLDER, script="pwd", args=[]),
         pwd_command_pattern_without_cd=command_patterns.ShellScriptCommand(
             script="pwd", args=[]))
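For reference, an illustrative sketch of the behavior being tested, using command.execute_with_output as seen in later examples (TMP_FOLDER is whatever temporary directory the test module defines):

# Sketch only: the cd parameter applies to the single command being executed.
out_with_cd = command.execute_with_output(
    command_patterns.ShellScriptCommand(cd=TMP_FOLDER, script="pwd", args=[]))
out_without_cd = command.execute_with_output(
    command_patterns.ShellScriptCommand(script="pwd", args=[]))
# out_with_cd.strip() should equal TMP_FOLDER, while out_without_cd reports the
# original working directory, because the cd does not persist across commands.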
Example #2
 def generate_read_to_contig_mapping(assembled_contig, fasta_file,
                                     read2contig,
                                     duplicate_cluster_sizes_path,
                                     output_bowtie_sam,
                                     output_contig_stats):
     ''' read -> contig mapping through bowtie2 alignment '''
     base_output_dir = os.path.dirname(fasta_file)
     # build bowtie index based on assembled_contig
     bowtie_index_path = os.path.join(base_output_dir, 'bowtie-contig')
     command.make_dirs(bowtie_index_path)
     command.execute(
         command_patterns.SingleCommand(
             cmd='bowtie2-build',
             args=[assembled_contig, bowtie_index_path]))
     command.execute(
         command_patterns.ShellScriptCommand(
             script=
             r'''bowtie2 -x "${bowtie_index_path}" -f -U "${fasta_file}" --very-sensitive -p 32 > "${output_bowtie_sam}";''',
             named_args={
                 'bowtie_index_path': bowtie_index_path,
                 'fasta_file': fasta_file,
                 'output_bowtie_sam': output_bowtie_sam
             }))
     contig_stats = PipelineStepRunAssembly.generate_info_from_sam(
         output_bowtie_sam, read2contig, duplicate_cluster_sizes_path)
     with open(output_contig_stats, 'w') as ocf:
         json.dump(contig_stats, ocf)
Example #3
    def generate_mapped_reads_tsv(self):
        """Use bedtools to generate a table of mapped reads for each genome in the ARG ANNOT database.
            If a new resistance gene db is used, the .bed file will need to be updated manually."""
        bed_file_path = fetch_reference(
            self.additional_files["resist_genome_bed"],
            self.ref_dir_local,
            allow_s3mi=False)
        sample_bam_file_path = self.output_files_local()[5]

        tmp_sort_dir = os.path.join(self.output_dir_local, "tmp_sort")
        command.make_dirs(tmp_sort_dir)

        # Convert the sorted.bam output from SRST2 to the bed format, then sort the bed file.
        # This allows us to use the "sorted" mode of bedtools coverage, which is memory-efficient.
        # Otherwise, large sorted.bam files will cause our machines to run out of RAM.
        #
        # Note that despite being called "sorted.bam", the bam is not sorted the way we need it to be.
        #
        # env LC_ALL=C ensures that the sort command uses the same sort order on all machines.
        #
        # The -T flag with tmp_sort_dir ensures that we make tmp files inside /mnt, which is where our huge AWS volumes are mounted.
        # By default, the sort command creates temp files in /tmp, which has very little disk space.
        command.execute(
            command_patterns.ShellScriptCommand(
                script='''
                    bedtools bamtobed -i "$1" |
                    env LC_ALL=C sort -T "$2" -k1,1 -k2,2n |
                    bedtools coverage -sorted -a "$3" -b stdin > "$4";''',
                args=[
                    sample_bam_file_path, tmp_sort_dir, bed_file_path,
                    os.path.join(self.output_dir_local, MATCHED_READS_FILE)
                ]))

        command.remove_rf(tmp_sort_dir)
Example #4
def reads(local_file_path, max_reads=None):
    '''
    Count reads in a local file based on file format inferred from extension,
    up to a maximum of max_reads.
    '''
    if local_file_path.endswith(".gz"):
        cmd = r'''zcat "${local_file_path}"'''
        file_format = local_file_path.split(".")[-2]
    else:
        cmd = r'''cat "${local_file_path}"'''
        file_format = local_file_path.split(".")[-1]

    named_args = {
        'local_file_path': local_file_path
    }

    if max_reads:
        max_lines = reads2lines(max_reads, file_format)
        assert max_lines is not None, "Could not convert max_reads to max_lines"
        cmd += r''' | head -n "${max_lines}"'''
        named_args.update({
            'max_lines': max_lines
        })

    cmd += " |  wc -l"

    cmd_output = command.execute_with_output(
        command_patterns.ShellScriptCommand(
            script=cmd,
            named_args=named_args
        )
    )
    line_count = int(cmd_output.strip().split(' ')[0])
    return lines2reads(line_count, file_format)
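An illustrative usage sketch for the helper above (the file path is hypothetical):

# Count reads in a gzipped FASTQ; zcat output is capped via `head` at the line
# count corresponding to 100 reads, then piped into `wc -l`.
n = reads("/tmp/sample.fastq.gz", max_reads=100)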
Example #5
    def run(self):
        """
          1. extract contigs.fasta and read-contig.sam
          2. run pileup
        """
        contigs, _scaffolds, read_contig_sam, _stats = self.input_files_local[
            0]
        coverage_json, coverage_summary_csv = self.output_files_local()

        if os.path.getsize(contigs) < MIN_CONTIG_FILE_SIZE:
            command.write_text_to_file('{}', coverage_json)
            command.write_text_to_file('No Contigs', coverage_summary_csv)
            return

        # generate bam files
        bam_file = read_contig_sam.replace(".sam", ".bam")
        command.execute(
            command_patterns.ShellScriptCommand(
                script=
                r'''samtools view -S -b "${read_contig_sam}" | samtools sort - -o "${bam_file}";''',
                named_args={
                    'read_contig_sam': read_contig_sam,
                    'bam_file': bam_file
                }))
        command.execute(
            command_patterns.SingleCommand(cmd="samtools",
                                           args=["index", bam_file]))
        # run coverage info
        output_csv, output_json = self.calc_contig2coverage(bam_file)
        os.rename(output_csv, coverage_summary_csv)
        os.rename(output_json, coverage_json)
Example #6
 def generate_unidentified_fasta(input_fa, output_fa):
     # TODO  remove annotated fasta intermediate file and replace > with : below
     command.execute(
         command_patterns.ShellScriptCommand(
             script=
             r'''grep -A 1 '>NR::NT::' "$1" | sed '/^--$/d' > "$2";''',
             args=[input_fa, output_fa]))
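An illustrative usage sketch (paths are hypothetical); the call keeps only the records whose headers start with '>NR::NT::':

generate_unidentified_fasta("/tmp/annotated.fasta", "/tmp/unidentified.fasta")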
Example #7
def execute(
    command: Union[command_patterns.CommandPattern, str],
    progress_file: str = None,
    timeout: int = None,
    grace_period: int = None,
    capture_stdout: bool = False,
    merge_stderr: bool = False,
    log_context_mode: log.LogContextMode = log.LogContextMode.START_END_LOG_EVENTS
) -> Union[str, None]:
    """Primary way to start external commands in subprocesses and handle
    execution with logging.
    """
    if not isinstance(command, command_patterns.CommandPattern):
        # log warning if using legacy format
        log.write(
            warning=True,
            message=
            f"Command parameter is using legacy type str. Use idseq_dag.util.command_patterns.",
            obj_data={
                "cmd": command,
                "type": type(command)
            })
        cmd = command_patterns.ShellScriptCommand(script=command, args=[])
    else:
        cmd = command

    with CommandTracker() as ct:
        log_values = {"cid": f"Command {ct.id}", "command": cmd.as_dict()}
        with log.log_context('command_execute',
                             values=log_values,
                             log_context_mode=log_context_mode) as lctx:
            with ProgressFile(progress_file):
                if timeout:
                    ct.timeout = timeout
                if grace_period:
                    ct.grace_period = grace_period
                if capture_stdout:
                    # Capture only stdout. Child stderr = parent stderr unless
                    # merge_stderr specified. Child input = parent stdin.
                    ct.proc = cmd.open(stdin=sys.stdin.fileno(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT if merge_stderr
                                       else sys.stderr.fileno())
                    stdout, _ = ct.proc.communicate()
                else:
                    # Capture nothing. Child inherits parent stdin/out/err.
                    ct.proc = cmd.open()
                    ct.proc.wait()
                    stdout = None

                lctx.values.update({"returncode": ct.proc.returncode})

                if ct.proc.returncode:
                    raise subprocess.CalledProcessError(
                        ct.proc.returncode, str(command), stdout)
                if capture_stdout:
                    return stdout
Example #8
 def grab_wgs_accessions(self, source_file, dest_file):
     command.execute(
         command_patterns.ShellScriptCommand(
             script=
             r'''grep '^>' "${source_file}" | grep 'complete genome' | cut -f 1 -d' ' > "${dest_file}";''',
             named_args={
                 'source_file': source_file,
                 'dest_file': dest_file
             }))
Example #9
    def test_open_2(self):
        '''WHEN script uses shell variables within the script THEN it can get access to those variables but cannot use them from parameters'''
        cp1 = command_patterns.ShellScriptCommand(
            script=r'abc=123; echo abc=$abc, \$1=$1', args=["$abc"])

        p = cp1.open(stdout=subprocess.PIPE)
        stdout, stderr = p.communicate()

        self.assertFalse(stderr)
        self.assertEqual(stdout.decode(), "abc=123, $1=$abc\n")
Example #10
def multilinefa2singlelinefa(input_fasta, output_fasta):
    ''' Multi-line FASTA to Single-line FASTA conversion '''
    command.execute(
        command_patterns.ShellScriptCommand(
            script=
            r'''awk 'NR==1 {print $0} NR>1 && /^>/ {printf("\n%s\n",$0);next; } NR>1 { printf("%s",$0);}  END {printf("\n");}' <"${input_fasta}" > "${output_fasta}";''',
            named_args={
                'input_fasta': input_fasta,
                'output_fasta': output_fasta
            }))
Example #11
def fq2fa(input_fastq, output_fasta):
    ''' FASTQ to FASTA conversion '''
    command.execute(
        command_patterns.ShellScriptCommand(
            script=
            r'''sed -n '1~4s/^@/>/p;2~4p' <"${input_fastq}" > "${output_fasta}";''',
            named_args={
                'input_fastq': input_fastq,
                'output_fasta': output_fasta
            }))
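Illustrative usage sketches for the two conversion helpers above (paths are hypothetical):

# FASTQ to FASTA.
fq2fa("/tmp/reads.fastq", "/tmp/reads.fasta")
# Flatten a wrapped multi-line FASTA so each record has a single sequence line.
multilinefa2singlelinefa("/tmp/contigs_wrapped.fasta", "/tmp/contigs.fasta")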
Example #12
    def test_shellscript_with_param_array(self):
        '''WHEN using ShellScriptCommand to invoke a command with an array of parameters, THEN it works as expected'''
        cp1 = command_patterns.ShellScriptCommand(
            script=r'''paste "${slice_outputs[@]}"''',
            named_args={
                'slice_outputs':
                ["-d", r"\n", TESTFILE_ABC_TXT, TESTFILE_BCD_TXT]
            })

        result = command.execute_with_output(cp1)

        self.assertEqual(result, "abc\nbcd\n")
Example #13
    def test_specific_pattern_1(self):
        '''WHEN using ShellScriptCommand with multiline script, THEN it works as expected'''
        cp1 = command_patterns.ShellScriptCommand(script=r'''
                a=123;
                echo May $a the force be with you \
                | sed "s/a/Z/g"
            ''',
                                                  args=[])

        result = command.execute_with_output(cp1)

        self.assertEqual(result, "MZy 123 the force be with you\n")
Example #14
 def _vcf_replace_column_description(input_file, output_file, new_column_description):
     escaped_new_column_description = new_column_description.replace("\\", "\\\\").replace("&", "\\&").replace("/", r"\/")
     command.execute(
         command_patterns.ShellScriptCommand(
             script=r'''sed "${sed_pattern}" "${input_file}" > "${output_file}"''',
             named_args={
                 'sed_pattern': f"s/^#CHROM.*/{escaped_new_column_description}/",
                 'input_file': input_file,
                 'output_file': output_file
             }
         )
     )
Example #15
 def truncate_file(self, infile, outfile, is_fastq, max_fragments):
     num_lines = self.calc_max_num_lines(is_fastq, max_fragments)
     command.execute(
         command_patterns.ShellScriptCommand(
             script=
             r'''head -n "${num_lines}" "${infile}" > "${outfile}";''',
             named_args={
                 'num_lines': num_lines,
                 'infile': infile,
                 'outfile': outfile
             }))
     num_fragments = count.reads(outfile)
     self.summary_dict[vc.BUCKET_NORMAL] += num_fragments
     return num_fragments
Example #16
 def generate_nonhost_fastq(
     nonhost_headers: str,
     fastq: str,
     output_file: str
 ) -> None:
     command.execute(
         command_patterns.ShellScriptCommand(
             script=r'''seqtk subseq "$1" "$2" > "$3";''',
             args=[
                 fastq,
                 nonhost_headers,
                 output_file
             ]
         )
     )
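An illustrative usage sketch (paths are hypothetical); seqtk subseq extracts the reads listed in the headers file from the FASTQ:

generate_nonhost_fastq(
    nonhost_headers="/tmp/nonhost_headers_R1.txt",
    fastq="/tmp/input_R1.fastq",
    output_file="/tmp/nonhost_R1.fastq")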
Example #17
 def delimit_fasta(input_fa, tmp, taxid_field_num, output_fa):
     # Put every 2-line fasta record on a single line with delimiter
     # ":lineseparator:":
     script = r'''awk 'NR % 2 == 1 { o=$0 ; next } { print o ":lineseparator:" $0 }' "${input_fa}" '''
     # Sort the records based on the field containing the taxids
     script += r''' | sort -T "${tmp}" --key "${taxid_field_num}" --field-separator ':' --numeric-sort '''
     # Split every record back over 2 lines
     script += r''' | sed 's/:lineseparator:/\n/g' > "${output_fa}";'''
     command.execute(
         command_patterns.ShellScriptCommand(script=script,
                                             named_args={
                                                 "input_fa": input_fa,
                                                 "tmp": tmp,
                                                 "taxid_field_num":
                                                 taxid_field_num,
                                                 "output_fa": output_fa
                                             }))
Example #18
    def test_execute_shell_script_command_2(self):
        '''WHEN using ShellScriptCommand with args that have special shell characters, THEN it doesn't execute subcommands'''
        cp1 = command_patterns.ShellScriptCommand(
            script=r'echo ${@: 1:$#-1} | sed ${@: $#}',
            args=[
                "1", 2, "$(pwd)", ";ls", ";", "ls", "\n", "ls", "&& ls",
                "`pwd`", ">", "test.txt", "> test.txt", ">> test.txt", "&&",
                "ls", "$", "abc\nls"
                "pwd", "s/w/a/g"
            ])

        result = command.execute_with_output(cp1)

        self.assertEqual(
            result,
            "1 2 $(pad) ;ls ; ls ls && ls `pad` > test.txt > test.txt >> test.txt && ls $ abc lspad\n"
        )
Example #19
    def lzw_compute(input_files,
                    threshold_readlength,
                    cutoff,
                    slice_step=NUM_SLICES):
        """Spawn subprocesses on NUM_SLICES of the input files, then coalesce the
        scores into a temp file, and return that file's name."""

        temp_file_names = [
            f"lzwslice_{slice_step}_{slice_start}.txt"
            for slice_start in range(slice_step + 1)
        ]
        for tfn in temp_file_names:
            assert not os.path.exists(tfn)

        @run_in_subprocess
        def lzw_compute_slice(slice_start):
            """For each read, or read pair, in input_files, such that read_index % slice_step == slice_start,
            output the lzw score for the read, or the min lzw score for the pair."""
            lzw_score = PipelineStepRunLZW.lzw_score
            with open(temp_file_names[slice_start], "a") as slice_output:
                for i, reads in enumerate(
                        fasta.synchronized_iterator(input_files)):
                    if i % slice_step == slice_start:
                        lzw_min_score = min(
                            lzw_score(r.sequence, threshold_readlength, cutoff)
                            for r in reads)
                        slice_output.write(str(lzw_min_score) + "\n")

        # slices run in parallel
        mt_map(lzw_compute_slice, range(slice_step))

        slice_outputs = temp_file_names[:-1]
        coalesced_score_file = temp_file_names[-1]
        # Paste can insert newlines at the end;  we grep those out.
        command.execute(
            command_patterns.ShellScriptCommand(
                script=
                r'''paste -d '\n' "${slice_outputs[@]}" | grep -v ^$ > "${coalesced_score_file}";''',
                named_args={
                    'coalesced_score_file': coalesced_score_file,
                    'slice_outputs': slice_outputs
                }))
        for tfn in slice_outputs:
            os.remove(tfn)
        return coalesced_score_file
Example #20
    def test_named_args(self):
        '''WHEN parameter named_args is used THEN variables are automatically expanded'''
        cp1 = command_patterns.ShellScriptCommand(
            script=r'''
                set -e;
                echo "original_string = \"${original_string}\"";
                echo "sed_patterns = \"${sed_patterns[@]}\"";
                echo "end_sed_options = \"${end_sed_options[@]}\"";
                echo "sed_patterns[0] = \"${sed_patterns[0]}\"";
                echo "sed_patterns[1] = \"${sed_patterns[1]}\"";
                echo "sed_patterns[2] = \"${sed_patterns[2]}\"";
                echo "sed_patterns[3] = \"${sed_patterns[3]}\"";
                echo "sed_patterns[4] = \"${sed_patterns[4]}\"";
                echo "sed_patterns[5] = \"${sed_patterns[5]}\"";
                echo "empty_array = \"${empty_array[@]}\"";
                echo "empty_str = \"${empty_str}\"";
                echo "${original_string}" | $sed_command "${sed_patterns[@]}" | sed "${end_sed_options[@]}" 
            ''',
            named_args={
                'original_string': "ABCDEF",
                'sed_command': 'sed',
                'end_sed_options': ['-e', 's/X/Y/'],
                'sed_patterns':
                ['-e', 's/A/Z /', '-e', 's/B/X/', '-e', 's/;&`/^/'],
                'empty_str': '',
                'empty_array': []
            })

        p = cp1.open(stdout=subprocess.PIPE)
        stdout, stderr = p.communicate()

        self.assertFalse(stderr)
        self.assertEqual(
            stdout.decode(), 'original_string = "ABCDEF"\n'
            'sed_patterns = "-e s/A/Z / -e s/B/X/ -e s/;&`/^/"\n'
            'end_sed_options = "-e s/X/Y/"\n'
            'sed_patterns[0] = "-e"\n'
            'sed_patterns[1] = "s/A/Z /"\n'
            'sed_patterns[2] = "-e"\n'
            'sed_patterns[3] = "s/B/X/"\n'
            'sed_patterns[4] = "-e"\n'
            'sed_patterns[5] = "s/;&`/^/"\n'
            'empty_array = ""\n'
            'empty_str = ""\n'
            'Z YCDEF\n')
Example #21
 def get_taxid_genomes(genome_list_local, taxid, n_per_taxid):
     cmd = command_patterns.ShellScriptCommand(
         script=(
             # columns: 1 = assembly_accession; 6 = taxid; 7 = species_taxid, 8 = organism_name, 20 = ftp_path
             r'''cut -f1,6,7,8,20 "${genome_list_local}" '''
             # try to find taxid in the taxid column (2nd column of the piped input)
             r''' | awk -F '\t' "${awk_find_pattern}" '''
             # take only top n_per_taxid results
             r''' | head -n "${n_per_taxid}";'''
         ),
         named_args={
             'genome_list_local': genome_list_local,
             'awk_find_pattern': f'$2 == "{taxid}"',
             'n_per_taxid': n_per_taxid
         }
     )
     taxid_genomes = list(filter(None, command.execute_with_output(cmd).split("\n")))
     return taxid_genomes
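An illustrative usage sketch (the assembly summary path and taxid are hypothetical):

# Each returned entry is a tab-separated row with assembly_accession, taxid,
# species_taxid, organism_name and ftp_path, as selected by the pipeline above.
taxid_genomes = get_taxid_genomes("/tmp/assembly_summary.txt", 562, 3)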
Example #22
    def test_open_1(self):
        '''WHEN args have special shell characters, THEN it doesn't execute subcommands'''
        cp1 = command_patterns.ShellScriptCommand(
            script=r'''echo "${@:1:$#-1}" ${@:1:$#-1} | sed "${@:$#}";''',
            args=[
                "1", 2, "$(pwd)", ";ls", ";", "ls", "\n", "ls", "&& ls",
                "`pwd`", ">", "test.txt", "> test.txt", ">> test.txt", "&&",
                "ls", "$", "abc\nls", "pwd", '"quotes"', "s/w/a/g"
            ])

        p = cp1.open(stdout=subprocess.PIPE)
        stdout, stderr = p.communicate()

        self.assertFalse(stderr)
        self.assertEqual(
            stdout.decode(),
            "1 2 $(pad) ;ls ; ls \n ls && ls `pad` > test.txt > test.txt >> test.txt && ls $ abc\nls pad \"quotes\" "
            "1 2 $(pad) ;ls ; ls ls && ls `pad` > test.txt > test.txt >> test.txt && ls $ abc ls pad \"quotes\"\n"
        )
Example #23
    def test_execute_shell_script_command_1(self):
        '''WHEN using ShellScriptCommand with args that contain spaces or special characters, THEN it doesn't split them into separate arguments'''
        cp1 = command_patterns.ShellScriptCommand(
            script=r'''
                echo May the force be with you >> "$1"
                echo The truth is out there > "$1"
                echo Live longer and prosper >> "$1"
                grep "${@:3}" "$1" > "$2"
                cat "$2"
                rm "$1" "$2"
            ''',
            args=[
                r'''/tmp/tmp file with spaces, 'quotes', "double-quotes" and other bizarre characters `~&>.txt''',
                r'''/tmp/another tmp output file.txt''', "-e", "is out", "-e",
                "longer and prosper"
            ])

        result = command.execute_with_output(cp1)

        self.assertEqual(result,
                         "The truth is out there\nLive longer and prosper\n")
Example #24
    def run(self):
        """
        Generate host genome indexes for STAR and bowtie2
        """
        # Set up
        input_fasta_path = self.input_files_local[0][0]
        ercc_fasta_path = s3.fetch_from_s3(self.additional_files["ercc_fasta"],
                                           self.output_dir_local,
                                           allow_s3mi=True,
                                           auto_unzip=True)
        if input_fasta_path[-3:] == '.gz':
            # unzip the file
            dest_path = input_fasta_path[:-3]
            command.execute(
                command_patterns.ShellScriptCommand(
                    script=
                    r'''gzip -dc "${input_fasta_path}" > "${dest_path}";''',
                    named_args={
                        'input_fasta_path': input_fasta_path,
                        'dest_path': dest_path
                    }))

            input_fasta_path = dest_path

        input_gtf_path = None
        if self.additional_files.get("input_gtf"):
            input_gtf_path = s3.fetch_from_s3(
                self.additional_files["input_gtf"],
                self.output_dir_local,
                allow_s3mi=True)

        ercc_gtf_path = s3.fetch_from_s3(self.additional_files["ercc_gtf"],
                                         self.output_dir_local,
                                         allow_s3mi=True,
                                         auto_unzip=True)

        host_name = self.additional_attributes["host_name"]
        max_star_part_size = self.additional_attributes.get(
            "max_star_part_size")
        input_fasta_with_ercc = f"{input_fasta_path}.with_ercc"
        command.execute(
            command_patterns.ShellScriptCommand(
                script=
                r'''cat "${ercc_fasta_path}" "${input_fasta_path}" > "${input_fasta_with_ercc}";''',
                named_args={
                    'ercc_fasta_path': ercc_fasta_path,
                    'input_fasta_path': input_fasta_path,
                    'input_fasta_with_ercc': input_fasta_with_ercc
                }))

        input_gtf_with_ercc = ercc_gtf_path
        if input_gtf_path:
            input_gtf_with_ercc = f"{input_gtf_path}.with_ercc"
            command.execute(
                command_patterns.ShellScriptCommand(
                    script=
                    r'''cat "${ercc_gtf_path}" "${input_gtf_path}" > "${input_gtf_with_ercc}";''',
                    named_args={
                        'ercc_gtf_path': ercc_gtf_path,
                        'input_gtf_path': input_gtf_path,
                        'input_gtf_with_ercc': input_gtf_with_ercc
                    }))

        output_fasta_file, output_gtf_file, output_star_index, output_bowtie2_index = self.output_files_local(
        )

        command.copy_file(input_fasta_with_ercc, output_fasta_file)
        command.copy_file(input_gtf_with_ercc, output_gtf_file)

        # make STAR index
        self.make_star_index(input_fasta_with_ercc, input_gtf_with_ercc,
                             output_star_index, max_star_part_size)

        # make bowtie2 index
        self.make_bowtie2_index(host_name, input_fasta_with_ercc,
                                output_bowtie2_index)
Example #25
    def run(self):
        # Setup
        input_files = self.input_files_local[0][0:2]
        num_inputs = len(input_files)
        assert num_inputs in [1, 2], 'Invalid number of input files'
        output_files = self.output_files_local()[1:3]
        summary_file = self.output_files_local()[0]
        max_fragments = self.additional_attributes["truncate_fragments_to"]

        file_ext = self.additional_attributes.get("file_ext")
        assert file_ext in ['fastq', 'fasta'], 'Invalid file extension'

        is_fastq = file_ext == 'fastq'

        try:
            for i in range(num_inputs):
                input_file = input_files[i]
                splited_input_file_name, splited_input_file_ext = os.path.splitext(
                    input_file)

                num_lines = self.calc_max_num_lines(is_fastq, max_fragments)

                # unzip if .gz file
                if splited_input_file_ext == '.gz':
                    input_files[i] = splited_input_file_name
                    try:
                        # test if a valid gzip file
                        command.execute(
                            command_patterns.SingleCommand(
                                cmd="gzip", args=["-t", input_file]))
                        # then decompress it
                        command.execute(
                            command_patterns.ShellScriptCommand(
                                script=
                                r'''gzip -dc "${input_file}" | cut -c -"$[max_line_length+1]" | head -n "${num_lines}" | awk -f "${awk_script_file}" -v max_line_length="${max_line_length}" > "${output_file}";''',
                                named_args={
                                    "input_file":
                                    input_file,
                                    "awk_script_file":
                                    command.get_resource_filename(
                                        "scripts/fastq-fasta-line-validation.awk"
                                    ),
                                    "max_line_length":
                                    vc.MAX_LINE_LENGTH,
                                    "num_lines":
                                    num_lines,
                                    "output_file":
                                    splited_input_file_name
                                }))
                    except:
                        raise InvalidFileFormatError(
                            "Invalid fastq/fasta/gzip file")
                else:
                    # Validate and truncate the input file to keep behavior consistent with gz input files
                    try:
                        tmp_file = splited_input_file_name + ".tmp"
                        command.execute(
                            command_patterns.ShellScriptCommand(
                                script=
                                r'''cat "${input_file}" | cut -c -"$[max_line_length+1]" | head -n "${num_lines}" | awk -f "${awk_script_file}" -v max_line_length="${max_line_length}" > "${output_file}";''',
                                named_args={
                                    "input_file":
                                    input_file,
                                    "awk_script_file":
                                    command.get_resource_filename(
                                        "scripts/fastq-fasta-line-validation.awk"
                                    ),
                                    "max_line_length":
                                    vc.MAX_LINE_LENGTH,
                                    "num_lines":
                                    num_lines,
                                    "output_file":
                                    tmp_file
                                }))
                        input_files[i] = tmp_file
                    except:
                        raise InvalidFileFormatError(
                            "Invalid fastq/fasta file")

            # keep a dictionary of the distribution of read lengths in the files
            self.summary_dict = {
                vc.BUCKET_TOO_SHORT: 0,
                vc.BUCKET_NORMAL: 0,
                vc.BUCKET_LONG: 0,
                vc.BUCKET_TOO_LONG: 0
            }

            quick_check_passed = \
                self.quick_check_file(input_files[0], is_fastq) and \
                (num_inputs == 1 or self.quick_check_file(input_files[1], is_fastq))

            all_fragments = []

            for infile, outfile in zip(input_files, output_files):
                if quick_check_passed:
                    num_fragments = self.truncate_file(infile, outfile,
                                                       is_fastq, max_fragments)
                else:
                    num_fragments = self._full_check_and_truncate_file(
                        infile, outfile, is_fastq, max_fragments, num_inputs)
                all_fragments.append(num_fragments)

            if len(all_fragments) == 2 and abs(all_fragments[1] -
                                               all_fragments[0]) > 1000:
                raise InvalidFileFormatError(
                    "Paired input files need to contain the same number of reads"
                )

            with open(summary_file, 'w') as summary_f:
                json.dump(self.summary_dict, summary_f)

        except Exception as e:
            with open(summary_file, 'w') as summary_f:
                json.dump({'Validation error': str(e)}, summary_f)
            s3_path = self.s3_path(summary_file)
            s3.upload_with_retries(summary_file, s3_path)
            raise e

        return
Example #26
    def calc_contig2coverage(bam_filename):
        # PySAM pileup is CPU-intensive.  Each CPU core is assigned a slice of the input bam file
        # on which to perform pileup.  The slice contigs are selected by slice_idx modulo num_slices.
        # Each slice gets its own pair of temporary output files, one in CSV format and one in JSON.
        # In the end, these slice outputs are concatenated.  This is a similar pattern to run_lzw.
        num_physical_cpu = (cpu_count() + 1) // 2
        num_slices = num_physical_cpu
        output_csv_filenames = [
            f"tmp_slice_{num_slices}_{slice}.csv"
            for slice in range(num_slices + 1)
        ]
        output_json_filenames = [
            f"tmp_slice_{num_slices}_{slice}.json"
            for slice in range(num_slices + 1)
        ]
        for fn in output_csv_filenames + output_json_filenames:
            if os.path.exists(fn):
                os.remove(fn)

        @run_in_subprocess
        def compute_slice(slice_idx):
            with open(output_csv_filenames[slice_idx], "w") as output_csv, \
                 open(output_json_filenames[slice_idx], "w") as output_json, \
                 pysam.AlignmentFile(bam_filename, "rb") as input_bam:  # noqa: E126
                for contig_idx, contig_name in enumerate(input_bam.references):
                    if contig_idx % num_slices == slice_idx:
                        PipelineStepGenerateCoverageStats._process_contig(
                            input_bam, output_csv, output_json, contig_name)

        # Compute pileup for each slice
        with LongRunningCodeSection(
                "PipelineStepGenerateCoverageStats.calc_contig2coverage.mt_map"
        ):
            mt_map(compute_slice, range(num_slices))
        # Output CSV headers
        with open(output_csv_filenames[-1], "w") as ocsv:
            ocsv.write(",".join(COVERAGE_STATS_SCHEMA))
            ocsv.write("\n")
        # Output JSON dict open paren
        with open(output_json_filenames[-1], "w") as ojson:
            ojson.write("{")
        # Collate CSV slices
        command.execute(
            command_patterns.ShellScriptCommand(
                script=
                r'''cat "${individual_slice_outputs[@]}" >> "${collated_csv}";''',  # note >> for appending
                named_args={
                    'collated_csv': output_csv_filenames[-1],
                    'individual_slice_outputs': output_csv_filenames[:-1]
                }))
        for tfn in output_csv_filenames[:-1]:
            os.remove(tfn)
        # Collate JSON slices, replacing final ", " with "}"
        command.execute(
            command_patterns.ShellScriptCommand(
                script=
                r'''cat "${individual_slice_outputs[@]}" | sed 's=, $=}=' >> "${collated_json}";''',  # note >> for appending
                named_args={
                    'collated_json': output_json_filenames[-1],
                    'individual_slice_outputs': output_json_filenames[:-1]
                }))
        for tfn in output_json_filenames[:-1]:
            os.remove(tfn)
        return (output_csv_filenames[-1], output_json_filenames[-1])
Example #27
    def get_accession_sequences(self, dest_dir, taxid, n=10):
        '''
        Retrieve NCBI NT references for the most-matched accession in each hitsummary2 file, up to a maximum of n references.
        Write each reference to a separate fasta file.
        '''
        if n == 0:
            return {}

        # Retrieve files
        nt_db = self.additional_attributes["nt_db"]
        nt_loc_db = s3.fetch_reference(
            self.additional_files["nt_loc_db"],
            self.ref_dir_local,
            allow_s3mi=True)

        # Choose accessions to process.
        s3_hitsummary2_files = self.additional_attributes["hitsummary2_files"].values()
        accessions = defaultdict(lambda: 0)
        # TODO: Address issue where accessions in nr can be chosen in the following code.
        # These accessions will not be found in nt_loc and will be subsequently omitted.
        for file_list in s3_hitsummary2_files:
            tally = defaultdict(lambda: 0)
            for s3_file in file_list:
                local_basename = s3_file.replace("/", "-").replace(":", "-")
                local_file = s3.fetch_from_s3(
                    s3_file,
                    os.path.join(self.output_dir_local, local_basename))
                if local_file is None:
                    continue
                with open(local_file, 'r') as f:
                    for line in f:
                        acc, species_taxid, genus_taxid, family_taxid = line.rstrip().split("\t")[3:7]
                        if any(int(hit_taxid) == taxid for hit_taxid in [species_taxid, genus_taxid, family_taxid]):
                            tally[acc] += 1
            if tally:
                best_acc, max_count = max(tally.items(), key=lambda x: x[1])
                accessions[best_acc] += max_count
        if len(accessions) > n:
            accessions = dict(sorted(accessions.items(), key=lambda x: x[1], reverse=True)[:n])
        accessions = set(accessions.keys())

        # Make map of accession to sequence file
        accession2info = dict((acc, {}) for acc in accessions)
        with open_file_db_by_extension(nt_loc_db) as nt_loc_dict:
            PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
                accession2info, nt_loc_dict, nt_db)

        # Put 1 fasta file per accession into the destination directory
        accession_fastas = {}
        for acc, info in accession2info.items():
            if 'seq_file' not in info or info['seq_file'] is None:
                log.write(f"WARNING: No sequence retrieved for {acc}")
                continue
            clean_accession = self.clean_name_for_ksnp3(acc)
            local_fasta = f"{dest_dir}/NCBI_NT_accession_{clean_accession}.fasta"
            command.execute(
                command_patterns.SingleCommand(
                    cmd="ln",
                    args=[
                        "-s",
                        info['seq_file'],
                        local_fasta
                    ]
                )
            )
            command.execute_with_output(
                command_patterns.ShellScriptCommand(
                    script=r'''echo ">${acc}" | cat - "${local_fasta}" > temp_file;''',
                    named_args={
                        'acc': acc,
                        'local_fasta': local_fasta
                    }
                )
            )
            command.move_file('temp_file', local_fasta)

            accession_fastas[acc] = local_fasta

        # Return kept accessions and paths of their fasta files
        return accession_fastas
Example #28
def fetch_from_s3(
        src,  # pylint: disable=dangerous-default-value
        dst,
        auto_unzip=DEFAULT_AUTO_UNZIP,
        auto_untar=DEFAULT_AUTO_UNTAR,
        allow_s3mi=DEFAULT_ALLOW_S3MI,
        okay_if_missing=False,
        is_reference=False,
        touch_only=False,
        mutex=TraceLock("fetch_from_s3", multiprocessing.RLock()),
        locks={}):
    """Fetch a file from S3 if needed, using either s3mi or aws cp.

    IT IS NOT SAFE TO CALL THIS FUNCTION FROM MULTIPLE PROCESSES.
    It is totally fine to call it from multiple threads (it is designed for that).

    When is_reference=True, "dst" must be an existing directory.

    If src does not exist or there is a failure fetching it, the function returns None,
    without raising an exception.  If the download is successful, it returns the path
    to the downloaded file or folder.  If the download already exists, it is touched
    to update its timestamp.

    When touch_only=True, if the destination does not already exist, the function
    simply returns None (as if the download failed).  If the destination does exist,
    it is touched as usual.  This is useful in implementing an LRU cache policy.

    An exception is raised only if there is a coding error or equivalent problem,
    not if src simply doesn't exist.
    """
    # FIXME: this is a compatibility hack so we can replace this function
    #   We are removing ad-hoc s3 downloads from within steps and converting
    #   additional_files to wdl inputs. These files will be transparently
    #   downloaded by miniwdl. miniwdl will also handle the caching that
    #   is currently done here. This hack bypasses the s3 download if the
    #   source is already a local file, and returns the source (which is
    #   a local file path). This way, when we change the additional_files
    #   to inputs we can provide the local file path to the step instead
    #   of the s3 path and seamlessly transition without a coordinated
    #   change between idseq-dag and the idseq monorepo.
    if not src.startswith("s3://"):
        log.write(
            f"fetch_from_s3 is skipping download because source: {src} does not start with s3://"
        )
        if not os.path.isfile(src):
            return None
        if auto_untar and src.endswith(".tar"):
            dst = src[:-4]
            if not os.path.isdir(dst):
                command.make_dirs(dst + ".untarring")
                script = 'tar xvf "${src}" -C "${tmp_destdir}"'
                named_args = {"src": src, "tmp_destdir": dst + ".untarring"}
                command.execute(
                    command_patterns.ShellScriptCommand(script=script,
                                                        named_args=named_args))
                command.rename(dst + ".untarring/" + os.path.basename(dst),
                               dst)
            return dst
        return src

    # Do not be misled by the multiprocessing.RLock() above -- that just means it won't deadlock
    # if called from multiple processes but does not mean the behavior will be correct.  It will
    # be incorrect, because the locks dict (containing per-file locks) cannot be shared across
    # processes, the way it can be shared across threads.

    if is_reference:
        assert config[
            "REF_DIR"], "The is_reference code path becomes available only after initializing global config['REF_DIR']"

    if os.path.exists(dst) and os.path.isdir(dst):
        dirname, basename = os.path.split(src)
        if is_reference or os.path.abspath(dst).startswith(config["REF_DIR"]):
            # Downloads to the reference dir are persisted from job to job, so we must include
            # version information from the full s3 path.
            #
            # The final destination for s3://path/to/source.db will look like /mnt/ref/s3__path__to/source.db
            # The final destination for s3://path/to/myarchive.tar will look like /mnt/ref/s3__path__to/myarchive/...
            #
            # We considered some other alternatives, for example /mnt/ref/s3__path__to__source.db, but unfortunately,
            # some tools incorporate the base name of their database input into the output filenames, so any approach
            # that changes the basename causes problems downstream.  An example such tool is srst2.
            is_reference = True
            if dirname.startswith("s3://"):
                dirname = dirname.replace("s3://", "s3__", 1)
            # If dirname contains slashes, it has to be flattened to single level.
            dirname = dirname.replace("/", "__")
            dst = os.path.join(dst, dirname, basename)
        else:
            dst = os.path.join(dst, basename)
    else:
        assert not is_reference, f"When fetching references, dst must be an existing directory: {dst}"

    unzip = ""
    if auto_unzip:
        file_without_ext, ext = os.path.splitext(dst)
        if ext in ZIP_EXTENSIONS:
            unzip = " | " + ZIP_EXTENSIONS[
                ext]  # this command will be used to decompress stdin to stdout
            dst = file_without_ext  # remove file extension from dst
    untar = auto_untar and dst.lower().endswith(".tar")
    if untar:
        dst = dst[:-4]  # Remove .tar

    # Downloads are staged under tmp_destdir.  Only after a download completes successfully is it moved to dst.
    destdir = os.path.dirname(dst)
    tmp_destdir = os.path.join(destdir, "tmp_downloads")
    tmp_dst = os.path.join(tmp_destdir, os.path.basename(dst))

    abspath = os.path.abspath(dst)
    with mutex:
        if abspath not in locks:
            locks[abspath] = TraceLock(f"fetch_from_s3: {abspath}",
                                       multiprocessing.RLock())
        destination_lock = locks[abspath]

    # tmp_dst == dst shouldn't happen, and would make it impossible to ensure that any dst that exists is complete and correct.
    assert tmp_dst != dst, f"Problematic use of fetch_from_s3 with tmp_dst==dst=='{dst}'"

    with destination_lock:
        # This check is a bit imperfect when untarring... unless you follow the discipline that
        # all contents of file foo.tar are under directory foo/... (which we do follow in IDseq)
        if os.path.exists(dst):
            command.touch(dst)
            return dst

        if touch_only:
            return None

        for (kind, ddir) in [("destination", destdir),
                             ("temporary download", tmp_destdir)]:
            try:
                if ddir:
                    command.make_dirs(ddir)
            except OSError as e:
                # It's okay if the parent directory already exists, but all other
                # errors fail the download.
                if e.errno != errno.EEXIST:
                    log.write(f"Error in creating {kind} directory.")
                    return None

        with IOSTREAM:
            try:
                if allow_s3mi:
                    wait_start = time.time()
                    allow_s3mi = S3MI_SEM.acquire(timeout=MAX_S3MI_WAIT)
                    wait_duration = time.time() - wait_start
                    if not allow_s3mi:
                        log.write(
                            f"Failed to acquire S3MI semaphore after waiting {wait_duration} seconds for {src}."
                        )
                    elif wait_duration >= 5:
                        log.write(
                            f"Waited {wait_duration} seconds to acquire S3MI semaphore for {src}."
                        )

                if untar:
                    write_dst = r''' | tar xvf - -C "${tmp_destdir}";'''
                    named_args = {'tmp_destdir': tmp_destdir}
                else:
                    write_dst = r''' > "${tmp_dst}";'''
                    named_args = {'tmp_dst': tmp_dst}
                command_params = f"{unzip} {write_dst}"

                named_args.update({'src': src})

                try_cli = not allow_s3mi
                if allow_s3mi:
                    if os.path.exists(tmp_dst):
                        command.remove_rf(tmp_dst)
                    try:
                        command.execute(
                            command_patterns.ShellScriptCommand(
                                script=
                                r'set -o pipefail; s3mi cat --quiet "${src}" '
                                + command_params,
                                named_args=named_args))
                    except subprocess.CalledProcessError:
                        try_cli = not okay_if_missing
                        allow_s3mi = False
                        S3MI_SEM.release()
                        if try_cli:
                            log.write(
                                "Failed to download with s3mi. Trying with aws s3 cp..."
                            )
                        else:
                            raise
                if try_cli:
                    if os.path.exists(tmp_dst):
                        command.remove_rf(tmp_dst)
                    if okay_if_missing:
                        script = r'set -o pipefail; aws s3 cp --quiet "${src}" - ' + command_params
                    else:
                        script = r'set -o pipefail; aws s3 cp --only-show-errors "${src}" - ' + command_params
                    command.execute(
                        command_patterns.ShellScriptCommand(
                            script=script,
                            named_args=named_args,
                            env=dict(os.environ, **refreshed_credentials())))
                # Move staged download into final location.  Leave this last, so it only happens if no exception has occurred.
                # By this point we have already asserted that tmp_dst != dst.
                command.rename(tmp_dst, dst)
                return dst
            except BaseException as e:  # Deliberately super broad to make doubly certain that dst will be removed if there has been any exception
                if os.path.exists(dst):
                    command.remove_rf(dst)
                if not isinstance(e, subprocess.CalledProcessError):
                    # Coding error of some sort.  Best not hide it.
                    raise
                if okay_if_missing:
                    # We presume.
                    log.write("File most likely does not exist in S3.")
                else:
                    log.write("Failed to fetch file from S3.")
                return None
            finally:
                if allow_s3mi:
                    S3MI_SEM.release()
                if os.path.exists(
                        tmp_dst
                ):  # by this point we have asserted that tmp_dst != dst (and that assert may have failed, but so be it)
                    command.remove_rf(tmp_dst)
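An illustrative usage sketch (the S3 path and destination directory are hypothetical):

# Returns the local path of the downloaded file, or None if the object is
# missing or the download fails.
local_file = fetch_from_s3("s3://example-bucket/databases/nt_loc.db",
                           "/mnt/downloads",
                           allow_s3mi=True)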
Example #29
    def run(self):
        output_files = self.output_files_local()
        local_taxon_fasta_files = [f for input_item in self.input_files_local for f in input_item]
        taxid = self.additional_attributes["taxid"]
        reference_taxids = self.additional_attributes.get("reference_taxids", [taxid])  # Note: will only produce a result if species-level or below
        # During phylo tree creation, if the taxon is in an unknown superkingdom then the k selected from k_config is supposed to be from the key None.
        superkingdom_name = self.additional_attributes.get("superkingdom_name") if self.additional_attributes.get("superkingdom_name") != '' else None

        # kSNP3 has a command (MakeKSNP3infile) for making a ksnp3-compatible input file from a directory of fasta files.
        # Before we can use the command, we symlink all fasta files to a dedicated directory.
        # The command makes certain unreasonable assumptions:
        # - current directory is parent directory of the fasta file directory
        # - file names do not have dots except before extension (also no spaces)
        # - file names cannot be too long (for kSNP3 tree building).
        input_dir_for_ksnp3 = f"{self.output_dir_local}/inputs_for_ksnp3"
        command.make_dirs(input_dir_for_ksnp3)
        for local_file in local_taxon_fasta_files:
            command.execute(
                command_patterns.SingleCommand(
                    cmd="ln",
                    args=[
                        "-s",
                        local_file,
                        os.path.join(input_dir_for_ksnp3, os.path.basename(local_file))
                    ]
                )
            )

        # Retrieve Genbank references (full assembled genomes).
        genbank_fastas = self.get_genbank_genomes(reference_taxids, input_dir_for_ksnp3, superkingdom_name, 0)

        # Retrieve NCBI NT references for the accessions in the alignment viz files.
        # These are the accessions (not necessarily full genomes) that were actually matched
        # by the sample's reads during GSNAP alignment.
        accession_fastas = self.get_accession_sequences(input_dir_for_ksnp3, taxid, 10)

        # Retrieve NCBI metadata for the accessions
        metadata_by_node = self.get_metadata_by_tree_node({**accession_fastas, **genbank_fastas})
        metadata_output = output_files[1]
        with open(metadata_output, 'w') as f:
            json.dump(metadata_by_node, f)

        # Run MakeKSNP3infile.
        ksnp3_input_file = f"{self.output_dir_local}/inputs.txt"
        command.execute(
            command_patterns.SingleCommand(
                cd=self.output_dir_local,
                cmd='MakeKSNP3infile',
                args=[
                    os.path.basename(input_dir_for_ksnp3),
                    ksnp3_input_file,
                    "A"
                ]
            )
        )

        # Specify the names of finished reference genomes.
        # Used for annotation & variant-calling.
        annotated_genome_input = f"{self.output_dir_local}/annotated_genomes"
        reference_fasta_files = list(genbank_fastas.values()) + list(accession_fastas.values())
        if reference_fasta_files:
            grep_options = (("-e", path) for path in reference_fasta_files)
            grep_options = list(itertools.chain.from_iterable(grep_options))  # flatmap
            command.execute(
                command_patterns.ShellScriptCommand(
                    script=r'''grep "${grep_options[@]}" "${ksnp3_input_file}" | cut -f2 > "${annotated_genome_input}";''',
                    named_args={
                        'ksnp3_input_file': ksnp3_input_file,
                        'annotated_genome_input': annotated_genome_input,
                        'grep_options': grep_options
                    }
                )
            )

        # Now build ksnp3 command:
        k_config = {
            # All entries to be revisited and benchmarked.
            # Values for viruses and bacteria come from kSNP3 recommendations (13-15 / 19-21).
            "Viruses": 13,
            "Bacteria": 19,
            "Eukaryota": 19,
            None: 13
        }
        k = k_config[superkingdom_name]
        ksnp_output_dir = f"{self.output_dir_local}/ksnp3_outputs"
        command.make_dirs(ksnp_output_dir)
        ksnp_cd = os.path.dirname(ksnp_output_dir)
        ksnp_cmd = "kSNP3"
        ksnap_args = [
            "-in",
            "inputs.txt",
            "-outdir",
            os.path.basename(ksnp_output_dir),
            "-k",
            k
        ]

        # Annotate SNPs using reference genomes:
        # TODO: fix gi vs accession problem
        if os.path.isfile(annotated_genome_input):
            ksnap_args.extend([
                "-annotate",
                os.path.basename(annotated_genome_input)
            ])
            snps_all_annotated = f"{ksnp_output_dir}/SNPs_all_annotated"
            if os.path.isfile(snps_all_annotated):
                self.additional_output_files_hidden.append(snps_all_annotated)

        # Produce VCF file with respect to first reference genome in annotated_genome_input:
        if os.path.isfile(annotated_genome_input):
            ksnap_args.append("-vcf")

        # Run ksnp3 command:
        command.execute(
            command_patterns.SingleCommand(
                cd=ksnp_cd,
                cmd=ksnp_cmd,
                args=ksnap_args
            )
        )

        # Postprocess output names in preparation for upload:
        command.move_file(os.path.join(ksnp_output_dir, "tree.parsimony.tre"), output_files[0])
        ksnp_vcf_file = glob.glob(f"{ksnp_output_dir}/*.vcf")
        if ksnp_vcf_file:
            target_vcf_file = f"{ksnp_output_dir}/variants_reference1.vcf"
            self.name_samples_vcf(ksnp_vcf_file[0], target_vcf_file)
            self.additional_output_files_hidden.append(target_vcf_file)

        # Upload all kSNP3 output files for potential future reference
        supplementary_files = [f for f in glob.glob(f"{ksnp_output_dir}/*")
                               if os.path.isfile(f) and
                               f not in self.additional_output_files_hidden]
        self.additional_output_files_hidden.extend(supplementary_files)