Ejemplo n.º 1
0
 def test_failed_run_not_verbose(self):
     # A command that mafft rejects must surface as CalledProcessError even
     # when run_command is asked not to be verbose.
     fasta_path = self.get_data_path('unaligned-dna-sequences-1.fasta')

     # Keep both format objects bound to names for the whole test; only
     # their string forms (used as file paths here) are passed along.
     unaligned = DNAFASTAFormat(fasta_path, mode='r')
     aligned = AlignedDNAFASTAFormat()
     unaligned_fp = str(unaligned)
     aligned_fp = str(aligned)

     bad_cmd = ["mafft", "--not-a-real-parameter", unaligned_fp]

     # stderr is redirected to the null device while the failing command
     # runs.
     expect_failure = self.assertRaises(subprocess.CalledProcessError)
     with expect_failure, redirected_stdio(stderr=os.devnull):
         run_command(bad_cmd, aligned_fp, verbose=False)
Ejemplo n.º 2
0
 def test_failed_run_not_verbose(self):
     # A command that mafft rejects must surface as CalledProcessError even
     # when run_command is asked not to be verbose.
     fasta_path = self.get_data_path('unaligned-dna-sequences-1.fasta')

     # Keep both format objects bound to names for the whole test; only
     # their string forms (used as file paths here) are passed along.
     unaligned = DNAFASTAFormat(fasta_path, mode='r')
     aligned = AlignedDNAFASTAFormat()
     unaligned_fp = str(unaligned)
     aligned_fp = str(aligned)

     bad_cmd = ["mafft", "--not-a-real-parameter", unaligned_fp]

     # stderr is redirected to the null device while the failing command
     # runs.
     expect_failure = self.assertRaises(subprocess.CalledProcessError)
     with expect_failure, redirected_stdio(stderr=os.devnull):
         run_command(bad_cmd, aligned_fp, verbose=False)
Ejemplo n.º 3
0
def ipcress(sequence: DNAFASTAFormat,
            primer_a: str,
            primer_b: str,
            min_product_len: int = 50,
            max_product_len: int = 1600,
            mismatch: int = 0,
            memory: int = 2048,
            seed: int = 12) -> DNAFASTAFormat:
    """Run the ``ipcress`` command on ``sequence`` and return its products.

    A one-line experiment file is written from the primer pair and the
    product length bounds, ``ipcress`` is invoked on it, and the resulting
    output is copied into a new ``DNAFASTAFormat`` with simplified headers.

    Parameters
    ----------
    sequence
        Sequences to search; ``str(sequence)`` is passed to ``--sequence``.
    primer_a, primer_b
        The two primer strings written to the experiment file.
    min_product_len, max_product_len
        Product length bounds written to the experiment file.
    mismatch, memory, seed
        Forwarded to the ``--mismatch``, ``--memory`` and ``--seed`` options.

    Returns
    -------
    DNAFASTAFormat
        The filtered and re-labelled ipcress output.
    """
    sequence_fp = str(sequence)

    # Use the temporary directory as a context manager so the scratch files
    # are removed deterministically, including when the command fails.
    with tempfile.TemporaryDirectory(prefix='q2-ipcress-') as temp_dir:
        work_dir = Path(temp_dir)
        input_fp = str(work_dir / 'input.ipcress')
        output_fp = str(work_dir / 'output.ipcress')

        # Experiment file: a single space-separated record whose first field
        # is the fixed label 'q2'.
        with open(input_fp, 'w') as fp:
            fp.write(' '.join([
                'q2', primer_a, primer_b,
                str(min_product_len),
                str(max_product_len)
            ]))

        cmd = [
            'ipcress', '--input', input_fp, '--sequence', sequence_fp,
            '--mismatch',
            str(mismatch), '--memory',
            str(memory), '--seed',
            str(seed), '--pretty', 'False', '--products', 'True'
        ]

        # NOTE(review): run_command is defined elsewhere; presumably it
        # captures the command's output in output_fp -- confirm.
        run_command(cmd, output_fp)

        reads = DNAFASTAFormat()
        reads_fp = str(reads)

        # Copy the output while it still exists on disk.
        with open(output_fp) as ofp, open(reads_fp, 'w') as rfp:
            for line in ofp:
                # Drop every line that mentions 'ipcress'.
                if 'ipcress' in line:
                    continue
                if line.startswith('>'):
                    # Reduce the header to the part of its third
                    # whitespace-separated token that precedes the first ':'.
                    line = '>' + line.split()[2].split(':')[0] + '\n'
                rfp.write(line)

    return reads
Ejemplo n.º 4
0
def _mafft(sequences_fp, alignment_fp, n_threads, parttree):
    """Align protein sequences with mafft, preserving the original IDs.

    Parameters
    ----------
    sequences_fp
        Path to the FASTA file of unaligned sequences.
    alignment_fp
        Path to an existing alignment to which the sequences are added with
        ``--add``, or ``None`` to align ``sequences_fp`` on its own.
    n_threads
        Value for mafft's ``--thread`` option; ``'auto'`` is mapped to ``-1``.
    parttree
        When truthy, ``--parttree`` is passed to mafft.

    Returns
    -------
    AlignedProteinFASTAFormat
        The alignment, with sequence IDs restored from the input files.

    Raises
    ------
    ValueError
        If a sequence ID is duplicated within either input, if an ID occurs
        in both inputs, or if there are more than 1,000,000 sequences and
        ``parttree`` is not set.
    """
    # Save original sequence IDs since long ids (~250 chars) can be truncated
    # by mafft. We'll replace the IDs in the aligned sequences file output by
    # mafft with the originals.
    #
    # https://github.com/qiime2/q2-alignment/issues/37
    #
    # Dicts are used as insertion-ordered sets: the order of the keys is what
    # lets the IDs be zipped back onto the output records below.
    aligned_seq_ids = {}
    unaligned_seq_ids = {}

    # The IDs of an existing alignment must be collected too. Otherwise the
    # `--add` output contains more records than we have IDs for, and the
    # "present in both" check below can never trigger.
    if alignment_fp is not None:
        for seq in skbio.io.read(alignment_fp,
                                 format='fasta',
                                 constructor=skbio.Protein):
            id_ = seq.metadata['id']
            if id_ in aligned_seq_ids:
                raise ValueError(
                    "A sequence ID is duplicated in the aligned sequences: "
                    "%r" % id_)
            aligned_seq_ids[id_] = True

    for seq in skbio.io.read(sequences_fp,
                             format='fasta',
                             constructor=skbio.Protein):
        id_ = seq.metadata['id']
        if id_ in unaligned_seq_ids:
            raise ValueError(
                "A sequence ID is duplicated in the unaligned sequences: "
                "%r" % id_)
        elif id_ in aligned_seq_ids:
            raise ValueError(
                "A sequence ID is present in both the aligned and unaligned "
                "sequences: %r" % id_)
        else:
            unaligned_seq_ids[id_] = True

    result = AlignedProteinFASTAFormat()
    result_fp = str(result)
    # Aligned IDs first, then the newly added ones.
    # NOTE(review): this assumes `mafft --add` emits the existing alignment
    # before the added sequences -- confirm against the mafft documentation.
    ids = {**aligned_seq_ids, **unaligned_seq_ids}

    # mafft will fail if the number of sequences is larger than 1 million.
    # mafft requires using parttree which is an algorithm to build an
    # approximate tree from a large number of unaligned sequences.
    # By catching the error below if a user has not used parttree flag, we are
    # eliminating the need for the mafft error to be shown to the user which
    # can be confusing and intimidating.

    if not parttree and len(ids) > 1000000:
        raise ValueError(
            "The number of sequences in your feature table is larger than "
            "1 million, please use the parttree parameter")

    # mafft's signal for utilizing all cores is -1. We want our users to
    # enter auto for using all cores. This is to prevent any confusion and
    # to keep the UX consistent.
    if n_threads == 'auto':
        n_threads = -1

    # `--inputorder` must be turned on because we need the input and output in
    # the same sequence order to replace the IDs below. This is mafft's default
    # behavior but we pass the flag in case that changes in the future.
    cmd = [
        "mafft", "--preservecase", "--inputorder", "--thread",
        str(n_threads)
    ]

    if parttree:
        cmd += ['--parttree']

    if alignment_fp is not None:
        cmd += ['--add', sequences_fp, alignment_fp]
    else:
        cmd += [sequences_fp]

    run_command(cmd, result_fp)

    # Read output alignment into memory, reassign original sequence IDs, and
    # write alignment back to disk.
    msa = skbio.TabularMSA.read(result_fp,
                                format='fasta',
                                constructor=skbio.Protein)
    # Using `assert` because mafft would have had to add or drop sequences
    # while aligning, which would be a bug on mafft's end. This is just a
    # sanity check and is not expected to trigger in practice.
    assert len(ids) == len(msa)
    for original_id, seq in zip(ids, msa):
        seq.metadata['id'] = original_id

    # Turning off roundtripping options to speed up writing. We can safely turn
    # these options off because we know the sequence IDs are roundtrip-safe
    # since we read them from a FASTA file above.
    #
    # http://scikit-bio.org/docs/latest/generated/
    #     skbio.io.format.fasta.html#writer-specific-parameters
    msa.write(result_fp,
              id_whitespace_replacement=None,
              description_newline_replacement=None)
    return result