Esempio n. 1
0
    def test_dna_fasta_format_empty_file(self):
        # A file holding only a newline is expected to validate cleanly.
        empty_fp = os.path.join(self.temp_dir.name, 'empty')
        with open(empty_fp, 'w') as handle:
            handle.write('\n')

        fasta_fmt = DNAFASTAFormat(empty_fp, mode='r')
        fasta_fmt.validate()
 def _load_DNAFASTAFormat(reads_fn):
     # Open the named data file read-only as a DNAFASTAFormat.
     # NOTE(review): `self` is not a parameter here, so this presumably
     # lives inside a test method and closes over its `self` -- confirm.
     return DNAFASTAFormat(self.get_data_path(reads_fn), mode='r')
Esempio n. 3
0
 def setUp(self):
     # Expose the 'cleanseq-test-1.fasta' data file as a DNAIterator view.
     super().setUp()
     fasta_fmt = DNAFASTAFormat(
         self.get_data_path('cleanseq-test-1.fasta'), mode='r')
     self.seqs1 = fasta_fmt.view(DNAIterator)
Esempio n. 4
0
    def test_dna_fasta_format_no_id(self):
        # Validation must fail with a message matching '1.*missing an ID'.
        fasta_fmt = DNAFASTAFormat(
            self.get_data_path('dna-sequences-no-id.fasta'), mode='r')

        with self.assertRaisesRegex(ValidationError, '1.*missing an ID'):
            fasta_fmt.validate()
Esempio n. 5
0
    def test_dna_fasta_format_duplicate_ids(self):
        # Validation must fail with a message matching '3.*duplicate.*1'.
        fasta_fmt = DNAFASTAFormat(
            self.get_data_path('dna-sequences-duplicate-id.fasta'), mode='r')

        with self.assertRaisesRegex(ValidationError, '3.*duplicate.*1'):
            fasta_fmt.validate()
Esempio n. 6
0
def _2(db: Bowtie2IndexDirFmt) -> DNAFASTAFormat:
    """Dump a Bowtie2 index to FASTA by running ``bowtie2-inspect``.

    The index is addressed as ``db.path / db.get_basename()`` and the
    command's standard output is redirected into a new DNAFASTAFormat,
    which is returned.
    """
    result = DNAFASTAFormat()
    bowtie2_inspect_cmd = ['bowtie2-inspect', str(db.path / db.get_basename())]
    # Use a context manager so the output handle is flushed and closed once
    # the command finishes instead of being left open.
    with open(str(result), 'w') as out_fh:
        run_command(bowtie2_inspect_cmd, stdout=out_fh)
    return result
Esempio n. 7
0
    def test_dna_fasta_format_validate_positive(self):
        # 'dna-sequences.fasta' is expected to validate without raising.
        fasta_fmt = DNAFASTAFormat(
            self.get_data_path('dna-sequences.fasta'), mode='r')

        fasta_fmt.validate()
Esempio n. 8
0
    def test_dna_fasta_format_bom_passes(self):
        # 'dna-with-bom-passes.fasta' is expected to validate without raising.
        fasta_fmt = DNAFASTAFormat(
            self.get_data_path('dna-with-bom-passes.fasta'), mode='r')

        fasta_fmt.validate()
Esempio n. 9
0
def cluster_features_closed_reference(sequences: DNAFASTAFormat,
                                      table: biom.Table,
                                      reference_sequences: DNAFASTAFormat,
                                      perc_identity: float,
                                      strand: str = 'plus',
                                      threads: int = 1
                                      ) -> (biom.Table, DNAFASTAFormat,
                                            DNAFASTAFormat):
    """Cluster features against a reference with ``vsearch --usearch_global``.

    Parameters
    ----------
    sequences : DNAFASTAFormat
        Query sequences. Their IDs and the table's observation IDs are
        handed to ``_error_on_nonoverlapping_ids`` before any clustering.
    table : biom.Table
        Feature table. It is filtered in place to drop the unmatched
        features and then collapsed into the returned table.
    reference_sequences : DNAFASTAFormat
        Passed to vsearch as ``--db``.
    perc_identity : float
        Passed to vsearch as ``--id``.
    strand : str, optional
        Passed to vsearch as ``--strand``.
    threads : int, optional
        Passed to vsearch as ``--threads``.

    Returns
    -------
    biom.Table
        The table after removing unmatched features and collapsing the
        remaining observations.
    DNAFASTAFormat
        Matched sequences, written by ``_fasta_from_sqlite``.
    DNAFASTAFormat
        Unmatched sequences with size annotations stripped. Nothing is
        written to it when vsearch reports no unmatched sequences.

    Raises
    ------
    VSearchError
        If processing the vsearch ``.uc`` output raises ``ValueError``,
        which is reported as no matches having been found.
    """
    table_ids = set(table.ids(axis='observation'))
    sequence_ids = {e.metadata['id'] for e in skbio.io.read(
                    str(sequences), constructor=skbio.DNA, format='fasta')}
    _error_on_nonoverlapping_ids(table_ids, sequence_ids)
    matched_seqs, unmatched_seqs = DNAFASTAFormat(), DNAFASTAFormat()

    with tempfile.NamedTemporaryFile() as fasta_with_sizes, \
            tempfile.NamedTemporaryFile() as out_uc, \
            tempfile.NamedTemporaryFile() as tmp_unmatched_seqs:
        _fasta_with_sizes(str(sequences), fasta_with_sizes.name, table)
        cmd = ['vsearch',
               '--usearch_global', fasta_with_sizes.name,
               '--id', str(perc_identity),
               '--db', str(reference_sequences),
               '--uc', out_uc.name,
               '--strand', str(strand),
               '--qmask', 'none',  # ensures no lowercase DNA chars
               '--notmatched', tmp_unmatched_seqs.name,
               '--threads', str(threads)]
        run_command(cmd)
        out_uc.seek(0)

        # It is possible for there to be no unmatched sequences --- if that
        # is the case, skip the following clean-up.
        if os.path.getsize(tmp_unmatched_seqs.name) > 0:
            # We don't really need to sort the matched sequences, this
            # is just to let us use --xsize, which strips the counts from
            # the Feature ID. It would be more ideal if --usearch_global,
            # above let us pass in --xsize, but unfortunately it isn't
            # supported.
            cmd = ['vsearch',
                   '--sortbysize', tmp_unmatched_seqs.name,
                   '--xsize',
                   '--output', str(unmatched_seqs)]
            run_command(cmd)

        try:
            conn = _uc_to_sqlite(out_uc)
            collapse_f = _collapse_f_from_sqlite(conn)
            _fasta_from_sqlite(conn, str(sequences), str(matched_seqs))
        except ValueError:
            raise VSearchError('No matches were identified to '
                               'reference_sequences. This can happen if '
                               'sequences are not homologous to '
                               'reference_sequences, or if sequences are '
                               'not in the same orientation as reference_'
                               'sequences (i.e., if sequences are reverse '
                               'complemented with respect to reference '
                               'sequences). Sequence orientation can be '
                               'adjusted with the strand parameter.')

        # Read the IDs inside a context manager so the handle on the
        # unmatched-sequences file is closed rather than leaked.
        with open(str(unmatched_seqs)) as unmatched_fh:
            unmatched_ids = [e.metadata['id']
                             for e in skbio.io.read(unmatched_fh,
                                                    constructor=skbio.DNA,
                                                    format='fasta')]
    table.filter(ids_to_keep=unmatched_ids, invert=True, axis='observation',
                 inplace=True)
    table = table.collapse(collapse_f, norm=False, min_group_size=1,
                           axis='observation',
                           include_collapsed_metadata=False)

    return table, matched_seqs, unmatched_seqs
Esempio n. 10
0
def _rna_to_dna(iterator):
    """Write the output of ``_rna_to_dna_iterator(iterator)`` as FASTA.

    Returns the newly created DNAFASTAFormat the records were written to.
    """
    dna_fasta = DNAFASTAFormat()
    converted = _rna_to_dna_iterator(iterator)
    skbio.io.write(iter(converted), format='fasta', into=str(dna_fasta))
    return dna_fasta
 def setUp(self):
     # Expose the packaged 'derep-test.fasta' file as a DNAIterator view.
     super().setUp()
     fixture_fp = pkg_resources.resource_filename(
         'rescript.tests', 'data/derep-test.fasta')
     fasta_fmt = DNAFASTAFormat(fixture_fp, mode='r')
     self.dna_seqs = fasta_fmt.view(DNAIterator)
Esempio n. 12
0
def _rna_to_dna(path):
    """Reverse-transcribe each record from ``_read_rna_fasta(path)``.

    Every record is written, in order, to a newly created DNAFASTAFormat,
    which is returned.
    """
    dna_fasta = DNAFASTAFormat()
    with dna_fasta.open() as out_fh:
        for rna_record in _read_rna_fasta(path):
            dna_record = rna_record.reverse_transcribe()
            dna_record.write(out_fh)
    return dna_fasta
Esempio n. 13
0
 def setUp(self):
     # Load the taxonomy table and the single-end reads used by the tests.
     super().setUp()
     self.taxonomy_fp = self.get_data_path('taxonomy.tsv')
     # `pd.Series.from_csv` was deprecated in pandas 0.21 and removed in
     # 1.0. This is the equivalent `read_csv` call using from_csv's
     # defaults: no header row, first column as index, dates parsed, first
     # data column returned with its name and index name cleared.
     taxonomy = pd.read_csv(self.taxonomy_fp, sep='\t', header=None,
                            index_col=0, parse_dates=True).iloc[:, 0]
     taxonomy.index.name = taxonomy.name = None
     self.taxonomy = taxonomy
     self.reads_fp = self.get_data_path('se-dna-sequences.fasta')
     self.reads = DNAFASTAFormat(self.reads_fp, mode='r')
Esempio n. 14
0
def align_regional_kmers(kmers: DNAFASTAFormat,
                         rep_seq: pd.Series,
                         region: str,
                         max_mismatch: int = 2,
                         chunk_size: int = 100,
                         debug: bool = False,
                         n_workers: int = 1,
                         client_address: str = None) -> KmerAlignFormat:
    """
    Performs regional alignment between database "kmers" and ASVs

    Parameters
    ----------
    kmers : DNAFASTAFormat
        The set of reference sequences extracted from the database. These
        are assumed to start at the same position of the 16s rRNA sequence
        as the sequences being tested and to be the same length as the ASVs
        being aligned.
    rep_seq : pd.Series
        The representative sequences for the regional ASV table being
        aligned. These are assumed to start at the same position as the
        kmers and should be trimmed to the same length.
    region : str
        An identifier for the region. Ideally, this matches the identifier
        used in the reference region map. It is written to the `region`
        column of the output.
    max_mismatch : int, optional
        The maximum number of mismatched nucleotides allowed in mapping
        between a sequence and kmer. It is passed to `_align_kmers` and
        written to the `max-mismatch` column of the output.
    chunk_size : int, optional
        The dask partition size used for both the kmers and the
        representative sequences. The kmers are read from file in batches
        of `chunk_size * 100` sequences.
    debug : bool, optional
        Whether the function should be run in debug mode (without a client)
        or not. `debug` supersedes all other client options.
    n_workers : int, optional
        The number of jobs to initiate. When `n_workers` is 0, the cluster
        will be able to access all available resources.
    client_address : str, optional
        Forwarded to `_setup_dask_client` as `address`.

    Returns
    -------
    KmerAlignFormat
        A tab-separated file holding the concatenated `_align_kmers`
        results for every kmer batch, with `region` and `max-mismatch`
        columns added. The header is written once, with the first batch.

    Raises
    ------
    ValueError
        If the kmers in the first batch and the ASVs differ in length.
        Later batches are not length-checked.
    """
    # Sets up the client
    _setup_dask_client(debug=debug, cluster_config=None,
                       n_workers=n_workers, address=client_address)

    # Converts the representative sequences to a delayed object
    _, asv_length = _check_read_lengths(rep_seq, 'rep_seq')
    rep_seq = dd.from_pandas(rep_seq.astype(str),
                             chunksize=chunk_size)

    ff = KmerAlignFormat()

    # Performs the alignment
    for i, batch in enumerate(_chunks(kmers.view(DNAIterator),
                                      chunk_size * 100)):
        first_batch = (i == 0)
        batch = pd.Series({s.metadata['id']: str(s) for s in batch})

        if first_batch:
            _, kmer_length = _check_read_lengths(batch, 'kmer')

            if kmer_length != asv_length:
                raise ValueError('The kmer and ASV sequences must be the'
                                 ' same length')
        batch = dd.from_pandas(batch, chunksize=chunk_size)

        # One delayed alignment per (kmer partition, ASV partition) pair
        aligned_batch = np.hstack([
            dask.delayed(_align_kmers)(kmer, asv, max_mismatch)
            for kmer, asv in it.product(batch.to_delayed(), rep_seq.to_delayed())
            ])

        aligned_batch = pd.concat(axis=0, objs=dask.compute(*aligned_batch))

        aligned_batch['region'] = region
        aligned_batch['max-mismatch'] = max_mismatch

        # The first batch creates the file and writes the header; later
        # batches append their rows without one.
        aligned_batch.to_csv(str(ff), sep='\t', index=False,
                             header=first_batch,
                             mode='w' if first_batch else 'a')

    return ff
Esempio n. 15
0
def prepare_extracted_region(sequences: DNAFASTAFormat,
                             region: str,
                             trim_length: int,
                             fwd_primer: str,
                             rev_primer: str,
                             reverse_complement_rev: bool = True,
                             reverse_complement_result: bool = False,
                             chunk_size: int = 10000,
                             debug: bool = False,
                             n_workers: int = 1,
                             client_address: str = None,
                             ) -> (DNAFASTAFormat, pd.DataFrame):
    """
    Prepares an extracted database for regional alignment

    This function takes an amplified region of the database, expands the
    degenerate sequences and collapses the duplicated sequences under a
    single id that can be untangled later.

    Parameters
    ----------
    sequences : DNAFASTAFormat
        The regional sequences to be collapsed
    region : str
        A unique name for the region being handled
    trim_length : int
        The length of the final sequences, to match the trimmed kmers for
        kmer-based alignment.
    fwd_primer, rev_primer : str
        The primers, passed on to `_expand_ids`.
    reverse_complement_rev : bool, optional
        When true, `rev_primer` is reverse complemented before use.
    reverse_complement_result : bool, optional
        Passed on to `_collapse_all_sequences`.
    chunk_size : int, optional
        The number of sequences to group for analysis
    debug : bool, optional
        Whether the function should be run in debug mode (without a client)
        or not. `debug` supersedes all other client options.
    n_workers : int, optional
        The number of jobs to initiate. When `n_workers` is 0, the cluster
        will be able to access all available resources.
    client_address : str, optional
        Forwarded to `_setup_dask_client` as `address`.

    Returns
    -------
    DNAFASTAFormat
        The reads with degenerate nucleotides expanded and duplicated
        sequences collapsed.
    DataFrame
        A mapping between the kmer sequence name and the full database
        sequence name, along with regional information. It is indexed by
        `db-seq` and sorted on that index.
    """
    # Sets up the client
    _setup_dask_client(debug=debug, cluster_config=None,
                       n_workers=n_workers, address=client_address)

    # Puts the reverse primer into the requested orientation
    if reverse_complement_rev:
        rev_primer = str(DNA(rev_primer).reverse_complement())

    # Builds one lazy block -> trim -> condense pipeline per chunk of reads
    seq_iter = sequences.view(DNAIterator)
    condensed_parts = []
    for chunk in _chunks(seq_iter, int(chunk_size)):
        block = dask.delayed(_block_seqs)(chunk)
        # Makes the fake extraction position based on the trim length
        trimmed = dask.delayed(_artifical_trim)(block, trim_length)
        # Prepares the amplicon for collapsing
        condensed_parts.append(dask.delayed(_condense_seqs)(trimmed))

    condensed = dd.from_delayed(
        condensed_parts,
        meta=[('amplicon', 'str'), ('seq-name', 'str')]
    )

    # Collapses the sequences and builds the id mapping
    collapsed_fasta, grouped = _collapse_all_sequences(
        condensed, reverse_complement_result)
    id_map = _expand_ids(grouped, fwd_primer, rev_primer, region,
                         trim_length, chunk_size)
    id_table = id_map.compute().set_index('db-seq').sort_index()

    return (collapsed_fasta, id_table)
Esempio n. 16
0
    def test_dna_fasta_format_validate_positive(self):
        # 'dna-sequences.fasta' is expected to validate without raising.
        fasta_fmt = DNAFASTAFormat(
            self.get_data_path('dna-sequences.fasta'), mode='r')

        fasta_fmt.validate()
Esempio n. 17
0
    def test_dna_fasta_format_missing_initial_ID(self):
        # Validation must fail with a message matching 'First line'.
        fasta_fmt = DNAFASTAFormat(
            self.get_data_path('dna-sequences-first-line-not-id.fasta'),
            mode='r')

        with self.assertRaisesRegex(ValidationError, 'First line'):
            fasta_fmt.validate()
Esempio n. 18
0
    def test_dna_fasta_format_validate_negative(self):
        # Validation must fail with a ValidationError matching 'DNAFASTA'.
        fasta_fmt = DNAFASTAFormat(
            self.get_data_path('not-dna-sequences'), mode='r')

        with self.assertRaisesRegex(ValidationError, 'DNAFASTA'):
            fasta_fmt.validate()
Esempio n. 19
0
    def test_dna_fasta_format_corrupt_characters(self):
        # Validation must fail with a message matching 'utf-8.*2'.
        fasta_fmt = DNAFASTAFormat(
            self.get_data_path('dna-sequences-corrupt-characters.fasta'),
            mode='r')

        with self.assertRaisesRegex(ValidationError, 'utf-8.*2'):
            fasta_fmt.validate()
Esempio n. 20
0
    def test_dna_fasta_format_validate_negative(self):
        # Validation must fail with a ValueError matching 'DNAFASTA'.
        fasta_fmt = DNAFASTAFormat(
            self.get_data_path('not-dna-sequences'), mode='r')

        with self.assertRaisesRegex(ValueError, 'DNAFASTA'):
            fasta_fmt.validate()
Esempio n. 21
0
    def test_dna_fasta_format_bom_fails(self):
        # Validation must fail with a message matching 'First line'.
        fasta_fmt = DNAFASTAFormat(
            self.get_data_path('dna-with-bom-fails.fasta'), mode='r')

        with self.assertRaisesRegex(ValidationError, 'First line'):
            fasta_fmt.validate()