Exemple #1
0
def filter_seqs_length_by_taxon(
        sequences: DNAFASTAFormat,
        taxonomy: pd.Series,
        labels: str,
        min_lens: int = None,
        max_lens: int = None,
        global_min: int = None,
        global_max: int = None) -> (DNAFASTAFormat, DNAFASTAFormat):
    """
    Split sequences into "passed" and "failed" sets using length thresholds
    that are selected by taxonomic label.

    Arguments:
        sequences (DNAFASTAFormat): sequences to filter; read twice as a
            DNAIterator (once for ID validation, once for filtering)
        taxonomy (pd.Series): taxonomy entries indexed by sequence ID
        labels: collection of search terms; each term is tested with ``in``
            against a sequence's taxonomy entry and is paired positionally
            with ``min_lens`` / ``max_lens``
        min_lens: per-label minimum lengths (same number of items as labels)
        max_lens: per-label maximum lengths (same number of items as labels)
        global_min: forwarded to ``_seq_length_within_range``
        global_max: forwarded to ``_seq_length_within_range``

    Returns:
        (DNAFASTAFormat, DNAFASTAFormat): sequences that were kept, and
        sequences that failed the length check

    Raises:
        ValueError: if both ``min_lens`` and ``max_lens`` are None, or if
            either one differs in length from ``labels``
    """
    # At least one per-taxon threshold list has to be supplied.
    if min_lens is None and max_lens is None:
        raise ValueError(ERROR_FILTER_OPTIONS + 'min_lens, max_lens.')

    # First pass over the sequences: collect IDs and compare them against
    # the taxonomy index before any filtering work is done.
    observed_ids = set()
    for record in sequences.view(DNAIterator):
        observed_ids.add(record.metadata['id'])
    _index_is_superset(observed_ids, set(taxonomy.index))

    # Build label -> threshold lookups for whichever bounds were given.
    mins = None
    if min_lens is not None:
        if len(labels) != len(min_lens):
            raise ValueError(
                'labels and min_lens must contain the same number of elements')
        mins = dict(zip(labels, min_lens))

    maxs = None
    if max_lens is not None:
        if len(labels) != len(max_lens):
            raise ValueError(
                'labels and max_lens must contain the same number of elements')
        maxs = dict(zip(labels, max_lens))

    # Loop-invariant: are any global thresholds in play?
    no_global_filters = global_min is None and global_max is None

    # Second pass: stream the sequences and route each to the right output.
    result = DNAFASTAFormat()
    failures = DNAFASTAFormat()
    with result.open() as out_fasta, failures.open() as out_failed:
        for seq in sequences.view(DNAIterator):
            taxon = taxonomy[seq.metadata['id']]
            # Every matching term is collected; _seq_length_within_range
            # determines and applies the most stringent matching thresholds.
            taxahits = [term for term in labels if term in taxon]
            if no_global_filters and not any(taxahits):
                # nothing applies to this sequence, so it is kept as-is
                keep = True
            else:
                keep = _seq_length_within_range(seq, taxahits, mins, maxs,
                                                global_min, global_max)
            seq.write(out_fasta if keep else out_failed)
    return result, failures
Exemple #2
0
def scaffold_hybrid_tree_foundation_tree(
    otu_map: OtuMapFormat,
    extension_taxonomy: TSVTaxonomyFormat,
    extension_sequences: DNAFASTAFormat,
    foundation_tree: NewickFormat,
    foundation_taxonomy: TSVTaxonomyFormat,
    graft_level: str = _ghost_tree_defaults['graft_level'],
) -> NewickFormat:
    """
    Build a hybrid tree by calling ``extensions_onto_foundation`` on the
    supplied inputs and return the resulting tree text as a Newick file.

    Arguments:
        otu_map (OtuMapFormat): OTU map, passed as an open file handle
        extension_taxonomy (TSVTaxonomyFormat): passed as an open file handle
        extension_sequences (DNAFASTAFormat): passed as an open file handle
        foundation_tree (NewickFormat): passed as an open file handle
        foundation_taxonomy (TSVTaxonomyFormat): optional; when falsy,
            ``None`` is passed instead of a file handle
        graft_level (str): forwarded unchanged to
            ``extensions_onto_foundation``

    Returns:
        NewickFormat: a newly written file holding the first element of the
        value returned by ``extensions_onto_foundation``
    """
    # Function-scope import so the block does not rely on a file-level one.
    from contextlib import ExitStack

    # ExitStack closes every input handle and removes the scratch directory
    # even if extensions_onto_foundation raises.
    with ExitStack() as stack:
        otu_map_fh = stack.enter_context(otu_map.open())
        extension_taxonomy_fh = stack.enter_context(extension_taxonomy.open())
        extension_sequences_fh = stack.enter_context(
            extension_sequences.open())
        foundation_alignment_fh = stack.enter_context(foundation_tree.open())
        if foundation_taxonomy:
            foundation_taxonomy_fh = stack.enter_context(
                foundation_taxonomy.open())
        else:
            foundation_taxonomy_fh = None

        tmp = stack.enter_context(tempfile.TemporaryDirectory())

        # Scratch location handed to extensions_onto_foundation; it lives
        # inside the temporary directory and is discarded with it.
        gt_path = os.path.join(tmp, 'ghost_tree')
        thetree = extensions_onto_foundation(otu_map_fh, extension_taxonomy_fh,
                                             extension_sequences_fh,
                                             foundation_alignment_fh, gt_path,
                                             graft_level,
                                             foundation_taxonomy_fh)[0]

    # Write the tree into a format-managed file rather than to
    # ``tmp + 'ghost_tree'``: that string concatenation has no path separator,
    # so the file landed next to the temporary directory and was never
    # removed.
    result = NewickFormat()
    with result.open() as out_tree:
        out_tree.write(thetree)
    return result
Exemple #3
0
def _process_primers(primer_fwd: Union[str, None],
                     primer_rev: Union[str, None]) -> DNAFASTAFormat:
    """
    Write the supplied primers to a FASTA file as skbio DNA records. The
    reverse primer, when given, is reverse complemented first.

    Arguments:
        primer_fwd (str, None): forward primer; skipped when falsy
        primer_rev (str, None): reverse primer; skipped when falsy

    Returns:
        primers_fasta (DNAFASTAFormat): FASTA file holding the forward record
            (id 'forward') followed by the reverse record (id 'reverse'),
            for whichever primers were provided
    """
    records = []
    if primer_fwd:
        records.append(DNA(primer_fwd, metadata={'id': 'forward'}))
    if primer_rev:
        reverse = DNA(primer_rev, metadata={'id': 'reverse'})
        records.append(reverse.reverse_complement())

    # save primers in FASTA format so they can be passed to mafft_add
    primers_fasta = DNAFASTAFormat()
    with primers_fasta.open() as out:
        for record in records:
            record.write(out)

    return primers_fasta
Exemple #4
0
def _4(fmt: GISAIDDNAFASTAFormat) -> DNASequencesDirectoryFormat:
    """
    Convert a GISAID DNA FASTA file into a DNASequencesDirectoryFormat.

    The records returned by ``_read_gisaid_dna_fasta`` are written out as
    FASTA via skbio and the resulting file is stored in the directory format.
    """
    records = _read_gisaid_dna_fasta(str(fmt))

    fasta = DNAFASTAFormat()
    with fasta.open() as fasta_fh:
        skbio.io.write(records, format='fasta', into=fasta_fh)

    dir_fmt = DNASequencesDirectoryFormat()
    dir_fmt.file.write_data(fasta, DNAFASTAFormat)
    return dir_fmt
def _split_fasta(sequences, train_ids, test_ids):
    '''
    Partition a sequence artifact into train and test artifacts by ID.

    sequences: FeatureData[Sequence] Artifact (viewed as DNAIterator)
    train_ids: set of IDs routed to the train output
    test_ids: set of IDs routed to the test output

    An ID found in train_ids goes to the train output even if it is also in
    test_ids; IDs found in neither set are dropped.
    Returns a (train, test) tuple of FeatureData[Sequence] artifacts.
    '''
    train_fasta = DNAFASTAFormat()
    test_fasta = DNAFASTAFormat()
    with train_fasta.open() as train_fh, test_fasta.open() as test_fh:
        for record in sequences.view(DNAIterator):
            seq_id = record.metadata['id']
            # train membership takes precedence over test membership
            if seq_id in train_ids:
                sink = train_fh
            elif seq_id in test_ids:
                sink = test_fh
            else:
                continue
            sink.write('>%s\n%s\n' % (seq_id, str(record)))
    train_artifact = q2.Artifact.import_data('FeatureData[Sequence]',
                                             train_fasta)
    test_artifact = q2.Artifact.import_data('FeatureData[Sequence]',
                                            test_fasta)
    return train_artifact, test_artifact
def cull_seqs(sequences: DNAIterator, num_degenerates: int = 5,
              homopolymer_length: int = 8) -> DNAFASTAFormat:
    """
    Write out only the sequences that neither filter flags.

    Arguments:
        sequences (DNAIterator): sequences to screen
        num_degenerates (int): threshold passed to
            ``_filt_seq_with_degenerates``
        homopolymer_length (int): threshold passed to ``_filter_homopolymer``

    Returns:
        DNAFASTAFormat: sequences for which both checks returned a falsy
        value
    """
    result = DNAFASTAFormat()
    with result.open() as out_fasta:
        for seq in sequences:
            # the homopolymer check only runs if the degenerate check passes
            if _filt_seq_with_degenerates(seq, num_degenerates):
                continue
            if _filter_homopolymer(seq, homopolymer_length):
                continue
            seq.write(out_fasta)
    return result
Exemple #7
0
def degap_seqs(aligned_sequences: AlignedDNAIterator,
               min_length: int = 1) -> DNAFASTAFormat:
    """
    Strip gaps from aligned sequences and keep those that are long enough.

    Arguments:
        aligned_sequences (AlignedDNAIterator): aligned input sequences
        min_length (int): minimum degapped length required to be written

    Returns:
        DNAFASTAFormat: degapped sequences of at least ``min_length``
    """
    result = DNAFASTAFormat()
    with result.open() as out_fasta:
        degapped = (aligned.degap() for aligned in aligned_sequences)
        for dg_seq in degapped:
            # An all-gap sequence degaps to length zero, so with the default
            # min_length of 1 it is skipped here.
            if len(dg_seq) < min_length:
                continue
            dg_seq.write(out_fasta)
    return result
Exemple #8
0
def _4(fmt: GISAIDDNAFASTAFormat) -> DNASequencesDirectoryFormat:
    """
    Convert a GISAID DNA FASTA file into a DNASequencesDirectoryFormat.

    ``_read_gisaid_dna_fasta`` is given a scratch temporary file in addition
    to the input path; the records it returns are written as FASTA while that
    scratch file is still open.
    """
    fasta = DNAFASTAFormat()
    dir_fmt = DNASequencesDirectoryFormat()

    with fasta.open() as fasta_fh:
        with tempfile.TemporaryFile(mode='w+') as scratch_fh:
            records = _read_gisaid_dna_fasta(str(fmt), scratch_fh)
            # written inside the inner block so scratch_fh is still open
            # while the records are consumed
            skbio.io.write(records, format='fasta', into=fasta_fh)

    dir_fmt.file.write_data(fasta, DNAFASTAFormat)
    return dir_fmt
Exemple #9
0
def _convert_seq_block_to_dna_fasta_format(seqs):
    """
    Concatenate a collection of pandas objects row-wise, join each row's
    values into a single string and write the rows out as DNA FASTA records.

    Missing values introduced by the concatenation are replaced with '' so
    they contribute nothing to the joined string. The row label becomes the
    record ID.
    """
    stacked = pd.concat(objs=seqs, axis=0, sort=True)
    stacked = stacked.fillna('')
    joined = stacked.apply(lambda row: ''.join(row), axis=1)

    fasta = DNAFASTAFormat()
    with fasta.open() as fasta_fh:
        for seq_id, residues in joined.items():
            record = skbio.DNA(residues, metadata={'id': seq_id})
            skbio.io.write(record, format='fasta', into=fasta_fh)
    return fasta
Exemple #10
0
def extensions_cluster(extension_sequences: DNAFASTAFormat,
                       similarity_threshold: str) -> OtuMapFormat:
    """
    Run ``preprocess_extension_tree_sequences`` on the extension sequences
    and return the OTU map it writes.

    Arguments:
        extension_sequences (DNAFASTAFormat): input sequences; the path of
            the opened file is passed to the preprocessing step
        similarity_threshold (str): passed to the preprocessing step as a
            string

    Returns:
        OtuMapFormat: a copy of the file produced at the scratch output path
    """
    result = OtuMapFormat()

    # Both the scratch directory and the input handle are released on exit,
    # including when the preprocessing step raises.
    with tempfile.TemporaryDirectory() as tmp, \
            extension_sequences.open() as extension_sequences_fh:

        # scratch output path for the preprocessing step
        otu_map_fp = os.path.join(tmp, 'otu_map')
        preprocess_extension_tree_sequences(str(extension_sequences_fh.name),
                                            str(similarity_threshold),
                                            otu_map_fp)

        # Copy into the format-managed file before the scratch directory is
        # removed. The previous target, ``tmp + 'otu_map'``, had no path
        # separator and therefore leaked a file outside the temporary
        # directory.
        copyfile(otu_map_fp, str(result))

    return result
Exemple #11
0
def _dereplicate_taxa(taxa, raw_seqs, derep_seqs, uc, mode):
    """
    Derive taxonomy (and, in 'uniq' mode, sequences) for dereplicated
    clusters described by ``uc``.

    Arguments:
        taxa: taxonomy labels looked up with ``.loc`` by sequence ID
        raw_seqs: sequences before dereplication; ``str(raw_seqs)`` is read
            as a FASTA path in 'uniq' mode
        derep_seqs: dereplicated sequences; returned unchanged in every mode
            other than 'uniq'
        uc: DataFrame with at least 'seqID' and 'centroidID' columns
        mode: 'uniq', 'lca', 'super' or 'majority'

    Returns:
        (derep_taxa, seqs_out): the dereplicated taxonomy, with its index
        named 'Feature ID', and the sequences that go with it

    NOTE(review): 'Taxon' and 'centroidtaxa' are assigned onto ``uc``
    directly. Outside 'uniq' mode that is the caller's object; in 'uniq'
    mode it is a filtered frame, which pandas may flag with a
    SettingWithCopyWarning -- confirm this is intended.
    """
    # we only want to grab hits for uniq mode
    if mode == 'uniq':
        # capture every centroid ID before the centroid rows are filtered out
        centroid_ids = set(uc['centroidID'].unique())
        # keep only rows whose sequence is not its own centroid
        uc = uc[uc['seqID'] != uc['centroidID']]
    # map to taxonomy labels
    uc['Taxon'] = uc['seqID'].apply(lambda x: taxa.loc[x])
    uc['centroidtaxa'] = uc['centroidID'].apply(lambda x: taxa.loc[x])

    if mode == 'uniq':
        # filter out hits that do not match centroid taxonomy
        rereplicates = uc[uc['Taxon'] != uc['centroidtaxa']]
        # drop duplicates that share centroid ID and taxa assignment
        rereplicates = rereplicates.drop_duplicates(['centroidID', 'Taxon'])
        # grab associated seqs
        rereplicate_ids = centroid_ids.union(rereplicates['seqID'].unique())
        # write out seqs for centroids and daughters with unique taxonomies
        seqs_out = DNAFASTAFormat()
        with seqs_out.open() as out_fasta:
            seq_fp = str(raw_seqs)
            for s in skbio.read(seq_fp, format='fasta', constructor=skbio.DNA):
                if s.metadata['id'] in rereplicate_ids:
                    s.write(out_fasta)
        # generate list of dereplicated taxa
        # (rereplicate_ids is a set, so it carries no particular order)
        derep_taxa = taxa.reindex(rereplicate_ids)

    else:
        # group seqs that share centroids (this includes the centroid)
        derep_taxa = uc.groupby('centroidID')['Taxon'].apply(lambda x: list(x))
        # find LCA within each cluster
        if mode == 'lca':
            derep_taxa = derep_taxa.apply(lambda x: ';'.join(
                _find_lca([y.split(';') for y in x]))).to_frame()
        # find majority superset LCA within each cluster
        elif mode == 'super':
            derep_taxa = derep_taxa.apply(lambda x: ';'.join(
                _find_super_lca([y.split(';') for y in x]))).to_frame()
        # find majority taxon within each cluster
        elif mode == 'majority':
            derep_taxa = derep_taxa.apply(lambda x: _majority(x)).to_frame()
        # any other mode value leaves derep_taxa as the grouped lists of taxa
        # LCA and majority do nothing with the seqs
        seqs_out = derep_seqs

    # gotta please the type validator gods
    derep_taxa.index.name = 'Feature ID'

    return derep_taxa, seqs_out
Exemple #12
0
def orient_seqs(
    sequences: DNAFASTAFormat,
    reference_sequences: DNAFASTAFormat,
    perc_identity: float = 0.9,
    query_cov: float = 0.9,
    threads: int = 1,
    left_justify: bool = False,
) -> (DNAFASTAFormat, DNAFASTAFormat):
    """
    Orient query sequences relative to a reference database using
    ``vsearch --usearch_global``.

    Arguments:
        sequences (DNAFASTAFormat): query sequences
        reference_sequences (DNAFASTAFormat): reference database
        perc_identity (float): value passed to ``--id``
        query_cov (float): value passed to ``--query_cov``
        threads (int): value passed to ``--threads``
        left_justify (bool): when true, ``--leftjust`` is added

    Returns:
        (DNAFASTAFormat, DNAFASTAFormat): matched sequences, with those
        reported on the '-' strand reverse complemented, and the sequences
        vsearch wrote to ``--notmatched``
    """
    matched_temp = DNAFASTAFormat()
    notmatched = DNAFASTAFormat()

    # vsearch reports the strand of each query hit in the userout file.
    with tempfile.NamedTemporaryFile() as out:
        # note: qmask is disabled as DNAFASTAFormat requires all output seqs
        # to be uppercase.
        cmd = ['vsearch']
        cmd += ['--usearch_global', str(sequences)]
        cmd += ['--matched', str(matched_temp)]
        cmd += ['--notmatched', str(notmatched)]
        cmd += ['--db', str(reference_sequences)]
        cmd += ['--id', str(perc_identity)]
        cmd += ['--maxaccepts', '1']
        cmd += ['--strand', 'both']
        cmd += ['--qmask', 'none']
        cmd += ['--query_cov', str(query_cov)]
        cmd += ['--threads', str(threads)]
        cmd += ['--userfields', 'qstrand']
        cmd += ['--userout', out.name]
        if left_justify:
            cmd.append('--leftjust')
        run_command(cmd)
        with open(out.name, 'r') as orient:
            orientations = [line.strip() for line in orient]

    # Nothing is in reverse orientation: the vsearch output is used as-is.
    if '-' not in orientations:
        return matched_temp, notmatched

    # Otherwise rewrite the matched seqs, flipping the '-' strand ones.
    # Entries whose strand is neither '+' nor '-' are not written.
    matched = DNAFASTAFormat()
    with matched.open() as out_fasta:
        paired = zip(matched_temp.view(DNAIterator), orientations)
        for seq, strand in paired:
            if strand == '-':
                seq = seq.reverse_complement()
            elif strand != '+':
                continue
            seq.write(out_fasta)

    return matched, notmatched
Exemple #13
0
def _rna_to_dna(path):
    """
    Reverse transcribe every record yielded by ``_read_rna_fasta(path)`` and
    write the results to a new DNAFASTAFormat, which is returned.
    """
    dna_fasta = DNAFASTAFormat()
    with dna_fasta.open() as dna_fh:
        dna_records = (rna.reverse_transcribe()
                       for rna in _read_rna_fasta(path))
        for record in dna_records:
            record.write(dna_fh)
    return dna_fasta