Example #1
0
    def test_invalid_files(self):
        """Every invalid qseq fixture must fail with the expected error.

        ``self.invalid_files`` is unpacked as
        ``(path, list_of_kwargs, expected_message_fragments, error_type)``.
        Each kwargs dict is tried against each sequence class.
        """
        for constructor in [BiologicalSequence, NucleotideSequence,
                            DNASequence, RNASequence, ProteinSequence]:
            for invalid, kwargs, errors, etype in self.invalid_files:
                for kwarg in kwargs:
                    _drop_kwargs(kwarg, "constructor", "filter")

                    # One read() per assertRaises block: the context manager
                    # exits at the first exception, so wrapping the whole
                    # kwargs loop would only ever exercise the first entry.
                    with self.assertRaises(etype) as cm:
                        read(invalid, format="qseq", verify=False,
                             into=constructor, **kwarg)
                    for e in errors:
                        self.assertIn(e, str(cm.exception))
Example #2
0
    def test_invalid_files(self):
        """Every invalid qseq fixture must fail with the expected error.

        ``self.invalid_files`` is unpacked as
        ``(path, list_of_kwargs, expected_message_fragments, error_type)``.
        Each kwargs dict is tried against each sequence class.
        """
        for constructor in [Sequence, DNA, RNA, Protein]:
            for invalid, kwargs, errors, etype in self.invalid_files:
                for kwarg in kwargs:
                    _drop_kwargs(kwarg, 'constructor', 'filter')

                    # One read() per assertRaises block: the context manager
                    # exits at the first exception, so wrapping the whole
                    # kwargs loop would only ever exercise the first entry.
                    with self.assertRaises(etype) as cm:
                        read(invalid, format='qseq', verify=False,
                             into=constructor, **kwarg)
                    for e in errors:
                        self.assertIn(e, str(cm.exception))
Example #3
0
    def test_invalid_files(self):
        """Every invalid qseq fixture must fail with the expected error.

        ``self.invalid_files`` is unpacked as
        ``(path, list_of_kwargs, expected_message_fragments, error_type)``.
        Each kwargs dict is tried against each sequence class.
        """
        for constructor in [BiologicalSequence, NucleotideSequence,
                            DNASequence, RNASequence, ProteinSequence]:
            for invalid, kwargs, errors, etype in self.invalid_files:
                for kwarg in kwargs:
                    _drop_kwargs(kwarg, 'constructor', 'filter')

                    # One read() per assertRaises block: the context manager
                    # exits at the first exception, so wrapping the whole
                    # kwargs loop would only ever exercise the first entry.
                    with self.assertRaises(etype) as cm:
                        read(invalid, format='qseq', verify=False,
                             into=constructor, **kwarg)
                    for e in errors:
                        self.assertIn(e, str(cm.exception))
Example #4
0
    def test_dna_iterator_to_dna_fasta_format(self):
        """A DNAIterator must round-trip through the DNAFASTAFormat transformer."""
        transformer = self.get_transformer(DNAIterator, DNAFASTAFormat)
        filepath = self.get_data_path('dna-sequences.fasta')
        generator = skbio.read(filepath, format='fasta', constructor=skbio.DNA)

        obs = transformer(DNAIterator(generator))
        self.assertIsInstance(obs, DNAFASTAFormat)

        # The generator handed to the transformer is single-use and has
        # presumably been consumed by it, so the expected records are re-read
        # from the fixture instead of re-iterating the input object (which
        # would make the comparison loop run zero times).
        obs_seqs = list(
            skbio.read(str(obs), format='fasta', constructor=skbio.DNA))
        exp_seqs = list(
            skbio.read(filepath, format='fasta', constructor=skbio.DNA))

        # zip() truncates silently; check the record counts explicitly.
        self.assertEqual(len(obs_seqs), len(exp_seqs))
        for act, exp in zip(obs_seqs, exp_seqs):
            self.assertEqual(act, exp)
Example #5
0
    def test_dna_iterator_to_dna_fasta_format(self):
        """A DNAIterator must round-trip through the DNAFASTAFormat transformer."""
        transformer = self.get_transformer(DNAIterator, DNAFASTAFormat)
        filepath = self.get_data_path('dna-sequences.fasta')
        generator = skbio.read(filepath, format='fasta', constructor=skbio.DNA)

        obs = transformer(DNAIterator(generator))
        self.assertIsInstance(obs, DNAFASTAFormat)

        # The generator handed to the transformer is single-use and has
        # presumably been consumed by it, so the expected records are re-read
        # from the fixture instead of re-iterating the input object (which
        # would make the comparison loop run zero times).
        obs_seqs = list(
            skbio.read(str(obs), format='fasta', constructor=skbio.DNA))
        exp_seqs = list(
            skbio.read(filepath, format='fasta', constructor=skbio.DNA))

        # zip() truncates silently; check the record counts explicitly.
        self.assertEqual(len(obs_seqs), len(exp_seqs))
        for act, exp in zip(obs_seqs, exp_seqs):
            self.assertEqual(act, exp)
Example #6
0
    def test_pair_dna_sequences_directory_format_to_pair_dna_iterator(self):
        """The paired directory format must transform into a PairedDNAIterator
        yielding the same (left, right) records as the fixture files."""
        left_fn = 'left-dna-sequences.fasta'
        right_fn = 'right-dna-sequences.fasta'
        _, observed = self.transform_format(
            PairedDNASequencesDirectoryFormat,
            PairedDNAIterator,
            filenames=(left_fn, right_fn))

        # Expected pairs are read straight from the fixture files.
        expected_left = skbio.read(self.get_data_path(left_fn),
                                   format='fasta', constructor=skbio.DNA)
        expected_right = skbio.read(self.get_data_path(right_fn),
                                    format='fasta', constructor=skbio.DNA)
        expected_pairs = zip(expected_left, expected_right)

        for actual_pair, expected_pair in zip(observed, expected_pairs):
            self.assertEqual(actual_pair, expected_pair)
        self.assertIsInstance(observed, PairedDNAIterator)
Example #7
0
    def test_pair_dna_sequences_directory_format_to_pair_dna_iterator(self):
        """The paired directory format must transform into a PairedDNAIterator
        yielding the same (left, right) records as the fixture files."""
        left_fn = 'left-dna-sequences.fasta'
        right_fn = 'right-dna-sequences.fasta'
        _, observed = self.transform_format(
            PairedDNASequencesDirectoryFormat,
            PairedDNAIterator,
            filenames=(left_fn, right_fn))

        # Expected pairs are read straight from the fixture files.
        expected_left = skbio.read(self.get_data_path(left_fn),
                                   format='fasta', constructor=skbio.DNA)
        expected_right = skbio.read(self.get_data_path(right_fn),
                                    format='fasta', constructor=skbio.DNA)
        expected_pairs = zip(expected_left, expected_right)

        for actual_pair, expected_pair in zip(observed, expected_pairs):
            self.assertEqual(actual_pair, expected_pair)
        self.assertIsInstance(observed, PairedDNAIterator)
Example #8
0
def annotate(in_fp,
             in_fmt,
             out_dir,
             out_fmt,
             cpus,
             kingdom,
             force,
             config,
             cache=False):
    '''Annotate every sequence of the input file and write the result.

    The output file is named after the input file's base name with
    ``out_fmt`` as its extension and is created inside ``out_dir``.

    Parameters
    ----------
    in_fp : str
        Input file path.
    in_fmt : str
        Input file format.
    out_dir : str
        Output file directory.
    out_fmt : str
        Output file format (also used as the output file extension).
    cpus : int
        Number of cpus to use. Not referenced in this function body.
    kingdom : str
        Kingdom index corresponding to database (i.e. virus, bacteria ...)
    force : boolean
        Force to overwrite.
    config : ``micronota.config.Configuration``
        Container for configuration options.
    cache : boolean
        When truthy, a fresh ``DiamondCache`` is created and threaded
        through ``annotate_all_cds`` for each sequence; otherwise ``None``
        is passed.
    '''
    _overwrite(out_dir, overwrite=force)
    makedirs(out_dir, exist_ok=force)
    stem, _ = splitext(basename(in_fp))
    out_fp = join(out_dir, '{p}.{f}'.format(p=stem, f=out_fmt))

    diamond_cache = DiamondCache() if cache else None

    with open(out_fp, 'w') as out:
        for seq in read(in_fp, format=in_fmt):
            # Per-sequence directory for intermediate files; every
            # non-alphanumeric character of the ID becomes "_".
            safe_id = ''.join(ch if ch.isalnum() else '_'
                              for ch in seq.metadata['id'])
            seq_dir = join(out_dir, safe_id)

            features = identify_all_features(seq, seq_dir, config)
            # The cache is handed in and the (possibly updated) cache is
            # taken back for the next sequence.
            features, diamond_cache = annotate_all_cds(features,
                                                       seq_dir,
                                                       kingdom,
                                                       config,
                                                       cache=diamond_cache)

            seq.interval_metadata.concat(IntervalMetadata(features),
                                         inplace=True)
            seq.write(out, format=out_fmt)
Example #9
0
 def setUp(self):
     """Prepare the blast fixture triple and a DiamondCache built from
     the sequences in ``cache.faa``."""
     super().setUp()
     aligner, query_fn = 'blastp', 'WP_009885814.faa'
     query_fp = get_data_path(query_fn)
     expected_fp = _get_named_data_path('%s.diamond' % query_fn)
     self.blast = (aligner, query_fp, expected_fp)

     cached_seqs = skbio.read(_get_named_data_path('cache.faa'),
                              format='fasta')
     self.cache = DiamondCache(list(cached_seqs))
Example #10
0
def fungi_from_fasta(fasta_fh, accession_fh, taxonomy_fh):
    """Filter SILVA sequences to keep only fungi.

    Each sequence ID is looked up in the accession map; the sequence is
    yielded only when the resulting mapping number is present in the
    taxonomy map. Which taxa end up in the taxonomy map (presumably fungal
    entries resolved to genus rank) is decided by ``_parse_taxonomy_map``.

    Parameters
    ----------
    fasta_fh : filehandle
        Fasta file of aligned or unaligned SILVA sequences. Each sequence
        identifier must be an accession number.
    accession_fh : filehandle
        A tab-separated file mapping accession numbers to mapping numbers.
        This file should contain exactly two columns: accession number and
        mapping number.
    taxonomy_fh : filehandle
        A tab-separated file that identifies the taxonomy and rank of a
        mapping number in `accession_fh`. This file should contain exactly
        five columns beginning with taxonomy, mapping number and rank. The
        last two columns are ignored.

    Returns
    -------
    generator
        Yields the sequence objects produced by ``skbio.read`` that pass
        the filter.

    Notes
    -----
    The accession map is indexed directly, so a sequence ID missing from
    it raises instead of being skipped (assuming a plain mapping is
    returned by ``_parse_accession_map``).

    """
    accession_to_map_num = _parse_accession_map(accession_fh)
    kept_map_nums = _parse_taxonomy_map(taxonomy_fh)
    for record in skbio.read(fasta_fh, format="fasta"):
        if accession_to_map_num[record.id] not in kept_map_nums:
            continue
        yield record
Example #11
0
 def _parse_fasta_dictionary(self):
     """Return ``{sequence id: self.sequence_type(record)}`` for every
     record in the FASTA file at ``self.fasta_path``.

     If an ID occurs more than once, the last record wins.
     """
     make_sequence = self.sequence_type
     return {
         record.metadata["id"]: make_sequence(record)
         for record in read(self.fasta_path, format="fasta")
     }
Example #12
0
    def filter_fasta(exp, filename, negate=False, inplace=False):
        '''Filter features from experiment based on fasta file

        Sequences are upper-cased before being matched against
        ``exp.feature_metadata.index``. A sequence that appears several
        times in the fasta file is only considered once, so the result never
        contains a feature more than once.

        Parameters
        ----------
        exp : Experiment
            the experiment whose features are filtered
        filename : str
            the fasta filename containing the sequences to use for filtering
        negate : bool (optional)
            False (default) to keep only sequences matching the fasta file,
            True to remove sequences in the fasta file.
        inplace : bool (optional)
            False (default) to create a copy of the experiment, True to
            filter inplace

        Returns
        -------
        newexp : Experiment
            filtered so contains only sequence present in exp and in the
            fasta file (or, with ``negate``, only those absent from it)
        '''
        logger.debug('filter_fasta using file %s', filename)
        feature_index = exp.feature_metadata.index
        okpos = []
        seen = set()
        tot_seqs = 0
        for cseq in skbio.read(filename, format='fasta'):
            tot_seqs += 1
            cseq = str(cseq).upper()
            # Skip repeats: duplicate positions would duplicate features in
            # the reordered experiment and would violate the uniqueness that
            # `assume_unique=True` below promises to numpy.
            if cseq in seen:
                continue
            seen.add(cseq)
            if cseq in feature_index:
                okpos.append(feature_index.get_loc(cseq))
        logger.debug('loaded %d sequences. found %d sequences in experiment',
                     tot_seqs, len(okpos))
        if negate:
            # Complement: every feature position that was NOT matched.
            okpos = np.setdiff1d(np.arange(len(feature_index)),
                                 okpos,
                                 assume_unique=True)

        newexp = exp.reorder(okpos, axis=1, inplace=inplace)
        return newexp
def classify_sklearn(reads: DNAFASTAFormat,
                     classifier: Pipeline,
                     reads_per_batch: int = 0,
                     n_jobs: int = 1,
                     pre_dispatch: str = '2*n_jobs',
                     confidence: float = 0.7,
                     read_orientation: str = None) -> pd.DataFrame:
    """Classify the reads with ``classifier`` and tabulate the predictions.

    Parameters
    ----------
    reads
        FASTA-formatted reads; ``str(reads)`` is read with ``skbio.read``.
    classifier
        Fitted pipeline handed to ``_autodetect_orientation`` and ``predict``.
    reads_per_batch
        Chunk size for ``predict``. ``0`` means "choose automatically" via
        ``_autotune_reads_per_batch``.
    n_jobs, pre_dispatch, confidence
        Forwarded unchanged to ``predict``.
    read_orientation
        Forwarded to ``_autodetect_orientation``.

    Returns
    -------
    pd.DataFrame
        Columns ``Taxon`` and ``Confidence``, indexed by sequence ID with
        the index named ``Feature ID``.

    Raises
    ------
    ValueError
        If ``predict`` yields no predictions at all.
    """
    # autotune reads per batch
    if reads_per_batch == 0:
        reads_per_batch = _autotune_reads_per_batch(reads, n_jobs)

    # transform reads to DNAIterator
    reads = DNAIterator(
        skbio.read(str(reads), format='fasta', constructor=skbio.DNA))

    reads = _autodetect_orientation(reads,
                                    classifier,
                                    read_orientation=read_orientation)
    predictions = list(predict(reads,
                               classifier,
                               chunk_size=reads_per_batch,
                               n_jobs=n_jobs,
                               pre_dispatch=pre_dispatch,
                               confidence=confidence))
    # Unpacking an empty zip() would fail with an opaque "not enough values
    # to unpack" ValueError; raise the same exception type with a message
    # that says what actually happened.
    if not predictions:
        raise ValueError('No predictions were produced: predict() returned '
                         'no results for the supplied reads.')

    # Each prediction is a (sequence id, taxon, confidence) triple. A
    # separate local name keeps the `confidence` threshold parameter intact.
    seq_ids, taxonomy, confidences = zip(*predictions)
    result = pd.DataFrame({
        'Taxon': taxonomy,
        'Confidence': confidences
    },
                          index=seq_ids,
                          columns=['Taxon', 'Confidence'])
    result.index.name = 'Feature ID'
    return result
Example #14
0
def _filter_sequence_ids(in_fp, out_fp, ids, negate=False):
    '''Copy a fasta file, filtering its sequences by ID.

    Parameters
    ----------
    in_fp : str
        Input fasta file path.
    out_fp : str
        Output fasta file path; overwritten if it exists.
    ids : container of str
        Sequence IDs to filter on (anything supporting ``in``).
    negate : bool
        False (default) to filter away the sequences whose ID is in
        ``ids``; True to keep only those sequences instead.
    '''
    keep_listed = bool(negate)
    with open(out_fp, 'w') as out:
        for seq in read(in_fp, format='fasta', constructor=Sequence):
            # Default: write when the ID is NOT listed. With negate the
            # test is inverted. Previously `negate` was ignored.
            if (seq.metadata['id'] in ids) == keep_listed:
                write(seq, format='fasta', into=out)
Example #15
0
    def test_fastq_to_sequence(self):
        """Reading one fastq record ``into`` a sequence class must equal the
        object built by hand from the fixture components."""
        factories = [partial(Sequence), partial(DNA, validate=False),
                     partial(RNA, validate=False),
                     partial(Protein, validate=False)]
        for factory in factories:
            for valid_files, kwargs, components in self.valid_configurations:
                # An empty file has no specific sequence to read, so such
                # configurations are skipped entirely.
                if len(components) == 0:
                    continue

                for valid in valid_files:
                    for kwarg in kwargs:
                        _drop_kwargs(kwarg, 'constructor')

                        # seq_num is 1-based; default is the first record.
                        record = components[kwarg.get('seq_num', 1) - 1]
                        quality = np.array(record[3], dtype=np.uint8)
                        expected = factory(
                            record[2],
                            metadata={'id': record[0],
                                      'description': record[1]},
                            positional_metadata={'quality': quality})

                        # read() gets the bare class; validation settings
                        # only apply to the hand-built expected object.
                        observed = read(valid, into=factory.func,
                                        format='fastq', verify=False, **kwarg)
                        self.assertEqual(observed, expected)
Example #16
0
    def test_fastq_to_sequence(self):
        """Reading one fastq record ``into`` a sequence class must equal the
        object built by hand from the fixture components."""
        sequence_classes = (BiologicalSequence, NucleotideSequence,
                            DNASequence, RNASequence, ProteinSequence)
        for seq_class in sequence_classes:
            for valid, kwargs, components in self.valid_files:
                # An empty file has no specific sequence to read.
                if len(components) == 0:
                    continue

                for kwarg in kwargs:
                    _drop_kwargs(kwarg, 'constructor')

                    # seq_num is 1-based; default is the first record.
                    record = components[kwarg.get('seq_num', 1) - 1]
                    expected = seq_class(record[2], id=record[0],
                                         description=record[1],
                                         quality=record[3])

                    observed = read(valid, into=seq_class, format='fastq',
                                    verify=False, **kwarg)
                    self.assertTrue(observed.equals(expected))
Example #17
0
def find_tree(npop: int,
              numerical_label: 'np.ndarray[int]',
              arr: 'np.ndarray[float]',
              ) -> TreeNode:
    """Find tree topology using the centers of mass of clusters.

    For two populations a fixed two-leaf tree is returned. Otherwise the
    mean coordinate of each label's rows (restricted to the first
    ``npop + OFFSET`` columns of ``arr``) is computed, the Euclidean
    distances between those means are fed to neighbor joining, and the
    result is rooted at its midpoint. The rooted tree is printed (ASCII
    art and string form) before being returned.

    Parameters
    ----------
    npop
        Number of populations; leaves are named ``'0'`` .. ``str(npop - 1)``.
    numerical_label
        Integer label per row of ``arr``; each label is used as a row index
        into the centroid table.
    arr
        Coordinate matrix, one row per sample.

    Returns
    -------
    TreeNode
        The tree only.
    """
    if npop == 2:
        return read(StringIO('(0:0.1, 1:0.1);'), format='newick',
                    into=TreeNode)

    n_dims = npop + OFFSET
    arr = arr[:, :n_dims]

    # Center of mass of every cluster present in the labels.
    centroids = np.zeros((npop, n_dims))
    for label in set(numerical_label):
        members = np.where(numerical_label == label)[0]
        centroids[label, :] = np.mean(arr[members, :], axis=0)

    # Pairwise Euclidean distances between the centers.
    distances = np.zeros((npop, npop))
    for a in range(npop):
        for b in range(npop):
            delta = centroids[a] - centroids[b]
            distances[a, b] = np.sqrt(np.sum(delta**2))

    leaf_names = list(map(str, range(npop)))
    rooted = nj(DistanceMatrix(distances, leaf_names)).root_at_midpoint()
    print(rooted.ascii_art())
    print(rooted)
    return rooted
Example #18
0
def body_site(coords, mapping_file, output, filename, sample):
    """Generates a bodysite figure for a sample in the coordinates file

    All samples are drawn as a scatter plot of the first two ordination
    axes, colored by their ``TITLE_BODY_SITE`` value; ``sample`` is then
    drawn on top as a larger, outlined dot. The figure is saved to
    ``os.path.join(output, filename)`` and closed.

    Parameters
    ----------
    coords
        Ordination results file, read into ``OrdinationResults``.
    mapping_file
        Tab-separated sample metadata with a ``#SampleID`` column and a
        ``TITLE_BODY_SITE`` column.
    output : str
        Output directory.
    filename : str
        Output file name.
    sample : str
        ID of the sample to highlight.

    Raises
    ------
    ValueError
        If ``sample`` is not among the ordination's site IDs.
    """
    o = read(coords, into=OrdinationResults)

    # coordinates
    c_df = pd.DataFrame(o.site, o.site_ids)

    # mapping file
    mf = pd.read_csv(mapping_file, sep='\t', dtype=str)
    mf.set_index('#SampleID', inplace=True)

    mf = mf.loc[o.site_ids]

    if sample not in o.site_ids:
        raise ValueError("Sample %s not found" % sample)

    # Build the palette once instead of once per color.
    paired = sns.color_palette('Paired', 12)
    color_hmp_fecal = paired[10]  # light brown
    color_agp_fecal = paired[11]  # dark brown
    color_hmp_oral = paired[0]    # light blue
    color_agp_oral = paired[1]    # dark blue
    color_hmp_skin = paired[2]    # light green
    color_agp_skin = paired[3]    # dark green

    grp_colors = {'AGP-FECAL': color_agp_fecal,
                  'AGP-ORAL':  color_agp_oral,
                  'AGP-SKIN':  color_agp_skin,
                  'HMP-FECAL': color_hmp_fecal,
                  'GG-FECAL':  color_hmp_fecal,
                  'PGP-FECAL': color_hmp_fecal,
                  'HMP-ORAL':  color_hmp_oral,
                  'PGP-ORAL':  color_hmp_oral,
                  'HMP-SKIN':  color_hmp_skin,
                  'PGP-SKIN':  color_hmp_skin}

    # plot categories as 50 slices with random zorder
    # (.items() works on both Python 2 and 3; .iteritems() is Python 2 only)
    for grp, color in grp_colors.items():
        sub_coords = c_df[mf.TITLE_BODY_SITE == grp].values
        for i in np.array_split(sub_coords, 50):
            if i.size == 0:
                continue
            plt.scatter(i[:, 0], i[:, 1], color=color,
                        edgecolor=np.asarray(color)*0.6, lw=LINE_WIDTH,
                        alpha=ALPHA, zorder=np.random.rand())

    # plot participant's dot: a white-rimmed disc with a slightly smaller,
    # dark-rimmed disc on top of it
    sample_color = grp_colors[mf.loc[sample]['TITLE_BODY_SITE']]
    sample_x, sample_y = c_df.loc[sample][0], c_df.loc[sample][1]
    plt.scatter(sample_x, sample_y,
                color=sample_color,
                s=270, edgecolor='w', zorder=1, lw=LINE_WIDTH_WHITE)
    plt.scatter(sample_x, sample_y,
                color=sample_color,
                s=250, edgecolor=np.asarray(sample_color)*0.6,
                zorder=2, lw=LINE_WIDTH_BLACK)

    plt.axis('off')
    my_dpi = 72
    figsize = (1000 / my_dpi, 1000 / my_dpi)
    out_file = os.path.join(output, filename)
    # NOTE(review): `figsize` is not a documented savefig() argument; it is
    # presumably ignored or rejected depending on the matplotlib version.
    # Left unchanged because applying it would alter the output image size.
    plt.savefig(out_file, figsize=figsize, dpi=my_dpi)
    plt.close()
Example #19
0
    def test_valid_files(self):
        """Reading one qseq record ``into`` a sequence class must equal the
        object built by hand from the fixture components."""
        metadata_keys = ('id', 'machine_name', 'run_number', 'lane_number',
                         'tile_number', 'x', 'y', 'index', 'read_number')
        for constructor in [Sequence, DNA, RNA, Protein]:
            # Currently not validating the alphabet for qseq files that
            # are read in for this test.
            skip_validation = hasattr(constructor, 'alphabet')
            for valid, kwargs, components in self.valid_files:
                for observed_kwargs in kwargs:
                    if skip_validation:
                        observed_kwargs['validate'] = False
                        expected_kwargs = {'validate': False}
                    else:
                        expected_kwargs = {}
                    _drop_kwargs(observed_kwargs, 'constructor', 'filter')

                    # seq_num is 1-based; default is the first record.
                    record = components[observed_kwargs.get('seq_num', 1) - 1]
                    quality = np.array(record['quality'], np.uint8)
                    expected = constructor(
                        record['sequence'],
                        metadata={key: record[key] for key in metadata_keys},
                        positional_metadata={'quality': quality},
                        **expected_kwargs)

                    observed = read(valid, into=constructor,
                                    format='qseq', verify=False,
                                    **observed_kwargs)
                    self.assertEqual(observed, expected)
Example #20
0
    def _annotate_fp(self, fp, aligner='blastp', evalue=0.001, cpus=1,
                     outfmt='tab', params=None) -> pd.DataFrame:
        '''Annotate the sequences in the file.

        The databases in ``self.dat`` are searched in order. After each
        search, the sequences without a hit so far are written to a
        temporary fasta file, which becomes the query for the next database.
        The search stops early once no sequence is left.

        Parameters
        ----------
        fp : str
            Path of the fasta file to annotate.
        aligner : str
            Aligner name passed to ``run_blast``.
        evalue : float
            E-value passed to ``run_blast``.
        cpus : int
            Number of cpus passed to ``run_blast``.
        outfmt : str
            Value of the ``--outfmt`` option passed to ``run_view``.
        params : dict-like
            Parameters for diamond blastp/blastx that pass to ``run_blast``.

        Returns
        -------
        pd.DataFrame
            The parsed hits of all searched databases, concatenated in
            search order.
        '''
        # A set gives O(1) membership tests and cannot accumulate the
        # duplicate IDs that re-adding the running index would produce.
        found = set()
        res = pd.DataFrame()
        for db in self.dat:
            out_prefix = splitext(basename(db))[0]
            daa_fp = join(self.out_dir, '%s.daa' % out_prefix)
            out_fp = join(self.out_dir, '%s.diamond' % out_prefix)
            self.run_blast(fp, daa_fp, db, aligner=aligner,
                           evalue=evalue, cpus=cpus, params=params)
            self.run_view(daa_fp, out_fp, params={'--outfmt': outfmt})
            # DataFrame.append was removed in pandas 2.0; concat is the
            # equivalent that works on old and new versions.
            res = pd.concat([res, self.parse_tabular(out_fp)])
            found.update(res.index)
            # save to a tmp file the seqs that do not hit current database
            new_fp = join(self.tmp_dir, '%s.fa' % out_prefix)
            with open(new_fp, 'w') as f:
                for seq in read(fp, format='fasta'):
                    if seq.metadata['id'] not in found:
                        seq.write(f, format='fasta')
            # no seq left
            if stat(new_fp).st_size == 0:
                break
            fp = new_fp
        return res
Example #21
0
def read(file_name, file_format='newick'):
    """Read a tree from ``file_name``.

    Only the ``'newick'`` format is handled, in which case the file is
    parsed into a ``TreeNode``. Any other ``file_format`` yields ``None``.
    """
    if file_format != 'newick':
        return None
    return skbio.read(file_name, file_format, into=TreeNode)
Example #22
0
    def test_dna_fasta_format_to_dna_iterator(self):
        """A DNAFASTAFormat must transform into a DNAIterator yielding the
        same records as reading the file directly."""
        source, result = self.transform_format(
            DNAFASTAFormat, DNAIterator, filename='dna-sequences.fasta')

        reference = skbio.read(str(source), format='fasta',
                               constructor=skbio.DNA)

        for got, want in zip(result, reference):
            self.assertEqual(got, want)
Example #23
0
    def test_dna_fasta_format_to_dna_iterator(self):
        """A DNAFASTAFormat must transform into a DNAIterator yielding the
        same records as reading the file directly."""
        source, result = self.transform_format(
            DNAFASTAFormat, DNAIterator, filename='dna-sequences.fasta')

        reference = skbio.read(str(source), format='fasta',
                               constructor=skbio.DNA)

        for got, want in zip(result, reference):
            self.assertEqual(got, want)
Example #24
0
def set_tree_from_input(asd_file, simulation) -> TreeNode:
    """Build the tree from the topology supplied with the simulation.

    ``simulation.topology`` is parsed as a newick string. The topology and
    an ASCII rendering of the resulting tree are printed.

    Parameters
    ----------
    asd_file
        Not used by this function; kept for interface compatibility.
    simulation
        Object whose ``topology`` attribute holds a newick string.

    Returns
    -------
    TreeNode
        The parsed tree only. The previous annotation and docstring
        advertised a 3-tuple that was never returned.
    """
    print(simulation.topology)
    tree = read(StringIO(simulation.topology), format='newick', into=TreeNode)
    print(tree.ascii_art())
    return tree
Example #25
0
def sort_uniref(db_fp, uniref_fp, out_d, resolution, force=False):
    '''Sort UniRef sequences into different partitions.

    One fasta file is created in ``out_d`` for every combination of the
    module-level ``_status`` and ``_kingdom`` entries, plus a catch-all,
    named ``uniref<resolution>_<status>_<kingdom>.fasta`` and
    ``uniref<resolution>__other.fasta`` respectively. Each sequence is
    written to the partition given by its ``metadata`` table row in the
    database; sequences without a row go to the catch-all file. Every
    non-empty output file is finally passed to ``make_db``.

    Parameters
    ----------
    db_fp : str
        The database file created by ``prepare_metadata``.
    uniref_fp : str
        The UniRef fasta file. gzipped or not.
    out_d : str
        The output directory to place the resulting fasta files.
    resolution : int
        UniRef resolution (e.g. 100); used in the output file names and to
        strip the ``UniRef<resolution>_`` prefix from sequence IDs.
    force : boolean
        Passed to ``_overwrite`` for the output directory.
    '''
    _overwrite(out_d, force)
    makedirs(out_d)
    logger = getLogger(__name__)
    logger.info('Sorting UniRef sequences')
    fns = ['%s_%s' % (i, j) for i, j in product(_status, _kingdom)]
    fns.append('_other')
    fps = [join(out_d, 'uniref%d_%s.fasta' % (resolution, f)) for f in fns]

    files = {}
    try:
        # Opened inside the try block so that handles opened before a
        # failure are still closed.
        for fn, fp in zip(fns, fps):
            files[fn] = open(fp, 'w')

        with connect(db_fp) as conn:
            cursor = conn.cursor()
            for seq in read(uniref_fp, format='fasta', constructor=Sequence):
                seq_id = seq.metadata['id']
                ac = seq_id.replace('UniRef%d_' % resolution, '')
                # Default partition; joins to the catch-all key '_other'.
                group = ['', 'other']
                cursor.execute('''SELECT * FROM metadata
                                  WHERE ac = ?''',
                               (ac,))
                for _, s, k in cursor.fetchall():
                    group[0] = _status[s]
                    group[1] = _kingdom[k]
                seq.write(files['_'.join(group)])
    finally:
        # Flush and release every output file, even on error.
        for fh in files.values():
            fh.close()

    for fp in fps:
        # if the fasta file is not empty
        if stat(fp).st_size > 0:
            make_db(fp)
Example #26
0
def gradient(coords, mapping_file, color, output, filename, sample):
    """Generates a gradient-colored figure for a sample in the coordinates file

    All samples are drawn as a scatter plot of the first two ordination
    axes. Samples with a numeric value in the ``color`` column are colored
    on the ``RdBu`` colormap; the others are gray. ``sample`` is then drawn
    on top as a larger, outlined dot. The figure is saved to
    ``os.path.join(output, filename)`` and closed.

    Parameters
    ----------
    coords
        Ordination results file, read into ``OrdinationResults``.
    mapping_file
        Tab-separated sample metadata with a ``#SampleID`` column.
    color : str
        Name of the metadata column that drives the gradient.
    output : str
        Output directory.
    filename : str
        Output file name.
    sample : str
        ID of the sample to highlight.

    Raises
    ------
    ValueError
        If ``sample`` is not among the ordination's site IDs.
    """
    o = read(coords, into=OrdinationResults)

    # coordinates
    c_df = pd.DataFrame(o.site, o.site_ids)

    # mapping file
    mf = pd.read_csv(mapping_file, sep='\t', converters=defaultdict(str),
                     dtype=str)
    mf.set_index('#SampleID', inplace=True)
    mf = mf.loc[o.site_ids]
    # Series.convert_objects was removed from pandas; to_numeric with
    # errors='coerce' is the replacement (non-numeric values become NaN).
    mf[color] = pd.to_numeric(mf[color], errors='coerce')

    if sample not in o.site_ids:
        raise ValueError("Sample %s not found" % sample)

    is_missing = pd.isnull(mf[color])
    numeric = mf[~is_missing]
    non_numeric = mf[is_missing]

    # Values are scaled by the maximum before the colormap lookup.
    color_array = plt.cm.RdBu(numeric[color]/max(numeric[color]))

    # plot numeric metadata as colored gradient
    ids = numeric.index
    x, y = c_df.loc[ids][0], c_df.loc[ids][1]
    plt.scatter(x, y, c=numeric[color], cmap=plt.get_cmap('RdBu'),
                alpha=ALPHA, lw=LINE_WIDTH, edgecolor=color_array*0.6)

    # plot non-numeric metadata as gray
    ids = non_numeric.index
    x, y = c_df.loc[ids][0], c_df.loc[ids][1]
    plt.scatter(x, y, c='0.5', alpha=ALPHA, lw=LINE_WIDTH, edgecolor='0.3')

    # plot individual's dot; gray when the sample has no numeric value
    try:
        color_index = numeric.index.tolist().index(sample)
    except ValueError:
        _color = (0.5, 0.5, 0.5)
    else:
        _color = color_array[color_index]

    sample_x, sample_y = c_df.loc[sample][0], c_df.loc[sample][1]
    plt.scatter(sample_x, sample_y,
                color=_color, s=270, edgecolor='w', lw=LINE_WIDTH_WHITE)
    plt.scatter(sample_x, sample_y,
                color=_color, s=250, edgecolor=np.asarray(_color)*0.6,
                lw=LINE_WIDTH_BLACK)

    plt.axis('off')
    my_dpi = 72
    figsize = (1000 / my_dpi, 1000 / my_dpi)
    out_file = os.path.join(output, filename)
    # NOTE(review): `figsize` is not a documented savefig() argument; it is
    # presumably ignored or rejected depending on the matplotlib version.
    # Left unchanged because applying it would alter the output image size.
    plt.savefig(out_file, figsize=figsize, dpi=my_dpi)
    plt.close()
Example #27
0
def sort_uniref(db_fp, uniref_fp, out_d, resolution, force=False):
    '''Sort UniRef sequences into different partitions.

    One fasta file is created in ``out_d`` for every combination of the
    module-level ``_status`` and ``_kingdom`` entries, plus a catch-all,
    named ``uniref<resolution>_<status>_<kingdom>.fasta`` and
    ``uniref<resolution>__other.fasta`` respectively. Each sequence is
    written to the partition given by its ``metadata`` table row in the
    database; sequences without a row go to the catch-all file. Every
    non-empty output file is finally passed to ``run_makedb``.

    Parameters
    ----------
    db_fp : str
        The database file created by ``prepare_metadata``.
    uniref_fp : str
        The UniRef fasta file. gzipped or not.
    out_d : str
        The output directory to place the resulting fasta files.
    resolution : int
        UniRef resolution (e.g. 100); used in the output file names and to
        strip the ``UniRef<resolution>_`` prefix from sequence IDs.
    force : boolean
        Passed to ``_overwrite`` for the output directory.
    '''
    _overwrite(out_d, force)
    makedirs(out_d)
    logger = getLogger(__name__)
    logger.info('Sorting UniRef sequences')
    fns = ['%s_%s' % (i, j) for i, j in product(_status, _kingdom)]
    fns.append('_other')
    fps = [join(out_d, 'uniref%d_%s.fasta' % (resolution, f)) for f in fns]

    files = {}
    try:
        # Opened inside the try block so that handles opened before a
        # failure are still closed.
        for fn, fp in zip(fns, fps):
            files[fn] = open(fp, 'w')

        with connect(db_fp) as conn:
            cursor = conn.cursor()
            for seq in read(uniref_fp, format='fasta', constructor=Sequence):
                seq_id = seq.metadata['id']
                ac = seq_id.replace('UniRef%d_' % resolution, '')
                # Default partition; joins to the catch-all key '_other'.
                group = ['', 'other']
                cursor.execute(
                    '''SELECT * FROM metadata
                              WHERE ac = ?''', (ac, ))
                for _, s, k in cursor.fetchall():
                    group[0] = _status[s]
                    group[1] = _kingdom[k]
                seq.write(files['_'.join(group)])
    finally:
        # Flush and release every output file, even on error.
        for fh in files.values():
            fh.close()

    for fp in fps:
        # if the fasta file is not empty
        if stat(fp).st_size > 0:
            run_makedb(fp)
Example #28
0
 def test_save_fasta(self):
     """``save_fasta`` must write exactly the experiment's feature IDs as
     sequences (in any order)."""
     exp = ca.read(self.test1_biom, self.test1_samp, normalize=None)
     d = mkdtemp()
     # Registered immediately so the directory is removed even when
     # save_fasta, the read-back, or the assertion below fails.
     self.addCleanup(shutil.rmtree, d)
     f = join(d, 'test1.fasta')
     exp.save_fasta(f)
     seqs = [str(seq) for seq in skbio.read(f, format='fasta')]
     self.assertCountEqual(seqs, exp.feature_metadata.index.values)
Example #29
0
    def test_pair_dna_iterator_to_pair_dna_sequences_directory_format(self):
        """A PairedDNAIterator must round-trip through the paired directory
        format transformer."""
        transformer = self.get_transformer(PairedDNAIterator,
                                           PairedDNASequencesDirectoryFormat)
        left_fp = self.get_data_path('left-dna-sequences.fasta')
        right_fp = self.get_data_path('right-dna-sequences.fasta')

        l_seqs = skbio.read(left_fp, format='fasta', constructor=skbio.DNA)
        r_seqs = skbio.read(right_fp, format='fasta', constructor=skbio.DNA)
        input = PairedDNAIterator(zip(l_seqs, r_seqs))

        obs = transformer(input)
        self.assertIsInstance(obs, PairedDNASequencesDirectoryFormat)

        obs_l = skbio.read('%s/left-dna-sequences.fasta' % str(obs),
                           format='fasta', constructor=skbio.DNA)
        obs_r = skbio.read('%s/right-dna-sequences.fasta' % str(obs),
                           format='fasta', constructor=skbio.DNA)
        obs_pairs = list(zip(obs_l, obs_r))

        # l_seqs / r_seqs are single-use generators that fed the
        # transformer, so zipping them again would presumably yield nothing
        # and make the comparison vacuous. Re-read the fixtures instead.
        exp_l = skbio.read(left_fp, format='fasta', constructor=skbio.DNA)
        exp_r = skbio.read(right_fp, format='fasta', constructor=skbio.DNA)
        exp_pairs = list(zip(exp_l, exp_r))

        # zip() truncates silently; check the record counts explicitly.
        self.assertEqual(len(obs_pairs), len(exp_pairs))
        for act, exp in zip(obs_pairs, exp_pairs):
            self.assertEqual(act, exp)
Example #30
0
    def test_pair_dna_iterator_to_pair_dna_sequences_directory_format(self):
        """A PairedDNAIterator must round-trip through the paired directory
        format transformer."""
        transformer = self.get_transformer(PairedDNAIterator,
                                           PairedDNASequencesDirectoryFormat)
        left_fp = self.get_data_path('left-dna-sequences.fasta')
        right_fp = self.get_data_path('right-dna-sequences.fasta')

        l_seqs = skbio.read(left_fp, format='fasta', constructor=skbio.DNA)
        r_seqs = skbio.read(right_fp, format='fasta', constructor=skbio.DNA)
        input = PairedDNAIterator(zip(l_seqs, r_seqs))

        obs = transformer(input)
        self.assertIsInstance(obs, PairedDNASequencesDirectoryFormat)

        obs_l = skbio.read('%s/left-dna-sequences.fasta' % str(obs),
                           format='fasta', constructor=skbio.DNA)
        obs_r = skbio.read('%s/right-dna-sequences.fasta' % str(obs),
                           format='fasta', constructor=skbio.DNA)
        obs_pairs = list(zip(obs_l, obs_r))

        # l_seqs / r_seqs are single-use generators that fed the
        # transformer, so zipping them again would presumably yield nothing
        # and make the comparison vacuous. Re-read the fixtures instead.
        exp_l = skbio.read(left_fp, format='fasta', constructor=skbio.DNA)
        exp_r = skbio.read(right_fp, format='fasta', constructor=skbio.DNA)
        exp_pairs = list(zip(exp_l, exp_r))

        # zip() truncates silently; check the record counts explicitly.
        self.assertEqual(len(obs_pairs), len(exp_pairs))
        for act, exp in zip(obs_pairs, exp_pairs):
            self.assertEqual(act, exp)
Example #31
0
def read_qiime2(fp, sample_metadata_file=None, rep_seq_file=None, taxonomy_file=None, **kwargs):
    '''Read a qiime2 feature table and additional optional artifact files (representative sequences and taxonomy) into a Calour.AmpliconExperiment

    Parameters
    ----------
    fp: str
        name of the qiime2 feature table .qza artifact file
    sample_metadata_file : None or str, optional
        None (default) to just use sample names (no additional metadata).
        if not None, file path to the sample metadata (aka mapping file in QIIME).
    rep_seq_file: None or str, optional
        None (default) to use the feature ids in the feature table
        if not None, file path to the qiime2 representative sequences artifact file (defined by the qiime2 --o-representative-sequences parameter)
    taxonomy_file: None or str, optional
        if not None, add taxonomy for each feature using the qiime2 taxonomy artifact file (output of the qiime2 feature-classifier command)

    Keyword Arguments
    -----------------
    %(io.read.parameters)s
    '''
    exp = read_amplicon(fp, sample_metadata_file=sample_metadata_file, data_file_type='qiime2', **kwargs)
    # Both optional artifacts are unpacked into the same scratch directory,
    # which is removed when the block exits.
    with tempfile.TemporaryDirectory() as tempdir:
        if rep_seq_file is not None:
            # Map the table's hashes to the upper-cased sequences in the artifact.
            logger.debug('loading rep_seqs file %s' % rep_seq_file)
            fasta_path = _file_from_zip(tempdir, rep_seq_file, internal_data='data/dna-sequences.fasta')
            records = [(str(rec).upper(), rec.metadata['id'])
                       for rec in skbio.read(fasta_path, format='fasta')]
            rep_seqs = pd.Series(data=[seq for seq, _ in records],
                                 index=[rec_id for _, rec_id in records],
                                 name='_feature_id')

            # Only report a mismatch between the two id sets; the table's
            # hashes stay in charge either way.
            same_ids = exp.feature_metadata.index.equals(rep_seqs.index)
            if not same_ids:
                logger.info('Rep seqs hashes and table hashes are not equal. Using table hashes.')

            # The old '_feature_id' (the hash) moves to '_hash'; the joined
            # sequence becomes the new '_feature_id' and the index.
            exp.feature_metadata.rename(columns={'_feature_id': '_hash'}, inplace=True)
            exp.feature_metadata = exp.feature_metadata.join(other=rep_seqs, on='_hash', how='left')
            exp.feature_metadata.set_index('_feature_id', inplace=True, drop=False)

        if taxonomy_file is not None:
            # Attach the taxonomy columns, keyed on the feature metadata index.
            logger.debug('loading taxonomy file %s' % taxonomy_file)
            tsv_path = _file_from_zip(tempdir, taxonomy_file, internal_data='data/taxonomy.tsv')
            taxonomy_df = pd.read_table(tsv_path)
            taxonomy_df.set_index('Feature ID', inplace=True)
            exp.feature_metadata = exp.feature_metadata.join(other=taxonomy_df, how='left')

            shared = exp.feature_metadata.index.intersection(taxonomy_df.index)
            if len(shared) == 0:
                logger.info('No matching sequences in taxonomy file.')
                if '_hash' in exp.feature_metadata.columns:
                    # Nothing matched on the index: undo the join and retry
                    # against the hash column instead.
                    logger.info('Trying to use hashes for taxonomy')
                    exp.feature_metadata = exp.feature_metadata.drop(taxonomy_df.columns, axis=1)
                    exp.feature_metadata = exp.feature_metadata.join(other=taxonomy_df, on='_hash', how='left')
    return exp
Example #32
0
    def test_filter_partial_genes(self):
        """Write a two-sequence GFF3 file, filter it, and check what is left.

        Only the interval tagged ``partial='00'`` on ``seq2`` is expected in
        the filtered output.
        """
        def attrs(partial, phase, strand, feature_type, score):
            # Every interval shares the same source; build a fresh dict each
            # time so no two intervals alias one metadata object.
            return {
                'partial': partial,
                'phase': phase,
                'source': 'Prodigal_v2.6.3',
                'strand': strand,
                'type': feature_type,
                'score': score
            }

        src_gff = join(self.tmpd, 'in.gff')
        dst_gff = join(self.tmpd, 'out.gff')

        seq1_imd = IntervalMetadata(None)
        seq1_imd.add([(0, 100)], metadata=attrs('01', 0, '.', '.', '.'))

        seq2_imd = IntervalMetadata(None)
        seq2_imd.add([(200, 300)], metadata=attrs('10', 1, '-', 'CDS', '1'))
        seq2_imd.add([(2000, 3000)], metadata=attrs('00', 1, '.', '.', '.'))

        expected_imd = IntervalMetadata(None)
        expected_imd.add([(2000, 3000)],
                         metadata=attrs('00', 1, '.', '.', '.'))

        records = (('seq1', seq1_imd), ('seq2', seq2_imd))
        # The records are handed to ``write`` as a generator, as before.
        write((record for record in records), into=src_gff, format='gff3')
        filter_partial_genes(src_gff, dst_gff)

        observed = read(dst_gff, format='gff3')
        # zip() stops at the shorter input, so this compares at most the
        # first observed record.
        for got, want in zip(observed, [('seq2', expected_imd)]):
            self.assertEqual(got, want)
Example #33
0
    def _annotate_fp(self, fp, aligner='blastp', evalue=0.001, cpus=1,
                     outfmt='sam', params=None):
        '''Annotate the sequences in the file.

        The databases are searched one after another.  After each search the
        sequences whose ids are not yet among the hits are written to a
        temporary FASTA file, which becomes the query for the next database.

        Parameters
        ----------
        fp : str
            Path of the FASTA file to annotate.
        aligner, evalue, cpus, params
            Forwarded unchanged to ``self.run_blast``.
        outfmt : str
            Passed to ``self.run_view`` as ``--outfmt``.  ``'tab'`` output is
            parsed with ``parse_tabular`` and ``'sam'`` output with
            ``parse_sam``; any other value contributes no hits.

        Returns
        -------
        pandas.DataFrame
            The filtered hits from every database searched, concatenated in
            search order.
        '''

        if self.has_cache() and not self.cache.is_empty():
            self.cache.build()
            dbs = [self.cache.db] + self.dat
        else:
            dbs = self.dat

        seqs = []
        found = set()
        # ``DataFrame.append`` no longer exists in pandas >= 2.0, so the
        # per-database frames are collected and concatenated once at the end.
        frames = []
        n_hits = 0
        logger = getLogger(__name__)
        for db in dbs:
            out_prefix = splitext(basename(db))[0]
            daa_fp = join(self.out_dir, '%s.daa' % out_prefix)
            out_fp = join(self.out_dir, '%s.diamond' % out_prefix)
            self.run_blast(fp, daa_fp, db, aligner=aligner,
                           evalue=evalue, cpus=cpus, params=params)
            self.run_view(daa_fp, out_fp, params={'--outfmt': outfmt})
            if outfmt == 'tab':
                hits = self._filter_best(self.parse_tabular(out_fp))
            elif outfmt == 'sam':
                hits = self._filter_id_cov(self.parse_sam(out_fp))
            else:
                hits = None

            if hits is not None:
                frames.append(hits)
                found.update(hits.index)
                n_hits += len(hits.index)

            # save to a tmp file the seqs that do not hit current database
            new_fp = join(self.tmp_dir, '%s.fa' % out_prefix)
            with open(new_fp, 'w') as f:
                for seq in read(fp, format='fasta'):
                    if seq.metadata['id'] not in found:
                        seq.write(f, format='fasta')
            logger.info('Number of diamond hits: %d', n_hits)

            # no seq left
            if stat(new_fp).st_size == 0:
                break
            else:
                fp = new_fp

        res = pd.concat(frames) if frames else pd.DataFrame()

        # Only SAM output carries the subject sequence needed for the cache.
        if outfmt == 'sam' and self.has_cache():
            for x in res.index:
                seqs.append(
                    Sequence(res.loc[x, 'sseq'],
                             metadata={'id': res.loc[x, 'sseqid']}))

        # Update cache (inplace)
        if self.has_cache():
            self.cache.update(seqs)
            self.cache.close()
        return res
Example #34
0
    def setUp(self):
        """Prepare one ``Test`` record per aligner plus a pre-filled cache."""
        super().setUp()
        Test = namedtuple('Test', ['aligner', 'input', 'exp'])

        # (aligner, query file); the expected output is named after the query.
        tests = []
        for aligner, query in (('blastp', 'WP_009885814.faa'),
                               ('blastx', 'WP_009885814.fna')):
            tests.append(Test(aligner,
                              get_data_path(query),
                              _get_named_data_path('%s.diamond' % query)))
        self.tests = tests

        cached = skbio.read(_get_named_data_path('cache.faa'), format='fasta')
        self.cache = DiamondCache(list(cached))
Example #35
0
def subsample_dm(distmat, mapping_file, max, category, output):
    """Subsample the distmat to max samples per category value

    Parameters
    ----------
    distmat
        Source read into a ``DistanceMatrix``.
    mapping_file
        Tab-separated mapping file with a ``#SampleID`` column.
    max
        Forwarded to ``isubsample`` as the per-bin limit.
    category : str
        Mapping-file column whose value is used as the bin for each sample.
    output
        Destination passed to ``DistanceMatrix.to_file``.

    Raises
    ------
    KeyError
        If an id of the distance matrix is missing from the mapping file.
    """
    # ``sep`` has to be given by keyword: pandas >= 2.0 accepts only the
    # file argument positionally.
    mf = pd.read_csv(mapping_file, sep='\t', converters=defaultdict(str),
                     index_col='#SampleID')
    id_to_cat = dict(mf[category])

    def bin_f(x):
        return id_to_cat[x]

    dm = read(distmat, into=DistanceMatrix)
    # Each item from isubsample is unpacked as a pair; the second element is
    # the sample id to keep.
    kept_ids = [sample_id
                for _, sample_id in isubsample(dm.ids, max, bin_f=bin_f)]
    dm = dm.filter(kept_ids)
    dm.to_file(output)
Example #36
0
    def test_valid_files(self):
        """Read every valid QSEQ fixture into each legacy sequence class."""
        sequence_classes = (BiologicalSequence, NucleotideSequence,
                            DNASequence, RNASequence, ProteinSequence)
        for constructor in sequence_classes:
            for valid, kwargs, components in self.valid_files:
                for kwarg in kwargs:
                    # These two options do not apply when reading into a
                    # single object.
                    _drop_kwargs(kwarg, "constructor", "filter")

                    # seq_num is 1-based; the first record is the default.
                    record = components[kwarg.get("seq_num", 1) - 1]
                    expected = constructor(record[1], id=record[0],
                                           quality=record[2])

                    observed = read(valid, into=constructor, format="qseq",
                                    verify=False, **kwarg)
                    self.assertTrue(observed.equals(expected))
Example #37
0
    def setUp(self):
        """Build the aligner test cases and a cache seeded from ``cache.faa``."""
        super().setUp()
        Test = namedtuple('Test', ['aligner', 'input', 'exp'])

        queries_by_aligner = [('blastp', 'WP_009885814.faa'),
                              ('blastx', 'WP_009885814.fna')]
        # The expected-output fixture is the query name plus '.diamond'.
        self.tests = [
            Test(aligner=aligner,
                 input=get_data_path(query),
                 exp=_get_named_data_path('%s.diamond' % query))
            for aligner, query in queries_by_aligner
        ]

        cache_fp = _get_named_data_path('cache.faa')
        self.cache = DiamondCache(list(skbio.read(cache_fp, format='fasta')))
Example #38
0
def sequence_generator(input_fp):
    """Yield (id, sequence) from an input file

    Parameters
    ----------
    input_fp : filepath
        A filepath, which can be any valid fasta or fastq file within the
        limitations of scikit-bio's IO registry.

    Notes
    -----
    The use of this method is a stopgap to replicate the existing `parse_fasta`
    functionality while at the same time allowing for fastq support.

    Warns
    -----
    UserWarning
        If the input is sniffed as neither FASTA nor FASTQ. The same message
        is logged, and no records are yielded.

    Yields
    ------
    (str, str)
        The ID and sequence.

    """
    logger = logging.getLogger(__name__)
    kw = {}
    if sniff_fasta(input_fp)[0]:
        fmt = 'fasta'
    elif sniff_fastq(input_fp)[0]:
        fmt = 'fastq'

        # WARNING: the variant is currently forced to illumina 1.8 as the
        # quality scores are _not_ used in downstream processing. However, if
        # in the future, quality scores are to be interrogated, it is critical
        # that this variant parameter be exposed to the user at the command
        # line. The list of allowable parameters can be found here:
        # http://scikit-bio.org/docs/latest/generated/skbio.io.format.fastq.html#format-parameters
        kw['variant'] = 'illumina1.8'
    else:
        # usually happens when the fasta file is empty
        # so need to return no sequences (and warn)
        msg = "input file %s does not appear to be FASTA or FASTQ" % input_fp
        # Logger.warn is a deprecated alias of Logger.warning.
        logger.warning(msg)
        warnings.warn(msg, UserWarning)
        return

    # some of the test code is using file paths, some is using StringIO.
    # Rewind text handles before the real read.
    if isinstance(input_fp, io.TextIOBase):
        input_fp.seek(0)

    for record in skbio.read(input_fp, format=fmt, **kw):
        yield (record.metadata['id'], str(record))
Example #39
0
    def test_valid_files(self):
        """Check QSEQ reads against hand-built legacy sequence objects."""
        sequence_classes = [BiologicalSequence, NucleotideSequence,
                            DNASequence, RNASequence, ProteinSequence]
        # Flatten the three nested loops into one stream of cases; the kwarg
        # dicts are the very same objects, so in-place edits still stick.
        cases = ((cls, valid, kwarg, components)
                 for cls in sequence_classes
                 for valid, kwargs, components in self.valid_files
                 for kwarg in kwargs)

        for constructor, valid, kwarg, components in cases:
            _drop_kwargs(kwarg, 'constructor', 'filter')

            # seq_num counts from one; without it the first record is read.
            index = kwarg.get('seq_num', 1) - 1
            c = components[index]
            expected = constructor(c[1], id=c[0], quality=c[2])

            observed = read(valid, into=constructor, format='qseq',
                            verify=False, **kwarg)
            self.assertTrue(observed.equals(expected))
Example #40
0
    def test_summarize(self):
        """Summarise two annotated sequences and compare with the fixture."""
        gff = get_data_path('summarize.gff')
        first = DNA('A' * 5000000,
                    metadata={'id': 'gi|556503834|ref|NC_000913.3|'})
        second = DNA('AG' * 2500000,
                     metadata={'id': 'gi|556503834|ref|NC_000913.2|'})
        seqs = [first, second]

        # Attach the GFF3 annotations to the sequences in file order.
        for record, seq in zip(read(gff, format='gff3'), seqs):
            _, imd = record
            seq.interval_metadata = imd

        with open(get_data_path('summarize.txt')) as fh:
            expected = fh.read()
        with StringIO() as buf:
            summarize(seqs, buf)
            observed = buf.getvalue()
        self.assertEqual(observed, expected)
Example #41
0
    def parse_sam(diamond_res, column=None, collapse=False):
        '''Parse the output of diamond blastp/blastx.

        Parameters
        ----------
        diamond_res : str
            file path
        column : str
            The column used to pick the best hits. When None, every record
            is kept.
        collapse : bool
            Accepted but not used by this function.

        Returns
        -------
        pandas.DataFrame
            The 'sseqid', 'evalue', 'bitscore' and 'sequence' columns; with
            `column` given, one row per query holding that column's maximum.
        '''
        columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch',
                   'gapopen', 'qstart', 'qend', 'sstart', 'send',
                   'evalue', 'bitscore', 'sequence']
        kept = ['sseqid', 'evalue', 'bitscore', 'sequence']

        df = pd.DataFrame(columns=columns)
        for row_no, record in enumerate(read(diamond_res, format='sam')):
            text = str(record)
            md = record.metadata
            # Values in `columns` order.  'gapopen', 'qend' and 'send' are
            # left blank, and 'mismatch' is filled from the CIGAR field.
            values = [md['QNAME'], md['RNAME'], md['ZI'],
                      md['ZL'], md['CIGAR'], '',
                      md['POS'], '', md['ZS'], '',
                      md['ZE'], md['ZR'], text]
            df.loc[row_no] = pd.Series(values, index=columns)

        if column is not None:
            # Keep, per query, the row where `column` is largest and label
            # it with the query id.
            best_rows = df.groupby('qseqid')[column].idxmax()
            best = df.loc[best_rows]
            best.index = best_rows.index
            df = best
        return df[kept]
def main(argv):
    """Remove the sOTUs listed in a fasta file from a biom table.

    Parameters
    ----------
    argv : list of str
        Command-line arguments (without the program name).  ``-i``, ``-o``
        and ``-f`` are mandatory; argparse reports a usage error and exits
        when one of them is missing.
    """
    parser = argparse.ArgumentParser(
        description=
        'Filter sequences from biom table using a fasta file. Version ' +
        __version__)
    # The three file arguments are required: without them the code below
    # would pass None on to skbio/biom.
    parser.add_argument('-i',
                        '--inputtable',
                        required=True,
                        help='input biom table file name')
    parser.add_argument('-o', '--output', required=True,
                        help='output biom file name')
    parser.add_argument('-f', '--fasta', required=True,
                        help='filtering fasta file name')
    parser.add_argument(
        '-n',
        '--number',
        help='number of sOTUs from the fasta file to use (-1 means all)',
        default=-1,
        type=int)
    parser.add_argument(
        '--ignore_table_seq_length',
        help=
        "don't trim the fasta file sequences to the biom table sequence length",
        action='store_true')

    args = parser.parse_args(argv)

    seqs = skbio.read(args.fasta, format='fasta')
    table = biom.load_table(args.inputtable)
    totorigreads = table.sum(axis='whole')
    print('loaded biom table %s containing %d unique sOTUs' %
          (args.inputtable, table.shape[0]))
    if not args.ignore_table_seq_length:
        # The shortest observation id sets the trim length; it is only
        # computed when trimming is actually requested.
        length = min(map(len, table.ids(axis='observation')))
        seqs = trim_seqs(seqs, seqlength=length)

    # if need to remove only a subset of the sOTUs from the fasta file
    seqs = list(seqs)
    if args.number >= 0:
        # Slicing already copes with lists shorter than the limit.
        seqs = seqs[:args.number]

    print('filtering %d sOTUs (from file %s)' % (len(seqs), args.fasta))
    outtable = remove_seqs(table, seqs)
    totfilteredreads = outtable.sum(axis='whole')
    print('removed %d reads (from %d to %d)' %
          (totorigreads - totfilteredreads, totorigreads, totfilteredreads))
    print('saving filtered biom table with %d sOTUs to file %s' %
          (outtable.shape[0], args.output))

    with biom.util.biom_open(args.output, 'w') as f:
        outtable.to_hdf5(f, "filterbiomseqs")
Example #43
0
def annotate(in_fp, in_fmt, out_dir, out_fmt,
             cpus, kingdom, force, config, cache=False):
    '''Annotate the sequences in the input file.

    Every input sequence gets its own sub-directory of `out_dir` for
    intermediate files, and all annotated sequences are written to a single
    output file named after the input file.

    Parameters
    ----------
    in_fp : str
        Input file path; its base name (without extension) names the output.
    in_fmt : str
        Input file format.
    out_dir : str
        Output file directory.
    out_fmt : str
        Output file format; also used as the output file extension.
    cpus : int
        Number of cpus to use.
    kingdom : str
        Kingdom index corresponding to database (i.e. virus, bacteria ...)
    force : boolean
        Force to overwrite.
    config : ``micronota.config.Configuration``
        Container for configuration options.
    cache : boolean
        When true, a ``DiamondCache`` is created and threaded through the
        CDS annotation of each sequence.
    '''
    _overwrite(out_dir, overwrite=force)
    makedirs(out_dir, exist_ok=force)

    stem, _ = splitext(basename(in_fp))
    out_fp = join(out_dir, '{p}.{f}'.format(p=stem, f=out_fmt))

    # From here on `cache` is either a DiamondCache instance or None.
    cache = DiamondCache() if cache else None

    with open(out_fp, 'w') as out:
        for seq in read(in_fp, format=in_fmt):
            # Directory for this sequence's intermediate files: the id with
            # every non-alphanumeric character turned into "_".
            safe_id = ''.join(ch if ch.isalnum() else '_'
                              for ch in seq.metadata['id'])
            seq_dir = join(out_dir, safe_id)

            features = identify_all_features(seq, seq_dir, config)
            # The cache is passed in and handed back for the next sequence.
            features, cache = annotate_all_cds(
                features, seq_dir, kingdom, config, cache=cache)

            seq.interval_metadata.concat(IntervalMetadata(features),
                                         inplace=True)
            seq.write(out, format=out_fmt)
Example #44
0
    def test_valid_files(self):
        """Read valid QSEQ fixtures into each sequence class.

        The expected object is built through a ``partial`` (validation off
        for the grammared classes) while ``read`` is given the bare class.
        """
        factories = (partial(Sequence),
                     partial(DNA, validate=False),
                     partial(RNA, validate=False),
                     partial(Protein, validate=False))
        for factory in factories:
            for valid, kwargs, components in self.valid_files:
                for kwarg in kwargs:
                    _drop_kwargs(kwarg, 'constructor', 'filter')

                    # 1-based record number, defaulting to the first one.
                    wanted = components[kwarg.get('seq_num', 1) - 1]
                    expected = factory(wanted[1], id=wanted[0],
                                       quality=wanted[2])

                    observed = read(valid, into=factory.func,
                                    format='qseq', verify=False, **kwarg)
                    self.assertTrue(observed.equals(expected))
Example #45
0
def check_seq(in_seq, in_fmt=None, discard=lambda s: len(s) < 500):
    '''Validate and filter input seq file.

    For every input sequence, in this order:

    1. gaps are removed;
    2. the sequence is dropped if `discard` says so;
    3. its ID must exist and must not have been seen before.

    Parameters
    ----------
    in_seq : str or Iterable of ``Sequence`` objects
        input seq file path if it is a str
    in_fmt : str
        the format of seq file; for 'genbank' the ID is taken from the
        LOCUS locus name
    discard : callable
        a callable that applies on a ``Sequence`` and return a boolean

    Yields
    ------
    ``Sequence`` object

    Raises
    ------
    KeyError
        if a kept sequence has no ID
    ValueError
        if a kept sequence repeats an earlier ID

    TODO
    ----
    add an option to ignore the abnormal seq and continue yielding
    '''
    logger.info('Filter and validate input sequences')
    seen_ids = set()

    if isinstance(in_seq, str):
        # allow lowercase in DNA seq
        in_seq = read(in_seq, format=in_fmt, constructor=DNA, lowercase=True)

    from_genbank = in_fmt == 'genbank'
    for record in in_seq:
        record = record.degap()
        if discard(record):
            continue

        if from_genbank:
            record.metadata['id'] = record.metadata['LOCUS']['locus_name']
        try:
            ident = record.metadata['id']
        except KeyError:
            raise KeyError('Ill input file format: at least one sequences do not have IDs.')

        if ident in seen_ids:
            raise ValueError(
                'Duplicate seq IDs in your input file: {}'.format(ident))
        seen_ids.add(ident)
        yield record
Example #46
0
def subsample_dm(distmat, mapping_file, max, category, output):
    """Subsample the distmat to max samples per category value

    Parameters
    ----------
    distmat
        Source read into a ``DistanceMatrix``.
    mapping_file
        Tab-separated mapping file with a ``#SampleID`` column; every column
        is read as text.
    max
        Forwarded to ``isubsample`` as the per-bin limit.
    category : str
        Mapping-file column whose value is used as the bin for each sample.
    output
        Destination passed to ``DistanceMatrix.to_file``.

    Notes
    -----
    Ids of the distance matrix that are missing from the mapping file are
    binned under ``None`` rather than raising.
    """
    # ``sep`` has to be given by keyword: pandas >= 2.0 accepts only the
    # file argument positionally.
    mf = pd.read_csv(mapping_file,
                     sep='\t',
                     converters=defaultdict(str),
                     dtype=str)
    mf.set_index('#SampleID', inplace=True)

    id_to_cat = dict(mf[category])

    def bin_f(x):
        return id_to_cat.get(x)

    dm = read(distmat, into=DistanceMatrix)
    # Each item from isubsample is unpacked as a pair; the second element is
    # the sample id to keep.
    kept_ids = [sample_id
                for _, sample_id in isubsample(dm.ids, max, bin_f=bin_f)]
    dm = dm.filter(kept_ids)
    dm.to_file(output)
Example #47
0
def convert(in_f, in_fmt, out_f, out_fmt):
    '''convert between file formats

    Every record read from `in_f` is written, in order, to `out_f`.

    Parameters
    ----------
    in_fmt : str
        input file format
    out_fmt : str
        output file format
    in_f : str
        input file path
    out_f: str
        output file path (an already-open writable handle is also accepted
        and is written to as-is)
    '''
    records = read(in_f, format=in_fmt)

    if isinstance(out_f, str):
        # Open the destination once.  Passing the path to ``write`` for each
        # record would reopen (and truncate) the file every time, leaving
        # only the last record in the output.
        with open(out_f, 'w') as out:
            for obj in records:
                write(obj, format=out_fmt, into=out)
    else:
        for obj in records:
            write(obj, format=out_fmt, into=out_f)
Example #48
0
def _make_nr_foundation_alignment(foundation_alignment_fh,
                                  extension_genus_accession_list_dic):
    """Yield one foundation sequence per genus still waiting for a match.

    Each FASTA record is checked against the genera not matched so far: a
    genus matches when ``;<genus>;`` or ``g__<genus>;`` occurs literally in
    the record's description.  A matched genus is retired, the module-level
    ``foundation_accession_genus_dic`` maps the record id to it, and the
    record is yielded.

    Parameters
    ----------
    foundation_alignment_fh
        FASTA source handed to ``skbio.read``.
    extension_genus_accession_list_dic : dict
        Only its keys (genus names) are used.

    Notes
    -----
    Records whose description cannot be searched (missing attribute or a
    non-string value) are skipped, keeping the function best-effort.
    """
    global foundation_accession_genus_dic
    foundation_accession_genus_dic = {}
    # A real list: on Python 3 ``dict.keys()`` is a view with no ``remove``.
    remaining_genera = list(extension_genus_accession_list_dic.keys())
    for seq in skbio.read(foundation_alignment_fh, format="fasta"):
        # Find the matches first, so the list is never edited while it is
        # being iterated, and so the ``yield`` sits outside the ``try``
        # (a generator must let GeneratorExit through).
        try:
            description = seq.description
            matched = [genus for genus in remaining_genera
                       if (";" + genus + ";") in description or
                       ("g__" + genus + ";") in description]
        except (AttributeError, TypeError):
            continue
        for genus in matched:
            remaining_genera.remove(genus)
            foundation_accession_genus_dic[seq.id] = genus
            yield seq
Example #49
0
def _make_nr_foundation_alignment(foundation_alignment_fh,
                                  extension_genus_accession_list_dic):
    """Yield one foundation sequence per genus still waiting for a match.

    Each FASTA record is checked against the genera not matched so far: a
    genus matches when ``;<genus>;`` or ``g__<genus>;`` occurs literally in
    the record's description.  A matched genus is retired, the module-level
    ``foundation_accession_genus_dic`` maps the record id to it, and the
    record is yielded.

    Parameters
    ----------
    foundation_alignment_fh
        FASTA source handed to ``skbio.read``.
    extension_genus_accession_list_dic : dict
        Only its keys (genus names) are used.

    Notes
    -----
    Records whose description cannot be searched (missing attribute or a
    non-string value) are skipped, keeping the function best-effort.
    """
    global foundation_accession_genus_dic
    foundation_accession_genus_dic = {}
    # A real list: on Python 3 ``dict.keys()`` is a view with no ``remove``.
    remaining_genera = list(extension_genus_accession_list_dic.keys())
    for seq in skbio.read(foundation_alignment_fh, format="fasta"):
        # Find the matches first, so the list is never edited while it is
        # being iterated, and so the ``yield`` sits outside the ``try``
        # (a generator must let GeneratorExit through).
        try:
            description = seq.description
            matched = [genus for genus in remaining_genera
                       if (";" + genus + ";") in description
                       or ("g__" + genus + ";") in description]
        except (AttributeError, TypeError):
            continue
        for genus in matched:
            remaining_genera.remove(genus)
            foundation_accession_genus_dic[seq.id] = genus
            yield seq
Example #50
0
def sequence_generator(input_fp):
    """Yield (id, sequence) from an input file

    Parameters
    ----------
    input_fp : filepath
        A filepath, which can be any valid fasta or fastq file within the
        limitations of scikit-bio's IO registry.

    Notes
    -----
    The use of this method is a stopgap to replicate the existing `parse_fasta`
    functionality while at the same time allowing for fastq support.

    Warns
    -----
    UserWarning
        If the input is sniffed as neither FASTA nor FASTQ. The same message
        is logged, and no records are yielded.

    Yields
    ------
    (str, str)
        The ID and sequence.

    """
    logger = logging.getLogger(__name__)
    kw = {}
    if sniff_fasta(input_fp)[0]:
        fmt = 'fasta'
    elif sniff_fastq(input_fp)[0]:
        fmt = 'fastq'

        # The quality-score variant is looked up for this particular input.
        kw['variant'] = _get_fastq_variant(input_fp)
    else:
        # usually happens when the fasta file is empty
        # so need to return no sequences (and warn)
        msg = "input file %s does not appear to be FASTA or FASTQ" % input_fp
        # Logger.warn is a deprecated alias of Logger.warning.
        logger.warning(msg)
        warnings.warn(msg, UserWarning)
        return

    # some of the test code is using file paths, some is using StringIO.
    # Rewind text handles before the real read.
    if isinstance(input_fp, io.TextIOBase):
        input_fp.seek(0)

    for record in skbio.read(input_fp, format=fmt, **kw):
        yield (record.metadata['id'], str(record))
def _fasta_from_sqlite(conn, input_fasta_fp, output_fasta_fp):
    """Write one representative sequence per cluster to a FASTA file.

    The input sequences are loaded into a ``rep_seqs`` table on `conn`,
    joined with the ``feature_cluster_map`` table that must already exist
    there, and one ``>cluster_id`` / sequence record per cluster is written
    to `output_fasta_fp`.
    """
    records = skbio.read(input_fasta_fp,
                         format='fasta',
                         constructor=skbio.DNA)
    cursor = conn.cursor()

    # Second table, holding the sequences themselves (dummy data shown):
    # feature_id | sequence_string
    # -----------|------------------
    # feature1   | ACGTACGTACGTACGT
    # feature2   | GGGGAAAACCCCTTTT
    # feature3   | TCAGAAAATTTTTCAG
    # feature4   | AAAAAAAAAAAAAAAA
    # feature5   | GGGGGGGGGGGGGGGG
    cursor.execute('CREATE TABLE rep_seqs (feature_id TEXT PRIMARY KEY, '
                   'sequence_string TEXT NOT NULL);')
    rows = [(rec.metadata['id'], str(rec)) for rec in records]
    cursor.executemany('INSERT INTO rep_seqs VALUES (?, ?);', rows)
    conn.commit()

    # Sort up front so that ties can be broken later.  This has to be a
    # table rather than a view because sqlite's rowid is wanted.
    cursor.execute('CREATE TABLE sorted_feature_cluster_map AS '
                   'SELECT * FROM feature_cluster_map ORDER BY cluster_id ASC,'
                   'feature_id ASC;')
    cursor.execute('CREATE INDEX idx2 ON '
                   'sorted_feature_cluster_map(cluster_id, count);')
    conn.commit()

    # Expected shape of the result (dummy data shown):
    # cluster_id | sequence_string
    # -----------|------------------
    # r1         | ACGTACGTACGTACGT
    # r2         | AAAAAAAAAAAAAAAA
    cursor.execute('''SELECT fcm.cluster_id, rs.sequence_string, MAX(fcm.count)
                   FROM sorted_feature_cluster_map fcm
             INNER JOIN rep_seqs rs ON rs.feature_id = fcm.feature_id
               GROUP BY fcm.cluster_id
               ORDER BY fcm.cluster_id ASC;
    ''')

    with open(output_fasta_fp, 'w') as out_fh:
        # Pull the rows 100 at a time until fetchmany returns an empty list.
        for batch in iter(lambda: cursor.fetchmany(size=100), []):
            out_fh.writelines(
                ['>%s\n%s\n' % (cluster_id, sequence)
                 for (cluster_id, sequence, _) in batch])
Example #52
0
    def __init__(self, config):
        """Load the OTU table and tree named in the ``distance`` config section.

        Parameters
        ----------
        config
            Object whose ``get(section, option)`` returns the paths stored
            under ``distance/biom_table`` and ``distance/rep_tree``.

        Attributes set
        --------------
        otu_table, sample_names
            The loaded table and its sample ids.
        tips
            Names of the tips of the midpoint-rooted tree.
        id_mask, masked_ids
            Boolean mask over the observation ids that are tree tips, and
            the ids it selects.
        tree_index
            ``to_array`` output of the tree sheared to ``masked_ids``.
        """
        biom_fp = config.get("distance", "biom_table")
        tree_path = config.get("distance", "rep_tree")

        assert biom_fp and tree_path, \
            "both distance/biom_table and distance/rep_tree must be set"

        self.otu_table = biom.load_table(biom_fp)
        self.sample_names = self.otu_table.ids(axis="sample")

        tree = read(tree_path, format="newick", into=TreeNode).root_at_midpoint()
        self.tips = [tip.name for tip in tree.tips()]

        ids = self.otu_table.ids(axis="observation")
        # Membership is tested once per observation id; a set avoids
        # rescanning the whole tip list every time.
        tip_names = set(self.tips)
        self.id_mask = np.array([id_ in tip_names for id_ in ids], dtype=bool)
        self.masked_ids = ids[self.id_mask]
        tree = tree.shear(self.masked_ids)
        self.tree_index = tree.to_array(nan_length_value=0.0)
Example #53
0
    def test_valid_files(self):
        """Read valid QSEQ fixtures and compare with hand-built sequences.

        Ids are expected in ``metadata`` and quality scores, as ``uint8``,
        in ``positional_metadata``.
        """
        factories = [partial(Sequence), partial(DNA, validate=False),
                     partial(RNA, validate=False),
                     partial(Protein, validate=False)]
        for factory in factories:
            for valid, kwargs, components in self.valid_files:
                for kwarg in kwargs:
                    _drop_kwargs(kwarg, 'constructor', 'filter')

                    # 1-based record number, defaulting to the first one.
                    record = components[kwarg.get('seq_num', 1) - 1]
                    quality = np.array(record[2], np.uint8)
                    expected = factory(
                        record[1],
                        metadata={'id': record[0]},
                        positional_metadata={'quality': quality})

                    # ``read`` needs the class itself, not the partial.
                    observed = read(valid, into=factory.func,
                                    format='qseq', verify=False, **kwarg)
                    self.assertEqual(observed, expected)
Example #54
0
    def _annotate_fp(self, fp, aligner='blastp', evalue=0.001, cpus=1,
                     outfmt='tab', params=None):
        '''Annotate the sequences in the file.

        The databases are searched one after another.  After each search the
        sequences whose ids are not yet among the hits are written to a
        temporary FASTA file, which becomes the query for the next database.

        Parameters
        ----------
        fp : str
            Path of the FASTA file to annotate.
        aligner, evalue, cpus, params
            Forwarded unchanged to ``self.run_blast``.
        outfmt : str
            Passed to ``self.run_view`` as ``--outfmt``; the output is
            always parsed with ``parse_tabular``.

        Returns
        -------
        pandas.DataFrame
            The parsed hits from every database searched, concatenated in
            search order.
        '''

        if self.has_cache():
            # Build cache
            self.cache.build()
            dbs = [self.cache.db] + self.dat
        else:
            dbs = self.dat

        # A set keeps the "already hit" lookup cheap and free of duplicates.
        found = set()
        # ``DataFrame.append`` no longer exists in pandas >= 2.0, so the
        # per-database frames are collected and concatenated once at the end.
        frames = []
        seqs = []
        for db in dbs:
            out_prefix = splitext(basename(db))[0]
            daa_fp = join(self.out_dir, '%s.daa' % out_prefix)
            out_fp = join(self.out_dir, '%s.diamond' % out_prefix)
            self.run_blast(fp, daa_fp, db, aligner=aligner,
                           evalue=evalue, cpus=cpus, params=params)
            self.run_view(daa_fp, out_fp, params={'--outfmt': outfmt})
            hits = self.parse_tabular(out_fp)
            frames.append(hits)

            found.update(hits.index)
            # save to a tmp file the seqs that do not hit current database
            new_fp = join(self.tmp_dir, '%s.fa' % out_prefix)
            with open(new_fp, 'w') as f:
                for seq in read(fp, format='fasta'):
                    if seq.metadata['id'] not in found:
                        seq.write(f, format='fasta')
                        seqs.append(seq)
            # no seq left
            if stat(new_fp).st_size == 0:
                break
            else:
                fp = new_fp

        res = pd.concat(frames) if frames else pd.DataFrame()

        # Update cache (inplace)
        if self.has_cache():
            self.cache.update(seqs)
            self.cache.close()
        return res
Example #55
0
    def test_fastq_to_sequence(self):
        """Read one FASTQ record into each sequence class and compare it."""
        for constructor in (Sequence, DNA, RNA, Protein):
            for valid_files, kwargs, components in self.valid_configurations:
                # An empty file has no specific sequence to read, so the
                # whole configuration is skipped.
                if len(components) == 0:
                    continue

                for valid in valid_files:
                    for observed_kwargs in kwargs:
                        expected_kwargs = {}

                        # TODO:
                        # some of the test files contain characters which are
                        # invalid for RNA, so don't validate for now. Need to
                        # fix this
                        if constructor is RNA:
                            for options in (observed_kwargs, expected_kwargs):
                                options['validate'] = False

                        _drop_kwargs(observed_kwargs, 'constructor')

                        # ``read`` cannot take a partial, so the lowercase
                        # option is threaded through both kwarg dicts instead.
                        if hasattr(constructor, 'lowercase'):
                            for options in (expected_kwargs, observed_kwargs):
                                options['lowercase'] = 'introns'

                        # seq_num is 1-based and defaults to the first record.
                        record = components[
                            observed_kwargs.get('seq_num', 1) - 1]
                        metadata = {'id': record[0],
                                    'description': record[1]}
                        positional = {'quality': np.array(record[3],
                                                          dtype=np.uint8)}
                        expected = constructor(
                            record[2], metadata=metadata,
                            positional_metadata=positional,
                            **expected_kwargs)

                        observed = read(valid, into=constructor,
                                        format='fastq', verify=False,
                                        **observed_kwargs)
                        self.assertEqual(observed, expected)
Example #56
0
    def test_fastq_to_sequence(self):
        """Read one FASTQ record into each legacy sequence class."""
        sequence_classes = (BiologicalSequence, NucleotideSequence,
                            DNASequence, RNASequence, ProteinSequence)
        for constructor in sequence_classes:
            for valid, kwargs, components in self.valid_files:
                # Nothing can be picked out of an empty file.
                if len(components) == 0:
                    continue

                for kwarg in kwargs:
                    _drop_kwargs(kwarg, 'constructor')

                    # seq_num is 1-based and defaults to the first record.
                    position = kwarg.get('seq_num', 1) - 1
                    record = components[position]
                    expected = constructor(record[2],
                                           id=record[0],
                                           description=record[1],
                                           quality=record[3])

                    observed = read(valid, into=constructor, format='fastq',
                                    verify=False, **kwarg)
                    self.assertTrue(observed.equals(expected))
Example #57
0
    def setUp(self):
        """Prepare the fixture paths and temporary directories.

        Sets ``self.test_dir`` to the absolute path of the uniref100 test
        data directory, creates two temporary directories (``self.tmp`` and
        ``self.obs_tmp``), and writes the FASTA records read from the
        Swiss-Prot bacteria file into ``self.test1`` inside ``self.tmp``.
        """
        self.test_dir = abspath(
            join('micronota', 'db', 'tests', 'data', 'uniref', 'uniref100'))
        file_names = [
            'Swiss-Prot_Archaea.fna',
            'Swiss-Prot_Bacteria.fna',
            'Swiss-Prot_Eukaryota.fna',
            'Swiss-Prot_Viruses.fna',
            'TrEMBL_Archaea.fna',
            'TrEMBL_Bacteria.fna',
            'TrEMBL_Eukaryota.fna',
            'TrEMBL_Viruses.fna']
        file_paths = [join(self.test_dir, name) for name in file_names]

        self.tmp = mkdtemp()
        self.test1 = join(self.tmp, 'test1.fna')
        self.test1_exp = 'test1.genbank'

        # Only the second file (Swiss-Prot bacteria) is copied into the
        # temporary input file; the other paths are not used here.
        source_fp = file_paths[1]
        with open(self.test1, 'w') as out:
            for record in read(source_fp, format='fasta'):
                write(record, format='fasta', into=out)

        self.obs_tmp = mkdtemp()
Example #58
0
    def parse_sam(diamond_res):
        '''Collect the records of a diamond SAM output into a table.

        Parameters
        ----------
        diamond_res : str
            file path

        Returns
        -------
        pandas.DataFrame
            One row per record yielded by ``read(diamond_res,
            format='sam')``, with the columns ``qseqid``, ``sseqid``,
            ``pident``, ``qlen``, ``mismatch``, ``qstart``, ``sstart``,
            ``evalue``, ``bitscore`` and ``sseq``. The frame is empty (but
            still has these columns) when ``read`` raises
            ``StopIteration``.
        '''
        columns = ['qseqid', 'sseqid', 'pident', 'qlen', 'mismatch',
                   'qstart', 'sstart', 'evalue', 'bitscore', 'sseq']
        # Metadata key feeding each column, in column order. The last
        # column ('sseq') is the record's own string form and has no key.
        # NOTE(review): 'mismatch' is filled from the CIGAR field, not from
        # a mismatch count -- confirm this is intended.
        metadata_keys = ('QNAME', 'RNAME', 'ZI', 'ZL', 'CIGAR',
                         'POS', 'ZS', 'ZE', 'ZR')

        df = pd.DataFrame(columns=columns)
        try:
            records = read(diamond_res, format='sam')
        except StopIteration:
            return df

        for row_idx, record in enumerate(records):
            sseq = str(record)
            values = [record.metadata[key] for key in metadata_keys]
            values.append(sseq)
            df.loc[row_idx] = pd.Series(values, index=columns)
        return df