Ejemplos de DNAFASTAFormat.DNAFASTAFormat en Python, ejemplos de q2_types.feature_data.DNAFASTAFormat.DNAFASTAFormat en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: _chimera.py Proyecto: jcmcnch/q2-vsearch

def _uchime_denovo(sequences, table, dn, mindiffs, mindiv, minh, xn):
    """Run ``vsearch --uchime_denovo`` and collect its outputs.

    This function only exists to simplify testing: besides the result
    formats it also returns the exact command line that was executed.

    Returns:
        tuple: ``(cmd, chimeras, nonchimeras, uchime_stats)``.
    """
    chimeras = DNAFASTAFormat()
    nonchimeras = DNAFASTAFormat()
    uchime_stats = UchimeStatsFmt()
    with tempfile.NamedTemporaryFile() as fasta_with_sizes, \
            tempfile.NamedTemporaryFile() as temp_chimeras:
        _fasta_with_sizes(str(sequences), fasta_with_sizes.name, table)

        # input / output locations
        cmd = ['vsearch',
               '--uchime_denovo', fasta_with_sizes.name,
               '--uchimeout', str(uchime_stats),
               '--nonchimeras', str(nonchimeras),
               '--chimeras', temp_chimeras.name]
        # caller-supplied scoring parameters, passed through as strings
        scoring_options = (('--dn', dn),
                           ('--mindiffs', mindiffs),
                           ('--mindiv', mindiv),
                           ('--minh', minh),
                           ('--xn', xn))
        for flag, value in scoring_options:
            cmd += [flag, str(value)]
        # fixed options
        cmd += ['--qmask', 'none',  # ensures no lowercase DNA chars
                '--xsize',
                '--minseqlength', '1',
                '--fasta_width', '0']

        run_command(cmd)
        # this processing step should be removed, pending fix of:
        # https://github.com/qiime2/q2-vsearch/issues/39
        _fix_chimera_ids(temp_chimeras, chimeras)

    return cmd, chimeras, nonchimeras, uchime_stats

Ejemplo n.º 2

0

Mostrar archivo

def filter_seqs_length_by_taxon(
        sequences: DNAFASTAFormat,
        taxonomy: pd.Series,
        labels: str,
        min_lens: int = None,
        max_lens: int = None,
        global_min: int = None,
        global_max: int = None) -> (DNAFASTAFormat, DNAFASTAFormat):
    """Split sequences into passing/failing sets using taxon length limits.

    Each entry of ``labels`` is paired positionally with the entry of
    ``min_lens`` and/or ``max_lens`` at the same index. A sequence whose
    taxonomy string contains none of the labels is kept as-is when no global
    limit is set; every other sequence is kept only if
    ``_seq_length_within_range`` accepts it.

    Returns:
        tuple: ``(result, failures)`` -- sequences that were kept and
        sequences that were rejected.

    Raises:
        ValueError: if neither ``min_lens`` nor ``max_lens`` is given, or if
            either one differs in length from ``labels``.
    """
    # At least one of the per-taxon thresholds must be supplied.
    if min_lens is None and max_lens is None:
        raise ValueError(ERROR_FILTER_OPTIONS + 'min_lens, max_lens.')

    # Validate that all seqIDs are present in taxonomy. We view as
    # DNAIterator to take a first pass (should take a few seconds) as initial
    # validation before performing length filtering.
    seq_ids = {i.metadata['id'] for i in sequences.view(DNAIterator)}
    _index_is_superset(seq_ids, set(taxonomy.index))

    # Map each label to its threshold(s).
    mins = None
    if min_lens is not None:
        if len(labels) != len(min_lens):
            raise ValueError(
                'labels and min_lens must contain the same number of elements')
        mins = dict(zip(labels, min_lens))

    maxs = None
    if max_lens is not None:
        if len(labels) != len(max_lens):
            raise ValueError(
                'labels and max_lens must contain the same number of elements')
        maxs = dict(zip(labels, max_lens))

    no_global_limits = global_min is None and global_max is None

    # Stream seqs, apply filter(s)
    result = DNAFASTAFormat()
    failures = DNAFASTAFormat()
    with result.open() as out_fasta, failures.open() as out_failed:
        for seq in sequences.view(DNAIterator):
            # taxon is required, we always use taxon-based filtering
            taxon = taxonomy[seq.metadata['id']]
            # NOTE: all matching search terms are passed on to
            # _seq_length_within_range; that function determines and applies
            # the most stringent matching length thresholds.
            taxahits = [t for t in labels if t in taxon]
            # No taxahits and no global filters: keep without measuring.
            # Otherwise the length check decides.
            keep = (
                (not any(taxahits) and no_global_limits)
                or _seq_length_within_range(seq, taxahits, mins, maxs,
                                            global_min, global_max))
            seq.write(out_fasta if keep else out_failed)
    return result, failures

Ejemplo n.º 3

0

Mostrar archivo

Archivo: test_cluster_features.py Proyecto: thermokarst-forks/q2-vsearch

 def setUp(self):
     """Load the FASTA fixtures and build the feature table for the tests."""
     super().setUp()
     query_fp = self.get_data_path('dna-sequences-1.fasta')
     self.input_sequences = DNAFASTAFormat(query_fp, mode='r')
     first_ref_fp = self.get_data_path('ref-sequences-1.fasta')
     self.ref_sequences_1 = DNAFASTAFormat(first_ref_fp, mode='r')
     second_ref_fp = self.get_data_path('ref-sequences-2.fasta')
     self.ref_sequences_2 = DNAFASTAFormat(second_ref_fp, mode='r')

     # one row of counts per feature id, one column per sample id
     counts = np.array([[100, 101, 103], [1, 1, 2], [4, 5, 6], [7, 8, 9]])
     feature_ids = ['feature1', 'feature2', 'feature3', 'feature4']
     sample_ids = ['sample1', 'sample2', 'sample3']
     self.input_table = biom.Table(counts, feature_ids, sample_ids)

     self.input_sequences_list = _read_seqs(self.input_sequences)

Ejemplo n.º 4

0

Mostrar archivo

    def test_uchime_ref_no_chimeras(self):
        """uchime_ref against ref-sequences-4 reports no chimeras at all."""
        reference = DNAFASTAFormat(
            self.get_data_path('ref-sequences-4.fasta'), mode='r')
        with redirected_stdio(stderr=os.devnull):
            chime, nonchime, stats = uchime_ref(
                sequences=self.input_sequences,
                table=self.input_table,
                reference_sequences=reference)

        # the chimera output must be empty
        observed_chimeras = _read_seqs(chime)
        self.assertEqual(observed_chimeras, [])

        # sequences are reverse-sorted by abundance in output
        observed_nonchimeras = _read_seqs(nonchime)
        expected_nonchimeras = [
            self.input_sequences_list[position] for position in range(4)]
        self.assertEqual(observed_nonchimeras, expected_nonchimeras)

        # every feature id appears in the stats, which has four non-empty
        # lines
        with stats.open() as stats_fh:
            stats_text = stats_fh.read()
        for feature_id in ('feature1', 'feature2', 'feature3', 'feature4'):
            self.assertTrue(feature_id in stats_text)
        nonempty_lines = [row for row in stats_text.split('\n') if row]
        self.assertEqual(len(nonempty_lines), 4)

Ejemplo n.º 5

0

Mostrar archivo

Archivo: test_mafft.py Proyecto: qiime2/q2-alignment

    def _prepare_sequence_data(self):
        """Return ``(alignment, sequences, exp)`` fixtures for the tests.

        ``exp`` is the four-row ``skbio.TabularMSA`` the tests compare
        against.
        """
        unaligned_fp = self.get_data_path('unaligned-dna-sequences-1.fasta')
        sequences = DNAFASTAFormat(unaligned_fp, mode='r')
        aligned_fp = self.get_data_path('aligned-dna-sequences-1.fasta')
        alignment = AlignedDNAFASTAFormat(aligned_fp, mode='r')

        # (residues, id) for each expected row, in alignment order
        expected_rows = (
            ('AGGGGG-', 'aln-seq-1'),
            ('AGGGGGG', 'aln-seq-2'),
            ('AGGGGGG', 'seq1'),
            ('-GGGGGG', 'seq2'),
        )
        exp = skbio.TabularMSA([
            skbio.DNA(residues, metadata={'id': seq_id, 'description': ''})
            for residues, seq_id in expected_rows
        ])

        return alignment, sequences, exp

Ejemplo n.º 6

0

Mostrar archivo

Archivo: test_format.py Proyecto: Oddant1/q2-types

    def test_dna_fasta_format_empty_file(self):
        """A file holding a single newline validates as DNAFASTAFormat."""
        empty_fp = os.path.join(self.temp_dir.name, 'empty')
        with open(empty_fp, 'w') as handle:
            handle.write('\n')
        fasta_fmt = DNAFASTAFormat(empty_fp, mode='r')

        # an exception from validate() is what would fail this test
        fasta_fmt.validate()

Ejemplo n.º 7

0

Mostrar archivo

Archivo: test_mafft.py Proyecto: qiime2/q2-alignment

    def test_duplicate_input_ids(self):
        """mafft raises ValueError for the duplicate-ids fixture."""
        duplicated_fp = self.get_data_path('unaligned-duplicate-ids.fasta')
        duplicated = DNAFASTAFormat(duplicated_fp, mode='r')

        with self.assertRaisesRegex(ValueError, 'the unaligned.*id1'), \
                redirected_stdio(stderr=os.devnull):
            mafft(duplicated)

Ejemplo n.º 8

0

Mostrar archivo

Archivo: test_format.py Proyecto: Oddant1/q2-types

    def test_dna_fasta_format_invalid_characters(self):
        """validate() rejects the not-dna-sequences fixture."""
        fasta_fmt = DNAFASTAFormat(
            self.get_data_path('not-dna-sequences.fasta'), mode='r')

        expected_message = "Invalid character '1' .*0 on line 2"
        with self.assertRaisesRegex(ValidationError, expected_message):
            fasta_fmt.validate()

Ejemplo n.º 9

0

Mostrar archivo

Archivo: test_format.py Proyecto: Oddant1/q2-types

    def test_dna_fasta_format_consecutive_IDs(self):
        """validate() rejects the consecutive-ids fixture."""
        fasta_fmt = DNAFASTAFormat(
            self.get_data_path('dna-sequences-consecutive-ids.fasta'),
            mode='r')

        expected_message = 'consecutive descriptions.*1'
        with self.assertRaisesRegex(ValidationError, expected_message):
            fasta_fmt.validate()

Ejemplo n.º 10

0

Mostrar archivo

Archivo: test_format.py Proyecto: Oddant1/q2-types

    def test_dna_fasta_format_id_starts_with_space(self):
        """validate() rejects the id-starts-with-space fixture."""
        fixture_fp = self.get_data_path(
            'dna-sequences-id-starts-with-space.fasta')
        fasta_fmt = DNAFASTAFormat(fixture_fp, mode='r')

        expected_message = '1 starts with a space'
        with self.assertRaisesRegex(ValidationError, expected_message):
            fasta_fmt.validate()

Ejemplo n.º 11

0

Mostrar archivo

def _process_primers(primer_fwd: Union[str, None],
                     primer_rev: Union[str, None]) -> DNAFASTAFormat:
    """
    Write the provided primers to a FASTA file as skbio DNA records.

    The forward primer is written under the id ``forward``; the reverse
    primer is reverse complemented first and written under the id
    ``reverse``. A primer that is ``None`` or empty is skipped.

    Arguments:
        primer_fwd (str, None): forward primer
        primer_rev (str, None): reverse primer

    Returns:
        primers_fasta (DNAFASTAFormat): primers in FASTA format
    """
    forward = None
    if primer_fwd:
        forward = DNA(primer_fwd, metadata={'id': 'forward'})

    reverse = None
    if primer_rev:
        reverse = DNA(
            primer_rev, metadata={'id': 'reverse'}).reverse_complement()

    # save primers in that format to pass them to mafft_add
    primers_fasta = DNAFASTAFormat()
    with primers_fasta.open() as out:
        for primer in (forward, reverse):
            if primer:
                primer.write(out)

    return primers_fasta

Ejemplo n.º 12

0

Mostrar archivo

Archivo: test_consensus_assignment.py Proyecto: nbokulich/q2-feature-classifier

    def setUp(self):
        """Import taxonomy and reads, fit a classifier, build query seqs."""
        super().setUp()
        taxonomy_artifact = Artifact.import_data(
            'FeatureData[Taxonomy]', self.get_data_path('taxonomy.tsv'))
        self.taxonomy = taxonomy_artifact.view(pd.Series)
        self.taxartifact = taxonomy_artifact

        # TODO: use `Artifact.import_data` here once we have a transformer
        # for DNASequencesDirectoryFormat -> DNAFASTAFormat
        reads_format = DNAFASTAFormat(
            self.get_data_path('se-dna-sequences.fasta'), mode='r')
        self.reads = Artifact.import_data('FeatureData[Sequence]',
                                          reads_format)

        # the fit method is chosen by the name of the first specific fitter
        fit_method_name = 'fit_classifier_' + _specific_fitters[0][0]
        fit_method = getattr(feature_classifier.methods, fit_method_name)
        self.classifier = fit_method(self.reads, self.taxartifact).classifier

        query_sequences = pd.Series({
            'A':
            'GCCTAACACATGCAAGTCGAACGGCAGCGGGGGAAAGCTTGCTTTCCTGCCGGCGA',
            'B':
            'TAACACATGCAAGTCAACGATGCTTATGTAGCAATATGTAAGTAGAGTGGCGCACG',
            'C':
            'ATACATGCAAGTCGTACGGTATTCCGGTTTCGGCCGGGAGAGAGTGGCGGATGGGT',
            'D':
            'GACGAACGCTGGCGACGTGCTTAACACATGCAAGTCGTGCGAGGACGGGCGGTGCT'
            'TGCACTGCTCGAGCCGAGCGGCGGACGGGTGAGTAACACGTGAGCAACCTATCTCC'
            'GTGCGGGGGACAACCCGGGGAAACCCGGGCTAATACCG'
        })
        self.query = Artifact.import_data('FeatureData[Sequence]',
                                          query_sequences)

Ejemplo n.º 13

0

Mostrar archivo

def cluster_features_de_novo(sequences: DNAFASTAFormat, table: biom.Table,
                             perc_identity: float, threads: int = 1
                             ) -> (biom.Table, DNAFASTAFormat):
    """Cluster features with ``vsearch --cluster_size``.

    The centroid sequences are written to a new ``DNAFASTAFormat``, and the
    feature table is collapsed along its observation axis with the function
    derived from vsearch's ``--uc`` output.

    Returns:
        tuple: ``(collapsed table, clustered sequences)``.
    """
    clustered_sequences = DNAFASTAFormat()
    with tempfile.NamedTemporaryFile() as fasta_with_sizes, \
            tempfile.NamedTemporaryFile() as out_uc:
        _fasta_with_sizes(str(sequences), fasta_with_sizes.name, table)
        cmd = [
            'vsearch',
            '--cluster_size', fasta_with_sizes.name,
            '--id', str(perc_identity),
            '--centroids', str(clustered_sequences),
            '--uc', out_uc.name,
            '--qmask', 'none',  # ensures no lowercase DNA chars
            '--xsize',
            '--threads', str(threads),
        ]
        run_command(cmd)

        # rewind the handle before passing it to _uc_to_sqlite
        out_uc.seek(0)
        uc_db = _uc_to_sqlite(out_uc)
        collapse_f = _collapse_f_from_sqlite(uc_db)

    collapsed_table = table.collapse(
        collapse_f,
        norm=False,
        min_group_size=1,
        axis='observation',
        include_collapsed_metadata=False)

    return collapsed_table, clustered_sequences

Ejemplo n.º 14

0

Mostrar archivo

Archivo: orient.py Proyecto: thermokarst-forks/RESCRIPt

def orient_seqs(
    sequences: DNAFASTAFormat,
    reference_sequences: DNAFASTAFormat,
    perc_identity: float = 0.9,
    query_cov: float = 0.9,
    threads: int = 1,
    left_justify: bool = False,
) -> (DNAFASTAFormat, DNAFASTAFormat):
    """Orient query sequences to match a reference database.

    Runs ``vsearch --usearch_global`` on both strands and reads the reported
    strand (``qstrand``) of each hit. Matched sequences reported as ``-`` are
    reverse complemented; sequences reported as ``+`` are written unchanged.

    Returns:
        tuple: ``(matched, notmatched)``.
    """
    matched_temp = DNAFASTAFormat()
    notmatched = DNAFASTAFormat()
    # use vsearch to search query seqs against reference database
    # report orientation of query seqs relative to reference seqs.
    with tempfile.NamedTemporaryFile() as out:
        # note: qmask is disabled as DNAFASTAFormat requires all output seqs
        # to be uppercase. Could loop through output seqs to convert to upper
        # but which is faster: disabling masking or looping through with skbio?
        cmd = [
            'vsearch',
            '--usearch_global', str(sequences),
            '--matched', str(matched_temp),
            '--notmatched', str(notmatched),
            '--db', str(reference_sequences),
            '--id', str(perc_identity),
            '--maxaccepts', '1',
            '--strand', 'both',
            '--qmask', 'none',
            '--query_cov', str(query_cov),
            '--threads', str(threads),
            '--userfields', 'qstrand',
            '--userout', out.name,
        ]
        if left_justify:
            cmd.append('--leftjust')
        run_command(cmd)
        with open(out.name, 'r') as strand_report:
            orientations = [row.strip() for row in strand_report]

    # nothing is in reverse orientation: vsearch's output is usable as-is
    if '-' not in orientations:
        return matched_temp, notmatched

    # some query seqs are in reverse orientation: reverse complement those
    matched = DNAFASTAFormat()
    with matched.open() as out_fasta:
        oriented_pairs = zip(matched_temp.view(DNAIterator), orientations)
        for seq, orientation in oriented_pairs:
            if orientation == '+':
                seq.write(out_fasta)
            elif orientation == '-':
                seq.reverse_complement().write(out_fasta)

    return matched, notmatched

Ejemplo n.º 15

0

Mostrar archivo

 def setUp(self):
     """Open the aligned and unaligned trim fixtures as sequence iterators."""
     super().setUp()
     alignment_fp = self.get_data_path('trim-test-alignment.fasta')
     sequences_fp = self.get_data_path('trim-test-sequences.fasta')

     aligned_format = AlignedDNAFASTAFormat(alignment_fp, mode='r')
     self.alignedseqs = aligned_format.view(AlignedDNAIterator)

     unaligned_format = DNAFASTAFormat(sequences_fp, mode='r')
     self.seqs = unaligned_format.view(DNAIterator)

Ejemplo n.º 16

0

Mostrar archivo

Archivo: _call_otus.py Proyecto: cduvallet/q2-dbotu

def call_otus(table: pd.DataFrame,
              sequences: DNAFASTAFormat,
              gen_crit: float = 0.1,
              abund_crit: float = 10.0,
              pval_crit: float = 0.0005) -> (pd.DataFrame, DNAFASTAFormat):
    '''
    Read in input files, call OTUs, and return output feature table.

    table: pandas Dataframe
      sequence count table
    sequences: DNAFASTAFormat
      sequences fasta
    gen_crit, abund_crit, pval_crit: float
      threshold values for genetic criterion, abundance criterion, and
      distribution criterion (pvalue)

    Returns a tuple ``(dbotu_table, clustered_sequences)``: the OTU table
    and the representative sequences as a DNAFASTAFormat.

    Raises AssertionError if a threshold is out of range (``gen_crit`` and
    ``abund_crit`` must be >= 0, ``pval_crit`` must lie in [0, 1]).
    '''

    # ensure valid argument values
    # NOTE: asserts are stripped under `python -O`; they are kept so the
    # exception type seen by existing callers does not change.
    assert gen_crit >= 0
    assert abund_crit >= 0.0
    assert 0.0 <= pval_crit <= 1.0

    # set up the input fasta records
    # Note: calling str(DNAFastaFormat) returns the file path of the fasta
    records = SeqIO.index(str(sequences), 'fasta')

    # generate the caller object
    # Note: the dbotu code needs sequences in rows and samples in columns.
    # qiime feature tables have sequences in columns and samples in rows.
    # need to transpose the table when calling dbotu caller and before writing
    # results
    caller = dbotu.DBCaller(table.T,
                            records,
                            gen_crit,
                            abund_crit,
                            pval_crit,
                            log=None,
                            debug=None)

    # Call OTUs
    caller.run()

    # Get OTU table and sequences
    # Need to transpose to get back into qiime format
    dbotu_table = caller.otu_table().T

    # Write the representative sequences into a new DNAFASTAFormat.
    # The handle is opened in a `with` block so it is flushed and closed
    # before the format is returned, rather than left to garbage collection.
    clustered_sequences = DNAFASTAFormat()
    with open(str(clustered_sequences), 'w') as fasta_fh:
        caller.write_fasta(fasta_fh)

    # Print the membership (only shows up if --verbose flag is used)
    caller.write_membership(sys.stdout)

    return dbotu_table, clustered_sequences

Ejemplo n.º 17

0

Mostrar archivo

Archivo: test_mafft.py Proyecto: qiime2/q2-alignment

    def test_duplicate_input_ids_in_unaligned(self):
        """mafft_add raises ValueError for the duplicate-ids fixture."""
        duplicated_fp = self.get_data_path('unaligned-duplicate-ids.fasta')
        duplicated = DNAFASTAFormat(duplicated_fp, mode='r')

        # only the alignment from the shared fixtures is needed here
        alignment = self._prepare_sequence_data()[0]

        with self.assertRaisesRegex(ValueError, 'the unaligned.*id1'), \
                redirected_stdio(stderr=os.devnull):
            mafft_add(alignment, duplicated)

Ejemplo n.º 18

0

Mostrar archivo

Archivo: test_consensus_assignment.py Proyecto: nbokulich/q2-feature-classifier

 def setUp(self):
     """Load the taxonomy fixture as a Series and open the reads FASTA."""
     super().setUp()
     taxonomy_artifact = Artifact.import_data(
         'FeatureData[Taxonomy]', self.get_data_path('taxonomy.tsv'))
     self.taxonomy = taxonomy_artifact.view(pd.Series)

     # TODO: use `Artifact.import_data` here once we have a transformer
     # for DNASequencesDirectoryFormat -> DNAFASTAFormat
     self.reads_fp = self.get_data_path('se-dna-sequences.fasta')
     self.reads = DNAFASTAFormat(self.reads_fp, mode='r')

Ejemplo n.º 19

0

Mostrar archivo

    def _prepare_sequence_data(self):
        """Return the unaligned input format and the expected two-row MSA."""
        unaligned_fp = self.get_data_path('unaligned-dna-sequences-1.fasta')
        input_sequences = DNAFASTAFormat(unaligned_fp, mode='r')

        # (residues, id) for each expected row, in alignment order
        expected_rows = (('AGGGGGG', 'seq1'), ('-GGGGGG', 'seq2'))
        exp = skbio.TabularMSA([
            skbio.DNA(residues, metadata={'id': seq_id, 'description': ''})
            for residues, seq_id in expected_rows
        ])

        return input_sequences, exp

Ejemplo n.º 20

0

Mostrar archivo

Archivo: test_mafft.py Proyecto: qiime2/q2-alignment

    def test_duplicate_input_ids_across_aligned_and_unaligned(self):
        """mafft_add raises ValueError for the aligned-duplicate-ids-2 input."""
        clashing_fp = self.get_data_path('aligned-duplicate-ids-2.fasta')
        alignment = DNAFASTAFormat(clashing_fp, mode='r')

        # only the unaligned sequences from the shared fixtures are needed
        sequences = self._prepare_sequence_data()[1]

        with self.assertRaisesRegex(ValueError,
                                    'aligned and unaligned.*seq1'), \
                redirected_stdio(stderr=os.devnull):
            mafft_add(alignment, sequences)

Ejemplo n.º 21

0

Mostrar archivo

Archivo: test_mafft.py Proyecto: qiime2/q2-alignment

 def test_mafft_parttree_exception(self):
     """mafft raises ValueError mentioning '1 million' for 1,000,002 seqs."""
     oversized_fp = os.path.join(self.temp_dir.name, 'million.fasta')
     with open(oversized_fp, "w") as fasta_fh:
         # 1,000,002 identical records whose ids are the integers 0..1000001
         fasta_fh.writelines(
             '>%d\nAAGCAAGC\n' % record_number
             for record_number in range(1000002))
     oversized = DNAFASTAFormat(oversized_fp, mode='r')

     with self.assertRaisesRegex(ValueError, '1 million'), \
             redirected_stdio(stderr=os.devnull):
         mafft(oversized)

Ejemplo n.º 22

0

Mostrar archivo

    def setUp(self):
        """Load both focal/context sequence sets and their metadata files."""
        super().setUp()

        # fixture set 1
        focal_fp = self.get_data_path('focal-seqs-1.fasta')
        self.focal_seqs1 = DNAFASTAFormat(focal_fp, 'r')
        context_fp = self.get_data_path('context-seqs-1.fasta')
        self.context_seqs1 = DNAFASTAFormat(context_fp, 'r')
        metadata_fp = self.get_data_path('context-metadata-1.tsv')
        self.context_md1 = qiime2.Metadata.load(metadata_fp)

        # fixture set 2
        focal_fp = self.get_data_path('focal-seqs-2.fasta')
        self.focal_seqs2 = DNAFASTAFormat(focal_fp, 'r')
        context_fp = self.get_data_path('context-seqs-2.fasta')
        self.context_seqs2 = DNAFASTAFormat(context_fp, 'r')
        metadata_fp = self.get_data_path('context-metadata-2.tsv')
        self.context_md2 = qiime2.Metadata.load(metadata_fp)

Ejemplo n.º 23

0

Mostrar archivo

Archivo: cross_validate.py Proyecto: mikerobeson/RESCRIPt

def _split_fasta(sequences, train_ids, test_ids):
    '''
    Split FeatureData[Sequence] artifact into two, based on two sets of IDs.

    A sequence whose id is in ``train_ids`` goes to the training output;
    otherwise, if its id is in ``test_ids``, it goes to the test output.
    Sequences in neither set are dropped.

    sequences: FeatureData[Sequence] Artifact
    train_ids: set
    test_ids: set

    Returns the training and test sequences, in that order, each imported as
    a FeatureData[Sequence] artifact.
    '''
    train_fasta = DNAFASTAFormat()
    test_fasta = DNAFASTAFormat()
    with train_fasta.open() as _train, test_fasta.open() as _test:
        for record in sequences.view(DNAIterator):
            seq_id = record.metadata['id']
            # train_ids takes precedence if an id is present in both sets
            if seq_id in train_ids:
                destination = _train
            elif seq_id in test_ids:
                destination = _test
            else:
                continue
            destination.write('>%s\n%s\n' % (seq_id, str(record)))

    train_seqs = q2.Artifact.import_data('FeatureData[Sequence]', train_fasta)
    test_seqs = q2.Artifact.import_data('FeatureData[Sequence]', test_fasta)
    return train_seqs, test_seqs

Ejemplo n.º 24

0

Mostrar archivo

Archivo: test_mafft.py Proyecto: qiime2/q2-alignment

 def test_failed_run_not_verbose(self):
     """run_command raises CalledProcessError for an unknown mafft flag."""
     fixture_fp = self.get_data_path('unaligned-dna-sequences-1.fasta')
     unaligned_fp = str(DNAFASTAFormat(fixture_fp, mode='r'))
     aligned_fp = str(AlignedDNAFASTAFormat())

     bad_cmd = ["mafft", "--not-a-real-parameter", unaligned_fp]
     with self.assertRaises(subprocess.CalledProcessError), \
             redirected_stdio(stderr=os.devnull):
         run_command(bad_cmd, aligned_fp, verbose=False)

Ejemplo n.º 25

0

Mostrar archivo

    def setUp(self):
        """Load the dna-sequences-3 fixture and build the feature table."""
        super().setUp()
        fixture_fp = self.get_data_path('dna-sequences-3.fasta')
        self.input_sequences = DNAFASTAFormat(fixture_fp, mode='r')
        self.input_sequences_list = _read_seqs(self.input_sequences)

        # one row of counts per feature id, one column per sample id
        counts = np.array(
            [[100, 101, 103], [99, 98, 99], [4, 5, 6], [2, 2, 2]])
        feature_ids = ['feature1', 'feature2', 'feature3', 'feature4']
        sample_ids = ['sample1', 'sample2', 'sample3']
        self.input_table = biom.Table(counts, feature_ids, sample_ids)

Ejemplo n.º 26

0

Mostrar archivo

def _4(fmt: GISAIDDNAFASTAFormat) -> DNASequencesDirectoryFormat:
    """Convert a GISAID DNA FASTA file to a DNASequencesDirectoryFormat.

    The records read by ``_read_gisaid_dna_fasta`` are rewritten as FASTA
    into a new ``DNAFASTAFormat``, which is then stored in the directory
    format.
    """
    records = _read_gisaid_dna_fasta(str(fmt))
    directory_format = DNASequencesDirectoryFormat()
    fasta_format = DNAFASTAFormat()

    with fasta_format.open() as fasta_fh:
        skbio.io.write(records, format='fasta', into=fasta_fh)

    directory_format.file.write_data(fasta_format, DNAFASTAFormat)
    return directory_format

Ejemplo n.º 27

0

Mostrar archivo

Archivo: _chimera.py Proyecto: jakereps/q2-vsearch

def _uchime_ref(sequences, table, reference_sequences, dn, mindiffs, mindiv,
                minh, xn, threads):
    """Run ``vsearch --uchime_ref`` and collect its outputs.

    This function only exists to simplify testing: besides the result
    formats it also returns the exact command line that was executed.

    Returns:
        tuple: ``(cmd, chimeras, nonchimeras, uchime_stats)``.
    """
    chimeras = DNAFASTAFormat()
    nonchimeras = DNAFASTAFormat()
    uchime_stats = UchimeStatsFmt()
    with tempfile.NamedTemporaryFile() as fasta_with_sizes:
        _fasta_with_sizes(str(sequences), fasta_with_sizes.name, table)

        # input / output locations
        cmd = ['vsearch',
               '--uchime_ref', fasta_with_sizes.name,
               '--uchimeout', str(uchime_stats),
               '--nonchimeras', str(nonchimeras),
               '--chimeras', str(chimeras)]
        # caller-supplied scoring parameters, passed through as strings
        scoring_options = (('--dn', dn),
                           ('--mindiffs', mindiffs),
                           ('--mindiv', mindiv),
                           ('--minh', minh),
                           ('--xn', xn))
        for flag, value in scoring_options:
            cmd += [flag, str(value)]
        # reference database and fixed options
        cmd += ['--db', str(reference_sequences),
                '--qmask', 'none',  # ensures no lowercase DNA chars
                '--xsize',
                '--threads', str(threads),
                '--minseqlength', '1',
                '--fasta_width', '0']

        run_command(cmd)

    return cmd, chimeras, nonchimeras, uchime_stats

Ejemplo n.º 28

0

Mostrar archivo

def _4(fmt: GISAIDDNAFASTAFormat) -> DNASequencesDirectoryFormat:
    """Convert a GISAID DNA FASTA file to a DNASequencesDirectoryFormat.

    ``_read_gisaid_dna_fasta`` is handed a scratch temporary file, and the
    records it returns are written as FASTA while that file is still open.
    """
    directory_format = DNASequencesDirectoryFormat()
    fasta_format = DNAFASTAFormat()

    with fasta_format.open() as fasta_fh, \
            tempfile.TemporaryFile(mode='w+') as scratch_fh:
        records = _read_gisaid_dna_fasta(str(fmt), scratch_fh)
        skbio.io.write(records, format='fasta', into=fasta_fh)

    directory_format.file.write_data(fasta_format, DNAFASTAFormat)
    return directory_format

Ejemplo n.º 29

0

Mostrar archivo

Archivo: degap.py Proyecto: mikerobeson/RESCRIPt

def degap_seqs(aligned_sequences: AlignedDNAIterator,
               min_length: int = 1) -> DNAFASTAFormat:
    """Remove gaps from aligned sequences, dropping the ones left too short.

    Each sequence is degapped and written out only if its degapped length is
    at least ``min_length``.

    Returns:
        DNAFASTAFormat: the degapped sequences that were kept.
    """
    result = DNAFASTAFormat()
    with result.open() as out_fasta:
        for aligned in aligned_sequences:
            ungapped = aligned.degap()
            # If seq is all gaps, then the degapped seq is empty and, with
            # the default min_length of 1, is not written out.
            if len(ungapped) < min_length:
                continue
            ungapped.write(out_fasta)
    return result

Ejemplo n.º 30

0

Mostrar archivo

Archivo: screenseq.py Proyecto: thermokarst-forks/RESCRIPt

def cull_seqs(sequences: DNAIterator, num_degenerates: int = 5,
              homopolymer_length: int = 8) -> DNAFASTAFormat:
    """Keep only the sequences that pass both filter helpers.

    A sequence is skipped as soon as ``_filt_seq_with_degenerates`` returns
    a truthy value for it; the homopolymer check runs only on sequences that
    passed the first check.

    Returns:
        DNAFASTAFormat: the sequences not flagged by either helper.
    """
    result = DNAFASTAFormat()
    with result.open() as out_fasta:
        for seq in sequences:
            if _filt_seq_with_degenerates(seq, num_degenerates):
                continue
            if _filter_homopolymer(seq, homopolymer_length):
                continue
            # if we make it here, write seq to file
            seq.write(out_fasta)
    return result