def setup(self):
        """Prepare the fixture: a temp copy of test.fa opened through screed.

        Sets ``self._testfile`` (temp path), ``self._db`` (ScreedDB opened on
        that path) and ``self._ns`` (a ``nostring`` instance).
        """
        scratch = utils.get_temp_filename('test.fa')
        self._testfile = scratch
        shutil.copy(utils.get_test_data('test.fa'), scratch)

        # read_fasta_sequences must run before ScreedDB is opened on the path.
        screed.read_fasta_sequences(scratch)
        self._db = screed.ScreedDB(scratch)
        self._ns = nostring()
 def setup(self):
     """Round-trip test.fa through screed.ToFasta and open the result.

     Sets ``self._fileName``, ``self._testfa`` and ``self.db``.
     """
     here = os.path.dirname(__file__)
     recovered = os.path.join(here, 'fastaRecovery')
     source = os.path.join(here, 'test.fa')
     self._fileName = recovered
     self._testfa = source

     screed.read_fasta_sequences(source)
     screed.ToFasta(source, recovered)
     screed.read_fasta_sequences(recovered)
     self.db = screed.ScreedDB(recovered)
Beispiel #3
0
    def setup(self):
        """Copy test.fa to a temp filename, index it, and open it as a ScreedDB.

        Sets ``self._testfile``, ``self._db`` and ``self._ns``.
        """
        self._testfile = tmp_fasta = utils.get_temp_filename('test.fa')
        original = utils.get_test_data('test.fa')
        shutil.copy(original, tmp_fasta)
        screed.read_fasta_sequences(tmp_fasta)

        database = screed.ScreedDB(tmp_fasta)
        self._db = database
        self._ns = nostring()
Beispiel #4
0
 def setup(self):
     """Build the 'fastaRecovery' file from test.fa and open it with screed.

     Both paths are resolved next to this module; sets ``self._fileName``,
     ``self._testfa`` and ``self.db``.
     """
     module_dir = os.path.dirname(__file__)
     self._fileName = out_path = os.path.join(module_dir, 'fastaRecovery')
     self._testfa = in_path = os.path.join(module_dir, 'test.fa')

     screed.read_fasta_sequences(in_path)
     screed.ToFasta(in_path, out_path)
     screed.read_fasta_sequences(out_path)
     self.db = screed.ScreedDB(out_path)
    def setup(self):
        """Round-trip a temp copy of test.fa through screed.ToFasta.

        Sets ``self._fileName`` (ToFasta output path), ``self._testfa`` (temp
        copy of the input) and ``self.db`` (ScreedDB opened on the output).
        """
        recovered = utils.get_temp_filename('fastaRecovery')
        self._fileName = recovered

        source = utils.get_temp_filename('test.fa')
        self._testfa = source
        shutil.copy(utils.get_test_data('test.fa'), source)

        screed.read_fasta_sequences(source)
        screed.ToFasta(source, recovered)
        screed.read_fasta_sequences(recovered)
        self.db = screed.ScreedDB(recovered)
Beispiel #6
0
 def add_reads(self, reads, read_name_sep='_'):
     """Attach a screed database of reads to this object, indexing if needed.

     Args:
         reads: path of the FASTA reads file; ``reads + '_screed'`` is checked
             to decide whether ``read_fasta_sequences`` has to be run first.
         read_name_sep: stored unchanged as ``self.read_name_sep``.

     Sets ``self.db``, ``self.read_name_sep`` and ``self.reads_path``.
     """
     # print(...) with one argument behaves identically on Python 2 and 3;
     # the bare print statement is a SyntaxError on Python 3.
     if exists(reads + '_screed'):
         print('reads previously indexed.')
     else:
         print('indexing records...')
         read_fasta_sequences(reads)
     print('building database...')
     self.db = ScreedDB(reads)
     self.read_name_sep = read_name_sep
     self.reads_path = reads
Beispiel #7
0
    def setup(self):
        """Convert test.fa to FASTQ and back via screed, then open the result.

        All paths live next to this module. Sets ``self._fqName``,
        ``self._faName``, ``self._testfa`` and ``self.db``.
        """
        here = os.path.dirname(__file__)
        fq_path = os.path.join(here, 'fa_to_fq')
        fa_path = os.path.join(here, 'fq_to_fa')
        source = os.path.join(here, 'test.fa')
        self._fqName = fq_path
        self._faName = fa_path
        self._testfa = source

        # Each step consumes what the previous one produced; keep the order.
        screed.read_fasta_sequences(source)
        screed.ToFastq(source, fq_path)
        screed.read_fastq_sequences(fq_path)
        screed.ToFasta(fq_path, fa_path)
        screed.read_fasta_sequences(fa_path)
        self.db = screed.ScreedDB(fa_path)
Beispiel #8
0
    def setup(self):
        """Run a temp copy of test.fa through ToFastq and ToFasta, then open it.

        Sets ``self._fqName``, ``self._faName``, ``self._testfa`` and
        ``self.db``.
        """
        self._fqName = fq_path = utils.get_temp_filename('fa_to_fq')
        self._faName = fa_path = utils.get_temp_filename('fq_to_fa')
        self._testfa = source = utils.get_temp_filename('test.fa')
        shutil.copy(utils.get_test_data('test.fa'), source)

        # The conversions are chained: every call reads the previous output.
        screed.read_fasta_sequences(source)
        screed.ToFastq(source, fq_path)
        screed.read_fastq_sequences(fq_path)
        screed.ToFasta(fq_path, fa_path)
        screed.read_fasta_sequences(fa_path)
        self.db = screed.ScreedDB(fa_path)
Beispiel #9
0
    def setup(self):
        """Fixture: test.fa -> ToFastq -> ToFasta on temp files, opened via screed.

        Sets ``self._fqName``, ``self._faName``, ``self._testfa`` and
        ``self.db``.
        """
        fastq_out = utils.get_temp_filename('fa_to_fq')
        self._fqName = fastq_out
        fasta_out = utils.get_temp_filename('fq_to_fa')
        self._faName = fasta_out
        fasta_in = utils.get_temp_filename('test.fa')
        self._testfa = fasta_in
        shutil.copy(utils.get_test_data('test.fa'), fasta_in)

        screed.read_fasta_sequences(fasta_in)
        screed.ToFastq(fasta_in, fastq_out)
        screed.read_fastq_sequences(fastq_out)
        screed.ToFasta(fastq_out, fasta_out)
        screed.read_fasta_sequences(fasta_out)
        self.db = screed.ScreedDB(fasta_out)
Beispiel #10
0
def split(fasta_fp, ref_fp, output_dp):
    """Split a FASTA file into chunks of ``split_num`` records each.

    Chunks are written to ``output_dp`` as ``tmp_0.fasta``, ``tmp_1.fasta``,
    ... Records within a chunk are newline-separated, with no trailing
    newline at the end of the file.

    Args:
        fasta_fp: path of the FASTA file to split (indexed with screed).
        ref_fp: unused; kept so existing callers keep working.
        output_dp: output directory, created if it does not exist.

    ``split_num`` is read from the enclosing scope.
    """
    if not os.path.exists(output_dp):
        os.makedirs(output_dp)
    fasta_db = screed.read_fasta_sequences(fasta_fp)

    n = 0
    output = None  # stays None for an empty database, so nothing is closed
    try:
        for i, seq in enumerate(fasta_db):
            starts_chunk = i == 0 or i % split_num == 0
            if starts_chunk:
                if output is not None:
                    output.close()
                    n += 1
                output = open(
                    os.path.join(output_dp, 'tmp_{}.fasta'.format(n)), 'w+')

            record = fasta_db[seq]
            # The first record of a chunk has no leading newline.
            separator = '' if starts_chunk else '\n'
            output.write('{}>{}\n{}'.format(separator, record.name,
                                            str(record.sequence)))
    finally:
        # Close the current chunk even if a record lookup or write fails.
        if output is not None:
            output.close()
Beispiel #11
0
def main():
    """Open (or first build) the screed database for ``fa`` and run makeKmerArray.

    ``fa``, ``args`` and ``norm`` are read from the enclosing scope.
    """
    already_indexed = os.path.isfile(fa + "_screed")
    if not already_indexed:
        fadb = screed.read_fasta_sequences(fa)
    else:
        from screed import ScreedDB
        fadb = ScreedDB(fa)

    ksize = int(args.ksize)
    makeKmerArray(fadb, ksize, norm)
Beispiel #12
0
def main():
    """Open (or first build) the screed database for ``fa`` and run makeSlices.

    ``fa`` and ``maxSlice`` are read from the enclosing scope.
    """
    # Indentation is normalised to spaces: the original mixed a tab-indented
    # call with space-indented lines, which Python 3 rejects with TabError.
    if os.path.isfile(fa + "_screed"):
        from screed import ScreedDB
        fadb = ScreedDB(fa)
    else:
        fadb = screed.read_fasta_sequences(fa)

    makeSlices(fadb, maxSlice)
Beispiel #13
0
def main():
    """Load the screed database for ``fa`` (building it if absent) and run makeKmerArray.

    ``fa``, ``args`` and ``norm`` are read from the enclosing scope.
    """
    index_path = fa + "_screed"
    if not os.path.isfile(index_path):
        fadb = screed.read_fasta_sequences(fa)
    else:
        from screed import ScreedDB
        fadb = ScreedDB(fa)

    makeKmerArray(fadb, int(args.ksize), norm)
Beispiel #14
0
def openDB(fileName):
    """Open a screed database, creating it first when given a sequence file.

    Args:
        fileName: name of a sequence file, or of an existing database file
            ending in ``_screed``.

    Returns:
        ``screed.ScreedDB`` opened on the ``_screed`` file name.

    Raises:
        IOError: if the FASTA fallback raises IOError.
    """
    logging.info('{}: Making/opening screed database for: "{}"'.format(my_time(), fileName))

    # Already a database name: nothing to build.
    if fileName.endswith('_screed'):
        return screed.ScreedDB(fileName)

    # Try FASTQ first; a KeyError from that parser triggers the FASTA fallback.
    try:
        screed.read_fastq_sequences(fileName)
    except KeyError:
        try:
            screed.read_fasta_sequences(fileName)
        except IOError:
            raise IOError('Cannot open {}'.format(fileName))

    return screed.ScreedDB(fileName + '_screed')
Beispiel #15
0
def build_get_hit_length_function(referenceLengths):
    """
    Return a callable mapping a reference sequence id to its length.

    ``referenceLengths`` is either a dict of id -> length, or a ``str`` path
    to a fasta file. For a path, the file is indexed with screed (unless
    ``<path>_screed`` already exists) and lengths come from the stored
    sequences.
    """
    if not isinstance(referenceLengths, str):
        # Mapping case: the lengths are looked up directly.
        def lengthFromMapping(h):
            return referenceLengths[h]

        return lengthFromMapping

    import screed

    # Path case: build the screed index only if it is not there yet.
    # TODO: just use Bio.SeqIO to get lengths if
    #   screed module or screed index is missing.
    #   screed is overkill here.
    indexPath = "%s_screed" % (referenceLengths)
    if not os.path.exists(indexPath):
        screed.read_fasta_sequences(referenceLengths)
    refScreed = screed.ScreedDB(referenceLengths)

    def lengthFromScreed(h):
        return len(refScreed[h]['sequence'])

    return lengthFromScreed
Beispiel #16
0
def save_reads_only_in_k2g(index):
    """Append to an ``_intersect.fasta`` file the k2g records also named in the no-k2g file.

    Args:
        index: position in ``k2g_fastas`` of the k2g FASTA file to process.

    The matching no-k2g file has the same basename under
    ``NO_K2G_FASTA_PATH``; the output goes under ``K2G_INTERSECT`` and is
    opened in append mode.
    """
    k2g = k2g_fastas[index]
    no_k2g = os.path.join(NO_K2G_FASTA_PATH, os.path.basename(k2g))
    record_names_no_k2g = get_record_names(no_k2g)
    filename = os.path.join(
        K2G_INTERSECT,
        os.path.basename(k2g).replace(".fasta", "_intersect.fasta"))
    # The context manager closes the output handle; it was never closed before.
    with open(filename, "a") as result_fasta:
        print(record_names_no_k2g[:5])
        print(k2g)
        db_k2g = screed.read_fasta_sequences(k2g)
        # Build the lookup once so each membership test is O(1).
        wanted = set(record_names_no_k2g)
        for name in db_k2g:
            if name in wanted:
                result_fasta.write(
                    ">{}\n{}\n".format(name, db_k2g[name].sequence))
Beispiel #17
0
def psl_filter(fasta_fp, psl_fp, output_fp):
    """Write the FASTA records whose name appears as ``qName`` in a PSL table.

    Args:
        fasta_fp: path of the FASTA file to filter (indexed with screed).
        psl_fp: path of a tab-separated, header-less PSL hits table.
        output_fp: output FASTA path; records are newline-separated with no
            trailing newline.
    """
    hits = pd.read_csv(psl_fp, sep='\t', names=['matches', 'misMatches', 'repMatches', 'nCount',
                                                  'qNumInsert', 'qBaseInsert', 'tNumInsert', 'tBaseInsert',
                                                  'strand', 'qName', 'qSize', 'qStart', 'qEnd',
                                                  'tName', 'tSize', 'tStart', 'tEnd',
                                                  'blockCount', 'blockSizes', 'qStarts', 'tStarts'])
    fasta_db = screed.read_fasta_sequences(fasta_fp)
    # Computed once: the original rebuilt the unique-name array for every record.
    hit_names = set(hits['qName'])
    with open(output_fp, 'w+') as output:
        first_entry = True
        for seq in fasta_db:
            if seq not in hit_names:
                continue
            record = fasta_db[seq]
            # Every record except the first is preceded by a newline.
            separator = '' if first_entry else '\n'
            output.write('{}>{}\n{}'.format(separator, record.name,
                                            str(record.sequence)))
            first_entry = False
"""
import time as t
import os, os.path
import glob
import platform
global array
import screed
import sys
from screed import ScreedDB
import string

#corriendo = "N"
# Location of the viral genome FASTA file used by the rest of the script.
bioinfo_path = "/Users/ivanjimenez/Desktop/CLASES/INTERNSHIPS/BIOINFO INTERNSHIP FILES/RESULTS/newresults/"
viralgenome_path = bioinfo_path + "copy_birna_x_virus.fa"

# Index the genome, then open the resulting "_screed" database.
screed.read_fasta_sequences(viralgenome_path)
birna_x_virusdb = ScreedDB(viralgenome_path + "_screed")

# Number of mismatches that are allowed.
k = 6

def getpath():
    # Builds ``destination`` from the directory containing this file, with
    # the path separators repeated.
    # NOTE(review): no return statement is visible here, so as shown the
    # function returns None; the definition may be truncated in this view.
    wd = os.path.dirname(os.path.abspath(__file__))
    if platform.system() == 'Windows':
        # Split on a single backslash, re-join with two, and append two more.
        array = wd.split('\\')
        destination = "\\\\".join(array)
        destination += '\\\\'
    else:
        # NOTE(review): this splits on a double slash ('//'), not a single
        # '/'. Presumably '/' was intended; confirm before changing.
        array = wd.split('//')
        destination = "////".join(array)
        destination += '////'
Beispiel #19
0
def run(reads_dp, mothur_dp, dependencies_dp, num_cpu):
    """
    Run the mothur OTU pipeline on a reads directory and write otus.fasta.

    Args:
        reads_dp: directory of input reads; must exist.
        mothur_dp: mothur working directory; created if missing.
        dependencies_dp: directory holding the ``silva`` reference files and
            ``mothur/bin``; must exist.
        num_cpu: value substituted into mothur's ``processors`` options.

    Returns:
        None. Exits the process via ``sys.exit`` when a required directory or
        the PCR-trimmed silva alignment is missing.

    TODO: save outputs to independent directory for easier use by downstream processes
    """
    if not os.path.exists(reads_dp):
        sys.exit('read_dp {} DOES NOT EXIST'.format(reads_dp))
    if not os.path.exists(mothur_dp):
        os.makedirs(mothur_dp)
    if not os.path.exists(dependencies_dp):
        sys.exit('dependencies_dp {} DOES NOT EXIST'.format(dependencies_dp))
    generate_stability_file(reads_dp, mothur_dp)

    # Build the PCR-trimmed silva alignment if it is not there yet.
    if not os.path.exists(
            os.path.join(dependencies_dp, 'silva',
                         'silva.seed_v132.pcr.align')):
        cmd = [
            '''mothur "#pcr.seqs(fasta={}, start=11894, end=25319, keepdots=F, processors={});"'''
            .format(
                os.path.join(dependencies_dp, 'silva',
                             'silva.seed_v132.align'), num_cpu)
        ]
        call(cmd, shell=True)
    # Re-check: the command above may have failed without raising.
    if not os.path.exists(
            os.path.join(dependencies_dp, 'silva',
                         'silva.seed_v132.pcr.align')):
        sys.exit('silva.seed_v132.pcr.align NOT CREATED.')

    miniconda_bin_dp = os.path.join(dependencies_dp, 'mothur', 'bin')

    # make contigs
    # NOTE(review): the summary.seqs call after the second screen.seqs passes
    # "count=current" twice; presumably "fasta=current, count=current" was
    # intended. Left unchanged pending confirmation against mothur.
    cmd = [
        '''mothur "#set.dir(input={}, tempdefault={});
                       make.contigs(file='stability.files',
                                    processors={});
                       screen.seqs(fasta=current,
                                   group=current,
                                   maxambig=0,
                                   maxlength=275,
                                   minlength=240);
                       summary.seqs(count=current);
                       unique.seqs(fasta=current);
                       count.seqs(name=current,
                                  group=current);
                       align.seqs(fasta=current,
                                  reference={});
                       summary.seqs(fasta=current,
                                    count=current);
                       screen.seqs(fasta=current,
                                   count=current,
                                   summary=current,
                                   start=1968,
                                   end=11550,
                                   maxhomop=8);
                       summary.seqs(count=current,
                                    count=current);
                       filter.seqs(fasta=current,
                                   vertical=T,
                                   trump=.);
                       unique.seqs(fasta=current,
                                   count=current);
                       pre.cluster(fasta=current,
                                   count=current,
                                   diffs=2);
                       chimera.vsearch(fasta=current,
                                      count=current,
                                      dereplicate=t);
                       remove.seqs(fasta=current, accnos=current);
                       summary.seqs(fasta=current,
                                    count=current);
                       classify.seqs(fasta=current,
                                     count=current,
                                     reference={},
                                     taxonomy={},
                                     cutoff=80);
                       remove.lineage(fasta=current,
                                      count=current,
                                      taxonomy=current,
                                      taxon=Chloroplast-Mitochondria-unknown-Archaea-Eukaryota);
                       summary.seqs(fasta=current,
                                    count=current);
                       cluster.split(fasta=current,
                                     count=current,
                                     taxonomy=current,
                                     splitmethod=classify,
                                     taxlevel=4,
                                     cutoff=0.03);
                       make.shared(list=current,
                                   count=current,
                                   label=0.03);
                       classify.otu(list=current,
                                    count=current,
                                    taxonomy=current,
                                    label=0.03);
                       tree.shared(shared=current,
                                   calc=jest-thetayc-braycurtis);
                       get.oturep(column=current,
                                  name=current,
                                  fasta=current,
                                  list=current);"'''.format(
            mothur_dp, miniconda_bin_dp, num_cpu,
            os.path.join(dependencies_dp, 'silva',
                         'silva.seed_v132.pcr.align'),
            os.path.join(dependencies_dp, 'silva', 'silva.nr_v132.align'),
            os.path.join(dependencies_dp, 'silva', 'silva.nr_v132.tax'))
    ]
    call(cmd, shell=True)
    #db = screed.read_fasta_sequences(os.path.join(mothur_dp, 'stability.trim.contigs.good.unique.good.filter.unique.precluster.pick.opti_mcc.unique_list.0.03.rep.fasta'))
    db = screed.read_fasta_sequences(
        os.path.join(
            mothur_dp,
            'stability.trim.contigs.good.unique.good.filter.unique.precluster.pick.pick.opti_mcc.0.03.rep.fasta'
        ))
    # The context manager closes otus.fasta even if a header fails to parse.
    with open(os.path.join(mothur_dp, '../', 'otus.fasta'), 'w+') as output:
        for otu in db:
            # Header: second tab-separated field, text before the first '|'.
            output.write('>{}\n'.format(otu.split('\t')[1].split('|')[0]))
            output.write('{}\n'.format(db[otu].sequence))
    return None
Beispiel #20
0
 def setup(self):
     """Index test-whitespace.fa (next to this module) and open it with screed.

     Sets ``self._testfa`` and ``self.db``.
     """
     here = os.path.dirname(__file__)
     fasta_path = os.path.join(here, 'test-whitespace.fa')
     self._testfa = fasta_path
     screed.read_fasta_sequences(fasta_path)
     self.db = screed.ScreedDB(fasta_path)
 def setup(self):
     """Index test.fa (next to this module) and open it with screed.

     Sets ``self._testfa`` and ``self.db``.
     """
     self._testfa = fasta_path = os.path.join(
         os.path.dirname(__file__), 'test.fa')
     screed.read_fasta_sequences(fasta_path)
     self.db = screed.ScreedDB(fasta_path)
def plotHitCoverageByLengthBins(ax, lengths, hits, referenceLengths, bins=20, lengthRange=None, barcolor='b',baredgecolor='k',hlog=False,hcolor='r', includeMissed=False):
    """
    Given a dictionary of transcript lengths, a dictionary of hits, and a dict of reference sequence lengths...
    Produce a plot of reference coverage rate by length bin. IE: What fraction of total residues in the reference sequences were matched.

    The param referenceLengths can be a dictionary from hit names to lengths or a fasta file of sequences. The names in both should match the hit names in the "hits" dictionary.

    Bars (drawn on ``ax``) show matched bases / reference bases per bin, using
    the first hit of each transcript. A stepped line on a twin y axis shows
    the total transcript bases per bin. With ``includeMissed`` the lengths of
    transcripts without hits are added to that total.

    Raises:
        Exception: if ``lengths`` is empty.
    """
    # Don't try to plot empty data
    if len(lengths)==0:
        raise Exception("Lengths cannot be empty!")

    transcriptCounts,boundaries=numpy.histogram(list(lengths.values()), bins=bins, range=lengthRange)

    if isinstance(referenceLengths, str):
        # Imported here so that the dict case does not require screed.
        import screed
        # assume we have the path to a fasta file
        # has it been parsed by screed?
        if not os.path.exists("%s_screed" % (referenceLengths)):
            screed.read_fasta_sequences(referenceLengths)
        refScreed=screed.ScreedDB(referenceLengths)
        getHitLength=lambda h: len(refScreed[h]['sequence'])
    else:
        getHitLength=lambda h: referenceLengths[h]

    # count bases by bin
    hitBaseCounts=numpy.zeros(transcriptCounts.shape)
    referenceBaseCounts=numpy.zeros(transcriptCounts.shape)
    totalBaseCounts=numpy.zeros(transcriptCounts.shape)
    for transcript,hitList in hits.items():
        try:
            index=getBin(lengths[transcript],boundaries)
        except ValueError:
            # length was outside range
            continue
        totalBaseCounts[index]+=lengths[transcript]
        firstHit=hitList[0]
        hitLength=getHitLength(firstHit.hit)
        logger.debug("Hit of length %d goes from %d to %d" % (hitLength,
            firstHit.hstart, firstHit.hend))
        referenceBaseCounts[index]+=hitLength
        hitBaseCounts[index]+=numpy.abs(firstHit.hend-firstHit.hstart)+1

    if includeMissed:
        for transcript,length in lengths.items():
            if transcript in hits:
                continue
            # Bin each missed transcript by its own length. The original
            # reused the stale index left over from the loop above.
            try:
                index=getBin(length,boundaries)
            except ValueError:
                # length was outside range
                continue
            totalBaseCounts[index]+=length

    # Simulate stepped histogram of total bases
    ax2=ax.twinx()
    x,y = getSteppedBars(totalBaseCounts, boundaries)
    if hlog:
        ax2.set_yscale("log",nonposy='clip')
    ax2.plot(x,y,color=hcolor)
    ax2.set_ylabel('total bases',color=hcolor)
    for tl in ax2.get_yticklabels():
        tl.set_color(hcolor)

    # normalize hit counts by reference base counts
    hitRate = hitBaseCounts/referenceBaseCounts
    # Zero out the nan/inf of bins with no bases. Bins that only hold missed
    # transcripts have no reference bases either, so mask those as well.
    hitRate[(totalBaseCounts==0)|(referenceBaseCounts==0)]=0

    # Draw histogram bars
    lefts=boundaries[:-1]
    widths=[boundaries[i+1]-boundaries[i] for i in range(len(boundaries)-1)]
    ax.bar(lefts,hitRate,width=widths,color=barcolor,edgecolor=baredgecolor)
    ax.set_ylim([0,1])
    ax.set_ylabel('% reference matched')
    ax.set_xlabel('transcript length')
Beispiel #23
0
    def setup(self):
        """Index a temp copy of test-whitespace.fa and open it with screed.

        Sets ``self._testfa`` and ``self.db``.
        """
        tmp_fasta = utils.get_temp_filename('test-whitespace.fa')
        self._testfa = tmp_fasta
        shutil.copy(utils.get_test_data('test-whitespace.fa'), tmp_fasta)

        screed.read_fasta_sequences(tmp_fasta)
        self.db = screed.ScreedDB(tmp_fasta)
Beispiel #24
0
def setup():
    """Run ``screed.read_fasta_sequences`` on ``testfa`` (from the enclosing scope)."""
    screed.read_fasta_sequences(testfa)
Beispiel #25
0
#! /usr/bin/env python
# Rebuild the screed database for the FASTA file named on the command line,
# then create an empty whoosh index directory ("<filename>.whooshd") with
# "name" and "description" fields.
import sys
import os
import screed

filename = sys.argv[1]

# Remove any existing "<filename>_screed" file; a missing file is fine.
try:
    os.unlink(filename + '_screed')
except OSError:
    pass

db = screed.read_fasta_sequences(filename)

###

from whoosh.index import create_in
from whoosh.fields import *
# Both fields are stored, so their text can be read back from the index.
schema = Schema(name=TEXT(stored=True),
                description=TEXT(stored=True))

import os, shutil
indexdir = filename + '.whooshd'
# Start from a clean index directory.
try:
    shutil.rmtree(indexdir)
except OSError:                 # doesn't exist
    pass

os.mkdir(indexdir)
ix = create_in(indexdir, schema)
Beispiel #26
0
def setup():
    """Run ``screed.read_fasta_sequences`` on ``testfa`` (from the enclosing scope)."""
    screed.read_fasta_sequences(testfa)
Beispiel #27
0
def generate_tetra(project, scaffolds, min_len=2000):
    """
    Write tetranucleotide frequencies of the contigs carrying HMMER-hit genes.

    Reads ``<project>_hmm.txt`` (gene_file: HMMER predicted genes) and
    ``<project>_mgm.gff``, and writes ``<project>_tetra.txt`` as CSV with one
    row per gene: the gene name followed by one frequency per merged
    tetranucleotide. Reverse-complement tetranucleotides are merged; windows
    not in the ATGC table are counted under 'NNNN'.

    Args:
        project: file-name prefix shared by the three files above.
        scaffolds: path of the contig FASTA file, indexed with screed.
        min_len: genes on contigs shorter than this are skipped.
    """
    #TODO: screed is only for python 2, so can't use this on py3
    #TODO: also screed fails if two instances try to open the same file
    import screed

    gene_file = project + '_hmm.txt'
    tetra_file = project + '_tetra.txt'
    gff_file = project + '_mgm.gff'

    # generate a mapping to merge reverse complement tetranucleotides
    seq_map = {'': 4 * 'N'}
    for s in (''.join(i) for i in itertools.product(*(4 * ['ATGC']))):
        if rc(s) not in seq_map or s not in seq_map:
            seq_map[s] = s
            seq_map[rc(s)] = s

    srted_vals = sorted(set(seq_map.values()))

    # Context managers close every handle, even if a lookup below raises.
    with open(tetra_file, 'w') as fout:
        # write out the headers for the gene/tetranucleotide file
        fout.write(','.join(['Gene'] + srted_vals) + '\n')

        # make a dictionary to map genes back to their contigs
        gene2contig = {}
        with open(gff_file, "r") as f:
            # skip the first 7 lines of the gff file
            for i in range(7):
                f.readline()
            for ln in f:
                flds = ln.strip().split('\t')
                gene2contig[flds[-1].replace(' ', '_')] = flds[0].split(' ')[0]

        with open(gene_file, 'r') as hmm:
            contigs = screed.read_fasta_sequences(scaffolds)

            # skip the first 3 lines of the HMMER table
            for i in range(3):
                hmm.readline()

            p_gff_name = ''
            for ln in hmm:
                # fixed-width columns of the HMMER table
                gene_name = ln[0:21].strip()
                gff_name = ln[32:53].strip()

                # if the same gene_id is listed multiple times in a row
                # that means HMMER found multiple matches for it. We only
                # want the first one (with the lowest E-value).
                if gff_name == p_gff_name:
                    continue
                p_gff_name = gff_name
                cc = contigs[gene2contig[gff_name]]
                if len(cc.sequence) < min_len:
                    continue
                frq = dict.fromkeys(srted_vals, 0)
                for ss in slid_win(str(cc.sequence).upper(), 4):
                    frq[seq_map.get(ss, 'NNNN')] += 1

                # avoid dividing by zero when no window was counted
                sum_frq = float(sum(frq.values()))
                if sum_frq == 0:
                    sum_frq = 1
                fout.write(','.join([gene_name] +
                                    [str(frq[i] / sum_frq)
                                     for i in srted_vals]))
                fout.write('\n')
                fout.flush()
Beispiel #28
0
#! /usr/bin/env python
# Write the records of galGal4.fa.masked whose names contain neither "_" nor
# "Un": once all together into one file, and once each into a file named
# after the record.

import screed

db = screed.read_fasta_sequences('galGal4.fa.masked')

# Keep only names without "_" and without "Un".
keys = [k for k in db.keys() if "_" not in k and "Un" not in k]

# All kept records go into a single combined file.
filtered = screed.fasta.FASTA_Writer('galGal4.fa.masked.filtered')
for k in keys:
    record = db[k]
    filtered.write(record)

# One output file per kept record, named after the record key.
# NOTE(review): the writers are never explicitly closed here; confirm whether
# FASTA_Writer needs a close() call to flush its output.
for k in keys:
    record = db[k]
    filtered = screed.fasta.FASTA_Writer(k)
    filtered.write(record)
Beispiel #29
0
#! /usr/bin/env python
# Rebuild the screed database for the FASTA file named on the command line,
# then create a fresh whoosh index directory ("<filename>.whooshd") with
# "name" and "description" fields and open a writer on it.
import sys
import os
import screed

filename = sys.argv[1]

# Remove any existing "<filename>_screed" file; a missing file is fine.
try:
    os.unlink(filename + '_screed')
except OSError:
    pass

db = screed.read_fasta_sequences(filename)

###

from whoosh.index import create_in
from whoosh.fields import *
# Both fields are stored, so their text can be read back from the index.
schema = Schema(name=TEXT(stored=True), description=TEXT(stored=True))

import os, shutil
indexdir = filename + '.whooshd'
# Start from a clean index directory.
try:
    shutil.rmtree(indexdir)
except OSError:  # doesn't exist
    pass

os.mkdir(indexdir)
ix = create_in(indexdir, schema)

writer = ix.writer()
Beispiel #30
0
 def setup(self):
     """Index test-whitespace.fa (next to this module) and open it with screed.

     Sets ``self._testfa`` and ``self.db``.
     """
     module_dir = os.path.dirname(__file__)
     self._testfa = fasta_path = os.path.join(module_dir, "test-whitespace.fa")
     screed.read_fasta_sequences(fasta_path)
     self.db = screed.ScreedDB(fasta_path)
Beispiel #31
0
    def setup(self):
        """Copy test-whitespace.fa to a temp filename, index it, and open it.

        Sets ``self._testfa`` and ``self.db``.
        """
        self._testfa = scratch = utils.get_temp_filename('test-whitespace.fa')
        original = utils.get_test_data('test-whitespace.fa')
        shutil.copy(original, scratch)

        screed.read_fasta_sequences(scratch)
        self.db = screed.ScreedDB(scratch)
Beispiel #32
0
def get_record_names(path):
    """Return the record names of a FASTA file as a list.

    Args:
        path: path of the FASTA file; it is printed, then indexed with screed.

    Returns:
        list of the keys of the screed database built from ``path``.
    """
    print(path)
    db = screed.read_fasta_sequences(path)
    try:
        # Materialise the keys before closing, so the result never depends
        # on the closed database.
        return list(db.keys())
    finally:
        # Close the database even if reading the keys fails.
        db.close()