def setup(self):
    """Index a scratch copy of the ``test.fa`` fixture and open its database.

    Stores the scratch path on ``self._testfile``, the opened database on
    ``self._db`` and a fresh ``nostring()`` object on ``self._ns``.
    """
    scratch_path = utils.get_temp_filename('test.fa')
    self._testfile = scratch_path
    fixture_path = utils.get_test_data('test.fa')
    shutil.copy(fixture_path, scratch_path)

    # Build the on-disk index first, then open it.
    screed.read_fasta_sequences(scratch_path)
    self._db = screed.ScreedDB(scratch_path)
    self._ns = nostring()
def setup(self):
    """Round-trip ``test.fa`` through ``ToFasta`` and open the recovered db.

    Both the source FASTA and the recovered file live next to this module.
    """
    here = os.path.dirname(__file__)
    recovered_path = os.path.join(here, 'fastaRecovery')
    source_path = os.path.join(here, 'test.fa')
    self._fileName = recovered_path
    self._testfa = source_path

    screed.read_fasta_sequences(source_path)      # index the source FASTA
    screed.ToFasta(source_path, recovered_path)   # dump the db back to FASTA text
    screed.read_fasta_sequences(recovered_path)   # index the recovered FASTA
    self.db = screed.ScreedDB(recovered_path)
def setup(self):
    """Round-trip a scratch copy of ``test.fa`` through ``ToFasta``.

    All files are created at temp locations handed out by
    ``utils.get_temp_filename``.
    """
    recovered_path = utils.get_temp_filename('fastaRecovery')
    self._fileName = recovered_path
    source_path = utils.get_temp_filename('test.fa')
    self._testfa = source_path
    shutil.copy(utils.get_test_data('test.fa'), source_path)

    screed.read_fasta_sequences(source_path)      # index the source FASTA
    screed.ToFasta(source_path, recovered_path)   # dump the db back to FASTA text
    screed.read_fasta_sequences(recovered_path)   # index the recovered FASTA
    self.db = screed.ScreedDB(recovered_path)
def add_reads(self, reads, read_name_sep='_'):
    """Attach a reads file to this object, indexing it with screed if needed.

    Args:
        reads: path of the reads FASTA file; the index is looked for at
            ``reads + '_screed'``.
        read_name_sep: separator stored on ``self.read_name_sep``.

    Sets ``self.db``, ``self.read_name_sep`` and ``self.reads_path``.
    """
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    if exists(reads + '_screed'):
        print('reads previously indexed.')
    else:
        print('indexing records...')
        read_fasta_sequences(reads)

    print('building database...')
    db = ScreedDB(reads)

    self.db = db
    self.read_name_sep = read_name_sep
    self.reads_path = reads
def setup(self):
    """Convert ``test.fa`` FASTA -> FASTQ -> FASTA and open the final db.

    All intermediate files are written next to this module.
    """
    here = os.path.dirname(__file__)
    fastq_path = os.path.join(here, 'fa_to_fq')
    fasta_path = os.path.join(here, 'fq_to_fa')
    source_path = os.path.join(here, 'test.fa')
    self._fqName = fastq_path
    self._faName = fasta_path
    self._testfa = source_path

    screed.read_fasta_sequences(source_path)   # index the source FASTA
    screed.ToFastq(source_path, fastq_path)    # write the db out via ToFastq
    screed.read_fastq_sequences(fastq_path)    # Fastq file -> fastq db
    screed.ToFasta(fastq_path, fasta_path)     # Fastq db -> fasta text
    screed.read_fasta_sequences(fasta_path)    # Fasta file -> fasta db
    self.db = screed.ScreedDB(fasta_path)
def setup(self):
    """Convert a scratch ``test.fa`` FASTA -> FASTQ -> FASTA and open the db.

    All files are created at temp locations handed out by
    ``utils.get_temp_filename``.
    """
    fastq_path = utils.get_temp_filename('fa_to_fq')
    self._fqName = fastq_path
    fasta_path = utils.get_temp_filename('fq_to_fa')
    self._faName = fasta_path
    source_path = utils.get_temp_filename('test.fa')
    self._testfa = source_path
    shutil.copy(utils.get_test_data('test.fa'), source_path)

    screed.read_fasta_sequences(source_path)   # index the source FASTA
    screed.ToFastq(source_path, fastq_path)    # write the db out via ToFastq
    screed.read_fastq_sequences(fastq_path)    # Fastq file -> fastq db
    screed.ToFasta(fastq_path, fasta_path)     # Fastq db -> fasta text
    screed.read_fasta_sequences(fasta_path)    # Fasta file -> fasta db
    self.db = screed.ScreedDB(fasta_path)
def split(fasta_fp, ref_fp, output_dp):
    """Split a FASTA file into numbered chunk files inside ``output_dp``.

    Records are written to ``tmp_0.fasta``, ``tmp_1.fasta``, ... with a new
    chunk started every ``split_num`` records (``split_num`` is read from
    module scope). Records within a chunk are newline-separated and the file
    has no trailing newline.

    Args:
        fasta_fp: path of the FASTA file to index with screed and split.
        ref_fp: unused; kept for interface compatibility.
        output_dp: output directory, created if it does not exist.
    """
    if not os.path.exists(output_dp):
        os.makedirs(output_dp)
    fasta_db = screed.read_fasta_sequences(fasta_fp)

    n = 0
    output = None  # stays None for an empty database, so nothing to close
    try:
        for i, seq in enumerate(fasta_db):
            starts_chunk = i == 0 or i % split_num == 0
            if starts_chunk:
                if output is not None:
                    n += 1
                    output.close()
                output = open(
                    os.path.join(output_dp, 'tmp_{}.fasta'.format(n)), 'w+')
            record = fasta_db[seq]
            # Separator goes *before* every record except the chunk's first,
            # so chunk files never end with a newline.
            separator = '' if starts_chunk else '\n'
            output.write('{}>{}\n{}'.format(
                separator, record.name, str(record.sequence)))
    finally:
        # Close the current chunk even if a write failed part-way through.
        if output is not None:
            output.close()
def main():
    """Open (or build) the screed db for ``fa`` and run ``makeKmerArray``.

    ``fa``, ``args`` and ``norm`` are read from module scope.
    """
    if not os.path.isfile(fa + "_screed"):
        # No index on disk yet: parse the FASTA file to create one.
        fadb = screed.read_fasta_sequences(fa)
    else:
        from screed import ScreedDB
        fadb = ScreedDB(fa)

    kmer_size = int(args.ksize)
    makeKmerArray(fadb, kmer_size, norm)
def main():
    """Open (or build) the screed db for ``fa`` and run ``makeSlices``.

    ``fa`` and ``maxSlice`` are read from module scope.
    """
    if not os.path.isfile(fa + "_screed"):
        # No index on disk yet: parse the FASTA file to create one.
        fadb = screed.read_fasta_sequences(fa)
    else:
        from screed import ScreedDB
        fadb = ScreedDB(fa)

    makeSlices(fadb, maxSlice)
def main():
    """Open (or build) the screed db for ``fa`` and run ``makeKmerArray``.

    ``fa``, ``args`` and ``norm`` are read from module scope.
    """
    if not os.path.isfile(fa + "_screed"):
        # No index on disk yet: parse the FASTA file to create one.
        fadb = screed.read_fasta_sequences(fa)
    else:
        from screed import ScreedDB
        fadb = ScreedDB(fa)

    kmer_size = int(args.ksize)
    makeKmerArray(fadb, kmer_size, norm)
def openDB(fileName):
    """Open a screed database, creating it first when given a sequence file.

    Args:
        fileName -- Name of sequence file or screedDB file

    Returns:
        The ``screed.ScreedDB`` opened on the (possibly newly built) index.

    Raises:
        IOError -- if the FASTA fallback parser cannot open the file.
    """
    logging.info('{}: Making/opening screed database for: "{}"'.format(my_time(), fileName))

    needs_index = not fileName.endswith('_screed')
    if needs_index:
        try:
            # Try FASTQ first ...
            screed.read_fastq_sequences(fileName)
        except KeyError:
            # ... and fall back to FASTA when the FASTQ parser raises KeyError.
            try:
                screed.read_fasta_sequences(fileName)
            except IOError:
                msg = 'Cannot open {}'.format(fileName)
                raise IOError(msg)
        # Whichever parser succeeded, the index sits next to the input file.
        fileName = fileName + '_screed'

    return screed.ScreedDB(fileName)
def build_get_hit_length_function(referenceLengths):
    """
    Build a callable mapping a reference sequence id to its length.

    ``referenceLengths`` is either a dict of id -> length, which is looked up
    directly, or the ``str`` path of a FASTA file, which is indexed with
    screed (unless ``<path>_screed`` already exists) and queried for the
    length of each record's sequence.
    """
    if not isinstance(referenceLengths, str):
        # Already a mapping: just index into it.
        return lambda h: referenceLengths[h]

    # Otherwise assume this is the path to a FASTA file.
    import screed

    index_path = "%s_screed" % (referenceLengths)
    if not os.path.exists(index_path):
        # TODO: just use Bio.SeqIO to get lengths if
        # screed module or screed index is missing.
        # screed is overkill here.
        screed.read_fasta_sequences(referenceLengths)

    refScreed = screed.ScreedDB(referenceLengths)
    return lambda h: len(refScreed[h]['sequence'])
def save_reads_only_in_k2g(index):
    """Append records of a k2g FASTA that also occur in its no-k2g twin.

    The k2g file is ``k2g_fastas[index]``; the twin has the same basename
    under ``NO_K2G_FASTA_PATH``. Matching records are appended to
    ``<basename>_intersect.fasta`` under ``K2G_INTERSECT``.

    Args:
        index: position of the k2g FASTA path in ``k2g_fastas``.
    """
    k2g = k2g_fastas[index]
    no_k2g = os.path.join(NO_K2G_FASTA_PATH, os.path.basename(k2g))
    record_names_no_k2g = get_record_names(no_k2g)
    filename = os.path.join(
        K2G_INTERSECT,
        os.path.basename(k2g).replace(".fasta", "_intersect.fasta"))

    # The context manager guarantees the appended records are flushed and the
    # handle released, even if reading the database fails.
    with open(filename, "a") as result_fasta:
        print(record_names_no_k2g[:5])
        print(k2g)
        # Build the lookup set once; testing against the list is O(n) per name.
        wanted_names = set(record_names_no_k2g)
        db_k2g = screed.read_fasta_sequences(k2g)
        try:
            for name in db_k2g:
                if name not in wanted_names:
                    continue
                result_fasta.write(
                    ">{}\n{}\n".format(name, db_k2g[name].sequence))
        finally:
            db_k2g.close()
def psl_filter(fasta_fp, psl_fp, output_fp):
    """Write the FASTA records whose name appears as ``qName`` in a PSL table.

    Args:
        fasta_fp: FASTA file to index with screed and filter.
        psl_fp: tab-separated, header-less PSL file.
        output_fp: output FASTA path; records are newline-separated and the
            file has no trailing newline.
    """
    hits = pd.read_csv(psl_fp, sep='\t',
                       names=['matches', 'misMatches', 'repMatches', 'nCount',
                              'qNumInsert', 'qBaseInsert', 'tNumInsert',
                              'tBaseInsert', 'strand', 'qName', 'qSize',
                              'qStart', 'qEnd', 'tName', 'tSize', 'tStart',
                              'tEnd', 'blockCount', 'blockSizes', 'qStarts',
                              'tStarts'])
    # Compute the set of query names once instead of calling .unique() (and
    # scanning the resulting array) for every record in the database.
    query_names = set(hits['qName'].unique())

    fasta_db = screed.read_fasta_sequences(fasta_fp)
    with open(output_fp, 'w+') as output:
        first_entry = True
        for seq in fasta_db:
            if seq not in query_names:
                continue
            record = fasta_db[seq]
            # Separator goes before every record except the first.
            if not first_entry:
                output.write('\n')
            output.write('>{}\n{}'.format(record.name, str(record.sequence)))
            first_entry = False
""" import time as t import os, os.path import glob import platform global array import screed import sys from screed import ScreedDB import string #corriendo = "N" bioinfo_path = "/Users/ivanjimenez/Desktop/CLASES/INTERNSHIPS/BIOINFO INTERNSHIP FILES/RESULTS/newresults/" viralgenome_path = bioinfo_path + "copy_birna_x_virus.fa" screed.read_fasta_sequences("/Users/ivanjimenez/Desktop/CLASES/INTERNSHIPS/BIOINFO INTERNSHIP FILES/RESULTS/newresults/copy_birna_x_virus.fa") birna_x_virusdb = ScreedDB(viralgenome_path + "_screed") #Setting the number of mismatches that are allowed... k = 6 def getpath(): wd = os.path.dirname(os.path.abspath(__file__)) if platform.system() == 'Windows': array = wd.split('\\') destination = "\\\\".join(array) destination += '\\\\' else: array = wd.split('//') destination = "////".join(array) destination += '////'
def run(reads_dp, mothur_dp, dependencies_dp, num_cpu):
    """
    Run the mothur command chain below and write ``otus.fasta``.

    Steps, in order:
      1. Exit if ``reads_dp`` or ``dependencies_dp`` does not exist; create
         ``mothur_dp`` if it is missing.
      2. Call ``generate_stability_file(reads_dp, mothur_dp)``.
      3. If ``silva/silva.seed_v132.pcr.align`` is missing under
         ``dependencies_dp``, run mothur ``pcr.seqs`` on
         ``silva.seed_v132.align`` and exit if the file still does not exist.
      4. Run the long mothur command chain through the shell.
      5. Read the ``...0.03.rep.fasta`` output with screed and write each
         record to ``otus.fasta`` one directory above ``mothur_dp``.

    Args:
        reads_dp: directory containing the input reads.
        mothur_dp: mothur input/working directory (created if missing).
        dependencies_dp: directory holding the ``silva`` and ``mothur``
            sub-directories referenced below.
        num_cpu: value substituted into mothur's ``processors=`` options.

    Returns:
        None

    TODO: save outputs to independent directory for easier use by downstream processes
    """
    if not os.path.exists(reads_dp):
        sys.exit('read_dp {} DOES NOT EXIST'.format(reads_dp))
    if not os.path.exists(mothur_dp):
        os.makedirs(mothur_dp)
    if not os.path.exists(dependencies_dp):
        sys.exit('dependencies_dp {} DOES NOT EXIST'.format(dependencies_dp))
    generate_stability_file(reads_dp, mothur_dp)
    # Build the trimmed SILVA seed alignment only when it is not there yet.
    if not os.path.exists(
            os.path.join(dependencies_dp, 'silva', 'silva.seed_v132.pcr.align')):
        cmd = [
            '''mothur "#pcr.seqs(fasta={}, start=11894, end=25319, keepdots=F, processors={});"''' .format(
                os.path.join(dependencies_dp, 'silva', 'silva.seed_v132.align'), num_cpu)
        ]
        call(cmd, shell=True)
    # The exit status of the call above is not inspected; the presence of the
    # output file is what decides whether to continue.
    if not os.path.exists(
            os.path.join(dependencies_dp, 'silva', 'silva.seed_v132.pcr.align')):
        sys.exit('silva.seed_v132.pcr.align NOT CREATED.')
    # Passed to mothur as set.dir(tempdefault=...).
    miniconda_bin_dp = os.path.join(dependencies_dp, 'mothur', 'bin')
    # make contigs
    # The six {} placeholders are filled, in order, with: mothur_dp,
    # miniconda_bin_dp, num_cpu, the pcr-trimmed seed alignment, the nr
    # alignment and the nr taxonomy file.
    cmd = [
        '''mothur "#set.dir(input={}, tempdefault={}); make.contigs(file='stability.files', processors={}); screen.seqs(fasta=current, group=current, maxambig=0, maxlength=275, minlength=240); summary.seqs(count=current); unique.seqs(fasta=current); count.seqs(name=current, group=current); align.seqs(fasta=current, reference={}); summary.seqs(fasta=current, count=current); screen.seqs(fasta=current, count=current, summary=current, start=1968, end=11550, maxhomop=8); summary.seqs(count=current, count=current); filter.seqs(fasta=current, vertical=T, trump=.); unique.seqs(fasta=current, count=current); pre.cluster(fasta=current, count=current, diffs=2); chimera.vsearch(fasta=current, count=current, dereplicate=t); remove.seqs(fasta=current, accnos=current); summary.seqs(fasta=current, count=current); classify.seqs(fasta=current, count=current, reference={}, taxonomy={}, cutoff=80); remove.lineage(fasta=current, count=current, taxonomy=current, taxon=Chloroplast-Mitochondria-unknown-Archaea-Eukaryota); 
summary.seqs(fasta=current, count=current); cluster.split(fasta=current, count=current, taxonomy=current, splitmethod=classify, taxlevel=4, cutoff=0.03); make.shared(list=current, count=current, label=0.03); classify.otu(list=current, count=current, taxonomy=current, label=0.03); tree.shared(shared=current, calc=jest-thetayc-braycurtis); get.oturep(column=current, name=current, fasta=current, list=current);"'''.format(
            mothur_dp, miniconda_bin_dp, num_cpu,
            os.path.join(dependencies_dp, 'silva', 'silva.seed_v132.pcr.align'),
            os.path.join(dependencies_dp, 'silva', 'silva.nr_v132.align'),
            os.path.join(dependencies_dp, 'silva', 'silva.nr_v132.tax'))
    ]
    call(cmd, shell=True)
    #db = screed.read_fasta_sequences(os.path.join(mothur_dp, 'stability.trim.contigs.good.unique.good.filter.unique.precluster.pick.opti_mcc.unique_list.0.03.rep.fasta'))
    db = screed.read_fasta_sequences(
        os.path.join(
            mothur_dp,
            'stability.trim.contigs.good.unique.good.filter.unique.precluster.pick.pick.opti_mcc.0.03.rep.fasta'
        ))
    output = open(os.path.join(mothur_dp, '../', 'otus.fasta'), 'w+')
    for otu in db:
        # Header written is the second tab-separated field of the record key,
        # truncated at the first '|'.
        output.write('>{}\n'.format(otu.split('\t')[1].split('|')[0]))
        output.write('{}\n'.format(db[otu].sequence))
    output.close()
    return None
def setup(self):
    """Index ``test-whitespace.fa`` (next to this module) and open its db."""
    here = os.path.dirname(__file__)
    fasta_path = os.path.join(here, 'test-whitespace.fa')
    self._testfa = fasta_path

    # Build the on-disk index first, then open it.
    screed.read_fasta_sequences(fasta_path)
    self.db = screed.ScreedDB(fasta_path)
def setup(self):
    """Index ``test.fa`` (next to this module) and open its db."""
    here = os.path.dirname(__file__)
    fasta_path = os.path.join(here, 'test.fa')
    self._testfa = fasta_path

    # Build the on-disk index first, then open it.
    screed.read_fasta_sequences(fasta_path)
    self.db = screed.ScreedDB(fasta_path)
def plotHitCoverageByLengthBins(ax, lengths, hits, referenceLengths, bins=20, lengthRange=None, barcolor='b',baredgecolor='k',hlog=False,hcolor='r', includeMissed=False):
    """
    Given a dictionary of transcript lengths, a dictionary of hits, and a
    dict of reference sequence lengths, produce a plot of reference coverage
    rate by length bin, i.e. what fraction of total residues in the reference
    sequences were matched.

    The param referenceLengths can be a dictionary from hit names to lengths
    or a fasta file of sequences. The names in both should match the hit
    names in the "hits" dictionary.

    Args:
        ax: axes to draw the bars on; a twin y-axis is added for total bases.
        lengths: dict of transcript name -> length; must not be empty.
        hits: dict of transcript name -> list of hits; only the first hit of
            each list is used (its ``hit``, ``hstart`` and ``hend``).
        referenceLengths: dict of hit name -> length, or ``str`` FASTA path.
        bins, lengthRange: passed to ``numpy.histogram`` as bins / range.
        barcolor, baredgecolor: bar face and edge colors.
        hlog: use a log scale for the total-bases axis.
        hcolor: color of the total-bases line, label and tick labels.
        includeMissed: also add transcripts without hits to the total-bases
            count of the bin their own length falls in.

    Raises:
        Exception: if ``lengths`` is empty.
    """
    # Don't try to plot empty data
    if len(lengths)==0:
        raise Exception("Lengths cannot be empty!")

    # list() so a Python 3 dict view is handed to numpy as a real sequence.
    transcriptCounts,boundaries=numpy.histogram(list(lengths.values()),
                                                bins=bins,
                                                range=lengthRange)

    if isinstance(referenceLengths, str):
        # assume we have the path to a fasta file; screed is only needed (and
        # therefore only imported) on this path
        import screed
        # has it been parsed by screed?
        if not os.path.exists("%s_screed" % (referenceLengths)):
            screed.read_fasta_sequences(referenceLengths)
        refScreed=screed.ScreedDB(referenceLengths)
        getHitLength=lambda h: len(refScreed[h]['sequence'])
    else:
        getHitLength=lambda h: referenceLengths[h]

    # count bases by bin
    hitBaseCounts=numpy.zeros(transcriptCounts.shape)
    referenceBaseCounts=numpy.zeros(transcriptCounts.shape)
    totalBaseCounts=numpy.zeros(transcriptCounts.shape)
    for transcript,hitList in hits.items():
        try:
            index=getBin(lengths[transcript],boundaries)
        except ValueError:
            # length was outside range
            continue
        totalBaseCounts[index]+=lengths[transcript]
        firstHit=hitList[0]
        hitLength=getHitLength(firstHit.hit)
        logger.debug("Hit of length %d goes from %d to %d" % (hitLength,
                                                               firstHit.hstart,
                                                               firstHit.hend))
        referenceBaseCounts[index]+=hitLength
        hitBaseCounts[index]+=numpy.abs(firstHit.hend-firstHit.hstart)+1

    if includeMissed:
        for transcript,length in lengths.items():
            if transcript in hits:
                continue
            # Bin each missed transcript by its *own* length; reusing the
            # index left over from the hits loop put every missed transcript
            # in one arbitrary bin (or failed when there were no hits).
            try:
                missedIndex=getBin(length,boundaries)
            except ValueError:
                # length was outside range
                continue
            totalBaseCounts[missedIndex]+=length

    # Simulate stepped histogram of total bases
    ax2=ax.twinx()
    x,y = getSteppedBars(totalBaseCounts, boundaries)
    if hlog:
        ax2.set_yscale("log",nonposy='clip')
    ax2.plot(x,y,color=hcolor)
    ax2.set_ylabel('total bases',color=hcolor)
    for tl in ax2.get_yticklabels():
        tl.set_color(hcolor)

    # normalize hit counts by reference base counts
    hitRate = hitBaseCounts/referenceBaseCounts
    # remove infinities / NaNs: bins with no bases at all, and bins that only
    # hold missed transcripts (no reference bases to divide by)
    hitRate[totalBaseCounts==0]=0
    hitRate[referenceBaseCounts==0]=0

    # Draw histogram bars
    lefts=boundaries[:-1]
    widths=[boundaries[i+1]-boundaries[i] for i in range(len(boundaries)-1)]
    ax.bar(lefts,hitRate,width=widths,color=barcolor,edgecolor=baredgecolor)
    ax.set_ylim([0,1])
    ax.set_ylabel('% reference matched')
    ax.set_xlabel('transcript length')
def setup(self):
    """Index a scratch copy of ``test-whitespace.fa`` and open its db."""
    scratch_path = utils.get_temp_filename('test-whitespace.fa')
    self._testfa = scratch_path
    fixture_path = utils.get_test_data('test-whitespace.fa')
    shutil.copy(fixture_path, scratch_path)

    # Build the on-disk index first, then open it.
    screed.read_fasta_sequences(scratch_path)
    self.db = screed.ScreedDB(scratch_path)
def setup():
    """Parse the module-level ``testfa`` file with screed before the tests."""
    fasta_path = testfa
    screed.read_fasta_sequences(fasta_path)
#! /usr/bin/env python import sys import os import screed filename = sys.argv[1] try: os.unlink(filename + '_screed') except OSError: pass db = screed.read_fasta_sequences(filename) ### from whoosh.index import create_in from whoosh.fields import * schema = Schema(name=TEXT(stored=True), description=TEXT(stored=True)) import os, shutil indexdir = filename + '.whooshd' try: shutil.rmtree(indexdir) except OSError: # doesn't exit pass os.mkdir(indexdir) ix = create_in(indexdir, schema)
def generate_tetra(project, scaffolds, min_len=2000): """ gene_file: HMMER predicted genes. """ #TODO: screed is only for python 2, so can't use this on py3 #TODO: also screed fails if two instances try to open the same file import screed gene_file = project + '_hmm.txt' tetra_file = project + '_tetra.txt' gff_file = project + '_mgm.gff' # generate a mapping to merge reverse complement tetranucleotides seq_map = {'': 4 * 'N'} for s in (''.join(i) for i in itertools.product(*(4 * ['ATGC']))): if rc(s) not in seq_map or s not in seq_map: seq_map[s] = s seq_map[rc(s)] = s # write out the headers for the gene/tetranucleotide file fout = open(tetra_file, 'w') srted_vals = list(set(seq_map.values())) srted_vals.sort() fout.write(','.join(['Gene'] + srted_vals) + '\n') f = open(gff_file, "r") for i in range(7): f.readline() # make a dictionary to map genes back to their contigs gene2contig = {} for ln in f: flds = ln.strip().split('\t') gene2contig[flds[-1].replace(' ', '_')] = flds[0].split(' ')[0] f.close() hmm = open(gene_file, 'r') contigs = screed.read_fasta_sequences(scaffolds) for i in range(3): hmm.readline() p_gff_name = '' for ln in hmm: gene_name = ln[0:21].strip() gff_name = ln[32:53].strip() # if the same gene_id is listed multiple times in a row # that means HMMER found multiple matches for it. We only # want the first one (with the lowest E-value). if gff_name != p_gff_name: p_gff_name = gff_name cc = contigs[gene2contig[gff_name]] if len(cc.sequence) < min_len: continue frq = dict([(s, 0) for s in seq_map.values()]) for ss in slid_win(str(cc.sequence).upper(), 4): frq[seq_map.get(ss, 'NNNN')] += 1 sum_frq = float(sum(frq.values())) if sum_frq == 0: sum_frq = 1 fout.write(','.join([gene_name] + [str(frq[i] / sum_frq) \ for i in srted_vals])) fout.write('\n') fout.flush() hmm.close() fout.close()
#! /usr/bin/env python
# Filter 'galGal4.fa.masked' down to the records whose name contains neither
# "_" nor "Un". The kept records are written twice: all together into
# 'galGal4.fa.masked.filtered', and one file per record named after the
# record key.
import screed

db = screed.read_fasta_sequences('galGal4.fa.masked')
keys = [k for k in db.keys() if "_" not in k and "Un" not in k]

# Combined output. Close the writer explicitly so buffered records are
# flushed to disk instead of relying on interpreter shutdown.
filtered = screed.fasta.FASTA_Writer('galGal4.fa.masked.filtered')
try:
    for k in keys:
        record = db[k]
        filtered.write(record)
finally:
    filtered.close()

# One output file per kept record; each writer is closed as soon as its
# single record is written so open handles do not accumulate.
for k in keys:
    record = db[k]
    filtered = screed.fasta.FASTA_Writer(k)
    try:
        filtered.write(record)
    finally:
        filtered.close()
#! /usr/bin/env python import sys import os import screed filename = sys.argv[1] try: os.unlink(filename + '_screed') except OSError: pass db = screed.read_fasta_sequences(filename) ### from whoosh.index import create_in from whoosh.fields import * schema = Schema(name=TEXT(stored=True), description=TEXT(stored=True)) import os, shutil indexdir = filename + '.whooshd' try: shutil.rmtree(indexdir) except OSError: # doesn't exit pass os.mkdir(indexdir) ix = create_in(indexdir, schema) writer = ix.writer()
def setup(self):
    """Index ``test-whitespace.fa`` (next to this module) and open its db."""
    here = os.path.dirname(__file__)
    fasta_path = os.path.join(here, "test-whitespace.fa")
    self._testfa = fasta_path

    # Build the on-disk index first, then open it.
    screed.read_fasta_sequences(fasta_path)
    self.db = screed.ScreedDB(fasta_path)
def get_record_names(path):
    """Return the record names of the FASTA file at ``path`` as a list.

    The file is parsed with screed, its keys are copied out, and the database
    is closed again before returning.

    Args:
        path: FASTA file to parse; also printed for progress tracking.

    Returns:
        list of record names (the database keys).
    """
    print(path)
    db = screed.read_fasta_sequences(path)
    try:
        # Copy the keys into a plain list while the database is still open,
        # so the result stays usable (and sliceable) after close().
        names = list(db.keys())
    finally:
        # Release the database even if reading the keys fails.
        db.close()
    return names