def addIllumina(readsetfh, orgname, rundata, nembasedir, outlog):
    """Add the assembled contigs of every Illumina experiment to the readset.

    For each experiment in ``rundata.expsIllumina`` three assemblies are
    looked up below ``<nembasedir>/illumina``: the single and the paired
    Trinity assembly and the Oases assembly.  When any of the three files is
    missing a warning is printed and written to ``outlog`` and the experiment
    is skipped.  When both Trinity files are empty the Oases transcripts are
    used, otherwise the single and then the paired Trinity contigs.

    Every record is renamed to ``Contig_Illumina_<n>`` (``n`` is numbered
    continuously across all experiments, starting at 1) and handed to
    ``writeSeq`` together with ``readsetfh``.
    """
    illuminadir = os.path.join(nembasedir, "illumina")
    counter = 1
    for exp in rundata.expsIllumina:
        trinitydir = os.path.join(illuminadir, "_trinityASM_%s" % exp,
                                  "assemblydir")
        single = os.path.join(trinitydir, "single", "Trinity.fasta")
        paired = os.path.join(trinitydir, "paired", "Trinity.fasta")
        oases = os.path.join(illuminadir, "_oasesASM_%s" % exp,
                             "assemblydir", "transcripts.fa")

        if not all(os.path.exists(path) for path in (single, paired, oases)):
            warning = "Warning: Experiment %s not found for %s" % (exp, orgname)
            print(warning)
            outlog.write(warning + "\n")
            continue

        # Fall back to Oases only when *both* Trinity assemblies are empty.
        if os.stat(single).st_size == 0 and os.stat(paired).st_size == 0:
            print("Trinity assembly empty, using Oases")
            sources = (oases,)
        else:
            sources = (single, paired)

        for source in sources:
            for rec in fasta_itr(source):
                rec.header = "Contig_Illumina_%s" % counter
                writeSeq(rec, readsetfh)
                counter += 1
def _headerMatches(header, name, partial):
    """Tell whether a single name (string or regex-like object) matches header."""
    if hasattr(name, 'findall'):
        # Compiled regular expressions (or look-alikes) decide for themselves.
        return len(name.findall(header)) > 0
    return header == name or bool(partial and name in header)


def getFromFastaByName(cf):
    """Write the records whose header matches one of the requested names.

    The names come from ``getNames``; they are plain strings or, when the
    'regexp' parameter is set, objects with a ``findall`` method such as
    compiled regular expressions.  A plain string matches when it equals the
    header or, if the 'partial' parameter is true, when it occurs anywhere
    in the header.  With the 'negate' parameter the selection is inverted,
    i.e. only records matching none of the names are written.

    Returns the status reported by ``getNames`` when it is not
    ``constants.OK`` (nothing is written in that case), otherwise
    ``constants.OK``.
    """
    names, status = getNames(cf, cf.get_parameter('regexp', 'boolean'))
    if not status == constants.OK:
        return status
    fastafile = cf.get_input('fastafile')
    # The context manager closes the output even if reading the input fails.
    with open(cf.get_output('outputfile'), 'w') as outputfile:
        partial = cf.get_parameter('partial', 'boolean')
        negate = cf.get_parameter('negate', 'boolean')
        for rec in fasta_itr(fastafile):
            found = any(_headerMatches(rec.header, name, partial)
                        for name in names)
            # Keep matches normally, keep non-matches when negating.
            if found != bool(negate):
                outputfile.write(str(rec) + "\n")
    return constants.OK
def getOverRepClusters(cf):
    """Identify over represented clusters in a fastq file and write their seeds.

    The number of records in the 'fastqfile' input is the total read count.
    In the tab separated 'resultsuc' input every row whose first column is
    'H' counts as one hit for the cluster named in its last column.  A seed
    record from the 'resultsfa' input is written to the 'resultsfa' output
    when its hit count, as a percentage of the total read count, is at
    least the 'percRep' parameter.  Seeds without any hit are skipped.

    Returns ``constants.OK``.  A ZeroDivisionError is raised when a seed has
    hits although the fastq file contained no records.
    """
    fastqfile = cf.get_input('fastqfile')
    resultsuc = cf.get_input('resultsuc')
    resultsfa = cf.get_input('resultsfa')
    percRep = cf.get_parameter('percRep', 'float')
    output = cf.get_output('resultsfa')

    # All handles are managed by ``with`` so they are closed on every path.
    with open(fastqfile, 'rb') as fastqfh:
        totalSeqs = sum(1 for _rec in FastqParser().parse(fastqfh))

    clusterCounts = {}
    with open(resultsuc, 'rb') as ucfh:
        reader = csv.reader(ucfh, quoting=csv.QUOTE_NONE, delimiter='\t')
        for row in reader:
            # Blank lines come back as empty rows; skip them instead of
            # failing on row[0].
            if row and row[0] == 'H':
                clusterCounts[row[-1]] = clusterCounts.get(row[-1], 0) + 1

    with open(output, 'wb') as outfh:
        for rec in fasta_itr(resultsfa):
            if rec.header not in clusterCounts:
                continue
            clusterRep = (float(clusterCounts[rec.header]) /
                          float(totalSeqs)) * 100
            if clusterRep >= percRep:
                outfh.write(str(rec) + '\n')
    return constants.OK
def fasta_read(file_name):
    """Read a fasta file into a dictionary.

    Returns a dict that maps each record header to the record sequence.
    When a header occurs more than once the last record wins.
    """
    return dict((entry.header, entry.sequence)
                for entry in fasta.fasta_itr(file_name))
def getAllData(dataFile):
    """Collect the cleaned header and sequence of every fasta record.

    Header and sequence are stripped of surrounding whitespace and every
    parenthesis and single quote is removed from them.  Returns a list of
    ``[header, sequence]`` pairs in file order.
    """
    unwanted = re.compile('[()\'\']')
    rows = []
    for entry in fasta.fasta_itr(dataFile):
        cleanSequence = unwanted.sub('', entry.sequence.strip())
        cleanHeader = unwanted.sub('', entry.header.strip())
        rows.append([cleanHeader, cleanSequence])
    return rows
def printMIRAContigs(orgdir):
    """Write the assembled MIRA contigs to ``<orgdir>/contigs.fa``.

    The contigs are read from
    ``<orgdir>/mira_assembly/mira_d_results/mira_out.unpadded.fasta`` and
    each record is written as ``str(rec)`` followed by a newline.  An
    existing ``contigs.fa`` is overwritten.
    """
    outputfile = os.path.join(orgdir, "contigs.fa")
    miraasm = os.path.join(orgdir, "mira_assembly", "mira_d_results",
                           "mira_out.unpadded.fasta")
    # ``with`` closes the output even when reading the assembly fails.
    with open(outputfile, 'w') as outfh:
        for rec in fasta_itr(miraasm):
            outfh.write(str(rec) + "\n")
def fasta_read(file_name):
    """Return the sequences of a fasta file keyed by their header.

    Records are read with ``fasta.fasta_itr``; a header that appears more
    than once keeps the sequence of its last occurrence.
    """
    return {entry.header: entry.sequence
            for entry in fasta.fasta_itr(file_name)}
def getAllData(dataFile):
    """Return ``[header, sequence]`` pairs for all records of a fasta file.

    Both fields are stripped of surrounding whitespace and have every
    parenthesis and single quote removed.  The pairs keep the file order.
    """
    def scrub(text):
        # Trim whitespace, then drop '(' , ')' and single quotes.
        return re.sub('[()\'\']', '', text.strip())

    return [[scrub(entry.header), scrub(entry.sequence)]
            for entry in fasta.fasta_itr(dataFile)]
def appendUnassembledReads(orgdir, asmreads):
    """Append the unassembled reads to the final contig file.

    Every record of ``<orgdir>/reads.fa`` whose header is not contained in
    ``asmreads`` is appended to ``<orgdir>/contigs.fa`` as ``str(rec)``
    followed by a newline.

    ``asmreads`` is only used for membership tests on record headers, so
    any container supporting ``in`` works (the original required a dict).
    """
    outputfile = os.path.join(orgdir, "contigs.fa")
    readfile = os.path.join(orgdir, "reads.fa")
    # ``with`` closes the contig file even when reading the reads fails.
    with open(outputfile, 'a') as outfh:
        for rec in fasta_itr(readfile):
            if rec.header in asmreads:
                continue
            outfh.write(str(rec) + "\n")
def fasta_merge(cf):
    """Merge an array of fasta files into a single output file.

    The files are taken from the "fastafiles" array (as returned by
    ``get_array``, iterated as ``(key, path)`` pairs) and their records are
    written, in array order, to the "output" file as ``str(rec)`` followed
    by a newline.  The array is also written to the component log.

    Returns ``constants.OK``.
    """
    # ``with`` closes the output even if fetching the array, logging or
    # reading one of the inputs raises.
    with open(cf.get_output("output"), "w") as outfh:
        fastafiles = get_array(cf, "fastafiles")
        cf.write_log(str(fastafiles))
        for _key, fastafile in fastafiles:
            for rec in fasta_itr(fastafile):
                outfh.write(str(rec) + "\n")
    return constants.OK
def CreateNegDict(NegativeFileName):
    """Read the negative fasta file into two parallel lists.

    Returns ``(NegSequences, NegHeaders)``: the sequences and the headers of
    all records in file order, so that ``NegSequences[i]`` belongs to
    ``NegHeaders[i]``.  Despite the function name no dictionary is built.
    """
    # The original also opened NegativeFileName itself, never used the
    # handle and never closed it; fasta.fasta_itr is given the file name, so
    # that extra handle is dropped.
    NegSequences = []
    NegHeaders = []
    for record in fasta.fasta_itr(NegativeFileName):
        NegSequences.append(record.sequence)
        NegHeaders.append(record.header)
    return NegSequences, NegHeaders
def addESTs(readsetfh, orgname, nembasedir, outlog):
    """Add the EST contig set of an organism to the readset.

    The set is expected at ``<nembasedir>/est/_outputCtgSet_<orgname>/output``.
    If that path does not exist a note is written to ``outlog`` and nothing
    else happens.  Otherwise every record is renamed to ``Contig_EST_<n>``
    (numbered from 1) and passed to ``writeSeq`` together with ``readsetfh``.
    """
    estset = os.path.join(nembasedir, "est", "_outputCtgSet_%s" % orgname,
                          "output")
    if os.path.exists(estset):
        for number, rec in enumerate(fasta_itr(estset), 1):
            rec.header = "Contig_EST_%s" % number
            writeSeq(rec, readsetfh)
    else:
        outlog.write("ESTs for %s not found \n" % orgname)
def add454(readsetfh, orgname, rundata, nembasedir, outlog):
    """Add the CAP3 assemblies of every 454 experiment to the readset.

    For each experiment in ``rundata.exps454`` the files
    ``output.cap.contigs`` and ``output.cap.singlets`` are looked up in
    ``<nembasedir>/454/_cap3Asm_<exp>/assemblydir``.  When either file is
    missing a warning is printed and written to ``outlog`` and the
    experiment is skipped.  Otherwise the contigs and then the singlets are
    renamed to ``Contig_454_<n>`` (``n`` is numbered continuously across all
    experiments, starting at 1) and passed to ``writeSeq`` together with
    ``readsetfh``.
    """
    counter = 1
    for exp in rundata.exps454:
        asmdir = os.path.join(nembasedir, "454", "_cap3Asm_%s" % exp,
                              "assemblydir")
        contigs = os.path.join(asmdir, "output.cap.contigs")
        singlets = os.path.join(asmdir, "output.cap.singlets")

        if not os.path.exists(contigs) or not os.path.exists(singlets):
            warning = "Warning: Experiment %s not found for %s" % (exp, orgname)
            print(warning)
            outlog.write(warning + "\n")
            continue

        for source in (contigs, singlets):
            for rec in fasta_itr(source):
                rec.header = "Contig_454_%s" % counter
                writeSeq(rec, readsetfh)
                counter += 1
def get_seqs(f):
    """Load the records of a fasta file with upper-cased sequences.

    Returns three parallel lists: the records themselves (their
    ``sequence`` attribute is replaced by its upper-case form), the value
    of ``GC(sequence)`` for each record, and the sequence lengths.
    """
    # The original opened ``f`` a second time without ever reading from the
    # handle, which leaked whenever iteration raised; fasta.fasta_itr is
    # given the file name, so the extra handle is dropped.
    seqs = []
    fg_gc_list = []
    fg_lengths = []
    for record in fasta.fasta_itr(f):
        record.sequence = record.sequence.upper()
        seqs.append(record)
        fg_gc_list.append(GC(record.sequence))
        fg_lengths.append(len(record.sequence))
    return seqs, fg_gc_list, fg_lengths
def renameContigs(indir, outdir):
    """Rename the contigs in indir and write them to outdir.

    For every entry ``d`` of ``indir`` the directory ``<outdir>/<d>`` is
    created and the records of ``<indir>/<d>/contigs.fa`` are written to
    ``<outdir>/<d>/contigs.fa`` with their header replaced by
    ``<code>_<n>``, where ``code`` is ``getSpeciesCode(d)`` and ``n`` counts
    from 1 within each file.

    Raises OSError (from ``os.mkdir``) when a target directory already
    exists.
    """
    for d in os.listdir(indir):
        code = getSpeciesCode(d)
        os.mkdir(os.path.join(outdir, d))
        infile = os.path.join(indir, d, "contigs.fa")
        outfile = os.path.join(outdir, d, "contigs.fa")
        # ``with`` closes each output file even when reading its input fails.
        with open(outfile, 'w') as outfh:
            for index, rec in enumerate(fasta_itr(infile), 1):
                rec.header = code + "_" + str(index)
                outfh.write(str(rec) + "\n")
#!/usr/bin/python
#
# This software is freely provided for any use.
#
# erik garrison <*****@*****.**>
#
# Print every record of a fasta file whose sequence is longer than the
# given cutoff.
#
# usage: <script> <contig_file> <contig length cutoff>

import fasta
from fasta import fasta_itr
import sys

if len(sys.argv) < 3:
    print("usage: %s <contig_file> <contig length cutoff>" % sys.argv[0])
    # Missing arguments are an error: leave with a non-zero status and do
    # not rely on the site-provided exit() helper.
    sys.exit(1)

contig_file = sys.argv[1]
cutoff = int(sys.argv[2])

for rec in fasta_itr(contig_file):
    # Strictly longer than the cutoff; records of exactly that length are
    # dropped.
    if len(rec.sequence) > cutoff:
        print(rec)
import re
import os

# Script excerpt: reads a positive and a negative fasta file named on the
# command line and collects the positive sequence lengths plus the negative
# sequences and headers.
# NOTE(review): `sys` and `fasta` are used but not imported in this excerpt;
# presumably they are imported above it -- confirm.

#Prepare and open files
PositiveFileName = sys.argv[1]
NegativeFileName = sys.argv[2]
PosFileBaseName = os.path.basename(PositiveFileName)
print PosFileBaseName
# The output name is "NON" + the positive file's base name; it carries no
# directory part, so it is opened relative to the current working directory.
OutputFileName = "NON" + PosFileBaseName
print OutputFileName
# NOTE(review): the two input handles are not read anywhere in this excerpt
# (the records below are read via the file names) -- check whether later code
# needs them.
PositiveFile = open(PositiveFileName, "r")
NegativeFile = open(NegativeFileName, "r")
OutputFile = open(OutputFileName, "w")
PosLengths = []
for record in fasta.fasta_itr( PositiveFileName): #Remember lengths of positive sequences
    seqLength = len(record.sequence)
    PosLengths.append(seqLength)
# NegSequences[i] and NegHeaders[i] describe the same negative record.
NegSequences = []
NegHeaders = []
for record in fasta.fasta_itr( NegativeFileName ): #Create parallel arrays for negative sequence/headers
    sequence = record.sequence
    header = record.header
    NegSequences.append(sequence)
    NegHeaders.append(header)
indexArr = []
import fasta
import sys
import random

# Script excerpt: takes a positive fasta file, a negative fasta file and an
# output file name from the command line, then collects the positive
# sequence lengths plus the negative sequences and headers.
PositiveFileName = sys.argv[1]
NegativeFileName = sys.argv[2]
OutputFileName = sys.argv[3]
#OutputFileName = PositiveFileName.lstrip("TTP_PARCLIP_ConversionSpecificity")
#OutputFileName = OutputFileName.rstrip(".txt")
#OutputFileName = "Non_TTPPARCLIP" + OutputFileName + ".txt"
# NOTE(review): the two input handles are not read anywhere in this excerpt
# (the records below are read via the file names) -- check whether later code
# needs them.
PositiveFile = open(PositiveFileName, "r")
NegativeFile = open(NegativeFileName, "r")
OutputFile = open(OutputFileName, "w")
PosLengths = []
for record in fasta.fasta_itr( PositiveFileName): #Remember lengths of positive sequences
    seqLength = len(record.sequence)
    PosLengths.append(seqLength)
# NegSequences[i] and NegHeaders[i] describe the same negative record.
NegSequences = []
NegHeaders = []
for record in fasta.fasta_itr(NegativeFileName):
    sequence = record.sequence
    header = record.header
    NegSequences.append(sequence)
    NegHeaders.append(header)
# One pass per positive sequence length; only the loop's initialisation is
# visible in this excerpt.
for number in PosLengths:
    gotLength = False
    seqLength = 0
import re
import os

# Script excerpt: reads a positive and a negative fasta file named on the
# command line and collects the positive sequence lengths plus the negative
# sequences and headers.
# NOTE(review): `sys` and `fasta` are used but not imported in this excerpt;
# presumably they are imported above it -- confirm.

#Prepare and open files
PositiveFileName = sys.argv[1]
NegativeFileName = sys.argv[2]
PosFileBaseName = os.path.basename(PositiveFileName);
print PosFileBaseName
# The output name is "NON" + the positive file's base name; it carries no
# directory part, so it is opened relative to the current working directory.
OutputFileName = "NON" + PosFileBaseName
print OutputFileName;
# NOTE(review): the two input handles are not read anywhere in this excerpt
# (the records below are read via the file names) -- check whether later code
# needs them.
PositiveFile = open(PositiveFileName, "r")
NegativeFile = open(NegativeFileName, "r")
OutputFile = open(OutputFileName , "w")
PosLengths = []
for record in fasta.fasta_itr(PositiveFileName): #Remember lengths of positive sequences
    seqLength = len(record.sequence)
    PosLengths.append(seqLength)
# NegSequences[i] and NegHeaders[i] describe the same negative record.
NegSequences = []
NegHeaders = []
for record in fasta.fasta_itr(NegativeFileName): #Create parallel arrays for negative sequence/headers
    sequence = record.sequence
    header = record.header
    NegSequences.append(sequence)
    NegHeaders.append(header)
indexArr = []
#Iterate through lengths in positive length array, picking a random negative sequence
#and picking LENGTH nucleotides at random start point
import fasta
import sys
import random

# Script excerpt: takes a positive fasta file, a negative fasta file and an
# output file name from the command line, then collects the positive
# sequence lengths plus the negative sequences and headers.
PositiveFileName = sys.argv[1]
NegativeFileName = sys.argv[2]
OutputFileName = sys.argv[3];
#OutputFileName = PositiveFileName.lstrip("TTP_PARCLIP_ConversionSpecificity")
#OutputFileName = OutputFileName.rstrip(".txt")
#OutputFileName = "Non_TTPPARCLIP" + OutputFileName + ".txt"
# NOTE(review): the two input handles are not read anywhere in this excerpt
# (the records below are read via the file names) -- check whether later code
# needs them.
PositiveFile = open(PositiveFileName, "r")
NegativeFile = open(NegativeFileName, "r")
OutputFile = open(OutputFileName , "w")
PosLengths = []
for record in fasta.fasta_itr(PositiveFileName): #Remember lengths of positive sequences
    seqLength = len(record.sequence)
    PosLengths.append(seqLength)
# NegSequences[i] and NegHeaders[i] describe the same negative record.
NegSequences = []
NegHeaders = []
for record in fasta.fasta_itr(NegativeFileName):
    sequence = record.sequence
    header = record.header
    NegSequences.append(sequence)
    NegHeaders.append(header)
# One pass per positive sequence length; only the loop's initialisation is
# visible in this excerpt.
for number in PosLengths:
    gotLength = False
# # erik garrison <*****@*****.**> import fasta from fasta import fasta_itr import sys if len(sys.argv) < 2: print "usage:", sys.argv[0], "<fasta file>" exit() fasta_file = sys.argv[1] print '\t'.join(["header", "length", "a", "t", "g", "c", "at", "gc", "other"]) for rec in fasta_itr(fasta_file): l = len(rec.sequence) a,t,g,c,at,gc,other = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 for bp in rec.sequence: if bp == 'A': a += 1 elif bp == 'T': t += 1 elif bp == 'G': g += 1 elif bp == 'C': c += 1 else: other += 1 at = a + t gc = g + c
#os.environ["HMMERDB"] += ":"+os.path.abspath(options.hmm_path) #print os.environ["HMMERDB"] out_fname = os.path.abspath(options.out_fname) out_dir = os.path.dirname(out_fname) fname = os.path.abspath(options.input_fasta) tr = string.maketrans("gatcryswkmbdhvnGATCRYSWKMBDHVN", "ctagyrswmkvhdbnCTAGYRSWMKVHDBN") def rev_record(record): return ">" + record.header + "|rev\n" + format( record.sequence[::-1].translate(tr)) records = [rec for rec in fasta.fasta_itr(fname)] headers = [[rec.header, len(rec.sequence)] for rec in records] ff = open(out_fname + '.fa', 'w') for (i, rec) in enumerate(records): ff.write('>s' + str(i) + '\n' + format(rec.sequence) + '\n') ff.write('>s' + str(i) + '|rev\n' + format(rec.sequence[::-1].translate(tr)) + '\n') ff.close() #sys.exit(1) # a temporary fasta file, use s(int) to easy the parsing def parse_hmmsearch(kingdom, moltype, src): # function to parse hmmsearch output resu = []
parser.print_help() sys.exit(1) #print "%s"% os.path.abspath(options.hmm_path) #os.environ["HMMERDB"] += ":"+os.path.abspath(options.hmm_path) #print os.environ["HMMERDB"] fname = os.path.abspath(options.input_fasta) tr = string.maketrans("gatcryswkmbdhvnGATCRYSWKMBDHVN","ctagyrswmkvhdbnCTAGYRSWMKVHDBN") def rev_record(record): return ">"+record.header+"|rev\n"+format(record.sequence[::-1].translate(tr)) records = [rec for rec in fasta.fasta_itr(fname)] headers = [[rec.header,len(rec.sequence)] for rec in records] temp_fasta = tempfile.NamedTemporaryFile(delete=False) ff = open(temp_fasta.name,'w') for (i, rec) in enumerate(records): ff.write('>s'+str(i)+'\n'+format(rec.sequence)+'\n') ff.write('>s'+str(i)+'|rev\n'+format(rec.sequence[::-1].translate(tr))+'\n') ff.close() #sys.exit(1) # a temporary fasta file, use s(int) to easy the parsing def parse_hmmsearch(kingdom, moltype, src): # function to parse hmmsearch output resu = []