class Pyro:
    """Named view over the records of an aligned FASTA file.

    Records are read through a ``FastaReader`` built from ``fasta_filename``;
    item access and ``keys()`` are delegated to that reader.  ``make_DF``
    feeds the positions of every nucleotide of every record into a ``DF``.
    """

    def __init__(self, name, fasta_filename, aln_length=None, rename_dups=False):
        """
        if aln_length is not given then it is *guessed* by looking at
        the alignment in <fasta_filename>

        name           -- label stored on the instance and passed to DF
        fasta_filename -- path handed to FastaReader
        aln_length     -- alignment length; when None, the length of the
                          first record's sequence is used
        rename_dups    -- forwarded unchanged to FastaReader
        """
        self.name = name
        self.rename_dups = rename_dups
        self.fasta_filename = fasta_filename
        self.fasta_reader = FastaReader(self.fasta_filename, self.rename_dups)
        if aln_length is None:
            # Builtin next() works on Python 2.6+ and 3.x, unlike the
            # Python-2-only ``.next()`` iterator method.
            first_key = next(iter(self.fasta_reader.iterkeys()))
            self.aln_length = len(self.fasta_reader[first_key].seq)
        else:
            self.aln_length = aln_length
        self.nseq = len(self.fasta_reader.keys())

    def __getitem__(self, key):
        """Return the record stored under ``key`` in the underlying reader."""
        return self.fasta_reader[key]

    def keys(self):
        """Return the record keys exposed by the underlying reader."""
        return self.fasta_reader.keys()

    def make_DF(self):
        """Build and return a DF filled from every record of the reader.

        For each record the sequence is converted to a string with "U"
        replaced by "T"; then, for each nucleotide reported by
        ``DF.nucleotides()``, the indices found by ``find_all_indices`` are
        added to the DF with a count of 1 each.
        """
        df = DF(self.name, self.aln_length)
        for seq_id in self.fasta_reader.iterkeys():
            record = self.fasta_reader[seq_id]
            # The normalised sequence does not depend on the nucleotide, so
            # compute it once per record instead of once per nucleotide.
            seq = record.seq.tostring().replace("U", "T")
            for nt in DF.nucleotides():
                # TODO: make find_all_indices iterative to be mem-efficient
                positions = find_all_indices(seq, nt)
                df.add_to_vec(nt=nt, positions=positions, counts=[1] * len(positions))
        return df
def inputFile(self, fname):
    """Load a FASTA file and populate this object from its records.

    Sets ``fname``, ``seqt0`` (sequence of the first record), ``seqtf``
    (sequence of the last record) and ``possibleMutations``, calls
    ``self.findMutations()``, and parses the header of the last record
    into ``uniqueID`` and ``drugsGiven``.

    Header parsing, as implemented below:
      * ``uniqueID`` is the header text up to, but not including, its
        second underscore.
      * Scanning then continues until a run of exactly two underscores has
        been collected; the character after it starts the drug list.
      * Each underscore-terminated token is appended to ``drugsGiven``;
        a token equal to 'None' followed by a non-underscore character
        stops parsing.  A trailing token without a closing underscore is
        not appended.

    Raises IndexError when the file yields no records.
    """
    self.fname = fname
    self.uniqueID = ''
    self.drugsGiven = []

    reader = FastaReader(fname)
    mutationList = [(header, seq) for header, seq in reader.readFasta()]
    self.seqt0 = mutationList[0][1]
    self.seqtf = mutationList[-1][1]
    #Shaves '>'
    self.findMutations()
    self.possibleMutations = findAllPossibleMutations(self.seqt0)

    #Parse the header and put in relevant information
    finalHeader = mutationList[-1][0]
    readHeader = True
    firstUnderScore = True
    readDrugs = False
    builtStr = ''
    # Iterate the explicitly selected last header rather than relying on the
    # variable leaked from the record-reading loop.
    for char in finalHeader:
        if readHeader:
            if char == '_':
                if firstUnderScore:
                    # The first underscore is part of the identifier.
                    builtStr += char
                    firstUnderScore = False
                else:
                    # The second underscore terminates the identifier.
                    readHeader = False
                    self.uniqueID = builtStr
            else:
                builtStr += char
        elif readDrugs:
            if char == '_':
                self.drugsGiven.append(builtStr)
                builtStr = ''
            elif builtStr == 'None':
                break
            else:
                builtStr += char
        elif builtStr == '__':
            # Double underscore seen: this character begins the drug list.
            readDrugs = True
            builtStr = ''
            builtStr += char
        elif char != '_':
            builtStr = ''
        else:
            builtStr += char
def load_sg_seq(self, fasta_fn):
    """Fill ``self.sg_edge_seqs`` with the sequence of every "G" edge.

    fasta_fn -- path handed to FastaReader; only records whose name is
                referenced by a "G" edge are kept in memory (upper-cased).

    For an edge whose stored coordinates satisfy ``s < t`` the slice
    ``[s:t]`` of the read is used; otherwise the slice ``[t:s]`` is
    reversed and each character mapped through ``RCMAP``.
    """
    # Read ids (the part of each node name before ":") used by "G" edges.
    wanted_reads = set()
    for src, dst in self.sg_edges:
        if self.sg_edges[(src, dst)][-1] != "G":
            continue
        wanted_reads.update((src.split(":")[0], dst.split(":")[0]))

    # Load only the needed reads into memory.
    seqs = {}
    for rec in FastaReader(fasta_fn):
        if rec.name in wanted_reads:
            seqs[rec.name] = rec.sequence.upper()

    for src, dst in self.sg_edges:
        edge_data = self.sg_edges[(src, dst)]
        seq_id, s, t = edge_data[0]
        if edge_data[-1] != "G":
            continue
        read_seq = seqs[seq_id]
        if s < t:
            edge_seq = read_seq[s:t]
        else:
            edge_seq = "".join(RCMAP[base] for base in reversed(read_seq[t:s]))
        self.sg_edge_seqs[(src, dst)] = edge_seq
def __init__(self, name, fasta_filename, aln_length=None, rename_dups=False):
    """
    if aln_length is not given then it is *guessed* by looking at
    the alignment in <fasta_filename>

    name           -- label stored on the instance
    fasta_filename -- path handed to FastaReader
    aln_length     -- alignment length; when None, the length of the first
                      record's sequence is used
    rename_dups    -- forwarded unchanged to FastaReader
    """
    self.name = name
    self.rename_dups = rename_dups
    self.fasta_filename = fasta_filename
    self.fasta_reader = FastaReader(self.fasta_filename, self.rename_dups)
    if aln_length is None:
        # Builtin next() works on Python 2.6+ and 3.x, unlike the
        # Python-2-only ``.next()`` iterator method.
        first_key = next(iter(self.fasta_reader.iterkeys()))
        self.aln_length = len(self.fasta_reader[first_key].seq)
    else:
        self.aln_length = aln_length
    self.nseq = len(self.fasta_reader.keys())
def main():
    """Command-line driver: substring count and most frequent k-mer.

    Expects three command-line arguments:
      1. path of the FASTA file to read,
      2. the string to search for (problem 1),
      3. the subsequence length k (problem 2).

    Prints both results and writes every subsequence whose count is at
    least 10 to ``./Output/substringCountLarger10.csv``.  Exits with a usage
    message when arguments are missing.
    """
    if len(sys.argv) < 4:
        sys.exit(f"usage: {sys.argv[0]} <fasta_file> <search_string> <subsequence_length>")

    fastaDir = os.path.abspath(sys.argv[1])  # path of the FASTA file
    fastaReader = FastaReader()
    fastaReader.readFastaFile(fastaDir)  # read file

    # PROBLEM 1. (Detail Algorithm implemented in Sequence.py (subStringSearch() function))
    searchString = sys.argv[2]
    numOfSubstring = fastaReader.numberOfSubstring(searchString)
    print(f"Problem 1: String, {searchString}, appears {numOfSubstring} times in file {fastaReader.getFileName()}")

    # PROBLEM 2. (Detail Algorithm implemented in FastaReader.py ())
    lengthOfString = int(sys.argv[3])
    highestFreqString, appearTimes, subseqCounter = fastaReader.findHighestOccurrence(lengthOfString)
    print(f"Problem 2: {lengthOfString}-mer subsequence, {highestFreqString}, has highest occurrences, {appearTimes} times, in file {fastaReader.getFileName()}")

    # Output the subsequences with count >= 10 to substringCountLarger10.csv.
    # Create the directory first so a fresh checkout does not fail with
    # FileNotFoundError.
    outputDir = os.path.join(os.path.abspath("./"), "Output")
    os.makedirs(outputDir, exist_ok=True)
    with open(os.path.join(outputDir, "substringCountLarger10.csv"), 'w') as outputFile:
        # column names
        outputFile.write("Subsequence,Counts\n")
        for subseq, counter in subseqCounter.items():
            if counter >= 10:
                outputFile.write(f"{subseq},{counter}\n")
def sample(chr, margin=100000):
    """Sample one open reading frame from ``chr`` and return its codon count.

    A start offset is drawn uniformly from ``[0, len(chr) - margin]``; from
    there the sequence is scanned in steps of three for an "ATG".  Codons
    are then counted (the ATG included) until a codon present in ``stops``
    is met or the end of the sequence is reached.

    Returns -1 when the sequence is shorter than ``margin``, when no ATG is
    found, or when a codon containing 'N' is encountered.
    """
    L = len(chr)
    if L < margin:
        # The original drew a negative offset here, which made the slices
        # below wrap around or come back empty.
        return -1
    pos = int(random.uniform(0, L - margin))

    # Scan for a start codon; ``codon`` is pre-set so the check after the
    # loop is well defined even if the loop body never runs.
    codon = None
    while pos < L - 3:
        codon = chr[pos:pos + 3]
        if codon == "ATG":
            break
        pos += 3
    if codon != "ATG":
        return -1

    codons = 0
    while pos < L - 3:
        codon = chr[pos:pos + 3]
        if 'N' in codon:
            return -1
        if stops.get(codon, False):
            break
        codons += 1
        pos += 3
    return codons


#============================ main() ==============================
reader = FastaReader(filename)
try:
    while True:
        [defline, seq] = reader.nextSequence()
        if not defline:
            break
        for i in range(NUM_ORFS):
            length = sample(seq)
            if length < 0:
                continue
            print(length, flush=True)
finally:
    # Close the reader even if sampling or printing fails.
    reader.close()
# The above imports should allow this program to run in both Python 2 and
# Python 3.  You might need to update your version of module "future".
import sys
from FastaReader import FastaReader
from FastaWriter import FastaWriter
from GffTranscriptReader import GffTranscriptReader

# Usage: <in.fasta> <in.gff> <out.fasta>
# Copies to <out.fasta> every FASTA record whose parsed id is the substrate
# of at least one transcript in <in.gff> whose ID starts with "ALT".
if len(sys.argv) != 4:
    # sys.exit is always available; the bare ``exit`` helper comes from the
    # ``site`` module and is not guaranteed to exist.
    sys.exit(sys.argv[0] + " <in.fasta> <in.gff> <out.fasta>")
(fastaFile, gffFile, outFile) = sys.argv[1:]

# Collect the substrates of all transcripts whose ID starts with "ALT".
gffReader = GffTranscriptReader()
transcripts = gffReader.loadGFF(gffFile)
keep = set()
for transcript in transcripts:
    if transcript.getID()[:3] != "ALT":
        continue
    keep.add(transcript.getSubstrate())

reader = FastaReader(fastaFile)
writer = FastaWriter()
# The context manager closes the output file even if reading or writing fails.
with open(outFile, "wt") as fh:
    while True:
        (defline, seq) = reader.nextSequence()
        if not defline:
            break
        (seqID, attr) = FastaReader.parseDefline(defline)
        if seqID not in keep:
            continue
        writer.addToFasta(defline, seq, fh)
print("[done]", file=sys.stderr)
with_statement) from builtins import (bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip) # The above imports should allow this program to run in both Python 2 and # Python 3. You might need to update your version of module "future". import sys import ProgramName from FastaReader import FastaReader from FastaWriter import FastaWriter from Rex import Rex rex = Rex() #========================================================================= # main() #========================================================================= if (len(sys.argv) != 3): exit(ProgramName.get() + " <in.fasta> <out.fasta>\n") (infile, outfile) = sys.argv[1:] OUT = open(outfile, "wt") writer = FastaWriter() reader = FastaReader(infile) while (True): (defline, seq) = reader.nextSequence() if (not defline): break if (not rex.find(">chr", defline)): continue writer.addToFasta(defline, seq, OUT) OUT.close()
index_opt=None, index_outdir = "./bwa_index/") """ Main function of RefMasker that integrate database creation, blast and homology masking * Instantiate Blast database and blastn object * Perform iterative blasts of query sequences against the subject database and create a list of hits. """ # Try to validate a index from an existing one try: if not index_path: raise Exception("No index provided. An index will be generated") print("Existing index provided") FastaRef = FastaReader(ref1_path, ref2_path, write_merge=False) Index = ExistingIndex(bwa_path, index_path) # If no index or if an error occured during validation of the existing index = create a new one except Exception as E: print (E) print("Merge References...") mkdir(ref_outdir) FastaRef = FastaReader([ref1_path,ref2_path], write_merge=True, output="merged.fa") print("Generating index...") mkdir(db_outdir) Index = GenerateIndex(bwa_path, FastaRef.merge_ref, index_opt) remove (FastaRef.merge_ref)
from __future__ import print_function

from FastaReader import FastaReader

# Re-emit every record of tmp.fa on stdout with a header of the form
# ">ccs/<index>/0_<length>" and the sequence wrapped at 60 characters.
f = FastaReader("tmp.fa")
count = 0
for r in f:
    rlen = len(r.sequence)
    print(">ccs/{}/{}_{}".format(count, 0, rlen))
    # range() exists on both Python 2 and 3; xrange() is Python-2-only and
    # raises NameError on Python 3.
    for s in range(0, rlen, 60):
        print(r.sequence[s:s + 60])
    count += 1
from FastaWriter import FastaWriter
from Translation import Translation

#filename="/home/bmajoros/1000G/assembly/combined/HG00096/1.fasta"
#reader=FastaReader("/home/bmajoros/1000G/assembly/combined/HG00096/1.fasta")
#while(True):
#    [defline,seq]=reader.nextSequence()
#    if(not defline): break
#    print("defline="+defline);
#    L=len(seq)
#    print("length="+str(L))

# Print the value reported by getSize and the length of the first sequence.
#filename="/home/bmajoros/1000G/assembly/BRCA1-NA19782.fasta";
filename = "/Users/bmajoros/python/test/data/subset.fasta"
print(FastaReader.getSize(filename))
defline, seq = FastaReader.firstSequence(filename)
print(len(seq))

# Load every record, then print each defline with its parsed id and
# attributes.
#filename="/home/bmajoros/1000G/assembly/test.fasta"
filename = "/Users/bmajoros/python/test/data/subset.fasta"
records = FastaReader.readAllAndKeepDefs(filename)
for key in records.keys():
    defline, seq = records[key]
    print(defline)
    seq_id, attrs = FastaReader.parseDefline(defline)
    print("id=" + seq_id)
    for attr_name, attr_value in attrs.items():
        print(attr_name + "=" + attr_value)