def getGeneData():
    """Collect sequence windows around gene starts from every Genbank file.

    Every file in 'bacteriaGB/Genbank/' is read with readGenesIn, and each
    gene location reported by gb.GeneLocs is passed through preCheck.
    Locations that fail the check are skipped. For the rest, the slice
    dna[stNum-30:stNum+23] is stored in one of two lists:

    * startList    -- the location is a non-complement (its second element
                      is falsy) and the three characters at stNum are 'atg'
    * nonStartList -- every other location that passed preCheck

    Returns:
        (startList, nonStartList)
    """
    path = 'bacteriaGB/Genbank/'
    startList, nonStartList = [], []

    for filename in os.listdir(path):
        data, dna = readGenesIn(path + filename)
        keywordLocs = gb.FindKeywordLocs(data)

        for gene in gb.GeneLocs(data, keywordLocs):
            # preCheck's third result (the end index) is not needed here.
            goodness, stNum, _endNum = preCheck(gene, dna)
            if not goodness:
                continue

            isComp = gene[1]
            codon = dna[stNum:stNum + 3]
            # NOTE(review): assumes preCheck guarantees stNum >= 30; otherwise
            # the negative lower bound would wrap around -- confirm.
            window = dna[stNum - 30:stNum + 23]

            isStart = (not isComp) and codon == 'atg'
            target = startList if isStart else nonStartList
            target.append(window)

    return startList, nonStartList
def DumpSequences(gbname):
    """Return the translation of every keyword location in a Genbank file.

    The file named by gbname is read with gb.ReadFile, its keyword locations
    are found with gb.FindKeywordLocs, and gb.Translation is called once per
    location, in order.

    Returns:
        A list holding the result of gb.Translation for each keyword location.
    """
    contents = gb.ReadFile(gbname)
    return [gb.Translation(contents, kloc)
            for kloc in gb.FindKeywordLocs(contents)]
def FileReadConvert(fname, codkeys):
    """Read a Genbank file and encode each gene as a list of codon indices.

    Args:
        fname: name of a Genbank file.
        codkeys: the codon keys, e.g. codons.keys() where codons comes from
            genbank.Codons(). Any iterable of codon strings is accepted,
            including a Python 3 dict keys view (which has no .index()).

    Returns:
        A list with one entry per gene location; each entry is the list of
        positions in codkeys of the gene's consecutive 3-character chunks.

    Raises:
        ValueError: if a chunk of the coding DNA is not present in codkeys
            (same exception type that list.index() raised before).
    """
    # read in the DNA sequences for the genes
    data = genbank.ReadGenbank(fname)
    dna = genbank.ParseDNA(data)
    klocs = genbank.FindKeywordLocs(data)
    glocs = genbank.GeneLocs(data, klocs)

    # Build the codon -> position table once instead of scanning codkeys for
    # every codon. setdefault keeps the first position of a repeated key,
    # matching what list.index() returns.
    codonIndex = {}
    for position, codon in enumerate(codkeys):
        codonIndex.setdefault(codon, position)

    codons = []
    for gloc in glocs:
        # extract DNA for this sequence
        # NOTE(review): assumes GetCodingDNA returns a str, so each
        # 3-character slice is hashable -- confirm.
        cdna = genbank.GetCodingDNA(dna, gloc)
        # convert to codons
        try:
            geneCodons = [codonIndex[cdna[j:j + 3]]
                          for j in range(0, len(cdna), 3)]
        except KeyError as err:
            raise ValueError(
                '%r is not in codkeys' % (err.args[0],)) from None
        codons.append(geneCodons)
    return codons
def GetData(fn):
    """Load a Genbank file and return its gene locations and DNA.

    Args:
        fn: name of the Genbank file to read.

    Returns:
        (genes, dna), where genes is the result of genbank.GeneLocs and dna
        is the result of genbank.ParseDNA for the file.
    """
    contents = genbank.ReadGenbank(fn)
    dna = genbank.ParseDNA(contents)
    keywordLocs = genbank.FindKeywordLocs(contents)
    return genbank.GeneLocs(contents, keywordLocs), dna