def retrieveIEs(self):
    """Parse the introner-element FASTA and cache one record per element.

    Headers are expected to look like ``<fam>_<scaffold>_<start>-<stop>``;
    each record is appended to self.ieList as a dict with keys
    'fam', 'scaff', 'start', 'stop', and 'seq'.
    """
    reader = FastAreader(self.intronerFile)
    for header, seq in reader.readFasta():
        fields = header.split('_')
        coord_parts = fields[2].split("-")
        self.ieList.append({
            'fam': fields[0],
            'scaff': fields[1],
            'start': int(coord_parts[0]),
            'stop': int(coord_parts[1]),
            'seq': seq,
        })
def retrieveAll(self):
    """Match each introner element to the intron that contains it.

    Reads the exon/intron output file (self.exintFile), then for every
    intron checks each element in self.ieList on both strands.  On a hit
    the element dict's 'seq' is replaced with the full intron sequence,
    the intron's header is recorded, and the dict is appended to
    self.finalList, which is returned.
    """
    allList = []
    myReaderAll = FastAreader(self.exintFile)
    for header, seq in myReaderAll.readFasta():
        allList.append(header + ';' + seq)
    for data in allList:
        header = data.split(';')[0]
        intron = data.split(';')[-1]
        # Hoisted out of the inner loop; the original recomputed
        # deepcopy(intron).upper() once per element for no benefit.
        intronUpper = intron.upper()
        for ieDic in self.ieList:
            ieSeq = ieDic['seq']
            # BUG FIX: compute the reverse complement from the element's
            # own sequence BEFORE 'seq' may be overwritten with the intron.
            # The original read ieDic['seq'] again after replacement, so
            # the revcomp test ran against the wrong sequence, and a dict
            # matching on both strands was appended twice.
            revComp = str(Seq(ieSeq).reverse_complement())
            if ieSeq in intronUpper or revComp in intronUpper:
                ieDic['seq'] = intron
                ieDic['header'] = header
                self.finalList.append(ieDic)
    return self.finalList
def getProteins(self, fam):
    """Write proteins whose locus appears in self.geneDic to the family
    protein FASTA, at most once per locus, and return self.proteinDic.
    """
    print("Get Proteins")
    reader = FastAreader('Data_{0}/{1}'.format(self.NAME, self.fasta))
    outPath = 'Data_{0}/ieProteins2.fa'.format(self.NAME)
    for header, sequence in reader.readFasta():
        for head, locus in self.geneDic.items():
            # Skip loci already written or absent from this header.
            if locus not in header or locus in self.proteinList:
                continue
            self.proteinList.append(locus)
            with open(outPath, 'a') as proteinFile:
                proteinFile.write('>{1}_{2}_{3}_{0}\n'.format(locus, fam, self.NAME, head))
                proteinFile.write('{0}\n'.format(sequence))
    return self.proteinDic
def cluster():
    """Filter the all-vs-all hit table and cluster sequences with silixx.

    Keeps hits with >90% identity whose alignment length exceeds 80% of
    both the query and subject transcript lengths, then runs silixx on
    the surviving ID pairs.
    """
    transcriptDic = {}
    for species in ['P_eremicus', 'P_leucopus', 'P_polionotus', 'P_manuclatus']:
        cdsReader = FastAreader('{0}/reconstructed_CDS.fa'.format(species))
        for header, sequence in cdsReader.readFasta():
            transcriptDic[header] = float(len(sequence))
    with open('all_hits.out.filtered', 'w') as outf, \
            open('all_hits.out', 'r') as f:
        for line in f:
            cols = line.strip().split('\t')
            if float(cols[2]) <= 90.0:
                continue
            alnLen = float(cols[3])
            # Alignment must cover >80% of both sequences involved.
            if (alnLen > transcriptDic[cols[0].strip()] * .8
                    and alnLen > transcriptDic[cols[1].strip()] * .8):
                outf.write(line)
    os.system(
        "awk '{print $1\" \"$2}' all_hits.out.filtered > all_hits_IDs.out")
    # 181276 is the number of sequences upper bound
    os.system('silixx 181276 all_hits_IDs.out > clusters')
def ieReader(self):
    """Read Data_<NAME>/finalIEs.fa into self.famDic[family][header].

    The family id is the leading one or two digits of each FASTA header;
    the remainder of the header is the element key.  (Fixes the
    non-idiomatic ``== False`` test, the redundant ``[a:b:1]`` slice
    steps, the redundant re-assignment of ``fam`` in the else branch,
    and the ``.keys()`` membership test.)
    """
    print('reading IE file ...')
    myReaderIE = FastAreader('Data_{0}/finalIEs.fa'.format(self.NAME))
    for header, sequence in myReaderIE.readFasta():
        fam = header[:2]
        # One-digit family if the second character is not a digit.
        if not fam[1].isdigit():
            fam = header[0]
            head = header[1:]
        else:
            head = header[2:]
        if fam not in self.famDic:
            self.famDic[fam] = {head: sequence}
        else:
            self.famDic[fam][head] = sequence
def readIeFile(self):
    """Read self.ieFile and group element headers by family id.

    The family id is the leading one or two digits of each FASTA header;
    the rest of the header is appended to self.ieHeadDic[fam], which is
    returned.  (Fixes the non-idiomatic ``== False`` test, redundant
    ``[a:b:1]`` slice steps, redundant ``fam`` re-assignment, and the
    ``.keys()`` membership test.)
    """
    myReaderIE = FastAreader(self.ieFile)
    for header, seq in myReaderIE.readFasta():
        fam = header[:2]
        # One-digit family if the second character is not a digit.
        if not fam[1].isdigit():
            fam = header[0]
            head = header[1:]
        else:
            head = header[2:]
        if fam not in self.ieHeadDic:
            print("NEW FAM")
            self.ieHeadDic[fam] = [head]
        else:
            print('Append')
            self.ieHeadDic[fam].append(head)
    return self.ieHeadDic
def main():
    """Download each GCA assembly listed in eukaryotes.csv, parse its
    introner-element families, drop elements strictly nested inside a
    larger element on the same scaffold, attach 100 bp genomic flanks,
    and run the gene-duplicate removal pipeline.
    """
    myData = csvReader('eukaryotes.csv')
    genomeData = myData.csv()
    for assembly in genomeData:
        NAME = assembly['Reference']
        # BUG FIX: the original tested "'GCA' not in assembly", which
        # checks the dict's *keys*; the accession string lives in NAME
        # (matching the sibling algae pipeline).
        if 'GCA' not in NAME:
            continue
        ieList = []
        print('Removing gene duplicates from {0}'.format(NAME))
        os.system('wget {0}'.format(assembly['Fasta']))
        os.system('gunzip {0}*'.format(NAME))
        os.system('cp {0}* ./Data_{0}'.format(NAME))
        os.system('gunzip ./Data_{0}/*'.format(NAME))
        os.system('rm -r {0}*'.format(NAME))
        # NCBI download URLs end .../<accession>_<assembly>/...; the
        # second-to-last path component prefixes the _genomic.fna file.
        fastaList = assembly['Fasta'].split("/")
        fastaGz = fastaList[-2]
        fasta = fastaGz + '_genomic.fna'
        print(fasta)
        famDic = {}
        myReaderIE = FastAreader('Data_{0}/IEfamilies*'.format(NAME))
        for header, seq in myReaderIE.readFasta():
            # Headers look like <famN>_x_<scaffold>_<start>-<stop>;
            # the family id is the first character of the first field.
            headList = header.split('_')
            head = headList[0]
            # Simplified: head[0:1:1] is a str, so the original
            # isinstance(fam, int) branch could never be taken.
            fam = int(head[0])
            scaff = headList[2]
            coordList = headList[3].split("-")
            start = int(coordList[0])
            stop = int(coordList[1])
            ieDic = {'fam': fam, 'scaff': scaff, 'start': start,
                     'stop': stop, 'seq': seq}
            if str(fam) not in famDic:
                famDic[str(fam)] = 1
            else:
                famDic[str(fam)] += 1
            ieList.append(ieDic)
        # Drop elements strictly contained within another element on the
        # same scaffold.
        # BUG FIX: the original inner loop was range(0, len(ieList), -1),
        # which is empty (the containment test never ran), and it called
        # ieList.remove() while index-iterating the same list.  Collect
        # the nested elements first, then filter.
        nested = []
        for ieDic1 in ieList:
            for ieDic2 in ieList:
                if ieDic2 is ieDic1:
                    continue
                if (ieDic1['scaff'] == ieDic2['scaff']
                        and ieDic2['start'] < ieDic1['start']
                        and ieDic1['stop'] < ieDic2['stop']):
                    nested.append(ieDic1)
                    print(ieDic1)
                    break
        ieList = [d for d in ieList if d not in nested]
        ieDicSeqs = []
        myReaderGenome = FastAreader('Data_{0}/{1}'.format(NAME, fasta))
        for header, sequence in myReaderGenome.readFasta():
            head = header.split(" ")[0]
            for ieDic in ieList:
                if ieDic['scaff'] == head:
                    start = ieDic['start']
                    stop = ieDic['stop']
                    # 100 bp of flanking genomic sequence on each side.
                    # NOTE(review): a start < 100 makes the upstream slice
                    # wrap via a negative index — confirm scaffolds never
                    # begin with an element that close to the edge.
                    ieDic['up'] = sequence[int(start) - 100:int(start)]
                    ieDic['down'] = sequence[int(stop):int(stop) + 100]
                    ieDicSeqs.append(ieDic)
        myFolder = geneDups(ieList, NAME)
        myFolder.makeFile()
        # BUG FIX: the original read "myFolder.avaBlast" with no call
        # parentheses — a no-op attribute access.
        myFolder.avaBlast()
        myRemoveDups = removeDups(ieList, NAME, famDic)
        myRemoveDups.remove()
# Set-up for summarizing ambiguity and missing data across a SARS-CoV-2
# multiple sequence alignment, relative to the NC_045512v2 (Wuhan-Hu-1)
# reference.  The per-site tallies are presumably filled in later in the
# file (not shown in this chunk).
from sequenceAnalyzer import FastAreader
import argparse

parser = argparse.ArgumentParser(description='Summarize the distribution of ambiguity and missing data in a SARS-CoV-2 phylogeny')
parser.add_argument('-msa', nargs='?', required=True, help='Path to msa file for SARS-CoV-2 phylogeny')
parser.add_argument('-min_MAF', nargs='?', required=True, help='Minimum minor allele frequency for reported sites')
args = vars(parser.parse_args())

# Per-sequence tallies, keyed by FASTA header.  NOTE(review): ambDic/NDic
# look like ambiguity-call and 'N' counts respectively — populated later
# in the file; confirm there.
ambDic = {}
NDic = {}
seqDic = {}  # header -> upper-cased aligned sequence

# Read the reference genome; keep its header and upper-cased sequence.
myReaderRef = FastAreader('NC_045512v2.fa')
for header, sequence in myReaderRef.readFasta():
    refHead = header
    refSeq = sequence.upper()

ambDicPerSite = {}
NDicPerSite = {}
myReaderFasta = FastAreader(args['msa'])
for header, sequence in myReaderFasta.readFasta():
    # If this aligned record is the reference itself (gaps stripped, case
    # folded), replace refSeq with its *aligned* form so positions map
    # through the alignment.
    if sequence.replace('-', '').upper() == refSeq:
        refSeq = sequence
    seqDic[header] = sequence.upper()
    NDic[header] = 0
    ambDic[header] = 0
    ambDicPerSite[header] = {}
    NDicPerSite[header] = {}

NambDicPerSite = {}
altDic = {}
# Canonical, unambiguous bases — presumably used later as a membership
# test for non-ambiguous calls; verify against the rest of the file.
charDic = {'A': '', 'G': '', 'C': '', 'T': ''}
# Set-up for a per-cluster selection analysis: read cluster membership,
# collect each member's reconstructed CDS (padded with trailing 'N's to a
# whole number of codons), and create one working directory per cluster.
import os
from sequenceAnalyzer import FastAreader

clusterDic = {}     # cluster id -> list of member transcript headers
transcriptDic = {}  # transcript header -> CDS sequence (filled below)

# Grouped_clusters.tsv: column 1 is the cluster id, the remaining
# tab-separated columns are member transcript ids.
with open('Grouped_clusters.tsv', 'r') as f:
    for line in f:
        sp = line.strip().split('\t')
        clusterDic[sp[0]] = []
        for i in sp[1:]:
            clusterDic[sp[0]].append(i)
            transcriptDic[i] = ''

# Pull the actual CDS sequence for every clustered transcript from each
# species' reconstructed-CDS FASTA.
for i in ['P_eremicus', 'P_leucopus', 'P_polionotus', 'P_manuclatus']:
    myReaderInGenes = FastAreader('{0}/reconstructed_CDS.fa'.format(i))
    for header, sequence in myReaderInGenes.readFasta():
        if header in transcriptDic:
            if len(sequence) % 3 != 0:
                # Pad to a multiple of 3 with trailing Ns (codon-aware
                # tools require whole codons).
                remainder = len(sequence) % 3
                addString = 'N' * (3 - remainder)
                transcriptDic[header] = sequence + addString
            else:
                transcriptDic[header] = sequence
            # Sanity check: report anything still not codon-aligned.
            if len(transcriptDic[header]) % 3 != 0:
                print(len(transcriptDic[header]), header)

os.system('mkdir selection_analysis')
for cluster in clusterDic:
    # Cluster '2' is skipped deliberately; the reason is not recorded here.
    if str(cluster) == '2':
        continue
    os.system('mkdir selection_analysis/cluster_{0}'.format(str(cluster)))
def main():
    """ Introners are for the boys.

    For each GCA assembly in algae.csv: download the genome and GFF
    annotation, call introns, prune gene-duplicate introns, extract the
    intron sequences, cluster them into candidate introner-element
    families with an all-vs-all BLAST plus graph clustering, and write
    families of more than 5 members to Data_<NAME>/IEfamilies.fa with
    every member oriented to the same strand.
    """
    myData = csvReader('algae.csv')
    genomeData = myData.csv()
    for assembly in genomeData:
        PATH = './'
        NAME = assembly['Reference']
        if 'GCA' not in NAME:
            continue
        print('Downloading files for {0} assembly'.format(NAME))
        os.system('mkdir Data_{0}'.format(NAME))
        os.system('rm -r ./Data_{0}/blastOutIntrons.fa'.format(NAME))
        print(assembly['Fasta'])
        os.system('wget {0}'.format(assembly['Fasta']))
        print(assembly['Annotation'])
        os.system('wget {0}'.format(assembly['Annotation']))
        os.system('gunzip {0}*'.format(NAME))
        os.system('cp {0}* ./Data_{0}'.format(NAME))
        os.system('gunzip ./Data_{0}/*'.format(NAME))
        os.system('rm -r {0}*'.format(NAME))
        # NCBI URLs end .../<accession>_<assembly>/...; the second-to-last
        # path component prefixes the _genomic.gff/.fna file names.
        annotationList = assembly['Annotation'].split("/")
        annotationGz = annotationList[-2]
        annotation = annotationGz + '_genomic.gff'
        print(annotation)
        fastaList = assembly['Fasta'].split("/")
        fastaGz = fastaList[-2]
        fasta = fastaGz + '_genomic.fna'
        print(fasta)
        print('Finding introner elements in {0}'.format(NAME))
        mygeneData = GeneDataDic('{0}Data_{1}/{2}'.format(PATH, NAME, annotation))
        cdsData = mygeneData.genedatadic()
        comparison = IntronRecognition(cdsData)
        intronList = comparison.introns()

        myReaderGenome = FastAreader('{0}Data_{1}/{2}'.format(PATH, NAME, fasta))

        # Get rid of gene duplicates.
        # BUG FIX: the original ran GeneDups exactly once, referencing
        # `header` and `sequence` before any genome record had been read
        # (NameError at runtime).  GeneDups is now run per scaffold, which
        # matches how `head`/`sequence` were being used.
        intronSeqs = []
        noDupList = []
        for header, sequence in myReaderGenome.readFasta():
            head = header.split(" ")[0]
            myDups = GeneDups(intronList, head, sequence)
            myDups.flanks()
            noDupList = noDupList + myDups.prune()

        print('Extracting Introns')
        # BUG FIX: the original nested two identical readFasta() loops over
        # the same reader; a single pass over the genome is intended.
        for header, sequence in myReaderGenome.readFasta():
            head = header.split(" ")[0]
            MyIntrons = RetrieveIntrons(head, sequence, noDupList)
            intronSeqs.append(MyIntrons.retrieve())
        finalIntronList = list(filter(None, intronSeqs))
        MyReads = MakeFasta(finalIntronList, PATH, NAME)
        MyReads.fasta()

        print("Performing all-v-all BLAST")
        os.system("./Tools/ncbi-blast-2.7.1+/bin/makeblastdb -dbtype nucl -in {0}Data_{1}/Reads.fa -title introns -out {0}Data_{1}/intronsDB".format(PATH, NAME))
        os.system("./Tools/ncbi-blast-2.7.1+/bin/blastn -db {0}Data_{1}/intronsDB -query {0}Data_{1}/Reads.fa -outfmt 6 -perc_identity 80 -out {0}Data_{1}/all-vs-all.tsv".format(PATH, NAME))
        # Drop self-hits and alignments of 30 bp or less.
        # BUG FIX: the original condition was "$1 != $2 && awk $4 > 30";
        # the stray second 'awk' token broke the length filter.
        os.system("awk '$1 != $2 && $4 > 30' {0}Data_{1}/all-vs-all.tsv > {0}Data_{1}/all-vs-all_deduped.tsv".format(PATH, NAME))

        print('Clustering introns from minimap output')
        Data = Graph('./Data_{0}/all-vs-all_deduped.tsv'.format(NAME), NAME)
        IEfamilies = Data.graph()

        count = 1
        with open('./Data_{0}/IEfamilies.fa'.format(NAME), 'w') as file:
            for family in IEfamilies:
                # Only report families with more than 5 members.
                if len(family) > 5:
                    for header, genomeSeq in myReaderGenome.readFasta():
                        head = header.split(" ")[0]
                        for ie in family:
                            # Member labels look like x_x_<scaffold>_<start>-<stop>.
                            ieLabelList = ie.split('_')
                            scaff = ieLabelList[2]
                            coords = ieLabelList[3].split('-')
                            start = coords[0]
                            stop = coords[1]
                            if head == scaff:
                                sequence = genomeSeq[int(start):int(stop)]
                                if sequence[0] == 'C':
                                    # Intron called on the non-coding strand
                                    # (revcomp of a canonical GT...AG intron
                                    # starts with C); write the reverse
                                    # complement so the whole family shares
                                    # one orientation.
                                    seq = Seq(sequence)
                                    revcomp = seq.reverse_complement()
                                    file.write('>{1}{0}\n'.format(ie, count))
                                    file.write('{0}\n'.format(revcomp))
                                else:
                                    file.write('>{1}{0}\n'.format(ie, count))
                                    file.write('{0}\n'.format(sequence))
                    count += 1

        # (A large commented-out BLAST-verification step lived here; removed
        # as dead code.)
        # BUG FIX: the original command was
        # 'mv blastOutIntrons.fa . ./Data_<NAME>', which also tries to move
        # '.' and errors out.
        os.system('mv blastOutIntrons.fa ./Data_{0}'.format(NAME))
        os.system("rm -r {0}Data_{1}/all-vs-all.tsv".format(PATH, NAME))
        os.system("gzip {0}Data_{1}/all-vs-all_deduped.tsv".format(PATH, NAME))
        os.system("rm -r {0}Data_{1}/intron*".format(PATH, NAME))
        os.system('rm -r ./Data_{0}/{0}*'.format(NAME))
        os.system('rm -r ./Data_{0}/o*'.format(NAME))
        print('-------------------------------wow----wow-----wee----wow-----')
        print('Just took a fat dub')
def main():
    """Cluster DNA sequences into families based on all-vs-all similarity.

    Runs an all-vs-all alignment with minimap2 or BLAST (selected with
    -algorithm), feeds the deduplicated hit table to the Graph
    (Girvan-Newman) clusterer, and writes clusters.fa with each header
    prefixed by its cluster number and all members of a cluster oriented
    onto the same strand.
    """
    parser = argparse.ArgumentParser(
        description=
        'Employs a Girvin-Newman algorithm to cluster DNA sequences into families based on similarity'
    )
    parser.add_argument('-fasta', nargs='?', required=True,
                        help='Path to fasta file containing sequences')
    parser.add_argument('-algorithm', nargs='?', required=True,
                        help='Options are BLAST and minimap2')
    parser.add_argument(
        '-min_aln_proportion', nargs='?', required=False, default=.8,
        help='Minimum alignment length relative to query length (Default = .8)')
    parser.add_argument(
        '-min_cluster_size', nargs='?', required=False, default=2,
        help=
        'Minimum number of sequences required in a particular cluster (Default = 2)')
    parser.add_argument(
        '-min_percentID', nargs='?', required=False, default=80,
        help='Minimum percent identity for all v all BLAST (Default = 80)')
    args = vars(parser.parse_args())

    print('Reading sequences')
    lengthDic = {}
    seqDic = {}
    myReaderReads = FastAreader(args['fasta'])
    for header, seq in myReaderReads.readFasta():
        seqDic[header] = seq
        lengthDic[header] = len(seq)

    if args['algorithm'] == 'minimap2':
        if which('minimap2') is not None:
            print('Performing all vs all alignment with minimap2')
            os.system("minimap2 -X -N 1000 {0} {0} > overlaps.paf".format(
                args['fasta']))
            # Drop self-hits (PAF query name == target name).
            os.system("awk '$1 != $6' overlaps.paf > all-vs-all_deduped.tsv")
        else:
            print("minimap2 doesn't seem to be in your PATH")
            sys.exit()
    elif args['algorithm'] == 'BLAST':
        if which('blastn') is not None:
            print("Performing all-v-all BLAST")
            # BUG FIX: the original bound this value to the name 'min',
            # shadowing the builtin min().
            min_pid = args['min_percentID']
            os.system(
                "makeblastdb -dbtype nucl -in {0} -title cluster_input -out seqDB"
                .format(args['fasta']))
            os.system(
                'blastn -db seqDB -query {0} -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore sstrand" -perc_identity {1} -out all-vs-all.tsv'
                .format(args['fasta'], int(min_pid)))
            # Drop self-hits.
            os.system("awk '$1 != $2' all-vs-all.tsv > all-vs-all_deduped.tsv")
        else:
            print("BLAST doesn't seem to be in your PATH")
            sys.exit()

    print('Clustering sequences...')
    Data = Graph('all-vs-all_deduped.tsv', args['algorithm'],
                 args['min_aln_proportion'], lengthDic,
                 args['min_cluster_size'])
    IEfamilies, strandDic, backup = Data.graph()

    count = 1
    with open('clusters.fa', 'w') as file:
        for family in IEfamilies:
            if len(family) > int(args['min_cluster_size']):
                for header in family:
                    # strandDic may lack some members; fall back to the
                    # backup strand table (original duplicated the whole
                    # write block inside the except handler).
                    try:
                        strand = strandDic[header]
                    except KeyError:
                        strand = backup[header]
                    if strand == '-' or strand == 'minus':
                        # Alignment was on the non-coding strand: write the
                        # reverse complement so all members of the cluster
                        # share one orientation.
                        sequence = Seq(seqDic[header])
                        revcomp = sequence.reverse_complement()
                        file.write('>{1}_{0}\n'.format(header, count))
                        file.write('{0}\n'.format(revcomp))
                    else:
                        file.write('>{1}_{0}\n'.format(header, count))
                        file.write('{0}\n'.format(seqDic[header]))
                count += 1