Esempio n. 1
0
    def retrieveIEs(self):
        """
        Load introner-element records from ``self.intronerFile`` into ``self.ieList``.

        Each FASTA header is split on ``_``; the first three fields are taken
        as family, scaffold and a ``start-stop`` coordinate pair. One dict with
        the keys ``fam``, ``scaff``, ``start``, ``stop`` and ``seq`` is appended
        per record.
        """
        reader = FastAreader(self.intronerFile)
        for header, seq in reader.readFasta():
            fields = header.split('_')
            family, scaffold, span = fields[0], fields[1], fields[2]
            bounds = span.split("-")
            # Coordinates are stored as integers; the other fields stay as text.
            record = dict(
                fam=family,
                scaff=scaffold,
                start=int(bounds[0]),
                stop=int(bounds[1]),
                seq=seq,
            )
            self.ieList.append(record)
Esempio n. 2
0
    def retrieveAll(self):
        """
        Pair every stored introner element with the introns that contain it.

        Reads the intron FASTA at ``self.exintFile`` and, for each intron,
        checks whether the sequence of any record in ``self.ieList`` (or its
        reverse complement) occurs in the upper-cased intron.

        Every hit appends a NEW dict to ``self.finalList``: a copy of the
        element record whose ``seq`` is the full intron (original case) and
        whose ``header`` is the intron header. The records in ``self.ieList``
        are left untouched, so later introns are still compared against the
        element sequence and earlier results are not overwritten by later
        matches.

        Returns:
            list: ``self.finalList`` after all matches have been appended.
        """
        # Reverse complements depend only on the element, so build them once
        # instead of once per (intron, element) pair.
        candidates = []
        for ieDic in self.ieList:
            ieSeq = ieDic['seq']
            revComp = str(Seq(ieSeq).reverse_complement())
            candidates.append((ieDic, ieSeq, revComp))

        myReaderAll = FastAreader(self.exintFile)
        for rawHeader, seq in myReaderAll.readFasta():
            # Keep the historical ';' handling: only the header text before the
            # first ';' and the sequence text after the last ';' are used.
            header = rawHeader.split(';')[0]
            intron = seq.split(';')[-1]
            intronUpper = intron.upper()

            for ieDic, ieSeq, revComp in candidates:
                # One record per (intron, element) pair, whichever strand matched.
                if ieSeq in intronUpper or revComp in intronUpper:
                    self.finalList.append(
                        dict(ieDic, seq=intron, header=header))

        return self.finalList
Esempio n. 3
0
    def getProteins(self, fam):
        """
        Append the first FASTA record found for each gene locus to ``ieProteins2.fa``.

        Scans ``Data_<NAME>/<fasta>`` and, for every ``head -> locus`` pair in
        ``self.geneDic`` whose locus text occurs in a record header, writes that
        record once (loci already in ``self.proteinList`` are skipped). The
        output header is ``>fam_NAME_head_locus``.

        Args:
            fam: family label placed at the front of each written header.

        Returns:
            ``self.proteinDic``, which this method does not modify.
        """
        print("Get Proteins")

        sourcePath = 'Data_{0}/{1}'.format(self.NAME,self.fasta)
        targetPath = 'Data_{0}/ieProteins2.fa'.format(self.NAME)
        for header, sequence in FastAreader(sourcePath).readFasta():
            for head, locus in self.geneDic.items():
                # Skip loci that do not match this record or were already written.
                if locus not in header or locus in self.proteinList:
                    continue
                self.proteinList.append(locus)
                with open(targetPath, 'a') as proteinFile:
                    proteinFile.write('>{1}_{2}_{3}_{0}\n'.format(locus, fam, self.NAME, head))
                    proteinFile.write('{0}\n'.format(sequence))
        return self.proteinDic
Esempio n. 4
0
def cluster():
    """
    Filter all-vs-all hits and run the external clustering command on the survivors.

    A line of ``all_hits.out`` is copied to ``all_hits.out.filtered`` when
    column 3 exceeds 90 and column 4 exceeds 80% of the length of both
    sequences named in columns 1 and 2 (lengths come from each species'
    ``reconstructed_CDS.fa``). The ID pairs are then extracted with awk and
    passed to the ``silixx`` command.
    """
    lengths = {}
    for species in ['P_eremicus', 'P_leucopus', 'P_polionotus', 'P_manuclatus']:
        reader = FastAreader('{0}/reconstructed_CDS.fa'.format(species))
        for header, sequence in reader.readFasta():
            lengths[header] = float(len(sequence))

    with open('all_hits.out.filtered', 'w') as kept, \
            open('all_hits.out', 'r') as hits:
        for line in hits:
            cols = line.strip().split('\t')
            if not float(cols[2]) > 90.0:
                continue
            alnLength = float(cols[3])
            # The second length is only looked up when the first test passes.
            coversFirst = alnLength > lengths[cols[0].strip()] * .8
            if coversFirst and alnLength > lengths[cols[1].strip()] * .8:
                kept.write(line)

    os.system(
        "awk '{print $1\" \"$2}' all_hits.out.filtered > all_hits_IDs.out")
    # 181276 is the number of sequences upper bound
    os.system('silixx 181276 all_hits_IDs.out > clusters')
Esempio n. 5
0
    def ieReader(self):
        """
        Group the records of ``Data_<NAME>/finalIEs.fa`` by family in ``self.famDic``.

        The family label is the first character of the header, or the first two
        characters when the second one is a digit. The rest of the header is the
        key under which the sequence is stored:
        ``self.famDic[fam][head] = sequence``.
        """
        print('reading IE file ...')

        reader = FastAreader('Data_{0}/finalIEs.fa'.format(self.NAME))
        for header, sequence in reader.readFasta():
            # Two-character family label only when the second character is a digit.
            cut = 2 if header[1].isdigit() else 1
            fam, head = header[:cut], header[cut:]
            self.famDic.setdefault(fam, {})[head] = sequence
Esempio n. 6
0
    def readIeFile(self):
        """
        Collect the header remainders of ``self.ieFile`` per family.

        The family label is the first character of each header, or the first
        two characters when the second one is a digit. What follows the label
        is appended to ``self.ieHeadDic[fam]``. A progress line is printed for
        every record.

        Returns:
            ``self.ieHeadDic`` mapping family label to a list of header remainders.
        """
        for header, seq in FastAreader(self.ieFile).readFasta():
            labelLength = 1 if not header[1].isdigit() else 2
            fam = header[:labelLength]
            head = header[labelLength:]

            if fam in self.ieHeadDic:
                print('Append')
                self.ieHeadDic[fam].append(head)
            else:
                print("NEW FAM")
                self.ieHeadDic[fam] = [head]
        return self.ieHeadDic
Esempio n. 7
0
def main():
    """
    Prune nested introner elements (IEs) for each assembly listed in ``eukaryotes.csv``.

    For every row whose ``Reference`` accession contains ``GCA``:

    1. the genome FASTA named in the ``Fasta`` column is downloaded and
       unpacked into ``Data_<Reference>/``;
    2. the IE records of that directory's ``IEfamilies*`` FASTA are parsed
       (header fields split on ``_``: family digit, scaffold, ``start-stop``);
    3. records lying strictly inside another record on the same scaffold are
       dropped (each dropped record is printed);
    4. up to 100 bases on either side of every remaining record are stored
       under the ``up`` and ``down`` keys;
    5. the records are passed to ``geneDups`` and ``removeDups``.

    Returns:
        None.
    """
    myData = csvReader('eukaryotes.csv')
    genomeData = myData.csv()

    for assembly in genomeData:
        NAME = assembly['Reference']

        # Test the accession string itself. Testing the row container (as the
        # previous code did) looks for 'GCA' among the row's keys, which skips
        # every assembly.
        if 'GCA' not in NAME:
            continue

        print('Removing gene duplicsates from {0}'.format(NAME))

        os.system('wget {0}'.format(assembly['Fasta']))
        os.system('gunzip {0}*'.format(NAME))
        os.system('cp {0}* ./Data_{0}'.format(NAME))
        os.system('gunzip ./Data_{0}/*'.format(NAME))
        os.system('rm -r {0}*'.format(NAME))
        fastaList = assembly['Fasta'].split("/")
        fasta = fastaList[-2] + '_genomic.fna'
        print(fasta)

        ieList = []
        famDic = {}  # family (as text) -> number of records parsed for it

        # NOTE(review): the '*' is passed to FastAreader verbatim; confirm that
        # FastAreader expands glob patterns.
        myReaderIE = FastAreader('Data_{0}/IEfamilies*'.format(NAME))
        for header, seq in myReaderIE.readFasta():
            headList = header.split('_')
            # NOTE(review): only the first character is used as the family, so
            # multi-digit family numbers collapse onto their leading digit.
            fam = int(headList[0][0])
            scaff = headList[2]
            coordList = headList[3].split("-")
            ieDic = {
                'fam': fam,
                'scaff': scaff,
                'start': int(coordList[0]),
                'stop': int(coordList[1]),
                'seq': seq,
            }
            famDic[str(fam)] = famDic.get(str(fam), 0) + 1
            ieList.append(ieDic)

        # Drop every element that lies strictly inside another element on the
        # same scaffold. A filtered list is built instead of removing entries
        # from the list being scanned.
        keptList = []
        for ieDic1 in ieList:
            nested = any(
                ieDic2['scaff'] == ieDic1['scaff']
                and ieDic2['start'] < ieDic1['start']
                and ieDic1['stop'] < ieDic2['stop']
                for ieDic2 in ieList)
            if nested:
                print(ieDic1)
            else:
                keptList.append(ieDic1)
        ieList = keptList

        # Attach the flanking sequence to every element of each scaffold.
        myReaderGenome = FastAreader('Data_{0}/{1}'.format(NAME, fasta))
        for header, sequence in myReaderGenome.readFasta():
            head = header.split(" ")[0]
            for ieDic in ieList:
                if ieDic['scaff'] == head:
                    start = ieDic['start']
                    stop = ieDic['stop']
                    # Clamp at 0: a negative slice start would count from the
                    # end of the scaffold instead of stopping at its beginning.
                    ieDic['up'] = sequence[max(start - 100, 0):start]
                    ieDic['down'] = sequence[stop:stop + 100]

        myFolder = geneDups(ieList, NAME)
        myFolder.makeFile()
        # NOTE(review): bare attribute access, not a call. If avaBlast is a
        # method it is never executed here; confirm against geneDups.
        myFolder.avaBlast

        myRemoveDups = removeDups(ieList, NAME, famDic)
        myRemoveDups.remove()
from sequenceAnalyzer import FastAreader
import argparse 

# Command-line interface: both options are mandatory.
parser = argparse.ArgumentParser(description='Summarize the distribution of ambiguity and missing data in a SARS-CoV-2 phylogeny') 
parser.add_argument('-msa', nargs='?', required=True,help='Path to msa file for SARS-CoV-2 phylogeny')
# NOTE(review): 'min_MAF' is parsed here but not read anywhere in the lines
# visible in this chunk.
parser.add_argument('-min_MAF', nargs='?', required=True,
                    help='Minimum minor allele frequency for reported sites') 
args = vars(parser.parse_args())  # plain dict keyed by option name

# Per-sequence containers keyed by FASTA header; filled in the MSA loop below.
# ambDic / NDic start at 0 per header -- presumably counters of ambiguous bases
# and N's, to be confirmed against the code that updates them.
ambDic = {}
NDic = {}
seqDic = {}
# Read the reference FASTA. refHead/refSeq are overwritten on every record, so
# with a multi-record file the last record wins; with an empty file they stay
# undefined.
myReaderRef = FastAreader('NC_045512v2.fa')
for header, sequence in myReaderRef.readFasta():
    refHead = header
    refSeq = sequence.upper()
ambDicPerSite = {}
NDicPerSite = {}
myReaderFasta = FastAreader(args['msa'])
for header, sequence in myReaderFasta.readFasta():
    # If this alignment row equals the reference once gaps are stripped and the
    # case is normalised, refSeq is replaced by the row as it appears in the
    # alignment (gaps kept, case NOT normalised).
    if sequence.replace('-','').upper() == refSeq:
        refSeq = sequence
    seqDic[header] = sequence.upper()
    NDic[header] = 0
    ambDic[header] = 0 
    ambDicPerSite[header] = {}
    NDicPerSite[header] = {}

NambDicPerSite = {}
altDic = {}
# Keys are the four unambiguous nucleotide letters; the values are empty strings.
charDic = {'A':'','G':'','C':'','T':''}
import os
from sequenceAnalyzer import FastAreader

# clusterDic: cluster id (first column) -> list of member ids (remaining columns).
# transcriptDic: member id -> CDS sequence; starts as '' and is filled below.
clusterDic = {}
transcriptDic = {}
with open('Grouped_clusters.tsv', 'r') as f:
    for line in f:
        clusterId, *members = line.strip().split('\t')
        clusterDic[clusterId] = list(members)
        transcriptDic.update(dict.fromkeys(members, ''))

for species in ['P_eremicus', 'P_leucopus', 'P_polionotus', 'P_manuclatus']:
    reader = FastAreader('{0}/reconstructed_CDS.fa'.format(species))
    for header, sequence in reader.readFasta():
        # Only sequences that belong to some cluster are kept.
        if header not in transcriptDic:
            continue
        overhang = len(sequence) % 3
        # Pad with N so the stored length is a multiple of three.
        padded = sequence + 'N' * (3 - overhang) if overhang else sequence
        transcriptDic[header] = padded
        # Sanity check: report anything that is still not a whole number of codons.
        if len(padded) % 3 != 0:
            print(len(padded), header)
# Create the output root through the shell; mkdir's exit status is not checked.
os.system('mkdir selection_analysis')
for cluster in clusterDic:
    # The cluster whose id is '2' is skipped; the reason is not visible in
    # this chunk -- NOTE(review): confirm this exclusion is still wanted.
    if str(cluster) == '2':
        continue
    # One sub-directory per remaining cluster id.
    os.system('mkdir selection_analysis/cluster_{0}'.format(str(cluster)))
Esempio n. 10
0
def main():
    """
    Find introner-element (IE) families in every assembly listed in ``algae.csv``.

    For each row whose ``Reference`` accession contains ``GCA``:

    1. download and unpack the genome FASTA and GFF annotation into
       ``Data_<Reference>/``;
    2. derive intron coordinates from the annotation
       (``GeneDataDic`` -> ``IntronRecognition``);
    3. prune introns belonging to duplicated genes, scaffold by scaffold
       (``GeneDups``);
    4. extract the remaining intron sequences and write them with
       ``MakeFasta``;
    5. run an all-vs-all BLAST of the introns, keeping non-self hits whose
       column 4 (alignment length in BLAST tabular output) exceeds 30;
    6. cluster the hits with ``Graph`` and write every family with more than
       five members to ``Data_<Reference>/IEfamilies.fa``;
    7. remove intermediate files.

    Returns:
        None.
    """
    myData = csvReader('algae.csv')
    genomeData = myData.csv()

    for assembly in genomeData:
        PATH = './'
        NAME = assembly['Reference']
        if 'GCA' not in NAME:
            continue

        print('Downloading files for {0} assembly'.format(NAME))

        os.system('mkdir Data_{0}'.format(NAME))
        os.system('rm -r ./Data_{0}/blastOutIntrons.fa'.format(NAME))
        print(assembly['Fasta'])
        os.system('wget {0}'.format(assembly['Fasta']))
        print(assembly['Annotation'])
        os.system('wget {0}'.format(assembly['Annotation']))
        os.system('gunzip {0}*'.format(NAME))
        os.system('cp {0}* ./Data_{0}'.format(NAME))
        os.system('gunzip ./Data_{0}/*'.format(NAME))
        os.system('rm -r {0}*'.format(NAME))

        # File names are derived from the second-to-last URL path component.
        annotationList = assembly['Annotation'].split("/")
        annotation = annotationList[-2] + '_genomic.gff'
        print(annotation)

        fastaList = assembly['Fasta'].split("/")
        fasta = fastaList[-2] + '_genomic.fna'
        print(fasta)

        print('Finding introner elements in {0}'.format(NAME))

        mygeneData = GeneDataDic('{0}Data_{1}/{2}'.format(PATH, NAME, annotation))
        cdsData = mygeneData.genedatadic()

        comparison = IntronRecognition(cdsData)
        intronList = comparison.introns()

        myReaderGenome = FastAreader('{0}Data_{1}/{2}'.format(PATH, NAME, fasta))

        # Get rid of gene duplicates, one scaffold at a time.
        noDupList = []
        for header, sequence in myReaderGenome.readFasta():
            head = header.split(" ")[0]
            myDups = GeneDups(intronList, head, sequence)
            myDups.flanks()
            noDupList = noDupList + myDups.prune()

        print('Extracting Introns')
        intronSeqs = []
        for header, sequence in myReaderGenome.readFasta():
            head = header.split(" ")[0]
            MyIntrons = RetrieveIntrons(head, sequence, noDupList)
            intronSeqs.append(MyIntrons.retrieve())
        # Scaffolds that yielded nothing contribute falsy entries; drop them.
        finalIntronList = list(filter(None, intronSeqs))
        MyReads = MakeFasta(finalIntronList, PATH, NAME)
        MyReads.fasta()

        # NOTE: the commented-out minimap2 alternative and BLAST refinement
        # stage of earlier revisions are not reproduced here.
        print("Performing all-v-all BLAST")

        os.system("./Tools/ncbi-blast-2.7.1+/bin/makeblastdb -dbtype nucl -in {0}Data_{1}/Reads.fa -title introns -out {0}Data_{1}/intronsDB".format(PATH, NAME))
        os.system("./Tools/ncbi-blast-2.7.1+/bin/blastn -db {0}Data_{1}/intronsDB -query {0}Data_{1}/Reads.fa -outfmt 6 -perc_identity 80 -out {0}Data_{1}/all-vs-all.tsv".format(PATH,NAME))
        # Keep non-self hits longer than 30. The previous filter contained a
        # stray `awk` token, which made awk concatenate an empty variable with
        # $4 and compare the result as a string instead of as a number.
        os.system("awk '$1 != $2 && $4 > 30' {0}Data_{1}/all-vs-all.tsv > {0}Data_{1}/all-vs-all_deduped.tsv".format(PATH,NAME))

        print('Clustering introns from minimap output')
        Data = Graph('./Data_{0}/all-vs-all_deduped.tsv'.format(NAME), NAME)
        IEfamilies = Data.graph()
        count = 1  # running family number, prefixed to every written header
        with open('./Data_{0}/IEfamilies.fa'.format(NAME), 'w') as famFile:
            for family in IEfamilies:
                if len(family) > 5:
                    # Labels are '_'-separated: field 2 is the scaffold and
                    # field 3 the 'start-stop' pair. Parse them once per family.
                    members = []
                    for ie in family:
                        ieLabelList = ie.split('_')
                        coords = ieLabelList[3].split('-')
                        members.append(
                            (ie, ieLabelList[2], int(coords[0]), int(coords[1])))

                    for header, genomeSeq in myReaderGenome.readFasta():
                        head = header.split(" ")[0]
                        for ie, scaff, start, stop in members:
                            if head != scaff:
                                continue
                            sequence = genomeSeq[start:stop]
                            # A leading 'C' is taken to mean the intron lies on
                            # the other strand; reverse-complement it so that
                            # all introns share one orientation. Slicing with
                            # [:1] keeps an empty sequence from raising.
                            if sequence[:1] == 'C':
                                sequence = Seq(sequence).reverse_complement()
                            famFile.write('>{1}{0}\n'.format(ie, count))
                            famFile.write('{0}\n'.format(sequence))
                    count += 1

        print('Writing final IE fasta file')

        # NOTE(review): nothing in this function creates blastOutIntrons.fa,
        # and the extra '.' operand makes mv try to move the current directory
        # as well; confirm whether this command is still needed.
        os.system('mv blastOutIntrons.fa . ./Data_{0}'.format(NAME))
        os.system("rm -r {0}Data_{1}/all-vs-all.tsv".format(PATH,NAME))
        os.system("gzip {0}Data_{1}/all-vs-all_deduped.tsv".format(PATH,NAME))
        os.system("rm -r {0}Data_{1}/intron*".format(PATH,NAME))
        os.system('rm -r ./Data_{0}/{0}*'.format(NAME))
        os.system('rm -r ./Data_{0}/o*'.format(NAME))

        print('-------------------------------wow----wow-----wee----wow-----')
        print('Just took a fat dub')
Esempio n. 11
0
def main():
    """
    Cluster the sequences of a FASTA file into families and write ``clusters.fa``.

    Command-line options:
        -fasta               path to the input FASTA (required)
        -algorithm           ``BLAST`` or ``minimap2`` (required)
        -min_aln_proportion  passed through to ``Graph`` (default .8)
        -min_cluster_size    passed to ``Graph`` and used to filter the
                             families that are written (default 2)
        -min_percentID       ``-perc_identity`` for blastn (default 80)

    The chosen aligner produces ``all-vs-all_deduped.tsv`` (self hits
    removed), ``Graph`` groups the hits into families, and each family larger
    than ``min_cluster_size`` is written to ``clusters.fa`` with headers of the
    form ``>familyNumber_originalHeader``. Sequences whose strand is ``-`` or
    ``minus`` are reverse-complemented so all members share one orientation.

    Exits through ``sys.exit`` when the requested aligner is not on PATH or
    when ``-algorithm`` is not one of the two supported values.
    """
    parser = argparse.ArgumentParser(
        description=
        'Employs a Girvin-Newman algorithm to cluster DNA sequences into families based on similarity'
    )
    parser.add_argument('-fasta',
                        nargs='?',
                        required=True,
                        help='Path to fasta file containing sequences')
    parser.add_argument('-algorithm',
                        nargs='?',
                        required=True,
                        help='Options are BLAST and minimap2')
    parser.add_argument(
        '-min_aln_proportion',
        nargs='?',
        required=False,
        default=.8,
        help='Minimum alignment length relative to query length  (Default = .8)'
    )
    parser.add_argument(
        '-min_cluster_size',
        nargs='?',
        required=False,
        default=2,
        help=
        'Minimum number of sequences required in a particular cluster  (Default = 2)'
    )
    parser.add_argument(
        '-min_percentID',
        nargs='?',
        required=False,
        default=80,
        help='Minimum percent identity for all v all BLAST   (Default = 80)')
    args = vars(parser.parse_args())

    print('Reading sequences')
    lengthDic = {}
    seqDic = {}
    myReaderReads = FastAreader(args['fasta'])
    for header, seq in myReaderReads.readFasta():
        seqDic[header] = seq
        lengthDic[header] = len(seq)

    if args['algorithm'] == 'minimap2':
        if which('minimap2') is not None:

            print('Performing all vs all alignment with minimap2')
            os.system("minimap2 -X -N 1000 {0} {0} > overlaps.paf".format(
                args['fasta']))
            # Column 6 of the PAF output is the target name; drop self hits.
            os.system("awk '$1 != $6' overlaps.paf > all-vs-all_deduped.tsv")

        else:
            print("minimap2 doesn't seem to be in your PATH")
            sys.exit()

    elif args['algorithm'] == 'BLAST':
        if which('blastn') is not None:

            print("Performing all-v-all BLAST")
            # Named so the builtin min() is not shadowed.
            minPercentId = args['min_percentID']
            os.system(
                "makeblastdb -dbtype nucl -in {0} -title cluster_input -out seqDB"
                .format(args['fasta']))
            os.system(
                'blastn -db seqDB -query {0} -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore sstrand" -perc_identity {1} -out all-vs-all.tsv'
                .format(args['fasta'], int(minPercentId)))
            os.system("awk '$1 != $2' all-vs-all.tsv > all-vs-all_deduped.tsv")
        else:
            print("BLAST doesn't seem to be in your PATH")
            sys.exit()

    else:
        # Without this branch an unrecognised value skipped the alignment and
        # went on to cluster a hits file that this run never produced.
        sys.exit("Unknown -algorithm '{0}': options are BLAST and minimap2".format(
            args['algorithm']))

    print('Clustering sequences...')
    Data = Graph('all-vs-all_deduped.tsv', args['algorithm'],
                 args['min_aln_proportion'], lengthDic,
                 args['min_cluster_size'])

    IEfamilies, strandDic, backup = Data.graph()
    count = 1  # running family number, prefixed to every written header

    with open('clusters.fa', 'w') as outFile:

        for family in IEfamilies:
            # NOTE(review): the comparison is strict, so a family with exactly
            # min_cluster_size members is not written although the help text
            # calls this a minimum; confirm which is intended.
            if len(family) > int(args['min_cluster_size']):
                for header in family:
                    # Only the strand lookup may fall back to the backup
                    # table; everything else runs outside the try.
                    try:
                        strand = strandDic[header]
                    except KeyError:
                        strand = backup[header]

                    sequence = seqDic[header]
                    if strand == '-' or strand == 'minus':
                        # Alignment was on the opposite strand: reverse
                        # complement so all members share one orientation.
                        sequence = Seq(sequence).reverse_complement()

                    outFile.write('>{1}_{0}\n'.format(header, count))
                    outFile.write('{0}\n'.format(sequence))
                count += 1