# Example no. 1
# 0
def identifyScaffChimereBU(species,infile,rep = "."):

    # tentative d amelioration de la routine de mise en evidence des scaff chimeres
    # resultats non probants!!
    # qd matche d une meme position sur un scaffold, voulais garder celui qui a le meilleur score uniquement, mais on observe souvent
    # soit des scores identiques soit des scores + grands pour les genes issus des chromo differents
    # du coup, les resultats sont limites pires!!
    # peut etre vaut il mieux post-traiter les resultats issus de la 1ere routine 

    os.chdir(rep)
    allScaff = fasta.fromFastaToDico(infile)
    outfile = infile.replace(".fasta","_chimere.log")
    of = open(outfile,"w")
    print species
    if species.lower() == "sace":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sace/s288c.pep"
    elif species.lower() == "lakl" or species.lower() == "sakl":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sakl/saklRef.pep"
    else:
        print "Species name not understood. Should be Sace, Sakl or Lakl"
        sys.exit()
        
    for scaff in allScaff.keys():
        if len(allScaff[scaff]) > 1000:
            fileN = "%s.fasta" % scaff
            fasta.fromSeqToFasta(allScaff[scaff],scaff,fileN)
            fileO = fileN.replace("fasta","blastx")
            alignement.run_blastxFmt(fileN,fileO,db,1000)
            if os.path.isfile(fileO):
                lines = open(fileO,"r").read().split("\n")
                chimere = 0
                dicProt = {}
                dicProtScore = {}
                for line in lines:
                    if line != "":
                        el = line.split("\t")
                        if string.atof(el[2]) > 95 and string.atof(el[3]) > 70:
                            if string.atoi(el[6]) in dicProt:
                                prot = dicProt[string.atoi(el[6])]
                                score = dicProtScore[prot]
                                if score < string.atof(el[11]):
                                    del dicProt[string.atoi(el[6])]
                                    dicProt[string.atoi(el[6])] = el[1]
                                    dicProtScore[el[1]] = string.atof(el[11])
                            else:
                                dicProt[string.atoi(el[6])] = el[1]
                                dicProtScore[el[1]] = string.atof(el[11])
                
                lChr = []
                for key in dicProt.keys():
                    if species.lower() == "sace":
                        if dicProt[key][0:2] not in lChr:
                            lChr.append(dicProt[key][0:2])
                    else:
                        if dicProt[key][0:6] not in lChr:
                            lChr.append(dicProt[key][0:6])
                # si on a plus de 1 chromo dans la liste, on a a faire a un scaff chimere
                if len(lChr) > 1:
                    lProt = dicProt.items()
                    lProt.sort()
                    print "%s - %s bp" % (scaff,len(allScaff[scaff]))
                    of.write("%s - %s bp\n" % (scaff,len(allScaff[scaff])))
                    print lProt
                    of.write("%s\n" % lProt)
                    fileO2 = fileO.replace("blastx","blastx-std")
                    alignement.run_blastx(fileN,fileO2,db,1000)
                else:
                    os.remove(fileO)
            os.remove(fileN)
    of.close()
# Example no. 2
# 0
def identifyScaffChimere(species,infile,rep = "."):

    os.chdir(rep)
    allScaff = fasta.fromFastaToDico(infile)
        
    #outfile = infile.replace(".fasta","_chimere.log")
    
    ## RUN 25 Strains ##
    outfile = infile.replace(".scafSeq","_chimere.log")
    ##
    
    of = open(outfile,"w")
    print species
    if species.lower() == "sace":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sace/s288c.pep"
    elif species.lower() == "lakl" or species.lower() == "sakl":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sakl/saklRef.pep"
    else:
        print "Species name not understood. Should be Sace, Sakl or Lakl"
        sys.exit()
        
    for scaff in allScaff.keys():
        if len(allScaff[scaff]) > 1000:
            fileN = "%s.fasta" % scaff
            fasta.fromSeqToFasta(allScaff[scaff],scaff,fileN)
            fileO = fileN.replace("fasta","blastx")
            alignement.run_blastxFmt(fileN,fileO,db,1000)
            if os.path.isfile(fileO):
                lines = open(fileO,"r").read().split("\n")
                chr = ""
                chimere = 0
                dicProt = {}
                lP1 = []
                lP2 = []
                for line in lines:
                    if line != "":
                        el = line.split("\t")
                        #if string.atof(el[2]) > 90 and string.atof(el[3]) > 50:
                        if string.atof(el[2]) > 95 and string.atof(el[3]) > 70:
                            dicProt[el[1]] = string.atoi(el[6])
                            if chr == "":
                                if species.lower() == "sace":
                                    chr = el[1][0:2]
                                else:
                                    chr = el[1][0:6]
                                if string.atoi(el[6]) not in lP1:
                                    lP1.append(string.atoi(el[6]))
                            elif el[1][0:2] != chr and el[1][0:6] != chr:
                                chimere = 1
                                if string.atoi(el[6]) not in lP2:
                                    lP2.append(string.atoi(el[6]))
                                else:
                                    if string.atoi(el[6]) not in lP1:
                                        lP1.append(string.atoi(el[6]))
                           
                              
                if chimere == 1:
                    # ne tient pas compte des scaffs pour lesquels chimerisation due a la presence d un paralogue
                    chimere = 0
                    if len(lP1) < len(lP2):
                        for p1 in lP1:
                            if p1 not in lP2:
                                chimere = 1
                        else:
                            for p2 in lP2:
                                if p2 not in lP1:
                                    chimere = 1

                if chimere == 1:
                    lProt = dicProt.items()
                    lProt.sort(cmpval)
                    pos = 0
                    lchrom = []
                    # variable toR indique s il est possible de retirer la derniere valeur de la liste ou non
                    #print "%s - %s bp" % (scaff,len(allScaff[scaff]))
                    #print lProt
                    toR = 0
                    for prot in lProt:
                        #print prot
                        if prot[1] > pos:
                            if species.lower() == "sace":
                                if prot[0][0:2] not in lchrom:
                                    lchrom.append(prot[0][0:2])
                                    toR = 1
                            else:
                                if prot[0][0:6] not in lchrom:
                                    lchrom.append(prot[0][0:6])
                                    toR = 1
                            pos = prot[1]
                            #print lchrom
                        # les donnees etant triees
                        # si la valeur n est pas superieure, c est qu elle est egale
                        else:
                            if toR == 1:
                                # je retire le dernier chromo mis dans la liste
                                # ma liste ne contient aucune info par rapport au paralogue qui sont situees a la meme position
                                lchrom.pop()
                                # il faut que j ote la possibilite de remover pour le prochain tour, au cas ou on a + de 2 paralogues identifies a la meme position
                                toR = 0
                            
                    list(set(lchrom))
                    
                    if species.lower() == "sace":
                        firstChrom = lProt[0][0][0:2]
                        lastChrom = lProt[-1][0][0:2]
                    else:
                        firstChrom = lProt[0][0][0:6]
                        lastChrom = lProt[-1][0][0:6]

                    #if len(lchrom) > 1 and firstChrom != lastChrom:                        
                    if len(lchrom) > 1:
                        print "%s - %s bp" % (scaff,len(allScaff[scaff]))
                        of.write("%s - %s bp\n" % (scaff,len(allScaff[scaff])))
                    
                        print lProt
                        of.write("%s\n" % lProt)
                        fileO2 = fileO.replace("blastx","blastx-std")
                        alignement.run_blastx(fileN,fileO2,db,1000)
                    else:
                        os.remove(fileO)
                        os.remove(fileN)
                else:
                    os.remove(fileO)
                    os.remove(fileN)
    of.close()
def stats4StrainsALarrache(suff,outfile = ""):

    if outfile != "":
        of = open(outfile, 'a') 

    fileP = "AliProt/%s-4strains.fasta" % suff
    
    aliP = fasta.fromFastaToDico(fileP)
    lSeqILP = list()
    for seqNP in aliP.keys():
        if "IL01" in seqNP:
            lSeqILP.append(seqNP)
    
    seqRefP = aliP[suff]
    nbDiffP = 0
    lgILP = 0
    for seqILnP in lSeqILP:
        print "%s\t%s" % (suff,seqILnP)
        seqILP = aliP[seqILnP]
        lgILP += len(string.replace(seqILP,"-",""))
        for i in range(0, len(seqILP)):
            if seqILP[i] != seqRefP[i] and seqILP[i] != "-":
                aVerif = 0
                resWE = ""
                resNC = ""
                resYJM = ""
                # opt1 symbolise le fait que IL = WE ou IL = NC
                opt1 = 0
                # opt2 symbolise la fait que YJM = FY
                opt2 = 0
                # opt3 symbolise notre profile recherche, IL = YJM et diff des 3 autres
                opt3 = 0

                for (seqNP,sequenceP) in aliP.items():
                    if "IL01" in seqNP:
                        if sequenceP[i] == seqRefP[i]:
                            aVerif = 1
                            
                    else:
                        #aVerif = 0

                        if "YJM981" in seqNP:
                            if sequenceP[i] != "-":
                                if resYJM == "":
                                    resYJM = sequenceP[i]
                                elif resYJM != sequenceP[i]:
                                    resYJM = "ambigue"
                        if "WE372" in seqNP:
                            if sequenceP[i] != "-":
                                if resWE == "":
                                    resWE = sequenceP[i]
                                elif resWE != sequenceP[i]:
                                    resWE = "ambigue"
                        if "NC02" in seqNP:
                            if sequenceP[i] != "-":
                                if resNC == "":
                                    resNC = sequenceP[i]
                                elif resNC != sequenceP[i]:
                                    resNC = "ambigue"
                #print "%s, %s, %s, %s, %s" % (seqILP[i],resWE,resNC,resYJM,seqRefP[i])
                if aVerif == 1:
                    if outfile == "":
                        print "%s\t%s\tposition a verifier pour chevauchement IL01" % (suff,i+1)
                    else:
                        of.write("%s\t%s\tposition a verifier pour chevauchement IL01\n" % (suff,i+1))
                else:    
            
                    if resWE != "-":
                        if seqILP[i] == resWE:
                            opt1 = 1
                    if resNC != "-":
                        if seqILP[i] == resNC:
                            opt1 = 1
                    if resYJM != "-":
                        if seqRefP[i] == resYJM:
                            opt2 = 1
                        elif seqILP[i] == resYJM:
                            if opt1 == 0:
                                opt3 = 1
                        
                    nbDiffP += 1
    
                    if outfile == "":
                        print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (suff,i+1,seqILP[i],seqRefP[i],resWE,resNC,resYJM,opt1,opt2,opt3)
                    else:
                        of.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (suff,i+1,seqILP[i],seqRefP[i],resWE,resNC,resYJM,opt1,opt2,opt3))
    if outfile != "":
        of.close()
def statsALarrache(suff,outfile = ""):
    
    if outfile != "":
        of = open(outfile, 'a') 
        
    fileP = "AliProt/%s-11strains.fasta" % suff
    fileG = "AliGene/%s-G-11strains.fasta" % suff
    #fileP = "AliProt/%s-4strains.fasta" % suff
    #fileG = "AliGene/%s-G-4strains.fasta" % suff
    
    aliP = fasta.fromFastaToDico(fileP)
    lSeqILP = list()
    for seqNP in aliP.keys():
        if "IL01" in seqNP:
            lSeqILP.append(seqNP)
    if outfile == "":
        print "\nTraite %s:\n----Alignement des proteines----\non a %s sequence de IL01" % (suff,len(lSeqILP))
    else:
        of.write("\nTraite %s:\n----Alignement des proteines----\non a %s sequence de IL01\n" % (suff,len(lSeqILP)))
    # recuperation de l alignement de la reference
    seqRefP = aliP[suff]
    nbDiffP = 0
    nbDiffPIsolee = 0
    lgILP = 0
    for seqILnP in lSeqILP:
        if outfile == "":
            print "+++traite %s+++" % seqILnP
        else:
            of.write("+++traite %s+++\n" % seqILnP)
        seqILP = aliP[seqILnP]
        lgILP += len(string.replace(seqILP,"-",""))
        for i in range(0, len(seqILP)):
            if seqILP[i] != seqRefP[i] and seqILP[i] != "-":
                # je peux recuperer le nombre de positions differentes, leur localisation
                # le nombre de position isolee
                lSeqIdP = list()
                for (seqNP,sequenceP) in aliP.items():
                    if "IL01" in seqNP:
                        if sequenceP[i] == seqRefP[i]:
                            aVerif = 1
                            break
                    else:
                        aVerif = 0
                        if sequenceP[i] == seqILP[i]:
                            lSeqIdP.append(seqNP)
                
                if aVerif == 1:
                    if outfile == "":
                        print "%s : position a verifier pour chevauchement IL01" % (i+1)
                    else:
                        of.write("%s : position a verifier pour chevauchement IL01\n" % (i+1))
                else:
                    nbDiffP += 1
                    if len(lSeqIdP) == 0:
                        nbDiffPIsolee += 1
                    if outfile == "":
                        print "%s\t%s\t%s\n" % (i+1,len(lSeqIdP),lSeqIdP)
                    else:
                        of.write("%s\t%s\t%s\n" % (i+1,len(lSeqIdP),lSeqIdP))

    #print "%s\t%s\t%s\t%s\t%s" % (suff,len(string.replace(seqRefP,"-","")),lgILP,nbDiffP,nbDiffPIsolee)
    
    aliG = fasta.fromFastaToDico(fileG)
    lSeqILG = list()
    for seqNG in aliG.keys():
        if "IL01" in seqNG:
            lSeqILG.append(seqNG)
    if outfile == "":
        print "----Alignement des genes----\non a ",len(lSeqILG)," sequence de IL01"
    else:
        of.write("----Alignement des genes----\non a %s sequence de IL01\n" % len(lSeqILG))
    
    # recuperation de l alignement de la reference
    seqRefG = aliG[suff] 
    nbDiffG = 0
    nbDiffGIsolee = 0 
    lgILG = 0
    for seqILnG in lSeqILG:
        if outfile == "":
            print "+++traite %s+++" % seqILnG
        else:
            of.write("+++traite %s+++\n" % seqILnG)
        seqILG = aliG[seqILnG]
        lgILG += len(string.replace(seqILG,"-",""))

        for i in range(0, len(seqILG)):
            if seqILG[i].upper() != seqRefG[i].upper() and seqILG[i] != "-":
                lSeqIdG = list()
                for (seqNG,sequenceG) in aliG.items():
                    if "IL01" in seqNG:
                        if sequenceG[i].upper() == seqRefG[i].upper():
                            aVerif = 1
                            break
                    else:
                        aVerif = 0
                        if sequenceG[i].upper() == seqILG[i].upper():
                            lSeqIdG.append(seqNG)
                
                if aVerif == 1:
                    if outfile == "":
                        print "%s : position a verifier pour chevauchement IL01" % (i+1)
                    else:
                        of.write("%s : position a verifier pour chevauchement IL01\n" % (i+1))
                else:
                    nbDiffG += 1
                    if len(lSeqIdG) == 0:
                        nbDiffGIsolee += 1
                    if outfile == "":
                        print "%s\t%s\t%s" % (i+1,len(lSeqIdG),lSeqIdG)
                    else:
                        of.write("%s\t%s\t%s\n" % (i+1,len(lSeqIdG),lSeqIdG))
    if outfile != "":
        of.close()
    #print "%s\t%s\t%s\t%s\t%s" % (suff,len(string.replace(seqRefG,"-","")),lgILG,nbDiffG,nbDiffGIsolee)
    print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (suff,len(string.replace(seqRefP,"-","")),len(lSeqILP),lgILP,nbDiffP,nbDiffPIsolee,len(string.replace(seqRefG,"-","")),len(lSeqILG),lgILG,nbDiffG,nbDiffGIsolee)
# Example no. 5
# 0
def longScaff(infile):
    allSeq = fasta.fromFastaToDico(infile)
    for scaff in allSeq.keys():
        print "%s\t%s" % (scaff,len(allSeq[scaff]))