def identifyScaffChimereBU(species,infile,rep = "."):
    """Log scaffolds whose retained blastx hits map to more than one chromosome.

    Experimental variant of identifyScaffChimere.  Original author's notes
    (translated from French): attempt to improve the chimeric-scaffold
    detection routine -- results not convincing.  When several hits match the
    same position on a scaffold the idea was to keep only the best-scoring
    one, but scores are often identical, or higher for genes coming from a
    different chromosome, so the results are arguably worse.  It may be
    better to post-process the output of the first routine instead.

    For each scaffold longer than 1000 bp the sequence is written to
    "<scaffold>.fasta", blasted against the species protein database and the
    hits passing the thresholds are reduced to one protein per start
    position.  Scaffolds with hits on several chromosomes are printed and
    appended to the log and a second, standard blastx report is produced;
    otherwise the temporary fasta/blastx files are removed.

    Args:
        species: "Sace", "Sakl" or "Lakl" (case-insensitive); selects the
            blast database and the length of the chromosome prefix taken
            from protein names (2 characters for Sace, 6 otherwise).
        infile: multi-fasta file of scaffolds; the log is written next to it
            with ".fasta" replaced by "_chimere.log".
        rep: working directory; the process chdir()s into it.

    Raises:
        SystemExit: if the species name is not recognised.
    """
    os.chdir(rep)
    allScaff = fasta.fromFastaToDico(infile)
    print(species)
    speciesKey = species.lower()
    if speciesKey == "sace":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sace/s288c.pep"
        prefixLen = 2
    elif speciesKey == "lakl" or speciesKey == "sakl":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sakl/saklRef.pep"
        prefixLen = 6
    else:
        print("Species name not understood. Should be Sace, Sakl or Lakl")
        sys.exit()

    # The log is only created once the species has been validated, so a typo
    # in the species name no longer truncates a log from a previous run.
    outfile = infile.replace(".fasta","_chimere.log")
    if outfile == infile:
        # No ".fasta" in the name: never open the input itself for writing.
        outfile = infile + "_chimere.log"

    with open(outfile,"w") as of:
        for scaff in allScaff.keys():
            if len(allScaff[scaff]) <= 1000:
                continue
            fileN = "%s.fasta" % scaff
            fasta.fromSeqToFasta(allScaff[scaff],scaff,fileN)
            # Built from the scaffold name rather than str.replace so that a
            # scaffold whose name contains "fasta" keeps its name intact.
            fileO = "%s.blastx" % scaff
            alignement.run_blastxFmt(fileN,fileO,db,1000)
            if not os.path.isfile(fileO):
                # No blast output: the query fasta is left in place.
                continue
            with open(fileO,"r") as blastOut:
                lines = blastOut.read().split("\n")

            # start position on the scaffold -> (score, protein name).
            # The score is tracked per position: keying it by protein name
            # let a protein hitting several positions overwrite the score
            # used for an earlier position.
            # NOTE(review): assumes BLAST tabular columns (2 = % identity,
            # 3 = alignment length, 6 = query start, 11 = bit score) --
            # confirm against alignement.run_blastxFmt.
            bestHit = {}
            for line in lines:
                if line == "":
                    continue
                el = line.split("\t")
                if float(el[2]) > 95 and float(el[3]) > 70:
                    start = int(el[6])
                    score = float(el[11])
                    # Strict comparison: on a tie the first hit is kept.
                    if start not in bestHit or bestHit[start][0] < score:
                        bestHit[start] = (score, el[1])

            chromosomes = set()
            for hit in bestHit.values():
                chromosomes.add(hit[1][0:prefixLen])

            # More than one chromosome in the set: chimeric scaffold.
            if len(chromosomes) > 1:
                lProt = sorted([(start, hit[1]) for (start, hit) in bestHit.items()])
                print("%s - %s bp" % (scaff,len(allScaff[scaff])))
                of.write("%s - %s bp\n" % (scaff,len(allScaff[scaff])))
                print(lProt)
                of.write("%s\n" % lProt)
                fileO2 = "%s.blastx-std" % scaff
                alignement.run_blastx(fileN,fileO2,db,1000)
            else:
                os.remove(fileO)
                os.remove(fileN)
def identifyScaffChimere(species,infile,rep = "."):
    """Log scaffolds whose blastx hits are spread over several chromosomes.

    For each scaffold longer than 1000 bp the sequence is written to
    "<scaffold>.fasta" and blasted against the species protein database.
    Hits passing the thresholds are grouped by the chromosome prefix of the
    protein name; a scaffold is reported when hits on a second chromosome
    cannot be explained by paralogues sharing the same start position.
    Reported scaffolds are printed, appended to the log and re-blasted with
    the standard report; for all others the temporary files are removed.

    Args:
        species: "Sace", "Sakl" or "Lakl" (case-insensitive); selects the
            blast database and the chromosome-prefix length (2 characters
            for Sace, 6 otherwise).
        infile: multi-fasta file of scaffolds; the log name is derived by
            replacing ".scafSeq" with "_chimere.log".
        rep: working directory; the process chdir()s into it.

    Raises:
        SystemExit: if the species name is not recognised.
    """
    os.chdir(rep)
    allScaff = fasta.fromFastaToDico(infile)
    print(species)
    speciesKey = species.lower()
    if speciesKey == "sace":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sace/s288c.pep"
        prefixLen = 2
    elif speciesKey == "lakl" or speciesKey == "sakl":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sakl/saklRef.pep"
        prefixLen = 6
    else:
        print("Species name not understood. Should be Sace, Sakl or Lakl")
        sys.exit()

    # "RUN 25 Strains" setting: inputs end in ".scafSeq" (an earlier version
    # replaced ".fasta" instead).
    # NOTE(review): the flattened source makes it ambiguous whether these two
    # statements were commented out; they are treated as live because the log
    # handle is used further down.
    outfile = infile.replace(".scafSeq","_chimere.log")
    if outfile == infile:
        # Suffix absent (e.g. a ".fasta" input): never open the input itself
        # for writing.
        outfile = infile + "_chimere.log"

    with open(outfile,"w") as of:
        for scaff in allScaff.keys():
            if len(allScaff[scaff]) <= 1000:
                continue
            fileN = "%s.fasta" % scaff
            fasta.fromSeqToFasta(allScaff[scaff],scaff,fileN)
            # Built from the scaffold name rather than str.replace so that a
            # scaffold whose name contains "fasta" keeps its name intact.
            fileO = "%s.blastx" % scaff
            alignement.run_blastxFmt(fileN,fileO,db,1000)
            if not os.path.isfile(fileO):
                # No blast output: the query fasta is left in place.
                continue
            with open(fileO,"r") as blastOut:
                lines = blastOut.read().split("\n")

            refChrom = ""              # chromosome prefix of the first retained hit
            foreignHit = False         # a hit on another chromosome was seen
            dicProt = {}               # protein name -> start position (last hit wins)
            refPositions = set()       # start positions of hits on refChrom
            foreignPositions = set()   # start positions of hits elsewhere
            # NOTE(review): assumes BLAST tabular columns (1 = subject name,
            # 2 = % identity, 3 = alignment length, 6 = query start) --
            # confirm against alignement.run_blastxFmt.
            for line in lines:
                if line == "":
                    continue
                el = line.split("\t")
                # Earlier, looser thresholds were 90 / 50.
                if not (float(el[2]) > 95 and float(el[3]) > 70):
                    continue
                protName = el[1]
                start = int(el[6])
                dicProt[protName] = start
                if refChrom == "":
                    refChrom = protName[0:prefixLen]
                    refPositions.add(start)
                elif protName[0:2] != refChrom and protName[0:6] != refChrom:
                    foreignHit = True
                    foreignPositions.add(start)
                else:
                    refPositions.add(start)

            # Ignore scaffolds whose apparent chimerism is due to a paralogue:
            # the smaller group of start positions must contain at least one
            # position absent from the other group.
            chimere = False
            if foreignHit:
                if len(refPositions) < len(foreignPositions):
                    chimere = not refPositions.issubset(foreignPositions)
                else:
                    chimere = not foreignPositions.issubset(refPositions)

            lchrom = []
            lProt = []
            if chimere:
                lProt = list(dicProt.items())
                # cmpval is defined elsewhere in this module; presumably it
                # orders the (name, position) pairs by position -- the loop
                # below relies on positions being non-decreasing.
                lProt.sort(cmpval)
                pos = 0
                # toR tells whether the last chromosome appended to lchrom
                # may still be removed.
                toR = 0
                for (protName, start) in lProt:
                    if start > pos:
                        chrom = protName[0:prefixLen]
                        if chrom not in lchrom:
                            lchrom.append(chrom)
                            toR = 1
                        pos = start
                    # The data being sorted, a position that is not greater
                    # is equal to the previous one.
                    elif toR == 1:
                        # Drop the last chromosome added: the list must carry
                        # no information from paralogues located at the same
                        # position.  Removal is then disabled in case more
                        # than two paralogues share that position.
                        # NOTE(review): toR is not reset when a new position
                        # adds nothing, so this can pop a chromosome recorded
                        # at an earlier position -- original behaviour kept.
                        lchrom.pop()
                        toR = 0

            if len(lchrom) > 1:
                print("%s - %s bp" % (scaff,len(allScaff[scaff])))
                of.write("%s - %s bp\n" % (scaff,len(allScaff[scaff])))
                print(lProt)
                of.write("%s\n" % lProt)
                fileO2 = "%s.blastx-std" % scaff
                alignement.run_blastx(fileN,fileO2,db,1000)
            else:
                os.remove(fileO)
                os.remove(fileN)
def stats4StrainsALarrache(suff,outfile = ""):
    """Report, per IL01 sequence, the positions that differ from the reference.

    Reads "AliProt/<suff>-4strains.fasta" and compares every sequence whose
    name contains "IL01" with the reference row named `suff`.  For each
    position where the IL01 residue is not a gap and differs from the
    reference, the residues of the YJM981, WE372 and NC02 rows are collected
    and one tab-separated line is emitted:

        suff, position (1-based), IL01 residue, reference residue,
        WE372 residue, NC02 residue, YJM981 residue, opt1, opt2, opt3

    where opt1 = IL01 equals WE372 or NC02, opt2 = YJM981 equals the
    reference, opt3 = IL01 equals YJM981 while opt1 is 0.  A strain residue
    is "" when only gaps were seen and "ambigue" when its rows disagree.
    If another IL01 row matches the reference at that position, a
    "position a verifier" line is emitted instead.

    Args:
        suff: name of the reference row, also used to build the file name.
        outfile: file to append the report to; "" (default) prints to
            stdout.  The "suff<TAB>sequence" progress line always goes to
            stdout.
    """
    log = None
    if outfile != "":
        log = open(outfile, 'a')

    def emit(line, padStdout = False):
        # stdout mode keeps the historical extra blank line after data rows.
        if log is None:
            if padStdout:
                print(line + "\n")
            else:
                print(line)
        else:
            log.write(line + "\n")

    def merge(current, residue):
        # Fold one more residue into a strain's consensus value.
        if residue == "-":
            return current
        if current == "":
            return residue
        if current != residue:
            return "ambigue"
        return current

    # try/finally so the log handle is released even if the alignment cannot
    # be read or the reference row is missing.
    try:
        fileP = "AliProt/%s-4strains.fasta" % suff
        aliP = fasta.fromFastaToDico(fileP)
        lSeqILP = [seqNP for seqNP in aliP.keys() if "IL01" in seqNP]
        seqRefP = aliP[suff]
        for seqILnP in lSeqILP:
            print("%s\t%s" % (suff,seqILnP))
            seqILP = aliP[seqILnP]
            for i in range(len(seqILP)):
                resIL = seqILP[i]
                resRef = seqRefP[i]
                if resIL == resRef or resIL == "-":
                    continue
                aVerif = 0
                resWE = ""
                resNC = ""
                resYJM = ""
                for (seqNP,sequenceP) in aliP.items():
                    if "IL01" in seqNP:
                        if sequenceP[i] == resRef:
                            aVerif = 1
                    else:
                        if "YJM981" in seqNP:
                            resYJM = merge(resYJM, sequenceP[i])
                        if "WE372" in seqNP:
                            resWE = merge(resWE, sequenceP[i])
                        if "NC02" in seqNP:
                            resNC = merge(resNC, sequenceP[i])
                if aVerif == 1:
                    emit("%s\t%s\tposition a verifier pour chevauchement IL01" % (suff,i+1))
                    continue
                # The former `res != "-"` guards were dead code: a consensus
                # value is never "-", and resIL (a real residue) can match
                # neither "" nor "ambigue".
                opt1 = 0   # IL = WE or IL = NC
                opt2 = 0   # YJM = reference
                opt3 = 0   # sought profile: IL = YJM and different from the others
                if resIL == resWE or resIL == resNC:
                    opt1 = 1
                if resRef == resYJM:
                    opt2 = 1
                elif resIL == resYJM and opt1 == 0:
                    opt3 = 1
                emit("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (suff,i+1,resIL,resRef,resWE,resNC,resYJM,opt1,opt2,opt3), True)
    finally:
        if log is not None:
            log.close()
def _scanIL01Alignment(ali, ilNames, refName, report, foldCase, padDiffOnStdout):
    """Compare the IL01 rows of one alignment with its reference row.

    Emits one line per differing, non-gap position through `report` and
    returns (reference length without gaps, summed IL01 length without gaps,
    number of differences, number of isolated differences).
    """
    def same(a, b):
        if foldCase:
            return a.upper() == b.upper()
        return a == b

    seqRef = ali[refName]
    nbDiff = 0
    nbDiffIsolee = 0
    lgIL = 0
    for ilName in ilNames:
        report("+++traite %s+++" % ilName)
        seqIL = ali[ilName]
        lgIL += len(seqIL.replace("-",""))
        for i in range(len(seqIL)):
            if same(seqIL[i], seqRef[i]) or seqIL[i] == "-":
                continue
            # Initialised for every position so the flag can neither be
            # unbound nor inherited from the previous position.
            aVerif = False
            lSeqId = []   # non-IL01 rows carrying the same residue as IL01
            for (seqN,sequence) in ali.items():
                if "IL01" in seqN:
                    if same(sequence[i], seqRef[i]):
                        # Another IL01 row agrees with the reference here.
                        aVerif = True
                        break
                elif same(sequence[i], seqIL[i]):
                    lSeqId.append(seqN)
            if aVerif:
                report("%s : position a verifier pour chevauchement IL01" % (i+1))
                continue
            nbDiff += 1
            if len(lSeqId) == 0:
                nbDiffIsolee += 1
            line = "%s\t%s\t%s" % (i+1,len(lSeqId),lSeqId)
            if padDiffOnStdout:
                report(line, line + "\n")
            else:
                report(line)
    return (len(seqRef.replace("-","")), lgIL, nbDiff, nbDiffIsolee)


def statsALarrache(suff,outfile = ""):
    """Summarise IL01-versus-reference differences in protein and gene alignments.

    Reads "AliProt/<suff>-11strains.fasta" and
    "AliGene/<suff>-G-11strains.fasta".  In each alignment every row whose
    name contains "IL01" is compared with the reference row named `suff`
    (case-insensitively for genes).  For each non-gap position that differs,
    the 1-based position, the number of non-IL01 rows sharing the IL01
    residue and their names are reported; a difference shared by no other
    row is counted as isolated.  When another IL01 row matches the reference
    at that position a "position a verifier" line is reported instead.

    A tab-separated summary line is always printed to stdout at the end:
    suff, then for proteins and for genes: ungapped reference length, number
    of IL01 rows, summed ungapped IL01 length, differences, isolated
    differences.

    Args:
        suff: name of the reference row, also used to build the file names.
        outfile: file to append the detailed report to; "" (default) prints
            it to stdout.
    """
    log = None
    if outfile != "":
        log = open(outfile, 'a')

    def report(text, stdoutText = None):
        # File mode writes `text` plus a newline; stdout mode prints
        # `stdoutText` when given (historical formatting differences).
        if log is None:
            if stdoutText is None:
                print(text)
            else:
                print(stdoutText)
        else:
            log.write(text + "\n")

    # try/finally so the log handle is released even if an alignment cannot
    # be read or the reference row is missing.
    try:
        fileP = "AliProt/%s-11strains.fasta" % suff
        fileG = "AliGene/%s-G-11strains.fasta" % suff

        aliP = fasta.fromFastaToDico(fileP)
        lSeqILP = [seqNP for seqNP in aliP.keys() if "IL01" in seqNP]
        report("\nTraite %s:\n----Alignement des proteines----\non a %s sequence de IL01" % (suff,len(lSeqILP)))
        (lgRefP, lgILP, nbDiffP, nbDiffPIsolee) = _scanIL01Alignment(aliP, lSeqILP, suff, report, False, True)

        aliG = fasta.fromFastaToDico(fileG)
        lSeqILG = [seqNG for seqNG in aliG.keys() if "IL01" in seqNG]
        # The stdout variant keeps the double spaces produced by the original
        # comma-separated print statement.
        report("----Alignement des genes----\non a %s sequence de IL01" % len(lSeqILG),
               "----Alignement des genes----\non a  %s  sequence de IL01" % len(lSeqILG))
        (lgRefG, lgILG, nbDiffG, nbDiffGIsolee) = _scanIL01Alignment(aliG, lSeqILG, suff, report, True, False)
    finally:
        if log is not None:
            log.close()

    print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (suff,lgRefP,len(lSeqILP),lgILP,nbDiffP,nbDiffPIsolee,lgRefG,len(lSeqILG),lgILG,nbDiffG,nbDiffGIsolee))
def longScaff(infile):
    """Print "<name><TAB><length>" for every sequence loaded from `infile`."""
    records = fasta.fromFastaToDico(infile)
    for (name, sequence) in records.items():
        print("%s\t%s" % (name, len(sequence)))