def identifyScaffChimereBU(species, infile, rep="."):
    """Flag scaffolds whose best protein hits come from several chromosomes.

    Experimental variant of the chimeric-scaffold detection routine.  The
    original author's notes (translated): the results were NOT convincing.
    The idea was that, when several proteins match at the same position on a
    scaffold, only the best-scoring one is kept; in practice the scores are
    often identical, or even higher for genes coming from different
    chromosomes, so the results are arguably worse.  Post-processing the
    output of the first routine may be a better approach.

    For every scaffold longer than 1000 bp the sequence is written to
    ``<scaff>.fasta``, searched with ``alignement.run_blastxFmt`` into
    ``<scaff>.blastx`` and the tab-separated hits are filtered.  When the
    retained hits map to more than one chromosome prefix, the scaffold is
    reported (stdout and log file) and ``alignement.run_blastx`` is run to
    produce ``<scaff>.blastx-std``; otherwise the two temporary files are
    removed.

    Parameters:
        species: "Sace", "Sakl" or "Lakl" (case-insensitive); selects the
            protein database and the length of the chromosome prefix taken
            from the protein identifier (2 characters for Sace, 6 otherwise).
        infile: multi-FASTA file holding the scaffolds, resolved relative to
            ``rep``.
        rep: working directory; the process changes into it.

    Raises:
        SystemExit: if ``species`` is not one of the supported names.
    """
    print(species)
    species_key = species.lower()
    # Validate the species before touching the file system so that a bad
    # argument does not change directory or leave an empty log behind.
    if species_key == "sace":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sace/s288c.pep"
        prefix_len = 2
    elif species_key in ("lakl", "sakl"):
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sakl/saklRef.pep"
        prefix_len = 6
    else:
        print("Species name not understood. Should be Sace, Sakl or Lakl")
        sys.exit()

    os.chdir(rep)
    allScaff = fasta.fromFastaToDico(infile)

    outfile = infile.replace(".fasta", "_chimere.log")
    if outfile == infile:
        # No ".fasta" in the name: without this guard the log would be
        # opened on top of the input file and truncate it.
        outfile = os.path.splitext(infile)[0] + "_chimere.log"

    with open(outfile, "w") as of:
        for scaff, seq in allScaff.items():
            if len(seq) <= 1000:
                continue

            fileN = "%s.fasta" % scaff
            fasta.fromSeqToFasta(seq, scaff, fileN)
            # Built from the scaffold name directly so that a scaffold whose
            # name itself contains "fasta" is not mangled.
            fileO = "%s.blastx" % scaff
            alignement.run_blastxFmt(fileN, fileO, db, 1000)
            if not os.path.isfile(fileO):
                # No BLAST output was produced; the query FASTA is left in
                # place, as in the original routine.
                continue

            with open(fileO, "r") as blast_fh:
                lines = blast_fh.read().split("\n")

            # position on the scaffold -> (protein id, score) of the best hit.
            # NOTE(review): columns are presumably BLAST tabular output
            # (2 = % identity, 3 = alignment length, 6 = query start,
            # 11 = bit score) -- confirm against alignement.run_blastxFmt.
            # The score is stored per position (not per protein) so that a
            # second HSP of the same protein elsewhere cannot overwrite the
            # value used for the comparison.
            best_hit = {}
            for line in lines:
                if line == "":
                    continue
                el = line.split("\t")
                if not (float(el[2]) > 95 and float(el[3]) > 70):
                    continue
                start = int(el[6])
                score = float(el[11])
                # Strict comparison: on equal scores the first hit is kept.
                if start not in best_hit or best_hit[start][1] < score:
                    best_hit[start] = (el[1], score)

            chromosomes = set(
                prot[0:prefix_len] for prot, _score in best_hit.values()
            )

            # More than one chromosome among the hits: chimeric scaffold.
            if len(chromosomes) > 1:
                lProt = sorted(
                    (start, hit[0]) for start, hit in best_hit.items()
                )
                header = "%s - %s bp" % (scaff, len(seq))
                print(header)
                of.write("%s\n" % header)
                print(lProt)
                of.write("%s\n" % lProt)
                fileO2 = "%s.blastx-std" % scaff
                alignement.run_blastx(fileN, fileO2, db, 1000)
            else:
                os.remove(fileO)
                os.remove(fileN)
def identifyScaffChimere(species, infile, rep="."):
    """Detect scaffolds whose protein hits come from several chromosomes.

    For every scaffold longer than 1000 bp the sequence is written to
    ``<scaff>.fasta``, searched with ``alignement.run_blastxFmt`` into
    ``<scaff>.blastx`` and the tab-separated hits are filtered.  A scaffold
    is reported (stdout and log file) and ``alignement.run_blastx`` is run
    into ``<scaff>.blastx-std`` when hits on different chromosomes remain
    after discarding conflicts explained by paralogues hitting the same
    position; otherwise the two temporary files are removed.

    Parameters:
        species: "Sace", "Sakl" or "Lakl" (case-insensitive); selects the
            protein database and the length of the chromosome prefix taken
            from the protein identifier (2 characters for Sace, 6 otherwise).
        infile: multi-FASTA file holding the scaffolds, resolved relative to
            ``rep``.  The log name is derived from a ".scafSeq" suffix, with
            a fallback for other names.
        rep: working directory; the process changes into it.

    Raises:
        SystemExit: if ``species`` is not one of the supported names.
    """
    # Local import: adapts the file's cmp-style ``cmpval`` to a sort key so
    # the call works with ``sorted`` on both Python 2.7 and Python 3.
    import functools

    print(species)
    species_key = species.lower()
    # Validate the species before touching the file system so that a bad
    # argument does not change directory or leave an empty log behind.
    if species_key == "sace":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sace/s288c.pep"
        prefix_len = 2
    elif species_key in ("lakl", "sakl"):
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sakl/saklRef.pep"
        prefix_len = 6
    else:
        print("Species name not understood. Should be Sace, Sakl or Lakl")
        sys.exit()

    os.chdir(rep)
    allScaff = fasta.fromFastaToDico(infile)

    # Naming used for the "25 strains" run (".scafSeq" inputs).
    outfile = infile.replace(".scafSeq", "_chimere.log")
    if outfile == infile:
        # No ".scafSeq" in the name (e.g. a ".fasta" input): without this
        # guard the log would be opened on top of the input and truncate it.
        outfile = os.path.splitext(infile)[0] + "_chimere.log"

    with open(outfile, "w") as of:
        for scaff, seq in allScaff.items():
            if len(seq) <= 1000:
                continue

            fileN = "%s.fasta" % scaff
            fasta.fromSeqToFasta(seq, scaff, fileN)
            # Built from the scaffold name directly so that a scaffold whose
            # name itself contains "fasta" is not mangled.
            fileO = "%s.blastx" % scaff
            alignement.run_blastxFmt(fileN, fileO, db, 1000)
            if not os.path.isfile(fileO):
                # No BLAST output was produced; the query FASTA is left in
                # place, as in the original routine.
                continue

            with open(fileO, "r") as blast_fh:
                lines = blast_fh.read().split("\n")

            # NOTE(review): columns are presumably BLAST tabular output
            # (1 = subject id, 2 = % identity, 3 = alignment length,
            # 6 = query start) -- confirm against alignement.run_blastxFmt.
            dicProt = {}            # protein id -> position (last one seen)
            ref_chrom = ""          # chromosome prefix of the first kept hit
            ref_positions = set()   # positions hit by the reference chromosome
            other_positions = set() # positions hit by any other chromosome
            for line in lines:
                if line == "":
                    continue
                el = line.split("\t")
                if not (float(el[2]) > 95 and float(el[3]) > 70):
                    continue
                prot = el[1]
                start = int(el[6])
                dicProt[prot] = start
                if ref_chrom == "":
                    ref_chrom = prot[0:prefix_len]
                    ref_positions.add(start)
                elif prot[0:prefix_len] != ref_chrom:
                    other_positions.add(start)
                else:
                    ref_positions.add(start)

            chimere = bool(other_positions)
            if chimere:
                # Ignore scaffolds whose apparent chimerism is only due to a
                # paralogue: every position of the smaller group is also
                # present in the other group.
                if len(ref_positions) < len(other_positions):
                    chimere = bool(ref_positions - other_positions)
                else:
                    chimere = bool(other_positions - ref_positions)

            keep_files = False
            if chimere:
                lProt = sorted(
                    dicProt.items(), key=functools.cmp_to_key(cmpval)
                )
                pos = 0
                lchrom = []
                # removable: whether the last chromosome pushed on lchrom may
                # still be taken back.  It is raised only when a chromosome
                # is actually appended and cleared by the pop below.
                removable = False
                for prot, start in lProt:
                    if start > pos:
                        chrom = prot[0:prefix_len]
                        if chrom not in lchrom:
                            lchrom.append(chrom)
                            removable = True
                        pos = start
                    elif removable:
                        # The data being sorted, a position that is not
                        # greater is equal: paralogues at the same position
                        # carry no information, so drop the last chromosome
                        # added.  Disable removal for the next turn in case
                        # more than two paralogues share the position.
                        lchrom.pop()
                        removable = False

                if len(lchrom) > 1:
                    header = "%s - %s bp" % (scaff, len(seq))
                    print(header)
                    of.write("%s\n" % header)
                    print(lProt)
                    of.write("%s\n" % lProt)
                    fileO2 = "%s.blastx-std" % scaff
                    alignement.run_blastx(fileN, fileO2, db, 1000)
                    keep_files = True

            if not keep_files:
                os.remove(fileO)
                os.remove(fileN)