def creeSequenceDeGene(scaff, deb, fin):
    """Extract one region of a scaffold and save it as a FASTA file.

    The scaffold is loaded from ``ScaffTfa/<scaff>.tfa`` and the output is
    written to ``<scaff>-<deb>-<fin>.tfa`` in the current directory.  When
    that file already exists, ``-2`` is appended to the whole file name
    (i.e. after the ``.tfa`` extension) instead of overwriting it.

    Args:
        scaff: scaffold name; used for the input path, the output name and
            the FASTA header.
        deb: start coordinate, as a decimal string.
        fin: end coordinate, as a decimal string.

    NOTE(review): later definitions of ``creeSequenceDeGene`` in this file
    rebind the same name, so this three-argument version is shadowed once
    the module has been loaded completely.
    """
    coords = (scaff, deb, fin)

    # Load the full scaffold, then cut out the requested region.
    # The coordinate convention is whatever extraitSeqGene expects.
    scaffold_seq = fasta.seqEnVar("ScaffTfa/%s.tfa" % scaff)
    gene_seq = extraitSeqGene(scaffold_seq, string.atoi(deb), string.atoi(fin))

    out_path = "%s-%s-%s.tfa" % coords
    if os.path.isfile(out_path):
        # Do not clobber an earlier extraction of the same region.
        out_path = "%s-2" % out_path

    fasta.fromSeqToFasta(gene_seq, "%s %s %s" % coords, out_path)
def creeSequenceDeGene(scaff, deb, fin, geneN, repGene, repSeq):
    """Extract one gene region from a scaffold and save it as a FASTA file.

    The scaffold is loaded from ``<repSeq>/<scaff>.tfa`` and the region is
    written to ``<repGene>/<geneN>_<scaff>-<deb>-<fin>.tfa`` with the header
    ``<scaff>-<deb>-<fin>``.  An existing output file is never overwritten:
    the function prints a message and returns without touching the scaffold.

    Args:
        scaff: scaffold name.
        deb: start coordinate (decimal string or integer).
        fin: end coordinate (decimal string or integer).
        geneN: gene name, used as prefix of the output file name.
        repGene: directory receiving the gene FASTA file.
        repSeq: directory holding the scaffold ``.tfa`` files.
    """
    filename = "%s/%s_%s-%s-%s.tfa" % (repGene, geneN, scaff, deb, fin)

    # Single existence check, done before the scaffold is loaded: the
    # previous version tested the path twice and read and sliced the
    # scaffold even when nothing was going to be written.
    if os.path.isfile(filename):
        print("il existe deja")
        return

    header = "%s-%s-%s" % (scaff, deb, fin)
    ficScaff = "%s/%s.tfa" % (repSeq, scaff)
    seq = fasta.seqEnVar(ficScaff)
    # int() replaces the deprecated string.atoi and also accepts integers.
    seqGen = extraitSeqGene(seq, int(deb), int(fin))
    fasta.fromSeqToFasta(seqGen, header, filename)
def GC_Cleft():
    """Export the start of Sakl0C for each strain and run GC.txGC on it.

    For every strain of the hard-coded list, the ``Sakl0C`` entry is read
    from ``<rep>/<strain>/cons<strain>.fasta``.  Its first 989693 positions
    are written to ``<rep>/<strain>/cleft_<strain>.fasta``, the strain name
    is printed, and the same slice is passed to ``GC.txGC``.
    """
    # Number of leading positions of Sakl0C that are kept.
    # NOTE(review): presumably the boundary of the "left" part of
    # chromosome C -- confirm where this coordinate comes from.
    cleft_end = 989693
    base_dir = "/Volumes/BioSan/Users/friedrich/GB-3G/BWA/Nuclear/CleanPE"
    strains = (
        '55-86_1', '62-1041', 'CBS3082a', 'CBS3082b', '77-1003', 'NCYC543',
        '62-196', 'CBS6545', 'CBS6546', 'CBS6547', 'CBS6626', 'NRBC1892',
        'CBS10367', 'CBS10368', 'CBS4104', '68917-2', 'DBVPG4002', '67-588',
        'NRBC1811', 'NRBC10572', 'NRBC10955', 'NRBC101999', 'CBS10369',
        'CBS5828', 'dd281a', 'CBS2861', 'CBS4568', 'DBVPG3452', 'DBVPG3108',
    )

    for strain in strains:
        strain_dir = "%s/%s" % (base_dir, strain)
        consensus_path = "%s/cons%s.fasta" % (strain_dir, strain)
        chrom_c = fasta.multiSeqEnVar(consensus_path, "Sakl0C")

        label = "cleft_%s" % strain
        out_path = "%s/%s.fasta" % (strain_dir, label)
        fasta.fromSeqToFasta(chrom_c[:cleft_end], label, out_path)

        print(strain)
        GC.txGC(chrom_c[:cleft_end])
def creeSequenceDeGene(scaff, deb, fin, geneN, repGene, repSeq, long):
    """Extract one gene region, mask ambiguity codes and save it as FASTA.

    The scaffold is loaded from ``<repSeq>/<scaff>.tfa``; the extracted
    region is upper-cased, the characters X, S, W, R, Y, K and M are
    replaced by ``N``, and the result is written to
    ``<repGene>/<geneN>_<scaff>-<deb>-<fin>.tfa`` with the scaffold name as
    header.  An existing output file is never overwritten: the function
    prints a message and returns without loading the scaffold.

    Args:
        scaff: scaffold name (also used as the FASTA header).
        deb: start coordinate (decimal string or integer).
        fin: end coordinate (decimal string or integer).
        geneN: gene name, used as prefix of the output file name.
        repGene: directory receiving the gene FASTA file.
        repSeq: directory holding the scaffold ``.tfa`` files.
        long: length of the reference sequence.  Currently unused because
            the length filter below is disabled; kept for caller
            compatibility.
    """
    filename = "%s/%s_%s-%s-%s.tfa" % (repGene, geneN, scaff, deb, fin)

    # Check first, so the scaffold is not loaded and sliced for nothing.
    if os.path.isfile(filename):
        print("il existe deja")
        return

    header = "%s" % (scaff)
    ficScaff = "%s/%s.tfa" % (repSeq, scaff)
    seq = fasta.seqEnVar(ficScaff)
    # int() replaces the deprecated string.atoi and also accepts integers.
    seqGen = extraitSeqGene(seq, int(deb), int(fin)).upper()

    # Disabled length filter (the old wrapper was an always-true
    # ``if 1 == 1``).  The intended rule kept a sequence only when it was
    # long enough relative to the reference and a multiple of 3:
    #   (len(seqGen) > long * 0.85 or len(seqGen) > 500) and len(seqGen) % 3 == 0

    # Mask these ambiguity codes with N.  Other codes (e.g. B, D, H, V)
    # are deliberately left untouched, exactly as before.
    for code in "XSWRYKM":
        seqGen = seqGen.replace(code, "N")

    fasta.fromSeqToFasta(seqGen, header, filename)
def identifyScaffChimereBU(species,infile,rep = "."):
    """Experimental variant of the chimeric-scaffold detection routine.

    Every scaffold of ``infile`` longer than 1000 is written to its own
    FASTA file and passed to ``alignement.run_blastxFmt`` against the
    protein database selected by ``species``.  For each hit position only
    the best-scoring protein is kept; a scaffold whose retained proteins
    carry more than one chromosome prefix is reported as chimeric (printed
    and appended to ``<infile with .fasta -> _chimere.log>``) and re-run
    through ``alignement.run_blastx``.  Files of non-chimeric scaffolds are
    deleted.

    Args:
        species: "Sace", "Sakl" or "Lakl" (case-insensitive); anything else
            prints an error and calls ``sys.exit()``.
        infile: multi-FASTA file of scaffolds.
        rep: working directory; the process chdir()s into it.

    NOTE(review): the nesting below was reconstructed from a flattened copy
    of the source -- verify the indentation against the original file.
    """
    # Attempt to improve the chimeric-scaffold detection routine.
    # Results are not convincing!!
    # When several hits match the same position on a scaffold, the idea was to keep only the best-scoring one, but we often observe
    # either identical scores or higher scores for genes coming from different chromosomes,
    # so the results end up being arguably worse!!
    # It may be better to post-process the results of the first routine instead.
    os.chdir(rep)
    allScaff = fasta.fromFastaToDico(infile)
    outfile = infile.replace(".fasta","_chimere.log")
    # NOTE(review): the log is opened before the species check, so an
    # unknown species exits with the handle still open.
    of = open(outfile,"w")
    print species
    if species.lower() == "sace":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sace/s288c.pep"
    elif species.lower() == "lakl" or species.lower() == "sakl":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sakl/saklRef.pep"
    else:
        print "Species name not understood. Should be Sace, Sakl or Lakl"
        sys.exit()
    for scaff in allScaff.keys():
        # Short scaffolds are skipped.
        if len(allScaff[scaff]) > 1000:
            fileN = "%s.fasta" % scaff
            fasta.fromSeqToFasta(allScaff[scaff],scaff,fileN)
            fileO = fileN.replace("fasta","blastx")
            alignement.run_blastxFmt(fileN,fileO,db,1000)
            if os.path.isfile(fileO):
                lines = open(fileO,"r").read().split("\n")
                # NOTE(review): 'chimere' is never read in this variant.
                chimere = 0
                # dicProt: hit position (el[6]) -> protein name (el[1])
                # dicProtScore: protein name -> score (el[11])
                dicProt = {}
                dicProtScore = {}
                for line in lines:
                    if line != "":
                        # Presumably BLAST tabular output (el[2] = % identity,
                        # el[3] = alignment length, el[6] = query start,
                        # el[11] = bit score) -- confirm against run_blastxFmt.
                        el = line.split("\t")
                        if string.atof(el[2]) > 95 and string.atof(el[3]) > 70:
                            if string.atoi(el[6]) in dicProt:
                                # Position already seen: keep the new protein
                                # only if its score is strictly higher.
                                prot = dicProt[string.atoi(el[6])]
                                score = dicProtScore[prot]
                                if score < string.atof(el[11]):
                                    del dicProt[string.atoi(el[6])]
                                    dicProt[string.atoi(el[6])] = el[1]
                                    dicProtScore[el[1]] = string.atof(el[11])
                            else:
                                dicProt[string.atoi(el[6])] = el[1]
                                dicProtScore[el[1]] = string.atof(el[11])
                # Distinct chromosome prefixes among the retained proteins:
                # first 2 characters of the name for "sace", first 6 otherwise.
                lChr = []
                for key in dicProt.keys():
                    if species.lower() == "sace":
                        if dicProt[key][0:2] not in lChr:
                            lChr.append(dicProt[key][0:2])
                    else:
                        if dicProt[key][0:6] not in lChr:
                            lChr.append(dicProt[key][0:6])
                # More than one chromosome in the list means we are dealing with a chimeric scaffold.
                if len(lChr) > 1:
                    lProt = dicProt.items()
                    lProt.sort()
                    print "%s - %s bp" % (scaff,len(allScaff[scaff]))
                    of.write("%s - %s bp\n" % (scaff,len(allScaff[scaff])))
                    print lProt
                    of.write("%s\n" % lProt)
                    fileO2 = fileO.replace("blastx","blastx-std")
                    alignement.run_blastx(fileN,fileO2,db,1000)
                else:
                    # Not chimeric: discard the per-scaffold files.
                    os.remove(fileO)
                    os.remove(fileN)
    of.close()
def identifyScaffChimere(species,infile,rep = "."):
    """Report scaffolds whose protein hits come from several chromosomes.

    Every scaffold of ``infile`` longer than 1000 is written to its own
    FASTA file and passed to ``alignement.run_blastxFmt`` against the
    protein database selected by ``species``.  Hits passing the thresholds
    are grouped by chromosome prefix; scaffolds whose multi-chromosome
    signal is not explained by hits sharing the same position (treated as
    paralogues) are printed, appended to the log file and re-run through
    ``alignement.run_blastx``.  Files of all other scaffolds are deleted.

    Args:
        species: "Sace", "Sakl" or "Lakl" (case-insensitive); anything else
            prints an error and calls ``sys.exit()``.
        infile: multi-FASTA file of scaffolds.
        rep: working directory; the process chdir()s into it.

    NOTE(review): the nesting below was reconstructed from a flattened copy
    of the source -- verify the indentation against the original file.
    """
    os.chdir(rep)
    allScaff = fasta.fromFastaToDico(infile)
    #outfile = infile.replace(".fasta","_chimere.log")
    ## RUN 25 Strains ##
    # NOTE(review): in the flattened source these two assignments sit between
    # '##' markers; they are treated as live code because 'of' is used
    # further down -- confirm.
    outfile = infile.replace(".scafSeq","_chimere.log")
    ##
    of = open(outfile,"w")
    print species
    if species.lower() == "sace":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sace/s288c.pep"
    elif species.lower() == "lakl" or species.lower() == "sakl":
        db = "/Volumes/BioSan/Users/friedrich/BlastDB/Peptidic/Sakl/saklRef.pep"
    else:
        print "Species name not understood. Should be Sace, Sakl or Lakl"
        sys.exit()
    for scaff in allScaff.keys():
        # Short scaffolds are skipped.
        if len(allScaff[scaff]) > 1000:
            fileN = "%s.fasta" % scaff
            fasta.fromSeqToFasta(allScaff[scaff],scaff,fileN)
            fileO = fileN.replace("fasta","blastx")
            alignement.run_blastxFmt(fileN,fileO,db,1000)
            if os.path.isfile(fileO):
                lines = open(fileO,"r").read().split("\n")
                # chr: chromosome prefix of the first accepted hit
                # (note: this local shadows the builtin chr()).
                chr = ""
                chimere = 0
                # dicProt: protein name (el[1]) -> hit position (el[6])
                dicProt = {}
                # lP1: positions of hits on the first chromosome seen,
                # lP2: positions of hits on any other chromosome.
                lP1 = []
                lP2 = []
                for line in lines:
                    if line != "":
                        # Presumably BLAST tabular output (el[2] = % identity,
                        # el[3] = alignment length, el[6] = query start)
                        # -- confirm against run_blastxFmt.
                        el = line.split("\t")
                        #if string.atof(el[2]) > 90 and string.atof(el[3]) > 50:
                        if string.atof(el[2]) > 95 and string.atof(el[3]) > 70:
                            dicProt[el[1]] = string.atoi(el[6])
                            if chr == "":
                                # First accepted hit: chromosome prefix is the first
                                # 2 characters for "sace", the first 6 otherwise.
                                if species.lower() == "sace":
                                    chr = el[1][0:2]
                                else:
                                    chr = el[1][0:6]
                                if string.atoi(el[6]) not in lP1:
                                    lP1.append(string.atoi(el[6]))
                            elif el[1][0:2] != chr and el[1][0:6] != chr:
                                # Hit on a different chromosome than the first one.
                                chimere = 1
                                if string.atoi(el[6]) not in lP2:
                                    lP2.append(string.atoi(el[6]))
                            else:
                                if string.atoi(el[6]) not in lP1:
                                    lP1.append(string.atoi(el[6]))
                if chimere == 1:
                    # Ignore scaffolds whose apparent chimerism is due to the presence of a paralogue:
                    # the flag is only raised again if the shorter position list
                    # contains a position missing from the other list.
                    chimere = 0
                    if len(lP1) < len(lP2):
                        for p1 in lP1:
                            if p1 not in lP2:
                                chimere = 1
                    else:
                        for p2 in lP2:
                            if p2 not in lP1:
                                chimere = 1
                if chimere == 1:
                    lProt = dicProt.items()
                    # cmpval is defined elsewhere; the comments below assume it
                    # orders the (protein, position) pairs by position.
                    lProt.sort(cmpval)
                    pos = 0
                    lchrom = []
                    # toR tells whether the last value of the list may be removed or not
                    #print "%s - %s bp" % (scaff,len(allScaff[scaff]))
                    #print lProt
                    toR = 0
                    for prot in lProt:
                        #print prot
                        if prot[1] > pos:
                            if species.lower() == "sace":
                                if prot[0][0:2] not in lchrom:
                                    lchrom.append(prot[0][0:2])
                                    toR = 1
                            else:
                                if prot[0][0:6] not in lchrom:
                                    lchrom.append(prot[0][0:6])
                                    toR = 1
                            pos = prot[1]
                            #print lchrom
                        # The data being sorted,
                        # a value that is not greater must be equal.
                        else:
                            if toR == 1:
                                # Remove the last chromosome added to the list:
                                # the list must hold no information about paralogues located at the same position.
                                lchrom.pop()
                                # Disable removal for the next iteration, in case more than 2 paralogues are found at the same position.
                                toR = 0
                    # NOTE(review): the result of this expression is discarded,
                    # so lchrom is left unchanged by this line.
                    list(set(lchrom))
                    # NOTE(review): firstChrom / lastChrom are only used by the
                    # commented-out condition below.
                    if species.lower() == "sace":
                        firstChrom = lProt[0][0][0:2]
                        lastChrom = lProt[-1][0][0:2]
                    else:
                        firstChrom = lProt[0][0][0:6]
                        lastChrom = lProt[-1][0][0:6]
                    #if len(lchrom) > 1 and firstChrom != lastChrom:
                    if len(lchrom) > 1:
                        # Chimeric scaffold: report it and keep its files.
                        print "%s - %s bp" % (scaff,len(allScaff[scaff]))
                        of.write("%s - %s bp\n" % (scaff,len(allScaff[scaff])))
                        print lProt
                        of.write("%s\n" % lProt)
                        fileO2 = fileO.replace("blastx","blastx-std")
                        alignement.run_blastx(fileN,fileO2,db,1000)
                    else:
                        os.remove(fileO)
                        os.remove(fileN)
                else:
                    # Not chimeric: discard the per-scaffold files.
                    os.remove(fileO)
                    os.remove(fileN)
    of.close()
def decomposeChromos():
    """Split each strain's consensus FASTA into one file per chromosome.

    For every strain of the hard-coded list, the entries ``Sakl0A`` to
    ``Sakl0H`` are read from ``<rep>/<strain>/cons<strain>.fasta`` and each
    one is written to ``<rep>/<strain>/Sakl0X_<strain>.fasta`` with the
    header ``Sakl0X_<strain>``.
    """
    base_dir = "/Volumes/BioSan/Users/friedrich/GB-3G/BWA/Nuclear/CleanPE"
    strains = (
        '55-86_1', '62-1041', 'CBS3082a', 'CBS3082b', '77-1003', 'NCYC543',
        '62-196', 'CBS6545', 'CBS6546', 'CBS6547', 'CBS6626', 'NRBC1892',
        'CBS10367', 'CBS10368', 'CBS4104', '68917-2', 'DBVPG4002', '67-588',
        'NRBC1811', 'NRBC10572', 'NRBC10955', 'NRBC101999', 'CBS10369',
        'CBS5828', 'dd281a', 'CBS2861', 'CBS4568', 'DBVPG3452', 'DBVPG3108',
    )
    chromosomes = ["Sakl0%s" % letter for letter in "ABCDEFGH"]

    for strain in strains:
        strain_dir = "%s/%s" % (base_dir, strain)
        consensus_path = "%s/cons%s.fasta" % (strain_dir, strain)

        # Read all eight chromosomes before writing anything, so a failed
        # read leaves no partial output for the strain.
        sequences = [
            fasta.multiSeqEnVar(consensus_path, chrom) for chrom in chromosomes
        ]

        for chrom, sequence in zip(chromosomes, sequences):
            label = "%s_%s" % (chrom, strain)
            out_path = "%s/%s.fasta" % (strain_dir, label)
            fasta.fromSeqToFasta(sequence, label, out_path)