def extractListFromFasta2(sequenceFile, FileList):
    """Return the entries of a FASTA file whose identifiers are in FileList.

    :param sequenceFile: FASTA file reference, passed unchanged to
        ``fasta2dict``.
    :param FileList: container of identifiers to keep; it is only used
        through membership tests (``ID in FileList``).
    :return: dict mapping each kept identifier to the value ``fasta2dict``
        stored under it. Identifiers of FileList that are absent from the
        file are silently ignored; an empty dict is returned when nothing
        matches.
    """
    # Load all the FASTA sequences into a dictionary keyed by identifier.
    # NOTE(review): fasta2dict is defined elsewhere; presumably it returns
    # {sequence id: Biopython SeqRecord} -- confirm against its definition.
    dictSequences = fasta2dict(sequenceFile)

    # Keep only the requested identifiers, keyed by identifier.
    # The previous code used the record itself as the key
    # (dicoOutput[record] = record). If the values are SeqRecord objects
    # that raises TypeError (SeqRecord is unhashable), and in any case the
    # result could not be looked up by sequence ID.
    return {
        ID: record
        for ID, record in dictSequences.items()
        if ID in FileList
    }
souche = soucheIndice[i] #print(souche) if souche not in dicoSeqBuild[geneID].keys(): dicoSeqBuild[geneID][souche] = SNP else: dicoSeqBuild[geneID][souche] += SNP i += 1 ctr += 1 #print(dict2txt(dicoSeqBuild)) # chargement des sequences 3 souches for files in fastaPath.lsExtInDirToList("fasta"): geneID = files.split("_")[1].replace("gene", "gene_") dico = fasta2dict(files) if geneID in dicoSeqBuild.keys(): dicoSeqBuild[geneID].update(dico) countOnlyN = 0 listSeqNFind = [] for geneID, dico in dicoSeqBuild.items(): with open( "/work/carlier.j/globalPopGenomicMF/buildSeq62/test/" + geneID + ".fasta", "w") as output_handle: seqNfind = False for souche, txtseq in dico.items(): if type(txtseq) is not SeqRecord: txtseqNdel = txtseq.replace("N", "")
for MGG in listFiles: if MGG in dicoMGG2BR32.keys(): BR32List.append(dicoMGG2BR32[MGG]) else: notMGG.append(MGG) print("BR32List count:%i" %(len(BR32List))) print("notMGG count:%i" %(len(notMGG))) dicoFastaSample = {} for sampleFile in pathSampleIn.listFiles: souche = sampleFile.split("/")[-1].split(".")[0] #print(souche) dicoFastaSample[souche] = fasta2dict(sampleFile) #print(dicoFastaSample) #print(pathDirectoryOut.pathDirectory) count = 0 for filename in pathDirectoryIn.listFiles: basename = filename.split("/")[-1] MGG = "_".join(filename.split("/")[-1].split("_")[0:2]) if MGG in dicoMGG2BR32.keys() : BR32ID = dicoMGG2BR32[MGG] first = 0 for souche in dicoFastaSample.keys():
nbEmpty=0 other=0 listCDSfiles = workingObjDir.lsExtInDirToList("codingseq") statFile = open(workingObjDir.pathDirectory+"statsCDSInfo.txt","w") listFile = open(workingObjDir.pathDirectory+"CDSfilelist.txt","w") statFile.write("fileName\tnbstartonly\tnbstoponly\tnbstartandstop\tnbEmpty\ttotal\tOther\n") for fileCDS in sorted(listCDSfiles): fileName = fileCDS.split("/")[-1].split(".")[0] listFile.write(fileCDS+"\n") if args.fastaValue in ["yes","y"]: output_file = open(workingObjDir.pathDirectory+fileName+".fasta", "w") record_dict = fasta2dict(fileCDS) for name in sorted(record_dict.keys(), key=sort_human): record = record_dict[name] oldNumID = record.id new_record_name = fileName+"_"+oldNumID record.id = new_record_name record.name = "" seq = record.seq.upper() firstCodon = seq[:3] endCodon = seq[-3:] # Test ATG start and Stop codons if str(firstCodon.upper()) in "ATG": startATG=1 else: startATG=0
#Welcome message print("#################################################################") print("# Welcome in extractSeqFastaFromLen (Version " + version + ") #") print("#################################################################") print('Start time: ', start_time, '\n') # Récupère le fichier de conf passer en argument fastaFile = relativeToAbsolutePath(args.fastaFile) outputfilename = relativeToAbsolutePath(args.paramoutfile) lenSize = args.lenSize keepValue = args.keepValue output_handle = open(relativeToAbsolutePath(outputfilename), "w") dicoSize = lenSeq2dict(fastaFile) dicoFasta = fasta2dict(fastaFile) nbKeep = 0 nbTotal = len(dicoFasta.keys()) for ID in sorted(dicoSize.keys(), key=sort_human): lenSeq = dicoSize[ID] if keepValue in ["g", "greater"]: if lenSeq >= lenSize: sequence = dicoFasta[ID] SeqIO.write(sequence, output_handle, "fasta") nbKeep += 1 elif keepValue in ["l", "lower"]: if lenSeq <= lenSize: sequence = dicoFasta[ID]
seq = record.seq SeqIO.write(record.upper(), output_handle, "fasta") ###### os.makedirs(outputfilePath + "orthologue/", exist_ok=True) os.system("rm " + outputfilePath + "orthologue/*.fasta") #Concatenation des orthologues de Farman et Gemo listFastaOut = lsFastaInDirToList(outputfilePath) nblignetotal = len(listFastaOut) #print(listFastaOut) ctr = 0 dicoOpenFile = {} for fastaFile in listFastaOut: dictSequences = {} dictSequences = fasta2dict(fastaFile) print(fastaFile) percent = (float(ctr) / float(nblignetotal)) * 100 sys.stdout.write("\rProcessed up to %0.2f %%...\t" % percent) sys.stdout.flush() for geneId, record in dictSequences.items(): #print(geneId) MGGName = "_".join(geneId.split("_")[0:2]) if MGGName in toRM or "T1" in MGGName or "T2" in MGGName: #print(MGGName) MGGName = MGGName.replace("T0", "T0").replace("T1", "T0").replace( "T2", "T0") #print(MGGName) souche = geneId.split("_")[2]