def extractListFromFasta2(sequenceFile, FileList):
    """Return the entries of a FASTA file whose IDs appear in ``FileList``.

    The file is loaded with ``fasta2dict`` and every (ID, record) pair whose
    ID is found in ``FileList`` is copied into the result.

    Args:
        sequenceFile: FASTA file handed as-is to ``fasta2dict``.
        FileList: container of identifiers to keep; membership is tested
            with ``in``, so it is used exactly as the caller supplies it.

    Returns:
        dict: kept entries, mapping each ID to its record, in the iteration
        order of the dictionary returned by ``fasta2dict``.
    """
    # Load the FASTA sequences into a dictionary (ID -> record).
    dictSequences = fasta2dict(sequenceFile)

    # Key the output by ID. The previous version used the record itself as
    # the key, so the result could not be looked up by identifier (and the
    # assignment fails outright if the records are unhashable).
    return {
        ID: record
        for ID, record in dictSequences.items()
        if ID in FileList
    }
Example #2
0
                    souche = soucheIndice[i]
                    #print(souche)
                    if souche not in dicoSeqBuild[geneID].keys():
                        dicoSeqBuild[geneID][souche] = SNP
                    else:
                        dicoSeqBuild[geneID][souche] += SNP
                    i += 1

            ctr += 1

    #print(dict2txt(dicoSeqBuild))

    # chargement des sequences 3 souches
    for files in fastaPath.lsExtInDirToList("fasta"):
        geneID = files.split("_")[1].replace("gene", "gene_")
        dico = fasta2dict(files)
        if geneID in dicoSeqBuild.keys():
            dicoSeqBuild[geneID].update(dico)

    countOnlyN = 0
    listSeqNFind = []
    for geneID, dico in dicoSeqBuild.items():

        with open(
                "/work/carlier.j/globalPopGenomicMF/buildSeq62/test/" +
                geneID + ".fasta", "w") as output_handle:
            seqNfind = False
            for souche, txtseq in dico.items():

                if type(txtseq) is not SeqRecord:
                    txtseqNdel = txtseq.replace("N", "")
Example #3
0
	for MGG in listFiles:
		if MGG in dicoMGG2BR32.keys():
			BR32List.append(dicoMGG2BR32[MGG])
		else:

			notMGG.append(MGG)

	print("BR32List count:%i" %(len(BR32List)))
	print("notMGG count:%i" %(len(notMGG)))


	dicoFastaSample = {}
	for sampleFile in pathSampleIn.listFiles:
		souche = sampleFile.split("/")[-1].split(".")[0]
		#print(souche)
		dicoFastaSample[souche] = fasta2dict(sampleFile)

	#print(dicoFastaSample)

	#print(pathDirectoryOut.pathDirectory)

	count = 0
	for filename in pathDirectoryIn.listFiles:
		basename = filename.split("/")[-1]
		MGG = "_".join(filename.split("/")[-1].split("_")[0:2])

		if MGG in dicoMGG2BR32.keys() :
			BR32ID = dicoMGG2BR32[MGG]

			first = 0
			for souche in dicoFastaSample.keys():
Example #4
0
	nbEmpty=0
	other=0
	listCDSfiles = workingObjDir.lsExtInDirToList("codingseq")
	statFile = open(workingObjDir.pathDirectory+"statsCDSInfo.txt","w")
	listFile = open(workingObjDir.pathDirectory+"CDSfilelist.txt","w")

	statFile.write("fileName\tnbstartonly\tnbstoponly\tnbstartandstop\tnbEmpty\ttotal\tOther\n")

	for fileCDS in sorted(listCDSfiles):
		fileName = fileCDS.split("/")[-1].split(".")[0]
		listFile.write(fileCDS+"\n")

		if args.fastaValue in ["yes","y"]:
			output_file = open(workingObjDir.pathDirectory+fileName+".fasta", "w")

		record_dict = fasta2dict(fileCDS)
		for name in sorted(record_dict.keys(), key=sort_human):
			record = record_dict[name]
			oldNumID = record.id
			new_record_name = fileName+"_"+oldNumID
			record.id = new_record_name
			record.name = ""
			seq = record.seq.upper()
			firstCodon = seq[:3]
			endCodon = seq[-3:]

			# Test ATG start and Stop codons
			if str(firstCodon.upper()) in "ATG":
				startATG=1
			else:
				startATG=0
    #Welcome message
    print("#################################################################")
    print("#        Welcome in extractSeqFastaFromLen (Version " + version +
          ")          #")
    print("#################################################################")
    print('Start time: ', start_time, '\n')

    # Récupère le fichier de conf passer en argument
    fastaFile = relativeToAbsolutePath(args.fastaFile)
    outputfilename = relativeToAbsolutePath(args.paramoutfile)
    lenSize = args.lenSize
    keepValue = args.keepValue
    output_handle = open(relativeToAbsolutePath(outputfilename), "w")

    dicoSize = lenSeq2dict(fastaFile)
    dicoFasta = fasta2dict(fastaFile)

    nbKeep = 0
    nbTotal = len(dicoFasta.keys())

    for ID in sorted(dicoSize.keys(), key=sort_human):
        lenSeq = dicoSize[ID]
        if keepValue in ["g", "greater"]:
            if lenSeq >= lenSize:
                sequence = dicoFasta[ID]
                SeqIO.write(sequence, output_handle, "fasta")
                nbKeep += 1

        elif keepValue in ["l", "lower"]:
            if lenSeq <= lenSize:
                sequence = dicoFasta[ID]
                seq = record.seq
                SeqIO.write(record.upper(), output_handle, "fasta")

    ######
    os.makedirs(outputfilePath + "orthologue/", exist_ok=True)
    os.system("rm " + outputfilePath + "orthologue/*.fasta")

    #Concatenation des orthologues de Farman et Gemo
    listFastaOut = lsFastaInDirToList(outputfilePath)
    nblignetotal = len(listFastaOut)
    #print(listFastaOut)
    ctr = 0
    dicoOpenFile = {}
    for fastaFile in listFastaOut:
        dictSequences = {}
        dictSequences = fasta2dict(fastaFile)
        print(fastaFile)
        percent = (float(ctr) / float(nblignetotal)) * 100
        sys.stdout.write("\rProcessed up to %0.2f %%...\t" % percent)
        sys.stdout.flush()

        for geneId, record in dictSequences.items():
            #print(geneId)
            MGGName = "_".join(geneId.split("_")[0:2])
            if MGGName in toRM or "T1" in MGGName or "T2" in MGGName:
                #print(MGGName)
                MGGName = MGGName.replace("T0", "T0").replace("T1",
                                                              "T0").replace(
                                                                  "T2", "T0")
            #print(MGGName)
            souche = geneId.split("_")[2]