Esempio n. 1
0
    # Print the start-up banner (tool name, version) and the start time.
    print("#################################################################")
    print("#        Welcome in extractSeqFastaFromLen (Version " + version +
          ")          #")
    print("#################################################################")
    print('Start time: ', start_time, '\n')

    # Read the input FASTA path, the output path, the length threshold and
    # the keep mode from the parsed arguments.
    fastaFile = os.path.abspath(args.fastaFile)
    outputfilename = os.path.abspath(args.paramoutfile)
    lenSize = args.lenSize
    keep = args.keep

    # NOTE(review): opened without a context manager; the matching close()
    # is not visible in this excerpt - confirm it exists further down.
    output_handle = open(outputfilename, "w")

    # The same FASTA file is loaded twice through helpers defined elsewhere.
    # presumably lenSeq2dict maps sequence ID -> length and fasta2dict maps
    # sequence ID -> record object exposing `.seq` - TODO confirm.
    dicoSize = lenSeq2dict(fastaFile)
    dicoFasta = fasta2dict(fastaFile)
    # NOTE(review): `filename` and `nbTotal` are not used in the visible part
    # of this excerpt.
    filename = fastaFile.split('/')[-1].replace('.fasta', '')
    nbTotal = len(dicoFasta.keys())

    count = 1
    # Visit the sequence IDs ordered by their dicoSize value, largest first.
    for ID in sorted(dicoSize.keys(), key=dicoSize.get, reverse=True):
        lenSeq = dicoSize[ID]

        sequence = dicoFasta[ID]
        # Base name of the output file, first '_'-separated token, with
        # '.fasta' removed. Only referenced by the commented-out line below
        # within this excerpt.
        strain = outputfilename.split('/')[-1].split('_')[0].replace(
            '.fasta', '')
        seqName = f"Scaffold_{count}"
        #seqName = '%s_%s'%(strain,seqName)
        descrip = "length={}".format(lenSeq)
        # Only consider sequences with more than 20 characters that are not
        # an upper-case 'N' (lower-case 'n' is not counted by this test).
        if str(sequence.seq).count('N') < (len(str(sequence.seq)) - 20):
            # NOTE(review): `and` binds tighter than `or`, so this reads as
            # (keep == 'g' and lenSeq >= lenSize) or (...). The rest of the
            # condition is cut off in this excerpt.
            if keep == 'g' and lenSeq >= lenSize or (keep == 'l'
########### Directory handling ##############
	# verifFichier is defined elsewhere; presumably it validates that the
	# given path exists / is readable - TODO confirm.
	verifFichier(gff)
	verifFichier(fasta)
	# Base name of the GFF path with every '.gz' substring removed.
	# NOTE(review): `nameGFF` is not used in the visible part of this excerpt.
	nameGFF = gff.split('/')[-1].replace('.gz','')


####################### main #################
	# Start and stop codons; the original comment says they are used to evaluate the annotation
	start_codon = 'ATG'
	stop_codon = ['TGA','TAA','TAG']
	# Dictionaries holding CDS and gene information (start, stop and strand)
	dico_cds = collections.defaultdict(dict)
	dico_gene = collections.defaultdict(list)
	# Set to True below when the AUGUSTUS header comment is found in the GFF
	augustus = False
	fasta_dico = fasta2dict(fasta)
	liste_scaff = fasta_dico.keys()
	# Initialise dico_cds with one defaultdict(list) per key of the FASTA dictionary
	for elt in liste_scaff :
		dico_cds[elt] = collections.defaultdict(list)
	# Read the GFF to collect the CDS and gene information
	with open(gff,'rt') as gff_file :
		for line in gff_file :
			if '# This output was generated with AUGUSTUS' in line :
				augustus = True
			# Skip comment lines.
			# NOTE(review): a blank line ('\n') passes this test and then fails
			# on tabLine[2] with IndexError - confirm inputs never contain one.
			if line[0] != '#' :
				tabLine = line.split('\t')
				# Third tab-separated column.
				# NOTE(review): the name `type` shadows the builtin.
				type = tabLine[2]
				# Parse gene information
				if type == 'gene' :
					# First ';'-separated field of the last column, with 'ID=' removed.
					# NOTE(review): the name `id` shadows the builtin; this line also
					# mixes a space into its tab indentation.
				 	id = tabLine[-1].strip().split(';')[0].replace('ID=','')
Esempio n. 3
0
	# Paths of the CDS, protein and merged GFF3 files, all named after `folder`
	# inside `directory`.
	cdsFile = directory +"/"+ folder + '_cds.fna'
	protFile = directory +"/"+ folder + '_protein.faa'
	gffFile = directory +"/"+ folder + '_merge.gff3'
	# Load the whole GFF3 file into memory as a list of lines.
	f = open(gffFile,'r')
	lines = f.readlines()
	f.close()
	print('Creation dico GFF')
	# Fill dico_Gff (created outside this excerpt) with ID -> (position, tool).
	for line in lines :
		# NOTE(review): substring test - it matches 'mRNA' anywhere in the line,
		# not only in the feature-type column.
		if 'mRNA' in line :
			lineSplit = line.split('\t')
			# Text after the last 'ID=' and before ';Parent=' in the 9th column.
			ids = lineSplit[8].split('ID=')[-1].split(';Parent=')[0]
			# 'pos=<col1>_<col4>:<col5>' built from the 1st, 4th and 5th columns.
			position =  'pos=%s_%s:%s'%(lineSplit[0],lineSplit[3],lineSplit[4])
			# 2nd column of the GFF line.
			tools = lineSplit[1]
			dico_Gff[ids] = (position,tools)
	print('Creation dico des fasta')
	# fasta2dict is defined elsewhere; the values are used below through `.seq`.
	dico_cds = fasta2dict(cdsFile)
	dico_prot = fasta2dict(protFile)
	# Rewrite every CDS record into a '.fasta' file, with position, tool and
	# sequence length added to the description.
	# NOTE(review): str.replace swaps every '.fna' occurrence in the path, not
	# only the extension.
	f = open(cdsFile.replace('.fna','.fasta'),'w')
	for idSeq in sorted(dico_cds.keys(), key=sort_human):
		# NOTE(review): raises KeyError if a CDS ID has no entry in dico_Gff.
		position = dico_Gff[idSeq][0]
		tools = dico_Gff[idSeq][1]
		length = len(str(dico_cds[idSeq].seq))
		seqObj = dico_cds[idSeq].seq
		record = SeqRecord(seqObj,id=idSeq,name=idSeq, description='| %s | %s | %s ' %(position,tools,length))
		SeqIO.write(record,f, "fasta")
	f.close()
	# Same treatment for the protein records (loop body is cut off in this excerpt).
	f = open(protFile.replace('.faa','.fasta'),'w')
	for idSeq in sorted(dico_prot.keys(), key=sort_human):
		# The GFF entry is looked up after replacing every '0P' with '0T' in the ID.
		# presumably protein IDs differ from mRNA IDs only by that suffix - TODO confirm.
		position = dico_Gff[idSeq.replace('0P','0T')][0]
		tools = dico_Gff[idSeq.replace('0P','0T')][1]
		length = len(str(dico_prot[idSeq].seq))
Esempio n. 4
0
	# Print the welcome banner; `form` is a text-formatting helper defined
	# elsewhere that is given colour/style names such as 'yellow' and 'bold'.
	print(form("\n\t---------------------------------------------------------",'yellow','bold'))
	print("\t"+form("|",'yellow','bold')+form("       Welcome in QualityAssemblage (Version " + version + ")       ",type='bold')+form("|",'yellow','bold'))
	print(form("\t---------------------------------------------------------",'yellow','bold')+'\n')

################## Main ################################
	# Write a tab-separated report to outFile, processing every '.fasta' file
	# found in `directory`.
	with open(outFile,'w') as f :
		# Header row of the report.
		f.write('n\tn:500\tL50\tmin\tN80\tN50\tN20\tE-size\tmax\tsum\tname\n')
		for files in os.listdir(directory):
			if files.endswith('.fasta') :
				print(files+ ' in process')

				# NOTE(review): plain string concatenation - this only yields a valid
				# path if `directory` ends with a path separator; confirm it is
				# normalised before this point.
				Pathfile = directory+files
				# isFasta and recupId are defined elsewhere.
				isFasta(Pathfile)
				strain = recupId(Pathfile.split('/')[-1])
				dico_fasta = fasta2dict(Pathfile)
				# Reset the per-file accumulators before scanning the sequences.
				lengthGenome = 0
				nbScaffold = 0
				lengthN50 = 0
				lengthN80 = 0
				lengthN20 = 0
				Esize = 0
				L50 = 0
				n500 = 0
				first = True




				# Count the sequences of the file (loop body is cut off in this excerpt).
				for elt in dico_fasta.values():
					nbScaffold += 1
Esempio n. 5
0
        nbGeneA = 0
        nbGeneB = 0
        lengthGene = 0
        lengthGenome = 0
        for line in lines:
            if line[0] != '#':
                typeLine = line.split('\t')[2]
                typeAnnotation = line.split('\t')[1]
                posStart = line.split('\t')[3]
                posEnd = line.split('\t')[4]
                length = int(posEnd) - int(posStart)

                if typeLine == 'gene' and typeAnnotation == 'AUGUSTUS_BGPI':
                    nbGeneA += 1
                    lengthGene = lengthGene + length
                if typeLine == 'gene' and typeAnnotation == 'BRAKER':
                    nbGeneB += 1
                    lengthGene = lengthGene + length
        fastaPath = genome + '/' + name + '.fasta'
        dico_fasta = fasta2dict(fastaPath)
        for elt in dico_fasta.values():
            lengthGenome = lengthGenome + len(elt.seq)

        f.write('%s\t%s\t%s\t%s\t%s\t%s\n' %
                (name.center(10), str(nbGeneA).center(10),
                 str(nbGeneB).center(10), str(nbGeneB + nbGeneA).center(10),
                 str(round(lengthGene / (nbGeneA + nbGeneB), 2)).center(10),
                 str(round(lengthGenome / 1000000, 2)).center(10)))
        print('%s Done' % name)
f.close()