Python runBlastParser Examples

Programming Language: Python

Namespace/Package Name: CommonFastaFunctions

Method/Function: runBlastParser

Examples at hotexamples.com: 33

Python runBlastParser - 33 examples found. These are the top-rated real-world Python examples of CommonFastaFunctions.runBlastParser, extracted from open-source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: BSRfunctions.py Project: bfrgoncalves/Allele_Calling

def getBlastScoreRatios(FASTAfile, allelescore, queryDef, databasePath, queryProteomeName, referenceGenomeArray, referenceCDS, isXML):
    """Run BLASTp for the query proteome and parse the hits.

    ``isXML`` is compared against the *string* ``'True'`` (not the boolean).
    That value selects XML output (``outfmt=5``), handled by
    ``runBlastParser`` and ``parseBLASTRecordsXML``; any other value selects
    tabular output (``outfmt=6``), handled by ``runBlastParserTAB`` and
    ``parseBLASTRecordsTAB``.

    ``FASTAfile`` is accepted for interface compatibility but is not used.
    The query file ``queryProteomeName`` is deleted before returning.

    Returns whatever the selected ``parseBLASTRecords*`` helper returns.
    """
    # Only the output file, the output format and the two helpers differ
    # between the XML and the tabular code paths.
    if isXML == 'True':
        blast_out_file = 'BLASTresults.xml'
        outfmt = 5
        run_blast = runBlastParser
        parse_records = parseBLASTRecordsXML
    else:
        blast_out_file = 'BLASTresults.tab'
        outfmt = 6
        run_blast = runBlastParserTAB
        parse_records = parseBLASTRecordsTAB

    cline = NcbiblastpCommandline(query=queryProteomeName, db=databasePath, out=blast_out_file, outfmt=outfmt, num_alignments=7000, num_descriptions=7000)

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('BSR:')
    blast_records = run_blast(cline, blast_out_file, False)

    startTime = datetime.now()
    ToNewAllele = parse_records(blast_records, allelescore, queryDef, referenceGenomeArray, referenceCDS)
    print('CheckResults:' + str(datetime.now() - startTime))

    os.remove(queryProteomeName)

    return ToNewAllele

Example #2

Show file

File: callAlleles_protein2.py Project: mickaelsilva/bacterial_wgMLST

def reDogetBlastScoreRatios(genefile,basepath,alleleI,allelescores2,newGene_Blast_DB_name,alleleList2,picklepath):
    """BLAST ``genefile`` against an existing database and record the scores.

    ``genefile`` is used directly as the BLASTp query; the XML output goes to
    ``<basepath>/blastdbs/temp.xml``. The integer score of every HSP of every
    alignment is appended to ``allelescores2`` (mutated in place), and
    ``[alleleI + 1, allelescores2]`` is pickled to ``picklepath``.

    Returns ``(int(alleleI + 1), allelescores2, alleleList2)``;
    ``alleleList2`` is passed through untouched.
    """
    alleleI += 1

    print("Re-starting Blast alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    blast_out_file2 = os.path.join(basepath, 'blastdbs/temp.xml')
    cline = NcbiblastpCommandline(query=genefile, db=newGene_Blast_DB_name, evalue=0.001, out=blast_out_file2, outfmt=5)
    blast_records = runBlastParser(cline, blast_out_file2, genefile)

    print("Blasted alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            for match in alignment.hsps:
                allelescores2.append(int(match.score))

    # pickle.dump returns None, so its result is deliberately not bound.
    with open(picklepath, 'wb') as f:
        pickle.dump([alleleI, allelescores2], f)

    return int(alleleI), allelescores2, alleleList2

Example #3

Show file

File: BSRfunctions.py Project: bfrgoncalves/Allele_Calling

def getOwnBlastScore(FASTAfile, databasePath, queryProteomeName, numberOfLocus, blastResultsPath, LocusToUse, queryFile):
    """BLAST a locus against the query database built from it and parse the result.

    ``CreateQueryDatabase`` is called first; when it reports ``isEmpty`` the
    function returns immediately with empty scores/dictionaries and no BLAST
    run. Otherwise BLASTp writes XML to
    ``<blastResultsPath>/<numberOfLocus>_BLASTresults.xml``, the records are
    handed to ``parseOwnBLASTRecordsAndDuplicates`` and the resulting allele
    list to ``translateAlleleList``.

    Before returning, the database side files (``.pin``, ``.phr``, ``.psq``,
    ``_blast.log``) and the BLAST output file are deleted.

    Returns ``(allelescores, isEmpty, proteinsToQueryFile, alleleNumbers,
    sameAlleles, prevAlleleName)``.
    """
    allelescores = []
    alleleNumbers = {}
    sameAlleles = {}

    databasePath, isEmpty, proteinsToQueryFile, queryAlleleList, prevAlleleName = CreateQueryDatabase(FASTAfile, databasePath, queryProteomeName)

    if isEmpty:
        return allelescores, isEmpty, proteinsToQueryFile, alleleNumbers, sameAlleles, prevAlleleName

    blast_out_file = blastResultsPath + '/' + numberOfLocus + '_BLASTresults.xml'

    cline = NcbiblastpCommandline(query=queryProteomeName, db=databasePath, out=blast_out_file, outfmt=5, num_alignments=7000, num_descriptions=7000)
    blast_records = runBlastParser(cline, blast_out_file, False)

    allelescores, alleleList, alleleNumbers, sameAlleles = parseOwnBLASTRecordsAndDuplicates(blast_records, FASTAfile, queryAlleleList)

    proteinsToQueryFile = translateAlleleList(alleleList, queryProteomeName, LocusToUse, queryFile)

    # Remove the temporary database files, then the BLAST output itself.
    for suffix in (".pin", ".phr", ".psq", "_blast.log"):
        os.remove(databasePath + suffix)
    os.remove(blast_out_file)

    return allelescores, isEmpty, proteinsToQueryFile, alleleNumbers, sameAlleles, prevAlleleName

Example #4

Show file

def getBlastScoreRatios(allelescore, alleleList, databasePath,
                        queryProteomeName, referenceGenomeArray, referenceCDS,
                        bestmatches, referenceCDSsequences, referenceFileName,
                        countNumberOfGenomes, blastResultsPath, LocusToUse):
    """Run BLASTp (XML output) and delegate interpretation of the records.

    The query ``queryProteomeName`` is searched against ``databasePath``; the
    records returned by ``runBlastParser`` are passed, together with the
    allele/reference arguments, to ``parseBLASTRecordsXML``. The BLAST output
    file is deleted afterwards.

    Returns ``(resultsList, addNewAlleles)`` as produced by
    ``parseBLASTRecordsXML``.
    """
    # NOTE(review): no path separator is inserted here, so blastResultsPath
    # presumably already ends with one -- confirm against the caller.
    blast_out_file = blastResultsPath + countNumberOfGenomes + '_BLASTresults.xml'

    cline = NcbiblastpCommandline(query=queryProteomeName,
                                  db=databasePath,
                                  out=blast_out_file,
                                  outfmt=5,
                                  num_alignments=7000,
                                  num_descriptions=7000)

    blast_records = runBlastParser(cline, blast_out_file, False)

    resultsList, addNewAlleles = parseBLASTRecordsXML(
        blast_records, allelescore, alleleList, referenceGenomeArray,
        referenceCDS, bestmatches, referenceCDSsequences, referenceFileName,
        LocusToUse)

    os.remove(blast_out_file)

    return resultsList, addNewAlleles

Example #5

Show file

File: SearchAll.py Project: B-UMMI/ProGenViZ

def getOwnBlastScore(FASTAfile):
    """Translate every allele in ``FASTAfile``, BLAST the proteins and collect scores.

    Alleles whose translation raises are skipped. The translated sequences are
    written, with identical content, to ``pathRef + 'allAllelesAA.fasta'`` and
    ``pathRef + nameOrg + 'proteome.fasta'``; a BLAST database is created from
    the former and the latter is used as the BLASTp query.

    ``pathRef``, ``nameOrg``, ``name`` and ``blast_out_file`` are not defined
    in this function and must exist in the enclosing module.

    Returns a dict mapping ``str(alignment.hit_def)`` to an integer HSP score.
    """
    fasta_entries = []
    for allele in HTSeq.FastaReader(FASTAfile):
        try:
            protein = str(translateSeq(allele.seq))
        except Exception:
            # Best effort: an allele that cannot be translated is left out.
            continue
        fasta_entries.append(">" + str(allele.name) + "\n" + protein + "\n")
    proteome = "".join(fasta_entries)

    with open(pathRef + 'allAllelesAA.fasta', "wb") as f:
        f.write(proteome)
    with open(pathRef + nameOrg + 'proteome.fasta', "wb") as v:
        v.write(proteome)

    # Called for its side effect of building the database; the returned name
    # is not used below.
    Create_Blastdb(pathRef + 'allAllelesAA.fasta', 1, True)

    # NOTE(review): db=name refers to a module-level ``name``, not to the
    # value returned by Create_Blastdb above -- confirm this is intended.
    cline = NcbiblastpCommandline(query=pathRef + nameOrg + 'proteome.fasta', db=name, out=blast_out_file, outfmt=5, num_alignments=7000, num_descriptions=7000)
    blast_records = runBlastParser(cline, blast_out_file, proteome)

    allelescores = {}
    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            hit = str(alignment.hit_def)
            for match in alignment.hsps:
                # Stop at the first HSP that is new for this hit or beats the
                # stored score; lower-scoring HSPs are stepped over.
                if hit not in allelescores or allelescores[hit] < match.score:
                    allelescores[hit] = int(match.score)
                    break
    return allelescores

Example #6

Show file

File: ParalogRemove.py Project: mickaelsilva/pythonscripts

def main():
    """Split an ffn file into per-gene FASTA files, dropping paralogs and short genes.

    Command line: ``-i <ffn file>`` and ``-g <int minimum size>``.

    The file is BLASTed (blastn, XML output next to the input with an ``.xml``
    extension) against a database built from itself. For every query that has
    a second alignment, the ``hit_def`` of that second alignment is recorded
    as a paralog. Each sequence whose ``"<name> <descr>"`` is not a recorded
    paralog and whose length exceeds the threshold is written to
    ``<input dir>/<name with '|' replaced by '_'>.fasta`` under the header
    ``>1``. Counts of removed sequences are printed at the end.
    """
    parser = argparse.ArgumentParser(description="Given an ffn file, recovers the genes that are not paralogs and have a size bigger than the g parameter provided")
    parser.add_argument('-i', nargs='?', type=str, help='ffn file', required=True)
    parser.add_argument('-g', nargs='?', type=int, help='int minimum size', required=True)

    args = parser.parse_args()
    genes = args.i
    sizethresh = args.g

    geneFile = os.path.abspath(genes)
    Gene_Blast_DB_name = Create_Blastdb(geneFile, 1)
    blast_out_file = os.path.splitext(geneFile)[0] + '.xml'

    # ------------------------------ RUNNING BLAST ------------------------------ #

    cline = NcbiblastnCommandline(query=geneFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
    blast_records = runBlastParser(cline, blast_out_file, geneFile)

    # A set gives constant-time membership tests in the filtering loop below.
    paralogs = set()
    for blast_record in blast_records:
        try:
            paralogs.add(blast_record.alignments[1].hit_def)
        except IndexError:
            # Fewer than two alignments: nothing to record for this query.
            continue

    pathfiles = os.path.dirname(geneFile) + "/"
    print(pathfiles)

    removedparalogs = 0
    removedsize = 0
    for contig in HTSeq.FastaReader(genes):
        name = contig.name + " " + contig.descr
        if name in paralogs:
            print(name)
            removedparalogs += 1
        elif int(len(contig.seq)) > sizethresh:
            namefile = contig.name.replace("|", "_")
            with open(pathfiles + namefile + ".fasta", "wb") as f:
                f.write(">1\n" + contig.seq + "\n")
        else:
            removedsize += 1
    print("Removed %s paralog genes" % str(removedparalogs))
    print("Removed %s because of size :" % str(removedsize))

Example #7

Show file

File: callAlleles_protein.py Project: mickaelsilva/pythonscripts

def reDogetBlastScoreRatios(genefile,basepath,alleleI,allelescores2,newGene_Blast_DB_name,alleleList2):
    """BLAST ``genefile`` against an existing database and record self-hit scores.

    ``genefile`` is used directly as the BLASTp query; XML output goes to
    ``<basepath>/blastdbs/temp.xml``. For each BLAST record, the first
    alignment whose ``hit_def`` equals the record's ``query`` (both compared
    as integers) contributes the integer score of its first HSP to
    ``allelescores2`` (mutated in place). Records, alignments and HSPs are
    printed as they are visited.

    Returns ``(alleleI + 1, allelescores2, alleleList2)``; ``alleleList2`` is
    passed through untouched.
    """
    alleleI += 1

    print(genefile)
    blast_out_file2 = os.path.join(basepath, 'blastdbs/temp.xml')
    # --- get BLAST score ratio --- #
    cline = NcbiblastpCommandline(query=genefile, db=newGene_Blast_DB_name, evalue=0.001, out=blast_out_file2, outfmt=5)
    blast_records = runBlastParser(cline, blast_out_file2, genefile)

    for blast_record in blast_records:
        found = False
        print(blast_record)
        for alignment in blast_record.alignments:
            # Formatted explicitly so the two items print space-separated on
            # both Python 2 and Python 3.
            print("%s %s" % (alignment, alignment.hsps))
            if found:
                break
            for match in alignment.hsps:
                print(match)
                if int(alignment.hit_def) == int(blast_record.query):
                    allelescores2.append(int(match.score))
                    found = True
                    break
    return alleleI, allelescores2, alleleList2

Example #8

Show file

File: callAlleles_protein.py Project: mickaelsilva/pythonscripts

def getBlastScoreRatios(genefile,basepath):
    """Translate the alleles of ``genefile``, BLAST them against themselves and collect self-hit scores.

    Each allele is numbered from 1 and its translation (first element of the
    triple returned by ``translateSeq``) is written under the header
    ``>number`` to ``<basepath>/<basename>_protein.fasta``. A BLAST database
    is created from that file, which is also the BLASTp query; XML output goes
    to ``<basepath>/blastdbs/<basename>.xml``. For each BLAST record, the
    first alignment whose ``hit_def`` equals the record's ``query`` (compared
    as integers) contributes the integer score of its first HSP.

    Returns ``(alleleI, allelescores, Gene_Blast_DB_name, alleleList)``:
    the number of alleles read, the collected scores, the database name and
    the untranslated allele sequences.
    """
    alleleI = 0
    allelescores = []
    alleleList = []
    protein_entries = []
    for allele in HTSeq.FastaReader(genefile):
        alleleI += 1
        alleleList.append(allele.seq)
        translatedSequence, _, _ = translateSeq(allele.seq)
        protein_entries.append(">" + str(alleleI) + "\n" + str(translatedSequence + "\n"))
    # Joined once instead of growing a string inside the loop.
    alleleProt = "".join(protein_entries)

    proteinfastaPath = os.path.join(basepath, str(os.path.basename(genefile) + '_protein.fasta'))
    blast_out_file = os.path.join(basepath, "blastdbs/" + os.path.basename(genefile) + '.xml')
    with open(proteinfastaPath, "wb") as f:
        f.write(alleleProt)

    print("Starting Blast alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    Gene_Blast_DB_name = Create_Blastdb(proteinfastaPath, 1, True)
    print(proteinfastaPath)
    # --- get BLAST score ratio --- #
    cline = NcbiblastpCommandline(query=proteinfastaPath, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)

    print("Parse bsr blast at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    blast_records = runBlastParser(cline, blast_out_file, alleleProt)

    print("Blasted alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            # Alignments without HSPs are skipped before hit_def is parsed.
            if alignment.hsps and int(alignment.hit_def) == int(blast_record.query):
                allelescores.append(int(alignment.hsps[0].score))
                break
    return alleleI, allelescores, Gene_Blast_DB_name, alleleList

Example #9

Show file

File: callAlleles_protein3.py Project: ODiogoSilva/chewBBACA

def reDogetBlastScoreRatios(genefile, basepath, alleleI, allelescores2,
                            newGene_Blast_DB_name, alleleList2, picklepath,
                            verbose, blastPath, listAllelesNames):
    """BLAST ``genefile`` against an existing database and store one score.

    ``genefile`` is used directly as the BLASTp query (executable given by
    ``blastPath``, one thread); XML output goes to
    ``<basepath>/blastdbs/temp.xml``. The integer score of the last HSP
    visited (0 when there is none) is stored at ``allelescores2[alleleI]``,
    and ``allelescores2`` is then pickled to ``picklepath``.

    When ``verbose`` is truthy, progress messages are printed.

    Returns ``(allelescores2, alleleList2, listAllelesNames)``; the last two
    are passed through untouched.
    """
    if verbose:

        def verboseprint(*args):
            # One space-separated line per call. The former ``print(arg),``
            # followed by a bare ``print`` only produced that on Python 2.
            print(" ".join(str(arg) for arg in args))
    else:
        verboseprint = lambda *a: None  # do-nothing function

    verboseprint("Starting Blast of new alleles to calculate BSR at : " +
                 time.strftime("%H:%M:%S-%d/%m/%Y"))

    blast_out_file2 = os.path.join(basepath, 'blastdbs/temp.xml')

    cline = NcbiblastpCommandline(cmd=blastPath,
                                  query=genefile,
                                  db=newGene_Blast_DB_name,
                                  evalue=0.001,
                                  out=blast_out_file2,
                                  outfmt=5,
                                  num_threads=1)
    blast_records = runBlastParser(cline, blast_out_file2)

    verboseprint("Blasted alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    # Every HSP overwrites the previous value, so the last one visited wins.
    matchscore = 0
    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            for match in alignment.hsps:
                matchscore = int(match.score)

    allelescores2[alleleI] = matchscore
    with open(picklepath, 'wb') as f:
        pickle.dump(allelescores2, f)

    return allelescores2, alleleList2, listAllelesNames

Example #10

Show file

File: checkNeighbors.py Project: bfrgoncalves/NeighbourGenes

def BLASTp(queryFile, dbName, blast_out_path, queryNames, sequenceLengths):
    """Return the ``hit_def`` of the best-scoring sufficiently identical hit.

    BLASTp is run for ``queryFile`` against ``dbName`` with XML output in
    ``<blast_out_path>/blastOut.xml``. Each record's query name (with leading
    and trailing ``|`` stripped) is looked up in ``queryNames`` to find its
    length in ``sequenceLengths``. Only HSPs whose identities/query-length
    ratio is at least 0.8 are considered; among them the one with the highest
    score across all records wins.

    Returns an empty string when no HSP qualifies.
    """
    xml_path = os.path.join(blast_out_path, 'blastOut.xml')
    blastp_cmd = NcbiblastpCommandline(query=queryFile, db=dbName, out=xml_path, outfmt=5)
    records = runBlastParser(blastp_cmd, xml_path, "")

    best_hit = ''
    best_score = -1
    for record in records:
        query_position = queryNames.index(record.query.strip('|'))
        query_length = sequenceLengths[query_position]
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                identity_ratio = float(hsp.identities) / float(query_length)
                if identity_ratio >= 0.8 and best_score < hsp.score:
                    best_hit = alignment.hit_def
                    best_score = hsp.score

    return best_hit

Example #11

Show file

File: BSRfunctions.py Project: bfrgoncalves/Allele_Calling

def getBlastScoreRatios(allelescore, alleleList, databasePath, queryProteomeName, referenceGenomeArray, referenceCDS, bestmatches, referenceCDSsequences,referenceFileName, countNumberOfGenomes, blastResultsPath, LocusToUse):
    """Run BLASTp (XML output) and delegate interpretation of the records.

    The query ``queryProteomeName`` is searched against ``databasePath``; the
    records returned by ``runBlastParser`` are passed, together with the
    allele/reference arguments, to ``parseBLASTRecordsXML``. The BLAST output
    file is deleted afterwards.

    Returns ``(resultsList, addNewAlleles)`` as produced by
    ``parseBLASTRecordsXML``.
    """
    # NOTE(review): no path separator is inserted here, so blastResultsPath
    # presumably already ends with one -- confirm against the caller.
    blast_out_file = blastResultsPath + countNumberOfGenomes + '_BLASTresults.xml'

    cline = NcbiblastpCommandline(query=queryProteomeName, db=databasePath, out=blast_out_file, outfmt=5, num_alignments=7000, num_descriptions=7000)

    blast_records = runBlastParser(cline, blast_out_file, False)

    resultsList, addNewAlleles = parseBLASTRecordsXML(blast_records, allelescore, alleleList, referenceGenomeArray, referenceCDS, bestmatches, referenceCDSsequences, referenceFileName, LocusToUse)

    os.remove(blast_out_file)

    return resultsList, addNewAlleles

Example #12

Show file

def getOwnBlastScore(FASTAfile, databasePath, queryProteomeName, numberOfLocus,
                     blastResultsPath, LocusToUse, queryFile):
    """BLAST a locus against the query database built from it and parse the result.

    ``CreateQueryDatabase`` is called first; when it reports ``isEmpty`` the
    function returns immediately with empty scores/dictionaries and no BLAST
    run. Otherwise BLASTp writes XML to
    ``<blastResultsPath>/<numberOfLocus>_BLASTresults.xml``, the records are
    handed to ``parseOwnBLASTRecordsAndDuplicates`` and the resulting allele
    list to ``translateAlleleList``.

    Before returning, the database side files (``.pin``, ``.phr``, ``.psq``,
    ``_blast.log``) and the BLAST output file are deleted.

    Returns ``(allelescores, isEmpty, proteinsToQueryFile, alleleNumbers,
    sameAlleles, prevAlleleName)``.
    """
    allelescores = []
    alleleNumbers = {}
    sameAlleles = {}

    databasePath, isEmpty, proteinsToQueryFile, queryAlleleList, prevAlleleName = CreateQueryDatabase(
        FASTAfile, databasePath, queryProteomeName)

    if isEmpty:
        return allelescores, isEmpty, proteinsToQueryFile, alleleNumbers, sameAlleles, prevAlleleName

    blast_out_file = blastResultsPath + '/' + numberOfLocus + '_BLASTresults.xml'

    cline = NcbiblastpCommandline(query=queryProteomeName,
                                  db=databasePath,
                                  out=blast_out_file,
                                  outfmt=5,
                                  num_alignments=7000,
                                  num_descriptions=7000)

    blast_records = runBlastParser(cline, blast_out_file, False)

    allelescores, alleleList, alleleNumbers, sameAlleles = parseOwnBLASTRecordsAndDuplicates(
        blast_records, FASTAfile, queryAlleleList)

    proteinsToQueryFile = translateAlleleList(alleleList, queryProteomeName,
                                              LocusToUse, queryFile)

    # Remove the temporary database files, then the BLAST output itself.
    for suffix in (".pin", ".phr", ".psq", "_blast.log"):
        os.remove(databasePath + suffix)
    os.remove(blast_out_file)

    return allelescores, isEmpty, proteinsToQueryFile, alleleNumbers, sameAlleles, prevAlleleName

Example #13

Show file

File: alleleCalling_ORFbased_protein.py Project: mickaelsilva/pythonscripts

def getBlastScoreRatios(genefile):
    """Translate the alleles of ``genefile``, BLAST them against themselves and collect self-hit scores.

    Each allele is numbered from 1 and its translation is written under the
    header ``>number`` to ``./blastdbs/temp<basename>/protein.fasta`` (the
    directory is created when missing). A BLAST database is created from that
    file, which is also the BLASTp query. For each BLAST record, the first
    alignment whose ``hit_def`` equals the record's ``query`` (compared as
    integers) contributes the integer score of its first HSP.

    Returns ``(alleleI, allelescores, Gene_Blast_DB_name)``: the number of
    alleles read, the collected scores and the database name.
    """
    alleleI = 0
    allelescores = []
    protein_entries = []
    for allele in HTSeq.FastaReader(genefile):
        alleleI += 1
        protein_entries.append(">" + str(alleleI) + "\n" + str(translateSeq(allele.seq) + "\n"))
    # Joined once instead of growing a string inside the loop.
    alleleProt = "".join(protein_entries)

    basepath = "./blastdbs/temp" + str(os.path.basename(genefile))
    if not os.path.exists(basepath):
        os.makedirs(basepath)
    protein_fasta = basepath + '/protein.fasta'
    # NOTE(review): there is no '/' before 'protein.xml', so the BLAST output
    # is written next to the temp directory rather than inside it.
    blast_xml = basepath + 'protein.xml'

    with open(protein_fasta, "wb") as f:
        f.write(alleleProt)
    Gene_Blast_DB_name = Create_Blastdb(protein_fasta, 1, True)
    # --- get BLAST score ratio --- #
    cline = NcbiblastpCommandline(query=protein_fasta, db=Gene_Blast_DB_name, evalue=0.001, out=blast_xml, outfmt=5)
    blast_records = runBlastParser(cline, blast_xml, alleleProt)

    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            # Alignments without HSPs are skipped before hit_def is parsed.
            if alignment.hsps and int(alignment.hit_def) == int(blast_record.query):
                allelescores.append(int(alignment.hsps[0].score))
                break
    return alleleI, allelescores, Gene_Blast_DB_name

Example #14

Show file

def reDogetBlastScoreRatios(genefile, basepath, alleleI, allelescores2,
                            newGene_Blast_DB_name, alleleList2, picklepath):
    """BLAST ``genefile`` against an existing database and record the scores.

    ``genefile`` is used directly as the BLASTp query; the XML output goes to
    ``<basepath>/blastdbs/temp.xml``. The integer score of every HSP of every
    alignment is appended to ``allelescores2`` (mutated in place), and
    ``[alleleI + 1, allelescores2]`` is pickled to ``picklepath``.

    Returns ``(int(alleleI + 1), allelescores2, alleleList2)``;
    ``alleleList2`` is passed through untouched.
    """
    alleleI += 1

    print("Re-starting Blast alleles at : " +
          time.strftime("%H:%M:%S-%d/%m/%Y"))

    blast_out_file2 = os.path.join(basepath, 'blastdbs/temp.xml')

    cline = NcbiblastpCommandline(query=genefile,
                                  db=newGene_Blast_DB_name,
                                  evalue=0.001,
                                  out=blast_out_file2,
                                  outfmt=5)
    blast_records = runBlastParser(cline, blast_out_file2, genefile)

    print("Blasted alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            for match in alignment.hsps:
                allelescores2.append(int(match.score))

    # pickle.dump returns None, so its result is deliberately not bound.
    with open(picklepath, 'wb') as f:
        pickle.dump([alleleI, allelescores2], f)

    return int(alleleI), allelescores2, alleleList2

Example #15

Show file

File: alleleCalling_ORFbased_protein.py Project: mickaelsilva/pythonscripts

def callAlleles(argumentList):
    """Call the allele of one locus in each genome from protein BLAST score ratios.

    ``argumentList`` holds, in order: the gene (locus) FASTA path, the list of
    genome paths, one protein FASTA string per genome, and one mapping per
    genome that is looked up with ``">" + <query name>`` to obtain the CDS
    sequence appended for a new allele.

    For every genome its proteins are BLASTed against the locus database and
    each HSP score is divided by the self-score of the allele it hit
    (``allelescores[int(hit_def) - 1]``). The outcome per genome is:

    * ``'LNF3:-1'`` / ``'LNF'``   -- no HSP qualified;
    * ``'EXC:<id>'`` / ``'<id>'`` -- an HSP reached a score ratio of exactly 1;
    * ``'INF<n>'`` / ``'INF-<n>'`` -- otherwise (best ratio above 0.4): the
      CDS is appended to the gene file as allele ``n = alleleI + 1`` and the
      database and self-scores are rebuilt.

    The working directory ``./blastdbs/temp<gene basename>`` is removed at the
    end.

    Returns ``(resultsList, perfectMatchIdAllele)``, one entry per genome.
    """
    geneFile = argumentList[0]
    genomesList = argumentList[1]
    listOfProts = argumentList[2]
    listAllCDS = argumentList[3]

    resultsList = []
    perfectMatchIdAllele = []

    alleleI, allelescores, Gene_Blast_DB_name = getBlastScoreRatios(geneFile)

    # Loop-invariant paths. Binding basepath before the loop also keeps the
    # final rmtree from hitting an unbound name when listOfProts is empty.
    basepath = "./blastdbs/temp" + str(os.path.basename(geneFile))
    query_fasta = basepath + '/proteinList.fasta'
    # NOTE(review): there is no '/' before 'proteinList.xml', so the BLAST
    # output lands next to basepath and is not covered by the rmtree below.
    blast_xml = basepath + 'proteinList.xml'

    for genome, protList in enumerate(listOfProts):
        # score, score ratio, perfect match, query (CDS) name, allele ID.
        # Reset for every genome so that a match found in one genome cannot
        # leak into the result reported for the next one.
        bestmatch = [0, 0, False, '', 0]

        with open(query_fasta, "wb") as f:
            f.write(protList)
        cline = NcbiblastpCommandline(query=query_fasta, db=Gene_Blast_DB_name, evalue=0.001, out=blast_xml, outfmt=5)
        blast_records = runBlastParser(cline, blast_xml, query_fasta)

        for blast_record in blast_records:
            cdsStrName = blast_record.query
            for alignment in blast_record.alignments:
                for match in alignment.hsps:
                    alleleId = int(alignment.hit_def)
                    scoreRatio = float(match.score) / float(allelescores[alleleId - 1])
                    if scoreRatio == 1 and (bestmatch[2] is False or match.score > bestmatch[0]):
                        # First perfect match, or a higher-scoring perfect match.
                        bestmatch = [match.score, scoreRatio, True, cdsStrName, alleleId]
                    elif (match.score > bestmatch[0] and scoreRatio > 0.4
                          and scoreRatio > bestmatch[1] and bestmatch[2] is False):
                        # Better imperfect candidate; never replaces a perfect match.
                        bestmatch = [match.score, scoreRatio, False, cdsStrName, alleleId]

        if bestmatch[0] == 0:
            ###################
            # LOCUS NOT FOUND #
            ###################
            resultsList.append('LNF3:-1')
            perfectMatchIdAllele.append('LNF')
            print("Locus not found, no matches \n")

        elif bestmatch[2] is True:
            ################################################
            # EXACT MATCH --- MATCH == GENE --- GENE FOUND #
            ################################################
            perfectMatchIdAllele.append(str(bestmatch[4]))
            resultsList.append('EXC:' + str(bestmatch[4]))

        else:
            #######################
            # ADD INFERRED ALLELE #
            #######################
            tagAux = 'INF'
            newAlleleId = str(alleleI + 1)
            perfectMatchIdAllele.append(tagAux + "-" + newAlleleId)
            print("New allele! Adding allele " + tagAux + newAlleleId + " to the database\n")
            resultsList.append(tagAux + newAlleleId)

            # --- add the new allele to the gene fasta --- #
            # Build both lines before opening the file, so a failed lookup
            # cannot leave a header without a sequence behind.
            header = '>allele_' + newAlleleId + '_' + tagAux[:-1] + "_" + str(os.path.basename(genomesList[genome])) + '\n'
            sequence = listAllCDS[genome][">" + bestmatch[3]] + '\n'
            with open(geneFile, 'a') as fG:
                fG.write(header)
                fG.write(sequence)

            # --- remake blast DB --- #
            Gene_Blast_DB_name = Create_Blastdb(basepath + '/protein.fasta', 1, True)
            alleleI, allelescores, Gene_Blast_DB_name = getBlastScoreRatios(geneFile)

    shutil.rmtree(basepath)

    return (resultsList, perfectMatchIdAllele)

Example #16

Show file

File: alleleCalling_old.py Project: mickaelsilva/pythonscripts

def callAlleles(argumentList):

	geneFile = argumentList[0]
	genomesList = argumentList[1]
	listOfCDSDicts = argumentList[2]
	listOfGenomesDict = argumentList[3]
	
	gene_fp = HTSeq.FastaReader(geneFile)
	geneDict = {}
	alleleI = 1
	inverted=False
	orderedAlleleNames=[]
	biggestAllelelen=0
	for allele in gene_fp:
		if allele.seq in geneDict:
			print "\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n", geneFile
		else:
			if len(allele.seq)>biggestAllelelen:
				biggestAllelelen=len(allele.seq)
			orderedAlleleNames.append(allele.name)
			geneDict[ allele.seq ] = alleleI
			alleleI += 1
	#print geneDict
	#print orderedAlleleNames

	# --- make 1st blast DB --- #

	Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 )
	geneF = os.path.splitext( geneFile )[0]
	blast_out_file = geneF + '.xml'

	# list of results - the output of the function
	resultsList = []
	i = 0
	perfectMatchIdAllele=[]
	
	for genomeFile in genomesList:
		#print geneDict
		currentCDSDict = listOfCDSDicts[i]
		currentGenomeDict = listOfGenomesDict[i]
		
		#print genomeFile
		#print resultsList
		#print geneDict
		#print orderedAlleleNames
		i+=1		# it has to be incremented here
		if genomeFile[-1] == '\n':
			genomeFile = genomeFile[:-1]


                # ------------------------------ RUNNING BLAST ------------------------------ #

		cline = NcbiblastnCommandline(query=genomeFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
		blast_records = runBlastParser(cline, blast_out_file, genomeFile)


		# ------ DETERMINING BEST MATCH ------ #

		# bestMatch = ['rec.query','hsp', lenRatio]
		bestMatch = ['','', 0]
		bestMatchContig=''
		bestMatchContigLen=''
		bestalignlen=0
		perfectMatch=False
		bmAlleleLen2=0
		bmAllele=''
		#noAlignment=False
		for blast_record in blast_records:
			# --- the LNF cases are now called outside de loop --- #
			#print blast_record
			if perfectMatch==True:
				break
			try:
				#print blast_record.alignments
				hspC = blast_record.alignments[0]
				
				if bestMatch[0] == '' and bestMatch[1] == '':
					bestMatch[0] = blast_record.query
					bestMatch[1] = hspC
			except IndexError:
				continue


			# --- the contig tag is used in the progigal function --- #

			contigTag = blast_record.query
			# --- brute force parsing of the contig tag - better solution is advisable --- #			

			j=0
			for l in contigTag:
				if l == ' ':
					break
				j+=1

			contigTag = contigTag[:j]

			contigLen = blast_record.query_letters
			#print blast_record.query_id
			
			# --- iterating over all the results to determine the best match --- #
			for alignment in blast_record.alignments:
				
				index=orderedAlleleNames.index(alignment.hit_def)
				#print alignment.hit_def
				for k, v in geneDict.iteritems():
					if v == index+1:
						bmAlleleLen2= len(k)
					
				if perfectMatch:
					break
				for match in alignment.hsps:
					#print match
					scoreRatio = float(match.score) / float(bmAlleleLen2)
					
					#print alignment.hit_def
					#print match.identities
					#print bmAlleleLen2
					#print
					#print match.identities
					#print len(match.sbjct)

					
					if (int(match.identities)==int(bmAlleleLen2) and int(match.identities)==int(len(match.sbjct)) and "N" not in match.query ): 
						index=orderedAlleleNames.index(alignment.hit_def)
						bmAlleleLen= len(geneDict.keys()[index])
						
						lenratio=float(len(match.query))/float(bmAlleleLen)
						bestMatch = [blast_record.query, match, scoreRatio, alignment.hit_def,lenratio,bmAlleleLen]
						bestMatchContig=contigTag
						perfectMatch=True
						index=orderedAlleleNames.index(alignment.hit_def)
						bmAlleleLen= len(geneDict.keys()[index])
						break
						
					elif scoreRatio > bestMatch[2]:
						index=orderedAlleleNames.index(alignment.hit_def)
						bmAllele=geneDict.keys()[index]
						bmAlleleLen= len(bmAllele)
						lenratio=float(len(match.query))/float(bmAlleleLen)
						bestMatch = [blast_record.query, match, scoreRatio, alignment.hit_def,lenratio,bmAlleleLen]
						bestMatchContig=contigTag
						bestMatchContigLen=blast_record.query_letters
						if match.sbjct_start > match.sbjct_end:
							inverted=True
						#print match.query
						bestalignlen=alignment.length
						
						
					
					if perfectMatch==True:
						break


		# ---------- ALLELE CALLING AFTER DETERMINING BEST MATCH ---------- #

		
		
		try:
			#print bestMatch[0]
			match = bestMatch[1]
			#print match.query
			geneLen = bestMatch[5]
			alleleStr = match.query
			nIdentities = match.identities
			idPercent = float(nIdentities) / float(geneLen)
			scoreRatio = bestMatch[2]
			lenRatio = bestMatch[4]
			#print perfectMatch
			#print "\nContig best/exact match is :"
			#print bestMatchContig +"\n"
		
		except:
			resultsList.append('LNF3:-1')            # append result to the list of results
			perfectMatchIdAllele.append('LNF')
			printinfo(genomeFile,geneFile)
			print "Locus not found \n"
			continue
		
		#TODO check identities >0.8
		
		if perfectMatch is True:
			#TODO perfect match to top
			if match.sbjct_start > match.sbjct_end:
				alleleStr = reverseComplement(alleleStr)
			#TODO test replace -
			#alleleStr = alleleStr.replace('-', '')
			alleleNumber = geneDict[ alleleStr ]
			
					################################################
					# EXACT MATCH --- MATCH == GENE --- GENE FOUND #
					################################################
			if "_" in bestMatch[3]:
				a=bestMatch[3].split("_")
				perfectMatchIdAllele.append(a[1])
			else:
				perfectMatchIdAllele.append(bestMatch[3])
			resultsList.append('EXC:' + str(alleleNumber) )
		
						###################
						# LOCUS NOT FOUND #
						###################
			
		#elif bestMatch[0] == '':
		#	resultsList.append('LNF:-1')            # append result to the list of results
		#	perfectMatchIdAllele.append('LNF')
		#	printinfo(genomeFile,geneFile)
		#	print "Locus not found \n"

		elif bestMatch[0] != '' and perfectMatch is not True:
						
				

					###########################
					# LOCUS ON THE CONTIG TIP #
					###########################
			
			
			#if match.query_start == 1 or bestMatchContigLen <= match.query_end:
			## TODO-
			## 1 -  LOT5 match.query_start ==1 and match.length < match.subj.length (allele length) alignement length
			## 2 - LOT 3' match.query_end == match.query.length (contig length) and match.length < contig length (allele length??)
			## 3 - LOT SC bestMatchContigLen <= allele length
			
			if match.query_start ==1 and len(match.query) < geneLen:
			
				resultsList.append('LOT5:-1')
				perfectMatchIdAllele.append('LOT5')
				printinfo(genomeFile,geneFile)
				
				print "Locus is on the 5' tip of the contig \n"
			
			
			elif match.query_end == bestMatchContigLen and len(match.query) < bestMatchContigLen:
				
				resultsList.append('LOT3:-1')
				perfectMatchIdAllele.append('LOT3')
				printinfo(genomeFile,geneFile)

				print "Locus is on the 3' tip of the contig \n"
			
			elif bestMatchContigLen <= geneLen:
				
				resultsList.append('LOTSC:-1')
				perfectMatchIdAllele.append('LOTSC')
				printinfo(genomeFile,geneFile)
				#print match.query_start
				print "Locus is bigger than the contig \n"
					
					
				

			elif 'N' in alleleStr:
				#TODO gravar para ficheiro
					#####################
					# ALLELE NOT FOUND  #		# N base found!
					#####################
				
				geneFile2= os.path.splitext(geneFile)[0] + "LNFN.fasta"
				print geneFile2
				with open(geneFile2, 'a') as f:
					f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+"\n")
					f.write((alleleStr) +"\n")
				resultsList.append('LNFN:-1')
				perfectMatchIdAllele.append('LNFN')
				printinfo(genomeFile,geneFile) 
				print "LNFN, contains N bases! \n"
			
			
			
			else:
				# ------------------------------------------------------------------------------------------------------- #
				#                                                                                                         #
				#                                   USING PRODIGAL TO TRY TO EXTEND CDS                                   #
				#                                                                                                         #
				# ------------------------------------------------------------------------------------------------------- #
				
				CDSType=''
				extended, strCDS, CDSType = extendCDS(bestMatchContig, currentCDSDict, match.query_start, match.query_end, currentGenomeDict, geneLen)
				# --- if it was possible to extend it using prodigal --- #
				
				#print extended
				#print strCDS
				#print CDSType
				

				if extended :
					alleleStr = strCDS
					
					lenRatio = float(len(strCDS)) / float(geneLen)
					#print alleleStr
					#print lenRatio
				elif not extended and biggestAllelelen > geneLen:
					extended, strCDS, CDSType = extendCDS(bestMatchContig, currentCDSDict, match.query_start, match.query_end, currentGenomeDict, biggestAllelelen)
					if extended :
						alleleStr = strCDS
					
						lenRatio = float(len(strCDS)) / float(geneLen)
					else:
						alleleStr = alleleStr.replace('-', '')
				
				
				else:
					# --- removing gaps '-' --- #
				#print alleleStr
					
					alleleStr = alleleStr.replace('-', '')

				# --- continuing the allele calling --- #

					
			
					#print geneDict
					#print alleleStr
					# --- it might be needed to obtain the reverse complement of the allele string --- #
				if match.sbjct_start > match.sbjct_end:
					alleleStr = reverseComplement(alleleStr)
					
				if alleleStr in geneDict:
					alleleNumber = geneDict[ alleleStr ]
					
					################################################
					# EXACT MATCH --- MATCH == GENE --- GENE FOUND #						
					################################################
					perfectMatchIdAllele.append(alleleNumber)
					resultsList.append('EXC:' + str(alleleNumber) )
						

				else:

					isUndefined = False	
					#print geneDict.keys()[0]
					defAllele=''
					defAlleleName=''
					for k in geneDict.keys():
						if alleleStr in k:
							defAllele=k
							#print alleleStr
							isUndefined = True
							defAlleleName=geneDict.get(k)
							break

						
					if extended and isUndefined and idPercent > 0.8 and ((int(len(match.query))==int(len(defAllele)) or int(len(match.query))==int(len(defAllele))+1 or int(len(match.query))==int(len(defAllele))-1)) :
						#extended allele to compare may be different from the allele to compare from bm	
						alleleStr=match.query
							
						alleleStr = alleleStr.replace('-', '')
							
						if match.sbjct_start > match.sbjct_end:    #### - error??
							alleleStr = reverseComplement(alleleStr)
							
						if int(len(alleleStr))==int(len(defAllele)): # se o match for do mesmo tamanho que o alello
							tagAux = 'NA1:'
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append("NA1-"+str(alleleI))
							
						elif int(len(alleleStr))==int(len(defAllele))-1 : # se o match tiver uma base a mais que o alelo
							tagAux = 'NA2:'
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append("NA2-"+str(alleleI))
							
						else:												#se o match tiver uma base a menos que o alelo
							tagAux = 'NA3:'
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append("NA3-"+str(alleleI))
							
								
						print "New allele found! Adding allele "+ tagAux + str(alleleI) +" to the database"
						geneDict[alleleStr] = alleleI
							
						resultsList.append( tagAux + str(alleleI) )
							
						orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile)))									
						# --- add the new allele to the gene fasta --- #
							
						fG = open( geneFile, 'a' )
						fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile)) + '\n')
						fG.write( alleleStr + '\n')
						fG.close()
						alleleI += 1
						Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 )
					
					elif not extended and idPercent > 0.8 and ((int(len(match.query))==int(geneLen) or int(len(match.query))==int(geneLen)+1 or int(len(match.query))==int(geneLen)-1)) :
							
						alleleStr=match.query
							
						alleleStr = alleleStr.replace('-', '')
							
						if match.sbjct_start > match.sbjct_end:    #### - error??
							alleleStr = reverseComplement(alleleStr)
							
						if int(len(alleleStr))==int(geneLen): # se o match for do mesmo tamanho que o alello
							tagAux = 'NA4:'
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append("NA4-"+str(alleleI))
							
						elif int(len(alleleStr))==int(geneLen)-1 : # se o match tiver uma base a mais que o alelo
							tagAux = 'NA5:'
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append("NA5-"+str(alleleI))
							
						else:												#se o match tiver uma base a menos que o alelo
							tagAux = 'NA6:'
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append("NA6-"+str(alleleI))
							
								
						print "New allele found! Adding allele "+ tagAux + str(alleleI) +" to the database"
						geneDict[alleleStr] = alleleI
							
						resultsList.append( tagAux + str(alleleI) )
							
						orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile)))									
						# --- add the new allele to the gene fasta --- #
							
						fG = open( geneFile, 'a' )
						fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile)) + '\n')
						fG.write( alleleStr + '\n')
						fG.close()
						alleleI += 1
						Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 )
					
							
					elif isUndefined:

							####################
							# UNDEFINED ALLELE #		# it is contained in another allele
							####################
							
						alleleStr=match.query
						#if match.sbjct_start > match.sbjct_end:    #### - error
							#alleleStr = reverseComplement(alleleStr)
						resultsList.append('UND:-1')
						perfectMatchIdAllele.append("undefined allele")
						printinfo(genomeFile,geneFile) 
						print "Undefined allele \n"
						
						geneFile2= os.path.splitext(geneFile)[0] + "undefined.fasta"
						print geneFile2
						with open(geneFile2, 'a') as f:
							f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
							f.write((alleleStr) +"\n")
							#f.write(">BlastBestMatch"+str(defAlleleName)+"\n")
							#f.write((alleleStr)+"\n")
							f.write(">Allele"+str(defAlleleName)+"\n")
							f.write((defAllele)+"\n")
						
					
						
						
						
					else:
						if not extended :
							
								
							if lenRatio < 0.5:
							
									###############
									# SMALL MATCH #
									###############
									
								resultsList.append('SAC:-1')		# don't know what 'SAC' stands for
								perfectMatchIdAllele.append('small match')
								printinfo(genomeFile,geneFile) 
								print "lower than 50% match \n"	
								
							elif lenRatio < 0.8 and idPercent < 0.5:

								#####################
								# INCOMPLETE ALLELE #		# it was not possible to extend it to at least 80% of the length of the gene
								#####################

								resultsList.append('INC:-1')
								perfectMatchIdAllele.append('allele incomplete')
								printinfo(genomeFile,geneFile)
								print "Incomplete allele\n"
							
							else:	
								##################
								# LNF WTFFF #
								##################
								geneFile2= os.path.splitext(geneFile)[0] + "LNF2.fasta"
								print geneFile2
								with open(geneFile2, 'a') as f:
									f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
									f.write((alleleStr) +"\n")
									f.write(">Allele\n")
									f.write((bmAllele)+"\n")
								resultsList.append('LNF2')
								printinfo(genomeFile,geneFile) 
								perfectMatchIdAllele.append("LNF2")
								print "Not extended and no allele found"

						else:

								#######################
								# ADD INFERRED ALLELE #		# a new allele that was extended with prodigal
								#######################
							if(CDSType=='larger than match'):
								tagAux = 'INF1:'
							elif(CDSType=='start codon inside match'):
								tagAux = 'INF2:'
							elif(CDSType=='early stop codon in match'):
								tagAux = 'INF3:'
							elif(CDSType=='same size as allele'):
								tagAux = 'INF4:'
							else:
								tagAux = 'INF5:'
								
							print "infered allele has location : "+(CDSType)
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append( tagAux +"-"+str(alleleI))
							print "New allele Infered with prodigal! Adding allele "+ tagAux + str(alleleI) +" to the database\n"
							
							
								
							geneDict[alleleStr] = alleleI
								
							resultsList.append( tagAux + str(alleleI) )
								
							orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile)))	
							# --- add the new allele to the gene fasta --- #

							fG = open( geneFile, 'a' )
							fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile)) + '\n')
							#print alleleStr
							fG.write( alleleStr + '\n')
							fG.close()
							alleleI += 1
							

							# --- remake blast DB --- #
							
							Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 )
							
	final =	(resultsList,perfectMatchIdAllele)	
	#return (resultsList)
	return final

Example #17

Show file

File: CreateSchema.py Project: mickaelsilva/bacterial_wgMLST

def main():
	"""Build a set of non-redundant loci from an ffn (gene FASTA) file.

	Command-line arguments:
		-i  path to the ffn file with the gene sequences.
		-g  minimum gene size; only genes strictly longer than this are written.

	Steps performed:
		1. Translate every gene with ``translateSeq`` and write the proteins
		   that are neither duplicated nor shorter than 67 residues to
		   ``proteins.fasta`` in the directory of the input file.
		2. Drop proteins whose sequence is contained in a longer kept protein
		   and rewrite ``proteins.fasta`` with the survivors.
		3. BLAST the protein file against itself and, using the score ratio
		   relative to the first hit (threshold 0.6), decide which genes to
		   keep and which to remove. Decisions are logged to ``logfile.txt``
		   in the current working directory.
		4. Write one ``<gene name>.fasta`` file per kept gene (header ``>1``)
		   plus ``concatenated.fasta`` next to ``proteins.fasta``.

	Progress and summary counts are printed to standard output.
	"""
	parser = argparse.ArgumentParser(description="Given an ffn file, recovers the genes that are not paralogs and have a size bigger than the g parameter provided")
	parser.add_argument('-i', nargs='?', type=str, help='ffn file', required=True)
	parser.add_argument('-g', nargs='?', type=int, help='int minimum size', required=True)

	args = parser.parse_args()
	genes = args.i
	sizethresh = args.g
	# Hard-coded switch: when True the protein file is assumed to exist
	# already and only the in-memory dictionaries are rebuilt.
	passSteps = False

	# The protein file lives in the same directory as the input file.
	# os.path.dirname is used instead of stripping the file name with
	# str.replace, which would also delete the name from parent directories.
	proteinfile = os.path.join(os.path.dirname(os.path.abspath(genes)), 'proteins.fasta')

	minProteinLen = 67	# shorter translated proteins are counted as small and dropped
	bsrThreshold = 0.6	# score ratio above which two genes are treated as the same locus

	geneDict = {}		# gene name -> DNA sequence
	protDict = {}		# ">name\n" header -> protein sequence
	alreadyIn = set()	# protein sequences already accepted (set: O(1) membership)
	totalgenes = 0
	repeatedgenes = 0
	smallgenes = 0
	g = 0				# proteins contained in another protein; printed in the final summary

	if not passSteps:
		print("not passing steps")
		with open(proteinfile, "wb") as f:
			for gene in HTSeq.FastaReader(genes):
				# Count every gene read so the summaries below are meaningful.
				totalgenes += 1
				protseq, _, _ = translateSeq(str(gene.seq))
				if len(protseq) > 1:
					protein = str(protseq)
					if protein in alreadyIn:
						repeatedgenes += 1
					elif len(protein) < minProteinLen:
						smallgenes += 1
					else:
						alreadyIn.add(protein)
						protname = ">" + str(gene.name) + "\n"
						f.write(protname + protein + "\n")
						protDict[protname] = protein
						geneDict[str(gene.name)] = gene.seq
				else:
					# No usable translation: report the gene name and move on.
					print(gene.name)

		print(str(repeatedgenes) + " repeated genes out of " + str(totalgenes))
		print(str(smallgenes) + " small genes out of " + str(totalgenes))
		print("protein file created")

		# first step - remove genes contained in other genes or 100% equal genes
		print("Checking if proteins are equal or substring of others...")

		# Longest proteins first, so a protein can only be contained in one
		# that has already been accepted.
		orderedProts = sorted(protDict.items(), key=lambda item: len(item[1]), reverse=True)

		auxDict = {}	# protein sequence -> header of the kept proteins
		auxprot = []
		for j, (header, prot) in enumerate(orderedProts):
			prot = str(prot)
			if any(prot in kept for kept in auxprot):
				g += 1
			else:
				auxDict[prot] = header
				auxprot.append(prot)
			print(str(j) + " out of " + str(len(orderedProts)))
		print("%s genes are contained in other genes" % (g))

		# overwrite the original file, obtaining a new file with unique genes
		with open(proteinfile, "wb") as f:
			f.write("".join(header + prot + "\n" for prot, header in auxDict.items()))

	else:
		# Rebuild the dictionaries only; the existing protein file is reused.
		for gene in HTSeq.FastaReader(genes):
			totalgenes += 1
			protseq, _, _ = translateSeq(str(gene.seq))
			if len(protseq) > 1:
				protein = str(protseq)
				if protein in alreadyIn:
					repeatedgenes += 1
				elif len(protein) < minProteinLen:
					smallgenes += 1
				else:
					alreadyIn.add(protein)
					protDict[">" + str(gene.name) + "\n"] = protein
					geneDict[str(gene.name)] = gene.seq
			else:
				print(gene.name)

	geneFile = os.path.abspath(proteinfile)
	print(proteinfile)
	Gene_Blast_DB_name = Create_Blastdb(geneFile, 1, True)

	blast_out_file = os.path.splitext(geneFile)[0] + '.xml'

	# ------------------------------ RUNNING BLAST ------------------------------ #
	# The protein file is blasted against the database built from itself.
	cline = NcbiblastpCommandline(query=geneFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
	blast_records = runBlastParser(cline, blast_out_file, geneFile)

	# Both are kept as lists on purpose: a name can be appended more than once
	# and list.remove() drops a single occurrence.
	toRemove = []
	genesToKeep = []
	log = ["removed\tcause\texplanation"]
	for blast_record in blast_records:

		allelename = blast_record.query.split(" ")[0]
		alleleLength = len(geneDict[allelename])

		try:
			query = str(blast_record.query)

			if query not in toRemove:
				# gene A is not on the toRemove list yet: tentatively keep it
				genesToKeep.append(blast_record.query)

				# if the first alignment is not against self, gene B is the
				# better representative - remove gene A and keep gene B instead
				bestHit = str(blast_record.alignments[0].hit_def)
				if query != bestHit:
					genesToKeep.remove(query)
					toRemove.append(query)
					log.append(query + "\t" + bestHit + "\t" + "2 is first best match")

					if bestHit not in toRemove:
						genesToKeep.append(bestHit)

					# nothing else to evaluate for this record
					continue

				selfblastscore = blast_record.alignments[0].hsps[0].score

				for align in blast_record.alignments:
					match = align.hsps[0]
					scoreRatio = float(match.score) / float(selfblastscore)

					hitName = str(align.hit_def)
					alleleLength2 = len(geneDict[hitName])

					# good match against another gene that was not removed yet
					if scoreRatio > bsrThreshold and hitName != query and hitName not in toRemove:

						if alleleLength2 > alleleLength:
							# gene B is bigger than gene A: keep B, drop A and
							# stop looking at the remaining hits of A
							genesToKeep.append(hitName)
							genesToKeep.remove(query)
							toRemove.append(query)
							log.append(query + "\t" + hitName + "\t" + "2 is bigger and bsr >0.6")
							break

						elif hitName in genesToKeep:
							# gene A is at least as long: drop gene B
							genesToKeep.remove(hitName)
							toRemove.append(hitName)
							log.append(hitName + "\t" + query + "\t" + "2 is bigger and bsr >0.6")

			else:
				# gene A was already removed: only report its first non-self hit.
				# NOTE(review): this branch used to continue with a loop meant to
				# add similar genes (score ratio > 0.6, not in genesToKeep) to
				# toRemove, but it divided by a self score that was still 0 when
				# the loop was reached, so it never changed anything. If that
				# behaviour is wanted, the self score has to come from the
				# self alignment - TODO confirm the intent.
				for align in blast_record.alignments:
					hitName = str(align.hit_def)
					if hitName != query:
						print("gene " + hitName + " is bigger than gene " + query)
						break

		except Exception:
			# Best effort, as before: a record that cannot be evaluated (no
			# alignments, a hit name missing from geneDict, ...) is skipped
			# and whatever was decided for it so far is kept.
			pass

	with open("logfile.txt", "wb") as f:
		f.writelines(str(elem) + "\n" for elem in log)

	# Only membership and sizes matter from here on.
	genesToKeep = set(genesToKeep)
	toRemove = set(toRemove)
	notcommonToKeep = [x for x in genesToKeep if x not in toRemove]
	print(len(toRemove))
	print(len(genesToKeep))
	print(len(notcommonToKeep))

	pathfiles = os.path.dirname(geneFile) + "/"

	removedparalogs = 0
	removedsize = 0
	totalgenes = 0
	rest = 0
	concatenated = []
	for contig in HTSeq.FastaReader(genes):
		totalgenes += 1

		if contig.name not in toRemove and contig.name in genesToKeep:
			if int(len(contig.seq)) > sizethresh:
				# "|" is replaced so the gene name can be used as a file name
				namefile = contig.name.replace("|", "_")
				with open(pathfiles + namefile + ".fasta", "wb") as f:
					f.write(">1\n" + contig.seq + "\n")
				rest += 1
				concatenated.append(">" + namefile + "\n" + contig.seq + "\n")
			else:
				removedsize += 1
		else:
			removedparalogs += 1

	print("%s genes are contained in other genes" % (g))
	print("Removed %s same Locus genes" % str(removedparalogs))
	print("Removed %s because of size " % str(removedsize))
	print("%s Scheme genes " % str(rest))
	print("total genes:" + str(totalgenes))

	with open(pathfiles + "concatenated.fasta", "wb") as f:
		f.write("".join(concatenated))

Example #18

Show file

def main():
    print("Starting script at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    try:
        input_file = sys.argv[1]
        temppath = sys.argv[2]
    except IndexError:
        print "usage: list_pickle_obj"

    argumentList = []
    with open(input_file, 'rb') as f:
        argumentList = pickle.load(f)

    geneFile = argumentList[0]
    genomesList = argumentList[1]

    basepath = os.path.join(temppath, os.path.splitext(geneFile)[0])
    if not os.path.exists(basepath):
        os.makedirs(basepath)

    gene_fp = HTSeq.FastaReader(geneFile)
    alleleI = 0

    resultsList = []
    i = 0
    perfectMatchIdAllele = []
    perfectMatchIdAllele2 = []
    allelescores = []

    print("Getting BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    geneScorePickle = os.path.abspath(geneFile) + '_bsr.txt'

    #check if bsr as arealdy been calculated and recalculate it

    if os.path.isfile(geneScorePickle):

        alleleI, allelescores, alleleList = getBlastScoreRatios(
            geneFile, basepath, False)

    else:
        alleleI, allelescores, alleleList = getBlastScoreRatios(
            geneFile, basepath, True)

    print("Finished BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    genome = -1

    genomeDict = {}
    print("starting allele call at: " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    for genomeFile in genomesList:
        print genomeFile
        bestmatch = [
            0, 0, False, '', 0
        ]  #score, score ratio, perfectmatch, key name of the DNA sequence string, allele ID
        currentGenomeDict = {}
        currentCDSDict = {}

        # load the translated CDS from the genome to a dictionary
        filepath = os.path.join(
            temppath,
            str(os.path.basename(genomeFile)) + "_ORF_Protein.txt")
        with open(filepath, 'rb') as f:
            currentCDSDict = pickle.load(f)

        #load the contig info of the genome to a dictionary
        g_fp = HTSeq.FastaReader(genomeFile)
        for contig in g_fp:
            sequence = str(contig.seq)
            genomeDict[contig.name] = sequence

        currentGenomeDict = genomeDict

        genome += 1
        listOfCDS = currentCDSDict
        genomeProteinfastaPath = os.path.join(
            temppath, str(os.path.basename(genomeFile) + '_Protein.fasta'))

        print("Blasting alleles on genome at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))

        blast_out_file = os.path.join(
            basepath, "blastdbs/" + os.path.basename(geneFile) + '_List.xml')

        Gene_Blast_DB_name = os.path.join(
            temppath,
            str(os.path.basename(genomeFile)) + "/" +
            str(os.path.basename(genomeFile)) + "_db")

        proteinfastaPath = os.path.join(
            basepath, str(os.path.basename(geneFile) + '_protein.fasta'))

        #blast the genome CDS against the translated locus
        cline = NcbiblastpCommandline(query=proteinfastaPath,
                                      db=Gene_Blast_DB_name,
                                      evalue=0.001,
                                      out=blast_out_file,
                                      outfmt=5)

        blast_records = runBlastParser(cline, blast_out_file, proteinfastaPath)
        print("Blasted alleles on genome at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))

        alleleSizes = []
        for allele in alleleList:
            alleleSizes.append(len(allele))

        biggestSizeAllele = 0

        moda = max(set(alleleSizes), key=alleleSizes.count)
        contador = Counter(alleleSizes).most_common()

        if (contador[0])[1] == 1:
            moda = alleleSizes[0]

        try:

            # iterate through the blast results
            for blast_record in blast_records:

                locationcontigs = []

                for alignment in blast_record.alignments:

                    # select the best match
                    for match in alignment.hsps:

                        alleleMatchid = str(
                            blast_record.query_id).split("_")[1]

                        scoreRatio = float(match.score) / float(
                            allelescores[int(alleleMatchid) - 1])

                        cdsStrName = ((alignment.title).split(" "))[1]

                        DNAstr = listOfCDS[">" + cdsStrName]

                        AlleleDNAstr = alleleList[int(alleleMatchid) - 1]
                        if len(AlleleDNAstr) > biggestSizeAllele:
                            biggestSizeAllele = len(AlleleDNAstr)

                        compare = False

                        #compare the DNA match and the allele DNA sequence (protein sequences may be equal and DNA different)
                        if DNAstr == AlleleDNAstr is False:
                            try:
                                DNAstr = reverseComplement(DNAstr)
                                if DNAstr == AlleleDNAstr is False:
                                    pass
                                else:
                                    compare = True
                            except:
                                pass
                        else:
                            compare = True

                        if scoreRatio > 0.6:
                            locationcontigs.append(cdsStrName)

                        if "N" in DNAstr or "K" in DNAstr or "R" in DNAstr:
                            pass

                        elif (scoreRatio == 1 and bestmatch[2] is False
                              and compare is True):
                            bestmatch = [
                                match.score, scoreRatio, True, cdsStrName,
                                int(alleleMatchid), match,
                                len(AlleleDNAstr)
                            ]

                        elif (scoreRatio == 1 and match.score > bestmatch[0]
                              and compare is True):
                            bestmatch = [
                                match.score, scoreRatio, True, cdsStrName,
                                int(alleleMatchid), match,
                                len(AlleleDNAstr)
                            ]

                        elif (scoreRatio == 1 and bestmatch[2] is False
                              and compare is False):
                            bestmatch = [
                                match.score, scoreRatio, False, cdsStrName,
                                int(alleleMatchid), match,
                                len(AlleleDNAstr)
                            ]

                        elif (scoreRatio == 1 and match.score > bestmatch[0]
                              and compare is False):
                            bestmatch = [
                                match.score, scoreRatio, False, cdsStrName,
                                int(alleleMatchid), match,
                                len(AlleleDNAstr)
                            ]

                        elif (match.score > bestmatch[0] and scoreRatio > 0.6
                              and scoreRatio > bestmatch[1]
                              and bestmatch[2] is False):
                            bestmatch = [
                                match.score, scoreRatio, False, cdsStrName,
                                int(alleleMatchid), match,
                                len(AlleleDNAstr)
                            ]

            print("Classifying the match at : " +
                  time.strftime("%H:%M:%S-%d/%m/%Y"))

            #if no best match was found it's a Locus Not Found
            if bestmatch[
                    0] == 0 or "N" in AlleleDNAstr or "K" in AlleleDNAstr or "R" in AlleleDNAstr:

                ###################
                # LOCUS NOT FOUND #
                ###################
                if bestmatch[0] == 0:
                    resultsList.append('LNF3:-1')
                    perfectMatchIdAllele.append('LNF')
                    perfectMatchIdAllele2.append('LNF')
                    print "Locus not found, no matches \n"
                else:
                    resultsList.append('LNFN:-1')
                    perfectMatchIdAllele.append('LNF')
                    perfectMatchIdAllele2.append('LNF')
                    print "Locus has strange base (N, K or R) \n"

            #if more than one BSR >0.6 in two different CDSs it's a Non Paralog Locus
            elif len(list(set(locationcontigs))) > 1:
                resultsList.append('NIPL')
                perfectMatchIdAllele.append('NIPL')
                perfectMatchIdAllele2.append('NIPL')
                for elem in locationcontigs:
                    print elem

            #in case the DNA match sequence equal to the DNA sequence of the comparing allele
            elif bestmatch[2] is True:
                contigname = bestmatch[3]

                contigname = contigname.split("&")
                matchLocation = contigname[2]
                contigname = contigname[0]
                print contigname
                alleleStr = listOfCDS[">" + bestmatch[3]]
                protSeq, alleleStr, Reversed = translateSeq(alleleStr)

                #check for possible locus on tip
                match = bestmatch[5]
                matchLocation2 = matchLocation.split("-")
                seq = currentGenomeDict[contigname]
                bestMatchContigLen = len(seq)

                rightmatchContig = bestMatchContigLen - int(matchLocation2[1])
                leftmatchContig = int(matchLocation2[0])

                if Reversed:
                    aux = rightmatchContig
                    rightmatchContig = leftmatchContig
                    leftmatchContig = aux

                # get extra space to the right and left between the allele and match

                possibleExtra = int(moda) - ((int(match.query_end) * 3) -
                                             (int(match.query_start) * 3))

                if possibleExtra < 0:
                    perfectMatchIdAllele.append(str(bestmatch[4]))
                    if not Reversed:
                        perfectMatchIdAllele2.append(
                            str(contigname) + "&" + str(matchLocation) + "&" +
                            "+")
                    else:
                        perfectMatchIdAllele2.append(
                            str(contigname) + "&" + str(matchLocation) + "&" +
                            "-")
                    resultsList.append('EXC:' + str(bestmatch[4]))

                else:
                    rightmatchAllele = possibleExtra
                    leftmatchAllele = possibleExtra

                    if leftmatchContig < leftmatchAllele and rightmatchContig < rightmatchAllele:

                        resultsList.append('PLOTSC:-1')
                        perfectMatchIdAllele.append('PLOTSC')
                        perfectMatchIdAllele2.append('PLOTSC')

                        print match
                        print "contig extras (l,r)"
                        print leftmatchContig, rightmatchContig
                        print "allele extras (l,r)"
                        print leftmatchAllele, rightmatchAllele

                        print "Locus is possibly bigger than the contig \n"

                    elif leftmatchContig < leftmatchAllele:

                        resultsList.append('PLOT3:-1')
                        perfectMatchIdAllele.append('PLOT3')
                        perfectMatchIdAllele2.append('PLOT3')

                        print match
                        print "contig extras (l,r)"
                        print leftmatchContig, rightmatchContig
                        print "allele extras (l,r)"
                        print leftmatchAllele, rightmatchAllele

                        print "Locus is possibly on the 3' tip of the contig \n"

                    elif rightmatchContig < rightmatchAllele:

                        resultsList.append('PLOT5:-1')
                        perfectMatchIdAllele.append('PLOT5')
                        perfectMatchIdAllele2.append('PLOT5')

                        print match
                        print "contig extras (l,r)"
                        print leftmatchContig, rightmatchContig
                        print "allele extras (l,r)"
                        print leftmatchAllele, rightmatchAllele

                        print "Locus is possibly on the 5' tip of the contig \n"

                    else:
                        #if a perfect match was found

                        ################################################
                        # EXACT MATCH --- MATCH == GENE --- GENE FOUND #
                        ################################################

                        perfectMatchIdAllele.append(str(bestmatch[4]))
                        if not Reversed:
                            perfectMatchIdAllele2.append(
                                str(contigname) + "&" + str(matchLocation) +
                                "&" + "+")
                        else:
                            perfectMatchIdAllele2.append(
                                str(contigname) + "&" + str(matchLocation) +
                                "&" + "-")
                        resultsList.append('EXC:' + str(bestmatch[4]))

            # if match with BSR >0.6 and not equal DNA sequences
            else:

                match = bestmatch[5]
                geneLen = bestmatch[6]

                contigname = bestmatch[3]

                contigname = contigname.split("&")
                matchLocation = contigname[2]
                matchLocation = matchLocation.split("-")
                contigname = contigname[0]

                seq = currentGenomeDict[contigname]
                bestMatchContigLen = len(seq)

                alleleStr = listOfCDS[">" + bestmatch[3]]
                protSeq, alleleStr, Reversed = translateSeq(alleleStr)

                rightmatchContig = bestMatchContigLen - int(matchLocation[1])
                leftmatchContig = int(matchLocation[0])

                if Reversed:
                    aux = rightmatchContig
                    rightmatchContig = leftmatchContig
                    leftmatchContig = aux

                print rightmatchContig, leftmatchContig

                # get extra space to the right and left between the allele and match and check if it's still inside the contig

                rightmatchAllele = geneLen - ((int(match.query_end) + 1) * 3)
                leftmatchAllele = ((int(match.query_start) - 1) * 3)

                ###########################
                # LOCUS ON THE CONTIG TIP #
                ###########################

                if leftmatchContig < leftmatchAllele and rightmatchContig < rightmatchAllele:

                    resultsList.append('LOTSC:-1')
                    perfectMatchIdAllele.append('LOTSC')
                    perfectMatchIdAllele2.append('LOTSC')
                    print match
                    print contigname
                    print geneFile
                    print leftmatchAllele, rightmatchAllele
                    print "Locus is bigger than the contig \n"

                elif leftmatchContig < leftmatchAllele:

                    resultsList.append('LOT3:-1')
                    perfectMatchIdAllele.append('LOT3')
                    perfectMatchIdAllele2.append('LOT3')
                    print match
                    print contigname
                    print geneFile
                    print leftmatchAllele, rightmatchAllele
                    print "Locus is on the 3' tip of the contig \n"

                elif rightmatchContig < rightmatchAllele:

                    resultsList.append('LOT5:-1')
                    perfectMatchIdAllele.append('LOT5')
                    perfectMatchIdAllele2.append('LOT5')
                    print match
                    print contigname
                    print geneFile
                    print leftmatchAllele, rightmatchAllele
                    print "Locus is on the 5' tip of the contig \n"

                elif len(alleleStr) > moda + (moda * 0.2):

                    print moda
                    print alleleStr
                    resultsList.append('ALM')
                    perfectMatchIdAllele.append('ALM')
                    perfectMatchIdAllele2.append('ALM')

                elif len(alleleStr) < moda - (moda * 0.2):

                    print moda
                    print alleleStr
                    resultsList.append('ASM')
                    perfectMatchIdAllele.append('ASM')
                    perfectMatchIdAllele2.append('ASM')

                else:
                    #######################
                    # ADD INFERRED ALLELE #		# a new allele
                    #######################

                    tagAux = 'INF'
                    perfectMatchIdAllele.append(tagAux + "-" +
                                                str(alleleI + 1))

                    if not Reversed:
                        perfectMatchIdAllele2.append(
                            str(contigname) + "&" + str(matchLocation[0]) +
                            "-" + str(matchLocation[1]) + "&" + "+")
                    else:
                        perfectMatchIdAllele2.append(
                            str(contigname) + "&" + str(matchLocation[0]) +
                            "-" + str(matchLocation[1]) + "&" + "-")

                    print "New allele! Adding allele " + tagAux + str(
                        alleleI + 1) + " to the database\n"

                    resultsList.append(tagAux + str(alleleI + 1))

                    # --- add the new allele to the gene fasta --- #

                    appendAllele = '>allele_' + str(
                        alleleI + 1) + '_' + tagAux[:-1] + "_" + str(
                            os.path.basename(genomesList[genome])) + '\n'
                    fG = open(geneFile, 'a')
                    fG.write(appendAllele)

                    fG.write(alleleStr + '\n')
                    fG.close()

                    fG = open(
                        os.path.join(
                            basepath,
                            str(
                                os.path.basename(geneFile) +
                                '_protein2.fasta')), 'w')
                    fG.write('>' + str(alleleI + 1) + '\n' + str(protSeq) +
                             '\n')
                    fG.close()
                    fG = open(
                        os.path.join(
                            basepath,
                            str(os.path.basename(geneFile) +
                                '_protein.fasta')), 'a')
                    fG.write('>' + str(alleleI + 1) + '\n' + str(protSeq) +
                             '\n')
                    fG.close()

                    match = bestmatch[5]

                    # --- remake blast DB and recalculate the BSR for the locus --- #
                    alleleList.append(alleleStr)
                    print os.path.join(
                        basepath,
                        str(os.path.basename(geneFile) + '_protein.fasta'))
                    genefile2 = os.path.join(
                        basepath,
                        str(os.path.basename(geneFile) + '_protein2.fasta'))
                    Gene_Blast_DB_name2 = Create_Blastdb(genefile2, 1, True)
                    print("Re-calculating BSR at : " +
                          time.strftime("%H:%M:%S-%d/%m/%Y"))
                    alleleI, allelescores, alleleList = reDogetBlastScoreRatios(
                        genefile2, basepath, alleleI, allelescores,
                        Gene_Blast_DB_name2, alleleList, geneScorePickle)
                    print "allele id " + str(alleleI)
                    print("Done Re-calculating BSR at : " +
                          time.strftime("%H:%M:%S-%d/%m/%Y"))

        except Exception as e:
            print "some error occurred"
            print e
            print 'Error on line {}'.format(sys.exc_info()[-1].tb_lineno)
            perfectMatchIdAllele2.append("ERROR")
            perfectMatchIdAllele.append("ERROR")
            resultsList.append('ERROR')

    final = (resultsList, perfectMatchIdAllele)
    print("Finished allele calling at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    filepath = os.path.join(temppath,
                            os.path.basename(geneFile) + "_result.txt")
    filepath2 = os.path.join(temppath,
                             os.path.basename(geneFile) + "_result2.txt")
    with open(filepath, 'wb') as f:
        pickle.dump(final, f)
    with open(filepath2, 'wb') as f:
        pickle.dump(perfectMatchIdAllele2, f)
    shutil.rmtree(basepath)
    return True

Example #19

Show file

File: SearchAll.py Project: B-UMMI/ProGenViZ

def getBlastScoreRatios(orgName,allelescores,cdsDict,prodigalPath):
    """Blast the organism proteome and collect, per query, the hits with a BSR above 0.6.

    A proteome FASTA is produced by CreateProteome / CreateProteomeContig,
    blasted through NcbiblastpCommandline + runBlastParser and deleted
    afterwards. For every BLAST record only the first HSP of each alignment
    is evaluated; its score divided by allelescores[str(alignment.hit_def)]
    is the BLAST score ratio (BSR).

    NOTE(review): the body reads isContig, nameOrg, pathRef, name,
    blast_out_file and genomeDB, none of which are defined in this function;
    they are presumably module-level globals -- confirm. The orgName
    parameter itself is never used (nameOrg is used instead).

    Args:
        orgName: not used by the body.
        allelescores: mapping looked up with str(alignment.hit_def); the
            value is the score the HSP score is divided by.
        cdsDict: forwarded to CreateProteomeContig when isContig is not "no".
        prodigalPath: path of a file that is opened and read; its content is
            not used afterwards.

    Returns:
        A tuple (bestMatches, queryCDS). bestMatches maps the record number
        (as a string, counting from 1) to the list
        [hit names, alignment lengths - 1, query starts, query lengths,
        query title]. queryCDS is the value returned by CreateProteomeContig,
        or None when isContig == "no".
    """
    # The file is still read so that a missing/unreadable path keeps failing
    # here, but the handle is now closed instead of being leaked.
    with open(prodigalPath, 'r') as Presults:
        Presults.readlines()

    alleleProt=''

    # queryCDS is only produced for contig input; binding it up front avoids
    # the UnboundLocalError the final return raised when isContig == "no".
    queryCDS=None
    if isContig=="no":
        CreateProteome(nameOrg)
    else:
        queryCDS = CreateProteomeContig(nameOrg,cdsDict)

    proteomePath=pathRef+nameOrg+'proteome.fasta'
    cline = NcbiblastpCommandline(query=proteomePath, db=name, out=blast_out_file, outfmt=5, num_alignments=7000, num_descriptions=7000)
    blast_records = runBlastParser(cline,blast_out_file, alleleProt)

    # the proteome is a temporary query file
    os.remove(proteomePath)

    countRecords=0
    bestMatches={}
    for blast_record in blast_records:
        countRecords+=1
        BestMatchResults=[]
        length=[]
        alignment_posStart=[]
        query_length=[]
        for alignment in blast_record.alignments:
            for match in alignment.hsps:
                blastScoreRatio = float(match.score) / float(allelescores[str(alignment.hit_def)])
                # the gene name is the 6th "|"-separated field of the title,
                # or the 3rd one for shorter titles
                try:
                    geneName=alignment.title.split("|")[5]
                except IndexError:
                    geneName=alignment.title.split("|")[2]
                # NOTE(review): the stored entries carry the genomeDB+"..."
                # prefix while this membership test uses the bare name, so it
                # never filters a duplicate -- confirm which form is intended.
                if geneName.strip() not in BestMatchResults and blastScoreRatio>0.6:
                    BestMatchResults.append(genomeDB+"..."+str(geneName).strip())
                    length.append(str(match.align_length-1))
                    alignment_posStart.append(str(match.query_start))
                    query_length.append(str(len(match.query)))
                # only the first HSP of each alignment is considered
                break
        bestMatches[str(countRecords)] = [BestMatchResults,length,alignment_posStart,query_length,str(blast_record.query)]
    print(countRecords)

    return bestMatches,queryCDS

Example #20

Show file

def getBlastScoreRatios(genefile, basepath, doAll):
    """Write the protein FASTA files of a locus and obtain the BLAST self-scores of its alleles.

    Every allele read from ``genefile`` is translated with translateSeq.
    Alleles whose translation is the empty string are kept in the returned
    allele list but get no protein entry and no score. Each translatable
    allele is written alone to ``<basepath>/<gene>_protein2.fasta`` and a
    BLAST database is created from that file.

    Args:
        genefile: path of the locus FASTA file.
        basepath: working directory for the protein FASTA files and the
            ``blastdbs/temp.xml`` BLAST output.
        doAll: when true, each allele is blasted against its own database,
            the HSP scores are appended to the score list and the pickle
            ``<abspath(genefile)>_bsr.txt`` is rewritten; when false the
            scores are taken from that pickle instead.

    Returns:
        (number of alleles read, list of scores, list of allele sequences).
    """
    fasta_records = HTSeq.FastaReader(genefile)
    locus_name = os.path.basename(genefile)
    single_allele_fasta = os.path.join(basepath,
                                       str(locus_name + '_protein2.fasta'))

    alleleI = 0
    allelescores = []
    alleleList = []
    protein_entries = []

    # one single-sequence database per allele, so it can be blasted against itself
    for allele in fasta_records:
        alleleI += 1
        alleleList.append(allele.seq)
        protein, _, _ = translateSeq(allele.seq)

        if protein == '':
            continue

        entry = ">" + str(alleleI) + "\n" + str(protein + "\n")
        protein_entries.append(entry)

        with open(single_allele_fasta, "wb") as handle:
            handle.write(entry)
        blast_db = Create_Blastdb(single_allele_fasta, 1, True)

        if not doAll:
            # scores were computed on an earlier run: read them back
            score_pickle = os.path.abspath(genefile) + '_bsr.txt'
            with open(score_pickle, 'rb') as handle:
                allelescores = pickle.load(handle)[1]
            continue

        blast_xml = os.path.join(basepath, 'blastdbs/temp.xml')
        print("Starting Blast alleles at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))

        # --- get BLAST score ratio --- #
        command = NcbiblastpCommandline(query=single_allele_fasta,
                                        db=blast_db,
                                        evalue=0.001,
                                        out=blast_xml,
                                        outfmt=5)
        records = runBlastParser(command, blast_xml, entry)

        print("Blasted alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

        allelescores.extend(
            int(hsp.score)
            for record in records
            for alignment in record.alignments
            for hsp in alignment.hsps)

        score_pickle = os.path.abspath(genefile) + '_bsr.txt'
        print("________")
        with open(score_pickle, 'wb') as handle:
            pickle.dump([alleleI, allelescores], handle)

    # all translated alleles of the locus go into one multi-FASTA
    all_alleles_fasta = os.path.join(basepath,
                                     str(locus_name + '_protein.fasta'))
    with open(all_alleles_fasta, "wb") as handle:
        handle.write("".join(protein_entries))

    return int(alleleI), allelescores, alleleList

Example #21

Show file

File: callAlleles_protein2.py Project: mickaelsilva/bacterial_wgMLST

def getBlastScoreRatios(genefile,basepath,doAll):
	"""Create the protein FASTA files of a locus and get the BLAST self-score of each allele.

	Each allele of ``genefile`` is translated with translateSeq; an allele
	whose translation is the empty string stays in the returned allele list
	but produces no protein entry and no score. Every translatable allele is
	written on its own to ``<basepath>/<gene>_protein2.fasta`` and a BLAST
	database is built from that file.

	genefile -- path of the locus FASTA file
	basepath -- working directory (protein FASTA files, blastdbs/temp.xml)
	doAll    -- true: blast each allele against its own database, append the
	            HSP scores and rewrite the pickle <abspath(genefile)>_bsr.txt;
	            false: read the scores back from that pickle

	Returns (number of alleles read, scores, allele sequences).
	"""
	alleleReader = HTSeq.FastaReader(genefile)
	locusName = os.path.basename(genefile)
	oneAlleleFasta = os.path.join(basepath, str(locusName+'_protein2.fasta'))

	alleleI = 0
	allelescores = []
	alleleList = []
	proteinEntries = []

	# a new db for each allele, to blast it against itself
	for allele in alleleReader:
		alleleI += 1
		alleleList.append(allele.seq)
		proteinSeq, _, _ = translateSeq(allele.seq)

		if proteinSeq == '':
			continue

		fastaEntry = ">"+str(alleleI)+"\n"+str(proteinSeq+"\n")
		proteinEntries.append(fastaEntry)

		with open(oneAlleleFasta, "wb") as out:
			out.write(fastaEntry)
		blastDb = Create_Blastdb(oneAlleleFasta, 1, True)

		if not doAll:
			# the scores already exist on disk: just load them
			scorePickle = os.path.abspath(genefile)+'_bsr.txt'
			with open(scorePickle, 'rb') as src:
				allelescores = pickle.load(src)[1]
			continue

		blastXml = os.path.join(basepath, 'blastdbs/temp.xml')
		print ("Starting Blast alleles at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

		# --- get BLAST score ratio --- #
		blastCmd = NcbiblastpCommandline(query=oneAlleleFasta, db=blastDb, evalue=0.001, out=blastXml, outfmt=5)
		records = runBlastParser(blastCmd, blastXml, fastaEntry)

		print ("Blasted alleles at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

		allelescores.extend(int(hsp.score) for record in records for alignment in record.alignments for hsp in alignment.hsps)

		scorePickle = os.path.abspath(genefile)+'_bsr.txt'
		print ("________")
		with open(scorePickle, 'wb') as dst:
			pickle.dump([alleleI, allelescores], dst)

	# every translated allele of the locus ends up in one multi-FASTA
	allAllelesFasta = os.path.join(basepath, str(locusName+'_protein.fasta'))
	with open(allAllelesFasta, "wb") as out:
		out.write("".join(proteinEntries))

	return int(alleleI), allelescores, alleleList

Example #22

Show file

File: callAlleles_protein2.py Project: mickaelsilva/bacterial_wgMLST

def main():
	print ("Starting script at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	try:
		input_file = sys.argv[1]
		temppath = sys.argv[2]
	except IndexError:
		print "usage: list_pickle_obj"

	argumentList=[]
	with open(input_file,'rb') as f:
		argumentList = pickle.load(f)
	
	geneFile = argumentList[0]
	genomesList = argumentList[1]
	
	basepath=os.path.join(temppath,os.path.splitext(geneFile)[0])
	if not os.path.exists(basepath):
			os.makedirs(basepath)

	gene_fp = HTSeq.FastaReader(geneFile)
	alleleI = 0

	resultsList = []
	i = 0
	perfectMatchIdAllele=[]
	perfectMatchIdAllele2=[]
	allelescores=[]
	
	print ("Getting BSR at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

	geneScorePickle=os.path.abspath(geneFile)+'_bsr.txt'
	
	#check if bsr as arealdy been calculated and recalculate it

	if os.path.isfile(geneScorePickle) :
		
		alleleI,allelescores,alleleList=getBlastScoreRatios(geneFile,basepath,False)
		
	else:	
		alleleI,allelescores,alleleList=getBlastScoreRatios(geneFile,basepath,True)
		
			
			
	print ("Finished BSR at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	genome=-1	
	
	genomeDict = {}
	print ("starting allele call at: "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	for genomeFile in genomesList:
		print genomeFile
		bestmatch=[0,0,False,'',0] #score, score ratio, perfectmatch, key name of the DNA sequence string, allele ID
		currentGenomeDict={}
		currentCDSDict={}
		
		# load the translated CDS from the genome to a dictionary
		filepath=os.path.join(temppath,str(os.path.basename(genomeFile))+"_ORF_Protein.txt")
		with open(filepath,'rb') as f:
			currentCDSDict = pickle.load(f)
		
		#load the contig info of the genome to a dictionary
		g_fp = HTSeq.FastaReader( genomeFile )
		for contig in g_fp:
			sequence=str(contig.seq)
			genomeDict[ contig.name ] = sequence
		
		currentGenomeDict = genomeDict

		genome+=1
		listOfCDS=currentCDSDict
		genomeProteinfastaPath=os.path.join(temppath,str(os.path.basename(genomeFile)+'_Protein.fasta'))
		
		print ("Blasting alleles on genome at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
		
		blast_out_file = os.path.join(basepath,"blastdbs/"+os.path.basename(geneFile)+ '_List.xml')

		Gene_Blast_DB_name = os.path.join(temppath,str(os.path.basename(genomeFile))+"/"+str(os.path.basename(genomeFile))+"_db")

		proteinfastaPath=os.path.join(basepath,str(os.path.basename(geneFile)+'_protein.fasta'))
		
		
		#blast the genome CDS against the translated locus
		cline = NcbiblastpCommandline(query=proteinfastaPath, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
			
		blast_records = runBlastParser(cline, blast_out_file, proteinfastaPath)
		print ("Blasted alleles on genome at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
		
		alleleSizes=[]
		for allele in alleleList:
			alleleSizes.append(len(allele))
		
		biggestSizeAllele=0
		
		moda=max(set(alleleSizes), key=alleleSizes.count)
		contador= Counter(alleleSizes).most_common()
		
		if (contador[0])[1] ==1:
			moda= alleleSizes[0]

		try:
			
			# iterate through the blast results
			for blast_record in blast_records:
					
				locationcontigs=[]
				
				for alignment in blast_record.alignments:
					
					# select the best match
					for match in alignment.hsps:
						
						alleleMatchid=str(blast_record.query_id).split("_")[1]
						
						scoreRatio=float(match.score)/float(allelescores[int(alleleMatchid)-1])

						cdsStrName=((alignment.title).split(" "))[1]
						
						DNAstr=listOfCDS[">"+cdsStrName]

						AlleleDNAstr=alleleList[int(alleleMatchid)-1]
						if len(AlleleDNAstr)>biggestSizeAllele:
							biggestSizeAllele=len(AlleleDNAstr)
							
						compare=False
						
						#compare the DNA match and the allele DNA sequence (protein sequences may be equal and DNA different)
						if DNAstr==AlleleDNAstr is False:
							try:
								DNAstr=reverseComplement(DNAstr)
								if DNAstr==AlleleDNAstr is False:
									pass
								else:
									compare=True
							except:
								pass
						else:
							compare=True
						
						if scoreRatio>0.6:
							locationcontigs.append(cdsStrName)
							
						if "N" in DNAstr or "K" in DNAstr or "R" in DNAstr:
							pass
							
						elif(scoreRatio == 1 and bestmatch[2] is False and compare is True):
							bestmatch=[match.score,scoreRatio,True,cdsStrName,int(alleleMatchid),match,len(AlleleDNAstr)]

						elif(scoreRatio == 1 and match.score>bestmatch[0] and compare is True):
							bestmatch=[match.score,scoreRatio,True,cdsStrName,int(alleleMatchid),match,len(AlleleDNAstr)]

						elif(scoreRatio == 1 and bestmatch[2] is False and compare is False):
							bestmatch=[match.score,scoreRatio,False,cdsStrName,int(alleleMatchid),match,len(AlleleDNAstr)]
						
						elif(scoreRatio == 1 and match.score>bestmatch[0] and compare is False):
							bestmatch=[match.score,scoreRatio,False,cdsStrName,int(alleleMatchid),match,len(AlleleDNAstr)]

						elif(match.score>bestmatch[0] and scoreRatio>0.6 and scoreRatio>bestmatch[1] and bestmatch[2] is False):
							bestmatch=[match.score,scoreRatio,False,cdsStrName,int(alleleMatchid),match,len(AlleleDNAstr)]
							
										
			print ("Classifying the match at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))		
			
			#if no best match was found it's a Locus Not Found
			if bestmatch[0]==0 or "N" in AlleleDNAstr or "K" in AlleleDNAstr or "R" in AlleleDNAstr :
						
						###################
						# LOCUS NOT FOUND #
						###################
				if 	bestmatch[0]==0:		
					resultsList.append('LNF3:-1')
					perfectMatchIdAllele.append('LNF')
					perfectMatchIdAllele2.append('LNF')
					print "Locus not found, no matches \n"
				else:
					resultsList.append('LNFN:-1')
					perfectMatchIdAllele.append('LNF')
					perfectMatchIdAllele2.append('LNF')
					print "Locus has strange base (N, K or R) \n"
			
			#if more than one BSR >0.6 in two different CDSs it's a Non Paralog Locus
			elif len(list(set(locationcontigs)))>1:
				resultsList.append('NIPL')            
				perfectMatchIdAllele.append('NIPL')
				perfectMatchIdAllele2.append('NIPL')
				for elem in locationcontigs:
					print elem
				
			
			#in case the DNA match sequence equal to the DNA sequence of the comparing allele
			elif bestmatch[2] is True:
				contigname=bestmatch[3]	
				
				contigname=contigname.split("&")
				matchLocation=contigname[2]	
				contigname=contigname[0]	
				print contigname
				alleleStr=listOfCDS[">"+bestmatch[3]]
				protSeq,alleleStr,Reversed=translateSeq(alleleStr)
				

				#check for possible locus on tip
				match=bestmatch[5]
				matchLocation2=matchLocation.split("-")			
				seq=currentGenomeDict[ contigname ]
				bestMatchContigLen=len(seq)
				
				rightmatchContig=bestMatchContigLen-int(matchLocation2[1])	
				leftmatchContig=int(matchLocation2[0])
				
				if Reversed:
					aux=rightmatchContig
					rightmatchContig=leftmatchContig
					leftmatchContig=aux
				
				
				
				
				
				# get extra space to the right and left between the allele and match
				
				possibleExtra=int(moda)-((int(match.query_end)*3)-(int(match.query_start)*3))
				
				if possibleExtra<0:
					perfectMatchIdAllele.append(str(bestmatch[4]))
					if not Reversed:
						perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation)+"&"+"+")
					else:
						perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation)+"&"+"-")
					resultsList.append('EXC:' + str(bestmatch[4]) )
				
				else:	
					rightmatchAllele=possibleExtra
					leftmatchAllele=possibleExtra
					
					if leftmatchContig<leftmatchAllele and 	rightmatchContig < rightmatchAllele:
				
						resultsList.append('PLOTSC:-1')
						perfectMatchIdAllele.append('PLOTSC')
						perfectMatchIdAllele2.append('PLOTSC')

						print match
						print "contig extras (l,r)"
						print leftmatchContig,rightmatchContig
						print "allele extras (l,r)"
						print leftmatchAllele,rightmatchAllele
						
						print "Locus is possibly bigger than the contig \n"
					
					elif leftmatchContig<leftmatchAllele:
						
						
						resultsList.append('PLOT3:-1')
						perfectMatchIdAllele.append('PLOT3')
						perfectMatchIdAllele2.append('PLOT3')
						
						print match
						print "contig extras (l,r)"
						print leftmatchContig,rightmatchContig
						print "allele extras (l,r)"
						print leftmatchAllele,rightmatchAllele
						
						print "Locus is possibly on the 3' tip of the contig \n"
					
					
					elif 	rightmatchContig < rightmatchAllele:
						
						resultsList.append('PLOT5:-1')
						perfectMatchIdAllele.append('PLOT5')
						perfectMatchIdAllele2.append('PLOT5')
						
						print match
						print "contig extras (l,r)"
						print leftmatchContig,rightmatchContig
						print "allele extras (l,r)"
						print leftmatchAllele,rightmatchAllele

						print "Locus is possibly on the 5' tip of the contig \n"
				
					else:
						#if a perfect match was found
								
						################################################
						# EXACT MATCH --- MATCH == GENE --- GENE FOUND #
						################################################
								
						perfectMatchIdAllele.append(str(bestmatch[4]))
						if not Reversed:
							perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation)+"&"+"+")
						else:
							perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation)+"&"+"-")
						resultsList.append('EXC:' + str(bestmatch[4]) )

			
			# if match with BSR >0.6 and not equal DNA sequences
			else:
				
				match=bestmatch[5]
				geneLen=bestmatch[6]

				contigname=bestmatch[3]	
				
				contigname=contigname.split("&")
				matchLocation=contigname[2]	
				matchLocation=matchLocation.split("-")
				contigname=contigname[0]
				
				seq=currentGenomeDict[ contigname ]
				bestMatchContigLen=len(seq)
				
				alleleStr=listOfCDS[">"+bestmatch[3]]
				protSeq,alleleStr,Reversed=translateSeq(alleleStr)
				
				
				rightmatchContig=bestMatchContigLen-int(matchLocation[1])	
				leftmatchContig=int(matchLocation[0])
				
				if Reversed:
					aux=rightmatchContig
					rightmatchContig=leftmatchContig
					leftmatchContig=aux
				
				
				print rightmatchContig,leftmatchContig
				
				
				# get extra space to the right and left between the allele and match and check if it's still inside the contig
				
				rightmatchAllele=geneLen-((int(match.query_end)+1)*3)	
				leftmatchAllele=((int(match.query_start)-1)*3)
				

						###########################
						# LOCUS ON THE CONTIG TIP #
						###########################
				
				
				
				if leftmatchContig<leftmatchAllele and 	rightmatchContig < rightmatchAllele:
				
					resultsList.append('LOTSC:-1')
					perfectMatchIdAllele.append('LOTSC')
					perfectMatchIdAllele2.append('LOTSC')
					print match
					print contigname
					print geneFile
					print leftmatchAllele,rightmatchAllele
					print "Locus is bigger than the contig \n"
				
				elif leftmatchContig<leftmatchAllele:
					
					
					resultsList.append('LOT3:-1')
					perfectMatchIdAllele.append('LOT3')
					perfectMatchIdAllele2.append('LOT3')
					print match
					print contigname
					print geneFile
					print leftmatchAllele,rightmatchAllele
					print "Locus is on the 3' tip of the contig \n"
				
				
				elif 	rightmatchContig < rightmatchAllele:
					
					resultsList.append('LOT5:-1')
					perfectMatchIdAllele.append('LOT5')
					perfectMatchIdAllele2.append('LOT5')
					print match
					print contigname
					print geneFile
					print leftmatchAllele,rightmatchAllele
					print "Locus is on the 5' tip of the contig \n"
				
				
							
				elif len(alleleStr) > moda+(moda*0.2) :
					
					print moda
					print alleleStr
					resultsList.append('ALM')
					perfectMatchIdAllele.append('ALM')
					perfectMatchIdAllele2.append('ALM')
				
				elif len(alleleStr) < moda-(moda*0.2):
					
					print moda
					print alleleStr
					resultsList.append('ASM')
					perfectMatchIdAllele.append('ASM')
					perfectMatchIdAllele2.append('ASM')
			
					
				else:
							#######################
							# ADD INFERRED ALLELE #		# a new allele 
							#######################
							
													
					tagAux='INF'
					perfectMatchIdAllele.append( tagAux +"-"+str(alleleI+1))
					
					if not Reversed:
						perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+")
					else:
						perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-")
					
					
					print "New allele! Adding allele "+ tagAux + str(alleleI+1) +" to the database\n"
																						
					resultsList.append( tagAux + str(alleleI+1) )

												# --- add the new allele to the gene fasta --- #
					
					
					appendAllele='>allele_' + str(alleleI+1) + '_' + tagAux[:-1] +"_" + str(os.path.basename(genomesList[genome])) + '\n'
					fG = open( geneFile, 'a' )
					fG.write(appendAllele)
						
					fG.write( alleleStr + '\n')
					fG.close()
					
					fG = open( os.path.join(basepath,str(os.path.basename(geneFile)+'_protein2.fasta')), 'w' )
					fG.write('>'+str(alleleI+1)+'\n'+str(protSeq) + '\n')
					fG.close()
					fG = open( os.path.join(basepath,str(os.path.basename(geneFile)+'_protein.fasta')), 'a' )
					fG.write('>'+str(alleleI+1)+'\n'+str(protSeq) + '\n')
					fG.close()	
					
					match=bestmatch[5]
					
					# --- remake blast DB and recalculate the BSR for the locus --- #
					alleleList.append(alleleStr)
					print os.path.join(basepath,str(os.path.basename(geneFile)+'_protein.fasta'))
					genefile2= os.path.join(basepath,str(os.path.basename(geneFile)+'_protein2.fasta'))
					Gene_Blast_DB_name2 = Create_Blastdb( genefile2, 1, True )
					print ("Re-calculating BSR at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
					alleleI,allelescores,alleleList=reDogetBlastScoreRatios(genefile2,basepath,alleleI,allelescores,Gene_Blast_DB_name2,alleleList,geneScorePickle)
					print "allele id " + str(alleleI)
					print ("Done Re-calculating BSR at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
		
		except Exception as e:
			print "some error occurred"
			print e
			print 'Error on line {}'.format(sys.exc_info()[-1].tb_lineno)
			perfectMatchIdAllele2.append("ERROR")
			perfectMatchIdAllele.append("ERROR")
			resultsList.append('ERROR')  
		
	
	final =	(resultsList,perfectMatchIdAllele)	
	print ("Finished allele calling at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	filepath=os.path.join(temppath , os.path.basename(geneFile)+"_result.txt")
	filepath2=os.path.join(temppath , os.path.basename(geneFile)+"_result2.txt")
	with open(filepath, 'wb') as f:
		pickle.dump(final, f)
	with open(filepath2, 'wb') as f:
		pickle.dump(perfectMatchIdAllele2, f)
	shutil.rmtree(basepath)
	return True

Example #23

Show file

File: CompareSameLocus.py Project: mickaelsilva/pythonscripts

def main():
	"""Count the genes of a second list that hit the same locus as a first list.

	Command-line entry point. The values of ``-i`` and ``-g`` are handed to
	``concat_genes`` to produce ``concat1.fasta`` and ``concat2.fasta`` in the
	working directory. A BLAST database is built from the first file, the
	second file is searched against it with blastn (e-value 0.001, XML
	output), and the first alignment of every BLAST record is judged by
	``alignHasGoodMatch``.

	Each record ends up in one of three counters (same locus / no match /
	rejected by ``alignHasGoodMatch``) and a one-line summary is printed.

	Side effects: creates ``./sameLocus`` if it is missing and, on completion,
	deletes both concatenated FASTA files and the ``./blastdbs`` directory.
	"""
	parser = argparse.ArgumentParser(description="Given two list of genes, creates a folder with paired files when located on the same locus")
	parser.add_argument('-i', nargs='?', type=str, help='1st list of genes files to compare', required=True)
	parser.add_argument('-g', nargs='?', type=str, help='2nd list of genes files to compare', required=True)
	args = parser.parse_args()

	name1 = "concat1.fasta"
	name2 = "concat2.fasta"
	concat_genes(args.i, name1)
	concat_genes(args.g, name2)

	# Map every sequence of the first concatenated file to its FASTA name.
	geneDict = {}
	for allele in HTSeq.FastaReader(name1):
		geneDict[allele.seq] = allele.name

	geneFile = os.path.abspath(name1)
	Gene_Blast_DB_name = Create_Blastdb(geneFile, 1, False)
	blast_out_file = os.path.splitext(geneFile)[0] + '.xml'

	# ------------------------------ RUNNING BLAST ------------------------------ #

	cline = NcbiblastnCommandline(query=name2, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
	blast_records = runBlastParser(cline, blast_out_file, name2)

	samelocus = 0
	nomatch = 0
	small = 0
	alreadyUsed = []
	LocusID = 0
	if not os.path.exists("./sameLocus"):
		os.makedirs("./sameLocus")

	for blast_record in blast_records:
		alignments = blast_record.alignments
		if not alignments:
			nomatch += 1
			continue

		# Only the first alignment of a record is ever evaluated. The previous
		# "while" loop never advanced past it either, but on an unnamed hit it
		# read an unbound variable or counted the record once per alignment.
		best = alignments[0]
		severalHits = len(alignments) > 1

		if severalHits and not best.hit_def:
			# A hit without a definition line cannot be paired: one "no match".
			nomatch += 1
			continue

		try:
			result, allelename, LocusID, alreadyUsed = alignHasGoodMatch(best, geneDict, LocusID, blast_record, alreadyUsed)
		except Exception as e:
			# Historical behaviour is kept on purpose: a failure on a
			# multi-hit record is only reported, while a failure on a
			# single-hit record is counted as "no match".
			if severalHits:
				print (e)
			else:
				nomatch += 1
			continue

		if result > 0 and allelename:
			samelocus += result
		else:
			small += 1
			if severalHits:
				# Only the multi-hit path marked the rejected name as used.
				alreadyUsed.append(allelename)

	print ("%s are within same locus, %s had no match and %s had a bigger than 0.2 ratio size difference or less than 0.8 similarity ratio" % (samelocus,nomatch, small))

	os.remove(name1)
	os.remove(name2)
	shutil.rmtree('./blastdbs')

Example #24

Show file

File: callAlleles.py Project: mickaelsilva/pythonscripts

def main():
	"""Call the alleles of one locus against a list of genomes.

	Script entry point. ``sys.argv[1]`` is a pickled argument list and
	``sys.argv[2]`` the working directory. The unpickled list holds the locus
	FASTA path at index 0 and the list of genome FASTA paths at index 1. For
	every genome a pickled CDS dictionary named ``<genome basename>_ORF.txt``
	is read from the working directory.

	Each genome is searched with blastn (e-value 0.001, XML output) against a
	BLAST database of the locus alleles. The best HSP is picked (an exact
	full-length hit wins, otherwise the highest score / allele-length ratio)
	and a result code is appended to the result list:

	* ``LNF3:-1``  no usable hit
	* ``EXC:<id>`` exact match with a known allele
	* ``LOT5:-1`` / ``LOT3:-1`` / ``LOTSC:-1``  hit touches a contig end or
	  the contig is not longer than the allele
	* ``LNFN:-1``  hit contains ``N``
	* ``EXC2:<id>`` / ``INF1..INF7:<id>``  CDS returned by ``extendCDS`` is a
	  known allele / a new allele (appended to the locus FASTA, after which
	  the BLAST database is rebuilt)
	* ``LNF2`` / ``LNF99``  ``extendCDS`` found no CDS / an error occurred

	The tuple ``(resultsList, perfectMatchIdAllele)`` is pickled to
	``<working dir>/<locus basename>_result.txt``.

	Returns:
		True once the result file has been written.

	Exits with status 1 (after printing the usage line) when either
	command-line argument is missing.
	"""
	try:
		input_file = sys.argv[1]
		temppath = sys.argv[2]
	except IndexError:
		print ("usage: list_pickle_obj")
		# Nothing below can run without both arguments; stop here instead of
		# crashing later on an unbound name.
		sys.exit(1)

	# NOTE: pickle.load can execute code embedded in the file. Only feed this
	# script argument files written by the trusted parent process.
	with open(input_file,'rb') as f:
		argumentList = pickle.load(f)

	geneFile = argumentList[0]
	genomesList = argumentList[1]
	basepath = temppath

	# sequence -> 1-based allele id; orderedAlleleNames[id - 1] is its FASTA name
	geneDict = {}
	alleleI = 1
	orderedAlleleNames = []
	biggestAllelelen = 0
	smallestAllelelen = 99999
	for allele in HTSeq.FastaReader(geneFile):
		if allele.seq in geneDict:
			# Same output as the Python 2 statement `print text, geneFile`:
			# no separating space is emitted after a string ending in "\n".
			print ("\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n" + geneFile)
		else:
			if len(allele.seq) > biggestAllelelen:
				biggestAllelelen = len(allele.seq)
			if len(allele.seq) < smallestAllelelen:
				smallestAllelelen = len(allele.seq)
			orderedAlleleNames.append(allele.name)
			geneDict[ allele.seq ] = alleleI
			alleleI += 1

	# --- make 1st blast DB --- #

	Gene_Blast_DB_name = Create_Blastdb( geneFile, 1, False )
	geneF = os.path.basename(geneFile)
	blast_out_file = os.path.dirname(geneFile)+"/blastdbs/"+geneF + '.xml'

	# CDS location reported by extendCDS -> tag of the inferred allele.
	# Any other location falls back to 'INF7:'.
	infTagByCDSType = {
		'stop codon in match end': 'INF1:',
		'start codon in match beggining': 'INF2:',
		'bigger than match': 'INF3:',
		'same size as match': 'INF4:',
		'cds inside match': 'INF5:',
		'start codon inside match': 'INF6:',
	}

	# list of results - the output of the function
	resultsList = []
	perfectMatchIdAllele = []
	# NOTE(review): never cleared, so it also keeps the contigs of genomes
	# processed earlier; same-named contigs are overwritten by the current one.
	genomeDict = {}
	for genomeFile in genomesList:
		# Normalise the path before its first use. It used to be stripped only
		# after the ORF pickle and the FASTA had already been opened with it.
		if genomeFile.endswith('\n'):
			genomeFile = genomeFile[:-1]

		filepath = os.path.join(basepath,str(os.path.basename(genomeFile))+"_ORF.txt")
		with open(filepath,'rb') as f:
			currentCDSDict = pickle.load(f)

		for contig in HTSeq.FastaReader( genomeFile ):
			genomeDict[ contig.name ] = str(contig.seq)
		currentGenomeDict = genomeDict

		# ------------------------------ RUNNING BLAST ------------------------------ #
		cline = NcbiblastnCommandline(query=genomeFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
		blast_records = runBlastParser(cline, blast_out_file, genomeFile)

		print ("Finished Blast at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

		# ------ DETERMINING BEST MATCH ------ #

		# Starts as a 3-item placeholder. Once a hit is accepted it becomes
		# [query title, HSP, score ratio, hit name, length ratio, allele length].
		bestMatch = ['','', 0]
		bestMatchContig = ''
		bestMatchContigLen = ''
		perfectMatch = False
		hitAllele = ''
		bmAllele = ''
		for blast_record in blast_records:
			# --- the LNF cases are called outside the loop --- #
			if perfectMatch:
				break
			if not blast_record.alignments:
				continue
			if bestMatch[0] == '' and bestMatch[1] == '':
				bestMatch[0] = blast_record.query
				bestMatch[1] = blast_record.alignments[0]

			# --- the contig tag (query title up to the first space) is handed to extendCDS --- #
			contigTag = blast_record.query.split(' ', 1)[0]

			# --- iterating over all the results to determine the best match --- #
			for alignment in blast_record.alignments:
				# Resolve the hit allele once per alignment.
				# NOTE(review): raises ValueError if the hit name is not in
				# orderedAlleleNames (e.g. a repeated allele skipped above).
				alleleId = orderedAlleleNames.index(alignment.hit_def) + 1
				for seq, seqId in geneDict.iteritems():
					if seqId == alleleId:
						hitAllele = seq
						break
				hitAlleleLen = len(hitAllele)

				for match in alignment.hsps:
					scoreRatio = float(match.score) / float(hitAlleleLen)

					# identities == allele length == aligned subject length, and
					# no N/K/R in the query: exact, gap-free, full-length hit
					if (int(match.identities)==int(hitAlleleLen) and int(match.identities)==int(len(match.sbjct)) and "N" not in match.query and "K" not in match.query and "R" not in match.query ):
						bmAllele = hitAllele
						lenratio = float(len(match.query))/float(hitAlleleLen)
						bestMatch = [blast_record.query, match, scoreRatio, alignment.hit_def,lenratio,hitAlleleLen]
						bestMatchContig = contigTag
						perfectMatch = True
						break

					# otherwise keep the hit with the best score / allele length ratio
					elif scoreRatio > bestMatch[2]:
						bmAllele = hitAllele
						lenratio = float(len(match.query))/float(hitAlleleLen)
						bestMatch = [blast_record.query, match, scoreRatio, alignment.hit_def,lenratio,hitAlleleLen]
						bestMatchContig = contigTag
						bestMatchContigLen = blast_record.query_letters

				if perfectMatch:
					break

		# ---------- ALLELE CALLING AFTER DETERMINING BEST MATCH ---------- #
		print ("Finished choosing best match at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

		try:
			match = bestMatch[1]
			# IndexError here means bestMatch is still the 3-item placeholder
			geneLen = bestMatch[5]
			alleleStr = match.query
		except (IndexError, AttributeError):
			###################
			# LOCUS NOT FOUND #
			###################

			resultsList.append('LNF3:-1')            # append result to the list of results
			perfectMatchIdAllele.append('LNF')
			printinfo(genomeFile,geneFile)
			print ("Locus not found, no matches \n")
			continue

		if perfectMatch:

			if match.sbjct_start > match.sbjct_end: #reverse the order if needed
				alleleStr = reverseComplement(alleleStr)
			alleleNumber = geneDict[ alleleStr ]

			################################################
			# EXACT MATCH --- MATCH == GENE --- GENE FOUND #
			################################################
			if "_" in bestMatch[3]:
				a = bestMatch[3].split("_")
				perfectMatchIdAllele.append(a[1])
			else:
				perfectMatchIdAllele.append(bestMatch[3])
			resultsList.append('EXC:' + str(alleleNumber) )

		elif bestMatch[0] != '':

			# a best match was found but it is not an exact match

			###########################
			# LOCUS ON THE CONTIG TIP #
			###########################

			if match.query_start ==1 and len(match.query) < geneLen:

				resultsList.append('LOT5:-1')
				perfectMatchIdAllele.append('LOT5')
				printinfo(genomeFile,geneFile)

				print ("Locus is on the 5' tip of the contig \n")

			elif match.query_end == bestMatchContigLen and len(match.query) < geneLen:

				resultsList.append('LOT3:-1')
				perfectMatchIdAllele.append('LOT3')
				printinfo(genomeFile,geneFile)

				print ("Locus is on the 3' tip of the contig \n")

			elif bestMatchContigLen <= geneLen:

				resultsList.append('LOTSC:-1')
				perfectMatchIdAllele.append('LOTSC')
				printinfo(genomeFile,geneFile)
				print ("Locus is bigger than the contig \n")

			elif 'N' in alleleStr:
				#####################
				# ALLELE NOT FOUND  #		# N base found!
				#####################

				geneFile2 = os.path.splitext(geneFile)[0] + "LNFN.fasta"
				print (geneFile2)
				with open(geneFile2, 'a') as f:
					f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+"\n")
					f.write((alleleStr) +"\n")
				resultsList.append('LNFN:-1')
				perfectMatchIdAllele.append('LNFN')
				printinfo(genomeFile,geneFile)
				print ("LNFN, contains N bases! \n")

			else:
				# ------------------------------------------------------------------------------------------------------- #
				#                                                                                                         #
				#                                   USING PRODIGAL TO TRY TO EXTEND CDS                                   #
				#                                                                                                         #
				# ------------------------------------------------------------------------------------------------------- #

				sizeratio = 0.2
				ORFFoundInMatch, strCDS, CDSType = extendCDS(bestMatchContig, currentCDSDict, match.query_start, match.query_end, currentGenomeDict, biggestAllelelen, smallestAllelelen,sizeratio)
				print ("Finished extension at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

				# NOTE(review): if anything fails after the result lists were
				# appended to inside the try, the handler appends 'LNF99' as
				# well, leaving two entries for this genome.
				try:
					if ORFFoundInMatch :
						alleleStr = strCDS
						if match.sbjct_start > match.sbjct_end: #reverse the order if needed
							alleleStr = reverseComplement(alleleStr)

						if alleleStr in geneDict:  #if ORF found is already defined
							alleleNumber = geneDict[ alleleStr ]

							################################################
							# EXACT MATCH --- MATCH == GENE --- GENE FOUND #
							################################################
							perfectMatchIdAllele.append(alleleNumber)
							resultsList.append('EXC2:' + str(alleleNumber) )

						else:
							#######################
							# ADD INFERRED ALLELE #		# a new allele that was extended with prodigal
							#######################
							tagAux = infTagByCDSType.get(CDSType, 'INF7:')

							print ("infered allele has location : "+(CDSType))
							printinfo(genomeFile,geneFile)
							perfectMatchIdAllele.append( tagAux +"-"+str(alleleI))
							print ("New allele Infered with prodigal! Adding allele "+ tagAux + str(alleleI) +" to the database\n")

							geneDict[alleleStr] = alleleI

							resultsList.append( tagAux + str(alleleI) )

							alleleName = 'allele_' + str(alleleI) + '_' + tagAux[:-1] +"_" + str(os.path.basename(genomeFile))
							orderedAlleleNames.append(alleleName)

							# --- add the new allele to the gene fasta --- #
							# "with" closes the handle even when a write fails
							with open( geneFile, 'a' ) as fG:
								fG.write('>' + alleleName + '\n')
								fG.write( alleleStr + '\n')
							alleleI += 1

							# --- remake blast DB --- #
							Gene_Blast_DB_name = Create_Blastdb( geneFile, 1,False )

					else:
						#################################
						# LNF2: no CDS found by extendCDS #
						#################################
						geneFile2 = os.path.splitext(geneFile)[0] + "LNF2.fasta"
						print (geneFile2)
						with open(geneFile2, 'a') as f:
							f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
							f.write((alleleStr) +"\n")
							f.write(">Allele\n")
							f.write((bmAllele)+"\n")
						resultsList.append('LNF2')
						printinfo(genomeFile,geneFile)
						perfectMatchIdAllele.append("LNF2")
						print ("CDS not found")

				except Exception:
					# Best effort: any failure above is recorded as LNF99.
					# Interpreter exits and Ctrl-C are no longer swallowed.
					if ORFFoundInMatch :
						alleleStr = strCDS
					geneFile2 = os.path.splitext(geneFile)[0] + "LNF99.fasta"
					print (geneFile2)
					with open(geneFile2, 'a') as f:
						f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
						f.write((alleleStr) +"\n")
						f.write(">Allele\n")
						f.write((bmAllele)+"\n")
					resultsList.append('LNF99')
					printinfo(genomeFile,geneFile)
					perfectMatchIdAllele.append("LNF99")
					print ("A problem occurred")

	final = (resultsList,perfectMatchIdAllele)
	print ("Finished allele calling at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	filepath = os.path.join(basepath , os.path.basename(geneFile)+"_result.txt")
	with open(filepath, 'wb') as f:
		pickle.dump(final, f)
	return True

Example #25

Show file

File: alleleCalling_baseprobe.py Project: mickaelsilva/pythonscripts

def callAlleles(argumentList):
	"""Call the alleles of one locus against a list of genomes (probe based).

	Args:
		argumentList: sequence whose item 0 is the locus FASTA path, item 1
			the list of genome FASTA paths and item 2 a list indexed in
			parallel with the genomes (its entries are fetched but not used
			further in this function).

	Returns:
		Tuple ``(resultsList, perfectMatchIdAllele)``; one entry is appended
		to each list by whichever classification branch a genome reaches.
		Result codes written to ``resultsList``: ``LNF:-1``, ``EXC:<id>``,
		``LOT5:-1``, ``LOT3:-1``, ``LOTSC:-1``, ``LNFN:-1``,
		``NA1:``/``NA2:``/``NA4:``/``NA5:`` followed by the new allele id,
		``LNF3``, ``UND:-1``, ``SAC:-1``, ``INC:-1`` and ``LNF2``.

	Side effects: new alleles are appended to the locus FASTA and the BLAST
	database is rebuilt through ``Create_Blastdb``; rejected hits are appended
	to ``<locus>LNFN.fasta``, ``<locus>LNF3.fasta``, ``<locus>undefined.fasta``
	or ``<locus>LNF2.fasta`` next to the locus file.
	"""

	geneFile = argumentList[0]
	genomesList = argumentList[1]
	listOfGenomesDict = argumentList[2]
	
	gene_fp = HTSeq.FastaReader(geneFile)
	# sequence -> 1-based allele id; orderedAlleleNames[id - 1] is its FASTA name
	geneDict = {}
	alleleI = 1
	orderedAlleleNames=[]
	# longest / shortest allele length; tracked here but not read again in this function
	biggestAllelelen=0
	smallestAllelelen=99999
	for allele in gene_fp:
		if allele.seq in geneDict:
			print "\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n", geneFile
		else:
			if len(allele.seq)>biggestAllelelen:
				biggestAllelelen=len(allele.seq)
			if len(allele.seq)<smallestAllelelen:
				smallestAllelelen=len(allele.seq)
			orderedAlleleNames.append(allele.name)
			geneDict[ allele.seq ] = alleleI
			alleleI += 1

	# --- make 1st blast DB --- #

	Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 )
	geneF = os.path.splitext( geneFile )[0]
	blast_out_file = geneF + '.xml'

	# list of results - the output of the function
	resultsList = []
	i = 0
	perfectMatchIdAllele=[]
	
	for genomeFile in genomesList:
		# fetched in step with genomesList; not referenced again below
		currentGenomeDict = listOfGenomesDict[i]
		
		i+=1		# it has to be incremented here
		# drop a single trailing newline from the path
		if genomeFile[-1] == '\n':
			genomeFile = genomeFile[:-1]


		# ------------------------------ RUNNING BLAST ------------------------------ #

		cline = NcbiblastnCommandline(query=genomeFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
		blast_records = runBlastParser(cline, blast_out_file, genomeFile)
		

		# ------ DETERMINING BEST MATCH ------ #

		# Starts as a 3-item placeholder; once a hit is accepted it becomes
		# [query title, HSP, score ratio, hit name, length ratio, allele length]
		bestMatch = ['','', 0]
		bestMatchContig=''
		bestMatchContigLen=''
		bestalignlen=0
		perfectMatch=False
		bmAlleleLen2=0
		bmAllele=''
		for blast_record in blast_records:
			# --- the LNF cases are now called outside the loop --- #
			if perfectMatch==True:
				break
			try:
				# records without any alignment are skipped (IndexError)
				hspC = blast_record.alignments[0]
				
				if bestMatch[0] == '' and bestMatch[1] == '':
					bestMatch[0] = blast_record.query
					bestMatch[1] = hspC
			except IndexError:
				continue


			# --- the contig tag is the query title up to its first space --- #

			contigTag = blast_record.query
			# --- brute force parsing of the contig tag - better solution is advisable --- #			

			j=0
			for l in contigTag:
				if l == ' ':
					break
				j+=1

			contigTag = contigTag[:j]

			contigLen = blast_record.query_letters
			
			# --- iterating over all the results to determine the best match --- #
			for alignment in blast_record.alignments:

				# position of the hit name in orderedAlleleNames, +1, is the allele id
				index=orderedAlleleNames.index(alignment.hit_def)
				# reverse lookup in geneDict: length of the sequence with that id
				for k, v in geneDict.iteritems():
					if v == index+1:
						bmAlleleLen2= len(k)
					
				if perfectMatch:
					break
				for match in alignment.hsps:
					scoreRatio = float(match.score) / float(bmAlleleLen2)
					
					# identities == allele length == aligned subject length and no N
					# in the query: exact, gap-free, full-length hit
					if (int(match.identities)==int(bmAlleleLen2) and int(match.identities)==int(len(match.sbjct)) and "N" not in match.query ): 
						index=orderedAlleleNames.index(alignment.hit_def)
						
						for seq, alleleid in geneDict.iteritems():
							if alleleid == index+1:
								bmAllele=seq
								break
						bmAlleleLen= len(bmAllele)
						
						lenratio=float(len(match.query))/float(bmAlleleLen)
						bestMatch = [blast_record.query, match, scoreRatio, alignment.hit_def,lenratio,bmAlleleLen]
						bestMatchContig=contigTag
						perfectMatch=True
						index=orderedAlleleNames.index(alignment.hit_def)
						# NOTE(review): bestMatch was already built above, so this
						# reassignment (indexing the dict's key list) is not read again here
						bmAlleleLen= len(geneDict.keys()[index])
						break
						
					# otherwise keep the hit with the best score / allele length ratio
					elif scoreRatio > bestMatch[2]:
						index=orderedAlleleNames.index(alignment.hit_def)
						for seq, alleleid in geneDict.iteritems():
							if alleleid == index+1:
								bmAllele=seq
								break
						bmAlleleLen= len(bmAllele)
						lenratio=float(len(match.query))/float(bmAlleleLen)
						bestMatch = [blast_record.query, match, scoreRatio, alignment.hit_def,lenratio,bmAlleleLen]
						bestMatchContig=contigTag
						bestMatchContigLen=blast_record.query_letters
						
						bestalignlen=alignment.length
						
						
					
					if perfectMatch==True:
						break


		# ---------- ALLELE CALLING AFTER DETERMINING BEST MATCH ---------- #

		
		
		try:
			# bestMatch[5] raises IndexError while bestMatch is still the
			# 3-item placeholder, i.e. no hit was accepted for this genome
			match = bestMatch[1]
			geneLen = bestMatch[5]
			alleleStr = match.query
			nIdentities = match.identities
			idPercent = float(nIdentities) / float(geneLen)
			scoreRatio = bestMatch[2]
			lenRatio = bestMatch[4]
		
		except:
			# NOTE(review): bare except - any error in the block above is reported as LNF
			
			###################
			# LOCUS NOT FOUND #
			###################
			
			resultsList.append('LNF:-1')            # append result to the list of results
			perfectMatchIdAllele.append('LNF')
			printinfo(genomeFile,geneFile)
			print "Locus not found \n"
			continue
		
		
		if perfectMatch is True:
			
			#if a perfect match was found
			
			if match.sbjct_start > match.sbjct_end: #reverse the order if needed
				alleleStr = reverseComplement(alleleStr)
			alleleNumber = geneDict[ alleleStr ]
			
					################################################
					# EXACT MATCH --- MATCH == GENE --- GENE FOUND #
					################################################
			# record the second "_"-separated field of the hit name, or the whole name
			if "_" in bestMatch[3]:
				a=bestMatch[3].split("_")
				perfectMatchIdAllele.append(a[1])
			else:
				perfectMatchIdAllele.append(bestMatch[3])
			resultsList.append('EXC:' + str(alleleNumber) )
								
			

		elif bestMatch[0] != '' and perfectMatch is not True:
						
			#if a best match was found but it's not an exact match	

					###########################
					# LOCUS ON THE CONTIG TIP #
					###########################
						
			
			if match.query_start ==1 and len(match.query) < geneLen:
			
				resultsList.append('LOT5:-1')
				perfectMatchIdAllele.append('LOT5')
				printinfo(genomeFile,geneFile)
				
				print "Locus is on the 5' tip of the contig \n"
			
			
			# NOTE(review): the second test compares the aligned query length with
			# the contig length (the 5' test above uses geneLen) - confirm intent
			elif match.query_end == bestMatchContigLen and len(match.query) < bestMatchContigLen:
				
				resultsList.append('LOT3:-1')
				perfectMatchIdAllele.append('LOT3')
				printinfo(genomeFile,geneFile)

				print "Locus is on the 3' tip of the contig \n"
			
			elif bestMatchContigLen <= geneLen:
				
				resultsList.append('LOTSC:-1')
				perfectMatchIdAllele.append('LOTSC')
				printinfo(genomeFile,geneFile)
				print "Locus is bigger than the contig \n"
								
				
			elif 'N' in alleleStr:
				# TODO: write to file
					#####################
					# ALLELE NOT FOUND  #		# N base found!
					#####################
				
				geneFile2= os.path.splitext(geneFile)[0] + "LNFN.fasta"
				print geneFile2
				with open(geneFile2, 'a') as f:
					f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+"\n")
					f.write((alleleStr) +"\n")
				resultsList.append('LNFN:-1')
				perfectMatchIdAllele.append('LNFN')
				printinfo(genomeFile,geneFile) 
				print "LNFN, contains N bases! \n"
			
			
			
			else:
				
				#removing gaps
					
				alleleStr = alleleStr.replace('-', '')
				# length tolerance, in bases, used by the size tests below
				lenExtraThresh=50
			
				# --- it might be needed to obtain the reverse complement of the allele string --- #
				if match.sbjct_start > match.sbjct_end:
					alleleStr = reverseComplement(alleleStr)
					
				# (disabled) an ungapped match identical to a defined allele used to be
				# reported as EXC2 here; it now goes through the containment test below

				# look for the first defined allele that contains the ungapped match as a substring
				isContainedDefinedAllele = False	
				definedAllele=''
				definedAlleleName=''
				for k in geneDict.keys():
					if alleleStr in k:
						definedAllele=k
						isContainedDefinedAllele = True
						definedAlleleName=geneDict.get(k)
						break
						
				if isContainedDefinedAllele  and int(len(match.query))<=int(len(definedAllele))+lenExtraThresh and int(len(match.query))>=int(len(definedAllele))-lenExtraThresh :
					#allele without gaps is contained in a defined allele
					#best match with gaps is within lenExtraThresh bases of the defined allele's length
					
						
					if int(len(alleleStr))==int(len(definedAllele)): # if match without gaps has same size as the defined allele 
						tagAux = 'NA1:'
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("NA1-"+str(alleleI))
						
					elif int(len(alleleStr))==int(len(definedAllele))-1 : # if match without gaps has minus one base than the defined allele
						
						tagAux = 'NA2:'
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("NA2-"+str(alleleI))
					# (disabled) 'NA3:' was meant for a match one base longer than the defined allele
						
					else:												# every other length difference 
						tagAux = 'NA4:'
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("NA4-"+str(alleleI))
					#TODO catch +1 and others
							
					print "New allele found! Adding allele "+ tagAux + str(alleleI) +" to the database"
					geneDict[alleleStr] = alleleI
						
					resultsList.append( tagAux + str(alleleI) )
						
					orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)))									
					# --- add the new allele to the gene fasta and rebuild the BLAST DB --- #
						
					fG = open( geneFile, 'a' )
					fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)) + '\n')
					fG.write( alleleStr + '\n')
					fG.close()
					alleleI += 1
					Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 )
					
				#if best match is not contained in an already defined allele, check if it has similar size with the match allele and has 0.8 similarity
				elif not isContainedDefinedAllele and idPercent > 0.8 and int(len(match.query))<=int(geneLen)+lenExtraThresh and int(len(match.query))>=int(geneLen)-lenExtraThresh :
					#identities / allele length is above 0.8
					#best match with gaps is within lenExtraThresh bases of the allele length
					
					ratio=float(len(alleleStr)) / float(geneLen)
					
					if ratio>=0.8 and ratio<=1.2: # ungapped match length is within 20% of the best match allele length
						
						tagAux = ''
						extraleft=0
						extraright=0
						tS=0
						tE=0
						# re-read the genome FASTA to get the contig the hit lies on
						handle = open(genomeFile, "rU")
						record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
						handle.close()
						record= record_dict[bestMatchContig]
						
						# allele bases before the aligned region (both subject coordinates > 1)
						if (1<int(match.sbjct_start) and 1<int(match.sbjct_end)):
							
							if match.sbjct_start > match.sbjct_end:
								extraleft=match.sbjct_end-1
								
							else:
								extraleft=match.sbjct_start-1
								
								
						if (int(geneLen)>int(match.sbjct_start) and int(geneLen)>int(match.sbjct_end) ): # if 3' tip bases of the allele are missing on the match
							
							
							if match.sbjct_start > match.sbjct_end:
								extraright=geneLen-match.sbjct_start
								
							else:
								extraright=geneLen-match.sbjct_end
								
						
						# widen the query interval by the missing allele tips and cut it
						# from the contig; the two extras swap sides when the subject
						# coordinates are descending, and the slice is reverse complemented
						if match.sbjct_start > match.sbjct_end:
							tS=match.query_start-extraright-1
							tE=match.query_end+extraleft
							alleleStr=str(record.seq[tS:tE])
							alleleStr = reverseComplement(alleleStr)
						else:
							tS=match.query_start-extraleft-1
							tE=match.query_end+extraright
							alleleStr=str(record.seq[tS:tE])
						
						tagAux = 'NA5:'
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("NA5-"+str(alleleI))
							
						print "New allele found! Adding allele "+ tagAux + str(alleleI) +" to the database"
						geneDict[alleleStr] = alleleI
							
						resultsList.append( tagAux + str(alleleI) )
							
						orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)))									
						# --- add the new allele to the gene fasta and rebuild the BLAST DB --- #
							
						fG = open( geneFile, 'a' )
						fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)) + '\n')
						fG.write( alleleStr + '\n')
						fG.close()
						alleleI += 1
						Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 )
						
					else:
						####################################################
						# LNF3: ungapped length ratio outside [0.8, 1.2]   #
						####################################################
						geneFile2= os.path.splitext(geneFile)[0] + "LNF3.fasta"
						print geneFile2
						with open(geneFile2, 'a') as f:
							f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
							f.write((alleleStr) +"\n")
							f.write(">Allele\n")
							f.write((bmAllele)+"\n")
						resultsList.append('LNF3')
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("LNF3")
						print "No allele found"
						
				elif isContainedDefinedAllele:
						####################
						# UNDEFINED ALLELE #		# it is contained in another allele
						####################
						
					# report the raw (gapped, not reverse complemented) query
					alleleStr=match.query
					resultsList.append('UND:-1')
					perfectMatchIdAllele.append("undefined allele")
					printinfo(genomeFile,geneFile) 
					print "Undefined allele \n"
					
					geneFile2= os.path.splitext(geneFile)[0] + "undefined.fasta"
					print geneFile2
					with open(geneFile2, 'a') as f:
						f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
						f.write((alleleStr) +"\n")
						f.write(">Allele"+str(definedAlleleName)+"\n")
						f.write((definedAllele)+"\n")
					
				
							
				elif lenRatio < 0.5:
						
					###############
					# SMALL MATCH #
					###############
								
					resultsList.append('SAC:-1')		# don't know what 'SAC' stands for
					perfectMatchIdAllele.append('small match')
					printinfo(genomeFile,geneFile) 
					print "lower than 50% match \n"	
							
				elif lenRatio < 0.8 and idPercent < 0.5:
					#####################
					# INCOMPLETE ALLELE #		# length ratio below 0.8 and identity ratio below 0.5
					#####################
					resultsList.append('INC:-1')
					perfectMatchIdAllele.append('allele incomplete')
					printinfo(genomeFile,geneFile)
					print "Incomplete allele\n"
						
				else:	
					#############################################
					# LNF2: none of the previous cases applied  #
					#############################################
					geneFile2= os.path.splitext(geneFile)[0] + "LNF2.fasta"
					print geneFile2
					with open(geneFile2, 'a') as f:
						f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
						f.write((alleleStr) +"\n")
						f.write(">Allele\n")
						f.write((bmAllele)+"\n")
					resultsList.append('LNF2')
					printinfo(genomeFile,geneFile) 
					perfectMatchIdAllele.append("LNF2")
					print "No allele found"

						
							
	final =	(resultsList,perfectMatchIdAllele)	
	return final

Example #26

Show file

File: callAlleles_probe_based2.py Project: mickaelsilva/bacterial_wgMLST

def main():
	
	try:
		input_file = sys.argv[1]
		temppath = sys.argv[2]
	except IndexError:
		print "usage: list_pickle_obj"

	argumentList=[]
	with open(input_file,'rb') as f:
		argumentList = pickle.load(f)
	
	geneFile = argumentList[0]
	genomesList = argumentList[1]

	basepath=temppath+"/"+os.path.basename(geneFile)

	if not os.path.exists(basepath+"/blastdbs/"):
		os.makedirs(basepath+"/blastdbs/")
	
	
	gene_fp = HTSeq.FastaReader(geneFile)
	geneDict = {}
	alleleI = 1
	inverted=False
	orderedAlleleNames=[]
	biggestAllelelen=0
	smallestAllelelen=99999
	for allele in gene_fp:
		if allele.seq in geneDict:
			print "\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n", geneFile
		else:
			if len(allele.seq)>biggestAllelelen:
				biggestAllelelen=len(allele.seq)
			if len(allele.seq)<smallestAllelelen:
				smallestAllelelen=len(allele.seq)
			orderedAlleleNames.append(str(alleleI))
			geneDict[ allele.seq ] = alleleI
			alleleI += 1

	# --- make 1st blast DB --- #

	geneF = os.path.basename(geneFile)
	blast_out_file = os.path.dirname(geneFile)+"/blastdbs/"+geneF + '.xml'

	# list of results - the output of the function
	resultsList = []
	i = 0
	perfectMatchIdAllele=[]
	perfectMatchIdAllele2=[]
	genomeDict = {}
	genome=-1	
	print genomesList
	for genomeFile in genomesList:
		print "_______________________________________________________"

		printinfo(genomeFile,geneFile)
		
		g_fp = HTSeq.FastaReader( genomeFile )
		for contig in g_fp:
			sequence=str(contig.seq)
			genomeDict[ contig.name ] = sequence
		
		currentGenomeDict = genomeDict
		
		genome+=1
		
		print ("Blasting alleles on genome at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
		
		blast_out_file = os.path.join(basepath,"blastdbs/"+os.path.basename(geneFile)+ '_List.xml')

		Gene_Blast_DB_name = os.path.join(temppath,str(os.path.basename(genomeFile))+"/"+str(os.path.basename(genomeFile))+"_db")

		
		cline = NcbiblastnCommandline(query=geneFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)

			
		blast_records = runBlastParser(cline, blast_out_file, geneFile)
		print ("Blasted alleles on genome at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

		# ------ DETERMINING BEST MATCH ------ #

		bestMatch = ['','', 0]
		bestMatchContig=''
		bestMatchContigLen=''
		bestalignlen=0
		perfectMatch=False
		bmAlleleLen2=0
		bmAllele=''

		for blast_record in blast_records:
		
			
			if perfectMatch==True:
				break
			try:

				hspC = blast_record.alignments[0]
				
				if bestMatch[0] == '' and bestMatch[1] == '':
					bestMatch[0] = blast_record.query
					bestMatch[1] = hspC
			except IndexError:
				continue


			# --- the contig tag is used in the progigal function --- #

			contigTag = blast_record.query
			
	
			j=0
			for l in contigTag:
				if l == ' ':
					break
				j+=1

			contigTag = contigTag[:j]

			contigLen = blast_record.query_letters
			
			# --- iterating over all the results to determine the best match --- #
			for alignment in blast_record.alignments:
				contigTag = alignment.hit_def
				contigTag=(contigTag.split(" "))[0]

				index=orderedAlleleNames.index(str(blast_record.query_id).split("_")[1])
				
				#print alignment.hit_def
				for k, v in geneDict.iteritems():
					if v == index+1:
						bmAlleleLen2= len(k)
					
				if perfectMatch:
					break
				for match in alignment.hsps:

					scoreRatio = float(match.score) / float(bmAlleleLen2)
					


					#if # of identities is the same as the length of the allele and it has no gaps or N's
					if (int(match.identities)==int(bmAlleleLen2) and int(match.identities)==int(len(match.query)) and "N" not in match.sbjct and "K" not in match.sbjct and "Y" not in match.sbjct and "R" not in match.sbjct ): 
						
						index=orderedAlleleNames.index(str(blast_record.query_id).split("_")[1])
						for seq, alleleid in geneDict.iteritems():
							if alleleid == index+1:
								bmAllele=seq
								break
						bmAlleleLen= len(bmAllele)
						
						lenratio=float(len(match.sbjct))/float(bmAlleleLen)
						bestMatch = [blast_record.query, match, scoreRatio, blast_record.query_id,lenratio,bmAlleleLen]
						bestMatchContig=contigTag
						perfectMatch=True
						index=orderedAlleleNames.index(str(blast_record.query_id).split("_")[1])
						bmAlleleLen= len(geneDict.keys()[index])
						break
						
					#choose the match with the best score ratio score/length of allele	
					elif scoreRatio > bestMatch[2]:
						index=orderedAlleleNames.index(str(blast_record.query_id).split("_")[1])
						for seq, alleleid in geneDict.iteritems():
							if alleleid == index+1:
								bmAllele=seq
								break
						bmAlleleLen= len(bmAllele)
						lenratio=float(len(match.sbjct))/float(bmAlleleLen)
						bestMatch = [blast_record.query, match, scoreRatio, blast_record.query_id,lenratio,bmAlleleLen]
						bestMatchContig=contigTag
						bestMatchContigLen=len(currentGenomeDict[contigTag])
						print contigTag
						bestalignlen=alignment.length
						
						
					
					if perfectMatch==True:
						break


		# ---------- ALLELE CALLING AFTER DETERMINING BEST MATCH ---------- #
		print ("Finished choosing best match at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
		
		
		try:

			match = bestMatch[1]
			bestMatchStart=match.sbjct_start
			bestMatchEnd=match.sbjct_end
			if match.query_start > match.query_end:
				bestMatchEnd=match.sbjct_start
				bestMatchStart=match.sbjct_end
			

			geneLen = bestMatch[5]
			alleleStr = match.sbjct
			nIdentities = match.identities
			idPercent = float(nIdentities) / float(geneLen)
			scoreRatio = bestMatch[2]
			lenRatio = bestMatch[4]

		
		except:
			#if no best match was found
			
			###################
			# LOCUS NOT FOUND #
			###################
			
			resultsList.append('LNF3:-1')
			perfectMatchIdAllele.append('LNF')
			perfectMatchIdAllele2.append('LNF')
			
			print "Locus not found, no matches \n"
			continue
		
		print "is perfect match true?" +str(perfectMatch)
		if perfectMatch is True:
			
			#if a perfect match was found (DNA sequence is the same)
			

			try:
				alleleNumber = geneDict[ alleleStr ]
			except:
				alleleStr=reverseComplement(alleleStr)
				alleleNumber = geneDict[ alleleStr ]
			
					################################################
					# EXACT MATCH --- MATCH == GENE --- GENE FOUND #
					################################################
			if "_" in bestMatch[3]:
				a=bestMatch[3].split("_")
				perfectMatchIdAllele.append(a[1])
				perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
			else:
				perfectMatchIdAllele.append(bestMatch[3])
				perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
			resultsList.append('EXC:' + str(alleleNumber) )
			printinfo(genomeFile,geneFile)
			print "Exact match \n"
			continue
						
			

		elif bestMatch[0] != '' and perfectMatch is not True:
						
			#if a best match was found but it's not an exact match	

					###########################
					# LOCUS ON THE CONTIG TIP #
					###########################
			
			#check if the match is on the tip of the contig
			
			if bestMatchContigLen <= geneLen:
				

				
				resultsList.append('LOTSC:-1')
				perfectMatchIdAllele.append('LOTSC')
				perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
				printinfo(genomeFile,geneFile)
				print "Locus is bigger than the contig \n"
			
			
			elif match.sbjct_start ==1 and len(match.query) < geneLen:
			
				resultsList.append('LOT5:-1')
				perfectMatchIdAllele.append('LOT5')
				perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
				printinfo(genomeFile,geneFile)
				
				print "Locus is on the 5' tip of the contig \n"
			
			elif match.sbjct_end ==1 and len(match.query) < geneLen and match.sbjct_start > match.sbjct_end:
			
				resultsList.append('LOT3:-1')
				perfectMatchIdAllele.append('LOT3')
				perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
				printinfo(genomeFile,geneFile)
				
				print "Locus is on the 3' tip of the contig \n"
			
			
			
			elif match.sbjct_end == bestMatchContigLen and len(match.query) < bestMatchContigLen:
				
				resultsList.append('LOT3:-1')
				perfectMatchIdAllele.append('LOT3')
				perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
				printinfo(genomeFile,geneFile)

				print "Locus is on the 3' tip of the contig \n"
			
			elif match.sbjct_start == bestMatchContigLen and len(match.query) < bestMatchContigLen and match.sbjct_start > match.sbjct_end:
				
				resultsList.append('LOT5:-1')
				perfectMatchIdAllele.append('LOT5')
				perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
				printinfo(genomeFile,geneFile)

				print "Locus is on the 5' tip of the contig \n"

				

			elif 'N' in alleleStr or "K" in alleleStr or "R" in alleleStr or "Y" in alleleStr:
				
					#####################
					# ALLELE NOT FOUND  #		
					#####################
				
				# strange base found!
				
				geneFile2= os.path.splitext(geneFile)[0] + "LNFN.fasta"

				with open(geneFile2, 'a') as f:
					f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+"\n")
					f.write((alleleStr) +"\n")
				resultsList.append('LNFN:-1')
				perfectMatchIdAllele.append('LNFN')
				perfectMatchIdAllele2.append('LNFN')
				printinfo(genomeFile,geneFile) 
				print "LNFN, contains strange (N,K,R) bases! \n"
			
			
			
			else:
				
				print "new allele?"
					
				alleleStr = alleleStr.replace('-', '')
				lenExtraThresh=int(biggestAllelelen*0.2)

				#else: #check if best match without gaps are contained inside an already defined allele

				isContainedDefinedAllele = False	
				definedAllele=''
				definedAlleleName=''

				for k in geneDict.keys():
					if alleleStr in k:
						definedAllele=k
						isContainedDefinedAllele = True
						definedAlleleName=geneDict.get(k)
						break
				print "is contained? " + str(isContainedDefinedAllele)
				print idPercent
				print geneLen
				print lenExtraThresh
				print lenRatio
				
				if isContainedDefinedAllele  and int(len(match.sbjct))<=int(len(definedAllele))+lenExtraThresh and int(len(match.sbjct))>=int(len(definedAllele))-lenExtraThresh :
					#allele without gaps is contained in a defined allele
					#best match with gaps has same size +1/-1 base as the defined allele
					
					isnewallele=False
						
					if int(len(alleleStr))==int(len(definedAllele)): # if match without gaps has same size as the defined allele 
						tagAux = 'NA1:'
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("NA1-"+str(alleleI))
						perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
						isnewallele=True
						
					elif int(len(alleleStr))==int(len(definedAllele))-1 : # if match without gaps has minus one base than the defined allele
						
						tagAux = 'NA2:'
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("NA2-"+str(alleleI))
						perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
						isnewallele=True
						
					else:
							extraleft=0
							extraright=0
							tS=0
							tE=0

							handle = open(genomeFile, "rU")
							record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
							handle.close()
							record= record_dict[bestMatchContig]
							
							
							# if match without gaps has more than one base missing comparing to the defined allele 
							if (1<int(match.query_start) and 1<int(match.query_end)):
								
								if match.query_start > match.query_end:
									extraleft=match.query_end-1
									
								else:
									extraleft=match.query_start-1
							
							print 	extraleft, 	extraright		
									
									
							# if 3' tip bases of the allele are missing on the match	
							if (int(geneLen)>int(match.query_start) and int(geneLen)>int(match.query_end) ): 
								
								if match.query_start > match.query_end:
									extraright=geneLen-match.query_start
									
								else:
									extraright=geneLen-match.query_end
									
							print 	extraleft, 	extraright
							
							
							if match.sbjct_start > match.sbjct_end:
								tE=match.sbjct_start+extraleft
								tS=match.sbjct_end-extraright-1
								alleleStr=str(record.seq[tS:tE])
								alleleStr = reverseComplement(alleleStr)
							else:
								tS=match.sbjct_start-extraleft-1
								tE=match.sbjct_end+extraright
								alleleStr=str(record.seq[tS:tE])
							
							print tS
							print tE
							print "allele is:"
							print alleleStr
							
							if tE> bestMatchContigLen:
								resultsList.append('LOT3B:-1')
								perfectMatchIdAllele.append('LOT3B')
								perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(tS)+"-"+str(bestMatchContigLen)+"&"+"+")
								printinfo(genomeFile,geneFile)
								
								print "Locus is on the 3B' tip of the contig \n"
							
							elif tS<0:
								resultsList.append('LOT5B:-1')
								perfectMatchIdAllele.append('LOT5B')
								perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(0)+"-"+str(tE)+"&"+"+")
								printinfo(genomeFile,geneFile)
								
								print "Locus is on the 5B' tip of the contig \n"
						
						
							else:
						
								tagAux = 'NA4:'
								printinfo(genomeFile,geneFile) 
								perfectMatchIdAllele.append("NA4-"+str(alleleI))
								perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(tS)+"-"+str(tE)+"&"+"+")
								isnewallele=True
						
					
					if isnewallele:
						print "New allele found! Adding allele "+ tagAux + str(alleleI) +" to the database"
						geneDict[alleleStr] = alleleI
							
						resultsList.append( tagAux + str(alleleI) )
							
						orderedAlleleNames.append(str(alleleI))						
						# --- add the new allele to the gene fasta --- #
							
						fG = open( geneFile, 'a' )
						fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)) + '\n')
						fG.write( alleleStr + '\n')
						fG.close()
						alleleI += 1
					
				#if best match is not contained in an already defined allele, check if it has similar size with the match allele and has 0.8 similarity
				
				elif not isContainedDefinedAllele and idPercent >= 0.8 and int(len(match.sbjct))<=int(geneLen)+lenExtraThresh and int(len(match.sbjct))>=int(geneLen)-lenExtraThresh :

					ratio=float(len(alleleStr)) / float(geneLen)
					
					if ratio>=0.8 and ratio<=1.2: # if match without gaps has same size as the best match allele and 80%similarity
						
						tagAux = ''
						extraleft=0
						extraright=0
						tS=0
						tE=0

						handle = open(genomeFile, "rU")
						record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
						handle.close()
						record= record_dict[bestMatchContig]

						#if len(match.sbjct)<geneLen and "-" not in match.sbjct:  #if the allele is not fully used against the match, compensate the tips
						try:
							print match
							if (1<int(match.query_start) and 1<int(match.query_end)):
								
								if match.query_start > match.query_end:
									extraleft=match.query_end-1
									
								else:
									extraleft=match.query_start-1
							
							print 	extraleft, 	extraright		
									
							if (int(geneLen)>int(match.query_start) and int(geneLen)>int(match.query_end) ): # if 3' tip bases of the allele are missing on the match
								
								
								if match.query_start > match.query_end:
									extraright=geneLen-match.query_start
									
								else:
									extraright=geneLen-match.query_end
									
							print 	extraleft, 	extraright
							
							
							if match.sbjct_start > match.sbjct_end:
								tE=match.sbjct_start+extraleft
								tS=match.sbjct_end-extraright-1
								alleleStr=str(record.seq[tS:tE])
								alleleStr = reverseComplement(alleleStr)
							else:
								tS=match.sbjct_start-extraleft-1
								tE=match.sbjct_end+extraright
								alleleStr=str(record.seq[tS:tE])
							
							print tS
							print tE
							print "allele is:"
							print alleleStr
							
							if tE> bestMatchContigLen:
								resultsList.append('LOT3B:-1')
								perfectMatchIdAllele.append('LOT3B')
								perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(tS)+"-"+str(bestMatchContigLen)+"&"+"+")
								printinfo(genomeFile,geneFile)
								
								print "Locus is on the 3B' tip of the contig \n"
							
							elif tS<0:
								resultsList.append('LOT5B:-1')
								perfectMatchIdAllele.append('LOT5B')
								perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(0)+"-"+str(tE)+"&"+"+")
								printinfo(genomeFile,geneFile)
								
								print "Locus is on the 5B' tip of the contig \n"
							
							else:
								tagAux = 'NA5:'
								printinfo(genomeFile,geneFile) 
								perfectMatchIdAllele.append("NA5-"+str(alleleI))
								perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(tS)+"-"+str(tE)+"&"+"+")
									
								print "New allele found! Adding allele "+ tagAux + str(alleleI) +" to the database"
								geneDict[alleleStr] = alleleI
									
								resultsList.append( tagAux + str(alleleI) )
									
								#orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)))									
								orderedAlleleNames.append(str(alleleI))
								# --- add the new allele to the gene fasta --- #
									
								fG = open( geneFile, 'a' )
								fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)) + '\n')
								fG.write( alleleStr + '\n')
								fG.close()
								alleleI += 1

						except Exception as e:
							##################
							# LNF WTF #
							##################
							print e
							geneFile2= os.path.splitext(geneFile)[0] + "LNF3.fasta"
							print geneFile2
							with open(geneFile2, 'a') as f:
								f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
								f.write((alleleStr) +"\n")
								f.write(">Allele\n")
								f.write((bmAllele)+"\n")
							resultsList.append('LNF3')
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append("LNF3")
							perfectMatchIdAllele2.append("LNF3")
							print "No allele found"
					else:
						##################
						# LNF WTF2 #
						##################
						geneFile2= os.path.splitext(geneFile)[0] + "LNF4.fasta"
						print geneFile2
						with open(geneFile2, 'a') as f:
							f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
							f.write((alleleStr) +"\n")
							f.write(">Allele\n")
							f.write((bmAllele)+"\n")
						resultsList.append('LNF4')
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("LNF4")
						perfectMatchIdAllele2.append("LNF4")
						print "No allele found"
					
					
					
					
					
						
				elif isContainedDefinedAllele:
						####################
						# UNDEFINED ALLELE #		# it is contained in another allele
						####################
						
					alleleStr=match.query

					resultsList.append('UND:-1')
					perfectMatchIdAllele.append("undefined allele")
					perfectMatchIdAllele2.append("undefined allele")
					printinfo(genomeFile,geneFile) 
					print "Undefined allele \n"
					
					geneFile2= os.path.splitext(geneFile)[0] + "undefined.fasta"
					print geneFile2
					"""with open(geneFile2, 'a') as f:
						f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
						f.write((alleleStr) +"\n")
						#f.write(">BlastBestMatch"+str(definedAlleleName)+"\n")
						#f.write((alleleStr)+"\n")
						f.write(">Allele"+str(definedAlleleName)+"\n")
						f.write((definedAllele)+"\n")"""
					
				
							
				elif lenRatio < 0.5:
						
					###############
					# SMALL MATCH #
					###############
								
					resultsList.append('SAC:-1')		# don't know what 'SAC' stands for
					perfectMatchIdAllele.append('small match')
					perfectMatchIdAllele2.append('small match')
					printinfo(genomeFile,geneFile) 
					print "lower than 50% match \n"	
							
				elif lenRatio < 0.8 and idPercent < 0.5:
					
					#####################
					# INCOMPLETE ALLELE #		# it was not possible to extend it to at least 80% of the length of the gene
					#####################
					resultsList.append('INC:-1')
					perfectMatchIdAllele.append('allele incomplete')
					perfectMatchIdAllele2.append('allele incomplete')
					printinfo(genomeFile,geneFile)
					print "Incomplete allele\n"
						
				else:	
					##################
					# LNF WTFFF #
					##################
					geneFile2= os.path.splitext(geneFile)[0] + "LNF.fasta"
					print geneFile2
					with open(geneFile2, 'a') as f:
						f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
						f.write((alleleStr) +"\n")
						f.write(">Allele\n")
						f.write((bmAllele)+"\n")
					resultsList.append('LNF5')
					printinfo(genomeFile,geneFile) 
					perfectMatchIdAllele.append("LNF5")
					perfectMatchIdAllele2.append("LNF5")
					print "Locus not found"
						
							
	final =	(resultsList,perfectMatchIdAllele)	

	print ("Finished allele calling at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	filepath=os.path.join(temppath , os.path.basename(geneFile)+"_result.txt")
	filepath2=os.path.join(temppath , os.path.basename(geneFile)+"_result2.txt")

	with open(filepath, 'wb') as f:
		pickle.dump(final, f)
	with open(filepath2, 'wb') as f:
		pickle.dump(perfectMatchIdAllele2, f)
	return True

Example #27

Show file

File: callAlleles_protein.py Project: mickaelsilva/pythonscripts

def main():
	print ("Starting script at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	try:
		input_file = sys.argv[1]
		temppath = sys.argv[2]
	except IndexError:
		print "usage: list_pickle_obj"

	argumentList=[]
	with open(input_file,'rb') as f:
		argumentList = pickle.load(f)
	
	geneFile = argumentList[0]
	genomesList = argumentList[1]
	#listOfCDSDicts = argumentList[2]
	
	basepath=os.path.join(temppath,os.path.splitext(geneFile)[0])
	if not os.path.exists(basepath):
			os.makedirs(basepath)
	#print geneFile
	gene_fp = HTSeq.FastaReader(geneFile)
	alleleI = 0
	#inverted=False
	#orderedAlleleNames=[]
	resultsList = []
	i = 0
	perfectMatchIdAllele=[]
	perfectMatchIdAllele2=[]
	allelescores=[]
	
	print ("Getting BSR at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	alleleI,allelescores,Gene_Blast_DB_name,alleleList=getBlastScoreRatios(geneFile,basepath)
	print ("Finished BSR at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	genome=-1	
	
	genomeDict = {}
	print ("starting allele call at: "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	for genomeFile in genomesList:
		print genomeFile
		bestmatch=[0,0,False,'',0] #score, score ratio, perfectmatch, key name of the DNA sequence string, allele ID
		currentGenomeDict={}
		currentCDSDict={}
		#currentCDSDict = listOfCDSDicts[i]
		
		filepath=os.path.join(temppath,str(os.path.basename(genomeFile))+"_ORF_Protein.txt")
		with open(filepath,'rb') as f:
			currentCDSDict = pickle.load(f)
		
		g_fp = HTSeq.FastaReader( genomeFile )
		for contig in g_fp:
			sequence=str(contig.seq)
			genomeDict[ contig.name ] = sequence
		
		currentGenomeDict = genomeDict
		#print currentGenomeDict
		#alleleI = 0
		#alleleProt=''
		#for allele in gene_fp: #new db for each allele to blast it against himself
		#	alleleI+=1
		#	alleleProt+=">"+str(alleleI)+"\n"+str(translateSeq(allele.seq)+"\n")
		#basepath="./blastdbs/temp"+str(os.path.basename(geneFile))
		#if not os.path.exists(basepath):
		#	os.makedirs(basepath)
		#with open(basepath+'/protein.fasta', "wb") as f:
		#	f.write(alleleProt)
		#Gene_Blast_DB_name = Create_Blastdb( basepath+'/protein.fasta', 1, True )
		genome+=1
		listOfCDS=currentCDSDict
		genomeProteinfastaPath=os.path.join(temppath,str(os.path.basename(genomeFile)+'_Protein.fasta'))
		
		
		print ("Blasting alleles on genome at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
		
		blast_out_file = os.path.join(basepath,"blastdbs/"+os.path.basename(geneFile)+ '_List.xml')

		#with open(basepath+'/proteinList.fasta', "wb") as f:
		#	f.write(protList)
		#Gene_Blast_DB_name = Create_Blastdb( './temp/proteinList.fasta', 1, True )
		cline = NcbiblastpCommandline(query=genomeProteinfastaPath, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
		#print cline
		#try:
		
		print ("Parse bsr blast at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
		
		blast_records = runBlastParser(cline, blast_out_file, genomeProteinfastaPath)
		#except:
		#	cline = NcbiblastpCommandline(query=genomeProteinfastaPath, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
		
		print ("Blasted alleles on genome at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
		
		for blast_record in blast_records:
				
			
			for alignment in blast_record.alignments:
				
				for match in alignment.hsps:
					#print blast_record.query
					#print match
					#print alleleI, len(allelescores)
					scoreRatio=float(match.score)/float(allelescores[int(alignment.hit_def)-1])
					#print scoreRatio
					#print alignment.hit_def
					cdsStrName=blast_record.query
					DNAstr=listOfCDS[">"+cdsStrName]

					AlleleDNAstr=alleleList[int(alignment.hit_def)-1]
					compare=False
					if DNAstr==AlleleDNAstr is False:
						try:
							DNAstr=reverseComplement(DNAstr)
							if DNAstr==AlleleDNAstr is False:
								pass
							else:
								compare=True
						except:
							pass
					else:
						compare=True
						
					if "N" in DNAstr or "K" in DNAstr or "R" in DNAstr:
						pass
						
					elif(scoreRatio == 1 and bestmatch[2] is False and compare is True):
						bestmatch=[match.score,scoreRatio,True,cdsStrName,int(alignment.hit_def),match,len(AlleleDNAstr),blast_record.query_letters]
						#print alignment
						#print match
					elif(scoreRatio == 1 and match.score>bestmatch[0] and compare is True):
						bestmatch=[match.score,scoreRatio,True,cdsStrName,int(alignment.hit_def),match,len(AlleleDNAstr),blast_record.query_letters]
						#print match
					elif(scoreRatio == 1 and bestmatch[2] is False and compare is False):
						bestmatch=[match.score,scoreRatio,False,cdsStrName,int(alignment.hit_def),match,len(AlleleDNAstr),blast_record.query_letters]
						#print alignment
						#print match
					
					elif(scoreRatio == 1 and match.score>bestmatch[0] and compare is False):
						bestmatch=[match.score,scoreRatio,False,cdsStrName,int(alignment.hit_def),match,len(AlleleDNAstr),blast_record.query_letters]
						#print match
					elif(match.score>bestmatch[0] and scoreRatio>0.6 and scoreRatio>bestmatch[1] and bestmatch[2] is False):
						#print match.query
						#print match.sbjct
						#print allelescores
						bestmatch=[match.score,scoreRatio,False,cdsStrName,int(alignment.hit_def),match,len(AlleleDNAstr),blast_record.query_letters]
						#print match
		#print bestmatch
		
		print ("Classifying the match at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))		
		if bestmatch[0]==0 or "N" in AlleleDNAstr or "K" in AlleleDNAstr or "R" in AlleleDNAstr :
					#if no best match was found
					
					###################
					# LOCUS NOT FOUND #
					###################
			if 	bestmatch[0]==0:		
				resultsList.append('LNF3:-1')            # append result to the list of results
				perfectMatchIdAllele.append('LNF')
				perfectMatchIdAllele2.append('LNF')
				#printinfo(genomeFile,geneFile)
				print "Locus not found, no matches \n"
			else:
				resultsList.append('LNFN:-1')            # append result to the list of results
				perfectMatchIdAllele.append('LNF')
				perfectMatchIdAllele2.append('LNF')
				#printinfo(genomeFile,geneFile)
				print "Locus has strange base (N, K or R) \n"
		
		
		elif bestmatch[2] is True:
			contigname=bestmatch[3]	
			
			contigname=contigname.split("&")
			matchLocation=contigname[2]	
			#matchLocation=matchLocation.split("-")
			contigname=contigname[0]	
			
			alleleStr=listOfCDS[">"+bestmatch[3]]
			protSeq,alleleStr,Reversed=translateSeq(alleleStr)
					#if a perfect match was found
					
						################################################
						# EXACT MATCH --- MATCH == GENE --- GENE FOUND #
						################################################
					
			perfectMatchIdAllele.append(str(bestmatch[4]))
			if not Reversed:
				perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation)+"&"+"+")
			else:
				perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation)+"&"+"-")
			resultsList.append('EXC:' + str(bestmatch[4]) )
			
		else:
			
			match=bestmatch[5]
			#print match
			geneLen=bestmatch[6]

			contigname=bestmatch[3]	
			#print contigname
			
			contigname=contigname.split("&")
			matchLocation=contigname[2]	
			matchLocation=matchLocation.split("-")
			contigname=contigname[0]
			
			seq=currentGenomeDict[ contigname ]
			bestMatchContigLen=len(seq)
			
			alleleStr=listOfCDS[">"+bestmatch[3]]
			protSeq,alleleStr,Reversed=translateSeq(alleleStr)
			
			
			print match
			print matchLocation
			print bestMatchContigLen
			# get extra space to the right and left between the contig and match 
			rightmatchContig=bestMatchContigLen-int(matchLocation[1])	
			leftmatchContig=int(matchLocation[0])
			
			if Reversed:
				aux=rightmatchContig
				rightmatchContig=leftmatchContig
				leftmatchContig=aux
			"""else:
				rightmatchContig=bestMatchContigLen-int(matchLocation[0])	
				leftmatchContig=int(matchLocation[1])"""
			
			print rightmatchContig,leftmatchContig
			
			
			# get extra space to the right and left between the allele and match
			
			rightmatchAllele=geneLen-(int(match.sbjct_end)*3)	
			leftmatchAllele=(int(match.sbjct_start)*3)
			"""if Reversed: 
				aux=rightmatchAllele
				rightmatchAllele=leftmatchAllele
				leftmatchAllele=aux"""
				
			"""else:
				rightmatchAllele=geneLen-(int(match.sbjct_start	)*3)
				leftmatchAllele=(int(match.sbjct_end)*3)"""
			
			#if a best match was found but it's not an exact match	

					###########################
					# LOCUS ON THE CONTIG TIP #
					###########################
			
			print rightmatchAllele, leftmatchAllele
			print geneLen
			
			
			
			#if bestMatchContigLen <= geneLen:
			if leftmatchContig<leftmatchAllele and 	rightmatchContig < rightmatchAllele:
			
				resultsList.append('LOTSC:-1')
				perfectMatchIdAllele.append('LOTSC')
				perfectMatchIdAllele2.append('LOTSC')
				#printinfo(genomeFile,geneFile)
				#print match.query_start
				print "Locus is bigger than the contig \n"
			
			#if match.query_start ==1 and len(match.query) < geneLen:		
			elif leftmatchContig<leftmatchAllele:
				
				
				resultsList.append('LOT3:-1')
				perfectMatchIdAllele.append('LOT3')
				perfectMatchIdAllele2.append('LOT3')
				
				print "Locus is on the 3' tip of the contig \n"
			
			
			#elif match.query_end == bestMatchContigLen and len(match.query) < geneLen:
			elif 	rightmatchContig < rightmatchAllele:
				
				resultsList.append('LOT5:-1')
				perfectMatchIdAllele.append('LOT5')
				perfectMatchIdAllele2.append('LOT5')

				print "Locus is on the 5' tip of the contig \n"
			
			
						
				
		
				
			else:
						#######################
						# ADD INFERRED ALLELE #		# a new allele 
						#######################
						
												
					#print "infered allele has location : "+(CDSType)
					#printinfo(genomeFile,geneFile) 
				tagAux='INF'
				perfectMatchIdAllele.append( tagAux +"-"+str(alleleI+1))
				#perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1]))
				
				if not Reversed:
					perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+")
				else:
					perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-")
				
				
				print "New allele! Adding allele "+ tagAux + str(alleleI+1) +" to the database\n"
																					
				resultsList.append( tagAux + str(alleleI+1) )

						#orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] +"_" +str(os.path.basename(genomeFile)))	
											# --- add the new allele to the gene fasta --- #
				
				
				appendAllele='>allele_' + str(alleleI+1) + '_' + tagAux[:-1] +"_" + str(os.path.basename(genomesList[genome])) + '\n'
				fG = open( geneFile, 'a' )
				#fG.write('>allele_' + str(alleleI+1) + '_' + tagAux[:-1] +"_" + str(os.path.basename(genomesList[genome])) + '\n')
				fG.write(appendAllele)
						#print alleleStr
					
				
				#print listOfCDS
				#alleleStr=listOfCDS[">"+bestmatch[3]]
				#match=bestmatch[5]
				#reverse the order if needed
				#if match.sbjct_start > match.sbjct_end: 
				#	alleleStr = reverseComplement(alleleStr)
				fG.write( alleleStr + '\n')
				fG.close()
				
				fG = open( os.path.join(basepath,str(os.path.basename(geneFile)+'_protein2.fasta')), 'w' )
				#fG.write('>allele_' + str(alleleI+1) + '_' + tagAux[:-1] +"_" + str(os.path.basename(genomesList[genome])) + '\n')
				fG.write('>'+str(alleleI+1)+'\n'+str(protSeq) + '\n')
				fG.close()
				fG = open( os.path.join(basepath,str(os.path.basename(geneFile)+'_protein.fasta')), 'a' )
				#fG.write('>allele_' + str(alleleI+1) + '_' + tagAux[:-1] +"_" + str(os.path.basename(genomesList[genome])) + '\n')
				fG.write('>'+str(alleleI+1)+'\n'+str(protSeq) + '\n')
						#print alleleStr
				fG.close()	
				
				#print listOfCDS
				#alleleStr=listOfCDS[">"+bestmatch[3]]
				match=bestmatch[5]
				
						#alleleI += 1
						# --- remake blast DB --- #
				alleleList.append(alleleStr)
				Gene_Blast_DB_name = Create_Blastdb( os.path.join(basepath,str(os.path.basename(geneFile)+'_protein.fasta')), 1, True )
				print os.path.join(basepath,str(os.path.basename(geneFile)+'_protein.fasta'))
				genefile2= os.path.join(basepath,str(os.path.basename(geneFile)+'_protein2.fasta'))
				Gene_Blast_DB_name2 = Create_Blastdb( genefile2, 1, True )
				print ("Re-calculating BSR at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
				alleleI,allelescores,alleleList=reDogetBlastScoreRatios(genefile2,basepath,alleleI,allelescores,Gene_Blast_DB_name2,alleleList)
				print allelescores
				print ("Done Re-calculating BSR at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	#x=y
	#shutil.rmtree(basepath)

	
	final =	(resultsList,perfectMatchIdAllele)	
	#return (resultsList)
	print ("Finished allele calling at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	filepath=os.path.join(temppath , os.path.basename(geneFile)+"_result.txt")
	filepath2=os.path.join(temppath , os.path.basename(geneFile)+"_result2.txt")
	#print filepath
	with open(filepath, 'wb') as f:
		pickle.dump(final, f)
	with open(filepath2, 'wb') as f:
		pickle.dump(perfectMatchIdAllele2, f)
	shutil.rmtree(basepath)
	return True

Example #28

Show file

File: callAlleles_protein3.py Project: ODiogoSilva/chewBBACA

def getBlastScoreRatios(genefile, basepath, doAll, verbose, blastPath):
    """Collect the self-BLAST score of every allele of one locus.

    Each record of ``genefile`` is translated with ``translateSeq``.  Every
    allele that translates is written on its own to
    ``<basename>_protein2.fasta`` inside ``basepath`` and turned into a BLAST
    database with ``Create_Blastdb``.  When ``doAll`` is true the protein is
    then BLASTed against that single-allele database and its score is
    recorded.  All translated alleles are finally written together to
    ``<basename>_protein.fasta`` inside ``basepath``.

    Args:
        genefile: path of the FASTA file holding the alleles of the locus.
            The allele number is read from the last ``_``-separated field
            of each record id.
        basepath: working directory for the protein FASTA files; the BLAST
            XML output goes to ``blastdbs/temp.xml`` below it.
        doAll: when true, run BLAST and save the scores to
            ``<genefile>_bsr.txt`` (pickle); when false, load the scores
            from that pickle instead.
        verbose: when true, progress messages are printed.
        blastPath: value passed as ``cmd`` to ``NcbiblastpCommandline``.

    Returns:
        A tuple ``(scores, alleleList, listAllelesNames)`` where ``scores``
        maps allele number to score, ``alleleList`` holds the DNA sequence
        of every allele as ``str`` and ``listAllelesNames`` the record ids,
        both in file order.
    """
    if verbose:

        def verboseprint(*args):
            print(*args)
    else:

        def verboseprint(*args):
            pass  # do-nothing function

    geneScorePickle = os.path.abspath(genefile) + '_bsr.txt'
    singleProteinPath = os.path.join(
        basepath, str(os.path.basename(genefile) + '_protein2.fasta'))
    allProteinsPath = os.path.join(
        basepath, str(os.path.basename(genefile) + '_protein.fasta'))

    if doAll:
        var = {}
    else:
        # bsr had already been calculated, load it to memory (once, not
        # once per allele)
        with open(geneScorePickle, 'rb') as f:
            var = pickle.load(f)

    proteinRecords = []
    alleleList = []
    listAllelesNames = []

    for allele in SeqIO.parse(genefile, "fasta", generic_dna):

        # usually first allele name is just >1 and after that it has
        # >gene_id_genome; in both cases the number is the last field
        alleleI = int(allele.id.split("_")[-1])

        alleleList.append(str(allele.seq))
        listAllelesNames.append(allele.id)
        translatedSequence, x, y = translateSeq(allele.seq)

        if translatedSequence == '':
            print("cannot translate allele on bsr calculation")
            continue

        alleleProt = ">" + str(alleleI) + "\n" + str(translatedSequence) + "\n"
        proteinRecords.append(alleleProt)

        # new db for each allele to blast it against himself
        with open(singleProteinPath, "w") as f:
            f.write(alleleProt)
        Gene_Blast_DB_name = Create_Blastdb(singleProteinPath, 1, True)

        if not doAll:
            continue

        blast_out_file = os.path.join(basepath, 'blastdbs/temp.xml')
        verboseprint("Starting Blast alleles at : " +
                     time.strftime("%H:%M:%S-%d/%m/%Y"))

        # --- get BLAST score ratio --- #
        cline = NcbiblastpCommandline(cmd=blastPath,
                                      query=singleProteinPath,
                                      db=Gene_Blast_DB_name,
                                      evalue=0.001,
                                      out=blast_out_file,
                                      outfmt=5,
                                      num_threads=1)

        blast_records = runBlastParser(cline, blast_out_file)

        verboseprint("Blasted alleles at : " +
                     time.strftime("%H:%M:%S-%d/%m/%Y"))

        hspScores = [
            int(match.score)
            for blast_record in blast_records
            for alignment in blast_record.alignments
            for match in alignment.hsps
        ]

        # Key the score by the allele it belongs to.  Pairing two parallel
        # lists afterwards shifts every score as soon as one allele is
        # untranslatable or a hit reports a different number of HSPs.
        if hspScores:
            var[alleleI] = max(hspScores)

        verboseprint("________")

    if doAll:
        with open(geneScorePickle, 'wb') as f:
            pickle.dump(var, f)

    with open(allProteinsPath, "w") as f:
        f.write("".join(proteinRecords))

    # returning all allele BSR scores and list of alleles for this gene
    return var, alleleList, listAllelesNames

Example #29

Show file

File: CreateSchema.py Project: ODiogoSilva/chewBBACA

class _SkipBlastRecord(Exception):
    """Raised inside main() to stop processing the current BLAST record."""


def main():
    """Build a schema seed from an ffn file, dropping paralogous loci.

    Command-line options:
        -i        ffn file with the genes (required)
        -l        minimum length (required)
        --cpu     number of BLAST threads (1 when not given)
        -p        existing protein FASTA file; when given, the translation
                  and de-duplication steps are skipped
        -o        output directory (without -p) or output file (with -p)
        -b        BLAST executable, default ``blastp``
        --bsr     minimum BLAST score ratio, default 0.6
        -v        verbose output

    Steps:
        1. Without -p: translate every gene, drop the ones that do not
           translate, repeat an already seen protein or are shorter than
           -l, drop proteins contained in a longer one and write the rest
           to ``proteins.fasta`` next to the input file.
        2. BLAST the protein file against itself.
        3. Walk the BLAST records and fill ``genesToKeep`` / ``toRemove``.
        4. Write every kept gene longer than -l either to one FASTA file
           per locus or, with -p, to a single concatenated file.
        5. Remove the ``blastdbs`` directory and the BLAST XML output.
    """
    parser = argparse.ArgumentParser(
        description=
        "Given an ffn file, recovers the genes that are not paralogs and have a size bigger than the g parameter provided"
    )
    parser.add_argument('-i',
                        nargs='?',
                        type=str,
                        help='ffn file',
                        required=True)
    parser.add_argument('-l',
                        nargs='?',
                        type=int,
                        help='int minimum length',
                        required=True)
    parser.add_argument(
        '--cpu',
        nargs='?',
        type=int,
        help="Number of cpus, if over the maximum uses maximum -2",
        required=False)
    parser.add_argument('-p',
                        nargs='?',
                        type=str,
                        help="file with protein",
                        required=False,
                        default=False)
    parser.add_argument('-o',
                        nargs='?',
                        type=str,
                        help="output filename",
                        required=False,
                        default=False)
    parser.add_argument('-b',
                        nargs='?',
                        type=str,
                        help="BLAST full path",
                        required=False,
                        default='blastp')
    parser.add_argument('--bsr',
                        nargs='?',
                        type=float,
                        help="minimum BSR similarity",
                        required=False,
                        default=0.6)
    parser.add_argument("-v",
                        "--verbose",
                        help="increase output verbosity",
                        dest='verbose',
                        action="store_true",
                        default=False)

    args = parser.parse_args()
    genes = args.i
    sizethresh = args.l
    cpuToUse = args.cpu
    proteinFIlePath = args.p
    outputFIlePath = args.o
    BlastpPath = args.b
    bsr = args.bsr
    verbose = args.verbose

    if verbose:

        def verboseprint(*args):

            for arg in args:
                print(arg),
            print
    else:
        verboseprint = lambda *a: None

    starttime = "\nStarting Script at : " + time.strftime("%H:%M:%S-%d/%m/%Y")
    verboseprint(starttime)

    verboseprint("Checking Blast installed... " + str(which(BlastpPath)))

    # The protein file lives next to the input file.  os.path.dirname is
    # used instead of str.replace(filename, '') because the latter also
    # strips the file name out of any parent directory that contains it.
    abspath = os.path.dirname(os.path.abspath(genes))
    proteinfile = os.path.join(abspath, 'proteins.fasta')

    geneDict = {}  # gene name -> DNA sequence
    protDict = {}  # ">id\n" header -> protein sequence
    orderedprotDict = collections.OrderedDict()
    totalgenes = 0
    repeatedgenes = 0
    smallgenes = 0
    nottranslatable = 0

    verboseprint("Checking translatability of the loci:\n")

    if not proteinFIlePath:
        alreadyIn = set()  # protein sequences already written

        with open(proteinfile, "w") as f:

            for gene in SeqIO.parse(genes, "fasta", generic_dna):
                dnaseq = str(gene.seq)
                protseq, seq, y = translateSeq(dnaseq, gene.id)
                totalgenes += 1

                if not len(protseq) > 1:
                    nottranslatable += 1
                    continue

                if str(protseq) in alreadyIn:
                    repeatedgenes += 1

                elif len(str(seq)) < sizethresh:
                    smallgenes += 1

                else:
                    alreadyIn.add(str(protseq))
                    protname = ">" + str(gene.id) + "\n"

                    f.write(protname + str(protseq) + "\n")
                    protDict[protname] = str(protseq)
                    geneDict[str(gene.name)] = dnaseq

        verboseprint(
            str(nottranslatable) + " not translatable out of " +
            str(totalgenes))

        verboseprint("\nChecking if repeated protein sequences:\n")

        # longest proteins first, so that a shorter protein is always
        # compared against the longer ones already accepted
        for header, protein in sorted(protDict.items(),
                                      key=lambda x: len(x[1]),
                                      reverse=True):
            orderedprotDict[header] = protein

        verboseprint(
            str(repeatedgenes) + " repeated loci out of " + str(totalgenes))
        verboseprint(
            str(smallgenes) + " loci out of " + str(totalgenes) +
            " smaller than " + str(sizethresh) + "bp")
        verboseprint("\nprotein file created\n")

        # first step -  remove genes contained in other genes or 100% equal genes

        verboseprint(
            "Checking if protein sequences are contained in others...")

        # for each gene from all the annotated genes - starting with an empty dictionary, only add a new gene if the "to be added gene" is not contained or equal to a gene already added to the dictionary
        auxDict = {}  # protein sequence -> ">id\n" header
        auxprot = []
        g = 0

        for header, protein in orderedprotDict.items():
            prot = str(protein)
            if any(prot in x for x in auxprot):
                g += 1
            else:
                auxDict[protein] = header
                auxprot.append(prot)

        verboseprint(str(g) + " loci are contained in other genes\n")

        # overwrite the original file, obtaining a new file with unique genes
        with open(proteinfile, "w") as f:
            f.write("".join(header + protein + "\n"
                            for protein, header in auxDict.items()))

    else:
        # a protein file was supplied: only the DNA sequences are loaded
        proteinfile = proteinFIlePath
        for gene in SeqIO.parse(genes, "fasta", generic_dna):
            geneDict[str(gene.name)] = str(gene.seq)

    verboseprint("Starting Blast")

    geneFile = os.path.abspath(proteinfile)
    Gene_Blast_DB_name = Create_Blastdb(geneFile, 1, True)

    geneF = os.path.splitext(geneFile)[0]
    blast_out_file = geneF + '.xml'
    # ------------------------------ RUNNING BLAST ------------------------------ #
    cline = NcbiblastpCommandline(cmd=BlastpPath,
                                  query=geneFile,
                                  db=Gene_Blast_DB_name,
                                  evalue=0.001,
                                  out=blast_out_file,
                                  outfmt=5,
                                  num_threads=int(cpuToUse) if cpuToUse else 1)
    blast_records = runBlastParser(cline, blast_out_file)
    verboseprint("Finished blast")

    toRemove = []
    genesToKeep = []
    # reasons for every removal; only consumed by the disabled log writer
    # near the end of this function
    log = ["removed\tcause\texplanation"]
    for blast_record in blast_records:

        allelename = blast_record.query.split(" ")[0]
        alleleLength = len(geneDict[allelename])
        queryName = str(blast_record.query)

        # Any error raised while inspecting a record (no alignments, a hit
        # missing from geneDict, ...) only ends the handling of that
        # record; this best-effort behaviour is deliberate and kept.
        try:

            # if gene A is not on the toRemove list yet, add to genesToKeep list

            if queryName not in toRemove:
                genesToKeep.append(blast_record.query)

                firstHit = str((blast_record.alignments[0]).hit_def)

                # if first alignement is not against self, gene B is bigger than gene A and very simillar - remove gene A from genesToKeep and add gene B instead
                if not queryName == firstHit:
                    genesToKeep.remove(queryName)
                    toRemove.append(queryName)
                    log.append(queryName + "\t" + firstHit + "\t" +
                               "2 is first best match")

                    # if gene B is not on the toRemove list, add to genesToKeep list
                    if firstHit not in toRemove:
                        genesToKeep.append(firstHit)

                    raise _SkipBlastRecord()

                selfblastscore = (((blast_record.alignments[0]).hsps)[0]).score

                for align in blast_record.alignments:
                    match = (align.hsps)[0]
                    scoreRatio = float(match.score) / float(selfblastscore)

                    hitName = str(align.hit_def)
                    alleleLength2 = len(geneDict[hitName])

                    # if good match and gene B not in toremove list
                    if (scoreRatio > bsr and not hitName == queryName
                            and hitName not in toRemove):

                        # if gene B is bigger than gene A, keep bigger gene B
                        if alleleLength2 > alleleLength:
                            genesToKeep.append(hitName)
                            genesToKeep.remove(queryName)
                            toRemove.append(queryName)
                            log.append(queryName + "\t" + hitName + "\t" +
                                       "2 is bigger and bsr >" + str(bsr))

                            raise _SkipBlastRecord()
                        # else add gene B to toremove list
                        elif hitName in genesToKeep:
                            genesToKeep.remove(hitName)
                            toRemove.append(hitName)
                            log.append(hitName + "\t" + queryName + "\t" +
                                       "2 is bigger and bsr >" + str(bsr))

            # else gene A is on toRemove list, add all similar genes (not in genesToKeep) list to the toRemove list
            else:

                selfblastscore = 0
                for align in blast_record.alignments:
                    if not (str(align.hit_def) == str(blast_record.query)):
                        selfblastscore = ((align.hsps)[0]).score
                        raise _SkipBlastRecord()

                # NOTE(review): this loop is only reached with
                # selfblastscore still 0, so the first division raises
                # ZeroDivisionError and the record is skipped without
                # adding anything.  Kept as written; confirm the intent.
                for align in blast_record.alignments:
                    match = (align.hsps)[0]
                    scoreRatio = float(match.score) / float(selfblastscore)

                    if align.hit_def not in genesToKeep and not str(
                            align.hit_def) == str(
                                blast_record.query) and scoreRatio > bsr:
                        toRemove.append(align.hit_def)
                        log.append(
                            str(align.hit_def) + "\t" +
                            str(blast_record.query) + "\t" +
                            "2 was on the removed list and bsr >" + str(bsr))

        except _SkipBlastRecord:
            continue
        except Exception:
            # print e
            pass

    # loci that were kept at some point and never marked for removal
    schemaLoci = set(genesToKeep) - set(toRemove)

    pathfiles = os.path.dirname(geneFile)
    listfiles = []

    removedparalogs = 0
    rest = 0
    concatenatedParts = []
    schema_folder_path = os.path.join(pathfiles, 'schema_seed')

    # One FASTA file per locus is only written when no protein file was
    # supplied; the target directory is -o when given, else schema_seed.
    outputDir = None
    if not proteinFIlePath:
        outputDir = outputFIlePath if outputFIlePath else schema_folder_path
        # an already existing output directory is reused instead of
        # letting os.makedirs raise
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

    for contig in SeqIO.parse(genes, "fasta", generic_dna):
        name2 = contig.id

        if name2 not in schemaLoci:
            removedparalogs += 1
            continue

        if not int(len(contig.seq)) > sizethresh:
            continue

        # file-system friendly locus name; the order matters because "|"
        # becomes "_" first and every "_" then becomes "-"
        namefile = contig.name
        for old, new in (("|", "_"), ("_", "-"), ("(", ""), (")", ""),
                         ("'", ""), ("\"", ""), (":", "")):
            namefile = namefile.replace(old, new)

        if outputDir is not None:
            newFile = os.path.join(outputDir, namefile + ".fasta")
            listfiles.append(newFile)
            with open(newFile, "w") as f:
                f.write(">" + namefile + "_1\n" + str(contig.seq) + "\n")
        else:
            concatenatedParts.append(">" + contig.id + " \n" +
                                     str(contig.seq) + "\n")

        rest += 1

    if proteinFIlePath and outputFIlePath:
        with open(outputFIlePath, "w") as f:
            f.write("".join(concatenatedParts))

    # create short folder
    else:
        # NOTE(review): with -p but without -o this branch also runs: the
        # concatenated sequences are never written and the supplied
        # protein file is deleted below.  Kept as written; confirm intent.
        # ~ with open("schemacreation.log", "wb") as f:
        # ~ for elem in log:
        # ~ f.write(str(elem)+"\n")
        get_Short(listfiles)
        verboseprint("\nRemoved " + str(removedparalogs) +
                     " with a high similarity (BSR>" + str(bsr) + ")")
        print("Total of " + str(rest) + " loci that constitute the schema")
        os.remove(proteinfile)

    shutil.rmtree(os.path.join(pathfiles, 'blastdbs'))

    os.remove(blast_out_file)

    verboseprint(starttime)
    verboseprint("Finished Script at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

Example #30

Show file

File: callAlleles_protein3.py Project: ODiogoSilva/chewBBACA

def main():
    try:
        input_file = sys.argv[1]
        temppath = sys.argv[2]
        blastPath = sys.argv[3]
        verbose = sys.argv[4]
        bsrTresh = sys.argv[5]

        if verbose == 'True':
            verbose = True
        else:
            verbose = False

    except IndexError:
        print(
            "Error starting the callAlleleles_protein3 script. usage: list_pickle_obj"
        )

    bsrTresh = float(bsrTresh)

    argumentList = []
    with open(input_file, 'rb') as f:
        argumentList = pickle.load(f)

    if verbose:

        def verboseprint(*args):

            for arg in args:
                print(arg),
            print
    else:
        verboseprint = lambda *a: None

    geneFile = argumentList[0]

    verboseprint("Using gene: " + str(geneFile))
    shortgeneFile = os.path.join(os.path.dirname(argumentList[0]), "short",
                                 os.path.basename(argumentList[0]))
    shortgeneFile = shortgeneFile.replace(".fasta", "_short.fasta")
    genomesList = argumentList[1]
    genesList = argumentList[2]

    newListgenes = []
    with open(genesList, 'r') as gene_fp:
        for gene in gene_fp:
            gene = gene.rstrip('\n')
            gene = gene.rstrip('\r')
            newListgenes.append(gene)

    statusbar = float(newListgenes.index(str(geneFile))) / len(newListgenes)
    locusnumber = (newListgenes.index(str(geneFile)))
    totalocusnumber = len(newListgenes)
    basepath = os.path.join(temppath, os.path.splitext(geneFile)[0])

    print("\rProcessing " + os.path.basename(geneFile) + ". Start " +
          time.strftime("%H:%M:%S-%d/%m/%Y") + " Locus " + str(locusnumber) +
          " of " + str(totalocusnumber) + ". Done " +
          str(int(statusbar * 100)) + "%.",
          end="")

    if not os.path.exists(basepath):
        os.makedirs(basepath)

    #gene_fp = HTSeq.FastaReader(geneFile)

    fullAlleleList = []
    fullAlleleNameList = []
    alleleI = 0
    # get full list of alleles from main gene file and last allele number id
    for allele in SeqIO.parse(geneFile, "fasta", generic_dna):
        aux = allele.id.split("_")
        if len(aux) < 2:
            alleleI = int(aux[0])
        else:
            alleleI = int(aux[-1])
        fullAlleleList.append(str(allele.seq))
        fullAlleleNameList.append(allele.id)

    resultsList = []
    i = 0
    perfectMatchIdAllele = []
    perfectMatchIdAllele2 = []
    allelescores = []
    listShortAllelesNames = []

    verboseprint("Getting BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    geneScorePickle = os.path.abspath(shortgeneFile) + '_bsr.txt'

    # check if bsr as arealdy been calculated and recalculate it if necessary

    if os.path.isfile(geneScorePickle):
        allelescores, alleleList, listShortAllelesNames = getBlastScoreRatios(
            shortgeneFile, basepath, False, verbose, blastPath)

    else:
        allelescores, alleleList, listShortAllelesNames = getBlastScoreRatios(
            shortgeneFile, basepath, True, verbose, blastPath)

    verboseprint("Finished BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    verboseprint("starting allele call blast at: " +
                 time.strftime("%H:%M:%S-%d/%m/%Y"))
    for genomeFile in genomesList:
        verboseprint(genomeFile)
        bestmatch = [
            0, 0, False, '', 0
        ]  # score, score ratio, perfectmatch, key name of the DNA sequence string, allele ID
        currentGenomeDict = {}
        currentCDSDict = {}

        # load the CDS from the genome to a dictionary
        filepath = os.path.join(
            temppath,
            str(os.path.basename(genomeFile)) + "_ORF_Protein.txt")

        with open(filepath, 'rb') as f:
            currentCDSDict = pickle.load(f)

        try:
            intersection = set(fullAlleleList).intersection(
                currentCDSDict.values())
            intersection = list(intersection)

            if len(intersection) > 1:
                perfectMatchIdAllele.append('NIPHEM')
                perfectMatchIdAllele2.append('NIPHEM')
                verboseprint(
                    os.path.basename(genomeFile) + " has " +
                    str(len(intersection)) + " multiple exact match : " +
                    os.path.basename(geneFile) +
                    " MULTIPLE ALLELES as EXACT MATCH")
                raise ValueError("MULTIPLE ALLELES as EXACT MATCH")

            elif len(intersection) == 1:
                alleleStr = intersection[0]
                # it doenst return both keys with equal values
                # ~ elem=currentCDSDict.keys()[currentCDSDict.values().index(alleleStr)]

                elem = [
                    key for key, value in currentCDSDict.items()
                    if value == alleleStr
                ]
                if len(elem) > 1:
                    perfectMatchIdAllele.append('NIPHEM')
                    perfectMatchIdAllele2.append('NIPHEM')
                    verboseprint(
                        os.path.basename(genomeFile) + " has " +
                        str(len(intersection)) + " multiple exact match : " +
                        os.path.basename(geneFile) +
                        " MULTIPLE ALLELES as EXACT MATCH")
                    raise ValueError("MULTIPLE ALLELES as EXACT MATCH")

                contigname = elem[0].split("&")
                matchLocation = contigname[2]
                # starting CDS base need to be +1
                matchLocation = matchLocation.split("-")
                matchLocation = [
                    int(matchLocation[0]) + 1,
                    int(matchLocation[1])
                ]
                contigname = (contigname[0]).replace(">", "")
                alleleName = ''
                alleleMatchid = 0

                alleleName = fullAlleleNameList[fullAlleleList.index(
                    alleleStr)]
                alleleMatchid = int((alleleName.split("_"))[-1])
                perfectMatchIdAllele.append(str(alleleMatchid))

                if matchLocation[0] > matchLocation[1]:
                    perfectMatchIdAllele2.append(
                        str(contigname) + "&" + str(matchLocation[0]) + "-" +
                        str(matchLocation[1]) + "&" + "-")
                else:

                    perfectMatchIdAllele2.append(
                        str(contigname) + "&" + str(matchLocation[0]) + "-" +
                        str(matchLocation[1]) + "&" + "+")

                # check if atributed allele is contained or contains
                try:
                    containedInfo = (alleleName.split("_"))[1]
                except:
                    containedInfo = ''
                if containedInfo == "CD":
                    resultsList.append([(os.path.basename(genomeFile)),
                                        str(alleleMatchid),
                                        containedInfo.rstrip()])
                elif containedInfo == "CS":
                    resultsList.append([(os.path.basename(genomeFile)),
                                        str(alleleMatchid),
                                        containedInfo.rstrip()])
                else:
                    pass

                raise ValueError("EQUAL")
        except Exception as e:
            # ~ exc_type, exc_obj, exc_tb = sys.exc_info()
            # ~ fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            # ~ print(exc_tb.tb_lineno)
            # ~ print e
            continue

        else:
            verboseprint("Blasting alleles on genome at : " +
                         time.strftime("%H:%M:%S-%d/%m/%Y"))

            blast_out_file = os.path.join(
                basepath,
                "blastdbs/" + os.path.basename(geneFile) + '_List.xml')

            Gene_Blast_DB_name = os.path.join(
                temppath,
                str(os.path.basename(genomeFile)) + "/" +
                str(os.path.basename(genomeFile)) + "_db")

            proteinfastaPath = os.path.join(
                basepath,
                str(os.path.basename(shortgeneFile) + '_protein.fasta'))

            # blast the genome CDS against the translated locus
            # cline = NcbiblastpCommandline(query=proteinfastaPath, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5,max_target_seqs=10,max_hsps_per_subject=10)
            # 2.2.28 up
            cline = NcbiblastpCommandline(cmd=blastPath,
                                          query=proteinfastaPath,
                                          db=Gene_Blast_DB_name,
                                          evalue=0.001,
                                          out=blast_out_file,
                                          outfmt=5,
                                          max_target_seqs=10,
                                          max_hsps=10,
                                          num_threads=1)

            blast_records = runBlastParser(cline, blast_out_file)
            verboseprint("Blasted alleles on genome at : " +
                         time.strftime("%H:%M:%S-%d/%m/%Y"))

            alleleSizes = []
            for allele in fullAlleleList:
                alleleSizes.append(len(allele))

            biggestSizeAllele = max(alleleSizes)

            # get mode allele size
            moda = max(set(alleleSizes), key=alleleSizes.count)
            contador = Counter(alleleSizes).most_common()

            # if most common allele size appears 1 time, get first allele size
            if (contador[0])[1] == 1:
                moda = alleleSizes[0]

            try:

                # iterate through the blast results
                for blast_record in blast_records:

                    locationcontigs = []

                    for alignment in blast_record.alignments:

                        # select the best match
                        for match in alignment.hsps:

                            # query id comes with query_id, not name of the allele
                            alleleMatchid = int(
                                (blast_record.query_id.split("_"))[-1])

                            # ~ scoreRatio=float(match.score)/float(allelescores[int(alleleMatchid)-1])
                            # query_id starts with 1
                            alleleMatchid2 = ((
                                listShortAllelesNames[alleleMatchid -
                                                      1]).split("_"))[-1]
                            scoreRatio = float(match.score) / float(
                                allelescores[int(alleleMatchid2)])

                            cdsStrName = (alignment.title.split(" "))[1]

                            #DNAstr = str(currentCDSDict[">" + cdsStrName])

                            AlleleDNAstr = alleleList[int(alleleMatchid) - 1]

                            if scoreRatio >= bsrTresh:
                                locationcontigs.append(cdsStrName)

                            # select the best match from BLAST results

                            if scoreRatio == 1 and match.score > bestmatch[0]:
                                bestmatch = [
                                    match.score, scoreRatio, False, cdsStrName,
                                    int(alleleMatchid), match,
                                    len(AlleleDNAstr)
                                ]

                            elif (match.score > bestmatch[0]
                                  and scoreRatio >= bsrTresh
                                  and scoreRatio > bestmatch[1]
                                  and bestmatch[2] is False):
                                bestmatch = [
                                    match.score, scoreRatio, False, cdsStrName,
                                    int(alleleMatchid), match,
                                    len(AlleleDNAstr)
                                ]

                verboseprint("Classifying the match at : " +
                             time.strftime("%H:%M:%S-%d/%m/%Y"))

                # if no best match was found it's a Locus Not Found

                # check for ambiguious bases
                if not bestmatch[0] == 0:
                    alleleStr = currentCDSDict[">" + bestmatch[3]]
                    listFoundAmbiguities = []
                    listambiguousBases = [
                        'K', 'M', 'R', 'Y', 'S', 'W', 'B', 'V', 'H', 'D', 'X',
                        'N', '-', '.'
                    ]
                    listFoundAmbiguities = [
                        e for e in listambiguousBases if e in alleleStr
                    ]

                if bestmatch[0] == 0 or len(listFoundAmbiguities) > 0:

                    ###################
                    # LOCUS NOT FOUND #
                    ###################
                    if bestmatch[0] == 0:
                        perfectMatchIdAllele.append('LNF')
                        perfectMatchIdAllele2.append('LNF')
                        verboseprint("Locus not found, no matches \n")
                    else:

                        perfectMatchIdAllele.append('LNF')
                        perfectMatchIdAllele2.append('LNF')
                        verboseprint("Locus has strange base \n")

                # if more than one BSR >0.6 in two different CDSs it's a Non Paralog Locus
                elif len(list(set(locationcontigs))) > 1:
                    verboseprint("NIPH", "")
                    perfectMatchIdAllele.append('NIPH')
                    perfectMatchIdAllele2.append('NIPH')
                    for elem in locationcontigs:
                        verboseprint(elem)

                # if match with BSR >0.6 and not equal DNA sequences
                else:

                    # load the contig info of the genome to a dictionary
                    #g_fp = HTSeq.FastaReader(genomeFile)
                    for contig in SeqIO.parse(genomeFile, "fasta",
                                              generic_dna):
                        currentGenomeDict[contig.id] = len(str(contig.seq))

                    match = bestmatch[5]
                    geneLen = bestmatch[6]
                    alleleStr = currentCDSDict[">" + bestmatch[3]]
                    contigname = bestmatch[3]

                    contigname = contigname.split("&")
                    matchLocation = contigname[2]
                    matchLocation = matchLocation.split("-")
                    matchLocation = [
                        int(matchLocation[0]) + 1, matchLocation[1]
                    ]
                    contigname = contigname[0]

                    bestMatchContigLen = currentGenomeDict[contigname]

                    protSeq, alleleStr, Reversed = translateSeq(alleleStr)

                    # get extra space to the right and left between the allele and match and check if it's still inside the contig

                    rightmatchAllele = geneLen - (
                        (int(match.query_end) + 1) * 3)
                    leftmatchAllele = ((int(match.query_start) - 1) * 3)

                    # ~ if Reversed swap left and right contig extra
                    if int(matchLocation[1]) < int(matchLocation[0]):
                        rightmatchContig = bestMatchContigLen - int(
                            matchLocation[0])
                        leftmatchContig = int(matchLocation[1])
                        aux = rightmatchAllele
                        rightmatchAllele = leftmatchAllele
                        leftmatchAllele = aux

                    else:
                        rightmatchContig = bestMatchContigLen - int(
                            matchLocation[1])
                        leftmatchContig = int(matchLocation[0])

                    ###########################
                    # LOCUS ON THE CONTIG TIP #
                    ###########################

                    # check if contig is smaller than the matched allele
                    if leftmatchContig < leftmatchAllele and rightmatchContig < rightmatchAllele:

                        # ~ resultsList.append('PLOTSC:-1')
                        perfectMatchIdAllele.append('LOTSC')
                        perfectMatchIdAllele2.append('LOTSC')
                        # ~ if not Reversed:
                        # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+")
                        # ~ else:
                        # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-")

                        verboseprint(match, contigname, geneFile,
                                     leftmatchAllele, rightmatchAllele,
                                     "Locus is bigger than the contig \n")

                    elif leftmatchContig < leftmatchAllele:

                        # ~ resultsList.append('PLOT3:-1')
                        perfectMatchIdAllele.append('PLOT3')
                        perfectMatchIdAllele2.append('PLOT3')
                        # ~ if not Reversed:
                        # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+")
                        # ~ else:
                        # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-")

                        verboseprint(
                            match, contigname, geneFile, leftmatchAllele,
                            rightmatchAllele,
                            "Locus is on the 3' tip of the contig \n")

                    elif rightmatchContig < rightmatchAllele:

                        # ~ resultsList.append('PLOT5:-1')
                        perfectMatchIdAllele.append('PLOT5')
                        perfectMatchIdAllele2.append('PLOT5')
                        # ~ if not Reversed:
                        # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+")
                        # ~ else:
                        # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-")

                        verboseprint(
                            match, contigname, geneFile, leftmatchAllele,
                            rightmatchAllele,
                            "Locus is on the 5' tip of the contig \n")

                    elif float(len(alleleStr)) > moda + (moda * 0.2):

                        verboseprint("Locus is larger than mode", moda,
                                     alleleStr)

                        # ~ resultsList.append('ALM')
                        perfectMatchIdAllele.append('ALM')
                        perfectMatchIdAllele2.append('ALM')
                    # ~ if not Reversed:
                    # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+")
                    # ~ else:
                    # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-")

                    elif float(len(alleleStr)) < moda - (moda * 0.2):

                        verboseprint("Locus is smaller than mode", moda,
                                     alleleStr)

                        # ~ resultsList.append('ASM')
                        perfectMatchIdAllele.append('ASM')
                        perfectMatchIdAllele2.append('ASM')
                    # ~ if not Reversed:
                    # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+")
                    # ~ else:
                    # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-")

                    else:
                        #######################
                        # ADD INFERRED ALLELE #		# a new allele
                        #######################

                        wasContained = False
                        tagAuxC = 'S'
                        for alleleaux in fullAlleleList:

                            if alleleStr in alleleaux:
                                alleleName = fullAlleleNameList[
                                    fullAlleleList.index(alleleaux)]
                                alleleMatchid = (alleleName.split("_"))[-1]
                                tagAuxC = 'CD' + alleleMatchid.rstrip()
                                resultsList.append([
                                    (os.path.basename(genomeFile)),
                                    str(alleleI + 1), tagAuxC
                                ])
                                break
                            elif alleleaux in alleleStr:
                                alleleName = fullAlleleNameList[
                                    fullAlleleList.index(alleleaux)]
                                alleleMatchid = (alleleName.split("_"))[-1]
                                tagAuxC = 'CS' + alleleMatchid.rstrip()
                                resultsList.append([
                                    (os.path.basename(genomeFile)),
                                    str(alleleI + 1), tagAuxC
                                ])
                                break

                        if not wasContained:
                            tagAux = 'INF'

                            perfectMatchIdAllele.append(tagAux + "-" +
                                                        str(alleleI + 1))

                            if not Reversed:
                                perfectMatchIdAllele2.append(
                                    str(contigname) + "&" +
                                    str(matchLocation[0]) + "-" +
                                    str(matchLocation[1]) + "&" + "+")
                            else:
                                perfectMatchIdAllele2.append(
                                    str(contigname) + "&" +
                                    str(matchLocation[0]) + "-" +
                                    str(matchLocation[1]) + "&" + "-")

                            verboseprint("New allele! Adding allele " +
                                         tagAux + str(alleleI + 1) +
                                         " to the database\n")

                            # --- add the new allele to the gene fasta --- #

                            alleleI += 1
                            appendAllele = '>' + str((
                                ((os.path.basename(geneFile)).split("."))[0]
                            ).replace("_", "-")) + "_" + tagAuxC + "_" + (str(
                                os.path.basename(genomeFile))).replace(
                                    "_", "-") + "_" + str(alleleI) + '\n'
                            fG = open(geneFile, 'a')
                            fG.write(appendAllele)
                            fG.write(alleleStr + '\n')
                            fG.close()
                            fullAlleleList.append(alleleStr)
                            fullAlleleNameList.append(appendAllele)

                            if bestmatch[1] >= int(bsrTresh) and float(
                                    bestmatch[1]) < int(bsrTresh) + 0.1:
                                fG = open(shortgeneFile, 'a')
                                fG.write(appendAllele)
                                fG.write(alleleStr + '\n')
                                fG.close()

                                geneTransalatedPath2 = os.path.join(
                                    basepath,
                                    str(
                                        os.path.basename(shortgeneFile) +
                                        '_protein2.fasta'))
                                geneTransalatedPath = os.path.join(
                                    basepath,
                                    str(
                                        os.path.basename(shortgeneFile) +
                                        '_protein.fasta'))

                                with open(geneTransalatedPath2, 'w') as fG:
                                    fG.write('>' + str(alleleI) + '\n' +
                                             str(protSeq) + '\n')
                                with open(geneTransalatedPath, 'a') as fG:
                                    fG.write('>' + str(alleleI) + '\n' +
                                             str(protSeq) + '\n')

                                match = bestmatch[5]

                                # --- remake blast DB and recalculate the BSR for the locus --- #
                                alleleList.append(alleleStr)
                                listShortAllelesNames.append(appendAllele)

                                genefile2 = geneTransalatedPath2
                                Gene_Blast_DB_name2 = Create_Blastdb(
                                    genefile2, 1, True)
                                verboseprint(
                                    "Re-calculating BSR at : " +
                                    time.strftime("%H:%M:%S-%d/%m/%Y"))
                                allelescores, alleleList, listShortAllelesNames = reDogetBlastScoreRatios(
                                    genefile2, basepath, alleleI, allelescores,
                                    Gene_Blast_DB_name2, alleleList,
                                    geneScorePickle, verbose, blastPath,
                                    listShortAllelesNames)
                                verboseprint(
                                    "Done Re-calculating BSR at : " +
                                    time.strftime("%H:%M:%S-%d/%m/%Y"))

            except Exception as e:
                print("some error occurred")
                print(e)
                print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno))
                perfectMatchIdAllele2.append("ERROR")
                perfectMatchIdAllele.append("ERROR")

    final = (resultsList, perfectMatchIdAllele)
    verboseprint("Finished allele calling at : " +
                 time.strftime("%H:%M:%S-%d/%m/%Y"))
    filepath = os.path.join(temppath,
                            os.path.basename(geneFile) + "_result.txt")
    filepath2 = os.path.join(temppath,
                             os.path.basename(geneFile) + "_result2.txt")
    with open(filepath, 'wb') as f:
        pickle.dump(final, f)
    with open(filepath2, 'wb') as f:
        pickle.dump(perfectMatchIdAllele2, f)
    shutil.rmtree(basepath)
    return True

Example #31

Show file

File: CreateSchema.py Project: mickaelsilva/wgMLST

def _collect_unique_proteins(genes, protDict, geneDict, out=None,
                             minProtLen=67):
    """Translate the genes of a FASTA file and record the usable proteins.

    Every record of *genes* is translated with translateSeq (only the first
    returned value, the protein, is used).  A protein is recorded only when
    it was not already produced by an earlier record of the same file and is
    at least *minProtLen* residues long.

    Args:
        genes: path of the nucleotide FASTA file.
        protDict: filled in place, FASTA header line (">name\\n") -> protein.
        geneDict: filled in place, gene name -> nucleotide sequence.
        out: optional open file; every recorded protein is also written to
            it in FASTA format.
        minProtLen: shortest protein that is still recorded.

    Returns:
        The tuple (totalgenes, repeatedgenes, smallgenes).
    """
    # a set gives constant-time "already seen" checks
    seenProteins = set()
    totalgenes = 0
    repeatedgenes = 0
    smallgenes = 0

    for gene in HTSeq.FastaReader(genes):
        # counted per record, so the "out of N" messages report the real total
        totalgenes += 1
        protseq, _, _ = translateSeq(str(gene.seq))

        if len(protseq) <= 1:
            # nothing usable came back from translateSeq (presumably the
            # translation failed -- confirm against translateSeq); only the
            # gene name is reported
            print(gene.name)
            continue

        protstr = str(protseq)
        if protstr in seenProteins:
            repeatedgenes += 1
        elif len(protstr) < minProtLen:
            smallgenes += 1
        else:
            seenProteins.add(protstr)
            protname = ">" + str(gene.name) + "\n"
            if out is not None:
                out.write(protname + protstr + "\n")
            protDict[protname] = protstr
            geneDict[str(gene.name)] = gene.seq

    return totalgenes, repeatedgenes, smallgenes


def main():
    """Build a gene schema from an ffn file of annotated genes.

    Command line:
        -i  ffn (nucleotide FASTA) file.
        -g  minimum size (int); only genes longer than this are written out.

    Steps:
        1. Translate every gene; skip repeated proteins and proteins shorter
           than 67 residues.
        2. Skip proteins that are a substring of an already accepted (longer
           or equal) protein and write the rest to ``proteins.fasta`` in the
           directory of the input file.
        3. BLASTP ``proteins.fasta`` against itself and, using the ratio
           between each hit score and the score of the first alignment
           (threshold 0.6), decide which genes are kept and which removed.
           Every removal is logged to ``logfile.txt`` in the current
           working directory.
        4. Write one ``<gene>.fasta`` per kept gene that is longer than -g,
           plus ``concatenated.fasta``, in the directory of the input file.
    """
    parser = argparse.ArgumentParser(
        description=
        "Given an ffn file, recovers the genes that are not paralogs and have a size bigger than the g parameter provided"
    )
    parser.add_argument('-i',
                        nargs='?',
                        type=str,
                        help='ffn file',
                        required=True)
    parser.add_argument('-g',
                        nargs='?',
                        type=int,
                        help='int minimum size',
                        required=True)

    args = parser.parse_args()
    genes = args.i
    sizethresh = args.g
    # Hard-wired switch.  When True the protein file is not regenerated
    # (presumably one left by a previous run is reused -- confirm).
    passSteps = False

    # The protein file is created next to the input file.  dirname() is used
    # rather than stripping the file name with str.replace(), which would
    # also eat a directory component that happens to contain the same text.
    proteinfile = os.path.join(os.path.dirname(os.path.abspath(genes)),
                               'proteins.fasta')

    geneDict = {}
    protDict = {}
    # defined up front so the final summary works whichever branch ran
    g = 0

    if not passSteps:
        print("not passing steps")
        with open(proteinfile, "wb") as f:
            totalgenes, repeatedgenes, smallgenes = _collect_unique_proteins(
                genes, protDict, geneDict, out=f)

        # longest proteins first, so each protein is only ever compared with
        # proteins at least as long as itself
        orderedprotDict = collections.OrderedDict(
            sorted(protDict.items(),
                   key=lambda item: len(item[1]),
                   reverse=True))

        print(str(repeatedgenes) + " repeated genes out of " + str(totalgenes))
        print(str(smallgenes) + " small genes out of " + str(totalgenes))
        print("protein file created")

        # first step - remove genes contained in other genes or 100% equal
        # genes: a protein is accepted only if it is not a substring of a
        # protein accepted before it
        print("Checking if proteins are equal or substring of others...")

        auxDict = {}  # protein -> FASTA header line
        auxprot = []  # accepted proteins, in acceptance order

        for j, (protname, prot) in enumerate(orderedprotDict.items()):
            prot = str(prot)
            if any(prot in kept for kept in auxprot):
                g += 1
            else:
                auxDict[prot] = protname
                auxprot.append(prot)

            print(str(j) + " out of " + str(len(orderedprotDict)))

        print("%s genes are contained in other genes" % (g))

        # overwrite the protein file so it only holds the accepted proteins
        with open(proteinfile, "wb") as f:
            f.write("".join(header + seq + "\n"
                            for seq, header in auxDict.items()))

    else:
        _collect_unique_proteins(genes, protDict, geneDict)

    geneFile = os.path.abspath(proteinfile)
    print(proteinfile)
    Gene_Blast_DB_name = Create_Blastdb(geneFile, 1, True)

    blast_out_file = os.path.splitext(geneFile)[0] + '.xml'
    # ------------------------------ RUNNING BLAST ------------------------------ #

    cline = NcbiblastpCommandline(query=geneFile,
                                  db=Gene_Blast_DB_name,
                                  evalue=0.001,
                                  out=blast_out_file,
                                  outfmt=5)
    blast_records = runBlastParser(cline, blast_out_file, geneFile)

    toRemove = []
    genesToKeep = []
    log = ["removed\tcause\texplanation"]
    for blast_record in blast_records:

        allelename = blast_record.query.split(" ")[0]
        # not covered by the try below: a query name that is missing from
        # geneDict raises KeyError and stops the run
        alleleLength = len(geneDict[allelename])

        try:
            query = str(blast_record.query)

            if query in toRemove:
                # NOTE(review): the comment that used to sit on this branch
                # said every similar gene should be added to toRemove, but
                # the code stopped at the first non-self hit, and with only
                # self hits it divided by a zero self score, which was
                # silently swallowed.  Its only observable effect was this
                # message; that behaviour is kept.  Confirm the intended
                # logic before changing the schema output.
                for align in blast_record.alignments:
                    hit = str(align.hit_def)
                    if hit != query:
                        print("gene " + hit + " is bigger than gene " + query)
                        break
                continue

            # gene A is not on the toRemove list yet: keep it for now
            genesToKeep.append(blast_record.query)

            # if the first alignment is not against self, remove gene A and
            # keep the gene it matched (unless that one was already removed)
            best_hit = str(blast_record.alignments[0].hit_def)
            if query != best_hit:
                genesToKeep.remove(query)
                toRemove.append(query)
                log.append(query + "\t" + best_hit + "\t" +
                           "2 is first best match")

                if best_hit not in toRemove:
                    genesToKeep.append(best_hit)

                continue

            selfblastscore = blast_record.alignments[0].hsps[0].score

            for align in blast_record.alignments:
                hit = str(align.hit_def)
                scoreRatio = (float(align.hsps[0].score) /
                              float(selfblastscore))
                alleleLength2 = len(geneDict[hit])

                # only good matches against another, not yet removed, gene
                if not (scoreRatio > 0.6 and hit != query
                        and hit not in toRemove):
                    continue

                if alleleLength2 > alleleLength:
                    # gene B is bigger than gene A: keep B, drop A and stop
                    # looking at this record
                    genesToKeep.append(hit)
                    genesToKeep.remove(query)
                    toRemove.append(query)
                    log.append(query + "\t" + hit + "\t" +
                               "2 is bigger and bsr >0.6")
                    break
                elif hit in genesToKeep:
                    # gene A is at least as big: drop gene B instead
                    genesToKeep.remove(hit)
                    toRemove.append(hit)
                    log.append(hit + "\t" + query + "\t" +
                               "2 is bigger and bsr >0.6")

        except Exception:
            # Best effort: a record that cannot be evaluated (e.g. it has no
            # alignments, or a hit is missing from geneDict) is skipped and
            # whatever was decided for it so far stands.
            pass

    with open("logfile.txt", "wb") as f:
        f.write("".join(str(elem) + "\n" for elem in log))

    # sets: duplicates are dropped and the per-gene lookups below are O(1)
    keepSet = set(genesToKeep)
    removeSet = set(toRemove)
    print(len(removeSet))
    print(len(keepSet))
    print(len(keepSet - removeSet))

    pathfiles = os.path.dirname(geneFile)

    removedparalogs = 0
    removedsize = 0
    totalgenes = 0
    rest = 0
    concatenated = []
    for contig in HTSeq.FastaReader(genes):
        totalgenes += 1
        name2 = contig.name

        if name2 in removeSet or name2 not in keepSet:
            removedparalogs += 1
        elif int(len(contig.seq)) > sizethresh:
            # "|" is replaced so the gene name can be used as a file name
            namefile = contig.name.replace("|", "_")
            with open(os.path.join(pathfiles, namefile + ".fasta"),
                      "wb") as f:
                f.write(">1\n" + contig.seq + "\n")
            rest += 1
            concatenated.append(">" + namefile + "\n" + contig.seq + "\n")
        else:
            removedsize += 1

    print("%s genes are contained in other genes" % (g))
    print("Removed %s same Locus genes" % str(removedparalogs))
    print("Removed %s because of size " % str(removedsize))
    print("%s Scheme genes " % str(rest))
    print("total genes:" + str(totalgenes))

    with open(os.path.join(pathfiles, "concatenated.fasta"), "wb") as f:
        f.write("".join(concatenated))

Example #32

Show file

def _formatLocation(contig, start, end):
    """Return the ``contig&start-end&+`` string stored in the second result list."""
    return str(contig) + "&" + str(start) + "-" + str(end) + "&" + "+"


def _findAlleleSequence(geneDict, alleleId, default):
    """Return the first sequence mapped to *alleleId* in *geneDict*, else *default*."""
    for seq, knownId in geneDict.iteritems():
        if knownId == alleleId:
            return seq
    return default


def _loadContigRecord(genomeFile, contigName):
    """Parse *genomeFile* as FASTA with SeqIO and return the record named *contigName*."""
    with open(genomeFile, "rU") as handle:
        record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
    return record_dict[contigName]


def _extendedSubjectSpan(match, geneLen):
    """Return ``(tS, tE)``: the subject slice bounds widened by the query bases the HSP did not cover.

    ``tS`` is already shifted by -1 so the pair can be used directly as a
    Python slice.  The two intermediate (extraleft, extraright) pairs are
    printed, as the inline code this helper replaces did.
    """
    extraleft = 0
    extraright = 0
    qStart = match.query_start
    qEnd = match.query_end

    # query bases missing before the aligned region
    if 1 < int(qStart) and 1 < int(qEnd):
        extraleft = min(qStart, qEnd) - 1

    print("%s %s" % (extraleft, extraright))

    # query bases missing after the aligned region
    if int(geneLen) > int(qStart) and int(geneLen) > int(qEnd):
        extraright = geneLen - max(qStart, qEnd)

    print("%s %s" % (extraleft, extraright))

    if match.sbjct_start > match.sbjct_end:
        # subject coordinates run backwards, so the extensions swap sides
        tE = match.sbjct_start + extraleft
        tS = match.sbjct_end - extraright - 1
    else:
        tS = match.sbjct_start - extraleft - 1
        tE = match.sbjct_end + extraright
    return tS, tE


def _registerNewAllele(geneDict, orderedAlleleNames, geneFile, genomeFile,
                       alleleI, tagAux, alleleStr):
    """Record *alleleStr* as allele *alleleI* in memory and append it to *geneFile*.

    The caller is responsible for incrementing its allele counter afterwards.
    """
    print("New allele found! Adding allele " + tagAux + str(alleleI) +
          " to the database")
    geneDict[alleleStr] = alleleI
    orderedAlleleNames.append(str(alleleI))

    # --- add the new allele to the gene fasta --- #
    with open(geneFile, 'a') as fG:
        fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] + '_' +
                 str(os.path.basename(genomeFile)) + '\n')
        fG.write(alleleStr + '\n')


def _writeUnresolvedMatch(geneFile, genomeFile, suffix, bestMatchContig,
                          alleleStr, bmAllele):
    """Append the rejected match and its reference allele to ``<gene><suffix>.fasta``."""
    geneFile2 = os.path.splitext(geneFile)[0] + suffix + ".fasta"
    print(geneFile2)
    with open(geneFile2, 'a') as f:
        f.write(">" + (str(os.path.basename(genomeFile))) + "|" +
                (str(os.path.basename(geneFile))) + " | " +
                str(bestMatchContig) + "\n")
        f.write((alleleStr) + "\n")
        f.write(">Allele\n")
        f.write((bmAllele) + "\n")


def main():
    """Call the allele of one gene in every genome of a pickled work list.

    Command line: ``sys.argv[1]`` is a pickle file holding
    ``[geneFile, genomesList]`` and ``sys.argv[2]`` is a working directory.
    For each genome the alleles in ``geneFile`` are BLASTed against the
    database ``<temppath>/<genome basename>/<genome basename>_db`` and the best
    hit is classified (exact match, contig-tip cases, new allele, LNF
    variants, ...).  New alleles are appended to ``geneFile``.

    Two pickles are written to the working directory:
    ``<gene basename>_result.txt`` with ``(resultsList, perfectMatchIdAllele)``
    and ``<gene basename>_result2.txt`` with the per-genome location strings.

    Returns:
        True once both result files have been written.

    Raises:
        SystemExit: with status 1 when the two command-line arguments are
            missing (the usage line is printed first).
    """
    try:
        input_file = sys.argv[1]
        temppath = sys.argv[2]
    except IndexError:
        print("usage: list_pickle_obj")
        # Previously execution fell through and died with a NameError.
        sys.exit(1)

    # NOTE: pickle.load can execute arbitrary code; only feed this script
    # argument files produced by a trusted caller.
    with open(input_file, 'rb') as f:
        argumentList = pickle.load(f)

    geneFile = argumentList[0]
    genomesList = argumentList[1]

    basepath = temppath + "/" + os.path.basename(geneFile)

    if not os.path.exists(basepath + "/blastdbs/"):
        os.makedirs(basepath + "/blastdbs/")

    # geneDict maps allele sequence -> 1-based allele number;
    # orderedAlleleNames holds the same numbers as strings, in file order.
    geneDict = {}
    alleleI = 1
    orderedAlleleNames = []
    biggestAllelelen = 0
    for allele in HTSeq.FastaReader(geneFile):
        if allele.seq in geneDict:
            print("\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n" + str(geneFile))
            continue
        biggestAllelelen = max(biggestAllelelen, len(allele.seq))
        orderedAlleleNames.append(str(alleleI))
        geneDict[allele.seq] = alleleI
        alleleI += 1

    # The BLAST XML output path does not depend on the genome, so build it once.
    blast_out_file = os.path.join(
        basepath, "blastdbs/" + os.path.basename(geneFile) + '_List.xml')

    # list of results - the output of the function
    perfectMatchIdAllele = []
    perfectMatchIdAllele2 = []
    resultsList = []
    print(genomesList)
    for genomeFile in genomesList:
        print("_______________________________________________________")
        print(perfectMatchIdAllele)
        printinfo(genomeFile, geneFile)

        # Rebuilt for every genome so contigs of earlier genomes are neither
        # kept in memory nor matched by accident.
        currentGenomeDict = {}
        for contig in HTSeq.FastaReader(genomeFile):
            currentGenomeDict[contig.name] = str(contig.seq)

        print("Blasting alleles on genome at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))

        Gene_Blast_DB_name = os.path.join(
            temppath,
            str(os.path.basename(genomeFile)) + "/" +
            str(os.path.basename(genomeFile)) + "_db")

        cline = NcbiblastnCommandline(query=geneFile,
                                      db=Gene_Blast_DB_name,
                                      evalue=0.001,
                                      out=blast_out_file,
                                      outfmt=5)

        blast_records = runBlastParser(cline, blast_out_file, geneFile)
        print("Blasted alleles on genome at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))

        # ------ DETERMINING BEST MATCH ------ #

        # Once a real candidate is chosen bestMatch becomes
        # [query, hsp, scoreRatio, query_id, lenratio, alleleLen].
        bestMatch = ['', '', 0]
        bestMatchContig = ''
        bestMatchContigLen = ''
        perfectMatch = False
        bmAlleleLen2 = 0
        bmAllele = ''
        for blast_record in blast_records:

            if perfectMatch:
                break
            if not blast_record.alignments:
                continue

            # Placeholder only: an alignment (not an HSP) is stored here, so if
            # no HSP ever replaces it the unpacking below fails and the genome
            # is reported as LNF.
            if bestMatch[0] == '' and bestMatch[1] == '':
                bestMatch[0] = blast_record.query
                bestMatch[1] = blast_record.alignments[0]

            # The token after the first "_" of query_id is looked up in
            # orderedAlleleNames to find which allele produced this record.
            queryIndex = orderedAlleleNames.index(
                str(blast_record.query_id).split("_")[1])
            for seq, alleleid in geneDict.iteritems():
                if alleleid == queryIndex + 1:
                    bmAlleleLen2 = len(seq)

            # --- iterating over all the results to determine the best match --- #
            for alignment in blast_record.alignments:
                contigTag = alignment.hit_def.split(" ")[0]

                for match in alignment.hsps:

                    scoreRatio = float(match.score) / float(bmAlleleLen2)

                    # identities equal the allele length and the aligned query
                    # length, and the subject has no N/K/Y/R
                    isExact = (
                        int(match.identities) == int(bmAlleleLen2)
                        and int(match.identities) == int(len(match.query))
                        and not any(base in match.sbjct for base in "NKYR"))

                    # otherwise keep the match with the best score ratio
                    # (score / length of allele)
                    if isExact or scoreRatio > bestMatch[2]:
                        bmAllele = _findAlleleSequence(
                            geneDict, queryIndex + 1, bmAllele)
                        bmAlleleLen = len(bmAllele)
                        lenratio = float(len(match.sbjct)) / float(bmAlleleLen)
                        bestMatch = [
                            blast_record.query, match, scoreRatio,
                            blast_record.query_id, lenratio, bmAlleleLen
                        ]
                        bestMatchContig = contigTag
                        if isExact:
                            perfectMatch = True
                            break
                        bestMatchContigLen = len(currentGenomeDict[contigTag])
                        print(contigTag)

                if perfectMatch:
                    break

        # ---------- ALLELE CALLING AFTER DETERMINING BEST MATCH ---------- #
        print("Finished choosing best match at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))

        try:
            match = bestMatch[1]
            bestMatchStart = match.sbjct_start
            bestMatchEnd = match.sbjct_end
            if match.query_start > match.query_end:
                bestMatchEnd = match.sbjct_start
                bestMatchStart = match.sbjct_end

            print(match)

            geneLen = bestMatch[5]
            alleleStr = match.sbjct
            nIdentities = match.identities
            idPercent = float(nIdentities) / float(geneLen)
            lenRatio = bestMatch[4]

        except Exception:
            #if no best match was found

            ###################
            # LOCUS NOT FOUND #
            ###################

            perfectMatchIdAllele.append('LNF')
            perfectMatchIdAllele2.append('LNF')

            print("Locus not found, no matches \n")
            continue

        print("is perfect match true?" + str(perfectMatch))
        if perfectMatch:

            #if a perfect match was found

            # Sanity check: the subject must be a known allele, either as is or
            # reverse-complemented; otherwise fail with KeyError as before.
            if alleleStr not in geneDict:
                alleleStr = reverseComplement(alleleStr)
                if alleleStr not in geneDict:
                    raise KeyError(alleleStr)

            ################################################
            # EXACT MATCH --- MATCH == GENE --- GENE FOUND #
            ################################################
            queryId = bestMatch[3]
            if "_" in queryId:
                queryId = queryId.split("_")[1]
            perfectMatchIdAllele.append(queryId)
            perfectMatchIdAllele2.append(
                _formatLocation(bestMatchContig, bestMatchStart,
                                bestMatchEnd))
            printinfo(genomeFile, geneFile)
            print("Exact match \n")
            continue

        #if a best match was found but it's not an exact match

        ###########################
        # LOCUS ON THE CONTIG TIP #
        ###########################
        print(geneLen)
        if bestMatchContigLen <= geneLen:

            perfectMatchIdAllele.append('LOTSC')
            perfectMatchIdAllele2.append(
                _formatLocation(bestMatchContig, bestMatchStart,
                                bestMatchEnd))
            printinfo(genomeFile, geneFile)
            print("Locus is bigger than the contig \n")

        elif (match.sbjct_start == 1 and len(match.query) < geneLen) or (
                match.sbjct_start == bestMatchContigLen
                and len(match.query) < bestMatchContigLen
                and match.sbjct_start > match.sbjct_end):

            perfectMatchIdAllele.append('LOT5')
            perfectMatchIdAllele2.append(
                _formatLocation(bestMatchContig, bestMatchStart,
                                bestMatchEnd))
            printinfo(genomeFile, geneFile)

            print("Locus is on the 5' tip of the contig \n")

        elif (match.sbjct_end == 1 and len(match.query) < geneLen
              and match.sbjct_start > match.sbjct_end) or (
                  match.sbjct_end == bestMatchContigLen
                  and len(match.query) < bestMatchContigLen):

            perfectMatchIdAllele.append('LOT3')
            perfectMatchIdAllele2.append(
                _formatLocation(bestMatchContig, bestMatchStart,
                                bestMatchEnd))
            printinfo(genomeFile, geneFile)

            print("Locus is on the 3' tip of the contig \n")

        elif any(base in alleleStr for base in "NKRY"):

            #####################
            # ALLELE NOT FOUND  #		# N base found!
            #####################

            geneFile2 = os.path.splitext(geneFile)[0] + "LNFN.fasta"
            with open(geneFile2, 'a') as f:
                f.write(">" + (str(os.path.basename(genomeFile))) + "|" +
                        (str(os.path.basename(geneFile))) + "\n")
                f.write((alleleStr) + "\n")
            perfectMatchIdAllele.append('LNFN')
            perfectMatchIdAllele2.append('LNFN')
            printinfo(genomeFile, geneFile)
            print("LNFN, contains strange (N,K,R) bases! \n")

        else:

            print("new allele?")
            #removing gaps

            alleleStr = alleleStr.replace('-', '')
            lenExtraThresh = int(biggestAllelelen * 0.2)

            #check if best match without gaps is contained inside an already defined allele

            isContainedDefinedAllele = False
            definedAllele = ''

            for knownAllele in geneDict:
                if alleleStr in knownAllele:
                    definedAllele = knownAllele
                    isContainedDefinedAllele = True
                    break
            print("is contained? " + str(isContainedDefinedAllele))
            print(idPercent)
            print(geneLen)
            print(lenExtraThresh)
            print(lenRatio)

            sbjctLen = len(match.sbjct)

            if isContainedDefinedAllele and (
                    len(definedAllele) - lenExtraThresh <= sbjctLen
                    <= len(definedAllele) + lenExtraThresh):
                #allele without gaps is contained in a defined allele
                #best match with gaps is within lenExtraThresh bases of the defined allele

                isnewallele = False

                if len(alleleStr) == len(definedAllele):
                    # match without gaps has same size as the defined allele
                    tagAux = 'NA?:'
                    printinfo(genomeFile, geneFile)
                    perfectMatchIdAllele.append("NA?-" + str(alleleI))
                    perfectMatchIdAllele2.append(
                        _formatLocation(bestMatchContig, bestMatchStart,
                                        bestMatchEnd))
                    isnewallele = True

                elif len(alleleStr) == len(definedAllele) - 1:
                    # match without gaps has minus one base than the defined allele
                    tagAux = 'NA2:'
                    printinfo(genomeFile, geneFile)
                    perfectMatchIdAllele.append("NA2-" + str(alleleI))
                    perfectMatchIdAllele2.append(
                        _formatLocation(bestMatchContig, bestMatchStart,
                                        bestMatchEnd))
                    isnewallele = True

                else:
                    # match without gaps has more than one base missing
                    # compared to the defined allele: pull the full-length
                    # region out of the contig instead
                    record = _loadContigRecord(genomeFile, bestMatchContig)

                    tS, tE = _extendedSubjectSpan(match, geneLen)
                    alleleStr = str(record.seq[tS:tE])
                    if match.sbjct_start > match.sbjct_end:
                        alleleStr = reverseComplement(alleleStr)

                    print(tS)
                    print(tE)
                    print("allele is:")
                    print(alleleStr)

                    if tE > bestMatchContigLen:
                        perfectMatchIdAllele.append('LOT3B')
                        perfectMatchIdAllele2.append(
                            _formatLocation(bestMatchContig, tS,
                                            bestMatchContigLen))
                        printinfo(genomeFile, geneFile)

                        print("Locus is on the 3B' tip of the contig \n")

                    elif tS < 0:
                        perfectMatchIdAllele.append('LOT5B')
                        perfectMatchIdAllele2.append(
                            _formatLocation(bestMatchContig, 0, tE))
                        printinfo(genomeFile, geneFile)

                        print("Locus is on the 5B' tip of the contig \n")

                    else:

                        tagAux = 'NA2:'
                        printinfo(genomeFile, geneFile)
                        perfectMatchIdAllele.append("NA2-" + str(alleleI))
                        perfectMatchIdAllele2.append(
                            _formatLocation(bestMatchContig, tS, tE))
                        isnewallele = True

                if isnewallele:
                    _registerNewAllele(geneDict, orderedAlleleNames, geneFile,
                                       genomeFile, alleleI, tagAux, alleleStr)
                    alleleI += 1

            #if best match is not contained in an already defined allele, check if it has similar size with the match allele and has 0.8 similarity

            elif not isContainedDefinedAllele and idPercent >= 0.8 and (
                    int(geneLen) - lenExtraThresh <= sbjctLen
                    <= int(geneLen) + lenExtraThresh):
                #best match with gaps has 80% identity
                #best match with gaps is within lenExtraThresh bases of the matched allele

                ratio = float(len(alleleStr)) / float(geneLen)

                if ratio >= 0.8 and ratio <= 1.2:
                    # match without gaps is within 20% of the best match allele length

                    tagAux = ''
                    record = _loadContigRecord(genomeFile, bestMatchContig)

                    # NOTE(review): a failure after the result lists have been
                    # appended below still falls into the LNF3 handler, which
                    # appends a second entry for the same genome - confirm
                    # whether that is acceptable.
                    try:
                        print(match)
                        tS, tE = _extendedSubjectSpan(match, geneLen)
                        alleleStr = str(record.seq[tS:tE])
                        if match.sbjct_start > match.sbjct_end:
                            alleleStr = reverseComplement(alleleStr)

                        print(tS)
                        print(tE)
                        print("allele is:")
                        print(alleleStr)

                        if tE > bestMatchContigLen:
                            perfectMatchIdAllele.append('LOT3C')
                            perfectMatchIdAllele2.append(
                                _formatLocation(bestMatchContig, tS,
                                                bestMatchContigLen))
                            printinfo(genomeFile, geneFile)

                            print("Locus is on the 3C' tip of the contig \n")

                        elif tS < 0:
                            perfectMatchIdAllele.append('LOT5C')
                            perfectMatchIdAllele2.append(
                                _formatLocation(bestMatchContig, 0, tE))
                            printinfo(genomeFile, geneFile)

                            print("Locus is on the 5C' tip of the contig \n")

                        else:
                            tagAux = 'NA3:'
                            printinfo(genomeFile, geneFile)
                            perfectMatchIdAllele.append("NA3-" + str(alleleI))
                            perfectMatchIdAllele2.append(
                                _formatLocation(bestMatchContig, tS, tE))

                            _registerNewAllele(geneDict, orderedAlleleNames,
                                               geneFile, genomeFile, alleleI,
                                               tagAux, alleleStr)
                            alleleI += 1

                    except Exception as e:
                        ##################
                        #       LNF      #
                        ##################
                        print(e)
                        _writeUnresolvedMatch(geneFile, genomeFile, "LNF3",
                                              bestMatchContig, alleleStr,
                                              bmAllele)
                        printinfo(genomeFile, geneFile)
                        perfectMatchIdAllele.append("LNF3")
                        perfectMatchIdAllele2.append("LNF3")
                        print("No allele found")
                else:
                    ##################
                    #       LNF      #
                    ##################
                    _writeUnresolvedMatch(geneFile, genomeFile, "LNF4",
                                          bestMatchContig, alleleStr, bmAllele)
                    printinfo(genomeFile, geneFile)
                    perfectMatchIdAllele.append("LNF4")
                    perfectMatchIdAllele2.append("LNF4")
                    print("No allele found")

            elif isContainedDefinedAllele:
                ####################
                # UNDEFINED ALLELE #		# it is contained in another allele
                ####################

                perfectMatchIdAllele.append("undefined allele")
                perfectMatchIdAllele2.append("undefined allele")
                printinfo(genomeFile, geneFile)
                print("Undefined allele \n")

                # Only the path is reported; nothing is written to it here.
                geneFile2 = os.path.splitext(geneFile)[0] + "undefined.fasta"
                print(geneFile2)

            elif lenRatio < 0.5:

                ###############
                # SMALL MATCH #
                ###############

                perfectMatchIdAllele.append('small match')
                perfectMatchIdAllele2.append('small match')
                printinfo(genomeFile, geneFile)
                print("lower than 50% match \n")

            elif lenRatio < 0.8 and idPercent < 0.5:
                #####################
                # INCOMPLETE ALLELE #		# it was not possible to extend it to at least 80% of the length of the gene
                #####################
                perfectMatchIdAllele.append('allele incomplete')
                perfectMatchIdAllele2.append('allele incomplete')
                printinfo(genomeFile, geneFile)
                print("Incomplete allele\n")

            else:
                ##################
                #       LNF      #
                ##################

                printinfo(genomeFile, geneFile)
                perfectMatchIdAllele.append("LNF5")
                perfectMatchIdAllele2.append("LNF5")
                print("Locus not found")

    final = (resultsList, perfectMatchIdAllele)
    print("Finished allele calling at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    filepath = os.path.join(temppath,
                            os.path.basename(geneFile) + "_result.txt")
    filepath2 = os.path.join(temppath,
                             os.path.basename(geneFile) + "_result2.txt")
    with open(filepath, 'wb') as f:
        pickle.dump(final, f)
    with open(filepath2, 'wb') as f:
        pickle.dump(perfectMatchIdAllele2, f)
    return True

Example #33

Show file

File: alleleCalling.py Project: B-UMMI/wgMLSTscripts

def _bestBlastMatch(blast_records):
    """Pick the HSP with the highest query/subject length ratio.

    Every HSP of every alignment of every record is inspected.  The contig
    tag and contig length are captured together with the HSP, so they always
    describe the record the winning HSP came from.

    Returns a tuple (contigTag, contigLen, hsp, lenRatio), or None when no
    record contains an HSP.
    """
    best = None
    bestRatio = 0

    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                # NOTE(review): the ratio is taken on the strings as returned
                # by the parser; the caller strips '-' from match.query later,
                # so these strings may still contain gap characters - confirm
                # that this is the intended measure.
                lenRatio = float(len(hsp.query)) / float(len(hsp.sbjct))

                # strict '>' keeps the first HSP seen when ratios are tied
                if lenRatio > bestRatio:
                    # the contig tag is the query title up to the first space
                    contigTag = blast_record.query.partition(' ')[0]
                    best = (contigTag, blast_record.query_letters, hsp, lenRatio)
                    bestRatio = lenRatio

    return best


def callAlleles(argumentList):
    """Call the allele of one gene in each genome of a list.

    argumentList is indexed positionally:
        [0] geneFile          - path of the gene FASTA file (alleles); new
                                alleles are appended to this file
        [1] genomesList       - genome file paths (one trailing newline in an
                                entry is tolerated)
        [2] listOfCDSDicts    - one entry per genome, passed to extendCDS
        [3] listOfGenomesDict - one entry per genome, passed to extendCDS

    Returns a list with one string per genome, 'TAG:number':
        LNF:-1  locus not found (no BLAST HSP)
        LOT:-1  locus on the contig tip
        SAC:-1  small match (length ratio below 0.5)
        INC:-1  incomplete allele
        EXC:n   exact match with the known allele n
        UND:-1  undefined allele (contained in a known allele)
        NA:n    new allele, not extended and identity above 0.8
        INF:n   any other new allele (e.g. extended by extendCDS)

    Side effects: writes the BLAST output next to geneFile (same name with a
    '.xml' extension), appends new alleles to geneFile and rebuilds the BLAST
    database after every appended allele.
    """
    geneFile = argumentList[0]
    genomesList = argumentList[1]
    listOfCDSDicts = argumentList[2]
    listOfGenomesDict = argumentList[3]

    # --- index the known alleles: sequence -> position in the gene file --- #

    geneDict = {}
    alleleI = 0

    for allele in HTSeq.FastaReader(geneFile):
        if allele.seq in geneDict:
            print("\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n" + str(geneFile))
        else:
            geneDict[allele.seq] = alleleI
        # repeated alleles still consume an index
        alleleI += 1

    # --- make 1st blast DB --- #

    Gene_Blast_DB_name = Create_Blastdb(geneFile, 1)
    blast_out_file = os.path.splitext(geneFile)[0] + '.xml'

    # list of results - the output of the function
    resultsList = []

    for i, genomeFile in enumerate(genomesList):

        currentCDSDict = listOfCDSDicts[i]
        currentGenomeDict = listOfGenomesDict[i]

        if genomeFile.endswith('\n'):
            genomeFile = genomeFile[:-1]

        # ------------------------------ RUNNING BLAST ------------------------------ #

        cline = NcbiblastnCommandline(query=genomeFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
        blast_records = runBlastParser(cline, blast_out_file, genomeFile)

        # ------ DETERMINING BEST MATCH ------ #

        best = _bestBlastMatch(blast_records)

        if best is None:
            # LOCUS NOT FOUND
            resultsList.append('LNF:-1')
            continue

        contigTag, contigLen, match, lenRatio = best

        geneLen = len(match.sbjct)
        alleleStr = match.query
        idPercent = float(match.identities) / float(geneLen)

        # ---------- ALLELE CALLING AFTER DETERMINING BEST MATCH ---------- #

        if contigLen <= match.query_start or contigLen <= match.query_end:
            # LOCUS ON THE CONTIG TIP
            resultsList.append('LOT:-1')
            continue

        if lenRatio < 0.5:
            # SMALL MATCH (the meaning of the 'SAC' acronym is not documented)
            resultsList.append('SAC:-1')
            continue

        # --- using prodigal (through extendCDS) to try to extend the CDS --- #

        extended, strCDS = extendCDS(contigTag, currentCDSDict, match.sbjct_start, match.sbjct_end, currentGenomeDict)

        # --- the extension is accepted only if it is long enough --- #

        if extended and (len(strCDS) * lenRatio) >= geneLen:
            alleleStr = strCDS
            lenRatio = float(len(strCDS)) / float(geneLen)

        # --- removing gaps '-' --- #

        alleleStr = alleleStr.replace('-', '')

        if lenRatio < 0.8 and idPercent < 0.5:
            # INCOMPLETE ALLELE
            resultsList.append('INC:-1')
            continue

        # --- it might be needed to obtain the reverse complement of the allele string --- #

        if match.sbjct_start > match.sbjct_end:
            alleleStr = reverseComplement(alleleStr)

        if alleleStr in geneDict:
            # EXACT MATCH --- MATCH == GENE --- GENE FOUND
            resultsList.append('EXC:' + str(geneDict[alleleStr]))
            continue

        if any(alleleStr in knownAllele for knownAllele in geneDict):
            # UNDEFINED ALLELE - it is contained in another allele
            resultsList.append('UND:-1')
            continue

        # NEW ALLELE when it was not extended and the identity is high,
        # INFERRED ALLELE in every other case
        tagAux = 'NA:' if (not extended and idPercent > 0.8) else 'INF:'

        resultsList.append(tagAux + str(alleleI))
        geneDict[alleleStr] = alleleI
        alleleI += 1

        # --- add the new allele to the gene fasta --- #

        # NOTE(review): the header is written after the increment, so it is
        # one above the number stored in resultsList/geneDict - kept as in
        # the original, confirm whether headers are meant to be 1-based.
        with open(geneFile, 'a') as fG:
            fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] + '\n')
            fG.write(alleleStr + '\n')

        # --- remake blast DB so that the next genomes see the new allele --- #
        Gene_Blast_DB_name = Create_Blastdb(geneFile, 1)

    return resultsList