def getBlastScoreRatios(FASTAfile, allelescore, queryDef, databasePath, queryProteomeName, referenceGenomeArray, referenceCDS, isXML):
    """BLASTp the query proteome against a database and parse the hits.

    The output format is chosen by ``isXML``: the exact string ``'True'``
    selects XML output (``outfmt=5``, file ``BLASTresults.xml``), read with
    ``runBlastParser`` and parsed with ``parseBLASTRecordsXML``.  Any other
    value selects tabular output (``outfmt=6``, file ``BLASTresults.tab``),
    read with ``runBlastParserTAB`` and parsed with ``parseBLASTRecordsTAB``.

    ``FASTAfile`` is accepted but not used in this function.  The file named
    by ``queryProteomeName`` is deleted before returning.

    Returns:
        Whatever the selected record parser returns.
    """
    # The flag is compared against the string 'True', not the boolean.
    wants_xml = isXML == 'True'

    if wants_xml:
        blast_out_file = 'BLASTresults.xml'
        out_format = 5
    else:
        blast_out_file = 'BLASTresults.tab'
        out_format = 6

    cline = NcbiblastpCommandline(
        query=queryProteomeName,
        db=databasePath,
        out=blast_out_file,
        outfmt=out_format,
        num_alignments=7000,
        num_descriptions=7000,
    )

    print('BSR:')
    read_records = runBlastParser if wants_xml else runBlastParserTAB
    blast_records = read_records(cline, blast_out_file, False)

    # Only the parsing step is timed.
    parse_started = datetime.now()
    parse_records = parseBLASTRecordsXML if wants_xml else parseBLASTRecordsTAB
    ToNewAllele = parse_records(blast_records, allelescore, queryDef,
                                referenceGenomeArray, referenceCDS)
    print('CheckResults:' + str(datetime.now() - parse_started))

    os.remove(queryProteomeName)

    return ToNewAllele
def reDogetBlastScoreRatios(genefile, basepath, alleleI, allelescores2, newGene_Blast_DB_name, alleleList2, picklepath):
    """Re-run BLASTp for ``genefile`` and record every HSP score.

    ``genefile`` is used directly as the BLAST query against
    ``newGene_Blast_DB_name`` (e-value 0.001); the XML output is written to
    ``<basepath>/blastdbs/temp.xml``.  The integer score of every HSP of
    every alignment is appended to ``allelescores2`` in place.  The list
    ``[alleleI + 1, allelescores2]`` is then pickled to ``picklepath``.

    Returns:
        ``(int(alleleI + 1), allelescores2, alleleList2)``; ``alleleList2``
        is passed through untouched.
    """
    # A reader is constructed as in the original flow, but never iterated.
    HTSeq.FastaReader(genefile)

    next_allele_index = alleleI + 1
    query_fasta = genefile

    print("Re-starting Blast alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    xml_path = os.path.join(basepath, 'blastdbs/temp.xml')
    cline = NcbiblastpCommandline(query=query_fasta, db=newGene_Blast_DB_name,
                                  evalue=0.001, out=xml_path, outfmt=5)
    blast_records = runBlastParser(cline, xml_path, query_fasta)

    print("Blasted alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    # Every HSP contributes a score, in record/alignment/HSP order.
    allelescores2.extend(
        int(hsp.score)
        for record in blast_records
        for alignment in record.alignments
        for hsp in alignment.hsps
    )

    with open(picklepath, 'wb') as handle:
        pickle.dump([next_allele_index, allelescores2], handle)

    return int(next_allele_index), allelescores2, alleleList2
def getOwnBlastScore(FASTAfile, databasePath, queryProteomeName, numberOfLocus, blastResultsPath, LocusToUse, queryFile):
    """BLASTp a locus' proteins against their own database and parse the scores.

    ``CreateQueryDatabase`` builds the database and reports whether it is
    empty.  When it is, the function returns immediately with empty score
    containers.  Otherwise BLASTp is run (XML output,
    ``<blastResultsPath>/<numberOfLocus>_BLASTresults.xml``), the records are
    handed to ``parseOwnBLASTRecordsAndDuplicates`` and the resulting allele
    list to ``translateAlleleList``.  The ``.pin``/``.phr``/``.psq`` database
    files, the ``_blast.log`` file and the XML output are deleted afterwards.

    Returns:
        ``(allelescores, isEmpty, proteinsToQueryFile, alleleNumbers,
        sameAlleles, prevAlleleName)``.
    """
    (databasePath, isEmpty, proteinsToQueryFile,
     queryAlleleList, prevAlleleName) = CreateQueryDatabase(
        FASTAfile, databasePath, queryProteomeName)

    if isEmpty:
        # Nothing to BLAST: hand back empty score containers.
        return [], isEmpty, proteinsToQueryFile, {}, {}, prevAlleleName

    blast_out_file = blastResultsPath + '/' + numberOfLocus + '_BLASTresults.xml'

    cline = NcbiblastpCommandline(
        query=queryProteomeName,
        db=databasePath,
        out=blast_out_file,
        outfmt=5,
        num_alignments=7000,
        num_descriptions=7000,
    )
    blast_records = runBlastParser(cline, blast_out_file, False)

    allelescores, alleleList, alleleNumbers, sameAlleles = parseOwnBLASTRecordsAndDuplicates(
        blast_records, FASTAfile, queryAlleleList)

    proteinsToQueryFile = translateAlleleList(
        alleleList, queryProteomeName, LocusToUse, queryFile)

    # Remove the database files, the log and the BLAST output, in that order.
    for leftover in (databasePath + ".pin",
                     databasePath + ".phr",
                     databasePath + ".psq",
                     databasePath + "_blast.log",
                     blast_out_file):
        os.remove(leftover)

    return allelescores, isEmpty, proteinsToQueryFile, alleleNumbers, sameAlleles, prevAlleleName
Example #4
0
def getBlastScoreRatios(allelescore, alleleList, databasePath,
                        queryProteomeName, referenceGenomeArray, referenceCDS,
                        bestmatches, referenceCDSsequences, referenceFileName,
                        countNumberOfGenomes, blastResultsPath, LocusToUse):
    """BLASTp the query proteome and parse the XML hits.

    The BLAST output is written to
    ``blastResultsPath + countNumberOfGenomes + '_BLASTresults.xml'`` (plain
    string concatenation, no separator is inserted), parsed by
    ``parseBLASTRecordsXML`` and then deleted.

    Returns:
        ``(resultsList, addNewAlleles)`` as produced by
        ``parseBLASTRecordsXML``.
    """
    xml_path = blastResultsPath + countNumberOfGenomes + '_BLASTresults.xml'

    blast_options = {
        'query': queryProteomeName,
        'db': databasePath,
        'out': xml_path,
        'outfmt': 5,
        'num_alignments': 7000,
        'num_descriptions': 7000,
    }
    cline = NcbiblastpCommandline(**blast_options)

    records = runBlastParser(cline, xml_path, False)

    parsed = parseBLASTRecordsXML(
        records, allelescore, alleleList, referenceGenomeArray,
        referenceCDS, bestmatches, referenceCDSsequences, referenceFileName,
        LocusToUse)
    resultsList, addNewAlleles = parsed

    # The XML file is only needed for parsing.
    os.remove(xml_path)

    return resultsList, addNewAlleles
Example #5
0
def getOwnBlastScore(FASTAfile):
    """Translate every allele, BLASTp the proteins against themselves and
    collect one score per hit definition.

    Relies on names defined outside this function: ``pathRef``, ``nameOrg``,
    ``name`` and ``blast_out_file``.

    The translated alleles are written twice with identical content, to
    ``pathRef + 'allAllelesAA.fasta'`` (used to build the BLAST database) and
    to ``pathRef + nameOrg + 'proteome.fasta'`` (used as the query).

    Returns:
        dict mapping ``str(alignment.hit_def)`` to an integer HSP score.
    """
    fasta_entries = []
    for allele in HTSeq.FastaReader(FASTAfile):
        try:
            protein = str(translateSeq(allele.seq))
        except:
            # Catch-all kept as in the original: alleles that cannot be
            # translated are skipped silently.
            continue
        fasta_entries.append(">" + str(allele.name) + "\n" + protein + "\n")
    alleleProt = "".join(fasta_entries)

    with open(pathRef + 'allAllelesAA.fasta', "wb") as alleles_handle:
        alleles_handle.write(alleleProt)
    with open(pathRef + nameOrg + 'proteome.fasta', "wb") as proteome_handle:
        proteome_handle.write(alleleProt)

    # NOTE(review): the database name returned here is not used; the BLAST
    # call below passes ``db=name`` instead - confirm that is intended.
    Create_Blastdb(pathRef + 'allAllelesAA.fasta', 1, True)

    cline = NcbiblastpCommandline(query=pathRef + nameOrg + 'proteome.fasta',
                                  db=name, out=blast_out_file, outfmt=5,
                                  num_alignments=7000, num_descriptions=7000)
    blast_records = runBlastParser(cline, blast_out_file, alleleProt)

    allelescores = {}
    for record in blast_records:
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                hit_name = str(alignment.hit_def)
                # Take the first HSP that is either the first one seen for
                # this hit or beats the stored score, then move on to the
                # next alignment.
                if hit_name not in allelescores or allelescores[hit_name] < hsp.score:
                    allelescores[hit_name] = int(hsp.score)
                    break
    return allelescores
def main():
    """Split an ffn file into per-gene FASTA files, dropping paralogs and short genes.

    Command line:
        -i  path to the ffn file (required)
        -g  minimum gene size, integer (required)

    The ffn file is BLASTn-ed against a database built from itself (e-value
    0.001, XML output next to the input file).  For every query record that
    has a second alignment, that alignment's ``hit_def`` is recorded as a
    paralog.  Each sequence whose ``"<name> <descr>"`` is not a recorded
    paralog and whose length is strictly greater than ``-g`` is written to
    ``<input dir>/<name with '|' replaced by '_'>.fasta`` under the header
    ``>1``.  Counts of removed paralogs and of too-short genes are printed.
    """
    parser = argparse.ArgumentParser(description="Given an ffn file, recovers the genes that are not paralogs and have a size bigger than the g parameter provided")
    parser.add_argument('-i', nargs='?', type=str, help='ffn file', required=True)
    parser.add_argument('-g', nargs='?', type=int, help='int minimum size', required=True)

    args = parser.parse_args()
    genes = args.i
    sizethresh = args.g

    geneFile = os.path.abspath(genes)
    Gene_Blast_DB_name = Create_Blastdb(geneFile, 1)
    blast_out_file = os.path.splitext(geneFile)[0] + '.xml'

    # ------------------------------ RUNNING BLAST ------------------------------ #

    cline = NcbiblastnCommandline(query=geneFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
    blast_records = runBlastParser(cline, blast_out_file, geneFile)

    # A set gives constant-time membership tests in the filtering loop below.
    paralogs = set()
    for blast_record in blast_records:
        try:
            second_hit = blast_record.alignments[1]
        except IndexError:
            # Fewer than two alignments: nothing is recorded for this record.
            continue
        paralogs.add(second_hit.hit_def)

    pathfiles = os.path.dirname(geneFile) + "/"
    print(pathfiles)

    removedparalogs = 0
    removedsize = 0
    for contig in HTSeq.FastaReader(genes):
        name = contig.name + " " + contig.descr
        if name in paralogs:
            print(name)
            removedparalogs += 1
            continue
        if len(contig.seq) > sizethresh:
            namefile = contig.name.replace("|", "_")
            with open(pathfiles + namefile + ".fasta", "wb") as f:
                f.write(">1\n" + contig.seq + "\n")
        else:
            removedsize += 1

    print("Removed %s paralog genes" % str(removedparalogs))
    print("Removed %s because of size :" % str(removedsize))
def reDogetBlastScoreRatios(genefile, basepath, alleleI, allelescores2, newGene_Blast_DB_name, alleleList2):
    """Re-run BLASTp for ``genefile`` and append each query's self-hit score.

    ``genefile`` is used directly as the query against
    ``newGene_Blast_DB_name`` (e-value 0.001, XML output in
    ``<basepath>/blastdbs/temp.xml``).  For every BLAST record, alignments
    are scanned until an HSP is found whose alignment ``hit_def`` equals the
    record ``query`` (both compared as integers); that HSP's integer score is
    appended to ``allelescores2`` in place.  Records, alignments and HSPs are
    printed while scanning.

    Returns:
        ``(alleleI + 1, allelescores2, alleleList2)``.
    """
    # A reader is constructed as in the original flow, but never iterated.
    HTSeq.FastaReader(genefile)

    alleleI = alleleI + 1

    query_fasta = genefile
    print(query_fasta)
    xml_path = os.path.join(basepath, 'blastdbs/temp.xml')

    # --- get BLAST score ratio --- #
    cline = NcbiblastpCommandline(query=query_fasta, db=newGene_Blast_DB_name,
                                  evalue=0.001, out=xml_path, outfmt=5)
    blast_records = runBlastParser(cline, xml_path, query_fasta)

    for record in blast_records:
        self_hit_seen = False
        print(record)
        for alignment in record.alignments:
            # The alignment is printed before the stop check, so the
            # alignment following a self-hit is still printed once.
            print("%s %s" % (alignment, alignment.hsps))
            if self_hit_seen:
                break
            for hsp in alignment.hsps:
                print(hsp)
                if int(alignment.hit_def) == int(record.query):
                    allelescores2.append(int(hsp.score))
                    self_hit_seen = True
                    break

    return alleleI, allelescores2, alleleList2
def getBlastScoreRatios(genefile, basepath):
    """Translate the alleles of ``genefile`` and BLASTp them against themselves.

    Alleles are numbered from 1 in file order; their translations are
    written to ``<basepath>/<gene basename>_protein.fasta`` under those
    numbers, a BLAST database is built from that file and the same file is
    used as the query (e-value 0.001, XML output in
    ``<basepath>/blastdbs/<gene basename>.xml``).  For every BLAST record the
    score of the first HSP whose alignment ``hit_def`` equals the record
    ``query`` (both compared as integers) is collected.

    Returns:
        ``(alleleI, allelescores, Gene_Blast_DB_name, alleleList)``: the
        number of alleles read, the collected integer scores, the database
        name and the raw allele sequences.
    """
    alleleList = []
    protein_entries = []
    alleleI = 0
    for alleleI, allele in enumerate(HTSeq.FastaReader(genefile), 1):
        alleleList.append(allele.seq)
        # translateSeq returns three values here; only the first is used.
        translated, _unused_a, _unused_b = translateSeq(allele.seq)
        protein_entries.append(">" + str(alleleI) + "\n" + str(translated + "\n"))
    alleleProt = "".join(protein_entries)

    gene_basename = os.path.basename(genefile)
    proteinfastaPath = os.path.join(basepath, str(gene_basename + '_protein.fasta'))
    blast_out_file = os.path.join(basepath, "blastdbs/" + gene_basename + '.xml')
    with open(proteinfastaPath, "wb") as protein_handle:
        protein_handle.write(alleleProt)

    print("Starting Blast alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    Gene_Blast_DB_name = Create_Blastdb(proteinfastaPath, 1, True)
    print(proteinfastaPath)

    # --- get BLAST score ratio --- #
    cline = NcbiblastpCommandline(query=proteinfastaPath, db=Gene_Blast_DB_name,
                                  evalue=0.001, out=blast_out_file, outfmt=5)

    print("Parse bsr blast at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    blast_records = runBlastParser(cline, blast_out_file, alleleProt)

    print("Blasted alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    allelescores = []
    for record in blast_records:
        self_hit_seen = False
        for alignment in record.alignments:
            if self_hit_seen:
                break
            for hsp in alignment.hsps:
                if int(alignment.hit_def) == int(record.query):
                    allelescores.append(int(hsp.score))
                    self_hit_seen = True
                    break

    return alleleI, allelescores, Gene_Blast_DB_name, alleleList
def reDogetBlastScoreRatios(genefile, basepath, alleleI, allelescores2,
                            newGene_Blast_DB_name, alleleList2, picklepath,
                            verbose, blastPath, listAllelesNames):
    """BLASTp a new allele file and store the resulting score under ``alleleI``.

    ``genefile`` is the BLAST query, run with the executable ``blastPath``
    against ``newGene_Blast_DB_name`` (e-value 0.001, one thread, XML output
    in ``<basepath>/blastdbs/temp.xml``).  The integer score of the last HSP
    encountered (0 when there is none) is stored in
    ``allelescores2[alleleI]``, and ``allelescores2`` is pickled to
    ``picklepath``.  Progress messages are printed only when ``verbose`` is
    truthy.

    Returns:
        ``(allelescores2, alleleList2, listAllelesNames)``; the last two are
        passed through untouched.
    """
    def _report(message):
        # Messages are always built by the caller; printing is optional.
        if verbose:
            print(message)

    query_fasta = genefile

    _report("Starting Blast of new alleles to calculate BSR at : " +
            time.strftime("%H:%M:%S-%d/%m/%Y"))

    xml_path = os.path.join(basepath, 'blastdbs/temp.xml')

    cline = NcbiblastpCommandline(cmd=blastPath,
                                  query=query_fasta,
                                  db=newGene_Blast_DB_name,
                                  evalue=0.001,
                                  out=xml_path,
                                  outfmt=5,
                                  num_threads=1)
    blast_records = runBlastParser(cline, xml_path)

    _report("Blasted alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    # Each HSP overwrites the previous value, so the last one wins.
    last_score = 0
    for record in blast_records:
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                last_score = int(hsp.score)

    allelescores2[alleleI] = last_score
    with open(picklepath, 'wb') as handle:
        pickle.dump(allelescores2, handle)

    return allelescores2, alleleList2, listAllelesNames
def BLASTp(queryFile, dbName, blast_out_path, queryNames, sequenceLengths):
    """Return the ``hit_def`` of the best-scoring, sufficiently identical hit.

    BLASTp is run with ``queryFile`` against ``dbName`` (XML output in
    ``<blast_out_path>/blastOut.xml``).  For each record, the query length is
    looked up in ``sequenceLengths`` at the position of the record's query
    name (with leading/trailing ``|`` stripped) in ``queryNames``.  An HSP
    qualifies when ``identities / query length >= 0.8``; among all qualifying
    HSPs of all records, the one with the highest score wins (the first one
    on ties).

    Returns:
        The winning alignment's ``hit_def``, or ``''`` when nothing qualifies.
    """
    xml_path = os.path.join(blast_out_path, 'blastOut.xml')
    cline = NcbiblastpCommandline(query=queryFile, db=dbName, out=xml_path, outfmt=5)
    records = runBlastParser(cline, xml_path, "")

    best_hit = ''
    best_score = -1
    # The best score is tracked across all records, not per record.
    for record in records:
        query_length = sequenceLengths[queryNames.index(record.query.strip('|'))]
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                identity_ratio = float(hsp.identities) / float(query_length)
                if identity_ratio >= 0.8 and best_score < hsp.score:
                    best_hit = alignment.hit_def
                    best_score = hsp.score

    return best_hit
def getBlastScoreRatios(allelescore, alleleList, databasePath, queryProteomeName, referenceGenomeArray, referenceCDS, bestmatches, referenceCDSsequences, referenceFileName, countNumberOfGenomes, blastResultsPath, LocusToUse):
    """Run BLASTp for the query proteome and return the parsed XML results.

    Output goes to ``blastResultsPath + countNumberOfGenomes +
    '_BLASTresults.xml'`` (the pieces are concatenated as-is), is parsed by
    ``parseBLASTRecordsXML`` and is removed afterwards.

    Returns:
        The ``(resultsList, addNewAlleles)`` pair produced by
        ``parseBLASTRecordsXML``.
    """
    results_file = blastResultsPath + countNumberOfGenomes + '_BLASTresults.xml'

    blast_records = runBlastParser(
        NcbiblastpCommandline(
            query=queryProteomeName,
            db=databasePath,
            out=results_file,
            outfmt=5,
            num_alignments=7000,
            num_descriptions=7000,
        ),
        results_file,
        False,
    )

    resultsList, addNewAlleles = parseBLASTRecordsXML(
        blast_records,
        allelescore,
        alleleList,
        referenceGenomeArray,
        referenceCDS,
        bestmatches,
        referenceCDSsequences,
        referenceFileName,
        LocusToUse,
    )

    # Clean up the BLAST output once it has been parsed.
    os.remove(results_file)

    return resultsList, addNewAlleles
Example #12
0
def getOwnBlastScore(FASTAfile, databasePath, queryProteomeName, numberOfLocus,
                     blastResultsPath, LocusToUse, queryFile):
    """Self-BLAST the proteins of one locus and parse scores and duplicates.

    Steps: build the query database with ``CreateQueryDatabase``; stop early
    if it reports an empty database; run BLASTp into
    ``<blastResultsPath>/<numberOfLocus>_BLASTresults.xml``; parse the
    records with ``parseOwnBLASTRecordsAndDuplicates``; pass the allele list
    to ``translateAlleleList``; delete the database files (``.pin``,
    ``.phr``, ``.psq``), the ``_blast.log`` file and the XML output.

    Returns:
        ``(allelescores, isEmpty, proteinsToQueryFile, alleleNumbers,
        sameAlleles, prevAlleleName)``.  On the early-exit path the score
        list and both dictionaries are empty.
    """
    database_info = CreateQueryDatabase(FASTAfile, databasePath,
                                        queryProteomeName)
    databasePath, isEmpty, proteinsToQueryFile, queryAlleleList, prevAlleleName = database_info

    allelescores, alleleNumbers, sameAlleles = [], {}, {}
    if isEmpty:
        return allelescores, isEmpty, proteinsToQueryFile, alleleNumbers, sameAlleles, prevAlleleName

    xml_path = blastResultsPath + '/' + numberOfLocus + '_BLASTresults.xml'

    cline = NcbiblastpCommandline(query=queryProteomeName, db=databasePath,
                                  out=xml_path, outfmt=5,
                                  num_alignments=7000, num_descriptions=7000)
    records = runBlastParser(cline, xml_path, False)

    allelescores, alleleList, alleleNumbers, sameAlleles = parseOwnBLASTRecordsAndDuplicates(
        records, FASTAfile, queryAlleleList)

    proteinsToQueryFile = translateAlleleList(alleleList, queryProteomeName,
                                              LocusToUse, queryFile)

    # Database artefacts first, then the BLAST output itself.
    for suffix in (".pin", ".phr", ".psq", "_blast.log"):
        os.remove(databasePath + suffix)
    os.remove(xml_path)

    return allelescores, isEmpty, proteinsToQueryFile, alleleNumbers, sameAlleles, prevAlleleName
def getBlastScoreRatios(genefile):
    """Translate the alleles of ``genefile`` and BLASTp them against themselves.

    Alleles are numbered from 1 in file order and their translations are
    written, under those numbers, to
    ``./blastdbs/temp<gene basename>/protein.fasta``.  A BLAST database is
    built from that file and the same file is used as the query (e-value
    0.001, XML output).  For every BLAST record, the score of the first HSP
    whose alignment ``hit_def`` equals the record ``query`` (both compared as
    integers) is collected.

    The XML output is written inside the per-gene temporary directory
    (``<basepath>/protein.xml``).  It was previously written to
    ``basepath + 'protein.xml'``, i.e. beside the directory, where removing
    the directory would not clean it up.

    Returns:
        ``(alleleI, allelescores, Gene_Blast_DB_name)``: the number of
        alleles read, the collected integer scores and the database name.
    """
    protein_entries = []
    alleleI = 0
    for alleleI, allele in enumerate(HTSeq.FastaReader(genefile), 1):
        protein_entries.append(
            ">" + str(alleleI) + "\n" + str(translateSeq(allele.seq) + "\n"))
    alleleProt = "".join(protein_entries)

    basepath = "./blastdbs/temp" + str(os.path.basename(genefile))
    if not os.path.exists(basepath):
        os.makedirs(basepath)

    protein_fasta = basepath + '/protein.fasta'
    # Keep the BLAST output inside the temporary directory.
    blast_xml = os.path.join(basepath, 'protein.xml')

    with open(protein_fasta, "wb") as f:
        f.write(alleleProt)
    Gene_Blast_DB_name = Create_Blastdb(protein_fasta, 1, True)

    # --- get BLAST score ratio --- #
    cline = NcbiblastpCommandline(query=protein_fasta, db=Gene_Blast_DB_name,
                                  evalue=0.001, out=blast_xml, outfmt=5)
    blast_records = runBlastParser(cline, blast_xml, alleleProt)

    allelescores = []
    for blast_record in blast_records:
        found = False
        for alignment in blast_record.alignments:
            if found:
                break
            for match in alignment.hsps:
                # Only the allele's hit against itself provides its score.
                if int(alignment.hit_def) == int(blast_record.query):
                    allelescores.append(int(match.score))
                    found = True
                    break

    return alleleI, allelescores, Gene_Blast_DB_name
Example #14
0
def reDogetBlastScoreRatios(genefile, basepath, alleleI, allelescores2,
                            newGene_Blast_DB_name, alleleList2, picklepath):
    """BLASTp ``genefile`` again and append all HSP scores to ``allelescores2``.

    The query is ``genefile`` itself, the database is
    ``newGene_Blast_DB_name`` (e-value 0.001) and the XML output goes to
    ``<basepath>/blastdbs/temp.xml``.  Scores are appended in place as
    integers, one per HSP.  ``[alleleI + 1, allelescores2]`` is pickled to
    ``picklepath`` before returning.

    Returns:
        ``(int(alleleI + 1), allelescores2, alleleList2)``.
    """
    # A reader is constructed as in the original flow, but never iterated.
    HTSeq.FastaReader(genefile)

    alleleI = alleleI + 1
    timestamp_format = "%H:%M:%S-%d/%m/%Y"

    print("Re-starting Blast alleles at : " + time.strftime(timestamp_format))

    temp_xml = os.path.join(basepath, 'blastdbs/temp.xml')
    blast_command = NcbiblastpCommandline(
        query=genefile,
        db=newGene_Blast_DB_name,
        evalue=0.001,
        out=temp_xml,
        outfmt=5,
    )
    records = runBlastParser(blast_command, temp_xml, genefile)

    print("Blasted alleles at : " + time.strftime(timestamp_format))

    for record in records:
        for alignment in record.alignments:
            # Every HSP of every alignment contributes one score.
            allelescores2.extend(int(hsp.score) for hsp in alignment.hsps)

    with open(picklepath, 'wb') as out_handle:
        pickle.dump([alleleI, allelescores2], out_handle)

    return int(alleleI), allelescores2, alleleList2
def callAlleles(argumentList):
	"""Call alleles of one gene for a list of genomes using BLAST score ratios.

	``argumentList`` is unpacked positionally:
	    [0] geneFile    - FASTA file of the gene's alleles (appended to when a
	                      new allele is inferred)
	    [1] genomesList - genome file names, indexed in step with [2] and [3]
	    [2] listOfProts - one protein FASTA text per genome (written to disk
	                      and used as the BLAST query)
	    [3] listAllCDS  - per genome, a mapping indexed with ">" + query name
	                      whose value is written to the gene file

	For each genome a tag is appended to the results: 'LNF3:-1' when no match
	was recorded, 'EXC:<allele id>' when a score ratio of exactly 1 was seen,
	otherwise 'INF<new allele number>' and the matched CDS is appended to
	``geneFile`` and the allele scores/database are recomputed.

	Returns a tuple ``(resultsList, perfectMatchIdAllele)``.

	NOTE(review): ``basepath`` is only assigned inside the genome loop, so the
	final ``shutil.rmtree(basepath)`` raises NameError when ``listOfProts`` is
	empty.
	"""
	
	geneFile = argumentList[0]
	genomesList = argumentList[1]
	listOfProts = argumentList[2]
	listAllCDS = argumentList[3]
	# gene_fp and i are assigned but not used below.
	gene_fp = HTSeq.FastaReader(geneFile)
	alleleI = 0
	resultsList = []
	i = 0
	perfectMatchIdAllele=[]
	# NOTE(review): bestmatch is initialised once here and never reset inside
	# the per-genome loop, so a match recorded for one genome is still in
	# place when the next genome is evaluated - confirm this is intended.
	bestmatch=[0,0,False,'',0] #score, score ratio, perfectmatch, key name of the DNA sequence string, allele ID
	allelescores=[]
	
	
	# Self-BLAST of the gene's alleles: allele count, per-allele scores, DB name.
	alleleI,allelescores,Gene_Blast_DB_name=getBlastScoreRatios(geneFile)
	
	# Index of the genome being processed; incremented at the top of each iteration.
	genome=-1	
	
	for protList in listOfProts:

		basepath="./blastdbs/temp"+str(os.path.basename(geneFile))
		genome+=1
		with open(basepath+'/proteinList.fasta', "wb") as f:
			f.write(protList)
		# NOTE(review): the XML path has no separator after basepath, so the
		# file is written beside the temp directory rather than inside it and
		# is not removed by the rmtree at the end.
		cline = NcbiblastpCommandline(query=basepath+'/proteinList.fasta', db=Gene_Blast_DB_name, evalue=0.001, out=basepath+'proteinList.xml', outfmt=5)
		blast_records = runBlastParser(cline, basepath+'proteinList.xml', basepath+'/proteinList.fasta')
		for blast_record in blast_records:
				
			for alignment in blast_record.alignments:
				for match in alignment.hsps:
					# hit_def is used as a 1-based allele number indexing allelescores.
					scoreRatio=float(match.score)/float(allelescores[int(alignment.hit_def)-1])
					cdsStrName=blast_record.query
					# First exact hit (ratio == 1) always wins over a non-exact best match.
					if(scoreRatio == 1 and bestmatch[2] is False):
						bestmatch=[match.score,scoreRatio,True,cdsStrName,int(alignment.hit_def)]
					# A later exact hit replaces the current one only with a higher raw score.
					elif(scoreRatio == 1 and match.score>bestmatch[0]):
						bestmatch=[match.score,scoreRatio,True,cdsStrName,int(alignment.hit_def)]
					# Non-exact candidate: needs ratio > 0.4 and must beat both the
					# stored score and the stored ratio; never replaces an exact hit.
					elif(match.score>bestmatch[0] and scoreRatio>0.4 and scoreRatio>bestmatch[1] and bestmatch[2] is False):
						bestmatch=[match.score,scoreRatio,False,cdsStrName,int(alignment.hit_def)]
				
		if bestmatch[0]==0:
					# no best match was recorded
					
					###################
					# LOCUS NOT FOUND #
					###################
						
			resultsList.append('LNF3:-1')            # append result to the list of results
			perfectMatchIdAllele.append('LNF')
			print "Locus not found, no matches \n"
			
				
		elif bestmatch[2] is True:
						
					# an exact match (score ratio == 1) was recorded
					
						################################################
						# EXACT MATCH --- MATCH == GENE --- GENE FOUND #
						################################################
					
			perfectMatchIdAllele.append(str(bestmatch[4]))
			resultsList.append('EXC:' + str(bestmatch[4]) )
				
		else:
					#######################
					# ADD INFERRED ALLELE #		# a new allele 
					#######################
					
			tagAux='INF'
			perfectMatchIdAllele.append( tagAux +"-"+str(alleleI+1))
			print "New allele! Adding allele "+ tagAux + str(alleleI+1) +" to the database\n"
																				
			resultsList.append( tagAux + str(alleleI+1) )

										# --- add the new allele to the gene fasta --- #

			fG = open( geneFile, 'a' )
			# tagAux[:-1] drops the last character of the tag, giving 'IN' in the header.
			fG.write('>allele_' + str(alleleI+1) + '_' + tagAux[:-1] +"_" + str(os.path.basename(genomesList[genome])) + '\n')
				
			listOfCDS=listAllCDS[genome]
			# The matched CDS is looked up by ">" + the BLAST query name.
			fG.write( listOfCDS[">"+bestmatch[3]] + '\n')
			fG.close()
					# --- remake blast DB --- #
			# NOTE(review): this DB name is overwritten by the call on the next
			# line - confirm whether the explicit rebuild is needed.
			Gene_Blast_DB_name = Create_Blastdb( basepath+'/protein.fasta', 1, True )
			alleleI,allelescores,Gene_Blast_DB_name=getBlastScoreRatios(geneFile)
	# Remove the per-gene temporary directory.
	shutil.rmtree(basepath)

	
	final =	(resultsList,perfectMatchIdAllele)	
	return final
def callAlleles(argumentList):

	geneFile = argumentList[0]
	genomesList = argumentList[1]
	listOfCDSDicts = argumentList[2]
	listOfGenomesDict = argumentList[3]
	
	gene_fp = HTSeq.FastaReader(geneFile)
	geneDict = {}
	alleleI = 1
	inverted=False
	orderedAlleleNames=[]
	biggestAllelelen=0
	for allele in gene_fp:
		if allele.seq in geneDict:
			print "\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n", geneFile
		else:
			if len(allele.seq)>biggestAllelelen:
				biggestAllelelen=len(allele.seq)
			orderedAlleleNames.append(allele.name)
			geneDict[ allele.seq ] = alleleI
			alleleI += 1
	#print geneDict
	#print orderedAlleleNames

	# --- make 1st blast DB --- #

	Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 )
	geneF = os.path.splitext( geneFile )[0]
	blast_out_file = geneF + '.xml'

	# list of results - the output of the function
	resultsList = []
	i = 0
	perfectMatchIdAllele=[]
	
	for genomeFile in genomesList:
		#print geneDict
		currentCDSDict = listOfCDSDicts[i]
		currentGenomeDict = listOfGenomesDict[i]
		
		#print genomeFile
		#print resultsList
		#print geneDict
		#print orderedAlleleNames
		i+=1		# it has to be incremented here
		if genomeFile[-1] == '\n':
			genomeFile = genomeFile[:-1]


                # ------------------------------ RUNNING BLAST ------------------------------ #

		cline = NcbiblastnCommandline(query=genomeFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
		blast_records = runBlastParser(cline, blast_out_file, genomeFile)


		# ------ DETERMINING BEST MATCH ------ #

		# bestMatch = ['rec.query','hsp', lenRatio]
		bestMatch = ['','', 0]
		bestMatchContig=''
		bestMatchContigLen=''
		bestalignlen=0
		perfectMatch=False
		bmAlleleLen2=0
		bmAllele=''
		#noAlignment=False
		for blast_record in blast_records:
			# --- the LNF cases are now called outside de loop --- #
			#print blast_record
			if perfectMatch==True:
				break
			try:
				#print blast_record.alignments
				hspC = blast_record.alignments[0]
				
				if bestMatch[0] == '' and bestMatch[1] == '':
					bestMatch[0] = blast_record.query
					bestMatch[1] = hspC
			except IndexError:
				continue


			# --- the contig tag is used in the progigal function --- #

			contigTag = blast_record.query
			# --- brute force parsing of the contig tag - better solution is advisable --- #			

			j=0
			for l in contigTag:
				if l == ' ':
					break
				j+=1

			contigTag = contigTag[:j]

			contigLen = blast_record.query_letters
			#print blast_record.query_id
			
			# --- iterating over all the results to determine the best match --- #
			for alignment in blast_record.alignments:
				
				index=orderedAlleleNames.index(alignment.hit_def)
				#print alignment.hit_def
				for k, v in geneDict.iteritems():
					if v == index+1:
						bmAlleleLen2= len(k)
					
				if perfectMatch:
					break
				for match in alignment.hsps:
					#print match
					scoreRatio = float(match.score) / float(bmAlleleLen2)
					
					#print alignment.hit_def
					#print match.identities
					#print bmAlleleLen2
					#print
					#print match.identities
					#print len(match.sbjct)

					
					if (int(match.identities)==int(bmAlleleLen2) and int(match.identities)==int(len(match.sbjct)) and "N" not in match.query ): 
						index=orderedAlleleNames.index(alignment.hit_def)
						bmAlleleLen= len(geneDict.keys()[index])
						
						lenratio=float(len(match.query))/float(bmAlleleLen)
						bestMatch = [blast_record.query, match, scoreRatio, alignment.hit_def,lenratio,bmAlleleLen]
						bestMatchContig=contigTag
						perfectMatch=True
						index=orderedAlleleNames.index(alignment.hit_def)
						bmAlleleLen= len(geneDict.keys()[index])
						break
						
					elif scoreRatio > bestMatch[2]:
						index=orderedAlleleNames.index(alignment.hit_def)
						bmAllele=geneDict.keys()[index]
						bmAlleleLen= len(bmAllele)
						lenratio=float(len(match.query))/float(bmAlleleLen)
						bestMatch = [blast_record.query, match, scoreRatio, alignment.hit_def,lenratio,bmAlleleLen]
						bestMatchContig=contigTag
						bestMatchContigLen=blast_record.query_letters
						if match.sbjct_start > match.sbjct_end:
							inverted=True
						#print match.query
						bestalignlen=alignment.length
						
						
					
					if perfectMatch==True:
						break


		# ---------- ALLELE CALLING AFTER DETERMINING BEST MATCH ---------- #

		
		
		try:
			#print bestMatch[0]
			match = bestMatch[1]
			#print match.query
			geneLen = bestMatch[5]
			alleleStr = match.query
			nIdentities = match.identities
			idPercent = float(nIdentities) / float(geneLen)
			scoreRatio = bestMatch[2]
			lenRatio = bestMatch[4]
			#print perfectMatch
			#print "\nContig best/exact match is :"
			#print bestMatchContig +"\n"
		
		except:
			resultsList.append('LNF3:-1')            # append result to the list of results
			perfectMatchIdAllele.append('LNF')
			printinfo(genomeFile,geneFile)
			print "Locus not found \n"
			continue
		
		#TODO check identities >0.8
		
		if perfectMatch is True:
			#TODO perfect match to top
			if match.sbjct_start > match.sbjct_end:
				alleleStr = reverseComplement(alleleStr)
			#TODO test replace -
			#alleleStr = alleleStr.replace('-', '')
			alleleNumber = geneDict[ alleleStr ]
			
					################################################
					# EXACT MATCH --- MATCH == GENE --- GENE FOUND #
					################################################
			if "_" in bestMatch[3]:
				a=bestMatch[3].split("_")
				perfectMatchIdAllele.append(a[1])
			else:
				perfectMatchIdAllele.append(bestMatch[3])
			resultsList.append('EXC:' + str(alleleNumber) )
		
						###################
						# LOCUS NOT FOUND #
						###################
			
		#elif bestMatch[0] == '':
		#	resultsList.append('LNF:-1')            # append result to the list of results
		#	perfectMatchIdAllele.append('LNF')
		#	printinfo(genomeFile,geneFile)
		#	print "Locus not found \n"

		elif bestMatch[0] != '' and perfectMatch is not True:
						
				

					###########################
					# LOCUS ON THE CONTIG TIP #
					###########################
			
			
			#if match.query_start == 1 or bestMatchContigLen <= match.query_end:
			## TODO-
			## 1 -  LOT5 match.query_start ==1 and match.length < match.subj.length (allele length) alignement length
			## 2 - LOT 3' match.query_end == match.query.length (contig length) and match.length < contig length (allele length??)
			## 3 - LOT SC bestMatchContigLen <= allele length
			
			if match.query_start ==1 and len(match.query) < geneLen:
			
				resultsList.append('LOT5:-1')
				perfectMatchIdAllele.append('LOT5')
				printinfo(genomeFile,geneFile)
				
				print "Locus is on the 5' tip of the contig \n"
			
			
			elif match.query_end == bestMatchContigLen and len(match.query) < bestMatchContigLen:
				
				resultsList.append('LOT3:-1')
				perfectMatchIdAllele.append('LOT3')
				printinfo(genomeFile,geneFile)

				print "Locus is on the 3' tip of the contig \n"
			
			elif bestMatchContigLen <= geneLen:
				
				resultsList.append('LOTSC:-1')
				perfectMatchIdAllele.append('LOTSC')
				printinfo(genomeFile,geneFile)
				#print match.query_start
				print "Locus is bigger than the contig \n"
					
					
				

			elif 'N' in alleleStr:
				#TODO gravar para ficheiro
					#####################
					# ALLELE NOT FOUND  #		# N base found!
					#####################
				
				geneFile2= os.path.splitext(geneFile)[0] + "LNFN.fasta"
				print geneFile2
				with open(geneFile2, 'a') as f:
					f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+"\n")
					f.write((alleleStr) +"\n")
				resultsList.append('LNFN:-1')
				perfectMatchIdAllele.append('LNFN')
				printinfo(genomeFile,geneFile) 
				print "LNFN, contains N bases! \n"
			
			
			
			else:
				# ------------------------------------------------------------------------------------------------------- #
				#                                                                                                         #
				#                                   USING PRODIGAL TO TRY TO EXTEND CDS                                   #
				#                                                                                                         #
				# ------------------------------------------------------------------------------------------------------- #
				
				CDSType=''
				extended, strCDS, CDSType = extendCDS(bestMatchContig, currentCDSDict, match.query_start, match.query_end, currentGenomeDict, geneLen)
				# --- if it was possible to extend it using prodigal --- #
				
				#print extended
				#print strCDS
				#print CDSType
				

				if extended :
					alleleStr = strCDS
					
					lenRatio = float(len(strCDS)) / float(geneLen)
					#print alleleStr
					#print lenRatio
				elif not extended and biggestAllelelen > geneLen:
					extended, strCDS, CDSType = extendCDS(bestMatchContig, currentCDSDict, match.query_start, match.query_end, currentGenomeDict, biggestAllelelen)
					if extended :
						alleleStr = strCDS
					
						lenRatio = float(len(strCDS)) / float(geneLen)
					else:
						alleleStr = alleleStr.replace('-', '')
				
				
				else:
					# --- removing gaps '-' --- #
				#print alleleStr
					
					alleleStr = alleleStr.replace('-', '')

				# --- continuing the allele calling --- #

					
			
					#print geneDict
					#print alleleStr
					# --- it might be needed to obtain the reverse complement of the allele string --- #
				if match.sbjct_start > match.sbjct_end:
					alleleStr = reverseComplement(alleleStr)
					
				if alleleStr in geneDict:
					alleleNumber = geneDict[ alleleStr ]
					
					################################################
					# EXACT MATCH --- MATCH == GENE --- GENE FOUND #						
					################################################
					perfectMatchIdAllele.append(alleleNumber)
					resultsList.append('EXC:' + str(alleleNumber) )
						

				else:

					isUndefined = False	
					#print geneDict.keys()[0]
					defAllele=''
					defAlleleName=''
					for k in geneDict.keys():
						if alleleStr in k:
							defAllele=k
							#print alleleStr
							isUndefined = True
							defAlleleName=geneDict.get(k)
							break

						
					if extended and isUndefined and idPercent > 0.8 and ((int(len(match.query))==int(len(defAllele)) or int(len(match.query))==int(len(defAllele))+1 or int(len(match.query))==int(len(defAllele))-1)) :
						#extended allele to compare may be different from the allele to compare from bm	
						alleleStr=match.query
							
						alleleStr = alleleStr.replace('-', '')
							
						if match.sbjct_start > match.sbjct_end:    #### - error??
							alleleStr = reverseComplement(alleleStr)
							
						if int(len(alleleStr))==int(len(defAllele)): # se o match for do mesmo tamanho que o alello
							tagAux = 'NA1:'
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append("NA1-"+str(alleleI))
							
						elif int(len(alleleStr))==int(len(defAllele))-1 : # se o match tiver uma base a mais que o alelo
							tagAux = 'NA2:'
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append("NA2-"+str(alleleI))
							
						else:												#se o match tiver uma base a menos que o alelo
							tagAux = 'NA3:'
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append("NA3-"+str(alleleI))
							
								
						print "New allele found! Adding allele "+ tagAux + str(alleleI) +" to the database"
						geneDict[alleleStr] = alleleI
							
						resultsList.append( tagAux + str(alleleI) )
							
						orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile)))									
						# --- add the new allele to the gene fasta --- #
							
						fG = open( geneFile, 'a' )
						fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile)) + '\n')
						fG.write( alleleStr + '\n')
						fG.close()
						alleleI += 1
						Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 )
					
					elif not extended and idPercent > 0.8 and ((int(len(match.query))==int(geneLen) or int(len(match.query))==int(geneLen)+1 or int(len(match.query))==int(geneLen)-1)) :
							
						alleleStr=match.query
							
						alleleStr = alleleStr.replace('-', '')
							
						if match.sbjct_start > match.sbjct_end:    #### - error??
							alleleStr = reverseComplement(alleleStr)
							
						if int(len(alleleStr))==int(geneLen): # se o match for do mesmo tamanho que o alello
							tagAux = 'NA4:'
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append("NA4-"+str(alleleI))
							
						elif int(len(alleleStr))==int(geneLen)-1 : # se o match tiver uma base a mais que o alelo
							tagAux = 'NA5:'
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append("NA5-"+str(alleleI))
							
						else:												#se o match tiver uma base a menos que o alelo
							tagAux = 'NA6:'
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append("NA6-"+str(alleleI))
							
								
						print "New allele found! Adding allele "+ tagAux + str(alleleI) +" to the database"
						geneDict[alleleStr] = alleleI
							
						resultsList.append( tagAux + str(alleleI) )
							
						orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile)))									
						# --- add the new allele to the gene fasta --- #
							
						fG = open( geneFile, 'a' )
						fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile)) + '\n')
						fG.write( alleleStr + '\n')
						fG.close()
						alleleI += 1
						Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 )
					
							
					elif isUndefined:

							####################
							# UNDEFINED ALLELE #		# it is contained in another allele
							####################
							
						alleleStr=match.query
						#if match.sbjct_start > match.sbjct_end:    #### - error
							#alleleStr = reverseComplement(alleleStr)
						resultsList.append('UND:-1')
						perfectMatchIdAllele.append("undefined allele")
						printinfo(genomeFile,geneFile) 
						print "Undefined allele \n"
						
						geneFile2= os.path.splitext(geneFile)[0] + "undefined.fasta"
						print geneFile2
						with open(geneFile2, 'a') as f:
							f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
							f.write((alleleStr) +"\n")
							#f.write(">BlastBestMatch"+str(defAlleleName)+"\n")
							#f.write((alleleStr)+"\n")
							f.write(">Allele"+str(defAlleleName)+"\n")
							f.write((defAllele)+"\n")
						
					
						
						
						
					else:
						if not extended :
							
								
							if lenRatio < 0.5:
							
									###############
									# SMALL MATCH #
									###############
									
								resultsList.append('SAC:-1')		# don't know what 'SAC' stands for
								perfectMatchIdAllele.append('small match')
								printinfo(genomeFile,geneFile) 
								print "lower than 50% match \n"	
								
							elif lenRatio < 0.8 and idPercent < 0.5:

								#####################
								# INCOMPLETE ALLELE #		# it was not possible to extend it to at least 80% of the length of the gene
								#####################

								resultsList.append('INC:-1')
								perfectMatchIdAllele.append('allele incomplete')
								printinfo(genomeFile,geneFile)
								print "Incomplete allele\n"
							
							else:	
								##################
								# LNF WTFFF #
								##################
								geneFile2= os.path.splitext(geneFile)[0] + "LNF2.fasta"
								print geneFile2
								with open(geneFile2, 'a') as f:
									f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
									f.write((alleleStr) +"\n")
									f.write(">Allele\n")
									f.write((bmAllele)+"\n")
								resultsList.append('LNF2')
								printinfo(genomeFile,geneFile) 
								perfectMatchIdAllele.append("LNF2")
								print "Not extended and no allele found"

						else:

								#######################
								# ADD INFERRED ALLELE #		# a new allele that was extended with prodigal
								#######################
							if(CDSType=='larger than match'):
								tagAux = 'INF1:'
							elif(CDSType=='start codon inside match'):
								tagAux = 'INF2:'
							elif(CDSType=='early stop codon in match'):
								tagAux = 'INF3:'
							elif(CDSType=='same size as allele'):
								tagAux = 'INF4:'
							else:
								tagAux = 'INF5:'
								
							print "infered allele has location : "+(CDSType)
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append( tagAux +"-"+str(alleleI))
							print "New allele Infered with prodigal! Adding allele "+ tagAux + str(alleleI) +" to the database\n"
							
							
								
							geneDict[alleleStr] = alleleI
								
							resultsList.append( tagAux + str(alleleI) )
								
							orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile)))	
							# --- add the new allele to the gene fasta --- #

							fG = open( geneFile, 'a' )
							fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile)) + '\n')
							#print alleleStr
							fG.write( alleleStr + '\n')
							fG.close()
							alleleI += 1
							

							# --- remake blast DB --- #
							
							Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 )
							
	final =	(resultsList,perfectMatchIdAllele)	
	#return (resultsList)
	return final
def _translate_unique_proteins(genes, out_handle=None, min_protein_len=67):
    """Translate every record of a FASTA file, keeping one entry per distinct protein.

    Each record's sequence is passed to ``translateSeq`` (only the first
    element of the returned triple is used here).  Records whose translation
    has a length of 1 or less get their name printed and are otherwise
    ignored.  A translation already seen is counted as repeated; a new
    translation shorter than ``min_protein_len`` is counted as small; every
    other translation is retained.

    Args:
        genes: path of the FASTA file to read with ``HTSeq.FastaReader``.
        out_handle: optional writable file object; when given, each retained
            protein is appended to it as a two-line FASTA entry.
        min_protein_len: minimum translation length for a protein to be kept.

    Returns:
        A tuple ``(protDict, geneDict, totalgenes, repeatedgenes, smallgenes)``
        where ``protDict`` maps ``">name\\n"`` headers to protein strings and
        ``geneDict`` maps record names to the original sequences.
    """
    protDict = {}
    geneDict = {}
    seen = set()  # set instead of list: O(1) duplicate lookup
    totalgenes = 0
    repeatedgenes = 0
    smallgenes = 0

    for gene in HTSeq.FastaReader(genes):
        # Counted per record; the counter used to be bumped once outside the
        # loop, so the summaries always said "out of 1".
        totalgenes += 1
        protseq, _, _ = translateSeq(str(gene.seq))
        if len(protseq) <= 1:
            print(gene.name)
            continue

        protein = str(protseq)
        if protein in seen:
            repeatedgenes += 1
        elif len(protein) < min_protein_len:
            smallgenes += 1
        else:
            seen.add(protein)
            header = ">" + str(gene.name) + "\n"
            if out_handle is not None:
                out_handle.write(header + protein + "\n")
            protDict[header] = protein
            geneDict[str(gene.name)] = gene.seq

    return protDict, geneDict, totalgenes, repeatedgenes, smallgenes


def _screen_blast_record(blast_record, alleleLength, geneDict, genesToKeep, toRemove, log):
    """Update the keep/remove bookkeeping with the hits of one BLAST record.

    Args:
        blast_record: parsed BLAST record (uses ``.query`` and ``.alignments``;
            each alignment uses ``.hit_def`` and ``.hsps[0].score``).
        alleleLength: length of the query's sequence in ``geneDict``.
        geneDict: record name -> sequence, used to compare lengths.
        genesToKeep: list of names to keep, mutated in place.
        toRemove: set of names to discard, mutated in place.
        log: list of tab-separated log lines, appended to in place.

    Raises:
        KeyError: a hit name is missing from ``geneDict``.
        IndexError: an alignment carries no HSP.
        ZeroDivisionError: the self-hit score is 0.
    """
    query = str(blast_record.query)
    alignments = blast_record.alignments

    if query in toRemove:
        # NOTE(review): the original code for this case also contained a loop
        # meant to push similar hits onto the remove list, but it could never
        # execute (either a bare `raise` fired first, or the reference score
        # was still 0 and the division failed).  Only the effective behaviour
        # -- the message for the first non-self hit -- is reproduced here;
        # confirm the intended logic before extending it.
        for align in alignments:
            if str(align.hit_def) != query:
                print("gene " + str(align.hit_def) + " is bigger than gene " + query)
                break
        return

    genesToKeep.append(query)
    if not alignments:
        # Nothing to compare against: the query simply stays on the keep list.
        return

    best_hit = str(alignments[0].hit_def)
    if best_hit != query:
        # The top hit is another gene: drop the query in favour of that gene.
        genesToKeep.remove(query)
        toRemove.add(query)
        log.append(query + "\t" + best_hit + "\t" + "2 is first best match")
        if best_hit not in toRemove:
            genesToKeep.append(best_hit)
        return

    selfblastscore = alignments[0].hsps[0].score
    for align in alignments:
        hit = str(align.hit_def)
        scoreRatio = float(align.hsps[0].score) / float(selfblastscore)
        alleleLength2 = len(geneDict[hit])

        if not (scoreRatio > 0.6 and hit != query and hit not in toRemove):
            continue

        if alleleLength2 > alleleLength:
            # A longer, similar gene wins; the query is done.
            genesToKeep.append(hit)
            genesToKeep.remove(query)
            toRemove.add(query)
            log.append(query + "\t" + hit + "\t" + "2 is bigger and bsr >0.6")
            return
        if hit in genesToKeep:
            # The hit is not longer than the query: discard the hit instead.
            genesToKeep.remove(hit)
            toRemove.add(hit)
            log.append(hit + "\t" + query + "\t" + "2 is bigger and bsr >0.6")


def main():
    """Filter an ffn file down to distinct, sufficiently long genes.

    Command line: ``-i <ffn file> -g <int minimum size>``.

    Steps, all visible in this function and its two helpers:
      1. Translate every gene and write the distinct proteins of at least 67
         residues to ``proteins.fasta`` next to the input file.
      2. Drop proteins that are a substring of a longer retained protein and
         rewrite ``proteins.fasta`` with the survivors.
      3. BLAST the protein file against itself and, per record, decide which
         of each group of similar genes (score ratio > 0.6) is kept.
      4. Write ``logfile.txt`` (current directory), one ``<name>.fasta`` per
         kept gene longer than the ``-g`` threshold, and ``concatenated.fasta``
         (both next to the input file).
    """
    parser = argparse.ArgumentParser(description="Given an ffn file, recovers the genes that are not paralogs and have a size bigger than the g parameter provided")
    parser.add_argument('-i', nargs='?', type=str, help='ffn file', required=True)
    parser.add_argument('-g', nargs='?', type=int, help='int minimum size', required=True)

    args = parser.parse_args()
    genes = args.i
    sizethresh = args.g
    # Manual switch: set to True to reuse an existing proteins.fasta.
    passSteps = False

    # dirname() instead of str.replace(filename, ''), which would also rewrite
    # any directory component that happens to contain the file name.
    proteinfile = os.path.join(os.path.dirname(os.path.abspath(genes)), 'proteins.fasta')

    # Defined up front so the final summary also works when passSteps is True.
    g = 0

    if not passSteps:
        print("not passing steps")
        with open(proteinfile, "wb") as f:
            protDict, geneDict, totalgenes, repeatedgenes, smallgenes = _translate_unique_proteins(genes, f)

        print(str(repeatedgenes) + " repeated genes out of " + str(totalgenes))
        print(str(smallgenes) + " small genes out of " + str(totalgenes))
        print("protein file created")

        # first step - remove genes contained in other genes or 100% equal genes
        print("Checking if proteins are equal or substring of others...")

        # Longest first, so a protein can only be contained in one already kept.
        orderedprots = sorted(protDict.items(), key=lambda item: len(item[1]), reverse=True)

        auxDict = {}   # protein -> header of the retained entries
        auxprot = []   # retained proteins, in the order they were accepted
        for j, (header, prot) in enumerate(orderedprots):
            if any(prot in kept for kept in auxprot):
                g += 1
            else:
                auxDict[prot] = header
                auxprot.append(prot)
            print(str(j) + " out of " + str(len(orderedprots)))
        print("%s genes are contained in other genes" % (g))

        # overwrite the original file, obtaining a new file with unique genes
        with open(proteinfile, "wb") as f:
            f.write("".join(header + prot + "\n" for prot, header in auxDict.items()))
    else:
        # Only the name -> sequence map is needed when the protein file is reused.
        _, geneDict, _, _, _ = _translate_unique_proteins(genes)

    geneFile = os.path.abspath(proteinfile)
    print(proteinfile)
    Gene_Blast_DB_name = Create_Blastdb(geneFile, 1, True)

    blast_out_file = os.path.splitext(geneFile)[0] + '.xml'

    # ------------------------------ RUNNING BLAST ------------------------------ #
    cline = NcbiblastpCommandline(query=geneFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
    blast_records = runBlastParser(cline, blast_out_file, geneFile)

    toRemove = set()
    genesToKeep = []
    log = ["removed\tcause\texplanation"]
    for blast_record in blast_records:
        allelename = blast_record.query.split(" ")[0]
        alleleLength = len(geneDict[allelename])
        try:
            _screen_blast_record(blast_record, alleleLength, geneDict, genesToKeep, toRemove, log)
        except (KeyError, IndexError, ZeroDivisionError) as err:
            # Best effort, as before: a malformed record ends its own
            # processing only, but it is now reported instead of hidden.
            print("skipping rest of BLAST record " + str(blast_record.query) + ": " + repr(err))

    with open("logfile.txt", "wb") as f:
        f.writelines(str(elem) + "\n" for elem in log)

    keep = set(genesToKeep)
    notcommonToKeep = keep - toRemove
    print(len(toRemove))
    print(len(keep))
    print(len(notcommonToKeep))

    pathfiles = os.path.dirname(geneFile)

    removedparalogs = 0
    removedsize = 0
    totalgenes = 0
    rest = 0
    concatenated = []
    for contig in HTSeq.FastaReader(genes):
        totalgenes += 1

        if contig.name not in notcommonToKeep:
            removedparalogs += 1
        elif int(len(contig.seq)) > sizethresh:
            namefile = contig.name.replace("|", "_")
            with open(os.path.join(pathfiles, namefile + ".fasta"), "wb") as f:
                f.write(">1\n" + contig.seq + "\n")
            rest += 1
            concatenated.append(">" + namefile + "\n" + contig.seq + "\n")
        else:
            removedsize += 1

    print("%s genes are contained in other genes" % (g))
    print("Removed %s same Locus genes" % str(removedparalogs))
    print("Removed %s because of size " % str(removedsize))
    print("%s Scheme genes " % str(rest))
    print("total genes:" + str(totalgenes))

    with open(os.path.join(pathfiles, "concatenated.fasta"), "wb") as f:
        f.write("".join(concatenated))
# Example #18
# 0
def main():
    print("Starting script at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    try:
        input_file = sys.argv[1]
        temppath = sys.argv[2]
    except IndexError:
        print "usage: list_pickle_obj"

    argumentList = []
    with open(input_file, 'rb') as f:
        argumentList = pickle.load(f)

    geneFile = argumentList[0]
    genomesList = argumentList[1]

    basepath = os.path.join(temppath, os.path.splitext(geneFile)[0])
    if not os.path.exists(basepath):
        os.makedirs(basepath)

    gene_fp = HTSeq.FastaReader(geneFile)
    alleleI = 0

    resultsList = []
    i = 0
    perfectMatchIdAllele = []
    perfectMatchIdAllele2 = []
    allelescores = []

    print("Getting BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    geneScorePickle = os.path.abspath(geneFile) + '_bsr.txt'

    #check if bsr as arealdy been calculated and recalculate it

    if os.path.isfile(geneScorePickle):

        alleleI, allelescores, alleleList = getBlastScoreRatios(
            geneFile, basepath, False)

    else:
        alleleI, allelescores, alleleList = getBlastScoreRatios(
            geneFile, basepath, True)

    print("Finished BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    genome = -1

    genomeDict = {}
    print("starting allele call at: " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    for genomeFile in genomesList:
        print genomeFile
        bestmatch = [
            0, 0, False, '', 0
        ]  #score, score ratio, perfectmatch, key name of the DNA sequence string, allele ID
        currentGenomeDict = {}
        currentCDSDict = {}

        # load the translated CDS from the genome to a dictionary
        filepath = os.path.join(
            temppath,
            str(os.path.basename(genomeFile)) + "_ORF_Protein.txt")
        with open(filepath, 'rb') as f:
            currentCDSDict = pickle.load(f)

        #load the contig info of the genome to a dictionary
        g_fp = HTSeq.FastaReader(genomeFile)
        for contig in g_fp:
            sequence = str(contig.seq)
            genomeDict[contig.name] = sequence

        currentGenomeDict = genomeDict

        genome += 1
        listOfCDS = currentCDSDict
        genomeProteinfastaPath = os.path.join(
            temppath, str(os.path.basename(genomeFile) + '_Protein.fasta'))

        print("Blasting alleles on genome at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))

        blast_out_file = os.path.join(
            basepath, "blastdbs/" + os.path.basename(geneFile) + '_List.xml')

        Gene_Blast_DB_name = os.path.join(
            temppath,
            str(os.path.basename(genomeFile)) + "/" +
            str(os.path.basename(genomeFile)) + "_db")

        proteinfastaPath = os.path.join(
            basepath, str(os.path.basename(geneFile) + '_protein.fasta'))

        #blast the genome CDS against the translated locus
        cline = NcbiblastpCommandline(query=proteinfastaPath,
                                      db=Gene_Blast_DB_name,
                                      evalue=0.001,
                                      out=blast_out_file,
                                      outfmt=5)

        blast_records = runBlastParser(cline, blast_out_file, proteinfastaPath)
        print("Blasted alleles on genome at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))

        alleleSizes = []
        for allele in alleleList:
            alleleSizes.append(len(allele))

        biggestSizeAllele = 0

        moda = max(set(alleleSizes), key=alleleSizes.count)
        contador = Counter(alleleSizes).most_common()

        if (contador[0])[1] == 1:
            moda = alleleSizes[0]

        try:

            # iterate through the blast results
            for blast_record in blast_records:

                locationcontigs = []

                for alignment in blast_record.alignments:

                    # select the best match
                    for match in alignment.hsps:

                        alleleMatchid = str(
                            blast_record.query_id).split("_")[1]

                        scoreRatio = float(match.score) / float(
                            allelescores[int(alleleMatchid) - 1])

                        cdsStrName = ((alignment.title).split(" "))[1]

                        DNAstr = listOfCDS[">" + cdsStrName]

                        AlleleDNAstr = alleleList[int(alleleMatchid) - 1]
                        if len(AlleleDNAstr) > biggestSizeAllele:
                            biggestSizeAllele = len(AlleleDNAstr)

                        compare = False

                        #compare the DNA match and the allele DNA sequence (protein sequences may be equal and DNA different)
                        if DNAstr == AlleleDNAstr is False:
                            try:
                                DNAstr = reverseComplement(DNAstr)
                                if DNAstr == AlleleDNAstr is False:
                                    pass
                                else:
                                    compare = True
                            except:
                                pass
                        else:
                            compare = True

                        if scoreRatio > 0.6:
                            locationcontigs.append(cdsStrName)

                        if "N" in DNAstr or "K" in DNAstr or "R" in DNAstr:
                            pass

                        elif (scoreRatio == 1 and bestmatch[2] is False
                              and compare is True):
                            bestmatch = [
                                match.score, scoreRatio, True, cdsStrName,
                                int(alleleMatchid), match,
                                len(AlleleDNAstr)
                            ]

                        elif (scoreRatio == 1 and match.score > bestmatch[0]
                              and compare is True):
                            bestmatch = [
                                match.score, scoreRatio, True, cdsStrName,
                                int(alleleMatchid), match,
                                len(AlleleDNAstr)
                            ]

                        elif (scoreRatio == 1 and bestmatch[2] is False
                              and compare is False):
                            bestmatch = [
                                match.score, scoreRatio, False, cdsStrName,
                                int(alleleMatchid), match,
                                len(AlleleDNAstr)
                            ]

                        elif (scoreRatio == 1 and match.score > bestmatch[0]
                              and compare is False):
                            bestmatch = [
                                match.score, scoreRatio, False, cdsStrName,
                                int(alleleMatchid), match,
                                len(AlleleDNAstr)
                            ]

                        elif (match.score > bestmatch[0] and scoreRatio > 0.6
                              and scoreRatio > bestmatch[1]
                              and bestmatch[2] is False):
                            bestmatch = [
                                match.score, scoreRatio, False, cdsStrName,
                                int(alleleMatchid), match,
                                len(AlleleDNAstr)
                            ]

            print("Classifying the match at : " +
                  time.strftime("%H:%M:%S-%d/%m/%Y"))

            #if no best match was found it's a Locus Not Found
            if bestmatch[
                    0] == 0 or "N" in AlleleDNAstr or "K" in AlleleDNAstr or "R" in AlleleDNAstr:

                ###################
                # LOCUS NOT FOUND #
                ###################
                if bestmatch[0] == 0:
                    resultsList.append('LNF3:-1')
                    perfectMatchIdAllele.append('LNF')
                    perfectMatchIdAllele2.append('LNF')
                    print "Locus not found, no matches \n"
                else:
                    resultsList.append('LNFN:-1')
                    perfectMatchIdAllele.append('LNF')
                    perfectMatchIdAllele2.append('LNF')
                    print "Locus has strange base (N, K or R) \n"

            #if more than one BSR >0.6 in two different CDSs it's a Non Paralog Locus
            elif len(list(set(locationcontigs))) > 1:
                resultsList.append('NIPL')
                perfectMatchIdAllele.append('NIPL')
                perfectMatchIdAllele2.append('NIPL')
                for elem in locationcontigs:
                    print elem

            #in case the DNA match sequence equal to the DNA sequence of the comparing allele
            elif bestmatch[2] is True:
                contigname = bestmatch[3]

                contigname = contigname.split("&")
                matchLocation = contigname[2]
                contigname = contigname[0]
                print contigname
                alleleStr = listOfCDS[">" + bestmatch[3]]
                protSeq, alleleStr, Reversed = translateSeq(alleleStr)

                #check for possible locus on tip
                match = bestmatch[5]
                matchLocation2 = matchLocation.split("-")
                seq = currentGenomeDict[contigname]
                bestMatchContigLen = len(seq)

                rightmatchContig = bestMatchContigLen - int(matchLocation2[1])
                leftmatchContig = int(matchLocation2[0])

                if Reversed:
                    aux = rightmatchContig
                    rightmatchContig = leftmatchContig
                    leftmatchContig = aux

                # get extra space to the right and left between the allele and match

                possibleExtra = int(moda) - ((int(match.query_end) * 3) -
                                             (int(match.query_start) * 3))

                if possibleExtra < 0:
                    perfectMatchIdAllele.append(str(bestmatch[4]))
                    if not Reversed:
                        perfectMatchIdAllele2.append(
                            str(contigname) + "&" + str(matchLocation) + "&" +
                            "+")
                    else:
                        perfectMatchIdAllele2.append(
                            str(contigname) + "&" + str(matchLocation) + "&" +
                            "-")
                    resultsList.append('EXC:' + str(bestmatch[4]))

                else:
                    rightmatchAllele = possibleExtra
                    leftmatchAllele = possibleExtra

                    if leftmatchContig < leftmatchAllele and rightmatchContig < rightmatchAllele:

                        resultsList.append('PLOTSC:-1')
                        perfectMatchIdAllele.append('PLOTSC')
                        perfectMatchIdAllele2.append('PLOTSC')

                        print match
                        print "contig extras (l,r)"
                        print leftmatchContig, rightmatchContig
                        print "allele extras (l,r)"
                        print leftmatchAllele, rightmatchAllele

                        print "Locus is possibly bigger than the contig \n"

                    elif leftmatchContig < leftmatchAllele:

                        resultsList.append('PLOT3:-1')
                        perfectMatchIdAllele.append('PLOT3')
                        perfectMatchIdAllele2.append('PLOT3')

                        print match
                        print "contig extras (l,r)"
                        print leftmatchContig, rightmatchContig
                        print "allele extras (l,r)"
                        print leftmatchAllele, rightmatchAllele

                        print "Locus is possibly on the 3' tip of the contig \n"

                    elif rightmatchContig < rightmatchAllele:

                        resultsList.append('PLOT5:-1')
                        perfectMatchIdAllele.append('PLOT5')
                        perfectMatchIdAllele2.append('PLOT5')

                        print match
                        print "contig extras (l,r)"
                        print leftmatchContig, rightmatchContig
                        print "allele extras (l,r)"
                        print leftmatchAllele, rightmatchAllele

                        print "Locus is possibly on the 5' tip of the contig \n"

                    else:
                        #if a perfect match was found

                        ################################################
                        # EXACT MATCH --- MATCH == GENE --- GENE FOUND #
                        ################################################

                        perfectMatchIdAllele.append(str(bestmatch[4]))
                        if not Reversed:
                            perfectMatchIdAllele2.append(
                                str(contigname) + "&" + str(matchLocation) +
                                "&" + "+")
                        else:
                            perfectMatchIdAllele2.append(
                                str(contigname) + "&" + str(matchLocation) +
                                "&" + "-")
                        resultsList.append('EXC:' + str(bestmatch[4]))

            # if match with BSR >0.6 and not equal DNA sequences
            else:

                match = bestmatch[5]
                geneLen = bestmatch[6]

                contigname = bestmatch[3]

                contigname = contigname.split("&")
                matchLocation = contigname[2]
                matchLocation = matchLocation.split("-")
                contigname = contigname[0]

                seq = currentGenomeDict[contigname]
                bestMatchContigLen = len(seq)

                alleleStr = listOfCDS[">" + bestmatch[3]]
                protSeq, alleleStr, Reversed = translateSeq(alleleStr)

                rightmatchContig = bestMatchContigLen - int(matchLocation[1])
                leftmatchContig = int(matchLocation[0])

                if Reversed:
                    aux = rightmatchContig
                    rightmatchContig = leftmatchContig
                    leftmatchContig = aux

                print rightmatchContig, leftmatchContig

                # get extra space to the right and left between the allele and match and check if it's still inside the contig

                rightmatchAllele = geneLen - ((int(match.query_end) + 1) * 3)
                leftmatchAllele = ((int(match.query_start) - 1) * 3)

                ###########################
                # LOCUS ON THE CONTIG TIP #
                ###########################

                if leftmatchContig < leftmatchAllele and rightmatchContig < rightmatchAllele:

                    resultsList.append('LOTSC:-1')
                    perfectMatchIdAllele.append('LOTSC')
                    perfectMatchIdAllele2.append('LOTSC')
                    print match
                    print contigname
                    print geneFile
                    print leftmatchAllele, rightmatchAllele
                    print "Locus is bigger than the contig \n"

                elif leftmatchContig < leftmatchAllele:

                    resultsList.append('LOT3:-1')
                    perfectMatchIdAllele.append('LOT3')
                    perfectMatchIdAllele2.append('LOT3')
                    print match
                    print contigname
                    print geneFile
                    print leftmatchAllele, rightmatchAllele
                    print "Locus is on the 3' tip of the contig \n"

                elif rightmatchContig < rightmatchAllele:

                    resultsList.append('LOT5:-1')
                    perfectMatchIdAllele.append('LOT5')
                    perfectMatchIdAllele2.append('LOT5')
                    print match
                    print contigname
                    print geneFile
                    print leftmatchAllele, rightmatchAllele
                    print "Locus is on the 5' tip of the contig \n"

                elif len(alleleStr) > moda + (moda * 0.2):

                    print moda
                    print alleleStr
                    resultsList.append('ALM')
                    perfectMatchIdAllele.append('ALM')
                    perfectMatchIdAllele2.append('ALM')

                elif len(alleleStr) < moda - (moda * 0.2):

                    print moda
                    print alleleStr
                    resultsList.append('ASM')
                    perfectMatchIdAllele.append('ASM')
                    perfectMatchIdAllele2.append('ASM')

                else:
                    #######################
                    # ADD INFERRED ALLELE #		# a new allele
                    #######################

                    tagAux = 'INF'
                    perfectMatchIdAllele.append(tagAux + "-" +
                                                str(alleleI + 1))

                    if not Reversed:
                        perfectMatchIdAllele2.append(
                            str(contigname) + "&" + str(matchLocation[0]) +
                            "-" + str(matchLocation[1]) + "&" + "+")
                    else:
                        perfectMatchIdAllele2.append(
                            str(contigname) + "&" + str(matchLocation[0]) +
                            "-" + str(matchLocation[1]) + "&" + "-")

                    print "New allele! Adding allele " + tagAux + str(
                        alleleI + 1) + " to the database\n"

                    resultsList.append(tagAux + str(alleleI + 1))

                    # --- add the new allele to the gene fasta --- #

                    appendAllele = '>allele_' + str(
                        alleleI + 1) + '_' + tagAux[:-1] + "_" + str(
                            os.path.basename(genomesList[genome])) + '\n'
                    fG = open(geneFile, 'a')
                    fG.write(appendAllele)

                    fG.write(alleleStr + '\n')
                    fG.close()

                    fG = open(
                        os.path.join(
                            basepath,
                            str(
                                os.path.basename(geneFile) +
                                '_protein2.fasta')), 'w')
                    fG.write('>' + str(alleleI + 1) + '\n' + str(protSeq) +
                             '\n')
                    fG.close()
                    fG = open(
                        os.path.join(
                            basepath,
                            str(os.path.basename(geneFile) +
                                '_protein.fasta')), 'a')
                    fG.write('>' + str(alleleI + 1) + '\n' + str(protSeq) +
                             '\n')
                    fG.close()

                    match = bestmatch[5]

                    # --- remake blast DB and recalculate the BSR for the locus --- #
                    alleleList.append(alleleStr)
                    print os.path.join(
                        basepath,
                        str(os.path.basename(geneFile) + '_protein.fasta'))
                    genefile2 = os.path.join(
                        basepath,
                        str(os.path.basename(geneFile) + '_protein2.fasta'))
                    Gene_Blast_DB_name2 = Create_Blastdb(genefile2, 1, True)
                    print("Re-calculating BSR at : " +
                          time.strftime("%H:%M:%S-%d/%m/%Y"))
                    alleleI, allelescores, alleleList = reDogetBlastScoreRatios(
                        genefile2, basepath, alleleI, allelescores,
                        Gene_Blast_DB_name2, alleleList, geneScorePickle)
                    print "allele id " + str(alleleI)
                    print("Done Re-calculating BSR at : " +
                          time.strftime("%H:%M:%S-%d/%m/%Y"))

        except Exception as e:
            print "some error occurred"
            print e
            print 'Error on line {}'.format(sys.exc_info()[-1].tb_lineno)
            perfectMatchIdAllele2.append("ERROR")
            perfectMatchIdAllele.append("ERROR")
            resultsList.append('ERROR')

    final = (resultsList, perfectMatchIdAllele)
    print("Finished allele calling at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    filepath = os.path.join(temppath,
                            os.path.basename(geneFile) + "_result.txt")
    filepath2 = os.path.join(temppath,
                             os.path.basename(geneFile) + "_result2.txt")
    with open(filepath, 'wb') as f:
        pickle.dump(final, f)
    with open(filepath2, 'wb') as f:
        pickle.dump(perfectMatchIdAllele2, f)
    shutil.rmtree(basepath)
    return True
Example #19
0
def getBlastScoreRatios(orgName,allelescores,cdsDict,prodigalPath):
    """Blast the organism proteome and keep, per query, the hits with BSR > 0.6.

    The proteome FASTA is created (``CreateProteome`` or
    ``CreateProteomeContig``), blasted with blastp (XML output, ``outfmt=5``)
    and removed again.  For every BLAST record only the first HSP of each
    alignment is scored: its score divided by
    ``allelescores[str(alignment.hit_def)]`` is the BLAST score ratio (BSR).

    NOTE(review): ``isContig``, ``nameOrg``, ``pathRef``, ``name``,
    ``blast_out_file`` and ``genomeDB`` are not defined in this function and
    are presumably module-level globals.  In particular ``nameOrg`` is used
    where the ``orgName`` parameter looks intended -- confirm against the
    callers before renaming anything.

    Args:
        orgName: not used by the body (see the note above).
        allelescores: mapping from hit definition (``str``) to the score used
            as BSR denominator.
        cdsDict: forwarded to ``CreateProteomeContig``.
        prodigalPath: path of the Prodigal results file; it is only opened to
            make sure it is readable, its content is not used.

    Returns:
        ``(bestMatches, queryCDS)``.  ``bestMatches`` maps the 1-based record
        number (as ``str``) to ``[hit names, alignment lengths - 1, query
        starts, aligned query lengths, query title]`` with every value stored
        as a string.  ``queryCDS`` is whatever ``CreateProteomeContig``
        returned, or ``None`` when ``isContig == "no"`` (this used to raise
        ``NameError`` at the ``return``).
    """
    # The file content was read but never used; keep the open so an
    # unreadable path still fails here, but close the handle instead of
    # leaking it.
    with open(prodigalPath, 'r'):
        pass

    alleleProt=''
    queryCDS=None

    if isContig=="no":
        CreateProteome(nameOrg)
    else:
        queryCDS = CreateProteomeContig(nameOrg,cdsDict)

    proteomePath=pathRef+nameOrg+'proteome.fasta'
    cline = NcbiblastpCommandline(query=proteomePath, db=name, out=blast_out_file, outfmt=5, num_alignments=7000, num_descriptions=7000)
    blast_records = runBlastParser(cline,blast_out_file, alleleProt)

    # the proteome is only needed as BLAST query
    os.remove(proteomePath)

    countRecords=0
    bestMatches={}
    for blast_record in blast_records:
        countRecords+=1
        # parallel lists: entry k of each list describes the same hit
        BestMatchResults=[]
        length=[]
        alignment_posStart=[]
        query_length=[]
        for alignment in blast_record.alignments:
            for match in alignment.hsps:
                blastScoreRatio = float(match.score) / float(allelescores[str(alignment.hit_def)])
                titleFields=alignment.title.split("|")
                try:
                    geneName=titleFields[5]
                except IndexError:
                    geneName=titleFields[2]
                if blastScoreRatio>0.6:
                    hitName=genomeDB+"..."+str(geneName).strip()
                    # Compare the same key that is stored: the old test
                    # looked for the bare gene name among prefixed entries
                    # and therefore never rejected a duplicate.
                    if hitName not in BestMatchResults:
                        BestMatchResults.append(hitName)
                        length.append(str(match.align_length-1))
                        alignment_posStart.append(str(match.query_start))
                        query_length.append(str(len(match.query)))
                # only the first HSP of every alignment is considered
                break
        bestMatches[str(countRecords)] = [BestMatchResults,length,alignment_posStart,query_length,str(blast_record.query)]
    print(countRecords)

    return bestMatches,queryCDS
Example #20
0
def getBlastScoreRatios(genefile, basepath, doAll):
    """Translate every allele of a locus and collect the self-BLAST scores.

    Each allele of ``genefile`` that ``translateSeq`` can translate is written
    on its own to ``<basepath>/<gene>_protein2.fasta`` and turned into a BLAST
    database.  With ``doAll`` true the protein is blasted against that
    database, the score of every HSP is appended to the score list and the
    list is pickled (together with the current allele number) to
    ``<genefile>_bsr.txt``; otherwise the score list is read back from that
    pickle.  All translated alleles are finally written to
    ``<basepath>/<gene>_protein.fasta``.

    Returns:
        ``(number of alleles read, list of scores, list of allele DNA
        sequences)``.
    """
    reader = HTSeq.FastaReader(genefile)
    gene_name = os.path.basename(genefile)
    single_fasta = os.path.join(basepath, str(gene_name + '_protein2.fasta'))
    score_pickle = os.path.abspath(genefile) + '_bsr.txt'

    alleleI = 0
    allelescores = []
    alleleList = []
    protein_records = []

    for alleleI, allele in enumerate(reader, 1):
        alleleList.append(allele.seq)
        protein, _dna, _flag = translateSeq(allele.seq)
        if protein == '':
            # untranslatable alleles keep their number but get no protein
            # record and no score
            continue

        record = ">" + str(alleleI) + "\n" + str(protein + "\n")
        protein_records.append(record)

        # The single-allele FASTA is overwritten on every pass, so the
        # database built from it holds only the current allele.
        with open(single_fasta, "wb") as handle:
            handle.write(record)
        db_name = Create_Blastdb(single_fasta, 1, True)

        if not doAll:
            # scores were computed on an earlier run: reload them
            with open(score_pickle, 'rb') as handle:
                allelescores = pickle.load(handle)[1]
            continue

        xml_out = os.path.join(basepath, 'blastdbs/temp.xml')
        print("Starting Blast alleles at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))

        # --- get BLAST score ratio --- #
        cline = NcbiblastpCommandline(query=single_fasta,
                                      db=db_name,
                                      evalue=0.001,
                                      out=xml_out,
                                      outfmt=5)
        blast_records = runBlastParser(cline, xml_out, record)

        print("Blasted alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

        allelescores.extend(
            int(hsp.score)
            for blast_record in blast_records
            for alignment in blast_record.alignments
            for hsp in alignment.hsps)

        print("________")
        with open(score_pickle, 'wb') as handle:
            pickle.dump([alleleI, allelescores], handle)

    all_fasta = os.path.join(basepath, str(gene_name + '_protein.fasta'))
    with open(all_fasta, "wb") as handle:
        handle.write("".join(protein_records))

    return int(alleleI), allelescores, alleleList
def getBlastScoreRatios(genefile,basepath,doAll):
	"""Collect the self-BLAST score of every translatable allele of a locus.

	For each allele in ``genefile`` the DNA sequence is stored and translated
	with ``translateSeq``.  A translatable allele is written alone to
	``<basepath>/<gene>_protein2.fasta`` and a BLAST database is created from
	that file.  When ``doAll`` is true the protein is blasted against its own
	database, all HSP scores are added to the score list and the list is
	pickled to ``<genefile>_bsr.txt``; when it is false the score list is
	loaded from that pickle instead.  The proteins of all translatable
	alleles end up in ``<basepath>/<gene>_protein.fasta``.

	Returns ``(allele count, score list, allele DNA list)``.
	"""
	fastaReader = HTSeq.FastaReader(genefile)
	picklePath = os.path.abspath(genefile)+'_bsr.txt'
	onePath = os.path.join(basepath,str(os.path.basename(genefile)+'_protein2.fasta'))
	allPath = os.path.join(basepath,str(os.path.basename(genefile)+'_protein.fasta'))

	alleleI = 0
	allelescores = []
	alleleList = []
	proteinChunks = []

	for alleleI, allele in enumerate(fastaReader, 1):
		alleleList.append(allele.seq)
		protein, _seq, _rev = translateSeq(allele.seq)

		if protein != '':
			fastaEntry = ">"+str(alleleI)+"\n"+str(protein+"\n")
			proteinChunks.append(fastaEntry)

			# one database per allele: the file is rewritten each time
			with open(onePath, "wb") as out:
				out.write(fastaEntry)
			dbName = Create_Blastdb( onePath, 1, True )

			if doAll:
				blastXml = os.path.join(basepath,'blastdbs/temp.xml')
				print ("Starting Blast alleles at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

				# --- get BLAST score ratio --- #
				cline = NcbiblastpCommandline(query=onePath, db=dbName, evalue=0.001, out=blastXml, outfmt=5)
				records = runBlastParser(cline,blastXml, fastaEntry)

				print ("Blasted alleles at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

				for record in records:
					for alignment in record.alignments:
						allelescores.extend([int(hsp.score) for hsp in alignment.hsps])

				print("________")
				with open(picklePath,'wb') as out:
					pickle.dump([alleleI,allelescores], out)
			else:
				# reuse the scores stored by a previous run
				with open(picklePath,'rb') as stored:
					allelescores = pickle.load(stored)[1]

	with open(allPath, "wb") as out:
		out.write("".join(proteinChunks))

	return int(alleleI),allelescores,alleleList
def main():
	"""Call the allele of one locus in every genome of a pickled job.

	``sys.argv[1]`` is a pickle holding ``[geneFile, genomesList]`` and
	``sys.argv[2]`` the temporary directory.  The translated alleles of the
	locus are blasted (blastp) against the protein database of each genome
	and the best hit is classified.  Exactly one entry per genome is appended
	to the result lists:

	* ``LNF3:-1`` / ``LNFN:-1``: no hit / ambiguous base (N, K or R)
	* ``NIPL``: hits with score ratio > 0.6 on more than one CDS
	* ``EXC:<id>``: CDS DNA equal to a known allele
	* ``PLOTSC:-1``, ``PLOT3:-1``, ``PLOT5:-1``: exact match that may run
	  over the contig end(s)
	* ``LOTSC:-1``, ``LOT3:-1``, ``LOT5:-1``: non-exact match whose
	  unaligned allele ends do not fit in the contig
	* ``ALM`` / ``ASM``: CDS more than 20% longer / shorter than ``moda``
	* ``INF<id>``: new allele, appended to ``geneFile`` and to the protein
	  FASTA files, followed by a BSR recalculation
	* ``ERROR``: an exception was raised while processing the genome

	``(resultsList, perfectMatchIdAllele)`` is pickled to
	``<temppath>/<gene>_result.txt`` and ``perfectMatchIdAllele2`` to
	``<temppath>/<gene>_result2.txt``; the working directory ``basepath`` is
	removed afterwards.

	Returns:
		True once both result files are written.
	"""
	print ("Starting script at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	try:
		input_file = sys.argv[1]
		temppath = sys.argv[2]
	except IndexError:
		print("usage: list_pickle_obj")
		# Without both arguments nothing can be done; execution used to fall
		# through and die on the unbound ``input_file``.
		sys.exit(1)

	# NOTE(review): pickle.load runs arbitrary code contained in the file;
	# only feed this script pickles produced by the pipeline itself.
	with open(input_file,'rb') as f:
		argumentList = pickle.load(f)

	geneFile = argumentList[0]
	genomesList = argumentList[1]

	basepath=os.path.join(temppath,os.path.splitext(geneFile)[0])
	if not os.path.exists(basepath):
		os.makedirs(basepath)

	resultsList = []
	perfectMatchIdAllele=[]
	perfectMatchIdAllele2=[]

	print ("Getting BSR at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

	geneScorePickle=os.path.abspath(geneFile)+'_bsr.txt'

	# recompute the scores only when no score pickle exists for this locus
	alleleI,allelescores,alleleList=getBlastScoreRatios(geneFile,basepath,not os.path.isfile(geneScorePickle))

	print ("Finished BSR at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

	# paths that do not depend on the genome
	geneBaseName=os.path.basename(geneFile)
	proteinfastaPath=os.path.join(basepath,str(geneBaseName+'_protein.fasta'))
	proteinfastaPath2=os.path.join(basepath,str(geneBaseName+'_protein2.fasta'))
	blast_out_file = os.path.join(basepath,"blastdbs/"+geneBaseName+ '_List.xml')

	print ("starting allele call at: "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	for genomeFile in genomesList:
		print(genomeFile)
		#score, score ratio, perfectmatch, key name of the DNA sequence string, allele ID
		#(a stored hit additionally carries the HSP and the allele DNA length)
		bestmatch=[0,0,False,'',0]
		genomeBaseName=str(os.path.basename(genomeFile))

		# load the CDS dictionary of the genome (">" + CDS name -> sequence)
		with open(os.path.join(temppath,genomeBaseName+"_ORF_Protein.txt"),'rb') as f:
			listOfCDS = pickle.load(f)

		# Load the contigs of THIS genome only.  The dictionary used to be
		# shared by all genomes, so it kept growing and a contig name missing
		# from the current genome could silently resolve to another genome.
		currentGenomeDict={}
		for contig in HTSeq.FastaReader( genomeFile ):
			currentGenomeDict[ contig.name ] = str(contig.seq)

		print ("Blasting alleles on genome at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

		Gene_Blast_DB_name = os.path.join(temppath,genomeBaseName+"/"+genomeBaseName+"_db")

		#blast the translated locus against the genome CDS database
		cline = NcbiblastpCommandline(query=proteinfastaPath, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)

		blast_records = runBlastParser(cline, blast_out_file, proteinfastaPath)
		print ("Blasted alleles on genome at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

		# most frequent allele length; when every length occurs once the
		# length of the first allele is used
		alleleSizes=[len(allele) for allele in alleleList]
		moda=max(set(alleleSizes), key=alleleSizes.count)
		if Counter(alleleSizes).most_common()[0][1] ==1:
			moda= alleleSizes[0]

		try:

			# iterate through the blast results
			for blast_record in blast_records:

				locationcontigs=[]

				for alignment in blast_record.alignments:

					# select the best match
					for match in alignment.hsps:

						alleleMatchid=str(blast_record.query_id).split("_")[1]

						scoreRatio=float(match.score)/float(allelescores[int(alleleMatchid)-1])

						cdsStrName=((alignment.title).split(" "))[1]

						DNAstr=listOfCDS[">"+cdsStrName]

						AlleleDNAstr=alleleList[int(alleleMatchid)-1]

						#compare the DNA match and the allele DNA sequence (protein sequences may be equal and DNA different)
						# The previous test ``DNAstr==AlleleDNAstr is False`` is
						# a chained comparison that is always false, which made
						# every hit count as DNA-identical.
						compare=(DNAstr==AlleleDNAstr)
						if not compare:
							try:
								compare=(reverseComplement(DNAstr)==AlleleDNAstr)
							except Exception:
								# a CDS that cannot be reverse-complemented is
								# simply not identical
								compare=False

						if scoreRatio>0.6:
							locationcontigs.append(cdsStrName)

						if "N" in DNAstr or "K" in DNAstr or "R" in DNAstr:
							pass

						# a hit with score ratio 1 replaces the stored one when no
						# exact match is stored yet or when it scores higher
						elif scoreRatio == 1 and (bestmatch[2] is False or match.score>bestmatch[0]):
							bestmatch=[match.score,scoreRatio,compare,cdsStrName,int(alleleMatchid),match,len(AlleleDNAstr)]

						elif(match.score>bestmatch[0] and scoreRatio>0.6 and scoreRatio>bestmatch[1] and bestmatch[2] is False):
							bestmatch=[match.score,scoreRatio,False,cdsStrName,int(alleleMatchid),match,len(AlleleDNAstr)]

			print ("Classifying the match at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

			#if no best match was found it's a Locus Not Found
			# NOTE(review): AlleleDNAstr is the allele of the last HSP seen,
			# not necessarily the allele of the best match -- confirm intent.
			if bestmatch[0]==0 or "N" in AlleleDNAstr or "K" in AlleleDNAstr or "R" in AlleleDNAstr :

				###################
				# LOCUS NOT FOUND #
				###################
				if bestmatch[0]==0:
					resultsList.append('LNF3:-1')
					perfectMatchIdAllele.append('LNF')
					perfectMatchIdAllele2.append('LNF')
					print("Locus not found, no matches \n")
				else:
					resultsList.append('LNFN:-1')
					perfectMatchIdAllele.append('LNF')
					perfectMatchIdAllele2.append('LNF')
					print("Locus has strange base (N, K or R) \n")

			#if more than one BSR >0.6 in two different CDSs it's a Non Paralog Locus
			# (locationcontigs holds the hits of the last BLAST record only)
			elif len(set(locationcontigs))>1:
				resultsList.append('NIPL')
				perfectMatchIdAllele.append('NIPL')
				perfectMatchIdAllele2.append('NIPL')
				for elem in locationcontigs:
					print(elem)

			#in case the DNA match sequence equal to the DNA sequence of the comparing allele
			elif bestmatch[2] is True:
				# the CDS name is "<contig>&<field>&<start>-<end>"
				cdsNameParts=bestmatch[3].split("&")
				contigname=cdsNameParts[0]
				matchLocation=cdsNameParts[2]
				print(contigname)
				alleleStr=listOfCDS[">"+bestmatch[3]]
				protSeq,alleleStr,Reversed=translateSeq(alleleStr)

				#check for possible locus on tip
				match=bestmatch[5]
				matchLocation2=matchLocation.split("-")
				bestMatchContigLen=len(currentGenomeDict[ contigname ])

				rightmatchContig=bestMatchContigLen-int(matchLocation2[1])
				leftmatchContig=int(matchLocation2[0])

				if Reversed:
					leftmatchContig,rightmatchContig=rightmatchContig,leftmatchContig

				# bases the modal allele length has in excess of the aligned
				# region; the same value is used for both ends
				possibleExtra=int(moda)-((int(match.query_end)*3)-(int(match.query_start)*3))

				tipTag=None
				if possibleExtra>=0:
					if leftmatchContig<possibleExtra and rightmatchContig<possibleExtra:
						tipTag,tipMessage='PLOTSC',"Locus is possibly bigger than the contig \n"
					elif leftmatchContig<possibleExtra:
						tipTag,tipMessage='PLOT3',"Locus is possibly on the 3' tip of the contig \n"
					elif rightmatchContig<possibleExtra:
						tipTag,tipMessage='PLOT5',"Locus is possibly on the 5' tip of the contig \n"

				if tipTag is not None:
					resultsList.append(tipTag+':-1')
					perfectMatchIdAllele.append(tipTag)
					perfectMatchIdAllele2.append(tipTag)

					print(match)
					print("contig extras (l,r)")
					print(str(leftmatchContig)+" "+str(rightmatchContig))
					print("allele extras (l,r)")
					print(str(possibleExtra)+" "+str(possibleExtra))

					print(tipMessage)

				else:
					################################################
					# EXACT MATCH --- MATCH == GENE --- GENE FOUND #
					################################################

					strand="-" if Reversed else "+"
					perfectMatchIdAllele.append(str(bestmatch[4]))
					perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation)+"&"+strand)
					resultsList.append('EXC:' + str(bestmatch[4]) )

			# if match with BSR >0.6 and not equal DNA sequences
			else:

				match=bestmatch[5]
				geneLen=bestmatch[6]

				cdsNameParts=bestmatch[3].split("&")
				matchLocation=cdsNameParts[2].split("-")
				contigname=cdsNameParts[0]

				bestMatchContigLen=len(currentGenomeDict[ contigname ])

				alleleStr=listOfCDS[">"+bestmatch[3]]
				protSeq,alleleStr,Reversed=translateSeq(alleleStr)

				rightmatchContig=bestMatchContigLen-int(matchLocation[1])
				leftmatchContig=int(matchLocation[0])

				if Reversed:
					leftmatchContig,rightmatchContig=rightmatchContig,leftmatchContig

				print(str(rightmatchContig)+" "+str(leftmatchContig))

				# get extra space to the right and left between the allele and match and check if it's still inside the contig
				rightmatchAllele=geneLen-((int(match.query_end)+1)*3)
				leftmatchAllele=((int(match.query_start)-1)*3)

				###########################
				# LOCUS ON THE CONTIG TIP #
				###########################
				tipTag=None
				if leftmatchContig<leftmatchAllele and rightmatchContig < rightmatchAllele:
					tipTag,tipMessage='LOTSC',"Locus is bigger than the contig \n"
				elif leftmatchContig<leftmatchAllele:
					tipTag,tipMessage='LOT3',"Locus is on the 3' tip of the contig \n"
				elif rightmatchContig < rightmatchAllele:
					tipTag,tipMessage='LOT5',"Locus is on the 5' tip of the contig \n"

				if tipTag is not None:
					resultsList.append(tipTag+':-1')
					perfectMatchIdAllele.append(tipTag)
					perfectMatchIdAllele2.append(tipTag)
					print(match)
					print(contigname)
					print(geneFile)
					print(str(leftmatchAllele)+" "+str(rightmatchAllele))
					print(tipMessage)

				elif len(alleleStr) > moda+(moda*0.2) :

					print(moda)
					print(alleleStr)
					resultsList.append('ALM')
					perfectMatchIdAllele.append('ALM')
					perfectMatchIdAllele2.append('ALM')

				elif len(alleleStr) < moda-(moda*0.2):

					print(moda)
					print(alleleStr)
					resultsList.append('ASM')
					perfectMatchIdAllele.append('ASM')
					perfectMatchIdAllele2.append('ASM')

				else:
					#######################
					# ADD INFERRED ALLELE #		# a new allele
					#######################

					tagAux='INF'
					newAlleleId=str(alleleI+1)
					strand="-" if Reversed else "+"

					print("New allele! Adding allele "+ tagAux + newAlleleId +" to the database\n")

					# --- add the new allele to the gene fasta --- #
					# NOTE(review): tagAux[:-1] puts 'IN' (not 'INF') in the
					# header; kept because other tools may rely on it.
					appendAllele='>allele_' + newAlleleId + '_' + tagAux[:-1] +"_" + str(os.path.basename(genomeFile)) + '\n'
					with open( geneFile, 'a' ) as fG:
						fG.write(appendAllele)
						fG.write( alleleStr + '\n')

					# _protein2.fasta holds only the new allele, _protein.fasta
					# accumulates every allele of the locus
					proteinRecord='>'+newAlleleId+'\n'+str(protSeq) + '\n'
					with open( proteinfastaPath2, 'w' ) as fG:
						fG.write(proteinRecord)
					with open( proteinfastaPath, 'a' ) as fG:
						fG.write(proteinRecord)

					# --- remake blast DB and recalculate the BSR for the locus --- #
					alleleList.append(alleleStr)
					print(proteinfastaPath)
					Gene_Blast_DB_name2 = Create_Blastdb( proteinfastaPath2, 1, True )
					print ("Re-calculating BSR at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
					alleleI,allelescores,alleleList=reDogetBlastScoreRatios(proteinfastaPath2,basepath,alleleI,allelescores,Gene_Blast_DB_name2,alleleList,geneScorePickle)
					print("allele id " + str(alleleI))
					print ("Done Re-calculating BSR at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

					# Record the call only after every step succeeded: if one of
					# them raises, the handler below adds the single 'ERROR'
					# entry and the lists stay at one entry per genome.
					perfectMatchIdAllele.append( tagAux +"-"+newAlleleId)
					perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+strand)
					resultsList.append( tagAux + newAlleleId )

		except Exception as e:
			# a failure in one genome must not stop the remaining genomes
			print("some error occurred")
			print(e)
			print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno))
			perfectMatchIdAllele2.append("ERROR")
			perfectMatchIdAllele.append("ERROR")
			resultsList.append('ERROR')

	final =	(resultsList,perfectMatchIdAllele)
	print ("Finished allele calling at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	filepath=os.path.join(temppath , os.path.basename(geneFile)+"_result.txt")
	filepath2=os.path.join(temppath , os.path.basename(geneFile)+"_result2.txt")
	with open(filepath, 'wb') as f:
		pickle.dump(final, f)
	with open(filepath2, 'wb') as f:
		pickle.dump(perfectMatchIdAllele2, f)
	shutil.rmtree(basepath)
	return True
def main():
	"""Count the alleles of two gene lists that fall on the same locus.

	Command line: ``-i <1st list of gene files> -g <2nd list of gene files>``.
	Both lists are passed to ``concat_genes`` together with the output names
	``concat1.fasta`` / ``concat2.fasta`` (presumably the genes of each list
	are concatenated into that file -- confirm in ``concat_genes``).  A BLAST
	database is created from ``concat1.fasta`` and ``concat2.fasta`` is
	blasted against it with blastn (XML output).  Each BLAST record is judged
	through ``alignHasGoodMatch`` and counted as same locus, too different
	("small") or no match; the three counters are printed at the end.

	Side effects: creates ``./sameLocus`` if missing, and finally removes
	both concatenated FASTA files and the ``./blastdbs`` directory.
	"""

	parser = argparse.ArgumentParser(description="Given two list of genes, creates a folder with paired files when located on the same locus")
	parser.add_argument('-i', nargs='?', type=str, help='1st list of genes files to compare', required=True)
	parser.add_argument('-g', nargs='?', type=str, help='2nd list of genes files to compare', required=True)
	
	args = parser.parse_args()
	geneFiles1 = args.i
	geneFiles2 = args.g
	
		
	name1="concat1.fasta"
	name2="concat2.fasta"
		
	concat_genes(geneFiles1, name1)
	concat_genes(geneFiles2, name2)
	
	#orderedAlleleNames=[]

	# map every sequence of the first list to its name; a repeated sequence
	# keeps the name of its last occurrence
	geneDict={}
	gene_fp = HTSeq.FastaReader(name1)
	alleleI=0
	for allele in gene_fp:
		#if allele.seq in geneDict:
		#	print "\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n", geneFile
		#else:
			#orderedAlleleNames.append(allele.name)
		geneDict[ allele.seq ] = allele.name
		alleleI += 1
	
	# NOTE(review): this second reader and alleleI are not used below
	gene_fp = HTSeq.FastaReader(name1)
	geneFile = os.path.abspath( name1 )
	Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 , False)
	
	# the BLAST XML output is written next to the concatenated FASTA
	geneF = os.path.splitext( geneFile )[0]
	blast_out_file = geneF + '.xml'

	# list of results - the output of the function
	# NOTE(review): resultsList is never filled or returned
	resultsList = []

					# ------------------------------ RUNNING BLAST ------------------------------ #

	cline = NcbiblastnCommandline(query=name2, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
	blast_records = runBlastParser(cline, blast_out_file, name2)
	samelocus=0		# sum of the positive results returned by alignHasGoodMatch
	alreadyUsed=[]	# threaded through alignHasGoodMatch and updated by it
	nomatch=0		# records for which no alignment could be evaluated
	small=0			# records whose alignment was rejected by alignHasGoodMatch
	if not os.path.exists("./sameLocus"):
		os.makedirs("./sameLocus")
	LocusID=0
	for blast_record in blast_records:
		try:
			# Indexing [1] works as a "has at least two alignments" test: with
			# fewer alignments it raises and the record is handled by the
			# outer except branch below.
			alignment=blast_record.alignments[1]
			#print blast_record.query
			#print alignment.num_alignments
			
			try:
					#print alleleLength, alignment.length
				i=0
				# NOTE(review): align is bound to the first alignment here and
				# never reassigned inside the loop, so every pass looks at
				# alignments[0].
				align=blast_record.alignments[i]	
				while i<len(blast_record.alignments):
					if align.hit_def:
						result,allelename2,LocusID,alreadyUsed=alignHasGoodMatch(align,geneDict,LocusID,blast_record,alreadyUsed)
						# i+=999 is the loop exit (it only ends the loop while
						# a record has fewer than 1000 alignments)
						if result>0 and allelename2:
							samelocus+=result
							i+=999
						else:
							small+=1
							i+=999
							alreadyUsed.append(allelename2)
					# NOTE(review): allelename is only assigned in this branch
					# and in the outer except branch; if neither ran for an
					# earlier record this raises NameError, which the
					# "except Exception" below prints and swallows.
					elif allelename :
						#alreadyUsed.append(allelename)
						result,allelename,LocusID,alreadyUsed=alignHasGoodMatch(align,geneDict,LocusID,blast_record,alreadyUsed)
						if result>0:
							samelocus+=result
							i+=999
						else:
							small+=1
							i+=999
							#alreadyUsed.append(allelename2)
					else :
						nomatch+=1
					#print align.length, alleleleng
					
					i+=1
			except Exception as e:
				# errors while evaluating the record are printed and ignored
				print e
					#print "lkjh"
				pass
		except:
			# reached when alignments[1] could not be read: evaluate the first
			# alignment alone, or count the record as "no match" if that
			# fails too
			try:
				alignment=blast_record.alignments[0]
				#print blast_record.query
				
				result,allelename,LocusID,alreadyUsed=alignHasGoodMatch(alignment,geneDict,LocusID,blast_record,alreadyUsed)
				if result>0 and allelename:
					samelocus+=result
				else :
					small+=1
				#alreadyUsed.append(allelename)
				#alreadyUsed.append(alignment.hit_def)
			except:
				nomatch+=1
				
	
	print "%s are within same locus, %s had no match and %s had a bigger than 0.2 ratio size difference or less than 0.8 similarity ratio" % (samelocus,nomatch, small)
	
	# remove the temporary inputs and the BLAST databases
	os.remove(name1)
	os.remove(name2)
	shutil.rmtree('./blastdbs')
Example #24
0
def main():
	
	try:
		input_file = sys.argv[1]
		temppath = sys.argv[2]
	except IndexError:
		print "usage: list_pickle_obj"

	argumentList=[]
	with open(input_file,'rb') as f:
		argumentList = pickle.load(f)
	
	geneFile = argumentList[0]
	genomesList = argumentList[1]
	#listOfCDSDicts = argumentList[2]
	
	basepath=temppath
	
	gene_fp = HTSeq.FastaReader(geneFile)
	geneDict = {}
	alleleI = 1
	inverted=False
	orderedAlleleNames=[]
	biggestAllelelen=0
	smallestAllelelen=99999
	for allele in gene_fp:
		if allele.seq in geneDict:
			print "\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n", geneFile
		else:
			if len(allele.seq)>biggestAllelelen:
				biggestAllelelen=len(allele.seq)
			if len(allele.seq)<smallestAllelelen:
				smallestAllelelen=len(allele.seq)
			orderedAlleleNames.append(allele.name)
			geneDict[ allele.seq ] = alleleI
			alleleI += 1
	#print geneDict
	#print orderedAlleleNames

	# --- make 1st blast DB --- #

	Gene_Blast_DB_name = Create_Blastdb( geneFile, 1, False )
	geneF = os.path.basename(geneFile)
	blast_out_file = os.path.dirname(geneFile)+"/blastdbs/"+geneF + '.xml'

	# list of results - the output of the function
	resultsList = []
	i = 0
	perfectMatchIdAllele=[]
	genomeDict = {}
	for genomeFile in genomesList:
		#currentCDSDict = listOfCDSDicts[i]
		
		filepath=os.path.join(basepath,str(os.path.basename(genomeFile))+"_ORF.txt")
		with open(filepath,'rb') as f:
			currentCDSDict = pickle.load(f)
		
		g_fp = HTSeq.FastaReader( genomeFile )
		for contig in g_fp:
			sequence=str(contig.seq)
			genomeDict[ contig.name ] = sequence
		
		currentGenomeDict = genomeDict
		
		#print genomeFile
		#print resultsList
		#print geneDict
		#print orderedAlleleNames
		i+=1		# it has to be incremented here
		if genomeFile[-1] == '\n':
			genomeFile = genomeFile[:-1]

                # ------------------------------ RUNNING BLAST ------------------------------ #
		#print Gene_Blast_DB_name
		#cline = NcbiblastnCommandline(query=genomeFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
		cline = NcbiblastnCommandline(query=genomeFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
		#print cline
		blast_records = runBlastParser(cline, blast_out_file, genomeFile)

		print ("Finished Blast at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

		# ------ DETERMINING BEST MATCH ------ #

		# bestMatch = ['rec.query','hsp', lenRatio]
		bestMatch = ['','', 0]
		bestMatchContig=''
		bestMatchContigLen=''
		bestalignlen=0
		perfectMatch=False
		bmAlleleLen2=0
		bmAllele=''
		#noAlignment=False
		for blast_record in blast_records:
			# --- the LNF cases are now called outside de loop --- #
			#print blast_record
			if perfectMatch==True:
				break
			try:
				#print blast_record.alignments
				hspC = blast_record.alignments[0]
				
				if bestMatch[0] == '' and bestMatch[1] == '':
					bestMatch[0] = blast_record.query
					bestMatch[1] = hspC
			except IndexError:
				continue


			# --- the contig tag is used in the progigal function --- #

			contigTag = blast_record.query
			# --- brute force parsing of the contig tag - better solution is advisable --- #			

			j=0
			for l in contigTag:
				if l == ' ':
					break
				j+=1

			contigTag = contigTag[:j]

			contigLen = blast_record.query_letters
			#print blast_record.query_id
			
			# --- iterating over all the results to determine the best match --- #
			for alignment in blast_record.alignments:
				
				index=orderedAlleleNames.index(alignment.hit_def)
				#print alignment.hit_def
				for k, v in geneDict.iteritems():
					if v == index+1:
						bmAlleleLen2= len(k)
					
				if perfectMatch:
					break
				for match in alignment.hsps:
					#print match
					scoreRatio = float(match.score) / float(bmAlleleLen2)
					
					#print alignment.hit_def
					#print match.identities
					#print bmAlleleLen2
					#print
					#print match.identities
					#print len(match.sbjct)

					#if #identities is the same as the length of the allele and it has no gaps or N's
					if (int(match.identities)==int(bmAlleleLen2) and int(match.identities)==int(len(match.sbjct)) and "N" not in match.query and "K" not in match.query and "R" not in match.query ): 
						index=orderedAlleleNames.index(alignment.hit_def)
						for seq, alleleid in geneDict.iteritems():
							if alleleid == index+1:
								bmAllele=seq
								break
						bmAlleleLen= len(bmAllele)
						
						lenratio=float(len(match.query))/float(bmAlleleLen)
						bestMatch = [blast_record.query, match, scoreRatio, alignment.hit_def,lenratio,bmAlleleLen]
						bestMatchContig=contigTag
						perfectMatch=True
						index=orderedAlleleNames.index(alignment.hit_def)
						bmAlleleLen= len(geneDict.keys()[index])
						break
						
					#choose the match with the best score ratio score/length of allele	
					elif scoreRatio > bestMatch[2]:
						index=orderedAlleleNames.index(alignment.hit_def)
						for seq, alleleid in geneDict.iteritems():
							if alleleid == index+1:
								bmAllele=seq
								break
						bmAlleleLen= len(bmAllele)
						lenratio=float(len(match.query))/float(bmAlleleLen)
						bestMatch = [blast_record.query, match, scoreRatio, alignment.hit_def,lenratio,bmAlleleLen]
						bestMatchContig=contigTag
						bestMatchContigLen=blast_record.query_letters
						if match.sbjct_start > match.sbjct_end:
							inverted=True
						#print match.query
						bestalignlen=alignment.length
						#print match
						#print bmAlleleLen, bestMatchContig
						
						
					
					if perfectMatch==True:
						break


		# ---------- ALLELE CALLING AFTER DETERMINING BEST MATCH ---------- #
		print ("Finished choosing best match at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
		
		
		try:
			#print bestMatch[0]
			match = bestMatch[1]
			#print match.query
			geneLen = bestMatch[5]
			alleleStr = match.query
			nIdentities = match.identities
			idPercent = float(nIdentities) / float(geneLen)
			scoreRatio = bestMatch[2]
			lenRatio = bestMatch[4]
			#print perfectMatch
			#print "\nContig best/exact match is :"
			#print bestMatchContig +"\n"
		
		except:
			#if no best match was found
			
			###################
			# LOCUS NOT FOUND #
			###################
			
			resultsList.append('LNF3:-1')            # append result to the list of results
			perfectMatchIdAllele.append('LNF')
			printinfo(genomeFile,geneFile)
			print "Locus not found, no matches \n"
			continue
		
		
		if perfectMatch is True:
			
			#if a perfect match was found
			
			if match.sbjct_start > match.sbjct_end: #reverse the order if needed
				alleleStr = reverseComplement(alleleStr)
			alleleNumber = geneDict[ alleleStr ]
			
					################################################
					# EXACT MATCH --- MATCH == GENE --- GENE FOUND #
					################################################
			if "_" in bestMatch[3]:
				a=bestMatch[3].split("_")
				perfectMatchIdAllele.append(a[1])
			else:
				perfectMatchIdAllele.append(bestMatch[3])
			resultsList.append('EXC:' + str(alleleNumber) )
		
						
			

		elif bestMatch[0] != '' and perfectMatch is not True:
						
			#if a best match was found but it's not an exact match	

					###########################
					# LOCUS ON THE CONTIG TIP #
					###########################
			
			
			
			if match.query_start ==1 and len(match.query) < geneLen:
			
				resultsList.append('LOT5:-1')
				perfectMatchIdAllele.append('LOT5')
				printinfo(genomeFile,geneFile)
				
				print "Locus is on the 5' tip of the contig \n"
			
			
			elif match.query_end == bestMatchContigLen and len(match.query) < geneLen:
				
				resultsList.append('LOT3:-1')
				perfectMatchIdAllele.append('LOT3')
				printinfo(genomeFile,geneFile)

				print "Locus is on the 3' tip of the contig \n"
			
			elif bestMatchContigLen <= geneLen:
				
				resultsList.append('LOTSC:-1')
				perfectMatchIdAllele.append('LOTSC')
				printinfo(genomeFile,geneFile)
				#print match.query_start
				print "Locus is bigger than the contig \n"
					
					
				

			elif 'N' in alleleStr:
				#TODO gravar para ficheiro
					#####################
					# ALLELE NOT FOUND  #		# N base found!
					#####################
				
				geneFile2= os.path.splitext(geneFile)[0] + "LNFN.fasta"
				print geneFile2
				with open(geneFile2, 'a') as f:
					f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+"\n")
					f.write((alleleStr) +"\n")
				resultsList.append('LNFN:-1')
				perfectMatchIdAllele.append('LNFN')
				printinfo(genomeFile,geneFile) 
				print "LNFN, contains N bases! \n"
			
			
			
			else:
				# ------------------------------------------------------------------------------------------------------- #
				#                                                                                                         #
				#                                   USING PRODIGAL TO TRY TO EXTEND CDS                                   #
				#                                                                                                         #
				# ------------------------------------------------------------------------------------------------------- #
				
				CDSType=''
				sizeratio=0.2
				ORFFoundInMatch, strCDS, CDSType = extendCDS(bestMatchContig, currentCDSDict, match.query_start, match.query_end, currentGenomeDict, biggestAllelelen, smallestAllelelen,sizeratio)
				# --- if it was possible to extend it using prodigal --- #
				print ("Finished extension at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
				#print ORFFoundInMatch
				#print strCDS
				#print CDSType
				isContainedDefinedAllele = False
				
				try:	
					if ORFFoundInMatch :
						alleleStr = strCDS
						if match.sbjct_start > match.sbjct_end: #reverse the order if needed
							alleleStr = reverseComplement(alleleStr)
						
						lenRatio = float(len(strCDS)) / float(geneLen)
						defAllele=[]
						if alleleStr in geneDict:  #if ORF found is already defined
							alleleNumber = geneDict[ alleleStr ]
							
							################################################
							# EXACT MATCH --- MATCH == GENE --- GENE FOUND #						
							################################################
							perfectMatchIdAllele.append(alleleNumber)
							resultsList.append('EXC2:' + str(alleleNumber) )


							
						else:
									#######################
									# ADD INFERRED ALLELE #		# a new allele that was extended with prodigal
									#######################
							if(CDSType=='stop codon in match end'):
								tagAux = 'INF1:'
							elif(CDSType=='start codon in match beggining'):
								tagAux = 'INF2:'
							elif(CDSType=='bigger than match'):
								tagAux = 'INF3:'
							elif(CDSType=='same size as match'):
								tagAux = 'INF4:'
							elif(CDSType=='cds inside match'):
								tagAux = 'INF5:'
							elif(CDSType=='start codon inside match'):
								tagAux = 'INF6:'
							else:
								tagAux = 'INF7:'
									
							print "infered allele has location : "+(CDSType)
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append( tagAux +"-"+str(alleleI))
							print "New allele Infered with prodigal! Adding allele "+ tagAux + str(alleleI) +" to the database\n"
								
								
									
							geneDict[alleleStr] = alleleI
									
							resultsList.append( tagAux + str(alleleI) )

							orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] +"_" +str(os.path.basename(genomeFile)))	
								# --- add the new allele to the gene fasta --- #

							fG = open( geneFile, 'a' )
							fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] +"_" + str(os.path.basename(genomeFile)) + '\n')
							#print alleleStr
							fG.write( alleleStr + '\n')
							fG.close()
							alleleI += 1
								

								# --- remake blast DB --- #
								
							Gene_Blast_DB_name = Create_Blastdb( geneFile, 1,False )
							
							
							
					else:
								
					##################
					# LNF WTFFF #
					##################
						geneFile2= os.path.splitext(geneFile)[0] + "LNF2.fasta"
						print geneFile2
						with open(geneFile2, 'a') as f:
							f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
							f.write((alleleStr) +"\n")
							f.write(">Allele\n")
							f.write((bmAllele)+"\n")
						resultsList.append('LNF2')
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("LNF2")
						print "CDS not found"
				
				except:
					if ORFFoundInMatch :
						alleleStr = strCDS
					##################
					# LNF WTFFF #
					##################
					geneFile2= os.path.splitext(geneFile)[0] + "LNF99.fasta"
					print geneFile2
					with open(geneFile2, 'a') as f:
						f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
						f.write((alleleStr) +"\n")
						f.write(">Allele\n")
						f.write((bmAllele)+"\n")
					resultsList.append('LNF99')
					printinfo(genomeFile,geneFile) 
					perfectMatchIdAllele.append("LNF99")
					print "A problem occurred"
						
							
	final =	(resultsList,perfectMatchIdAllele)	
	#return (resultsList)
	print ("Finished allele calling at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	filepath=os.path.join(basepath , os.path.basename(geneFile)+"_result.txt")
	#print filepath
	with open(filepath, 'wb') as f:
		pickle.dump(final, f)
	return True
def callAlleles(argumentList):

	geneFile = argumentList[0]
	genomesList = argumentList[1]
	listOfGenomesDict = argumentList[2]
	
	gene_fp = HTSeq.FastaReader(geneFile)
	geneDict = {}
	alleleI = 1
	orderedAlleleNames=[]
	biggestAllelelen=0
	smallestAllelelen=99999
	for allele in gene_fp:
		if allele.seq in geneDict:
			print "\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n", geneFile
		else:
			if len(allele.seq)>biggestAllelelen:
				biggestAllelelen=len(allele.seq)
			if len(allele.seq)<smallestAllelelen:
				smallestAllelelen=len(allele.seq)
			orderedAlleleNames.append(allele.name)
			geneDict[ allele.seq ] = alleleI
			alleleI += 1
	#print geneDict
	#print orderedAlleleNames

	# --- make 1st blast DB --- #

	Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 )
	geneF = os.path.splitext( geneFile )[0]
	blast_out_file = geneF + '.xml'

	# list of results - the output of the function
	resultsList = []
	i = 0
	perfectMatchIdAllele=[]
	
	for genomeFile in genomesList:
		#print geneDict
		currentGenomeDict = listOfGenomesDict[i]
		
		#print genomeFile
		#print resultsList
		#print geneDict
		#print orderedAlleleNames
		i+=1		# it has to be incremented here
		if genomeFile[-1] == '\n':
			genomeFile = genomeFile[:-1]


                # ------------------------------ RUNNING BLAST ------------------------------ #

		cline = NcbiblastnCommandline(query=genomeFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
		blast_records = runBlastParser(cline, blast_out_file, genomeFile)
		

		# ------ DETERMINING BEST MATCH ------ #

		# bestMatch = ['rec.query','hsp', lenRatio]
		bestMatch = ['','', 0]
		bestMatchContig=''
		bestMatchContigLen=''
		bestalignlen=0
		perfectMatch=False
		bmAlleleLen2=0
		bmAllele=''
		#noAlignment=False
		for blast_record in blast_records:
			# --- the LNF cases are now called outside de loop --- #
			#print blast_record
			if perfectMatch==True:
				break
			try:
				#print blast_record.alignments
				hspC = blast_record.alignments[0]
				
				if bestMatch[0] == '' and bestMatch[1] == '':
					bestMatch[0] = blast_record.query
					bestMatch[1] = hspC
			except IndexError:
				continue


			# --- the contig tag is used in the progigal function --- #

			contigTag = blast_record.query
			# --- brute force parsing of the contig tag - better solution is advisable --- #			

			j=0
			for l in contigTag:
				if l == ' ':
					break
				j+=1

			contigTag = contigTag[:j]

			contigLen = blast_record.query_letters
			#print blast_record.query_id
			
			# --- iterating over all the results to determine the best match --- #
			for alignment in blast_record.alignments:

				index=orderedAlleleNames.index(alignment.hit_def)
				#print alignment.hit_def
				for k, v in geneDict.iteritems():
					if v == index+1:
						bmAlleleLen2= len(k)
					
				if perfectMatch:
					break
				for match in alignment.hsps:
					#print match
					scoreRatio = float(match.score) / float(bmAlleleLen2)
					
					#print alignment.hit_def
					#print match.identities
					#print bmAlleleLen2
					#print
					#print match.identities
					#print len(match.sbjct)
					
					#if #identities is the same as the length of the allele and it has no gaps or N's
					if (int(match.identities)==int(bmAlleleLen2) and int(match.identities)==int(len(match.sbjct)) and "N" not in match.query ): 
						index=orderedAlleleNames.index(alignment.hit_def)
						
						for seq, alleleid in geneDict.iteritems():
							if alleleid == index+1:
								bmAllele=seq
								break
						bmAlleleLen= len(bmAllele)
						
						lenratio=float(len(match.query))/float(bmAlleleLen)
						bestMatch = [blast_record.query, match, scoreRatio, alignment.hit_def,lenratio,bmAlleleLen]
						bestMatchContig=contigTag
						perfectMatch=True
						index=orderedAlleleNames.index(alignment.hit_def)
						bmAlleleLen= len(geneDict.keys()[index])
						break
						
					#choose the match with the best score ratio score/length of allele	
					elif scoreRatio > bestMatch[2]:
						index=orderedAlleleNames.index(alignment.hit_def)
						#print orderedAlleleNames
						#print geneDict
						#print orderedAlleleNames
						#print alignment.hit_def
						#print index
						#print geneDict
						for seq, alleleid in geneDict.iteritems():
							if alleleid == index+1:
								bmAllele=seq
								break
						bmAlleleLen= len(bmAllele)
						#print bmAllele
						lenratio=float(len(match.query))/float(bmAlleleLen)
						bestMatch = [blast_record.query, match, scoreRatio, alignment.hit_def,lenratio,bmAlleleLen]
						bestMatchContig=contigTag
						bestMatchContigLen=blast_record.query_letters
						
						#print match.query
						bestalignlen=alignment.length
						
						
					
					if perfectMatch==True:
						break


		# ---------- ALLELE CALLING AFTER DETERMINING BEST MATCH ---------- #

		
		
		try:
			#print bestMatch[0]
			match = bestMatch[1]
			#print match
			#print match.sbjct
			geneLen = bestMatch[5]
			alleleStr = match.query
			nIdentities = match.identities
			idPercent = float(nIdentities) / float(geneLen)
			scoreRatio = bestMatch[2]
			lenRatio = bestMatch[4]
			#print perfectMatch
			#print "\nContig best/exact match is :"
			#print bestMatchContig +"\n"
		
		except:
			#if no best match was found
			
			###################
			# LOCUS NOT FOUND #
			###################
			
			resultsList.append('LNF:-1')            # append result to the list of results
			perfectMatchIdAllele.append('LNF')
			printinfo(genomeFile,geneFile)
			print "Locus not found \n"
			continue
		
		
		if perfectMatch is True:
			
			#if a perfect match was found
			
			if match.sbjct_start > match.sbjct_end: #reverse the order if needed
				alleleStr = reverseComplement(alleleStr)
			alleleNumber = geneDict[ alleleStr ]
			
					################################################
					# EXACT MATCH --- MATCH == GENE --- GENE FOUND #
					################################################
			if "_" in bestMatch[3]:
				a=bestMatch[3].split("_")
				perfectMatchIdAllele.append(a[1])
			else:
				perfectMatchIdAllele.append(bestMatch[3])
			resultsList.append('EXC:' + str(alleleNumber) )
								
			

		elif bestMatch[0] != '' and perfectMatch is not True:
						
			#if a best match was found but it's not an exact match	

					###########################
					# LOCUS ON THE CONTIG TIP #
					###########################
						
			
			if match.query_start ==1 and len(match.query) < geneLen:
			
				resultsList.append('LOT5:-1')
				perfectMatchIdAllele.append('LOT5')
				printinfo(genomeFile,geneFile)
				
				print "Locus is on the 5' tip of the contig \n"
			
			
			elif match.query_end == bestMatchContigLen and len(match.query) < bestMatchContigLen:
				
				resultsList.append('LOT3:-1')
				perfectMatchIdAllele.append('LOT3')
				printinfo(genomeFile,geneFile)

				print "Locus is on the 3' tip of the contig \n"
			
			elif bestMatchContigLen <= geneLen:
				
				resultsList.append('LOTSC:-1')
				perfectMatchIdAllele.append('LOTSC')
				printinfo(genomeFile,geneFile)
				#print match.query_start
				print "Locus is bigger than the contig \n"
								
				
			elif 'N' in alleleStr:
				#TODO gravar para ficheiro
					#####################
					# ALLELE NOT FOUND  #		# N base found!
					#####################
				
				geneFile2= os.path.splitext(geneFile)[0] + "LNFN.fasta"
				print geneFile2
				with open(geneFile2, 'a') as f:
					f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+"\n")
					f.write((alleleStr) +"\n")
				resultsList.append('LNFN:-1')
				perfectMatchIdAllele.append('LNFN')
				printinfo(genomeFile,geneFile) 
				print "LNFN, contains N bases! \n"
			
			
			
			else:
				
				#removing gaps
					
				alleleStr = alleleStr.replace('-', '')
				#lenExtraThresh=int(biggestAllelelen*0.1)
				lenExtraThresh=50
			
				#print alleleStr
				# --- it might be needed to obtain the reverse complement of the allele string --- #
				if match.sbjct_start > match.sbjct_end:
					alleleStr = reverseComplement(alleleStr)
					
				#if alleleStr in geneDict:  #if best match without gaps is already defined, example: best match allele was already defined but without gaps it's equal to a NA added
				#	alleleNumber = geneDict[ alleleStr ]
					
					################################################
					# EXACT MATCH --- MATCH == GENE --- GENE FOUND #						
					################################################
				#	perfectMatchIdAllele.append("EXC2-"+str(alleleNumber))
				#	resultsList.append('EXC2:' + str(alleleNumber) )
						

				#else: #check if best match without gaps are contained inside an already defined allele

				isContainedDefinedAllele = False	
				#print geneDict.keys()[0]
				definedAllele=''
				definedAlleleName=''
				for k in geneDict.keys():
					if alleleStr in k:
						definedAllele=k
						#print alleleStr
						isContainedDefinedAllele = True
						definedAlleleName=geneDict.get(k)
						break
						
				if isContainedDefinedAllele  and int(len(match.query))<=int(len(definedAllele))+lenExtraThresh and int(len(match.query))>=int(len(definedAllele))-lenExtraThresh :
					#allele without gaps is contained in a defined allele
					#best match with gaps has same size +1/-1 base as the defined allele
					
					#print int(len(definedAllele)), int(len(match.sbjct))
					
						
					if int(len(alleleStr))==int(len(definedAllele)): # if match without gaps has same size as the defined allele 
						tagAux = 'NA1:'
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("NA1-"+str(alleleI))
						
					elif int(len(alleleStr))==int(len(definedAllele))-1 : # if match without gaps has minus one base than the defined allele
						
						tagAux = 'NA2:'
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("NA2-"+str(alleleI))
					#elif int(len(alleleStr))==int(len(definedAllele))+1 : # if match without gaps has plus one base than the defined allele 
					#	tagAux = 'NA3:'
					#	printinfo(genomeFile,geneFile) 
					#	perfectMatchIdAllele.append("NA3-"+str(alleleI))
						
					else:												# if match without gaps has more than one base missing comparing to the defined allele 
						tagAux = 'NA4:'
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("NA4-"+str(alleleI))
					#TODO catch +1 and others
							
					print "New allele found! Adding allele "+ tagAux + str(alleleI) +" to the database"
					geneDict[alleleStr] = alleleI
						
					resultsList.append( tagAux + str(alleleI) )
						
					orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)))									
					# --- add the new allele to the gene fasta --- #
						
					fG = open( geneFile, 'a' )
					fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)) + '\n')
					fG.write( alleleStr + '\n')
					fG.close()
					alleleI += 1
					Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 )
					
				#if best match is not contained in an already defined allele, check if it has similar size with the match allele and has 0.8 similarity
				elif not isContainedDefinedAllele and idPercent > 0.8 and int(len(match.query))<=int(geneLen)+lenExtraThresh and int(len(match.query))>=int(geneLen)-lenExtraThresh :
					#best match with gaps has 80% identity
					#best match with gaps is the same size or +1/-1 as the defined allele
					
					ratio=float(len(alleleStr)) / float(geneLen)
					
					if ratio>=0.8 and ratio<=1.2: # if match without gaps has same size as the best match allele and 80%similarity
						
						tagAux = ''
						extraleft=0
						extraright=0
						tS=0
						tE=0
						#print int(geneLen), len(match.sbjct)
						#print match.sbjct
						#print match
						handle = open(genomeFile, "rU")
						record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
						handle.close()
						record= record_dict[bestMatchContig]
						#print match.sbjct
						#if(int(len(alleleStr))<int(len(match.query)) and int(len(match.query))<int(geneLen)) and int(geneLen)==int(match.sbjct_start): #if best match allele has missing bases, the tips would be cut
						
						#if len(match.sbjct)<geneLen and "-" not in match.sbjct:  #if the allele is not fully used against the match, compensate the tips
						
						if (1<int(match.sbjct_start) and 1<int(match.sbjct_end)):
							
							if match.sbjct_start > match.sbjct_end:
								extraleft=match.sbjct_end-1
								
							else:
								extraleft=match.sbjct_start-1
								
								
						if (int(geneLen)>int(match.sbjct_start) and int(geneLen)>int(match.sbjct_end) ): # if 3' tip bases of the allele are missing on the match
							
							
							if match.sbjct_start > match.sbjct_end:
								extraright=geneLen-match.sbjct_start
								
							else:
								extraright=geneLen-match.sbjct_end
								
						#print 	extraleft, 	extraright
						
						
						if match.sbjct_start > match.sbjct_end:
							tS=match.query_start-extraright-1
							tE=match.query_end+extraleft
							alleleStr=str(record.seq[tS:tE])
							alleleStr = reverseComplement(alleleStr)
						else:
							tS=match.query_start-extraleft-1
							tE=match.query_end+extraright
							alleleStr=str(record.seq[tS:tE])
						
						tagAux = 'NA5:'
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("NA5-"+str(alleleI))
							
						print "New allele found! Adding allele "+ tagAux + str(alleleI) +" to the database"
						geneDict[alleleStr] = alleleI
							
						resultsList.append( tagAux + str(alleleI) )
							
						orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)))									
						# --- add the new allele to the gene fasta --- #
							
						fG = open( geneFile, 'a' )
						fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)) + '\n')
						fG.write( alleleStr + '\n')
						fG.close()
						alleleI += 1
						Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 )
						
					else:
						##################
						# LNF WTFFF #
						##################
						geneFile2= os.path.splitext(geneFile)[0] + "LNF3.fasta"
						print geneFile2
						with open(geneFile2, 'a') as f:
							f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
							f.write((alleleStr) +"\n")
							f.write(">Allele\n")
							f.write((bmAllele)+"\n")
						resultsList.append('LNF3')
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("LNF3")
						print "No allele found"
						
				elif isContainedDefinedAllele:
							####################
						# UNDEFINED ALLELE #		# it is contained in another allele
						####################
						
					alleleStr=match.query
					#if match.sbjct_start > match.sbjct_end:    #### - error
						#alleleStr = reverseComplement(alleleStr)
					resultsList.append('UND:-1')
					perfectMatchIdAllele.append("undefined allele")
					printinfo(genomeFile,geneFile) 
					print "Undefined allele \n"
					
					geneFile2= os.path.splitext(geneFile)[0] + "undefined.fasta"
					print geneFile2
					with open(geneFile2, 'a') as f:
						f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
						f.write((alleleStr) +"\n")
						#f.write(">BlastBestMatch"+str(definedAlleleName)+"\n")
						#f.write((alleleStr)+"\n")
						f.write(">Allele"+str(definedAlleleName)+"\n")
						f.write((definedAllele)+"\n")
					
				
							
				elif lenRatio < 0.5:
						
					###############
					# SMALL MATCH #
					###############
								
					resultsList.append('SAC:-1')		# don't know what 'SAC' stands for
					perfectMatchIdAllele.append('small match')
					printinfo(genomeFile,geneFile) 
					print "lower than 50% match \n"	
							
				elif lenRatio < 0.8 and idPercent < 0.5:
						#####################
					# INCOMPLETE ALLELE #		# it was not possible to extend it to at least 80% of the length of the gene
					#####################
					resultsList.append('INC:-1')
					perfectMatchIdAllele.append('allele incomplete')
					printinfo(genomeFile,geneFile)
					print "Incomplete allele\n"
						
				else:	
					##################
					# LNF WTFFF #
					##################
					geneFile2= os.path.splitext(geneFile)[0] + "LNF2.fasta"
					print geneFile2
					with open(geneFile2, 'a') as f:
						f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
						f.write((alleleStr) +"\n")
						f.write(">Allele\n")
						f.write((bmAllele)+"\n")
					resultsList.append('LNF2')
					printinfo(genomeFile,geneFile) 
					perfectMatchIdAllele.append("LNF2")
					print "No allele found"

						
							
	final =	(resultsList,perfectMatchIdAllele)	
	#return (resultsList)
	return final
def main():
	
	try:
		input_file = sys.argv[1]
		temppath = sys.argv[2]
	except IndexError:
		print "usage: list_pickle_obj"

	argumentList=[]
	with open(input_file,'rb') as f:
		argumentList = pickle.load(f)
	
	geneFile = argumentList[0]
	genomesList = argumentList[1]

	basepath=temppath+"/"+os.path.basename(geneFile)

	if not os.path.exists(basepath+"/blastdbs/"):
		os.makedirs(basepath+"/blastdbs/")
	
	
	gene_fp = HTSeq.FastaReader(geneFile)
	geneDict = {}
	alleleI = 1
	inverted=False
	orderedAlleleNames=[]
	biggestAllelelen=0
	smallestAllelelen=99999
	for allele in gene_fp:
		if allele.seq in geneDict:
			print "\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n", geneFile
		else:
			if len(allele.seq)>biggestAllelelen:
				biggestAllelelen=len(allele.seq)
			if len(allele.seq)<smallestAllelelen:
				smallestAllelelen=len(allele.seq)
			orderedAlleleNames.append(str(alleleI))
			geneDict[ allele.seq ] = alleleI
			alleleI += 1

	# --- make 1st blast DB --- #

	geneF = os.path.basename(geneFile)
	blast_out_file = os.path.dirname(geneFile)+"/blastdbs/"+geneF + '.xml'

	# list of results - the output of the function
	resultsList = []
	i = 0
	perfectMatchIdAllele=[]
	perfectMatchIdAllele2=[]
	genomeDict = {}
	genome=-1	
	print genomesList
	for genomeFile in genomesList:
		print "_______________________________________________________"

		printinfo(genomeFile,geneFile)
		
		g_fp = HTSeq.FastaReader( genomeFile )
		for contig in g_fp:
			sequence=str(contig.seq)
			genomeDict[ contig.name ] = sequence
		
		currentGenomeDict = genomeDict
		
		genome+=1
		
		print ("Blasting alleles on genome at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
		
		blast_out_file = os.path.join(basepath,"blastdbs/"+os.path.basename(geneFile)+ '_List.xml')

		Gene_Blast_DB_name = os.path.join(temppath,str(os.path.basename(genomeFile))+"/"+str(os.path.basename(genomeFile))+"_db")

		
		cline = NcbiblastnCommandline(query=geneFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)

			
		blast_records = runBlastParser(cline, blast_out_file, geneFile)
		print ("Blasted alleles on genome at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))

		# ------ DETERMINING BEST MATCH ------ #

		bestMatch = ['','', 0]
		bestMatchContig=''
		bestMatchContigLen=''
		bestalignlen=0
		perfectMatch=False
		bmAlleleLen2=0
		bmAllele=''

		for blast_record in blast_records:
		
			
			if perfectMatch==True:
				break
			try:

				hspC = blast_record.alignments[0]
				
				if bestMatch[0] == '' and bestMatch[1] == '':
					bestMatch[0] = blast_record.query
					bestMatch[1] = hspC
			except IndexError:
				continue


			# --- the contig tag is used in the progigal function --- #

			contigTag = blast_record.query
			
	
			j=0
			for l in contigTag:
				if l == ' ':
					break
				j+=1

			contigTag = contigTag[:j]

			contigLen = blast_record.query_letters
			
			# --- iterating over all the results to determine the best match --- #
			for alignment in blast_record.alignments:
				contigTag = alignment.hit_def
				contigTag=(contigTag.split(" "))[0]

				index=orderedAlleleNames.index(str(blast_record.query_id).split("_")[1])
				
				#print alignment.hit_def
				for k, v in geneDict.iteritems():
					if v == index+1:
						bmAlleleLen2= len(k)
					
				if perfectMatch:
					break
				for match in alignment.hsps:

					scoreRatio = float(match.score) / float(bmAlleleLen2)
					


					#if # of identities is the same as the length of the allele and it has no gaps or N's
					if (int(match.identities)==int(bmAlleleLen2) and int(match.identities)==int(len(match.query)) and "N" not in match.sbjct and "K" not in match.sbjct and "Y" not in match.sbjct and "R" not in match.sbjct ): 
						
						index=orderedAlleleNames.index(str(blast_record.query_id).split("_")[1])
						for seq, alleleid in geneDict.iteritems():
							if alleleid == index+1:
								bmAllele=seq
								break
						bmAlleleLen= len(bmAllele)
						
						lenratio=float(len(match.sbjct))/float(bmAlleleLen)
						bestMatch = [blast_record.query, match, scoreRatio, blast_record.query_id,lenratio,bmAlleleLen]
						bestMatchContig=contigTag
						perfectMatch=True
						index=orderedAlleleNames.index(str(blast_record.query_id).split("_")[1])
						bmAlleleLen= len(geneDict.keys()[index])
						break
						
					#choose the match with the best score ratio score/length of allele	
					elif scoreRatio > bestMatch[2]:
						index=orderedAlleleNames.index(str(blast_record.query_id).split("_")[1])
						for seq, alleleid in geneDict.iteritems():
							if alleleid == index+1:
								bmAllele=seq
								break
						bmAlleleLen= len(bmAllele)
						lenratio=float(len(match.sbjct))/float(bmAlleleLen)
						bestMatch = [blast_record.query, match, scoreRatio, blast_record.query_id,lenratio,bmAlleleLen]
						bestMatchContig=contigTag
						bestMatchContigLen=len(currentGenomeDict[contigTag])
						print contigTag
						bestalignlen=alignment.length
						
						
					
					if perfectMatch==True:
						break


		# ---------- ALLELE CALLING AFTER DETERMINING BEST MATCH ---------- #
		print ("Finished choosing best match at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
		
		
		try:

			match = bestMatch[1]
			bestMatchStart=match.sbjct_start
			bestMatchEnd=match.sbjct_end
			if match.query_start > match.query_end:
				bestMatchEnd=match.sbjct_start
				bestMatchStart=match.sbjct_end
			

			geneLen = bestMatch[5]
			alleleStr = match.sbjct
			nIdentities = match.identities
			idPercent = float(nIdentities) / float(geneLen)
			scoreRatio = bestMatch[2]
			lenRatio = bestMatch[4]

		
		except:
			#if no best match was found
			
			###################
			# LOCUS NOT FOUND #
			###################
			
			resultsList.append('LNF3:-1')
			perfectMatchIdAllele.append('LNF')
			perfectMatchIdAllele2.append('LNF')
			
			print "Locus not found, no matches \n"
			continue
		
		print "is perfect match true?" +str(perfectMatch)
		if perfectMatch is True:
			
			#if a perfect match was found (DNA sequence is the same)
			

			try:
				alleleNumber = geneDict[ alleleStr ]
			except:
				alleleStr=reverseComplement(alleleStr)
				alleleNumber = geneDict[ alleleStr ]
			
					################################################
					# EXACT MATCH --- MATCH == GENE --- GENE FOUND #
					################################################
			if "_" in bestMatch[3]:
				a=bestMatch[3].split("_")
				perfectMatchIdAllele.append(a[1])
				perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
			else:
				perfectMatchIdAllele.append(bestMatch[3])
				perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
			resultsList.append('EXC:' + str(alleleNumber) )
			printinfo(genomeFile,geneFile)
			print "Exact match \n"
			continue
						
			

		elif bestMatch[0] != '' and perfectMatch is not True:
						
			#if a best match was found but it's not an exact match	

					###########################
					# LOCUS ON THE CONTIG TIP #
					###########################
			
			#check if the match is on the tip of the contig
			
			if bestMatchContigLen <= geneLen:
				

				
				resultsList.append('LOTSC:-1')
				perfectMatchIdAllele.append('LOTSC')
				perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
				printinfo(genomeFile,geneFile)
				print "Locus is bigger than the contig \n"
			
			
			elif match.sbjct_start ==1 and len(match.query) < geneLen:
			
				resultsList.append('LOT5:-1')
				perfectMatchIdAllele.append('LOT5')
				perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
				printinfo(genomeFile,geneFile)
				
				print "Locus is on the 5' tip of the contig \n"
			
			elif match.sbjct_end ==1 and len(match.query) < geneLen and match.sbjct_start > match.sbjct_end:
			
				resultsList.append('LOT3:-1')
				perfectMatchIdAllele.append('LOT3')
				perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
				printinfo(genomeFile,geneFile)
				
				print "Locus is on the 3' tip of the contig \n"
			
			
			
			elif match.sbjct_end == bestMatchContigLen and len(match.query) < bestMatchContigLen:
				
				resultsList.append('LOT3:-1')
				perfectMatchIdAllele.append('LOT3')
				perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
				printinfo(genomeFile,geneFile)

				print "Locus is on the 3' tip of the contig \n"
			
			elif match.sbjct_start == bestMatchContigLen and len(match.query) < bestMatchContigLen and match.sbjct_start > match.sbjct_end:
				
				resultsList.append('LOT5:-1')
				perfectMatchIdAllele.append('LOT5')
				perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
				printinfo(genomeFile,geneFile)

				print "Locus is on the 5' tip of the contig \n"

				

			elif 'N' in alleleStr or "K" in alleleStr or "R" in alleleStr or "Y" in alleleStr:
				
					#####################
					# ALLELE NOT FOUND  #		
					#####################
				
				# strange base found!
				
				geneFile2= os.path.splitext(geneFile)[0] + "LNFN.fasta"

				with open(geneFile2, 'a') as f:
					f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+"\n")
					f.write((alleleStr) +"\n")
				resultsList.append('LNFN:-1')
				perfectMatchIdAllele.append('LNFN')
				perfectMatchIdAllele2.append('LNFN')
				printinfo(genomeFile,geneFile) 
				print "LNFN, contains strange (N,K,R) bases! \n"
			
			
			
			else:
				
				print "new allele?"
					
				alleleStr = alleleStr.replace('-', '')
				lenExtraThresh=int(biggestAllelelen*0.2)

				#else: #check if best match without gaps are contained inside an already defined allele

				isContainedDefinedAllele = False	
				definedAllele=''
				definedAlleleName=''

				for k in geneDict.keys():
					if alleleStr in k:
						definedAllele=k
						isContainedDefinedAllele = True
						definedAlleleName=geneDict.get(k)
						break
				print "is contained? " + str(isContainedDefinedAllele)
				print idPercent
				print geneLen
				print lenExtraThresh
				print lenRatio
				
				if isContainedDefinedAllele  and int(len(match.sbjct))<=int(len(definedAllele))+lenExtraThresh and int(len(match.sbjct))>=int(len(definedAllele))-lenExtraThresh :
					#allele without gaps is contained in a defined allele
					#best match with gaps has same size +1/-1 base as the defined allele
					
					isnewallele=False
						
					if int(len(alleleStr))==int(len(definedAllele)): # if match without gaps has same size as the defined allele 
						tagAux = 'NA1:'
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("NA1-"+str(alleleI))
						perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
						isnewallele=True
						
					elif int(len(alleleStr))==int(len(definedAllele))-1 : # if match without gaps has minus one base than the defined allele
						
						tagAux = 'NA2:'
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("NA2-"+str(alleleI))
						perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+")
						isnewallele=True
						
					else:
							extraleft=0
							extraright=0
							tS=0
							tE=0

							handle = open(genomeFile, "rU")
							record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
							handle.close()
							record= record_dict[bestMatchContig]
							
							
							# if match without gaps has more than one base missing comparing to the defined allele 
							if (1<int(match.query_start) and 1<int(match.query_end)):
								
								if match.query_start > match.query_end:
									extraleft=match.query_end-1
									
								else:
									extraleft=match.query_start-1
							
							print 	extraleft, 	extraright		
									
									
							# if 3' tip bases of the allele are missing on the match	
							if (int(geneLen)>int(match.query_start) and int(geneLen)>int(match.query_end) ): 
								
								if match.query_start > match.query_end:
									extraright=geneLen-match.query_start
									
								else:
									extraright=geneLen-match.query_end
									
							print 	extraleft, 	extraright
							
							
							if match.sbjct_start > match.sbjct_end:
								tE=match.sbjct_start+extraleft
								tS=match.sbjct_end-extraright-1
								alleleStr=str(record.seq[tS:tE])
								alleleStr = reverseComplement(alleleStr)
							else:
								tS=match.sbjct_start-extraleft-1
								tE=match.sbjct_end+extraright
								alleleStr=str(record.seq[tS:tE])
							
							print tS
							print tE
							print "allele is:"
							print alleleStr
							
							if tE> bestMatchContigLen:
								resultsList.append('LOT3B:-1')
								perfectMatchIdAllele.append('LOT3B')
								perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(tS)+"-"+str(bestMatchContigLen)+"&"+"+")
								printinfo(genomeFile,geneFile)
								
								print "Locus is on the 3B' tip of the contig \n"
							
							elif tS<0:
								resultsList.append('LOT5B:-1')
								perfectMatchIdAllele.append('LOT5B')
								perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(0)+"-"+str(tE)+"&"+"+")
								printinfo(genomeFile,geneFile)
								
								print "Locus is on the 5B' tip of the contig \n"
						
						
							else:
						
								tagAux = 'NA4:'
								printinfo(genomeFile,geneFile) 
								perfectMatchIdAllele.append("NA4-"+str(alleleI))
								perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(tS)+"-"+str(tE)+"&"+"+")
								isnewallele=True
						
					
					if isnewallele:
						print "New allele found! Adding allele "+ tagAux + str(alleleI) +" to the database"
						geneDict[alleleStr] = alleleI
							
						resultsList.append( tagAux + str(alleleI) )
							
						orderedAlleleNames.append(str(alleleI))						
						# --- add the new allele to the gene fasta --- #
							
						fG = open( geneFile, 'a' )
						fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)) + '\n')
						fG.write( alleleStr + '\n')
						fG.close()
						alleleI += 1
					
				#if best match is not contained in an already defined allele, check if it has similar size with the match allele and has 0.8 similarity
				
				elif not isContainedDefinedAllele and idPercent >= 0.8 and int(len(match.sbjct))<=int(geneLen)+lenExtraThresh and int(len(match.sbjct))>=int(geneLen)-lenExtraThresh :

					ratio=float(len(alleleStr)) / float(geneLen)
					
					if ratio>=0.8 and ratio<=1.2: # if match without gaps has same size as the best match allele and 80%similarity
						
						tagAux = ''
						extraleft=0
						extraright=0
						tS=0
						tE=0

						handle = open(genomeFile, "rU")
						record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
						handle.close()
						record= record_dict[bestMatchContig]

						#if len(match.sbjct)<geneLen and "-" not in match.sbjct:  #if the allele is not fully used against the match, compensate the tips
						try:
							print match
							if (1<int(match.query_start) and 1<int(match.query_end)):
								
								if match.query_start > match.query_end:
									extraleft=match.query_end-1
									
								else:
									extraleft=match.query_start-1
							
							print 	extraleft, 	extraright		
									
							if (int(geneLen)>int(match.query_start) and int(geneLen)>int(match.query_end) ): # if 3' tip bases of the allele are missing on the match
								
								
								if match.query_start > match.query_end:
									extraright=geneLen-match.query_start
									
								else:
									extraright=geneLen-match.query_end
									
							print 	extraleft, 	extraright
							
							
							if match.sbjct_start > match.sbjct_end:
								tE=match.sbjct_start+extraleft
								tS=match.sbjct_end-extraright-1
								alleleStr=str(record.seq[tS:tE])
								alleleStr = reverseComplement(alleleStr)
							else:
								tS=match.sbjct_start-extraleft-1
								tE=match.sbjct_end+extraright
								alleleStr=str(record.seq[tS:tE])
							
							print tS
							print tE
							print "allele is:"
							print alleleStr
							
							if tE> bestMatchContigLen:
								resultsList.append('LOT3B:-1')
								perfectMatchIdAllele.append('LOT3B')
								perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(tS)+"-"+str(bestMatchContigLen)+"&"+"+")
								printinfo(genomeFile,geneFile)
								
								print "Locus is on the 3B' tip of the contig \n"
							
							elif tS<0:
								resultsList.append('LOT5B:-1')
								perfectMatchIdAllele.append('LOT5B')
								perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(0)+"-"+str(tE)+"&"+"+")
								printinfo(genomeFile,geneFile)
								
								print "Locus is on the 5B' tip of the contig \n"
							
							else:
								tagAux = 'NA5:'
								printinfo(genomeFile,geneFile) 
								perfectMatchIdAllele.append("NA5-"+str(alleleI))
								perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(tS)+"-"+str(tE)+"&"+"+")
									
								print "New allele found! Adding allele "+ tagAux + str(alleleI) +" to the database"
								geneDict[alleleStr] = alleleI
									
								resultsList.append( tagAux + str(alleleI) )
									
								#orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)))									
								orderedAlleleNames.append(str(alleleI))
								# --- add the new allele to the gene fasta --- #
									
								fG = open( geneFile, 'a' )
								fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)) + '\n')
								fG.write( alleleStr + '\n')
								fG.close()
								alleleI += 1

						except Exception as e:
							##################
							# LNF WTF #
							##################
							print e
							geneFile2= os.path.splitext(geneFile)[0] + "LNF3.fasta"
							print geneFile2
							with open(geneFile2, 'a') as f:
								f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
								f.write((alleleStr) +"\n")
								f.write(">Allele\n")
								f.write((bmAllele)+"\n")
							resultsList.append('LNF3')
							printinfo(genomeFile,geneFile) 
							perfectMatchIdAllele.append("LNF3")
							perfectMatchIdAllele2.append("LNF3")
							print "No allele found"
					else:
						##################
						# LNF WTF2 #
						##################
						geneFile2= os.path.splitext(geneFile)[0] + "LNF4.fasta"
						print geneFile2
						with open(geneFile2, 'a') as f:
							f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
							f.write((alleleStr) +"\n")
							f.write(">Allele\n")
							f.write((bmAllele)+"\n")
						resultsList.append('LNF4')
						printinfo(genomeFile,geneFile) 
						perfectMatchIdAllele.append("LNF4")
						perfectMatchIdAllele2.append("LNF4")
						print "No allele found"
					
					
					
					
					
						
				elif isContainedDefinedAllele:
						####################
						# UNDEFINED ALLELE #		# it is contained in another allele
						####################
						
					alleleStr=match.query

					resultsList.append('UND:-1')
					perfectMatchIdAllele.append("undefined allele")
					perfectMatchIdAllele2.append("undefined allele")
					printinfo(genomeFile,geneFile) 
					print "Undefined allele \n"
					
					geneFile2= os.path.splitext(geneFile)[0] + "undefined.fasta"
					print geneFile2
					"""with open(geneFile2, 'a') as f:
						f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
						f.write((alleleStr) +"\n")
						#f.write(">BlastBestMatch"+str(definedAlleleName)+"\n")
						#f.write((alleleStr)+"\n")
						f.write(">Allele"+str(definedAlleleName)+"\n")
						f.write((definedAllele)+"\n")"""
					
				
							
				elif lenRatio < 0.5:
						
					###############
					# SMALL MATCH #
					###############
								
					resultsList.append('SAC:-1')		# don't know what 'SAC' stands for
					perfectMatchIdAllele.append('small match')
					perfectMatchIdAllele2.append('small match')
					printinfo(genomeFile,geneFile) 
					print "lower than 50% match \n"	
							
				elif lenRatio < 0.8 and idPercent < 0.5:
					
					#####################
					# INCOMPLETE ALLELE #		# it was not possible to extend it to at least 80% of the length of the gene
					#####################
					resultsList.append('INC:-1')
					perfectMatchIdAllele.append('allele incomplete')
					perfectMatchIdAllele2.append('allele incomplete')
					printinfo(genomeFile,geneFile)
					print "Incomplete allele\n"
						
				else:	
					##################
					# LNF WTFFF #
					##################
					geneFile2= os.path.splitext(geneFile)[0] + "LNF.fasta"
					print geneFile2
					with open(geneFile2, 'a') as f:
						f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n")
						f.write((alleleStr) +"\n")
						f.write(">Allele\n")
						f.write((bmAllele)+"\n")
					resultsList.append('LNF5')
					printinfo(genomeFile,geneFile) 
					perfectMatchIdAllele.append("LNF5")
					perfectMatchIdAllele2.append("LNF5")
					print "Locus not found"
						
							
	final =	(resultsList,perfectMatchIdAllele)	

	print ("Finished allele calling at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	filepath=os.path.join(temppath , os.path.basename(geneFile)+"_result.txt")
	filepath2=os.path.join(temppath , os.path.basename(geneFile)+"_result2.txt")

	with open(filepath, 'wb') as f:
		pickle.dump(final, f)
	with open(filepath2, 'wb') as f:
		pickle.dump(perfectMatchIdAllele2, f)
	return True
def main():
	print ("Starting script at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	try:
		input_file = sys.argv[1]
		temppath = sys.argv[2]
	except IndexError:
		print "usage: list_pickle_obj"

	argumentList=[]
	with open(input_file,'rb') as f:
		argumentList = pickle.load(f)
	
	geneFile = argumentList[0]
	genomesList = argumentList[1]
	#listOfCDSDicts = argumentList[2]
	
	basepath=os.path.join(temppath,os.path.splitext(geneFile)[0])
	if not os.path.exists(basepath):
			os.makedirs(basepath)
	#print geneFile
	gene_fp = HTSeq.FastaReader(geneFile)
	alleleI = 0
	#inverted=False
	#orderedAlleleNames=[]
	resultsList = []
	i = 0
	perfectMatchIdAllele=[]
	perfectMatchIdAllele2=[]
	allelescores=[]
	
	print ("Getting BSR at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	alleleI,allelescores,Gene_Blast_DB_name,alleleList=getBlastScoreRatios(geneFile,basepath)
	print ("Finished BSR at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	genome=-1	
	
	genomeDict = {}
	print ("starting allele call at: "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	for genomeFile in genomesList:
		print genomeFile
		bestmatch=[0,0,False,'',0] #score, score ratio, perfectmatch, key name of the DNA sequence string, allele ID
		currentGenomeDict={}
		currentCDSDict={}
		#currentCDSDict = listOfCDSDicts[i]
		
		filepath=os.path.join(temppath,str(os.path.basename(genomeFile))+"_ORF_Protein.txt")
		with open(filepath,'rb') as f:
			currentCDSDict = pickle.load(f)
		
		g_fp = HTSeq.FastaReader( genomeFile )
		for contig in g_fp:
			sequence=str(contig.seq)
			genomeDict[ contig.name ] = sequence
		
		currentGenomeDict = genomeDict
		#print currentGenomeDict
		#alleleI = 0
		#alleleProt=''
		#for allele in gene_fp: #new db for each allele to blast it against himself
		#	alleleI+=1
		#	alleleProt+=">"+str(alleleI)+"\n"+str(translateSeq(allele.seq)+"\n")
		#basepath="./blastdbs/temp"+str(os.path.basename(geneFile))
		#if not os.path.exists(basepath):
		#	os.makedirs(basepath)
		#with open(basepath+'/protein.fasta', "wb") as f:
		#	f.write(alleleProt)
		#Gene_Blast_DB_name = Create_Blastdb( basepath+'/protein.fasta', 1, True )
		genome+=1
		listOfCDS=currentCDSDict
		genomeProteinfastaPath=os.path.join(temppath,str(os.path.basename(genomeFile)+'_Protein.fasta'))
		
		
		print ("Blasting alleles on genome at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
		
		blast_out_file = os.path.join(basepath,"blastdbs/"+os.path.basename(geneFile)+ '_List.xml')

		#with open(basepath+'/proteinList.fasta', "wb") as f:
		#	f.write(protList)
		#Gene_Blast_DB_name = Create_Blastdb( './temp/proteinList.fasta', 1, True )
		cline = NcbiblastpCommandline(query=genomeProteinfastaPath, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
		#print cline
		#try:
		
		print ("Parse bsr blast at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
		
		blast_records = runBlastParser(cline, blast_out_file, genomeProteinfastaPath)
		#except:
		#	cline = NcbiblastpCommandline(query=genomeProteinfastaPath, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
		
		print ("Blasted alleles on genome at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
		
		for blast_record in blast_records:
				
			
			for alignment in blast_record.alignments:
				
				for match in alignment.hsps:
					#print blast_record.query
					#print match
					#print alleleI, len(allelescores)
					scoreRatio=float(match.score)/float(allelescores[int(alignment.hit_def)-1])
					#print scoreRatio
					#print alignment.hit_def
					cdsStrName=blast_record.query
					DNAstr=listOfCDS[">"+cdsStrName]

					AlleleDNAstr=alleleList[int(alignment.hit_def)-1]
					compare=False
					if DNAstr==AlleleDNAstr is False:
						try:
							DNAstr=reverseComplement(DNAstr)
							if DNAstr==AlleleDNAstr is False:
								pass
							else:
								compare=True
						except:
							pass
					else:
						compare=True
						
					if "N" in DNAstr or "K" in DNAstr or "R" in DNAstr:
						pass
						
					elif(scoreRatio == 1 and bestmatch[2] is False and compare is True):
						bestmatch=[match.score,scoreRatio,True,cdsStrName,int(alignment.hit_def),match,len(AlleleDNAstr),blast_record.query_letters]
						#print alignment
						#print match
					elif(scoreRatio == 1 and match.score>bestmatch[0] and compare is True):
						bestmatch=[match.score,scoreRatio,True,cdsStrName,int(alignment.hit_def),match,len(AlleleDNAstr),blast_record.query_letters]
						#print match
					elif(scoreRatio == 1 and bestmatch[2] is False and compare is False):
						bestmatch=[match.score,scoreRatio,False,cdsStrName,int(alignment.hit_def),match,len(AlleleDNAstr),blast_record.query_letters]
						#print alignment
						#print match
					
					elif(scoreRatio == 1 and match.score>bestmatch[0] and compare is False):
						bestmatch=[match.score,scoreRatio,False,cdsStrName,int(alignment.hit_def),match,len(AlleleDNAstr),blast_record.query_letters]
						#print match
					elif(match.score>bestmatch[0] and scoreRatio>0.6 and scoreRatio>bestmatch[1] and bestmatch[2] is False):
						#print match.query
						#print match.sbjct
						#print allelescores
						bestmatch=[match.score,scoreRatio,False,cdsStrName,int(alignment.hit_def),match,len(AlleleDNAstr),blast_record.query_letters]
						#print match
		#print bestmatch
		
		print ("Classifying the match at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))		
		if bestmatch[0]==0 or "N" in AlleleDNAstr or "K" in AlleleDNAstr or "R" in AlleleDNAstr :
					#if no best match was found
					
					###################
					# LOCUS NOT FOUND #
					###################
			if 	bestmatch[0]==0:		
				resultsList.append('LNF3:-1')            # append result to the list of results
				perfectMatchIdAllele.append('LNF')
				perfectMatchIdAllele2.append('LNF')
				#printinfo(genomeFile,geneFile)
				print "Locus not found, no matches \n"
			else:
				resultsList.append('LNFN:-1')            # append result to the list of results
				perfectMatchIdAllele.append('LNF')
				perfectMatchIdAllele2.append('LNF')
				#printinfo(genomeFile,geneFile)
				print "Locus has strange base (N, K or R) \n"
		
		
		elif bestmatch[2] is True:
			contigname=bestmatch[3]	
			
			contigname=contigname.split("&")
			matchLocation=contigname[2]	
			#matchLocation=matchLocation.split("-")
			contigname=contigname[0]	
			
			alleleStr=listOfCDS[">"+bestmatch[3]]
			protSeq,alleleStr,Reversed=translateSeq(alleleStr)
					#if a perfect match was found
					
						################################################
						# EXACT MATCH --- MATCH == GENE --- GENE FOUND #
						################################################
					
			perfectMatchIdAllele.append(str(bestmatch[4]))
			if not Reversed:
				perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation)+"&"+"+")
			else:
				perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation)+"&"+"-")
			resultsList.append('EXC:' + str(bestmatch[4]) )
			
		else:
			
			match=bestmatch[5]
			#print match
			geneLen=bestmatch[6]

			contigname=bestmatch[3]	
			#print contigname
			
			contigname=contigname.split("&")
			matchLocation=contigname[2]	
			matchLocation=matchLocation.split("-")
			contigname=contigname[0]
			
			seq=currentGenomeDict[ contigname ]
			bestMatchContigLen=len(seq)
			
			alleleStr=listOfCDS[">"+bestmatch[3]]
			protSeq,alleleStr,Reversed=translateSeq(alleleStr)
			
			
			print match
			print matchLocation
			print bestMatchContigLen
			# get extra space to the right and left between the contig and match 
			rightmatchContig=bestMatchContigLen-int(matchLocation[1])	
			leftmatchContig=int(matchLocation[0])
			
			if Reversed:
				aux=rightmatchContig
				rightmatchContig=leftmatchContig
				leftmatchContig=aux
			"""else:
				rightmatchContig=bestMatchContigLen-int(matchLocation[0])	
				leftmatchContig=int(matchLocation[1])"""
			
			print rightmatchContig,leftmatchContig
			
			
			# get extra space to the right and left between the allele and match
			
			rightmatchAllele=geneLen-(int(match.sbjct_end)*3)	
			leftmatchAllele=(int(match.sbjct_start)*3)
			"""if Reversed: 
				aux=rightmatchAllele
				rightmatchAllele=leftmatchAllele
				leftmatchAllele=aux"""
				
			"""else:
				rightmatchAllele=geneLen-(int(match.sbjct_start	)*3)
				leftmatchAllele=(int(match.sbjct_end)*3)"""
			
			#if a best match was found but it's not an exact match	

					###########################
					# LOCUS ON THE CONTIG TIP #
					###########################
			
			print rightmatchAllele, leftmatchAllele
			print geneLen
			
			
			
			#if bestMatchContigLen <= geneLen:
			if leftmatchContig<leftmatchAllele and 	rightmatchContig < rightmatchAllele:
			
				resultsList.append('LOTSC:-1')
				perfectMatchIdAllele.append('LOTSC')
				perfectMatchIdAllele2.append('LOTSC')
				#printinfo(genomeFile,geneFile)
				#print match.query_start
				print "Locus is bigger than the contig \n"
			
			#if match.query_start ==1 and len(match.query) < geneLen:		
			elif leftmatchContig<leftmatchAllele:
				
				
				resultsList.append('LOT3:-1')
				perfectMatchIdAllele.append('LOT3')
				perfectMatchIdAllele2.append('LOT3')
				
				print "Locus is on the 3' tip of the contig \n"
			
			
			#elif match.query_end == bestMatchContigLen and len(match.query) < geneLen:
			elif 	rightmatchContig < rightmatchAllele:
				
				resultsList.append('LOT5:-1')
				perfectMatchIdAllele.append('LOT5')
				perfectMatchIdAllele2.append('LOT5')

				print "Locus is on the 5' tip of the contig \n"
			
			
						
				
		
				
			else:
						#######################
						# ADD INFERRED ALLELE #		# a new allele 
						#######################
						
												
					#print "infered allele has location : "+(CDSType)
					#printinfo(genomeFile,geneFile) 
				tagAux='INF'
				perfectMatchIdAllele.append( tagAux +"-"+str(alleleI+1))
				#perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1]))
				
				if not Reversed:
					perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+")
				else:
					perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-")
				
				
				print "New allele! Adding allele "+ tagAux + str(alleleI+1) +" to the database\n"
																					
				resultsList.append( tagAux + str(alleleI+1) )

						#orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] +"_" +str(os.path.basename(genomeFile)))	
											# --- add the new allele to the gene fasta --- #
				
				
				appendAllele='>allele_' + str(alleleI+1) + '_' + tagAux[:-1] +"_" + str(os.path.basename(genomesList[genome])) + '\n'
				fG = open( geneFile, 'a' )
				#fG.write('>allele_' + str(alleleI+1) + '_' + tagAux[:-1] +"_" + str(os.path.basename(genomesList[genome])) + '\n')
				fG.write(appendAllele)
						#print alleleStr
					
				
				#print listOfCDS
				#alleleStr=listOfCDS[">"+bestmatch[3]]
				#match=bestmatch[5]
				#reverse the order if needed
				#if match.sbjct_start > match.sbjct_end: 
				#	alleleStr = reverseComplement(alleleStr)
				fG.write( alleleStr + '\n')
				fG.close()
				
				fG = open( os.path.join(basepath,str(os.path.basename(geneFile)+'_protein2.fasta')), 'w' )
				#fG.write('>allele_' + str(alleleI+1) + '_' + tagAux[:-1] +"_" + str(os.path.basename(genomesList[genome])) + '\n')
				fG.write('>'+str(alleleI+1)+'\n'+str(protSeq) + '\n')
				fG.close()
				fG = open( os.path.join(basepath,str(os.path.basename(geneFile)+'_protein.fasta')), 'a' )
				#fG.write('>allele_' + str(alleleI+1) + '_' + tagAux[:-1] +"_" + str(os.path.basename(genomesList[genome])) + '\n')
				fG.write('>'+str(alleleI+1)+'\n'+str(protSeq) + '\n')
						#print alleleStr
				fG.close()	
				
				#print listOfCDS
				#alleleStr=listOfCDS[">"+bestmatch[3]]
				match=bestmatch[5]
				
						#alleleI += 1
						# --- remake blast DB --- #
				alleleList.append(alleleStr)
				Gene_Blast_DB_name = Create_Blastdb( os.path.join(basepath,str(os.path.basename(geneFile)+'_protein.fasta')), 1, True )
				print os.path.join(basepath,str(os.path.basename(geneFile)+'_protein.fasta'))
				genefile2= os.path.join(basepath,str(os.path.basename(geneFile)+'_protein2.fasta'))
				Gene_Blast_DB_name2 = Create_Blastdb( genefile2, 1, True )
				print ("Re-calculating BSR at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
				alleleI,allelescores,alleleList=reDogetBlastScoreRatios(genefile2,basepath,alleleI,allelescores,Gene_Blast_DB_name2,alleleList)
				print allelescores
				print ("Done Re-calculating BSR at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	#x=y
	#shutil.rmtree(basepath)

	
	final =	(resultsList,perfectMatchIdAllele)	
	#return (resultsList)
	print ("Finished allele calling at : "+time.strftime("%H:%M:%S-%d/%m/%Y"))
	filepath=os.path.join(temppath , os.path.basename(geneFile)+"_result.txt")
	filepath2=os.path.join(temppath , os.path.basename(geneFile)+"_result2.txt")
	#print filepath
	with open(filepath, 'wb') as f:
		pickle.dump(final, f)
	with open(filepath2, 'wb') as f:
		pickle.dump(perfectMatchIdAllele2, f)
	shutil.rmtree(basepath)
	return True
def getBlastScoreRatios(genefile, basepath, doAll, verbose, blastPath):
    """Compute (or load from cache) the self-BLAST score of each allele.

    Every allele in ``genefile`` is translated with ``translateSeq``; each
    translated allele is written alone to ``<basename>_protein2.fasta``
    inside ``basepath`` and turned into a BLAST database with
    ``Create_Blastdb``. When ``doAll`` is true the protein is BLASTed against
    that single-sequence database and its best HSP score is stored under the
    allele's numeric ID; the resulting mapping is pickled to
    ``<abspath of genefile>_bsr.txt``. When ``doAll`` is false the mapping is
    read back from that pickle instead of running BLAST.

    All translated alleles are finally written together to
    ``<basename>_protein.fasta`` inside ``basepath``.

    Args:
        genefile: path to the FASTA file with the alleles of one locus. The
            text after the last "_" of each record ID (or the whole ID when
            it has no "_") must be an integer allele number.
        basepath: directory receiving the temporary protein FASTA files; the
            BLAST XML output goes to ``blastdbs/temp.xml`` below it.
        doAll: true to run BLAST and (re)write the score cache, false to
            load the previously written cache.
        verbose: true to print progress messages.
        blastPath: blastp executable handed to ``NcbiblastpCommandline``.

    Returns:
        A tuple ``(scores, alleleList, listAllelesNames)`` where ``scores``
        maps allele number to its self-BLAST score, ``alleleList`` holds the
        DNA string of every allele and ``listAllelesNames`` the matching
        record IDs (both in file order, untranslatable alleles included).

    Raises:
        IOError / OSError: if ``doAll`` is false and the cache file cannot
            be opened.
    """
    if verbose:

        def verboseprint(*args):
            # A single parenthesised expression prints the same way under
            # Python 2 and Python 3 (the old `print(arg),` / bare `print`
            # pair only behaved as intended on Python 2).
            print(" ".join(str(arg) for arg in args))
    else:
        verboseprint = lambda *a: None  # do-nothing function

    geneScorePickle = os.path.abspath(genefile) + '_bsr.txt'
    singleProteinPath = os.path.join(
        basepath, str(os.path.basename(genefile) + '_protein2.fasta'))

    alleleList = []
    listAllelesNames = []
    proteinRecords = []

    if doAll:
        alleleScoreById = {}
    else:
        # bsr had already been calculated: load it once, up front, so the
        # result is defined even when no allele turns out to be translatable
        with open(geneScorePickle, 'rb') as f:
            alleleScoreById = pickle.load(f)

    for allele in SeqIO.parse(genefile, "fasta", generic_dna):

        # usually first allele name is just >1 and after that it has
        # >gene_id_genome; in both cases the allele number is the last
        # "_"-separated field
        alleleI = int(allele.id.split("_")[-1])

        alleleList.append(str(allele.seq))
        listAllelesNames.append(allele.id)
        translatedSequence, _, _ = translateSeq(allele.seq)

        if translatedSequence == '':
            print("cannot translate allele on bsr calculation")
            continue

        alleleProt = (">" + str(alleleI) + "\n" + str(translatedSequence) +
                      "\n")
        proteinRecords.append(alleleProt)

        # new db for each allele to blast it against itself; the database is
        # built in both modes, exactly as before
        with open(singleProteinPath, "w") as f:
            f.write(alleleProt)
        Gene_Blast_DB_name = Create_Blastdb(singleProteinPath, 1, True)

        if not doAll:
            continue

        blast_out_file = os.path.join(basepath, 'blastdbs/temp.xml')
        verboseprint("Starting Blast alleles at : " +
                     time.strftime("%H:%M:%S-%d/%m/%Y"))

        # --- get BLAST score ratio --- #
        cline = NcbiblastpCommandline(cmd=blastPath,
                                      query=singleProteinPath,
                                      db=Gene_Blast_DB_name,
                                      evalue=0.001,
                                      out=blast_out_file,
                                      outfmt=5,
                                      num_threads=1)

        blast_records = runBlastParser(cline, blast_out_file)

        verboseprint("Blasted alleles at : " +
                     time.strftime("%H:%M:%S-%d/%m/%Y"))

        hspScores = [
            int(match.score) for blast_record in blast_records
            for alignment in blast_record.alignments
            for match in alignment.hsps
        ]

        # Key the score by this allele's own ID. Zipping a list of IDs with a
        # list of scores shifted every later score onto the wrong allele as
        # soon as one allele was untranslatable or produced several HSPs.
        if hspScores:
            alleleScoreById[alleleI] = max(hspScores)
        else:
            verboseprint("no self hit found for allele " + str(alleleI))

        verboseprint("________")

    if doAll:
        # write the cache only once every allele has been processed, so an
        # interrupted run cannot leave a partial cache that a later run
        # would load as if it were complete
        with open(geneScorePickle, 'wb') as f:
            pickle.dump(alleleScoreById, f)

    proteinfastaPath = os.path.join(
        basepath, str(os.path.basename(genefile) + '_protein.fasta'))
    with open(proteinfastaPath, "w") as f:
        f.write("".join(proteinRecords))

    # returning all allele BSR scores and list of alleles for this gene
    return alleleScoreById, alleleList, listAllelesNames
Example #29
0
def main():
    """Build a schema seed from an ffn file by dropping redundant loci.

    Command-line driven. Steps, in order:

    1. Parse the arguments (-i input ffn, -l minimum length, --cpu, -p
       protein file, -o output, -b blastp path, --bsr threshold, -v).
    2. Without -p: translate every gene with ``translateSeq``, discard
       untranslatable, duplicated and too-short ones, discard proteins that
       are substrings of an already kept (longer or equal) protein, and write
       the survivors to ``proteins.fasta`` next to the input file.
       With -p: use the given protein file as is and only index the DNA
       sequences of the input.
    3. BLAST the protein file against itself and, record by record, decide
       which loci to keep and which to remove using the BLAST score ratio
       (hit score / self score) against the --bsr threshold.
    4. Write the kept loci: one FASTA per locus in ``schema_seed`` (or in the
       -o directory), or a single concatenated file at -o when -p was given.
    5. Delete the temporary ``blastdbs`` directory and the BLAST XML output.

    Returns nothing; all results are files and console output.
    """
    parser = argparse.ArgumentParser(
        description=
        "Given an ffn file, recovers the genes that are not paralogs and have a size bigger than the g parameter provided"
    )
    parser.add_argument('-i',
                        nargs='?',
                        type=str,
                        help='ffn file',
                        required=True)
    parser.add_argument('-l',
                        nargs='?',
                        type=int,
                        help='int minimum length',
                        required=True)
    parser.add_argument(
        '--cpu',
        nargs='?',
        type=int,
        help="Number of cpus, if over the maximum uses maximum -2",
        required=False)
    parser.add_argument('-p',
                        nargs='?',
                        type=str,
                        help="file with protein",
                        required=False,
                        default=False)
    parser.add_argument('-o',
                        nargs='?',
                        type=str,
                        help="output filename",
                        required=False,
                        default=False)
    parser.add_argument('-b',
                        nargs='?',
                        type=str,
                        help="BLAST full path",
                        required=False,
                        default='blastp')
    parser.add_argument('--bsr',
                        nargs='?',
                        type=float,
                        help="minimum BSR similarity",
                        required=False,
                        default=0.6)
    parser.add_argument("-v",
                        "--verbose",
                        help="increase output verbosity",
                        dest='verbose',
                        action="store_true",
                        default=False)

    args = parser.parse_args()
    genes = args.i
    sizethresh = args.l
    cpuToUse = args.cpu
    proteinFIlePath = args.p
    outputFIlePath = args.o
    BlastpPath = args.b
    bsr = args.bsr
    verbose = args.verbose

    # verboseprint prints only when -v was given; otherwise it is a no-op.
    # NOTE(review): `print(arg),` followed by a bare `print` is Python 2
    # style; on Python 3 the bare `print` is an expression with no effect.
    if verbose:

        def verboseprint(*args):

            for arg in args:
                print(arg),
            print
    else:
        verboseprint = lambda *a: None

    # the start message is kept so it can be printed again at the very end
    starttime = "\nStarting Script at : " + time.strftime("%H:%M:%S-%d/%m/%Y")
    verboseprint("\nStarting Script at : " +
                 time.strftime("%H:%M:%S-%d/%m/%Y"))

    verboseprint("Checking Blast installed... " + str(which(BlastpPath)))

    # translate to protein and create new file
    # the protein file goes in the directory of the input file; note that
    # str.replace removes every occurrence of the file name from the path,
    # not only the trailing one
    abspath = os.path.abspath(genes)
    filename = os.path.basename(genes)
    abspath = abspath.replace(filename, '')
    proteinfile = os.path.join(abspath, 'proteins.fasta')

    geneDict = {}  # record name -> DNA sequence string
    protDict = {}  # FASTA header line (">id\n") -> protein sequence string
    orderedprotDict = collections.OrderedDict()  # protDict, longest protein first
    alreadyIn = []  # protein sequences already written
    totalgenes = 0
    repeatedgenes = 0
    smallgenes = 0
    nottranslatable = 0

    verboseprint("Checking translatability of the loci:\n")

    if not proteinFIlePath:
        # no protein file supplied: build one from the input genes
        with open(proteinfile, "w") as f:
            #g_fp = HTSeq.FastaReader(genes)

            for gene in SeqIO.parse(genes, "fasta", generic_dna):
                dnaseq = str(gene.seq)
                # translateSeq is defined elsewhere; presumably it returns the
                # protein, the DNA sequence as translated and a third value
                # that is unused here -- TODO confirm
                protseq, seq, y = translateSeq(dnaseq, gene.id)
                totalgenes += 1
                # a protein of length 0 or 1 counts as "not translatable"
                if len(protseq) > 1:

                    # exact duplicate of a protein that was already kept
                    if str(protseq) in alreadyIn:
                        repeatedgenes += 1

                    # shorter than the -l threshold
                    elif len(str(seq)) < sizethresh:
                        smallgenes += 1

                    else:
                        alreadyIn.append(str(protseq))
                        protname = ">" + str(gene.id) + "\n"

                        f.write(protname + str(protseq) + "\n")
                        protDict[protname] = str(protseq)
                        geneDict[str(gene.name)] = dnaseq
                else:
                    nottranslatable += 1
                    continue

            verboseprint(
                str(nottranslatable) + " not translatable out of " +
                str(totalgenes))

            verboseprint("\nChecking if repeated protein sequences:\n")

            # sort by protein length, longest first, so that the containment
            # check below always compares against proteins at least as long
            orderedprotList = []
            orderedprotList = sorted(protDict.items(),
                                     key=lambda x: len(x[1]),
                                     reverse=True)

            i = 0
            while i < len(orderedprotList):
                elem = orderedprotList[i]
                orderedprotDict[elem[0]] = elem[1]
                i += 1

        verboseprint(
            str(repeatedgenes) + " repeated loci out of " + str(totalgenes))
        verboseprint(
            str(smallgenes) + " loci out of " + str(totalgenes) +
            " smaller than " + str(sizethresh) + "bp")
        verboseprint("\nprotein file created\n")

        # first step -  remove genes contained in other genes or 100% equal genes

        # NOTE(review): resultsList is assigned here but not used again in
        # this function
        resultsList = []

        auxDict = {}  # protein sequence -> FASTA header line
        #g_fp = HTSeq.FastaReader(proteinfile)
        g = 0  # number of proteins found inside another protein
        j = 0  # number of proteins examined

        verboseprint(
            "Checking if protein sequences are contained in others...")

        # for each gene from all the annotated genes - starting with an empty dictionary, only add a new gene if the "to be added gene" is not contained or equal to a gene already added to the dictionary
        auxprot = []

        for elem in orderedprotDict.items():

            contained = False

            prot = str(elem[1])
            # substring test against every protein kept so far
            if any(prot in x for x in auxprot):
                g += 1
                contained = True

            else:
                auxDict[elem[1]] = elem[0]
                auxprot.append(str(elem[1]))

            j += 1
        verboseprint(str(g) + " loci are contained in other genes\n")

        # overwrite the original file, obtaining a new file with unique genes

        with open(proteinfile, "w") as f:
            allsequences = ''
            # v is the header line (already ends with "\n"), k the protein
            for k, v in auxDict.items():
                allsequences += v + k + "\n"
            f.write(allsequences)

    else:
        # a protein file was supplied: skip translation and filtering, and
        # only index the DNA sequence of every input record

        proteinfile = proteinFIlePath
        totalgenes = 0
        smallgenes = 0
        #g_fp = HTSeq.FastaReader(genes)
        proteinfile = proteinFIlePath
        for gene in SeqIO.parse(genes, "fasta", generic_dna):
            #for gene in g_fp:
            dnaseq = str(gene.seq)

            protname = ">" + str(gene.id) + "\n"
            # protDict[protname] = str(protseq)
            geneDict[str(gene.name)] = dnaseq

    verboseprint("Starting Blast")
    # print "Blasting the total of "+ str(len(auxDict.keys())) + " loci"

    # build a BLAST database from the protein file; the same file is then
    # used as the query, i.e. an all-against-all comparison
    geneFile = os.path.abspath(proteinfile)
    Gene_Blast_DB_name = Create_Blastdb(geneFile, 1, True)

    # the XML output sits next to the protein file, same base name
    geneF = os.path.splitext(geneFile)[0]
    blast_out_file = geneF + '.xml'
    # ------------------------------ RUNNING BLAST ------------------------------ #
    # the two command lines differ only in num_threads (--cpu value, or 1)
    if cpuToUse:
        cline = NcbiblastpCommandline(cmd=BlastpPath,
                                      query=geneFile,
                                      db=Gene_Blast_DB_name,
                                      evalue=0.001,
                                      out=blast_out_file,
                                      outfmt=5,
                                      num_threads=int(cpuToUse))
    else:
        cline = NcbiblastpCommandline(cmd=BlastpPath,
                                      query=geneFile,
                                      db=Gene_Blast_DB_name,
                                      evalue=0.001,
                                      out=blast_out_file,
                                      outfmt=5,
                                      num_threads=1)
    blast_records = runBlastParser(cline, blast_out_file)
    verboseprint("Finished blast")

    toRemove = []  # loci rejected as too similar to another locus
    genesToKeep = []  # loci currently selected (may hold duplicates until the set() below)
    # one "removed<TAB>cause<TAB>explanation" line per rejection; the code
    # that would write it to a file is commented out further down
    log = ["removed\tcause\texplanation"]
    for blast_record in blast_records:

        # the locus name is the first whitespace-separated token of the query
        allelename = blast_record.query
        allelename = allelename.split(" ")
        allelename = allelename[0]
        # this lookup is outside the try block, so a query name missing from
        # geneDict raises KeyError and aborts the script
        alleleLength = len(geneDict[allelename])

        # Control flow note: inside this try block a bare `raise` is used to
        # jump to the next BLAST record. With no exception being handled, a
        # bare `raise` itself raises an error, which the catch-all
        # `except Exception` at the bottom swallows. That handler also hides
        # any genuine error raised in here (IndexError, KeyError,
        # ZeroDivisionError, ValueError from list.remove, ...).
        try:

            # if gene A is not on the toRemove list yet, add to genesToKeep list

            if str(blast_record.query) not in toRemove:
                genesToKeep.append(blast_record.query)

                i = 0
                # if first alignment is not against self, gene B is bigger than gene A and very similar - remove gene A from genesToKeep and add gene B instead
                if not str(blast_record.query) == str(
                    (blast_record.alignments[0]).hit_def):
                    genesToKeep.remove(str(blast_record.query))
                    toRemove.append(str(blast_record.query))
                    log.append(
                        str(blast_record.query) + "\t" +
                        str((blast_record.alignments[0]).hit_def) + "\t" +
                        "2 is first best match")

                    # if gene B is not on the toRemove list, add to genesToKeep list
                    if str(
                        (blast_record.alignments[0]).hit_def) not in toRemove:
                        genesToKeep.append(
                            str((blast_record.alignments[0]).hit_def))

                    # done with this record (see control flow note above)
                    raise

                # first hit is the query itself: its first HSP score is the
                # denominator of every score ratio below
                selfblastscore = (((blast_record.alignments[0]).hsps)[0]).score

                while i < len(blast_record.alignments):
                    align = blast_record.alignments[i]

                    # only the first HSP of each hit is considered
                    match = (align.hsps)[0]
                    scoreRatio = float(match.score) / float(selfblastscore)

                    alleleLength2 = len(geneDict[str(align.hit_def)])

                    # if good match and gene B not in toremove list
                    if (scoreRatio > bsr and
                            not str(align.hit_def) == str(blast_record.query)
                            and str(align.hit_def) not in toRemove):

                        # if gene B is bigger than gene A, keep bigger gene B
                        if alleleLength2 > alleleLength:
                            genesToKeep.append(str(align.hit_def))
                            genesToKeep.remove(str(blast_record.query))
                            toRemove.append(str(blast_record.query))
                            log.append(
                                str(blast_record.query) + "\t" +
                                str(align.hit_def) + "\t" +
                                "2 is bigger and bsr >" + str(bsr))

                            # gene A was replaced; stop examining its hits
                            raise
                        # else add gene B to toremove list
                        # (gene B is not longer than gene A here, although
                        # the log text below is the same as in the branch
                        # above)
                        elif str(align.hit_def) in genesToKeep:
                            genesToKeep.remove(str(align.hit_def))
                            toRemove.append(str(align.hit_def))
                            log.append(
                                str(align.hit_def) + "\t" +
                                str(blast_record.query) + "\t" +
                                "2 is bigger and bsr >" + str(bsr))

                    i += 1

            # else gene A is on toRemove list, add all similar genes (not in genesToKeep) list to the toRemove list
            else:

                i = 0
                selfblastscore = 0
                # NOTE(review): this loop raises (skipping the record) on the
                # first hit that is NOT the query itself, so the while loop
                # below is only reached when every hit is the query; in that
                # case selfblastscore is still 0 and the division raises too.
                # As written the while loop never appends to toRemove --
                # the condition may have been meant to be `==`; verify intent.
                for align in blast_record.alignments:
                    if not (str(align.hit_def) == str(blast_record.query)):
                        selfblastscore = ((align.hsps)[0]).score
                        # print "gene "+str(align.hit_def)+" is larger than gene "+str(blast_record.query)
                        raise

                while i < len(blast_record.alignments):
                    align = blast_record.alignments[i]
                    match = (align.hsps)[0]
                    scoreRatio = float(match.score) / float(selfblastscore)

                    if align.hit_def not in genesToKeep and not str(
                            align.hit_def) == str(
                                blast_record.query) and scoreRatio > bsr:
                        toRemove.append(align.hit_def)
                        log.append(
                            str(align.hit_def) + "\t" +
                            str(blast_record.query) + "\t" +
                            "2 was on the removed list and bsr >" + str(bsr))

                    else:
                        pass

                    i += 1

        except Exception as e:
            # swallows both the deliberate bare-raise jumps and real errors
            # print e
            pass

    # de-duplicate both lists
    genesToKeep = list(set(genesToKeep))
    toRemove = list(set(toRemove))
    s = set(toRemove)
    # NOTE(review): notcommonToKeep is computed but not used afterwards
    notcommonToKeep = [x for x in genesToKeep if x not in s]

    pathfiles = os.path.dirname(geneFile)
    pathfiles = pathfiles + "/"
    listfiles = []  # per-locus FASTA files written below, passed to get_Short

    #g_fp = HTSeq.FastaReader(genes)
    removedparalogs = 0
    removedsize = 0
    totalgenes = 0
    rest = 0  # number of loci that made it into the schema
    concatenatedFile = ''
    schema_folder_path = os.path.join(pathfiles, 'schema_seed')

    # create the output directory: the default schema_seed folder when
    # neither -p nor -o was given, or the -o path when only -o was given
    # NOTE(review): os.makedirs(outputFIlePath) raises if that directory
    # already exists -- confirm this is acceptable
    if not os.path.exists(
            schema_folder_path) and not proteinFIlePath and not outputFIlePath:
        os.makedirs(schema_folder_path)
    elif not proteinFIlePath and outputFIlePath:
        os.makedirs(outputFIlePath)

    # second pass over the input: write out every locus that was kept
    for contig in SeqIO.parse(genes, "fasta", generic_dna):
        totalgenes += 1
        #name = contig.name + " " + contig.descr
        name2 = contig.id

        # print name2
        if name2 not in toRemove and name2 in genesToKeep:
            # strictly greater here, whereas the translation step above only
            # rejected sequences strictly shorter than the threshold
            if int(len(contig.seq)) > sizethresh:
                # make the record name usable as a file name: "|" and "_"
                # both end up as "-", and ( ) ' " : are dropped
                namefile = contig.name
                namefile = namefile.replace("|", "_")
                namefile = namefile.replace("_", "-")
                namefile = namefile.replace("(", "")
                namefile = namefile.replace(")", "")
                namefile = namefile.replace("'", "")
                namefile = namefile.replace("\"", "")
                namefile = namefile.replace(":", "")

                # no -p and no -o: one FASTA per locus in schema_seed, with
                # the sequence stored as allele "<name>_1"
                if not proteinFIlePath and not outputFIlePath:
                    newFile = os.path.join(schema_folder_path,
                                           namefile + ".fasta")
                    listfiles.append(newFile)
                    with open(newFile, "w") as f:
                        f.write(">" + namefile + "_1\n" + str(contig.seq) +
                                "\n")
                # no -p but -o: same, inside the -o directory
                elif not proteinFIlePath and outputFIlePath:
                    newFile = os.path.join(outputFIlePath, namefile + ".fasta")
                    listfiles.append(newFile)
                    with open(newFile, "w") as f:
                        f.write(">" + namefile + "_1\n" + str(contig.seq) +
                                "\n")
                # -p given: accumulate everything into one FASTA string
                else:
                    concatenatedFile += ">" + contig.id + " \n" + str(
                        contig.seq) + "\n"

                rest += 1

            else:
                removedsize += 1
        else:

            removedparalogs += 1

    # -p and -o: write the concatenated FASTA to the -o path
    if proteinFIlePath and outputFIlePath:
        with open(outputFIlePath, "w") as f:
            f.write(concatenatedFile)
    # only -o: hand the per-locus files to get_Short (defined elsewhere) and
    # delete the generated protein file
    elif not proteinFIlePath and outputFIlePath:
        get_Short(listfiles)
        verboseprint("\nRemoved " + str(removedparalogs) +
                     " with a high similarity (BSR>" + str(bsr) + ")")
        print("Total of " + str(rest) + " loci that constitute the schema")
        os.remove(proteinfile)

    # create short folder
    # NOTE(review): this branch also runs when -p is given without -o; then
    # listfiles is empty, the concatenated sequences are never written, and
    # os.remove(proteinfile) deletes the user-supplied protein file --
    # confirm this is intended
    else:
        # ~ with open("schemacreation.log", "wb") as f:
        # ~ for elem in log:
        # ~ f.write(str(elem)+"\n")
        get_Short(listfiles)
        verboseprint("\nRemoved " + str(removedparalogs) +
                     " with a high similarity (BSR>" + str(bsr) + ")")
        print("Total of " + str(rest) + " loci that constitute the schema")
        os.remove(proteinfile)

    # clean up the BLAST database directory next to the protein file
    # (presumably created by Create_Blastdb -- confirm) and the XML output
    shutil.rmtree(os.path.join(pathfiles, 'blastdbs'))

    os.remove(blast_out_file)

    verboseprint(starttime)
    verboseprint("Finished Script at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
def main():
    try:
        input_file = sys.argv[1]
        temppath = sys.argv[2]
        blastPath = sys.argv[3]
        verbose = sys.argv[4]
        bsrTresh = sys.argv[5]

        if verbose == 'True':
            verbose = True
        else:
            verbose = False

    except IndexError:
        print(
            "Error starting the callAlleleles_protein3 script. usage: list_pickle_obj"
        )

    bsrTresh = float(bsrTresh)

    argumentList = []
    with open(input_file, 'rb') as f:
        argumentList = pickle.load(f)

    if verbose:

        def verboseprint(*args):

            for arg in args:
                print(arg),
            print
    else:
        verboseprint = lambda *a: None

    geneFile = argumentList[0]

    verboseprint("Using gene: " + str(geneFile))
    shortgeneFile = os.path.join(os.path.dirname(argumentList[0]), "short",
                                 os.path.basename(argumentList[0]))
    shortgeneFile = shortgeneFile.replace(".fasta", "_short.fasta")
    genomesList = argumentList[1]
    genesList = argumentList[2]

    newListgenes = []
    with open(genesList, 'r') as gene_fp:
        for gene in gene_fp:
            gene = gene.rstrip('\n')
            gene = gene.rstrip('\r')
            newListgenes.append(gene)

    statusbar = float(newListgenes.index(str(geneFile))) / len(newListgenes)
    locusnumber = (newListgenes.index(str(geneFile)))
    totalocusnumber = len(newListgenes)
    basepath = os.path.join(temppath, os.path.splitext(geneFile)[0])

    print("\rProcessing " + os.path.basename(geneFile) + ". Start " +
          time.strftime("%H:%M:%S-%d/%m/%Y") + " Locus " + str(locusnumber) +
          " of " + str(totalocusnumber) + ". Done " +
          str(int(statusbar * 100)) + "%.",
          end="")

    if not os.path.exists(basepath):
        os.makedirs(basepath)

    #gene_fp = HTSeq.FastaReader(geneFile)

    fullAlleleList = []
    fullAlleleNameList = []
    alleleI = 0
    # get full list of alleles from main gene file and last allele number id
    for allele in SeqIO.parse(geneFile, "fasta", generic_dna):
        aux = allele.id.split("_")
        if len(aux) < 2:
            alleleI = int(aux[0])
        else:
            alleleI = int(aux[-1])
        fullAlleleList.append(str(allele.seq))
        fullAlleleNameList.append(allele.id)

    resultsList = []
    i = 0
    perfectMatchIdAllele = []
    perfectMatchIdAllele2 = []
    allelescores = []
    listShortAllelesNames = []

    verboseprint("Getting BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    geneScorePickle = os.path.abspath(shortgeneFile) + '_bsr.txt'

    # check if bsr as arealdy been calculated and recalculate it if necessary

    if os.path.isfile(geneScorePickle):
        allelescores, alleleList, listShortAllelesNames = getBlastScoreRatios(
            shortgeneFile, basepath, False, verbose, blastPath)

    else:
        allelescores, alleleList, listShortAllelesNames = getBlastScoreRatios(
            shortgeneFile, basepath, True, verbose, blastPath)

    verboseprint("Finished BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    verboseprint("starting allele call blast at: " +
                 time.strftime("%H:%M:%S-%d/%m/%Y"))
    for genomeFile in genomesList:
        verboseprint(genomeFile)
        bestmatch = [
            0, 0, False, '', 0
        ]  # score, score ratio, perfectmatch, key name of the DNA sequence string, allele ID
        currentGenomeDict = {}
        currentCDSDict = {}

        # load the CDS from the genome to a dictionary
        filepath = os.path.join(
            temppath,
            str(os.path.basename(genomeFile)) + "_ORF_Protein.txt")

        with open(filepath, 'rb') as f:
            currentCDSDict = pickle.load(f)

        try:
            intersection = set(fullAlleleList).intersection(
                currentCDSDict.values())
            intersection = list(intersection)

            if len(intersection) > 1:
                perfectMatchIdAllele.append('NIPHEM')
                perfectMatchIdAllele2.append('NIPHEM')
                verboseprint(
                    os.path.basename(genomeFile) + " has " +
                    str(len(intersection)) + " multiple exact match : " +
                    os.path.basename(geneFile) +
                    " MULTIPLE ALLELES as EXACT MATCH")
                raise ValueError("MULTIPLE ALLELES as EXACT MATCH")

            elif len(intersection) == 1:
                alleleStr = intersection[0]
                # it doenst return both keys with equal values
                # ~ elem=currentCDSDict.keys()[currentCDSDict.values().index(alleleStr)]

                elem = [
                    key for key, value in currentCDSDict.items()
                    if value == alleleStr
                ]
                if len(elem) > 1:
                    perfectMatchIdAllele.append('NIPHEM')
                    perfectMatchIdAllele2.append('NIPHEM')
                    verboseprint(
                        os.path.basename(genomeFile) + " has " +
                        str(len(intersection)) + " multiple exact match : " +
                        os.path.basename(geneFile) +
                        " MULTIPLE ALLELES as EXACT MATCH")
                    raise ValueError("MULTIPLE ALLELES as EXACT MATCH")

                contigname = elem[0].split("&")
                matchLocation = contigname[2]
                # starting CDS base need to be +1
                matchLocation = matchLocation.split("-")
                matchLocation = [
                    int(matchLocation[0]) + 1,
                    int(matchLocation[1])
                ]
                contigname = (contigname[0]).replace(">", "")
                alleleName = ''
                alleleMatchid = 0

                alleleName = fullAlleleNameList[fullAlleleList.index(
                    alleleStr)]
                alleleMatchid = int((alleleName.split("_"))[-1])
                perfectMatchIdAllele.append(str(alleleMatchid))

                if matchLocation[0] > matchLocation[1]:
                    perfectMatchIdAllele2.append(
                        str(contigname) + "&" + str(matchLocation[0]) + "-" +
                        str(matchLocation[1]) + "&" + "-")
                else:

                    perfectMatchIdAllele2.append(
                        str(contigname) + "&" + str(matchLocation[0]) + "-" +
                        str(matchLocation[1]) + "&" + "+")

                # check if atributed allele is contained or contains
                try:
                    containedInfo = (alleleName.split("_"))[1]
                except:
                    containedInfo = ''
                if containedInfo == "CD":
                    resultsList.append([(os.path.basename(genomeFile)),
                                        str(alleleMatchid),
                                        containedInfo.rstrip()])
                elif containedInfo == "CS":
                    resultsList.append([(os.path.basename(genomeFile)),
                                        str(alleleMatchid),
                                        containedInfo.rstrip()])
                else:
                    pass

                raise ValueError("EQUAL")
        except Exception as e:
            # ~ exc_type, exc_obj, exc_tb = sys.exc_info()
            # ~ fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            # ~ print(exc_tb.tb_lineno)
            # ~ print e
            continue

        else:
            verboseprint("Blasting alleles on genome at : " +
                         time.strftime("%H:%M:%S-%d/%m/%Y"))

            blast_out_file = os.path.join(
                basepath,
                "blastdbs/" + os.path.basename(geneFile) + '_List.xml')

            Gene_Blast_DB_name = os.path.join(
                temppath,
                str(os.path.basename(genomeFile)) + "/" +
                str(os.path.basename(genomeFile)) + "_db")

            proteinfastaPath = os.path.join(
                basepath,
                str(os.path.basename(shortgeneFile) + '_protein.fasta'))

            # blast the genome CDS against the translated locus
            # cline = NcbiblastpCommandline(query=proteinfastaPath, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5,max_target_seqs=10,max_hsps_per_subject=10)
            # 2.2.28 up
            cline = NcbiblastpCommandline(cmd=blastPath,
                                          query=proteinfastaPath,
                                          db=Gene_Blast_DB_name,
                                          evalue=0.001,
                                          out=blast_out_file,
                                          outfmt=5,
                                          max_target_seqs=10,
                                          max_hsps=10,
                                          num_threads=1)

            blast_records = runBlastParser(cline, blast_out_file)
            verboseprint("Blasted alleles on genome at : " +
                         time.strftime("%H:%M:%S-%d/%m/%Y"))

            alleleSizes = []
            for allele in fullAlleleList:
                alleleSizes.append(len(allele))

            biggestSizeAllele = max(alleleSizes)

            # get mode allele size
            moda = max(set(alleleSizes), key=alleleSizes.count)
            contador = Counter(alleleSizes).most_common()

            # if most common allele size appears 1 time, get first allele size
            if (contador[0])[1] == 1:
                moda = alleleSizes[0]

            try:

                # iterate through the blast results
                for blast_record in blast_records:

                    locationcontigs = []

                    for alignment in blast_record.alignments:

                        # select the best match
                        for match in alignment.hsps:

                            # query id comes with query_id, not name of the allele
                            alleleMatchid = int(
                                (blast_record.query_id.split("_"))[-1])

                            # ~ scoreRatio=float(match.score)/float(allelescores[int(alleleMatchid)-1])
                            # query_id starts with 1
                            alleleMatchid2 = ((
                                listShortAllelesNames[alleleMatchid -
                                                      1]).split("_"))[-1]
                            scoreRatio = float(match.score) / float(
                                allelescores[int(alleleMatchid2)])

                            cdsStrName = (alignment.title.split(" "))[1]

                            #DNAstr = str(currentCDSDict[">" + cdsStrName])

                            AlleleDNAstr = alleleList[int(alleleMatchid) - 1]

                            if scoreRatio >= bsrTresh:
                                locationcontigs.append(cdsStrName)

                            # select the best match from BLAST results

                            if scoreRatio == 1 and match.score > bestmatch[0]:
                                bestmatch = [
                                    match.score, scoreRatio, False, cdsStrName,
                                    int(alleleMatchid), match,
                                    len(AlleleDNAstr)
                                ]

                            elif (match.score > bestmatch[0]
                                  and scoreRatio >= bsrTresh
                                  and scoreRatio > bestmatch[1]
                                  and bestmatch[2] is False):
                                bestmatch = [
                                    match.score, scoreRatio, False, cdsStrName,
                                    int(alleleMatchid), match,
                                    len(AlleleDNAstr)
                                ]

                verboseprint("Classifying the match at : " +
                             time.strftime("%H:%M:%S-%d/%m/%Y"))

                # if no best match was found it's a Locus Not Found

                # check for ambiguious bases
                if not bestmatch[0] == 0:
                    alleleStr = currentCDSDict[">" + bestmatch[3]]
                    listFoundAmbiguities = []
                    listambiguousBases = [
                        'K', 'M', 'R', 'Y', 'S', 'W', 'B', 'V', 'H', 'D', 'X',
                        'N', '-', '.'
                    ]
                    listFoundAmbiguities = [
                        e for e in listambiguousBases if e in alleleStr
                    ]

                if bestmatch[0] == 0 or len(listFoundAmbiguities) > 0:

                    ###################
                    # LOCUS NOT FOUND #
                    ###################
                    if bestmatch[0] == 0:
                        perfectMatchIdAllele.append('LNF')
                        perfectMatchIdAllele2.append('LNF')
                        verboseprint("Locus not found, no matches \n")
                    else:

                        perfectMatchIdAllele.append('LNF')
                        perfectMatchIdAllele2.append('LNF')
                        verboseprint("Locus has strange base \n")

                # if more than one BSR >0.6 in two different CDSs it's a Non Paralog Locus
                elif len(list(set(locationcontigs))) > 1:
                    verboseprint("NIPH", "")
                    perfectMatchIdAllele.append('NIPH')
                    perfectMatchIdAllele2.append('NIPH')
                    for elem in locationcontigs:
                        verboseprint(elem)

                # if match with BSR >0.6 and not equal DNA sequences
                else:

                    # load the contig info of the genome to a dictionary
                    #g_fp = HTSeq.FastaReader(genomeFile)
                    for contig in SeqIO.parse(genomeFile, "fasta",
                                              generic_dna):
                        currentGenomeDict[contig.id] = len(str(contig.seq))

                    match = bestmatch[5]
                    geneLen = bestmatch[6]
                    alleleStr = currentCDSDict[">" + bestmatch[3]]
                    contigname = bestmatch[3]

                    contigname = contigname.split("&")
                    matchLocation = contigname[2]
                    matchLocation = matchLocation.split("-")
                    matchLocation = [
                        int(matchLocation[0]) + 1, matchLocation[1]
                    ]
                    contigname = contigname[0]

                    bestMatchContigLen = currentGenomeDict[contigname]

                    protSeq, alleleStr, Reversed = translateSeq(alleleStr)

                    # get extra space to the right and left between the allele and match and check if it's still inside the contig

                    rightmatchAllele = geneLen - (
                        (int(match.query_end) + 1) * 3)
                    leftmatchAllele = ((int(match.query_start) - 1) * 3)

                    # ~ if Reversed swap left and right contig extra
                    if int(matchLocation[1]) < int(matchLocation[0]):
                        rightmatchContig = bestMatchContigLen - int(
                            matchLocation[0])
                        leftmatchContig = int(matchLocation[1])
                        aux = rightmatchAllele
                        rightmatchAllele = leftmatchAllele
                        leftmatchAllele = aux

                    else:
                        rightmatchContig = bestMatchContigLen - int(
                            matchLocation[1])
                        leftmatchContig = int(matchLocation[0])

                    ###########################
                    # LOCUS ON THE CONTIG TIP #
                    ###########################

                    # check if contig is smaller than the matched allele
                    if leftmatchContig < leftmatchAllele and rightmatchContig < rightmatchAllele:

                        # ~ resultsList.append('PLOTSC:-1')
                        perfectMatchIdAllele.append('LOTSC')
                        perfectMatchIdAllele2.append('LOTSC')
                        # ~ if not Reversed:
                        # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+")
                        # ~ else:
                        # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-")

                        verboseprint(match, contigname, geneFile,
                                     leftmatchAllele, rightmatchAllele,
                                     "Locus is bigger than the contig \n")

                    elif leftmatchContig < leftmatchAllele:

                        # ~ resultsList.append('PLOT3:-1')
                        perfectMatchIdAllele.append('PLOT3')
                        perfectMatchIdAllele2.append('PLOT3')
                        # ~ if not Reversed:
                        # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+")
                        # ~ else:
                        # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-")

                        verboseprint(
                            match, contigname, geneFile, leftmatchAllele,
                            rightmatchAllele,
                            "Locus is on the 3' tip of the contig \n")

                    elif rightmatchContig < rightmatchAllele:

                        # ~ resultsList.append('PLOT5:-1')
                        perfectMatchIdAllele.append('PLOT5')
                        perfectMatchIdAllele2.append('PLOT5')
                        # ~ if not Reversed:
                        # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+")
                        # ~ else:
                        # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-")

                        verboseprint(
                            match, contigname, geneFile, leftmatchAllele,
                            rightmatchAllele,
                            "Locus is on the 5' tip of the contig \n")

                    elif float(len(alleleStr)) > moda + (moda * 0.2):

                        verboseprint("Locus is larger than mode", moda,
                                     alleleStr)

                        # ~ resultsList.append('ALM')
                        perfectMatchIdAllele.append('ALM')
                        perfectMatchIdAllele2.append('ALM')
                    # ~ if not Reversed:
                    # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+")
                    # ~ else:
                    # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-")

                    elif float(len(alleleStr)) < moda - (moda * 0.2):

                        verboseprint("Locus is smaller than mode", moda,
                                     alleleStr)

                        # ~ resultsList.append('ASM')
                        perfectMatchIdAllele.append('ASM')
                        perfectMatchIdAllele2.append('ASM')
                    # ~ if not Reversed:
                    # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+")
                    # ~ else:
                    # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-")

                    else:
                        #######################
                        # ADD INFERRED ALLELE #		# a new allele
                        #######################

                        wasContained = False
                        tagAuxC = 'S'
                        for alleleaux in fullAlleleList:

                            if alleleStr in alleleaux:
                                alleleName = fullAlleleNameList[
                                    fullAlleleList.index(alleleaux)]
                                alleleMatchid = (alleleName.split("_"))[-1]
                                tagAuxC = 'CD' + alleleMatchid.rstrip()
                                resultsList.append([
                                    (os.path.basename(genomeFile)),
                                    str(alleleI + 1), tagAuxC
                                ])
                                break
                            elif alleleaux in alleleStr:
                                alleleName = fullAlleleNameList[
                                    fullAlleleList.index(alleleaux)]
                                alleleMatchid = (alleleName.split("_"))[-1]
                                tagAuxC = 'CS' + alleleMatchid.rstrip()
                                resultsList.append([
                                    (os.path.basename(genomeFile)),
                                    str(alleleI + 1), tagAuxC
                                ])
                                break

                        if not wasContained:
                            tagAux = 'INF'

                            perfectMatchIdAllele.append(tagAux + "-" +
                                                        str(alleleI + 1))

                            if not Reversed:
                                perfectMatchIdAllele2.append(
                                    str(contigname) + "&" +
                                    str(matchLocation[0]) + "-" +
                                    str(matchLocation[1]) + "&" + "+")
                            else:
                                perfectMatchIdAllele2.append(
                                    str(contigname) + "&" +
                                    str(matchLocation[0]) + "-" +
                                    str(matchLocation[1]) + "&" + "-")

                            verboseprint("New allele! Adding allele " +
                                         tagAux + str(alleleI + 1) +
                                         " to the database\n")

                            # --- add the new allele to the gene fasta --- #

                            alleleI += 1
                            appendAllele = '>' + str((
                                ((os.path.basename(geneFile)).split("."))[0]
                            ).replace("_", "-")) + "_" + tagAuxC + "_" + (str(
                                os.path.basename(genomeFile))).replace(
                                    "_", "-") + "_" + str(alleleI) + '\n'
                            fG = open(geneFile, 'a')
                            fG.write(appendAllele)
                            fG.write(alleleStr + '\n')
                            fG.close()
                            fullAlleleList.append(alleleStr)
                            fullAlleleNameList.append(appendAllele)

                            if bestmatch[1] >= int(bsrTresh) and float(
                                    bestmatch[1]) < int(bsrTresh) + 0.1:
                                fG = open(shortgeneFile, 'a')
                                fG.write(appendAllele)
                                fG.write(alleleStr + '\n')
                                fG.close()

                                geneTransalatedPath2 = os.path.join(
                                    basepath,
                                    str(
                                        os.path.basename(shortgeneFile) +
                                        '_protein2.fasta'))
                                geneTransalatedPath = os.path.join(
                                    basepath,
                                    str(
                                        os.path.basename(shortgeneFile) +
                                        '_protein.fasta'))

                                with open(geneTransalatedPath2, 'w') as fG:
                                    fG.write('>' + str(alleleI) + '\n' +
                                             str(protSeq) + '\n')
                                with open(geneTransalatedPath, 'a') as fG:
                                    fG.write('>' + str(alleleI) + '\n' +
                                             str(protSeq) + '\n')

                                match = bestmatch[5]

                                # --- remake blast DB and recalculate the BSR for the locus --- #
                                alleleList.append(alleleStr)
                                listShortAllelesNames.append(appendAllele)

                                genefile2 = geneTransalatedPath2
                                Gene_Blast_DB_name2 = Create_Blastdb(
                                    genefile2, 1, True)
                                verboseprint(
                                    "Re-calculating BSR at : " +
                                    time.strftime("%H:%M:%S-%d/%m/%Y"))
                                allelescores, alleleList, listShortAllelesNames = reDogetBlastScoreRatios(
                                    genefile2, basepath, alleleI, allelescores,
                                    Gene_Blast_DB_name2, alleleList,
                                    geneScorePickle, verbose, blastPath,
                                    listShortAllelesNames)
                                verboseprint(
                                    "Done Re-calculating BSR at : " +
                                    time.strftime("%H:%M:%S-%d/%m/%Y"))

            except Exception as e:
                print("some error occurred")
                print(e)
                print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno))
                perfectMatchIdAllele2.append("ERROR")
                perfectMatchIdAllele.append("ERROR")

    final = (resultsList, perfectMatchIdAllele)
    verboseprint("Finished allele calling at : " +
                 time.strftime("%H:%M:%S-%d/%m/%Y"))
    filepath = os.path.join(temppath,
                            os.path.basename(geneFile) + "_result.txt")
    filepath2 = os.path.join(temppath,
                             os.path.basename(geneFile) + "_result2.txt")
    with open(filepath, 'wb') as f:
        pickle.dump(final, f)
    with open(filepath2, 'wb') as f:
        pickle.dump(perfectMatchIdAllele2, f)
    shutil.rmtree(basepath)
    return True
Example #31
0
def _translate_unique_genes(genes, out_handle=None, min_protein_len=67):
    """Translate every record of a FASTA file and keep the unique proteins.

    Each record is translated with translateSeq. A translation is kept only
    when it has not been seen before and its length is at least
    ``min_protein_len``.

    Args:
        genes: path of the nucleotide FASTA file (read with
            HTSeq.FastaReader).
        out_handle: optional writable file object; every kept protein is
            written to it as a ">name" header line followed by the sequence.
        min_protein_len: translations shorter than this are counted as small
            genes and dropped.

    Returns:
        Tuple ``(protDict, geneDict, totalgenes, repeatedgenes, smallgenes)``.
        protDict maps the FASTA header line (">" + name + newline) to the
        protein string; geneDict maps the record name to the record's
        nucleotide sequence; the three counters describe the records read.
    """
    protDict = {}
    geneDict = {}
    seenProteins = set()  # membership only, so a set instead of a list
    totalgenes = 0
    repeatedgenes = 0
    smallgenes = 0

    for gene in HTSeq.FastaReader(genes):
        # Counted per record (the original incremented once, outside the
        # loop, so the "out of N" report always showed 1).
        totalgenes += 1
        protseq, _, _ = translateSeq(str(gene.seq))

        # NOTE(review): a result of length <= 1 is treated as "could not be
        # translated" and only the gene name is reported; confirm against
        # translateSeq's failure return value.
        if len(protseq) <= 1:
            print(gene.name)
            continue

        protseq = str(protseq)
        if protseq in seenProteins:
            repeatedgenes += 1
        elif len(protseq) < min_protein_len:
            smallgenes += 1
        else:
            seenProteins.add(protseq)
            protname = ">" + str(gene.name) + "\n"
            if out_handle is not None:
                out_handle.write(protname + protseq + "\n")
            protDict[protname] = protseq
            geneDict[str(gene.name)] = gene.seq

    return protDict, geneDict, totalgenes, repeatedgenes, smallgenes


def _screen_paralog_record(blast_record, alleleLength, geneDict, toRemove,
                           genesToKeep, log):
    """Update the keep/remove lists from one BLAST record (one query gene).

    The three lists are mutated in place. Returning early replaces the bare
    ``raise`` statements the original code used to abandon a record.

    Args:
        blast_record: parsed BLAST record; its ``query`` and ``alignments``
            (each with ``hit_def`` and ``hsps``) are read.
        alleleLength: length of the query gene's nucleotide sequence.
        geneDict: gene name -> nucleotide sequence, used to compare lengths.
        toRemove: names of genes already discarded.
        genesToKeep: names of genes currently retained.
        log: list of tab-separated "removed, cause, explanation" lines.

    Raises:
        IndexError, KeyError, ZeroDivisionError (among others) when the
        record has no alignments/HSPs, a hit is missing from geneDict, or a
        reference score is zero. The caller treats any of these as "skip the
        rest of this record".
    """
    query = str(blast_record.query)
    alignments = blast_record.alignments

    if query not in toRemove:
        # Query gene A has not been discarded so far: tentatively keep it.
        genesToKeep.append(blast_record.query)

        # If the first (best) alignment is not the query itself, drop gene A
        # and keep the best hit B instead (unless B was already discarded).
        best_hit = str(alignments[0].hit_def)
        if query != best_hit:
            genesToKeep.remove(query)
            toRemove.append(query)
            log.append(query + "\t" + best_hit + "\t" +
                       "2 is first best match")
            if best_hit not in toRemove:
                genesToKeep.append(best_hit)
            return

        # The first alignment is the self hit: its score is the BSR reference.
        selfblastscore = alignments[0].hsps[0].score

        for align in alignments:
            match = align.hsps[0]
            scoreRatio = float(match.score) / float(selfblastscore)
            hit = str(align.hit_def)
            alleleLength2 = len(geneDict[hit])

            # Good match against another gene B that is not discarded yet.
            if scoreRatio > 0.6 and hit != query and hit not in toRemove:
                if alleleLength2 > alleleLength:
                    # B is longer than A: keep B, discard A and stop.
                    genesToKeep.append(hit)
                    genesToKeep.remove(query)
                    toRemove.append(query)
                    log.append(query + "\t" + hit + "\t" +
                               "2 is bigger and bsr >0.6")
                    return
                elif hit in genesToKeep:
                    # A is at least as long as B: discard B.
                    genesToKeep.remove(hit)
                    toRemove.append(hit)
                    log.append(hit + "\t" + query + "\t" +
                               "2 is bigger and bsr >0.6")

    else:
        # Query gene A was already discarded.
        selfblastscore = 0
        for align in alignments:
            if str(align.hit_def) != query:
                selfblastscore = align.hsps[0].score
                print("gene " + str(align.hit_def) +
                      " is bigger than gene " + query)
                return

        # NOTE(review): selfblastscore is still 0 whenever this point is
        # reached, so the first iteration below raises ZeroDivisionError
        # (swallowed by the caller) and nothing is ever appended. Behaviour
        # is preserved as found; the comment in the original suggests similar
        # genes were meant to be added to toRemove here -- confirm intent.
        for align in alignments:
            match = align.hsps[0]
            scoreRatio = float(match.score) / float(selfblastscore)

            if (align.hit_def not in genesToKeep
                    and str(align.hit_def) != query and scoreRatio > 0.6):
                toRemove.append(align.hit_def)
                log.append(
                    str(align.hit_def) + "\t" + query + "\t" +
                    "2 was on the removed list and bsr >0.6")


def main():
    """Select the non-paralogous genes of an ffn file longer than a threshold.

    Command line: ``-i`` ffn file, ``-g`` minimum gene size (int).

    Steps:
      1. Translate every gene; drop repeated translations and translations
         shorter than 67; write the rest to ``proteins.fasta`` in the
         directory of the input file.
      2. Drop proteins that are a substring of an already kept (longer or
         equal) protein and rewrite ``proteins.fasta``.
      3. Build a BLAST database from that file, run blastp of the file
         against it and use the score ratios to fill the keep/remove lists.
      4. Write ``logfile.txt`` (current directory), one ``<name>.fasta`` per
         retained gene whose sequence is longer than ``-g``, and
         ``concatenated.fasta`` with all of them, next to ``proteins.fasta``.

    NOTE: files are opened in "wb" mode and written with str, as in the
    original Python 2 code.
    """
    parser = argparse.ArgumentParser(
        description=
        "Given an ffn file, recovers the genes that are not paralogs and have a size bigger than the g parameter provided"
    )
    parser.add_argument('-i',
                        nargs='?',
                        type=str,
                        help='ffn file',
                        required=True)
    parser.add_argument('-g',
                        nargs='?',
                        type=int,
                        help='int minimum size',
                        required=True)

    args = parser.parse_args()
    genes = args.i
    sizethresh = args.g
    # Developer toggle: True skips regenerating proteins.fasta and reuses the
    # one left by a previous run.
    passSteps = False

    # proteins.fasta lives next to the input file. os.path.dirname is used
    # rather than removing the file name with str.replace, which would also
    # strip any other occurrence of that name inside the path.
    proteinfile = os.path.join(os.path.dirname(os.path.abspath(genes)),
                               'proteins.fasta')

    # Bound up front because the final report prints it on every path.
    g = 0

    if not passSteps:
        print("not passing steps")
        with open(proteinfile, "wb") as f:
            (protDict, geneDict, totalgenes, repeatedgenes,
             smallgenes) = _translate_unique_genes(genes, f)

        print(str(repeatedgenes) + " repeated genes out of " + str(totalgenes))
        print(str(smallgenes) + " small genes out of " + str(totalgenes))
        print("protein file created")

        # first step - remove genes contained in other genes or 100% equal
        # genes. Proteins are visited longest first, so a protein can only be
        # contained in one that has already been kept.
        orderedprots = sorted(protDict.items(),
                              key=lambda item: len(item[1]),
                              reverse=True)

        print("Checking if proteins are equal or substring of others...")

        auxDict = {}  # protein sequence -> FASTA header line
        auxprot = []  # kept protein sequences, in visiting order
        for j, (protname, prot) in enumerate(orderedprots):
            if any(prot in kept for kept in auxprot):
                g += 1
            else:
                auxDict[prot] = protname
                auxprot.append(prot)

            print(str(j) + " out of " + str(len(orderedprots)))

        print("%s genes are contained in other genes" % g)

        # overwrite the original file, obtaining a new file with unique genes
        with open(proteinfile, "wb") as f:
            f.write("".join(header + seq + "\n"
                            for seq, header in auxDict.items()))

    else:
        # Only the name -> DNA sequence map is needed when the protein file
        # is reused.
        geneDict = _translate_unique_genes(genes)[1]

    geneFile = os.path.abspath(proteinfile)
    print(proteinfile)
    Gene_Blast_DB_name = Create_Blastdb(geneFile, 1, True)

    geneF = os.path.splitext(geneFile)[0]
    blast_out_file = geneF + '.xml'
    # ------------------------------ RUNNING BLAST ------------------------------ #

    cline = NcbiblastpCommandline(query=geneFile,
                                  db=Gene_Blast_DB_name,
                                  evalue=0.001,
                                  out=blast_out_file,
                                  outfmt=5)
    blast_records = runBlastParser(cline, blast_out_file, geneFile)
    toRemove = []
    genesToKeep = []
    log = ["removed\tcause\texplanation"]
    for blast_record in blast_records:

        allelename = blast_record.query.split(" ")[0]
        # Looked up outside the try block, as before: an unknown query name
        # is a hard error.
        alleleLength = len(geneDict[allelename])

        try:
            _screen_paralog_record(blast_record, alleleLength, geneDict,
                                   toRemove, genesToKeep, log)
        except Exception:
            # Deliberate best effort kept from the original: a record that
            # cannot be evaluated (no alignments, hit missing from geneDict,
            # zero reference score, ...) is abandoned silently and whatever
            # was recorded for it so far stays in the lists.
            pass

    with open("logfile.txt", "wb") as f:
        for elem in log:
            f.write(str(elem) + "\n")

    genesToKeep = list(set(genesToKeep))
    toRemove = list(set(toRemove))
    removed = set(toRemove)
    notcommonToKeep = [x for x in genesToKeep if x not in removed]
    print(len(toRemove))
    print(len(genesToKeep))
    print(len(notcommonToKeep))

    # A gene is part of the scheme when it was kept and never removed; a set
    # gives constant-time lookups in the loop below.
    keptNames = set(notcommonToKeep)

    pathfiles = os.path.dirname(geneFile) + "/"

    removedparalogs = 0
    removedsize = 0
    totalgenes = 0
    rest = 0
    concatenatedParts = []
    for contig in HTSeq.FastaReader(genes):
        totalgenes += 1

        if contig.name not in keptNames:
            removedparalogs += 1
        elif len(contig.seq) > sizethresh:
            namefile = contig.name.replace("|", "_")
            with open(pathfiles + namefile + ".fasta", "wb") as f:
                f.write(">1\n" + contig.seq + "\n")
            rest += 1
            concatenatedParts.append(">" + namefile + "\n" + contig.seq +
                                     "\n")
        else:
            removedsize += 1

    print("%s genes are contained in other genes" % g)
    print("Removed %s same Locus genes" % str(removedparalogs))
    print("Removed %s because of size " % str(removedsize))
    print("%s Scheme genes " % str(rest))
    print("total genes:" + str(totalgenes))

    with open(pathfiles + "concatenated.fasta", "wb") as f:
        f.write("".join(concatenatedParts))
Example #32
0
def _allele_seq_for_query(geneDict, orderedAlleleNames, query_id):
    """Return the allele sequence a BLAST query id points at, or None.

    The second "_"-separated field of ``query_id`` is looked up in
    ``orderedAlleleNames``; its position + 1 is the allele number that is then
    searched for among the values of ``geneDict`` (sequence -> allele number).

    Raises:
        IndexError: ``query_id`` contains no "_".
        ValueError: the extracted field is not in ``orderedAlleleNames``.
    """
    alleleNumber = orderedAlleleNames.index(str(query_id).split("_")[1]) + 1
    for seq, number in geneDict.items():
        if number == alleleNumber:
            return seq
    return None


def _select_best_match(blast_records, geneDict, orderedAlleleNames,
                       currentGenomeDict):
    """Scan parsed BLAST records and pick the HSP that best matches an allele.

    An HSP is an exact match when its identity count equals both the length of
    the query allele and the length of the aligned query string, and the
    subject string holds none of N, K, Y, R.  The first exact match ends the
    scan.  Otherwise the HSP with the highest score / allele-length ratio wins.

    Args:
        blast_records: iterable of parsed BLAST records.
        geneDict: allele sequence -> allele number.
        orderedAlleleNames: allele numbers as strings, in registration order.
        currentGenomeDict: contig name -> contig sequence of the genome that
            was blasted; used to read the length of the matched contig.

    Returns:
        Tuple ``(bestMatch, contig, contigLen, perfectMatch, bmAllele)``.
        ``bestMatch`` is None when no HSP was selected, otherwise the list
        ``[query, hsp, scoreRatio, query_id, lenRatio, alleleLen]``.
        ``contigLen`` is only refreshed for non-exact matches.  ``bmAllele`` is
        the sequence of the allele that produced the best match.
    """
    bestMatch = None
    bestScoreRatio = 0
    bestMatchContig = ''
    bestMatchContigLen = 0
    bmAllele = ''
    queryAlleleLen = 0

    for blast_record in blast_records:
        if not blast_record.alignments:
            continue

        # Every HSP of a record belongs to the same query allele, so the
        # allele is resolved once per record.  When it cannot be resolved the
        # values of the previous record are kept.
        querySeq = _allele_seq_for_query(geneDict, orderedAlleleNames,
                                         blast_record.query_id)
        if querySeq is not None:
            queryAlleleLen = len(querySeq)

        for alignment in blast_record.alignments:
            # The contig name is the first word of the hit definition.
            contigTag = alignment.hit_def.split(" ")[0]

            for match in alignment.hsps:
                scoreRatio = float(match.score) / float(queryAlleleLen)
                isExact = (int(match.identities) == int(queryAlleleLen)
                           and int(match.identities) == int(len(match.query))
                           and "N" not in match.sbjct
                           and "K" not in match.sbjct
                           and "Y" not in match.sbjct
                           and "R" not in match.sbjct)

                if not isExact and scoreRatio <= bestScoreRatio:
                    continue

                if querySeq is not None:
                    bmAllele = querySeq
                bmAlleleLen = len(bmAllele)
                lenRatio = float(len(match.sbjct)) / float(bmAlleleLen)
                bestMatch = [
                    blast_record.query, match, scoreRatio,
                    blast_record.query_id, lenRatio, bmAlleleLen
                ]
                bestScoreRatio = scoreRatio
                bestMatchContig = contigTag

                if isExact:
                    return (bestMatch, bestMatchContig, bestMatchContigLen,
                            True, bmAllele)

                bestMatchContigLen = len(currentGenomeDict[contigTag])
                print(contigTag)

    return bestMatch, bestMatchContig, bestMatchContigLen, False, bmAllele


def _locus_position(contig, start, end):
    """Format a locus position as ``<contig>&<start>-<end>&+``.

    The trailing strand field is always "+".
    """
    return str(contig) + "&" + str(start) + "-" + str(end) + "&" + "+"


def _load_contig_record(genomeFile, contigName):
    """Parse ``genomeFile`` as FASTA and return the record named ``contigName``.

    Raises KeyError when the genome has no record with that id.
    """
    with open(genomeFile, "rU") as handle:
        record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
    return record_dict[contigName]


def _extend_match_to_allele(match, geneLen, record):
    """Stretch an HSP over the allele bases it does not cover and cut it out.

    The subject span of ``match`` is padded on each side by the number of
    allele positions that lie outside the query span, then sliced out of
    ``record.seq``.  When the subject coordinates run backwards the slice is
    reverse-complemented.

    Args:
        match: HSP exposing query_start/query_end/sbjct_start/sbjct_end.
        geneLen: length of the allele the HSP was obtained with.
        record: sequence record of the contig the HSP lies on.

    Returns:
        Tuple ``(tS, tE, alleleStr)`` where ``tS``/``tE`` are the slice bounds
        used on the contig.  ``tS`` may be negative and ``tE`` may exceed the
        contig length; the caller has to check both.
    """
    extraleft = 0
    extraright = 0

    # Allele bases missing before the first aligned query position.
    if 1 < int(match.query_start) and 1 < int(match.query_end):
        extraleft = min(match.query_start, match.query_end) - 1

    print("%s %s" % (extraleft, extraright))

    # Allele bases missing after the last aligned query position.
    if (int(geneLen) > int(match.query_start)
            and int(geneLen) > int(match.query_end)):
        extraright = geneLen - max(match.query_start, match.query_end)

    print("%s %s" % (extraleft, extraright))

    if match.sbjct_start > match.sbjct_end:
        tE = match.sbjct_start + extraleft
        tS = match.sbjct_end - extraright - 1
        alleleStr = reverseComplement(str(record.seq[tS:tE]))
    else:
        tS = match.sbjct_start - extraleft - 1
        tE = match.sbjct_end + extraright
        alleleStr = str(record.seq[tS:tE])

    print(tS)
    print(tE)
    print("allele is:")
    print(alleleStr)

    return tS, tE, alleleStr


def _register_new_allele(geneFile, genomeFile, geneDict, orderedAlleleNames,
                         alleleStr, alleleI, tagAux):
    """Record ``alleleStr`` as allele ``alleleI`` and append it to ``geneFile``.

    Updates ``geneDict`` and ``orderedAlleleNames`` in place.  ``tagAux`` is
    the call tag followed by ":"; the colon is dropped in the FASTA header.

    Returns the allele number to use for the next new allele.
    """
    print("New allele found! Adding allele " + tagAux + str(alleleI) +
          " to the database")
    geneDict[alleleStr] = alleleI
    orderedAlleleNames.append(str(alleleI))

    with open(geneFile, 'a') as fG:
        fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] + '_' +
                 str(os.path.basename(genomeFile)) + '\n')
        fG.write(alleleStr + '\n')

    return alleleI + 1


def _write_unmatched_locus(geneFile, genomeFile, suffix, bestMatchContig,
                           alleleStr, bmAllele):
    """Append a rejected match and its reference allele to a side FASTA file.

    The file is ``geneFile`` with its extension replaced by
    ``<suffix>.fasta``; its path is printed before writing.
    """
    geneFile2 = os.path.splitext(geneFile)[0] + suffix + ".fasta"
    print(geneFile2)
    with open(geneFile2, 'a') as f:
        f.write(">" + str(os.path.basename(genomeFile)) + "|" +
                str(os.path.basename(geneFile)) + " | " +
                str(bestMatchContig) + "\n")
        f.write(alleleStr + "\n")
        f.write(">Allele\n")
        f.write(bmAllele + "\n")


def main():
    """Call the allele of one gene in every genome of a list.

    Command line:
        argv[1]: pickle file holding a sequence whose first item is the gene
            FASTA path and whose second item is the list of genome FASTA paths.
        argv[2]: working directory.  For each genome a BLAST database is
            expected at ``<argv[2]>/<genome basename>/<genome basename>_db``.

    For each genome the gene's alleles are blasted against the genome database
    and the best HSP is classified.  One entry per genome is appended to two
    parallel lists: the call, and the position of the match (or the call again
    when no position applies).  Possible calls:

        allele number         exact match with a known allele
        LNF                   no usable BLAST hit
        LOTSC                 matched contig is not longer than the allele
        LOT5 / LOT3           match touches a contig end
        LNFN                  match contains N, K, R or Y
        NA?-n / NA2-n / NA3-n new allele, registered with number n
        LOT3B/LOT5B/LOT3C/LOT5C  extended match runs off the contig
        LNF3 / LNF4 / LNF5    match rejected
        undefined allele, small match, allele incomplete

    Side effects: new alleles are appended to the gene FASTA; LNFN, LNF3 and
    LNF4 matches are appended to side FASTA files next to it; the results are
    pickled to ``<argv[2]>/<gene basename>_result.txt`` (tuple of an empty
    list and the calls) and ``<argv[2]>/<gene basename>_result2.txt``
    (positions).

    Returns:
        True when the run completes, False when an argument is missing.
    """
    try:
        input_file = sys.argv[1]
        temppath = sys.argv[2]
    except IndexError:
        print("usage: list_pickle_obj")
        return False

    # NOTE(review): pickle.load can execute arbitrary code; the input file
    # must come from a trusted producer.
    with open(input_file, 'rb') as f:
        argumentList = pickle.load(f)

    geneFile = argumentList[0]
    genomesList = argumentList[1]
    geneFileName = os.path.basename(geneFile)

    basepath = temppath + "/" + geneFileName
    if not os.path.exists(basepath + "/blastdbs/"):
        os.makedirs(basepath + "/blastdbs/")
    blast_out_file = os.path.join(basepath,
                                  "blastdbs/" + geneFileName + '_List.xml')

    # --- load the known alleles: sequence -> allele number (1-based) --- #
    geneDict = {}
    orderedAlleleNames = []
    alleleI = 1
    biggestAllelelen = 0
    for allele in HTSeq.FastaReader(geneFile):
        if allele.seq in geneDict:
            print("\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n" + str(geneFile))
        else:
            biggestAllelelen = max(biggestAllelelen, len(allele.seq))
            orderedAlleleNames.append(str(alleleI))
            geneDict[allele.seq] = alleleI
            alleleI += 1

    # Length tolerance for new alleles: 20% of the longest allele loaded above.
    lenExtraThresh = int(biggestAllelelen * 0.2)

    perfectMatchIdAllele = []   # one call per genome
    perfectMatchIdAllele2 = []  # one position (or call) per genome
    resultsList = []
    print(genomesList)

    for genomeFile in genomesList:
        print("_______________________________________________________")
        print(perfectMatchIdAllele)
        printinfo(genomeFile, geneFile)

        # Rebuilt for every genome so contigs of earlier genomes are released.
        currentGenomeDict = {}
        for contig in HTSeq.FastaReader(genomeFile):
            currentGenomeDict[contig.name] = str(contig.seq)

        print("Blasting alleles on genome at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))

        genomeName = str(os.path.basename(genomeFile))
        Gene_Blast_DB_name = os.path.join(
            temppath, genomeName + "/" + genomeName + "_db")

        cline = NcbiblastnCommandline(query=geneFile,
                                      db=Gene_Blast_DB_name,
                                      evalue=0.001,
                                      out=blast_out_file,
                                      outfmt=5)

        blast_records = runBlastParser(cline, blast_out_file, geneFile)
        print("Blasted alleles on genome at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))

        # ------ DETERMINING BEST MATCH ------ #

        (bestMatch, bestMatchContig, bestMatchContigLen, perfectMatch,
         bmAllele) = _select_best_match(blast_records, geneDict,
                                        orderedAlleleNames, currentGenomeDict)

        # ---------- ALLELE CALLING AFTER DETERMINING BEST MATCH ---------- #
        print("Finished choosing best match at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))

        if bestMatch is None:

            ###################
            # LOCUS NOT FOUND #
            ###################

            perfectMatchIdAllele.append('LNF')
            perfectMatchIdAllele2.append('LNF')
            print("Locus not found, no matches \n")
            continue

        match = bestMatch[1]
        bestMatchStart = match.sbjct_start
        bestMatchEnd = match.sbjct_end
        if match.query_start > match.query_end:
            bestMatchStart, bestMatchEnd = bestMatchEnd, bestMatchStart

        print(match)

        geneLen = bestMatch[5]
        alleleStr = match.sbjct
        idPercent = float(match.identities) / float(geneLen)
        lenRatio = bestMatch[4]
        matchPosition = _locus_position(bestMatchContig, bestMatchStart,
                                        bestMatchEnd)

        print("is perfect match true?" + str(perfectMatch))
        if perfectMatch:

            ################################################
            # EXACT MATCH --- MATCH == GENE --- GENE FOUND #
            ################################################

            queryId = bestMatch[3]
            if "_" in queryId:
                perfectMatchIdAllele.append(queryId.split("_")[1])
            else:
                perfectMatchIdAllele.append(queryId)
            perfectMatchIdAllele2.append(matchPosition)
            printinfo(genomeFile, geneFile)
            print("Exact match \n")
            continue

        # A best match exists but it is not an exact match.

        ###########################
        # LOCUS ON THE CONTIG TIP #
        ###########################
        print(geneLen)
        if bestMatchContigLen <= geneLen:
            perfectMatchIdAllele.append('LOTSC')
            perfectMatchIdAllele2.append(matchPosition)
            printinfo(genomeFile, geneFile)
            print("Locus is bigger than the contig \n")
            continue

        if (match.sbjct_start == 1 and len(match.query) < geneLen) or (
                match.sbjct_start == bestMatchContigLen
                and len(match.query) < bestMatchContigLen
                and match.sbjct_start > match.sbjct_end):
            perfectMatchIdAllele.append('LOT5')
            perfectMatchIdAllele2.append(matchPosition)
            printinfo(genomeFile, geneFile)
            print("Locus is on the 5' tip of the contig \n")
            continue

        if (match.sbjct_end == 1 and len(match.query) < geneLen
                and match.sbjct_start > match.sbjct_end) or (
                    match.sbjct_end == bestMatchContigLen
                    and len(match.query) < bestMatchContigLen):
            perfectMatchIdAllele.append('LOT3')
            perfectMatchIdAllele2.append(matchPosition)
            printinfo(genomeFile, geneFile)
            print("Locus is on the 3' tip of the contig \n")
            continue

        if ('N' in alleleStr or "K" in alleleStr or "R" in alleleStr
                or "Y" in alleleStr):

            #####################
            # ALLELE NOT FOUND  #		# ambiguous base found!
            #####################

            geneFile2 = os.path.splitext(geneFile)[0] + "LNFN.fasta"
            with open(geneFile2, 'a') as f:
                f.write(">" + str(os.path.basename(genomeFile)) + "|" +
                        str(os.path.basename(geneFile)) + "\n")
                f.write(alleleStr + "\n")
            perfectMatchIdAllele.append('LNFN')
            perfectMatchIdAllele2.append('LNFN')
            printinfo(genomeFile, geneFile)
            print("LNFN, contains strange (N,K,R) bases! \n")
            continue

        print("new allele?")

        # From here on the match is compared without its alignment gaps.
        alleleStr = alleleStr.replace('-', '')
        sbjctLen = int(len(match.sbjct))  # aligned length, gaps included

        # Is the gap-free match contained in an already defined allele?
        isContainedDefinedAllele = False
        definedAllele = ''
        for knownAllele in geneDict:
            if alleleStr in knownAllele:
                definedAllele = knownAllele
                isContainedDefinedAllele = True
                break

        print("is contained? " + str(isContainedDefinedAllele))
        print(idPercent)
        print(geneLen)
        print(lenExtraThresh)
        print(lenRatio)

        if (isContainedDefinedAllele
                and len(definedAllele) - lenExtraThresh <= sbjctLen
                <= len(definedAllele) + lenExtraThresh):
            # The gap-free match is contained in a defined allele and the
            # aligned match is within lenExtraThresh of that allele's length.

            isnewallele = False

            if len(alleleStr) == len(definedAllele):
                # Same length and contained, so alleleStr equals
                # definedAllele.
                # NOTE(review): registering it re-keys that sequence under a
                # new number and appends a duplicate entry to the gene file -
                # confirm this is intended.
                tagAux = 'NA?:'
                printinfo(genomeFile, geneFile)
                perfectMatchIdAllele.append("NA?-" + str(alleleI))
                perfectMatchIdAllele2.append(matchPosition)
                isnewallele = True

            elif len(alleleStr) == len(definedAllele) - 1:
                # The gap-free match is one base shorter than the allele.
                tagAux = 'NA2:'
                printinfo(genomeFile, geneFile)
                perfectMatchIdAllele.append("NA2-" + str(alleleI))
                perfectMatchIdAllele2.append(matchPosition)
                isnewallele = True

            else:
                # More than one base differs in length: extend the match to
                # the full allele length on the contig.
                record = _load_contig_record(genomeFile, bestMatchContig)
                tS, tE, alleleStr = _extend_match_to_allele(
                    match, geneLen, record)

                if tE > bestMatchContigLen:
                    perfectMatchIdAllele.append('LOT3B')
                    perfectMatchIdAllele2.append(
                        _locus_position(bestMatchContig, tS,
                                        bestMatchContigLen))
                    printinfo(genomeFile, geneFile)
                    print("Locus is on the 3B' tip of the contig \n")

                elif tS < 0:
                    perfectMatchIdAllele.append('LOT5B')
                    perfectMatchIdAllele2.append(
                        _locus_position(bestMatchContig, 0, tE))
                    printinfo(genomeFile, geneFile)
                    print("Locus is on the 5B' tip of the contig \n")

                else:
                    tagAux = 'NA2:'
                    printinfo(genomeFile, geneFile)
                    perfectMatchIdAllele.append("NA2-" + str(alleleI))
                    perfectMatchIdAllele2.append(
                        _locus_position(bestMatchContig, tS, tE))
                    isnewallele = True

            if isnewallele:
                alleleI = _register_new_allele(geneFile, genomeFile, geneDict,
                                               orderedAlleleNames, alleleStr,
                                               alleleI, tagAux)

        elif (not isContainedDefinedAllele and idPercent >= 0.8
              and int(geneLen) - lenExtraThresh <= sbjctLen
              <= int(geneLen) + lenExtraThresh):
            # Not contained in any defined allele, at least 80% identity and
            # the aligned match is within lenExtraThresh of the allele length.

            ratio = float(len(alleleStr)) / float(geneLen)

            if 0.8 <= ratio <= 1.2:
                record = _load_contig_record(genomeFile, bestMatchContig)

                # Only the extension itself is guarded: a failure while
                # recording the result must not add a second call for the
                # same genome.
                try:
                    print(match)
                    tS, tE, alleleStr = _extend_match_to_allele(
                        match, geneLen, record)
                except Exception as e:
                    ##################
                    #       LNF      #
                    ##################
                    print(e)
                    _write_unmatched_locus(geneFile, genomeFile, "LNF3",
                                           bestMatchContig, alleleStr,
                                           bmAllele)
                    printinfo(genomeFile, geneFile)
                    perfectMatchIdAllele.append("LNF3")
                    perfectMatchIdAllele2.append("LNF3")
                    print("No allele found")
                else:
                    if tE > bestMatchContigLen:
                        perfectMatchIdAllele.append('LOT3C')
                        perfectMatchIdAllele2.append(
                            _locus_position(bestMatchContig, tS,
                                            bestMatchContigLen))
                        printinfo(genomeFile, geneFile)
                        print("Locus is on the 3C' tip of the contig \n")

                    elif tS < 0:
                        perfectMatchIdAllele.append('LOT5C')
                        perfectMatchIdAllele2.append(
                            _locus_position(bestMatchContig, 0, tE))
                        printinfo(genomeFile, geneFile)
                        print("Locus is on the 5C' tip of the contig \n")

                    else:
                        tagAux = 'NA3:'
                        printinfo(genomeFile, geneFile)
                        perfectMatchIdAllele.append("NA3-" + str(alleleI))
                        perfectMatchIdAllele2.append(
                            _locus_position(bestMatchContig, tS, tE))
                        alleleI = _register_new_allele(
                            geneFile, genomeFile, geneDict,
                            orderedAlleleNames, alleleStr, alleleI, tagAux)
            else:
                ##################
                #       LNF      #
                ##################
                _write_unmatched_locus(geneFile, genomeFile, "LNF4",
                                       bestMatchContig, alleleStr, bmAllele)
                printinfo(genomeFile, geneFile)
                perfectMatchIdAllele.append("LNF4")
                perfectMatchIdAllele2.append("LNF4")
                print("No allele found")

        elif isContainedDefinedAllele:
            ####################
            # UNDEFINED ALLELE #		# it is contained in another allele
            ####################

            perfectMatchIdAllele.append("undefined allele")
            perfectMatchIdAllele2.append("undefined allele")
            printinfo(genomeFile, geneFile)
            print("Undefined allele \n")

            # Only the path is reported; nothing is written to it here.
            geneFile2 = os.path.splitext(geneFile)[0] + "undefined.fasta"
            print(geneFile2)

        elif lenRatio < 0.5:

            ###############
            # SMALL MATCH #
            ###############

            perfectMatchIdAllele.append('small match')
            perfectMatchIdAllele2.append('small match')
            printinfo(genomeFile, geneFile)
            print("lower than 50% match \n")

        elif lenRatio < 0.8 and idPercent < 0.5:
            #####################
            # INCOMPLETE ALLELE #
            #####################
            perfectMatchIdAllele.append('allele incomplete')
            perfectMatchIdAllele2.append('allele incomplete')
            printinfo(genomeFile, geneFile)
            print("Incomplete allele\n")

        else:
            ##################
            #       LNF      #
            ##################

            printinfo(genomeFile, geneFile)
            perfectMatchIdAllele.append("LNF5")
            perfectMatchIdAllele2.append("LNF5")
            print("Locus not found")

    final = (resultsList, perfectMatchIdAllele)
    print("Finished allele calling at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    filepath = os.path.join(temppath, geneFileName + "_result.txt")
    filepath2 = os.path.join(temppath, geneFileName + "_result2.txt")
    with open(filepath, 'wb') as f:
        pickle.dump(final, f)
    with open(filepath2, 'wb') as f:
        pickle.dump(perfectMatchIdAllele2, f)
    return True
Example #33
0
def _selectBestMatch(blast_records):
    """Pick the HSP with the highest query/subject length ratio.

    Iterates over every alignment of every record in `blast_records` and
    keeps the HSP whose ``len(hsp.query) / len(hsp.sbjct)`` is largest (the
    first one wins on ties).

    Returns a tuple ``(hsp, lenRatio, contigTag, contigLen)`` or None when no
    HSP with a positive ratio exists.  `contigTag` is the record's query
    title truncated at the first space and `contigLen` is the record's
    `query_letters`; both are captured together with the HSP so that they
    always describe the contig the HSP was actually found on.
    """
    best = None
    bestRatio = 0

    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            for match in alignment.hsps:
                # NOTE(review): if `match.query` / `match.sbjct` are the
                # gapped aligned strings (as Biopython HSPs appear to be),
                # both have the same length and this ratio is always 1.0 --
                # confirm this is the intended measure.
                lenRatio = float(len(match.query)) / float(len(match.sbjct))

                if lenRatio > bestRatio:
                    bestRatio = lenRatio
                    # the contig tag (used by the prodigal-based extension)
                    # is the query title up to the first space
                    contigTag = blast_record.query.split(' ', 1)[0]
                    best = (match, lenRatio, contigTag, blast_record.query_letters)

    return best


def callAlleles(argumentList):
    """Call the allele of one gene in each genome of a list.

    `argumentList` is indexed as follows:
        [0] geneFile          -- path of the gene FASTA file (one allele per record)
        [1] genomesList       -- genome FASTA paths (a single trailing '\\n' is stripped)
        [2] listOfCDSDicts    -- per-genome object handed to `extendCDS`
        [3] listOfGenomesDict -- per-genome object handed to `extendCDS`
    Entries [2] and [3] are read at the same position as the genome in [1].

    For every genome the genome is BLASTed (blastn, evalue 0.001, XML output)
    against a database built from `geneFile`, the best HSP is selected and one
    string is appended to the result:
        'LNF:-1'  no hit (locus not found)
        'LOT:-1'  the hit start or end is at/after the contig length
        'SAC:-1'  length ratio below 0.5 (small match)
        'INC:-1'  length ratio below 0.8 and identity below 0.5
        'EXC:<n>' the allele string equals known allele number <n>
        'UND:-1'  the allele string is a substring of a known allele
        'NA:<n>'  new allele (not extended and identity above 0.8)
        'INF:<n>' new allele in any other case

    Side effects: every 'NA'/'INF' call appends the new allele to `geneFile`
    and rebuilds the BLAST database; the BLAST output is written next to
    `geneFile` with an '.xml' extension.

    Returns the list of result strings, one per genome, in input order.
    """
    geneFile = argumentList[0]
    genomesList = argumentList[1]
    listOfCDSDicts = argumentList[2]
    listOfGenomesDict = argumentList[3]

    # map allele sequence -> position of its first occurrence in the gene file
    geneDict = {}
    alleleI = 0
    for allele in HTSeq.FastaReader(geneFile):
        if allele.seq in geneDict:
            print("\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n" + str(geneFile))
        else:
            geneDict[allele.seq] = alleleI
        alleleI += 1

    # --- make 1st blast DB --- #
    Gene_Blast_DB_name = Create_Blastdb(geneFile, 1)
    blast_out_file = os.path.splitext(geneFile)[0] + '.xml'

    # list of results - the output of the function
    resultsList = []

    for i, genomeFile in enumerate(genomesList):

        currentCDSDict = listOfCDSDicts[i]
        currentGenomeDict = listOfGenomesDict[i]

        if genomeFile.endswith('\n'):
            genomeFile = genomeFile[:-1]

        # ------------------------------ RUNNING BLAST ------------------------------ #

        cline = NcbiblastnCommandline(query=genomeFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
        blast_records = runBlastParser(cline, blast_out_file, genomeFile)

        # ------ DETERMINING BEST MATCH ------ #

        best = _selectBestMatch(blast_records)

        ###################
        # LOCUS NOT FOUND #
        ###################

        if best is None:
            resultsList.append('LNF:-1')
            continue

        match, lenRatio, contigTag, contigLen = best

        geneLen = len(match.sbjct)
        alleleStr = match.query
        idPercent = float(match.identities) / float(geneLen)

        ###########################
        # LOCUS ON THE CONTIG TIP #
        ###########################

        if contigLen <= match.query_start or contigLen <= match.query_end:
            resultsList.append('LOT:-1')
            continue

        ###############
        # SMALL MATCH #
        ###############

        if lenRatio < 0.5:
            # the original author noted that the meaning of 'SAC' is unknown
            resultsList.append('SAC:-1')
            continue

        # --- using prodigal to try to extend the CDS --- #

        extended, strCDS = extendCDS(contigTag, currentCDSDict, match.sbjct_start, match.sbjct_end, currentGenomeDict)

        # --- keep the extension only if it is long enough to cover the gene --- #

        if extended and (len(strCDS) * lenRatio) >= geneLen:
            alleleStr = strCDS
            lenRatio = float(len(strCDS)) / float(geneLen)

        # --- removing gaps '-' --- #

        alleleStr = alleleStr.replace('-', '')

        #####################
        # INCOMPLETE ALLELE #
        #####################

        if lenRatio < 0.8 and idPercent < 0.5:
            resultsList.append('INC:-1')
            continue

        # --- hit on the reverse strand: use the reverse complement --- #

        if match.sbjct_start > match.sbjct_end:
            alleleStr = reverseComplement(alleleStr)

        ################################################
        # EXACT MATCH --- MATCH == GENE --- GENE FOUND #
        ################################################

        if alleleStr in geneDict:
            resultsList.append('EXC:' + str(geneDict[alleleStr]))
            continue

        ####################
        # UNDEFINED ALLELE #        # it is contained in another allele
        ####################

        if any(alleleStr in knownAllele for knownAllele in geneDict):
            resultsList.append('UND:-1')
            continue

        if not extended and idPercent > 0.8:
            # ADD NEW ALLELE
            tagAux = 'NA:'
        else:
            # ADD INFERRED ALLELE
            tagAux = 'INF:'

        resultsList.append(tagAux + str(alleleI))
        geneDict[alleleStr] = alleleI
        alleleI += 1

        # --- add the new allele to the gene fasta --- #
        # NOTE(review): the header is written after the increment, so it is
        # one higher than the number reported in resultsList/geneDict; kept
        # as in the original -- confirm whether headers are meant to be 1-based.
        with open(geneFile, 'a') as fG:
            fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] + '\n')
            fG.write(alleleStr + '\n')

        # --- remake blast DB so later genomes can match the new allele --- #
        Gene_Blast_DB_name = Create_Blastdb(geneFile, 1)

    return resultsList