Ejemplo n.º 1
0
def main(input_file, temppath, blastPath, verbose, bsrTresh, sizeTresh):
    """Call the alleles of one locus on a list of genomes and store the results.

    For every genome the function first looks for a CDS that is identical to
    a known allele of the locus.  When none is found, the translated alleles
    are BLASTed (blastp) against the genome's protein database and the best
    hit is classified (LNF, NIPH, LOTSC, PLOT3, PLOT5, ALM, ASM or a newly
    inferred allele ``INF-<n>``).  Newly inferred alleles are appended to the
    locus FASTA file (and, for low-BSR ones, to the "short" FASTA file).

    Args:
        input_file: Path of a pickle file holding a sequence
            ``[geneFile, genomesList, genesList]``: the locus FASTA path, an
            iterable of genome FASTA paths and the path of a text file that
            lists one locus per line.
        temppath: Working directory.  It is read for
            ``<genome>_ORF_Protein.txt`` pickles and ``<genome>/<genome>_db``
            BLAST databases and receives the ``<locus>_result.txt`` and
            ``<locus>_result2.txt`` pickles.
        blastPath: Passed as ``cmd`` to ``NcbiblastpCommandline``.
        verbose: ``True`` or the string ``'True'`` enables progress output.
        bsrTresh: Minimum BLAST score ratio for a hit (converted to float).
        sizeTresh: Allowed relative deviation from the mode allele size
            (converted to float).

    Returns:
        True once both result pickles were written.

    Side effects:
        Appends new alleles to ``geneFile`` / the short FASTA file, writes the
        two result pickles and removes the per-locus working directory.
    """
    # Accept the string 'True' (command-line style) as well as a real bool;
    # the former comparison `verbose == 'True'` turned a boolean True off.
    verbose = str(verbose) == 'True'

    bsrTresh = float(bsrTresh)
    sizeTresh = float(sizeTresh)

    # NOTE(review): pickle can execute arbitrary code while loading; the
    # pickles read in this function must come from a trusted source.
    with open(input_file, 'rb') as f:
        argumentList = pickle.load(f)

    if verbose:

        def verboseprint(*args):
            # Print all arguments on a single line (the previous Python 2
            # style `print(arg),` printed one argument per line on Python 3).
            print(*args)
    else:

        def verboseprint(*args):
            return None

    def timestamp():
        return time.strftime("%H:%M:%S-%d/%m/%Y")

    geneFile = argumentList[0]
    genomesList = argumentList[1]
    genesList = argumentList[2]

    verboseprint("Using gene: " + str(geneFile))
    shortgeneFile = os.path.join(os.path.dirname(geneFile), "short",
                                 os.path.basename(geneFile))
    shortgeneFile = shortgeneFile.replace(".fasta", "_short.fasta")

    with open(genesList, 'r') as gene_fp:
        newListgenes = [gene.rstrip('\n').rstrip('\r') for gene in gene_fp]

    # Position of this locus in the locus list, only used for the progress line.
    locusnumber = newListgenes.index(str(geneFile))
    totalocusnumber = len(newListgenes)
    statusbar = float(locusnumber) / totalocusnumber
    basepath = os.path.join(temppath, os.path.splitext(geneFile)[0])
    newDNAAlleles2Add2Fasta = ''
    newDNAAlleles2Add2shortFasta = ''

    print("\rProcessing " + os.path.basename(geneFile) + ". Start " +
          timestamp() + " Locus " + str(locusnumber) +
          " of " + str(totalocusnumber) + ". Done " +
          str(int(statusbar * 100)) + "%.",
          end="")

    if not os.path.exists(basepath):
        os.makedirs(basepath)

    # Full list of alleles of the locus; alleleI ends up holding the numeric
    # id (last "_"-separated field of the record id) of the last allele read.
    fullAlleleList = []
    fullAlleleNameList = []
    alleleI = 0
    for allele in SeqIO.parse(geneFile, "fasta", generic_dna):
        alleleI = int(allele.id.split("_")[-1])
        fullAlleleList.append(str(allele.seq.upper()))
        fullAlleleNameList.append(allele.id)

    resultsList = []
    perfectMatchIdAllele = []
    perfectMatchIdAllele2 = []

    verboseprint("Getting BSR at : " + timestamp())

    geneScorePickle = os.path.abspath(shortgeneFile) + '_bsr.txt'

    # The scores are only recalculated when no score pickle exists yet.
    allelescores, alleleList, listShortAllelesNames = getBlastScoreRatios(
        shortgeneFile, basepath, not os.path.isfile(geneScorePickle), verbose,
        blastPath)

    proteinFastaPath = os.path.join(
        basepath, str(os.path.basename(shortgeneFile) + '_protein.fasta'))
    with open(proteinFastaPath, 'r') as myfile:
        proteinFastaString = myfile.read()
        proteinFastaString += "\n"

    verboseprint("Finished BSR at : " + timestamp())

    verboseprint("starting allele call blast at: " + timestamp())
    for genomeFile in genomesList:
        verboseprint(genomeFile)
        genomeName = os.path.basename(genomeFile)
        # score, score ratio, perfectmatch, key name of the DNA sequence
        # string, allele ID (a found match additionally stores the HSP and
        # the DNA length of the matched allele at index 5 and 6)
        bestmatch = [0, 0, False, '', 0]
        currentGenomeDict = {}

        # load the CDS of the genome (pickled dictionary)
        filepath = os.path.join(temppath,
                                str(genomeName) + "_ORF_Protein.txt")
        with open(filepath, 'rb') as f:
            currentCDSDict = pickle.load(f)

        # ------------------------------------------------------------------
        # Stage 1: exact DNA match between a known allele and a genome CDS.
        # ------------------------------------------------------------------
        try:
            exactMatches = list(
                set(fullAlleleList).intersection(currentCDSDict.values()))

            cdsKeys = []
            if len(exactMatches) == 1:
                alleleStr = exactMatches[0]
                # every CDS key whose sequence equals the matched allele
                cdsKeys = [
                    key for key, value in currentCDSDict.items()
                    if value == alleleStr
                ]

            if len(exactMatches) > 1 or len(cdsKeys) > 1:
                # several alleles, or several CDSs, match exactly
                perfectMatchIdAllele.append('NIPHEM')
                perfectMatchIdAllele2.append('NIPHEM')
                verboseprint(
                    genomeName + " has " + str(len(exactMatches)) +
                    " multiple exact match : " + os.path.basename(geneFile) +
                    " MULTIPLE ALLELES as EXACT MATCH")
                continue

            if len(exactMatches) == 1:
                # CDS keys are "&"-separated; field 0 is the contig name and
                # field 2 the "start-end" location.
                contigname = cdsKeys[0].split("&")
                matchLocation = contigname[2].split("-")
                # starting CDS base need to be +1
                matchLocation = [
                    int(matchLocation[0]) + 1,
                    int(matchLocation[1])
                ]
                contigname = contigname[0].replace(">", "")

                alleleName = fullAlleleNameList[fullAlleleList.index(
                    alleleStr)]
                nameFields = alleleName.split("_")
                alleleMatchid = int(nameFields[-1])
                perfectMatchIdAllele.append(str(alleleMatchid))

                strand = "-" if matchLocation[0] > matchLocation[1] else "+"
                perfectMatchIdAllele2.append(
                    str(contigname) + "&" + str(matchLocation[0]) + "-" +
                    str(matchLocation[1]) + "&" + strand)

                # check if attributed allele is contained or contains
                # NOTE(review): inferred alleles are named with tags such as
                # "CD<id>"/"CS<id>", which this equality test does not match
                # -- confirm whether a prefix test was intended.
                containedInfo = nameFields[1] if len(nameFields) > 1 else ''
                if containedInfo in ("CD", "CS"):
                    resultsList.append([
                        genomeName,
                        str(alleleMatchid),
                        containedInfo.rstrip()
                    ])
                continue

        except Exception as e:
            # An unexpected failure used to be skipped silently, leaving no
            # entry for this genome.  Record it the same way the BLAST stage
            # does.  Nothing has been appended for this genome at this point:
            # every statement that can raise runs before the first append.
            print("some error occurred")
            print(e)
            print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno))
            perfectMatchIdAllele2.append("ERROR")
            perfectMatchIdAllele.append("ERROR")
            continue

        # ------------------------------------------------------------------
        # Stage 2: no exact match, BLAST the translated alleles on the genome.
        # ------------------------------------------------------------------
        verboseprint("Blasting alleles on genome at : " + timestamp())

        Gene_Blast_DB_name = os.path.join(
            temppath,
            str(genomeName) + "/" + str(genomeName) + "_db")

        # blast the translated locus (sent through stdin) against the genome
        # CDS database; option names are the ones of BLAST 2.2.28 and up
        cline = NcbiblastpCommandline(cmd=blastPath,
                                      db=Gene_Blast_DB_name,
                                      evalue=0.001,
                                      outfmt=5,
                                      max_target_seqs=10,
                                      max_hsps=10,
                                      num_threads=1)

        out, err = cline(stdin=proteinFastaString)
        blast_records = NCBIXML.parse(StringIO(out))

        verboseprint("Blasted alleles on genome at : " + timestamp())

        # Recomputed for every genome because alleles inferred on previous
        # genomes are appended to fullAlleleList.
        alleleSizes = [len(allele) for allele in fullAlleleList]

        # get mode allele size
        moda = max(set(alleleSizes), key=alleleSizes.count)

        # if most common allele size appears 1 time, get first allele size
        if Counter(alleleSizes).most_common(1)[0][1] == 1:
            moda = alleleSizes[0]

        try:

            # iterate through the blast results
            for blast_record in blast_records:

                # NOTE(review): this list is reset for every BLAST record, so
                # the NIPH test below only sees the hits of the last record
                # -- confirm this is intended.
                locationcontigs = []

                for alignment in blast_record.alignments:

                    for match in alignment.hsps:

                        # query id comes with query_id, not name of the
                        # allele; query_id starts with 1
                        alleleMatchid = int(
                            blast_record.query_id.split("_")[-1])
                        alleleMatchid2 = listShortAllelesNames[
                            alleleMatchid - 1].split("_")[-1]
                        scoreRatio = float(match.score) / float(
                            allelescores[int(alleleMatchid2)])

                        cdsStrName = alignment.title.split(" ")[1]

                        AlleleDNAstr = alleleList[alleleMatchid - 1]
                        verboseprint(str(match))
                        verboseprint("BSR : " + str(scoreRatio))

                        if scoreRatio >= bsrTresh:
                            locationcontigs.append(cdsStrName)

                        # select the best match from BLAST results: either a
                        # higher scoring hit with BSR 1, or a higher scoring
                        # hit above the threshold with a better ratio
                        higherScore = match.score > bestmatch[0]
                        isBetter = higherScore and (
                            scoreRatio == 1 or
                            (scoreRatio >= bsrTresh
                             and scoreRatio > bestmatch[1]
                             and bestmatch[2] is False))
                        if isBetter:
                            bestmatch = [
                                match.score, scoreRatio, False, cdsStrName,
                                alleleMatchid, match,
                                len(AlleleDNAstr)
                            ]

            verboseprint("Classifying the match at : " + timestamp())

            # check for ambiguous bases in the best matching CDS
            listFoundAmbiguities = []
            if not bestmatch[0] == 0:
                alleleStr = currentCDSDict[">" + bestmatch[3]]
                listambiguousBases = [
                    'K', 'M', 'R', 'Y', 'S', 'W', 'B', 'V', 'H', 'D', 'X',
                    'N', '-', '.'
                ]
                listFoundAmbiguities = [
                    e for e in listambiguousBases if e in alleleStr
                ]

            if bestmatch[0] == 0 or len(listFoundAmbiguities) > 0:

                ###################
                # LOCUS NOT FOUND #
                ###################
                perfectMatchIdAllele.append('LNF')
                perfectMatchIdAllele2.append('LNF')
                if bestmatch[0] == 0:
                    verboseprint("Locus not found, no matches \n")
                else:
                    verboseprint("Locus has strange base \n")

            # more than one CDS with a BSR above the threshold
            elif len(set(locationcontigs)) > 1:
                verboseprint("NIPH", "")
                perfectMatchIdAllele.append('NIPH')
                perfectMatchIdAllele2.append('NIPH')
                for elem in locationcontigs:
                    verboseprint(elem)

            # match with BSR above the threshold and not equal DNA sequences
            else:

                # load the contig lengths of the genome to a dictionary
                for contig in SeqIO.parse(genomeFile, "fasta", generic_dna):
                    currentGenomeDict[contig.id] = len(
                        str(contig.seq.upper()))

                match = bestmatch[5]
                geneLen = bestmatch[6]
                alleleStr = currentCDSDict[">" + bestmatch[3]]

                contigname = bestmatch[3].split("&")
                matchLocation = contigname[2].split("-")
                # The end coordinate is kept as text (it is written out
                # unchanged) and converted with int() where it is compared.
                matchLocation = [
                    int(matchLocation[0]) + 1, matchLocation[1]
                ]
                contigname = contigname[0]

                bestMatchContigLen = currentGenomeDict[contigname]

                protSeq, alleleStr = translateSeq(alleleStr)

                # get extra space to the right and left between the allele
                # and match and check if it's still inside the contig
                rightmatchAllele = geneLen - ((int(match.query_end) + 1) * 3)
                leftmatchAllele = (int(match.query_start) - 1) * 3

                Reversed = False
                # if Reversed swap left and right contig extra
                if int(matchLocation[1]) < int(matchLocation[0]):
                    rightmatchContig = bestMatchContigLen - int(
                        matchLocation[0])
                    leftmatchContig = int(matchLocation[1])
                    rightmatchAllele, leftmatchAllele = (leftmatchAllele,
                                                         rightmatchAllele)
                    Reversed = True
                else:
                    rightmatchContig = bestMatchContigLen - int(
                        matchLocation[1])
                    leftmatchContig = int(matchLocation[0])

                ###########################
                # LOCUS ON THE CONTIG TIP #
                ###########################

                # check if contig is smaller than the matched allele
                if (leftmatchContig < leftmatchAllele
                        and rightmatchContig < rightmatchAllele):

                    perfectMatchIdAllele.append('LOTSC')
                    perfectMatchIdAllele2.append('LOTSC')

                    verboseprint(match, contigname, geneFile,
                                 leftmatchAllele, rightmatchAllele,
                                 "Locus is bigger than the contig \n")

                elif leftmatchContig < leftmatchAllele:

                    perfectMatchIdAllele.append('PLOT3')
                    perfectMatchIdAllele2.append('PLOT3')

                    verboseprint(match, contigname, geneFile,
                                 leftmatchAllele, rightmatchAllele,
                                 "Locus is on the 3' tip of the contig \n")

                elif rightmatchContig < rightmatchAllele:

                    perfectMatchIdAllele.append('PLOT5')
                    perfectMatchIdAllele2.append('PLOT5')

                    verboseprint(match, contigname, geneFile,
                                 leftmatchAllele, rightmatchAllele,
                                 "Locus is on the 5' tip of the contig \n")

                elif float(len(alleleStr)) > moda + (moda * sizeTresh):

                    verboseprint("Locus is larger than mode", moda,
                                 alleleStr)

                    perfectMatchIdAllele.append('ALM')
                    perfectMatchIdAllele2.append('ALM')

                elif float(len(alleleStr)) < moda - (moda * sizeTresh):

                    verboseprint("Locus is smaller than mode", moda,
                                 alleleStr)

                    perfectMatchIdAllele.append('ASM')
                    perfectMatchIdAllele2.append('ASM')

                else:
                    #######################
                    # ADD INFERRED ALLELE #		# a new allele
                    #######################

                    # Tag the new allele: 'CD<id>' when it is contained in a
                    # known allele, 'CS<id>' when it contains one, else 'S'.
                    tagAuxC = 'S'
                    for knownName, knownSeq in zip(fullAlleleNameList,
                                                   fullAlleleList):
                        if alleleStr in knownSeq:
                            prefix = 'CD'
                        elif knownSeq in alleleStr:
                            prefix = 'CS'
                        else:
                            continue
                        knownId = knownName.split("_")[-1]
                        tagAuxC = prefix + knownId.rstrip()
                        resultsList.append(
                            [genomeName, str(alleleI + 1), tagAuxC])
                        break

                    tagAux = 'INF'

                    perfectMatchIdAllele.append(tagAux + "-" +
                                                str(alleleI + 1))

                    strand = "-" if Reversed else "+"
                    perfectMatchIdAllele2.append(
                        str(contigname) + "&" + str(matchLocation[0]) + "-" +
                        str(matchLocation[1]) + "&" + strand)

                    verboseprint("New allele! Adding allele " + tagAux +
                                 str(alleleI + 1) + " to the database\n")

                    # --- add the new allele to the gene fasta --- #

                    alleleI += 1
                    locusLabel = os.path.basename(geneFile).split(
                        ".")[0].replace("_", "-")
                    genomeLabel = str(genomeName).replace("_", "-")
                    appendAllele = ('>' + str(locusLabel) + "_" + tagAuxC +
                                    "_" + genomeLabel + "_" + str(alleleI) +
                                    '\n')

                    # The FASTA files are only written once, after all
                    # genomes were processed.
                    newDNAAlleles2Add2Fasta += appendAllele + alleleStr + '\n'

                    fullAlleleList.append(alleleStr)
                    fullAlleleNameList.append(appendAllele)

                    # Alleles whose BSR is within 0.1 of the threshold also
                    # become BLAST queries for the following genomes.
                    if bsrTresh <= float(bestmatch[1]) < bsrTresh + 0.1:

                        newDNAAlleles2Add2shortFasta += (appendAllele +
                                                         alleleStr + '\n')

                        geneTransalatedPath2 = os.path.join(
                            basepath,
                            str(
                                os.path.basename(shortgeneFile) +
                                '_protein2.fasta'))

                        proteinFastaString += '>' + str(
                            alleleI) + '\n' + str(protSeq) + '\n'

                        # --- remake blast DB and recalculate the BSR for
                        # the locus --- #
                        alleleList.append(alleleStr)
                        listShortAllelesNames.append(appendAllele)

                        sequence_2_blast = '>' + str(alleleI) + '\n' + str(
                            protSeq)
                        Gene_Blast_DB_name2 = CommonFastaFunctions.Create_Blastdb_no_fasta(
                            geneTransalatedPath2, 1, True, sequence_2_blast)

                        verboseprint("Re-calculating BSR at : " + timestamp())
                        allelescores, alleleList, listShortAllelesNames = reDogetBlastScoreRatios(
                            sequence_2_blast, basepath, alleleI,
                            allelescores, Gene_Blast_DB_name2, alleleList,
                            geneScorePickle, verbose, blastPath,
                            listShortAllelesNames)
                        verboseprint("Done Re-calculating BSR at : " +
                                     timestamp())

        except Exception as e:
            # Best effort: report the problem and flag this genome instead
            # of aborting the whole locus.
            print("some error occurred")
            print(e)
            print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno))
            perfectMatchIdAllele2.append("ERROR")
            perfectMatchIdAllele.append("ERROR")

    # add new alleles to the locus fasta file

    if len(newDNAAlleles2Add2Fasta) > 5:
        with open(geneFile, 'a') as fG:
            fG.write(newDNAAlleles2Add2Fasta)
    if len(newDNAAlleles2Add2shortFasta) > 5:
        with open(shortgeneFile, 'a') as fG:
            fG.write(newDNAAlleles2Add2shortFasta)

    final = (resultsList, perfectMatchIdAllele)
    verboseprint("Finished allele calling at : " + timestamp())
    filepath = os.path.join(temppath,
                            os.path.basename(geneFile) + "_result.txt")
    filepath2 = os.path.join(temppath,
                             os.path.basename(geneFile) + "_result2.txt")
    with open(filepath, 'wb') as f:
        pickle.dump(final, f)
    with open(filepath2, 'wb') as f:
        pickle.dump(perfectMatchIdAllele2, f)
    # the per-locus working directory is no longer needed
    shutil.rmtree(basepath)
    return True
Ejemplo n.º 2
0
def getBlastScoreRatios(genefile, basepath, doAll, verbose, blastPath):
    """Return the self-BLAST score of every allele of a locus.

    Each allele of ``genefile`` is translated; the protein is used both as
    BLAST database and as query, and the resulting score is stored under the
    allele's numeric id (last "_"-separated field of the record id).  The
    scores are cached in ``<abspath of genefile>_bsr.txt`` (a pickle) and all
    translated alleles are written to
    ``<basepath>/<basename of genefile>_protein.fasta``.

    Args:
        genefile: Path of the FASTA file with the alleles of the locus.
        basepath: Directory that receives the protein FASTA files.
        doAll: When true the scores are computed with BLAST and the cache
            file is (re)written; otherwise they are loaded from the cache.
        verbose: When true, progress messages are printed.
        blastPath: Passed as ``cmd`` to ``NcbiblastpCommandline``.

    Returns:
        A tuple ``(scores, alleleList, listAllelesNames)``: a dict mapping
        allele id to score, the upper-cased DNA sequence of every allele and
        the record id of every allele (both in file order).
    """
    if verbose:

        def verboseprint(*args):
            # one line per call (the old `print(arg),` idiom is Python 2 only)
            print(*args)
    else:

        def verboseprint(*args):
            return None  # do-nothing function

    geneScorePickle = os.path.abspath(genefile) + '_bsr.txt'
    singleProteinPath = os.path.join(
        basepath, str(os.path.basename(genefile) + '_protein2.fasta'))

    # Scores keyed by allele id.  Initialised up front so the function also
    # returns a value when no allele could be translated.
    var = {}
    if not doAll:
        # bsr had already been calculated, load it to memory (once)
        # NOTE(review): pickle can execute arbitrary code while loading; the
        # cache file must come from a trusted source.
        with open(geneScorePickle, 'rb') as f:
            var = pickle.load(f)

    proteinRecords = []
    alleleList = []
    listAllelesNames = []

    for allele in SeqIO.parse(genefile, "fasta", generic_dna):

        # usually first allele name is just >1 and after that it has
        # >gene_id_genome; in both cases the last field is the allele number
        alleleI = int(allele.id.split("_")[-1])

        alleleSeq = str(allele.seq.upper())
        alleleList.append(alleleSeq)
        listAllelesNames.append(allele.id)

        # try to translate the allele
        translatedSequence, x = translateSeq(alleleSeq)

        if translatedSequence == '':
            print("cannot translate allele on bsr calculation")
            continue

        alleleProt = ">" + str(alleleI) + "\n" + str(translatedSequence +
                                                     "\n")
        proteinRecords.append(alleleProt)

        # new db for each allele to blast it against itself
        # NOTE(review): this is also done when the scores come from the
        # cache; kept because the helper's side effects are not visible here.
        Gene_Blast_DB_name = CommonFastaFunctions.Create_Blastdb_no_fasta(
            singleProteinPath, 1, True, alleleProt)

        if not doAll:
            continue

        verboseprint("Starting Blast alleles at : " +
                     time.strftime("%H:%M:%S-%d/%m/%Y"))

        # --- get BLAST score ratio --- #
        cline = NcbiblastpCommandline(cmd=blastPath,
                                      db=Gene_Blast_DB_name,
                                      evalue=0.001,
                                      outfmt=5,
                                      num_threads=1)
        out, err = cline(stdin=alleleProt)
        blast_records = NCBIXML.parse(StringIO(out))

        verboseprint("Blasted alleles at : " +
                     time.strftime("%H:%M:%S-%d/%m/%Y"))

        hspScores = [
            int(match.score) for blast_record in blast_records
            for alignment in blast_record.alignments
            for match in alignment.hsps
        ]
        # Store the score under this allele's own id.  Zipping a flat score
        # list with the id list paired scores with the wrong allele whenever
        # an allele was untranslatable or produced zero or several HSPs.
        if hspScores:
            var[alleleI] = max(hspScores)

        verboseprint("________")

    # Write the cache once, and only when there is something to cache.
    if doAll and var:
        with open(geneScorePickle, 'wb') as f:
            pickle.dump(var, f)

    proteinfastaPath = os.path.join(
        basepath, str(os.path.basename(genefile) + '_protein.fasta'))
    with open(proteinfastaPath, "w") as f:
        f.write("".join(proteinRecords))

    # returning all allele BSR scores and list of alleles for this gene
    return var, alleleList, listAllelesNames