Example #1
0
def main(genomeFiles, cpuToUse, outputFile, bsr, BlastpPath, min_length,
         verbose, chosenTrainingFile, inputCDS, prodigal_mode):
    """Build a schema from a list of genomes (or from a single CDS file).

    When ``inputCDS`` is false, Prodigal is run on every listed genome, the
    genomes are then merged two by two into "protogenome" files with
    ``checkGeneStrings`` until one file is left, and that file is passed to
    ``CreateSchema.main``. When ``inputCDS`` is true, the first listed file is
    passed to ``CreateSchema.main`` directly.

    Args:
        genomeFiles: Path of a text file with one input FASTA path per line.
        cpuToUse: Number of worker processes to use.
        outputFile: Output location forwarded to ``CreateSchema.main``. A
            ``temp`` directory is created beside it and deleted at the end.
        bsr: Minimum BSR similarity, forwarded to the schema creation steps.
        BlastpPath: BLASTp executable, forwarded to the schema creation steps.
        min_length: Minimum locus length forwarded to ``CreateSchema.main``.
        verbose: When true, extra progress messages are printed.
        chosenTrainingFile: Path of a Prodigal training file. Any non-string
            value means "no training file".
        inputCDS: True to skip Prodigal and use the first listed file as CDS
            input; False to run Prodigal on every listed genome.
        prodigal_mode: Forwarded unchanged to ``runProdigal.main``.

    Returns:
        ``"retry"`` when the training file does not exist, ``True`` when
        ``inputCDS`` is true, ``None`` otherwise.

    Raises:
        ValueError: If Prodigal produced fewer ORF files than genomes, or if
            only one genome is available so no pair can be formed.
    """
    if verbose:

        def verboseprint(*args):
            for arg in args:
                print(arg, end="")
            # A bare `print` is a no-op expression; call it to end the line.
            print()
    else:
        verboseprint = lambda *a: None  # do-nothing function

    # avoid user to run the script with all cores available, could impossibilitate any usage when running on a laptop
    if cpuToUse > multiprocessing.cpu_count() - 2:
        print(
            "Warning, you are close to use all your cpus, if you are using a laptop you may be uncapable to perform any action"
        )

    if isinstance(chosenTrainingFile, str):
        chosenTaxon = os.path.abspath(chosenTrainingFile)
        if not os.path.isfile(chosenTaxon):
            print("training file don't exist " + chosenTaxon)
            return "retry"
        print("will use this training file : " + chosenTaxon)
    else:
        chosenTaxon = False

    print("Will use this number of cpus: " + str(cpuToUse))
    print("Checking all programs are installed")

    print("Checking Prodigal installed... " + str(which('prodigal')))

    starttime = "\nStarting Script at : " + time.strftime("%H:%M:%S-%d/%m/%Y")
    print(starttime)

    # Blank lines in the list file are skipped: an empty path is never a
    # valid genome and would only make a later step fail.
    with open(genomeFiles, 'r') as fp:
        listOfGenomes = [
            line.rstrip('\r\n') for line in fp if line.strip()
        ]
    listOfGenomes.sort(key=lambda y: y.lower())

    # Working directory for intermediate files, created next to the output.
    basepath = os.path.join((os.path.dirname(outputFile)), "temp")
    if not os.path.exists(basepath):
        os.makedirs(basepath)

    # ------------------------------------------------- #
    #           RUN PRODIGAL OVER ALL GENOMES           #
    # ------------------------------------------------- #

    if inputCDS == True:
        CreateSchema.main(listOfGenomes[0], min_length, cpuToUse, False,
                          outputFile, BlastpPath, bsr, verbose)
        shutil.rmtree(basepath)
        return True

    elif inputCDS == False:

        print("\nStarting Prodigal at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

        # One Prodigal task per genome, spread over the requested workers.
        print("chosen taxon :" + str(chosenTaxon))
        pool = multiprocessing.Pool(cpuToUse)
        for genome in listOfGenomes:
            pool.apply_async(
                runProdigal.main,
                (str(genome), basepath, str(chosenTaxon), prodigal_mode))

        pool.close()
        pool.join()

        print(
            "\nChecking all prodigal processes created the necessary files...")

        # Worker results are not inspected; success is judged by the number
        # of "*_ORF.txt" files found in the temp directory.
        listOfORFCreated = [
            orffile for orffile in os.listdir(basepath)
            if orffile.endswith("_ORF.txt")
        ]

        if len(listOfGenomes) > len(listOfORFCreated):
            message = "Missing some files from prodigal. " + str(
                (len(listOfGenomes)) -
                (len(listOfORFCreated))) + " missing files out of " + str(
                    len(listOfGenomes))
            shutil.rmtree(basepath)
            raise ValueError(message)
        else:
            print("All prodigal files necessary were created\n")

        print("Finishing Prodigal at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    createSchemaPath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                    'CreateSchema.py')

    # ---CDS to protein---#

    # Merge the inputs two by two; each round replaces every pair with one
    # protogenome file until a single file is left for the final schema.
    pairID = 0
    with open("proteinID_Genome.tsv", 'w') as f:
        f.write("Genome\tcontig\tStart\tStop\tprotID")

    while len(listOfGenomes) > 0:

        # Consecutive files form a pair; an odd trailing file is carried
        # over (first in line) to the next round.
        dictPairs = {}
        for index in range(0, len(listOfGenomes) - 1, 2):
            dictPairs[pairID] = listOfGenomes[index:index + 2]
            pairID += 1
        unpaired = listOfGenomes[-1:] if len(listOfGenomes) % 2 else []
        listOfGenomes = unpaired

        numberOfPairs = len(dictPairs)
        if numberOfPairs == 0:
            # Only reachable when a single genome was provided. A pool of
            # zero workers would fail with an unhelpful message, so fail
            # explicitly (same exception type) and clean up first.
            shutil.rmtree(basepath)
            raise ValueError(
                "At least two genomes are needed to build pairs, but only "
                "one was provided: " + str(unpaired[0]))

        # Never start more workers than pairs; spare cpus are shared
        # evenly between the pair analyses instead.
        pool = multiprocessing.Pool(min(cpuToUse, numberOfPairs))
        extraCpu = max(cpuToUse - numberOfPairs, 0)
        cpuPerPair = extraCpu // numberOfPairs + 1

        submitted = []
        for k, v in dictPairs.items():
            newgGenome = "protogenome" + str(k)
            pathFornewgGenome = os.path.join(basepath, newgGenome,
                                             newgGenome + ".fasta")
            listOfGenomes.append(pathFornewgGenome)
            print("running analysis for pair : " + str(v[0]) + " " + str(v[1]))
            outcome = pool.apply_async(checkGeneStrings,
                                       args=[
                                           v[0], v[1], newgGenome, basepath,
                                           cpuPerPair, BlastpPath,
                                           createSchemaPath, verbose, bsr
                                       ])
            submitted.append((v, outcome))

        pool.close()
        pool.join()

        # checkGeneStrings returns True on success and the caught exception
        # otherwise; report failures instead of dropping them silently.
        for v, outcome in submitted:
            try:
                status = outcome.get()
            except Exception as error:
                status = error
            if status is not True:
                print("Warning: analysis failed for pair " + str(v[0]) + " " +
                      str(v[1]) + ": " + repr(status))

        verboseprint("finished running pair analysis")

        if len(listOfGenomes) == 1:

            verboseprint(
                "___________________\nFinal step : creating the schema")
            lastFile = listOfGenomes.pop()
            CreateSchema.main(lastFile, min_length, cpuToUse, False,
                              outputFile, BlastpPath, bsr, verbose)
            verboseprint("Schema Created sucessfully")

    shutil.rmtree(basepath)

    print(starttime)
    print("Finished Script at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
Example #2
0
def checkGeneStrings(genome1, genome2, newName, basepath, cpu, blastp,
                     createSchemaPath, verbose, bsr):
    """Merge the coding sequences of two inputs into one protogenome.

    For each input FASTA, the CDS coordinates are read from the pickled
    ``<basename>_ORF.txt`` file in ``basepath`` when it can be loaded;
    otherwise every FASTA record is treated as one CDS. Each CDS is
    translated with ``translateSeq``. Translations shorter than 67
    characters and exact duplicates are dropped, and the remaining DNA
    sequences are appended to ``<basepath>/<newName>/<newName>.fasta``.
    Proteins contained in a longer kept protein are then dropped, the rest is
    written to ``<newName>_proteins.fasta`` and ``CreateSchema.main`` is run
    on both files. Coordinates of extracted CDSs are appended to
    ``proteinID_Genome.tsv`` in the current working directory.

    Args:
        genome1: Path of the first input FASTA file.
        genome2: Path of the second input FASTA file.
        newName: Name of the protogenome (used for folder and file names).
        basepath: Temp directory holding the ORF files and the outputs.
        cpu: Number of cpus forwarded to ``CreateSchema.main``.
        blastp: BLASTp executable forwarded to ``CreateSchema.main``.
        createSchemaPath: Not used by this function; kept so existing
            callers keep working.
        verbose: When true, progress messages are printed.
        bsr: BSR value forwarded to ``CreateSchema.main``.

    Returns:
        ``True`` on success, or the caught exception instance when any step
        after the output folder creation fails.
    """
    if verbose:

        def verboseprint(*args):
            for arg in args:
                print(arg, end="")
            # A bare `print` is a no-op expression; call it to end the line.
            print()
    else:
        verboseprint = lambda *a: None  # do-nothing function

    pathForTemp = os.path.join(basepath, newName)
    if not os.path.exists(pathForTemp):
        os.makedirs(pathForTemp)

    # Shortest translated sequence that is kept.
    # NOTE(review): presumably tied to the 200 bp minimum length passed to
    # CreateSchema.main below -- confirm.
    min_protein_length = 67

    dictprots = {}  # protid -> translated sequence
    dictprotsLen = {}  # translated length -> protids with that length
    dictprotsName = {}  # protid -> FASTA header used in the protein file
    dna_records = []  # FASTA text of every kept DNA sequence
    proteinsEqual = 0
    smallProteins = 0
    protid = 0

    try:

        for genomeFile in (genome1, genome2):

            genome_basename = os.path.basename(genomeFile)
            genomename = genome_basename.split(".")[0]
            filepath = os.path.join(basepath,
                                    str(genome_basename) + "_ORF.txt")

            currentGenomeDict = {}
            for contig in SeqIO.parse(genomeFile, "fasta", generic_dna):
                currentGenomeDict[contig.id] = str(contig.seq.upper())

            # after the first iteration, genomes are already defined by their cds and no longer have a cds dictionary pickle file
            # NOTE(review): pickle.load must only ever see files written by
            # this pipeline; never point basepath at untrusted data.
            try:
                with open(filepath, 'rb') as f:
                    currentCDSDict = pickle.load(f)
            except Exception:
                currentCDSDict = {
                    k: [[v]] for k, v in currentGenomeDict.items()
                }

            tsv_rows = []

            for contigTag, value in currentCDSDict.items():

                for protein in value:
                    protid += 1

                    # at first iteration we use the genome file and after a cds only multifasta file
                    try:
                        seq = currentGenomeDict[contigTag][
                            protein[0]:protein[1]].upper()
                        tsv_rows.append("\n" + '\t'.join([
                            genomename, contigTag,
                            str(protein[0]),
                            str(protein[1]),
                            str(protid)
                        ]))
                    except Exception:
                        seq = str(protein[0])

                    try:
                        protseq, orderedSeq = translateSeq(seq)
                        lengthofProt = len(str(protseq))
                    except Exception:
                        verboseprint(str(genome1) + " " + str(genome2))
                        # Skip this CDS: falling through would re-evaluate
                        # the previous protein and inflate the counters.
                        continue

                    if lengthofProt < min_protein_length:
                        smallProteins += 1
                        continue

                    # Only proteins of the same length can be identical.
                    same_length_ids = dictprotsLen.get(lengthofProt, [])
                    if any(protseq == dictprots[other]
                           for other in same_length_ids):
                        proteinsEqual += 1
                        continue

                    dna_header, protein_header = _protein_headers(
                        genomename, contigTag, protein, protid)

                    dictprotsLen.setdefault(lengthofProt, []).append(protid)
                    dictprots[protid] = protseq
                    dictprotsName[protid] = protein_header
                    dna_records.append(dna_header + "\n" + str(orderedSeq) +
                                       "\n")

            # Drop the per-genome data before the next (large) genome loads.
            currentGenomeDict = None
            currentCDSDict = None
            with open("proteinID_Genome.tsv", 'a') as f:
                f.write("".join(tsv_rows))

        verboseprint("Checked equal proteins for: " + str(genome1) + " " +
                     str(genome2))
        verboseprint("Starting with a total of loci: " + str(protid))
        verboseprint("equal proteins : " + str(proteinsEqual))
        verboseprint("small proteins : " + str(smallProteins))

        fastaFile = os.path.join(pathForTemp, newName + ".fasta")
        with open(fastaFile, 'a') as f:
            f.write("".join(dna_records))
        dna_records = []

        # check if any protein is substring of a larger, mantaining the larger one
        # ordering the sequences by length, larger ones first
        verboseprint("Looking for contained proteins in : " + str(genome1) +
                     " " + str(genome2))

        kept_proteins = []
        protein_records = []
        contained = 0
        finalnumber = 0

        for length in sorted(dictprotsLen, key=int, reverse=True):
            for kept_id in dictprotsLen[length]:
                candidate = str(dictprots[kept_id])

                # Everything kept so far is at least as long as candidate.
                if any(candidate in longer for longer in kept_proteins):
                    contained += 1
                    continue

                protein_records.append(dictprotsName[kept_id] + "\n" +
                                       candidate + "\n")
                finalnumber += 1
                kept_proteins.append(candidate)

        # Release the lookup tables before the schema creation step runs.
        kept_proteins = []
        dictprots = {}
        dictprotsLen = {}
        dictprotsName = {}

        verboseprint("number of contained proteins : " + str(contained))
        verboseprint("total of loci to blast : " + str(finalnumber))

        proteinFile = os.path.join(pathForTemp, newName + "_proteins.fasta")

        with open(proteinFile, 'a') as f:
            f.write("".join(protein_records))
        protein_records = []

        # run createschema for the final protogenome
        verboseprint("running blast will use this number of cpu: " + str(cpu))
        CreateSchema.main(fastaFile, 200, cpu, proteinFile, fastaFile, blastp,
                          bsr, verbose)
        verboseprint("finished blast")

        os.remove(proteinFile)

    except Exception as e:
        # Runs in a worker process: hand the error back to the caller
        # instead of raising.
        verboseprint(e)
        return e

    return True


def _protein_headers(genomename, contigTag, protein, protid):
    """Return the (DNA, protein) FASTA headers for one CDS entry."""
    try:
        protein[1]
    except Exception:
        # No second element: the entry came from a CDS-only FASTA, so the
        # record's own identifier is reused.
        return ">" + str(contigTag), ">" + str((contigTag.split(" "))[0])
    header = ">" + str(genomename) + "|protein" + str(protid)
    return header, header
Example #3
0
def main(genomeFiles, cpuToUse, outputFile, bsr, BlastpPath, min_length,
         verbose, chosenTrainingFile, inputCDS, translation_table, st):
    """Build a schema from a list of genomes (or from a single CDS file).

    When ``inputCDS`` is False, Prodigal is run on every listed genome, the
    genomes are then merged two by two into "protogenome" files with
    ``checkGeneStrings`` until one file is left, and that file is passed to
    ``CreateSchema.main``. When ``inputCDS`` is True, the first listed file is
    passed to ``CreateSchema.main`` directly.

    Args:
        genomeFiles: Path of a text file with one input FASTA path per line.
        cpuToUse: Number of worker processes to use.
        outputFile: Output location forwarded to ``CreateSchema.main``. A
            ``temp`` directory is created beside it and deleted at the end.
        bsr: BSR value forwarded to the schema creation steps.
        BlastpPath: BLASTp executable, forwarded to the schema creation steps.
        min_length: Minimum length forwarded to ``CreateSchema.main``.
        verbose: When true, extra progress messages are printed.
        chosenTrainingFile: Path of a Prodigal training file. Any non-string
            value means "no training file".
        inputCDS: True to skip Prodigal and use the first listed file as CDS
            input; False to run Prodigal on every listed genome.
        translation_table: Forwarded unchanged to ``runProdigal.main``.
        st: Not used by this function.

    Returns:
        ``"retry"`` when the training file does not exist, ``True`` when
        ``inputCDS`` is True, ``None`` otherwise.

    Raises:
        ValueError: If Prodigal produced fewer ORF files than genomes, or if
            only one genome is available so no pair can be formed.
    """
    if verbose:
        def verboseprint(*args):
            for arg in args:
                print(arg, end="")
            # A bare `print` is a no-op expression; call it to end the line.
            print()
    else:
        verboseprint = lambda *a: None  # do-nothing function

    # avoid user to run the script with all cores available, could impossibilitate any usage when running on a laptop
    if cpuToUse > multiprocessing.cpu_count() - 2:
        print('\nWARNING: you provided a --cpu value close to the '
              'maximum number of available CPU cores.\n'
              'This might degrade system performance and lead '
              'to system unresponsiveness.\n')
        time.sleep(2)

    if isinstance(chosenTrainingFile, str):
        chosenTaxon = os.path.abspath(chosenTrainingFile)
        if not os.path.isfile(chosenTaxon):
            print("Training file does not exist "+chosenTaxon)
            return "retry"
        print("Prodigal training file: " + chosenTaxon)
    else:
        chosenTaxon = False

    print("Number of CPU cores: " + str(cpuToUse))

    print("\nChecking dependencies...")
    print("Blast installation..." + str(which(str(BlastpPath))))
    print("Prodigal installation..." + str(which('prodigal')))

    start_date = dt.datetime.now()
    start_date_str = dt.datetime.strftime(start_date, '%H:%M:%S-%d/%m/%Y')
    print('\nStarted at: {0}\n'.format(start_date_str))

    # Blank lines in the list file are skipped: an empty path is never a
    # valid genome and would only make a later step fail.
    with open(genomeFiles, 'r') as fp:
        listOfGenomes = [
            line.rstrip('\r\n') for line in fp if line.strip()
        ]
    listOfGenomes.sort(key=lambda y: y.lower())

    # Working directory for intermediate files, created next to the output.
    basepath = os.path.join((os.path.dirname(outputFile)), "temp")
    if not os.path.exists(basepath):
        os.makedirs(basepath)

    # ------------------------------------------------- #
    #           RUN PRODIGAL OVER ALL GENOMES           #
    # ------------------------------------------------- #

    if inputCDS is True:
        CreateSchema.main(listOfGenomes[0], min_length, cpuToUse, False, outputFile, BlastpPath, bsr, verbose)
        shutil.rmtree(basepath)
        return True

    elif inputCDS is False:

        print("Starting Prodigal at: " + time.strftime("%H:%M:%S-%d/%m/%Y"))

        # One Prodigal task per genome, spread over the requested workers.
        pool = multiprocessing.Pool(cpuToUse)
        for genome in listOfGenomes:
            pool.apply_async(runProdigal.main, (str(genome), basepath, str(chosenTaxon), translation_table))

        pool.close()
        pool.join()

        print("Finishing Prodigal at: " + time.strftime("%H:%M:%S-%d/%m/%Y"))

        print("\nChecking if Prodigal created all the necessary files...")
        # Worker results are not inspected; success is judged by the number
        # of "*_ORF.txt" files found in the temp directory.
        listOfORFCreated = [
            orffile for orffile in os.listdir(basepath)
            if orffile.endswith("_ORF.txt")
        ]

        if len(listOfGenomes) > len(listOfORFCreated):
            message = "Missing some files from prodigal. " + str(
                (len(listOfGenomes)) - (len(listOfORFCreated))) + " missing files out of " + str(len(listOfGenomes))
            shutil.rmtree(basepath)
            raise ValueError(message)
        else:
            print("All files were created.\n")

    createSchemaPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'CreateSchema.py')

    # ---CDS to protein---#

    # Merge the inputs two by two; each round replaces every pair with one
    # protogenome file until a single file is left for the final schema.
    pairID = 0
    with open("proteinID_Genome.tsv", 'w') as f:
        f.write("Genome\tcontig\tStart\tStop\tprotID")

    while len(listOfGenomes) > 0:

        # Consecutive files form a pair; an odd trailing file is carried
        # over (first in line) to the next round.
        dictPairs = {}
        for index in range(0, len(listOfGenomes) - 1, 2):
            dictPairs[pairID] = listOfGenomes[index:index + 2]
            pairID += 1
        unpaired = listOfGenomes[-1:] if len(listOfGenomes) % 2 else []
        listOfGenomes = unpaired

        numberOfPairs = len(dictPairs)
        if numberOfPairs == 0:
            # Only reachable when a single genome was provided. A pool of
            # zero workers would fail with an unhelpful message, so fail
            # explicitly (same exception type) and clean up first.
            shutil.rmtree(basepath)
            raise ValueError(
                "At least two genomes are needed to build pairs, but only "
                "one was provided: " + str(unpaired[0]))

        # Never start more workers than pairs; spare cpus are shared
        # evenly between the pair analyses instead.
        pool = multiprocessing.Pool(min(cpuToUse, numberOfPairs))
        extraCpu = max(cpuToUse - numberOfPairs, 0)
        cpuPerPair = extraCpu // numberOfPairs + 1

        submitted = []
        for k, v in dictPairs.items():
            newgGenome = "protogenome" + str(k)
            pathFornewgGenome = os.path.join(basepath, newgGenome, newgGenome + ".fasta")
            listOfGenomes.append(pathFornewgGenome)
            print("Running analysis for pair: " + str(v[0]) + " " + str(v[1]))
            outcome = pool.apply_async(checkGeneStrings,
                                       args=[v[0], v[1], newgGenome, basepath, cpuPerPair, BlastpPath,
                                             createSchemaPath, verbose, bsr])
            submitted.append((v, outcome))

        pool.close()
        pool.join()

        # checkGeneStrings returns True on success and the caught exception
        # otherwise; report failures instead of dropping them silently.
        for v, outcome in submitted:
            try:
                status = outcome.get()
            except Exception as error:
                status = error
            if status is not True:
                print("Warning: analysis failed for pair " + str(v[0]) + " " +
                      str(v[1]) + ": " + repr(status))

        if len(listOfGenomes) == 1:

            verboseprint("___________________\nFinal step : creating the schema")
            lastFile = listOfGenomes.pop()

            CreateSchema.main(lastFile, min_length, cpuToUse, False,
                              outputFile, BlastpPath, bsr, verbose)

            verboseprint("Schema Created sucessfully")

    shutil.rmtree(basepath)

    end_date = dt.datetime.now()
    end_date_str = dt.datetime.strftime(end_date, '%H:%M:%S-%d/%m/%Y')

    # Round to whole seconds before splitting, otherwise 119.7 s would be
    # reported as "1m60s".
    elapsed_seconds = int(round((end_date - start_date).total_seconds()))
    minutes, seconds = divmod(elapsed_seconds, 60)

    print('\nFinished at: {0}'.format(end_date_str))
    print('Elapsed time: {0:.0f}m{1:.0f}s'.format(minutes, seconds))