def _read_genome_list(genomeFiles):
    """Return the paths listed one per line in *genomeFiles*, sorted case-insensitively."""
    with open(genomeFiles, 'r') as handle:
        genomes = [line.rstrip('\n').rstrip('\r') for line in handle]
    genomes.sort(key=str.lower)
    return genomes


def _pair_genomes(genomes, first_pair_id):
    """Group consecutive entries of *genomes* two by two.

    Returns a tuple ``(pairs, leftover, next_pair_id)`` where ``pairs`` maps
    a running pair id (starting at *first_pair_id*) to a two-element list,
    ``leftover`` is a list holding the unpaired last entry (empty when the
    input length is even) and ``next_pair_id`` is the first unused id.
    """
    pairs = {}
    pair_id = first_pair_id
    for index in range(0, len(genomes) - 1, 2):
        pairs[pair_id] = [genomes[index], genomes[index + 1]]
        pair_id += 1
    leftover = [genomes[-1]] if len(genomes) % 2 else []
    return pairs, leftover, pair_id


def main(genomeFiles, cpuToUse, outputFile, bsr, BlastpPath, min_length,
         verbose, chosenTrainingFile, inputCDS, translation_table, st):
    """Create a schema from the genomes (or CDS file) listed in *genomeFiles*.

    Unless *inputCDS* is True, Prodigal is run on every listed genome, the
    genomes are then merged pairwise through ``checkGeneStrings`` into
    "protogenome" FASTA files until a single file is left, and that file is
    handed to ``CreateSchema.main``.

    Args:
        genomeFiles: Path to a text file with one input path per line.
        cpuToUse: Number of worker processes used for the process pools.
        outputFile: Forwarded to ``CreateSchema.main``; a ``temp`` working
            directory is created in its parent directory and removed at
            the end of the run.
        bsr: Forwarded to ``checkGeneStrings`` and ``CreateSchema.main``.
        BlastpPath: BLASTp executable, checked with ``which`` and forwarded.
        min_length: Forwarded to ``CreateSchema.main``.
        verbose: When truthy, progress messages are printed.
        chosenTrainingFile: Path to a Prodigal training file; any non-str
            value means no training file is used.
        inputCDS: When True, the first listed file is passed directly to
            ``CreateSchema.main`` and Prodigal/pairing are skipped.
        translation_table: Forwarded to ``runProdigal.main``.
        st: Unused by this function.

    Returns:
        ``"retry"`` when the training file does not exist, ``True`` when
        *inputCDS* is True, otherwise ``None``.

    Raises:
        ValueError: If fewer ``*_ORF.txt`` files than genomes are found in
            the working directory after the Prodigal stage.
    """
    if verbose:
        def verboseprint(*args):
            for arg in args:
                print(arg, end="")
            # Terminate the message; a bare `print` is a no-op in Python 3.
            print()
    else:
        def verboseprint(*args):
            """Discard the message when verbose output is disabled."""

    # Warn when nearly all cores are requested: this can make the machine
    # unusable for anything else, e.g. when running on a laptop.
    if cpuToUse > multiprocessing.cpu_count() - 2:
        print('\nWARNING: you provided a --cpu value close to the '
              'maximum number of available CPU cores.\n'
              'This might degrade system performance and lead '
              'to system unresponsiveness.\n')
        time.sleep(2)

    if isinstance(chosenTrainingFile, str):
        chosenTaxon = os.path.abspath(chosenTrainingFile)
        if not os.path.isfile(chosenTaxon):
            print("Training file does not exist " + chosenTaxon)
            return "retry"
        print("Prodigal training file: " + chosenTaxon)
    else:
        chosenTaxon = False

    print("Number of CPU cores: " + str(cpuToUse))

    print("\nChecking dependencies...")
    print("Blast installation..." + str(which(str(BlastpPath))))
    print("Prodigal installation..." + str(which('prodigal')))

    start_date = dt.datetime.now()
    start_date_str = dt.datetime.strftime(start_date, '%H:%M:%S-%d/%m/%Y')
    print('\nStarted at: {0}\n'.format(start_date_str))

    listOfGenomes = _read_genome_list(genomeFiles)

    # Temporary working directory next to the output file; it is removed
    # once the run finishes (or when Prodigal output is incomplete).
    basepath = os.path.join(os.path.dirname(outputFile), "temp")
    if not os.path.exists(basepath):
        os.makedirs(basepath)

    # ------------------------------------------------- #
    #           RUN PRODIGAL OVER ALL GENOMES           #
    # ------------------------------------------------- #

    if inputCDS is True:
        CreateSchema.main(listOfGenomes[0], min_length, cpuToUse, False,
                          outputFile, BlastpPath, bsr, verbose)
        shutil.rmtree(basepath)
        return True

    elif inputCDS is False:
        print("Starting Prodigal at: " + time.strftime("%H:%M:%S-%d/%m/%Y"))

        # One Prodigal job per genome, spread over `cpuToUse` workers.
        pool = multiprocessing.Pool(cpuToUse)
        for genome in listOfGenomes:
            pool.apply_async(runProdigal.main,
                             (str(genome), basepath, str(chosenTaxon),
                              translation_table))
        pool.close()
        pool.join()

        print("Finishing Prodigal at: " + time.strftime("%H:%M:%S-%d/%m/%Y"))

        print("\nChecking if Prodigal created all the necessary files...")
        orf_files_created = sum(1 for name in os.listdir(basepath)
                                if name.endswith("_ORF.txt"))

        if len(listOfGenomes) > orf_files_created:
            message = ("Missing some files from prodigal. "
                       + str(len(listOfGenomes) - orf_files_created)
                       + " missing files out of " + str(len(listOfGenomes)))
            shutil.rmtree(basepath)
            raise ValueError(message)
        else:
            print("All files were created.\n")

    createSchemaPath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), '')

    # ---CDS to protein---#

    # Header of the protein-id table, written to the current directory.
    with open("proteinID_Genome.tsv", 'w') as f:
        f.write("Genome\tcontig\tStart\tStop\tprotID")

    # Merge genomes two by two; every merged pair yields a "protogenome"
    # that joins the next round, until a single file remains.
    pairID = 0
    while len(listOfGenomes) > 0:
        dictPairs, listOfGenomes, pairID = _pair_genomes(listOfGenomes,
                                                         pairID)
        numberOfPairs = len(dictPairs)

        # NOTE(review): with a single input genome there are no pairs and
        # Pool(0) raises ValueError -- confirm how that case should be handled.
        pool = multiprocessing.Pool(min(cpuToUse, numberOfPairs))

        # Cores not needed as pool workers are shared among the pair jobs.
        extraCpu = max(cpuToUse - numberOfPairs, 0)
        cpuPerProcess = extraCpu // numberOfPairs + 1

        for pair_key, (first_genome, second_genome) in dictPairs.items():
            newgGenome = "protogenome" + str(pair_key)
            listOfGenomes.append(
                os.path.join(basepath, newgGenome, newgGenome + ".fasta"))

            print("Running analysis for pair: " + str(first_genome)
                  + " " + str(second_genome))
            pool.apply_async(checkGeneStrings,
                             args=[first_genome, second_genome, newgGenome,
                                   basepath, cpuPerProcess, BlastpPath,
                                   createSchemaPath, verbose, bsr])

        pool.close()
        pool.join()

        if len(listOfGenomes) == 1:
            verboseprint("___________________\nFinal step : creating the schema")
            lastFile = listOfGenomes.pop()
            CreateSchema.main(lastFile, min_length, cpuToUse, False,
                              outputFile, BlastpPath, bsr, verbose)
            verboseprint("Schema Created sucessfully")

    shutil.rmtree(basepath)

    end_date = dt.datetime.now()
    end_date_str = dt.datetime.strftime(end_date, '%H:%M:%S-%d/%m/%Y')

    delta = end_date - start_date
    minutes, seconds = divmod(delta.total_seconds(), 60)

    print('\nFinished at: {0}'.format(end_date_str))
    print('Elapsed time: {0:.0f}m{1:.0f}s'.format(minutes, seconds))