def multiSplit(files, globs):
# Given a file and a number of splits (in this case, the number of processors),
# this function splits the file into files with equal numbers of lines.
    import math

    file_info = files[1]

    RC.printWrite(globs['logfilename'], globs['log-v'], "+ Making tmp directory: " + globs['tmpdir'])
    os.makedirs(globs['tmpdir'])
    # Make the temporary directory to store the split files and split outputs.

    new_files = {}
    # The dictionary for the new temporary files.

    tmpfiles = [ os.path.join(globs['tmpdir'], str(i) + "-chunk.txt") for i in range(globs['num-procs']) ]
    # Generate the names of the tmp input files.

    num_lines = RC.getFileLen(file_info['in'])
    linespersplit = int(math.ceil(num_lines / float(globs['num-procs'])))
    # Count the number of lines in the input file and get the number of lines per split.

    with RC.getFileReader(file_info['in'])(file_info['in'], "r") as infile:
        file_lines, file_num = 0, 0
        tmpfile = open(tmpfiles[file_num], "w")
        for line in infile:
            tmpfile.write(line)
            file_lines += 1
            if file_lines == linespersplit:
                tmpfile.close()
                newoutfile = os.path.join(globs['tmpdir'], str(file_num) + "-chunk-out.txt")
                new_files[file_num] = { 'in' : tmpfiles[file_num], 'out' : newoutfile }
                file_lines = 0
                file_num += 1
                if file_num != len(tmpfiles):
                    tmpfile = open(tmpfiles[file_num], "w")
    # Read through every line in the input file and write it to one of the sub-files, moving on
    # to the next sub-file once the current one has reached the number of lines per split.

    if len(new_files) != len(tmpfiles):
        tmpfile.close()
        newoutfile = os.path.join(globs['tmpdir'], str(file_num) + "-chunk-out.txt")
        new_files[file_num] = { 'in' : tmpfiles[file_num], 'out' : newoutfile }
    # If the last file has fewer lines than the rest it won't get added in the loop, so we add it here.

    return new_files
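# A minimal standalone sketch of the same line-balanced splitting, for reference.
# It assumes a plain uncompressed text file and swaps the RC helpers
# (RC.getFileLen, RC.getFileReader) for built-ins; _splitSketch and its
# parameters are illustrative names only, not part of Referee's API.
def _splitSketch(infilename, num_chunks, tmpdir):
    import math, os
    os.makedirs(tmpdir, exist_ok=True)
    with open(infilename) as infile:
        lines = infile.readlines()
    # ceil(num_lines / num_chunks) lines per chunk, matching linespersplit above.
    per_chunk = int(math.ceil(len(lines) / float(num_chunks)))
    chunk_paths = {}
    for i in range(num_chunks):
        chunk = lines[i * per_chunk:(i + 1) * per_chunk]
        if not chunk:
            break
        # A short final chunk is still written, mirroring the tail handling above.
        chunk_paths[i] = os.path.join(tmpdir, str(i) + "-chunk.txt")
        with open(chunk_paths[i], "w") as out:
            out.writelines(chunk)
    return chunk_paths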
def mergeFiles(outfile, files, globs):
# This function merges the tmp output files back into the main output file.
    import shutil

    with open(outfile, "w") as out:
        for file_num in sorted(files.keys()):
            with open(files[file_num]['out']) as infile:
                for line in infile:
                    out.write(line)
    # Concatenate the tmp output files, in order, into the main output file.

    try:
        RC.printWrite(globs['logfilename'], globs['log-v'], "+ Removing tmp directory and files: " + globs['tmpdir'])
        shutil.rmtree(globs['tmpdir'])
    except OSError:
        RC.printWrite(globs['logfilename'], globs['log-v'], "+ Could not remove tmp directory and files. User can remove manually: " + globs['tmpdir'])
    # Remove the tmp directory; if that fails, tell the user to remove it manually.
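# Hedged usage sketch for the two helpers above (hypothetical paths and globs
# values for illustration; the real globs dict is built elsewhere in Referee):
#
#   files = {1: {'in': "scores.txt", 'out': "scores-out.txt"}}
#   globs = {'tmpdir': "referee-tmp", 'num-procs': 4,
#            'logfilename': "referee.log", 'log-v': 1}
#   new_files = multiSplit(files, globs)          # split input into 4 chunks
#   ...run the per-chunk calculations, filling each new_files[i]['out']...
#   mergeFiles(files[1]['out'], new_files, globs) # stitch chunk outputs together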
def startProg(globs):
# A nice way to start the program.
    start_v = 1

    print("#")
    RC.printWrite(globs['logfilename'], 0, "# Welcome to Referee -- Reference genome quality score calculator.")
    RC.printWrite(globs['logfilename'], start_v, "# Version " + globs['version'] + " released on " + globs['releasedate'])
    RC.printWrite(globs['logfilename'], start_v, "# Referee was developed by Gregg Thomas and Matthew Hahn")
    RC.printWrite(globs['logfilename'], start_v, "# Citation: " + globs['doi'])
    RC.printWrite(globs['logfilename'], start_v, "# Website: " + globs['http'])
    RC.printWrite(globs['logfilename'], start_v, "# Report issues: " + globs['github'])
    RC.printWrite(globs['logfilename'], start_v, "#")
    RC.printWrite(globs['logfilename'], start_v, "# The date and time at the start is: " + RC.getDateTime())
    RC.printWrite(globs['logfilename'], start_v, "# Using Python version: " + globs['pyver'] + "\n#")
    RC.printWrite(globs['logfilename'], start_v, "# The program was called as: " + " ".join(sys.argv) + "\n#")

    pad = 20
    RC.printWrite(globs['logfilename'], start_v, "# " + "-" * 125)
    RC.printWrite(globs['logfilename'], start_v, "# INPUT/OUTPUT INFO")
    RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# Input file:", pad) + globs['in-file'])
    RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# Reference file:", pad) + globs['ref-file'])
    RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# Output directory:", pad) + globs['out-dir'])
    RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# Output prefix:", pad) + globs['out-prefix'])
    RC.printWrite(globs['logfilename'], start_v, "# " + "-" * 125)
    RC.printWrite(globs['logfilename'], start_v, "# OPTIONS INFO")
    RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# Option", pad) + RC.spacedOut("Current setting", pad) + "Current action")
    RC.printWrite(globs['logfilename'], start_v, "# " + "-" * 125)

    if globs['pileup-opt']:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --pileup", pad) + RC.spacedOut("True", pad) + "Input type set to pileup. Referee will calculate genotype likelihoods.")
        if globs['mapq-opt']:
            RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --mapq", pad) + RC.spacedOut("True", pad) + "Incorporating mapping qualities (7th column of pileup file) into quality score calculations if they are present.")
        else:
            RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --mapq", pad) + RC.spacedOut("False", pad) + "Ignoring mapping qualities in pileup file if they are present.")
    else:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --pileup", pad) + RC.spacedOut("False", pad) + "Input is pre-calculated genotype log likelihoods.")
        if globs['mapq-opt']:
            RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --mapq", pad) + RC.spacedOut("True", pad) + "--pileup not set. Ignoring --mapq option.")
    # Reporting the pileup and mapq options.

    if globs['fastq-opt']:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --fastq", pad) + RC.spacedOut("True", pad) + "Writing output in FASTQ format in addition to tab delimited: " + globs['out-fq'])
    else:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --fastq", pad) + RC.spacedOut("False", pad) + "Not writing output in FASTQ format.")
    # Reporting the fastq option.

    if globs['fasta-opt']:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --fasta", pad) + RC.spacedOut("True", pad) + "Writing corrected output in FASTA format in addition to tab delimited: " + globs['out-fa'])
    else:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --fasta", pad) + RC.spacedOut("False", pad) + "Not writing corrected output in FASTA format.")
    # Reporting the fasta option.

    if globs['bed-opt']:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --bed", pad) + RC.spacedOut("True", pad) + "Writing output in BED format in addition to tab delimited: " + globs['bed-dir'])
    else:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --bed", pad) + RC.spacedOut("False", pad) + "Not writing output in BED format.")
    # Reporting the bed option.

    if globs['mapped-only-opt']:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --mapped", pad) + RC.spacedOut("True", pad) + "Only calculating scores for positions with reads mapped to them.")
    else:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --mapped", pad) + RC.spacedOut("False", pad) + "Calculating scores for every position in the reference genome.")
    # Reporting the mapped option.

    if globs['haploid-opt']:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --haploid", pad) + RC.spacedOut("True", pad) + "Calculating genotype likelihoods and quality scores for HAPLOID data (4 genotypes).")
    else:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --haploid", pad) + RC.spacedOut("False", pad) + "Calculating genotype likelihoods and quality scores for DIPLOID data (10 genotypes).")
    # Reporting the haploid option.

    if globs['raw-opt']:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --raw", pad) + RC.spacedOut("True", pad) + "Printing raw Referee score in fourth column of tabbed output.")
    else:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --raw", pad) + RC.spacedOut("False", pad) + "NOT printing raw Referee score in tabbed output.")
    # Reporting the raw option.

    if globs['correct-opt']:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --correct", pad) + RC.spacedOut("True", pad) + "Suggesting higher scoring alternative base when reference score is negative or reference base is N.")
    else:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --correct", pad) + RC.spacedOut("False", pad) + "Not suggesting higher scoring alternative base when reference score is negative or reference base is N.")
    # Reporting the correct option.

    if not globs['quiet']:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --quiet", pad) + RC.spacedOut("False", pad) + "Step info will be output while Referee is running.")
    else:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --quiet", pad) + RC.spacedOut("True", pad) + "No further information will be output while Referee is running.")
    # Reporting the quiet option.

    RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# -p", pad) + RC.spacedOut(str(globs['num-procs']), pad) + "Referee will use this many processes to run.")
    # Reporting the number of processes specified.

    RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# -l", pad) + RC.spacedOut(str(globs['lines-per-proc']), pad) + "This many lines will be read per process to be calculated at one time in parallel.")
    # Reporting the lines per proc option.

    if globs['allcalc-opt']:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --allcalcs", pad) + RC.spacedOut("True", pad) + "Using tab delimited output and reporting extra columns.")
    # Reporting the allcalc option.

    if globs['debug']:
        RC.printWrite(globs['logfilename'], start_v, RC.spacedOut("# --debug", pad) + RC.spacedOut("True", pad) + "Printing out a bit of debug info.")
    # Reporting the debug option.

    if not globs['pileup-opt']:
        RC.printWrite(globs['logfilename'], start_v, "#\n# " + "-" * 40)
        RC.printWrite(globs['logfilename'], start_v, "## IMPORTANT!")
        RC.printWrite(globs['logfilename'], start_v, "## Input columns: Scaffold\tPosition\tAA\tAC\tAG\tAT\tCC\tCG\tCT\tGG\tGT\tTT")
        RC.printWrite(globs['logfilename'], start_v, "## Please ensure that your input genotype likelihood files are tab delimited with columns in this exact order without headers.")
        RC.printWrite(globs['logfilename'], start_v, "## Failure to do so will result in inaccurate calculations!!")
        RC.printWrite(globs['logfilename'], start_v, "# " + "-" * 40 + "\n#")
    # Warn the user about the expected genotype log likelihood input format.

    if globs['quiet']:
        RC.printWrite(globs['logfilename'], start_v, "# " + "-" * 125)
        RC.printWrite(globs['logfilename'], start_v, "# Running...")
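# startProg leans on RC.spacedOut to align the option table. A minimal sketch of
# the assumed behavior (pad a string to a fixed column width); this is an
# assumption about RC.spacedOut, and the real helper may differ:
def _spacedOutSketch(string, totlen):
    # Right-pad with spaces up to totlen characters; longer strings pass through.
    if len(string) < totlen:
        return string + " " * (totlen - len(string))
    return string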
def referee(globs):
    step_start_time = RC.report_step(globs, "", "", "", start=True)
    # Initialize the step headers.

    step = "Detecting compression"
    step_start_time = RC.report_step(globs, step, False, "In progress...")
    globs['reader'] = RC.getFileReader(globs['in-file'])
    if globs['reader'] != open:
        globs['lread'] = RC.readGzipLine
        globs['read-mode'] = "rb"
    step_start_time = RC.report_step(globs, step, step_start_time, "Success!")
    # Detect whether the input file is gzip compressed or not and save the appropriate functions.

    step = "Indexing reference FASTA"
    step_start_time = RC.report_step(globs, step, False, "In progress...")
    globs['ref'], prev_scaff = RC.fastaReadInd(globs['ref-file'], globs)
    step_start_time = RC.report_step(globs, step, step_start_time, "Success!")
    # Index the reference FASTA file.

    if globs['ref-index']:
        step = "Getting scaffold lengths from index"
        step_start_time = RC.report_step(globs, step, False, "In progress...")
        for line in open(globs['ref-index']):
            line = line.split("\t")
            globs['scaff-lens'][line[0]] = int(line[1])
    else:
        RC.printWrite(globs['logfilename'], globs['log-v'], "# WARNING 1: Cannot find reference index file (" + globs['ref-file'] + ".fai)")
        RC.printWrite(globs['logfilename'], globs['log-v'], "# WARNING 1: Will read reference scaffold lengths manually, which can take a few minutes.")
        step = "Getting scaffold lengths manually"
        step_start_time = RC.report_step(globs, step, False, "In progress...")
        for scaff in globs['ref']:
            seq = RC.fastaGet(globs['ref-file'], globs['ref'][scaff])[1]
            globs['scaff-lens'][scaff] = len(seq)
    step_start_time = RC.report_step(globs, step, step_start_time, "Success!")

    globs['num-pos'], globs['num-scaff'] = sum(globs['scaff-lens'].values()), len(globs['scaff-lens'])
    RC.printWrite(globs['logfilename'], globs['log-v'], "# Read " + str(globs['num-pos']) + " positions in " + str(globs['num-scaff']) + " scaffolds")
    # Get the scaffold lengths, either from the .fai index or by reading the sequences manually.

    if globs['pileup-opt']:
        step = "Computing likelihood look-up table"
        step_start_time = RC.report_step(globs, step, False, "In progress...")
        globs['probs'] = CALC.glInit(globs['mapq-opt'], globs['haploid-opt'])
        step_start_time = RC.report_step(globs, step, step_start_time, "Success!")
    # Pre-compute the likelihoods for every base quality (+ mapping quality if set) so they can
    # just be looked up for each position.

    with open(globs['out-tab'], "w") as outfile, mp.Pool(processes=globs['num-procs']) as pool:
        if globs['fastq-opt']:
            fastqfile = open(globs['out-fq'], "w")
        else:
            fastqfile = ""
        # Open the FASTQ file if --fastq is specified. Otherwise just set an empty string instead of the stream.

        if globs['fasta-opt']:
            fastafile = open(globs['out-fa'], "w")
        else:
            fastafile = ""
        # Open the FASTA file if --fasta is specified. Otherwise just set an empty string instead of the stream.

        if globs['bed-opt']:
            for line in globs['reader'](globs['in-file'], globs['read-mode']):
                first_scaff = globs['lread'](line)[0]
                break
            globs['cur-bed'] = OUT.initializeBed(first_scaff, globs)
        # Initialize the first scaffold for BED output.

        cur_lines = []
        i, i_start, next_pos = 0, 1, 1
        for line in globs['reader'](globs['in-file'], globs['read-mode']):
            i += 1
            cur_lines.append(line)
            if len(cur_lines) == globs['chunk-size']:
                step = "Processing lines " + str(i_start) + "-" + str(i)
                step_start_time = RC.report_step(globs, step, False, "In progress...")
                i_start = i + 1
                line_chunks = list(RC.chunks(cur_lines, globs['lines-per-proc']))
                for result in pool.starmap(CALC.refCalc, ((line_chunk, globs) for line_chunk in line_chunks)):
                    for outdict in result:
                        prev_scaff, next_pos, globs = OUT.outputDistributor(outdict, prev_scaff, next_pos, outfile, fastqfile, fastafile, globs)
                cur_lines = []
                step_start_time = RC.report_step(globs, step, step_start_time, "Success!")
        # Read the input file line by line. Once a certain number of lines have been read, pass them
        # to refCalc in parallel.

        if cur_lines != []:
            step = "Processing lines " + str(i_start) + "-" + str(i)
            step_start_time = RC.report_step(globs, step, False, "In progress...")
            line_chunks = list(RC.chunks(cur_lines, globs['lines-per-proc']))
            for result in pool.starmap(CALC.refCalc, ((line_chunk, globs) for line_chunk in line_chunks)):
                for outdict in result:
                    prev_scaff, next_pos, globs = OUT.outputDistributor(outdict, prev_scaff, next_pos, outfile, fastqfile, fastafile, globs)
            step_start_time = RC.report_step(globs, step, step_start_time, "Success!")
        # Process the last chunk of lines if necessary.

        if next_pos <= globs['scaff-lens'][prev_scaff]:
            step = "Filling final unmapped positions"
            step_start_time = RC.report_step(globs, step, False, "In progress...")
            seq = RC.fastaGet(globs['ref-file'], globs['ref'][prev_scaff])[1]
            outdict = { 'scaff' : prev_scaff, 'pos' : globs['scaff-lens'][prev_scaff], 'ref' : seq[next_pos - 1],
                        'rq' : -2, 'raw' : "NA", 'lr' : "NA", 'l_match' : "NA", 'l_mismatch' : "NA", 'gls' : "NA",
                        'cor_ref' : "NA", 'cor_score' : "NA", 'cor_raw' : "NA" }
            prev_scaff, next_pos, globs = OUT.outputDistributor(outdict, prev_scaff, next_pos, outfile, fastqfile, fastafile, globs)
            step_start_time = RC.report_step(globs, step, step_start_time, "Success!")
        # If the last positions are unmapped they won't have been filled in. Do that here using the last
        # position (length) of the previous scaffold as the outdict.

        globs['scaffs-written'].append(prev_scaff)

        step = "Checking for unmapped scaffolds"
        step_start_time = RC.report_step(globs, step, False, "In progress...")
        for scaff in globs['scaff-lens']:
            if scaff not in globs['scaffs-written']:
                seq = RC.fastaGet(globs['ref-file'], globs['ref'][scaff])[1]
                for p in range(len(seq)):
                    pos = p + 1
                    ref = seq[p]
                    outdict = { 'scaff' : scaff, 'pos' : pos, 'ref' : ref, 'rq' : -2, 'raw' : "NA", 'lr' : "NA",
                                'l_match' : "NA", 'l_mismatch' : "NA", 'gls' : "NA",
                                'cor_ref' : "NA", 'cor_score' : "NA", 'cor_raw' : "NA" }
                    prev_scaff, next_pos, globs = OUT.outputDistributor(outdict, scaff, next_pos, outfile, fastqfile, fastafile, globs)
        step_start_time = RC.report_step(globs, step, step_start_time, "Success!")
        # If any scaffolds had no positions with reads mapped, they will not have been written. Go through
        # them here to write out their positions in output files with scores of -2.

        if globs['fastq-opt']:
            fastqfile.close()
        # Close the FASTQ file if --fastq was set.

        if globs['fasta-opt']:
            fastafile.close()
        # Close the FASTA file if --fasta was set.

        if globs['bed-opt']:
            step = "Writing final bed file"
            step_start_time = RC.report_step(globs, step, False, "In progress...")
            OUT.outputBed(globs['cur-bed'])
            step_start_time = RC.report_step(globs, step, step_start_time, "Success!")
        # Write out the last bed file.

    with open(globs['out-summary'], "w") as sumout:
        sumout.write("# SCAFFOLDS:\t" + str(globs['num-scaff']) + "\n")
        sumout.write("# POSITIONS:\t" + str(globs['num-pos']) + "\n")
        sumout.write("# UNMAPPED POSITIONS:\t" + str(globs['hist'][2]['count']) + "\n")
        if globs['correct-opt']:
            sumout.write("# ERRORS CORRECTED:\t" + str(globs['num-corrected']) + "\n")
            err_rate = globs['num-corrected'] / globs['num-pos']
            sumout.write("# ERROR RATE PER BASE:\t" + str(err_rate) + "\n")
            sumout.write("#\n# ERROR TYPES\n")
            sumout.write("from\tto\tcount\n")
            for err in globs['err-types']:
                outline = err[0] + "\t" + err[1] + "\t" + str(globs['err-types'][err])
                sumout.write(outline + "\n")
        sumout.write("#\n# SCORE DISTRIBUTION\n")
        sumout.write("bin\tcount\n")
        for score_bin in globs['hist']:
            outline = [ globs['hist'][score_bin]['min'], globs['hist'][score_bin]['max'], globs['hist'][score_bin]['count'] ]
            outline = [str(o) for o in outline]
            if outline[0] == outline[1]:
                outline = outline[0] + "\t" + outline[2]
            else:
                outline = outline[0] + "-" + outline[1] + "\t" + outline[2]
            sumout.write(outline + "\n")
    # Write the summary file.

    return
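# The "Detecting compression" step above assumes RC.getFileReader returns either
# the built-in open or gzip.open depending on the input file. A minimal sketch of
# one common way to make that choice, via the two-byte gzip magic number; this is
# an assumption about RC.getFileReader, not its actual source:
def _getFileReaderSketch(filename):
    import gzip
    # gzip-compressed files start with the bytes 0x1f 0x8b.
    with open(filename, "rb") as check:
        magic = check.read(2)
    return gzip.open if magic == b"\x1f\x8b" else open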
def referee(files, globs, step_start_time):
    if globs['stats']:
        if globs['psutil']:
            import psutil
        step_start_time = RC.report_stats(globs, "Index ref fasta", step_start=step_start_time)
    # Initialize the stats output if --stats is set.

    globs['ref'] = RC.fastaReadInd(globs['reffile'])
    # Index the reference FASTA file.

    if globs['pileup']:
        if globs['stats']:
            step_start_time = RC.report_stats(globs, "GL Init", step_start=step_start_time)
        globs['probs'] = CALC.glInit(globs['mapq'], globs['haploid'])
    # Pre-compute the genotype likelihood look-up table if the input is a pileup.

    if globs['stats']:
        file_start_time = RC.report_stats(globs, "Calcs", step_start=step_start_time)
    # --stats update.

    if globs['num-procs'] == 1:
        for file_num in files:
            result = CALC.refCalc((file_num, files[file_num], globs))
            if globs['stats']:
                step_start_time = RC.report_stats(globs, "File " + str(result) + " calcs done", file_start_time)
    # The serial version.

    else:
        if len(files) == 1:
            if globs['stats']:
                step_start_time = RC.report_stats(globs, "Split files", step_start=step_start_time)
            new_files = OP.multiSplit(files, globs)
        else:
            new_files = files
        # If multiple processors are available for 1 file, we split the file into chunks.

        pool = mp.Pool(processes=globs['num-procs'])
        if globs['stats'] and globs['psutil']:
            for result in pool.imap(RC.getSubPID, range(globs['num-procs'])):
                globs['pids'].append(result)
        for result in pool.imap(CALC.refCalc, ((file_num, new_files[file_num], globs) for file_num in new_files)):
            if globs['stats']:
                step_start_time = RC.report_stats(globs, "File " + str(result) + " calcs done", file_start_time)
        # Create the pool of processes and pass each file to one process to calculate scores on.

        if len(files) == 1:
            if globs['stats']:
                step_start_time = RC.report_stats(globs, "Merge files", step_start=step_start_time)
            OP.mergeFiles(files[1]['out'], new_files, globs)
        # Merge the split tmp files back into a single output file.
    # The parallel version.

    if globs['stats']:
        file_start_time = RC.report_stats(globs, "Adding unmapped", step_start=step_start_time)

    if not globs['mapped']:
        if globs['num-procs'] == 1 or len(files) == 1:
            for file_num in files:
                result = OUT.addUnmapped((file_num, files[file_num], globs))
                if globs['stats']:
                    step_start_time = RC.report_stats(globs, "File " + str(result) + " unmapped done", step_start=file_start_time)
                RC.printWrite(globs['logfilename'], globs['log-v'], "+ Renaming tmp file to output file: " + files[result]['tmpfile'] + " -> " + files[result]['out'])
                shutil.move(files[result]['tmpfile'], files[result]['out'])
        # Serial version to add unmapped sites.
        else:
            for result in pool.imap(OUT.addUnmapped, ((file_num, files[file_num], globs) for file_num in new_files)):
                if globs['stats']:
                    step_start_time = RC.report_stats(globs, "File " + str(result) + " unmapped done", step_start=file_start_time)
                RC.printWrite(globs['logfilename'], globs['log-v'], "+ Renaming tmp file to output file: " + files[result]['tmpfile'] + " -> " + files[result]['out'])
                shutil.move(files[result]['tmpfile'], files[result]['out'])
        # Parallel version to add unmapped sites.
    # If all positions are to be assigned a score, this fills in the unmapped positions. Requires one
    # pass through of the output file.

    if globs['stats']:
        step_start_time = RC.report_stats(globs, "End program", step_start=step_start_time, stat_end=True)
    # A final step update for --stats.

    return
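# Both referee() drivers follow the same dispatch pattern: a worker function
# mapped over per-file argument tuples with multiprocessing. A standalone sketch
# of that pattern (score_file and work_items are illustrative names, not
# Referee's API), kept in comments so it never runs with the module:
#
#   import multiprocessing as mp
#
#   def score_file(args):
#       file_num, paths = args
#       # ...per-file score calculations would happen here...
#       return file_num
#
#   if __name__ == "__main__":
#       work_items = {1: {'in': "1-chunk.txt", 'out': "1-chunk-out.txt"},
#                     2: {'in': "2-chunk.txt", 'out': "2-chunk-out.txt"}}
#       with mp.Pool(processes=2) as pool:
#           for done in pool.imap(score_file, ((n, work_items[n]) for n in work_items)):
#               print("file", done, "done")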