def data_processing(self):
    """Tally UMT counts and family sizes, then write both report files."""
    self._log.info("Begin Family Size and UMT Analysis")

    umt_stats_outstring = "UMT\tCount"
    family_size_outstring = "Family\tCount"

    for index_key in self._family_data:
        count_list = []
        # Count reads per UMT, most abundant first.
        for k, v in sorted(Counter(self._family_data[index_key]).items(),
                           key=lambda x: x[1], reverse=True):
            umt_stats_outstring += "\n{0}\t{1}".format(k, v)
            count_list.append(str(v))

        # Tally how many UMT families share each family size.
        c = dict(Counter(count_list))
        for k in natsort.natsorted(c):
            family_size_outstring += "\n{0}\t{1}".format(k, c[k])

    stats_filename = "{0}{1}_UMT_Stats.txt".format(self._args.Working_Folder, self._data_source)
    size_filename = "{0}{1}_Family_Size.txt".format(self._args.Working_Folder, self._data_source)

    # Deleting the files if they exist prevents a sporadic "text file busy"
    # OSError seen when running under VirtualBox on Windows.
    Tool_Box.delete([stats_filename, size_filename])

    with open(stats_filename, "w") as umt_stats_file:
        umt_stats_file.write(umt_stats_outstring)
    with open(size_filename, "w") as family_size_file:
        family_size_file.write(family_size_outstring)

    self._log.info("{0} {1} UMT Family Size and Stats Files Written"
                   .format(self._args.Job_Name, self._data_source))
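
# A minimal sketch (illustration only, not called by the pipeline) of the
# two-level tally in data_processing(): first count reads per UMT, then count
# how many UMT families share each family size. The toy data is hypothetical.
def _example_family_size_tally():
    from collections import Counter
    import natsort

    family_data = {"index1": ["AATT", "AATT", "AATT", "CCGG", "CCGG", "TTAA"]}
    umt_counts = Counter(family_data["index1"])  # {"AATT": 3, "CCGG": 2, "TTAA": 1}
    family_sizes = Counter(str(v) for v in umt_counts.values())  # {"3": 1, "2": 1, "1": 1}
    for size in natsort.natsorted(family_sizes):
        print("{}\t{}".format(size, family_sizes[size]))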
def pear_consensus(args, log):
    """
    Take the input FASTQ files and use PEAR to generate a consensus file.
    :param args: parsed options object.
    :param log: logger object.
    :return: list of PEAR output files, or None if PEAR failed.
    """
    log.info("Beginning PEAR Consensus")

    fastq_consensus_prefix = "{}{}".format(args.WorkingFolder, args.Job_Name)
    fastq_consensus_file = "{}.assembled.fastq".format(fastq_consensus_prefix)
    discarded_fastq = "{}.discarded.fastq".format(fastq_consensus_prefix)
    r1_unassembled = "{}.unassembled.forward.fastq".format(fastq_consensus_prefix)
    r2_unassembled = "{}.unassembled.reverse.fastq".format(fastq_consensus_prefix)

    y = "-y {} ".format(args.Memory)
    j = "-j {} ".format(int(args.Spawn) - 1)

    # Optional PEAR flags are only included when set in the options file.
    p_value = ""
    if args.PValue:
        p_value = "-p {} ".format(args.PValue)
    min_overlap = ""
    if args.MinOverlap:
        min_overlap = "-v {} ".format(args.MinOverlap)
    quality_threshold = ""
    if args.QualityThreshold:
        quality_threshold = "-q {} ".format(args.QualityThreshold)
    phred_value = ""
    if args.PhredValue:
        phred_value = "-b {} ".format(args.PhredValue)
    test_method = ""
    if args.TestMethod:
        test_method = "-g {}".format(args.TestMethod)
    n = ""
    if args.MinConsensusLength:
        n = "-n {} ".format(args.MinConsensusLength)

    # The bundled PEAR binary lives in Pear/bin/ next to this module. The
    # original format string had one placeholder too few, silently dropping
    # the final flag; an eighth flag placeholder restores it.
    proc = subprocess.run(
        "{}{}Pear{}bin{}./pear -f {} -r {} -o {} {}{}{}{}{}{}{}{}".format(
            pathlib.Path(__file__).parent.absolute(), os.sep, os.sep, os.sep,
            args.FASTQ1, args.FASTQ2, fastq_consensus_prefix, y, j, n,
            p_value, min_overlap, quality_threshold, phred_value, test_method),
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)

    if proc.stderr:
        log.error("{}\n{}\n".format(proc.stderr.decode(), proc.stdout.decode()))
        return
    else:
        log.info(
            "Begin PEAR Output\n"
            "----------------------------------------------------------------------------------------------------------"
            "\n{}\n"
            "----------------------------------------------------------------------------------------------------------\n"
            .format(proc.stdout.decode()))

    file_list = [fastq_consensus_file, r1_unassembled, r2_unassembled]

    # Keep the discarded-read file only if PEAR actually wrote anything to it.
    if os.stat(discarded_fastq).st_size > 0:
        file_list.append(discarded_fastq)
    else:
        Tool_Box.delete([discarded_fastq])

    return file_list
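
# A brief, self-contained sketch of the conditional flag assembly used by
# pear_consensus(), built with a list join instead of string concatenation.
# Flag letters match the PEAR options above; the default values here are
# hypothetical and for illustration only.
def _example_pear_flags(memory="2G", spawn=8, p_value=0.01, min_overlap=None):
    flags = ["-y {}".format(memory), "-j {}".format(int(spawn) - 1)]
    if p_value:
        flags.append("-p {}".format(p_value))
    if min_overlap:
        flags.append("-v {}".format(min_overlap))
    return " ".join(flags)  # e.g. "-y 2G -j 7 -p 0.01"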
def main(command_line_args=None):
    """
    Let's get this party started.
    :param command_line_args: optional argument list; defaults to sys.argv.
    """
    start_time = time.time()
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv

    run_start = datetime.datetime.today().strftime("%H:%M:%S %Y %a %b %d")
    parser = argparse.ArgumentParser(
        description="A package to map genomic repair scars at defined loci.\n {} v{}"
                    .format(__package__, __version__),
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('--options_file', action='store', dest='options_file', required=True,
                        help='File containing program parameters.')

    # Check options file for errors and return object.
    args = error_checking(string_to_boolean(parser))

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)
    module_name = ""
    log.info("{} v{}".format(__package__, __version__))

    if args.IndelProcessing:
        file_list = []
        if args.Platform in ("Illumina", "Ramsden", "TruSeq"):
            log.info("Sending FASTQ files to FASTQ preprocessor.")

            if args.PEAR:
                file_list = pear_consensus(args, log)
                if not file_list:
                    log.error("PEAR failed. Check logs.")
                    raise SystemExit(1)
                fastq_consensus = file_list[0]
                fq1 = FASTQ_Tools.FASTQ_Reader(fastq_consensus, log)
                fq2 = None
            else:
                fq2 = FASTQ_Tools.FASTQ_Reader(args.FASTQ2, log)
                fq1 = FASTQ_Tools.FASTQ_Reader(args.FASTQ1, log)

            sample_manifest = Tool_Box.FileParser.indices(log, args.SampleManifest)
            indel_processing = \
                Indel_Processing.DataProcessing(log, args, run_start, __version__,
                                                Target_Mapper.TargetMapper(log, args, sample_manifest),
                                                fq1, fq2)
            indel_processing.main_loop()

            # Compress or delete PEAR files.
            if args.PEAR and file_list:
                if args.DeleteConsensusFASTQ:
                    log.info("Deleting PEAR FASTQ Files.")
                    Tool_Box.delete(file_list)
                else:
                    log.info("Compressing {} FASTQ Files Generated by PEAR.".format(len(file_list)))
                    p = pathos.multiprocessing.Pool(int(args.Spawn))
                    p.starmap(Tool_Box.compress_files, zip(file_list, itertools.repeat(log)))
        else:
            log.error("Only 'Illumina', 'TruSeq' or 'Ramsden' --Platform methods currently allowed.")
            raise SystemExit(1)

    else:
        # Run frequency file Combine module.
        run_start = datetime.datetime.today().strftime("%a %b %d %H:%M:%S %Y")
        log.info("Process Replicates.")

        data_dict = collections.defaultdict(list)
        file_list = glob.glob("{}*ScarMapper_Frequency.txt".format(args.DataFiles))
        file_count = len(file_list)

        page_header = "# ScarMapper File Merge v{}\n# Run: {}\n# Sample Name: {}\n" \
            .format(__version__, run_start, args.SampleName)

        # Copy the parameter block (everything after the first four lines, up
        # to the first blank line) from the first frequency file.
        line_num = 0
        with open(file_list[0]) as first_file:
            index_file = list(csv.reader(first_file, delimiter='\t'))
        for line in index_file:
            if not line:
                break
            elif line_num > 3:
                page_header += "{}\n".format(line[0])
            line_num += 1
        page_header += "\n\n"

        # Pool the frequency of each scar pattern across replicate files.
        for file_name in file_list:
            freq_file_data = Tool_Box.FileParser.indices(log, file_name)
            for row in freq_file_data:
                key = "{}|{}|{}|{}".format(row[3], row[4], row[6], row[8])
                row_data = row[2:]
                if key in data_dict:
                    data_dict[key][0].append(float(row[1]))
                else:
                    data_dict[key] = [[float(row[1])], row_data]

        # Process data and write the combined frequency results file.
        plot_data_dict = collections.defaultdict(list)
        label_dict = collections.defaultdict(float)
        output_data_dict = collections.defaultdict(list)
        marker_list = []
        for key, row_list in data_dict.items():
            # Force pattern to be in at least half of the files.
            if len(row_list[0]) / file_count >= 0.5:
                row_string = "\t".join(row_list[1])
                freq = gmean(row_list[0])
                sem = stats.sem(row_list[0])
                freq_results_outstring = "{}\t{}\t{}\n".format(freq, sem, row_string)
                output_key = freq

                # freq is a float with ~17 significant digits, so duplicate keys
                # are very unlikely; if one occurs, nudge the key by a tiny
                # amount until it is unique.
                while output_key in output_data_dict:
                    output_key = output_key + 1e-16

                scar_type = row_list[1][0]
                label_dict[scar_type] += freq

                # Gather up our data for plotting.
                lft_del = int(row_list[1][1])
                rt_del = int(row_list[1][2])
                mh_size = int(row_list[1][5])
                ins_size = int(row_list[1][7])

                output_data_dict[output_key] = \
                    [(freq, lft_del, rt_del, mh_size, ins_size, scar_type), freq_results_outstring]

        freq_results_outstring = \
            "{}# Frequency\tSEM\tScar Type\tLeft Deletions\tRight Deletions\tDeletion Size\tMicrohomology\t" \
            "Microhomology Size\tInsertion\tInsertion Size\tLeft Template\tRight Template\tConsensus Left Junction\t" \
            "Consensus Right Junction\tTarget Left Junction\tTarget Right Junction\tConsensus\tTarget Region\n" \
            .format(page_header)

        # Now draw a pretty graph of the data if we are not dealing with a negative control.
        for k in natsort.natsorted(output_data_dict, reverse=True):
            data_list = output_data_dict[k]
            freq_results_outstring += data_list[1]
            freq = data_list[0][0]
            lft_del = data_list[0][1]
            rt_del = data_list[0][2]
            mh_size = data_list[0][3]
            ins_size = data_list[0][4]
            scar_type = data_list[0][5]

            # Plotting all scar patterns is messy. This provides a cutoff.
            if freq < 0.00025:
                continue

            y_value = freq * 0.5
            lft_ins_width = freq
            rt_ins_width = freq

            # Gathered up to find the largest value; used to set the x-axis limits.
            marker_list.extend([lft_del + (mh_size * 0.5), rt_del + (mh_size * 0.5), ins_size])

            # Deletion size includes half the size of any microhomology present.
            lft_del_plot_value = (lft_del + (mh_size * 0.5)) * -1
            rt_del_plot_value = rt_del + (mh_size * 0.5)

            # Insertions are centered on 0, so take half the value for each side.
            lft_ins_plot_value = (ins_size * 0.5) * -1
            rt_ins_plot_value = ins_size * 0.5

            # Scale the width of bars for insertions inside of deletions.
            if lft_del + (mh_size * 0.5) != 0:
                lft_ins_width = freq * 0.5
            if rt_del + (mh_size * 0.5) != 0:
                rt_ins_width = freq * 0.5

            if scar_type not in plot_data_dict:
                plot_data_dict[scar_type] = \
                    [[freq], [lft_del_plot_value], [rt_del_plot_value], [lft_ins_plot_value],
                     [rt_ins_plot_value], [lft_ins_width], [rt_ins_width], [y_value]]
            else:
                # Get some previous plot data.
                count = len(plot_data_dict[scar_type][0])
                previous_freq = plot_data_dict[scar_type][0][count - 1]
                previous_y = plot_data_dict[scar_type][7][count - 1]

                plot_data_dict[scar_type][0].append(freq)
                plot_data_dict[scar_type][1].append(lft_del_plot_value)
                plot_data_dict[scar_type][2].append(rt_del_plot_value)
                plot_data_dict[scar_type][3].append(lft_ins_plot_value)
                plot_data_dict[scar_type][4].append(rt_ins_plot_value)
                plot_data_dict[scar_type][5].append(lft_ins_width)
                plot_data_dict[scar_type][6].append(rt_ins_width)

                # Use the previous plot data to find the y-value of the current bar.
                plot_data_dict[scar_type][7] \
                    .append(previous_y + 0.002 + (0.5 * previous_freq) + y_value)

        plot_data_dict['Marker'] = [(max(marker_list)) * -1, max(marker_list)]

        # sample_name = "{}.{}".format(args.Job_Name, args.SampleName)
        ScarMapperPlot.scarmapperplot(args, datafile=None, sample_name=args.SampleName,
                                      plot_data_dict=plot_data_dict, label_dict=label_dict)

        freq_results_file = \
            open("{}{}_ScarMapper_Combined_Frequency.txt".format(args.WorkingFolder, args.SampleName), "w")
        freq_results_file.write(freq_results_outstring)
        freq_results_file.close()

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    # The original format string had no fourth placeholder, so the warning
    # flag was silently dropped from the log line; {3} restores it.
    log.info("****ScarMapper {0} complete ({1} seconds, {2} Mb peak memory).****{3}"
             .format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))

    # All done, so quit explicitly; otherwise Python will not release the log file on virtual Linux.
    exit(0)
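
# ---------------------------------------------------------------------------
# Illustrative sketches (hypothetical data, never called by the pipeline) of
# three techniques used in main() above.
# ---------------------------------------------------------------------------

# 1) The replicate merge: rows sharing the same scar-pattern key pool their
#    per-file frequencies into one list so they can be averaged later.
def _example_replicate_merge():
    rows = [
        ["lbl", "0.010", "NHEJ", "5", "3", "8", "GC", "2", "AT", "2"],
        ["lbl", "0.020", "NHEJ", "5", "3", "8", "GC", "2", "AT", "2"],
    ]
    data_dict = {}
    for row in rows:
        key = "{}|{}|{}|{}".format(row[3], row[4], row[6], row[8])
        if key in data_dict:
            data_dict[key][0].append(float(row[1]))
        else:
            data_dict[key] = [[float(row[1])], row[2:]]
    return data_dict  # {"5|3|GC|AT": [[0.01, 0.02], [...]]}


# 2) The aggregation statistics: the combined frequency is the geometric mean
#    across replicates; the error bar is the standard error of the mean.
def _example_replicate_stats():
    from scipy.stats import gmean, sem

    replicate_freqs = [0.010, 0.020, 0.015]
    return gmean(replicate_freqs), sem(replicate_freqs)  # ~0.0144, ~0.0029


# 3) The bar-stacking rule: each new bar of a scar type sits above the
#    previous one, offset by a fixed gap plus half of each bar's height,
#    so bars whose heights equal their frequencies never overlap.
def _example_bar_stacking(freqs=(0.010, 0.005, 0.002), gap=0.002):
    y_values = [freqs[0] * 0.5]  # first bar is centered at half its height
    for prev, cur in zip(freqs, freqs[1:]):
        y_values.append(y_values[-1] + gap + 0.5 * prev + 0.5 * cur)
    return y_values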