def main(command_line_args=None):
    """
    Run the Synthetic Lethal analysis modules.
    :param command_line_args: Optional argument list; defaults to sys.argv.
    """
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv

    parser = argparse.ArgumentParser(
        description="A package to process Synthetic Lethal Data.\n {0} v{1}"
        .format(__package__, __version__),
        formatter_class=RawTextHelpFormatter)

    parser.add_argument('--options_file', action='store', dest='options_file', required=True,
                        help='File containing program parameters.')

    # Convert universal variables intended as boolean from string to boolean.
    args, options_parser = string_to_boolean(Tool_Box.options_file(parser))

    # Check file names and paths for errors.
    error_checking(args)

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)
    start_time = time.time()
    module_name = "Synthetic_Lethal"

    log.info("{0} v{1}; Module: Synthetic Lethal Analysis v{2} Beginning"
             .format(__package__, __version__, Synthetic_Lethal.__version__))

    synthetic_lethal = Synthetic_Lethal.SyntheticLethal(log, args)

    if args.TargetSearch:
        synthetic_lethal.fastq_analysis()
    elif args.Statistics:
        synthetic_lethal.statistics()
    else:
        log.error('No module selected to run.')

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info("****Völundr {0} complete ({1} seconds, {2} Mb peak memory).****\n{3}"
             .format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))
def main(command_line_args=None):
    """
    Preprocess and demultiplex paired-end FASTQ files.
    :param command_line_args: Optional argument list; defaults to sys.argv.
    """
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv

    run_start = datetime.datetime.today().strftime("%a %b %d %H:%M:%S %Y")

    parser = argparse.ArgumentParser(description="A little ditty to manipulate FASTQ files.\n {0} v{1}"
                                     .format(__package__, __version__),
                                     formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('--options_file', action='store', dest='options_file', required=True,
                        help='File containing program parameters.')

    options_parser = Tool_Box.options_file(parser)
    args = options_parser.parse_args()
    # args, options_parser = string_to_boolean(args, options_parser)

    options_parser.set_defaults(Trim5=0)
    options_parser.set_defaults(Trim3=0)
    options_parser.set_defaults(Minimum_Length=100)
    options_parser.set_defaults(N_Limit=100)
    options_parser.set_defaults(HaloPLEX=False)
    options_parser.set_defaults(ThruPLEX=False)
    options_parser.set_defaults(FASTQ_PreProcess=True)
    args = options_parser.parse_args()

    # Check options file for errors.
    error_checking(args)

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)
    start_time = time.time()
    module_name = ""

    # Initialize generators to read each FASTQ file.
    fastq1 = FASTQ_Tools.FASTQ_Reader(args.FASTQ1, log)
    fastq2 = FASTQ_Tools.FASTQ_Reader(args.FASTQ2, log)
    index1 = FASTQ_Tools.FASTQ_Reader(args.Index1, log)
    index2 = FASTQ_Tools.FASTQ_Reader(args.Index2, log)

    splitter_data = FASTQ_Tools.FastqSplitter(args, log, fastq1, fastq2, index1, index2, paired_end=True)
    new_fastq1, new_fastq2 = splitter_data.file_writer()

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info("****FASTQ Preprocessing {0} complete ({1} seconds, {2} Mb peak memory).****\n{3}"
             .format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))
def main(command_line_args=None):
    """
    Run the Synthetic Lethal analysis modules.
    :param command_line_args: Optional argument list; defaults to sys.argv.
    """
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv

    parser = argparse.ArgumentParser(
        description="A package to process Synthetic Lethal Data.\n {0} v{1}"
        .format(__package__, __version__),
        formatter_class=RawTextHelpFormatter)

    parser.add_argument('--options_file', action='store', dest='options_file', required=True,
                        help='File containing program parameters.')

    options_parser = Tool_Box.options_file(parser)
    args = options_parser.parse_args()

    # If we are doing statistical analysis the user will not input an Index_Mismatch value.
    if not getattr(args, "Index_Mismatch", False):
        options_parser.add_argument("--Index_Mismatch", dest="Index_Mismatch", default=0)
        options_parser.add_argument("--Analyze_Unknowns", dest="Analyze_Unknowns", default="False")
        args = options_parser.parse_args()

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)
    start_time = time.time()
    module_name = "Synthetic_Lethal"

    log.info("{0} v{1}; Module: Synthetic Lethal Analysis v{2} Beginning"
             .format(__package__, __version__, Synthetic_Lethal.__version__))

    # Convert universal variables intended as boolean from string to boolean.
    # ToDo: Should be a cleaner method to do this; see the string_to_boolean sketch after this function.
    if args.Target_Search == "True":
        options_parser.set_defaults(Target_Search=True)
        options_parser.set_defaults(RevComp=args.RevComp == "True")
        options_parser.set_defaults(Delete_Demultiplexed_FASTQ=args.Delete_Demultiplexed_FASTQ == "True")
        options_parser.set_defaults(compress=args.compress == "True")
    else:
        options_parser.set_defaults(Target_Search=False)

    options_parser.set_defaults(Statistics=args.Statistics == "True")
    options_parser.set_defaults(Analyze_Unknowns=args.Analyze_Unknowns == "True")

    args = options_parser.parse_args()

    synthetic_lethal = Synthetic_Lethal.SyntheticLethal(log, args)

    if args.Target_Search:
        synthetic_lethal.fastq_analysis()
    elif args.Statistics:
        synthetic_lethal.statistics()
    else:
        log.error('No module selected to run.')

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info("****Volundr {0} complete ({1} seconds, {2} Mb peak memory).****\n{3}"
             .format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))
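
# A minimal sketch of the cleaner conversion the ToDo above asks for. This is an
# assumption, not the packaged string_to_boolean helper: the real option names and
# return contract may differ. It mirrors the call pattern used in main() above
# (args, options_parser = string_to_boolean(Tool_Box.options_file(parser))) and
# assumes the module-level `from distutils.util import strtobool` import that
# error_checking already relies on.
def string_to_boolean(options_parser,
                      boolean_options=("Target_Search", "Statistics", "RevComp",
                                       "Delete_Demultiplexed_FASTQ", "compress",
                                       "Analyze_Unknowns")):
    """Convert 'True'/'False' option strings to booleans; return (args, options_parser)."""
    args = options_parser.parse_args()
    for name in boolean_options:
        value = getattr(args, name, None)
        # Only convert options that are still raw strings from the options file.
        if isinstance(value, str):
            # strtobool accepts 'true'/'false', 'yes'/'no', '1'/'0', etc.
            options_parser.set_defaults(**{name: bool(strtobool(value))})
    # Re-parse once at the end instead of after every branch.
    return options_parser.parse_args(), options_parser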
def main(command_line_args=None):
    """
    Let's get this party started.
    :param command_line_args: Optional argument list; defaults to sys.argv.
    """
    start_time = time.time()
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv

    run_start = datetime.datetime.today().strftime("%a %b %d %H:%M:%S %Y")

    parser = argparse.ArgumentParser(
        description="A package to map genomic repair scars at defined loci.\n {} v{}"
        .format(__package__, __version__),
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('--options_file', action='store', dest='options_file', required=True,
                        help='File containing program parameters.')

    # Check options file for errors and return the args object.
    args = error_checking(string_to_boolean(parser))

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)
    module_name = ""
    log.info("{} v{}".format(__package__, __version__))

    if args.IndelProcessing:
        file_list = []
        if args.Platform in ("Illumina", "Ramsden", "TruSeq"):
            log.info("Sending FASTQ files to FASTQ preprocessor.")

            if args.PEAR:
                file_list = pear_consensus(args, log)
                if not file_list:
                    log.error("PEAR failed. Check logs.")
                    raise SystemExit(1)
                fastq_consensus = file_list[0]
                fq1 = FASTQ_Tools.FASTQ_Reader(fastq_consensus, log)
                fq2 = None
            else:
                fq2 = FASTQ_Tools.FASTQ_Reader(args.FASTQ2, log)
                fq1 = FASTQ_Tools.FASTQ_Reader(args.FASTQ1, log)

            sample_manifest = Tool_Box.FileParser.indices(log, args.SampleManifest)
            indel_processing = \
                Indel_Processing.DataProcessing(log, args, run_start, __version__,
                                                Target_Mapper.TargetMapper(log, args, sample_manifest),
                                                fq1, fq2)
            indel_processing.main_loop()

            # Compress or delete PEAR files.
            if args.PEAR and file_list:
                if args.DeleteConsensusFASTQ:
                    log.info("Deleting PEAR FASTQ Files.")
                    Tool_Box.delete(file_list)
                else:
                    log.info("Compressing {} FASTQ Files Generated by PEAR.".format(len(file_list)))
                    p = pathos.multiprocessing.Pool(int(args.Spawn))
                    p.starmap(Tool_Box.compress_files, zip(file_list, itertools.repeat(log)))
        else:
            log.error("Only 'Illumina', 'TruSeq' or 'Ramsden' --Platform methods currently allowed.")
            raise SystemExit(1)

    elif not args.IndelProcessing:
        # Run frequency file Combine module.
        run_start = datetime.datetime.today().strftime("%a %b %d %H:%M:%S %Y")
        log.info("Process Replicates.")
        data_dict = collections.defaultdict(list)
        file_list = glob.glob("{}*ScarMapper_Frequency.txt".format(args.DataFiles))
        file_count = len(file_list)

        page_header = "# ScarMapper File Merge v{}\n# Run: {}\n# Sample Name: {}\n" \
            .format(__version__, run_start, args.SampleName)

        line_num = 0
        with open(file_list[0]) as index_handle:
            index_file = list(csv.reader(index_handle, delimiter='\t'))
        for line in index_file:
            if not line:
                break
            elif line_num > 3:
                page_header += "{}\n".format(line[0])
            line_num += 1
        page_header += "\n\n"

        for file_name in file_list:
            freq_file_data = Tool_Box.FileParser.indices(log, file_name)
            for row in freq_file_data:
                key = "{}|{}|{}|{}".format(row[3], row[4], row[6], row[8])
                row_data = row[2:]

                if key in data_dict:
                    data_dict[key][0].append(float(row[1]))
                else:
                    data_dict[key] = [[float(row[1])], row_data]

        # Process data and write the combined frequency results file.
        plot_data_dict = collections.defaultdict(list)
        label_dict = collections.defaultdict(float)
        output_data_dict = collections.defaultdict(list)
        marker_list = []

        for key, row_list in data_dict.items():
            # Force pattern to be in at least half of the files.
            if len(row_list[0]) / file_count >= 0.5:
                row_string = "\t".join(row_list[1])
                freq = gmean(row_list[0])
                sem = stats.sem(row_list[0])
                freq_results_outstring = "{}\t{}\t{}\n".format(freq, sem, row_string)
                output_key = freq

                # freq is a 17-digit float, so duplicate keys are very unlikely; if one
                # does occur, nudge the key by a small epsilon until it is unique.
                while output_key in output_data_dict:
                    output_key += 1e-16

                scar_type = row_list[1][0]
                label_dict[scar_type] += freq

                # Gather up our data for plotting.
                lft_del = int(row_list[1][1])
                rt_del = int(row_list[1][2])
                mh_size = int(row_list[1][5])
                ins_size = int(row_list[1][7])

                output_data_dict[output_key] = \
                    [(freq, lft_del, rt_del, mh_size, ins_size, scar_type), freq_results_outstring]

        freq_results_outstring = \
            "{}# Frequency\tSEM\tScar Type\tLeft Deletions\tRight Deletions\tDeletion Size\tMicrohomology\t" \
            "Microhomology Size\tInsertion\tInsertion Size\tLeft Template\tRight Template\tConsensus Left Junction\t" \
            "Consensus Right Junction\tTarget Left Junction\tTarget Right Junction\tConsensus\tTarget Region\n" \
            .format(page_header)

        # Now draw a pretty graph of the data if we are not dealing with a negative control.
        for k in natsort.natsorted(output_data_dict, reverse=True):
            data_list = output_data_dict[k]
            freq_results_outstring += data_list[1]
            freq = data_list[0][0]
            lft_del = data_list[0][1]
            rt_del = data_list[0][2]
            mh_size = data_list[0][3]
            ins_size = data_list[0][4]
            scar_type = data_list[0][5]

            # Plotting all scar patterns is messy. This provides a cutoff.
            if freq < 0.00025:
                continue

            y_value = freq * 0.5
            lft_ins_width = freq
            rt_ins_width = freq

            # Gathered up to find the largest value; used to set the x-axis limits.
            marker_list.extend([lft_del + (mh_size * 0.5), rt_del + (mh_size * 0.5), ins_size])

            # Deletion size includes half the size of any microhomology present.
            lft_del_plot_value = (lft_del + (mh_size * 0.5)) * -1
            rt_del_plot_value = rt_del + (mh_size * 0.5)

            # Insertions are centered on 0, so take half the value for each side.
            lft_ins_plot_value = (ins_size * 0.5) * -1
            rt_ins_plot_value = ins_size * 0.5

            # Scale the width of bars for insertions inside of deletions.
            if lft_del + (mh_size * 0.5) != 0:
                lft_ins_width = freq * 0.5
            if rt_del + (mh_size * 0.5) != 0:
                rt_ins_width = freq * 0.5

            if scar_type not in plot_data_dict:
                plot_data_dict[scar_type] = \
                    [[freq], [lft_del_plot_value], [rt_del_plot_value], [lft_ins_plot_value],
                     [rt_ins_plot_value], [lft_ins_width], [rt_ins_width], [y_value]]
            else:
                # Get some previous plot data.
                count = len(plot_data_dict[scar_type][0])
                previous_freq = plot_data_dict[scar_type][0][count - 1]
                previous_y = plot_data_dict[scar_type][7][count - 1]

                plot_data_dict[scar_type][0].append(freq)
                plot_data_dict[scar_type][1].append(lft_del_plot_value)
                plot_data_dict[scar_type][2].append(rt_del_plot_value)
                plot_data_dict[scar_type][3].append(lft_ins_plot_value)
                plot_data_dict[scar_type][4].append(rt_ins_plot_value)
                plot_data_dict[scar_type][5].append(lft_ins_width)
                plot_data_dict[scar_type][6].append(rt_ins_width)

                # Use the previous plot data to find the y-value of the current bar.
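                # Worked example of the stacking arithmetic below (values are
                # illustrative, not from real data): with previous_freq = 0.010 and
                # previous_y = 0.005 (the centre of a bar of height 0.010), a new bar
                # with freq = 0.006 has y_value = 0.003, so its centre lands at
                # 0.005 + 0.002 (gap) + 0.005 (half of previous bar) + 0.003 = 0.015.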
                plot_data_dict[scar_type][7] \
                    .append(previous_y + 0.002 + (0.5 * previous_freq) + y_value)

        plot_data_dict['Marker'] = [(max(marker_list)) * -1, max(marker_list)]

        # sample_name = "{}.{}".format(args.Job_Name, args.SampleName)
        ScarMapperPlot.scarmapperplot(args, datafile=None, sample_name=args.SampleName,
                                      plot_data_dict=plot_data_dict, label_dict=label_dict)

        freq_results_file = \
            open("{}{}_ScarMapper_Combined_Frequency.txt".format(args.WorkingFolder, args.SampleName), "w")
        freq_results_file.write(freq_results_outstring)
        freq_results_file.close()

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info("****ScarMapper {0} complete ({1} seconds, {2} Mb peak memory).****\n{3}"
             .format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))

    # All done, so exit explicitly; otherwise Python will not release the log file on virtual Linux.
    sys.exit(0)
def error_checking(parser):
    """
    Check the parameter file for errors and return the parsed arguments and logger.
    :param parser: argparse parser holding the options file path.
    :return: (args, log) tuple.
    """

    def string_conversions(parser):
        """
        Convert True/False statements in the parameter file to booleans and cast numeric options.
        :param parser:
        :return: parsed argument namespace.
        """
        options_parser = Tool_Box.options_file(parser)
        initial_args = options_parser.parse_args()

        options_parser.set_defaults(TargetSearch=bool(strtobool(initial_args.TargetSearch)))
        options_parser.set_defaults(Statistics=bool(strtobool(initial_args.Statistics)))
        options_parser.set_defaults(Verbose=initial_args.Verbose.upper())

        if initial_args.Statistics == "False":
            options_parser.set_defaults(AnchorSeq=initial_args.AnchorSeq.upper())
            options_parser.set_defaults(
                Analyze_Unknowns=bool(strtobool(initial_args.Analyze_Unknowns)))
            options_parser.set_defaults(
                Delete_Demultiplexed_FASTQ=bool(strtobool(initial_args.Delete_Demultiplexed_FASTQ)))
            options_parser.set_defaults(RevComp=bool(strtobool(initial_args.RevComp)))
            options_parser.set_defaults(BatchSize=int(initial_args.BatchSize))
            options_parser.set_defaults(Target_Mismatch=int(initial_args.Target_Mismatch))
            options_parser.set_defaults(MinimumReadLength=int(initial_args.MinimumReadLength))
            options_parser.set_defaults(N_Limit=10)
            options_parser.set_defaults(Target_Length=int(initial_args.Target_Length))
            options_parser.set_defaults(Target_Start=int(initial_args.Target_Start))
            # options_parser.set_defaults(Index_Mismatch=int(initial_args.Index_Mismatch))
            options_parser.set_defaults(Spawn=int(initial_args.Spawn))
            options_parser.set_defaults(Target_Padding=int(initial_args.Target_Padding))
            options_parser.set_defaults(Expected_Position=int(initial_args.Expected_Position))
            options_parser.set_defaults(AnchorMismatch=int(initial_args.AnchorMismatch))
            options_parser.set_defaults(AnchorStart=int(initial_args.AnchorStart))
            options_parser.set_defaults(AnchorStop=int(initial_args.AnchorStop))
        else:
            options_parser.set_defaults(
                Write_TDnorm_Log2_sgRNA_Control_File=bool(
                    strtobool(initial_args.Write_TDnorm_Log2_sgRNA_Control_File)))
            options_parser.set_defaults(
                Write_TDnorm_Log2_sgRNA_Sample_File=bool(
                    strtobool(initial_args.Write_TDnorm_Log2_sgRNA_Sample_File)))
            options_parser.set_defaults(
                Write_Log2_sgRNA_File=bool(strtobool(initial_args.Write_Log2_sgRNA_File)))
            options_parser.set_defaults(
                Write_Permuted_Log2_Data_File=bool(strtobool(initial_args.Write_Permuted_Log2_Data_File)))
            options_parser.set_defaults(
                Bad_sgRNA_Lower_Percentile=float(initial_args.Bad_sgRNA_Lower_Percentile))
            options_parser.set_defaults(
                Bad_sgRNA_Upper_Percentile=float(initial_args.Bad_sgRNA_Upper_Percentile))
            options_parser.set_defaults(UpperPercentile=float(initial_args.UpperPercentile))
            options_parser.set_defaults(LowerPercentile=float(initial_args.LowerPercentile))
            options_parser.set_defaults(PermutationCount=int(initial_args.PermutationCount))
            options_parser.set_defaults(Alpha=float(initial_args.Alpha))
            options_parser.set_defaults(Target_Mismatch=float(initial_args.Target_Mismatch))
            options_parser.set_defaults(UpperGuideLimit=float(initial_args.UpperGuideLimit))
            options_parser.set_defaults(LowerGuideLimit=float(initial_args.LowerGuideLimit))

        initial_args = options_parser.parse_args()

        return initial_args

    args = string_conversions(parser)
    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, sys.argv)

    if not pathlib.Path(args.WorkingFolder).exists():
        print("\033[1;31mERROR:\n\tWorking Folder Path: {} Not Found. Check Parameter File."
              .format(args.WorkingFolder))
        raise SystemExit(1)

    if args.Statistics:
        if not pathlib.Path(args.DataFiles).exists():
            print("\033[1;31mERROR:\n\t--DataFiles Folder Path: {} Not Found. Check Parameter File."
                  .format(args.DataFiles))
            raise SystemExit(1)

    if not pathlib.Path(args.SampleManifest).exists():
        print("\033[1;31mERROR:\n\t--SampleManifest: {} Not Found. Check Parameter File."
              .format(args.SampleManifest))
        raise SystemExit(1)

    if not pathlib.Path(args.Master_Index_File).exists():
        print("\033[1;31mERROR:\n\t--Master_Index_File: {} Not Found. Check Parameter File."
              .format(args.Master_Index_File))
        raise SystemExit(1)

    if not pathlib.Path(args.Target_File).exists():
        print("\033[1;31mERROR:\n\t--Target_File: {} Not Found. Check Parameter File."
              .format(args.Target_File))
        raise SystemExit(1)

    if args.TargetSearch:
        if getattr(args, "FASTQ1", False) and not pathlib.Path(args.FASTQ1).exists():
            print("\033[1;31mERROR:\n\t--FASTQ1: {} Not Found. Check Parameter File."
                  .format(args.FASTQ1))
            raise SystemExit(1)

        # Older versions of python-magic return bytes; newer versions return str.
        try:
            mime_type1 = magic.from_file(args.FASTQ1, mime=True).decode()
        except AttributeError:
            mime_type1 = magic.from_file(args.FASTQ1, mime=True)

        if "text" not in mime_type1 and "gzip" not in mime_type1:
            log.error("Unsupported FASTQ file-type. Only TEXT or GZIP Allowed.")
            raise SystemExit(1)

    return args, log
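
# A minimal sketch of a helper that would collapse the repeated existence checks in
# error_checking into one call. The name check_path and its signature are hypothetical,
# not part of the packaged Tool_Box API; the ANSI error formatting matches the inline
# checks it would replace.
def check_path(path, option_name):
    """Exit with a formatted error if a required file or folder is missing."""
    if not pathlib.Path(path).exists():
        print("\033[1;31mERROR:\n\t{}: {} Not Found. Check Parameter File."
              .format(option_name, path))
        raise SystemExit(1)

# Example usage replacing the inline checks above:
#     check_path(args.SampleManifest, "--SampleManifest")
#     check_path(args.Target_File, "--Target_File")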