Example 1
def main(command_line_args=None):
    """

    :param command_line_args:
    """
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv

    parser = argparse.ArgumentParser(
        description="A package to process Synthetic Lethal Data.\n {0} v{1}".
        format(__package__, __version__),
        formatter_class=RawTextHelpFormatter)

    parser.add_argument('--options_file',
                        action='store',
                        dest='options_file',
                        required=True,
                        help='File containing program parameters.')

    # Convert universal variables intended as boolean from string to boolean.
    args, options_parser = string_to_boolean(Tool_Box.options_file(parser))

    # Check file names and paths for errors
    error_checking(args)

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)

    start_time = time.time()
    module_name = "Synthetic_Lethal"

    log.info(
        "{0} v{1}; Module: Synthetic Lethal Analysis v{2} Beginning".format(
            __package__, __version__, Synthetic_Lethal.__version__))

    synthetic_lethal = Synthetic_Lethal.SyntheticLethal(log, args)

    if args.TargetSearch:
        synthetic_lethal.fastq_analysis()
    elif args.Statistics:
        synthetic_lethal.statistics()
    else:
        log.error('No module selected to run.')

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info(
        "****Völundr {0} complete ({1} seconds, {2} Mb peak memory).****\n{3}".
        format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))
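Example 1 depends on a string_to_boolean helper that is not shown. A minimal sketch of what such a helper might look like, assuming it receives the parser returned by Tool_Box.options_file and that the boolean options are named as Example 1 uses them (TargetSearch, Statistics); the strtobool conversion mirrors Example 5:

from distutils.util import strtobool  # matches Example 5; deprecated since Python 3.10


def string_to_boolean(options_parser):
    """Convert 'True'/'False' option strings to real booleans (hypothetical helper)."""
    args = options_parser.parse_args()

    # These option names are assumptions based on how Example 1 uses the result.
    for name in ("TargetSearch", "Statistics"):
        value = getattr(args, name, False)
        if isinstance(value, str):
            options_parser.set_defaults(**{name: bool(strtobool(value))})

    return options_parser.parse_args(), options_parser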
Example 2
def main(command_line_args=None):
    """
    Entry point for FASTQ preprocessing.

    :param command_line_args: list of command-line arguments; defaults to sys.argv.
    """
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv
    run_start = datetime.datetime.today().strftime("%a %b %d %H:%M:%S %Y")
    parser = argparse.ArgumentParser(description="A little ditty to manipulate FASTQ files.\n {0} v{1}"
                                     .format(__package__, __version__), formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('--options_file', action='store', dest='options_file', required=True,
                        help='File containing program parameters.')

    options_parser = Tool_Box.options_file(parser)
    args = options_parser.parse_args()
    # args, options_parser = string_to_boolean(args, options_parser)
    options_parser.set_defaults(Trim5=0)
    options_parser.set_defaults(Trim3=0)
    options_parser.set_defaults(Minimum_Length=100)
    options_parser.set_defaults(N_Limit=100)
    options_parser.set_defaults(HaloPLEX=False)
    options_parser.set_defaults(ThruPLEX=False)
    options_parser.set_defaults(FASTQ_PreProcess=True)
    args = options_parser.parse_args()

    # Check options file for errors.
    error_checking(args)

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)
    start_time = time.time()
    module_name = ""

    # Initialize generator to read each FASTQ file
    fastq1 = FASTQ_Tools.FASTQ_Reader(args.FASTQ1, log)
    fastq2 = FASTQ_Tools.FASTQ_Reader(args.FASTQ2, log)
    index1 = FASTQ_Tools.FASTQ_Reader(args.Index1, log)
    index2 = FASTQ_Tools.FASTQ_Reader(args.Index2, log)

    splitter_data = FASTQ_Tools.FastqSplitter(args, log, fastq1, fastq2, index1, index2, paired_end=True)
    new_fastq1, new_fastq2 = splitter_data.file_writer()

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info("****FASTQ Preprocessing {0} complete ({1} seconds, {2} Mb peak memory).****"
             .format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))
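FASTQ_Tools.FASTQ_Reader is not shown in these examples. As a rough sketch of the generator the comment describes ("Initialize generator to read each FASTQ file"), assuming support for plain-text and gzipped input; the function name and record shape here are illustrative, not the package's actual API:

import gzip
import itertools


def fastq_records(file_path):
    """Yield one four-line FASTQ record at a time (illustrative sketch)."""
    opener = gzip.open if file_path.endswith(".gz") else open

    with opener(file_path, "rt") as handle:
        while True:
            record = tuple(line.rstrip("\n") for line in itertools.islice(handle, 4))
            if not record:
                return  # end of file
            yield record  # (header, sequence, separator, quality)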
Example 3
def main(command_line_args=None):
    """

    :param command_line_args:
    """
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv

    parser = argparse.ArgumentParser(
        description="A package to process Synthetic Lethal Data.\n {0} v{1}".
        format(__package__, __version__),
        formatter_class=RawTextHelpFormatter)

    parser.add_argument('--options_file',
                        action='store',
                        dest='options_file',
                        required=True,
                        help='File containing program parameters.')

    options_parser = Tool_Box.options_file(parser)
    args = options_parser.parse_args()

    # If we are doing statistical analysis the user will not input an Index_Mismatch value
    if not getattr(args, "Index_Mismatch", False):
        options_parser.add_argument("--Index_Mismatch",
                                    dest="Index_Mismatch",
                                    default=0)
        options_parser.add_argument("--Analyze_Unknowns",
                                    dest="Analyze_Unknowns",
                                    default="False")
        args = options_parser.parse_args()

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)
    start_time = time.time()
    module_name = "Synthetic_Lethal"

    log.info(
        "{0} v{1}; Module: Synthetic Lethal Analysis v{2} Beginning".format(
            __package__, __version__, Synthetic_Lethal.__version__))

    # Convert universal variables intended as boolean from string to boolean.
    # ToDo: Should be a cleaner method to do this.
    if args.Target_Search == "True":
        options_parser.set_defaults(Target_Search=True)
        if args.RevComp == "True":
            options_parser.set_defaults(RevComp=True)
        else:
            options_parser.set_defaults(RevComp=False)
        if args.Delete_Demultiplexed_FASTQ == "True":
            options_parser.set_defaults(Delete_Demultiplexed_FASTQ=True)
        else:
            options_parser.set_defaults(Delete_Demultiplexed_FASTQ=False)
        if args.compress == "True":
            options_parser.set_defaults(compress=True)
        else:
            options_parser.set_defaults(compress=False)
    else:
        options_parser.set_defaults(Target_Search=False)

    if args.Statistics == "True":
        options_parser.set_defaults(Statistics=True)
    else:
        options_parser.set_defaults(Statistics=False)

    if args.Analyze_Unknowns == "True":
        options_parser.set_defaults(Analyze_Unknowns=True)
    else:
        options_parser.set_defaults(Analyze_Unknowns=False)

    args = options_parser.parse_args()

    synthetic_lethal = Synthetic_Lethal.SyntheticLethal(log, args)

    if args.Target_Search:
        synthetic_lethal.fastq_analysis()
    elif args.Statistics:
        synthetic_lethal.statistics()
    else:
        log.error('No module selected to run.')

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info(
        "****Volundr {0} complete ({1} seconds, {2} Mb peak memory).****\n{3}".
        format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))
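The ToDo above asks for a cleaner way to do the string-to-boolean conversion. One possible refactor, sketched with the option names used in Example 3, replaces the if/else ladder with a small loop:

def set_boolean_defaults(options_parser, args, option_names):
    """Set each named option's default to True iff its string value is "True" (sketch only)."""
    for name in option_names:
        options_parser.set_defaults(**{name: getattr(args, name, "False") == "True"})


# Mirrors Example 3: Target_Search gates the demultiplexing-related flags.
set_boolean_defaults(options_parser, args, ["Target_Search", "Statistics", "Analyze_Unknowns"])
if args.Target_Search == "True":
    set_boolean_defaults(options_parser, args, ["RevComp", "Delete_Demultiplexed_FASTQ", "compress"])
args = options_parser.parse_args()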
Example 4
def main(command_line_args=None):
    """
    Let's get this party started.
    :param command_line_args:
    """
    start_time = time.time()
    VersionDependencies.python_check()

    if not command_line_args:
        command_line_args = sys.argv

    run_start = datetime.datetime.today().strftime("%H:%M:%S %Y  %a %b %d")
    parser = argparse.ArgumentParser(
        description="A package to map genomic repair scars at defined loci.\n {} v{}".format(
            __package__, __version__),
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('--options_file',
                        action='store',
                        dest='options_file',
                        required=True,
                        help='File containing program parameters.')

    # Check options file for errors and return object.
    args = error_checking(string_to_boolean(parser))

    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, command_line_args)

    module_name = ""
    log.info("{} v{}".format(__package__, __version__))

    if args.IndelProcessing:
        file_list = []
        if args.Platform in ("Illumina", "Ramsden", "TruSeq"):
            log.info("Sending FASTQ files to FASTQ preprocessor.")

            if args.PEAR:
                file_list = pear_consensus(args, log)
                if not file_list:
                    log.error("PEAR failed.  Check logs.")
                    raise SystemExit(1)
                fastq_consensus = file_list[0]

                fq1 = FASTQ_Tools.FASTQ_Reader(fastq_consensus, log)
                fq2 = None

            else:
                fq2 = FASTQ_Tools.FASTQ_Reader(args.FASTQ2, log)
                fq1 = FASTQ_Tools.FASTQ_Reader(args.FASTQ1, log)

            sample_manifest = Tool_Box.FileParser.indices(
                log, args.SampleManifest)
            indel_processing = \
                Indel_Processing.DataProcessing(log, args, run_start, __version__,
                                                Target_Mapper.TargetMapper(log, args, sample_manifest), fq1, fq2)

            indel_processing.main_loop()

            # Compress or delete PEAR files.
            if args.PEAR and file_list:
                if args.DeleteConsensusFASTQ:
                    log.info("Deleting PEAR FASTQ Files.")
                    Tool_Box.delete(file_list)
                else:
                    log.info(
                        "Compressing {} FASTQ Files Generated by PEAR.".format(
                            len(file_list)))
                    p = pathos.multiprocessing.Pool(int(args.Spawn))
                    p.starmap(Tool_Box.compress_files,
                              zip(file_list, itertools.repeat(log)))
        else:
            log.error(
                "Only 'Illumina', 'TruSeq' or 'Ramsden' --Platform methods currently allowed."
            )
            raise SystemExit(1)

    else:
        # Run frequency file Combine module
        run_start = datetime.datetime.today().strftime("%a %b %d %H:%M:%S %Y")
        log.info("Process Replicates.")
        data_dict = collections.defaultdict(list)
        file_list = glob.glob("{}*ScarMapper_Frequency.txt".format(args.DataFiles))
        file_count = len(file_list)
        page_header = "# ScarMapper File Merge v{}\n# Run: {}\n# Sample Name: {}\n" \
            .format(__version__, run_start, args.SampleName)

        line_num = 0
        with open(file_list[0]) as index_handle:
            index_file = list(csv.reader(index_handle, delimiter='\t'))
        for line in index_file:
            if not line:
                break
            elif line_num > 3:
                page_header += "{}\n".format(line[0])

            line_num += 1
        page_header += "\n\n"

        for file_name in file_list:
            freq_file_data = Tool_Box.FileParser.indices(log, file_name)

            for row in freq_file_data:
                key = "{}|{}|{}|{}".format(row[3], row[4], row[6], row[8])
                row_data = row[2:]

                if key in data_dict:
                    data_dict[key][0].append(float(row[1]))
                else:
                    data_dict[key] = [[float(row[1])], row_data]

        # Process Data and Write Combined Frequency results file

        plot_data_dict = collections.defaultdict(list)
        label_dict = collections.defaultdict(float)
        output_data_dict = collections.defaultdict(list)
        marker_list = []

        for key, row_list in data_dict.items():
            # Force pattern to be in at least half of the files.
            if len(row_list[0]) / file_count >= 0.5:
                row_string = "\t".join(row_list[1])
                freq = gmean(row_list[0])
                sem = stats.sem(row_list[0])
                freq_results_outstring = "{}\t{}\t{}\n".format(
                    freq, sem, row_string)
                output_key = freq

                # Freq is a 17-digit float, so duplicates are very unlikely; if one does occur,
                # nudge the key by a small amount until it is unique.
                while output_key in output_data_dict:
                    output_key += 1e-16

                scar_type = row_list[1][0]
                label_dict[scar_type] += freq

                # Gather up our data for plotting
                lft_del = int(row_list[1][1])
                rt_del = int(row_list[1][2])
                mh_size = int(row_list[1][5])
                ins_size = int(row_list[1][7])

                output_data_dict[output_key] = \
                    [(freq, lft_del, rt_del, mh_size, ins_size, scar_type), freq_results_outstring]

        freq_results_outstring = \
            "{}# Frequency\tSEM\tScar Type\tLeft Deletions\tRight Deletions\tDeletion Size\tMicrohomology\t" \
            "Microhomology Size\tInsertion\tInsertion Size\tLeft Template\tRight Template\tConsensus Left Junction\t" \
            "Consensus Right Junction\tTarget Left Junction\tTarget Right Junction\tConsensus\tTarget Region\n" \
            .format(page_header)

        # Now draw a pretty graph of the data if we are not dealing with a negative control.
        for k in natsort.natsorted(output_data_dict, reverse=True):
            data_list = output_data_dict[k]
            freq_results_outstring += data_list[1]

            freq = data_list[0][0]
            lft_del = data_list[0][1]
            rt_del = data_list[0][2]
            mh_size = data_list[0][3]
            ins_size = data_list[0][4]
            scar_type = data_list[0][5]

            # Plotting all scar patterns is messy.  This provides a cutoff.
            if freq < 0.00025:
                continue

            y_value = freq * 0.5
            lft_ins_width = freq
            rt_ins_width = freq

            # This is gathered up to find the largest value.  Used to set the x-axis limits.
            marker_list.extend([
                lft_del + (mh_size * 0.5), rt_del + (mh_size * 0.5), ins_size
            ])

            # Deletion size included half the size of any microhomology present.
            lft_del_plot_value = (lft_del + (mh_size * 0.5)) * -1
            rt_del_plot_value = rt_del + (mh_size * 0.5)

            # Insertions are centered on 0 so we need to take half the value for each side.
            lft_ins_plot_value = (ins_size * 0.5) * -1
            rt_ins_plot_value = ins_size * 0.5

            # Scale the width of bars for insertions inside of deletions
            if lft_del + (mh_size * 0.5) != 0:
                lft_ins_width = freq * 0.5
            if rt_del + (mh_size * 0.5) != 0:
                rt_ins_width = freq * 0.5

            if scar_type not in plot_data_dict:
                plot_data_dict[scar_type] = \
                    [[freq], [lft_del_plot_value], [rt_del_plot_value], [lft_ins_plot_value],
                     [rt_ins_plot_value], [lft_ins_width], [rt_ins_width], [y_value]]
            else:
                # Get some previous plot data
                count = len(plot_data_dict[scar_type][0])
                previous_freq = plot_data_dict[scar_type][0][count - 1]
                previous_y = plot_data_dict[scar_type][7][count - 1]

                plot_data_dict[scar_type][0].append(freq)
                plot_data_dict[scar_type][1].append(lft_del_plot_value)
                plot_data_dict[scar_type][2].append(rt_del_plot_value)
                plot_data_dict[scar_type][3].append(lft_ins_plot_value)
                plot_data_dict[scar_type][4].append(rt_ins_plot_value)
                plot_data_dict[scar_type][5].append(lft_ins_width)
                plot_data_dict[scar_type][6].append(rt_ins_width)

                # Use the previous plot data to find the y-value of the current bar.
                plot_data_dict[scar_type][7] \
                    .append(previous_y + 0.002 + (0.5 * previous_freq) + y_value)

        plot_data_dict['Marker'] = [(max(marker_list)) * -1, max(marker_list)]
        # sample_name = "{}.{}".format(args.Job_Name, args.SampleName)

        ScarMapperPlot.scarmapperplot(args,
                                      datafile=None,
                                      sample_name=args.SampleName,
                                      plot_data_dict=plot_data_dict,
                                      label_dict=label_dict)

        with open("{}{}_ScarMapper_Combined_Frequency.txt".format(args.WorkingFolder, args.SampleName), "w") \
                as freq_results_file:
            freq_results_file.write(freq_results_outstring)

    warning = "\033[1;31m **See warnings above**\033[m" if log.warning_occurred else ''
    elapsed_time = int(time.time() - start_time)
    log.info(
        "****ScarMapper {0} complete ({1} seconds, {2} Mb peak memory).****\n{3}".
        format(module_name, elapsed_time, Tool_Box.peak_memory(), warning))

    # All done, so quit explicitly; otherwise Python will not release the log file on virtual Linux.
    sys.exit(0)
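The combine branch in Example 4 averages each scar pattern's replicate frequencies with a geometric mean (gmean) and reports the standard error of the mean (stats.sem). A self-contained illustration of those two SciPy calls, with made-up sample values:

from scipy.stats import gmean, sem

replicate_freqs = [0.0012, 0.0015, 0.0011]  # hypothetical frequencies for one scar pattern

print(gmean(replicate_freqs))  # geometric mean -> the combined frequency
print(sem(replicate_freqs))    # standard error of the mean, written alongside it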
Example 5
def error_checking(parser):
    """
    Check parameter file for errors and return parser object.
    :param parser:
    :return:
    """
    def string_conversions(parser):
        """
        Convert True/False statements in parameter file to boolean
        :param parser:
        :return:
        """
        options_parser = Tool_Box.options_file(parser)
        initial_args = options_parser.parse_args()

        options_parser.set_defaults(
            TargetSearch=bool(strtobool(initial_args.TargetSearch)))
        options_parser.set_defaults(
            Statistics=bool(strtobool(initial_args.Statistics)))

        options_parser.set_defaults(Verbose=initial_args.Verbose.upper())

        if initial_args.Statistics == "False":
            options_parser.set_defaults(
                AnchorSeq=initial_args.AnchorSeq.upper())
            options_parser.set_defaults(Analyze_Unknowns=bool(
                strtobool(initial_args.Analyze_Unknowns)))
            options_parser.set_defaults(Delete_Demultiplexed_FASTQ=bool(
                strtobool(initial_args.Delete_Demultiplexed_FASTQ)))
            options_parser.set_defaults(
                RevComp=bool(strtobool(initial_args.RevComp)))
            options_parser.set_defaults(BatchSize=int(initial_args.BatchSize))
            options_parser.set_defaults(
                Target_Mismatch=int(initial_args.Target_Mismatch))
            options_parser.set_defaults(
                MinimumReadLength=int(initial_args.MinimumReadLength))
            options_parser.set_defaults(N_Limit=10)
            options_parser.set_defaults(
                Target_Length=int(initial_args.Target_Length))
            options_parser.set_defaults(
                Target_Start=int(initial_args.Target_Start))
            # options_parser.set_defaults(Index_Mismatch=int(initial_args.Index_Mismatch))
            options_parser.set_defaults(Spawn=int(initial_args.Spawn))
            options_parser.set_defaults(
                Target_Padding=int(initial_args.Target_Padding))
            options_parser.set_defaults(
                Expected_Position=int(initial_args.Expected_Position))
            options_parser.set_defaults(
                AnchorMismatch=int(initial_args.AnchorMismatch))
            options_parser.set_defaults(
                AnchorStart=int(initial_args.AnchorStart))
            options_parser.set_defaults(
                AnchorStop=int(initial_args.AnchorStop))
        else:
            options_parser.set_defaults(
                Write_TDnorm_Log2_sgRNA_Control_File=bool(
                    strtobool(
                        initial_args.Write_TDnorm_Log2_sgRNA_Control_File)))
            options_parser.set_defaults(
                Write_TDnorm_Log2_sgRNA_Sample_File=bool(
                    strtobool(
                        initial_args.Write_TDnorm_Log2_sgRNA_Sample_File)))
            options_parser.set_defaults(Write_Log2_sgRNA_File=bool(
                strtobool(initial_args.Write_Log2_sgRNA_File)))
            options_parser.set_defaults(Write_Permuted_Log2_Data_File=bool(
                strtobool(initial_args.Write_Permuted_Log2_Data_File)))
            options_parser.set_defaults(Bad_sgRNA_Lower_Percentile=float(
                initial_args.Bad_sgRNA_Lower_Percentile))
            options_parser.set_defaults(Bad_sgRNA_Upper_Percentile=float(
                initial_args.Bad_sgRNA_Upper_Percentile))
            options_parser.set_defaults(
                UpperPercentile=float(initial_args.UpperPercentile))
            options_parser.set_defaults(
                LowerPercentile=float(initial_args.LowerPercentile))
            options_parser.set_defaults(
                PermutationCount=int(initial_args.PermutationCount))
            options_parser.set_defaults(Alpha=float(initial_args.Alpha))
            options_parser.set_defaults(
                Target_Mismatch=float(initial_args.Target_Mismatch))
            options_parser.set_defaults(
                UpperGuideLimit=float(initial_args.UpperGuideLimit))
            options_parser.set_defaults(
                LowerGuideLimit=float(initial_args.LowerGuideLimit))

        initial_args = options_parser.parse_args()

        return initial_args

    args = string_conversions(parser)
    log = Tool_Box.Logger(args)
    Tool_Box.log_environment_info(log, args, sys.argv)

    if not pathlib.Path(args.WorkingFolder).exists():
        print(
            "\033[1;31mERROR:\n\tWorking Folder Path: {} Not Found.  Check Parameter File."
            .format(args.WorkingFolder))
        raise SystemExit(1)

    if args.Statistics:
        if not pathlib.Path(args.DataFiles).exists():
            print(
                "\033[1;31mERROR:\n\t--DataFiles Folder Path: {} Not Found.  Check Parameter File."
                .format(args.DataFiles))
            raise SystemExit(1)

    if not pathlib.Path(args.SampleManifest).exists():
        print(
            "\033[1;31mERROR:\n\t--SampleManifest: {} Not Found.  Check Parameter File."
            .format(args.SampleManifest))
        raise SystemExit(1)

    if not pathlib.Path(args.Master_Index_File).exists():
        print(
            "\033[1;31mERROR:\n\t--Master_Index_File: {} Not Found.  Check Parameter File."
            .format(args.Master_Index_File))
        raise SystemExit(1)

    if not pathlib.Path(args.Target_File).exists():
        print(
            "\033[1;31mERROR:\n\t--Target_File: {} Not Found.  Check Parameter File."
            .format(args.Target_File))
        raise SystemExit(1)

    if args.TargetSearch:
        if getattr(args, "FASTQ1",
                   False) and not pathlib.Path(args.FASTQ1).exists():
            print(
                "\033[1;31mERROR:\n\t--FASTQ1: {} Not Found.  Check Parameter File."
                .format(args.FASTQ1))
            raise SystemExit(1)

        try:
            mime_type1 = magic.from_file(args.FASTQ1, mime=True).decode()

        except AttributeError:
            mime_type1 = magic.from_file(args.FASTQ1, mime=True)

        if "text" in mime_type1 or "gzip" in mime_type1:
            pass
        else:
            log.error(
                "Unsupported FASTQ file-type.  Only TEXT or GZIP Allowed.")
            raise SystemExit(1)

    return args, log
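Example 5 leans on distutils.util.strtobool, which maps a fixed set of strings to 1 or 0 and raises ValueError on anything else. Note that distutils was removed in Python 3.12, so current interpreters need a small replacement. A quick illustration of the behavior:

from distutils.util import strtobool

print(bool(strtobool("True")))   # True  -- also accepts "y", "yes", "t", "on", "1"
print(bool(strtobool("False")))  # False -- also accepts "n", "no", "f", "off", "0"

try:
    strtobool("maybe")
except ValueError as error:
    print(error)  # invalid truth value 'maybe'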