Example 1
def _cluster_multi_process(p: multiprocessing.pool, data_normalized, start,
                           end, st, dist_func, verbose):
    # if len(data_normalized) < p._processes:  # group the time series first if # time series < # workers
    group_partition = __partition_and_group(data_normalized, p._processes,
                                            start, end, p)
    cluster_arg_partition = [(x, st, dist_func, verbose)
                             for x in group_partition]
    """
    Linear Cluster for debug purposes
    # cluster_partition = []
    # for arg in cluster_arg_partition:
    #     cluster_partition.append(_cluster_groups(*arg))

    """
    cluster_partition = p.starmap(_cluster_groups, cluster_arg_partition)
    cluster_meta_dict = _cluster_to_meta_mp(cluster_partition, p)

    subsequences = flatten(p.map(get_second, flatten(group_partition)))
    return subsequences, cluster_partition, cluster_meta_dict
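The examples in this file share one dispatch pattern: slice the input into one chunk per worker, build a list of argument tuples, and hand the list to Pool.starmap, which unpacks each tuple into a single worker call. The helpers used above (__partition_and_group, _cluster_groups, flatten, get_second) live elsewhere in this codebase and are not shown, so the sketch below uses hypothetical stand-ins (chunk, scale_chunk) just to isolate the dispatch shape.

import multiprocessing


def chunk(data, n_slices):
    # Split data into n_slices roughly equal pieces.
    size = max(1, len(data) // n_slices)
    return [data[i:i + size] for i in range(0, len(data), size)]


def scale_chunk(part, factor):
    # Toy worker: scale every element of one partition.
    return [x * factor for x in part]


if __name__ == '__main__':
    with multiprocessing.Pool(processes=4) as pool:
        # _processes is a private Pool attribute, mirrored from the examples above.
        partitions = chunk(list(range(100)), pool._processes)
        args = [(part, 2) for part in partitions]    # one tuple per worker call
        results = pool.starmap(scale_chunk, args)    # runs scale_chunk(*args[i]) in a worker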
Example 2
def runtime_parallel_np(pool: mp.pool,
                        sample: np.ndarray,
                        func: Callable,
                        num_resamples: int,
                        resample_size: int = None) -> float:
    assert isinstance(
        sample, np.ndarray), "Please convert the input into a numpy ndarray"
    sample_size = len(sample)
    if resample_size is None or resample_size > sample_size:
        resample_size = sample_size
    sample_ = mp.Array(ctypes.c_double, sample_size)
    sample_shr = np.ctypeslib.as_array(sample_.get_obj())
    sample_shr[:] = sample
    t_start = time.perf_counter()

    pool.starmap_async(bootstrap_np, [(sample_shr, func, resample_size)
                                      for _ in range(num_resamples)]).get()

    t_end = time.perf_counter()
    return t_end - t_start
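runtime_parallel_np copies the sample once into a shared multiprocessing.Array, exposes it to the workers as a numpy view, and times num_resamples bootstrap tasks dispatched with starmap_async(...).get(). The worker bootstrap_np is not shown above; the sketch below pairs the function with a hypothetical resampling worker to show how it might be driven.

import multiprocessing as mp
from typing import Callable

import numpy as np


def bootstrap_np(sample: np.ndarray, func: Callable, resample_size: int):
    # Hypothetical worker: draw one bootstrap resample (with replacement), apply func.
    idx = np.random.randint(0, len(sample), size=resample_size)
    return func(sample[idx])


if __name__ == '__main__':
    data = np.random.default_rng(0).normal(size=10_000)
    with mp.Pool(processes=4) as pool:
        elapsed = runtime_parallel_np(pool, data, np.mean, num_resamples=200)
    print(f"200 bootstrap means took {elapsed:.3f} s")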
Example 3
def _query_mp(p: multiprocessing.pool, clusters, **kwargs):
    query_arg_partition = [[x] + list(kwargs.values()) for x in clusters]

    # Linear query for debug purposes
    # candidates = []
    # for qp in query_arg_partition:
    #     rtn = _query_partition(*qp)
    #     candidates.append(rtn)

    candidates = flatten(p.starmap(_query_partition, query_arg_partition))
    return candidates
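_query_mp converts the keyword arguments into positional ones by appending kwargs.values() to each cluster, so the call is only correct when the caller passes keywords in the same order as _query_partition's positional parameters (dict insertion order is preserved from Python 3.7). A hypothetical alternative that keeps the keywords as keywords, assuming _query_partition accepts them by name:

import functools

def _query_mp_kw(p, clusters, **kwargs):
    # Bind the keyword arguments once; each worker call becomes
    # _query_partition(cluster, **kwargs) regardless of keyword order.
    worker = functools.partial(_query_partition, **kwargs)
    return flatten(p.map(worker, clusters))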
Example 4
def __partition_and_group(data,
                          slice_num,
                          start,
                          end,
                          p: multiprocessing.pool,
                          shuffle=True):
    data_partition = _partitioner(data, p._processes)
    group_arg_partition = [(x, start, end) for x in data_partition]

    # Linear partitioning for debugging
    # group_partition = []
    # for arg in group_arg_partition:
    #     group_partition.append(_group_time_series(*arg))

    group_partition = p.starmap(_group_time_series, group_arg_partition)
    return group_partition
Example 5
def _query_bf_mp(query, p: multiprocessing.pool, subsequences: list, dt_index):
    dist_subsequences_arg = [(query, x, dt_index) for x in subsequences]
    dist_subsequences = p.starmap(get_dist_query, dist_subsequences_arg)
    return dist_subsequences
Example 6
def _cluster_to_meta_mp(cluster_partition: list, p: multiprocessing.pool):
    clusters = flatten(cluster_partition)
    temp = p.map(_cluster_to_meta, clusters)
    return tuple(reduce_by_key(_cluster_reduce_func, temp))
Example 7
def main():
    """EdiTyper"""
    #   Setup EdiTyper
    #   Parse arguments
    parser = arguments.make_argument_parser() # type: argparse.ArgumentParser
    if not sys.argv[1:] or any(map(lambda a: a in sys.argv, ('-h', '--help'))):
        sys.exit(parser.print_help())
    args = {key: value for key, value in vars(parser.parse_args()).items() if value is not None} # type: Dict[str, Any]
    #   Make an output directory
    if os.path.exists(args['outdirectory']):
        args['outdirectory'] = args['outdirectory'] + time.strftime('_%Y-%m-%d_%H:%M')
    try:
        os.makedirs(args['outdirectory'])
    except OSError:
        pass
    finally:
        #   Make a prefix for project-level output files
        output_prefix = os.path.join(args['outdirectory'], args['project']) # type: str
    #   Setup logger
    #   Formatting values
    log_format = '%(asctime)s %(levelname)s:\t%(message)s' # type: str
    date_format = '%Y-%m-%d %H:%M:%S' # type: str
    #   Formatters
    stripped_formatter = toolkit.StrippedFormatter(fmt=log_format, datefmt=date_format) # type: toolkit.StrippedFormatter
    colored_formatter = toolkit.ColoredFormatter(fmt=log_format, datefmt=date_format) # type: toolkit.ColoredFormatter
    #   Open /dev/null (or whatever it is on Windows) to send basic stream information to
    devnull = open(os.devnull, 'w') # type: file
    #   Configure the logger
    verbosity = _set_verbosity(level=args['verbosity']) # type: int
    logging.basicConfig(
        stream=devnull,
        level=verbosity,
    )
    #   If we're being verbose, capture other warnings (mainly matplotlib and numpy)
    #   Otherwise, ignore them
    if verbosity == logging.DEBUG:
        logging.captureWarnings(True)
    else:
        warnings.filterwarnings('ignore')
    #   Setup a FileHandler for the log file
    #   Use a StrippedFormatter to remove extra ANSI color codes
    logname = output_prefix + '.log'
    logfile = logging.FileHandler(filename=logname, mode='w') # type: logging.FileHandler
    logfile.setFormatter(stripped_formatter)
    logging.getLogger().addHandler(logfile)
    #   Setup the console handler
    #   Use a ColoredFormatter because colors are cool
    console = logging.StreamHandler() # type: logging.StreamHandler
    console.setFormatter(colored_formatter)
    logging.getLogger().addHandler(console)
    #   Begin the program
    logging.info("Welcome to %s %s!", os.path.basename(sys.argv[0]), arguments.VERSION)
    program_start = time.time() # type: float
    #   Where are we putting our output directory?
    logging.warning("Using outdirectory \x1b[1m%s", args['outdirectory'])
    logging.warning("Full logfile can be found at %s", logname)
    #   Check suppression values and other arguments
    if args['suppress_sam']: # Suppressed SAM output?
        logging.warning("SAM output suppressed, not writing SAM file")
        args['bam'] = False
    elif args['bam']: # Check for SAMtools
        try:
            args['samtools_exec'] = toolkit.which('samtools')
        except ValueError: # No SAMtools found
            logging.error("Cannot find SAMtools, outputing SAM instead of BAM")
            args['bam'] = False
    if args['suppress_events'] or args['suppress_tables']: # Suppressed events table?
        logging.warning("Events output suppressed, not writing events table")
    if args['suppress_classification'] or args['suppress_tables']: # Suppressed classification table?
        logging.warning("Read classification suppressed, not writing classification table")
    if args['suppress_plots']: # Suppressed plots?
        logging.warning("Plots suppressed, not creating plots")
    else: # Search for Rscript
        try:
            args['Rscript'] = toolkit.which('Rscript')
        except ValueError: # No Rscript found
            logging.error("Cannot find Rscript, not generating plots")
            args['suppress_plots'] = True
    if _check_suppressions(suppressions=args): # All output suppressed? Error
        sys.exit(logging.critical("All output suppressed, not running"))
    # if args['xkcd']:
    #     plots._XKCD = True
    #   Enable the profiler if desired
    if args['profile']:
        toolkit._DO_PROFILE = True
    #   Read in reference and template sequences
    logging.info("Quality control...")
    #   Get genomic chromosome and start position
    try:
        chrom, args['genomic_start'] = sam.get_genomic_location(bedfile=args['reference_bed']) # type: str, int
    except KeyError: # Not provided
        chrom, args['genomic_start'] = '', 0 # type: str, int
    qc_start = time.time() # type: float
    reference = toolkit.load_seq(seq_file=args['reference'], chrom=chrom) # type: toolkit.NamedSequence
    template = toolkit.load_seq(seq_file=args['template']) # type: toolkit.NamedSequence
    #   Align template and reference sequences to determine alignment direction
    al_ref_seq, al_temp_seq = quality_control.align_reference( # type: str, str
        reference=reference.sequence,
        template=template.sequence,
        gap_penalty=args['gap_opening']
    )
    aligned_reference = toolkit.NamedSequence(name=reference.name, sequence=al_ref_seq) # type: toolkit.NamedSequence
    aligned_template = toolkit.NamedSequence(name=template.name, sequence=al_temp_seq) # type: toolkit.NamedSequence
    #   QC the alignments
    logging.info("Validating reference/template alignment...")
    alignment_validation = time.time() # type: float
    if '-' in set(aligned_reference.sequence):
        raise ValueError(logging.error("Cannot have insertions in the reference"))
    if '-' in set(toolkit.side_trimmer(seq=aligned_template.sequence)):
        raise ValueError(logging.error("Cannot have deletions in the template sequence"))
    template_reference_mismatch = toolkit.get_mismatch(seq_a=aligned_reference.sequence, seq_b=aligned_template.sequence) # type: List
    if not template_reference_mismatch:
        logging.error("No mismatches found between the reference and template sequenecs, going into NHEJ-only mode")
        template_reference_mismatch = list(itertools.repeat((None, ('',)), times=len(args['analysis_mode']))) # type: List[Tuple[None, Tuple[str]]]
    if len(template_reference_mismatch) != len(args['analysis_mode']):
        msg = "There can only be %(num)s mismatches in '%(mode)s' mode" % { # type: str
            'num': len(args['analysis_mode']),
            'mode': '+'.join(args['analysis_mode'])
        }
        if len(args['analysis_mode']) == 1:
            msg = msg.replace('mismatches', 'mismatch') # type: str
        raise ValueError(logging.error(msg))
    logging.debug("Reference/template aligmnent validation took %s seconds", round(time.time() - alignment_validation, 3))
    #   Get SNP information
    # snp_info_raw = template_reference_mismatch.pop(args['analysis_mode'].index('SNP'))
    snp_index, reference_state, target_snp = quality_control.get_snp_states( # type: int, str, str
        reference=aligned_reference.sequence,
        template=aligned_template.sequence,
        mismatch=template_reference_mismatch.pop(args['analysis_mode'].index('SNP'))
        # mismatch=snp_info_raw
    )
    snp = SNP(reference=reference_state, target=target_snp, position=snp_index) # type: SNP
    logging.debug("Quality control took %s seconds", round(time.time() - qc_start, 3))
    #   Collect FASTQ information
    if 'sample_list' in args:
        if not os.path.exists(args['sample_list']):
            raise ValueError(logging.critical("Cannot find sample list %s", args['sample_list']))
        with open(args['sample_list'], 'r') as listfile:
            fastq_list = tuple(line.strip() for line in listfile if not line.startswith('#')) # type: Tuple[str]
    elif 'input_file' in args:
        fastq_list = tuple(args['input_file']) # type: Tuple[str]
    elif 'fastq_directory' in args:
        fastq_list = toolkit.find_fastq(directory=args['fastq_directory']) # type: Tuple[str]
    else:
        sys.exit(logging.critical("No inputs provided"))
    zipped_args = zip( # type: Iterable[str, toolkit.NamedSequence, toolkit.NamedSequence, Dict[str, Any], SNP, str]
        fastq_list,
        itertools.repeat(reference),
        itertools.repeat(aligned_reference),
        itertools.repeat(args),
        itertools.repeat(snp),
        itertools.repeat(args['outdirectory'])
    )
    #   Tell the pool to ignore SIGINT (^C)
    #   by turning INTERRUPT signals into IGNORED signals
    sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN) # type: function
    #   Setup our multiprocessing pool
    #   Allow the user to specify the number of jobs to run at once
    #   If not specified, let multiprocessing figure it out
    try:
        pool = Pool(processes=args['num_cores']) # type: multiprocessing.Pool
    except KeyError:
        pool = Pool() # type: multiprocessing.Pool
    #   Re-enable the capturing of SIGINT, catch with KeyboardInterrupt
    #   or ExitPool, depending on how the exit was initiated
    #   Note: SystemExits are swallowed by Pool, no way to change that
    signal.signal(signal.SIGINT, sigint_handler)
    #   If we have multiple FASTQ files AND multiple processes running
    #   use pool.map_async; else use generic map to avoid timeout issues
    if all(map(lambda i: i > 1, (len(fastq_list), getattr(pool, '_processes')))):
        try:
            #   Use map_async and get with a large timeout
            #   to allow for KeyboardInterrupts to be caught
            #   and handled with the try/except
            timeout = max((9999, 600 * len(fastq_list))) # type: int
            logging.debug("Setting timeout to %s seconds", timeout)
            res = pool.map_async(crispr_analysis, zipped_args) # type: multiprocessing.pool.MapResult
            pool.close()
            results = res.get(timeout)
        except (KeyboardInterrupt, ExitPool) as error: # Handle ctrl+c or custom ExitPool
            pool.terminate()
            pool.join()
            if isinstance(error, KeyboardInterrupt): # ctrl+c
                sys.exit('\nkilled')
            elif isinstance(error, ExitPool): # My way of handling SystemExits
                sys.exit(error.msg)
            else: # Shouldn't happen, but you know...
                raise
        else:
            pool.join()
    #   Otherwise, don't bother with pool.map(); use a plain map to make life easy
    else:
        #   Clean up the pool
        pool.close(); pool.terminate(); pool.join()
        #   Use standard map (or itertools.imap if Python 2)
        results = map(crispr_analysis, zipped_args) # type: Iterable[Tuple[Tuple[alignment.Alignment]], Tuple[Dict[str, Any]]]
    #   Sort our alignments and summaries into separate collections
    try:
        alignments, summaries = zip(*results) # type: Tuple[Tuple[alignment.Alignment]], Tuple[Dict[str, Any]]
    except ExitPool as error: # Handle ExitPool calls for single-threaded map
        sys.exit(error.msg)
    #   Unpack our alignments into a single tuple
    alignments = toolkit.unpack(collection=alignments) # type: Tuple[alignment.Alignment]
    #   Final batch summary plot and table
    if not args['suppress_plots']:
        if alignments:
            plots.quality_plot(
                alignments=alignments,
                thresholds={d['filename']: d['score_threshold'] for d in summaries},
                output_prefix=output_prefix
            )
        else:
            logging.error("No passing reads found in any file, not producing quality plot")
    if not (args['suppress_classification'] or args['suppress_events'] or args['suppress_tables']):
        summary_name = output_prefix + '.summary.txt' # type: str
        summary_header = (
            '#FASTQ',
            'TOTAL_READS',
            'TOTAL_NON_DISC',
            'UNIQ_READS',
            'DISCARDED',
            'SNP_POS',
            'REF_STATE',
            'TEMP_SNP',
            'NO_EDIT',
            'PERC_NO_EDIT',
            'HDR',
            'PERC_HDR',
            'MIX',
            'PERC_MIX',
            'NHEJ',
            'PERC_NHEJ',
            'PERC_MIS_A',
            'PERC_MIS_T',
            'PERC_MIS_C',
            'PERC_MIS_G'
        )
        logging.info("Writing summary to %s", summary_name)
        summary_start = time.time() # type: float
        with open(summary_name, 'w') as summfile:
            summfile.write('\t'.join(summary_header) + '\n')
            summfile.flush()
            for sum_dict in sorted(summaries, key=lambda d: d['filename']): # type: Dict[str, Any]
                out = ( # type: Tuple[Any]
                    sum_dict['filename'],
                    sum_dict['total_reads'] + sum_dict['discarded'],
                    sum_dict['total_reads'],
                    sum_dict['unique_reads'],
                    sum_dict['discarded'],
                    snp.position + 1 if _is_snp(snp=snp) else analysis.NA,
                    snp.reference if _is_snp(snp=snp) else analysis.NA,
                    snp.target if _is_snp(snp=snp) else analysis.NA,
                    sum_dict['no_edit'],
                    sum_dict['no_edit_perc'],
                    sum_dict['hdr'],
                    sum_dict['hdr_perc'],
                    sum_dict['mix'],
                    sum_dict['mix_perc'],
                    sum_dict['nhej'],
                    sum_dict['nhej_perc'],
                    sum_dict['perc_a'],
                    sum_dict['perc_t'],
                    sum_dict['perc_c'],
                    sum_dict['perc_g']
                )
                out = map(str, out) # type: Iterable[str]
                summfile.write('\t'.join(out))
                summfile.write('\n')
                summfile.flush()
        logging.debug("Writing summary took %s seconds", round(time.time() - summary_start, 3))
    #   Close logfile
    logging.debug("Entire program took %s seconds to run", round(time.time() - program_start, 3))
    logging.info("Thank you for using %s", os.path.basename(sys.argv[0]))
    devnull.close()
    try:
        logfile.close()
    except NameError:
        pass
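main() wraps Pool creation in a small signal dance: SIGINT is set to SIG_IGN before the pool is created so every worker inherits the "ignore" handler, then the original handler is restored in the parent, which catches Ctrl+C as a single KeyboardInterrupt around map_async(...).get() instead of getting a traceback from every worker. A minimal standalone sketch of just that idiom, with a toy worker (slow_square) standing in for crispr_analysis:

import signal
import time
from multiprocessing import Pool


def slow_square(x):
    # Toy worker standing in for the real per-file analysis.
    time.sleep(0.1)
    return x * x


if __name__ == '__main__':
    original = signal.signal(signal.SIGINT, signal.SIG_IGN)  # workers inherit "ignore SIGINT"
    pool = Pool(processes=4)
    signal.signal(signal.SIGINT, original)                   # parent handles Ctrl+C again
    try:
        res = pool.map_async(slow_square, range(20))
        pool.close()
        # A large timeout on .get() keeps KeyboardInterrupt deliverable to the parent.
        results = res.get(timeout=9999)
    except KeyboardInterrupt:
        pool.terminate()
        raise SystemExit('\nkilled')
    finally:
        pool.join()
    print(results[:5])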
Example 8
def runtime_parallel(resamples: np.ndarray, func: Callable,
                     pool: mp.pool) -> float:
    t_start = time.perf_counter()
    pool.map(func, resamples, 500)  # running time should be O(mn)
    t_end = time.perf_counter()
    return t_end - t_start
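In runtime_parallel the bare 500 is the third positional parameter of Pool.map, the chunksize: workers pull the resamples in batches of 500 instead of one at a time, which cuts inter-process dispatch overhead when func is cheap per item. The same call with the parameter named explicitly:

# Identical to the call in runtime_parallel, with the batching parameter named.
pool.map(func, resamples, chunksize=500)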