Example #1
def align(fp_obj, references=[], counter_references=None,
          random=True, unique=True, max_quality='70',
          quals_type='solexa1.3',  mismatches='2', seed_len='28',
          counter_mismatches=None,
          use_quality=True, logging_level=10, num_cpus=1, **kwargs):
    if not unique and not random:
        raise Usage('Nothing to do')
    # flags shared by every bowtie invocation
    common_flags = ['-y', '-a', '--time', '--best', '--chunkmbs', '1024',
                    '--strata', '--sam', '-p', str(num_cpus)]
    logger = get_logger(logging_level)
    uniqueness = {}
    if unique:
        # -m 1: suppress reads that align to more than one locus
        uniqueness['unique'] = ['-m', '1']
    if random:
        # -M 1: report one alignment chosen at random among the best
        uniqueness['random'] = ['-M', '1']
    stdout_buffer = []
    common_flags.extend(['-l', seed_len])

    if counter_references is not None:
        # counter-align first
        flags = list(common_flags)
        # counter-alignment always keeps one random best hit
        flags.extend(uniqueness.get('random', ['-M', '1']))
        if counter_mismatches is None:
            counter_mismatches = mismatches
        if use_quality:
            # -n mode: quality-aware, mismatches counted within the seed
            flags.extend(['-e', max_quality])
            flags.extend(['-n', counter_mismatches])
        else:
            # -v mode: ignore qualities, count mismatches end to end
            flags.extend(['-v', counter_mismatches])

        if fp_obj.paired_end:
            flags.extend(['-X', '600'])  # max insert size for valid pairs
        new_filenames = counteralign_once(fp_obj, flags, counter_references,
                                          logger=logger, **kwargs)

        # realign downstream from the counter-aligned output files
        fp_obj.input_file = new_filenames[0]
        fp_obj.second_file = new_filenames[1]
        fp_obj.use_pysam = True
        fp_obj.format = 'BAM'

    for match_type, match_flags in uniqueness.items():
        flags = list(common_flags)
        flags.extend(match_flags)

        if use_quality:
            flags.extend(['-e', max_quality])
            flags.extend(['-n', mismatches])
        else:
            flags.extend(['-v', mismatches])

        if fp_obj.paired_end:
            flags.extend(['-X', '600'])

        for ref in references:
            s = align_once(fp_obj, flags, ref, match_type,
                           logger=logger, **kwargs)
            stdout_buffer.append(s)
    return '\n'.join([s for s in stdout_buffer if s is not None])
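For orientation, the flag lists assembled above amount to a bowtie (version 1) command line like the one below. A minimal sketch, assuming bowtie is on the PATH; the index, reads, and output names are hypothetical, and the real command construction happens inside align_once.
import subprocess

# Hypothetical inputs; in align() these come from fp_obj and references.
index, reads, out_sam = 'genomes/hg19', 'sample.fastq', 'sample.unique.sam'

cmd = ['bowtie', '-y', '-a', '--time', '--best', '--chunkmbs', '1024',
       '--strata', '--sam',
       '-p', '4',    # worker threads
       '-l', '28',   # seed length
       '-m', '1',    # the 'unique' flavor: drop multi-mapping reads
       '-e', '70',   # max summed quality at mismatched positions
       '-n', '2',    # mismatches tolerated in the seed
       index, reads, out_sam]
subprocess.check_call(cmd)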
def compare_clone_to_ref(clone, ref, logging_level=20, **kwargs):
    """
    process-safe comparison of a clone Sequence to reference Sequence
    aligns using clonechecker.align.align_clone_to_ref
    returns the pickled tuple (clone, Alignment)
            or None (if AlignmentError is raised)
    """
    logger = get_logger(logging_level)
    try:
        aln = align_clone_to_ref(clone, ref)
    except AlignmentError:
        return None  # alignment failed; nothing to report
    if aln.is_match():
        logger.info("match found %s, %s", clone.Name, ref.Name)
    if aln.is_truncated:
        logger.debug("alignment is truncated (%s, %s)", clone.Name, ref.Name)
        if not aln.has_gaps and not aln.has_mismatches:
            logger.info("truncated match found %s, %s", clone.Name, ref.Name)
    if aln.has_gaps:
        logger.debug("alignment has gaps (%s, %s)", clone.Name, ref.Name)
    if aln.has_mismatches:
        logger.debug("alignment has mismatches (%s, %s)", clone.Name, ref.Name)
        if not aln.is_truncated and not aln.has_gaps:
            logger.info("mutated match found %s, %s", clone.Name, ref.Name)
    return dumps((aln.Clone.Name, aln))  # pickled so it crosses process boundaries
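Because the return value is already pickled, compare_clone_to_ref drops straight into a worker pool: fan the clones out, discard the None results, and unpickle the survivors in the parent. A minimal sketch of such a driver; the pool wiring is illustrative, not the project's actual harness.
from functools import partial
from multiprocessing import Pool
from pickle import loads

def compare_all(clones, ref, processes=4):
    """Compare every clone to ref in parallel; map clone names to Alignments."""
    with Pool(processes) as pool:
        pickled = pool.map(partial(compare_clone_to_ref, ref=ref), clones)
    return dict(loads(p) for p in pickled if p is not None)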
def announce_first(clone, logging_level=20, **kwargs):
    """
    Announce to stderr that we are about to start comparing a clone to
    all available references
    """
    logger = get_logger(logging_level)
    logger.info("Comparing %s to references", clone.Name)
def align_bwa(fp_obj, references=[],
              unique=True, seed_len='28',
              use_quality=True, logging_level=10, num_threads=1,
              passthru_args=None,
              **kwargs):
    common_flags = ['-t', str(num_threads), '-M']  # -M: mark shorter split hits as secondary

    logger = get_logger(logging_level)
    if not references:
        logger.critical('Nothing to do')
        return
    stdout_buffer = []

    if fp_obj.format in ('BAM', 'SAM'):
        convert_to_fastq(fp_obj, logger=logger)

    if fp_obj.format != 'FASTQ':
        logger.critical('%s only supports FASTQ files', __file__)
        return

    if passthru_args is not None:
        # options arrive spelled with '+' so the wrapper's own option parser
        # leaves them alone; restore the leading dashes before handing them on
        passthru_args = [arg.replace('+', '-') for arg in passthru_args]
        logger.debug('Passing thru arguments %s', ' '.join(passthru_args))
    kwargs['passthru_args'] = passthru_args

    flags = list(common_flags)

    for ref in references:
        s = align_once(fp_obj, flags, ref, logger=logger, **kwargs)
        stdout_buffer.append(s)
    return '\n'.join([s for s in stdout_buffer if s is not None])
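The '+'-to-'-' substitution above is what lets extra aligner options ride through: callers spell them with '+' so the wrapper's own option parser does not claim them, and the dashes are restored just before the aligner runs. A small round-trip sketch; -k and -T are real bwa-mem options, used here purely as an illustration.
# Caller side: '+' keeps the wrapper's parser from eating these tokens.
raw_args = ['+k', '19', '+T', '30']

# Wrapper side, as in align_bwa above: restore the leading dashes.
passthru = [arg.replace('+', '-') for arg in raw_args]
assert passthru == ['-k', '19', '-T', '30']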
def splitter(pf, **kwargs):
    logger = get_logger()
    if pf.paired_end:
        return split_paired_files(pf, logger=logger,
                                  **kwargs)
    else:
        return split_file(pf, logger=logger,
                          **kwargs)
Example #6
def action(fp_obj, motif_file=None, motif_type=None, motif_number=1,
           **kwargs):
    logger = get_logger()
    logger.debug("trying to find sites for %s", fp_obj.input_file)
    motif = get_motif(motif_file, motif_number, motif_type)
    stdout_buffer = find_sites(fp_obj.input_file,
                               fp_obj.fasta_file,
                               motif, bed=fp_obj.is_bed, xls=fp_obj.is_xls,
                               output_dir=fp_obj.output_dir,
                               src_fnc=__file__, **kwargs)
    return stdout_buffer
Example #7
def align2(fp_obj, references=[], counter_references=None,
           unique=True, seed_len='28',
           use_quality=True, logging_level=10, num_cpus=1,
           passthru_args=None,
           **kwargs):
    common_flags = ['--time', '-p', str(num_cpus), '-L', seed_len]
    if fp_obj.paired_end:
        common_flags.extend(['-X', '600'])

    logger = get_logger(logging_level)
    if not references:
        logger.critical('Nothing to do')
        return
    stdout_buffer = []

    if fp_obj.format in ('BAM', 'SAM'):
        convert_to_fastq(fp_obj, logger=logger)

    if fp_obj.format != 'FASTQ':
        logger.critical('%s only supports FASTQ files', __file__)
        return

    if passthru_args is not None:
        # options arrive spelled with '+' so the wrapper's own option parser
        # leaves them alone; restore the leading dashes before handing them on
        passthru_args = [arg.replace('+', '-') for arg in passthru_args]
        logger.debug('Passing thru arguments %s', ' '.join(passthru_args))
    kwargs['passthru_args'] = passthru_args

    if counter_references is not None:
        # counter-align first
        flags = list(common_flags)

        flags.append('--fast')
        new_filenames = counteralign_once(fp_obj, flags, counter_references,
                                          logger=logger, **kwargs)
        # realign downstream from the counter-aligned output files
        fp_obj.input_file = new_filenames[0]
        fp_obj.second_file = new_filenames[1]

    flags = list(common_flags)

    for ref in references:
        s = align_once(fp_obj, flags, ref, logger=logger, **kwargs)
        stdout_buffer.append(s)
    return '\n'.join([s for s in stdout_buffer if s is not None])
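As in the first example, the flags assembled here map onto a bowtie2 invocation roughly like this minimal sketch (index and read files hypothetical; align_once does the real assembly):
import subprocess

cmd = ['bowtie2', '--time',
       '-p', '4',               # threads
       '-L', '28',              # seed substring length
       '-X', '600',             # max fragment length for paired ends
       '-x', 'genomes/hg19',    # index basename
       '-1', 'reads_1.fastq', '-2', 'reads_2.fastq',
       '-S', 'out.sam']
subprocess.check_call(cmd)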
def run_macs(f, subpeaks=True, path_to_macs=None, logging_level=10,
             user_gsize=None, qvalue=0.01, passthru_args=None,
             **kwargs):
    """Run MACS on a BAM file
    """
    logger = get_logger(logging_level)
    if path_to_macs is None:
        path_to_macs = path_to_executable("macs2")

    input_file = f.input_file
    control_file = f.control_file
    logger.debug('Processing %s', input_file)
    if control_file is not None:
        logger.debug('with control %s', control_file)

    # determine genome name and size
    if user_gsize:
        genome_size = user_gsize
        try:
            genome_build = guess_bam_genome(input_file)
        except NoMatchFoundError:
            genome_build = None
    else:
        try:
            genome_build = guess_bam_genome(input_file)
        except NoMatchFoundError:
            raise Usage('Could not determine genome / genome size '
                        'for file %s' % input_file)

        gname = ''.join([x for x in genome_build if x.isalpha()])
        if gname == 'hg':
            genome_size = 'hs'
        elif gname in ['mm', 'ce', 'dm']:
            genome_size = gname
        else:
            genome_size = '%.1e' % sum(genome(genome_build).values())

    fmt = decide_format(input_file, control_file, logger)
    name = f.sample_name.replace(' ', '_')
    if passthru_args is not None:
        # options arrive spelled with '+' so the wrapper's own option parser
        # leaves them alone; restore the leading dashes before handing them on
        passthru_args = [arg.replace('+', '-') for arg in passthru_args]
        logger.debug('Passing thru arguments %s', ' '.join(passthru_args))
    macs_options = ['--trackline',
                    '-f', fmt,  # correct file format BAM or BAMPE
                    '-B', '--SPMR',  # bedgraphs, SPMR
                    '-g', genome_size,
                    '-q', str(qvalue),  # Popen arguments must be strings
                    '-n', name,  # run name
                    '-t', join(getcwd(), input_file)]  # treatment
    if control_file is not None:
        macs_options.extend(['-c', join(getcwd(), control_file)])
    if subpeaks:
        macs_options.append('--call-summits')
    if passthru_args is not None:
        macs_options.extend(passthru_args)

    step = [path_to_macs, 'callpeak'] + macs_options
    if platform.system() == 'Windows':
        step.insert(0, sys.executable)

    macs_stdout = PolledPipe(logger=logger, level=WARN)
    macs_stderr = PolledPipe(logger=logger, level=ERROR)
    logger.debug('Launching %s', ' '.join(step))
    job = Popen(step, stdout=macs_stdout.w, stderr=macs_stderr.w,
                cwd=f.output_dir)

    pollables = [macs_stdout, macs_stderr]
    wait_for_job(job, pollables, logger)

    return '%s\n\n' % ' '.join(step)
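Stripped of the logging and pipe plumbing, run_macs launches a macs2 callpeak command shaped like the sketch below. File and sample names are hypothetical, macs2 must be installed, and the output directory must already exist.
import subprocess
from os import getcwd
from os.path import join

cmd = ['macs2', 'callpeak',
       '--trackline',
       '-f', 'BAM',         # or BAMPE for paired-end data
       '-B', '--SPMR',      # bedGraphs, signal per million reads
       '-g', 'hs',          # human effective genome size
       '-q', '0.01',
       '-n', 'my_sample',
       '-t', join(getcwd(), 'sample.bam'),
       '-c', join(getcwd(), 'control.bam'),
       '--call-summits']
subprocess.check_call(cmd, cwd='macs_output')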
def split_file(fp_obj, no_gzip=False,
               barcodes=[], linker='', min_length=4,
               max_length=-1, logger=None,
               strip_before_barcode=0,
               strip_after_barcode=0,
               no_clipping=False,
               **kwargs):
    if logger is None:
        logger = get_logger()
    filename = fp_obj.input_file
    open_func, format_ = discover_file_format(filename)
    if format_ != 'FASTQ':
        logger.error('Only FASTQ files are supported at this time')
        return
    f = open_func(filename, "rU")
    records = FasterFastqIterator(f)

    barcoded_files = {}
    filenames = []
    output_filename = partial(fp_obj.output_filename, no_gzip=no_gzip)
    if no_gzip:
        open_func = open
    elif PATH_TO_GZIP is not None:
        # spawn the external gzip binary; compression then runs in its own process
        open_func = gzip_class_factory(PATH_TO_GZIP)
    else:
        open_func = GzipFile
    if barcodes is None:
        barcodes = []
    if barcodes:
        processed_file = None
        for barcode in barcodes:
            fname = output_filename(barcode)
            filenames.append(fname)
            barcoded_files[barcode] = open_func(fname, 'w')

        # and make an unmatched file
        unmatched_filename = output_filename("unmatched")
        filenames.append(unmatched_filename)
        unmatched_file = open_func(unmatched_filename, 'w')
    else:
        barcoded_files = None
        unmatched_file = None
        processed_filename = output_filename("processed", is_barcode=False)
        filenames.append(processed_filename)
        processed_file = open_func(processed_filename, 'w')

    writer_args = {'barcoded_files': barcoded_files,
                   'unmatched_file': unmatched_file,
                   'processed_file': processed_file}
    results = apply_plan(records, writer_args, barcodes=barcodes,
                         linker=linker,
                         min_length=min_length, max_length=max_length,
                         strip_after_barcode=strip_after_barcode,
                         strip_before_barcode=strip_before_barcode,
                         no_clipping=no_clipping,
                         logger=logger)
    linker_only = results['linker']
    too_short = results['short']
    record_count = results['all']
    # close and exit #
    f.close()
    if barcoded_files is not None:
        logger.debug('closing barcoded files')
        for f_ in barcoded_files.values():
            f_.close()
    if unmatched_file is not None:
        logger.debug('closing unmatched file')
        unmatched_file.close()
    if processed_file is not None:
        logger.debug('closing output file')
        processed_file.close()

    logger.info('Split %s into %s', fp_obj.input_file, ', '.join(filenames))
    logger.info('Processed %s records', record_count)
    logger.info('%s linker-only dimers', linker_only)
    logger.info('%s sequences shorter than %s bp', too_short, min_length)
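The opener selection near the top of split_file prefers piping through an external gzip binary (gzip_class_factory) over Python's GzipFile, which moves compression out of the Python loop and into its own process. A rough sketch of what such a factory's writer could look like; the project's real implementation is not shown here and may differ.
import subprocess

class PipedGzipWriter(object):
    """Illustrative stand-in: write through an external gzip process."""

    def __init__(self, filename, mode='w'):  # mode kept for interface parity
        self._fh = open(filename, 'wb')
        self._proc = subprocess.Popen(['gzip', '-c'], stdin=subprocess.PIPE,
                                      stdout=self._fh)

    def write(self, data):
        if isinstance(data, str):
            data = data.encode()  # accept text, since split_file writes str
        self._proc.stdin.write(data)

    def close(self):
        self._proc.stdin.close()
        self._proc.wait()
        self._fh.close()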