def align(fp_obj, references=[], counter_references=None, random=True,
          unique=True, max_quality='70', quals_type='solexa1.3',
          mismatches='2', seed_len='28', counter_mismatches=None,
          use_quality=True, logging_level=10, num_cpus=1, **kwargs):
    """Align reads with bowtie, optionally counter-aligning first.

    Runs one alignment per reference for each requested uniqueness mode:
    'unique' keeps reads with a single best hit (-m 1), 'random' reports
    one randomly chosen best hit per multi-mapping read (-M 1).
    """
    if not unique and not random:
        raise Usage('Nothing to do')
    common_flags = ['-y', '-a', '--time', '--best', '--chunkmbs', '1024',
                    '--strata', '--sam']
    common_flags.extend(['-p', str(num_cpus)])
    logger = get_logger(logging_level)
    uniqueness = {}
    if unique:
        uniqueness.update({'unique': ['-m', '1']})
    if random:
        uniqueness.update({'random': ['-M', '1']})
    stdout_buffer = []
    common_flags.extend(['-l', seed_len])
    if counter_references is not None:
        # counter-align first
        flags = [item for item in common_flags]
        # counter-alignment always reports one random hit per read
        # (spelled out here to avoid a KeyError when random=False)
        flags.extend(['-M', '1'])
        if counter_mismatches is None:
            counter_mismatches = mismatches
        if use_quality:
            flags.extend(['-e', max_quality])
            flags.extend(['-n', counter_mismatches])
        else:
            flags.extend(['-v', counter_mismatches])
        if fp_obj.paired_end:
            flags.extend(['-X', '600'])
        new_filenames = counteralign_once(fp_obj, flags, counter_references,
                                          logger=logger, **kwargs)
        # after counter-alignment, align the surviving reads instead
        fp_obj.input_file = new_filenames[0]
        fp_obj.second_file = new_filenames[1]
        fp_obj.use_pysam = True
        fp_obj.format = 'BAM'
    for match_type, match_flag in uniqueness.items():
        flags = [item for item in common_flags]
        flags.extend(match_flag)
        if use_quality:
            flags.extend(['-e', max_quality])
            flags.extend(['-n', mismatches])
        else:
            flags.extend(['-v', mismatches])
        if fp_obj.paired_end:
            flags.extend(['-X', '600'])
        for ref in references:
            s = align_once(fp_obj, flags, ref, match_type, logger=logger,
                           **kwargs)
            stdout_buffer.append(s)
    return '\n'.join([s for s in stdout_buffer if s is not None])
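# Usage sketch for align(): counter-align against one genome, then align the
# remaining reads against the target genome in both uniqueness modes. The
# reads object below is a hypothetical stand-in, not the real file-parser
# class; it only carries the attributes align() actually touches.
def _example_align():
    class _Reads(object):  # hypothetical minimal stand-in
        input_file = 'sample.fastq'
        second_file = None
        paired_end = False
    print(align(_Reads(), references=['hg19'], counter_references=['mm9'],
                mismatches='2', num_cpus=4))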
def compare_clone_to_ref(clone, ref, logging_level=20, **kwargs):
    """
    process-safe comparison of a clone Sequence to a reference Sequence

    aligns using clonechecker.align.align_clone_to_ref
    returns the pickled tuple (clone.Name, Alignment)
    or None (if AlignmentError is raised)
    """
    logger = get_logger(logging_level)
    try:
        aln = align_clone_to_ref(clone, ref)
    except AlignmentError:
        return
    if aln.is_match():
        logger.info("match found %s, %s", clone.Name, ref.Name)
    if aln.is_truncated:
        logger.debug("alignment is truncated (%s, %s)", clone.Name, ref.Name)
        if not aln.has_gaps and not aln.has_mismatches:
            logger.info("truncated match found %s, %s", clone.Name, ref.Name)
    if aln.has_gaps:
        logger.debug("alignment has gaps (%s, %s)", clone.Name, ref.Name)
    if aln.has_mismatches:
        logger.debug("alignment has mismatches (%s, %s)",
                     clone.Name, ref.Name)
        if not aln.is_truncated and not aln.has_gaps:
            logger.info("mutated match found %s, %s", clone.Name, ref.Name)
    return dumps((aln.Clone.Name, aln))
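# Usage sketch for compare_clone_to_ref(): collect non-None results and
# unpickle them. Assumes `dumps` above comes from the standard pickle module
# (cPickle output unpickles the same way); the clone/ref objects are whatever
# align_clone_to_ref() expects.
def _example_compare(clone, refs):
    from pickle import loads
    matches = []
    for ref in refs:
        pickled = compare_clone_to_ref(clone, ref)
        if pickled is not None:
            name, aln = loads(pickled)  # (clone.Name, Alignment)
            matches.append((name, aln))
    return matches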
def announce_first(clone, logging_level=20, **kwargs):
    """
    Announce to stderr that we are about to start comparing a clone
    to all available references
    """
    logger = get_logger(logging_level)
    logger.info("Comparing %s to references", clone.Name)
def align_bwa(fp_obj, references=[], unique=True, seed_len='28',
              use_quality=True, logging_level=10, num_threads=1,
              passthru_args=None, **kwargs):
    common_flags = ['-t', str(num_threads), '-M']
    logger = get_logger(logging_level)
    if references is None or references == []:
        logger.critical('Nothing to do')
        return
    stdout_buffer = []
    if fp_obj.format == 'BAM' or fp_obj.format == 'SAM':
        convert_to_fastq(fp_obj, logger=logger)
    if not fp_obj.format == 'FASTQ':
        logger.critical('%s only supports FASTQ files', __file__)
        return
    if passthru_args is not None:
        # restore the leading dashes that were escaped with '+'
        for i in range(len(passthru_args)):
            passthru_args[i] = passthru_args[i].replace('+', '-')
        logger.debug('Passing thru arguments %s', ' '.join(passthru_args))
        kwargs['passthru_args'] = passthru_args
    flags = [item for item in common_flags]
    for ref in references:
        s = align_once(fp_obj, flags, ref, logger=logger, **kwargs)
        stdout_buffer.append(s)
    return '\n'.join([s for s in stdout_buffer if s is not None])
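# Usage sketch for align_bwa(): extra bwa options are spelled with '+' in
# place of the leading '-' so they survive this tool's own option parsing;
# align_bwa() swaps the dashes back, so '+k 19' becomes bwa's '-k 19'.
# The reads object is a hypothetical stand-in.
def _example_align_bwa():
    class _Reads(object):  # hypothetical minimal stand-in
        input_file = 'sample.fastq'
        format = 'FASTQ'
    print(align_bwa(_Reads(), references=['hg19'], num_threads=8,
                    passthru_args=['+k', '19']))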
def splitter(pf, **kwargs):
    logger = get_logger()
    if pf.paired_end:
        return split_paired_files(pf, logger=logger, **kwargs)
    else:
        return split_file(pf, logger=logger, **kwargs)
def action(fp_obj, motif_file=None, motif_type=None, motif_number=1,
           **kwargs):
    logger = get_logger()
    logger.debug("trying to find sites for %s", fp_obj.input_file)
    motif = get_motif(motif_file, motif_number, motif_type)
    stdout_buffer = find_sites(fp_obj.input_file, fp_obj.fasta_file, motif,
                               bed=fp_obj.is_bed, xls=fp_obj.is_xls,
                               output_dir=fp_obj.output_dir,
                               src_fnc=__file__, **kwargs)
    return stdout_buffer
def align2(fp_obj, references=[], counter_references=None, unique=True,
           seed_len='28', use_quality=True, logging_level=10, num_cpus=1,
           passthru_args=None, **kwargs):
    common_flags = ['--time', '-p', str(num_cpus), '-L', seed_len]
    if fp_obj.paired_end:
        common_flags.extend(['-X', '600'])
    logger = get_logger(logging_level)
    if references is None or references == []:
        logger.critical('Nothing to do')
        return
    stdout_buffer = []
    if fp_obj.format == 'BAM' or fp_obj.format == 'SAM':
        convert_to_fastq(fp_obj, logger=logger)
    if not fp_obj.format == 'FASTQ':
        logger.critical('%s only supports FASTQ files', __file__)
        return
    if passthru_args is not None:
        # restore the leading dashes that were escaped with '+'
        for i in range(len(passthru_args)):
            passthru_args[i] = passthru_args[i].replace('+', '-')
        logger.debug('Passing thru arguments %s', ' '.join(passthru_args))
        kwargs['passthru_args'] = passthru_args
    if counter_references is not None:
        # counter-align first
        flags = [item for item in common_flags]
        flags.append('--fast')
        new_filenames = counteralign_once(fp_obj, flags, counter_references,
                                          logger=logger, **kwargs)
        # after counter-alignment, align the surviving reads instead
        fp_obj.input_file = new_filenames[0]
        fp_obj.second_file = new_filenames[1]
    flags = [item for item in common_flags]
    for ref in references:
        s = align_once(fp_obj, flags, ref, logger=logger, **kwargs)
        stdout_buffer.append(s)
    return '\n'.join([s for s in stdout_buffer if s is not None])
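# Usage sketch for align2(), the bowtie2 counterpart of _example_align()
# above: counter-alignment runs with '--fast', and '+-very-sensitive' is
# unescaped to the real bowtie2 preset '--very-sensitive'. The paired-end
# reads object is a hypothetical stand-in.
def _example_align2():
    class _Reads(object):  # hypothetical minimal stand-in
        input_file = 'sample_1.fastq'
        second_file = 'sample_2.fastq'
        paired_end = True
        format = 'FASTQ'
    print(align2(_Reads(), references=['hg19'], counter_references=['mm9'],
                 num_cpus=4, passthru_args=['+-very-sensitive']))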
def run_macs(f, subpeaks=True, path_to_macs=None, logging_level=10,
             user_gsize=None, qvalue=0.01, passthru_args=None, **kwargs):
    """Run MACS on a BAM file
    """
    logger = get_logger(logging_level)
    if path_to_macs is None:
        path_to_macs = path_to_executable("macs2")
    input_file = f.input_file
    control_file = f.control_file
    logger.debug('Processing %s', input_file)
    if control_file is not None:
        logger.debug('with control %s', control_file)
    # determine genome name and size
    if user_gsize:
        genome_size = user_gsize
        try:
            genome_build = guess_bam_genome(input_file)
        except NoMatchFoundError:
            genome_build = None
    else:
        try:
            genome_build = guess_bam_genome(input_file)
        except NoMatchFoundError:
            raise Usage('Could not determine genome / genome size '
                        'for file %s' % input_file)
        gname = ''.join([x for x in genome_build if x.isalpha()])
        if gname == 'hg':
            genome_size = 'hs'
        elif gname in ['mm', 'ce', 'dm']:
            genome_size = gname
        else:
            genome_size = '%.1e' % sum(genome(genome_build).itervalues())
    fmt = decide_format(input_file, control_file, logger)
    name = f.sample_name.replace(' ', '_')
    if passthru_args is not None:
        # restore the leading dashes that were escaped with '+'
        for i in range(len(passthru_args)):
            passthru_args[i] = passthru_args[i].replace('+', '-')
        logger.debug('Passing thru arguments %s', ' '.join(passthru_args))
    macs_options = ['--trackline',
                    '-f', fmt,  # correct file format, BAM or BAMPE
                    '-B', '--SPMR',  # bedGraphs, SPMR-scaled
                    '-g', genome_size,
                    '-q', str(qvalue),  # Popen needs strings, not floats
                    '-n', name,  # run name
                    '-t', join(getcwd(), input_file)]  # treatment
    if control_file is not None:
        macs_options.extend(['-c', join(getcwd(), control_file)])
    if subpeaks:
        macs_options.append('--call-summits')
    if passthru_args is not None:
        macs_options.extend(passthru_args)
    step = [path_to_macs, 'callpeak'] + macs_options
    if platform.system() == 'Windows':
        # on Windows, launch macs2 through the Python interpreter
        step.insert(0, sys.executable)
    macs_stdout = PolledPipe(logger=logger, level=WARN)
    macs_stderr = PolledPipe(logger=logger, level=ERROR)
    logger.debug('Launching %s', ' '.join(step))
    job = Popen(step, stdout=macs_stdout.w, stderr=macs_stderr.w,
                cwd=f.output_dir)
    pollables = [macs_stdout, macs_stderr]
    wait_for_job(job, pollables, logger)
    return '%s\n\n' % ' '.join(step)
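# Usage sketch for run_macs(): builds and launches a command roughly like
#   macs2 callpeak --trackline -f BAM -B --SPMR -g hs -q 0.01 \
#         -n my_sample -t treatment.bam -c input.bam --call-summits
# The sample object is a hypothetical stand-in exposing the attributes
# run_macs() reads (input_file, control_file, sample_name, output_dir).
def _example_run_macs():
    class _Sample(object):  # hypothetical minimal stand-in
        input_file = 'treatment.bam'
        control_file = 'input.bam'
        sample_name = 'my sample'
        output_dir = 'macs_out'
    print(run_macs(_Sample(), subpeaks=True, user_gsize='hs'))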
def split_file(fp_obj, no_gzip=False, barcodes=[], linker='', min_length=4,
               max_length=-1, logger=None, strip_before_barcode=0,
               strip_after_barcode=0, no_clipping=False, **kwargs):
    if logger is None:
        logger = get_logger()
    filename = fp_obj.input_file
    open_func, format_ = discover_file_format(filename)
    if not format_ == 'FASTQ':
        logger.error('Only FASTQ files are supported at this time')
        return
    f = open_func(filename, "rU")
    records = FasterFastqIterator(f)
    barcoded_files = {}
    filenames = []
    output_filename = partial(fp_obj.output_filename, no_gzip=no_gzip)
    if no_gzip:
        open_func = open
    elif PATH_TO_GZIP is not None:
        open_func = gzip_class_factory(PATH_TO_GZIP)
    else:
        open_func = GzipFile
    if barcodes is None:
        barcodes = []
    if len(barcodes) > 0:
        processed_file = None
        for barcode in barcodes:
            fname = output_filename(barcode)
            filenames.append(fname)
            barcoded_files[barcode] = open_func(fname, 'w')
        # and make an unmatched file
        unmatched_filename = output_filename("unmatched")
        filenames.append(unmatched_filename)
        unmatched_file = open_func(unmatched_filename, 'w')
    else:
        barcoded_files = None
        unmatched_file = None
        processed_filename = output_filename("processed", is_barcode=False)
        filenames.append(processed_filename)
        processed_file = open_func(processed_filename, 'w')
    writer_args = {'barcoded_files': barcoded_files,
                   'unmatched_file': unmatched_file,
                   'processed_file': processed_file}
    results = apply_plan(records, writer_args, barcodes=barcodes,
                         linker=linker, min_length=min_length,
                         max_length=max_length,
                         strip_after_barcode=strip_after_barcode,
                         strip_before_barcode=strip_before_barcode,
                         no_clipping=no_clipping, logger=logger)
    linker_only = results['linker']
    too_short = results['short']
    record_count = results['all']
    # close and exit
    f.close()
    if barcoded_files is not None:
        logger.debug('closing barcoded files')
        for f_ in barcoded_files.values():
            f_.close()
    if unmatched_file is not None:
        logger.debug('closing unmatched file')
        unmatched_file.close()
    if processed_file is not None:
        logger.debug('closing output file')
        processed_file.close()
    logger.info('Split %s as %s', fp_obj.input_file, ', '.join(filenames))
    logger.info('Processed %s records', record_count)
    logger.info('%s linker only dimers', linker_only)
    logger.info('%s sequences too short (1-3 bp)', too_short)
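# Usage sketch for split_file(): demultiplex one FASTQ file into per-barcode
# files plus an 'unmatched' file. The stand-in object and its
# output_filename() naming scheme are hypothetical, and 'CTGTAGG' is just an
# example linker sequence.
def _example_split_file():
    class _Reads(object):  # hypothetical minimal stand-in
        input_file = 'lane1.fastq'

        def output_filename(self, label, is_barcode=True, no_gzip=False):
            return 'lane1_%s%s' % (label,
                                   '.fastq' if no_gzip else '.fastq.gz')
    split_file(_Reads(), barcodes=['ACGT', 'TGCA'], linker='CTGTAGG',
               min_length=4)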