def split_paired_files(fp_obj, no_gzip=False,
                       barcodes=None, linker='',
                       min_length=4, max_length=-1,
                       strip_before_barcode=0, strip_after_barcode=0,
                       logger=None, no_clipping=False,
                       **kwargs):
    """Split a pair of FASTQ files into per-category output file pairs.

    Both inputs are opened with the opener returned by
    ``discover_file_format`` and wrapped in ``FasterFastqIterator``.  The
    record streams and the open output handles are then handed to
    ``apply_plan_pe``, and its counters are logged.

    With a non-empty ``barcodes`` list, one output pair is opened per
    barcode plus an "unmatched" pair and a "mismatched" pair.  Otherwise a
    single "orphaned" pair and a single "processed" pair are opened.

    Args:
        fp_obj: object providing ``input_file``, ``second_file``,
            ``output_filename`` and ``output_filename2``.
        no_gzip: if true, outputs are opened with the builtin ``open``;
            otherwise with ``gzip_class_factory(PATH_TO_GZIP)`` when
            ``PATH_TO_GZIP`` is not None, else ``GzipFile``.  Also forwarded
            to the ``fp_obj`` filename builders.
        barcodes: list of barcodes; ``None`` is treated as an empty list.
        linker, min_length, max_length, strip_before_barcode,
        strip_after_barcode, no_clipping: forwarded unchanged to
            ``apply_plan_pe``.
        logger: logger to report to; ``get_logger()`` is used when None.
        **kwargs: accepted and ignored.

    Returns:
        None.  If either input is not detected as FASTQ, an error is logged
        and the function returns before opening any file.
    """
    if logger is None:
        logger = get_logger()

    filename = fp_obj.input_file
    filename2 = fp_obj.second_file
    open_func, format1 = discover_file_format(filename)
    open_func2, format2 = discover_file_format(filename2)
    if not (format1 == 'FASTQ' and format2 == 'FASTQ'):
        logger.error('Only FASTQ files are supported at this time')
        return

    # NOTE(review): the input handles f/f2 are left open, as in the original
    # (its close call was commented out) -- confirm whether they can be
    # closed once apply_plan_pe has returned.
    f = open_func(filename, "rU")
    f2 = open_func2(filename2, "rU")
    records = FasterFastqIterator(f)
    records2 = FasterFastqIterator(f2)

    # From here on open_func refers to the *output* opener.
    if no_gzip:
        open_func = open
    elif PATH_TO_GZIP is not None:
        open_func = gzip_class_factory(PATH_TO_GZIP)
    else:
        open_func = GzipFile

    output_filename = partial(fp_obj.output_filename, no_gzip=no_gzip)
    output_filename2 = partial(fp_obj.output_filename2, no_gzip=no_gzip)

    if barcodes is None:
        barcodes = []

    filenames = []
    opened = []  # every output handle, so all of them get closed below

    def _open_pair(label, **name_kwargs):
        """Open and register the read-1/read-2 output files for *label*."""
        name1 = output_filename(label, **name_kwargs)
        name2 = output_filename2(label, **name_kwargs)
        filenames.extend((name1, name2))
        handle1 = open_func(name1, 'w')
        opened.append(handle1)
        handle2 = open_func(name2, 'w')
        opened.append(handle2)
        return (handle1, handle2)

    try:
        if len(barcodes) > 0:
            processed_files = None
            orphaned_read_files = None
            barcoded_file_pairs = {}
            for barcode in barcodes:
                barcoded_file_pairs[barcode] = _open_pair(barcode)
            unmatched_files = _open_pair("unmatched")
            mismatched_files = _open_pair("mismatched")
        else:
            barcoded_file_pairs = None
            unmatched_files = None
            mismatched_files = None
            # NOTE(review): split_file in this file passes `is_barcode=`
            # where this function passes `is_barcoded=`; confirm which
            # keyword the fp_obj filename builders actually accept.
            orphaned_read_files = _open_pair("orphaned", is_barcoded=False)
            processed_files = _open_pair("processed", is_barcoded=False)

        writer_args = {'barcoded_file_pairs': barcoded_file_pairs,
                       'unmatched_files': unmatched_files,
                       'mismatched_files': mismatched_files,
                       'processed_files': processed_files,
                       'orphaned_read_files': orphaned_read_files,
                       'linker': linker,
                       'min_length': min_length}
        results = apply_plan_pe(records, records2, writer_args,
                                barcodes=barcodes, linker=linker,
                                min_length=min_length,
                                max_length=max_length,
                                strip_after_barcode=strip_after_barcode,
                                strip_before_barcode=strip_before_barcode,
                                no_clipping=no_clipping,
                                logger=logger)
    finally:
        # Close whatever was opened, in both modes and also on failure.
        for handle in opened:
            handle.close()

    linker_only = results['linker']
    too_short = results['short']
    record_count = results['all']

    logger.info('Split %s, %s as %s', fp_obj.input_file, fp_obj.second_file,
                ', '.join(filenames))
    logger.info('Processed %s records', record_count)
    logger.info('%s linker only dimers', linker_only)
    # NOTE(review): the "(1-3 bp)" wording presumably reflects the default
    # min_length=4 -- confirm it is still right for other values.
    logger.info('%s sequences too short (1-3 bp)', too_short)
def split_file(fp_obj, no_gzip=False,
               barcodes=None, linker='', min_length=4,
               max_length=-1, logger=None,
               strip_before_barcode=0,
               strip_after_barcode=0,
               no_clipping=False,
               **kwargs):
    """Split a single FASTQ file into per-category output files.

    The input is opened with the opener returned by
    ``discover_file_format`` and wrapped in ``FasterFastqIterator``.  The
    record stream and the open output handles are then handed to
    ``apply_plan``, and its counters are logged.

    With a non-empty ``barcodes`` list, one output file is opened per
    barcode plus an "unmatched" file.  Otherwise a single "processed" file
    is opened.

    Args:
        fp_obj: object providing ``input_file`` and ``output_filename``.
        no_gzip: if true, outputs are opened with the builtin ``open``;
            otherwise with ``gzip_class_factory(PATH_TO_GZIP)`` when
            ``PATH_TO_GZIP`` is not None, else ``GzipFile``.  Also forwarded
            to ``fp_obj.output_filename``.
        barcodes: list of barcodes; ``None`` (the default) is treated as an
            empty list.
        linker, min_length, max_length, strip_before_barcode,
        strip_after_barcode, no_clipping: forwarded unchanged to
            ``apply_plan``.
        logger: logger to report to; ``get_logger()`` is used when None.
        **kwargs: accepted and ignored.

    Returns:
        None.  If the input is not detected as FASTQ, an error is logged
        and the function returns before opening any file.
    """
    if logger is None:
        logger = get_logger()

    filename = fp_obj.input_file
    open_func, format_ = discover_file_format(filename)
    if not format_ == 'FASTQ':
        logger.error('Only FASTQ files are supported at this time')
        return

    # NOTE(review): the input handle is left open, as in the original (its
    # close call was commented out) -- confirm whether it can be closed
    # once apply_plan has returned.
    f = open_func(filename, "rU")
    records = FasterFastqIterator(f)

    output_filename = partial(fp_obj.output_filename, no_gzip=no_gzip)

    # From here on open_func refers to the *output* opener.
    if no_gzip:
        open_func = open
    elif PATH_TO_GZIP is not None:
        open_func = gzip_class_factory(PATH_TO_GZIP)
    else:
        open_func = GzipFile

    if barcodes is None:
        barcodes = []

    filenames = []
    barcoded_files = None
    unmatched_file = None
    processed_file = None

    try:
        if len(barcodes) > 0:
            barcoded_files = {}
            for barcode in barcodes:
                fname = output_filename(barcode)
                filenames.append(fname)
                barcoded_files[barcode] = open_func(fname, 'w')
            # Reads that match none of the barcodes get their own file.
            unmatched_filename = output_filename("unmatched")
            filenames.append(unmatched_filename)
            unmatched_file = open_func(unmatched_filename, 'w')
        else:
            # NOTE(review): split_paired_files in this file passes
            # `is_barcoded=` where this function passes `is_barcode=`;
            # confirm which keyword fp_obj.output_filename accepts.
            processed_filename = output_filename("processed",
                                                 is_barcode=False)
            filenames.append(processed_filename)
            processed_file = open_func(processed_filename, 'w')

        writer_args = {'barcoded_files': barcoded_files,
                       'unmatched_file': unmatched_file,
                       'processed_file': processed_file}
        results = apply_plan(records, writer_args,
                             barcodes=barcodes, linker=linker,
                             min_length=min_length,
                             max_length=max_length,
                             strip_after_barcode=strip_after_barcode,
                             strip_before_barcode=strip_before_barcode,
                             no_clipping=no_clipping,
                             logger=logger)
    finally:
        # Close whatever was opened, also when opening or processing fails.
        if barcoded_files is not None:
            logger.debug('closing barcoded files')
            for f_ in barcoded_files.values():
                f_.close()
        if unmatched_file is not None:
            logger.debug('closing unmatched file')
            unmatched_file.close()
        if processed_file is not None:
            logger.debug('closing output file')
            processed_file.close()

    linker_only = results['linker']
    too_short = results['short']
    record_count = results['all']

    logger.info('Split %s as %s ', fp_obj.input_file, ', '.join(filenames))
    logger.info('Processed %s records', record_count)
    logger.info('%s linker only dimers', linker_only)
    # NOTE(review): the "(1-3 bp)" wording presumably reflects the default
    # min_length=4 -- confirm it is still right for other values.
    logger.info('%s sequences too short (1-3 bp)', too_short)