def extract(infile, sample):
    ''' Write filter metrics from processed SAM records as TSV. '''
    log = pct.create_logger()
    if not sample:
        sample = 'stdin' if infile == '-' else infile
    with pct.open(infile) as f:
        sys.stdout.write(
            'sample\torientation\tinteraction_type\tditag_length\t'
            'insert_size\tfragment_separation\n')
        for line in f:
            if line.startswith('@'):
                continue
            try:
                read1 = pct.Sam(line)
                # Consume read1's mate; the metrics are pair-level, so only
                # read1's tags are reported.
                read2 = pct.Sam(next(f))
            except StopIteration:
                log.exception('Odd number of alignments in file')
                sys.exit(1)
            sys.stdout.write(
                f'{sample}\t{read1.optional["or:Z"]}\t'
                f'{read1.optional["it:Z"]}\t{read1.optional["dt:i"]}\t'
                f'{read1.optional["is:i"]}\t{read1.optional["fs:i"]}\n')

def process(infile, digest):
    log = pct.create_logger()
    with pct.open(infile) as in_obj, pct.open(digest) as digest_obj:
        d = process_digest(digest_obj)
        for line in in_obj:
            if line.startswith('@'):
                sys.stdout.write(line)
                continue
            try:
                read1 = pct.Sam(line)
                read2 = pct.Sam(next(in_obj))
            except StopIteration:
                log.exception('Odd number of alignments in file')
                sys.exit(1)
            filter_stats = run_filter(read1, read2, d)
            # Pair-level metrics are written identically to both mates.
            # ('fragment_seperation' matches the key produced by run_filter.)
            for tag, key in (('or:Z', 'orientation'),
                             ('it:Z', 'interaction'),
                             ('dt:i', 'ditag_length'),
                             ('is:i', 'insert_size'),
                             ('fs:i', 'fragment_seperation')):
                read1.optional[tag] = filter_stats[key]
                read2.optional[tag] = filter_stats[key]
            read1.optional['fn:i'] = filter_stats['read1_fragment']
            read2.optional['fn:i'] = filter_stats['read2_fragment']
            sys.stdout.write(read1.get_record())
            sys.stdout.write(read2.get_record())

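# The custom tags written above follow the SAM optional-field convention
# (TAG:TYPE:VALUE, where type Z is a string and i is a signed integer), so a
# processed record gains fields such as:
#   or:Z:Inward  it:Z:cis  dt:i:220  is:i:150  fs:i:3  fn:i:12
# The exact rendering depends on pct.Sam.get_record(), which is not shown here.
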
def find_cut_sites(ref_seq, ref, restriction):
    log = pct.create_logger()
    if not ref_seq:
        log.error(f'Reference {ref} contains no sequence.')
        ec = 1
    elif invalid_seq(ref_seq):
        log.error(f'Invalid FASTA character in {ref}.')
        ec = 1
    else:
        overhang = restriction.index('^')
        site = restriction.replace('^', '')
        matches = re.finditer(site, ref_seq)
        index = 0
        previous_end = 0
        for match in matches:
            # Skip if restriction sequence at start of reference.
            if match.start() == 0:
                continue
            index += 1
            start = 1 if index == 1 else previous_end + 1
            end = match.start() + overhang
            sys.stdout.write(f'{ref}\t{start}\t{end}\t{index}\n')
            previous_end = end
        # The final fragment runs to the end of the reference.
        sys.stdout.write(
            f'{ref}\t{previous_end + 1}\t{len(ref_seq)}\t{index + 1}\n')
        ec = 0
    return ec

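# A worked sketch of the coordinate logic above (illustrative only): for a
# hypothetical reference 'chr1' = 'NNNAAGCTTNNNAAGCTTNNN' (21 bp) and HindIII
# given as 'A^AGCTT' (overhang = 1, site = 'AAGCTT'), the site matches at
# 0-based positions 3 and 12, so find_cut_sites would emit 1-based fragments:
#   chr1    1     4     1
#   chr1    5     13    2
#   chr1    14    21    3
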
def duplicates(in_fastq: str = '-', out_fastq: str = '-',
               buffer_size: float = 0.1, parallel: int = 1):
    ''' Check FASTQ duplication level. '''
    log = pct.create_logger()
    with contextlib.ExitStack() as cm:
        out_fobj = cm.enter_context(pct.open_smart(out_fastq, mode='w'))
        in_fobj = cm.enter_context(pct.open_smart(in_fastq))
        try:
            # Extract the sequence line (line 2 of every 4-line record).
            p1 = cm.enter_context(
                subprocess.Popen(['awk', 'NR%4 == 2'],
                                 stdin=in_fobj, stdout=subprocess.PIPE))
            # Sort the sequences so identical reads become adjacent.
            p2 = cm.enter_context(
                subprocess.Popen(
                    ['sort', '--parallel', f'{parallel}',
                     '--buffer-size', f'{int(buffer_size * 100)}%'],
                    stdin=p1.stdout, stdout=subprocess.PIPE, encoding='utf8'))
            p1.stdout.close()
        except FileNotFoundError as e:
            log.exception(f'{e}\nPlease install awk and sort.')
            sys.exit(1)
        duplicate_counts = defaultdict(int)
        copies = 0
        previous_line = None
        for index, line in enumerate(p2.stdout):
            line = line.rstrip('\n')
            if index == 0:
                copies = 1
            elif line == previous_line:
                copies += 1
            else:
                duplicate_counts[copies] += 1
                copies = 1
            previous_line = line
        if copies:  # Record the final run; also guards against empty input.
            duplicate_counts[copies] += 1
        for duplication_level, nsequences in sorted(duplicate_counts.items()):
            out_fobj.write(f'{duplication_level}\t{nsequences}\n')
        exit_codes = [p.wait() for p in (p1, p2)]
        log.debug(f'Exit codes for p1, p2: {exit_codes}.')
        if not all(ec == 0 for ec in exit_codes):
            log.error('A sub-process returned a non-zero exit code.')

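# For reference, the pipeline above is roughly equivalent to this shell
# sketch (assuming a plain, uncompressed FASTQ):
#   awk 'NR%4 == 2' reads.fastq | sort | uniq -c \
#       | awk '{print $1}' | sort -n | uniq -c
# i.e. it counts, for each duplication level, how many distinct sequences
# occur that many times.
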
def is_valid(read1, read2):
    log = pct.create_logger()
    if read1.qname != read2.qname:
        log.error(f'Qname mismatch: {read1.qname} {read2.qname}. '
                  'Is file name sorted?')
    elif not read1.is_paired or not read2.is_paired:
        log.error(f'{read1.qname} is not paired')
    elif read1.is_read1 == read2.is_read1:
        log.error(f'R1 and R2 flags in {read1.qname} not correctly set')
    elif read1.pnext != read2.left_pos or read2.pnext != read1.left_pos:
        log.error(f'Mate position mismatch in {read1.qname}.')
    else:
        return True
    return False

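# The is_paired/is_read1 properties presumably decode SAM FLAG bits; per the
# SAM specification the relevant masks are:
#   flag & 0x1    # template has multiple segments (paired)
#   flag & 0x40   # first segment in template (read 1)
#   flag & 0x80   # last segment in template (read 2)
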
def truncate(infile, qc, sample, restriction):
    ''' Truncate reads at the Hi-C ligation junction. '''
    log = pct.create_logger()
    ligation_seq, restriction_seq = process_restriction(restriction)
    total = 0
    truncated = 0
    truncated_length = 0
    if not sample:
        sample = infile
    with pct.open(infile) as in_obj:
        is_truncated = False
        for index, line in enumerate(in_obj):
            line = line.rstrip('\n')
            # Sequence line
            if index % 4 == 1:
                total += 1
                line = line.upper()
                if ligation_seq in line:
                    # Truncate at the ligation junction and restore the
                    # cut-site sequence.
                    line = line[0:line.index(ligation_seq)] + restriction_seq
                    is_truncated = True
                seq_length = len(line)
            # Quality line - clip to match the (possibly truncated) sequence.
            elif index % 4 == 3:
                line = line[0:seq_length]
                if is_truncated:
                    truncated += 1
                    truncated_length += seq_length
                    is_truncated = False
            sys.stdout.write(f'{line}\n')
    try:
        mean_truncated_length = truncated_length / truncated
    except ZeroDivisionError:
        mean_truncated_length = 'na'
    with pct.open(qc, stderr=True, mode='w') as qc_out:
        qc_out.write(
            f'{sample}\tTotal\t{total}\n'
            f'{sample}\tTruncated\t{truncated}\n'
            f'{sample}\tNot truncated\t{total - truncated}\n'
            f'{sample}\tMean truncated length\t{mean_truncated_length}\n')

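# Illustrative only: for HindIII ('A^AGCTT') a Hi-C fill-in ligation junction
# is 'AAGCTAGCTT'. Assuming process_restriction returns that junction together
# with a cut-site remnant (its exact return values are not shown here), a read
# containing the junction,
#   ACGTAAGCTAGCTTGGGG
# has line.index(ligation_seq) == 4, so it is truncated to 'ACGT' plus
# restriction_seq, and its quality string is clipped to the same length on
# the following line of the record.
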
def digest(infile, restriction): ''' Iterate through each infile. ''' log = pct.create_logger() with pct.open(infile) as in_obj: header = 1 for index, line in enumerate(in_obj): if line.startswith('>'): if header > 1: find_cut_sites(''.join(seqs), ref, restriction) ref = line.rsplit()[0][1:] log.info(f'Digesting reference {ref}.') header += 1 seqs = [] elif header == 1 and index == 0: log.error(f'FASTA line 1 does not begin with ">".') sys.exit(1) else: seqs.append(line.upper().strip('\n')) find_cut_sites(''.join(seqs), ref, restriction)
    Parameters
    ----------
    in_file : str, optional
        Path to input file.
    out_file : str, optional
        Path to output file.
    nsamples : int, optional
        Number of records to sample from input.
    blocksize : int, optional
        Number of consecutive lines per record.
    seed : float, optional
        Seed for random number generator.
    """
    log = pct.create_logger()
    log.info(
        f'Sampling {nsamples} records in blocks of {blocksize} from {in_file}')
    random.seed(seed)
    with contextlib.ExitStack() as cm:
        out_fobj = cm.enter_context(pct.open_smart(out_file, mode='w'))
        in_fobj = cm.enter_context(pct.open_smart(in_file))
        log.info(f'Writing output to {out_file}')
        sample = reservoir_sample(in_fobj, nsamples, blocksize)
        for line in sample:
            out_fobj.write(f'{line}\n')


def reservoir_sample(in_fobj: typing.TextIO, nsamples: int = 100_000,
def filter(infile, qc, sample, min_inward, min_outward, min_ditag, max_ditag):
    ''' Filter processed read pairs on ditag and insert-size thresholds. '''
    log = pct.create_logger()
    inputs = [min_inward, min_outward, max_ditag, min_ditag]
    if all(i is None for i in inputs):
        log.error('No filter settings defined.')
        sys.exit(1)
    if not sample:
        sample = 'stdin' if infile == '-' else infile
    with pct.open(infile) as in_obj:
        total = 0
        retained = 0
        invalid = 0
        above_ditag = 0
        below_ditag = 0
        same_fragment = 0
        below_min_inward = 0
        below_min_outward = 0
        for line in in_obj:
            if line.startswith('@'):
                sys.stdout.write(line)
                continue
            try:
                read1 = pct.Sam(line)
                read2 = pct.Sam(next(in_obj))
                total += 1
            except StopIteration:
                log.exception('Odd number of alignments in file.')
                sys.exit(1)
            if max_ditag is not None and read1.optional['dt:i'] > max_ditag:
                above_ditag += 1
                continue
            if min_ditag is not None and read1.optional['dt:i'] < min_ditag:
                below_ditag += 1
                continue
            # Insert-size filters only apply to cis (same-chromosome) pairs.
            if read1.optional['it:Z'] == 'cis':
                if read1.optional['fs:i'] == 0:
                    same_fragment += 1
                    continue
                if read1.optional['or:Z'] == 'Inward':
                    if (min_inward is not None
                            and read1.optional['is:i'] < min_inward):
                        below_min_inward += 1
                        continue
                elif read1.optional['or:Z'] == 'Outward':
                    if (min_outward is not None
                            and read1.optional['is:i'] < min_outward):
                        below_min_outward += 1
                        continue
            retained += 1
            sys.stdout.write(read1.get_record())
            sys.stdout.write(read2.get_record())
    with pct.open(qc, stderr=True, mode='w') as qc_out:
        qc_out.write(
            f'{sample}\tTotal\t{total}\n'
            f'{sample}\tRetained\t{retained}\n'
            f'{sample}\tFiltered\t{total - retained}\n'
            f'{sample}\tInvalid\t{invalid}\n'
            f'{sample}\tDitag < {min_ditag}bp\t{below_ditag}\n'
            f'{sample}\tDitag > {max_ditag}bp\t{above_ditag}\n'
            f'{sample}\tSame fragment\t{same_fragment}\n'
            f'{sample}\tInward insert < {min_inward}bp\t{below_min_inward}\n'
            f'{sample}\tOutward insert < {min_outward}bp\t{below_min_outward}\n')