def count_exact_matches(input, output, reference, read_length, sample):
    """Match reads to reference exactly.

    Takes the first ``read_length`` bases of each read and attempts to match
    them exactly against the first ``read_length`` bases of each reference
    sequence, then writes the number of matches per reference.

    Args:
        input: Path of the reads file; opened via ``open_maybe_compressed``.
        output: Path of the tab-separated counts table to write.
        reference: Path of the reference file (opened as plain text).
        read_length: Number of leading bases used as the match key.
        sample: Column header for the counts. When falsy, the stem of
            ``input`` (``pathlib.Path(input).stem``) is used instead.

    Notes:
        * If several references share the same ``read_length`` prefix, the
          last one read wins, because the prefix is the dictionary key.
        * A summary (reads seen, reads matched, fraction matched) is printed
          to stderr. With zero reads the fraction is reported as ``0.0``
          rather than raising ``ZeroDivisionError``.
    """
    # Load the reference: truncated sequence -> reference name, keeping the
    # order in which references were read so the output rows follow it.
    seq_to_ref = OrderedDict()
    with open(reference, "r") as ip:
        for (ref_name, seq, _) in readfq(ip):
            seq_to_ref[seq[:read_length]] = ref_name

    num_reads = 0
    num_matched = 0
    counts = Counter()
    with open_maybe_compressed(input, "r") as ip:
        for (_name, seq, _) in tqdm(readfq(ip)):
            num_reads += 1
            refname = seq_to_ref.get(seq[:read_length])
            if refname is not None:
                num_matched += 1
                counts[refname] += 1

    # Guard the ratio: an empty input must not abort before the (all-zero)
    # counts table is written.
    frac_matched = num_matched / num_reads if num_reads else 0.0
    print(
        "num_reads: {}\nnum_matched: {}\nfrac_matched: {}".format(
            num_reads, num_matched, frac_matched
        ),
        file=sys.stderr,
    )

    if not sample:
        sample = pathlib.Path(input).stem

    with open(output, "w") as op:
        print(f"id\t{sample}", file=op)
        # One row per stored reference prefix; Counter yields 0 for
        # references that never matched.
        for refname in seq_to_ref.values():
            print(f"{refname}\t{counts[refname]}", file=op)
def split_fastq(input, output, chunk_size):
    """Split a fastq file into smaller chunks.

    Records are read from ``input`` and written, ``chunk_size`` at a time, to
    files named ``part.1.fastq``, ``part.2.fastq``, ... inside ``output``.

    Args:
        input: Path of the fastq file; opened via ``open_maybe_compressed``.
        output: Directory to create for the chunk files. It is created with
            ``os.makedirs(..., mode=0o755)``, so an already existing
            directory raises ``FileExistsError``.
        chunk_size: Number of records per chunk file. The final chunk may
            hold fewer records.

    Notes:
        An empty input produces no chunk files (previously this raised
        ``NameError`` because the output handle was never bound). The chunk
        currently being written is closed even if reading or writing fails.
    """
    input_file = osp.abspath(input)
    output_dir = osp.abspath(output)
    os.makedirs(output_dir, mode=0o755)

    def output_file(i):
        """Return the path of the i-th chunk file."""
        return pjoin(output_dir, "part.{0}.fastq".format(i))

    op = None  # handle of the chunk being written; None when none is open
    num_processed = 0
    file_num = 1
    try:
        with open_maybe_compressed(input_file, "r") as input_handle:
            for (name, seq, qual) in readfq(input_handle):
                if num_processed == 0:
                    # First record of a chunk: open the next chunk file lazily
                    # so no empty trailing file is ever created.
                    op = open_maybe_compressed(output_file(file_num), "w")
                print(f"@{name}\n{seq}\n+\n{qual}", file=op)
                num_processed += 1
                if num_processed == chunk_size:
                    op.close()
                    op = None
                    num_processed = 0
                    file_num += 1
    finally:
        # Close a partially filled (or interrupted) chunk.
        if op is not None:
            op.close()
def truncate_fasta(input, output, length):
    """Write a copy of a fasta file with each sequence cut to ``length`` bases.

    Every record yielded by ``readfq`` for ``input`` is emitted to ``output``
    as a ``>name`` header line followed by the first ``length`` characters of
    its sequence on a single line. The third field of each record is ignored.
    """
    with open(input, "r") as source:
        with open(output, "w") as sink:
            for header, bases, _unused in readfq(source):
                clipped = bases[:length]
                print(f">{header}\n{clipped}", file=sink)