Beispiel #1
0
def count_exact_matches(input, output, reference, read_length, sample):
    """Count reads whose leading bases exactly match a reference sequence.

    Each reference sequence is truncated to its first ``read_length`` bases
    and used as a lookup key. The first ``read_length`` bases of every read
    are then looked up, and a hit increments the count for that reference.

    Summary statistics (total reads, matched reads, matched fraction) are
    printed to stderr. A tab-separated table is written to ``output``: a
    header line ``id<TAB><sample>`` followed by one ``<reference><TAB><count>``
    line per reference key, in the order the references were read.

    Args:
        input: Path to the reads file, opened via ``open_maybe_compressed``.
        output: Path of the tab-separated count table to write.
        reference: Path to the reference sequence file (opened as plain text).
        read_length: Number of leading bases used for matching.
        sample: Column label for the counts; when falsy, the stem of the
            ``input`` path is used instead.
    """
    # Map truncated reference sequence -> reference name.
    # NOTE: references sharing the same first `read_length` bases collide;
    # the later record silently replaces the earlier one.
    seq_to_ref = OrderedDict()
    with open(reference, "r") as ip:
        for (ref_name, seq, _) in readfq(ip):
            seq_to_ref[seq[:read_length]] = ref_name

    num_reads = 0
    num_matched = 0
    counts = Counter()
    with open_maybe_compressed(input, "r") as ip:
        for (_, seq, _) in tqdm(readfq(ip)):
            num_reads += 1
            refname = seq_to_ref.get(seq[:read_length])
            if refname is not None:
                num_matched += 1
                counts[refname] += 1

    # An empty input would otherwise raise ZeroDivisionError and abort before
    # the (all-zero) count table is written; report a fraction of 0.0 instead.
    frac_matched = num_matched / num_reads if num_reads else 0.0
    print(
        "num_reads: {}\nnum_matched: {}\nfrac_matched: {}".format(
            num_reads, num_matched, frac_matched
        ),
        file=sys.stderr,
    )

    if not sample:
        sample = pathlib.Path(input).stem

    with open(output, "w") as op:
        print(f"id\t{sample}", file=op)
        # Counter returns 0 for references that never matched, so every
        # reference appears in the table.
        for refname in seq_to_ref.values():
            print(f"{refname}\t{counts[refname]}", file=op)
Beispiel #2
0
def split_fastq(input, output, chunk_size):
    """Split a fastq file into chunk files of at most ``chunk_size`` records.

    Records are read from ``input`` and written, in order, to files named
    ``part.<i>.fastq`` (``i`` starting at 1) inside the ``output`` directory.
    A new chunk file is only created once there is a record to put in it, so
    an empty input produces an empty output directory.

    Args:
        input: Path to the fastq file, opened via ``open_maybe_compressed``.
        output: Directory to create for the chunk files.
        chunk_size: Number of records per chunk file.

    Raises:
        FileExistsError: If the ``output`` directory already exists
            (``os.makedirs`` is called without ``exist_ok``).
    """
    input_file = osp.abspath(input)
    output_dir = osp.abspath(output)
    os.makedirs(output_dir, mode=0o755)

    # `op` is the currently open chunk file, or None when no chunk is open.
    # Initialising it here avoids an unbound-name error on empty input.
    op = None
    num_in_chunk = 0
    file_num = 1
    with open_maybe_compressed(input_file, "r") as input_handle:
        try:
            for (name, seq, qual) in readfq(input_handle):
                if op is None:
                    chunk_path = pjoin(
                        output_dir, "part.{0}.fastq".format(file_num)
                    )
                    op = open_maybe_compressed(chunk_path, "w")
                print(f"@{name}\n{seq}\n+\n{qual}", file=op)
                num_in_chunk += 1
                if num_in_chunk == chunk_size:
                    # Chunk is full: close it and start a new one lazily on
                    # the next record.
                    op.close()
                    op = None
                    num_in_chunk = 0
                    file_num += 1
        finally:
            # Close a partially filled last chunk, also when reading or
            # writing failed part-way through.
            if op is not None:
                op.close()
Beispiel #3
0
def truncate_fasta(input, output, length):
    """Write a fasta file with every sequence cut to its first ``length`` bases.

    Records are read from ``input`` with ``readfq``; for each one a
    ``>name`` header line and the truncated sequence line are written to
    ``output``. The third field of each record is ignored.
    """
    with open(input, "r") as src:
        with open(output, "w") as dst:
            for record in readfq(src):
                header, bases, _unused = record
                trimmed = bases[:length]
                dst.write(f">{header}\n{trimmed}\n")