def add_multimapping_tally(open_bowtie_file, chunk_size=2048):
    """Count the number of mappings for each read tag and emit it as an NH:i tag.

    ASSUMES: the file is sorted by read tag name.
    Either run on unmodified Bowtie output or re-sort on the first column
    of the SAM-formatted output:
        To preserve the header lines:
            samtools view -SH bowtiefile.sam > sortedbowtiefile.sam
        To append the sorted alignment lines:
            samtools view -S bowtiefile.sam | sort -k1,1 >> sortedbowtiefile.sam
    Note that sorting can consume a lot of resources, so it is best to add
    the multimapping tally BEFORE any other operations are applied to the
    Bowtie output."""
    master_read = ""
    mapping_count = 0
    saved_reads = []
    for alignment in read_chunk(open_bowtie_file, chunk_size):
        columns = alignment.split("\t")
        if columns[0] == master_read:
            mapping_count += 1
            saved_reads.append(alignment)
        else:
            if master_read != "":
                for read in saved_reads:
                    yield "{}\tNH:i:{}".format(read, mapping_count)
            master_read = columns[0]
            saved_reads = [alignment]
            if int(columns[1]) & 0x4:
                # unmapped read according to the SAM 0x4 flag (FLAG column)
                mapping_count = 0
            else:
                # mapped read
                mapping_count = 1
    # flush the final read group, which the loop body never reaches
    if master_read != "":
        for read in saved_reads:
            yield "{}\tNH:i:{}".format(read, mapping_count)
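
# read_chunk is used by every function in this module but is not defined
# here. A minimal sketch consistent with how it is called (an open file plus
# a chunk size, yielding one newline-stripped line per iteration); the real
# implementation may differ:
def read_chunk(open_file, chunk_size):
    while True:
        lines = open_file.readlines(chunk_size)  # read roughly chunk_size bytes
        if not lines:
            break
        for line in lines:
            yield line.rstrip("\n")


# Hedged usage sketch for add_multimapping_tally (filename hypothetical):
# each yielded line is the original alignment with an NH:i:<count> tag
# appended, so a read mapping to three loci comes out as "...\tNH:i:3".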
def create_alignment_db(sam_openfile, library_name, database_prefix):
    """Create alignment SQLite3 databases representing alignment data from Bowtie SAM file.

    Database files:
        {prefix}_tagloci.db: partition on (p) Chromosome,
                             index and partition on (ip) Start(0-based),
                             End (0-based),
                             Strand (+,-,or .),
                             Tag Sequence,
                             Number of Mismatches
        {prefix}_{library}.db: (ip) Tag Sequence,
                                Abundance
        {prefix}_tags.db: (ip) Tag Sequence,
                          Total Mappings,
                          Mappings with 0 mismatches (perfect),
                          Mappings with 1 mismatch,
                          Mappings with 2 mismatches"""
    #TODO create_chromosome_db(sam_openfile, database_prefix)
    tagloci = "{}_tagloci".format(database_prefix)
    library = "{}_{}".format(database_prefix, library_name)
    tags = "{}_tags".format(database_prefix)

    # scan past the header lines; startswith() is safe even if the file
    # contains nothing but headers (readline() then returns "")
    header = sam_openfile.readline()
    while header.startswith("@"):
        header = sam_openfile.readline()

    mismatch_tally = [0, 0, 0]
    # parse initial alignment
    (read_name, maps, mismatches, tag, position, strand) = parse_alignment(header.strip())
    if maps:
        write_data("{}\t{}\t{}\t{}".format(position, tag, mismatches, strand), "{}.data".format(tagloci))
        mismatch_tally[mismatches] += 1
    last_read_name = read_name
    last_tag = tag

    for alignment in read_chunk(sam_openfile, CHUNK):
        (read_name, maps, mismatches, tag, position, strand) = parse_alignment(alignment)
        if maps:  # don't process unmapped reads
            write_data("{}\t{}\t{}\t{}".format(position, tag, mismatches, strand), "{}.data".format(tagloci))
            if read_name == last_read_name:
                mismatch_tally[mismatches] += 1
            else:
                write_data("{}\t1".format(last_tag), "{}.data".format(library))
                # prepare output for tags database
                mismatch_string = "\t".join(str(m) for m in mismatch_tally)
                write_data("{}\t{}\t{}".format(last_tag, sum(mismatch_tally), mismatch_string),
                           "{}.data".format(tags))
                # reset for next round
                last_read_name = read_name
                last_tag = tag
                mismatch_tally = [0, 0, 0]
                mismatch_tally[mismatches] += 1
    # write out last tag
    write_data("{}\t1".format(last_tag), "{}.data".format(library))
    # prepare output for tags database
    mismatch_string = "\t".join(str(m) for m in mismatch_tally)
    write_data("{}\t{}\t{}".format(last_tag, sum(mismatch_tally), mismatch_string),
               "{}.data".format(tags))
def split_by_position(bed_like_file, base_chunk):
    """Split a file into several subfiles by chromosome and start position.
    """
    for line in read_chunk(bed_like_file, CHUNK):
        parts = line.split("\t")
        outfile_name = "{}_{}".format(parts[0], (int(parts[1]) / base_chunk))
        with open(outfile_name, 'a') as outfile:
            outfile.write(line + "\n")
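
# Example: with base_chunk = 1000000, a BED-like line starting
# "chr1\t2345678\t..." is appended to the subfile "chr1_2", because
# 2345678 // 1000000 == 2; lines from the same megabase window of the
# same chromosome therefore collect in the same subfile.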
def extract_5prime_most_base(alignments_source, output_to_stdout, output_filename):
    """Extract the 5'-most base from each alignment."""
    for alignment in read_chunk(alignments_source, CHUNK_SIZE):
        if alignment[0] != "@" and alignment[0] != "#":  # skip any header/comment lines
            try:
                read = Read(alignment)
                if output_to_stdout:
                    print read.print_first_base()
                else:
                    with open(output_filename, 'a') as output:
                        output.write('{}\n'.format(read.print_first_base()))
            except ReadError as _error:
                if _error.name != 'unmapped':  # silently skip unmapped reads only
                    raise ReadError(_error.message, _error.name)
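
# Hedged usage sketch (Read, ReadError, read_chunk and CHUNK_SIZE are
# defined elsewhere in this module; the filename is hypothetical):
#
#     with open("alignments.sam") as sam:
#         extract_5prime_most_base(sam, output_to_stdout=True,
#                                  output_filename=None)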
def create_cluster_files(loci_openfile, library_name, database_prefix):
    """Create cluster and cluster-tag files from merged loci.

    Database files:
    Database files:
        {prefix}_clusters.db: partition on (p) Chromosome,
                              index and partition on (ip) Start (0-based),
                              End (0-based),
                              cluster_name (unique),
                              Number of tags,
                              Strand (+, -, or .)
        {prefix}_clustertags.db: cluster_name,
                                 tag_sequence
                                 (together the two columns are unique)"""
    clusters = "{}_clusters".format(database_prefix)
    clustertags = "{}_clustertags".format(database_prefix)

    cluster_index = 1

    for cluster in read_chunk(loci_openfile, CHUNK):
        parts = cluster.split("\t")
        tags = parts[3].split(";")
        unique_tags = get_unique_tags(tags)
        if len(parts) == 6:
            strand = parts[-1]
        else:
            strand = "."
        with open("{}.data".format(clusters), 'a') as output:
            output.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(parts[0],
                                                           parts[1],
                                                           parts[2],
                                                           "{}_{}".format(library_name, cluster_index),
                                                           len(unique_tags),
                                                           strand))
        with open("{}.data".format(clustertags), 'a') as output:
            for tag in unique_tags:
                output.write("{}\t{}\n".format("{}_{}".format(library_name, cluster_index),
                                               tag))
        cluster_index += 1
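
# get_unique_tags is referenced above but not shown in this module. A
# minimal sketch consistent with its use (collapse the ";"-separated tag
# list to the distinct tag sequences, keeping first-seen order); a hedged
# guess, not the authoritative implementation:
def get_unique_tags(tags):
    seen = set()
    unique = []
    for tag in tags:
        if tag not in seen:
            seen.add(tag)
            unique.append(tag)
    return unique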