def add_multimapping_tally(open_bowtie_file, chunk_size=2048):
    """Yield every alignment line with an ``NH:i:<count>`` field appended.

    Consecutive lines that share the same first column (the read tag name)
    are buffered as one group; when the group ends, each buffered line is
    yielded with a tab and ``NH:i:<n>`` appended, where <n> is the number of
    mappings counted for that read (0 when the first line of the group has
    the SAM 0x4 "unmapped" flag set in its second column).

    ASSUMES: file is sorted by read tag name.
    Either run on unmodified bowtie output or re-sort on the first column of
    the SAM formatted output:
        To preserve the header lines:
            samtools view -SH bowtiefile.sam > sortedbowtiefile.sam
        To add sorted alignment lines:
            samtools view -S bowtiefile.sam | sort -k1,1 >> sortedbowtiefile.sam

    Note that sorting may take a lot of resource to do, that's why it is best
    to add the multimapping tally BEFORE any other operations are done to the
    Bowtie output.

    NOTE(review): lines are not checked for being SAM header lines here; any
    line that starts a new group must have an integer second column or
    ``int()`` raises ValueError. Confirm whether read_chunk filters headers.

    Arguments:
        open_bowtie_file: source handed to read_chunk.
        chunk_size: chunk size handed to read_chunk.

    Yields:
        str: the alignment line followed by "\\tNH:i:<count>".
    """
    master_read = ""
    mapping_count = 0
    saved_reads = []
    for alignment in read_chunk(open_bowtie_file, chunk_size):
        columns = alignment.split("\t")
        if columns[0] == master_read:
            # another alignment of the read currently being tallied
            mapping_count += 1
            saved_reads.append(alignment)
            continue

        # a new read name starts: emit the finished group first
        if master_read != "":
            for read in saved_reads:
                yield "{}\tNH:i:{}".format(read, mapping_count)
        master_read = columns[0]
        saved_reads = [alignment]
        if (int(columns[1]) & 0x4) == 0x4:
            # unmapped read according to SAM 0x4 flag in second column
            mapping_count = 0
        else:
            # mapped read
            mapping_count = 1

    # Emit the last group: without this the alignments of the final read in
    # the file would be buffered but never yielded.
    if master_read != "":
        for read in saved_reads:
            yield "{}\tNH:i:{}".format(read, mapping_count)
def create_alignment_db(sam_openfile, library_name, database_prefix):
    """Write the alignment data files derived from a Bowtie SAM file.

    Rows are appended through write_data to three tab-separated files:

    {prefix}_tagloci.data: one row per mapped alignment --
        position, tag sequence, number of mismatches, strand
        (all four values as returned by parse_alignment)
    {prefix}_{library}.data: tag sequence, 1 -- one row per run of
        consecutive mapped alignments that share a read name
    {prefix}_tags.data: tag sequence, total mappings, mappings with
        0 mismatches (perfect), with 1 mismatch, with 2 mismatches --
        written together with the library row

    NOTE(review): the previous docstring described these outputs as SQLite3
    databases ({prefix}_*.db); the loading step is not part of this function
    -- confirm where it happens.

    ASSUMES: all alignments of one read are on consecutive lines, and
    parse_alignment reports a mismatch count of 0, 1 or 2 (it is used as an
    index into a three-element tally).

    Leading SAM header lines (starting with "@") are skipped. An empty or
    header-only input writes nothing. Unmapped reads are not written, except
    that the very first alignment always seeds the "last tag" state, so an
    unmapped first read is still written out with an all-zero tally.

    Arguments:
        sam_openfile: open SAM file object (must support readline()).
        library_name: used in the name of the library data file.
        database_prefix: prefix for all three data file names.
    """
    #TODO create_chromosome_db(sam_openfile, database_prefix)
    tagloci_data = "{}_tagloci.data".format(database_prefix)
    library_data = "{}_{}.data".format(database_prefix, library_name)
    tags_data = "{}_tags.data".format(database_prefix)

    def write_locus(position, tag, mismatches, strand):
        # one mapped alignment -> one row in the tag loci file
        write_data("{}\t{}\t{}\t{}".format(position, tag, mismatches, strand),
                   tagloci_data)

    def write_tag_summary(tag, tally):
        # finished read -> one row in the library file, one in the tags file
        write_data("{}\t1".format(tag), library_data)
        mismatch_string = "\t".join(str(m) for m in tally)
        write_data("{}\t{}\t{}".format(tag, sum(tally), mismatch_string),
                   tags_data)

    # scan through header lines; readline() returns "" at end of file, which
    # startswith() handles without the IndexError that indexing [0] raised
    first_line = sam_openfile.readline()
    while first_line.startswith("@"):
        first_line = sam_openfile.readline()
    if not first_line:
        # empty or header-only input: there is nothing to tally
        return

    mismatch_tally = [0, 0, 0]

    # parse initial alignment
    (read_name, maps, mismatches, tag, position, strand) = parse_alignment(
        first_line.strip())
    if maps:
        write_locus(position, tag, mismatches, strand)
        mismatch_tally[mismatches] += 1
    last_read_name = read_name
    last_tag = tag

    for alignment in read_chunk(sam_openfile, CHUNK):
        (read_name, maps, mismatches, tag, position, strand) = parse_alignment(
            alignment)
        if not maps:
            # don't process unmapped reads
            continue
        write_locus(position, tag, mismatches, strand)
        if read_name != last_read_name:
            # previous read is complete: write it out and reset for next round
            write_tag_summary(last_tag, mismatch_tally)
            last_read_name = read_name
            last_tag = tag
            mismatch_tally = [0, 0, 0]
        mismatch_tally[mismatches] += 1

    # write out last tag
    write_tag_summary(last_tag, mismatch_tally)
def split_by_position(bed_like_file, base_chunk):
    """Split a file into several subfiles by chromosome and start position.

    Every tab-separated line from read_chunk is appended (with a trailing
    newline) to a file named ``<column 0>_<bin>``, where
    ``bin = int(column 1) // base_chunk``.

    Arguments:
        bed_like_file: source handed to read_chunk; each line needs at least
            two tab-separated columns, the second one an integer.
        base_chunk: width of one position bin.

    Raises:
        IndexError: a line has fewer than two columns.
        ValueError: the second column is not an integer.
        ZeroDivisionError: base_chunk is 0.
    """
    for line in read_chunk(bed_like_file, CHUNK):
        parts = line.split("\t")
        # Explicit floor division: plain "/" only produced an integer bin
        # under Python 2 integer semantics; with true division the file name
        # would end in a float such as "chr1_0.5".
        position_bin = int(parts[1]) // base_chunk
        outfile_name = "{}_{}".format(parts[0], position_bin)
        with open(outfile_name, 'a') as outfile:
            outfile.write(line + "\n")
def extract_5prime_most_base(alignments_source, output_to_stdout,
                             output_filename):
    """Extract the 5'-most base from each alignment.

    Each line from read_chunk that is not empty and does not start with "@"
    or "#" is wrapped in a Read, and the result of its print_first_base()
    is either printed or appended as one line to output_filename.

    Arguments:
        alignments_source: source handed to read_chunk.
        output_to_stdout: when true, results are printed; otherwise they are
            appended to output_filename.
        output_filename: path of the file to append to (only opened when
            there is something to write and output_to_stdout is false).

    Raises:
        ReadError: re-raised unchanged for any read error whose ``name`` is
            not 'unmapped'.
    """
    output = None
    try:
        for alignment in read_chunk(alignments_source, CHUNK_SIZE):
            # skip empty lines and any header/comment lines
            if not alignment or alignment.startswith(("@", "#")):
                continue
            try:
                first_base = Read(alignment).print_first_base()
            except ReadError as _error:
                if _error.name != 'unmapped':
                    # bare raise keeps the original exception and traceback
                    raise
                # silently skip unmapped reads only
                continue
            if output_to_stdout:
                print(first_base)
            else:
                if output is None:
                    # opened once, on first use, instead of once per line
                    output = open(output_filename, 'a')
                output.write('{}\n'.format(first_base))
    finally:
        if output is not None:
            output.close()
def extract_5prime_most_base(alignments_source, output_to_stdout,
                             output_filename):
    """Extract the 5'-most base from each alignment.

    Each line from read_chunk that is not empty and does not start with "@"
    or "#" is wrapped in a Read, and the result of its print_first_base()
    is either printed or appended as one line to output_filename.

    Arguments:
        alignments_source: source handed to read_chunk.
        output_to_stdout: when true, results are printed; otherwise they are
            appended to output_filename.
        output_filename: path of the file to append to (only opened when
            there is something to write and output_to_stdout is false).

    Raises:
        ReadError: re-raised unchanged for any read error whose ``name`` is
            not 'unmapped'.
    """
    output = None
    try:
        for alignment in read_chunk(alignments_source, CHUNK_SIZE):
            # skip empty lines and any header/comment lines
            if not alignment or alignment.startswith(("@", "#")):
                continue
            try:
                first_base = Read(alignment).print_first_base()
            except ReadError as _error:
                if _error.name != 'unmapped':
                    # bare raise keeps the original exception and traceback
                    raise
                # silently skip unmapped reads only
                continue
            if output_to_stdout:
                print(first_base)
            else:
                if output is None:
                    # opened once, on first use, instead of once per line
                    output = open(output_filename, 'a')
                output.write('{}\n'.format(first_base))
    finally:
        if output is not None:
            output.close()
def create_cluster_files(loci_openfile, library_name, database_prefix):
    """Create cluster and cluster-tag files from merged loci.

    For every tab-separated locus line from read_chunk, rows are appended to
    two data files:

    {prefix}_clusters.data: columns 0, 1 and 2 of the locus line,
        cluster_name, number of unique tags, strand
    {prefix}_clustertags.data: cluster_name, tag_sequence -- one row per
        unique tag of the cluster

    cluster_name is "{library_name}_{n}" with n counting loci from 1. Tags
    are taken from column 3 (";"-separated) and reduced by get_unique_tags.
    The strand is the last column when the line has exactly six columns,
    otherwise ".".

    NOTE(review): the previous docstring named the targets
    {prefix}_clusters.db / {prefix}_clustertags.db with columns Chromosome,
    Start (0-based), End (0-based); this function only writes the .data
    files -- confirm the column meaning against the loci file producer.
    """
    clusters_path = "{}.data".format("{}_clusters".format(database_prefix))
    clustertags_path = "{}.data".format(
        "{}_clustertags".format(database_prefix))

    numbered_loci = enumerate(read_chunk(loci_openfile, CHUNK), 1)
    for cluster_index, locus in numbered_loci:
        fields = locus.split("\t")
        unique_tags = get_unique_tags(fields[3].split(";"))
        strand = fields[-1] if len(fields) == 6 else "."
        cluster_name = "{}_{}".format(library_name, cluster_index)

        # one summary row per cluster
        with open(clusters_path, 'a') as cluster_out:
            cluster_out.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                fields[0], fields[1], fields[2],
                cluster_name, len(unique_tags), strand))

        # one row per unique tag belonging to the cluster
        with open(clustertags_path, 'a') as tag_out:
            for tag in unique_tags:
                tag_out.write("{}\t{}\n".format(cluster_name, tag))
def create_cluster_files(loci_openfile, library_name, database_prefix):
    """Create cluster and cluster-tag files from merged loci.

    For every tab-separated locus line from read_chunk, rows are appended to
    two data files:

    {prefix}_clusters.data: columns 0, 1 and 2 of the locus line,
        cluster_name, number of unique tags, strand
    {prefix}_clustertags.data: cluster_name, tag_sequence -- one row per
        unique tag of the cluster

    cluster_name is "{library_name}_{n}" with n counting loci from 1. Tags
    are taken from column 3 (";"-separated) and reduced by get_unique_tags.
    The strand is the last column when the line has exactly six columns,
    otherwise ".".

    NOTE(review): the previous docstring named the targets
    {prefix}_clusters.db / {prefix}_clustertags.db with columns Chromosome,
    Start (0-based), End (0-based); this function only writes the .data
    files -- confirm the column meaning against the loci file producer.
    """
    clusters_path = "{}.data".format("{}_clusters".format(database_prefix))
    clustertags_path = "{}.data".format(
        "{}_clustertags".format(database_prefix))

    numbered_loci = enumerate(read_chunk(loci_openfile, CHUNK), 1)
    for cluster_index, locus in numbered_loci:
        fields = locus.split("\t")
        unique_tags = get_unique_tags(fields[3].split(";"))
        strand = fields[-1] if len(fields) == 6 else "."
        cluster_name = "{}_{}".format(library_name, cluster_index)

        # one summary row per cluster
        with open(clusters_path, 'a') as cluster_out:
            cluster_out.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                fields[0], fields[1], fields[2],
                cluster_name, len(unique_tags), strand))

        # one row per unique tag belonging to the cluster
        with open(clustertags_path, 'a') as tag_out:
            for tag in unique_tags:
                tag_out.write("{}\t{}\n".format(cluster_name, tag))
def create_alignment_db(sam_openfile, library_name, database_prefix):
    """Write the alignment data files derived from a Bowtie SAM file.

    Rows are appended through write_data to three tab-separated files:

    {prefix}_tagloci.data: one row per mapped alignment --
        position, tag sequence, number of mismatches, strand
        (all four values as returned by parse_alignment)
    {prefix}_{library}.data: tag sequence, 1 -- one row per run of
        consecutive mapped alignments that share a read name
    {prefix}_tags.data: tag sequence, total mappings, mappings with
        0 mismatches (perfect), with 1 mismatch, with 2 mismatches --
        written together with the library row

    NOTE(review): the previous docstring described these outputs as SQLite3
    databases ({prefix}_*.db); the loading step is not part of this function
    -- confirm where it happens.

    ASSUMES: all alignments of one read are on consecutive lines, and
    parse_alignment reports a mismatch count of 0, 1 or 2 (it is used as an
    index into a three-element tally).

    Leading SAM header lines (starting with "@") are skipped. An empty or
    header-only input writes nothing. Unmapped reads are not written, except
    that the very first alignment always seeds the "last tag" state, so an
    unmapped first read is still written out with an all-zero tally.

    Arguments:
        sam_openfile: open SAM file object (must support readline()).
        library_name: used in the name of the library data file.
        database_prefix: prefix for all three data file names.
    """
    #TODO create_chromosome_db(sam_openfile, database_prefix)
    tagloci_data = "{}_tagloci.data".format(database_prefix)
    library_data = "{}_{}.data".format(database_prefix, library_name)
    tags_data = "{}_tags.data".format(database_prefix)

    def write_locus(position, tag, mismatches, strand):
        # one mapped alignment -> one row in the tag loci file
        write_data("{}\t{}\t{}\t{}".format(position, tag, mismatches, strand),
                   tagloci_data)

    def write_tag_summary(tag, tally):
        # finished read -> one row in the library file, one in the tags file
        write_data("{}\t1".format(tag), library_data)
        mismatch_string = "\t".join(str(m) for m in tally)
        write_data("{}\t{}\t{}".format(tag, sum(tally), mismatch_string),
                   tags_data)

    # scan through header lines; readline() returns "" at end of file, which
    # startswith() handles without the IndexError that indexing [0] raised
    first_line = sam_openfile.readline()
    while first_line.startswith("@"):
        first_line = sam_openfile.readline()
    if not first_line:
        # empty or header-only input: there is nothing to tally
        return

    mismatch_tally = [0, 0, 0]

    # parse initial alignment
    (read_name, maps, mismatches, tag, position, strand) = parse_alignment(
        first_line.strip())
    if maps:
        write_locus(position, tag, mismatches, strand)
        mismatch_tally[mismatches] += 1
    last_read_name = read_name
    last_tag = tag

    for alignment in read_chunk(sam_openfile, CHUNK):
        (read_name, maps, mismatches, tag, position, strand) = parse_alignment(
            alignment)
        if not maps:
            # don't process unmapped reads
            continue
        write_locus(position, tag, mismatches, strand)
        if read_name != last_read_name:
            # previous read is complete: write it out and reset for next round
            write_tag_summary(last_tag, mismatch_tally)
            last_read_name = read_name
            last_tag = tag
            mismatch_tally = [0, 0, 0]
        mismatch_tally[mismatches] += 1

    # write out last tag
    write_tag_summary(last_tag, mismatch_tally)