def set_sam_tag(cls, count_tag, bamfile_name, tag_regex):
    r"""Sample the BAM file and delegate tag detection to process_set_sam_tag.

    run_pipe is given two commands, ``samtools view <bamfile_name>`` and
    ``head -n 10`` (presumably chained so that only the first ten alignment
    lines are sampled -- confirm against run_pipe).  On success the sample
    is forwarded to cls.process_set_sam_tag and that call's result is
    returned; the original summary says this records a key:value pair in
    the class variable has_sam_tag.

    Keyword Arguments:
    count_tag -- boolean on whether to count with this tag
    bamfile_name -- file to query for tag
    tag_regex -- regular expression for the tag (eg 'NA:i:(\d+)')

    Raises:
    MetageneError -- when run_pipe reports that the commands failed
    """
    view_command = 'samtools view {}'.format(bamfile_name)
    pipe_ok, pipe_output = run_pipe([view_command, 'head -n 10'])

    if not pipe_ok:
        # on failure the second element of run_pipe's result is reported
        # verbatim as the error text
        raise MetageneError(
            "Checking the bam file failed with error: {}".format(pipe_output))

    return cls.process_set_sam_tag(pipe_output, count_tag, tag_regex)
def set_chromosome_sizes(cls, bamfile):
    """Read the BAM header and pass it to extract_chromosome_sizes.

    The header is requested with ``samtools view -H <bamfile>`` through
    run_pipe.  The result of cls.extract_chromosome_sizes(header) is
    returned; per the original summary this populates the chromosome_sizes
    dictionary (that happens inside extract_chromosome_sizes, not here).

    Keyword Arguments:
    bamfile -- name of bamfile

    Raises:
    MetageneError -- when the header command fails, or when
                     extract_chromosome_sizes raises MetageneError (the
                     error is re-raised with the file name prepended)
    """
    header_command = "samtools view -H {}".format(bamfile)
    pipe_ok, header_lines = run_pipe([header_command])

    if not pipe_ok:
        raise MetageneError("Could not open BAM file {}".format(bamfile))

    try:
        return cls.extract_chromosome_sizes(header_lines)
    except MetageneError as err:
        # add the file name so the user knows which BAM header was rejected
        details = err.message
        raise MetageneError(
            "Error processing {} header\n{}".format(bamfile, details))
def set_chromosome_sizes(cls, bamfile):
    """Fetch the header of a BAM file and extract chromosome sizes from it.

    Runs ``samtools view -H <bamfile>`` via run_pipe and returns whatever
    cls.extract_chromosome_sizes produces for the header output.  The
    original summary describes this as setting the chromosome_sizes
    dictionary; that assignment is not visible in this method.

    Keyword Arguments:
    bamfile -- name of bamfile

    Raises:
    MetageneError -- if run_pipe reports failure, or if the header cannot
                     be processed by extract_chromosome_sizes
    """
    succeeded, bam_header = run_pipe(
        ["samtools view -H {}".format(bamfile)])

    if not succeeded:
        raise MetageneError("Could not open BAM file {}".format(bamfile))

    try:
        chromosome_sizes = cls.extract_chromosome_sizes(bam_header)
    except MetageneError as err:
        # re-raise with the offending file named in the message
        raise MetageneError(
            "Error processing {} header\n{}".format(bamfile, err.message))
    return chromosome_sizes
def set_sam_tag(cls, count_tag, bamfile_name, tag_regex):
    r"""Check a sample of the BAM file for a SAM tag.

    A sample is obtained by handing ``samtools view <bamfile_name>`` and
    ``head -n 10`` to run_pipe.  When that succeeds, the sample, count_tag
    and tag_regex go to cls.process_set_sam_tag and its result is returned
    (per the original summary this adds a key:value pair to the class
    variable has_sam_tag).

    Keyword Arguments:
    count_tag -- boolean on whether to count with this tag
    bamfile_name -- file to query for tag
    tag_regex -- regular expression for the tag (eg 'NA:i:(\d+)')

    Raises:
    MetageneError -- if run_pipe reports failure
    """
    commands = ['samtools view {}'.format(bamfile_name), 'head -n 10']
    succeeded, sample = run_pipe(commands)

    if succeeded:
        return cls.process_set_sam_tag(sample, count_tag, tag_regex)

    # when the pipe fails, its second return value is used as the error text
    failure = "Checking the bam file failed with error: {}".format(sample)
    raise MetageneError(failure)
def metagene_count():
    """Chain of command for metagene_count analysis.

    Steps, in order:
    1. parse the command line (get_arguments)
    2. read chromosome sizes from the BAM header and build the feature-to-
       alignment chromosome name conversion
    3. register the NA (abundance) and NH (mappings) SAM tags on Read
    4. build the Metagene template and determine the feature file format
    5. for every non-comment line of the feature file, create a Feature,
       pull the overlapping reads with samtools, count them and write one
       result row

    Output goes to "<output_prefix>.metagene_counts.csv".  The file is
    always started fresh; the "# Metagene:" header and the full metagene
    description are written only when arguments.interval_variable is
    false.

    Raises:
    MetageneError -- if the metagene template or the feature format cannot
                     be set up, or if samtools fails for a feature region
    """
    arguments = get_arguments()

    # confirm BAM file and extract chromosome sizes
    Read.set_chromosome_sizes(arguments.alignment)

    ##TODO: create a list of chromosomes to analyze and/or exclude
    # create chromosome conversion dictionary for feature (GFF/BED) to alignment (BAM)
    Feature.set_chromosome_conversion(arguments.chromosome_names,
                                      Read.chromosome_sizes.keys())

    # define has_abundance and has_mappings tags for Read class
    # (raw strings: \d is a regex escape, not a string escape)
    Read.set_sam_tag(arguments.extract_abundance, arguments.alignment, r"NA:i:(\d+)")
    Read.set_sam_tag(arguments.extract_mappings, arguments.alignment, r"NH:i:(\d+)")

    # define the metagene array shape (left padding, start, internal, end, right padding)
    # metagene = padding ---- internal region ---- padding
    try:
        metagene = Metagene(arguments.interval_size, arguments.padding, arguments.padding)
        print("Metagene definition:\t{}".format(metagene))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the metagene template")

    try:
        Feature.set_format(arguments.feature)  # assign file format for the feature file
        print("Reading feature file as {} format".format(Feature.format))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the feature object")

    output_name = "{}.metagene_counts.csv".format(arguments.output_prefix)

    # Open the output once in write mode.  Previously the file was only
    # truncated when the header was written, so with interval_variable set
    # the rows were appended to whatever an earlier run had left behind.
    with open(output_name, 'w') as output_file:
        # print out the header line...
        if not arguments.interval_variable:
            output_file.write("# Metagene:\t{}\n".format(metagene))  # define for plotting later
            output_file.write(metagene.print_full())

        # for each feature
        with open(arguments.feature, 'r') as feature_file:
            for feature_line in read_chunk(feature_file, 1024):
                # skip empty lines (indexing them would raise IndexError)
                # and comment lines
                if not feature_line or feature_line[0] == "#":
                    continue

                # change creation with feature_method
                feature = Feature.create(arguments.feature_count,
                                         metagene,
                                         feature_line,
                                         arguments.count_splicing,
                                         arguments.ignore_strand)

                # pull out sam file lines; it is important to use
                # Feature.get_samtools_region() rather than
                # Feature.get_chromosome_region() because only the first
                # ensures that the interval does not extend beyond the
                # length of the chromosome, which makes samtools view
                # return no reads
                (run_pipe_worked, sam_sample) = run_pipe(
                    ['samtools view {} {}'.format(arguments.alignment,
                                                  feature.get_samtools_region())])
                if not run_pipe_worked:
                    raise MetageneError(
                        "Could not pull chromosomal region {} for feature {} from BAM file {}.".format(
                            feature.get_chromosome_region(),
                            feature.name,
                            arguments.alignment))

                for samline in sam_sample:
                    if not samline:
                        continue

                    # create Read feature
                    (created_read, read) = Read.create_from_sam(
                        samline,
                        Feature.chromosome_conversion.values(),
                        arguments.count_method,
                        arguments.uniquely_mapping,
                        arguments.ignore_strand,
                        arguments.count_secondary_alignments,
                        arguments.count_failed_quality_control,
                        arguments.count_PCR_optical_duplicate,
                        arguments.count_supplementary_alignment)

                    # count read (if it exists)
                    if created_read:
                        feature.count_read(read,
                                           arguments.count_method,
                                           arguments.count_splicing,
                                           arguments.count_partial_reads,
                                           arguments.ignore_strand)

                # output the resulting metagene
                output_file.write("{}\n".format(
                    feature.print_metagene(interval_override=arguments.interval_variable)))
def metagene_count():
    """Chain of command for metagene_count analysis.

    Parses the command line, reads chromosome sizes from the BAM header,
    builds the chromosome name conversion, registers the NA/NH SAM tags on
    Read, creates the Metagene template, determines the feature file
    format, and then counts reads for every non-comment line of the
    feature file.

    One row per feature is written to
    "<output_prefix>.metagene_counts.csv".  The file is always created
    anew; the "# Metagene:" header and the full metagene description are
    written only when arguments.interval_variable is false.

    Raises:
    MetageneError -- if the metagene template or the feature format cannot
                     be set up, or if samtools fails for a feature region
    """
    arguments = get_arguments()

    # confirm BAM file and extract chromosome sizes
    Read.set_chromosome_sizes(arguments.alignment)

    ##TODO: create a list of chromosomes to analyze and/or exclude
    # create chromosome conversion dictionary for feature (GFF/BED) to alignment (BAM)
    Feature.set_chromosome_conversion(arguments.chromosome_names,
                                      Read.chromosome_sizes.keys())

    # define has_abundance and has_mappings tags for Read class
    # (raw strings keep \d as a regex escape rather than a string escape)
    Read.set_sam_tag(arguments.extract_abundance, arguments.alignment, r"NA:i:(\d+)")
    Read.set_sam_tag(arguments.extract_mappings, arguments.alignment, r"NH:i:(\d+)")

    # define the metagene array shape (left padding, start, internal, end, right padding)
    # metagene = padding ---- internal region ---- padding
    try:
        metagene = Metagene(arguments.interval_size, arguments.padding, arguments.padding)
        print("Metagene definition:\t{}".format(metagene))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the metagene template")

    try:
        Feature.set_format(arguments.feature)  # assign file format for the feature file
        print("Reading feature file as {} format".format(Feature.format))
    except MetageneError as err:
        print(err)
        raise MetageneError("Unable to create the feature object")

    counts_path = "{}.metagene_counts.csv".format(arguments.output_prefix)

    # A single write-mode handle replaces the per-feature append.  The old
    # code truncated the file only while writing the header, so a run with
    # interval_variable set appended its rows to any pre-existing file.
    with open(counts_path, 'w') as output_file:
        # print out the header line...
        if not arguments.interval_variable:
            output_file.write("# Metagene:\t{}\n".format(metagene))  # define for plotting later
            output_file.write(metagene.print_full())

        # for each feature
        with open(arguments.feature, 'r') as feature_file:
            for feature_line in read_chunk(feature_file, 1024):
                # an empty line has no first character to test; skip it
                # along with comment lines
                if not feature_line or feature_line[0] == "#":
                    continue

                # change creation with feature_method
                feature = Feature.create(arguments.feature_count,
                                         metagene,
                                         feature_line,
                                         arguments.count_splicing,
                                         arguments.ignore_strand)

                # pull out sam file lines; it is important to use
                # Feature.get_samtools_region() rather than
                # Feature.get_chromosome_region() because only the first
                # ensures that the interval does not extend beyond the
                # length of the chromosome, which makes samtools view
                # return no reads
                region_command = 'samtools view {} {}'.format(
                    arguments.alignment, feature.get_samtools_region())
                (run_pipe_worked, sam_sample) = run_pipe([region_command])
                if not run_pipe_worked:
                    raise MetageneError(
                        "Could not pull chromosomal region {} for feature {} from BAM file {}.".format(
                            feature.get_chromosome_region(),
                            feature.name,
                            arguments.alignment))

                for samline in sam_sample:
                    if not samline:
                        continue

                    # create Read feature
                    (created_read, read) = Read.create_from_sam(
                        samline,
                        Feature.chromosome_conversion.values(),
                        arguments.count_method,
                        arguments.uniquely_mapping,
                        arguments.ignore_strand,
                        arguments.count_secondary_alignments,
                        arguments.count_failed_quality_control,
                        arguments.count_PCR_optical_duplicate,
                        arguments.count_supplementary_alignment)

                    # count read (if it exists)
                    if created_read:
                        feature.count_read(read,
                                           arguments.count_method,
                                           arguments.count_splicing,
                                           arguments.count_partial_reads,
                                           arguments.ignore_strand)

                # output the resulting metagene
                output_file.write("{}\n".format(
                    feature.print_metagene(interval_override=arguments.interval_variable)))