def main():
    """
    Compare experimental mutation-rate pickles to a control pickle and write the results.

    Command line arguments (read from sys.argv):
        1   outfolder                -- directory that receives every output file
        2   genome_fasta             -- accepted for command-line compatibility; not used here
        3   normalization_file_name  -- pickle holding the control data set
        4+  experimental pickles     -- one or more data sets compared against the control

    For every experimental data set this writes a wig track of the max-rescaled data, plus a
    pickle and a wig track of the comparison returned by compare_to_control() on the
    un-rescaled data. Two histograms are drawn over all data sets: the rescaled mutation
    rates and the comparison values.
    """
    outfolder, genome_fasta, normalization_file_name = sys.argv[1:4]
    experimental_file_names = sys.argv[4:]

    def strip_last_extension(file_name):
        # Base name with everything after the final '.' removed
        return '.'.join(os.path.basename(file_name).split('.')[:-1])

    control_dict = mod_utils.unPickle(normalization_file_name)
    rescaled_control_dict = normalize_dict_to_max(control_dict)
    norm_name = strip_last_extension(normalization_file_name)
    experimental_dict_names = [strip_last_extension(file_name) for file_name in experimental_file_names]
    experimental_dicts = [mod_utils.unPickle(file_name) for file_name in experimental_file_names]
    rescaled_experimental_dicts = [normalize_dict_to_max(exp_dict) for exp_dict in experimental_dicts]

    # Same text as the Python 2 statement "print names, norm_name", but also valid Python 3
    print('%s %s' % (experimental_dict_names, norm_name))

    normed_mutation_rate_histogram(rescaled_experimental_dicts, experimental_dict_names,
                                   os.path.join(outfolder, '%s_rescaled_mutation_rate_histogram' % norm_name),
                                   title='mutation rate, rescaled to max', xlim=(0, 0.1), min=0, max=1, step=0.001)

    comparisons = []
    # NOTE(review): the rescaled comparisons are computed and collected but never written or
    # plotted; the calls are kept so that the executed work is unchanged.
    rescaled_comparisons = []
    write_wig(control_dict, norm_name, os.path.join(outfolder, norm_name))
    for name, exp_dict, rescaled_dict in zip(experimental_dict_names, experimental_dicts,
                                             rescaled_experimental_dicts):
        write_wig(rescaled_dict, name, os.path.join(outfolder, name))
        comparison_log2_ratios = compare_to_control(exp_dict, control_dict)
        rescaled_comparison_log2_ratios = compare_to_control(rescaled_dict, rescaled_control_dict)
        comparisons.append(comparison_log2_ratios)
        rescaled_comparisons.append(rescaled_comparison_log2_ratios)
        mod_utils.makePickle(comparison_log2_ratios, os.path.join(outfolder, name+'_comparison_log2.pkl'))
        write_wig(comparison_log2_ratios, name+'_comparison_log2',
                  os.path.join(outfolder, name+'_comparison_log2'))

    normed_mutation_rate_histogram(comparisons, experimental_dict_names,
                                   os.path.join(outfolder, '%s_comparison_histogram' % norm_name),
                                   title='log2 experiment/control', xlim=(-10, 10), min=-100, max=100, step=0.1)
def pickle_mutation_fold_change(self, output_name, exclude_constitutive=False):
    """
    stores mutation fold changes as a simple pickle, of {rRNA_name:{position:fold change}}

    The fold change at a position is this library's mutation rate divided by the rate that
    self.get_normalizing_lib_with_mod() reports for the same rRNA and position.

    :param output_name: destination passed to mod_utils.makePickle
    :param exclude_constitutive: if True, nucleotides whose exclude_constitutive attribute is
                                 set are stored as 1.0 instead of a computed ratio
    :return: None
    """
    # The control library does not depend on the position, so look it up once
    control_lib = self.get_normalizing_lib_with_mod()
    output_dict = {}
    for rRNA in self.rRNA_mutation_data:
        nucleotides = self.rRNA_mutation_data[rRNA].nucleotides
        fold_changes = {}
        for position in nucleotides:
            nucleotide = nucleotides[position]
            if exclude_constitutive and nucleotide.exclude_constitutive:
                fold_changes[position] = 1.0
                continue
            control_rate = control_lib.get_mutation_rate_at_position(rRNA, nucleotide.position)
            try:
                fold_changes[position] = nucleotide.mutation_rate/control_rate
            except ZeroDivisionError:
                # Only a zero control rate is papered over; any other error now propagates.
                # inf*rate gives inf for a nonzero rate and nan when the rate is itself zero.
                fold_changes[position] = float('inf')*nucleotide.mutation_rate
        output_dict[rRNA] = fold_changes
    mod_utils.makePickle(output_dict, output_name)
def main():
    """
    Subtract a background data set from experimental mutation-rate pickles and write the results.

    Command line arguments (read from sys.argv):
        1   outfolder                -- output directory, created through mod_utils.make_dir
        2   genome_fasta             -- fasta file handed to plot_weighted_nts_pie
        3   normalization_file_name  -- pickle holding the background data set
        4+  experimental pickles     -- one or more data sets to subtract the background from

    Data set names are the file base names with their last two dot-separated parts removed.
    For every experimental data set this writes a wig track of the raw data, a pickle and a
    wig track of the result of subtract_background(), and (best effort) a nucleotide pie
    chart. Histograms are drawn for the raw and for the background-subtracted data sets.
    """
    outfolder, genome_fasta, normalization_file_name = sys.argv[1:4]
    experimental_file_names = sys.argv[4:]
    mod_utils.make_dir(outfolder)

    def strip_two_extensions(file_name):
        # Base name with its last two dot-separated parts removed
        return '.'.join(os.path.basename(file_name).split('.')[:-2])

    normalization_dict = mod_utils.unPickle(normalization_file_name)
    norm_name = strip_two_extensions(normalization_file_name)
    experimental_dict_names = [strip_two_extensions(file_name) for file_name in experimental_file_names]
    experimental_dicts = [mod_utils.unPickle(file_name) for file_name in experimental_file_names]

    normed_mutation_rate_histogram(experimental_dicts, experimental_dict_names,
                                   os.path.join(outfolder, 'mutation_rate_histogram'),
                                   title='nonzero positions')

    background_subtracted_sets = []
    write_wig(normalization_dict, norm_name, os.path.join(outfolder, norm_name))
    for name, exp_dict in zip(experimental_dict_names, experimental_dicts):
        write_wig(exp_dict, name, os.path.join(outfolder, name))
        background_subtracted = subtract_background(exp_dict, normalization_dict)
        background_subtracted_sets.append(background_subtracted)
        mod_utils.makePickle(background_subtracted, os.path.join(outfolder, name+'_subtracted.pkl'))
        write_wig(background_subtracted, name+'_subtracted', os.path.join(outfolder, name+'_subtracted'))
        try:
            plot_weighted_nts_pie(background_subtracted, genome_fasta,
                                  '%s backround-subtracted fractions' % name,
                                  os.path.join(outfolder, name+'_sub_pie'))
        except Exception as error:
            # The pie chart stays optional, but a failure is reported instead of vanishing,
            # and Ctrl-C / SystemExit are no longer swallowed.
            sys.stderr.write('WARNING: could not plot nucleotide pie chart for %s: %s\n' % (name, error))

    normed_mutation_rate_histogram(background_subtracted_sets, experimental_dict_names,
                                   os.path.join(outfolder, 'back_subtracted_mutation_rate_histogram'),
                                   title='nonzero positions, background subtracted')
def main():
    """
    Normalize pairs of libraries by subtraction and by division and write the results.

    Command line arguments (read from sys.argv):
        1   all_counts_file  -- pickle loaded with mod_utils.unPickle and passed on as counts
        2   all_depths_file  -- pickle loaded with mod_utils.unPickle and passed on as depths
        3   min_mutations    -- integer threshold handed to both normalization functions
        4   output_prefix    -- prefix of every output file name
        5+  comparisons      -- comma-separated pairs, e.g. "first,second"

    For every pair, four pickles (<prefix>_<first>_<second>_{sub_norm,sub_err,div_norm,div_err}.pkl)
    are written, plus one text table per rRNA via write_out_counts().
    """
    all_counts_file, all_depths_file, min_mutations, output_prefix = sys.argv[1:5]
    min_mutations = int(min_mutations)
    all_counts = mod_utils.unPickle(all_counts_file)
    all_depths = mod_utils.unPickle(all_depths_file)

    for pair in sys.argv[5:]:
        comparison = pair.split(',')
        subtracted_rates, subtraction_errors = subtraction_norm(all_counts, all_depths, min_mutations, comparison)
        divided_rates, division_errors = division_norm(all_counts, all_depths, min_mutations, comparison)

        # Shared stem of every file written for this comparison
        stem = '%s_%s_%s' % (output_prefix, comparison[0], comparison[1])
        outputs = (
            (subtracted_rates, 'sub_norm'),
            (subtraction_errors, 'sub_err'),
            (divided_rates, 'div_norm'),
            (division_errors, 'div_err'),
        )
        for data, label in outputs:
            mod_utils.makePickle(data, '%s_%s.pkl' % (stem, label))

        for rRNA in subtracted_rates:
            write_out_counts(subtracted_rates, subtraction_errors, divided_rates, division_errors, rRNA,
                             '%s_%s.txt' % (stem, rRNA))
def pickle_mutation_rates(self, output_name, subtract_background=False, subtract_control=False, exclude_constitutive=False):
    """
    stores mutation rates as a simple pickle, of {rRNA_name:{position:mutation rate}}

    :param output_name: destination passed to mod_utils.makePickle
    :param subtract_background: if True, subtract the rate reported by self.get_normalizing_lib()
                                at each position and clamp the result at 0
    :param subtract_control: if True, subtract the rate reported by
                             self.get_normalizing_lib_with_mod() at each position (no clamping,
                             so negative values are possible)
    :param exclude_constitutive: if True, nucleotides whose exclude_constitutive attribute is
                                 set are stored as 0
    :return: None
    :raises SyntaxError: if subtract_background and subtract_control are both set
    """
    # Validate the flag combination up front rather than once a nucleotide is reached.
    # The exception type is kept as-is for callers that may already catch it.
    if subtract_background and subtract_control:
        raise SyntaxError('Cannot subtract background and control simultaneously')

    # The library to subtract does not depend on the position, so fetch it once
    reference_lib = None
    if subtract_background:
        reference_lib = self.get_normalizing_lib()
    elif subtract_control:
        reference_lib = self.get_normalizing_lib_with_mod()

    output_dict = {}
    for rRNA in self.rRNA_mutation_data:
        nucleotides = self.rRNA_mutation_data[rRNA].nucleotides
        rates = {}
        for position in nucleotides:
            nucleotide = nucleotides[position]
            if exclude_constitutive and nucleotide.exclude_constitutive:
                rates[position] = 0
            elif subtract_background:
                rates[position] = max((nucleotide.mutation_rate -
                                       reference_lib.get_mutation_rate_at_position(rRNA, nucleotide.position)), 0.)
            elif subtract_control:
                rates[position] = nucleotide.mutation_rate - \
                    reference_lib.get_mutation_rate_at_position(rRNA, nucleotide.position)
            else:
                rates[position] = nucleotide.mutation_rate
        output_dict[rRNA] = rates
    mod_utils.makePickle(output_dict, output_name)
def _tally_events(events, weight, coverage, by_position, totals, sizes, sized_code, forbidden_code,
                  nt_totals=None):
    """
    Add the per-position events of one read to the running tallies (helper for count_reads).

    :param events: events of one read, indexable by 0..len(events)-1; 'M' marks a match
    :param weight: weight of the read, added to every tally it touches
    :param coverage: {position: weight}, incremented for every position of the read
    :param by_position: {position: {event: weight}}, incremented for non-match events
    :param totals: {event: weight}, incremented for non-match events
    :param sizes: list that receives event[1] of every event whose event[0] equals sized_code
    :param sized_code: code ('D' or 'I') of the events that carry a size; once the size is
                       recorded such an event is tallied under this single letter
    :param forbidden_code: code that must never occur in this event list
    :param nt_totals: optional {nucleotide: weight}, incremented when event[0] is A, T, C or G
    """
    # Indexed access is kept on purpose: the container type returned by
    # parse_MDz_and_cigar is not visible from here.
    for pos in range(len(events)):
        coverage[pos] += weight
        event = events[pos]
        if event == 'M':
            continue  # only non-matches are counted below
        assert event[0] != forbidden_code
        if event[0] == sized_code:
            sizes.append(event[1])
            event = event[0]
        position_counts = by_position[pos]
        position_counts[event] = position_counts.get(event, 0) + weight
        totals[event] += weight
        if nt_totals is not None and event[0] in 'ATCG':
            nt_totals[event[0]] += weight


def count_reads(lib_settings):
    """
    Count read 5' ends, coverage and mutation events from a library's mapped reads.

    Reads the gzipped SAM file named by lib_settings.get_mapped_reads_sam_gz(). Header lines,
    unmapped reads (flag bit 4) and reads whose MAPQ is below the 'min_mapping_quality'
    property are skipped. Every remaining read is weighted by 1/NH (its NH:i tag) and added to:
      * per-strand, per-chromosome dicts of read start, coverage and event positions
      * event tallies relative to the read and relative to the genome, as returned by
        parse_MDz_and_cigar()
    All tallies are pickled to the paths supplied by lib_settings, then the normalized
    mutation rates are computed and pickled and the summary plots are drawn.

    :param lib_settings: settings object of the library being counted
    :return: None
    """
    # Create empty dicts for storing counts data
    srt_dict = createStrandDict(strands)  # Counts for 5' end of read our standard data format
    cov_dict = createStrandDict(strands)  # Counts of times covered by a read
    mut_dict = createStrandDict(strands)  # Counts of mismatches at a position
    read_mutations = defaultdict(int)  # counts different types of mutations relative to read
    genome_mutations = defaultdict(int)  # counts different types of mutations relative to genome
    mutations_by_read_position = defaultdict(dict)
    read_position_coverage = defaultdict(float)
    mutations_by_genome_position = defaultdict(dict)
    genome_position_coverage = defaultdict(float)
    mutated_nts = defaultdict(float)
    read_insertion_sizes = []
    genomic_deletion_sizes = []

    # The threshold is the same for every read, so look it up once
    min_mapping_quality = lib_settings.get_property('min_mapping_quality')

    with gzip.open(lib_settings.get_mapped_reads_sam_gz(), 'r') as f:
        for line in f:  # Iterate through SAM file lines
            if line.startswith('@'):
                continue  # header line
            # Parse line into relevant strings
            fields = line.strip().split('\t')
            flag = int(fields[1])
            # The flag field provides a lot of info about the read. It is the decimal
            # representation of a bit string, each digit of which is true or false:
            #   Bit 0 = The read was part of a pair during sequencing
            #   Bit 1 = The read is mapped in a pair
            #   Bit 2 = The query sequence is unmapped
            #   Bit 3 = The mate is unmapped
            #   Bit 4 = Strand of query (0=forward 1=reverse)
            # So, to see if a flag represents a read on the - strand, we evaluate (16 & flag),
            # where & is the bitwise and operator, which will be non-zero (True) only if this
            # read is on the - strand.
            if 4 & flag:  # if this is an unmapped read, don't bother
                continue
            strand = '-' if 16 & flag else '+'
            chrom = fields[2]
            if int(fields[4]) < min_mapping_quality:
                continue
            cigarString = fields[5]
            seq = fields[9]
            mappingLength = len(seq)
            # Some lines seem to lack some strings; this throws off the indexing of the
            # NM:i, MD:Z, and NH:i strings, so they are looked up by tag
            NHstr = checkTag('NH:i:', fields)
            NMstr = checkTag('NM:i:', fields)
            MDstr = checkTag('MD:Z:', fields)
            assert 'NM:i' in NMstr
            assert 'MD:Z' in MDstr
            assert 'NH:i' in NHstr
            multiplicity = float(NHstr.split(':')[2])
            counts = 1.0/multiplicity  # Weight of read

            # Add subdicts for chromosome if needed
            for strand_dict in (srt_dict, cov_dict, mut_dict):
                if chrom not in strand_dict[strand]:
                    strand_dict[strand][chrom] = defaultdict(float)

            # Parse cigar string, get genome mapping span, and relative genomic positions covered by read
            (rel_genomic_event_positions, rel_genome_coverage, mutations_rel_genome, mutations_rel_read,
             readMappingSpan, genomeMappingSpan) = parse_MDz_and_cigar(cigarString, MDstr, mappingLength, seq)

            # Relative to the genome: deletions carry a size, insertions must not occur
            _tally_events(mutations_rel_genome, counts, genome_position_coverage,
                          mutations_by_genome_position, genome_mutations, genomic_deletion_sizes,
                          'D', 'I', nt_totals=mutated_nts)
            # Relative to the read: insertions carry a size, deletions must not occur
            _tally_events(mutations_rel_read, counts, read_position_coverage,
                          mutations_by_read_position, read_mutations, read_insertion_sizes,
                          'I', 'D')

            # Set start position of read
            start = int(fields[3])
            if strand != '+':
                # When a read maps to the minus strand, bowtie returns the reverse complement, and
                # indicates where this reverse mapped on the + strand. Thus the original 5' end of
                # the read actually was x nt downstream on the + strand
                start = start+genomeMappingSpan-1

            # translate relative positions to absolute positions
            genome_cov = readGenomicCoverage(rel_genome_coverage, strand, start)  # get genome coverage
            srt_dict[strand][chrom][start] += counts  # just add the number of counts to that start position
            for pos in genome_cov:  # Increment positions for coverage dict
                cov_dict[strand][chrom][pos] += counts
            # If mismatches need to parse, get the absolute genomic pos, and increment counters
            genMismatches = readGenomicCoverage(rel_genomic_event_positions, strand, start)
            for event_position in genMismatches:
                mut_dict[strand][chrom][event_position] += counts

    counting_prefix = lib_settings.get_counting_prefix()
    mod_utils.makePickle(srt_dict, lib_settings.get_read_5p_counts())
    mod_utils.makePickle(cov_dict, lib_settings.get_positional_coverage())
    mod_utils.makePickle(mut_dict, lib_settings.get_mutation_counts())
    prefixed_outputs = (
        (genome_mutations, '.genome_mutations.pkl'),
        (mutations_by_genome_position, '.genome_position_mutations.pkl'),
        (genome_position_coverage, '.genome_position_coverage.pkl'),
        (mutated_nts, '.nt_mutations.pkl'),
        (read_mutations, '.read_mutations.pkl'),
        (mutations_by_read_position, '.read_position_mutations.pkl'),
        (read_position_coverage, '.read_position_coverage.pkl'),
        (genomic_deletion_sizes, '.deletion_sizes.pkl'),
        (read_insertion_sizes, '.insertion_sizes.pkl'),
    )
    for data, suffix in prefixed_outputs:
        mod_utils.makePickle(data, counting_prefix + suffix)

    # NOTE(review): everything below re-reads the pickles written above instead of using the
    # in-memory objects. Kept as-is because what unPickle returns is not visible from here;
    # confirm before simplifying.
    normalized_mutations = normalized_mutation_rates(mod_utils.unPickle(lib_settings.get_mutation_counts()),
                                                     mod_utils.unPickle(lib_settings.get_positional_coverage()))
    mod_utils.makePickle(normalized_mutations, lib_settings.get_normalized_mutation_counts())

    plot_mutated_nts_pie(mod_utils.unPickle(counting_prefix + '.nt_mutations.pkl'),
                         'mutated rRNA nts in ' + lib_settings.sample_name,
                         counting_prefix+'.mutated_nts')
    plot_full_mutation_stats(mod_utils.unPickle(counting_prefix + '.read_mutations.pkl'),
                             mod_utils.unPickle(counting_prefix + '.insertion_sizes.pkl'),
                             mod_utils.unPickle(counting_prefix + '.read_position_mutations.pkl'),
                             mod_utils.unPickle(counting_prefix + '.read_position_coverage.pkl'),
                             'mutations wrt reads', "insertion size",
                             counting_prefix+'.read_mutations')
    plot_full_mutation_stats(mod_utils.unPickle(counting_prefix + '.genome_mutations.pkl'),
                             mod_utils.unPickle(counting_prefix + '.deletion_sizes.pkl'),
                             mod_utils.unPickle(counting_prefix + '.genome_position_mutations.pkl'),
                             mod_utils.unPickle(counting_prefix + '.genome_position_coverage.pkl'),
                             'mutations wrt genome', "deletion size",
                             counting_prefix+'.genome_mutations')
    pie_read_5p_ends(mod_utils.unPickle(lib_settings.get_read_5p_counts()),
                     mod_utils.convertFastaToDict(lib_settings.experiment_settings.get_rRNA_fasta()),
                     counting_prefix)
    normed_mutation_rate_histogram(mod_utils.unPickle(lib_settings.get_normalized_mutation_counts()),
                                   lib_settings.sample_name, counting_prefix)