def main():
    """Build ROC curves for one 5'-end dataset against several annotation files.

    Command-line arguments (read from sys.argv):
        1   pickled read 5' end counts
        2   genome FASTA file
        3   output prefix, passed through to plot_ROC_curves
        4+  true positive / true negative annotation files, one curve each

    For every cutoff from 0 to just past 1 (step 0.0001) the positions called
    positive on 'S.c.18S_rRNA' are intersected with each annotation's true
    positives and true negatives; the resulting percentages form the curves.
    """
    read_5p_ends_file, genome_fasta, outprefix = sys.argv[1:4]
    annotation_files = sys.argv[4:]  # true positive and true negative annotations

    genome_dict = mod_utils.convertFastaToDict(genome_fasta)
    read_5p_ends = mod_utils.unPickle(read_5p_ends_file)
    normed_density_array = winsorize_norm_chromosome_data(
        read_5p_ends, 'S.c.18S_rRNA', '+', genome_dict, 'ACTG')

    # One (label, true positives, true negatives) triple per annotation file.
    annotation_sets = []
    for annotation_file in annotation_files:
        positives, negatives = get_tp_tn(annotation_file)
        annotation_sets.append(
            (os.path.basename(annotation_file), positives, negatives))

    # label -> [x values (FP rate), y values (TP rate)]
    roc_curves = {label: [[], []] for label, _, _ in annotation_sets}

    stepsize = 0.0001
    for cutoff in numpy.arange(0, 1. + 5 * stepsize, stepsize):
        # The called set depends only on the cutoff, so it is shared by all
        # annotation sets.
        called_p = call_positives(
            normed_density_array, 'S.c.18S_rRNA', '+', genome_dict, 'AC', cutoff)
        for label, positives, negatives in annotation_sets:
            tp_called = len(called_p.intersection(positives))
            fp_called = len(called_p.intersection(negatives))
            fp_rates, tp_rates = roc_curves[label]
            fp_rates.append(100. * fp_called / float(len(negatives)))  # x axis
            tp_rates.append(100. * tp_called / float(len(positives)))  # y axis

    plot_ROC_curves(roc_curves, outprefix)
def main():
    """Build one ROC curve per mutation-density file for the 25S rRNA.

    Command-line arguments (read from sys.argv):
        1   true positive / true negative annotation file
        2   genome FASTA file
        3   output prefix, passed through to plot_ROC_curves
        4+  pickled mutation-density files, one curve each

    Curves are labelled with each density file's basename up to the first
    "_back_".
    """
    tp_tn_annotations, genome_fasta, outprefix = sys.argv[1:4]
    density_files = sys.argv[4:]

    # The FASTA file should be the EXACT one used for the pipeline, and this
    # chromosome name should match the one in the FASTA file exactly.
    chromosome = "S.c.25S__rRNA"

    sample_names = [os.path.basename(path).split("_back_")[0] for path in density_files]
    mutation_densities = [mod_utils.unPickle(path) for path in density_files]
    genome_dict = mod_utils.convertFastaToDict(genome_fasta)
    normed_density_arrays = [
        winsorize_norm_chromosome_data(density, chromosome, genome_dict, "AC")
        for density in mutation_densities
    ]
    real_tp, real_tn = get_tp_tn(tp_tn_annotations)

    # sample name -> [x values (FP rate), y values (TP rate)]
    roc_curves = {name: [[], []] for name in sample_names}
    samples = list(zip(sample_names, normed_density_arrays))

    stepsize = 0.0001
    for cutoff in numpy.arange(0, 1.0 + 5 * stepsize, stepsize):
        for name, normed_density in samples:
            called_p = call_positives(normed_density, chromosome, genome_dict, "AC", cutoff)
            tp_called = len(called_p.intersection(real_tp))  # true positives at this cutoff
            fp_called = len(called_p.intersection(real_tn))  # false positives at this cutoff
            fp_rates, tp_rates = roc_curves[name]
            tp_rates.append(100.0 * tp_called / float(len(real_tp)))  # y axis
            fp_rates.append(100.0 * fp_called / float(len(real_tn)))  # x axis

    plot_ROC_curves(roc_curves, chromosome, outprefix)
def plot_weighted_nts_pie(background_subtracted, fasta_genome, title, out_prefix):
    """Save a pie chart of nucleotide identity weighted by per-position counts.

    Every position in ``background_subtracted`` contributes its value to the
    nucleotide found at that position in the genome; the A/T/C/G totals are
    normalised to fractions and drawn as a pie written to
    ``out_prefix + '.pdf'``.

    Args:
        background_subtracted: nested mapping strand -> chromosome ->
            position -> value. Positions are looked up as ``position - 1`` in
            the sequence, i.e. they are treated as 1-based.
        fasta_genome: FASTA path handed to mod_utils.convertFastaToDict.
        title: title drawn above the pie.
        out_prefix: output path without the '.pdf' extension.

    Nucleotides other than A, T, C and G are tallied but not plotted.
    """
    genome = mod_utils.convertFastaToDict(fasta_genome)
    labels = "ATCG"

    nt_counts = defaultdict(float)
    for strand in background_subtracted:
        for chromosome in background_subtracted[strand]:
            for position, value in background_subtracted[strand][chromosome].items():
                nt_counts[genome[chromosome][position - 1]] += value

    sizes = numpy.array([nt_counts[nt] for nt in labels])
    sizes = sizes / float(sizes.sum())
    merged_labels = ['%s %.3f' % (nt, fraction) for nt, fraction in zip(labels, sizes)]

    fig = plt.figure(figsize=(8, 8))
    try:
        # a pie chart of mutated nts weighted by background-subtracted counts
        plot = fig.add_subplot(111)
        plot.pie(sizes, labels=merged_labels, colors=mod_utils.rainbow)
        plot.set_title(title)
        # Save through the figure itself rather than pyplot's implicit
        # "current figure", and pass a real boolean for ``transparent``.
        fig.savefig(out_prefix + '.pdf', transparent=True, format='pdf')
    finally:
        # plt.clf() only clears a figure; pyplot keeps it registered until it
        # is closed, so close it here to avoid piling up figures across calls.
        plt.close(fig)
def main():
    """Build one ROC curve per mutation-density file for the 18S rRNA.

    Command-line arguments (read from sys.argv):
        1   true positive / true negative annotation file
        2   genome FASTA file
        3   output prefix, passed through to plot_ROC_curves
        4+  pickled mutation-density files, one curve each (labelled by basename)
    """
    tp_tn_annotations, genome_fasta, outprefix = sys.argv[1:4]
    density_files = sys.argv[4:]

    sample_names = [os.path.basename(path) for path in density_files]
    mutation_densities = [mod_utils.unPickle(path) for path in density_files]
    genome_dict = mod_utils.convertFastaToDict(genome_fasta)
    normed_density_arrays = [
        winsorize_norm_chromosome_data(density, 'S.c.18S_rRNA', '+', genome_dict, 'ACTG')
        for density in mutation_densities
    ]
    real_tp, real_tn = get_tp_tn(tp_tn_annotations)

    # sample name -> [x values (FP rate), y values (TP rate)]
    roc_curves = {name: [[], []] for name in sample_names}
    samples = list(zip(sample_names, normed_density_arrays))

    stepsize = 0.0001
    for cutoff in numpy.arange(0, 1. + 5 * stepsize, stepsize):
        for name, normed_density in samples:
            called_p = call_positives(
                normed_density, 'S.c.18S_rRNA', '+', genome_dict, 'AC', cutoff)
            tp_called = len(called_p.intersection(real_tp))  # true positives at this cutoff
            fp_called = len(called_p.intersection(real_tn))  # false positives at this cutoff
            fp_rates, tp_rates = roc_curves[name]
            tp_rates.append(100. * tp_called / float(len(real_tp)))  # y axis
            fp_rates.append(100. * fp_called / float(len(real_tn)))  # x axis

    plot_ROC_curves(roc_curves, outprefix)
def __init__(self, settings_file):
    """Record the settings path, process it, and load the rRNA sequences.

    Args:
        settings_file: path kept on the instance and handed to
            self.process_settings.

    After processing, the FASTA file named by self.get_rRNA_fasta() is parsed
    with mod_utils.convertFastaToDict and stored as self.rRNA_seqs.
    """
    self.settings_file = settings_file
    self.process_settings(settings_file)
    # Must come after process_settings, as in the original ordering.
    rRNA_fasta = self.get_rRNA_fasta()
    self.rRNA_seqs = mod_utils.convertFastaToDict(rRNA_fasta)
def _tally_events(events, weight, position_coverage, events_by_position,
                  event_totals, sized_code, forbidden_code, event_sizes,
                  nt_totals=None):
    """Fold one read's per-position alignment events into the running tallies.

    Args:
        events: indexable collection of per-position events. 'M' marks a
            match; any other entry is indexable, with entry[0] the event code
            and, for ``sized_code`` events, entry[1] the event size.
        weight: amount added for this read.
        position_coverage: position -> summed weight of reads covering it.
        events_by_position: position -> {event: summed weight}.
        event_totals: event -> summed weight over all positions.
        sized_code: code ('D' or 'I') whose size is appended to
            ``event_sizes``; such events are then tallied under the bare code.
        forbidden_code: code that must not occur in this coordinate system.
        event_sizes: list collecting the sizes of ``sized_code`` events.
        nt_totals: optional nucleotide -> summed weight mapping; when given,
            events whose first character is in 'ATCG' are also counted here.

    Raises:
        AssertionError: if an event starts with ``forbidden_code``.
    """
    # Indexed access is kept on purpose: the container type produced by
    # parse_MDz_and_cigar is not visible from this function.
    for pos in range(len(events)):
        position_coverage[pos] += weight
        event = events[pos]
        if event == 'M':
            continue  # only non-matches are counted
        assert event[0] != forbidden_code, (
            'unexpected %r event at relative position %d' % (forbidden_code, pos))
        if event[0] == sized_code:
            event_sizes.append(event[1])
            event = event[0]
        position_events = events_by_position[pos]
        if event not in position_events:
            position_events[event] = 0
        position_events[event] += weight
        event_totals[event] += weight
        if nt_totals is not None and event[0] in 'ATCG':
            nt_totals[event[0]] += weight


def count_reads(lib_settings):
    """Tally read starts, coverage and mutations from a library's mapped reads.

    Iterates over the gzipped SAM file named by
    ``lib_settings.get_mapped_reads_sam_gz()``. Header lines, unmapped reads
    (flag bit 4) and reads whose MAPQ is below the 'min_mapping_quality'
    property are skipped. Every remaining read is weighted by 1/NH (the value
    of its NH:i tag) and contributes to:

      * read 5' end counts, positional coverage and mutation counts, each
        keyed strand -> chromosome -> position;
      * mutation tallies relative to the genome and relative to the read
        (overall, by relative position, deletion/insertion sizes, and mutated
        nucleotide identity).

    All tallies are pickled to the paths supplied by ``lib_settings``,
    normalized mutation rates are computed and pickled, and the summary plots
    are drawn.

    Args:
        lib_settings: library settings object; only the accessors called in
            this function are relied upon.

    Raises:
        AssertionError: if the strings returned by checkTag for NM/MD/NH do
            not contain the expected tag name, or if an insertion shows up in
            the genome-relative events or a deletion in the read-relative ones.

    NOTE(review): ``strands`` is not defined in this function; it is expected
    to exist at module level.
    NOTE(review): gzip.open(..., 'r') yields bytes on Python 3, where the str
    operations on each line would fail; this looks like Python 2 code -
    confirm before porting.
    """
    # Create empty dicts for storing counts data
    srt_dict = createStrandDict(strands)  # Counts for 5' end of read, our standard data format
    cov_dict = createStrandDict(strands)  # Counts of times covered by a read
    mut_dict = createStrandDict(strands)  # Counts of mismatches at a position
    strand_tables = (srt_dict, cov_dict, mut_dict)

    read_mutations = defaultdict(int)  # counts different types of mutations relative to read
    genome_mutations = defaultdict(int)  # counts different types of mutations relative to genome
    mutations_by_read_position = defaultdict(dict)
    read_position_coverage = defaultdict(float)
    mutations_by_genome_position = defaultdict(dict)
    genome_position_coverage = defaultdict(float)
    mutated_nts = defaultdict(float)
    read_insertion_sizes = []
    genomic_deletion_sizes = []

    # Loop-invariant, so looked up once instead of once per SAM line.
    min_mapping_quality = lib_settings.get_property('min_mapping_quality')

    with gzip.open(lib_settings.get_mapped_reads_sam_gz(), 'r') as f:
        for line in f:  # Iterate through SAM file lines
            if line.startswith('@'):
                continue  # header line

            # Parse line into relevant strings
            fields = line.strip().split('\t')

            # The flag field is the decimal representation of a bit string,
            # each bit of which is true or false:
            #   Bit 0 = The read was part of a pair during sequencing
            #   Bit 1 = The read is mapped in a pair
            #   Bit 2 = The query sequence is unmapped
            #   Bit 3 = The mate is unmapped
            #   Bit 4 = Strand of query (0=forward 1=reverse)
            # So (16 & flag) is non-zero only for a read on the - strand.
            flag = int(fields[1])
            if 4 & flag:  # if this is an unmapped read, don't bother
                continue
            strand = '-' if 16 & flag else '+'
            chrom = fields[2]

            if not int(fields[4]) >= min_mapping_quality:  # MAPQ filter
                continue

            cigarString = fields[5]
            seq = fields[9]
            mappingLength = len(seq)

            # Some lines seem to lack some strings; this throws off indexing
            # of the NM:i, MD:Z and NH:i strings, hence the lookup by tag.
            NHstr = checkTag('NH:i:', fields)
            NMstr = checkTag('NM:i:', fields)
            MDstr = checkTag('MD:Z:', fields)
            assert 'NM:i' in NMstr, 'NM:i tag not found in: ' + line
            assert 'MD:Z' in MDstr, 'MD:Z tag not found in: ' + line
            assert 'NH:i' in NHstr, 'NH:i tag not found in: ' + line

            multiplicity = float(NHstr.split(':')[2])
            counts = 1.0 / multiplicity  # Weight of read

            # Add subdicts for chromosome if needed
            for table in strand_tables:
                if chrom not in table[strand]:
                    table[strand][chrom] = defaultdict(float)

            # Parse cigar string, get genome mapping span, and relative
            # genomic positions covered by read
            (rel_genomic_event_positions, rel_genome_coverage,
             mutations_rel_genome, mutations_rel_read,
             readMappingSpan, genomeMappingSpan) = parse_MDz_and_cigar(
                 cigarString, MDstr, mappingLength, seq)

            # Genome-relative events: deletions carry a size, insertions
            # cannot occur; mutated nucleotides are tallied as well.
            _tally_events(mutations_rel_genome, counts,
                          genome_position_coverage,
                          mutations_by_genome_position, genome_mutations,
                          'D', 'I', genomic_deletion_sizes,
                          nt_totals=mutated_nts)
            # Read-relative events: insertions carry a size, deletions
            # cannot occur.
            _tally_events(mutations_rel_read, counts,
                          read_position_coverage,
                          mutations_by_read_position, read_mutations,
                          'I', 'D', read_insertion_sizes)

            # Set start position of read
            start = int(fields[3])
            if strand != '+':
                # When a read maps to the minus strand, bowtie returns the
                # reverse complement, and indicates where this reverse mapped
                # on the + strand. Thus the original 5' end of the read
                # actually was x nt downstream on the + strand.
                start += genomeMappingSpan - 1

            # translate relative positions to absolute positions
            genome_cov = readGenomicCoverage(rel_genome_coverage, strand, start)
            srt_dict[strand][chrom][start] += counts  # add the weight at the start position
            for pos in genome_cov:  # Increment positions for coverage dict
                cov_dict[strand][chrom][pos] += counts

            # Absolute genomic positions of the mismatch events
            genMismatches = readGenomicCoverage(rel_genomic_event_positions, strand, start)
            for event_position in genMismatches:
                mut_dict[strand][chrom][event_position] += counts

    mod_utils.makePickle(srt_dict, lib_settings.get_read_5p_counts())
    mod_utils.makePickle(cov_dict, lib_settings.get_positional_coverage())
    mod_utils.makePickle(mut_dict, lib_settings.get_mutation_counts())

    prefix = lib_settings.get_counting_prefix()
    for data, suffix in (
            (genome_mutations, '.genome_mutations.pkl'),
            (mutations_by_genome_position, '.genome_position_mutations.pkl'),
            (genome_position_coverage, '.genome_position_coverage.pkl'),
            (mutated_nts, '.nt_mutations.pkl'),
            (read_mutations, '.read_mutations.pkl'),
            (mutations_by_read_position, '.read_position_mutations.pkl'),
            (read_position_coverage, '.read_position_coverage.pkl'),
            (genomic_deletion_sizes, '.deletion_sizes.pkl'),
            (read_insertion_sizes, '.insertion_sizes.pkl'),
    ):
        mod_utils.makePickle(data, prefix + suffix)

    normalized_mutations = normalized_mutation_rates(
        mod_utils.unPickle(lib_settings.get_mutation_counts()),
        mod_utils.unPickle(lib_settings.get_positional_coverage()))
    mod_utils.makePickle(normalized_mutations, lib_settings.get_normalized_mutation_counts())

    # The plots are fed from the pickles just written rather than from the
    # in-memory tallies, exactly as before.
    # NOTE(review): presumably so the plots reflect what was persisted - confirm.
    plot_mutated_nts_pie(
        mod_utils.unPickle(prefix + '.nt_mutations.pkl'),
        'mutated rRNA nts in ' + lib_settings.sample_name,
        prefix + '.mutated_nts')
    plot_full_mutation_stats(
        mod_utils.unPickle(prefix + '.read_mutations.pkl'),
        mod_utils.unPickle(prefix + '.insertion_sizes.pkl'),
        mod_utils.unPickle(prefix + '.read_position_mutations.pkl'),
        mod_utils.unPickle(prefix + '.read_position_coverage.pkl'),
        'mutations wrt reads', "insertion size",
        prefix + '.read_mutations')
    plot_full_mutation_stats(
        mod_utils.unPickle(prefix + '.genome_mutations.pkl'),
        mod_utils.unPickle(prefix + '.deletion_sizes.pkl'),
        mod_utils.unPickle(prefix + '.genome_position_mutations.pkl'),
        mod_utils.unPickle(prefix + '.genome_position_coverage.pkl'),
        'mutations wrt genome', "deletion size",
        prefix + '.genome_mutations')
    pie_read_5p_ends(
        mod_utils.unPickle(lib_settings.get_read_5p_counts()),
        mod_utils.convertFastaToDict(lib_settings.experiment_settings.get_rRNA_fasta()),
        prefix)
    normed_mutation_rate_histogram(
        mod_utils.unPickle(lib_settings.get_normalized_mutation_counts()),
        lib_settings.sample_name,
        prefix)