Ejemplo n.º 1
0
def main():
    """Compare experimental mutation-rate pickles against a control pickle.

    Command line (read from ``sys.argv``)::

        <outfolder> <genome_fasta> <normalization_pickle> [<experimental_pickle> ...]

    Steps performed:

    * every input pickle is loaded, and a rescaled copy is made with
      ``normalize_dict_to_max``;
    * a histogram of the rescaled experimental datasets is written;
    * a wig file is written for the (un-rescaled) control and for each
      rescaled experimental dataset;
    * each experimental dataset is passed with the control to
      ``compare_to_control``; the result (named/plotted as a log2 ratio) is
      pickled and written as a wig file;
    * a histogram of all the comparisons is written.

    Dataset names are the input file names without directory and without the
    final extension.  ``genome_fasta`` is accepted for command-line
    compatibility but is not used here.
    """
    outfolder, genome_fasta, normalization_file_name = sys.argv[1:4]
    experimental_file_names = sys.argv[4:]

    # Everything below is written under outfolder, so make sure it exists
    # instead of failing on the first write.
    if outfolder and not os.path.isdir(outfolder):
        os.makedirs(outfolder)

    control_dict = mod_utils.unPickle(normalization_file_name)
    rescaled_control_dict = normalize_dict_to_max(control_dict)
    # splitext drops only the last extension and, unlike splitting on '.',
    # does not collapse an extension-less file name to the empty string.
    norm_name = os.path.splitext(os.path.basename(normalization_file_name))[0]
    experimental_dict_names = [os.path.splitext(os.path.basename(file_name))[0]
                               for file_name in experimental_file_names]
    experimental_dicts = [mod_utils.unPickle(file_name) for file_name in experimental_file_names]
    rescaled_experimental_dicts = [normalize_dict_to_max(exp_dict) for exp_dict in experimental_dicts]

    # Single-argument call form prints the same text under Python 2 and 3.
    print('%s %s' % (experimental_dict_names, norm_name))

    normed_mutation_rate_histogram(
        rescaled_experimental_dicts, experimental_dict_names,
        os.path.join(outfolder, '%s_rescaled_mutation_rate_histogram' % norm_name),
        title='mutation rate, rescaled to max', xlim=(0, 0.1), min=0, max=1, step=0.001)

    comparisons = []
    rescaled_comparisons = []
    write_wig(control_dict, norm_name, os.path.join(outfolder, norm_name))
    for name, exp_dict, rescaled_exp_dict in zip(experimental_dict_names,
                                                 experimental_dicts,
                                                 rescaled_experimental_dicts):
        write_wig(rescaled_exp_dict, name, os.path.join(outfolder, name))

        comparison_log2_ratios = compare_to_control(exp_dict, control_dict)
        # The rescaled comparison is still computed (as before) but is not
        # currently written to disk or plotted.
        rescaled_comparison_log2_ratios = compare_to_control(rescaled_exp_dict, rescaled_control_dict)
        comparisons.append(comparison_log2_ratios)
        rescaled_comparisons.append(rescaled_comparison_log2_ratios)

        comparison_name = name + '_comparison_log2'
        mod_utils.makePickle(comparison_log2_ratios, os.path.join(outfolder, comparison_name + '.pkl'))
        write_wig(comparison_log2_ratios, comparison_name, os.path.join(outfolder, comparison_name))

    normed_mutation_rate_histogram(
        comparisons, experimental_dict_names,
        os.path.join(outfolder, '%s_comparison_histogram' % norm_name),
        title='log2 experiment/control', xlim=(-10, 10), min=-100, max=100, step=0.1)
Ejemplo n.º 2
0
 def pickle_mutation_fold_change(self, output_name, exclude_constitutive=False):
     """
     Store per-position mutation-rate fold changes as a simple pickle of
     {rRNA_name: {position: fold change}}.

     The fold change of a position is this library's mutation rate divided by
     the rate reported for the same position by
     self.get_normalizing_lib_with_mod().

     :param output_name: path handed to mod_utils.makePickle for the result
     :param exclude_constitutive: if True, positions whose nucleotide has
         exclude_constitutive set are stored as 1.0 (i.e. "no change")
         instead of being computed
     :return: None; the result is only written to output_name
     """
     output_dict = {}
     for rRNA in self.rRNA_mutation_data:
         output_dict[rRNA] = {}
         for position in self.rRNA_mutation_data[rRNA].nucleotides:
             nucleotide = self.rRNA_mutation_data[rRNA].nucleotides[position]
             if exclude_constitutive and nucleotide.exclude_constitutive:
                 output_dict[rRNA][position] = 1.0
             else:
                 try:
                     output_dict[rRNA][position] = nucleotide.mutation_rate/self.get_normalizing_lib_with_mod().get_mutation_rate_at_position(rRNA, nucleotide.position)
                 except Exception:
                     # Best effort: any failure to compute the ratio (e.g. a
                     # zero rate in the normalizing library) falls back to a
                     # signed infinity.  Catching Exception rather than using
                     # a bare except lets KeyboardInterrupt/SystemExit through.
                     # With float rates, inf * 0.0 evaluates to nan.
                     output_dict[rRNA][position] = float('inf')*nucleotide.mutation_rate
     mod_utils.makePickle(output_dict, output_name)
def main():
    """Subtract a background dataset from each experimental dataset.

    Command line (read from ``sys.argv``)::

        <outfolder> <genome_fasta> <normalization_pickle> [<experimental_pickle> ...]

    Steps performed:

    * the output folder is created with ``mod_utils.make_dir``;
    * a histogram of the experimental datasets is written;
    * wig files are written for the normalization dataset and for each
      experimental dataset;
    * each experimental dataset is passed with the normalization dataset to
      ``subtract_background``; the result is pickled, written as a wig file
      and (best effort) plotted as a nucleotide pie chart;
    * a histogram of all background-subtracted datasets is written.

    Dataset names are the input file names without directory and without the
    last two dot-separated components (e.g. ``a.b.pkl`` -> ``a``).
    """
    outfolder, genome_fasta, normalization_file_name = sys.argv[1:4]
    experimental_file_names = sys.argv[4:]
    mod_utils.make_dir(outfolder)
    normalization_dict = mod_utils.unPickle(normalization_file_name)
    norm_name = '.'.join(os.path.basename(normalization_file_name).split('.')[:-2])
    experimental_dict_names = ['.'.join(os.path.basename(file_name).split('.')[:-2]) for file_name in experimental_file_names]
    experimental_dicts = [mod_utils.unPickle(file_name) for file_name in experimental_file_names]

    normed_mutation_rate_histogram(experimental_dicts, experimental_dict_names, os.path.join(outfolder, 'mutation_rate_histogram'), title='nonzero positions')
    background_subtracted_sets = []
    write_wig(normalization_dict, norm_name, os.path.join(outfolder, norm_name))
    for name, experimental_dict in zip(experimental_dict_names, experimental_dicts):
        write_wig(experimental_dict, name, os.path.join(outfolder, name))
        background_subtracted = subtract_background(experimental_dict, normalization_dict)
        background_subtracted_sets.append(background_subtracted)
        subtracted_name = name + '_subtracted'
        mod_utils.makePickle(background_subtracted, os.path.join(outfolder, subtracted_name + '.pkl'))
        write_wig(background_subtracted, subtracted_name, os.path.join(outfolder, subtracted_name))
        try:
            plot_weighted_nts_pie(background_subtracted, genome_fasta, '%s backround-subtracted fractions' % name, os.path.join(outfolder, name+'_sub_pie'))
        except Exception as err:
            # The pie chart is optional: keep going with the remaining
            # datasets, but report the failure instead of hiding it.
            sys.stderr.write('WARNING: could not plot nucleotide pie for %s: %s\n' % (name, err))
    normed_mutation_rate_histogram(background_subtracted_sets, experimental_dict_names, os.path.join(outfolder, 'back_subtracted_mutation_rate_histogram'), title = 'nonzero positions, background subtracted')
def main():
    """Run subtraction and division normalisation for each requested pair.

    Command line (read from ``sys.argv``)::

        <all_counts_pickle> <all_depths_pickle> <min_mutations> <output_prefix> [<a>,<b> ...]

    Each trailing argument is split on ',' and handed, as a list, to
    ``subtraction_norm`` and ``division_norm``.  For every pair, four pickles
    (``_sub_norm``, ``_sub_err``, ``_div_norm``, ``_div_err``) and one text
    file per rRNA in the subtracted rates are written, all named
    ``<output_prefix>_<a>_<b>_...``.
    """
    arguments = sys.argv
    counts_path, depths_path, min_mutations_arg, output_prefix = arguments[1:5]
    min_mutations = int(min_mutations_arg)
    all_counts = mod_utils.unPickle(counts_path)
    all_depths = mod_utils.unPickle(depths_path)

    for pair_argument in arguments[5:]:
        comparison = pair_argument.split(',')
        sub_rates, sub_errors = subtraction_norm(all_counts, all_depths, min_mutations, comparison)
        div_rates, div_errors = division_norm(all_counts, all_depths, min_mutations, comparison)

        # Shared stem of every output file belonging to this comparison
        stem = '%s_%s_%s' % (output_prefix, comparison[0], comparison[1])
        pickled_outputs = (
            (sub_rates, '_sub_norm.pkl'),
            (sub_errors, '_sub_err.pkl'),
            (div_rates, '_div_norm.pkl'),
            (div_errors, '_div_err.pkl'),
        )
        for data, suffix in pickled_outputs:
            mod_utils.makePickle(data, stem + suffix)

        for rRNA in sub_rates:
            write_out_counts(sub_rates, sub_errors, div_rates, div_errors, rRNA, '%s_%s.txt' % (stem, rRNA))
Ejemplo n.º 5
0
 def pickle_mutation_rates(self, output_name, subtract_background=False, subtract_control=False, exclude_constitutive=False):
     """
     Write per-position mutation rates to a pickle shaped as
     {rRNA_name: {position: mutation rate}}.

     :param output_name: path handed to mod_utils.makePickle for the result
     :param subtract_background: subtract the rate reported by
         self.get_normalizing_lib() for the same position, flooring at 0
     :param subtract_control: subtract the rate reported by
         self.get_normalizing_lib_with_mod() for the same position (no floor)
     :param exclude_constitutive: store 0 for positions whose nucleotide has
         exclude_constitutive set, instead of computing a rate
     :raises SyntaxError: when a position has to be computed while both
         subtract_background and subtract_control are set
     :return: None; the result is only written to output_name
     """
     rates_by_rRNA = {}
     for rRNA_name in self.rRNA_mutation_data:
         per_position = {}
         rates_by_rRNA[rRNA_name] = per_position
         nucleotides = self.rRNA_mutation_data[rRNA_name].nucleotides
         for pos in nucleotides:
             nt = nucleotides[pos]
             if exclude_constitutive and nt.exclude_constitutive:
                 per_position[pos] = 0
                 continue
             # The two subtraction modes are mutually exclusive
             if subtract_background and subtract_control:
                 raise SyntaxError('Cannot subtract background and control simultaneously')
             rate = nt.mutation_rate
             if subtract_background:
                 background = self.get_normalizing_lib().get_mutation_rate_at_position(rRNA_name, nt.position)
                 rate = max((rate - background), 0.)
             elif subtract_control:
                 control = self.get_normalizing_lib_with_mod().get_mutation_rate_at_position(rRNA_name, nt.position)
                 rate = rate - control
             per_position[pos] = rate
     mod_utils.makePickle(rates_by_rRNA, output_name)
def count_reads(lib_settings):
    """
    Tally read starts, coverage and mutations from a library's mapped reads,
    pickle the tallies and draw summary plots.

    Reads the gzipped SAM file returned by lib_settings.get_mapped_reads_sam_gz().
    Header lines (starting with '@'), reads flagged as unmapped, and reads whose
    MAPQ is below lib_settings.get_property('min_mapping_quality') are skipped.
    Every remaining read contributes a weight of 1/NH (NH taken from its NH:i
    tag) to each counter it touches.

    Pickles written:
      - lib_settings.get_read_5p_counts(): read start counts per strand/chromosome/position
      - lib_settings.get_positional_coverage(): coverage per strand/chromosome/position
      - lib_settings.get_mutation_counts(): event counts per strand/chromosome/position
      - lib_settings.get_normalized_mutation_counts(): result of normalized_mutation_rates()
        applied to the mutation counts and coverage
      - lib_settings.get_counting_prefix() + one of '.genome_mutations.pkl',
        '.genome_position_mutations.pkl', '.genome_position_coverage.pkl',
        '.nt_mutations.pkl', '.read_mutations.pkl', '.read_position_mutations.pkl',
        '.read_position_coverage.pkl', '.deletion_sizes.pkl', '.insertion_sizes.pkl'

    The plotting helpers are then called on data re-read from those pickles.

    :param lib_settings: library settings object providing the accessors used above
    :return: None
    :raises AssertionError: if a read passing the filters lacks the expected
        NM:i, MD:Z or NH:i tag text, or if an insertion/deletion event shows up
        in the genome-/read-relative event list where it is not expected
    """

    # Create empty dicts for storing counts data
    # NOTE(review): `strands` is not defined in this function; presumably a
    # module-level constant -- confirm.
    srt_dict = createStrandDict(strands) # Counts of read 5' ends (the standard data format here)
    cov_dict = createStrandDict(strands) # Counts of times a position is covered by a read
    mut_dict = createStrandDict(strands) # Counts of mismatches/events at a position
    read_mutations = defaultdict(int) # weighted counts of each event type, relative to the read
    genome_mutations = defaultdict(int) # weighted counts of each event type, relative to the genome
    mutations_by_read_position = defaultdict(dict) # {index in read-relative event list: {event: weight}}
    read_position_coverage = defaultdict(float) # {index in read-relative event list: weight}
    mutations_by_genome_position = defaultdict(dict) # {index in genome-relative event list: {event: weight}}
    genome_position_coverage = defaultdict(float) # {index in genome-relative event list: weight}
    mutated_nts = defaultdict(float) # weight per first character of genome-relative events in 'ATCG'
    read_insertion_sizes = [] # second element of every read-relative 'I' event
    genomic_deletion_sizes = [] # second element of every genome-relative 'D' event

    # NOTE(review): the str operations below assume text lines; under Python 3
    # gzip mode 'r' yields bytes -- confirm the target interpreter.
    with gzip.open(lib_settings.get_mapped_reads_sam_gz(), 'r') as f:
        for line in f: # Iterate through SAM file lines
            if not line.startswith('@'):
                # Not a header line: parse it into the relevant strings

                fields = line.strip().split('\t')
                ID = fields[0] # first field is the read id (not used further in this function)
                flag = int(fields[1])
                '''
                The flag field provides a lot of info about the read, it is the decimal representation of a bit string, each digit of which is true or false

                Bit 0 = The read was part of a pair during sequencing
                Bit 1 = The read is mapped in a pair
                Bit 2 = The query sequence is unmapped
                Bit 3 = The mate is unmapped
                Bit 4 = Strand of query (0=forward 1=reverse)
                So, to see if a flag represents a read on the  - strand, we evaluate (16 & 'flag'), where & is the bitwise and operator,
                which will be non-zero (True) only if this read is on the - strand
                '''
                if (4&flag): # unmapped read: nothing to count
                    continue

                if (16&flag):
                    strand = '-'
                else:
                    strand = '+'
                chrom = fields[2]
                MAPQ = int(fields[4])
                # MAPQ is already an int here; the extra int() is redundant but harmless
                if int(MAPQ) >= lib_settings.get_property('min_mapping_quality'):
                    cigarString = fields[5]
                    seq = fields[9]
                    mappingLength = len(seq)
                    qScores = fields[10] # read but not used further in this function
                    # Some lines lack some optional fields, which throws off positional
                    # indexing of the NM:i, MD:Z and NH:i strings, so they are looked up
                    # by tag (presumably checkTag returns the matching field -- confirm).
                    NHstr = checkTag('NH:i:',fields)
                    NMstr = checkTag('NM:i:',fields)
                    MDstr = checkTag('MD:Z:',fields)
                    assert 'NM:i' in NMstr
                    assert 'MD:Z' in MDstr
                    assert 'NH:i' in NHstr
                    # Third ':'-separated piece of the NH string is the value
                    multiplicity = float(NHstr.split(':')[2])

                    # NOTE(review): repeats the split done at the top of the loop body
                    fields = line.strip().split('\t')
                    counts = float(1.0/multiplicity) # Weight of read
                    MDzString = MDstr

                    # Add subdicts for chromosome if needed
                    if chrom not in srt_dict[strand]:
                        srt_dict[strand][chrom] = defaultdict(float)
                    if chrom not in cov_dict[strand]:
                        cov_dict[strand][chrom] = defaultdict(float)
                    if chrom not in mut_dict[strand]:
                        mut_dict[strand][chrom] = defaultdict(float)

                    # Parse cigar string, get genome mapping span, and relative genomic positions covered by read
                    rel_genomic_event_positions, rel_genome_coverage, mutations_rel_genome, mutations_rel_read, readMappingSpan, genomeMappingSpan = parse_MDz_and_cigar(cigarString, MDzString, mappingLength, seq)

                    # Tally events relative to the genome, keyed by index into the event list
                    for pos in range(len(mutations_rel_genome)):
                        genome_position_coverage[pos] += counts
                        event = mutations_rel_genome[pos]
                        if not event == 'M': #count if it's not a match
                            assert event[0] != 'I'
                            if event[0] == 'D':
                                # record event[1] as the deletion size, then collapse the event to 'D'
                                genomic_deletion_sizes.append(event[1])
                                event = event[0]
                            if event not in mutations_by_genome_position[pos]:
                                 mutations_by_genome_position[pos][event] = 0
                            mutations_by_genome_position[pos][event] += counts
                            genome_mutations[event] += counts
                            if event[0] in 'ATCG':
                                mutated_nts[event[0]] += counts

                    # Tally events relative to the read, keyed by index into the event list
                    for pos in range(len(mutations_rel_read)):
                        read_position_coverage[pos] += counts
                        event = mutations_rel_read[pos]
                        if not event == 'M': #count if it's not a match
                            assert event[0] != 'D'
                            if event[0] == 'I':
                                # record event[1] as the insertion size, then collapse the event to 'I'
                                read_insertion_sizes.append(event[1])
                                event = event[0]
                            if event not in mutations_by_read_position[pos]:
                                 mutations_by_read_position[pos][event] = 0
                            mutations_by_read_position[pos][event] += counts
                            read_mutations[event] += counts

                    # Set start position of read
                    if strand== '+':
                        start=int(fields[3])
                    else:
                        # When a read maps to the minus strand, bowtie returns the reverse complement and
                        # reports where this reverse complement mapped on the + strand. Thus the original
                        # 5' end of the read is actually genomeMappingSpan-1 nt downstream on the + strand.
                        start=int(fields[3])+genomeMappingSpan-1

                    # translate relative positions to absolute positions
                    genome_cov = readGenomicCoverage(rel_genome_coverage, strand, start) # get genome coverage

                    srt_dict[strand][chrom][start] += counts #just add the number of counts to that start position
                    for pos in genome_cov: # Increment positions for coverage dict
                        cov_dict[strand][chrom][pos] += counts

                    # Translate the relative event positions to absolute genomic positions and increment counters
                    genMismatches = readGenomicCoverage(rel_genomic_event_positions, strand, start)
                    for event_position in genMismatches:
                        mut_dict[strand][chrom][event_position] += counts




    # Persist the per-strand/chromosome/position tallies
    mod_utils.makePickle(srt_dict, lib_settings.get_read_5p_counts())
    mod_utils.makePickle(cov_dict, lib_settings.get_positional_coverage())
    mod_utils.makePickle(mut_dict, lib_settings.get_mutation_counts())

    # Persist the genome-relative event statistics
    mod_utils.makePickle(genome_mutations, lib_settings.get_counting_prefix() + '.genome_mutations.pkl')
    mod_utils.makePickle(mutations_by_genome_position, lib_settings.get_counting_prefix() + '.genome_position_mutations.pkl')
    mod_utils.makePickle(genome_position_coverage, lib_settings.get_counting_prefix() + '.genome_position_coverage.pkl')

    mod_utils.makePickle(mutated_nts, lib_settings.get_counting_prefix() + '.nt_mutations.pkl')

    # Persist the read-relative event statistics
    mod_utils.makePickle(read_mutations, lib_settings.get_counting_prefix() + '.read_mutations.pkl')
    mod_utils.makePickle(mutations_by_read_position, lib_settings.get_counting_prefix() + '.read_position_mutations.pkl')
    mod_utils.makePickle(read_position_coverage, lib_settings.get_counting_prefix() + '.read_position_coverage.pkl')

    mod_utils.makePickle(genomic_deletion_sizes, lib_settings.get_counting_prefix() + '.deletion_sizes.pkl')

    mod_utils.makePickle(read_insertion_sizes, lib_settings.get_counting_prefix() + '.insertion_sizes.pkl')

    # From here on, inputs are re-read from the pickles written above rather
    # than taken from the in-memory objects.
    normalized_mutations = normalized_mutation_rates(mod_utils.unPickle(lib_settings.get_mutation_counts()), mod_utils.unPickle(lib_settings.get_positional_coverage()))
    mod_utils.makePickle(normalized_mutations, lib_settings.get_normalized_mutation_counts())



    # Summary plots, each written under the library's counting prefix
    plot_mutated_nts_pie(mod_utils.unPickle(lib_settings.get_counting_prefix() + '.nt_mutations.pkl'), 'mutated rRNA nts in ' + lib_settings.sample_name, lib_settings.get_counting_prefix()+'.mutated_nts' )
    plot_full_mutation_stats(mod_utils.unPickle(lib_settings.get_counting_prefix() + '.read_mutations.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.insertion_sizes.pkl'),
                             mod_utils.unPickle(lib_settings.get_counting_prefix() + '.read_position_mutations.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.read_position_coverage.pkl'), 'mutations wrt reads', "insertion size",
                             lib_settings.get_counting_prefix()+'.read_mutations')
    plot_full_mutation_stats(mod_utils.unPickle(lib_settings.get_counting_prefix() + '.genome_mutations.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.deletion_sizes.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.genome_position_mutations.pkl'),
                             mod_utils.unPickle(lib_settings.get_counting_prefix() + '.genome_position_coverage.pkl'), 'mutations wrt genome', "deletion size",
                             lib_settings.get_counting_prefix()+'.genome_mutations')
    pie_read_5p_ends(mod_utils.unPickle(lib_settings.get_read_5p_counts()), mod_utils.convertFastaToDict(lib_settings.experiment_settings.get_rRNA_fasta()), lib_settings.get_counting_prefix())
    normed_mutation_rate_histogram(mod_utils.unPickle(lib_settings.get_normalized_mutation_counts()), lib_settings.sample_name, lib_settings.get_counting_prefix())