コード例 #1
0
ファイル: compare_samples.py プロジェクト: borisz264/mod_seq
def main():
    """Compare experimental mutation-rate pickles against a control pickle.

    Command line arguments (positional):
        1. outfolder               -- directory that receives every output file
        2. genome_fasta            -- accepted but not used by this function
        3. normalization_file_name -- pickled control data set
        4+ experimental pickles    -- one or more data sets to compare

    Each data set is labelled with its file basename minus the final
    dot-separated extension.  For every experiment the rescaled data is
    written as a wig file, and the experiment/control comparison returned by
    compare_to_control is pickled and written as a wig file.  Histograms are
    drawn for the rescaled experiments and for the unrescaled comparisons.
    """
    outfolder, genome_fasta, normalization_file_name = sys.argv[1:4]
    experimental_file_names = sys.argv[4:]

    def dataset_label(path):
        # Basename with only its last '.'-separated component removed.
        return '.'.join(os.path.basename(path).split('.')[:-1])

    control_dict = mod_utils.unPickle(normalization_file_name)
    rescaled_control_dict = normalize_dict_to_max(control_dict)
    norm_name = dataset_label(normalization_file_name)
    experimental_dict_names = [dataset_label(path) for path in experimental_file_names]
    experimental_dicts = [mod_utils.unPickle(path) for path in experimental_file_names]
    rescaled_experimental_dicts = [normalize_dict_to_max(loaded) for loaded in experimental_dicts]

    # Single formatted string: emits the same text as printing the two values
    # separated by a space.
    print('%s %s' % (experimental_dict_names, norm_name))

    rescaled_histogram_prefix = os.path.join(outfolder, '%s_rescaled_mutation_rate_histogram' % norm_name)
    normed_mutation_rate_histogram(
        rescaled_experimental_dicts,
        experimental_dict_names,
        rescaled_histogram_prefix,
        title='mutation rate, rescaled to max',
        xlim=(0, 0.1),
        min=0,
        max=1,
        step=0.001,
    )

    comparisons = []
    rescaled_comparisons = []
    # Note: the control is written un-rescaled, the experiments rescaled.
    write_wig(control_dict, norm_name, os.path.join(outfolder, norm_name))

    samples = zip(experimental_dict_names, experimental_dicts, rescaled_experimental_dicts)
    for sample_label, raw_dict, rescaled_dict in samples:
        write_wig(rescaled_dict, sample_label, os.path.join(outfolder, sample_label))

        log2_ratios = compare_to_control(raw_dict, control_dict)
        rescaled_log2_ratios = compare_to_control(rescaled_dict, rescaled_control_dict)
        comparisons.append(log2_ratios)
        # Rescaled ratios are collected but not currently written out or plotted.
        rescaled_comparisons.append(rescaled_log2_ratios)

        comparison_label = sample_label + '_comparison_log2'
        mod_utils.makePickle(log2_ratios, os.path.join(outfolder, sample_label + '_comparison_log2.pkl'))
        write_wig(log2_ratios, comparison_label, os.path.join(outfolder, comparison_label))

    comparison_histogram_prefix = os.path.join(outfolder, '%s_comparison_histogram' % norm_name)
    normed_mutation_rate_histogram(
        comparisons,
        experimental_dict_names,
        comparison_histogram_prefix,
        title='log2 experiment/control',
        xlim=(-10, 10),
        min=-100,
        max=100,
        step=0.1,
    )
コード例 #2
0
def main():
    """Write copies of five PDB bundle files with reactivities in the B-factor column.

    Command line arguments (positional):
        1. outprefix     -- output files are named <outprefix>_bundle<N>.pdb
        2-6. bundle1..5  -- the five input PDB bundle files, in order
        7. datafile_name -- pickle loaded with mod_utils.unPickle; indexed as
                            reactivities['+'][rRNA name][residue number]

    For every ATOM record, characters 60-66 of the line are replaced with the
    reactivity of that residue when the module-level rRNA_assignments maps
    the (bundle number, chain) pair to an rRNA and the residue has a value;
    otherwise they are replaced with zero.  ANISOU records are dropped and
    all other lines are copied unchanged.

    Raises:
        AssertionError: if a rewritten ATOM line differs in length from the
            original (e.g. a value too wide for the 6-character field).
    """
    outprefix, bundle1, bundle2, bundle3, bundle4, bundle5, datafile_name = sys.argv[1:8]

    bundles = [bundle1, bundle2, bundle3, bundle4, bundle5]
    reactivities = mod_utils.unPickle(datafile_name)
    for i, bundle_path in enumerate(bundles, 1):
        out_path = outprefix + '_bundle' + str(i) + '.pdb'
        # Context managers guarantee both handles are closed even when a line
        # fails to parse or a length assertion fires part-way through a file.
        with open(bundle_path) as infile, open(out_path, 'w') as outfile:
            for line in infile:
                if line.startswith('ATOM'):
                    chain = line[21]
                    resi = int(line[22:28].strip())
                    if i in rRNA_assignments and chain in rRNA_assignments[i] and resi in reactivities['+'][rRNA_assignments[i][chain]]:
                        rrna_name = rRNA_assignments[i][chain]
                        new_line = '%s%6.3f%s' % (line[:60], reactivities['+'][rrna_name][resi], line[66:])
                    else:
                        # No assignment or no data for this residue: zero the field.
                        new_line = '%s%6.4f%s' % (line[:60], 0.0, line[66:])
                    # The replacement must keep the fixed-width layout intact.
                    assert len(line) == len(new_line)
                elif line.startswith("ANISOU"):
                    # Anisotropic B-factor records are not needed; drop them.
                    continue
                else:
                    new_line = line

                outfile.write(new_line)
コード例 #3
0
def main():
    """Plot ROC curves for one read-5'-end data set against several annotations.

    Command line arguments (positional):
        1. read_5p_ends_file -- pickle of read 5' end data
        2. genome_fasta      -- FASTA file converted with convertFastaToDict
        3. outprefix         -- passed through to plot_ROC_curves
        4+ annotation files  -- each parsed by get_tp_tn into a
                                (true positives, true negatives) pair

    The data is normalised once for 'S.c.18S_rRNA' on the '+' strand, then
    positives are called at every cutoff from 0 to just above 1 in steps of
    0.0001.  For each annotation file (keyed by basename) the percentage of
    true negatives called (x) and true positives called (y) is recorded.
    """
    read_5p_ends_file, genome_fasta, outprefix = sys.argv[1:4]
    tp_tn_annotations = sys.argv[4:]  # true positive and true negative annotations
    genome_dict = mod_utils.convertFastaToDict(genome_fasta)
    read_5p_ends = mod_utils.unPickle(read_5p_ends_file)
    normed_density_array = winsorize_norm_chromosome_data(read_5p_ends, 'S.c.18S_rRNA', '+', genome_dict, 'ACTG')

    # One (label, true positives, true negatives) triple per annotation file.
    annotation_sets = []
    for annotation_path in tp_tn_annotations:
        true_positives, true_negatives = get_tp_tn(annotation_path)
        annotation_sets.append((os.path.basename(annotation_path), true_positives, true_negatives))

    # label -> [x values, y values]
    roc_curves = {label: [[], []] for label, _positives, _negatives in annotation_sets}

    stepsize = 0.0001
    for cutoff in numpy.arange(0, 1.+5*stepsize, stepsize):
        called_p = call_positives(normed_density_array, 'S.c.18S_rRNA', '+', genome_dict, 'AC', cutoff)
        for label, true_positives, true_negatives in annotation_sets:
            tp_hits = len(called_p.intersection(true_positives))  # true positives called at this cutoff
            fp_hits = len(called_p.intersection(true_negatives))  # false positives called at this cutoff
            fp_axis, tp_axis = roc_curves[label]
            fp_axis.append(100.*fp_hits/float(len(true_negatives)))  # FP rate on x axis
            tp_axis.append(100.*tp_hits/float(len(true_positives)))  # TP rate on y axis

    plot_ROC_curves(roc_curves, outprefix)
コード例 #4
0
def main():
    """Plot ROC curves for several mutation-density pickles on 'S.c.25S__rRNA'.

    Command line arguments (positional):
        1. tp_tn_annotations -- annotation file parsed by get_tp_tn
        2. genome_fasta      -- FASTA file converted with convertFastaToDict
        3. outprefix         -- passed through to plot_ROC_curves
        4+ density pickles   -- one per sample; the sample name is the file
                                basename up to the first "_back_"

    Positives are called for every sample at every cutoff from 0 to just
    above 1 in steps of 0.0001, and the percentage of true positives (y) and
    true negatives (x) called is recorded per sample.
    """
    tp_tn_annotations, genome_fasta, outprefix = sys.argv[1:4]
    density_files = sys.argv[4:]
    sample_names = [os.path.basename(path).split("_back_")[0] for path in density_files]
    mutation_densities = [mod_utils.unPickle(path) for path in density_files]

    genome_dict = mod_utils.convertFastaToDict(genome_fasta)
    normed_density_arrays = []
    for density in mutation_densities:
        normed_density_arrays.append(
            winsorize_norm_chromosome_data(density, "S.c.25S__rRNA", genome_dict, "AC")
        )
    real_tp, real_tn = get_tp_tn(tp_tn_annotations)

    # sample name -> [x values, y values]
    roc_curves = {name: [[], []] for name in sample_names}

    stepsize = 0.0001
    for cutoff in numpy.arange(0, 1.0 + 5 * stepsize, stepsize):
        for name, normed_density in zip(sample_names, normed_density_arrays):
            # The FASTA file must be the exact one used for the pipeline, and
            # the chromosome name below must match its entry in that file.
            called_p = call_positives(normed_density, "S.c.25S__rRNA", genome_dict, "AC", cutoff)
            tp_hits = len(called_p.intersection(real_tp))  # true positives called at this cutoff
            fp_hits = len(called_p.intersection(real_tn))  # false positives called at this cutoff
            fp_axis, tp_axis = roc_curves[name]
            tp_axis.append(100.0 * tp_hits / float(len(real_tp)))  # TP rate on y axis
            fp_axis.append(100.0 * fp_hits / float(len(real_tn)))  # FP rate on x axis

    plot_ROC_curves(roc_curves, "S.c.25S__rRNA", outprefix)
コード例 #5
0
def main():
    """Run subtraction and division normalisation for each requested pair.

    Command line arguments (positional):
        1. all_counts_file -- pickle of counts
        2. all_depths_file -- pickle of depths
        3. min_mutations   -- integer threshold handed to both norm functions
        4. output_prefix   -- prefix for every output file
        5+ comparisons     -- comma-separated pairs, e.g. "A,B"

    For each pair the rates and errors returned by subtraction_norm and
    division_norm are pickled, and one text file per key of the subtracted
    rates is written with write_out_counts.
    """
    all_counts_file, all_depths_file, min_mutations, output_prefix = sys.argv[1:5]
    min_mutations = int(min_mutations)
    all_counts = mod_utils.unPickle(all_counts_file)
    all_depths = mod_utils.unPickle(all_depths_file)

    for pair_argument in sys.argv[5:]:
        comparison = pair_argument.split(',')
        subtracted_rates, subtraction_errors = subtraction_norm(all_counts, all_depths, min_mutations, comparison)
        divided_rates, division_errors = division_norm(all_counts, all_depths, min_mutations, comparison)

        # Shared "<prefix>_<first>_<second>" stem for every file of this pair.
        pair_stem = '%s_%s_%s' % (output_prefix, comparison[0], comparison[1])
        pickled_outputs = (
            (subtracted_rates, '_sub_norm.pkl'),
            (subtraction_errors, '_sub_err.pkl'),
            (divided_rates, '_div_norm.pkl'),
            (division_errors, '_div_err.pkl'),
        )
        for payload, suffix in pickled_outputs:
            mod_utils.makePickle(payload, pair_stem + suffix)

        for rRNA in subtracted_rates:
            write_out_counts(subtracted_rates, subtraction_errors, divided_rates, division_errors, rRNA,
                             '%s_%s.txt' % (pair_stem, rRNA))
コード例 #6
0
ファイル: simulate_gel.py プロジェクト: borisz264/mod_seq
def generate_single_mutation_rates_dict(chromosome, start, stop, folder, file_names, strip_suffix):
    """Collect per-position mutation rates for one region from several pickles.

    Args:
        chromosome: key used to index each unpickled data set.
        start: first position of the region (inclusive).
        stop: last position of the region (inclusive).
        folder: directory containing the pickled files.
        file_names: names of the pickled files inside ``folder``.
        strip_suffix: trailing text removed from each file name to form its
            label.  It is removed only when the name actually ends with it;
            an empty suffix leaves the name unchanged.

    Returns:
        dict mapping each label to a list of floats, one per position from
        ``start`` to ``stop``; positions absent from the data are 0.0.
    """
    combined_mutation_rates = {}
    for file_name in file_names:
        # str.rstrip() removes any trailing run of the given *characters*
        # (e.g. 'bulk.pkl'.rstrip('.pkl') == 'bu'), so strip the suffix
        # explicitly to avoid mangled or colliding labels.
        if strip_suffix and file_name.endswith(strip_suffix):
            dataset_label = file_name[:-len(strip_suffix)]
        else:
            dataset_label = file_name
        mutation_rates = mod_utils.unPickle(os.path.join(folder, file_name))

        chromosome_rates = mutation_rates[chromosome]
        mutation_array = [
            float(chromosome_rates[position]) if position in chromosome_rates else 0.0
            for position in range(start, stop + 1)
        ]
        combined_mutation_rates[dataset_label] = mutation_array
    return combined_mutation_rates
コード例 #7
0
def main():
    """Subtract a background data set from each experimental data set.

    Command line arguments (positional):
        1. outfolder               -- output directory (created via mod_utils.make_dir)
        2. genome_fasta            -- FASTA file passed to plot_weighted_nts_pie
        3. normalization_file_name -- pickled background data set
        4+ experimental pickles    -- one or more data sets to process

    Each data set is labelled with its file basename minus the last two
    dot-separated components.  For every experiment the raw data is written
    as a wig file, and the result of subtract_background is pickled, written
    as a wig file and (best effort) drawn as a pie chart.  Histograms are
    drawn for the raw and the background-subtracted data.
    """
    outfolder, genome_fasta, normalization_file_name = sys.argv[1:4]
    experimental_file_names = sys.argv[4:]
    mod_utils.make_dir(outfolder)
    normalization_dict = mod_utils.unPickle(normalization_file_name)
    norm_name = '.'.join(os.path.basename(normalization_file_name).split('.')[:-2])
    experimental_dict_names = ['.'.join(os.path.basename(file_name).split('.')[:-2]) for file_name in experimental_file_names]
    experimental_dicts = [mod_utils.unPickle(file_name) for file_name in experimental_file_names]

    normed_mutation_rate_histogram(experimental_dicts, experimental_dict_names, os.path.join(outfolder, 'mutation_rate_histogram'), title='nonzero positions')
    background_subtracted_sets = []
    write_wig(normalization_dict, norm_name, os.path.join(outfolder, norm_name))
    for sample_name, experimental_dict in zip(experimental_dict_names, experimental_dicts):
        write_wig(experimental_dict, sample_name, os.path.join(outfolder, sample_name))
        background_subtracted = subtract_background(experimental_dict, normalization_dict)
        background_subtracted_sets.append(background_subtracted)
        mod_utils.makePickle(background_subtracted, os.path.join(outfolder, sample_name+'_subtracted.pkl'))
        write_wig(background_subtracted, sample_name+'_subtracted', os.path.join(outfolder, sample_name+'_subtracted'))
        try:
            plot_weighted_nts_pie(background_subtracted, genome_fasta, '%s backround-subtracted fractions' % sample_name, os.path.join(outfolder, sample_name+'_sub_pie'))
        except Exception as error:
            # The pie chart is best-effort: keep processing the remaining
            # samples, but report the failure instead of hiding it.  Catching
            # Exception (not a bare except) lets Ctrl-C and SystemExit through.
            sys.stderr.write('WARNING: could not plot pie chart for %s: %s\n' % (sample_name, error))
    normed_mutation_rate_histogram(background_subtracted_sets, experimental_dict_names, os.path.join(outfolder, 'back_subtracted_mutation_rate_histogram'), title = 'nonzero positions, background subtracted')
コード例 #8
0
def main():
    """Plot ROC curves for several mutation-density pickles on 'S.c.18S_rRNA'.

    Command line arguments (positional):
        1. tp_tn_annotations -- annotation file parsed by get_tp_tn
        2. genome_fasta      -- FASTA file converted with convertFastaToDict
        3. outprefix         -- passed through to plot_ROC_curves
        4+ density pickles   -- one per sample; the sample name is the file
                                basename

    Every sample is normalised on the '+' strand, then positives are called
    at every cutoff from 0 to just above 1 in steps of 0.0001 and the
    percentage of true positives (y) and true negatives (x) called is
    recorded per sample.
    """
    tp_tn_annotations, genome_fasta, outprefix = sys.argv[1:4]
    density_files = sys.argv[4:]
    sample_names = [os.path.basename(path) for path in density_files]
    mutation_densities = [mod_utils.unPickle(path) for path in density_files]

    genome_dict = mod_utils.convertFastaToDict(genome_fasta)
    normed_density_arrays = []
    for density in mutation_densities:
        normed_density_arrays.append(
            winsorize_norm_chromosome_data(density, 'S.c.18S_rRNA', '+', genome_dict, 'ACTG')
        )
    real_tp, real_tn = get_tp_tn(tp_tn_annotations)

    # sample name -> [x values, y values]
    roc_curves = {name: [[], []] for name in sample_names}

    stepsize = 0.0001
    for cutoff in numpy.arange(0, 1.+5*stepsize, stepsize):
        for name, normed_density in zip(sample_names, normed_density_arrays):
            called_p = call_positives(normed_density, 'S.c.18S_rRNA', '+', genome_dict, 'AC', cutoff)
            tp_hits = len(called_p.intersection(real_tp))  # true positives called at this cutoff
            fp_hits = len(called_p.intersection(real_tn))  # false positives called at this cutoff
            fp_axis, tp_axis = roc_curves[name]
            tp_axis.append(100.*tp_hits/float(len(real_tp)))  # TP rate on y axis
            fp_axis.append(100.*fp_hits/float(len(real_tn)))  # FP rate on x axis

    plot_ROC_curves(roc_curves, outprefix)
コード例 #9
0
def count_reads(lib_settings):
    """
    Tally read starts, coverage and mutation events from a gzipped SAM file,
    pickle the tallies and draw summary plots.

    The file at lib_settings.get_mapped_reads_sam_gz() is read line by line.
    Header lines (starting with '@'), unmapped reads (flag bit 4) and reads
    whose MAPQ is below lib_settings.get_property('min_mapping_quality') are
    skipped.  Every remaining read is weighted by 1/NH (the value of its
    NH:i tag) and that weight is added to:
      * the read start position (srt_dict),
      * every position returned by readGenomicCoverage for the read's
        coverage (cov_dict),
      * every position returned by readGenomicCoverage for the read's
        events (mut_dict),
    as well as to several per-event and per-index summary tables.

    All tables are written with mod_utils.makePickle to paths supplied by
    lib_settings, normalized mutation rates are computed from the pickled
    mutation and coverage counts, and the plotting helpers are called on the
    re-loaded pickles.

    Args:
        lib_settings: settings object providing the input/output paths,
            the 'min_mapping_quality' property, sample_name and
            experiment_settings used below.

    Returns:
        None.  Results are written to disk.

    Raises:
        AssertionError: if the strings returned by checkTag do not contain
            the expected tag names, or if an insertion appears in the
            genome-relative event list / a deletion in the read-relative one.
    """

    # Create empty dicts for storing counts data
    srt_dict = createStrandDict(strands) # Counts for 5' end of read our standard data format
    cov_dict = createStrandDict(strands) # Counts of times covered by a read
    mut_dict = createStrandDict(strands) # Counts of mismatches at a position
    read_mutations = defaultdict(int) #counts different types of mutations relative to read
    genome_mutations = defaultdict(int) #counts different types of mutations relative to genome
    # index within the per-read event list -> {event: weighted count}
    mutations_by_read_position = defaultdict(dict)
    # index within the per-read event list -> summed read weight
    read_position_coverage = defaultdict(float)
    mutations_by_genome_position = defaultdict(dict)
    genome_position_coverage = defaultdict(float)
    # first character of a genome-relative event (if in 'ATCG') -> weighted count
    mutated_nts = defaultdict(float)
    read_insertion_sizes = []
    genomic_deletion_sizes = []

    with gzip.open(lib_settings.get_mapped_reads_sam_gz(), 'r') as f:
        for line in f: # Iterate through SAM file lines
            if not line.startswith('@'):
                # Parse line into relevant strings

                fields = line.strip().split('\t')
                ID = fields[0] #the first field in the mapped file corresponds to a unique id number for that read- these should correspond to the names in the raw_seqs dictionary
                flag = int(fields[1])
                # NOTE: the bare string below is a no-op expression statement used as a block comment.
                '''
                The flag field provides a lot of info about the read, it is the decimal representation of a bit string, each digit of which is true or false

                Bit 0 = The read was part of a pair during sequencing
                Bit 1 = The read is mapped in a pair
                Bit 2 = The query sequence is unmapped
                Bit 3 = The mate is unmapped
                Bit 4 = Strand of query (0=forward 1=reverse)
                So, to see if a flag represents a read on the  - strand, we evaluate (16 & 'flag'), where & is the bitwise and operator,
                which will be non-zero (True) only if this read is on the - strand
                '''
                if (4&flag):#if this is an unmapped read, don't bother
                    continue

                if (16&flag):
                    strand = '-'
                else:
                    strand = '+'
                chrom = fields[2]
                MAPQ = int(fields[4])
                # Reads below the configured mapping quality are ignored entirely.
                if int(MAPQ) >= lib_settings.get_property('min_mapping_quality'):
                    cigarString = fields[5]
                    seq = fields[9]
                    mappingLength = len(seq)
                    qScores = fields[10]
                    # Some lines seem to lack some strings; this throws off indexing of the NM:i, MD:Z, and NH:i strings,
                    # so each tag is looked up with checkTag instead of by a fixed column index.
                    NHstr = checkTag('NH:i:',fields)
                    NMstr = checkTag('NM:i:',fields)
                    MDstr = checkTag('MD:Z:',fields)
                    assert 'NM:i' in NMstr
                    assert 'MD:Z' in MDstr
                    assert 'NH:i' in NHstr
                    # The third ':'-separated piece of the NH string is the value.
                    multiplicity = float(NHstr.split(':')[2])

                    # NOTE(review): this re-split repeats the one above and looks redundant.
                    fields = line.strip().split('\t')
                    counts = float(1.0/multiplicity) # Weight of read
                    MDzString = MDstr

                    # Add subdicts for chromosome if needed
                    if chrom not in srt_dict[strand]:
                        srt_dict[strand][chrom] = defaultdict(float)
                    if chrom not in cov_dict[strand]:
                        cov_dict[strand][chrom] = defaultdict(float)
                    if chrom not in mut_dict[strand]:
                        mut_dict[strand][chrom] = defaultdict(float)

                    # Parse cigar string, get genome mapping span, and relative genomic positions covered by read
                    rel_genomic_event_positions, rel_genome_coverage, mutations_rel_genome, mutations_rel_read, readMappingSpan, genomeMappingSpan = parse_MDz_and_cigar(cigarString, MDzString, mappingLength, seq)

                    # Tally events indexed by their place in the genome-relative event list.
                    for pos in range(len(mutations_rel_genome)):
                        genome_position_coverage[pos] += counts
                        event = mutations_rel_genome[pos]
                        if not event == 'M': #count if it's not a match
                            assert event[0] != 'I'
                            if event[0] == 'D':
                                # Record the second element of the event as the deletion size, then collapse the event to 'D'.
                                genomic_deletion_sizes.append(event[1])
                                event = event[0]
                            if event not in mutations_by_genome_position[pos]:
                                 mutations_by_genome_position[pos][event] = 0
                            mutations_by_genome_position[pos][event] += counts
                            genome_mutations[event] += counts
                            if event[0] in 'ATCG':
                                mutated_nts[event[0]] += counts

                    # Tally events indexed by their place in the read-relative event list.
                    for pos in range(len(mutations_rel_read)):
                        read_position_coverage[pos] += counts
                        event = mutations_rel_read[pos]
                        if not event == 'M': #count if it's not a match
                            assert event[0] != 'D'
                            if event[0] == 'I':
                                # Record the second element of the event as the insertion size, then collapse the event to 'I'.
                                read_insertion_sizes.append(event[1])
                                event = event[0]
                            if event not in mutations_by_read_position[pos]:
                                 mutations_by_read_position[pos][event] = 0
                            mutations_by_read_position[pos][event] += counts
                            read_mutations[event] += counts

                    # Set start position of read
                    if strand== '+':
                        start=int(fields[3])
                    else:
                        #When a read maps to the minus strand, bowtie returns the reverse complement, and indicates
                        # where this reverse mapped on the + strand. Thus the original 5' end of the read actually
                        # was x nt downstream on the + strand
                        start=int(fields[3])+genomeMappingSpan-1

                    # translate relative positions to absolute positions
                    genome_cov = readGenomicCoverage(rel_genome_coverage, strand, start) # get genome coverage

                    srt_dict[strand][chrom][start] += counts #just add the number of counts to that start position
                    for pos in genome_cov: # Increment positions for coverage dict
                        cov_dict[strand][chrom][pos] += counts

                    # If mismatches need to parse, get the absolute genomic pos, and increment counters
                    genMismatches = readGenomicCoverage(rel_genomic_event_positions, strand, start)
                    for event_position in genMismatches:
                        mut_dict[strand][chrom][event_position] += counts




    # Persist the per-strand, per-chromosome tables.
    mod_utils.makePickle(srt_dict, lib_settings.get_read_5p_counts())
    mod_utils.makePickle(cov_dict, lib_settings.get_positional_coverage())
    mod_utils.makePickle(mut_dict, lib_settings.get_mutation_counts())

    # Persist the summary tables next to the counting prefix.
    mod_utils.makePickle(genome_mutations, lib_settings.get_counting_prefix() + '.genome_mutations.pkl')
    mod_utils.makePickle(mutations_by_genome_position, lib_settings.get_counting_prefix() + '.genome_position_mutations.pkl')
    mod_utils.makePickle(genome_position_coverage, lib_settings.get_counting_prefix() + '.genome_position_coverage.pkl')

    mod_utils.makePickle(mutated_nts, lib_settings.get_counting_prefix() + '.nt_mutations.pkl')

    mod_utils.makePickle(read_mutations, lib_settings.get_counting_prefix() + '.read_mutations.pkl')
    mod_utils.makePickle(mutations_by_read_position, lib_settings.get_counting_prefix() + '.read_position_mutations.pkl')
    mod_utils.makePickle(read_position_coverage, lib_settings.get_counting_prefix() + '.read_position_coverage.pkl')

    mod_utils.makePickle(genomic_deletion_sizes, lib_settings.get_counting_prefix() + '.deletion_sizes.pkl')

    mod_utils.makePickle(read_insertion_sizes, lib_settings.get_counting_prefix() + '.insertion_sizes.pkl')

    # Normalized rates are computed from the pickles just written (re-loaded from disk).
    normalized_mutations = normalized_mutation_rates(mod_utils.unPickle(lib_settings.get_mutation_counts()), mod_utils.unPickle(lib_settings.get_positional_coverage()))
    mod_utils.makePickle(normalized_mutations, lib_settings.get_normalized_mutation_counts())



    # All plots below read their input back from the pickled files rather than the in-memory tables.
    plot_mutated_nts_pie(mod_utils.unPickle(lib_settings.get_counting_prefix() + '.nt_mutations.pkl'), 'mutated rRNA nts in ' + lib_settings.sample_name, lib_settings.get_counting_prefix()+'.mutated_nts' )
    plot_full_mutation_stats(mod_utils.unPickle(lib_settings.get_counting_prefix() + '.read_mutations.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.insertion_sizes.pkl'),
                             mod_utils.unPickle(lib_settings.get_counting_prefix() + '.read_position_mutations.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.read_position_coverage.pkl'), 'mutations wrt reads', "insertion size",
                             lib_settings.get_counting_prefix()+'.read_mutations')
    plot_full_mutation_stats(mod_utils.unPickle(lib_settings.get_counting_prefix() + '.genome_mutations.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.deletion_sizes.pkl'), mod_utils.unPickle(lib_settings.get_counting_prefix() + '.genome_position_mutations.pkl'),
                             mod_utils.unPickle(lib_settings.get_counting_prefix() + '.genome_position_coverage.pkl'), 'mutations wrt genome', "deletion size",
                             lib_settings.get_counting_prefix()+'.genome_mutations')
    pie_read_5p_ends(mod_utils.unPickle(lib_settings.get_read_5p_counts()), mod_utils.convertFastaToDict(lib_settings.experiment_settings.get_rRNA_fasta()), lib_settings.get_counting_prefix())
    normed_mutation_rate_histogram(mod_utils.unPickle(lib_settings.get_normalized_mutation_counts()), lib_settings.sample_name, lib_settings.get_counting_prefix())