コード例 #1
0
ファイル: read_positions.py プロジェクト: AlexeyG/ribosomes
def read_file(file_name, specific_keys=None, show_progress=False):
    """Load every gene stored in an HDF5 read-positions file.

    Args:
        file_name: path of the HDF5 file to open read-only.
        specific_keys: forwarded unchanged to ``build_gene`` for each gene.
        show_progress: when true, iterate the gene names through
            ``utilities.progress_bar``.

    Returns:
        A dict mapping each top-level name in the file to the object
        ``build_gene`` returns for it.
    """
    with h5py.File(file_name, 'r') as source:
        names = source.keys()
        if show_progress:
            iterable = utilities.progress_bar(len(names), names)
        else:
            iterable = names
        # Everything is built while the file is still open; only the
        # finished dict leaves the ``with`` block.
        return {
            name: build_gene(source[name], specific_keys)
            for name in iterable
        }
コード例 #2
0
def call_3p_peaks():
    """Write the strongest 3' read position per transcript and experiment.

    For every selected 3'-end experiment and every CDS present in that
    experiment's ``three_prime_read_positions`` file, the offset in
    ``[0, 400)`` relative to ``'stop_codon'`` with the largest ``'all'``
    count is recorded.  A tab-separated table is then written to
    ``output_fn``: one row per transcript (ordered by the argmax in the
    ``'Cerevisiae_3Pseq'`` experiment), one column per experiment.

    Returns:
        None.  The only output is the text file at ``output_fn``.
    """
    gtf_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf'
    genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/'

    output_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_3p_lengths.txt'

    # The fetcher is not used below; the call is kept in case building it
    # has side effects -- TODO confirm and remove if not.
    region_fetcher = genomes.build_region_fetcher(genome_dir)
    CDSs = gtf.get_CDSs(gtf_fn)
    CDS_dict = {t.name: t for t in CDSs}

    experiments = build_all_experiments(verbose=False)

    three_prime_experiments = (
        [(n, e) for n, e in sorted(experiments['three_p_seq']['three_p_seq'].items())] +
        [(n, e) for n, e in sorted(experiments['three_t_fill_seq']['wilkening_nar'].items()) if '3tfill_ypd_rep1' in n] +
        [(n, e) for n, e in sorted(experiments['TIF_seq']['pelechano_nature'].items()) if n == 'ypd_bio1_lib1' or n == 'ypd_bio1_lib4']
    )

    argmaxes = {}
    # NOTE(review): fractions and joints are filled in but never read or
    # returned by this function.
    fractions = {}
    joints = {}
    for name, experiment in three_prime_experiments:
        # Parenthesised form prints the same text under Python 2 and 3.
        print(name)
        argmaxes[name] = {}
        fractions[name] = []
        joints[name] = []
        fn = experiment.file_names['three_prime_read_positions']
        # Close each experiment's HDF5 file as soon as it has been scanned
        # instead of leaking one open handle per experiment.
        with h5py.File(fn, 'r') as f:
            for transcript in utilities.progress_bar(len(CDSs), CDSs):
                if transcript.name not in f:
                    continue
                gene = Serialize.read_positions.build_gene(f[transcript.name])
                xs = np.arange(0, 400)

                argmax = gene['all'].argmax_over_slice('stop_codon', xs)
                argmaxes[name][transcript.name] = argmax
                most = gene['all']['stop_codon', argmax]
                total = gene['all']['stop_codon', xs].sum()
                # Only record a peak fraction when at least 10 counts
                # fall in the window.
                if total > 9:
                    fraction = np.true_divide(most, total)
                    fractions[name].append(fraction)
                    joints[name].append((argmax, fraction))

    with open(output_fn, 'w') as output_fh:
        name_order = sorted(argmaxes['Cerevisiae_3Pseq'],
                            key=argmaxes['Cerevisiae_3Pseq'].get)
        for name in name_order:
            output_fh.write('{0}\t'.format(str(CDS_dict[name])))
            for exp_name, _ in three_prime_experiments:
                # NOTE(review): raises KeyError if this transcript was
                # skipped for exp_name above -- confirm all files cover
                # the same transcripts.
                output_fh.write('{0}\t'.format(argmaxes[exp_name][name]))
            output_fh.write('\n')
コード例 #3
0
def produce_transcript_base_compositions():
    """Count the As in sliding windows along every transcript and save them.

    For each CDS returned by ``gff.get_CDSs`` and each window size in
    ``[5, 10, 20]``, a ``PositionCounts`` is filled so that the entry at
    ``('start', left_edge)`` holds the number of ``'A'`` positions in
    ``[left_edge, left_edge + window)``.  The nested dict
    ``{transcript name: {window: PositionCounts}}`` is written to
    ``composition_fn`` with ``Serialize.read_positions.write_file``.
    """
    gff_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gff'
    genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/'
    composition_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5'
    CDSs = gff.get_CDSs(gff_fn, genome_dir)

    left_buffer = 500
    right_buffer = 500
    windows = [5, 10, 20]

    genes = {}
    for transcript in utilities.progress_bar(len(CDSs), CDSs):
        per_window = genes[transcript.name] = {}
        transcript.build_coordinate_maps()

        landmarks = {
            'start': 0,
            'start_codon': transcript.transcript_start_codon,
            'stop_codon': transcript.transcript_stop_codon,
            'end': transcript.transcript_length,
        }
        sequence = transcript.get_transcript_sequence(left_buffer,
                                                      right_buffer)
        # Boolean track: True wherever the buffered sequence is an 'A'.
        is_A = positions.PositionCounts(landmarks,
                                        left_buffer,
                                        right_buffer,
                                        data=(sequence.data == 'A'))

        for window in windows:
            window_counts = positions.PositionCounts(landmarks,
                                                     left_buffer,
                                                     right_buffer)
            last_edge = transcript.CDS_length + right_buffer - window
            for left_edge in range(-left_buffer, last_edge):
                right_edge = left_edge + window
                window_counts['start', left_edge] = sum(
                    is_A['start', left_edge:right_edge])
            per_window[window] = window_counts

        transcript.delete_coordinate_maps()

    Serialize.read_positions.write_file(genes, composition_fn)
コード例 #4
0
ファイル: geometric_model.py プロジェクト: AlexeyG/ribosomes
def counts_from_read_positions_fn(read_positions_fn, key='all'):
    """Yield ``(gene_name, counts)`` for every gene in a read-positions file.

    Args:
        read_positions_fn: path of the HDF5 file to open read-only.
        key: which counts to yield.  The special value ``'nonzero'``
            yields ``gene['all'] - gene[0]``; any other value yields
            ``gene[key]`` after loading only ``str(key)``.

    Yields:
        Tuples of the gene name and the selected counts object.
    """
    # Opening inside ``with`` closes the HDF5 file once the generator is
    # exhausted or closed, instead of leaking the handle.
    with h5py.File(read_positions_fn, 'r') as hdf5_file:
        progress = utilities.progress_bar(len(hdf5_file), hdf5_file)
        for gene_name in progress:
            if key == 'nonzero':
                gene = Serialize.read_positions.build_gene(hdf5_file[gene_name], specific_keys={'all', '0'})
                # Loaded as '0' but read back as 0 -- presumably
                # build_gene converts numeric key strings to ints;
                # TODO confirm.
                counts = gene['all'] - gene[0]
            else:
                gene = Serialize.read_positions.build_gene(hdf5_file[gene_name], specific_keys={str(key)})
                counts = gene[key]
            yield gene_name, counts
コード例 #5
0
def counts_from_read_positions_fn(read_positions_fn, key='all'):
    """Yield ``(gene_name, counts)`` for every gene in a read-positions file.

    Args:
        read_positions_fn: path of the HDF5 file to open read-only.
        key: which counts to yield.  The special value ``'nonzero'``
            yields ``gene['all'] - gene[0]``; any other value yields
            ``gene[key]`` after loading only ``str(key)``.

    Yields:
        Tuples of the gene name and the selected counts object.
    """
    # Opening inside ``with`` closes the HDF5 file once the generator is
    # exhausted or closed, instead of leaking the handle.
    with h5py.File(read_positions_fn, 'r') as hdf5_file:
        progress = utilities.progress_bar(len(hdf5_file), hdf5_file)
        for gene_name in progress:
            if key == 'nonzero':
                gene = Serialize.read_positions.build_gene(
                    hdf5_file[gene_name], specific_keys={'all', '0'})
                # Loaded as '0' but read back as 0 -- presumably
                # build_gene converts numeric key strings to ints;
                # TODO confirm.
                counts = gene['all'] - gene[0]
            else:
                gene = Serialize.read_positions.build_gene(
                    hdf5_file[gene_name], specific_keys={str(key)})
                counts = gene[key]
            yield gene_name, counts
コード例 #6
0
def call_5p_peaks():
    """Tally the strongest 5' read position per transcript and experiment.

    For every selected 5'-end experiment and every CDS present in that
    experiment's ``five_prime_read_positions`` file, the offset in
    ``[-300, 0)`` relative to ``"start_codon"`` with the largest ``"all"``
    count is found.  Per experiment the function accumulates a ``Counter``
    of those offsets, and -- for transcripts with more than 9 counts in
    the window -- the fraction of counts at the peak.

    NOTE(review): the visible code neither returns nor stores these
    results; confirm whether a consumer is missing.
    """
    gtf_fn = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf"
    genome_dir = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/"
    # The fetcher is not used below; the call is kept in case building it
    # has side effects -- TODO confirm and remove if not.
    region_fetcher = genomes.build_region_fetcher(genome_dir)
    CDSs = gtf.get_CDSs(gtf_fn)

    experiments = build_all_experiments(verbose=False)

    five_prime_experiments = (
        [(n, e) for n, e in sorted(experiments["TL_seq"]["arribere_gr"].items()) if "TLSeq1" in n]
        + [(n, e) for n, e in sorted(experiments["TL_seq"]["park_nar"].items()) if n == "SMORE-seq_WT_TAP+_rep1"]
        + [
            (n, e)
            for n, e in sorted(experiments["TIF_seq"]["pelechano_nature"].items())
            if n == "ypd_bio1_lib1" or n == "ypd_bio1_lib4"
        ]
    )

    argmaxes = {}
    fractions = {}
    joints = {}
    for name, experiment in five_prime_experiments:
        # Parenthesised form prints the same text under Python 2 and 3.
        print(name)
        argmaxes[name] = Counter()
        fractions[name] = []
        joints[name] = []
        fn = experiment.file_names["five_prime_read_positions"]
        # Close each experiment's HDF5 file as soon as it has been scanned
        # instead of leaking one open handle per experiment.
        with h5py.File(fn, "r") as f:
            for transcript in utilities.progress_bar(len(CDSs), CDSs):
                if transcript.name not in f:
                    continue
                gene = Serialize.read_positions.build_gene(f[transcript.name])
                xs = np.arange(-300, 0)

                argmax = gene["all"].argmax_over_slice("start_codon", xs)
                argmaxes[name][argmax] += 1
                most = gene["all"]["start_codon", argmax]
                total = gene["all"]["start_codon", xs].sum()
                if total == 0:
                    # Report transcripts with no counts in the window.
                    print(transcript)
                # Only record a peak fraction when at least 10 counts
                # fall in the window.
                if total > 9:
                    fraction = np.true_divide(most, total)
                    fractions[name].append(fraction)
                    joints[name].append((argmax, fraction))
コード例 #7
0
def look_at_densities():
    """Sum the key-0 counts on either side of the ``'polyA'`` landmark.

    Loads the experiment described by ``description_fn`` and, for every
    gene in its ``three_prime_read_positions`` file, sums the ``gene[0]``
    counts over ``['polyA' - 100, 'polyA' + 1)`` and
    ``['polyA' + 1, 'polyA' + 102)``.

    Returns:
        A pair ``(names, zero_ratios)``: the gene names in iteration
        order and, in parallel, ``(before, after)`` sums for each gene.
    """
    import ribosome_profiling_experiment
    description_fn = '/home/jah/projects/ribosomes/experiments/weinberg/RiboZero/job/description.txt'
    exp = ribosome_profiling_experiment.RibosomeProfilingExperiment.from_description_file_name(description_fn)

    names = []
    zero_ratios = []

    # ``with`` closes the HDF5 file once all genes have been summarised;
    # only plain sums and names leave the block.
    with h5py.File(exp.file_names['three_prime_read_positions'], 'r') as hdf5_file:
        progress = utilities.progress_bar(len(hdf5_file), hdf5_file)
        for gene_name in progress:
            gene = Serialize.read_positions.build_gene(hdf5_file[gene_name], specific_keys={'0'})
            # Loaded as '0' but read back as 0 -- presumably build_gene
            # converts numeric key strings to ints; TODO confirm.
            zero_counts = gene[0]
            before = zero_counts['polyA', -100:1].sum()
            after = zero_counts['polyA', 1:102].sum()
            names.append(gene_name)
            zero_ratios.append((before, after))

    return names, zero_ratios
コード例 #8
0
def call_5p_peaks():
    """Tally the strongest 5' read position per transcript and experiment.

    For every selected 5'-end experiment and every CDS present in that
    experiment's ``five_prime_read_positions`` file, the offset in
    ``[-300, 0)`` relative to ``'start_codon'`` with the largest ``'all'``
    count is found.  Per experiment the function accumulates a ``Counter``
    of those offsets, and -- for transcripts with more than 9 counts in
    the window -- the fraction of counts at the peak.

    NOTE(review): the visible code neither returns nor stores these
    results; confirm whether a consumer is missing.
    """
    gtf_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf'
    genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/'
    # The fetcher is not used below; the call is kept in case building it
    # has side effects -- TODO confirm and remove if not.
    region_fetcher = genomes.build_region_fetcher(genome_dir)
    CDSs = gtf.get_CDSs(gtf_fn)

    experiments = build_all_experiments(verbose=False)

    five_prime_experiments = (
        [(n, e) for n, e in sorted(experiments['TL_seq']['arribere_gr'].items()) if 'TLSeq1' in n] +
        [(n, e) for n, e in sorted(experiments['TL_seq']['park_nar'].items()) if n == 'SMORE-seq_WT_TAP+_rep1'] +
        [(n, e) for n, e in sorted(experiments['TIF_seq']['pelechano_nature'].items()) if n == 'ypd_bio1_lib1' or n == 'ypd_bio1_lib4']
    )

    argmaxes = {}
    fractions = {}
    joints = {}
    for name, experiment in five_prime_experiments:
        # Parenthesised form prints the same text under Python 2 and 3.
        print(name)
        argmaxes[name] = Counter()
        fractions[name] = []
        joints[name] = []
        fn = experiment.file_names['five_prime_read_positions']
        # Close each experiment's HDF5 file as soon as it has been scanned
        # instead of leaking one open handle per experiment.
        with h5py.File(fn, 'r') as f:
            for transcript in utilities.progress_bar(len(CDSs), CDSs):
                if transcript.name not in f:
                    continue
                gene = Serialize.read_positions.build_gene(f[transcript.name])
                xs = np.arange(-300, 0)

                argmax = gene['all'].argmax_over_slice('start_codon', xs)
                argmaxes[name][argmax] += 1
                most = gene['all']['start_codon', argmax]
                total = gene['all']['start_codon', xs].sum()
                if total == 0:
                    # Report transcripts with no counts in the window.
                    print(transcript)
                # Only record a peak fraction when at least 10 counts
                # fall in the window.
                if total > 9:
                    fraction = np.true_divide(most, total)
                    fractions[name].append(fraction)
                    joints[name].append((argmax, fraction))
コード例 #9
0
def produce_transcript_base_compositions():
    """Count the As in sliding windows along every transcript and save them.

    For each CDS returned by ``gff.get_CDSs`` and each window size in
    ``[5, 10, 20]``, a ``PositionCounts`` is filled so that the entry at
    ``("start", left_edge)`` holds the number of ``"A"`` positions in
    ``[left_edge, left_edge + window)``.  The nested dict
    ``{transcript name: {window: PositionCounts}}`` is written to
    ``composition_fn`` with ``Serialize.read_positions.write_file``.
    """
    gff_fn = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gff"
    genome_dir = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/"
    composition_fn = (
        "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5"
    )
    CDSs = gff.get_CDSs(gff_fn, genome_dir)

    left_buffer = 500
    right_buffer = 500
    windows = [5, 10, 20]

    genes = {}
    for transcript in utilities.progress_bar(len(CDSs), CDSs):
        per_window = genes[transcript.name] = {}
        transcript.build_coordinate_maps()

        landmarks = {
            "start": 0,
            "start_codon": transcript.transcript_start_codon,
            "stop_codon": transcript.transcript_stop_codon,
            "end": transcript.transcript_length,
        }
        sequence = transcript.get_transcript_sequence(left_buffer, right_buffer)
        # Boolean track: True wherever the buffered sequence is an "A".
        is_A = positions.PositionCounts(landmarks, left_buffer, right_buffer, data=(sequence.data == "A"))

        for window in windows:
            window_counts = positions.PositionCounts(landmarks, left_buffer, right_buffer)
            last_edge = transcript.CDS_length + right_buffer - window
            for left_edge in range(-left_buffer, last_edge):
                right_edge = left_edge + window
                window_counts["start", left_edge] = sum(is_A["start", left_edge:right_edge])
            per_window[window] = window_counts

        transcript.delete_coordinate_maps()

    Serialize.read_positions.write_file(genes, composition_fn)
コード例 #10
0
ファイル: trim.py プロジェクト: sameer-aryal/ribosomes
            updated_cigar = soft_clipped_block + trimmed_cigar
        else:
            # Remove blocks from the end.
            trimmed_cigar = sam.truncate_cigar_blocks_up_to(
                mapping.cigar, trimmed_length)
            updated_cigar = trimmed_cigar + soft_clipped_block

        mapping.cigar = updated_cigar

    if mapping.tags:
        # Clear the MD tag since the possible removal of bases to the
        # alignment may have made it inaccurate.
        # TODO: now have machinery to make it accurate.
        filtered_tags = filter(lambda t: t[0] != 'MD', mapping.tags)
        mapping.tags = filtered_tags

    set_nongenomic_length(mapping, bases_to_trim)

    return mapping


if __name__ == '__main__':
    # Ad-hoc driver: tally the results of trim_by_local_alignment over the
    # first reads of one hard-coded FASTQ file.
    fastq_fn = '/home/jah/projects/ribosomes/experiments/guydosh_cell/dom34KO_CHX/data/SRR1042854.fastq'
    # zip against xrange(100000) caps the sample at 100000 reads.
    seqs = [r.seq for _, r in zip(xrange(100000), fastq.reads(fastq_fn))]
    seqs = utilities.progress_bar(len(seqs), seqs)
    adapter = full_linker
    # NOTE(review): count is not used in the visible lines.
    count = 0
    # Maps each distinct trim_by_local_alignment result to how often it occurred.
    counts = Counter()
    for seq in seqs:
        counts[trim_by_local_alignment(adapter, seq)] += 1
コード例 #11
0
def _format_peak_diagnostic(transcript, landmark_position, positions_slice,
                            peak_offsets, genes):
    """Build the tab-separated diagnostic table for one transcript end.

    Each row describes one offset in ``peak_offsets``: the offset and its
    genomic coordinate, followed, for every gene in ``genes``, by the
    ``'all'`` count at that offset and its share of the gene's total over
    ``positions_slice``.

    Args:
        transcript: object providing ``transcript_to_genomic``.
        landmark_position: transcript coordinate the offsets are relative to.
        positions_slice: ``(landmark_name, offsets_array)`` window.
        peak_offsets: offsets to report, one row each.
        genes: non-empty list of genes; the first is the primary experiment.

    Returns:
        The rows joined with newlines.
    """
    landmark = positions_slice[0]
    # A gene's total over the window does not depend on the offset being
    # reported, so compute it once per gene rather than once per cell.
    totals = [gene['all'][positions_slice].sum() for gene in genes]

    lines = []
    for offset in peak_offsets:
        genomic = transcript.transcript_to_genomic[landmark_position + offset]
        row = ['{0}\t({1:,})\t'.format(offset, genomic)]
        for gene, total in zip(genes, totals):
            count = gene['all'][landmark, offset]
            # NOTE(review): total can be 0 for a gene with no counts in
            # the window; the division is left as it was -- confirm the
            # desired output for that case.
            row.append('{0}\t{1:0.2%}'.format(count, count / float(total)))
        lines.append('\t'.join(row))
    return '\n'.join(lines)


def call_UTR_boundaries(boundaries_fn, diagnostic_fn='/dev/null'):
    """Call 5' and 3' UTR boundaries for every transcript and write them out.

    The 5' boundary is the offset (relative to ``'start_codon'``, searched
    over ``[-500, CDS_length)``) with the most ``'all'`` counts in the
    ``S288C_TLSeq1`` experiment; the 3' boundary is the analogous peak
    relative to ``'stop_codon'`` over ``[-CDS_length, 500)`` in the
    ``Cerevisiae_3Pseq`` experiment.  With no counts in the window the
    offsets fall back to 0 and 3 respectively.

    Args:
        boundaries_fn: output path handed to ``write_UTR_file``.
        diagnostic_fn: path of a text file receiving, per transcript, the
            ten largest peaks of each end compared across the primary and
            the other experiments.  Defaults to ``'/dev/null'``.

    Returns:
        None.
    """
    experiments = build_all_experiments(verbose=False)

    five_prime_exp = experiments['TL_seq']['arribere_gr']['S288C_TLSeq1']
    three_prime_exp = experiments['three_p_seq']['three_p_seq']['Cerevisiae_3Pseq']

    other_five_prime_exps = [
        experiments['TL_seq']['park_nar']['SMORE-seq_WT_TAP+_rep1'],
        experiments['TIF_seq']['pelechano_nature']['ypd_bio1_lib1'],
    ]
    other_three_prime_exps = [
        experiments['three_t_fill_seq']['wilkening_nar']['3tfill_ypd_rep1'],
        experiments['TIF_seq']['pelechano_nature']['ypd_bio1_lib1'],
    ]

    # Every HDF5 handle is registered here so the ``finally`` below can
    # close them all, even if a later open or the main loop fails.
    open_files = []

    def open_read_positions(exp, file_key):
        fh = h5py.File(exp.file_names[file_key], 'r')
        open_files.append(fh)
        return fh

    build_gene = Serialize.read_positions.build_gene
    UTR_boundaries = {}

    try:
        five_prime_fh = open_read_positions(five_prime_exp, 'five_prime_read_positions')
        three_prime_fh = open_read_positions(three_prime_exp, 'three_prime_read_positions')
        other_five_prime_fhs = [open_read_positions(exp, 'five_prime_read_positions')
                                for exp in other_five_prime_exps]
        other_three_prime_fhs = [open_read_positions(exp, 'three_prime_read_positions')
                                 for exp in other_three_prime_exps]

        transcripts, _ = five_prime_exp.get_CDSs()

        with open(diagnostic_fn, 'w') as diagnostic_fh:
            progress = utilities.progress_bar(len(transcripts), sorted(transcripts))
            for transcript in progress:
                name = transcript.name

                transcript.build_coordinate_maps(left_buffer=500, right_buffer=500)

                # ---- 5' end ----
                five_prime_gene = build_gene(five_prime_fh[name], specific_keys={'all'})
                five_genes = [five_prime_gene] + [
                    build_gene(other_fh[name], specific_keys={'all'})
                    for other_fh in other_five_prime_fhs
                ]
                five_xs = np.arange(-500, transcript.CDS_length)
                five_slice = ('start_codon', five_xs)

                five_counts = five_prime_gene['all']
                if five_counts[five_slice].sum() == 0:
                    five_offset = 0
                else:
                    five_offset = five_counts.argmax_over_slice('start_codon', five_xs)

                five_prime_diagnostic = _format_peak_diagnostic(
                    transcript,
                    transcript.transcript_start_codon,
                    five_slice,
                    five_counts.n_largest_over_slice(10, five_slice),
                    five_genes,
                )

                # ---- 3' end ----
                # The '0' key is loaded as before although only 'all' is
                # read below.
                three_prime_gene = build_gene(three_prime_fh[name], specific_keys={'all', '0'})
                three_genes = [three_prime_gene] + [
                    build_gene(other_fh[name], specific_keys={'all', '0'})
                    for other_fh in other_three_prime_fhs
                ]
                three_xs = np.arange(-transcript.CDS_length, 500)
                three_slice = ('stop_codon', three_xs)

                three_counts = three_prime_gene['all']
                if three_counts[three_slice].sum() == 0:
                    # NOTE(review): presumably the first position after a
                    # 3-nt stop codon -- confirm.
                    three_offset = 3
                else:
                    three_offset = three_counts.argmax_over_slice('stop_codon', three_xs)

                three_prime_diagnostic = _format_peak_diagnostic(
                    transcript,
                    transcript.transcript_stop_codon,
                    three_slice,
                    three_counts.n_largest_over_slice(10, three_slice),
                    three_genes,
                )

                diagnostic_fh.write('{0}\n'.format(str(transcript)))
                diagnostic_fh.write('{0}\n'.format(five_prime_diagnostic))
                diagnostic_fh.write('\n')
                diagnostic_fh.write('{0}\n'.format(three_prime_diagnostic))
                diagnostic_fh.write('\n')

                # Translate the chosen offsets to genomic coordinates
                # before the coordinate maps are discarded.
                five_pos = transcript.transcript_to_genomic[transcript.transcript_start_codon + five_offset]
                three_pos = transcript.transcript_to_genomic[transcript.transcript_stop_codon + three_offset]

                transcript.delete_coordinate_maps()

                UTR_boundaries[name] = (transcript.seqname, transcript.strand, five_pos, three_pos)
    finally:
        for fh in open_files:
            fh.close()

    write_UTR_file(UTR_boundaries, boundaries_fn)
コード例 #12
0
def call_3p_peaks():
    """Write the strongest 3' read position per transcript and experiment.

    For every selected 3'-end experiment and every CDS present in that
    experiment's ``three_prime_read_positions`` file, the offset in
    ``[0, 400)`` relative to ``"stop_codon"`` with the largest ``"all"``
    count is recorded.  A tab-separated table is then written to
    ``output_fn``: one row per transcript (ordered by the argmax in the
    ``"Cerevisiae_3Pseq"`` experiment), one column per experiment.

    Returns:
        None.  The only output is the text file at ``output_fn``.
    """
    gtf_fn = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf"
    genome_dir = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/"

    output_fn = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_3p_lengths.txt"

    # The fetcher is not used below; the call is kept in case building it
    # has side effects -- TODO confirm and remove if not.
    region_fetcher = genomes.build_region_fetcher(genome_dir)
    CDSs = gtf.get_CDSs(gtf_fn)
    CDS_dict = {t.name: t for t in CDSs}

    experiments = build_all_experiments(verbose=False)

    three_prime_experiments = (
        [(n, e) for n, e in sorted(experiments["three_p_seq"]["three_p_seq"].items())]
        + [
            (n, e)
            for n, e in sorted(experiments["three_t_fill_seq"]["wilkening_nar"].items())
            if "3tfill_ypd_rep1" in n
        ]
        + [
            (n, e)
            for n, e in sorted(experiments["TIF_seq"]["pelechano_nature"].items())
            if n == "ypd_bio1_lib1" or n == "ypd_bio1_lib4"
        ]
    )

    argmaxes = {}
    # NOTE(review): fractions and joints are filled in but never read or
    # returned by this function.
    fractions = {}
    joints = {}
    for name, experiment in three_prime_experiments:
        # Parenthesised form prints the same text under Python 2 and 3.
        print(name)
        argmaxes[name] = {}
        fractions[name] = []
        joints[name] = []
        fn = experiment.file_names["three_prime_read_positions"]
        # Close each experiment's HDF5 file as soon as it has been scanned
        # instead of leaking one open handle per experiment.
        with h5py.File(fn, "r") as f:
            for transcript in utilities.progress_bar(len(CDSs), CDSs):
                if transcript.name not in f:
                    continue
                gene = Serialize.read_positions.build_gene(f[transcript.name])
                xs = np.arange(0, 400)

                argmax = gene["all"].argmax_over_slice("stop_codon", xs)
                argmaxes[name][transcript.name] = argmax
                most = gene["all"]["stop_codon", argmax]
                total = gene["all"]["stop_codon", xs].sum()
                # Only record a peak fraction when at least 10 counts
                # fall in the window.
                if total > 9:
                    fraction = np.true_divide(most, total)
                    fractions[name].append(fraction)
                    joints[name].append((argmax, fraction))

    with open(output_fn, "w") as output_fh:
        name_order = sorted(argmaxes["Cerevisiae_3Pseq"], key=argmaxes["Cerevisiae_3Pseq"].get)
        for name in name_order:
            output_fh.write("{0}\t".format(str(CDS_dict[name])))
            for exp_name, _ in three_prime_experiments:
                # NOTE(review): raises KeyError if this transcript was
                # skipped for exp_name above -- confirm all files cover
                # the same transcripts.
                output_fh.write("{0}\t".format(argmaxes[exp_name][name]))
            output_fh.write("\n")
コード例 #13
0
    composition_fn = (
        "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5"
    )
    CDSs = gff.get_CDSs(gff_fn, genome_dir)

    import select_work

    exps = select_work.build_all_experiments(verbose=False)

    reads_fn = exps["belgium_2014_12_10"]["WT_1_mRNA"].file_names["three_prime_read_positions"]
    reads_fh = h5py.File(reads_fn, "r")

    meta_counts = positions.PositionCounts({"A": 0}, left_buffer=100000, right_buffer=100000)

    f = h5py.File(composition_fn, "r")
    for t in utilities.progress_bar(len(CDSs), CDSs):
        if t.name not in reads_fh:
            continue
        gene = Serialize.read_positions.build_gene(f[t.name])
        t.build_coordinate_maps()

        if t.transcript_length < 301:
            continue
        end = t.transcript_length - 200
        sl = ("start", np.arange(100, end))
        A_rich_position = gene[10].argmax_over_slice(*sl)
        if gene[10]["start", A_rich_position] > 9:
            counts = Serialize.read_positions.build_gene(reads_fh[t.name])
            before_counts = counts["all"]["start", 0:A_rich_position]
            after_counts = counts["all"]["start", A_rich_position : A_rich_position + 200]
            meta_counts["A", -len(before_counts) : 0] += before_counts
コード例 #14
0
    composition_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5'
    CDSs = gff.get_CDSs(gff_fn, genome_dir)

    import select_work
    exps = select_work.build_all_experiments(verbose=False)

    reads_fn = exps['belgium_2014_12_10']['WT_1_mRNA'].file_names[
        'three_prime_read_positions']
    reads_fh = h5py.File(reads_fn, 'r')

    meta_counts = positions.PositionCounts({'A': 0},
                                           left_buffer=100000,
                                           right_buffer=100000)

    f = h5py.File(composition_fn, 'r')
    for t in utilities.progress_bar(len(CDSs), CDSs):
        if t.name not in reads_fh:
            continue
        gene = Serialize.read_positions.build_gene(f[t.name])
        t.build_coordinate_maps()

        if t.transcript_length < 301:
            continue
        end = t.transcript_length - 200
        sl = ('start', np.arange(100, end))
        A_rich_position = gene[10].argmax_over_slice(*sl)
        if gene[10]['start', A_rich_position] > 9:
            counts = Serialize.read_positions.build_gene(reads_fh[t.name])
            before_counts = counts['all']['start', 0:A_rich_position]
            after_counts = counts['all']['start',
                                         A_rich_position:A_rich_position + 200]
コード例 #15
0
ファイル: trim.py プロジェクト: AlexeyG/ribosomes
            # Remove blocks from the beginning.
            trimmed_cigar = sam.truncate_cigar_blocks_from_beginning(mapping.cigar, trimmed_length)
            updated_cigar = soft_clipped_block + trimmed_cigar
        else:
            # Remove blocks from the end.
            trimmed_cigar = sam.truncate_cigar_blocks_up_to(mapping.cigar, trimmed_length)
            updated_cigar = trimmed_cigar + soft_clipped_block
        
        mapping.cigar = updated_cigar
    
    if mapping.tags:
        # Clear the MD tag since the possible removal of bases to the
        # alignment may have made it inaccurate. 
        # TODO: now have machinery to make it accurate.
        filtered_tags = filter(lambda t: t[0] != 'MD', mapping.tags)
        mapping.tags = filtered_tags

    set_nongenomic_length(mapping, bases_to_trim)

    return mapping

if __name__ == '__main__':
    # Ad-hoc driver: tally the results of trim_by_local_alignment over the
    # first reads of one hard-coded FASTQ file.
    fastq_fn = '/home/jah/projects/ribosomes/experiments/guydosh_cell/dom34KO_CHX/data/SRR1042854.fastq'
    # zip against xrange(100000) caps the sample at 100000 reads.
    seqs = [r.seq for _, r in zip(xrange(100000), fastq.reads(fastq_fn))]
    seqs = utilities.progress_bar(len(seqs), seqs)
    adapter = full_linker
    # NOTE(review): count is not used in the visible lines.
    count = 0
    # Maps each distinct trim_by_local_alignment result to how often it occurred.
    counts = Counter()
    for seq in seqs:
        counts[trim_by_local_alignment(adapter, seq)] += 1