Example #1
0
def compute_metanucleotide_counts(read_positions):
    """Aggregate read counts around every occurrence of each codon identity.

    For every transcript in ``read_positions``, walk the coding sequence
    codon by codon and, for each codon whose +/- 50 nt window lies entirely
    inside the CDS, accumulate the read counts in that window into a
    per-codon-identity, per-read-length meta profile.

    Parameters
    ----------
    read_positions : mapping of transcript name -> counts
        Each value maps read lengths to PositionCounts objects and the key
        'sequence' to the transcript's nucleotide sequence.
        (Assumed from usage below — confirm against the caller.)

    Returns
    -------
    dict mapping codon identity -> {read length -> PositionCounts}
    """
    left_buffer = 50
    right_buffer = 50

    # Figure out what lengths were recorded.
    length_keys, _, _ = extract_lengths_and_buffers(read_positions)

    dinucleotides = [''.join(pair) for pair in product('TCAG', repeat=2)]
    # Mono- and dinucleotide features are currently disabled; re-enable
    # together with the commented-out accumulation in the inner loop.
    features_keys = codons.all_codons  # + ['T', 'C', 'A', 'G'] + dinucleotides

    metacodon_counts = {
        features_key: {
            length: PositionCounts({'feature': 0}, left_buffer, right_buffer)
            for length in length_keys
        }
        for features_key in features_keys
    }

    feature_slice = ('feature', slice(-left_buffer, right_buffer))

    # .items() (not the Python-2-only .iteritems()) keeps this working on
    # both Python 2 and Python 3.
    for name, read_counts in read_positions.items():
        transcript_sequence = read_counts['sequence']
        coding_sequence = ''.join(
            transcript_sequence['start_codon':('stop_codon', 3)])

        for c, codon_id in enumerate(codons.codons_from_seq(coding_sequence)):
            p = 3 * c
            # Only count codons whose window fits entirely inside the CDS.
            if left_buffer <= p <= len(coding_sequence) - right_buffer:
                p_slice = ('start_codon',
                           slice(p - left_buffer, p + left_buffer))
                for length in length_keys:
                    # read_counts IS read_positions[name]; avoid the
                    # redundant second dictionary lookup.
                    counts = read_counts[length][p_slice]
                    metacodon_counts[codon_id][length][feature_slice] += counts

                    #nucleotide = codon_id[0]
                    #metacodon_counts[nucleotide][length][feature_slice] += counts
                    #dinucleotide = codon_id[:2]
                    #metacodon_counts[dinucleotide][length][feature_slice] += counts

    return metacodon_counts
Example #2
0
def compute_codon_counts(position_counts, offset_type):
    """Collapse nucleotide-resolution read counts into per-codon counts.

    For each recorded read length with a known A-site offset, shifts the
    counts by that offset and sums the three frames of each codon into a
    single per-codon total. Also extracts the codon identity at every
    codon position for downstream pairing with the counts.

    Parameters
    ----------
    position_counts : mapping
        Maps read lengths to PositionCounts objects and the key 'sequence'
        to the transcript's nucleotide sequence (assumed from usage below).
    offset_type : key into the module-level ``A_site_offsets`` table.

    Returns
    -------
    (codon_counts, codon_identities) : pair of PositionCounts

    Raises
    ------
    ValueError
        If the CDS length is not a multiple of 3.
    """
    # next(iter(...)) works on both Python 2 and 3; .values()[0] is
    # Python-2-only because dict views are not indexable in Python 3.
    CDS_length = next(iter(position_counts.values())).CDS_length

    if CDS_length % 3 != 0:
        raise ValueError('CDS length not divisible by 3')

    # Note: CDS_length is the index of the first nucleotide of the stop codon.
    # Ingolia's original model never has the stop codon in the A site, but
    # subsequent data show an accumulation of (typically length 29 or 30) reads
    # that do advance this far.
    num_codons = CDS_length // 3
    landmarks = {
        'start_codon': 0,
        'stop_codon': num_codons,
    }
    codon_counts = PositionCounts(landmarks, codon_buffer, codon_buffer)

    recorded_lengths = set(position_counts.keys())
    known_A_site_lengths = set(A_site_offsets[offset_type].keys())

    # Only lengths with a calibrated A-site offset contribute; this also
    # drops the non-numeric 'sequence' key from the intersection.
    for length in recorded_lengths & known_A_site_lengths:
        A_site_offset = A_site_offsets[offset_type][length]
        start_index = -A_site_offset - (codon_buffer * 3)
        end_index = CDS_length - A_site_offset + (codon_buffer * 3)
        # Sum all three nucleotide frames of each codon into one count.
        in_frame = slice(start_index, end_index, 3)
        one_behind = slice(start_index - 1, end_index - 1, 3)
        one_ahead = slice(start_index + 1, end_index + 1, 3)
        codon_counts.data += position_counts[length]['start_codon', in_frame] + \
                             position_counts[length]['start_codon', one_behind] + \
                             position_counts[length]['start_codon', one_ahead]

    # Record the codon identity at every position, including the buffer
    # codons on either side of the CDS.
    sequence_slice = slice(('start_codon', -codon_buffer * 3),
                           ('stop_codon', codon_buffer * 3))
    sequence = ''.join(position_counts['sequence'][sequence_slice])
    codon_identities_list = list(codons.codons_from_seq(sequence))
    codon_identities = PositionCounts(
        landmarks,
        codon_buffer,
        codon_buffer,
        data=np.asarray(codon_identities_list),
    )
    return codon_counts, codon_identities
Example #3
0
def compute_metanucleotide_counts(read_positions):
    """Aggregate read counts around every occurrence of each codon identity.

    For every transcript in ``read_positions``, walk the coding sequence
    codon by codon and, for each codon whose +/- 50 nt window lies entirely
    inside the CDS, accumulate the read counts in that window into a
    per-codon-identity, per-read-length meta profile.

    Parameters
    ----------
    read_positions : mapping of transcript name -> counts
        Each value maps read lengths to PositionCounts objects and the key
        'sequence' to the transcript's nucleotide sequence.
        (Assumed from usage below — confirm against the caller.)

    Returns
    -------
    dict mapping codon identity -> {read length -> PositionCounts}
    """
    left_buffer = 50
    right_buffer = 50

    # Figure out what lengths were recorded.
    length_keys, _, _ = extract_lengths_and_buffers(read_positions)

    dinucleotides = [''.join(pair) for pair in product('TCAG', repeat=2)]
    # Mono- and dinucleotide features are currently disabled; re-enable
    # together with the commented-out accumulation in the inner loop.
    features_keys = codons.all_codons  # + ['T', 'C', 'A', 'G'] + dinucleotides

    metacodon_counts = {
        features_key: {
            length: PositionCounts({'feature': 0}, left_buffer, right_buffer)
            for length in length_keys
        }
        for features_key in features_keys
    }

    feature_slice = ('feature', slice(-left_buffer, right_buffer))

    # .items() (not the Python-2-only .iteritems()) keeps this working on
    # both Python 2 and Python 3.
    for name, read_counts in read_positions.items():
        transcript_sequence = read_counts['sequence']
        coding_sequence = ''.join(
            transcript_sequence['start_codon':('stop_codon', 3)])

        for c, codon_id in enumerate(codons.codons_from_seq(coding_sequence)):
            p = 3 * c
            # Only count codons whose window fits entirely inside the CDS.
            if left_buffer <= p <= len(coding_sequence) - right_buffer:
                p_slice = ('start_codon',
                           slice(p - left_buffer, p + left_buffer))
                for length in length_keys:
                    # read_counts IS read_positions[name]; avoid the
                    # redundant second dictionary lookup.
                    counts = read_counts[length][p_slice]
                    metacodon_counts[codon_id][length][feature_slice] += counts

                    #nucleotide = codon_id[0]
                    #metacodon_counts[nucleotide][length][feature_slice] += counts
                    #dinucleotide = codon_id[:2]
                    #metacodon_counts[dinucleotide][length][feature_slice] += counts

    return metacodon_counts
Example #4
0
def compute_codon_counts(position_counts, offset_type):
    """Collapse nucleotide-resolution read counts into per-codon counts.

    For each recorded read length with a known A-site offset, shifts the
    counts by that offset and sums the three frames of each codon into a
    single per-codon total. Also extracts the codon identity at every
    codon position for downstream pairing with the counts.

    Parameters
    ----------
    position_counts : mapping
        Maps read lengths to PositionCounts objects and the key 'sequence'
        to the transcript's nucleotide sequence (assumed from usage below).
    offset_type : key into the module-level ``A_site_offsets`` table.

    Returns
    -------
    (codon_counts, codon_identities) : pair of PositionCounts

    Raises
    ------
    ValueError
        If the CDS length is not a multiple of 3.
    """
    # next(iter(...)) works on both Python 2 and 3; .values()[0] is
    # Python-2-only because dict views are not indexable in Python 3.
    CDS_length = next(iter(position_counts.values())).CDS_length

    if CDS_length % 3 != 0:
        raise ValueError('CDS length not divisible by 3')

    # Note: CDS_length is the index of the first nucleotide of the stop codon.
    # Ingolia's original model never has the stop codon in the A site, but
    # subsequent data show an accumulation of (typically length 29 or 30) reads
    # that do advance this far.
    num_codons = CDS_length // 3
    landmarks = {'start_codon': 0,
                 'stop_codon': num_codons,
                }
    codon_counts = PositionCounts(landmarks, codon_buffer, codon_buffer)

    recorded_lengths = set(position_counts.keys())
    known_A_site_lengths = set(A_site_offsets[offset_type].keys())

    # Only lengths with a calibrated A-site offset contribute; this also
    # drops the non-numeric 'sequence' key from the intersection.
    for length in recorded_lengths & known_A_site_lengths:
        A_site_offset = A_site_offsets[offset_type][length]
        start_index = -A_site_offset - (codon_buffer * 3)
        end_index = CDS_length - A_site_offset + (codon_buffer * 3)
        # Sum all three nucleotide frames of each codon into one count.
        in_frame = slice(start_index, end_index, 3)
        one_behind = slice(start_index - 1, end_index - 1, 3)
        one_ahead = slice(start_index + 1, end_index + 1, 3)
        codon_counts.data += position_counts[length]['start_codon', in_frame] + \
                             position_counts[length]['start_codon', one_behind] + \
                             position_counts[length]['start_codon', one_ahead]

    # Record the codon identity at every position, including the buffer
    # codons on either side of the CDS.
    sequence_slice = slice(('start_codon', -codon_buffer * 3),
                           ('stop_codon', codon_buffer * 3))
    sequence = ''.join(position_counts['sequence'][sequence_slice])
    codon_identities_list = list(codons.codons_from_seq(sequence))
    codon_identities = PositionCounts(landmarks,
                                      codon_buffer,
                                      codon_buffer,
                                      data=np.asarray(codon_identities_list),
                                     )
    return codon_counts, codon_identities