def test_fix_overlap_pair_opposite_directions(self):
        """Overlapping pair on opposite strands gets an RBS buffer per feature.

        Feature 1 is placed on the reverse strand and feature 2 on the forward
        strand, sharing the tail of feature 1. After fix_overlap_pair() the
        record is expected to read: feature 1, a buffer taken from the bases
        of feature 2 that follow the shared region, a buffer taken from the
        bases of feature 1 that precede the shared region, then feature 2.
        """
        split_at = 21
        first_seq = 'AAACCCGGGTTTCCCAAACCCATGTTTAAAGGGTTTCCC'
        second_seq = (first_seq[split_at:] +
                      'CCCTTTGGGAAACCCAAACCCGGGTTTCCCAAATTTAAA')
        genome_seq = first_seq[:split_at] + second_seq
        shared_len = len(first_seq) - split_at

        # The shared region must be counted only once in the genome.
        self.assertEqual(
            len(genome_seq),
            len(first_seq) + len(second_seq) - shared_len)

        seq_record = SeqRecord(Seq(genome_seq, generic_dna))
        expected_seq_by_id = {1: first_seq, 2: second_seq}

        # (feature id, start, end, strand) for each CDS, in insertion order.
        placements = [
            (1, 0, len(first_seq), -1),
            (2, split_at, split_at + len(second_seq), 1),
        ]
        originals = []
        for feature_id, start, end, strand in placements:
            location = FeatureLocation(start, end, strand=strand)
            feature = SeqFeature(location, type='CDS', id=feature_id)
            seq_record.features.append(feature)
            self._assert_feature_seq(feature, seq_record, expected_seq_by_id)
            originals.append(feature)
        first, second = originals

        # Fix a copy so the original record is left untouched.
        fixed_record = copy.deepcopy(seq_record)
        self.assertTrue(fix_overlap_pair(first.id, second.id, fixed_record))

        fixed_seq = str(fixed_record.seq)
        rbs_size = conflicting_pair_common.RBS_BUFFER_SIZE
        expected_pieces = [
            first_seq,
            second_seq[shared_len:shared_len + rbs_size],
            first_seq[split_at - rbs_size:split_at],
            second_seq,
        ]
        self.assertEqual(''.join(expected_pieces), fixed_seq)

        fixed_first = get_feature_by_id(fixed_record, first.id)
        fixed_second = get_feature_by_id(fixed_record, second.id)
        # Two buffers now sit between the end of feature 1 and feature 2.
        self.assertEqual(fixed_first.location.end + 2 * rbs_size,
                         fixed_second.location.start)
        for fixed_feature in (fixed_first, fixed_second):
            self._assert_feature_seq(fixed_feature, fixed_record,
                                     expected_seq_by_id)
    def test_fix_overlap_pair_same_direction_forward(self):
        """Forward-strand overlapping pair is separated by one RBS buffer.

        Both features are on the forward strand. After fix_overlap_pair() the
        record is expected to read: feature 1, a buffer taken from the bases
        of feature 1 that precede the shared region, then feature 2.
        """
        split_at = 39
        first_seq = 'ATGTTTGGGAAACCCAAACCCGGGTTTAAACCCGGGTTTATGAAAGGG'
        second_seq = first_seq[split_at:] + 'CCCAAATTT'
        genome_seq = first_seq[:split_at] + second_seq
        seq_record = SeqRecord(Seq(genome_seq, generic_dna))

        expected_seq_by_id = {1: first_seq, 2: second_seq}

        # (feature id, start, end) for each forward-strand CDS.
        placements = [
            (1, 0, len(first_seq)),
            (2, split_at, split_at + len(second_seq)),
        ]
        originals = []
        for feature_id, start, end in placements:
            location = FeatureLocation(start, end, strand=1)
            feature = SeqFeature(location, type='CDS', id=feature_id)
            seq_record.features.append(feature)
            self._assert_feature_seq(feature, seq_record, expected_seq_by_id)
            originals.append(feature)
        first, second = originals

        # Fix a copy so the original record is left untouched.
        fixed_record = copy.deepcopy(seq_record)
        self.assertTrue(fix_overlap_pair(first.id, second.id, fixed_record))

        rbs_size = conflicting_pair_common.RBS_BUFFER_SIZE
        expected_seq = (first_seq +
                        first_seq[split_at - rbs_size:split_at] +
                        second_seq)
        self.assertEqual(expected_seq, str(fixed_record.seq))

        fixed_first = get_feature_by_id(fixed_record, first.id)
        fixed_second = get_feature_by_id(fixed_record, second.id)
        # Exactly one buffer sits between the two features.
        self.assertEqual(fixed_first.location.end + rbs_size,
                         fixed_second.location.start)
        for fixed_feature in (fixed_first, fixed_second):
            self._assert_feature_seq(fixed_feature, fixed_record,
                                     expected_seq_by_id)
    def test_fix_overlaps_simple(self):
        """ConflictingPairFixer separates one opposite-strand overlapping pair.

        Feature 1 (forward strand, 0-9) and feature 2 (reverse strand, 6-18)
        share three bases in the input record. After fix_overlaps() the record
        is expected to hold both feature sequences back to back, with feature 2
        starting exactly where feature 1 ends.
        """
        feature_1_seq = 'ATGTTTGGG'
        feature_2_seq = 'GGGCCCAAAGTA'
        overlap_start_pos = 6
        whole_seq = feature_1_seq[:overlap_start_pos] + feature_2_seq
        seq = Seq(whole_seq, generic_dna)
        seq_record = SeqRecord(seq)

        feature_id_to_seq_map = {
            1: feature_1_seq,
            2: feature_2_seq,
        }

        feature_1_loc = FeatureLocation(0, len(feature_1_seq), strand=1)
        feature_1 = SeqFeature(feature_1_loc, type='CDS', id=1)
        seq_record.features.append(feature_1)
        self._assert_feature_seq(feature_1, seq_record, feature_id_to_seq_map)

        feature_2_loc = FeatureLocation(overlap_start_pos,
                                        overlap_start_pos + len(feature_2_seq),
                                        strand=-1)
        feature_2 = SeqFeature(feature_2_loc, type='CDS', id=2)
        seq_record.features.append(feature_2)
        self._assert_feature_seq(feature_2, seq_record, feature_id_to_seq_map)

        # Build and use the overlap fixer on a copy, so the original record
        # stays available for comparison.
        updated_seq_record = copy.deepcopy(seq_record)
        refactor_context = RefactorContext(updated_seq_record)
        refactor_context.set_forbidden_codon_set(set(['GGG']))
        cpf = ConflictingPairFixer(refactor_context)
        cpf.fix_overlaps()

        EXPECTED_SEQUENCE = feature_1_seq + feature_2_seq

        self.assertEqual(EXPECTED_SEQUENCE, str(updated_seq_record.seq))
        new_feature_1 = get_feature_by_id(updated_seq_record, feature_1.id)
        new_feature_2 = get_feature_by_id(updated_seq_record, feature_2.id)
        # No buffer is expected between the two features.
        self.assertEqual(new_feature_1.location.end,
                         new_feature_2.location.start)
        self._assert_feature_seq(new_feature_1, updated_seq_record,
                                 feature_id_to_seq_map)
        self._assert_feature_seq(new_feature_2, updated_seq_record,
                                 feature_id_to_seq_map)
    def test_add_feature_to_seq_record__regular(self):
        """A feature added to a record can be looked up again by its id."""
        feature_id = '1'
        record = SeqRecord(Seq('ATGTTTGGGTAGAGTA', generic_dna))
        added = SeqFeature(FeatureLocation(4, 7), type='CDS', id=feature_id)
        add_feature_to_seq_record(record, added)

        self.assertEqual(added, get_feature_by_id(record, feature_id))
    def test_add_feature_to_seq_record__regular(self):
        """A feature added to a record can be looked up again by its id."""
        # NOTE(review): a method with this exact name and body is also defined
        # immediately above; if both live in the same class the later
        # definition silently replaces the earlier one -- confirm and dedupe.
        lookup_id = '1'
        target_record = SeqRecord(Seq('ATGTTTGGGTAGAGTA', generic_dna))
        new_feature = SeqFeature(
            FeatureLocation(4, 7), type='CDS', id=lookup_id)
        add_feature_to_seq_record(target_record, new_feature)

        found = get_feature_by_id(target_record, lookup_id)
        self.assertEqual(new_feature, found)
    def test_fix_overlaps_third_feature_none_overlap(self):
        """fix_overlaps() separates the overlapping pair and keeps feature 3.

        Features 1 (forward strand) and 2 (reverse strand) share three bases;
        feature 3 (forward strand) sits after a stretch of non-feature bases
        and overlaps nothing. After fix_overlaps() the record is expected to
        read feature 1, feature 2, the unchanged filler, then feature 3, and
        every feature must still extract to its original sequence.
        """
        feature_1_seq = 'ATGTTTGGG'
        feature_2_seq = 'GGGCCCAAAGTA'
        inter_f2_f3_junk = 'GTAGCTATCTATCTGGTTAAATC'
        feature_3_seq = 'ATGAAACCCTTTGGGTTTCCCAAA'
        overlap_start_pos = 6
        whole_seq = (feature_1_seq[:overlap_start_pos] + feature_2_seq +
                     inter_f2_f3_junk + feature_3_seq)
        seq = Seq(whole_seq, generic_dna)
        seq_record = SeqRecord(seq)

        feature_id_to_seq_map = {
            1: feature_1_seq,
            2: feature_2_seq,
            3: feature_3_seq,
        }

        feature_1_loc = FeatureLocation(0, len(feature_1_seq), strand=1)
        feature_1 = SeqFeature(feature_1_loc, type='CDS', id=1)
        seq_record.features.append(feature_1)
        self._assert_feature_seq(feature_1, seq_record, feature_id_to_seq_map)

        feature_2_loc = FeatureLocation(overlap_start_pos,
                                        overlap_start_pos + len(feature_2_seq),
                                        strand=-1)
        feature_2 = SeqFeature(feature_2_loc, type='CDS', id=2)
        seq_record.features.append(feature_2)
        self._assert_feature_seq(feature_2, seq_record, feature_id_to_seq_map)

        # Feature 3 starts after the filler that follows feature 2.
        feature_3_start = feature_2_loc.end + len(inter_f2_f3_junk)
        feature_3_end = feature_3_start + len(feature_3_seq)
        feature_3_loc = FeatureLocation(feature_3_start,
                                        feature_3_end,
                                        strand=1)
        feature_3 = SeqFeature(feature_3_loc, type='CDS', id=3)
        seq_record.features.append(feature_3)
        self._assert_feature_seq(feature_3, seq_record, feature_id_to_seq_map)

        # Build and use the overlap fixer on a copy, so the original record
        # stays available for comparison.
        updated_seq_record = copy.deepcopy(seq_record)
        refactor_context = RefactorContext(updated_seq_record)
        refactor_context.set_forbidden_codon_set(set(['GGG']))
        cpf = ConflictingPairFixer(refactor_context)
        cpf.fix_overlaps()

        EXPECTED_SEQUENCE = (feature_1_seq + feature_2_seq + inter_f2_f3_junk +
                             feature_3_seq)

        self.assertEqual(EXPECTED_SEQUENCE, str(updated_seq_record.seq))
        for feature_id in feature_id_to_seq_map:
            new_feature = get_feature_by_id(updated_seq_record, feature_id)
            self._assert_feature_seq(new_feature, updated_seq_record,
                                     feature_id_to_seq_map)
def _select_rna_features(seq_record, rna_types, ignored_ids, interval=None):
    """Return seq_record's features of an RNA type whose id is not ignored.

    When interval is truthy, only features for which
    does_interval_overlap_feature(interval, feature) holds are considered.
    """
    candidates = seq_record.features
    if interval:
        candidates = [
            f for f in candidates
            if does_interval_overlap_feature(interval, f)
        ]
    return [
        f for f in candidates
        if f.type in rna_types and f.id not in ignored_ids
    ]


def check_rnas_conserved(original_seq_record,
                         refactored_seq_record,
                         ignore_problems_in_feature_ids=None,
                         interval=None):
    """Check that RNA coding sequences are conserved.

    Args:
        original_seq_record: Record before refactoring.
        refactored_seq_record: Record after refactoring.
        ignore_problems_in_feature_ids: Optional collection of feature ids to
            leave out of the check. None (the default) ignores nothing.
        interval: Optional interval; when given, only features overlapping it
            (per does_interval_overlap_feature) are checked.

    Returns:
        True when every checked RNA feature extracts to the same sequence in
        both records.

    Raises:
        AssertionError: If the two records hold a different number of RNA
            features, an RNA feature is missing after the refactor, or an
            RNA sequence differs. Raised explicitly so the checks are not
            stripped when Python runs with -O.
    """
    print('...Checking RNA\'s are conserved...')

    RNA_TYPES = set(['misc_RNA', 'ncRNA', 'rRNA', 'tRNA', 'tmRNA'])

    # Avoid a shared mutable default; an empty tuple ignores nothing.
    ignored_ids = ignore_problems_in_feature_ids or ()

    original_rna_features = _select_rna_features(
        original_seq_record, RNA_TYPES, ignored_ids, interval)
    refactored_rna_features = _select_rna_features(
        refactored_seq_record, RNA_TYPES, ignored_ids, interval)

    if len(original_rna_features) != len(refactored_rna_features):
        raise AssertionError("Different number of RNA features.")

    for original_feature in original_rna_features:
        original_feature_seq = original_feature.extract(
            original_seq_record.seq)

        refactored_feature = get_feature_by_id(refactored_seq_record,
                                               original_feature.id)
        # Fail with a clear message rather than an AttributeError on None.
        if not refactored_feature:
            raise AssertionError("Feature lost after refactor: %s" %
                                 str(original_feature))
        refactored_feature_seq = refactored_feature.extract(
            refactored_seq_record.seq)

        if str(original_feature_seq) != str(refactored_feature_seq):
            raise AssertionError(
                "RNA not conserved for %s" % original_feature.id)

    print('......RNA conservation confirmed.')

    return True
Example #8
0
def analyze_variation(original_record, recoded_record):
    """Analyzes the variation between the two records.

    Only CDS features of the original record are compared. A feature whose
    extracted sequence length differs between the two records is reported on
    stdout and left out of every total.

    The original and recoded records must have the same feature ids.

    Args:
        original_record: Record before recoding.
        recoded_record: Record after recoding.

    Returns:
        Dictionary with the following keys:
            * total_codons: Number of codons compared.
            * reassigned_codons: Number of codons that differ.
            * codon_similarity: Proportion of codons unchanged.
            * total_bases: Number of bases compared.
            * reassigned_bases: Number of bases that differ within changed
                codons (as counted by _num_differing_bases).
            * base_similarity: Proportion of bases unchanged.
        Both similarities are 1.0 when nothing was compared (no CDS features,
        or all of them omitted), since nothing was changed.
    """
    total_codons = 0
    reassigned_codons = 0
    total_bases = 0
    reassigned_bases = 0

    original_record_coding_features = [
        feature for feature in original_record.features
        if feature.type == 'CDS'
    ]

    for orig_feature in original_record_coding_features:
        recoded_feature = get_feature_by_id(recoded_record, orig_feature.id)

        orig_seq = orig_feature.extract(original_record.seq)
        recoded_seq = recoded_feature.extract(recoded_record.seq)

        # Codon-by-codon comparison only makes sense for equal lengths.
        if len(orig_seq) != len(recoded_seq):
            print('>>>> Omitting ' + str(orig_feature))
            continue

        total_bases += len(orig_feature)
        # Explicit floor division: whole codons only.
        total_codons += len(orig_feature) // 3

        for codon_index in range(0, len(orig_seq), 3):
            orig_codon = str(orig_seq[codon_index:codon_index + 3])
            recoded_codon = str(recoded_seq[codon_index:codon_index + 3])
            if orig_codon != recoded_codon:
                reassigned_codons += 1
                reassigned_bases += _num_differing_bases(
                    orig_codon, recoded_codon)

    # Guard the ratios against an empty comparison (would divide by zero).
    if total_codons:
        codon_similarity = (
            float(total_codons - reassigned_codons) / total_codons)
    else:
        codon_similarity = 1.0
    if total_bases:
        base_similarity = float(total_bases - reassigned_bases) / total_bases
    else:
        base_similarity = 1.0

    return {
        'total_codons': total_codons,
        'reassigned_codons': reassigned_codons,
        'codon_similarity': codon_similarity,
        'total_bases': total_bases,
        'reassigned_bases': reassigned_bases,
        'base_similarity': base_similarity
    }
    def test_fix_overlap_pair_opposing_strands(self):
        """Opposing-strand pair is pulled apart with nothing inserted between.

        Feature 1 (forward strand, 0-9) and feature 2 (reverse strand, 6-18)
        share three bases. After fix_overlap_pair() the record is expected to
        hold both sequences back to back, with feature 2 starting exactly
        where feature 1 ends.
        """
        seq_record = SeqRecord(Seq('ATGTTTGGGCCCAAAGTA', generic_dna))

        expected_seq_by_id = {
            1: 'ATGTTTGGG',
            2: 'GGGCCCAAAGTA',
        }

        # (feature id, start, end, strand) for each CDS, in insertion order.
        placements = [
            (1, 0, 9, 1),
            (2, 6, 18, -1),
        ]
        originals = []
        for feature_id, start, end, strand in placements:
            location = FeatureLocation(start, end, strand=strand)
            feature = SeqFeature(location, type='CDS', id=feature_id)
            seq_record.features.append(feature)
            self._assert_feature_seq(feature, seq_record, expected_seq_by_id)
            originals.append(feature)
        first, second = originals

        # Fix a copy so the original record is left untouched.
        fixed_record = copy.deepcopy(seq_record)
        self.assertTrue(fix_overlap_pair(first.id, second.id, fixed_record))

        self.assertEqual('ATGTTTGGGGGGCCCAAAGTA', str(fixed_record.seq))
        fixed_first = get_feature_by_id(fixed_record, first.id)
        fixed_second = get_feature_by_id(fixed_record, second.id)
        self.assertEqual(fixed_first.location.end,
                         fixed_second.location.start)
        for fixed_feature in (fixed_first, fixed_second):
            self._assert_feature_seq(fixed_feature, fixed_record,
                                     expected_seq_by_id)
def check_translations_conserved(original_seq_record,
                                 refactored_seq_record,
                                 interval=None):
    """Confirms that the translations of the coding features are preserved
    before and after.

    Args:
        original_seq_record: Record before refactoring.
        refactored_seq_record: Record after refactoring.
        interval: Optional interval, forwarded to
            _get_features_passing_interval_filter() for both records.

    Returns:
        True when every checked CDS feature translates identically in both
        records. Features whose id is in TRANSLATION_CONSERVED_EXCEPTIONS are
        skipped.

    Raises:
        AssertionError: If the records hold a different number of CDS
            features, a feature is missing after the refactor, or a
            translation differs. Raised explicitly so the checks are not
            stripped when Python runs with -O.
        TranslationError: Re-raised, after printing the offending feature,
            when the refactored sequence cannot be translated.
    """
    print('...Checking translation is conserved...')

    original_coding_features = [
        feature for feature in _get_features_passing_interval_filter(
            original_seq_record, interval)
        if feature.type == 'CDS'
    ]
    refactored_coding_features = [
        feature for feature in _get_features_passing_interval_filter(
            refactored_seq_record, interval)
        if feature.type == 'CDS'
    ]

    if len(original_coding_features) != len(refactored_coding_features):
        raise AssertionError("Different number of CDS features.")

    for original_feature in original_coding_features:
        if original_feature.id in TRANSLATION_CONSERVED_EXCEPTIONS:
            continue
        original_feature_seq = original_feature.extract(
            original_seq_record.seq)
        original_translation = translate_custom(str(original_feature_seq))

        refactored_feature = get_feature_by_id(refactored_seq_record,
                                               original_feature.id)
        if not refactored_feature:
            raise AssertionError("Feature lost after refactor: %s" %
                                 str(original_feature))
        refactored_feature_seq = refactored_feature.extract(
            refactored_seq_record.seq)
        try:
            refactored_translation = translate_custom(
                str(refactored_feature_seq))
        except TranslationError:
            print("Error translating %s" % str(original_feature))
            # Bare raise keeps the original traceback.
            raise

        # `not ==` rather than `!=`: stays correct even if the translation
        # type defines __eq__ without __ne__.
        if not original_translation == refactored_translation:
            raise AssertionError(
                "Translation mismatch for feature %s" % original_feature.id)

    print('......Translation conservation confirmed.')

    # All tests passed.
    return True
Example #11
0
def resolve_single_homology_issue(refactor_context, homology_pair_obj):
    """Resolves homology issues between two objects.

    Re-codes the source feature from the given first codon onwards, asking
    the codon replacement to avoid the codon currently at each position.

    Args:
        refactor_context: The RefactorContext whose SeqRecord will be mutated.
        homology_pair_obj: Object produced by
            find_features_to_check_for_homology(). This is not guaranteed to
            have all the correct keys as of the current implementation.

    Returns:
        True if the homology was fixed; False if homology_pair_obj lacks one
        of the keys 'copy_id', 'copy_seq', 'source_id' or
        'source_seq_first_codon_index'.

    Raises:
        AssertionError: If the replacement left the feature sequence
            unchanged or did not report success.
    """
    genome_record = refactor_context.get_genome_record()

    # The homology_pair_obj passed in may not have all the necessary keys,
    # for example if it's the type of copy we don't know how to deal with yet,
    # so for now we report that the homology was not fixed. Only these lookups
    # are guarded, so a KeyError raised while performing the fix itself is no
    # longer mistaken for "not fixed".
    try:
        copy_id = homology_pair_obj['copy_id']
        # Looked up only to confirm the key is present; the value is unused.
        homology_pair_obj['copy_seq']
        print('Resolving homology for %s ...' % copy_id)

        # Parse the details of the feature to modify from homology_pair_obj.
        feature_to_modify_id = homology_pair_obj['source_id']
        first_codon_to_modify = homology_pair_obj[
            'source_seq_first_codon_index']
    except KeyError:
        return False

    source_feature = get_feature_by_id(genome_record, feature_to_modify_id)
    source_feature_seq = source_feature.extract(genome_record.seq)
    # Explicit floor division: whole codons only.
    num_codons = len(source_feature) // 3

    # Map each codon index to the codon currently there, so the replacement
    # picks something different at that position.
    avoid_codons_in_positions = {}
    for codon_index in range(first_codon_to_modify, num_codons):
        codon_start = codon_index * 3
        avoid_codons_in_positions[codon_index] = str(
            source_feature_seq[codon_start:codon_start + 3])

    # Perform the fix.
    result = replace_codons_in_single_feature(
        refactor_context,
        feature_to_modify_id,
        start_codon_index=first_codon_to_modify,
        avoid_codons_in_positions=avoid_codons_in_positions)
    assert str(result['orig_feature_seq']) != str(
        result['new_feature_seq'])
    assert result['is_success'], "Resolving homology not successful."

    update_seq_record_feature(genome_record, feature_to_modify_id, result)

    # Homology fixed.
    return True
Example #12
0
def generate_recoding_stats(original_genome_record, recoded_genome_record,
                            outfile):
    """Generate stats related to recoding.

    Compares every CDS feature of the original record with the feature of the
    same id in the recoded record, codon by codon, and writes a JSON summary.

    Args:
        original_genome_record: Record before recoding.
        recoded_genome_record: Record after recoding.
        outfile: Path of the JSON file to write.

    The JSON object holds:
        * total_CDS_features: Number of CDS features in the original record.
        * total_codons: Number of codons compared.
        * num_codons_recoded: Number of compared codons that differ.
        * proportion_recoded: num_codons_recoded / total_codons.
        * avg_proportion_recoded: Mean of the per-feature proportions.
    Both proportions are 0.0 when nothing could be compared. Features missing
    from the recoded record, or too short to hold one whole codon, are
    skipped.
    """
    # We only use CDS features for this stat.
    original_coding_features = [
        feature for feature in original_genome_record.features
        if feature.type == 'CDS'
    ]

    total_coding_features = len(original_coding_features)
    total_codons = 0
    num_codons_recoded = 0
    percent_recoded_list = []
    for idx, feature in enumerate(original_coding_features):
        print('Analyzing feature %s,  %d of %d' % (feature.id, idx + 1,
                                                   total_coding_features))
        recoded_feature = get_feature_by_id(recoded_genome_record, feature.id)
        if recoded_feature is None:
            continue
        orig_seq = feature.extract(original_genome_record).seq
        recoded_seq = recoded_feature.extract(recoded_genome_record).seq

        # Explicit floor division: whole codons only.
        feature_codons = len(orig_seq) // 3
        if feature_codons == 0:
            # No whole codon to compare; the per-feature ratio below would
            # divide by zero.
            continue

        codons_recoded_for_feature = 0
        for codon_index in range(0, len(orig_seq), 3):
            total_codons += 1
            original_codon = str(orig_seq[codon_index:codon_index + 3])
            recoded_codon = str(recoded_seq[codon_index:codon_index + 3])
            if original_codon != recoded_codon:
                codons_recoded_for_feature += 1
        percent_recoded_list.append(
            float(codons_recoded_for_feature) / feature_codons)

        num_codons_recoded += codons_recoded_for_feature

    # Guard the summary ratios against an empty comparison.
    if total_codons:
        percent_recoded = float(num_codons_recoded) / total_codons
    else:
        percent_recoded = 0.0
    if percent_recoded_list:
        avg_percent_recoded = (float(sum(percent_recoded_list)) /
                               len(percent_recoded_list))
    else:
        avg_percent_recoded = 0.0

    data = {
        'total_CDS_features': total_coding_features,
        'total_codons': total_codons,
        'num_codons_recoded': num_codons_recoded,
        'proportion_recoded': percent_recoded,
        'avg_proportion_recoded': avg_percent_recoded
    }

    with open(outfile, 'w') as fh:
        fh.write(json.dumps(data, indent=4, separators=(',', ': ')))
Example #13
0
def main():
    """Muddle the ends of the features flagged in AGN_DEBUG_FILE.

    Reads the CSV at AGN_DEBUG_FILE, collects the 'ID' of every row whose
    'Separate' column is '1', calls muddle_end() with a size of 20 on each of
    those features in the record loaded from RECODED_PATH, and writes the
    record to OUTFILE in GenBank format.
    """
    with open(AGN_DEBUG_FILE) as agn_debug_fh:
        source_ids_to_muddle = [
            row['ID'] for row in csv.DictReader(agn_debug_fh)
            if row['Separate'] == '1'
        ]
    print(source_ids_to_muddle)

    record = get_genome_record(RECODED_PATH)
    refactor_context = RefactorContext(record)

    for source_feature_id in source_ids_to_muddle:
        muddle_end(get_feature_by_id(record, source_feature_id), record,
                   refactor_context, 20)

    # Disabled earlier approach (selecting features to muddle from the RBS /
    # head copy features instead of the CSV), kept for reference.

    # rbs_cp_features = [feature for feature in record.features if
    #         feature.type == InsertType.FIX_OVERLAP_RBS_COPY]

    # overlap_head_cp_features = [feature for feature in record.features if
    #         feature.type == InsertType.FIX_OVERLAP_HEAD_COPY]

    # print 'rbs', len(rbs_cp_features)
    # print 'head', len(overlap_head_cp_features)

    # head_cp_feature_ids = set()
    # for head_feature in overlap_head_cp_features:
    #     source_feature_id = re.match(r'(?P<feature_id>.*)_' + InsertType.FIX_OVERLAP_HEAD_COPY, head_feature.id).group('feature_id')
    #     head_cp_feature_ids.add(source_feature_id)

    # count = 0
    # for rbs_cp_feature in rbs_cp_features:
    #     downstream = False
    #     match = re.match(r'(?P<feature_id>.*)_upstream_' + InsertType.FIX_OVERLAP_RBS_COPY, rbs_cp_feature.id)
    #     if not match:
    #         downstream = True
    #         match = re.match(r'(?P<feature_id>.*)_downstream_' + InsertType.FIX_OVERLAP_RBS_COPY, rbs_cp_feature.id)
    #     source_feature_id = match.group('feature_id')
    #     if source_feature_id in head_cp_feature_ids:
    #         print 'HAS HEAD_CP', source_feature_id
    #         continue
    #     source_feature = get_feature_by_id(record, source_feature_id)
    #     num_part = re.match(r'.*_(?P<num>[0-9]+)', source_feature_id).group('num')

    #     if source_feature.strand == 1:
    #         assert source_feature.location.start > rbs_cp_feature.location.start
    #         actual_source_id = ID_ROOT + str(int(num_part) - 1)
    #     else:
    #         assert source_feature.location.start < rbs_cp_feature.location.start
    #         actual_source_id = ID_ROOT + str(int(num_part) + 1)
    #     try:
    #         actual_source_feature = get_feature_by_id(record, actual_source_id)
    #     except:
    #         print 'NOT MUDDLING', rbs_cp_feature.id
    #         continue

    #     if actual_source_feature.strand != rbs_cp_feature.strand:
    #         print 'NOT MUDDLING', rbs_cp_feature.id
    #         continue

    #     print 'MUDDLING', rbs_cp_feature.id
    #     muddle_end(actual_source_feature, record, refactor_context, len(rbs_cp_feature))

    #     count += 1

    # print 'COUNT', count

    with open(OUTFILE, 'w') as out_fh:
        SeqIO.write(record, out_fh, 'genbank')
Example #14
0
    def __init__(self, feature_id, original_seq_record, **kwargs):
        """Default constructor.

        Args:
            feature_id: The string id of the feature to profile.
            original_seq_record: The SeqRecord object that contains all the
                data.
            kwargs: Dictionary of any of the following keys
                * error_tolerance
                * error_inc
                * value_range
                * sliding_window_size
                * values: Precomputed score profile, one entry per codon of
                    the feature. When absent the profile is computed.

        Raises:
            ValueError: If the feature's strand is neither 1 nor -1.
        """
        self.original_seq_record = original_seq_record
        self.feature = biopython_util.get_feature_by_id(
            self.original_seq_record, feature_id)
        self.original_feature_seq = str(
            self.feature.extract(self.original_seq_record.seq))
        self.computation_cache = {}
        self.failed_error_treshold_count = 0

        # Optional settings are only stored when the caller supplied them.
        optional_settings = ('error_tolerance', 'error_inc', 'value_range',
                             'sliding_window_size')
        for setting in optional_settings:
            if setting in kwargs:
                setattr(self, setting, kwargs[setting])

        # Express the underlying sequence and the feature bounds in the
        # feature's own reading direction.
        strand = self.feature.strand
        if strand not in (1, -1):
            raise ValueError("No feature strand.")
        if strand == 1:
            self.polarity_aware_underlying_seq = self.original_seq_record.seq
            self.polarity_aware_feature_location_start = (
                self.feature.location.start)
            self.polarity_aware_feature_location_end = (
                self.feature.location.end)
        else:
            flipped_seq = self.original_seq_record.seq.reverse_complement()
            self.polarity_aware_underlying_seq = flipped_seq
            # Positions are mirrored on the reverse complement.
            flipped_start = len(flipped_seq) - self.feature.location.end
            self.polarity_aware_feature_location_start = flipped_start
            self.polarity_aware_feature_location_end = (
                flipped_start + len(self.feature))

        # Some basic validation.
        self._validate()

        # Set or compute the original score profile.
        if 'values' not in kwargs:
            self.compute_original_feature_values()
        else:
            num_codons = len(self.feature) / 3
            self.values = kwargs['values']
            assert num_codons == len(self.values), (
                "Wrong number of values passed to FeatureProfile constructor.")
Example #15
0
def handle_prfB(genome_record):
    """Modifies the genome record to make prfB a CDS and remove
    the frameshift.

    The feature with id 'ECMDS42_2390' is edited in feature orientation (one
    T of the CTTT frameshift is deleted and the preceding AGGGGG becomes
    CGTGGG), the genome sequence is replaced accordingly, the feature is
    shortened by one base and retyped as 'CDS', and every feature starting
    after prfB's start is shifted by -1.

    Args:
        genome_record: The record to modify in place.

    Returns:
        The same genome_record, modified.

    Raises:
        AssertionError: If the feature is not on the negative strand, the
            expected motifs are not found at the hard-coded offsets, or the
            result fails the final sanity checks.
    """
    prfB_locus_tag = 'ECMDS42_2390'
    prfB_feature = get_feature_by_id(genome_record, prfB_locus_tag)
    prfB_feature_seq = str(prfB_feature.extract(genome_record.seq))
    original_length = len(prfB_feature)

    # Feature is on the negative strand.
    assert prfB_feature.strand == -1

    # First delete the frameshift, CTTT -> CTT, by removing the T at the 75th
    # position of the feature (0-based index FRAMESHIFT_START + 2).
    FRAMESHIFT_START = 72
    frameshift_t_index = FRAMESHIFT_START + 2
    assert 'CTTTGAC' == str(
        prfB_feature_seq[FRAMESHIFT_START:FRAMESHIFT_START + 7])
    prfB_feature_seq = (prfB_feature_seq[:frameshift_t_index] +
                        prfB_feature_seq[frameshift_t_index + 1:])
    assert 'CTTGAC' == str(
        prfB_feature_seq[FRAMESHIFT_START:FRAMESHIFT_START + 6])

    # Next change the RBS just before it AGGGGG -> CGTGGG (as per Chris Gregg
    # data from 8/25/13).
    RBS_START = 63
    assert 'AGGGGG' == str(prfB_feature_seq[RBS_START:RBS_START + 6])
    prfB_feature_seq = (prfB_feature_seq[:RBS_START] + 'CGT' +
                        prfB_feature_seq[RBS_START + 3:])
    assert 'CGTGGG' == str(prfB_feature_seq[RBS_START:RBS_START + 6])

    # Now replace the underlying sequence of the genome. The edited sequence
    # is in feature orientation, so it is reverse-complemented first.
    updated_seq = (genome_record.seq[:prfB_feature.location.start] +
                   reverse_complement(prfB_feature_seq) +
                   genome_record.seq[prfB_feature.location.end:])
    assert len(genome_record.seq) - 1 == len(updated_seq)
    genome_record.seq = updated_seq

    # Change the prfB feature to be a CDS.
    prfB_feature.type = 'CDS'

    # Shorten the location by one base to account for the deleted T: the
    # start is kept and the end moves one position left.
    prfB_feature.location = FeatureLocation(prfB_feature.location.start,
                                            prfB_feature.location.end - 1,
                                            strand=prfB_feature.strand)

    # Update the positions of downstream features.
    updated_features = []
    for feature in genome_record.features:
        if feature.location.start > prfB_feature.location.start:
            feature = feature._shift(-1)
        updated_features.append(feature)
    genome_record.features = updated_features

    # Make sure changes went through.
    mod_prfB_feature = get_feature_by_id(genome_record, prfB_locus_tag)
    mod_prfB_feature_seq = str(mod_prfB_feature.extract(genome_record.seq))
    assert original_length - 1 == len(mod_prfB_feature)
    assert prfB_feature_seq[:6] == mod_prfB_feature_seq[:6], (
        "Before: %s, After: %s" %
        (prfB_feature_seq[:6], mod_prfB_feature_seq[:6]))
    # Report the same trailing slice that is being compared.
    assert prfB_feature_seq[-10:] == mod_prfB_feature_seq[-10:], (
        "Before: %s, After: %s" %
        (prfB_feature_seq[-10:], mod_prfB_feature_seq[-10:]))

    return genome_record
    def test_fix_overlap_pair_same_direction_reverse(self):
        """Reverse-strand overlapping pair is separated by one RBS buffer.

        Both features are on the reverse strand. After fix_overlap_pair() the
        record is expected to read: feature 1, a buffer taken from the bases
        of feature 2 that follow the shared region, then feature 2.
        """
        split_at = 12
        first_seq = 'AAACCCGGGTTTCCCAAAGTA'
        shared_seq = first_seq[split_at:]
        second_seq = shared_seq + 'CCCTTTGGGAAACCCAAACCCGGGTTTCCCAAATTTGTA'
        genome_seq = first_seq[:split_at] + second_seq
        shared_len = len(shared_seq)
        first_len = len(first_seq)

        # Spelled-out genome, for visual debugging if necessary.
        self.assertEqual(
            'AAACCCGGGTTTCCCAAAGTACCCTTTGGGAAACCCAAACCCGGGTTTCCCAAATTTGTA',
            genome_seq)
        # The shared region must be counted only once in the genome.
        self.assertEqual(
            len(genome_seq),
            first_len + len(second_seq) - shared_len)

        seq_record = SeqRecord(Seq(genome_seq, generic_dna))
        expected_seq_by_id = {1: first_seq, 2: second_seq}

        # (feature id, start, end) for each reverse-strand CDS.
        placements = [
            (1, 0, first_len),
            (2, split_at, split_at + len(second_seq)),
        ]
        originals = []
        for feature_id, start, end in placements:
            location = FeatureLocation(start, end, strand=-1)
            feature = SeqFeature(location, type='CDS', id=feature_id)
            seq_record.features.append(feature)
            self._assert_feature_seq(feature, seq_record, expected_seq_by_id)
            originals.append(feature)
        first, second = originals

        # Fix a copy so the original record is left untouched.
        fixed_record = copy.deepcopy(seq_record)
        self.assertTrue(fix_overlap_pair(first.id, second.id, fixed_record))

        fixed_seq = str(fixed_record.seq)
        rbs_size = conflicting_pair_common.RBS_BUFFER_SIZE

        expected_seq = (first_seq +
                        second_seq[shared_len:shared_len + rbs_size] +
                        second_seq)
        # Spelled-out expectation, for visual debugging if necessary.
        self.assertEqual(
            'AAACCCGGGTTTCCCAAAGTA'
            'CCCTTTGGGAAACCC'
            'CCCAAAGTACCCTTTGGGAAACCCAAACCCGGGTTTCCCAAATTTGTA',
            expected_seq)

        # Cheap length check first.
        self.assertEqual(len(expected_seq), len(fixed_seq))

        # Piece-wise checks; redundant with the full comparison below but
        # they localise a failure.

        # Feature 1 is copied over in full at the front.
        self.assertEqual(first_seq, fixed_seq[:first_len])

        # The RBS extension directly follows feature 1.
        buffer_end = first_len + rbs_size
        self.assertEqual(expected_seq[first_len:buffer_end],
                         fixed_seq[first_len:buffer_end])

        # Feature 2 is copied over in full at the end.
        tail_start = len(fixed_seq) - len(second_seq)
        self.assertEqual(second_seq, fixed_seq[tail_start:])

        # The whole sequence is correct.
        self.assertEqual(expected_seq, fixed_seq)

        # The features are preserved, one buffer apart.
        fixed_first = get_feature_by_id(fixed_record, first.id)
        fixed_second = get_feature_by_id(fixed_record, second.id)
        self.assertEqual(fixed_first.location.end + rbs_size,
                         fixed_second.location.start)
        for fixed_feature in (fixed_first, fixed_second):
            self._assert_feature_seq(fixed_feature, fixed_record,
                                     expected_seq_by_id)
Example #17
0
def analyze_codon_motif(motif_set, original_record, recoded_record,
                        report_file):
    """Report motif codons located just upstream of another CDS.

    For every conflicting pair of CDS features that both lie on strand 1
    (pairs involving the gene prfB are ignored), the codons of the upstream
    feature are scanned, starting 20 bases before the start of the downstream
    feature. Every codon that is in motif_set is recorded together with the
    codon at the same offset in the recoded upstream feature and with the RBS
    expression of the downstream feature before and after recoding. The
    collected rows are written to report_file as CSV.

    NOTE: Only pairs where both features are on strand 1 are analyzed; all
    other strand combinations are skipped.

    Args:
        motif_set: Set of codons to look for.
        original_record: Record providing the original sequence and features.
        recoded_record: Record providing the recoded sequence and features.
        report_file: Path of the CSV report to write.
    """
    # How far before the downstream feature's start the scan begins.
    ROI_UPSTREAM_BASES = 20

    FIELD_NAMES = [
        'pos',
        'recoded_pos',
        'ref',
        'alt',
        'orig_rbs_expression',
        'recoded_rbs_expression',
        'delta_expression',
        'strand',
    ]

    motif_hits = []

    original_seq = str(original_record.seq).upper()
    recoded_seq = str(recoded_record.seq).upper()

    # Identify overlapping features (this includes those close enough for RBS)
    # to possibly conflict. Use cache.
    conflicting_pairs = find_all_overlaps(
        None,  # genome_record,
        None,  # forbidden_codons,
        cache=True)

    original_rbs_profiles = get_mds42_rbs_strength_profile()

    for pair in conflicting_pairs:
        upstream = pair['upstream_feature']
        downstream = pair['downstream_feature']

        if upstream.type != 'CDS' or downstream.type != 'CDS':
            continue
        # Ignore prfB.
        if (get_feature_gene(upstream) == 'prfB'
                or get_feature_gene(downstream) == 'prfB'):
            continue

        original_upstream_feature = get_feature_by_id(
            original_record, upstream.id)
        original_downstream_feature = get_feature_by_id(
            original_record, downstream.id)
        recoded_upstream_feature = get_feature_by_id(
            recoded_record, upstream.id)
        recoded_downstream_feature = get_feature_by_id(
            recoded_record, downstream.id)

        # Only forward/forward pairs are handled.
        if (original_upstream_feature.strand != 1
                or original_downstream_feature.strand != 1):
            continue

        roi_start = (original_downstream_feature.location.start -
                     ROI_UPSTREAM_BASES)

        # The before/after RBS expression depends only on the downstream
        # feature, so it is computed at most once per pair, on the first hit.
        rbs_expression = None

        for codon_index in range(0, len(original_upstream_feature), 3):
            pos = original_upstream_feature.location.start + codon_index
            if pos < roi_start:
                continue

            codon = original_seq[pos:pos + 3]
            if codon not in motif_set:
                continue

            recoded_pos = (
                recoded_upstream_feature.location.start + codon_index)
            recoded_codon = recoded_seq[recoded_pos:recoded_pos + 3]

            if rbs_expression is None:
                rbs_expression = (
                    original_rbs_profiles[original_downstream_feature.id],
                    calc_rbs_score_for_feature(
                        recoded_downstream_feature,
                        recoded_seq)['expression'],
                )
            orig_rbs_expression, recoded_rbs_expression = rbs_expression

            motif_hits.append({
                'pos': pos,
                'recoded_pos': recoded_pos,
                'ref': codon,
                'alt': recoded_codon,
                'orig_rbs_expression': orig_rbs_expression,
                'recoded_rbs_expression': recoded_rbs_expression,
                'delta_expression': (recoded_rbs_expression -
                                     orig_rbs_expression),
                'strand': 1,
            })

    print('Writing report.')
    with open(report_file, 'w') as fh:
        writer = csv.DictWriter(fh, FIELD_NAMES)
        writer.writeheader()
        writer.writerows(motif_hits)
Example #18
0
def main():
    """Apply the codon changes listed in AGN_DEBUG_FILE to the recoded genome.

    Each row of the CSV is processed as follows:
      * Rows whose 'Codon' is not 3 characters long, whose 'Sequence' is
        shorter than 14 characters, or whose original codon (characters 8-10
        of 'Sequence') is not three upper-case letters are skipped.
      * The last match of the two preceding triplets plus the original codon
        is located in the feature's sequence in MDS42_RECORD; this gives the
        codon position within the feature.
      * The codon at that position in the recoded feature is replaced with
        the row's 'Codon'. If the row's 'next' column holds a 3-character
        codon, the following codon is replaced with it as well.

    The modified record is written to OUTFILE in GenBank format.

    Raises:
        ValueError: If the context sequence of a row is not found in the
            original feature sequence.
        AssertionError: If the original codon is not among the synonymous
            codons of the recoded codon being replaced, or if the recoded
            codon after it is not among the synonymous codons of 'next'.
    """
    record = get_genome_record(RECODED_PATH)

    with open(AGN_DEBUG_FILE) as agn_debug_fh:
        for row in csv.DictReader(agn_debug_fh):
            new_codon = row['Codon']
            if len(new_codon) != 3:
                continue

            feature_id = row['ID']
            print(feature_id)

            # Identify codon positions from original record.
            sequence = row['Sequence']
            if len(sequence) < 14:
                # Too hard.
                continue
            orig_codon = sequence[8:11]
            if not re.match(r'[A-Z]{3}', orig_codon):
                continue

            # Context used to locate the codon: the two triplets before it
            # followed by the codon itself (one separator character between
            # triplets is skipped).
            seq_ending_with_codon = (
                sequence[0:3].upper() + sequence[4:7].upper() + orig_codon)
            print(seq_ending_with_codon)

            orig_feature = get_feature_by_id(MDS42_RECORD, feature_id)
            orig_feature_seq = str(orig_feature.extract(MDS42_RECORD.seq))

            new_feature = get_feature_by_id(record, feature_id)
            new_feature_seq = str(new_feature.extract(record.seq))

            # Find the last occurrence.
            last_pos = None
            for match in re.finditer(seq_ending_with_codon, orig_feature_seq):
                last_pos = match.start()
            print(last_pos)
            if last_pos is None:
                # Fail with a clear message instead of a TypeError on
                # None + 6 below.
                raise ValueError(
                    'Context %r not found in original feature %s' %
                    (seq_ending_with_codon, feature_id))

            # The codon of interest follows the two context triplets.
            codon_pos = last_pos + 6
            print('expect %s' % orig_feature_seq[codon_pos:codon_pos + 3])

            # Make sure codon at that position in recoded record is synonymous.
            codon_at_index = new_feature_seq[codon_pos:codon_pos + 3]
            print(codon_at_index)
            syn = ORIGINAL_CODON_USAGE_MEMEX.get_synonymous_codons(
                codon_at_index)
            print(syn)
            assert orig_codon in syn

            # Make the change in the target record.
            swap_feature_codon_at_position(record, new_feature.id, codon_pos,
                                           codon_at_index, new_codon)

            # Maybe do the next one.
            maybe_next = row['next'].upper()
            if len(maybe_next) != 3:
                continue
            next_pos = codon_pos + 3
            syn = ORIGINAL_CODON_USAGE_MEMEX.get_synonymous_codons(maybe_next)
            codon_at_index_next = new_feature_seq[next_pos:next_pos + 3]
            assert codon_at_index_next in syn
            swap_feature_codon_at_position(record, new_feature.id, next_pos,
                                           codon_at_index_next, maybe_next)

    with open(OUTFILE, 'w') as fh:
        SeqIO.write(record, fh, 'genbank')