def test_fix_overlap_pair_opposite_directions(self):
    """Account for RBS buffer on both strands.

    Feature 1 is placed on the reverse strand and feature 2 on the forward
    strand, overlapping from position 21 onward. After the fix both
    features must be intact and separated by two RBS_BUFFER_SIZE-long
    segments.
    """
    rbs_size = conflicting_pair_common.RBS_BUFFER_SIZE

    overlap_start_pos = 21
    feature_1_seq = 'AAACCCGGGTTTCCCAAACCCATGTTTAAAGGGTTTCCC'
    feature_2_seq = ''.join([
        feature_1_seq[overlap_start_pos:],
        'CCCTTTGGGAAACCCAAACCCGGGTTTCCCAAATTTAAA',
    ])
    whole_seq = feature_1_seq[:overlap_start_pos] + feature_2_seq
    overlap_size = len(feature_1_seq) - overlap_start_pos

    # Sanity check: the overlap is shared, so it is only counted once.
    self.assertEqual(
        len(whole_seq),
        len(feature_1_seq) + len(feature_2_seq) - overlap_size)

    # Build the record and attach both features, validating each one as
    # soon as it has been added.
    seq_record = SeqRecord(Seq(whole_seq, generic_dna))
    feature_id_to_seq_map = {1: feature_1_seq, 2: feature_2_seq}

    # (feature id, start position, strand); the end follows from the
    # length of the feature's sequence.
    placements = [(1, 0, -1), (2, overlap_start_pos, 1)]
    created = []
    for feature_id, start, strand in placements:
        end = start + len(feature_id_to_seq_map[feature_id])
        feature = SeqFeature(
            FeatureLocation(start, end, strand=strand),
            type='CDS', id=feature_id)
        seq_record.features.append(feature)
        self._assert_feature_seq(
            feature, seq_record, feature_id_to_seq_map)
        created.append(feature)
    feature_1, feature_2 = created

    # Run the fix on a copy so the input record stays untouched.
    updated_seq_record = copy.deepcopy(seq_record)
    self.assertTrue(
        fix_overlap_pair(feature_1.id, feature_2.id, updated_seq_record))

    result_seq = str(updated_seq_record.seq)
    rbs_after_overlap = feature_2_seq[overlap_size:overlap_size + rbs_size]
    rbs_before_overlap = feature_1_seq[
        overlap_start_pos - rbs_size:overlap_start_pos]
    EXPECTED_SEQUENCE = (
        feature_1_seq + rbs_after_overlap + rbs_before_overlap +
        feature_2_seq)
    self.assertEqual(EXPECTED_SEQUENCE, result_seq)

    new_feature_1 = get_feature_by_id(updated_seq_record, feature_1.id)
    new_feature_2 = get_feature_by_id(updated_seq_record, feature_2.id)
    self.assertEqual(
        new_feature_1.location.end + 2 * rbs_size,
        new_feature_2.location.start)
    for new_feature in (new_feature_1, new_feature_2):
        self._assert_feature_seq(
            new_feature, updated_seq_record, feature_id_to_seq_map)
def test_fix_overlap_pair_same_direction_forward(self):
    """Account for RBS buffer for right strand.

    Both features are on the forward strand and overlap from position 39
    onward. After the fix they must be separated by one
    RBS_BUFFER_SIZE-long segment.
    """
    rbs_size = conflicting_pair_common.RBS_BUFFER_SIZE

    overlap_start_pos = 39
    feature_1_seq = 'ATGTTTGGGAAACCCAAACCCGGGTTTAAACCCGGGTTTATGAAAGGG'
    feature_2_seq = feature_1_seq[overlap_start_pos:] + 'CCCAAATTT'
    whole_seq = feature_1_seq[:overlap_start_pos] + feature_2_seq

    seq_record = SeqRecord(Seq(whole_seq, generic_dna))
    feature_id_to_seq_map = {1: feature_1_seq, 2: feature_2_seq}

    # (feature id, start position); both features share strand +1 and
    # each is validated right after being attached.
    created = []
    for feature_id, start in ((1, 0), (2, overlap_start_pos)):
        end = start + len(feature_id_to_seq_map[feature_id])
        feature = SeqFeature(
            FeatureLocation(start, end, strand=1),
            type='CDS', id=feature_id)
        seq_record.features.append(feature)
        self._assert_feature_seq(
            feature, seq_record, feature_id_to_seq_map)
        created.append(feature)
    feature_1, feature_2 = created

    # Run the fix on a copy so the input record stays untouched.
    updated_seq_record = copy.deepcopy(seq_record)
    self.assertTrue(
        fix_overlap_pair(feature_1.id, feature_2.id, updated_seq_record))

    rbs_before_overlap = feature_1_seq[
        overlap_start_pos - rbs_size:overlap_start_pos]
    EXPECTED_SEQUENCE = feature_1_seq + rbs_before_overlap + feature_2_seq
    self.assertEqual(EXPECTED_SEQUENCE, str(updated_seq_record.seq))

    new_feature_1 = get_feature_by_id(updated_seq_record, feature_1.id)
    new_feature_2 = get_feature_by_id(updated_seq_record, feature_2.id)
    self.assertEqual(
        new_feature_1.location.end + rbs_size,
        new_feature_2.location.start)
    for new_feature in (new_feature_1, new_feature_2):
        self._assert_feature_seq(
            new_feature, updated_seq_record, feature_id_to_seq_map)
def test_fix_overlaps_simple(self):
    """ConflictingPairFixer.fix_overlaps() on one overlapping pair.

    Feature 1 (forward strand) and feature 2 (reverse strand) overlap
    from position 6 onward. After the fix the two features must sit
    directly next to each other with nothing inserted in between.
    """
    feature_1_seq = 'ATGTTTGGG'
    feature_2_seq = 'GGGCCCAAAGTA'
    overlap_start_pos = 6
    whole_seq = feature_1_seq[:overlap_start_pos] + feature_2_seq

    seq_record = SeqRecord(Seq(whole_seq, generic_dna))
    feature_id_to_seq_map = {
        1: feature_1_seq,
        2: feature_2_seq,
    }

    feature_1 = SeqFeature(
        FeatureLocation(0, len(feature_1_seq), strand=1),
        type='CDS', id=1)
    seq_record.features.append(feature_1)
    self._assert_feature_seq(feature_1, seq_record, feature_id_to_seq_map)

    feature_2 = SeqFeature(
        FeatureLocation(
            overlap_start_pos,
            overlap_start_pos + len(feature_2_seq),
            strand=-1),
        type='CDS', id=2)
    seq_record.features.append(feature_2)
    self._assert_feature_seq(feature_2, seq_record, feature_id_to_seq_map)

    # Build and use the overlap fixer on a copy of the record.
    updated_seq_record = copy.deepcopy(seq_record)
    refactor_context = RefactorContext(updated_seq_record)
    refactor_context.set_forbidden_codon_set(set(['GGG']))
    cpf = ConflictingPairFixer(refactor_context)
    cpf.fix_overlaps()

    EXPECTED_SEQUENCE = feature_1_seq + feature_2_seq
    self.assertEqual(EXPECTED_SEQUENCE, str(updated_seq_record.seq))

    new_feature_1 = get_feature_by_id(updated_seq_record, feature_1.id)
    new_feature_2 = get_feature_by_id(updated_seq_record, feature_2.id)
    self.assertEqual(
        new_feature_1.location.end, new_feature_2.location.start)
    self._assert_feature_seq(
        new_feature_1, updated_seq_record, feature_id_to_seq_map)
    self._assert_feature_seq(
        new_feature_2, updated_seq_record, feature_id_to_seq_map)
def test_add_feature_to_seq_record__regular(self):
    """A feature added to a record can be looked up again by its id."""
    feature_id = '1'
    record = SeqRecord(Seq('ATGTTTGGGTAGAGTA', generic_dna))
    added_feature = SeqFeature(
        FeatureLocation(4, 7), type='CDS', id=feature_id)

    add_feature_to_seq_record(record, added_feature)

    found_feature = get_feature_by_id(record, feature_id)
    self.assertEqual(added_feature, found_feature)
def test_fix_overlaps_third_feature_none_overlap(self):
    """fix_overlaps() with an overlapping pair plus an unrelated feature.

    Features 1 and 2 overlap from position 6 onward; feature 3 follows
    feature 2 after a stretch of intergenic sequence and overlaps
    nothing. After the fix, features 1 and 2 must be adjacent and the
    junk sequence and feature 3 must be unchanged.
    """
    feature_1_seq = 'ATGTTTGGG'
    feature_2_seq = 'GGGCCCAAAGTA'
    inter_f2_f3_junk = 'GTAGCTATCTATCTGGTTAAATC'
    feature_3_seq = 'ATGAAACCCTTTGGGTTTCCCAAA'
    overlap_start_pos = 6
    whole_seq = (
        feature_1_seq[:overlap_start_pos] +
        feature_2_seq +
        inter_f2_f3_junk +
        feature_3_seq)

    seq_record = SeqRecord(Seq(whole_seq, generic_dna))
    feature_id_to_seq_map = {
        1: feature_1_seq,
        2: feature_2_seq,
        3: feature_3_seq,
    }

    feature_1 = SeqFeature(
        FeatureLocation(0, len(feature_1_seq), strand=1),
        type='CDS', id=1)
    seq_record.features.append(feature_1)
    self._assert_feature_seq(feature_1, seq_record, feature_id_to_seq_map)

    feature_2_loc = FeatureLocation(
        overlap_start_pos,
        overlap_start_pos + len(feature_2_seq),
        strand=-1)
    feature_2 = SeqFeature(feature_2_loc, type='CDS', id=2)
    seq_record.features.append(feature_2)
    self._assert_feature_seq(feature_2, seq_record, feature_id_to_seq_map)

    # Feature 3 begins right after the junk that follows feature 2.
    feature_3_start = feature_2_loc.end + len(inter_f2_f3_junk)
    feature_3_end = feature_3_start + len(feature_3_seq)
    feature_3 = SeqFeature(
        FeatureLocation(feature_3_start, feature_3_end, strand=1),
        type='CDS', id=3)
    seq_record.features.append(feature_3)
    self._assert_feature_seq(feature_3, seq_record, feature_id_to_seq_map)

    # Build and use the overlap fixer on a copy of the record.
    updated_seq_record = copy.deepcopy(seq_record)
    refactor_context = RefactorContext(updated_seq_record)
    refactor_context.set_forbidden_codon_set(set(['GGG']))
    cpf = ConflictingPairFixer(refactor_context)
    cpf.fix_overlaps()

    EXPECTED_SEQUENCE = (
        feature_1_seq +
        feature_2_seq +
        inter_f2_f3_junk +
        feature_3_seq)
    self.assertEqual(EXPECTED_SEQUENCE, str(updated_seq_record.seq))

    # Every feature must still extract to its original sequence.
    for feature_id in feature_id_to_seq_map:
        new_feature = get_feature_by_id(updated_seq_record, feature_id)
        self._assert_feature_seq(
            new_feature, updated_seq_record, feature_id_to_seq_map)
def check_rnas_conserved(original_seq_record, refactored_seq_record,
                         ignore_problems_in_feature_ids=None, interval=None):
    """Check that RNA coding sequences are conserved.

    Args:
        original_seq_record: Record before refactoring.
        refactored_seq_record: Record after refactoring.
        ignore_problems_in_feature_ids: Optional collection of feature ids
            to leave out of the check. Defaults to no exclusions.
        interval: Optional interval; when given, only features for which
            does_interval_overlap_feature(interval, feature) is true are
            checked.

    Returns:
        True when every checked RNA feature extracts to the same sequence
        in both records.

    Raises:
        AssertionError: If the two records have a different number of RNA
            features, if an RNA feature is missing from the refactored
            record, or if an RNA sequence changed.
    """
    print('...Checking RNA\'s are conserved...')

    RNA_TYPES = frozenset(['misc_RNA', 'ncRNA', 'rRNA', 'tRNA', 'tmRNA'])

    # A None default avoids sharing one mutable list between calls.
    if ignore_problems_in_feature_ids is None:
        ignored_ids = ()
    else:
        ignored_ids = ignore_problems_in_feature_ids

    def _rna_features(seq_record):
        """Return the non-ignored RNA features of a record (in interval)."""
        if interval:
            candidates = [
                f for f in seq_record.features
                if does_interval_overlap_feature(interval, f)
            ]
        else:
            candidates = seq_record.features
        return [
            f for f in candidates
            if f.type in RNA_TYPES and f.id not in ignored_ids
        ]

    original_rna_features = _rna_features(original_seq_record)
    refactored_rna_features = _rna_features(refactored_seq_record)

    # Raise explicitly rather than assert so the check survives python -O.
    if len(original_rna_features) != len(refactored_rna_features):
        raise AssertionError("Different number of RNA features.")

    for original_feature in original_rna_features:
        original_feature_seq = original_feature.extract(
            original_seq_record.seq)

        refactored_feature = get_feature_by_id(
            refactored_seq_record, original_feature.id)
        if refactored_feature is None:
            # Same failure mode as the translation check: report the lost
            # feature instead of failing with an AttributeError on None.
            raise AssertionError(
                "Feature lost after refactor: %s" % str(original_feature))
        refactored_feature_seq = refactored_feature.extract(
            refactored_seq_record.seq)

        if str(original_feature_seq) != str(refactored_feature_seq):
            raise AssertionError(
                "RNA not conserved for %s" % original_feature.id)

    print('......RNA conservation confirmed.')
    return True
def analyze_variation(original_record, recoded_record):
    """Analyzes the variation between the two records.

    The original and recoded records must have the same feature ids.
    Only 'CDS' features are considered. A feature whose sequence length
    differs between the two records is reported and left out of all
    counts.

    Returns:
        Dictionary with the following keys:
            * total_codons: Number of codons in the counted features.
            * reassigned_codons: Number of codons that differ.
            * codon_similarity: Proportion of codons unchanged.
            * total_bases: Number of bases in the counted features.
            * reassigned_bases: Number of bases that differ, as summed
                from _num_differing_bases() over the changed codons.
            * base_similarity: Proportion of bases unchanged.
        When nothing was counted (no CDS features, or all of them
        omitted) both similarities are reported as 1.0, since nothing
        was changed.
    """
    def _unchanged_fraction(total, changed):
        """Return the unchanged proportion; 1.0 when total is zero."""
        if not total:
            return 1.0
        return float(total - changed) / total

    total_codons = 0
    reassigned_codons = 0
    total_bases = 0
    reassigned_bases = 0

    coding_features = [
        feature for feature in original_record.features
        if feature.type == 'CDS'
    ]
    for orig_feature in coding_features:
        recoded_feature = get_feature_by_id(recoded_record, orig_feature.id)
        orig_seq = orig_feature.extract(original_record.seq)
        recoded_seq = recoded_feature.extract(recoded_record.seq)
        if len(orig_seq) != len(recoded_seq):
            # Codon-by-codon comparison is meaningless if lengths differ.
            print('>>>> Omitting ' + str(orig_feature))
            continue

        total_bases += len(orig_feature)
        # Floor division keeps the codon count an integer.
        total_codons += len(orig_feature) // 3

        for codon_index in range(0, len(orig_seq), 3):
            orig_codon = str(orig_seq[codon_index:codon_index + 3])
            recoded_codon = str(recoded_seq[codon_index:codon_index + 3])
            if orig_codon != recoded_codon:
                reassigned_codons += 1
                reassigned_bases += _num_differing_bases(
                    orig_codon, recoded_codon)

    return {
        'total_codons': total_codons,
        'reassigned_codons': reassigned_codons,
        'codon_similarity': _unchanged_fraction(
            total_codons, reassigned_codons),
        'total_bases': total_bases,
        'reassigned_bases': reassigned_bases,
        'base_similarity': _unchanged_fraction(
            total_bases, reassigned_bases),
    }
def test_fix_overlap_pair_opposing_strands(self):
    """This should simply pull them apart, without adding anything in
    between.
    """
    feature_1_seq = 'ATGTTTGGG'
    feature_2_seq = 'GGGCCCAAAGTA'
    feature_id_to_seq_map = {1: feature_1_seq, 2: feature_2_seq}

    seq_record = SeqRecord(Seq('ATGTTTGGGCCCAAAGTA', generic_dna))

    # (feature id, start, end, strand); each feature is validated as soon
    # as it has been attached to the record.
    placements = [(1, 0, 9, 1), (2, 6, 18, -1)]
    created = []
    for feature_id, start, end, strand in placements:
        feature = SeqFeature(
            FeatureLocation(start, end, strand=strand),
            type='CDS', id=feature_id)
        seq_record.features.append(feature)
        self._assert_feature_seq(
            feature, seq_record, feature_id_to_seq_map)
        created.append(feature)
    feature_1, feature_2 = created

    # Run the fix on a copy so the input record stays untouched.
    updated_seq_record = copy.deepcopy(seq_record)
    self.assertTrue(
        fix_overlap_pair(feature_1.id, feature_2.id, updated_seq_record))
    self.assertEqual(
        'ATGTTTGGGGGGCCCAAAGTA', str(updated_seq_record.seq))

    new_feature_1 = get_feature_by_id(updated_seq_record, feature_1.id)
    new_feature_2 = get_feature_by_id(updated_seq_record, feature_2.id)
    self.assertEqual(
        new_feature_1.location.end, new_feature_2.location.start)
    for new_feature in (new_feature_1, new_feature_2):
        self._assert_feature_seq(
            new_feature, updated_seq_record, feature_id_to_seq_map)
def check_translations_conserved(original_seq_record, refactored_seq_record,
                                 interval=None):
    """Confirms that the translations of the coding features are preserved
    before and after.

    Args:
        original_seq_record: Record before refactoring.
        refactored_seq_record: Record after refactoring.
        interval: Optional interval handed to
            _get_features_passing_interval_filter() to limit which
            features are checked.

    Returns:
        True when every checked CDS feature translates identically in
        both records. Features whose id is in
        TRANSLATION_CONSERVED_EXCEPTIONS are skipped.

    Raises:
        AssertionError: If the number of CDS features differs, a feature
            is missing from the refactored record, or a translation
            changed.
        TranslationError: Re-raised when the refactored feature cannot be
            translated.
    """
    print('...Checking translation is conserved...')

    original_coding_features = [
        feature for feature in _get_features_passing_interval_filter(
            original_seq_record, interval)
        if feature.type == 'CDS'
    ]
    refactored_coding_features = [
        feature for feature in _get_features_passing_interval_filter(
            refactored_seq_record, interval)
        if feature.type == 'CDS'
    ]

    # Raise explicitly rather than assert so the check survives python -O.
    if len(original_coding_features) != len(refactored_coding_features):
        raise AssertionError("Different number of CDS features.")

    for original_feature in original_coding_features:
        if original_feature.id in TRANSLATION_CONSERVED_EXCEPTIONS:
            continue

        original_feature_seq = original_feature.extract(
            original_seq_record.seq)
        original_translation = translate_custom(str(original_feature_seq))

        refactored_feature = get_feature_by_id(
            refactored_seq_record, original_feature.id)
        if not refactored_feature:
            raise AssertionError(
                "Feature lost after refactor: %s" % str(original_feature))
        refactored_feature_seq = refactored_feature.extract(
            refactored_seq_record.seq)

        try:
            refactored_translation = translate_custom(
                str(refactored_feature_seq))
        except TranslationError:
            print("Error translating %s" % str(original_feature))
            # Bare raise keeps the original traceback intact.
            raise

        if original_translation != refactored_translation:
            raise AssertionError(
                "Translation mismatch for feature %s" % original_feature.id)

    print('......Translation conservation confirmed.')

    # All tests passed.
    return True
def resolve_single_homology_issue(refactor_context, homology_pair_obj):
    """Resolves homology issues between two objects.

    Every codon of the source feature from 'source_seq_first_codon_index'
    to the end is marked as "avoid this codon at this position", and
    replace_codons_in_single_feature() is asked to recode that range.

    Args:
        refactor_context: The RefactorContext whose SeqRecord will be
            mutated.
        homology_pair_obj: Object produced by
            find_features_to_check_for_homology(). This is not guaranteed
            to have all the correct keys as of the current
            implementation. Keys read here: 'copy_id', 'copy_seq',
            'source_id', 'source_seq_first_codon_index'.

    Returns:
        True if the homology was fixed, False if a KeyError occurred
        (typically because homology_pair_obj lacks a required key).

    Raises:
        AssertionError: If the codon replacement reports failure or
            leaves the feature sequence unchanged.
    """
    genome_record = refactor_context.get_genome_record()

    # The homology_pair_obj passed in may not have all the necessary keys,
    # for example if it's the type of copy we don't know how to deal with
    # yet, so for now we just try-catch any KeyError and report that the
    # homology was not fixed.
    # NOTE(review): this also swallows a KeyError raised inside the
    # helpers called below; kept as is because it is marked deliberate.
    try:
        copy_id = homology_pair_obj['copy_id']
        # The value is not used, but the lookup makes an object without
        # 'copy_seq' count as "cannot fix" like any other missing key.
        homology_pair_obj['copy_seq']
        print('Resolving homology for %s ...' % copy_id)

        # Parse the details of the feature to modify.
        feature_to_modify_id = homology_pair_obj['source_id']
        first_codon_to_modify = homology_pair_obj[
            'source_seq_first_codon_index']

        source_feature = get_feature_by_id(
            genome_record, feature_to_modify_id)
        source_feature_seq = source_feature.extract(genome_record.seq)

        # Floor division keeps the codon count an integer for range().
        num_codons = len(source_feature) // 3
        avoid_codons_in_positions = {}
        for codon_index in range(first_codon_to_modify, num_codons):
            codon_start = codon_index * 3
            avoid_codons_in_positions[codon_index] = str(
                source_feature_seq[codon_start:codon_start + 3])

        # Perform the fix.
        result = replace_codons_in_single_feature(
            refactor_context,
            feature_to_modify_id,
            start_codon_index=first_codon_to_modify,
            avoid_codons_in_positions=avoid_codons_in_positions)

        # Check success first so that a failed fix is reported with its
        # descriptive message instead of a bare AssertionError.
        assert result['is_success'], "Resolving homology not successful."
        assert str(result['orig_feature_seq']) != str(
            result['new_feature_seq']), (
                "Resolving homology left the feature sequence unchanged.")

        update_seq_record_feature(
            genome_record, feature_to_modify_id, result)

        # Homology fixed.
        return True
    except KeyError:
        return False
def generate_recoding_stats(original_genome_record, recoded_genome_record,
                            outfile):
    """Generate stats related to recoding and write them as JSON.

    Only 'CDS' features of the original record are considered. A feature
    that has no counterpart in the recoded record, or that is shorter
    than one codon, is skipped.

    Args:
        original_genome_record: Record before recoding.
        recoded_genome_record: Record after recoding.
        outfile: Path of the JSON file to write.

    The JSON object has the keys 'total_CDS_features', 'total_codons',
    'num_codons_recoded', 'proportion_recoded' (recoded codons over all
    counted codons) and 'avg_proportion_recoded' (mean of the per-feature
    proportions). Both proportions are 0.0 when nothing was counted.
    """
    def _ratio(numerator, denominator):
        """Return numerator / denominator as a float, 0.0 if undefined."""
        if not denominator:
            return 0.0
        return float(numerator) / denominator

    # We only use CDS features for this stat.
    original_coding_features = [
        feature for feature in original_genome_record.features
        if feature.type == 'CDS'
    ]
    total_coding_features = len(original_coding_features)

    total_codons = 0
    num_codons_recoded = 0
    percent_recoded_list = []
    for idx, feature in enumerate(original_coding_features):
        print('Analyzing feature %s, %d of %d' % (
            feature.id, idx + 1, total_coding_features))
        recoded_feature = get_feature_by_id(
            recoded_genome_record, feature.id)
        if recoded_feature is None:
            continue

        orig_seq = feature.extract(original_genome_record).seq
        recoded_seq = recoded_feature.extract(recoded_genome_record).seq

        feature_codons = len(orig_seq) // 3
        if not feature_codons:
            # No complete codon: there is no per-feature proportion.
            continue

        codons_recoded_for_feature = 0
        for codon_index in range(0, len(orig_seq), 3):
            total_codons += 1
            original_codon = str(orig_seq[codon_index:codon_index + 3])
            recoded_codon = str(recoded_seq[codon_index:codon_index + 3])
            if original_codon != recoded_codon:
                codons_recoded_for_feature += 1

        percent_recoded_list.append(
            _ratio(codons_recoded_for_feature, feature_codons))
        num_codons_recoded += codons_recoded_for_feature

    data = {
        'total_CDS_features': total_coding_features,
        'total_codons': total_codons,
        'num_codons_recoded': num_codons_recoded,
        'proportion_recoded': _ratio(num_codons_recoded, total_codons),
        'avg_proportion_recoded': _ratio(
            sum(percent_recoded_list), len(percent_recoded_list)),
    }

    with open(outfile, 'w') as fh:
        fh.write(json.dumps(data, indent=4, separators=(',', ': ')))
def main():
    """Muddle the ends of the features flagged in AGN_DEBUG_FILE.

    Reads AGN_DEBUG_FILE as CSV, collects the 'ID' of every row whose
    'Separate' column is '1', calls muddle_end() on each of those
    features in the record loaded from RECODED_PATH, and writes the
    resulting record to OUTFILE in GenBank format.
    """
    # NOTE(review): meaning of this value is defined by muddle_end();
    # presumably the number of bases to muddle -- confirm.
    MUDDLE_SIZE = 20

    with open(AGN_DEBUG_FILE) as agn_debug_fh:
        reader = csv.DictReader(agn_debug_fh)
        source_ids_to_muddle = [
            row['ID'] for row in reader if row['Separate'] == '1'
        ]
    print(source_ids_to_muddle)

    record = get_genome_record(RECODED_PATH)
    refactor_context = RefactorContext(record)

    for source_feature_id in source_ids_to_muddle:
        source_feature = get_feature_by_id(record, source_feature_id)
        muddle_end(source_feature, record, refactor_context, MUDDLE_SIZE)

    # NOTE: an earlier, disabled approach that derived the features to
    # muddle from the FIX_OVERLAP_RBS_COPY / FIX_OVERLAP_HEAD_COPY insert
    # features was removed from here; see version control if needed.

    with open(OUTFILE, 'w') as fh:
        SeqIO.write(record, fh, 'genbank')
def __init__(self, feature_id, original_seq_record, **kwargs):
    """Default constructor.

    Args:
        feature_id: The string id of the feature to profile.
        original_seq_record: The SeqRecord object that contains all the
            data.
        kwargs: Dictionary of any of the following keys
            * error_tolerance
            * error_inc
            * value_range
            * sliding_window_size
            * values: Precomputed profile values, one per codon of the
                feature. When absent the values are computed here.

    Raises:
        ValueError: If the feature's strand is neither 1 nor -1.
    """
    self.original_seq_record = original_seq_record
    self.feature = biopython_util.get_feature_by_id(
        self.original_seq_record, feature_id)
    self.original_feature_seq = str(
        self.feature.extract(self.original_seq_record.seq))
    self.computation_cache = {}
    self.failed_error_treshold_count = 0

    # Copy over only the tuning options the caller actually supplied.
    optional_settings = (
        'error_tolerance',
        'error_inc',
        'value_range',
        'sliding_window_size',
    )
    for setting in optional_settings:
        if setting in kwargs:
            setattr(self, setting, kwargs[setting])

    # Express the feature's coordinates relative to the strand it is on.
    strand = self.feature.strand
    if strand not in (1, -1):
        raise ValueError("No feature strand.")
    if strand == 1:
        self.polarity_aware_underlying_seq = self.original_seq_record.seq
        self.polarity_aware_feature_location_start = (
            self.feature.location.start)
        self.polarity_aware_feature_location_end = (
            self.feature.location.end)
    else:
        # Reverse strand: work on the reverse complement, so the
        # feature's end becomes its start when measured from the far end.
        reversed_seq = self.original_seq_record.seq.reverse_complement()
        self.polarity_aware_underlying_seq = reversed_seq
        total_len = len(reversed_seq)
        self.polarity_aware_feature_location_start = (
            total_len - self.feature.location.end)
        self.polarity_aware_feature_location_end = (
            self.polarity_aware_feature_location_start + len(self.feature))

    # Some basic validation.
    self._validate()

    # Set or compute the original score profile.
    if 'values' not in kwargs:
        self.compute_original_feature_values()
        return

    num_codons = len(self.feature) / 3
    self.values = kwargs['values']
    assert num_codons == len(self.values), (
        "Wrong number of values passed to FeatureProfile constructor.")
def handle_prfB(genome_record):
    """Modifies the genome record to make prfB a CDS and remove the
    frameshift.

    The feature with locus tag ECMDS42_2390 is edited in three steps:
    one T of the CTTT frameshift site is deleted, the AGGGGG just before
    it is changed to CGTGGG, and the feature is retyped as 'CDS'. The
    genome sequence becomes one base shorter, so the prfB location and
    the features starting after it are adjusted.

    Args:
        genome_record: The SeqRecord to modify in place.

    Returns:
        The same genome_record object.

    Raises:
        AssertionError: If the feature is not on the negative strand or
            any of the expected sequence motifs / lengths do not match.
    """
    prfB_locus_tag = 'ECMDS42_2390'

    # Offsets are 0-based indices into the feature's extracted sequence.
    FRAMESHIFT_START = 72
    # Third base of 'CTTT', i.e. the 75th base of the feature.
    FRAMESHIFT_DELETED_INDEX = FRAMESHIFT_START + 2
    RBS_START = 63

    prfB_feature = get_feature_by_id(genome_record, prfB_locus_tag)
    prfB_feature_seq = str(prfB_feature.extract(genome_record.seq))
    original_length = len(prfB_feature)

    # Feature is on the negative strand.
    assert prfB_feature.strand == -1

    # First delete the frameshift, CTTT -> CTT, by removing one T.
    assert 'CTTTGAC' == prfB_feature_seq[
        FRAMESHIFT_START:FRAMESHIFT_START + 7]
    prfB_feature_seq = (
        prfB_feature_seq[:FRAMESHIFT_DELETED_INDEX] +
        prfB_feature_seq[FRAMESHIFT_DELETED_INDEX + 1:])
    assert 'CTTGAC' == prfB_feature_seq[
        FRAMESHIFT_START:FRAMESHIFT_START + 6]

    # Next change the RBS just before it AGGGGG -> CGTGGG (as per Chris
    # Gregg data from 8/25/13).
    assert 'AGGGGG' == prfB_feature_seq[RBS_START:RBS_START + 6]
    prfB_feature_seq = (
        prfB_feature_seq[:RBS_START] + 'CGT' +
        prfB_feature_seq[RBS_START + 3:])
    assert 'CGTGGG' == prfB_feature_seq[RBS_START:RBS_START + 6]

    # Now replace the underlying sequence of the genome. The feature is
    # on the negative strand, so its sequence is reverse-complemented
    # back before being spliced in.
    updated_seq = (
        genome_record.seq[:prfB_feature.location.start] +
        reverse_complement(prfB_feature_seq) +
        genome_record.seq[prfB_feature.location.end:])
    assert len(genome_record.seq) - 1 == len(updated_seq)
    genome_record.seq = updated_seq

    # Change the prfB feature to be a CDS.
    prfB_feature.type = 'CDS'

    # The feature is now one base shorter: keep its start coordinate and
    # pull the end coordinate in by one.
    prfB_feature.location = FeatureLocation(
        prfB_feature.location.start,
        prfB_feature.location.end - 1,
        strand=prfB_feature.strand)

    # Update the positions of downstream features.
    updated_features = []
    for feature in genome_record.features:
        if feature.location.start > prfB_feature.location.start:
            feature = feature._shift(-1)
        updated_features.append(feature)
    genome_record.features = updated_features

    # Make sure changes went through.
    mod_prfB_feature = get_feature_by_id(genome_record, prfB_locus_tag)
    mod_prfB_feature_seq = str(mod_prfB_feature.extract(genome_record.seq))
    assert original_length - 1 == len(mod_prfB_feature)
    assert prfB_feature_seq[:6] == mod_prfB_feature_seq[:6], (
        "Before: %s, After: %s" % (
            prfB_feature_seq[:6], mod_prfB_feature_seq[:6]))
    # The message reports the same last-10 slice that is compared.
    assert prfB_feature_seq[-10:] == mod_prfB_feature_seq[-10:], (
        "Before: %s, After: %s" % (
            prfB_feature_seq[-10:], mod_prfB_feature_seq[-10:]))

    return genome_record
def test_fix_overlap_pair_same_direction_reverse(self):
    """Account for RBS buffer for left strand.

    Both features are on the reverse strand and overlap from position 12
    onward. After the fix they must be separated by one
    RBS_BUFFER_SIZE-long segment.
    """
    rbs_size = conflicting_pair_common.RBS_BUFFER_SIZE

    overlap_start_pos = 12
    feature_1_seq = 'AAACCCGGGTTTCCCAAAGTA'
    feature_2_seq = ''.join([
        feature_1_seq[overlap_start_pos:],
        'CCCTTTGGGAAACCCAAACCCGGGTTTCCCAAATTTGTA',
    ])
    whole_seq = feature_1_seq[:overlap_start_pos] + feature_2_seq
    overlap_seq = feature_1_seq[overlap_start_pos:]
    overlap_size = len(overlap_seq)

    # Sanity check, for visual debug if necessary.
    self.assertEqual(
        'AAACCCGGGTTTCCCAAAGTACCCTTTGGGAAACCCAAACCCGGGTTTCCCAAATTTGTA',
        whole_seq)
    self.assertEqual(
        len(whole_seq),
        len(feature_1_seq) + len(feature_2_seq) - overlap_size)

    # Create the sequence and attach both reverse-strand features,
    # validating each one as soon as it has been added.
    seq_record = SeqRecord(Seq(whole_seq, generic_dna))
    feature_id_to_seq_map = {1: feature_1_seq, 2: feature_2_seq}

    created = []
    for feature_id, start in ((1, 0), (2, overlap_start_pos)):
        end = start + len(feature_id_to_seq_map[feature_id])
        feature = SeqFeature(
            FeatureLocation(start, end, strand=-1),
            type='CDS', id=feature_id)
        seq_record.features.append(feature)
        self._assert_feature_seq(
            feature, seq_record, feature_id_to_seq_map)
        created.append(feature)
    feature_1, feature_2 = created

    # Run the fix on a copy so the input record stays untouched.
    updated_seq_record = copy.deepcopy(seq_record)
    self.assertTrue(
        fix_overlap_pair(feature_1.id, feature_2.id, updated_seq_record))

    result_seq = str(updated_seq_record.seq)
    rbs_after_overlap = feature_2_seq[overlap_size:overlap_size + rbs_size]
    EXPECTED_SEQUENCE = feature_1_seq + rbs_after_overlap + feature_2_seq

    # Sanity check, for visual debug if necessary.
    self.assertEqual(
        'AAACCCGGGTTTCCCAAAGTA'
        'CCCTTTGGGAAACCC'
        'CCCAAAGTACCCTTTGGGAAACCCAAACCCGGGTTTCCCAAATTTGTA',
        EXPECTED_SEQUENCE)

    # Assert correct length. (Simple debug check).
    self.assertEqual(len(EXPECTED_SEQUENCE), len(result_seq))

    # Somewhat redundant piece-wise checks from debugging, but might as
    # well leave them here for the future.
    feature_1_len = len(feature_1_seq)

    # Assert the entirety of feature_1 is copied over.
    self.assertEqual(feature_1_seq, result_seq[:feature_1_len])

    # Assert the RBS extension is properly copied over.
    self.assertEqual(
        EXPECTED_SEQUENCE[feature_1_len:feature_1_len + rbs_size],
        result_seq[feature_1_len:feature_1_len + rbs_size])

    # Assert the entirety of feature_2 is copied over at the end.
    self.assertEqual(
        feature_2_seq,
        result_seq[len(result_seq) - len(feature_2_seq):])

    # Assert the whole thing is correct.
    self.assertEqual(EXPECTED_SEQUENCE, result_seq)

    # Make sure the features are preserved.
    new_feature_1 = get_feature_by_id(updated_seq_record, feature_1.id)
    new_feature_2 = get_feature_by_id(updated_seq_record, feature_2.id)
    self.assertEqual(
        new_feature_1.location.end + rbs_size,
        new_feature_2.location.start)
    for new_feature in (new_feature_1, new_feature_2):
        self._assert_feature_seq(
            new_feature, updated_seq_record, feature_id_to_seq_map)
def analyze_codon_motif(motif_set, original_record, recoded_record,
                        report_file):
    """Report motif codons lying just upstream of another feature's start.

    For every conflicting pair of CDS features returned by
    find_all_overlaps() in which both features are on the forward strand,
    the codons of the upstream feature are scanned. Each codon that is in
    motif_set and starts no more than UPSTREAM_WINDOW_SIZE bases before
    the downstream feature's start (or anywhere after that point) is
    written to a CSV report, together with the codon found at the same
    offset in the recoded record and the downstream feature's RBS
    expression before and after recoding.

    Pairs involving prfB, non-CDS features, or any reverse-strand feature
    are skipped; reverse-strand pairs are not analyzed.

    Args:
        motif_set: Set of codons to look for.
        original_record: Record before recoding.
        recoded_record: Record after recoding.
        report_file: Path of the CSV report to write.
    """
    # How far before the downstream feature's start a codon may begin
    # and still be reported.
    UPSTREAM_WINDOW_SIZE = 20

    FIELD_NAMES = [
        'pos',
        'recoded_pos',
        'ref',
        'alt',
        'orig_rbs_expression',
        'recoded_rbs_expression',
        'delta_expression',
        'strand',
    ]

    motif_hits = []

    original_seq = str(original_record.seq).upper()
    recoded_seq = str(recoded_record.seq).upper()

    # Identify overlapping features (this includes those close enough for
    # RBS) to possibly conflict. Use cache.
    conflicting_pairs = find_all_overlaps(
        None,  # genome_record,
        None,  # forbidden_codons,
        cache=True)

    original_rbs_profiles = get_mds42_rbs_strength_profile()

    for pair in conflicting_pairs:
        upstream = pair['upstream_feature']
        downstream = pair['downstream_feature']

        if upstream.type != 'CDS' or downstream.type != 'CDS':
            continue

        # Ignore prfB.
        if (get_feature_gene(upstream) == 'prfB' or
                get_feature_gene(downstream) == 'prfB'):
            continue

        original_upstream_feature = get_feature_by_id(
            original_record, upstream.id)
        original_downstream_feature = get_feature_by_id(
            original_record, downstream.id)
        recoded_upstream_feature = get_feature_by_id(
            recoded_record, upstream.id)
        recoded_downstream_feature = get_feature_by_id(
            recoded_record, downstream.id)

        # Only pairs with both features on the forward strand are handled.
        if not (original_upstream_feature.strand == 1 and
                original_downstream_feature.strand == 1):
            continue

        roi_start = (original_downstream_feature.location.start -
                     UPSTREAM_WINDOW_SIZE)
        for codon_index in range(0, len(original_upstream_feature), 3):
            pos = original_upstream_feature.location.start + codon_index
            if pos < roi_start:
                continue

            codon = original_seq[pos:pos + 3]
            if codon not in motif_set:
                continue

            recoded_pos = (
                recoded_upstream_feature.location.start + codon_index)
            recoded_codon = recoded_seq[recoded_pos:recoded_pos + 3]

            # Get before/after RBS.
            orig_rbs_expression = original_rbs_profiles[
                original_downstream_feature.id]
            recoded_rbs_expression = calc_rbs_score_for_feature(
                recoded_downstream_feature, recoded_seq)['expression']
            delta_expression = (
                recoded_rbs_expression - orig_rbs_expression)

            motif_hits.append({
                'pos': pos,
                'recoded_pos': recoded_pos,
                'ref': codon,
                'alt': recoded_codon,
                'orig_rbs_expression': orig_rbs_expression,
                'recoded_rbs_expression': recoded_rbs_expression,
                'delta_expression': delta_expression,
                'strand': 1,
            })

    print('Writing report.')
    with open(report_file, 'w') as fh:
        writer = csv.DictWriter(fh, FIELD_NAMES)
        writer.writeheader()
        for hit in motif_hits:
            writer.writerow(hit)
def main():
    """Apply the codon swaps listed in AGN_DEBUG_FILE to the recoded genome.

    For each CSV row with a 3-letter 'Codon', the codon to change is
    located in the feature named by 'ID': the row's 'Sequence' supplies
    two context codons followed by the original codon, and the last match
    of those nine bases in the original (MDS42_RECORD) feature sequence
    fixes the position. The codon at that position in the recoded record
    must be synonymous with the original one; it is then swapped for the
    row's 'Codon'. If the row's 'next' column holds a 3-letter codon, the
    following codon is swapped as well. The modified record is written to
    OUTFILE in GenBank format.

    Raises:
        ValueError: If the context sequence of a row cannot be found in
            the original feature.
        AssertionError: If a codon about to be replaced is not synonymous
            with the expected one.
    """
    record = get_genome_record(RECODED_PATH)

    with open(AGN_DEBUG_FILE) as agn_debug_fh:
        reader = csv.DictReader(agn_debug_fh)
        for row in reader:
            new_codon = row['Codon']
            if len(new_codon) != 3:
                continue

            feature_id = row['ID']
            print(feature_id)

            # Identify codon positions from original record.
            sequence = row['Sequence']
            if len(sequence) < 14:
                # Too hard.
                continue
            orig_codon = sequence[8:11]
            if not re.match(r'[A-Z]{3}', orig_codon):
                continue

            # Two preceding codons plus the codon itself; characters 3
            # and 7 of 'Sequence' are skipped (presumably separators).
            seq_ending_with_codon = (
                sequence[0:3].upper() + sequence[4:7].upper() + orig_codon)
            print(seq_ending_with_codon)

            orig_feature = get_feature_by_id(MDS42_RECORD, feature_id)
            orig_feature_seq = str(orig_feature.extract(MDS42_RECORD.seq))

            new_feature = get_feature_by_id(record, feature_id)
            new_feature_seq = str(new_feature.extract(record.seq))

            # Find the last occurrence.
            last_pos = None
            for match in re.finditer(seq_ending_with_codon,
                                     orig_feature_seq):
                last_pos = match.start()
            print(last_pos)
            if last_pos is None:
                # Fail with a clear message instead of a TypeError on
                # None + 6 below.
                raise ValueError(
                    "Context %s not found in original feature %s" % (
                        seq_ending_with_codon, feature_id))

            # The codon of interest follows the two context codons.
            codon_pos = last_pos + 6
            print('expect %s' % orig_feature_seq[codon_pos:codon_pos + 3])

            # Make sure codon at that position in recoded record is
            # synonymous.
            codon_at_index = new_feature_seq[codon_pos:codon_pos + 3]
            print(codon_at_index)
            syn = ORIGINAL_CODON_USAGE_MEMEX.get_synonymous_codons(
                codon_at_index)
            print(syn)
            assert orig_codon in syn

            # Make the change in the target record.
            swap_feature_codon_at_position(
                record, new_feature.id, codon_pos, codon_at_index,
                new_codon)

            # Maybe do the next one.
            maybe_next = row['next'].upper()
            if len(maybe_next) == 3:
                syn = ORIGINAL_CODON_USAGE_MEMEX.get_synonymous_codons(
                    maybe_next)
                codon_at_index_next = new_feature_seq[
                    codon_pos + 3:codon_pos + 6]
                assert codon_at_index_next in syn
                swap_feature_codon_at_position(
                    record, new_feature.id, codon_pos + 3,
                    codon_at_index_next, maybe_next)

    with open(OUTFILE, 'w') as fh:
        SeqIO.write(record, fh, 'genbank')