def test_fix_overlaps_third_feature_none_overlap(self):
    """Tests fixing one overlapping pair when a third feature is present
    that does not overlap anything.

    Features 1 and 2 share 3 bases (feature 2 starts at position 6, inside
    feature 1). Feature 3 sits downstream of feature 2, separated by a
    junk spacer. After the fix the sequence is expected to be the plain
    concatenation feature 1 + feature 2 + spacer + feature 3.
    """
    first_cds = 'ATGTTTGGG'
    second_cds = 'GGGCCCAAAGTA'
    spacer = 'GTAGCTATCTATCTGGTTAAATC'
    third_cds = 'ATGAAACCCTTTGGGTTTCCCAAA'
    overlap_start = 6

    # Expected sequence for each feature id, consumed by _assert_feature_seq.
    expected_seq_by_id = {
        1: first_cds,
        2: second_cds,
        3: third_cds,
    }

    record = SeqRecord(Seq(
        first_cds[:overlap_start] + second_cds + spacer + third_cds,
        generic_dna))

    # Locations: 1 and 2 overlap, 3 starts after the spacer following 2.
    first_loc = FeatureLocation(0, len(first_cds), strand=1)
    second_loc = FeatureLocation(
        overlap_start, overlap_start + len(second_cds), strand=-1)
    third_start = second_loc.end + len(spacer)
    third_loc = FeatureLocation(
        third_start, third_start + len(third_cds), strand=1)

    # Add each feature and check it right away, before the next is added.
    for feature_id, location in ((1, first_loc), (2, second_loc),
                                 (3, third_loc)):
        cds = SeqFeature(location, type='CDS', id=feature_id)
        record.features.append(cds)
        self._assert_feature_seq(cds, record, expected_seq_by_id)

    # Build and use the overlap fixer on a copy of the record. The return
    # value of fix_overlaps() is not used; the copy is inspected directly.
    fixed_record = copy.deepcopy(record)
    context = RefactorContext(fixed_record)
    context.set_forbidden_codon_set(set(['GGG']))
    ConflictingPairFixer(context).fix_overlaps()

    expected_sequence = first_cds + second_cds + spacer + third_cds
    self.assertEqual(expected_sequence, str(fixed_record.seq))

    for feature_id in expected_seq_by_id:
        fixed_feature = get_feature_by_id(fixed_record, feature_id)
        self._assert_feature_seq(
            fixed_feature, fixed_record, expected_seq_by_id)
def test_remove_site_in_coding_feature(self):
    """Tests removing a restriction enzyme that falls in a coding region.

    Builds a single CDS that contains exactly one BsmBI site, removes the
    site, and checks that no site remains and that the translation of the
    feature is unchanged.
    """
    RESTRICTION_ENZYME = Restriction.BsmBI
    BEFORE = 'ATGTTTGGGCCCAAATTTGGGAAATTTGGGAAATTTGGGAAATTTGGGAAATTTGGG'
    SITE_SEQ = RESTRICTION_ENZYME.site
    AFTER = 'TAGAAAAAAAAAAAAAAAA'
    SEQ = Seq(BEFORE + SITE_SEQ + AFTER, generic_dna)
    seq_record = SeqRecord(SEQ)
    refactor_context = RefactorContext(seq_record)

    # The feature covers BEFORE, the site, and the first 3 bases of AFTER.
    feature_1_loc = FeatureLocation(
        0, len(BEFORE) + len(SITE_SEQ) + 3, strand=1)
    feature_1 = SeqFeature(feature_1_loc, type='CDS', id='1')
    seq_record.features.append(feature_1)

    FEATURE_1_SEQ_ORIG = feature_1.extract(str(seq_record.seq))

    # The per-codon profiles below need a whole number of codons. Use floor
    # division so the count stays an int even under true division, where
    # `[x] * float` would raise TypeError.
    self.assertEqual(0, len(feature_1) % 3)
    FEATURE_1_NUM_CODONS = len(feature_1) // 3

    # Compute fake feature profile.
    fake_profile_values_map = {}
    fake_profile_values_map[feature_1.id] = {
        GCContentFeatureProfile.get_name(): [0.2] * FEATURE_1_NUM_CODONS,
        SecondaryStructureFeatureProfile.get_name():
            [-10] * FEATURE_1_NUM_CODONS,
        CodonRarityFeatureProfile.get_name(): [0.5] * FEATURE_1_NUM_CODONS,
    }
    refactor_context.set_feature_id_to_profile_values_map(
        fake_profile_values_map)

    # Exactly one site is expected before the fix.
    occurrences = find_restriction_site_occurrences(
        seq_record, RESTRICTION_ENZYME)
    self.assertEqual(1, len(occurrences))

    result = _remove_site_in_coding_feature(
        refactor_context, seq_record, occurrences[0], feature_1)
    self.assertTrue(result['is_success'])

    # The original feature object is reused to extract from the updated
    # record, i.e. its location is applied unchanged to the new sequence.
    seq_record = result['updated_genome_record']
    FEATURE_1_SEQ_UPDATED = feature_1.extract(str(seq_record.seq))

    # The site must be gone and the protein must be preserved.
    occurrences = find_restriction_site_occurrences(
        seq_record, RESTRICTION_ENZYME)
    self.assertEqual(0, len(occurrences))
    self.assertEqual(
        translate_custom(FEATURE_1_SEQ_ORIG),
        translate_custom(FEATURE_1_SEQ_UPDATED))
def fix_homology_issues(genome_record, ids_to_fix=None):
    """Finds pairs of copied features created during genome refactoring and
    muddles the upstream original (near the 3' terminus) in order to decrease
    the probability of "snap-back" during insertion.

    Args:
        genome_record: The record to fix. It is deep-copied, not modified.
        ids_to_fix: Optional collection of copy feature ids. When non-empty,
            only pairs whose 'copy_id' is in this collection are resolved.
            When None or empty, every detected pair is resolved.

    Returns:
        A copy of genome_record with homology issues resolved.
    """
    resolved_genome_record = copy.deepcopy(genome_record)
    refactor_context = RefactorContext(resolved_genome_record)

    # Identify features to check for homology issues. These are
    # generally (always?) features that have had the head/RBS portions
    # copied in order to split apart large overlaps.
    homology_pair_obj_list = find_features_to_check_for_homology(
        resolved_genome_record)

    # Resolve homologies.
    for pair_obj in homology_pair_obj_list:
        copy_id = pair_obj['copy_id']
        if ids_to_fix and copy_id not in ids_to_fix:
            continue
        # NOTE(review): the return value is ignored, so this presumably
        # updates the record held by refactor_context in place -- confirm.
        resolve_single_homology_issue(refactor_context, pair_obj)

    return resolved_genome_record
def test_fix_overlaps_simple(self):
    """Tests fixing a single pair of overlapping features.

    Feature 2 starts at position 6, inside feature 1, so the two share
    3 bases. After the fix the sequence is expected to be feature 1
    followed directly by feature 2, with feature 1 ending exactly where
    feature 2 starts.
    """
    first_cds = 'ATGTTTGGG'
    second_cds = 'GGGCCCAAAGTA'
    overlap_start = 6

    # Expected sequence for each feature id, consumed by _assert_feature_seq.
    expected_seq_by_id = {
        1: first_cds,
        2: second_cds,
    }

    record = SeqRecord(
        Seq(first_cds[:overlap_start] + second_cds, generic_dna))

    first_loc = FeatureLocation(0, len(first_cds), strand=1)
    second_loc = FeatureLocation(
        overlap_start, overlap_start + len(second_cds), strand=-1)

    # Add each feature and check it right away, before the next is added.
    added_features = []
    for feature_id, location in ((1, first_loc), (2, second_loc)):
        cds = SeqFeature(location, type='CDS', id=feature_id)
        record.features.append(cds)
        self._assert_feature_seq(cds, record, expected_seq_by_id)
        added_features.append(cds)
    first_feature, second_feature = added_features

    # Build and use the overlap fixer on a copy of the record. The return
    # value of fix_overlaps() is not used; the copy is inspected directly.
    fixed_record = copy.deepcopy(record)
    context = RefactorContext(fixed_record)
    context.set_forbidden_codon_set(set(['GGG']))
    ConflictingPairFixer(context).fix_overlaps()

    self.assertEqual(first_cds + second_cds, str(fixed_record.seq))

    fixed_first = get_feature_by_id(fixed_record, first_feature.id)
    fixed_second = get_feature_by_id(fixed_record, second_feature.id)

    # The features must now be adjacent rather than overlapping.
    self.assertEqual(
        fixed_first.location.end, fixed_second.location.start)
    self._assert_feature_seq(fixed_first, fixed_record, expected_seq_by_id)
    self._assert_feature_seq(fixed_second, fixed_record, expected_seq_by_id)
def test_swap_region_seq(self):
    """Tests that swap_region_seq replaces the region starting at index 3
    ('TTTGGG') with the new sequence and returns the updated record.
    """
    record = SeqRecord(Seq('ATGTTTGGG', generic_dna))
    context = RefactorContext(record)

    swapped_record = swap_region_seq(context, 3, 'TTTGGG', 'TTAGGA')

    self.assertEqual('ATGTTAGGA', str(swapped_record.seq))
def replace_codons_in_feature_subset(
        genome_record,
        essential_feature_ids,
        codons_to_remove,
        original_codon_usage_memex,
        refactored_codon_usage_memex,
        feature_id_to_profile_values_map,
        range_start,
        range_end,
        tmp_result_file,
        debug):
    """Work on a subset of the features to fix.

    Recodes the features essential_feature_ids[range_start:range_end] one at
    a time and pickles a dict mapping feature id to its result.

    Args:
        genome_record: Record wrapped in the RefactorContext used for
            recoding.
        essential_feature_ids: Indexable sequence of feature ids.
        codons_to_remove: Not used in this function.
        original_codon_usage_memex: Not used in this function.
        refactored_codon_usage_memex: Not used in this function.
        feature_id_to_profile_values_map: Installed on the RefactorContext
            before any feature is processed.
        range_start: First index (inclusive) to process.
        range_end: End index (exclusive) to process. Must be >= range_start.
        tmp_result_file: Path the pickled results dict is written to.
        debug: When True, at most the first feature of the range is
            processed.

    Returns:
        None. The results are only written to tmp_result_file.
    """
    assert range_end >= range_start

    refactor_context = RefactorContext(genome_record)
    refactor_context.set_feature_id_to_profile_values_map(
        feature_id_to_profile_values_map)

    # Code path that helps debug across a limited range when debug is True.
    # Clamp to range_end so an empty range stays empty instead of reaching
    # one index past the requested range.
    if debug:
        effective_range_end = min(range_start + 1, range_end)
    else:
        effective_range_end = range_end

    num_features = len(essential_feature_ids)
    results = {}
    for feature_index in range(range_start, effective_range_end):
        print('Fixing feature %d of %d' % (feature_index + 1, num_features))
        feature_id = essential_feature_ids[feature_index]
        print('Feature id: %s' % feature_id)

        # Add the result to the growing dictionary.
        results[feature_id] = replace_codons_in_single_feature(
            refactor_context, feature_id)

    # Write results to file as soon as we're done. Pickle data is a byte
    # stream, so the file is opened in binary mode.
    with open(tmp_result_file, 'wb') as fh:
        pickle.dump(results, fh)
def perform_final_steps(refactor_context, seg_start, seg_end,
        upstream_flanking_seq=None, downstream_flanking_seq=None,
        validation_start_seq=None, validation_end_seq=None,
        ignore_problems_in_feature_ids=None, report_prefix=None):
    """Updates the contained genome_record after mutating it, including:
        * fix GC content
        * remove homopolymer runs
        * remove restriction sites
        * final GC content pass (writes the GC report when report_prefix
          is given)
        * (optional) add end pieces (e.g. FRT sites)

    Recoding is validated against ORIGINAL_GENOME_RECORD before the first
    step and again after the final GC content pass.

    Args:
        refactor_context: The RefactorContext.
        seg_start: The first position (pythonic) in the genome_record
            contained within refactor_context for the segment.
        seg_end: End position (pythonic) for the segment.
        upstream_flanking_seq: Sequence to insert at the head of the segment.
        downstream_flanking_seq: Sequence to insert at the tail of the
            segment.
        validation_start_seq: Optional sequence at the start of the segment
            to sanity check the start position. We lack a ui.
        validation_end_seq: Optional sequence at the end of the segment to
            sanity check the end position.
        ignore_problems_in_feature_ids: Feature ids that the client is aware
            may have problems so that we can ignore. Defaults to no ids.
        report_prefix: Optional prefix for report files. Forwarded to the
            homopolymer and restriction-site steps, and used to name the GC
            report '<report_prefix>gc_content.csv'. When None, no report
            file is passed to the final GC content pass.

    Returns:
        An updated SeqRecord reflecting changes.

    Raises:
        AssertionError: If a validation sequence does not match the segment
            boundary it is supposed to sit at.
    """
    # Avoid sharing one mutable default list across calls.
    if ignore_problems_in_feature_ids is None:
        ignore_problems_in_feature_ids = []

    updated_genome_record = copy.deepcopy(
        refactor_context.get_genome_record())

    # Check features are conserved before we start.
    check_recoding_is_complete(
        ORIGINAL_GENOME_RECORD, updated_genome_record,
        ignore_problems_in_feature_ids=ignore_problems_in_feature_ids,
        interval=(seg_start, seg_end))

    # Sanity check the segment boundaries against the expected sequences.
    orig_seq = str(updated_genome_record.seq)
    if validation_start_seq:
        actual_start_seq = orig_seq[
            seg_start:seg_start + len(validation_start_seq)]
        assert validation_start_seq == actual_start_seq, (
            'Segment start mismatch: expected %s, found %s' % (
                validation_start_seq, actual_start_seq))
    if validation_end_seq:
        actual_end_seq = orig_seq[
            seg_end - len(validation_end_seq):seg_end]
        assert validation_end_seq == actual_end_seq, (
            'Segment end mismatch: expected %s, found %s' % (
                validation_end_seq, actual_end_seq))

    updated_refactor_context = RefactorContext(updated_genome_record)

    # Fix GC content.
    # NOTE(review): this first pass runs on the caller's refactor_context,
    # not on updated_refactor_context like every later step. Left unchanged
    # because the caller's context may carry state the fresh one lacks --
    # confirm this is intended.
    gc_constraints = GCContentConstraints()
    updated_genome_record = fix_gc_content(
        refactor_context, gc_constraints,
        start_bound=seg_start, end_bound=seg_end, debug=False)
    updated_refactor_context.set_genome_record(updated_genome_record)

    # Remove homopolymer runs.
    remove_homopolymer_result = remove_homopolymer_runs(
        updated_refactor_context,
        start_bound=seg_start, end_bound=seg_end,
        report_prefix=report_prefix)
    updated_genome_record = remove_homopolymer_result['updated_genome_record']
    flagged_h_runs = remove_homopolymer_result['flagged']
    updated_refactor_context.set_genome_record(updated_genome_record)
    print('Flagged homopolyer runs:')
    PRETTY_PRINTER.pprint(flagged_h_runs)

    # Remove restriction sites.
    remove_res_sites_result = remove_restriction_sites(
        updated_refactor_context, RESTRICTION_ENZYME_SITES_TO_REMOVE,
        start_bound=seg_start, end_bound=seg_end,
        report_prefix=report_prefix)
    updated_genome_record = remove_res_sites_result['updated_genome_record']
    updated_refactor_context.set_genome_record(updated_genome_record)
    flagged_res_sites = remove_res_sites_result['flagged']
    print('Flagged restriction sites:')
    PRETTY_PRINTER.pprint(flagged_res_sites)

    # Generate the GC content report after all fixes are done. Only build
    # the report path when a prefix was given: None + str raises TypeError,
    # and report_prefix defaults to None.
    final_gc_kwargs = {
        'start_bound': seg_start,
        'end_bound': seg_end,
        'debug': True,
    }
    if report_prefix is not None:
        final_gc_kwargs['report_file'] = report_prefix + 'gc_content.csv'
    updated_genome_record = fix_gc_content(
        updated_refactor_context, gc_constraints, **final_gc_kwargs)

    # Check features are conserved.
    print('Checking translation/rna/forbidden codons ...')
    check_recoding_is_complete(
        ORIGINAL_GENOME_RECORD, updated_genome_record,
        ignore_problems_in_feature_ids=ignore_problems_in_feature_ids,
        interval=(seg_start, seg_end))

    # Maybe insert FRT sites.
    # NOTE(review): feature_id_prefix is hard-coded to 'seg2'.
    if upstream_flanking_seq or downstream_flanking_seq:
        updated_genome_record = insert_frt_site(
            updated_genome_record,
            upstream_flanking_seq, seg_start,
            downstream_flanking_seq, seg_end,
            feature_id_prefix='seg2',
            upstream_validation_seq=validation_start_seq,
            downstream_validation_seq=validation_end_seq)

    return updated_genome_record
except AssertionError: print 'WARNING: Could not find %s' % string.upper(row['original']) # Probably couldn't find it because overlaps, just continue on. continue if __name__ == '__main__': from Bio import SeqIO from biopython_util import get_genome_record from refactor_context import RefactorContext genome_record = get_genome_record( '../data/completed_segments/seg2/2013_03_06_20_16_04_mds42_refactored.gbk') genome_record.name = genome_record.name[:-3] + 'seg2' refactor_context = RefactorContext(genome_record) SEG_START = 100863 SEG_END = 148475 UPSTREAM_FLANKING_SEQ = 'CAGCCTTGTTTCGCCAGAATGCCAGTCAGCATAAGGGAGAGCTCAAGGCAGAAGTTCCTATTCCGAAGTTCCTATTCTCATATAAGTATAGGAACTTC' DOWNSTREAM_FLANKING_SEQ = 'CCTGTTGACAATTAATCATCGGCATAGTATATCGGCATAGTATAATACGACAAGGTGAGGAACTAAACCCAGGAGGCAGATCATGAGTCTGAAAGAAAAAACACAATCTCTGTTTGCCAACGCATTTGGCTACCCTGCCACTCACACCATTCAGGCGCCTGGCCGCGTGAATTTGATTGGTGAACACACCGACTACAACGACGGTTTCGTTCTGCCCTGCGCGATTGATTATCAAACCGTGATCAGTTGTGCACCACGCGATGACCGTAAAGTTCGCGTGATGGCAGCCGATTATGAAAATCAGCTCGACGAGTTTTCCCTCGATGCGCCCATTGTCGCACATGAAAACTATCAATGGGCTAACTACGTTCGTGGCGTGGTGAAACATCTGCAACTGCGTAACAACAGCTTCGGCGGCGTGGACATGGTGATCAGCGGCAATGTGCCGCAGGGTGCCGGGTTAAGTTCTTCCGCTTCACTGGAAGTCGCGGTCGGAACCGTATTGCAGCAGCTTTATCATCTGCCGCTGGACGGCGCACAAATCGCGCTTAACGGTCAGGAAGCAGAAAACCAGTTTGTAGGCTGTAACTGCGGGATCATGGATCAGCTAATTTCCGCGCTCGGCAAGAAAGATCATGCCTTGCTGATCGATTGCCGCTCACTGGGGACCAAAGCAGTTTCCATGCCCAAAGGTGTGGCTGTCGTCATCATCAACAGTAACTTCAAACGTACCCTGGTTGGCAGCGAATACAACACCCGTCGTGAACAGTGCGAAACCGGTGCGCGTTTCTTCCAGCAGCCAGCCCTGCGTGATGTCACCATTGAAGAGTTCAACGCTGTTGCGCATGAACTGGACCCGATCGTGGCAAAACGCGTGCGTCATATACTGACTGAAAACGCCCGCACCGTTGAAGCTGCCAGCGCGCTGGAGCAAGGCGACCTGAAACGTATGGGCGAGTTGATGGCGGAGTCTCATGCCTCTATGCGCGATGATTTCGAAATCACCGTGCCGCAAATTGACACTCTGGTAGAAATCGTCAAAGCTGTGATTGGCGACAAAGGTGGCGTACGCATGACCGGCGGCGGATTTGGCGGCTGTATCGTCGCGCTGATCCCGGAAGAGCTGGTGCCTGCCGTACAGCAAGCTGTCGCTGAACAATATGAAGCAAAAACAGGTATTAAAGAGACTTTTTACGTTTGTAAACCATCACAAGGAGCAGGACAGTGCTGAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGAAGTTCCTATTCCGAAGTTCCTAT
TCTATCAGAAGTATAGGAACTTCAGTGCGGATTTCGTATTTGCAGCTCGTCAGTACTTTCAGAATCATGGCCT' VALIDATION_START_SEQ = 'GAGGCCGACGATGATTACGGCCTCAGG' VALIDATION_END_SEQ = 'TTAATCATTTGACGTCCCTTGT' updated_genome_record = perform_final_steps(refactor_context, SEG_START, SEG_END, UPSTREAM_FLANKING_SEQ, DOWNSTREAM_FLANKING_SEQ, VALIDATION_START_SEQ, VALIDATION_END_SEQ) genome_output_file = ( '../data/completed_segments/seg2/seg2_final_with_flanking.gbk') with open(genome_output_file, 'w') as output_fh:
def main():
    """Muddles the end of every feature marked for separation and writes
    the resulting record out as GenBank.

    Feature ids are read from the CSV at AGN_DEBUG_FILE: every row whose
    'Separate' column is '1' contributes its 'ID'. Each such feature in the
    record loaded from RECODED_PATH is passed to muddle_end with a length
    argument of 20, and the record is then written to OUTFILE.
    """
    with open(AGN_DEBUG_FILE) as agn_debug_fh:
        source_ids_to_muddle = [
            row['ID'] for row in csv.DictReader(agn_debug_fh)
            if row['Separate'] == '1'
        ]
    print(source_ids_to_muddle)

    record = get_genome_record(RECODED_PATH)
    refactor_context = RefactorContext(record)

    for source_feature_id in source_ids_to_muddle:
        muddle_end(
            get_feature_by_id(record, source_feature_id),
            record, refactor_context, 20)

    # NOTE: An earlier, now-disabled approach picked the features to muddle
    # automatically instead of reading them from the CSV. For every
    # FIX_OVERLAP_RBS_COPY insert whose source feature had no matching
    # FIX_OVERLAP_HEAD_COPY insert, it looked up the neighboring feature
    # (ID_ROOT plus the source's numeric id suffix, minus one on strand 1,
    # plus one otherwise). When that neighbor existed and was on the same
    # strand as the RBS copy, it muddled the neighbor's end by the length
    # of the RBS copy.

    with open(OUTFILE, 'w') as fh:
        SeqIO.write(record, fh, 'genbank')
def refactor_with_min_overlap_fixes_and_preserve_rbs(genome_record,
        tmp_file_prefix, debug=False):
    """Refactoring strategy that only fixes overlaps when it's necessary for
    removing forbidden codons and/or preserving RBS strength.

    This is the second big iteration of our strategy, following discussion
    on 2/6/13. Notable differences from the initial strategy:
        * Before this we were just pulling apart all overlaps and copying
          the RBS site. Intuitively, that may introduce many unnecessary
          changes.
        * We now also take into account coding features that are close
          enough to each other (even if not overlapping) that re-coding one
          could affect the other feature's RBS region.

    Super high-level algorithm overview:
        1. Fix overlaps.
        2. Recode each gene to remove forbidden codons.

    Medium-level algorithm overview:
        * Identify all pairs of coding regions that are either overlapping,
          or are close enough (< 20 bp) that recoding one may affect
          translation of the other.
            - For each of these pairs:
                * If there are no forbidden codons in the affected regions:
                    - Nothing to do, mark the pair as resolved.
                * If there are forbidden codons:
                    - Do an exhaustive search over the affected region for
                      a path of synonymous codon substitutions that does
                      not require physical separation. On success, perform
                      the change and mark the changed codons as "fixed" so
                      the bulk forbidden codon removal in the second half
                      leaves them alone.
                    - Otherwise, we need to separate:
                        * If overlap < 4 bp, find the minimum amount to
                          copy that resolves any issues, and lock affected
                          RBS regions so that they are not changed.
                        * Otherwise, copy overlap + 15 bp upstream of ATG,
                          and to help prevent snap-back:
                            - muddle old start codon in upstream gene
                            - muddle bases in copied region that are not
                              part of RBS
        * Perform synonymous swapping as before, but this time respecting
          locked-in regions from the first half of the algorithm.

    Args:
        genome_record: The record to refactor. It is wrapped directly in
            the RefactorContext; a deep copy is kept only for validation.
        tmp_file_prefix: Prefix passed to every _write_output call and to
            replace_forbidden_codons.
        debug: Forwarded to replace_forbidden_codons.

    Returns:
        None. Results are emitted through _write_output.
    """
    # Keep an untouched copy of the input for the validation checks.
    pristine_record = copy.deepcopy(genome_record)

    # Context object to be passed around to different methods.
    context = RefactorContext(genome_record)

    #######################################################################
    # Fix overlaps
    #######################################################################

    fixer_options = dict(
        cache=USE_CACHE,
        include_close_features=True,
        single_iteration=DEBUG_SINGLE_ITERATION,
        force_separate_AGN=True,
        agn_separation_data_file=AGN_SEPARATION_DATA_FILE)
    overlap_fixer = ConflictingPairFixer(context, **fixer_options)
    current_record = overlap_fixer.fix_overlaps()
    context.set_genome_record(current_record)

    # Write partial results now in case a later step fails.
    _write_output(current_record, {}, tmp_file_prefix)

    # Validate that overlaps were fixed correctly before moving on.
    check_all(pristine_record, current_record)

    #######################################################################
    # Swap out remaining forbidden codons
    # NOTE: Some were already replaced while fixing overlaps.
    #######################################################################

    current_record, metadata = replace_forbidden_codons(
        context,
        num_cores=NUM_CORES,
        tmp_file_prefix=tmp_file_prefix,
        debug=debug)

    # Write the output just in case again.
    _write_output(current_record, metadata, tmp_file_prefix)

    # Forbidden codons must be gone, and the record must still validate.
    check_forbidden_codons_removed(current_record, CODONS_TO_REMOVE)
    check_all(pristine_record, current_record)

    #######################################################################
    # Resolve homology issues
    #######################################################################

    current_record = fix_homology_issues(current_record)

    # Write the final output, overriding the intermediate write above.
    _write_output(current_record, metadata, tmp_file_prefix)

    # Final validation checks.
    check_forbidden_codons_removed(current_record, CODONS_TO_REMOVE)
    check_all(pristine_record, current_record)

    print('Done.')