def perform_assembly(self, plasmids_only=True, new_id='New_assembly', new_description='New assembly'):
    """
    Performs an assembly provided a pool of Parts.

    Inspiration from pydna to use networkx to represent assemblies as a graph: every Part in
    ``self.input_dna_list`` becomes a directed edge (5' -> 3') between two sticky-end nodes of a
    MultiDiGraph, and every cycle in that graph is a candidate circular product. Candidates that
    still contain a recognition site for an enzyme in ``self.restriction_enzyme_list`` are dropped.

    :param plasmids_only: only report cyclic assemblies that fulfill reaction constraints. When False,
        nothing is validated or returned (linear assemblies are not implemented yet).
    :param new_id: used as both ``entity_id`` and ``name`` of the returned Plasmid
    :param new_description: description of the returned Plasmid
    :return: the assembled Plasmid when ``plasmids_only`` is True, otherwise None
    :raises ReactionDefinitionException: if any input is still a Plasmid
    :raises AssemblyException: if ``plasmids_only`` and the number of complete products is not exactly one
    """
    # --- Sanity Checks --- #
    # All DNA entities in input_dna_list should be parts. Any undigested Plasmids with the same
    # resistance marker as your assembly product will totally mess up your day.
    if any(isinstance(dna, Plasmid) for dna in self.input_dna_list):
        raise ReactionDefinitionException('Undigested plasmids cannot be part of an assembly!')

    # --- Assemble Graph --- #
    directed_graph = networkx.MultiDiGraph()

    # Add nodes and edges for each part in digest_pool
    for part in self.input_dna_list:
        sticky_match_l = part.overhang_5
        sticky_match_r = part.overhang_3

        # Process sticky ends into nodes: (overhang_sequence, overhang_strand)
        if sticky_match_l is None:
            l_node = 'Blunt_Left'
        else:
            l_node = (part.sequence[:sticky_match_l[0]], sticky_match_l[1])
        if sticky_match_r is None:
            r_node = 'Blunt_Right'
        else:
            r_node = (part.sequence[-sticky_match_r[0]:], sticky_match_r[1])

        directed_graph.add_node(l_node)
        directed_graph.add_node(r_node)
        # Add directed edge between nodes (5' -> 3') and add Part as edge attribute
        directed_graph.add_edge(l_node, r_node, part=part)

    # --- Traverse Graph --- #
    # First: simple_cycles
    # NOTE: cycles are used exactly as reported by networkx; no extra de-duplication is applied
    # (the previous "processed_cycles" guard was never populated and therefore never filtered anything).
    all_possible_assemblies = []
    for cycle in networkx.algorithms.cycles.simple_cycles(directed_graph):
        # Consecutive node pairs around the cycle, including the edge that closes it
        node_pairs = list(pairwise(cycle))
        node_pairs.append((cycle[-1], cycle[0]))

        # Parallel edges mean several Parts can bridge the same pair of sticky ends
        parts_per_edge = [
            [edge_data['part'] for edge_data in directed_graph[node_l][node_r].values()]
            for node_l, node_r in node_pairs
        ]
        all_possible_assemblies.extend(product(*parts_per_edge))

    # Actually do assemblies
    intermediate_assemblies = []
    for assembly in all_possible_assemblies:
        sequence_chunks = []
        features = []
        sources = []
        descriptions = []

        for part in assembly:
            # Keep each part up to (but excluding) its 3' sticky end; the next part supplies it
            right_sticky_end = part.overhang_3[0]
            sequence_chunks.append(part.sequence[:-right_sticky_end])
            if part.features:
                features.extend(part.features)
            if part.description:
                descriptions.append(part.description)
            sources.append(part.source)

        assembled_sequence = ''.join(sequence_chunks)
        if assembled_sequence != '':
            intermediate_assemblies.append({
                'sequence': assembled_sequence,
                'features': features,
                'sources': sources,
                'description': descriptions,
            })

    # In plasmids_only=False: all_simple_paths, iterate over all combinations of nodes
    # todo: write this

    # --- Final digestion --- #
    # Omit assemblies that still possess restriction sites for enzymes in restriction_enzyme_list
    complete_assemblies = [
        assembly for assembly in intermediate_assemblies
        if not any(enzyme.compsite.findall(assembly['sequence']) for enzyme in self.restriction_enzyme_list)
    ]

    # --- Raise exceptions if something went wrong --- #
    # Plasmid specific exceptions
    if len(complete_assemblies) > 1 and plasmids_only:
        raise AssemblyException('This assembly produces more than one plasmid product. Check your input sequences.')
    if len(complete_assemblies) == 0 and plasmids_only:
        raise AssemblyException('This assembly does not produce any complete products. Check your input sequences.')

    # --- Dump assembly and things into a new Plasmid --- #
    if plasmids_only:
        final_assembly_product = complete_assemblies[0]

        # Find features that actually exist in Plasmid (either strand). A dict keeps one entry per
        # feature while preserving first-seen order, so the returned feature list is deterministic.
        plasmid_features = {}
        for feature in final_assembly_product['features']:
            regex_pattern = f'{feature.sequence}|{feature.reverse_complement()}'
            if re.search(regex_pattern, final_assembly_product['sequence']):
                plasmid_features[feature] = None

        return Plasmid(final_assembly_product['sequence'], entity_id=new_id, name=new_id,
                       features=list(plasmid_features), description=new_description,
                       source=final_assembly_product['sources'])
def perform_assembly(self, plasmids_only=True, new_id='New_assembly', new_description='New assembly'):
    """
    Performs a homology-directed assembly of the DNA entities in ``self.input_dna_list``.

    We're going to use our knowledge of how homology-directed assembly works to take a few
    short-cuts... Nodes are sequences and edges are homology overlaps: an edge A -> B carries the
    sequence shared by the 3' end of A and the 5' end of B. Every cycle in the graph is a candidate
    circular product, built by concatenating each part minus the overlap it shares with the next part.

    :param plasmids_only: only report cyclic assemblies. When False, nothing is validated or returned.
    :param new_id: used as both ``entity_id`` and ``name`` of the returned Plasmid
    :param new_description: description of the returned Plasmid
    :return: the assembled Plasmid when ``plasmids_only`` is True, otherwise None
    :raises AssemblyException: if ``plasmids_only`` and the number of assembled products is not exactly one
    """
    # Length of the terminal seed used to look for candidate overlaps
    homology_overlap = 6

    # --- Assemble Graph --- #
    directed_graph = networkx.MultiDiGraph()

    for part_1, part_2 in combinations(self.input_dna_list, 2):

        # Get possible part_1 3' overlap with part_2 5'
        part_1_right = part_1.sequence[-homology_overlap:]
        # Lookahead so that overlapping occurrences of the seed are all reported
        re_pattern = re.compile(f'(?={part_1_right})')
        for match in re_pattern.finditer(part_2.sequence):
            overlap_length = match.start() + len(part_1_right)
            # The seed hit only counts if everything up to the 5' end of part_2 also matches
            if part_2.sequence[:overlap_length] == part_1.sequence[-overlap_length:]:
                overlap_seq = part_1.sequence[-overlap_length:]
                directed_graph.add_node(part_1)
                directed_graph.add_node(part_2)
                directed_graph.add_edge(part_1, part_2, overlap=overlap_seq)

        # Get possible part_1 5' overlap with part_2 3'
        part_1_left = part_1.sequence[:homology_overlap]
        re_pattern = re.compile(f'(?={part_1_left})')
        for match in re_pattern.finditer(part_2.sequence):
            overlap_length = len(part_2.sequence) - match.start()
            # The seed hit only counts if everything up to the 3' end of part_2 also matches
            if part_2.sequence[-overlap_length:] == part_1.sequence[:overlap_length]:
                # The shared sequence sits at the 5' end of part_1 (== 3' end of part_2)
                overlap_seq = part_1.sequence[:overlap_length]
                directed_graph.add_node(part_1)
                directed_graph.add_node(part_2)
                directed_graph.add_edge(part_2, part_1, overlap=overlap_seq)

    # --- Traverse Graph --- #
    # First: simple_cycles
    # NOTE: cycles are used exactly as reported by networkx; no extra de-duplication is applied.
    all_possible_assemblies = []
    for cycle in networkx.algorithms.cycles.simple_cycles(directed_graph):
        # Consecutive node pairs around the cycle, including the edge that closes it
        node_pairs = list(pairwise(cycle))
        node_pairs.append((cycle[-1], cycle[0]))

        # For each part in the cycle, list the (part, overlap with the next part) choices.
        # Keeping the part next to its overlap is what lets the assembly step below trim the
        # right number of bases from the right sequence. Parallel edges (several valid overlaps
        # between the same two parts) each give rise to their own candidate assembly.
        junction_options = [
            [(upstream, edge_data['overlap']) for edge_data in directed_graph[upstream][downstream].values()]
            for upstream, downstream in node_pairs
        ]
        all_possible_assemblies.extend(product(*junction_options))

    # Actually do assemblies
    intermediate_assemblies = []
    for assembly in all_possible_assemblies:
        sequence_chunks = []
        features = []
        sources = []
        descriptions = []

        for part, overlap in assembly:
            # Drop the 3' overlap: the next part in the cycle starts with the same bases
            sequence_chunks.append(part.sequence[:-len(overlap)])
            if part.features:
                features.extend(part.features)
            if part.description:
                descriptions.append(part.description)
            sources.append(part.source)

        intermediate_assemblies.append({
            'sequence': ''.join(sequence_chunks),
            'features': features,
            'sources': sources,
            'description': descriptions,
        })

    # --- Make complete assemblies --- #
    # All assemblies are valid since there isn't a digestion step like with sticky-end methods
    complete_assemblies = intermediate_assemblies

    # --- Raise exceptions if something went wrong --- #
    # Plasmid specific exceptions
    if len(complete_assemblies) > 1 and plasmids_only:
        raise AssemblyException('This assembly produces more than one plasmid product. Check your input sequences.')
    if len(complete_assemblies) == 0 and plasmids_only:
        raise AssemblyException('This assembly does not produce any complete products. Check your input sequences.')

    # --- Dump assembly and things into a new Plasmid --- #
    if plasmids_only:
        final_assembly_product = complete_assemblies[0]

        # Find features that actually exist in Plasmid (either strand). A dict keeps one entry per
        # feature while preserving first-seen order, so the returned feature list is deterministic.
        plasmid_features = {}
        for feature in final_assembly_product['features']:
            regex_pattern = f'{feature.sequence}|{feature.reverse_complement()}'
            if re.search(regex_pattern, final_assembly_product['sequence']):
                plasmid_features[feature] = None

        return Plasmid(final_assembly_product['sequence'], entity_id=new_id, name=new_id,
                       features=list(plasmid_features), description=new_description,
                       source=final_assembly_product['sources'])
def _digest_by_slicing(self):
    """
    Iterate through rxn_enzyme_cuts in order and pull sequences from DNA to produce new Parts.

    Every entity in ``self.input_dna_list`` is cut at the sites reported by
    ``self.find_restriction_sites``; the fragments between neighbouring cuts become Parts. For a
    Plasmid an extra fragment is built across the sequence start/end, and that fragment is digested
    again in case a restriction site spanned the junction. On completion ``self.input_dna_list`` is
    replaced by the resulting Parts and ``self._digested`` is set to True.

    Super dirty initial implementation, we'll clean that up later (he said 5 years ago)...
    There are a lot of edge cases to check using this method, specifically catching restriction
    sites that span the start/end of a plasmid sequence... and if a plasmid only has one cut...

    :return: None
    :raises AssemblyException: if a Plasmid has no restriction site for the enzymes in this reaction
    """

    def process_sticky_end(cut_index_5, cut_index_3, strand):
        """Return (overhang_length, overhang_strand) for a cut, or None when the cut is blunt."""
        cut_index_difference = (cut_index_3 - cut_index_5) * strand
        if cut_index_difference == 0:
            return None
        overhang_strand = 5 if cut_index_difference > 0 else 3
        return abs(cut_index_difference), overhang_strand

    def order_cuts(cuts, terminal_dna=None):
        """Sort cuts by position; optionally bracket them with the 'Start'/'End' overhangs of terminal_dna."""
        ordered_cuts = OrderedDict(sorted(cuts.items(), key=lambda t: t[0]))
        if terminal_dna is not None:
            # Start (5')
            ordered_cuts.update({'Start': terminal_dna.overhang_5})
            ordered_cuts.move_to_end('Start', last=False)
            # End (3')
            ordered_cuts.update({'End': terminal_dna.overhang_3})
            ordered_cuts.move_to_end('End', last=True)
        return ordered_cuts

    # Nested because this function really should not be exposed, defeats the point of CloningReaction objects
    def make_part(template_dna, left_cut, right_cut, plasmid_span=False):
        """
        Use {cut_position: rxn_enzyme_dict} to create a new part from template_dna.

        There are subtle differences in how 5' and 3' ends are processed which makes generalizing code difficult...

        :param template_dna: DNA entity the new Part is sliced from
        :param left_cut: (cut_position, {'enzyme': BioPython Restriction, 'strand': {1 | -1} }) or ('Start', overhang)
        :param right_cut: (cut_position, {'enzyme': BioPython Restriction, 'strand': {1 | -1} }) or ('End', overhang)
        :param plasmid_span: assemble the part across the sequence end/start (left_cut -> end, start -> right_cut)
        :return: new Part, or None for dud/duplicate parts
        """
        sequence_length = len(template_dna.sequence)
        template_is_plasmid = isinstance(template_dna, Plasmid)

        # --- Process 5' of new Part --- #
        if left_cut[0] == 'Start':
            part_overhang_5 = left_cut[1]
            left_cut_index = 0
            left_cut_indices = (0, 0)
        else:
            cut_position_left, rxn_enzyme_dict_left = left_cut
            cut_index_5, cut_index_3, strand = template_dna.find_cut_indicies(cut_position_left,
                                                                              rxn_enzyme_dict_left['enzyme'])
            left_cut_indices = (cut_index_5, cut_index_3)
            left_cut_index = min(left_cut_indices)

            if template_is_plasmid:
                # Process restriction site
                part_overhang_5 = process_sticky_end(cut_index_5, cut_index_3, strand)
            else:
                # Handle already-processed restriction sites at part terminals (e.g. digested BsaI part backbone)
                stick_end_offset = 0 if template_dna.overhang_5 is None else template_dna.overhang_5[0]
                if min(cut_index_5, cut_index_3) <= stick_end_offset:
                    part_overhang_5 = template_dna.overhang_5
                else:
                    # Process restriction site
                    part_overhang_5 = process_sticky_end(cut_index_5, cut_index_3, strand)

        # --- Process 3' of new part --- #
        if right_cut[0] == 'End':
            part_overhang_3 = right_cut[1]
            right_cut_index = sequence_length
            right_cut_indices = (right_cut_index, right_cut_index)
        else:
            cut_position_right, rxn_enzyme_dict_right = right_cut
            cut_index_5, cut_index_3, strand = template_dna.find_cut_indicies(cut_position_right,
                                                                              rxn_enzyme_dict_right['enzyme'])
            right_cut_indices = (cut_index_5, cut_index_3)
            right_cut_index = max(right_cut_indices)

            if template_is_plasmid:
                # Process restriction site
                part_overhang_3 = process_sticky_end(cut_index_5, cut_index_3, strand)
            else:
                # Handle already-processed restriction sites at part terminals (e.g. digested BsaI part backbone).
                # The 3' boundary depends on the 3' overhang (this used to test overhang_5 by mistake).
                if template_dna.overhang_3 is None:
                    stick_end_offset = sequence_length
                else:
                    stick_end_offset = sequence_length - template_dna.overhang_3[0]
                if max(cut_index_5, cut_index_3) >= stick_end_offset:
                    part_overhang_3 = template_dna.overhang_3
                else:
                    # Process restriction site
                    part_overhang_3 = process_sticky_end(cut_index_5, cut_index_3, strand)

        # --- Check for dud parts --- #
        # These pop up as a result of checking for parts across plasmid sequence end/start.

        # Match between restriction sites directly adjacent to previously processed start/end of part
        if not template_is_plasmid and max(left_cut_indices) >= min(right_cut_indices):
            return None

        # Duplicate part
        # NOTE(review): this also discards a non-Plasmid input that has no restriction site at all
        # ('Start' -> 'End' covers the whole sequence) - confirm that is intended.
        if min(left_cut_indices) == 0 and max(right_cut_indices) == sequence_length:
            return None

        # --- Make new Part --- #
        if plasmid_span:
            new_part_sequence = template_dna.sequence[left_cut_index:] + template_dna.sequence[:right_cut_index]
        else:
            new_part_sequence = template_dna.sequence[left_cut_index:right_cut_index]

        # Metadata is taken from the template argument rather than the enclosing loop variable
        return Part(new_part_sequence, entity_id=template_dna.entity_id, name=template_dna.name,
                    features=template_dna.features, description=template_dna.name,
                    source=template_dna.entity_id, overhang_5=part_overhang_5, overhang_3=part_overhang_3)

    digestion_products = []

    # For each input DNA entity
    for input_dna in self.input_dna_list:
        input_is_plasmid = isinstance(input_dna, Plasmid)

        # --- Find restriction sites in input_dna --- #
        rxn_enzyme_cuts = self.find_restriction_sites(input_dna)

        if len(rxn_enzyme_cuts) == 0 and input_is_plasmid:
            raise AssemblyException(f'Plasmid {input_dna.entity_id} cannot be cut by the restriction enzymes in this reaction!\n'
                                    f'Enzymes: {" ".join([a.__name__ for a in self.restriction_enzyme_list])}')

        # If Part/DNA, add overhang information to start/end of OrderedDict
        sorted_rxn_enzyme_cuts = order_cuts(rxn_enzyme_cuts, None if input_is_plasmid else input_dna)

        # --- Perform assemblies --- #
        # A Plasmid with a single site yields no pairs here; it is linearised by the wrap-around part below
        for left_cut, right_cut in pairwise(sorted_rxn_enzyme_cuts.items()):
            digestion_products.append(make_part(input_dna, left_cut, right_cut))

        if not input_is_plasmid:
            continue

        # Perform last part assembly across sequence start/end for Plasmids
        cut_items = list(sorted_rxn_enzyme_cuts.items())
        plasmid_span_part = make_part(input_dna, cut_items[-1], cut_items[0], plasmid_span=True)
        if plasmid_span_part is None:
            # Dud/duplicate wrap-around fragment: nothing to add or re-digest
            continue

        # Check for restriction sites that may have spanned gap
        span_rxn_enzyme_cuts = self.find_restriction_sites(plasmid_span_part)
        if not span_rxn_enzyme_cuts:
            digestion_products.append(plasmid_span_part)
        else:
            sorted_span_cuts = order_cuts(span_rxn_enzyme_cuts, plasmid_span_part)
            for left_cut, right_cut in pairwise(sorted_span_cuts.items()):
                digestion_products.append(make_part(plasmid_span_part, left_cut, right_cut))

    # make_part returns None for dud parts; drop those
    self.input_dna_list = [a for a in digestion_products if a is not None]
    self._digested = True