Python pairwise Examples

Programming Language: Python

Namespace/Package Name: dnassembly.utils.utils

Method/Function: pairwise

Examples at hotexamples.com: 3

Python pairwise - 3 examples found. These are the top rated real world Python examples of dnassembly.utils.utils.pairwise extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

    def perform_assembly(self, plasmids_only=True, new_id='New_assembly', new_description='New assembly'):
        """
        Performs an assembly provided a pool of Parts.

        Inspiration from pydna to use networkx to represent assemblies as a graph. However I think we can be a lot more
        efficient and enforce our biological constraints at the same time.
        :param plasmids_only: only report cyclic assemblies that fulfill reaction constraints
        :return:
        """

        # --- Sanity Checks --- #

        """
        All DNA entities in input_dna_list should be parts. Any undigested Plasmids with the same resistance marker as
        your assembly product will totally mess up your day.
        """
        if any([isinstance(dna, Plasmid) for dna in self.input_dna_list]):
            raise ReactionDefinitionException('Undigested plasmids cannot be part of an assembly!')

        # --- Assemble Graph --- #

        directed_graph = networkx.MultiDiGraph()

        # Add nodes and edges for each part in digest_pool
        for part in self.input_dna_list:

            sticky_match_l = part.overhang_5
            sticky_match_r = part.overhang_3

            # Process sticky ends into nodes
            # (overhang_sequence, overhang_strand)
            l_node = 'Blunt_Left' if sticky_match_l is None else (part.sequence[:sticky_match_l[0]], sticky_match_l[1])
            r_node = 'Blunt_Right' if sticky_match_r is None else (part.sequence[-sticky_match_r[0]:], sticky_match_r[1])

            directed_graph.add_node(l_node)
            directed_graph.add_node(r_node)

            # Add directed edge between nodes (5' -> 3') and add Part as edge attribute
            directed_graph.add_edge(l_node, r_node, part=part)

        # --- Traverse Graph --- #

        # First: simple_cycles
        graph_cycles = networkx.algorithms.cycles.simple_cycles(directed_graph)
        processed_cycles = list()
        all_possible_assemblies = list()

        for cycle in graph_cycles:

            if not any([set(cycle) == processed_cycle for processed_cycle in processed_cycles]):
                sequences_list = list()

                # Iterate through edges and get sequence up to 3' sticky end
                for node_l, node_r in pairwise(cycle):
                    current_edge = directed_graph[node_l][node_r]
                    new_sequence_list = [current_edge[index]['part'] for index, edge in enumerate(current_edge)]
                    sequences_list.append(new_sequence_list)

                # Get last part of cycle
                last_edge = directed_graph[cycle[-1]][cycle[0]]
                new_sequence_list = [last_edge[index]['part'] for index, edge in enumerate(last_edge)]
                sequences_list.append(new_sequence_list)
                all_possible_assemblies += product(*sequences_list)

        # Actually do assemblies
        intermediate_assemblies = list()
        for assembly in all_possible_assemblies:
            assembly_dict = dict()
            assembly_dict['sequence'] = ''
            assembly_dict['features'] = list()
            assembly_dict['sources'] = list()
            assembly_dict['description'] = list()
            for part in assembly:
                right_sticky_end = part.overhang_3[0]
                assembly_dict['sequence'] += part.sequence[:-right_sticky_end]
                if part.features:
                    assembly_dict['features'] += part.features
                if part.description:
                    assembly_dict['description'].append(part.description)
                assembly_dict['sources'].append(part.source)

            if assembly_dict['sequence'] != '':
                intermediate_assemblies.append(assembly_dict)

        # In plasmids_only=False: all_simple_paths, iterate over all combinations of nodes
        # todo: write this

        # --- Final digestion --- #

        # Omit assemblies that still possess restriction sites for enzymes in restriction_enzyme_list
        complete_assemblies = list()

        for assembly in intermediate_assemblies:
            rxn_site_found = False

            for enzyme in self.restriction_enzyme_list:
                if len(enzyme.compsite.findall(assembly['sequence'])) > 0:
                    rxn_site_found = True

            if rxn_site_found is True:
                continue
            else:
                complete_assemblies.append(assembly)

        # --- Raise exceptions if something went wrong --- #

        # Plasmid specific exceptions
        if len(complete_assemblies) > 1 and plasmids_only:
            raise AssemblyException('This assembly produces more than one plasmid product. Check your input sequences.')

        if len(complete_assemblies) == 0 and plasmids_only:
            raise AssemblyException('This assembly does not produce any complete products. Check your input sequences.')

        # --- Dump assembly and things into a new Plasmid --- #

        if plasmids_only:

            final_assembly_product = complete_assemblies[0]

            # Find features that actually exist in Plasmid
            plasmid_feature_set = set()
            for feature in final_assembly_product['features']:
                regex_pattern = f'{feature.sequence}|{feature.reverse_complement()}'
                if len(re.findall(regex_pattern, final_assembly_product['sequence'])) > 0:
                    plasmid_feature_set.add(feature)

            return Plasmid(final_assembly_product['sequence'], entity_id=new_id, name=new_id, features=list(plasmid_feature_set),
                           description=new_description, source=final_assembly_product['sources'])

Example #2

Show file

    def perform_assembly(self, plasmids_only=True, new_id='New_assembly', new_description='New assembly'):
        """
        We're going to use our knowledge of how homology-directed assembly works to take a few short-cuts...
        Nodes are sequences and edges are homology overlaps
        :return:
        """

        homology_overlap = 6

        # --- Assemble Graph --- #

        directed_graph = networkx.MultiDiGraph()

        for part_1, part_2 in combinations(self.input_dna_list, 2):

            print(part_1)
            print(part_2)

            # Get possible part_1 3' overlap with part_2 5'
            part_1_right = part_1.sequence[-homology_overlap:]
            re_pattern = re.compile(f'(?={part_1_right})')
            for match in re_pattern.finditer(part_2.sequence):

                overlap_length = match.start() + len(part_1_right)
                print(part_2.sequence[:overlap_length], part_1.sequence[-overlap_length:])
                if part_2.sequence[:overlap_length] == part_1.sequence[-overlap_length:]:
                    print('Overlap found, adding nodes and edges to graph.')
                    overlap_seq = part_1.sequence[-overlap_length:]

                    directed_graph.add_node(part_1)
                    directed_graph.add_node(part_2)
                    directed_graph.add_edge(part_1, part_2, overlap=overlap_seq)

            # Get possible part_1 5' overlap with part_2 3'
            part_1_left = part_1.sequence[:homology_overlap]
            re_pattern = re.compile(f'(?={part_1_left})')
            for match in re_pattern.finditer(part_2.sequence):
                overlap_length = len(part_2.sequence) - match.start()
                print(part_2.sequence[-overlap_length:], part_1.sequence[:overlap_length])
                if part_2.sequence[-overlap_length:] == part_1.sequence[:overlap_length]:
                    print('Overlap found, adding nodes and edges to graph.')
                    overlap_seq = part_1.sequence[-overlap_length:]

                    directed_graph.add_node(part_1)
                    directed_graph.add_node(part_2)
                    directed_graph.add_edge(part_2, part_1, overlap=overlap_seq)

        # --- Traverse Graph --- #

        # First: simple_cycles
        graph_cycles = networkx.algorithms.cycles.simple_cycles(directed_graph)

        processed_cycles = list()
        all_possible_assemblies = list()

        for cycle in graph_cycles:

            if not any([set(cycle) == processed_cycle for processed_cycle in processed_cycles]):
                sequences_list = list()

                # Iterate through nodes
                for node_l, node_r in pairwise(cycle):
                    current_edge = directed_graph[node_l][node_r]
                    new_sequence_list = [current_edge[index]['overlap'] for index, edge in enumerate(current_edge)]
                    sequences_list.append(new_sequence_list)

                # Get last part of cycle
                last_edge = directed_graph[cycle[-1]][cycle[0]]
                new_sequence_list = [last_edge[index]['overlap'] for index, edge in enumerate(last_edge)]
                sequences_list.append(new_sequence_list)
                all_possible_assemblies += product(*sequences_list)

        # Actually do assemblies
        intermediate_assemblies = list()
        for assembly in all_possible_assemblies:
            print(assembly)
            assembly_dict = dict()
            assembly_dict['sequence'] = ''
            assembly_dict['features'] = list()
            assembly_dict['sources'] = list()
            assembly_dict['description'] = list()

            # Technically it shouldn't be possible for two edges to exist a pair of sequences with homology...
            # but I wanted to reuse sequence_from_cycles()

            for part_1, part_2 in pairwise(assembly):

                overlap_length = len(directed_graph[part_1][part_2][0]['overlap'])
                assembly_dict['sequence'] += part_1.sequence[:-overlap_length]

                if part_1.features:
                    assembly_dict['features'] += part_1.features
                if part_1.description:
                    assembly_dict['description'].append(part_1.description)
                assembly_dict['sources'].append(part_1.source)

            # Get last part
            first_part = assembly[0]
            last_part = assembly[-1]
            overlap_length = len(directed_graph[last_part][first_part][0]['overlap'])
            assembly_dict['sequence'] += last_part.sequence[:-overlap_length]

            if last_part.features:
                assembly_dict['features'] += last_part.features
            if last_part.description:
                assembly_dict['description'].append(last_part.description)
            assembly_dict['sources'].append(last_part.source)

            intermediate_assemblies.append(assembly_dict)

        # --- Make complete assemblies --- #
        """All assemblies are valid since there isn't a digestion step like with sticky-end methods"""
        complete_assemblies = intermediate_assemblies

        # --- Raise exceptions if something went wrong --- #

        # Plasmid specific exceptions
        if len(complete_assemblies) > 1 and plasmids_only:
            raise AssemblyException('This assembly produces more than one plasmid product. Check your input sequences.')

        if len(complete_assemblies) == 0 and plasmids_only:
            raise AssemblyException('This assembly does not produce any complete products. Check your input sequences.')

        # --- Dump assembly and things into a new Plasmid --- #

        if plasmids_only:

            final_assembly_product = complete_assemblies[0]

            # Find features that actually exist in Plasmid
            plasmid_feature_set = set()
            for feature in final_assembly_product['features']:
                regex_pattern = f'{feature.sequence}|{feature.reverse_complement()}'
                if len(re.findall(regex_pattern, final_assembly_product['sequence'])) > 0:
                    plasmid_feature_set.add(feature)

            return Plasmid(final_assembly_product['sequence'], entity_id=new_id, name=new_id, features=list(plasmid_feature_set),
                           description=new_description, source=final_assembly_product['sources'])

Example #3

Show file

    def _digest_by_slicing(self):
        """
        Iterate through rxn_enzyme_cuts in order and pull sequences from DNA to produce new Parts.
        Super dirty initial implementation, we'll clean that up later (he said 5 years ago)...
        There are a lot of edge cases I have to check using this method, specifically catching restriction sites that
        span the start/end of a plasmid sequence...
        and if a plasmid only has one cut...
        :return:
        """
        def process_sticky_end(cut_index_5, cut_index_3, strand):
            cut_index_difference = (cut_index_3 - cut_index_5) * strand
            if cut_index_difference == 0:
                return None
            else:
                overhang_strand = 5 if cut_index_difference > 0 else 3
                return abs(cut_index_difference), overhang_strand

        # Nested because this function really should not be exposed, defeats the point of CloningReaction objects
        def make_part(template_dna, left_cut, right_cut, plasmid_span=False):
            """
            Use {cut_position: rxn_enzyme_dict} to create a new part from input_dna
            There are subtle differences in how 5' and 3' ends are processed which makes generalizing code difficult...

            :param left_cut: {cut_position: {'enzyme': BioPython Restriction, 'strand': {1 | -1} } }
            :param right_cut: {cut_position: {'enzyme': BioPython Restriction, 'strand': {1 | -1} } }
            :param plasmid_span: assemble
            """

            # --- Process 5' of new Part --- #

            if left_cut[0] == 'Start':
                part_overhang_5 = left_cut[1]
                left_cut_index = 0
                left_cut_indicies = (0,0)

            else:
                cut_position_left, rxn_enzyme_dict_left = left_cut
                cut_index_5, cut_index_3, strand = template_dna.find_cut_indicies(cut_position_left, rxn_enzyme_dict_left['enzyme'])
                left_cut_indicies = (cut_index_5, cut_index_3)
                left_cut_index = min(left_cut_indicies)

                if isinstance(template_dna, Plasmid):
                    # Process restriction site
                    part_overhang_5 = process_sticky_end(cut_index_5, cut_index_3, strand)
                else:
                    # Handle already-processed restriction sites at part terminals (e.g. digested BsaI part backbone)
                    stick_end_offset = 0 if template_dna.overhang_5 is None else template_dna.overhang_5[0]
                    if min(cut_index_5, cut_index_3) <= stick_end_offset:
                        part_overhang_5 = template_dna.overhang_5
                    # Process restriction site
                    else:
                        part_overhang_5 = process_sticky_end(cut_index_5, cut_index_3, strand)

            # --- Process 3' of new part --- #

            if right_cut[0] == 'End':
                part_overhang_3 = right_cut[1]
                right_cut_index = len(template_dna.sequence)
                right_cut_indices = (right_cut_index, right_cut_index)
            else:
                cut_position_right, rxn_enzyme_dict_right = right_cut
                cut_index_5, cut_index_3, strand = template_dna.find_cut_indicies(cut_position_right, rxn_enzyme_dict_right['enzyme'])
                right_cut_indices = (cut_index_5, cut_index_3)
                right_cut_index = max(right_cut_indices)

                if isinstance(template_dna, Plasmid):
                    # Process restriction site
                    part_overhang_3 = process_sticky_end(cut_index_5, cut_index_3, strand)
                else:
                    # Handle already-processed restriction sites at part terminals (e.g. digested BsaI part backbone)
                    stick_end_offset = len(template_dna.sequence) if template_dna.overhang_5 is None else (len(template_dna.sequence) - template_dna.overhang_3[0])
                    if max(cut_index_5, cut_index_3) >= stick_end_offset:
                        part_overhang_3 = template_dna.overhang_3
                    # Process restriction site
                    else:
                        part_overhang_3 = process_sticky_end(cut_index_5, cut_index_3, strand)

            # --- Check for dud parts --- #
            "These pop up as a result of checking for parts across plasmid sequence end/start."

            # Match between restriction sites directly adjacent to previously processed start/end of part
            if not isinstance(template_dna, Plasmid) and max(left_cut_indicies) >= min(right_cut_indices):
                return None
            # Duplicate part
            if min(left_cut_indicies) == 0 and max(right_cut_indices) == len(template_dna.sequence):
                return None

            # --- Make new Part --- #

            if plasmid_span:
                new_part_sequence = template_dna.sequence[left_cut_index:] + template_dna.sequence[:right_cut_index]
            else:
                new_part_sequence = template_dna.sequence[left_cut_index:right_cut_index]
            new_part = Part(new_part_sequence, entity_id=input_dna.entity_id, name=input_dna.name, features=input_dna.features,
                            description=input_dna.name, source=input_dna.entity_id, overhang_5=part_overhang_5,
                            overhang_3=part_overhang_3)
            return new_part

        digestion_products = list()

        # For each input DNA entity
        for input_dna in self.input_dna_list:

            # --- Find restriction sites in input_dna --- #

            rxn_enzyme_cuts = self.find_restriction_sites(input_dna)
            if len(rxn_enzyme_cuts) == 0 and isinstance(input_dna, Plasmid):
                raise AssemblyException(f'Plasmid {input_dna.entity_id} cannot be cut by the restriction enzymes in this reaction!\n'
                                        f'Enzymes: {" ".join([a.__name__ for a in self.restriction_enzyme_list])}')

            sorted_rxn_enzyme_cuts = OrderedDict(sorted(rxn_enzyme_cuts.items(), key=lambda t: t[0]))

            # If Part/DNA, add overhang information to start/end of OrderedDict
            if not isinstance(input_dna, Plasmid):
                # Start (5')
                sorted_rxn_enzyme_cuts.update({'Start': input_dna.overhang_5})
                sorted_rxn_enzyme_cuts.move_to_end('Start', last=False)
                # End (3')
                sorted_rxn_enzyme_cuts.update({'End': input_dna.overhang_3})
                sorted_rxn_enzyme_cuts.move_to_end('End', last=True)

            # Duplicate cut if there's only one site in a Plasmid
            if isinstance(input_dna, Plasmid) and len(sorted_rxn_enzyme_cuts) == 1:
                sorted_rxn_enzyme_cuts.update(rxn_enzyme_cuts)

            # --- Perform assemblies --- #

            for left_cut, right_cut in pairwise(sorted_rxn_enzyme_cuts.items()):
                new_part = make_part(input_dna, left_cut, right_cut)
                digestion_products.append(new_part)

            # Perform last part assembly across sequence start/end for Plasmids
            if isinstance(input_dna, Plasmid):

                dict_keys = [a for a in sorted_rxn_enzyme_cuts.keys()]
                right_cut = (dict_keys[0], sorted_rxn_enzyme_cuts[dict_keys[0]])
                left_cut = (dict_keys[-1], sorted_rxn_enzyme_cuts[dict_keys[-1]])
                plasmid_span_part = make_part(input_dna, left_cut, right_cut, plasmid_span=True)

                # Check for restriction sites that may have spanned gap
                rxn_enzyme_cuts = self.find_restriction_sites(plasmid_span_part)

                if not rxn_enzyme_cuts:
                    digestion_products.append(plasmid_span_part)

                else:
                    sorted_rxn_enzyme_cuts = OrderedDict(sorted(rxn_enzyme_cuts.items(), key=lambda t: t[0]))

                    # Start (5')
                    sorted_rxn_enzyme_cuts.update({'Start': plasmid_span_part.overhang_5})
                    sorted_rxn_enzyme_cuts.move_to_end('Start', last=False)
                    # End (3')
                    sorted_rxn_enzyme_cuts.update({'End': plasmid_span_part.overhang_3})
                    sorted_rxn_enzyme_cuts.move_to_end('End', last=True)

                    for left_cut, right_cut in pairwise(sorted_rxn_enzyme_cuts.items()):

                        plasmid_span_parts = make_part(plasmid_span_part, left_cut, right_cut)
                        digestion_products.append(plasmid_span_parts)

        self.input_dna_list = [a for a in digestion_products if a is not None]
        self._digested = True