def _edge_to_region(self, n1, n2):
    """Build the ``Region`` of the query covered by the edge ``n1 -> n2``.

    The region normally starts at ``n1.index`` and ends at ``n2.index``.
    When the query is circular and ``n1.index`` equals the query length,
    the start is taken to be ``0`` instead.

    :param n1: source node; only its ``index`` attribute is read
    :param n2: destination node; only its ``index`` attribute is read
    :return: a ``Region`` over the query length, cyclic iff the query is
    """
    query_length = len(self.query)
    circular = is_circular(self.query)
    # A start index sitting exactly at the end of a circular query is
    # re-expressed as position 0.
    start = 0 if (circular and n1.index == query_length) else n1.index
    return Region(start, n2.index, query_length, cyclic=circular)
def __init__(
    self,
    graph: nx.DiGraph,
    query: SeqRecord,
    span_cost: SpanCost,
    seqdb: Dict[str, SeqRecord],
    container: AlignmentContainer,
    stats_repeat_window: Optional[int] = None,
    stats_window: Optional[int] = None,
    stats_hairpin_window: Optional[int] = None,
    edge_threshold: Optional[float] = None,
    stages: Optional[Tuple[str]] = None,
):
    """Set up the post-processing state for one query's assembly graph.

    Every optional argument left as ``None`` falls back to the attribute
    of the same purpose on ``SequenceScoringConfig``.

    :param graph: graph to operate on; also installed as ``graph_builder.G``
    :param query: the query record
    :param span_cost: passed to ``AssemblyGraphBuilder`` and kept on ``self``
    :param seqdb: mapping of keys to sequence records
    :param container: passed to ``AssemblyGraphBuilder``
    :param stats_repeat_window: ``repeat_window`` argument for ``DNAStats``
    :param stats_window: ``stats_window`` argument for ``DNAStats``
    :param stats_hairpin_window: ``hairpin_window`` argument for ``DNAStats``
    :param edge_threshold: stored as ``self.edge_threshold``
    :param stages: stored as ``self.stages``
    """
    defaults = SequenceScoringConfig
    stats_repeat_window = (
        defaults.stats_repeat_window
        if stats_repeat_window is None
        else stats_repeat_window
    )
    stats_window = defaults.stats_window if stats_window is None else stats_window
    stats_hairpin_window = (
        defaults.stats_hairpin_window
        if stats_hairpin_window is None
        else stats_hairpin_window
    )
    stages = defaults.post_process_stages if stages is None else stages
    edge_threshold = (
        defaults.edge_threshold if edge_threshold is None else edge_threshold
    )

    self.graph = graph
    self.graph_builder = AssemblyGraphBuilder(container, span_cost=span_cost)
    self.graph_builder.G = graph
    self.query = query
    self.seqdb = seqdb

    # ``self.stats`` is computed over the sequence repeated twice when the
    # query is circular; ``self.stats_single`` always uses a single copy.
    linear_seq = str(query.seq)
    scored_seq = linear_seq * 2 if is_circular(query) else linear_seq
    window_settings = {
        "repeat_window": stats_repeat_window,
        "stats_window": stats_window,
        "hairpin_window": stats_hairpin_window,
    }
    self.stats = DNAStats(scored_seq, **window_settings)
    self.stats_single = DNAStats(linear_seq, **window_settings)

    self.logged_msgs = []
    # TODO: make a more sophisticated complexity function?
    # TODO: expose this to input parameters
    self.COMPLEXITY_THRESHOLD = SequenceScoringConfig.complexity_threshold
    self.logger = logger(self)
    self.span_cost = span_cost
    self.stages = stages
    self.edge_threshold = edge_threshold
def _collect_optimize_args(
        self, graphs: Dict[str, nx.DiGraph]) -> Tuple[str, nx.DiGraph, bool, dict]:
    """Generate the per-query inputs needed to optimize each graph.

    For every ``(query_key, graph)`` pair in ``graphs`` this yields a
    five-item tuple: the query key, the graph, the length of the query
    record, whether the query is circular, and a fresh ``DesignResult``
    bound to the query's container, key and graph.

    NOTE(review): this is a generator yielding five items, while the
    declared return annotation describes a four-item tuple — the
    annotation looks out of date.

    :param graphs: mapping of query key to its graph
    """
    progress = self.logger.tqdm(graphs.items(), "INFO", desc="optimizing graphs")
    for key, query_graph in progress:
        query_container = self.containers[key]
        record = query_container.seqdb[key]
        circular = is_circular(record)
        design_result = DesignResult(
            container=query_container, query_key=key, graph=query_graph
        )
        yield key, query_graph, len(record), circular, design_result
def make_blast():
    """Build a ``BioBlast`` from the GenBank test data under ``here``.

    Subjects are every template matching the templates glob; the query is
    the single design file, passed through ``make_circular``. The first
    query is asserted to be circular before the ``BioBlast`` is returned.
    """
    templates_glob = join(here, "data/test_data/genbank/templates/*.gb")
    design_path = join(
        here, "data/test_data/genbank/designs/pmodkan-ho-pact1-z4-er-vpr.gb"
    )

    subjects = load_genbank_glob(templates_glob, force_unique_ids=True)
    linear_queries = load_genbank_glob(design_path, force_unique_ids=True)
    queries = make_circular(linear_queries)
    assert is_circular(queries[0])
    return BioBlast(subjects, queries)
def _get_design_status(self, qk):
    """Summarize how far the design for query key ``qk`` has progressed.

    The returned dict holds:

    * ``"record"`` - name, sequence length, id and circularity of the record
    * ``"compiled"`` - True when ``self.graphs`` has a non-None entry for ``qk``
    * ``"run"`` - True when ``self.results`` has a non-None entry for ``qk``
    * ``"success"`` - True when that result has at least one assembly
    * ``"assemblies"`` - one cost summary per assembly

    :param qk: query key used to index ``seqdb``, ``graphs`` and ``results``
    :return: the status dictionary
    """
    record = self.seqdb[qk]
    status = {
        "compiled": False,
        "run": False,
        "success": False,
        "assemblies": [],
        "record": {
            "name": record.name,
            "length": len(record.seq),
            "id": record.id,
            "is_circular": is_circular(record),
        },
    }

    status["compiled"] = self.graphs.get(qk) is not None

    result = self.results.get(qk)
    if result is None:
        return status
    status["run"] = True

    for assembly in result.assemblies:
        status["success"] = True
        summary = assembly.to_df()
        material_cost = sum(summary["material"])
        efficiency = functools.reduce(operator.mul, summary["efficiency"])
        # Largest truthy complexity value; stays 0 when none exceeds it.
        peak_complexity = 0
        for value in summary["complexity"]:
            if value and value > peak_complexity:
                peak_complexity = value
        status["assemblies"].append({
            "cost": {
                "material cost": round(material_cost, 2),
                "assembly efficiency": round(efficiency, 2),
                "max synthesis complexity": round(peak_complexity, 2),
            }
        })
    return status
def _optimize(self, n_paths) -> Dict[str, DesignResult]:
    """Find the optimal paths for each query in the design.

    Iterates over the tuples produced by ``_collect_optimize_args`` and,
    for each query, runs ``optimize_graph`` and stores the resulting paths
    on that query's ``DesignResult``. When no path is found an error is
    logged; ``add_assemblies`` is still called with the empty paths.

    :param n_paths: forwarded to ``optimize_graph`` as its last argument
    :return: mapping of query key to its ``DesignResult``
    """
    results_dict = {}
    progress = self.logger.tqdm(
        self._collect_optimize_args(self.graphs),
        "INFO",
        desc="optimizing graphs (n_graphs={})".format(len(self.graphs)),
    )
    # ``_collect_optimize_args`` already yields the query length and the
    # circularity flag (computed from the same container/record), so they
    # are used directly instead of being looked up a second time.
    for query_key, graph, query_length, cyclic, result in progress:
        results_dict[query_key] = result
        paths, _ = optimize_graph(graph, query_length, cyclic, n_paths)
        if not paths:
            query_rec = self.blast_factory.db.records[query_key]
            self.logger.error(
                "\n\tThere were no solutions found for design '{}' ({}).\n\t"
                "This sequence may be better synthesized. Use a tool such as JBEI's"
                " BOOST.".format(query_rec.name, query_key))
        result.add_assemblies(paths, ignore_invalid=True)
    return results_dict
def filter_linear_records(cls, records: List[SeqRecord]) -> List[SeqRecord]:
    """Return the records that ``is_circular`` reports as not circular.

    The input order is preserved and ``records`` itself is not modified.
    """
    linear_records = []
    for record in records:
        if is_circular(record):
            continue
        linear_records.append(record)
    return linear_records
def cyclic(self):
    """Return the result of ``is_circular`` for this object's query."""
    query = self.query
    return is_circular(query)
def partition(self, edges: List[Edge]):
    """Split selected edges at partition points and add the new edges.

    Steps, mirrored by the three tracker updates:

    0. Filter ``edges`` through ``_filter_partition_edges``.
    1. Compute partition locations for ``self.stats_single`` with
       ``find_by_partitions_for_sequence`` using the threshold, step size
       and overlap from ``Config.SequenceScoringConfig``.
    2. For each filtered edge whose end nodes have types ``"B"`` and
       ``"A"``, and for each partition location inside the edge's region,
       create two overhang nodes and try to connect
       ``n1 -> n3 -> n4 -> n2``. Then update costs for every graph edge
       whose cost is ``None`` and re-score complexity on all graph edges.

    :param edges: edges to consider; their count is reported as the number
        of "highly complex sequences" (presumably ``(n1, n2, edata)``
        triples like the filtered edges — confirm against callers)
    """
    tracker = self.logger.track(
        "INFO", desc="Partitioning sequences", total=3
    ).enter()
    tracker.update(0, "{} highly complex sequences".format(len(edges)))
    edges_to_partition = self._filter_partition_edges(edges)
    cyclic = is_circular(self.query)
    partitions = find_by_partitions_for_sequence(
        self.stats_single,
        cyclic=cyclic,
        threshold=Config.SequenceScoringConfig.complexity_threshold,
        step_size=Config.SequenceScoringConfig.partition_step_size,
        delta=Config.SequenceScoringConfig.partition_overlap,
    )
    tracker.update(1, "Partition: locations: {}".format(partitions))
    # Gap edges are built with add_to_graph=False and collected in
    # ``new_edges`` for insertion below; overlap edges are requested with
    # add_to_graph=True.
    # NOTE(review): presumably that means overlap edges reach the graph
    # immediately, even when a later check in the loop hits ``continue`` —
    # confirm against AssemblyGraphBuilder.add_overlap_edge.
    add_gap_edge = partial(self.graph_builder.add_gap_edge, add_to_graph=False)
    add_overlap_edge = partial(
        self.graph_builder.add_overlap_edge,
        add_to_graph=True,
        validate_groups_present=False,
        groups=None,
        group_keys=None,
    )
    new_edges = []
    for n1, n2, edata in edges_to_partition:
        r = Region(n1.index, n2.index, len(self.query.seq), cyclic=cyclic)
        # Only edges running from a "B"-type node to an "A"-type node are split.
        if n1.type == "B" and n2.type == "A":
            for p in partitions:
                if p in r:
                    # TODO: overlap? find optimal partition for overlap?
                    # n4 sits at the partition point, n3 one overlap
                    # length past it.
                    i4 = p
                    i3 = p + Config.SequenceScoringConfig.partition_overlap
                    n3 = AssemblyNode(i3, False, "BC", overhang=True)
                    n4 = AssemblyNode(i4, False, "CA", overhang=True)
                    # Each segment is attempted with origin=False and
                    # origin=True; the partition is skipped as soon as a
                    # segment yields no edge for either setting.
                    e1 = add_gap_edge(n1, n3, r, origin=False)
                    e2 = add_gap_edge(n1, n3, r, origin=True)
                    if e1 is None and e2 is None:
                        continue
                    e3 = add_overlap_edge(n3, n4, r, origin=False)
                    e4 = add_overlap_edge(n3, n4, r, origin=True)
                    if e3 is None and e4 is None:
                        continue
                    e5 = add_gap_edge(n4, n2, r, origin=False)
                    e6 = add_gap_edge(n4, n2, r, origin=True)
                    if e5 is None and e6 is None:
                        continue
                    new_edges += [e1, e2, e3, e4, e5, e6]
    # Insert every collected edge that was actually produced; each is
    # indexed as (source, target, attribute dict).
    for e in new_edges:
        if e is not None:
            self.graph_builder.G.add_edge(e[0], e[1], **e[2])
    # Gather every edge in the graph that has no cost yet. Note this
    # rebinds the ``edges`` parameter.
    edges = []
    for n1, n2, edata in self.graph_builder.G.edges(data=True):
        if edata["cost"] is None:
            edges.append((n1, n2, edata))
    # The count reported here is the number of cost-less edges found above.
    tracker.update(2, "Partition: Added {} new edges".format(len(edges)))
    self.graph_builder.update_costs(edges)
    self.score_complexity_edges(list(self.graph_builder.G.edges(data=True)))
    tracker.exit()
# Forward ``records`` to ``is_circular`` and return whatever it returns.
# NOTE(review): the callee has the same name as this function. If this def
# lives at module level, the call resolves to this function itself and
# recurses without end. If it is a method inside a class body (the enclosing
# scope and any decorator are not visible here), the name resolves to a
# module-level ``is_circular`` instead. Confirm which applies.
def is_circular(records): return is_circular(records)