def sort_by_domain_architecture(self):
    """Group the sequences of ``self.seqcat`` by domain architecture.

    For every sequence the assignments are ordered by start position and
    passed through ``resolve_interpro_ids``; a ``TreeRepresentation`` of the
    resolved assignments yields ``seq.architecture`` and
    ``seq.architecture_pos``.  ``self.domain_archs`` maps each architecture
    string to the list of sequence IDs that have it, and ``seq.assignments``
    is replaced by the resolved assignments.

    When ``self.details_file`` is set, a per-sequence report (sources,
    coverage and one row per assignment) is written to it.  Assignments
    without a comment whose domain name starts with the novel-domain prefix
    (``self.options.prefix``, falling back to ``"NOVEL"``) are reported with
    stage ``novel``.
    """
    self.domain_archs = defaultdict(list)
    prefix = self.options.prefix or "NOVEL"
    details = self.details_file

    for seq_id, seq in self.seqcat.items():
        assignments = sorted(seq.assignments,
                             key=operator.attrgetter("start"))
        if details:
            print(seq_id, file=details)

        # Sources of the assignments whose comment marks them as stage "1".
        primary_sources = set()
        new_assignments = []
        for assignment in assignments:
            new_assignments.append(
                assignment.resolve_interpro_ids(self.interpro))
            if assignment.comment == "1":
                primary_sources.add(assignment.source)

        tree_arch = TreeRepresentation(new_assignments, self.interpro)
        seq.architecture = tree_arch.get_string()
        seq.architecture_pos = tree_arch.get_string_positions()
        self.domain_archs[seq.architecture].append(seq_id)

        if details:
            # Sorted so that the report does not depend on set ordering.
            if primary_sources:
                primary_source = ", ".join(sorted(primary_sources))
            else:
                primary_source = None

            known = [assignment for assignment in assignments
                     if assignment.source != "Novel"]
            seq2 = SequenceWithAssignments(seq.name, seq.length)
            seq2.assignments = known
            sources = sorted({assignment.source for assignment in known})

            print(" Primary assignment source: {}".format(primary_source),
                  file=details)
            print(" Number of data sources used: {}".format(len(sources)),
                  file=details)
            print(" Data sources: %s" % ", ".join(sources), file=details)
            # seq still holds the unresolved assignments at this point.
            print(" Coverage: %.3f" % seq.coverage(), file=details)
            print(" Coverage w/o novel domains: %.3f" % seq2.coverage(),
                  file=details)

            for assignment in assignments:
                attrs = assignment._asdict()
                if assignment.comment is None and \
                        assignment.domain.startswith(prefix):
                    attrs["comment"] = "novel"
                row = " %(start)4d-%(end)4d: %(domain)s "\
                      "(%(source)s, stage: %(comment)s)" % attrs
                # Names are aligned under the text following the colon.
                indent = " " * (row.index(":") + 1)

                # The row is left open so that the InterPro annotation (or
                # the bare line terminator below) continues the same line.
                print(row, end=" ", file=details)

                interpro_id = assignment.interpro_id
                if not interpro_id \
                        and assignment.domain in self.interpro.mapping:
                    interpro_id = self.interpro.mapping[assignment.domain]

                if interpro_id:
                    anc = self.interpro.tree.get_most_remote_ancestor(
                        interpro_id)
                    if interpro_id == anc:
                        print("(InterPro ID: %s)" % anc, file=details)
                    else:
                        print("(InterPro ID: %s --> %s)" % (interpro_id, anc),
                              file=details)
                    if anc in self.interpro_names:
                        print("{}{}".format(indent, self.interpro_names[anc]),
                              file=details)
                else:
                    # No InterPro ID: just terminate the row.
                    print("", file=details)
                    if assignment.domain in self.interpro_names:
                        print("{}{}".format(
                            indent, self.interpro_names[assignment.domain]),
                            file=details)
            # Blank line between the reports of consecutive sequences.
            print("", file=details)

        seq.assignments = new_assignments
def sort_by_domain_architecture(self):
    """Group the sequences of ``self.seqcat`` by domain architecture.

    The architecture of a sequence is the tuple of domain names of its
    assignments, ordered by start position and passed through
    ``resolve_interpro_ids``.  ``self.domain_archs`` maps each such tuple to
    the list of sequence IDs that have it, and ``seq.assignments`` is
    replaced by the resolved assignments.

    When ``self.details_file`` is set, a per-sequence report (sources,
    coverage and one row per assignment) is written to it.

    Written with the ``print()`` function and ``dict.items()`` so that it
    runs on Python 3; the text layout of the former ``print >>file``
    statements (including the trailing-comma row) is kept.
    """
    self.domain_archs = defaultdict(list)
    details = self.details_file

    for seq_id, seq in self.seqcat.items():
        assignments = sorted(seq.assignments,
                             key=operator.attrgetter("start"))
        if details:
            print(seq_id, file=details)

        # Sources of the assignments whose comment marks them as stage "1".
        primary_sources = set()
        domains = []
        new_assignments = []
        for assignment in assignments:
            new_assignment = assignment.resolve_interpro_ids(self.interpro)
            if assignment.comment == "1":
                primary_sources.add(assignment.source)
            domains.append(new_assignment.domain)
            new_assignments.append(new_assignment)
        self.domain_archs[tuple(domains)].append(seq_id)

        if details:
            # Sorted so that the report does not depend on set ordering.
            if primary_sources:
                primary_source = ", ".join(sorted(primary_sources))
            else:
                primary_source = None

            known = [assignment for assignment in assignments
                     if assignment.source != "Novel"]
            seq2 = SequenceWithAssignments(seq.name, seq.length)
            seq2.assignments = known
            sources = sorted({assignment.source for assignment in known})

            print(" Primary assignment source:", primary_source,
                  file=details)
            print(" Number of data sources used:", len(sources),
                  file=details)
            print(" Data sources: %s" % ", ".join(sources), file=details)
            # seq still holds the unresolved assignments at this point.
            print(" Coverage: %.3f" % seq.coverage(), file=details)
            print(" Coverage w/o novel domains: %.3f" % seq2.coverage(),
                  file=details)

            for assignment in assignments:
                attrs = assignment._asdict()
                if assignment.comment is None and \
                        assignment.domain.startswith("NOVEL"):
                    attrs["comment"] = "novel"
                row = " %(start)4d-%(end)4d: %(domain)s "\
                      "(%(source)s, stage: %(comment)s)" % attrs
                # Names are aligned under the text following the colon.
                indent = " " * (row.index(":") + 1)

                # Equivalent of the old ``print >>f, row,``: the row is left
                # open and followed by a single space.
                print(row, end=" ", file=details)

                interpro_id = assignment.interpro_id
                if not interpro_id \
                        and assignment.domain in self.interpro.mapping:
                    interpro_id = self.interpro.mapping[assignment.domain]

                if interpro_id:
                    anc = self.interpro.tree.get_most_remote_ancestor(
                        interpro_id)
                    if interpro_id == anc:
                        print("(InterPro ID: %s)" % anc, file=details)
                    else:
                        print("(InterPro ID: %s --> %s)" % (interpro_id, anc),
                              file=details)
                    if anc in self.interpro_names:
                        print(indent, self.interpro_names[anc], file=details)
                else:
                    # No InterPro ID: just terminate the row.
                    print("", file=details)
                    if assignment.domain in self.interpro_names:
                        print(indent, self.interpro_names[assignment.domain],
                              file=details)
            # Blank line between the reports of consecutive sequences.
            print("", file=details)

        seq.assignments = new_assignments
def filter_assignments(self, name, assignments_by_source):
    """Given a sequence name and its assignments ordered in a dict by
    their sources, selects a representative assignment set based on the
    rules outlined in the documentation of `FindUnassignedApp`.

    `assignments_by_source` maps a source name to a list of
    ``(assignment, line)`` pairs, where `line` is the input row the
    assignment came from.

    Returns a list of rows; each row is the stripped input line, padded
    with tabs so that it contains at least 13 of them, followed by a tab
    and the number of the stage in which the assignment was selected.
    An empty list is returned (and the exclusion is logged) when there
    are no assignments or when the assignments disagree on the sequence
    length.
    """

    def format_row(line, stage):
        # Pad short rows so that the stage number is always appended after
        # at least 13 tabs.
        line = line.strip()
        tab_count = line.count("\t")
        if tab_count < 13:
            line = line + "\t" * (13 - tab_count)
        return "%s\t%s" % (line, stage)

    if not assignments_by_source:
        self.log_exclusion(
            name, "no assignments in the input data file " +
            "passed the filters")
        return []

    # Determine the length of the sequence (and check that the length is
    # the same across all assignments; if not, then the input file is
    # inconsistent and the sequence will be skipped).
    # dict views cannot be indexed, so take the first key via an iterator.
    source = next(iter(assignments_by_source))
    seq_length = assignments_by_source[source][0][0].length
    for assignments in assignments_by_source.values():
        if any(assignment.length != seq_length
               for assignment, _ in assignments):
            self.log.warning("Sequence %s has multiple assignments with "
                             "different sequence lengths in the "
                             "input file, skipping" % name)
            self.log_exclusion(
                name, "ambiguous sequence " + "length in input file")
            return []

    # Initially, the result is empty
    result = []

    # Set up the stages. A copy is taken so that popping the first stage
    # below cannot alter a list the configuration helper might reuse.
    # Example of a stage list:
    #   [complementerset(["HMMPanther", "Gene3D"]),
    #    complementerset(["HMMPanther", "Gene3D"]),
    #    complementerset()]
    stages = list(self.get_stages_from_config())

    # The first stage is treated specially as we have to select a single
    # source that has the largest coverage. In the remaining stages, we
    # are allowed to cherrypick from different sources.

    # First, find the data source which covers the most of the sequence
    # and is allowed in stage 1
    first_stage = stages.pop(0)
    coverage = {}
    for source, assignments in assignments_by_source.items():
        # Exclude those sources that we don't consider in the first stage
        if source not in first_stage:
            continue

        # Calculate the coverage: we add all the residues covered by
        # each sequence, not taking overlaps into consideration (for the
        # moment)
        seq = SequenceWithAssignments(name, seq_length)
        for a, _ in assignments:
            seq.assign(a, False, interpro=self.interpro)
        coverage[source] = seq.coverage()

    # Find the source giving the best coverage, add its domains into
    # the current assignment.
    seq = SequenceWithAssignments(name, seq_length)
    if coverage:
        best_source = max(coverage, key=coverage.__getitem__)
        sorted_assignments = sorted(
            assignments_by_source[best_source],
            key=lambda x: x[0].get_assigned_length(),
            reverse=True)
        for a, line in sorted_assignments:
            if seq.assign(a, True, interpro=self.interpro):
                result.append(format_row(line, 1))
    else:
        best_source = None

    # Collect the unused assignments (not from the best source)
    # into unused_assignments
    unused_assignments = []
    for source, assignments in assignments_by_source.items():
        if source == best_source:
            continue
        unused_assignments.extend(assignments)

    if not unused_assignments:
        return result

    # Try to fill the unassigned regions with the rest of the assignments
    # that were unused so far, starting from the longest assignment.
    unused_assignments.sort(key=lambda x: -x[0].get_assigned_length())

    # Okay, we're done with the first stage, process the rest.
    # idx_to_stage will contain the indices of the selected
    # assignments as keys and the number of the corresponding
    # stage in which they were selected as values.
    idx_to_stage = {}
    for stage_no, sources in enumerate(stages):
        for idx, (a, _) in enumerate(unused_assignments):
            if a.source in sources and seq.assign(
                    a, True, interpro=self.interpro):
                # Stage 1 was handled above, so numbering continues at 2.
                idx_to_stage[idx] = stage_no + 2

    for idx in sorted(idx_to_stage):
        result.append(
            format_row(unused_assignments[idx][1], idx_to_stage[idx]))

    if not result:
        self.log_exclusion(
            name, "no assignments were selected after "
            "executing all the stages")

    return result
def filter_assignments(self, name, assignments_by_source):
    """Given a sequence name and its assignments ordered in a dict by
    their sources, selects a representative assignment set based on the
    rules outlined in the documentation of `FindUnassignedApp`.

    `assignments_by_source` maps a source name to a list of
    ``(assignment, line)`` pairs, where `line` is the input row the
    assignment came from.

    Returns a list of rows; each row is the stripped input line, padded
    with tabs so that it contains at least 13 of them, followed by a tab
    and the number of the stage in which the assignment was selected.
    An empty list is returned (and the exclusion is logged) when there
    are no assignments or when the assignments disagree on the sequence
    length.
    """
    if not assignments_by_source:
        self.log_exclusion(name, "no assignments in the input data file " +
                           "passed the filters")
        return []

    # Determine the length of the sequence (and check that the length is
    # the same across all assignments; if not, then the input file is
    # inconsistent and the sequence will be skipped).
    # The first key is fetched through an iterator because indexing
    # ``dict.keys()`` only works on Python 2.
    source = next(iter(assignments_by_source))
    seq_length = assignments_by_source[source][0][0].length
    for source, assignments in assignments_by_source.items():
        if any(assignment.length != seq_length
               for assignment, _ in assignments):
            self.log.warning("Sequence %s has multiple assignments with "
                             "different sequence lengths in the "
                             "input file, skipping" % name)
            self.log_exclusion(name,
                               "ambiguous sequence length in input file")
            return []

    # Initially, the result is empty
    result = []

    # Set up the stages. A copy is taken so that popping the first stage
    # below cannot alter a list the configuration helper might reuse.
    # Example of a stage list:
    #   [complementerset(["HMMPanther", "Gene3D"]),
    #    complementerset(["HMMPanther", "Gene3D"]),
    #    complementerset()]
    stages = list(self.get_stages_from_config())

    # The first stage is treated specially as we have to select a single
    # source that has the largest coverage. In the remaining stages, we
    # are allowed to cherrypick from different sources.

    # First, find the data source which covers the most of the sequence
    # and is allowed in stage 1
    first_stage = stages.pop(0)
    coverage = {}
    for source, assignments in assignments_by_source.items():
        # Exclude those sources that we don't consider in the first stage
        if source not in first_stage:
            continue

        # Calculate the coverage: we add all the residues covered by
        # each sequence, not taking overlaps into consideration (for the
        # moment)
        seq = SequenceWithAssignments(name, seq_length)
        for a, _ in assignments:
            seq.assign(a, False, interpro=self.interpro)
        coverage[source] = seq.coverage()

    # Find the source giving the best coverage, add its domains into
    # the current assignment.
    seq = SequenceWithAssignments(name, seq_length)
    if coverage:
        best_source = max(coverage, key=coverage.__getitem__)
        sorted_assignments = sorted(
            assignments_by_source[best_source],
            key=lambda x: x[0].get_assigned_length(),
            reverse=True)
        for a, line in sorted_assignments:
            line = line.strip()
            if seq.assign(a, True, interpro=self.interpro):
                # Pad short rows so that the stage number is always
                # appended after at least 13 tabs.
                tab_count = line.count("\t")
                if tab_count < 13:
                    line = line + "\t" * (13 - tab_count)
                result.append("%s\t%s" % (line, 1))
    else:
        best_source = None

    # Collect the unused assignments (not from the best source)
    # into unused_assignments
    unused_assignments = []
    for source, assignments in assignments_by_source.items():
        if source == best_source:
            continue
        unused_assignments.extend(assignments)

    if not unused_assignments:
        return result

    # Try to fill the unassigned regions with the rest of the assignments
    # that were unused so far, starting from the longest assignment.
    unused_assignments.sort(key=lambda x: -x[0].get_assigned_length())

    # Okay, we're done with the first stage, process the rest.
    # idx_to_stage will contain the indices of the selected
    # assignments as keys and the number of the corresponding
    # stage in which they were selected as values.
    idx_to_stage = {}
    for stage_no, sources in enumerate(stages):
        for idx, (a, _) in enumerate(unused_assignments):
            if a.source in sources and \
                    seq.assign(a, True, interpro=self.interpro):
                # Stage 1 was handled above, so numbering continues at 2.
                idx_to_stage[idx] = stage_no + 2

    for idx in sorted(idx_to_stage):
        row = unused_assignments[idx][1].strip()
        tab_count = row.count("\t")
        if tab_count < 13:
            row = row + "\t" * (13 - tab_count)
        result.append("%s\t%s" % (row, idx_to_stage[idx]))

    if not result:
        self.log_exclusion(name, "no assignments were selected after "
                           "executing all the stages")

    return result
def sort_by_domain_architecture(self):
    """Group the sequences of ``self.seqcat`` by domain architecture.

    For every sequence the assignments are ordered by start position and
    passed through ``resolve_interpro_ids``; a ``TreeRepresentation`` of the
    resolved assignments yields ``seq.architecture`` and
    ``seq.architecture_pos``.  ``self.domain_archs`` maps each architecture
    string to the list of sequence IDs that have it, and ``seq.assignments``
    is replaced by the resolved assignments.

    When ``self.details_file`` is set, a per-sequence report (sources,
    coverage and one row per assignment) is written to it.  Assignments
    without a comment whose domain is listed in ``self.hmm_domains`` are
    reported with stage ``novel``.
    """
    self.domain_archs = defaultdict(list)
    details = self.details_file

    for seq_id, seq in self.seqcat.items():
        assignments = sorted(seq.assignments,
                             key=operator.attrgetter("start"))
        if details:
            print(seq_id, file=details)

        # Sources of the assignments whose comment marks them as stage "1".
        primary_sources = set()
        new_assignments = []
        for assignment in assignments:
            new_assignments.append(
                assignment.resolve_interpro_ids(self.interpro))
            if assignment.comment == "1":
                primary_sources.add(assignment.source)

        tree_arch = TreeRepresentation(new_assignments, self.interpro)
        seq.architecture = tree_arch.get_string()
        seq.architecture_pos = tree_arch.get_string_positions()
        self.domain_archs[seq.architecture].append(seq_id)

        if details:
            # Sorted so that the report does not depend on set ordering.
            if primary_sources:
                primary_source = ", ".join(sorted(primary_sources))
            else:
                primary_source = None

            known = [assignment for assignment in assignments
                     if assignment.source != "Novel"]
            seq2 = SequenceWithAssignments(seq.name, seq.length)
            seq2.assignments = known
            sources = sorted({assignment.source for assignment in known})

            print(" Primary assignment source: {}".format(primary_source),
                  file=details)
            print(" Number of data sources used: {}".format(len(sources)),
                  file=details)
            print(" Data sources: %s" % ", ".join(sources), file=details)
            # seq still holds the unresolved assignments at this point.
            print(" Coverage: %.3f" % seq.coverage(), file=details)
            print(" Coverage w/o novel domains: %.3f" % seq2.coverage(),
                  file=details)

            for assignment in assignments:
                attrs = assignment._asdict()
                if assignment.comment is None and \
                        assignment.domain in self.hmm_domains:
                    attrs["comment"] = "novel"
                row = " %(start)4d-%(end)4d: %(domain)s "\
                      "(%(source)s, stage: %(comment)s)" % attrs
                # Names are aligned under the text following the colon.
                indent = " " * (row.index(":") + 1)

                # The row is left open so that the InterPro annotation (or
                # the bare line terminator below) continues the same line.
                print(row, end=" ", file=details)

                interpro_id = assignment.interpro_id
                if not interpro_id \
                        and assignment.domain in self.interpro.mapping:
                    interpro_id = self.interpro.mapping[assignment.domain]

                if interpro_id:
                    anc = self.interpro.tree.get_most_remote_ancestor(
                        interpro_id)
                    if interpro_id == anc:
                        print("(InterPro ID: %s)" % anc, file=details)
                    else:
                        print("(InterPro ID: %s --> %s)" % (interpro_id, anc),
                              file=details)
                    if anc in self.interpro_names:
                        print("{}{}".format(indent, self.interpro_names[anc]),
                              file=details)
                else:
                    # No InterPro ID: just terminate the row.
                    print("", file=details)
                    if assignment.domain in self.interpro_names:
                        print("{}{}".format(
                            indent, self.interpro_names[assignment.domain]),
                            file=details)
            # Blank line between the reports of consecutive sequences.
            print("", file=details)

        seq.assignments = new_assignments