Esempio n. 1
0
    def process_sequence(self, name, assignments):
        """Processes the given sequence `name` with the given `assignments`."""
        if not assignments:
            return

        seq = SequenceWithAssignments(name, assignments[0].length)
        for assignment in assignments:
            seq.assign(assignment, overlap_check=False)
        self.output_formatter.process_assignments(seq)
Esempio n. 2
0
    def process_sequence(self, name, assignments):
        """Processes the given sequence `name` with the given `assignments`."""
        if not assignments:
            return

        seq = SequenceWithAssignments(name, assignments[0].length)
        for assignment in assignments:
            seq.assign(assignment, overlap_check=False)
        self.output_formatter.process_assignments(seq)
Esempio n. 3
0
    def process_infile(self, fname):
        self.log.info("Processing input file: %s" % fname)

        for assignment in AssignmentReader(fname):
            try:
                seq = self.seqcat[assignment.id]
            except KeyError:
                seq = SequenceWithAssignments(assignment.id, assignment.length)
                self.seqcat[assignment.id] = seq
            if seq.length != assignment.length:
                raise ValueError, "different lengths encountered for %s: %d and %d" % (seq.name, seq.length, assignment.length)
            seq.assign(assignment)
Esempio n. 4
0
 def process_infile(self, fname, interpro=None):
     self.log.info("Processing input file: %s" % fname)
     import sys
     for assignment in AssignmentReader(fname):
         try:
             seq = self.seqcat[assignment.id]
         except KeyError:
             seq = SequenceWithAssignments(assignment.id, assignment.length)
             self.seqcat[assignment.id] = seq
         if seq.length != assignment.length:
             raise ValueError, "different lengths encountered for %s: %d and %d" % (seq.name, seq.length, assignment.length)
         if interpro is not None:
             assignment = assignment.resolve_interpro_ids(interpro)
         seq.assign(assignment, interpro)
Esempio n. 5
0
    def filter_assignments(self, name, assignments_by_source):
        """Given a sequence name and its assignments ordered in a dict by
        their sources, selects a representative assignment set based on the
        rules outlined in the documentation of `FindUnassignedApp`.
        """

        if not assignments_by_source:
            self.log_exclusion(name, "no assignments in the input data file " +
                                     "passed the filters")
            return []

        # Determine the length of the sequence (and check that the length is
        # the same across all assignments; if not, then the input file is
        # inconsistent and the sequence will be skipped).
        source = assignments_by_source.keys()[0]
        seq_length = assignments_by_source[source][0][0].length
        for source, assignments in assignments_by_source.iteritems():
            if any(assignment.length != seq_length \
                   for assignment, _ in assignments):
                self.log.warning("Sequence %s has multiple assignments with "
                                 "different sequence lengths in the "
                                 "input file, skipping" % name)
                self.log_exclusion(name, "ambiguous sequence length in input file")
                return []

        # Initially, the result is empty
        result = []

        # Set up the stages
        stages = self.get_stages_from_config()
        """
        stages = [complementerset(["HMMPanther", "Gene3D"]),
                  complementerset(["HMMPanther", "Gene3D"]),
                  complementerset()]
        """

        # The first stage is treated specially as we have to select a single
        # source thas has the largest coverage. In the remaining stages, we
        # are allowed to cherrypick from different sources.

        # First, find the data source which covers the most of the sequence
        # and is allowed in stage 1
        first_stage = stages.pop(0)
        coverage = {}
        for source, assignments in assignments_by_source.iteritems():
            # Exclude those sources that we don't consider in the first stage
            if source not in first_stage:
                continue

            # Calculate the coverage: we add all the residues covered by 
            # each sequence, not taking overlaps into consideration (by the
            # moment)
            seq = SequenceWithAssignments(name, seq_length)
            for a, _ in assignments:
                seq.assign(a, False, interpro=self.interpro)
            coverage[source] = seq.coverage()

        # Find the source giving the best coverage, add its domains into
        # the current assignment.
        seq = SequenceWithAssignments(name, seq_length)
        if coverage:
            best_source = max(coverage.keys(), key = coverage.__getitem__)
            sorted_assignments = sorted(assignments_by_source[best_source], key=lambda
                    x: x[0].get_assigned_length(), reverse=True)
            for a, line in sorted_assignments:
                line = line.strip()
                if seq.assign(a, True, interpro=self.interpro):
                    tab_count = list(line).count("\t")
                    if tab_count < 13:
                        line = line + "\t" * (13-tab_count)
                    result.append("%s\t%s" % (line, 1))
        else:
            best_source = None

        # Collect the unused assignments (not from the best source)
        # into unused_assignments
        unused_assignments = []
        for source, assignments in assignments_by_source.iteritems():
            if source == best_source:
                continue
            unused_assignments.extend(assignments)

        if not unused_assignments:
            return result

        # Try to fill the unassigned regions with the rest of the assignments
        # that were unused so far, starting from the longest assignment.
        unused_assignments.sort(key = lambda x: -x[0].get_assigned_length())

        # Okay, we're done with the first stage, process the rest.
        # idx_to_stage will contain the indices of the selected
        # assignments as keys and the number of the corresponding
        # stage in which they were selected as values.
        idx_to_stage = {}
        for stage_no, sources in enumerate(stages):
            for idx, (a, _) in enumerate(unused_assignments):
                if a.source in sources and seq.assign(a, True, interpro=self.interpro):
                    idx_to_stage[idx] = stage_no+2
        for idx in sorted(idx_to_stage.keys()):
            row = unused_assignments[idx][1].strip()
            tab_count = list(row).count("\t")
            if tab_count < 13:
                row = row + "\t" * (13-tab_count)
            result.append("%s\t%s" % (row, idx_to_stage[idx]))

        if not result:
            self.log_exclusion(name, "no assignments were selected after "
                                     "executing all the stages")

        return result
Esempio n. 6
0
    def filter_assignments(self, name, assignments_by_source):
        """Given a sequence name and its assignments ordered in a dict by
        their sources, selects a representative assignment set based on the
        rules outlined in the documentation of `FindUnassignedApp`.
        """

        if not assignments_by_source:
            self.log_exclusion(
                name, "no assignments in the input data file " +
                "passed the filters")
            return []

        # Determine the length of the sequence (and check that the length is
        # the same across all assignments; if not, then the input file is
        # inconsistent and the sequence will be skipped).
        source = assignments_by_source.keys()[0]
        seq_length = assignments_by_source[source][0][0].length
        for _source, assignments in assignments_by_source.items():
            if any(assignment.length != seq_length
                   for assignment, _ in assignments):
                self.log.warning("Sequence %s has multiple assignments with "
                                 "different sequence lengths in the "
                                 "input file, skipping" % name)
                self.log_exclusion(
                    name, "ambiguous sequence " + "length in input file")
                return []

        # Initially, the result is empty
        result = []

        # Set up the stages
        stages = self.get_stages_from_config()
        """
        stages = [complementerset(["HMMPanther", "Gene3D"]),
                  complementerset(["HMMPanther", "Gene3D"]),
                  complementerset()]
        """

        # The first stage is treated specially as we have to select a single
        # source thas has the largest coverage. In the remaining stages, we
        # are allowed to cherrypick from different sources.

        # First, find the data source which covers the most of the sequence
        # and is allowed in stage 1
        first_stage = stages.pop(0)
        coverage = {}
        for source, assignments in assignments_by_source.items():
            # Exclude those sources that we don't consider in the first stage
            if source not in first_stage:
                continue

            # Calculate the coverage: we add all the residues covered by
            # each sequence, not taking overlaps into consideration (by the
            # moment)
            seq = SequenceWithAssignments(name, seq_length)
            for a, _ in assignments:
                seq.assign(a, False, interpro=self.interpro)
            coverage[source] = seq.coverage()

        # Find the source giving the best coverage, add its domains into
        # the current assignment.
        seq = SequenceWithAssignments(name, seq_length)
        if coverage:
            best_source = max(coverage.keys(), key=coverage.__getitem__)
            sorted_assignments = sorted(
                assignments_by_source[best_source],
                key=lambda x: x[0].get_assigned_length(),
                reverse=True)
            for a, line in sorted_assignments:
                line = line.strip()
                if seq.assign(a, True, interpro=self.interpro):
                    tab_count = list(line).count("\t")
                    if tab_count < 13:
                        line = line + "\t" * (13 - tab_count)
                    result.append("%s\t%s" % (line, 1))
        else:
            best_source = None

        # Collect the unused assignments (not from the best source)
        # into unused_assignments
        unused_assignments = []
        for source, assignments in assignments_by_source.items():
            if source == best_source:
                continue
            unused_assignments.extend(assignments)

        if not unused_assignments:
            return result

        # Try to fill the unassigned regions with the rest of the assignments
        # that were unused so far, starting from the longest assignment.
        unused_assignments.sort(key=lambda x: -x[0].get_assigned_length())

        # Okay, we're done with the first stage, process the rest.
        # idx_to_stage will contain the indices of the selected
        # assignments as keys and the number of the corresponding
        # stage in which they were selected as values.
        idx_to_stage = {}
        for stage_no, sources in enumerate(stages):
            for idx, (a, _) in enumerate(unused_assignments):
                if a.source in sources and seq.assign(
                        a, True, interpro=self.interpro):
                    idx_to_stage[idx] = stage_no + 2
        for idx in sorted(idx_to_stage.keys()):
            row = unused_assignments[idx][1].strip()
            tab_count = list(row).count("\t")
            if tab_count < 13:
                row = row + "\t" * (13 - tab_count)
            result.append("%s\t%s" % (row, idx_to_stage[idx]))

        if not result:
            self.log_exclusion(
                name, "no assignments were selected after "
                "executing all the stages")

        return result