Ejemplo n.º 1
0
    def define_adapter_presence_substitutions_only(self, adapter, max_substitutions):
        """Flag the read as containing the adapter, tolerating substitutions only.

        Sets ``self.adapter_present`` to True when ``adapter`` occurs in
        ``self.sequence_line`` either verbatim or as a window of the same length
        that ``ApproxMatch.approx_substitute`` accepts for ``max_substitutions``.
        When no match is found the attribute is left untouched.

        Args:
            adapter: Adapter sequence to look for in the read.
            max_substitutions: Threshold handed to both the fuzzysearch
                pre-filter and ``ApproxMatch.approx_substitute``.

        Returns:
            None
        """
        read = self.sequence_line

        # Cheapest test first: a verbatim occurrence needs no fuzzy matching.
        if adapter in read:
            self.adapter_present = True
            return

        # Pre-filter with fuzzysearch (intended as a Levenshtein-distance check):
        # any match reachable with at most N substitutions is also within N
        # edits, so a read rejected here cannot contain a substituted match and
        # the slower window scan below can be skipped.
        # NOTE(review): the threshold is passed positionally; confirm that the
        # installed fuzzysearch version interprets it as a Levenshtein distance.
        if not fuzzysearch.find_near_matches(adapter, read, max_substitutions):
            return

        # Confirm the hit is substitution-only by comparing the adapter against
        # every window of the read with the adapter's length; the last window
        # starts one adapter length before the end of the read.
        width = len(adapter)
        windows = (read[start:start + width] for start in range(len(read) - width + 1))
        if any(ApproxMatch.approx_substitute(adapter, window, max_substitutions)
               for window in windows):
            self.adapter_present = True
Ejemplo n.º 2
0
    def assign_to_unique_sample(self, sequenced_barcode):
        """Map a sequenced barcode to its sample when the match is unambiguous.

        Every key of ``self.expected`` is compared to ``sequenced_barcode`` with
        ``ApproxMatch.approx_substitute`` using a threshold of 1.

        Args:
            sequenced_barcode: Barcode read from the sequencer.

        Returns:
            The value stored in ``self.expected`` for the single expected
            barcode that matched, or False when no expected barcode matched or
            more than one did.
        """
        # Collect every expected barcode that approximately matches the read one.
        candidates = [
            barcode
            for barcode in self.expected
            if ApproxMatch.approx_substitute(barcode, sequenced_barcode, 1)
        ]
        # Zero hits (unknown barcode) and several hits (ambiguous barcode) are
        # both reported as False.
        if len(candidates) != 1:
            return False
        return self.expected[candidates[0]]