Esempio n. 1
0
    def find_similar(self, gfe, features, imgtdb_version):
        """
        Build a typing for ``gfe`` from GFEs that share its key exon(s).

        Class I GFEs are matched on the ``EXON-2`` and ``EXON-3`` entries
        returned by ``breakup_gfe``; class II GFEs are matched on
        ``EXON-2`` alone. When ``self.gfe2hla`` is populated, candidates
        are taken from that per-locus lookup (restricted to rows whose
        ``DB`` equals ``imgtdb_version``); otherwise they are fetched by
        running a Cypher query on ``self.graph``. KIR GFEs are delegated
        to ``find_gfe_kir``.

        :param gfe: GFE notation string. It must split into exactly two
            parts on ``"w"``; the first part is used as the locus.
        :param features: passed through unchanged to ``create_typing``
            (or ``find_gfe_kir``).
        :param imgtdb_version: database version used to filter candidates.

        :return: whatever ``create_typing`` / ``find_gfe_kir`` returns, or
            ``None`` when ``gfe`` is neither class I, class II nor KIR.
        :raises ValueError: if ``gfe`` does not contain exactly one ``"w"``.
        """
        if is_classI(gfe):
            gfe_dict = self.breakup_gfe(gfe)
            # Two-target unpacking is kept on purpose: it rejects
            # malformed GFE strings. Only the locus part is needed.
            locus, _ = gfe.split("w")
            if self.gfe2hla:
                exon23 = "-".join([gfe_dict["EXON-2"], gfe_dict["EXON-3"]])
                locus_df = self.gfe2hla[locus]
                same_exons = locus_df['EXON23'] == exon23
                same_db = locus_df['DB'] == imgtdb_version
                similar_data = locus_df[same_exons & same_db][
                    ['GFE', 'HLA']].reset_index()
                return self.create_typing(similar_data, gfe, features)

            # No in-memory lookup available: query the graph instead.
            cypher = similar_gfe_classI(gfe, gfe_dict["EXON-2"],
                                        gfe_dict["EXON-3"],
                                        imgtdb_version)
            similar_data = self.graph.run(cypher).to_data_frame()
            return self.create_typing(similar_data, gfe, features)

        if is_classII(gfe):
            gfe_dict = self.breakup_gfe(gfe)
            locus, _ = gfe.split("w")
            if self.gfe2hla:
                exon2 = gfe_dict["EXON-2"]
                locus_df = self.gfe2hla[locus]
                same_exons = locus_df['EXON2'] == exon2
                same_db = locus_df['DB'] == imgtdb_version
                similar_data = locus_df[same_exons & same_db][
                    ['GFE', 'HLA']].reset_index()
                return self.create_typing(similar_data, gfe, features)

            cypher = similar_gfe_classII(gfe, gfe_dict["EXON-2"],
                                         imgtdb_version)
            similar_data = self.graph.run(cypher).to_data_frame()
            return self.create_typing(similar_data, gfe, features)

        if is_kir(gfe):
            return self.find_gfe_kir(gfe, features)

        return None
Esempio n. 2
0
    def __init__(self,
                 url="http://feature.nmdp-bioinformatics.org",
                 loci=(
                     'HLA-A', 'HLA-B', 'HLA-C', 'HLA-DRB1', 'HLA-DQB1',
                     'HLA-DRB4', 'HLA-DRB5', 'HLA-DPB1', 'HLA-DPA1',
                     'HLA-DQA1', 'HLA-DRB3'
                 ),
                 graph: Graph = None,
                 seqann: Any = None,
                 features: Dict = None,
                 verbose: bool = False,
                 kir: bool = False,
                 pid: str = "NA",
                 gfe2hla: Dict = None,
                 gfe_feats: DataFrame = None,
                 seq2hla: DataFrame = None,
                 load_gfe2hla: bool = False,
                 load_seq2hla: bool = False,
                 load_gfe2feat: bool = False,
                 verbosity=1):
        '''
        Constructor

        :param url: accepted for interface compatibility; not used by
            this constructor.
        :param loci: accepted for interface compatibility; not used by
            this constructor.
        :param graph: graph handle; its ``run(...).to_data_frame()`` is
            called when any ``load_*`` flag is set.
        :param seqann: a ``BioSeqAnn``, a list of them, or a dict. A single
            object or a list is converted to a dict keyed by
            ``refdata.dbversion``; a dict is stored as given. ``None``
            (the default) yields a fresh empty dict per instance.
        :param features: stored as ``self.features``.
        :param verbose: stored as ``self.verbose``.
        :param kir: stored as ``self.kir``.
        :param pid: identifier used to build the ``self.logname`` prefix;
            a falsy value gives an empty prefix.
        :param gfe2hla: pre-built lookup stored as ``self.gfe2hla``;
            replaced when ``load_gfe2hla`` is set.
        :param gfe_feats: stored as ``self.gfe_feats``; replaced when
            ``load_gfe2feat`` is set.
        :param seq2hla: stored as ``self.seq2hla``; replaced when
            ``load_seq2hla`` is set.
        :param load_gfe2hla: load the GFE-to-HLA lookup from ``graph``.
        :param load_seq2hla: load the sequence-to-HLA table from ``graph``.
        :param load_gfe2feat: load the GFE-to-feature table from ``graph``.
        :param verbosity: accepted for interface compatibility; not used
            by this constructor.
        :raises SeqAnnException: if ``seqann`` is a truthy value that is
            neither a dict, a ``BioSeqAnn`` nor a list.
        '''
        # TODO: Add catch if seqann or graph aren't defined
        self.kir = kir
        self.graph = graph
        self.logger = logging.getLogger("Logger." + __name__)
        if pid:
            self.logname = "ID {:<10} - ".format(str(pid))
        else:
            self.logname = ''

        # A literal ``{}`` default would be one dict shared by every
        # instance; build a fresh one per call instead.
        if seqann is None:
            seqann = {}

        if not isinstance(seqann, Dict) and seqann:
            if isinstance(seqann, BioSeqAnn):
                self.seqann = {seqann.refdata.dbversion: seqann}
            elif isinstance(seqann, List):
                self.seqann = {ann.refdata.dbversion: ann for ann in seqann}
            else:
                raise SeqAnnException(inputtype=type(seqann),
                                      reason="Can't initalize seqann")
        else:
            self.seqann = seqann

        self.features = features
        self.gfe2hla = gfe2hla
        self.seq2hla = seq2hla
        self.gfe_feats = gfe_feats
        self.verbose = verbose
        self.structures = get_structures()

        # ISSUE: gfe_feats & seq2hla need to be loaded together
        #
        if load_gfe2feat:
            self.gfe_feats = self.graph.run(all_gfe2feats()).to_data_frame()
            # Strip the dots from the DB version (e.g. "3.31.0" -> "3310").
            self.gfe_feats['DB'] = self.gfe_feats['DB'].apply(
                lambda db: "".join(db.split(".")))

        if load_seq2hla:
            self.seq2hla = self.graph.run(all_seq2hla()).to_data_frame()
            self.seq2hla['DB'] = self.seq2hla['DB'].apply(
                lambda db: "".join(db.split(".")))

        if load_gfe2hla:
            tmp_gfe = {}
            gfehla_df = self.graph.run(all_gfe2hla()).to_data_frame()
            for loc in gfehla_df['LOC'].unique().tolist():
                # Loci whose name ends in a non-digit (HLA-A, HLA-B, ...)
                # get a combined exon 2 + exon 3 key.
                if re.search(r"HLA-\D$", loc):
                    # Copy so the new column is added to an independent
                    # frame rather than to a slice of gfehla_df.
                    loc_df = gfehla_df.loc[gfehla_df['LOC'] == loc].copy()
                    loc1 = self.structures[loc]['exon-2']
                    loc2 = self.structures[loc]['exon-3']
                    loc_df['EXON23'] = loc_df['GFE'].apply(
                        lambda gfe: "-".join(
                            [gfe.split("-")[loc1],
                             gfe.split("-")[loc2]]))
                    tmp_gfe[loc] = loc_df

                if is_classII(loc):
                    loc_df = gfehla_df.loc[gfehla_df['LOC'] == loc].copy()
                    loc1 = self.structures[loc]['exon-2']
                    loc_df['EXON2'] = loc_df['GFE'].apply(
                        lambda gfe: gfe.split("-")[loc1])
                    tmp_gfe[loc] = loc_df
            self.gfe2hla = tmp_gfe
Esempio n. 3
0
    def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None):
        """
        search_seqs - method for annotating a BioPython sequence without alignment

        Each feature sequence of the reference record is searched for in
        the input sequence with ``nt_search``. A feature with exactly one
        hit is mapped, a feature with several hits is recorded as
        ambiguous, and a feature with no hit is recorded as missing. The
        positions that remain unmapped are grouped into blocks and handed
        to ``self._resolve_unmapped``.

        :param seqrec: The reference sequence
        :type seqrec: SeqRecord
        :param in_seq: The input sequence (its ``.seq`` attribute is read)
        :type in_seq: SeqRecord
        :param locus: The gene locus associated with the sequence.
        :type locus: str
        :param run: The number of runs that have been done. ``run > 0``
            enables the exon_8 resolution and ``run == 99`` enables the
            HLA-DRB1 exon_3 special case.
        :type run: int
        :param partial_ann: A partial annotation from a previous step
        :type partial_ann: :ref:`ann`
        :rtype: :ref:`ann`

        Example usage (``refseq`` is a reference SeqRecord obtained
        elsewhere; the locus is a required argument):

            >>> from Bio.Seq import Seq
            >>> from Bio.SeqRecord import SeqRecord
            >>> from seqann.seq_search import SeqSearch
            >>> inseq = SeqRecord(Seq('AGAGACTCTCCCGAGGATTTCGTGTACCAGTTTAAGGCCATGTGCTACTTCACC'))
            >>> sqsrch = SeqSearch()
            >>> ann = sqsrch.search_seqs(refseq, inseq, 'HLA-A')

        """
        # NOTE(review): this method reads self.verbose, self.verbosity and
        # self.logger; they must have been set on the instance beforehand.

        # Extract out the sequences and feature names
        # from the reference sequences

        # The mapped features will be subtracted from seq_covered
        # so the final seq_covered number will reflect the remaining
        # number of base pairs that haven't been mapped.
        #
        # The coordinates and mapping will help determine what positions
        # in the sequence have been mapped and to what features. The
        # missing blocks variable will be generated using these.
        structures = get_structures()
        seq_covered = len(in_seq.seq)
        # coordinates: every position 0..len(seq) that is still unmapped.
        coordinates = dict(
            map(lambda x: [x, 1], [i for i in range(0,
                                                    len(in_seq.seq) + 1)]))

        # mapping: position -> 1 while unmapped, feature name once mapped.
        mapping = dict(
            map(lambda x: [x, 1], [i for i in range(0,
                                                    len(in_seq.seq) + 1)]))

        ambig_map = {}
        found_feats = {}
        feat_missing = {}

        method = "nt_search" if not partial_ann else partial_ann.method

        # If the partial annotation is provided
        # then make the found_feats equal to
        # what has already been annotated
        feats = get_features(seqrec)
        if partial_ann:

            # NOTE: this aliases partial_ann.features, so the updates to
            # found_feats below also modify the partial annotation.
            found_feats = partial_ann.features

            if self.verbose and self.verbosity > 4:
                self.logger.info("Found partial features:")
                for f in found_feats:
                    self.logger.info(f)

            # Skip references that only have features
            # that have already been annotated
            if len([f for f in feats if f in found_feats]) == len(feats):
                if self.verbose:
                    self.logger.info("Skipping incomplete refseq")
                return partial_ann

            if self.verbose and self.verbosity > 1:
                self.logger.info("Using partial annotation | " + locus + " " +
                                 str(len(partial_ann.features)))

            # Only the positions in the partial annotation's missing
            # blocks are still considered unmapped.
            coordinates = dict(
                map(lambda l: [l, 1], [
                    item for sublist in partial_ann.blocks for item in sublist
                ]))
            seq_covered = partial_ann.covered
            mapping = partial_ann.mapping

            if self.verbose and self.verbosity > 2:
                self.logger.info("Partial sequence coverage = " +
                                 str(seq_covered))
                self.logger.info("Partial sequence metho = " + method)

        # added_feat / deleted_coords remember what was mapped for exon_8
        # and three_prime_UTR so that it can be undone later on.
        added_feat = {}
        deleted_coords = {}
        # Features are processed in the order given by structures[locus].
        for feat_name in sorted(feats, key=lambda k: structures[locus][k]):

            # skip if partial annotation is provided
            # and the feat name is not one of the
            # missing features
            if partial_ann and feat_name not in partial_ann.refmissing:
                if self.verbose and self.verbosity > 1:
                    self.logger.info("Skipping " + feat_name +
                                     " - Already annotated")
                continue

            if self.verbose and self.verbosity > 1:
                self.logger.info("Running seqsearch for " + feat_name)

            # Search for the reference feature sequence in the
            # input sequence. Record the coordinates if it's
            # found and if it's found in multiple spots. If it
            # is not found, then record that feature as missing.
            # The result is used as [pattern, pos1, pos2, ...], so a length
            # of 2 means exactly one hit.
            seq_search = nt_search(str(in_seq.seq), str(feats[feat_name]))

            if len(seq_search) == 2:

                if self.verbose and self.verbosity > 0:
                    self.logger.info("Found exact match for " + feat_name)

                # NOTE(review): coverage is reduced here even when the
                # feature ends up not being mapped below (mapcheck / skip).
                seq_covered -= len(str(feats[feat_name]))
                end = int(len(str(feats[feat_name])) + seq_search[1])

                # A three_prime_UTR match is extended to the end of the
                # input sequence.
                if feat_name == 'three_prime_UTR' \
                        and len(str(in_seq.seq)) > end:
                    end = len(str(in_seq.seq))

                # If the feature is found and it's a five_prime_UTR then
                # the start should always be 0, so insertions at the
                # beginning of the sequence will be found.
                start = seq_search[1] if feat_name != 'five_prime_UTR' else 0
                # si is the first position claimed in coordinates/mapping:
                # one past the hit position, unless the hit is at 0.
                si = seq_search[1]+1 if seq_search[1] != 0 and \
                    feat_name != 'five_prime_UTR' else 0

                # check if this features has already been mapped
                # (mapcheck contains 1 if any position in the span is
                # no longer in coordinates)
                mapcheck = set(
                    [0 if i in coordinates else 1 for i in range(si, end + 1)])

                # Dont map features if they are out of order
                skip = False
                if found_feats and len(found_feats) > 0:
                    for f in found_feats:
                        o1 = structures[locus][feat_name]
                        o2 = structures[locus][f]
                        # NOTE(review): loctype is defined elsewhere; it
                        # presumably reports the relative order of the two
                        # locations - confirm against its definition.
                        loctyp = loctype(found_feats[f].location.start,
                                         found_feats[f].location.end, start,
                                         end)

                        if o1 < o2 and loctyp:
                            skip = True
                            if self.verbose:
                                self.logger.info("Skipping map for " +
                                                 feat_name)
                        elif o2 < o1 and not loctyp:
                            skip = True
                            if self.verbose:
                                self.logger.info("Skipping map for " +
                                                 feat_name)

                if 1 not in mapcheck and not skip:
                    for i in range(si, end + 1):
                        if i in coordinates:
                            # Keep exon_8 / three_prime_UTR positions so
                            # the mapping can be reverted ("Unmap exon 8").
                            if feat_name == "exon_8" or feat_name == 'three_prime_UTR':
                                deleted_coords.update({i: coordinates[i]})
                            del coordinates[i]
                        else:
                            if self.verbose:
                                self.logger.error(
                                    "seqsearch - should't be here " + locus +
                                    " - " + " - " + feat_name)
                        mapping[i] = feat_name

                    found_feats.update({
                        feat_name:
                        SeqFeature(FeatureLocation(ExactPosition(start),
                                                   ExactPosition(end),
                                                   strand=1),
                                   type=feat_name)
                    })

                    if feat_name == "exon_8" or feat_name == 'three_prime_UTR':
                        added_feat.update({feat_name: feats[feat_name]})
                    if self.verbose and self.verbosity > 3:
                        self.logger.info("Coordinates | Start = " +
                                         str(start) + " - End = " + str(end))

            elif (len(seq_search) > 2):
                if self.verbose and self.verbosity > 1:
                    self.logger.info("Found " + str(len(seq_search)) +
                                     " matches for " + feat_name)

                # Keep only the hits whose position (or the one after it)
                # is still unmapped.
                new_seq = [seq_search[0]]
                for i in range(1, len(seq_search)):
                    tnp = seq_search[i] + 1
                    if seq_search[i] in coordinates or tnp in coordinates:
                        new_seq.append(seq_search[i])

                seq_search = new_seq
                if (partial_ann and feat_name == "exon_8" and run > 0):
                    missing_feats = sorted(list(partial_ann.missing.keys()))

                    # * HARD CODED LOGIC * #
                    # > exon8 in class I maps to multiple spots in a sequence,
                    #   often in the 3' UTR. These features need to be mapped
                    #   last to make sure it's not mapping exon8 incorrectly.
                    # NOTE(review): if the filter above removed every hit,
                    # seq_search has length 1 and seq_search[1] below would
                    # raise IndexError - confirm this cannot happen.
                    if (missing_feats == ['exon_8', 'three_prime_UTR']
                            and len(seq_search) <= 3):
                        if self.verbose and self.verbosity > 0:
                            self.logger.info("Resolving exon_8")

                        seq_covered -= len(str(feats[feat_name]))
                        end = int(len(str(feats[feat_name])) + seq_search[1])

                        # The first remaining hit is used as the location
                        # of exon_8; the start is the hit position itself.
                        start = seq_search[1]
                        si = seq_search[1] + 1 if seq_search[1] != 0 else 0

                        # check if this features has already been mapped
                        # NOTE(review): mapcheck is computed here but never
                        # consulted in this branch.
                        mapcheck = set([
                            0 if i in coordinates else 1
                            for i in range(si, end + 1)
                        ])

                        for i in range(si, end + 1):
                            if i in coordinates:
                                del coordinates[i]
                            else:
                                if self.verbose:
                                    self.logger.error(
                                        "seqsearch - should't be here " +
                                        locus + " - " + " - " + feat_name)
                            mapping[i] = feat_name

                        found_feats.update({
                            feat_name:
                            SeqFeature(FeatureLocation(ExactPosition(start),
                                                       ExactPosition(end),
                                                       strand=1),
                                       type=feat_name)
                        })

                        if self.verbose and self.verbosity > 0:
                            self.logger.info("Coordinates | Start = " +
                                             str(start) + " - End = " +
                                             str(end))
                    else:
                        if self.verbose and self.verbosity > 0:
                            self.logger.info("Adding ambig feature " +
                                             feat_name)
                        feat_missing.update({feat_name: feats[feat_name]})
                        ambig_map.update(
                            {feat_name: seq_search[1:len(seq_search)]})
                else:
                    # Several candidate positions: record the feature as
                    # missing and remember the candidates in ambig_map.
                    if self.verbose and self.verbosity > 0:
                        self.logger.info("Adding ambig feature " + feat_name)
                    feat_missing.update({feat_name: feats[feat_name]})
                    ambig_map.update(
                        {feat_name: seq_search[1:len(seq_search)]})
            else:
                if self.verbose and self.verbosity > 1:
                    self.logger.info("No match for " + feat_name)
                feat_missing.update({feat_name: feats[feat_name]})

        # Group the positions that are still unmapped into blocks.
        blocks = getblocks(coordinates)
        exact_matches = list(found_feats.keys())

        # * HARD CODED LOGIC * #
        # >
        #
        #  HLA-DRB1 exon3 exact match - with intron1 and 3 missing
        #  (only active when the caller passes run == 99)
        if ('exon_3' in exact_matches and run == 99 and locus == 'HLA-DRB1'
                and 'exon_2' in feat_missing
                and (len(blocks) == 1 or len(blocks) == 2)):

            for b in blocks:
                x = b[len(b) - 1]
                # A block ending at the last position is labelled
                # intron_3; any other block is labelled exon_2.
                if x == max(list(mapping.keys())):
                    featname = "intron_3"
                    found_feats.update({
                        featname:
                        SeqFeature(FeatureLocation(ExactPosition(b[0] - 1),
                                                   ExactPosition(b[len(b) -
                                                                   1]),
                                                   strand=1),
                                   type=featname)
                    })
                else:
                    featname = "exon_2"
                    found_feats.update({
                        featname:
                        SeqFeature(FeatureLocation(ExactPosition(b[0]),
                                                   ExactPosition(b[len(b) -
                                                                   1]),
                                                   strand=1),
                                   type=featname)
                    })
                    seq_covered -= len(b)

                if self.verbose and self.verbosity > 1:
                    self.logger.info(
                        "Successfully annotated class DRB1 II sequence")

                # NOTE(review): this return sits inside the for loop, so
                # only the first block is ever processed. The class II
                # branch below returns after its loop - confirm which of
                # the two is intended.
                return Annotation(features=found_feats,
                                  covered=seq_covered,
                                  seq=in_seq,
                                  missing=feat_missing,
                                  ambig=ambig_map,
                                  method=method,
                                  mapping=mapping,
                                  exact_match=exact_matches)

        # If it's a class II sequence and
        # exon_2 is an exact match
        # * HARD CODED LOGIC * #
        # > It's common for exon2 to be fully sequenced
        #   but intron_2 and intron_1 to be partially sequenced,
        #   which can make it hard to annotate those to features.
        #   If there are two missing blocks that is small enough
        #   and they are before and after exon2, then it's very
        #   very likely to be intron_2 and intron_1.
        if 'exon_2' in exact_matches and len(blocks) == 2 \
                and is_classII(locus) and seq_covered < 300:

            if self.verbose and self.verbosity > 1:
                self.logger.info("Running search for class II sequence")

            # r stays True only if both blocks are directly adjacent to
            # positions mapped to exon_2.
            r = True
            for b in blocks:
                x = b[len(b) - 1]
                if x == max(list(mapping.keys())):
                    x = b[0] - 1
                else:
                    x += 1
                f = mapping[x]
                if f != 'exon_2':
                    r = False
            if r:
                for b in blocks:
                    x = b[len(b) - 1]
                    # The block ending at the last position is labelled
                    # intron_2, the other one intron_1.
                    if x == max(list(mapping.keys())):
                        featname = "intron_2"
                        found_feats.update({
                            featname:
                            SeqFeature(FeatureLocation(ExactPosition(b[0] - 1),
                                                       ExactPosition(b[len(b) -
                                                                       1]),
                                                       strand=1),
                                       type=featname)
                        })
                    else:
                        featname = "intron_1"
                        found_feats.update({
                            featname:
                            SeqFeature(FeatureLocation(ExactPosition(b[0]),
                                                       ExactPosition(b[len(b) -
                                                                       1]),
                                                       strand=1),
                                       type=featname)
                        })
                    seq_covered -= len(b)

                if self.verbose and self.verbosity > 1:
                    self.logger.info(
                        "Successfully annotated class II sequence")

                return Annotation(features=found_feats,
                                  covered=seq_covered,
                                  seq=in_seq,
                                  missing=feat_missing,
                                  ambig=ambig_map,
                                  method=method,
                                  mapping=mapping,
                                  exact_match=exact_matches)

        # General case: try to assign the remaining blocks to the missing
        # and ambiguous features. mb holds the blocks still unresolved.
        annotated_feats, mb, mapping = self._resolve_unmapped(
            blocks, feat_missing, ambig_map, mapping, found_feats, locus,
            seq_covered)

        # * HARD CODED LOGIC * #
        # Unmapped blocks remain although no feature is missing or
        # ambiguous: report them as missing blocks anyway.
        if (not mb and blocks and len(feat_missing.keys()) == 0
                and len(ambig_map.keys()) == 0):
            mb = blocks

        if mb:

            # Unmap exon 8
            # (restore the positions saved in deleted_coords and put the
            # features saved in added_feat back into feat_missing)
            if locus in ['HLA-C', 'HLA-A'] and len(in_seq.seq) < 3000 \
                    and 'exon_8' in exact_matches:
                for i in deleted_coords:
                    mapping[i] = 1
                coordinates.update(deleted_coords)
                mb = getblocks(coordinates)
                feat_missing.update(added_feat)

                # Delete from found features
                del exact_matches[exact_matches.index('exon_8')]
                del found_feats['exon_8']

                if 'exon_8' in annotated_feats:
                    del annotated_feats['exon_8']
                if 'three_prime_UTR' in found_feats:
                    del found_feats['three_prime_UTR']
                if 'three_prime_UTR' in annotated_feats:
                    del annotated_feats['three_prime_UTR']

            # Features of the locus structure that were not annotated.
            refmissing = [
                f for f in structures[locus] if f not in annotated_feats
            ]

            if self.verbose and self.verbosity > 1:
                self.logger.info("* Annotation not complete *")

            # Print out what features were missing by the ref
            if self.verbose and self.verbosity > 2:
                self.logger.info("Refseq was missing these features = " +
                                 ",".join(list(refmissing)))

            # Print out what features were ambig matches
            # (the summaries below are only logged when there is more
            # than one entry)
            if self.verbose and self.verbosity > 1 and len(ambig_map) > 1:
                self.logger.info("Features with ambig matches = " +
                                 ",".join(list(ambig_map)))

            # Print out what features were exact matches
            if self.verbose and self.verbosity > 2 and len(exact_matches) > 1:
                self.logger.info("Features exact matches = " +
                                 ",".join(list(exact_matches)))

            # Print out what features have been annotated
            if self.verbose and self.verbosity > 1 and len(
                    annotated_feats) > 1:
                self.logger.info("Features annotated = " +
                                 ",".join(list(annotated_feats)))

            # Print out what features are missing
            if self.verbose and self.verbosity > 1 and len(feat_missing) > 1:
                self.logger.info("Features missing = " +
                                 ",".join(list(feat_missing)))

            annotation = Annotation(features=annotated_feats,
                                    covered=seq_covered,
                                    seq=in_seq,
                                    missing=feat_missing,
                                    ambig=ambig_map,
                                    blocks=mb,
                                    method=method,
                                    refmissing=refmissing,
                                    mapping=mapping,
                                    exact_match=exact_matches,
                                    annotation=None)
        else:

            mb = None
            # Unmap exon 8
            # (stricter variant of the condition used above: shorter
            # sequence limit, and three_prime_UTR must have been annotated
            # without being an exact match)
            if locus in ['HLA-C', 'HLA-A'] and len(in_seq.seq) < 600 \
                    and 'exon_8' in exact_matches \
                    and 'three_prime_UTR' in annotated_feats\
                    and 'three_prime_UTR' not in exact_matches:

                for i in deleted_coords:
                    mapping[i] = 1

                coordinates.update(deleted_coords)
                mb = getblocks(coordinates)
                feat_missing.update(added_feat)
                del exact_matches[exact_matches.index('exon_8')]
                del found_feats['exon_8']
                if 'exon_8' in annotated_feats:
                    del annotated_feats['exon_8']
                if 'three_prime_UTR' in found_feats:
                    del found_feats['three_prime_UTR']
                if 'three_prime_UTR' in annotated_feats:
                    del annotated_feats['three_prime_UTR']

            if self.verbose:
                self.logger.info("* No missing blocks after seq_search *")

            # Print out what features were ambig matches
            if self.verbose and self.verbosity > 0 and len(ambig_map) > 1:
                self.logger.info("Features with ambig matches = " +
                                 ",".join(list(ambig_map)))

            # Print out what features were exact matches
            if self.verbose and self.verbosity > 0 and len(exact_matches) > 1:
                self.logger.info("Features exact matches = " +
                                 ",".join(list(exact_matches)))

            # Print out what features have been annotated
            if self.verbose and self.verbosity > 0 and len(
                    annotated_feats) > 1:
                self.logger.info("Features annotated = " +
                                 ",".join(list(annotated_feats)))

            # Print out what features are missing
            if self.verbose and self.verbosity > 0 and len(feat_missing) > 1:
                self.logger.info("Features missing = " +
                                 ",".join(list(feat_missing)))

            # Unlike the branch above, no refmissing is passed here.
            annotation = Annotation(features=annotated_feats,
                                    covered=seq_covered,
                                    seq=in_seq,
                                    missing=feat_missing,
                                    ambig=ambig_map,
                                    method=method,
                                    blocks=mb,
                                    mapping=mapping,
                                    exact_match=exact_matches,
                                    annotation=None)

        return annotation
Esempio n. 4
0
 def test_003_is_classII(self):
     """Check that is_classII accepts DRB1/DQB1 alleles and rejects HLA-A."""
     # Alleles that must be recognised as class II.
     for class2_allele in ('HLA-DRB1*15:01', 'HLA-DQB1*06:01'):
         self.assertTrue(is_classII(class2_allele))
     # A class I allele must not be recognised as class II.
     self.assertFalse(is_classII('HLA-A*02:01'))