def find_similar(self, gfe, features, imgtdb_version):
    """
    Build a typing for ``gfe`` from entries that share its key exons.

    Class I GFEs are matched on the EXON-2/EXON-3 accession pair, class II
    GFEs on the EXON-2 accession alone. When ``self.gfe2hla`` is populated
    the candidates come from that in-memory table (``EXON23`` / ``EXON2``
    and ``DB`` columns); otherwise a cypher query is built and run against
    ``self.graph``. KIR GFEs are delegated to ``self.find_gfe_kir``.

    :param gfe: GFE notation string. The text before the ``"w"`` separator
        is used as the locus key into ``self.gfe2hla``.
    :param features: passed through unchanged to ``self.create_typing`` /
        ``self.find_gfe_kir``.
    :param imgtdb_version: database version; compared against the ``DB``
        column or handed to the cypher builder.
    :return: the result of ``self.create_typing`` (or of
        ``self.find_gfe_kir`` for KIR); ``None`` when ``gfe`` is neither
        class I, class II nor KIR.
    :raises ValueError: if ``gfe`` does not split into exactly two parts
        on ``"w"``.
    """
    # Choose the lookup column, the exons that define "similar", and the
    # cypher builder for the graph fallback.
    if is_classI(gfe):
        exon_column = 'EXON23'
        exon_names = ("EXON-2", "EXON-3")
        build_cypher = similar_gfe_classI
    elif is_classII(gfe):
        exon_column = 'EXON2'
        exon_names = ("EXON-2",)
        build_cypher = similar_gfe_classII
    elif is_kir(gfe):
        return self.find_gfe_kir(gfe, features)
    else:
        return None

    gfe_dict = self.breakup_gfe(gfe)
    # Two-way unpack on purpose: a malformed GFE still raises ValueError.
    locus, _ = gfe.split("w")
    exon_accessions = [gfe_dict[name] for name in exon_names]

    if self.gfe2hla:
        # In-memory path: filter the per-locus table on exon key + version.
        locus_df = self.gfe2hla[locus]
        exon_key = "-".join(exon_accessions)
        is_match = ((locus_df[exon_column] == exon_key)
                    & (locus_df['DB'] == imgtdb_version))
        df = locus_df[is_match][['GFE', 'HLA']].reset_index()
        return self.create_typing(df, gfe, features)

    # Graph path: same criteria expressed as a cypher query.
    cypher = build_cypher(gfe, *exon_accessions, imgtdb_version)
    similar_data = self.graph.run(cypher).to_data_frame()
    return self.create_typing(similar_data, gfe, features)
def __init__(self,
             url="http://feature.nmdp-bioinformatics.org",
             loci=('HLA-A', 'HLA-B', 'HLA-C', 'HLA-DRB1', 'HLA-DQB1',
                   'HLA-DRB4', 'HLA-DRB5', 'HLA-DPB1', 'HLA-DPA1',
                   'HLA-DQA1', 'HLA-DRB3'),
             graph: Graph = None,
             seqann: Any = None,
             features: Dict = None,
             verbose: bool = False,
             kir: bool = False,
             pid: str = "NA",
             gfe2hla: Dict = None,
             gfe_feats: DataFrame = None,
             seq2hla: DataFrame = None,
             load_gfe2hla: bool = False,
             load_seq2hla: bool = False,
             load_gfe2feat: bool = False,
             verbosity=1):
    """
    Constructor.

    :param url: accepted for interface compatibility; not read here.
    :param loci: accepted for interface compatibility; not read here.
    :param graph: graph connection; ``graph.run(...)`` is called when any
        of the ``load_*`` flags is set.
    :param seqann: a single ``BioSeqAnn``, a list of them, or a dict.
        Single objects and lists are re-keyed by
        ``refdata.dbversion``. Omitted or ``None`` gives each instance
        its own empty dict.
    :param features: stored as ``self.features``.
    :param verbose: stored as ``self.verbose``.
    :param kir: stored as ``self.kir``.
    :param pid: identifier used to build the ``self.logname`` prefix.
    :param gfe2hla: pre-built GFE-to-HLA lookup; replaced when
        ``load_gfe2hla`` is set.
    :param gfe_feats: pre-built table; replaced when ``load_gfe2feat``
        is set.
    :param seq2hla: pre-built table; replaced when ``load_seq2hla`` is set.
    :param load_gfe2hla: build ``self.gfe2hla`` from the graph.
    :param load_seq2hla: load ``self.seq2hla`` from the graph.
    :param load_gfe2feat: load ``self.gfe_feats`` from the graph.
    :param verbosity: accepted for interface compatibility; not read here.
    :raises SeqAnnException: if ``seqann`` is truthy but is not a dict,
        a ``BioSeqAnn`` or a list.
    """
    # TODO: Add catch if seqann or graph aren't defined
    self.kir = kir
    self.graph = graph
    self.logger = logging.getLogger("Logger." + __name__)

    if pid:
        self.logname = "ID {:<10} - ".format(str(pid))
    else:
        self.logname = ''

    # A fresh dict per instance: a shared ``{}`` default would leak
    # annotators between instances as soon as one of them mutated it.
    if seqann is None:
        seqann = {}

    if not isinstance(seqann, Dict) and seqann:
        if isinstance(seqann, BioSeqAnn):
            self.seqann = {seqann.refdata.dbversion: seqann}
        elif isinstance(seqann, List):
            self.seqann = {ann.refdata.dbversion: ann for ann in seqann}
        else:
            raise SeqAnnException(inputtype=type(seqann),
                                  reason="Can't initalize seqann")
    else:
        self.seqann = seqann

    self.features = features
    self.gfe2hla = gfe2hla
    self.seq2hla = seq2hla
    self.gfe_feats = gfe_feats
    self.verbose = verbose
    self.structures = get_structures()

    def _strip_db_dots(frame):
        # Remove the dots from every value in the 'DB' column, in place.
        frame['DB'] = frame['DB'].apply(lambda db: db.replace(".", ""))
        return frame

    # ISSUE: gfe_feats & seq2hla need to be loaded together
    if load_gfe2feat:
        self.gfe_feats = _strip_db_dots(
            self.graph.run(all_gfe2feats()).to_data_frame())

    if load_seq2hla:
        self.seq2hla = _strip_db_dots(
            self.graph.run(all_seq2hla()).to_data_frame())

    if load_gfe2hla:
        tmp_gfe = {}
        gfehla_df = self.graph.run(all_gfe2hla()).to_data_frame()
        for loc in gfehla_df['LOC'].unique().tolist():
            # Raw string: "\D" in a normal literal is an invalid escape.
            # Matches loci named "HLA-" plus one trailing non-digit.
            if re.search(r"HLA-\D$", loc):
                # .copy() so the new column is added to an independent
                # frame rather than to a slice of gfehla_df.
                loc_df = gfehla_df.loc[gfehla_df['LOC'] == loc].copy()
                loc1 = self.structures[loc]['exon-2']
                loc2 = self.structures[loc]['exon-3']

                def _exon23(gfe, i2=loc1, i3=loc3 if False else loc2):
                    parts = gfe.split("-")
                    return "-".join([parts[i2], parts[i3]])

                loc_df['EXON23'] = loc_df['GFE'].apply(_exon23)
                tmp_gfe.update({loc: loc_df})

            if is_classII(loc):
                loc_df = gfehla_df.loc[gfehla_df['LOC'] == loc].copy()
                loc1 = self.structures[loc]['exon-2']
                loc_df['EXON2'] = loc_df['GFE'].apply(
                    lambda gfe, i2=loc1: gfe.split("-")[i2])
                tmp_gfe.update({loc: loc_df})
        self.gfe2hla = tmp_gfe
def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None):
    """
    search_seqs - method for annotating a BioPython sequence without alignment

    Each feature sequence taken from the reference record is searched for
    in the input sequence with ``nt_search``. Features with exactly one hit
    are mapped, features with several hits are recorded as ambiguous, and
    features with no hit are recorded as missing. The remaining unmapped
    positions are then handed to ``self._resolve_unmapped``.

    When ``partial_ann`` is given, ``partial_ann.features`` and
    ``partial_ann.mapping`` are reused directly (not copied), so they are
    updated in place by this call.

    :param seqrec: The reference sequence
    :type seqrec: SeqRecord
    :param in_seq: The input sequence (its ``.seq`` attribute is read)
    :type in_seq: SeqRecord
    :param locus: The gene locus associated with the sequence.
    :type locus: str
    :param run: The number of runs that have been done. ``run > 0`` enables
        the exon_8 resolution step and ``run == 99`` enables the HLA-DRB1
        special case.
    :type run: int
    :param partial_ann: A partial annotation from a previous step
    :type partial_ann: :ref:`ann`
    :return: A new annotation, or ``partial_ann`` unchanged when every
        feature of the reference is already present in it.
    :rtype: :ref:`ann`

    Example usage:

        >>> from Bio.Seq import Seq
        >>> from Bio.SeqRecord import SeqRecord
        >>> from seqann.seq_search import SeqSearch
        >>> inseq = SeqRecord(Seq('AGAGACTCTCCCGAGGATTTCGTGTACCAGTTTAAGGCCATGTGCTACTTCACC'))
        >>> sqsrch = SeqSearch()
        >>> ann = sqsrch.search_seqs(refseq, inseq, 'HLA-DRB1')

    """
    # NOTE(review): this block was recovered from whitespace-collapsed
    # text. Nesting was reconstructed from the syntax; the few places where
    # more than one nesting is syntactically possible are marked below and
    # should be confirmed against version control.

    # Extract out the sequences and feature names
    # from the reference sequences

    # The mapped features will be subtracted from seq_covered
    # so the final seq_covered number will reflect the remaining
    # number of base pairs that haven't been mapped.
    #
    # The coordinates and mapping will help determine what positions
    # in the sequence have been mapped and to what features. The
    # missing blocks variable will be generated using these.
    structures = get_structures()
    seq_covered = len(in_seq.seq)

    # coordinates: positions that are still unmapped (keys are deleted as
    # features are placed). mapping: position -> feature name, with 1
    # meaning "not mapped yet".
    coordinates = dict(
        map(lambda x: [x, 1], [i for i in range(0, len(in_seq.seq) + 1)]))

    mapping = dict(
        map(lambda x: [x, 1], [i for i in range(0, len(in_seq.seq) + 1)]))

    ambig_map = {}
    found_feats = {}
    feat_missing = {}

    method = "nt_search" if not partial_ann else partial_ann.method

    # If the partial annotation is provided
    # then make the found_feats equal to
    # what has already been annotated
    feats = get_features(seqrec)
    if partial_ann:

        # Same dict object as partial_ann.features: later updates to
        # found_feats are visible on partial_ann as well.
        found_feats = partial_ann.features

        if self.verbose and self.verbosity > 4:
            self.logger.info("Found partial features:")
            for f in found_feats:
                self.logger.info(f)

        # Skip references that only have features
        # that have already been annotated
        if len([f for f in feats if f in found_feats]) == len(feats):
            if self.verbose:
                self.logger.info("Skipping incomplete refseq")
            return partial_ann

        if self.verbose and self.verbosity > 1:
            self.logger.info("Using partial annotation | " + locus + " " +
                             str(len(partial_ann.features)))

        # Only the positions inside the partial annotation's blocks are
        # still open for mapping.
        coordinates = dict(
            map(lambda l: [l, 1], [
                item for sublist in partial_ann.blocks for item in sublist
            ]))
        seq_covered = partial_ann.covered
        mapping = partial_ann.mapping

        if self.verbose and self.verbosity > 2:
            self.logger.info("Partial sequence coverage = " +
                             str(seq_covered))
            self.logger.info("Partial sequence metho = " + method)

    # added_feat / deleted_coords remember what was mapped for exon_8 and
    # three_prime_UTR so that mapping can be undone further down.
    added_feat = {}
    deleted_coords = {}

    # Features are visited in the order given by structures[locus].
    for feat_name in sorted(feats, key=lambda k: structures[locus][k]):

        # skip if partial annotation is provided
        # and the feat name is not one of the
        # missing features
        if partial_ann and feat_name not in partial_ann.refmissing:
            if self.verbose and self.verbosity > 1:
                self.logger.info("Skipping " + feat_name +
                                 " - Already annotated")
            continue

        if self.verbose and self.verbosity > 1:
            self.logger.info("Running seqsearch for " + feat_name)

        # Search for the reference feature sequence in the
        # input sequence. Record the coordinates if it's
        # found and if it's found in multiple spots. If it
        # is not found, then record that feature as missing.
        #
        # Element 0 of the result is carried along untouched; elements 1..n
        # are used as match offsets, so a length of 2 means exactly one hit.
        seq_search = nt_search(str(in_seq.seq), str(feats[feat_name]))

        if len(seq_search) == 2:

            if self.verbose and self.verbosity > 0:
                self.logger.info("Found exact match for " + feat_name)

            seq_covered -= len(str(feats[feat_name]))
            end = int(len(str(feats[feat_name])) + seq_search[1])

            # A matched 3' UTR is extended to the end of the input.
            if feat_name == 'three_prime_UTR' \
                    and len(str(in_seq.seq)) > end:
                end = len(str(in_seq.seq))

            # If the feature is found and it's a five_prime_UTR then
            # the start should always be 0, so insertions at the
            # beginning of the sequence will be found.
            start = seq_search[1] if feat_name != 'five_prime_UTR' else 0
            si = seq_search[1]+1 if seq_search[1] != 0 and \
                feat_name != 'five_prime_UTR' else 0

            # check if this feature has already been mapped:
            # a 1 in mapcheck means some position in [si, end] is no
            # longer available in coordinates.
            mapcheck = set(
                [0 if i in coordinates else 1 for i in range(si, end + 1)])

            # Don't map features if they are out of order.
            # NOTE(review): loctype is defined elsewhere; its result is
            # used here as "the new span is positioned so that a lower
            # structure rank is acceptable" - confirm its exact meaning.
            skip = False
            if found_feats and len(found_feats) > 0:
                for f in found_feats:
                    o1 = structures[locus][feat_name]
                    o2 = structures[locus][f]
                    loctyp = loctype(found_feats[f].location.start,
                                     found_feats[f].location.end,
                                     start, end)

                    if o1 < o2 and loctyp:
                        skip = True
                        if self.verbose:
                            self.logger.info("Skipping map for " +
                                             feat_name)
                    elif o2 < o1 and not loctyp:
                        skip = True
                        if self.verbose:
                            self.logger.info("Skipping map for " +
                                             feat_name)

            if 1 not in mapcheck and not skip:
                for i in range(si, end + 1):
                    if i in coordinates:
                        # Remember exon_8 / 3' UTR positions so the
                        # mapping can be reverted later.
                        if feat_name == "exon_8" or feat_name == 'three_prime_UTR':
                            deleted_coords.update({i: coordinates[i]})
                        del coordinates[i]
                    else:
                        if self.verbose:
                            self.logger.error(
                                "seqsearch - should't be here " + locus +
                                " - " + " - " + feat_name)
                    mapping[i] = feat_name

                found_feats.update({
                    feat_name:
                    SeqFeature(FeatureLocation(ExactPosition(start),
                                               ExactPosition(end),
                                               strand=1),
                               type=feat_name)
                })

                if feat_name == "exon_8" or feat_name == 'three_prime_UTR':
                    added_feat.update({feat_name: feats[feat_name]})
                # NOTE(review): nesting of this log statement was
                # reconstructed; it only affects logging.
                if self.verbose and self.verbosity > 3:
                    self.logger.info("Coordinates | Start = " +
                                     str(start) + " - End = " + str(end))

        elif (len(seq_search) > 2):
            if self.verbose and self.verbosity > 1:
                self.logger.info("Found " + str(len(seq_search)) +
                                 " matches for " + feat_name)

            # Keep only the hits whose offset (or the position right after
            # it) is still unmapped.
            new_seq = [seq_search[0]]
            for i in range(1, len(seq_search)):
                tnp = seq_search[i] + 1
                if seq_search[i] in coordinates or tnp in coordinates:
                    new_seq.append(seq_search[i])

            seq_search = new_seq
            if (partial_ann and feat_name == "exon_8" and run > 0):
                missing_feats = sorted(list(partial_ann.missing.keys()))

                # * HARD CODED LOGIC * #
                # > exon8 in class I maps to multiple spots in a sequence,
                #   often in the 3' UTR. These features need to be mapped
                #   last to make sure it's not mapping exon8 incorrectly.
                if (missing_feats == ['exon_8', 'three_prime_UTR']
                        and len(seq_search) <= 3):
                    if self.verbose and self.verbosity > 0:
                        self.logger.info("Resolving exon_8")

                    # The first remaining hit is taken as exon_8.
                    seq_covered -= len(str(feats[feat_name]))
                    end = int(len(str(feats[feat_name])) + seq_search[1])

                    # feat_name is always exon_8 in this branch, so the
                    # start is simply the offset of the chosen hit.
                    start = seq_search[1]
                    si = seq_search[1] + 1 if seq_search[1] != 0 else 0

                    # check if this feature has already been mapped
                    # (computed but not consulted in this branch)
                    mapcheck = set([
                        0 if i in coordinates else 1
                        for i in range(si, end + 1)
                    ])

                    for i in range(si, end + 1):
                        if i in coordinates:
                            del coordinates[i]
                        else:
                            if self.verbose:
                                self.logger.error(
                                    "seqsearch - should't be here " +
                                    locus + " - " + " - " + feat_name)
                        mapping[i] = feat_name

                    found_feats.update({
                        feat_name:
                        SeqFeature(FeatureLocation(ExactPosition(start),
                                                   ExactPosition(end),
                                                   strand=1),
                                   type=feat_name)
                    })

                    if self.verbose and self.verbosity > 0:
                        self.logger.info("Coordinates | Start = " +
                                         str(start) + " - End = " +
                                         str(end))

                else:
                    if self.verbose and self.verbosity > 0:
                        self.logger.info("Adding ambig feature " +
                                         feat_name)
                    feat_missing.update({feat_name: feats[feat_name]})
                    ambig_map.update(
                        {feat_name: seq_search[1:len(seq_search)]})
            else:
                if self.verbose and self.verbosity > 0:
                    self.logger.info("Adding ambig feature " + feat_name)
                feat_missing.update({feat_name: feats[feat_name]})
                ambig_map.update(
                    {feat_name: seq_search[1:len(seq_search)]})
        else:
            if self.verbose and self.verbosity > 1:
                self.logger.info("No match for " + feat_name)
            feat_missing.update({feat_name: feats[feat_name]})

    # Contiguous runs of positions that are still unmapped.
    blocks = getblocks(coordinates)
    exact_matches = list(found_feats.keys())

    # * HARD CODED LOGIC * #
    # >
    #
    #  HLA-DRB1 exon3 exact match - with intron1 and 3 missing
    if ('exon_3' in exact_matches and run == 99 and locus == 'HLA-DRB1'
            and 'exon_2' in feat_missing
            and (len(blocks) == 1 or len(blocks) == 2)):

        for b in blocks:
            x = b[len(b) - 1]
            # The block that reaches the last mapped position is labelled
            # intron_3; any other block is labelled exon_2.
            if x == max(list(mapping.keys())):
                featname = "intron_3"
                found_feats.update({
                    featname:
                    SeqFeature(FeatureLocation(ExactPosition(b[0] - 1),
                                               ExactPosition(b[len(b) - 1]),
                                               strand=1),
                               type=featname)
                })
            else:
                featname = "exon_2"
                found_feats.update({
                    featname:
                    SeqFeature(FeatureLocation(ExactPosition(b[0]),
                                               ExactPosition(b[len(b) - 1]),
                                               strand=1),
                               type=featname)
                })
            # NOTE(review): nesting reconstructed - this may belong inside
            # the else branch above; confirm against version control.
            seq_covered -= len(b)

        if self.verbose and self.verbosity > 1:
            self.logger.info(
                "Successfully annotated class DRB1 II sequence")

        return Annotation(features=found_feats,
                          covered=seq_covered,
                          seq=in_seq,
                          missing=feat_missing,
                          ambig=ambig_map,
                          method=method,
                          mapping=mapping,
                          exact_match=exact_matches)

    # If it's a class II sequence and
    # exon_2 is an exact match
    # * HARD CODED LOGIC * #
    # > It's common for exon2 to be fully sequenced
    #   but intron_2 and intron_1 to be partially sequenced,
    #   which can make it hard to annotate those to features.
    #   If there are two missing blocks that is small enough
    #   and they are before and after exon2, then it's very
    #   very likely to be intron_2 and intron_1.
    if 'exon_2' in exact_matches and len(blocks) == 2 \
            and is_classII(locus) and seq_covered < 300:

        if self.verbose and self.verbosity > 1:
            self.logger.info("Running search for class II sequence")

        # r stays True only if both blocks directly border exon_2.
        r = True
        for b in blocks:
            x = b[len(b) - 1]
            if x == max(list(mapping.keys())):
                x = b[0] - 1
            else:
                x += 1
            f = mapping[x]
            if f != 'exon_2':
                r = False
        if r:
            for b in blocks:
                x = b[len(b) - 1]
                if x == max(list(mapping.keys())):
                    featname = "intron_2"
                    found_feats.update({
                        featname:
                        SeqFeature(FeatureLocation(
                            ExactPosition(b[0] - 1),
                            ExactPosition(b[len(b) - 1]),
                            strand=1),
                                   type=featname)
                    })
                else:
                    featname = "intron_1"
                    found_feats.update({
                        featname:
                        SeqFeature(FeatureLocation(
                            ExactPosition(b[0]),
                            ExactPosition(b[len(b) - 1]),
                            strand=1),
                                   type=featname)
                    })
                # NOTE(review): nesting reconstructed - this may belong
                # inside the else branch above; confirm.
                seq_covered -= len(b)

            if self.verbose and self.verbosity > 1:
                self.logger.info(
                    "Successfully annotated class II sequence")

            return Annotation(features=found_feats,
                              covered=seq_covered,
                              seq=in_seq,
                              missing=feat_missing,
                              ambig=ambig_map,
                              method=method,
                              mapping=mapping,
                              exact_match=exact_matches)

    # General case: let _resolve_unmapped work on the leftover blocks.
    # mb is the list of blocks that remain unmapped afterwards.
    annotated_feats, mb, mapping = self._resolve_unmapped(
        blocks, feat_missing, ambig_map, mapping, found_feats, locus,
        seq_covered)

    # * HARD CODED LOGIC * #
    # Nothing is missing or ambiguous, yet unmapped blocks remain: report
    # them as missing blocks anyway.
    if (not mb and blocks and len(feat_missing.keys()) == 0
            and len(ambig_map.keys()) == 0):
        mb = blocks

    if mb:

        # Unmap exon 8: put its positions (and those of the 3' UTR) back
        # into the unmapped pool and treat both features as missing again.
        if locus in ['HLA-C', 'HLA-A'] and len(in_seq.seq) < 3000 \
                and 'exon_8' in exact_matches:
            for i in deleted_coords:
                mapping[i] = 1
            coordinates.update(deleted_coords)
            mb = getblocks(coordinates)
            feat_missing.update(added_feat)

            # Delete from found features
            del exact_matches[exact_matches.index('exon_8')]
            del found_feats['exon_8']

            if 'exon_8' in annotated_feats:
                del annotated_feats['exon_8']
            if 'three_prime_UTR' in found_feats:
                del found_feats['three_prime_UTR']
            if 'three_prime_UTR' in annotated_feats:
                del annotated_feats['three_prime_UTR']

        # Features of the locus structure that did not get annotated.
        refmissing = [
            f for f in structures[locus] if f not in annotated_feats
        ]

        if self.verbose and self.verbosity > 1:
            self.logger.info("* Annotation not complete *")

        # Print out what features were missing by the ref
        if self.verbose and self.verbosity > 2:
            self.logger.info("Refseq was missing these features = " +
                             ",".join(list(refmissing)))

        # Print out what features were ambig matches
        if self.verbose and self.verbosity > 1 and len(ambig_map) > 1:
            self.logger.info("Features with ambig matches = " +
                             ",".join(list(ambig_map)))

        # Print out what features were exact matches
        if self.verbose and self.verbosity > 2 and len(exact_matches) > 1:
            self.logger.info("Features exact matches = " +
                             ",".join(list(exact_matches)))

        # Print out what features have been annotated
        if self.verbose and self.verbosity > 1 and len(
                annotated_feats) > 1:
            self.logger.info("Features annotated = " +
                             ",".join(list(annotated_feats)))

        # Print out what features are missing
        if self.verbose and self.verbosity > 1 and len(feat_missing) > 1:
            self.logger.info("Features missing = " +
                             ",".join(list(feat_missing)))

        annotation = Annotation(features=annotated_feats,
                                covered=seq_covered,
                                seq=in_seq,
                                missing=feat_missing,
                                ambig=ambig_map,
                                blocks=mb,
                                method=method,
                                refmissing=refmissing,
                                mapping=mapping,
                                exact_match=exact_matches,
                                annotation=None)
    else:

        mb = None
        # Unmap exon 8: same revert as above, but only for short inputs
        # where the 3' UTR was annotated without being an exact match.
        if locus in ['HLA-C', 'HLA-A'] and len(in_seq.seq) < 600 \
                and 'exon_8' in exact_matches \
                and 'three_prime_UTR' in annotated_feats\
                and 'three_prime_UTR' not in exact_matches:

            for i in deleted_coords:
                mapping[i] = 1

            coordinates.update(deleted_coords)
            mb = getblocks(coordinates)
            feat_missing.update(added_feat)
            del exact_matches[exact_matches.index('exon_8')]
            del found_feats['exon_8']
            if 'exon_8' in annotated_feats:
                del annotated_feats['exon_8']
            if 'three_prime_UTR' in found_feats:
                del found_feats['three_prime_UTR']
            if 'three_prime_UTR' in annotated_feats:
                del annotated_feats['three_prime_UTR']

        if self.verbose:
            self.logger.info("* No missing blocks after seq_search *")

        # Print out what features were ambig matches
        if self.verbose and self.verbosity > 0 and len(ambig_map) > 1:
            self.logger.info("Features with ambig matches = " +
                             ",".join(list(ambig_map)))

        # Print out what features were exact matches
        if self.verbose and self.verbosity > 0 and len(exact_matches) > 1:
            self.logger.info("Features exact matches = " +
                             ",".join(list(exact_matches)))

        # Print out what features have been annotated
        if self.verbose and self.verbosity > 0 and len(
                annotated_feats) > 1:
            self.logger.info("Features annotated = " +
                             ",".join(list(annotated_feats)))

        # Print out what features are missing
        if self.verbose and self.verbosity > 0 and len(feat_missing) > 1:
            self.logger.info("Features missing = " +
                             ",".join(list(feat_missing)))

        annotation = Annotation(features=annotated_feats,
                                covered=seq_covered,
                                seq=in_seq,
                                missing=feat_missing,
                                ambig=ambig_map,
                                method=method,
                                blocks=mb,
                                mapping=mapping,
                                exact_match=exact_matches,
                                annotation=None)

    return annotation
def test_003_is_classII(self):
    """is_classII is true for DRB1/DQB1 typings and false for an HLA-A one."""
    class_two_typings = ('HLA-DRB1*15:01', 'HLA-DQB1*06:01')
    for allele in class_two_typings:
        self.assertTrue(is_classII(allele))

    class_one_typing = 'HLA-A*02:01'
    self.assertFalse(is_classII(class_one_typing))