def matching_features(self, gfe1, gfe2, structures):
    """
    Collect the features that two GFE notations have in common.

    Both notations are split with ``self.breakup_gfe``. Every feature key
    that occurs in both results with an equal accession yields one
    ``Feature`` in the returned list.

    :param gfe1: first GFE notation; its feature order drives the result.
    :param gfe2: second GFE notation to compare against.
    :param structures: mapping from feature key to sequence. UTR entries
        are looked up under ``"<term>-1"``.
    :return: list of ``Feature`` objects shared by both notations.
    """
    # TODO: Update looping
    utr_terms = ("FIVE_PRIME_UTR", "THREE_PRIME_UTR")
    first_parts = self.breakup_gfe(gfe1)
    second_parts = self.breakup_gfe(gfe2)

    shared = []
    for key in first_parts:
        # Only features present in both notations can match.
        if key not in second_parts:
            continue
        if not first_parts[key] == second_parts[key]:
            continue

        if key in utr_terms:
            # UTR keys carry no rank suffix: rank 1 is used and the
            # lookups go through the "<term>-1" key.
            ranked_key = "-".join([key, str(1)])
            match = Feature(term=key,
                            rank=1,
                            sequence=structures[ranked_key],
                            accession=first_parts[ranked_key])
        else:
            term, rank = key.split('-')
            match = Feature(term=term.upper(),
                            rank=rank,
                            sequence=structures[key],
                            accession=first_parts[key])
        shared.append(match)
    return shared
def sequence_lookup(self, locus, sequence, imgtdb_version):
    """
    Look up a sequence that is already documented for a locus.

    When ``self.seq2hla`` is a non-empty DataFrame the lookup is answered
    from the preloaded tables (``self.seq2hla`` and ``self.gfe_feats``);
    otherwise the graph behind ``self.graph`` is queried.

    :param locus: string containing HLA locus.
    :param sequence: string containing sequence data.
    :param imgtdb_version: database version used to filter the preloaded
        tables (compared against their ``DB`` column).
    :return: ``[hla, gfe, features]`` when the sequence is found,
        otherwise ``None``.
    """

    def features_from_frame(frame):
        # Turn each row of a feature query result into a Feature object.
        return [
            Feature(accession=frame['accession'][i],
                    rank=frame['rank'][i],
                    sequence=frame['sequence'][i],
                    term=lc(frame['term'][i]))
            for i in range(len(frame['term']))
        ]

    # TODO: just initalize seq2hla as DataFrame
    if isinstance(self.seq2hla, DataFrame) and not self.seq2hla.empty:
        cache = self.seq2hla
        hits = cache[(cache['DB'] == imgtdb_version)
                     & (cache['LOC'] == locus)
                     & (cache['SEQ'] == sequence)]
        if hits.empty:
            # Preloaded table is authoritative here: no graph fallback.
            return None

        hla = hits['HLA'].tolist()[0]
        gfe = hits['GFE'].tolist()[0]

        feat_table = self.gfe_feats
        feat_rows = feat_table[(feat_table['GFE'] == gfe)
                               & (feat_table['DB'] == imgtdb_version)
                               ]['FEATS'].tolist()
        # A GFE without a row in the feature table used to raise
        # IndexError; treat it like an empty entry and ask the graph.
        feats = feat_rows[0] if feat_rows else None

        if feats:
            features = [
                Feature(accession=feat['accession'],
                        rank=feat['rank'],
                        sequence=feat['sequence'],
                        term=lc(feat['term']))
                for feat in feats
            ]
        else:
            features = features_from_frame(
                self.graph.run(get_features(gfe)).to_data_frame())
        return [hla, gfe, features]

    sequence_data = self.graph.run(
        sequence_search(locus, sequence)).to_data_frame()
    if sequence_data.empty:
        return None

    # Take HLA and GFE from the same (first) row, as the cached path does.
    # Picking each from an independent set() was order-unstable and could
    # pair an HLA name with a GFE from a different row.
    hla = sequence_data["HLA"].tolist()[0]
    gfe = sequence_data["GFE"].tolist()[0]
    features = features_from_frame(
        self.graph.run(get_features(gfe)).to_data_frame())
    return [hla, gfe, features]
def gfeNotation_post(sequence, locus, gene):
    """gfeNotation_post

    Annotate a sequence and return its GFE notation together with the
    features the notation was built from.

    :param sequence: request body; the sequence string is read from its
        ``'sequence'`` key
    :param locus: Valid HLA locus
    :param gene: Kir true or false; a truthy value selects
        ``BioSeqAnn(kir=True)``
    :rtype: dict with keys ``'gfe'`` and ``'feature'`` on success,
        ``(Error, 404)`` when annotation or GFE creation raises
    """
    sequence = SeqRecord(seq=Seq(sequence['sequence']))

    log_capture_string = io.StringIO()
    logger = logging.getLogger('')
    logging.basicConfig(datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=logging.INFO)

    # Capture what is logged during this call so it can be returned to
    # the client when something fails.
    ch = logging.StreamHandler(log_capture_string)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)-35s - %(levelname)-5s '
        '- %(funcName)s %(lineno)d: - %(message)s')
    ch.setFormatter(formatter)
    ch.setLevel(logging.INFO)
    logger.addHandler(ch)

    def captured_log():
        # Lines logged so far through the per-call handler.
        return log_capture_string.getvalue().split("\n")

    try:
        gfe = GFE()
        seqann = BioSeqAnn(kir=True) if gene else BioSeqAnn()

        try:
            annotation = seqann.annotate(sequence)
        except Exception as e:
            print(e)
            return Error("An error occured during the annotation",
                         log=captured_log()), 404

        try:
            res_feature, res_gfe = gfe.get_gfe(annotation, locus)
        except Exception as e:
            print(e)
            return Error("An error occurred in getting the gfe of annotation",
                         log=captured_log()), 404

        feats = [
            Feature(accession=f.accession,
                    rank=f.rank,
                    term=f.term,
                    sequence=f.sequence)
            for f in res_feature
        ]
        return {'gfe': res_gfe, 'feature': feats}
    finally:
        # Without this the root logger gains one more handler (and keeps
        # its StringIO alive) on every request.
        logger.removeHandler(ch)
def gfecreate_post(locus,
                   sequence,
                   imgt_version,
                   neo4j_url=neo_dict['neo4j_url'],
                   user=neo_dict['user'],
                   password=neo_dict['password']):  # noqa: E501
    """gfecreate_post

    Create the GFE notation for a sequence and return it with its features.

    The annotator for the requested database version is cached in the
    module-level ``seqanns``; the lookup tables loaded by the first
    ``pyGFE`` instance are cached in ``gfe_feats``, ``seq2hla`` and
    ``gfe2hla`` and handed to later instances.

    :param locus: Valid HLA locus
    :param sequence: request body; the sequence string is read from its
        ``'sequence'`` key
    :param imgt_version: db version, with or without dots
    :param neo4j_url: URL passed to ``Graph``
    :param user: graph user name
    :param password: graph password
    :rtype: dict with keys ``'gfe'``, ``'feature'`` and
        ``'annotation_feature'`` on success, ``(Error, 404)`` on failure
    """
    global seqanns
    global gfe_feats
    global gfe2hla
    global seq2hla

    sequence = sequence['sequence']

    log_capture_string = io.StringIO()
    logger = logging.getLogger('')
    logging.basicConfig(datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=logging.INFO)

    # Capture what is logged during this call so it can be returned to
    # the client when something fails.
    ch = logging.StreamHandler(log_capture_string)
    formatter = logging.Formatter('%(asctime)s - %(name)-35s - %(levelname)-5s'
                                  ' - %(funcName)s %(lineno)d: - %(message)s')
    ch.setFormatter(formatter)
    ch.setLevel(logging.INFO)
    logger.addHandler(ch)

    def captured_log():
        # Lines logged so far through the per-call handler.
        return log_capture_string.getvalue().split("\n")

    try:
        # Only the dot-less form is used below. The former
        # `re.match(".", ...)` branch re-inserted dots that were stripped
        # again right away, and because of the unescaped "." it only ever
        # ran (and crashed) on empty input.
        db = "".join(imgt_version.split("."))

        if db in seqanns:
            seqann = seqanns[db]
        else:
            seqann = BioSeqAnn(verbose=True,
                               safemode=True,
                               dbversion=db,
                               verbosity=3)
            seqanns.update({db: seqann})

        try:
            graph = Graph(neo4j_url, user=user, password=password, bolt=False)
        except ServiceUnavailable as err:
            log_data = captured_log()
            log_data.append(str(err))
            return Error("Failed to connect to graph", log=log_data), 404

        if (not isinstance(gfe_feats, DataFrame)
                or not isinstance(seq2hla, DataFrame)):
            # First call: let pyGFE load the tables, then cache them.
            pygfe = pyGFE(graph=graph,
                          seqann=seqann,
                          load_gfe2hla=True,
                          load_seq2hla=True,
                          load_gfe2feat=True,
                          verbose=True)
            gfe_feats = pygfe.gfe_feats
            seq2hla = pygfe.seq2hla
            gfe2hla = pygfe.gfe2hla
        else:
            pygfe = pyGFE(graph=graph,
                          seqann=seqann,
                          gfe2hla=gfe2hla,
                          gfe_feats=gfe_feats,
                          seq2hla=seq2hla,
                          verbose=True)

        try:
            typing = pygfe.gfe_create(locus=locus,
                                      sequence=sequence,
                                      imgtdb_version=db)
        except Exception as e:
            print(e)
            return Error("Type with alignment failed",
                         log=captured_log()), 404

        if isinstance(typing, Error):
            typing.log = captured_log()
            return typing, 404

        if not typing:
            return Error("Type with alignment failed",
                         log=captured_log()), 404

        structure_feats = [
            Feature(accession=f.accession,
                    rank=f.rank,
                    term=f.term,
                    sequence=f.sequence)
            for f in typing['structure']
        ]
        anno_feats = [
            Feature(accession=f.accession,
                    rank=f.rank,
                    term=f.term,
                    sequence=f.sequence)
            for f in typing['annotation'].structure
        ]
        return {
            'gfe': typing['gfe'],
            'feature': structure_feats,
            'annotation_feature': anno_feats
        }
    finally:
        # Without this the root logger gains one more handler (and keeps
        # its StringIO alive) on every request.
        logger.removeHandler(ch)
def type_from_seq(self,
                  locus: str = None,
                  sequence: str = None,
                  imgtdb_version: str = "3.31.0",
                  nseqs: int = 20,
                  alignseqs: int = 10,
                  skip: List = []):
    """
    Type a sequence: report it as documented, or create its GFE notation
    and find the closest known allele.

    The sequence is first looked up with ``self.sequence_lookup``. When it
    is not documented, ``self.gfe_create`` builds the notation, the result
    is classified as ``"novel"`` or ``"novel_combination"`` and
    ``self.find_similar`` supplies the closest allele.

    :param locus: string containing HLA locus; predicted with
        ``get_locus`` when omitted.
    :param sequence: sequence data as ``str``, ``Seq`` or ``SeqRecord``.
    :param imgtdb_version: database version in dotted form.
    :param nseqs: not used in this method body.
    :param alignseqs: not used in this method body.
    :param skip: not used in this method body.
    :return: a ``Typing`` object, or an ``Error`` when no annotation could
        be created.
    :raises Exception: when the locus cannot be determined, or (verbose
        mode only) when the sequence fails ``checkseq``.
    """
    # TODO: Add full gene accession
    # TODO: reformt dbversion if missing .
    ac_object = Typing()
    ac_object.imgtdb_version = "".join(imgtdb_version.split("."))
    ac_object.pygfe_version = pygfe.__version__
    ac_object.seqann_version = seqann.__version__
    ac_object.gfedb_version = '0.0.2'

    # Biopython inputs are reduced to the plain sequence string
    if isinstance(sequence, Seq):
        sequence = str(sequence)
    elif isinstance(sequence, SeqRecord):
        sequence = str(sequence.seq)

    # Lazily create the annotator for this database version
    if ac_object.imgtdb_version not in self.seqann:
        self.seqann.update({
            ac_object.imgtdb_version:
            BioSeqAnn(
                dbversion=ac_object.imgtdb_version,
                #store_features=self.store_features,
                load_features=self.load_features,
                cached_features=self.cached_features)
        })

    # If sequence contains any characters
    # other than ATCG then the GFE notation
    # can not be created
    # NOTE(review): the check is only enforced when self.verbose is set
    # and runs before the sequence is upper-cased below; non-verbose
    # callers proceed unchecked -- confirm whether that is intended.
    valid_seq = checkseq(sequence)
    if self.verbose and not valid_seq:
        self.logger.warning(self.logname + " Sequence alphabet "
                            + "contains non DNA")
        self.logger.warning(self.logname
                            + " No GFE string will be generated")
        raise Exception(
            "Input sequence was not valid! {}".format(sequence))

    # Check it the locus exists
    if not locus:
        if self.verbose:
            self.logger.info(self.logname + " No locus provided! ")

        # Guessing locus with blastn
        locus = get_locus(
            sequence,
            kir=self.kir,
            refdata=self.seqann[ac_object.imgtdb_version].refdata)

        if locus and self.verbose:
            self.logger.info(self.logname + " Locus prediction = " + locus)

        if not locus:
            if self.verbose:
                self.logger.error(self.logname
                                  + " Locus could not be determined!")
            # TODO: Raise exception
            raise Exception(
                "Locus could not be determined! {}".format(sequence))

    sequence = sequence.upper()
    sequence_typing = self.sequence_lookup(locus, sequence,
                                           ac_object.imgtdb_version)
    if sequence_typing:
        ac_object.status = "documented"
        ac_object.hla = sequence_typing[0]
        ac_object.gfe = sequence_typing[1]
        ac_object.closest_gfe = sequence_typing[1]
        ac_object.features = sequence_typing[2]
        if self.verbose:
            self.logger.info(self.logname + locus
                             + " sequence documented for "
                             + imgtdb_version + " | "
                             + ac_object.gfe + " = " + ac_object.hla)
        return ac_object

    # time GFE creation
    time_start = time.time()
    gfe_o = self.gfe_create(locus, sequence, ac_object.imgtdb_version)

    # gfe_create may hand back nothing or an Error instead of a result
    # mapping; a bare `in` test would raise TypeError on those.
    if (not gfe_o or isinstance(gfe_o, Error)
            or 'annotation' not in gfe_o):
        self.logger.error(self.logname + "Failed to create annotation!!")
        error = Error("Failed to create annotation!!",
                      ac_object.pygfe_version,
                      ac_object.gfedb_version,
                      imgtdb_version)
        return error

    if self.verbose:
        # time.time() differences are seconds
        time_taken = int(time.time() - time_start)
        self.logger.info(self.logname + " gfe_create time for " + locus
                         + " " + imgtdb_version + " = "
                         + str(time_taken) + " seconds")

    annotation = gfe_o['annotation']
    ac_object.gfe = gfe_o['gfe']
    ac_object.features = [
        Feature(accession=f.accession,
                rank=f.rank,
                sequence=f.sequence,
                term=f.term) for f in gfe_o['structure']
    ]

    novel_features = self.unique_features(ac_object.features, locus,
                                          ac_object.imgtdb_version)
    if len(novel_features) != 0:
        if self.verbose:
            self.logger.info(self.logname + " # novel features = "
                             + str(len(novel_features)))
        ac_object.novel_features = novel_features
        ac_object.status = "novel"
    else:
        self.logger.info(self.logname + " novel combination")
        ac_object.status = "novel_combination"

    similar_results = self.find_similar(ac_object.gfe, ac_object.features,
                                        imgtdb_version)
    if similar_results:
        ac_object.hla = similar_results[0]
        ac_object.closest_gfe = similar_results[1]
        if self.seqann[ac_object.imgtdb_version].align:
            if self.verbose:
                self.logger.info(self.logname
                                 + " finding sequence differences")
            ac_object.seqdiff = self.diff_seq(similar_results[0],
                                              annotation,
                                              imgtdb_version)
            ac_object.differences = len(ac_object.seqdiff)
    else:
        ac_object.hla = "NA"
        ac_object.closest_gfe = "NA"
        if self.verbose:
            # Logger.warn is a deprecated alias of warning
            self.logger.warning(self.logname + " No allele call made!")
    return ac_object
def gfeAnnotation_post(sequence, locus, gene=None, imgtdb_version="3.31.0"):
    """gfeAnnotation_post

    Annotate a sequence and return its features and GFE notation.  # noqa: E501

    :param sequence: request body; the sequence string is read from its
        ``'sequence'`` key
    :param locus: Valid Locus
    :param gene: ``'KIR'`` (any case) creates a KIR annotator; any other
        value, including ``None``, defaults to HLA
    :param imgtdb_version: database version; a four-character dot-less
        value such as ``"3310"`` is reformatted to ``"3.31.0"``
    :rtype: Typing on success, ``(Error, 404)`` on failure
    """
    global seqanns
    typing = Typing()
    sequence = SeqRecord(seq=Seq(sequence['sequence']))

    # The previous `re.match(".", ...)` test used an unescaped dot, which
    # matches any character, so dot-less versions were never reformatted.
    if "." not in imgtdb_version and len(imgtdb_version) == 4:
        imgtdb_version = ".".join([imgtdb_version[0],
                                   imgtdb_version[1:3],
                                   imgtdb_version[3]])
    db = "".join(imgtdb_version.split("."))

    log_capture_string = io.StringIO()
    logger = logging.getLogger('')
    logging.basicConfig(datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=logging.INFO)

    # Capture what is logged during this call so it can be returned to
    # the client when something fails.
    ch = logging.StreamHandler(log_capture_string)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)-35s - %(levelname)-5s '
        '- %(funcName)s %(lineno)d: - %(message)s')
    ch.setFormatter(formatter)
    ch.setLevel(logging.INFO)
    logger.addHandler(ch)

    def captured_log():
        # Lines logged so far through the per-call handler.
        return log_capture_string.getvalue().split("\n")

    try:
        # TODO: Use `gene` or locus to figure out the gene-family
        # NOTE(review): the cache is keyed by db version only, so a cached
        # annotator is reused regardless of `gene`.
        if db in seqanns:
            seqann = seqanns[db]
        else:
            # Every uncached request gets an annotator; previously some
            # `gene` values left `seqann` unassigned.
            if gene and gene.upper() == 'KIR':
                seqann = BioSeqAnn(verbose=True, safemode=True,
                                   dbversion=db, verbosity=3, kir=True)
            else:
                # Defaults to HLA
                seqann = BioSeqAnn(verbose=True, safemode=True,
                                   dbversion=db, verbosity=3)
            seqanns.update({db: seqann})

        try:
            annotation = seqann.annotate(sequence, locus)
        except Exception as e:
            print(e)
            return Error("An error occurred during the annotation",
                         log=captured_log()), 404

        if not annotation:
            return Error("No annotation could be produced",
                         log=captured_log()), 404

        if not hasattr(annotation, 'structure'):
            return Error("No structure was produced",
                         log=captured_log()), 404

        typing.features = [
            Feature(accession=f.accession,
                    rank=f.rank,
                    term=f.term,
                    sequence=f.sequence)
            for f in annotation.structure
        ]
        typing.gfe = annotation.gfe
        typing.imgtdb_version = imgtdb_version
        return typing
    finally:
        # Without this the root logger gains one more handler (and keeps
        # its StringIO alive) on every request.
        logger.removeHandler(ch)
def annotate_get(sequence, locus=None, imgthla_version="3.31.0"):  # noqa: E501
    """annotate_get

    Annotate a consensus sequence and return its features and GFE notation.  # noqa: E501

    :param sequence: Valid consensus sequence
    :type sequence: str
    :param locus: Valid locus
    :type locus: str
    :param imgthla_version: IMGT/HLA DB Version; a four-character dot-less
        value such as ``"3310"`` is reformatted to ``"3.31.0"``
    :type imgthla_version: str

    :rtype: Typing on success, ``(Error, 404)`` on failure
    """
    global seqanns
    typing = Typing()
    sequence = SeqRecord(seq=Seq(sequence))

    # The previous `re.match(".", ...)` test used an unescaped dot, which
    # matches any character, so dot-less versions were never reformatted.
    if "." not in imgthla_version and len(imgthla_version) == 4:
        imgthla_version = ".".join([imgthla_version[0],
                                    imgthla_version[1:3],
                                    imgthla_version[3]])
    db = "".join(imgthla_version.split("."))

    log_capture_string = io.StringIO()
    logger = logging.getLogger('')
    logging.basicConfig(datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=logging.INFO)

    # Capture what is logged during this call so it can be returned to
    # the client when something fails.
    ch = logging.StreamHandler(log_capture_string)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)-35s - %(levelname)-5s - %(funcName)s %(lineno)d: - %(message)s'
    )
    ch.setFormatter(formatter)
    ch.setLevel(logging.INFO)
    logger.addHandler(ch)

    def captured_log():
        # Lines logged so far through the per-call handler.
        return log_capture_string.getvalue().split("\n")

    try:
        if db in seqanns:
            seqann = seqanns[db]
        else:
            seqann = BioSeqAnn(verbose=True,
                               safemode=True,
                               dbversion=db,
                               verbosity=3)
            seqanns.update({db: seqann})

        try:
            annotation = seqann.annotate(sequence, locus)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer turned into a 404.
            return Error("An error occured during the annotation",
                         log=captured_log()), 404

        if not annotation:
            return Error("No annotation could be produced",
                         log=captured_log()), 404

        if not hasattr(annotation, 'structure'):
            return Error("No structure was produced",
                         log=captured_log()), 404

        typing.features = [
            Feature(accession=f.accession,
                    rank=f.rank,
                    term=f.term,
                    sequence=f.sequence)
            for f in annotation.structure
        ]
        typing.gfe = annotation.gfe
        typing.imgtdb_version = imgthla_version
        return typing
    finally:
        # Without this the root logger gains one more handler (and keeps
        # its StringIO alive) on every request.
        logger.removeHandler(ch)