def seqvar_or_None(entry):
    """Turn a free-text database entry into a Variant, or return None.

    The entry is stripped of surrounding whitespace, en-dashes (U+2013) are
    replaced by '-', and the result is matched against ``re_hgvs``.  The
    'prefix', 'transcript' and 'edit' groups of the match are reassembled as
    ``<prefix>_<transcript>:<edit>`` and passed to ``Variant``.

    :param entry: candidate text, or None
    :return: the result of ``Variant`` on the reassembled text; None when
        entry is None, when the pattern does not match, or when ``Variant``
        raises CriticalHgvsError or TypeError.
    """
    if entry is None:
        return None

    # Basic cleaning; the en-dash was seen in at least one ClinVar db entry.
    cleaned = entry.strip().replace('\u2013', '-')

    found = re_hgvs.match(cleaned)
    if not found:
        return None

    parts = found.groupdict()
    # TODO: LRG
    accession = parts['prefix'] + '_' + parts['transcript']
    hgvs_text = accession + ':' + parts['edit']

    try:
        return Variant(hgvs_text)
    except (CriticalHgvsError, TypeError):
        # empty or broken
        return None
def test_simple_substitution_case(self):
    """Check the query produced for the SUB fixture in ``hgvs_c``.

    The query must start with the SCN5A gene term followed by the expected
    group of 4786T>A spellings, and must also contain two further forms.
    """
    expected_prefix = (
        '"SCN5A" ("4786T>A"|"T4786A"|"4786T-->A"|"4786T/A"|"4786T->A"')
    query = GoogleQuery(LVG(Variant(hgvs_c['SUB'])))

    assert query.startswith(expected_prefix)
    for fragment in ('4732T->A', '4783T-->A'):
        assert query.find(fragment) > -1
def components_or_None(hgvs_p):
    """Build VariantComponents for ``hgvs_p``, or return None if unusable.

    :param hgvs_p: value handed to ``Variant`` before building components
    :return: the VariantComponents object when its ``ref`` is not the empty
        string; None when ``ref`` is empty or when TypeError, RejectedSeqVar
        or CriticalHgvsError is raised along the way.
    """
    try:
        components = VariantComponents(Variant(hgvs_p))
        if components.ref != '':
            return components
    except (TypeError, RejectedSeqVar, CriticalHgvsError):
        # Either hgvs_p did not parse (Variant returned None) or the variant
        # carries incomplete edit info.
        return None
    return None
def hgvs2pmid_cli():
    """Command-line entry point: look up PMIDs for the ``<hgvs>`` argument.

    Arguments are parsed by docopt from the module docstring.  If ``Variant``
    yields a falsy result for the supplied text, a usage hint is printed
    instead of running ``hgvs2pmid``.
    """
    cli_args = docopt(__doc__, version=__version__)
    variant = Variant(cli_args['<hgvs>'])

    if not variant:
        print(
            'Supplied argument must be a valid HGVS string! See --help for examples.'
        )
        return

    hgvs2pmid(str(variant))
def test_crazy_long_hgvs_strings_just_to_see_what_happens(self):
    """Feed several very long protein HGVS strings to ``Variant``.

    Nothing is asserted; the test only fails if ``Variant`` raises.
    """
    long_hgvs_texts = (
        # https://www.ncbi.nlm.nih.gov/clinvar/38771295/ RCV000008537
        'NP_001121636.1:p.Gln197_Gln208delinsGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGlnGln',
        # https://www.ncbi.nlm.nih.gov/clinvar/36484151/
        # https://www.ncbi.nlm.nih.gov/clinvar/variation/68075/
        'LRG_702p1:p.Glu129_Asp145delinsGluIleLysValIleSerGlyIleLeuThrGlnGlyArgCysAspIleGluIleLysValIleSerGlyIleLeuThrGlnGlyArgCysAspIleAsp',
        # RCV000087437
        'LRG_3p1:p.Arg1139_Gly1140insValSerSerThrGluArgTyrTyrArgSerThrCysPheArgCysLeuHisPheArgLysIlePheTrpHisCysAspValMetIleLeuSerLeu',
        # also RCV000087437
        'NP_000081.1:p.Arg1139_Gly1140insValSerSerThrGluArgTyrTyrArgSerThrCysPheArgCysLeuHisPheArgLysIlePheTrpHisCysAspValMetIleLeuSerLeu',
    )
    for hgvs_text in long_hgvs_texts:
        Variant(hgvs_text)
def __init__(self, lex=None, seqvar=None, hgvs_text=None, **kwargs):
    """Set up the query from an LVG, a sequence variant, or an HGVS string.

    Requires either an LVG object (lex=) or a Sequence Variant object
    (seqvar=) or an hgvs_text string (hgvs_text=).

    Priority for instantiation (in case of multiple-parameter submission):
    lex, seqvar, hgvs_text

    Keywords:
        gene_name: should be supplied when instantiated with seqvar= or
            hgvs_text=; with lex= it is only used when the LVG carries no
            gene name of its own.
        cse: choice of Google CSE ("cx") -- "whitelist" or "schema"
            [default: whitelist]

    :raises TypeError: if none of lex, seqvar, hgvs_text is supplied.
    :raises GoogleQueryMissingGeneName: if no gene name could be determined.
    """
    if lex:
        self.lex = lex
        self.seqvar = lex.seqvar
        self.hgvs_text = lex.hgvs_text
        # Prefer the gene name carried by the LVG; fall back to the keyword.
        self.gene_name = lex.gene_name or kwargs.get('gene_name', None)
    elif seqvar:
        self.lex = None
        self.seqvar = seqvar
        self.hgvs_text = '%s' % seqvar
        self.gene_name = kwargs.get('gene_name', None)
    elif hgvs_text:
        self.lex = None
        self.seqvar = Variant(hgvs_text)
        self.hgvs_text = hgvs_text
        self.gene_name = kwargs.get('gene_name', None)
    else:
        # Without this branch no attribute is ever assigned and the gene_name
        # check below fails with an unrelated AttributeError.
        raise TypeError(
            'One of lex=, seqvar=, or hgvs_text= must be supplied.')

    if self.gene_name is None:
        raise GoogleQueryMissingGeneName(
            'Information supplied with variant %s is missing gene name.' %
            self.seqvar)

    # self.synonyms = {'c': [], 'g': [], 'p': [], 'n': []}
    self.gene_synonyms = filter_gene_synonyms(GeneSynonyms(self.gene_name))

    # choice of Google CSE ("cx") -- "whitelist" or "schema" [default: whitelist]
    self.cse = kwargs.get('cse', 'whitelist')
def pubtator_results_for_seqvar(seqvar_or_hgvs_text, gene_id):
    """Look up pubtator results for a single variant.

    The input is passed through ``Variant``; its string form becomes the only
    key of the returned dictionary:

        { hgvs_text: <result of the pubtator_db search> }

    Variants whose ``type`` is 'p' are searched with
    ``pubtator_db.search_proteins``; every other type goes through
    ``pubtator_db.search_m2p``.

    :param seqvar_or_hgvs_text: hgvs_text or SequenceVariant object
    :param gene_id: id of gene associated with variant (required)
    :return: dictionary of results
    :raises: RejectedSeqVar, PubtatorDBError
    """
    variant = Variant(seqvar_or_hgvs_text)
    key = '%s' % variant
    components = VariantComponents(variant)

    if variant.type == 'p':
        search = pubtator_db.search_proteins
    else:
        search = pubtator_db.search_m2p

    return {key: search(components, gene_id)}
def process_row(db, dbrow):
    """Expand one variant-summary row into VariantComponents and store them.

    Steps:

      * parse the ``variant_name`` column with ``seqvar_or_None``;
      * if that yields a c., g. or n. variant, build an LVG from it with
        ``lvg_or_None``;
      * for every c./g./n. variant held by the LVG, build VariantComponents
        and hand them to ``add_components_to_row``;
      * do the same for every entry of the LVG's ``hgvs_p``, with
        parentheses stripped so that "uncertain" protein effects collapse
        onto the "certain" ones.

    The HGVS_c and HGVS_p columns of the row are currently not consulted.

    :param db: passed through unchanged to ``add_components_to_row``.
    :param dbrow: (dict) one row from t2g_variant_summary table.
    :return: list of VariantComponents successfully added to the database
        (or empty list)
    """
    lvg_types = ['c', 'g', 'n']
    added_components = []
    variants = {'c': [], 'g': [], 'n': []}

    for option in ['variant_name']:  # 'HGVS_c' is deliberately left out.
        seqvar = seqvar_or_None(dbrow[option])
        # A variant of any other type (e.g. 'p') cannot seed the LVG below;
        # skip it instead of failing with KeyError on the lookup.
        if seqvar and seqvar.type in variants:
            variants[seqvar.type].append(seqvar)

    # use the first usable variant to construct an LVG for this variant.
    lex = None
    for seqtype in lvg_types:
        if variants[seqtype]:
            seqvar = variants[seqtype][0]
            print('[%s] Using %s to build LVG' % (dbrow['variant_name'], seqvar))
            lex = lvg_or_None(seqvar)
            if lex:
                break

    # collect a unique set of protein variants stripped of parentheses.
    hgvs_ps = set()

    # add non-p-vars to the database
    if lex:
        for item in lex.hgvs_p:
            hgvs_ps.add(item.replace(')', '').replace('(', ''))

        for seqtype in lvg_types:
            for seqvar in lex.variants[seqtype].values():
                comp = components_or_None(seqvar)
                if comp and add_components_to_row(db, dbrow, comp):
                    added_components.append(comp)

    # and now add the proteins (empty when no LVG could be built)
    for hgvs_p in hgvs_ps:
        comp = components_or_None(Variant(hgvs_p))
        if comp and add_components_to_row(db, dbrow, comp):
            added_components.append(comp)

    return added_components
def test_frameshift_case(self):
    """Build a GoogleQuery for the FS fixture in ``hgvs_c``.

    Nothing is asserted; the test only fails if construction raises.
    """
    variant = Variant(hgvs_c['FS'])
    GoogleQuery(LVG(variant))
def test_duplication(self):
    """Build VariantComponents for the DUP fixture in ``hgvs_c``.

    Nothing is asserted; the test only fails if construction raises.
    """
    variant = Variant(hgvs_c['DUP'])
    VariantComponents(variant)
def test_insert(self):
    """Build VariantComponents for the INS fixture in ``hgvs_c``.

    Nothing is asserted; the test only fails if construction raises.
    """
    variant = Variant(hgvs_c['INS'])
    VariantComponents(variant)
def test_frameshift(self):
    """Build VariantComponents for the FS fixture in ``hgvs_p``.

    Nothing is asserted; the test only fails if construction raises.
    """
    variant = Variant(hgvs_p['FS'])
    VariantComponents(variant)
def test_indel(self):
    """Build VariantComponents for the INDEL fixture in ``hgvs_n``.

    Nothing is asserted; the test only fails if construction raises.
    """
    variant = Variant(hgvs_n['INDEL'])
    VariantComponents(variant)
def test_simple_substitution(self):
    """Build VariantComponents for the SUB fixture in ``hgvs_c``.

    Nothing is asserted; the test only fails if construction raises.
    """
    variant = Variant(hgvs_c['SUB'])
    VariantComponents(variant)
def test_deletion(self):
    """Build VariantComponents for the DEL fixture in ``hgvs_g``.

    Nothing is asserted; the test only fails if construction raises.
    """
    variant = Variant(hgvs_g['DEL'])
    VariantComponents(variant)
def test_deletion_case(self):
    """Build a GoogleQuery for the DEL fixture in ``hgvs_c``.

    Nothing is asserted; the test only fails if construction raises.
    """
    variant = Variant(hgvs_c['DEL'])
    GoogleQuery(LVG(variant))
def test_dup_case(self):
    """Build a GoogleQuery for the DUP fixture in ``hgvs_c``.

    Nothing is asserted; the test only fails if construction raises.
    """
    variant = Variant(hgvs_c['DUP'])
    GoogleQuery(LVG(variant))
def test_ins_case(self):
    """Build a GoogleQuery for the INS fixture in ``hgvs_c``.

    Nothing is asserted; the test only fails if construction raises.
    """
    variant = Variant(hgvs_c['INS'])
    GoogleQuery(LVG(variant))
def test_allow_gene_name_in_hgvs_string(self):
    """A parenthesised gene name after the accession is dropped on parsing.

    The string form of the parsed variant must equal the input with the
    "(SCN1A)" part removed.
    """
    expected = 'NM_001165963.1:c.4338_6030del'
    parsed = Variant('NM_001165963.1(SCN1A):c.4338_6030del')
    rendered = '%s' % parsed
    assert expected == rendered
def test_bad_hgvs_string_returns_None(self):
    """``Variant`` must return None for text that is not HGVS."""
    outcome = Variant('boogers')
    assert outcome is None