def main():
    """Fill in missing dbSNP rs identifiers for hg38 variants.

    Every ``variant_feature`` row whose ``dbsnp_id`` is NULL is looked up, by
    chromosome and position, in the tabix-indexed file given with ``-d``.
    When a record at that position carries an ``RS=<digits>;`` tag and lists
    both the variant's ref and alt alleles, the rs number is written to
    ``variant_feature.dbsnp_id``. All updates are committed once at the end.
    """
    # The original description/usage strings were copied from another script
    # and advertised -k/-g options that this parser does not define; the
    # usage line is now generated by argparse from the real arguments.
    parser = argparse.ArgumentParser(
        description='Add dbSNP rs ids to variants lacking one')
    parser.add_argument('-d', '--dbsnp-file', default='', required=True,
                        help='Path to the dbSNP file')
    args = parser.parse_args()

    if not os.path.isfile(args.dbsnp_file):
        log('ERROR', 'Your input file was not found {0}'.format(args.dbsnp_file))
        # explicit stop, in case log() does not terminate the process itself
        return

    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    curs.execute(
        "SELECT a.id, b.pos, b.pos_ref, b.pos_alt, c.ncbi_name FROM variant_feature a, variant b, chromosomes c "
        "WHERE a.id = b.feature_id AND b.chr = c.name AND c.genome_version = 'hg38' "
        "AND b.genome_version = 'hg38' AND a.dbsnp_id is NULL ORDER BY a.id"
    )
    res = curs.fetchall()
    count = curs.rowcount
    log('INFO', 'Found {0} variants to check'.format(count))

    # Open the index once: it does not depend on the variant being checked.
    tb = tabix.open(args.dbsnp_file)
    rs_pattern = re.compile(r'RS=(\d+);')
    i = 0
    for j, var in enumerate(res, start=1):
        if j % 500 == 0:
            log('INFO', '{0}/{1} variant checked'.format(j, count))
        query = "{0}:{1}-{2}".format(var['ncbi_name'], var['pos'], var['pos'])
        for record in tb.querys(query):
            # NOTE(review): columns 3, 4 and 7 are presumably the VCF REF,
            # ALT and INFO fields - confirm against the dbSNP file layout.
            match_object = rs_pattern.search(record[7])
            if not match_object:
                continue
            pos_ref_list = record[3].split(',')
            pos_alt_list = record[4].split(',')
            if var['pos_ref'] in pos_ref_list and var['pos_alt'] in pos_alt_list:
                new_rs_id = match_object.group(1)
                # need to update var entry
                curs.execute(
                    "UPDATE variant_feature SET dbsnp_id = %s WHERE id = %s",
                    (new_rs_id, var['id'])
                )
                log('INFO', 'Adding rsid for variant {0} - ref:{1} - alt:{2} to rs{3}'.format(
                    var['id'], var['pos_ref'], var['pos_alt'], new_rs_id))
                i += 1
    db.commit()
    log('INFO', '{0} rsids added'.format(i))
def main():
    """Replace placeholder NP accessions with the one reported by NCBI.

    For every ``gene`` row whose ``np`` equals the placeholder
    ``'NP_000000.0'``, the NM record is fetched from NCBI efetch and the
    first ``accession "NP_...", version N`` pair found in the response is
    stored as ``NP_xxx.N``. Updates are committed once at the end.
    """
    parser = argparse.ArgumentParser(description='Defines NP RefSeq acc_no when lacking',
                                     usage='python update_np_acc_no.py -k ncbi_api_key')
    parser.add_argument('-k', '--ncbi-api-key', default=None, required=True,
                        help='NCBI Entrez API key.')
    args = parser.parse_args()
    # get file
    if args.ncbi_api_key is None or not re.search(r'\w+', args.ncbi_api_key):
        log('ERROR', 'Invalid NCBI API key, please check')
        # explicit stop, in case log() does not terminate the process itself
        return
    ncbi_api_key = args.ncbi_api_key

    # get db connector and cursor
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    # One pattern for both the test and the capture. The original pre-check
    # used `version\s\d$`, which silently skipped versions of 10 or more.
    np_pattern = re.compile(r'accession\s"(NP_\d+)",\s+version\s(\d+)$', re.MULTILINE)

    i = 0
    # get list of genes still carrying the placeholder NP accession
    curs.execute(
        "SELECT name, np FROM gene WHERE np = 'NP_000000.0' ORDER by name"
    )
    res = curs.fetchall()
    for acc in res:
        # ncbi
        ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={0}&api_key={1}'.format(
            acc['name'][1], ncbi_api_key)
        eutils_response = http.request('GET', ncbi_url).data.decode('utf-8')
        match_object = np_pattern.search(eutils_response)
        if match_object:
            np_acc = '{0}.{1}'.format(match_object.group(1), match_object.group(2))
            # bound parameters instead of string-built SQL
            curs.execute(
                "UPDATE gene SET np = %s WHERE name[2] = %s",
                (np_acc, acc['name'][1])
            )
            log('INFO', 'Updated gene NP acc no of {0} to {1}'.format(acc['name'][0], np_acc))
            i += 1
    log('INFO', '{} genes updated'.format(i))
    db.commit()
def _protein_name_from_vv(vv_entry):
    """Return the p. name found in a VariantValidator entry, or None (a WARNING is logged)."""
    if 'hgvs_predicted_protein_consequence' not in vv_entry:
        log('WARNING', 'No hgvs_predicted_protein_consequence in VV results')
        return None
    consequence = vv_entry['hgvs_predicted_protein_consequence']
    if 'tlr' not in consequence:
        log('WARNING', 'No tlr in VV results')
        return None
    match_object = re.search(r'NP_\d+\.\d.*:p\.\(?(.+)\)?', consequence['tlr'])
    if not match_object:
        log('WARNING', 'No p_name in VV results')
        return None
    p_name = match_object.group(1)
    if p_name.endswith(')'):
        # the greedy (.+) swallows the closing parenthesis: remove last ')'
        p_name = p_name[:-1]
    return p_name


def _ivs_name_for_range(new_c_name, start_segment_number, end_segment_number):
    """Build the IVS name of a c. name spanning two positions, or None if no pattern fits."""
    # both ends intronic, e.g. 100+1_200-2del
    ivs_obj = re.search(r'^\d+([\+-]\d+)_\d+([\+-]\d+)(.+)$', new_c_name)
    if ivs_obj:
        return 'IVS{0}{1}_IVS{2}{3}{4}'.format(
            start_segment_number, ivs_obj.group(1),
            end_segment_number, ivs_obj.group(2), ivs_obj.group(3))
    # only the first position carries an intronic offset
    ivs_obj = re.search(r'^\d+([\+-]\d+)_(\d+)([^\+-].+)$', new_c_name)
    if ivs_obj:
        return 'IVS{0}{1}_{2}{3}'.format(
            start_segment_number, ivs_obj.group(1), ivs_obj.group(2), ivs_obj.group(3))
    # only the second position carries an intronic offset
    ivs_obj = re.search(r'^(\d+)_\d+([\+-]\d+)(.+)$', new_c_name)
    if ivs_obj:
        return '{0}_IVS{1}{2}{3}'.format(
            ivs_obj.group(1), end_segment_number, ivs_obj.group(2), ivs_obj.group(3))
    return None


def main():
    """Re-annotate the variants of a gene on its current canonical transcript.

    The gene's variants still attached to another NM are sent to
    VariantValidator (GRCh38). When the c. name on the canonical transcript
    is unchanged only ``gene_name[2]`` and ``creation_date`` are updated;
    otherwise c_name, p_name, segment types/numbers and, for intronic
    variants, ivs_name are recomputed and stored. Variants for which a
    mandatory value cannot be determined are skipped with a WARNING.
    Everything is committed once at the end.
    """
    # script meant to update variants when the canonical form of a gene is changed
    parser = argparse.ArgumentParser(
        description='Define a canonical transcript per gene when several are defined',
        usage='python update_canonical_when_several.py -k md_api_key -g gene_hgnc')
    parser.add_argument(
        '-k', '--api-key', default='', required=True,
        help='Your API key visible on your profile page on the website.')
    parser.add_argument('-g', '--gene-name', default='', required=True,
                        help='The gene you want to update the variants from.')
    args = parser.parse_args()
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)

    # Each log('ERROR', ...) below is followed by an explicit return so the
    # script stops even if log() does not terminate the process itself; the
    # original would otherwise hit unbound names or subscript None.
    if len(args.api_key) != 43:
        log('ERROR', 'Invalid API key, please check it')
        return
    api_key = args.api_key
    # user
    curs.execute("SELECT username FROM mobiuser WHERE api_key = %s", (api_key, ))
    res_user = curs.fetchone()
    if res_user is None:
        log('ERROR', 'Unknown API key')
        return
    username = res_user['username']
    log('INFO', 'User: {}'.format(username))

    match_obj = re.search(r'^([\w-]+)$', args.gene_name)
    if not match_obj:
        log('ERROR', 'Invalid gene name, please check it')
        return
    gene_name = match_obj.group(1)

    # date
    creation_date = datetime.datetime.now().strftime('%Y-%m-%d')

    # check if gene exists and get new canonical isoform
    curs.execute(
        "SELECT DISTINCT(name[2]) as nm, nm_version FROM gene WHERE name[1] = %s AND canonical = 't'",
        (gene_name, ))
    res = curs.fetchone()
    if res is None:
        log('ERROR', 'The gene {} is not present in MobiDetails, please check it'.format(gene_name))
        return
    nm = res['nm']
    nm_full = '{0}.{1}'.format(res['nm'], res['nm_version'])

    # get all variants
    curs.execute(
        "SELECT a.chr, a.pos, a.pos_ref, a.pos_alt, a.g_name, b.c_name, b.id FROM variant a, variant_feature b "
        "WHERE a.feature_id = b.id AND b.gene_name[1] = %s AND b.gene_name[2] != %s "
        "AND a.genome_version = 'hg38' ORDER BY a.pos",
        (gene_name, nm))
    res = curs.fetchall()
    # fetchall() returns an empty list, never None
    if not res:
        log('ERROR', 'No variant to update')
        return

    segment_query = (
        "SELECT number, type FROM segment WHERE genome_version = 'hg38' "
        "AND gene_name[1] = %s AND gene_name[2] = %s AND %s "
        "BETWEEN SYMMETRIC segment_start AND segment_end"
    )

    def segment_at(position):
        # segment (number, type) of the canonical transcript containing `position`, or None
        curs.execute(segment_query, (gene_name, nm, position))
        return curs.fetchone()

    # loop invariants: one connection pool, one compiled pattern
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    vv_url_base = "https://rest.variantvalidator.org"
    # vv_url_base = "http://0.0.0.0:8000/"
    # nm_full contains a '.', which must not act as a regex wildcard
    c_name_pattern = re.compile(r'{}:c\.(.+)$'.format(re.escape(nm_full)))
    # The original read `res_strand` without ever assigning it (NameError as
    # soon as a variant spans two segments). It is now fetched lazily, once.
    res_strand = None

    for var in res:
        vv_url = "{0}/VariantValidator/variantvalidator/GRCh38/{1}-{2}-{3}-{4}/all?content-type=application/json".format(
            vv_url_base, var['chr'], var['pos'], var['pos_ref'], var['pos_alt'])
        log('DEBUG', 'Calling VariantValidator API: {}'.format(vv_url))
        try:
            vv_data = json.loads(http.request('GET', vv_url).data.decode('utf-8'))
        except Exception:
            log('WARNING', 'No VV result for {0}:{1}'.format(nm_full, var['c_name']))
            continue
        for first_level_key in vv_data:
            match_obj = c_name_pattern.search(first_level_key)
            if not match_obj:
                continue
            new_c_name = match_obj.group(1)
            log('DEBUG', 'Old c_name: {0} - New c_name: {1}'.format(var['c_name'], new_c_name))
            if new_c_name == var['c_name']:
                curs.execute(
                    "UPDATE variant_feature SET gene_name[2] = %s, creation_date = %s WHERE id = %s",
                    (nm, creation_date, var['id']))
                log('INFO', 'Variant {} remains unchanged'.format(var['c_name']))
                continue

            # likely to change are p_name, ivs_name, prot_type, start_segment_type,
            # start_segment_number, end_segment_type, end_segment_number
            # also need to update creation_date, creation_user
            p_name = _protein_name_from_vv(vv_data[first_level_key])
            start_segment_type = start_segment_number = None
            end_segment_type = end_segment_number = ivs_name = None

            # get segments type and number
            positions = compute_start_end_pos(var['g_name'])
            if positions[0] != positions[1]:
                curs.execute(
                    "SELECT number, type FROM segment WHERE genome_version = 'hg38' AND "
                    "gene_name[1] = %s AND gene_name[2] = %s AND %s BETWEEN SYMMETRIC segment_start "
                    "AND segment_end AND %s BETWEEN SYMMETRIC segment_start AND segment_end",
                    (gene_name, nm, positions[0], positions[1]))
                res_seg = curs.fetchone()
                if res_seg is not None:
                    # start - end in same segment
                    start_segment_type = end_segment_type = res_seg['type']
                    start_segment_number = end_segment_number = res_seg['number']
                else:
                    res_seg1 = segment_at(positions[0])
                    res_seg2 = segment_at(positions[1])
                    if res_strand is None:
                        # NOTE(review): assumes the gene table exposes a
                        # `strand` column holding '+' for the forward strand
                        # - confirm against the schema.
                        curs.execute(
                            "SELECT strand FROM gene WHERE name[1] = %s AND name[2] = %s",
                            (gene_name, nm))
                        res_strand = curs.fetchone()
                    if res_seg1 is None or res_seg2 is None or res_strand is None:
                        # segment values stay None: the variant is skipped below
                        log('WARNING', 'Unable to locate the segments of {0}'.format(var['g_name']))
                    elif res_strand['strand'] == '+':
                        start_segment_type = res_seg1['type']
                        start_segment_number = res_seg1['number']
                        end_segment_type = res_seg2['type']
                        end_segment_number = res_seg2['number']
                    else:
                        start_segment_type = res_seg2['type']
                        start_segment_number = res_seg2['number']
                        end_segment_type = res_seg1['type']
                        end_segment_number = res_seg1['number']
                # get IVS name
                if start_segment_type == 'intron':
                    ivs_name = _ivs_name_for_range(
                        new_c_name, start_segment_number, end_segment_number)
            else:
                # substitutions
                res_seg = segment_at(positions[0])
                if res_seg is not None:
                    start_segment_type = end_segment_type = res_seg['type']
                    start_segment_number = end_segment_number = res_seg['number']
                if start_segment_type == 'intron':
                    ivs_obj = re.search(r'^[\*-]?\d+([\+-]\d+)(.+)$', new_c_name)
                    if ivs_obj:
                        ivs_name = 'IVS{0}{1}{2}'.format(
                            start_segment_number, ivs_obj.group(1), ivs_obj.group(2))

            if p_name is None or \
                    start_segment_type is None or \
                    start_segment_number is None or \
                    end_segment_type is None or \
                    end_segment_number is None:
                log('WARNING', 'A mandatory new parameter is lacking')
                continue
            if ivs_name is None:
                curs.execute(
                    "UPDATE variant_feature SET gene_name[2] = %s, c_name = %s, p_name = %s, start_segment_type = %s, "
                    "start_segment_number = %s, end_segment_type = %s, end_segment_number = %s, "
                    "creation_date = %s WHERE id = %s",
                    (nm, new_c_name, p_name, start_segment_type,
                     start_segment_number, end_segment_type,
                     end_segment_number, creation_date, var['id']))
            else:
                curs.execute(
                    "UPDATE variant_feature SET gene_name[2] = %s, c_name = %s, p_name = %s, ivs_name = %s, "
                    "start_segment_type = %s, start_segment_number = %s, end_segment_type = %s, "
                    "end_segment_number = %s, creation_date = %s WHERE id = %s",
                    (nm, new_c_name, p_name, ivs_name, start_segment_type,
                     start_segment_number, end_segment_type,
                     end_segment_number, creation_date, var['id']))
            log('INFO', 'Variant {0} updated to {1}'.format(var['c_name'], new_c_name))
    db.commit()
def main():
    """Flag one canonical transcript per gene, using up to four methods.

    1. A gene with a single isoform gets that isoform as canonical.
    2. For genes still lacking a canonical isoform, the NM listed in the
       tab-separated refGene file (``-r``; column 2 = NM, column 3 = gene)
       is used when it exists in the ``gene`` table.
    3. With an NCBI API key (``-k``), the first isoform of each remaining
       gene is checked at NCBI for the ``"RefSeq Select"`` tag; placeholder
       NP accessions (``NP_000000.0``) are fixed on the way.
    4. With ``-u`` (and an API key), genes without variants and with several
       isoforms are re-pointed to the isoform tagged
       ``"RefSeq Select criteria"`` at NCBI.
    """
    parser = argparse.ArgumentParser(
        description='Define a canonical transcript per gene',
        usage='python define_canonical.py [-r path/to/refGeneCanonical_2019_09_23.txt]')
    parser.add_argument('-r', '--refgene', default='', required=True,
                        help='Path to the file containing the canonical refSeq IDs per gene (from UCSC)')
    parser.add_argument('-k', '--ncbi-api-key', default=None, required=False,
                        help='NCBI Entrez API key. If not provided, 3rd method is not executed')
    parser.add_argument('-u', '--update-refgene', default=None, required=False,
                        help='Update RefGene (canonical) for genes w/ on variants based on NCBI (requires NCBI API key)',
                        action='store_true')
    args = parser.parse_args()
    # get file
    if os.path.isfile(args.refgene):
        refgeneFile = args.refgene
    else:
        sys.exit('ERROR: Invalid input path, please check your command')
    ncbi_api_key = None
    if args.ncbi_api_key is not None:
        if not re.search(r'\w+', args.ncbi_api_key):
            sys.exit('ERROR: Invalid NCBI API key, please check')
        else:
            ncbi_api_key = args.ncbi_api_key
    # get db connector and cursor
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    i = 0

    # first when only one isoform => canonical
    log('INFO', "1st Query: genes w/ only one isoform - is automatically canonical")
    curs.execute(
        "SELECT name FROM gene WHERE canonical = 'f' AND name[1] IN "
        "(SELECT name[1] FROM gene GROUP BY name[1] HAVING COUNT (name[1]) = 1)"
    )
    res = curs.fetchall()
    for acc in res:
        curs.execute(
            "UPDATE gene SET canonical = 't' WHERE name[2] = %s",
            (acc['name'][1],)
        )
        log('INFO', 'Updated gene {} (1st method)'.format(acc['name'][0]))
        i += 1
    db.commit()

    # second check the refgene file
    log('INFO', "2nd Query: get info from local refGene file")
    # context manager: the original never closed the file handle
    with open(refgeneFile) as refgene_handle:
        for geneLine in refgene_handle:
            # ENST - NM - gene
            geneLineList = geneLine.rstrip().split("\t")
            if len(geneLineList) < 3:
                # blank or truncated line: nothing to look up
                continue
            if geneLineList[2] in ('n/a', 'hg38.refGene.name2'):
                continue
            curs.execute(  # gene exists in MD
                "SELECT DISTINCT(name[1]) FROM gene WHERE name[1] = %s ORDER BY name",
                (geneLineList[2],)
            )
            if curs.fetchone() is None:
                continue
            curs.execute(  # a canonical isoform is already set?
                "SELECT canonical FROM gene WHERE name[1] = %s AND canonical = 't'",
                (geneLineList[2],)
            )
            if curs.fetchone() is not None:
                continue
            # nm exists in md?
            curs.execute(
                "SELECT name FROM gene WHERE name[2] = %s",
                (geneLineList[1],)
            )
            mdnm = curs.fetchone()
            if mdnm is not None:
                # ok => canonical
                i += 1
                postGene = '{"' + mdnm['name'][0] + '","' + mdnm['name'][1] + '"}'
                curs.execute(
                    "UPDATE gene SET canonical = 't' WHERE name = %s",
                    (postGene,)
                )
                log('INFO', 'Updated gene {} (2nd method)'.format(mdnm['name'][0]))
    db.commit()

    log('INFO', "3rd Query: get info from NCBI for genes with no canonical defined remaining")
    efetch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={0}&api_key={1}'
    # One pattern for test and capture. The original pre-check `version\s\d$`
    # silently skipped NP versions of 10 or more.
    np_pattern = re.compile(r'accession\s"(NP_\d+)",\s+version\s(\d+)$', re.MULTILINE)
    # NOTE(review): the original compared `canonical` to 'f' in the 3rd method
    # and used `is False` in the 4th; whichever representation the driver
    # returns, one of those branches could never run. Both are accepted here
    # - confirm the column type.
    not_canonical = (False, 'f')
    http = None
    # 3rd get info at NCBI
    # API key mandatory
    if ncbi_api_key is not None:
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
        # canonical DESC: the canonical isoform, when any, comes first for each gene
        curs.execute(
            "SELECT name, np, canonical FROM gene ORDER BY name[1], canonical DESC"
        )
        res = curs.fetchall()
        previous_gene = None
        for acc in res:
            # only the first isoform of each gene is considered
            if acc['name'][0] == previous_gene:
                continue
            previous_gene = acc['name'][0]
            if acc['canonical'] not in not_canonical:
                continue
            # ncbi
            ncbi_url = efetch_url.format(acc['name'][1], ncbi_api_key)
            eutils_response = http.request('GET', ncbi_url).data.decode('utf-8')
            if re.search(r'"RefSeq\sSelect"', eutils_response):
                curs.execute(
                    "UPDATE gene SET canonical = 't' WHERE name[2] = %s",
                    (acc['name'][1],)
                )
                i += 1
                log('INFO', 'Updated gene {} (3rd method)'.format(acc['name'][0]))
            if acc['np'] == 'NP_000000.0':
                match_object = np_pattern.search(eutils_response)
                if match_object:
                    np_acc = '{0}.{1}'.format(match_object.group(1), match_object.group(2))
                    # bound parameters instead of string-built SQL
                    curs.execute(
                        "UPDATE gene SET np = %s WHERE name[2] = %s",
                        (np_acc, acc['name'][1])
                    )
                    log('INFO', 'Updated gene NP acc no of {0} to {1}'.format(acc['name'][0], np_acc))

    if args.update_refgene:
        if http is None:
            # the original dereferenced an unbound `http` here
            log('WARNING', 'RefGene update skipped: it requires an NCBI API key')
        else:
            log('INFO', "Update refGene")
            # get genes w/ no variants, and at least 2 isoforms to check which one should be canonical
            curs.execute(
                "SELECT name, canonical FROM gene WHERE (name[1] NOT IN "
                "(SELECT gene_name[1] FROM variant_feature)) AND "
                "(name[1] IN (SELECT name[1] FROM gene GROUP BY name[1] "
                "HAVING COUNT (name[1]) > 1)) ORDER BY name"
            )
            res = curs.fetchall()
            for acc in res:
                ncbi_url = efetch_url.format(acc['name'][1], ncbi_api_key)
                eutils_response = http.request('GET', ncbi_url).data.decode('utf-8')
                if re.search(r'"RefSeq\sSelect\scriteria"', eutils_response) and \
                        acc['canonical'] in not_canonical:
                    # reset every isoform of the gene, then flag the selected one
                    curs.execute(
                        "UPDATE gene SET canonical = 'f' WHERE name[1] = %s",
                        (acc['name'][0],)
                    )
                    curs.execute(
                        "UPDATE gene SET canonical = 't' WHERE name[2] = %s",
                        (acc['name'][1],)
                    )
                    i += 1
                    log('INFO', 'Updated gene {} (4th method)'.format(acc['name'][0]))
    log('INFO', '{} genes modified'.format(i))
    db.commit()
def main():
    """Synchronise local gene annotations with a remote server's ``/api/gene``.

    Always: genes without a canonical isoform adopt the isoform the remote
    server flags as canonical. Optionally:

    * ``-uca``: for genes without variants and with several isoforms,
      re-point the canonical flag to the remote canonical isoform;
    * ``-np``: replace placeholder NP accessions (``NP_000000.0``);
    * ``-un`` / ``-uu`` / ``-uc``: align the NM version, UNIPROT id and
      variant creation tag.

    The remote answer is read as a JSON object keyed by ``NM_xxx.v`` whose
    values are per-transcript objects. Each section commits its own updates.
    """
    parser = argparse.ArgumentParser(
        description='Define a canonical transcript per gene and optionally updates various fields',
        usage='python update_canonical_from_remote.py [-r remote_server_url]')
    parser.add_argument('-r', '--remote-server', default='', required=True,
                        help='base URL of the remote server')
    parser.add_argument(
        '-uca', '--update-can-all', default='', required=False,
        help='Optionally update canonical for all genes w/ no variants',
        action='store_true')
    parser.add_argument('-np', '--update-np', default='', required=False,
                        help='Optionally update NP for genes', action='store_true')
    parser.add_argument('-uu', '--update-uniprot', default='', required=False,
                        help='Optionally update UNIPROT IDs', action='store_true')
    parser.add_argument('-uc', '--update-creation', default='', required=False,
                        help='Optionally update variant_creation tag', action='store_true')
    parser.add_argument(
        '-un', '--update-nm', default='', required=False,
        help='Optionally update RefSeq Nm accession number tag',
        action='store_true')
    args = parser.parse_args()
    remote_addr = args.remote_server
    print()
    log('INFO', 'Working with server {}'.format(remote_addr))
    # headers
    header = {
        'Accept': 'application/json',
        'User-Agent': 'python-requests Python/{}.{}.{}'.format(
            sys.version_info[0], sys.version_info[1], sys.version_info[2]),
    }
    # get db connector and cursor
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())

    def fetch_gene(hgnc):
        # return (request URL, decoded JSON answer) for one gene symbol
        req_url = '{0}/api/gene/{1}'.format(remote_addr, hgnc)
        answer = json.loads(
            http.request('GET', req_url, headers=header).data.decode('utf-8'))
        return req_url, answer

    def transcript_entries(api_response):
        # (key, value) pairs whose value is a per-transcript object.
        # The original tested `isinstance(keys, dict)` on the JSON *key*,
        # which is always a string, so none of the canonical / NP updates
        # could ever run; the value is what must be a dict.
        if not isinstance(api_response, dict):
            return []
        return [(key, value) for key, value in api_response.items()
                if isinstance(value, dict)]

    # get local list of genes with no canonical isoform defined
    curs.execute(
        "SELECT DISTINCT(name[1]) as hgnc FROM gene WHERE name[1] NOT IN "
        "(SELECT name[1] FROM gene WHERE canonical='t') ORDER BY name[1]"
    )
    no_can = curs.fetchall()
    i = 0
    for gene in no_can:
        req_url, api_response = fetch_gene(gene['hgnc'])
        log('DEBUG', 'req_url:{0}'.format(req_url))
        for keys, entry in transcript_entries(api_response):
            if entry.get('canonical') is not True:
                continue
            match_obj = re.search(r'(NM_\d+)\.\d+', keys)
            if match_obj:
                nm_acc = match_obj.group(1)
                curs.execute(
                    "UPDATE gene set canonical = 't' WHERE name[2] = %s",
                    (nm_acc, ))
                log('INFO', 'Updating {}'.format(nm_acc))
                i += 1
    db.commit()
    log('INFO', '{} genes modified (canonical)'.format(i))

    i = 0
    if args.update_can_all:
        # get genes with no variants and at least 2 isoforms to see if we need to update canonical
        curs.execute(
            "SELECT name, canonical FROM gene WHERE (name[1] NOT IN (SELECT gene_name[1] FROM variant_feature)) "
            "AND (name[1] IN (SELECT name[1] FROM gene GROUP BY name[1] HAVING COUNT (name[1]) > 1)) ORDER BY name"
        )
        res = curs.fetchall()
        for acc in res:
            # NOTE(review): the original tested `acc['canonical'] == 0`; 'f'
            # is accepted too in case the column is not returned as a bool.
            if acc['canonical'] not in (False, 'f'):
                continue
            _, api_response = fetch_gene(acc['name'][0])
            for keys, entry in transcript_entries(api_response):
                if entry.get('canonical') is not True:
                    continue
                match_obj = re.search(r'(NM_\d+)\.\d+', keys)
                # double check: the remote canonical must be this very isoform
                if match_obj and match_obj.group(1) == acc['name'][1]:
                    curs.execute(
                        "UPDATE gene SET canonical = 'f' WHERE name[1] = %s",
                        (acc['name'][0], ))
                    curs.execute(
                        "UPDATE gene SET canonical = 't' WHERE name[2] = %s",
                        (acc['name'][1], ))
                    i += 1
                    log('INFO', 'Updated gene {}'.format(acc['name'][0]))
        db.commit()
        log('INFO', '{} genes modified (canonical all)'.format(i))

    if args.update_np:
        curs.execute(
            "SELECT DISTINCT(name[1]) as hgnc FROM gene WHERE np = 'NP_000000.0'"
        )
        no_np = curs.fetchall()
        j = 0
        for gene in no_np:
            _, api_response = fetch_gene(gene['hgnc'])
            for keys, entry in transcript_entries(api_response):
                np_acc = entry.get('RefProtein')
                if not isinstance(np_acc, str) or np_acc == 'NP_000000.0':
                    continue
                if not re.search(r'NP_\d+\.\d+', np_acc):
                    continue
                match_obj = re.search(r'(NM_\d+)\.\d+', keys)
                if not match_obj:
                    # the original dereferenced the match unconditionally
                    continue
                nm_acc = match_obj.group(1)
                curs.execute(
                    "UPDATE gene set np = %s WHERE name[2] = %s",
                    (np_acc, nm_acc))
                log('INFO', 'Updating gene NP acc no of {0} to {1}'.format(nm_acc, np_acc))
                j += 1
        db.commit()
        log('INFO', '{} NP acc no modified'.format(j))

    if args.update_uniprot or args.update_creation or args.update_nm:
        curs.execute(
            "SELECT name[1] as HGNC, name[2] as nm, nm_version, np, uniprot_id, variant_creation FROM gene ORDER BY name"
        )
        res = curs.fetchall()
        uniprot_count = creation_count = nm_count = 0
        total = curs.rowcount
        for checked, gene in enumerate(res, start=1):
            _, api_response = fetch_gene(gene['hgnc'])
            if checked % 1000 == 0:
                log('INFO', '{0}/{1} isoforms checked'.format(checked, total))
            for keys, entry in transcript_entries(api_response):
                match_obj = re.search(r'^(NM_\d+)\.(\d+)$', keys)
                if not match_obj:
                    continue
                nm_acc = match_obj.group(1)
                # check again: only the isoform of the current row is updated
                if nm_acc != gene['nm']:
                    continue
                if args.update_nm:
                    nm_version = match_obj.group(2)
                    # a differing version is applied even if it is a downgrade
                    if int(nm_version) != int(gene['nm_version']):
                        curs.execute(
                            "UPDATE gene set nm_version = %s WHERE name[2] = %s",
                            (nm_version, nm_acc))
                        log('INFO', 'Updating gene RefSeq NM accession version of {0} from {1} to {2}'.format(
                            nm_acc, gene['nm_version'], nm_version))
                        nm_count += 1
                if 'UNIPROT' in entry and args.update_uniprot:
                    uniprot = entry['UNIPROT']
                    if uniprot != gene['uniprot_id']:
                        curs.execute(
                            "UPDATE gene set uniprot_id = %s WHERE name[2] = %s",
                            (uniprot, nm_acc))
                        log('INFO', 'Updating gene UNIPROT id of {0} to {1}'.format(nm_acc, uniprot))
                        uniprot_count += 1
                if 'variantCreationTag' in entry and args.update_creation:
                    tag = entry['variantCreationTag']
                    if tag != gene['variant_creation']:
                        curs.execute(
                            "UPDATE gene set variant_creation = %s WHERE name[2] = %s",
                            (tag, nm_acc))
                        log('INFO', 'Updating gene variantCreationTag of {0} to {1}'.format(nm_acc, tag))
                        creation_count += 1
        db.commit()
        log('INFO', '{} UNIPROT IDs modified'.format(uniprot_count))
        log('INFO', '{} variantCreationTag modified'.format(creation_count))
        log('INFO', '{} RefSeq NM accession version modified'.format(nm_count))
def main():
    """Check, gene by gene, that a test variant can be created on a server.

    For every canonical isoform the dummy variant ``NM.v:c.1A>T`` is POSTed
    to ``<remote>/api/variant/create``. Depending on the error returned, the
    gene's ``variant_creation`` field is set to ``hg38_mapping_default``,
    ``hg19_mapping_default`` or ``mapping_default``. When the server reports
    a more recent NM version, ``nm_version`` is updated and the creation is
    retried once. Previously created ``1A>T`` test variants are deleted
    first. The script refuses to run against a URL matching
    ``mobidetails.iurc``.
    """
    parser = argparse.ArgumentParser(
        description='Checks that genes accept variant creation',
        usage='python check_variant_creation.py [-r remote_server_url]')
    parser.add_argument('-r', '--remote-server', default='', required=True,
                        help='base URL of the remote server')
    parser.add_argument(
        '-k', '--api-key', default='', required=True,
        help='Your API key visible on your profile page on the website.')
    args = parser.parse_args()
    remote_addr = args.remote_server
    # The two log('ERROR', ...) calls below are followed by an explicit
    # return so the script stops even if log() does not terminate the process
    # itself (the original would go on, then fail on the unbound `api_key`).
    if re.search(r'mobidetails\.iurc', remote_addr):
        log('ERROR', 'This script is not intended to work with the production server')
        return
    if len(args.api_key) != 43:
        log('ERROR', 'Invalid API key, please check it')
        return
    api_key = args.api_key
    print()
    log('INFO', 'Working with server {}'.format(remote_addr))

    # get db connector and cursor
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)

    # remove the test variants left by a previous run
    curs.execute("DELETE FROM variant_feature WHERE c_name = '1A>T'")
    db.commit()

    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    curs.execute(
        "SELECT DISTINCT(name), nm_version, variant_creation FROM gene WHERE canonical = 't' ORDER BY name"
    )
    can = curs.fetchall()
    num_can = curs.rowcount
    i = 0  # genes checked
    j = 0  # genes for which the server returned a mobidetails_error
    k = 0  # genes for which the API call itself failed
    failed_genes = []
    md_url = '{0}/api/variant/create'.format(remote_addr)
    set_creation_state = "UPDATE gene SET variant_creation = %s WHERE name[2] = %s"

    for gene in can:
        print('.', end="", flush=True)
        i += 1
        if i % 500 == 0:
            log('INFO', '{0}/{1} genes checked'.format(i, num_can))
        nm_acc = gene['name'][1]
        variant_chgvs = '{0}.{1}:c.1A>T'.format(nm_acc, gene['nm_version'])
        data = {
            'variant_chgvs': urllib.parse.quote(variant_chgvs),
            'caller': 'cli',
            'api_key': api_key
        }
        # reinitialise gene state before query
        curs.execute(set_creation_state, ('ok', nm_acc))
        db.commit()
        try:
            md_response = json.loads(
                http.request('POST', md_url,
                             headers=md_utilities.api_agent,
                             fields=data).data.decode('utf-8'))
            if 'mobidetails_error' in md_response:
                j += 1
                log('WARNING', 'variant creation failed for gene {0} with error {1}'.format(
                    gene['name'], md_response['mobidetails_error']))
                new_nm_match_obj = re.search(
                    r'A more recent version of the selected reference sequence NM_\d+\.\d+ is available \((NM_\d+)\.(\d+)\)',
                    md_response['mobidetails_error'])
                if new_nm_match_obj and new_nm_match_obj.group(1) == nm_acc:
                    new_ver = new_nm_match_obj.group(2)
                    curs.execute(
                        "UPDATE gene SET nm_version = %s WHERE name[2] = %s",
                        (new_ver, nm_acc))
                    # recheck with the new version; quoted like the first
                    # request (the original sent this one unquoted)
                    data['variant_chgvs'] = urllib.parse.quote(
                        '{0}.{1}:c.1A>T'.format(nm_acc, new_ver))
                    try:
                        md_response_2 = json.loads(
                            http.request('POST', md_url,
                                         headers=md_utilities.api_agent,
                                         fields=data).data.decode('utf-8'))
                        if 'mobidetails_id' in md_response_2 and \
                                gene['variant_creation'] != 'ok':
                            curs.execute(set_creation_state, ('ok', nm_acc))
                        continue
                    except Exception:
                        k += 1
                        failed_genes.append('{}'.format(gene['name'][0]))
                        continue
                if re.search(r'cannot be mapped directly to genome build GRCh38',
                             md_response['mobidetails_error']):
                    curs.execute(set_creation_state, ('hg38_mapping_default', nm_acc))
                    log('INFO', 'MD gene table updated with variant_creation = hg38_mapping_default')
                elif re.search(r'does not seem to map correctly to hg19',
                               md_response['mobidetails_error']):
                    curs.execute(set_creation_state, ('hg19_mapping_default', nm_acc))
                    log('INFO', 'MD gene table updated with variant_creation = hg19_mapping_default')
                elif re.search(r'with the variant position and intron',
                               md_response['mobidetails_error']):
                    curs.execute(set_creation_state, ('mapping_default', nm_acc))
                    log('INFO', 'MD gene table updated with variant_creation = mapping_default')
            elif 'mobidetails_id' in md_response and \
                    gene['variant_creation'] != 'ok':
                curs.execute(set_creation_state, ('ok', nm_acc))
            db.commit()
        except Exception:
            log('ERROR', 'failed MD API call {}'.format(md_url))
            k += 1
            failed_genes.append('{}'.format(gene['name'][0]))
            continue
    # The `continue` paths above skip the per-gene commit; make sure the
    # updates made for the last genes are not lost.
    db.commit()
    log('INFO', '{0}/{1} genes reported a VV error'.format(j, num_can))
    log('INFO', '{0}/{1} genes triggered an MD error:'.format(k, num_can))
    log('INFO', failed_genes)
def main():
    """Refresh UNIPROT ids and protein sizes of the ``gene`` table.

    For every gene row with a well-formed NP accession:

    * the reviewed UniProt entry cross-referenced to that RefSeq protein is
      fetched from the EBI proteins API and ``uniprot_id`` is updated when it
      differs;
    * the protein length is read from the NCBI efetch GenPept record and
      ``prot_size`` is updated when a size is already stored and differs.

    All updates are committed once at the end.
    """
    parser = argparse.ArgumentParser(
        description='Update UNIPROT ids and protein size',
        usage='python check_uniprot_ids.py [-k NCBI_API_KEY]')
    parser.add_argument(
        '-k', '--ncbi-api-key', default=None, required=False,
        help='NCBI Entrez API key. If not provided, 3rd method is not executed')
    args = parser.parse_args()
    ncbi_api_key = None
    if args.ncbi_api_key is not None:
        if not re.search(r'\w+', args.ncbi_api_key):
            sys.exit('ERROR: Invalid NCBI API key, please check')
        else:
            ncbi_api_key = args.ncbi_api_key
    # get db connector and cursor
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())

    # The original statement read "... FROM gene WHERE ORDER BY name", which
    # is not valid SQL; there is no filter, so the dangling WHERE is dropped.
    curs.execute(
        "SELECT name, np, uniprot_id, prot_size FROM gene ORDER BY name")
    res = curs.fetchall()
    count = curs.rowcount
    # The original used one counter for both progress and updates, which made
    # the progress log and the final total wrong.
    updated = 0
    for checked, gene in enumerate(res, start=1):
        print('.', end="", flush=True)
        if checked % 500 == 0:
            log('INFO', '{0}/{1} genes checked'.format(checked, count))
        match_obj = re.search(r'(NP_\d+)\.\d', gene['np'] or '')
        if not match_obj:
            log('WARNING', 'pb w/ NP acc no {}'.format(gene['np']))
            continue
        np_acc = match_obj.group(1)

        # uniprot
        uniprot_url = 'https://www.ebi.ac.uk/proteins/api/proteins/refseq:{}?offset=0&size=100&reviewed=true'.format(
            np_acc)
        # pre-bound so the except clause below can always report it
        uniprot_response = None
        try:
            uniprot_response = json.loads(
                http.request('GET', uniprot_url,
                             headers={'Accept': 'application/json'}).data.decode('utf-8'))
            accession = uniprot_response[0]['accession']
            if not accession:
                log('WARNING', 'md_uniprot_id: {0} - RefSeq: {1} - {2} - {3} :not checked'.format(
                    gene['uniprot_id'], gene['np'], gene['name'][1], gene['name'][0]))
            elif gene['uniprot_id'] != accession:
                # bound parameters instead of string-built SQL
                curs.execute(
                    "UPDATE gene SET uniprot_id = %s WHERE name[2] = %s",
                    (accession, gene['name'][1]))
                log('WARNING', 'Updated gene UNIPROT ID of {0} - {1} from {2} to {3}'.format(
                    gene['name'][0], gene['name'][1], gene['uniprot_id'], accession))
                updated += 1
        except Exception:
            # empty or error answer: no reviewed entry for this RefSeq protein
            log('WARNING', 'no UNIPROT ID {0} for {1} - {2}'.format(
                uniprot_response, gene['name'][1], gene['name'][0]))

        # get prot size from eutils
        ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id={0}&rettype=gp&complexity=3'.format(
            np_acc)
        if ncbi_api_key is not None:
            # the original sent the literal "api_key=None" when no key was given
            ncbi_url = '{0}&api_key={1}'.format(ncbi_url, ncbi_api_key)
        prot_size = -1
        try:
            eutils_response = http.request('GET', ncbi_url).data.decode('utf-8')
            prot_match = re.search(r'Protein\s+1\.\.(\d+)', eutils_response)
            if prot_match:
                prot_size = int(prot_match.group(1))
        except Exception:
            log('WARNING', 'no protein size w/ got from eutils NP acc no {0}, eutils URL:{1}'.format(
                gene['np'], ncbi_url))
        # only an already stored size is corrected
        if prot_size != -1 and \
                gene['prot_size'] is not None and \
                prot_size != int(gene['prot_size']):
            curs.execute(
                "UPDATE gene SET prot_size = %s WHERE name[2] = %s",
                (str(prot_size), gene['name'][1]))
            log('WARNING', 'Updated protein size for gene {0} - {1} - {2} to {3}'.format(
                gene['name'][0], gene['name'][1], gene['uniprot_id'], prot_size))
    log('INFO', '{} isoforms updated'.format(updated))
    db.commit()
def main():
    """Insert per-gene gnomAD metrics into the ``gene_annotation`` table.

    The input is read line by line and split on tabs. The first field is
    looked up in ``gene`` (``name[1]``, canonical rows only); when the gene is
    known and has no row in ``gene_annotation`` yet, one row is inserted with
    the gene name pair followed by nine values taken from fixed columns
    (0-based 13, 24, 25, 4, 26, 27, 23, 28, 29). Values that parse as floats
    are stored with two decimals; anything else is stored as read.

    Despite its name, ``-d/--directory`` must point at the metrics file
    itself: the path is validated with ``os.path.isfile``. All inserts are
    committed once at the end.
    """
    parser = argparse.ArgumentParser(
        description='Insert gnomAD data into MD',
        usage=
        'python insert_gnomad.py [-d path/to/dir/containing/gnomad.v2.1.1.lof_metrics.by_gene.txt]'
    )
    parser.add_argument(
        '-d',
        '--directory',
        default='',
        required=True,
        help='Path to the directory containing the gnomAD metrics by gene file'
    )
    args = parser.parse_args()
    # get file
    if os.path.isfile(args.directory):
        gnomad_file = args.directory
    else:
        sys.exit('Invalid input path, please check your command')

    # get db connector and cursor
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)

    # (label, 0-based column) of each metric, in INSERT order
    oe_columns = (
        ('synoe', 13),
        ('synlower', 24),
        ('synupper', 25),
        ('misoe', 4),
        ('mislower', 26),
        ('misupper', 27),
        ('lofoe', 23),
        ('loflower', 28),
        ('lofupper', 29),
    )
    min_fields = max(column for _, column in oe_columns) + 1

    added = 0
    with open(gnomad_file) as gnomad_handle:
        for gene_line in gnomad_handle:
            fields = gene_line.split("\t")
            hgnc = fields[0]
            # exists in MD?
            curs.execute(
                "SELECT name FROM gene WHERE name[1] = %s AND canonical = 't'",
                (hgnc, ))
            md_gene = curs.fetchone()
            if md_gene is None:
                continue
            # already present in gene_annotation?
            curs.execute(
                "SELECT DISTINCT(gene_name[2]) FROM gene_annotation WHERE gene_name[1] = %s",
                (hgnc, ))
            if curs.fetchone() is not None:
                continue
            if len(fields) < min_fields:
                # truncated line: skip it rather than abort the whole import
                log('WARNING',
                    'Not enough columns for gene {}, line skipped'.format(hgnc))
                continue
            # passed as a PostgreSQL array literal, as before, so the server
            # coerces it to the column type
            post_gene = '{"' + md_gene['name'][0] + '","' + md_gene['name'][
                1] + '"}'
            oe_values = []
            for _, column in oe_columns:
                raw_value = fields[column]
                try:
                    oe_values.append("{:.2f}".format(float(raw_value)))
                except ValueError:
                    # non numeric content is stored untouched
                    oe_values.append(raw_value)
            curs.execute(
                "INSERT INTO gene_annotation VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (post_gene, *oe_values))
            added += 1
    log('INFO', '{} annotations added'.format(added))

    db.commit()
def main():
    """Update stored transcript versions to the latest VariantValidator one.

    Script meant to be croned (e.g. after a uta docker update). For every
    canonical gene, the VariantValidator ``gene2transcripts`` tool is called
    with the gene's accession; the versions returned for each ``NM_``/``NR_``
    accession are collected and the highest one is compared to
    ``gene.nm_version`` of the row whose ``name[2]`` is that accession.

    When they differ, the candidate version is first tried against the
    ``variantvalidator`` endpoint with a dummy ``c.1A>T`` variant; the stored
    version is updated only if none of the returned validation warnings
    matches a known blocking message. Changes are committed after each gene.
    """
    # example:
    # https://rest.variantvalidator.org/VariantValidator/tools/gene2transcripts/NM_130786?content-type=application%2Fjson
    vv_url_base = "https://rest.variantvalidator.org"
    # vv_url_base = "http://0.0.0.0:8000/"
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)

    curs.execute(  # get genes
        "SELECT name, nm_version FROM gene WHERE canonical = 't' ORDER BY name"
    )
    genes = curs.fetchall()
    count = curs.rowcount
    # one pool for the whole run so connections can be reused
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                               ca_certs=certifi.where())
    # warnings meaning that the new transcript version must not be adopted
    blocking_warning = re.compile(
        r'cannot be mapped directly to genome build'
        r'|No transcript definition for'
        r'|No transcripts found'
        r'|expected one of')
    i = 0
    for gene in genes:
        i += 1
        if i % 500 == 0:
            log('INFO', '{0}/{1} genes checked'.format(i, count))
        # get VV info for the gene
        vv_url = "{0}/VariantValidator/tools/gene2transcripts/{1}?content-type=application/json".format(
            vv_url_base, gene['name'][1])
        try:
            vv_gene_data = json.loads(
                http.request('GET', vv_url).data.decode('utf-8'))
        except Exception:
            log('WARNING', 'No value for {0}'.format(gene['name'][0]))
            continue
        if 'transcripts' in vv_gene_data:
            # accession -> list of version strings reported by VV
            ts_dict = {}
            for transcript in vv_gene_data['transcripts']:
                match_object = re.search(r'^(N[MR]_\d+)\.(\d{1,2})',
                                         transcript['reference'])
                if match_object:
                    ts_dict.setdefault(match_object.group(1),
                                       []).append(match_object.group(2))
            for nm, versions in ts_dict.items():
                curs.execute("SELECT nm_version FROM gene WHERE name[2] = %s",
                             (nm, ))
                res_nm = curs.fetchone()
                if not res_nm:
                    continue
                # versions are strings: compare numerically, otherwise
                # '9' would be considered greater than '10'
                max_vv_nm = max(versions, key=int)
                if int(res_nm[0]) == int(max_vv_nm):
                    continue
                # check that VV can actually work with the new version
                vv_url_var = "{0}/VariantValidator/variantvalidator/GRCh38/{1}.{2}:c.1A>T/all?content-type=application/json".format(
                    vv_url_base, nm, max_vv_nm)
                log('DEBUG',
                    'Calling VariantValidator API: {}'.format(vv_url_var))
                try:
                    vv_check = json.loads(
                        http.request('GET', vv_url_var).data.decode('utf-8'))
                except Exception:
                    log('WARNING',
                        'No VV result for {0}.{1}'.format(nm, max_vv_nm))
                    continue
                if not isinstance(vv_check, dict):
                    log('WARNING',
                        'No VV result for {0}.{1}'.format(nm, max_vv_nm))
                    continue
                noupdate = False
                for entry in vv_check.values():
                    # only mapping entries can carry validation warnings
                    if not isinstance(entry, dict) or \
                            'validation_warnings' not in entry:
                        continue
                    for warning in entry['validation_warnings']:
                        if blocking_warning.search(warning):
                            log(
                                'WARNING',
                                "Cannot update gene {0} from {1} to {2} because of {3}"
                                .format(gene['name'][0], res_nm[0], max_vv_nm,
                                        warning))
                            noupdate = True
                            break
                if not noupdate:
                    curs.execute(
                        "UPDATE gene SET nm_version = %s WHERE name[2] = %s",
                        (max_vv_nm, nm))
                    log(
                        'INFO',
                        "NM UPDATE: gene {0} - {1} modified from {2} to {3}"
                        .format(gene['name'][0], nm, res_nm[0], max_vv_nm))
        db.commit()
        print('.', end="", flush=True)
def main():
    """Align canonical isoforms of the local DB with a remote MD instance.

    The MD API key (``-k``) must be 43 characters long and belong to a row of
    ``mobiuser``. With ``--diff-md``, every canonical transcript of a gene
    having more than one isoform is compared to the answer of the remote
    ``/MD/api/gene/<hgnc>`` endpoint. When the remote canonical transcript
    differs, the ``canonical`` flags of the gene are switched locally, the
    change is committed and ``update_vars_when_iso_change.py`` is run for that
    gene. The collected differences are pretty-printed at the end.

    ``--diff-ncbi`` is parsed but nothing in this function uses it; the NCBI
    key is only checked for containing at least one word character.
    """
    parser = argparse.ArgumentParser(
        description='Check isoforms differences between 2 DBs',
        usage=
        'python check_isoforms_differences.py -k md_api_key -nk ncbi_api_key')
    parser.add_argument(
        '-k',
        '--api-key',
        default=None,
        required=True,
        help='Your API key visible on your profile page on the website.')
    parser.add_argument('-nk',
                        '--ncbi-api-key',
                        default=None,
                        required=True,
                        help='NCBI Entrez API key.')
    parser.add_argument('-md',
                        '--diff-md',
                        default='',
                        required=False,
                        help='Check differences between MD Dev and Prod',
                        action='store_true')
    parser.add_argument('-ncbi',
                        '--diff-ncbi',
                        default='',
                        required=False,
                        help='Check differences between NCBI RefSeq and MD',
                        action='store_true')
    args = parser.parse_args()
    # NOTE(review): log('ERROR', ...) may already terminate the script; the
    # explicit returns below make sure execution never continues with an
    # invalid key either way.
    if args.ncbi_api_key is not None and \
            not re.search(r'\w+', args.ncbi_api_key):
        log('ERROR', 'Invalid NCBI API key, please check')
        return
    # get db connector and cursor
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    if len(args.api_key) != 43:
        log('ERROR', 'Invalid API key, please check it')
        return
    api_key = args.api_key
    # user
    curs.execute("SELECT username FROM mobiuser WHERE api_key = %s",
                 (api_key, ))
    res_user = curs.fetchone()
    if res_user is None:
        log('ERROR', 'Unknown API key')
        return
    username = res_user['username']
    log('INFO', 'User: {}'.format(username))

    if args.diff_md:
        # meant to be ran on prod server
        diff = {}
        # get genes w/ more than one isoform
        curs.execute(
            "SELECT name[1] AS hgnc, name[2] AS nm, nm_version FROM gene WHERE canonical = 't' AND name[1] IN (SELECT name[1] FROM gene GROUP BY name[1] HAVING COUNT(name[1]) > 1) ORDER by name[1]"
        )
        res = curs.fetchall()
        base_url = "http://10.34.20.79"
        # one pool for the whole run so connections can be reused
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                                   ca_certs=certifi.where())
        for i, gene in enumerate(res, start=1):
            log('INFO', 'Treating gene {0} - #{1}'.format(gene['hgnc'], i))
            full_nm = '{0}.{1}'.format(gene['nm'], gene['nm_version'])
            md_url = '{0}/MD/api/gene/{1}'.format(base_url, gene['hgnc'])
            try:
                md_data = json.loads(
                    http.request('GET', md_url).data.decode('utf-8'))
            except (urllib3.exceptions.HTTPError, ValueError):
                log('WARNING',
                    'MD not responding for {}'.format(gene['hgnc']))
                # nothing to compare with: never reuse a previous answer
                continue
            if full_nm in md_data and \
                    md_data[full_nm]['canonical'] is True:
                # same canonical on both sides
                continue
            elif full_nm not in md_data:
                log('DEBUG', '{0}-{1}'.format(md_data, full_nm))
            for key in md_data:
                matchobj = re.search(r'^(NM_\d+)\.\d+$', key)
                if not matchobj:
                    continue
                new_nm = matchobj.group(1)
                if md_data[key]['canonical'] is True:
                    diff[gene['hgnc']] = {
                        'old_can': full_nm,
                        'new_can': key
                    }
                    log(
                        'INFO',
                        'updating canonical for {0}: {1} instead of {2}'.
                        format(gene['hgnc'], key, full_nm))
                    curs.execute(
                        "UPDATE gene SET canonical = 'f' WHERE name[1] = %s",
                        (gene['hgnc'], ))
                    curs.execute(
                        "UPDATE gene SET canonical = 't' WHERE name[2] = %s",
                        (new_nm, ))
                    db.commit()
                    # argument list, no shell: values are never interpreted
                    # by a command interpreter
                    returned_value = subprocess.call([
                        'python3', 'update_vars_when_iso_change.py', '-k',
                        api_key, '-g', gene['hgnc']
                    ])
                    log(
                        'INFO',
                        'Variants update returned value for {0}: {1}'.
                        format(gene['hgnc'], returned_value))
        pprint.PrettyPrinter(indent=4).pprint(diff)