Beispiel #1
0
def main():
    """Fill in missing dbSNP rs identifiers for hg38 variants.

    Reads the path of a tabix-indexed dbSNP file from ``-d/--dbsnp-file``,
    selects every hg38 variant whose ``variant_feature.dbsnp_id`` is NULL and
    queries the dbSNP file at the variant position.  When a record carries an
    ``RS=<digits>;`` tag in its 8th column and lists both the variant's
    reference and alternative alleles (columns 4 and 5, comma separated),
    the rs number is written to ``variant_feature.dbsnp_id`` and committed.

    Logs an ERROR and does nothing else when the dbSNP file does not exist.
    """
    # The description/usage previously advertised another script and options
    # (-k/-g) that this parser does not accept.
    parser = argparse.ArgumentParser(
        description='Add dbSNP rs identifiers to variants lacking one',
        usage='python %(prog)s -d path/to/dbsnp_file')
    parser.add_argument('-d', '--dbsnp-file', default='', required=True,
                        help='Path to the dbSNP file')
    args = parser.parse_args()
    if not os.path.isfile(args.dbsnp_file):
        log('ERROR', 'Your input file was not found {0}'.format(args.dbsnp_file))
        return

    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    curs.execute(
        "SELECT a.id, b.pos, b.pos_ref, b.pos_alt, c.ncbi_name FROM variant_feature a, variant b, chromosomes c WHERE a.id = b.feature_id AND b.chr = c.name AND c.genome_version = 'hg38' AND b.genome_version = 'hg38' AND a.dbsnp_id is NULL ORDER BY a.id"
    )
    res = curs.fetchall()
    count = curs.rowcount
    log('INFO', 'Found {0} variants to check'.format(count))

    # Loop invariants: open the indexed file and compile the pattern once
    # instead of once per variant.
    tb = tabix.open(args.dbsnp_file)
    rs_pattern = re.compile(r'RS=(\d+);')

    i = 0  # number of rs ids written
    for j, var in enumerate(res, start=1):
        if j % 500 == 0:
            log('INFO', '{0}/{1} variant checked'.format(j, count))
        # single-position region: <ncbi chromosome name>:<pos>-<pos>
        query = "{0}:{1}-{2}".format(var['ncbi_name'], var['pos'], var['pos'])
        for record in tb.querys(query):
            match_object = rs_pattern.search(record[7])
            if not match_object:
                continue
            # columns 4 and 5 may hold several comma-separated alleles
            if var['pos_ref'] not in record[3].split(',') or \
                    var['pos_alt'] not in record[4].split(','):
                continue
            new_rs_id = match_object.group(1)
            curs.execute(
                "UPDATE variant_feature SET dbsnp_id = %s WHERE id = %s",
                (new_rs_id, var['id'])
            )
            log('INFO', 'Adding rsid for variant {0} - ref:{1} - alt:{2} to rs{3}'.format(var['id'], var['pos_ref'], var['pos_alt'], new_rs_id))
            i += 1
            # committed per update, so an interrupted run keeps its progress
            db.commit()
    log('INFO', '{0} rsids added'.format(i))
Beispiel #2
0
def main():
    """Replace placeholder NP accession numbers using NCBI efetch.

    For every row of ``gene`` whose ``np`` is the placeholder
    ``'NP_000000.0'``, the nuccore record of the transcript (``name[2]``) is
    fetched from NCBI eutils.  The first ``accession "NP_<digits>", version
    <digits>`` pair found at the end of a line of the response is stored as
    ``<accession>.<version>`` in ``gene.np``.  All updates are committed once
    at the end.

    Requires ``-k/--ncbi-api-key``; when the key fails validation an ERROR is
    logged and no NCBI request is made.
    """
    parser = argparse.ArgumentParser(description='Defines NP RefSeq acc_no when lacking', usage='python update_np_acc_no.py -k ncbi_api_key')
    parser.add_argument('-k', '--ncbi-api-key', default=None, required=True, help='NCBI Entrez API key.')
    args = parser.parse_args()

    ncbi_api_key = None
    if args.ncbi_api_key is not None:
        if not re.search(r'\w+', args.ncbi_api_key):
            log('ERROR', 'Invalid NCBI API key, please check')
        else:
            ncbi_api_key = args.ncbi_api_key

    # get db connector and cursor
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)

    i = 0  # number of gene rows updated

    if ncbi_api_key is not None:
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
        # One pattern for both detection and extraction.  The previous
        # detection step used `version\s\d$`, which rejected any version
        # number with more than one digit.
        np_pattern = re.compile(r'accession\s"(NP_\d+)",\s+version\s(\d+)$', re.MULTILINE)
        # transcripts still carrying the placeholder NP accession
        curs.execute(
            "SELECT name, np FROM gene WHERE np = 'NP_000000.0' ORDER by name"
        )
        res = curs.fetchall()
        for acc in res:
            # acc['name'] is indexed as [0] = gene symbol, [1] = NM accession
            ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={0}&api_key={1}'.format(acc['name'][1], ncbi_api_key)
            eutils_response = http.request('GET', ncbi_url).data.decode('utf-8')
            match_object = np_pattern.search(eutils_response)
            if match_object is None:
                continue
            np_acc = '{0}.{1}'.format(match_object.group(1), match_object.group(2))
            # bound parameters instead of string formatting: the values come
            # from a remote response and from the database
            curs.execute(
                "UPDATE gene SET np = %s WHERE name[2] = %s",
                (np_acc, acc['name'][1])
            )
            log('INFO', 'Updated gene NP acc no of {0} to {1}.{2}'.format(acc['name'][0], match_object.group(1), match_object.group(2)))
            i += 1
    log('INFO', '{} genes updated'.format(i))

    db.commit()
Beispiel #3
0
def _extract_p_name(vv_entry):
    """Return the protein name from a VariantValidator entry, or None.

    Reads ``vv_entry['hgvs_predicted_protein_consequence']['tlr']`` and keeps
    what follows ``NP_...:p.``, without the surrounding parentheses.  A
    WARNING is logged for whichever level is missing.
    """
    if 'hgvs_predicted_protein_consequence' not in vv_entry:
        log('WARNING', 'No hgvs_predicted_protein_consequence in VV results')
        return None
    consequence = vv_entry['hgvs_predicted_protein_consequence']
    if 'tlr' not in consequence:
        log('WARNING', 'No tlr in VV results')
        return None
    match_object = re.search(r'NP_\d+\.\d.*:p\.\(?(.+)\)?', consequence['tlr'])
    if not match_object:
        log('WARNING', 'No p_name in VV results')
        return None
    p_name = match_object.group(1)
    # the greedy group swallows the closing ')', remove it
    if p_name.endswith(')'):
        p_name = p_name[:-1]
    return p_name


def _fetch_segment(curs, gene_name, nm, position):
    """Return the hg38 segment row (number, type) containing *position*, or None."""
    curs.execute(
        "SELECT number, type FROM segment WHERE genome_version = 'hg38' "
        "AND gene_name[1] = %s AND gene_name[2] = %s AND %s "
        "BETWEEN SYMMETRIC segment_start AND segment_end",
        (gene_name, nm, position))
    return curs.fetchone()


def _range_ivs_name(new_c_name, start_segment_number, end_segment_number):
    """Build the IVS name of a range variant from its c. name, or return None.

    Three shapes are recognised, tried in this order: both ends carry an
    intronic offset, only the start does, only the end does.
    """
    ivs_obj = re.search(r'^\d+([\+-]\d+)_\d+([\+-]\d+)(.+)$', new_c_name)
    if ivs_obj:
        return 'IVS{0}{1}_IVS{2}{3}{4}'.format(
            start_segment_number, ivs_obj.group(1),
            end_segment_number, ivs_obj.group(2), ivs_obj.group(3))
    ivs_obj = re.search(r'^\d+([\+-]\d+)_(\d+)([^\+-].+)$', new_c_name)
    if ivs_obj:
        return 'IVS{0}{1}_{2}{3}'.format(
            start_segment_number, ivs_obj.group(1),
            ivs_obj.group(2), ivs_obj.group(3))
    ivs_obj = re.search(r'^(\d+)_\d+([\+-]\d+)(.+)$', new_c_name)
    if ivs_obj:
        return '{0}_IVS{1}{2}{3}'.format(
            ivs_obj.group(1), end_segment_number,
            ivs_obj.group(2), ivs_obj.group(3))
    return None


def main():
    """Re-annotate the variants of a gene on its current canonical transcript.

    Command line: ``-k/--api-key`` (must be 43 characters long and known in
    ``mobiuser``) and ``-g/--gene-name`` (word characters and '-' only).

    Every hg38 variant of the gene whose ``variant_feature.gene_name[2]``
    differs from the canonical NM accession is submitted to the
    VariantValidator REST API.  For the result key matching
    ``<NM>.<version>:c.<name>``:

    * if the c. name is unchanged, only ``gene_name[2]`` and
      ``creation_date`` are updated;
    * otherwise the new c. name, p. name, segment types/numbers and, when
      one can be built, the IVS name are written as well.  A variant for
      which the p. name or a segment cannot be determined is skipped with a
      WARNING.

    Each failed validation logs an ERROR and returns before any update.
    All updates are committed once at the end.
    """
    parser = argparse.ArgumentParser(
        description='Define a canonical transcript per gene when several are defined',
        usage='python update_canonical_when_several.py -k md_api_key -g gene_hgnc')
    parser.add_argument(
        '-k', '--api-key', default='', required=True,
        help='Your API key visible on your profile page on the website.')
    parser.add_argument(
        '-g', '--gene-name', default='', required=True,
        help='The gene you want to update the variants from.')
    args = parser.parse_args()
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)

    # --- validation: explicit returns so nothing below runs on bad input,
    # whether or not log('ERROR', ...) terminates the process itself
    if len(args.api_key) != 43:
        log('ERROR', 'Invalid API key, please check it')
        return
    curs.execute("SELECT username FROM mobiuser WHERE api_key = %s",
                 (args.api_key, ))
    res_user = curs.fetchone()
    if res_user is None:
        log('ERROR', 'Unknown API key')
        return
    log('INFO', 'User: {}'.format(res_user['username']))

    match_obj = re.search(r'^([\w-]+)$', args.gene_name)
    if not match_obj:
        log('ERROR', 'Invalid gene name, please check it')
        return
    gene_name = match_obj.group(1)

    creation_date = datetime.datetime.now().strftime('%Y-%m-%d')

    # check if gene exists and get new canonical isoform
    curs.execute(
        "SELECT DISTINCT(name[2]) as nm, nm_version FROM gene WHERE name[1] = %s AND canonical = 't'",
        (gene_name, ))
    res = curs.fetchone()
    if res is None:
        log('ERROR',
            'The gene {} is not present in MobiDetails, please check it'.format(gene_name))
        return
    nm = res['nm']
    nm_full = '{0}.{1}'.format(res['nm'], res['nm_version'])

    # get all variants still attached to another transcript
    curs.execute(
        "SELECT a.chr, a.pos, a.pos_ref, a.pos_alt, a.g_name, b.c_name, b.id FROM variant a, variant_feature b WHERE a.feature_id = b.id AND b.gene_name[1] = %s AND b.gene_name[2] != %s AND a.genome_version = 'hg38' ORDER BY a.pos",
        (gene_name, nm))
    variants = curs.fetchall()
    # fetchall() returns an empty list, never None
    if not variants:
        log('ERROR', 'No variant to update')
        return

    # loop invariants
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                               ca_certs=certifi.where())
    vv_url_base = "https://rest.variantvalidator.org"
    # nm_full contains a '.', escape it so it is matched literally
    c_name_pattern = re.compile(r'{}:c\.(.+)$'.format(re.escape(nm_full)))
    # transcript strand, looked up only if a variant spans two segments
    strand = None
    strand_checked = False

    for var in variants:
        vv_url = "{0}/VariantValidator/variantvalidator/GRCh38/{1}-{2}-{3}-{4}/all?content-type=application/json".format(
            vv_url_base, var['chr'], var['pos'], var['pos_ref'],
            var['pos_alt'])
        log('DEBUG', 'Calling VariantValidator API: {}'.format(vv_url))
        try:
            vv_data = json.loads(
                http.request('GET', vv_url).data.decode('utf-8'))
        except Exception:
            # best effort per variant: report and move on to the next one
            log('WARNING',
                'No VV result for {0}:{1}'.format(nm_full, var['c_name']))
            continue
        for first_level_key in vv_data:
            match_obj = c_name_pattern.search(first_level_key)
            if not match_obj:
                continue
            new_c_name = match_obj.group(1)
            log('DEBUG', 'Old c_name: {0} - New c_name: {1}'.format(
                var['c_name'], new_c_name))
            if new_c_name == var['c_name']:
                curs.execute(
                    "UPDATE variant_feature SET gene_name[2] = %s, "
                    "creation_date = %s WHERE id = %s",
                    (nm, creation_date, var['id']))
                log('INFO',
                    'Variant {} remains unchanged'.format(var['c_name']))
                continue

            # c_name changed: p_name, ivs_name and segments must be recomputed
            p_name = _extract_p_name(vv_data[first_level_key])
            start_segment_type = start_segment_number = None
            end_segment_type = end_segment_number = ivs_name = None
            seg_start = seg_end = None
            positions = compute_start_end_pos(var['g_name'])
            if positions[0] != positions[1]:
                # range variant: first try a single segment holding both ends
                curs.execute(
                    "SELECT number, type FROM segment WHERE genome_version = 'hg38' AND "
                    "gene_name[1] = %s AND gene_name[2] = %s AND %s BETWEEN SYMMETRIC segment_start "
                    "AND segment_end AND %s BETWEEN SYMMETRIC segment_start AND segment_end",
                    (gene_name, nm, positions[0], positions[1]))
                res_seg = curs.fetchone()
                if res_seg is not None:
                    seg_start = seg_end = res_seg
                else:
                    res_seg1 = _fetch_segment(curs, gene_name, nm, positions[0])
                    res_seg2 = _fetch_segment(curs, gene_name, nm, positions[1])
                    if not strand_checked:
                        # NOTE(review): the original read an undefined
                        # `res_strand` here (NameError).  The strand is now
                        # taken from gene.strand - this assumes that column
                        # exists and holds '+' / '-'; confirm against the schema.
                        curs.execute(
                            "SELECT strand FROM gene WHERE name[1] = %s AND name[2] = %s",
                            (gene_name, nm))
                        res_strand = curs.fetchone()
                        if res_strand is not None:
                            strand = res_strand['strand']
                        strand_checked = True
                    if res_seg1 is not None and res_seg2 is not None and \
                            strand is not None:
                        if strand == '+':
                            seg_start, seg_end = res_seg1, res_seg2
                        else:
                            # other strand: positions[1] is the start
                            seg_start, seg_end = res_seg2, res_seg1
            else:
                # substitutions
                seg_start = seg_end = _fetch_segment(
                    curs, gene_name, nm, positions[0])

            if seg_start is not None and seg_end is not None:
                start_segment_type = seg_start['type']
                start_segment_number = seg_start['number']
                end_segment_type = seg_end['type']
                end_segment_number = seg_end['number']

            # get IVS name
            if start_segment_type == 'intron':
                if positions[0] != positions[1]:
                    ivs_name = _range_ivs_name(
                        new_c_name, start_segment_number, end_segment_number)
                else:
                    ivs_obj = re.search(r'^[\*-]?\d+([\+-]\d+)(.+)$',
                                        new_c_name)
                    if ivs_obj:
                        ivs_name = 'IVS{0}{1}{2}'.format(
                            start_segment_number, ivs_obj.group(1),
                            ivs_obj.group(2))
                    else:
                        log('WARNING',
                            'No IVS name could be built from {}'.format(new_c_name))

            if p_name is None or \
                    start_segment_type is None or \
                    start_segment_number is None or \
                    end_segment_type is None or \
                    end_segment_number is None:
                log('WARNING', 'A mandatory new parameter is lacking')
                continue
            if ivs_name is None:
                # ivs_name column is left untouched
                curs.execute(
                    "UPDATE variant_feature SET gene_name[2] = %s, c_name = %s, p_name = %s, start_segment_type = %s, "
                    "start_segment_number = %s, end_segment_type = %s, end_segment_number = %s, "
                    "creation_date = %s WHERE id = %s",
                    (nm, new_c_name, p_name, start_segment_type,
                     start_segment_number, end_segment_type,
                     end_segment_number, creation_date, var['id']))
            else:
                curs.execute(
                    "UPDATE variant_feature SET gene_name[2] = %s, c_name = %s, p_name = %s, ivs_name = %s, start_segment_type = %s, "
                    "start_segment_number = %s, end_segment_type = %s, end_segment_number = %s, "
                    "creation_date = %s WHERE id = %s",
                    (nm, new_c_name, p_name, ivs_name,
                     start_segment_type, start_segment_number,
                     end_segment_type, end_segment_number,
                     creation_date, var['id']))
            log('INFO', 'Variant {0} updated to {1}'.format(
                var['c_name'], new_c_name))

    db.commit()
Beispiel #4
0
def main():
    """Flag one transcript per gene as canonical, using up to four methods.

    Command line: ``-r/--refgene`` (required, path to a tab-separated file
    whose 2nd and 3rd columns are read as NM accession and gene symbol),
    ``-k/--ncbi-api-key`` (optional) and ``-u/--update-refgene`` (flag).
    Exits through ``sys.exit`` when the file does not exist or the key is
    invalid.

    1. Genes with a single row in ``gene``: that row becomes canonical.
    2. refGene file: for a gene present in ``gene`` with no canonical row,
       the NM listed in the file becomes canonical if it exists in ``gene``.
    3. (API key only) For the first row of each gene, ordered canonical
       first: if it is not canonical, the NCBI nuccore record is fetched;
       a ``"RefSeq Select"`` tag makes the transcript canonical and a
       placeholder ``np`` of ``'NP_000000.0'`` is replaced by the NP
       accession found in the record.
    4. (API key and ``-u`` only) For genes with several rows and no entry in
       ``variant_feature``: a non-canonical transcript whose NCBI record
       contains ``"RefSeq Select criteria"`` replaces the canonical one.

    The number of rows flagged by methods 1-4 is logged at the end.
    """
    parser = argparse.ArgumentParser(description='Define a canonical transcript per gene',
                                     usage='python define_canonical.py [-r path/to/refGeneCanonical_2019_09_23.txt]')
    parser.add_argument('-r', '--refgene', default='', required=True,
                        help='Path to the file containing the canonical refSeq IDs per gene (from UCSC)')
    parser.add_argument('-k', '--ncbi-api-key', default=None, required=False,
                        help='NCBI Entrez API key. If not provided, 3rd method is not executed')
    parser.add_argument('-u', '--update-refgene', default=None, required=False,
                        help='Update RefGene (canonical) for genes w/ on variants based on NCBI (requires NCBI API key)', action='store_true')
    args = parser.parse_args()
    # get file
    if not os.path.isfile(args.refgene):
        sys.exit('ERROR: Invalid input path, please check your command')
    refgene_file = args.refgene
    ncbi_api_key = None
    if args.ncbi_api_key is not None:
        if not re.search(r'\w+', args.ncbi_api_key):
            sys.exit('ERROR: Invalid NCBI API key, please check')
        ncbi_api_key = args.ncbi_api_key
    # get db connector and cursor
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)

    # Values of gene.canonical meaning "not canonical".  The original code
    # compared against 'f', False and (in a sibling script) 0 in different
    # places; all three are accepted here (0 == False in Python).
    # NOTE(review): confirm the column type; if psycopg2 returns a bool, the
    # former `== 'f'` test of the 3rd method could never be true.
    not_canonical = (False, 'f')

    i = 0  # number of rows flagged as canonical

    # first when only one isoform => canonical
    log('INFO', "1st Query: genes w/ only one isoform - is automatically canonical")
    curs.execute(
        "SELECT name FROM gene WHERE canonical = 'f' AND name[1] IN (SELECT name[1] FROM gene GROUP BY name[1] HAVING COUNT (name[1]) = 1)"
    )
    res = curs.fetchall()
    for acc in res:
        curs.execute(
            "UPDATE gene SET canonical = 't' WHERE name[2] = %s",
            (acc['name'][1],)
        )
        log('INFO', 'Updated gene {} (1st method)'.format(acc['name'][0]))
        i += 1
    db.commit()

    # second check the refgene file
    log('INFO', "2nd Query: get info from local refGene file")
    # context manager so the file handle is closed; iterate lazily
    with open(refgene_file) as refgene_handle:
        for gene_line in refgene_handle:
            # ENST - NM - gene
            fields = gene_line.rstrip().split("\t")
            if len(fields) < 3:
                # blank or truncated line
                continue
            nm_acc, gene_symbol = fields[1], fields[2]
            # skip unnamed genes and the header line
            if gene_symbol in ('n/a', 'hg38.refGene.name2'):
                continue
            # gene exists in MD?
            curs.execute(
                "SELECT DISTINCT(name[1]) FROM gene WHERE name[1] = %s ORDER BY name",
                (gene_symbol,)
            )
            if curs.fetchone() is None:
                continue
            # already has a canonical isoform?
            curs.execute(
                "SELECT canonical FROM gene WHERE name[1] = %s AND canonical = 't'",
                (gene_symbol,)
            )
            if curs.fetchone() is not None:
                continue
            # nm exists in md?
            curs.execute(
                "SELECT name FROM gene WHERE name[2] = %s",
                (nm_acc,)
            )
            mdnm = curs.fetchone()
            if mdnm is None:
                continue
            # ok => canonical
            i += 1
            # array literal matching the whole name column
            post_gene = '{"' + mdnm['name'][0] + '","' + mdnm['name'][1] + '"}'
            curs.execute(
                "UPDATE gene SET canonical = 't' WHERE name = %s",
                (post_gene,)
            )
            log('INFO', 'Updated gene {} (2nd method)'.format(mdnm['name'][0]))
    db.commit()

    log('INFO', "3rd Query: get info from NCBI for genes with no canonical defined remaining")
    # 3rd get info at NCBI
    # API key mandatory
    if ncbi_api_key is not None:
        ncbi_url_template = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={0}&api_key={1}'
        # One pattern for detection and extraction; the former detection
        # step (`version\s\d$`) rejected versions with more than one digit.
        np_pattern = re.compile(r'accession\s"(NP_\d+)",\s+version\s(\d+)$', re.MULTILINE)
        # canonical rows sort first within each gene
        curs.execute(
            "SELECT name, np, canonical FROM gene ORDER BY name[1], canonical DESC"
        )
        res = curs.fetchall()
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
        semaph_gene = None
        for acc in res:
            # only the first row of each gene is examined
            if semaph_gene == acc['name'][0]:
                continue
            semaph_gene = acc['name'][0]
            if acc['canonical'] not in not_canonical:
                continue
            # first row is not canonical => the gene has no canonical at all
            ncbi_url = ncbi_url_template.format(acc['name'][1], ncbi_api_key)
            eutils_response = http.request('GET', ncbi_url).data.decode('utf-8')
            if re.search(r'"RefSeq\sSelect"', eutils_response):
                curs.execute(
                    "UPDATE gene SET canonical = 't' WHERE name[2] = %s",
                    (acc['name'][1],)
                )
                i += 1
                log('INFO', 'Updated gene {} (3rd method)'.format(acc['name'][0]))
            if acc['np'] == 'NP_000000.0':
                match_object = np_pattern.search(eutils_response)
                if match_object:
                    np_acc = '{0}.{1}'.format(match_object.group(1), match_object.group(2))
                    # bound parameters instead of string formatting
                    curs.execute(
                        "UPDATE gene SET np = %s WHERE name[2] = %s",
                        (np_acc, acc['name'][1])
                    )
                    log('INFO', 'Updated gene NP acc no of {0} to {1}.{2}'.format(acc['name'][0], match_object.group(1), match_object.group(2)))

        if args.update_refgene:
            log('INFO', "Update refGene")
            # get genes w/ no variants, and at least 2 isoforms to check which one should be canonical
            curs.execute(
                "SELECT name, canonical FROM gene WHERE (name[1] NOT IN "
                "(SELECT gene_name[1] FROM variant_feature)) AND "
                "(name[1] IN (SELECT name[1] FROM gene GROUP BY name[1] "
                "HAVING COUNT (name[1]) > 1)) ORDER BY name"
            )
            res = curs.fetchall()
            for acc in res:
                ncbi_url = ncbi_url_template.format(acc['name'][1], ncbi_api_key)
                eutils_response = http.request('GET', ncbi_url).data.decode('utf-8')
                if not re.search(r'"RefSeq\sSelect\scriteria"', eutils_response) or \
                        acc['canonical'] not in not_canonical:
                    continue
                # unset every isoform of the gene, then flag this one
                curs.execute(
                    "UPDATE gene SET canonical = 'f' WHERE name[1] = %s",
                    (acc['name'][0],)
                )
                curs.execute(
                    "UPDATE gene SET canonical = 't' WHERE name[2] = %s",
                    (acc['name'][1],)
                )
                i += 1
                log('INFO', 'Updated gene {} (4th method)'.format(acc['name'][0]))
    log('INFO', '{} genes modified'.format(i))

    db.commit()
Beispiel #5
0
def main():
    parser = argparse.ArgumentParser(
        description=
        'Define a canonical transcript per gene and optionally updates various fields',
        usage='python update_canonical_from_remote.py [-r remote_server_url]')
    parser.add_argument('-r',
                        '--remote-server',
                        default='',
                        required=True,
                        help='base URL of the remote server')
    parser.add_argument(
        '-uca',
        '--update-can-all',
        default='',
        required=False,
        help='Optionally update canonical for all genes w/ no variants',
        action='store_true')
    parser.add_argument('-np',
                        '--update-np',
                        default='',
                        required=False,
                        help='Optionally update NP for genes',
                        action='store_true')
    parser.add_argument('-uu',
                        '--update-uniprot',
                        default='',
                        required=False,
                        help='Optionally update UNIPROT IDs',
                        action='store_true')
    parser.add_argument('-uc',
                        '--update-creation',
                        default='',
                        required=False,
                        help='Optionally update variant_creation tag',
                        action='store_true')
    parser.add_argument(
        '-un',
        '--update-nm',
        default='',
        required=False,
        help='Optionally update RefSeq Nm accession number tag',
        action='store_true')

    args = parser.parse_args()
    remote_addr = args.remote_server
    # args = parser.parse_args(['-np'])
    print()
    log('INFO', 'Working with server {}'.format(remote_addr))

    #headers
    header = {
        'Accept':
        'application/json',
        'User-Agent':
        'python-requests Python/{}.{}.{}'.format(sys.version_info[0],
                                                 sys.version_info[1],
                                                 sys.version_info[2]),
    }
    # get db connector and cursor
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    # get local list of genes with no canonical isoform defined
    curs.execute(
        "SELECT DISTINCT(name[1]) as hgnc FROM gene WHERE name[1] NOT IN (SELECT name[1] FROM gene WHERE canonical='t') ORDER BY name[1]"
    )
    no_can = curs.fetchall()

    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                               ca_certs=certifi.where())

    i = 0
    # lacking_nm = []

    for gene in no_can:
        req_url = '{0}/api/gene/{1}'.format(remote_addr, gene['hgnc'])
        api_response = json.loads(
            http.request('GET', req_url, headers=header).data.decode('utf-8'))
        log('DEBUG', 'req_url:{0}'.format(req_url))
        # log('DEBUG', 'api_response:{0}'.format(api_response))
        for keys in api_response:
            # log('DEBUG', 'key:{0} - value:{1}'.format(keys, api_response[keys]))
            if isinstance(keys, dict) and \
                    'canonical' in api_response[keys]:
                if api_response[keys]['canonical'] is True:
                    if re.search(r'NM_\d+\.\d+', keys):
                        match_obj = re.search(r'(NM_\d+)\.\d+', keys)
                        nm_acc = match_obj.group(1)
                        curs.execute(
                            "UPDATE gene set canonical = 't' WHERE name[2] = %s",
                            (nm_acc, ))
                        log('INFO', 'Updating {}'.format(nm_acc))
                        i += 1
    db.commit()
    log('INFO', '{} genes modified (canonical)'.format(i))
    i = 0
    if args.update_can_all:
        # get genes with no variants and at least 2 isoforms to see if we need to update canonical
        curs.execute(
            "SELECT name, canonical FROM gene WHERE (name[1] NOT IN (SELECT gene_name[1] FROM variant_feature)) \
            AND (name[1] IN (SELECT name[1] FROM gene GROUP BY name[1] HAVING COUNT (name[1]) > 1)) ORDER BY name"
        )
        res = curs.fetchall()
        for acc in res:
            req_url = '{0}/api/gene/{1}'.format(remote_addr, acc['name'][0])
            api_response = json.loads(
                http.request('GET', req_url,
                             headers=header).data.decode('utf-8'))
            for keys in api_response:
                if isinstance(keys, dict) and \
                       'canonical' in api_response[keys]:
                    if api_response[keys]['canonical'] is True and acc[
                            'canonical'] == 0:
                        if re.search(r'NM_\d+\.\d+', keys):
                            match_obj = re.search(r'(NM_\d+)\.\d+', keys)
                            nm_acc = match_obj.group(1)
                            # double check
                            if nm_acc == acc['name'][1]:
                                curs.execute(
                                    "UPDATE gene SET canonical = 'f' WHERE name[1] = %s",
                                    (acc['name'][0], ))
                                # log('INFO', "UPDATE gene SET canonical = 'f' WHERE name[1] = '{}'".format(acc['name'][0]))
                                curs.execute(
                                    "UPDATE gene SET canonical = 't' WHERE name[2] = %s",
                                    (acc['name'][1], ))
                                # log('INFO', "UPDATE gene SET canonical = 't' WHERE name[2] = '{}'".format(acc['name'][1]))
                                i += 1
                                log('INFO',
                                    'Updated gene {}'.format(acc['name'][0]))
        db.commit()

        log('INFO', '{} genes modified (canonical all)'.format(i))

    if args.update_np:
        curs.execute(
            "SELECT DISTINCT(name[1]) as hgnc FROM gene WHERE np = 'NP_000000.0'"
        )
        no_np = curs.fetchall()
        j = 0
        for gene in no_np:
            req_url = '{0}/api/gene/{1}'.format(remote_addr, gene['hgnc'])
            api_response = json.loads(
                http.request('GET', req_url,
                             headers=header).data.decode('utf-8'))
            for keys in api_response:
                if isinstance(keys, dict):
                    if 'RefProtein' in api_response[keys] and \
                            api_response[keys]['RefProtein'] != 'NP_000000.0':
                        if re.search(r'NP_\d+\.\d+',
                                     api_response[keys]['RefProtein']):
                            match_obj = re.search(r'(NM_\d+)\.\d+', keys)
                            nm_acc = match_obj.group(1)
                            np_acc = api_response[keys]['RefProtein']
                            curs.execute(
                                "UPDATE gene set np = %s WHERE name[2] = %s",
                                (np_acc, nm_acc))
                            log(
                                'INFO',
                                'Updating gene NP acc no of {0} to {1}'.format(
                                    nm_acc, np_acc))
                            j += 1
        db.commit()
        log('INFO', '{} NP acc no modified'.format(j))
    if args.update_uniprot or args.update_creation or args.update_nm:
        curs.execute(
            "SELECT  name[1] as HGNC, name[2] as nm, nm_version, np, uniprot_id, variant_creation FROM gene ORDER BY name"
        )
        res = curs.fetchall()
        k = l = m = n = 0
        o = curs.rowcount
        for gene in res:
            req_url = '{0}/api/gene/{1}'.format(remote_addr, gene['hgnc'])
            api_response = json.loads(
                http.request('GET', req_url,
                             headers=header).data.decode('utf-8'))
            l += 1
            if l % 1000 == 0:
                log('INFO', '{0}/{1} isoforms checked'.format(l, o))
            for keys in api_response:
                match_obj = re.search(r'^(NM_\d+)\.(\d+)$', keys)
                if match_obj:
                    nm_acc = match_obj.group(1)
                    # check again
                    if nm_acc == gene['nm']:
                        if args.update_nm:
                            nm_version = match_obj.group(2)
                            # log('DEBUG', '{0}dev:{1}-prod:{2}'.format(gene['hgnc'], int(nm_version), int(gene['nm_version'])))
                            if int(nm_version) != int(gene['nm_version']):
                                # no downgrade? y => downgrade
                                curs.execute(
                                    "UPDATE gene set nm_version = %s WHERE name[2] = %s",
                                    (nm_version, nm_acc))
                                log(
                                    'INFO',
                                    'Updating gene RefSeq NM accession version of {0} from {1} to {2}'
                                    .format(nm_acc, gene['nm_version'],
                                            nm_version))
                                n += 1
                        if 'UNIPROT' in api_response[
                                keys] and args.update_uniprot:
                            uniprot = api_response[keys]['UNIPROT']
                            if uniprot != gene['uniprot_id']:
                                curs.execute(
                                    "UPDATE gene set uniprot_id = %s WHERE name[2] = %s",
                                    (uniprot, nm_acc))
                                log(
                                    'INFO',
                                    'Updating gene UNIPROT id of {0} to {1}'.
                                    format(nm_acc, uniprot))
                                k += 1
                        if 'variantCreationTag' in api_response[
                                keys] and args.update_creation:
                            tag = api_response[keys]['variantCreationTag']
                            if tag != gene['variant_creation']:
                                curs.execute(
                                    "UPDATE gene set variant_creation = %s WHERE name[2] = %s",
                                    (tag, nm_acc))
                                log(
                                    'INFO',
                                    'Updating gene variantCreationTag of {0} to {1}'
                                    .format(nm_acc, tag))
                                m += 1
        db.commit()
        log('INFO', '{} UNIPROT IDs modified'.format(k))
        log('INFO', '{} variantCreationTag modified'.format(m))
        log('INFO', '{} RefSeq NM accession version modified'.format(n))
def main():
    """Check, gene by gene, that the remote server accepts variant creation.

    For every canonical isoform of the local ``gene`` table, a dummy
    ``c.1A>T`` variant is POSTed to ``<remote-server>/api/variant/create``.
    Depending on the answer, the ``variant_creation`` flag of the isoform
    (and its ``nm_version`` when the server reports a more recent RefSeq
    version) is updated in the local database.

    Command line arguments:
        -r/--remote-server: base URL of the server to query (required).
        -k/--api-key: API key, must be 43 characters long (required).
    """
    parser = argparse.ArgumentParser(
        description='Checks that genes accept variant creation',
        usage='python check_variant_creation.py [-r remote_server_url]')
    parser.add_argument('-r',
                        '--remote-server',
                        default='',
                        required=True,
                        help='base URL of the remote server')
    parser.add_argument(
        '-k',
        '--api-key',
        default='',
        required=True,
        help='Your API key visible on your profile page on the website.')

    args = parser.parse_args()
    remote_addr = args.remote_server
    # NOTE(review): log() is defined elsewhere and presumably exits on 'ERROR';
    # abort explicitly so that nothing below runs with invalid arguments
    # should log() ever return.
    if re.search(r'mobidetails\.iurc', remote_addr):
        log('ERROR',
            'This script is not intended to work with the production server')
        raise SystemExit(1)
    if len(args.api_key) != 43:
        log('ERROR', 'Invalid API key, please check it')
        raise SystemExit(1)
    api_key = args.api_key
    print()
    log('INFO', 'Working with server {}'.format(remote_addr))

    # get db connector and cursor
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    # remove the dummy variants - presumably leftovers of a previous run
    curs.execute("DELETE FROM variant_feature WHERE c_name = '1A>T'")
    db.commit()

    def set_variant_creation(status, nm_accession):
        # single, parameterized entry point for all variant_creation updates
        curs.execute(
            "UPDATE gene SET variant_creation = %s WHERE name[2] = %s",
            (status, nm_accession))

    # error message fragment -> variant_creation status; first match wins
    mapping_errors = (
        (r'cannot be mapped directly to genome build GRCh38',
         'hg38_mapping_default'),
        (r'does not seem to map correctly to hg19', 'hg19_mapping_default'),
        (r'with the variant position and intron', 'mapping_default'),
    )

    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                               ca_certs=certifi.where())
    md_url = '{0}/api/variant/create'.format(remote_addr)

    curs.execute(
        "SELECT DISTINCT(name), nm_version, variant_creation FROM gene WHERE canonical = 't' ORDER BY name"
    )
    can = curs.fetchall()
    num_can = curs.rowcount
    checked = 0
    vv_errors = 0
    md_errors = 0
    failed_genes = []
    for gene in can:
        print('.', end="", flush=True)
        checked += 1
        if checked % 500 == 0:
            log('INFO', '{0}/{1} genes checked'.format(checked, num_can))
        nm_acc = gene['name'][1]
        data = {
            'variant_chgvs': urllib.parse.quote('{0}.{1}:c.1A>T'.format(
                nm_acc, gene['nm_version'])),
            'caller': 'cli',
            'api_key': api_key
        }
        # reinitialise gene state before query
        set_variant_creation('ok', nm_acc)
        db.commit()
        try:
            md_response = json.loads(
                http.request('POST',
                             md_url,
                             headers=md_utilities.api_agent,
                             fields=data).data.decode('utf-8'))
            if 'mobidetails_error' in md_response:
                vv_errors += 1
                error_msg = md_response['mobidetails_error']
                log(
                    'WARNING',
                    'variant creation failed for gene {0} with error {1}'.
                    format(gene['name'], error_msg))
                new_nm_match_obj = re.search(
                    r'A more recent version of the selected reference sequence NM_\d+\.\d+ is available \((NM_\d+)\.(\d+)\)',
                    error_msg)
                if new_nm_match_obj:
                    nm_to_check = new_nm_match_obj.group(1)
                    new_ver = new_nm_match_obj.group(2)
                    if nm_to_check == nm_acc:
                        curs.execute(
                            "UPDATE gene SET nm_version = %s WHERE name[2] = %s",
                            (new_ver, nm_acc))
                    # recheck with the newer version; the variant is quoted
                    # exactly like in the first request
                    data['variant_chgvs'] = urllib.parse.quote(
                        '{0}.{1}:c.1A>T'.format(nm_acc, new_ver))
                    try:
                        md_response_2 = json.loads(
                            http.request('POST',
                                         md_url,
                                         headers=md_utilities.api_agent,
                                         fields=data).data.decode('utf-8'))
                        # gene['variant_creation'] is the value read before
                        # the reset performed at the top of the iteration
                        if 'mobidetails_id' in md_response_2 and gene[
                                'variant_creation'] != 'ok':
                            set_variant_creation('ok', nm_acc)
                            continue
                    except Exception:
                        md_errors += 1
                        failed_genes.append('{}'.format(gene['name'][0]))
                        continue
                for pattern, status in mapping_errors:
                    if re.search(pattern, error_msg):
                        set_variant_creation(status, nm_acc)
                        log(
                            'INFO',
                            'MD gene table updated with variant_creation = {}'
                            .format(status))
                        break
            elif 'mobidetails_id' in md_response and gene[
                    'variant_creation'] != 'ok':
                set_variant_creation('ok', nm_acc)
            db.commit()
        except Exception:
            log('ERROR', 'failed MD API call {}'.format(md_url))
            md_errors += 1
            failed_genes.append('{}'.format(gene['name'][0]))
            continue
    # the 'continue' paths above skip the per-gene commit: make sure the
    # updates of the last gene are not lost
    db.commit()
    log('INFO', '{0}/{1} genes reported a VV error'.format(vv_errors, num_can))
    log('INFO',
        '{0}/{1} genes triggered an MD error:'.format(md_errors, num_can))
    log('INFO', failed_genes)
Beispiel #7
0
def main():
    """Refresh UNIPROT ids and protein sizes stored in the ``gene`` table.

    For each isoform with a parsable NP accession number:

    * the EBI proteins API is queried with the RefSeq accession and the
      stored ``uniprot_id`` is replaced when the first returned accession
      differs;
    * the NCBI eutils ``efetch`` service is queried and ``prot_size`` is
      replaced when the size found in the answer differs from a non-NULL
      stored size.

    Command line arguments:
        -k/--ncbi-api-key: optional NCBI Entrez API key; when absent the
            eutils request is sent without any ``api_key`` parameter.
    """
    parser = argparse.ArgumentParser(
        description='Update UNIPROT ids and protein size',
        usage='python check_uniprot_ids.py [-k NCBI_API_KEY]')
    parser.add_argument(
        '-k',
        '--ncbi-api-key',
        default=None,
        required=False,
        help='NCBI Entrez API key. If not provided, 3rd method is not executed'
    )
    args = parser.parse_args()
    ncbi_api_key = None
    if args.ncbi_api_key is not None:
        if not re.search(r'\w+', args.ncbi_api_key):
            sys.exit('ERROR: Invalid NCBI API key, please check')
        ncbi_api_key = args.ncbi_api_key
    # get db connector and cursor
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)

    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                               ca_certs=certifi.where())
    # the original statement had a dangling WHERE and was not valid SQL
    curs.execute(
        "SELECT name, np, uniprot_id, prot_size FROM gene ORDER BY name")
    res = curs.fetchall()
    count = curs.rowcount
    checked = 0  # isoforms seen so far, for progress reporting
    updated = 0  # isoforms for which at least one column was modified
    for gene in res:
        print('.', end="", flush=True)
        checked += 1
        if checked % 500 == 0:
            log('INFO', '{0}/{1} genes checked'.format(checked, count))
        match_obj = re.search(r'(NP_\d+)\.\d', gene['np'] or '')
        if not match_obj:
            log('WARNING', 'pb w/ NP acc no {}'.format(gene['np']))
            continue
        np_acc = match_obj.group(1)
        hgnc, nm_acc = gene['name'][0], gene['name'][1]
        modified = False

        # --- UNIPROT id, from the EBI proteins API ---
        uniprot_url = 'https://www.ebi.ac.uk/proteins/api/proteins/refseq:{}?offset=0&size=100&reviewed=true'.format(
            np_acc)
        uniprot_response = None
        try:
            uniprot_response = json.loads(
                http.request('GET',
                             uniprot_url,
                             headers={
                                 'Accept': 'application/json'
                             }).data.decode('utf-8'))
            accession = uniprot_response[0]['accession']
        except Exception:
            # request failure, invalid JSON or an answer without accession
            log(
                'WARNING', 'no UNIPROT ID {0} for {1} - {2}'.format(
                    uniprot_response, nm_acc, hgnc))
        else:
            if not accession:
                log(
                    'WARNING',
                    'md_uniprot_id: {0} - RefSeq: {1} - {2} - {3} :not checked'
                    .format(gene['uniprot_id'], gene['np'], nm_acc, hgnc))
            elif gene['uniprot_id'] != accession:
                curs.execute(
                    "UPDATE gene SET uniprot_id = %s WHERE name[2] = %s",
                    (accession, nm_acc))
                log(
                    'WARNING',
                    'Updated gene UNIPROT ID of {0} - {1} from {2} to {3}'
                    .format(hgnc, nm_acc, gene['uniprot_id'], accession))
                modified = True

        # --- protein size, from NCBI eutils ---
        ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id={0}&rettype=gp&complexity=3'.format(
            np_acc)
        if ncbi_api_key is not None:
            # never send the literal string "None" as an API key
            ncbi_url = '{0}&api_key={1}'.format(ncbi_url, ncbi_api_key)
        prot_size = None
        try:
            eutils_response = http.request('GET',
                                           ncbi_url).data.decode('utf-8')
        except Exception:
            log(
                'WARNING',
                'no protein size w/ got from eutils NP acc no {0}, eutils URL:{1}'
                .format(gene['np'], ncbi_url))
        else:
            prot_match = re.search(r'Protein\s+1\.\.(\d+)', eutils_response)
            if prot_match:
                prot_size = prot_match.group(1)
        # only an already known (non-NULL) size is ever replaced
        if prot_size is not None and \
                gene['prot_size'] is not None and \
                int(prot_size) != int(gene['prot_size']):
            curs.execute(
                "UPDATE gene SET prot_size = %s WHERE name[2] = %s",
                (prot_size, nm_acc))
            log(
                'WARNING',
                'Updated protein size for gene {0} - {1} - {2} to {3}'.
                format(hgnc, nm_acc, gene['uniprot_id'], prot_size))
            modified = True

        if modified:
            updated += 1
    log('INFO', '{} isoforms updated'.format(updated))

    db.commit()
Beispiel #8
0
def main():
    """Insert gnomAD per-gene metrics into the ``gene_annotation`` table.

    The tab-separated file given with ``-d`` is read line by line. A line
    yields a new ``gene_annotation`` row when its first column matches the
    HGNC name (``name[1]``) of a canonical isoform of the ``gene`` table and
    no row exists yet for that gene in ``gene_annotation``. Nine columns of
    the line are inserted, rounded to 2 decimals when they are numeric and
    untouched otherwise.

    Command line arguments:
        -d/--directory: despite its name, the path to the gnomAD metrics
            file itself (checked with ``os.path.isfile``).
    """
    parser = argparse.ArgumentParser(
        description='Insert gnomAD data into MD',
        usage=
        'python insert_gnomad.py [-d path/to/dir/containing/gnomad.v2.1.1.lof_metrics.by_gene.txt]'
    )
    parser.add_argument(
        '-d',
        '--directory',
        default='',
        required=True,
        help='Path to the directory containing the gnomAD metrics by gene file'
    )
    args = parser.parse_args()
    # get file
    if not os.path.isfile(args.directory):
        sys.exit('Invalid input path, please check your command')
    gnomad_file = args.directory

    # (label, column index in the gnomAD line), in the order of insertion
    metric_columns = (
        ('synoe', 13),
        ('synlower', 24),
        ('synupper', 25),
        ('misoe', 4),
        ('mislower', 26),
        ('misupper', 27),
        ('lofoe', 23),
        ('loflower', 28),
        ('lofupper', 29),
    )
    min_fields = max(index for _, index in metric_columns) + 1

    def two_decimals(raw_value):
        # numeric values are rounded to 2 decimals; anything else is
        # inserted as found in the file
        try:
            return "{:.2f}".format(float(raw_value))
        except ValueError:
            return raw_value

    # get db connector and cursor
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    inserted = 0

    # the context manager guarantees that the file is closed
    with open(gnomad_file) as gnomad_handle:
        for gene_line in gnomad_handle:
            fields = gene_line.split("\t")
            curs.execute(  # exists in MD?
                "SELECT name FROM gene WHERE name[1] = %s AND canonical = 't'",
                (fields[0], ))
            md_gene = curs.fetchone()
            if md_gene is None:
                continue
            curs.execute(  # exists in table gene_annotation?
                "SELECT DISTINCT(gene_name[2]) FROM gene_annotation WHERE gene_name[1] = %s",
                (fields[0], ))
            if curs.fetchone() is not None:
                continue
            if len(fields) < min_fields:
                log(
                    'WARNING',
                    'Truncated gnomAD line for gene {}, skipped'.format(
                        fields[0]))
                continue
            # does not exist => creation
            inserted += 1
            # PostgreSQL array literal: {"HGNC","NM"}
            post_gene = '{"' + md_gene['name'][0] + '","' + md_gene[
                'name'][1] + '"}'
            row = [post_gene]
            row.extend(two_decimals(fields[index])
                       for _, index in metric_columns)
            curs.execute(
                "INSERT INTO gene_annotation VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                row)

    log('INFO', '{} annotations added'.format(inserted))

    db.commit()
Beispiel #9
0
def main():
    """Update RefSeq accession versions of the ``gene`` table from VariantValidator.

    Script meant to be croned, e.g. after an update of the uta docker.
    For each canonical isoform, the VariantValidator ``gene2transcripts``
    tool is queried. For every transcript accession it returns that is also
    known in the ``gene`` table, the highest version reported is compared
    with the stored ``nm_version``. When they differ, a dummy ``c.1A>T``
    variant is validated on GRCh38 with the new version, and ``nm_version``
    is updated unless one of the blocking validation warnings is returned.

    Example of gene2transcripts call:
    https://rest.variantvalidator.org/VariantValidator/tools/gene2transcripts/NM_130786?content-type=application%2Fjson
    """
    vv_url_base = "https://rest.variantvalidator.org"

    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    # a single connection pool is shared by all the requests
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                               ca_certs=certifi.where())
    # validation warnings that forbid switching to the new version
    blocking_warning = re.compile(
        r'cannot be mapped directly to genome build'
        r'|No transcript definition for'
        r'|No transcripts found'
        r'|expected one of')

    curs.execute(  # get genes
        "SELECT name, nm_version FROM gene WHERE canonical = 't' ORDER BY name"
    )
    genes = curs.fetchall()
    count = curs.rowcount
    for checked, gene in enumerate(genes, start=1):
        if checked % 500 == 0:
            log('INFO', '{0}/{1} genes checked'.format(checked, count))
        # get VV info for the gene
        vv_url = "{0}/VariantValidator/tools/gene2transcripts/{1}?content-type=application/json".format(
            vv_url_base, gene['name'][1])
        try:
            vv_data = json.loads(
                http.request('GET', vv_url).data.decode('utf-8'))
        except Exception:
            log('WARNING', 'No value for {0}'.format(gene['name'][0]))
            continue
        if 'transcripts' in vv_data:
            # accession without version -> list of versions known by VV
            ts_dict = {}
            for transcript in vv_data['transcripts']:
                match_object = re.search(r'^(N[MR]_\d+)\.(\d{1,2})',
                                         transcript['reference'])
                if match_object:
                    ts_dict.setdefault(match_object.group(1),
                                       []).append(match_object.group(2))
            for nm, versions in ts_dict.items():
                # exploring inconsistent NMs
                curs.execute("SELECT nm_version FROM gene WHERE name[2] = %s",
                             (nm, ))
                res_nm = curs.fetchone()
                if not res_nm:
                    continue
                # versions are strings: compare them numerically, otherwise
                # '9' would be considered higher than '10'
                max_vv_nm = max(versions, key=int)
                if int(res_nm[0]) != int(max_vv_nm):
                    # test that the new transcript version works before
                    # recording it
                    vv_url_var = "{0}/VariantValidator/variantvalidator/GRCh38/{1}.{2}:c.1A>T/all?content-type=application/json".format(
                        vv_url_base, nm, max_vv_nm)
                    log('DEBUG',
                        'Calling VariantValidator API: {}'.format(vv_url_var))
                    try:
                        vv_check = json.loads(
                            http.request('GET',
                                         vv_url_var).data.decode('utf-8'))
                    except Exception:
                        log('WARNING',
                            'No VV result for {0}.{1}'.format(nm, max_vv_nm))
                        continue
                    if not isinstance(vv_check, dict):
                        # unexpected answer: never update without validation
                        log('WARNING',
                            'No VV result for {0}.{1}'.format(nm, max_vv_nm))
                        continue
                    noupdate = False
                    for entry in vv_check.values():
                        # top-level values that are not dicts (presumably
                        # metadata) cannot carry validation warnings
                        if not isinstance(entry, dict) or \
                                'validation_warnings' not in entry:
                            continue
                        for warning in entry['validation_warnings']:
                            if blocking_warning.search(warning):
                                log(
                                    'WARNING',
                                    "Cannot update gene {0} from {1} to {2} because of {3}"
                                    .format(gene['name'][0], res_nm[0],
                                            max_vv_nm, warning))
                                noupdate = True
                                break
                    if not noupdate:
                        curs.execute(
                            "UPDATE gene SET nm_version = %s WHERE name[2] = %s",
                            (max_vv_nm, nm))
                        log(
                            'INFO',
                            "NM UPDATE: gene {0} - {1} modified from {2} to {3}"
                            .format(gene['name'][0], nm, res_nm[0], max_vv_nm))
                db.commit()

        print('.', end="", flush=True)
def main():
    parser = argparse.ArgumentParser(
        description='Check isoforms differences between 2 DBs',
        usage=
        'python check_isoforms_differences.py -k md_api_key -nk ncbi_api_key')
    parser.add_argument(
        '-k',
        '--api-key',
        default=None,
        required=True,
        help='Your API key visible on your profile page on the website.')
    parser.add_argument('-nk',
                        '--ncbi-api-key',
                        default=None,
                        required=True,
                        help='NCBI Entrez API key.')
    parser.add_argument('-md',
                        '--diff-md',
                        default='',
                        required=False,
                        help='Check differences between MD Dev and Prod',
                        action='store_true')
    parser.add_argument('-ncbi',
                        '--diff-ncbi',
                        default='',
                        required=False,
                        help='Check differences between NCBI RefSeq and MD',
                        action='store_true')
    ncbi_api_key = None
    args = parser.parse_args()
    if args.ncbi_api_key is not None:
        if not re.search(r'\w+', args.ncbi_api_key):
            log('ERROR', 'Invalid NCBI API key, please check')
        else:
            ncbi_api_key = args.ncbi_api_key
    # get db connector and cursor
    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    username = None
    if len(args.api_key) != 43:
        log('ERROR', 'Invalid API key, please check it')
    else:
        api_key = args.api_key
        # user
        curs.execute("SELECT username FROM mobiuser WHERE api_key = %s",
                     (api_key, ))
        res_user = curs.fetchone()
        if res_user is None:
            log('ERROR', 'Unknown API key')
        username = res_user['username']
        log('INFO', 'User: {}'.format(username))

    db = get_db()
    curs = db.cursor(cursor_factory=psycopg2.extras.DictCursor)
    if args.diff_md:
        # meant to be ran on prod server
        diff = {}
        # get genes w/ more than one isoform
        curs.execute(
            "SELECT name[1] AS hgnc, name[2] AS nm, nm_version FROM gene WHERE canonical = 't' AND name[1] IN (SELECT name[1] FROM gene GROUP BY name[1] HAVING COUNT(name[1]) > 1) ORDER by name[1]"
        )
        res = curs.fetchall()
        i = 0
        for gene in res:
            i += 1
            log('INFO', 'Treating gene {0} - #{1}'.format(gene['hgnc'], i))
            full_nm = '{0}.{1}'.format(gene['nm'], gene['nm_version'])
            base_url = "http://10.34.20.79"
            md_url = '{0}/MD/api/gene/{1}'.format(base_url, gene['hgnc'])
            http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                                       ca_certs=certifi.where())
            try:
                md_data = json.loads(
                    http.request('GET', md_url).data.decode('utf-8'))
            except Exception:
                log('WARNING', 'MD not responding for {}'.format(gene['hgnc']))
            if full_nm in md_data and \
                    md_data[full_nm]['canonical'] is True:
                # log('INFO', 'No change for {}'.format(gene['hgnc']))
                continue
            elif full_nm not in md_data:
                log('DEBUG', '{0}-{1}'.format(md_data, full_nm))
            for key in md_data:
                matchobj = re.search(r'^(NM_\d+)\.\d+$', key)
                if matchobj:
                    new_nm = matchobj.group(1)
                    if md_data[key]['canonical'] is True:
                        diff[gene['hgnc']] = {
                            'old_can': full_nm,
                            'new_can': key
                        }
                        log(
                            'INFO',
                            'updating canonical for {0}: {1} instead of {2}'.
                            format(gene['hgnc'], key, full_nm))
                        curs.execute(
                            "UPDATE gene SET canonical = 'f' WHERE name[1] = %s",
                            (gene['hgnc'], ))
                        curs.execute(
                            "UPDATE gene SET canonical = 't' WHERE name[2] = %s",
                            (new_nm, ))
                        db.commit()
                        cmd = "python3 update_vars_when_iso_change.py -k {0} -g {1}".format(
                            api_key, gene['hgnc'])
                        returned_value = subprocess.call(cmd, shell=True)
                        log(
                            'INFO',
                            'Variants update returned value for {0}: {1}'.
                            format(gene['hgnc'], returned_value))
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(diff)