Example #1
0
 def load_ensembl2interpro(self):
     """Build the InterPro annotation mapping and re-key it via convert2entrez.

     Reads columns (1, 4, 5, 6) of ``gene_ensembl__prot_interpro__dm.txt``
     under DATA_FOLDER. Every record becomes a dict with the keys ``id``,
     ``short_desc`` and ``desc``; the value stored for each key is then
     wrapped as ``{'interpro': ...}``.

     Returns:
         Whatever ``self.convert2entrez`` returns for the wrapped mapping.
     """
     def _to_record(cols):
         return {'id': cols[0], 'short_desc': cols[1], 'desc': cols[2]}

     def _wrap(value):
         return {'interpro': value}

     datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt')
     load_start(datafile)
     mapping = dict_nodup(tab2dict(datafile, (1, 4, 5, 6), 0))
     mapping = value_convert(mapping, _to_record)
     # traverse_list=False: presumably wraps the whole value once instead of
     # each list element -- confirm against value_convert
     mapping = value_convert(mapping, _wrap, traverse_list=False)
     load_done('[%d]' % len(mapping))
     return self.convert2entrez(mapping)
Example #2
0
 def load_ensembl2pos(self):
     """Build the genomic-position mapping and re-key it via convert2entrez.

     Reads columns (1, 3, 4, 5, 6) of ``gene_ensembl__gene__main.txt`` under
     DATA_FOLDER, keeping only lines accepted by ``_not_LRG``. Every record
     becomes a dict with ``chr``, ``start``, ``end`` and ``strand`` (the last
     three converted to int); the value stored for each key is then wrapped
     as ``{'genomic_pos': ...}``.

     Returns:
         Whatever ``self.convert2entrez`` returns for the wrapped mapping.
     """
     def _to_position(cols):
         return {
             'chr': cols[2],
             'start': int(cols[0]),
             'end': int(cols[1]),
             'strand': int(cols[3]),
         }

     def _wrap(value):
         return {'genomic_pos': value}

     datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
     load_start(datafile)
     raw = tab2dict(datafile, (1, 3, 4, 5, 6), 0, includefn=_not_LRG)
     positions = dict_nodup(raw)
     positions = value_convert(positions, _to_position)
     positions = value_convert(positions, _wrap, traverse_list=False)
     load_done('[%d]' % len(positions))
     return self.convert2entrez(positions)
Example #3
0
 def load_ensembl2pos(self):
     """Load genomic positions from the Ensembl gene dump.

     Columns (1, 3, 4, 5, 6) of ``gene_ensembl__gene__main.txt`` are read
     (lines rejected by ``_not_LRG`` are skipped). Each record is turned into
     ``{'chr', 'start', 'end', 'strand'}`` with the numeric parts cast to int,
     and each key's value is nested under ``'genomic_pos'``.

     Returns:
         The result of ``self.convert2entrez`` on that mapping.
     """
     def _as_pos(rec):
         return {
             'chr': rec[2],
             'start': int(rec[0]),
             'end': int(rec[1]),
             'strand': int(rec[3]),
         }

     def _nest(pos):
         return {'genomic_pos': pos}

     path = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
     load_start(path)
     by_gene = dict_nodup(
         tab2dict(path, (1, 3, 4, 5, 6), 0, includefn=_not_LRG))
     by_gene = value_convert(by_gene, _as_pos)
     by_gene = value_convert(by_gene, _nest, traverse_list=False)
     load_done('[%d]' % len(by_gene))
     return self.convert2entrez(by_gene)
Example #4
0
 def load_ensembl2interpro(self):
     """Load InterPro annotations from the Ensembl protein/InterPro dump.

     Columns (1, 4, 5, 6) of ``gene_ensembl__prot_interpro__dm.txt`` are
     read. Each record is turned into ``{'id', 'short_desc', 'desc'}`` and
     each key's value is nested under ``'interpro'``.

     Returns:
         The result of ``self.convert2entrez`` on that mapping.
     """
     def _as_entry(rec):
         return {'id': rec[0], 'short_desc': rec[1], 'desc': rec[2]}

     def _nest(entries):
         return {'interpro': entries}

     path = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt')
     load_start(path)
     raw = tab2dict(path, (1, 4, 5, 6), 0)
     by_gene = dict_nodup(raw)
     by_gene = value_convert(by_gene, _as_entry)
     by_gene = value_convert(by_gene, _nest, traverse_list=False)
     load_done('[%d]' % len(by_gene))
     return self.convert2entrez(by_gene)
Example #5
0
 def _cvt(pli):
     """Group ``pli`` with ``list2dict(pli, 2)`` and wrap it under 'pathway'.

     Every grouped value is passed through ``_inner_cvt``; values that end up
     as lists are sorted in place by their ``"id"`` entry.
     """
     grouped = value_convert(list2dict(pli, 2), _inner_cvt)
     for source in grouped:
         entries = grouped[source]
         if not isinstance(entries, list):
             continue
         entries.sort(key=lambda entry: entry["id"])
     return {'pathway': grouped}
Example #6
0
 def _cvt(pli):
     """Group ``pli`` with ``list2dict(pli, 2)`` and wrap it under 'pathway'.

     Every grouped value is passed through ``_inner_cvt``; values that end up
     as lists are sorted in place by their ``"id"`` entry.

     Returns:
         ``{'pathway': <grouped mapping>}``.
     """
     _d = list2dict(pli, 2)
     _d = value_convert(_d, _inner_cvt)
     for p_source in _d:
         entries = _d[p_source]
         if isinstance(entries, list):
             # A bare .sort() compares the entries themselves, which raises
             # TypeError on Python 3 when they are dicts; order explicitly.
             # NOTE(review): assumes every entry produced by _inner_cvt
             # carries an "id" key -- confirm.
             entries.sort(key=lambda e: e["id"])
     return {'pathway': _d}
Example #7
0
def _map_line_to_json(fields):
    vid = fields[0].split(":")
    chrom = re.search(r'[1-9]+', vid[0]).group()

    if chrom == '23':
        chrom = chrom.replace('23', 'X')
    HGVS = "chr%s:%s" % (chrom, vid[1])
    # load as json data
    if HGVS is None:
        return

    one_snp_json = {
        "_id": HGVS,
        "emv": {
            "gene": fields[2],
            "variant_id": fields[3],
            "exon": fields[4],
            "egl_variant": fields[5],
            "egl_protein": fields[6],
            "egl_classification": fields[7],
            "egl_classification_date": fields[8],
            "hgvs": fields[9].split(" | "),
            "clinvar_rcv": fields[10],
        }
    }

    return unlist(dict_sweep(value_convert(one_snp_json), vals=[""]))
Example #8
0
def load_x(idx, fieldname, cvt_fn=None):
    """Load one column of ``idmapping_selected.tab.gz`` keyed by gene id.

    Each row contributes (column 2, column 19, column ``idx``), commented in
    the source as GeneID, Ensembl(Gene) and the target value. Rows are
    expanded by ``dupline_seperator`` with ``'; '`` as separator. A value is
    attached to its GeneID when present; otherwise to every GeneID known for
    its Ensembl id, or to the Ensembl id itself when no GeneID is known.

    Args:
        idx: column number of the target value.
        fieldname: key under which each gene's value(s) are stored.
        cvt_fn: optional callable applied to every non-empty target value.

    Returns:
        Mapping of gene id -> ``{fieldname: value}``; list values are sorted.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()
    xli = []
    for ld in tabfile_feeder(DATAFILE, header=1):
        ld = listitems(ld, *(2, 19, idx))    # GeneID Ensembl(Gene) target_value
        for value in dupline_seperator(dupline=ld,
                                       dup_sep='; '):
            xli.append(value)

    # Ensembl id -> [GeneID, ...], built only from rows that carry both ids.
    ensembl2geneid = list2dict(
        list_nondup([(x[1], x[0]) for x in xli if x[0] != '' and x[1] != '']),
        0, alwayslist=True)
    xli2 = []
    for entrez_id, ensembl_id, x_value in xli:
        if x_value:
            if cvt_fn:
                x_value = cvt_fn(x_value)
            if entrez_id:
                xli2.append((entrez_id, x_value))
            elif ensembl_id:
                entrez_id = ensembl2geneid.get(ensembl_id, None)
                if entrez_id:
                    for _eid in entrez_id:
                        xli2.append((_eid, x_value))
                else:
                    xli2.append((ensembl_id, x_value))

    gene2x = list2dict(list_nondup(xli2), 0)

    def fn(value):
        # isinstance() replaces ``type(value) is types.ListType``, which
        # only exists on Python 2.
        return {fieldname: sorted(value) if isinstance(value, list) else value}

    gene2x = value_convert(gene2x, fn, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(t0)))

    return gene2x
Example #9
0
 def load_ensembl2pfam(self):
     """Build the Pfam annotation mapping and re-key it via convert2entrez.

     Reads columns (1, 4) of ``gene_ensembl__prot_pfam__dm.txt`` under
     DATA_FOLDER and wraps the value stored for each key as
     ``{'pfam': ...}``.

     Returns:
         Whatever ``self.convert2entrez`` returns for the wrapped mapping.
     """
     def _wrap(value):
         return {'pfam': value}

     datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt')
     load_start(datafile)
     mapping = dict_nodup(tab2dict(datafile, (1, 4), 0))
     mapping = value_convert(mapping, _wrap, traverse_list=False)
     load_done('[%d]' % len(mapping))
     return self.convert2entrez(mapping)
Example #10
0
 def load_ensembl2pfam(self):
     """Load Pfam annotations from the Ensembl protein/Pfam dump.

     Columns (1, 4) of ``gene_ensembl__prot_pfam__dm.txt`` are read and each
     key's value is nested under ``'pfam'``.

     Returns:
         The result of ``self.convert2entrez`` on that mapping.
     """
     def _nest(ids):
         return {'pfam': ids}

     path = os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt')
     load_start(path)
     raw = tab2dict(path, (1, 4), 0)
     by_gene = value_convert(dict_nodup(raw), _nest, traverse_list=False)
     load_done('[%d]' % len(by_gene))
     return self.convert2entrez(by_gene)
Example #11
0
 def _load_ensembl_2taxid(self):
     """Return the ensembl2taxid mapping with integer taxids.

     Reads columns (0, 1) of ``gene_ensembl__translation__main.txt`` under
     DATA_FOLDER (key index 1, lines filtered by ``_not_LRG``) and converts
     every value with ``int``.
     """
     datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
     load_start(datafile)
     raw = tab2dict(datafile, (0, 1), 1, includefn=_not_LRG)
     # values come out of the file as text; the taxid must be an integer
     taxid_by_gene = value_convert(dict_nodup(raw), int)
     load_done('[%d]' % len(taxid_by_gene))
     return taxid_by_gene
Example #12
0
 def _load_ensembl_2taxid(self):
     """Load ensembl2taxid from the Ensembl translation dump.

     Columns (0, 1) of ``gene_ensembl__translation__main.txt`` are read with
     key index 1, skipping lines rejected by ``_not_LRG``; each value is cast
     to int before the mapping is returned.
     """
     def _to_int(taxid):
         return int(taxid)

     path = os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt')
     load_start(path)
     mapping = dict_nodup(
         tab2dict(path, (0, 1), 1, includefn=_not_LRG))
     # taxid has to be an integer, not the text read from the file
     mapping = value_convert(mapping, _to_int)
     load_done('[%d]' % len(mapping))
     return mapping
Example #13
0
def load_genedoc(self=None):
    """Merge the data of all reporter modules into one 'reporter' mapping.

    Each module in ``reporter_modules`` contributes the dict returned by its
    ``loaddata()``; the per-key dicts are combined with ``merge_dict`` using
    the collected keys as labels, and every resulting value is wrapped as
    ``{'reporter': ...}``.
    """
    collected = {}
    for mod in reporter_modules:
        collected.update(mod.loaddata())

    platforms = collected.keys()
    per_platform = [collected[name] for name in platforms]
    merged = merge_dict(per_platform, platforms)

    def _wrap(value):
        return {'reporter': value}

    return value_convert(merged, _wrap, traverse_list=False)
Example #14
0
def load_genedoc(self=None):
    """Build the 'reporter' documents from every module in reporter_modules.

    The dicts returned by each module's ``loaddata()`` are pooled, merged
    with ``merge_dict`` (one input dict per pooled key, the keys passed as
    the second argument) and each value is nested under ``'reporter'``.
    """
    pooled = {}
    for source in reporter_modules:
        pooled.update(source.loaddata())

    names = pooled.keys()
    combined = merge_dict([pooled[n] for n in names], names)

    def _nest(reporters):
        return {'reporter': reporters}

    combined = value_convert(combined, _nest, traverse_list=False)
    return combined
Example #15
0
def load_pharmgkb():
    """Load the PharmGKB mapping from ``genes.tsv`` inside ``genes.zip``.

    Columns (0, 1) are read with key index 1 and one header line; rows whose
    column 1 is empty are excluded. Each value is wrapped as
    ``{'pharmgkb': ...}``.
    """
    def _has_key(ld):
        return ld[1] != ''

    def _wrap(value):
        return {'pharmgkb': value}

    print('DATA_FOLDER: ' + DATA_FOLDER)
    archive = os.path.join(DATA_FOLDER, 'genes.zip')
    load_start(archive)
    mapping = tab2dict((archive, 'genes.tsv'), (0, 1), 1,
                       header=1, includefn=_has_key)
    mapping = value_convert(mapping, _wrap, traverse_list=False)
    load_done('[%d]' % len(mapping))
    return mapping
 def load_ensembl2pos(self):
     """Build the genomic-position mapping and re-key it via convert2entrez.

     Reads ``gene_ensembl__gene__main.txt`` under DATA_FOLDER, skipping lines
     rejected by ``_not_LRG``. Column 1 is requested twice: once as the dict
     key and once so the gene id is available inside the position record.
     Each record becomes ``{'ensemblgene', 'chr', 'start', 'end', 'strand'}``
     (start/end/strand cast to int) and each key's value is wrapped as
     ``{'genomic_pos': ...}``.
     """
     def _to_position(cols):
         return {
             'ensemblgene': cols[0],
             'chr': cols[3],
             'start': int(cols[1]),
             'end': int(cols[2]),
             'strand': int(cols[4]),
         }

     def _wrap(value):
         return {'genomic_pos': value}

     datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
     load_start(datafile)
     raw = tab2dict(datafile, (1, 1, 3, 4, 5, 6), 0, includefn=_not_LRG)
     positions = dict_nodup(raw)
     positions = value_convert(positions, _to_position)
     positions = value_convert(positions, _wrap, traverse_list=False)
     load_done('[%d]' % len(positions))
     return self.convert2entrez(positions)
def load_uniprot():
    """Load the gene -> UniProt mapping from ``idmapping_selected.tab.gz``.

    Columns 0, 1, 2 and 18 of every row are used (commented in the source as
    UniProtKB-AC, UniProtKB-ID, GeneID, Ensembl(Gene)). The second column is
    replaced by ``get_uniprot_section`` of its value. An accession is
    attached to its GeneID when present; otherwise to every GeneID known for
    its Ensembl id, or to the Ensembl id itself when none is known.

    Returns:
        The mapping keyed by gene id after ``_dict_convert`` has been applied
        to each key's list of records.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    datafile = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(datafile)
    started = time.time()

    # rows of (uniprot_acc, section, entrez_id, ensembl_id)
    rows = []
    feeder = tabfile_feeder(datafile, header=1,
                            assert_column_no=VALID_COLUMN_NO)
    for line in feeder:
        # UniProtKB-AC UniProtKB-ID GeneID Ensembl(Gene)
        picked = listitems(line, 0, 1, 2, 18)
        # GeneID and EnsemblID columns may have duplicates
        expanded = dupline_seperator(dupline=picked,
                                     dup_idx=[2, 3],
                                     dup_sep='; ')
        for rec in expanded:
            rec = list(rec)
            rec[1] = get_uniprot_section(rec[1])
            rows.append(tuple(rec))

    id_pairs = [(row[3], row[2]) for row in rows
                if row[2] != '' and row[3] != '']
    ensembl2geneid = list2dict(id_pairs, 0, alwayslist=True)

    resolved = []
    for uniprot_acc, section, entrez_id, ensembl_id in rows:
        if entrez_id:
            resolved.append((uniprot_acc, section, entrez_id))
            continue
        if not ensembl_id:
            continue
        mapped_ids = ensembl2geneid.get(ensembl_id, None)
        if mapped_ids:
            # the ensembl id can be mapped to one or more entrez ids
            for _eid in mapped_ids:
                resolved.append((uniprot_acc, section, _eid))
        else:
            # no entrez id known: fall back to the ensembl id
            resolved.append((uniprot_acc, section, ensembl_id))

    gene2uniprot = list2dict(list_nondup(resolved), 2, alwayslist=True)
    gene2uniprot = value_convert(gene2uniprot, _dict_convert,
                                 traverse_list=False)
    load_done('[%d, %s]' % (len(gene2uniprot), timesofar(started)))
    return gene2uniprot
Example #18
0
 def _load_ensembl2name(self):
     """Load the ensembl gene -> symbol/name mapping.

     Reads columns (1, 2, 7) of ``gene_ensembl__gene__main.txt`` under
     DATA_FOLDER, skipping lines rejected by ``_not_LRG``. For each record a
     dict is built holding ``symbol`` and/or ``name``; a field is left out
     when its stripped text is ``''`` or ``'\\N'``. The name is whatever
     ``SubStr`` returns for the text with the end marker ``' [Source:'``.
     """
     blank_values = ['', '\\N']

     def _to_doc(rec):
         doc = {}
         symbol = rec[0].strip()
         if symbol not in blank_values:
             doc['symbol'] = symbol
         raw_name = rec[1].strip()
         if raw_name not in blank_values:
             name = SubStr(raw_name, '', ' [Source:').strip()
             if name:
                 doc['name'] = name
         return doc

     datafile = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
     load_start(datafile)
     raw = tab2dict(datafile, (1, 2, 7), 0, includefn=_not_LRG)
     names = value_convert(raw, _to_doc)
     load_done('[%d]' % len(names))
     return names
Example #19
0
    def _load_ensembl2name(self):
        """Load the ensembl gene -> symbol/name mapping.

        Columns (1, 2, 7) of ``gene_ensembl__gene__main.txt`` are read,
        skipping lines rejected by ``_not_LRG``. Each record yields a dict
        with ``symbol`` and/or ``name``; a field is omitted when its stripped
        text is ``''`` or ``'\\N'``. The name is the ``SubStr`` result for
        the text with the end marker ``' [Source:'``.
        """
        placeholders = ['', '\\N']

        def _build(rec):
            entry = {}
            symbol_text = rec[0].strip()
            if symbol_text not in placeholders:
                entry['symbol'] = symbol_text
            name_text = rec[1].strip()
            if name_text in placeholders:
                return entry
            trimmed = SubStr(name_text, '', ' [Source:').strip()
            if trimmed:
                entry['name'] = trimmed
            return entry

        path = os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt')
        load_start(path)
        mapping = tab2dict(path, (1, 2, 7), 0, includefn=_not_LRG)
        mapping = value_convert(mapping, _build)
        load_done('[%d]' % len(mapping))
        return mapping
Example #20
0
    def convert2entrez(self, ensembl2x):
        '''Re-key a dict of ensembl gene ids onto the matching entrezgene ids.

        Ensembl ids without a matching entrez gene id keep the ensembl id as
        their key. Entries are finally normalised with ``dict_nodup`` (single
        dict) or ``dict_attrmerge`` (several ensembl genes for one entrez id).
        '''
        if not self.ensembl2entrez_li:
            self._load_ensembl2entrez_li()
        if not self.ensembl_main:
            self.ensembl_main = self.load_ensembl_main()

        ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
        entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)

        # report how the two id spaces overlap
        source_ids = set(ensembl2x)
        mapped_ids = set(ensembl2entrez)
        unmatched_ids = source_ids - mapped_ids
        print('# of ensembl IDs in total: %d' % len(source_ids | mapped_ids))
        print('# of ensembl IDs match entrez Gene IDs: %d' %
              len(source_ids & mapped_ids))
        print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' %
              len(unmatched_ids))

        def _lookup(eid, taxid=None):
            # Copy the value: several entrezgene ids can match the same
            # ensembl gene, for example
            #      ENSMUSG00000027104 --> (11909, 100047997)
            # and they must not share one object.
            return copy.copy(ensembl2x.get(eid, {}))

        # all genes with a matched entrez id
        data = value_convert(entrez2ensembl, _lookup)

        # genes without a matched entrez id are stored under their ensembl id
        for eid in unmatched_ids:
            data[eid] = ensembl2x[eid]

        for key in data:
            entry = data[key]
            if isinstance(entry, dict):
                data[key] = dict_nodup(entry, sort=True)
            else:
                # one entrez gene matches multiple ensembl genes
                data[key] = dict_attrmerge(entry, removedup=True, sort=True)

        return data
Example #21
0
def _map_line_to_json(item):
    """Yield one geno2mp document per ALT allele of a VCF record.

    Each document is ``{"_id": <hgvs>, "geno2mp": {"hpo_count": ...}}``
    passed through ``value_convert``, ``unlist`` and ``dict_sweep(.., [None])``.
    Generation stops at the first allele for which ``get_hgvs_from_vcf``
    returns no HGVS id.
    """
    chrom, pos, ref = item.CHROM, item.POS, item.REF
    info = item.INFO
    hpo_count = info['HPO_CT']
    for allele in item.ALT:
        hgvs_id, var_type = get_hgvs_from_vcf(chrom, pos, ref, str(allele),
                                              mutant_type=True)
        if hgvs_id is None:
            return
        doc = {
            "_id": hgvs_id,
            "geno2mp": {"hpo_count": hpo_count},
        }
        yield dict_sweep(unlist(value_convert(doc)), [None])
def load_x(idx, fieldname, cvt_fn=None):
    '''Load one column of ``idmapping_selected.tab.gz`` keyed by gene id.

    idx is the 0-based column number of the target value; fieldname is the
    key each gene's value(s) are stored under; cvt_fn, when given, is applied
    to every non-empty target value. List values are sorted in the result.
    '''
    print('DATA_FOLDER: ' + DATA_FOLDER)
    datafile = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(datafile)
    started = time.time()

    # rows of (entrez_id, ensembl_id, target_value)
    rows = []
    feeder = tabfile_feeder(datafile, header=1,
                            assert_column_no=VALID_COLUMN_NO)
    for line in feeder:
        picked = listitems(line, 2, 19, idx)  # GeneID Ensembl(Gene) target_value
        rows.extend(dupline_seperator(dupline=picked, dup_sep='; '))

    id_pairs = [(row[1], row[0]) for row in rows
                if row[0] != '' and row[1] != '']
    ensembl2geneid = list2dict(list_nondup(id_pairs), 0, alwayslist=True)

    resolved = []
    for entrez_id, ensembl_id, x_value in rows:
        if not x_value:
            continue
        if cvt_fn:
            x_value = cvt_fn(x_value)
        if entrez_id:
            resolved.append((entrez_id, x_value))
            continue
        if not ensembl_id:
            continue
        mapped_ids = ensembl2geneid.get(ensembl_id, None)
        if mapped_ids:
            for _eid in mapped_ids:
                resolved.append((_eid, x_value))
        else:
            resolved.append((ensembl_id, x_value))

    def _wrap(value):
        if isinstance(value, list):
            return {fieldname: sorted(value)}
        return {fieldname: value}

    gene2x = list2dict(list_nondup(resolved), 0)
    gene2x = value_convert(gene2x, _wrap, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(started)))
    return gene2x
Example #23
0
    def convert2entrez(self, ensembl2x):
        '''Re-key a dict of ensembl gene ids onto the matching entrezgene ids.

        Args:
            ensembl2x: dict keyed by ensembl gene id.

        Returns:
            dict keyed by entrez gene id where a match exists, and by the
            ensembl id otherwise. Each value is normalised with
            ``dict_nodup`` (single dict) or ``dict_attrmerge`` (one entrez
            gene matching several ensembl genes).
        '''
        if not self.ensembl2entrez_li:
            self._load_ensembl2entrez_li()

        if not self.ensembl_main:
            self.ensembl_main = self.load_ensembl_main()

        ensembl2entrez = list2dict(self.ensembl2entrez_li, 0)
        entrez2ensembl = list2dict(self.ensembl2entrez_li, 1)

        # Now make a dictionary indexed by entrez gene id.
        # print(...) with one parenthesised argument gives the same output on
        # Python 2 and is valid syntax on Python 3.
        source_ids = set(ensembl2x)
        mapped_ids = set(ensembl2entrez)
        unmatched_ids = source_ids - mapped_ids
        print('# of ensembl IDs in total: %d' % len(source_ids | mapped_ids))
        print('# of ensembl IDs match entrez Gene IDs: %d' % len(source_ids & mapped_ids))
        print('# of ensembl IDs DO NOT match entrez Gene IDs: %d' % len(unmatched_ids))

        # all genes with matched entrez
        def _fn(eid, taxid=None):
            # Need to make a copy of the value here; otherwise it will cause
            # issues when multiple entrezgene ids match the same ensembl
            # gene, for example,
            #      ENSMUSG00000027104 --> (11909, 100047997)
            return copy.copy(ensembl2x.get(eid, {}))

        data = value_convert(entrez2ensembl, _fn)

        # add those with no matched entrez geneid, using ensembl id as the key
        for eid in unmatched_ids:
            data[eid] = ensembl2x[eid]

        for gene_id in data:
            # isinstance() replaces ``type(..) is types.DictType`` (Python 2 only)
            if isinstance(data[gene_id], dict):
                _doc = dict_nodup(data[gene_id], sort=True)
            else:
                # if one entrez gene matches multiple ensembl genes
                _doc = dict_attrmerge(data[gene_id], removedup=True, sort=True)
            data[gene_id] = _doc

        return data
Example #24
0
def _map_line_to_json(item):
    """Generate geno2mp documents, one for each ALT allele of ``item``.

    A document has the HGVS id from ``get_hgvs_from_vcf`` as ``_id`` and the
    record's ``INFO['HPO_CT']`` under ``geno2mp.hpo_count``; it is cleaned
    with ``value_convert``, ``unlist`` and ``dict_sweep(.., [None])``. The
    generator ends as soon as an allele has no HGVS id.
    """
    chrom = item.CHROM
    start = item.POS
    ref = item.REF
    info = item.INFO
    hpo_count = info['HPO_CT']
    for raw_alt in item.ALT:
        alt = str(raw_alt)
        hgvs_id, var_type = get_hgvs_from_vcf(
            chrom, start, ref, alt, mutant_type=True)
        if hgvs_id is None:
            return
        record = {"_id": hgvs_id, "geno2mp": {"hpo_count": hpo_count}}
        cleaned = dict_sweep(unlist(value_convert(record)), [None])
        yield cleaned
Example #25
0
def load_uniprot():
    """Load the gene -> UniProt mapping from ``idmapping_selected.tab.gz``.

    Columns 0, 1, 2 and 19 of every row are used (commented in the source as
    UniProtKB-AC, UniProtKB-ID, GeneID, Ensembl(Gene)); the second one is
    replaced by ``get_uniprot_section`` of its value. An accession is
    attached to its GeneID when present; otherwise to every GeneID known for
    its Ensembl id, or to the Ensembl id itself when none is known.
    """
    print('DATA_FOLDER: ' + DATA_FOLDER)
    datafile = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(datafile)
    started = time.time()

    # rows of (uniprot_acc, section, entrez_id, ensembl_id)
    rows = []
    for line in tabfile_feeder(datafile, header=1):
        # UniProtKB-AC UniProtKB-ID GeneID Ensembl(Gene)
        picked = listitems(line, 0, 1, 2, 19)
        # GeneID and EnsemblID columns may have duplicates
        expanded = dupline_seperator(dupline=picked,
                                     dup_idx=[2, 3],
                                     dup_sep='; ')
        for rec in expanded:
            rec = list(rec)
            rec[1] = get_uniprot_section(rec[1])
            rows.append(tuple(rec))

    id_pairs = [(row[3], row[2]) for row in rows
                if row[2] != '' and row[3] != '']
    ensembl2geneid = list2dict(id_pairs, 0, alwayslist=True)

    resolved = []
    for uniprot_acc, section, entrez_id, ensembl_id in rows:
        if entrez_id:
            resolved.append((uniprot_acc, section, entrez_id))
            continue
        if not ensembl_id:
            continue
        mapped_ids = ensembl2geneid.get(ensembl_id, None)
        if mapped_ids:
            # the ensembl id can be mapped to one or more entrez ids
            for _eid in mapped_ids:
                resolved.append((uniprot_acc, section, _eid))
        else:
            # otherwise, just use the ensembl id
            resolved.append((uniprot_acc, section, ensembl_id))

    gene2uniprot = list2dict(list_nondup(resolved), 2, alwayslist=True)
    gene2uniprot = value_convert(gene2uniprot, _dict_convert,
                                 traverse_list=False)
    load_done('[%d, %s]' % (len(gene2uniprot), timesofar(started)))
    return gene2uniprot
Example #26
0
def _map_line_to_json(item):
    """Yield one ExAC document per ALT allele of a VCF record.

    Args:
        item: record exposing ``CHROM``, ``POS``, ``REF``, ``ALT`` and an
            ``INFO`` mapping. ``item.ALT`` is converted to strings in place.

    Yields:
        ``{"_id": <hgvs>, "exac": {...}}`` after ``value_convert``,
        ``unlist`` and ``dict_sweep(.., [None])``.

    Raises:
        KeyError: when a mandatory INFO key (e.g. ``AC``, ``AN``, ``FS``) is
            missing. Only the six optional annotations default to None.
    """
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO

    def _optional(key):
        # Only a missing key is expected here. The previous bare ``except:``
        # also swallowed unrelated errors (including KeyboardInterrupt).
        try:
            return info[key]
        except KeyError:
            return None

    # These annotations are not present on every record; None values are
    # handed to dict_sweep(.., [None]) below.
    baseqranksum = _optional('BaseQRankSum')
    clippingranksum = _optional('ClippingRankSum')
    mqranksum = _optional('MQRankSum')
    readposranksum = _optional('ReadPosRankSum')
    qd = _optional('QD')
    inbreedingcoeff = _optional('InbreedingCoeff')

    # Stringify the alleles in place: the same list is stored under "alleles".
    for i, allele in enumerate(item.ALT):
        item.ALT[i] = str(allele)
    for alt in item.ALT:
        alt = str(alt)
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=True)
        if HGVS is None:
            # NOTE(review): this ends the generator, so any remaining ALT
            # alleles of the record are skipped too -- confirm that
            # ``continue`` is not what was intended.
            return
        one_snp_json = {
            "_id": HGVS,
            "exac": {
                "chrom": chrom,
                "pos": chromStart,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {
                    "ac": info['AC'],
                    "ac_afr": info['AC_AFR'],
                    "ac_amr": info['AC_AMR'],
                    "ac_adj": info['AC_Adj'],
                    "ac_eas": info['AC_EAS'],
                    "ac_fin": info['AC_FIN'],
                    "ac_het": info['AC_Het'],
                    "ac_hom": info['AC_Hom'],
                    "ac_nfe": info['AC_NFE'],
                    "ac_oth": info['AC_OTH'],
                    "ac_sas": info['AC_SAS']
                },
                "af": info['AF'],
                "an": {
                    "an": info['AN'],
                    "an_afr": info['AN_AFR'],
                    "an_amr": info['AN_AMR'],
                    "an_adj": info['AN_Adj'],
                    "an_eas": info['AN_EAS'],
                    "an_fin": info['AN_FIN'],
                    "an_nfe": info['AN_NFE'],
                    "an_oth": info['AN_OTH'],
                    "an_sas": info['AN_SAS']
                },
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {
                    "het_afr": info['Het_AFR'],
                    "het_amr": info['Het_AMR'],
                    "het_eas": info['Het_EAS'],
                    "het_fin": info['Het_FIN'],
                    "het_nfe": info['Het_NFE'],
                    "het_oth": info['Het_OTH'],
                    "het_sas": info['Het_SAS']
                },
                "hom": {
                    "hom_afr": info['Hom_AFR'],
                    "hom_amr": info['Hom_AMR'],
                    "hom_eas": info['Hom_EAS'],
                    "hom_fin": info['Hom_FIN'],
                    "hom_nfe": info['Hom_NFE'],
                    "hom_oth": info['Hom_OTH'],
                    "hom_sas": info['Hom_SAS']
                },
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        obj = (dict_sweep(unlist(value_convert(one_snp_json)), [None]))
        yield obj
Example #27
0
def _map_line_to_json(fields):
    """Convert one GRASP row into a document keyed by a myvariant.info _id.

    The row must have exactly VALID_COLUMN_NO columns. The rsid in column 8
    is looked up through the myvariant.info query API; the document is built
    for the first hit only and returned after ``value_convert``, ``unlist``,
    ``dict_sweep(.., [""])`` and ``list_split(.., ",")``. Returns None when
    the rsid is None or the query has no hits.
    """
    assert len(fields) == VALID_COLUMN_NO
    rsid = fields[8]

    # load as json data
    if rsid is None:
        return

    query_url = ('http://myvariant.info/v1/query?q=dbsnp.rsid:'
                 + rsid + '&fields=_id')
    response = requests.get(query_url)

    # sample-count keys, in column order, shared by 'discovery' (columns
    # 25-37) and 'replication' (columns 38-50)
    population_keys = (
        'total_samples', 'european', 'african', 'east_asian',
        'indian_south_asian', 'hispanic', 'native', 'micronesian',
        'arab_me', 'mixed', 'unspecified', 'filipino', 'indonesian',
    )

    for hit in response.json()['hits']:
        position = {'chr': fields[5], 'pos': fields[6]}
        publication = {
            'journal': fields[16],
            'title': fields[17],
            'pmid': fields[7],
            'snpid': fields[8],
            'location_within_paper': fields[9],
            'p_value': fields[10],
            'phenotype': fields[11],
            'paper_phenotype_description': fields[12],
            'paper_phenotype_categories': fields[13],
            'date_pub': fields[14],
        }
        discovery = dict(zip(population_keys, fields[25:38]))
        replication = dict(zip(population_keys, fields[38:51]))

        grasp = {
            'hg19': position,
            'hupfield': fields[1],
            'last_curation_date': fields[2],
            'creation_date': fields[3],
            'srsid': fields[4],
            'publication': publication,
            'includes_male_female_only_analyses': fields[18],
            'exclusively_male_female': fields[19],
            'initial_sample_description': fields[20],
            'replication_sample_description': fields[21],
            'platform_snps_passing_qc': fields[22],
            'gwas_ancestry_description': fields[23],
            'discovery': discovery,
            'replication': replication,
            'in_gene': fields[51],
            'nearest_gene': fields[52],
            'in_lincrna': fields[53],
            'in_mirna': fields[54],
            'in_mirna_bs': fields[55],
            'oreg_anno': fields[61],
            'conserv_pred_tfbs': fields[62],
            'human_enhancer': fields[63],
            'rna_edit': fields[64],
            'polyphen2': fields[65],
            'sift': fields[66],
            'ls_snp': fields[67],
            'uniprot': fields[68],
            'eqtl_meth_metab_study': fields[69],
        }
        one_snp_json = {"_id": hit['_id'], "grasp": grasp}
        # returning inside the loop: only the first hit is converted
        cleaned = dict_sweep(unlist(value_convert(one_snp_json)), [""])
        return list_split(cleaned, ",")
def _file_hgvs_name(attribute, hgvs):
    """File an attribute's value under the first HGVS category its Type names.

    The categories are tried in the order genomic, non-coding, coding,
    protein; 'non-coding' has to be tested before 'coding' because the
    latter is a substring of the former.  An attribute whose Type names
    none of the categories is ignored.
    """
    for category in ('genomic', 'non-coding', 'coding', 'protein'):
        if category in attribute.Type:
            hgvs[category].append(attribute.get_valueOf_())
            return


def _map_line_to_json(cp, hg19):
    """Yield one ClinVar document per usable Measure of the record ``cp``.

    :param cp: parsed ClinVar record, read through its
        ``ReferenceClinVarAssertion`` and ``ClinVarAssertion`` attributes.
    :param hg19: truthy to build the ``_id`` from the GRCh37 start/stop,
        falsy to build it from the GRCh38 start/stop.  The chromosome is
        only ever taken from the GRCh37 location.

    Measures typed 'Variation', 'protein only' or 'Microsatellite' are
    skipped, as are measures for which no ``_id`` can be derived.

    NOTE: when a measure has no AttributeSet, or has neither usable
    coordinates nor a RefSeq coding HGVS name, a message is printed and the
    generator stops; later measures of the same record are not processed.
    """
    assertion = cp.ReferenceClinVarAssertion
    # ClinicalSignificance (and its children) may be missing from a record.
    try:
        clinical_significance = assertion.ClinicalSignificance.Description
    except Exception:
        clinical_significance = None
    rcv_accession = assertion.ClinVarAccession.Acc
    try:
        review_status = assertion.ClinicalSignificance.ReviewStatus
    except Exception:
        review_status = None
    try:
        last_evaluated = assertion.ClinicalSignificance.DateLastEvaluated
    except Exception:
        last_evaluated = None
    variant_id = assertion.MeasureSet.ID
    number_submitters = len(cp.ClinVarAssertion)
    # some items in clinvar_xml don't have origin information
    try:
        origin = assertion.ObservedIn[0].Sample.Origin
    except Exception:
        origin = None

    # Condition details come from the first trait only.
    trait = assertion.TraitSet.Trait[0]
    synonyms = []
    conditions_name = ''
    for trait_name in trait.Name:
        if trait_name.ElementValue.Type == 'Alternate':
            synonyms.append(trait_name.ElementValue.get_valueOf_())
        if trait_name.ElementValue.Type == 'Preferred':
            conditions_name += trait_name.ElementValue.get_valueOf_()
    identifiers = {}
    for item in trait.XRef:
        if item.DB == 'Human Phenotype Ontology':
            db_key = 'Human_Phenotype_Ontology'
        else:
            db_key = item.DB
        identifiers[db_key.lower()] = item.ID
    for trait_symbol in trait.Symbol:
        if trait_symbol.ElementValue.Type == 'Preferred':
            conditions_name += \
                ' (' + trait_symbol.ElementValue.get_valueOf_() + ')'
    age_of_onset = ''
    for _set in trait.AttributeSet:
        if _set.Attribute.Type == 'age of onset':
            age_of_onset = _set.Attribute.get_valueOf_()

    # Variation types whose alt sequence is parsed out of the genomic HGVS
    # name, mapped to the marker that precedes that sequence.
    markers = {'Indel': 'del', 'Insertion': 'ins', 'Duplication': 'dup'}

    # MeasureSet.Measure returns a list, there might be multiple
    # Measure under one MeasureSet
    for measure in assertion.MeasureSet.Measure:
        variation_type = measure.Type
        if variation_type in ('Variation', 'protein only', 'Microsatellite'):
            continue
        is_copy_number = variation_type in ('copy number loss',
                                            'copy number gain')
        allele_id = measure.ID
        chrom = None
        chromStart_19 = chromEnd_19 = None
        chromStart_38 = chromEnd_38 = None
        ref = alt = None
        if measure.SequenceLocation:
            for location in measure.SequenceLocation:
                if 'GRCh37' in location.Assembly:
                    chrom = location.Chr
                    chromStart_19 = location.start
                    chromEnd_19 = location.stop
                    ref = location.referenceAllele
                    alt = location.alternateAllele
                if 'GRCh38' in location.Assembly:
                    chromStart_38 = location.start
                    chromEnd_38 = location.stop
                    # GRCh38 alleles only fill gaps left by GRCh37
                    if not ref:
                        ref = location.referenceAllele
                    if not alt:
                        alt = location.alternateAllele

        if measure.MeasureRelationship:
            relationship = measure.MeasureRelationship[0]
            try:
                symbol = relationship.Symbol[0].get_ElementValue().valueOf_
            except Exception:
                symbol = None
            gene_id = relationship.XRef[0].ID
        else:
            symbol = None
            gene_id = None

        if measure.Name:
            name = measure.Name[0].ElementValue.valueOf_
        else:
            name = None

        # a single cytogenetic location is stored bare, otherwise the
        # whole list is kept
        if len(measure.CytogeneticLocation) == 1:
            cytogenic = measure.CytogeneticLocation[0]
        else:
            cytogenic = measure.CytogeneticLocation

        hgvs_coding = None
        hgvs_genome = None
        HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
        coding_hgvs_only = None
        hgvs_id = None
        if hg19:
            chromStart = chromStart_19
            chromEnd = chromEnd_19
        else:
            chromStart = chromStart_38
            chromEnd = chromEnd_38

        if not measure.AttributeSet:
            print('no measure.attribute %s' % (rcv_accession,))
            return

        if is_copy_number:
            # 'copy number loss' or 'gain' have a format different
            # from other types and are dealt with separately
            for attribute_set in measure.AttributeSet:
                attribute = attribute_set.Attribute
                if ('HGVS, genomic, top level' in attribute.Type
                        and attribute.integerValue == 37):
                    hgvs_genome = attribute.get_valueOf_()
                _file_hgvs_name(attribute, HGVS)
        else:
            for attribute_set in measure.AttributeSet:
                attribute = attribute_set.Attribute
                _file_hgvs_name(attribute, HGVS)
                if attribute.Type == 'HGVS, coding, RefSeq':
                    hgvs_coding = attribute.get_valueOf_()
                elif attribute.Type == 'HGVS, genomic, top level, previous':
                    hgvs_genome = attribute.get_valueOf_()
                    # attributes after this one are not examined
                    break

        if chrom and chromStart and chromEnd:
            if variation_type == 'single nucleotide variant':
                hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
            elif variation_type == 'Deletion':
                hgvs_id = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
            elif variation_type in markers:
                # items whose type is Indel, Insertion or Duplication might
                # not have explicit alt information, so it is parsed from
                # hgvs_genome.  Without the marker there is nothing to
                # parse and no id is produced (previously an Indel without
                # 'del' produced a malformed id).
                marker = markers[variation_type]
                if hgvs_genome and marker in hgvs_genome:
                    tail = hgvs_genome[hgvs_genome.find(marker) + 3:]
                    hgvs_id = "chr%s:g.%s_%s%s%s" % \
                              (chrom, chromStart, chromEnd, marker, tail)
        elif is_copy_number:
            if hgvs_genome and chrom:
                hgvs_id = "chr" + chrom + ":" + hgvs_genome.split('.')[2]
        elif hgvs_coding:
            hgvs_id = hgvs_coding
            coding_hgvs_only = True
        else:
            print("couldn't find any id %s" % (rcv_accession,))
            return

        for category in HGVS:
            HGVS[category].sort()

        rsid = None
        cosmic = None
        dbvar = None
        uniprot = None
        omim = None
        # loop through XRef to find rsid as well as other ids
        if measure.XRef:
            for xref in measure.XRef:
                if xref.Type == 'rs':
                    rsid = 'rs' + str(xref.ID)
                elif xref.DB == 'COSMIC':
                    cosmic = xref.ID
                elif xref.DB == 'OMIM':
                    omim = xref.ID
                elif xref.DB == 'UniProtKB/Swiss-Prot':
                    uniprot = xref.ID
                elif xref.DB == 'dbVar':
                    dbvar = xref.ID

        # make sure the hgvs_id is not none
        if hgvs_id:
            one_snp_json = {

                "_id": hgvs_id,
                "clinvar":
                    {
                        "allele_id": allele_id,
                        "variant_id": variant_id,
                        "chrom": chrom,
                        "omim": omim,
                        "cosmic": cosmic,
                        "uniprot": uniprot,
                        "dbvar": dbvar,
                        "hg19":
                            {
                                "start": chromStart_19,
                                "end": chromEnd_19
                            },
                        "hg38":
                            {
                                "start": chromStart_38,
                                "end": chromEnd_38
                            },
                        "type": variation_type,
                        "gene":
                            {
                                "id": gene_id,
                                "symbol": symbol
                            },
                        "rcv":
                            {
                                "accession": rcv_accession,
                                "clinical_significance": clinical_significance,
                                "number_submitters": number_submitters,
                                "review_status": review_status,
                                "last_evaluated": str(last_evaluated),
                                "preferred_name": name,
                                "origin": origin,
                                "conditions":
                                    {
                                        "name": conditions_name,
                                        "synonyms": synonyms,
                                        "identifiers": identifiers,
                                        "age_of_onset": age_of_onset
                                    }
                            },
                        "rsid": rsid,
                        "cytogenic": cytogenic,
                        "hgvs": HGVS,
                        "coding_hgvs_only": coding_hgvs_only,
                        "ref": ref,
                        "alt": alt
                    }
            }
            obj = (dict_sweep(unlist(value_convert(one_snp_json,
                                                   ['chrom', 'omim', 'id', 'orphanet', 'gene',
                                                    'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])), [None, '', 'None']))
            yield obj
Example #29
0
def _map_line_to_json(fields):
    """Build the ``cadd`` document for one row of column values.

    ``fields`` must contain exactly VALID_COLUMN_NO entries.  The document
    ``_id`` comes from get_hgvs_from_vcf(chrom, pos, ref, alt); when that
    returns None the row is dropped and None is returned.  Otherwise the
    nested dictionary is passed through value_convert, unlist and
    dict_sweep (with ["NA"]) and the result is returned.
    """
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[0]
    pos = fields[1]
    ref = fields[2]
    alt = fields[4]
    variant_id = get_hgvs_from_vcf(chrom, pos, ref, alt)
    if variant_id is None:
        return

    # Grouped sub-documents, each built from a run of columns.
    mapability = {'20bp': fields[15], '35bp': fields[16]}
    phast_cons = {
        'primate': fields[18],
        'mammalian': fields[19],
        'vertebrate': fields[20],
    }
    phylop = {
        'primate': fields[21],
        'mammalian': fields[22],
        'vertebrate': fields[23],
    }
    gerp = {
        'n': fields[24],
        's': fields[25],
        'rs': fields[26],
        'rs_pval': fields[27],
    }
    dna = {
        'helt': fields[30],
        'mgw': fields[31],
        'prot': fields[32],
        'roll': fields[33],
    }
    mirsvr = {'score': fields[34], 'e': fields[35], 'aln': fields[36]}
    # NOTE(review): column 45 is not read; the original carried it only as
    # a commented-out second 'enh' entry. Confirm which column 'enh'
    # should come from.
    chmm = {
        'tssa': fields[39],
        'tssaflnk': fields[40],
        'txflnk': fields[41],
        'tx': fields[42],
        'txwk': fields[43],
        'enh': fields[44],
        'znfrpts': fields[46],
        'het': fields[47],
        'tssbiv': fields[48],
        'bivflnk': fields[49],
        'enhbiv': fields[50],
        'reprpc': fields[51],
        'reprpcwk': fields[52],
        'quies': fields[53],
    }
    encode_p_val = {
        'comb': fields[60],
        'dnas': fields[61],
        'faire': fields[62],
        'polii': fields[63],
        'ctcf': fields[64],
        'mycp': fields[65],
    }
    encode_sig = {
        'dnase': fields[66],
        'faire': fields[67],
        'polii': fields[68],
        'ctcf': fields[69],
        'myc': fields[70],
    }
    encode = {
        'exp': fields[54],
        'h3k27ac': fields[55],
        'h3k4me1': fields[56],
        'h3k4me3': fields[57],
        'nucleo': fields[58],
        'occ': fields[59],
        'p_val': encode_p_val,
        'sig': encode_sig,
    }
    motif = {
        'toverlap': fields[72],
        'dist': fields[73],
        'ecount': fields[74],
        'ename': fields[75],
        'ehipos': fields[76],
        'escorechng': fields[77],
    }
    tf = {
        'bs': fields[78],
        'bs_peaks': fields[79],
        'bs_peaks_max': fields[80],
    }
    esp = {'af': fields[82], 'afr': fields[83], 'eur': fields[84]}
    thousand_genomes = {
        'af': fields[85],
        'asn': fields[86],
        'amr': fields[87],
        'afr': fields[88],
        'eur': fields[89],
    }
    gene = {
        'gene_id': fields[92],
        'feature_id': fields[93],
        'ccds_id': fields[94],
        'genename': fields[95],
        'cds': {
            'cdna_pos': fields[96],
            'rel_cdna_pos': fields[97],
            'cds_pos': fields[98],
            'rel_cds_pos': fields[99],
        },
        'prot': {
            'protpos': fields[100],
            'rel_prot_pos': fields[101],
            'domain': fields[102],
        },
    }
    polyphen = {'cat': fields[110], 'val': fields[111]}
    sift = {'cat': fields[112], 'val': fields[113]}

    cadd = {
        'chrom': chrom,
        'pos': pos,
        'ref': ref,
        'anc': fields[3],
        'alt': alt,
        'type': fields[5],
        'length': fields[6],
        'istv': fields[7],
        'isderived': fields[8],
        'annotype': fields[9],
        'consequence': fields[10],
        'consscore': fields[11],
        'consdetail': fields[12],
        'gc': fields[13],
        'cpg': fields[14],
        'mapability': mapability,
        'scoresegdup': fields[17],
        'phast_cons': phast_cons,
        'phylop': phylop,
        'gerp': gerp,
        'bstatistic': fields[28],
        'mutindex': fields[29],
        'dna': dna,
        'mirsvr': mirsvr,
        'targetscans': fields[37],
        'fitcons': fields[38],
        'chmm': chmm,
        'encode': encode,
        'segway': fields[71],
        'motif': motif,
        'tf': tf,
        'isknownvariant': fields[81],
        'esp': esp,
        '1000g': thousand_genomes,
        'min_dist_tss': fields[90],
        'min_dist_tse': fields[91],
        'gene': gene,
        'dst2splice': fields[103],
        'dst2spltype': fields[104],
        'exon': fields[105],
        'intron': fields[106],
        'oaa': fields[107],  # ref aa
        'naa': fields[108],  # alt aa
        'grantham': fields[109],
        'polyphen': polyphen,
        'sift': sift,
        'rawscore': fields[114],  # raw CADD score
        'phred': fields[115],  # log-percentile of raw CADD score
    }
    document = {"_id": variant_id, "cadd": cadd}
    return dict_sweep(unlist(value_convert(document)), ["NA"])
Example #30
0
def _map_line_to_json(fields):
    """Build the ``dbnsfp`` document for one row of column values.

    The ``_id`` is "chr<chrom>:g.<pos><allele1>><allele2>" built from
    columns 0-3.  "." is the placeholder for a missing value: it is kept
    as-is for the hg18 end and the SiPhy frequencies, and dict_sweep is
    later called with vals=["."].  The nested dictionary is passed through
    value_convert, unlist, dict_sweep and list_split (with ";"), then the
    chromosome is forced back to a string before the document is returned.
    """
    # specific variable treatment
    chrom = fields[0]
    if fields[7] == ".":
        hg18_end = "."
    else:
        hg18_end = int(fields[7]) + 1
    chromStart = int(fields[1])
    chromEnd = chromStart + 1
    allele1 = fields[2]
    allele2 = fields[3]
    HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, allele1, allele2)

    if fields[74] == ".":
        siphy = "."
    else:
        # colon-separated values, read in the order a, c, g, t
        freq = fields[74].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # Columns 11 and 13 are parallel ";"-separated lists; pair them up.
    # A comprehension (rather than nested map/lambda/zip) gives a real list
    # on Python 3 as well, where map() is lazy.
    acc = fields[11].rstrip().rstrip(';').split(";")
    pos = fields[13].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': one_acc, 'pos': one_pos}
               for one_acc, one_pos in zip(acc, pos)]

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "chrom": chrom,
            "hg19": {
                "start": fields[1],
                "end": chromEnd
            },
            "hg18": {
                "start": fields[7],
                "end": hg18_end
            },
            "hg38": {
                "chrom": fields[8],
                "pos": fields[9]
            },
            "allele1": allele1,
            "allele2": allele2,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[23],
                "refcodon": fields[16],
                "codonpos": fields[18],
                "aapos_sift": fields[24],
                "aapos_fathmm": fields[25]
            },
            "genename": fields[10],
            "uniprot": uniprot,
            "interpro_domain": fields[14],
            "cds_strand": fields[15],
            "slr_test_statistic": fields[17],
            "fold-degenerate": fields[19],
            "ancestral_allele": fields[20],
            "ensembl": {
                "geneid": fields[21],
                "transcriptid": fields[22]
            },
            "sift": {
                "score": fields[26],
                "converted_rankscore": fields[27],
                "pred": fields[28]
            },
            "polyphen2": {
                "hdiv": {
                    "score": fields[29],
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": fields[32],
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": fields[35],
                "converted_rankscore": fields[36],
                "pred": fields[37]
            },
            "mutationtaster": {
                "score": fields[38],
                "converted_rankscore": fields[39],
                "pred": fields[40]
            },
            "mutationassessor": {
                "score": fields[41],
                "rankscore": fields[42],
                "pred": fields[43]
            },
            "fathmm": {
                "score": fields[44],
                "rankscore": fields[45],
                "pred": fields[46]
            },
            "radialsvm": {
                "score": fields[47],
                "rankscore": fields[48],
                "pred": fields[49]
            },
            "lr": {
                "score": fields[50],
                "rankscore": fields[51],
                "pred": fields[52]
            },
            "reliability_index": fields[53],
            "vest3": {
                "score": fields[54],
                "rankscore": fields[55]
            },
            "cadd": {
                "raw": fields[56],
                "raw_rankscore": fields[57],
                "phred": fields[58]
            },
            "gerp++": {
                "nr": fields[59],
                "rs": fields[60],
                "rs_rankscore": fields[61]
            },
            "phylop": {
                "46way": {
                    "primate": fields[62],
                    "primate_rankscore": fields[63],
                    "placental": fields[64],
                    "placental_rankscore": fields[65],
                },
                "100way": {
                    "vertebrate": fields[66],
                    "vertebrate_rankscore": fields[67]
                }
            },
            "phastcons": {
                "46way": {
                    "primate": fields[68],
                    "primate_rankscore": fields[69],
                    "placental": fields[70],
                    "placental_rankscore": fields[71],
                },
                "100way": {
                    "vertebrate": fields[72],
                    "vertebrate_rankscore": fields[73]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[75],
                "logodds_rankscore": fields[76]
            },
            "lrt_omega": fields[77],
            "unisnp_ids": fields[78],
            "1000gp1": {
                "ac": fields[79],
                "af": fields[80],
                "afr_ac": fields[81],
                "afr_af": fields[82],
                "eur_ac": fields[83],
                "eur_af": fields[84],
                "amr_ac": fields[85],
                "amr_af": fields[86],
                "asn_ac": fields[87],
                "asn_af": fields[88]
            },
            "esp6500": {
                "aa_af": fields[89],
                "ea_af": fields[90]
            },
            "aric5606": {
                "aa_ac": fields[91],
                "aa_af": fields[92],
                "ea_ac": fields[93],
                "ea_af": fields[94]
            },
            "clinvar": {
                "rs": fields[95],
                "clin_sig": fields[96],
                "trait": fields[97]
            }
        }
    }

    one_snp_json = list_split(dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    # re-assert the string form of the chromosome after the conversions above
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
Example #31
0
def _map_line_to_json(fields):
    assert len(fields) == VALID_COLUMN_NO
    chr_info = re.findall(r"[\w']+", fields[17])
    chrom = chr_info[0]  # Mutation GRCh37 genome position
    chromStart = chr_info[1]
    chromEnd = chr_info[2]

    HGVS = None
    cds = fields[13]
    sub = re.search(r'[ATCGMNHKRY]+>[ATCGMNHKRY]+', cds)
    ins = re.search(r'ins[ATCGMN]+|ins[0-9]+', cds)
    delete = cds.find('del') != -1
    del_ins = re.search(r'[0-9]+>[ATCGMN]+', cds)
    comp = re.search(r'[ATCGMN]+', cds)

    if sub:
        HGVS = "chr%s:g.%s%s" % (chrom, chromStart, sub.group())
    elif ins:
        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, ins.group())
    elif delete:
        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
    elif del_ins:
        HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, comp.group())
    # elif comp:
    #    HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, comp.group())
    else:
        HGVS = fields[12]
        print "Error2:", fields[15], cds, fields[17]

    # load as json data
    if HGVS is None:
        return

    one_snp_json = {
        "sorter": fields[17] + fields[13],
        "_id": HGVS,
        "cosmic":
            {
                "gene":
                    {
                        "symbol": fields[0],  # Gene name
                        "id": fields[3],  # HGNC ID
                        "cds_length": fields[2]
                    },
                "transcript": fields[1],  # Accession Number
                "sample":
                    {
                        "name": fields[4],  # Sample name
                        "id": fields[5]  # ID_sample
                    },
                "tumour":
                    {
                        "id": fields[6],  # ID_tumour
                        "primary_site": fields[7],  # Primary site
                        "site_subtype": fields[8],  # Site subtype
                        "primary_histology": fields[9],  # Primary histology
                        "histology_subtype": fields[10],  # Histology subtype
                        "origin": fields[1]
                    },
                "mutation":
                    {
                        "id": "COSM" + fields[12],  # Mutation ID
                        "cds": cds,  # Mutation CDS
                        "aa": fields[14],  # Mutation AA
                        "description": fields[15],  # Mutation Description
                        "zygosity": fields[16],  # Mutation zygosity
                        "somatic_status": fields[21]  # Mutation somatic status
                    },
                "chrom": chrom,
                "hg19":
                   {
                        "start": chromStart,
                        "end": chromEnd
                    },
                "pubmed": fields[22]  # Pubmed_PMID
            }
        }
    return dict_sweep(value_convert(one_snp_json), vals=[""])
Example #32
0
def _map_line_to_json(fields):
    """Build the ``grasp`` document for one row of column values.

    ``fields`` must contain exactly VALID_COLUMN_NO entries.  The rsid in
    column 8 is looked up through the myvariant.info query API and the
    ``_id`` of the FIRST hit becomes the document id; the function returns
    from inside the loop, so later hits are never used.  When the rsid is
    None, or the query has no hits, None is returned.

    NOTE(review): only the first hit is turned into a document - confirm
    that dropping the remaining hits is intended.
    """
    assert len(fields) == VALID_COLUMN_NO
    rsid = fields[8]

    # load as json data
    if rsid is None:
        return

    query_url = "http://myvariant.info/v1/query?q=dbsnp.rsid:" + rsid + "&fields=_id"
    response = requests.get(query_url)

    # The discovery and replication sample counts use the same thirteen
    # keys over two consecutive runs of columns (starting at 25 and 38).
    ancestry_keys = (
        "total_samples", "european", "african", "east_asian",
        "indian_south_asian", "hispanic", "native", "micronesian",
        "arab_me", "mixed", "unspecified", "filipino", "indonesian",
    )

    for hit in response.json()["hits"]:
        discovery = {}
        replication = {}
        for offset, key in enumerate(ancestry_keys):
            discovery[key] = fields[25 + offset]
        for offset, key in enumerate(ancestry_keys):
            replication[key] = fields[38 + offset]

        publication = {
            "journal": fields[16],
            "title": fields[17],
            "pmid": fields[7],
            "snpid": fields[8],
            "location_within_paper": fields[9],
            "p_value": fields[10],
            "phenotype": fields[11],
            "paper_phenotype_description": fields[12],
            "paper_phenotype_categories": fields[13],
            "date_pub": fields[14],
        }
        grasp = {
            "hg19": {"chr": fields[5], "pos": fields[6]},
            "hupfield": fields[1],
            "last_curation_date": fields[2],
            "creation_date": fields[3],
            "srsid": fields[4],
            "publication": publication,
            "includes_male_female_only_analyses": fields[18],
            "exclusively_male_female": fields[19],
            "initial_sample_description": fields[20],
            "replication_sample_description": fields[21],
            "platform_snps_passing_qc": fields[22],
            "gwas_ancestry_description": fields[23],
            "discovery": discovery,
            "replication": replication,
            "in_gene": fields[51],
            "nearest_gene": fields[52],
            "in_lincrna": fields[53],
            "in_mirna": fields[54],
            "in_mirna_bs": fields[55],
            "oreg_anno": fields[61],
            "conserv_pred_tfbs": fields[62],
            "human_enhancer": fields[63],
            "rna_edit": fields[64],
            "polyphen2": fields[65],
            "sift": fields[66],
            "ls_snp": fields[67],
            "uniprot": fields[68],
            "eqtl_meth_metab_study": fields[69],
        }
        document = {"_id": hit["_id"], "grasp": grasp}
        return list_split(dict_sweep(unlist(value_convert(document)), [""]), ",")
Example #33
0
def _map_line_to_json(item):
    """Yield one ExAC annotation document per ALT allele of a VCF record.

    ``item`` must expose ``CHROM``, ``POS``, ``REF``, ``ALT`` and ``INFO``
    attributes (presumably a PyVCF record -- confirm against the caller).
    Every entry of ``item.ALT`` is converted to ``str`` in place, so the
    caller's record is modified.

    For each ALT allele an HGVS id is requested from ``get_hgvs_from_vcf``.
    An allele for which that id is ``None`` is skipped; the remaining alleles
    of the record are still processed.

    Yields:
        ``{"_id": <hgvs id>, "exac": {...}}`` after it has been passed through
        ``value_convert``, ``unlist`` and ``dict_sweep(..., [None])``.

    Raises:
        KeyError: if one of the mandatory INFO keys (``AC``, ``AN``, ``FS``,
            ...) is missing and ``item.INFO`` is a mapping.
    """
    chrom = item.CHROM
    chrom_start = item.POS
    ref = item.REF
    info = item.INFO

    def _optional(key):
        # Not every record carries these annotations.  A missing key becomes
        # None (presumably removed later by dict_sweep(..., [None])).
        try:
            return info[key]
        except KeyError:
            return None

    baseqranksum = _optional('BaseQRankSum')
    clippingranksum = _optional('ClippingRankSum')
    mqranksum = _optional('MQRankSum')
    readposranksum = _optional('ReadPosRankSum')
    qd = _optional('QD')
    inbreedingcoeff = _optional('InbreedingCoeff')

    # Normalise the ALT alleles to plain strings, in place.
    for idx, allele in enumerate(item.ALT):
        item.ALT[idx] = str(allele)

    for alt in item.ALT:
        hgvs, var_type = get_hgvs_from_vcf(chrom,
                                           chrom_start,
                                           ref,
                                           alt,
                                           mutant_type=True)
        if hgvs is None:
            # Skip only this allele.  A bare ``return`` here would end the
            # generator and drop the remaining alleles of the record.
            continue
        one_snp_json = {
            "_id": hgvs,
            "exac": {
                "chrom": chrom,
                "pos": chrom_start,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {
                    "ac": info['AC'],
                    "ac_afr": info['AC_AFR'],
                    "ac_amr": info['AC_AMR'],
                    "ac_adj": info['AC_Adj'],
                    "ac_eas": info['AC_EAS'],
                    "ac_fin": info['AC_FIN'],
                    "ac_het": info['AC_Het'],
                    "ac_hom": info['AC_Hom'],
                    "ac_nfe": info['AC_NFE'],
                    "ac_oth": info['AC_OTH'],
                    "ac_sas": info['AC_SAS'],
                    "ac_female": info['AC_FEMALE'],
                    "ac_male": info['AC_MALE']
                },
                "af": info['AF'],
                "an": {
                    "an": info['AN'],
                    "an_afr": info['AN_AFR'],
                    "an_amr": info['AN_AMR'],
                    "an_adj": info['AN_Adj'],
                    "an_eas": info['AN_EAS'],
                    "an_fin": info['AN_FIN'],
                    "an_nfe": info['AN_NFE'],
                    "an_oth": info['AN_OTH'],
                    "an_sas": info['AN_SAS'],
                    "an_female": info['AN_FEMALE'],
                    "an_male": info['AN_MALE']
                },
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {
                    "het_afr": info['Het_AFR'],
                    "het_amr": info['Het_AMR'],
                    "het_eas": info['Het_EAS'],
                    "het_fin": info['Het_FIN'],
                    "het_nfe": info['Het_NFE'],
                    "het_oth": info['Het_OTH'],
                    "het_sas": info['Het_SAS']
                },
                "hom": {
                    "hom_afr": info['Hom_AFR'],
                    "hom_amr": info['Hom_AMR'],
                    "hom_eas": info['Hom_EAS'],
                    "hom_fin": info['Hom_FIN'],
                    "hom_nfe": info['Hom_NFE'],
                    "hom_oth": info['Hom_OTH'],
                    "hom_sas": info['Hom_SAS']
                },
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        yield dict_sweep(unlist(value_convert(one_snp_json)), [None])
Example #34
0
def _map_line_to_json(fields, version):
    """Convert one split dbNSFP row into a ``{"_id": ..., "dbnsfp": {...}}`` doc.

    Columns are addressed by fixed position, so ``fields`` must follow the
    column layout of the dbNSFP release this parser was written for.

    Args:
        fields: sequence of string column values for one row.
        version: ``'hg19'`` or ``'hg38'``; selects which coordinate is used
            to build the document ``_id``.

    Returns:
        ``None`` when the row has no hg19 position (``fields[8] == '.'``).
        Otherwise the document after ``value_convert``, ``unlist``,
        ``dict_sweep(vals=["."])`` and ``list_split(..., ";")``, with
        ``dbnsfp.chrom`` forced back to ``str``.

    Raises:
        ValueError: if ``version`` is neither ``'hg19'`` nor ``'hg38'``, or
            if a position column does not hold an integer.
    """

    def _split_scores(raw):
        # Score columns are ';'-separated and use '.' for "no value".  When a
        # column holds several values, each '.' becomes None (presumably to
        # keep the remaining values positionally aligned).  A single-valued
        # column is returned untouched, so a lone '.' is still handled by the
        # final dict_sweep(vals=["."]).
        scores = raw.split(';')
        if len(scores) > 1:
            return [None if score == '.' else score for score in scores]
        return scores

    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'
    # fields[7] in version 2, represent hg18_pos
    hg18_end = "." if fields[10] == "." else int(fields[10])
    # in case of no hg19 position provided, remove the item
    if fields[8] == '.':
        return None
    chromStart = int(fields[8])
    chromEnd = int(fields[8])
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    if version == 'hg19':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    elif version == 'hg38':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on HGVS; fail with an explicit message instead.
        raise ValueError(
            "unsupported genome assembly version: %r" % (version,))

    if fields[105] == ".":
        siphy = "."
    else:
        freq = fields[105].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # Build real lists here: on Python 3 a bare map() is a one-shot lazy
    # iterator, not a list of dicts.
    gtex_gene = fields[181].split('|')
    gtex_tissue = fields[182].split('|')
    gtex = [{'gene': gene, 'tissue': tissue}
            for gene, tissue in zip(gtex_gene, gtex_tissue)]
    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': one_acc, 'pos': one_pos}
               for one_acc, one_pos in zip(acc, pos)]

    provean_score = _split_scores(fields[52])
    sift_score = _split_scores(fields[23])
    hdiv_score = _split_scores(fields[29])
    hvar_score = _split_scores(fields[32])
    lrt_score = _split_scores(fields[35])
    dann_score = _split_scores(fields[69])
    mutationtaster_score = _split_scores(fields[39])
    mutationassessor_score = _split_scores(fields[46])
    vest3_score = _split_scores(fields[57])
    metasvm_score = _split_scores(fields[59])
    fathmm_score = _split_scores(fields[49])
    lr_score = _split_scores(fields[62])
    fathmm_coding_score = _split_scores(fields[71])
    integrated_fitcons_score = _split_scores(fields[82])
    gm12878_fitcons_score = _split_scores(fields[85])
    h1_hesc_fitcons_score = _split_scores(fields[88])
    huvec_fitcons_score = _split_scores(fields[91])

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            #"rsid_dbSNP144": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": fields[10],
                "end": hg18_end
            },
            "hg38": {
                "start": fields[1],
                "end": fields[1]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
                "codon_degeneracy": fields[15]
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[180],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20],
                "proteinid": fields[21]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": fields[24],
                "pred": fields[25]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": fields[47],
                "pred": fields[48]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": fields[50],
                "pred": fields[51]
            },
            "provean": {
                "score": provean_score,
                "rankscore": fields[53],
                "pred": fields[54]
            },
            "vest3": {
                "score": vest3_score,
                # The rank score is the column right after the score; reading
                # fields[57] here duplicated the raw score column.
                "rankscore": fields[58],
                "transcriptid": fields[55],
                "transcriptvar": fields[56]
            },
            "fathmm-mkl": {
                "coding_score": fathmm_coding_score,
                "coding_rankscore": fields[72],
                "coding_pred": fields[73],
                "coding_group": fields[74]
            },
            "eigen": {
                "raw": fields[75],
                "phred": fields[76],
                "raw_rankscore": fields[77]
            },
            "eigen-pc": {
                "raw": fields[78],
                "raw_rankscore": fields[79]
            },
            "genocanyon": {
                "score": fields[80],
                "rankscore": fields[81]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": fields[60],
                "pred": fields[61]
            },
            "metalr": {
                "score": lr_score,
                "rankscore": fields[63],
                "pred": fields[64]
            },
            "reliability_index": fields[65],
            "dann": {
                "score": dann_score,
                "rankscore": fields[70]
            },
            "gerp++": {
                "nr": fields[94],
                "rs": fields[95],
                "rs_rankscore": fields[96]
            },
            "integrated": {
                "fitcons_score": integrated_fitcons_score,
                "fitcons_rankscore": fields[83],
                "confidence_value": fields[84]
            },
            "gm12878": {
                "fitcons_score": gm12878_fitcons_score,
                "fitcons_rankscore": fields[86],
                "confidence_value": fields[87]
            },
            "h1-hesc": {
                "fitcons_score": h1_hesc_fitcons_score,
                "fitcons_rankscore": fields[89],
                "confidence_value": fields[90]
            },
            "huvec": {
                "fitcons_score": huvec_fitcons_score,
                "fitcons_rankscore": fields[92],
                "confidence_value": fields[93]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": fields[97],
                    "vertebrate_rankscore": fields[98]
                },
                "p20way": {
                    "mammalian": fields[99],
                    "mammalian_rankscore": fields[100]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": fields[101],
                    "vertebrate_rankscore": fields[102]
                },
                "20way": {
                    "mammalian": fields[103],
                    "mammalian_rankscore": fields[104]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[106],
                "logodds_rankscore": fields[107]
            },
            "1000gp3": {
                "ac": fields[108],
                "af": fields[109],
                "afr_ac": fields[110],
                "afr_af": fields[111],
                "eur_ac": fields[112],
                "eur_af": fields[113],
                "amr_ac": fields[114],
                "amr_af": fields[115],
                "eas_ac": fields[116],
                "eas_af": fields[117],
                "sas_ac": fields[118],
                "sas_af": fields[119]
            },
            "twinsuk": {
                "ac": fields[120],
                "af": fields[121]
            },
            "alspac": {
                "ac": fields[122],
                "af": fields[123]
            },
            "esp6500": {
                "aa_ac": fields[124],
                "aa_af": fields[125],
                "ea_ac": fields[126],
                "ea_af": fields[127]
            },
            "exac": {
                "ac": fields[128],
                "af": fields[129],
                "adj_ac": fields[130],
                "adj_af": fields[131],
                "afr_ac": fields[132],
                "afr_af": fields[133],
                "amr_ac": fields[134],
                "amr_af": fields[135],
                "eas_ac": fields[136],
                "eas_af": fields[137],
                "fin_ac": fields[138],
                "fin_af": fields[139],
                "nfe_ac": fields[140],
                "nfe_af": fields[141],
                "sas_ac": fields[142],
                "sas_af": fields[143]
            },
            "exac_nontcga": {
                "ac": fields[144],
                "af": fields[145],
                "adj_ac": fields[146],
                "adj_af": fields[147],
                "afr_ac": fields[148],
                "afr_af": fields[149],
                "amr_ac": fields[150],
                "amr_af": fields[151],
                "eas_ac": fields[152],
                "eas_af": fields[153],
                "fin_ac": fields[154],
                "fin_af": fields[155],
                "nfe_ac": fields[156],
                "nfe_af": fields[157],
                "sas_ac": fields[158],
                "sas_af": fields[159]
            },
            "exac_nonpsych": {
                "ac": fields[160],
                "af": fields[161],
                "adj_ac": fields[162],
                "adj_af": fields[163],
                "afr_ac": fields[164],
                "afr_af": fields[165],
                "amr_ac": fields[166],
                "amr_af": fields[167],
                "eas_ac": fields[168],
                "eas_af": fields[169],
                "fin_ac": fields[170],
                "fin_af": fields[171],
                "nfe_ac": fields[172],
                "nfe_af": fields[173]
            },
            "clinvar": {
                "rs": fields[176],
                "clinsig": fields[177],
                "trait": fields[178],
                "golden_stars": fields[179]
            },
            "gtex": gtex
        }
    }

    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    # value_convert may have turned a numeric chromosome name into a number;
    # force it back to a string.
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
Example #35
0
def _map_line_to_json(fields, version):
    chrInfo = fields[0].split(":")  # grch37
    chrom = chrInfo[0]
    chromStart = int(chrInfo[1])
    ma_fin_percent = fields[7].split("/")
    if fields[3]:
        mutation = fields[3].split(">")
        ref = mutation[0]
        alt = mutation[1]
        hg19 = get_pos_start_end(chrom, chromStart, ref, alt)
        hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref,
                                 alt)
        if version == 'hg19':
            HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)
        elif version == 'hg38':
            HGVS = get_hgvs_from_vcf(chrom, hg38[0], ref, alt)

    # load as json data
    if HGVS is None:
        return

    one_snp_json = {
        "_id": HGVS,
        "evs": {
            "chrom": chrom,
            "hg19": {
                "start": hg19[0],
                "end": hg19[1]
            },
            "hg38": {
                "start": hg38[0],
                "end": hg38[1]
            },
            "rsid": fields[1],
            "dbsnp_version": get_dbsnp(fields[2]),
            "ref": ref,
            "alt": alt,
            "allele_count": {
                "european_american": count_dict(fields[4]),
                "african_american": count_dict(fields[5]),
                "all": count_dict(fields[6])
            },
            "ma_fin_percent": {
                "european_american": ma_fin_percent[0],
                "african_american": ma_fin_percent[1],
                "all": ma_fin_percent[2]
            },
            "genotype_count": {
                "european_american": count_dict(fields[8]),
                "african_american": count_dict(fields[9]),
                "all_genotype": count_dict(fields[10])
            },
            "avg_sample_read": fields[11],
            "gene": {
                "symbol": fields[12],
                "accession": fields[13]
            },
            "function_gvs": fields[14],
            "hgvs": {
                "coding": fields[16],
                "protein": fields[15]
            },
            "coding_dna_size": fields[17],
            "conservation": {
                "phast_cons": fields[18],
                "gerp": fields[19]
            },
            "grantham_score": fields[20],
            "polyphen2": {
                "class": polyphen(fields[21])[0],
                "score": polyphen(fields[21])[1]
            },
            "ref_base_ncbi": fields[22],
            "chimp_allele": fields[23],
            "clinical_info": fields[24],
            "filter_status": fields[25],
            "on_illumina_human_exome_chip": fields[26],
            "gwas_pubmed_info": fields[27],
            "estimated_age_kyrs": {
                "ea": fields[28],
                "aa": fields[29]
            }
        }
    }
    return dict_sweep(value_convert(one_snp_json),
                      vals=["NA", "none", "unknown"])
Example #36
0
def _map_line_to_json(fields):
    chrInfo = fields[0].split(":")  # grch37
    chrom = chrInfo[0]
    chromStart = int(chrInfo[1])

    ma_fin_percent = fields[7].split("/")

    if fields[3]:
        mutation = fields[3].split(">")
        ref = mutation[0]
        alt = mutation[1]
        HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)
        hg19 = get_pos_start_end(chrom, chromStart, ref, alt)
        hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref, alt)

    # load as json data
    if HGVS is None:
        return

    one_snp_json = {
        "_id": HGVS,
        "evs":
            {
                "chrom": chrom,
                "hg19":
                    {
                        "start": hg19[0],
                        "end": hg19[1]
                    },
                "hg38":
                    {
                        "start": hg38[0],
                        "end": hg38[1]
                    },
                "rsid": fields[1],
                "dbsnp_version": get_dbsnp(fields[2]),
                "ref": ref,
                "alt": alt,
                "allele_count":
                    {
                        "european_american": count_dict(fields[4]),
                        "african_american": count_dict(fields[5]),
                        "all": count_dict(fields[6])
                    },
                "ma_fin_percent":
                    {
                        "european_american": ma_fin_percent[0],
                        "african_american": ma_fin_percent[1],
                        "all": ma_fin_percent[2]
                    },
                "genotype_count":
                    {
                        "european_american": count_dict(fields[8]),
                        "african_american": count_dict(fields[9]),
                        "all_genotype": count_dict(fields[10])
                    },
                "avg_sample_read": fields[11],
                "gene":
                    {
                        "symbol": fields[12],
                        "accession": fields[13]
                    },
                "function_gvs": fields[14],
                "hgvs":
                    {
                        "coding": fields[16],
                        "protein": fields[15]
                    },
                "coding_dna_size": fields[17],
                "conservation":
                    {
                        "phast_cons": fields[18],
                        "gerp": fields[19]
                    },
                "grantham_score": fields[20],
                "polyphen2":
                    {
                        "class": polyphen(fields[21])[0],
                        "score": polyphen(fields[21])[1]
                    },
                "ref_base_ncbi": fields[22],
                "chimp_allele": fields[23],
                "clinical_info": fields[24],
                "filter_status": fields[25],
                "on_illumina_human_exome_chip": fields[26],
                "gwas_pubmed_info": fields[27],
                "estimated_age_kyrs":
                    {
                        "ea": fields[28],
                        "aa": fields[29]
                    }
            }
        }
    return dict_sweep(value_convert(one_snp_json), vals=["NA", "none", "unknown"])
Example #37
0
    def load(self, aslist=False):
        '''
        Load the NCBI "gene_info" file into basic gene documents.

        This must be called first to create basic gene documents
        with all basic fields, e.g., name, symbol, synonyms, etc.

        Format of the gene_info file (tab is used as a separator, a pound
        sign starts a comment):
            tax_id GeneID Symbol LocusTag Synonyms dbXrefs
            map_location description type_of_gene
            Symbol_from_nomenclature_authority
            Full_name_from_nomenclature_authority
            Nomenclature_status Other_designations Modification_date

        Rows are read from self.datafile, filtered with self.species_filter
        and keyed by column 1.  Returns dict_to_list(...) of the documents
        when aslist is true, otherwise the keyed documents themselves.
        '''
        load_start(self.datafile)
        wanted_columns = (0, 1, 2, 3, 4, 5, 7, 8, 9, 13, 14)
        gene_d = tab2dict(self.datafile,
                          wanted_columns,
                          key=1,
                          alwayslist=0,
                          includefn=self.species_filter)

        missing = '-'  # placeholder the file uses for an empty column
        dup_prefix_dbs = ('VGNC', 'HGNC', 'MGI')
        skipped_dbs = ('ensembl', 'imgt/gene-db')

        def _to_gene_doc(row):
            # Turn the tuple of selected columns for one gene into a document.
            (taxid, symbol, locus_tag, synonyms, dbxrefs, map_location,
             description, type_of_gene, other_designations,
             modification_date) = row
            doc = {'taxid': int(taxid), 'symbol': symbol, 'name': description}
            for field_name, raw in (('map_location', map_location),
                                    ('type_of_gene', type_of_gene)):
                if raw != missing:
                    doc[field_name] = raw
            if synonyms != missing:
                doc['alias'] = normalized_value(synonyms.split('|'))
            if locus_tag != missing:
                doc['locus_tag'] = locus_tag
            if other_designations != missing:
                doc['other_names'] = normalized_value(
                    other_designations.split('|'))

            # when merged, this will become the default timestamp
            doc["_timestamp"] = datetime.datetime.strptime(
                modification_date, "%Y%m%d")

            for xref in dbxrefs.split('|'):
                if xref == missing:
                    continue
                parts = xref.split(':')
                has_dup_prefix = (len(parts) == 3 and parts[0] == parts[1]
                                  and parts[0] in dup_prefix_dbs)
                if has_dup_prefix:
                    # a fix for NCBI bug for dup xref prefix,
                    # 'HGNC:HGNC:36328'
                    parts = parts[1:]
                try:
                    _db, _id = parts
                except ValueError:
                    # show the offending xref before failing
                    print(repr(xref))
                    raise
                db_key = _db.lower()
                # we don't need ensembl xref from here, we will get it from
                # Ensembl directly; 'IMGT/GENE-DB' xrefs are skipped as well
                # because they are mostly the same as the gene symbol
                if db_key in skipped_dbs:
                    continue
                # add "MGI:" prefix for MGI ids.
                if db_key == 'mgi':
                    _id = "MGI:" + _id
                doc[_db] = _id
            return doc

        gene_d = value_convert(gene_d, _to_gene_doc)

        # add entrezgene field
        for geneid in gene_d:
            gene_doc = gene_d[geneid]
            gene_doc['entrezgene'] = int(geneid)
            gene_d[geneid] = gene_doc

        load_done('[%d]' % len(gene_d))

        return dict_to_list(gene_d) if aslist else gene_d
Example #38
0
def _map_line_to_json(fields):
    """Convert one split CADD row into a ``{"_id": ..., "cadd": {...}}`` doc.

    Rows whose column count differs from ``VALID_COLUMN_NO`` are ignored and
    ``None`` is returned.  Otherwise the id is built as
    ``chr<chrom>:g.<pos><ref>><alt>`` from columns 0, 1, 2 and 4, and the
    document is returned after ``value_convert``, ``unlist`` and
    ``dict_sweep(..., "NA")``.
    """
    if len(fields) != VALID_COLUMN_NO:
        return None

    variant_id = "chr%s:g.%s%s>%s" % (fields[0], fields[1], fields[2],
                                      fields[4])

    mapability = {'20bp': fields[15], '35bp': fields[16]}
    phast_cons = {
        'primate': fields[18],
        'mammalian': fields[19],
        'vertebrate': fields[20],
    }
    phylop = {
        'primate': fields[21],
        'mammalian': fields[22],
        'vertebrate': fields[23],
    }
    gerp = {
        'n': fields[24],
        's': fields[25],
        'rs': fields[26],
        'rs_pval': fields[27],
    }
    encode = {
        'exp': fields[29],
        'h3k27ac': fields[30],
        'h3k4me1': fields[31],
        'h3k4me3': fields[32],
        'nucleo': fields[33],
        'occ': fields[34],
        'p_val': {
            'comb': fields[35],
            'dnas': fields[36],
            'faire': fields[37],
            'polii': fields[38],
            'ctcf': fields[39],
            'mycp': fields[40],
        },
        'sig': {
            'dnase': fields[41],
            'faire': fields[42],
            'polii': fields[43],
            'ctcf': fields[44],
            'myc': fields[45],
        },
    }
    motif = {
        'toverlap': fields[47],
        'dist': fields[48],
        'ecount': fields[49],
        'ename': fields[50],
        'ehipos': fields[51],
        'escorechng': fields[52],
    }
    tf = {
        'bs': fields[53],
        'bs_peaks': fields[54],
        'bs_peaks_max': fields[55],
    }
    esp = {'af': fields[57], 'afr': fields[58], 'eur': fields[59]}
    thousand_genomes = {
        'af': fields[60],
        'asn': fields[61],
        'amr': fields[62],
        'afr': fields[63],
        'eur': fields[64],
    }
    gene = {
        'gene_id': fields[67],
        'feature_id': fields[68],
        'ccds_id': fields[69],
        'genename': fields[70],
        'cds': {
            'cdna_pos': fields[71],
            'rel_cdna_pos': fields[72],
            'cds_pos': fields[73],
            'rel_cds_pos': fields[74],
        },
        'prot': {
            'protpos': fields[75],
            'rel_prot_pos': fields[76],
            # the amino-acid columns sit after the exon/intron columns
            'oaa': fields[81],
            'naa': fields[82],
        },
        'dst_2_splice': fields[77],
        'dst_2_spltype': fields[78],
        'exon': fields[79],
        'intron': fields[80],
    }
    polyphen = {'cat': fields[84], 'val': fields[85]}
    sift = {'cat': fields[86], 'val': fields[87]}

    # load as json data
    one_snp_json = {
        "_id": variant_id,
        "cadd": {
            'chrom': fields[0],
            'pos': fields[1],
            'ref': fields[2],
            'anc': fields[3],
            'alt': fields[4],
            'type': fields[5],
            'length': fields[6],
            'istv': fields[7],
            'isderived': fields[8],
            'annotype': fields[9],
            'consequence': fields[10],
            'consscore': fields[11],
            'consdetail': fields[12],
            'gc': fields[13],
            'cpg': fields[14],
            'mapability': mapability,
            'scoresegdup': fields[17],
            'phast_cons': phast_cons,
            'phylop': phylop,
            'gerp': gerp,
            'bstatistic': fields[28],
            'encode': encode,
            'segway': fields[46],
            'motif': motif,
            'tf': tf,
            'isknownvariant': fields[56],
            'esp': esp,
            '1000g': thousand_genomes,
            'min_dist_tss': fields[65],
            'min_dist_tse': fields[66],
            'gene': gene,
            'grantham': fields[83],
            'polyphen': polyphen,
            'sift': sift,
            'rawscore': fields[88],
            'phred': fields[89],
        },
    }
    # NOTE(review): the other parsers in this file hand dict_sweep a list of
    # placeholder values; here a bare string is passed -- confirm that
    # dict_sweep treats it as intended.
    return dict_sweep(unlist(value_convert(one_snp_json)), "NA")
def _map_line_to_json(cp):
    '''
    Convert one parsed ClinVar record into JSON-ready documents.

    This is a generator: it walks every Measure found under the record's
    MeasureSet and yields one document for each Measure for which an
    "_id" could be built.  Measures typed 'Variation', 'protein only' or
    'Microsatellite' are skipped.

    :param cp: parsed ClinVar record; it is read through its
               ReferenceClinVarAssertion and ClinVarAssertion attributes.
    :return: generator of dicts shaped {"_id": ..., "clinvar": {...}},
             each passed through value_convert, unlist and dict_sweep.

    NOTE(review): a Measure without any AttributeSet, or one for which no
    id source is available, ends the whole generator (later Measures of
    the same record are not visited).  This is the historical behaviour
    and is kept as is; confirm whether `continue` was intended.
    '''
    reference = cp.ReferenceClinVarAssertion

    def _optional(getter):
        # Some records lack these sub-elements; a missing attribute or an
        # empty list means "no value" rather than a failed record.
        try:
            return getter()
        except (AttributeError, IndexError, TypeError):
            return None

    clinical_significance = _optional(
        lambda: reference.ClinicalSignificance.Description)
    rcv_accession = reference.ClinVarAccession.Acc
    review_status = _optional(
        lambda: reference.ClinicalSignificance.ReviewStatus)
    last_evaluated = _optional(
        lambda: reference.ClinicalSignificance.DateLastEvaluated)
    variant_id = reference.MeasureSet.ID
    number_submitters = len(cp.ClinVarAssertion)
    # some items in clinvar_xml don't have origin information
    origin = _optional(lambda: reference.ObservedIn[0].Sample.Origin)

    # Only the first Trait of the TraitSet is described in the output.
    trait = reference.TraitSet.Trait[0]
    synonyms = []
    conditions_name = ''
    for trait_name in trait.Name:
        name_type = trait_name.ElementValue.Type
        if name_type == 'Alternate':
            synonyms.append(trait_name.ElementValue.get_valueOf_())
        if name_type == 'Preferred':
            conditions_name += trait_name.ElementValue.get_valueOf_()
    identifiers = {}
    for item in trait.XRef:
        if item.DB == 'Human Phenotype Ontology':
            key = 'Human_Phenotype_Ontology'
        else:
            key = item.DB
        identifiers[key.lower()] = item.ID
    for trait_symbol in trait.Symbol:
        if trait_symbol.ElementValue.Type == 'Preferred':
            conditions_name += \
                ' (' + trait_symbol.ElementValue.get_valueOf_() + ')'
    age_of_onset = ''
    for _set in trait.AttributeSet:
        if _set.Attribute.Type == 'age of onset':
            age_of_onset = _set.Attribute.get_valueOf_()

    skipped_types = ('Variation', 'protein only', 'Microsatellite')
    copy_number_types = ('copy number loss', 'copy number gain')
    # Order matters: 'coding' is a substring of 'non-coding', so the
    # longer label has to be tested first.
    hgvs_categories = ('genomic', 'non-coding', 'coding', 'protein')
    # Variation types whose id suffix is parsed out of the genomic HGVS
    # string, mapped to the marker that introduces that suffix.
    parsed_markers = {'Indel': 'del', 'Insertion': 'ins',
                      'Duplication': 'dup'}

    # MeasureSet.Measure returns a list, there might be multiple
    # Measure under one MeasureSet
    for measure in reference.MeasureSet.Measure:
        variation_type = measure.Type
        if variation_type in skipped_types:
            continue
        allele_id = measure.ID
        chrom = None
        chromStart = None
        chromEnd = None
        chromStart_38 = None
        chromEnd_38 = None
        ref = None
        alt = None
        if measure.SequenceLocation:
            for location in measure.SequenceLocation:
                # GRCh37 supplies the chromosome, the hg19 coordinates
                # and, preferentially, the alleles.
                if 'GRCh37' in location.Assembly:
                    chrom = location.Chr
                    chromStart = location.start
                    chromEnd = location.stop
                    ref = location.referenceAllele
                    alt = location.alternateAllele
                # GRCh38 supplies the hg38 coordinates and only fills in
                # alleles that are still missing.
                if 'GRCh38' in location.Assembly:
                    chromStart_38 = location.start
                    chromEnd_38 = location.stop
                    if not ref:
                        ref = location.referenceAllele
                    if not alt:
                        alt = location.alternateAllele

        symbol = None
        gene_id = None
        if measure.MeasureRelationship:
            relationship = measure.MeasureRelationship[0]
            symbol = _optional(
                lambda: relationship.Symbol[0].get_ElementValue().valueOf_)
            gene_id = relationship.XRef[0].ID

        if measure.Name:
            preferred_name = measure.Name[0].ElementValue.valueOf_
        else:
            preferred_name = None
        if len(measure.CytogeneticLocation) == 1:
            cytogenic = measure.CytogeneticLocation[0]
        else:
            cytogenic = measure.CytogeneticLocation

        hgvs_coding = None
        hgvs_genome = None
        HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
        coding_hgvs_only = None
        hgvs_id = None

        if not measure.AttributeSet:
            print('no measure.attribute %s' % (rcv_accession,))
            return

        # 'copy number loss' or 'gain' have a format different from
        # other types and take their genomic HGVS from another attribute.
        is_copy_number = variation_type in copy_number_types
        for attribute_set in measure.AttributeSet:
            attribute = attribute_set.Attribute
            attribute_type = attribute.Type
            if is_copy_number:
                if 'HGVS, genomic, top level' in attribute_type and \
                        attribute.integerValue == 37:
                    hgvs_genome = attribute.get_valueOf_()
            for category in hgvs_categories:
                if category in attribute_type:
                    HGVS[category].append(attribute.get_valueOf_())
                    break
            if not is_copy_number:
                if attribute_type == 'HGVS, coding, RefSeq':
                    hgvs_coding = attribute.get_valueOf_()
                elif attribute_type == \
                        'HGVS, genomic, top level, previous':
                    hgvs_genome = attribute.get_valueOf_()
                    # attributes after this one are not examined
                    break

        if chrom and chromStart and chromEnd:
            if variation_type == 'single nucleotide variant':
                hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
            elif variation_type == 'Deletion':
                hgvs_id = "chr%s:g.%s_%sdel" % \
                          (chrom, chromStart, chromEnd)
            elif variation_type in parsed_markers:
                # items whose type belongs to 'Indel, Insertion,
                # Duplication' might not have explicit alt information,
                # so it is parsed from hgvs_genome.  Without the marker
                # in that string no id is produced (previously 'Indel'
                # sliced from a failed find() and built a bogus id).
                marker = parsed_markers[variation_type]
                if hgvs_genome and marker in hgvs_genome:
                    tail = hgvs_genome[
                        hgvs_genome.find(marker) + len(marker):]
                    hgvs_id = "chr%s:g.%s_%s%s%s" % \
                              (chrom, chromStart, chromEnd, marker, tail)
        elif is_copy_number:
            if hgvs_genome:
                parts = hgvs_genome.split('.')
                hgvs_id = "chr" + parts[1] + parts[2]
        elif hgvs_coding:
            hgvs_id = hgvs_coding
            coding_hgvs_only = True
        else:
            print("couldn't find any id %s" % (rcv_accession,))
            return

        for key in HGVS:
            HGVS[key].sort()

        rsid = None
        cosmic = None
        dbvar = None
        uniprot = None
        omim = None
        # loop through XRef to find rsid as well as other ids
        if measure.XRef:
            for xref in measure.XRef:
                if xref.Type == 'rs':
                    rsid = 'rs' + str(xref.ID)
                elif xref.DB == 'COSMIC':
                    cosmic = xref.ID
                elif xref.DB == 'OMIM':
                    omim = xref.ID
                elif xref.DB == 'UniProtKB/Swiss-Prot':
                    uniprot = xref.ID
                elif xref.DB == 'dbVar':
                    dbvar = xref.ID

        # make sure the hgvs_id is not none
        if hgvs_id:
            one_snp_json = {
                "_id": hgvs_id,
                "clinvar": {
                    "allele_id": allele_id,
                    "variant_id": variant_id,
                    "chrom": chrom,
                    "omim": omim,
                    "cosmic": cosmic,
                    "uniprot": uniprot,
                    "dbvar": dbvar,
                    "hg19": {
                        "start": chromStart,
                        "end": chromEnd
                    },
                    "hg38": {
                        "start": chromStart_38,
                        "end": chromEnd_38
                    },
                    "type": variation_type,
                    "gene": {
                        "id": gene_id,
                        "symbol": symbol
                    },
                    "rcv": {
                        "accession": rcv_accession,
                        "clinical_significance": clinical_significance,
                        "number_submitters": number_submitters,
                        "review_status": review_status,
                        # str(None) gives 'None', which is one of the
                        # values swept out below
                        "last_evaluated": str(last_evaluated),
                        "preferred_name": preferred_name,
                        "origin": origin,
                        "conditions": {
                            "name": conditions_name,
                            "synonyms": synonyms,
                            "identifiers": identifiers,
                            "age_of_onset": age_of_onset
                        }
                    },
                    "rsid": rsid,
                    "cytogenic": cytogenic,
                    "hgvs": HGVS,
                    "coding_hgvs_only": coding_hgvs_only,
                    "ref": ref,
                    "alt": alt
                }
            }
            obj = (dict_sweep(
                unlist(
                    value_convert(one_snp_json, [
                        'chrom', 'omim', 'id', 'orphanet', 'gene',
                        'rettbase_(cdkl5)', 'cosmic', 'dbrbc'
                    ])), [None, '', 'None']))
            yield obj
Example #40
0
def _map_line_to_json(fields):
    '''
    Turn one row of column values into a single "cadd" JSON document.

    This is a generator.  It yields nothing when get_hgvs_from_vcf
    returns None for the row, otherwise it yields exactly one dict of the
    form {"_id": <hgvs>, "cadd": {...}} after value_convert, unlist and
    dict_sweep (which is given ["NA"]) have been applied.

    :param fields: indexable row; its length must equal VALID_COLUMN_NO
    '''
    assert len(fields) == VALID_COLUMN_NO
    chrom, pos, ref, alt = fields[0], fields[1], fields[2], fields[4]
    hgvs = get_hgvs_from_vcf(chrom, pos, ref, alt)

    # load as json data
    if hgvs is None:
        return

    def cols(first, *labels):
        # Map consecutive columns, starting at index `first`, onto the
        # given labels (one column per label, in order).
        return dict((label, fields[first + offset])
                    for offset, label in enumerate(labels))

    # columns 0-14
    cadd = cols(0, 'chrom', 'pos', 'ref', 'anc', 'alt', 'type', 'length',
                'istv', 'isderived', 'annotype', 'consequence',
                'consscore', 'consdetail', 'gc', 'cpg')
    cadd['mapability'] = cols(15, '20bp', '35bp')                 # 15-16
    cadd['scoresegdup'] = fields[17]
    cadd['phast_cons'] = cols(18, 'primate', 'mammalian',
                              'vertebrate')                       # 18-20
    cadd['phylop'] = cols(21, 'primate', 'mammalian',
                          'vertebrate')                           # 21-23
    cadd['gerp'] = cols(24, 'n', 's', 'rs', 'rs_pval')            # 24-27
    cadd.update(cols(28, 'bstatistic', 'mutindex'))               # 28-29
    cadd['dna'] = cols(30, 'helt', 'mgw', 'prot', 'roll')         # 30-33
    cadd['mirsvr'] = cols(34, 'score', 'e', 'aln')                # 34-36
    cadd.update(cols(37, 'targetscans', 'fitcons'))               # 37-38

    # columns 39-44, then 46-53; column 45 is not exported.
    # NOTE(review): 'enh' is taken from column 44 and an alternative
    # mapping of 'enh' to column 45 was left disabled - confirm which
    # column is the intended one.
    chmm = cols(39, 'tssa', 'tssaflnk', 'txflnk', 'tx', 'txwk', 'enh')
    chmm.update(cols(46, 'znfrpts', 'het', 'tssbiv', 'bivflnk', 'enhbiv',
                     'reprpc', 'reprpcwk', 'quies'))
    cadd['chmm'] = chmm

    encode = cols(54, 'exp', 'h3k27ac', 'h3k4me1', 'h3k4me3', 'nucleo',
                  'occ')                                          # 54-59
    encode['p_val'] = cols(60, 'comb', 'dnas', 'faire', 'polii', 'ctcf',
                           'mycp')                                # 60-65
    encode['sig'] = cols(66, 'dnase', 'faire', 'polii', 'ctcf',
                         'myc')                                   # 66-70
    cadd['encode'] = encode

    cadd['segway'] = fields[71]
    cadd['motif'] = cols(72, 'toverlap', 'dist', 'ecount', 'ename',
                         'ehipos', 'escorechng')                  # 72-77
    cadd['tf'] = cols(78, 'bs', 'bs_peaks', 'bs_peaks_max')       # 78-80
    cadd['isknownvariant'] = fields[81]
    cadd['esp'] = cols(82, 'af', 'afr', 'eur')                    # 82-84
    cadd['1000g'] = cols(85, 'af', 'asn', 'amr', 'afr', 'eur')    # 85-89
    cadd.update(cols(90, 'min_dist_tss', 'min_dist_tse'))         # 90-91

    gene = cols(92, 'gene_id', 'feature_id', 'ccds_id',
                'genename')                                       # 92-95
    gene['cds'] = cols(96, 'cdna_pos', 'rel_cdna_pos', 'cds_pos',
                       'rel_cds_pos')                             # 96-99
    gene['prot'] = cols(100, 'protpos', 'rel_prot_pos',
                        'domain')                                 # 100-102
    cadd['gene'] = gene

    # columns 103-109; 'oaa' is the ref aa and 'naa' the alt aa
    cadd.update(cols(103, 'dst2splice', 'dst2spltype', 'exon', 'intron',
                     'oaa', 'naa', 'grantham'))
    cadd['polyphen'] = cols(110, 'cat', 'val')                    # 110-111
    cadd['sift'] = cols(112, 'cat', 'val')                        # 112-113
    # raw CADD score, and log-percentile of the raw CADD score
    cadd.update(cols(114, 'rawscore', 'phred'))                   # 114-115

    document = {"_id": hgvs, "cadd": cadd}
    yield dict_sweep(unlist(value_convert(document)), ["NA"])
Example #41
0
    def load(self, aslist=False):
        '''
        Load the NCBI "gene_info" file into basic gene documents.

        This must be called first to create basic gene documents
        with all basic fields, e.g., name, symbol, synonyms, etc.

        Format of the gene_info file (tab is used as a separator, pound
        sign marks the start of a comment):
            tax_id GeneID Symbol LocusTag Synonyms dbXrefs chromosome
            map_location description type_of_gene
            Symbol_from_nomenclature_authority
            Full_name_from_nomenclature_authority Nomenclature_status
            Other_designations Modification_date

        Columns 0, 1, 2, 4, 5, 7, 8 and 9 are read, keyed on column 1
        (GeneID), and restricted by self.species_filter.

        :param aslist: when true, the result of dict_to_list(gene_d) is
                       returned instead of the dictionary itself.
        :return: gene documents keyed by gene id (or their list form),
                 each carrying an integer 'entrezgene' field.
        :raises ValueError: re-raised, after printing the offending entry,
                            when a dbXrefs entry does not split into
                            exactly a database name and an id.
        '''
        load_start(self.datafile)
        gene_d = tab2dict(self.datafile, (0, 1, 2, 4, 5, 7, 8, 9), key=1, alwayslist=0, includefn=self.species_filter)

        def _ff(d):
            # presumably the selected columns without the key column -
            # verify against tab2dict
            (
                taxid, symbol, synonyms,
                dbxrefs, map_location,
                description, type_of_gene
            ) = d
            out = dict(taxid=int(taxid),
                       symbol=symbol,
                       name=description)
            # '-' is treated as "no value" for the optional fields
            if map_location != '-':
                out['map_location'] = map_location
            if type_of_gene != '-':
                out['type_of_gene'] = type_of_gene
            if synonyms != '-':
                out['alias'] = normalized_value(synonyms.split('|'))

            for x in dbxrefs.split('|'):
                if x == '-':
                    continue
                xd = x.split(':')
                if len(xd) == 3 and xd[0] == xd[1] and xd[0] in ('HGNC', 'MGI'):
                    xd = xd[1:]      # a fix for NCBI bug for dup xref prefix, 'HGNC:HGNC:36328'
                try:
                    _db, _id = xd
                except ValueError:
                    # show the entry that is not a plain "db:id" pair
                    print(x)
                    raise
                db_lower = _db.lower()
                # we don't need ensembl xref from here, we will get it from Ensembl directly;
                # we don't need "IMGT/GENE-DB" xref either, because they are mostly the same as gene symbol
                if db_lower in ('ensembl', 'imgt/gene-db'):
                    continue
                if db_lower == 'mgi':            # add "MGI:" prefix for MGI ids.
                    _id = "MGI:" + _id
                # a later xref for the same database replaces an earlier one
                out[_db] = _id
            return out

        gene_d = value_convert(gene_d, _ff)

        # add entrezgene field
        for geneid in gene_d:
            d = gene_d[geneid]
            d['entrezgene'] = int(geneid)
            gene_d[geneid] = d

        load_done('[%d]' % len(gene_d))

        if aslist:
            return dict_to_list(gene_d)
        else:
            return gene_d
Example #42
0
def _map_line_to_json(cp):
    '''
    Convert one parsed ClinVar record into JSON-ready documents.

    This is a generator: it walks every Measure found under the record's
    MeasureSet and yields one document for each Measure for which an
    "_id" could be built.  Measures typed 'Variation', 'protein only' or
    'Microsatellite' are skipped.  Only GRCh37 sequence locations are
    used for coordinates and alleles.

    :param cp: parsed ClinVar record; it is read through its
               ReferenceClinVarAssertion and ClinVarAssertion attributes.
    :return: generator of dicts shaped {"_id": ..., "clinvar": {...}},
             each passed through value_convert, unlist and dict_sweep.

    NOTE(review): a Measure without any AttributeSet, or one for which no
    id source is available, ends the whole generator (later Measures of
    the same record are not visited).  This is the historical behaviour
    and is kept as is; confirm whether `continue` was intended.
    '''
    reference = cp.ReferenceClinVarAssertion
    significance = reference.ClinicalSignificance
    clinical_significance = significance.Description
    rcv_accession = reference.ClinVarAccession.Acc
    review_status = significance.ReviewStatus
    last_evaluated = significance.DateLastEvaluated
    clinvar_id = reference.MeasureSet.ID
    number_submitters = len(cp.ClinVarAssertion)
    # some items in clinvar_xml don't have origin information
    try:
        origin = reference.ObservedIn[0].Sample.Origin
    except (AttributeError, IndexError, TypeError):
        origin = None

    skipped_types = ('Variation', 'protein only', 'Microsatellite')
    copy_number_types = ('copy number loss', 'copy number gain')
    # Order matters: 'coding' is a substring of 'non-coding', so the
    # longer label has to be tested first.
    hgvs_categories = ('genomic', 'non-coding', 'coding', 'protein')
    # Variation types whose id suffix is parsed out of the genomic HGVS
    # string, mapped to the marker that introduces that suffix.
    parsed_markers = {'Indel': 'del', 'Insertion': 'ins',
                      'Duplication': 'dup'}

    # MeasureSet.Measure returns a list, there might be multiple
    # Measure under one MeasureSet
    for measure in reference.MeasureSet.Measure:
        variation_type = measure.Type
        if variation_type in skipped_types:
            continue
        allele_id = measure.ID
        chrom = None
        chromStart = None
        chromEnd = None
        ref = None
        alt = None
        if measure.SequenceLocation:
            for location in measure.SequenceLocation:
                # In this version, only accept information concerning GRCh37
                if 'GRCh37' in location.Assembly:
                    chrom = location.Chr
                    chromStart = location.start
                    chromEnd = location.stop
                    ref = location.referenceAllele
                    alt = location.alternateAllele

        symbol = None
        gene_id = None
        if measure.MeasureRelationship:
            relationship = measure.MeasureRelationship[0]
            try:
                symbol = relationship.Symbol[0].get_ElementValue().valueOf_
            except (AttributeError, IndexError, TypeError):
                symbol = None
            gene_id = relationship.XRef[0].ID

        if measure.Name:
            name = measure.Name[0].ElementValue.valueOf_
        else:
            name = None
        if len(measure.CytogeneticLocation) == 1:
            cytogenic = measure.CytogeneticLocation[0]
        else:
            cytogenic = measure.CytogeneticLocation

        hgvs_coding = None
        hgvs_genome = None
        HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
        coding_hgvs_only = None
        hgvs_id = None

        if not measure.AttributeSet:
            print('no measure.attribute %s' % (rcv_accession,))
            return

        # 'copy number loss' or 'gain' have a format different from
        # other types and take their genomic HGVS from another attribute.
        is_copy_number = variation_type in copy_number_types
        for attribute_set in measure.AttributeSet:
            attribute = attribute_set.Attribute
            attribute_type = attribute.Type
            if is_copy_number:
                if 'HGVS, genomic, top level' in attribute_type and \
                        attribute.integerValue == 37:
                    hgvs_genome = attribute.get_valueOf_()
            for category in hgvs_categories:
                if category in attribute_type:
                    HGVS[category].append(attribute.get_valueOf_())
                    break
            if not is_copy_number:
                if attribute_type == 'HGVS, coding, RefSeq':
                    hgvs_coding = attribute.get_valueOf_()
                elif attribute_type == \
                        'HGVS, genomic, top level, previous':
                    hgvs_genome = attribute.get_valueOf_()
                    # attributes after this one are not examined
                    break

        if chrom and chromStart and chromEnd:
            if variation_type == 'single nucleotide variant':
                hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
            elif variation_type == 'Deletion':
                hgvs_id = "chr%s:g.%s_%sdel" % \
                          (chrom, chromStart, chromEnd)
            elif variation_type in parsed_markers:
                # items whose type belongs to 'Indel, Insertion,
                # Duplication' might not have explicit alt information,
                # so it is parsed from hgvs_genome.  Without the marker
                # in that string no id is produced (previously 'Indel'
                # sliced from a failed find() and built a bogus id).
                marker = parsed_markers[variation_type]
                if hgvs_genome and marker in hgvs_genome:
                    tail = hgvs_genome[
                        hgvs_genome.find(marker) + len(marker):]
                    hgvs_id = "chr%s:g.%s_%s%s%s" % \
                              (chrom, chromStart, chromEnd, marker, tail)
        elif is_copy_number:
            if hgvs_genome:
                parts = hgvs_genome.split('.')
                hgvs_id = "chr" + parts[1] + parts[2]
        elif hgvs_coding:
            hgvs_id = hgvs_coding
            coding_hgvs_only = True
        else:
            print("couldn't find any id %s" % (rcv_accession,))
            return

        rsid = None
        other_id_parts = []
        # loop through XRef to find rsid as well as other ids
        if measure.XRef:
            for xref in measure.XRef:
                if xref.Type == 'rs':
                    rsid = 'rs' + str(xref.ID)
                other_id_parts.append(xref.DB + ':' + xref.ID + ';')
        # "DB:ID;" entries concatenated in XRef order ('' when none)
        other_ids = ''.join(other_id_parts)

        # make sure the hgvs_id is not none
        if hgvs_id:
            one_snp_json = {
                "_id": hgvs_id,
                "clinvar": {
                    "allele_id": allele_id,
                    "chrom": chrom,
                    "hg19": {
                        "start": chromStart,
                        "end": chromEnd
                    },
                    "type": variation_type,
                    "name": name,
                    "gene": {
                        "id": gene_id,
                        "symbol": symbol
                    },
                    "clinical_significance": clinical_significance,
                    "rsid": rsid,
                    "rcv_accession": rcv_accession,
                    "origin": origin,
                    "cytogenic": cytogenic,
                    "review_status": review_status,
                    "hgvs": HGVS,
                    "number_submitters": number_submitters,
                    "last_evaluated": str(last_evaluated),
                    "other_ids": other_ids,
                    "clinvar_id": clinvar_id,
                    "coding_hgvs_only": coding_hgvs_only,
                    "ref": ref,
                    "alt": alt
                }
            }
            obj = (dict_sweep(unlist(value_convert(one_snp_json)), [None]))
            yield obj
Example #43
0
def _map_line_to_json(df, version, index):
    """Build the dbNSFP JSON document for one row of a DataFrame.

    Args:
        df: pandas DataFrame of dbNSFP rows; cells are read by label via
            ``df.at[index, <column name>]``.  The string handling below
            (``.upper()``, ``.split()``) presumes the cells are strings
            -- TODO confirm against the loader.
        version: assembly used to build ``_id``: ``'hg19'`` or ``'hg38'``.
        index: row label of the record to convert.

    Returns:
        ``{"_id": <HGVS id>, "dbnsfp": {...}}`` after post-processing by
        ``value_convert``, ``unlist``, ``dict_sweep`` and ``list_split``,
        or ``None`` when the row has no hg19 position (``'.'``).

    Raises:
        ValueError: if ``version`` is neither ``'hg19'`` nor ``'hg38'``.
    """

    def _cell(column):
        # DataFrame.get_value was removed in pandas 1.0; ``.at`` is the
        # label-based scalar accessor and exists in older releases too.
        return df.at[index, column]

    def _split_scores(column):
        # Multi-transcript scores are ';'-separated within one cell.
        return _cell(column).split(';')

    def _exac_counts(prefix):
        # AC/AF columns of one ExAC subset (e.g. "ExAC", "ExAC_nonTCGA"):
        # overall, adjusted, then one pair per population.
        counts = {
            "ac": _cell(prefix + "_AC"),
            "af": _cell(prefix + "_AF"),
            "adj_ac": _cell(prefix + "_Adj_AC"),
            "adj_af": _cell(prefix + "_Adj_AF"),
        }
        for pop in ("AFR", "AMR", "EAS", "FIN", "NFE", "SAS"):
            counts[pop.lower() + "_ac"] = _cell("%s_%s_AC" % (prefix, pop))
            counts[pop.lower() + "_af"] = _cell("%s_%s_AF" % (prefix, pop))
        return counts

    # specific variable treatment
    chrom = _cell("#chr")
    if chrom == 'M':
        chrom = 'MT'
    # hg18 position: keep the "." placeholder, otherwise convert to int
    hg18_pos = _cell("hg18_pos(1-based)")
    hg18_end = hg18_pos if hg18_pos == "." else int(hg18_pos)
    # in case of no hg19 position provided, remove the item
    hg19_pos = _cell("hg19_pos(1-based)")
    if hg19_pos == '.':
        return None
    chromStart = int(hg19_pos)
    chromEnd = chromStart
    hg38_pos = _cell("pos(1-based)")
    chromStart_38 = int(hg38_pos)
    ref = _cell("ref").upper()
    alt = _cell("alt").upper()
    if version == 'hg19':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    elif version == 'hg38':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    else:
        # Without this branch HGVS stayed unbound and the failure surfaced
        # later as an UnboundLocalError while building the document.
        raise ValueError(
            "unsupported version %r (expected 'hg19' or 'hg38')" % (version,))

    siphy_29way_pi = _cell("SiPhy_29way_pi")
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # Pair up the '|'-separated GTEx genes and tissues.  Real lists are
    # built (not lazy map objects) so the result is the same on Python 3.
    gtex_gene = _cell("GTEx_V6_gene").split('|')
    gtex_tissue = _cell("GTEx_V6_tissue").split('|')
    gtex = [{'gene': gene, 'tissue': tissue}
            for gene, tissue in zip(gtex_gene, gtex_tissue)]
    acc = _cell("Uniprot_acc_Polyphen2").rstrip().rstrip(';').split(";")
    pos = _cell("Uniprot_aapos_Polyphen2").rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    provean_score = _split_scores("PROVEAN_score")
    sift_score = _split_scores("SIFT_score")
    hdiv_score = _split_scores("Polyphen2_HDIV_score")
    hvar_score = _split_scores("Polyphen2_HVAR_score")
    lrt_score = _split_scores("LRT_score")
    m_cap_score = _split_scores("M-CAP_score")
    mutationtaster_score = _split_scores("MutationTaster_score")
    mutationassessor_score = _split_scores("MutationAssessor_score")
    vest3_score = _split_scores("VEST3_score")
    metasvm_score = _split_scores("MetaSVM_score")
    fathmm_score = _split_scores("FATHMM_score")
    metalr_score = _split_scores("MetaLR_score")
    # NOTE(review): a loop used to sit here that evaluated
    # `[None if item == '.' else item for item in _score]` for every score
    # list and threw the result away, so it never modified anything.  It
    # has been dropped; "." entries are still handed to
    # dict_sweep(vals=["."]) below, exactly as before.

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": _cell("rs_dbSNP147"),
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": hg18_pos,
                "end": hg18_end
            },
            "hg38": {
                "start": hg38_pos,
                "end": hg38_pos
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": _cell("aaref"),
                "alt": _cell("aaalt"),
                "pos": _cell("aapos"),
                "refcodon": _cell("refcodon"),
                "codonpos": _cell("codonpos"),
                "codon_degeneracy": _cell("codon_degeneracy"),
            },
            "genename": _cell("genename"),
            "uniprot": uniprot,
            "interpro_domain": _cell("Interpro_domain"),
            "cds_strand": _cell("cds_strand"),
            "ancestral_allele": _cell("Ancestral_allele"),
            "ensembl": {
                "geneid": _cell("Ensembl_geneid"),
                "transcriptid": _cell("Ensembl_transcriptid"),
                "proteinid": _cell("Ensembl_proteinid")
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": _cell("SIFT_converted_rankscore"),
                "pred": _cell("SIFT_pred")
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": _cell("Polyphen2_HDIV_rankscore"),
                    "pred": _cell("Polyphen2_HDIV_pred")
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": _cell("Polyphen2_HVAR_rankscore"),
                    "pred": _cell("Polyphen2_HVAR_pred")
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": _cell("LRT_converted_rankscore"),
                "pred": _cell("LRT_pred"),
                "omega": _cell("LRT_Omega")
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore":
                    _cell("MutationTaster_converted_rankscore"),
                "pred": _cell("MutationTaster_pred"),
                "model": _cell("MutationTaster_model"),
                "AAE": _cell("MutationTaster_AAE")
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": _cell("MutationAssessor_score_rankscore"),
                "pred": _cell("MutationAssessor_pred")
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": _cell("FATHMM_converted_rankscore"),
                "pred": _cell("FATHMM_pred")
            },
            "provean": {
                "score": provean_score,
                "rankscore": _cell("PROVEAN_converted_rankscore"),
                "pred": _cell("PROVEAN_pred")
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": _cell("VEST3_rankscore"),
                "transcriptid": _cell("Transcript_id_VEST3"),
                "transcriptvar": _cell("Transcript_var_VEST3")
            },
            "fathmm-mkl": {
                "coding_score": _cell("fathmm-MKL_coding_score"),
                "coding_rankscore": _cell("fathmm-MKL_coding_rankscore"),
                "coding_pred": _cell("fathmm-MKL_coding_pred"),
                "coding_group": _cell("fathmm-MKL_coding_group")
            },
            "eigen": {
                "coding_or_noncoding": _cell("Eigen_coding_or_noncoding"),
                "raw": _cell("Eigen-raw"),
                "phred": _cell("Eigen-phred")
            },
            "eigen-pc": {
                "raw": _cell("Eigen-PC-raw"),
                "phred": _cell("Eigen-PC-phred"),
                "raw_rankscore": _cell("Eigen-PC-raw_rankscore")
            },
            "genocanyon": {
                "score": _cell("GenoCanyon_score"),
                "rankscore": _cell("GenoCanyon_score_rankscore")
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": _cell("MetaSVM_rankscore"),
                "pred": _cell("MetaSVM_pred")
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": _cell("MetaLR_rankscore"),
                "pred": _cell("MetaLR_pred")
            },
            "reliability_index": _cell("Reliability_index"),
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": _cell("M-CAP_rankscore"),
                "pred": _cell("M-CAP_pred")
            },
            "dann": {
                "score": _cell("DANN_score"),
                "rankscore": _cell("DANN_rankscore")
            },
            "gerp++": {
                "nr": _cell("GERP++_NR"),
                "rs": _cell("GERP++_RS"),
                "rs_rankscore": _cell("GERP++_RS_rankscore")
            },
            "integrated": {
                "fitcons_score": _cell("integrated_fitCons_score"),
                "fitcons_rankscore":
                    _cell("integrated_fitCons_score_rankscore"),
                "confidence_value": _cell("integrated_confidence_value")
            },
            "gm12878": {
                "fitcons_score": _cell("GM12878_fitCons_score"),
                "fitcons_rankscore": _cell("GM12878_fitCons_score_rankscore"),
                "confidence_value": _cell("GM12878_confidence_value")
            },
            "h1-hesc": {
                "fitcons_score": _cell("H1-hESC_fitCons_score"),
                "fitcons_rankscore": _cell("H1-hESC_fitCons_score_rankscore"),
                "confidence_value": _cell("H1-hESC_confidence_value")
            },
            "huvec": {
                "fitcons_score": _cell("HUVEC_fitCons_score"),
                "fitcons_rankscore": _cell("HUVEC_fitCons_score_rankscore"),
                "confidence_value": _cell("HUVEC_confidence_value")
            },
            "phylo": {
                "p100way": {
                    "vertebrate": _cell("phyloP100way_vertebrate"),
                    "vertebrate_rankscore":
                        _cell("phyloP100way_vertebrate_rankscore")
                },
                "p20way": {
                    "mammalian": _cell("phyloP20way_mammalian"),
                    "mammalian_rankscore":
                        _cell("phyloP20way_mammalian_rankscore")
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": _cell("phastCons100way_vertebrate"),
                    "vertebrate_rankscore":
                        _cell("phastCons100way_vertebrate_rankscore")
                },
                "20way": {
                    "mammalian": _cell("phastCons20way_mammalian"),
                    "mammalian_rankscore":
                        _cell("phastCons20way_mammalian_rankscore")
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": _cell("SiPhy_29way_logOdds"),
                "logodds_rankscore": _cell("SiPhy_29way_logOdds_rankscore")
            },
            "1000gp3": {
                "ac": _cell("1000Gp3_AC"),
                "af": _cell("1000Gp3_AF"),
                "afr_ac": _cell("1000Gp3_AFR_AC"),
                "afr_af": _cell("1000Gp3_AFR_AF"),
                "eur_ac": _cell("1000Gp3_EUR_AC"),
                "eur_af": _cell("1000Gp3_EUR_AF"),
                "amr_ac": _cell("1000Gp3_AMR_AC"),
                "amr_af": _cell("1000Gp3_AMR_AF"),
                "eas_ac": _cell("1000Gp3_EAS_AC"),
                "eas_af": _cell("1000Gp3_EAS_AF"),
                "sas_ac": _cell("1000Gp3_SAS_AC"),
                "sas_af": _cell("1000Gp3_SAS_AF")
            },
            "twinsuk": {
                "ac": _cell("TWINSUK_AC"),
                "af": _cell("TWINSUK_AF")
            },
            "alspac": {
                "ac": _cell("ALSPAC_AC"),
                "af": _cell("ALSPAC_AF")
            },
            "esp6500": {
                "aa_ac": _cell("ESP6500_AA_AC"),
                "aa_af": _cell("ESP6500_AA_AF"),
                "ea_ac": _cell("ESP6500_EA_AC"),
                "ea_af": _cell("ESP6500_EA_AF")
            },
            "exac": _exac_counts("ExAC"),
            "exac_nontcga": _exac_counts("ExAC_nonTCGA"),
            "exac_nonpsych": _exac_counts("ExAC_nonpsych"),
            "clinvar": {
                "rs": _cell("clinvar_rs"),
                "clinsig": _cell("clinvar_clnsig"),
                "trait": _cell("clinvar_trait"),
                "golden_stars": _cell("clinvar_golden_stars")
            },
            "gtex": gtex
        }
    }

    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
Example #44
0
def _map_line_to_json(fields, version='hg19'):
    """Build the dbNSFP JSON document for one already-split input line.

    Args:
        fields: sequence of column strings addressed by fixed position
            (indexes up to 111 are read).
        version: assembly used to build ``_id``: ``'hg19'`` (default) or
            ``'hg38'``.

    Returns:
        ``{"_id": <HGVS id>, "dbnsfp": {...}}`` after post-processing by
        ``value_convert``, ``unlist``, ``dict_sweep`` and ``list_split``.

    Raises:
        ValueError: if ``version`` is neither ``'hg19'`` nor ``'hg38'``,
            or if a position column cannot be parsed by ``int()``.
    """
    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'
    # fields[7] in version 2, represent hg18_pos
    hg18_end = fields[10] if fields[10] == "." else int(fields[10])
    chromStart = int(fields[8])
    chromEnd = chromStart
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    if version == 'hg19':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    elif version == 'hg38':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    else:
        # Without this branch HGVS stayed unbound and the failure surfaced
        # later as an UnboundLocalError while building the document.
        raise ValueError(
            "unsupported version %r (expected 'hg19' or 'hg38')" % (version,))

    if fields[69] == ".":
        siphy = "."
    else:
        freq = fields[69].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # Pair up the ';'-separated accessions and positions.  A real list is
    # built (not a lazy map object) so the result is the same on Python 3.
    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": fields[8],
                "end": chromEnd
            },
            "hg18": {
                "start": fields[10],
                "end": hg18_end
            },
            "hg38": {
                "start": fields[1],
                "end": fields[1]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[111],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20]
            },
            "sift": {
                "score": fields[23],
                "converted_rankscore": fields[24],
                "pred": fields[25]
            },
            "polyphen2": {
                "hdiv": {
                    "score": fields[29],
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": fields[32],
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": fields[35],
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38]
            },
            "mutationtaster": {
                "score": fields[39],
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43]
            },
            "mutationassessor": {
                "score": fields[46],
                "rankscore": fields[47],
                "pred": fields[48]
            },
            "fathmm": {
                "score": fields[49],
                "rankscore": fields[50],
                "pred": fields[51]
            },
            "provean": {
                "score": fields[52],
                "rankscore": fields[53],
                "pred": fields[54]
            },
            "metasvm": {
                "score": fields[55],
                "rankscore": fields[56],
                "pred": fields[57]
            },
            "lr": {
                "score": fields[58],
                "rankscore": fields[59],
                "pred": fields[60]
            },
            "reliability_index": fields[61],
            "gerp++": {
                "nr": fields[62],
                "rs": fields[63],
                "rs_rankscore": fields[64]
            },
            "phylop_7way": {
                "vertebrate": fields[65],
                "vertebrate_rankscore": fields[66]
            },
            "phastcons_7way": {
                "vertebrate": fields[67],
                "vertebrate_rankscore": fields[68]
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[70],
                "logodds_rankscore": fields[71]
            },
            "1000gp1": {
                "ac": fields[72],
                "af": fields[73],
                "afr_ac": fields[74],
                "afr_af": fields[75],
                "eur_ac": fields[76],
                "eur_af": fields[77],
                "amr_ac": fields[78],
                "amr_af": fields[79],
                "eas_ac": fields[80],
                "eas_af": fields[81],
                "sas_ac": fields[82],
                "sas_af": fields[83]
            },
            "twinsuk": {
                "ac": fields[84],
                "af": fields[85]
            },
            "alspac": {
                "ac": fields[86],
                "af": fields[87]
            },
            "esp6500": {
                "aa_ac": fields[88],
                "aa_af": fields[89],
                "ea_ac": fields[90],
                "ea_af": fields[91]
            },
            "exac": {
                "ac": fields[92],
                "af": fields[93],
                "adj_ac": fields[94],
                "adj_af": fields[95],
                "afr_ac": fields[96],
                "afr_af": fields[97],
                "amr_ac": fields[98],
                "amr_af": fields[99],
                "eas_ac": fields[100],
                "eas_af": fields[101],
                "fin_ac": fields[102],
                "fin_af": fields[103],
                "nfe_ac": fields[104],
                "nfe_af": fields[105],
                "sas_ac": fields[106],
                "sas_af": fields[107]
            },
            "clinvar": {
                "rs": fields[108],
                "clinsig": fields[109],
                "trait": fields[110]
            }
        }
    }

    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
Example #45
0
def _map_line_to_json(fields, version='hg19'):
    """Convert one split dbNSFP line into its JSON document.

    Args:
        fields: sequence of column strings addressed by fixed position
            (indexes up to 111 are read).
        version: assembly whose coordinate goes into ``_id``: ``'hg19'``
            (default) or ``'hg38'``.

    Returns:
        ``{"_id": <HGVS id>, "dbnsfp": {...}}`` after post-processing by
        ``value_convert``, ``unlist``, ``dict_sweep`` and ``list_split``.

    Raises:
        ValueError: if ``version`` is neither ``'hg19'`` nor ``'hg38'``,
            or if a position column cannot be parsed by ``int()``.
    """
    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'
    # fields[7] in version 2, represent hg18_pos
    hg18_end = fields[10] if fields[10] == "." else int(fields[10])
    chromStart = int(fields[8])
    chromEnd = chromStart
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    if version == 'hg19':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    elif version == 'hg38':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    else:
        # Previously HGVS was simply left unbound here, which only showed
        # up later as an UnboundLocalError when the document was built.
        raise ValueError(
            "unsupported version %r (expected 'hg19' or 'hg38')" % (version,))

    if fields[69] == ".":
        siphy = "."
    else:
        freq = fields[69].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # Zip the ';'-separated accessions with their positions into a concrete
    # list of dicts; nested map() would be a lazy iterator on Python 3.
    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": fields[8],
                "end": chromEnd
            },
            "hg18": {
                "start": fields[10],
                "end": hg18_end
            },
            "hg38": {
                "start": fields[1],
                "end": fields[1]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[111],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20]
            },
            "sift": {
                "score": fields[23],
                "converted_rankscore": fields[24],
                "pred": fields[25]
            },
            "polyphen2": {
                "hdiv": {
                    "score": fields[29],
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": fields[32],
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": fields[35],
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38]
            },
            "mutationtaster": {
                "score": fields[39],
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43]
            },
            "mutationassessor": {
                "score": fields[46],
                "rankscore": fields[47],
                "pred": fields[48]
            },
            "fathmm": {
                "score": fields[49],
                "rankscore": fields[50],
                "pred": fields[51]
            },
            "provean": {
                "score": fields[52],
                "rankscore": fields[53],
                "pred": fields[54]
            },
            "metasvm": {
                "score": fields[55],
                "rankscore": fields[56],
                "pred": fields[57]
            },
            "lr": {
                "score": fields[58],
                "rankscore": fields[59],
                "pred": fields[60]
            },
            "reliability_index": fields[61],
            "gerp++": {
                "nr": fields[62],
                "rs": fields[63],
                "rs_rankscore": fields[64]
            },
            "phylop_7way": {
                "vertebrate": fields[65],
                "vertebrate_rankscore": fields[66]
            },
            "phastcons_7way": {
                "vertebrate": fields[67],
                "vertebrate_rankscore": fields[68]
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[70],
                "logodds_rankscore": fields[71]
            },
            "1000gp1": {
                "ac": fields[72],
                "af": fields[73],
                "afr_ac": fields[74],
                "afr_af": fields[75],
                "eur_ac": fields[76],
                "eur_af": fields[77],
                "amr_ac": fields[78],
                "amr_af": fields[79],
                "eas_ac": fields[80],
                "eas_af": fields[81],
                "sas_ac": fields[82],
                "sas_af": fields[83]
            },
            "twinsuk": {
                "ac": fields[84],
                "af": fields[85]
            },
            "alspac": {
                "ac": fields[86],
                "af": fields[87]
            },
            "esp6500": {
                "aa_ac": fields[88],
                "aa_af": fields[89],
                "ea_ac": fields[90],
                "ea_af": fields[91]
            },
            "exac": {
                "ac": fields[92],
                "af": fields[93],
                "adj_ac": fields[94],
                "adj_af": fields[95],
                "afr_ac": fields[96],
                "afr_af": fields[97],
                "amr_ac": fields[98],
                "amr_af": fields[99],
                "eas_ac": fields[100],
                "eas_af": fields[101],
                "fin_ac": fields[102],
                "fin_af": fields[103],
                "nfe_ac": fields[104],
                "nfe_af": fields[105],
                "sas_ac": fields[106],
                "sas_af": fields[107]
            },
            "clinvar": {
                "rs": fields[108],
                "clinsig": fields[109],
                "trait": fields[110]
            }
        }
    }

    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
def _map_line_to_json(fields, version):
    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'
    # fields[7] in version 2, represent hg18_pos
    if fields[10] == ".":
        hg18_end = "."
    else:
        hg18_end = int(fields[10])
    # in case of no hg19 position provided, remove the item
    if fields[8] == '.':
        return None
    else:
        chromStart = int(fields[8])
        chromEnd = int(fields[8])
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    if fields[105] == ".":
        siphy = "."
    else:
        freq = fields[105].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    gtex_gene = fields[181].split('|')
    gtex_tissue = fields[182].split('|')
    gtex = map(dict, map(lambda t: zip(('gene', 'tissue'), t), zip(gtex_gene, gtex_tissue)))
    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos)))
    provean_score = fields[52].split(';')
    sift_score = fields[23].split(';')
    hdiv_score = fields[29].split(';')
    hvar_score = fields[32].split(';')
    lrt_score = fields[35].split(';')
    dann_score = fields[69].split(';')
    mutationtaster_score = fields[39].split(';')
    mutationassessor_score = fields[46].split(';')
    vest3_score = fields[57].split(';')
    metasvm_score = fields[59].split(';')
    fathmm_score = fields[49].split(';')
    lr_score = fields[62].split(';')
    fathmm_coding_score = fields[71].split(';')
    integrated_fitcons_score = fields[82].split(';')
    gm12878_fitcons_score = fields[85].split(';')
    h1_hesc_fitcons_score = fields[88].split(';')
    huvec_fitcons_score = fields[91].split(';')
    if len(provean_score) > 1:
        for i in range(len(provean_score)):
            if provean_score[i] == '.':
                provean_score[i] = None
    if len(sift_score) > 1:
        for i in range(len(sift_score)):
            if sift_score[i] == '.':
                sift_score[i] = None
    if len(hdiv_score) > 1:
        for i in range(len(hdiv_score)):
            if hdiv_score[i] == '.':
                hdiv_score[i] = None
    if len(hvar_score) > 1:
        for i in range(len(hvar_score)):
            if hvar_score[i] == '.':
                hvar_score[i] = None
    if len(lrt_score) > 1:
        for i in range(len(lrt_score)):
            if lrt_score[i] == '.':
                lrt_score[i] = None
    if len(mutationtaster_score) > 1:
        for i in range(len(mutationtaster_score)):
            if mutationtaster_score[i] == '.':
                mutationtaster_score[i] = None
    if len(mutationassessor_score) > 1:
        for i in range(len(mutationassessor_score)):
            if mutationassessor_score[i] == '.':
                mutationassessor_score[i] = None
    if len(metasvm_score) > 1:
        for i in range(len(metasvm_score)):
            if metasvm_score[i] == '.':
                metasvm_score[i] = None
    if len(vest3_score) > 1:
        for i in range(len(vest3_score)):
            if vest3_score[i] == '.':
                vest3_score[i] = None
    if len(fathmm_score) > 1:
        for i in range(len(fathmm_score)):
            if fathmm_score[i] == '.':
                fathmm_score[i] = None
    if len(lr_score) > 1:
        for i in range(len(lr_score)):
            if lr_score[i] == '.':
                lr_score[i] = None
    if len(fathmm_coding_score) > 1:
        for i in range(len(fathmm_coding_score)):
            if fathmm_coding_score[i] == '.':
                fathmm_coding_score[i] = None
    if len(dann_score) > 1:
        for i in range(len(dann_score)):
            if dann_score[i] == '.':
                dann_score[i] = None
    if len(integrated_fitcons_score) > 1:
        for i in range(len(integrated_fitcons_score)):
            if integrated_fitcons_score[i] == '.':
                integrated_fitcons_score[i] = None
    if len(gm12878_fitcons_score) > 1:
        for i in range(len(gm12878_fitcons_score)):
            if gm12878_fitcons_score[i] == '.':
                gm12878_fitcons_score[i] = None
    if len(h1_hesc_fitcons_score) > 1:
        for i in range(len(h1_hesc_fitcons_score)):
            if h1_hesc_fitcons_score[i] == '.':
                h1_hesc_fitcons_score[i] = None
    if len(huvec_fitcons_score) > 1:
        for i in range(len(huvec_fitcons_score)):
            if huvec_fitcons_score[i] == '.':
                huvec_fitcons_score[i] = None
# load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            #"rsid_dbSNP144": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": fields[10],
                "end": hg18_end
            },
            "hg38": {
                "start": fields[1],
                "end": fields[1]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
                "codon_degeneracy": fields[15]
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[180],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20],
                "proteinid": fields[21]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": fields[24],
                "pred": fields[25]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": fields[47],
                "pred": fields[48]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": fields[50],
                "pred": fields[51]
            },
            "provean": {
                "score": provean_score,
                "rankscore": fields[53],
                "pred": fields[54]
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": fields[57],
                "transcriptid": fields[55],
                "transcriptvar": fields[56]
            },
            "fathmm-mkl": {
                "coding_score": fathmm_coding_score,
                "coding_rankscore": fields[72],
                "coding_pred": fields[73],
                "coding_group": fields[74]
            },
            "eigen": {
                "raw": fields[75],
                "phred": fields[76],
                "raw_rankscore": fields[77]
            },
            "eigen-pc": {
                "raw": fields[78],
                "raw_rankscore": fields[79]
            },
            "genocanyon": {
                "score": fields[80],
                "rankscore": fields[81]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": fields[60],
                "pred": fields[61]
            },
            "metalr": {
                "score": lr_score,
                "rankscore": fields[63],
                "pred": fields[64]
            },
            "reliability_index": fields[65],
            "dann": {
                "score": dann_score,
                "rankscore": fields[70]
            },
            "gerp++": {
                "nr": fields[94],
                "rs": fields[95],
                "rs_rankscore": fields[96]
            },
            "integrated": {
                "fitcons_score": integrated_fitcons_score,
                "fitcons_rankscore": fields[83],
                "confidence_value": fields[84]
            },
            "gm12878": {
                "fitcons_score": gm12878_fitcons_score,
                "fitcons_rankscore": fields[86],
                "confidence_value": fields[87]
            },
            "h1-hesc": {
                "fitcons_score": h1_hesc_fitcons_score,
                "fitcons_rankscore": fields[89],
                "confidence_value": fields[90]
            },
            "huvec": {
                "fitcons_score": huvec_fitcons_score,
                "fitcons_rankscore": fields[92],
                "confidence_value": fields[93]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": fields[97],
                    "vertebrate_rankscore": fields[98]
                },
                "p20way": {
                    "mammalian": fields[99],
                    "mammalian_rankscore": fields[100]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": fields[101],
                    "vertebrate_rankscore": fields[102]
                },
                "20way": {
                    "mammalian": fields[103],
                    "mammalian_rankscore": fields[104]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[106],
                "logodds_rankscore": fields[107]
            },
            "1000gp3": {
                "ac": fields[108],
                "af": fields[109],
                "afr_ac": fields[110],
                "afr_af": fields[111],
                "eur_ac": fields[112],
                "eur_af": fields[113],
                "amr_ac": fields[114],
                "amr_af": fields[115],
                "eas_ac": fields[116],
                "eas_af": fields[117],
                "sas_ac": fields[118],
                "sas_af": fields[119]
            },
            "twinsuk": {
                "ac": fields[120],
                "af": fields[121]
            },
            "alspac": {
                "ac": fields[122],
                "af": fields[123]
            },
            "esp6500": {
                "aa_ac": fields[124],
                "aa_af": fields[125],
                "ea_ac": fields[126],
                "ea_af": fields[127]
            },
            "exac": {
                "ac": fields[128],
                "af": fields[129],
                "adj_ac": fields[130],
                "adj_af": fields[131],
                "afr_ac": fields[132],
                "afr_af": fields[133],
                "amr_ac": fields[134],
                "amr_af": fields[135],
                "eas_ac": fields[136],
                "eas_af": fields[137],
                "fin_ac": fields[138],
                "fin_af": fields[139],
                "nfe_ac": fields[140],
                "nfe_af": fields[141],
                "sas_ac": fields[142],
                "sas_af": fields[143]
            },
            "exac_nontcga": {
                "ac": fields[144],
                "af": fields[145],
                "adj_ac": fields[146],
                "adj_af": fields[147],
                "afr_ac": fields[148],
                "afr_af": fields[149],
                "amr_ac": fields[150],
                "amr_af": fields[151],
                "eas_ac": fields[152],
                "eas_af": fields[153],
                "fin_ac": fields[154],
                "fin_af": fields[155],
                "nfe_ac": fields[156],
                "nfe_af": fields[157],
                "sas_ac": fields[158],
                "sas_af": fields[159]
            },
            "exac_nonpsych": {
                "ac": fields[160],
                "af": fields[161],
                "adj_ac": fields[162],
                "adj_af": fields[163],
                "afr_ac": fields[164],
                "afr_af": fields[165],
                "amr_ac": fields[166],
                "amr_af": fields[167],
                "eas_ac": fields[168],
                "eas_af": fields[169],
                "fin_ac": fields[170],
                "fin_af": fields[171],
                "nfe_ac": fields[172],
                "nfe_af": fields[173]
            },
            "clinvar": {
                "rs": fields[176],
                "clinsig": fields[177],
                "trait": fields[178],
                "golden_stars": fields[179]
            },
            "gtex": gtex
        }
    }

    one_snp_json = list_split(dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
def _map_line_to_json(cp):
    """Yield one ClinVar document per usable Measure of the record ``cp``.

    ``cp`` is expected to expose ``ReferenceClinVarAssertion`` and
    ``ClinVarAssertion`` attributes (presumably a parsed ClinVar XML
    ClinVarSet -- TODO confirm against the caller).

    This is a generator.  For every Measure under the reference assertion's
    MeasureSet it builds an ``{"_id": hgvs_id, "clinvar": {...}}`` dict,
    passes it through ``value_convert``, ``unlist`` and
    ``dict_sweep(..., [None])`` and yields the result.  Measures of type
    'Variation', 'protein only' or 'Microsatellite' are skipped, as are
    Measures for which no ``hgvs_id`` could be derived.

    Note that the two ``return`` statements below end the whole generator,
    so Measures following one that has no AttributeSet / no usable id are
    not processed either.
    """
    # Record-level values shared by every Measure yielded below.
    clinical_siginificance = cp.ReferenceClinVarAssertion.\
        ClinicalSignificance.Description
    rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc
    review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
        ReviewStatus
    last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
        DateLastEvaluated
    CLINVAR_ID = cp.ReferenceClinVarAssertion.MeasureSet.ID
    number_submitters = len(cp.ClinVarAssertion)
    # Some items in the ClinVar XML don't have origin information.
    # NOTE(review): the bare except also hides unrelated errors.
    try:
        origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
    except:
        origin = None
    # MeasureSet.Measure returns a list; there may be multiple
    # Measures under one MeasureSet.
    for Measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure:
        variation_type = Measure.Type
        # Exclude any item whose type is
        # 'Variation', 'protein only' or 'Microsatellite'.
        if variation_type == 'Variation' or variation_type\
           == 'protein only' or variation_type == 'Microsatellite':
            continue
        allele_id = Measure.ID
        chrom = None
        chromStart = None
        chromEnd = None
        ref = None
        alt = None
        if Measure.SequenceLocation:
            for SequenceLocation in Measure.SequenceLocation:
                # In this version, only accept information concerning GRCh37.
                # If several locations match, the last one wins.
                if 'GRCh37' in SequenceLocation.Assembly:
                    chrom = SequenceLocation.Chr
                    chromStart = SequenceLocation.start
                    chromEnd = SequenceLocation.stop
                    ref = SequenceLocation.referenceAllele
                    alt = SequenceLocation.alternateAllele
        # Gene symbol / id are taken from the first MeasureRelationship only.
        if Measure.MeasureRelationship:
            try:
                symbol = Measure.MeasureRelationship[0].\
                    Symbol[0].get_ElementValue().valueOf_
            except:
                symbol = None
            gene_id = Measure.MeasureRelationship[0].XRef[0].ID
        else:
            symbol = None
            gene_id = None
        if Measure.Name:
            name = Measure.Name[0].ElementValue.valueOf_
        else:
            name = None
        # A single cytogenetic location is unwrapped; otherwise the whole
        # list (possibly empty) is kept.
        if len(Measure.CytogeneticLocation) == 1:
            cytogenic = Measure.CytogeneticLocation[0]
        else:
            cytogenic = Measure.CytogeneticLocation
        hgvs_coding = None
        hgvs_genome = None
        # All HGVS expressions found on the Measure, bucketed by kind.
        HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
        coding_hgvs_only = None
        hgvs_id = None
        # hgvs_not_validated = None
        if Measure.AttributeSet:
            # 'copy number loss' and 'copy number gain' are formatted
            # differently from other types and are handled separately.
            if (variation_type == 'copy number loss') or \
                    (variation_type == 'copy number gain'):
                for AttributeSet in Measure.AttributeSet:
                    if 'HGVS, genomic, top level' in AttributeSet.\
                            Attribute.Type:
                        # presumably integerValue 37 marks the GRCh37
                        # expression -- TODO confirm
                        if AttributeSet.Attribute.integerValue == 37:
                            hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                    # 'non-coding' must be tested before 'coding' because
                    # these are substring checks.
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(AttributeSet.Attribute.
                                               get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(AttributeSet.Attribute.
                                                  get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(AttributeSet.Attribute.
                                              get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(AttributeSet.
                                               Attribute.get_valueOf_())
            else:
                for AttributeSet in Measure.AttributeSet:
                    # Same substring-ordered bucketing as above.
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(AttributeSet.
                                               Attribute.get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(AttributeSet.
                                                  Attribute.get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(AttributeSet.Attribute.
                                              get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(AttributeSet.
                                               Attribute.get_valueOf_())
                    if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq':
                        hgvs_coding = AttributeSet.Attribute.get_valueOf_()
                    elif AttributeSet.Attribute.Type == \
                            'HGVS, genomic, top level, previous':
                        hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                        # Stop scanning: attribute sets after this one are
                        # neither bucketed into HGVS nor inspected.
                        break
            # Preferred id: built from the GRCh37 coordinates found above.
            if chrom and chromStart and chromEnd:
                if variation_type == 'single nucleotide variant':
                    hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
                # Items of type 'Indel', 'Insertion' or 'Duplication'
                # might not have explicit alt information,
                # so it is parsed from hgvs_genome.
                elif variation_type == 'Indel':
                    if hgvs_genome:
                        # NOTE(review): unlike the 'ins' / 'dup' branches,
                        # there is no check that 'del' actually occurs in
                        # hgvs_genome (find() returns -1 when it does not).
                        indel_position = hgvs_genome.find('del')
                        indel_alt = hgvs_genome[indel_position+3:]
                        hgvs_id = "chr%s:g.%s_%sdel%s" % \
                                  (chrom, chromStart, chromEnd, indel_alt)
                elif variation_type == 'Deletion':
                    hgvs_id = "chr%s:g.%s_%sdel" % \
                              (chrom, chromStart, chromEnd)
                elif variation_type == 'Insertion':
                    if hgvs_genome:
                        ins_position = hgvs_genome.find('ins')
                        if 'ins' in hgvs_genome:
                            ins_ref = hgvs_genome[ins_position+3:]
                            hgvs_id = "chr%s:g.%s_%sins%s" % \
                                      (chrom, chromStart, chromEnd, ins_ref)
                elif variation_type == 'Duplication':
                    if hgvs_genome:
                        dup_position = hgvs_genome.find('dup')
                        if 'dup' in hgvs_genome:
                            dup_ref = hgvs_genome[dup_position+3:]
                            hgvs_id = "chr%s:g.%s_%sdup%s" % \
                                      (chrom, chromStart, chromEnd, dup_ref)
            # No coordinates: copy-number variants derive the id from the
            # second and third '.'-separated pieces of hgvs_genome.
            elif variation_type == 'copy number loss' or\
                    variation_type == 'copy number gain':
                if hgvs_genome:
                    hgvs_id = "chr" + hgvs_genome.split('.')[1] +\
                              hgvs_genome.split('.')[2]
            # Last resort: use the coding HGVS expression as the id and
            # flag the document accordingly.
            elif hgvs_coding:
                hgvs_id = hgvs_coding
                coding_hgvs_only = True
            else:
                # Ends the generator: later Measures are not visited.
                print "couldn't find any id", rcv_accession
                return
        else:
            # Ends the generator: later Measures are not visited.
            print 'no measure.attribute', rcv_accession
            return
        other_ids = ''
        rsid = None
        # Loop through XRef to find the rsid as well as other ids;
        # other_ids accumulates 'DB:ID;' pairs for every XRef.
        if Measure.XRef:
            for XRef in Measure.XRef:
                if XRef.Type == 'rs':
                    rsid = 'rs' + str(XRef.ID)
                other_ids = other_ids + XRef.DB + ':' + XRef.ID + ';'
        # Make sure the hgvs_id is not None (some branches above leave it
        # unset without returning); such Measures are silently skipped.
        if hgvs_id:
            one_snp_json = {

                "_id": hgvs_id,
                "clinvar":
                    {
                        "allele_id": allele_id,
                        "chrom": chrom,
                        "hg19":
                            {
                                "start": chromStart,
                                "end": chromEnd
                            },
                        "type": variation_type,
                        "name": name,
                        "gene":
                            {
                                "id": gene_id,
                                "symbol": symbol
                            },
                        "clinical_significance": clinical_siginificance,
                        "rsid": rsid,
                        "rcv_accession": rcv_accession,
                        "origin": origin,
                        "cytogenic": cytogenic,
                        "review_status": review_status,
                        "hgvs": HGVS,
                        "number_submitters": number_submitters,
                        "last_evaluated": str(last_evaluated),
                        "other_ids": other_ids,
                        "clinvar_id": CLINVAR_ID,
                        "coding_hgvs_only": coding_hgvs_only,
                        "ref": ref,
                        "alt": alt
                    }
                }
            # [None] is passed as the values to sweep, so unset entries above
            # are presumably dropped from the document -- verify against
            # dict_sweep.
            obj = (dict_sweep(unlist(value_convert(one_snp_json)), [None]))
            yield obj
Example #48
0
def _map_line_to_json(fields):
    """Build the ClinVar document for one row of the tab-delimited summary.

    The genomic HGVS ``_id`` is assembled from the row's chromosome / start /
    end columns (fields[13..15]) together with the REF/ALT alleles of the
    record that the module-level ``vcf_reader`` returns for that position.
    Which HGVS form is used depends on the variant type column (fields[1])
    and on patterns found in the coding HGVS expression (fields[18]).

    Args:
        fields: sequence of exactly ``VALID_COLUMN_NO`` column strings.

    Returns:
        The document after ``value_convert``, ``unlist`` and
        ``dict_sweep(vals=["-"])``, or None when the row has no start
        position, no VCF record is found, or no HGVS id could be built.
    """
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[13]
    chromStart = fields[14]
    chromEnd = fields[15]

    # REF/ALT are looked up in the VCF; without a start coordinate or a
    # matching record the row cannot be mapped.
    if not chromStart:
        return None
    record = vcf_reader.fetch(chrom, int(chromStart))
    if not record:
        return None

    # The coding expression normally looks like "<accession>:<change>"; keep
    # the part after the first ':'.  When there is no ':' fall back to the raw
    # value instead of raising IndexError.
    coding_parts = fields[18].split(":")
    cds = coding_parts[1] if len(coding_parts) > 1 else coding_parts[0]

    replace = re.findall(r'[ATCGMNYR=]+', cds)
    sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', cds)
    ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', cds)
    delete = fields[1] == 'deletion'
    indel = fields[1] == 'indel'
    dup = re.search(r'dup', cds)
    inv = re.search(r'inv|inv[0-9]+|inv[ATCGMNHYR]+', cds)

    # Only the first ALT allele is considered.
    ALT = record.ALT[0]
    if record.is_snp and len(ALT) < 2:
        mod = [record.REF, ALT]
    else:
        mod = ALT

    # The order of this chain is the precedence: an "ins" pattern wins over
    # the 'deletion' / 'indel' type column, and 'deletion' wins over 'indel'.
    HGVS = None
    if sub and record.is_snp:
        HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, mod[0], mod[1])
    elif ins:
        HGVS = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, mod)
    elif delete:
        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
    elif indel:
        try:
            HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, mod)
        except AttributeError:
            print("ERROR: %s %s" % (fields[1], cds))
    elif dup:
        HGVS = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, mod)
    elif inv:
        HGVS = "chr%s:g.%s_%sinv%s" % (chrom, chromStart, chromEnd, mod)
    elif replace:
        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, mod)
    else:
        print("ERROR: %s %s" % (fields[1], cds))

    if HGVS is None:
        print("None: %s %s" % (fields[1], cds))
        return None

    # Only prefix a real RS number.  A missing one is passed on as "-" (the
    # value handed to dict_sweep below, presumably so the key is dropped)
    # instead of becoming a bogus "rs-" / "rs-1" id.
    rs_number = str(fields[6])
    if rs_number in ("", "-", "-1"):
        rsid = "-"
    else:
        rsid = 'rs' + rs_number

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "clinvar": {
            "allele_id": fields[0],
            "hg19": {
                "chr": fields[13],
                "start": fields[14],
                "end": fields[15]
            },
            "type": fields[1],
            "name": fields[2],
            "gene": {
                "id": fields[3],
                "symbol": fields[4]
            },
            "clinical_significance": fields[5].split(";"),
            "rsid": rsid,
            "nsv_dbvar": fields[7],
            "rcv_accession": fields[8].split(";"),
            "tested_in_gtr": fields[9],
            "phenotype_id": other_id(fields[10]),
            "origin": fields[11],
            "cytogenic": fields[16],
            "review_status": fields[17],
            "hgvs": {
                "coding": fields[18],
                "protein": fields[19]
            },
            "number_submitters": fields[20],
            "last_evaluated": fields[21],
            "guidelines": fields[22],
            "other_ids": other_id(fields[23]),
            "clinvar_id": fields[24]
        }
    }
    return dict_sweep(unlist(value_convert(one_snp_json)), vals=["-"])
def _map_line_to_json(fields):
    """Build the ClinVar document for one row of the tab-delimited summary.

    The genomic HGVS ``_id`` is assembled from the row's chromosome / start /
    end columns (fields[13..15]) together with the REF/ALT alleles of the
    record that the module-level ``vcf_reader`` returns for that position.
    Which HGVS form is used depends on the variant type column (fields[1])
    and on patterns found in the coding HGVS expression (fields[18]).

    Args:
        fields: sequence of exactly ``VALID_COLUMN_NO`` column strings.

    Returns:
        The document after ``value_convert``, ``unlist`` and
        ``dict_sweep(vals=["-"])``, or None when the row has no start
        position, no VCF record is found, or no HGVS id could be built.
    """
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[13]
    chromStart = fields[14]
    chromEnd = fields[15]

    # REF/ALT are looked up in the VCF; without a start coordinate or a
    # matching record the row cannot be mapped.
    if not chromStart:
        return None
    record = vcf_reader.fetch(chrom, int(chromStart))
    if not record:
        return None

    # The coding expression normally looks like "<accession>:<change>"; keep
    # the part after the first ':'.  When there is no ':' fall back to the raw
    # value instead of raising IndexError.
    coding_parts = fields[18].split(":")
    cds = coding_parts[1] if len(coding_parts) > 1 else coding_parts[0]

    replace = re.findall(r'[ATCGMNYR=]+', cds)
    sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', cds)
    ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', cds)
    delete = fields[1] == 'deletion'
    indel = fields[1] == 'indel'
    dup = re.search(r'dup', cds)
    inv = re.search(r'inv|inv[0-9]+|inv[ATCGMNHYR]+', cds)

    # Only the first ALT allele is considered.
    ALT = record.ALT[0]
    if record.is_snp and len(ALT) < 2:
        mod = [record.REF, ALT]
    else:
        mod = ALT

    # The order of this chain is the precedence: an "ins" pattern wins over
    # the 'deletion' / 'indel' type column, and 'deletion' wins over 'indel'.
    HGVS = None
    if sub and record.is_snp:
        HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, mod[0], mod[1])
    elif ins:
        HGVS = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, mod)
    elif delete:
        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
    elif indel:
        try:
            HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, mod)
        except AttributeError:
            print("ERROR: %s %s" % (fields[1], cds))
    elif dup:
        HGVS = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, mod)
    elif inv:
        HGVS = "chr%s:g.%s_%sinv%s" % (chrom, chromStart, chromEnd, mod)
    elif replace:
        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, mod)
    else:
        print("ERROR: %s %s" % (fields[1], cds))

    if HGVS is None:
        print("None: %s %s" % (fields[1], cds))
        return None

    # Only prefix a real RS number.  A missing one is passed on as "-" (the
    # value handed to dict_sweep below, presumably so the key is dropped)
    # instead of becoming a bogus "rs-" / "rs-1" id.
    rs_number = str(fields[6])
    if rs_number in ("", "-", "-1"):
        rsid = "-"
    else:
        rsid = 'rs' + rs_number

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "clinvar": {
            "allele_id": fields[0],
            "hg19": {
                "chr": fields[13],
                "start": fields[14],
                "end": fields[15]
            },
            "type": fields[1],
            "name": fields[2],
            "gene": {
                "id": fields[3],
                "symbol": fields[4]
            },
            "clinical_significance": fields[5].split(";"),
            "rsid": rsid,
            "nsv_dbvar": fields[7],
            "rcv_accession": fields[8].split(";"),
            "tested_in_gtr": fields[9],
            "phenotype_id": other_id(fields[10]),
            "origin": fields[11],
            "cytogenic": fields[16],
            "review_status": fields[17],
            "hgvs": {
                "coding": fields[18],
                "protein": fields[19]
            },
            "number_submitters": fields[20],
            "last_evaluated": fields[21],
            "guidelines": fields[22],
            "other_ids": other_id(fields[23]),
            "clinvar_id": fields[24]
        }
    }
    return dict_sweep(unlist(value_convert(one_snp_json)), vals=["-"])