Example #1
0
def _map_line_to_json(fields):
    vid = fields[0].split(":")
    chrom = re.search(r'[1-9]+', vid[0]).group()

    if chrom == '23':
        chrom = chrom.replace('23', 'X')
    HGVS = "chr%s:%s" % (chrom, vid[1])
    # load as json data
    if HGVS is None:
        return

    one_snp_json = {
        "_id": HGVS,
        "emv": {
            "gene": fields[2],
            "variant_id": fields[3],
            "exon": fields[4],
            "egl_variant": fields[5],
            "egl_protein": fields[6],
            "egl_classification": fields[7],
            "egl_classification_date": fields[8],
            "hgvs": fields[9].split(" | "),
            "clinvar_rcv": fields[10],
        }
    }

    return unlist(dict_sweep(value_convert(one_snp_json), vals=[""]))
def _map_line_to_json(fields):
    """Turn one DrugBank row into a document keyed by the value in column 1.

    The row must contain exactly ``VALID_CO_NUMBER`` columns.  Column 1 is
    used as ``_id``; the remaining columns are copied under ``'drugbank'``.
    Returns ``None`` when column 1 is ``None``, otherwise the document after
    ``dict_sweep`` has been applied with ``['Not Available']``.
    """
    assert len(fields) == VALID_CO_NUMBER

    variant_id = fields[1]
    if variant_id is None:
        return

    # (output key, source column), in the order the keys are emitted.
    column_map = (
        ('drug', 2),
        ('interacting_gene_or_enzyme', 3),
        ('snp_rs_id', 0),
        ('allele_name', 4),
        ('defining_change', 5),
        ('adverse_reaction', 6),
        ('references', 7),
    )
    annotation = dict((key, fields[column]) for key, column in column_map)
    document = {"_id": variant_id, 'drugbank': annotation}
    return dict_sweep(document, ['Not Available'])
def _map_line_to_json(item):
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    hpo_count=item.INFO['HPO_CT']
    for alt in item.ALT:
        alt = str(alt)
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=True)
        if HGVS is None:
            return
        one_snp_json = {
            "_id": HGVS,
            "geno2mp": {
                "hpo_count": hpo_count,

            }
        }
        obj = (dict_sweep(unlist(value_convert(one_snp_json)), [None]))
        yield obj
Example #4
0
def _map_line_to_json(item):
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    hpo_count = item.INFO['HPO_CT']
    for alt in item.ALT:
        alt = str(alt)
        (HGVS, var_type) = get_hgvs_from_vcf(chrom,
                                             chromStart,
                                             ref,
                                             alt,
                                             mutant_type=True)
        if HGVS is None:
            return
        one_snp_json = {
            "_id": HGVS,
            "geno2mp": {
                "hpo_count": hpo_count,
            }
        }
        obj = (dict_sweep(unlist(value_convert(one_snp_json)), [None]))
        yield obj
Example #5
0
def _map_line_to_json(fields):
    """Turn one GRASP row into a document for the variant matching its rsid.

    The row must contain exactly ``VALID_COLUMN_NO`` columns.  The rsid in
    column 8 is looked up through the myvariant.info query endpoint and the
    ``_id`` of the first hit becomes the document id; later hits are not
    used.  Returns ``None`` when the rsid is ``None`` or the query has no
    hits.  Otherwise the document is passed through ``value_convert``,
    ``unlist``, ``dict_sweep(..., [""])`` and ``list_split(..., ",")``.
    """
    assert len(fields) == VALID_COLUMN_NO
    rsid = fields[8]

    if rsid is None:
        return

    def pick(columns):
        # {key: fields[index]} for every (key, index) pair, in order.
        return dict((key, fields[index]) for key, index in columns)

    # The discovery and replication sections share this layout; each takes
    # 13 consecutive columns.
    ancestry_keys = (
        "total_samples", "european", "african", "east_asian",
        "indian_south_asian", "hispanic", "native", "micronesian",
        "arab_me", "mixed", "unspecified", "filipino", "indonesian",
    )

    def ancestry_counts(first_column):
        return pick((key, first_column + offset)
                    for offset, key in enumerate(ancestry_keys))

    response = requests.get(
        "http://myvariant.info/v1/query?q=dbsnp.rsid:" + rsid + "&fields=_id")
    for hit in response.json()["hits"]:
        variant_id = hit["_id"]

        grasp = {"hg19": pick((("chr", 5), ("pos", 6)))}
        grasp.update(pick((
            ("hupfield", 1),
            ("last_curation_date", 2),
            ("creation_date", 3),
            ("srsid", 4),
        )))
        grasp["publication"] = pick((
            ("journal", 16),
            ("title", 17),
            ("pmid", 7),
            ("snpid", 8),
            ("location_within_paper", 9),
            ("p_value", 10),
            ("phenotype", 11),
            ("paper_phenotype_description", 12),
            ("paper_phenotype_categories", 13),
            ("date_pub", 14),
        ))
        grasp.update(pick((
            ("includes_male_female_only_analyses", 18),
            ("exclusively_male_female", 19),
            ("initial_sample_description", 20),
            ("replication_sample_description", 21),
            ("platform_snps_passing_qc", 22),
            ("gwas_ancestry_description", 23),
        )))
        grasp["discovery"] = ancestry_counts(25)
        grasp["replication"] = ancestry_counts(38)
        grasp.update(pick((
            ("in_gene", 51),
            ("nearest_gene", 52),
            ("in_lincrna", 53),
            ("in_mirna", 54),
            ("in_mirna_bs", 55),
            ("oreg_anno", 61),
            ("conserv_pred_tfbs", 62),
            ("human_enhancer", 63),
            ("rna_edit", 64),
            ("polyphen2", 65),
            ("sift", 66),
            ("ls_snp", 67),
            ("uniprot", 68),
            ("eqtl_meth_metab_study", 69),
        )))

        document = {"_id": variant_id, "grasp": grasp}
        # Only the first hit is ever converted.
        return list_split(dict_sweep(unlist(value_convert(document)), [""]), ",")
Example #6
0
def _map_line_to_json(cp):
    clinical_siginificance = cp.ReferenceClinVarAssertion.\
        ClinicalSignificance.Description
    rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc
    review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
        ReviewStatus
    last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
        DateLastEvaluated
    CLINVAR_ID = cp.ReferenceClinVarAssertion.MeasureSet.ID
    number_submitters = len(cp.ClinVarAssertion)
    # some items in clinvar_xml doesn't have origin information
    try:
        origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
    except:
        origin = None
    # MeasureSet.Measure return a list, there might be multiple
    # Measure under one MeasureSet
    for Measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure:
        variation_type = Measure.Type
        # exclude any item of which types belong to
        # 'Variation', 'protein only' or 'Microsatellite'
        if variation_type == 'Variation' or variation_type\
           == 'protein only' or variation_type == 'Microsatellite':
            continue
        allele_id = Measure.ID
        chrom = None
        chromStart = None
        chromEnd = None
        ref = None
        alt = None
        if Measure.SequenceLocation:
            for SequenceLocation in Measure.SequenceLocation:
                # In this version, only accept information concerning GRCh37
                if 'GRCh37' in SequenceLocation.Assembly:
                    chrom = SequenceLocation.Chr
                    chromStart = SequenceLocation.start
                    chromEnd = SequenceLocation.stop
                    ref = SequenceLocation.referenceAllele
                    alt = SequenceLocation.alternateAllele
        if Measure.MeasureRelationship:
            try:
                symbol = Measure.MeasureRelationship[0].\
                    Symbol[0].get_ElementValue().valueOf_
            except:
                symbol = None
            gene_id = Measure.MeasureRelationship[0].XRef[0].ID
        else:
            symbol = None
            gene_id = None
        if Measure.Name:
            name = Measure.Name[0].ElementValue.valueOf_
        else:
            name = None
        if len(Measure.CytogeneticLocation) == 1:
            cytogenic = Measure.CytogeneticLocation[0]
        else:
            cytogenic = Measure.CytogeneticLocation
        hgvs_coding = None
        hgvs_genome = None
        HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
        coding_hgvs_only = None
        hgvs_id = None
        # hgvs_not_validated = None
        if Measure.AttributeSet:
            # 'copy number loss' or 'gain' have format different\
            # from other types, should be dealt with seperately
            if (variation_type == 'copy number loss') or \
                    (variation_type == 'copy number gain'):
                for AttributeSet in Measure.AttributeSet:
                    if 'HGVS, genomic, top level' in AttributeSet.\
                            Attribute.Type:
                        if AttributeSet.Attribute.integerValue == 37:
                            hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(
                            AttributeSet.Attribute.get_valueOf_())
            else:
                for AttributeSet in Measure.AttributeSet:
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq':
                        hgvs_coding = AttributeSet.Attribute.get_valueOf_()
                    elif AttributeSet.Attribute.Type == \
                            'HGVS, genomic, top level, previous':
                        hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                        break
            if chrom and chromStart and chromEnd:
                if variation_type == 'single nucleotide variant':
                    hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
                # items whose type belong to 'Indel, Insertion, \
                # Duplication' might not hava explicit alt information, \
                # so we will parse from hgvs_genome
                elif variation_type == 'Indel':
                    if hgvs_genome:
                        indel_position = hgvs_genome.find('del')
                        indel_alt = hgvs_genome[indel_position + 3:]
                        hgvs_id = "chr%s:g.%s_%sdel%s" % \
                                  (chrom, chromStart, chromEnd, indel_alt)
                elif variation_type == 'Deletion':
                    hgvs_id = "chr%s:g.%s_%sdel" % \
                              (chrom, chromStart, chromEnd)
                elif variation_type == 'Insertion':
                    if hgvs_genome:
                        ins_position = hgvs_genome.find('ins')
                        if 'ins' in hgvs_genome:
                            ins_ref = hgvs_genome[ins_position + 3:]
                            hgvs_id = "chr%s:g.%s_%sins%s" % \
                                      (chrom, chromStart, chromEnd, ins_ref)
                elif variation_type == 'Duplication':
                    if hgvs_genome:
                        dup_position = hgvs_genome.find('dup')
                        if 'dup' in hgvs_genome:
                            dup_ref = hgvs_genome[dup_position + 3:]
                            hgvs_id = "chr%s:g.%s_%sdup%s" % \
                                      (chrom, chromStart, chromEnd, dup_ref)
            elif variation_type == 'copy number loss' or\
                    variation_type == 'copy number gain':
                if hgvs_genome:
                    hgvs_id = "chr" + hgvs_genome.split('.')[1] +\
                              hgvs_genome.split('.')[2]
            elif hgvs_coding:
                hgvs_id = hgvs_coding
                coding_hgvs_only = True
            else:
                print "couldn't find any id", rcv_accession
                return
        else:
            print 'no measure.attribute', rcv_accession
            return
        other_ids = ''
        rsid = None
        # loop through XRef to find rsid as well as other ids
        if Measure.XRef:
            for XRef in Measure.XRef:
                if XRef.Type == 'rs':
                    rsid = 'rs' + str(XRef.ID)
                other_ids = other_ids + XRef.DB + ':' + XRef.ID + ';'
        # make sure the hgvs_id is not none
        if hgvs_id:
            one_snp_json = {
                "_id": hgvs_id,
                "clinvar": {
                    "allele_id": allele_id,
                    "chrom": chrom,
                    "hg19": {
                        "start": chromStart,
                        "end": chromEnd
                    },
                    "type": variation_type,
                    "name": name,
                    "gene": {
                        "id": gene_id,
                        "symbol": symbol
                    },
                    "clinical_significance": clinical_siginificance,
                    "rsid": rsid,
                    "rcv_accession": rcv_accession,
                    "origin": origin,
                    "cytogenic": cytogenic,
                    "review_status": review_status,
                    "hgvs": HGVS,
                    "number_submitters": number_submitters,
                    "last_evaluated": str(last_evaluated),
                    "other_ids": other_ids,
                    "clinvar_id": CLINVAR_ID,
                    "coding_hgvs_only": coding_hgvs_only,
                    "ref": ref,
                    "alt": alt
                }
            }
            obj = (dict_sweep(unlist(value_convert(one_snp_json)), [None]))
            yield obj
Example #7
0
def _map_line_to_json(item):
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    try:
        baseqranksum = info['BaseQRankSum']
    except:
        baseqranksum = None
    try:
        clippingranksum = info['ClippingRankSum']
    except:
        clippingranksum = None
    try:
        mqranksum = info['MQRankSum']
    except:
        mqranksum = None
    try:
        readposranksum = info['ReadPosRankSum']
    except:
        readposranksum = None
    try:
        qd = info['QD']
    except:
        qd = None
    try:
        inbreedingcoeff = info['InbreedingCoeff']
    except:
        inbreedingcoeff = None
    for i in range(0, len(item.ALT)):
        item.ALT[i] = str(item.ALT[i])
    for alt in item.ALT:
        alt = str(alt)
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=True)
        if HGVS is None:
            return
        one_snp_json = {
            "_id": HGVS,
            "exac": {
                "chrom": chrom,
                "pos": chromStart,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {
                    "ac": info['AC'],
                    "ac_afr": info['AC_AFR'],
                    "ac_amr": info['AC_AMR'],
                    "ac_adj": info['AC_Adj'],
                    "ac_eas": info['AC_EAS'],
                    "ac_fin": info['AC_FIN'],
                    "ac_het": info['AC_Het'],
                    "ac_hom": info['AC_Hom'],
                    "ac_nfe": info['AC_NFE'],
                    "ac_oth": info['AC_OTH'],
                    "ac_sas": info['AC_SAS']
                },
                "af": info['AF'],
                "an": {
                    "an": info['AN'],
                    "an_afr": info['AN_AFR'],
                    "an_amr": info['AN_AMR'],
                    "an_adj": info['AN_Adj'],
                    "an_eas": info['AN_EAS'],
                    "an_fin": info['AN_FIN'],
                    "an_nfe": info['AN_NFE'],
                    "an_oth": info['AN_OTH'],
                    "an_sas": info['AN_SAS']

                },
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {
                    "het_afr": info['Het_AFR'],
                    "het_amr": info['Het_AMR'],
                    "het_eas": info['Het_EAS'],
                    "het_fin": info['Het_FIN'],
                    "het_nfe": info['Het_NFE'],
                    "het_oth": info['Het_OTH'],
                    "het_sas": info['Het_SAS']
                },
                "hom": {
                    "hom_afr": info['Hom_AFR'],
                    "hom_amr": info['Hom_AMR'],
                    "hom_eas": info['Hom_EAS'],
                    "hom_fin": info['Hom_FIN'],
                    "hom_nfe": info['Hom_NFE'],
                    "hom_oth": info['Hom_OTH'],
                    "hom_sas": info['Hom_SAS']
                },
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        obj = (dict_sweep(unlist(value_convert(one_snp_json)), [None]))
        yield obj
Example #8
0
def _map_line_to_json(item):
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    try:
        baseqranksum = info['BaseQRankSum']
    except:
        baseqranksum = None
    try:
        clippingranksum = info['ClippingRankSum']
    except:
        clippingranksum = None
    try:
        mqranksum = info['MQRankSum']
    except:
        mqranksum = None
    try:
        readposranksum = info['ReadPosRankSum']
    except:
        readposranksum = None
    try:
        qd = info['QD']
    except:
        qd = None
    try:
        inbreedingcoeff = info['InbreedingCoeff']
    except:
        inbreedingcoeff = None
    for i in range(0, len(item.ALT)):
        item.ALT[i] = str(item.ALT[i])
    for alt in item.ALT:
        alt = str(alt)
        (HGVS, var_type) = get_hgvs_from_vcf(chrom,
                                             chromStart,
                                             ref,
                                             alt,
                                             mutant_type=True)
        if HGVS is None:
            return
        one_snp_json = {
            "_id": HGVS,
            "exac": {
                "chrom": chrom,
                "pos": chromStart,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {
                    "ac": info['AC'],
                    "ac_afr": info['AC_AFR'],
                    "ac_amr": info['AC_AMR'],
                    "ac_adj": info['AC_Adj'],
                    "ac_eas": info['AC_EAS'],
                    "ac_fin": info['AC_FIN'],
                    "ac_het": info['AC_Het'],
                    "ac_hom": info['AC_Hom'],
                    "ac_nfe": info['AC_NFE'],
                    "ac_oth": info['AC_OTH'],
                    "ac_sas": info['AC_SAS'],
                    "ac_female": info['AC_FEMALE'],
                    "ac_male": info['AC_MALE']
                },
                "af": info['AF'],
                "an": {
                    "an": info['AN'],
                    "an_afr": info['AN_AFR'],
                    "an_amr": info['AN_AMR'],
                    "an_adj": info['AN_Adj'],
                    "an_eas": info['AN_EAS'],
                    "an_fin": info['AN_FIN'],
                    "an_nfe": info['AN_NFE'],
                    "an_oth": info['AN_OTH'],
                    "an_sas": info['AN_SAS'],
                    "an_female": info['AN_FEMALE'],
                    "an_male": info['AN_MALE']
                },
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {
                    "het_afr": info['Het_AFR'],
                    "het_amr": info['Het_AMR'],
                    "het_eas": info['Het_EAS'],
                    "het_fin": info['Het_FIN'],
                    "het_nfe": info['Het_NFE'],
                    "het_oth": info['Het_OTH'],
                    "het_sas": info['Het_SAS']
                },
                "hom": {
                    "hom_afr": info['Hom_AFR'],
                    "hom_amr": info['Hom_AMR'],
                    "hom_eas": info['Hom_EAS'],
                    "hom_fin": info['Hom_FIN'],
                    "hom_nfe": info['Hom_NFE'],
                    "hom_oth": info['Hom_OTH'],
                    "hom_sas": info['Hom_SAS']
                },
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        obj = (dict_sweep(unlist(value_convert(one_snp_json)), [None]))
        yield obj
Example #9
0
def _map_line_to_json(fields, version):
    chrInfo = fields[0].split(":")  # grch37
    chrom = chrInfo[0]
    chromStart = int(chrInfo[1])
    ma_fin_percent = fields[7].split("/")
    if fields[3]:
        mutation = fields[3].split(">")
        ref = mutation[0]
        alt = mutation[1]
        hg19 = get_pos_start_end(chrom, chromStart, ref, alt)
        hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref,
                                 alt)
        if version == 'hg19':
            HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)
        elif version == 'hg38':
            HGVS = get_hgvs_from_vcf(chrom, hg38[0], ref, alt)

    # load as json data
    if HGVS is None:
        return

    one_snp_json = {
        "_id": HGVS,
        "evs": {
            "chrom": chrom,
            "hg19": {
                "start": hg19[0],
                "end": hg19[1]
            },
            "hg38": {
                "start": hg38[0],
                "end": hg38[1]
            },
            "rsid": fields[1],
            "dbsnp_version": get_dbsnp(fields[2]),
            "ref": ref,
            "alt": alt,
            "allele_count": {
                "european_american": count_dict(fields[4]),
                "african_american": count_dict(fields[5]),
                "all": count_dict(fields[6])
            },
            "ma_fin_percent": {
                "european_american": ma_fin_percent[0],
                "african_american": ma_fin_percent[1],
                "all": ma_fin_percent[2]
            },
            "genotype_count": {
                "european_american": count_dict(fields[8]),
                "african_american": count_dict(fields[9]),
                "all_genotype": count_dict(fields[10])
            },
            "avg_sample_read": fields[11],
            "gene": {
                "symbol": fields[12],
                "accession": fields[13]
            },
            "function_gvs": fields[14],
            "hgvs": {
                "coding": fields[16],
                "protein": fields[15]
            },
            "coding_dna_size": fields[17],
            "conservation": {
                "phast_cons": fields[18],
                "gerp": fields[19]
            },
            "grantham_score": fields[20],
            "polyphen2": {
                "class": polyphen(fields[21])[0],
                "score": polyphen(fields[21])[1]
            },
            "ref_base_ncbi": fields[22],
            "chimp_allele": fields[23],
            "clinical_info": fields[24],
            "filter_status": fields[25],
            "on_illumina_human_exome_chip": fields[26],
            "gwas_pubmed_info": fields[27],
            "estimated_age_kyrs": {
                "ea": fields[28],
                "aa": fields[29]
            }
        }
    }
    return dict_sweep(value_convert(one_snp_json),
                      vals=["NA", "none", "unknown"])
Example #10
0
def _map_line_to_json(fields):
    """Turn one GRASP row into a document for the variant matching its rsid.

    The row must contain exactly ``VALID_COLUMN_NO`` columns.  The rsid in
    column 8 is looked up through the myvariant.info query endpoint and the
    ``_id`` of the first hit becomes the document id; later hits are not
    used.  Returns ``None`` when the rsid is ``None`` or the query has no
    hits.  Otherwise the document is passed through ``value_convert``,
    ``unlist``, ``dict_sweep(..., [""])`` and ``list_split(..., ",")``.
    """
    assert len(fields) == VALID_COLUMN_NO
    rsid = fields[8]

    if rsid is None:
        return

    def columns_to_dict(layout):
        # {key: fields[index]} for every (key, index) pair, in order.
        return dict((key, fields[index]) for key, index in layout)

    # The discovery and replication sections share this layout; each takes
    # 13 consecutive columns.
    population_keys = (
        'total_samples', 'european', 'african', 'east_asian',
        'indian_south_asian', 'hispanic', 'native', 'micronesian',
        'arab_me', 'mixed', 'unspecified', 'filipino', 'indonesian',
    )

    def population_section(start):
        return columns_to_dict((key, start + shift)
                               for shift, key in enumerate(population_keys))

    query_url = 'http://myvariant.info/v1/query?q=dbsnp.rsid:' + rsid + '&fields=_id'
    reply = requests.get(query_url)
    for match in reply.json()['hits']:
        hgvs_id = match['_id']

        annotation = {'hg19': columns_to_dict((('chr', 5), ('pos', 6)))}
        annotation.update(columns_to_dict((
            ('hupfield', 1),
            ('last_curation_date', 2),
            ('creation_date', 3),
            ('srsid', 4),
        )))
        annotation['publication'] = columns_to_dict((
            ('journal', 16),
            ('title', 17),
            ('pmid', 7),
            ('snpid', 8),
            ('location_within_paper', 9),
            ('p_value', 10),
            ('phenotype', 11),
            ('paper_phenotype_description', 12),
            ('paper_phenotype_categories', 13),
            ('date_pub', 14),
        ))
        annotation.update(columns_to_dict((
            ('includes_male_female_only_analyses', 18),
            ('exclusively_male_female', 19),
            ('initial_sample_description', 20),
            ('replication_sample_description', 21),
            ('platform_snps_passing_qc', 22),
            ('gwas_ancestry_description', 23),
        )))
        annotation['discovery'] = population_section(25)
        annotation['replication'] = population_section(38)
        annotation.update(columns_to_dict((
            ('in_gene', 51),
            ('nearest_gene', 52),
            ('in_lincrna', 53),
            ('in_mirna', 54),
            ('in_mirna_bs', 55),
            ('oreg_anno', 61),
            ('conserv_pred_tfbs', 62),
            ('human_enhancer', 63),
            ('rna_edit', 64),
            ('polyphen2', 65),
            ('sift', 66),
            ('ls_snp', 67),
            ('uniprot', 68),
            ('eqtl_meth_metab_study', 69),
        )))

        record = {"_id": hgvs_id, "grasp": annotation}
        # Only the first hit is ever converted.
        return list_split(dict_sweep(unlist(value_convert(record)), [""]), ",")
def _hgvs_category(attribute_type):
    """Return the HGVS bucket name matching a Measure attribute type.

    Buckets are tested in the order 'genomic', 'non-coding', 'coding',
    'protein'.  'non-coding' has to be tested before 'coding' because the
    latter is a substring of the former.  Returns None if nothing matches.
    """
    for category in ('genomic', 'non-coding', 'coding', 'protein'):
        if category in attribute_type:
            return category
    return None


def _map_line_to_json(cp, hg19):
    """Yield one ``clinvar`` document per usable Measure of a ClinVar record.

    :param cp: parsed ClinVar record.  The code reads
        ``cp.ReferenceClinVarAssertion`` (accession, clinical significance,
        MeasureSet, TraitSet, ObservedIn) and ``cp.ClinVarAssertion``.
    :param hg19: truthy to build the ``_id`` from the GRCh37 start/stop,
        falsy to build it from the GRCh38 start/stop.
    :return: generator of dicts ``{"_id": ..., "clinvar": {...}}`` after they
        went through ``value_convert``, ``unlist`` and ``dict_sweep``
        (helpers defined elsewhere; presumably they convert values and drop
        the listed empty values -- TODO confirm).

    Measures of type 'Variation', 'protein only' or 'Microsatellite' are
    skipped.  A Measure for which no id can be derived yields nothing.

    NOTE(review): when a Measure has no AttributeSet, or no id source at all,
    the function prints a message and ``return``s, which ends the whole
    generator -- any following Measure of the same record is skipped too.
    That behaviour is kept unchanged; confirm whether ``continue`` was meant.
    """
    reference = cp.ReferenceClinVarAssertion

    # The optional parts of the record may be missing; fall back to None.
    # ``except Exception`` keeps the best-effort behaviour without swallowing
    # KeyboardInterrupt / SystemExit as the former bare ``except:`` did.
    try:
        clinical_significance = reference.ClinicalSignificance.Description
    except Exception:
        clinical_significance = None
    rcv_accession = reference.ClinVarAccession.Acc
    try:
        review_status = reference.ClinicalSignificance.ReviewStatus
    except Exception:
        review_status = None
    try:
        last_evaluated = reference.ClinicalSignificance.DateLastEvaluated
    except Exception:
        last_evaluated = None
    variant_id = reference.MeasureSet.ID
    number_submitters = len(cp.ClinVarAssertion)
    # some items in clinvar_xml don't have origin information
    try:
        origin = reference.ObservedIn[0].Sample.Origin
    except Exception:
        origin = None

    # Only the first trait of the trait set is described.
    trait = reference.TraitSet.Trait[0]
    synonyms = []
    conditions_name = ''
    for trait_name in trait.Name:
        if trait_name.ElementValue.Type == 'Alternate':
            synonyms.append(trait_name.ElementValue.get_valueOf_())
        if trait_name.ElementValue.Type == 'Preferred':
            conditions_name += trait_name.ElementValue.get_valueOf_()
    identifiers = {}
    for trait_xref in trait.XRef:
        if trait_xref.DB == 'Human Phenotype Ontology':
            db_name = 'Human_Phenotype_Ontology'
        else:
            db_name = trait_xref.DB
        identifiers[db_name.lower()] = trait_xref.ID
    for trait_symbol in trait.Symbol:
        if trait_symbol.ElementValue.Type == 'Preferred':
            conditions_name += \
                ' (' + trait_symbol.ElementValue.get_valueOf_() + ')'
    age_of_onset = ''
    for trait_attribute in trait.AttributeSet:
        if trait_attribute.Attribute.Type == 'age of onset':
            age_of_onset = trait_attribute.Attribute.get_valueOf_()

    # MeasureSet.Measure returns a list, there might be multiple
    # Measure under one MeasureSet
    for Measure in reference.MeasureSet.Measure:
        variation_type = Measure.Type
        # exclude any item of which types belong to
        # 'Variation', 'protein only' or 'Microsatellite'
        if variation_type in ('Variation', 'protein only', 'Microsatellite'):
            continue
        is_copy_number = variation_type in ('copy number loss',
                                            'copy number gain')
        allele_id = Measure.ID
        chrom = None
        chromStart_19 = None
        chromEnd_19 = None
        chromStart_38 = None
        chromEnd_38 = None
        ref = None
        alt = None
        if Measure.SequenceLocation:
            for SequenceLocation in Measure.SequenceLocation:
                # chrom is only taken from the GRCh37 location
                if 'GRCh37' in SequenceLocation.Assembly:
                    chrom = SequenceLocation.Chr
                    chromStart_19 = SequenceLocation.start
                    chromEnd_19 = SequenceLocation.stop
                    ref = SequenceLocation.referenceAllele
                    alt = SequenceLocation.alternateAllele
                if 'GRCh38' in SequenceLocation.Assembly:
                    chromStart_38 = SequenceLocation.start
                    chromEnd_38 = SequenceLocation.stop
                    # GRCh38 alleles only fill in what GRCh37 did not give
                    if not ref:
                        ref = SequenceLocation.referenceAllele
                    if not alt:
                        alt = SequenceLocation.alternateAllele
        if Measure.MeasureRelationship:
            try:
                symbol = Measure.MeasureRelationship[0].\
                    Symbol[0].get_ElementValue().valueOf_
            except Exception:
                symbol = None
            gene_id = Measure.MeasureRelationship[0].XRef[0].ID
        else:
            symbol = None
            gene_id = None
        if Measure.Name:
            name = Measure.Name[0].ElementValue.valueOf_
        else:
            name = None
        # a single cytogenetic location is stored bare, otherwise as the list
        if len(Measure.CytogeneticLocation) == 1:
            cytogenic = Measure.CytogeneticLocation[0]
        else:
            cytogenic = Measure.CytogeneticLocation
        hgvs_coding = None
        hgvs_genome = None
        HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
        coding_hgvs_only = None
        hgvs_id = None
        if hg19:
            chromStart = chromStart_19
            chromEnd = chromEnd_19
        else:
            chromStart = chromStart_38
            chromEnd = chromEnd_38

        if not Measure.AttributeSet:
            print('no measure.attribute %s' % (rcv_accession,))
            return

        for AttributeSet in Measure.AttributeSet:
            attribute = AttributeSet.Attribute
            attribute_type = attribute.Type
            # 'copy number loss' or 'gain' have a format different
            # from other types, so they are dealt with separately
            if is_copy_number:
                if 'HGVS, genomic, top level' in attribute_type:
                    if attribute.integerValue == 37:
                        hgvs_genome = attribute.get_valueOf_()
            category = _hgvs_category(attribute_type)
            if category is not None:
                HGVS[category].append(attribute.get_valueOf_())
            if not is_copy_number:
                if attribute_type == 'HGVS, coding, RefSeq':
                    hgvs_coding = attribute.get_valueOf_()
                elif attribute_type == 'HGVS, genomic, top level, previous':
                    hgvs_genome = attribute.get_valueOf_()
                    # attributes after this one are not collected
                    break

        if chrom and chromStart and chromEnd:
            if variation_type == 'single nucleotide variant':
                hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
            # items whose type belong to 'Indel, Insertion,
            # Duplication' might not have explicit alt information,
            # so it is parsed from hgvs_genome
            elif variation_type == 'Indel':
                # Without the 'del' marker find() would return -1 and the
                # slice would produce a garbage id; skip such records, like
                # the Insertion / Duplication branches already do.
                if hgvs_genome and 'del' in hgvs_genome:
                    indel_position = hgvs_genome.find('del')
                    indel_alt = hgvs_genome[indel_position+3:]
                    hgvs_id = "chr%s:g.%s_%sdel%s" % \
                              (chrom, chromStart, chromEnd, indel_alt)
            elif variation_type == 'Deletion':
                hgvs_id = "chr%s:g.%s_%sdel" % \
                          (chrom, chromStart, chromEnd)
            elif variation_type == 'Insertion':
                if hgvs_genome and 'ins' in hgvs_genome:
                    ins_position = hgvs_genome.find('ins')
                    ins_ref = hgvs_genome[ins_position+3:]
                    hgvs_id = "chr%s:g.%s_%sins%s" % \
                              (chrom, chromStart, chromEnd, ins_ref)
            elif variation_type == 'Duplication':
                if hgvs_genome and 'dup' in hgvs_genome:
                    dup_position = hgvs_genome.find('dup')
                    dup_ref = hgvs_genome[dup_position+3:]
                    hgvs_id = "chr%s:g.%s_%sdup%s" % \
                              (chrom, chromStart, chromEnd, dup_ref)
        elif is_copy_number:
            if hgvs_genome and chrom:
                hgvs_id = "chr" + chrom + ":" + hgvs_genome.split('.')[2]
        elif hgvs_coding:
            # no usable genomic position: fall back to the coding HGVS name
            hgvs_id = hgvs_coding
            coding_hgvs_only = True
        else:
            print("couldn't find any id %s" % (rcv_accession,))
            return

        for key in HGVS:
            HGVS[key].sort()
        rsid = None
        cosmic = None
        dbvar = None
        uniprot = None
        omim = None
        # loop through XRef to find rsid as well as other ids
        if Measure.XRef:
            for XRef in Measure.XRef:
                if XRef.Type == 'rs':
                    rsid = 'rs' + str(XRef.ID)
                elif XRef.DB == 'COSMIC':
                    cosmic = XRef.ID
                elif XRef.DB == 'OMIM':
                    omim = XRef.ID
                elif XRef.DB == 'UniProtKB/Swiss-Prot':
                    uniprot = XRef.ID
                elif XRef.DB == 'dbVar':
                    dbvar = XRef.ID

        # make sure the hgvs_id is not none
        if hgvs_id:
            one_snp_json = {

                "_id": hgvs_id,
                "clinvar":
                    {
                        "allele_id": allele_id,
                        "variant_id": variant_id,
                        "chrom": chrom,
                        "omim": omim,
                        "cosmic": cosmic,
                        "uniprot": uniprot,
                        "dbvar": dbvar,
                        "hg19":
                            {
                                "start": chromStart_19,
                                "end": chromEnd_19
                            },
                        "hg38":
                            {
                                "start": chromStart_38,
                                "end": chromEnd_38
                            },
                        "type": variation_type,
                        "gene":
                            {
                                "id": gene_id,
                                "symbol": symbol
                            },
                        "rcv":
                            {
                                "accession": rcv_accession,
                                "clinical_significance": clinical_significance,
                                "number_submitters": number_submitters,
                                "review_status": review_status,
                                # str(None) == 'None', which is one of the
                                # values handed to dict_sweep below
                                "last_evaluated": str(last_evaluated),
                                "preferred_name": name,
                                "origin": origin,
                                "conditions":
                                    {
                                        "name": conditions_name,
                                        "synonyms": synonyms,
                                        "identifiers": identifiers,
                                        "age_of_onset": age_of_onset
                                }
                            },
                        "rsid": rsid,
                        "cytogenic": cytogenic,
                        "hgvs": HGVS,
                        "coding_hgvs_only": coding_hgvs_only,
                        "ref": ref,
                        "alt": alt
                    }
            }
            obj = (dict_sweep(unlist(value_convert(one_snp_json,
                                                   ['chrom', 'omim', 'id', 'orphanet', 'gene',
                                                    'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])), [None, '', 'None']))
            yield obj
Example #12
0
def _map_line_to_json(fields, version='hg19'):
    """Build a ``dbnsfp`` document from one row of dbNSFP columns.

    :param fields: sequence of column values of one row, indexed by column
        position; "." is the placeholder for a missing value.
    :param version: 'hg19' or 'hg38'; selects which position the ``_id`` is
        built from (column 8 for hg19, column 1 for hg38).
    :return: the document after ``value_convert``, ``unlist``, ``dict_sweep``
        (called with ``vals=["."]``) and ``list_split`` (called with ";"),
        or None when the row has no hg19 position.  Those helpers are defined
        elsewhere; presumably they convert values, drop "." placeholders and
        split ";"-joined strings -- TODO confirm.
    :raises ValueError: if *version* is neither 'hg19' nor 'hg38' (this used
        to surface later as an unbound ``HGVS`` variable).
    """
    if version not in ('hg19', 'hg38'):
        raise ValueError("version must be 'hg19' or 'hg38', got %r"
                         % (version,))
    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'
    # NOTE(review): the original comment here said "fields[7] in version 2,
    # represent hg18_pos", but the code reads the hg18 position from
    # column 10 -- confirm against the file header.
    if fields[10] == ".":
        hg18_end = "."
    else:
        hg18_end = int(fields[10])
    # Without an hg19 position int() below would raise; drop the row the
    # same way the newer dbNSFP parser in this file does.
    if fields[8] == ".":
        return None
    chromStart = int(fields[8])
    chromEnd = chromStart
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    if version == 'hg19':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    else:
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if fields[69] == ".":
        siphy = "."
    else:
        freq = fields[69].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # Pair every accession with the position at the same index.  A list
    # comprehension yields a real list on Python 2 and Python 3 alike
    # (nested map() calls are lazy on Python 3).
    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": fields[8],
                "end": chromEnd
            },
            "hg18": {
                "start": fields[10],
                "end": hg18_end
            },
            "hg38": {
                "start": fields[1],
                "end": fields[1]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[111],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20]
            },
            "sift": {
                "score": fields[23],
                "converted_rankscore": fields[24],
                "pred": fields[25]
            },
            "polyphen2": {
                "hdiv": {
                    "score": fields[29],
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": fields[32],
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": fields[35],
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38]
            },
            "mutationtaster": {
                "score": fields[39],
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43]
            },
            "mutationassessor": {
                "score": fields[46],
                "rankscore": fields[47],
                "pred": fields[48]
            },
            "fathmm": {
                "score": fields[49],
                "rankscore": fields[50],
                "pred": fields[51]
            },
            "provean": {
                "score": fields[52],
                "rankscore": fields[53],
                "pred": fields[54]
            },
            "metasvm": {
                "score": fields[55],
                "rankscore": fields[56],
                "pred": fields[57]
            },
            "lr": {
                "score": fields[58],
                "rankscore": fields[59],
                "pred": fields[60]
            },
            "reliability_index": fields[61],
            "gerp++": {
                "nr": fields[62],
                "rs": fields[63],
                "rs_rankscore": fields[64]
            },
            "phylop_7way": {
                "vertebrate": fields[65],
                "vertebrate_rankscore": fields[66]
            },
            "phastcons_7way": {
                "vertebrate": fields[67],
                "vertebrate_rankscore": fields[68]
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[70],
                "logodds_rankscore": fields[71]
            },
            "1000gp1": {
                "ac": fields[72],
                "af": fields[73],
                "afr_ac": fields[74],
                "afr_af": fields[75],
                "eur_ac": fields[76],
                "eur_af": fields[77],
                "amr_ac": fields[78],
                "amr_af": fields[79],
                "eas_ac": fields[80],
                "eas_af": fields[81],
                "sas_ac": fields[82],
                "sas_af": fields[83]
            },
            "twinsuk": {
                "ac": fields[84],
                "af": fields[85]
            },
            "alspac": {
                "ac": fields[86],
                "af": fields[87]
            },
            "esp6500": {
                "aa_ac": fields[88],
                "aa_af": fields[89],
                "ea_ac": fields[90],
                "ea_af": fields[91]
            },
            "exac": {
                "ac": fields[92],
                "af": fields[93],
                "adj_ac": fields[94],
                "adj_af": fields[95],
                "afr_ac": fields[96],
                "afr_af": fields[97],
                "amr_ac": fields[98],
                "amr_af": fields[99],
                "eas_ac": fields[100],
                "eas_af": fields[101],
                "fin_ac": fields[102],
                "fin_af": fields[103],
                "nfe_ac": fields[104],
                "nfe_af": fields[105],
                "sas_ac": fields[106],
                "sas_af": fields[107]
            },
            "clinvar": {
                "rs": fields[108],
                "clinsig": fields[109],
                "trait": fields[110]
            }
        }
    }

    one_snp_json = list_split(dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    # the chromosome is forced back to a string after the conversions above
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
Example #13
0
def _map_line_to_json(fields):
    """Build a ``cadd`` document from one row of CADD annotation columns.

    :param fields: sequence of column values of one row; its length must
        equal ``VALID_COLUMN_NO`` (checked with ``assert``).
    :return: None when ``get_hgvs_from_vcf`` gives no id for the row,
        otherwise the document ``{"_id": ..., "cadd": {...}}`` passed through
        ``value_convert``, ``unlist`` and ``dict_sweep`` (called with
        ``["NA"]``).  Those helpers are defined elsewhere; presumably they
        convert values and drop "NA" entries -- TODO confirm.
    """
    assert len(fields) == VALID_COLUMN_NO
    # chrom, pos, ref and alt sit in columns 0, 1, 2 and 4
    variant_id = get_hgvs_from_vcf(fields[0], fields[1], fields[2], fields[4])

    # load as json data
    if variant_id is None:
        return

    # The nested groups are built first and assembled below in column order.
    mapability = {'20bp': fields[15], '35bp': fields[16]}
    phast_cons = {
        'primate': fields[18],
        'mammalian': fields[19],
        'vertebrate': fields[20],
    }
    phylop = {
        'primate': fields[21],
        'mammalian': fields[22],
        'vertebrate': fields[23],
    }
    gerp = {
        'n': fields[24],
        's': fields[25],
        'rs': fields[26],
        'rs_pval': fields[27],
    }
    dna = {
        'helt': fields[30],
        'mgw': fields[31],
        'prot': fields[32],
        'roll': fields[33],
    }
    mirsvr = {'score': fields[34], 'e': fields[35], 'aln': fields[36]}
    # column 45 is deliberately not emitted (it was commented out as a
    # second 'enh' key in the original mapping)
    chmm = {
        'tssa': fields[39],
        'tssaflnk': fields[40],
        'txflnk': fields[41],
        'tx': fields[42],
        'txwk': fields[43],
        'enh': fields[44],
        'znfrpts': fields[46],
        'het': fields[47],
        'tssbiv': fields[48],
        'bivflnk': fields[49],
        'enhbiv': fields[50],
        'reprpc': fields[51],
        'reprpcwk': fields[52],
        'quies': fields[53],
    }
    encode_p_val = {
        'comb': fields[60],
        'dnas': fields[61],
        'faire': fields[62],
        'polii': fields[63],
        'ctcf': fields[64],
        'mycp': fields[65],
    }
    encode_sig = {
        'dnase': fields[66],
        'faire': fields[67],
        'polii': fields[68],
        'ctcf': fields[69],
        'myc': fields[70],
    }
    encode = {
        'exp': fields[54],
        'h3k27ac': fields[55],
        'h3k4me1': fields[56],
        'h3k4me3': fields[57],
        'nucleo': fields[58],
        'occ': fields[59],
        'p_val': encode_p_val,
        'sig': encode_sig,
    }
    motif = {
        'toverlap': fields[72],
        'dist': fields[73],
        'ecount': fields[74],
        'ename': fields[75],
        'ehipos': fields[76],
        'escorechng': fields[77],
    }
    tf = {
        'bs': fields[78],
        'bs_peaks': fields[79],
        'bs_peaks_max': fields[80],
    }
    esp = {'af': fields[82], 'afr': fields[83], 'eur': fields[84]}
    thousand_genomes = {
        'af': fields[85],
        'asn': fields[86],
        'amr': fields[87],
        'afr': fields[88],
        'eur': fields[89],
    }
    gene_cds = {
        'cdna_pos': fields[96],
        'rel_cdna_pos': fields[97],
        'cds_pos': fields[98],
        'rel_cds_pos': fields[99],
    }
    gene_prot = {
        'protpos': fields[100],
        'rel_prot_pos': fields[101],
        'domain': fields[102],
    }
    gene = {
        'gene_id': fields[92],
        'feature_id': fields[93],
        'ccds_id': fields[94],
        'genename': fields[95],
        'cds': gene_cds,
        'prot': gene_prot,
    }
    polyphen = {'cat': fields[110], 'val': fields[111]}
    sift = {'cat': fields[112], 'val': fields[113]}

    cadd = {
        'chrom': fields[0],
        'pos': fields[1],
        'ref': fields[2],
        'anc': fields[3],
        'alt': fields[4],
        'type': fields[5],
        'length': fields[6],
        'istv': fields[7],
        'isderived': fields[8],
        'annotype': fields[9],
        'consequence': fields[10],
        'consscore': fields[11],
        'consdetail': fields[12],
        'gc': fields[13],
        'cpg': fields[14],
        'mapability': mapability,
        'scoresegdup': fields[17],
        'phast_cons': phast_cons,
        'phylop': phylop,
        'gerp': gerp,
        'bstatistic': fields[28],
        'mutindex': fields[29],
        'dna': dna,
        'mirsvr': mirsvr,
        'targetscans': fields[37],
        'fitcons': fields[38],
        'chmm': chmm,
        'encode': encode,
        'segway': fields[71],
        'motif': motif,
        'tf': tf,
        'isknownvariant': fields[81],
        'esp': esp,
        '1000g': thousand_genomes,
        'min_dist_tss': fields[90],
        'min_dist_tse': fields[91],
        'gene': gene,
        'dst2splice': fields[103],
        'dst2spltype': fields[104],
        'exon': fields[105],
        'intron': fields[106],
        'oaa': fields[107],  # ref aa
        'naa': fields[108],  # alt aa
        'grantham': fields[109],
        'polyphen': polyphen,
        'sift': sift,
        'rawscore': fields[114],  # raw CADD score
        'phred': fields[115],  # log-percentile of raw CADD score
    }
    document = {"_id": variant_id, "cadd": cadd}

    return dict_sweep(unlist(value_convert(document)), ["NA"])
def _map_line_to_json(fields, version):
    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'
    # fields[7] in version 2, represent hg18_pos
    if fields[10] == ".":
        hg18_end = "."
    else:
        hg18_end = int(fields[10])
    # in case of no hg19 position provided, remove the item
    if fields[8] == '.':
        return None
    else:
        chromStart = int(fields[8])
        chromEnd = int(fields[8])
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    if fields[105] == ".":
        siphy = "."
    else:
        freq = fields[105].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    gtex_gene = fields[181].split('|')
    gtex_tissue = fields[182].split('|')
    gtex = map(dict, map(lambda t: zip(('gene', 'tissue'), t), zip(gtex_gene, gtex_tissue)))
    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos)))
    provean_score = fields[52].split(';')
    sift_score = fields[23].split(';')
    hdiv_score = fields[29].split(';')
    hvar_score = fields[32].split(';')
    lrt_score = fields[35].split(';')
    dann_score = fields[69].split(';')
    mutationtaster_score = fields[39].split(';')
    mutationassessor_score = fields[46].split(';')
    vest3_score = fields[57].split(';')
    metasvm_score = fields[59].split(';')
    fathmm_score = fields[49].split(';')
    lr_score = fields[62].split(';')
    fathmm_coding_score = fields[71].split(';')
    integrated_fitcons_score = fields[82].split(';')
    gm12878_fitcons_score = fields[85].split(';')
    h1_hesc_fitcons_score = fields[88].split(';')
    huvec_fitcons_score = fields[91].split(';')
    if len(provean_score) > 1:
        for i in range(len(provean_score)):
            if provean_score[i] == '.':
                provean_score[i] = None
    if len(sift_score) > 1:
        for i in range(len(sift_score)):
            if sift_score[i] == '.':
                sift_score[i] = None
    if len(hdiv_score) > 1:
        for i in range(len(hdiv_score)):
            if hdiv_score[i] == '.':
                hdiv_score[i] = None
    if len(hvar_score) > 1:
        for i in range(len(hvar_score)):
            if hvar_score[i] == '.':
                hvar_score[i] = None
    if len(lrt_score) > 1:
        for i in range(len(lrt_score)):
            if lrt_score[i] == '.':
                lrt_score[i] = None
    if len(mutationtaster_score) > 1:
        for i in range(len(mutationtaster_score)):
            if mutationtaster_score[i] == '.':
                mutationtaster_score[i] = None
    if len(mutationassessor_score) > 1:
        for i in range(len(mutationassessor_score)):
            if mutationassessor_score[i] == '.':
                mutationassessor_score[i] = None
    if len(metasvm_score) > 1:
        for i in range(len(metasvm_score)):
            if metasvm_score[i] == '.':
                metasvm_score[i] = None
    if len(vest3_score) > 1:
        for i in range(len(vest3_score)):
            if vest3_score[i] == '.':
                vest3_score[i] = None
    if len(fathmm_score) > 1:
        for i in range(len(fathmm_score)):
            if fathmm_score[i] == '.':
                fathmm_score[i] = None
    if len(lr_score) > 1:
        for i in range(len(lr_score)):
            if lr_score[i] == '.':
                lr_score[i] = None
    if len(fathmm_coding_score) > 1:
        for i in range(len(fathmm_coding_score)):
            if fathmm_coding_score[i] == '.':
                fathmm_coding_score[i] = None
    if len(dann_score) > 1:
        for i in range(len(dann_score)):
            if dann_score[i] == '.':
                dann_score[i] = None
    if len(integrated_fitcons_score) > 1:
        for i in range(len(integrated_fitcons_score)):
            if integrated_fitcons_score[i] == '.':
                integrated_fitcons_score[i] = None
    if len(gm12878_fitcons_score) > 1:
        for i in range(len(gm12878_fitcons_score)):
            if gm12878_fitcons_score[i] == '.':
                gm12878_fitcons_score[i] = None
    if len(h1_hesc_fitcons_score) > 1:
        for i in range(len(h1_hesc_fitcons_score)):
            if h1_hesc_fitcons_score[i] == '.':
                h1_hesc_fitcons_score[i] = None
    if len(huvec_fitcons_score) > 1:
        for i in range(len(huvec_fitcons_score)):
            if huvec_fitcons_score[i] == '.':
                huvec_fitcons_score[i] = None
# load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            #"rsid_dbSNP144": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": fields[10],
                "end": hg18_end
            },
            "hg38": {
                "start": fields[1],
                "end": fields[1]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
                "codon_degeneracy": fields[15]
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[180],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20],
                "proteinid": fields[21]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": fields[24],
                "pred": fields[25]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": fields[47],
                "pred": fields[48]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": fields[50],
                "pred": fields[51]
            },
            "provean": {
                "score": provean_score,
                "rankscore": fields[53],
                "pred": fields[54]
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": fields[57],
                "transcriptid": fields[55],
                "transcriptvar": fields[56]
            },
            "fathmm-mkl": {
                "coding_score": fathmm_coding_score,
                "coding_rankscore": fields[72],
                "coding_pred": fields[73],
                "coding_group": fields[74]
            },
            "eigen": {
                "raw": fields[75],
                "phred": fields[76],
                "raw_rankscore": fields[77]
            },
            "eigen-pc": {
                "raw": fields[78],
                "raw_rankscore": fields[79]
            },
            "genocanyon": {
                "score": fields[80],
                "rankscore": fields[81]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": fields[60],
                "pred": fields[61]
            },
            "metalr": {
                "score": lr_score,
                "rankscore": fields[63],
                "pred": fields[64]
            },
            "reliability_index": fields[65],
            "dann": {
                "score": dann_score,
                "rankscore": fields[70]
            },
            "gerp++": {
                "nr": fields[94],
                "rs": fields[95],
                "rs_rankscore": fields[96]
            },
            "integrated": {
                "fitcons_score": integrated_fitcons_score,
                "fitcons_rankscore": fields[83],
                "confidence_value": fields[84]
            },
            "gm12878": {
                "fitcons_score": gm12878_fitcons_score,
                "fitcons_rankscore": fields[86],
                "confidence_value": fields[87]
            },
            "h1-hesc": {
                "fitcons_score": h1_hesc_fitcons_score,
                "fitcons_rankscore": fields[89],
                "confidence_value": fields[90]
            },
            "huvec": {
                "fitcons_score": huvec_fitcons_score,
                "fitcons_rankscore": fields[92],
                "confidence_value": fields[93]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": fields[97],
                    "vertebrate_rankscore": fields[98]
                },
                "p20way": {
                    "mammalian": fields[99],
                    "mammalian_rankscore": fields[100]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": fields[101],
                    "vertebrate_rankscore": fields[102]
                },
                "20way": {
                    "mammalian": fields[103],
                    "mammalian_rankscore": fields[104]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[106],
                "logodds_rankscore": fields[107]
            },
            "1000gp3": {
                "ac": fields[108],
                "af": fields[109],
                "afr_ac": fields[110],
                "afr_af": fields[111],
                "eur_ac": fields[112],
                "eur_af": fields[113],
                "amr_ac": fields[114],
                "amr_af": fields[115],
                "eas_ac": fields[116],
                "eas_af": fields[117],
                "sas_ac": fields[118],
                "sas_af": fields[119]
            },
            "twinsuk": {
                "ac": fields[120],
                "af": fields[121]
            },
            "alspac": {
                "ac": fields[122],
                "af": fields[123]
            },
            "esp6500": {
                "aa_ac": fields[124],
                "aa_af": fields[125],
                "ea_ac": fields[126],
                "ea_af": fields[127]
            },
            "exac": {
                "ac": fields[128],
                "af": fields[129],
                "adj_ac": fields[130],
                "adj_af": fields[131],
                "afr_ac": fields[132],
                "afr_af": fields[133],
                "amr_ac": fields[134],
                "amr_af": fields[135],
                "eas_ac": fields[136],
                "eas_af": fields[137],
                "fin_ac": fields[138],
                "fin_af": fields[139],
                "nfe_ac": fields[140],
                "nfe_af": fields[141],
                "sas_ac": fields[142],
                "sas_af": fields[143]
            },
            "exac_nontcga": {
                "ac": fields[144],
                "af": fields[145],
                "adj_ac": fields[146],
                "adj_af": fields[147],
                "afr_ac": fields[148],
                "afr_af": fields[149],
                "amr_ac": fields[150],
                "amr_af": fields[151],
                "eas_ac": fields[152],
                "eas_af": fields[153],
                "fin_ac": fields[154],
                "fin_af": fields[155],
                "nfe_ac": fields[156],
                "nfe_af": fields[157],
                "sas_ac": fields[158],
                "sas_af": fields[159]
            },
            "exac_nonpsych": {
                "ac": fields[160],
                "af": fields[161],
                "adj_ac": fields[162],
                "adj_af": fields[163],
                "afr_ac": fields[164],
                "afr_af": fields[165],
                "amr_ac": fields[166],
                "amr_af": fields[167],
                "eas_ac": fields[168],
                "eas_af": fields[169],
                "fin_ac": fields[170],
                "fin_af": fields[171],
                "nfe_ac": fields[172],
                "nfe_af": fields[173]
            },
            "clinvar": {
                "rs": fields[176],
                "clinsig": fields[177],
                "trait": fields[178],
                "golden_stars": fields[179]
            },
            "gtex": gtex
        }
    }

    one_snp_json = list_split(dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
Example #15
0
def _map_line_to_json(fields):
    """Convert one split dbNSFP row into a variant document.

    ``fields`` is an indexable sequence of column strings that is addressed
    purely by position; the highest index read is 97.

    Returns a dict shaped ``{"_id": <id>, "dbnsfp": {...}}`` after it has
    gone through ``value_convert``, ``unlist``, ``dict_sweep`` (called with
    ``vals=["."]``) and ``list_split`` (called with ``";"``).  The
    ``dbnsfp.chrom`` value is cast back to ``str`` before returning.
    """
    chrom = fields[0]
    # "." marks a missing hg18 position; otherwise the end is position + 1
    hg18_end = "." if fields[7] == "." else int(fields[7]) + 1
    hg19_start = int(fields[1])
    hg19_end = hg19_start + 1
    first_allele, second_allele = fields[2], fields[3]
    variant_id = "chr%s:g.%d%s>%s" % (
        chrom, hg19_start, first_allele, second_allele)

    # colon-separated quadruple stored under the keys a / c / g / t
    siphy = "."
    if fields[74] != ".":
        parts = fields[74].split(":")
        siphy = {'a': parts[0], 'c': parts[1], 'g': parts[2], 't': parts[3]}

    # pair the i-th accession with the i-th position; a trailing ";" and
    # trailing whitespace are dropped before splitting
    accessions = fields[11].rstrip().rstrip(';').split(";")
    positions = fields[13].rstrip().rstrip(';').split(";")
    uniprot = map(
        dict,
        (zip(('acc', 'pos'), pair) for pair in zip(accessions, positions)))

    # assemble the raw document
    record = {
        "_id": variant_id,
        "dbnsfp": {
            "chrom": chrom,
            "hg19": {"start": fields[1], "end": hg19_end},
            "hg18": {"start": fields[7], "end": hg18_end},
            "hg38": {"chrom": fields[8], "pos": fields[9]},
            "allele1": first_allele,
            "allele2": second_allele,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[23],
                "refcodon": fields[16],
                "codonpos": fields[18],
                "aapos_sift": fields[24],
                "aapos_fathmm": fields[25],
            },
            "genename": fields[10],
            "uniprot": uniprot,
            "interpro_domain": fields[14],
            "cds_strand": fields[15],
            "slr_test_statistic": fields[17],
            "fold-degenerate": fields[19],
            "ancestral_allele": fields[20],
            "ensembl": {"geneid": fields[21], "transcriptid": fields[22]},
            "sift": {
                "score": fields[26],
                "converted_rankscore": fields[27],
                "pred": fields[28],
            },
            "polyphen2": {
                "hdiv": {
                    "score": fields[29],
                    "rankscore": fields[30],
                    "pred": fields[31],
                },
                "hvar": {
                    "score": fields[32],
                    "rankscore": fields[33],
                    "pred": fields[34],
                },
            },
            "lrt": {
                "score": fields[35],
                "converted_rankscore": fields[36],
                "pred": fields[37],
            },
            "mutationtaster": {
                "score": fields[38],
                "converted_rankscore": fields[39],
                "pred": fields[40],
            },
            "mutationassessor": {
                "score": fields[41],
                "rankscore": fields[42],
                "pred": fields[43],
            },
            "fathmm": {
                "score": fields[44],
                "rankscore": fields[45],
                "pred": fields[46],
            },
            "radialsvm": {
                "score": fields[47],
                "rankscore": fields[48],
                "pred": fields[49],
            },
            "lr": {
                "score": fields[50],
                "rankscore": fields[51],
                "pred": fields[52],
            },
            "reliability_index": fields[53],
            "vest3": {"score": fields[54], "rankscore": fields[55]},
            "cadd": {
                "raw": fields[56],
                "raw_rankscore": fields[57],
                "phred": fields[58],
            },
            "gerp++": {
                "nr": fields[59],
                "rs": fields[60],
                "rs_rankscore": fields[61],
            },
            "phylop": {
                "46way": {
                    "primate": fields[62],
                    "primate_rankscore": fields[63],
                    "placental": fields[64],
                    "placental_rankscore": fields[65],
                },
                "100way": {
                    "vertebrate": fields[66],
                    "vertebrate_rankscore": fields[67],
                },
            },
            "phastcons": {
                "46way": {
                    "primate": fields[68],
                    "primate_rankscore": fields[69],
                    "placental": fields[70],
                    "placental_rankscore": fields[71],
                },
                "100way": {
                    "vertebrate": fields[72],
                    "vertebrate_rankscore": fields[73],
                },
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[75],
                "logodds_rankscore": fields[76],
            },
            "lrt_omega": fields[77],
            "unisnp_ids": fields[78],
            "1000gp1": {
                "ac": fields[79],
                "af": fields[80],
                "afr_ac": fields[81],
                "afr_af": fields[82],
                "eur_ac": fields[83],
                "eur_af": fields[84],
                "amr_ac": fields[85],
                "amr_af": fields[86],
                "asn_ac": fields[87],
                "asn_af": fields[88],
            },
            "esp6500": {"aa_af": fields[89], "ea_af": fields[90]},
            "aric5606": {
                "aa_ac": fields[91],
                "aa_af": fields[92],
                "ea_ac": fields[93],
                "ea_af": fields[94],
            },
            "clinvar": {
                "rs": fields[95],
                "clin_sig": fields[96],
                "trait": fields[97],
            },
        },
    }

    # normalise values, sweep the "." placeholder, split ";" lists
    converted = unlist(value_convert(record))
    record = list_split(dict_sweep(converted, vals=["."]), ";")
    # chrom has to stay a string even if conversion changed its type
    record["dbnsfp"]["chrom"] = str(record["dbnsfp"]["chrom"])
    return record
def _map_line_to_json(cp):
    """Yield ClinVar documents built from one parsed ClinVar record.

    ``cp`` is read through ``cp.ReferenceClinVarAssertion`` (accession,
    clinical significance, observed origin, first trait, measure set) and
    ``cp.ClinVarAssertion`` (only its length, reported as
    ``number_submitters``).

    One document is produced per Measure in the measure set, except that:

    * Measures of type 'Variation', 'protein only' or 'Microsatellite'
      are skipped;
    * Measures for which no ``_id`` could be derived are skipped;
    * a Measure without any AttributeSet, or one that has neither GRCh37
      coordinates, a copy-number type, nor a RefSeq coding HGVS, prints a
      message and ends the generator (later Measures are not processed).

    Yields:
        dict: ``{"_id": ..., "clinvar": {...}}`` passed through
        ``value_convert``, ``unlist`` and ``dict_sweep``.
    """
    reference = cp.ReferenceClinVarAssertion
    # The clinical-significance values are best-effort: any failure while
    # reading them leaves the value as None.  ``except Exception`` (rather
    # than a bare except) lets KeyboardInterrupt / SystemExit through.
    try:
        clinical_significance = reference.ClinicalSignificance.Description
    except Exception:
        clinical_significance = None
    rcv_accession = reference.ClinVarAccession.Acc
    try:
        review_status = reference.ClinicalSignificance.ReviewStatus
    except Exception:
        review_status = None
    try:
        last_evaluated = reference.ClinicalSignificance.DateLastEvaluated
    except Exception:
        last_evaluated = None
    variant_id = reference.MeasureSet.ID
    number_submitters = len(cp.ClinVarAssertion)
    # some records carry no origin information
    try:
        origin = reference.ObservedIn[0].Sample.Origin
    except Exception:
        origin = None

    # condition details are taken from the first trait only
    trait = reference.TraitSet.Trait[0]
    synonyms = []
    conditions_name = ''
    for trait_name in trait.Name:
        element = trait_name.ElementValue
        if element.Type == 'Alternate':
            synonyms.append(element.get_valueOf_())
        if element.Type == 'Preferred':
            conditions_name += element.get_valueOf_()
    identifiers = {}
    for trait_xref in trait.XRef:
        if trait_xref.DB == 'Human Phenotype Ontology':
            key = 'Human_Phenotype_Ontology'
        else:
            key = trait_xref.DB
        identifiers[key.lower()] = trait_xref.ID
    for trait_symbol in trait.Symbol:
        if trait_symbol.ElementValue.Type == 'Preferred':
            conditions_name += \
                ' (' + trait_symbol.ElementValue.get_valueOf_() + ')'
    age_of_onset = ''
    for _set in trait.AttributeSet:
        if _set.Attribute.Type == 'age of onset':
            age_of_onset = _set.Attribute.get_valueOf_()

    # MeasureSet.Measure is a list; one MeasureSet may hold several Measures
    for measure in reference.MeasureSet.Measure:
        variation_type = measure.Type
        # these variation types are never turned into documents
        if variation_type in ('Variation', 'protein only', 'Microsatellite'):
            continue
        # copy number records use a different HGVS layout (handled below)
        is_copy_number = variation_type in ('copy number loss',
                                            'copy number gain')
        allele_id = measure.ID
        chrom = None
        chromStart = None
        chromEnd = None
        chromStart_38 = None
        chromEnd_38 = None
        ref = None
        alt = None
        if measure.SequenceLocation:
            for location in measure.SequenceLocation:
                # GRCh37 provides chromosome, coordinates and alleles
                if 'GRCh37' in location.Assembly:
                    chrom = location.Chr
                    chromStart = location.start
                    chromEnd = location.stop
                    ref = location.referenceAllele
                    alt = location.alternateAllele
                # GRCh38 provides its own coordinates and only fills in
                # alleles that are still unset
                if 'GRCh38' in location.Assembly:
                    chromStart_38 = location.start
                    chromEnd_38 = location.stop
                    if not ref:
                        ref = location.referenceAllele
                    if not alt:
                        alt = location.alternateAllele
        if measure.MeasureRelationship:
            relationship = measure.MeasureRelationship[0]
            try:
                symbol = relationship.Symbol[0].get_ElementValue().valueOf_
            except Exception:
                symbol = None
            gene_id = relationship.XRef[0].ID
        else:
            symbol = None
            gene_id = None
        if measure.Name:
            name = measure.Name[0].ElementValue.valueOf_
        else:
            name = None
        # a single cytogenetic location is unwrapped from its list
        if len(measure.CytogeneticLocation) == 1:
            cytogenic = measure.CytogeneticLocation[0]
        else:
            cytogenic = measure.CytogeneticLocation
        hgvs_coding = None
        hgvs_genome = None
        HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
        coding_hgvs_only = None
        hgvs_id = None

        if not measure.AttributeSet:
            # same text as the former ``print a, b`` statement, but valid
            # under both Python 2 and Python 3
            print('no measure.attribute %s' % (rcv_accession,))
            return

        for attr_set in measure.AttributeSet:
            attribute = attr_set.Attribute
            attr_type = attribute.Type
            if is_copy_number and 'HGVS, genomic, top level' in attr_type:
                if attribute.integerValue == 37:
                    hgvs_genome = attribute.get_valueOf_()
            # 'non-coding' must be tested before 'coding' (substring)
            if 'genomic' in attr_type:
                HGVS['genomic'].append(attribute.get_valueOf_())
            elif 'non-coding' in attr_type:
                HGVS['non-coding'].append(attribute.get_valueOf_())
            elif 'coding' in attr_type:
                HGVS['coding'].append(attribute.get_valueOf_())
            elif 'protein' in attr_type:
                HGVS['protein'].append(attribute.get_valueOf_())
            if not is_copy_number:
                if attr_type == 'HGVS, coding, RefSeq':
                    hgvs_coding = attribute.get_valueOf_()
                elif attr_type == 'HGVS, genomic, top level, previous':
                    hgvs_genome = attribute.get_valueOf_()
                    # later attributes are deliberately not collected
                    break

        if chrom and chromStart and chromEnd:
            if variation_type == 'single nucleotide variant':
                hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
            # Indel / Insertion / Duplication may lack explicit alt
            # information, so the suffix is parsed out of hgvs_genome.
            # Each keyword must really occur: str.find() returns -1 when
            # it does not, which would slice from the wrong offset.
            elif variation_type == 'Indel':
                if hgvs_genome and 'del' in hgvs_genome:
                    indel_alt = hgvs_genome[hgvs_genome.find('del') + 3:]
                    hgvs_id = "chr%s:g.%s_%sdel%s" % \
                              (chrom, chromStart, chromEnd, indel_alt)
            elif variation_type == 'Deletion':
                hgvs_id = "chr%s:g.%s_%sdel" % \
                          (chrom, chromStart, chromEnd)
            elif variation_type == 'Insertion':
                if hgvs_genome and 'ins' in hgvs_genome:
                    ins_ref = hgvs_genome[hgvs_genome.find('ins') + 3:]
                    hgvs_id = "chr%s:g.%s_%sins%s" % \
                              (chrom, chromStart, chromEnd, ins_ref)
            elif variation_type == 'Duplication':
                if hgvs_genome and 'dup' in hgvs_genome:
                    dup_ref = hgvs_genome[hgvs_genome.find('dup') + 3:]
                    hgvs_id = "chr%s:g.%s_%sdup%s" % \
                              (chrom, chromStart, chromEnd, dup_ref)
        elif is_copy_number:
            if hgvs_genome:
                pieces = hgvs_genome.split('.')
                hgvs_id = "chr" + pieces[1] + pieces[2]
        elif hgvs_coding:
            hgvs_id = hgvs_coding
            coding_hgvs_only = True
        else:
            print("couldn't find any id %s" % (rcv_accession,))
            return

        for key in HGVS:
            HGVS[key].sort()
        rsid = None
        cosmic = None
        dbvar = None
        uniprot = None
        omim = None
        # loop through XRef to find rsid as well as other ids
        if measure.XRef:
            for xref in measure.XRef:
                if xref.Type == 'rs':
                    rsid = 'rs' + str(xref.ID)
                elif xref.DB == 'COSMIC':
                    cosmic = xref.ID
                elif xref.DB == 'OMIM':
                    omim = xref.ID
                elif xref.DB == 'UniProtKB/Swiss-Prot':
                    uniprot = xref.ID
                elif xref.DB == 'dbVar':
                    dbvar = xref.ID

        # only Measures with a usable id become documents
        if hgvs_id:
            one_snp_json = {
                "_id": hgvs_id,
                "clinvar": {
                    "allele_id": allele_id,
                    "variant_id": variant_id,
                    "chrom": chrom,
                    "omim": omim,
                    "cosmic": cosmic,
                    "uniprot": uniprot,
                    "dbvar": dbvar,
                    "hg19": {
                        "start": chromStart,
                        "end": chromEnd
                    },
                    "hg38": {
                        "start": chromStart_38,
                        "end": chromEnd_38
                    },
                    "type": variation_type,
                    "gene": {
                        "id": gene_id,
                        "symbol": symbol
                    },
                    "rcv": {
                        "accession": rcv_accession,
                        "clinical_significance": clinical_significance,
                        "number_submitters": number_submitters,
                        "review_status": review_status,
                        "last_evaluated": str(last_evaluated),
                        "preferred_name": name,
                        "origin": origin,
                        "conditions": {
                            "name": conditions_name,
                            "synonyms": synonyms,
                            "identifiers": identifiers,
                            "age_of_onset": age_of_onset
                        }
                    },
                    "rsid": rsid,
                    "cytogenic": cytogenic,
                    "hgvs": HGVS,
                    "coding_hgvs_only": coding_hgvs_only,
                    "ref": ref,
                    "alt": alt
                }
            }
            obj = (dict_sweep(
                unlist(
                    value_convert(one_snp_json, [
                        'chrom', 'omim', 'id', 'orphanet', 'gene',
                        'rettbase_(cdkl5)', 'cosmic', 'dbrbc'
                    ])), [None, '', 'None']))
            yield obj
def _map_line_to_json(cp):
    """Yield flat ClinVar documents built from one parsed ClinVar record.

    ``cp`` is read through ``cp.ReferenceClinVarAssertion`` (accession,
    clinical significance, observed origin, measure set) and
    ``cp.ClinVarAssertion`` (only its length, reported as
    ``number_submitters``).  The ClinicalSignificance element is accessed
    without a guard, so a record lacking it raises.

    One document is produced per Measure in the measure set, except that:

    * Measures of type 'Variation', 'protein only' or 'Microsatellite'
      are skipped;
    * Measures for which no ``_id`` could be derived are skipped;
    * a Measure without any AttributeSet, or one that has neither GRCh37
      coordinates, a copy-number type, nor a RefSeq coding HGVS, prints a
      message and ends the generator (later Measures are not processed).

    Yields:
        dict: ``{"_id": ..., "clinvar": {...}}`` passed through
        ``value_convert``, ``unlist`` and ``dict_sweep``.
    """
    reference = cp.ReferenceClinVarAssertion
    clinical_significance = reference.ClinicalSignificance.Description
    rcv_accession = reference.ClinVarAccession.Acc
    review_status = reference.ClinicalSignificance.ReviewStatus
    last_evaluated = reference.ClinicalSignificance.DateLastEvaluated
    clinvar_id = reference.MeasureSet.ID
    number_submitters = len(cp.ClinVarAssertion)
    # Some records carry no origin information.  ``except Exception``
    # (rather than a bare except) lets KeyboardInterrupt / SystemExit
    # through.
    try:
        origin = reference.ObservedIn[0].Sample.Origin
    except Exception:
        origin = None

    # MeasureSet.Measure is a list; one MeasureSet may hold several Measures
    for measure in reference.MeasureSet.Measure:
        variation_type = measure.Type
        # these variation types are never turned into documents
        if variation_type in ('Variation', 'protein only', 'Microsatellite'):
            continue
        # copy number records use a different HGVS layout (handled below)
        is_copy_number = variation_type in ('copy number loss',
                                            'copy number gain')
        allele_id = measure.ID
        chrom = None
        chromStart = None
        chromEnd = None
        ref = None
        alt = None
        if measure.SequenceLocation:
            for location in measure.SequenceLocation:
                # only GRCh37 information is accepted in this version
                if 'GRCh37' in location.Assembly:
                    chrom = location.Chr
                    chromStart = location.start
                    chromEnd = location.stop
                    ref = location.referenceAllele
                    alt = location.alternateAllele
        if measure.MeasureRelationship:
            relationship = measure.MeasureRelationship[0]
            try:
                symbol = relationship.Symbol[0].get_ElementValue().valueOf_
            except Exception:
                symbol = None
            gene_id = relationship.XRef[0].ID
        else:
            symbol = None
            gene_id = None
        if measure.Name:
            name = measure.Name[0].ElementValue.valueOf_
        else:
            name = None
        # a single cytogenetic location is unwrapped from its list
        if len(measure.CytogeneticLocation) == 1:
            cytogenic = measure.CytogeneticLocation[0]
        else:
            cytogenic = measure.CytogeneticLocation
        hgvs_coding = None
        hgvs_genome = None
        HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
        coding_hgvs_only = None
        hgvs_id = None

        if not measure.AttributeSet:
            # same text as the former ``print a, b`` statement, but valid
            # under both Python 2 and Python 3
            print('no measure.attribute %s' % (rcv_accession,))
            return

        for attr_set in measure.AttributeSet:
            attribute = attr_set.Attribute
            attr_type = attribute.Type
            if is_copy_number and 'HGVS, genomic, top level' in attr_type:
                if attribute.integerValue == 37:
                    hgvs_genome = attribute.get_valueOf_()
            # 'non-coding' must be tested before 'coding' (substring)
            if 'genomic' in attr_type:
                HGVS['genomic'].append(attribute.get_valueOf_())
            elif 'non-coding' in attr_type:
                HGVS['non-coding'].append(attribute.get_valueOf_())
            elif 'coding' in attr_type:
                HGVS['coding'].append(attribute.get_valueOf_())
            elif 'protein' in attr_type:
                HGVS['protein'].append(attribute.get_valueOf_())
            if not is_copy_number:
                if attr_type == 'HGVS, coding, RefSeq':
                    hgvs_coding = attribute.get_valueOf_()
                elif attr_type == 'HGVS, genomic, top level, previous':
                    hgvs_genome = attribute.get_valueOf_()
                    # later attributes are deliberately not collected
                    break

        if chrom and chromStart and chromEnd:
            if variation_type == 'single nucleotide variant':
                hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
            # Indel / Insertion / Duplication may lack explicit alt
            # information, so the suffix is parsed out of hgvs_genome.
            # Each keyword must really occur: str.find() returns -1 when
            # it does not, which would slice from the wrong offset.
            elif variation_type == 'Indel':
                if hgvs_genome and 'del' in hgvs_genome:
                    indel_alt = hgvs_genome[hgvs_genome.find('del') + 3:]
                    hgvs_id = "chr%s:g.%s_%sdel%s" % \
                              (chrom, chromStart, chromEnd, indel_alt)
            elif variation_type == 'Deletion':
                hgvs_id = "chr%s:g.%s_%sdel" % \
                          (chrom, chromStart, chromEnd)
            elif variation_type == 'Insertion':
                if hgvs_genome and 'ins' in hgvs_genome:
                    ins_ref = hgvs_genome[hgvs_genome.find('ins') + 3:]
                    hgvs_id = "chr%s:g.%s_%sins%s" % \
                              (chrom, chromStart, chromEnd, ins_ref)
            elif variation_type == 'Duplication':
                if hgvs_genome and 'dup' in hgvs_genome:
                    dup_ref = hgvs_genome[hgvs_genome.find('dup') + 3:]
                    hgvs_id = "chr%s:g.%s_%sdup%s" % \
                              (chrom, chromStart, chromEnd, dup_ref)
        elif is_copy_number:
            if hgvs_genome:
                pieces = hgvs_genome.split('.')
                hgvs_id = "chr" + pieces[1] + pieces[2]
        elif hgvs_coding:
            hgvs_id = hgvs_coding
            coding_hgvs_only = True
        else:
            print("couldn't find any id %s" % (rcv_accession,))
            return

        # loop through XRef to find the rsid and to collect every
        # cross-reference as a "DB:ID;" fragment
        xref_fragments = []
        rsid = None
        if measure.XRef:
            for xref in measure.XRef:
                if xref.Type == 'rs':
                    rsid = 'rs' + str(xref.ID)
                xref_fragments.append(xref.DB + ':' + xref.ID + ';')
        other_ids = ''.join(xref_fragments)

        # only Measures with a usable id become documents
        if hgvs_id:
            one_snp_json = {
                "_id": hgvs_id,
                "clinvar": {
                    "allele_id": allele_id,
                    "chrom": chrom,
                    "hg19": {
                        "start": chromStart,
                        "end": chromEnd
                    },
                    "type": variation_type,
                    "name": name,
                    "gene": {
                        "id": gene_id,
                        "symbol": symbol
                    },
                    "clinical_significance": clinical_significance,
                    "rsid": rsid,
                    "rcv_accession": rcv_accession,
                    "origin": origin,
                    "cytogenic": cytogenic,
                    "review_status": review_status,
                    "hgvs": HGVS,
                    "number_submitters": number_submitters,
                    "last_evaluated": str(last_evaluated),
                    "other_ids": other_ids,
                    "clinvar_id": clinvar_id,
                    "coding_hgvs_only": coding_hgvs_only,
                    "ref": ref,
                    "alt": alt
                }
            }
            obj = (dict_sweep(unlist(value_convert(one_snp_json)), [None]))
            yield obj
Example #18
0
def _map_line_to_json(fields, version='hg19'):
    """Convert one already-split dbNSFP row into a variant document.

    Columns are addressed purely by position; indices 0 through 111 are read.

    Args:
        fields: sequence of column strings for a single row.
        version: assembly whose coordinate is used to build ``_id``:
            ``'hg19'`` uses ``fields[8]``, ``'hg38'`` uses ``fields[1]``.

    Returns:
        The result of ``list_split(dict_sweep(unlist(value_convert(doc))), ";")``
        where ``doc`` has an HGVS-style ``"_id"`` and every annotation nested
        under ``"dbnsfp"``. ``"."`` values are swept out by ``dict_sweep``.

    Raises:
        ValueError: if ``version`` is neither ``'hg19'`` nor ``'hg38'``, or
            if ``fields[1]``/``fields[8]`` (or a non-"." ``fields[10]``) is
            not an integer string.
    """
    # Fail fast with a clear message; previously an unsupported version left
    # HGVS unbound and surfaced later as an UnboundLocalError.
    if version not in ('hg19', 'hg38'):
        raise ValueError(
            "version must be 'hg19' or 'hg38', got %r" % (version,))

    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'
    # fields[10] is the hg18 position; "." marks a missing value and is kept
    # as-is so that dict_sweep can drop it later.
    if fields[10] == ".":
        hg18_end = "."
    else:
        hg18_end = int(fields[10])
    # NOTE(review): a "." in fields[8] raises ValueError here - confirm the
    # caller never passes rows without an hg19 position.
    chromStart = int(fields[8])
    chromEnd = chromStart
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    position = chromStart if version == 'hg19' else chromStart_38
    HGVS = "chr%s:g.%d%s>%s" % (chrom, position, ref, alt)

    # fields[69] holds four ':'-separated values, mapped to a/c/g/t in order
    if fields[69] == ".":
        siphy = "."
    else:
        freq = fields[69].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # Pair accessions with positions. A list comprehension (rather than
    # nested map) yields a real list on Python 3 as well as Python 2.
    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": fields[8],
                "end": chromEnd
            },
            "hg18": {
                "start": fields[10],
                "end": hg18_end
            },
            "hg38": {
                "start": fields[1],
                "end": fields[1]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[111],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20]
            },
            "sift": {
                "score": fields[23],
                "converted_rankscore": fields[24],
                "pred": fields[25]
            },
            "polyphen2": {
                "hdiv": {
                    "score": fields[29],
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": fields[32],
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": fields[35],
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38]
            },
            "mutationtaster": {
                "score": fields[39],
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43]
            },
            "mutationassessor": {
                "score": fields[46],
                "rankscore": fields[47],
                "pred": fields[48]
            },
            "fathmm": {
                "score": fields[49],
                "rankscore": fields[50],
                "pred": fields[51]
            },
            "provean": {
                "score": fields[52],
                "rankscore": fields[53],
                "pred": fields[54]
            },
            "metasvm": {
                "score": fields[55],
                "rankscore": fields[56],
                "pred": fields[57]
            },
            "lr": {
                "score": fields[58],
                "rankscore": fields[59],
                "pred": fields[60]
            },
            "reliability_index": fields[61],
            "gerp++": {
                "nr": fields[62],
                "rs": fields[63],
                "rs_rankscore": fields[64]
            },
            "phylop_7way": {
                "vertebrate": fields[65],
                "vertebrate_rankscore": fields[66]
            },
            "phastcons_7way": {
                "vertebrate": fields[67],
                "vertebrate_rankscore": fields[68]
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[70],
                "logodds_rankscore": fields[71]
            },
            "1000gp1": {
                "ac": fields[72],
                "af": fields[73],
                "afr_ac": fields[74],
                "afr_af": fields[75],
                "eur_ac": fields[76],
                "eur_af": fields[77],
                "amr_ac": fields[78],
                "amr_af": fields[79],
                "eas_ac": fields[80],
                "eas_af": fields[81],
                "sas_ac": fields[82],
                "sas_af": fields[83]
            },
            "twinsuk": {
                "ac": fields[84],
                "af": fields[85]
            },
            "alspac": {
                "ac": fields[86],
                "af": fields[87]
            },
            "esp6500": {
                "aa_ac": fields[88],
                "aa_af": fields[89],
                "ea_ac": fields[90],
                "ea_af": fields[91]
            },
            "exac": {
                "ac": fields[92],
                "af": fields[93],
                "adj_ac": fields[94],
                "adj_af": fields[95],
                "afr_ac": fields[96],
                "afr_af": fields[97],
                "amr_ac": fields[98],
                "amr_af": fields[99],
                "eas_ac": fields[100],
                "eas_af": fields[101],
                "fin_ac": fields[102],
                "fin_af": fields[103],
                "nfe_ac": fields[104],
                "nfe_af": fields[105],
                "sas_ac": fields[106],
                "sas_af": fields[107]
            },
            "clinvar": {
                "rs": fields[108],
                "clinsig": fields[109],
                "trait": fields[110]
            }
        }
    }

    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    # value_convert may have changed the chromosome's type; store it as str
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
Example #19
0
def _map_line_to_json(fields):
    """Convert one already-split EVS row into a variant document.

    Columns are addressed by position; indices 0 through 30 are read.
    ``fields[0]`` and ``fields[30]`` are ``"<chrom>:<position>"`` strings and
    ``fields[3]`` is the allele change written as ``"<ref>><alt>"``.

    Args:
        fields: sequence of column strings for a single row.

    Returns:
        ``dict_sweep(value_convert(doc), vals=["NA", "none", "unknown"])``
        where ``doc`` carries the HGVS id in ``"_id"`` and the annotations
        under ``"evs"``; ``None`` when the row has no allele change or when
        ``get_hgvs_from_vcf`` returns ``None``.
    """
    chrInfo = fields[0].split(":")  # grch37
    chrom = chrInfo[0]
    chromStart = int(chrInfo[1])

    ma_fin_percent = fields[7].split("/")

    # Without an allele change no id can be built. Previously this case left
    # HGVS/ref/alt unbound and crashed with UnboundLocalError just below.
    if not fields[3]:
        return None

    mutation = fields[3].split(">")
    ref = mutation[0]
    alt = mutation[1]
    HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)
    hg19 = get_pos_start_end(chrom, chromStart, ref, alt)
    hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref, alt)

    # load as json data
    if HGVS is None:
        return None

    # Parse the polyphen column once; element 0 is stored as the class and
    # element 1 as the score.
    polyphen_parts = polyphen(fields[21])

    one_snp_json = {
        "_id": HGVS,
        "evs":
            {
                "chrom": chrom,
                "hg19":
                    {
                        "start": hg19[0],
                        "end": hg19[1]
                    },
                "hg38":
                    {
                        "start": hg38[0],
                        "end": hg38[1]
                    },
                "rsid": fields[1],
                "dbsnp_version": get_dbsnp(fields[2]),
                "ref": ref,
                "alt": alt,
                "allele_count":
                    {
                        "european_american": count_dict(fields[4]),
                        "african_american": count_dict(fields[5]),
                        "all": count_dict(fields[6])
                    },
                "ma_fin_percent":
                    {
                        "european_american": ma_fin_percent[0],
                        "african_american": ma_fin_percent[1],
                        "all": ma_fin_percent[2]
                    },
                "genotype_count":
                    {
                        "european_american": count_dict(fields[8]),
                        "african_american": count_dict(fields[9]),
                        "all_genotype": count_dict(fields[10])
                    },
                "avg_sample_read": fields[11],
                "gene":
                    {
                        "symbol": fields[12],
                        "accession": fields[13]
                    },
                "function_gvs": fields[14],
                "hgvs":
                    {
                        "coding": fields[16],
                        "protein": fields[15]
                    },
                "coding_dna_size": fields[17],
                "conservation":
                    {
                        "phast_cons": fields[18],
                        "gerp": fields[19]
                    },
                "grantham_score": fields[20],
                "polyphen2":
                    {
                        "class": polyphen_parts[0],
                        "score": polyphen_parts[1]
                    },
                "ref_base_ncbi": fields[22],
                "chimp_allele": fields[23],
                "clinical_info": fields[24],
                "filter_status": fields[25],
                "on_illumina_human_exome_chip": fields[26],
                "gwas_pubmed_info": fields[27],
                "estimated_age_kyrs":
                    {
                        "ea": fields[28],
                        "aa": fields[29]
                    }
            }
        }
    return dict_sweep(value_convert(one_snp_json), vals=["NA", "none", "unknown"])
 def annotate_by_snpeff(self, varobj_list):
     '''Annotate HGVS ids with snpEff and yield one document per VCF line.

     Each id in ``varobj_list`` is routed to the matching
     ``*_hgvs_id_parser`` / ``*_vcf_constructor`` pair on ``self`` and
     appended to an in-memory VCF. Ids whose constructor raises TypeError
     are printed and skipped; ids matching no supported form are printed
     followed by 'beyond current capacity'. The VCF text is piped to
     ``SNPEFF_CMD`` and every non-header, non-empty output line is parsed
     into a dict with "id" and "snpeff" (ann / lof / nmd / vcf), which is
     passed through ``unlist`` and ``dict_sweep`` before being yielded.

     The method asserts that the subprocess wrote nothing to stderr.
     '''
     def _route(hgvs):
         # (parser attribute, VCF builder attribute, remember the id?) for
         # a supported HGVS form, or None when nothing applies. Order of
         # the checks matters: plain 'ins' is tested before 'delins'.
         if '>' in hgvs:
             return ('snp_hgvs_id_parser', 'snp_vcf_constructer', True)
         if hgvs.endswith('del'):
             return ('del_hgvs_id_parser', 'del_vcf_constructor', True)
         if 'ins' in hgvs and 'del' not in hgvs:
             return ('ins_hgvs_id_parser', 'ins_vcf_constructor', True)
         if 'delins' in hgvs:
             return ('delins_hgvs_id_parser', 'delins_vcf_constructor', False)
         return None

     def _halves(text):
         # 'a/b' -> ('a', 'b'); an empty value -> (None, None)
         if not text:
             return (None, None)
         (first, second) = text.split('/')
         return (first, second)

     def _between_parens(text):
         # 'LOF=(PTEN|PTEN|1|1.00)' -> 'PTEN|PTEN|1|1.00'
         return text.split('(')[1].split(')')[0]

     def _gene_summary(inner):
         # four '|'-separated values -> the lof/nmd sub-document
         (gene_id, genename, transcript_count, affected_share) = inner.split('|')
         return {
             "gene_id": gene_id,
             "genename": genename,
             "number_of_transcripts_in_gene": transcript_count,
             "percent_of_transcripts_affected": affected_share
         }

     # VCF header line, followed by one record per convertible id
     vcf_stdin = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'
     # ids that were written to the VCF; not read again inside this method
     snpeff_valid_id = []
     for hgvs in varobj_list:
         route = _route(hgvs)
         if route is None:
             print(hgvs)
             print('beyond current capacity')
             continue
         (parser_name, builder_name, remember) = route
         hgvs_info = getattr(self, parser_name)(hgvs)
         try:
             vcf_stdin += getattr(self, builder_name)(hgvs_info)
         except TypeError:
             print(hgvs)
             continue
         if remember:
             snpeff_valid_id.append(hgvs)

     proc = subprocess.Popen(SNPEFF_CMD, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     (stdout, stderr) = proc.communicate(vcf_stdin)
     assert stderr == '', stderr

     for vcf_line in stdout.split('\n'):
         # skip header lines and blank lines
         if vcf_line.startswith('#') or vcf_line == '':
             continue
         info_parts = vcf_line.split(';')
         # the first ';'-separated chunk is assumed to carry 'ANN'
         ann_info = info_parts[0]
         ann = []
         # several ','-separated annotations may share one VCF line
         for entry in ann_info.split(','):
             cols = entry.split('|')
             if len(cols) <= 1:
                 continue
             (effect, putative_impact, gene_name, gene_id, feature_type, feature_id) = cols[1:7]
             (transcript_biotype, exon, hgvs_coding, hgvs_protein, cdna, cds, protein, distance_to_feature) = cols[7:15]
             print(effect)
             (cdna_position, cdna_len) = _halves(cdna)
             (cds_position, cds_len) = _halves(cds)
             (protein_position, protein_len) = _halves(protein)
             (rank, total) = _halves(exon)
             ann.append({
                 "effect": effect,
                 "putative_impact": putative_impact,
                 "genename": gene_name,
                 "gene_id": gene_id,
                 "feature_type": feature_type,
                 "feature_id": feature_id,
                 "transcript_biotype": transcript_biotype,
                 "rank": rank,
                 "total": total,
                 "hgvs.c": hgvs_coding,
                 "hgvs.p": hgvs_protein,
                 "cdna": {
                     "position": cdna_position,
                     "length": cdna_len
                 },
                 "cds": {
                     "position": cds_position,
                     "length": cds_len
                 },
                 "protein": {
                     "position": protein_position,
                     "length": protein_len
                 },
                 "distance_to_feature": distance_to_feature
             })
             print(ann)

         # lof / nmd are optional and default to None
         lof = None
         nmd = None
         if len(info_parts) == 3:
             # 'ann' + 'lof' + 'nmd', with 'lof' assumed to come first
             (lof_info, nmd_info) = info_parts[1:3]
             assert lof_info.startswith('LOF')
             lof_inner = _between_parens(lof_info)
             nmd_inner = _between_parens(nmd_info)
             lof = _gene_summary(lof_inner)
             nmd = _gene_summary(nmd_inner)
         elif len(info_parts) == 2:
             # 'ann' + exactly one of 'lof' / 'nmd'
             trailing = info_parts[1]
             if trailing.startswith('LOF'):
                 lof = _gene_summary(_between_parens(trailing))
             else:
                 nmd = _gene_summary(_between_parens(trailing))

         (chrom, pos, _id, ref, alt) = ann_info.split('\t')[0:5]
         hgvs_id = get_hgvs_from_vcf(chrom, pos, ref, alt)
         yield dict_sweep(unlist({
             "id": hgvs_id,
             "snpeff": {
                 "ann": ann,
                 "lof": lof,
                 "nmd": nmd,
                 "vcf": {
                     "position": pos,
                     "ref": ref,
                     "alt": alt
                 }
             }
         }), vals=['', None])
Example #21
0
def _split_scores(raw):
    """Split a ';'-joined score column into a list.

    When the column holds more than one value, '.' placeholders become None;
    a single-valued column is returned as-is (so a lone '.' stays '.').
    """
    parts = raw.split(';')
    if len(parts) > 1:
        parts = [None if part == '.' else part for part in parts]
    return parts


def _map_line_to_json(fields, version):
    """Convert one already-split dbNSFP row into a variant document.

    Columns are addressed purely by position; indices 0 through 182 are read.

    Args:
        fields: sequence of column strings for a single row.
        version: assembly whose coordinate is used to build ``_id``:
            ``'hg19'`` uses ``fields[8]``, ``'hg38'`` uses ``fields[1]``.

    Returns:
        ``None`` when the row has no hg19 position (``fields[8] == '.'``);
        otherwise the result of
        ``list_split(dict_sweep(unlist(value_convert(doc))), ";")`` where
        ``doc`` has an HGVS-style ``"_id"`` and every annotation nested under
        ``"dbnsfp"``.

    Raises:
        ValueError: if ``version`` is neither ``'hg19'`` nor ``'hg38'``, or
            if a position column that must be numeric is not.
    """
    # Fail fast with a clear message; previously an unsupported version left
    # HGVS unbound and surfaced later as an UnboundLocalError.
    if version not in ('hg19', 'hg38'):
        raise ValueError(
            "version must be 'hg19' or 'hg38', got %r" % (version,))

    # specific variable treatment
    chrom = fields[0]
    if chrom == 'M':
        chrom = 'MT'
    # fields[10] is the hg18 position; "." marks a missing value and is kept
    # as-is so that dict_sweep can drop it later.
    if fields[10] == ".":
        hg18_end = "."
    else:
        hg18_end = int(fields[10])
    # in case of no hg19 position provided, remove the item
    if fields[8] == '.':
        return None
    chromStart = int(fields[8])
    chromEnd = chromStart
    chromStart_38 = int(fields[1])
    ref = fields[2].upper()
    alt = fields[3].upper()
    position = chromStart if version == 'hg19' else chromStart_38
    HGVS = "chr%s:g.%d%s>%s" % (chrom, position, ref, alt)

    # fields[105] holds four ':'-separated values, mapped to a/c/g/t in order
    if fields[105] == ".":
        siphy = "."
    else:
        freq = fields[105].split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # Pair up the parallel '|'- and ';'-separated columns. List
    # comprehensions (rather than nested map) yield real lists on Python 3
    # as well as Python 2.
    gtex_gene = fields[181].split('|')
    gtex_tissue = fields[182].split('|')
    gtex = [{'gene': g, 'tissue': t} for g, t in zip(gtex_gene, gtex_tissue)]
    acc = fields[26].rstrip().rstrip(';').split(";")
    pos = fields[28].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    # Multi-valued score columns: '.' placeholders become None so that
    # positions inside the list are preserved.
    provean_score = _split_scores(fields[52])
    sift_score = _split_scores(fields[23])
    hdiv_score = _split_scores(fields[29])
    hvar_score = _split_scores(fields[32])
    lrt_score = _split_scores(fields[35])
    dann_score = _split_scores(fields[69])
    mutationtaster_score = _split_scores(fields[39])
    mutationassessor_score = _split_scores(fields[46])
    vest3_score = _split_scores(fields[57])
    metasvm_score = _split_scores(fields[59])
    fathmm_score = _split_scores(fields[49])
    lr_score = _split_scores(fields[62])
    fathmm_coding_score = _split_scores(fields[71])
    integrated_fitcons_score = _split_scores(fields[82])
    gm12878_fitcons_score = _split_scores(fields[85])
    h1_hesc_fitcons_score = _split_scores(fields[88])
    huvec_fitcons_score = _split_scores(fields[91])

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": fields[10],
                "end": hg18_end
            },
            "hg38": {
                "start": fields[1],
                "end": fields[1]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": fields[4],
                "alt": fields[5],
                "pos": fields[22],
                "refcodon": fields[13],
                "codonpos": fields[14],
                "codon_degeneracy": fields[15]
            },
            "genename": fields[11],
            "uniprot": uniprot,
            "interpro_domain": fields[180],
            "cds_strand": fields[12],
            "ancestral_allele": fields[16],
            # columns 17 (altaineandertal) and 18 (denisova) are not exported
            "ensembl": {
                "geneid": fields[19],
                "transcriptid": fields[20],
                "proteinid": fields[21]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": fields[24],
                "pred": fields[25]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": fields[30],
                    "pred": fields[31]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": fields[33],
                    "pred": fields[34]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": fields[36],
                "pred": fields[37],
                "omega": fields[38]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": fields[40],
                "pred": fields[41],
                "model": fields[42],
                "AAE": fields[43]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": fields[47],
                "pred": fields[48]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": fields[50],
                "pred": fields[51]
            },
            "provean": {
                "score": provean_score,
                "rankscore": fields[53],
                "pred": fields[54]
            },
            "vest3": {
                "score": vest3_score,
                # The rank score sits in the column right after the score
                # (58); the mapping used to read column 57 twice and never
                # touched 58.
                "rankscore": fields[58],
                "transcriptid": fields[55],
                "transcriptvar": fields[56]
            },
            "fathmm-mkl": {
                "coding_score": fathmm_coding_score,
                "coding_rankscore": fields[72],
                "coding_pred": fields[73],
                "coding_group": fields[74]
            },
            "eigen": {
                "raw": fields[75],
                "phred": fields[76],
                "raw_rankscore": fields[77]
            },
            "eigen-pc": {
                "raw": fields[78],
                "raw_rankscore": fields[79]
            },
            "genocanyon": {
                "score": fields[80],
                "rankscore": fields[81]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": fields[60],
                "pred": fields[61]
            },
            "metalr": {
                "score": lr_score,
                "rankscore": fields[63],
                "pred": fields[64]
            },
            "reliability_index": fields[65],
            "dann": {
                "score": dann_score,
                "rankscore": fields[70]
            },
            "gerp++": {
                "nr": fields[94],
                "rs": fields[95],
                "rs_rankscore": fields[96]
            },
            "integrated": {
                "fitcons_score": integrated_fitcons_score,
                "fitcons_rankscore": fields[83],
                "confidence_value": fields[84]
            },
            "gm12878": {
                "fitcons_score": gm12878_fitcons_score,
                "fitcons_rankscore": fields[86],
                "confidence_value": fields[87]
            },
            "h1-hesc": {
                "fitcons_score": h1_hesc_fitcons_score,
                "fitcons_rankscore": fields[89],
                "confidence_value": fields[90]
            },
            "huvec": {
                "fitcons_score": huvec_fitcons_score,
                "fitcons_rankscore": fields[92],
                "confidence_value": fields[93]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": fields[97],
                    "vertebrate_rankscore": fields[98]
                },
                "p20way": {
                    "mammalian": fields[99],
                    "mammalian_rankscore": fields[100]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": fields[101],
                    "vertebrate_rankscore": fields[102]
                },
                "20way": {
                    "mammalian": fields[103],
                    "mammalian_rankscore": fields[104]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": fields[106],
                "logodds_rankscore": fields[107]
            },
            "1000gp3": {
                "ac": fields[108],
                "af": fields[109],
                "afr_ac": fields[110],
                "afr_af": fields[111],
                "eur_ac": fields[112],
                "eur_af": fields[113],
                "amr_ac": fields[114],
                "amr_af": fields[115],
                "eas_ac": fields[116],
                "eas_af": fields[117],
                "sas_ac": fields[118],
                "sas_af": fields[119]
            },
            "twinsuk": {
                "ac": fields[120],
                "af": fields[121]
            },
            "alspac": {
                "ac": fields[122],
                "af": fields[123]
            },
            "esp6500": {
                "aa_ac": fields[124],
                "aa_af": fields[125],
                "ea_ac": fields[126],
                "ea_af": fields[127]
            },
            "exac": {
                "ac": fields[128],
                "af": fields[129],
                "adj_ac": fields[130],
                "adj_af": fields[131],
                "afr_ac": fields[132],
                "afr_af": fields[133],
                "amr_ac": fields[134],
                "amr_af": fields[135],
                "eas_ac": fields[136],
                "eas_af": fields[137],
                "fin_ac": fields[138],
                "fin_af": fields[139],
                "nfe_ac": fields[140],
                "nfe_af": fields[141],
                "sas_ac": fields[142],
                "sas_af": fields[143]
            },
            "exac_nontcga": {
                "ac": fields[144],
                "af": fields[145],
                "adj_ac": fields[146],
                "adj_af": fields[147],
                "afr_ac": fields[148],
                "afr_af": fields[149],
                "amr_ac": fields[150],
                "amr_af": fields[151],
                "eas_ac": fields[152],
                "eas_af": fields[153],
                "fin_ac": fields[154],
                "fin_af": fields[155],
                "nfe_ac": fields[156],
                "nfe_af": fields[157],
                "sas_ac": fields[158],
                "sas_af": fields[159]
            },
            "exac_nonpsych": {
                "ac": fields[160],
                "af": fields[161],
                "adj_ac": fields[162],
                "adj_af": fields[163],
                "afr_ac": fields[164],
                "afr_af": fields[165],
                "amr_ac": fields[166],
                "amr_af": fields[167],
                "eas_ac": fields[168],
                "eas_af": fields[169],
                "fin_ac": fields[170],
                "fin_af": fields[171],
                "nfe_ac": fields[172],
                "nfe_af": fields[173],
                # NOTE(review): columns 174-175 were previously skipped. By
                # analogy with the two 16-column ExAC groups above they are
                # presumably the SAS count/frequency - confirm against the
                # dbNSFP readme for the release being parsed.
                "sas_ac": fields[174],
                "sas_af": fields[175]
            },
            "clinvar": {
                "rs": fields[176],
                "clinsig": fields[177],
                "trait": fields[178],
                "golden_stars": fields[179]
            },
            "gtex": gtex
        }
    }

    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    # value_convert may have changed the chromosome's type; store it as str
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
Example #22
0
def _map_line_to_json(df, version, index=0):
    """Build the nested ``{"_id": ..., "dbnsfp": {...}}`` document for one row.

    Args:
        df: Row object read with ``df[column_name]``; the values are used
            as strings (``.split``, ``.upper``, ``int(...)`` are applied).
        version: Genome assembly used to build ``_id``; must be ``'hg19'``
            or ``'hg38'``.
        index: Unused by this variant; kept so the signature matches the
            other ``_map_line_to_json`` implementations.

    Returns:
        ``None`` when the row has no hg19 position (``"pos(1-coor)"`` is
        ``"."``); otherwise the document after it has been passed through
        ``value_convert_to_number``, ``unlist``, ``dict_sweep`` and
        ``list_split``.

    Raises:
        ValueError: If ``version`` is neither ``'hg19'`` nor ``'hg38'``, or
            if a position column cannot be converted with ``int``.
    """
    # specific variable treatment
    chrom = df["#chr"]
    if chrom == 'M':
        chrom = 'MT'

    # hg18 end is an int when present; the "." placeholder is kept verbatim.
    hg18_pos = df["hg18_pos(1-coor)"]
    hg18_end = hg18_pos if hg18_pos == "." else int(hg18_pos)

    # in case of no hg19 position provided, remove the item
    if df["pos(1-coor)"] == '.':
        return None
    chromStart = int(df["pos(1-coor)"])
    chromEnd = chromStart

    # NOTE(review): the hg38 position is converted even when version is
    # 'hg19', so a non-numeric hg38 value raises here -- confirm intended.
    chromStart_38 = int(df["hg38_pos"])
    ref = df["ref"].upper()
    alt = df["alt"].upper()
    if version == 'hg19':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    elif version == 'hg38':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on HGVS; fail early with a clear message.
        raise ValueError(
            "unsupported version %r (expected 'hg19' or 'hg38')" % (version,))

    siphy_29way_pi = df["SiPhy_29way_pi"]
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # Pair accessions with positions; a trailing ';' is dropped first.
    acc = df["Uniprot_acc"].rstrip().rstrip(';').split(";")
    pos = df["Uniprot_aapos"].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    def split_scores(column):
        # ';'-separated scores, with the '.' placeholder normalised to None.
        return [None if item == '.' else item
                for item in df[column].split(';')]

    provean_score = split_scores("PROVEAN_score")
    sift_score = split_scores("SIFT_score")
    hdiv_score = split_scores("Polyphen2_HDIV_score")
    hvar_score = split_scores("Polyphen2_HVAR_score")
    lrt_score = split_scores("LRT_score")
    m_cap_score = split_scores("M-CAP_score")
    mutationtaster_score = split_scores("MutationTaster_score")
    mutationassessor_score = split_scores("MutationAssessor_score")
    vest3_score = split_scores("VEST3_score")
    metasvm_score = split_scores("MetaSVM_score")
    fathmm_score = split_scores("FATHMM_score")
    metalr_score = split_scores("MetaLR_score")
    revel_score = split_scores("REVEL_score")

    # parse mutpred top 5 features
    def modify_pvalue(pvalue):
        # strip() removes any leading/trailing 'P', ' ' and '=' characters.
        return float(pvalue.strip('P = '))

    mutpred_top5 = df["MutPred_Top5features"]
    if mutpred_top5 not in ['.', ',', '-']:
        # Each ';'-separated entry is split on " (" after dropping a trailing
        # ")", and the pieces are flattened into one list that is read as
        # alternating mechanism / p-value items.
        flat = [
            part
            for feature in mutpred_top5.split(";")
            for part in feature.rstrip(")").split(" (")
        ]
        # At most five pairs are kept (same as the former hard-coded
        # indexing), but rows listing fewer than five features no longer
        # raise IndexError.
        mechanisms = [{
            "mechanism": name,
            "p_val": modify_pvalue(p_text)
        } for name, p_text in zip(flat[0:10:2], flat[1:10:2])]
    else:
        mechanisms = '.'

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": df["rs_dbSNP147"],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": df["hg18_pos(1-coor)"],
                "end": hg18_end
            },
            "hg38": {
                "start": df["hg38_pos"],
                "end": df["hg38_pos"]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": df["aaref"],
                "alt": df["aaalt"],
                "pos": df["aapos"],
                "refcodon": df["refcodon"],
                "codonpos": df["codonpos"]
            },
            "genename": df["genename"],
            "uniprot": uniprot,
            "interpro_domain": df["Interpro_domain"],
            "cds_strand": df["cds_strand"],
            "ancestral_allele": df["Ancestral_allele"],
            "ensembl": {
                "geneid": df["Ensembl_geneid"],
                "transcriptid": df["Ensembl_transcriptid"]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": df["SIFT_converted_rankscore"],
                "pred": df["SIFT_pred"]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": df["Polyphen2_HDIV_rankscore"],
                    "pred": df["Polyphen2_HDIV_pred"]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": df["Polyphen2_HVAR_rankscore"],
                    "pred": df["Polyphen2_HVAR_pred"]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": df["LRT_converted_rankscore"],
                "pred": df["LRT_pred"],
                "omega": df["LRT_Omega"]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore":
                df["MutationTaster_converted_rankscore"],
                "pred": df["MutationTaster_pred"]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": df["MutationAssessor_rankscore"],
                "pred": df["MutationAssessor_pred"]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": df["FATHMM_rankscore"],
                "pred": df["FATHMM_pred"]
            },
            "provean": {
                "score": provean_score,
                "rankscore": df["PROVEAN_converted_rankscore"],
                "pred": df["PROVEAN_pred"]
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": df["VEST3_rankscore"]
            },
            "eigen": {
                "coding_or_noncoding": df["Eigen_coding_or_noncoding"],
                "raw": df["Eigen-raw"],
                "phred": df["Eigen-phred"]
            },
            "eigen-pc": {
                "raw": df["Eigen-PC-raw"],
                "phred": df["Eigen-PC-phred"],
                "raw_rankscore": df["Eigen-PC-raw_rankscore"]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": df["MetaSVM_rankscore"],
                "pred": df["MetaSVM_pred"]
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": df["MetaLR_rankscore"],
                "pred": df["MetaLR_pred"]
            },
            "reliability_index": df["Reliability_index"],
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": df["M-CAP_rankscore"],
                "pred": df["M-CAP_pred"]
            },
            "revel": {
                "score": revel_score,
                "rankscore": df["REVEL_rankscore"]
            },
            "mutpred": {
                "score": df["MutPred_score"],
                "rankscore": df["MutPred_rankscore"],
                "accession": df["MutPred_protID"],
                "aa_change": df["MutPred_AAchange"],
                "pred": mechanisms
            },
            "gerp++": {
                "nr": df["GERP++_NR"],
                "rs": df["GERP++_RS"],
                "rs_rankscore": df["GERP++_RS_rankscore"]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": df["phyloP100way_vertebrate"],
                    "vertebrate_rankscore":
                    df["phyloP100way_vertebrate_rankscore"]
                },
                "p46way": {
                    "placental": df["phyloP46way_placental"],
                    "placental_rankscore":
                    df["phyloP46way_placental_rankscore"],
                    "primate": df["phyloP46way_primate"],
                    "primate_rankscore": df["phyloP46way_primate_rankscore"]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate":
                    df["phastCons100way_vertebrate"],
                    "vertebrate_rankscore":
                    df["phastCons100way_vertebrate_rankscore"]
                },
                "46way": {
                    "placental": df["phastCons46way_placental"],
                    "placental_rankscore":
                    df["phastCons46way_placental_rankscore"],
                    "primate": df["phastCons46way_primate"],
                    "primate_rankscore": df["phastCons46way_primate_rankscore"]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": df["SiPhy_29way_logOdds"],
                "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"]
            },
            "1000gp1": {
                "ac": df["1000Gp1_AC"],
                "af": df["1000Gp1_AF"],
                "afr_ac": df["1000Gp1_AFR_AC"],
                "afr_af": df["1000Gp1_AFR_AF"],
                "eur_ac": df["1000Gp1_EUR_AC"],
                "eur_af": df["1000Gp1_EUR_AF"],
                "amr_ac": df["1000Gp1_AMR_AC"],
                "amr_af": df["1000Gp1_AMR_AF"],
                "asn_ac": df["1000Gp1_ASN_AC"],
                "asn_af": df["1000Gp1_ASN_AF"]
            },
            "esp6500": {
                "aa_af": df["ESP6500_AA_AF"],
                # NOTE(review): this column name ends with a space; kept
                # as-is -- confirm it matches the input file header.
                "ea_af": df["ESP6500_EA_AF "]
            },
            "exac": {
                "ac": df["ExAC_AC"],
                "af": df["ExAC_AF"],
                "adj_ac": df["ExAC_Adj_AC"],
                "adj_af": df["ExAC_Adj_AF"],
                "afr_ac": df["ExAC_AFR_AC"],
                "afr_af": df["ExAC_AFR_AF"],
                "amr_ac": df["ExAC_AMR_AC"],
                "amr_af": df["ExAC_AMR_AF"],
                "eas_ac": df["ExAC_EAS_AC"],
                "eas_af": df["ExAC_EAS_AF"],
                "fin_ac": df["ExAC_FIN_AC"],
                "fin_af": df["ExAC_FIN_AF"],
                "nfe_ac": df["ExAC_NFE_AC"],
                "nfe_af": df["ExAC_NFE_AF"],
                "sas_ac": df["ExAC_SAS_AC"],
                "sas_af": df["ExAC_SAS_AF"]
            },
            "aric5606": {
                "aa_ac": df["ARIC5606_AA_AC"],
                "aa_af": df["ARIC5606_AA_AF"],
                "ea_ac": df["ARIC5606_EA_AC"],
                "ea_af": df["ARIC5606_EA_AF"]
            },
            "clinvar": {
                "rs": df["clinvar_rs"],
                # '|'-separated fields; '.' placeholders are dropped before
                # the numeric ones are converted with int().
                "clinsig": [
                    int(i) for i in df["clinvar_clnsig"].split("|")
                    if i != "."
                ],
                "trait":
                [i for i in df["clinvar_trait"].split("|") if i != "."],
                "golden_stars": [
                    int(i) for i in df["clinvar_golden_stars"].split("|")
                    if i != "."
                ]
            }
        }
    }

    one_snp_json = list_split(
        dict_sweep(unlist(value_convert_to_number(one_snp_json)),
                   vals=[".", None]), ";")
    # chrom is forced back to a string after the conversion step above.
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
Example #23
0
def _map_line_to_json(df, version, index):
    """Build the nested ``{"_id": ..., "dbnsfp": {...}}`` document for one row.

    Args:
        df: Table read cell-by-cell with ``df.at[index, column_name]``
            (presumably a pandas DataFrame -- the original code used the
            ``DataFrame.get_value`` accessor). Cell values are used as
            strings (``.split``, ``.upper``, ``int(...)`` are applied).
        version: Genome assembly used to build ``_id``; must be ``'hg19'``
            or ``'hg38'``.
        index: Row label of the row to convert.

    Returns:
        ``None`` when the row has no hg19 position
        (``"hg19_pos(1-based)"`` is ``"."``); otherwise the document after
        it has been passed through ``value_convert``, ``unlist``,
        ``dict_sweep`` and ``list_split``.

    Raises:
        ValueError: If ``version`` is neither ``'hg19'`` nor ``'hg38'``, or
            if a position column cannot be converted with ``int``.
    """

    def cell(column):
        # ``DataFrame.get_value`` was removed in pandas 1.0; ``.at`` is the
        # label-based scalar accessor that replaces it.
        return df.at[index, column]

    def split_scores(column):
        # ';'-separated scores. '.' placeholders are left untouched here;
        # the final dict_sweep call below is given vals=["."].
        return cell(column).split(';')

    # specific variable treatment
    chrom = cell("#chr")
    if chrom == 'M':
        chrom = 'MT'

    # hg18 end is an int when present; the "." placeholder is kept verbatim.
    hg18_pos = cell("hg18_pos(1-based)")
    hg18_end = hg18_pos if hg18_pos == "." else int(hg18_pos)

    # in case of no hg19 position provided, remove the item
    if cell("hg19_pos(1-based)") == '.':
        return None
    chromStart = int(cell("hg19_pos(1-based)"))
    chromEnd = chromStart

    # "pos(1-based)" is the column used for the hg38 coordinate.
    chromStart_38 = int(cell("pos(1-based)"))
    ref = cell("ref").upper()
    alt = cell("alt").upper()
    if version == 'hg19':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    elif version == 'hg38':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on HGVS; fail early with a clear message.
        raise ValueError(
            "unsupported version %r (expected 'hg19' or 'hg38')" % (version,))

    siphy_29way_pi = cell("SiPhy_29way_pi")
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # Built as real lists: on Python 3 a bare map() is a one-shot iterator,
    # which would end up in the document instead of its items.
    gtex_gene = cell("GTEx_V6_gene").split('|')
    gtex_tissue = cell("GTEx_V6_tissue").split('|')
    gtex = [{'gene': gene, 'tissue': tissue}
            for gene, tissue in zip(gtex_gene, gtex_tissue)]

    # Pair accessions with positions; a trailing ';' is dropped first.
    acc = cell("Uniprot_acc_Polyphen2").rstrip().rstrip(';').split(";")
    pos = cell("Uniprot_aapos_Polyphen2").rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    provean_score = split_scores("PROVEAN_score")
    sift_score = split_scores("SIFT_score")
    hdiv_score = split_scores("Polyphen2_HDIV_score")
    hvar_score = split_scores("Polyphen2_HVAR_score")
    lrt_score = split_scores("LRT_score")
    m_cap_score = split_scores("M-CAP_score")
    mutationtaster_score = split_scores("MutationTaster_score")
    mutationassessor_score = split_scores("MutationAssessor_score")
    vest3_score = split_scores("VEST3_score")
    metasvm_score = split_scores("MetaSVM_score")
    fathmm_score = split_scores("FATHMM_score")
    metalr_score = split_scores("MetaLR_score")

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": cell("rs_dbSNP147"),
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": cell("hg18_pos(1-based)"),
                "end": hg18_end
            },
            "hg38": {
                "start": cell("pos(1-based)"),
                "end": cell("pos(1-based)")
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": cell("aaref"),
                "alt": cell("aaalt"),
                "pos": cell("aapos"),
                "refcodon": cell("refcodon"),
                "codonpos": cell("codonpos"),
                "codon_degeneracy": cell("codon_degeneracy"),
            },
            "genename": cell("genename"),
            "uniprot": uniprot,
            "interpro_domain": cell("Interpro_domain"),
            "cds_strand": cell("cds_strand"),
            "ancestral_allele": cell("Ancestral_allele"),
            "ensembl": {
                "geneid": cell("Ensembl_geneid"),
                "transcriptid": cell("Ensembl_transcriptid"),
                "proteinid": cell("Ensembl_proteinid")
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": cell("SIFT_converted_rankscore"),
                "pred": cell("SIFT_pred")
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": cell("Polyphen2_HDIV_rankscore"),
                    "pred": cell("Polyphen2_HDIV_pred")
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": cell("Polyphen2_HVAR_rankscore"),
                    "pred": cell("Polyphen2_HVAR_pred")
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": cell("LRT_converted_rankscore"),
                "pred": cell("LRT_pred"),
                "omega": cell("LRT_Omega")
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore":
                cell("MutationTaster_converted_rankscore"),
                "pred": cell("MutationTaster_pred"),
                "model": cell("MutationTaster_model"),
                "AAE": cell("MutationTaster_AAE")
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": cell("MutationAssessor_score_rankscore"),
                "pred": cell("MutationAssessor_pred")
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": cell("FATHMM_converted_rankscore"),
                "pred": cell("FATHMM_pred")
            },
            "provean": {
                "score": provean_score,
                "rankscore": cell("PROVEAN_converted_rankscore"),
                "pred": cell("PROVEAN_pred")
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": cell("VEST3_rankscore"),
                "transcriptid": cell("Transcript_id_VEST3"),
                "transcriptvar": cell("Transcript_var_VEST3")
            },
            "fathmm-mkl": {
                "coding_score": cell("fathmm-MKL_coding_score"),
                "coding_rankscore": cell("fathmm-MKL_coding_rankscore"),
                "coding_pred": cell("fathmm-MKL_coding_pred"),
                "coding_group": cell("fathmm-MKL_coding_group")
            },
            "eigen": {
                "coding_or_noncoding": cell("Eigen_coding_or_noncoding"),
                "raw": cell("Eigen-raw"),
                "phred": cell("Eigen-phred")
            },
            "eigen-pc": {
                "raw": cell("Eigen-PC-raw"),
                "phred": cell("Eigen-PC-phred"),
                "raw_rankscore": cell("Eigen-PC-raw_rankscore")
            },
            "genocanyon": {
                "score": cell("GenoCanyon_score"),
                "rankscore": cell("GenoCanyon_score_rankscore")
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": cell("MetaSVM_rankscore"),
                "pred": cell("MetaSVM_pred")
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": cell("MetaLR_rankscore"),
                "pred": cell("MetaLR_pred")
            },
            "reliability_index": cell("Reliability_index"),
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": cell("M-CAP_rankscore"),
                "pred": cell("M-CAP_pred")
            },
            "dann": {
                "score": cell("DANN_score"),
                "rankscore": cell("DANN_rankscore")
            },
            "gerp++": {
                "nr": cell("GERP++_NR"),
                "rs": cell("GERP++_RS"),
                "rs_rankscore": cell("GERP++_RS_rankscore")
            },
            "integrated": {
                "fitcons_score": cell("integrated_fitCons_score"),
                "fitcons_rankscore":
                cell("integrated_fitCons_score_rankscore"),
                "confidence_value": cell("integrated_confidence_value")
            },
            "gm12878": {
                "fitcons_score": cell("GM12878_fitCons_score"),
                "fitcons_rankscore": cell("GM12878_fitCons_score_rankscore"),
                "confidence_value": cell("GM12878_confidence_value")
            },
            "h1-hesc": {
                "fitcons_score": cell("H1-hESC_fitCons_score"),
                "fitcons_rankscore": cell("H1-hESC_fitCons_score_rankscore"),
                "confidence_value": cell("H1-hESC_confidence_value")
            },
            "huvec": {
                "fitcons_score": cell("HUVEC_fitCons_score"),
                "fitcons_rankscore": cell("HUVEC_fitCons_score_rankscore"),
                "confidence_value": cell("HUVEC_confidence_value")
            },
            "phylo": {
                "p100way": {
                    "vertebrate": cell("phyloP100way_vertebrate"),
                    "vertebrate_rankscore":
                    cell("phyloP100way_vertebrate_rankscore")
                },
                "p20way": {
                    "mammalian": cell("phyloP20way_mammalian"),
                    "mammalian_rankscore":
                    cell("phyloP20way_mammalian_rankscore")
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": cell("phastCons100way_vertebrate"),
                    "vertebrate_rankscore":
                    cell("phastCons100way_vertebrate_rankscore")
                },
                "20way": {
                    "mammalian": cell("phastCons20way_mammalian"),
                    "mammalian_rankscore":
                    cell("phastCons20way_mammalian_rankscore")
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": cell("SiPhy_29way_logOdds"),
                "logodds_rankscore": cell("SiPhy_29way_logOdds_rankscore")
            },
            "1000gp3": {
                "ac": cell("1000Gp3_AC"),
                "af": cell("1000Gp3_AF"),
                "afr_ac": cell("1000Gp3_AFR_AC"),
                "afr_af": cell("1000Gp3_AFR_AF"),
                "eur_ac": cell("1000Gp3_EUR_AC"),
                "eur_af": cell("1000Gp3_EUR_AF"),
                "amr_ac": cell("1000Gp3_AMR_AC"),
                "amr_af": cell("1000Gp3_AMR_AF"),
                "eas_ac": cell("1000Gp3_EAS_AC"),
                "eas_af": cell("1000Gp3_EAS_AF"),
                "sas_ac": cell("1000Gp3_SAS_AC"),
                "sas_af": cell("1000Gp3_SAS_AF")
            },
            "twinsuk": {
                "ac": cell("TWINSUK_AC"),
                "af": cell("TWINSUK_AF")
            },
            "alspac": {
                "ac": cell("ALSPAC_AC"),
                "af": cell("ALSPAC_AF")
            },
            "esp6500": {
                "aa_ac": cell("ESP6500_AA_AC"),
                "aa_af": cell("ESP6500_AA_AF"),
                "ea_ac": cell("ESP6500_EA_AC"),
                "ea_af": cell("ESP6500_EA_AF")
            },
            "exac": {
                "ac": cell("ExAC_AC"),
                "af": cell("ExAC_AF"),
                "adj_ac": cell("ExAC_Adj_AC"),
                "adj_af": cell("ExAC_Adj_AF"),
                "afr_ac": cell("ExAC_AFR_AC"),
                "afr_af": cell("ExAC_AFR_AF"),
                "amr_ac": cell("ExAC_AMR_AC"),
                "amr_af": cell("ExAC_AMR_AF"),
                "eas_ac": cell("ExAC_EAS_AC"),
                "eas_af": cell("ExAC_EAS_AF"),
                "fin_ac": cell("ExAC_FIN_AC"),
                "fin_af": cell("ExAC_FIN_AF"),
                "nfe_ac": cell("ExAC_NFE_AC"),
                "nfe_af": cell("ExAC_NFE_AF"),
                "sas_ac": cell("ExAC_SAS_AC"),
                "sas_af": cell("ExAC_SAS_AF")
            },
            "exac_nontcga": {
                "ac": cell("ExAC_nonTCGA_AC"),
                "af": cell("ExAC_nonTCGA_AF"),
                "adj_ac": cell("ExAC_nonTCGA_Adj_AC"),
                "adj_af": cell("ExAC_nonTCGA_Adj_AF"),
                "afr_ac": cell("ExAC_nonTCGA_AFR_AC"),
                "afr_af": cell("ExAC_nonTCGA_AFR_AF"),
                "amr_ac": cell("ExAC_nonTCGA_AMR_AC"),
                "amr_af": cell("ExAC_nonTCGA_AMR_AF"),
                "eas_ac": cell("ExAC_nonTCGA_EAS_AC"),
                "eas_af": cell("ExAC_nonTCGA_EAS_AF"),
                "fin_ac": cell("ExAC_nonTCGA_FIN_AC"),
                "fin_af": cell("ExAC_nonTCGA_FIN_AF"),
                "nfe_ac": cell("ExAC_nonTCGA_NFE_AC"),
                "nfe_af": cell("ExAC_nonTCGA_NFE_AF"),
                "sas_ac": cell("ExAC_nonTCGA_SAS_AC"),
                "sas_af": cell("ExAC_nonTCGA_SAS_AF")
            },
            "exac_nonpsych": {
                "ac": cell("ExAC_nonpsych_AC"),
                "af": cell("ExAC_nonpsych_AF"),
                "adj_ac": cell("ExAC_nonpsych_Adj_AC"),
                "adj_af": cell("ExAC_nonpsych_Adj_AF"),
                "afr_ac": cell("ExAC_nonpsych_AFR_AC"),
                "afr_af": cell("ExAC_nonpsych_AFR_AF"),
                "amr_ac": cell("ExAC_nonpsych_AMR_AC"),
                "amr_af": cell("ExAC_nonpsych_AMR_AF"),
                "eas_ac": cell("ExAC_nonpsych_EAS_AC"),
                "eas_af": cell("ExAC_nonpsych_EAS_AF"),
                "fin_ac": cell("ExAC_nonpsych_FIN_AC"),
                "fin_af": cell("ExAC_nonpsych_FIN_AF"),
                "nfe_ac": cell("ExAC_nonpsych_NFE_AC"),
                "nfe_af": cell("ExAC_nonpsych_NFE_AF"),
                "sas_ac": cell("ExAC_nonpsych_SAS_AC"),
                "sas_af": cell("ExAC_nonpsych_SAS_AF")
            },
            "clinvar": {
                "rs": cell("clinvar_rs"),
                "clinsig": cell("clinvar_clnsig"),
                "trait": cell("clinvar_trait"),
                "golden_stars": cell("clinvar_golden_stars")
            },
            "gtex": gtex
        }
    }

    one_snp_json = list_split(
        dict_sweep(unlist(value_convert(one_snp_json)), vals=["."]), ";")
    # chrom is forced back to a string after the conversion step above.
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
Example #24
0
 def annotate_by_snpeff(self, varobj_list):
     '''Annotate HGVS ids with snpEff and yield one document per result line.

     Every id in ``varobj_list`` is converted to a VCF record by the parser /
     constructor pair on ``self`` that matches its notation (substitution,
     deletion, insertion or delins). The records are piped to ``SNPEFF_CMD``
     and each non-header, non-empty line of its output is parsed into::

         {"id": <hgvs id>,
          "snpeff": {"ann": [...], "lof": ..., "nmd": ...,
                     "vcf": {"position": ..., "ref": ..., "alt": ...}}}

     which is passed through ``unlist`` and
     ``dict_sweep(..., vals=['', None])`` before being yielded.

     Ids with an unsupported notation, or whose VCF record cannot be built
     (``TypeError``), are printed and skipped.

     Raises ``AssertionError`` (carrying the stderr text) when snpEff wrote
     anything to stderr.
     '''
     def split_ratio(text):
         # 'a/b' -> ('a', 'b'); an empty field -> (None, None)
         if not text:
             return (None, None)
         (first, second) = text.split('/')
         return (first, second)

     def parse_gene_summary(info):
         # the information to be parsed is like this: 'LOF=(PTEN|PTEN|1|1.00)'
         body = info.split('(')[1].split(')')[0]
         (gene_id, genename, n_transcripts, pct_affected) = body.split('|')
         return {
             "gene_id": gene_id,
             "genename": genename,
             "number_of_transcripts_in_gene": n_transcripts,
             "percent_of_transcripts_affected": pct_affected
         }

     # title of vcf
     vcf_stdin = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'
     # extract each item from list, transform into vcf format
     for item in varobj_list:
         # choose the parser / constructor pair from the notation of the id;
         # the order of these tests matters ('delins' ids also contain 'ins')
         if '>' in item:
             parser = self.snp_hgvs_id_parser
             constructor = self.snp_vcf_constructer
         elif item.endswith('del'):
             parser = self.del_hgvs_id_parser
             constructor = self.del_vcf_constructor
         elif 'ins' in item and 'del' not in item:
             parser = self.ins_hgvs_id_parser
             constructor = self.ins_vcf_constructor
         elif 'delins' in item:
             parser = self.delins_hgvs_id_parser
             constructor = self.delins_vcf_constructor
         else:
             print(item)
             print('beyond current capacity')
             continue
         hgvs_info = parser(item)
         try:
             # string concatenation is kept inside the try on purpose: a
             # constructor that returns None surfaces here as a TypeError
             vcf_stdin += constructor(hgvs_info)
         except TypeError:
             print(item)
             continue

     # NOTE(review): the pipes exchange plain str objects; on Python 3
     # Popen pipes carry bytes unless text mode is requested -- confirm the
     # interpreter this is meant to run on.
     proc = subprocess.Popen(SNPEFF_CMD,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
     (stdout, stderr) = proc.communicate(vcf_stdin)
     assert stderr == '', stderr

     for vcf_line in stdout.split('\n'):
         # skip header lines and blank lines
         if vcf_line.startswith('#') or vcf_line == '':
             continue
         info_fields = vcf_line.split(';')
         # assume the first item is 'ANN'
         ann_info = info_fields[0]
         ann = []
         # Multiple annotations per VCF line
         for entry in ann_info.split(','):
             parts = entry.split('|')
             if len(parts) <= 1:
                 continue
             (effect, putative_impact, gene_name, gene_id,
              feature_type, feature_id) = parts[1:7]
             (transcript_biotype, exon, hgvs_coding, hgvs_protein,
              cdna, cds, protein, distance_to_feature) = parts[7:15]
             (cdna_position, cdna_len) = split_ratio(cdna)
             (cds_position, cds_len) = split_ratio(cds)
             (protein_position, protein_len) = split_ratio(protein)
             (rank, total) = split_ratio(exon)
             ann.append({
                 "effect": effect,
                 "putative_impact": putative_impact,
                 "genename": gene_name,
                 "gene_id": gene_id,
                 "feature_type": feature_type,
                 "feature_id": feature_id,
                 "transcript_biotype": transcript_biotype,
                 "rank": rank,
                 "total": total,
                 "hgvs.c": hgvs_coding,
                 "hgvs.p": hgvs_protein,
                 "cdna": {
                     "position": cdna_position,
                     "length": cdna_len
                 },
                 "cds": {
                     "position": cds_position,
                     "length": cds_len
                 },
                 "protein": {
                     "position": protein_position,
                     "length": protein_len
                 },
                 "distance_to_feature": distance_to_feature
             })
         # not all annotations include lof & nmd information. Set them to 'None' as default
         lof = None
         nmd = None
         if len(info_fields) == 3:
             # the case that annotation include 'ann' & 'lof' & 'nmd'
             (lof_info, nmd_info) = info_fields[1:3]
             # assume the second item is 'lof'
             assert lof_info.startswith('LOF')
             lof = parse_gene_summary(lof_info)
             nmd = parse_gene_summary(nmd_info)
         elif len(info_fields) == 2:
             # the case that annotation include 'ann' & 'lof or nmd'
             extra_info = info_fields[1]
             if extra_info.startswith('LOF'):
                 lof = parse_gene_summary(extra_info)
             else:
                 nmd = parse_gene_summary(extra_info)
         # the leading tab-separated columns of the line carry the VCF record
         (chrom, pos, _id, ref, alt) = ann_info.split('\t')[0:5]
         hgvs_id = get_hgvs_from_vcf(chrom, pos, ref, alt)
         one_snp_json = {
             "id": hgvs_id,
             "snpeff": {
                 "ann": ann,
                 "lof": lof,
                 "nmd": nmd,
                 "vcf": {
                     "position": pos,
                     "ref": ref,
                     "alt": alt
                 }
             }
         }
         snpeff_json = dict_sweep(unlist(one_snp_json), vals=['', None])
         yield snpeff_json
Example #25
0
def _map_line_to_json(fields):
    """Turn one CADD row into a ``{"_id": ..., "cadd": {...}}`` document.

    ``fields`` must contain exactly VALID_COLUMN_NO columns (asserted). The
    document id is get_hgvs_from_vcf(chrom, pos, ref, alt) built from
    columns 0, 1, 2 and 4; when that is None nothing is yielded. Otherwise
    exactly one document is yielded, after value_convert, unlist and
    dict_sweep(..., ["NA"]) have been applied to it.
    """
    assert len(fields) == VALID_COLUMN_NO
    hgvs_id = get_hgvs_from_vcf(fields[0], fields[1], fields[2], fields[4])

    # load as json data
    if hgvs_id is None:
        return

    # nested groups, assembled below in their original key order
    mapability = {'20bp': fields[15], '35bp': fields[16]}
    phast_cons = {
        'primate': fields[18],
        'mammalian': fields[19],
        'vertebrate': fields[20],
    }
    phylop = {
        'primate': fields[21],
        'mammalian': fields[22],
        'vertebrate': fields[23],
    }
    gerp = {
        'n': fields[24],
        's': fields[25],
        'rs': fields[26],
        'rs_pval': fields[27],
    }
    dna = {
        'helt': fields[30],
        'mgw': fields[31],
        'prot': fields[32],
        'roll': fields[33],
    }
    mirsvr = {'score': fields[34], 'e': fields[35], 'aln': fields[36]}
    # NOTE(review): 'enh' reads fields[44] and fields[45] is never used --
    # confirm the column mapping against the CADD header.
    chmm = {
        'tssa': fields[39],
        'tssaflnk': fields[40],
        'txflnk': fields[41],
        'tx': fields[42],
        'txwk': fields[43],
        'enh': fields[44],
        'znfrpts': fields[46],
        'het': fields[47],
        'tssbiv': fields[48],
        'bivflnk': fields[49],
        'enhbiv': fields[50],
        'reprpc': fields[51],
        'reprpcwk': fields[52],
        'quies': fields[53],
    }
    encode_p_val = {
        'comb': fields[60],
        'dnas': fields[61],
        'faire': fields[62],
        'polii': fields[63],
        'ctcf': fields[64],
        'mycp': fields[65],
    }
    encode_sig = {
        'dnase': fields[66],
        'faire': fields[67],
        'polii': fields[68],
        'ctcf': fields[69],
        'myc': fields[70],
    }
    encode = {
        'exp': fields[54],
        'h3k27ac': fields[55],
        'h3k4me1': fields[56],
        'h3k4me3': fields[57],
        'nucleo': fields[58],
        'occ': fields[59],
        'p_val': encode_p_val,
        'sig': encode_sig,
    }
    motif = {
        'toverlap': fields[72],
        'dist': fields[73],
        'ecount': fields[74],
        'ename': fields[75],
        'ehipos': fields[76],
        'escorechng': fields[77],
    }
    tf = {
        'bs': fields[78],
        'bs_peaks': fields[79],
        'bs_peaks_max': fields[80],
    }
    esp = {'af': fields[82], 'afr': fields[83], 'eur': fields[84]}
    thousand_genomes = {
        'af': fields[85],
        'asn': fields[86],
        'amr': fields[87],
        'afr': fields[88],
        'eur': fields[89],
    }
    gene = {
        'gene_id': fields[92],
        'feature_id': fields[93],
        'ccds_id': fields[94],
        'genename': fields[95],
        'cds': {
            'cdna_pos': fields[96],
            'rel_cdna_pos': fields[97],
            'cds_pos': fields[98],
            'rel_cds_pos': fields[99],
        },
        'prot': {
            'protpos': fields[100],
            'rel_prot_pos': fields[101],
            'domain': fields[102],
        },
    }
    polyphen = {'cat': fields[110], 'val': fields[111]}
    sift = {'cat': fields[112], 'val': fields[113]}

    cadd = {
        'chrom': fields[0],
        'pos': fields[1],
        'ref': fields[2],
        'anc': fields[3],
        'alt': fields[4],
        'type': fields[5],
        'length': fields[6],
        'istv': fields[7],
        'isderived': fields[8],
        'annotype': fields[9],
        'consequence': fields[10],
        'consscore': fields[11],
        'consdetail': fields[12],
        'gc': fields[13],
        'cpg': fields[14],
        'mapability': mapability,
        'scoresegdup': fields[17],
        'phast_cons': phast_cons,
        'phylop': phylop,
        'gerp': gerp,
        'bstatistic': fields[28],
        'mutindex': fields[29],
        'dna': dna,
        'mirsvr': mirsvr,
        'targetscans': fields[37],
        'fitcons': fields[38],
        'chmm': chmm,
        'encode': encode,
        'segway': fields[71],
        'motif': motif,
        'tf': tf,
        'isknownvariant': fields[81],
        'esp': esp,
        '1000g': thousand_genomes,
        'min_dist_tss': fields[90],
        'min_dist_tse': fields[91],
        'gene': gene,
        'dst2splice': fields[103],
        'dst2spltype': fields[104],
        'exon': fields[105],
        'intron': fields[106],
        'oaa': fields[107],   # ref aa
        'naa': fields[108],   # alt aa
        'grantham': fields[109],
        'polyphen': polyphen,
        'sift': sift,
        'rawscore': fields[114],    # raw CADD score
        'phred': fields[115],       # log-percentile of raw CADD score
    }

    document = {"_id": hgvs_id, "cadd": cadd}
    yield dict_sweep(unlist(value_convert(document)), ["NA"])
def _map_line_to_json(df, version, index=0):
    """Build the ``{"_id": ..., "dbnsfp": {...}}`` document for one dbNSFP row.

    Parameters:
        df: mapping-like row indexed by dbNSFP column name.
        version: 'hg19' or 'hg38'; selects which position is used in "_id".
        index: accepted for interface compatibility; not used here.

    Returns None when the row has no hg19 position ("pos(1-coor)" == '.').
    Otherwise returns the document after value_convert_to_number, unlist,
    dict_sweep(vals=[".", None]) and list_split(";") have been applied, with
    "chrom" forced back to a string.

    Raises ValueError for any other ``version`` (previously this surfaced
    later as an UnboundLocalError on HGVS).
    """
    # specific variable treatment
    chrom = df["#chr"]
    if chrom == 'M':
        chrom = 'MT'
    # hg18 end position: kept as "." when missing, otherwise an int
    hg18_end = df["hg18_pos(1-coor)"]
    if hg18_end != ".":
        hg18_end = int(hg18_end)
    # in case of no hg19 position provided, remove the item
    if df["pos(1-coor)"] == '.':
        return None
    chromStart = int(df["pos(1-coor)"])
    chromEnd = chromStart
    chromStart_38 = int(df["hg38_pos"])
    ref = df["ref"].upper()
    alt = df["alt"].upper()
    if version == 'hg19':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    elif version == 'hg38':
        HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    else:
        raise ValueError(
            "unsupported version %r (expected 'hg19' or 'hg38')" % (version,))

    siphy_29way_pi = df["SiPhy_29way_pi"]
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # pair up the ';'-separated accession and position lists
    acc = df["Uniprot_acc"].rstrip().rstrip(';').split(";")
    pos = df["Uniprot_aapos"].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    def split_scores(column):
        # ';'-separated score list with the '.' placeholder mapped to None
        return [None if item == '.' else item
                for item in df[column].split(';')]

    provean_score = split_scores("PROVEAN_score")
    sift_score = split_scores("SIFT_score")
    hdiv_score = split_scores("Polyphen2_HDIV_score")
    hvar_score = split_scores("Polyphen2_HVAR_score")
    lrt_score = split_scores("LRT_score")
    m_cap_score = split_scores("M-CAP_score")
    mutationtaster_score = split_scores("MutationTaster_score")
    mutationassessor_score = split_scores("MutationAssessor_score")
    vest3_score = split_scores("VEST3_score")
    metasvm_score = split_scores("MetaSVM_score")
    fathmm_score = split_scores("FATHMM_score")
    metalr_score = split_scores("MetaLR_score")
    revel_score = split_scores("REVEL_score")

    # parse mutpred top 5 features
    def modify_pvalue(pvalue):
        # strips any leading/trailing 'P', ' ' and '=' characters
        return float(pvalue.strip('P = '))

    mutpred_raw = df["MutPred_Top5features"]
    if mutpred_raw not in ['.', ',', '-']:
        # each feature is split on " (" after dropping its trailing ")";
        # the pieces are flattened into [mechanism, p-value, mechanism, ...]
        flat = []
        for feature in mutpred_raw.split(";"):
            flat.extend(feature.rstrip(")").split(" ("))
        # at most five (mechanism, p-value) pairs; rows carrying fewer
        # features yield fewer entries instead of raising IndexError
        mechanisms = [
            {"mechanism": mechanism, "p_val": modify_pvalue(p_value)}
            for mechanism, p_value in zip(flat[0:10:2], flat[1:10:2])
        ]
    else:
        mechanisms = '.'

    # The lookup below originally used the key with a trailing space
    # ("ESP6500_EA_AF "); try the clean column name first and keep the
    # original key as a fallback so either header spelling works.
    try:
        esp6500_ea_af = df["ESP6500_EA_AF"]
    except KeyError:
        esp6500_ea_af = df["ESP6500_EA_AF "]

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": df["rs_dbSNP147"],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": df["hg18_pos(1-coor)"],
                "end": hg18_end
            },
            "hg38": {
                "start": df["hg38_pos"],
                "end": df["hg38_pos"]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": df["aaref"],
                "alt": df["aaalt"],
                "pos": df["aapos"],
                "refcodon": df["refcodon"],
                "codonpos": df["codonpos"]
            },
            "genename": df["genename"],
            "uniprot": uniprot,
            "interpro_domain": df["Interpro_domain"],
            "cds_strand": df["cds_strand"],
            "ancestral_allele": df["Ancestral_allele"],
            "ensembl": {
                "geneid": df["Ensembl_geneid"],
                "transcriptid": df["Ensembl_transcriptid"]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": df["SIFT_converted_rankscore"],
                "pred": df["SIFT_pred"]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": df["Polyphen2_HDIV_rankscore"],
                    "pred": df["Polyphen2_HDIV_pred"]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": df["Polyphen2_HVAR_rankscore"],
                    "pred": df["Polyphen2_HVAR_pred"]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": df["LRT_converted_rankscore"],
                "pred": df["LRT_pred"],
                "omega": df["LRT_Omega"]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": df["MutationTaster_converted_rankscore"],
                "pred": df["MutationTaster_pred"]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": df["MutationAssessor_rankscore"],
                "pred": df["MutationAssessor_pred"]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": df["FATHMM_rankscore"],
                "pred": df["FATHMM_pred"]
            },
            "provean": {
                "score": provean_score,
                "rankscore": df["PROVEAN_converted_rankscore"],
                "pred": df["PROVEAN_pred"]
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": df["VEST3_rankscore"]
            },
            "eigen": {
                "coding_or_noncoding": df["Eigen_coding_or_noncoding"],
                "raw": df["Eigen-raw"],
                "phred": df["Eigen-phred"]
            },
            "eigen-pc": {
                "raw": df["Eigen-PC-raw"],
                "phred": df["Eigen-PC-phred"],
                "raw_rankscore": df["Eigen-PC-raw_rankscore"]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": df["MetaSVM_rankscore"],
                "pred": df["MetaSVM_pred"]
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": df["MetaLR_rankscore"],
                "pred": df["MetaLR_pred"]
            },
            "reliability_index": df["Reliability_index"],
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": df["M-CAP_rankscore"],
                "pred": df["M-CAP_pred"]
            },
            "revel": {
                "score": revel_score,
                "rankscore": df["REVEL_rankscore"]
            },
            "mutpred": {
                "score": df["MutPred_score"],
                "rankscore": df["MutPred_rankscore"],
                "accession": df["MutPred_protID"],
                "aa_change": df["MutPred_AAchange"],
                "pred": mechanisms
            },
            "gerp++": {
                "nr": df["GERP++_NR"],
                "rs": df["GERP++_RS"],
                "rs_rankscore": df["GERP++_RS_rankscore"]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": df["phyloP100way_vertebrate"],
                    "vertebrate_rankscore": df["phyloP100way_vertebrate_rankscore"]
                },
                "p46way": {
                    "placental": df["phyloP46way_placental"],
                    "placental_rankscore": df["phyloP46way_placental_rankscore"],
                    "primate": df["phyloP46way_primate"],
                    "primate_rankscore": df["phyloP46way_primate_rankscore"]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": df["phastCons100way_vertebrate"],
                    "vertebrate_rankscore": df["phastCons100way_vertebrate_rankscore"]
                },
                "46way": {
                    "placental": df["phastCons46way_placental"],
                    "placental_rankscore": df["phastCons46way_placental_rankscore"],
                    "primate": df["phastCons46way_primate"],
                    "primate_rankscore": df["phastCons46way_primate_rankscore"]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": df["SiPhy_29way_logOdds"],
                "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"]
            },
            "1000gp1": {
                "ac": df["1000Gp1_AC"],
                "af": df["1000Gp1_AF"],
                "afr_ac": df["1000Gp1_AFR_AC"],
                "afr_af": df["1000Gp1_AFR_AF"],
                "eur_ac": df["1000Gp1_EUR_AC"],
                "eur_af": df["1000Gp1_EUR_AF"],
                "amr_ac": df["1000Gp1_AMR_AC"],
                "amr_af": df["1000Gp1_AMR_AF"],
                "asn_ac": df["1000Gp1_ASN_AC"],
                "asn_af": df["1000Gp1_ASN_AF"]
            },
            "esp6500": {
                "aa_af": df["ESP6500_AA_AF"],
                "ea_af": esp6500_ea_af
            },
            "exac": {
                "ac": df["ExAC_AC"],
                "af": df["ExAC_AF"],
                "adj_ac": df["ExAC_Adj_AC"],
                "adj_af": df["ExAC_Adj_AF"],
                "afr_ac": df["ExAC_AFR_AC"],
                "afr_af": df["ExAC_AFR_AF"],
                "amr_ac": df["ExAC_AMR_AC"],
                "amr_af": df["ExAC_AMR_AF"],
                "eas_ac": df["ExAC_EAS_AC"],
                "eas_af": df["ExAC_EAS_AF"],
                "fin_ac": df["ExAC_FIN_AC"],
                "fin_af": df["ExAC_FIN_AF"],
                "nfe_ac": df["ExAC_NFE_AC"],
                "nfe_af": df["ExAC_NFE_AF"],
                "sas_ac": df["ExAC_SAS_AC"],
                "sas_af": df["ExAC_SAS_AF"]
            },
            "aric5606": {
                "aa_ac": df["ARIC5606_AA_AC"],
                "aa_af": df["ARIC5606_AA_AF"],
                "ea_ac": df["ARIC5606_EA_AC"],
                "ea_af": df["ARIC5606_EA_AF"]
            },
            "clinvar": {
                "rs": df["clinvar_rs"],
                "clinsig": [int(i) for i in df["clinvar_clnsig"].split("|") if i != "."],
                "trait": [i for i in df["clinvar_trait"].split("|") if i != "."],
                "golden_stars": [int(i) for i in df["clinvar_golden_stars"].split("|") if i != "."]
            }
        }
    }

    one_snp_json = list_split(dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=[".", None]), ";")
    # number conversion may have turned a numeric chromosome name into a
    # number; the document keeps it as a string
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
Example #27
0
def _map_line_to_json(fields):
    """Build the ``{"_id": ..., "cosmic": {...}}`` document for one COSMIC row.

    ``fields`` must contain exactly VALID_COLUMN_NO columns (asserted).
    The genome position in fields[17] is split into chromosome, start and
    end; the CDS change in fields[13] decides which HGVS form is used for
    "_id" (substitution, insertion, deletion or delins). When none of the
    patterns match, a diagnostic line is printed and fields[12] is used as
    the id instead.

    Returns the document after value_convert and dict_sweep(vals=[""]).
    """
    assert len(fields) == VALID_COLUMN_NO
    chr_info = re.findall(r"[\w']+", fields[17])
    chrom = chr_info[0]  # Mutation GRCh37 genome position
    chromStart = chr_info[1]
    chromEnd = chr_info[2]

    HGVS = None
    cds = fields[13]
    sub = re.search(r'[ATCGMNHKRY]+>[ATCGMNHKRY]+', cds)
    ins = re.search(r'ins[ATCGMN]+|ins[0-9]+', cds)
    delete = cds.find('del') != -1
    del_ins = re.search(r'[0-9]+>[ATCGMN]+', cds)
    comp = re.search(r'[ATCGMN]+', cds)

    # first matching pattern wins, in this order
    if sub:
        HGVS = "chr%s:g.%s%s" % (chrom, chromStart, sub.group())
    elif ins:
        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, ins.group())
    elif delete:
        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
    elif del_ins:
        HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, comp.group())
    else:
        HGVS = fields[12]
        # single pre-formatted argument: emits the same text as the former
        # Python 2 print statement and is valid syntax on Python 2 and 3
        print("Error2: %s %s %s" % (fields[15], cds, fields[17]))

    # load as json data
    if HGVS is None:
        return

    one_snp_json = {
        "sorter": fields[17] + fields[13],
        "_id": HGVS,
        "cosmic":
            {
                "gene":
                    {
                        "symbol": fields[0],  # Gene name
                        "id": fields[3],  # HGNC ID
                        "cds_length": fields[2]
                    },
                "transcript": fields[1],  # Accession Number
                "sample":
                    {
                        "name": fields[4],  # Sample name
                        "id": fields[5]  # ID_sample
                    },
                "tumour":
                    {
                        "id": fields[6],  # ID_tumour
                        "primary_site": fields[7],  # Primary site
                        "site_subtype": fields[8],  # Site subtype
                        "primary_histology": fields[9],  # Primary histology
                        "histology_subtype": fields[10],  # Histology subtype
                        # NOTE(review): reads fields[1], the same column as
                        # "transcript" above -- confirm the intended column
                        "origin": fields[1]
                    },
                "mutation":
                    {
                        "id": "COSM" + fields[12],  # Mutation ID
                        "cds": cds,  # Mutation CDS
                        "aa": fields[14],  # Mutation AA
                        "description": fields[15],  # Mutation Description
                        "zygosity": fields[16],  # Mutation zygosity
                        "somatic_status": fields[21]  # Mutation somatic status
                    },
                "chrom": chrom,
                "hg19":
                    {
                        "start": chromStart,
                        "end": chromEnd
                    },
                "pubmed": fields[22]  # Pubmed_PMID
            }
        }
    return dict_sweep(value_convert(one_snp_json), vals=[""])
Example #28
0
def _map_line_to_json(fields):
    """Build the ``{"_id": ..., "cadd": {...}}`` document for one CADD row.

    Rows whose column count differs from VALID_COLUMN_NO are ignored and
    None is returned. Otherwise "_id" is the substitution HGVS string built
    from columns 0 (chrom), 1 (position), 2 and 4 (alleles), and the
    document is returned after value_convert, unlist and dict_sweep.

    The sweep value is passed as the list ["NA"], matching the other
    callers in this file; the bare string "NA" used previously is not a
    list of values. NOTE(review): if dict_sweep tests membership with
    ``in``, a bare string would also match 'A', 'N' and '' -- confirm
    against dict_sweep.
    """
    if len(fields) != VALID_COLUMN_NO:
        return None

    chrom = fields[0]
    chromStart = fields[1]
    allele1 = fields[2]
    allele2 = fields[4]
    HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, allele1, allele2)

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "cadd": {
            'chrom': fields[0],
            'pos': fields[1],
            'ref': fields[2],
            'anc': fields[3],
            'alt': fields[4],
            'type': fields[5],
            'length': fields[6],
            'istv': fields[7],
            'isderived': fields[8],
            'annotype': fields[9],
            'consequence': fields[10],
            'consscore': fields[11],
            'consdetail': fields[12],
            'gc': fields[13],
            'cpg': fields[14],
            'mapability': {
                '20bp': fields[15],
                '35bp': fields[16]
            },
            'scoresegdup': fields[17],
            'phast_cons': {
                'primate': fields[18],
                'mammalian': fields[19],
                'vertebrate': fields[20]
            },
            'phylop': {
                'primate': fields[21],
                'mammalian': fields[22],
                'vertebrate': fields[23]
            },
            'gerp': {
                'n': fields[24],
                's': fields[25],
                'rs': fields[26],
                'rs_pval': fields[27]
            },
            'bstatistic': fields[28],
            'encode': {
                'exp': fields[29],
                'h3k27ac': fields[30],
                'h3k4me1': fields[31],
                'h3k4me3': fields[32],
                'nucleo': fields[33],
                'occ': fields[34],
                'p_val': {
                    'comb': fields[35],
                    'dnas': fields[36],
                    'faire': fields[37],
                    'polii': fields[38],
                    'ctcf': fields[39],
                    'mycp': fields[40]
                },
                'sig': {
                    'dnase': fields[41],
                    'faire': fields[42],
                    'polii': fields[43],
                    'ctcf': fields[44],
                    'myc': fields[45]
                },
            },
            'segway': fields[46],
            'motif': {
                'toverlap': fields[47],
                'dist': fields[48],
                'ecount': fields[49],
                'ename': fields[50],
                'ehipos': fields[51],
                'escorechng': fields[52]
            },
            'tf': {
                'bs': fields[53],
                'bs_peaks': fields[54],
                'bs_peaks_max': fields[55]
            },
            'isknownvariant': fields[56],
            'esp': {
                'af': fields[57],
                'afr': fields[58],
                'eur': fields[59]
            },
            '1000g': {
                'af': fields[60],
                'asn': fields[61],
                'amr': fields[62],
                'afr': fields[63],
                'eur': fields[64]
            },
            'min_dist_tss': fields[65],
            'min_dist_tse': fields[66],
            'gene': {
                'gene_id': fields[67],
                'feature_id': fields[68],
                'ccds_id': fields[69],
                'genename': fields[70],
                'cds': {
                    'cdna_pos': fields[71],
                    'rel_cdna_pos': fields[72],
                    'cds_pos': fields[73],
                    'rel_cds_pos': fields[74]
                },
                'prot': {
                    'protpos': fields[75],
                    'rel_prot_pos': fields[76],
                    'oaa': fields[81],
                    'naa': fields[82]
                },
                'dst_2_splice': fields[77],
                'dst_2_spltype': fields[78],
                'exon': fields[79],
                'intron': fields[80]
            },
            'grantham': fields[83],
            'polyphen': {
                'cat': fields[84],
                'val': fields[85]
            },
            'sift': {
                'cat': fields[86],
                'val': fields[87]
            },
            'rawscore': fields[88],
            'phred': fields[89]
        }
    }
    return dict_sweep(unlist(value_convert(one_snp_json)), ["NA"])
Example #29
0
def _map_line_to_json(fields):
    """Build the ``clinvar`` document for one row of column values.

    The genomic ``_id`` is assembled from the chromosome/start/end columns
    (``fields[13]``..``fields[15]``) and the REF/ALT of the record returned
    by the module-level ``vcf_reader`` for that position. The variant form
    (substitution, ins, del, delins, dup, inv, ...) is picked from the
    coding-HGVS column ``fields[18]`` and the type column ``fields[1]``.

    :param fields: sequence of exactly ``VALID_COLUMN_NO`` column values;
        several of them are used with ``str`` methods, so strings are expected.
    :return: the swept document ``{"_id": ..., "clinvar": {...}}``, or
        ``None`` when there is no start position, no VCF record at that
        position, or no HGVS form could be determined.

    Diagnostics for rows that cannot be mapped are printed to stdout.
    """
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[13]
    chromStart = fields[14]
    chromEnd = fields[15]

    HGVS = None
    # Keep only the part after the first ':' of the coding HGVS. A value with
    # no ':' yields an empty change instead of raising IndexError.
    cds_parts = fields[18].split(":")
    cds = cds_parts[1] if len(cds_parts) > 1 else ""

    replace = re.findall(r'[ATCGMNYR=]+', cds)
    sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', cds)
    ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', cds)
    delete = fields[1] == 'deletion'
    indel = fields[1] == 'indel'
    dup = re.search(r'dup', cds)
    inv = re.search(r'inv|inv[0-9]+|inv[ATCGMNHYR]+', cds)
    if ins:
        # An explicit "ins" in the coding change overrides the declared type.
        delete = None
        indel = None

    # Look up REF/ALT in the VCF file by chromosome and start position.
    if not chromStart:
        return None
    record = vcf_reader.fetch(chrom, int(chromStart))
    if not record:
        return None

    REF = record.REF
    ALT = record.ALT[0]  # only the first alternate allele is used
    if record.is_snp and len(ALT) < 2:
        mod = [REF, ALT]
    else:
        mod = ALT

    if sub and record.is_snp:
        HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, mod[0], mod[1])
    elif ins:
        HGVS = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, mod)
    elif delete:
        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
    elif indel:
        try:
            HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, mod)
        except AttributeError:
            # Single-argument print: same output on Python 2 and Python 3.
            print("ERROR: %s %s" % (fields[1], cds))
    elif dup:
        HGVS = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, mod)
    elif inv:
        HGVS = "chr%s:g.%s_%sinv%s" % (chrom, chromStart, chromEnd, mod)
    elif replace:
        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, mod)
    else:
        print("ERROR: %s %s" % (fields[1], cds))

    # load as json data
    if HGVS is None:
        print("None: %s %s" % (fields[1], cds))
        return None

    clinvar = {
        "allele_id": fields[0],
        "hg19":
            {
                "chr": fields[13],
                "start": fields[14],
                "end": fields[15]
            },
        "type": fields[1],
        "name": fields[2],
        "gene":
            {
                "id": fields[3],
                "symbol": fields[4]
            },
        "clinical_significance": fields[5].split(";"),
        "nsv_dbvar": fields[7],
        "rcv_accession": fields[8].split(";"),
        "tested_in_gtr": fields[9],
        "phenotype_id": other_id(fields[10]),
        "origin": fields[11],
        "cytogenic": fields[16],
        "review_status": fields[17],
        "hgvs":
            {
                "coding": fields[18],
                "protein": fields[19]
            },
        "number_submitters": fields[20],
        "last_evaluated": fields[21],
        "guidelines": fields[22],
        "other_ids": other_id(fields[23]),
        "clinvar_id": fields[24]
    }

    # Only emit an rsid for a real identifier. Prefixing a missing-value
    # placeholder would create "rs-1"/"rs-", which the "-" sweep below cannot
    # match.
    # NOTE(review): ClinVar's variant_summary README documents RS# as -1 when
    # absent -- confirm fields[6] is that column.
    rsid = str(fields[6])
    if rsid not in ("-1", "-", ""):
        clinvar["rsid"] = 'rs' + rsid

    one_snp_json = {
        "_id": HGVS,
        "clinvar": clinvar
    }
    return dict_sweep(unlist(value_convert(one_snp_json)), vals=["-"])
def _map_line_to_json(fields):
    """Build the ``clinvar`` document for one row of column values.

    The genomic ``_id`` is assembled from the chromosome/start/end columns
    (``fields[13]``..``fields[15]``) and the REF/ALT of the record returned
    by the module-level ``vcf_reader`` for that position. The variant form
    (substitution, ins, del, delins, dup, inv, ...) is picked from the
    coding-HGVS column ``fields[18]`` and the type column ``fields[1]``.

    :param fields: sequence of exactly ``VALID_COLUMN_NO`` column values;
        several of them are used with ``str`` methods, so strings are expected.
    :return: the swept document ``{"_id": ..., "clinvar": {...}}``, or
        ``None`` when there is no start position, no VCF record at that
        position, or no HGVS form could be determined.

    Diagnostics for rows that cannot be mapped are printed to stdout.
    """
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[13]
    chromStart = fields[14]
    chromEnd = fields[15]

    HGVS = None
    # Keep only the part after the first ':' of the coding HGVS. A value with
    # no ':' yields an empty change instead of raising IndexError.
    cds_parts = fields[18].split(":")
    cds = cds_parts[1] if len(cds_parts) > 1 else ""

    replace = re.findall(r'[ATCGMNYR=]+', cds)
    sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', cds)
    ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', cds)
    delete = fields[1] == 'deletion'
    indel = fields[1] == 'indel'
    dup = re.search(r'dup', cds)
    inv = re.search(r'inv|inv[0-9]+|inv[ATCGMNHYR]+', cds)
    if ins:
        # An explicit "ins" in the coding change overrides the declared type.
        delete = None
        indel = None

    # Look up REF/ALT in the VCF file by chromosome and start position.
    if not chromStart:
        return None
    record = vcf_reader.fetch(chrom, int(chromStart))
    if not record:
        return None

    REF = record.REF
    ALT = record.ALT[0]  # only the first alternate allele is used
    if record.is_snp and len(ALT) < 2:
        mod = [REF, ALT]
    else:
        mod = ALT

    if sub and record.is_snp:
        HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, mod[0], mod[1])
    elif ins:
        HGVS = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, mod)
    elif delete:
        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
    elif indel:
        try:
            HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, mod)
        except AttributeError:
            # Single-argument print: same output on Python 2 and Python 3.
            print("ERROR: %s %s" % (fields[1], cds))
    elif dup:
        HGVS = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, mod)
    elif inv:
        HGVS = "chr%s:g.%s_%sinv%s" % (chrom, chromStart, chromEnd, mod)
    elif replace:
        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, mod)
    else:
        print("ERROR: %s %s" % (fields[1], cds))

    # load as json data
    if HGVS is None:
        print("None: %s %s" % (fields[1], cds))
        return None

    clinvar = {
        "allele_id": fields[0],
        "hg19": {
            "chr": fields[13],
            "start": fields[14],
            "end": fields[15]
        },
        "type": fields[1],
        "name": fields[2],
        "gene": {
            "id": fields[3],
            "symbol": fields[4]
        },
        "clinical_significance": fields[5].split(";"),
        "nsv_dbvar": fields[7],
        "rcv_accession": fields[8].split(";"),
        "tested_in_gtr": fields[9],
        "phenotype_id": other_id(fields[10]),
        "origin": fields[11],
        "cytogenic": fields[16],
        "review_status": fields[17],
        "hgvs": {
            "coding": fields[18],
            "protein": fields[19]
        },
        "number_submitters": fields[20],
        "last_evaluated": fields[21],
        "guidelines": fields[22],
        "other_ids": other_id(fields[23]),
        "clinvar_id": fields[24]
    }

    # Only emit an rsid for a real identifier. Prefixing a missing-value
    # placeholder would create "rs-1"/"rs-", which the "-" sweep below cannot
    # match.
    # NOTE(review): ClinVar's variant_summary README documents RS# as -1 when
    # absent -- confirm fields[6] is that column.
    rsid = str(fields[6])
    if rsid not in ("-1", "-", ""):
        clinvar["rsid"] = 'rs' + rsid

    one_snp_json = {
        "_id": HGVS,
        "clinvar": clinvar
    }
    return dict_sweep(unlist(value_convert(one_snp_json)), vals=["-"])