def load_data(data_folder):
    """Yield one PheWAS document per rsID found in ``phewas-catalog.csv``.

    Catalog rows are grouped by their ``snp`` value, the rsIDs are resolved in
    one call to ``batch_query_hgvs_from_rsid`` and, for every rsID that maps to
    a truthy id, a document ``{"_id": <id>, "phewas": {...}}`` is yielded.
    rsIDs without a mapping are silently skipped.

    :param data_folder: directory that contains ``phewas-catalog.csv``
    :raises AssertionError: if the input file does not exist, or a row's
        ``snp`` value does not match ``rs<digits>``
    :raises ValueError: if a row's ``p-value`` cannot be converted to float
    """
    input_file = os.path.join(data_folder, "phewas-catalog.csv")
    assert os.path.exists(input_file), "Can't find input file '%s'" % input_file
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    rsid_pattern = re.compile(r"^rs\d+$")
    results = defaultdict(list)
    with open_anyfile(input_file) as in_f:
        # Drop the first and last character of every header field
        # (presumably the surrounding quotes -- confirm against the file).
        header = [_item[1:-1] for _item in next(in_f).strip().split(',')]
        # Remove duplicated lines if any
        lines = set(in_f)
        reader = DictReader(lines, fieldnames=header, delimiter=',')
        for row in reader:
            assert rsid_pattern.match(row["snp"]) is not None
            variant = {"associations": {"phenotype": {}}, "variant": {}}
            variant["variant"]["rsid"] = row["snp"]
            variant["associations"]["phenotype"]["name"] = row["phewas phenotype"]
            variant["associations"]["cases"] = row["cases"]
            variant["associations"]["pval"] = float(row["p-value"])
            variant["associations"]["odds-ratio"] = row["odds-ratio"]
            variant["associations"]["phenotype"]["phewas_code"] = row["phewas code"]
            variant["variant"]["gene"] = row["gene_name"]
            variant["variant"]["gwas_associations"] = row["gwas-associations"].split(',')
            # The "chromosome" column holds "<chrom> <pos>" or just "<chrom>".
            pos_info = row["chromosome"].split(' ')
            if len(pos_info) == 2:
                variant["variant"]["chrom"], variant["variant"]["pos"] = pos_info
            else:
                variant["variant"]["chrom"] = pos_info[0]
            results[variant["variant"]["rsid"]].append(variant)

    # Merge duplications: all rows sharing an rsID end up in one document.
    # Every row has been consumed by now, so the file is already closed.
    hgvs_rsid_dict = batch_query_hgvs_from_rsid(list(results))
    for rsid, variants in results.items():
        if not (rsid in hgvs_rsid_dict and hgvs_rsid_dict[rsid]):
            continue
        doc = {'_id': hgvs_rsid_dict[rsid], 'phewas': variants[0]["variant"]}
        if len(variants) == 1:
            # A single association is stored as a dict, several as a list.
            doc["phewas"]["associations"] = variants[0]["associations"]
        else:
            doc["phewas"]["associations"] = [
                _item["associations"] for _item in variants
            ]
        yield dict_sweep(
            unlist(value_convert_to_number(doc, skipped_keys=['chrom'])),
            vals=[[], {}, None, '', 'NULL'])
def restructure_dict(dictionary):
    """Reshape a raw ChEMBL molecule record into ``{"_id": ..., "chembl": ...}``.

    ``molecule_chembl_id`` (when present) becomes ``_id``.  When
    ``molecule_structures`` is a dict, its ``standard_inchi_key``,
    ``canonical_smiles`` and ``standard_inchi`` entries are copied to the top
    level as ``inchi_key``, ``smiles`` and ``inchi``; the
    ``molecule_structures`` entry itself is then removed.  The result is passed
    through ``unlist``, ``dict_sweep``, ``value_convert_to_number`` and
    ``boolean_convert``.

    NOTE: ``dictionary`` is modified in place; it is the very object stored
    under ``"chembl"`` before the helper functions run.

    :param dictionary: one molecule record (a dict)
    :return: the restructured document
    """
    restr_dict = dict()
    if 'molecule_chembl_id' in dictionary:
        restr_dict['_id'] = dictionary['molecule_chembl_id']

    promoted = {
        'standard_inchi_key': 'inchi_key',
        'canonical_smiles': 'smiles',
        'standard_inchi': 'inchi',
    }
    structures = dictionary.get('molecule_structures')
    if isinstance(structures, dict):
        for name, value in structures.items():
            if name in promoted:
                dictionary[promoted[name]] = value

    restr_dict['chembl'] = dictionary
    # The key is treated as optional above, so do not fail when it is absent
    # (a plain ``del`` raised KeyError here).
    dictionary.pop('molecule_structures', None)

    restr_dict = unlist(restr_dict)
    restr_dict = dict_sweep(restr_dict, vals=[
        None, ".", "-", "", "NA", "None", "none", " ", "Not Available",
        "unknown", "null"
    ])
    restr_dict = value_convert_to_number(
        restr_dict, skipped_keys=["chebi_par_id", "first_approval"])
    restr_dict = boolean_convert(restr_dict, [
        "topical", "oral", "parenteral", "dosed_ingredient", "polymer_flag",
        "therapeutic_flag", "med_chem_friendly", "molecule_properties.ro3_pass"
    ])
    return restr_dict
def _map_line_to_json(fields):
    """Build one EMV document from a row of fields.

    ``fields[0]`` is expected to look like ``<sequence id>:<variant>``; the
    chromosome number is taken from the sequence id and combined with the
    variant part into the ``_id`` (``chr<chrom>:<variant>``).  ``fields[2]``
    to ``fields[10]`` fill the ``emv`` sub-document.

    :param fields: indexable row with at least 11 string items
    :return: the document after ``value_convert_to_number``, ``dict_sweep``
        (empty strings) and ``unlist``
    :raises AttributeError: if no chromosome number is found in ``fields[0]``
    :raises IndexError: if ``fields[0]`` has no ``:`` or the row is too short
    """
    vid = fields[0].split(":")
    # First run of digits that does not start with "0".  The former pattern
    # ``[1-9]+`` stopped at any zero and so truncated 10 -> 1 and 20 -> 2.
    chrom = re.search(r'[1-9][0-9]*', vid[0]).group()
    if chrom == '23':
        chrom = 'X'
    # A formatted string is never None, so no None check is needed here.
    HGVS = "chr%s:%s" % (chrom, vid[1])
    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "emv": {
            "gene": fields[2],
            "variant_id": fields[3],
            "exon": fields[4],
            "egl_variant": fields[5],
            "egl_protein": fields[6],
            "egl_classification": fields[7],
            "egl_classification_date": fields[8],
            "hgvs": fields[9].split(" | "),
            "clinvar_rcv": fields[10],
        }
    }
    return unlist(dict_sweep(value_convert_to_number(one_snp_json), vals=[""]))
def restructure_dict(dictionary):
    """Wrap a ChEBI record as ``{"_id": <ChEBI ID>, "chebi": <cleaned record>}``.

    The record is cleaned with ``clean_up``, swept of placeholder values with
    ``dict_sweep``, un-listed and number-converted (identifier-like keys are
    skipped).

    :param dictionary: one ChEBI record; must contain the key ``'ChEBI ID'``
    :return: the restructured document
    """
    placeholder_values = [
        None, ".", "-", "", "NA", "none", " ", "Not Available", "unknown",
        "null", "None", "NaN"
    ]
    keys_kept_as_text = [
        "cid", "sid", "beilstein", "pubmed", "sabio_rk", "gmelin", "molbase",
        "synonyms", "wikipedia", "url_stub"
    ]
    # '_id' is read before clean_up() gets to touch the record.
    document = {
        '_id': dictionary['ChEBI ID'],
        'chebi': clean_up(dictionary),
    }
    swept = dict_sweep(document, vals=placeholder_values)
    return value_convert_to_number(unlist(swept),
                                   skipped_keys=keys_kept_as_text)
def reformat(cls, dictionary):
    """Reshape a raw ChEMBL molecule record into ``{"_id": ..., "chembl": ...}``.

    Steps, in order:

    * ``molecule_chembl_id`` (when present) becomes ``_id``;
    * when ``molecule_structures`` is a dict, ``standard_inchi_key``,
      ``canonical_smiles`` and ``standard_inchi`` are copied to the top level
      as ``inchi_key``, ``smiles`` and ``inchi``;
    * a truthy ``cross_references`` value is converted with
      ``MoleculeCrossReferenceListTransformer.transform_to_dict`` and stored
      under ``xrefs``;
    * ``molecule_structures`` and ``cross_references`` are removed;
    * a truthy ``chebi_par_id`` gets the ``CHEBI:`` prefix, a falsy one is
      dropped;
    * the result goes through ``dict_sweep``, ``value_convert_to_number`` and
      ``boolean_convert``.

    NOTE: ``dictionary`` is modified in place; it is the object stored under
    ``"chembl"`` before the helper functions run.

    :param dictionary: one molecule record (a dict)
    :return: the restructured document
    """
    ret_dict = dict()
    if 'molecule_chembl_id' in dictionary:
        ret_dict['_id'] = dictionary['molecule_chembl_id']

    structures = dictionary.get('molecule_structures')
    if isinstance(structures, dict):
        for source_key, value in structures.items():
            if source_key == 'standard_inchi_key':
                dictionary['inchi_key'] = value
            elif source_key == 'canonical_smiles':
                dictionary['smiles'] = value
            elif source_key == 'standard_inchi':
                dictionary['inchi'] = value
    ret_dict['chembl'] = dictionary

    if dictionary.get('cross_references'):
        dictionary['xrefs'] = MoleculeCrossReferenceListTransformer.transform_to_dict(
            dictionary['cross_references'])

    # Both keys are treated as optional above; a plain ``del`` raised KeyError
    # for records lacking either of them.
    dictionary.pop('molecule_structures', None)
    dictionary.pop('cross_references', None)

    ret_dict = unlist(ret_dict)

    # Add "CHEBI:" prefix, standardize the way representing CHEBI IDs
    chembl = ret_dict['chembl']
    if chembl.get('chebi_par_id'):
        chembl['chebi_par_id'] = 'CHEBI:' + str(chembl['chebi_par_id'])
    else:
        # clean, could be a None
        chembl.pop("chebi_par_id", None)

    ret_dict = dict_sweep(ret_dict, vals=[
        None, ".", "-", "", "NA", "None", "none", " ", "Not Available",
        "unknown", "null"
    ])
    ret_dict = value_convert_to_number(
        ret_dict, skipped_keys=["chebi_par_id", "first_approval"])
    ret_dict = boolean_convert(ret_dict, [
        "topical", "oral", "parenteral", "dosed_ingredient", "polymer_flag",
        "therapeutic_flag", "med_chem_friendly", "molecule_properties.ro3_pass"
    ])
    return ret_dict
def parse(self, record: vcf.model._Record, doc_key: str):
    """Yield one document per alternate allele of a gnomAD VCF record.

    Use ``doc_key="gnomad_genome"`` for gnomad.genomes.*.vcf.bgz files and
    ``doc_key="gnomad_exome"`` for gnomad.exomes.*.vcf.bgz files.

    Each yielded document has the shape::

        {
            "_id": hgvs_id,
            doc_key: {
                "chrom": chrom,
                ...
            }
        }

    Nothing is yielded for records whose chromosome is not in
    ``CHROM_VALID_VALUES``; alleles whose hgvs id is ``None`` are skipped.
    A leading ``chr`` is removed from ``record.CHROM`` in place.

    :raises AssertionError: if ``AC``, ``AF`` or ``nhomalt`` is present in
        INFO with a length different from ``record.ALT``
    """
    # hg38 gnomAD source files prefix CHROM with 'chr'; strip it first, the
    # profile parser needs the bare chromosome name.
    if record.CHROM.startswith('chr'):
        record.CHROM = record.CHROM[3:]

    if record.CHROM not in CHROM_VALID_VALUES:
        return

    info = record.INFO
    for per_allele_key in ("AC", "AF", "nhomalt"):
        if per_allele_key not in info:
            continue
        assert len(record.ALT) == len(info[per_allele_key]), \
            "length of record.ALT != length of info.%s, at CHROM=%s, POS=%s" % (per_allele_key, record.CHROM, record.POS)

    profiles = self.profile_parser.parse(record)
    site_quality = self.site_quality_metrics_parser.parse(info)

    for allele_index, _allele in enumerate(record.ALT):
        hgvs_id, profile = profiles[allele_index]
        if hgvs_id is None:
            continue

        frequencies = self.population_frequency_parser.parse(info, allele_index)

        # Later sources win on key clashes: profile < site quality < frequency.
        payload = dict(profile)
        payload.update(site_quality)
        payload.update(frequencies)

        document = {"_id": hgvs_id, doc_key: payload}
        converted = value_convert_to_number(document, skipped_keys=['chrom'])
        yield dict_sweep(unlist(converted), [None])
def _map_line_to_json(fields):
    """Map one line of the csv file to a JSON-like document.

    ``fields[1]`` .. ``fields[7]`` are stored under fixed names and
    ``fields[8]`` is split on ``" | "`` into the ``hgvs`` list.  The document is
    then passed through ``value_convert_to_number``, ``dict_sweep`` (empty
    strings) and ``unlist``.
    """
    column_names = (
        "gene",
        "variant_id",
        "exon",
        "egl_variant",
        "egl_protein",
        "egl_classification",
        "egl_classification_date",
    )
    # Columns are consumed by position, starting at index 1.
    document = {
        name: fields[position]
        for position, name in enumerate(column_names, start=1)
    }
    document["hgvs"] = fields[8].split(" | ")
    converted = value_convert_to_number(document)
    return unlist(dict_sweep(converted, vals=[""]))
def load_data(assembly, input_file, chrom):
    """Yield dbSNP documents parsed from ``input_file`` for one chromosome.

    Every document produced by ``parse_vcf`` is wrapped as
    ``{"dbsnp": <doc without _id>, "_id": <doc's _id>}`` and passed through
    ``value_convert_to_number``, ``unlist`` and ``dict_sweep`` (``None``).

    Side effect: the module-level name ``logging`` is rebound to the
    ``"dbsnp_upload"`` logger.
    """
    import logging as loggingmod
    global logging
    logging = loggingmod.getLogger("dbsnp_upload")
    logging.info("Processing chr{}...".format(chrom))
    for snp in parse_vcf(assembly, input_file, compressed=True,
                         verbose=False, by_id=True, reference=chrom):
        wrapped = {'dbsnp': snp, '_id': snp['_id']}
        # The id lives at the top level only, not inside the sub-document.
        del snp['_id']
        converted = value_convert_to_number(wrapped)
        yield dict_sweep(unlist(converted), [None])
def restr_dict(_dict, row):
    """Build the nested side-effect document for one row.

    ``_dict`` is accepted but not used.  ``row`` is read by position:
    1-2 -> ``stitch``, 10/4/5 -> ``side_effect``, 8-9 -> ``meddra`` and
    11-12 -> ``indication``.  ``placebo`` is the truthiness of ``row[4]``.

    :return: the document after ``value_convert_to_number`` and ``dict_sweep``
    """
    record = {
        'stitch': {
            'flat': row[1],
            'stereo': row[2],
        },
        'side_effect': {
            'name': row[10],
            'placebo': bool(row[4]),
            'frequency': row[5],
        },
        'meddra': {
            'type': row[8],
            'umls_id': row[9],
        },
        'indication': {
            'method_of_detection': row[11],
            'name': row[12],
        },
    }
    return dict_sweep(value_convert_to_number(record))
def parse_one_rec(assembly, record):
    """Restructure one refsnp JSON record into output documents.

    One document is yielded for every (hgvs, vcf) pair returned by
    ``get_hgvs_and_vcf`` whose hgvs part is truthy.  The same ``doc`` dict is
    reused for all pairs, so fields set for an earlier pair stay in place when
    a later pair has a falsy vcf part.
    """
    data = record.get('primary_snapshot_data')
    doc = {
        "alleles": [],
        "gene": [],
        assembly: {},
        "vartype": data.get("variant_type"),
        "rsid": "rs" + str(record.get("refsnp_id")),
        "dbsnp_build": int(record.get("last_update_build_id")),
        "dbsnp_merges": restructure_dbsnp_merge(record.get("dbsnp1_merges")),
        "citations": record.get("citations"),
    }
    hgvs_vcf_info = get_hgvs_and_vcf(assembly, data.get("placements_with_allele"))
    allele_annotations = list(data.get('allele_annotations'))
    doc["alleles"] = restructure_allele_freq_info(allele_annotations)
    doc['gene'] = restructure_gene_info(allele_annotations)

    sex_chromosomes = {"23": "X", "24": "Y"}
    shifted_types = ["ins", "del", "delins"]
    text_keys = ['chrom', 'ref', 'alt', 'allele', 'deleted_sequence',
                 'inserted_sequence']

    for hgvs, vcf in hgvs_vcf_info:
        if vcf:
            raw_chrom, pos, ref_allele, alt_allele = vcf
            chrom_name = str(raw_chrom)
            doc["chrom"] = sex_chromosomes.get(chrom_name, chrom_name)
            doc["ref"] = ref_allele
            doc["alt"] = alt_allele
            doc[assembly] = {}
            try:
                if doc["vartype"] != "snv":
                    # Non-SNVs get a "T" prepended to both alleles before the
                    # start/end computation.
                    ref, alt = "T" + doc["ref"], "T" + doc["alt"]
                else:
                    ref, alt = doc["ref"], doc["alt"]
                if doc["vartype"] in shifted_types:
                    position = pos - 1
                else:
                    position = pos
                start, end = get_pos_start_end(doc["chrom"], position, ref, alt)
                doc[assembly] = {'start': start, 'end': end}
            except (ValueError, AssertionError):
                # No usable coordinates for this allele.
                doc[assembly] = {}
        if hgvs:
            doc["_id"] = hgvs.replace('chr23', 'chrX').replace('chr24', 'chrY')
            converted = value_convert_to_number(doc, skipped_keys=text_keys)
            yield dict_sweep(unlist(converted), vals=[[], {}, None])
def restructure_dict(dictionary):
    """Turn a ChEBI record into a ``{"_id", "chebi"}`` document.

    ``_id`` is the record's ``'ChEBI ID'``; ``chebi`` is the record after
    ``clean_up``.  Placeholder values are swept, the document is un-listed and
    values are converted to numbers except for the listed identifier keys.
    """
    chebi_id = dictionary['ChEBI ID']
    cleaned = clean_up(dictionary)
    document = dict_sweep(
        {'_id': chebi_id, 'chebi': cleaned},
        vals=[None, ".", "-", "", "NA", "none", " ", "Not Available",
              "unknown", "null", "None", "NaN"],
    )
    # These keys hold identifiers / free text and must stay strings.
    skipped = ["cid", "sid", "beilstein", "pubmed", "sabio_rk", "gmelin",
               "molbase", "synonyms", "wikipedia", "url_stub"]
    document = unlist(document)
    return value_convert_to_number(document, skipped_keys=skipped)
def _map_line_to_json(item):
    """Yield one geno2mp document per alternate allele of a VCF record.

    Each document is ``{"_id": <hgvs>, "geno2mp": {"hpo_count": INFO['HPO_CT']}}``
    passed through ``value_convert_to_number``, ``unlist`` and ``dict_sweep``.
    Iteration stops at the first allele for which ``get_hgvs_from_vcf``
    returns ``None`` as the id.
    """
    chrom, chromStart, ref = item.CHROM, item.POS, item.REF
    hpo_count = item.INFO['HPO_CT']
    for allele in item.ALT:
        hgvs_id, _var_type = get_hgvs_from_vcf(
            chrom, chromStart, ref, str(allele), mutant_type=True)
        if hgvs_id is None:
            return
        document = {"_id": hgvs_id, "geno2mp": {"hpo_count": hpo_count}}
        yield dict_sweep(unlist(value_convert_to_number(document)), [None])
def restructure_dict(dictionary):
    """Produce the ``{"_id", "chebi"}`` document for one ChEBI record.

    The record's ``'ChEBI ID'`` becomes ``_id`` and the record, after
    ``clean_up``, becomes ``chebi``.  The document is then swept of
    placeholder values, un-listed and number-converted; registry-number and
    database-link keys are left untouched by the number conversion.
    """
    result = {'_id': dictionary['ChEBI ID']}
    result['chebi'] = clean_up(dictionary)

    sweep_values = [
        None, ".", "-", "", "NA", "none", " ", "Not Available", "unknown",
        "null", "None", "NaN"
    ]
    result = dict_sweep(result, vals=sweep_values)

    untouched_keys = [
        "beilstein_registry_numbers", "pubmed_citation_links",
        "sabio_rk_database_links", "gmelin_registry_numbers",
        "molbase_database_links"
    ]
    flattened = unlist(result)
    return value_convert_to_number(flattened, skipped_keys=untouched_keys)
def restructure_dict(dictionary):
    """Reshape a raw ChEMBL molecule record into ``{"_id": ..., "chembl": ...}``.

    * ``molecule_chembl_id`` (when present) becomes ``_id``.
    * When ``molecule_structures`` is a dict, its ``standard_inchi_key``,
      ``canonical_smiles`` and ``standard_inchi`` values are copied to the top
      level as ``inchi_key``, ``smiles`` and ``inchi``.
    * A truthy ``cross_references`` value is converted with
      ``restructure_xref`` and stored under ``xrefs``.
    * ``molecule_structures`` and ``cross_references`` are removed.
    * A truthy ``chebi_par_id`` is prefixed with ``CHEBI:``; a falsy one is
      dropped.
    * The result goes through ``dict_sweep``, ``value_convert_to_number`` and
      ``boolean_convert``.

    NOTE: ``dictionary`` is modified in place; it is the object stored under
    ``"chembl"`` before the helper functions run.

    :param dictionary: one molecule record (a dict)
    :return: the restructured document
    """
    restr_dict = dict()
    if 'molecule_chembl_id' in dictionary:
        restr_dict['_id'] = dictionary['molecule_chembl_id']

    top_level_names = {
        'standard_inchi_key': 'inchi_key',
        'canonical_smiles': 'smiles',
        'standard_inchi': 'inchi',
    }
    structures = dictionary.get('molecule_structures')
    if isinstance(structures, dict):
        dictionary.update({
            top_level_names[name]: value
            for name, value in structures.items()
            if name in top_level_names
        })
    restr_dict['chembl'] = dictionary

    # Optional keys: ``pop`` with a default replaces the unconditional ``del``
    # statements that raised KeyError when a record lacked either key.
    cross_references = dictionary.pop('cross_references', None)
    if cross_references:
        dictionary['xrefs'] = restructure_xref(cross_references)
    dictionary.pop('molecule_structures', None)

    restr_dict = unlist(restr_dict)

    # Add "CHEBI:" prefix, standardize the way representing CHEBI IDs
    chembl = restr_dict['chembl']
    if chembl.get('chebi_par_id'):
        chembl['chebi_par_id'] = 'CHEBI:' + str(chembl['chebi_par_id'])
    else:
        # clean, could be a None
        chembl.pop("chebi_par_id", None)

    restr_dict = dict_sweep(restr_dict, vals=[
        None, ".", "-", "", "NA", "None", "none", " ", "Not Available",
        "unknown", "null"
    ])
    restr_dict = value_convert_to_number(
        restr_dict, skipped_keys=["chebi_par_id", "first_approval"])
    restr_dict = boolean_convert(restr_dict, [
        "topical", "oral", "parenteral", "dosed_ingredient", "polymer_flag",
        "therapeutic_flag", "med_chem_friendly", "molecule_properties.ro3_pass"
    ])
    return restr_dict
def _map_line_to_json(doc_key, item):
    """Yield one document per alternate allele of a VCF record.

    The document is ``{"_id": <hgvs>, doc_key: {...}}`` where the
    sub-document carries position data plus the allele-count (``ac``),
    allele-number (``an``), ``het``/``hom`` and quality fields read from
    ``item.INFO``.  ``BaseQRankSum``, ``ClippingRankSum``, ``MQRankSum``,
    ``ReadPosRankSum``, ``QD`` and ``InbreedingCoeff`` are optional and become
    ``None`` when missing; every other INFO key is required.

    Side effect: the entries of ``item.ALT`` are replaced by their string
    form, in place.

    :param doc_key: name of the sub-document key
    :param item: VCF record exposing ``CHROM``, ``POS``, ``REF``, ``ALT`` and
        ``INFO``
    :raises KeyError: if a required INFO key is missing
    """
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO

    def _optional(key):
        # Only a missing key means "no value"; anything else is a real error
        # (the former bare ``except:`` also hid those).
        try:
            return info[key]
        except KeyError:
            return None

    baseqranksum = _optional('BaseQRankSum')
    clippingranksum = _optional('ClippingRankSum')
    mqranksum = _optional('MQRankSum')
    readposranksum = _optional('ReadPosRankSum')
    qd = _optional('QD')
    inbreedingcoeff = _optional('InbreedingCoeff')

    # Stringify the alleles in place; the list is also stored as "alleles".
    for i, allele in enumerate(item.ALT):
        item.ALT[i] = str(allele)

    for alt in item.ALT:
        alt = str(alt)
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt,
                                             mutant_type=True)
        if HGVS is None:
            # NOTE(review): this ends the generator, so later alleles of the
            # same record are not emitted -- confirm this is intended.
            return
        one_snp_json = {
            "_id": HGVS,
            doc_key: {
                "chrom": chrom,
                "pos": chromStart,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {
                    "ac": info['AC'],
                    "ac_afr": info['AC_AFR'],
                    "ac_amr": info['AC_AMR'],
                    "ac_adj": info['AC_Adj'],
                    "ac_eas": info['AC_EAS'],
                    "ac_fin": info['AC_FIN'],
                    "ac_het": info['AC_Het'],
                    "ac_hom": info['AC_Hom'],
                    "ac_nfe": info['AC_NFE'],
                    "ac_oth": info['AC_OTH'],
                    "ac_sas": info['AC_SAS'],
                    "ac_female": info['AC_FEMALE'],
                    "ac_male": info['AC_MALE']
                },
                "af": info['AF'],
                "an": {
                    "an": info['AN'],
                    "an_afr": info['AN_AFR'],
                    "an_amr": info['AN_AMR'],
                    "an_adj": info['AN_Adj'],
                    "an_eas": info['AN_EAS'],
                    "an_fin": info['AN_FIN'],
                    "an_nfe": info['AN_NFE'],
                    "an_oth": info['AN_OTH'],
                    "an_sas": info['AN_SAS'],
                    "an_female": info['AN_FEMALE'],
                    "an_male": info['AN_MALE']
                },
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {
                    "het_afr": info['Het_AFR'],
                    "het_amr": info['Het_AMR'],
                    "het_eas": info['Het_EAS'],
                    "het_fin": info['Het_FIN'],
                    "het_nfe": info['Het_NFE'],
                    "het_oth": info['Het_OTH'],
                    "het_sas": info['Het_SAS']
                },
                "hom": {
                    "hom_afr": info['Hom_AFR'],
                    "hom_amr": info['Hom_AMR'],
                    "hom_eas": info['Hom_EAS'],
                    "hom_fin": info['Hom_FIN'],
                    "hom_nfe": info['Hom_NFE'],
                    "hom_oth": info['Hom_OTH'],
                    "hom_sas": info['Hom_SAS']
                },
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        obj = dict_sweep(unlist(value_convert_to_number(one_snp_json)), [None])
        yield obj
def parse_one_rec(assembly, record):
    """
    Parse a record from a 'refsnp-chr*.json.bz2' file into one or multiple
    documents.

    Input schema
    ------------
    According to https://ftp.ncbi.nlm.nih.gov/snp/latest_release/JSON/JSON_README.txt
    every 'refsnp-chr*.json.bz2' file conforms to the "refsnp_snapshot_success"
    OpenAPI schema defined in
    https://api.ncbi.nlm.nih.gov/variation/v0/var_service.yaml, and every
    record (i.e. every line of such a file) conforms to the "refsnp_snapshot"
    schema, which **requires** these components:

        - "refsnp_id" (type: string, format: uint64),
        - "create_date" (type: string, format: ISO 8601),
        - "last_update_date" (type: string, format: ISO 8601),
        - "last_update_build_id" (type: string, format: ascii),
        - "dbsnp1_merges" (type: array),
        - "lost_obs_movements" (type: array),
        - "present_obs_movements" (type: array),
        - "citations" (type: array)

    Another component of interest is "primary_snapshot_data" (type: object),
    which is optional in "refsnp_snapshot".  When it exists, it **requires**:

        - "placements_with_allele" (type: array),
        - "allele_annotations" (type: array),
        - "support" (type: array),
        - "anchor" (type: string, format: ascii),
        - "variant_type" (type: string, format: ascii)

    **None** of the fields in the schema above is "nullable" (see
    https://stackoverflow.com/questions/45575493/what-does-required-in-openapi-really-mean).
    Requiredness, data types and nullability of the components guide the
    existence checks and type conversions applied to the output documents.

    Output
    ------
    Fields shared by all alleles are extracted once from the record; fields
    specific to an allele are extracted per (hgvs, vcf) pair returned by
    ``get_hgvs_and_vcf`` for the record's placements.  In pseudocode:

        common_fields = {...}
        for placement in placements:
            for allele in placement["alleles"]
                allele_specific_fields = {...}
                doc = {**common_fields, **allele_specific_fields}
                yield doc
    """
    snapshot = record.get("primary_snapshot_data")
    annotations = snapshot.get("allele_annotations")
    placements = snapshot.get("placements_with_allele")

    shared = {
        # taken directly from `record`
        "rsid": "rs" + str(record.get("refsnp_id")),
        "dbsnp_build": int(record.get("last_update_build_id")),
        "dbsnp_merges": restructure_dbsnp_merge(record.get("dbsnp1_merges")),
        "citations": record.get("citations"),
        # taken from `record["primary_snapshot_data"]`
        "vartype": snapshot.get("variant_type"),
        # taken from `record["primary_snapshot_data"]["allele_annotations"]`
        "alleles": restructure_allele_freq_info(annotations),
        "gene": restructure_gene_info(annotations)
    }
    variant_type = shared["vartype"]
    untouched_keys = ['chrom', 'ref', 'alt', 'allele', 'deleted_sequence',
                      'inserted_sequence']

    # taken from `record["primary_snapshot_data"]["placements_with_allele"]`
    for hgvs, vcf in get_hgvs_and_vcf(assembly, placements):
        chrom, pos, ref, alt = vcf
        start, end = get_start_end(variant_type, chrom, pos, ref, alt)
        # Coordinates are left empty only when both ends are missing.
        no_coordinates = start is None and end is None
        coordinates = {} if no_coordinates else {"start": start, "end": end}

        doc = dict(shared)
        doc.update({
            "_id": hgvs,
            "chrom": chrom,
            "ref": ref,
            "alt": alt,
            assembly: coordinates
        })
        converted = value_convert_to_number(doc, skipped_keys=untouched_keys)
        yield dict_sweep(unlist(converted), vals=[[], {}, None])
def _map_line_to_json(fields):
    """Build one ClinVar document from a row of fields.

    The variant kind is derived from ``fields[1]`` and from the coding HGVS
    in ``fields[18]``; REF/ALT come from the module-level ``vcf_reader`` at
    (``fields[13]``, ``fields[14]``).  These are combined into the ``_id``.

    :param fields: row with exactly ``VALID_COLUMN_NO`` items
    :return: the document after ``value_convert_to_number``, ``unlist`` and
        ``dict_sweep`` (``"-"``), or ``None`` when no VCF record is found or
        no id can be built
    :raises AssertionError: if the row has an unexpected number of columns
    """
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[13]
    chromStart = fields[14]
    chromEnd = fields[15]

    HGVS = None
    cds = fields[18].split(":")
    cds = cds[1]
    replace = re.findall(r'[ATCGMNYR=]+', cds)
    sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', cds)
    ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', cds)
    delete = fields[1] == 'deletion'
    indel = fields[1] == 'indel'
    dup = re.search(r'dup', cds)
    inv = re.search(r'inv|inv[0-9]+|inv[ATCGMNHYR]+', cds)
    # An insertion found in the coding HGVS wins over the declared type, and
    # a declared deletion wins over a declared indel.
    if ins:
        delete = None
        indel = None
    elif delete:
        ins = None
        indel = None

    # parse from vcf file. Input chrom number
    # and chromStart, and return REF, ALT
    if chromStart:
        record = vcf_reader.fetch(chrom, int(chromStart))
    else:
        record = None
    if not record:
        return
    REF = record.REF
    ALT = record.ALT[0]
    if record.is_snp and len(ALT) < 2:
        mod = [REF, ALT]
    else:
        mod = ALT

    if sub and record.is_snp:
        HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, mod[0], mod[1])
    elif ins:
        HGVS = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, mod)
    elif delete:
        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
    elif indel:
        try:
            HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, mod)
        except AttributeError:
            print("ERROR: %s %s" % (fields[1], cds))
    elif dup:
        HGVS = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, mod)
    elif inv:
        HGVS = "chr%s:g.%s_%sinv%s" % (chrom, chromStart, chromEnd, mod)
    elif replace:
        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, mod)
    else:
        # Single pre-formatted argument: prints the same text on Python 2 and
        # Python 3 (the former ``print`` statements do not parse on Python 3).
        print("ERROR: %s %s" % (fields[1], cds))

    # load as json data
    if HGVS is None:
        print("None: %s %s" % (fields[1], cds))
        return None

    one_snp_json = {
        "_id": HGVS,
        "clinvar": {
            "allele_id": fields[0],
            "hg19": {
                "chr": fields[13],
                "start": fields[14],
                "end": fields[15]
            },
            "type": fields[1],
            "name": fields[2],
            "gene": {
                "id": fields[3],
                "symbol": fields[4]
            },
            "clinical_significance": fields[5].split(";"),
            "rsid": 'rs' + str(fields[6]),
            "nsv_dbvar": fields[7],
            "rcv_accession": fields[8].split(";"),
            "tested_in_gtr": fields[9],
            "phenotype_id": other_id(fields[10]),
            "origin": fields[11],
            "cytogenic": fields[16],
            "review_status": fields[17],
            "hgvs": {
                "coding": fields[18],
                "protein": fields[19]
            },
            "number_submitters": fields[20],
            "last_evaluated": fields[21],
            "guidelines": fields[22],
            "other_ids": other_id(fields[23]),
            "clinvar_id": fields[24]
        }
    }
    return dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=["-"])
def _map_line_to_json(fields, version): chrInfo = fields[0].split(":") # grch37 chrom = chrInfo[0] chromStart = int(chrInfo[1]) ma_fin_percent = fields[7].split("/") if fields[3]: mutation = fields[3].split(">") ref = mutation[0] alt = mutation[1] hg19 = get_pos_start_end(chrom, chromStart, ref, alt) hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref, alt) if version == 'hg19': HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt) elif version == 'hg38': HGVS = get_hgvs_from_vcf(chrom, hg38[0], ref, alt) # load as json data if HGVS is None: return one_snp_json = { "_id": HGVS, "evs": { "chrom": str(chrom), "hg19": { "start": hg19[0], "end": hg19[1] }, "hg38": { "start": hg38[0], "end": hg38[1] }, "rsid": fields[1], "dbsnp_version": get_dbsnp(fields[2]), "ref": ref, "alt": alt, "allele_count": { "european_american": count_dict(fields[4]), "african_american": count_dict(fields[5]), "all": count_dict(fields[6]) }, "ma_fin_percent": { "european_american": ma_fin_percent[0], "african_american": ma_fin_percent[1], "all": ma_fin_percent[2] }, "genotype_count": { "european_american": count_dict(fields[8]), "african_american": count_dict(fields[9]), "all_genotype": count_dict(fields[10]) }, "avg_sample_read": fields[11], "gene": { "symbol": fields[12], "accession": fields[13] }, "function_gvs": fields[14], "hgvs": { "coding": fields[16], "protein": fields[15] }, "coding_dna_size": fields[17], "conservation": { "phast_cons": fields[18], "gerp": fields[19] }, "grantham_score": fields[20], "polyphen2": { "class": polyphen(fields[21])[0], "score": polyphen(fields[21])[1] }, "ref_base_ncbi": fields[22], "chimp_allele": fields[23], "clinical_info": fields[24], "filter_status": fields[25], "on_illumina_human_exome_chip": fields[26], "gwas_pubmed_info": fields[27], "estimated_age_kyrs": { "ea": fields[28], "aa": fields[29] } } } return dict_sweep(value_convert_to_number(one_snp_json), vals=["NA", "none", "unknown"])
def _map_line_to_json(df, version, include_gnomad, index=0): # specific variable treatment chrom = df["#chr"] if chrom == 'M': chrom = 'MT' # fields[7] in version 2, represent hg18_pos hg18_end = df["hg18_pos(1-based)"] if hg18_end == ".": hg18_end = "." else: hg18_end = int(hg18_end) # in case of no hg19 position provided, remove the item if df["hg19_pos(1-based)"] == '.': return None else: chromStart = int(df["hg19_pos(1-based)"]) chromEnd = chromStart chromStart_38 = int(df["pos(1-based)"]) ref = df["ref"].upper() alt = df["alt"].upper() HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt) HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt) if version == 'hg19': HGVS = HGVS_19 elif version == 'hg38': HGVS = HGVS_38 siphy_29way_pi = df["SiPhy_29way_pi"] if siphy_29way_pi == ".": siphy = "." else: freq = siphy_29way_pi.split(":") siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]} gtex_gene = df["GTEx_V8_gene"].split('|') gtex_tissue = df["GTEx_V8_tissue"].split('|') gtex = map( dict, map(lambda t: zip(('gene', 'tissue'), t), zip(gtex_gene, gtex_tissue))) acc = df["Uniprot_acc"].rstrip().rstrip(';').split(";") entry = df["Uniprot_entry"].rstrip().rstrip(';').split(";") uniprot = map(dict, map(lambda t: zip(('acc', 'entry'), t), zip(acc, entry))) provean_score = df["PROVEAN_score"].split(';') sift_score = df["SIFT_score"].split(';') sift4g_score = df["SIFT4G_score"].split(';') hdiv_score = df["Polyphen2_HDIV_score"].split(';') hvar_score = df["Polyphen2_HVAR_score"].split(';') lrt_score = df["LRT_score"].split(';') m_cap_score = df["M-CAP_score"].split(';') mutationtaster_score = df["MutationTaster_score"].split(';') mutationassessor_score = df["MutationAssessor_score"].split(';') vest3_score = df["VEST4_score"].split(';') metasvm_score = df["MetaSVM_score"].split(';') fathmm_score = df["FATHMM_score"].split(';') metalr_score = df["MetaLR_score"].split(';') revel_score = df["REVEL_score"].split(';') appris = df["APPRIS"].split(";") 
mpc_score = df["MPC_score"].split(';') mvp_score = df["MVP_score"].split(';') tsl = df["TSL"].split(';') vep_canonical = df["VEP_canonical"].split(';') deogen2_score = df["DEOGEN2_score"].split(';') ''' parse mutpred top 5 features ''' def modify_pvalue(pvalue): return float(pvalue.strip('P = ')) mutpred_mechanisms = df["MutPred_Top5features"] if mutpred_mechanisms not in ['.', ',', '-']: mutpred_mechanisms = mutpred_mechanisms.split( " (") and mutpred_mechanisms.split(";") mutpred_mechanisms = [m.rstrip(")") for m in mutpred_mechanisms] mutpred_mechanisms = [i.split(" (") for i in mutpred_mechanisms] mutpred_mechanisms = sum(mutpred_mechanisms, []) mechanisms = [{ "mechanism": mutpred_mechanisms[0], "p_val": modify_pvalue(mutpred_mechanisms[1]) }, { "mechanism": mutpred_mechanisms[2], "p_val": modify_pvalue(mutpred_mechanisms[3]) }, { "mechanism": mutpred_mechanisms[4], "p_val": modify_pvalue(mutpred_mechanisms[5]) }, { "mechanism": mutpred_mechanisms[6], "p_val": modify_pvalue(mutpred_mechanisms[7]) }, { "mechanism": mutpred_mechanisms[8], "p_val": modify_pvalue(mutpred_mechanisms[9]) }] else: mechanisms = '.' # normalize scores def norm(arr): return [None if item == '.' 
else item for item in arr] provean_score = norm(provean_score) sift_score = norm(sift_score) hdiv_score = norm(hdiv_score) hvar_score = norm(hvar_score) lrt_score = norm(lrt_score) m_cap_score = norm(m_cap_score) mutationtaster_score = norm(mutationtaster_score) mutationassessor_score = norm(mutationassessor_score) vest3_score = norm(vest3_score) metasvm_score = norm(metasvm_score) fathmm_score = norm(fathmm_score) metalr_score = norm(metalr_score) revel_score = norm(revel_score) gnomad = { "gnomad_exomes": { "flag": df["gnomAD_exomes_flag"], "nhomalt": df["gnomAD_exomes_nhomalt"], "ac": df["gnomAD_exomes_AC"], "an": df["gnomAD_exomes_AN"], "af": df["gnomAD_exomes_AF"], "nhomalt": df["gnomAD_exomes_nhomalt"], "afr_ac": df["gnomAD_exomes_AFR_AC"], "afr_af": df["gnomAD_exomes_AFR_AF"], "afr_an": df["gnomAD_exomes_AFR_AN"], "afr_nhomalt": df["gnomAD_exomes_AFR_nhomalt"], "amr_ac": df["gnomAD_exomes_AMR_AC"], "amr_an": df["gnomAD_exomes_AMR_AN"], "amr_af": df["gnomAD_exomes_AMR_AF"], "amr_nhomalt": df["gnomAD_exomes_AMR_nhomalt"], "asj_ac": df["gnomAD_exomes_ASJ_AC"], "asj_an": df["gnomAD_exomes_ASJ_AN"], "asj_af": df["gnomAD_exomes_ASJ_AF"], "asj_nhomalt": df["gnomAD_exomes_ASJ_nhomalt"], "eas_ac": df["gnomAD_exomes_EAS_AC"], "eas_af": df["gnomAD_exomes_EAS_AF"], "eas_an": df["gnomAD_exomes_EAS_AN"], "eas_nhomalt": df["gnomAD_exomes_EAS_nhomalt"], "fin_ac": df["gnomAD_exomes_FIN_AC"], "fin_af": df["gnomAD_exomes_FIN_AF"], "fin_an": df["gnomAD_exomes_FIN_AN"], "fin_nhomalt": df["gnomAD_exomes_FIN_nhomalt"], "nfe_ac": df["gnomAD_exomes_NFE_AC"], "nfe_af": df["gnomAD_exomes_NFE_AF"], "nfe_an": df["gnomAD_exomes_NFE_AN"], "nfe_nhomalt": df["gnomAD_exomes_NFE_nhomalt"], "sas_ac": df["gnomAD_exomes_SAS_AC"], "sas_af": df["gnomAD_exomes_SAS_AF"], "sas_an": df["gnomAD_exomes_SAS_AN"], "sas_nhomalt": df["gnomAD_exomes_SAS_nhomalt"], "popmax_ac": df["gnomAD_exomes_POPMAX_AC"], "popmax_af": df["gnomAD_exomes_POPMAX_AF"], "popmax_an": df["gnomAD_exomes_POPMAX_AN"], 
"popmax_nhomalt": df["gnomAD_exomes_POPMAX_nhomalt"] }, "gnomad_exomes_controls": { "nhomalt": df["gnomAD_exomes_controls_nhomalt"], "ac": df["gnomAD_exomes_controls_AC"], "an": df["gnomAD_exomes_controls_AN"], "af": df["gnomAD_exomes_controls_AF"], "nhomalt": df["gnomAD_exomes_controls_nhomalt"], "afr_ac": df["gnomAD_exomes_controls_AFR_AC"], "afr_af": df["gnomAD_exomes_controls_AFR_AF"], "afr_an": df["gnomAD_exomes_controls_AFR_AN"], "afr_nhomalt": df["gnomAD_exomes_controls_AFR_nhomalt"], "amr_ac": df["gnomAD_exomes_controls_AMR_AC"], "amr_an": df["gnomAD_exomes_controls_AMR_AN"], "amr_af": df["gnomAD_exomes_controls_AMR_AF"], "amr_nhomalt": df["gnomAD_exomes_controls_AMR_nhomalt"], "asj_ac": df["gnomAD_exomes_controls_ASJ_AC"], "asj_an": df["gnomAD_exomes_controls_ASJ_AN"], "asj_af": df["gnomAD_exomes_controls_ASJ_AF"], "asj_nhomalt": df["gnomAD_exomes_controls_ASJ_nhomalt"], "eas_ac": df["gnomAD_exomes_controls_EAS_AC"], "eas_af": df["gnomAD_exomes_controls_EAS_AF"], "eas_an": df["gnomAD_exomes_controls_EAS_AN"], "eas_nhomalt": df["gnomAD_exomes_controls_EAS_nhomalt"], "fin_ac": df["gnomAD_exomes_controls_FIN_AC"], "fin_af": df["gnomAD_exomes_controls_FIN_AF"], "fin_an": df["gnomAD_exomes_controls_FIN_AN"], "fin_nhomalt": df["gnomAD_exomes_controls_FIN_nhomalt"], "nfe_ac": df["gnomAD_exomes_controls_NFE_AC"], "nfe_af": df["gnomAD_exomes_controls_NFE_AF"], "nfe_an": df["gnomAD_exomes_controls_NFE_AN"], "nfe_nhomalt": df["gnomAD_exomes_controls_NFE_nhomalt"], "sas_ac": df["gnomAD_exomes_controls_SAS_AC"], "sas_af": df["gnomAD_exomes_controls_SAS_AF"], "sas_an": df["gnomAD_exomes_controls_SAS_AN"], "sas_nhomalt": df["gnomAD_exomes_controls_SAS_nhomalt"], "popmax_ac": df["gnomAD_exomes_controls_POPMAX_AC"], "popmax_af": df["gnomAD_exomes_controls_POPMAX_AF"], "popmax_an": df["gnomAD_exomes_controls_POPMAX_AN"], "popmax_nhomalt": df["gnomAD_exomes_controls_POPMAX_nhomalt"] }, "gnomad_genomes": { "flag": df["gnomAD_genomes_flag"], "nhomalt": 
df["gnomAD_genomes_nhomalt"], "ac": df["gnomAD_genomes_AC"], "an": df["gnomAD_genomes_AN"], "af": df["gnomAD_genomes_AF"], "nhomalt": df["gnomAD_genomes_nhomalt"], "afr_ac": df["gnomAD_genomes_AFR_AC"], "afr_af": df["gnomAD_genomes_AFR_AF"], "afr_an": df["gnomAD_genomes_AFR_AN"], "afr_nhomalt": df["gnomAD_genomes_AFR_nhomalt"], "ami_ac": df["gnomAD_genomes_AMI_AC"], "ami_an": df["gnomAD_genomes_AMI_AN"], "ami_af": df["gnomAD_genomes_AMI_AF"], "ami_nhomalt": df["gnomAD_genomes_AMI_nhomalt"], "amr_ac": df["gnomAD_genomes_AMR_AC"], "amr_an": df["gnomAD_genomes_AMR_AN"], "amr_af": df["gnomAD_genomes_AMR_AF"], "amr_nhomalt": df["gnomAD_genomes_AMR_nhomalt"], "asj_ac": df["gnomAD_genomes_ASJ_AC"], "asj_an": df["gnomAD_genomes_ASJ_AN"], "asj_af": df["gnomAD_genomes_ASJ_AF"], "asj_nhomalt": df["gnomAD_genomes_ASJ_nhomalt"], "eas_ac": df["gnomAD_genomes_EAS_AC"], "eas_af": df["gnomAD_genomes_EAS_AF"], "eas_an": df["gnomAD_genomes_EAS_AN"], "eas_nhomalt": df["gnomAD_genomes_EAS_nhomalt"], "fin_ac": df["gnomAD_genomes_FIN_AC"], "fin_af": df["gnomAD_genomes_FIN_AF"], "fin_an": df["gnomAD_genomes_FIN_AN"], "fin_nhomalt": df["gnomAD_genomes_FIN_nhomalt"], "nfe_ac": df["gnomAD_genomes_NFE_AC"], "nfe_af": df["gnomAD_genomes_NFE_AF"], "nfe_an": df["gnomAD_genomes_NFE_AN"], "nfe_nhomalt": df["gnomAD_genomes_NFE_nhomalt"], "popmax_ac": df["gnomAD_genomes_POPMAX_AC"], "popmax_af": df["gnomAD_genomes_POPMAX_AF"], "popmax_an": df["gnomAD_genomes_POPMAX_AN"], "popmax_nhomalt": df["gnomAD_genomes_POPMAX_nhomalt"] } } # load as json data one_snp_json = { "_id": HGVS, "dbnsfp": { "rsid": df["rs_dbSNP151"], #"rsid_dbSNP144": fields[6], "chrom": chrom, "hg19": { "start": chromStart, "end": chromEnd }, "hg18": { "start": df["hg18_pos(1-based)"], "end": hg18_end }, "hg38": { "start": df["pos(1-based)"], "end": df["pos(1-based)"] }, "ref": ref, "alt": alt, "aa": { "ref": df["aaref"], "alt": df["aaalt"], "pos": df["aapos"], "refcodon": df["refcodon"], "codonpos": df["codonpos"], 
"codon_degeneracy": df["codon_degeneracy"], }, "genename": df["genename"], "uniprot": list(uniprot), "vindijia_neandertal": [i for i in df["VindijiaNeandertal"].split("/") if i != "."], "interpro_domain": df["Interpro_domain"], "cds_strand": df["cds_strand"], "ancestral_allele": df["Ancestral_allele"], "appris": appris, "genecode_basic": df["GENCODE_basic"], "tsl": tsl, "vep_canonical": vep_canonical, #"altaineandertal": fields[17], #"denisova": fields[18] "ensembl": { "geneid": df["Ensembl_geneid"], "transcriptid": df["Ensembl_transcriptid"], "proteinid": df["Ensembl_proteinid"] }, "sift": { "score": sift_score, "converted_rankscore": df["SIFT_converted_rankscore"], "pred": df["SIFT_pred"] }, "sift4g": { "score": sift4g_score, "pred": df["SIFT4G_score"], "converted_rankscore": df["SIFT4G_converted_rankscore"] }, "polyphen2": { "hdiv": { "score": hdiv_score, "rankscore": df["Polyphen2_HDIV_rankscore"], "pred": df["Polyphen2_HDIV_pred"] }, "hvar": { "score": hvar_score, "rankscore": df["Polyphen2_HVAR_rankscore"], "pred": df["Polyphen2_HVAR_pred"] } }, "lrt": { "score": lrt_score, "converted_rankscore": df["LRT_converted_rankscore"], "pred": df["LRT_pred"], "omega": df["LRT_Omega"] }, "mvp": { "score": mvp_score, "rankscore": df["MVP_rankscore"] }, "mpc": { "score": mpc_score, "rankscore": df["MPC_rankscore"] }, "bstatistic": { "score": df['bStatistic'], "rankscore": df["bStatistic_rankscore"] }, "aloft": { "fraction_transcripts_affected": df["Aloft_Fraction_transcripts_affected"].split(';'), "prob_tolerant": df["Aloft_prob_Tolerant"], "prob_recessive": df["Aloft_prob_Recessive"], "prob_dominant": df["Aloft_prob_Dominant"], "pred": df["Aloft_pred"], "confidence": df["Aloft_Confidence"], }, "primateai": { "score": df["PrimateAI_score"], "rankscore": df["PrimateAI_rankscore"], "pred": df["PrimateAI_pred"] }, "mutationtaster": { "score": mutationtaster_score, "converted_rankscore": df["MutationTaster_converted_rankscore"], "pred": df["MutationTaster_pred"], "model": 
df["MutationTaster_model"], "AAE": df["MutationTaster_AAE"] }, "mutationassessor": { "score": mutationassessor_score, "rankscore": df["MutationAssessor_rankscore"], "pred": df["MutationAssessor_pred"] }, "fathmm": { "score": fathmm_score, "rankscore": df["FATHMM_converted_rankscore"], "pred": df["FATHMM_pred"] }, "provean": { "score": provean_score, "rankscore": df["PROVEAN_converted_rankscore"], "pred": df["PROVEAN_pred"] }, "vest4": { "score": vest3_score, "rankscore": df["VEST4_rankscore"] }, "deogen2": { "score": deogen2_score, "rankscore": df["DEOGEN2_rankscore"], "pred": df["DEOGEN2_pred"] }, "fathmm-mkl": { "coding_score": df["fathmm-MKL_coding_score"], "coding_rankscore": df["fathmm-MKL_coding_rankscore"], "coding_pred": df["fathmm-MKL_coding_pred"], "coding_group": df["fathmm-MKL_coding_group"] }, "fathmm-xf": { "coding_score": df["fathmm-XF_coding_score"], "coding_rankscore": df["fathmm-XF_coding_rankscore"], "coding_pred": df["fathmm-XF_coding_pred"] }, "eigen": { "raw_coding": df["Eigen-raw_coding"], "raw_coding_rankscore": df["Eigen-raw_coding_rankscore"], "phred_coding": df["Eigen-pred_coding"] }, "eigen-pc": { "raw_coding": df["Eigen-PC-raw_coding"], "phred_coding": df["Eigen-PC-phred_coding"], "raw_rankscore": df["Eigen-PC-raw_coding_rankscore"] }, "genocanyon": { "score": df["GenoCanyon_score"], "rankscore": df["GenoCanyon_rankscore"] }, "metasvm": { "score": metasvm_score, "rankscore": df["MetaSVM_rankscore"], "pred": df["MetaSVM_pred"] }, "metalr": { "score": metalr_score, "rankscore": df["MetaLR_rankscore"], "pred": df["MetaLR_pred"] }, "reliability_index": df["Reliability_index"], "m_cap_score": { "score": m_cap_score, "rankscore": df["M-CAP_rankscore"], "pred": df["M-CAP_pred"] }, "revel": { "score": revel_score, "rankscore": df["REVEL_rankscore"] }, "mutpred": { "score": df["MutPred_score"], "rankscore": df["MutPred_rankscore"], "accession": df["MutPred_protID"], "aa_change": df["MutPred_AAchange"], "pred": mechanisms }, "dann": { "score": 
df["DANN_score"], "rankscore": df["DANN_rankscore"] }, "gerp++": { "nr": df["GERP++_NR"], "rs": df["GERP++_RS"], "rs_rankscore": df["GERP++_RS_rankscore"] }, "integrated": { "fitcons_score": df["integrated_fitCons_score"], "fitcons_rankscore": df["integrated_fitCons_rankscore"], "confidence_value": df["integrated_confidence_value"] }, "gm12878": { "fitcons_score": df["GM12878_fitCons_score"], "fitcons_rankscore": df["GM12878_fitCons_rankscore"], "confidence_value": df["GM12878_confidence_value"] }, "h1-hesc": { "fitcons_score": df["H1-hESC_fitCons_score"], "fitcons_rankscore": df["H1-hESC_fitCons_rankscore"], "confidence_value": df["H1-hESC_confidence_value"] }, "huvec": { "fitcons_score": df["HUVEC_fitCons_score"], "fitcons_rankscore": df["HUVEC_fitCons_rankscore"], "confidence_value": df["HUVEC_confidence_value"] }, "phylo": { "p100way": { "vertebrate": df["phyloP100way_vertebrate"], "vertebrate_rankscore": df["phyloP100way_vertebrate_rankscore"] }, "p30way": { "mammalian": df["phyloP30way_mammalian"], "mammalian_rankscore": df["phyloP30way_mammalian_rankscore"] }, "p17way": { "primate": df["phyloP17way_primate"], "primate_rankscore": df["phyloP17way_primate_rankscore"] } }, "phastcons": { "100way": { "vertebrate": df["phastCons100way_vertebrate"], "vertebrate_rankscore": df["phastCons100way_vertebrate_rankscore"] }, "30way": { "mammalian": df["phastCons30way_mammalian"], "mammalian_rankscore": df["phastCons30way_mammalian_rankscore"] }, "p17way": { "primate": df["phastCons17way_primate"], "primate_rankscore": df["phastCons17way_primate_rankscore"] } }, "siphy_29way": { "pi": siphy, "logodds": df["SiPhy_29way_logOdds"], "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"] }, "bayesdel": { "add_af": { "score": df["BayesDel_addAF_score"], "rankscore": df["BayesDel_addAF_rankscore"], "pred": df["BayesDel_addAF_pred"] }, "no_af": { "score": df["BayesDel_noAF_score"], "rankscore": df["BayesDel_noAF_rankscore"], "pred": df["BayesDel_noAF_pred"] } }, "clinpred": { 
"score": df["ClinPred_score"], "rankscore": df["ClinPred_rankscore"], "pred": df["ClinPred_pred"] }, "list-s2": { "score": df["LIST-S2_score"], "rankscore": df["LIST-S2_rankscore"], "pred": df["LIST-S2_pred"] }, "1000gp3": { "ac": df["1000Gp3_AC"], "af": df["1000Gp3_AF"], "afr_ac": df["1000Gp3_AFR_AC"], "afr_af": df["1000Gp3_AFR_AF"], "eur_ac": df["1000Gp3_EUR_AC"], "eur_af": df["1000Gp3_EUR_AF"], "amr_ac": df["1000Gp3_AMR_AC"], "amr_af": df["1000Gp3_AMR_AF"], "eas_ac": df["1000Gp3_EAS_AC"], "eas_af": df["1000Gp3_EAS_AF"], "sas_ac": df["1000Gp3_SAS_AC"], "sas_af": df["1000Gp3_SAS_AF"] }, "twinsuk": { "ac": df["TWINSUK_AC"], "af": df["TWINSUK_AF"] }, "alspac": { "ac": df["ALSPAC_AC"], "af": df["ALSPAC_AF"] }, "esp6500": { "aa_ac": df["ESP6500_AA_AC"], "aa_af": df["ESP6500_AA_AF"], "ea_ac": df["ESP6500_EA_AC"], "ea_af": df["ESP6500_EA_AF"] }, "uk10k": { "ac": df["UK10K_AC"], "af": df["UK10K_AF"] }, "exac": { "ac": df["ExAC_AC"], "af": df["ExAC_AF"], "adj_ac": df["ExAC_Adj_AC"], "adj_af": df["ExAC_Adj_AF"], "afr_ac": df["ExAC_AFR_AC"], "afr_af": df["ExAC_AFR_AF"], "amr_ac": df["ExAC_AMR_AC"], "amr_af": df["ExAC_AMR_AF"], "eas_ac": df["ExAC_EAS_AC"], "eas_af": df["ExAC_EAS_AF"], "fin_ac": df["ExAC_FIN_AC"], "fin_af": df["ExAC_FIN_AF"], "nfe_ac": df["ExAC_NFE_AC"], "nfe_af": df["ExAC_NFE_AF"], "sas_ac": df["ExAC_SAS_AC"], "sas_af": df["ExAC_SAS_AF"] }, "exac_nontcga": { "ac": df["ExAC_nonTCGA_AC"], "af": df["ExAC_nonTCGA_AF"], "adj_ac": df["ExAC_nonTCGA_Adj_AC"], "adj_af": df["ExAC_nonTCGA_Adj_AF"], "afr_ac": df["ExAC_nonTCGA_AFR_AC"], "afr_af": df["ExAC_nonTCGA_AFR_AF"], "amr_ac": df["ExAC_nonTCGA_AMR_AC"], "amr_af": df["ExAC_nonTCGA_AMR_AF"], "eas_ac": df["ExAC_nonTCGA_EAS_AC"], "eas_af": df["ExAC_nonTCGA_EAS_AF"], "fin_ac": df["ExAC_nonTCGA_FIN_AC"], "fin_af": df["ExAC_nonTCGA_FIN_AF"], "nfe_ac": df["ExAC_nonTCGA_NFE_AC"], "nfe_af": df["ExAC_nonTCGA_NFE_AF"], "sas_ac": df["ExAC_nonTCGA_SAS_AC"], "sas_af": df["ExAC_nonTCGA_SAS_AF"] }, "exac_nonpsych": { "ac": 
df["ExAC_nonpsych_AC"], "af": df["ExAC_nonpsych_AF"], "adj_ac": df["ExAC_nonpsych_Adj_AC"], "adj_af": df["ExAC_nonpsych_Adj_AF"], "afr_ac": df["ExAC_nonpsych_AFR_AC"], "afr_af": df["ExAC_nonpsych_AFR_AF"], "amr_ac": df["ExAC_nonpsych_AMR_AC"], "amr_af": df["ExAC_nonpsych_AMR_AF"], "eas_ac": df["ExAC_nonpsych_EAS_AC"], "eas_af": df["ExAC_nonpsych_EAS_AF"], "fin_ac": df["ExAC_nonpsych_FIN_AC"], "fin_af": df["ExAC_nonpsych_FIN_AF"], "nfe_ac": df["ExAC_nonpsych_NFE_AC"], "nfe_af": df["ExAC_nonpsych_NFE_AF"], "sas_ac": df["ExAC_nonpsych_SAS_AC"], "sas_af": df["ExAC_nonpsych_SAS_AF"] }, "clinvar": { "clinvar_id": df["clinvar_id"], "clinsig": [i for i in df["clinvar_clnsig"].split("/") if i != "."], "trait": [i for i in df["clinvar_trait"].split("|") if i != "."], "review": [i for i in df["clinvar_review"].split(",") if i != "."], "hgvs": [i for i in df["clinvar_hgvs"].split("|") if i != "."], "omim": [i for i in df["clinvar_OMIM_id"].split("|") if i != "."], "medgen": [i for i in df["clinvar_MedGen_id"].split("|") if i != "."], "orphanet": [i for i in df["clinvar_Orphanet_id"].split("|") if i != "."], "var_source": [i for i in df["clinvar_var_source"].split("|") if i != "."] }, "hgvsc": list( set(df["HGVSc_ANNOVAR"].split(';') + df["HGVSc_snpEff"].split(';') + df["HGVSc_VEP"].split(';'))), "hgvsp": list( set(df["HGVSp_ANNOVAR"].split(';') + df["HGVSp_snpEff"].split(';') + df["HGVSp_VEP"].split(';'))), "gtex": list(gtex), "geuvadis_eqtl_target_gene": df["Geuvadis_eQTL_target_gene"] } } if include_gnomad: one_snp_json['dbnsfp'].update(gnomad) one_snp_json = list_split( dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=[".", '-', "NA", None], remove_invalid_list=True), ";") one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"]) return one_snp_json
def _map_line_to_json(df, version, index=0):
    """Convert one dbNSFP row into a ``{"_id": ..., "dbnsfp": {...}}`` document.

    Args:
        df: mapping from dbNSFP column name to the raw string value of one
            row (accessed only through ``df[column]``).
        version: genome assembly used to build ``_id``; must be ``'hg19'``
            or ``'hg38'``.
        index: unused by this function; kept for interface compatibility.

    Returns:
        ``None`` when the row has no hg19 position (``"."``); otherwise the
        document after number conversion, un-listing, sweeping of
        ``"."``/``"-"``/``None`` values and ``";"`` splitting, with
        ``dbnsfp.chrom`` forced back to ``str``.

    Raises:
        ValueError: if ``version`` is not ``'hg19'`` or ``'hg38'`` (the
            previous code failed later with an unbound ``HGVS`` variable),
            or if a position / MutPred p-value is not numeric.
        KeyError: if an expected column is missing from ``df``.
        IndexError: if ``SiPhy_29way_pi`` has fewer than four ``:`` fields.
    """

    def _scores(column):
        """Split a ';'-separated score column, mapping '.' entries to None."""
        return [None if item == '.' else item for item in df[column].split(';')]

    def _allele_freqs(prefix, groups=()):
        """Build the ac/af sub-document for ``prefix`` and its sub-groups."""
        freqs = {"ac": df[prefix + "_AC"], "af": df[prefix + "_AF"]}
        for group in groups:
            freqs[group.lower() + "_ac"] = df["%s_%s_AC" % (prefix, group)]
            freqs[group.lower() + "_af"] = df["%s_%s_AF" % (prefix, group)]
        return freqs

    def modify_pvalue(pvalue):
        """Turn a 'P = 0.01'-style token into a float."""
        return float(pvalue.strip('P = '))

    # specific variable treatment
    chrom = df["#chr"]
    if chrom == 'M':
        chrom = 'MT'

    # hg18 end position: keep the "." placeholder, otherwise make it an int
    hg18_start = df["hg18_pos(1-based)"]
    hg18_end = hg18_start if hg18_start == "." else int(hg18_start)

    # in case of no hg19 position provided, remove the item
    if df["hg19_pos(1-based)"] == '.':
        return None
    chromStart = int(df["hg19_pos(1-based)"])
    chromEnd = chromStart
    chromStart_38 = int(df["pos(1-based)"])
    ref = df["ref"].upper()
    alt = df["alt"].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    else:
        # Fail early and explicitly instead of hitting an unbound local later.
        raise ValueError(
            "unsupported version %r, expected 'hg19' or 'hg38'" % (version,))

    siphy_29way_pi = df["SiPhy_29way_pi"]
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}

    # pair up the '|'-separated gene / tissue lists position by position
    gtex_gene = df["GTEx_V6_gene"].split('|')
    gtex_tissue = df["GTEx_V6_tissue "].split('|')
    gtex = [{'gene': gene, 'tissue': tissue}
            for gene, tissue in zip(gtex_gene, gtex_tissue)]

    acc = df["Uniprot_acc_Polyphen2"].rstrip().rstrip(';').split(";")
    pos = df["Uniprot_aapos_Polyphen2"].rstrip().rstrip(';').split(";")
    uniprot = [{'acc': a, 'pos': p} for a, p in zip(acc, pos)]

    # split and normalize scores ('.' -> None)
    provean_score = _scores("PROVEAN_score")
    sift_score = _scores("SIFT_score")
    hdiv_score = _scores("Polyphen2_HDIV_score")
    hvar_score = _scores("Polyphen2_HVAR_score")
    lrt_score = _scores("LRT_score")
    m_cap_score = _scores("M-CAP_score")
    mutationtaster_score = _scores("MutationTaster_score")
    mutationassessor_score = _scores("MutationAssessor_score")
    vest3_score = _scores("VEST3_score")
    metasvm_score = _scores("MetaSVM_score")
    fathmm_score = _scores("FATHMM_score")
    metalr_score = _scores("MetaLR_score")
    revel_score = _scores("REVEL_score")

    # parse mutpred top 5 features: "name (P = x);name (P = y);..."
    mutpred_raw = df["MutPred_Top5features"]
    if mutpred_raw in ('.', ',', '-'):
        mechanisms = '.'
    else:
        tokens = []
        for feature in mutpred_raw.split(";"):
            tokens.extend(feature.rstrip(")").split(" ("))
        # Tokens alternate name / p-value; keep at most the first five pairs
        # and tolerate rows that list fewer than five features.
        pairs = list(zip(tokens[0::2], tokens[1::2]))[:5]
        mechanisms = [
            {"mechanism": name, "p_val": modify_pvalue(p_value)}
            for name, p_value in pairs
        ]

    exac_groups = ("Adj", "AFR", "AMR", "EAS", "FIN", "NFE", "SAS")

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": df["rs_dbSNP147"],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": df["hg18_pos(1-based)"],
                "end": hg18_end
            },
            "hg38": {
                "start": df["pos(1-based)"],
                "end": df["pos(1-based)"]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": df["aaref"],
                "alt": df["aaalt"],
                "pos": df["aapos"],
                "refcodon": df["refcodon"],
                "codonpos": df["codonpos"],
                "codon_degeneracy": df["codon_degeneracy"],
            },
            "genename": df["genename"],
            "uniprot": uniprot,
            "interpro_domain": df["Interpro_domain"],
            "cds_strand": df["cds_strand"],
            "ancestral_allele": df["Ancestral_allele"],
            "ensembl": {
                "geneid": df["Ensembl_geneid"],
                "transcriptid": df["Ensembl_transcriptid"],
                "proteinid": df["Ensembl_proteinid"]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": df["SIFT_converted_rankscore"],
                "pred": df["SIFT_pred"]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": df["Polyphen2_HDIV_rankscore"],
                    "pred": df["Polyphen2_HDIV_pred"]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": df["Polyphen2_HVAR_rankscore"],
                    "pred": df["Polyphen2_HVAR_pred"]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": df["LRT_converted_rankscore"],
                "pred": df["LRT_pred"],
                "omega": df["LRT_Omega"]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": df["MutationTaster_converted_rankscore"],
                "pred": df["MutationTaster_pred"],
                "model": df["MutationTaster_model"],
                "AAE": df["MutationTaster_AAE"]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": df["MutationAssessor_score_rankscore"],
                "pred": df["MutationAssessor_pred"]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": df["FATHMM_converted_rankscore"],
                "pred": df["FATHMM_pred"]
            },
            "provean": {
                "score": provean_score,
                "rankscore": df["PROVEAN_converted_rankscore"],
                "pred": df["PROVEAN_pred"]
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": df["VEST3_rankscore"],
                "transcriptid": df["Transcript_id_VEST3"],
                "transcriptvar": df["Transcript_var_VEST3"]
            },
            "fathmm-mkl": {
                "coding_score": df["fathmm-MKL_coding_score"],
                "coding_rankscore": df["fathmm-MKL_coding_rankscore"],
                "coding_pred": df["fathmm-MKL_coding_pred"],
                "coding_group": df["fathmm-MKL_coding_group"]
            },
            "eigen": {
                "coding_or_noncoding": df["Eigen_coding_or_noncoding"],
                "raw": df["Eigen-raw"],
                "phred": df["Eigen-phred"]
            },
            "eigen-pc": {
                "raw": df["Eigen-PC-raw"],
                "phred": df["Eigen-PC-phred"],
                "raw_rankscore": df["Eigen-PC-raw_rankscore"]
            },
            "genocanyon": {
                "score": df["GenoCanyon_score"],
                "rankscore": df["GenoCanyon_score_rankscore"]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": df["MetaSVM_rankscore"],
                "pred": df["MetaSVM_pred"]
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": df["MetaLR_rankscore"],
                "pred": df["MetaLR_pred"]
            },
            "reliability_index": df["Reliability_index"],
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": df["M-CAP_rankscore"],
                "pred": df["M-CAP_pred"]
            },
            "revel": {
                "score": revel_score,
                "rankscore": df["REVEL_rankscore"]
            },
            "mutpred": {
                "score": df["MutPred_score"],
                "rankscore": df["MutPred_rankscore"],
                "accession": df["MutPred_protID"],
                "aa_change": df["MutPred_AAchange"],
                "pred": mechanisms
            },
            "dann": {
                "score": df["DANN_score"],
                "rankscore": df["DANN_rankscore"]
            },
            "gerp++": {
                "nr": df["GERP++_NR"],
                "rs": df["GERP++_RS"],
                "rs_rankscore": df["GERP++_RS_rankscore"]
            },
            "integrated": {
                "fitcons_score": df["integrated_fitCons_score"],
                "fitcons_rankscore": df["integrated_fitCons_score_rankscore"],
                "confidence_value": df["integrated_confidence_value"]
            },
            "gm12878": {
                "fitcons_score": df["GM12878_fitCons_score"],
                "fitcons_rankscore": df["GM12878_fitCons_score_rankscore"],
                "confidence_value": df["GM12878_confidence_value"]
            },
            "h1-hesc": {
                "fitcons_score": df["H1-hESC_fitCons_score"],
                "fitcons_rankscore": df["H1-hESC_fitCons_score_rankscore"],
                "confidence_value": df["H1-hESC_confidence_value"]
            },
            "huvec": {
                "fitcons_score": df["HUVEC_fitCons_score"],
                "fitcons_rankscore": df["HUVEC_fitCons_score_rankscore"],
                "confidence_value": df["HUVEC_confidence_value"]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": df["phyloP100way_vertebrate"],
                    "vertebrate_rankscore": df["phyloP100way_vertebrate_rankscore"]
                },
                "p20way": {
                    "mammalian": df["phyloP20way_mammalian"],
                    "mammalian_rankscore": df["phyloP20way_mammalian_rankscore"]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": df["phastCons100way_vertebrate"],
                    "vertebrate_rankscore": df["phastCons100way_vertebrate_rankscore"]
                },
                "20way": {
                    "mammalian": df["phastCons20way_mammalian"],
                    "mammalian_rankscore": df["phastCons20way_mammalian_rankscore"]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": df["SiPhy_29way_logOdds"],
                "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"]
            },
            "1000gp3": _allele_freqs(
                "1000Gp3", ("AFR", "EUR", "AMR", "EAS", "SAS")),
            "twinsuk": _allele_freqs("TWINSUK"),
            "alspac": _allele_freqs("ALSPAC"),
            "esp6500": {
                "aa_ac": df["ESP6500_AA_AC"],
                "aa_af": df["ESP6500_AA_AF"],
                "ea_ac": df["ESP6500_EA_AC"],
                "ea_af": df["ESP6500_EA_AF"]
            },
            "exac": _allele_freqs("ExAC", exac_groups),
            "exac_nontcga": _allele_freqs("ExAC_nonTCGA", exac_groups),
            "exac_nonpsych": _allele_freqs("ExAC_nonpsych", exac_groups),
            "clinvar": {
                "rs": df["clinvar_rs"],
                "clinsig": [int(i) for i in df["clinvar_clnsig"].split("|")
                            if i != "."],
                "trait": [i for i in df["clinvar_trait"].split("|") if i != "."],
                "golden_stars": [
                    int(i) for i in df["clinvar_golden_stars"].split("|")
                    if i != "."
                ]
            },
            "gtex": gtex
        }
    }

    one_snp_json = list_split(
        dict_sweep(unlist(value_convert_to_number(one_snp_json)),
                   vals=[".", '-', None]), ";")
    # chrom may have been converted to a number above; documents keep it as str
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
def _map_line_to_json(cp, hg19):
    """Yield one ``{"_id": ..., "clinvar": {...}}`` document per usable Measure.

    Args:
        cp: parsed ClinVar record exposing ``ReferenceClinVarAssertion`` and
            ``ClinVarAssertion`` (presumably a generateDS-style object tree;
            verify against the caller).
        hg19: when truthy the GRCh37 start/stop are used to build ``_id``,
            otherwise the GRCh38 start/stop.

    Yields:
        The swept, number-converted document of every Measure for which an
        HGVS-style id could be derived.

    Notes:
        * Measures of type 'Variation', 'protein only' or 'Microsatellite'
          are skipped.
        * If a Measure has no AttributeSet, or no id can be derived and no
          coding HGVS is available, the generator stops (``return``) and the
          remaining Measures of the record are not processed.
        * NOTE(review): ``chrom`` is only taken from GRCh37 locations, so a
          record with GRCh38-only coordinates never gets a coordinate-based
          id; confirm this is intended.
    """

    def _bucket_hgvs(attribute, buckets):
        """File the attribute value under the first matching HGVS category."""
        attr_type = attribute.Type
        # 'non-coding' has to be tested before 'coding' (substring match).
        for category in ('genomic', 'non-coding', 'coding', 'protein'):
            if category in attr_type:
                buckets[category].append(attribute.get_valueOf_())
                break

    assertion = cp.ReferenceClinVarAssertion

    # Optional elements: fall back to None when any link of the chain is
    # missing.  ``except Exception`` keeps the best-effort behaviour without
    # swallowing KeyboardInterrupt / SystemExit like a bare ``except`` did.
    try:
        clinical_significance = assertion.ClinicalSignificance.Description
    except Exception:
        clinical_significance = None
    rcv_accession = assertion.ClinVarAccession.Acc
    try:
        review_status = assertion.ClinicalSignificance.ReviewStatus
    except Exception:
        review_status = None
    try:
        last_evaluated = assertion.ClinicalSignificance.DateLastEvaluated
    except Exception:
        last_evaluated = None
    variant_id = assertion.MeasureSet.ID
    number_submitters = len(cp.ClinVarAssertion)
    # some items in clinvar_xml don't have origin information
    try:
        origin = assertion.ObservedIn[0].Sample.Origin
    except Exception:
        origin = None

    conditions = []
    for _trait in assertion.TraitSet.Trait:
        synonyms = []
        conditions_name = ''
        for name in _trait.Name:
            if name.ElementValue.Type == 'Alternate':
                synonyms.append(name.ElementValue.get_valueOf_())
            if name.ElementValue.Type == 'Preferred':
                conditions_name += name.ElementValue.get_valueOf_()
        identifiers = {}
        for item in _trait.XRef:
            if item.DB == 'Human Phenotype Ontology':
                key = 'Human_Phenotype_Ontology'
            else:
                key = item.DB
            identifiers[key.lower()] = item.ID
        for symbol in _trait.Symbol:
            if symbol.ElementValue.Type == 'Preferred':
                conditions_name += \
                    ' (' + symbol.ElementValue.get_valueOf_() + ')'
        age_of_onset = ''
        for _set in _trait.AttributeSet:
            if _set.Attribute.Type == 'age of onset':
                age_of_onset = _set.Attribute.get_valueOf_()
        conditions.append({
            "name": conditions_name,
            "synonyms": synonyms,
            "identifiers": identifiers,
            "age_of_onset": age_of_onset
        })

    # MeasureSet.Measure returns a list, there might be multiple
    # Measure under one MeasureSet
    for Measure in assertion.MeasureSet.Measure:
        variation_type = Measure.Type
        # exclude any item of which types belong to
        # 'Variation', 'protein only' or 'Microsatellite'
        if variation_type in ('Variation', 'protein only', 'Microsatellite'):
            continue
        allele_id = Measure.ID
        chrom = None
        chromStart_19 = None
        chromEnd_19 = None
        chromStart_38 = None
        chromEnd_38 = None
        ref = None
        alt = None
        if Measure.SequenceLocation:
            for SequenceLocation in Measure.SequenceLocation:
                # chrom / ref / alt are primarily taken from GRCh37
                if 'GRCh37' in SequenceLocation.Assembly:
                    chrom = SequenceLocation.Chr
                    chromStart_19 = SequenceLocation.start
                    chromEnd_19 = SequenceLocation.stop
                    ref = SequenceLocation.referenceAllele
                    alt = SequenceLocation.alternateAllele
                if 'GRCh38' in SequenceLocation.Assembly:
                    chromStart_38 = SequenceLocation.start
                    chromEnd_38 = SequenceLocation.stop
                    # only fill alleles that GRCh37 did not provide
                    if not ref:
                        ref = SequenceLocation.referenceAllele
                    if not alt:
                        alt = SequenceLocation.alternateAllele
        if Measure.MeasureRelationship:
            try:
                symbol = Measure.MeasureRelationship[0].\
                    Symbol[0].get_ElementValue().valueOf_
            except Exception:
                symbol = None
            gene_id = Measure.MeasureRelationship[0].XRef[0].ID
        else:
            symbol = None
            gene_id = None
        if Measure.Name:
            name = Measure.Name[0].ElementValue.valueOf_
        else:
            name = None
        if len(Measure.CytogeneticLocation) == 1:
            cytogenic = Measure.CytogeneticLocation[0]
        else:
            cytogenic = Measure.CytogeneticLocation
        hgvs_coding = None
        hgvs_genome = None
        HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
        coding_hgvs_only = None
        hgvs_id = None
        if hg19:
            chromStart = chromStart_19
            chromEnd = chromEnd_19
        else:
            chromStart = chromStart_38
            chromEnd = chromEnd_38

        if not Measure.AttributeSet:
            logging.debug('no measure.attribute %s', rcv_accession)
            return

        # 'copy number loss' or 'gain' have a format different
        # from other types, should be dealt with separately
        if variation_type in ('copy number loss', 'copy number gain'):
            for AttributeSet in Measure.AttributeSet:
                attribute = AttributeSet.Attribute
                if 'HGVS, genomic, top level' in attribute.Type:
                    if attribute.integerValue == 37:
                        hgvs_genome = attribute.get_valueOf_()
                _bucket_hgvs(attribute, HGVS)
        else:
            for AttributeSet in Measure.AttributeSet:
                attribute = AttributeSet.Attribute
                _bucket_hgvs(attribute, HGVS)
                if attribute.Type == 'HGVS, coding, RefSeq':
                    hgvs_coding = attribute.get_valueOf_()
                elif attribute.Type == 'HGVS, genomic, top level, previous':
                    hgvs_genome = attribute.get_valueOf_()
                    break

        if chrom and chromStart and chromEnd:
            if variation_type == 'single nucleotide variant':
                hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
            # items whose type belong to 'Indel, Insertion,
            # Duplication' might not have explicit alt information,
            # so we will parse from hgvs_genome
            elif variation_type == 'Indel':
                # to_do: hgvs_genome should distinguish hg19 and hg38
                # RCV000156073, NC_000010.10:g.112581638_112581639delinsG
                # Require the 'ins' marker (as the Insertion / Duplication
                # branches do) so a missing marker cannot yield a bogus id.
                if hgvs_genome and 'ins' in hgvs_genome:
                    indel_position = hgvs_genome.find('ins')
                    indel_alt = hgvs_genome[indel_position + 3:]
                    if chromStart == chromEnd:
                        hgvs_id = "chr%s:g.%sdelins%s" % \
                            (chrom, chromStart, indel_alt)
                    else:
                        hgvs_id = "chr%s:g.%s_%sdelins%s" % \
                            (chrom, chromStart, chromEnd, indel_alt)
            elif variation_type == 'Deletion':
                if chromStart == chromEnd:
                    # RCV000048406, chr17:g.41243547del
                    hgvs_id = "chr%s:g.%sdel" % (chrom, chromStart)
                else:
                    hgvs_id = "chr%s:g.%s_%sdel" % \
                        (chrom, chromStart, chromEnd)
            elif variation_type == 'Insertion':
                if hgvs_genome and 'ins' in hgvs_genome:
                    ins_position = hgvs_genome.find('ins')
                    ins_ref = hgvs_genome[ins_position + 3:]
                    hgvs_id = "chr%s:g.%s_%sins%s" % \
                        (chrom, chromStart, chromEnd, ins_ref)
            elif variation_type == 'Duplication':
                if hgvs_genome and 'dup' in hgvs_genome:
                    dup_position = hgvs_genome.find('dup')
                    dup_ref = hgvs_genome[dup_position + 3:]
                    if chromStart == chromEnd:
                        hgvs_id = "chr%s:g.%sdup%s" % \
                            (chrom, chromStart, dup_ref)
                    else:
                        hgvs_id = "chr%s:g.%s_%sdup%s" % \
                            (chrom, chromStart, chromEnd, dup_ref)
        elif variation_type in ('copy number loss', 'copy number gain'):
            if hgvs_genome and chrom:
                # keep what follows "<accession>.<version>:g."
                hgvs_id = "chr" + chrom + ":g." + hgvs_genome.split('.')[2]
        elif hgvs_coding:
            hgvs_id = hgvs_coding
            coding_hgvs_only = True
        else:
            # couldn't find any id for this record
            return

        for key in HGVS:
            HGVS[key].sort()
        rsid = []
        cosmic = None
        dbvar = None
        uniprot = None
        omim = None
        # loop through XRef to find rsid as well as other ids
        if Measure.XRef:
            for XRef in Measure.XRef:
                # multiple rsid could be linked to one hgvs id
                if XRef.Type == 'rs':
                    rsid.append('rs' + str(XRef.ID))
                elif XRef.DB == 'COSMIC':
                    cosmic = XRef.ID
                elif XRef.DB == 'OMIM':
                    omim = XRef.ID
                elif XRef.DB == 'UniProtKB/Swiss-Prot':
                    uniprot = XRef.ID
                elif XRef.DB == 'dbVar':
                    dbvar = XRef.ID

        # make sure the hgvs_id is not none
        if hgvs_id:
            one_snp_json = {
                "_id": hgvs_id,
                "clinvar": {
                    "allele_id": allele_id,
                    "variant_id": variant_id,
                    "chrom": chrom,
                    "omim": omim,
                    "cosmic": cosmic,
                    "uniprot": uniprot,
                    "dbvar": dbvar,
                    "hg19": {
                        "start": chromStart_19,
                        "end": chromEnd_19
                    },
                    "hg38": {
                        "start": chromStart_38,
                        "end": chromEnd_38
                    },
                    "type": variation_type,
                    "gene": {
                        "id": gene_id,
                        "symbol": symbol
                    },
                    "rcv": {
                        "accession": rcv_accession,
                        "clinical_significance": clinical_significance,
                        "number_submitters": number_submitters,
                        "review_status": review_status,
                        "last_evaluated": str(last_evaluated),
                        "preferred_name": name,
                        "origin": origin,
                        "conditions": conditions
                    },
                    "rsid": rsid,
                    "cytogenic": cytogenic,
                    "hgvs": HGVS,
                    "coding_hgvs_only": coding_hgvs_only,
                    "ref": ref,
                    "alt": alt
                }
            }
            obj = dict_sweep(
                unlist(
                    value_convert_to_number(one_snp_json, [
                        'chrom', 'omim', 'id', 'orphanet', 'gene',
                        'rettbase_(cdkl5)', 'cosmic', 'dbrbc'
                    ])), [None, '', 'None'])
            yield obj
def _map_line_to_json(item, keys):
    """Yield one ``gnomad_genome`` document per ALT allele of a VCF record.

    Args:
        item: VCF record exposing ``CHROM``, ``POS``, ``REF``, ``ALT``,
            ``INFO``, ``FILTER`` and ``ID``.  ``item.ALT`` is replaced in
            place by the list of its string representations.
        keys: INFO field names to copy into the per-prefix sub-documents.

    Yields:
        ``{"_id": <hgvs>, "gnomad_genome": {...}}`` after number conversion
        (``chrom`` excluded), un-listing and removal of ``None`` values.

    Nothing is yielded when the chromosome is not in ``CHROM_VALID_VALUES``;
    iteration stops at the first allele for which no HGVS id is produced.
    """
    prefixes = ["AC", "AF", "AN", "Hom", "GC", "Hemi"]
    # INFO values under these prefixes hold one entry per ALT allele.
    allele_indexed = ['AC', 'AF', 'Hom', 'Hemi']

    chrom = str(item.CHROM)
    if chrom not in CHROM_VALID_VALUES:
        return

    position = item.POS
    ref = item.REF
    info = item.INFO
    record_filter = item.FILTER
    rsid = item.ID

    def optional(field):
        """Return ``info[field]`` or None when the record lacks the field."""
        return info[field] if field in info else None

    # the following values could be missing in the vcf record;
    # an infinite VQSLOD is treated as missing too
    vqslod = None
    if 'VQSLOD' in info and info['VQSLOD'] != math.inf:
        vqslod = info['VQSLOD']
    vqsr_culprit = optional('VQSR_culprit')
    baseqranksum = optional('BaseQRankSum')
    clippingranksum = optional('ClippingRankSum')
    mqranksum = optional('MQRankSum')
    readposranksum = optional('ReadPosRankSum')
    qd = optional('QD')
    inbreedingcoeff = optional('InbreedingCoeff')

    # convert vcf objects to strings
    item.ALT = [str(allele) for allele in item.ALT]

    # for multi-allelic sites, list the ids of all alleles on every document
    hgvs_list = None
    if len(item.ALT) > 1:
        hgvs_list = []
        for allele in item.ALT:
            hgvs_list.append(
                get_hgvs_from_vcf(chrom, position, ref, allele,
                                  mutant_type=False))

    for i, alt in enumerate(item.ALT):
        HGVS, var_type = get_hgvs_from_vcf(chrom, position, ref, alt,
                                           mutant_type=True)
        if HGVS is None:
            return
        assert len(item.ALT) == len(
            info['AC']
        ), "Expecting length of item.ALT= length of info.AC, but not for %s" % (
            HGVS)
        assert len(item.ALT) == len(
            info['AF']
        ), "Expecting length of item.ALT= length of info.AF, but not for %s" % (
            HGVS)

        annotation = {
            "chrom": chrom,
            "pos": position,
            "filter": record_filter,
            "multi-allelic": hgvs_list,
            "ref": ref,
            "alt": alt,
            "alleles": item.ALT,
            "type": var_type,
            "rsid": rsid,
            "baseqranksum": baseqranksum,
            "clippingranksum": clippingranksum,
            "fs": info['FS'],
            "inbreedingcoeff": inbreedingcoeff,
            "mq": {
                "mq": info['MQ'],
                "mqranksum": mqranksum
            },
            "qd": qd,
            "readposranksum": readposranksum,
            "vqslod": vqslod,
            "vqsr_culprit": vqsr_culprit
        }
        # one holder per prefix, e.g. 'ac', 'af', 'gc'
        for prefix in prefixes:
            annotation[prefix.lower()] = {}

        # file every available key under each prefix it starts with
        for _key in keys:
            if _key not in info:
                continue
            for prefix in prefixes:
                if not _key.startswith(prefix):
                    continue
                if prefix in allele_indexed:
                    # per-allele value: pick the entry of the current ALT
                    value = info[_key][i]
                else:
                    value = info[_key]
                annotation[prefix.lower()][_key.lower()] = value

        one_snp_json = {"_id": HGVS, "gnomad_genome": annotation}
        converted = value_convert_to_number(one_snp_json,
                                            skipped_keys=['chrom'])
        yield dict_sweep(unlist(converted), [None])
def _map_line_to_json(cp, hg19): try: clinical_significance = cp.ReferenceClinVarAssertion.\ ClinicalSignificance.Description except: clinical_significance = None rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc try: review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\ ReviewStatus except: review_status = None try: last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\ DateLastEvaluated except: last_evaluated = None number_submitters = len(cp.ClinVarAssertion) # some items in clinvar_xml doesn't have origin information try: origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin except: origin = None conditions = [] for _trait in cp.ReferenceClinVarAssertion.TraitSet.Trait: synonyms = [] conditions_name = '' for name in _trait.Name: if name.ElementValue.Type == 'Alternate': synonyms.append(name.ElementValue.get_valueOf_()) if name.ElementValue.Type == 'Preferred': conditions_name += name.ElementValue.get_valueOf_() identifiers = {} for item in _trait.XRef: if item.DB == 'Human Phenotype Ontology': key = 'Human_Phenotype_Ontology' else: key = item.DB identifiers[key.lower()] = item.ID for symbol in _trait.Symbol: if symbol.ElementValue.Type == 'Preferred': conditions_name += ' (' + symbol.ElementValue.get_valueOf_() + ')' age_of_onset = '' for _set in _trait.AttributeSet: if _set.Attribute.Type == 'age of onset': age_of_onset = _set.Attribute.get_valueOf_() conditions.append({"name": conditions_name, "synonyms": synonyms, "identifiers": identifiers, "age_of_onset": age_of_onset}) try: genotypeset = cp.ReferenceClinVarAssertion.GenotypeSet except: genotypeset = None if genotypeset: obj_list = [] id_list = [] for _set in cp.ReferenceClinVarAssertion.GenotypeSet.MeasureSet: variant_id = _set.ID for _measure in _set.Measure: json_obj = parse_measure(_measure, hg19=hg19) if json_obj: json_obj['clinvar']['rcv'].update({'accession': rcv_accession, 'clinical_significance': clinical_significance, 'number_submitters': 
number_submitters, 'review_status': review_status, 'last_evaluated': str(last_evaluated), 'origin': origin, 'conditions': conditions}) json_obj['clinvar'].update({'variant_id': variant_id}) json_obj = (dict_sweep(unlist(value_convert_to_number(json_obj, ['chrom', 'omim', 'id', 'orphanet', 'gene', 'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])), [None, '', 'None'])) obj_list.append(json_obj) id_list.append(json_obj['_id']) for _obj in obj_list: _obj['clinvar'].update({'genotypeset': { 'type': 'CompoundHeterozygote', 'genotype': id_list }}) yield _obj else: variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID for _measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure: json_obj = parse_measure(_measure, hg19=hg19) if json_obj: json_obj['clinvar']['rcv'].update({'accession': rcv_accession, 'clinical_significance': clinical_significance, 'number_submitters': number_submitters, 'review_status': review_status, 'last_evaluated': str(last_evaluated), 'origin': origin, 'conditions': conditions}) json_obj['clinvar'].update({'variant_id': variant_id}) json_obj = (dict_sweep(unlist(value_convert_to_number(json_obj, ['chrom', 'omim', 'id', 'orphanet', 'gene', 'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])), [None, '', 'None'])) yield json_obj
def _map_line_to_json(df):
    """Build the ``ccle`` document for one CCLE mutation row.

    ``df`` is indexed by column name.  The ``_id`` comes from
    ``get_hgvs_from_vcf``; for deletions (``tumor_seq_allele1 == '-'``) and
    insertions (``reference_allele == '-'``) the position is shifted back by
    one and an ``N`` anchor base is prepended to both alleles.  The
    annotation dict is passed through ``dict_sweep`` and the whole document
    through ``value_convert_to_number``, after which ``chrom`` is forced
    back to a string.
    """
    chrom = df['chromosome']
    if chrom == 'M':
        chrom = 'MT'
    ref = df["reference_allele"]
    alt = df["tumor_seq_allele1"]
    start = int(df['start_position'])

    if alt == '-':
        # deletion
        coordinates = (start - 1, 'N' + ref, 'N')
    elif ref == '-':
        # insertion
        coordinates = (start - 1, 'N', 'N' + alt)
    else:
        coordinates = (start, ref, alt)
    hgvs_id = get_hgvs_from_vcf(chrom, *coordinates, mutant_type=False)

    def flag(column):
        # 'TRUE'/'FALSE' text columns are converted with to_boolean.
        return to_boolean(df[column], true_str=['TRUE'], false_str=['FALSE'])

    annotation = dict_sweep({
        'gene': {
            'id': df['entrez_gene_id'],
            'symbol': df['hugo_symbol']
        },
        'chrom': chrom,
        'hg19': {
            'start': df['start_position'],
            'end': df['end_position']
        },
        'strand': df['strand'],
        'class': df['variant_classification'],
        'vartype': df['variant_type'],
        'ref': df['reference_allele'],
        'tumor_seq_allele1': df['tumor_seq_allele1'],
        'dbsnp': {
            'rsid': df['dbsnp_rs'],
            'val_status': df['dbsnp_val_status']
        },
        'genome_change': df['genome_change'],
        'annotation_transcript': df['annotation_transcript'],
        'tumor_sample_barcode': df['tumor_sample_barcode'],
        'cdna_change': df['cdna_change'],
        'codon_change': df['codon_change'],
        'protein_change': df['protein_change'],
        'isdeleterious': flag('isdeleterious'),
        'istcgahotspot': flag('istcgahotspot'),
        'tcgahscnt': df['tcgahscnt'],
        'iscosmichotspot': flag('iscosmichotspot'),
        'cosmichscnt': df['cosmichscnt'],
        'exac_af': df['exac_af'],
        'wes_ac': df['wes_ac'],
        'sanger': {
            'wes_ac': df['sangerwes_ac'],
            'recalibwes_ac': df['sangerrecalibwes_ac']
        },
        'rnaseq_ac': df['rnaseq_ac'],
        'hc_ac': df['hc_ac'],
        'rd_ac': df['rd_ac'],
        'wgs_ac': df['wgs_ac'],
        'broad_id': df['broad_id']
    })

    document = value_convert_to_number({"_id": hgvs_id, "ccle": annotation})
    # Number conversion is applied to the whole document; keep chrom a string.
    document['ccle']['chrom'] = str(document['ccle']['chrom'])
    return document
def _map_line_to_json(fields):
    """Build the ``cosmic`` document for one row of a COSMIC export.

    ``fields`` holds the row's columns and must contain exactly
    ``VALID_COLUMN_NO`` entries.  The ``_id`` is derived from the genome
    position (``fields[17]``) and the CDS change (``fields[13]``), checking
    substitution, insertion, deletion and deletion-insertion patterns in
    that order.  When none matches, the raw ``fields[12]`` value is used as
    the id and a diagnostic line is printed.  Returns ``None`` when no id
    could be determined; otherwise the document is passed through
    ``value_convert_to_number`` and ``dict_sweep`` (dropping empty strings).
    """
    assert len(fields) == VALID_COLUMN_NO
    # fields[17]: Mutation GRCh37 genome position, tokenised into
    # chromosome, start and end.
    position_tokens = re.findall(r"[\w']+", fields[17])
    chrom = position_tokens[0]
    start = position_tokens[1]
    end = position_tokens[2]
    cds = fields[13]

    substitution = re.search(r'[ATCGMNHKRY]+>[ATCGMNHKRY]+', cds)
    insertion = re.search(r'ins[ATCGMN]+|ins[0-9]+', cds)
    if substitution:
        hgvs_id = "chr%s:g.%s%s" % (chrom, start, substitution.group())
    elif insertion:
        hgvs_id = "chr%s:g.%s_%s%s" % (chrom, start, end, insertion.group())
    elif 'del' in cds:
        hgvs_id = "chr%s:g.%s_%sdel" % (chrom, start, end)
    elif re.search(r'[0-9]+>[ATCGMN]+', cds):
        # deletion-insertion: the first run of bases in the CDS string is
        # used as the inserted sequence
        inserted = re.search(r'[ATCGMN]+', cds)
        hgvs_id = "chr%s:g.%s_%sdelins%s" % (chrom, start, end,
                                             inserted.group())
    else:
        hgvs_id = fields[12]
        print("Error2:", fields[15], cds, fields[17])

    if hgvs_id is None:
        return None

    gene = {
        "symbol": fields[0],  # Gene name
        "id": fields[3],  # HGNC ID
        "cds_length": fields[2]
    }
    sample = {
        "name": fields[4],  # Sample name
        "id": fields[5]  # ID_sample
    }
    tumour = {
        "id": fields[6],  # ID_tumour
        "primary_site": fields[7],  # Primary site
        "site_subtype": fields[8],  # Site subtype
        "primary_histology": fields[9],  # Primary histology
        "histology_subtype": fields[10],  # Histology subtype
        # NOTE(review): fields[1] is also used as "transcript" (Accession
        # Number) below -- confirm this is the intended tumour-origin column.
        "origin": fields[1]
    }
    mutation = {
        "id": "COSM" + fields[12],  # Mutation ID
        "cds": cds,  # Mutation CDS
        "aa": fields[14],  # Mutation AA
        "description": fields[15],  # Mutation Description
        "zygosity": fields[16],  # Mutation zygosity
        "somatic_status": fields[21]  # Mutation somatic status
    }
    document = {
        "sorter": fields[17] + fields[13],
        "_id": hgvs_id,
        "cosmic": {
            "gene": gene,
            "transcript": fields[1],  # Accession Number
            "sample": sample,
            "tumour": tumour,
            "mutation": mutation,
            "chrom": chrom,
            "hg19": {
                "start": start,
                "end": end
            },
            "pubmed": fields[22]  # Pubmed_PMID
        }
    }
    return dict_sweep(value_convert_to_number(document), vals=[""])
# External-identifier resources whose field name is NOT simply the
# lower-cased resource name with '-' and ' ' replaced by '_'.
_DRUGBANK_IDENTIFIER_FIELDS = {
    "Drugs Product Database (DPD)": 'dpd',
    "National Drug Code Directory": 'ndc_directory',
}

# Product sub-fields that are kept, mapped to their output names.
_DRUGBANK_PRODUCT_FIELDS = {
    'name': 'name',
    'dosage-form': 'dosage_form',
    'strength': 'strength',
    'route': 'route',
    'over-the-counter': 'otc',
    'generic': 'generic',
    'ndc-id': 'ndc_id',
    'ndc-product-code': 'ndc_product_code',
    'dpd-id': 'dpd',
    'started-marketing-on': 'started_marketing_on',
    'ended-marketing-on': 'ended_marketing_on',
    'fda-application-number': 'fda_application_number',
    'approved': 'approved',
    'country': 'country',
    'source': 'source',
}

# Keys copied into the 'pharmacology' sub-document whatever their value.
_DRUGBANK_PHARMACOLOGY_FIELDS = (
    'description', 'indication', 'pharmacodynamics', 'mechanism-of-action',
    'toxicity', 'metabolism', 'absorption', 'half-life', 'protein-binding',
    'route-of-elimination', 'volume-of-distribution', 'clearance',
)

# Keys copied into the 'pharmacology' sub-document only when non-empty.
_DRUGBANK_OPTIONAL_PHARMACOLOGY_FIELDS = (
    'snp-effects', 'snp-adverse-drug-reactions',
)

# Calculated properties that are also promoted to top-level fields.
_DRUGBANK_PROMOTED_PROPERTIES = {
    "IUPAC Name": 'iupac',
    "SMILES": 'smiles',
    "Molecular Formula": 'formula',
    "InChI": 'inchi',
}

# Protein sections: output/section key -> name of the repeated child node.
_DRUGBANK_PROTEIN_SECTIONS = {
    'targets': 'target',
    'enzymes': 'enzyme',
    'transporters': 'transporter',
    'carriers': 'carrier',
}


def _drugbank_as_list(node):
    """Return ``node`` as a list: [] for None, itself if a list, else [node]."""
    if node is None:
        return []
    if isinstance(node, list):
        return node
    return [node]


def _drugbank_has(node, key):
    """Return True when ``node`` is a mapping that contains ``key``."""
    return isinstance(node, dict) and key in node


def _drugbank_product(product):
    """Keep the known product sub-fields, renamed to their output names."""
    return {
        _DRUGBANK_PRODUCT_FIELDS[field]: product[field]
        for field in product if field in _DRUGBANK_PRODUCT_FIELDS
    }


def _drugbank_pathway(pathway):
    """Keep the smpdb id, name, drug list and enzymes of one pathway."""
    restructured = {}
    for field, content in pathway.items():
        if field == 'smpdb-id':
            restructured['smpdb_id'] = content
        elif field in ('name', 'enzymes'):
            restructured[field] = content
        elif field == 'drugs':
            restructured[field] = content['drug']
    return restructured


def _drugbank_experimental_property(prop, collected, doc_id):
    """Store one experimental property in ``collected`` under its kind."""
    if not prop:
        return
    kind = prop['kind'].lower().replace(' ', '_').replace('-', '_')
    if kind != "isoelectric_point":
        collected[kind] = prop['value']
        return
    # make sure values are floats; for intervals, a list of floats
    try:
        collected[kind] = float(prop['value'])
    except ValueError:
        # not a float, maybe a range ? "5.6 - 7.6"
        try:
            bounds = [float(part) for part in prop['value'].split("-")]
        except ValueError:
            # not something we can handle, skip it
            logging.warning(
                "Document ID '%s' has non-convertible  value for "
                "isoelectric_point, field ignored: %s", doc_id, prop['value'])
            return
        logging.info("Document ID '%s' has a range as isoelectric_point: %s",
                     doc_id, bounds)
        collected[kind] = bounds


def _drugbank_calculated_property(prop, d1, predicted):
    """Record one calculated property, promoting the well-known kinds."""
    if not prop:
        return
    kind = prop['kind']
    content = prop['value']
    predicted[kind.lower().replace(' ', '_').replace('-', '_')] = content
    if kind in _DRUGBANK_PROMOTED_PROPERTIES:
        d1[_DRUGBANK_PROMOTED_PROPERTIES[kind]] = content
    elif kind == "InChIKey":
        if content[0:9] == 'InChIKey=':
            content = content[9:]
        d1['inchi_key'] = content
    elif kind == "Molecular Weight":
        # setdefault: the two weights may arrive in either order
        d1.setdefault('weight', {})['average'] = content
    elif kind == "Monoisotopic Weight":
        d1.setdefault('weight', {})['monoisotopic'] = content


def restructure_dict(dictionary):
    """Restructure one parsed DrugBank drug entry into a ``drugbank`` doc.

    ``dictionary`` maps the entry's XML child names to their parsed content
    (nested dicts / lists / strings).  The primary DrugBank id becomes
    ``_id``; the remaining supported sections are flattened and renamed
    under the ``drugbank`` key.  The result is then passed through
    ``unlist``, ``dict_sweep``, ``boolean_convert`` and
    ``value_convert_to_number``.

    Nodes are accepted both as a single mapping and as a list of mappings,
    and as plain ``dict`` as well as ``OrderedDict``.
    """
    restr_dict = {}
    d1 = {}
    pred_properties_dict = {}
    products_list = []
    atccode_list = []
    protein_lists = {section: [] for section in _DRUGBANK_PROTEIN_SECTIONS}

    def pharmacology():
        # Created on first use so no key depends on 'description' coming first.
        return d1.setdefault('pharmacology', {})

    for key, value in dictionary.items():
        field = key.replace('-', '_')

        if key == 'name' and value:
            d1[key] = value

        elif key == 'drugbank-id' and value:
            if isinstance(value, list):
                accession_numbers = []
                for ele in value:
                    if isinstance(ele, dict):
                        assert "@primary" in ele
                        if '#text' in ele:
                            # make sure we always have DB ID as drugbank_id
                            d1['drugbank_id'] = ele['#text']
                            restr_dict['_id'] = ele['#text']
                    if isinstance(ele, str):
                        accession_numbers.append(ele)
                        d1['accession_number'] = accession_numbers
            elif isinstance(value, dict):
                if '#text' in value:
                    d1[field] = [value['#text']]
                    restr_dict['_id'] = value['#text']

        elif key in _DRUGBANK_PHARMACOLOGY_FIELDS:
            pharmacology()[field] = value

        elif key in _DRUGBANK_OPTIONAL_PHARMACOLOGY_FIELDS and value:
            pharmacology()[field] = value

        elif key == 'groups' and value:
            for group in value.values():
                d1[key] = group

        elif key == 'classification' and value:
            d1['taxonomy'] = value

        elif key == 'salts' and value:
            salt_names = []
            for salts in value.values():
                if isinstance(salts, list):
                    for salt in salts:
                        if _drugbank_has(salt, 'name'):
                            salt_names.append(salt['name'])
                            d1[key] = salt_names
                elif isinstance(salts, dict):
                    d1[key] = salts['name']

        elif key == 'synonyms' and value:
            if isinstance(value, dict):
                synonym_names = []
                for synonyms in value.values():
                    for synonym in _drugbank_as_list(synonyms):
                        if _drugbank_has(synonym, '#text'):
                            synonym_names.append(synonym['#text'])
                            d1[key] = synonym_names

        elif key == 'products' and value:
            for products in value.values():
                if isinstance(products, dict):
                    products_list.append(_drugbank_product(products))
                elif isinstance(products, list):
                    for product in products:
                        products_list.append(_drugbank_product(product))

        elif key == 'packagers' and value:
            packager_names = []
            for packagers in value.values():
                for packager in _drugbank_as_list(packagers):
                    if _drugbank_has(packager, 'name') and packager['name']:
                        packager_names.append(packager['name'])
                        d1[key] = packager_names

        elif key == 'manufacturers' and value:
            manufacturer_names = []
            for manufacturers in value.values():
                for manufacturer in _drugbank_as_list(manufacturers):
                    if _drugbank_has(manufacturer, '#text'):
                        manufacturer_names.append(manufacturer['#text'])
                        d1[key] = manufacturer_names

        elif key == 'categories' and value:
            for categories in value.values():
                d1[key] = categories

        elif key == 'affected-organisms' and value:
            pharmacology()[field] = value["affected-organism"]

        elif key == 'ahfs-codes' and value:
            for child in value:
                d1[field] = value[child]

        elif key == 'food-interactions' and value:
            # Stored under 'food_interactions' for single and multiple items.
            for interactions in value.values():
                if isinstance(interactions, list):
                    d1[field] = list(interactions)
                else:
                    d1[field] = interactions

        elif key == 'drug-interactions' and value:
            for interactions in value.values():
                d1[field] = interactions

        elif key == 'sequences' and value:
            for sequence in value.values():
                if _drugbank_has(sequence, '@format'):
                    d1[sequence['@format'] + '_sequences'] = \
                        sequence['#text'].replace('\n', ' ')

        elif key == 'experimental-properties' and value:
            experimental = {}
            for properties in value.values():
                for prop in _drugbank_as_list(properties):
                    if isinstance(prop, dict):
                        _drugbank_experimental_property(
                            prop, experimental, restr_dict.get('_id'))
                        d1[field] = experimental

        elif key == 'calculated-properties' and value:
            for properties in value.values():
                for prop in _drugbank_as_list(properties):
                    if isinstance(prop, dict):
                        _drugbank_calculated_property(
                            prop, d1, pred_properties_dict)

        elif key == 'external-identifiers' and value:
            for identifier in _drugbank_as_list(value['external-identifier']):
                if not _drugbank_has(identifier, 'resource'):
                    continue
                resource = identifier['resource']
                source = _DRUGBANK_IDENTIFIER_FIELDS.get(resource)
                if source is None:
                    source = resource.lower().replace('-', '_').replace(
                        ' ', '_')
                d1[source] = identifier['identifier']

        elif key == 'external-links' and value:
            for link in _drugbank_as_list(value['external-link']):
                try:
                    d1[link['resource'].lower().replace('.', '_')] = link['url']
                except (KeyError, TypeError, AttributeError):
                    # best effort: skip links lacking a usable resource/url
                    pass

        elif key == 'patents' and value:
            if isinstance(value, dict):
                for child in value:
                    d1[key] = value[child]

        elif key == 'international-brands' and value:
            d1[field] = value['international-brand']

        elif key == 'mixtures' and value:
            d1[key] = value['mixture']

        elif key == 'pathways' and value:
            pathways = value['pathway']
            if isinstance(pathways, list):
                restructured = []
                for pathway in pathways:
                    restructured.append(_drugbank_pathway(pathway))
                    d1[key] = restructured
            elif isinstance(pathways, dict):
                d1[key] = _drugbank_pathway(pathways)

        elif key in _DRUGBANK_PROTEIN_SECTIONS and value:
            proteins = value[_DRUGBANK_PROTEIN_SECTIONS[key]]
            if isinstance(proteins, list):
                for protein in proteins:
                    protein_lists[key].append(restr_protein_dict(protein))
            elif isinstance(proteins, dict):
                protein_lists[key].append(restr_protein_dict(proteins))

        elif key == 'atc-codes' and value:
            codes = value['atc-code']
            if isinstance(codes, (list, dict)):
                for code in _drugbank_as_list(codes):
                    if _drugbank_has(code, '@code'):
                        atccode_list.append(code['@code'])

    d1['atc_codes'] = atccode_list
    d1['targets'] = protein_lists['targets']
    d1['carriers'] = protein_lists['carriers']
    d1['enzymes'] = protein_lists['enzymes']
    d1['transporters'] = protein_lists['transporters']
    d1['predicted_properties'] = pred_properties_dict
    d1['products'] = products_list
    restr_dict['drugbank'] = d1

    restr_dict = unlist(restr_dict)
    restr_dict = dict_sweep(restr_dict, vals=[
        None, math.inf, "INF", ".", "-", "", "NA", "none", " ",
        "Not Available", "unknown", "null", "None"
    ])
    restr_dict = boolean_convert(restr_dict, [
        "predicted_properties.mddr_like_rule",
        "predicted_properties.bioavailability",
        "predicted_properties.ghose_filter",
        "predicted_properties.rule_of_five", "products.generic",
        "products.otc", "products.approved", "products.pediatric-extension"
    ])
    # These identifiers look numeric but must stay strings.
    restr_dict = value_convert_to_number(restr_dict, skipped_keys=[
        "dpd", "chemspider", "chebi", "pubchem_compound", "pubchem_substance",
        "bindingdb"
    ])
    return restr_dict
def _map_line_to_json(fields, dbsnp_col):
    """Build the ``grasp`` document for one GRASP row.

    ``fields`` holds the row's columns and must contain exactly
    ``VALID_COLUMN_NO`` entries.  ``dbsnp_col`` must offer
    ``find(query)`` returning an iterable of documents (presumably a MongoDB
    collection of dbSNP records -- confirm against the caller).

    The row's rsid (``fields[8]``) is looked up in ``dbsnp_col``; the
    ``_id`` of the first matching document becomes the ``_id`` of the
    result.  Returns ``None`` when the rsid is ``None`` or nothing matches.
    Only the first match is fetched, not the whole result set.
    """
    assert len(fields) == VALID_COLUMN_NO
    rsid = fields[8]
    if rsid is None:
        return None

    # Only the first matching dbSNP document is used, so do not load the rest.
    doc = next(iter(dbsnp_col.find({"dbsnp.rsid": rsid})), None)
    if doc is None:
        return None

    ancestries = ('total_samples', 'european', 'african', 'east_asian',
                  'indian_south_asian', 'hispanic', 'native', 'micronesian',
                  'arab_me', 'mixed', 'unspecified', 'filipino', 'indonesian')

    def sample_counts(first_column):
        # The per-ancestry counts occupy consecutive columns.
        return {
            name: fields[first_column + offset]
            for offset, name in enumerate(ancestries)
        }

    one_snp_json = {
        "_id": doc['_id'],
        "grasp": {
            'hg19': {
                'chr': fields[5],
                'pos': fields[6]
            },
            'hupfield': fields[1],
            'last_curation_date': fields[2],
            'creation_date': fields[3],
            'srsid': fields[4],
            'publication': {
                'journal': fields[16],
                'title': fields[17],
                'pmid': fields[7],
                'snpid': fields[8],
                'location_within_paper': fields[9],
                'p_value': fields[10],
                'phenotype': fields[11],
                'paper_phenotype_description': fields[12],
                'paper_phenotype_categories': fields[13],
                'date_pub': fields[14]
            },
            'includes_male_female_only_analyses': fields[18],
            'exclusively_male_female': fields[19],
            'initial_sample_description': fields[20],
            'replication_sample_description': fields[21],
            'platform_snps_passing_qc': fields[22],
            'gwas_ancestry_description': fields[23],
            'discovery': sample_counts(25),
            'replication': sample_counts(38),
            'in_gene': fields[51],
            'nearest_gene': fields[52],
            'in_lincrna': fields[53],
            'in_mirna': fields[54],
            'in_mirna_bs': fields[55],
            'oreg_anno': fields[61],
            'conserv_pred_tfbs': fields[62],
            'human_enhancer': fields[63],
            'rna_edit': fields[64],
            'polyphen2': fields[65],
            'sift': fields[66],
            'ls_snp': fields[67],
            'uniprot': fields[68],
            'eqtl_meth_metab_study': fields[69]
        }
    }
    return list_split(
        dict_sweep(unlist(value_convert_to_number(one_snp_json)), [""]), ",")
def _map_line_to_json(df, version, include_gnomad, index=0): # specific variable treatment chrom = df["#chr"] if chrom == 'M': chrom = 'MT' # fields[7] in version 2, represent hg18_pos hg18_end = df["hg18_pos(1-based)"] if hg18_end == ".": hg18_end = "." else: hg18_end = int(hg18_end) # in case of no hg19 position provided, remove the item if df["hg19_pos(1-based)"] == '.': return None else: chromStart = int(df["hg19_pos(1-based)"]) chromEnd = chromStart chromStart_38 = int(df["pos(1-based)"]) ref = df["ref"].upper() alt = df["alt"].upper() HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt) HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt) if version == 'hg19': HGVS = HGVS_19 elif version == 'hg38': HGVS = HGVS_38 siphy_29way_pi = df["SiPhy_29way_pi"] if siphy_29way_pi == ".": siphy = "." else: freq = siphy_29way_pi.split(":") siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]} gtex_gene = df["GTEx_V6p_gene"].split('|') gtex_tissue = df["GTEx_V6p_tissue"].split('|') gtex = map( dict, map(lambda t: zip(('gene', 'tissue'), t), zip(gtex_gene, gtex_tissue))) acc = df["Uniprot_acc_Polyphen2"].rstrip().rstrip(';').split(";") pos = df["Uniprot_aapos_Polyphen2"].rstrip().rstrip(';').split(";") uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos))) provean_score = df["PROVEAN_score"].split(';') sift_score = df["SIFT_score"].split(';') hdiv_score = df["Polyphen2_HDIV_score"].split(';') hvar_score = df["Polyphen2_HVAR_score"].split(';') lrt_score = df["LRT_score"].split(';') m_cap_score = df["M-CAP_score"].split(';') mutationtaster_score = df["MutationTaster_score"].split(';') mutationassessor_score = df["MutationAssessor_score"].split(';') vest3_score = df["VEST3_score"].split(';') metasvm_score = df["MetaSVM_score"].split(';') fathmm_score = df["FATHMM_score"].split(';') metalr_score = df["MetaLR_score"].split(';') revel_score = df["REVEL_score"].split(';') ''' parse mutpred top 5 features ''' def modify_pvalue(pvalue): 
return float(pvalue.strip('P = ')) mutpred_mechanisms = df["MutPred_Top5features"] if mutpred_mechanisms not in ['.', ',', '-']: mutpred_mechanisms = mutpred_mechanisms.split( " (") and mutpred_mechanisms.split(";") mutpred_mechanisms = [m.rstrip(")") for m in mutpred_mechanisms] mutpred_mechanisms = [i.split(" (") for i in mutpred_mechanisms] mutpred_mechanisms = sum(mutpred_mechanisms, []) mechanisms = [{ "mechanism": mutpred_mechanisms[0], "p_val": modify_pvalue(mutpred_mechanisms[1]) }, { "mechanism": mutpred_mechanisms[2], "p_val": modify_pvalue(mutpred_mechanisms[3]) }, { "mechanism": mutpred_mechanisms[4], "p_val": modify_pvalue(mutpred_mechanisms[5]) }, { "mechanism": mutpred_mechanisms[6], "p_val": modify_pvalue(mutpred_mechanisms[7]) }, { "mechanism": mutpred_mechanisms[8], "p_val": modify_pvalue(mutpred_mechanisms[9]) }] else: mechanisms = '.' # normalize scores def norm(arr): return [None if item == '.' else item for item in arr] provean_score = norm(provean_score) sift_score = norm(sift_score) hdiv_score = norm(hdiv_score) hvar_score = norm(hvar_score) lrt_score = norm(lrt_score) m_cap_score = norm(m_cap_score) mutationtaster_score = norm(mutationtaster_score) mutationassessor_score = norm(mutationassessor_score) vest3_score = norm(vest3_score) metasvm_score = norm(metasvm_score) fathmm_score = norm(fathmm_score) metalr_score = norm(metalr_score) revel_score = norm(revel_score) gnomad = { "gnomad_exomes": { "ac": df["gnomAD_exomes_AC"], "an": df["gnomAD_exomes_AN"], "af": df["gnomAD_exomes_AF"], "afr_ac": df["gnomAD_exomes_AFR_AC"], "afr_af": df["gnomAD_exomes_AFR_AF"], "afr_an": df["gnomAD_exomes_AFR_AN"], "amr_ac": df["gnomAD_exomes_AMR_AC"], "amr_an": df["gnomAD_exomes_AMR_AN"], "amr_af": df["gnomAD_exomes_AMR_AF"], "asj_ac": df["gnomAD_exomes_ASJ_AC"], "asj_an": df["gnomAD_exomes_ASJ_AN"], "asj_af": df["gnomAD_exomes_ASJ_AF"], "eas_ac": df["gnomAD_exomes_EAS_AC"], "eas_af": df["gnomAD_exomes_EAS_AF"], "eas_an": df["gnomAD_exomes_EAS_AN"], "fin_ac": 
df["gnomAD_exomes_FIN_AC"], "fin_af": df["gnomAD_exomes_FIN_AF"], "fin_an": df["gnomAD_exomes_FIN_AN"], "nfe_ac": df["gnomAD_exomes_NFE_AC"], "nfe_af": df["gnomAD_exomes_NFE_AF"], "nfe_an": df["gnomAD_exomes_NFE_AN"], "sas_ac": df["gnomAD_exomes_SAS_AC"], "sas_af": df["gnomAD_exomes_SAS_AF"], "sas_an": df["gnomAD_exomes_SAS_AN"], "oth_ac": df["gnomAD_exomes_OTH_AC"], "oth_af": df["gnomAD_exomes_OTH_AF"], "oth_an": df["gnomAD_exomes_OTH_AN"] }, "gnomad_genomes": { "ac": df["gnomAD_genomes_AC"], "an": df["gnomAD_genomes_AN"], "af": df["gnomAD_genomes_AF"], "afr_ac": df["gnomAD_genomes_AFR_AC"], "afr_af": df["gnomAD_genomes_AFR_AF"], "afr_an": df["gnomAD_genomes_AFR_AN"], "amr_ac": df["gnomAD_genomes_AMR_AC"], "amr_an": df["gnomAD_genomes_AMR_AN"], "amr_af": df["gnomAD_genomes_AMR_AF"], "asj_ac": df["gnomAD_genomes_ASJ_AC"], "asj_an": df["gnomAD_genomes_ASJ_AN"], "asj_af": df["gnomAD_genomes_ASJ_AF"], "eas_ac": df["gnomAD_genomes_EAS_AC"], "eas_af": df["gnomAD_genomes_EAS_AF"], "eas_an": df["gnomAD_genomes_EAS_AN"], "fin_ac": df["gnomAD_genomes_FIN_AC"], "fin_af": df["gnomAD_genomes_FIN_AF"], "fin_an": df["gnomAD_genomes_FIN_AN"], "nfe_ac": df["gnomAD_genomes_NFE_AC"], "nfe_af": df["gnomAD_genomes_NFE_AF"], "nfe_an": df["gnomAD_genomes_NFE_AN"], "oth_ac": df["gnomAD_genomes_OTH_AC"], "oth_af": df["gnomAD_genomes_OTH_AF"], "oth_an": df["gnomAD_genomes_OTH_AN"] } } # load as json data one_snp_json = { "_id": HGVS, "dbnsfp": { "rsid": df["rs_dbSNP150"], #"rsid_dbSNP144": fields[6], "chrom": chrom, "hg19": { "start": chromStart, "end": chromEnd }, "hg18": { "start": df["hg18_pos(1-based)"], "end": hg18_end }, "hg38": { "start": df["pos(1-based)"], "end": df["pos(1-based)"] }, "ref": ref, "alt": alt, "aa": { "ref": df["aaref"], "alt": df["aaalt"], "pos": df["aapos"], "refcodon": df["refcodon"], "codonpos": df["codonpos"], "codon_degeneracy": df["codon_degeneracy"], }, "genename": df["genename"], "uniprot": list(uniprot), "interpro_domain": df["Interpro_domain"], 
"cds_strand": df["cds_strand"], "ancestral_allele": df["Ancestral_allele"], #"altaineandertal": fields[17], #"denisova": fields[18] "ensembl": { "geneid": df["Ensembl_geneid"], "transcriptid": df["Ensembl_transcriptid"], "proteinid": df["Ensembl_proteinid"] }, "sift": { "score": sift_score, "converted_rankscore": df["SIFT_converted_rankscore"], "pred": df["SIFT_pred"] }, "polyphen2": { "hdiv": { "score": hdiv_score, "rankscore": df["Polyphen2_HDIV_rankscore"], "pred": df["Polyphen2_HDIV_pred"] }, "hvar": { "score": hvar_score, "rankscore": df["Polyphen2_HVAR_rankscore"], "pred": df["Polyphen2_HVAR_pred"] } }, "lrt": { "score": lrt_score, "converted_rankscore": df["LRT_converted_rankscore"], "pred": df["LRT_pred"], "omega": df["LRT_Omega"] }, "mutationtaster": { "score": mutationtaster_score, "converted_rankscore": df["MutationTaster_converted_rankscore"], "pred": df["MutationTaster_pred"], "model": df["MutationTaster_model"], "AAE": df["MutationTaster_AAE"] }, "mutationassessor": { "score": mutationassessor_score, "rankscore": df["MutationAssessor_score_rankscore"], "pred": df["MutationAssessor_pred"] }, "fathmm": { "score": fathmm_score, "rankscore": df["FATHMM_converted_rankscore"], "pred": df["FATHMM_pred"] }, "provean": { "score": provean_score, "rankscore": df["PROVEAN_converted_rankscore"], "pred": df["PROVEAN_pred"] }, "vest3": { "score": vest3_score, "rankscore": df["VEST3_rankscore"], "transcriptid": df["Transcript_id_VEST3"], "transcriptvar": df["Transcript_var_VEST3"] }, "fathmm-mkl": { "coding_score": df["fathmm-MKL_coding_score"], "coding_rankscore": df["fathmm-MKL_coding_rankscore"], "coding_pred": df["fathmm-MKL_coding_pred"], "coding_group": df["fathmm-MKL_coding_group"] }, "eigen": { "coding_or_noncoding": df["Eigen_coding_or_noncoding"], "raw": df["Eigen-raw"], "phred": df["Eigen-phred"] }, "eigen-pc": { "raw": df["Eigen-PC-raw"], "phred": df["Eigen-PC-phred"], "raw_rankscore": df["Eigen-PC-raw_rankscore"] }, "genocanyon": { "score": 
df["GenoCanyon_score"], "rankscore": df["GenoCanyon_score_rankscore"] }, "metasvm": { "score": metasvm_score, "rankscore": df["MetaSVM_rankscore"], "pred": df["MetaSVM_pred"] }, "metalr": { "score": metalr_score, "rankscore": df["MetaLR_rankscore"], "pred": df["MetaLR_pred"] }, "reliability_index": df["Reliability_index"], "m_cap_score": { "score": m_cap_score, "rankscore": df["M-CAP_rankscore"], "pred": df["M-CAP_pred"] }, "revel": { "score": revel_score, "rankscore": df["REVEL_rankscore"] }, "mutpred": { "score": df["MutPred_score"], "rankscore": df["MutPred_rankscore"], "accession": df["MutPred_protID"], "aa_change": df["MutPred_AAchange"], "pred": mechanisms }, "dann": { "score": df["DANN_score"], "rankscore": df["DANN_rankscore"] }, "gerp++": { "nr": df["GERP++_NR"], "rs": df["GERP++_RS"], "rs_rankscore": df["GERP++_RS_rankscore"] }, "integrated": { "fitcons_score": df["integrated_fitCons_score"], "fitcons_rankscore": df["integrated_fitCons_score_rankscore"], "confidence_value": df["integrated_confidence_value"] }, "gm12878": { "fitcons_score": df["GM12878_fitCons_score"], "fitcons_rankscore": df["GM12878_fitCons_score_rankscore"], "confidence_value": df["GM12878_confidence_value"] }, "h1-hesc": { "fitcons_score": df["H1-hESC_fitCons_score"], "fitcons_rankscore": df["H1-hESC_fitCons_score_rankscore"], "confidence_value": df["H1-hESC_confidence_value"] }, "huvec": { "fitcons_score": df["HUVEC_fitCons_score"], "fitcons_rankscore": df["HUVEC_fitCons_score_rankscore"], "confidence_value": df["HUVEC_confidence_value"] }, "phylo": { "p100way": { "vertebrate": df["phyloP100way_vertebrate"], "vertebrate_rankscore": df["phyloP100way_vertebrate_rankscore"] }, "p20way": { "mammalian": df["phyloP20way_mammalian"], "mammalian_rankscore": df["phyloP20way_mammalian_rankscore"] } }, "phastcons": { "100way": { "vertebrate": df["phastCons100way_vertebrate"], "vertebrate_rankscore": df["phastCons100way_vertebrate_rankscore"] }, "20way": { "mammalian": 
df["phastCons20way_mammalian"], "mammalian_rankscore": df["phastCons20way_mammalian_rankscore"] } }, "siphy_29way": { "pi": siphy, "logodds": df["SiPhy_29way_logOdds"], "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"] }, "1000gp3": { "ac": df["1000Gp3_AC"], "af": df["1000Gp3_AF"], "afr_ac": df["1000Gp3_AFR_AC"], "afr_af": df["1000Gp3_AFR_AF"], "eur_ac": df["1000Gp3_EUR_AC"], "eur_af": df["1000Gp3_EUR_AF"], "amr_ac": df["1000Gp3_AMR_AC"], "amr_af": df["1000Gp3_AMR_AF"], "eas_ac": df["1000Gp3_EAS_AC"], "eas_af": df["1000Gp3_EAS_AF"], "sas_ac": df["1000Gp3_SAS_AC"], "sas_af": df["1000Gp3_SAS_AF"] }, "twinsuk": { "ac": df["TWINSUK_AC"], "af": df["TWINSUK_AF"] }, "alspac": { "ac": df["ALSPAC_AC"], "af": df["ALSPAC_AF"] }, "esp6500": { "aa_ac": df["ESP6500_AA_AC"], "aa_af": df["ESP6500_AA_AF"], "ea_ac": df["ESP6500_EA_AC"], "ea_af": df["ESP6500_EA_AF"] }, "exac": { "ac": df["ExAC_AC"], "af": df["ExAC_AF"], "adj_ac": df["ExAC_Adj_AC"], "adj_af": df["ExAC_Adj_AF"], "afr_ac": df["ExAC_AFR_AC"], "afr_af": df["ExAC_AFR_AF"], "amr_ac": df["ExAC_AMR_AC"], "amr_af": df["ExAC_AMR_AF"], "eas_ac": df["ExAC_EAS_AC"], "eas_af": df["ExAC_EAS_AF"], "fin_ac": df["ExAC_FIN_AC"], "fin_af": df["ExAC_FIN_AF"], "nfe_ac": df["ExAC_NFE_AC"], "nfe_af": df["ExAC_NFE_AF"], "sas_ac": df["ExAC_SAS_AC"], "sas_af": df["ExAC_SAS_AF"] }, "exac_nontcga": { "ac": df["ExAC_nonTCGA_AC"], "af": df["ExAC_nonTCGA_AF"], "adj_ac": df["ExAC_nonTCGA_Adj_AC"], "adj_af": df["ExAC_nonTCGA_Adj_AF"], "afr_ac": df["ExAC_nonTCGA_AFR_AC"], "afr_af": df["ExAC_nonTCGA_AFR_AF"], "amr_ac": df["ExAC_nonTCGA_AMR_AC"], "amr_af": df["ExAC_nonTCGA_AMR_AF"], "eas_ac": df["ExAC_nonTCGA_EAS_AC"], "eas_af": df["ExAC_nonTCGA_EAS_AF"], "fin_ac": df["ExAC_nonTCGA_FIN_AC"], "fin_af": df["ExAC_nonTCGA_FIN_AF"], "nfe_ac": df["ExAC_nonTCGA_NFE_AC"], "nfe_af": df["ExAC_nonTCGA_NFE_AF"], "sas_ac": df["ExAC_nonTCGA_SAS_AC"], "sas_af": df["ExAC_nonTCGA_SAS_AF"] }, "exac_nonpsych": { "ac": df["ExAC_nonpsych_AC"], "af": 
df["ExAC_nonpsych_AF"], "adj_ac": df["ExAC_nonpsych_Adj_AC"], "adj_af": df["ExAC_nonpsych_Adj_AF"], "afr_ac": df["ExAC_nonpsych_AFR_AC"], "afr_af": df["ExAC_nonpsych_AFR_AF"], "amr_ac": df["ExAC_nonpsych_AMR_AC"], "amr_af": df["ExAC_nonpsych_AMR_AF"], "eas_ac": df["ExAC_nonpsych_EAS_AC"], "eas_af": df["ExAC_nonpsych_EAS_AF"], "fin_ac": df["ExAC_nonpsych_FIN_AC"], "fin_af": df["ExAC_nonpsych_FIN_AF"], "nfe_ac": df["ExAC_nonpsych_NFE_AC"], "nfe_af": df["ExAC_nonpsych_NFE_AF"], "sas_ac": df["ExAC_nonpsych_SAS_AC"], "sas_af": df["ExAC_nonpsych_SAS_AF"] }, "clinvar": { "rs": df["clinvar_rs"], "clinsig": list( map(int, [ i for i in df["clinvar_clnsig"].split("|") if i != "." ])), "trait": [i for i in df["clinvar_trait"].split("|") if i != "."], "golden_stars": list( map(int, [ i for i in df["clinvar_golden_stars"].split("|") if i != "." ])) }, "gtex": list(gtex) } } if include_gnomad: one_snp_json['dbnsfp'].update(gnomad) one_snp_json = list_split( dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=[".", '-', None]), ";") one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"]) return one_snp_json
def _map_line_to_json(cp, hg19): try: clinical_significance = cp.ReferenceClinVarAssertion.\ ClinicalSignificance.Description except: clinical_significance = None rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc try: review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\ ReviewStatus except: review_status = None try: last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\ DateLastEvaluated except: last_evaluated = None number_submitters = len(cp.ClinVarAssertion) # some items in clinvar_xml doesn't have origin information try: origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin except: origin = None conditions = [] for _trait in cp.ReferenceClinVarAssertion.TraitSet.Trait: synonyms = [] conditions_name = '' for name in _trait.Name: if name.ElementValue.Type == 'Alternate': synonyms.append(name.ElementValue.get_valueOf_()) if name.ElementValue.Type == 'Preferred': conditions_name += name.ElementValue.get_valueOf_() identifiers = {} for item in _trait.XRef: if item.DB == 'Human Phenotype Ontology': key = 'Human_Phenotype_Ontology' else: key = item.DB identifiers[key.lower()] = item.ID for symbol in _trait.Symbol: if symbol.ElementValue.Type == 'Preferred': conditions_name += ' (' + symbol.ElementValue.get_valueOf_( ) + ')' age_of_onset = '' for _set in _trait.AttributeSet: if _set.Attribute.Type == 'age of onset': age_of_onset = _set.Attribute.get_valueOf_() conditions.append({ "name": conditions_name, "synonyms": synonyms, "identifiers": identifiers, "age_of_onset": age_of_onset }) try: genotypeset = cp.ReferenceClinVarAssertion.GenotypeSet except: genotypeset = None if genotypeset: obj_list = [] id_list = [] for _set in cp.ReferenceClinVarAssertion.GenotypeSet.MeasureSet: variant_id = _set.ID for _measure in _set.Measure: json_obj = parse_measure(_measure, hg19=hg19) if json_obj: json_obj['clinvar']['rcv'].update({ 'accession': rcv_accession, 'clinical_significance': clinical_significance, 
'number_submitters': number_submitters, 'review_status': review_status, 'last_evaluated': str(last_evaluated), 'origin': origin, 'conditions': conditions }) json_obj['clinvar'].update({'variant_id': variant_id}) json_obj = (dict_sweep( unlist( value_convert_to_number(json_obj, [ 'chrom', 'omim', 'id', 'orphanet', 'gene', 'rettbase_(cdkl5)', 'cosmic', 'dbrbc' ])), [None, '', 'None'])) obj_list.append(json_obj) id_list.append(json_obj['_id']) for _obj in obj_list: _obj['clinvar'].update({ 'genotypeset': { 'type': 'CompoundHeterozygote', 'genotype': id_list } }) yield _obj else: variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID for _measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure: json_obj = parse_measure(_measure, hg19=hg19) if json_obj: json_obj['clinvar']['rcv'].update({ 'accession': rcv_accession, 'clinical_significance': clinical_significance, 'number_submitters': number_submitters, 'review_status': review_status, 'last_evaluated': str(last_evaluated), 'origin': origin, 'conditions': conditions }) json_obj['clinvar'].update({'variant_id': variant_id}) json_obj = (dict_sweep( unlist( value_convert_to_number(json_obj, [ 'chrom', 'omim', 'id', 'orphanet', 'gene', 'rettbase_(cdkl5)', 'cosmic', 'dbrbc' ])), [None, '', 'None'])) yield json_obj
def _map_line_to_json(fields, dbsnp_col):
    """Turn one GRASP row into a document keyed by a dbSNP document id.

    ``fields`` must contain exactly ``VALID_COLUMN_NO`` columns; ``fields[8]``
    is the rsid looked up in ``dbsnp_col`` (queried on ``dbsnp.rsid``).  The
    document is built for the first match only, taking its ``_id``.

    Returns ``None`` when the rsid is ``None`` or nothing matches; otherwise
    the document after ``value_convert_to_number``, ``unlist``,
    ``dict_sweep`` (called with ``[""]``) and ``list_split`` (on ``","``).
    """
    assert len(fields) == VALID_COLUMN_NO
    rsid = fields[8]
    if rsid is None:
        return None

    # The whole query result is materialised, but only the first hit is used.
    matches = list(dbsnp_col.find({"dbsnp.rsid": rsid}))
    if not matches:
        return None
    hgvs_id = matches[0]['_id']

    # The discovery and replication sample counts share one column layout,
    # starting at column 25 and column 38 respectively.
    population_keys = (
        'total_samples', 'european', 'african', 'east_asian',
        'indian_south_asian', 'hispanic', 'native', 'micronesian',
        'arab_me', 'mixed', 'unspecified', 'filipino', 'indonesian',
    )
    discovery = {
        key: fields[column]
        for column, key in enumerate(population_keys, start=25)
    }
    replication = {
        key: fields[column]
        for column, key in enumerate(population_keys, start=38)
    }

    publication = {
        'journal': fields[16],
        'title': fields[17],
        'pmid': fields[7],
        'snpid': fields[8],
        'location_within_paper': fields[9],
        'p_value': fields[10],
        'phenotype': fields[11],
        'paper_phenotype_description': fields[12],
        'paper_phenotype_categories': fields[13],
        'date_pub': fields[14]
    }

    grasp = {
        'hg19': {'chr': fields[5], 'pos': fields[6]},
        'hupfield': fields[1],
        'last_curation_date': fields[2],
        'creation_date': fields[3],
        'srsid': fields[4],
        'publication': publication,
        'includes_male_female_only_analyses': fields[18],
        'exclusively_male_female': fields[19],
        'initial_sample_description': fields[20],
        'replication_sample_description': fields[21],
        'platform_snps_passing_qc': fields[22],
        'gwas_ancestry_description': fields[23],
        'discovery': discovery,
        'replication': replication,
        'in_gene': fields[51],
        'nearest_gene': fields[52],
        'in_lincrna': fields[53],
        'in_mirna': fields[54],
        'in_mirna_bs': fields[55],
        'oreg_anno': fields[61],
        'conserv_pred_tfbs': fields[62],
        'human_enhancer': fields[63],
        'rna_edit': fields[64],
        'polyphen2': fields[65],
        'sift': fields[66],
        'ls_snp': fields[67],
        'uniprot': fields[68],
        'eqtl_meth_metab_study': fields[69]
    }

    one_snp_json = {"_id": hgvs_id, "grasp": grasp}
    converted = value_convert_to_number(one_snp_json)
    swept = dict_sweep(unlist(converted), [""])
    return list_split(swept, ",")
def restructure_dict(dictionary):
    """Flatten one PubChem compound record into ``{'_id', 'pubchem'}``.

    Walks the ``PC-Compound_id``, ``PC-Compound_charge``,
    ``PC-Compound_props`` and ``PC-Compound_count`` entries of
    ``dictionary`` (presumably a dict converted from PubChem XML -- confirm
    against the caller) and copies the recognised values into a flat dict.

    Returns the result of ``value_convert_to_number`` applied to
    ``{'_id': <cid>, 'pubchem': <flat dict>}``.

    Raises:
        KeyError: if no compound id was found in ``dictionary``.
    """
    # Property labels whose value is copied verbatim into the named field.
    plain_fields = {
        "Hydrogen Bond Acceptor": "hydrogen_bond_acceptor_count",
        "Hydrogen Bond Donor": "hydrogen_bond_donor_count",
        "Rotatable Bond": "rotatable_bond_count",
        "Log P": "xlogp",
        "Mass": "exact_mass",
        "Molecular Formula": "molecular_formula",
        "Molecular Weight": "molecular_weight",
        "Topological": "topological_polar_surface_area",
        "Weight": "monoisotopic_weight",
        "Compound Complexity": "complexity",
    }
    # Count keys and the output field each one is stored under.
    count_fields = {
        "PC-Count_heavy-atom": "heavy_atom_count",
        "PC-Count_atom-chiral": "chiral_atom_count",
        "PC-Count_atom-chiral-def": "defined_atom_stereocenter_count",
        "PC-Count_atom-chiral-undef": "undefined_atom_stereocenter_count",
        "PC-Count_bond-chiral": "chiral_bond_count",
        "PC-Count_bond-chiral-def": "defined_bond_stereocenter_count",
        "PC-Count_bond-chiral-undef": "undefined_bond_stereocenter_count",
        "PC-Count_isotope-atom": "isotope_atom_count",
        "PC-Count_covalent-unit": "covalently-bonded_unit_count",
        "PC-Count_tautomers": "tautomers_count",
    }

    d = {}
    for key, value in dictionary.items():
        if key == "PC-Compound_id":
            # The innermost value visited last wins as the compound id.
            for cnt in value:
                for inner in value[cnt].values():
                    for cid in inner.values():
                        d["cid"] = cid

        elif key == "PC-Compound_charge":
            d["formal_charge"] = value

        elif key == "PC-Compound_props":
            for cnt in value:
                for ele in value[cnt]:
                    for field, content in ele.items():
                        if field != "PC-InfoData_urn":
                            continue
                        for urn_key, urn in content.items():
                            if urn_key != "PC-Urn":
                                continue
                            # Keep the last value stored under the entry's
                            # value container.
                            holder = ele["PC-InfoData_value"]
                            for value_kind in holder:
                                prop_value = holder[value_kind]
                            # Any value of the URN may name the property.
                            for label in urn.values():
                                if label == "IUPAC Name":
                                    # A fresh one-entry dict each time, so
                                    # only the last IUPAC name survives.
                                    d["iupac"] = {
                                        urn["PC-Urn_name"].lower(): prop_value
                                    }
                                elif label == "SMILES":
                                    # Same replace-not-merge behaviour.
                                    d["smiles"] = {
                                        urn["PC-Urn_name"].lower(): prop_value
                                    }
                                elif label == "InChI":
                                    d["inchi"] = prop_value
                                    break
                                elif label == "InChIKey":
                                    d["inchi_key"] = prop_value
                                    break
                                elif (isinstance(label, str)
                                      and label in plain_fields):
                                    d[plain_fields[label]] = prop_value

        elif key == "PC-Compound_count":
            for cnt in value:
                for count_key, count in value[cnt].items():
                    if count_key in count_fields:
                        d[count_fields[count_key]] = count

    restr_dict = {'_id': d["cid"], "pubchem": d}
    return value_convert_to_number(restr_dict)
def _map_line_to_json(df):
    """Build the BioMuta document for one input row.

    ``df`` is indexed by column name.  The ``_id`` comes from
    ``get_hgvs_from_vcf`` (chromosome ``'M'`` is renamed ``'MT'`` first).
    Most columns go through ``clean_data`` with ``("-",)``; list-like and
    site columns go through ``to_list``, ``site_prd_parser`` and
    ``site_ann_parser``.

    Returns the document after ``value_convert_to_number``, with ``chrom``
    and the nested ``do_id`` forced back to ``str``.

    Raises:
        AssertionError: if both the ``do_id`` column and the id parsed from
            ``do_name`` are truthy but differ.
    """
    chrom, pos = df["chr_id"], df["chr_pos"]
    chrom = 'MT' if chrom == 'M' else chrom
    ref, alt = df["ref_nt"], df["alt_nt"]
    hgvs_id = get_hgvs_from_vcf(chrom, int(pos), ref, alt, mutant_type=False)

    # Columns cleaned and copied under their own name, in output order.
    cleaned_columns = (
        "transcript_id", "peptide_id", "uniprot_ac", "refseq_ac", "cds_pos",
        "pep_pos", "uniprot_pos", "ref_aa", "alt_aa", "mut_freq", "data_src",
    )
    placeholder = ("-",)
    cleaned = {
        column: clean_data(df[column], placeholder)
        for column in cleaned_columns
    }

    do_id = clean_data(df["do_id"], placeholder)
    do_name_id, do_name = do_name_split(df["do_name"])
    if do_id and do_name_id:
        assert do_id == do_name_id, "do_id mismatch!"

    uberon_id = to_list(df["uberon_id"])
    gene_name = clean_data(df["gene_name"], placeholder)
    pmid_list = to_list(df["pmid_list"])
    site_prd = site_prd_parser(clean_data(df["site_prd"], placeholder))
    site_ann = site_ann_parser(df["site_ann"])

    biomuta = {'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt}
    biomuta.update(cleaned)
    biomuta['do_id'] = {"do_id": do_id, "do_name": do_name}
    biomuta['uberon_id'] = uberon_id
    biomuta['gene_name'] = gene_name
    biomuta['pmid'] = pmid_list

    # Site annotations first, then site predictions, so predictions win on
    # any shared key.
    if site_ann:
        for annotation in site_ann:
            biomuta.update(annotation)
    if site_prd:
        biomuta.update(site_prd)

    one_snp_json = value_convert_to_number({"_id": hgvs_id, "biomuta": biomuta})
    # These two are re-stringified after the number conversion.
    converted = one_snp_json['biomuta']
    converted['chrom'] = str(converted['chrom'])
    converted['do_id']['do_id'] = str(converted['do_id']['do_id'])
    return one_snp_json
def load_data(data_folder):
    """Yield one GWAS catalog document per variant id.

    Reads the tab-separated file named ``alternative`` in ``data_folder``,
    drops duplicated lines, maps every SNP of every row to an id through
    ``batch_query_hgvs_from_rsid`` and builds one association per
    (row, SNP) pair.  SNPs without a usable id are skipped.

    Yields:
        For an id with a single association, that document unchanged.  For
        an id with several, one merged document whose
        ``gwascatalog.associations`` is the list of all associations and
        whose ``gene``/``region``/``pos``/``context``/``rsid`` are taken from
        the first one.

    Raises:
        AssertionError: if the input file does not exist.
    """
    input_file = os.path.join(data_folder, "alternative")
    # input_file = os.path.join(data_folder, "gwas_catalog_v1.0.2-associations_e96_r2019-04-21.tsv")
    assert os.path.exists(
        input_file), "Can't find input file '%s'" % input_file
    with open_anyfile(input_file) as in_f:
        header = next(in_f).strip().split('\t')
        # A set removes duplicated lines, if any.
        lines = set(in_f)

    # First pass: collect every rsid so they can be resolved in one batch.
    rsid_list = []
    for row in DictReader(lines, fieldnames=header, delimiter='\t'):
        rsids, _ = parse_separator_and_snps(row)
        if rsids:
            rsid_list += rsids
    hgvs_rsid_dict = batch_query_hgvs_from_rsid(rsid_list)

    # Second pass: build the per-variant documents, grouped by id.
    results = defaultdict(list)
    for row in DictReader(lines, fieldnames=header, delimiter='\t'):
        snps, seperator = parse_separator_and_snps(row)
        if not snps:
            continue
        region = reorganize_field(row["REGION"], seperator, len(snps))
        chrom = reorganize_field(row["CHR_ID"], seperator, len(snps))
        genes = reorganize_field(row["REPORTED GENE(S)"], seperator,
                                 len(snps))
        position = reorganize_field(row["CHR_POS"], seperator, len(snps))
        context = reorganize_field(row["CONTEXT"], seperator, len(snps))
        if not chrom:
            # One blank per SNP; a fixed-size pad would break rows that
            # list more SNPs than the pad is long.
            chrom = [''] * len(snps)
        for i, _snp in enumerate(snps):
            # Skip rsids that were not resolved, or resolved to nothing:
            # such a document would have no usable _id.
            if _snp not in hgvs_rsid_dict or not hgvs_rsid_dict[_snp]:
                continue
            if str(chrom[i]).lower() not in CHROM_LIST:
                chrom[i] = ''
            # Built afresh for every SNP so documents never share the
            # nested dicts.
            associations = {
                'efo': {
                    'name': row['MAPPED_TRAIT'].split(','),
                    'id': [
                        _item.split('/')[-1].replace('_', ':')
                        for _item in row['MAPPED_TRAIT_URI'].split(',')
                    ]
                },
                'study': {
                    'name': row['STUDY'],
                    'platform': row['PLATFORM [SNPS PASSING QC]'],
                    'accession': row['STUDY ACCESSION']
                },
                'snps': snps,
                'pubmed': int(row['PUBMEDID']),
                'date_added': row['DATE ADDED TO CATALOG'],
                'trait': row['DISEASE/TRAIT'],
                'raf': str2float(row['RISK ALLELE FREQUENCY']),
                'pval': str2float(row['P-VALUE'])
            }
            variant = {
                '_id': hgvs_rsid_dict[_snp],
                'gwascatalog': {
                    'associations': associations,
                    'rsid': _snp,
                    'region': region[i] if region else None,
                    'chrom': chrom[i],
                    'pos': position[i] if position else None,
                    'gene': genes[i].split(',') if (
                        genes and genes[i]) else None,
                    'context': context[i] if context else None
                }
            }
            variant = dict_sweep(unlist(
                value_convert_to_number(variant, skipped_keys=['chrom'])),
                                 vals=[[], {}, None, '', 'NR'])
            results[variant["_id"]].append(variant)

    for v in results.values():
        if len(v) == 1:
            yield v[0]
        else:
            doc = {'_id': v[0]['_id'], 'gwascatalog': {'associations': []}}
            for _item in ['gene', 'region', 'pos', 'context', 'rsid']:
                if _item in v[0]['gwascatalog']:
                    doc['gwascatalog'][_item] = v[0]['gwascatalog'][_item]
            doc['gwascatalog']['associations'] = [
                i['gwascatalog']['associations'] for i in v
            ]
            yield doc
def restructure_dict(dictionary):
    """Flatten one PubChem compound record into ``{'_id', 'pubchem'}``.

    Walks the ``PC-Compound_id``, ``PC-Compound_charge``,
    ``PC-Compound_props`` and ``PC-Compound_count`` entries of
    ``dictionary`` (presumably a dict converted from PubChem XML -- confirm
    against the caller) and copies the recognised values into a flat dict.

    Returns the result of ``value_convert_to_number`` applied to
    ``{'_id': str(<cid>), 'pubchem': <flat dict>}``, with ``_id`` passed in
    ``skipped_keys``.

    Raises:
        KeyError: if no compound id was found in ``dictionary``.
    """
    # Property labels whose value is copied verbatim into the named field.
    plain_fields = {
        "Hydrogen Bond Acceptor": "hydrogen_bond_acceptor_count",
        "Hydrogen Bond Donor": "hydrogen_bond_donor_count",
        "Rotatable Bond": "rotatable_bond_count",
        "Log P": "xlogp",
        "Mass": "exact_mass",
        "Molecular Formula": "molecular_formula",
        "Molecular Weight": "molecular_weight",
        "Topological": "topological_polar_surface_area",
        "Weight": "monoisotopic_weight",
        "Compound Complexity": "complexity",
    }
    # Count keys and the output field each one is stored under.
    count_fields = {
        "PC-Count_heavy-atom": "heavy_atom_count",
        "PC-Count_atom-chiral": "chiral_atom_count",
        "PC-Count_atom-chiral-def": "defined_atom_stereocenter_count",
        "PC-Count_atom-chiral-undef": "undefined_atom_stereocenter_count",
        "PC-Count_bond-chiral": "chiral_bond_count",
        "PC-Count_bond-chiral-def": "defined_bond_stereocenter_count",
        "PC-Count_bond-chiral-undef": "undefined_bond_stereocenter_count",
        "PC-Count_isotope-atom": "isotope_atom_count",
        "PC-Count_covalent-unit": "covalently-bonded_unit_count",
        "PC-Count_tautomers": "tautomers_count",
    }

    d = {}
    for key, value in dictionary.items():
        if key == "PC-Compound_id":
            # The innermost value visited last wins as the compound id.
            for cnt in value:
                for inner in value[cnt].values():
                    for cid in inner.values():
                        d["cid"] = cid

        elif key == "PC-Compound_charge":
            d["formal_charge"] = value

        elif key == "PC-Compound_props":
            for cnt in value:
                for ele in value[cnt]:
                    for field, content in ele.items():
                        if field != "PC-InfoData_urn":
                            continue
                        for urn_key, urn in content.items():
                            if urn_key != "PC-Urn":
                                continue
                            # Keep the last value stored under the entry's
                            # value container.
                            holder = ele["PC-InfoData_value"]
                            for value_kind in holder:
                                prop_value = holder[value_kind]
                            # Any value of the URN may name the property.
                            for label in urn.values():
                                if label == "IUPAC Name":
                                    # A fresh one-entry dict each time, so
                                    # only the last IUPAC name survives.
                                    d["iupac"] = {
                                        urn["PC-Urn_name"].lower(): prop_value
                                    }
                                elif label == "SMILES":
                                    # Same replace-not-merge behaviour.
                                    d["smiles"] = {
                                        urn["PC-Urn_name"].lower(): prop_value
                                    }
                                elif label == "InChI":
                                    d["inchi"] = prop_value
                                    break
                                elif label == "InChIKey":
                                    d["inchi_key"] = prop_value
                                    break
                                elif (isinstance(label, str)
                                      and label in plain_fields):
                                    d[plain_fields[label]] = prop_value

        elif key == "PC-Compound_count":
            for cnt in value:
                for count_key, count in value[cnt].items():
                    if count_key in count_fields:
                        d[count_fields[count_key]] = count

    # The id is stringified and then excluded from number conversion.
    restr_dict = {'_id': str(d["cid"]), "pubchem": d}
    return value_convert_to_number(restr_dict, skipped_keys=["_id"])
def _map_line_to_json(doc_key, item): chrom = item.CHROM chromStart = item.POS ref = item.REF info = item.INFO _filter = item.FILTER try: baseqranksum = info['BaseQRankSum'] except: baseqranksum = None try: clippingranksum = info['ClippingRankSum'] except: clippingranksum = None try: mqranksum = info['MQRankSum'] except: mqranksum = None try: readposranksum = info['ReadPosRankSum'] except: readposranksum = None try: qd = info['QD'] except: qd = None try: inbreedingcoeff = info['InbreedingCoeff'] except: inbreedingcoeff = None # convert vcf object to string item.ALT = [str(alt) for alt in item.ALT] # if multiallelic, put all variants as a list in multi-allelic field hgvs_list = None if len(item.ALT) > 1: hgvs_list = [ get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=False) for alt in item.ALT ] for i, alt in enumerate(item.ALT): (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=True) if HGVS is None: return assert len(item.ALT) == len( info['AC'] ), "Expecting length of item.ALT= length of info.AC, but not for %s" % ( HGVS) assert len(item.ALT) == len( info['AF'] ), "Expecting length of item.ALT= length of info.AF, but not for %s" % ( HGVS) assert len(item.ALT) == len( info['Hom_AFR'] ), "Expecting length of item.ALT= length of HOM_AFR, but not for %s" % ( HGVS) one_snp_json = { "_id": HGVS, doc_key: { "chrom": chrom, "pos": chromStart, "filter": _filter, "multi-allelic": hgvs_list, "ref": ref, "alt": alt, "alleles": item.ALT, "type": var_type, "ac": { "ac": info['AC'][i], "ac_afr": info['AC_AFR'][i], "ac_amr": info['AC_AMR'][i], "ac_adj": info['AC_Adj'][i], "ac_eas": info['AC_EAS'][i], "ac_fin": info['AC_FIN'][i], "ac_het": info['AC_Het'][i], "ac_hom": info['AC_Hom'][i], "ac_nfe": info['AC_NFE'][i], "ac_oth": info['AC_OTH'][i], "ac_sas": info['AC_SAS'][i], "ac_male": info['AC_MALE'][i], "ac_female": info['AC_FEMALE'][i] }, "af": info['AF'][i], "an": { "an": info['AN'], "an_afr": info['AN_AFR'], "an_amr": info['AN_AMR'], 
"an_adj": info['AN_Adj'], "an_eas": info['AN_EAS'], "an_fin": info['AN_FIN'], "an_nfe": info['AN_NFE'], "an_oth": info['AN_OTH'], "an_sas": info['AN_SAS'], "an_female": info['AN_FEMALE'], "an_male": info['AN_MALE'] }, "baseqranksum": baseqranksum, "clippingranksum": clippingranksum, "fs": info['FS'], "het": { "het_afr": info['Het_AFR'], "het_amr": info['Het_AMR'], "het_eas": info['Het_EAS'], "het_fin": info['Het_FIN'], "het_nfe": info['Het_NFE'], "het_oth": info['Het_OTH'], "het_sas": info['Het_SAS'] }, "hom": { "hom_afr": info['Hom_AFR'], "hom_amr": info['Hom_AMR'], "hom_eas": info['Hom_EAS'], "hom_fin": info['Hom_FIN'], "hom_nfe": info['Hom_NFE'], "hom_oth": info['Hom_OTH'], "hom_sas": info['Hom_SAS'] }, "inbreedingcoeff": inbreedingcoeff, "mq": { "mq": info['MQ'], "mq0": info['MQ0'], "mqranksum": mqranksum }, "ncc": info['NCC'], "qd": qd, "readposranksum": readposranksum, "vqslod": info['VQSLOD'], "culprit": info['culprit'] } } obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json)), [None])) yield obj
def _map_line_to_json(doc_key, item): chrom = item.CHROM chromStart = item.POS ref = item.REF info = item.INFO try: baseqranksum = info['BaseQRankSum'] except: baseqranksum = None try: clippingranksum = info['ClippingRankSum'] except: clippingranksum = None try: mqranksum = info['MQRankSum'] except: mqranksum = None try: readposranksum = info['ReadPosRankSum'] except: readposranksum = None try: qd = info['QD'] except: qd = None try: inbreedingcoeff = info['InbreedingCoeff'] except: inbreedingcoeff = None # convert vcf object to string item.ALT = [str(alt) for alt in item.ALT] # if multiallelic, put all variants as a list in multi-allelic field hgvs_list = None if len(item.ALT) > 1: hgvs_list = [get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=False) for alt in item.ALT] for i, alt in enumerate(item.ALT): (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=True) if HGVS is None: return assert len(item.ALT) == len(info['AC']), "Expecting length of item.ALT= length of info.AC, but not for %s" % (HGVS) assert len(item.ALT) == len(info['AF']), "Expecting length of item.ALT= length of info.AF, but not for %s" % (HGVS) assert len(item.ALT) == len(info['Hom_AFR']), "Expecting length of item.ALT= length of HOM_AFR, but not for %s" % (HGVS) one_snp_json = { "_id": HGVS, doc_key : { "chrom": chrom, "pos": chromStart, "multi-allelic": hgvs_list, "ref": ref, "alt": alt, "alleles": item.ALT, "type": var_type, "ac": { "ac": info['AC'][i], "ac_afr": info['AC_AFR'][i], "ac_amr": info['AC_AMR'][i], "ac_adj": info['AC_Adj'][i], "ac_eas": info['AC_EAS'][i], "ac_fin": info['AC_FIN'][i], "ac_het": info['AC_Het'][i], "ac_hom": info['AC_Hom'][i], "ac_nfe": info['AC_NFE'][i], "ac_oth": info['AC_OTH'][i], "ac_sas": info['AC_SAS'][i], "ac_male": info['AC_MALE'][i], "ac_female": info['AC_FEMALE'][i] }, "af": info['AF'][i], "an": { "an": info['AN'], "an_afr": info['AN_AFR'], "an_amr": info['AN_AMR'], "an_adj": info['AN_Adj'], "an_eas": info['AN_EAS'], 
"an_fin": info['AN_FIN'], "an_nfe": info['AN_NFE'], "an_oth": info['AN_OTH'], "an_sas": info['AN_SAS'], "an_female": info['AN_FEMALE'], "an_male": info['AN_MALE'] }, "baseqranksum": baseqranksum, "clippingranksum": clippingranksum, "fs": info['FS'], "het": { "het_afr": info['Het_AFR'], "het_amr": info['Het_AMR'], "het_eas": info['Het_EAS'], "het_fin": info['Het_FIN'], "het_nfe": info['Het_NFE'], "het_oth": info['Het_OTH'], "het_sas": info['Het_SAS'] }, "hom": { "hom_afr": info['Hom_AFR'], "hom_amr": info['Hom_AMR'], "hom_eas": info['Hom_EAS'], "hom_fin": info['Hom_FIN'], "hom_nfe": info['Hom_NFE'], "hom_oth": info['Hom_OTH'], "hom_sas": info['Hom_SAS'] }, "inbreedingcoeff": inbreedingcoeff, "mq": { "mq": info['MQ'], "mq0": info['MQ0'], "mqranksum": mqranksum }, "ncc": info['NCC'], "qd": qd, "readposranksum": readposranksum, "vqslod": info['VQSLOD'], "culprit": info['culprit'] } } obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json)), [None])) yield obj
def _map_line_to_json(item): chrom = item.CHROM chromStart = item.POS ref = item.REF info = item.INFO try: af = info['AF'] except: af = None try: ac = info['AC'] except: ac = None try: an = info['AN'] except: ac = None try: ds = info['DS'] except: ds = None # convert vcf object to string item.ALT = [str(alt) for alt in item.ALT] # if multiallelic, put all variants as a list in multi-allelic field hgvs_list = None if len(item.ALT) > 1: hgvs_list = [] for alt in item.ALT: try: hgvs_list.append( get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=False)) except: hgvs_list.append(alt) assert len(item.ALT) == len( info['AC'] ), "Expecting length of item.ALT= length of info.AC, but not for %s" % ( item) assert len(item.ALT) == len( info['AF'] ), "Expecting length of item.ALT= length of info.AF, but not for %s" % ( item) if ds: if len(item.ALT) != len(info['DS']): ds_str = ",".join(info['DS']) ds_str = ds_str.replace("NA7022,18", "NA7022_18") ds_list = ds_str.split(",") info['DS'] = [ d.replace("NA7022_18", "NA7022,18") for d in ds_list ] assert len(item.ALT) == len( info['DS']), "info.DS mismatch, %s: %s\n## DS: %s" % ( item, info['DS']) for i, alt in enumerate(item.ALT): try: (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=True) except: continue if HGVS is None: return # load as json data one_snp_json = { "_id": HGVS, "kaviar": { "multi-allelic": hgvs_list, "ref": ref, "alt": alt, "af": info['AF'][i], "ac": info['AC'][i], "an": an, "ds": info['DS'][i].split("|") if ds else None, } } yield value_convert_to_number(one_snp_json)