def load_data():
    """Yield DisGeNET disease documents keyed by MONDO id where possible.

    Input paths come from the module-level ``file_path_*`` names. Every UMLS
    id found in the gene or variant associations produces one document per
    MONDO id it maps to; UMLS ids with no mapping keep the UMLS id as ``_id``.
    """
    gene_by_umls = process_gene(file_path_gene_disease)
    snp_by_umls = process_snp(file_path_snp_disease)
    xrefs_by_umls = process_xrefs(file_path_disease_mapping)
    umls_2_mondo = construct_umls_to_mondo_library(file_path_mondo)

    for umls_id in set([*gene_by_umls.keys(), *snp_by_umls.keys()]):
        # A mapped UMLS id fans out to all of its MONDO ids.
        if umls_id in umls_2_mondo:
            doc_ids = umls_2_mondo[umls_id]
        else:
            doc_ids = (umls_id,)
        for doc_id in doc_ids:
            record = {
                '_id': doc_id,
                'disgenet': {
                    'xrefs': xrefs_by_umls.get(umls_id, {}),
                    'genes_related_to_disease': gene_by_umls.get(umls_id, {}),
                    'variants_related_to_disease': snp_by_umls.get(umls_id, {})
                }
            }
            yield dict_sweep(unlist(record), [None])
def load_data(data_folder):
    """Yield HPO disease documents built from ``phenotype.hpoa``.

    Args:
        data_folder: directory containing ``phenotype.hpoa`` and ``mondo.json``.

    Yields:
        One document per target id. A disease id that maps to MONDO produces
        one document per MONDO id; otherwise the source disease id is used as
        ``_id``. The source accession is stored under ``omim``, ``orphanet``
        or (any other prefix) ``decipher``.
    """
    file_path_disease_hpo = os.path.join(data_folder, 'phenotype.hpoa')
    file_path_mondo = os.path.join(data_folder, 'mondo.json')
    d_hpo = process_disease2hp(file_path_disease_hpo)
    orphanet_omim_2_mondo = construct_orphanet_omim_to_mondo_library(file_path_mondo)

    for disease_id, hpo_entry in d_hpo.items():
        # Pick the xref field name once, so both the mapped and unmapped paths
        # agree. The previous code only printed unknown prefixes in the mapped
        # path and then yielded whatever ``_doc`` was left from an earlier
        # iteration (or failed if none existed).
        if disease_id.startswith('OMIM'):
            source_key = 'omim'
        elif disease_id.startswith('ORPHANET'):
            source_key = 'orphanet'
        else:
            source_key = 'decipher'

        if disease_id in orphanet_omim_2_mondo:
            doc_ids = orphanet_omim_2_mondo[disease_id]
        else:
            doc_ids = [disease_id]

        for doc_id in doc_ids:
            _doc = {
                '_id': doc_id,
                'hpo': {
                    # hpo_entry is indexed as [0] phenotypes, [1] disease name
                    'disease_name': hpo_entry[1],
                    source_key: disease_id.split(':')[1],
                    'phenotype_related_to_disease': hpo_entry[0]
                }
            }
            yield dict_sweep(unlist(_doc), [None])
def load_data(data_folder):
    """Yield DisGeNET disease documents from the files in *data_folder*.

    The document ``_id`` is chosen in this order: the ``mondo`` entry of the
    disease cross-references, then every MONDO id from the UMLS-to-MONDO
    library (one document each), and finally the UMLS id itself.
    """
    file_path_mondo = os.path.join(data_folder, "mondo.json")
    file_path_gene_disease = os.path.join(
        data_folder, "all_gene_disease_pmid_associations.tsv.gz")
    file_path_snp_disease = os.path.join(
        data_folder, "all_variant_disease_pmid_associations.tsv.gz")
    file_path_disease_mapping = os.path.join(data_folder, "disease_mappings.tsv.gz")

    d_gene = process_gene(file_path_gene_disease)
    d_snp = process_snp(file_path_snp_disease)
    d_xrefs = process_xrefs(file_path_disease_mapping)
    umls_2_mondo = construct_umls_to_mondo_library(file_path_mondo)

    def build_doc(doc_id, umls_id):
        # Assemble and clean one output document for the given target id.
        record = {
            "_id": doc_id,
            "disgenet": {
                "xrefs": d_xrefs.get(umls_id, {}),
                "genes_related_to_disease": d_gene.get(umls_id, {}),
                "variants_related_to_disease": d_snp.get(umls_id, {}),
            },
        }
        return dict_sweep(unlist(record), [None])

    for umls_id in set(list(d_gene.keys()) + list(d_snp.keys())):
        if umls_id in d_xrefs and "mondo" in d_xrefs[umls_id]:
            yield build_doc(d_xrefs[umls_id]["mondo"], umls_id)
        elif umls_id in umls_2_mondo:
            for mondo in umls_2_mondo[umls_id]:
                yield build_doc(mondo, umls_id)
        else:
            yield build_doc(umls_id, umls_id)
def load_data(data_folder):
    """Yield PheWAS catalog documents keyed by HGVS id.

    Args:
        data_folder: directory containing ``phewas-catalog.csv``.

    Yields:
        One document per rsid that ``batch_query_hgvs_from_rsid`` resolves to
        a non-empty HGVS id. ``phewas.associations`` is a single dict when the
        rsid has one association and a list of dicts otherwise.

    Raises:
        AssertionError: if the input file is missing or a ``snp`` value is not
            of the form ``rs<digits>``.
    """
    input_file = os.path.join(data_folder, "phewas-catalog.csv")
    assert os.path.exists(input_file), "Can't find input file '%s'" % input_file

    with open_anyfile(input_file) as in_f:
        # Header cells are wrapped in one character on each side; strip it.
        header = [_item[1:-1] for _item in next(in_f).strip().split(',')]
        # Drop duplicated lines but keep first-seen order, so the order of
        # associations in the output is reproducible (a set is not ordered).
        lines = list(dict.fromkeys(in_f))

    reader = DictReader(lines, fieldnames=header, delimiter=',')
    rsid_pattern = re.compile(r"^rs\d+$")
    results = defaultdict(list)
    for row in reader:
        assert rsid_pattern.match(row["snp"]) is not None
        variant = {"associations": {"phenotype": {}}, "variant": {}}
        variant["variant"]["rsid"] = row["snp"]
        variant["associations"]["phenotype"]["name"] = row["phewas phenotype"]
        variant["associations"]["cases"] = row["cases"]
        variant["associations"]["pval"] = float(row["p-value"])
        variant["associations"]["odds-ratio"] = row["odds-ratio"]
        variant["associations"]["phenotype"]["phewas_code"] = row["phewas code"]
        variant["variant"]["gene"] = row["gene_name"]
        variant["variant"]["gwas_associations"] = row["gwas-associations"].split(',')
        # The "chromosome" column holds "<chrom> <pos>" or just "<chrom>".
        pos_info = row["chromosome"].split(' ')
        if len(pos_info) == 2:
            variant["variant"]["chrom"], variant["variant"]["pos"] = pos_info
        else:
            variant["variant"]["chrom"] = pos_info[0]
        results[variant["variant"]["rsid"]].append(variant)

    # Merge rows that share an rsid into one document.
    hgvs_rsid_dict = batch_query_hgvs_from_rsid(list(results))
    for rsid, variants in results.items():
        if rsid not in hgvs_rsid_dict or not hgvs_rsid_dict[rsid]:
            continue
        phewas = variants[0]["variant"]
        associations = [_item["associations"] for _item in variants]
        phewas["associations"] = associations[0] if len(associations) == 1 else associations
        doc = {'_id': hgvs_rsid_dict[rsid], 'phewas': phewas}
        yield dict_sweep(unlist(value_convert_to_number(doc, skipped_keys=['chrom'])),
                         vals=[[], {}, None, '', 'NULL'])
def load_data(data_folder):
    """Yield DisGeNET documents (curated gene associations) from *data_folder*.

    ``_id`` preference: the ``mondo`` cross-reference, then each MONDO id from
    the UMLS-to-MONDO library, then the UMLS id itself.
    """
    def in_folder(name):
        return os.path.join(data_folder, name)

    file_path_mondo = in_folder("mondo.json")
    file_path_gene_disease = in_folder("curated_gene_disease_associations.tsv.gz")
    file_path_snp_disease = in_folder("all_variant_disease_pmid_associations.tsv.gz")
    file_path_disease_mapping = in_folder("disease_mappings.tsv.gz")

    d_gene = process_gene(file_path_gene_disease)
    d_snp = process_snp(file_path_snp_disease)
    d_xrefs = process_xrefs(file_path_disease_mapping)
    umls_2_mondo = construct_umls_to_mondo_library(file_path_mondo)

    for umls_id in set(list(d_gene.keys()) + list(d_snp.keys())):
        # Work out which ids this UMLS concept should be published under.
        if umls_id in d_xrefs and 'mondo' in d_xrefs[umls_id]:
            target_ids = [d_xrefs[umls_id]['mondo']]
        elif umls_id in umls_2_mondo:
            target_ids = umls_2_mondo[umls_id]
        else:
            target_ids = [umls_id]

        for target_id in target_ids:
            record = {
                '_id': target_id,
                'disgenet': {
                    'xrefs': d_xrefs.get(umls_id, {}),
                    'genes_related_to_disease': d_gene.get(umls_id, {}),
                    'variants_related_to_disease': d_snp.get(umls_id, {})
                }
            }
            yield dict_sweep(unlist(record), [None])
def load_data(data_folder):
    """Yield GO gene-set documents from every ``*.gaf.gz`` file in *data_folder*.

    Args:
        data_folder: directory containing ``go.json`` and the GAF files.

    Yields:
        One annotation dict per GO term that has a ``genes`` set. Gene pairs
        are replaced by the records found in the ``IDLookup`` cache; the
        excluded/contributing/colocalized sets are moved under ``go``.
    """
    gene_set_keys = [
        "genes", "excluded_genes", "contributing_genes", "colocalized_genes"
    ]

    def resolve(pairs, cache):
        # Map (uniprot, symbol) pairs to cached gene records, trying the
        # UniProt id first; pairs with no cached record are dropped.
        found = []
        for u, s in pairs:
            if cache.get(u) is not None:
                found.append(cache[u])
            elif cache.get(s) is not None:
                found.append(cache[s])
        return found

    # Ontology data
    go_file = os.path.join(data_folder, "go.json")
    goterms = parse_ontology(go_file)

    # Gene annotation files
    for f in glob.glob(os.path.join(data_folder, "*.gaf.gz")):
        print("Parsing {}".format(f))
        docs = parse_gene_annotations(f)
        if not docs:
            # Nothing parsed: no taxid to read and nothing to yield. Without
            # this guard the taxid lookup below would fail or reuse a value
            # from the previous file.
            continue

        # Create gene ID cache. Join all gene sets and fetch ids.
        all_genes = set()
        for annotations in docs.values():
            for key in gene_set_keys:
                if annotations.get(key) is not None:
                    all_genes = all_genes | annotations[key]
        uniprot = [i for i, j in all_genes]
        symbols = [j for i, j in all_genes]
        # The taxid is read from the last annotation of the file.
        taxid = annotations['taxid']

        # Fetch gene data from mygene.info
        lookup = IDLookup(taxid)
        lookup.query_mygene(uniprot, "uniprot,retired,accession")
        lookup.retry_failed_with_new_ids(symbols, "symbol")

        for _id, annotations in docs.items():
            # Add ontology annotations
            annotations['go'] = goterms[_id]
            annotations['source'] = 'go'
            if annotations.get("genes") is None:
                # No genes in set
                continue
            annotations['name'] = annotations['go']['name']
            annotations['description'] = annotations['go']['description']
            annotations['genes'] = resolve(annotations['genes'], lookup.query_cache)
            for key in gene_set_keys[1:]:
                if annotations.get(key) is not None:
                    annotations['go'][key] = resolve(annotations.pop(key),
                                                     lookup.query_cache)
            # Clean up data
            annotations = unlist(annotations)
            annotations = dict_sweep(annotations)
            yield annotations
def load_data(_file):
    """Yield one cleaned document per row of the CSV file *_file*.

    Each row is restructured with ``restr_dict``, swept with ``dict_sweep``
    and then passed through ``unlist``. The file is closed once the generator
    is exhausted or closed.
    """
    with open(_file, 'r') as f:
        for row in csv.DictReader(f):
            yield unlist(dict_sweep(restr_dict(row)))
def _map_line_to_json(fields):
    """Convert one EMV row into a document keyed by a ``chr<N>:<change>`` id.

    Args:
        fields: sequence of column values; ``fields[0]`` is expected to look
            like ``<accession>:<change>`` with the chromosome number embedded
            in the accession (possibly zero-padded).

    Returns:
        The cleaned document, or ``None`` when no chromosome number or change
        can be extracted from ``fields[0]``.
    """
    vid = fields[0].split(":")
    # Take the first digit run and strip leading zeros via int(). The earlier
    # pattern ``[1-9]+`` stopped at any zero, so chromosomes 10 and 20 came
    # out as 1 and 2.
    chrom_match = re.search(r'\d+', vid[0])
    if chrom_match is None or len(vid) < 2:
        return None
    chrom = str(int(chrom_match.group()))
    # Numeric sex-chromosome codes -> letters.
    # NOTE(review): 24 -> Y is new here (RefSeq NC_000024); confirm it matches the source data.
    chrom = {'23': 'X', '24': 'Y'}.get(chrom, chrom)
    HGVS = "chr%s:%s" % (chrom, vid[1])

    one_snp_json = {
        "_id": HGVS,
        "emv": {
            "gene": fields[2],
            "variant_id": fields[3],
            "exon": fields[4],
            "egl_variant": fields[5],
            "egl_protein": fields[6],
            "egl_classification": fields[7],
            "egl_classification_date": fields[8],
            "hgvs": fields[9].split(" | "),
            "clinvar_rcv": fields[10],
        }
    }
    return unlist(dict_sweep(value_convert_to_number(one_snp_json), vals=[""]))
def parse_ontology(f):
    """Get GO-term metadata from ontology JSON dump.

    Args:
        f: path to a JSON file with nodes under ``graphs[0].nodes``.

    Returns:
        Dict keyed by the last URL segment of each node id (only ids starting
        with ``GO_``), holding ``id``, ``url`` and, when present, ``class``,
        ``name``, ``description`` and ``xrefs``; cleaned with ``unlist`` and
        ``dict_sweep``.
    """
    go_classes = ("biological_process", "cellular_component",
                  "molecular_function")
    with open(f, 'r') as infile:
        data = json.load(infile)

    go_terms = {}
    for node in data['graphs'][0]['nodes']:
        url = node['id']
        _id = url.split("/")[-1]
        if not _id.startswith("GO_"):
            continue
        term = {"id": _id, "url": url}
        # Nodes may have no 'meta' or no 'basicPropertyValues'; treat both as
        # empty instead of failing on None.
        meta = node.get('meta') or {}
        for p in meta.get('basicPropertyValues') or []:
            if p['val'] in go_classes:
                term["class"] = [p['val']]
        if node.get('lbl'):
            term['name'] = node['lbl']
        definition = meta.get("definition")
        if definition:
            term['description'] = definition.get('val')
            term['xrefs'] = definition.get('xrefs')
        go_terms[_id] = term

    go_terms = unlist(go_terms)
    go_terms = dict_sweep(go_terms)
    return go_terms
def restructure_dict(dictionary):
    """Reshape one ChEMBL molecule record into ``{'_id': ..., 'chembl': ...}``.

    The input dict itself becomes the ``chembl`` value and is modified in
    place: structure fields are copied to the top level under shorter names
    and ``molecule_structures`` is removed.

    Args:
        dictionary: molecule record; ``molecule_chembl_id`` supplies ``_id``.

    Returns:
        The cleaned, number- and boolean-converted document.
    """
    # molecule_structures key -> name used at the top level of 'chembl'
    structure_fields = {
        'standard_inchi_key': 'inchi_key',
        'canonical_smiles': 'smiles',
        'standard_inchi': 'inchi',
    }
    restr_dict = dict()
    if 'molecule_chembl_id' in dictionary:
        restr_dict['_id'] = dictionary['molecule_chembl_id']
    restr_dict['chembl'] = dictionary

    structures = dictionary.get('molecule_structures')
    if isinstance(structures, dict):
        for source_key, value in structures.items():
            if source_key in structure_fields:
                dictionary[structure_fields[source_key]] = value
    # pop() rather than del: records without the key used to raise KeyError.
    dictionary.pop('molecule_structures', None)

    restr_dict = unlist(restr_dict)
    restr_dict = dict_sweep(restr_dict, vals=[
        None, ".", "-", "", "NA", "None", "none", " ", "Not Available",
        "unknown", "null"
    ])
    restr_dict = value_convert_to_number(
        restr_dict, skipped_keys=["chebi_par_id", "first_approval"])
    restr_dict = boolean_convert(restr_dict, [
        "topical", "oral", "parenteral", "dosed_ingredient", "polymer_flag",
        "therapeutic_flag", "med_chem_friendly", "molecule_properties.ro3_pass"
    ])
    return restr_dict
def load_data(data_folder):
    """Yield BioMuta documents, merging rows that share an ``_id``.

    Rows are converted with ``_map_line_to_json``, spilled to a temporary CSV
    in *data_folder*, sorted on ``_id`` with ``csvsort``, then grouped and
    merged with ``merge_duplicate_rows``. The temporary file is always removed.

    Args:
        data_folder: directory containing ``biomuta-master.csv``.

    Raises:
        AssertionError: if the header does not have ``VALID_COLUMN_NO`` columns.
    """
    input_fn = os.path.join(data_folder, "biomuta-master.csv")
    fd_tmp, tmp_path = mkstemp(dir=data_folder)
    try:
        # os.fdopen takes ownership of the descriptor returned by mkstemp so
        # it is closed with the file; both handles were previously leaked.
        with os.fdopen(fd_tmp, "w") as f, open(input_fn) as open_file:
            db_biomuta = csv.reader(open_file)
            index = next(db_biomuta)
            assert len(index) == VALID_COLUMN_NO, \
                "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, len(index))
            index = [clean_index(s) for s in index]
            biomuta = (dict(zip(index, row)) for row in db_biomuta)
            dbwriter = csv.writer(f)
            for doc in map(_map_line_to_json, biomuta):
                if doc:
                    dbwriter.writerow([doc['_id'], json.dumps(doc)])

        csvsort(tmp_path, [0, ], has_header=False)

        with open(tmp_path) as csvfile:
            json_rows = (json.loads(row[1]) for row in csv.reader(csvfile))
            # groupby only merges adjacent rows, hence the sort above.
            row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
            json_rows = (merge_duplicate_rows(rg, "biomuta") for rg in row_groups)
            json_rows = (unlist(dict_sweep(row, vals=[None, ])) for row in json_rows)
            for res in json_rows:
                yield res
    finally:
        os.remove(tmp_path)
def get_gene_ids(symbols, uniprot_ids, taxid):
    """Fetch NCBI, Ensembl, and gene names from UniProt ids or gene symbol.

    UniProt ids are queried first; ids reported as missing are retried using
    the gene symbol at the same position in *symbols*.

    Args:
        symbols: gene symbols, parallel to *uniprot_ids*.
        uniprot_ids: UniProt ids, parallel to *symbols*.
        taxid: species passed to mygene.

    Returns:
        ``{query: {mygene_id: record}}`` after ``unlist`` and ``dict_sweep``.
    """
    mg = mygene.MyGeneInfo()
    fields = 'entrezgene,ensembl.gene,name,symbol'

    # Lookup tables replace repeated list.index() scans. setdefault keeps the
    # first occurrence, which is what index() returned.
    symbol_for_uniprot = {}
    uniprot_for_symbol = {}
    for symbol, uniprot in zip(symbols, uniprot_ids):
        symbol_for_uniprot.setdefault(uniprot, symbol)
        uniprot_for_symbol.setdefault(symbol, uniprot)

    genes = {}

    def collect(response, uniprot_of):
        # Store every hit of a querymany response under genes[query][gene id].
        for out in response['out']:
            if out.get("_id") is None:
                continue
            query = out['query']
            geneid = out['_id']
            hit = {
                "mygene_id": geneid,
                "uniprot": uniprot_of(query),
                "symbol": out.get('symbol'),
                "name": out.get('name')
            }
            if out.get("entrezgene") is not None:
                hit['ncbigene'] = [out['entrezgene']]
            if out.get("ensembl") is not None:
                hit['ensemblgene'] = [i['gene'] for i in alwayslist(out['ensembl'])]
            genes.setdefault(query, {})[geneid] = hit

    # Fetch ids from UniProt
    response = mg.querymany(uniprot_ids, scopes='uniprot', fields=fields,
                            species=taxid, returnall=True)
    collect(response, lambda query: query)

    # Retry missing using gene symbol; skip the request when nothing is missing.
    retry = [symbol_for_uniprot[k] for k in response['missing']]
    if retry:
        response = mg.querymany(retry, scopes='symbol', fields=fields,
                                species=taxid, returnall=True)
        collect(response, lambda query: uniprot_for_symbol[query])

    genes = unlist(genes)
    genes = dict_sweep(genes, vals=[None, 'null'])
    return genes
def load_data():
    """Yield CIViC variant documents by scanning variant ids over the API.

    Ids ``0 .. MAX_VARIANT_NUMBER - 1`` are requested one by one, with a 0.5 s
    pause after each request. Responses whose keys are exactly ``error`` and
    ``status`` are skipped. The document ``_id`` is an HGVS id when it can be
    derived from the coordinates, otherwise ``CIVIC_VARIANT:<id>``.
    """
    civic_url = 'https://civic.genome.wustl.edu/api/variants/'
    # number of civic ids with ref, alt, chrom
    no_case1 = 0
    # number of civic ids with chrom, ref, but no alt
    no_case2 = 0
    # number of civic ids with chrom, alt, but no ref
    no_case3 = 0
    # number of civic ids with no alt and ref
    no_case4 = 0
    for variant_id in range(MAX_VARIANT_NUMBER):
        if variant_id % 200 == 0:
            print("scanned {} variants".format(variant_id))
        doc = requests.get(civic_url + str(variant_id)).json()
        # time delay for 0.5s
        time.sleep(0.5)
        if set(['error', 'status']) == set(doc.keys()):
            continue

        coordinates = doc['coordinates']
        chrom, pos, ref, alt = [
            coordinates[x]
            for x in ['chromosome', 'start', 'reference_bases', 'variant_bases']
        ]
        doc.pop("id")
        new_doc = {}
        doc['variant_id'] = variant_id
        if chrom and ref and alt:
            no_case1 += 1
            try:
                new_doc['_id'] = get_hgvs_from_vcf(chrom, pos, ref, alt)
            except ValueError:
                print("id has ref,alt, but coudn't be converted to hgvs id: {}".format(variant_id))
                continue
        # handle cases of deletions where only ref info is provided
        elif chrom and ref and not alt:
            no_case2 += 1
            start = int(pos)
            end = int(pos) + len(ref) - 1
            if start == end:
                new_doc['_id'] = 'chr{0}:g.{1}del'.format(chrom, start)
            else:
                new_doc['_id'] = 'chr{0}:g.{1}_{2}del'.format(chrom, start, end)
        # handle cases of insertions where only alt info is provided
        elif chrom and alt and not ref:
            no_case3 += 1
            # start/end used to be read here without being assigned in this
            # branch, so they were stale or undefined. An HGVS insertion names
            # the two adjacent flanking positions.
            # NOTE(review): assumes 'start' is the left flanking base - confirm.
            start = int(pos)
            end = start + 1
            new_doc['_id'] = 'chr{0}:g.{1}_{2}ins{3}'.format(chrom, start, end, alt)
        # handle cases where no ref or alt info provided,
        # in this case, use CIVIC internal ID as the primary id for
        # MyVariant.info, e.g. CIVIC_VARIANT:1
        else:
            no_case4 += 1
            new_doc['_id'] = 'CIVIC_VARIANT:' + str(variant_id)

        # change doid into its formal representation, which should be sth like DOID:1
        for _evidence in doc['evidence_items']:
            if 'disease' in _evidence and 'doid' in _evidence['disease'] and _evidence['disease']['doid']:
                _evidence['disease']['doid'] = 'DOID:' + _evidence['disease']['doid']
        new_doc['civic'] = doc
        yield dict_sweep(unlist(new_doc), ['', 'null', 'N/A', None, [], {}])

    print("number of ids with ref, alt, chrom: {}".format(no_case1))
    print("number of ids with chrom, ref but no alt: {}".format(no_case2))
    print("number of ids with chrom, alt but no ref: {}".format(no_case3))
    print("number of ids with no ref and alt: {}".format(no_case4))
def restructure_dict(dictionary):
    """Wrap a ChEBI record as ``{'_id': <ChEBI ID>, 'chebi': <cleaned record>}``.

    Placeholder values are swept out, single-item lists are collapsed and
    values are converted, except for the listed link/registry fields.
    """
    placeholder_values = [None, ".", "-", "", "NA", "none", " ",
                          "Not Available", "unknown", "null", "None"]
    unconverted_keys = ["beilstein_registry_numbers",
                        "pubchem_database_links",
                        "pubmed_citation_links",
                        "sabio_rk_database_links",
                        "gmelin_registry_numbers",
                        "molbase_database_links"]
    # The id is read before clean_up() runs on the record.
    document = {'_id': dictionary['ChEBI ID'], 'chebi': clean_up(dictionary)}
    swept = dict_sweep(document, vals=placeholder_values)
    return value_convert(unlist(swept), skipped_keys=unconverted_keys)
def load_packages(_file):
    """Yield one package document per row of the tab-delimited file *_file*.

    Rows are restructured with ``package_restr_dict`` and cleaned; ``_id`` is
    taken from ``ndc.productndc``. The file is read as latin1 and is closed
    when the generator finishes.
    """
    with open(_file, 'r', encoding='latin1') as f:
        for row in csv.DictReader(f, dialect='excel-tab'):
            _dict = unlist(dict_sweep(package_restr_dict(row)))
            _dict["_id"] = _dict["ndc"]["productndc"]
            yield _dict
def load_packages(_file):
    """Yield one package document per row of the tab-delimited file *_file*.

    Rows are restructured with ``package_restr_dict`` and cleaned; ``_id`` is
    taken from ``ndc.productndc``. The file is read as latin1 and is closed
    when the generator finishes.
    """
    with open(_file, 'r', encoding='latin1') as f:
        for row in csv.DictReader(f, dialect='excel-tab'):
            _dict = unlist(dict_sweep(package_restr_dict(row)))
            _dict["_id"] = _dict["ndc"]["productndc"]
            yield _dict
def restructure_drug_indications(indication_data):
    """Bucket drug-indication records by their ``molecule_chembl_id``.

    Returns the grouping after ``unlist`` has been applied to it.
    """
    grouped = {}
    for record in indication_data:
        chembl_id = record["molecule_chembl_id"]
        if chembl_id not in grouped:
            grouped[chembl_id] = []
        grouped[chembl_id].append(record)
    return unlist(grouped)
def restructure_metabolisms(metabolism_data):
    """
    Group metabolism data by drug_chembl_id

    Args:
        metabolism_data: iterable of dicts, each with a ``drug_chembl_id`` key.

    Returns:
        Mapping of ``drug_chembl_id`` to its records, passed through ``unlist``.
    """
    restr_dict = {}
    for doc in metabolism_data:
        restr_dict.setdefault(doc["drug_chembl_id"], []).append(doc)
    return unlist(restr_dict)
def load_data(data_folder):
    """Yield GO gene-set documents from every ``*.gaf.gz`` file in *data_folder*.

    Args:
        data_folder: directory containing ``go.json`` and the GAF files.

    Yields:
        One annotation dict per GO term that has a ``genes`` set. Each
        (uniprot, symbol) pair is replaced by the records returned from
        ``get_gene_ids``; pairs with no hit become
        ``{'symbol': ..., 'uniprot': ...}``.
    """
    gene_set_keys = [
        "genes", "excluded_genes", "contributing_genes", "colocalized_genes"
    ]

    def resolve(pairs, genecache):
        # Expand (uniprot, symbol) pairs into gene records, UniProt id first.
        genes = []
        for u, s in pairs:
            if genecache.get(u) is not None:
                genes.extend(genecache[u].values())
            elif genecache.get(s) is not None:
                genes.extend(genecache[s].values())
            else:
                # append, not +=: extending a list with a dict adds only its
                # keys ('symbol', 'uniprot') instead of the record.
                genes.append({'symbol': s, 'uniprot': u})
        return genes

    # Ontology data
    go_file = os.path.join(data_folder, "go.json")
    goterms = parse_ontology(go_file)

    # Gene annotation files
    for f in glob.glob(os.path.join(data_folder, "*.gaf.gz")):
        print("Parsing {}".format(f))
        docs = parse_gene_annotations(f)
        if not docs:
            # Nothing parsed: there is no taxid to read and nothing to yield.
            continue

        # Create gene ID cache. Join all gene sets and fetch ids.
        all_genes = set()
        for annotations in docs.values():
            for key in gene_set_keys:
                if annotations.get(key) is not None:
                    all_genes = all_genes | annotations[key]
        uniprot = [i for i, j in all_genes]
        symbols = [j for i, j in all_genes]
        # The taxid is read from the last annotation of the file.
        taxid = annotations['taxid']
        genecache = get_gene_ids(symbols, uniprot, taxid)

        for _id, annotations in docs.items():
            # Add ontology annotations
            annotations['go'] = goterms[_id]
            if annotations.get("genes") is None:
                # No genes in set
                continue
            for key in gene_set_keys:
                if annotations.get(key) is not None:
                    annotations[key] = resolve(annotations[key], genecache)
            # Clean up data
            annotations = unlist(annotations)
            yield annotations
def restructure_activities(activity_data):
    """Bucket activity records by their ``molecule_chembl_id``.

    Returns the grouping after ``unlist`` has been applied to it.
    """
    by_molecule = {}
    for record in activity_data:
        molecule_id = record["molecule_chembl_id"]
        try:
            by_molecule[molecule_id].append(record)
        except KeyError:
            by_molecule[molecule_id] = [record]
    return unlist(by_molecule)
def restructure_dict(dictionary):
    """Wrap a ChEBI record as ``{'_id': <ChEBI ID>, 'chebi': <cleaned record>}``.

    Placeholder values (including ``"NaN"``) are swept out, single-item lists
    are collapsed and values are converted to numbers, except for the listed
    identifier/text fields.
    """
    placeholder_values = [None, ".", "-", "", "NA", "none", " ",
                          "Not Available", "unknown", "null", "None", "NaN"]
    non_numeric_keys = ["cid", "sid", "beilstein", "pubmed", "sabio_rk",
                        "gmelin", "molbase", "synonyms", "wikipedia",
                        "url_stub"]
    # The id is read before clean_up() runs on the record.
    document = {'_id': dictionary['ChEBI ID'], 'chebi': clean_up(dictionary)}
    swept = dict_sweep(document, vals=placeholder_values)
    return value_convert_to_number(unlist(swept), skipped_keys=non_numeric_keys)
def load_data(input_file):
    """Yield documents mapping a gene id to its Pharos target ids.

    Each input line is ``<pharos_id>,<gene_id>``. Lines whose second column is
    the header name ``entrez_gene_id`` or ``0`` are ignored.
    """
    targets_by_gene = defaultdict(list)
    with open_anyfile(input_file) as in_f:
        for line in in_f:
            pharos_id, _id = line.strip().split(',')
            if _id in ('entrez_gene_id', '0'):
                continue
            targets_by_gene[str(_id)].append(int(pharos_id))

    for gene_id, target_ids in targets_by_gene.items():
        yield unlist({'_id': str(gene_id), 'pharos': {"target_id": target_ids}})
def reformat(cls, dictionary):
    """Reshape one ChEMBL molecule record into ``{'_id': ..., 'chembl': ...}``.

    The input dict itself becomes the ``chembl`` value and is modified in
    place: structure fields are copied to the top level, non-empty
    ``cross_references`` are transformed into ``xrefs``, and both source keys
    are removed. ``chebi_par_id`` gets a ``CHEBI:`` prefix or is dropped when
    empty.

    Args:
        dictionary: molecule record; ``molecule_chembl_id`` supplies ``_id``.

    Returns:
        The cleaned, number- and boolean-converted document.
    """
    # molecule_structures key -> name used at the top level of 'chembl'
    structure_fields = {
        'standard_inchi_key': 'inchi_key',
        'canonical_smiles': 'smiles',
        'standard_inchi': 'inchi',
    }
    ret_dict = dict()
    if 'molecule_chembl_id' in dictionary:
        ret_dict['_id'] = dictionary['molecule_chembl_id']
    ret_dict['chembl'] = dictionary

    structures = dictionary.get('molecule_structures')
    if isinstance(structures, dict):
        for source_key, value in structures.items():
            if source_key in structure_fields:
                dictionary[structure_fields[source_key]] = value

    if dictionary.get('cross_references'):
        dictionary['xrefs'] = MoleculeCrossReferenceListTransformer.transform_to_dict(
            dictionary['cross_references'])

    # pop() rather than del: records without these keys used to raise KeyError.
    dictionary.pop('molecule_structures', None)
    dictionary.pop('cross_references', None)

    ret_dict = unlist(ret_dict)

    # Add "CHEBI:" prefix, standardize the way representing CHEBI IDs
    if 'chebi_par_id' in ret_dict['chembl'] and ret_dict['chembl']['chebi_par_id']:
        ret_dict['chembl']['chebi_par_id'] = 'CHEBI:' + str(
            ret_dict['chembl']['chebi_par_id'])
    else:
        # clean, could be a None
        ret_dict['chembl'].pop("chebi_par_id", None)

    ret_dict = dict_sweep(ret_dict, vals=[
        None, ".", "-", "", "NA", "None", "none", " ", "Not Available",
        "unknown", "null"
    ])
    ret_dict = value_convert_to_number(
        ret_dict, skipped_keys=["chebi_par_id", "first_approval"])
    ret_dict = boolean_convert(ret_dict, [
        "topical", "oral", "parenteral", "dosed_ingredient", "polymer_flag",
        "therapeutic_flag", "med_chem_friendly", "molecule_properties.ro3_pass"
    ])
    return ret_dict
def load_data(tsv_file):
    """Yield one PharmGKB document per row of the tab-separated *tsv_file*.

    ``_id`` is the row's ``PharmGKB Accession Id``; the restructured and
    cleaned row is stored under ``pharmgkb``. The file is closed when the
    generator finishes.
    """
    with open(tsv_file) as _file:
        for row in csv.DictReader(_file, delimiter='\t'):
            _id = row["PharmGKB Accession Id"]
            _d = unlist(dict_sweep(clean_up(restr_dict(row))))
            yield {'_id': _id, 'pharmgkb': _d}
def load_data(tsv_file):
    """Yield one PharmGKB document per row of the tab-separated *tsv_file*.

    ``_id`` is the row's ``PharmGKB Accession Id``; the restructured and
    cleaned row is stored under ``pharmgkb``. The file is closed when the
    generator finishes.
    """
    with open(tsv_file) as _file:
        for row in csv.DictReader(_file, delimiter='\t'):
            _id = row["PharmGKB Accession Id"]
            _d = unlist(dict_sweep(clean_up(restr_dict(row))))
            yield {'_id': _id, 'pharmgkb': _d}
def load_data():
    """Yield one DrugCentral document per structure id.

    Input paths come from the module-level ``file_path_*`` names. The ``_id``
    is the structure's ``inchikey`` when present, otherwise whatever
    ``xref_2_inchikey`` derives from the identifiers, otherwise
    ``DrugCentral:<struc_id>``.
    """
    pharmacology_class = process_pharmacology_action(file_path_pharma_class)
    faers = process_faers(file_path_faers)
    act = process_act(file_path_act)
    omop = process_omop(file_path_omop)
    approval = process_approval(file_path_approval)
    drug_dosage = process_drug_dosage(file_path_drug_dosage)
    synonyms = process_synonym(file_path_synonym)
    structures = process_structure(file_path_structure)
    identifiers = process_identifier(file_path_identifier)

    all_ids = set(
        list(pharmacology_class.keys()) + list(faers.keys()) +
        list(act.keys()) + list(omop.keys()) + list(approval.keys()) +
        list(drug_dosage.keys()) + list(identifiers.keys()) +
        list(synonyms.keys()) + list(structures.keys()))

    for struc_id in all_ids:
        inchikey = structures.get(struc_id, {}).get('inchikey', {})
        if inchikey:
            doc_id = inchikey
        else:
            # Fall back to the identifiers, then to a DrugCentral-scoped id.
            doc_id = xref_2_inchikey(identifiers.get(struc_id, {}))
            if not doc_id:
                doc_id = 'DrugCentral:' + str(struc_id)
        record = {
            '_id': doc_id,
            'drugcentral': {
                "pharmacology_class": pharmacology_class.get(struc_id, {}),
                "fda_adverse_event": faers.get(struc_id, {}),
                "bioactivity": act.get(struc_id, {}),
                "drug_use": omop.get(struc_id, {}),
                "approval": approval.get(struc_id, {}),
                "drug_dosage": drug_dosage.get(struc_id, {}),
                "synonyms": synonyms.get(struc_id, {}),
                "structures": structures.get(struc_id, {}),
                "xref": identifiers.get(struc_id, {})
            }
        }
        yield dict_sweep(unlist(record), [None])
def parse(self, record: vcf.model._Record, doc_key: str):
    """
    Turn one VCF record into one document per ALT allele.

    Use ``doc_key="gnomad_genome"`` for gnomad.genomes.*.vcf.bgz files and
    ``doc_key="gnomad_exome"`` for gnomad.exomes.*.vcf.bgz files.

    Each yielded document looks like:

        one_snp_json = {
            "_id": hgvs_id,
            doc_key: {
                "chrom": chrom,
                ...
            }
        }
    """
    # hg38 GNOMAD source files prefix CHROM with 'chr'; strip it because
    # `profile_parser.parse()` needs the bare value.
    chrom = record.CHROM
    if chrom.startswith('chr'):
        record.CHROM = chrom[3:]

    if record.CHROM not in CHROM_VALID_VALUES:
        return

    info = record.INFO
    # Per-allele INFO fields must line up one-to-one with the ALT alleles.
    for key in ("AC", "AF", "nhomalt"):
        if key not in info:
            continue
        assert len(record.ALT) == len(info[key]), \
            "length of record.ALT != length of info.%s, at CHROM=%s, POS=%s" % (key, record.CHROM, record.POS)

    profile_list = self.profile_parser.parse(record)
    site_quality_metrics_dict = self.site_quality_metrics_parser.parse(info)

    for allele_index, _ in enumerate(record.ALT):
        hgvs_id, profile_dict = profile_list[allele_index]
        if hgvs_id is None:
            continue

        population_frequency_dict = self.population_frequency_parser.parse(info, allele_index)
        payload = {
            **profile_dict,
            **site_quality_metrics_dict,
            **population_frequency_dict
        }
        one_snp_json = {"_id": hgvs_id, doc_key: payload}
        converted = value_convert_to_number(one_snp_json, skipped_keys=['chrom'])
        yield dict_sweep(unlist(converted), [None])
def load_data(data_folder):
    """Yield Kaviar documents, merging rows that share an ``_id``.

    The hg19 VCF is extracted from the tar archive in *data_folder*, each
    record is converted with ``_map_line_to_json``, and the documents are
    spilled to a temporary CSV, sorted on ``_id``, grouped and merged with
    ``merge_duplicate_rows``. The temporary file and the extracted VCF are
    removed when the generator finishes.
    """
    import logging

    archive = os.path.join(data_folder, "Kaviar-160204-Public-hg19.vcf.tar")
    with tarfile.open(archive) as tar:
        member = tar.getmember(
            "Kaviar-160204-Public/vcfs/Kaviar-160204-Public-hg19.vcf.gz")
        # Extract directly into data_folder, without the archive's sub-folders.
        member.name = os.path.basename(member.name)
        tar.extract(member, path=data_folder)

    input_fn = os.path.join(data_folder, "Kaviar-160204-Public-hg19.vcf.gz")
    vcf_reader = vcf.Reader(filename=input_fn,
                            compressed=True,
                            strict_whitespace=True)
    json_rows = chain.from_iterable(map(_map_line_to_json, vcf_reader))

    fd_tmp, tmp_path = mkstemp(dir=data_folder)
    try:
        # os.fdopen takes ownership of the descriptor returned by mkstemp so
        # it is closed with the file instead of being leaked.
        with os.fdopen(fd_tmp, "w") as f:
            dbwriter = csv.writer(f)
            for doc in json_rows:
                if doc:
                    dbwriter.writerow([doc['_id'], json.dumps(doc)])

        # The temp file has no header row. Without has_header=False csvsort
        # would treat the first document as a header and leave it unsorted,
        # so its duplicates would not be adjacent for groupby.
        csvsort(tmp_path, [0, ], has_header=False)

        with open(tmp_path) as csvfile:
            json_rows = (json.loads(row[1]) for row in csv.reader(csvfile))
            row_groups = (
                it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
            json_rows = (merge_duplicate_rows(rg, "kaviar")
                         for rg in row_groups)
            for row in json_rows:
                logging.debug(row)
                yield unlist(dict_sweep(row, vals=[None, ]))
    finally:
        os.remove(tmp_path)
        os.remove(input_fn)
def get_genesets(obo_filename, genemap_filename):
    """Build Disease Ontology gene sets from an OBO file and a genemap file.

    Returns a list with one document for every ontology term that ends up
    with at least one annotated gene id after propagation.
    """
    disease_ontology = GO()
    if disease_ontology.load_obo(obo_filename) is False:
        logging.error('Failed to load OBO file.')

    doid_omim_dict = build_doid_omim_dict(obo_filename)
    mim_diseases = build_mim_diseases_dict(genemap_filename)
    entrez_set = add_term_annotations(doid_omim_dict, disease_ontology,
                                      mim_diseases)
    genes_info = query_mygene(entrez_set, TAX_ID)

    disease_ontology.populated = True
    disease_ontology.propagate()

    genesets = []
    for term_id, term in disease_ontology.go_terms.items():
        # Only terms with at least one gene id become gene sets.
        gene_ids = {annotation.gid for annotation in term.annotations}
        if not gene_ids:
            continue

        geneset = {
            '_id': term_id.replace(":", "_"),
            'is_public': True,
            'taxid': TAX_ID,
            'source': 'do',
            'name': term.full_name,
            'description': create_gs_abstract(term, doid_omim_dict),
            # Sorted by id so the output is reproducible.
            'genes': [genes_info[str(gid)] for gid in sorted(gene_ids)],
        }
        geneset['do'] = {'id': term_id, 'abstract': geneset['description']}

        geneset = dict_sweep(geneset, vals=[None], remove_invalid_list=True)
        genesets.append(unlist(geneset))
    return genesets
def _map_line_to_json(fields):
    """Build the JSON document for one CSV row.

    Columns 1-7 are copied under fixed names and column 8 is split on
    ``" | "`` into the ``hgvs`` list; empty strings are swept out.
    """
    column_names = (
        "gene",
        "variant_id",
        "exon",
        "egl_variant",
        "egl_protein",
        "egl_classification",
        "egl_classification_date",
    )
    document = {
        name: fields[position]
        for position, name in enumerate(column_names, start=1)
    }
    document["hgvs"] = fields[8].split(" | ")
    converted = value_convert_to_number(document)
    return unlist(dict_sweep(converted, vals=[""]))
def query_mygene(self, ids, id_type):
    """Query information from mygene.info about each gene in 'ids'.

    Successful hits are stored in ``self.query_cache`` keyed by the query
    string; failed queries are stored in ``self.missing``.

    Args:
        ids (iterable): Array or set of gene ids to query.
        id_type (str): query scope field for the ids.
            Can be a comma-separated string for multiple scopes.
            e.g. 'entrezgene,symbol'
    """
    self.ids = ids
    mg = mygene.MyGeneInfo()
    # Fields to query
    fields = "entrezgene,ensembl.gene,uniprot.Swiss-Prot,symbol,name"
    # Some id types are widened to related scopes.
    scopes = {
        "symbol": "symbol,alias",
        "entrezgene": "entrezgene,retired",
    }.get(id_type, id_type)
    response = mg.querymany(ids,
                            scopes=scopes,
                            fields=fields,
                            species=self.species,
                            returnall=True)
    # Save failed queries
    self.missing = response['missing']
    # Format successful queries
    for out in response['out']:
        query = out['query']
        if out.get('notfound'):
            continue
        gene = {'mygene_id': out['_id']}
        if out.get('symbol') is not None:
            gene['symbol'] = out['symbol']
        if out.get('name') is not None:
            gene['name'] = out['name']
        if out.get('entrezgene') is not None:
            gene['ncbigene'] = out['entrezgene']
        ensembl = out.get('ensembl')
        if ensembl is not None:
            # 'ensembl' is a list for several hits and a dict for one. Branch
            # on the type: the old length test failed on a one-element list.
            if isinstance(ensembl, list):
                gene['ensemblgene'] = [i['gene'] for i in ensembl]
            else:
                gene['ensemblgene'] = ensembl['gene']
        if out.get('uniprot') is not None:
            gene['uniprot'] = out['uniprot']['Swiss-Prot']
        gene = dict_sweep(gene)
        gene = unlist(gene)
        self.query_cache[query] = gene
def load_data(assembly, input_file, chrom):
    """Yield dbSNP documents for one chromosome of *input_file*.

    Rebinds the module-global name ``logging`` to the ``dbsnp_upload`` logger
    before parsing. Each parsed record is nested under ``dbsnp`` and its
    ``_id`` is moved to the top level of the document.
    """
    import logging as loggingmod
    global logging
    logging = loggingmod.getLogger("dbsnp_upload")
    logging.info("Processing chr{}...".format(chrom))

    records = parse_vcf(assembly,
                        input_file,
                        compressed=True,
                        verbose=False,
                        by_id=True,
                        reference=chrom)
    for record in records:
        wrapped = {'dbsnp': record}
        # Move the id out of the nested record to the top level.
        wrapped['_id'] = record.pop('_id')
        converted = value_convert_to_number(wrapped)
        yield dict_sweep(unlist(converted), [None])
def load_data():
    """Yield one ``drugcentral`` document per structure id.

    Every ``process_*`` table is keyed by structure id; the union of all
    keys is iterated.  The document ``_id`` is chosen as follows:

    1. the ``inchikey`` of the structure entry, when present and truthy;
    2. otherwise whatever ``xrefs_2_inchikey`` returns for the id's xrefs;
    3. otherwise the string ``'DrugCentral:<struc_id>'``.

    Each document is passed through ``unlist`` and ``dict_sweep(..., [None])``
    before being yielded.
    """
    pharmacology_class = process_pharmacology_action(file_path_pharma_class)
    faers = process_faers(file_path_faers)
    act = process_act(file_path_act)
    omop = process_omop(file_path_omop)
    approval = process_approval(file_path_approval)
    drug_dosage = process_drug_dosage(file_path_drug_dosage)
    synonyms = process_synonym(file_path_synonym)
    structures = process_structure(file_path_structure)
    identifiers = process_identifier(file_path_identifier)

    # Collect the keys of every table (same order as the tables are listed).
    tables = (pharmacology_class, faers, act, omop, approval,
              drug_dosage, identifiers, synonyms, structures)
    all_ids = []
    for table in tables:
        all_ids += list(table.keys())

    for struc_id in set(all_ids):
        _id = structures.get(struc_id, {}).get('inchikey', {})
        if not _id:
            _id = xrefs_2_inchikey(identifiers.get(struc_id, {}))
            if not _id:
                _id = 'DrugCentral:' + str(struc_id)
        _doc = {
            '_id': _id,
            'drugcentral': {
                "pharmacology_class": pharmacology_class.get(struc_id, {}),
                "fda_adverse_event": faers.get(struc_id, {}),
                "bioactivity": act.get(struc_id, {}),
                "drug_use": omop.get(struc_id, {}),
                "approval": approval.get(struc_id, {}),
                "drug_dosage": drug_dosage.get(struc_id, {}),
                "synonyms": synonyms.get(struc_id, {}),
                "structures": structures.get(struc_id, {}),
                "xrefs": identifiers.get(struc_id, {}),
            },
        }
        yield dict_sweep(unlist(_doc), [None])
def load_data(data_folder):
    """Parse the CCLE MAF file in ``data_folder`` into merged documents.

    The tab-separated file ``CCLE_DepMap_18q3_maf_20180718.txt`` is read,
    its header is checked against ``VALID_COLUMN_NO`` and normalised with
    ``clean_index``.  Rows with an empty ``chromosome`` are dropped, the
    rest are mapped with ``_map_line_to_json`` (falsy results discarded),
    sorted by ``_id`` and grouped so that rows sharing an ``_id`` are merged
    by ``merge_duplicate_rows(..., "ccle")``.

    Returns:
        A generator of merged documents, each passed through
        ``dict_sweep(..., vals=[None])`` and ``unlist``.

    Raises:
        AssertionError: if the header does not have ``VALID_COLUMN_NO``
            columns.
    """
    input_fn = os.path.join(data_folder, "CCLE_DepMap_18q3_maf_20180718.txt")
    # The file is consumed completely by sorted() below, so it can (and
    # should) be closed before the lazy merge pipeline is handed back.
    with open(input_fn) as handle:
        db_ccle = csv.reader(handle, delimiter='\t')
        index = next(db_ccle)
        assert len(index) == VALID_COLUMN_NO, \
            "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, len(index))
        index = [clean_index(s) for s in index]
        ccle = (dict(zip(index, row)) for row in db_ccle)
        ccle = (row for row in ccle if row["chromosome"] != "")
        json_rows = (doc for doc in map(_map_line_to_json, ccle) if doc)
        # groupby() only merges adjacent rows, hence the sort on the same key.
        json_rows = sorted(json_rows, key=lambda k: k['_id'])
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    json_rows = (merge_duplicate_rows(rg, "ccle") for rg in row_groups)
    return (unlist(dict_sweep(row, vals=[None, ])) for row in json_rows)
def _map_line_to_json(item): chrom = item.CHROM chromStart = item.POS ref = item.REF info = item.INFO hpo_count=item.INFO['HPO_CT'] for alt in item.ALT: alt = str(alt) (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=True) if HGVS is None: return one_snp_json = { "_id": HGVS, "geno2mp": { "hpo_count": hpo_count, } } obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json)), [None])) yield obj
def restructure_dict(dictionary):
    """Reshape one molecule record into ``{'_id': ..., 'chembl': {...}}``.

    * ``_id`` is taken from ``molecule_chembl_id`` when that key exists.
    * The ``chembl`` sub-document is ``dictionary`` itself (no copy is
      made, so the caller's dict is modified in place).
    * When ``molecule_structures`` is a dict, its ``standard_inchi_key``,
      ``canonical_smiles`` and ``standard_inchi`` entries are promoted to
      ``inchi_key``, ``smiles`` and ``inchi``.
    * A truthy ``cross_references`` is converted with ``restructure_xref``
      and stored under ``xrefs``.
    * ``molecule_structures`` and ``cross_references`` are then removed.
      Records that lack either key are tolerated instead of raising
      ``KeyError``.
    * ``chebi_par_id`` gets a ``CHEBI:`` prefix, or is dropped when falsy.

    The result is finally passed through ``unlist``, ``dict_sweep``,
    ``value_convert_to_number`` and ``boolean_convert``.
    """
    restr_dict = dict()
    if 'molecule_chembl_id' in dictionary:
        restr_dict['_id'] = dictionary['molecule_chembl_id']

    # The record itself becomes the 'chembl' payload (shared object).
    chembl = restr_dict['chembl'] = dictionary

    structures = chembl.get('molecule_structures')
    if isinstance(structures, dict):
        promoted = (('standard_inchi_key', 'inchi_key'),
                    ('canonical_smiles', 'smiles'),
                    ('standard_inchi', 'inchi'))
        for src_key, dst_key in promoted:
            if src_key in structures:
                chembl[dst_key] = structures[src_key]

    if chembl.get('cross_references'):
        chembl['xrefs'] = restructure_xref(chembl['cross_references'])

    # pop() rather than del: a record without these keys must not crash.
    chembl.pop('molecule_structures', None)
    chembl.pop('cross_references', None)

    restr_dict = unlist(restr_dict)
    # Add "CHEBI:" prefix, standardize the way representing CHEBI IDs
    if restr_dict['chembl'].get('chebi_par_id'):
        restr_dict['chembl']['chebi_par_id'] = 'CHEBI:' + str(restr_dict['chembl']['chebi_par_id'])
    else:
        # clean, could be a None
        restr_dict['chembl'].pop("chebi_par_id", None)

    restr_dict = dict_sweep(restr_dict, vals=[None, ".", "-", "", "NA", "None", "none", " ", "Not Available", "unknown", "null"])
    restr_dict = value_convert_to_number(restr_dict, skipped_keys=["chebi_par_id", "first_approval"])
    restr_dict = boolean_convert(restr_dict, ["topical", "oral", "parenteral", "dosed_ingredient", "polymer_flag",
                                              "therapeutic_flag", "med_chem_friendly", "molecule_properties.ro3_pass"])
    return restr_dict
def _map_line_to_json(fields):
    """Yield the ``cadd`` document for one row of ``VALID_COLUMN_NO`` columns.

    The document id is built by ``get_hgvs_from_vcf`` from columns 0, 1, 2
    and 4.  When it is ``None`` nothing is yielded.  All columns 0-115
    except column 45 are copied into a nested ``cadd`` dict, which is then
    passed through ``value_convert``, ``unlist`` and
    ``dict_sweep(..., ["NA"])``.
    """
    assert len(fields) == VALID_COLUMN_NO
    chrom, pos, ref, alt = fields[0], fields[1], fields[2], fields[4]
    hgvs_id = get_hgvs_from_vcf(chrom, pos, ref, alt)
    if hgvs_id is None:
        return

    # load as json data
    document = {
        "_id": hgvs_id,
        "cadd": {
            'chrom': fields[0],
            'pos': fields[1],
            'ref': fields[2],
            'anc': fields[3],
            'alt': fields[4],
            'type': fields[5],
            'length': fields[6],
            'istv': fields[7],
            'isderived': fields[8],
            'annotype': fields[9],
            'consequence': fields[10],
            'consscore': fields[11],
            'consdetail': fields[12],
            'gc': fields[13],
            'cpg': fields[14],
            'mapability': {'20bp': fields[15], '35bp': fields[16]},
            'scoresegdup': fields[17],
            'phast_cons': {
                'primate': fields[18],
                'mammalian': fields[19],
                'vertebrate': fields[20],
            },
            'phylop': {
                'primate': fields[21],
                'mammalian': fields[22],
                'vertebrate': fields[23],
            },
            'gerp': {
                'n': fields[24],
                's': fields[25],
                'rs': fields[26],
                'rs_pval': fields[27],
            },
            'bstatistic': fields[28],
            'mutindex': fields[29],
            'dna': {
                'helt': fields[30],
                'mgw': fields[31],
                'prot': fields[32],
                'roll': fields[33],
            },
            'mirsvr': {
                'score': fields[34],
                'e': fields[35],
                'aln': fields[36],
            },
            'targetscans': fields[37],
            'fitcons': fields[38],
            'chmm': {
                'tssa': fields[39],
                'tssaflnk': fields[40],
                'txflnk': fields[41],
                'tx': fields[42],
                'txwk': fields[43],
                'enh': fields[44],
                # 'enh': fields[45],  (column 45 is deliberately not mapped)
                'znfrpts': fields[46],
                'het': fields[47],
                'tssbiv': fields[48],
                'bivflnk': fields[49],
                'enhbiv': fields[50],
                'reprpc': fields[51],
                'reprpcwk': fields[52],
                'quies': fields[53],
            },
            'encode': {
                'exp': fields[54],
                'h3k27ac': fields[55],
                'h3k4me1': fields[56],
                'h3k4me3': fields[57],
                'nucleo': fields[58],
                'occ': fields[59],
                'p_val': {
                    'comb': fields[60],
                    'dnas': fields[61],
                    'faire': fields[62],
                    'polii': fields[63],
                    'ctcf': fields[64],
                    'mycp': fields[65],
                },
                'sig': {
                    'dnase': fields[66],
                    'faire': fields[67],
                    'polii': fields[68],
                    'ctcf': fields[69],
                    'myc': fields[70],
                },
            },
            'segway': fields[71],
            'motif': {
                'toverlap': fields[72],
                'dist': fields[73],
                'ecount': fields[74],
                'ename': fields[75],
                'ehipos': fields[76],
                'escorechng': fields[77],
            },
            'tf': {
                'bs': fields[78],
                'bs_peaks': fields[79],
                'bs_peaks_max': fields[80],
            },
            'isknownvariant': fields[81],
            'esp': {'af': fields[82], 'afr': fields[83], 'eur': fields[84]},
            '1000g': {
                'af': fields[85],
                'asn': fields[86],
                'amr': fields[87],
                'afr': fields[88],
                'eur': fields[89],
            },
            'min_dist_tss': fields[90],
            'min_dist_tse': fields[91],
            'gene': {
                'gene_id': fields[92],
                'feature_id': fields[93],
                'ccds_id': fields[94],
                'genename': fields[95],
                'cds': {
                    'cdna_pos': fields[96],
                    'rel_cdna_pos': fields[97],
                    'cds_pos': fields[98],
                    'rel_cds_pos': fields[99],
                },
                'prot': {
                    'protpos': fields[100],
                    'rel_prot_pos': fields[101],
                    'domain': fields[102],
                },
            },
            'dst2splice': fields[103],
            'dst2spltype': fields[104],
            'exon': fields[105],
            'intron': fields[106],
            'oaa': fields[107],  # ref aa
            'naa': fields[108],  # alt aa
            'grantham': fields[109],
            'polyphen': {'cat': fields[110], 'val': fields[111]},
            'sift': {'cat': fields[112], 'val': fields[113]},
            'rawscore': fields[114],  # raw CADD score
            'phred': fields[115],  # log-percentile of raw CADD score
        },
    }
    converted = value_convert(document)
    yield dict_sweep(unlist(converted), ["NA"])
def _map_line_to_json(df, version, index=0): # specific variable treatment chrom = df["#chr"] if chrom == 'M': chrom = 'MT' # fields[7] in version 2, represent hg18_pos hg18_end = df["hg18_pos(1-based)"] if hg18_end == ".": hg18_end = "." else: hg18_end = int(hg18_end) # in case of no hg19 position provided, remove the item if df["hg19_pos(1-based)"] == '.': return None else: chromStart = int(df["hg19_pos(1-based)"]) chromEnd = chromStart chromStart_38 = int(df["pos(1-based)"]) ref = df["ref"].upper() alt = df["alt"].upper() HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt) HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt) if version == 'hg19': HGVS = HGVS_19 elif version == 'hg38': HGVS = HGVS_38 siphy_29way_pi = df["SiPhy_29way_pi"] if siphy_29way_pi == ".": siphy = "." else: freq = siphy_29way_pi.split(":") siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]} gtex_gene = df["GTEx_V6_gene"].split('|') gtex_tissue = df["GTEx_V6_tissue "].split('|') gtex = map(dict, map(lambda t: zip(('gene', 'tissue'), t), zip(gtex_gene, gtex_tissue))) acc = df["Uniprot_acc_Polyphen2"].rstrip().rstrip(';').split(";") pos = df["Uniprot_aapos_Polyphen2"].rstrip().rstrip(';').split(";") uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos))) provean_score = df["PROVEAN_score"].split(';') sift_score = df["SIFT_score"].split(';') hdiv_score = df["Polyphen2_HDIV_score"].split(';') hvar_score = df["Polyphen2_HVAR_score"].split(';') lrt_score = df["LRT_score"].split(';') m_cap_score = df["M-CAP_score"].split(';') mutationtaster_score = df["MutationTaster_score"].split(';') mutationassessor_score = df["MutationAssessor_score"].split(';') vest3_score = df["VEST3_score"].split(';') metasvm_score = df["MetaSVM_score"].split(';') fathmm_score = df["FATHMM_score"].split(';') metalr_score = df["MetaLR_score"].split(';') revel_score = df["REVEL_score"].split(';') ''' parse mutpred top 5 features ''' def modify_pvalue(pvalue): return 
float(pvalue.strip('P = ')) mutpred_mechanisms = df["MutPred_Top5features"] if mutpred_mechanisms not in ['.', ',', '-']: mutpred_mechanisms = mutpred_mechanisms.split(" (") and mutpred_mechanisms.split(";") mutpred_mechanisms = [m.rstrip(")") for m in mutpred_mechanisms] mutpred_mechanisms = [i.split(" (") for i in mutpred_mechanisms] mutpred_mechanisms = sum(mutpred_mechanisms, []) mechanisms = [ {"mechanism": mutpred_mechanisms[0], "p_val": modify_pvalue(mutpred_mechanisms[1])}, {"mechanism": mutpred_mechanisms[2], "p_val": modify_pvalue(mutpred_mechanisms[3])}, {"mechanism": mutpred_mechanisms[4], "p_val": modify_pvalue(mutpred_mechanisms[5])}, {"mechanism": mutpred_mechanisms[6], "p_val": modify_pvalue(mutpred_mechanisms[7])}, {"mechanism": mutpred_mechanisms[8], "p_val": modify_pvalue(mutpred_mechanisms[9])} ] else: mechanisms = '.' # normalize scores def norm(arr): return [None if item == '.' else item for item in arr] provean_score = norm(provean_score) sift_score = norm(sift_score) hdiv_score = norm(hdiv_score) hvar_score = norm(hvar_score) lrt_score = norm(lrt_score) m_cap_score = norm(m_cap_score) mutationtaster_score = norm(mutationtaster_score) mutationassessor_score = norm(mutationassessor_score) vest3_score = norm(vest3_score) metasvm_score = norm(metasvm_score) fathmm_score = norm(fathmm_score) metalr_score = norm(metalr_score) revel_score = norm(revel_score) # load as json data one_snp_json = { "_id": HGVS, "dbnsfp": { "rsid": df["rs_dbSNP147"], #"rsid_dbSNP144": fields[6], "chrom": chrom, "hg19": { "start": chromStart, "end": chromEnd }, "hg18": { "start": df["hg18_pos(1-based)"], "end": hg18_end }, "hg38": { "start": df["pos(1-based)"], "end": df["pos(1-based)"] }, "ref": ref, "alt": alt, "aa": { "ref": df["aaref"], "alt": df["aaalt"], "pos": df["aapos"], "refcodon": df["refcodon"], "codonpos": df["codonpos"], "codon_degeneracy": df["codon_degeneracy"], }, "genename": df["genename"], "uniprot": list(uniprot), "interpro_domain": 
df["Interpro_domain"], "cds_strand": df["cds_strand"], "ancestral_allele": df["Ancestral_allele"], #"altaineandertal": fields[17], #"denisova": fields[18] "ensembl": { "geneid": df["Ensembl_geneid"], "transcriptid": df["Ensembl_transcriptid"], "proteinid": df["Ensembl_proteinid"] }, "sift": { "score": sift_score, "converted_rankscore": df["SIFT_converted_rankscore"], "pred": df["SIFT_pred"] }, "polyphen2": { "hdiv": { "score": hdiv_score, "rankscore": df["Polyphen2_HDIV_rankscore"], "pred": df["Polyphen2_HDIV_pred"] }, "hvar": { "score": hvar_score, "rankscore": df["Polyphen2_HVAR_rankscore"], "pred": df["Polyphen2_HVAR_pred"] } }, "lrt": { "score": lrt_score, "converted_rankscore": df["LRT_converted_rankscore"], "pred": df["LRT_pred"], "omega": df["LRT_Omega"] }, "mutationtaster": { "score": mutationtaster_score, "converted_rankscore": df["MutationTaster_converted_rankscore"], "pred": df["MutationTaster_pred"], "model": df["MutationTaster_model"], "AAE": df["MutationTaster_AAE"] }, "mutationassessor": { "score": mutationassessor_score, "rankscore": df["MutationAssessor_score_rankscore"], "pred": df["MutationAssessor_pred"] }, "fathmm": { "score": fathmm_score, "rankscore": df["FATHMM_converted_rankscore"], "pred": df["FATHMM_pred"] }, "provean": { "score": provean_score, "rankscore": df["PROVEAN_converted_rankscore"], "pred": df["PROVEAN_pred"] }, "vest3": { "score": vest3_score, "rankscore": df["VEST3_rankscore"], "transcriptid": df["Transcript_id_VEST3"], "transcriptvar": df["Transcript_var_VEST3"] }, "fathmm-mkl": { "coding_score": df["fathmm-MKL_coding_score"], "coding_rankscore": df["fathmm-MKL_coding_rankscore"], "coding_pred": df["fathmm-MKL_coding_pred"], "coding_group": df["fathmm-MKL_coding_group"] }, "eigen": { "coding_or_noncoding": df["Eigen_coding_or_noncoding"], "raw": df["Eigen-raw"], "phred": df["Eigen-phred"] }, "eigen-pc": { "raw": df["Eigen-PC-raw"], "phred": df["Eigen-PC-phred"], "raw_rankscore": df["Eigen-PC-raw_rankscore"] }, "genocanyon": { 
"score": df["GenoCanyon_score"], "rankscore": df["GenoCanyon_score_rankscore"] }, "metasvm": { "score": metasvm_score, "rankscore": df["MetaSVM_rankscore"], "pred": df["MetaSVM_pred"] }, "metalr": { "score": metalr_score, "rankscore": df["MetaLR_rankscore"], "pred": df["MetaLR_pred"] }, "reliability_index": df["Reliability_index"], "m_cap_score": { "score": m_cap_score, "rankscore": df["M-CAP_rankscore"], "pred": df["M-CAP_pred"] }, "revel": { "score": revel_score, "rankscore": df["REVEL_rankscore"] }, "mutpred": { "score": df["MutPred_score"], "rankscore": df["MutPred_rankscore"], "accession": df["MutPred_protID"], "aa_change": df["MutPred_AAchange"], "pred": mechanisms }, "dann": { "score": df["DANN_score"], "rankscore": df["DANN_rankscore"] }, "gerp++": { "nr": df["GERP++_NR"], "rs": df["GERP++_RS"], "rs_rankscore": df["GERP++_RS_rankscore"] }, "integrated": { "fitcons_score": df["integrated_fitCons_score"], "fitcons_rankscore": df["integrated_fitCons_score_rankscore"], "confidence_value": df["integrated_confidence_value"] }, "gm12878": { "fitcons_score": df["GM12878_fitCons_score"], "fitcons_rankscore": df["GM12878_fitCons_score_rankscore"], "confidence_value": df["GM12878_confidence_value"] }, "h1-hesc": { "fitcons_score": df["H1-hESC_fitCons_score"], "fitcons_rankscore": df["H1-hESC_fitCons_score_rankscore"], "confidence_value": df["H1-hESC_confidence_value"] }, "huvec": { "fitcons_score": df["HUVEC_fitCons_score"], "fitcons_rankscore": df["HUVEC_fitCons_score_rankscore"], "confidence_value": df["HUVEC_confidence_value"] }, "phylo": { "p100way": { "vertebrate": df["phyloP100way_vertebrate"], "vertebrate_rankscore": df["phyloP100way_vertebrate_rankscore"] }, "p20way": { "mammalian": df["phyloP20way_mammalian"], "mammalian_rankscore": df["phyloP20way_mammalian_rankscore"] } }, "phastcons": { "100way": { "vertebrate": df["phastCons100way_vertebrate"], "vertebrate_rankscore": df["phastCons100way_vertebrate_rankscore"] }, "20way": { "mammalian": 
df["phastCons20way_mammalian"], "mammalian_rankscore": df["phastCons20way_mammalian_rankscore"] } }, "siphy_29way": { "pi": siphy, "logodds": df["SiPhy_29way_logOdds"], "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"] }, "1000gp3": { "ac": df["1000Gp3_AC"], "af": df["1000Gp3_AF"], "afr_ac": df["1000Gp3_AFR_AC"], "afr_af": df["1000Gp3_AFR_AF"], "eur_ac": df["1000Gp3_EUR_AC"], "eur_af": df["1000Gp3_EUR_AF"], "amr_ac": df["1000Gp3_AMR_AC"], "amr_af": df["1000Gp3_AMR_AF"], "eas_ac": df["1000Gp3_EAS_AC"], "eas_af": df["1000Gp3_EAS_AF"], "sas_ac": df["1000Gp3_SAS_AC"], "sas_af": df["1000Gp3_SAS_AF"] }, "twinsuk": { "ac": df["TWINSUK_AC"], "af": df["TWINSUK_AF"] }, "alspac": { "ac": df["ALSPAC_AC"], "af": df["ALSPAC_AF"] }, "esp6500": { "aa_ac": df["ESP6500_AA_AC"], "aa_af": df["ESP6500_AA_AF"], "ea_ac": df["ESP6500_EA_AC"], "ea_af": df["ESP6500_EA_AF"] }, "exac": { "ac": df["ExAC_AC"], "af": df["ExAC_AF"], "adj_ac": df["ExAC_Adj_AC"], "adj_af": df["ExAC_Adj_AF"], "afr_ac": df["ExAC_AFR_AC"], "afr_af": df["ExAC_AFR_AF"], "amr_ac": df["ExAC_AMR_AC"], "amr_af": df["ExAC_AMR_AF"], "eas_ac": df["ExAC_EAS_AC"], "eas_af": df["ExAC_EAS_AF"], "fin_ac": df["ExAC_FIN_AC"], "fin_af": df["ExAC_FIN_AF"], "nfe_ac": df["ExAC_NFE_AC"], "nfe_af": df["ExAC_NFE_AF"], "sas_ac": df["ExAC_SAS_AC"], "sas_af": df["ExAC_SAS_AF"] }, "exac_nontcga": { "ac": df["ExAC_nonTCGA_AC"], "af": df["ExAC_nonTCGA_AF"], "adj_ac": df["ExAC_nonTCGA_Adj_AC"], "adj_af": df["ExAC_nonTCGA_Adj_AF"], "afr_ac": df["ExAC_nonTCGA_AFR_AC"], "afr_af": df["ExAC_nonTCGA_AFR_AF"], "amr_ac": df["ExAC_nonTCGA_AMR_AC"], "amr_af": df["ExAC_nonTCGA_AMR_AF"], "eas_ac": df["ExAC_nonTCGA_EAS_AC"], "eas_af": df["ExAC_nonTCGA_EAS_AF"], "fin_ac": df["ExAC_nonTCGA_FIN_AC"], "fin_af": df["ExAC_nonTCGA_FIN_AF"], "nfe_ac": df["ExAC_nonTCGA_NFE_AC"], "nfe_af": df["ExAC_nonTCGA_NFE_AF"], "sas_ac": df["ExAC_nonTCGA_SAS_AC"], "sas_af": df["ExAC_nonTCGA_SAS_AF"] }, "exac_nonpsych": { "ac": df["ExAC_nonpsych_AC"], "af": 
df["ExAC_nonpsych_AF"], "adj_ac": df["ExAC_nonpsych_Adj_AC"], "adj_af": df["ExAC_nonpsych_Adj_AF"], "afr_ac": df["ExAC_nonpsych_AFR_AC"], "afr_af": df["ExAC_nonpsych_AFR_AF"], "amr_ac": df["ExAC_nonpsych_AMR_AC"], "amr_af": df["ExAC_nonpsych_AMR_AF"], "eas_ac": df["ExAC_nonpsych_EAS_AC"], "eas_af": df["ExAC_nonpsych_EAS_AF"], "fin_ac": df["ExAC_nonpsych_FIN_AC"], "fin_af": df["ExAC_nonpsych_FIN_AF"], "nfe_ac": df["ExAC_nonpsych_NFE_AC"], "nfe_af": df["ExAC_nonpsych_NFE_AF"], "sas_ac": df["ExAC_nonpsych_SAS_AC"], "sas_af": df["ExAC_nonpsych_SAS_AF"] }, "clinvar": { "rs": df["clinvar_rs"], "clinsig": list(map(int,[i for i in df["clinvar_clnsig"].split("|") if i != "."])), "trait": [i for i in df["clinvar_trait"].split("|") if i != "."], "golden_stars": list(map(int,[i for i in df["clinvar_golden_stars"].split("|") if i != "."])) }, "gtex": list(gtex) } } one_snp_json = list_split(dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=[".", '-', None]), ";") one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"]) return one_snp_json
def _map_line_to_json(cp, hg19): try: clinical_significance = cp.ReferenceClinVarAssertion.\ ClinicalSignificance.Description except: clinical_significance = None rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc try: review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\ ReviewStatus except: review_status = None try: last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\ DateLastEvaluated except: last_evaluated = None number_submitters = len(cp.ClinVarAssertion) # some items in clinvar_xml doesn't have origin information try: origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin except: origin = None conditions = [] for _trait in cp.ReferenceClinVarAssertion.TraitSet.Trait: synonyms = [] conditions_name = '' for name in _trait.Name: if name.ElementValue.Type == 'Alternate': synonyms.append(name.ElementValue.get_valueOf_()) if name.ElementValue.Type == 'Preferred': conditions_name += name.ElementValue.get_valueOf_() identifiers = {} for item in _trait.XRef: if item.DB == 'Human Phenotype Ontology': key = 'Human_Phenotype_Ontology' else: key = item.DB identifiers[key.lower()] = item.ID for symbol in _trait.Symbol: if symbol.ElementValue.Type == 'Preferred': conditions_name += ' (' + symbol.ElementValue.get_valueOf_() + ')' age_of_onset = '' for _set in _trait.AttributeSet: if _set.Attribute.Type == 'age of onset': age_of_onset = _set.Attribute.get_valueOf_() conditions.append({"name": conditions_name, "synonyms": synonyms, "identifiers": identifiers, "age_of_onset": age_of_onset}) try: genotypeset = cp.ReferenceClinVarAssertion.GenotypeSet except: genotypeset = None if genotypeset: obj_list = [] id_list = [] for _set in cp.ReferenceClinVarAssertion.GenotypeSet.MeasureSet: variant_id = _set.ID for _measure in _set.Measure: json_obj = parse_measure(_measure, hg19=hg19) if json_obj: json_obj['clinvar']['rcv'].update({'accession': rcv_accession, 'clinical_significance': clinical_significance, 'number_submitters': 
number_submitters, 'review_status': review_status, 'last_evaluated': str(last_evaluated), 'origin': origin, 'conditions': conditions}) json_obj['clinvar'].update({'variant_id': variant_id}) json_obj = (dict_sweep(unlist(value_convert_to_number(json_obj, ['chrom', 'omim', 'id', 'orphanet', 'gene', 'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])), [None, '', 'None'])) obj_list.append(json_obj) id_list.append(json_obj['_id']) for _obj in obj_list: _obj['clinvar'].update({'genotypeset': { 'type': 'CompoundHeterozygote', 'genotype': id_list }}) yield _obj else: variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID for _measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure: json_obj = parse_measure(_measure, hg19=hg19) if json_obj: json_obj['clinvar']['rcv'].update({'accession': rcv_accession, 'clinical_significance': clinical_significance, 'number_submitters': number_submitters, 'review_status': review_status, 'last_evaluated': str(last_evaluated), 'origin': origin, 'conditions': conditions}) json_obj['clinvar'].update({'variant_id': variant_id}) json_obj = (dict_sweep(unlist(value_convert_to_number(json_obj, ['chrom', 'omim', 'id', 'orphanet', 'gene', 'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])), [None, '', 'None'])) yield json_obj
def load_data():
    """Yield one ``{'dgidb': {...}}`` document per record from the API.

    Each record returned by ``fetch_all_docs_from_api`` has its ``id`` key
    renamed to ``interaction_id`` (in place), is wrapped under ``dgidb``
    and passed through ``unlist`` and ``dict_sweep`` (dropping ``None``,
    ``""`` and ``[]``).
    """
    for record in fetch_all_docs_from_api():
        record['interaction_id'] = record.pop('id')
        wrapped = {'dgidb': record}
        yield dict_sweep(unlist(wrapped), vals=[None, "", []])
def annotate(self,hgvs_vcfs):
    """Run the snpEff command on a batch of variants and yield annotations.

    Args:
        hgvs_vcfs: mapping of HGVS id -> dict holding a ``"vcf"`` entry
            with ``chrom``, ``position``, ``ref`` and ``alt`` values.

    Yields:
        ``{"_id": <hgvs id>, "snpeff": {"ann": ..., "lof": ..., "nmd": ...}}``
        documents, passed through ``unlist`` and ``dict_sweep`` (dropping
        ``''`` and ``None``).

    Raises:
        Exception: when the command writes anything to stderr other than
            the "NEW VERSION!" notice.  Input and stderr are dumped to a
            ``snpeff_err_<timestamp>.pickle`` file first.

    Entries rejected by ``self.check_hgvs_info`` (TypeError/ValueError) are
    logged and skipped.  If nothing is left, the generator ends without
    running the command.
    """

    def _split_pair(value):
        # "a/b" -> (a, b); empty value -> (None, None)
        if value:
            (first, second) = value.split('/')
            return first, second
        return None, None

    def _parse_gene_summary(info):
        # the information to be parsed is like this: 'LOF=(PTEN|PTEN|1|1.00)'
        payload = info.split('(')[1].split(')')[0]
        (gene_id, genename, n_transcripts, pct_affected) = payload.split('|')
        return {
            "gene_id": gene_id,
            "genename": genename,
            "number_of_transcripts_in_gene": n_transcripts,
            "percent_of_transcripts_affected": pct_affected
        }

    # title of vcf
    vcf_stdin = ['#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO']
    for hgvs_id in hgvs_vcfs:
        vcf = hgvs_vcfs[hgvs_id]["vcf"]
        try:
            self.check_hgvs_info(vcf)
        except (TypeError, ValueError) as e:
            self.logger.warning("Skipping HGVS %s: %s" % (repr(hgvs_vcfs[hgvs_id]),e))
            continue
        # add hgvs ID at the end so we can match for sure which annotations
        # correspond to which ID instead of rebuild it from VCF info (they
        # can be different); it is found again in the result line below
        vcf_stdin.append(str(vcf["chrom"]) + '\t' + str(vcf["position"]) + '\t' + '.' + '\t' +
                         vcf["ref"] + '\t' + vcf["alt"] + '\t.\t.\t.' + "\t# hgvs:" + hgvs_id)

    nb_inputs = len(vcf_stdin) - 1  # -1: header
    if nb_inputs == 0:
        self.logger.info("No HGVS ID as input (previously filtered out)")
        return

    self.logger.info("Running '%s' on %d HGVS IDs" % (self.snpeff_cmd,nb_inputs))
    proc = subprocess.Popen(self.snpeff_cmd, stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = proc.communicate("\n".join(vcf_stdin).encode())
    stderr = stderr.decode()
    # they print some news message on stderr, bad idea when we use it to
    # detect errors. try to get rid of it
    if "NEW VERSION!" in stderr:
        err_lines = stderr.splitlines()
        # Locate the line with the same substring test as the guard above,
        # so surrounding whitespace on that line cannot raise ValueError.
        start = next(i for i, line in enumerate(err_lines) if "NEW VERSION!" in line)
        # message is 5 lines long (hopefully..)
        end = start + 5
        # rebuild and clean any empty lines
        stderr = "\n".join(err_lines[:start] + err_lines[end:]).strip()
    if stderr != '':
        fn = "snpeff_err_%s.pickle" % datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        # `with` guarantees the dump is flushed and closed before raising.
        with open(fn, "wb") as dump_file:
            pickle.dump({"input" : hgvs_vcfs, "vcf_stdin" : vcf_stdin, "stderr" : stderr}, dump_file)
        raise Exception("Something went wrong while generating snpeff annotation (see dump %s for more):\n%s" % (fn,stderr))

    for vcf_line in stdout.decode().splitlines():
        if vcf_line.startswith('#') or vcf_line == '':
            continue
        # extract the HGVS id appended to the input line
        fromi = vcf_line.index("#")
        str_id = vcf_line[fromi:]
        hgvs_info = str_id.replace("#","").strip().split(":")
        assert hgvs_info[0] == "hgvs", "Can't find HGVS ID in VCF line '%s'" % repr(vcf_line)
        hgvs_id = ":".join(hgvs_info[1:])
        # -1: remove the tab char also, before #
        vcf_line = vcf_line[:fromi-1]
        sections = vcf_line.split(';')
        # assume the following item is 'ANN'
        ann_info = sections[0]
        ann = []
        # Multiple annotations per VCF line
        for item in ann_info.split(','):
            parts = item.split('|')
            if len(parts) > 1:
                (effect, putative_impact, gene_name, gene_id, feature_type, feature_id) = parts[1:7]
                (transcript_biotype, exon, hgvs_coding, hgvs_protein, cdna, cds, protein,
                 distance_to_feature) = parts[7:15]
                (cdna_position, cdna_len) = _split_pair(cdna)
                (cds_position, cds_len) = _split_pair(cds)
                (protein_position, protein_len) = _split_pair(protein)
                (rank, total) = _split_pair(exon)
                ann.append({
                    "effect": effect,
                    "putative_impact": putative_impact,
                    "genename": gene_name,
                    "gene_id": gene_id,
                    "feature_type": feature_type,
                    "feature_id": feature_id,
                    "transcript_biotype": transcript_biotype,
                    "rank": rank,
                    "total": total,
                    "hgvs_c": trim_delseq_from_hgvs(hgvs_coding),  # trim long sequence
                    "hgvs_p": hgvs_protein,
                    "cdna": {
                        "position": cdna_position,
                        "length": cdna_len
                    },
                    "cds": {
                        "position": cds_position,
                        "length": cds_len
                    },
                    "protein": {
                        "position": protein_position,
                        "length": protein_len
                    },
                    "distance_to_feature": distance_to_feature
                })
        # not all annotations include lof & nmd information. Set them to 'None' as default
        lof = None
        nmd = None
        # the case that annotation include 'ann' & 'lof' & 'nmd'
        if len(sections) == 3:
            (lof_info, nmd_info) = sections[1:3]
            # assume the second item is 'lof'
            assert lof_info.startswith('LOF')
            lof = _parse_gene_summary(lof_info)
            nmd = _parse_gene_summary(nmd_info)
        # the case that annotation include 'ann' & 'lof or nmd'
        elif len(sections) == 2:
            (ann_info, idk_info) = sections
            if idk_info.startswith('LOF'):
                lof = _parse_gene_summary(idk_info)
            else:
                nmd = _parse_gene_summary(idk_info)
        # Values are not used; the unpacking is kept so that a line with
        # fewer than five tab-separated columns still fails loudly.
        (_chrom, _pos, _vid, _ref, _alt) = ann_info.split('\t')[0:5]
        one_snp_json = {
            "_id": hgvs_id,
            "snpeff": {
                "ann": ann,
                "lof": lof,
                "nmd": nmd,
            },
        }
        snpeff_json = dict_sweep(unlist(one_snp_json), vals=['', None])
        yield snpeff_json
def _map_line_to_json(doc_key, item): chrom = item.CHROM chromStart = item.POS ref = item.REF info = item.INFO try: baseqranksum = info['BaseQRankSum'] except: baseqranksum = None try: clippingranksum = info['ClippingRankSum'] except: clippingranksum = None try: mqranksum = info['MQRankSum'] except: mqranksum = None try: readposranksum = info['ReadPosRankSum'] except: readposranksum = None try: qd = info['QD'] except: qd = None try: inbreedingcoeff = info['InbreedingCoeff'] except: inbreedingcoeff = None # convert vcf object to string item.ALT = [str(alt) for alt in item.ALT] # if multiallelic, put all variants as a list in multi-allelic field hgvs_list = None if len(item.ALT) > 1: hgvs_list = [get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=False) for alt in item.ALT] for i, alt in enumerate(item.ALT): (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=True) if HGVS is None: return assert len(item.ALT) == len(info['AC']), "Expecting length of item.ALT= length of info.AC, but not for %s" % (HGVS) assert len(item.ALT) == len(info['AF']), "Expecting length of item.ALT= length of info.AF, but not for %s" % (HGVS) assert len(item.ALT) == len(info['Hom_AFR']), "Expecting length of item.ALT= length of HOM_AFR, but not for %s" % (HGVS) one_snp_json = { "_id": HGVS, doc_key : { "chrom": chrom, "pos": chromStart, "multi-allelic": hgvs_list, "ref": ref, "alt": alt, "alleles": item.ALT, "type": var_type, "ac": { "ac": info['AC'][i], "ac_afr": info['AC_AFR'][i], "ac_amr": info['AC_AMR'][i], "ac_adj": info['AC_Adj'][i], "ac_eas": info['AC_EAS'][i], "ac_fin": info['AC_FIN'][i], "ac_het": info['AC_Het'][i], "ac_hom": info['AC_Hom'][i], "ac_nfe": info['AC_NFE'][i], "ac_oth": info['AC_OTH'][i], "ac_sas": info['AC_SAS'][i], "ac_male": info['AC_MALE'][i], "ac_female": info['AC_FEMALE'][i] }, "af": info['AF'][i], "an": { "an": info['AN'], "an_afr": info['AN_AFR'], "an_amr": info['AN_AMR'], "an_adj": info['AN_Adj'], "an_eas": info['AN_EAS'], 
"an_fin": info['AN_FIN'], "an_nfe": info['AN_NFE'], "an_oth": info['AN_OTH'], "an_sas": info['AN_SAS'], "an_female": info['AN_FEMALE'], "an_male": info['AN_MALE'] }, "baseqranksum": baseqranksum, "clippingranksum": clippingranksum, "fs": info['FS'], "het": { "het_afr": info['Het_AFR'], "het_amr": info['Het_AMR'], "het_eas": info['Het_EAS'], "het_fin": info['Het_FIN'], "het_nfe": info['Het_NFE'], "het_oth": info['Het_OTH'], "het_sas": info['Het_SAS'] }, "hom": { "hom_afr": info['Hom_AFR'], "hom_amr": info['Hom_AMR'], "hom_eas": info['Hom_EAS'], "hom_fin": info['Hom_FIN'], "hom_nfe": info['Hom_NFE'], "hom_oth": info['Hom_OTH'], "hom_sas": info['Hom_SAS'] }, "inbreedingcoeff": inbreedingcoeff, "mq": { "mq": info['MQ'], "mq0": info['MQ0'], "mqranksum": mqranksum }, "ncc": info['NCC'], "qd": qd, "readposranksum": readposranksum, "vqslod": info['VQSLOD'], "culprit": info['culprit'] } } obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json)), [None])) yield obj
def _map_line_to_json(fields):
    """Convert one row of column values into a ``clinvar`` document.

    The genomic HGVS ``_id`` is assembled from the chromosome/start/end
    columns (``fields[13]``..``fields[15]``), the variant type in
    ``fields[1]``, the coding HGVS string in ``fields[18]`` and the REF/ALT
    of the record returned by ``vcf_reader.fetch`` (``vcf_reader`` is not
    defined in this function; it is resolved from the enclosing scope).

    Args:
        fields: Sequence of exactly ``VALID_COLUMN_NO`` column values.

    Returns:
        The document after ``value_convert_to_number``, ``unlist`` and
        ``dict_sweep(..., vals=["-"])``, or ``None`` when no VCF record is
        found or no HGVS id could be built.

    Raises:
        AssertionError: if ``fields`` does not have ``VALID_COLUMN_NO`` items.
        IndexError: if ``fields[18]`` contains no ``":"`` separator.
    """
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[13]
    chromStart = fields[14]
    chromEnd = fields[15]

    HGVS = None
    # Keep only the part after the "<prefix>:" of the coding HGVS string.
    cds = fields[18].split(":")
    cds = cds[1]
    replace = re.findall(r'[ATCGMNYR=]+', cds)
    sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', cds)
    ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', cds)
    delete = fields[1] == 'deletion'
    indel = fields[1] == 'indel'
    dup = re.search(r'dup', cds)
    inv = re.search(r'inv|inv[0-9]+|inv[ATCGMNHYR]+', cds)
    # An "ins" found in the coding string takes precedence over the
    # deletion/indel flags derived from the type column.
    if ins:
        delete = None
        indel = None
    elif delete:
        ins = None
        indel = None

    # parse from vcf file. Input chrom number
    # and chromStart, and return REF, ALT
    if chromStart:
        record = vcf_reader.fetch(chrom, int(chromStart))
    else:
        record = None
    if record:
        REF = record.REF
        ALT = record.ALT
        ALT = ALT[0]
        if record.is_snp and len(ALT) < 2:
            mod = [REF, ALT]
        else:
            mod = ALT
    else:
        return

    if sub and record.is_snp:
        HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, mod[0], mod[1])
    elif ins:
        HGVS = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, mod)
    elif delete:
        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
    elif indel:
        try:
            HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, mod)
        except AttributeError:
            # Single pre-formatted argument: prints the same text whether
            # print is a statement (Python 2) or a function (Python 3).
            print("ERROR: %s %s" % (fields[1], cds))
    elif dup:
        HGVS = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, mod)
    elif inv:
        HGVS = "chr%s:g.%s_%sinv%s" % (chrom, chromStart, chromEnd, mod)
    elif replace:
        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, mod)
    else:
        print("ERROR: %s %s" % (fields[1], cds))

    # load as json data
    if HGVS is None:
        print("None: %s %s" % (fields[1], cds))
        return None

    one_snp_json = {
        "_id": HGVS,
        "clinvar": {
            "allele_id": fields[0],
            "hg19": {
                "chr": fields[13],
                "start": fields[14],
                "end": fields[15]
            },
            "type": fields[1],
            "name": fields[2],
            "gene": {
                "id": fields[3],
                "symbol": fields[4]
            },
            "clinical_significance": fields[5].split(";"),
            "rsid": 'rs' + str(fields[6]),
            "nsv_dbvar": fields[7],
            "rcv_accession": fields[8].split(";"),
            "tested_in_gtr": fields[9],
            "phenotype_id": other_id(fields[10]),
            "origin": fields[11],
            "cytogenic": fields[16],
            "review_status": fields[17],
            "hgvs": {
                "coding": fields[18],
                "protein": fields[19]
            },
            "number_submitters": fields[20],
            "last_evaluated": fields[21],
            "guidelines": fields[22],
            "other_ids": other_id(fields[23]),
            "clinvar_id": fields[24]
        }
    }
    return dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=["-"])
def _map_line_to_json(fields,dbsnp_col):
    """Build a ``grasp`` document for one row, keyed by a dbSNP-derived id.

    The row's SNP id (``fields[8]``) is looked up in ``dbsnp_col`` via
    ``find({"dbsnp.rsid": ...})``; the ``_id`` of the first document found
    becomes the ``_id`` of the returned document. Any further matches are
    ignored.

    Args:
        fields: Sequence of exactly ``VALID_COLUMN_NO`` column values.
        dbsnp_col: Collection-like object offering ``find(query)``.

    Returns:
        The document after ``value_convert_to_number``, ``unlist``,
        ``dict_sweep(..., [""])`` and ``list_split(..., ",")``; ``None`` when
        the SNP id is ``None`` or the lookup finds nothing.

    Raises:
        AssertionError: if ``fields`` does not have ``VALID_COLUMN_NO`` items.
    """
    assert len(fields) == VALID_COLUMN_NO

    rsid = fields[8]
    if rsid is None:
        return

    # Drain the whole result set, exactly like iterating it into a list.
    matches = list(dbsnp_col.find({"dbsnp.rsid": rsid}))
    if not matches:
        return None
    hgvs_id = matches[0]['_id']

    # The discovery and replication sample breakdowns use the same thirteen
    # columns in the same order, starting at column 25 and 38 respectively.
    sample_columns = (
        'total_samples', 'european', 'african', 'east_asian',
        'indian_south_asian', 'hispanic', 'native', 'micronesian',
        'arab_me', 'mixed', 'unspecified', 'filipino', 'indonesian',
    )

    def sample_breakdown(first_column):
        return {
            label: fields[first_column + shift]
            for shift, label in enumerate(sample_columns)
        }

    publication = {
        'journal': fields[16],
        'title': fields[17],
        'pmid': fields[7],
        'snpid': fields[8],
        'location_within_paper': fields[9],
        'p_value': fields[10],
        'phenotype': fields[11],
        'paper_phenotype_description': fields[12],
        'paper_phenotype_categories': fields[13],
        'date_pub': fields[14],
    }

    grasp = {
        'hg19': {'chr': fields[5], 'pos': fields[6]},
        'hupfield': fields[1],
        'last_curation_date': fields[2],
        'creation_date': fields[3],
        'srsid': fields[4],
        'publication': publication,
        'includes_male_female_only_analyses': fields[18],
        'exclusively_male_female': fields[19],
        'initial_sample_description': fields[20],
        'replication_sample_description': fields[21],
        'platform_snps_passing_qc': fields[22],
        'gwas_ancestry_description': fields[23],
        'discovery': sample_breakdown(25),
        'replication': sample_breakdown(38),
        'in_gene': fields[51],
        'nearest_gene': fields[52],
        'in_lincrna': fields[53],
        'in_mirna': fields[54],
        'in_mirna_bs': fields[55],
        # Columns 56-60 are not used by this mapper.
        'oreg_anno': fields[61],
        'conserv_pred_tfbs': fields[62],
        'human_enhancer': fields[63],
        'rna_edit': fields[64],
        'polyphen2': fields[65],
        'sift': fields[66],
        'ls_snp': fields[67],
        'uniprot': fields[68],
        'eqtl_meth_metab_study': fields[69],
    }

    document = {"_id": hgvs_id, "grasp": grasp}
    numeric = value_convert_to_number(document)
    swept = dict_sweep(unlist(numeric), [""])
    return list_split(swept, ",")