def get_hgvs_name(record, as_list=False):
    """Collect HGVS ids, ALT alleles and start/end positions for one record.

    Reads ``record.CHROM``, ``record.INFO['RSPOS']`` and ``record.REF`` once,
    then walks ``record.ALT``.

    Args:
        record: object exposing ``CHROM``, ``INFO``, ``REF`` and ``ALT``.
        as_list: accepted for compatibility; not used by this function.

    Returns:
        A tuple ``(ids, alts, positions)`` of three lists:

        * ``ids`` -- one HGVS id per ALT for which ``get_hgvs_from_vcf`` did
          not raise ``ValueError``; it can therefore be shorter than the
          other two lists.
        * ``alts`` -- one entry per ALT; truthy alleles are converted with
          ``str``, falsy ones (e.g. ``None``) are kept as they are.
        * ``positions`` -- one ``OrderedDict(start=..., end=...)`` per ALT;
          both values are ``None`` when the position lookup raised
          ``ValueError``.
    """
    chromosome = record.CHROM
    rs_position = record.INFO['RSPOS']
    ref_allele = record.REF
    hgvs_ids, alt_alleles, positions = [], [], []

    for raw_alt in record.ALT:
        # Only stringify real alleles; a missing ALT must stay falsy.
        alt_allele = str(raw_alt) if raw_alt else raw_alt
        alt_alleles.append(alt_allele)

        # NOTE: get_pos_start_end does not handle the ALT=None case yet.
        # TODO: drop the str() conversion above once it does.
        try:
            begin, finish = get_pos_start_end(chromosome, rs_position,
                                              ref_allele, alt_allele)
        except ValueError:
            # start/end could not be inferred from the VCF fields
            begin = finish = None
        positions.append(OrderedDict(start=begin, end=finish))

        try:
            hgvs_ids.append(
                get_hgvs_from_vcf(chromosome, rs_position, ref_allele,
                                  str(alt_allele), mutant_type=False)
            )
        except ValueError:
            # no HGVS id could be inferred for this allele; skip it
            continue

    return hgvs_ids, alt_alleles, positions
def parse_one_rec(assembly, record):
    """Restructure one JSON record into per-variant documents.

    A single ``doc`` dict is built from ``record`` and then updated in place
    for every ``(hgvs, vcf)`` pair returned by ``get_hgvs_and_vcf``.  For each
    pair with a truthy ``hgvs`` the current state of ``doc`` is post-processed
    (``value_convert_to_number`` -> ``unlist`` -> ``dict_sweep``) and yielded.

    Args:
        assembly: key under which the start/end positions are stored in the
            document; also forwarded to ``get_hgvs_and_vcf``.
        record: mapping with at least ``primary_snapshot_data``,
            ``refsnp_id``, ``last_update_build_id``, ``dbsnp1_merges`` and
            ``citations`` entries.

    Yields:
        The swept document for every pair whose ``hgvs`` is truthy.
    """
    snapshot = record.get("primary_snapshot_data")
    doc = {
        "alleles": [],
        "gene": [],
        assembly: {},
        "vartype": snapshot.get("variant_type"),
        "rsid": "rs" + str(record.get("refsnp_id")),
        "dbsnp_build": int(record.get("last_update_build_id")),
        "dbsnp_merges": restructure_dbsnp_merge(record.get("dbsnp1_merges")),
        "citations": record.get("citations"),
    }

    hgvs_vcf_pairs = get_hgvs_and_vcf(assembly,
                                      snapshot.get("placements_with_allele"))
    annotations = list(snapshot.get('allele_annotations'))
    doc["alleles"] = restructure_allele_freq_info(annotations)
    doc['gene'] = restructure_gene_info(annotations)

    # Numeric chromosome codes that are reported under their letter names.
    sex_chromosomes = {"23": "X", "24": "Y"}

    for hgvs, vcf in hgvs_vcf_pairs:
        if vcf:
            doc["chrom"], pos, doc["ref"], doc["alt"] = vcf
            chrom_name = str(doc["chrom"])
            doc["chrom"] = sex_chromosomes.get(chrom_name, chrom_name)

            doc[assembly] = {}
            try:
                is_snv = doc["vartype"] == "snv"
                # Non-SNV alleles may be empty, so a placeholder base is
                # prepended before asking for the start/end positions.
                ref = doc["ref"] if is_snv else "T" + doc["ref"]
                alt = doc["alt"] if is_snv else "T" + doc["alt"]
                # Insertions/deletions are anchored one base earlier.
                shifted = doc["vartype"] in ["ins", "del", "delins"]
                anchor = pos - 1 if shifted else pos
                start, end = get_pos_start_end(doc["chrom"], anchor, ref, alt)
                doc[assembly]['start'] = start
                doc[assembly]['end'] = end
            except (ValueError, AssertionError):
                # positions could not be derived; leave the assembly empty
                doc[assembly] = {}

        if hgvs:
            doc["_id"] = hgvs.replace('chr23', 'chrX').replace('chr24', 'chrY')
            converted = value_convert_to_number(
                doc,
                skipped_keys=['chrom', 'ref', 'alt', 'allele',
                              'deleted_sequence', 'inserted_sequence'],
            )
            yield dict_sweep(unlist(converted), vals=[[], {}, None])
def get_start_end(variant_type, chrom, pos, ref, alt):
    """Return ``(start, end)`` for a variant, or ``(None, None)`` on failure.

    Args:
        variant_type: variant type string; ``"snv"`` is passed through
            unchanged, and ``"ins"``, ``"del"`` and ``"delins"`` shift the
            position by one.
        chrom: chromosome, forwarded to ``get_pos_start_end``.
        pos: position of the variant.
        ref: reference allele (may be empty for non-SNV variants).
        alt: alternate allele (may be empty for non-SNV variants).

    Returns:
        The ``(start, end)`` pair from ``get_pos_start_end``, or
        ``(None, None)`` when it raises ``ValueError`` or ``AssertionError``.
    """
    # TODO: this is a hack. For anything but an 'snv', ref or alt might be
    # empty, and `get_pos_start_end` cannot work with empty alleles. A single
    # leading character ("T", but any would do) sidesteps that limitation.
    # Ideally `get_pos_start_end` would deal with this itself.
    if variant_type != "snv":
        ref, alt = "T" + ref, "T" + alt

    try:
        shifted = variant_type in ("ins", "del", "delins")
        anchor = pos - 1 if shifted else pos
        start, end = get_pos_start_end(chrom, anchor, ref, alt)
    except (ValueError, AssertionError):
        return None, None
    return start, end
def annotate_start_end(hgvs_vcfs, assembly):
    """Yield every document of ``hgvs_vcfs``, adding start/end where possible.

    For documents that carry a ``'vcf'`` entry, the ``'chrom'`` key is removed
    from it and ``get_pos_start_end`` is asked for the positions.  When both
    returned values are truthy they are stored as
    ``doc[assembly] = {"start": ..., "end": ...}``.  Any exception raised
    while doing so is ignored (best effort) and the document is yielded
    without positions.

    Args:
        hgvs_vcfs: mapping of id -> document (documents are modified in place).
        assembly: key under which the positions are stored.

    Yields:
        Each document, in the iteration order of ``hgvs_vcfs``.
    """
    for key in hgvs_vcfs:
        entry = hgvs_vcfs[key]
        if 'vcf' not in entry:
            yield entry
            continue

        vcf = entry['vcf']
        # chrom is not needed downstream
        vcf.pop('chrom', None)
        try:
            begin, finish = get_pos_start_end(
                chr=None,  # not even used in func
                pos=vcf['position'],
                ref=vcf['ref'],
                alt=vcf['alt'])
            if begin and finish:
                entry[assembly] = {"start": begin, "end": finish}
        except Exception:
            # best effort: leave the document without positions
            pass
        yield entry
def annotate_start_end(hgvs_vcfs, assembly):
    """Attach start/end positions to the documents in ``hgvs_vcfs``.

    Every document is yielded exactly once.  If it has a ``'vcf'`` entry, the
    entry's ``'chrom'`` key is dropped and the ``position``/``ref``/``alt``
    values are passed to ``get_pos_start_end``; a truthy ``(start, end)``
    result is written to ``doc[assembly]``.  Failures of any kind during that
    step are swallowed on purpose so the document is still yielded.

    Args:
        hgvs_vcfs: mapping of id -> document; documents are updated in place.
        assembly: name of the key that receives ``{"start", "end"}``.

    Yields:
        The (possibly annotated) documents.
    """
    for variant_key in hgvs_vcfs:
        variant = hgvs_vcfs[variant_key]
        if 'vcf' in variant:
            # remove chrom, not needed
            variant['vcf'].pop('chrom', None)
            try:
                span = get_pos_start_end(
                    chr=None,  # not even used in func
                    pos=variant['vcf']['position'],
                    ref=variant['vcf']['ref'],
                    alt=variant['vcf']['alt'])
                first, last = span
                if first and last:
                    variant[assembly] = {"start": first, "end": last}
            except Exception:
                # deliberate best effort: positions are optional
                pass
        yield variant
def _map_line_to_json(fields): chrInfo = fields[0].split(":") # grch37 chrom = chrInfo[0] chromStart = int(chrInfo[1]) ma_fin_percent = fields[7].split("/") if fields[3]: mutation = fields[3].split(">") ref = mutation[0] alt = mutation[1] HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt) hg19 = get_pos_start_end(chrom, chromStart, ref, alt) hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref, alt) # load as json data if HGVS is None: return one_snp_json = { "_id": HGVS, "evs": { "chrom": chrom, "hg19": { "start": hg19[0], "end": hg19[1] }, "hg38": { "start": hg38[0], "end": hg38[1] }, "rsid": fields[1], "dbsnp_version": get_dbsnp(fields[2]), "ref": ref, "alt": alt, "allele_count": { "european_american": count_dict(fields[4]), "african_american": count_dict(fields[5]), "all": count_dict(fields[6]) }, "ma_fin_percent": { "european_american": ma_fin_percent[0], "african_american": ma_fin_percent[1], "all": ma_fin_percent[2] }, "genotype_count": { "european_american": count_dict(fields[8]), "african_american": count_dict(fields[9]), "all_genotype": count_dict(fields[10]) }, "avg_sample_read": fields[11], "gene": { "symbol": fields[12], "accession": fields[13] }, "function_gvs": fields[14], "hgvs": { "coding": fields[16], "protein": fields[15] }, "coding_dna_size": fields[17], "conservation": { "phast_cons": fields[18], "gerp": fields[19] }, "grantham_score": fields[20], "polyphen2": { "class": polyphen(fields[21])[0], "score": polyphen(fields[21])[1] }, "ref_base_ncbi": fields[22], "chimp_allele": fields[23], "clinical_info": fields[24], "filter_status": fields[25], "on_illumina_human_exome_chip": fields[26], "gwas_pubmed_info": fields[27], "estimated_age_kyrs": { "ea": fields[28], "aa": fields[29] } } } return dict_sweep(value_convert(one_snp_json), vals=["NA", "none", "unknown"])
def _map_line_to_json(fields, version): chrInfo = fields[0].split(":") # grch37 chrom = chrInfo[0] chromStart = int(chrInfo[1]) ma_fin_percent = fields[7].split("/") if fields[3]: mutation = fields[3].split(">") ref = mutation[0] alt = mutation[1] hg19 = get_pos_start_end(chrom, chromStart, ref, alt) hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref, alt) if version == 'hg19': HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt) elif version == 'hg38': HGVS = get_hgvs_from_vcf(chrom, hg38[0], ref, alt) # load as json data if HGVS is None: return one_snp_json = { "_id": HGVS, "evs": { "chrom": chrom, "hg19": { "start": hg19[0], "end": hg19[1] }, "hg38": { "start": hg38[0], "end": hg38[1] }, "rsid": fields[1], "dbsnp_version": get_dbsnp(fields[2]), "ref": ref, "alt": alt, "allele_count": { "european_american": count_dict(fields[4]), "african_american": count_dict(fields[5]), "all": count_dict(fields[6]) }, "ma_fin_percent": { "european_american": ma_fin_percent[0], "african_american": ma_fin_percent[1], "all": ma_fin_percent[2] }, "genotype_count": { "european_american": count_dict(fields[8]), "african_american": count_dict(fields[9]), "all_genotype": count_dict(fields[10]) }, "avg_sample_read": fields[11], "gene": { "symbol": fields[12], "accession": fields[13] }, "function_gvs": fields[14], "hgvs": { "coding": fields[16], "protein": fields[15] }, "coding_dna_size": fields[17], "conservation": { "phast_cons": fields[18], "gerp": fields[19] }, "grantham_score": fields[20], "polyphen2": { "class": polyphen(fields[21])[0], "score": polyphen(fields[21])[1] }, "ref_base_ncbi": fields[22], "chimp_allele": fields[23], "clinical_info": fields[24], "filter_status": fields[25], "on_illumina_human_exome_chip": fields[26], "gwas_pubmed_info": fields[27], "estimated_age_kyrs": { "ea": fields[28], "aa": fields[29] } } } return dict_sweep(value_convert(one_snp_json), vals=["NA", "none", "unknown"])