def get_hgvs_name(record, as_list=False):
    """Collect HGVS ids, ALT alleles and start/end positions for a record.

    The chromosome comes from ``record.CHROM``, the position from
    ``record.INFO['RSPOS']`` and the reference allele from ``record.REF``;
    one entry is produced per element of ``record.ALT``.

    ``as_list`` is accepted but not read anywhere in this function.

    Returns a 3-tuple ``(ids, alts, positions)``:

    * ``ids`` -- HGVS ids from ``get_hgvs_from_vcf``; an ALT for which that
      call raises ``ValueError`` contributes nothing, so this list can be
      shorter than the other two.
    * ``alts`` -- every ALT, stringified unless it is falsy.
    * ``positions`` -- ``OrderedDict(start=..., end=...)`` per ALT, with both
      values ``None`` when ``get_pos_start_end`` raises ``ValueError``.
    """
    chrom = record.CHROM
    rs_pos = record.INFO['RSPOS']
    ref = record.REF
    ids, alts, positions = [], [], []
    for allele in record.ALT:
        # a falsy ALT (e.g. None) must stay as it is, not become a string
        allele = str(allele) if allele else allele
        alts.append(allele)
        # NOTE: current get_pos_start_end doesn't handle ALT=None case
        # TODO: need to remove str(alt) when get_pos_start_end can
        # handle ALT=None case
        try:
            start, end = get_pos_start_end(chrom, rs_pos, ref, allele)
        except ValueError:
            # start & end position could not be inferred from VCF
            start = end = None
        positions.append(OrderedDict(start=start, end=end))
        try:
            ids.append(get_hgvs_from_vcf(chrom, rs_pos, ref, str(allele),
                                         mutant_type=False))
        except ValueError:
            # hgvs id could not be inferred from vcf
            pass
    return ids, alts, positions
def load_data():
    """Fetch CIViC variants over HTTP and yield one document per variant.

    Variant ids ``0 .. MAX_VARIANT_NUMBER - 1`` are requested one by one from
    the CIViC API, sleeping 0.5s after every request.  A response whose keys
    are exactly ``error`` and ``status`` is skipped.

    The ``_id`` of each yielded document depends on which coordinates the
    response carries:

    * chromosome, ref and alt: the result of ``get_hgvs_from_vcf``; the
      variant is skipped (with a printed message) if that raises
      ``ValueError``.
    * chromosome and ref only: a deletion id built from the start position
      and the length of ref.
    * chromosome and alt only: an insertion id.
    * anything else: ``CIVIC_VARIANT:<variant_id>``.

    Non-empty ``doid`` values of the evidence items are prefixed with
    ``DOID:``.  The original response (minus ``id``, plus ``variant_id``) is
    stored under ``civic`` and the document is passed through ``unlist`` and
    ``dict_sweep`` before being yielded.  Per-case counters are printed once
    all ids have been scanned.
    """
    civic_url = 'https://civic.genome.wustl.edu/api/variants/'
    # number of civic ids with ref, alt, chrom
    no_case1 = 0
    # number of civic ids with chrom, ref, but no alt
    no_case2 = 0
    # number of civic ids with chrom, alt, but no ref
    no_case3 = 0
    # number of civic ids with no alt and ref
    no_case4 = 0
    for variant_id in range(MAX_VARIANT_NUMBER):
        if variant_id % 200 == 0:
            print("scanned {} variants".format(variant_id))
        url = civic_url + str(variant_id)
        doc = requests.get(url).json()
        # time delay for 0.5s
        time.sleep(0.5)
        if set(['error', 'status']) == set(doc.keys()):
            continue
        [chrom, pos, ref, alt] = [doc['coordinates'][x] for x in
                                  ['chromosome', 'start', 'reference_bases', 'variant_bases']]
        doc.pop("id")
        new_doc = {}
        doc['variant_id'] = variant_id
        if chrom and ref and alt:
            no_case1 += 1
            try:
                new_doc['_id'] = get_hgvs_from_vcf(chrom, pos, ref, alt)
            except ValueError:
                print("id has ref,alt, but coudn't be converted to hgvs id: {}".format(variant_id))
                continue
        # handle cases of deletions where only ref info is provided
        elif chrom and ref and not alt:
            no_case2 += 1
            start = int(pos)
            end = int(pos) + len(ref) - 1
            if start == end:
                new_doc['_id'] = 'chr{0}:g.{1}del'.format(chrom, start)
            else:
                new_doc['_id'] = 'chr{0}:g.{1}_{2}del'.format(chrom, start, end)
        # handle cases of insertions where only alt info is provided
        elif chrom and alt and not ref:
            no_case3 += 1
            # This branch used to format `start`/`end` without assigning
            # them, so it either raised or reused values left over from an
            # earlier deletion.  An HGVS insertion names the two adjacent
            # flanking positions.
            # NOTE(review): assumes the CIViC 'start' coordinate is the left
            # flanking base of the insertion -- confirm against CIViC data.
            start = int(pos)
            end = start + 1
            new_doc['_id'] = 'chr{0}:g.{1}_{2}ins{3}'.format(chrom, start, end, alt)
        # handle cases where no ref or alt info provided,
        # in this case, use CIVIC internal ID as the primary id for
        # MyVariant.info, e.g. CIVIC_VARIANT:1
        else:
            no_case4 += 1
            new_doc['_id'] = 'CIVIC_VARIANT:' + str(variant_id)
        # change doid into its formal representation, which should be sth
        # like DOID:1
        for _evidence in doc['evidence_items']:
            if 'disease' in _evidence and 'doid' in _evidence['disease'] and _evidence['disease']['doid']:
                _evidence['disease']['doid'] = 'DOID:' + _evidence['disease']['doid']
        new_doc['civic'] = doc
        yield dict_sweep(unlist(new_doc), ['', 'null', 'N/A', None, [], {}])
    print("number of ids with ref, alt, chrom: {}".format(no_case1))
    print("number of ids with chrom, ref but no alt: {}".format(no_case2))
    print("number of ids with chrom, alt but no ref: {}".format(no_case3))
    print("number of ids with no ref and alt: {}".format(no_case4))
def fetch_generator(tabix, contig):
    """Return a generator of merged CADD documents for one contig.

    The keys of the dbm file for ``contig`` are loaded into a set.  Rows
    fetched from ``tabix`` are split on tabs and kept when column 9 contains
    ``"CodingTranscript"`` or when the HGVS id built from columns 0, 1, 2
    and 4 is one of those keys.  Kept rows go through ``_map_line_to_json``,
    falsy results are dropped, consecutive results sharing an ``_id`` are
    grouped, and every group is handed to ``merge_duplicate_rows``.

    Each group is an iterator owned by ``itertools.groupby``, so it has to
    be consumed before the next one is requested.
    """
    # NOTE(review): this path has no leading slash, so it is resolved
    # relative to the working directory -- confirm that is intended.
    dbfile_path = 'home/kevinxin/cadd/' + 'cadd_id' + contig
    db = dbm.open(dbfile_path)
    try:
        ids = db.keys()
        set_ids = set(ids)
    finally:
        # the keys are copied into set_ids above, so the handle can be
        # released here instead of being left open
        db.close()
    print(len(ids))
    fetch = tabix.fetch(contig)
    rows = map(lambda x: x.split('\t'), fetch)
    # looking for annotype as 'codingtranscript', 'noncodingtranscript'
    annos = (row for row in rows
             if "CodingTranscript" in row[9]
             or get_hgvs_from_vcf(row[0], row[1], row[2], row[4]) in set_ids)
    json_rows = map(_map_line_to_json, annos)
    json_rows = (row for row in json_rows if row)
    row_groups = (it for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "cadd") for rg in row_groups)
def _map_line_to_json(item):
    """Yield one ``geno2mp`` document per ALT allele of a VCF record.

    Every document has the HGVS id returned by ``get_hgvs_from_vcf`` as
    ``_id`` and the record's ``HPO_CT`` INFO value under
    ``geno2mp.hpo_count``.  It is passed through ``value_convert``,
    ``unlist`` and ``dict_sweep`` (sweeping ``None``) before being yielded.

    Generation stops at the first ALT whose HGVS id is ``None``; later ALTs
    of the same record are not visited.
    """
    info = item.INFO
    chrom, pos, ref = item.CHROM, item.POS, item.REF
    hpo_count = info['HPO_CT']
    for allele in item.ALT:
        hgvs_id, _var_type = get_hgvs_from_vcf(chrom, pos, ref, str(allele),
                                               mutant_type=True)
        if hgvs_id is None:
            return
        doc = {"_id": hgvs_id, "geno2mp": {"hpo_count": hpo_count}}
        yield dict_sweep(unlist(value_convert(doc)), [None])
def _map_line_to_json(item):
    """Yield one ``geno2mp`` document per ALT allele of a VCF record.

    Every document has the HGVS id returned by ``get_hgvs_from_vcf`` as
    ``_id`` and the record's ``HPO_CT`` INFO value under
    ``geno2mp.hpo_count``.  It is passed through ``value_convert_to_number``,
    ``unlist`` and ``dict_sweep`` (sweeping ``None``) before being yielded.

    Generation stops at the first ALT whose HGVS id is ``None``; later ALTs
    of the same record are not visited.
    """
    info = item.INFO
    chrom, pos, ref = item.CHROM, item.POS, item.REF
    hpo_count = info['HPO_CT']
    for allele in item.ALT:
        hgvs_id, _var_type = get_hgvs_from_vcf(chrom, pos, ref, str(allele),
                                               mutant_type=True)
        if hgvs_id is None:
            return
        doc = {"_id": hgvs_id, "geno2mp": {"hpo_count": hpo_count}}
        yield dict_sweep(unlist(value_convert_to_number(doc)), [None])
def load_data(data_file):
    """Yield one ``fire`` document per data line of ``data_file``.

    Every line is split on tab, space and newline characters.  Lines whose
    first column is ``"Chrom"`` (the header) are skipped.  For the others
    the first four columns are used as chromosome, position, ref and alt to
    build the ``_id`` and are stored as strings under ``fire``; the fifth
    column is stored as a float under ``fire.score``.

    Any exception raised while handling a line is logged together with the
    line, and processing moves on to the next line.
    """
    # The file is only read, so plain "r" is used: "r+" additionally needs
    # write permission and fails on read-only inputs.
    with open(data_file, "r") as f:
        for line in f:
            try:
                y = re.split("[\t \n]", line)
                if y[0] != "Chrom":
                    _id = hgvs.get_hgvs_from_vcf(y[0], y[1], y[2], y[3])
                    yield {
                        "_id": _id,
                        "fire": {
                            "chr": y[0],
                            "pos": y[1],
                            "ref": y[2],
                            "alt": y[3],
                            "score": float(y[4]),
                        },
                    }
            except Exception as e:
                # deliberate best effort: report the bad line and keep going
                logging.error("Pb with %s: %s" % (line, e))
                continue
def parse(cls, record: vcf.model._Record) -> list:
    """
    Build the "profile" of every ALT allele of a VCF record.

    "Profile" is only an implementation-level name for the fields "chrom",
    "pos", "filter", "multi-allelic", "ref", "alt", "alleles", "type" and
    "rsid"; the gnomAD browser itself has no such section.

    The result is a list of ``(<hgvs_id>, <profile_dict>)`` tuples, one per
    ALT and in ALT order.  A dict keyed by hgvs id would also work, but a
    list keeps the ALT order and can be walked by index, which is what
    `PopulationFrequencyParser.parse()` relies on.
    """
    # each ALT prints like a string but is a special type, so convert it
    alt_list = [str(alt) for alt in record.ALT]

    # one (hgvs_id, var_type) tuple per ALT; the "chr" prefix of
    # `record.CHROM`, if any, is assumed to have been removed already
    hgvs_list = [
        get_hgvs_from_vcf(record.CHROM, record.POS, record.REF, alt, mutant_type=True)
        for alt in alt_list
    ]

    # multi-allelic records list the HGVS ids of all their variants
    multi_allelic = None
    if len(hgvs_list) > 1:
        multi_allelic = [pair[0] for pair in hgvs_list]

    profiles = []
    for alt, (hgvs_id, var_type) in zip(alt_list, hgvs_list):
        profiles.append((hgvs_id, {
            "chrom": record.CHROM,
            "pos": record.POS,
            "filter": record.FILTER,
            "multi-allelic": multi_allelic,
            "ref": record.REF,
            "alt": alt,
            "alleles": alt_list,
            "type": var_type,
            "rsid": record.ID
        }))
    return profiles
def _map_line_to_json(item, keys):
    """Yield one ``gnomad_genome`` document per ALT allele of a VCF record.

    Nothing is yielded when ``str(item.CHROM)`` is not in
    ``CHROM_VALID_VALUES``.  ``item.ALT`` is replaced by a list of strings.
    For records with more than one ALT, the HGVS ids of all ALTs are stored
    under ``multi-allelic``.

    ``keys`` lists the INFO keys to copy.  A key present in INFO is stored
    under the lower-cased prefix (``AC``, ``AF``, ``AN``, ``Hom``, ``GC``,
    ``Hemi``) it starts with; for ``AC``, ``AF``, ``Hom`` and ``Hemi`` only
    the value at the index of the current ALT is stored, for the other
    prefixes the INFO value is stored unchanged.

    Generation stops at the first ALT whose HGVS id is ``None``.
    """
    key_start = ["AC", "AF", "AN", "Hom", "GC", "Hemi"]
    # prefixes whose INFO values are indexed by ALT
    per_allele = ['AC', 'AF', 'Hom', 'Hemi']

    chrom = str(item.CHROM)
    if chrom not in CHROM_VALID_VALUES:
        return
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    _filter = item.FILTER
    rsid = item.ID

    def _optional(name):
        # these values can be missing from a record; report None then
        return info[name] if name in info else None

    if 'VQSLOD' in info and info['VQSLOD'] != math.inf:
        vqslod = info['VQSLOD']
    else:
        vqslod = None
    vqsr_culprit = _optional('VQSR_culprit')
    baseqranksum = _optional('BaseQRankSum')
    clippingranksum = _optional('ClippingRankSum')
    mqranksum = _optional('MQRankSum')
    readposranksum = _optional('ReadPosRankSum')
    qd = _optional('QD')
    inbreedingcoeff = _optional('InbreedingCoeff')

    # convert vcf object to string
    item.ALT = [str(alt) for alt in item.ALT]
    alleles = item.ALT

    # if multiallelic, put all variants as a list in multi-allelic field
    hgvs_list = None
    if len(alleles) > 1:
        hgvs_list = [
            get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=False)
            for alt in alleles
        ]

    for i, alt in enumerate(alleles):
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt,
                                             mutant_type=True)
        if HGVS is None:
            return
        assert len(item.ALT) == len(info['AC']), \
            "Expecting length of item.ALT= length of info.AC, but not for %s" % (HGVS)
        assert len(item.ALT) == len(info['AF']), \
            "Expecting length of item.ALT= length of info.AF, but not for %s" % (HGVS)

        section = {
            "chrom": chrom,
            "pos": chromStart,
            "filter": _filter,
            "multi-allelic": hgvs_list,
            "ref": ref,
            "alt": alt,
            "alleles": item.ALT,
            "type": var_type,
            "rsid": rsid,
            "baseqranksum": baseqranksum,
            "clippingranksum": clippingranksum,
            "fs": info['FS'],
            "inbreedingcoeff": inbreedingcoeff,
            "mq": {
                "mq": info['MQ'],
                "mqranksum": mqranksum
            },
            "qd": qd,
            "readposranksum": readposranksum,
            "vqslod": vqslod,
            "vqsr_culprit": vqsr_culprit
        }
        # create a holder for each prefix, e.g. 'ac', 'af', 'gc'
        for prefix in key_start:
            section[prefix.lower()] = {}

        # loop through each available key
        for name in keys:
            if name not in info:
                continue
            for prefix in key_start:
                if not name.startswith(prefix):
                    continue
                # "ac", "af", ... values are related to multi-allelic and
                # need the value belonging to this ALT
                if prefix in per_allele:
                    value = info[name][i]
                else:
                    value = info[name]
                section[prefix.lower()][name.lower()] = value

        one_snp_json = {"_id": HGVS, "gnomad_genome": section}
        yield dict_sweep(
            unlist(value_convert_to_number(one_snp_json, skipped_keys=['chrom'])),
            [None])
def annotate_by_snpeff(self, varobj_list):
    '''Annotate HGVS ids with SnpEff and yield one document per output line.

    Each id in ``varobj_list`` is turned into a VCF line by the parser and
    constructor matching its kind (substitution, deletion, insertion or
    delins).  Ids whose constructor raises ``TypeError`` are printed and
    left out; ids of no known kind are printed followed by
    'beyond current capacity'.

    The VCF text is piped to ``SNPEFF_CMD``; anything on its stderr fails an
    assertion.  Every non-header, non-empty output line is split on ``;``:
    the first part carries the tab-separated VCF columns and the
    comma-separated ``|``-delimited annotations, the optional second and
    third parts carry LOF / NMD information.

    Yields dicts with the HGVS id under ``id`` and the parsed annotations
    under ``snpeff`` (``ann``, ``lof``, ``nmd``, ``vcf``), swept of ``''``
    and ``None`` values by ``dict_sweep``.
    '''
    def _split_pair(value):
        # 'a/b' -> ('a', 'b'); an empty value -> (None, None)
        if value:
            (first, second) = value.split('/')
            return first, second
        return None, None

    def _parse_gene_effect(text):
        # the information to be parsed is like this: 'LOF=(PTEN|PTEN|1|1.00)'
        (gene_id, genename, n_transcripts, pct_affected) = \
            text.split('(')[1].split(')')[0].split('|')
        return {
            "gene_id": gene_id,
            "genename": genename,
            "number_of_transcripts_in_gene": n_transcripts,
            "percent_of_transcripts_affected": pct_affected
        }

    # title of vcf
    vcf_stdin = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'
    # extract each item from list, transform into vcf format
    for item in varobj_list:
        if '>' in item:
            parse_id, build_line = self.snp_hgvs_id_parser, self.snp_vcf_constructer
        elif item.endswith('del'):
            parse_id, build_line = self.del_hgvs_id_parser, self.del_vcf_constructor
        elif 'ins' in item and 'del' not in item:
            parse_id, build_line = self.ins_hgvs_id_parser, self.ins_vcf_constructor
        elif 'delins' in item:
            parse_id, build_line = self.delins_hgvs_id_parser, self.delins_vcf_constructor
        else:
            print(item)
            print('beyond current capacity')
            continue
        hgvs_info = parse_id(item)
        try:
            # kept as `+=` on purpose: a constructor returning a non-string
            # raises TypeError right here and the id is skipped
            vcf_stdin += build_line(hgvs_info)
        except TypeError:
            print(item)
            continue

    proc = subprocess.Popen(SNPEFF_CMD, stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = proc.communicate(vcf_stdin)
    assert stderr == '', stderr
    for vcf_line in stdout.split('\n'):
        if vcf_line.startswith('#') or vcf_line == '':
            continue
        sections = vcf_line.split(';')
        # assume the first item is 'ANN'
        ann_info = sections[0]
        ann = []
        # Multiple annotations per VCF line
        for entry in ann_info.split(','):
            parts = entry.split('|')
            if len(parts) > 1:
                (effect, putative_impact, gene_name, gene_id, feature_type,
                 feature_id) = parts[1:7]
                (transcript_biotype, exon, hgvs_coding, hgvs_protein, cdna,
                 cds, protein, distance_to_feature) = parts[7:15]
                print(effect)
                (cdna_position, cdna_len) = _split_pair(cdna)
                (cds_position, cds_len) = _split_pair(cds)
                (protein_position, protein_len) = _split_pair(protein)
                (rank, total) = _split_pair(exon)
                ann.append({
                    "effect": effect,
                    "putative_impact": putative_impact,
                    "genename": gene_name,
                    "gene_id": gene_id,
                    "feature_type": feature_type,
                    "feature_id": feature_id,
                    "transcript_biotype": transcript_biotype,
                    "rank": rank,
                    "total": total,
                    "hgvs.c": hgvs_coding,
                    "hgvs.p": hgvs_protein,
                    "cdna": {
                        "position": cdna_position,
                        "length": cdna_len
                    },
                    "cds": {
                        "position": cds_position,
                        "length": cds_len
                    },
                    "protein": {
                        "position": protein_position,
                        "length": protein_len
                    },
                    "distance_to_feature": distance_to_feature
                })
        print(ann)
        # not all annotations include lof & nmd information.
        # Set them to 'None' as default
        lof = None
        nmd = None
        if len(sections) == 3:
            # the case that annotation include 'ann' & 'lof' & 'nmd'
            (lof_info, nmd_info) = sections[1:3]
            # assume the second item is 'lof'
            assert lof_info.startswith('LOF')
            lof = _parse_gene_effect(lof_info)
            nmd = _parse_gene_effect(nmd_info)
        elif len(sections) == 2:
            # the case that annotation include 'ann' & 'lof or nmd'
            idk_info = sections[1]
            if idk_info.startswith('LOF'):
                lof = _parse_gene_effect(idk_info)
            else:
                nmd = _parse_gene_effect(idk_info)
        (chrom, pos, _id, ref, alt) = ann_info.split('\t')[0:5]
        hgvs_id = get_hgvs_from_vcf(chrom, pos, ref, alt)
        one_snp_json = {
            "id": hgvs_id,
            "snpeff": {
                "ann": ann,
                "lof": lof,
                "nmd": nmd,
                "vcf": {
                    "position": pos,
                    "ref": ref,
                    "alt": alt
                }
            }
        }
        snpeff_json = dict_sweep(unlist(one_snp_json), vals=['', None])
        yield snpeff_json
def _map_line_to_json(fields):
    """Yield the ``cadd`` document for one row of column values.

    ``fields`` must hold exactly ``VALID_COLUMN_NO`` columns (asserted).
    Columns 0, 1, 2 and 4 (chromosome, position, ref, alt) are passed to
    ``get_hgvs_from_vcf`` to build the ``_id``; nothing is yielded when that
    id is ``None``.  Columns 0-44 and 46-115 are copied into the nested
    ``cadd`` structure below (column 45 is not used).  The document is
    passed through ``value_convert``, ``unlist`` and ``dict_sweep``
    (sweeping ``"NA"``) before being yielded.
    """
    assert len(fields) == VALID_COLUMN_NO
    f = fields
    hgvs_id = get_hgvs_from_vcf(f[0], f[1], f[2], f[4])
    if hgvs_id is None:
        return

    chmm = {
        'tssa': f[39], 'tssaflnk': f[40], 'txflnk': f[41], 'tx': f[42],
        'txwk': f[43], 'enh': f[44],
        # 'enh': f[45],
        'znfrpts': f[46], 'het': f[47], 'tssbiv': f[48], 'bivflnk': f[49],
        'enhbiv': f[50], 'reprpc': f[51], 'reprpcwk': f[52], 'quies': f[53],
    }
    encode = {
        'exp': f[54], 'h3k27ac': f[55], 'h3k4me1': f[56], 'h3k4me3': f[57],
        'nucleo': f[58], 'occ': f[59],
        'p_val': {
            'comb': f[60], 'dnas': f[61], 'faire': f[62], 'polii': f[63],
            'ctcf': f[64], 'mycp': f[65],
        },
        'sig': {
            'dnase': f[66], 'faire': f[67], 'polii': f[68], 'ctcf': f[69],
            'myc': f[70],
        },
    }
    gene = {
        'gene_id': f[92], 'feature_id': f[93], 'ccds_id': f[94],
        'genename': f[95],
        'cds': {
            'cdna_pos': f[96], 'rel_cdna_pos': f[97], 'cds_pos': f[98],
            'rel_cds_pos': f[99],
        },
        'prot': {
            'protpos': f[100], 'rel_prot_pos': f[101], 'domain': f[102],
        },
    }
    cadd = {
        'chrom': f[0], 'pos': f[1], 'ref': f[2], 'anc': f[3], 'alt': f[4],
        'type': f[5], 'length': f[6], 'istv': f[7], 'isderived': f[8],
        'annotype': f[9], 'consequence': f[10], 'consscore': f[11],
        'consdetail': f[12], 'gc': f[13], 'cpg': f[14],
        'mapability': {'20bp': f[15], '35bp': f[16]},
        'scoresegdup': f[17],
        'phast_cons': {'primate': f[18], 'mammalian': f[19], 'vertebrate': f[20]},
        'phylop': {'primate': f[21], 'mammalian': f[22], 'vertebrate': f[23]},
        'gerp': {'n': f[24], 's': f[25], 'rs': f[26], 'rs_pval': f[27]},
        'bstatistic': f[28],
        'mutindex': f[29],
        'dna': {'helt': f[30], 'mgw': f[31], 'prot': f[32], 'roll': f[33]},
        'mirsvr': {'score': f[34], 'e': f[35], 'aln': f[36]},
        'targetscans': f[37],
        'fitcons': f[38],
        'chmm': chmm,
        'encode': encode,
        'segway': f[71],
        'motif': {
            'toverlap': f[72], 'dist': f[73], 'ecount': f[74],
            'ename': f[75], 'ehipos': f[76], 'escorechng': f[77],
        },
        'tf': {'bs': f[78], 'bs_peaks': f[79], 'bs_peaks_max': f[80]},
        'isknownvariant': f[81],
        'esp': {'af': f[82], 'afr': f[83], 'eur': f[84]},
        '1000g': {
            'af': f[85], 'asn': f[86], 'amr': f[87], 'afr': f[88],
            'eur': f[89],
        },
        'min_dist_tss': f[90],
        'min_dist_tse': f[91],
        'gene': gene,
        'dst2splice': f[103],
        'dst2spltype': f[104],
        'exon': f[105],
        'intron': f[106],
        'oaa': f[107],  # ref aa
        'naa': f[108],  # alt aa
        'grantham': f[109],
        'polyphen': {'cat': f[110], 'val': f[111]},
        'sift': {'cat': f[112], 'val': f[113]},
        'rawscore': f[114],  # raw CADD score
        'phred': f[115],  # log-percentile of raw CADD score
    }
    # load as json data
    one_snp_json = {"_id": hgvs_id, "cadd": cadd}
    yield dict_sweep(unlist(value_convert(one_snp_json)), ["NA"])
def _map_line_to_json(item):
    """Yield one ``exac`` document per ALT allele of a VCF record.

    ``BaseQRankSum``, ``ClippingRankSum``, ``MQRankSum``,
    ``ReadPosRankSum``, ``QD`` and ``InbreedingCoeff`` are optional INFO
    keys and default to ``None``; every other INFO key read below is
    required and raises ``KeyError`` when absent.  INFO values are stored as
    found (they are not indexed by ALT here).

    The elements of ``item.ALT`` are converted to strings in place.
    Generation stops at the first ALT whose HGVS id is ``None``.  Documents
    pass through ``value_convert``, ``unlist`` and ``dict_sweep`` (sweeping
    ``None``) before being yielded.
    """
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO

    def _optional(key):
        # Only a missing key means "no value".  The former bare `except:`
        # also swallowed unrelated errors (even KeyboardInterrupt).
        try:
            return info[key]
        except KeyError:
            return None

    baseqranksum = _optional('BaseQRankSum')
    clippingranksum = _optional('ClippingRankSum')
    mqranksum = _optional('MQRankSum')
    readposranksum = _optional('ReadPosRankSum')
    qd = _optional('QD')
    inbreedingcoeff = _optional('InbreedingCoeff')

    # stringify the ALT objects in place (the same list is stored below)
    for i, allele in enumerate(item.ALT):
        item.ALT[i] = str(allele)

    # document keys are the lower-cased INFO keys
    populations = ('AFR', 'AMR', 'EAS', 'FIN', 'NFE', 'OTH', 'SAS')
    ac_keys = ('AC', 'AC_AFR', 'AC_AMR', 'AC_Adj', 'AC_EAS', 'AC_FIN',
               'AC_Het', 'AC_Hom', 'AC_NFE', 'AC_OTH', 'AC_SAS')
    an_keys = ('AN', 'AN_AFR', 'AN_AMR', 'AN_Adj', 'AN_EAS', 'AN_FIN',
               'AN_NFE', 'AN_OTH', 'AN_SAS')

    for alt in item.ALT:
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt,
                                             mutant_type=True)
        if HGVS is None:
            return
        one_snp_json = {
            "_id": HGVS,
            "exac": {
                "chrom": chrom,
                "pos": chromStart,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {key.lower(): info[key] for key in ac_keys},
                "af": info['AF'],
                "an": {key.lower(): info[key] for key in an_keys},
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {"het_" + pop.lower(): info["Het_" + pop] for pop in populations},
                "hom": {"hom_" + pop.lower(): info["Hom_" + pop] for pop in populations},
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        yield dict_sweep(unlist(value_convert(one_snp_json)), [None])
def _map_line_to_json(doc_key, item):
    """Yield one document per ALT allele of a VCF record, keyed by ``doc_key``.

    ``BaseQRankSum``, ``ClippingRankSum``, ``MQRankSum``,
    ``ReadPosRankSum``, ``QD`` and ``InbreedingCoeff`` are optional INFO
    keys and default to ``None``; every other INFO key read below is
    required and raises ``KeyError`` when absent.  INFO values are stored as
    found (they are not indexed by ALT here).

    The elements of ``item.ALT`` are converted to strings in place.
    Generation stops at the first ALT whose HGVS id is ``None``.  Documents
    pass through ``value_convert_to_number``, ``unlist`` and ``dict_sweep``
    (sweeping ``None``) before being yielded.
    """
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO

    def _optional(key):
        # Only a missing key means "no value".  The former bare `except:`
        # also swallowed unrelated errors (even KeyboardInterrupt).
        try:
            return info[key]
        except KeyError:
            return None

    baseqranksum = _optional('BaseQRankSum')
    clippingranksum = _optional('ClippingRankSum')
    mqranksum = _optional('MQRankSum')
    readposranksum = _optional('ReadPosRankSum')
    qd = _optional('QD')
    inbreedingcoeff = _optional('InbreedingCoeff')

    # stringify the ALT objects in place (the same list is stored below)
    for i, allele in enumerate(item.ALT):
        item.ALT[i] = str(allele)

    # document keys are the lower-cased INFO keys
    populations = ('AFR', 'AMR', 'EAS', 'FIN', 'NFE', 'OTH', 'SAS')
    ac_keys = ('AC', 'AC_AFR', 'AC_AMR', 'AC_Adj', 'AC_EAS', 'AC_FIN',
               'AC_Het', 'AC_Hom', 'AC_NFE', 'AC_OTH', 'AC_SAS',
               'AC_FEMALE', 'AC_MALE')
    an_keys = ('AN', 'AN_AFR', 'AN_AMR', 'AN_Adj', 'AN_EAS', 'AN_FIN',
               'AN_NFE', 'AN_OTH', 'AN_SAS', 'AN_FEMALE', 'AN_MALE')

    for alt in item.ALT:
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt,
                                             mutant_type=True)
        if HGVS is None:
            return
        one_snp_json = {
            "_id": HGVS,
            doc_key: {
                "chrom": chrom,
                "pos": chromStart,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {key.lower(): info[key] for key in ac_keys},
                "af": info['AF'],
                "an": {key.lower(): info[key] for key in an_keys},
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {"het_" + pop.lower(): info["Het_" + pop] for pop in populations},
                "hom": {"hom_" + pop.lower(): info["Hom_" + pop] for pop in populations},
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        yield dict_sweep(unlist(value_convert_to_number(one_snp_json)), [None])
def _map_line_to_json(df):
    """Build the ``biomuta`` document for one row and return it.

    The ``_id`` comes from ``get_hgvs_from_vcf`` applied to ``chr_id``
    (with ``'M'`` rewritten to ``'MT'``), ``chr_pos`` as an int, ``ref_nt``
    and ``alt_nt``.  Most annotation columns go through ``clean_data`` with
    ``("-",)`` as second argument; ``uberon_id`` and ``pmid_list`` go
    through ``to_list``.  Dicts produced by ``site_ann_parser`` and
    ``site_prd_parser`` are merged into the ``biomuta`` section.

    When both ``do_id`` and the id split from ``do_name`` are truthy they
    are asserted to be equal.  After ``value_convert_to_number`` the
    ``chrom`` and ``do_id.do_id`` values are forced back to ``str``.
    """
    placeholder = ("-",)

    # specific variable treatment
    chrom = df["chr_id"]
    pos = df["chr_pos"]
    if chrom == 'M':
        chrom = 'MT'
    ref = df["ref_nt"]
    alt = df["alt_nt"]
    hgvs_id = get_hgvs_from_vcf(chrom, int(pos), ref, alt, mutant_type=False)

    # columns that are cleaned and copied under their own name, in order
    plain_columns = ("transcript_id", "peptide_id", "uniprot_ac", "refseq_ac",
                     "cds_pos", "pep_pos", "uniprot_pos", "ref_aa", "alt_aa",
                     "mut_freq", "data_src")
    cleaned = {}
    for column in plain_columns:
        cleaned[column] = clean_data(df[column], placeholder)

    do_id = clean_data(df["do_id"], placeholder)
    do_name_id, do_name = do_name_split(df["do_name"])
    if do_id and do_name_id:
        assert do_id == do_name_id, "do_id mismatch!"
    uberon_id = to_list(df["uberon_id"])
    gene_name = clean_data(df["gene_name"], placeholder)
    pmid_list = to_list(df["pmid_list"])
    site_prd = site_prd_parser(clean_data(df["site_prd"], placeholder))
    site_ann = site_ann_parser(df["site_ann"])

    # load as json data
    biomuta = {'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt}
    biomuta.update(cleaned)
    biomuta['do_id'] = {
        "do_id": do_id,
        "do_name": do_name
    }
    biomuta['uberon_id'] = uberon_id
    biomuta['gene_name'] = gene_name
    biomuta['pmid'] = pmid_list

    if site_ann:
        for dic in site_ann:
            biomuta.update(dic)
    if site_prd:
        biomuta.update(site_prd)

    one_snp_json = value_convert_to_number({"_id": hgvs_id, "biomuta": biomuta})
    converted = one_snp_json['biomuta']
    converted['chrom'] = str(converted['chrom'])
    converted['do_id']['do_id'] = str(converted['do_id']['do_id'])
    return one_snp_json
def load_data(data_folder):
    """Yield one document per ``variant_*.json`` file in ``data_folder``.

    A file whose top-level keys are exactly ``error`` and ``status`` is
    skipped.  The ``_id`` of each yielded document depends on which
    coordinates the file carries:

    * chromosome, ref and alt: the result of ``get_hgvs_from_vcf``; the
      variant is skipped (with a warning) if that raises ``ValueError``.
    * chromosome and ref only: a deletion id built from the start position
      and the length of ref.
    * chromosome and alt only: an insertion id.
    * anything else: ``CIVIC_VARIANT:<variant_id>``.

    For every evidence item a non-empty ``doid`` is prefixed with ``DOID:``
    and a source with a ``citation_id`` gets that id stored (via ``to_int``)
    under ``pubmed`` or ``asco``, with ``source_type`` and ``citation_id``
    removed.

    Raises:
        ValueError: if such a source has a ``source_type`` other than
            ``PubMed`` or ``ASCO``.
    """
    # number of civic ids with ref, alt, chrom
    no_case1 = 0
    # number of civic ids with chrom, ref, but no alt
    no_case2 = 0
    # number of civic ids with chrom, alt, but no ref
    no_case3 = 0
    # number of civic ids with no alt and ref
    no_case4 = 0
    for infile in glob.glob(os.path.join(data_folder, "variant_*.json")):
        # close the file right after parsing instead of leaking the handle
        with open(infile) as fh:
            doc = json.load(fh)
        if set(['error', 'status']) == set(doc.keys()):
            continue
        [chrom, pos, ref, alt] = [doc['coordinates'][x] for x in
                                  ['chromosome', 'start', 'reference_bases', 'variant_bases']]
        variant_id = doc.pop("id")
        new_doc = {}
        doc['variant_id'] = variant_id
        if chrom and ref and alt:
            no_case1 += 1
            try:
                new_doc['_id'] = get_hgvs_from_vcf(chrom, pos, ref, alt)
            except ValueError:
                logging.warning("id has ref,alt, but coudn't be converted to hgvs id: {}".format(variant_id))
                continue
        # handle cases of deletions where only ref info is provided
        elif chrom and ref and not alt:
            no_case2 += 1
            start = int(pos)
            end = int(pos) + len(ref) - 1
            if start == end:
                new_doc['_id'] = 'chr{0}:g.{1}del'.format(chrom, start)
            else:
                new_doc['_id'] = 'chr{0}:g.{1}_{2}del'.format(chrom, start, end)
        # handle cases of insertions where only alt info is provided
        elif chrom and alt and not ref:
            no_case3 += 1
            # This branch used to format `start`/`end` without assigning
            # them, so it either raised or reused values left over from an
            # earlier deletion.  An HGVS insertion names the two adjacent
            # flanking positions.
            # NOTE(review): assumes the CIViC 'start' coordinate is the left
            # flanking base of the insertion -- confirm against CIViC data.
            start = int(pos)
            end = start + 1
            new_doc['_id'] = 'chr{0}:g.{1}_{2}ins{3}'.format(chrom, start, end, alt)
        # handle cases where no ref or alt info provided,
        # in this case, use CIVIC internal ID as the primary id for
        # MyVariant.info, e.g. CIVIC_VARIANT:1
        else:
            no_case4 += 1
            new_doc['_id'] = 'CIVIC_VARIANT:' + str(variant_id)
        for _evidence in doc['evidence_items']:
            # change doid into its formal representation, which should be
            # sth like DOID:1
            if 'disease' in _evidence and 'doid' in _evidence['disease'] and _evidence['disease']['doid']:
                _evidence['disease']['doid'] = 'DOID:' + _evidence['disease']['doid']
            if 'source' in _evidence and 'citation_id' in _evidence['source']:
                source = _evidence['source']
                if source['source_type'] == "PubMed":
                    source['pubmed'] = to_int(source['citation_id'])
                    source.pop('source_type')
                    source.pop('citation_id')
                elif source['source_type'] == "ASCO":
                    source['asco'] = to_int(source['citation_id'])
                    source.pop('source_type')
                    source.pop('citation_id')
                else:
                    raise ValueError("The value of source_type is not one of PubMed or ASCO, it's {}, need to restructure parser".format(source['source_type']))
        new_doc['civic'] = doc
        yield dict_sweep(unlist(new_doc), ['', 'null', 'N/A', None, [], {}])
    logging.info("number of ids with ref, alt, chrom: {}".format(no_case1))
    logging.info("number of ids with chrom, ref but no alt: {}".format(no_case2))
    logging.info("number of ids with chrom, alt but no ref: {}".format(no_case3))
    logging.info("number of ids with no ref and alt: {}".format(no_case4))
def _map_line_to_json(doc_key, item):
    """Yield one document per ALT allele of a VCF record, keyed by ``doc_key``.

    ``BaseQRankSum``, ``ClippingRankSum``, ``MQRankSum``,
    ``ReadPosRankSum``, ``QD`` and ``InbreedingCoeff`` are optional INFO
    keys and default to ``None``; every other INFO key read below is
    required and raises ``KeyError`` when absent.

    ``item.ALT`` is replaced by a list of strings.  For records with more
    than one ALT, the HGVS ids of all ALTs are stored under
    ``multi-allelic``.  ``AC*`` and ``AF`` values are taken at the index of
    the current ALT; ``AN*``, ``Het_*`` and ``Hom_*`` values are stored as
    found.  The lengths of ``AC``, ``AF`` and ``Hom_AFR`` are asserted to
    equal the number of ALTs.

    Generation stops at the first ALT whose HGVS id is ``None``.  Documents
    pass through ``value_convert_to_number``, ``unlist`` and ``dict_sweep``
    (sweeping ``None``) before being yielded.
    """
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO

    def _optional(key):
        # Only a missing key means "no value".  The former bare `except:`
        # also swallowed unrelated errors (even KeyboardInterrupt).
        try:
            return info[key]
        except KeyError:
            return None

    baseqranksum = _optional('BaseQRankSum')
    clippingranksum = _optional('ClippingRankSum')
    mqranksum = _optional('MQRankSum')
    readposranksum = _optional('ReadPosRankSum')
    qd = _optional('QD')
    inbreedingcoeff = _optional('InbreedingCoeff')

    # convert vcf object to string
    item.ALT = [str(alt) for alt in item.ALT]
    # if multiallelic, put all variants as a list in multi-allelic field
    hgvs_list = None
    if len(item.ALT) > 1:
        hgvs_list = [get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=False)
                     for alt in item.ALT]

    # document keys are the lower-cased INFO keys
    populations = ('AFR', 'AMR', 'EAS', 'FIN', 'NFE', 'OTH', 'SAS')
    ac_keys = ('AC', 'AC_AFR', 'AC_AMR', 'AC_Adj', 'AC_EAS', 'AC_FIN',
               'AC_Het', 'AC_Hom', 'AC_NFE', 'AC_OTH', 'AC_SAS',
               'AC_MALE', 'AC_FEMALE')
    an_keys = ('AN', 'AN_AFR', 'AN_AMR', 'AN_Adj', 'AN_EAS', 'AN_FIN',
               'AN_NFE', 'AN_OTH', 'AN_SAS', 'AN_FEMALE', 'AN_MALE')

    for i, alt in enumerate(item.ALT):
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt,
                                             mutant_type=True)
        if HGVS is None:
            return
        assert len(item.ALT) == len(info['AC']), "Expecting length of item.ALT= length of info.AC, but not for %s" % (HGVS)
        assert len(item.ALT) == len(info['AF']), "Expecting length of item.ALT= length of info.AF, but not for %s" % (HGVS)
        assert len(item.ALT) == len(info['Hom_AFR']), "Expecting length of item.ALT= length of HOM_AFR, but not for %s" % (HGVS)
        one_snp_json = {
            "_id": HGVS,
            doc_key: {
                "chrom": chrom,
                "pos": chromStart,
                "multi-allelic": hgvs_list,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                # allele counts/frequency: the value belonging to this ALT
                "ac": {key.lower(): info[key][i] for key in ac_keys},
                "af": info['AF'][i],
                "an": {key.lower(): info[key] for key in an_keys},
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {"het_" + pop.lower(): info["Het_" + pop] for pop in populations},
                "hom": {"hom_" + pop.lower(): info["Hom_" + pop] for pop in populations},
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        yield dict_sweep(unlist(value_convert_to_number(one_snp_json)), [None])
def _map_line_to_json(df):
    """Build the ``ccle`` document for one row and return it.

    The ``_id`` comes from ``get_hgvs_from_vcf``.  When the alternate or the
    reference allele is ``'-'`` the call is made one position before
    ``start_position`` with an ``'N'`` placed in front of the alleles;
    otherwise ``start_position``, ref and alt are used as they are.
    Chromosome ``'M'`` is rewritten to ``'MT'``.

    The three hotspot/deleterious flags go through ``to_boolean``; the
    section is passed through ``dict_sweep`` and the document through
    ``value_convert_to_number``, after which ``chrom`` is forced back to
    ``str``.
    """
    def _flag(column):
        # 'TRUE' / 'FALSE' strings -> boolean via to_boolean
        return to_boolean(df[column], true_str=['TRUE', ], false_str=['FALSE', ])

    chrom = df['chromosome']
    if chrom == 'M':
        chrom = 'MT'
    ref = df["reference_allele"]
    alt = df["tumor_seq_allele1"]

    if alt == '-':
        vcf_pos, vcf_ref, vcf_alt = int(df['start_position']) - 1, 'N' + ref, 'N'
    elif ref == '-':
        vcf_pos, vcf_ref, vcf_alt = int(df['start_position']) - 1, 'N', 'N' + alt
    else:
        vcf_pos, vcf_ref, vcf_alt = int(df['start_position']), ref, alt
    hgvs_id = get_hgvs_from_vcf(chrom, vcf_pos, vcf_ref, vcf_alt, mutant_type=False)

    section = {
        'gene': {
            'id': df['entrez_gene_id'],
            'symbol': df['hugo_symbol']
        },
        'chrom': chrom,
        'hg19': {
            'start': df['start_position'],
            'end': df['end_position']
        },
        'strand': df['strand'],
        'class': df['variant_classification'],
        'vartype': df['variant_type'],
        'ref': df['reference_allele'],
        'tumor_seq_allele1': df['tumor_seq_allele1'],
        'dbsnp': {
            'rsid': df['dbsnp_rs'],
            'val_status': df['dbsnp_val_status']
        },
    }
    # columns copied under their own name, in document order
    for column in ('genome_change', 'annotation_transcript',
                   'tumor_sample_barcode', 'cdna_change', 'codon_change',
                   'protein_change'):
        section[column] = df[column]
    section['isdeleterious'] = _flag('isdeleterious')
    section['istcgahotspot'] = _flag('istcgahotspot')
    section['tcgahscnt'] = df['tcgahscnt']
    section['iscosmichotspot'] = _flag('iscosmichotspot')
    section['cosmichscnt'] = df['cosmichscnt']
    section['exac_af'] = df['exac_af']
    section['wes_ac'] = df['wes_ac']
    section['sanger'] = {
        'wes_ac': df['sangerwes_ac'],
        'recalibwes_ac': df['sangerrecalibwes_ac']
    }
    for column in ('rnaseq_ac', 'hc_ac', 'rd_ac', 'wgs_ac', 'broad_id'):
        section[column] = df[column]

    section = dict_sweep(section)

    # load as json data
    one_snp_json = value_convert_to_number({"_id": hgvs_id, "ccle": section})
    one_snp_json['ccle']['chrom'] = str(one_snp_json['ccle']['chrom'])
    return one_snp_json
def _map_line_to_json(fields, version): chrInfo = fields[0].split(":") # grch37 chrom = chrInfo[0] chromStart = int(chrInfo[1]) ma_fin_percent = fields[7].split("/") if fields[3]: mutation = fields[3].split(">") ref = mutation[0] alt = mutation[1] hg19 = get_pos_start_end(chrom, chromStart, ref, alt) hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref, alt) if version == 'hg19': HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt) elif version == 'hg38': HGVS = get_hgvs_from_vcf(chrom, hg38[0], ref, alt) # load as json data if HGVS is None: return one_snp_json = { "_id": HGVS, "evs": { "chrom": chrom, "hg19": { "start": hg19[0], "end": hg19[1] }, "hg38": { "start": hg38[0], "end": hg38[1] }, "rsid": fields[1], "dbsnp_version": get_dbsnp(fields[2]), "ref": ref, "alt": alt, "allele_count": { "european_american": count_dict(fields[4]), "african_american": count_dict(fields[5]), "all": count_dict(fields[6]) }, "ma_fin_percent": { "european_american": ma_fin_percent[0], "african_american": ma_fin_percent[1], "all": ma_fin_percent[2] }, "genotype_count": { "european_american": count_dict(fields[8]), "african_american": count_dict(fields[9]), "all_genotype": count_dict(fields[10]) }, "avg_sample_read": fields[11], "gene": { "symbol": fields[12], "accession": fields[13] }, "function_gvs": fields[14], "hgvs": { "coding": fields[16], "protein": fields[15] }, "coding_dna_size": fields[17], "conservation": { "phast_cons": fields[18], "gerp": fields[19] }, "grantham_score": fields[20], "polyphen2": { "class": polyphen(fields[21])[0], "score": polyphen(fields[21])[1] }, "ref_base_ncbi": fields[22], "chimp_allele": fields[23], "clinical_info": fields[24], "filter_status": fields[25], "on_illumina_human_exome_chip": fields[26], "gwas_pubmed_info": fields[27], "estimated_age_kyrs": { "ea": fields[28], "aa": fields[29] } } } return dict_sweep(value_convert(one_snp_json), vals=["NA", "none", "unknown"])
def _map_line_to_json(doc_key, item): chrom = item.CHROM chromStart = item.POS ref = item.REF info = item.INFO _filter = item.FILTER try: baseqranksum = info['BaseQRankSum'] except: baseqranksum = None try: clippingranksum = info['ClippingRankSum'] except: clippingranksum = None try: mqranksum = info['MQRankSum'] except: mqranksum = None try: readposranksum = info['ReadPosRankSum'] except: readposranksum = None try: qd = info['QD'] except: qd = None try: inbreedingcoeff = info['InbreedingCoeff'] except: inbreedingcoeff = None # convert vcf object to string item.ALT = [str(alt) for alt in item.ALT] # if multiallelic, put all variants as a list in multi-allelic field hgvs_list = None if len(item.ALT) > 1: hgvs_list = [ get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=False) for alt in item.ALT ] for i, alt in enumerate(item.ALT): (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=True) if HGVS is None: return assert len(item.ALT) == len( info['AC'] ), "Expecting length of item.ALT= length of info.AC, but not for %s" % ( HGVS) assert len(item.ALT) == len( info['AF'] ), "Expecting length of item.ALT= length of info.AF, but not for %s" % ( HGVS) assert len(item.ALT) == len( info['Hom_AFR'] ), "Expecting length of item.ALT= length of HOM_AFR, but not for %s" % ( HGVS) one_snp_json = { "_id": HGVS, doc_key: { "chrom": chrom, "pos": chromStart, "filter": _filter, "multi-allelic": hgvs_list, "ref": ref, "alt": alt, "alleles": item.ALT, "type": var_type, "ac": { "ac": info['AC'][i], "ac_afr": info['AC_AFR'][i], "ac_amr": info['AC_AMR'][i], "ac_adj": info['AC_Adj'][i], "ac_eas": info['AC_EAS'][i], "ac_fin": info['AC_FIN'][i], "ac_het": info['AC_Het'][i], "ac_hom": info['AC_Hom'][i], "ac_nfe": info['AC_NFE'][i], "ac_oth": info['AC_OTH'][i], "ac_sas": info['AC_SAS'][i], "ac_male": info['AC_MALE'][i], "ac_female": info['AC_FEMALE'][i] }, "af": info['AF'][i], "an": { "an": info['AN'], "an_afr": info['AN_AFR'], "an_amr": info['AN_AMR'], 
"an_adj": info['AN_Adj'], "an_eas": info['AN_EAS'], "an_fin": info['AN_FIN'], "an_nfe": info['AN_NFE'], "an_oth": info['AN_OTH'], "an_sas": info['AN_SAS'], "an_female": info['AN_FEMALE'], "an_male": info['AN_MALE'] }, "baseqranksum": baseqranksum, "clippingranksum": clippingranksum, "fs": info['FS'], "het": { "het_afr": info['Het_AFR'], "het_amr": info['Het_AMR'], "het_eas": info['Het_EAS'], "het_fin": info['Het_FIN'], "het_nfe": info['Het_NFE'], "het_oth": info['Het_OTH'], "het_sas": info['Het_SAS'] }, "hom": { "hom_afr": info['Hom_AFR'], "hom_amr": info['Hom_AMR'], "hom_eas": info['Hom_EAS'], "hom_fin": info['Hom_FIN'], "hom_nfe": info['Hom_NFE'], "hom_oth": info['Hom_OTH'], "hom_sas": info['Hom_SAS'] }, "inbreedingcoeff": inbreedingcoeff, "mq": { "mq": info['MQ'], "mq0": info['MQ0'], "mqranksum": mqranksum }, "ncc": info['NCC'], "qd": qd, "readposranksum": readposranksum, "vqslod": info['VQSLOD'], "culprit": info['culprit'] } } obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json)), [None])) yield obj
def _map_line_to_json(fields): chrInfo = fields[0].split(":") # grch37 chrom = chrInfo[0] chromStart = int(chrInfo[1]) ma_fin_percent = fields[7].split("/") if fields[3]: mutation = fields[3].split(">") ref = mutation[0] alt = mutation[1] HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt) hg19 = get_pos_start_end(chrom, chromStart, ref, alt) hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref, alt) # load as json data if HGVS is None: return one_snp_json = { "_id": HGVS, "evs": { "chrom": chrom, "hg19": { "start": hg19[0], "end": hg19[1] }, "hg38": { "start": hg38[0], "end": hg38[1] }, "rsid": fields[1], "dbsnp_version": get_dbsnp(fields[2]), "ref": ref, "alt": alt, "allele_count": { "european_american": count_dict(fields[4]), "african_american": count_dict(fields[5]), "all": count_dict(fields[6]) }, "ma_fin_percent": { "european_american": ma_fin_percent[0], "african_american": ma_fin_percent[1], "all": ma_fin_percent[2] }, "genotype_count": { "european_american": count_dict(fields[8]), "african_american": count_dict(fields[9]), "all_genotype": count_dict(fields[10]) }, "avg_sample_read": fields[11], "gene": { "symbol": fields[12], "accession": fields[13] }, "function_gvs": fields[14], "hgvs": { "coding": fields[16], "protein": fields[15] }, "coding_dna_size": fields[17], "conservation": { "phast_cons": fields[18], "gerp": fields[19] }, "grantham_score": fields[20], "polyphen2": { "class": polyphen(fields[21])[0], "score": polyphen(fields[21])[1] }, "ref_base_ncbi": fields[22], "chimp_allele": fields[23], "clinical_info": fields[24], "filter_status": fields[25], "on_illumina_human_exome_chip": fields[26], "gwas_pubmed_info": fields[27], "estimated_age_kyrs": { "ea": fields[28], "aa": fields[29] } } } return dict_sweep(value_convert(one_snp_json), vals=["NA", "none", "unknown"])