Exemple #1
0
def get_hgvs_name(record, as_list=False):
    """Build HGVS ids (used as the _id field) plus ALT and position lists.

    Returns a 3-tuple (ids, alts, positions). ``alts`` and ``positions``
    are aligned per ALT allele of *record*; ALT alleles whose HGVS id
    raises ValueError contribute no entry to ``ids``.
    Note: ``as_list`` is currently unused.
    """
    chrom = record.CHROM
    rs_pos = record.INFO['RSPOS']
    ref = record.REF

    ids = []
    alts = []
    positions = []
    for allele in record.ALT:
        # keep a missing ALT as None rather than the string "None"
        allele = str(allele) if allele else allele
        alts.append(allele)
        try:
            # NOTE: get_pos_start_end doesn't yet handle ALT=None; the str()
            # call in the HGVS branch below can be dropped once it does
            start, end = get_pos_start_end(chrom, rs_pos, ref, allele)
            positions.append(OrderedDict(start=start, end=end))
        except ValueError:
            # start & end could not be inferred from the VCF fields
            positions.append(OrderedDict(start=None, end=None))
        try:
            ids.append(get_hgvs_from_vcf(chrom, rs_pos, ref, str(allele),
                                         mutant_type=False))
        except ValueError:
            # skip alleles with no derivable HGVS id
            pass
    return ids, alts, positions
def load_data():
    """Fetch CIViC variants over HTTP and yield cleaned JSON documents.

    Scans variant ids 0..MAX_VARIANT_NUMBER-1 against the CIViC REST API.
    The document _id is an HGVS id when chrom/ref/alt allow, a del or ins
    HGVS string when only ref or only alt is present, or a
    ``CIVIC_VARIANT:<id>`` fallback otherwise. Case counts are printed at
    the end of the scan.
    """
    # number of civic ids with ref, alt, chrom
    no_case1 = 0
    # number of civic ids with chrom, ref, but no alt
    no_case2 = 0
    # number of civic ids with chrom, alt, but no ref
    no_case3 = 0
    # number of civic ids with no alt and ref
    no_case4 = 0
    for variant_id in range(MAX_VARIANT_NUMBER):
        if variant_id % 200 == 0:
            print("scanned {} variants".format(variant_id))
        civic_url = 'https://civic.genome.wustl.edu/api/variants/'
        url = civic_url + str(variant_id)
        doc = requests.get(url).json()
        # be polite to the API: 0.5s delay between requests
        time.sleep(0.5)
        if set(['error', 'status']) != set(doc.keys()):
            [chrom, pos, ref, alt] = [doc['coordinates'][x] for x in ['chromosome', 'start', 'reference_bases', 'variant_bases']]
            doc.pop("id")
            new_doc = {}
            doc['variant_id'] = variant_id
            if chrom and ref and alt:
                no_case1 += 1
                try:
                    new_doc['_id'] = get_hgvs_from_vcf(chrom, pos, ref, alt)
                except ValueError:
                    print("id has ref,alt, but couldn't be converted to hgvs id: {}".format(variant_id))
                    continue
            # handle cases of deletions where only ref info is provided
            elif chrom and ref and not alt:
                no_case2 += 1
                start = int(pos)
                end = int(pos) + len(ref) - 1
                if start == end:
                    new_doc['_id'] = 'chr{0}:g.{1}del'.format(chrom, start)
                else:
                    new_doc['_id'] = 'chr{0}:g.{1}_{2}del'.format(chrom, start, end)
            # handle cases of insertions where only alt info is provided
            elif chrom and alt and not ref:
                no_case3 += 1
                # BUG FIX: start/end were previously undefined here (or stale
                # from an earlier iteration); per HGVS nomenclature an
                # insertion is placed between pos and pos + 1
                start = int(pos)
                end = int(pos) + 1
                new_doc['_id'] = 'chr{0}:g.{1}_{2}ins{3}'.format(chrom, start, end, alt)
            # handle cases where no ref or alt info provided,
            # in this case, use CIVIC internal ID as the primary id for MyVariant.info, e.g. CIVIC_VARIANT:1
            else:
                no_case4 += 1
                new_doc['_id'] = 'CIVIC_VARIANT:' + str(variant_id)
            # change doid into its formal representation, which should be sth like DOID:1
            for _evidence in doc['evidence_items']:
                if 'disease' in _evidence and 'doid' in _evidence['disease'] and _evidence['disease']['doid']:
                    _evidence['disease']['doid'] = 'DOID:' + _evidence['disease']['doid']
            new_doc['civic'] = doc
            yield dict_sweep(unlist(new_doc), ['', 'null', 'N/A', None, [], {}])
    print("number of ids with ref, alt, chrom: {}".format(no_case1))
    print("number of ids with chrom, ref but no alt: {}".format(no_case2))
    print("number of ids with chrom, alt but no ref: {}".format(no_case3))
    print("number of ids with no ref and alt: {}".format(no_case4))
Exemple #3
0
def fetch_generator(tabix, contig):
    """Yield merged CADD JSON documents for a single contig.

    Keeps tabix rows whose annotype column mentions a coding transcript or
    whose HGVS id already exists in the local dbm index for this contig,
    then groups consecutive docs by _id and merges duplicates.
    """
    db = dbm.open('home/kevinxin/cadd/' + 'cadd_id' + contig)
    indexed_ids = db.keys()
    indexed_set = set(indexed_ids)
    print(len(indexed_ids))
    split_rows = (line.split('\t') for line in tabix.fetch(contig))
    # looking for annotype as 'codingtranscript', 'noncodingtranscript'
    candidates = (
        fields for fields in split_rows
        if "CodingTranscript" in fields[9]
        or get_hgvs_from_vcf(fields[0], fields[1], fields[2], fields[4]) in indexed_set
    )
    docs = (doc for doc in map(_map_line_to_json, candidates) if doc)
    grouped = groupby(docs, lambda doc: doc["_id"])
    return (merge_duplicate_rows(group, "cadd") for _, group in grouped)
Exemple #4
0
def fetch_generator(tabix, contig):
    """Yield merged CADD JSON documents for a single contig.

    Rows are kept when annotated as a coding transcript (column 10) or when
    their HGVS id already exists in the local dbm index for this contig.
    """
    # NOTE(review): path has no leading '/' — presumably resolved relative to
    # the working directory; confirm it should not be '/home/...'
    dbfile_path = 'home/kevinxin/cadd/' + 'cadd_id' + contig
    db = dbm.open(dbfile_path)
    ids = db.keys()
    # set for O(1) membership tests in the filter below
    set_ids = set(ids)
    print(len(ids))
    fetch = tabix.fetch(contig)
    # lazily split each raw tabix line into its tab-separated fields
    rows = map(lambda x: x.split('\t'), fetch)
    #   looking for annotype as 'codingtranscript', 'noncodingtranscript'
    annos = (row for row in rows if "CodingTranscript" in row[9]
             or get_hgvs_from_vcf(row[0], row[1], row[2], row[4]) in set_ids)
    json_rows = map(_map_line_to_json, annos)
    # drop rows the mapper rejected (falsy results)
    json_rows = (row for row in json_rows if row)
    # group consecutive docs sharing an _id; assumes input is sorted by _id
    row_groups = (it
                  for (key, it) in groupby(json_rows, lambda row: row["_id"]))
    return (merge_duplicate_rows(rg, "cadd") for rg in row_groups)
def _map_line_to_json(item):
    """Yield one geno2mp JSON document per ALT allele of a VCF record.

    The _id is the HGVS id derived from (CHROM, POS, REF, ALT); generation
    stops at the first ALT whose HGVS id cannot be computed.
    """
    hpo_ct = item.INFO['HPO_CT']
    chrom, pos, ref = item.CHROM, item.POS, item.REF
    for allele in item.ALT:
        hgvs_id, _var_type = get_hgvs_from_vcf(chrom, pos, ref, str(allele),
                                               mutant_type=True)
        if hgvs_id is None:
            # no usable _id for this record; abandon remaining ALTs too
            return
        doc = {
            "_id": hgvs_id,
            "geno2mp": {
                "hpo_count": hpo_ct,
            }
        }
        yield dict_sweep(unlist(value_convert(doc)), [None])
Exemple #6
0
def _map_line_to_json(item):
    """Yield one geno2mp JSON document per ALT allele of a VCF record.

    The _id is the HGVS id derived from (CHROM, POS, REF, ALT); generation
    stops at the first ALT whose HGVS id cannot be computed.
    """
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO  # NOTE(review): unused local, kept as-is
    hpo_count=item.INFO['HPO_CT']  # HPO profile count for this variant
    for alt in item.ALT:
        # ALT entries are special objects; stringify to the allele sequence
        alt = str(alt)
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=True)
        if HGVS is None:
            # no usable _id; abandon remaining ALTs too
            return
        one_snp_json = {
            "_id": HGVS,
            "geno2mp": {
                "hpo_count": hpo_count,

            }
        }
        obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json)), [None]))
        yield obj
Exemple #7
0
def load_data(data_file):
    """Yield FIRE score documents parsed from a tab-separated data file.

    Each non-header line contributes one document keyed by the HGVS id of
    (chrom, pos, ref, alt); unparseable lines are logged and skipped.
    """
    with open(data_file, "r+") as f:
        for line in f:
            try:
                fields = re.split("[\t \n]", line)
                if fields[0] == "Chrom":
                    continue  # header row
                doc = {
                    "_id": hgvs.get_hgvs_from_vcf(fields[0], fields[1],
                                                  fields[2], fields[3]),
                    "fire": {
                        "chr": fields[0],
                        "pos": fields[1],
                        "ref": fields[2],
                        "alt": fields[3],
                        "score": float(fields[4]),
                    },
                }
                yield doc
            except Exception as e:
                # best-effort parse: log the bad line and move on
                logging.error("Pb with %s: %s" % (line, e))
                continue
Exemple #8
0
    def parse(cls, record: vcf.model._Record) -> list:
        """
        Extract per-ALT "profile" data from a VCF record.

        There is no actual "profile" section in the gnomAD browser; the
        fields "chrom", "pos", "filter", "multi-allelic", "ref", "alt",
        "alleles", "type", and "rsid" are grouped under that name purely
        for implementation convenience.

        Returns a list of (<hgvs_id>, <profile_dict>) tuples, one per ALT,
        preserving ALT order. A list (rather than a {<hgvs_id>:
        <profile_dict>} dict) keeps that ordering explicit, which
        `PopulationFrequencyParser.parse()` relies on when iterating by
        index.
        """
        # ALT entries are special objects; normalize them to strings first
        alt_list = [str(alt) for alt in record.ALT]
        # one (hgvs_id, var_type) tuple per ALT; assumes any "chr" prefix of
        # record.CHROM has already been removed
        hgvs_list = [get_hgvs_from_vcf(record.CHROM, record.POS, record.REF, alt, mutant_type=True) for alt in alt_list]

        # for multi-allelic sites, record every variant's HGVS id as a list
        multi_allelic = [t[0] for t in hgvs_list] if len(hgvs_list) > 1 else None

        profiles = []
        for alt, (hgvs_id, var_type) in zip(alt_list, hgvs_list):
            profiles.append((hgvs_id, {
                "chrom": record.CHROM,
                "pos": record.POS,
                "filter": record.FILTER,
                "multi-allelic": multi_allelic,
                "ref": record.REF,
                "alt": alt,
                "alleles": alt_list,
                "type": var_type,
                "rsid": record.ID
            }))
        return profiles
Exemple #9
0
def _map_line_to_json(item, keys):
    """Convert one gnomAD-genome VCF record into one JSON doc per ALT.

    ``keys`` lists the INFO field names to copy; each matching key is filed
    under the lower-cased holder named after the prefix it starts with
    ('ac', 'af', 'an', 'hom', 'gc', 'hemi'). Per-allele fields (AC*, AF*,
    Hom*, Hemi*) are indexed by the ALT position; the rest are copied whole.
    Returns early (yields nothing) for chromosomes outside
    CHROM_VALID_VALUES or when an HGVS id cannot be derived.
    """
    # INFO-key prefixes that each get their own sub-dict in the output
    key_start = ["AC", "AF", "AN", "Hom", "GC", "Hemi"]
    chrom = str(item.CHROM)
    if chrom not in CHROM_VALID_VALUES:
        return
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    _filter = item.FILTER
    rsid = item.ID
    # the following value could be missing in the vcf record
    # check first if the key exists in the vcf record
    # if not, return None
    vqslod = info[
        'VQSLOD'] if 'VQSLOD' in info and info['VQSLOD'] != math.inf else None
    vqsr_culprit = info['VQSR_culprit'] if 'VQSR_culprit' in info else None
    baseqranksum = info['BaseQRankSum'] if 'BaseQRankSum' in info else None
    clippingranksum = info[
        'ClippingRankSum'] if 'ClippingRankSum' in info else None
    mqranksum = info['MQRankSum'] if 'MQRankSum' in info else None
    readposranksum = info[
        'ReadPosRankSum'] if 'ReadPosRankSum' in info else None
    qd = info['QD'] if 'QD' in info else None
    inbreedingcoeff = info[
        'InbreedingCoeff'] if 'InbreedingCoeff' in info else None
    # convert vcf object to string
    item.ALT = [str(alt) for alt in item.ALT]
    # if multiallelic, put all variants as a list in multi-allelic field
    hgvs_list = None
    if len(item.ALT) > 1:
        hgvs_list = [
            get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=False)
            for alt in item.ALT
        ]
    for i, alt in enumerate(item.ALT):
        (HGVS, var_type) = get_hgvs_from_vcf(chrom,
                                             chromStart,
                                             ref,
                                             alt,
                                             mutant_type=True)
        if HGVS is None:
            # no usable _id for this record; abandon remaining ALTs too
            return
        # per-allele INFO arrays must line up with the ALT list
        assert len(item.ALT) == len(
            info['AC']
        ), "Expecting length of item.ALT= length of info.AC, but not for %s" % (
            HGVS)
        assert len(item.ALT) == len(
            info['AF']
        ), "Expecting length of item.ALT= length of info.AF, but not for %s" % (
            HGVS)
        one_snp_json = {
            "_id": HGVS,
            "gnomad_genome": {
                "chrom": chrom,
                "pos": chromStart,
                "filter": _filter,
                "multi-allelic": hgvs_list,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "rsid": rsid,
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mqranksum": mqranksum
                },
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": vqslod,
                "vqsr_culprit": vqsr_culprit
            }
        }
        # create a holder in one_snp_json for each _start, e.g. 'ac', 'af', 'gc'
        for _start in key_start:
            one_snp_json['gnomad_genome'][_start.lower()] = {}
        # loop through each available key
        for _key in keys:
            if _key in info:
                # loop through each prefix
                for _start in key_start:
                    # "ac", "af" value is related to multi-allelic, need to deal with separately
                    if _key.startswith(_start) and _start in [
                            'AC', 'AF', 'Hom', 'Hemi'
                    ]:
                        one_snp_json['gnomad_genome'][_start.lower()][
                            _key.lower()] = info[_key][i]
                    elif _key.startswith(_start) and _start not in [
                            'AC', 'AF', 'Hom', 'Hemi'
                    ]:
                        one_snp_json['gnomad_genome'][_start.lower()][
                            _key.lower()] = info[_key]
        obj = (dict_sweep(
            unlist(
                value_convert_to_number(one_snp_json, skipped_keys=['chrom'])),
            [None]))
        yield obj
 def annotate_by_snpeff(self, varobj_list):
     '''Annotate a list of HGVS ids by piping them through SnpEff as VCF.

     Builds an in-memory VCF from the ids (SNPs, del, ins, delins), feeds
     it to the SNPEFF_CMD subprocess on stdin, then parses the ANN / LOF /
     NMD fields of each output line and yields one JSON doc per variant.
     '''
     # title of vcf
     vcf_stdin = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'
     # extract each item from list, transform into vcf format
     snpeff_valid_id = []
     for item in varobj_list:
         if '>' in item:
             # substitution, e.g. chr1:g.123A>G
             hgvs_info = self.snp_hgvs_id_parser(item)
             try:
                 vcf_stdin += self.snp_vcf_constructer(hgvs_info)
             except TypeError:
                 print(item)
                 continue
             snpeff_valid_id.append(item)
         elif item.endswith('del'):
             hgvs_info = self.del_hgvs_id_parser(item)
             try:
                 vcf_stdin += self.del_vcf_constructor(hgvs_info)
             except TypeError:
                 print(item)
                 continue
             snpeff_valid_id.append(item)
         elif 'ins' in item and 'del' not in item:
             hgvs_info = self.ins_hgvs_id_parser(item)
             try:
                 vcf_stdin += self.ins_vcf_constructor(hgvs_info)
             except TypeError:
                 print(item)
                 continue
             snpeff_valid_id.append(item)
         elif 'delins' in item:
             # NOTE(review): unlike the branches above, this one does not
             # append to snpeff_valid_id — confirm whether that is intended
             hgvs_info = self.delins_hgvs_id_parser(item)
             try:
                 vcf_stdin += self.delins_vcf_constructor(hgvs_info)
             except TypeError:
                 print(item)
                 continue
         else:
             print(item)
             print('beyond current capacity')
     # NOTE(review): communicate() is given a str and stdout is split as a
     # str — assumes Python 2 (or text-mode pipes); verify under Python 3
     proc = subprocess.Popen(SNPEFF_CMD, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     (stdout, stderr) = proc.communicate(vcf_stdin)
     assert stderr == '', stderr
     vcf_stdout_raw = stdout.split('\n')
     for vcf_line in vcf_stdout_raw:
         if vcf_line.startswith('#'):
             # header line: skip
             continue
         elif vcf_line == '':
             continue
         else:
             # assume the first item is 'ANN'
             ann_info = vcf_line.split(';')[0]
             ann = []
             # Multiple annotations per VCF line
             for item in ann_info.split(','):
                 if len(item.split('|')) > 1:
                     # fields 1-14 of a SnpEff ANN entry (field 0 is the allele)
                     (effect, putative_impact, gene_name, gene_id, feature_type, feature_id) = item.split('|')[1:7]
                     (transcript_biotype, exon, hgvs_coding, hgvs_protein, cdna, cds, protein, distance_to_feature) = item.split('|')[7:15]
                     print(effect)
                     # "position/length" pairs are split when present,
                     # otherwise both parts default to None
                     if cdna:
                         (cdna_position, cdna_len) = cdna.split('/')
                     else:
                         cdna_position = None
                         cdna_len = None
                     if cds:
                         (cds_position, cds_len) = cds.split('/')
                     else:
                         cds_position = None
                         cds_len = None
                     if protein:
                         (protein_position, protein_len) = protein.split('/')
                     else:
                         protein_position = None
                         protein_len = None
                     if exon:
                         (rank, total) = exon.split('/')
                     else:
                         rank = None
                         total = None
                     ann.append({
                         "effect": effect,
                         "putative_impact": putative_impact,
                         "genename": gene_name,
                         "gene_id": gene_id,
                         "feature_type": feature_type,
                         "feature_id": feature_id,
                         "transcript_biotype": transcript_biotype,
                         "rank": rank,
                         "total": total,
                         "hgvs.c": hgvs_coding,
                         "hgvs.p": hgvs_protein,
                         "cdna": {
                             "position": cdna_position,
                             "length": cdna_len
                         },
                         "cds": {
                             "position": cds_position,
                             "length": cds_len
                         },
                         "protein": {
                             "position": protein_position,
                             "length": protein_len
                         },
                         "distance_to_feature": distance_to_feature
                     })
                     print(ann)
             # not all annotations include lof & nmd information. Set them to 'None' as default
             lof = None
             nmd = None
             # the case that annotation include 'ann' & 'lof' & 'nmd'
             if len(vcf_line.split(';')) == 3:
                 (lof_info, nmd_info) = vcf_line.split(';')[1:3]
                 # assume the second item is 'lof'
                 assert lof_info.startswith('LOF')
                 # the information to be parsed is like this: 'LOF=(PTEN|PTEN|1|1.00)'
                 lof_info = lof_info.split('(')[1].split(')')[0]
                 nmd_info = nmd_info.split('(')[1].split(')')[0]
                 (id_lof, name_lof, nt_lof, pt_lof) = lof_info.split('|')
                 (id_nmd, name_nmd, nt_nmd, pt_nmd) = nmd_info.split('|')
                 lof = {
                     "gene_id": id_lof,
                     "genename": name_lof,
                     "number_of_transcripts_in_gene": nt_lof,
                     "percent_of_transcripts_affected": pt_lof
                 }
                 nmd = {
                     "gene_id": id_nmd,
                     "genename": name_nmd,
                     "number_of_transcripts_in_gene": nt_nmd,
                     "percent_of_transcripts_affected": pt_nmd
                 }
             # the case that annotation include 'ann' & 'lof or nmd'
             elif len(vcf_line.split(';')) == 2:
                 (ann_info, idk_info) = vcf_line.split(';')
                 if idk_info.startswith('LOF'):
                     lof_info = idk_info.split('(')[1].split(')')[0]
                     (id_lof, name_lof, nt_lof, pt_lof) = lof_info.split('|')
                     lof = {
                         "gene_id": id_lof,
                         "genename": name_lof,
                         "number_of_transcripts_in_gene": nt_lof,
                         "percent_of_transcripts_affected": pt_lof
                     }
                 else:
                     nmd_info = idk_info.split('(')[1].split(')')[0]
                     (id_nmd, name_nmd, nt_nmd, pt_nmd) = nmd_info.split('|')
                     nmd = {
                         "gene_id": id_nmd,
                         "genename": name_nmd,
                         "number_of_transcripts_in_gene": nt_nmd,
                         "percent_of_transcripts_affected": pt_nmd
                     }
             # the variant itself comes from the first five VCF columns
             (chrom, pos, _id, ref, alt) = ann_info.split('\t')[0:5]
             hgvs_id = get_hgvs_from_vcf(chrom, pos, ref, alt)
             one_snp_json = {
                 "id": hgvs_id,
                 "snpeff": {
                     "ann": ann,
                     "lof": lof,
                     "nmd": nmd,
                     "vcf": {
                         "position": pos,
                         "ref": ref,
                         "alt": alt
                     }
                 }
             }
             snpeff_json = dict_sweep(unlist(one_snp_json), vals=['', None])
             yield snpeff_json
Exemple #11
0
def _map_line_to_json(fields):
    """Yield one CADD JSON document built from a pre-split annotation row.

    ``fields`` is the list of the row's 116 tab-separated columns; the
    mapping below files each column under a descriptive key. Yields nothing
    when no HGVS id can be derived.
    """
    # row must have exactly the expected number of columns
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[0]
    chromStart = fields[1]
    ref = fields[2]
    alt = fields[4]
    HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)

    # load as json data
    if HGVS is None:
        # no usable _id; this generator then yields nothing
        return
    one_snp_json = {
        "_id": HGVS,
        "cadd": {
            'chrom': fields[0],
            'pos': fields[1],
            'ref': fields[2],
            'anc': fields[3],
            'alt': fields[4],
            'type': fields[5],
            'length': fields[6],
            'istv': fields[7],
            'isderived': fields[8],
            'annotype': fields[9],
            'consequence': fields[10],
            'consscore': fields[11],
            'consdetail': fields[12],
            'gc': fields[13],
            'cpg': fields[14],
            'mapability': {
                '20bp': fields[15],
                '35bp': fields[16]
            },
            'scoresegdup': fields[17],
            'phast_cons': {
                'primate': fields[18],
                'mammalian': fields[19],
                'vertebrate': fields[20]
            },
            'phylop': {
                'primate': fields[21],
                'mammalian': fields[22],
                'vertebrate': fields[23]
            },
            'gerp': {
                'n': fields[24],
                's': fields[25],
                'rs': fields[26],
                'rs_pval': fields[27]
            },
            'bstatistic': fields[28],
            'mutindex': fields[29],
            'dna': {
                'helt': fields[30],
                'mgw': fields[31],
                'prot': fields[32],
                'roll': fields[33]
            },
            'mirsvr': {
                'score': fields[34],
                'e': fields[35],
                'aln': fields[36]
            },
            'targetscans': fields[37],
            'fitcons': fields[38],
            'chmm': {
                'tssa': fields[39],
                'tssaflnk': fields[40],
                'txflnk': fields[41],
                'tx': fields[42],
                'txwk': fields[43],
                'enh': fields[44],
                # NOTE(review): fields[45] is dropped — it would duplicate the
                # 'enh' key; confirm the intended name for this chmm column
                # 'enh': fields[45],
                'znfrpts': fields[46],
                'het': fields[47],
                'tssbiv': fields[48],
                'bivflnk': fields[49],
                'enhbiv': fields[50],
                'reprpc': fields[51],
                'reprpcwk': fields[52],
                'quies': fields[53],
            },
            'encode': {
                'exp': fields[54],
                'h3k27ac': fields[55],
                'h3k4me1': fields[56],
                'h3k4me3': fields[57],
                'nucleo': fields[58],
                'occ': fields[59],
                'p_val': {
                    'comb': fields[60],
                    'dnas': fields[61],
                    'faire': fields[62],
                    'polii': fields[63],
                    'ctcf': fields[64],
                    'mycp': fields[65]
                },
                'sig': {
                    'dnase': fields[66],
                    'faire': fields[67],
                    'polii': fields[68],
                    'ctcf': fields[69],
                    'myc': fields[70]
                },
            },
            'segway': fields[71],
            'motif': {
                'toverlap': fields[72],
                'dist': fields[73],
                'ecount': fields[74],
                'ename': fields[75],
                'ehipos': fields[76],
                'escorechng': fields[77]
            },
            'tf': {
                'bs': fields[78],
                'bs_peaks': fields[79],
                'bs_peaks_max': fields[80]
            },
            'isknownvariant': fields[81],
            'esp': {
                'af': fields[82],
                'afr': fields[83],
                'eur': fields[84]
            },
            '1000g': {
                'af': fields[85],
                'asn': fields[86],
                'amr': fields[87],
                'afr': fields[88],
                'eur': fields[89]
            },
            'min_dist_tss': fields[90],
            'min_dist_tse': fields[91],
            'gene': {
                'gene_id': fields[92],
                'feature_id': fields[93],
                'ccds_id': fields[94],
                'genename': fields[95],
                'cds': {
                    'cdna_pos': fields[96],
                    'rel_cdna_pos': fields[97],
                    'cds_pos': fields[98],
                    'rel_cds_pos': fields[99]
                },
                'prot': {
                    'protpos': fields[100],
                    'rel_prot_pos': fields[101],
                    'domain': fields[102]
                }
            },
            'dst2splice': fields[103],
            'dst2spltype': fields[104],
            'exon': fields[105],
            'intron': fields[106],
            'oaa': fields[107],  # ref aa
            'naa': fields[108],  # alt aa
            'grantham': fields[109],
            'polyphen': {
                'cat': fields[110],
                'val': fields[111]
            },
            'sift': {
                'cat': fields[112],
                'val': fields[113]
            },
            'rawscore': fields[114],  # raw CADD score
            'phred': fields[115]  # log-percentile of raw CADD score
        }
    }

    # drop "NA" placeholders after value conversion
    obj = dict_sweep(unlist(value_convert(one_snp_json)), ["NA"])
    yield obj
def _map_line_to_json(item):
    """Yield one ExAC JSON document per ALT allele of a VCF record.

    Optional INFO keys (rank sums, QD, InbreedingCoeff) default to None
    when absent; the remaining keys (AC*, AN*, Het*, Hom*, FS, MQ, NCC,
    VQSLOD, culprit) are required and raise KeyError if missing. Stops
    yielding at the first ALT with no derivable HGVS id.
    """
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO

    def _optional(key):
        # FIX: the original used six bare `except:` blocks, which silently
        # swallowed every error; only a missing key should default to None
        try:
            return info[key]
        except KeyError:
            return None

    baseqranksum = _optional('BaseQRankSum')
    clippingranksum = _optional('ClippingRankSum')
    mqranksum = _optional('MQRankSum')
    readposranksum = _optional('ReadPosRankSum')
    qd = _optional('QD')
    inbreedingcoeff = _optional('InbreedingCoeff')
    # normalize ALT objects to plain strings once, in place
    item.ALT = [str(alt) for alt in item.ALT]
    for alt in item.ALT:
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt,
                                             mutant_type=True)
        if HGVS is None:
            # no usable _id for this record; abandon remaining ALTs too
            return
        one_snp_json = {
            "_id": HGVS,
            "exac": {
                "chrom": chrom,
                "pos": chromStart,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {
                    "ac": info['AC'],
                    "ac_afr": info['AC_AFR'],
                    "ac_amr": info['AC_AMR'],
                    "ac_adj": info['AC_Adj'],
                    "ac_eas": info['AC_EAS'],
                    "ac_fin": info['AC_FIN'],
                    "ac_het": info['AC_Het'],
                    "ac_hom": info['AC_Hom'],
                    "ac_nfe": info['AC_NFE'],
                    "ac_oth": info['AC_OTH'],
                    "ac_sas": info['AC_SAS']
                },
                "af": info['AF'],
                "an": {
                    "an": info['AN'],
                    "an_afr": info['AN_AFR'],
                    "an_amr": info['AN_AMR'],
                    "an_adj": info['AN_Adj'],
                    "an_eas": info['AN_EAS'],
                    "an_fin": info['AN_FIN'],
                    "an_nfe": info['AN_NFE'],
                    "an_oth": info['AN_OTH'],
                    "an_sas": info['AN_SAS']
                },
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {
                    "het_afr": info['Het_AFR'],
                    "het_amr": info['Het_AMR'],
                    "het_eas": info['Het_EAS'],
                    "het_fin": info['Het_FIN'],
                    "het_nfe": info['Het_NFE'],
                    "het_oth": info['Het_OTH'],
                    "het_sas": info['Het_SAS']
                },
                "hom": {
                    "hom_afr": info['Hom_AFR'],
                    "hom_amr": info['Hom_AMR'],
                    "hom_eas": info['Hom_EAS'],
                    "hom_fin": info['Hom_FIN'],
                    "hom_nfe": info['Hom_NFE'],
                    "hom_oth": info['Hom_OTH'],
                    "hom_sas": info['Hom_SAS']
                },
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        obj = dict_sweep(unlist(value_convert(one_snp_json)), [None])
        yield obj
def _map_line_to_json(doc_key, item):
    """Convert one ExAC-style VCF record into MyVariant JSON documents.

    Yields one document per ALT allele, keyed by an HGVS id.  If an HGVS
    id cannot be derived for an allele, the remainder of the record is
    abandoned (the generator returns), matching the original behavior.

    :param doc_key: top-level field name the annotations are stored under
                    (e.g. "exac" or "exac_nontcga")
    :param item: a VCF record object exposing CHROM/POS/REF/ALT/INFO
    """
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    # These INFO keys are optional per-record; dict.get() replaces the
    # original bare `except:` blocks, which silently swallowed *any*
    # exception instead of just a missing key.
    baseqranksum = info.get('BaseQRankSum')
    clippingranksum = info.get('ClippingRankSum')
    mqranksum = info.get('MQRankSum')
    readposranksum = info.get('ReadPosRankSum')
    qd = info.get('QD')
    inbreedingcoeff = info.get('InbreedingCoeff')
    # Convert the VCF ALT objects to plain strings once (the original
    # converted them in a pre-loop and then again inside the loop).
    item.ALT = [str(alt) for alt in item.ALT]
    for alt in item.ALT:
        (HGVS, var_type) = get_hgvs_from_vcf(chrom,
                                             chromStart,
                                             ref,
                                             alt,
                                             mutant_type=True)
        if HGVS is None:
            # No valid id for this allele -> skip the whole record.
            return
        one_snp_json = {
            "_id": HGVS,
            doc_key: {
                "chrom": chrom,
                "pos": chromStart,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {
                    "ac": info['AC'],
                    "ac_afr": info['AC_AFR'],
                    "ac_amr": info['AC_AMR'],
                    "ac_adj": info['AC_Adj'],
                    "ac_eas": info['AC_EAS'],
                    "ac_fin": info['AC_FIN'],
                    "ac_het": info['AC_Het'],
                    "ac_hom": info['AC_Hom'],
                    "ac_nfe": info['AC_NFE'],
                    "ac_oth": info['AC_OTH'],
                    "ac_sas": info['AC_SAS'],
                    "ac_female": info['AC_FEMALE'],
                    "ac_male": info['AC_MALE']
                },
                "af": info['AF'],
                "an": {
                    "an": info['AN'],
                    "an_afr": info['AN_AFR'],
                    "an_amr": info['AN_AMR'],
                    "an_adj": info['AN_Adj'],
                    "an_eas": info['AN_EAS'],
                    "an_fin": info['AN_FIN'],
                    "an_nfe": info['AN_NFE'],
                    "an_oth": info['AN_OTH'],
                    "an_sas": info['AN_SAS'],
                    "an_female": info['AN_FEMALE'],
                    "an_male": info['AN_MALE']
                },
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {
                    "het_afr": info['Het_AFR'],
                    "het_amr": info['Het_AMR'],
                    "het_eas": info['Het_EAS'],
                    "het_fin": info['Het_FIN'],
                    "het_nfe": info['Het_NFE'],
                    "het_oth": info['Het_OTH'],
                    "het_sas": info['Het_SAS']
                },
                "hom": {
                    "hom_afr": info['Hom_AFR'],
                    "hom_amr": info['Hom_AMR'],
                    "hom_eas": info['Hom_EAS'],
                    "hom_fin": info['Hom_FIN'],
                    "hom_nfe": info['Hom_NFE'],
                    "hom_oth": info['Hom_OTH'],
                    "hom_sas": info['Hom_SAS']
                },
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json)),
                          [None]))
        yield obj
Exemple #14
0
def _map_line_to_json(df):
    """Map one biomuta row (a dict of column values) to a MyVariant doc."""
    chrom = df["chr_id"]
    pos = df["chr_pos"]
    # MyVariant ids use 'MT' for the mitochondrial chromosome.
    if chrom == 'M':
        chrom = 'MT'

    ref = df["ref_nt"]
    alt = df["alt_nt"]

    HGVS = get_hgvs_from_vcf(chrom, int(pos), ref, alt, mutant_type=False)

    def _clean(field):
        # '-' is biomuta's missing-value marker.
        return clean_data(df[field], ("-",))

    do_id = _clean("do_id")
    do_name_id, do_name = do_name_split(df["do_name"])
    if do_id and do_name_id:
        assert do_id == do_name_id, "do_id mismatch!"

    # Assemble the document.
    one_snp_json = {
        "_id": HGVS,
        "biomuta": {
            'chrom': chrom,
            'pos': pos,
            'ref': ref,
            'alt': alt,
            'transcript_id': _clean("transcript_id"),
            'peptide_id': _clean("peptide_id"),
            'uniprot_ac': _clean("uniprot_ac"),
            'refseq_ac': _clean("refseq_ac"),
            'cds_pos': _clean("cds_pos"),
            'pep_pos': _clean("pep_pos"),
            'uniprot_pos': _clean("uniprot_pos"),
            'ref_aa': _clean("ref_aa"),
            'alt_aa': _clean("alt_aa"),
            'mut_freq': _clean("mut_freq"),
            'data_src': _clean("data_src"),
            'do_id': {
                "do_id": do_id,
                "do_name": do_name
            },
            'uberon_id': to_list(df["uberon_id"]),
            'gene_name': _clean("gene_name"),
            'pmid': to_list(df["pmid_list"]),
        }
    }

    # Fold in optional site annotation / prediction sub-documents.
    site_ann = site_ann_parser(df["site_ann"])
    for extra in site_ann or []:
        one_snp_json["biomuta"].update(extra)

    site_prd = site_prd_parser(_clean("site_prd"))
    if site_prd:
        one_snp_json["biomuta"].update(site_prd)

    one_snp_json = value_convert_to_number(one_snp_json)
    # chrom and do_id must stay strings even when they look numeric.
    one_snp_json['biomuta']['chrom'] = str(one_snp_json['biomuta']['chrom'])
    one_snp_json['biomuta']['do_id']['do_id'] = str(one_snp_json['biomuta']['do_id']['do_id'])
    return one_snp_json
Exemple #15
0
def load_data(data_folder):
    """Load CIViC variant JSON files and yield MyVariant documents.

    Reads every ``variant_*.json`` under *data_folder*, builds an ``_id``
    (an HGVS id when coordinates allow, otherwise a deletion/insertion
    HGVS string, otherwise a ``CIVIC_VARIANT:<id>`` fallback), normalizes
    evidence sources/doids, and yields the cleaned document.
    """
    # number of civic ids with ref, alt, chrom
    no_case1 = 0
    # number of civic ids with chrom, ref, but no alt
    no_case2 = 0
    # number of civic ids with chrom, alt, but no ref
    no_case3 = 0
    # number of civic ids with no alt and ref
    no_case4 = 0
    for infile in glob.glob(os.path.join(data_folder, "variant_*.json")):
        # use a context manager so the file handle is closed
        # (was: json.load(open(infile)))
        with open(infile) as f:
            doc = json.load(f)
        if set(['error', 'status']) != set(doc.keys()):
            [chrom, pos, ref, alt] = [doc['coordinates'][x] for x in ['chromosome', 'start', 'reference_bases', 'variant_bases']]
            variant_id = doc.pop("id")
            new_doc = {}
            doc['variant_id'] = variant_id
            if chrom and ref and alt:
                no_case1 += 1
                try:
                    new_doc['_id'] = get_hgvs_from_vcf(chrom, pos, ref, alt)
                except ValueError:
                    logging.warning("id has ref,alt, but coudn't be converted to hgvs id: {}".format(variant_id))
                    continue
            # handle cases of deletions where only ref info is provided
            elif chrom and ref and not alt:
                no_case2 += 1
                start = int(pos)
                end = int(pos) + len(ref) - 1
                if start == end:
                    new_doc['_id'] = 'chr{0}:g.{1}del'.format(chrom, start)
                else:
                    new_doc['_id'] = 'chr{0}:g.{1}_{2}del'.format(chrom, start, end)
            # handle cases of insertions where only alt info is provided
            elif chrom and alt and not ref:
                no_case3 += 1
                # BUG FIX: the original formatted `start`/`end` here, but
                # neither is defined in this branch (NameError on first
                # hit, or stale values leaking from a previous deletion
                # record).  An insertion is anchored between the start
                # position and the following base.
                start = int(pos)
                end = start + 1
                new_doc['_id'] = 'chr{0}:g.{1}_{2}ins{3}'.format(chrom, start, end, alt)
            # handle cases where no ref or alt info provided,
            # in this case, use CIVIC internal ID as the primary id for MyVariant.info, e.g. CIVIC_VARIANT:1
            else:
                no_case4 += 1
                new_doc['_id'] = 'CIVIC_VARIANT:' + str(variant_id)
            for _evidence in doc['evidence_items']:
                # change doid into its formal representation, e.g. DOID:1
                if 'disease' in _evidence and 'doid' in _evidence['disease'] and _evidence['disease']['doid']:
                    _evidence['disease']['doid'] = 'DOID:' + _evidence['disease']['doid']
                if 'source' in _evidence and 'citation_id' in _evidence['source']:
                    if _evidence['source']['source_type'] == "PubMed":
                        _evidence['source']['pubmed'] = to_int(_evidence['source']['citation_id'])
                        _evidence['source'].pop('source_type')
                        _evidence['source'].pop('citation_id')
                    elif _evidence['source']['source_type'] == "ASCO":
                        _evidence['source']['asco'] = to_int(_evidence['source']['citation_id'])
                        _evidence['source'].pop('source_type')
                        _evidence['source'].pop('citation_id')
                    else:
                        raise ValueError("The value of source_type is not one of PubMed or ASCO, it's {}, need to restructure parser".format(_evidence['source']['source_type']))
            new_doc['civic'] = doc
            yield dict_sweep(unlist(new_doc), ['', 'null', 'N/A', None, [], {}])
        else:
            continue
    logging.info("number of ids with ref, alt, chrom: {}".format(no_case1))
    logging.info("number of ids with chrom, ref but no alt: {}".format(no_case2))
    logging.info("number of ids with chrom, alt but no ref: {}".format(no_case3))
    logging.info("number of ids with no ref and alt: {}".format(no_case4))
Exemple #16
0
def _map_line_to_json(doc_key, item):
    """Convert one multi-allelic ExAC VCF record into MyVariant documents.

    Yields one document per ALT allele; allele-indexed INFO fields
    (AC/AF/...) are selected by the allele's position.  If an HGVS id
    cannot be derived for an allele, the remainder of the record is
    abandoned (the generator returns), matching the original behavior.

    :param doc_key: top-level field name the annotations are stored under
    :param item: a VCF record object exposing CHROM/POS/REF/ALT/INFO
    """
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    # These INFO keys are optional per-record; dict.get() replaces the
    # original bare `except:` blocks, which silently swallowed *any*
    # exception instead of just a missing key.
    baseqranksum = info.get('BaseQRankSum')
    clippingranksum = info.get('ClippingRankSum')
    mqranksum = info.get('MQRankSum')
    readposranksum = info.get('ReadPosRankSum')
    qd = info.get('QD')
    inbreedingcoeff = info.get('InbreedingCoeff')
    # convert vcf object to string
    item.ALT = [str(alt) for alt in item.ALT]
    # if multiallelic, put all variants as a list in multi-allelic field
    hgvs_list = None
    if len(item.ALT) > 1:
        hgvs_list = [get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=False) for alt in item.ALT]
    for i, alt in enumerate(item.ALT):
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=True)
        if HGVS is None:
            # No valid id for this allele -> skip the whole record.
            return
        # Allele-indexed INFO fields must line up with item.ALT.
        assert len(item.ALT) == len(info['AC']), "Expecting length of item.ALT= length of info.AC, but not for %s" % (HGVS)
        assert len(item.ALT) == len(info['AF']), "Expecting length of item.ALT= length of info.AF, but not for %s" % (HGVS)
        assert len(item.ALT) == len(info['Hom_AFR']), "Expecting length of item.ALT= length of HOM_AFR, but not for %s" % (HGVS)
        one_snp_json = {
            "_id": HGVS,
            doc_key: {
                "chrom": chrom,
                "pos": chromStart,
                "multi-allelic": hgvs_list,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {
                    "ac": info['AC'][i],
                    "ac_afr": info['AC_AFR'][i],
                    "ac_amr": info['AC_AMR'][i],
                    "ac_adj": info['AC_Adj'][i],
                    "ac_eas": info['AC_EAS'][i],
                    "ac_fin": info['AC_FIN'][i],
                    "ac_het": info['AC_Het'][i],
                    "ac_hom": info['AC_Hom'][i],
                    "ac_nfe": info['AC_NFE'][i],
                    "ac_oth": info['AC_OTH'][i],
                    "ac_sas": info['AC_SAS'][i],
                    "ac_male": info['AC_MALE'][i],
                    "ac_female": info['AC_FEMALE'][i]
                },
                "af": info['AF'][i],
                "an": {
                    "an": info['AN'],
                    "an_afr": info['AN_AFR'],
                    "an_amr": info['AN_AMR'],
                    "an_adj": info['AN_Adj'],
                    "an_eas": info['AN_EAS'],
                    "an_fin": info['AN_FIN'],
                    "an_nfe": info['AN_NFE'],
                    "an_oth": info['AN_OTH'],
                    "an_sas": info['AN_SAS'],
                    "an_female": info['AN_FEMALE'],
                    "an_male": info['AN_MALE']
                },
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {
                    "het_afr": info['Het_AFR'],
                    "het_amr": info['Het_AMR'],
                    "het_eas": info['Het_EAS'],
                    "het_fin": info['Het_FIN'],
                    "het_nfe": info['Het_NFE'],
                    "het_oth": info['Het_OTH'],
                    "het_sas": info['Het_SAS']
                },
                "hom": {
                    "hom_afr": info['Hom_AFR'],
                    "hom_amr": info['Hom_AMR'],
                    "hom_eas": info['Hom_EAS'],
                    "hom_fin": info['Hom_FIN'],
                    "hom_nfe": info['Hom_NFE'],
                    "hom_oth": info['Hom_OTH'],
                    "hom_sas": info['Hom_SAS']
                },
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json)), [None]))
        yield obj
Exemple #17
0
def _map_line_to_json(df):
    """Map one CCLE/DepMap mutation row (a dict of column values) to a
    MyVariant JSON document keyed by an HGVS id."""
    chrom = df['chromosome']
    # MyVariant ids use 'MT' for the mitochondrial chromosome.
    if chrom == 'M':
        chrom = 'MT'

    ref = df["reference_allele"]
    alt = df["tumor_seq_allele1"]
    if alt == '-':
        # deletion: anchor one base upstream with an 'N' placeholder
        HGVS = get_hgvs_from_vcf(chrom,
                                 int(df['start_position']) - 1,
                                 'N' + ref,
                                 'N',
                                 mutant_type=False)
    elif ref == '-':
        # insertion: anchor one base upstream with an 'N' placeholder
        HGVS = get_hgvs_from_vcf(chrom,
                                 int(df['start_position']) - 1,
                                 'N',
                                 'N' + alt,
                                 mutant_type=False)
    else:
        HGVS = get_hgvs_from_vcf(chrom,
                                 int(df['start_position']),
                                 ref,
                                 alt,
                                 mutant_type=False)

    def _flag(field):
        # 'TRUE'/'FALSE' strings -> booleans
        return to_boolean(df[field], true_str=['TRUE', ], false_str=['FALSE', ])

    ccle_depmap = {
        'gene': {'id': df['entrez_gene_id'], 'symbol': df['hugo_symbol']},
        'chrom': chrom,
        'hg19': {'start': df['start_position'], 'end': df['end_position']},
        'strand': df['strand'],
        'class': df['variant_classification'],
        'vartype': df['variant_type'],
        'ref': df['reference_allele'],
        'tumor_seq_allele1': df['tumor_seq_allele1'],
        'dbsnp': {'rsid': df['dbsnp_rs'], 'val_status': df['dbsnp_val_status']},
        'genome_change': df['genome_change'],
        'annotation_transcript': df['annotation_transcript'],
        'tumor_sample_barcode': df['tumor_sample_barcode'],
        'cdna_change': df['cdna_change'],
        'codon_change': df['codon_change'],
        'protein_change': df['protein_change'],
        'isdeleterious': _flag('isdeleterious'),
        'istcgahotspot': _flag('istcgahotspot'),
        'tcgahscnt': df['tcgahscnt'],
        'iscosmichotspot': _flag('iscosmichotspot'),
        'cosmichscnt': df['cosmichscnt'],
        'exac_af': df['exac_af'],
        'wes_ac': df['wes_ac'],
        'sanger': {'wes_ac': df['sangerwes_ac'],
                   'recalibwes_ac': df['sangerrecalibwes_ac']},
        'rnaseq_ac': df['rnaseq_ac'],
        'hc_ac': df['hc_ac'],
        'rd_ac': df['rd_ac'],
        'wgs_ac': df['wgs_ac'],
        'broad_id': df['broad_id'],
    }

    # drop empty values before wrapping into the final document
    ccle_depmap = dict_sweep(ccle_depmap)

    one_snp_json = value_convert_to_number({"_id": HGVS, "ccle": ccle_depmap})
    # chrom must stay a string even when it looks numeric
    one_snp_json['ccle']['chrom'] = str(one_snp_json['ccle']['chrom'])
    return one_snp_json
 def annotate_by_snpeff(self, varobj_list):
     '''Annotate a list of HGVS ids by piping them as a VCF to SnpEff.

     Converts each HGVS id in *varobj_list* into a VCF line (SNPs,
     deletions, insertions, delins), feeds the assembled VCF text to the
     external SNPEFF_CMD subprocess, then parses the ANN / LOF / NMD
     INFO fields of each output line and yields one dict per variant.

     NOTE(review): communicate() is given a str and stderr is compared
     to '' -- this assumes text-mode pipes (Python 2 semantics); on
     Python 3 the Popen call would need text=True / universal_newlines.
     '''
     # title of vcf
     vcf_stdin = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'
     # extract each item from list, transform into vcf format
     snpeff_valid_id = []
     for item in varobj_list:
         if '>' in item:
             # substitution id (e.g. chr1:g.123A>C)
             hgvs_info = self.snp_hgvs_id_parser(item)
             try:
                 vcf_stdin += self.snp_vcf_constructer(hgvs_info)
             except TypeError:
                 # parser returned something unusable; skip this id
                 print(item)
                 continue
             snpeff_valid_id.append(item)
         elif item.endswith('del'):
             # deletion id (ends with 'del')
             hgvs_info = self.del_hgvs_id_parser(item)
             try:
                 vcf_stdin += self.del_vcf_constructor(hgvs_info)
             except TypeError:
                 print(item)
                 continue
             snpeff_valid_id.append(item)
         elif 'ins' in item and 'del' not in item:
             # pure insertion id
             hgvs_info = self.ins_hgvs_id_parser(item)
             try:
                 vcf_stdin += self.ins_vcf_constructor(hgvs_info)
             except TypeError:
                 print(item)
                 continue
             snpeff_valid_id.append(item)
         elif 'delins' in item:
             # NOTE(review): unlike the branches above, the id is not
             # appended to snpeff_valid_id here -- possibly an oversight.
             hgvs_info = self.delins_hgvs_id_parser(item)
             try:
                 vcf_stdin += self.delins_vcf_constructor(hgvs_info)
             except TypeError:
                 print(item)
                 continue
         else:
             print(item)
             print('beyond current capacity')
     # run SnpEff on the assembled VCF text and capture its output
     proc = subprocess.Popen(SNPEFF_CMD,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
     (stdout, stderr) = proc.communicate(vcf_stdin)
     assert stderr == '', stderr
     vcf_stdout_raw = stdout.split('\n')
     for vcf_line in vcf_stdout_raw:
         if vcf_line.startswith('#'):
             # skip VCF header lines
             continue
         elif vcf_line == '':
             continue
         else:
             # assume the first item is 'ANN'
             ann_info = vcf_line.split(';')[0]
             ann = []
             # Multiple annotations per VCF line
             for item in ann_info.split(','):
                 if len(item.split('|')) > 1:
                     # ANN sub-fields 1-6: effect through feature id
                     (effect, putative_impact, gene_name, gene_id,
                      feature_type, feature_id) = item.split('|')[1:7]
                     # ANN sub-fields 7-14: biotype through distance
                     (transcript_biotype, exon, hgvs_coding, hgvs_protein,
                      cdna, cds, protein,
                      distance_to_feature) = item.split('|')[7:15]
                     print(effect)
                     # cdna/cds/protein/exon are "position/length" pairs
                     # when present, empty strings otherwise
                     if cdna:
                         (cdna_position, cdna_len) = cdna.split('/')
                     else:
                         cdna_position = None
                         cdna_len = None
                     if cds:
                         (cds_position, cds_len) = cds.split('/')
                     else:
                         cds_position = None
                         cds_len = None
                     if protein:
                         (protein_position,
                          protein_len) = protein.split('/')
                     else:
                         protein_position = None
                         protein_len = None
                     if exon:
                         (rank, total) = exon.split('/')
                     else:
                         rank = None
                         total = None
                     ann.append({
                         "effect": effect,
                         "putative_impact": putative_impact,
                         "genename": gene_name,
                         "gene_id": gene_id,
                         "feature_type": feature_type,
                         "feature_id": feature_id,
                         "transcript_biotype": transcript_biotype,
                         "rank": rank,
                         "total": total,
                         "hgvs.c": hgvs_coding,
                         "hgvs.p": hgvs_protein,
                         "cdna": {
                             "position": cdna_position,
                             "length": cdna_len
                         },
                         "cds": {
                             "position": cds_position,
                             "length": cds_len
                         },
                         "protein": {
                             "position": protein_position,
                             "length": protein_len
                         },
                         "distance_to_feature": distance_to_feature
                     })
                     print(ann)
             # not all annotations include lof & nmd information. Set them to 'None' as default
             lof = None
             nmd = None
             # the case that annotation include 'ann' & 'lof' & 'nmd'
             if len(vcf_line.split(';')) == 3:
                 (lof_info, nmd_info) = vcf_line.split(';')[1:3]
                 # assume the second item is 'lof'
                 assert lof_info.startswith('LOF')
                 # the information to be parsed is like this: 'LOF=(PTEN|PTEN|1|1.00)'
                 lof_info = lof_info.split('(')[1].split(')')[0]
                 nmd_info = nmd_info.split('(')[1].split(')')[0]
                 (id_lof, name_lof, nt_lof, pt_lof) = lof_info.split('|')
                 (id_nmd, name_nmd, nt_nmd, pt_nmd) = nmd_info.split('|')
                 lof = {
                     "gene_id": id_lof,
                     "genename": name_lof,
                     "number_of_transcripts_in_gene": nt_lof,
                     "percent_of_transcripts_affected": pt_lof
                 }
                 nmd = {
                     "gene_id": id_nmd,
                     "genename": name_nmd,
                     "number_of_transcripts_in_gene": nt_nmd,
                     "percent_of_transcripts_affected": pt_nmd
                 }
             # the case that annotation include 'ann' & 'lof or nmd'
             elif len(vcf_line.split(';')) == 2:
                 (ann_info, idk_info) = vcf_line.split(';')
                 if idk_info.startswith('LOF'):
                     lof_info = idk_info.split('(')[1].split(')')[0]
                     (id_lof, name_lof, nt_lof,
                      pt_lof) = lof_info.split('|')
                     lof = {
                         "gene_id": id_lof,
                         "genename": name_lof,
                         "number_of_transcripts_in_gene": nt_lof,
                         "percent_of_transcripts_affected": pt_lof
                     }
                 else:
                     nmd_info = idk_info.split('(')[1].split(')')[0]
                     (id_nmd, name_nmd, nt_nmd,
                      pt_nmd) = nmd_info.split('|')
                     nmd = {
                         "gene_id": id_nmd,
                         "genename": name_nmd,
                         "number_of_transcripts_in_gene": nt_nmd,
                         "percent_of_transcripts_affected": pt_nmd
                     }
             # rebuild the variant's HGVS id from the VCF coordinate columns
             (chrom, pos, _id, ref, alt) = ann_info.split('\t')[0:5]
             hgvs_id = get_hgvs_from_vcf(chrom, pos, ref, alt)
             one_snp_json = {
                 "id": hgvs_id,
                 "snpeff": {
                     "ann": ann,
                     "lof": lof,
                     "nmd": nmd,
                     "vcf": {
                         "position": pos,
                         "ref": ref,
                         "alt": alt
                     }
                 }
             }
             snpeff_json = dict_sweep(unlist(one_snp_json), vals=['', None])
             yield snpeff_json
Exemple #19
0
def _map_line_to_json(fields):
    """Convert one CADD TSV row (a list of column values) into a JSON doc.

    *fields* must contain exactly VALID_COLUMN_NO entries; columns are
    addressed positionally following the CADD annotation file layout.
    Implemented as a generator: yields a single document, or nothing
    when an HGVS id cannot be built from the first columns.
    """
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[0]
    chromStart = fields[1]
    ref = fields[2]
    # fields[3] is the ancestral allele ('anc' below); alt is column 4
    alt = fields[4]
    HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)

    # load as json data
    if HGVS is None:
        # no usable id -> yield nothing for this row
        return
    one_snp_json = {
        "_id": HGVS,
        "cadd": {
            'chrom': fields[0],
            'pos': fields[1],
            'ref': fields[2],
            'anc': fields[3],
            'alt': fields[4],
            'type': fields[5],
            'length': fields[6],
            'istv': fields[7],
            'isderived': fields[8],
            'annotype': fields[9],
            'consequence': fields[10],
            'consscore': fields[11],
            'consdetail': fields[12],
            'gc': fields[13],
            'cpg': fields[14],
            'mapability': {
                '20bp': fields[15],
                '35bp': fields[16]
            },
            'scoresegdup': fields[17],
            'phast_cons': {
                'primate': fields[18],
                'mammalian': fields[19],
                'vertebrate': fields[20]
            },
            'phylop': {
                'primate': fields[21],
                'mammalian': fields[22],
                'vertebrate': fields[23]
            },
            'gerp': {
                'n': fields[24],
                's': fields[25],
                'rs': fields[26],
                'rs_pval': fields[27]
            },
            'bstatistic': fields[28],
            'mutindex': fields[29],
            'dna': {
                'helt': fields[30],
                'mgw': fields[31],
                'prot': fields[32],
                'roll': fields[33]
            },
            'mirsvr': {
                'score': fields[34],
                'e': fields[35],
                'aln': fields[36]
            },
            'targetscans': fields[37],
            'fitcons': fields[38],
            'chmm': {
                'tssa': fields[39],
                'tssaflnk': fields[40],
                'txflnk': fields[41],
                'tx': fields[42],
                'txwk': fields[43],
                'enh': fields[44],
                # NOTE(review): fields[45] is skipped -- it would be a
                # duplicate 'enh' key, as the commented-out line shows
                # 'enh': fields[45],
                'znfrpts': fields[46],
                'het': fields[47],
                'tssbiv': fields[48],
                'bivflnk': fields[49],
                'enhbiv': fields[50],
                'reprpc': fields[51],
                'reprpcwk': fields[52],
                'quies': fields[53],
            },
            'encode': {
                'exp': fields[54],
                'h3k27ac': fields[55],
                'h3k4me1': fields[56],
                'h3k4me3': fields[57],
                'nucleo': fields[58],
                'occ': fields[59],
                'p_val': {
                    'comb': fields[60],
                    'dnas': fields[61],
                    'faire': fields[62],
                    'polii': fields[63],
                    'ctcf': fields[64],
                    'mycp': fields[65]
                },
                'sig': {
                    'dnase': fields[66],
                    'faire': fields[67],
                    'polii': fields[68],
                    'ctcf': fields[69],
                    'myc': fields[70]
                },
            },
            'segway': fields[71],
            'motif': {
                'toverlap': fields[72],
                'dist': fields[73],
                'ecount': fields[74],
                'ename': fields[75],
                'ehipos': fields[76],
                'escorechng': fields[77]
            },
            'tf': {
                'bs': fields[78],
                'bs_peaks': fields[79],
                'bs_peaks_max': fields[80]
            },
            'isknownvariant': fields[81],
            'esp': {
                'af': fields[82],
                'afr': fields[83],
                'eur': fields[84]
            },
            '1000g': {
                'af': fields[85],
                'asn': fields[86],
                'amr': fields[87],
                'afr': fields[88],
                'eur': fields[89]
            },
            'min_dist_tss': fields[90],
            'min_dist_tse': fields[91],
            'gene': {
                'gene_id': fields[92],
                'feature_id': fields[93],
                'ccds_id': fields[94],
                'genename': fields[95],
                'cds': {
                    'cdna_pos': fields[96],
                    'rel_cdna_pos': fields[97],
                    'cds_pos': fields[98],
                    'rel_cds_pos': fields[99]
                },
                'prot': {
                    'protpos': fields[100],
                    'rel_prot_pos': fields[101],
                    'domain': fields[102]
                }
            },
            'dst2splice': fields[103],
            'dst2spltype': fields[104],
            'exon': fields[105],
            'intron': fields[106],
            'oaa': fields[107],   # ref aa
            'naa': fields[108],   # alt aa
            'grantham': fields[109],
            'polyphen': {
                'cat': fields[110],
                'val': fields[111]
            },
            'sift': {
                'cat': fields[112],
                'val': fields[113]
            },
            'rawscore': fields[114],    # raw CADD score
            'phred': fields[115]        # log-percentile of raw CADD score
        }
    }

    # drop "NA" placeholders and flatten single-element lists
    obj = dict_sweep(unlist(value_convert(one_snp_json)), ["NA"])
    yield obj
Exemple #20
0
def _map_line_to_json(fields, version):
    """Convert one tab-split EVS record into a JSON document keyed by HGVS id.

    fields  -- list of column values from one EVS line; fields[0] is the
               grch37 "chrom:pos" locus, fields[30] the grch38 locus and
               fields[3] the "REF>ALT" mutation string.
    version -- assembly used for the "_id" HGVS name: 'hg19' or 'hg38'.

    Returns the swept/value-converted document, or None when fields[3]
    carries no mutation or no HGVS id could be built.
    """
    chrInfo = fields[0].split(":")  # grch37 locus, "chrom:pos"
    chrom = chrInfo[0]
    chromStart = int(chrInfo[1])
    ma_fin_percent = fields[7].split("/")

    # Guard: without a "REF>ALT" string there is nothing to build an HGVS
    # id from.  (The original fell through to `if HGVS is None` with HGVS
    # unbound, raising UnboundLocalError.)
    if not fields[3]:
        return None
    mutation = fields[3].split(">")
    ref = mutation[0]
    alt = mutation[1]
    hg19 = get_pos_start_end(chrom, chromStart, ref, alt)
    hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref,
                             alt)
    if version == 'hg19':
        HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)
    elif version == 'hg38':
        HGVS = get_hgvs_from_vcf(chrom, hg38[0], ref, alt)
    else:
        # Unknown assembly name: previously also an UnboundLocalError.
        HGVS = None

    if HGVS is None:
        return None

    # Assemble the nested document; field indices follow the EVS column
    # layout used throughout this loader.
    one_snp_json = {
        "_id": HGVS,
        "evs": {
            "chrom": chrom,
            "hg19": {
                "start": hg19[0],
                "end": hg19[1]
            },
            "hg38": {
                "start": hg38[0],
                "end": hg38[1]
            },
            "rsid": fields[1],
            "dbsnp_version": get_dbsnp(fields[2]),
            "ref": ref,
            "alt": alt,
            "allele_count": {
                "european_american": count_dict(fields[4]),
                "african_american": count_dict(fields[5]),
                "all": count_dict(fields[6])
            },
            "ma_fin_percent": {
                "european_american": ma_fin_percent[0],
                "african_american": ma_fin_percent[1],
                "all": ma_fin_percent[2]
            },
            "genotype_count": {
                "european_american": count_dict(fields[8]),
                "african_american": count_dict(fields[9]),
                "all_genotype": count_dict(fields[10])
            },
            "avg_sample_read": fields[11],
            "gene": {
                "symbol": fields[12],
                "accession": fields[13]
            },
            "function_gvs": fields[14],
            "hgvs": {
                "coding": fields[16],
                "protein": fields[15]
            },
            "coding_dna_size": fields[17],
            "conservation": {
                "phast_cons": fields[18],
                "gerp": fields[19]
            },
            "grantham_score": fields[20],
            "polyphen2": {
                "class": polyphen(fields[21])[0],
                "score": polyphen(fields[21])[1]
            },
            "ref_base_ncbi": fields[22],
            "chimp_allele": fields[23],
            "clinical_info": fields[24],
            "filter_status": fields[25],
            "on_illumina_human_exome_chip": fields[26],
            "gwas_pubmed_info": fields[27],
            "estimated_age_kyrs": {
                "ea": fields[28],
                "aa": fields[29]
            }
        }
    }
    # Drop placeholder values and normalize scalar types before indexing.
    return dict_sweep(value_convert(one_snp_json),
                      vals=["NA", "none", "unknown"])
Exemple #21
0
def _map_line_to_json(doc_key, item):
    """Yield one JSON document per ALT allele of a VCF record.

    doc_key -- top-level key under which the annotation payload is nested.
    item    -- a VCF record object exposing CHROM/POS/REF/ALT/INFO/FILTER.

    NOTE(review): when an HGVS id cannot be built for an allele this
    `return`s, ending the whole generator (remaining alleles are dropped)
    — preserved as-is; confirm that is intended rather than `continue`.
    """
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    _filter = item.FILTER

    def _optional(key):
        # Optional INFO fields default to None when absent.  (Replaces a
        # chain of bare `except:` blocks that also swallowed unrelated
        # errors such as KeyboardInterrupt.)
        try:
            return info[key]
        except KeyError:
            return None

    baseqranksum = _optional('BaseQRankSum')
    clippingranksum = _optional('ClippingRankSum')
    mqranksum = _optional('MQRankSum')
    readposranksum = _optional('ReadPosRankSum')
    qd = _optional('QD')
    inbreedingcoeff = _optional('InbreedingCoeff')

    # convert vcf object to string
    item.ALT = [str(alt) for alt in item.ALT]
    # if multiallelic, put all variants as a list in multi-allelic field
    hgvs_list = None
    if len(item.ALT) > 1:
        hgvs_list = [
            get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=False)
            for alt in item.ALT
        ]
    for i, alt in enumerate(item.ALT):
        (HGVS, var_type) = get_hgvs_from_vcf(chrom,
                                             chromStart,
                                             ref,
                                             alt,
                                             mutant_type=True)
        if HGVS is None:
            return
        # Per-allele INFO arrays must line up with the ALT list or the
        # positional indexing below would silently mix alleles.
        assert len(item.ALT) == len(
            info['AC']
        ), "Expecting length of item.ALT= length of info.AC, but not for %s" % (
            HGVS)
        assert len(item.ALT) == len(
            info['AF']
        ), "Expecting length of item.ALT= length of info.AF, but not for %s" % (
            HGVS)
        assert len(item.ALT) == len(
            info['Hom_AFR']
        ), "Expecting length of item.ALT= length of HOM_AFR, but not for %s" % (
            HGVS)
        one_snp_json = {
            "_id": HGVS,
            doc_key: {
                "chrom": chrom,
                "pos": chromStart,
                "filter": _filter,
                "multi-allelic": hgvs_list,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {
                    "ac": info['AC'][i],
                    "ac_afr": info['AC_AFR'][i],
                    "ac_amr": info['AC_AMR'][i],
                    "ac_adj": info['AC_Adj'][i],
                    "ac_eas": info['AC_EAS'][i],
                    "ac_fin": info['AC_FIN'][i],
                    "ac_het": info['AC_Het'][i],
                    "ac_hom": info['AC_Hom'][i],
                    "ac_nfe": info['AC_NFE'][i],
                    "ac_oth": info['AC_OTH'][i],
                    "ac_sas": info['AC_SAS'][i],
                    "ac_male": info['AC_MALE'][i],
                    "ac_female": info['AC_FEMALE'][i]
                },
                "af": info['AF'][i],
                "an": {
                    "an": info['AN'],
                    "an_afr": info['AN_AFR'],
                    "an_amr": info['AN_AMR'],
                    "an_adj": info['AN_Adj'],
                    "an_eas": info['AN_EAS'],
                    "an_fin": info['AN_FIN'],
                    "an_nfe": info['AN_NFE'],
                    "an_oth": info['AN_OTH'],
                    "an_sas": info['AN_SAS'],
                    "an_female": info['AN_FEMALE'],
                    "an_male": info['AN_MALE']
                },
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {
                    "het_afr": info['Het_AFR'],
                    "het_amr": info['Het_AMR'],
                    "het_eas": info['Het_EAS'],
                    "het_fin": info['Het_FIN'],
                    "het_nfe": info['Het_NFE'],
                    "het_oth": info['Het_OTH'],
                    "het_sas": info['Het_SAS']
                },
                "hom": {
                    "hom_afr": info['Hom_AFR'],
                    "hom_amr": info['Hom_AMR'],
                    "hom_eas": info['Hom_EAS'],
                    "hom_fin": info['Hom_FIN'],
                    "hom_nfe": info['Hom_NFE'],
                    "hom_oth": info['Hom_OTH'],
                    "hom_sas": info['Hom_SAS']
                },
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        # Flatten single-element lists, coerce numeric strings, then drop
        # None placeholders before yielding.
        obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json)),
                          [None]))
        yield obj
Exemple #22
0
def _map_line_to_json(fields):
    """Convert one tab-split EVS record into a JSON document keyed by an
    hg19-based HGVS id.

    fields -- list of column values from one EVS line; fields[0] is the
              grch37 "chrom:pos" locus, fields[30] the grch38 locus and
              fields[3] the "REF>ALT" mutation string.

    Returns the swept/value-converted document, or None when fields[3]
    carries no mutation or no HGVS id could be built.
    """
    chrInfo = fields[0].split(":")  # grch37 locus, "chrom:pos"
    chrom = chrInfo[0]
    chromStart = int(chrInfo[1])

    ma_fin_percent = fields[7].split("/")

    # Guard: without a "REF>ALT" string there is nothing to build an HGVS
    # id from.  (The original fell through to `if HGVS is None` with HGVS
    # unbound, raising UnboundLocalError.)
    if not fields[3]:
        return None
    mutation = fields[3].split(">")
    ref = mutation[0]
    alt = mutation[1]
    HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)
    hg19 = get_pos_start_end(chrom, chromStart, ref, alt)
    hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref, alt)

    if HGVS is None:
        return None

    # Assemble the nested document; field indices follow the EVS column
    # layout used throughout this loader.
    one_snp_json = {
        "_id": HGVS,
        "evs":
            {
                "chrom": chrom,
                "hg19":
                    {
                        "start": hg19[0],
                        "end": hg19[1]
                    },
                "hg38":
                    {
                        "start": hg38[0],
                        "end": hg38[1]
                    },
                "rsid": fields[1],
                "dbsnp_version": get_dbsnp(fields[2]),
                "ref": ref,
                "alt": alt,
                "allele_count":
                    {
                        "european_american": count_dict(fields[4]),
                        "african_american": count_dict(fields[5]),
                        "all": count_dict(fields[6])
                    },
                "ma_fin_percent":
                    {
                        "european_american": ma_fin_percent[0],
                        "african_american": ma_fin_percent[1],
                        "all": ma_fin_percent[2]
                    },
                "genotype_count":
                    {
                        "european_american": count_dict(fields[8]),
                        "african_american": count_dict(fields[9]),
                        "all_genotype": count_dict(fields[10])
                    },
                "avg_sample_read": fields[11],
                "gene":
                    {
                        "symbol": fields[12],
                        "accession": fields[13]
                    },
                "function_gvs": fields[14],
                "hgvs":
                    {
                        "coding": fields[16],
                        "protein": fields[15]
                    },
                "coding_dna_size": fields[17],
                "conservation":
                    {
                        "phast_cons": fields[18],
                        "gerp": fields[19]
                    },
                "grantham_score": fields[20],
                "polyphen2":
                    {
                        "class": polyphen(fields[21])[0],
                        "score": polyphen(fields[21])[1]
                    },
                "ref_base_ncbi": fields[22],
                "chimp_allele": fields[23],
                "clinical_info": fields[24],
                "filter_status": fields[25],
                "on_illumina_human_exome_chip": fields[26],
                "gwas_pubmed_info": fields[27],
                "estimated_age_kyrs":
                    {
                        "ea": fields[28],
                        "aa": fields[29]
                    }
            }
        }
    # Drop placeholder values and normalize scalar types before indexing.
    return dict_sweep(value_convert(one_snp_json), vals=["NA", "none", "unknown"])