Example #1
def load_data(data_folder):

    input_file = os.path.join(data_folder, "phewas-catalog.csv")
    assert os.path.exists(input_file), "Can't find input file '%s'" % input_file
    with open_anyfile(input_file) as in_f:

        # Remove duplicated lines if any
        header = next(in_f).strip().split(',')
        header = [_item[1:-1] for _item in header]
        lines = set(in_f)
        reader = DictReader(lines, fieldnames=header, delimiter=',')

        results = defaultdict(list)
        for row in reader:
            variant = {"associations": {"phenotype": {}}, "variant": {}}
            assert re.match(r"^rs\d+$", row["snp"]) is not None
            variant["variant"]["rsid"] = row["snp"]
            variant["associations"]["phenotype"]["name"] = row["phewas phenotype"]
            variant["associations"]["cases"] = row["cases"]

            variant["associations"]["pval"] = float(row["p-value"])
            variant["associations"]["odds-ratio"] = row["odds-ratio"]
            variant["associations"]["phenotype"]["phewas_code"] = row["phewas code"]
            variant["variant"]["gene"] = row["gene_name"]
            variant["variant"]["gwas_associations"] = row["gwas-associations"].split(',')
            pos_info = row["chromosome"].split(' ')
            if len(pos_info) == 2:
                variant["variant"]["chrom"], variant["variant"]["pos"] = pos_info
            else:
                variant["variant"]["chrom"] = pos_info[0]
            results[variant["variant"]["rsid"]].append(variant)
        # Merge duplicate records that share the same rsid
        rsid_list = list(results.keys())
        hgvs_rsid_dict = batch_query_hgvs_from_rsid(rsid_list)
        for k, v in results.items():
            if k in hgvs_rsid_dict and hgvs_rsid_dict[k]:
                if len(v) == 1:
                    doc = {'_id': hgvs_rsid_dict[k],
                           'phewas': v[0]["variant"]}
                    doc["phewas"]["associations"] = v[0]["associations"]
                    yield dict_sweep(unlist(value_convert_to_number(doc, skipped_keys=['chrom'])), vals=[[], {}, None, '', 'NULL'])
                else:
                    doc = {'_id': hgvs_rsid_dict[k],
                           'phewas': v[0]["variant"]}
                    doc["phewas"]["associations"] = []
                    for _item in v:
                        doc["phewas"]["associations"].append(_item["associations"])
                    yield dict_sweep(unlist(value_convert_to_number(doc, skipped_keys=['chrom'])), vals=[[], {}, None, '', 'NULL'])
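All of the loaders in these examples finish with the same cleanup chain: value_convert_to_number casts numeric-looking strings (fields listed in skipped_keys are left untouched), unlist collapses single-element lists to scalars, and dict_sweep drops keys whose values match the given placeholders. A minimal sketch of that chain on a made-up document, assuming the helpers come from biothings.utils.dataload (their usual home in the BioThings SDK):

from biothings.utils.dataload import dict_sweep, unlist, value_convert_to_number

raw_doc = {
    "_id": "chr1:g.12345A>G",      # hypothetical HGVS id
    "phewas": {
        "pval": "0.0001",          # numeric string -> float
        "odds-ratio": ["1.3"],     # single-element list -> scalar
        "gwas_associations": [],   # empty value -> swept away
        "gene": "NULL",            # placeholder -> swept away
        "chrom": "1",              # kept as a string via skipped_keys
    },
}

doc = dict_sweep(
    unlist(value_convert_to_number(raw_doc, skipped_keys=["chrom"])),
    vals=[[], {}, None, "", "NULL"],
)
# doc["phewas"] should now contain only pval, odds-ratio and chrom,
# with the numeric strings converted to numbers.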
Example #2
def restructure_dict(dictionary):
    restr_dict = dict()
    _flag = 0
    for key in list(dictionary):  # this is for 1
        if key == 'molecule_chembl_id':
            restr_dict['_id'] = dictionary[key]
        if key == 'molecule_structures' and type(
                dictionary['molecule_structures']) == dict:
            restr_dict['chembl'] = dictionary
            _flag = 1
            for x, y in iter(dictionary['molecule_structures'].items()):
                if x == 'standard_inchi_key':
                    restr_dict['chembl'].update(dictionary)
                    restr_dict['chembl'].update({'inchi_key': y})
                if x == 'canonical_smiles':
                    restr_dict['chembl']['smiles'] = y
                if x == 'standard_inchi':
                    restr_dict['chembl']['inchi'] = y

    if _flag == 0:
        restr_dict['chembl'] = dictionary
    del restr_dict['chembl']['molecule_structures']
    restr_dict = unlist(restr_dict)
    restr_dict = dict_sweep(restr_dict,
                            vals=[
                                None, ".", "-", "", "NA", "None", "none", " ",
                                "Not Available", "unknown", "null"
                            ])
    restr_dict = value_convert_to_number(
        restr_dict, skipped_keys=["chebi_par_id", "first_approval"])
    restr_dict = boolean_convert(restr_dict, [
        "topical", "oral", "parenteral", "dosed_ingredient", "polymer_flag",
        "therapeutic_flag", "med_chem_friendly", "molecule_properties.ro3_pass"
    ])
    return restr_dict
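One way to see what restructure_dict produces is to call it on a trimmed-down, hypothetical ChEMBL-like record (the values below are made up; real molecule records carry many more fields):

molecule = {
    "molecule_chembl_id": "CHEMBL25",
    "first_approval": "1899",      # left as a string because of skipped_keys
    "molecule_structures": {
        "standard_inchi_key": "BSYNRYMUTXBXSQ-UHFFFAOYSA-N",
        "canonical_smiles": "CC(=O)Oc1ccccc1C(=O)O",
        "standard_inchi": "InChI=1S/C9H8O4/...",
    },
}

doc = restructure_dict(molecule)
# doc["_id"] == "CHEMBL25"; doc["chembl"] now carries inchi_key, smiles and
# inchi in place of the original molecule_structures block.

Note that restructure_dict reuses the input mapping as the "chembl" sub-document, so the original molecule dict is modified in place (molecule_structures is deleted from it).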
Example #3
def _map_line_to_json(fields):
    vid = fields[0].split(":")
    chrom = re.search(r'[1-9]+', vid[0]).group()

    if chrom == '23':
        chrom = chrom.replace('23', 'X')
    HGVS = "chr%s:%s" % (chrom, vid[1])
    # load as json data
    if HGVS is None:
        return

    one_snp_json = {
        "_id": HGVS,
        "emv": {
            "gene": fields[2],
            "variant_id": fields[3],
            "exon": fields[4],
            "egl_variant": fields[5],
            "egl_protein": fields[6],
            "egl_classification": fields[7],
            "egl_classification_date": fields[8],
            "hgvs": fields[9].split(" | "),
            "clinvar_rcv": fields[10],
        }
    }

    return unlist(dict_sweep(value_convert_to_number(one_snp_json), vals=[""]))
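The mapper above expects one already-split row of the source file per call; a hedged sketch of a driver for it (the file name, delimiter and header handling are assumptions, not part of the original parser):

import csv

def load_data(input_file="emv_classifications.csv"):  # hypothetical file name
    with open(input_file) as in_f:
        reader = csv.reader(in_f)
        next(reader)                      # skip the header row
        for fields in reader:
            doc = _map_line_to_json(fields)
            if doc:
                yield doc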
def restructure_dict(dictionary):
    restr_dict = dict()
    restr_dict['_id'] = dictionary['ChEBI ID']
    restr_dict['chebi']= dictionary
    restr_dict['chebi'] = clean_up(restr_dict['chebi'])
    restr_dict = dict_sweep(restr_dict,vals=[None,".", "-", "", "NA", "none", " ", "Not Available",
        "unknown","null","None","NaN"])
    restr_dict = value_convert_to_number(unlist(restr_dict),skipped_keys=["cid","sid",
        "beilstein","pubmed","sabio_rk","gmelin","molbase", "synonyms", "wikipedia","url_stub"])
    return restr_dict
    def reformat(cls, dictionary):
        ret_dict = dict()
        _flag = 0
        for key in list(dictionary):
            if key == 'molecule_chembl_id':
                ret_dict['_id'] = dictionary[key]
            if key == 'molecule_structures' and type(
                    dictionary['molecule_structures']) == dict:
                ret_dict['chembl'] = dictionary
                _flag = 1
                for x, y in iter(dictionary['molecule_structures'].items()):
                    if x == 'standard_inchi_key':
                        ret_dict['chembl'].update(dictionary)
                        ret_dict['chembl'].update({'inchi_key': y})
                    if x == 'canonical_smiles':
                        ret_dict['chembl']['smiles'] = y
                    if x == 'standard_inchi':
                        ret_dict['chembl']['inchi'] = y

        if _flag == 0:
            ret_dict['chembl'] = dictionary
        if 'cross_references' in ret_dict['chembl'] and ret_dict['chembl'][
                'cross_references']:
            ret_dict['chembl'][
                'xrefs'] = MoleculeCrossReferenceListTransformer.transform_to_dict(
                    ret_dict['chembl']['cross_references'])

        del ret_dict['chembl']['molecule_structures']
        del ret_dict['chembl']['cross_references']

        ret_dict = unlist(ret_dict)

        # Add "CHEBI:" prefix, standardize the way representing CHEBI IDs
        if 'chebi_par_id' in ret_dict['chembl'] and ret_dict['chembl'][
                'chebi_par_id']:
            ret_dict['chembl']['chebi_par_id'] = 'CHEBI:' + str(
                ret_dict['chembl']['chebi_par_id'])
        else:
            # clean up: chebi_par_id could be None
            ret_dict['chembl'].pop("chebi_par_id", None)

        ret_dict = dict_sweep(ret_dict,
                              vals=[
                                  None, ".", "-", "", "NA", "None", "none",
                                  " ", "Not Available", "unknown", "null"
                              ])
        ret_dict = value_convert_to_number(
            ret_dict, skipped_keys=["chebi_par_id", "first_approval"])
        ret_dict = boolean_convert(ret_dict, [
            "topical", "oral", "parenteral", "dosed_ingredient",
            "polymer_flag", "therapeutic_flag", "med_chem_friendly",
            "molecule_properties.ro3_pass"
        ])
        return ret_dict
Example #7
    def parse(self, record: vcf.model._Record, doc_key: str):
        """
            When parsing gnomad.genomes.*.vcf.bgz files, `doc_key` should be "gnomad_genome";
            when parsing gnomad.exomes.*.vcf.bgz files, `doc_key` should be "gnomad_exome".

            The returned document has the following structure:

                one_snp_json = {
                    "_id": hgvs_id,
                    doc_key: {
                        "chrom": chrom,
                        ...
                    }
                }
            """
        # CHROM values in the hg38 gnomAD source files start with 'chr'; strip that prefix first
        if record.CHROM.startswith('chr'):
            record.CHROM = record.CHROM[3:]  # required by the `profile_parser.parse()` method below
        if record.CHROM not in CHROM_VALID_VALUES:
            return

        info = record.INFO

        for key in ["AC", "AF", "nhomalt"]:
            if key in info:
                assert len(record.ALT) == len(info[key]), \
                    "length of record.ALT != length of info.%s, at CHROM=%s, POS=%s" % (key, record.CHROM, record.POS)

        profile_list = self.profile_parser.parse(record)
        site_quality_metrics_dict = self.site_quality_metrics_parser.parse(info)

        for i in range(len(record.ALT)):
            hgvs_id, profile_dict = profile_list[i]
            if hgvs_id is None:
                continue

            population_frequency_dict = self.population_frequency_parser.parse(info, i)

            one_snp_json = {
                "_id": hgvs_id,
                doc_key: {
                    **profile_dict,
                    **site_quality_metrics_dict,
                    **population_frequency_dict
                }
            }

            obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json, skipped_keys=['chrom'])), [None]))
            yield obj
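The parse() method above works on PyVCF records; a minimal driver sketch, assuming a PyVCF reader and some parser object that owns the method (the enclosing class is not shown in the example):

import vcf  # PyVCF

def load_genome_docs(vcf_path, parser):
    # `parser` is whatever object exposes the parse() method above
    reader = vcf.Reader(filename=vcf_path, compressed=True)
    for record in reader:
        # parse() yields zero or more documents per VCF record (one per ALT allele)
        yield from parser.parse(record, doc_key="gnomad_genome")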
Example #8
def _map_line_to_json(fields):
    """Mapping each lines in csv file into JSON doc
    """
    one_snp_json = {
        "gene": fields[1],
        "variant_id": fields[2],
        "exon": fields[3],
        "egl_variant": fields[4],
        "egl_protein": fields[5],
        "egl_classification": fields[6],
        "egl_classification_date": fields[7],
        "hgvs": fields[8].split(" | ")
    }

    return unlist(dict_sweep(value_convert_to_number(one_snp_json), vals=[""]))
Example #9
def load_data(assembly, input_file, chrom):
    import logging as loggingmod
    global logging
    logging = loggingmod.getLogger("dbsnp_upload")
    logging.info("Processing chr{}...".format(chrom))
    snpdoc_iter = parse_vcf(assembly,
                            input_file,
                            compressed=True,
                            verbose=False,
                            by_id=True,
                            reference=chrom)
    for doc in snpdoc_iter:
        _doc = {'dbsnp': doc}
        _doc['_id'] = doc['_id']
        del doc['_id']
        yield (dict_sweep(unlist(value_convert_to_number(_doc)), [None]))
Example #10
def restr_dict(_dict, row):
    _d = {}
    _d.update({'stitch': {'flat': row[1], 'stereo': row[2]}})
    _d.update({
        'side_effect': {
            'name': row[10],
            'placebo': bool(row[4]),
            'frequency': row[5]
        }
    })
    _d.update({'meddra': {'type': row[8], 'umls_id': row[9]}})
    _d.update(
        {'indication': {
            'method_of_detection': row[11],
            'name': row[12]
        }})
    _d = dict_sweep(value_convert_to_number(_d))
    return _d
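restr_dict works on one already-split row; a hedged driver sketch (the SIDER-style file name and the tab delimiter are assumptions):

def load_rows(input_file="meddra_all_se.tsv"):  # hypothetical file name
    with open(input_file) as in_f:
        for line in in_f:
            row = line.rstrip("\n").split("\t")
            yield restr_dict({}, row)           # the first argument is unused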
def parse_one_rec(assembly, record):
    """Restructure JSON
    """
    doc = {"alleles": [], "gene": [],
           assembly: {},
           "vartype": record.get("primary_snapshot_data").get("variant_type"),
           "rsid": "rs" + str(record.get("refsnp_id")),
           "dbsnp_build": int(record.get("last_update_build_id")),
           "dbsnp_merges": restructure_dbsnp_merge(record.get("dbsnp1_merges")),
           "citations": record.get("citations")}
    data = record.get('primary_snapshot_data')
    hgvs_vcf_info = get_hgvs_and_vcf(assembly,
                                     data.get("placements_with_allele"))
    allele_annotations = data.get('allele_annotations')
    allele_annotations = list(allele_annotations)
    doc["alleles"] = restructure_allele_freq_info(allele_annotations)
    doc['gene'] = restructure_gene_info(allele_annotations)
    for _item in hgvs_vcf_info:
        hgvs, vcf = _item
        if vcf:
            doc["chrom"], pos, doc["ref"], doc["alt"] = vcf
            doc["chrom"] = str(doc["chrom"])
            if doc["chrom"] == "23":
                doc["chrom"] = "X"
            elif doc["chrom"] == "24":
                doc["chrom"] = "Y"
            doc[assembly] = {}
            try:
                if doc["vartype"] != "snv":
                    ref = "T" + doc["ref"]
                    alt = "T" + doc["alt"]
                else:
                    ref = doc["ref"]
                    alt = doc["alt"]
                if doc["vartype"] in ["ins", "del", "delins"]:
                    doc[assembly]['start'], doc[assembly]['end'] = get_pos_start_end(doc["chrom"], pos - 1, ref, alt)
                else:
                    doc[assembly]['start'], doc[assembly]['end'] = get_pos_start_end(doc["chrom"], pos, ref, alt)
            except (ValueError, AssertionError):
                doc[assembly] = {}
        if hgvs:
            doc["_id"] = hgvs.replace('chr23', 'chrX').replace('chr24', 'chrY')
            yield dict_sweep(unlist(value_convert_to_number(doc, skipped_keys=['chrom', 'ref', 'alt', 'allele', 'deleted_sequence', 'inserted_sequence'])), vals=[[], {}, None])
def restructure_dict(dictionary):
    restr_dict = dict()
    restr_dict['_id'] = dictionary['ChEBI ID']
    restr_dict['chebi'] = dictionary
    restr_dict['chebi'] = clean_up(restr_dict['chebi'])
    restr_dict = dict_sweep(restr_dict,
                            vals=[
                                None, ".", "-", "", "NA", "none", " ",
                                "Not Available", "unknown", "null", "None",
                                "NaN"
                            ])
    restr_dict = value_convert_to_number(unlist(restr_dict),
                                         skipped_keys=[
                                             "cid", "sid", "beilstein",
                                             "pubmed", "sabio_rk", "gmelin",
                                             "molbase", "synonyms",
                                             "wikipedia", "url_stub"
                                         ])
    return restr_dict
Example #13
def _map_line_to_json(item):
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    hpo_count=item.INFO['HPO_CT']
    for alt in item.ALT:
        alt = str(alt)
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=True)
        if HGVS is None:
            return
        one_snp_json = {
            "_id": HGVS,
            "geno2mp": {
                "hpo_count": hpo_count,

            }
        }
        obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json)), [None]))
        yield obj
Example #14
def restructure_dict(dictionary):
    restr_dict = dict()
    restr_dict['_id'] = dictionary['ChEBI ID']
    restr_dict['chebi'] = dictionary
    restr_dict['chebi'] = clean_up(restr_dict['chebi'])
    restr_dict = dict_sweep(restr_dict,
                            vals=[
                                None, ".", "-", "", "NA", "none", " ",
                                "Not Available", "unknown", "null", "None",
                                "NaN"
                            ])
    restr_dict = value_convert_to_number(unlist(restr_dict),
                                         skipped_keys=[
                                             "beilstein_registry_numbers",
                                             "pubmed_citation_links",
                                             "sabio_rk_database_links",
                                             "gmelin_registry_numbers",
                                             "molbase_database_links"
                                         ])
    return restr_dict
def restructure_dict(dictionary):
    restr_dict = dict()
    _flag = 0
    for key in list(dictionary): # this is for 1
        if key == 'molecule_chembl_id':
            restr_dict['_id']=dictionary[key]
        if key == 'molecule_structures' and type(dictionary['molecule_structures'])==dict:
            restr_dict['chembl'] = dictionary
            _flag=1
            for x,y in iter(dictionary['molecule_structures'].items()):
                if x == 'standard_inchi_key':
                    restr_dict['chembl'].update(dictionary)
                    restr_dict['chembl'].update({'inchi_key':y})
                if x == 'canonical_smiles':
                    restr_dict['chembl']['smiles'] = y
                if x == 'standard_inchi':
                    restr_dict['chembl']['inchi'] = y

    if _flag == 0:
        restr_dict['chembl'] = dictionary
    if 'cross_references' in restr_dict['chembl'] and restr_dict['chembl']['cross_references']:
        restr_dict['chembl']['xrefs'] = restructure_xref(restr_dict['chembl']['cross_references'])

    del restr_dict['chembl']['molecule_structures']
    del restr_dict['chembl']['cross_references']
    restr_dict = unlist(restr_dict)
    # Add "CHEBI:" prefix, standardize the way representing CHEBI IDs
    if 'chebi_par_id' in restr_dict['chembl'] and restr_dict['chembl']['chebi_par_id']:
        restr_dict['chembl']['chebi_par_id'] = 'CHEBI:' + str(restr_dict['chembl']['chebi_par_id'])
    else:
        # clean up: chebi_par_id could be None
        restr_dict['chembl'].pop("chebi_par_id",None)

    restr_dict = dict_sweep(restr_dict, vals=[None,".", "-", "", "NA", "None","none", " ", "Not Available", "unknown","null"])
    restr_dict = value_convert_to_number(restr_dict, skipped_keys=["chebi_par_id","first_approval"])
    restr_dict = boolean_convert(restr_dict, ["topical","oral","parenteral","dosed_ingredient","polymer_flag",
        "therapeutic_flag","med_chem_friendly","molecule_properties.ro3_pass"])
    return restr_dict
def _map_line_to_json(doc_key, item):
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    # record.INFO is a dict; fall back to None when an annotation is missing
    baseqranksum = info.get('BaseQRankSum')
    clippingranksum = info.get('ClippingRankSum')
    mqranksum = info.get('MQRankSum')
    readposranksum = info.get('ReadPosRankSum')
    qd = info.get('QD')
    inbreedingcoeff = info.get('InbreedingCoeff')
    for i in range(0, len(item.ALT)):
        item.ALT[i] = str(item.ALT[i])
    for alt in item.ALT:
        alt = str(alt)
        (HGVS, var_type) = get_hgvs_from_vcf(chrom,
                                             chromStart,
                                             ref,
                                             alt,
                                             mutant_type=True)
        if HGVS is None:
            return
        one_snp_json = {
            "_id": HGVS,
            doc_key: {
                "chrom": chrom,
                "pos": chromStart,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {
                    "ac": info['AC'],
                    "ac_afr": info['AC_AFR'],
                    "ac_amr": info['AC_AMR'],
                    "ac_adj": info['AC_Adj'],
                    "ac_eas": info['AC_EAS'],
                    "ac_fin": info['AC_FIN'],
                    "ac_het": info['AC_Het'],
                    "ac_hom": info['AC_Hom'],
                    "ac_nfe": info['AC_NFE'],
                    "ac_oth": info['AC_OTH'],
                    "ac_sas": info['AC_SAS'],
                    "ac_female": info['AC_FEMALE'],
                    "ac_male": info['AC_MALE']
                },
                "af": info['AF'],
                "an": {
                    "an": info['AN'],
                    "an_afr": info['AN_AFR'],
                    "an_amr": info['AN_AMR'],
                    "an_adj": info['AN_Adj'],
                    "an_eas": info['AN_EAS'],
                    "an_fin": info['AN_FIN'],
                    "an_nfe": info['AN_NFE'],
                    "an_oth": info['AN_OTH'],
                    "an_sas": info['AN_SAS'],
                    "an_female": info['AN_FEMALE'],
                    "an_male": info['AN_MALE']
                },
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {
                    "het_afr": info['Het_AFR'],
                    "het_amr": info['Het_AMR'],
                    "het_eas": info['Het_EAS'],
                    "het_fin": info['Het_FIN'],
                    "het_nfe": info['Het_NFE'],
                    "het_oth": info['Het_OTH'],
                    "het_sas": info['Het_SAS']
                },
                "hom": {
                    "hom_afr": info['Hom_AFR'],
                    "hom_amr": info['Hom_AMR'],
                    "hom_eas": info['Hom_EAS'],
                    "hom_fin": info['Hom_FIN'],
                    "hom_nfe": info['Hom_NFE'],
                    "hom_oth": info['Hom_OTH'],
                    "hom_sas": info['Hom_SAS']
                },
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json)),
                          [None]))
        yield obj
Example #17
def parse_one_rec(assembly, record):
    """
    Parse a record from a 'refsnp-chr*.json.bz2' file into one or multiple documents.

    From https://ftp.ncbi.nlm.nih.gov/snp/latest_release/JSON/JSON_README.txt we know that each
    'refsnp-chr*.json.bz2' file conforms to the "refsnp_snapshot_success" OpenAPI schema, as defined in
    https://api.ncbi.nlm.nih.gov/variation/v0/var_service.yaml

    From the above schema, we can see that each record (i.e. each line of a 'refsnp-chr*.json.bz2' file) conforms
    to the "refsnp_snapshot" OpenAPI schema, which **requires** the following components:

    - "refsnp_id" (type: string, format: uint64),
    - "create_date" (type: string, format: ISO 8601),
    - "last_update_date" (type: string, format: ISO 8601),
    - "last_update_build_id" (type: string, format: ascii),
    - "dbsnp1_merges" (type: array),
    - "lost_obs_movements" (type: array),
    - "present_obs_movements" (type: array),
    - "citations" (type: array)

    Another component of interest is "primary_snapshot_data" (type: object), which is optional in
    "refsnp_snapshot". If present, it **requires** the following sub-components:

    - "placements_with_allele" (type: array),
    - "allele_annotations" (type: array),
    - "support" (type: array),
    - "anchor" (type: string, format: ascii),
    - "variant_type" (type: string, format: ascii)

    Plus, it's known that **none** of the fields defined in the above schema is "nullable".
    (See https://stackoverflow.com/questions/45575493/what-does-required-in-openapi-really-mean for more.)

    The requiredness, data types, and nullability of these components serve as a guideline for applying existence checks and
    type conversion to the corresponding fields in the output json objects.
    """
    """
    We can extract common fields from the input record, and for each "allele" in each "placement" from the record's 
    "primary_snapshot_data" component, we can extract some allele-specific fields. The generation of the output 
    documents can be described with the pseudocode below:
    
        common_fields = {...}
        
        for placement in placements:
            for allele in placement["alleles"]:
                allele_specific_fields = {...}
                doc = {
                    **common_fields,
                    **allele_specific_fields,
                }
            
                yield doc
    """
    snapshot = record.get("primary_snapshot_data")
    annotations = snapshot.get("allele_annotations")
    placements = snapshot.get("placements_with_allele")

    common_fields = {
        # fields parsed directly from `record`
        "rsid": "rs" + str(record.get("refsnp_id")),
        "dbsnp_build": int(record.get("last_update_build_id")),
        "dbsnp_merges": restructure_dbsnp_merge(record.get("dbsnp1_merges")),
        "citations": record.get("citations"),

        # fields parsed from `record["primary_snapshot_data"]`
        "vartype": snapshot.get("variant_type"),

        # fields parsed from `record["primary_snapshot_data"]["allele_annotations"]`
        "alleles": restructure_allele_freq_info(annotations),
        "gene": restructure_gene_info(annotations)
    }

    variant_type = common_fields["vartype"]

    # fields parsed from `record["primary_snapshot_data"]["placements_with_allele"]
    for hgvs, vcf in get_hgvs_and_vcf(assembly, placements):
        chrom, pos, ref, alt = vcf

        start, end = get_start_end(variant_type, chrom, pos, ref, alt)
        if start is None and end is None:
            coordinates = {}
        else:  # we can infer from `get_pos_start_end` that in this case neither `start` nor `end` can be None
            coordinates = {"start": start, "end": end}

        allele_specific_fields = {
            "_id": hgvs,
            "chrom": chrom,
            "ref": ref,
            "alt": alt,
            assembly: coordinates
        }

        doc = {**common_fields, **allele_specific_fields}
        yield dict_sweep(unlist(
            value_convert_to_number(doc,
                                    skipped_keys=[
                                        'chrom', 'ref', 'alt', 'allele',
                                        'deleted_sequence', 'inserted_sequence'
                                    ])),
                         vals=[[], {}, None])
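parse_one_rec consumes one decoded JSON record per call; a hedged sketch of the surrounding loader for the 'refsnp-chr*.json.bz2' files described in the docstring (file name and assembly value are assumptions):

import bz2
import json

def load_data(input_file="refsnp-chr1.json.bz2", assembly="hg19"):
    with bz2.open(input_file, "rt") as in_f:
        for line in in_f:                      # one JSON record per line
            record = json.loads(line)
            yield from parse_one_rec(assembly, record)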
def _map_line_to_json(fields):
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[13]
    chromStart = fields[14]
    chromEnd = fields[15]

    HGVS = None
    cds = fields[18].split(":")
    cds = cds[1]
    replace = re.findall(r'[ATCGMNYR=]+', cds)
    sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', cds)
    ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', cds)
    delete = fields[1] == 'deletion'
    indel = fields[1] == 'indel'
    dup = re.search(r'dup', cds)
    inv = re.search(r'inv|inv[0-9]+|inv[ATCGMNHYR]+', cds)
    if ins:
        delete = None
        indel = None
    elif delete:
        ins = None
        indel = None
    # query the VCF file by chrom and chromStart to obtain REF and ALT
    if chromStart:
        record = vcf_reader.fetch(chrom, int(chromStart))
    else:
        record = None
    if record:
        REF = record.REF
        ALT = record.ALT
        ALT = ALT[0]
        if record.is_snp and len(ALT) < 2:
            mod = [REF, ALT]
        else:
            mod = ALT
    else:
        return

    if sub and record.is_snp:
        HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, mod[0], mod[1])
    elif ins:
        HGVS = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, mod)
    elif delete:
        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
    elif indel:
        try:
            HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, mod)
        except AttributeError:
            print "ERROR:", fields[1], cds
    elif dup:
        HGVS = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, mod)
    elif inv:
        HGVS = "chr%s:g.%s_%sinv%s" % (chrom, chromStart, chromEnd, mod)
    elif replace:
        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, mod)
    else:
        print('ERROR:', fields[1], cds)

    # load as json data
    if HGVS is None:
        print('None:', fields[1], cds)
        return None

    one_snp_json = {
        "_id": HGVS,
        "clinvar": {
            "allele_id": fields[0],
            "hg19": {
                "chr": fields[13],
                "start": fields[14],
                "end": fields[15]
            },
            "type": fields[1],
            "name": fields[2],
            "gene": {
                "id": fields[3],
                "symbol": fields[4]
            },
            "clinical_significance": fields[5].split(";"),
            "rsid": 'rs' + str(fields[6]),
            "nsv_dbvar": fields[7],
            "rcv_accession": fields[8].split(";"),
            "tested_in_gtr": fields[9],
            "phenotype_id": other_id(fields[10]),
            "origin": fields[11],
            "cytogenic": fields[16],
            "review_status": fields[17],
            "hgvs": {
                "coding": fields[18],
                "protein": fields[19]
            },
            "number_submitters": fields[20],
            "last_evaluated": fields[21],
            "guidelines": fields[22],
            "other_ids": other_id(fields[23]),
            "clinvar_id": fields[24]
        }
    }
    return dict_sweep(unlist(value_convert_to_number(one_snp_json)),
                      vals=["-"])
Example #19
def _map_line_to_json(fields, version):
    chrInfo = fields[0].split(":")  # grch37
    chrom = chrInfo[0]
    chromStart = int(chrInfo[1])
    ma_fin_percent = fields[7].split("/")
    if fields[3]:
        mutation = fields[3].split(">")
        ref = mutation[0]
        alt = mutation[1]
        hg19 = get_pos_start_end(chrom, chromStart, ref, alt)
        hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref, alt)
        if version == 'hg19':
            HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)
        elif version == 'hg38':
            HGVS = get_hgvs_from_vcf(chrom, hg38[0], ref, alt)

    # load as json data
    if HGVS is None:
        return

    one_snp_json = {
        "_id": HGVS,
        "evs":
            {
                "chrom": str(chrom),
                "hg19":
                    {
                        "start": hg19[0],
                        "end": hg19[1]
                    },
                "hg38":
                    {
                        "start": hg38[0],
                        "end": hg38[1]
                    },
                "rsid": fields[1],
                "dbsnp_version": get_dbsnp(fields[2]),
                "ref": ref,
                "alt": alt,
                "allele_count":
                    {
                        "european_american": count_dict(fields[4]),
                        "african_american": count_dict(fields[5]),
                        "all": count_dict(fields[6])
                    },
                "ma_fin_percent":
                    {
                        "european_american": ma_fin_percent[0],
                        "african_american": ma_fin_percent[1],
                        "all": ma_fin_percent[2]
                    },
                "genotype_count":
                    {
                        "european_american": count_dict(fields[8]),
                        "african_american": count_dict(fields[9]),
                        "all_genotype": count_dict(fields[10])
                    },
                "avg_sample_read": fields[11],
                "gene":
                    {
                        "symbol": fields[12],
                        "accession": fields[13]
                    },
                "function_gvs": fields[14],
                "hgvs":
                    {
                        "coding": fields[16],
                        "protein": fields[15]
                    },
                "coding_dna_size": fields[17],
                "conservation":
                    {
                        "phast_cons": fields[18],
                        "gerp": fields[19]
                    },
                "grantham_score": fields[20],
                "polyphen2":
                    {
                        "class": polyphen(fields[21])[0],
                        "score": polyphen(fields[21])[1]
                    },
                "ref_base_ncbi": fields[22],
                "chimp_allele": fields[23],
                "clinical_info": fields[24],
                "filter_status": fields[25],
                "on_illumina_human_exome_chip": fields[26],
                "gwas_pubmed_info": fields[27],
                "estimated_age_kyrs":
                    {
                        "ea": fields[28],
                        "aa": fields[29]
                    }
            }
        }
    return dict_sweep(value_convert_to_number(one_snp_json), vals=["NA", "none", "unknown"])
Example #20
def _map_line_to_json(fields, version):
    chrInfo = fields[0].split(":")  # grch37
    chrom = chrInfo[0]
    chromStart = int(chrInfo[1])
    ma_fin_percent = fields[7].split("/")
    if fields[3]:
        mutation = fields[3].split(">")
        ref = mutation[0]
        alt = mutation[1]
        hg19 = get_pos_start_end(chrom, chromStart, ref, alt)
        hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref,
                                 alt)
        if version == 'hg19':
            HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)
        elif version == 'hg38':
            HGVS = get_hgvs_from_vcf(chrom, hg38[0], ref, alt)

    # load as json data
    if HGVS is None:
        return

    one_snp_json = {
        "_id": HGVS,
        "evs": {
            "chrom": str(chrom),
            "hg19": {
                "start": hg19[0],
                "end": hg19[1]
            },
            "hg38": {
                "start": hg38[0],
                "end": hg38[1]
            },
            "rsid": fields[1],
            "dbsnp_version": get_dbsnp(fields[2]),
            "ref": ref,
            "alt": alt,
            "allele_count": {
                "european_american": count_dict(fields[4]),
                "african_american": count_dict(fields[5]),
                "all": count_dict(fields[6])
            },
            "ma_fin_percent": {
                "european_american": ma_fin_percent[0],
                "african_american": ma_fin_percent[1],
                "all": ma_fin_percent[2]
            },
            "genotype_count": {
                "european_american": count_dict(fields[8]),
                "african_american": count_dict(fields[9]),
                "all_genotype": count_dict(fields[10])
            },
            "avg_sample_read": fields[11],
            "gene": {
                "symbol": fields[12],
                "accession": fields[13]
            },
            "function_gvs": fields[14],
            "hgvs": {
                "coding": fields[16],
                "protein": fields[15]
            },
            "coding_dna_size": fields[17],
            "conservation": {
                "phast_cons": fields[18],
                "gerp": fields[19]
            },
            "grantham_score": fields[20],
            "polyphen2": {
                "class": polyphen(fields[21])[0],
                "score": polyphen(fields[21])[1]
            },
            "ref_base_ncbi": fields[22],
            "chimp_allele": fields[23],
            "clinical_info": fields[24],
            "filter_status": fields[25],
            "on_illumina_human_exome_chip": fields[26],
            "gwas_pubmed_info": fields[27],
            "estimated_age_kyrs": {
                "ea": fields[28],
                "aa": fields[29]
            }
        }
    }
    return dict_sweep(value_convert_to_number(one_snp_json),
                      vals=["NA", "none", "unknown"])
Example #21
def _map_line_to_json(df, version, include_gnomad, index=0):
    # specific variable treatment
    chrom = df["#chr"]
    if chrom == 'M':
        chrom = 'MT'
    # fields[7] in version 2 represents hg18_pos
    hg18_end = df["hg18_pos(1-based)"]
    if hg18_end != ".":
        hg18_end = int(hg18_end)
    # if no hg19 position is provided, skip this item
    if df["hg19_pos(1-based)"] == '.':
        return None
    else:
        chromStart = int(df["hg19_pos(1-based)"])
        chromEnd = chromStart
    chromStart_38 = int(df["pos(1-based)"])
    ref = df["ref"].upper()
    alt = df["alt"].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    siphy_29way_pi = df["SiPhy_29way_pi"]
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    gtex_gene = df["GTEx_V8_gene"].split('|')
    gtex_tissue = df["GTEx_V8_tissue"].split('|')
    gtex = map(
        dict,
        map(lambda t: zip(('gene', 'tissue'), t), zip(gtex_gene, gtex_tissue)))
    acc = df["Uniprot_acc"].rstrip().rstrip(';').split(";")
    entry = df["Uniprot_entry"].rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'entry'), t),
                            zip(acc, entry)))
    provean_score = df["PROVEAN_score"].split(';')
    sift_score = df["SIFT_score"].split(';')
    sift4g_score = df["SIFT4G_score"].split(';')
    hdiv_score = df["Polyphen2_HDIV_score"].split(';')
    hvar_score = df["Polyphen2_HVAR_score"].split(';')
    lrt_score = df["LRT_score"].split(';')
    m_cap_score = df["M-CAP_score"].split(';')
    mutationtaster_score = df["MutationTaster_score"].split(';')
    mutationassessor_score = df["MutationAssessor_score"].split(';')
    vest3_score = df["VEST4_score"].split(';')
    metasvm_score = df["MetaSVM_score"].split(';')
    fathmm_score = df["FATHMM_score"].split(';')
    metalr_score = df["MetaLR_score"].split(';')
    revel_score = df["REVEL_score"].split(';')
    appris = df["APPRIS"].split(";")
    mpc_score = df["MPC_score"].split(';')
    mvp_score = df["MVP_score"].split(';')
    tsl = df["TSL"].split(';')
    vep_canonical = df["VEP_canonical"].split(';')
    deogen2_score = df["DEOGEN2_score"].split(';')
    # parse MutPred top 5 features
    def modify_pvalue(pvalue):
        return float(pvalue.strip('P = '))

    mutpred_mechanisms = df["MutPred_Top5features"]
    if mutpred_mechanisms not in ['.', ',', '-']:
        mutpred_mechanisms = mutpred_mechanisms.split(";")
        mutpred_mechanisms = [m.rstrip(")") for m in mutpred_mechanisms]
        mutpred_mechanisms = [i.split(" (") for i in mutpred_mechanisms]
        mutpred_mechanisms = sum(mutpred_mechanisms, [])
        mechanisms = [{
            "mechanism": mutpred_mechanisms[0],
            "p_val": modify_pvalue(mutpred_mechanisms[1])
        }, {
            "mechanism": mutpred_mechanisms[2],
            "p_val": modify_pvalue(mutpred_mechanisms[3])
        }, {
            "mechanism": mutpred_mechanisms[4],
            "p_val": modify_pvalue(mutpred_mechanisms[5])
        }, {
            "mechanism": mutpred_mechanisms[6],
            "p_val": modify_pvalue(mutpred_mechanisms[7])
        }, {
            "mechanism": mutpred_mechanisms[8],
            "p_val": modify_pvalue(mutpred_mechanisms[9])
        }]
    else:
        mechanisms = '.'

    # normalize scores

    def norm(arr):
        return [None if item == '.' else item for item in arr]

    provean_score = norm(provean_score)
    sift_score = norm(sift_score)
    hdiv_score = norm(hdiv_score)
    hvar_score = norm(hvar_score)
    lrt_score = norm(lrt_score)
    m_cap_score = norm(m_cap_score)
    mutationtaster_score = norm(mutationtaster_score)
    mutationassessor_score = norm(mutationassessor_score)
    vest3_score = norm(vest3_score)
    metasvm_score = norm(metasvm_score)
    fathmm_score = norm(fathmm_score)
    metalr_score = norm(metalr_score)
    revel_score = norm(revel_score)

    gnomad = {
        "gnomad_exomes": {
            "flag": df["gnomAD_exomes_flag"],
            "nhomalt": df["gnomAD_exomes_nhomalt"],
            "ac": df["gnomAD_exomes_AC"],
            "an": df["gnomAD_exomes_AN"],
            "af": df["gnomAD_exomes_AF"],
            "nhomalt": df["gnomAD_exomes_nhomalt"],
            "afr_ac": df["gnomAD_exomes_AFR_AC"],
            "afr_af": df["gnomAD_exomes_AFR_AF"],
            "afr_an": df["gnomAD_exomes_AFR_AN"],
            "afr_nhomalt": df["gnomAD_exomes_AFR_nhomalt"],
            "amr_ac": df["gnomAD_exomes_AMR_AC"],
            "amr_an": df["gnomAD_exomes_AMR_AN"],
            "amr_af": df["gnomAD_exomes_AMR_AF"],
            "amr_nhomalt": df["gnomAD_exomes_AMR_nhomalt"],
            "asj_ac": df["gnomAD_exomes_ASJ_AC"],
            "asj_an": df["gnomAD_exomes_ASJ_AN"],
            "asj_af": df["gnomAD_exomes_ASJ_AF"],
            "asj_nhomalt": df["gnomAD_exomes_ASJ_nhomalt"],
            "eas_ac": df["gnomAD_exomes_EAS_AC"],
            "eas_af": df["gnomAD_exomes_EAS_AF"],
            "eas_an": df["gnomAD_exomes_EAS_AN"],
            "eas_nhomalt": df["gnomAD_exomes_EAS_nhomalt"],
            "fin_ac": df["gnomAD_exomes_FIN_AC"],
            "fin_af": df["gnomAD_exomes_FIN_AF"],
            "fin_an": df["gnomAD_exomes_FIN_AN"],
            "fin_nhomalt": df["gnomAD_exomes_FIN_nhomalt"],
            "nfe_ac": df["gnomAD_exomes_NFE_AC"],
            "nfe_af": df["gnomAD_exomes_NFE_AF"],
            "nfe_an": df["gnomAD_exomes_NFE_AN"],
            "nfe_nhomalt": df["gnomAD_exomes_NFE_nhomalt"],
            "sas_ac": df["gnomAD_exomes_SAS_AC"],
            "sas_af": df["gnomAD_exomes_SAS_AF"],
            "sas_an": df["gnomAD_exomes_SAS_AN"],
            "sas_nhomalt": df["gnomAD_exomes_SAS_nhomalt"],
            "popmax_ac": df["gnomAD_exomes_POPMAX_AC"],
            "popmax_af": df["gnomAD_exomes_POPMAX_AF"],
            "popmax_an": df["gnomAD_exomes_POPMAX_AN"],
            "popmax_nhomalt": df["gnomAD_exomes_POPMAX_nhomalt"]
        },
        "gnomad_exomes_controls": {
            "nhomalt": df["gnomAD_exomes_controls_nhomalt"],
            "ac": df["gnomAD_exomes_controls_AC"],
            "an": df["gnomAD_exomes_controls_AN"],
            "af": df["gnomAD_exomes_controls_AF"],
            "nhomalt": df["gnomAD_exomes_controls_nhomalt"],
            "afr_ac": df["gnomAD_exomes_controls_AFR_AC"],
            "afr_af": df["gnomAD_exomes_controls_AFR_AF"],
            "afr_an": df["gnomAD_exomes_controls_AFR_AN"],
            "afr_nhomalt": df["gnomAD_exomes_controls_AFR_nhomalt"],
            "amr_ac": df["gnomAD_exomes_controls_AMR_AC"],
            "amr_an": df["gnomAD_exomes_controls_AMR_AN"],
            "amr_af": df["gnomAD_exomes_controls_AMR_AF"],
            "amr_nhomalt": df["gnomAD_exomes_controls_AMR_nhomalt"],
            "asj_ac": df["gnomAD_exomes_controls_ASJ_AC"],
            "asj_an": df["gnomAD_exomes_controls_ASJ_AN"],
            "asj_af": df["gnomAD_exomes_controls_ASJ_AF"],
            "asj_nhomalt": df["gnomAD_exomes_controls_ASJ_nhomalt"],
            "eas_ac": df["gnomAD_exomes_controls_EAS_AC"],
            "eas_af": df["gnomAD_exomes_controls_EAS_AF"],
            "eas_an": df["gnomAD_exomes_controls_EAS_AN"],
            "eas_nhomalt": df["gnomAD_exomes_controls_EAS_nhomalt"],
            "fin_ac": df["gnomAD_exomes_controls_FIN_AC"],
            "fin_af": df["gnomAD_exomes_controls_FIN_AF"],
            "fin_an": df["gnomAD_exomes_controls_FIN_AN"],
            "fin_nhomalt": df["gnomAD_exomes_controls_FIN_nhomalt"],
            "nfe_ac": df["gnomAD_exomes_controls_NFE_AC"],
            "nfe_af": df["gnomAD_exomes_controls_NFE_AF"],
            "nfe_an": df["gnomAD_exomes_controls_NFE_AN"],
            "nfe_nhomalt": df["gnomAD_exomes_controls_NFE_nhomalt"],
            "sas_ac": df["gnomAD_exomes_controls_SAS_AC"],
            "sas_af": df["gnomAD_exomes_controls_SAS_AF"],
            "sas_an": df["gnomAD_exomes_controls_SAS_AN"],
            "sas_nhomalt": df["gnomAD_exomes_controls_SAS_nhomalt"],
            "popmax_ac": df["gnomAD_exomes_controls_POPMAX_AC"],
            "popmax_af": df["gnomAD_exomes_controls_POPMAX_AF"],
            "popmax_an": df["gnomAD_exomes_controls_POPMAX_AN"],
            "popmax_nhomalt": df["gnomAD_exomes_controls_POPMAX_nhomalt"]
        },
        "gnomad_genomes": {
            "flag": df["gnomAD_genomes_flag"],
            "nhomalt": df["gnomAD_genomes_nhomalt"],
            "ac": df["gnomAD_genomes_AC"],
            "an": df["gnomAD_genomes_AN"],
            "af": df["gnomAD_genomes_AF"],
            "nhomalt": df["gnomAD_genomes_nhomalt"],
            "afr_ac": df["gnomAD_genomes_AFR_AC"],
            "afr_af": df["gnomAD_genomes_AFR_AF"],
            "afr_an": df["gnomAD_genomes_AFR_AN"],
            "afr_nhomalt": df["gnomAD_genomes_AFR_nhomalt"],
            "ami_ac": df["gnomAD_genomes_AMI_AC"],
            "ami_an": df["gnomAD_genomes_AMI_AN"],
            "ami_af": df["gnomAD_genomes_AMI_AF"],
            "ami_nhomalt": df["gnomAD_genomes_AMI_nhomalt"],
            "amr_ac": df["gnomAD_genomes_AMR_AC"],
            "amr_an": df["gnomAD_genomes_AMR_AN"],
            "amr_af": df["gnomAD_genomes_AMR_AF"],
            "amr_nhomalt": df["gnomAD_genomes_AMR_nhomalt"],
            "asj_ac": df["gnomAD_genomes_ASJ_AC"],
            "asj_an": df["gnomAD_genomes_ASJ_AN"],
            "asj_af": df["gnomAD_genomes_ASJ_AF"],
            "asj_nhomalt": df["gnomAD_genomes_ASJ_nhomalt"],
            "eas_ac": df["gnomAD_genomes_EAS_AC"],
            "eas_af": df["gnomAD_genomes_EAS_AF"],
            "eas_an": df["gnomAD_genomes_EAS_AN"],
            "eas_nhomalt": df["gnomAD_genomes_EAS_nhomalt"],
            "fin_ac": df["gnomAD_genomes_FIN_AC"],
            "fin_af": df["gnomAD_genomes_FIN_AF"],
            "fin_an": df["gnomAD_genomes_FIN_AN"],
            "fin_nhomalt": df["gnomAD_genomes_FIN_nhomalt"],
            "nfe_ac": df["gnomAD_genomes_NFE_AC"],
            "nfe_af": df["gnomAD_genomes_NFE_AF"],
            "nfe_an": df["gnomAD_genomes_NFE_AN"],
            "nfe_nhomalt": df["gnomAD_genomes_NFE_nhomalt"],
            "popmax_ac": df["gnomAD_genomes_POPMAX_AC"],
            "popmax_af": df["gnomAD_genomes_POPMAX_AF"],
            "popmax_an": df["gnomAD_genomes_POPMAX_AN"],
            "popmax_nhomalt": df["gnomAD_genomes_POPMAX_nhomalt"]
        }
    }

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid":
            df["rs_dbSNP151"],
            #"rsid_dbSNP144": fields[6],
            "chrom":
            chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": df["hg18_pos(1-based)"],
                "end": hg18_end
            },
            "hg38": {
                "start": df["pos(1-based)"],
                "end": df["pos(1-based)"]
            },
            "ref":
            ref,
            "alt":
            alt,
            "aa": {
                "ref": df["aaref"],
                "alt": df["aaalt"],
                "pos": df["aapos"],
                "refcodon": df["refcodon"],
                "codonpos": df["codonpos"],
                "codon_degeneracy": df["codon_degeneracy"],
            },
            "genename":
            df["genename"],
            "uniprot":
            list(uniprot),
            "vindijia_neandertal":
            [i for i in df["VindijiaNeandertal"].split("/") if i != "."],
            "interpro_domain":
            df["Interpro_domain"],
            "cds_strand":
            df["cds_strand"],
            "ancestral_allele":
            df["Ancestral_allele"],
            "appris":
            appris,
            "genecode_basic":
            df["GENCODE_basic"],
            "tsl":
            tsl,
            "vep_canonical":
            vep_canonical,
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": df["Ensembl_geneid"],
                "transcriptid": df["Ensembl_transcriptid"],
                "proteinid": df["Ensembl_proteinid"]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": df["SIFT_converted_rankscore"],
                "pred": df["SIFT_pred"]
            },
            "sift4g": {
                "score": sift4g_score,
                "pred": df["SIFT4G_score"],
                "converted_rankscore": df["SIFT4G_converted_rankscore"]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": df["Polyphen2_HDIV_rankscore"],
                    "pred": df["Polyphen2_HDIV_pred"]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": df["Polyphen2_HVAR_rankscore"],
                    "pred": df["Polyphen2_HVAR_pred"]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": df["LRT_converted_rankscore"],
                "pred": df["LRT_pred"],
                "omega": df["LRT_Omega"]
            },
            "mvp": {
                "score": mvp_score,
                "rankscore": df["MVP_rankscore"]
            },
            "mpc": {
                "score": mpc_score,
                "rankscore": df["MPC_rankscore"]
            },
            "bstatistic": {
                "score": df['bStatistic'],
                "rankscore": df["bStatistic_rankscore"]
            },
            "aloft": {
                "fraction_transcripts_affected":
                df["Aloft_Fraction_transcripts_affected"].split(';'),
                "prob_tolerant":
                df["Aloft_prob_Tolerant"],
                "prob_recessive":
                df["Aloft_prob_Recessive"],
                "prob_dominant":
                df["Aloft_prob_Dominant"],
                "pred":
                df["Aloft_pred"],
                "confidence":
                df["Aloft_Confidence"],
            },
            "primateai": {
                "score": df["PrimateAI_score"],
                "rankscore": df["PrimateAI_rankscore"],
                "pred": df["PrimateAI_pred"]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore":
                df["MutationTaster_converted_rankscore"],
                "pred": df["MutationTaster_pred"],
                "model": df["MutationTaster_model"],
                "AAE": df["MutationTaster_AAE"]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": df["MutationAssessor_rankscore"],
                "pred": df["MutationAssessor_pred"]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": df["FATHMM_converted_rankscore"],
                "pred": df["FATHMM_pred"]
            },
            "provean": {
                "score": provean_score,
                "rankscore": df["PROVEAN_converted_rankscore"],
                "pred": df["PROVEAN_pred"]
            },
            "vest4": {
                "score": vest3_score,
                "rankscore": df["VEST4_rankscore"]
            },
            "deogen2": {
                "score": deogen2_score,
                "rankscore": df["DEOGEN2_rankscore"],
                "pred": df["DEOGEN2_pred"]
            },
            "fathmm-mkl": {
                "coding_score": df["fathmm-MKL_coding_score"],
                "coding_rankscore": df["fathmm-MKL_coding_rankscore"],
                "coding_pred": df["fathmm-MKL_coding_pred"],
                "coding_group": df["fathmm-MKL_coding_group"]
            },
            "fathmm-xf": {
                "coding_score": df["fathmm-XF_coding_score"],
                "coding_rankscore": df["fathmm-XF_coding_rankscore"],
                "coding_pred": df["fathmm-XF_coding_pred"]
            },
            "eigen": {
                "raw_coding": df["Eigen-raw_coding"],
                "raw_coding_rankscore": df["Eigen-raw_coding_rankscore"],
                "phred_coding": df["Eigen-pred_coding"]
            },
            "eigen-pc": {
                "raw_coding": df["Eigen-PC-raw_coding"],
                "phred_coding": df["Eigen-PC-phred_coding"],
                "raw_rankscore": df["Eigen-PC-raw_coding_rankscore"]
            },
            "genocanyon": {
                "score": df["GenoCanyon_score"],
                "rankscore": df["GenoCanyon_rankscore"]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": df["MetaSVM_rankscore"],
                "pred": df["MetaSVM_pred"]
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": df["MetaLR_rankscore"],
                "pred": df["MetaLR_pred"]
            },
            "reliability_index":
            df["Reliability_index"],
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": df["M-CAP_rankscore"],
                "pred": df["M-CAP_pred"]
            },
            "revel": {
                "score": revel_score,
                "rankscore": df["REVEL_rankscore"]
            },
            "mutpred": {
                "score": df["MutPred_score"],
                "rankscore": df["MutPred_rankscore"],
                "accession": df["MutPred_protID"],
                "aa_change": df["MutPred_AAchange"],
                "pred": mechanisms
            },
            "dann": {
                "score": df["DANN_score"],
                "rankscore": df["DANN_rankscore"]
            },
            "gerp++": {
                "nr": df["GERP++_NR"],
                "rs": df["GERP++_RS"],
                "rs_rankscore": df["GERP++_RS_rankscore"]
            },
            "integrated": {
                "fitcons_score": df["integrated_fitCons_score"],
                "fitcons_rankscore": df["integrated_fitCons_rankscore"],
                "confidence_value": df["integrated_confidence_value"]
            },
            "gm12878": {
                "fitcons_score": df["GM12878_fitCons_score"],
                "fitcons_rankscore": df["GM12878_fitCons_rankscore"],
                "confidence_value": df["GM12878_confidence_value"]
            },
            "h1-hesc": {
                "fitcons_score": df["H1-hESC_fitCons_score"],
                "fitcons_rankscore": df["H1-hESC_fitCons_rankscore"],
                "confidence_value": df["H1-hESC_confidence_value"]
            },
            "huvec": {
                "fitcons_score": df["HUVEC_fitCons_score"],
                "fitcons_rankscore": df["HUVEC_fitCons_rankscore"],
                "confidence_value": df["HUVEC_confidence_value"]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": df["phyloP100way_vertebrate"],
                    "vertebrate_rankscore":
                    df["phyloP100way_vertebrate_rankscore"]
                },
                "p30way": {
                    "mammalian": df["phyloP30way_mammalian"],
                    "mammalian_rankscore":
                    df["phyloP30way_mammalian_rankscore"]
                },
                "p17way": {
                    "primate": df["phyloP17way_primate"],
                    "primate_rankscore": df["phyloP17way_primate_rankscore"]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate":
                    df["phastCons100way_vertebrate"],
                    "vertebrate_rankscore":
                    df["phastCons100way_vertebrate_rankscore"]
                },
                "30way": {
                    "mammalian": df["phastCons30way_mammalian"],
                    "mammalian_rankscore":
                    df["phastCons30way_mammalian_rankscore"]
                },
                "p17way": {
                    "primate": df["phastCons17way_primate"],
                    "primate_rankscore": df["phastCons17way_primate_rankscore"]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": df["SiPhy_29way_logOdds"],
                "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"]
            },
            "bayesdel": {
                "add_af": {
                    "score": df["BayesDel_addAF_score"],
                    "rankscore": df["BayesDel_addAF_rankscore"],
                    "pred": df["BayesDel_addAF_pred"]
                },
                "no_af": {
                    "score": df["BayesDel_noAF_score"],
                    "rankscore": df["BayesDel_noAF_rankscore"],
                    "pred": df["BayesDel_noAF_pred"]
                }
            },
            "clinpred": {
                "score": df["ClinPred_score"],
                "rankscore": df["ClinPred_rankscore"],
                "pred": df["ClinPred_pred"]
            },
            "list-s2": {
                "score": df["LIST-S2_score"],
                "rankscore": df["LIST-S2_rankscore"],
                "pred": df["LIST-S2_pred"]
            },
            "1000gp3": {
                "ac": df["1000Gp3_AC"],
                "af": df["1000Gp3_AF"],
                "afr_ac": df["1000Gp3_AFR_AC"],
                "afr_af": df["1000Gp3_AFR_AF"],
                "eur_ac": df["1000Gp3_EUR_AC"],
                "eur_af": df["1000Gp3_EUR_AF"],
                "amr_ac": df["1000Gp3_AMR_AC"],
                "amr_af": df["1000Gp3_AMR_AF"],
                "eas_ac": df["1000Gp3_EAS_AC"],
                "eas_af": df["1000Gp3_EAS_AF"],
                "sas_ac": df["1000Gp3_SAS_AC"],
                "sas_af": df["1000Gp3_SAS_AF"]
            },
            "twinsuk": {
                "ac": df["TWINSUK_AC"],
                "af": df["TWINSUK_AF"]
            },
            "alspac": {
                "ac": df["ALSPAC_AC"],
                "af": df["ALSPAC_AF"]
            },
            "esp6500": {
                "aa_ac": df["ESP6500_AA_AC"],
                "aa_af": df["ESP6500_AA_AF"],
                "ea_ac": df["ESP6500_EA_AC"],
                "ea_af": df["ESP6500_EA_AF"]
            },
            "uk10k": {
                "ac": df["UK10K_AC"],
                "af": df["UK10K_AF"]
            },
            "exac": {
                "ac": df["ExAC_AC"],
                "af": df["ExAC_AF"],
                "adj_ac": df["ExAC_Adj_AC"],
                "adj_af": df["ExAC_Adj_AF"],
                "afr_ac": df["ExAC_AFR_AC"],
                "afr_af": df["ExAC_AFR_AF"],
                "amr_ac": df["ExAC_AMR_AC"],
                "amr_af": df["ExAC_AMR_AF"],
                "eas_ac": df["ExAC_EAS_AC"],
                "eas_af": df["ExAC_EAS_AF"],
                "fin_ac": df["ExAC_FIN_AC"],
                "fin_af": df["ExAC_FIN_AF"],
                "nfe_ac": df["ExAC_NFE_AC"],
                "nfe_af": df["ExAC_NFE_AF"],
                "sas_ac": df["ExAC_SAS_AC"],
                "sas_af": df["ExAC_SAS_AF"]
            },
            "exac_nontcga": {
                "ac": df["ExAC_nonTCGA_AC"],
                "af": df["ExAC_nonTCGA_AF"],
                "adj_ac": df["ExAC_nonTCGA_Adj_AC"],
                "adj_af": df["ExAC_nonTCGA_Adj_AF"],
                "afr_ac": df["ExAC_nonTCGA_AFR_AC"],
                "afr_af": df["ExAC_nonTCGA_AFR_AF"],
                "amr_ac": df["ExAC_nonTCGA_AMR_AC"],
                "amr_af": df["ExAC_nonTCGA_AMR_AF"],
                "eas_ac": df["ExAC_nonTCGA_EAS_AC"],
                "eas_af": df["ExAC_nonTCGA_EAS_AF"],
                "fin_ac": df["ExAC_nonTCGA_FIN_AC"],
                "fin_af": df["ExAC_nonTCGA_FIN_AF"],
                "nfe_ac": df["ExAC_nonTCGA_NFE_AC"],
                "nfe_af": df["ExAC_nonTCGA_NFE_AF"],
                "sas_ac": df["ExAC_nonTCGA_SAS_AC"],
                "sas_af": df["ExAC_nonTCGA_SAS_AF"]
            },
            "exac_nonpsych": {
                "ac": df["ExAC_nonpsych_AC"],
                "af": df["ExAC_nonpsych_AF"],
                "adj_ac": df["ExAC_nonpsych_Adj_AC"],
                "adj_af": df["ExAC_nonpsych_Adj_AF"],
                "afr_ac": df["ExAC_nonpsych_AFR_AC"],
                "afr_af": df["ExAC_nonpsych_AFR_AF"],
                "amr_ac": df["ExAC_nonpsych_AMR_AC"],
                "amr_af": df["ExAC_nonpsych_AMR_AF"],
                "eas_ac": df["ExAC_nonpsych_EAS_AC"],
                "eas_af": df["ExAC_nonpsych_EAS_AF"],
                "fin_ac": df["ExAC_nonpsych_FIN_AC"],
                "fin_af": df["ExAC_nonpsych_FIN_AF"],
                "nfe_ac": df["ExAC_nonpsych_NFE_AC"],
                "nfe_af": df["ExAC_nonpsych_NFE_AF"],
                "sas_ac": df["ExAC_nonpsych_SAS_AC"],
                "sas_af": df["ExAC_nonpsych_SAS_AF"]
            },
            "clinvar": {
                "clinvar_id":
                df["clinvar_id"],
                "clinsig":
                [i for i in df["clinvar_clnsig"].split("/") if i != "."],
                "trait":
                [i for i in df["clinvar_trait"].split("|") if i != "."],
                "review":
                [i for i in df["clinvar_review"].split(",") if i != "."],
                "hgvs": [i for i in df["clinvar_hgvs"].split("|") if i != "."],
                "omim":
                [i for i in df["clinvar_OMIM_id"].split("|") if i != "."],
                "medgen":
                [i for i in df["clinvar_MedGen_id"].split("|") if i != "."],
                "orphanet":
                [i for i in df["clinvar_Orphanet_id"].split("|") if i != "."],
                "var_source":
                [i for i in df["clinvar_var_source"].split("|") if i != "."]
            },
            "hgvsc":
            list(
                set(df["HGVSc_ANNOVAR"].split(';') +
                    df["HGVSc_snpEff"].split(';') +
                    df["HGVSc_VEP"].split(';'))),
            "hgvsp":
            list(
                set(df["HGVSp_ANNOVAR"].split(';') +
                    df["HGVSp_snpEff"].split(';') +
                    df["HGVSp_VEP"].split(';'))),
            "gtex":
            list(gtex),
            "geuvadis_eqtl_target_gene":
            df["Geuvadis_eQTL_target_gene"]
        }
    }
    if include_gnomad:
        one_snp_json['dbnsfp'].update(gnomad)
    one_snp_json = list_split(
        dict_sweep(unlist(value_convert_to_number(one_snp_json)),
                   vals=[".", '-', "NA", None],
                   remove_invalid_list=True), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
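# A standalone sketch (not part of the original parser) of the hgvsc/hgvsp
# merging used above: the ANNOVAR, snpEff and VEP columns are ';'-joined
# strings, and passing their union through a set drops duplicate notations.
# The sample strings below are invented for illustration.
annovar = "ENST00000368302:c.52C>T;ENST00000334793:c.52C>T"
snpeff = "ENST00000334793:c.52C>T"
vep = "ENST00000368302:c.52C>T"
hgvsc = list(set(annovar.split(';') + snpeff.split(';') + vep.split(';')))
print(sorted(hgvsc))
# ['ENST00000334793:c.52C>T', 'ENST00000368302:c.52C>T']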
Exemple #22
0
def _map_line_to_json(df, version, index=0):
    # specific variable treatment
    chrom = df["#chr"]
    if chrom == 'M':
        chrom = 'MT'
    # fields[7] in version 2 represents hg18_pos
    hg18_end = df["hg18_pos(1-based)"]
    if hg18_end != ".":
        hg18_end = int(hg18_end)
    # if no hg19 position is provided, skip the item
    if df["hg19_pos(1-based)"] == '.':
        return None
    else:
        chromStart = int(df["hg19_pos(1-based)"])
        chromEnd = chromStart
    chromStart_38 = int(df["pos(1-based)"])
    ref = df["ref"].upper()
    alt = df["alt"].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    siphy_29way_pi = df["SiPhy_29way_pi"]
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    gtex_gene = df["GTEx_V6_gene"].split('|')
    gtex_tissue = df["GTEx_V6_tissue"].split('|')
    gtex = map(dict, map(lambda t: zip(('gene', 'tissue'), t), zip(gtex_gene, gtex_tissue)))
    acc = df["Uniprot_acc_Polyphen2"].rstrip().rstrip(';').split(";")
    pos = df["Uniprot_aapos_Polyphen2"].rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos)))
    provean_score = df["PROVEAN_score"].split(';')
    sift_score = df["SIFT_score"].split(';')
    hdiv_score = df["Polyphen2_HDIV_score"].split(';')
    hvar_score = df["Polyphen2_HVAR_score"].split(';')
    lrt_score = df["LRT_score"].split(';')
    m_cap_score = df["M-CAP_score"].split(';')
    mutationtaster_score = df["MutationTaster_score"].split(';')
    mutationassessor_score = df["MutationAssessor_score"].split(';')
    vest3_score = df["VEST3_score"].split(';')
    metasvm_score = df["MetaSVM_score"].split(';')
    fathmm_score = df["FATHMM_score"].split(';')
    metalr_score = df["MetaLR_score"].split(';')
    revel_score = df["REVEL_score"].split(';')
    '''
    parse mutpred top 5 features
    '''
    def modify_pvalue(pvalue):
        return float(pvalue.strip('P = '))
    mutpred_mechanisms = df["MutPred_Top5features"]
    if mutpred_mechanisms not in ['.', ',', '-']:
        mutpred_mechanisms = mutpred_mechanisms.split(";")
        mutpred_mechanisms = [m.rstrip(")") for m in mutpred_mechanisms]
        mutpred_mechanisms = [i.split(" (") for i in mutpred_mechanisms]
        mutpred_mechanisms = sum(mutpred_mechanisms, [])
        mechanisms = [
            {"mechanism": mutpred_mechanisms[0],
             "p_val": modify_pvalue(mutpred_mechanisms[1])},
            {"mechanism": mutpred_mechanisms[2],
             "p_val": modify_pvalue(mutpred_mechanisms[3])},
            {"mechanism": mutpred_mechanisms[4],
             "p_val": modify_pvalue(mutpred_mechanisms[5])},
            {"mechanism": mutpred_mechanisms[6],
             "p_val": modify_pvalue(mutpred_mechanisms[7])},
            {"mechanism": mutpred_mechanisms[8],
             "p_val": modify_pvalue(mutpred_mechanisms[9])}
        ]
    else:
        mechanisms = '.'

    # normalize scores

    def norm(arr):
        return [None if item == '.' else item for item in arr]

    provean_score = norm(provean_score)
    sift_score = norm(sift_score)
    hdiv_score = norm(hdiv_score)
    hvar_score = norm(hvar_score)
    lrt_score = norm(lrt_score)
    m_cap_score = norm(m_cap_score)
    mutationtaster_score = norm(mutationtaster_score)
    mutationassessor_score = norm(mutationassessor_score)
    vest3_score = norm(vest3_score)
    metasvm_score = norm(metasvm_score)
    fathmm_score = norm(fathmm_score)
    metalr_score = norm(metalr_score)
    revel_score = norm(revel_score)

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": df["rs_dbSNP147"],
            #"rsid_dbSNP144": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": df["hg18_pos(1-based)"],
                "end": hg18_end
            },
            "hg38": {
                "start": df["pos(1-based)"],
                "end": df["pos(1-based)"]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": df["aaref"],
                "alt": df["aaalt"],
                "pos": df["aapos"],
                "refcodon": df["refcodon"],
                "codonpos": df["codonpos"],
                "codon_degeneracy": df["codon_degeneracy"],
            },
            "genename": df["genename"],
            "uniprot": list(uniprot),
            "interpro_domain": df["Interpro_domain"],
            "cds_strand": df["cds_strand"],
            "ancestral_allele": df["Ancestral_allele"],
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": df["Ensembl_geneid"],
                "transcriptid": df["Ensembl_transcriptid"],
                "proteinid": df["Ensembl_proteinid"]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": df["SIFT_converted_rankscore"],
                "pred": df["SIFT_pred"]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": df["Polyphen2_HDIV_rankscore"],
                    "pred": df["Polyphen2_HDIV_pred"]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": df["Polyphen2_HVAR_rankscore"],
                    "pred": df["Polyphen2_HVAR_pred"]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": df["LRT_converted_rankscore"],
                "pred": df["LRT_pred"],
                "omega": df["LRT_Omega"]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore": df["MutationTaster_converted_rankscore"],
                "pred": df["MutationTaster_pred"],
                "model": df["MutationTaster_model"],
                "AAE": df["MutationTaster_AAE"]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": df["MutationAssessor_score_rankscore"],
                "pred": df["MutationAssessor_pred"]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": df["FATHMM_converted_rankscore"],
                "pred": df["FATHMM_pred"]
            },
            "provean": {
                "score": provean_score,
                "rankscore": df["PROVEAN_converted_rankscore"],
                "pred": df["PROVEAN_pred"]
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": df["VEST3_rankscore"],
                "transcriptid": df["Transcript_id_VEST3"],
                "transcriptvar": df["Transcript_var_VEST3"]
            },
            "fathmm-mkl": {
                "coding_score": df["fathmm-MKL_coding_score"],
                "coding_rankscore": df["fathmm-MKL_coding_rankscore"],
                "coding_pred": df["fathmm-MKL_coding_pred"],
                "coding_group": df["fathmm-MKL_coding_group"]
            },
            "eigen": {
                "coding_or_noncoding": df["Eigen_coding_or_noncoding"],
                "raw": df["Eigen-raw"],
                "phred": df["Eigen-phred"]
            },
            "eigen-pc": {
                "raw": df["Eigen-PC-raw"],
                "phred": df["Eigen-PC-phred"],
                "raw_rankscore": df["Eigen-PC-raw_rankscore"]
            },
            "genocanyon": {
                "score": df["GenoCanyon_score"],
                "rankscore": df["GenoCanyon_score_rankscore"]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": df["MetaSVM_rankscore"],
                "pred": df["MetaSVM_pred"]
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": df["MetaLR_rankscore"],
                "pred": df["MetaLR_pred"]
            },
            "reliability_index": df["Reliability_index"],
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": df["M-CAP_rankscore"],
                "pred": df["M-CAP_pred"]
            },
            "revel": {
                "score": revel_score,
                "rankscore": df["REVEL_rankscore"]
            },
            "mutpred": {
                "score": df["MutPred_score"],
                "rankscore": df["MutPred_rankscore"],
                "accession": df["MutPred_protID"],
                "aa_change": df["MutPred_AAchange"],
                "pred": mechanisms
            },
            "dann": {
                "score": df["DANN_score"],
                "rankscore": df["DANN_rankscore"]
            },
            "gerp++": {
                "nr": df["GERP++_NR"],
                "rs": df["GERP++_RS"],
                "rs_rankscore": df["GERP++_RS_rankscore"]
            },
            "integrated": {
                "fitcons_score": df["integrated_fitCons_score"],
                "fitcons_rankscore": df["integrated_fitCons_score_rankscore"],
                "confidence_value": df["integrated_confidence_value"]
            },
            "gm12878": {
                "fitcons_score": df["GM12878_fitCons_score"],
                "fitcons_rankscore": df["GM12878_fitCons_score_rankscore"],
                "confidence_value": df["GM12878_confidence_value"]
            },
            "h1-hesc": {
                "fitcons_score": df["H1-hESC_fitCons_score"],
                "fitcons_rankscore": df["H1-hESC_fitCons_score_rankscore"],
                "confidence_value": df["H1-hESC_confidence_value"]
            },
            "huvec": {
                "fitcons_score": df["HUVEC_fitCons_score"],
                "fitcons_rankscore": df["HUVEC_fitCons_score_rankscore"],
                "confidence_value": df["HUVEC_confidence_value"]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": df["phyloP100way_vertebrate"],
                    "vertebrate_rankscore": df["phyloP100way_vertebrate_rankscore"]
                },
                "p20way": {
                    "mammalian": df["phyloP20way_mammalian"],
                    "mammalian_rankscore": df["phyloP20way_mammalian_rankscore"]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate": df["phastCons100way_vertebrate"],
                    "vertebrate_rankscore": df["phastCons100way_vertebrate_rankscore"]
                },
                "20way": {
                    "mammalian": df["phastCons20way_mammalian"],
                    "mammalian_rankscore": df["phastCons20way_mammalian_rankscore"]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": df["SiPhy_29way_logOdds"],
                "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"]
            },
            "1000gp3": {
                "ac": df["1000Gp3_AC"],
                "af": df["1000Gp3_AF"],
                "afr_ac": df["1000Gp3_AFR_AC"],
                "afr_af": df["1000Gp3_AFR_AF"],
                "eur_ac": df["1000Gp3_EUR_AC"],
                "eur_af": df["1000Gp3_EUR_AF"],
                "amr_ac": df["1000Gp3_AMR_AC"],
                "amr_af": df["1000Gp3_AMR_AF"],
                "eas_ac": df["1000Gp3_EAS_AC"],
                "eas_af": df["1000Gp3_EAS_AF"],
                "sas_ac": df["1000Gp3_SAS_AC"],
                "sas_af": df["1000Gp3_SAS_AF"]
            },
            "twinsuk": {
                "ac": df["TWINSUK_AC"],
                "af": df["TWINSUK_AF"]
            },
            "alspac": {
                "ac": df["ALSPAC_AC"],
                "af": df["ALSPAC_AF"]
            },
            "esp6500": {
                "aa_ac": df["ESP6500_AA_AC"],
                "aa_af": df["ESP6500_AA_AF"],
                "ea_ac": df["ESP6500_EA_AC"],
                "ea_af": df["ESP6500_EA_AF"]
            },
            "exac": {
                "ac": df["ExAC_AC"],
                "af": df["ExAC_AF"],
                "adj_ac": df["ExAC_Adj_AC"],
                "adj_af": df["ExAC_Adj_AF"],
                "afr_ac": df["ExAC_AFR_AC"],
                "afr_af": df["ExAC_AFR_AF"],
                "amr_ac": df["ExAC_AMR_AC"],
                "amr_af": df["ExAC_AMR_AF"],
                "eas_ac": df["ExAC_EAS_AC"],
                "eas_af": df["ExAC_EAS_AF"],
                "fin_ac": df["ExAC_FIN_AC"],
                "fin_af": df["ExAC_FIN_AF"],
                "nfe_ac": df["ExAC_NFE_AC"],
                "nfe_af": df["ExAC_NFE_AF"],
                "sas_ac": df["ExAC_SAS_AC"],
                "sas_af": df["ExAC_SAS_AF"]
            },
            "exac_nontcga": {
                "ac": df["ExAC_nonTCGA_AC"],
                "af": df["ExAC_nonTCGA_AF"],
                "adj_ac": df["ExAC_nonTCGA_Adj_AC"],
                "adj_af": df["ExAC_nonTCGA_Adj_AF"],
                "afr_ac": df["ExAC_nonTCGA_AFR_AC"],
                "afr_af": df["ExAC_nonTCGA_AFR_AF"],
                "amr_ac": df["ExAC_nonTCGA_AMR_AC"],
                "amr_af": df["ExAC_nonTCGA_AMR_AF"],
                "eas_ac": df["ExAC_nonTCGA_EAS_AC"],
                "eas_af": df["ExAC_nonTCGA_EAS_AF"],
                "fin_ac": df["ExAC_nonTCGA_FIN_AC"],
                "fin_af": df["ExAC_nonTCGA_FIN_AF"],
                "nfe_ac": df["ExAC_nonTCGA_NFE_AC"],
                "nfe_af": df["ExAC_nonTCGA_NFE_AF"],
                "sas_ac": df["ExAC_nonTCGA_SAS_AC"],
                "sas_af": df["ExAC_nonTCGA_SAS_AF"]
            },
            "exac_nonpsych": {
                "ac": df["ExAC_nonpsych_AC"],
                "af": df["ExAC_nonpsych_AF"],
                "adj_ac": df["ExAC_nonpsych_Adj_AC"],
                "adj_af": df["ExAC_nonpsych_Adj_AF"],
                "afr_ac": df["ExAC_nonpsych_AFR_AC"],
                "afr_af": df["ExAC_nonpsych_AFR_AF"],
                "amr_ac": df["ExAC_nonpsych_AMR_AC"],
                "amr_af": df["ExAC_nonpsych_AMR_AF"],
                "eas_ac": df["ExAC_nonpsych_EAS_AC"],
                "eas_af": df["ExAC_nonpsych_EAS_AF"],
                "fin_ac": df["ExAC_nonpsych_FIN_AC"],
                "fin_af": df["ExAC_nonpsych_FIN_AF"],
                "nfe_ac": df["ExAC_nonpsych_NFE_AC"],
                "nfe_af": df["ExAC_nonpsych_NFE_AF"],
                "sas_ac": df["ExAC_nonpsych_SAS_AC"],
                "sas_af": df["ExAC_nonpsych_SAS_AF"]
            },
            "clinvar": {
                "rs": df["clinvar_rs"],
                "clinsig": list(map(int,[i for i in df["clinvar_clnsig"].split("|") if i != "."])),
                "trait": [i for i in df["clinvar_trait"].split("|") if i != "."],
                "golden_stars": list(map(int,[i for i in df["clinvar_golden_stars"].split("|") if i != "."]))
            },
            "gtex": list(gtex)
        }
    }

    one_snp_json = list_split(dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=[".", '-', None]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
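# A standalone sketch (not from the original source) of the GTEx pairing used
# above: the '|'-joined gene and tissue columns are zipped position by position
# into small {'gene': ..., 'tissue': ...} dicts. The sample values are invented.
gtex_gene = "ENSG00000101464|ENSG00000101464".split('|')
gtex_tissue = "Whole_Blood|Lung".split('|')
gtex = [dict(zip(('gene', 'tissue'), t)) for t in zip(gtex_gene, gtex_tissue)]
print(gtex)
# [{'gene': 'ENSG00000101464', 'tissue': 'Whole_Blood'},
#  {'gene': 'ENSG00000101464', 'tissue': 'Lung'}]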
def _map_line_to_json(cp, hg19):
    try:
        clinical_significance = cp.ReferenceClinVarAssertion.\
            ClinicalSignificance.Description
    except:
        clinical_significance = None
    rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc
    try:
        review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
            ReviewStatus
    except:
        review_status = None
    try:
        last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
            DateLastEvaluated
    except:
        last_evaluated = None
    variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID
    number_submitters = len(cp.ClinVarAssertion)
    # some items in clinvar_xml don't have origin information
    try:
        origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
    except:
        origin = None
    conditions = []
    for _trait in cp.ReferenceClinVarAssertion.TraitSet.Trait:
        synonyms = []
        conditions_name = ''
        for name in _trait.Name:
            if name.ElementValue.Type == 'Alternate':
                synonyms.append(name.ElementValue.get_valueOf_())
            if name.ElementValue.Type == 'Preferred':
                conditions_name += name.ElementValue.get_valueOf_()
        identifiers = {}
        for item in _trait.XRef:
            if item.DB == 'Human Phenotype Ontology':
                key = 'Human_Phenotype_Ontology'
            else:
                key = item.DB
            identifiers[key.lower()] = item.ID
        for symbol in _trait.Symbol:
            if symbol.ElementValue.Type == 'Preferred':
                conditions_name += ' (' + symbol.ElementValue.get_valueOf_(
                ) + ')'
        age_of_onset = ''
        for _set in _trait.AttributeSet:
            if _set.Attribute.Type == 'age of onset':
                age_of_onset = _set.Attribute.get_valueOf_()
        conditions.append({
            "name": conditions_name,
            "synonyms": synonyms,
            "identifiers": identifiers,
            "age_of_onset": age_of_onset
        })

    # MeasureSet.Measure returns a list; there might be multiple
    # Measure elements under one MeasureSet
    for Measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure:
        variation_type = Measure.Type
        # exclude any item whose type is 'Variation',
        # 'protein only' or 'Microsatellite'
        if variation_type == 'Variation' or variation_type\
           == 'protein only' or variation_type == 'Microsatellite':
            continue
        allele_id = Measure.ID
        chrom = None
        chromStart_19 = None
        chromEnd_19 = None
        chromStart_38 = None
        chromEnd_38 = None
        ref = None
        alt = None
        if Measure.SequenceLocation:
            for SequenceLocation in Measure.SequenceLocation:
                # collect positions for both GRCh37 and GRCh38;
                # chrom, ref and alt are taken from GRCh37 when available
                if 'GRCh37' in SequenceLocation.Assembly:
                    chrom = SequenceLocation.Chr
                    chromStart_19 = SequenceLocation.start
                    chromEnd_19 = SequenceLocation.stop
                    ref = SequenceLocation.referenceAllele
                    alt = SequenceLocation.alternateAllele
                if 'GRCh38' in SequenceLocation.Assembly:
                    chromStart_38 = SequenceLocation.start
                    chromEnd_38 = SequenceLocation.stop
                    if not ref:
                        ref = SequenceLocation.referenceAllele
                    if not alt:
                        alt = SequenceLocation.alternateAllele
        if Measure.MeasureRelationship:
            try:
                symbol = Measure.MeasureRelationship[0].\
                    Symbol[0].get_ElementValue().valueOf_
            except:
                symbol = None
            gene_id = Measure.MeasureRelationship[0].XRef[0].ID
        else:
            symbol = None
            gene_id = None
        if Measure.Name:
            name = Measure.Name[0].ElementValue.valueOf_
        else:
            name = None
        if len(Measure.CytogeneticLocation) == 1:
            cytogenic = Measure.CytogeneticLocation[0]
        else:
            cytogenic = Measure.CytogeneticLocation
        hgvs_coding = None
        hgvs_genome = None
        HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
        coding_hgvs_only = None
        hgvs_id = None
        if hg19:
            chromStart = chromStart_19
            chromEnd = chromEnd_19
        else:
            chromStart = chromStart_38
            chromEnd = chromEnd_38
        # hgvs_not_validated = None
        if Measure.AttributeSet:
            # 'copy number loss' or 'gain' have a format different
            # from other types and should be dealt with separately
            if (variation_type == 'copy number loss') or \
                    (variation_type == 'copy number gain'):
                for AttributeSet in Measure.AttributeSet:
                    if 'HGVS, genomic, top level' in AttributeSet.\
                            Attribute.Type:
                        if AttributeSet.Attribute.integerValue == 37:
                            hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(
                            AttributeSet.Attribute.get_valueOf_())
            else:
                for AttributeSet in Measure.AttributeSet:
                    if 'genomic' in AttributeSet.Attribute.Type:
                        HGVS['genomic'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'non-coding' in AttributeSet.Attribute.Type:
                        HGVS['non-coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'coding' in AttributeSet.Attribute.Type:
                        HGVS['coding'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    elif 'protein' in AttributeSet.Attribute.Type:
                        HGVS['protein'].append(
                            AttributeSet.Attribute.get_valueOf_())
                    if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq':
                        hgvs_coding = AttributeSet.Attribute.get_valueOf_()
                    elif AttributeSet.Attribute.Type == \
                            'HGVS, genomic, top level, previous':
                        hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                        break
            if chrom and chromStart and chromEnd:
                if variation_type == 'single nucleotide variant':
                    hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
                # items whose type belongs to 'Indel, Insertion,
                # Duplication' might not have explicit alt information,
                # so we parse it from hgvs_genome
                elif variation_type == 'Indel':
                    # to_do: hgvs_genome should distinguish hg19 and hg38
                    # RCV000156073, NC_000010.10:g.112581638_112581639delinsG
                    if hgvs_genome:
                        indel_position = hgvs_genome.find('ins')
                        indel_alt = hgvs_genome[indel_position + 3:]
                        if chromStart == chromEnd:
                            hgvs_id = "chr%s:g.%sdelins%s" % \
                                  (chrom, chromStart, indel_alt)
                        else:
                            hgvs_id = "chr%s:g.%s_%sdelins%s" % \
                                  (chrom, chromStart, chromEnd, indel_alt)
                elif variation_type == 'Deletion':
                    if chromStart == chromEnd:
                        # RCV000048406, chr17:g.41243547del
                        hgvs_id = "chr%s:g.%sdel" % (chrom, chromStart)
                    else:
                        hgvs_id = "chr%s:g.%s_%sdel" % (chrom, chromStart,
                                                        chromEnd)
                elif variation_type == 'Insertion':
                    if hgvs_genome:
                        ins_position = hgvs_genome.find('ins')
                        if 'ins' in hgvs_genome:
                            ins_ref = hgvs_genome[ins_position + 3:]
                            hgvs_id = "chr%s:g.%s_%sins%s" % \
                                      (chrom, chromStart, chromEnd, ins_ref)
                elif variation_type == 'Duplication':
                    if hgvs_genome:
                        dup_position = hgvs_genome.find('dup')
                        if 'dup' in hgvs_genome:
                            dup_ref = hgvs_genome[dup_position + 3:]
                            if chromStart == chromEnd:
                                hgvs_id = "chr%s:g.%sdup%s" % \
                                        (chrom, chromStart, dup_ref)
                            else:
                                hgvs_id = "chr%s:g.%s_%sdup%s" % \
                                        (chrom, chromStart, chromEnd, dup_ref)
            elif variation_type == 'copy number loss' or\
                    variation_type == 'copy number gain':
                if hgvs_genome and chrom:
                    hgvs_id = "chr" + chrom + ":g." + hgvs_genome.split('.')[2]
            elif hgvs_coding:
                hgvs_id = hgvs_coding
                coding_hgvs_only = True
            else:
                #logging.warn("couldn't find any id %s" % rcv_accession)
                return
        else:
            logging.debug('no measure.attribute %s' % rcv_accession)
            return
        for key in HGVS:
            HGVS[key].sort()
        rsid = []
        cosmic = None
        dbvar = None
        uniprot = None
        omim = None
        # loop through XRef to find rsid as well as other ids
        if Measure.XRef:
            for XRef in Measure.XRef:
                # multiple rsids could be linked to one hgvs id
                if XRef.Type == 'rs':
                    _rsid = 'rs' + str(XRef.ID)
                    rsid.append(_rsid)
                elif XRef.DB == 'COSMIC':
                    cosmic = XRef.ID
                elif XRef.DB == 'OMIM':
                    omim = XRef.ID
                elif XRef.DB == 'UniProtKB/Swiss-Prot':
                    uniprot = XRef.ID
                elif XRef.DB == 'dbVar':
                    dbvar = XRef.ID

        # make sure the hgvs_id is not none
        if hgvs_id:
            one_snp_json = {
                "_id": hgvs_id,
                "clinvar": {
                    "allele_id": allele_id,
                    "variant_id": variant_id,
                    "chrom": chrom,
                    "omim": omim,
                    "cosmic": cosmic,
                    "uniprot": uniprot,
                    "dbvar": dbvar,
                    "hg19": {
                        "start": chromStart_19,
                        "end": chromEnd_19
                    },
                    "hg38": {
                        "start": chromStart_38,
                        "end": chromEnd_38
                    },
                    "type": variation_type,
                    "gene": {
                        "id": gene_id,
                        "symbol": symbol
                    },
                    "rcv": {
                        "accession": rcv_accession,
                        "clinical_significance": clinical_significance,
                        "number_submitters": number_submitters,
                        "review_status": review_status,
                        "last_evaluated": str(last_evaluated),
                        "preferred_name": name,
                        "origin": origin,
                        "conditions": conditions
                    },
                    "rsid": rsid,
                    "cytogenic": cytogenic,
                    "hgvs": HGVS,
                    "coding_hgvs_only": coding_hgvs_only,
                    "ref": ref,
                    "alt": alt
                }
            }
            obj = (dict_sweep(
                unlist(
                    value_convert_to_number(one_snp_json, [
                        'chrom', 'omim', 'id', 'orphanet', 'gene',
                        'rettbase_(cdkl5)', 'cosmic', 'dbrbc'
                    ])), [None, '', 'None']))
            yield obj
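# A small sketch (not from the original source) of the indel branch above: when
# ClinVar supplies a top-level genomic HGVS such as
# 'NC_000010.10:g.112581638_112581639delinsG' (see the RCV000156073 comment),
# the inserted sequence is taken from the text after 'ins' and re-attached to a
# chromosome-based id.
hgvs_genome = "NC_000010.10:g.112581638_112581639delinsG"
chrom, chromStart, chromEnd = "10", 112581638, 112581639
indel_alt = hgvs_genome[hgvs_genome.find('ins') + 3:]
if chromStart == chromEnd:
    hgvs_id = "chr%s:g.%sdelins%s" % (chrom, chromStart, indel_alt)
else:
    hgvs_id = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, indel_alt)
print(hgvs_id)  # chr10:g.112581638_112581639delinsG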
Exemple #24
0
def _map_line_to_json(item, keys):
    key_start = ["AC", "AF", "AN", "Hom", "GC", "Hemi"]
    chrom = str(item.CHROM)
    if chrom not in CHROM_VALID_VALUES:
        return
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    _filter = item.FILTER
    rsid = item.ID
    # the following values could be missing in the vcf record;
    # check first whether the key exists in the vcf record
    # and fall back to None if it does not
    vqslod = info[
        'VQSLOD'] if 'VQSLOD' in info and info['VQSLOD'] != math.inf else None
    vqsr_culprit = info['VQSR_culprit'] if 'VQSR_culprit' in info else None
    baseqranksum = info['BaseQRankSum'] if 'BaseQRankSum' in info else None
    clippingranksum = info[
        'ClippingRankSum'] if 'ClippingRankSum' in info else None
    mqranksum = info['MQRankSum'] if 'MQRankSum' in info else None
    readposranksum = info[
        'ReadPosRankSum'] if 'ReadPosRankSum' in info else None
    qd = info['QD'] if 'QD' in info else None
    inbreedingcoeff = info[
        'InbreedingCoeff'] if 'InbreedingCoeff' in info else None
    # convert vcf object to string
    item.ALT = [str(alt) for alt in item.ALT]
    # if multiallelic, put all variants as a list in multi-allelic field
    hgvs_list = None
    if len(item.ALT) > 1:
        hgvs_list = [
            get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=False)
            for alt in item.ALT
        ]
    for i, alt in enumerate(item.ALT):
        (HGVS, var_type) = get_hgvs_from_vcf(chrom,
                                             chromStart,
                                             ref,
                                             alt,
                                             mutant_type=True)
        if HGVS is None:
            return
        assert len(item.ALT) == len(info['AC']), \
            "Expected len(item.ALT) == len(info['AC']), but not for %s" % HGVS
        assert len(item.ALT) == len(info['AF']), \
            "Expected len(item.ALT) == len(info['AF']), but not for %s" % HGVS
        one_snp_json = {
            "_id": HGVS,
            "gnomad_genome": {
                "chrom": chrom,
                "pos": chromStart,
                "filter": _filter,
                "multi-allelic": hgvs_list,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "rsid": rsid,
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mqranksum": mqranksum
                },
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": vqslod,
                "vqsr_culprit": vqsr_culprit
            }
        }
        # create a holder in one_snp_json for each _start, e.g. 'ac', 'af', 'gc'
        for _start in key_start:
            one_snp_json['gnomad_genome'][_start.lower()] = {}
        # loop through each available key
        for _key in keys:
            if _key in info:
                # loop through each prefix
                for _start in key_start:
                    # "ac", "af" value is related to multi-allelic, need to deal with separately
                    if _key.startswith(_start) and _start in [
                            'AC', 'AF', 'Hom', 'Hemi'
                    ]:
                        one_snp_json['gnomad_genome'][_start.lower()][
                            _key.lower()] = info[_key][i]
                    elif _key.startswith(_start) and _start not in [
                            'AC', 'AF', 'Hom', 'Hemi'
                    ]:
                        one_snp_json['gnomad_genome'][_start.lower()][
                            _key.lower()] = info[_key]
        obj = (dict_sweep(
            unlist(
                value_convert_to_number(one_snp_json, skipped_keys=['chrom'])),
            [None]))
        yield obj
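# A standalone sketch (not from the original source) of the prefix bucketing
# above: INFO keys starting with 'AC', 'AF', 'Hom' or 'Hemi' hold per-allele
# lists and are indexed by the alt position i, while other prefixed keys are
# copied whole. The toy INFO dict below is invented for illustration.
info = {"AC": [5, 1], "AF": [0.001, 0.0002], "AN": 15000, "AC_afr": [2, 0]}
keys = ["AC", "AF", "AN", "AC_afr"]
key_start = ["AC", "AF", "AN", "Hom", "GC", "Hemi"]
i = 0  # first alt allele
record = {_start.lower(): {} for _start in key_start}
for _key in keys:
    for _start in key_start:
        if _key.startswith(_start) and _start in ['AC', 'AF', 'Hom', 'Hemi']:
            record[_start.lower()][_key.lower()] = info[_key][i]
        elif _key.startswith(_start) and _start not in ['AC', 'AF', 'Hom', 'Hemi']:
            record[_start.lower()][_key.lower()] = info[_key]
print(record["ac"], record["af"], record["an"])
# {'ac': 5, 'ac_afr': 2} {'af': 0.001} {'an': 15000}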
def _map_line_to_json(cp, hg19):
    try:
        clinical_significance = cp.ReferenceClinVarAssertion.\
            ClinicalSignificance.Description
    except:
        clinical_significance = None
    rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc
    try:
        review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
            ReviewStatus
    except:
        review_status = None
    try:
        last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
            DateLastEvaluated
    except:
        last_evaluated = None
    
    number_submitters = len(cp.ClinVarAssertion)
    # some items in clinvar_xml don't have origin information
    try:
        origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
    except:
        origin = None
    conditions = []
    for _trait in cp.ReferenceClinVarAssertion.TraitSet.Trait:
        synonyms = []
        conditions_name = ''
        for name in _trait.Name:
            if name.ElementValue.Type == 'Alternate':
                synonyms.append(name.ElementValue.get_valueOf_())
            if name.ElementValue.Type == 'Preferred':
                conditions_name += name.ElementValue.get_valueOf_()
        identifiers = {}
        for item in _trait.XRef:
            if item.DB == 'Human Phenotype Ontology':
                key = 'Human_Phenotype_Ontology'
            else:
                key = item.DB
            identifiers[key.lower()] = item.ID
        for symbol in _trait.Symbol:
            if symbol.ElementValue.Type == 'Preferred':
                conditions_name += ' (' + symbol.ElementValue.get_valueOf_() + ')'
        age_of_onset = ''
        for _set in _trait.AttributeSet:
            if _set.Attribute.Type == 'age of onset':
                age_of_onset = _set.Attribute.get_valueOf_()
        conditions.append({"name": conditions_name, "synonyms": synonyms, "identifiers": identifiers, "age_of_onset": age_of_onset})

    try:
        genotypeset = cp.ReferenceClinVarAssertion.GenotypeSet
    except:
        genotypeset = None
    if genotypeset:
        obj_list = []
        id_list = []
        for _set in cp.ReferenceClinVarAssertion.GenotypeSet.MeasureSet:
            variant_id = _set.ID
            for _measure in _set.Measure:
                json_obj = parse_measure(_measure, hg19=hg19)
                if json_obj:
                    json_obj['clinvar']['rcv'].update({'accession': rcv_accession,
                        'clinical_significance': clinical_significance,
                        'number_submitters': number_submitters,
                        'review_status': review_status,
                        'last_evaluated': str(last_evaluated),
                        'origin': origin,
                        'conditions': conditions})
                    json_obj['clinvar'].update({'variant_id': variant_id})
                    json_obj = (dict_sweep(unlist(value_convert_to_number(json_obj,
                                               ['chrom', 'omim', 'id', 'orphanet', 'gene',
                                                'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])), [None, '', 'None']))
                    obj_list.append(json_obj)
                    id_list.append(json_obj['_id'])
        for _obj in obj_list:
            _obj['clinvar'].update({'genotypeset': {
                    'type': 'CompoundHeterozygote',
                    'genotype': id_list
                    }})
            yield _obj
    else:
        variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID
        for _measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure:
            json_obj = parse_measure(_measure, hg19=hg19)
            if json_obj:
                json_obj['clinvar']['rcv'].update({'accession': rcv_accession,
                        'clinical_significance': clinical_significance,
                        'number_submitters': number_submitters,
                        'review_status': review_status,
                        'last_evaluated': str(last_evaluated),
                        'origin': origin,
                        'conditions': conditions})
                json_obj['clinvar'].update({'variant_id': variant_id})
                json_obj = (dict_sweep(unlist(value_convert_to_number(json_obj,
                                               ['chrom', 'omim', 'id', 'orphanet', 'gene',
                                                'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])), [None, '', 'None']))
                yield json_obj
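# A minimal, pure-Python sketch (not from the original source) of the
# GenotypeSet branch above: each measure in a compound-heterozygote set gets
# the shared RCV metadata plus a 'genotypeset' block listing every sibling
# _id. The two toy documents are invented for illustration.
shared_rcv = {'accession': 'RCV000000000', 'clinical_significance': 'Pathogenic'}
obj_list = [
    {'_id': 'chr1:g.100A>T', 'clinvar': {'rcv': {}}},
    {'_id': 'chr1:g.200C>G', 'clinvar': {'rcv': {}}},
]
id_list = [obj['_id'] for obj in obj_list]
for obj in obj_list:
    obj['clinvar']['rcv'].update(shared_rcv)
    obj['clinvar'].update({'genotypeset': {'type': 'CompoundHeterozygote',
                                           'genotype': id_list}})
print(obj_list[0]['clinvar']['genotypeset']['genotype'])
# ['chr1:g.100A>T', 'chr1:g.200C>G']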
def _map_line_to_json(fields):
    assert len(fields) == VALID_COLUMN_NO
    chrom = fields[13]
    chromStart = fields[14]
    chromEnd = fields[15]

    HGVS = None
    cds = fields[18].split(":")
    cds = cds[1]
    replace = re.findall(r'[ATCGMNYR=]+', cds)
    sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', cds)
    ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', cds)
    delete = fields[1] == 'deletion'
    indel = fields[1] == 'indel'
    dup = re.search(r'dup', cds)
    inv = re.search(r'inv|inv[0-9]+|inv[ATCGMNHYR]+', cds)
    if ins:
        delete = None
        indel = None
    elif delete:
        ins = None
        indel = None
    # parse from the vcf file: given the chrom number and chromStart,
    # fetch the matching record and take REF, ALT from it
    if chromStart:
        record = vcf_reader.fetch(chrom, int(chromStart))
    else:
        record = None
    if record:
        REF = record.REF
        ALT = record.ALT
        ALT = ALT[0]
        if record.is_snp and len(ALT) < 2:
            mod = [REF, ALT]
        else:
            mod = ALT
    else:
        return

    if sub and record.is_snp:
        HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, mod[0], mod[1])
    elif ins:
        HGVS = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, mod)
    elif delete:
        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
    elif indel:
        try:
            HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, mod)
        except AttributeError:
            print "ERROR:", fields[1], cds
    elif dup:
        HGVS = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, mod)
    elif inv:
        HGVS = "chr%s:g.%s_%sinv%s" % (chrom, chromStart, chromEnd, mod)
    elif replace:
        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, mod)
    else:
        print('ERROR:', fields[1], cds)

    # load as json data
    if HGVS is None:
        print('None:', fields[1], cds)
        return None

    one_snp_json = {

        "_id": HGVS,
        "clinvar":
            {
                "allele_id": fields[0],
                "hg19":
                    {
                        "chr": fields[13],
                        "start": fields[14],
                        "end": fields[15]
                    },
                "type": fields[1],
                "name": fields[2],
                "gene":
                    {
                        "id": fields[3],
                        "symbol": fields[4]
                    },
                "clinical_significance": fields[5].split(";"),
                "rsid": 'rs' + str(fields[6]),
                "nsv_dbvar": fields[7],
                "rcv_accession": fields[8].split(";"),
                "tested_in_gtr": fields[9],
                "phenotype_id": other_id(fields[10]),
                "origin": fields[11],
                "cytogenic": fields[16],
                "review_status": fields[17],
                "hgvs":
                    {
                        "coding": fields[18],
                        "protein": fields[19]
                    },
                "number_submitters": fields[20],
                "last_evaluated": fields[21],
                "guidelines": fields[22],
                "other_ids": other_id(fields[23]),
                "clinvar_id": fields[24]
            }
        }
    return dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=["-"])
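# A standalone sketch (not from the original source) of the cDNA-change
# classification used above: the part of the HGVS coding string after ':' is
# probed with the same regular expressions to decide whether a record is a
# substitution, insertion or duplication (deletions and indels come from the
# variant-type column instead). The example strings are invented.
import re

for cds in ["NM_000059.3:c.68A>G", "NM_000059.3:c.156_157insA",
            "NM_000059.3:c.426dup", "NM_000059.3:c.1310_1313del"]:
    change = cds.split(":")[1]
    sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', change)
    ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', change)
    dup = re.search(r'dup', change)
    kind = "sub" if sub else "ins" if ins else "dup" if dup else "other"
    print(change, "->", kind)
# c.68A>G -> sub
# c.156_157insA -> ins
# c.426dup -> dup
# c.1310_1313del -> other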
Exemple #27
0
def _map_line_to_json(df):

    chrom = df['chromosome']
    if chrom == 'M':
        chrom = 'MT'

    ref = df["reference_allele"]
    alt = df["tumor_seq_allele1"]
    if alt == '-':
        HGVS = get_hgvs_from_vcf(chrom,
                                 int(df['start_position']) - 1,
                                 'N' + ref,
                                 'N',
                                 mutant_type=False)
    elif ref == '-':
        HGVS = get_hgvs_from_vcf(chrom,
                                 int(df['start_position']) - 1,
                                 'N',
                                 'N' + alt,
                                 mutant_type=False)
    else:
        HGVS = get_hgvs_from_vcf(chrom,
                                 int(df['start_position']),
                                 ref,
                                 alt,
                                 mutant_type=False)

    ccle_depmap = {
        'gene': {
            'id': df['entrez_gene_id'],
            'symbol': df['hugo_symbol']
        },
        'chrom': chrom,
        'hg19': {
            'start': df['start_position'],
            'end': df['end_position']
        },
        'strand': df['strand'],
        'class': df['variant_classification'],
        'vartype': df['variant_type'],
        'ref': df['reference_allele'],
        'tumor_seq_allele1': df['tumor_seq_allele1'],
        'dbsnp': {
            'rsid': df['dbsnp_rs'],
            'val_status': df['dbsnp_val_status']
        },
        'genome_change': df['genome_change'],
        'annotation_transcript': df['annotation_transcript'],
        'tumor_sample_barcode': df['tumor_sample_barcode'],
        'cdna_change': df['cdna_change'],
        'codon_change': df['codon_change'],
        'protein_change': df['protein_change'],
        'isdeleterious': to_boolean(df['isdeleterious'],
                                    true_str=['TRUE'], false_str=['FALSE']),
        'istcgahotspot': to_boolean(df['istcgahotspot'],
                                    true_str=['TRUE'], false_str=['FALSE']),
        'tcgahscnt': df['tcgahscnt'],
        'iscosmichotspot': to_boolean(df['iscosmichotspot'],
                                      true_str=['TRUE'], false_str=['FALSE']),
        'cosmichscnt': df['cosmichscnt'],
        'exac_af': df['exac_af'],
        'wes_ac': df['wes_ac'],
        'sanger': {
            'wes_ac': df['sangerwes_ac'],
            'recalibwes_ac': df['sangerrecalibwes_ac']
        },
        'rnaseq_ac': df['rnaseq_ac'],
        'hc_ac': df['hc_ac'],
        'rd_ac': df['rd_ac'],
        'wgs_ac': df['wgs_ac'],
        'broad_id': df['broad_id']
    }

    ccle_depmap = dict_sweep(ccle_depmap)

    # load as json data
    one_snp_json = {"_id": HGVS, "ccle": ccle_depmap}
    one_snp_json = value_convert_to_number(one_snp_json)
    one_snp_json['ccle']['chrom'] = str(one_snp_json['ccle']['chrom'])
    return one_snp_json
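# A rough, standalone stand-in (hypothetical, not the project's helper) for the
# '-' allele handling above: MAF-style deletions and insertions mark the
# missing allele with '-', and the mapper anchors them on the preceding base
# with an 'N' prefix before building the HGVS id via get_hgvs_from_vcf.
def anchored_alleles(start, ref, alt):
    if alt == '-':           # deletion: anchor on the preceding base
        return start - 1, 'N' + ref, 'N'
    if ref == '-':           # insertion: anchor on the preceding base
        return start - 1, 'N', 'N' + alt
    return start, ref, alt   # plain substitution

print(anchored_alleles(1000, 'A', '-'))   # (999, 'NA', 'N')
print(anchored_alleles(1000, '-', 'G'))   # (999, 'N', 'NG')
print(anchored_alleles(1000, 'C', 'T'))   # (1000, 'C', 'T')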
Exemple #28
0
def _map_line_to_json(fields):
    assert len(fields) == VALID_COLUMN_NO
    chr_info = re.findall(r"[\w']+", fields[17])
    chrom = chr_info[0]  # Mutation GRCh37 genome position
    chromStart = chr_info[1]
    chromEnd = chr_info[2]

    HGVS = None
    cds = fields[13]
    sub = re.search(r'[ATCGMNHKRY]+>[ATCGMNHKRY]+', cds)
    ins = re.search(r'ins[ATCGMN]+|ins[0-9]+', cds)
    delete = cds.find('del') != -1
    del_ins = re.search(r'[0-9]+>[ATCGMN]+', cds)
    comp = re.search(r'[ATCGMN]+', cds)

    if sub:
        HGVS = "chr%s:g.%s%s" % (chrom, chromStart, sub.group())
    elif ins:
        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, ins.group())
    elif delete:
        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
    elif del_ins:
        HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, comp.group())
    # elif comp:
    #    HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, comp.group())
    else:
        HGVS = fields[12]
        print("Error2:", fields[15], cds, fields[17])

    # load as json data
    if HGVS is None:
        return

    one_snp_json = {
        "sorter": fields[17] + fields[13],
        "_id": HGVS,
        "cosmic":
            {
                "gene":
                    {
                        "symbol": fields[0],  # Gene name
                        "id": fields[3],  # HGNC ID
                        "cds_length": fields[2]
                    },
                "transcript": fields[1],  # Accession Number
                "sample":
                    {
                        "name": fields[4],  # Sample name
                        "id": fields[5]  # ID_sample
                    },
                "tumour":
                    {
                        "id": fields[6],  # ID_tumour
                        "primary_site": fields[7],  # Primary site
                        "site_subtype": fields[8],  # Site subtype
                        "primary_histology": fields[9],  # Primary histology
                        "histology_subtype": fields[10],  # Histology subtype
                        "origin": fields[1]
                    },
                "mutation":
                    {
                        "id": "COSM" + fields[12],  # Mutation ID
                        "cds": cds,  # Mutation CDS
                        "aa": fields[14],  # Mutation AA
                        "description": fields[15],  # Mutation Description
                        "zygosity": fields[16],  # Mutation zygosity
                        "somatic_status": fields[21]  # Mutation somatic status
                    },
                "chrom": chrom,
                "hg19":
                   {
                        "start": chromStart,
                        "end": chromEnd
                    },
                "pubmed": fields[22]  # Pubmed_PMID
            }
        }
    return dict_sweep(value_convert_to_number(one_snp_json), vals=[""])
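# A standalone sketch (not from the original source) of the position parsing
# above: the COSMIC 'Mutation GRCh37 genome position' column holds values like
# '17:7577121-7577121' (format assumed here), and re.findall pulls chromosome,
# start and end out in one pass. The sample position and CDS string are
# invented for illustration.
import re

position = "17:7577121-7577121"
cds = "c.817C>T"
chrom, chromStart, chromEnd = re.findall(r"[\w']+", position)
sub = re.search(r'[ATCGMNHKRY]+>[ATCGMNHKRY]+', cds)
hgvs = "chr%s:g.%s%s" % (chrom, chromStart, sub.group()) if sub else None
print(hgvs)  # chr17:g.7577121C>T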
Exemple #29
0
def restructure_dict(dictionary):
    restr_dict = dict()
    d1 = dict()
    pred_properties_dict = {}
    products_list = []
    categories_list = []
    enzymes_list = []
    targets_list = []
    carriers_list = []
    transporters_list = []
    atccode_list = []

    for key, value in iter(dictionary.items()):
        if key == 'name' and value:
            d1[key] = value

        elif key == 'drugbank-id' and value:
            id_list = []
            if isinstance(value, list):
                for ele in value:
                    if isinstance(ele, collections.OrderedDict):
                        assert "@primary" in ele
                        for x, y in iter(ele.items()):
                            if x == '#text':
                                # make sure we always have DB ID as drugbank_id
                                d1.update({'drugbank_id': y})
                                restr_dict['_id'] = y

                    if isinstance(ele, str):
                        key = key.replace('-', '_')
                        id_list.append(ele)
                        d1.update({'accession_number': id_list})

            elif isinstance(value, dict) or isinstance(
                    value, collections.OrderedDict):
                for x, y in iter(value.items()):
                    if x == '#text':
                        key = key.replace('-', '_')
                        id_list.append(y)
                        d1.update({key: id_list})
                        restr_dict['_id'] = y

        elif key == 'description':
            d1.update({'pharmacology': {key: value}})

        elif key == 'groups':
            for i, j in iter(value.items()):
                d1[key] = j

        elif key == 'indication':
            d1['pharmacology'].update({key: value})

        elif key == 'pharmacodynamics':
            d1['pharmacology'].update({key: value})

        elif key == 'mechanism-of-action':
            key = key.replace('-', '_')
            d1['pharmacology'].update({key: value})

        elif key == 'toxicity':
            d1['pharmacology'].update({key: value})

        elif key == 'metabolism':
            d1['pharmacology'].update({key: value})

        elif key == 'absorption':
            d1['pharmacology'].update({key: value})

        elif key == 'half-life':
            key = key.replace('-', '_')
            d1['pharmacology'].update({key: value})

        elif key == 'protein-binding':
            key = key.replace('-', '_')
            d1['pharmacology'].update({key: value})

        elif key == 'route-of-elimination':
            key = key.replace('-', '_')
            d1['pharmacology'].update({key: value})

        elif key == 'volume-of-distribution':
            key = key.replace('-', '_')
            d1['pharmacology'].update({key: value})

        elif key == 'clearance':
            d1['pharmacology'].update({key: value})

        elif key == 'classification' and value:
            # keep the full classification block under 'taxonomy'
            d1.update({'taxonomy': value})

        elif key == 'salts' and value:
            salts_list = []

            for m, n in iter(value.items()):
                if isinstance(n, list):
                    for ele in n:
                        for k in ele:
                            if k == 'name':
                                salts_list.append(ele[k])
                                d1.update({key: salts_list})

                elif isinstance(n, dict) or isinstance(
                        n, collections.OrderedDict):
                    d1.update({key: n['name']})

        elif key == 'synonyms' and value:
            synonym_list = []
            if isinstance(value, collections.OrderedDict):
                for x, y in iter(value.items()):
                    for ele in y:
                        for name in ele:
                            if name == '#text':
                                synonym_list.append(ele[name])
                                d1.update({key: synonym_list})

        elif key == 'products' and value:

            def restr_product_dict(dictionary):
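                # Map one product entry's hyphenated field names to snake_case
                # keys (e.g. 'dosage-form' -> 'dosage_form',
                # 'over-the-counter' -> 'otc', 'dpd-id' -> 'dpd').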
                products_dict = {}
                for x in dictionary:
                    if x == 'name':
                        products_dict[x] = dictionary[x]
                    elif x == 'dosage-form':
                        products_dict['dosage_form'] = dictionary[x]
                    elif x == 'strength':
                        products_dict[x] = dictionary[x]
                    elif x == 'route':
                        products_dict[x] = dictionary[x]
                    elif x == 'over-the-counter':
                        products_dict['otc'] = dictionary[x]
                    elif x == 'generic':
                        products_dict[x] = dictionary[x]
                    elif x == 'ndc-id':
                        products_dict['ndc_id'] = dictionary[x]
                    elif x == 'ndc-product-code':
                        products_dict['ndc_product_code'] = dictionary[x]
                    elif x == 'dpd-id':
                        products_dict['dpd'] = dictionary[x]
                    elif x == 'started-marketing-on':
                        products_dict[x.replace('-', '_')] = dictionary[x]
                    elif x == 'ended-marketing-on':
                        products_dict[x.replace('-', '_')] = dictionary[x]
                    elif x == 'fda-application-number':
                        products_dict[x.replace('-', '_')] = dictionary[x]
                    elif x == 'approved':
                        products_dict[x] = dictionary[x]
                    elif x == 'country':
                        products_dict[x] = dictionary[x]
                    elif x == 'source':
                        products_dict[x] = dictionary[x]
                return products_dict

            for x, y in iter(value.items()):
                if isinstance(y, dict) or isinstance(y,
                                                     collections.OrderedDict):
                    _d = restr_product_dict(y)
                    products_list.append(_d)

                elif isinstance(y, list):
                    for _d in y:
                        products_list.append(restr_product_dict(_d))

        elif key == 'packagers' and value:
            pack_list = []
            for pack in value:
                for pack1 in value[pack]:
                    for s in pack1:
                        if s == 'name' and pack1[s]:
                            pack_list.append(pack1[s])
                            d1.update({key: pack_list})

        elif key == 'manufacturers' and value:
            manuf_list = []
            for x, y in iter(value.items()):
                if isinstance(y, dict) or isinstance(y,
                                                     collections.OrderedDict):
                    for i in y:
                        if i == '#text':
                            manuf_list.append(y[i])
                            d1.update({key: manuf_list})

                if isinstance(y, list):
                    for i in y:
                        for m, n in iter(i.items()):
                            if m == '#text':
                                manuf_list.append(n)
                                d1.update({key: manuf_list})

        elif key == 'categories' and value:
            for x, y in iter(value.items()):
                d1.update({key: y})

        elif key == "snp-effects" and value:
            key = key.replace('-', '_')
            d1['pharmacology'].update({key: value})

        elif key == "snp-adverse-drug-reactions" and value:
            key = key.replace('-', '_')
            d1['pharmacology'].update({key: value})

        elif key == 'affected-organisms' and value:
            for x, y in iter(value.items()):
                key = key.replace('-', '_')
                d1['pharmacology'].update({key: value["affected-organism"]})

        elif key == 'ahfs-codes' and value:
            for x in value:
                key = key.replace('-', '_')
                d1.update({key: value[x]})

        elif key == 'food-interactions' and value:
            food_interaction_list = []
            for x, y in iter(value.items()):
                if isinstance(y, list):
                    key = key.replace('-', '_')
                    for i in y:
                        food_interaction_list.append(i)
                        d1.update({key: food_interaction_list})
                else:
                    d1.update({key: y})

        elif key == 'drug-interactions' and value:
            key = key.replace('-', '_')
            for x, y in iter(value.items()):
                d1.update({key: y})

        elif key == 'sequences' and value:
            for x, y in iter(value.items()):
                for i in y:
                    if i == '@format':
                        str1 = y[i] + '_sequences'
                        d1[str1] = y['#text'].replace('\n', ' ')

        elif key == 'experimental-properties' and value:
            d1_exp_properties = {}

            def restr_properties_dict(dictionary):
                for x, y in iter(dictionary.items()):
                    k1 = dictionary['kind']
                    k1 = k1.lower().replace(' ', '_').replace('-', '_')
                    if k1 == "isoelectric_point":
                        # make sure value are floats, if intervals, then list(float)
                        try:
                            d1_exp_properties[k1] = float(dictionary['value'])
                        except ValueError:
                            # not a float, maybe a range ? "5.6 - 7.6"
                            vals = dictionary['value'].split("-")
                            try:
                                for i, val in enumerate(vals):
                                    vals[i] = float(val)
                                logging.info("Document ID '%s' has a range " % restr_dict["_id"] +
                                             "as isoelectric_point: %s" % vals)
                                d1_exp_properties[k1] = vals
                            except ValueError:
                                # not something we can handle, skip it
                                logging.warning("Document ID '%s' has non-convertible " % restr_dict["_id"] +
                                                "value for isoelectric_point, field ignored: %s" % dictionary['value'])
                                continue
                    else:
                        d1_exp_properties[k1] = dictionary['value']
                return d1_exp_properties

            for ele in value:
                key = key.replace('-', '_')
                if isinstance(value[ele], list):
                    for _d in value[ele]:
                        _d = restr_properties_dict(_d)
                        d1.update({key: _d})

                if isinstance(value[ele], dict) or isinstance(
                        value[ele], collections.OrderedDict):
                    _d = restr_properties_dict(value[ele])
                    d1.update({key: _d})

        elif key == 'calculated-properties' and value:

            def restr_properties_dict(dictionary):
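            # Collect every calculated property under pred_properties_dict
            # (kind -> value) and also promote a few well-known kinds
            # (IUPAC Name, SMILES, InChI/InChIKey, formula, weights) to
            # dedicated fields in d1.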
                for x in dictionary:
                    k = dictionary['kind']
                    k = k.lower().replace(' ', '_').replace('-', '_')
                    pred_properties_dict[k] = dictionary['value']

                    if dictionary['kind'] == "IUPAC Name":
                        d1.update({'iupac': dictionary['value']})
                    elif dictionary['kind'] == "SMILES":
                        d1.update({'smiles': dictionary['value']})
                    elif dictionary['kind'] == "Molecular Formula":
                        d1.update({'formula': dictionary['value']})
                    elif dictionary['kind'] == "InChI":
                        d1.update({'inchi': dictionary['value']})
                    elif dictionary['kind'] == "InChIKey":
                        if dictionary['value'][0:9] == 'InChIKey=':
                            d1.update({'inchi_key': dictionary['value'][9:]})
                        else:
                            d1.update({'inchi_key': dictionary['value']})
                    elif dictionary['kind'] == "Molecular Weight":
                        d1.update({'weight': {'average': dictionary['value']}})
                    elif dictionary['kind'] == "Monoisotopic Weight":
                        d1['weight'].update(
                            {'monoisotopic': dictionary['value']})

            for x, y in iter(value.items()):
                if isinstance(y, list):
                    for _d in y:
                        restr_properties_dict(_d)

                if isinstance(y, (dict, collections.OrderedDict)):
                    restr_properties_dict(y)

        elif key == 'external-identifiers' and value:
            for ele in value['external-identifier']:
                for x in ele:
                    if x == 'resource':
                        if ele[x] == "Drugs Product Database (DPD)":
                            d1['dpd'] = ele['identifier']
                        elif ele[x] == "KEGG Drug":
                            d1['kegg_drug'] = ele['identifier']
                        elif ele[x] == "KEGG Compound":
                            d1['kegg_compound'] = ele['identifier']
                        elif ele[x] == "National Drug Code Directory":
                            d1['ndc_directory'] = ele['identifier']
                        elif ele[x] == "PharmGKB":
                            d1['pharmgkb'] = ele['identifier']
                        elif ele[x] == "UniProtKB":
                            d1['uniprotkb'] = ele['identifier']
                        elif ele[x] == "Wikipedia":
                            d1['wikipedia'] = ele['identifier']
                        elif ele[x] == "ChemSpider":
                            d1['chemspider'] = ele['identifier']
                        elif ele[x] == "ChEBI":
                            d1['chebi'] = ele['identifier']
                        elif ele[x] == "PubChem Compound":
                            d1['pubchem_compound'] = ele['identifier']
                        elif ele[x] == "PubChem Substance":
                            d1['pubchem_substance'] = ele['identifier']
                        elif ele[x] == "UniProtKB":
                            d1['uniprotkb'] = ele['identifier']
                        elif ele[x] == "GenBank":
                            d1['genbank'] = ele['identifier']
                        else:
                            source = ele[x].lower().replace('-', '_').replace(
                                ' ', '_')
                            d1[source] = ele['identifier']

        elif key == 'external-links' and value:
            if isinstance(value['external-link'], list):
                for ele in value['external-link']:
                    try:
                        resource = ele['resource']
                        d1[resource.lower().replace('.', '_')] = ele['url']
                    except (KeyError, TypeError):
                        pass
            else:
                # single entry: 'external-link' is a dict, not a list
                ele = value['external-link']
                try:
                    resource = ele['resource']
                    d1[resource.lower().replace('.', '_')] = ele['url']
                except (KeyError, TypeError):
                    pass

        elif key == 'patents' and value:
            if isinstance(value, dict):
                for x in value:
                    d1.update({key: value[x]})

        elif key == 'international-brands' and value:
            key = key.lower().replace('-', '_')
            d1.update({key: value['international-brand']})

        elif key == 'mixtures' and value:
            d1.update({key: value['mixture']})

        elif key == 'pathways' and value:
            _li = []

            def restr_pathway_dict(dictionary):
                _dict = {}
                for x, y in iter(dictionary.items()):
                    if x == 'smpdb-id':
                        _dict.update({'smpdb_id': y})
                    elif x == 'name':
                        _dict.update({x: y})
                    elif x == 'drugs':
                        _dict.update({x: y['drug']})
                    elif x == 'enzymes':
                        _dict.update({x: y})
                return _dict

            if isinstance(value['pathway'], list):
                for ele in value['pathway']:
                    _dict = restr_pathway_dict(ele)
                    _li.append(_dict)
                    d1.update({key: _li})

            elif isinstance(value['pathway'], dict) or isinstance(
                    value['pathway'], OrderedDict):
                _dict = restr_pathway_dict(value['pathway'])
                d1.update({key: _dict})

        elif key == 'targets' and value:
            if isinstance(value['target'], list):
                for dictionary in value['target']:
                    _dict = restr_protein_dict(dictionary)
                    targets_list.append(_dict)

            elif isinstance(value['target'], dict) or isinstance(
                    value['target'], OrderedDict):
                _dict = restr_protein_dict(value['target'])
                targets_list.append(_dict)

        elif key == 'enzymes' and value:
            if isinstance(value['enzyme'], list):
                for dictionary in value['enzyme']:
                    _dict = restr_protein_dict(dictionary)
                    enzymes_list.append(_dict)

            elif isinstance(value['enzyme'], dict) or isinstance(
                    value['enzyme'], OrderedDict):
                _dict = restr_protein_dict(value['enzyme'])
                enzymes_list.append(_dict)

        elif key == 'transporters' and value:
            if isinstance(value['transporter'], list):
                for dictionary in value['transporter']:
                    _dict = restr_protein_dict(dictionary)
                    transporters_list.append(_dict)

            elif isinstance(value['transporter'], dict) or isinstance(
                    value['transporter'], OrderedDict):
                _dict = restr_protein_dict(value['transporter'])
                transporters_list.append(_dict)

        elif key == 'carriers' and value:
            if isinstance(value['carrier'], list):
                for dictionary in value['carrier']:
                    _dict = restr_protein_dict(dictionary)
                    carriers_list.append(_dict)

            elif isinstance(value['carrier'], dict) or isinstance(
                    value['carrier'], OrderedDict):
                _dict = restr_protein_dict(value['carrier'])
                carriers_list.append(_dict)

        elif key == 'atc-codes' and value:

            def restr_atccode_dict(dictionary):
                for x in dictionary:
                    if x == '@code':
                        atccode_list.append(dictionary[x])
                return atccode_list

            if isinstance(value['atc-code'], list):
                for _d in value['atc-code']:
                    restr_atccode_dict(_d)

            elif isinstance(value['atc-code'], dict) or isinstance(
                    value['atc-code'], OrderedDict):
                restr_atccode_dict(value['atc-code'])

    d1['atc_codes'] = atccode_list
    d1['targets'] = targets_list
    d1['carriers'] = carriers_list
    d1['enzymes'] = enzymes_list
    d1['transporters'] = transporters_list
    d1['predicted_properties'] = pred_properties_dict
    d1['products'] = products_list
    restr_dict['drugbank'] = d1
    restr_dict = unlist(restr_dict)
    restr_dict = dict_sweep(restr_dict,
                            vals=[
                                None, math.inf, "INF", ".", "-", "", "NA",
                                "none", " ", "Not Available", "unknown",
                                "null", "None"
                            ])
    if restr_dict["drugbank"].get(
            'inchi_key') == "IOFPEOPOAMOMBE-MRVPVSSYSA-N":
        print(repr(restr_dict["drugbank"].get("pdb")))
    restr_dict = boolean_convert(restr_dict, [
        "predicted_properties.mddr_like_rule",
        "predicted_properties.bioavailability",
        "predicted_properties.ghose_filter",
        "predicted_properties.rule_of_five", "products.generic",
        "products.otc", "products.approved", "products.pediatric-extension"
    ])
    restr_dict = value_convert_to_number(restr_dict,
                                         skipped_keys=[
                                             "dpd", "chemspider", "chebi",
                                             "pubchem_compound",
                                             "pubchem_substance", "bindingdb"
                                         ])
    return restr_dict
def _map_line_to_json(fields, dbsnp_col):
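    # Convert one pre-split GRASP line (`fields`) into a JSON document keyed by
    # the HGVS ID looked up from the dbsnp collection via the line's rsid;
    # returns None when the rsid has no dbSNP match.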
    assert len(fields) == VALID_COLUMN_NO
    rsid = fields[8]

    # load as json data
    if rsid is None:
        return
    docs = [d for d in dbsnp_col.find({"dbsnp.rsid": rsid})]
    for doc in docs:
        HGVS = doc['_id']
        one_snp_json = {
            "_id": HGVS,
            "grasp": {
                'hg19': {
                    'chr': fields[5],
                    'pos': fields[6]
                },
                'hupfield': fields[1],
                'last_curation_date': fields[2],
                'creation_date': fields[3],
                'srsid': fields[4],
                'publication': {
                    'journal': fields[16],
                    'title': fields[17],
                    'pmid': fields[7],
                    'snpid': fields[8],
                    'location_within_paper': fields[9],
                    'p_value': fields[10],
                    'phenotype': fields[11],
                    'paper_phenotype_description': fields[12],
                    'paper_phenotype_categories': fields[13],
                    'date_pub': fields[14]
                },
                'includes_male_female_only_analyses': fields[18],
                'exclusively_male_female': fields[19],
                'initial_sample_description': fields[20],
                'replication_sample_description': fields[21],
                'platform_snps_passing_qc': fields[22],
                'gwas_ancestry_description': fields[23],
                'discovery': {
                    'total_samples': fields[25],
                    'european': fields[26],
                    'african': fields[27],
                    'east_asian': fields[28],
                    'indian_south_asian': fields[29],
                    'hispanic': fields[30],
                    'native': fields[31],
                    'micronesian': fields[32],
                    'arab_me': fields[33],
                    'mixed': fields[34],
                    'unspecified': fields[35],
                    'filipino': fields[36],
                    'indonesian': fields[37]
                },
                'replication': {
                    'total_samples': fields[38],
                    'european': fields[39],
                    'african': fields[40],
                    'east_asian': fields[41],
                    'indian_south_asian': fields[42],
                    'hispanic': fields[43],
                    'native': fields[44],
                    'micronesian': fields[45],
                    'arab_me': fields[46],
                    'mixed': fields[47],
                    'unspecified': fields[48],
                    'filipino': fields[49],
                    'indonesian': fields[50]
                },
                'in_gene': fields[51],
                'nearest_gene': fields[52],
                'in_lincrna': fields[53],
                'in_mirna': fields[54],
                'in_mirna_bs': fields[55],
                'oreg_anno': fields[61],
                'conserv_pred_tfbs': fields[62],
                'human_enhancer': fields[63],
                'rna_edit': fields[64],
                'polyphen2': fields[65],
                'sift': fields[66],
                'ls_snp': fields[67],
                'uniprot': fields[68],
                'eqtl_meth_metab_study': fields[69]
            }
        }
        return list_split(
            dict_sweep(unlist(value_convert_to_number(one_snp_json)), [""]),
            ",")
Exemple #31
0
def _map_line_to_json(df, version, include_gnomad, index=0):
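    # Convert one dbNSFP row (`df`, a dict-like record indexed by column name)
    # into a JSON document keyed by an HGVS ID built for the requested assembly
    # (`version` should be 'hg19' or 'hg38'); gnomAD sub-documents are attached
    # only when `include_gnomad` is set.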
    # specific variable treatment
    chrom = df["#chr"]
    if chrom == 'M':
        chrom = 'MT'
    # fields[7] in version 2 represents hg18_pos
    hg18_end = df["hg18_pos(1-based)"]
    if hg18_end != ".":
        hg18_end = int(hg18_end)
    # in case of no hg19 position provided, remove the item
    if df["hg19_pos(1-based)"] == '.':
        return None
    else:
        chromStart = int(df["hg19_pos(1-based)"])
        chromEnd = chromStart
    chromStart_38 = int(df["pos(1-based)"])
    ref = df["ref"].upper()
    alt = df["alt"].upper()
    HGVS_19 = "chr%s:g.%d%s>%s" % (chrom, chromStart, ref, alt)
    HGVS_38 = "chr%s:g.%d%s>%s" % (chrom, chromStart_38, ref, alt)
    if version == 'hg19':
        HGVS = HGVS_19
    elif version == 'hg38':
        HGVS = HGVS_38
    else:
        raise ValueError("version must be 'hg19' or 'hg38', got %r" % version)
    siphy_29way_pi = df["SiPhy_29way_pi"]
    if siphy_29way_pi == ".":
        siphy = "."
    else:
        freq = siphy_29way_pi.split(":")
        siphy = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
    gtex_gene = df["GTEx_V6p_gene"].split('|')
    gtex_tissue = df["GTEx_V6p_tissue"].split('|')
    gtex = map(
        dict,
        map(lambda t: zip(('gene', 'tissue'), t), zip(gtex_gene, gtex_tissue)))
    acc = df["Uniprot_acc_Polyphen2"].rstrip().rstrip(';').split(";")
    pos = df["Uniprot_aapos_Polyphen2"].rstrip().rstrip(';').split(";")
    uniprot = map(dict, map(lambda t: zip(('acc', 'pos'), t), zip(acc, pos)))
    provean_score = df["PROVEAN_score"].split(';')
    sift_score = df["SIFT_score"].split(';')
    hdiv_score = df["Polyphen2_HDIV_score"].split(';')
    hvar_score = df["Polyphen2_HVAR_score"].split(';')
    lrt_score = df["LRT_score"].split(';')
    m_cap_score = df["M-CAP_score"].split(';')
    mutationtaster_score = df["MutationTaster_score"].split(';')
    mutationassessor_score = df["MutationAssessor_score"].split(';')
    vest3_score = df["VEST3_score"].split(';')
    metasvm_score = df["MetaSVM_score"].split(';')
    fathmm_score = df["FATHMM_score"].split(';')
    metalr_score = df["MetaLR_score"].split(';')
    revel_score = df["REVEL_score"].split(';')
    '''
    parse mutpred top 5 features
    '''
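    # MutPred_Top5features appears to be formatted as
    # "mechanism1 (P = 0.01); mechanism2 (P = 0.02); ..."; the code below
    # splits on ';', strips the trailing ')' and the leading "P = ", and keeps
    # the five (mechanism, p_val) pairs.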
    def modify_pvalue(pvalue):
        return float(pvalue.strip('P = '))

    mutpred_mechanisms = df["MutPred_Top5features"]
    if mutpred_mechanisms not in ['.', ',', '-']:
        mutpred_mechanisms = mutpred_mechanisms.split(";")
        mutpred_mechanisms = [m.rstrip(")") for m in mutpred_mechanisms]
        mutpred_mechanisms = [i.split(" (") for i in mutpred_mechanisms]
        mutpred_mechanisms = sum(mutpred_mechanisms, [])
        mechanisms = [{
            "mechanism": mutpred_mechanisms[0],
            "p_val": modify_pvalue(mutpred_mechanisms[1])
        }, {
            "mechanism": mutpred_mechanisms[2],
            "p_val": modify_pvalue(mutpred_mechanisms[3])
        }, {
            "mechanism": mutpred_mechanisms[4],
            "p_val": modify_pvalue(mutpred_mechanisms[5])
        }, {
            "mechanism": mutpred_mechanisms[6],
            "p_val": modify_pvalue(mutpred_mechanisms[7])
        }, {
            "mechanism": mutpred_mechanisms[8],
            "p_val": modify_pvalue(mutpred_mechanisms[9])
        }]
    else:
        mechanisms = '.'

    # normalize scores

    def norm(arr):
        return [None if item == '.' else item for item in arr]

    provean_score = norm(provean_score)
    sift_score = norm(sift_score)
    hdiv_score = norm(hdiv_score)
    hvar_score = norm(hvar_score)
    lrt_score = norm(lrt_score)
    m_cap_score = norm(m_cap_score)
    mutationtaster_score = norm(mutationtaster_score)
    mutationassessor_score = norm(mutationassessor_score)
    vest3_score = norm(vest3_score)
    metasvm_score = norm(metasvm_score)
    fathmm_score = norm(fathmm_score)
    metalr_score = norm(metalr_score)
    revel_score = norm(revel_score)

    gnomad = {
        "gnomad_exomes": {
            "ac": df["gnomAD_exomes_AC"],
            "an": df["gnomAD_exomes_AN"],
            "af": df["gnomAD_exomes_AF"],
            "afr_ac": df["gnomAD_exomes_AFR_AC"],
            "afr_af": df["gnomAD_exomes_AFR_AF"],
            "afr_an": df["gnomAD_exomes_AFR_AN"],
            "amr_ac": df["gnomAD_exomes_AMR_AC"],
            "amr_an": df["gnomAD_exomes_AMR_AN"],
            "amr_af": df["gnomAD_exomes_AMR_AF"],
            "asj_ac": df["gnomAD_exomes_ASJ_AC"],
            "asj_an": df["gnomAD_exomes_ASJ_AN"],
            "asj_af": df["gnomAD_exomes_ASJ_AF"],
            "eas_ac": df["gnomAD_exomes_EAS_AC"],
            "eas_af": df["gnomAD_exomes_EAS_AF"],
            "eas_an": df["gnomAD_exomes_EAS_AN"],
            "fin_ac": df["gnomAD_exomes_FIN_AC"],
            "fin_af": df["gnomAD_exomes_FIN_AF"],
            "fin_an": df["gnomAD_exomes_FIN_AN"],
            "nfe_ac": df["gnomAD_exomes_NFE_AC"],
            "nfe_af": df["gnomAD_exomes_NFE_AF"],
            "nfe_an": df["gnomAD_exomes_NFE_AN"],
            "sas_ac": df["gnomAD_exomes_SAS_AC"],
            "sas_af": df["gnomAD_exomes_SAS_AF"],
            "sas_an": df["gnomAD_exomes_SAS_AN"],
            "oth_ac": df["gnomAD_exomes_OTH_AC"],
            "oth_af": df["gnomAD_exomes_OTH_AF"],
            "oth_an": df["gnomAD_exomes_OTH_AN"]
        },
        "gnomad_genomes": {
            "ac": df["gnomAD_genomes_AC"],
            "an": df["gnomAD_genomes_AN"],
            "af": df["gnomAD_genomes_AF"],
            "afr_ac": df["gnomAD_genomes_AFR_AC"],
            "afr_af": df["gnomAD_genomes_AFR_AF"],
            "afr_an": df["gnomAD_genomes_AFR_AN"],
            "amr_ac": df["gnomAD_genomes_AMR_AC"],
            "amr_an": df["gnomAD_genomes_AMR_AN"],
            "amr_af": df["gnomAD_genomes_AMR_AF"],
            "asj_ac": df["gnomAD_genomes_ASJ_AC"],
            "asj_an": df["gnomAD_genomes_ASJ_AN"],
            "asj_af": df["gnomAD_genomes_ASJ_AF"],
            "eas_ac": df["gnomAD_genomes_EAS_AC"],
            "eas_af": df["gnomAD_genomes_EAS_AF"],
            "eas_an": df["gnomAD_genomes_EAS_AN"],
            "fin_ac": df["gnomAD_genomes_FIN_AC"],
            "fin_af": df["gnomAD_genomes_FIN_AF"],
            "fin_an": df["gnomAD_genomes_FIN_AN"],
            "nfe_ac": df["gnomAD_genomes_NFE_AC"],
            "nfe_af": df["gnomAD_genomes_NFE_AF"],
            "nfe_an": df["gnomAD_genomes_NFE_AN"],
            "oth_ac": df["gnomAD_genomes_OTH_AC"],
            "oth_af": df["gnomAD_genomes_OTH_AF"],
            "oth_an": df["gnomAD_genomes_OTH_AN"]
        }
    }

    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "dbnsfp": {
            "rsid": df["rs_dbSNP150"],
            #"rsid_dbSNP144": fields[6],
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "hg18": {
                "start": df["hg18_pos(1-based)"],
                "end": hg18_end
            },
            "hg38": {
                "start": df["pos(1-based)"],
                "end": df["pos(1-based)"]
            },
            "ref": ref,
            "alt": alt,
            "aa": {
                "ref": df["aaref"],
                "alt": df["aaalt"],
                "pos": df["aapos"],
                "refcodon": df["refcodon"],
                "codonpos": df["codonpos"],
                "codon_degeneracy": df["codon_degeneracy"],
            },
            "genename": df["genename"],
            "uniprot": list(uniprot),
            "interpro_domain": df["Interpro_domain"],
            "cds_strand": df["cds_strand"],
            "ancestral_allele": df["Ancestral_allele"],
            #"altaineandertal": fields[17],
            #"denisova": fields[18]
            "ensembl": {
                "geneid": df["Ensembl_geneid"],
                "transcriptid": df["Ensembl_transcriptid"],
                "proteinid": df["Ensembl_proteinid"]
            },
            "sift": {
                "score": sift_score,
                "converted_rankscore": df["SIFT_converted_rankscore"],
                "pred": df["SIFT_pred"]
            },
            "polyphen2": {
                "hdiv": {
                    "score": hdiv_score,
                    "rankscore": df["Polyphen2_HDIV_rankscore"],
                    "pred": df["Polyphen2_HDIV_pred"]
                },
                "hvar": {
                    "score": hvar_score,
                    "rankscore": df["Polyphen2_HVAR_rankscore"],
                    "pred": df["Polyphen2_HVAR_pred"]
                }
            },
            "lrt": {
                "score": lrt_score,
                "converted_rankscore": df["LRT_converted_rankscore"],
                "pred": df["LRT_pred"],
                "omega": df["LRT_Omega"]
            },
            "mutationtaster": {
                "score": mutationtaster_score,
                "converted_rankscore":
                df["MutationTaster_converted_rankscore"],
                "pred": df["MutationTaster_pred"],
                "model": df["MutationTaster_model"],
                "AAE": df["MutationTaster_AAE"]
            },
            "mutationassessor": {
                "score": mutationassessor_score,
                "rankscore": df["MutationAssessor_score_rankscore"],
                "pred": df["MutationAssessor_pred"]
            },
            "fathmm": {
                "score": fathmm_score,
                "rankscore": df["FATHMM_converted_rankscore"],
                "pred": df["FATHMM_pred"]
            },
            "provean": {
                "score": provean_score,
                "rankscore": df["PROVEAN_converted_rankscore"],
                "pred": df["PROVEAN_pred"]
            },
            "vest3": {
                "score": vest3_score,
                "rankscore": df["VEST3_rankscore"],
                "transcriptid": df["Transcript_id_VEST3"],
                "transcriptvar": df["Transcript_var_VEST3"]
            },
            "fathmm-mkl": {
                "coding_score": df["fathmm-MKL_coding_score"],
                "coding_rankscore": df["fathmm-MKL_coding_rankscore"],
                "coding_pred": df["fathmm-MKL_coding_pred"],
                "coding_group": df["fathmm-MKL_coding_group"]
            },
            "eigen": {
                "coding_or_noncoding": df["Eigen_coding_or_noncoding"],
                "raw": df["Eigen-raw"],
                "phred": df["Eigen-phred"]
            },
            "eigen-pc": {
                "raw": df["Eigen-PC-raw"],
                "phred": df["Eigen-PC-phred"],
                "raw_rankscore": df["Eigen-PC-raw_rankscore"]
            },
            "genocanyon": {
                "score": df["GenoCanyon_score"],
                "rankscore": df["GenoCanyon_score_rankscore"]
            },
            "metasvm": {
                "score": metasvm_score,
                "rankscore": df["MetaSVM_rankscore"],
                "pred": df["MetaSVM_pred"]
            },
            "metalr": {
                "score": metalr_score,
                "rankscore": df["MetaLR_rankscore"],
                "pred": df["MetaLR_pred"]
            },
            "reliability_index": df["Reliability_index"],
            "m_cap_score": {
                "score": m_cap_score,
                "rankscore": df["M-CAP_rankscore"],
                "pred": df["M-CAP_pred"]
            },
            "revel": {
                "score": revel_score,
                "rankscore": df["REVEL_rankscore"]
            },
            "mutpred": {
                "score": df["MutPred_score"],
                "rankscore": df["MutPred_rankscore"],
                "accession": df["MutPred_protID"],
                "aa_change": df["MutPred_AAchange"],
                "pred": mechanisms
            },
            "dann": {
                "score": df["DANN_score"],
                "rankscore": df["DANN_rankscore"]
            },
            "gerp++": {
                "nr": df["GERP++_NR"],
                "rs": df["GERP++_RS"],
                "rs_rankscore": df["GERP++_RS_rankscore"]
            },
            "integrated": {
                "fitcons_score": df["integrated_fitCons_score"],
                "fitcons_rankscore": df["integrated_fitCons_score_rankscore"],
                "confidence_value": df["integrated_confidence_value"]
            },
            "gm12878": {
                "fitcons_score": df["GM12878_fitCons_score"],
                "fitcons_rankscore": df["GM12878_fitCons_score_rankscore"],
                "confidence_value": df["GM12878_confidence_value"]
            },
            "h1-hesc": {
                "fitcons_score": df["H1-hESC_fitCons_score"],
                "fitcons_rankscore": df["H1-hESC_fitCons_score_rankscore"],
                "confidence_value": df["H1-hESC_confidence_value"]
            },
            "huvec": {
                "fitcons_score": df["HUVEC_fitCons_score"],
                "fitcons_rankscore": df["HUVEC_fitCons_score_rankscore"],
                "confidence_value": df["HUVEC_confidence_value"]
            },
            "phylo": {
                "p100way": {
                    "vertebrate": df["phyloP100way_vertebrate"],
                    "vertebrate_rankscore":
                    df["phyloP100way_vertebrate_rankscore"]
                },
                "p20way": {
                    "mammalian": df["phyloP20way_mammalian"],
                    "mammalian_rankscore":
                    df["phyloP20way_mammalian_rankscore"]
                }
            },
            "phastcons": {
                "100way": {
                    "vertebrate":
                    df["phastCons100way_vertebrate"],
                    "vertebrate_rankscore":
                    df["phastCons100way_vertebrate_rankscore"]
                },
                "20way": {
                    "mammalian": df["phastCons20way_mammalian"],
                    "mammalian_rankscore":
                    df["phastCons20way_mammalian_rankscore"]
                }
            },
            "siphy_29way": {
                "pi": siphy,
                "logodds": df["SiPhy_29way_logOdds"],
                "logodds_rankscore": df["SiPhy_29way_logOdds_rankscore"]
            },
            "1000gp3": {
                "ac": df["1000Gp3_AC"],
                "af": df["1000Gp3_AF"],
                "afr_ac": df["1000Gp3_AFR_AC"],
                "afr_af": df["1000Gp3_AFR_AF"],
                "eur_ac": df["1000Gp3_EUR_AC"],
                "eur_af": df["1000Gp3_EUR_AF"],
                "amr_ac": df["1000Gp3_AMR_AC"],
                "amr_af": df["1000Gp3_AMR_AF"],
                "eas_ac": df["1000Gp3_EAS_AC"],
                "eas_af": df["1000Gp3_EAS_AF"],
                "sas_ac": df["1000Gp3_SAS_AC"],
                "sas_af": df["1000Gp3_SAS_AF"]
            },
            "twinsuk": {
                "ac": df["TWINSUK_AC"],
                "af": df["TWINSUK_AF"]
            },
            "alspac": {
                "ac": df["ALSPAC_AC"],
                "af": df["ALSPAC_AF"]
            },
            "esp6500": {
                "aa_ac": df["ESP6500_AA_AC"],
                "aa_af": df["ESP6500_AA_AF"],
                "ea_ac": df["ESP6500_EA_AC"],
                "ea_af": df["ESP6500_EA_AF"]
            },
            "exac": {
                "ac": df["ExAC_AC"],
                "af": df["ExAC_AF"],
                "adj_ac": df["ExAC_Adj_AC"],
                "adj_af": df["ExAC_Adj_AF"],
                "afr_ac": df["ExAC_AFR_AC"],
                "afr_af": df["ExAC_AFR_AF"],
                "amr_ac": df["ExAC_AMR_AC"],
                "amr_af": df["ExAC_AMR_AF"],
                "eas_ac": df["ExAC_EAS_AC"],
                "eas_af": df["ExAC_EAS_AF"],
                "fin_ac": df["ExAC_FIN_AC"],
                "fin_af": df["ExAC_FIN_AF"],
                "nfe_ac": df["ExAC_NFE_AC"],
                "nfe_af": df["ExAC_NFE_AF"],
                "sas_ac": df["ExAC_SAS_AC"],
                "sas_af": df["ExAC_SAS_AF"]
            },
            "exac_nontcga": {
                "ac": df["ExAC_nonTCGA_AC"],
                "af": df["ExAC_nonTCGA_AF"],
                "adj_ac": df["ExAC_nonTCGA_Adj_AC"],
                "adj_af": df["ExAC_nonTCGA_Adj_AF"],
                "afr_ac": df["ExAC_nonTCGA_AFR_AC"],
                "afr_af": df["ExAC_nonTCGA_AFR_AF"],
                "amr_ac": df["ExAC_nonTCGA_AMR_AC"],
                "amr_af": df["ExAC_nonTCGA_AMR_AF"],
                "eas_ac": df["ExAC_nonTCGA_EAS_AC"],
                "eas_af": df["ExAC_nonTCGA_EAS_AF"],
                "fin_ac": df["ExAC_nonTCGA_FIN_AC"],
                "fin_af": df["ExAC_nonTCGA_FIN_AF"],
                "nfe_ac": df["ExAC_nonTCGA_NFE_AC"],
                "nfe_af": df["ExAC_nonTCGA_NFE_AF"],
                "sas_ac": df["ExAC_nonTCGA_SAS_AC"],
                "sas_af": df["ExAC_nonTCGA_SAS_AF"]
            },
            "exac_nonpsych": {
                "ac": df["ExAC_nonpsych_AC"],
                "af": df["ExAC_nonpsych_AF"],
                "adj_ac": df["ExAC_nonpsych_Adj_AC"],
                "adj_af": df["ExAC_nonpsych_Adj_AF"],
                "afr_ac": df["ExAC_nonpsych_AFR_AC"],
                "afr_af": df["ExAC_nonpsych_AFR_AF"],
                "amr_ac": df["ExAC_nonpsych_AMR_AC"],
                "amr_af": df["ExAC_nonpsych_AMR_AF"],
                "eas_ac": df["ExAC_nonpsych_EAS_AC"],
                "eas_af": df["ExAC_nonpsych_EAS_AF"],
                "fin_ac": df["ExAC_nonpsych_FIN_AC"],
                "fin_af": df["ExAC_nonpsych_FIN_AF"],
                "nfe_ac": df["ExAC_nonpsych_NFE_AC"],
                "nfe_af": df["ExAC_nonpsych_NFE_AF"],
                "sas_ac": df["ExAC_nonpsych_SAS_AC"],
                "sas_af": df["ExAC_nonpsych_SAS_AF"]
            },
            "clinvar": {
                "rs":
                df["clinvar_rs"],
                "clinsig":
                list(
                    map(int, [
                        i for i in df["clinvar_clnsig"].split("|") if i != "."
                    ])),
                "trait":
                [i for i in df["clinvar_trait"].split("|") if i != "."],
                "golden_stars":
                list(
                    map(int, [
                        i for i in df["clinvar_golden_stars"].split("|")
                        if i != "."
                    ]))
            },
            "gtex": list(gtex)
        }
    }
    if include_gnomad:
        one_snp_json['dbnsfp'].update(gnomad)
    one_snp_json = list_split(
        dict_sweep(unlist(value_convert_to_number(one_snp_json)),
                   vals=[".", '-', None]), ";")
    one_snp_json["dbnsfp"]["chrom"] = str(one_snp_json["dbnsfp"]["chrom"])
    return one_snp_json
Exemple #32
0
def _map_line_to_json(cp, hg19):
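    # Convert one ClinVarSet record (`cp`, presumably generateDS-style XML
    # bindings exposing ReferenceClinVarAssertion / ClinVarAssertion) into
    # variant documents: one JSON object is yielded per Measure, each carrying
    # the shared RCV-level annotations (significance, review status, origin,
    # conditions, ...).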
    try:
        clinical_significance = cp.ReferenceClinVarAssertion.\
            ClinicalSignificance.Description
    except:
        clinical_significance = None
    rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc
    try:
        review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
            ReviewStatus
    except:
        review_status = None
    try:
        last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
            DateLastEvaluated
    except:
        last_evaluated = None

    number_submitters = len(cp.ClinVarAssertion)
    # some items in clinvar_xml doesn't have origin information
    try:
        origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
    except:
        origin = None
    conditions = []
    for _trait in cp.ReferenceClinVarAssertion.TraitSet.Trait:
        synonyms = []
        conditions_name = ''
        for name in _trait.Name:
            if name.ElementValue.Type == 'Alternate':
                synonyms.append(name.ElementValue.get_valueOf_())
            if name.ElementValue.Type == 'Preferred':
                conditions_name += name.ElementValue.get_valueOf_()
        identifiers = {}
        for item in _trait.XRef:
            if item.DB == 'Human Phenotype Ontology':
                key = 'Human_Phenotype_Ontology'
            else:
                key = item.DB
            identifiers[key.lower()] = item.ID
        for symbol in _trait.Symbol:
            if symbol.ElementValue.Type == 'Preferred':
                conditions_name += ' (' + symbol.ElementValue.get_valueOf_() + ')'
        age_of_onset = ''
        for _set in _trait.AttributeSet:
            if _set.Attribute.Type == 'age of onset':
                age_of_onset = _set.Attribute.get_valueOf_()
        conditions.append({
            "name": conditions_name,
            "synonyms": synonyms,
            "identifiers": identifiers,
            "age_of_onset": age_of_onset
        })

    try:
        genotypeset = cp.ReferenceClinVarAssertion.GenotypeSet
    except:
        genotypeset = None
    if genotypeset:
        obj_list = []
        id_list = []
        for _set in cp.ReferenceClinVarAssertion.GenotypeSet.MeasureSet:
            variant_id = _set.ID
            for _measure in _set.Measure:
                json_obj = parse_measure(_measure, hg19=hg19)
                if json_obj:
                    json_obj['clinvar']['rcv'].update({
                        'accession': rcv_accession,
                        'clinical_significance': clinical_significance,
                        'number_submitters': number_submitters,
                        'review_status': review_status,
                        'last_evaluated': str(last_evaluated),
                        'origin': origin,
                        'conditions': conditions
                    })
                    json_obj['clinvar'].update({'variant_id': variant_id})
                    json_obj = (dict_sweep(
                        unlist(
                            value_convert_to_number(json_obj, [
                                'chrom', 'omim', 'id', 'orphanet', 'gene',
                                'rettbase_(cdkl5)', 'cosmic', 'dbrbc'
                            ])), [None, '', 'None']))
                    obj_list.append(json_obj)
                    id_list.append(json_obj['_id'])
        for _obj in obj_list:
            _obj['clinvar'].update({
                'genotypeset': {
                    'type': 'CompoundHeterozygote',
                    'genotype': id_list
                }
            })
            yield _obj
    else:
        variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID
        for _measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure:
            json_obj = parse_measure(_measure, hg19=hg19)
            if json_obj:
                json_obj['clinvar']['rcv'].update({
                    'accession': rcv_accession,
                    'clinical_significance': clinical_significance,
                    'number_submitters': number_submitters,
                    'review_status': review_status,
                    'last_evaluated': str(last_evaluated),
                    'origin': origin,
                    'conditions': conditions
                })
                json_obj['clinvar'].update({'variant_id': variant_id})
                json_obj = (dict_sweep(
                    unlist(
                        value_convert_to_number(json_obj, [
                            'chrom', 'omim', 'id', 'orphanet', 'gene',
                            'rettbase_(cdkl5)', 'cosmic', 'dbrbc'
                        ])), [None, '', 'None']))
                yield json_obj
Exemple #33
0
def _map_line_to_json(fields,dbsnp_col):
    assert len(fields) == VALID_COLUMN_NO
    rsid = fields[8]

    # load as json data
    if rsid is None:
        return
    docs = [d for d in dbsnp_col.find({"dbsnp.rsid":rsid})]
    for doc in docs:
        HGVS = doc['_id']
        one_snp_json = {

            "_id": HGVS,
            "grasp":
                {
                    'hg19':
                        {
                            'chr': fields[5],
                            'pos': fields[6]
                        },
                    'hupfield': fields[1],
                    'last_curation_date': fields[2],
                    'creation_date': fields[3],
                    'srsid': fields[4],
                    'publication':
                        {
                            'journal': fields[16],
                            'title': fields[17],
                            'pmid': fields[7],
                            'snpid': fields[8],
                            'location_within_paper': fields[9],
                            'p_value': fields[10],
                            'phenotype': fields[11],
                            'paper_phenotype_description': fields[12],
                            'paper_phenotype_categories': fields[13],
                            'date_pub': fields[14]
                        },
                    'includes_male_female_only_analyses': fields[18],
                    'exclusively_male_female': fields[19],
                    'initial_sample_description': fields[20],
                    'replication_sample_description': fields[21],
                    'platform_snps_passing_qc': fields[22],
                    'gwas_ancestry_description': fields[23],
                    'discovery':
                        {
                            'total_samples': fields[25],
                            'european': fields[26],
                            'african': fields[27],
                            'east_asian': fields[28],
                            'indian_south_asian': fields[29],
                            'hispanic': fields[30],
                            'native': fields[31],
                            'micronesian': fields[32],
                            'arab_me': fields[33],
                            'mixed': fields[34],
                            'unspecified': fields[35],
                            'filipino': fields[36],
                            'indonesian': fields[37]
                        },
                    'replication':
                        {
                            'total_samples': fields[38],
                            'european': fields[39],
                            'african': fields[40],
                            'east_asian': fields[41],
                            'indian_south_asian': fields[42],
                            'hispanic': fields[43],
                            'native': fields[44],
                            'micronesian': fields[45],
                            'arab_me': fields[46],
                            'mixed': fields[47],
                            'unspecified': fields[48],
                            'filipino': fields[49],
                            'indonesian': fields[50]
                        },
                    'in_gene': fields[51],
                    'nearest_gene': fields[52],
                    'in_lincrna': fields[53],
                    'in_mirna': fields[54],
                    'in_mirna_bs': fields[55],
                    'oreg_anno': fields[61],
                    'conserv_pred_tfbs': fields[62],
                    'human_enhancer': fields[63],
                    'rna_edit': fields[64],
                    'polyphen2': fields[65],
                    'sift': fields[66],
                    'ls_snp': fields[67],
                    'uniprot': fields[68],
                    'eqtl_meth_metab_study': fields[69]
                }
            }
        return list_split(dict_sweep(unlist(value_convert_to_number(one_snp_json)), [""]), ",")
def restructure_dict(dictionary):
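    # Flatten one parsed PubChem PC-Compound record into a document keyed by
    # CID ('_id'), collecting computed properties (IUPAC name, InChI, SMILES,
    # weights, atom/bond counts, ...) under a 'pubchem' sub-document.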
    smile_dict = dict()
    iupac_dict = dict()
    d = dict()

    for key, value in iter(dictionary.items()):
        if key == "PC-Compound_id":
            for cnt in value:
                for m, n in iter(value[cnt].items()):
                    for x, y in iter(n.items()):
                        d["cid"] = y

        elif key == "PC-Compound_charge":
            d["formal_charge"] = dictionary[key]

        elif key == "PC-Compound_props":
            for cnt in value:
                for ele in value[cnt]:
                    for x, y in iter(ele.items()):
                        if x == "PC-InfoData_urn":
                            for i, j in iter(y.items()):
                                if i == "PC-Urn":
                                    val = ele["PC-InfoData_value"]
                                    for z in val:
                                        val1 = val[z]
                                    for k, l in iter(j.items()):
                                        if l == "Hydrogen Bond Acceptor":
                                            d["hydrogen_bond_acceptor_count"] = val1

                                        elif l == "Hydrogen Bond Donor":
                                            d["hydrogen_bond_donor_count"] = val1

                                        elif l == "Rotatable Bond":
                                            d["rotatable_bond_count"] = val1

                                        elif l == "IUPAC Name":
                                            IUPAC = j["PC-Urn_name"]
                                            IUPAC = IUPAC.lower()
                                            iupac_dict[IUPAC] = val1
                                            d["iupac"] = iupac_dict
                                            iupac_dict = {}

                                        elif l == "InChI":
                                            d["inchi"] = val1
                                            break

                                        elif l == "InChIKey":
                                            d["inchi_key"] = val1
                                            break

                                        elif l == "Log P":
                                            d["xlogp"] = val1

                                        elif l == "Mass":
                                            d["exact_mass"] = val1

                                        elif l == "Molecular Formula":
                                            d["molecular_formula"] = val1

                                        elif l == "Molecular Weight":
                                            d["molecular_weight"] = val1

                                        elif l == "SMILES":
                                            smiles = j["PC-Urn_name"]
                                            smiles = smiles.lower()
                                            smile_dict[smiles] = val1
                                            d["smiles"] = smile_dict
                                            smile_dict = {}

                                        elif l == "Topological":
                                            d["topological_polar_surface_area"] = val1

                                        elif l == "Weight":
                                            d["monoisotopic_weight"] = val1

                                        elif l == "Compound Complexity":
                                            d["complexity"] = val1

        elif key == "PC-Compound_count":
            for cnt in value:
                for x, y in iter(value[cnt].items()):
                    if x == "PC-Count_heavy-atom":
                        d["heavy_atom_count"] = y

                    elif x == "PC-Count_atom-chiral":
                        d["chiral_atom_count"] = y

                    elif x == "PC-Count_atom-chiral-def":
                        d["defined_atom_stereocenter_count"] = y

                    elif x == "PC-Count_atom-chiral-undef":
                        d["undefined_atom_stereocenter_count"] = y

                    elif x == "PC-Count_bond-chiral":
                        d["chiral_bond_count"] = y

                    elif x == "PC-Count_bond-chiral-def":
                        d["defined_bond_stereocenter_count"] = y

                    elif x == "PC-Count_bond-chiral-undef":
                        d["undefined_bond_stereocenter_count"] = y

                    elif x == "PC-Count_isotope-atom":
                        d["isotope_atom_count"] = y

                    elif x == "PC-Count_covalent-unit":
                        d["covalently-bonded_unit_count"] = y

                    elif x == "PC-Count_tautomers":
                        d["tautomers_count"] = y

    restr_dict = {}
    restr_dict['_id'] = d["cid"]
    restr_dict["pubchem"] = d
    restr_dict = value_convert_to_number(restr_dict)
    return restr_dict
Exemple #35
0
def _map_line_to_json(df):
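    # Convert one BioMuta row into a JSON document keyed by the HGVS ID built
    # from chrom/pos/ref/alt; '-' placeholders are cleaned out, and disease
    # (DO), tissue (UBERON) and PMID annotations are kept under 'biomuta'.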
    # specific variable treatment
    chrom = df["chr_id"]
    pos = df["chr_pos"]
    if chrom == 'M':
        chrom = 'MT'

    ref = df["ref_nt"]
    alt = df["alt_nt"]

    HGVS = get_hgvs_from_vcf(chrom, int(pos), ref, alt, mutant_type=False)
    
    transcript_id = clean_data(df["transcript_id"], ("-",))
    peptide_id = clean_data(df["peptide_id"], ("-",))
    uniprot_ac = clean_data(df["uniprot_ac"], ("-",))
    refseq_ac = clean_data(df["refseq_ac"], ("-",))
    cds_pos = clean_data(df["cds_pos"], ("-",))
    pep_pos = clean_data(df["pep_pos"], ("-",))
    uniprot_pos = clean_data(df["uniprot_pos"], ("-",))
    ref_aa = clean_data(df["ref_aa"], ("-",))
    alt_aa = clean_data(df["alt_aa"], ("-",))
    mut_freq = clean_data(df["mut_freq"], ("-",))
    data_src = clean_data(df["data_src"], ("-",))
    do_id = clean_data(df["do_id"], ("-",))
    do_name_id, do_name = do_name_split(df["do_name"])
    if do_id and do_name_id:
        assert do_id == do_name_id, "do_id mismatch!"

    uberon_id = to_list(df["uberon_id"])
    gene_name = clean_data(df["gene_name"], ("-",))
    pmid_list = to_list(df["pmid_list"])
    site_prd = site_prd_parser(clean_data(df["site_prd"], ("-",)))
    site_ann = site_ann_parser(df["site_ann"])


    # load as json data
    one_snp_json = {
        "_id": HGVS,
        "biomuta": {
            'chrom': chrom,
            'pos': pos,
            'ref': ref,
            'alt': alt,
            'transcript_id': transcript_id,
            'peptide_id': peptide_id,
            'uniprot_ac': uniprot_ac,
            'refseq_ac': refseq_ac,
            'cds_pos': cds_pos,
            'pep_pos': pep_pos,
            'uniprot_pos': uniprot_pos,
            'ref_aa': ref_aa,
            'alt_aa': alt_aa,
            'mut_freq': mut_freq,
            'data_src': data_src,
            'do_id': {
                        "do_id" : do_id,
                        "do_name" : do_name
                        },
            'uberon_id': uberon_id,
            'gene_name': gene_name,
            'pmid': pmid_list,
        }
    }
    if site_ann:
        for dic in site_ann:
            one_snp_json["biomuta"].update(dic)

    if site_prd:
        one_snp_json["biomuta"].update(site_prd) 
    
    one_snp_json = value_convert_to_number(one_snp_json)
    one_snp_json['biomuta']['chrom'] = str(one_snp_json['biomuta']['chrom'])
    one_snp_json['biomuta']['do_id']['do_id'] = str(one_snp_json['biomuta']['do_id']['do_id'])
    return one_snp_json
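The helpers used above (clean_data, do_name_split, to_list, site_prd_parser, site_ann_parser) are defined elsewhere in this data plugin. As a rough, hypothetical sketch only (not the author's implementation), clean_data presumably normalizes placeholder values such as "-" to None, given how it is called with a tuple of placeholders:

def clean_data(value, placeholders=("-",)):
    # Hypothetical stand-in for the real helper: drop placeholder/empty values.
    if value is None:
        return None
    if isinstance(value, str):
        value = value.strip()
    return None if value == "" or value in placeholders else value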
Example #36
0
def load_data(data_folder):
    input_file = os.path.join(data_folder, "alternative")
    # input_file = os.path.join(data_folder, "gwas_catalog_v1.0.2-associations_e96_r2019-04-21.tsv")
    assert os.path.exists(
        input_file), "Can't find input file '%s'" % input_file
    with open_anyfile(input_file) as in_f:

        # Remove duplicated lines if any
        header = next(in_f).strip().split('\t')
        lines = set(list(in_f))
        reader = DictReader(lines, fieldnames=header, delimiter='\t')
        results = defaultdict(list)
        rsid_list = []
        for row in reader:
            rsids, _ = parse_separator_and_snps(row)
            if rsids:
                rsid_list += rsids
        hgvs_rsid_dict = batch_query_hgvs_from_rsid(rsid_list)
        reader = DictReader(lines, fieldnames=header, delimiter='\t')
        for row in reader:
            variant = {}
            HGVS = False
            snps, separator = parse_separator_and_snps(row)
            if not snps:
                continue
            region = reorganize_field(row["REGION"], separator, len(snps))
            chrom = reorganize_field(row["CHR_ID"], separator, len(snps))
            genes = reorganize_field(row["REPORTED GENE(S)"], separator,
                                     len(snps))
            position = reorganize_field(row["CHR_POS"], separator, len(snps))
            context = reorganize_field(row["CONTEXT"], separator, len(snps))
            for i, _snp in enumerate(snps):
                variant = {}
                if _snp in hgvs_rsid_dict:
                    variant["_id"] = hgvs_rsid_dict[_snp]
                else:
                    continue
                variant['gwascatalog'] = {
                    "associations": {
                        'efo': {},
                        'study': {}
                    }
                }
                if not HGVS:
                    variant["gwascatalog"]["rsid"] = _snp
                variant['gwascatalog']['associations']['snps'] = snps
                variant['gwascatalog']['associations']['pubmed'] = int(
                    row['PUBMEDID'])
                variant['gwascatalog']['associations']['date_added'] = row[
                    'DATE ADDED TO CATALOG']
                variant['gwascatalog']['associations']['study']['name'] = row[
                    'STUDY']
                variant['gwascatalog']['associations']['trait'] = row[
                    'DISEASE/TRAIT']
                variant['gwascatalog'][
                    'region'] = region[i] if region else None
                if not chrom:
                    # pad so the chrom[i] lookups below stay in range
                    chrom = [''] * len(snps)
                elif str(chrom[i]).lower() not in CHROM_LIST:
                    chrom[i] = ''
                variant['gwascatalog']['chrom'] = chrom[i] if chrom else None
                variant['gwascatalog'][
                    'pos'] = position[i] if position else None
                variant['gwascatalog']['gene'] = genes[i].split(',') if (
                    genes and genes[i]) else None
                variant['gwascatalog'][
                    'context'] = context[i] if context else None
                variant['gwascatalog']['associations']['raf'] = str2float(
                    row['RISK ALLELE FREQUENCY'])
                variant['gwascatalog']['associations']['pval'] = str2float(
                    row['P-VALUE'])
                # variant['gwascatalog']['p_val_mlog'] = str2float(row['PVALUE_MLOG'])
                variant['gwascatalog']['associations']['study'][
                    'platform'] = row['PLATFORM [SNPS PASSING QC]']
                variant['gwascatalog']['associations']['study'][
                    'accession'] = row['STUDY ACCESSION']
                variant['gwascatalog']['associations']['efo']['name'] = row[
                    'MAPPED_TRAIT'].split(',')
                variant['gwascatalog']['associations']['efo']['id'] = [
                    _item.split('/')[-1].replace('_', ':')
                    for _item in row['MAPPED_TRAIT_URI'].split(',')
                ]
                variant = dict_sweep(unlist(
                    value_convert_to_number(variant, skipped_keys=['chrom'])),
                                     vals=[[], {}, None, '', 'NR'])
                results[variant["_id"]].append(variant)
        for v in results.values():
            if len(v) == 1:
                yield v[0]
            else:
                doc = {'_id': v[0]['_id'], 'gwascatalog': {'associations': []}}
                for _item in ['gene', 'region', 'pos', 'context', 'rsid']:
                    if _item in v[0]['gwascatalog']:
                        doc['gwascatalog'][_item] = v[0]['gwascatalog'][_item]
                doc['gwascatalog']['associations'] = [
                    i['gwascatalog']['associations'] for i in v
                ]
                yield doc
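str2float, parse_separator_and_snps and reorganize_field are helpers defined elsewhere in this plugin. A minimal, hypothetical stand-in for str2float is sketched below; the real helper may handle more catalog-specific quirks, but the idea is to tolerate non-numeric entries (e.g. "NR") in the P-VALUE and RISK ALLELE FREQUENCY columns:

def str2float(value):
    # Hypothetical sketch: best-effort float conversion, None on failure.
    try:
        return float(str(value).strip())
    except (TypeError, ValueError):
        return None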
Example #37
0
def restructure_dict(dictionary):
    smile_dict = dict()
    iupac_dict = dict()
    d = dict()

    for key, value in iter(dictionary.items()):
        if key == "PC-Compound_id":
            for cnt in value:
                for m, n in iter(value[cnt].items()):
                    for x, y in iter(n.items()):
                        d["cid"] = y

        elif key == "PC-Compound_charge":
            d["formal_charge"] = dictionary[key]

        elif key == "PC-Compound_props":
            for cnt in value:
                for ele in value[cnt]:
                    for x, y in iter(ele.items()):
                        if x == "PC-InfoData_urn":
                            for i, j in iter(y.items()):
                                if i == "PC-Urn":
                                    val = ele["PC-InfoData_value"]
                                    for z in val:
                                        val1 = val[z]
                                    for k, l in iter(j.items()):
                                        if l == "Hydrogen Bond Acceptor":
                                            d["hydrogen_bond_acceptor_count"] = val1

                                        elif l == "Hydrogen Bond Donor":
                                            d["hydrogen_bond_donor_count"] = val1

                                        elif l == "Rotatable Bond":
                                            d["rotatable_bond_count"] = val1

                                        elif l == "IUPAC Name":
                                            IUPAC = j["PC-Urn_name"]
                                            IUPAC = IUPAC.lower()
                                            iupac_dict[IUPAC] = val1
                                            d["iupac"] = iupac_dict
                                            iupac_dict = {}

                                        elif l == "InChI":
                                            d["inchi"] = val1
                                            break

                                        elif l == "InChIKey":
                                            d["inchi_key"] = val1
                                            break

                                        elif l == "Log P":
                                            d["xlogp"] = val1

                                        elif l == "Mass":
                                            d["exact_mass"] = val1

                                        elif l == "Molecular Formula":
                                            d["molecular_formula"] = val1

                                        elif l == "Molecular Weight":
                                            d["molecular_weight"] = val1

                                        elif l == "SMILES":
                                            smiles = j["PC-Urn_name"]
                                            smiles = smiles.lower()
                                            smile_dict[smiles] = val1
                                            d["smiles"] = smile_dict
                                            smile_dict = {}

                                        elif l == "Topological":
                                            d["topological_polar_surface_area"] = val1

                                        elif l == "Weight":
                                            d["monoisotopic_weight"] = val1

                                        elif l == "Compound Complexity":
                                            d["complexity"] = val1

        elif key == "PC-Compound_count":
            for cnt in value:
                for x, y in iter(value[cnt].items()):
                    if x == "PC-Count_heavy-atom":
                        d["heavy_atom_count"] = y

                    elif x == "PC-Count_atom-chiral":
                        d["chiral_atom_count"] = y

                    elif x == "PC-Count_atom-chiral-def":
                        d["defined_atom_stereocenter_count"] = y

                    elif x == "PC-Count_atom-chiral-undef":
                        d["undefined_atom_stereocenter_count"] = y

                    elif x == "PC-Count_bond-chiral":
                        d["chiral_bond_count"] = y

                    elif x == "PC-Count_bond-chiral-def":
                        d["defined_bond_stereocenter_count"] = y

                    elif x == "PC-Count_bond-chiral-undef":
                        d["undefined_bond_stereocenter_count"] = y

                    elif x == "PC-Count_isotope-atom":
                        d["isotope_atom_count"] = y

                    elif x == "PC-Count_covalent-unit":
                        d["covalently-bonded_unit_count"] = y

                    elif x == "PC-Count_tautomers":
                        d["tautomers_count"] = y

    restr_dict = {}
    restr_dict['_id'] = str(d["cid"])
    restr_dict["pubchem"] = d
    restr_dict = value_convert_to_number(restr_dict, skipped_keys=["_id"])
    return restr_dict
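For context, below is a minimal hand-built example of the kind of PC-Compound dictionary restructure_dict expects, assuming the PubChem XML record has already been parsed into nested dicts and lists (e.g. with xmltodict). The key names follow the PC-Compound schema the code walks; the values are made up:

sample = {
    "PC-Compound_id": {
        "PC-CompoundType": {
            "PC-CompoundType_id": {"PC-CompoundType_id_cid": "2244"}
        }
    },
    "PC-Compound_charge": "0",
    "PC-Compound_props": {
        "PC-InfoData": [
            {
                "PC-InfoData_urn": {
                    "PC-Urn": {"PC-Urn_label": "Molecular Formula"}
                },
                "PC-InfoData_value": {"PC-InfoData_value_sval": "C9H8O4"},
            }
        ]
    },
    "PC-Compound_count": {"PC-Count": {"PC-Count_heavy-atom": "13"}},
}

doc = restructure_dict(sample)
# doc comes back roughly as:
# {"_id": "2244",
#  "pubchem": {"cid": 2244, "formal_charge": 0,
#              "molecular_formula": "C9H8O4", "heavy_atom_count": 13}}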
Example #38
0
def _map_line_to_json(doc_key, item):
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    _filter = item.FILTER
    # INFO fields that may be missing from some records
    baseqranksum = info.get('BaseQRankSum')
    clippingranksum = info.get('ClippingRankSum')
    mqranksum = info.get('MQRankSum')
    readposranksum = info.get('ReadPosRankSum')
    qd = info.get('QD')
    inbreedingcoeff = info.get('InbreedingCoeff')
    # convert vcf object to string
    item.ALT = [str(alt) for alt in item.ALT]
    # if multiallelic, put all variants as a list in multi-allelic field
    hgvs_list = None
    if len(item.ALT) > 1:
        hgvs_list = [
            get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=False)
            for alt in item.ALT
        ]
    for i, alt in enumerate(item.ALT):
        (HGVS, var_type) = get_hgvs_from_vcf(chrom,
                                             chromStart,
                                             ref,
                                             alt,
                                             mutant_type=True)
        if HGVS is None:
            return
        assert len(item.ALT) == len(
            info['AC']
        ), "Expecting length of item.ALT= length of info.AC, but not for %s" % (
            HGVS)
        assert len(item.ALT) == len(
            info['AF']
        ), "Expecting length of item.ALT= length of info.AF, but not for %s" % (
            HGVS)
        assert len(item.ALT) == len(
            info['Hom_AFR']
        ), "Expecting length of item.ALT= length of HOM_AFR, but not for %s" % (
            HGVS)
        one_snp_json = {
            "_id": HGVS,
            doc_key: {
                "chrom": chrom,
                "pos": chromStart,
                "filter": _filter,
                "multi-allelic": hgvs_list,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {
                    "ac": info['AC'][i],
                    "ac_afr": info['AC_AFR'][i],
                    "ac_amr": info['AC_AMR'][i],
                    "ac_adj": info['AC_Adj'][i],
                    "ac_eas": info['AC_EAS'][i],
                    "ac_fin": info['AC_FIN'][i],
                    "ac_het": info['AC_Het'][i],
                    "ac_hom": info['AC_Hom'][i],
                    "ac_nfe": info['AC_NFE'][i],
                    "ac_oth": info['AC_OTH'][i],
                    "ac_sas": info['AC_SAS'][i],
                    "ac_male": info['AC_MALE'][i],
                    "ac_female": info['AC_FEMALE'][i]
                },
                "af": info['AF'][i],
                "an": {
                    "an": info['AN'],
                    "an_afr": info['AN_AFR'],
                    "an_amr": info['AN_AMR'],
                    "an_adj": info['AN_Adj'],
                    "an_eas": info['AN_EAS'],
                    "an_fin": info['AN_FIN'],
                    "an_nfe": info['AN_NFE'],
                    "an_oth": info['AN_OTH'],
                    "an_sas": info['AN_SAS'],
                    "an_female": info['AN_FEMALE'],
                    "an_male": info['AN_MALE']
                },
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {
                    "het_afr": info['Het_AFR'],
                    "het_amr": info['Het_AMR'],
                    "het_eas": info['Het_EAS'],
                    "het_fin": info['Het_FIN'],
                    "het_nfe": info['Het_NFE'],
                    "het_oth": info['Het_OTH'],
                    "het_sas": info['Het_SAS']
                },
                "hom": {
                    "hom_afr": info['Hom_AFR'],
                    "hom_amr": info['Hom_AMR'],
                    "hom_eas": info['Hom_EAS'],
                    "hom_fin": info['Hom_FIN'],
                    "hom_nfe": info['Hom_NFE'],
                    "hom_oth": info['Hom_OTH'],
                    "hom_sas": info['Hom_SAS']
                },
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json)),
                          [None]))
        yield obj
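A minimal driver sketch, not part of the source, showing how a generator like _map_line_to_json above is typically fed. It assumes the VCF is read with the PyVCF package (import vcf) and that doc_key names the resource (the AC_Adj/Het_*/Hom_* fields suggest ExAC-style data, but that is an assumption):

import vcf  # PyVCF; assumed here, not shown in the source


def load_data(input_vcf, doc_key):
    # Yield one JSON document per ALT allele found in the VCF file.
    vcf_reader = vcf.Reader(filename=input_vcf)
    for record in vcf_reader:
        for doc in _map_line_to_json(doc_key, record):
            yield doc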
Example #39
0
def _map_line_to_json(fields):
    assert len(fields) == VALID_COLUMN_NO
    chr_info = re.findall(r"[\w']+", fields[17])
    chrom = chr_info[0]  # Mutation GRCh37 genome position
    chromStart = chr_info[1]
    chromEnd = chr_info[2]

    HGVS = None
    cds = fields[13]
    sub = re.search(r'[ATCGMNHKRY]+>[ATCGMNHKRY]+', cds)
    ins = re.search(r'ins[ATCGMN]+|ins[0-9]+', cds)
    delete = cds.find('del') != -1
    del_ins = re.search(r'[0-9]+>[ATCGMN]+', cds)
    comp = re.search(r'[ATCGMN]+', cds)

    if sub:
        HGVS = "chr%s:g.%s%s" % (chrom, chromStart, sub.group())
    elif ins:
        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, ins.group())
    elif delete:
        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
    elif del_ins:
        HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd,
                                          comp.group())
    # elif comp:
    #    HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, comp.group())
    else:
        HGVS = fields[12]
        print("Error2:", fields[15], cds, fields[17])

    # load as json data
    if HGVS is None:
        return

    one_snp_json = {
        "sorter": fields[17] + fields[13],
        "_id": HGVS,
        "cosmic": {
            "gene": {
                "symbol": fields[0],  # Gene name
                "id": fields[3],  # HGNC ID
                "cds_length": fields[2]
            },
            "transcript": fields[1],  # Accession Number
            "sample": {
                "name": fields[4],  # Sample name
                "id": fields[5]  # ID_sample
            },
            "tumour": {
                "id": fields[6],  # ID_tumour
                "primary_site": fields[7],  # Primary site
                "site_subtype": fields[8],  # Site subtype
                "primary_histology": fields[9],  # Primary histology
                "histology_subtype": fields[10],  # Histology subtype
                "origin": fields[1]
            },
            "mutation": {
                "id": "COSM" + fields[12],  # Mutation ID
                "cds": cds,  # Mutation CDS
                "aa": fields[14],  # Mutation AA
                "description": fields[15],  # Mutation Description
                "zygosity": fields[16],  # Mutation zygosity
                "somatic_status": fields[21]  # Mutation somatic status
            },
            "chrom": chrom,
            "hg19": {
                "start": chromStart,
                "end": chromEnd
            },
            "pubmed": fields[22]  # Pubmed_PMID
        }
    }
    return dict_sweep(value_convert_to_number(one_snp_json), vals=[""])
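Illustrative, made-up examples of how the regular expressions above turn a Mutation CDS string into a genomic HGVS id (chrom "7", chromStart "100" and chromEnd "102" are assumed values):

# cds = "c.100A>G"        -> sub matches "A>G"    -> "chr7:g.100A>G"
# cds = "c.100_102insAT"  -> ins matches "insAT"  -> "chr7:g.100_102insAT"
# cds = "c.100_102delTA"  -> cds contains "del"   -> "chr7:g.100_102del"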
Example #40
0
def _map_line_to_json(doc_key, item):
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO
    # INFO fields that may be missing from some records
    baseqranksum = info.get('BaseQRankSum')
    clippingranksum = info.get('ClippingRankSum')
    mqranksum = info.get('MQRankSum')
    readposranksum = info.get('ReadPosRankSum')
    qd = info.get('QD')
    inbreedingcoeff = info.get('InbreedingCoeff')
    # convert vcf object to string
    item.ALT = [str(alt) for alt in item.ALT]
    # if multiallelic, put all variants as a list in multi-allelic field
    hgvs_list = None
    if len(item.ALT) > 1:
        hgvs_list = [get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=False) for alt in item.ALT]
    for i, alt in enumerate(item.ALT):
        (HGVS, var_type) = get_hgvs_from_vcf(chrom, chromStart, ref, alt, mutant_type=True)
        if HGVS is None:
            return
        assert len(item.ALT) == len(info['AC']), "Expecting length of item.ALT= length of info.AC, but not for %s" % (HGVS)
        assert len(item.ALT) == len(info['AF']), "Expecting length of item.ALT= length of info.AF, but not for %s" % (HGVS)
        assert len(item.ALT) == len(info['Hom_AFR']), "Expecting length of item.ALT= length of HOM_AFR, but not for %s" % (HGVS)
        one_snp_json = {
            "_id": HGVS,
            doc_key: {
                "chrom": chrom,
                "pos": chromStart,
                "multi-allelic": hgvs_list,
                "ref": ref,
                "alt": alt,
                "alleles": item.ALT,
                "type": var_type,
                "ac": {
                    "ac": info['AC'][i],
                    "ac_afr": info['AC_AFR'][i],
                    "ac_amr": info['AC_AMR'][i],
                    "ac_adj": info['AC_Adj'][i],
                    "ac_eas": info['AC_EAS'][i],
                    "ac_fin": info['AC_FIN'][i],
                    "ac_het": info['AC_Het'][i],
                    "ac_hom": info['AC_Hom'][i],
                    "ac_nfe": info['AC_NFE'][i],
                    "ac_oth": info['AC_OTH'][i],
                    "ac_sas": info['AC_SAS'][i],
                    "ac_male": info['AC_MALE'][i],
                    "ac_female": info['AC_FEMALE'][i]
                },
                "af": info['AF'][i],
                "an": {
                    "an": info['AN'],
                    "an_afr": info['AN_AFR'],
                    "an_amr": info['AN_AMR'],
                    "an_adj": info['AN_Adj'],
                    "an_eas": info['AN_EAS'],
                    "an_fin": info['AN_FIN'],
                    "an_nfe": info['AN_NFE'],
                    "an_oth": info['AN_OTH'],
                    "an_sas": info['AN_SAS'],
                    "an_female": info['AN_FEMALE'],
                    "an_male": info['AN_MALE']

                },
                "baseqranksum": baseqranksum,
                "clippingranksum": clippingranksum,
                "fs": info['FS'],
                "het": {
                    "het_afr": info['Het_AFR'],
                    "het_amr": info['Het_AMR'],
                    "het_eas": info['Het_EAS'],
                    "het_fin": info['Het_FIN'],
                    "het_nfe": info['Het_NFE'],
                    "het_oth": info['Het_OTH'],
                    "het_sas": info['Het_SAS']
                },
                "hom": {
                    "hom_afr": info['Hom_AFR'],
                    "hom_amr": info['Hom_AMR'],
                    "hom_eas": info['Hom_EAS'],
                    "hom_fin": info['Hom_FIN'],
                    "hom_nfe": info['Hom_NFE'],
                    "hom_oth": info['Hom_OTH'],
                    "hom_sas": info['Hom_SAS']
                },
                "inbreedingcoeff": inbreedingcoeff,
                "mq": {
                    "mq": info['MQ'],
                    "mq0": info['MQ0'],
                    "mqranksum": mqranksum
                },
                "ncc": info['NCC'],
                "qd": qd,
                "readposranksum": readposranksum,
                "vqslod": info['VQSLOD'],
                "culprit": info['culprit']
            }
        }
        obj = (dict_sweep(unlist(value_convert_to_number(one_snp_json)), [None]))
        yield obj
Example #41
0
def _map_line_to_json(item):
    chrom = item.CHROM
    chromStart = item.POS
    ref = item.REF
    info = item.INFO

    # INFO fields that may be missing from some records
    af = info.get('AF')
    ac = info.get('AC')
    an = info.get('AN')
    ds = info.get('DS')

    # convert vcf object to string
    item.ALT = [str(alt) for alt in item.ALT]

    # if multiallelic, put all variants as a list in multi-allelic field
    hgvs_list = None
    if len(item.ALT) > 1:
        hgvs_list = []
        for alt in item.ALT:
            try:
                hgvs_list.append(
                    get_hgvs_from_vcf(chrom,
                                      chromStart,
                                      ref,
                                      alt,
                                      mutant_type=False))
            except:
                hgvs_list.append(alt)

        assert len(item.ALT) == len(
            info['AC']
        ), "Expecting length of item.ALT= length of info.AC, but not for %s" % (
            item)
        assert len(item.ALT) == len(
            info['AF']
        ), "Expecting length of item.ALT= length of info.AF, but not for %s" % (
            item)
        if ds:
            if len(item.ALT) != len(info['DS']):
                ds_str = ",".join(info['DS'])
                ds_str = ds_str.replace("NA7022,18", "NA7022_18")
                ds_list = ds_str.split(",")
                info['DS'] = [
                    d.replace("NA7022_18", "NA7022,18") for d in ds_list
                ]
                assert len(item.ALT) == len(
                    info['DS']), "info.DS mismatch, %s\n## DS: %s" % (
                        item, info['DS'])

    for i, alt in enumerate(item.ALT):
        try:
            (HGVS, var_type) = get_hgvs_from_vcf(chrom,
                                                 chromStart,
                                                 ref,
                                                 alt,
                                                 mutant_type=True)
        except:
            continue

        if HGVS is None:
            return

        # load as json data
        one_snp_json = {
            "_id": HGVS,
            "kaviar": {
                "multi-allelic": hgvs_list,
                "ref": ref,
                "alt": alt,
                "af": info['AF'][i],
                "ac": info['AC'][i],
                "an": an,
                "ds": info['DS'][i].split("|") if ds else None,
            }
        }

        yield value_convert_to_number(one_snp_json)