Ejemplo n.º 1
0
def get_hgvs_name(record, as_list=False):
    """Collect HGVS ids, ALT alleles and start/end positions for a record.

    Each entry of ``record.ALT`` is processed in order, combined with
    ``record.CHROM``, ``record.INFO['RSPOS']`` and ``record.REF``.

    :param record: object exposing ``CHROM``, ``INFO`` (indexable by
        ``'RSPOS'``), ``REF`` and an iterable ``ALT``.
    :param as_list: accepted for interface compatibility; not used here.
    :return: a 3-tuple ``(ids, alts, positions)`` where

        * ``ids`` holds the values returned by ``get_hgvs_from_vcf``;
          alleles for which it raises ``ValueError`` are skipped, so this
          list can be shorter than the other two,
        * ``alts`` holds one entry per ALT: ``str(alt)`` when the allele is
          truthy, otherwise the falsy value unchanged,
        * ``positions`` holds one ``OrderedDict(start=..., end=...)`` per
          ALT, with both values ``None`` when ``get_pos_start_end`` raises
          ``ValueError``.
    """
    chrom, rs_pos, ref_allele = record.CHROM, record.INFO['RSPOS'], record.REF
    hgvs_ids, alt_alleles, positions = [], [], []

    for allele in record.ALT:
        # Falsy alleles (e.g. None) are kept as-is rather than stringified.
        allele = str(allele) if allele else allele
        alt_alleles.append(allele)

        # NOTE: get_pos_start_end does not handle the ALT=None case yet
        # (see the original TODO); a ValueError from it is recorded as an
        # unknown position.
        try:
            begin, finish = get_pos_start_end(chrom, rs_pos, ref_allele,
                                              allele)
        except ValueError:
            span = OrderedDict(start=None, end=None)
        else:
            span = OrderedDict(start=begin, end=finish)
        positions.append(span)

        # The id helper always receives a string, even for a falsy allele.
        try:
            hgvs_ids.append(get_hgvs_from_vcf(chrom,
                                              rs_pos,
                                              ref_allele,
                                              str(allele),
                                              mutant_type=False))
        except ValueError:
            # No id could be inferred for this allele; skip it.
            continue

    return hgvs_ids, alt_alleles, positions
Ejemplo n.º 2
0
def parse_one_rec(assembly, record):
    """Restructure one JSON record and yield a document per HGVS id.

    A single ``doc`` dict is built from ``record`` and then reused for every
    ``(hgvs, vcf)`` pair produced by ``get_hgvs_and_vcf``: when ``vcf`` is
    truthy its chrom/ref/alt and the ``assembly`` start/end are refreshed,
    and when ``hgvs`` is truthy the document is post-processed and yielded.

    :param assembly: key under which the ``start``/``end`` positions are
        stored in the document; also passed to ``get_hgvs_and_vcf``.
    :param record: mapping read through ``.get`` for
        ``primary_snapshot_data``, ``refsnp_id``, ``last_update_build_id``,
        ``dbsnp1_merges`` and ``citations``.
    :return: generator of the values returned by
        ``dict_sweep(unlist(value_convert_to_number(doc, ...)), ...)``.
    """
    doc = {
        "alleles": [],
        "gene": [],
        assembly: {},
        "vartype": record.get("primary_snapshot_data").get("variant_type"),
        "rsid": "rs" + str(record.get("refsnp_id")),
        "dbsnp_build": int(record.get("last_update_build_id")),
        "dbsnp_merges": restructure_dbsnp_merge(record.get("dbsnp1_merges")),
        "citations": record.get("citations"),
    }

    snapshot = record.get('primary_snapshot_data')
    placements = snapshot.get("placements_with_allele")
    hgvs_vcf_info = get_hgvs_and_vcf(assembly, placements)
    annotations = list(snapshot.get('allele_annotations'))
    doc["alleles"] = restructure_allele_freq_info(annotations)
    doc['gene'] = restructure_gene_info(annotations)

    for hgvs, vcf in hgvs_vcf_info:
        if vcf:
            raw_chrom, pos, ref_allele, alt_allele = vcf
            chrom = str(raw_chrom)
            # Numeric sex-chromosome codes are reported as X / Y.
            doc["chrom"] = {"23": "X", "24": "Y"}.get(chrom, chrom)
            doc["ref"] = ref_allele
            doc["alt"] = alt_allele
            doc[assembly] = {}
            try:
                # "vartype" is read from doc on every pass on purpose: the
                # same doc object is handed to the post-processing helpers
                # between iterations.
                if doc["vartype"] != "snv":
                    # Non-SNV alleles get a single-character prefix before
                    # being passed to get_pos_start_end.
                    ref, alt = "T" + doc["ref"], "T" + doc["alt"]
                else:
                    ref, alt = doc["ref"], doc["alt"]
                is_indel = doc["vartype"] in ["ins", "del", "delins"]
                begin, finish = get_pos_start_end(
                    doc["chrom"], pos - 1 if is_indel else pos, ref, alt)
                doc[assembly] = {'start': begin, 'end': finish}
            except (ValueError, AssertionError):
                # Position could not be computed: leave the assembly empty.
                doc[assembly] = {}

        if hgvs:
            doc["_id"] = hgvs.replace('chr23', 'chrX').replace('chr24', 'chrY')
            converted = value_convert_to_number(
                doc,
                skipped_keys=['chrom', 'ref', 'alt', 'allele',
                              'deleted_sequence', 'inserted_sequence'])
            yield dict_sweep(unlist(converted), vals=[[], {}, None])
Ejemplo n.º 3
0
def get_start_end(variant_type, chrom, pos, ref, alt):
    """Return ``(start, end)`` for a variant, or ``(None, None)`` on failure.

    :param variant_type: variant type string; ``"snv"`` is passed through
        unchanged, and ``"ins"``, ``"del"`` and ``"delins"`` additionally
        shift the position by one.
    :param chrom: chromosome, forwarded to ``get_pos_start_end``.
    :param pos: position; ``pos - 1`` is used for ins/del/delins.
    :param ref: reference allele (may be empty for non-SNV variants).
    :param alt: alternate allele (may be empty for non-SNV variants).
    :return: the ``(start, end)`` pair from ``get_pos_start_end``, or
        ``(None, None)`` when it raises ``ValueError`` or ``AssertionError``.
    """
    # HACK: for anything other than an 'snv', ref or alt might be empty, and
    # `get_pos_start_end` cannot work with an empty ref or alt. Prefixing
    # both with "T" (any single character would do) bypasses that limit.
    # TODO: this belongs inside `get_pos_start_end` itself, not here.
    if variant_type != "snv":
        ref, alt = "T" + ref, "T" + alt

    shifted = variant_type in ("ins", "del", "delins")
    try:
        begin, finish = get_pos_start_end(
            chrom, pos - 1 if shifted else pos, ref, alt)
    except (ValueError, AssertionError):
        return None, None
    return begin, finish
Ejemplo n.º 4
0
def annotate_start_end(hgvs_vcfs, assembly):
    """Yield every document of ``hgvs_vcfs``, adding start/end when possible.

    For documents that contain a ``'vcf'`` entry, its ``'chrom'`` key is
    removed and ``get_pos_start_end`` is called with the ``'position'``,
    ``'ref'`` and ``'alt'`` values. When both returned values are truthy they
    are stored as ``doc[assembly] = {"start": ..., "end": ...}``. Any
    exception raised during that step is ignored (best effort) and the
    document is yielded without the annotation.

    :param hgvs_vcfs: mapping of id -> document (documents are modified in
        place).
    :param assembly: key under which the start/end pair is stored.
    :return: generator over the documents, in iteration order of the mapping.
    """
    for key in hgvs_vcfs:
        entry = hgvs_vcfs[key]
        if 'vcf' not in entry:
            yield entry
            continue

        vcf = entry['vcf']
        # chrom is not needed downstream, drop it
        vcf.pop('chrom', None)
        try:
            begin, finish = get_pos_start_end(
                chr=None,  # original author's note: not used by the callee
                pos=vcf['position'],
                ref=vcf['ref'],
                alt=vcf['alt'])
            if begin and finish:
                entry[assembly] = {"start": begin, "end": finish}
        except Exception:
            # Best effort: a failed lookup leaves the document unannotated.
            pass

        yield entry
Ejemplo n.º 5
0
def annotate_start_end(hgvs_vcfs, assembly):
    """Generate the documents of ``hgvs_vcfs`` with a start/end annotation.

    Documents without a ``'vcf'`` key are yielded untouched. Otherwise the
    ``'chrom'`` key is popped from the vcf sub-dict and the pair returned by
    ``get_pos_start_end`` is saved under ``assembly`` as
    ``{"start": ..., "end": ...}``, provided both values are truthy. Errors
    raised while computing or storing the pair are swallowed on purpose.

    :param hgvs_vcfs: mapping whose values are the documents to annotate
        (mutated in place).
    :param assembly: name of the key receiving the start/end dict.
    :return: generator yielding each document once.
    """
    for hgvs_id in hgvs_vcfs:
        record = hgvs_vcfs[hgvs_id]
        if 'vcf' in record:
            vcf_part = record['vcf']
            # remove chrom, not needed
            vcf_part.pop('chrom', None)
            try:
                first, last = get_pos_start_end(
                    chr=None,  # original note: not even used by the callee
                    pos=vcf_part['position'],
                    ref=vcf_part['ref'],
                    alt=vcf_part['alt'],
                )
                if all((first, last)):
                    record[assembly] = {"start": first, "end": last}
            except Exception:
                # Deliberately ignored: annotation is optional.
                pass

        yield record
Ejemplo n.º 6
0
def _map_line_to_json(fields):
    chrInfo = fields[0].split(":")  # grch37
    chrom = chrInfo[0]
    chromStart = int(chrInfo[1])

    ma_fin_percent = fields[7].split("/")

    if fields[3]:
        mutation = fields[3].split(">")
        ref = mutation[0]
        alt = mutation[1]
        HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)
        hg19 = get_pos_start_end(chrom, chromStart, ref, alt)
        hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref, alt)

    # load as json data
    if HGVS is None:
        return

    one_snp_json = {
        "_id": HGVS,
        "evs":
            {
                "chrom": chrom,
                "hg19":
                    {
                        "start": hg19[0],
                        "end": hg19[1]
                    },
                "hg38":
                    {
                        "start": hg38[0],
                        "end": hg38[1]
                    },
                "rsid": fields[1],
                "dbsnp_version": get_dbsnp(fields[2]),
                "ref": ref,
                "alt": alt,
                "allele_count":
                    {
                        "european_american": count_dict(fields[4]),
                        "african_american": count_dict(fields[5]),
                        "all": count_dict(fields[6])
                    },
                "ma_fin_percent":
                    {
                        "european_american": ma_fin_percent[0],
                        "african_american": ma_fin_percent[1],
                        "all": ma_fin_percent[2]
                    },
                "genotype_count":
                    {
                        "european_american": count_dict(fields[8]),
                        "african_american": count_dict(fields[9]),
                        "all_genotype": count_dict(fields[10])
                    },
                "avg_sample_read": fields[11],
                "gene":
                    {
                        "symbol": fields[12],
                        "accession": fields[13]
                    },
                "function_gvs": fields[14],
                "hgvs":
                    {
                        "coding": fields[16],
                        "protein": fields[15]
                    },
                "coding_dna_size": fields[17],
                "conservation":
                    {
                        "phast_cons": fields[18],
                        "gerp": fields[19]
                    },
                "grantham_score": fields[20],
                "polyphen2":
                    {
                        "class": polyphen(fields[21])[0],
                        "score": polyphen(fields[21])[1]
                    },
                "ref_base_ncbi": fields[22],
                "chimp_allele": fields[23],
                "clinical_info": fields[24],
                "filter_status": fields[25],
                "on_illumina_human_exome_chip": fields[26],
                "gwas_pubmed_info": fields[27],
                "estimated_age_kyrs":
                    {
                        "ea": fields[28],
                        "aa": fields[29]
                    }
            }
        }
    return dict_sweep(value_convert(one_snp_json), vals=["NA", "none", "unknown"])
Ejemplo n.º 7
0
def _map_line_to_json(fields, version):
    chrInfo = fields[0].split(":")  # grch37
    chrom = chrInfo[0]
    chromStart = int(chrInfo[1])
    ma_fin_percent = fields[7].split("/")
    if fields[3]:
        mutation = fields[3].split(">")
        ref = mutation[0]
        alt = mutation[1]
        hg19 = get_pos_start_end(chrom, chromStart, ref, alt)
        hg38 = get_pos_start_end(chrom, int(fields[30].split(":")[1]), ref,
                                 alt)
        if version == 'hg19':
            HGVS = get_hgvs_from_vcf(chrom, chromStart, ref, alt)
        elif version == 'hg38':
            HGVS = get_hgvs_from_vcf(chrom, hg38[0], ref, alt)

    # load as json data
    if HGVS is None:
        return

    one_snp_json = {
        "_id": HGVS,
        "evs": {
            "chrom": chrom,
            "hg19": {
                "start": hg19[0],
                "end": hg19[1]
            },
            "hg38": {
                "start": hg38[0],
                "end": hg38[1]
            },
            "rsid": fields[1],
            "dbsnp_version": get_dbsnp(fields[2]),
            "ref": ref,
            "alt": alt,
            "allele_count": {
                "european_american": count_dict(fields[4]),
                "african_american": count_dict(fields[5]),
                "all": count_dict(fields[6])
            },
            "ma_fin_percent": {
                "european_american": ma_fin_percent[0],
                "african_american": ma_fin_percent[1],
                "all": ma_fin_percent[2]
            },
            "genotype_count": {
                "european_american": count_dict(fields[8]),
                "african_american": count_dict(fields[9]),
                "all_genotype": count_dict(fields[10])
            },
            "avg_sample_read": fields[11],
            "gene": {
                "symbol": fields[12],
                "accession": fields[13]
            },
            "function_gvs": fields[14],
            "hgvs": {
                "coding": fields[16],
                "protein": fields[15]
            },
            "coding_dna_size": fields[17],
            "conservation": {
                "phast_cons": fields[18],
                "gerp": fields[19]
            },
            "grantham_score": fields[20],
            "polyphen2": {
                "class": polyphen(fields[21])[0],
                "score": polyphen(fields[21])[1]
            },
            "ref_base_ncbi": fields[22],
            "chimp_allele": fields[23],
            "clinical_info": fields[24],
            "filter_status": fields[25],
            "on_illumina_human_exome_chip": fields[26],
            "gwas_pubmed_info": fields[27],
            "estimated_age_kyrs": {
                "ea": fields[28],
                "aa": fields[29]
            }
        }
    }
    return dict_sweep(value_convert(one_snp_json),
                      vals=["NA", "none", "unknown"])