Ejemplo n.º 1
0
def load_data(data_folder):
    """Yield one document per cell marker found in ``all_cell_markers.txt``.

    The file is read as tab-separated text whose first line is the header.
    Five columns are passed through ``parse_field`` (presumably turning each
    into a sequence of values -- confirm against that helper). When all five
    parsed columns have the same length, the row is expanded into one record
    per position; rows whose lengths disagree are skipped.

    :param data_folder: folder that contains ``all_cell_markers.txt``
    :return: a generator of ``{"_id": ..., "CellMarker": record}`` dicts
    :raises AssertionError: if the input file does not exist
    """
    input_file = os.path.join(data_folder, "all_cell_markers.txt")
    assert os.path.exists(input_file), "Can't find input file '{}'".format(input_file)

    multi_fields = ('cellMarker', 'geneID', 'geneSymbol', 'proteinID', 'proteinName')

    with open_anyfile(input_file) as in_f:
        column_names = next(in_f).strip().split('\t')

        for row in DictReader(in_f, fieldnames=column_names, delimiter='\t'):
            for name in multi_fields:
                row[name] = parse_field(row[name])

            lengths = [len(row[name]) for name in multi_fields]
            if len(set(lengths)) != 1:
                # the parsed columns do not line up one-to-one; skip the row
                continue

            for position in range(lengths[0]):
                # shallow copy: only the five expanded fields are replaced
                record = copy.copy(row)
                for name in multi_fields:
                    record[name] = record[name][position]
                _id = generate_id(record)
                yield {"_id": _id, "CellMarker": record}
Ejemplo n.º 2
0
def load_data(data_folder, assembly="hg19"):
    """Yield JSON docs built from the EmVClass csv file.

    Each distinct, non-blank line is split on ``,``. Column 5 (index 4) is
    collected for every line and handed to
    ``batch_query_myvariant_id_from_clingen`` together with ``assembly``; the
    returned mapping is used to attach each parsed line to the id(s) it maps
    to. Lines whose id has no mapping are dropped.

    :param data_folder: folder that contains ``EmVClass.2018-Q2.csv``
    :param assembly: genome assembly passed through to the id-mapping helper
    :return: a generator of ``{'_id': ..., 'emv': ...}`` dicts; ``emv`` is a
        single record when one line maps to the id, otherwise a list
    :raises AssertionError: if the input file does not exist
    """
    import logging

    logger = logging.getLogger(__name__)

    input_file = os.path.join(data_folder, "EmVClass.2018-Q2.csv")
    assert os.path.exists(
        input_file), "Can't find input file '%s'" % input_file

    with open_anyfile(input_file) as in_f:
        # de-duplicate lines while keeping file order, so output is deterministic
        unique_lines = dict.fromkeys(in_f)

    # blank lines carry no columns and would fail on the row[4] lookup below
    rows = [line.strip().split(',') for line in unique_lines if line.strip()]

    # mapping non genomic hgvs ids to genomic hgvs ids used in MyVariant
    hgvs_ids = [row[4] for row in rows]
    hgvs_mapping_dict = batch_query_myvariant_id_from_clingen(
        hgvs_ids, assembly)

    results = defaultdict(list)
    for row in rows:
        # structure the content of emv docs
        variant = _map_line_to_json(row)
        # one non-genomic hgvs id may map to multiple genomic ones, or to none
        mapped_ids = hgvs_mapping_dict.get(row[4])
        if not mapped_ids:
            continue
        for _id in mapped_ids:
            results[_id].append(variant)

    for _id, hits in results.items():
        if len(hits) == 1:
            yield {'_id': _id, 'emv': hits[0]}
        else:
            doc = {'_id': _id, 'emv': list(hits)}
            logger.debug('case of multi hits %s', doc)
            yield doc
Ejemplo n.º 3
0
def load_data(data_folder):
    """Yield CGI biomarker docs keyed by the ``gDNA`` column.

    The tab-separated file ``cgi_biomarkers_per_variant.tsv`` is read with its
    first line as header. Rows without a ``gDNA`` value, or whose
    ``Alteration type`` is not ``MUT``, are skipped. Selected columns are
    copied (NFKD-normalized) into a ``cgi`` sub-document, and rows that share
    a ``gDNA`` value are merged into one document.

    :param data_folder: folder that contains ``cgi_biomarkers_per_variant.tsv``
    :return: a generator of ``{'_id': ..., 'cgi': ...}`` dicts; ``cgi`` is a
        list when several rows share the same id
    :raises AssertionError: if the input file does not exist
    """
    input_file = os.path.join(data_folder, "cgi_biomarkers_per_variant.tsv")
    assert os.path.exists(
        input_file), "Can't find input file '%s'" % input_file

    # (column name in the file, key used in the output document)
    field_map = (
        ('region', 'region'),
        ('cDNA', 'cdna'),
        ('Evidence level', 'evidence_level'),
        ('transcript', 'transcript'),
        ('Gene', 'gene'),
        ('individual_mutation', 'protein_change'),
        ('Primary Tumor type', 'primary_tumor_type'),
        ('Drug full name', 'drug'),
        ('Source', 'source'),
        ('Association', 'association'),
    )

    with open_anyfile(input_file) as in_f:
        header = next(in_f).strip().split('\t')
        # Remove duplicated lines if any, keeping file order so that the
        # merged output is deterministic
        lines = list(dict.fromkeys(in_f))

    results = defaultdict(list)
    for row in DictReader(lines, fieldnames=header, delimiter='\t'):
        # Skip rows with a missing/empty identifier (DictReader fills the
        # columns of a short row with None)
        if not row.get('gDNA'):
            continue

        # Skip variants that are not mutations
        if row.get('Alteration type') != 'MUT':
            continue

        cgi = {}
        for source_key, target_key in field_map:
            value = row.get(source_key)
            # normalize() only accepts str; a missing column yields None,
            # which is left for dict_sweep to handle
            if isinstance(value, str):
                value = unicodedata.normalize("NFKD", value)
            cgi[target_key] = value

        # Use gDNA as variant identifier
        variant = {'_id': row['gDNA'], 'cgi': cgi}
        variant = dict_sweep(variant,
                             vals=['', 'null', 'N/A', None, [], {}])
        results[variant['_id']].append(variant)

    # Merge duplications
    for hits in results.values():
        if len(hits) == 1:
            yield hits[0]
        else:
            yield {'_id': hits[0]['_id'], 'cgi': [hit['cgi'] for hit in hits]}
Ejemplo n.º 4
0
def load_data(input_file):
    """Yield one ClinGen document per line of ``input_file``.

    Every line must consist of exactly two whitespace-separated tokens: the
    document id followed by the value stored under ``clingen.caid``.

    :param input_file: path handed to ``open_anyfile``
    :return: a generator of ``{'_id': ..., 'clingen': {'caid': ...}}`` dicts
    :raises ValueError: if a line does not split into exactly two tokens
    """
    with open_anyfile(input_file) as handle:
        for raw_line in handle:
            variant_id, allele_id = raw_line.strip().split()
            clingen = {"caid": allele_id}
            yield {'_id': variant_id, 'clingen': clingen}
Ejemplo n.º 5
0
def load_data(input_file):
    """Group Pharos target ids by gene id and yield one document per gene.

    Each line is expected to be ``<pharos_id>,<gene_id>``. Lines whose second
    field is the literal ``entrez_gene_id`` (the header) or ``0`` are ignored.
    Each document is passed through ``unlist`` before being yielded.

    :param input_file: path handed to ``open_anyfile``
    :return: a generator of ``{'_id': ..., 'pharos': {'target_id': ...}}``
        documents as returned by ``unlist``
    :raises ValueError: if a line does not split into exactly two fields or
        the Pharos id is not an integer
    """
    ignored_gene_ids = ('entrez_gene_id', '0')

    with open_anyfile(input_file) as handle:
        targets_by_gene = defaultdict(list)
        for raw_line in handle:
            target, gene = raw_line.strip().split(',')
            if gene in ignored_gene_ids:
                continue
            targets_by_gene[str(gene)].append(int(target))

        for gene, targets in targets_by_gene.items():
            yield unlist({'_id': str(gene), 'pharos': {"target_id": targets}})
Ejemplo n.º 6
0
def load_data(input_file):
    """Yield one Pharos document per data line of ``input_file``.

    Each line is expected to be ``<pharos_id>,<gene_id>``; the line whose
    second field is the literal ``entrez_gene_id`` (the header) is skipped.
    Both fields are converted to ``int``.

    :param input_file: path handed to ``open_anyfile``
    :return: a generator of ``{'_id': int, 'pharos': {'target_id': int}}``
    :raises ValueError: if a line does not split into exactly two fields or a
        field is not an integer
    """
    with open_anyfile(input_file) as handle:
        for raw_line in handle:
            target, gene = raw_line.strip().split(',')
            if gene == 'entrez_gene_id':
                continue
            yield {'_id': int(gene), 'pharos': {"target_id": int(target)}}
Ejemplo n.º 7
0
def load_data(data_folder):
    """Yield PheWAS catalog docs, one per variant id resolved from an rsid.

    ``phewas-catalog.csv`` is read with its first line as header (the first
    and last character of every header cell are dropped, which removes the
    surrounding quotes). Rows are grouped by their ``snp`` column, the rsids
    are resolved through ``batch_query_hgvs_from_rsid``, and rsids without a
    resolved id are dropped.

    :param data_folder: folder that contains ``phewas-catalog.csv``
    :return: a generator of ``{'_id': ..., 'phewas': {...}}`` documents after
        ``value_convert_to_number``, ``unlist`` and ``dict_sweep``;
        ``phewas.associations`` is a single record when the rsid appears on
        one row and a list otherwise
    :raises AssertionError: if the input file does not exist or a ``snp``
        value is not of the form ``rs<digits>``
    """
    input_file = os.path.join(data_folder, "phewas-catalog.csv")
    assert os.path.exists(input_file), "Can't find input file '%s'" % input_file

    with open_anyfile(input_file) as in_f:
        header = next(in_f).strip().split(',')
        header = [_item[1:-1] for _item in header]
        # Remove duplicated lines if any, keeping file order so that the
        # merged output is deterministic
        lines = list(dict.fromkeys(in_f))

    rsid_pattern = re.compile(r"^rs\d+$")
    results = defaultdict(list)
    for row in DictReader(lines, fieldnames=header, delimiter=','):
        assert rsid_pattern.match(row["snp"]) is not None

        associations = {
            "phenotype": {
                "name": row["phewas phenotype"],
                "phewas_code": row["phewas code"],
            },
            "cases": row["cases"],
            "pval": float(row["p-value"]),
            "odds-ratio": row["odds-ratio"],
        }
        variant = {
            "rsid": row["snp"],
            "gene": row["gene_name"],
            "gwas_associations": row["gwas-associations"].split(','),
        }
        # the "chromosome" column holds "<chrom> <pos>" or just "<chrom>"
        pos_info = row["chromosome"].split(' ')
        if len(pos_info) == 2:
            variant["chrom"], variant["pos"] = pos_info
        else:
            variant["chrom"] = pos_info[0]

        results[variant["rsid"]].append(
            {"associations": associations, "variant": variant})

    hgvs_rsid_dict = batch_query_hgvs_from_rsid(list(results))

    # Merge duplications
    for rsid, hits in results.items():
        hgvs_id = hgvs_rsid_dict.get(rsid)
        if not hgvs_id:
            continue
        # variant-level fields are taken from the first row of the group
        doc = {'_id': hgvs_id, 'phewas': hits[0]["variant"]}
        if len(hits) == 1:
            doc["phewas"]["associations"] = hits[0]["associations"]
        else:
            doc["phewas"]["associations"] = [hit["associations"] for hit in hits]
        yield dict_sweep(
            unlist(value_convert_to_number(doc, skipped_keys=['chrom'])),
            vals=[[], {}, None, '', 'NULL'])
Ejemplo n.º 8
0
def load_data(data_folder: str):
    """
    Load data from a specified file path. Parse each line into a dictionary according to the schema
    given by `data_schema`. Then process each dict by normalizing data format and removing null fields.
    Dicts that share an id are collected into one list.

    The file name, schema, delimiter, error message and output key come from the module-level names
    `file_name`, `data_schema`, `delimiter`, `FILE_NOT_FOUND_ERROR` and `source_name`.

    :param data_folder: the path(folder) where the data file is stored
    :return: a generator that yields `{'_id': ..., source_name: [records]}` dicts.
    :raises AssertionError: if the input file does not exist
    """
    input_file = os.path.join(data_folder, file_name)
    # raise an error if file not found
    assert os.path.exists(input_file), FILE_NOT_FOUND_ERROR.format(input_file)

    with open_anyfile(input_file) as file:
        # Remove duplicated lines if any, keeping file order so the output is deterministic
        lines = list(dict.fromkeys(file))

    # read and parse each line into a dict
    reader = DictReader(lines, fieldnames=data_schema, delimiter=delimiter)
    # access non existing keys will return an empty list by default
    results = defaultdict(list)

    for row in reader:  # start processing each lines of data (stored in dicts)
        # construct id (e.g. chr1:g.678900_679000)
        _id = '{chrom}:g.{start}_{end}'.format(chrom=row['chrom'],
                                               start=row['start'],
                                               end=row['end'])
        # optional step: normalize data; DictReader yields None for the columns of a short row
        # and a list for overflow columns, neither of which normalize() accepts
        variant = {
            k: unicodedata.normalize('NFKD', v) if isinstance(v, str) else v
            for k, v in row.items()
        }
        # optional step: delete invalid fields within each dict
        variant = dict_sweep(variant,
                             vals=['', 'null', 'N/A', None, [], {}])
        # append dict to the result list using the _id
        results[_id].append(variant)

    for k, v in results.items():
        yield {'_id': k, source_name: v}
Ejemplo n.º 9
0
def load_data(data_folder):
    """Yield GWAS catalog docs, one per variant id resolved from an rsid.

    The tab-separated file named ``alternative`` is read with its first line
    as header. A first pass collects every rsid reported by
    ``parse_separator_and_snps`` and resolves them in one call to
    ``batch_query_hgvs_from_rsid``. A second pass builds one record per SNP of
    each row; SNPs without a resolved id are skipped. Records sharing an id
    are merged so that ``gwascatalog.associations`` becomes a list.

    :param data_folder: folder that contains the ``alternative`` file
    :return: a generator of ``{'_id': ..., 'gwascatalog': {...}}`` documents
    :raises AssertionError: if the input file does not exist
    """
    input_file = os.path.join(data_folder, "alternative")
    # input_file = os.path.join(data_folder, "gwas_catalog_v1.0.2-associations_e96_r2019-04-21.tsv")
    assert os.path.exists(
        input_file), "Can't find input file '%s'" % input_file
    with open_anyfile(input_file) as in_f:
        header = next(in_f).strip().split('\t')
        # Remove duplicated lines if any, keeping file order so that the
        # merged output is deterministic
        lines = list(dict.fromkeys(in_f))

    # first pass: gather all rsids so they can be resolved in one batch
    rsid_list = []
    for row in DictReader(lines, fieldnames=header, delimiter='\t'):
        rsids, _ = parse_separator_and_snps(row)
        if rsids:
            rsid_list.extend(rsids)
    hgvs_rsid_dict = batch_query_hgvs_from_rsid(rsid_list)

    # second pass: build one record per (row, snp)
    results = defaultdict(list)
    for row in DictReader(lines, fieldnames=header, delimiter='\t'):
        snps, seperator = parse_separator_and_snps(row)
        if not snps:
            continue
        # per-SNP columns; each helper result is indexed by SNP position
        # below, or is falsy when unavailable
        region = reorganize_field(row["REGION"], seperator, len(snps))
        chrom = reorganize_field(row["CHR_ID"], seperator, len(snps))
        genes = reorganize_field(row["REPORTED GENE(S)"], seperator,
                                 len(snps))
        position = reorganize_field(row["CHR_POS"], seperator, len(snps))
        context = reorganize_field(row["CONTEXT"], seperator, len(snps))
        for i, _snp in enumerate(snps):
            if _snp not in hgvs_rsid_dict:
                continue
            gwas = {
                "associations": {
                    'efo': {},
                    'study': {}
                }
            }
            assoc = gwas['associations']
            gwas["rsid"] = _snp
            assoc['snps'] = snps
            assoc['pubmed'] = int(row['PUBMEDID'])
            assoc['date_added'] = row['DATE ADDED TO CATALOG']
            assoc['study']['name'] = row['STUDY']
            assoc['trait'] = row['DISEASE/TRAIT']
            gwas['region'] = region[i] if region else None
            if not chrom:
                # placeholder sized to this row's SNPs (a fixed size of 10
                # would overflow on rows reporting more SNPs)
                chrom = [''] * len(snps)
            elif str(chrom[i]).lower() not in CHROM_LIST:
                chrom[i] = ''
            gwas['chrom'] = chrom[i]
            gwas['pos'] = position[i] if position else None
            gwas['gene'] = genes[i].split(',') if (
                genes and genes[i]) else None
            gwas['context'] = context[i] if context else None
            assoc['raf'] = str2float(row['RISK ALLELE FREQUENCY'])
            assoc['pval'] = str2float(row['P-VALUE'])
            # variant['gwascatalog']['p_val_mlog'] = str2float(row['PVALUE_MLOG'])
            assoc['study']['platform'] = row['PLATFORM [SNPS PASSING QC]']
            assoc['study']['accession'] = row['STUDY ACCESSION']
            assoc['efo']['name'] = row['MAPPED_TRAIT'].split(',')
            assoc['efo']['id'] = [
                _item.split('/')[-1].replace('_', ':')
                for _item in row['MAPPED_TRAIT_URI'].split(',')
            ]
            variant = {"_id": hgvs_rsid_dict[_snp], 'gwascatalog': gwas}
            variant = dict_sweep(unlist(
                value_convert_to_number(variant, skipped_keys=['chrom'])),
                                 vals=[[], {}, None, '', 'NR'])
            results[variant["_id"]].append(variant)

    for v in results.values():
        if len(v) == 1:
            yield v[0]
        else:
            doc = {'_id': v[0]['_id'], 'gwascatalog': {'associations': []}}
            # variant-level fields are taken from the first record.
            # NOTE(review): 'chrom' is not carried over to merged docs --
            # confirm whether that is intentional.
            for _item in ['gene', 'region', 'pos', 'context', 'rsid']:
                if _item in v[0]['gwascatalog']:
                    doc['gwascatalog'][_item] = v[0]['gwascatalog'][_item]
            doc['gwascatalog']['associations'] = [
                i['gwascatalog']['associations'] for i in v
            ]
            yield doc
Ejemplo n.º 10
0
def parse_data(data_access):
    """
    Parse today's ClinGen gene-disease summary csv into per-gene documents.

    The file ``ClinGen-Gene-Disease-Summary-<YYYY-MM-DD>.csv`` (today's date)
    is expected in ``data_access``. The first four lines are skipped, the
    fifth is used as header and the sixth is skipped as well. Rows without a
    ``GENE ID (HGNC)`` value are ignored. Rows sharing an HGNC id are merged
    so that ``clingen.clinical_validity`` becomes a list.

    :param data_access: folder that contains the summary csv
    :return: whatever ``hgnc2entrez`` returns for the list of merged gene
        documents (per the original author's note, documents keyed by Entrez
        gene id -- confirm against that helper)
    :raises AssertionError: if the input file does not exist
    """

    current_time = date.today().strftime("-%Y-%m-%d")
    file_name = "ClinGen-Gene-Disease-Summary{}.csv".format(str(current_time))
    data_dir = os.path.join(data_access, file_name)

    # check if the file exist
    assert os.path.exists(
        data_dir), "input file '%s' does not exist" % data_dir

    # (column in the file, key in the output, optional value transformation)
    field_map = (
        ('DISEASE LABEL', 'disease_label', None),
        # disease value: "MONDO_ID" -> "MONDO:ID"
        ('DISEASE ID (MONDO)', 'mondo', lambda value: value.replace("_", ":")),
        ('SOP', 'sop', None),
        ('CLASSIFICATION', 'classification', str.lower),
        ('ONLINE REPORT', 'online_report', None),
    )

    # read file
    with open_anyfile(data_dir) as input_file:

        for _ in range(4):
            next(input_file)

        header = next(input_file).strip().split(",")
        next(input_file)
        # drop duplicated lines, keeping file order so the merged output is
        # deterministic
        lines = list(dict.fromkeys(input_file))

    reader = csv.DictReader(lines, fieldnames=header, delimiter=",")
    output = defaultdict(list)

    for row in reader:
        # skip samples with empty HGNC
        if not row.get('GENE ID (HGNC)'):
            continue
        # keep the part after the prefix, e.g. "HGNC:1234" -> "1234"
        hgnc_id = row['GENE ID (HGNC)'].split(':')[1]

        # store every gene's information into a nested dictionary
        clinical_validity = {}
        for source_key, target_key, transform in field_map:
            value = row.get(source_key)
            # DictReader yields None for the columns of a short row; leave it
            # untouched so dict_sweep can handle it instead of crashing here
            if transform is not None and isinstance(value, str):
                value = transform(value)
            clinical_validity[target_key] = value

        gene = {
            '_id': hgnc_id,
            'clingen': {'clinical_validity': clinical_validity},
        }
        gene = dict_sweep(gene, vals=['', 'null', 'N/A', None, [], {}])
        output[gene['_id']].append(gene)

    temp_output = []

    # merge duplicates, this may happen when a gene causes multiple diseases
    # and has multiple labels
    for value in output.values():
        if len(value) == 1:
            # genes without duplicate
            temp_output.append(value[0])
        else:
            # genes in duplicate
            temp_output.append({
                '_id': value[0]['_id'],
                'clingen': {
                    'clinical_validity':
                    [v['clingen']['clinical_validity'] for v in value]
                }
            })

    return hgnc2entrez(temp_output)