def load_data(data_folder): input_file = os.path.join(data_folder, "all_cell_markers.txt") assert os.path.exists(input_file), "Can't find input file '{}'".format(input_file) with open_anyfile(input_file) as in_f: header = next(in_f).strip().split('\t') reader = DictReader(in_f, fieldnames=header, delimiter='\t') for row in reader: for field in ['cellMarker', 'geneID', 'geneSymbol', 'proteinID', 'proteinName']: row[field] = parse_field(row[field]) if not all([ len(row['cellMarker']) == len(row['geneID']), len(row['cellMarker']) == len(row['geneSymbol']), len(row['cellMarker']) == len(row['proteinID']), len(row['cellMarker']) == len(row['proteinName'])]): # handle weird cases... pass else: # they all match for index in range(len(row['cellMarker'])): r = copy.copy(row) for field in ['cellMarker', 'geneID', 'geneSymbol', 'proteinID', 'proteinName']: r[field] = r[field][index] _id = generate_id(r) yield {"_id": _id, "CellMarker": r}
def load_data(data_folder, assembly="hg19"): """Load data from EMV csv file into list of JSON docs """ input_file = os.path.join(data_folder, "EmVClass.2018-Q2.csv") assert os.path.exists( input_file), "Can't find input file '%s'" % input_file with open_anyfile(input_file) as in_f: lines = set(list(in_f)) lines = [_doc.strip().split(',') for _doc in lines] print(list(lines)[0]) results = defaultdict(list) # mapping non genomic hgvs ids to genomic hgvs ids used in MyVariant hgvs_ids = [_item[4] for _item in lines] #print(hgvs_ids) hgvs_mapping_dict = batch_query_myvariant_id_from_clingen( hgvs_ids, assembly) # loop through csv doc to convert into json docs for row in lines: # structure the content of emv docs variant = _map_line_to_json(row) # fetch corresponding genomic hgvs ids mapped_ids = hgvs_mapping_dict[row[4]] # could be one non-genomic hgvs id mapping to mulitple genomic ones if mapped_ids: for _id in mapped_ids: results[_id].append(variant) for k, v in results.items(): if len(v) == 1: doc = {'_id': k, 'emv': v[0]} else: doc = {'_id': k, 'emv': [_doc for _doc in v]} print('case of multi hits', doc) yield doc
def load_data(data_folder): input_file = os.path.join(data_folder, "cgi_biomarkers_per_variant.tsv") assert os.path.exists( input_file), "Can't find input file '%s'" % input_file with open_anyfile(input_file) as in_f: # Remove duplicated lines if any header = next(in_f).strip().split('\t') lines = set(list(in_f)) reader = DictReader(lines, fieldnames=header, delimiter='\t') results = defaultdict(list) for row in reader: variant = {} # Skip if 'gDNA' not in row or row['gDNA'] == "": continue # Skip variants that are not mutations if 'Alteration type' not in row or row['Alteration type'] != 'MUT': continue # Use gDNA as variant identifier variant['_id'] = row['gDNA'] variant['cgi'] = {} for k in [ 'region', 'cDNA', 'Evidence level', 'transcript', 'Gene', ('individual_mutation', 'protein_change'), 'Primary Tumor type', ('Drug full name', 'drug'), 'Source', 'Association' ]: if isinstance(k, tuple): new_k = k[1] old_k = k[0] else: new_k = k.lower().replace(' ', '_') old_k = k variant['cgi'][new_k] = unicodedata.normalize( "NFKD", row.get(old_k, None)) variant = dict_sweep(variant, vals=['', 'null', 'N/A', None, [], {}]) results[variant['_id']].append(variant) # Merge duplications for v in results.values(): if len(v) == 1: yield v[0] else: yield {'_id': v[0]['_id'], 'cgi': [i['cgi'] for i in v]}
def load_data(input_file):
    with open_anyfile(input_file) as in_f:
        for line in in_f:
            _id, caid = line.strip().split()
            yield {
                '_id': _id,
                'clingen': {"caid": caid},
            }
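
# A minimal smoke test for the loader above, assuming open_anyfile transparently
# handles plain-text paths; the identifiers in the fixture are made up.
def _demo_clingen_loader():
    import os
    import tempfile
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as tmp:
        tmp.write('rs0000001 CA123456\n')
        path = tmp.name
    try:
        assert list(load_data(path)) == [{'_id': 'rs0000001', 'clingen': {'caid': 'CA123456'}}]
    finally:
        os.remove(path)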
def load_data(input_file):
    with open_anyfile(input_file) as in_f:
        result = defaultdict(list)
        for line in in_f:
            pharos_id, _id = line.strip().split(',')
            # skip the header row and unmapped genes (entrez id '0')
            if _id != 'entrez_gene_id' and _id != '0':
                result[str(_id)].append(int(pharos_id))
        for k, v in result.items():
            json_doc = {'_id': str(k), 'pharos': {"target_id": v}}
            yield unlist(json_doc)
def load_data(input_file):
    with open_anyfile(input_file) as in_f:
        for line in in_f:
            pharos_id, _id = line.strip().split(',')
            # skip the header row
            if _id != 'entrez_gene_id':
                yield {
                    '_id': int(_id),
                    'pharos': {"target_id": int(pharos_id)},
                }
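
# Both pharos loaders lean on `unlist`, assumed to be the BioThings helper
# (biothings.utils.dataload.unlist), which collapses single-element lists
# anywhere in a doc. That lets the grouped loader accumulate target_id as a
# list and still emit a scalar when there is only one value:
#
#   >>> unlist({'_id': '1017', 'pharos': {'target_id': [42]}})
#   {'_id': '1017', 'pharos': {'target_id': 42}}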
def load_data(data_folder): input_file = os.path.join(data_folder, "phewas-catalog.csv") assert os.path.exists(input_file), "Can't find input file '%s'" % input_file with open_anyfile(input_file) as in_f: # Remove duplicated lines if any header = next(in_f).strip().split(',') header = [_item[1:-1] for _item in header] lines = set(list(in_f)) reader = DictReader(lines, fieldnames=header, delimiter=',') results = defaultdict(list) for row in reader: variant = {"associations": {"phenotype": {}}, "variant": {}} assert re.match("^rs\d+$", row["snp"]) != None variant["variant"]["rsid"] = row["snp"] variant["associations"]["phenotype"]["name"] = row["phewas phenotype"] variant["associations"]["cases"] = row["cases"] variant["associations"]["pval"] = float(row["p-value"]) variant["associations"]["odds-ratio"] = row["odds-ratio"] variant["associations"]["phenotype"]["phewas_code"] = row["phewas code"] variant["variant"]["gene"] = row["gene_name"] variant["variant"]["gwas_associations"] = row["gwas-associations"].split(',') pos_info = row["chromosome"].split(' ') if len(pos_info) == 2: variant["variant"]["chrom"], variant["variant"]["pos"] = pos_info else: variant["variant"]["chrom"] = pos_info[0] results[variant["variant"]["rsid"]].append(variant) # Merge duplications rsid_list = [_item for _item in results.keys()] hgvs_rsid_dict = batch_query_hgvs_from_rsid(rsid_list) for k, v in results.items(): if k in hgvs_rsid_dict and hgvs_rsid_dict[k]: if len(v) == 1: doc = {'_id': hgvs_rsid_dict[k], 'phewas': v[0]["variant"]} doc["phewas"]["associations"] = v[0]["associations"] yield dict_sweep(unlist(value_convert_to_number(doc, skipped_keys=['chrom'])), vals=[[], {}, None, '', 'NULL']) else: doc = {'_id': hgvs_rsid_dict[k], 'phewas': v[0]["variant"]} doc["phewas"]["associations"] = [] for _item in v: doc["phewas"]["associations"].append(_item["associations"]) yield dict_sweep(unlist(value_convert_to_number(doc, skipped_keys=['chrom'])), vals=[[], {}, None, '', 'NULL'])
def load_data(data_folder: str): """ Load data from a specified file path. Parse each line into a dictionary according to the schema given by `data_schema`. Then process each dict by normalizing data format, remove null fields (optional). Append each dict into final result using its id. :param data_folder: the path(folder) where the data file is stored :return: a generator that yields data. """ input_file = os.path.join(data_folder, file_name) # raise an error if file not found assert os.path.exists(input_file), FILE_NOT_FOUND_ERROR.format(input_file) with open_anyfile(input_file) as file: # Remove duplicated lines if any lines = set(list(file)) # read and parse each line into a dict reader = DictReader(lines, fieldnames=data_schema, delimiter=delimiter) # access non existing keys will return an empty list by default results = defaultdict(list) for row in reader: # start processing each lines of data (stored in dicts) # construct id (e.g. chr1:g.678900_679000) _id = '{chrom}:g.{start}_{end}'.format(chrom=row['chrom'], start=row['start'], end=row['end']) # optional step: normalize data variant = { k: unicodedata.normalize('NFKD', v) for k, v in row.items() } # optional step: delete invalid fields within each dict variant = dict_sweep(variant, vals=['', 'null', 'N/A', None, [], {}]) # append dict to the result list using the _id results[_id].append(variant) for k, v in results.items(): yield {'_id': k, source_name: v}
def load_data(data_folder): input_file = os.path.join(data_folder, "alternative") # input_file = os.path.join(data_folder, "gwas_catalog_v1.0.2-associations_e96_r2019-04-21.tsv") assert os.path.exists( input_file), "Can't find input file '%s'" % input_file with open_anyfile(input_file) as in_f: # Remove duplicated lines if any header = next(in_f).strip().split('\t') lines = set(list(in_f)) reader = DictReader(lines, fieldnames=header, delimiter='\t') results = defaultdict(list) rsid_list = [] for row in reader: rsids, _ = parse_separator_and_snps(row) if rsids: rsid_list += rsids hgvs_rsid_dict = batch_query_hgvs_from_rsid(rsid_list) reader = DictReader(lines, fieldnames=header, delimiter='\t') for row in reader: variant = {} HGVS = False snps, seperator = parse_separator_and_snps(row) if not snps: continue region = reorganize_field(row["REGION"], seperator, len(snps)) chrom = reorganize_field(row["CHR_ID"], seperator, len(snps)) genes = reorganize_field(row["REPORTED GENE(S)"], seperator, len(snps)) position = reorganize_field(row["CHR_POS"], seperator, len(snps)) context = reorganize_field(row["CONTEXT"], seperator, len(snps)) for i, _snp in enumerate(snps): variant = {} if _snp in hgvs_rsid_dict: variant["_id"] = hgvs_rsid_dict[_snp] else: continue variant['gwascatalog'] = { "associations": { 'efo': {}, 'study': {} } } if not HGVS: variant["gwascatalog"]["rsid"] = _snp variant['gwascatalog']['associations']['snps'] = snps variant['gwascatalog']['associations']['pubmed'] = int( row['PUBMEDID']) variant['gwascatalog']['associations']['date_added'] = row[ 'DATE ADDED TO CATALOG'] variant['gwascatalog']['associations']['study']['name'] = row[ 'STUDY'] variant['gwascatalog']['associations']['trait'] = row[ 'DISEASE/TRAIT'] variant['gwascatalog'][ 'region'] = region[i] if region else None if not chrom: chrom = [''] * 10 elif str(chrom[i]).lower() not in CHROM_LIST: chrom[i] = '' variant['gwascatalog']['chrom'] = chrom[i] if chrom else None variant['gwascatalog'][ 'pos'] = position[i] if position else None variant['gwascatalog']['gene'] = genes[i].split(',') if ( genes and genes[i]) else None variant['gwascatalog'][ 'context'] = context[i] if context else None variant['gwascatalog']['associations']['raf'] = str2float( row['RISK ALLELE FREQUENCY']) variant['gwascatalog']['associations']['pval'] = str2float( row['P-VALUE']) # variant['gwascatalog']['p_val_mlog'] = str2float(row['PVALUE_MLOG']) variant['gwascatalog']['associations']['study'][ 'platform'] = row['PLATFORM [SNPS PASSING QC]'] variant['gwascatalog']['associations']['study'][ 'accession'] = row['STUDY ACCESSION'] variant['gwascatalog']['associations']['efo']['name'] = row[ 'MAPPED_TRAIT'].split(',') variant['gwascatalog']['associations']['efo']['id'] = [ _item.split('/')[-1].replace('_', ':') for _item in row['MAPPED_TRAIT_URI'].split(',') ] variant = dict_sweep(unlist( value_convert_to_number(variant, skipped_keys=['chrom'])), vals=[[], {}, None, '', 'NR']) results[variant["_id"]].append(variant) for v in results.values(): if len(v) == 1: yield v[0] else: doc = {'_id': v[0]['_id'], 'gwascatalog': {'associations': []}} for _item in ['gene', 'region', 'pos', 'context', 'rsid']: if _item in v[0]['gwascatalog']: doc['gwascatalog'][_item] = v[0]['gwascatalog'][_item] doc['gwascatalog']['associations'] = [ i['gwascatalog']['associations'] for i in v ] yield doc
def parse_data(data_access): """ return: a list containing a nested dinctionary with ENTREZ ID as gene ID """ current_time = date.today().strftime("-%Y-%m-%d") file_name = "ClinGen-Gene-Disease-Summary{}.csv".format(str(current_time)) data_dir = os.path.join(data_access, file_name) # check if the file exist assert os.path.exists( data_dir), "input file '%s' does not exist" % data_dir # read file with open_anyfile(data_dir) as input_file: for _ in range(4): next(input_file) header = next(input_file).strip().split(",") next(input_file) reader = csv.DictReader(set(list(input_file)), fieldnames=header, delimiter=",") output = defaultdict(list) # initialize a list to store HGNC ID #hgnc_list = [] for row in reader: # skip samples with empty HGNC if not 'GENE ID (HGNC)' in row or not row['GENE ID (HGNC)']: continue # store HGNC gen ID for conversion hgnc_id = row['GENE ID (HGNC)'].split(':')[1] #hgnc_list.append(hgnc_id) # store every gene's information into a nested dictionary gene = {} gene['_id'] = hgnc_id gene['clingen'] = {} gene['clingen']['clinical_validity'] = {} key_list = [ 'DISEASE LABEL', 'DISEASE ID (MONDO)', 'SOP', 'CLASSIFICATION', 'ONLINE REPORT' ] # for each key, store the value into the gene dictionary for key in key_list: # disease value: "MONDO_ID" -> "MONDO:ID" if key == 'DISEASE ID (MONDO)': old_key = key complete_key = 'mondo' gene['clingen']['clinical_validity'][ complete_key] = row.get(old_key, None).replace("_", ":") elif key == 'CLASSIFICATION': old_key = key complete_key = key.lower().replace( ' ', '_') # key to lower case gene['clingen'][ 'clinical_validity'][complete_key] = row.get( old_key, None).lower() # value to lower case else: old_key = key complete_key = key.lower().replace( ' ', '_') # key to lower case gene['clingen']['clinical_validity'][ complete_key] = row.get(old_key, None) gene = dict_sweep(gene, vals=['', 'null', 'N/A', None, [], {}]) output[gene['_id']].append(gene) #entrez_hgnc_dict = hgnc2entrenz(hgnc_list) temp_output = [] # merge duplicates, this amy happen when a gene causes multiple diseases amd has multiple labels for value in output.values(): # genes without duplicate if len(value) == 1: temp_output.append(value[0]) # genes in duplicate else: temp_output.append({ '_id': value[0]['_id'], 'clingen': { 'clinical_validity': [v['clingen']['clinical_validity'] for v in value] } }) return hgnc2entrez(temp_output)