def load_refsnp_merged(fname):
    """
    Read a gzipped, whitespace-delimited refsnp-merged table into a dict.

    Each line is split on whitespace: the first field becomes the key and the
    second field becomes the value. When the same first field appears more
    than once, the last occurrence wins. A line with fewer than two fields
    raises IndexError.
    """
    debug(f"Loading refsnp_merged file '{fname}'...")

    with gzip.open(fname, "rt", encoding="utf-8") as handle:
        rows = (raw.strip().split() for raw in handle)
        merged_lookup = {row[0]: row[1] for row in rows}

    debug(f"Complete loading refsnp_merged file '{fname}'...")
    return merged_lookup
def parse_grch38_dbsnp(fname):
    """
    Parse a gzipped GRCh38 dbSNP table into ``{rsid: (chromosome, strand)}``.

    Each line is split on whitespace; field 0 is used as the rsid, field 1 as
    the chromosome and field 3 as the strand (field 2 is not used). When an
    rsid appears on several lines, the last line wins. A line with fewer than
    four fields raises IndexError.
    """
    debug('Began parsing GRCh38')
    db = {}
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with gzip.open(fname, 'rt', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            db[fields[0]] = (fields[1], fields[3])
    debug('Finishing parsing GRCh38')
    return db
def load_dbsnp_by_coordinate(fname, coordinates, offset=0):
    """
    Read in NCBI dbSNP and return the subset of entries keyed by coordinate.

    Every line is split on whitespace. Lines with fewer than three fields are
    skipped. The key is ``"<plink chromosome code>:<position + offset>"`` and
    only keys contained in ``coordinates`` are kept. Lines with four or more
    fields append ``"rs" + <field 0>`` to the list stored under the key.

    E.g.:
    db = {"1:1900500": ["rs123"], "3:2900500": ["rs456", "rs789"], ...}
    """
    chrom_codes = {str(n): str(n) for n in range(1, 23)}
    chrom_codes.update({
        "X": "23",
        "Y": "24",
        "PAR": "25",
        "M": "26",
        "MT": "26"
    })
    matches = {}

    debug(f"Loading dbSNP file '{fname}'...")
    with gzip.open(fname, "rt", encoding="utf-8") as handle:
        for raw in handle:
            fields = raw.strip().split()
            n_fields = len(fields)
            if n_fields < 3 or fields[2] == "":
                continue

            rsid = "rs" + fields[0]
            # NOTE(review): this lookup raises KeyError for any label that is
            # not 1-22, X, Y, PAR, M or MT -- including "AltOnly" -- so the
            # "AltOnly" branch further down looks unreachable. Confirm the
            # intended handling before changing it.
            code = chrom_codes[fields[1]]
            key = code + ":" + str(int(fields[2]) + offset)
            if key not in coordinates:
                continue

            if n_fields >= 4:
                matches.setdefault(key, []).append(rsid)
            elif fields[1] == "AltOnly":
                matches[key] = ["AltOnly"]
            else:
                debug("len(fields) < 4 and not AltOnly: " + str(fields))

    return matches
def load_dbsnp_by_snp_id(fname, snp_ids, offset=0):
    """
    Read in NCBI dbSNP and return the subset of entries keyed by SNP Id.

    Every line is split on whitespace and lines with fewer than three fields
    are skipped. Only ids (``"rs" + <field 0>``) contained in ``snp_ids`` are
    kept. The value is ``"<plink chromosome code>:<position + offset>"``,
    except when the chromosome field is ``AltOnly``, in which case the value
    is the list ``['AltOnly']``. Any other chromosome label that has no plink
    code raises KeyError.

    E.g.:
    db = {"rs123": "1:1900500", "rs456": "2:3434343"}
    """
    chrom_codes = {str(n): str(n) for n in range(1, 23)}
    chrom_codes.update({
        "X": "23",
        "Y": "24",
        "PAR": "25",
        "M": "26",
        "MT": "26"
    })
    found = {}

    debug(f"Loading dbSNP file '{fname}'...")
    with gzip.open(fname, "rt", encoding="utf-8") as handle:
        for raw in handle:
            fields = raw.strip().split()
            if len(fields) < 3 or fields[2] == "":
                continue

            rsid = "rs" + fields[0]
            if rsid not in snp_ids:
                continue

            label = fields[1]
            if label == 'AltOnly':
                # Checked before the code lookup: 'AltOnly' has no plink code.
                found[rsid] = ['AltOnly']
                continue

            found[rsid] = chrom_codes[label] + ':' + str(int(fields[2]) + offset)

    debug(f"Completed loading dbSNP file '{fname}'...")
    return found
def process_dbsnp(fnames, outfile):
    """
    Flatten bz2-compressed dbSNP JSON files into one gzipped text table.

    For each name in ``fnames`` the chromosome label is the group captured by
    the pattern ``chr(.*).json`` in that name. Every line of the file is
    decoded as UTF-8 JSON. Records with an empty ``present_obs_movements``
    are reported through ``debug`` and skipped; every other record appends
    the line ``<refsnp_id> <chromosome> <position> <orientation>`` to
    ``outfile + '.gz'``, which is opened in append mode.

    The orientation column is ``0`` for 'plus', ``1`` for 'minus' and ``-``
    for anything else, including records with no gene annotation.
    """
    with gzip.open(outfile + '.gz', 'at') as sink:
        for path in fnames:
            chromosome = re.search('chr(.*).json', path).group(1)
            debug(f'Began parsing chr{chromosome} dbsnp file')

            with bz2.open(path, "rb") as source:
                for raw in source:
                    record = json.loads(raw.decode('utf-8'))
                    snpid = record['refsnp_id']

                    movements = record['present_obs_movements']
                    if not movements:
                        debug(
                            f'rs{snpid} on chr{chromosome} does not have a position available'
                        )
                        continue
                    position = str(
                        movements[0]['allele_in_cur_release']['position'])

                    # Only the first allele annotation / assembly / gene is
                    # consulted for the orientation.
                    strand = '-'
                    first_allele = record['primary_snapshot_data'][
                        'allele_annotations'][0]
                    assemblies = first_allele['assembly_annotation']
                    if len(assemblies) > 0:
                        genes = assemblies[0]['genes']
                        if len(genes) > 0:
                            reported = genes[0]['orientation']
                            if reported == 'plus':
                                strand = '0'
                            elif reported == 'minus':
                                strand = '1'
                        else:
                            debug(
                                f'rs{snpid} on chr{chromosome} does not have a orientation information',
                                level=2)

                    print(" ".join((snpid, chromosome, position, strand)),
                          file=sink)

            debug(f'Finished parsing chr{chromosome} dbsnp file')
def map_using_coord_logic(bim_entries, snps, dbsnp, keep_multi=False, keep_unmapped_rsids=False, skip_rs_ids=False):
    """
    Decide, per bim entry, whether its snp id is renamed or deleted based on
    the ids dbsnp lists for the entry's ``chromosome:position``.

    bim_entries -- iterable of dicts with "chromosome", "position", "snp_id"
    snps        -- container of snp ids already present; used only to avoid
                   renaming onto an existing id in the multi-id case
    dbsnp       -- mapping of "chromosome:position" to a list of snp ids

    Returns ``(snps_to_delete, snps_to_update, multi_snps)``, where
    snps_to_update holds ``(old_id, new_id)`` pairs and multi_snps holds
    ``(coordinate, ids)`` pairs for coordinates with several ids (collected
    only when keep_multi is true).
    """
    to_delete = []
    to_update = []
    multi_hits = []

    for entry in bim_entries:
        coord = entry["chromosome"] + ":" + entry["position"]
        current_id = entry["snp_id"]
        is_rsid = current_id.startswith("rs")

        if is_rsid and skip_rs_ids:
            continue

        # No dbsnp record at this coordinate: drop it unless rs ids are kept.
        if coord not in dbsnp:
            if keep_unmapped_rsids and is_rsid:
                continue
            debug("NO_MATCH: " + "\t".join(entry.values()))
            to_delete.append(current_id)
            continue

        candidates = dbsnp[coord]

        # Unambiguous coordinate: rename when the id differs.
        if not len(candidates) > 1:
            replacement = candidates[0]
            if replacement != current_id:
                debug(
                    f"Rewrote snp_id {current_id} to {replacement} for position {coord}"
                )
                to_update.append((current_id, replacement))
            continue

        debug(f"Has more than one snp_id dbsnp[{coord}] = {str(candidates)}")

        if not keep_multi:
            if keep_unmapped_rsids and is_rsid:
                continue
            to_delete.append(current_id)
            continue

        multi_hits.append((coord, candidates))
        replacement = candidates[0]
        # Skip no-op renames (rs123 -> rs123) and renames onto an id that is
        # already in the bim, so duplicate snps are not included twice.
        if replacement != current_id and replacement not in snps:
            to_update.append((current_id, replacement))

    return to_delete, to_update, multi_hits
def process_rsmerge(fname, outfile):
    """
    Flatten a bz2-compressed merge JSON file into a gzipped text table.

    Every line of ``fname`` is decoded as UTF-8 JSON. When the record's
    ``merged_snapshot_data.merged_into`` list is non-empty, the line
    ``<refsnp_id> <first merged_into id>`` is appended to ``outfile + '.gz'``
    (opened in append mode); otherwise the record is only reported through
    ``debug``.
    """
    with gzip.open(outfile + '.gz', 'at') as sink:
        debug('Began parsing Rsmerge file')

        with bz2.open(fname, "rb") as source:
            for raw in source:
                record = json.loads(raw.decode('utf-8'))
                targets = record['merged_snapshot_data']['merged_into']
                has_target = len(targets) > 0
                snpid = record['refsnp_id']

                if not has_target:
                    debug(f'rs{snpid} has no merge info!')
                    continue

                print(snpid + " " + targets[0], file=sink)

        debug('Finished parsing Rsmerge file')
def map_chromosomes(grch37, grch38_db):
    """
    Combine GRCh37 positions with GRCh38 chromosome/strand per rsid.

    Each line of the gzipped ``grch37`` file must split into exactly three
    whitespace-separated fields (id, chromosome, position). Lines whose
    chromosome field does not start with "NC" are ignored. The first two
    characters of the id field are stripped and the remainder is looked up in
    ``grch38_db``, whose values are indexed as ``[0]`` chromosome and ``[1]``
    strand; ids missing there are reported through ``debug`` and skipped.

    An id that shows up again after it has been stored is treated as a
    multi-position entry: it is removed from the result and returned in the
    second element instead.

    Returns ``(db, multi_entries)`` where db maps id to
    ``"<chromosome> <position> <strand>"``.
    """
    debug('Began mapping GRCh37 chromosomes')
    mapped = {}
    duplicated = set()

    with gzip.open(grch37, 'rt') as handle:
        for raw in handle:
            rs_label, contig, position = raw.strip().split()
            if not contig.startswith("NC"):
                continue

            rsid = rs_label[2:]
            if rsid in mapped:
                duplicated.add(rsid)
            elif rsid in grch38_db:
                grch38_entry = grch38_db[rsid]
                mapped[rsid] = (grch38_entry[0] + " " + position + " " +
                                grch38_entry[1])
            else:
                debug(
                    f'{rsid} was not found in GRCh38, therefore no change in chromosome',
                    level=2)

    # remove any multi snps
    for rsid in duplicated:
        del mapped[rsid]

    debug('Finished mapping GRCh37 chromosomes')
    return mapped, duplicated
def _append_coord_changes(target_id, original_coord, current_coord,
                          coords_to_update, chromosomes_to_update):
    """Record position/chromosome differences between two "chr:pos" strings."""
    new_chromosome, new_position = current_coord.split(":")
    original_chromosome, original_position = original_coord.split(":")
    if new_position != original_position:
        coords_to_update.append((target_id, new_position))
    if new_chromosome != original_chromosome:
        chromosomes_to_update.append((target_id, new_chromosome))


def map_using_rs_id_logic(snp_map, dbsnp, unmappable_snps):
    """
    Work out which snps to delete, rename, reposition or move to another
    chromosome, based on their (possibly merged) rs ids.

    snp_map         -- re-iterable collection of
                       ``(snp_id, original_coord, snp_id_new)`` triples, where
                       original_coord is a "chromosome:position" string; it is
                       iterated twice
    dbsnp           -- mapping of snp id to its current "chromosome:position"
    unmappable_snps -- container of snp ids that are known but have no single
                       coordinate

    Returns ``(snps_to_delete, snps_to_update, coords_to_update,
    chromosomes_to_update)``. snps_to_update holds ``(old_id, new_id)`` pairs;
    the two coordinate lists hold ``(id, new_value)`` pairs keyed by the id
    the snp has after any rename.

    NOTE(review): values of dbsnp are split on ":", so they must be strings.
    load_dbsnp_by_snp_id stores the list ['AltOnly'] for AltOnly records; if
    that mapping is what gets passed here, such an entry would fail -- confirm
    against the caller.
    """
    snps_to_delete = []
    snps_to_update = []
    coords_to_update = []
    chromosomes_to_update = []
    snps_already_updated = set()

    # Built once up front. Rebuilding a list of every original id for each
    # merged entry made the loop quadratic in the size of snp_map.
    original_ids = {entry[0] for entry in snp_map}

    for snp_id, original_coord, snp_id_new in snp_map:
        # If the snp has been updated (merged)
        if snp_id_new != snp_id:
            # If the merged snp was already in the original
            if snp_id_new in original_ids:
                snps_to_delete.append(snp_id)

            elif snp_id_new in dbsnp:
                # If snp already being updated avoids duplicate snps
                if snp_id_new in snps_already_updated:
                    snps_to_delete.append(snp_id)
                    continue
                snps_to_update.append((snp_id, snp_id_new))
                snps_already_updated.add(snp_id_new)
                debug(
                    f"original_coord={original_coord} updated_coord={dbsnp[snp_id_new]}",
                    level=2)
                _append_coord_changes(snp_id_new, original_coord,
                                      dbsnp[snp_id_new], coords_to_update,
                                      chromosomes_to_update)

            # If snp_id is updated to snp_id_new but unable to update
            # chromosome and position
            elif snp_id_new in unmappable_snps:
                # If snp already being updated avoids duplicate snps
                if snp_id_new in snps_already_updated:
                    snps_to_delete.append(snp_id)
                    continue
                snps_to_update.append((snp_id, snp_id_new))
                snps_already_updated.add(snp_id_new)
                debug(
                    f"{snp_id} was updated to {snp_id_new} but cannot be updated by chr:position due to having multiple positions inside of GRCh37 VCF file"
                )

            else:
                snps_to_delete.append(snp_id)

        # If snp_id was not merged and is the same as snp_id_new (no change)
        elif snp_id in dbsnp:
            debug(
                f"original_coord={original_coord} updated_coord={dbsnp[snp_id]}",
                level=2)
            _append_coord_changes(snp_id, original_coord, dbsnp[snp_id],
                                  coords_to_update, chromosomes_to_update)

        elif snp_id in unmappable_snps:
            debug(
                f"{snp_id} cannot be updated due to having multiple positions inside of GRCh37 VCF file"
            )

        # If snp_id is not in dbsnp it has been deleted
        else:
            snps_to_delete.append(snp_id)

    return snps_to_delete, snps_to_update, coords_to_update, chromosomes_to_update