def calculate_matrix_svg(snplst, pop, request, genome_build, r2_d="r2", collapseTranscript=True):
    """Build the pairwise LD matrix figure for a variant list and export it.

    The function reads a variant list, resolves every entry to a genomic
    position through the ``dbsnp`` Mongo collection, pulls the matching
    1000 Genomes phased genotypes with tabix, computes D' and R2 for every
    pair of variants and renders four bokeh plots (matrix, connector, rug,
    gene track).  The plots are exported as SVG, stitched together with
    ``svgutils`` and rasterized to PDF/PNG/JPEG with ``phantomjs``.

    Args:
        snplst: Path of a text file; the first whitespace-separated token of
            each line is a variant, either an rsID (``rs...``) or a genomic
            coordinate (``chr<chrom>:<pos>``).  Blank lines and tokens that
            are neither are ignored.
        pop: Population codes joined with ``+`` (e.g. ``"CEU+YRI"``).  Codes
            that are not in the known list are silently skipped.
        request: Identifier embedded in the names of all temporary and output
            files (``<tmp_dir>matrix_plot_<request>.svg`` etc.).
        genome_build: Key into the module-level ``genome_build_vars`` table.
        r2_d: Accepted for interface compatibility; this function does not
            use it.
        collapseTranscript: The all-transcripts gene track is drawn only when
            this equals the *string* ``"false"``; every other value
            (including the default ``True``) selects the collapsed track.

    Returns:
        None.  Results are the files written to ``tmp_dir`` (which is
        concatenated directly with file names, so it is expected to end with
        a path separator).

    Raises:
        FileNotFoundError: If ``config.yml``, the variant list, a population
            sample file or the ``genes_*.json`` file for ``request`` is
            missing.
        IndexError / ValueError: If no usable variant or no sample of the
            requested populations is found (unchanged from the previous
            behaviour).
    """
    # Function-scope import, in the same spirit as the plotting imports below.
    import glob

    # ------------------------------------------------------------------
    # Configuration
    # ------------------------------------------------------------------
    # NOTE(review): yaml.load() without an explicit Loader is deprecated
    # since PyYAML 5.1 and rejected by PyYAML 6.  The call is left untouched
    # here; consider yaml.safe_load(yml_file) once the deployed PyYAML
    # version is confirmed.
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file)
    env = config['env']
    connect_external = config['database']['connect_external']
    api_mongo_addr = config['database']['api_mongo_addr']
    population_samples_dir = config['data']['population_samples_dir']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    genotypes_dir = config['data']['genotypes_dir']
    aws_info = config['aws']
    mongo_username = config['database']['mongo_user_readonly']
    mongo_password = config['database']['mongo_password']
    mongo_port = config['database']['mongo_port']
    # The return value is not needed here; the call is kept because it
    # presumably exports the AWS credentials used by the tabix/S3 helpers
    # below (verify against retrieveAWSCredentials).
    retrieveAWSCredentials()

    # Key of the position field in dbSNP documents for the selected build.
    position_key = genome_build_vars[genome_build]['position']

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # ------------------------------------------------------------------
    # Input variants (first-seen order, duplicates removed)
    # ------------------------------------------------------------------
    with open(snplst) as snp_file:
        snps_raw = snp_file.readlines()
    snps = []
    for snp_raw in snps_raw:
        snp = snp_raw.strip().split()
        if snp not in snps:
            snps.append(snp)

    # ------------------------------------------------------------------
    # Sample ids of the requested populations
    # ------------------------------------------------------------------
    known_pops = {
        "ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX",
        "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS",
        "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU",
        "TSI", "YRI",
    }
    pop_dirs = [
        data_dir + population_samples_dir + pop_i + ".txt"
        for pop_i in pop.split("+")
        if pop_i in known_pops
    ]
    # Read the sample files directly instead of shelling out to `cat`: no
    # shell is involved, and an empty population list can no longer make
    # `cat` block on stdin.
    pop_ids = set()
    for pop_path in pop_dirs:
        with open(pop_path, encoding='utf-8') as pop_file:
            pop_ids.update(line.strip() for line in pop_file)

    # ------------------------------------------------------------------
    # dbSNP lookups
    # ------------------------------------------------------------------
    def get_coords(db, rsid):
        """Return the dbSNP document for an rsID as plain JSON data, or None."""
        rsid = rsid.strip("rs")
        query_results = db.dbsnp.find_one({"id": rsid})
        return json.loads(json_util.dumps(query_results))

    def get_rsnum(db, coord):
        """Return the list of dbSNP documents located at ``chr<chrom>:<pos>``."""
        temp_coord = coord.strip("chr").split(":")
        chro = temp_coord[0]
        pos = temp_coord[1]
        query_results = db.dbsnp.find({
            "chromosome": chro.upper() if chro == 'x' or chro == 'y' else str(chro),
            position_key: str(pos)
        })
        return json.loads(json_util.dumps(query_results))

    def replace_coords_rsid(db, snp_lst):
        """Replace genomic-coordinate entries with the rsID found at that position."""
        new_snp_lst = []
        for snp_raw_i in snp_lst:
            # Pass through, untouched: blank lines (empty token list), rsIDs,
            # and tokens without ":" (they cannot be coordinates and would
            # otherwise crash get_rsnum).  Unusable entries are dropped by
            # the validity filter applied after this step.
            if (not snp_raw_i or snp_raw_i[0][0:2] == "rs"
                    or ":" not in snp_raw_i[0]):
                new_snp_lst.append(snp_raw_i)
                continue
            snp_info_lst = get_rsnum(db, snp_raw_i[0])
            if snp_info_lst is None or len(snp_info_lst) == 0:
                new_snp_lst.append(snp_raw_i)
                continue
            var_id = "rs" + snp_info_lst[0]['id']
            if len(snp_info_lst) > 1:
                # Several rsIDs share the position: prefer the first record
                # that is its own reference id.
                ref_variants = [
                    snp_info['id'] for snp_info in snp_info_lst
                    if snp_info['id'] == snp_info['ref_id']
                ]
                if ref_variants:
                    var_id = "rs" + ref_variants[0]
            new_snp_lst.append([var_id])
        return new_snp_lst

    # Connect to Mongo snp database
    if env == 'local' or connect_external:
        mongo_host = api_mongo_addr
    else:
        mongo_host = 'localhost'
    client = MongoClient(
        'mongodb://' + mongo_username + ':' + mongo_password + '@' +
        mongo_host + '/admin', mongo_port)
    try:
        db = client["LDLink"]
        snps = replace_coords_rsid(db, snps)

        # Each entry: [input id, chromosome, position]
        snp_coords = []
        for snp_i in snps:
            if len(snp_i) == 0 or len(snp_i[0]) <= 2:
                continue
            token = snp_i[0]
            looks_like_variant = (token[0:2] == "rs" or token[0:3] == "chr")
            if not (looks_like_variant and token[-1].isdigit()):
                continue
            snp_coord = get_coords(db, token)
            if snp_coord is None or snp_coord[position_key] == "NA":
                continue
            # chrY variants are skipped for the GRCh38 builds
            if snp_coord['chromosome'] == "Y" and genome_build in (
                    "grch38", "grch38_high_coverage"):
                continue
            snp_coords.append(
                [token, snp_coord['chromosome'], snp_coord[position_key]])
    finally:
        # The database is not needed past this point; release the connection
        # even when a lookup fails.
        client.close()

    # ------------------------------------------------------------------
    # Phased genotypes from the 1000 Genomes VCF
    # ------------------------------------------------------------------
    build_vars = genome_build_vars[genome_build]
    # All variants are queried on the chromosome of the first one.
    chromosome = snp_coords[0][1]
    snp_pos_int = sorted(int(coord[2]) for coord in snp_coords)
    snp_coord_str = [
        build_vars['1000G_chr_prefix'] + chromosome + ":" + str(p) + "-" + str(p)
        for p in snp_pos_int
    ]
    tabix_coords = " " + " ".join(snp_coord_str)

    vcf_filePath = "%s/%s%s/%s" % (
        config['aws']['data_subfolder'], genotypes_dir,
        build_vars['1000G_dir'], build_vars['1000G_file'] % (chromosome))
    vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)

    vcf, header_idx = retrieveTabix1000GData(
        vcf_query_snp_file, tabix_coords,
        data_dir + genotypes_dir + build_vars['1000G_dir'])

    def set_alleles(a1, a2):
        """Strip the shared leading base from indel alleles ("-" marks the empty side)."""
        if len(a1) == 1 and len(a2) == 1:
            a1_n = a1
            a2_n = a2
        elif len(a1) == 1 and len(a2) > 1:
            a1_n = "-"
            a2_n = a2[1:]
        elif len(a1) > 1 and len(a2) == 1:
            a1_n = a1[1:]
            a2_n = "-"
        elif len(a1) > 1 and len(a2) > 1:
            a1_n = a1[1:]
            a2_n = a2[1:]
        return (a1_n, a2_n)

    head = vcf[header_idx].strip().split()

    # VCF columns (sample columns start at 9) of the selected samples
    index = [col for col in range(9, len(head)) if head[col] in pop_ids]

    # One haplotype list per selected sample; at least one list is always
    # created so that all_haps[0] exists further down.
    hap1 = [[] for _ in range(max(len(index), 1))]
    hap2 = [[] for _ in range(max(len(index), 1))]

    snp_dict, missing_snp = parse_vcf(vcf[header_idx + 1:], snp_coords, True)

    rsnum_lst = []
    allele_lst = []
    pos_lst = []
    for s_key in snp_dict:
        # parse snp_key such as chr7:pos_rs4
        snp_keys = s_key.split("_")
        snp_key = snp_keys[0].split(':')[1]
        rs_input = snp_keys[1]
        for geno in snp_dict[s_key]:
            geno = geno.strip().split()
            geno[0] = geno[0].lstrip('chr')
            # if 1000G position does not match dbSNP position for variant,
            # use dbSNP position
            if geno[1] != snp_key:
                geno[1] = snp_key
            # Multi-allelic records are skipped
            if "," in geno[3] or "," in geno[4]:
                continue
            a1, a2 = set_alleles(geno[3], geno[4])
            # Genotype string -> (haplotype 1 allele, haplotype 2 allele);
            # "." marks a missing haplotype.
            phase_to_alleles = {
                "0|0": (a1, a1),
                "0|1": (a1, a2),
                "1|0": (a2, a1),
                "1|1": (a2, a2),
                "0": (a1, "."),
                "1": (a2, "."),
            }
            for sample_pos, col in enumerate(index):
                first, second = phase_to_alleles.get(geno[col], (".", "."))
                hap1[sample_pos].append(first)
                hap2[sample_pos].append(second)
            rsnum_lst.append(rs_input)
            pos_lst.append("chr" + geno[0] + ":" + geno[1] + "-" + geno[1])
            allele_lst.append(a1 + "/" + a2)

    # ------------------------------------------------------------------
    # Pairwise LD statistics
    # ------------------------------------------------------------------
    all_haps = hap1 + hap2
    n_variants = len(all_haps[0])
    no_match = " = , = "
    ld_matrix = [[[None, None] for _ in range(n_variants)]
                 for _ in range(n_variants)]

    for i in range(n_variants):
        for j in range(i, n_variants):
            # Count two-variant haplotypes, ignoring those with a missing
            # allele on either side.
            hap = {}
            for chrom_haps in all_haps:
                hap_k = chrom_haps[i] + chrom_haps[j]
                if "." in hap_k:
                    continue
                hap[hap_k] = hap.get(hap_k, 0) + 1

            # Make sure all four allele combinations are present
            if len(hap) != 4:
                snp_i_a = allele_lst[i].split("/")
                snp_j_a = allele_lst[j].split("/")
                for combo in (snp_i_a[0] + snp_j_a[0],
                              snp_i_a[0] + snp_j_a[1],
                              snp_i_a[1] + snp_j_a[0],
                              snp_i_a[1] + snp_j_a[1]):
                    hap.setdefault(combo, 0)

            # Perform LD calculations on the 2x2 table, haplotypes in
            # sorted-key order.
            hap_keys = sorted(hap)
            A = hap[hap_keys[0]]
            B = hap[hap_keys[1]]
            C = hap[hap_keys[2]]
            D_count = hap[hap_keys[3]]
            delta = float(A * D_count - B * C)
            Ms = float((A + C) * (B + D_count) * (A + B) * (C + D_count))
            if Ms != 0:
                # D prime
                if delta < 0:
                    d_max = min((A + C) * (A + B), (B + D_count) * (C + D_count))
                else:
                    d_max = min((A + C) * (C + D_count), (A + B) * (B + D_count))
                D_prime = round(abs(delta / d_max), 3)

                # R2
                r2 = round((delta**2) / Ms, 3)

                # Find Correlated Alleles
                if r2 > 0.1:
                    if (A * D_count) / max((B * C), 0.01) > 1:
                        pair_a, pair_b = hap_keys[0], hap_keys[3]
                    else:
                        pair_a, pair_b = hap_keys[1], hap_keys[2]
                    match = (pair_a[0] + "=" + pair_a[1] + "," +
                             pair_b[0] + "=" + pair_b[1])
                else:
                    match = no_match
            else:
                D_prime = "NA"
                r2 = "NA"
                match = no_match

            snp1 = rsnum_lst[i]
            snp2 = rsnum_lst[j]
            pos1 = pos_lst[i].split("-")[0]
            pos2 = pos_lst[j].split("-")[0]
            allele1 = allele_lst[i]
            allele2 = allele_lst[j]
            # `corr` is `match` with both sides of each "=" swapped, for the
            # cell above the diagonal; the mirrored cell keeps `match`.
            left_pair = match.split(",")[0].split("=")
            right_pair = match.split(",")[1].split("=")
            corr = (left_pair[1] + "=" + left_pair[0] + "," +
                    right_pair[1] + "=" + right_pair[0])

            ld_matrix[i][j] = [
                snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2
            ]
            ld_matrix[j][i] = [
                snp2, snp1, allele2, allele1, match, pos2, pos1, D_prime, r2
            ]

    # ------------------------------------------------------------------
    # Plot variables (one entry per matrix cell, row-major)
    # ------------------------------------------------------------------
    out = [cell for row_cells in ld_matrix for cell in row_cells]
    xnames = []
    ynames = []
    xA = []
    yA = []
    corA = []
    xpos = []
    ypos = []
    d_prime_labels = []
    r2_labels = []
    box_color = []
    box_trans = []
    # `out` is a flattened square matrix, so this is its side length.
    n_side = math.floor(math.sqrt(len(out)))
    for cell_idx, cell in enumerate(out):
        snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2 = cell
        xnames.append(snp1)
        ynames.append(snp2)
        xA.append(allele1)
        yA.append(allele2)
        corA.append(corr)
        xpos.append(pos1)
        ypos.append(pos2)
        col = cell_idx % n_side
        row = cell_idx // n_side
        # D' and R2 are always "NA" together (set in the same branch above).
        if r2 == "NA" or D_prime == "NA":
            d_prime_labels.append("NA")
            r2_labels.append("NA")
            box_color.append("gray")
            box_trans.append(0.1)
            continue
        d_prime_labels.append(str(round(float(D_prime), 4)))
        r2_labels.append(str(round(float(r2), 4)))
        if col < row:
            # One triangle is shaded by D' (blue) ...
            box_color.append("blue")
            box_trans.append(abs(D_prime))
        elif col > row:
            # ... the other by R2 (red) ...
            box_color.append("red")
            box_trans.append(r2)
        else:
            # ... and the diagonal is purple.
            box_color.append("purple")
            box_trans.append(r2)

    # Import plotting modules
    from collections import OrderedDict
    from bokeh.embed import components, file_html
    from bokeh.layouts import gridplot
    from bokeh.models import HoverTool, LinearAxis, Range1d
    from bokeh.plotting import ColumnDataSource, curdoc, figure, output_file, reset_output, save
    from bokeh.resources import CDN
    from bokeh.io import export_svgs
    import svgutils.compose as sg
    from math import pi

    reset_output()

    # ------------------------------------------------------------------
    # Aggregate plotting data (one entry per variant: every n-th cell)
    # ------------------------------------------------------------------
    x = []
    rug_y = []
    rug_w = []
    rug_h = []
    coord_snps_plot = []
    snp_id_plot = []
    alleles_snp_plot = []
    for cell_idx in range(0, len(xpos), int(len(xpos)**0.5)):
        x.append(int(xpos[cell_idx].split(":")[1]) / 1000000.0)
        rug_y.append(0.5)
        rug_w.append(0.00003)
        rug_h.append(1.06)
        coord_snps_plot.append(xpos[cell_idx])
        snp_id_plot.append(xnames[cell_idx])
        alleles_snp_plot.append(xA[cell_idx])

    buffer = (x[-1] - x[0]) * 0.025
    xr = Range1d(start=x[0] - buffer, end=x[-1] + buffer)
    yr = Range1d(start=-0.03, end=1.03)
    yr2 = Range1d(start=0, end=3.8)
    yr3 = Range1d(start=0, end=1)

    # Evenly spaced matrix columns across the plotted range; the connector
    # plot links each true position (x) to its column centre (x2).
    spacing = (x[-1] - x[0] + buffer + buffer) / (len(x) * 1.0)
    x2 = [x[0] - buffer + spacing * (k + 0.5) for k in range(len(x))]
    y0 = [0] * len(x)
    y1 = [0.20] * len(x)
    y2 = [0.80] * len(x)
    y3 = [1] * len(x)
    y4 = [1.15] * len(x)

    # Column centre repeated for every cell of that block of the flat matrix
    xname_pos = [centre for centre in x2 for _ in x2]

    data = {
        'xname': xnames,
        'xname_pos': xname_pos,
        'yname': ynames,
        'xA': xA,
        'yA': yA,
        'xpos': xpos,
        'ypos': ypos,
        'R2': r2_labels,
        'Dp': d_prime_labels,
        'corA': corA,
        'box_color': box_color,
        'box_trans': box_trans
    }

    source = ColumnDataSource(data)

    # Above this many input variants the per-variant labels are dropped.
    threshold = 70
    show_labels = len(snps) < threshold

    # ------------------------------------------------------------------
    # Matrix plot
    # ------------------------------------------------------------------
    matrix_kwargs = dict(
        outline_line_color="white",
        min_border_top=0,
        min_border_bottom=2,
        min_border_left=100,
        min_border_right=5,
        x_range=xr,
        y_range=list(reversed(rsnum_lst)),
        h_symmetry=False,
        v_symmetry=False,
        border_fill_color='white',
        x_axis_type=None,
        logo=None,
        tools="hover,undo,redo,reset,pan,box_zoom,previewsave",
        title=" ",
        plot_width=800,
        plot_height=700)
    if not show_labels:
        matrix_kwargs['y_axis_type'] = None
    matrix_plot = figure(**matrix_kwargs)

    matrix_plot.rect(x='xname_pos',
                     y='yname',
                     width=0.95 * spacing,
                     height=0.95,
                     source=source,
                     color="box_color",
                     alpha="box_trans",
                     line_color=None)

    matrix_plot.grid.grid_line_color = None
    matrix_plot.axis.axis_line_color = None
    matrix_plot.axis.major_tick_line_color = None
    if show_labels:
        matrix_plot.axis.major_label_text_font_size = "8pt"
        matrix_plot.xaxis.major_label_orientation = "vertical"

    matrix_plot.axis.major_label_text_font_style = "normal"
    matrix_plot.xaxis.major_label_standoff = 0

    sup_2 = "\u00B2"

    hover = matrix_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Variant 1", " " + "@yname (@yA)"),
        ("Variant 2", " " + "@xname (@xA)"),
        ("D\'", " " + "@Dp"),
        ("R" + sup_2, " " + "@R2"),
        ("Correlated Alleles", " " + "@corA"),
    ])

    # ------------------------------------------------------------------
    # Connector plot
    # ------------------------------------------------------------------
    connector = figure(outline_line_color="white",
                       y_axis_type=None,
                       x_axis_type=None,
                       x_range=xr,
                       y_range=yr2 if show_labels else yr3,
                       border_fill_color='white',
                       title="",
                       min_border_left=100,
                       min_border_right=5,
                       min_border_top=0,
                       min_border_bottom=0,
                       h_symmetry=False,
                       v_symmetry=False,
                       plot_width=800,
                       plot_height=90 if show_labels else 30,
                       tools="xpan,tap")
    connector.segment(x, y0, x, y1, color="black")
    connector.segment(x, y1, x2, y2, color="black")
    connector.segment(x2, y2, x2, y3, color="black")
    if show_labels:
        connector.text(x2,
                       y4,
                       text=snp_id_plot,
                       alpha=1,
                       angle=pi / 2,
                       text_font_size="8pt",
                       text_baseline="middle",
                       text_align="left")

    connector.yaxis.major_label_text_color = None
    connector.yaxis.minor_tick_line_alpha = 0  # Option does not work
    connector.yaxis.axis_label = " "
    connector.grid.grid_line_color = None
    connector.axis.axis_line_color = None
    connector.axis.major_tick_line_color = None
    connector.axis.minor_tick_line_color = None

    connector.toolbar_location = None

    # ------------------------------------------------------------------
    # Rug plot
    # ------------------------------------------------------------------
    data_rug = {
        'x': x,
        'y': rug_y,
        'w': rug_w,
        'h': rug_h,
        'coord_snps_plot': coord_snps_plot,
        'snp_id_plot': snp_id_plot,
        'alleles_snp_plot': alleles_snp_plot
    }

    source_rug = ColumnDataSource(data_rug)

    rug = figure(x_range=xr,
                 y_range=yr,
                 y_axis_type=None,
                 title="",
                 min_border_top=1,
                 min_border_bottom=0,
                 min_border_left=100,
                 min_border_right=5,
                 h_symmetry=False,
                 v_symmetry=False,
                 plot_width=800,
                 plot_height=50,
                 tools="hover,xpan,tap")
    rug.rect(x='x',
             y='y',
             width='w',
             height='h',
             fill_color='red',
             dilate=True,
             line_color=None,
             fill_alpha=0.6,
             source=source_rug)

    hover = rug.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("SNP", "@snp_id_plot (@alleles_snp_plot)"),
        ("Coord", "@coord_snps_plot"),
    ])

    rug.toolbar_location = None

    # ------------------------------------------------------------------
    # Gene plot
    # ------------------------------------------------------------------
    gap = 80000   # minimum distance (bp) between genes sharing a row
    tall = 0.75   # exon box height

    def assign_row(row_ends, tx_start, tx_end):
        """Return the 1-based row for a gene and record its end in ``row_ends``.

        The gene goes on the first row whose last gene ends more than
        ``gap`` bp before ``tx_start``; a new row is opened otherwise.
        """
        for row_idx, row_end in enumerate(row_ends):
            if tx_start > gap + row_end:
                row_ends[row_idx] = tx_end
                return row_idx + 1
        row_ends.append(tx_end)
        return len(row_ends)

    axis_label = ("Chromosome " + snp_coords[0][1] + " Coordinate (Mb)(" +
                  build_vars['title'] + ")")

    if collapseTranscript == "false":
        # Gene Plot (All Transcripts)
        genes_file = tmp_dir + "genes_" + request + ".json"
        with open(genes_file) as genes_fh:
            genes_raw = genes_fh.readlines()

        genes_plot_start = []
        genes_plot_end = []
        genes_plot_y = []
        genes_plot_name = []
        exons_plot_x = []
        exons_plot_y = []
        exons_plot_w = []
        exons_plot_h = []
        exons_plot_name = []
        exons_plot_id = []
        exons_plot_exon = []
        lines = [0]

        # One JSON document per line
        for gene_raw_obj in genes_raw:
            gene_obj = json.loads(gene_raw_obj)
            transcript_id = gene_obj["name"]
            strand = gene_obj["strand"]
            txStart = gene_obj["txStart"]
            txEnd = gene_obj["txEnd"]
            name = gene_obj["name2"]
            e_start = gene_obj["exonStarts"].split(",")
            e_end = gene_obj["exonEnds"].split(",")

            y_coord = assign_row(lines, int(txStart), int(txEnd))

            genes_plot_start.append(int(txStart) / 1000000.0)
            genes_plot_end.append(int(txEnd) / 1000000.0)
            genes_plot_y.append(y_coord)
            genes_plot_name.append(name + " ")

            # The last element of the split is skipped (the lists appear to
            # end with a trailing comma — confirm against the data files).
            for e_idx in range(len(e_start) - 1):
                if strand == "+":
                    exon = e_idx + 1
                else:
                    exon = len(e_start) - 1 - e_idx

                width = (int(e_end[e_idx]) - int(e_start[e_idx])) / 1000000.0
                x_coord = int(e_start[e_idx]) / 1000000.0 + (width / 2)

                exons_plot_x.append(x_coord)
                exons_plot_y.append(y_coord)
                exons_plot_w.append(width)
                exons_plot_h.append(tall)
                exons_plot_name.append(name)
                exons_plot_id.append(transcript_id)
                exons_plot_exon.append(exon)

        # Flip rows so that row 1 is drawn at the top
        n_rows = len(lines)
        genes_plot_yn = [n_rows - row + 0.5 for row in genes_plot_y]
        exons_plot_yn = [n_rows - row + 0.5 for row in exons_plot_y]
        yr2 = Range1d(start=0, end=n_rows)

        data_gene_plot = {
            'exons_plot_x': exons_plot_x,
            'exons_plot_yn': exons_plot_yn,
            'exons_plot_w': exons_plot_w,
            'exons_plot_h': exons_plot_h,
            'exons_plot_name': exons_plot_name,
            'exons_plot_id': exons_plot_id,
            'exons_plot_exon': exons_plot_exon,
            'coord_snps_plot': coord_snps_plot,
            'snp_id_plot': snp_id_plot,
            'alleles_snp_plot': alleles_snp_plot
        }

        source_gene_plot = ColumnDataSource(data_gene_plot)

        if len(lines) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines) - 2) * 50

        gene_plot = figure(
            min_border_top=2,
            min_border_bottom=0,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=yr2,
            border_fill_color='white',
            title="",
            h_symmetry=False,
            v_symmetry=False,
            logo=None,
            plot_width=800,
            plot_height=plot_h_pix,
            tools=
            "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        gene_plot.segment(genes_plot_start,
                          genes_plot_yn,
                          genes_plot_end,
                          genes_plot_yn,
                          color="black",
                          alpha=1,
                          line_width=2)
        gene_plot.rect(x='exons_plot_x',
                       y='exons_plot_yn',
                       width='exons_plot_w',
                       height='exons_plot_h',
                       source=source_gene_plot,
                       fill_color='grey',
                       line_color="grey")
        gene_plot.text(genes_plot_start,
                       genes_plot_yn,
                       text=genes_plot_name,
                       alpha=1,
                       text_font_size="7pt",
                       text_font_style="bold",
                       text_baseline="middle",
                       text_align="right",
                       angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_plot_name"),
            ("ID", "@exons_plot_id"),
            ("Exon", "@exons_plot_exon"),
        ])

        gene_plot.xaxis.axis_label = axis_label
        gene_plot.yaxis.axis_label = "Genes (All Transcripts)"
        gene_plot.ygrid.grid_line_color = None
        gene_plot.yaxis.axis_line_color = None
        gene_plot.yaxis.minor_tick_line_color = None
        gene_plot.yaxis.major_tick_line_color = None
        gene_plot.yaxis.major_label_text_color = None

        gene_plot.toolbar_location = "below"

    else:
        # Gene Plot (Collapsed)
        genes_c_file = tmp_dir + "genes_c_" + request + ".json"
        with open(genes_c_file) as genes_c_fh:
            genes_c_raw = genes_c_fh.readlines()

        genes_c_plot_start = []
        genes_c_plot_end = []
        genes_c_plot_y = []
        genes_c_plot_name = []
        exons_c_plot_x = []
        exons_c_plot_y = []
        exons_c_plot_w = []
        exons_c_plot_h = []
        exons_c_plot_name = []
        exons_c_plot_id = []
        lines_c = [0]

        # One JSON document per line
        for gene_c_raw_obj in genes_c_raw:
            gene_c_obj = json.loads(gene_c_raw_obj)
            txStart = gene_c_obj["txStart"]
            txEnd = gene_c_obj["txEnd"]
            name = gene_c_obj["name2"]
            e_start = gene_c_obj["exonStarts"].split(",")
            e_end = gene_c_obj["exonEnds"].split(",")
            e_transcripts = gene_c_obj["transcripts"].split(",")

            y_coord = assign_row(lines_c, int(txStart), int(txEnd))

            genes_c_plot_start.append(int(txStart) / 1000000.0)
            genes_c_plot_end.append(int(txEnd) / 1000000.0)
            genes_c_plot_y.append(y_coord)
            genes_c_plot_name.append(name + " ")

            # The last element of the split is skipped, as in the
            # all-transcripts branch.
            for e_idx in range(len(e_start) - 1):
                width = (int(e_end[e_idx]) - int(e_start[e_idx])) / 1000000.0
                x_coord = int(e_start[e_idx]) / 1000000.0 + (width / 2)

                exons_c_plot_x.append(x_coord)
                exons_c_plot_y.append(y_coord)
                exons_c_plot_w.append(width)
                exons_c_plot_h.append(tall)
                exons_c_plot_name.append(name)
                exons_c_plot_id.append(e_transcripts[e_idx].replace("-", ","))

        # Flip rows so that row 1 is drawn at the top
        n_rows_c = len(lines_c)
        genes_c_plot_yn = [n_rows_c - row + 0.5 for row in genes_c_plot_y]
        exons_c_plot_yn = [n_rows_c - row + 0.5 for row in exons_c_plot_y]
        yr2_c = Range1d(start=0, end=n_rows_c)

        data_gene_c_plot = {
            'exons_c_plot_x': exons_c_plot_x,
            'exons_c_plot_yn': exons_c_plot_yn,
            'exons_c_plot_w': exons_c_plot_w,
            'exons_c_plot_h': exons_c_plot_h,
            'exons_c_plot_name': exons_c_plot_name,
            'exons_c_plot_id': exons_c_plot_id
        }
        source_gene_c_plot = ColumnDataSource(data_gene_c_plot)

        if len(lines_c) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines_c) - 2) * 50

        gene_plot = figure(
            min_border_top=2,
            min_border_bottom=0,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=yr2_c,
            border_fill_color='white',
            title="",
            h_symmetry=False,
            v_symmetry=False,
            logo=None,
            plot_width=900,
            plot_height=plot_h_pix,
            tools=
            "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        gene_plot.segment(genes_c_plot_start,
                          genes_c_plot_yn,
                          genes_c_plot_end,
                          genes_c_plot_yn,
                          color="black",
                          alpha=1,
                          line_width=2)
        gene_plot.rect(x='exons_c_plot_x',
                       y='exons_c_plot_yn',
                       width='exons_c_plot_w',
                       height='exons_c_plot_h',
                       source=source_gene_c_plot,
                       fill_color="grey",
                       line_color="grey")
        gene_plot.text(genes_c_plot_start,
                       genes_c_plot_yn,
                       text=genes_c_plot_name,
                       alpha=1,
                       text_font_size="7pt",
                       text_font_style="bold",
                       text_baseline="middle",
                       text_align="right",
                       angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_c_plot_name"),
            ("Transcript IDs", "@exons_c_plot_id"),
        ])

        gene_plot.xaxis.axis_label = axis_label
        gene_plot.yaxis.axis_label = "Genes (Transcripts Collapsed)"
        gene_plot.ygrid.grid_line_color = None
        gene_plot.yaxis.axis_line_color = None
        gene_plot.yaxis.minor_tick_line_color = None
        gene_plot.yaxis.major_tick_line_color = None
        gene_plot.yaxis.major_label_text_color = None

        gene_plot.toolbar_location = "below"

    # ------------------------------------------------------------------
    # Export
    # ------------------------------------------------------------------
    # Change output backend to SVG temporarily for headless export
    # Will be changed back to canvas in LDlink.js
    matrix_plot.output_backend = "svg"
    connector.output_backend = "svg"
    rug.output_backend = "svg"
    gene_plot.output_backend = "svg"

    matrix_svg = tmp_dir + "matrix_plot_1_" + request + ".svg"
    connector_svg = tmp_dir + "connector_1_" + request + ".svg"
    rug_svg = tmp_dir + "rug_1_" + request + ".svg"
    gene_svg = tmp_dir + "gene_plot_1_" + request + ".svg"
    output_base = tmp_dir + "matrix_plot_" + request
    scaled_svg = tmp_dir + "matrix_plot_scaled_" + request + ".svg"

    export_svgs(matrix_plot, filename=matrix_svg)
    export_svgs(connector, filename=connector_svg)
    export_svgs(rug, filename=rug_svg)
    export_svgs(gene_plot, filename=gene_svg)

    # 1 pixel = 0.0264583333 cm
    svg_height = str(25.00 + (0.0264583333 * plot_h_pix)) + "cm"
    svg_height_scaled = str(110.00 + (0.1322916665 * plot_h_pix)) + "cm"

    # Concatenate svgs
    sg.Figure(
        "21.59cm", svg_height,
        sg.SVG(matrix_svg),
        sg.SVG(connector_svg).scale(.97).move(0, 700),
        sg.SVG(rug_svg).scale(.97).move(0, 790),
        sg.SVG(gene_svg).scale(.97).move(0, 840)).save(output_base + ".svg")

    sg.Figure(
        "107.95cm", svg_height_scaled,
        sg.SVG(matrix_svg).scale(5),
        sg.SVG(connector_svg).scale(4.85).move(0, 3500),
        sg.SVG(rug_svg).scale(4.85).move(0, 3930),
        sg.SVG(gene_svg).scale(4.85).move(0, 4160)).save(scaled_svg)

    def rasterize(svg_path, out_path):
        """Convert an SVG with phantomjs; the output type follows ``out_path``'s extension."""
        # An argument list (no shell) keeps `request`/`tmp_dir` from being
        # interpreted as shell syntax.
        try:
            subprocess.call(["phantomjs", "./rasterize.js", svg_path, out_path])
        except OSError:
            # phantomjs missing or not executable.  The shell command this
            # replaces only produced an ignored non-zero exit status in that
            # case, so the export stays best-effort.
            pass

    def remove_quietly(path):
        """Delete ``path`` if possible; mirrors the unchecked `rm` it replaces."""
        try:
            os.remove(path)
        except OSError:
            # Already gone or not removable: cleanup must not fail the request.
            pass

    # Export to PDF
    rasterize(output_base + ".svg", output_base + ".pdf")
    # Export to PNG
    rasterize(scaled_svg, output_base + ".png")
    # Export to JPEG
    rasterize(scaled_svg, output_base + ".jpeg")

    # Remove individual SVG files after they are combined
    for partial_svg in (matrix_svg, gene_svg, rug_svg, connector_svg):
        remove_quietly(partial_svg)
    # Remove scaled SVG file after it is converted to png and jpeg
    remove_quietly(scaled_svg)
    # Remove temporary gene file(s); only the wildcards of the pattern are
    # active, the directory and request id are matched literally.
    genes_pattern = (glob.escape(tmp_dir) + "genes_*" + glob.escape(request) +
                     "*.json")
    for genes_json in glob.glob(genes_pattern):
        remove_quietly(genes_json)

    reset_output()
    return None
def calculate_hap(snplst, pop, request, web, genome_build):
    """Compute haplotype counts and frequencies for a short list of variants.

    Args:
        snplst: Path to a text file with one RS number or ``chr:pos``
            coordinate per line (at most 30 lines are accepted).
        pop: One or more population codes joined with ``+`` (e.g. ``CEU+YRI``).
        request: Identifier used to name the output files written to the
            configured tmp directory.
        web: Passed through to ``connectMongoDBReadOnly``.
        genome_build: Key into ``genome_build_vars``; must be listed in
            ``genome_build_vars['vars']``.

    Returns:
        A JSON string. On success it holds ``"haplotypes"`` and ``"snps"``
        (plus ``"warning"`` when any warning was raised); on a validation
        failure it holds ``"error"``.

    Side effects:
        Writes ``snps_<request>.txt`` and ``haplotypes_<request>.txt`` into the
        configured tmp directory on success.
    """
    # Set data directories using config.yml
    # NOTE(review): yaml.load() without an explicit Loader is deprecated since
    # PyYAML 5.1 and rejected by PyYAML 6. config.yml is a local file, but
    # yaml.safe_load() is worth considering; left unchanged here because the
    # other functions in this file load the config the same way.
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file)
    dbsnp_version = config['data']['dbsnp_version']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    population_samples_dir = config['data']['population_samples_dir']
    genotypes_dir = config['data']['genotypes_dir']
    aws_info = config['aws']

    # Create JSON output
    output = {}

    def add_warning(message):
        # Warnings accumulate as consecutive sentences, each ending in ". ".
        output["warning"] = output.get("warning", "") + message

    def fail(message):
        # Build the error response, carrying along any warnings gathered so far.
        output["error"] = message + output.get("warning", "")
        return json.dumps(output, sort_keys=True, indent=2)

    # Validate genome build param
    if genome_build not in genome_build_vars['vars']:
        return fail("Invalid genome build. Please specify either " +
                    ", ".join(genome_build_vars['vars']) + ". ")

    build_vars = genome_build_vars[genome_build]
    pos_key = build_vars['position']

    # Open Inputted SNPs list file (closed deterministically)
    with open(snplst) as snp_file:
        snps_raw = snp_file.readlines()
    if len(snps_raw) > 30:
        return fail(
            "Maximum variant list is 30 RS numbers or coordinates. Your list contains " +
            str(len(snps_raw)) + " entries. ")

    # Remove duplicate RS numbers and cast to lower case
    snps = []
    for snp_raw in snps_raw:
        snp = snp_raw.lower().strip().split()
        if snp not in snps:
            snps.append(snp)

    # Select desired ancestral populations
    valid_pops = (
        "ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB",
        "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH",
        "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL",
        "PJL", "PUR", "STU", "TSI", "YRI",
    )
    pop_dirs = []
    for pop_i in pop.split("+"):
        if pop_i not in valid_pops:
            return fail(
                pop_i +
                " is not an ancestral population. Choose one of the following ancestral populations: AFR, AMR, EAS, EUR, or SAS; or one of the following sub-populations: ACB, ASW, BEB, CDX, CEU, CHB, CHS, CLM, ESN, FIN, GBR, GIH, GWD, IBS, ITU, JPT, KHV, LWK, MSL, MXL, PEL, PJL, PUR, STU, TSI, or YRI. "
            )
        pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt")

    # The shell command is built only from config values and whitelisted
    # population codes, never from raw user text.
    get_pops = "cat " + " ".join(pop_dirs)
    pop_list = [
        x.decode('utf-8') for x in subprocess.Popen(
            get_pops, shell=True, stdout=subprocess.PIPE).stdout.readlines()
    ]
    # Only used for membership tests, so a set is sufficient.
    pop_ids = set(i.strip() for i in pop_list)

    db = connectMongoDBReadOnly(web)

    def get_coords(db, rsid):
        # Look up one dbSNP record by numeric id (leading "rs" removed).
        rsid = rsid.strip("rs")
        query_results = db.dbsnp.find_one({"id": rsid})
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Query genomic coordinates
    def get_rsnum(db, coord):
        temp_coord = coord.strip("chr").split(":")
        if len(temp_coord) < 2:
            # Not shaped like "chr<N>:<pos>": report "no match" so the raw
            # input is kept and flagged later instead of raising IndexError.
            return []
        chro = temp_coord[0]
        pos = temp_coord[1]
        query_results = db.dbsnp.find({
            "chromosome":
            chro.upper() if chro == 'x' or chro == 'y' else str(chro),
            pos_key:
            str(pos)
        })
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coords_rsid(db, snp_lst):
        new_snp_lst = []
        for snp_raw_i in snp_lst:
            # Blank input lines produce an empty token list; pass them through
            # untouched (the validation loop below skips them).
            if not snp_raw_i or snp_raw_i[0][0:2] == "rs":
                new_snp_lst.append(snp_raw_i)
                continue
            snp_info_lst = get_rsnum(db, snp_raw_i[0])
            if not snp_info_lst:
                # No dbSNP record at this coordinate: keep the raw input.
                new_snp_lst.append(snp_raw_i)
                continue
            if len(snp_info_lst) == 1:
                new_snp_lst.append(["rs" + snp_info_lst[0]['id']])
                continue
            # Several records share the coordinate: prefer reference rsIDs
            # (records whose id equals their ref_id).
            ref_variants = [
                snp_info['id'] for snp_info in snp_info_lst
                if snp_info['id'] == snp_info['ref_id']
            ]
            candidates = ref_variants or [
                snp_info['id'] for snp_info in snp_info_lst
            ]
            if len(ref_variants) != 1:
                # Ambiguous mapping: list the rsIDs actually competing (the
                # previous code printed an empty list when no reference
                # variant existed).
                add_warning("Multiple rsIDs (" + ", ".join(
                    ["rs" + ref_id for ref_id in candidates]) +
                    ") map to genomic coordinates " + snp_raw_i[0] + ". ")
            new_snp_lst.append(["rs" + candidates[0]])
        return new_snp_lst

    snps = replace_coords_rsid(db, snps)

    # Find RS numbers and genomic coords in snp database
    rs_nums = []
    snp_pos = []
    snp_coords = []
    warn = []
    for snp_i in snps:
        if len(snp_i) == 0:
            continue
        query = snp_i[0]
        # Must look like an rsID or a coordinate and end in a digit.
        well_formed = (len(query) > 2
                       and (query[0:2] == "rs" or query[0:3] == "chr")
                       and query[-1].isdigit())
        if not well_formed:
            warn.append(query)
            continue
        snp_coord = get_coords(db, query)
        if snp_coord is None or snp_coord[pos_key] == "NA":
            warn.append(query)
            continue
        # check if variant is on chrY for genome build = GRCh38
        if snp_coord['chromosome'] == "Y" and (
                genome_build == "grch38"
                or genome_build == "grch38_high_coverage"):
            add_warning(
                "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 ("
                + "rs" + snp_coord['id'] + " = chr" +
                snp_coord['chromosome'] + ":" + snp_coord[pos_key] + "). ")
            warn.append(query)
        else:
            rs_nums.append(query)
            snp_pos.append(snp_coord[pos_key])
            snp_coords.append(
                [query, snp_coord['chromosome'], snp_coord[pos_key]])

    if warn:
        add_warning(
            "The following RS number(s) or coordinate(s) inputs have warnings: "
            + ", ".join(warn) + ". ")

    if len(rs_nums) == 0:
        return fail(
            "Input variant list does not contain any valid RS numbers or coordinates. "
        )

    # Check SNPs are all on the same chromosome
    for i in range(len(snp_coords)):
        if snp_coords[0][1] != snp_coords[i][1]:
            return fail(
                "Not all input variants are on the same chromosome: " +
                snp_coords[i - 1][0] + "=chr" + str(snp_coords[i - 1][1]) +
                ":" + str(snp_coords[i - 1][2]) + ", " + snp_coords[i][0] +
                "=chr" + str(snp_coords[i][1]) + ":" +
                str(snp_coords[i][2]) + ". ")

    # Check max distance between SNPs
    distance_bp = [int(coord[2]) for coord in snp_coords]
    distance_max = max(distance_bp) - min(distance_bp)
    if distance_max > 1000000:
        add_warning(
            "Switch rate errors become more common as distance between query variants increases (Query range = "
            + str(distance_max) + " bp). ")

    # Sort coordinates and make tabix formatted coordinates
    snp_pos_int = sorted(int(i) for i in snp_pos)
    snp_coord_str = [
        build_vars['1000G_chr_prefix'] + snp_coords[0][1] + ":" + str(i) +
        "-" + str(i) for i in snp_pos_int
    ]
    tabix_coords = " " + " ".join(snp_coord_str)

    # Extract 1000 Genomes phased genotypes
    vcf_filePath = "%s/%s%s/%s" % (
        config['aws']['data_subfolder'], genotypes_dir,
        build_vars['1000G_dir'],
        build_vars['1000G_file'] % (snp_coords[0][1]))
    vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'],
                                         vcf_filePath)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)

    # h is used below as the index of the VCF column-header line.
    vcf, h = retrieveTabix1000GData(
        vcf_query_snp_file, tabix_coords,
        data_dir + genotypes_dir + build_vars['1000G_dir'])

    # Define function to correct indel alleles
    def set_alleles(a1, a2):
        if len(a1) == 1 and len(a2) == 1:
            a1_n = a1
            a2_n = a2
        elif len(a1) == 1 and len(a2) > 1:
            a1_n = "-"
            a2_n = a2[1:]
        elif len(a1) > 1 and len(a2) == 1:
            a1_n = a1[1:]
            a2_n = "-"
        elif len(a1) > 1 and len(a2) > 1:
            a1_n = a1[1:]
            a2_n = a2[1:]
        return (a1_n, a2_n)

    head = vcf[h].strip().split()

    # Extract haplotypes: VCF sample columns start at index 9
    index = [i for i in range(9, len(head)) if head[i] in pop_ids]

    # One allele list per selected sample and per chromosome copy; at least
    # one slot is always allocated, as before.
    hap1 = [[] for _ in range(max(1, len(index)))]
    hap2 = [[] for _ in range(max(1, len(index)))]

    # parse vcf
    snp_dict, missing_snp = parse_vcf(vcf[h + 1:], snp_coords, True)

    # throw error if no data is returned from 1000G
    if len(missing_snp.split()) == len(snp_pos):
        return fail(
            "Input variant list does not contain any valid RS numbers or coordinates. "
        )

    if len(missing_snp) > 0:
        # This warning is deliberately placed in front of the earlier ones.
        output["warning"] = "Query variant " + str(
            missing_snp) + " is missing from 1000G (" + build_vars[
                'title'] + ") data. " + output.get("warning", "")

    rsnum_lst = []
    allele_lst = []
    pos_lst = []

    for s_key in snp_dict:
        # parse snp_key such as chr7:pos_rs4
        snp_keys = s_key.split("_")
        snp_key = snp_keys[0].split(':')[1]
        rs_input = snp_keys[1]
        geno_list = snp_dict[s_key]
        for geno in geno_list:
            geno = geno.strip().split()
            geno[0] = geno[0].lstrip('chr')
            # if 1000G position does not match dbSNP position for variant, use dbSNP position
            if geno[1] != snp_key:
                add_warning(
                    "Genomic position (" + geno[1] +
                    ") in 1000G data does not match dbSNP" + dbsnp_version +
                    " (" + build_vars['title'] +
                    ") search coordinates for query variant " + rs_input +
                    ". ")
                geno[1] = snp_key

            # Only biallelic records contribute to the haplotypes.
            if "," not in geno[3] and "," not in geno[4]:
                a1, a2 = set_alleles(geno[3], geno[4])
                count0 = 0
                count1 = 0
                for i in range(len(index)):
                    call = geno[index[i]]
                    if call == "0|0":
                        hap1[i].append(a1)
                        hap2[i].append(a1)
                        count0 += 2
                    elif call == "0|1":
                        hap1[i].append(a1)
                        hap2[i].append(a2)
                        count0 += 1
                        count1 += 1
                    elif call == "1|0":
                        hap1[i].append(a2)
                        hap2[i].append(a1)
                        count0 += 1
                        count1 += 1
                    elif call == "1|1":
                        hap1[i].append(a2)
                        hap2[i].append(a2)
                        count1 += 2
                    elif call == "0":
                        # Haploid call: second copy is marked missing.
                        hap1[i].append(a1)
                        hap2[i].append(".")
                        count0 += 1
                    elif call == "1":
                        hap1[i].append(a2)
                        hap2[i].append(".")
                        count1 += 1
                    else:
                        hap1[i].append(".")
                        hap2[i].append(".")

                rsnum_lst.append(rs_input)
                pos_lst.append("chr" + geno[0] + ":" + geno[1])

                # Guard against a variant with no called genotype at all,
                # which previously raised ZeroDivisionError.
                called = count0 + count1
                f0 = round(float(count0) / called, 4) if called else 0.0
                f1 = round(float(count1) / called, 4) if called else 0.0
                # Most frequent allele is reported first.
                if f0 >= f1:
                    alleles = a1 + "=" + str(round(f0, 3)) + ", " + \
                        a2 + "=" + str(round(f1, 3))
                else:
                    alleles = a2 + "=" + str(round(f1, 3)) + ", " + \
                        a1 + "=" + str(round(f0, 3))
                allele_lst.append(alleles)

    # Count each distinct haplotype string over both chromosome copies
    haps = {}
    for i in range(len(index)):
        for hap_str in ("_".join(hap1[i]), "_".join(hap2[i])):
            haps[hap_str] = haps.get(hap_str, 0) + 1

    # Remove Missing Haplotypes
    for key in list(haps.keys()):
        if "." in key:
            haps.pop(key, None)

    # Sort results: by count descending, ties broken by haplotype string
    results = [[hap, haps[hap]] for hap in haps]
    total_haps = sum(haps.values())
    results_sort1 = sorted(results, key=operator.itemgetter(0))
    results_sort2 = sorted(results_sort1,
                           key=operator.itemgetter(1),
                           reverse=True)

    # Generate JSON output; keys are zero-padded so they sort numerically
    digits = len(str(len(results_sort2)))
    haps_out = {}
    for i, (hap_str, hap_count) in enumerate(results_sort2, start=1):
        haps_out["haplotype_" + str(i).zfill(digits)] = {
            "Haplotype": hap_str,
            "Count": hap_count,
            "Frequency": round(float(hap_count) / total_haps, 4),
        }
    output["haplotypes"] = haps_out

    digits = len(str(len(rsnum_lst)))
    snps_out = {}
    for i in range(len(rsnum_lst)):
        snps_out["snp_" + str(i + 1).zfill(digits)] = {
            "RS": rsnum_lst[i],
            "Alleles": allele_lst[i],
            "Coord": pos_lst[i],
        }
    output["snps"] = snps_out

    # Create SNP File
    with open(tmp_dir + "snps_" + request + ".txt", "w") as snp_out:
        print("RS_Number\tPosition (" + build_vars['title_hg'] +
              ")\tAllele Frequency",
              file=snp_out)
        for k in sorted(output["snps"].keys()):
            rs_k = output["snps"][k]["RS"]
            coord_k = output["snps"][k]["Coord"]
            alleles_k0 = output["snps"][k]["Alleles"].strip(" ").split(",")
            # Right-pad the frequencies with zeros to a fixed width.
            alleles_k1 = alleles_k0[0] + "0" * (7 - len(str(alleles_k0[0]))) + \
                "," + alleles_k0[1] + "0" * (8 - len(str(alleles_k0[1])))
            print("\t".join([rs_k, coord_k, alleles_k1]), file=snp_out)

    # Create Haplotype File
    with open(tmp_dir + "haplotypes_" + request + ".txt", "w") as hap_out:
        print("Haplotype\tCount\tFrequency", file=hap_out)
        for k in sorted(output["haplotypes"].keys()):
            hap_k = output["haplotypes"][k]["Haplotype"]
            count_k = str(output["haplotypes"][k]["Count"])
            freq_k = str(output["haplotypes"][k]["Frequency"])
            print("\t".join([hap_k, count_k, freq_k]), file=hap_out)

    # Return JSON output
    return (json.dumps(output, sort_keys=True, indent=2))
def calculate_clip(snplst, pop, request, web, genome_build, r2_threshold=0.1, maf_threshold=0.01):
    """Thin a variant list by minor-allele frequency and pairwise R2.

    Args:
        snplst: Path to a text file with one RS number or ``chr:pos``
            coordinate per line (at most 5000 lines are accepted).
        pop: One or more population codes joined with ``+``.
        request: Identifier used to name the JSON status file.
        web: When truthy, the authenticated MongoDB URI is always used.
        genome_build: Key into ``genome_build_vars``; must be listed in
            ``genome_build_vars['vars']``.
        r2_threshold: A later variant whose R2 with a kept variant is at or
            above this value is removed.
        maf_threshold: Variants whose MAF is below this value are removed.

    Returns:
        ``(snps, rsnum_lst, details)`` on success, where ``snps`` is the
        de-duplicated input (coordinates replaced by rsIDs where possible),
        ``rsnum_lst`` the kept rsIDs and ``details`` an ordered mapping of
        per-variant notes. ``("", "", "")`` on a validation error.

    Side effects:
        Writes ``{"error": ...}`` and/or ``{"warning": ...}`` as JSON to
        ``<tmp_dir>clip<request>.json``.
    """
    max_list = 5000

    # Set data directories using config.yml
    # NOTE(review): yaml.load() without an explicit Loader is deprecated since
    # PyYAML 5.1 and rejected by PyYAML 6. config.yml is a local file, but
    # yaml.safe_load() is worth considering; left unchanged here because the
    # other functions in this file load the config the same way.
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file)
    env = config['env']
    api_mongo_addr = config['api']['api_mongo_addr']
    dbsnp_version = config['data']['dbsnp_version']
    population_samples_dir = config['data']['population_samples_dir']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    genotypes_dir = config['data']['genotypes_dir']
    aws_info = config['aws']
    mongo_username = config['database']['mongo_user_readonly']
    mongo_password = config['database']['mongo_password']
    mongo_port = config['database']['mongo_port']
    # NOTE(review): the return value is not used in this function; the call
    # is kept in case it has side effects -- confirm.
    export_s3_keys = retrieveAWSCredentials()

    # Ensure tmp directory exists
    os.makedirs(tmp_dir, exist_ok=True)

    # Create JSON output. The file is created/truncated up front (as before)
    # but no handle is held open across the computation, so nothing leaks if
    # a later step raises.
    out_json_path = tmp_dir + "clip" + request + ".json"
    with open(out_json_path, "w"):
        pass
    output = {}

    def write_output():
        # Persist the current error/warning state as pretty-printed JSON.
        with open(out_json_path, "w") as out_json:
            print(json.dumps(output, sort_keys=True, indent=2), file=out_json)

    def fail(message):
        # Record an error, persist it, and produce the error return value.
        output["error"] = message
        write_output()
        return ("", "", "")

    def add_warning(message):
        # Warnings are chained with ". " between them (no trailing period).
        if "warning" in output:
            output["warning"] = output["warning"] + ". " + message
        else:
            output["warning"] = message

    # Validate genome build param
    print("genome_build " + genome_build)
    if genome_build not in genome_build_vars['vars']:
        return fail("Invalid genome build. Please specify either " +
                    ", ".join(genome_build_vars['vars']) + ".")

    build_vars = genome_build_vars[genome_build]
    pos_key = build_vars['position']

    # Open SNP list file (closed deterministically)
    with open(snplst) as snp_file:
        snps_raw = snp_file.readlines()
    if len(snps_raw) > max_list:
        return fail("Maximum SNP list is " + str(max_list) +
                    " RS numbers. Your list contains " +
                    str(len(snps_raw)) + " entries.")

    # Remove duplicate RS numbers, preserving first-seen order. A set of
    # seen entries avoids a quadratic scan on lists of up to 5000 lines.
    snps = []
    seen = set()
    for snp_raw in snps_raw:
        snp = snp_raw.strip().split()
        key = tuple(snp)
        if key not in seen:
            seen.add(key)
            snps.append(snp)

    # Select desired ancestral populations
    valid_pops = (
        "ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB",
        "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH",
        "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL",
        "PJL", "PUR", "STU", "TSI", "YRI",
    )
    pop_dirs = []
    for pop_i in pop.split("+"):
        if pop_i not in valid_pops:
            return fail(
                pop_i +
                " is not an ancestral population. Choose one of the following ancestral populations: AFR, AMR, EAS, EUR, or SAS; or one of the following sub-populations: ACB, ASW, BEB, CDX, CEU, CHB, CHS, CLM, ESN, FIN, GBR, GIH, GWD, IBS, ITU, JPT, KHV, LWK, MSL, MXL, PEL, PJL, PUR, STU, TSI, or YRI."
            )
        pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt")

    # The shell command is built only from config values and whitelisted
    # population codes, never from raw user text.
    get_pops = "cat " + " ".join(pop_dirs)
    pop_list = [x.decode('utf-8') for x in subprocess.Popen(
        get_pops, shell=True, stdout=subprocess.PIPE).stdout.readlines()]
    # Only used for membership tests, so a set is sufficient.
    pop_ids = set(i.strip() for i in pop_list)

    # Connect to Mongo snp database
    if env == 'local':
        mongo_host = api_mongo_addr
    else:
        mongo_host = 'localhost'
    if web or env == 'local':
        client = MongoClient('mongodb://' + mongo_username + ':' +
                             mongo_password + '@' + mongo_host + '/admin',
                             mongo_port)
    else:
        client = MongoClient('localhost', mongo_port)
    db = client["LDLink"]

    def get_coords(db, rsid):
        # Look up one dbSNP record by numeric id (leading "rs" removed).
        rsid = rsid.strip("rs")
        query_results = db.dbsnp.find_one({"id": rsid})
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coords_rsid(db, snp_lst):
        new_snp_lst = []
        for snp_raw_i in snp_lst:
            # Blank input lines produce an empty token list; pass them through
            # so the validation loop below can report them instead of this
            # function raising IndexError.
            if not snp_raw_i or snp_raw_i[0][0:2] == "rs":
                new_snp_lst.append(snp_raw_i)
                continue
            # NOTE(review): get_rsnum is not defined inside this function;
            # presumably a module-level helper taking (db, coord, build).
            snp_info_lst = get_rsnum(db, snp_raw_i[0], genome_build)
            print("snp_info_lst")
            print(snp_info_lst)
            if not snp_info_lst:
                # No dbSNP record at this coordinate: keep the raw input.
                new_snp_lst.append(snp_raw_i)
                continue
            if len(snp_info_lst) == 1:
                new_snp_lst.append(["rs" + snp_info_lst[0]['id']])
                continue
            # Several records share the coordinate: prefer reference rsIDs
            # (records whose id equals their ref_id).
            ref_variants = [
                snp_info['id'] for snp_info in snp_info_lst
                if snp_info['id'] == snp_info['ref_id']
            ]
            candidates = ref_variants or [
                snp_info['id'] for snp_info in snp_info_lst
            ]
            if len(ref_variants) != 1:
                # Ambiguous mapping: list the rsIDs actually competing (the
                # previous code printed an empty list when no reference
                # variant existed).
                add_warning("Multiple rsIDs (" + ", ".join(
                    ["rs" + ref_id for ref_id in candidates]) +
                    ") map to genomic coordinates " + snp_raw_i[0])
            new_snp_lst.append(["rs" + candidates[0]])
        return new_snp_lst

    snps = replace_coords_rsid(db, snps)

    # Find RS numbers in snp database
    details = collections.OrderedDict()
    rs_nums = []
    snp_pos = []
    snp_coords = []
    warn = []
    for snp_i in snps:
        if len(snp_i) == 0:
            return fail("Input list of RS numbers is empty")
        query = snp_i[0]
        # Must look like an rsID or a coordinate and end in a digit.
        well_formed = (len(query) > 2
                       and (query[0:2] == "rs" or query[0:3] == "chr")
                       and query[-1].isdigit())
        if not well_formed:
            warn.append(query)
            details[query] = ["NA", "NA", "Not a RS number, query removed."]
            continue
        snp_coord = get_coords(db, query)
        if snp_coord is None or snp_coord[pos_key] == "NA":
            warn.append(query)
            details[query] = [
                "NA", "NA", "Variant not found in dbSNP" + dbsnp_version +
                " (" + build_vars['title'] + "), variant removed."
            ]
            continue
        # check if variant is on chrY for genome build = GRCh38
        if snp_coord['chromosome'] == "Y" and (
                genome_build == "grch38"
                or genome_build == "grch38_high_coverage"):
            add_warning(
                "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 ("
                + "rs" + snp_coord['id'] + " = chr" +
                snp_coord['chromosome'] + ":" + snp_coord[pos_key] + ")")
            warn.append(query)
            details[query] = [
                "NA", "NA",
                "Chromosome Y variants are unavailable for GRCh38, only available for GRCh37."
            ]
        else:
            rs_nums.append(query)
            snp_pos.append(snp_coord[pos_key])
            snp_coords.append(
                [query, snp_coord['chromosome'], snp_coord[pos_key]])

    if warn:
        add_warning(
            "The following RS number(s) or coordinate(s) inputs have warnings: "
            + ", ".join(warn))

    if len(rs_nums) == 0:
        # .get(): with an empty input list no warning exists yet, and the
        # previous direct lookup raised KeyError.
        return fail(
            "Input SNP list does not contain any valid RS numbers or coordinates. "
            + output.get("warning", ""))

    # Check SNPs are all on the same chromosome
    for i in range(len(snp_coords)):
        if snp_coords[0][1] != snp_coords[i][1]:
            return fail(
                "Not all input variants are on the same chromosome: " +
                snp_coords[i - 1][0] + "=chr" + str(snp_coords[i - 1][1]) +
                ":" + str(snp_coords[i - 1][2]) + ", " + snp_coords[i][0] +
                "=chr" + str(snp_coords[i][1]) + ":" +
                str(snp_coords[i][2]) + ".")

    # Make tabix formatted coordinates
    snp_coord_str = [
        build_vars['1000G_chr_prefix'] + snp_coords[0][1] + ":" + i + "-" + i
        for i in snp_pos
    ]
    tabix_coords = " " + " ".join(snp_coord_str)

    # Extract 1000 Genomes phased genotypes
    vcf_filePath = "%s/%s%s/%s" % (
        config['aws']['data_subfolder'], genotypes_dir,
        build_vars['1000G_dir'],
        build_vars['1000G_file'] % (snp_coords[0][1]))
    vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'],
                                         vcf_filePath)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)

    # NOTE(review): calculate_hap unpacks two values (vcf, h) from this same
    # helper, while a single value is expected here -- confirm which
    # signature is current.
    vcf = retrieveTabix1000GData(
        vcf_query_snp_file, tabix_coords,
        data_dir + genotypes_dir + build_vars['1000G_dir'])

    # Make MAF function
    def calc_maf(genos):
        vals = {"0|0": 0, "0|1": 0, "1|0": 0, "1|1": 0, "0": 0, "1": 0}
        for geno_call in genos:
            if geno_call in vals:
                vals[geno_call] += 1
        zeros = vals["0|0"]*2+vals["0|1"]+vals["1|0"]+vals["0"]
        ones = vals["1|1"]*2+vals["0|1"]+vals["1|0"]+vals["1"]
        total = zeros+ones
        if total == 0:
            # No called genotype in the selected samples: report zero
            # frequencies instead of raising ZeroDivisionError.
            return 0.0, 0.0, 0.0
        f0 = zeros*1.0/total
        f1 = ones*1.0/total
        maf = min(f0, f1)
        return f0, f1, maf

    # Define function to correct indel alleles
    def set_alleles(a1, a2):
        if len(a1) == 1 and len(a2) == 1:
            a1_n = a1
            a2_n = a2
        elif len(a1) == 1 and len(a2) > 1:
            a1_n = "-"
            a2_n = a2[1:]
        elif len(a1) > 1 and len(a2) == 1:
            a1_n = a1[1:]
            a2_n = "-"
        elif len(a1) > 1 and len(a2) > 1:
            a1_n = a1[1:]
            a2_n = a2[1:]
        return(a1_n, a2_n)

    # Make R2 function; returns None when R2 is undefined (zero denominator)
    def calc_r2(var1, var2):
        hap_vals = {"0|0-0|0": 0, "0|0-0|1": 0, "0|0-1|0": 0, "0|0-1|1": 0, "0|1-0|0": 0, "0|1-0|1": 0, "0|1-1|0": 0, "0|1-1|1": 0, "1|0-0|0": 0, "1|0-0|1": 0,
                    "1|0-1|0": 0, "1|0-1|1": 0, "1|1-0|0": 0, "1|1-0|1": 0, "1|1-1|0": 0, "1|1-1|1": 0, "0-0": 0, "0-1": 0, "1-0": 0, "1-1": 0}
        for i in range(len(var1)):
            ind_geno = var1[i]+"-"+var2[i]
            if ind_geno in hap_vals:
                hap_vals[ind_geno] += 1

        # A, B, C, D are the counts of the 00, 01, 10 and 11 haplotypes.
        A = hap_vals["0|0-0|0"]*2+hap_vals["0|0-0|1"]+hap_vals["0|0-1|0"]+hap_vals["0|1-0|0"] + \
            hap_vals["0|1-0|1"]+hap_vals["1|0-0|0"] + \
            hap_vals["1|0-1|0"]+hap_vals["0-0"]
        B = hap_vals["0|0-0|1"]+hap_vals["0|0-1|0"]+hap_vals["0|0-1|1"]*2+hap_vals["0|1-1|0"] + \
            hap_vals["0|1-1|1"]+hap_vals["1|0-0|1"] + \
            hap_vals["1|0-1|1"]+hap_vals["0-1"]
        C = hap_vals["0|1-0|0"]+hap_vals["0|1-1|0"]+hap_vals["1|0-0|0"]+hap_vals["1|0-0|1"] + \
            hap_vals["1|1-0|0"]*2+hap_vals["1|1-0|1"] + \
            hap_vals["1|1-1|0"]+hap_vals["1-0"]
        D = hap_vals["0|1-0|1"]+hap_vals["0|1-1|1"]+hap_vals["1|0-1|0"]+hap_vals["1|0-1|1"] + \
            hap_vals["1|1-0|1"]+hap_vals["1|1-1|0"] + \
            hap_vals["1|1-1|1"]*2+hap_vals["1-1"]

        delta = float(A*D-B*C)
        Ms = float((A+C)*(B+D)*(A+B)*(C+D))
        if Ms != 0:
            r2 = (delta**2)/Ms
        else:
            r2 = None
        return(r2)

    # Import SNP VCF file: skip the "##" meta lines to reach the header row
    hap_dict = {}
    h = 0
    while vcf[h][0:2] == "##":
        h += 1

    head = vcf[h].strip().split()

    # Extract population specific haplotypes (sample columns start at 9)
    pop_index = [i for i in range(9, len(head)) if head[i] in pop_ids]

    rsnum_lst = []

    for g in range(h+1, len(vcf)):
        geno = vcf[g].strip().split()
        geno[0] = geno[0].lstrip('chr')
        if geno[1] not in snp_pos:
            add_warning(
                "Genomic position (" + geno[1] +
                ") in VCF file does not match db" + dbsnp_version + " (" +
                build_vars['title'] +
                ") search coordinates for query variant")
            continue

        # Query rsIDs located at this VCF position, in input order.
        candidates = [
            rs_nums[p] for p in range(len(snp_pos)) if snp_pos[p] == geno[1]
        ]
        if len(candidates) == 1:
            rs_query = candidates[0]
        else:
            # Several queries share the position: take the first one not yet
            # accepted. Previously, when none was left, a stale rs_query from
            # an earlier record (or an unbound name) was used.
            rs_query = next(
                (rs for rs in candidates if rs not in rsnum_lst), None)
            if rs_query is None:
                continue

        if rs_query in rsnum_lst:
            continue

        rs_1000g = geno[2]

        if rs_query == rs_1000g:
            rsnum = rs_1000g
        else:
            # Check the two records before and after this one for the query
            # rsID; if it is there, that record will handle the query.
            found = False
            for offset in range(-2, 3):
                neighbor = g + offset
                if neighbor >= len(vcf):
                    break
                if neighbor < 0:
                    # Avoid wrapping around to the end of the list.
                    continue
                geno_next = vcf[neighbor].strip().split()
                if len(geno_next) >= 3 and rs_query == geno_next[2]:
                    found = True
                    break

            if found:
                continue

            if "rs" in rs_1000g:
                add_warning(
                    "Genomic position for query variant (" + rs_query +
                    ") does not match RS number at 1000G position (chr" +
                    geno[0] + ":" + geno[1] + " = " + rs_1000g + ")")

            # Keep the query rsID (not the 1000G one) as the reported name.
            indx = [i[0] for i in snps].index(rs_query)
            snps[indx][0] = rs_query
            rsnum = rs_query

        details[rsnum] = ["chr"+geno[0]+":"+geno[1]]

        if "," not in geno[3] and "," not in geno[4]:
            temp_genos = [geno[col] for col in pop_index]
            f0, f1, maf = calc_maf(temp_genos)
            a0, a1 = set_alleles(geno[3], geno[4])
            details[rsnum].append(
                a0+"="+str(round(f0, 3))+", "+a1+"="+str(round(f1, 3)))
            if maf_threshold <= maf:
                hap_dict[rsnum] = [temp_genos]
                rsnum_lst.append(rsnum)
            else:
                details[rsnum].append(
                    "Variant MAF is "+str(round(maf, 4))+", variant removed.")
        else:
            details[rsnum].append(geno[3]+"=NA, "+geno[4]+"=NA")
            details[rsnum].append("Variant is not biallelic, variant removed.")

    # Queries that never appeared in the VCF get an explanatory entry
    for index_i, rs_i in enumerate(rs_nums):
        if rs_i not in rsnum_lst and rs_i not in details:
            details[rs_i] = [
                "chr" + snp_coords[index_i][1] + ":" +
                snp_coords[index_i][2] + "-" + snp_coords[index_i][2], "NA",
                "Variant not in 1000G VCF file, variant removed."
            ]

    # Thin the SNPs: keep each variant in turn and drop every later variant
    # that is in LD with it at or above the threshold.
    # sup_2=u"\u00B2"
    sup_2 = "2"
    i = 0
    while i < len(rsnum_lst):
        details[rsnum_lst[i]].append("Variant kept.")
        remove_list = []
        for j in range(i+1, len(rsnum_lst)):
            r2 = calc_r2(hap_dict[rsnum_lst[i]][0], hap_dict[rsnum_lst[j]][0])
            # r2 is None when undefined; comparing None with a float raises
            # TypeError in Python 3, so such pairs are treated as not in LD.
            if r2 is not None and r2_threshold <= r2:
                snp = rsnum_lst[j]
                details[snp].append("Variant in LD with "+rsnum_lst[i] +
                                    " (R"+sup_2+"="+str(round(r2, 4))+"), variant removed.")
                remove_list.append(snp)
        for snp in remove_list:
            rsnum_lst.remove(snp)
        i += 1

    # Return output
    write_output()
    return(snps, rsnum_lst, details)