def calculate_matrix_svg(snplst, pop, request, genome_build, r2_d="r2", collapseTranscript=True): # Set data directories using config.yml with open('config.yml', 'r') as yml_file: config = yaml.load(yml_file) env = config['env'] api_mongo_addr = config['api']['api_mongo_addr'] population_samples_dir = config['data']['population_samples_dir'] data_dir = config['data']['data_dir'] tmp_dir = config['data']['tmp_dir'] genotypes_dir = config['data']['genotypes_dir'] aws_info = config['aws'] mongo_username = config['database']['mongo_user_readonly'] mongo_password = config['database']['mongo_password'] mongo_port = config['database']['mongo_port'] export_s3_keys = retrieveAWSCredentials() # Ensure tmp directory exists if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) # Open SNP list file snps_raw = open(snplst).readlines() # Remove duplicate RS numbers snps = [] for snp_raw in snps_raw: snp = snp_raw.strip().split() if snp not in snps: snps.append(snp) # Select desired ancestral populations pops = pop.split("+") pop_dirs = [] for pop_i in pops: if pop_i in [ "ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU", "TSI", "YRI" ]: pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt") get_pops = "cat " + " ".join(pop_dirs) pop_list = [ x.decode('utf-8') for x in subprocess.Popen( get_pops, shell=True, stdout=subprocess.PIPE).stdout.readlines() ] ids = [i.strip() for i in pop_list] pop_ids = list(set(ids)) # Connect to Mongo snp database if env == 'local': mongo_host = api_mongo_addr else: mongo_host = 'localhost' client = MongoClient( 'mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host + '/admin', mongo_port) db = client["LDLink"] def get_coords(db, rsid): rsid = rsid.strip("rs") query_results = db.dbsnp.find_one({"id": rsid}) query_results_sanitized = json.loads(json_util.dumps(query_results)) return query_results_sanitized # Query genomic coordinates def get_rsnum(db, coord): temp_coord = coord.strip("chr").split(":") chro = temp_coord[0] pos = temp_coord[1] query_results = db.dbsnp.find({ "chromosome": chro.upper() if chro == 'x' or chro == 'y' else str(chro), genome_build_vars[genome_build]['position']: str(pos) }) query_results_sanitized = json.loads(json_util.dumps(query_results)) return query_results_sanitized # Replace input genomic coordinates with variant ids (rsids) def replace_coords_rsid(db, snp_lst): new_snp_lst = [] for snp_raw_i in snp_lst: if snp_raw_i[0][0:2] == "rs": new_snp_lst.append(snp_raw_i) else: snp_info_lst = get_rsnum(db, snp_raw_i[0]) print("snp_info_lst") print(snp_info_lst) if snp_info_lst != None: if len(snp_info_lst) > 1: var_id = "rs" + snp_info_lst[0]['id'] ref_variants = [] for snp_info in snp_info_lst: if snp_info['id'] == snp_info['ref_id']: ref_variants.append(snp_info['id']) if len(ref_variants) > 1: var_id = "rs" + ref_variants[0] elif len(ref_variants) == 0 and len(snp_info_lst) > 1: var_id = "rs" + snp_info_lst[0]['id'] else: var_id = "rs" + ref_variants[0] new_snp_lst.append([var_id]) elif len(snp_info_lst) == 1: var_id = "rs" + snp_info_lst[0]['id'] new_snp_lst.append([var_id]) else: new_snp_lst.append(snp_raw_i) else: new_snp_lst.append(snp_raw_i) return new_snp_lst snps = replace_coords_rsid(db, snps) # Find RS numbers in snp database rs_nums = [] snp_pos = [] snp_coords = [] tabix_coords = "" for snp_i in snps: if len(snp_i) > 0: if len(snp_i[0]) > 2: if (snp_i[0][0:2] == "rs" or snp_i[0][0:3] == "chr") and snp_i[0][-1].isdigit(): snp_coord = get_coords(db, snp_i[0]) if snp_coord != None and snp_coord[genome_build_vars[ genome_build]['position']] != "NA": # check if variant is on chrY for genome build = GRCh38 if not (snp_coord['chromosome'] == "Y" and (genome_build == "grch38" or genome_build == "grch38_high_coverage")): rs_nums.append(snp_i[0]) snp_pos.append(snp_coord[ genome_build_vars[genome_build]['position']]) temp = [ snp_i[0], snp_coord['chromosome'], snp_coord[genome_build_vars[genome_build] ['position']] ] snp_coords.append(temp) # Check max distance between SNPs distance_bp = [] for i in range(len(snp_coords)): distance_bp.append(int(snp_coords[i][2])) # Sort coordinates and make tabix formatted coordinates snp_pos_int = [int(i) for i in snp_pos] snp_pos_int.sort() snp_coord_str = [ genome_build_vars[genome_build]['1000G_chr_prefix'] + snp_coords[0][1] + ":" + str(i) + "-" + str(i) for i in snp_pos_int ] tabix_coords = " " + " ".join(snp_coord_str) # Extract 1000 Genomes phased genotypes vcf_filePath = "%s/%s%s/%s" % ( config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]['1000G_dir'], genome_build_vars[genome_build]['1000G_file'] % (snp_coords[0][1])) vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath) checkS3File(aws_info, config['aws']['bucket'], vcf_filePath) # Define function to correct indel alleles def set_alleles(a1, a2): if len(a1) == 1 and len(a2) == 1: a1_n = a1 a2_n = a2 elif len(a1) == 1 and len(a2) > 1: a1_n = "-" a2_n = a2[1:] elif len(a1) > 1 and len(a2) == 1: a1_n = a1[1:] a2_n = "-" elif len(a1) > 1 and len(a2) > 1: a1_n = a1[1:] a2_n = a2[1:] return (a1_n, a2_n) # Import SNP VCF files tabix_snps = export_s3_keys + " cd {2}; tabix -fhD {0}{1} | grep -v -e END".format( vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) vcf = [ x.decode('utf-8') for x in subprocess.Popen( tabix_snps, shell=True, stdout=subprocess.PIPE).stdout.readlines() ] h = 0 while vcf[h][0:2] == "##": h += 1 head = vcf[h].strip().split() # Extract haplotypes index = [] for i in range(9, len(head)): if head[i] in pop_ids: index.append(i) hap1 = [[]] for i in range(len(index) - 1): hap1.append([]) hap2 = [[]] for i in range(len(index) - 1): hap2.append([]) rsnum_lst = [] allele_lst = [] pos_lst = [] for g in range(h + 1, len(vcf)): geno = vcf[g].strip().split() geno[0] = geno[0].lstrip('chr') if geno[1] not in snp_pos: continue if snp_pos.count(geno[1]) == 1: rs_query = rs_nums[snp_pos.index(geno[1])] else: pos_index = [] for p in range(len(snp_pos)): if snp_pos[p] == geno[1]: pos_index.append(p) for p in pos_index: if rs_nums[p] not in rsnum_lst: rs_query = rs_nums[p] break if rs_query in rsnum_lst: continue rs_1000g = geno[2] if rs_query == rs_1000g: rsnum = rs_1000g else: count = -2 found = "false" while count <= 2 and count + g < len(vcf): geno_next = vcf[g + count].strip().split() geno_next[0] = geno_next[0].lstrip('chr') if len(geno_next) >= 3 and rs_query == geno_next[2]: found = "true" break count += 1 if found == "false": indx = [i[0] for i in snps].index(rs_query) # snps[indx][0] = geno[2] # rsnum = geno[2] snps[indx][0] = rs_query rsnum = rs_query else: continue if "," not in geno[3] and "," not in geno[4]: a1, a2 = set_alleles(geno[3], geno[4]) for i in range(len(index)): if geno[index[i]] == "0|0": hap1[i].append(a1) hap2[i].append(a1) elif geno[index[i]] == "0|1": hap1[i].append(a1) hap2[i].append(a2) elif geno[index[i]] == "1|0": hap1[i].append(a2) hap2[i].append(a1) elif geno[index[i]] == "1|1": hap1[i].append(a2) hap2[i].append(a2) elif geno[index[i]] == "0": hap1[i].append(a1) hap2[i].append(".") elif geno[index[i]] == "1": hap1[i].append(a2) hap2[i].append(".") else: hap1[i].append(".") hap2[i].append(".") rsnum_lst.append(rsnum) position = "chr" + geno[0] + ":" + geno[1] + "-" + geno[1] pos_lst.append(position) alleles = a1 + "/" + a2 allele_lst.append(alleles) # Calculate Pairwise LD Statistics all_haps = hap1 + hap2 ld_matrix = [[[None for v in range(2)] for i in range(len(all_haps[0]))] for j in range(len(all_haps[0]))] for i in range(len(all_haps[0])): for j in range(i, len(all_haps[0])): hap = {} for k in range(len(all_haps)): # Extract haplotypes hap_k = all_haps[k][i] + all_haps[k][j] if hap_k in hap: hap[hap_k] += 1 else: hap[hap_k] = 1 # Remove Missing Haplotypes keys = list(hap.keys()) for key in keys: if "." in key: hap.pop(key, None) # Check all haplotypes are present if len(hap) != 4: snp_i_a = allele_lst[i].split("/") snp_j_a = allele_lst[j].split("/") haps = [ snp_i_a[0] + snp_j_a[0], snp_i_a[0] + snp_j_a[1], snp_i_a[1] + snp_j_a[0], snp_i_a[1] + snp_j_a[1] ] for h in haps: if h not in hap: hap[h] = 0 # Perform LD calculations A = hap[sorted(hap)[0]] B = hap[sorted(hap)[1]] C = hap[sorted(hap)[2]] D = hap[sorted(hap)[3]] tmax = max(A, B, C, D) delta = float(A * D - B * C) Ms = float((A + C) * (B + D) * (A + B) * (C + D)) if Ms != 0: # D prime if delta < 0: D_prime = round( abs(delta / min((A + C) * (A + B), (B + D) * (C + D))), 3) else: D_prime = round( abs(delta / min((A + C) * (C + D), (A + B) * (B + D))), 3) # R2 r2 = round((delta**2) / Ms, 3) # Find Correlated Alleles if str(r2) != "NA" and float(r2) > 0.1: Ac = hap[sorted(hap)[0]] Bc = hap[sorted(hap)[1]] Cc = hap[sorted(hap)[2]] Dc = hap[sorted(hap)[3]] if ((Ac * Dc) / max((Bc * Cc), 0.01) > 1): match = sorted(hap)[0][0] + "=" + sorted( hap)[0][1] + "," + sorted( hap)[3][0] + "=" + sorted(hap)[3][1] else: match = sorted(hap)[1][0] + "=" + sorted( hap)[1][1] + "," + sorted( hap)[2][0] + "=" + sorted(hap)[2][1] else: match = " = , = " else: D_prime = "NA" r2 = "NA" match = " = , = " snp1 = rsnum_lst[i] snp2 = rsnum_lst[j] pos1 = pos_lst[i].split("-")[0] pos2 = pos_lst[j].split("-")[0] allele1 = allele_lst[i] allele2 = allele_lst[j] corr = match.split(",")[0].split("=")[1] + "=" + match.split( ",")[0].split("=")[0] + "," + match.split(",")[1].split( "=")[1] + "=" + match.split(",")[1].split("=")[0] corr_f = match ld_matrix[i][j] = [ snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2 ] ld_matrix[j][i] = [ snp2, snp1, allele2, allele1, corr_f, pos2, pos1, D_prime, r2 ] # Generate Plot Variables out = [j for i in ld_matrix for j in i] xnames = [] ynames = [] xA = [] yA = [] corA = [] xpos = [] ypos = [] D = [] R = [] box_color = [] box_trans = [] if r2_d not in ["r2", "d"]: r2_d = "r2" for i in range(len(out)): snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2 = out[i] xnames.append(snp1) ynames.append(snp2) xA.append(allele1) yA.append(allele2) corA.append(corr) xpos.append(pos1) ypos.append(pos2) sqrti = math.floor(math.sqrt(len(out))) if sqrti == 0: D.append(str(round(float(D_prime), 4))) R.append(str(round(float(r2), 4))) box_color.append("red") box_trans.append(r2) elif i % sqrti < i // sqrti and r2 != "NA": D.append(str(round(float(D_prime), 4))) R.append(str(round(float(r2), 4))) box_color.append("blue") box_trans.append(abs(D_prime)) elif i % sqrti > i // sqrti and D_prime != "NA": D.append(str(round(float(D_prime), 4))) R.append(str(round(float(r2), 4))) box_color.append("red") box_trans.append(r2) elif i % sqrti == i // sqrti and D_prime != "NA": D.append(str(round(float(D_prime), 4))) R.append(str(round(float(r2), 4))) box_color.append("purple") box_trans.append(r2) else: D.append("NA") R.append("NA") box_color.append("gray") box_trans.append(0.1) # Import plotting modules from collections import OrderedDict from bokeh.embed import components, file_html from bokeh.layouts import gridplot from bokeh.models import HoverTool, LinearAxis, Range1d from bokeh.plotting import ColumnDataSource, curdoc, figure, output_file, reset_output, save from bokeh.resources import CDN from bokeh.io import export_svgs import svgutils.compose as sg from math import pi reset_output() # Aggregate Plotting Data x = [] y = [] w = [] h = [] coord_snps_plot = [] snp_id_plot = [] alleles_snp_plot = [] for i in range(0, len(xpos), int(len(xpos)**0.5)): x.append(int(xpos[i].split(":")[1]) / 1000000.0) y.append(0.5) w.append(0.00003) h.append(1.06) coord_snps_plot.append(xpos[i]) snp_id_plot.append(xnames[i]) alleles_snp_plot.append(xA[i]) buffer = (x[-1] - x[0]) * 0.025 xr = Range1d(start=x[0] - buffer, end=x[-1] + buffer) yr = Range1d(start=-0.03, end=1.03) y2_ll = [-0.03] * len(x) y2_ul = [1.03] * len(x) yr_pos = Range1d(start=(x[-1] + buffer) * -1, end=(x[0] - buffer) * -1) yr0 = Range1d(start=0, end=1) yr2 = Range1d(start=0, end=3.8) yr3 = Range1d(start=0, end=1) spacing = (x[-1] - x[0] + buffer + buffer) / (len(x) * 1.0) x2 = [] y0 = [] y1 = [] y2 = [] y3 = [] y4 = [] for i in range(len(x)): x2.append(x[0] - buffer + spacing * (i + 0.5)) y0.append(0) y1.append(0.20) y2.append(0.80) y3.append(1) y4.append(1.15) xname_pos = [] for i in x2: for j in range(len(x2)): xname_pos.append(i) data = { 'xname': xnames, 'xname_pos': xname_pos, 'yname': ynames, 'xA': xA, 'yA': yA, 'xpos': xpos, 'ypos': ypos, 'R2': R, 'Dp': D, 'corA': corA, 'box_color': box_color, 'box_trans': box_trans } source = ColumnDataSource(data) threshold = 70 if len(snps) < threshold: matrix_plot = figure( outline_line_color="white", min_border_top=0, min_border_bottom=2, min_border_left=100, min_border_right=5, x_range=xr, y_range=list(reversed(rsnum_lst)), h_symmetry=False, v_symmetry=False, border_fill_color='white', x_axis_type=None, logo=None, tools="hover,undo,redo,reset,pan,box_zoom,previewsave", title=" ", plot_width=800, plot_height=700) else: matrix_plot = figure( outline_line_color="white", min_border_top=0, min_border_bottom=2, min_border_left=100, min_border_right=5, x_range=xr, y_range=list(reversed(rsnum_lst)), h_symmetry=False, v_symmetry=False, border_fill_color='white', x_axis_type=None, y_axis_type=None, logo=None, tools="hover,undo,redo,reset,pan,box_zoom,previewsave", title=" ", plot_width=800, plot_height=700) matrix_plot.rect(x='xname_pos', y='yname', width=0.95 * spacing, height=0.95, source=source, color="box_color", alpha="box_trans", line_color=None) matrix_plot.grid.grid_line_color = None matrix_plot.axis.axis_line_color = None matrix_plot.axis.major_tick_line_color = None if len(snps) < threshold: matrix_plot.axis.major_label_text_font_size = "8pt" matrix_plot.xaxis.major_label_orientation = "vertical" matrix_plot.axis.major_label_text_font_style = "normal" matrix_plot.xaxis.major_label_standoff = 0 sup_2 = "\u00B2" hover = matrix_plot.select(dict(type=HoverTool)) hover.tooltips = OrderedDict([ ("Variant 1", " " + "@yname (@yA)"), ("Variant 2", " " + "@xname (@xA)"), ("D\'", " " + "@Dp"), ("R" + sup_2, " " + "@R2"), ("Correlated Alleles", " " + "@corA"), ]) # Connecting and Rug Plots # Connector Plot if len(snps) < threshold: connector = figure(outline_line_color="white", y_axis_type=None, x_axis_type=None, x_range=xr, y_range=yr2, border_fill_color='white', title="", min_border_left=100, min_border_right=5, min_border_top=0, min_border_bottom=0, h_symmetry=False, v_symmetry=False, plot_width=800, plot_height=90, tools="xpan,tap") connector.segment(x, y0, x, y1, color="black") connector.segment(x, y1, x2, y2, color="black") connector.segment(x2, y2, x2, y3, color="black") connector.text(x2, y4, text=snp_id_plot, alpha=1, angle=pi / 2, text_font_size="8pt", text_baseline="middle", text_align="left") else: connector = figure(outline_line_color="white", y_axis_type=None, x_axis_type=None, x_range=xr, y_range=yr3, border_fill_color='white', title="", min_border_left=100, min_border_right=5, min_border_top=0, min_border_bottom=0, h_symmetry=False, v_symmetry=False, plot_width=800, plot_height=30, tools="xpan,tap") connector.segment(x, y0, x, y1, color="black") connector.segment(x, y1, x2, y2, color="black") connector.segment(x2, y2, x2, y3, color="black") connector.yaxis.major_label_text_color = None connector.yaxis.minor_tick_line_alpha = 0 # Option does not work connector.yaxis.axis_label = " " connector.grid.grid_line_color = None connector.axis.axis_line_color = None connector.axis.major_tick_line_color = None connector.axis.minor_tick_line_color = None connector.toolbar_location = None data_rug = { 'x': x, 'y': y, 'w': w, 'h': h, 'coord_snps_plot': coord_snps_plot, 'snp_id_plot': snp_id_plot, 'alleles_snp_plot': alleles_snp_plot } source_rug = ColumnDataSource(data_rug) # Rug Plot rug = figure(x_range=xr, y_range=yr, y_axis_type=None, title="", min_border_top=1, min_border_bottom=0, min_border_left=100, min_border_right=5, h_symmetry=False, v_symmetry=False, plot_width=800, plot_height=50, tools="hover,xpan,tap") rug.rect(x='x', y='y', width='w', height='h', fill_color='red', dilate=True, line_color=None, fill_alpha=0.6, source=source_rug) hover = rug.select(dict(type=HoverTool)) hover.tooltips = OrderedDict([ ("SNP", "@snp_id_plot (@alleles_snp_plot)"), ("Coord", "@coord_snps_plot"), ]) rug.toolbar_location = None if collapseTranscript == "false": # Gene Plot (All Transcripts) genes_file = tmp_dir + "genes_" + request + ".json" genes_raw = open(genes_file).readlines() genes_plot_start = [] genes_plot_end = [] genes_plot_y = [] genes_plot_name = [] exons_plot_x = [] exons_plot_y = [] exons_plot_w = [] exons_plot_h = [] exons_plot_name = [] exons_plot_id = [] exons_plot_exon = [] message = ["Too many genes to plot."] lines = [0] gap = 80000 tall = 0.75 if genes_raw != None and len(genes_raw) > 0: for gene_raw_obj in genes_raw: gene_obj = json.loads(gene_raw_obj) bin = gene_obj["bin"] name_id = gene_obj["name"] chrom = gene_obj["chrom"] strand = gene_obj["strand"] txStart = gene_obj["txStart"] txEnd = gene_obj["txEnd"] cdsStart = gene_obj["cdsStart"] cdsEnd = gene_obj["cdsEnd"] exonCount = gene_obj["exonCount"] exonStarts = gene_obj["exonStarts"] exonEnds = gene_obj["exonEnds"] score = gene_obj["score"] name2 = gene_obj["name2"] cdsStartStat = gene_obj["cdsStartStat"] cdsEndStat = gene_obj["cdsEndStat"] exonFrames = gene_obj["exonFrames"] name = name2 id = name_id e_start = exonStarts.split(",") e_end = exonEnds.split(",") # Determine Y Coordinate i = 0 y_coord = None while y_coord == None: if i > len(lines) - 1: y_coord = i + 1 lines.append(int(txEnd)) elif int(txStart) > (gap + lines[i]): y_coord = i + 1 lines[i] = int(txEnd) else: i += 1 genes_plot_start.append(int(txStart) / 1000000.0) genes_plot_end.append(int(txEnd) / 1000000.0) genes_plot_y.append(y_coord) genes_plot_name.append(name + " ") for i in range(len(e_start) - 1): if strand == "+": exon = i + 1 else: exon = len(e_start) - 1 - i width = (int(e_end[i]) - int(e_start[i])) / 1000000.0 x_coord = int(e_start[i]) / 1000000.0 + (width / 2) exons_plot_x.append(x_coord) exons_plot_y.append(y_coord) exons_plot_w.append(width) exons_plot_h.append(tall) exons_plot_name.append(name) exons_plot_id.append(id) exons_plot_exon.append(exon) n_rows = len(lines) genes_plot_yn = [n_rows - w + 0.5 for w in genes_plot_y] exons_plot_yn = [n_rows - w + 0.5 for w in exons_plot_y] yr2 = Range1d(start=0, end=n_rows) data_gene_plot = { 'exons_plot_x': exons_plot_x, 'exons_plot_yn': exons_plot_yn, 'exons_plot_w': exons_plot_w, 'exons_plot_h': exons_plot_h, 'exons_plot_name': exons_plot_name, 'exons_plot_id': exons_plot_id, 'exons_plot_exon': exons_plot_exon, 'coord_snps_plot': coord_snps_plot, 'snp_id_plot': snp_id_plot, 'alleles_snp_plot': alleles_snp_plot } source_gene_plot = ColumnDataSource(data_gene_plot) max_genes = 40 # if len(lines) < 3 or len(genes_raw) > max_genes: if len(lines) < 3: plot_h_pix = 250 else: plot_h_pix = 250 + (len(lines) - 2) * 50 gene_plot = figure( min_border_top=2, min_border_bottom=0, min_border_left=100, min_border_right=5, x_range=xr, y_range=yr2, border_fill_color='white', title="", h_symmetry=False, v_symmetry=False, logo=None, plot_width=800, plot_height=plot_h_pix, tools= "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave") # if len(genes_raw) <= max_genes: gene_plot.segment(genes_plot_start, genes_plot_yn, genes_plot_end, genes_plot_yn, color="black", alpha=1, line_width=2) gene_plot.rect(x='exons_plot_x', y='exons_plot_yn', width='exons_plot_w', height='exons_plot_h', source=source_gene_plot, fill_color='grey', line_color="grey") gene_plot.text(genes_plot_start, genes_plot_yn, text=genes_plot_name, alpha=1, text_font_size="7pt", text_font_style="bold", text_baseline="middle", text_align="right", angle=0) hover = gene_plot.select(dict(type=HoverTool)) hover.tooltips = OrderedDict([ ("Gene", "@exons_plot_name"), ("ID", "@exons_plot_id"), ("Exon", "@exons_plot_exon"), ]) # else: # x_coord_text = x[0] + (x[-1] - x[0]) / 2.0 # gene_plot.text(x_coord_text, n_rows / 2.0, text=message, alpha=1, # text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0) gene_plot.xaxis.axis_label = "Chromosome " + \ snp_coords[1][1] + " Coordinate (Mb)(" + genome_build_vars[genome_build]['title'] + ")" gene_plot.yaxis.axis_label = "Genes (All Transcripts)" gene_plot.ygrid.grid_line_color = None gene_plot.yaxis.axis_line_color = None gene_plot.yaxis.minor_tick_line_color = None gene_plot.yaxis.major_tick_line_color = None gene_plot.yaxis.major_label_text_color = None gene_plot.toolbar_location = "below" # Gene Plot (Collapsed) else: genes_c_file = tmp_dir + "genes_c_" + request + ".json" genes_c_raw = open(genes_c_file).readlines() genes_c_plot_start = [] genes_c_plot_end = [] genes_c_plot_y = [] genes_c_plot_name = [] exons_c_plot_x = [] exons_c_plot_y = [] exons_c_plot_w = [] exons_c_plot_h = [] exons_c_plot_name = [] exons_c_plot_id = [] message_c = ["Too many genes to plot."] lines_c = [0] gap = 80000 tall = 0.75 if genes_c_raw != None and len(genes_c_raw) > 0: for gene_c_raw_obj in genes_c_raw: gene_c_obj = json.loads(gene_c_raw_obj) chrom = gene_c_obj["chrom"] txStart = gene_c_obj["txStart"] txEnd = gene_c_obj["txEnd"] exonStarts = gene_c_obj["exonStarts"] exonEnds = gene_c_obj["exonEnds"] name2 = gene_c_obj["name2"] transcripts = gene_c_obj["transcripts"] name = name2 e_start = exonStarts.split(",") e_end = exonEnds.split(",") e_transcripts = transcripts.split(",") # Determine Y Coordinate i = 0 y_coord = None while y_coord == None: if i > len(lines_c) - 1: y_coord = i + 1 lines_c.append(int(txEnd)) elif int(txStart) > (gap + lines_c[i]): y_coord = i + 1 lines_c[i] = int(txEnd) else: i += 1 genes_c_plot_start.append(int(txStart) / 1000000.0) genes_c_plot_end.append(int(txEnd) / 1000000.0) genes_c_plot_y.append(y_coord) genes_c_plot_name.append(name + " ") # for i in range(len(e_start)): for i in range(len(e_start) - 1): width = (int(e_end[i]) - int(e_start[i])) / 1000000.0 x_coord = int(e_start[i]) / 1000000.0 + (width / 2) exons_c_plot_x.append(x_coord) exons_c_plot_y.append(y_coord) exons_c_plot_w.append(width) exons_c_plot_h.append(tall) exons_c_plot_name.append(name) exons_c_plot_id.append(e_transcripts[i].replace("-", ",")) n_rows_c = len(lines_c) genes_c_plot_yn = [n_rows_c - x + 0.5 for x in genes_c_plot_y] exons_c_plot_yn = [n_rows_c - x + 0.5 for x in exons_c_plot_y] yr2_c = Range1d(start=0, end=n_rows_c) data_gene_c_plot = { 'exons_c_plot_x': exons_c_plot_x, 'exons_c_plot_yn': exons_c_plot_yn, 'exons_c_plot_w': exons_c_plot_w, 'exons_c_plot_h': exons_c_plot_h, 'exons_c_plot_name': exons_c_plot_name, 'exons_c_plot_id': exons_c_plot_id } source_gene_c_plot = ColumnDataSource(data_gene_c_plot) max_genes_c = 40 # if len(lines_c) < 3 or len(genes_c_raw) > max_genes_c: if len(lines_c) < 3: plot_h_pix = 250 else: plot_h_pix = 250 + (len(lines_c) - 2) * 50 gene_plot = figure( min_border_top=2, min_border_bottom=0, min_border_left=100, min_border_right=5, x_range=xr, y_range=yr2_c, border_fill_color='white', title="", h_symmetry=False, v_symmetry=False, logo=None, plot_width=900, plot_height=plot_h_pix, tools= "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave") # if len(genes_c_raw) <= max_genes_c: gene_plot.segment(genes_c_plot_start, genes_c_plot_yn, genes_c_plot_end, genes_c_plot_yn, color="black", alpha=1, line_width=2) gene_plot.rect(x='exons_c_plot_x', y='exons_c_plot_yn', width='exons_c_plot_w', height='exons_c_plot_h', source=source_gene_c_plot, fill_color="grey", line_color="grey") gene_plot.text(genes_c_plot_start, genes_c_plot_yn, text=genes_c_plot_name, alpha=1, text_font_size="7pt", text_font_style="bold", text_baseline="middle", text_align="right", angle=0) hover = gene_plot.select(dict(type=HoverTool)) hover.tooltips = OrderedDict([ ("Gene", "@exons_c_plot_name"), ("Transcript IDs", "@exons_c_plot_id"), ]) # else: # x_coord_text = coord1/1000000.0 + (coord2/1000000.0 - coord1/1000000.0) / 2.0 # gene_c_plot.text(x_coord_text, n_rows_c / 2.0, text=message_c, alpha=1, # text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0) gene_plot.xaxis.axis_label = "Chromosome " + snp_coords[1][ 1] + " Coordinate (Mb)(" + genome_build_vars[genome_build][ 'title'] + ")" gene_plot.yaxis.axis_label = "Genes (Transcripts Collapsed)" gene_plot.ygrid.grid_line_color = None gene_plot.yaxis.axis_line_color = None gene_plot.yaxis.minor_tick_line_color = None gene_plot.yaxis.major_tick_line_color = None gene_plot.yaxis.major_label_text_color = None gene_plot.toolbar_location = "below" # Change output backend to SVG temporarily for headless export # Will be changed back to canvas in LDlink.js matrix_plot.output_backend = "svg" connector.output_backend = "svg" rug.output_backend = "svg" gene_plot.output_backend = "svg" export_svgs(matrix_plot, filename=tmp_dir + "matrix_plot_1_" + request + ".svg") export_svgs(connector, filename=tmp_dir + "connector_1_" + request + ".svg") export_svgs(rug, filename=tmp_dir + "rug_1_" + request + ".svg") export_svgs(gene_plot, filename=tmp_dir + "gene_plot_1_" + request + ".svg") # 1 pixel = 0.0264583333 cm svg_height = str(25.00 + (0.0264583333 * plot_h_pix)) + "cm" svg_height_scaled = str(110.00 + (0.1322916665 * plot_h_pix)) + "cm" # Concatenate svgs sg.Figure( "21.59cm", svg_height, sg.SVG(tmp_dir + "matrix_plot_1_" + request + ".svg"), sg.SVG(tmp_dir + "connector_1_" + request + ".svg").scale(.97).move( 0, 700), sg.SVG(tmp_dir + "rug_1_" + request + ".svg").scale(.97).move(0, 790), sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(.97).move( 0, 840)).save(tmp_dir + "matrix_plot_" + request + ".svg") sg.Figure( "107.95cm", svg_height_scaled, sg.SVG(tmp_dir + "matrix_plot_1_" + request + ".svg").scale(5), sg.SVG(tmp_dir + "connector_1_" + request + ".svg").scale(4.85).move( 0, 3500), sg.SVG(tmp_dir + "rug_1_" + request + ".svg").scale(4.85).move( 0, 3930), sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(4.85).move( 0, 4160)).save(tmp_dir + "matrix_plot_scaled_" + request + ".svg") # Export to PDF subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "matrix_plot_" + request + ".svg " + tmp_dir + "matrix_plot_" + request + ".pdf", shell=True) # Export to PNG subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "matrix_plot_scaled_" + request + ".svg " + tmp_dir + "matrix_plot_" + request + ".png", shell=True) # Export to JPEG subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "matrix_plot_scaled_" + request + ".svg " + tmp_dir + "matrix_plot_" + request + ".jpeg", shell=True) # Remove individual SVG files after they are combined subprocess.call("rm " + tmp_dir + "matrix_plot_1_" + request + ".svg", shell=True) subprocess.call("rm " + tmp_dir + "gene_plot_1_" + request + ".svg", shell=True) subprocess.call("rm " + tmp_dir + "rug_1_" + request + ".svg", shell=True) subprocess.call("rm " + tmp_dir + "connector_1_" + request + ".svg", shell=True) # Remove scaled SVG file after it is converted to png and jpeg subprocess.call("rm " + tmp_dir + "matrix_plot_scaled_" + request + ".svg", shell=True) # Remove temporary file(s) subprocess.call("rm " + tmp_dir + "genes_*" + request + "*.json", shell=True) reset_output() return None
def calculate_proxy_svg(snp, pop, request, genome_build, r2_d="r2", window=500000, collapseTranscript=True): # Set data directories using config.yml with open('config.yml', 'r') as yml_file: config = yaml.load(yml_file) env = config['env'] connect_external = config['database']['connect_external'] api_mongo_addr = config['database']['api_mongo_addr'] data_dir = config['data']['data_dir'] tmp_dir = config['data']['tmp_dir'] genotypes_dir = config['data']['genotypes_dir'] mongo_username = config['database']['mongo_user_readonly'] mongo_password = config['database']['mongo_password'] mongo_port = config['database']['mongo_port'] aws_info = config['aws'] num_subprocesses = config['performance']['num_subprocesses'] export_s3_keys = retrieveAWSCredentials() # Ensure tmp directory exists if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) if request is False: request = str(time.strftime("%I%M%S")) # Create JSON output # Find coordinates (GRCh37/hg19) or (GRCh38/hg38) for SNP RS number # Connect to Mongo snp database if env == 'local' or connect_external: mongo_host = api_mongo_addr else: mongo_host = 'localhost' client = MongoClient( 'mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host + '/admin', mongo_port) db = client["LDLink"] def get_coords(db, rsid): rsid = rsid.strip("rs") query_results = db.dbsnp.find_one({"id": rsid}) query_results_sanitized = json.loads(json_util.dumps(query_results)) return query_results_sanitized # Query genomic coordinates def get_rsnum(db, coord): temp_coord = coord.strip("chr").split(":") chro = temp_coord[0] pos = temp_coord[1] query_results = db.dbsnp.find({ "chromosome": chro.upper() if chro == 'x' or chro == 'y' else str(chro), genome_build_vars[genome_build]['position']: str(pos) }) query_results_sanitized = json.loads(json_util.dumps(query_results)) return query_results_sanitized # Replace input genomic coordinates with variant ids (rsids) def replace_coord_rsid(db, snp): if snp[0:2] == "rs": return snp else: snp_info_lst = get_rsnum(db, snp) print("snp_info_lst") print(snp_info_lst) if snp_info_lst != None: if len(snp_info_lst) > 1: var_id = "rs" + snp_info_lst[0]['id'] ref_variants = [] for snp_info in snp_info_lst: if snp_info['id'] == snp_info['ref_id']: ref_variants.append(snp_info['id']) if len(ref_variants) > 1: var_id = "rs" + ref_variants[0] elif len(ref_variants) == 0 and len(snp_info_lst) > 1: var_id = "rs" + snp_info_lst[0]['id'] else: var_id = "rs" + ref_variants[0] return var_id elif len(snp_info_lst) == 1: var_id = "rs" + snp_info_lst[0]['id'] return var_id else: return snp else: return snp return snp snp = replace_coord_rsid(db, snp) # Find RS number in snp database snp_coord = get_coords(db, snp) # Get population ids from LDproxy.py tmp output files pop_list = open(tmp_dir + "pops_" + request + ".txt").readlines() ids = [] for i in range(len(pop_list)): ids.append(pop_list[i].strip()) pop_ids = list(set(ids)) # Extract query SNP phased genotypes vcf_filePath = "%s/%s%s/%s" % ( config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]['1000G_dir'], genome_build_vars[genome_build]['1000G_file'] % (snp_coord['chromosome'])) vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath) checkS3File(aws_info, config['aws']['bucket'], vcf_filePath) tabix_snp_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format( vcf_query_snp_file, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) head = [ x.decode('utf-8') for x in subprocess.Popen(tabix_snp_h, shell=True, stdout=subprocess.PIPE).stdout.readlines() ][0].strip().split() tabix_snp = export_s3_keys + " cd {4}; tabix -D {0} {1}:{2}-{2} | grep -v -e END > {3}".format( vcf_query_snp_file, genome_build_vars[genome_build]['1000G_chr_prefix'] + snp_coord['chromosome'], snp_coord[genome_build_vars[genome_build]['position']], tmp_dir + "snp_no_dups_" + request + ".vcf", data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) subprocess.call(tabix_snp, shell=True) # Check SNP is in the 1000G population, has the correct RS number, and not # monoallelic vcf = open(tmp_dir + "snp_no_dups_" + request + ".vcf").readlines() if len(vcf) == 0: subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True) subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True) return None elif len(vcf) > 1: geno = [] for i in range(len(vcf)): # if vcf[i].strip().split()[2] == snp: geno = vcf[i].strip().split() geno[0] = geno[0].lstrip('chr') if geno == []: subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True) subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True) return None else: geno = vcf[0].strip().split() geno[0] = geno[0].lstrip('chr') if geno[2] != snp and snp[0:2] == "rs" and "rs" in geno[2]: snp = geno[2] if "," in geno[3] or "," in geno[4]: subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True) subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True) return None index = [] for i in range(9, len(head)): if head[i] in pop_ids: index.append(i) genotypes = {"0": 0, "1": 0} for i in index: sub_geno = geno[i].split("|") for j in sub_geno: if j in genotypes: genotypes[j] += 1 else: genotypes[j] = 1 if genotypes["0"] == 0 or genotypes["1"] == 0: subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True) subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True) return None # Define window of interest around query SNP # window = 500000 coord1 = int( snp_coord[genome_build_vars[genome_build]['position']]) - window if coord1 < 0: coord1 = 0 coord2 = int( snp_coord[genome_build_vars[genome_build]['position']]) + window # Calculate proxy LD statistics in parallel # threads = 4 # block = (2 * window) // 4 # block = (2 * window) // num_subprocesses windowChunkRanges = chunkWindow( int(snp_coord[genome_build_vars[genome_build]['position']]), window, num_subprocesses) commands = [] # for i in range(num_subprocesses): # if i == min(range(num_subprocesses)) and i == max(range(num_subprocesses)): # command = "python3 LDproxy_sub.py " + "True " + snp + " " + \ # snp_coord['chromosome'] + " " + str(coord1) + " " + \ # str(coord2) + " " + request + " " + str(i) # elif i == min(range(num_subprocesses)): # command = "python3 LDproxy_sub.py " + "True " + snp + " " + \ # snp_coord['chromosome'] + " " + str(coord1) + " " + \ # str(coord1 + block) + " " + request + " " + str(i) # elif i == max(range(num_subprocesses)): # command = "python3 LDproxy_sub.py " + "True " + snp + " " + snp_coord['chromosome'] + " " + str( # coord1 + (block * i) + 1) + " " + str(coord2) + " " + request + " " + str(i) # else: # command = "python3 LDproxy_sub.py " + "True " + snp + " " + snp_coord['chromosome'] + " " + str(coord1 + ( # block * i) + 1) + " " + str(coord1 + (block * (i + 1))) + " " + request + " " + str(i) # commands.append(command) for subprocess_id in range(num_subprocesses): getWindowVariantsArgs = " ".join([ "True", str(snp), str(snp_coord['chromosome']), str(windowChunkRanges[subprocess_id][0]), str(windowChunkRanges[subprocess_id][1]), str(request), genome_build, str(subprocess_id) ]) commands.append("python3 LDproxy_sub.py " + getWindowVariantsArgs) processes = [ subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) for command in commands ] # collect output in parallel def get_output(process): return process.communicate()[0].splitlines() if not hasattr(threading.current_thread(), "_children"): threading.current_thread()._children = weakref.WeakKeyDictionary() pool = Pool(len(processes)) out_raw = pool.map(get_output, processes) pool.close() pool.join() # Aggregate output out_prox = [] for i in range(len(out_raw)): for j in range(len(out_raw[i])): col = out_raw[i][j].decode('utf-8').strip().split("\t") col[6] = int(col[6]) col[7] = float(col[7]) col[8] = float(col[8]) col.append(abs(int(col[6]))) out_prox.append(col) # Sort output if r2_d not in ["r2", "d"]: r2_d = "r2" out_dist_sort = sorted(out_prox, key=operator.itemgetter(14)) if r2_d == "r2": out_ld_sort = sorted(out_dist_sort, key=operator.itemgetter(8), reverse=True) else: out_ld_sort = sorted(out_dist_sort, key=operator.itemgetter(7), reverse=True) # Organize scatter plot data q_rs = [] q_allele = [] q_coord = [] q_maf = [] p_rs = [] p_allele = [] p_coord = [] p_maf = [] dist = [] d_prime = [] d_prime_round = [] r2 = [] r2_round = [] corr_alleles = [] regdb = [] funct = [] color = [] size = [] for i in range(len(out_ld_sort)): q_rs_i, q_allele_i, q_coord_i, p_rs_i, p_allele_i, p_coord_i, dist_i, d_prime_i, r2_i, corr_alleles_i, regdb_i, q_maf_i, p_maf_i, funct_i, dist_abs = out_ld_sort[ i] if float(r2_i) > 0.01: q_rs.append(q_rs_i) q_allele.append(q_allele_i) q_coord.append(float(q_coord_i.split(":")[1]) / 1000000) q_maf.append(str(round(float(q_maf_i), 4))) if p_rs_i == ".": p_rs_i = p_coord_i p_rs.append(p_rs_i) p_allele.append(p_allele_i) p_coord.append(float(p_coord_i.split(":")[1]) / 1000000) p_maf.append(str(round(float(p_maf_i), 4))) dist.append(str(round(dist_i / 1000000.0, 4))) d_prime.append(float(d_prime_i)) d_prime_round.append(str(round(float(d_prime_i), 4))) r2.append(float(r2_i)) r2_round.append(str(round(float(r2_i), 4))) corr_alleles.append(corr_alleles_i) # Correct Missing Annotations if regdb_i == ".": regdb_i = "" regdb.append(regdb_i) if funct_i == ".": funct_i = "" if funct_i == "NA": funct_i = "none" funct.append(funct_i) # Set Color if i == 0: color_i = "blue" elif funct_i != "none" and funct_i != "": color_i = "red" else: color_i = "orange" color.append(color_i) # Set Size size_i = 9 + float(p_maf_i) * 14.0 size.append(size_i) # Begin Bokeh Plotting from collections import OrderedDict from bokeh.embed import components, file_html from bokeh.layouts import gridplot from bokeh.models import HoverTool, LinearAxis, Range1d from bokeh.plotting import ColumnDataSource, curdoc, figure, output_file, reset_output, save from bokeh.resources import CDN from bokeh.io import export_svgs import svgutils.compose as sg reset_output() # Proxy Plot x = p_coord if r2_d == "r2": y = r2 else: y = d_prime whitespace = 0.01 xr = Range1d(start=coord1 / 1000000.0 - whitespace, end=coord2 / 1000000.0 + whitespace) yr = Range1d(start=-0.03, end=1.03) sup_2 = "\u00B2" proxy_plot = figure( title="Proxies for " + snp + " in " + pop, min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False, plot_width=900, plot_height=600, x_range=xr, y_range=yr, tools="hover,tap,pan,box_zoom,box_select,undo,redo,reset,previewsave", logo=None, toolbar_location="above") proxy_plot.title.align = "center" # Add recombination rate from LDproxy.py output file recomb_file = tmp_dir + "recomb_" + request + ".json" recomb_raw = open(recomb_file).readlines() recomb_x = [] recomb_y = [] for recomb_raw_obj in recomb_raw: recomb_obj = json.loads(recomb_raw_obj) recomb_x.append( int(recomb_obj[genome_build_vars[genome_build]['position']]) / 1000000.0) recomb_y.append(float(recomb_obj['rate']) / 100.0) data = { 'x': x, 'y': y, 'qrs': q_rs, 'q_alle': q_allele, 'q_maf': q_maf, 'prs': p_rs, 'p_alle': p_allele, 'p_maf': p_maf, 'dist': dist, 'r': r2_round, 'd': d_prime_round, 'alleles': corr_alleles, 'regdb': regdb, 'funct': funct, 'size': size, 'color': color } source = ColumnDataSource(data) proxy_plot.line(recomb_x, recomb_y, line_width=1, color="black", alpha=0.5) proxy_plot.circle(x='x', y='y', size='size', color='color', alpha=0.5, source=source) hover = proxy_plot.select(dict(type=HoverTool)) hover.tooltips = OrderedDict([ ("Query Variant", "@qrs @q_alle"), ("Proxy Variant", "@prs @p_alle"), ("Distance (Mb)", "@dist"), ("MAF (Query,Proxy)", "@q_maf,@p_maf"), ("R" + sup_2, "@r"), ("D\'", "@d"), ("Correlated Alleles", "@alleles"), ("RegulomeDB", "@regdb"), ("Functional Class", "@funct"), ]) proxy_plot.text(x, y, text=regdb, alpha=1, text_font_size="7pt", text_baseline="middle", text_align="center", angle=0) if r2_d == "r2": proxy_plot.yaxis.axis_label = "R" + sup_2 else: proxy_plot.yaxis.axis_label = "D\'" proxy_plot.extra_y_ranges = {"y2_axis": Range1d(start=-3, end=103)} proxy_plot.add_layout( LinearAxis(y_range_name="y2_axis", axis_label="Combined Recombination Rate (cM/Mb)"), "right") # Rug Plot y2_ll = [-0.03] * len(x) y2_ul = [1.03] * len(x) yr_rug = Range1d(start=-0.03, end=1.03) data_rug = { 'x': x, 'y': y, 'y2_ll': y2_ll, 'y2_ul': y2_ul, 'qrs': q_rs, 'q_alle': q_allele, 'q_maf': q_maf, 'prs': p_rs, 'p_alle': p_allele, 'p_maf': p_maf, 'dist': dist, 'r': r2_round, 'd': d_prime_round, 'alleles': corr_alleles, 'regdb': regdb, 'funct': funct, 'size': size, 'color': color } source_rug = ColumnDataSource(data_rug) rug = figure(x_range=xr, y_range=yr_rug, border_fill_color='white', y_axis_type=None, title="", min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False, plot_width=900, plot_height=50, tools="xpan,tap", logo=None) rug.segment(x0='x', y0='y2_ll', x1='x', y1='y2_ul', source=source_rug, color='color', alpha=0.5, line_width=1) rug.toolbar_location = None if collapseTranscript == "false": # Gene Plot (All Transcripts) genes_file = tmp_dir + "genes_" + request + ".json" genes_raw = open(genes_file).readlines() genes_plot_start = [] genes_plot_end = [] genes_plot_y = [] genes_plot_name = [] exons_plot_x = [] exons_plot_y = [] exons_plot_w = [] exons_plot_h = [] exons_plot_name = [] exons_plot_id = [] exons_plot_exon = [] lines = [0] gap = 80000 tall = 0.75 if genes_raw != None and len(genes_raw) > 0: for gene_raw_obj in genes_raw: gene_obj = json.loads(gene_raw_obj) bin = gene_obj["bin"] name_id = gene_obj["name"] chrom = gene_obj["chrom"] strand = gene_obj["strand"] txStart = gene_obj["txStart"] txEnd = gene_obj["txEnd"] cdsStart = gene_obj["cdsStart"] cdsEnd = gene_obj["cdsEnd"] exonCount = gene_obj["exonCount"] exonStarts = gene_obj["exonStarts"] exonEnds = gene_obj["exonEnds"] score = gene_obj["score"] name2 = gene_obj["name2"] cdsStartStat = gene_obj["cdsStartStat"] cdsEndStat = gene_obj["cdsEndStat"] exonFrames = gene_obj["exonFrames"] name = name2 id = name_id e_start = exonStarts.split(",") e_end = exonEnds.split(",") # Determine Y Coordinate i = 0 y_coord = None while y_coord == None: if i > len(lines) - 1: y_coord = i + 1 lines.append(int(txEnd)) elif int(txStart) > (gap + lines[i]): y_coord = i + 1 lines[i] = int(txEnd) else: i += 1 genes_plot_start.append(int(txStart) / 1000000.0) genes_plot_end.append(int(txEnd) / 1000000.0) genes_plot_y.append(y_coord) genes_plot_name.append(name + " ") for i in range(len(e_start) - 1): if strand == "+": exon = i + 1 else: exon = len(e_start) - 1 - i width = (int(e_end[i]) - int(e_start[i])) / 1000000.0 x_coord = int(e_start[i]) / 1000000.0 + (width / 2) exons_plot_x.append(x_coord) exons_plot_y.append(y_coord) exons_plot_w.append(width) exons_plot_h.append(tall) exons_plot_name.append(name) exons_plot_id.append(id) exons_plot_exon.append(exon) n_rows = len(lines) genes_plot_yn = [n_rows - x + 0.5 for x in genes_plot_y] exons_plot_yn = [n_rows - x + 0.5 for x in exons_plot_y] yr2 = Range1d(start=0, end=n_rows) data_gene_plot = { 'exons_plot_x': exons_plot_x, 'exons_plot_yn': exons_plot_yn, 'exons_plot_w': exons_plot_w, 'exons_plot_h': exons_plot_h, 'exons_plot_name': exons_plot_name, 'exons_plot_id': exons_plot_id, 'exons_plot_exon': exons_plot_exon } source_gene_plot = ColumnDataSource(data_gene_plot) if len(lines) < 3: plot_h_pix = 250 else: plot_h_pix = 250 + (len(lines) - 2) * 50 gene_plot = figure( x_range=xr, y_range=yr2, border_fill_color='white', title="", min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False, plot_width=900, plot_height=plot_h_pix, tools="hover,tap,xpan,box_zoom,undo,redo,reset,previewsave", logo=None) gene_plot.segment(genes_plot_start, genes_plot_yn, genes_plot_end, genes_plot_yn, color="black", alpha=1, line_width=2) gene_plot.rect(x='exons_plot_x', y='exons_plot_yn', width='exons_plot_w', height='exons_plot_h', source=source_gene_plot, fill_color="grey", line_color="grey") gene_plot.xaxis.axis_label = "Chromosome " + snp_coord[ 'chromosome'] + " Coordinate (Mb)(" + genome_build_vars[ genome_build]['title'] + ")" gene_plot.yaxis.axis_label = "Genes (All Transcripts)" gene_plot.ygrid.grid_line_color = None gene_plot.yaxis.axis_line_color = None gene_plot.yaxis.minor_tick_line_color = None gene_plot.yaxis.major_tick_line_color = None gene_plot.yaxis.major_label_text_color = None hover = gene_plot.select(dict(type=HoverTool)) hover.tooltips = OrderedDict([ ("Gene", "@exons_plot_name"), ("ID", "@exons_plot_id"), ("Exon", "@exons_plot_exon"), ]) gene_plot.text(genes_plot_start, genes_plot_yn, text=genes_plot_name, alpha=1, text_font_size="7pt", text_font_style="bold", text_baseline="middle", text_align="right", angle=0) gene_plot.toolbar_location = "below" # Gene Plot (Collapsed) else: genes_c_file = tmp_dir + "genes_c_" + request + ".json" genes_c_raw = open(genes_c_file).readlines() genes_c_plot_start = [] genes_c_plot_end = [] genes_c_plot_y = [] genes_c_plot_name = [] exons_c_plot_x = [] exons_c_plot_y = [] exons_c_plot_w = [] exons_c_plot_h = [] exons_c_plot_name = [] exons_c_plot_id = [] message_c = ["Too many genes to plot."] lines_c = [0] gap = 80000 tall = 0.75 if genes_c_raw != None and len(genes_c_raw) > 0: for gene_c_raw_obj in genes_c_raw: gene_c_obj = json.loads(gene_c_raw_obj) chrom = gene_c_obj["chrom"] txStart = gene_c_obj["txStart"] txEnd = gene_c_obj["txEnd"] exonStarts = gene_c_obj["exonStarts"] exonEnds = gene_c_obj["exonEnds"] name2 = gene_c_obj["name2"] transcripts = gene_c_obj["transcripts"] name = name2 e_start = exonStarts.split(",") e_end = exonEnds.split(",") e_transcripts = transcripts.split(",") # Determine Y Coordinate i = 0 y_coord = None while y_coord == None: if i > len(lines_c) - 1: y_coord = i + 1 lines_c.append(int(txEnd)) elif int(txStart) > (gap + lines_c[i]): y_coord = i + 1 lines_c[i] = int(txEnd) else: i += 1 genes_c_plot_start.append(int(txStart) / 1000000.0) genes_c_plot_end.append(int(txEnd) / 1000000.0) genes_c_plot_y.append(y_coord) genes_c_plot_name.append(name + " ") # for i in range(len(e_start)): for i in range(len(e_start) - 1): width = (int(e_end[i]) - int(e_start[i])) / 1000000.0 x_coord = int(e_start[i]) / 1000000.0 + (width / 2) exons_c_plot_x.append(x_coord) exons_c_plot_y.append(y_coord) exons_c_plot_w.append(width) exons_c_plot_h.append(tall) exons_c_plot_name.append(name) exons_c_plot_id.append(e_transcripts[i].replace("-", ",")) n_rows_c = len(lines_c) genes_c_plot_yn = [n_rows_c - x + 0.5 for x in genes_c_plot_y] exons_c_plot_yn = [n_rows_c - x + 0.5 for x in exons_c_plot_y] yr2_c = Range1d(start=0, end=n_rows_c) data_gene_c_plot = { 'exons_c_plot_x': exons_c_plot_x, 'exons_c_plot_yn': exons_c_plot_yn, 'exons_c_plot_w': exons_c_plot_w, 'exons_c_plot_h': exons_c_plot_h, 'exons_c_plot_name': exons_c_plot_name, 'exons_c_plot_id': exons_c_plot_id } source_gene_c_plot = ColumnDataSource(data_gene_c_plot) max_genes_c = 40 # if len(lines_c) < 3 or len(genes_c_raw) > max_genes_c: if len(lines_c) < 3: plot_h_pix = 250 else: plot_h_pix = 250 + (len(lines_c) - 2) * 50 gene_plot = figure( min_border_top=2, min_border_bottom=0, min_border_left=100, min_border_right=5, x_range=xr, y_range=yr2_c, border_fill_color='white', title="", h_symmetry=False, v_symmetry=False, logo=None, plot_width=900, plot_height=plot_h_pix, tools= "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave") # if len(genes_c_raw) <= max_genes_c: gene_plot.segment(genes_c_plot_start, genes_c_plot_yn, genes_c_plot_end, genes_c_plot_yn, color="black", alpha=1, line_width=2) gene_plot.rect(x='exons_c_plot_x', y='exons_c_plot_yn', width='exons_c_plot_w', height='exons_c_plot_h', source=source_gene_c_plot, fill_color="grey", line_color="grey") gene_plot.text(genes_c_plot_start, genes_c_plot_yn, text=genes_c_plot_name, alpha=1, text_font_size="7pt", text_font_style="bold", text_baseline="middle", text_align="right", angle=0) hover = gene_plot.select(dict(type=HoverTool)) hover.tooltips = OrderedDict([ ("Gene", "@exons_c_plot_name"), ("Transcript IDs", "@exons_c_plot_id"), ]) # else: # x_coord_text = coord1/1000000.0 + (coord2/1000000.0 - coord1/1000000.0) / 2.0 # gene_c_plot.text(x_coord_text, n_rows_c / 2.0, text=message_c, alpha=1, # text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0) gene_plot.xaxis.axis_label = "Chromosome " + snp_coord[ 'chromosome'] + " Coordinate (Mb)(" + genome_build_vars[ genome_build]['title'] + ")" gene_plot.yaxis.axis_label = "Genes (Transcripts Collapsed)" gene_plot.ygrid.grid_line_color = None gene_plot.yaxis.axis_line_color = None gene_plot.yaxis.minor_tick_line_color = None gene_plot.yaxis.major_tick_line_color = None gene_plot.yaxis.major_label_text_color = None gene_plot.toolbar_location = "below" # Change output backend to SVG temporarily for headless export # Will be changed back to canvas in LDlink.js proxy_plot.output_backend = "svg" rug.output_backend = "svg" gene_plot.output_backend = "svg" export_svgs(proxy_plot, filename=tmp_dir + "proxy_plot_1_" + request + ".svg") export_svgs(gene_plot, filename=tmp_dir + "gene_plot_1_" + request + ".svg") # 1 pixel = 0.0264583333 cm svg_height = str(20.00 + (0.0264583333 * plot_h_pix)) + "cm" svg_height_scaled = str(100.00 + (0.1322916665 * plot_h_pix)) + "cm" # Concatenate svgs sg.Figure("24.59cm", svg_height, sg.SVG(tmp_dir + "proxy_plot_1_" + request + ".svg"), sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").move( 0, 630)).save(tmp_dir + "proxy_plot_" + request + ".svg") sg.Figure( "122.95cm", svg_height_scaled, sg.SVG(tmp_dir + "proxy_plot_1_" + request + ".svg").scale(5), sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(5).move( 0, 3150)).save(tmp_dir + "proxy_plot_scaled_" + request + ".svg") # Export to PDF subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "proxy_plot_" + request + ".svg " + tmp_dir + "proxy_plot_" + request + ".pdf", shell=True) # Export to PNG subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "proxy_plot_scaled_" + request + ".svg " + tmp_dir + "proxy_plot_" + request + ".png", shell=True) # Export to JPEG subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "proxy_plot_scaled_" + request + ".svg " + tmp_dir + "proxy_plot_" + request + ".jpeg", shell=True) # Remove individual SVG files after they are combined subprocess.call("rm " + tmp_dir + "proxy_plot_1_" + request + ".svg", shell=True) subprocess.call("rm " + tmp_dir + "gene_plot_1_" + request + ".svg", shell=True) # Remove scaled SVG file after it is converted to png and jpeg subprocess.call("rm " + tmp_dir + "proxy_plot_scaled_" + request + ".svg", shell=True) reset_output() # Remove temporary files subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True) subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True) subprocess.call("rm " + tmp_dir + "genes_*" + request + "*.json", shell=True) subprocess.call("rm " + tmp_dir + "recomb_" + request + ".txt", shell=True) # Return plot output return None
def calculate_pair(snp_pairs, pop, web, genome_build, request): # Set data directories using config.yml with open('config.yml', 'r') as yml_file: config = yaml.load(yml_file) env = config['env'] connect_external = config['database']['connect_external'] api_mongo_addr = config['database']['api_mongo_addr'] dbsnp_version = config['data']['dbsnp_version'] population_samples_dir = config['data']['population_samples_dir'] data_dir = config['data']['data_dir'] tmp_dir = config['data']['tmp_dir'] genotypes_dir = config['data']['genotypes_dir'] aws_info = config['aws'] mongo_username = config['database']['mongo_user_readonly'] mongo_password = config['database']['mongo_password'] mongo_port = config['database']['mongo_port'] export_s3_keys = retrieveAWSCredentials() # Ensure tmp directory exists if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) # Create JSON output output_list = [] snp_pair_limit = 10 # Throw max SNP pairs error message if len(snp_pairs) > snp_pair_limit: error_out = [{ "error": "Maximum SNP pair list is " + str(snp_pair_limit) + " pairs. Your list contains " + str(len(snp_pairs)) + " pairs." }] return(json.dumps(error_out, sort_keys=True, indent=2)) # Validate genome build param # print("genome_build " + genome_build) if genome_build not in genome_build_vars['vars']: error_out = [{ "error": "Invalid genome build. Please specify either " + ", ".join(genome_build_vars['vars']) + "." }] return(json.dumps(error_out, sort_keys=True, indent=2)) # Select desired ancestral populations pops = pop.split("+") pop_dirs = [] for pop_i in pops: if pop_i in ["ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU", "TSI", "YRI"]: pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt") else: error_out = [{ "error": pop_i + " is not an ancestral population. Choose one of the following ancestral populations: AFR, AMR, EAS, EUR, or SAS; or one of the following sub-populations: ACB, ASW, BEB, CDX, CEU, CHB, CHS, CLM, ESN, FIN, GBR, GIH, GWD, IBS, ITU, JPT, KHV, LWK, MSL, MXL, PEL, PJL, PUR, STU, TSI, or YRI." }] return(json.dumps(error_out, sort_keys=True, indent=2)) get_pops = "cat " + " ".join(pop_dirs) pop_list = [x.decode('utf-8') for x in subprocess.Popen(get_pops, shell=True, stdout=subprocess.PIPE).stdout.readlines()] ids = [i.strip() for i in pop_list] pop_ids = list(set(ids)) # Connect to Mongo snp database if env == 'local' or connect_external: mongo_host = api_mongo_addr else: mongo_host = 'localhost' if web: client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port) else: if env == 'local' or connect_external: client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port) else: client = MongoClient('localhost', mongo_port) db = client["LDLink"] def get_coords(db, rsid): rsid = rsid.strip("rs") query_results = db.dbsnp.find_one({"id": rsid}) query_results_sanitized = json.loads(json_util.dumps(query_results)) return query_results_sanitized # Replace input genomic coordinates with variant ids (rsids) def replace_coord_rsid(db, snp): if snp[0:2] == "rs": return snp else: snp_info_lst = get_rsnum(db, snp, genome_build) print("snp_info_lst") print(snp_info_lst) if snp_info_lst != None: if len(snp_info_lst) > 1: var_id = "rs" + snp_info_lst[0]['id'] ref_variants = [] for snp_info in snp_info_lst: if snp_info['id'] == snp_info['ref_id']: ref_variants.append(snp_info['id']) if len(ref_variants) > 1: var_id = "rs" + ref_variants[0] if "warning" in output: output["warning"] = output["warning"] + \ "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp + ". " else: output["warning"] = "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp + ". " elif len(ref_variants) == 0 and len(snp_info_lst) > 1: var_id = "rs" + snp_info_lst[0]['id'] if "warning" in output: output["warning"] = output["warning"] + \ "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp + ". " else: output["warning"] = "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp + ". " else: var_id = "rs" + ref_variants[0] return var_id elif len(snp_info_lst) == 1: var_id = "rs" + snp_info_lst[0]['id'] return var_id else: return snp else: return snp return snp if len(snp_pairs) < 1: output = {} output["error"] = "Missing at least 1 SNP pair input. " + str(output["warning"] if "warning" in output else "") output_list.append(output) for pair in snp_pairs: output = {} output["pair"] = pair if len(pair) < 2 or len(pair) > 2 or len(pair[0]) < 3 or len(pair[1]) < 3: output["error"] = "Missing or additional SNPs in pair. " + str(output["warning"] if "warning" in output else "") output_list.append(output) continue # trim any whitespace snp1 = pair[0].lower().strip() snp2 = pair[1].lower().strip() # Find RS numbers in snp database # SNP1 if re.compile(r'rs\d+', re.IGNORECASE).match(snp1) is None and re.compile(r'chr\d+:\d+', re.IGNORECASE).match(snp1) is None and re.compile(r'chr[X|Y]:\d+', re.IGNORECASE).match(snp1) is None: output["error"] = snp1 + " is not a valid SNP. " + str(output["warning"] if "warning" in output else "") output_list.append(output) continue snp1 = replace_coord_rsid(db, snp1) snp1_coord = get_coords(db, snp1) if snp1_coord == None or snp1_coord[genome_build_vars[genome_build]['position']] == "NA": output["error"] = snp1 + " is not in dbSNP build " + dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + "). " + str(output["warning"] if "warning" in output else "") output_list.append(output) continue # SNP2 if re.compile(r'rs\d+', re.IGNORECASE).match(snp2) is None and re.compile(r'chr\d+:\d+', re.IGNORECASE).match(snp2) is None and re.compile(r'chr[X|Y]:\d+', re.IGNORECASE).match(snp2) is None: output["error"] = snp1 + " is not a valid SNP. " + str(output["warning"] if "warning" in output else "") output_list.append(output) continue snp2 = replace_coord_rsid(db, snp2) snp2_coord = get_coords(db, snp2) if snp2_coord == None or snp2_coord[genome_build_vars[genome_build]['position']] == "NA": output["error"] = snp2 + " is not in dbSNP build " + dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + "). " + str(output["warning"] if "warning" in output else "") output_list.append(output) continue # Check if SNPs are on the same chromosome if snp1_coord['chromosome'] != snp2_coord['chromosome']: if "warning" in output: output["warning"] = output["warning"] + snp1 + " and " + snp2 + " are on different chromosomes. " else: output["warning"] = snp1 + " and " + snp2 + " are on different chromosomes. " # Check if input SNPs are on chromosome Y while genome build == grch38 # SNP1 if snp1_coord['chromosome'] == "Y" and (genome_build == "grch38" or genome_build == "grch38_high_coverage"): output["error"] = "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp1_coord['id'] + " - chr" + snp1_coord['chromosome'] + ":" + snp1_coord[genome_build_vars[genome_build]['position']] + "). " + str(output["warning"] if "warning" in output else "") output_list.append(output) continue # SNP2 if snp2_coord['chromosome'] == "Y" and (genome_build == "grch38" or genome_build == "grch38_high_coverage"): output["error"] = "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp2_coord['id'] + " - chr" + snp2_coord['chromosome'] + ":" + snp2_coord[genome_build_vars[genome_build]['position']] + "). " + str(output["warning"] if "warning" in output else "") output_list.append(output) continue # Extract 1000 Genomes phased genotypes # SNP1 vcf_filePath1 = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]['1000G_dir'], genome_build_vars[genome_build]['1000G_file'] % snp1_coord['chromosome']) vcf_file1 = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath1) checkS3File(aws_info, config['aws']['bucket'], vcf_filePath1) tabix_snp1_offset = export_s3_keys + " cd {3}; tabix -D {0} {1}:{2}-{2} | grep -v -e END".format( vcf_file1, genome_build_vars[genome_build]['1000G_chr_prefix'] + snp1_coord['chromosome'], snp1_coord[genome_build_vars[genome_build]['position']], data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) vcf1_offset = [x.decode('utf-8') for x in subprocess.Popen(tabix_snp1_offset, shell=True, stdout=subprocess.PIPE).stdout.readlines()] # SNP2 vcf_filePath2 = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]['1000G_dir'], genome_build_vars[genome_build]['1000G_file'] % snp2_coord['chromosome']) vcf_file2 = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath2) checkS3File(aws_info, config['aws']['bucket'], vcf_filePath2) tabix_snp2_offset = export_s3_keys + " cd {3}; tabix -D {0} {1}:{2}-{2} | grep -v -e END".format( vcf_file2, genome_build_vars[genome_build]['1000G_chr_prefix'] + snp2_coord['chromosome'], snp2_coord[genome_build_vars[genome_build]['position']], data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) vcf2_offset = [x.decode('utf-8') for x in subprocess.Popen(tabix_snp2_offset, shell=True, stdout=subprocess.PIPE).stdout.readlines()] vcf1_pos = snp1_coord[genome_build_vars[genome_build]['position']] vcf2_pos = snp2_coord[genome_build_vars[genome_build]['position']] vcf1 = vcf1_offset vcf2 = vcf2_offset # Import SNP VCF files # SNP1 if len(vcf1) == 0: output["error"] = snp1 + " is not in 1000G reference panel. " + str(output["warning"] if "warning" in output else "") output_list.append(output) continue elif len(vcf1) > 1: geno1 = [] for i in range(len(vcf1)): geno1 = vcf1[i].strip().split() geno1[0] = geno1[0].lstrip('chr') if not (geno1[0] == snp1_coord['chromosome'] and geno1[1] == snp1_coord[genome_build_vars[genome_build]['position']]): geno1 = [] if geno1 == []: output["error"] = snp1 + " is not in 1000G reference panel. " + str(output["warning"] if "warning" in output else "") output_list.append(output) continue else: geno1 = vcf1[0].strip().split() geno1[0] = geno1[0].lstrip('chr') if geno1[2] != snp1 and snp1[0:2] == "rs" and "rs" in geno1[2]: if "warning" in output: output["warning"] = output["warning"] + \ "Genomic position for query variant1 (" + snp1 + \ ") does not match RS number at 1000G position (chr" + \ geno1[0]+":"+geno1[1]+" = "+geno1[2]+"). " else: output["warning"] = "Genomic position for query variant1 (" + snp1 + \ ") does not match RS number at 1000G position (chr" + \ geno1[0]+":"+geno1[1]+" = "+geno1[2]+"). " snp1 = geno1[2] if "," in geno1[3] or "," in geno1[4]: output["error"] = snp1 + " is not a biallelic variant. " + str(output["warning"] if "warning" in output else "") output_list.append(output) continue if len(geno1[3]) == 1 and len(geno1[4]) == 1: snp1_a1 = geno1[3] snp1_a2 = geno1[4] elif len(geno1[3]) == 1 and len(geno1[4]) > 1: snp1_a1 = "-" snp1_a2 = geno1[4][1:] elif len(geno1[3]) > 1 and len(geno1[4]) == 1: snp1_a1 = geno1[3][1:] snp1_a2 = "-" elif len(geno1[3]) > 1 and len(geno1[4]) > 1: snp1_a1 = geno1[3][1:] snp1_a2 = geno1[4][1:] allele1 = {"0|0": [snp1_a1, snp1_a1], "0|1": [snp1_a1, snp1_a2], "1|0": [snp1_a2, snp1_a1], "1|1": [ snp1_a2, snp1_a2], "0": [snp1_a1, "."], "1": [snp1_a2, "."], "./.": [".", "."], ".": [".", "."]} # SNP2 if len(vcf2) == 0: output["error"] = snp2 + " is not in 1000G reference panel. " + str(output["warning"] if "warning" in output else "") output_list.append(output) continue elif len(vcf2) > 1: geno2 = [] for i in range(len(vcf2)): geno2 = vcf2[i].strip().split() geno2[0] = geno2[0].lstrip('chr') if not (geno2[0] == snp2_coord['chromosome'] and geno2[1] == snp2_coord[genome_build_vars[genome_build]['position']]): geno2 = [] if geno2 == []: output["error"] = snp2 + " is not in 1000G reference panel. " + str(output["warning"] if "warning" in output else "") output_list.append(output) continue else: geno2 = vcf2[0].strip().split() geno2[0] = geno2[0].lstrip('chr') if geno2[2] != snp2 and snp2[0:2] == "rs" and "rs" in geno2[2]: if "warning" in output: output["warning"] = output["warning"] + \ "Genomic position for query variant2 (" + snp2 + \ ") does not match RS number at 1000G position (chr" + \ geno2[0]+":"+geno2[1]+" = "+geno2[2]+"). " else: output["warning"] = "Genomic position for query variant2 (" + snp2 + \ ") does not match RS number at 1000G position (chr" + \ geno2[0]+":"+geno2[1]+" = "+geno2[2]+"). " snp2 = geno2[2] if "," in geno2[3] or "," in geno2[4]: output["error"] = snp2 + " is not a biallelic variant. " + str(output["warning"] if "warning" in output else "") output_list.append(output) continue if len(geno2[3]) == 1 and len(geno2[4]) == 1: snp2_a1 = geno2[3] snp2_a2 = geno2[4] elif len(geno2[3]) == 1 and len(geno2[4]) > 1: snp2_a1 = "-" snp2_a2 = geno2[4][1:] elif len(geno2[3]) > 1 and len(geno2[4]) == 1: snp2_a1 = geno2[3][1:] snp2_a2 = "-" elif len(geno2[3]) > 1 and len(geno2[4]) > 1: snp2_a1 = geno2[3][1:] snp2_a2 = geno2[4][1:] allele2 = {"0|0": [snp2_a1, snp2_a1], "0|1": [snp2_a1, snp2_a2], "1|0": [snp2_a2, snp2_a1], "1|1": [ snp2_a2, snp2_a2], "0": [snp2_a1, "."], "1": [snp2_a2, "."], "./.": [".", "."], ".": [".", "."]} if geno1[1] != vcf1_pos: if "warning" in output: output["warning"] = output["warning"] + "VCF File does not match variant coordinates for SNP1. " else: output["warning"] = "VCF File does not match variant coordinates for SNP1. " output_list.append(output) geno1[1] = vcf1_pos if geno2[1] != vcf2_pos: if "warning" in output: output["warning"] = output["warning"] + "VCF File does not match variant coordinates for SNP2. " else: output["warning"] = "VCF File does not match variant coordinates for SNP2. " output_list.append(output) geno2[1] = vcf2_pos # Get headers tabix_snp1_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(vcf_file1, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) head1 = [x.decode('utf-8') for x in subprocess.Popen(tabix_snp1_h, shell=True, stdout=subprocess.PIPE).stdout.readlines()][0].strip().split() tabix_snp2_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(vcf_file2, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) head2 = [x.decode('utf-8') for x in subprocess.Popen(tabix_snp2_h, shell=True, stdout=subprocess.PIPE).stdout.readlines()][0].strip().split() # Combine phased genotypes geno = {} for i in range(9, len(head1)): geno[head1[i]] = [allele1[geno1[i]], ".."] for i in range(9, len(head2)): if head2[i] in geno: geno[head2[i]][1] = allele2[geno2[i]] # Extract haplotypes hap = {} for ind in pop_ids: if ind in geno: hap1 = geno[ind][0][0] + "_" + geno[ind][1][0] hap2 = geno[ind][0][1] + "_" + geno[ind][1][1] if hap1 in hap: hap[hap1] += 1 else: hap[hap1] = 1 if hap2 in hap: hap[hap2] += 1 else: hap[hap2] = 1 # Remove missing haplotypes keys = list(hap.keys()) for key in keys: if "." in key: hap.pop(key, None) # Check all haplotypes are present if len(hap) != 4: snp1_a = [snp1_a1, snp1_a2] snp2_a = [snp2_a1, snp2_a2] haps = [snp1_a[0] + "_" + snp2_a[0], snp1_a[0] + "_" + snp2_a[1], snp1_a[1] + "_" + snp2_a[0], snp1_a[1] + "_" + snp2_a[1]] for i in haps: if i not in hap: hap[i] = 0 # Sort haplotypes A = hap[sorted(hap)[0]] B = hap[sorted(hap)[1]] C = hap[sorted(hap)[2]] D = hap[sorted(hap)[3]] N = A + B + C + D tmax = max(A, B, C, D) hap1 = sorted(hap, key=hap.get, reverse=True)[0] hap2 = sorted(hap, key=hap.get, reverse=True)[1] hap3 = sorted(hap, key=hap.get, reverse=True)[2] hap4 = sorted(hap, key=hap.get, reverse=True)[3] delta = float(A * D - B * C) Ms = float((A + C) * (B + D) * (A + B) * (C + D)) if Ms != 0: # D prime if delta < 0: D_prime = abs(delta / min((A + C) * (A + B), (B + D) * (C + D))) else: D_prime = abs(delta / min((A + C) * (C + D), (A + B) * (B + D))) # R2 r2 = (delta**2) / Ms # P-value num = (A + B + C + D) * (A * D - B * C)**2 denom = Ms chisq = num / denom p = 2 * (1 - (0.5 * (1 + math.erf(chisq**0.5 / 2**0.5)))) else: D_prime = "NA" r2 = "NA" chisq = "NA" p = "NA" # Find Correlated Alleles if str(r2) != "NA" and float(r2) > 0.1: Ac=hap[sorted(hap)[0]] Bc=hap[sorted(hap)[1]] Cc=hap[sorted(hap)[2]] Dc=hap[sorted(hap)[3]] if ((Ac*Dc) / max((Bc*Cc), 0.01) > 1): corr1 = snp1 + "(" + sorted(hap)[0].split("_")[0] + ") allele is correlated with " + snp2 + "(" + sorted(hap)[0].split("_")[1] + ") allele" corr2 = snp1 + "(" + sorted(hap)[3].split("_")[0] + ") allele is correlated with " + snp2 + "(" + sorted(hap)[3].split("_")[1] + ") allele" corr_alleles = [corr1, corr2] else: corr1 = snp1 + "(" + sorted(hap)[1].split("_")[0] + ") allele is correlated with " + snp2 + "(" + sorted(hap)[1].split("_")[1] + ") allele" corr2 = snp1 + "(" + sorted(hap)[2].split("_")[0] + ") allele is correlated with " + snp2 + "(" + sorted(hap)[2].split("_")[1] + ") allele" corr_alleles = [corr1, corr2] else: corr_alleles = [snp1 + " and " + snp2 + " are in linkage equilibrium"] # Create JSON output snp_1 = {} snp_1["rsnum"] = snp1 snp_1["coord"] = "chr" + snp1_coord['chromosome'] + ":" + \ vcf1_pos snp_1_allele_1 = {} snp_1_allele_1["allele"] = sorted(hap)[0].split("_")[0] snp_1_allele_1["count"] = str(A + B) snp_1_allele_1["frequency"] = str(round(float(A + B) / N, 3)) snp_1["allele_1"] = snp_1_allele_1 snp_1_allele_2 = {} snp_1_allele_2["allele"] = sorted(hap)[2].split("_")[0] snp_1_allele_2["count"] = str(C + D) snp_1_allele_2["frequency"] = str(round(float(C + D) / N, 3)) snp_1["allele_2"] = snp_1_allele_2 output["snp1"] = snp_1 snp_2 = {} snp_2["rsnum"] = snp2 snp_2["coord"] = "chr" + snp2_coord['chromosome'] + ":" + \ vcf2_pos snp_2_allele_1 = {} snp_2_allele_1["allele"] = sorted(hap)[0].split("_")[1] snp_2_allele_1["count"] = str(A + C) snp_2_allele_1["frequency"] = str(round(float(A + C) / N, 3)) snp_2["allele_1"] = snp_2_allele_1 snp_2_allele_2 = {} snp_2_allele_2["allele"] = sorted(hap)[1].split("_")[1] snp_2_allele_2["count"] = str(B + D) snp_2_allele_2["frequency"] = str(round(float(B + D) / N, 3)) snp_2["allele_2"] = snp_2_allele_2 output["snp2"] = snp_2 two_by_two = {} cells = {} cells["c11"] = str(A) cells["c12"] = str(B) cells["c21"] = str(C) cells["c22"] = str(D) two_by_two["cells"] = cells two_by_two["total"] = str(N) output["two_by_two"] = two_by_two haplotypes = {} hap_1 = {} hap_1["alleles"] = hap1 hap_1["count"] = str(hap[hap1]) hap_1["frequency"] = str(round(float(hap[hap1]) / N, 3)) haplotypes["hap1"] = hap_1 hap_2 = {} hap_2["alleles"] = hap2 hap_2["count"] = str(hap[hap2]) hap_2["frequency"] = str(round(float(hap[hap2]) / N, 3)) haplotypes["hap2"] = hap_2 hap_3 = {} hap_3["alleles"] = hap3 hap_3["count"] = str(hap[hap3]) hap_3["frequency"] = str(round(float(hap[hap3]) / N, 3)) haplotypes["hap3"] = hap_3 hap_4 = {} hap_4["alleles"] = hap4 hap_4["count"] = str(hap[hap4]) hap_4["frequency"] = str(round(float(hap[hap4]) / N, 3)) haplotypes["hap4"] = hap_4 output["haplotypes"] = haplotypes statistics = {} if Ms != 0: statistics["d_prime"] = str(round(D_prime, 4)) statistics["r2"] = str(round(r2, 4)) statistics["chisq"] = str(round(chisq, 4)) if p >= 0.0001: statistics["p"] = str(round(p, 4)) else: statistics["p"] = "<0.0001" else: statistics["d_prime"] = D_prime statistics["r2"] = r2 statistics["chisq"] = chisq statistics["p"] = p output["statistics"] = statistics output["corr_alleles"] = corr_alleles output["request"] = request output_list.append(output) ### OUTPUT ERROR IF ONLY SINGLE SNP PAIR ### if len(snp_pairs) == 1 and len(output_list) == 1 and "error" in output_list[0]: return(json.dumps(output_list, sort_keys=True, indent=2)) # Generate output file only for single SNP pair inputs if len(snp_pairs) == 1 and len(output_list) == 1: ldpair_out = open(tmp_dir + "LDpair_" + request + ".txt", "w") print("Query SNPs:", file=ldpair_out) print(output_list[0]["snp1"]["rsnum"] + \ " (" + output_list[0]["snp1"]["coord"] + ")", file=ldpair_out) print(output_list[0]["snp2"]["rsnum"] + \ " (" + output_list[0]["snp2"]["coord"] + ")", file=ldpair_out) print("", file=ldpair_out) print(pop + " Haplotypes:", file=ldpair_out) print(" " * 15 + output_list[0]["snp2"]["rsnum"], file=ldpair_out) print(" " * 15 + \ output_list[0]["snp2"]["allele_1"]["allele"] + " " * \ 7 + output_list[0]["snp2"]["allele_2"]["allele"], file=ldpair_out) print(" " * 13 + "-" * 17, file=ldpair_out) print(" " * 11 + output_list[0]["snp1"]["allele_1"]["allele"] + " | " + output_list[0]["two_by_two"]["cells"]["c11"] + " " * (5 - len(output["two_by_two"]["cells"]["c11"])) + " | " + output["two_by_two"]["cells"]["c12"] + " " * ( 5 - len(output_list[0]["two_by_two"]["cells"]["c12"])) + " | " + output_list[0]["snp1"]["allele_1"]["count"] + " " * (5 - len(output["snp1"]["allele_1"]["count"])) + " (" + output["snp1"]["allele_1"]["frequency"] + ")", file=ldpair_out) print(output_list[0]["snp1"]["rsnum"] + " " * \ (10 - len(output_list[0]["snp1"]["rsnum"])) + " " * 3 + "-" * 17, file=ldpair_out) print(" " * 11 + output_list[0]["snp1"]["allele_2"]["allele"] + " | " + output_list[0]["two_by_two"]["cells"]["c21"] + " " * (5 - len(output["two_by_two"]["cells"]["c21"])) + " | " + output["two_by_two"]["cells"]["c22"] + " " * ( 5 - len(output_list[0]["two_by_two"]["cells"]["c22"])) + " | " + output_list[0]["snp1"]["allele_2"]["count"] + " " * (5 - len(output["snp1"]["allele_2"]["count"])) + " (" + output["snp1"]["allele_2"]["frequency"] + ")", file=ldpair_out) print(" " * 13 + "-" * 17, file=ldpair_out) print(" " * 15 + output_list[0]["snp2"]["allele_1"]["count"] + " " * (5 - len(output_list[0]["snp2"]["allele_1"]["count"])) + " " * 3 + output["snp2"]["allele_2"]["count"] + " " * ( 5 - len(output_list[0]["snp2"]["allele_2"]["count"])) + " " * 3 + output_list[0]["two_by_two"]["total"], file=ldpair_out) print(" " * 14 + "(" + output_list[0]["snp2"]["allele_1"]["frequency"] + ")" + " " * (5 - len(output_list[0]["snp2"]["allele_1"]["frequency"])) + \ " (" + output_list[0]["snp2"]["allele_2"]["frequency"] + ")" + \ " " * (5 - len(output_list[0]["snp2"]["allele_2"]["frequency"])), file=ldpair_out) print("", file=ldpair_out) print(" " + output_list[0]["haplotypes"]["hap1"]["alleles"] + ": " + \ output_list[0]["haplotypes"]["hap1"]["count"] + \ " (" + output_list[0]["haplotypes"]["hap1"]["frequency"] + ")", file=ldpair_out) print(" " + output_list[0]["haplotypes"]["hap2"]["alleles"] + ": " + \ output_list[0]["haplotypes"]["hap2"]["count"] + \ " (" + output_list[0]["haplotypes"]["hap2"]["frequency"] + ")", file=ldpair_out) print(" " + output_list[0]["haplotypes"]["hap3"]["alleles"] + ": " + \ output_list[0]["haplotypes"]["hap3"]["count"] + \ " (" + output_list[0]["haplotypes"]["hap3"]["frequency"] + ")", file=ldpair_out) print(" " + output_list[0]["haplotypes"]["hap4"]["alleles"] + ": " + \ output["haplotypes"]["hap4"]["count"] + \ " (" + output["haplotypes"]["hap4"]["frequency"] + ")", file=ldpair_out) print("", file=ldpair_out) print(" D': " + output_list[0]["statistics"]["d_prime"], file=ldpair_out) print(" R2: " + output_list[0]["statistics"]["r2"], file=ldpair_out) print(" Chi-sq: " + output_list[0]["statistics"]["chisq"], file=ldpair_out) print(" p-value: " + output_list[0]["statistics"]["p"], file=ldpair_out) print("", file=ldpair_out) if len(output_list[0]["corr_alleles"]) == 2: print(output_list[0]["corr_alleles"][0], file=ldpair_out) print(output_list[0]["corr_alleles"][1], file=ldpair_out) else: print(output_list[0]["corr_alleles"][0], file=ldpair_out) try: output_list[0]["warning"] except KeyError: www = "do nothing" else: print("WARNING: " + output_list[0]["warning"] + "!", file=ldpair_out) ldpair_out.close() # Return output return(json.dumps(output_list, sort_keys=True, indent=2))
# Set data directories using config.yml with open('config.yml', 'r') as yml_file: config = yaml.load(yml_file) env = config['env'] api_mongo_addr = config['api']['api_mongo_addr'] population_samples_dir = config['data']['population_samples_dir'] data_dir = config['data']['data_dir'] tmp_dir = config['data']['tmp_dir'] genotypes_dir = config['data']['genotypes_dir'] # reg_dir = config['data']['reg_dir'] aws_info = config['aws'] mongo_username = config['database']['mongo_user_readonly'] mongo_password = config['database']['mongo_password'] mongo_port = config['database']['mongo_port'] export_s3_keys = retrieveAWSCredentials() # Get population ids pop_list = open(tmp_dir + "pops_" + request + ".txt").readlines() ids = [] for i in range(len(pop_list)): ids.append(pop_list[i].strip()) pop_ids = list(set(ids)) variantPairs = [] with open(tmp_dir + 'trait_ld_' + str(subprocess_id) + '_' + str(request) + '.txt') as snpPairsFile: lines = snpPairsFile.readlines() for line in lines:
def calculate_proxy(snp, pop, request, web, genome_build, r2_d="r2", window=500000, collapseTranscript=True): # trim any whitespace snp = snp.lower().strip() start_time = time.time() # Set data directories using config.yml with open('config.yml', 'r') as yml_file: config = yaml.load(yml_file) env = config['env'] api_mongo_addr = config['api']['api_mongo_addr'] dbsnp_version = config['data']['dbsnp_version'] data_dir = config['data']['data_dir'] tmp_dir = config['data']['tmp_dir'] population_samples_dir = config['data']['population_samples_dir'] genotypes_dir = config['data']['genotypes_dir'] aws_info = config['aws'] mongo_username = config['database']['mongo_user_readonly'] mongo_password = config['database']['mongo_password'] mongo_port = config['database']['mongo_port'] num_subprocesses = config['performance']['num_subprocesses'] export_s3_keys = retrieveAWSCredentials() # Ensure tmp directory exists if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) if request is False: request = str(time.strftime("%I%M%S")) # Create JSON output out_json = open(tmp_dir + 'proxy' + request + ".json", "w") output = {} # Validate genome build param if genome_build not in genome_build_vars['vars']: output["error"] = "Invalid genome build. Please specify either " + ", ".join(genome_build_vars['vars']) + "." json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() return("", "") if window < 0 or window > 1000000: output["error"] = "Window value must be a number between 0 and 1,000,000." json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() return("", "") # Connect to Mongo snp database if env == 'local': mongo_host = api_mongo_addr else: mongo_host = 'localhost' if web: client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port) else: if env == 'local': client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port) else: client = MongoClient('localhost', mongo_port) db = client["LDLink"] def get_coords(rsid): rsid = rsid.strip("rs") query_results = db.dbsnp.find_one({"id": rsid}) query_results_sanitized = json.loads(json_util.dumps(query_results)) return query_results_sanitized # Query genomic coordinates def get_rsnum(coord): temp_coord = coord.strip("chr").split(":") chro = temp_coord[0] pos = temp_coord[1] query_results = db.dbsnp.find({"chromosome": chro.upper() if chro == 'x' or chro == 'y' else str(chro), genome_build_vars[genome_build]['position']: str(pos)}) query_results_sanitized = json.loads(json_util.dumps(query_results)) return query_results_sanitized # Replace input genomic coordinates with variant ids (rsids) def replace_coord_rsid(snp): if snp[0:2] == "rs": return snp else: snp_info_lst = get_rsnum(snp) print("snp_info_lst") print(snp_info_lst) if snp_info_lst != None: if len(snp_info_lst) > 1: var_id = "rs" + snp_info_lst[0]['id'] ref_variants = [] for snp_info in snp_info_lst: if snp_info['id'] == snp_info['ref_id']: ref_variants.append(snp_info['id']) if len(ref_variants) > 1: var_id = "rs" + ref_variants[0] if "warning" in output: output["warning"] = output["warning"] + \ ". Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp else: output["warning"] = "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp elif len(ref_variants) == 0 and len(snp_info_lst) > 1: var_id = "rs" + snp_info_lst[0]['id'] if "warning" in output: output["warning"] = output["warning"] + \ ". Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp else: output["warning"] = "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp else: var_id = "rs" + ref_variants[0] return var_id elif len(snp_info_lst) == 1: var_id = "rs" + snp_info_lst[0]['id'] return var_id else: return snp else: return snp return snp snp = replace_coord_rsid(snp) # Find RS number in snp database snp_coord = get_coords(snp) if snp_coord == None or snp_coord[genome_build_vars[genome_build]['position']] == "NA": output["error"] = snp + " is not in dbSNP " + dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ")." json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() return("", "") # check if variant is on chrY for genome build = GRCh38 if snp_coord['chromosome'] == "Y" and (genome_build == "grch38" or genome_build == "grch38_high_coverage"): output["error"] = "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp_coord['id'] + " = chr" + snp_coord['chromosome'] + ":" + snp_coord[genome_build_vars[genome_build]['position']] + ")" json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() return("", "") # Select desired ancestral populations pops = pop.split("+") pop_dirs = [] for pop_i in pops: if pop_i in ["ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU", "TSI", "YRI"]: pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt") else: output["error"] = pop_i + " is not an ancestral population. Choose one of the following ancestral populations: AFR, AMR, EAS, EUR, or SAS; or one of the following sub-populations: ACB, ASW, BEB, CDX, CEU, CHB, CHS, CLM, ESN, FIN, GBR, GIH, GWD, IBS, ITU, JPT, KHV, LWK, MSL, MXL, PEL, PJL, PUR, STU, TSI, or YRI." json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() return("", "") get_pops = "cat " + " ".join(pop_dirs) + " > " + \ tmp_dir + "pops_" + request + ".txt" subprocess.call(get_pops, shell=True) # Get population ids pop_list = open(tmp_dir + "pops_" + request + ".txt").readlines() ids = [] for i in range(len(pop_list)): ids.append(pop_list[i].strip()) pop_ids = list(set(ids)) # Extract query SNP phased genotypes vcf_filePath = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]["1000G_dir"], genome_build_vars[genome_build]["1000G_file"] % (snp_coord['chromosome'])) vcf_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath) checkS3File(aws_info, config['aws']['bucket'], vcf_filePath) tabix_snp_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(vcf_file, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) head = [x.decode('utf-8') for x in subprocess.Popen(tabix_snp_h, shell=True, stdout=subprocess.PIPE).stdout.readlines()][0].strip().split() tabix_snp = export_s3_keys + " cd {4}; tabix -D {0} {1}:{2}-{2} | grep -v -e END > {3}".format( vcf_file, genome_build_vars[genome_build]['1000G_chr_prefix'] + snp_coord['chromosome'], snp_coord[genome_build_vars[genome_build]['position']], tmp_dir + "snp_no_dups_" + request + ".vcf", data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) subprocess.call(tabix_snp, shell=True) # Check SNP is in the 1000G population, has the correct RS number, and not # monoallelic vcf = open(tmp_dir + "snp_no_dups_" + request + ".vcf").readlines() if len(vcf) == 0: output["error"] = snp + " is not in 1000G reference panel." json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True) subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True) return("", "") elif len(vcf) > 1: geno = [] for i in range(len(vcf)): if vcf[i].strip().split()[2] == snp: geno = vcf[i].strip().split() geno[0] = geno[0].lstrip('chr') if geno == []: output["error"] = snp + " is not in 1000G reference panel." json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True) subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True) return("", "") else: geno = vcf[0].strip().split() geno[0] = geno[0].lstrip('chr') if geno[2] != snp and snp[0:2]=="rs" and "rs" in geno[2]: output["warning"] = "Genomic position for query variant (" + snp + \ ") does not match RS number at 1000G position (chr" + \ geno[0]+":"+geno[1]+" = "+geno[2]+")" snp = geno[2] if "," in geno[3] or "," in geno[4]: output["error"] = snp + " is not a biallelic variant." json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True) subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True) return("", "") index = [] for i in range(9, len(head)): if head[i] in pop_ids: index.append(i) genotypes = {"0": 0, "1": 0} for i in index: sub_geno = geno[i].split("|") for j in sub_geno: if j in genotypes: genotypes[j] += 1 else: genotypes[j] = 1 if genotypes["0"] == 0 or genotypes["1"] == 0: output["error"] = snp + \ " is monoallelic in the " + pop + " population." json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True) subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True) return("", "") # Define window of interest around query SNP # window = 500000 coord1 = int(snp_coord[genome_build_vars[genome_build]['position']]) - window if coord1 < 0: coord1 = 0 coord2 = int(snp_coord[genome_build_vars[genome_build]['position']]) + window print("") # Calculate proxy LD statistics in parallel # threads = 4 # block = (2 * window) // 4 # block = (2 * window) // num_subprocesses windowChunkRanges = chunkWindow(int(snp_coord[genome_build_vars[genome_build]['position']]), window, num_subprocesses) commands = [] for subprocess_id in range(num_subprocesses): getWindowVariantsArgs = " ".join([str(web), str(snp), str(snp_coord['chromosome']), str(windowChunkRanges[subprocess_id][0]), str(windowChunkRanges[subprocess_id][1]), str(request), genome_build, str(subprocess_id)]) commands.append("python3 LDproxy_sub.py " + getWindowVariantsArgs) processes = [subprocess.Popen( command, shell=True, stdout=subprocess.PIPE) for command in commands] # collect output in parallel def get_output(process): return process.communicate()[0].splitlines() if not hasattr(threading.current_thread(), "_children"): threading.current_thread()._children = weakref.WeakKeyDictionary() pool = Pool(len(processes)) out_raw = pool.map(get_output, processes) pool.close() pool.join() # Aggregate output out_prox = [] for i in range(len(out_raw)): for j in range(len(out_raw[i])): col = out_raw[i][j].decode('utf-8').strip().split("\t") col[6] = int(col[6]) col[7] = float(col[7]) col[8] = float(col[8]) col.append(abs(int(col[6]))) out_prox.append(col) # Sort output if r2_d not in ["r2", "d"]: if "warning" in output: output["warning"] = output["warning"] + ". " + r2_d + \ " is not an acceptable value for r2_d (r2 or d required). r2 is used by default" else: output["warning"] = r2_d + \ " is not an acceptable value for r2_d (r2 or d required). r2 is used by default" r2_d = "r2" out_dist_sort = sorted(out_prox, key=operator.itemgetter(14)) if r2_d == "r2": out_ld_sort = sorted( out_dist_sort, key=operator.itemgetter(8), reverse=True) else: out_ld_sort = sorted( out_dist_sort, key=operator.itemgetter(7), reverse=True) # Populate JSON and text output outfile = open(tmp_dir + "proxy" + request + ".txt", "w") header = ["RS_Number", "Coord", "Alleles", "MAF", "Distance", "Dprime", "R2", "Correlated_Alleles", "RegulomeDB", "Function"] print("\t".join(header), file=outfile) ucsc_track = {} ucsc_track["header"] = ["chr", "pos", "rsid", "stat"] query_snp = {} query_snp["RS"] = out_ld_sort[0][3] query_snp["Alleles"] = out_ld_sort[0][1] query_snp["Coord"] = out_ld_sort[0][2] query_snp["Dist"] = out_ld_sort[0][6] query_snp["Dprime"] = str(round(float(out_ld_sort[0][7]), 4)) query_snp["R2"] = str(round(float(out_ld_sort[0][8]), 4)) query_snp["Corr_Alleles"] = out_ld_sort[0][9] query_snp["RegulomeDB"] = out_ld_sort[0][10] query_snp["MAF"] = str(round(float(out_ld_sort[0][11]), 4)) query_snp["Function"] = out_ld_sort[0][13] output["query_snp"] = query_snp temp = [query_snp["RS"], query_snp["Coord"], query_snp["Alleles"], query_snp["MAF"], str(query_snp["Dist"]), str( query_snp["Dprime"]), str(query_snp["R2"]), query_snp["Corr_Alleles"], query_snp["RegulomeDB"], query_snp["Function"]] print("\t".join(temp), file=outfile) chr, pos = query_snp["Coord"].split(':') if r2_d == "r2": temp2 = [chr, pos, query_snp["RS"], query_snp["R2"]] else: temp2 = [chr, pos, query_snp["RS"], query_snp["Dprime"]] ucsc_track["query_snp"] = temp2 ucsc_track["0.8-1.0"] = [] ucsc_track["0.6-0.8"] = [] ucsc_track["0.4-0.6"] = [] ucsc_track["0.2-0.4"] = [] ucsc_track["0.0-0.2"] = [] proxies = {} rows = [] digits = len(str(len(out_ld_sort))) for i in range(1, len(out_ld_sort)): if float(out_ld_sort[i][8]) > 0.01 and out_ld_sort[i][3] != snp: proxy_info = {} row = [] proxy_info["RS"] = out_ld_sort[i][3] proxy_info["Alleles"] = out_ld_sort[i][4] proxy_info["Coord"] = out_ld_sort[i][5] proxy_info["Dist"] = out_ld_sort[i][6] proxy_info["Dprime"] = str(round(float(out_ld_sort[i][7]), 4)) proxy_info["R2"] = str(round(float(out_ld_sort[i][8]), 4)) proxy_info["Corr_Alleles"] = out_ld_sort[i][9] proxy_info["RegulomeDB"] = out_ld_sort[i][10] proxy_info["MAF"] = str(round(float(out_ld_sort[i][12]), 4)) proxy_info["Function"] = out_ld_sort[i][13] proxies["proxy_" + (digits - len(str(i))) * "0" + str(i)] = proxy_info chr, pos = proxy_info["Coord"].split(':') # Adding a row for the Data Table row.append(proxy_info["RS"]) row.append(chr) row.append(pos) row.append(proxy_info["Alleles"]) row.append(str(round(float(proxy_info["MAF"]), 4))) row.append(abs(proxy_info["Dist"])) row.append(str(round(float(proxy_info["Dprime"]), 4))) row.append(str(round(float(proxy_info["R2"]), 4))) row.append(proxy_info["Corr_Alleles"]) row.append(proxy_info["RegulomeDB"]) row.append("HaploReg link") row.append(proxy_info["Function"]) rows.append(row) temp = [proxy_info["RS"], proxy_info["Coord"], proxy_info["Alleles"], proxy_info["MAF"], str(proxy_info["Dist"]), str( proxy_info["Dprime"]), str(proxy_info["R2"]), proxy_info["Corr_Alleles"], proxy_info["RegulomeDB"], proxy_info["Function"]] print("\t".join(temp), file=outfile) chr, pos = proxy_info["Coord"].split(':') if r2_d == "r2": temp2 = [chr, pos, proxy_info["RS"], round(float(out_ld_sort[i][8]), 4)] else: temp2 = [chr, pos, proxy_info["RS"], round(float(out_ld_sort[i][7]), 4)] if 0.8 < temp2[3] <= 1.0: ucsc_track["0.8-1.0"].append(temp2) elif 0.6 < temp2[3] <= 0.8: ucsc_track["0.6-0.8"].append(temp2) elif 0.4 < temp2[3] <= 0.6: ucsc_track["0.4-0.6"].append(temp2) elif 0.2 < temp2[3] <= 0.4: ucsc_track["0.2-0.4"].append(temp2) else: ucsc_track["0.0-0.2"].append(temp2) track = open(tmp_dir + "track" + request + ".txt", "w") print("browser position chr" + \ str(snp_coord['chromosome']) + ":" + str(coord1) + "-" + str(coord2), file=track) print("", file=track) if r2_d == "r2": print("track type=bedGraph name=\"R2 Plot\" description=\"Plot of R2 values\" color=50,50,50 visibility=full alwaysZero=on graphType=bar maxHeightPixels=60", file=track) else: print("track type=bedGraph name=\"D Prime Plot\" description=\"Plot of D prime values\" color=50,50,50 visibility=full alwaysZero=on graphType=bar maxHeightPixels=60", file=track) print("\t".join( [str(ucsc_track["query_snp"][i]) for i in [0, 1, 1, 3]]), file=track) if len(ucsc_track["0.8-1.0"]) > 0: for var in ucsc_track["0.8-1.0"]: print("\t".join([str(var[i]) for i in [0, 1, 1, 3]]), file=track) if len(ucsc_track["0.6-0.8"]) > 0: for var in ucsc_track["0.6-0.8"]: print("\t".join([str(var[i]) for i in [0, 1, 1, 3]]), file=track) if len(ucsc_track["0.4-0.6"]) > 0: for var in ucsc_track["0.4-0.6"]: print("\t".join([str(var[i]) for i in [0, 1, 1, 3]]), file=track) if len(ucsc_track["0.2-0.4"]) > 0: for var in ucsc_track["0.2-0.4"]: print("\t".join([str(var[i]) for i in [0, 1, 1, 3]]), file=track) if len(ucsc_track["0.0-0.2"]) > 0: for var in ucsc_track["0.0-0.2"]: print("\t".join([str(var[i]) for i in [0, 1, 1, 3]]), file=track) print("", file=track) print("track type=bed name=\"" + snp + \ "\" description=\"Query Variant: " + snp + "\" color=108,108,255", file=track) print("\t".join([ucsc_track["query_snp"][i] for i in [0, 1, 1, 2]]), file=track) print("", file=track) if len(ucsc_track["0.8-1.0"]) > 0: if r2_d == "r2": print("track type=bed name=\"0.8<R2<=1.0\" description=\"Proxy Variants with 0.8<R2<=1.0\" color=198,129,0", file=track) else: print("track type=bed name=\"0.8<D'<=1.0\" description=\"Proxy Variants with 0.8<D'<=1.0\" color=198,129,0", file=track) for var in ucsc_track["0.8-1.0"]: print("\t".join([var[i] for i in [0, 1, 1, 2]]), file=track) print("", file=track) if len(ucsc_track["0.6-0.8"]) > 0: if r2_d == "r2": print("track type=bed name=\"0.6<R2<=0.8\" description=\"Proxy Variants with 0.6<R2<=0.8\" color=198,129,0", file=track) else: print("track type=bed name=\"0.6<D'<=0.8\" description=\"Proxy Variants with 0.6<D'<=0.8\" color=198,129,0", file=track) for var in ucsc_track["0.6-0.8"]: print("\t".join([var[i] for i in [0, 1, 1, 2]]), file=track) print("", file=track) if len(ucsc_track["0.4-0.6"]) > 0: if r2_d == "r2": print("track type=bed name=\"0.4<R2<=0.6\" description=\"Proxy Variants with 0.4<R2<=0.6\" color=198,129,0", file=track) else: print("track type=bed name=\"0.4<D'<=0.6\" description=\"Proxy Variants with 0.4<D'<=0.6\" color=198,129,0", file=track) for var in ucsc_track["0.4-0.6"]: print("\t".join([var[i] for i in [0, 1, 1, 2]]), file=track) print("", file=track) if len(ucsc_track["0.2-0.4"]) > 0: if r2_d == "r2": print("track type=bed name=\"0.2<R2<=0.4\" description=\"Proxy Variants with 0.2<R2<=0.4\" color=198,129,0", file=track) else: print("track type=bed name=\"0.2<D'<=0.4\" description=\"Proxy Variants with 0.2<D'<=0.4\" color=198,129,0", file=track) for var in ucsc_track["0.2-0.4"]: print("\t".join([var[i] for i in [0, 1, 1, 2]]), file=track) print("", file=track) if len(ucsc_track["0.0-0.2"]) > 0: if r2_d == "r2": print("track type=bed name=\"0.0<R2<=0.2\" description=\"Proxy Variants with 0.0<R2<=0.2\" color=198,129,0", file=track) else: print("track type=bed name=\"0.0<D'<=0.2\" description=\"Proxy Variants with 0.0<D'<=0.2\" color=198,129,0", file=track) for var in ucsc_track["0.0-0.2"]: print("\t".join([var[i] for i in [0, 1, 1, 2]]), file=track) print("", file=track) output["aaData"] = rows output["proxy_snps"] = proxies # Output JSON and text file json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() outfile.close() track.close() out_script = "" out_div = "" if web: # Organize scatter plot data q_rs = [] q_allele = [] q_coord = [] q_maf = [] p_rs = [] p_allele = [] p_coord = [] p_maf = [] dist = [] d_prime = [] d_prime_round = [] r2 = [] r2_round = [] corr_alleles = [] regdb = [] funct = [] color = [] size = [] for i in range(len(out_ld_sort)): q_rs_i, q_allele_i, q_coord_i, p_rs_i, p_allele_i, p_coord_i, dist_i, d_prime_i, r2_i, corr_alleles_i, regdb_i, q_maf_i, p_maf_i, funct_i, dist_abs = out_ld_sort[ i] if float(r2_i) > 0.01: q_rs.append(q_rs_i) q_allele.append(q_allele_i) q_coord.append(float(q_coord_i.split(":")[1]) / 1000000) q_maf.append(str(round(float(q_maf_i), 4))) if p_rs_i == ".": p_rs_i = p_coord_i p_rs.append(p_rs_i) p_allele.append(p_allele_i) p_coord.append(float(p_coord_i.split(":")[1]) / 1000000) p_maf.append(str(round(float(p_maf_i), 4))) dist.append(str(round(dist_i / 1000000.0, 4))) d_prime.append(float(d_prime_i)) d_prime_round.append(str(round(float(d_prime_i), 4))) r2.append(float(r2_i)) r2_round.append(str(round(float(r2_i), 4))) corr_alleles.append(corr_alleles_i) # Correct Missing Annotations if regdb_i == ".": regdb_i = "" regdb.append(regdb_i) if funct_i == ".": funct_i = "" if funct_i == "NA": funct_i = "none" funct.append(funct_i) # Set Color if i == 0: color_i = "blue" elif funct_i != "none" and funct_i != "": color_i = "red" else: color_i = "orange" color.append(color_i) # Set Size size_i = 9 + float(p_maf_i) * 14.0 size.append(size_i) # Begin Bokeh Plotting from collections import OrderedDict from bokeh.embed import components, file_html from bokeh.layouts import gridplot from bokeh.models import HoverTool, LinearAxis, Range1d from bokeh.plotting import ColumnDataSource, curdoc, figure, output_file, reset_output, save from bokeh.resources import CDN reset_output() # Proxy Plot x = p_coord if r2_d == "r2": y = r2 else: y = d_prime whitespace = 0.01 xr = Range1d(start=coord1 / 1000000.0 - whitespace, end=coord2 / 1000000.0 + whitespace) yr = Range1d(start=-0.03, end=1.03) sup_2 = "\u00B2" proxy_plot = figure( title="Proxies for " + snp + " in " + pop, min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False, plot_width=900, plot_height=600, x_range=xr, y_range=yr, tools="hover,tap,pan,box_zoom,box_select,undo,redo,reset,previewsave", logo=None, toolbar_location="above") proxy_plot.title.align = "center" # Add recombination rate recomb_file = tmp_dir + "recomb_" + request + ".json" recomb_json = getRecomb(db, recomb_file, snp_coord['chromosome'], coord1 - whitespace, coord2 + whitespace, genome_build) recomb_x = [] recomb_y = [] for recomb_obj in recomb_json: recomb_x.append(int(recomb_obj[genome_build_vars[genome_build]['position']]) / 1000000.0) recomb_y.append(float(recomb_obj['rate']) / 100.0) data = { 'x': x, 'y': y, 'qrs': q_rs, 'q_alle': q_allele, 'q_maf': q_maf, 'prs': p_rs, 'p_alle': p_allele, 'p_maf': p_maf, 'dist': dist, 'r': r2_round, 'd': d_prime_round, 'alleles': corr_alleles, 'regdb': regdb, 'funct': funct, 'size': size, 'color': color } source = ColumnDataSource(data) proxy_plot.line(recomb_x, recomb_y, line_width=1, color="black", alpha=0.5) proxy_plot.circle(x='x', y='y', size='size', color='color', alpha=0.5, source=source) hover = proxy_plot.select(dict(type=HoverTool)) hover.tooltips = OrderedDict([ ("Query Variant", "@qrs @q_alle"), ("Proxy Variant", "@prs @p_alle"), ("Distance (Mb)", "@dist"), ("MAF (Query,Proxy)", "@q_maf,@p_maf"), ("R" + sup_2, "@r"), ("D\'", "@d"), ("Correlated Alleles", "@alleles"), ("RegulomeDB", "@regdb"), ("Functional Class", "@funct"), ]) proxy_plot.text(x, y, text=regdb, alpha=1, text_font_size="7pt", text_baseline="middle", text_align="center", angle=0) if r2_d == "r2": proxy_plot.yaxis.axis_label = "R" + sup_2 else: proxy_plot.yaxis.axis_label = "D\'" proxy_plot.extra_y_ranges = {"y2_axis": Range1d(start=-3, end=103)} proxy_plot.add_layout(LinearAxis(y_range_name="y2_axis", axis_label="Combined Recombination Rate (cM/Mb)"), "right") # Rug Plot y2_ll = [-0.03] * len(x) y2_ul = [1.03] * len(x) yr_rug = Range1d(start=-0.03, end=1.03) data_rug = { 'x': x, 'y': y, 'y2_ll': y2_ll, 'y2_ul': y2_ul, 'qrs': q_rs, 'q_alle': q_allele, 'q_maf': q_maf, 'prs': p_rs, 'p_alle': p_allele, 'p_maf': p_maf, 'dist': dist, 'r': r2_round, 'd': d_prime_round, 'alleles': corr_alleles, 'regdb': regdb, 'funct': funct, 'size': size, 'color': color } source_rug = ColumnDataSource(data_rug) rug = figure( x_range=xr, y_range=yr_rug, border_fill_color='white', y_axis_type=None, title="", min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False, plot_width=900, plot_height=50, tools="xpan,tap", logo=None) rug.segment(x0='x', y0='y2_ll', x1='x', y1='y2_ul', source=source_rug, color='color', alpha=0.5, line_width=1) rug.toolbar_location = None if collapseTranscript == "false": # Gene Plot (All Transcripts) genes_file = tmp_dir + "genes_" + request + ".json" genes_json = getRefGene(db, genes_file, snp_coord['chromosome'], int(coord1), int(coord2), genome_build, False) genes_plot_start = [] genes_plot_end = [] genes_plot_y = [] genes_plot_name = [] exons_plot_x = [] exons_plot_y = [] exons_plot_w = [] exons_plot_h = [] exons_plot_name = [] exons_plot_id = [] exons_plot_exon = [] lines = [0] gap = 80000 tall = 0.75 if genes_json != None and len(genes_json) > 0: for gene_obj in genes_json: bin = gene_obj["bin"] name_id = gene_obj["name"] chrom = gene_obj["chrom"] strand = gene_obj["strand"] txStart = gene_obj["txStart"] txEnd = gene_obj["txEnd"] cdsStart = gene_obj["cdsStart"] cdsEnd = gene_obj["cdsEnd"] exonCount = gene_obj["exonCount"] exonStarts = gene_obj["exonStarts"] exonEnds = gene_obj["exonEnds"] score = gene_obj["score"] name2 = gene_obj["name2"] cdsStartStat = gene_obj["cdsStartStat"] cdsEndStat = gene_obj["cdsEndStat"] exonFrames = gene_obj["exonFrames"] name = name2 id = name_id e_start = exonStarts.split(",") e_end = exonEnds.split(",") # Determine Y Coordinate i = 0 y_coord = None while y_coord == None: if i > len(lines) - 1: y_coord = i + 1 lines.append(int(txEnd)) elif int(txStart) > (gap + lines[i]): y_coord = i + 1 lines[i] = int(txEnd) else: i += 1 genes_plot_start.append(int(txStart) / 1000000.0) genes_plot_end.append(int(txEnd) / 1000000.0) genes_plot_y.append(y_coord) genes_plot_name.append(name + " ") for i in range(len(e_start) - 1): if strand == "+": exon = i + 1 else: exon = len(e_start) - 1 - i width = (int(e_end[i]) - int(e_start[i])) / 1000000.0 x_coord = int(e_start[i]) / 1000000.0 + (width / 2) exons_plot_x.append(x_coord) exons_plot_y.append(y_coord) exons_plot_w.append(width) exons_plot_h.append(tall) exons_plot_name.append(name) exons_plot_id.append(id) exons_plot_exon.append(exon) n_rows = len(lines) genes_plot_yn = [n_rows - x + 0.5 for x in genes_plot_y] exons_plot_yn = [n_rows - x + 0.5 for x in exons_plot_y] yr2 = Range1d(start=0, end=n_rows) data_gene_plot = { 'exons_plot_x': exons_plot_x, 'exons_plot_yn': exons_plot_yn, 'exons_plot_w': exons_plot_w, 'exons_plot_h': exons_plot_h, 'exons_plot_name': exons_plot_name, 'exons_plot_id': exons_plot_id, 'exons_plot_exon': exons_plot_exon } source_gene_plot = ColumnDataSource(data_gene_plot) if len(lines) < 3: plot_h_pix = 250 else: plot_h_pix = 250 + (len(lines) - 2) * 50 gene_plot = figure( x_range=xr, y_range=yr2, border_fill_color='white', title="", min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False, plot_width=900, plot_height=plot_h_pix, tools="hover,tap,xpan,box_zoom,undo,redo,reset,previewsave", logo=None) gene_plot.segment(genes_plot_start, genes_plot_yn, genes_plot_end, genes_plot_yn, color="black", alpha=1, line_width=2) gene_plot.rect(x='exons_plot_x', y='exons_plot_yn', width='exons_plot_w', height='exons_plot_h', source=source_gene_plot, fill_color="grey", line_color="grey") gene_plot.xaxis.axis_label = "Chromosome " + snp_coord['chromosome'] + " Coordinate (Mb)(" + genome_build_vars[genome_build]['title'] + ")" gene_plot.yaxis.axis_label = "Genes (All Transcripts)" gene_plot.ygrid.grid_line_color = None gene_plot.yaxis.axis_line_color = None gene_plot.yaxis.minor_tick_line_color = None gene_plot.yaxis.major_tick_line_color = None gene_plot.yaxis.major_label_text_color = None hover = gene_plot.select(dict(type=HoverTool)) hover.tooltips = OrderedDict([ ("Gene", "@exons_plot_name"), ("ID", "@exons_plot_id"), ("Exon", "@exons_plot_exon"), ]) gene_plot.text(genes_plot_start, genes_plot_yn, text=genes_plot_name, alpha=1, text_font_size="7pt", text_font_style="bold", text_baseline="middle", text_align="right", angle=0) gene_plot.toolbar_location = "below" # Combine plots into a grid out_grid = gridplot(proxy_plot, rug, gene_plot, ncols=1, toolbar_options=dict(logo=None)) # Gene Plot (Collapsed) else: genes_c_file = tmp_dir + "genes_c_" + request + ".json" genes_c_json = getRefGene(db, genes_c_file, snp_coord['chromosome'], int(coord1), int(coord2), genome_build, True) genes_c_plot_start=[] genes_c_plot_end=[] genes_c_plot_y=[] genes_c_plot_name=[] exons_c_plot_x=[] exons_c_plot_y=[] exons_c_plot_w=[] exons_c_plot_h=[] exons_c_plot_name=[] exons_c_plot_id=[] message_c = ["Too many genes to plot."] lines_c=[0] gap=80000 tall=0.75 if genes_c_json != None and len(genes_c_json) > 0: for gene_c_obj in genes_c_json: chrom = gene_c_obj["chrom"] txStart = gene_c_obj["txStart"] txEnd = gene_c_obj["txEnd"] exonStarts = gene_c_obj["exonStarts"] exonEnds = gene_c_obj["exonEnds"] name2 = gene_c_obj["name2"] transcripts = gene_c_obj["transcripts"] name = name2 e_start = exonStarts.split(",") e_end = exonEnds.split(",") e_transcripts=transcripts.split(",") # Determine Y Coordinate i=0 y_coord=None while y_coord==None: if i>len(lines_c)-1: y_coord=i+1 lines_c.append(int(txEnd)) elif int(txStart)>(gap+lines_c[i]): y_coord=i+1 lines_c[i]=int(txEnd) else: i+=1 genes_c_plot_start.append(int(txStart)/1000000.0) genes_c_plot_end.append(int(txEnd)/1000000.0) genes_c_plot_y.append(y_coord) genes_c_plot_name.append(name+" ") # for i in range(len(e_start)): for i in range(len(e_start)-1): width=(int(e_end[i])-int(e_start[i]))/1000000.0 x_coord=int(e_start[i])/1000000.0+(width/2) exons_c_plot_x.append(x_coord) exons_c_plot_y.append(y_coord) exons_c_plot_w.append(width) exons_c_plot_h.append(tall) exons_c_plot_name.append(name) exons_c_plot_id.append(e_transcripts[i].replace("-",",")) n_rows_c=len(lines_c) genes_c_plot_yn=[n_rows_c-x+0.5 for x in genes_c_plot_y] exons_c_plot_yn=[n_rows_c-x+0.5 for x in exons_c_plot_y] yr2_c=Range1d(start=0, end=n_rows_c) data_gene_c_plot = {'exons_c_plot_x': exons_c_plot_x, 'exons_c_plot_yn': exons_c_plot_yn, 'exons_c_plot_w': exons_c_plot_w, 'exons_c_plot_h': exons_c_plot_h, 'exons_c_plot_name': exons_c_plot_name, 'exons_c_plot_id': exons_c_plot_id} source_gene_c_plot=ColumnDataSource(data_gene_c_plot) max_genes_c = 40 # if len(lines_c) < 3 or len(genes_c_raw) > max_genes_c: if len(lines_c) < 3: plot_c_h_pix = 250 else: plot_c_h_pix = 250 + (len(lines_c) - 2) * 50 gene_c_plot = figure(min_border_top=2, min_border_bottom=0, min_border_left=100, min_border_right=5, x_range=xr, y_range=yr2_c, border_fill_color='white', title="", h_symmetry=False, v_symmetry=False, logo=None, plot_width=900, plot_height=plot_c_h_pix, tools="hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave") # if len(genes_c_raw) <= max_genes_c: gene_c_plot.segment(genes_c_plot_start, genes_c_plot_yn, genes_c_plot_end, genes_c_plot_yn, color="black", alpha=1, line_width=2) gene_c_plot.rect(x='exons_c_plot_x', y='exons_c_plot_yn', width='exons_c_plot_w', height='exons_c_plot_h', source=source_gene_c_plot, fill_color="grey", line_color="grey") gene_c_plot.text(genes_c_plot_start, genes_c_plot_yn, text=genes_c_plot_name, alpha=1, text_font_size="7pt", text_font_style="bold", text_baseline="middle", text_align="right", angle=0) hover = gene_c_plot.select(dict(type=HoverTool)) hover.tooltips = OrderedDict([ ("Gene", "@exons_c_plot_name"), ("Transcript IDs", "@exons_c_plot_id"), ]) # else: # x_coord_text = coord1/1000000.0 + (coord2/1000000.0 - coord1/1000000.0) / 2.0 # gene_c_plot.text(x_coord_text, n_rows_c / 2.0, text=message_c, alpha=1, # text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0) gene_c_plot.xaxis.axis_label = "Chromosome " + snp_coord['chromosome'] + " Coordinate (Mb)(" + genome_build_vars[genome_build]['title'] + ")" gene_c_plot.yaxis.axis_label = "Genes (Transcripts Collapsed)" gene_c_plot.ygrid.grid_line_color = None gene_c_plot.yaxis.axis_line_color = None gene_c_plot.yaxis.minor_tick_line_color = None gene_c_plot.yaxis.major_tick_line_color = None gene_c_plot.yaxis.major_label_text_color = None gene_c_plot.toolbar_location = "below" out_grid = gridplot(proxy_plot, rug, gene_c_plot, ncols=1, toolbar_options=dict(logo=None)) # Generate high quality images only if accessed via web instance # Open thread for high quality image exports command = "python3 LDproxy_plot_sub.py " + snp + " " + pop + " " + request + " " + genome_build + " " + r2_d + " " + str(window) + " " + collapseTranscript subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) ########################### # Html output for testing # ########################### #html=file_html(out_grid, CDN, "Test Plot") # out_html=open("LDproxy.html","w") #print >> out_html, html # out_html.close() out_script, out_div = components(out_grid, CDN) reset_output() # Print run time statistics pop_list = open(tmp_dir + "pops_" + request + ".txt").readlines() print("\nNumber of Individuals: " + str(len(pop_list))) print("SNPs in Region: " + str(len(out_prox))) duration = time.time() - start_time print("Run time: " + str(duration) + " seconds\n") # Return plot output return(out_script, out_div)
def calculate_clip(snplst, pop, request, web, genome_build, r2_threshold=0.1, maf_threshold=0.01): max_list = 5000 # Set data directories using config.yml with open('config.yml', 'r') as yml_file: config = yaml.load(yml_file) env = config['env'] api_mongo_addr = config['api']['api_mongo_addr'] dbsnp_version = config['data']['dbsnp_version'] population_samples_dir = config['data']['population_samples_dir'] data_dir = config['data']['data_dir'] tmp_dir = config['data']['tmp_dir'] genotypes_dir = config['data']['genotypes_dir'] aws_info = config['aws'] mongo_username = config['database']['mongo_user_readonly'] mongo_password = config['database']['mongo_password'] mongo_port = config['database']['mongo_port'] export_s3_keys = retrieveAWSCredentials() # Ensure tmp directory exists if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) # Create JSON output out_json = open(tmp_dir+"clip"+request+".json", "w") output = {} # Validate genome build param print("genome_build " + genome_build) if genome_build not in genome_build_vars['vars']: output["error"] = "Invalid genome build. Please specify either " + ", ".join(genome_build_vars['vars']) + "." json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() return("", "", "") # Open SNP list file snps_raw = open(snplst).readlines() if len(snps_raw) > max_list: output["error"] = "Maximum SNP list is " + \ str(max_list)+" RS numbers. Your list contains " + \ str(len(snps_raw))+" entries." json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() return("", "", "") # Remove duplicate RS numbers snps = [] for snp_raw in snps_raw: snp = snp_raw.strip().split() if snp not in snps: snps.append(snp) # Select desired ancestral populations pops = pop.split("+") pop_dirs = [] for pop_i in pops: if pop_i in ["ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU", "TSI", "YRI"]: pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt") else: output["error"] = pop_i+" is not an ancestral population. Choose one of the following ancestral populations: AFR, AMR, EAS, EUR, or SAS; or one of the following sub-populations: ACB, ASW, BEB, CDX, CEU, CHB, CHS, CLM, ESN, FIN, GBR, GIH, GWD, IBS, ITU, JPT, KHV, LWK, MSL, MXL, PEL, PJL, PUR, STU, TSI, or YRI." json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() return("", "", "") get_pops = "cat " + " ".join(pop_dirs) pop_list = [x.decode('utf-8') for x in subprocess.Popen(get_pops, shell=True, stdout=subprocess.PIPE).stdout.readlines()] ids = [i.strip() for i in pop_list] pop_ids = list(set(ids)) # Connect to Mongo snp database if env == 'local': mongo_host = api_mongo_addr else: mongo_host = 'localhost' if web: client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port) else: if env == 'local': client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port) else: client = MongoClient('localhost', mongo_port) db = client["LDLink"] def get_coords(db, rsid): rsid = rsid.strip("rs") query_results = db.dbsnp.find_one({"id": rsid}) query_results_sanitized = json.loads(json_util.dumps(query_results)) return query_results_sanitized # Replace input genomic coordinates with variant ids (rsids) def replace_coords_rsid(db, snp_lst): new_snp_lst = [] for snp_raw_i in snp_lst: if snp_raw_i[0][0:2] == "rs": new_snp_lst.append(snp_raw_i) else: snp_info_lst = get_rsnum(db, snp_raw_i[0], genome_build) print("snp_info_lst") print(snp_info_lst) if snp_info_lst != None: if len(snp_info_lst) > 1: var_id = "rs" + snp_info_lst[0]['id'] ref_variants = [] for snp_info in snp_info_lst: if snp_info['id'] == snp_info['ref_id']: ref_variants.append(snp_info['id']) if len(ref_variants) > 1: var_id = "rs" + ref_variants[0] if "warning" in output: output["warning"] = output["warning"] + \ ". Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp_raw_i[0] else: output["warning"] = "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp_raw_i[0] elif len(ref_variants) == 0 and len(snp_info_lst) > 1: var_id = "rs" + snp_info_lst[0]['id'] if "warning" in output: output["warning"] = output["warning"] + \ ". Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp_raw_i[0] else: output["warning"] = "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp_raw_i[0] else: var_id = "rs" + ref_variants[0] new_snp_lst.append([var_id]) elif len(snp_info_lst) == 1: var_id = "rs" + snp_info_lst[0]['id'] new_snp_lst.append([var_id]) else: new_snp_lst.append(snp_raw_i) else: new_snp_lst.append(snp_raw_i) return new_snp_lst snps = replace_coords_rsid(db, snps) # Find RS numbers in snp database details = collections.OrderedDict() rs_nums = [] snp_pos = [] snp_coords = [] warn = [] tabix_coords = "" for snp_i in snps: if len(snp_i) > 0: if len(snp_i[0]) > 2: if (snp_i[0][0:2] == "rs" or snp_i[0][0:3] == "chr") and snp_i[0][-1].isdigit(): snp_coord = get_coords(db, snp_i[0]) if snp_coord != None and snp_coord[genome_build_vars[genome_build]['position']] != "NA": # check if variant is on chrY for genome build = GRCh38 if snp_coord['chromosome'] == "Y" and (genome_build == "grch38" or genome_build == "grch38_high_coverage"): if "warning" in output: output["warning"] = output["warning"] + \ ". " + "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp_coord['id'] + " = chr" + snp_coord['chromosome'] + ":" + snp_coord[genome_build_vars[genome_build]['position']] + ")" else: output["warning"] = "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp_coord['id'] + " = chr" + snp_coord['chromosome'] + ":" + snp_coord[genome_build_vars[genome_build]['position']] + ")" warn.append(snp_i[0]) details[snp_i[0]] = ["NA", "NA", "Chromosome Y variants are unavailable for GRCh38, only available for GRCh37."] else: rs_nums.append(snp_i[0]) snp_pos.append(snp_coord[genome_build_vars[genome_build]['position']]) temp = [snp_i[0], snp_coord['chromosome'], snp_coord[genome_build_vars[genome_build]['position']]] snp_coords.append(temp) else: warn.append(snp_i[0]) details[snp_i[0]] = ["NA", "NA", "Variant not found in dbSNP" + dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + "), variant removed."] else: warn.append(snp_i[0]) details[snp_i[0]] = ["NA", "NA", "Not a RS number, query removed."] else: warn.append(snp_i[0]) details[snp_i[0]] = ["NA", "NA", "Not a RS number, query removed."] else: output["error"] = "Input list of RS numbers is empty" json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() return("", "", "") if warn != []: if "warning" in output: output["warning"] = output["warning"] + \ ". The following RS number(s) or coordinate(s) inputs have warnings: " + ", ".join(warn) else: output["warning"] = "The following RS number(s) or coordinate(s) inputs have warnings: " + ", ".join(warn) if len(rs_nums) == 0: output["error"] = "Input SNP list does not contain any valid RS numbers or coordinates. " + output["warning"] json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() return("", "", "") # Check SNPs are all on the same chromosome for i in range(len(snp_coords)): if snp_coords[0][1] != snp_coords[i][1]: output["error"] = "Not all input variants are on the same chromosome: "+snp_coords[i-1][0]+"=chr" + \ str(snp_coords[i-1][1])+":"+str(snp_coords[i-1][2])+", "+snp_coords[i][0] + \ "=chr"+str(snp_coords[i][1])+":"+str(snp_coords[i][2])+"." json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() return("", "", "") # Make tabix formatted coordinates snp_coord_str = [genome_build_vars[genome_build]['1000G_chr_prefix'] + snp_coords[0][1]+":"+i+"-"+i for i in snp_pos] tabix_coords = " "+" ".join(snp_coord_str) # Extract 1000 Genomes phased genotypes vcf_filePath = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]['1000G_dir'], genome_build_vars[genome_build]['1000G_file'] % (snp_coords[0][1])) vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath) checkS3File(aws_info, config['aws']['bucket'], vcf_filePath) vcf = retrieveTabix1000GData(vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) # Make MAF function def calc_maf(genos): vals = {"0|0": 0, "0|1": 0, "1|0": 0, "1|1": 0, "0": 0, "1": 0} for i in range(len(genos)): if genos[i] in vals: vals[genos[i]] += 1 zeros = vals["0|0"]*2+vals["0|1"]+vals["1|0"]+vals["0"] ones = vals["1|1"]*2+vals["0|1"]+vals["1|0"]+vals["1"] total = zeros+ones f0 = zeros*1.0/total f1 = ones*1.0/total maf = min(f0, f1) return f0, f1, maf # Define function to correct indel alleles def set_alleles(a1, a2): if len(a1) == 1 and len(a2) == 1: a1_n = a1 a2_n = a2 elif len(a1) == 1 and len(a2) > 1: a1_n = "-" a2_n = a2[1:] elif len(a1) > 1 and len(a2) == 1: a1_n = a1[1:] a2_n = "-" elif len(a1) > 1 and len(a2) > 1: a1_n = a1[1:] a2_n = a2[1:] return(a1_n, a2_n) # Make R2 function def calc_r2(var1, var2): hap_vals = {"0|0-0|0": 0, "0|0-0|1": 0, "0|0-1|0": 0, "0|0-1|1": 0, "0|1-0|0": 0, "0|1-0|1": 0, "0|1-1|0": 0, "0|1-1|1": 0, "1|0-0|0": 0, "1|0-0|1": 0, "1|0-1|0": 0, "1|0-1|1": 0, "1|1-0|0": 0, "1|1-0|1": 0, "1|1-1|0": 0, "1|1-1|1": 0, "0-0": 0, "0-1": 0, "1-0": 0, "1-1": 0} for i in range(len(var1)): ind_geno = var1[i]+"-"+var2[i] if ind_geno in hap_vals: hap_vals[ind_geno] += 1 A = hap_vals["0|0-0|0"]*2+hap_vals["0|0-0|1"]+hap_vals["0|0-1|0"]+hap_vals["0|1-0|0"] + \ hap_vals["0|1-0|1"]+hap_vals["1|0-0|0"] + \ hap_vals["1|0-1|0"]+hap_vals["0-0"] B = hap_vals["0|0-0|1"]+hap_vals["0|0-1|0"]+hap_vals["0|0-1|1"]*2+hap_vals["0|1-1|0"] + \ hap_vals["0|1-1|1"]+hap_vals["1|0-0|1"] + \ hap_vals["1|0-1|1"]+hap_vals["0-1"] C = hap_vals["0|1-0|0"]+hap_vals["0|1-1|0"]+hap_vals["1|0-0|0"]+hap_vals["1|0-0|1"] + \ hap_vals["1|1-0|0"]*2+hap_vals["1|1-0|1"] + \ hap_vals["1|1-1|0"]+hap_vals["1-0"] D = hap_vals["0|1-0|1"]+hap_vals["0|1-1|1"]+hap_vals["1|0-1|0"]+hap_vals["1|0-1|1"] + \ hap_vals["1|1-0|1"]+hap_vals["1|1-1|0"] + \ hap_vals["1|1-1|1"]*2+hap_vals["1-1"] delta = float(A*D-B*C) Ms = float((A+C)*(B+D)*(A+B)*(C+D)) if Ms != 0: r2 = (delta**2)/Ms else: r2 = None return(r2) # Import SNP VCF file hap_dict = {} h = 0 while vcf[h][0:2] == "##": h += 1 head = vcf[h].strip().split() # Extract population specific haplotypes pop_index = [] for i in range(9, len(head)): if head[i] in pop_ids: pop_index.append(i) rsnum_lst = [] for g in range(h+1, len(vcf)): geno = vcf[g].strip().split() geno[0] = geno[0].lstrip('chr') if geno[1] not in snp_pos: if "warning" in output: output["warning"] = output["warning"]+". Genomic position ("+geno[1]+") in VCF file does not match db" + \ dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant" else: output["warning"] = "Genomic position ("+geno[1]+") in VCF file does not match db" + \ dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ") search coordinates for query variant" continue if snp_pos.count(geno[1]) == 1: rs_query = rs_nums[snp_pos.index(geno[1])] else: pos_index = [] for p in range(len(snp_pos)): if snp_pos[p] == geno[1]: pos_index.append(p) for p in pos_index: if rs_nums[p] not in rsnum_lst: rs_query = rs_nums[p] break if rs_query in rsnum_lst: continue rs_1000g = geno[2] if rs_query == rs_1000g: rsnum = rs_1000g else: count = -2 found = "false" while count <= 2 and count+g < len(vcf): geno_next = vcf[g+count].strip().split() geno_next[0] = geno_next[0].lstrip('chr') if len(geno_next) >= 3 and rs_query == geno_next[2]: found = "true" break count += 1 if found == "false": if "rs" in rs_1000g: if "warning" in output: output["warning"] = output["warning"] + \ ". Genomic position for query variant ("+rs_query + \ ") does not match RS number at 1000G position (chr" + \ geno[0]+":"+geno[1]+" = "+rs_1000g+")" else: output["warning"] = "Genomic position for query variant ("+rs_query + \ ") does not match RS number at 1000G position (chr" + \ geno[0]+":"+geno[1]+" = "+rs_1000g+")" indx = [i[0] for i in snps].index(rs_query) # snps[indx][0]=geno[2] # rsnum=geno[2] snps[indx][0] = rs_query rsnum = rs_query # try: # indx=[i[0] for i in snps].index(rs_query) # snps[indx][0]=geno[2] # rsnum=geno[2] # except ValueError: # print("List does not contain value:") # print "#####" # print "variable rs_query " + rs_query # print "variable snps " + str(snps) # print "#####" else: continue details[rsnum] = ["chr"+geno[0]+":"+geno[1]] if "," not in geno[3] and "," not in geno[4]: temp_genos = [] for i in range(len(pop_index)): temp_genos.append(geno[pop_index[i]]) f0, f1, maf = calc_maf(temp_genos) a0, a1 = set_alleles(geno[3], geno[4]) details[rsnum].append( a0+"="+str(round(f0, 3))+", "+a1+"="+str(round(f1, 3))) if maf_threshold <= maf: hap_dict[rsnum] = [temp_genos] rsnum_lst.append(rsnum) else: details[rsnum].append( "Variant MAF is "+str(round(maf, 4))+", variant removed.") else: details[rsnum].append(geno[3]+"=NA, "+geno[4]+"=NA") details[rsnum].append("Variant is not biallelic, variant removed.") for i in rs_nums: if i not in rsnum_lst: if i not in details: index_i = rs_nums.index(i) details[i] = ["chr"+snp_coords[index_i][1]+":"+snp_coords[index_i][2]+"-" + snp_coords[index_i][2], "NA", "Variant not in 1000G VCF file, variant removed."] # Thin the SNPs # sup_2=u"\u00B2" sup_2 = "2" i = 0 while i < len(rsnum_lst): details[rsnum_lst[i]].append("Variant kept.") remove_list = [] for j in range(i+1, len(rsnum_lst)): r2 = calc_r2(hap_dict[rsnum_lst[i]][0], hap_dict[rsnum_lst[j]][0]) if r2_threshold <= r2: snp = rsnum_lst[j] details[snp].append("Variant in LD with "+rsnum_lst[i] + " (R"+sup_2+"="+str(round(r2, 4))+"), variant removed.") remove_list.append(snp) for snp in remove_list: rsnum_lst.remove(snp) i += 1 # Return output json_output = json.dumps(output, sort_keys=True, indent=2) print(json_output, file=out_json) out_json.close() return(snps, rsnum_lst, details)
def calculate_pop(snp1, snp2, pop, r2_d, web, genome_build, request=None): # trim any whitespace snp1 = snp1.lower().strip() snp2 = snp2.lower().strip() snp1_input = snp1 snp2_input = snp2 # Set data directories using config.yml with open('config.yml', 'r') as yml_file: config = yaml.load(yml_file) env = config['env'] api_mongo_addr = config['api']['api_mongo_addr'] dbsnp_version = config['data']['dbsnp_version'] population_samples_dir = config['data']['population_samples_dir'] data_dir = config['data']['data_dir'] tmp_dir = config['data']['tmp_dir'] genotypes_dir = config['data']['genotypes_dir'] aws_info = config['aws'] mongo_username = config['database']['mongo_user_readonly'] mongo_password = config['database']['mongo_password'] mongo_port = config['database']['mongo_port'] export_s3_keys = retrieveAWSCredentials() # Ensure tmp directory exists if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) # Create JSON output output = {} # Validate genome build param print("genome_build " + genome_build) if genome_build not in genome_build_vars['vars']: output["error"] = "Invalid genome build. Please specify either " + ", ".join(genome_build_vars['vars']) + "." return(json.dumps(output, sort_keys=True, indent=2)) # Connect to Mongo snp database if env == 'local': mongo_host = api_mongo_addr else: mongo_host = 'localhost' if web: client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port) else: if env == 'local': client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port) else: client = MongoClient('localhost', mongo_port) db = client["LDLink"] def get_chrom_coords(db, rsid): rsid = rsid.strip("rs") query_results = db.dbsnp.find_one({"id": rsid}) query_results_sanitized = json.loads(json_util.dumps(query_results)) return query_results_sanitized # Replace input genomic coordinates with variant ids (rsids) def replace_coord_rsid(db, snp): if snp[0:2] == "rs": return snp else: snp_info_lst = get_rsnum(db, snp, genome_build) # print "snp_info_lst" # print snp_info_lst if snp_info_lst != None: if len(snp_info_lst) > 1: var_id = "rs" + snp_info_lst[0]['id'] ref_variants = [] for snp_info in snp_info_lst: if snp_info['id'] == snp_info['ref_id']: ref_variants.append(snp_info['id']) if len(ref_variants) > 1: var_id = "rs" + ref_variants[0] if "warning" in output: output["warning"] = output["warning"] + \ ". Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp else: output["warning"] = "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp elif len(ref_variants) == 0 and len(snp_info_lst) > 1: var_id = "rs" + snp_info_lst[0]['id'] if "warning" in output: output["warning"] = output["warning"] + \ ". Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp else: output["warning"] = "Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp else: var_id = "rs" + ref_variants[0] return var_id elif len(snp_info_lst) == 1: var_id = "rs" + snp_info_lst[0]['id'] return var_id else: return snp else: return snp return snp snp1 = replace_coord_rsid(db, snp1) snp2 = replace_coord_rsid(db, snp2) snp1_ldpair = snp1 snp2_ldpair = snp2 snp1_coord = get_chrom_coords(db, snp1) snp2_coord = get_chrom_coords(db, snp2) # Check if RS numbers are in snp database # SNP1 if snp1_coord == None or snp1_coord[genome_build_vars[genome_build]['position']] == "NA": output["error"] = snp1 + " is not in dbSNP build " + dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ")." if web: output = json.dumps(output, sort_keys=True, indent=2) return output # SNP2 if snp2_coord == None or snp2_coord[genome_build_vars[genome_build]['position']] == "NA": output["error"] = snp2 + " is not in dbSNP build " + dbsnp_version + " (" + genome_build_vars[genome_build]['title'] + ")." if web: output = json.dumps(output, sort_keys=True, indent=2) return output # Check if SNPs are on the same chromosome if snp1_coord['chromosome'] != snp2_coord['chromosome']: output["warning"] = snp1 + " and " + \ snp2 + " are on different chromosomes" # Check if input SNPs are on chromosome Y while genome build == grch38 # SNP1 if snp1_coord['chromosome'] == "Y" and (genome_build == "grch38" or genome_build == "grch38_high_coverage"): output["error"] = "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp1_coord['id'] + " - chr" + snp1_coord['chromosome'] + ":" + snp1_coord[genome_build_vars[genome_build]['position']] + ")" return(json.dumps(output, sort_keys=True, indent=2)) # SNP2 if snp2_coord['chromosome'] == "Y" and (genome_build == "grch38" or genome_build == "grch38_high_coverage"): output["error"] = "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp2_coord['id'] + " - chr" + snp2_coord['chromosome'] + ":" + snp2_coord[genome_build_vars[genome_build]['position']] + ")" return(json.dumps(output, sort_keys=True, indent=2)) # create indexes for population order pop_order = { "ALL": 1, "AFR": 2, "YRI": 3, "LWK": 4, "GWD": 5, "MSL": 6, "ESN": 7, "ASW": 8, "ACB": 9, "AMR": 10, "MXL": 11, "PUR": 12, "CLM": 13, "PEL": 14, "EAS": 15, "CHB": 16, "JPT": 17, "CHS": 18, "CDX": 19, "KHV": 20, "EUR": 21, "CEU": 22, "TSI": 23, "FIN": 24, "GBR": 25, "IBS": 26, "SAS": 27, "GIH": 28, "PJL": 29, "BEB": 30, "STU": 31, "ITU": 32 } pop_groups = { "ALL": ["ACB", "ASW", "BEB", "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU", "TSI", "YRI"], "AFR": ["YRI", "LWK", "GWD", "MSL", "ESN", "ASW", "ACB"], "AMR": ["MXL", "PUR", "CLM", "PEL"], "EAS": ["CHB", "JPT", "CHS", "CDX", "KHV"], "EUR": ["CEU", "TSI", "FIN", "GBR" , "IBS"], "SAS": ["GIH", "PJL", "BEB", "STU" , "ITU"] } # empty list for paths to population data pop_dirs = [] pop_split = pop.split("+") # display superpopulation and all subpopulations if "ALL" in pop_split: # pop_split.remove("ALL") pop_split = pop_split + pop_groups["ALL"] + list(pop_groups.keys()) pop_split = list(set(pop_split)) # unique elements else: if "AFR" in pop_split: # pop_split.remove("AFR") pop_split = pop_split + pop_groups["AFR"] pop_split = list(set(pop_split)) # unique elements if "AMR" in pop_split: # pop_split.remove("AMR") pop_split = pop_split + pop_groups["AMR"] pop_split = list(set(pop_split)) # unique elements if "EAS" in pop_split: # pop_split.remove("EAS") pop_split = pop_split + pop_groups["EAS"] pop_split = list(set(pop_split)) # unique elements if "EUR" in pop_split: # pop_split.remove("EUR") pop_split = pop_split + pop_groups["EUR"] pop_split = list(set(pop_split)) # unique elements if "SAS" in pop_split: # pop_split.remove("SAS") pop_split = pop_split + pop_groups["SAS"] pop_split = list(set(pop_split)) # unique elements for pop_i in pop_split: if pop_i in ["ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU", "TSI", "YRI"]: pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt") else: output["error"] = pop_i + " is not an ancestral population. Choose one of the following ancestral populations: AFR, AMR, EAS, EUR, or SAS; or one of the following sub-populations: ACB, ASW, BEB, CDX, CEU, CHB, CHS, CLM, ESN, FIN, GBR, GIH, GWD, IBS, ITU, JPT, KHV, LWK, MSL, MXL, PEL, PJL, PUR, STU, TSI, or YRI." if web: output = json.dumps(output, sort_keys=True, indent=2) return output #make empty dictionary to keep sample IDs in for each wanted population ID_dict = {k: [] for k in pop_split} adds = ["CHROM", "POS", "ID", "REF", "ALT"] for pop_i in pop_split: with open(data_dir + population_samples_dir + pop_i + ".txt", "r") as f: # print pop_dir + pop_i + ".txt" for line in f: cleanedLine = line.strip() if cleanedLine: # is not empty ID_dict[pop_i].append(cleanedLine) for entry in adds: ID_dict[pop_i].append(entry) # Extract 1000 Genomes phased genotypes # SNP1 vcf_filePath1 = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]['1000G_dir'], genome_build_vars[genome_build]['1000G_file'] % snp1_coord['chromosome']) vcf_rs1 = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath1) checkS3File(aws_info, config['aws']['bucket'], vcf_filePath1) rs1_test = export_s3_keys + " cd {3}; tabix -D {0} {1}:{2}-{2} | grep -v -e END".format(vcf_rs1, genome_build_vars[genome_build]['1000G_chr_prefix'] + snp1_coord['chromosome'], snp1_coord[genome_build_vars[genome_build]['position']], data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) vcf1 = [x.decode('utf-8') for x in subprocess.Popen(rs1_test, shell=True, stdout=subprocess.PIPE).stdout.readlines()] vcf_filePath2 = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]['1000G_dir'], genome_build_vars[genome_build]['1000G_file'] % snp2_coord['chromosome']) vcf_rs2 = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath2) checkS3File(aws_info, config['aws']['bucket'], vcf_filePath2) # need to add | grep -v -e END ??? rs2_test = export_s3_keys + " cd {3}; tabix -D {0} {1}:{2}-{2} | grep -v -e END".format(vcf_rs2, genome_build_vars[genome_build]['1000G_chr_prefix'] + snp2_coord['chromosome'], snp2_coord[genome_build_vars[genome_build]['position']], data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) vcf2 = [x.decode('utf-8') for x in subprocess.Popen(rs2_test, shell=True, stdout=subprocess.PIPE).stdout.readlines()] # Check if SNPs are in 1000G reference panel # SNP1 if len(vcf1) == 0: output["error"] = snp1 + " is not in 1000G reference panel." if web: output = json.dumps(output, sort_keys=True, indent=2) return output elif len(vcf1) > 1: geno1 = [] for i in range(len(vcf1)): if vcf1[i].strip().split()[2] == snp1: geno1 = vcf1[i].strip().split() geno1[0] = geno1[0].lstrip('chr') if geno1 == []: output["error"] = snp1 + " is not in 1000G reference panel." if web: output = json.dumps(output, sort_keys=True, indent=2) return output else: geno1 = vcf1[0].strip().split() geno1[0] = geno1[0].lstrip('chr') if geno1[2] != snp1 and snp1[0:2] == "rs" and "rs" in geno1[2]: if "warning" in output: output["warning"] = output["warning"] + \ ". Genomic position for query variant1 (" + snp1 + \ ") does not match RS number at 1000G position (chr" + \ geno1[0]+":"+geno1[1]+" = "+geno1[2]+")" else: output["warning"] = "Genomic position for query variant1 (" + snp1 + \ ") does not match RS number at 1000G position (chr" + \ geno1[0]+":"+geno1[1]+" = "+geno1[2]+")" snp1 = geno1[2] if "," in geno1[3] or "," in geno1[4]: output["error"] = snp1 + " is not a biallelic variant." return(json.dumps(output, sort_keys=True, indent=2)) # SNP2 if len(vcf2) == 0: output["error"] = snp2 + " is not in 1000G reference panel." if web: output = json.dumps(output, sort_keys=True, indent=2) return output elif len(vcf2) > 1: geno2 = [] for i in range(len(vcf2)): if vcf2[i].strip().split()[2] == snp2: geno2 = vcf2[i].strip().split() geno2[0] = geno2[0].lstrip('chr') if geno2 == []: output["error"] = snp2 + " is not in 1000G reference panel." if web: output = json.dumps(output, sort_keys=True, indent=2) return output else: geno2 = vcf2[0].strip().split() geno2[0] = geno2[0].lstrip('chr') if geno2[2] != snp2 and snp2[0:2] == "rs" and "rs" in geno2[2]: if "warning" in output: output["warning"] = output["warning"] + \ ". Genomic position for query variant2 (" + snp2 + \ ") does not match RS number at 1000G position (chr" + \ geno2[0]+":"+geno2[1]+" = "+geno2[2]+")" else: output["warning"] = "Genomic position for query variant2 (" + snp2 + \ ") does not match RS number at 1000G position (chr" + \ geno2[0]+":"+geno2[1]+" = "+geno2[2]+")" snp2 = geno2[2] if "," in geno2[3] or "," in geno2[4]: output["error"] = snp2 + " is not a biallelic variant." return(json.dumps(output, sort_keys=True, indent=2)) # vcf1 = vcf1[0].strip().split() # vcf2 = vcf2[0].strip().split() # Get headers tabix_snp1_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(vcf_rs1, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) head1 = [x.decode('utf-8') for x in subprocess.Popen(tabix_snp1_h, shell=True, stdout=subprocess.PIPE).stdout.readlines()][0].strip().split() tabix_snp2_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(vcf_rs2, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) head2 = [x.decode('utf-8') for x in subprocess.Popen(tabix_snp2_h, shell=True, stdout=subprocess.PIPE).stdout.readlines()][0].strip().split() rs1_dict = dict(list(zip(head1, geno1))) rs2_dict = dict(list(zip(head2, geno2))) if "<" in rs1_dict["REF"]: if "warning" in output: output["warning"] = output["warning"] + \ "." + snp1 + "is a CNV marker. " else: output["warning"] = snp1 + "is a CNV marker. " if "<" in rs2_dict["REF"]: if "warning" in output: output["warning"] = output["warning"] + \ "." + snp2 + "is a CNV marker. " else: output["warning"] = snp2 + "is a CNV marker. " geno_ind = { "rs1" : {k: [] for k in pop_split}, "rs2" : {k: [] for k in pop_split} } #SNP1 for colname in rs1_dict: for key in ID_dict: if (colname in ID_dict[key]) and (colname not in adds): geno_ind["rs1"][key].append(rs1_dict[colname] + "|." if len(rs1_dict[colname]) == 1 else rs1_dict[colname]) #SNP2 for colname in rs2_dict: for key in ID_dict: if (colname in ID_dict[key]) and (colname not in adds): geno_ind["rs2"][key].append(rs2_dict[colname] + "|." if len(rs2_dict[colname]) == 1 else rs2_dict[colname]) #population freqency dictionary to fill in pop_freqs = { "ref_freq_snp1" : { }, \ "ref_freq_snp2" : { }, \ "alt_freq_snp1" : { }, \ "alt_freq_snp2" : { }, \ "total_alleles": { } } for key in geno_ind["rs1"]: pop_freqs["total_alleles"][key] = float(2*geno_ind["rs1"][key].count("0|0") + 2*geno_ind["rs1"][key].count("0|1") + 2*geno_ind["rs1"][key].count("1|1") + 2* geno_ind["rs1"][key].count("1|0") + 2* geno_ind["rs1"][key].count("0|.") + 2* geno_ind["rs1"][key].count("1|.")) if (pop_freqs["total_alleles"][key] > 0): pop_freqs["ref_freq_snp1"][key] = round(((2*geno_ind["rs1"][key].count("0|0") + geno_ind["rs1"][key].count("0|1") + geno_ind["rs1"][key].count("1|0") + geno_ind["rs1"][key].count("1|.") + geno_ind["rs1"][key].count("0|."))/ float(pop_freqs["total_alleles"][key])) *100, 2) pop_freqs["ref_freq_snp2"][key] = round(((2*geno_ind["rs2"][key].count("0|0") + geno_ind["rs2"][key].count("0|1") + geno_ind["rs2"][key].count("1|0") + geno_ind["rs2"][key].count("1|.") + geno_ind["rs2"][key].count("0|."))/ float(pop_freqs["total_alleles"][key])) *100, 2) pop_freqs["alt_freq_snp1"][key] = round(((2*geno_ind["rs1"][key].count("1|1") + geno_ind["rs1"][key].count("0|1") + geno_ind["rs1"][key].count("1|0") + geno_ind["rs1"][key].count("1|.") + geno_ind["rs1"][key].count("0|."))/ float(pop_freqs["total_alleles"][key])) *100, 2) pop_freqs["alt_freq_snp2"][key] = round(((2*geno_ind["rs2"][key].count("1|1") + geno_ind["rs2"][key].count("0|1") + geno_ind["rs2"][key].count("1|0") + geno_ind["rs2"][key].count("1|.") + geno_ind["rs2"][key].count("0|."))/ float(pop_freqs["total_alleles"][key])) *100, 2) else : output["error"] = "Insufficient haplotype data for " + snp1 + " and " + snp2 + " in 1000G reference panel." if web: output = json.dumps(output, sort_keys=True, indent=2) return output #get sample size for each population sample_size_dict = {} for key in ID_dict: sample_size_dict[key] = len(ID_dict[key])- len(adds) # Combine phased genotype # Extract haplotypes hap = {k: {"0_0": 0, "0_1": 0, "1_0": 0, "1_1": 0, "0_.": 0, "1_.": 0, "._.": 0, "._0": 0, "._1": 0} for k in pop_split} for pop in geno_ind["rs1"]: if len(geno_ind["rs1"][pop]) == len(geno_ind["rs2"][pop]): geno_ind_range = len(geno_ind["rs1"][pop]) elif len(geno_ind["rs1"][pop]) < len(geno_ind["rs2"][pop]): geno_ind_range = len(geno_ind["rs1"][pop]) else: geno_ind_range = len(geno_ind["rs2"][pop]) for ind in range(geno_ind_range): # if len(geno_ind["rs1"][pop][ind]) == 3: hap1 = geno_ind["rs1"][pop][ind][0] + "_" + geno_ind["rs2"][pop][ind][0] hap2 = geno_ind["rs1"][pop][ind][2] + "_" + geno_ind["rs2"][pop][ind][2] if hap1 in hap[pop]: hap[pop][hap1] += 1 hap[pop][hap2] += 1 # Remove missing haplotypes pops = list(hap.keys()) for pop in pops: keys = list(hap[pop].keys()) for key in keys: if "." in key: hap[pop].pop(key, None) # Sort haplotypes matrix_values = {k : {"A": "", "B": "", "C": "", "D": "", "N": "", "delta" : "", "Ms" : "" , "D_prime":"", "r2":""} for k in pop_split} for pop in hap: matrix_values[pop]["A"] = hap[pop][sorted(hap[pop])[0]] matrix_values[pop]["B"] = hap[pop][sorted(hap[pop])[1]] matrix_values[pop]["C"] = hap[pop][sorted(hap[pop])[2]] matrix_values[pop]["D"] = hap[pop][sorted(hap[pop])[3]] matrix_values[pop]["N"] = matrix_values[pop]["A"] + matrix_values[pop]["B"] + matrix_values[pop]["C"] + matrix_values[pop]["D"] matrix_values[pop]["delta"] = float(matrix_values[pop]["A"] * matrix_values[pop]["D"] - matrix_values[pop]["B"] * matrix_values[pop]["C"]) matrix_values[pop]["Ms"] = float((matrix_values[pop]["A"] + matrix_values[pop]["C"]) * (matrix_values[pop]["B"] + matrix_values[pop]["D"]) * (matrix_values[pop]["A"] + matrix_values[pop]["B"]) * (matrix_values[pop]["C"] + matrix_values[pop]["D"])) if matrix_values[pop]["Ms"] != 0: # D prime if matrix_values[pop]["delta"] < 0: matrix_values[pop]["D_prime"] = abs(matrix_values[pop]["delta"] / min((matrix_values[pop]["A"] + matrix_values[pop]["C"]) * (matrix_values[pop]["A"] + matrix_values[pop]["B"]), (matrix_values[pop]["B"] + matrix_values[pop]["D"]) * (matrix_values[pop]["C"] + matrix_values[pop]["D"]))) else: matrix_values[pop]["D_prime"] = abs(matrix_values[pop]["delta"] / min((matrix_values[pop]["A"] + matrix_values[pop]["C"]) * (matrix_values[pop]["C"] + matrix_values[pop]["D"]), (matrix_values[pop]["A"] + matrix_values[pop]["B"]) * (matrix_values[pop]["B"] + matrix_values[pop]["D"]))) # R2 matrix_values[pop]["r2"]= (matrix_values[pop]["delta"]**2) / matrix_values[pop]["Ms"] num = (matrix_values[pop]["A"] + matrix_values[pop]["B"] + matrix_values[pop]["C"] + matrix_values[pop]["D"]) * (matrix_values[pop]["A"] * matrix_values[pop]["D"] - matrix_values[pop]["B"] * matrix_values[pop]["C"])**2 denom = matrix_values[pop]["Ms"] matrix_values[pop]["chisq"] = num / denom matrix_values[pop]["p"] = 2 * (1 - (0.5 * (1 + math.erf(matrix_values[pop]["chisq"] **0.5 / 2**0.5)))) else: matrix_values[pop]["D_prime"] = "NA" matrix_values[pop]["r2"] = "NA" matrix_values[pop]["chisq"] = "NA" matrix_values[pop]["p"] = "NA" for pops in sample_size_dict: output[pops] = { 'Population': pops , 'N': sample_size_dict[pops], \ # rs1_dict["ID"] + ' Allele Freq': { # rs1_dict["REF"] : str(pop_freqs["ref_freq_snp1"][pops]) + "%", \ # rs1_dict["ALT"] : str(pop_freqs["alt_freq_snp1"][pops]) + "%" # }, \ # rs2_dict["ID"] + ' Allele Freq': { # rs2_dict["REF"] : str(pop_freqs["ref_freq_snp2"][pops]) + "%", \ # rs2_dict["ALT"] : str(pop_freqs["alt_freq_snp2"][pops]) + "%" # }, 'rs#1 Allele Freq': { rs1_dict["REF"] : str(pop_freqs["ref_freq_snp1"][pops]) + "%", \ rs1_dict["ALT"] : str(pop_freqs["alt_freq_snp1"][pops]) + "%" }, \ 'rs#2 Allele Freq': { rs2_dict["REF"] : str(pop_freqs["ref_freq_snp2"][pops]) + "%", \ rs2_dict["ALT"] : str(pop_freqs["alt_freq_snp2"][pops]) + "%" }, "D'" : matrix_values[pops]["D_prime"] if isinstance(matrix_values[pops]["D_prime"], str) else round(float(matrix_values[pops]["D_prime"]), 4), \ "R2" : matrix_values[pops]["r2"] if isinstance(matrix_values[pops]["r2"], str) else round(float(matrix_values[pops]["r2"]), 4), \ "chisq" : matrix_values[pops]["chisq"] if isinstance(matrix_values[pops]["chisq"], str) else round(float(matrix_values[pops]["chisq"]), 4), \ "p" : matrix_values[pops]["p"] if isinstance(matrix_values[pops]["p"], str) else round(float(matrix_values[pops]["p"]), 4) } # print json.dumps(output) location_data = { "ALL" : { "location": "All Populations" }, "AFR" : { "location": "African" }, "AMR" : { "location": "Ad Mixed American" }, "EAS" : { "location": "East Asian" }, "EUR" : { "location": "European" }, "SAS" : { "location": "South Asian" }, "YRI": { "location": "Yoruba in Ibadan, Nigeria", "superpopulation": "AFR", "latitude": 7.40026, "longitude": 3.910742 }, "LWK": { "location": "Luhya in Webuye, Kenya", "superpopulation": "AFR", "latitude": 0.59738, "longitude": 34.777227 }, "GWD": { "location": "Gambian in Western Divisions in the Gambia", "superpopulation": "AFR", "latitude": 13.474133, "longitude": -16.394272 }, "MSL": { "location": "Mende in Sierra Leone", "superpopulation": "AFR", "latitude": 8.176076, "longitude": -11.040253 }, "ESN": { "location": "Esan in Nigeria", "superpopulation": "AFR", "latitude": 6.687988, "longitude": 6.212868 }, "ASW": { "location": "Americans of African Ancestry in SW USA", "superpopulation": "AFR", "latitude": 35.310647, "longitude": -107.975885 }, "ACB": { "location": "African Caribbeans in Barbados", "superpopulation": "AFR", "latitude": 13.172483, "longitude": -59.552779 }, "MXL": { "location": "Mexican Ancestry from Los Angeles USA", "superpopulation": "AMR", "latitude": 34.113837, "longitude": -118.440427 }, "PUR": { "location": "Puerto Ricans from Puerto Rico", "superpopulation": "AMR", "latitude": 18.234429, "longitude": -66.418775 }, "CLM": { "location": "Colombians from Medellin, Colombia", "superpopulation": "AMR", "latitude": 6.252089, "longitude": -75.594652 }, "PEL": { "location": "Peruvians from Lima, Peru", "superpopulation": "AMR", "latitude": -12.046543, "longitude": -77.046155 }, "CHB": { "location": "Han Chinese in Beijing, China", "superpopulation": "EAS", "latitude": 39.906802, "longitude": 116.407323 }, "JPT": { "location": "Japanese in Tokyo, Japan", "superpopulation": "EAS", "latitude": 35.709444, "longitude": 139.731815 }, "CHS": { "location": "Southern Han Chinese", "superpopulation": "EAS", "latitude": 24.719998, "longitude": 113.043464 }, "CDX": { "location": "Chinese Dai in Xishuangbanna, China", "superpopulation": "EAS", "latitude": 22.008264, "longitude": 100.796045 }, "KHV": { "location": "Kinh in Ho Chi Minh City, Vietnam", "superpopulation": "EAS", "latitude": 10.812236, "longitude": 106.633978 }, "CEU": { "location": "Utah Residents (CEPH) with Northern and Western European Ancestry", "superpopulation": "EUR", "latitude": 39.250493, "longitude": -111.631295 }, "TSI": { "location": "Toscani in Italia", "superpopulation": "EUR", "latitude": 43.444187, "longitude": 11.117199 }, "FIN": { "location": "Finnish in Finland", "superpopulation": "EUR", "latitude": 63.112, "longitude": 26.770837 }, "GBR": { "location": "British in England and Scotland", "superpopulation": "EUR", "latitude": 54.55902, "longitude": -2.143222 }, "IBS": { "location": "Iberian Population in Spain", "superpopulation": "EUR", "latitude": 40.482057, "longitude": -4.088383 }, "GIH": { "location": "Gujarati Indian from Houston, Texas", "superpopulation": "SAS", "latitude": 29.760619, "longitude": -95.361356 }, "PJL": { "location": "Punjabi from Lahore, Pakistan", "superpopulation": "SAS", "latitude": 31.515188, "longitude": 74.357703 }, "BEB": { "location": "Bengali from Bangladesh", "superpopulation": "SAS", "latitude": 24.013458, "longitude": 90.233561 }, "STU": { "location": "Sri Lankan Tamil from the UK", "superpopulation": "SAS", "latitude": 7.595905, "longitude": 80.843382 }, "ITU": { "location": "Indian Telugu from the UK", "superpopulation": "SAS", "latitude": 15.489823, "longitude": 78.487081 } } # Change manipulate output data for frontend only if accessed via Web instance # if web: output_table = { "inputs": { "rs1": snp1_input, "rs2": snp2_input, "LD": r2_d }, "aaData": [], "locations": { "rs1_rs2_LD_map": [], "rs1_map": [], "rs2_map": [] } } table_data = [] rs1_map_data = [] rs2_map_data = [] rs1_rs2_LD_map_data = [] # print(list(output.keys())) # populate table data for key in list(output.keys()): if key in list(pop_order.keys()): # print key, "parse for table" key_order = pop_order[key] key_pop = output[key]['Population'] key_N = output[key]['N'] # key_rs1_allele_freq = ", ".join([allele + ": " + output[key]['rs#1 Allele Freq'][allele] + "%" for allele in output[key]['rs#1 Allele Freq']]) key_rs1_allele_freq = rs1_dict["REF"] + ": " + output[key]['rs#1 Allele Freq'][rs1_dict["REF"]] + ", " + rs1_dict["ALT"] + ": " + output[key]['rs#1 Allele Freq'][rs1_dict["ALT"]] # key_rs2_allele_freq = ", ".join([allele + ": " + output[key]['rs#2 Allele Freq'][allele] + "%" for allele in output[key]['rs#2 Allele Freq']]) key_rs2_allele_freq = rs2_dict["REF"] + ": " + output[key]['rs#2 Allele Freq'][rs2_dict["REF"]] + ", " + rs2_dict["ALT"] + ": " + output[key]['rs#2 Allele Freq'][rs2_dict["ALT"]] key_D_prime = output[key]["D'"] key_R_2 = output[key]['R2'] # set up data for ldpair link ldpair_pops = [key] key_chisq = output[key]['chisq'] key_p = output[key]['p'] if key in list(pop_groups.keys()): ldpair_pops = pop_groups[key] ldpair_data = [snp1_ldpair, snp2_ldpair, "%2B".join(ldpair_pops)] table_data.append([key_order, key_pop, key_N, key_rs1_allele_freq, key_rs2_allele_freq, key_R_2, key_D_prime, ldpair_data, key_chisq, key_p]) # populate map data if key not in list(pop_groups.keys()): rs1_rs2_LD_map_data.append([key, location_data[key]["location"], location_data[key]["superpopulation"], location_data[key]["latitude"], location_data[key]["longitude"], key_rs1_allele_freq, key_rs2_allele_freq, key_R_2, key_D_prime]) rs1_map_data.append([key, location_data[key]["location"], location_data[key]["superpopulation"], location_data[key]["latitude"], location_data[key]["longitude"], key_rs1_allele_freq]) rs2_map_data.append([key, location_data[key]["location"], location_data[key]["superpopulation"], location_data[key]["latitude"], location_data[key]["longitude"], key_rs2_allele_freq]) # Add map data output_table["locations"]["rs1_rs2_LD_map"] = rs1_rs2_LD_map_data output_table["locations"]["rs1_map"] = rs1_map_data output_table["locations"]["rs2_map"] = rs2_map_data def getKeyOrder(element): return element[0] table_data.sort(key=getKeyOrder) # Add table data sorting order of rows output_table["aaData"] = [xs[1:] for xs in table_data] # Add final row link to LDpair # ldpair_pops = [] # for pop in output.keys(): # if pop not in pop_groups.keys() and len(pop) == 3: # ldpair_pops.append(pop) # ldpair_data = [snp1_input, snp2_input, "%2B".join(ldpair_pops)] # output_table["aaData"].append(["LDpair", ldpair_data, ldpair_data, ldpair_data, ldpair_data, ldpair_data]) if "warning" in output: output_table["warning"] = output["warning"] if "error" in output: output_table["error"] = output["error"] # Generate output file with open(tmp_dir + "LDpop_" + request + ".txt", "w") as ldpop_out: ldpop_out.write("\t".join(["Population", "Abbrev", "N", output_table["inputs"]["rs1"] + " Allele Freq", output_table["inputs"]["rs2"] + " Allele Freq", "R2", "D\'", "Chisq", "P"]) + "\n") # print("output_table", output_table) # print('output_table["aaData"]', output_table["aaData"]) for row in output_table["aaData"]: ldpop_out.write(str(location_data[row[0]]["location"] + "\t" + row[0]) + "\t" + str(row[1]) + "\t" + str(row[2]) + "\t" + str(row[3]) + "\t" + str(row[4]) + "\t" + str(row[5]) + "\t" + str(row[7]) + "\t" + str(row[8]) + "\n") if "error" in output_table: ldpop_out.write("\n") ldpop_out.write(output_table["error"]) if "warning" in output_table: ldpop_out.write("\n") ldpop_out.write(output_table["warning"]) # Change manipulate output data for frontend only if accessed via Web instance # if web: output = json.dumps(output_table, sort_keys=True, indent=2) return output
def get_query_variant(snp_coord, pop_ids, request, genome_build): export_s3_keys = retrieveAWSCredentials() vcf_filePath = "%s/%s%s/%s" % ( config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]['1000G_dir'], genome_build_vars[genome_build]['1000G_file'] % (snp_coord[1])) vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath) queryVariantWarnings = [] # Extract query SNP phased genotypes checkS3File(aws_info, config['aws']['bucket'], vcf_filePath) tabix_query_snp_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format( vcf_query_snp_file, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) # print("tabix_query_snp_h", tabix_query_snp_h) head = [ x.decode('utf-8') for x in subprocess.Popen(tabix_query_snp_h, shell=True, stdout=subprocess.PIPE).stdout.readlines() ][0].strip().split() tabix_query_snp = export_s3_keys + " cd {4}; tabix -D {0} {1}:{2}-{2} | grep -v -e END > {3}".format( vcf_query_snp_file, genome_build_vars[genome_build]['1000G_chr_prefix'] + snp_coord[1], snp_coord[2], tmp_dir + "snp_no_dups_" + request + ".vcf", data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) # print("tabix_query_snp", tabix_query_snp) subprocess.call(tabix_query_snp, shell=True) tabix_query_snp_out = open(tmp_dir + "snp_no_dups_" + request + ".vcf").readlines() # Validate error if len(tabix_query_snp_out) == 0: # print("ERROR", "len(tabix_query_snp_out) == 0") # handle error: snp + " is not in 1000G reference panel." queryVariantWarnings.append( [snp_coord[0], "NA", "Variant is not in 1000G reference panel."]) subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True) subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True) return (None, queryVariantWarnings) elif len(tabix_query_snp_out) > 1: geno = [] for i in range(len(tabix_query_snp_out)): if tabix_query_snp_out[i].strip().split()[2] == snp_coord[0]: geno = tabix_query_snp_out[i].strip().split() geno[0] = geno[0].lstrip('chr') if geno == []: # print("ERROR", "geno == []") # handle error: snp + " is not in 1000G reference panel." queryVariantWarnings.append([ snp_coord[0], "NA", "Variant is not in 1000G reference panel." ]) subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True) subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True) return (None, queryVariantWarnings) else: geno = tabix_query_snp_out[0].strip().split() geno[0] = geno[0].lstrip('chr') if geno[2] != snp_coord[0] and "rs" in geno[2]: queryVariantWarnings.append([ snp_coord[0], "NA", "Genomic position does not match RS number at 1000G position (chr" + geno[0] + ":" + geno[1] + " = " + geno[2] + ")." ]) # snp = geno[2] if "," in geno[3] or "," in geno[4]: # print('handle error: snp + " is not a biallelic variant."') queryVariantWarnings.append( [snp_coord[0], "NA", "Variant is not a biallelic."]) index = [] for i in range(9, len(head)): if head[i] in pop_ids: index.append(i) genotypes = {"0": 0, "1": 0} for i in index: sub_geno = geno[i].split("|") for j in sub_geno: if j in genotypes: genotypes[j] += 1 else: genotypes[j] = 1 if genotypes["0"] == 0 or genotypes["1"] == 0: # print('handle error: snp + " is monoallelic in the " + pop + " population."') queryVariantWarnings.append([ snp_coord[0], "NA", "Variant is monoallelic in the chosen population(s)." ]) return (geno, queryVariantWarnings)
def calculate_assoc_svg(file, region, pop, request, genome_build, myargs, myargsName, myargsOrigin): # Set data directories using config.yml with open('config.yml', 'r') as yml_file: config = yaml.load(yml_file) env = config['env'] api_mongo_addr = config['api']['api_mongo_addr'] data_dir = config['data']['data_dir'] tmp_dir = config['data']['tmp_dir'] genotypes_dir = config['data']['genotypes_dir'] aws_info = config['aws'] mongo_username = config['database']['mongo_user_readonly'] mongo_password = config['database']['mongo_password'] mongo_port = config['database']['mongo_port'] num_subprocesses = config['performance']['num_subprocesses'] export_s3_keys = retrieveAWSCredentials() # Ensure tmp directory exists if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) chrs = [ "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y" ] # Define parameters for --variant option if region == "variant": if myargsOrigin == "None": return None if myargsOrigin != "None": # Find coordinates (GRCh37/hg19) or (GRCh38/hg38) for SNP RS number if myargsOrigin[0:2] == "rs": snp = myargsOrigin # Connect to Mongo snp database if env == 'local': mongo_host = api_mongo_addr else: mongo_host = 'localhost' client = MongoClient( 'mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host + '/admin', mongo_port) db = client["LDLink"] def get_coords_var(db, rsid): rsid = rsid.strip("rs") query_results = db.dbsnp.find_one({"id": rsid}) query_results_sanitized = json.loads( json_util.dumps(query_results)) return query_results_sanitized # Find RS number in snp database var_coord = get_coords_var(db, snp) if var_coord == None: return None elif myargsOrigin.split(":")[0].strip("chr") in chrs and len( myargsOrigin.split(":")) == 2: snp = myargsOrigin #var_coord=[None,myargsOrigin.split(":")[0].strip("chr"),myargsOrigin.split(":")[1]] var_coord = { 'chromosome': myargsOrigin.split(":")[0].strip("chr"), 'position': myargsOrigin.split(":")[1] } else: return None chromosome = var_coord['chromosome'] org_coord = var_coord[genome_build_vars[genome_build]['position']] # Open Association Data header_list = [] header_list.append(myargs['chr']) header_list.append(myargs['bp']) header_list.append(myargs['pval']) # Load input file with open(file) as fp: header = fp.readline().strip().split() first = fp.readline().strip().split() if len(header) != len(first): return None # Check header for item in header_list: if item not in header: return None len_head = len(header) chr_index = header.index(myargs['chr']) pos_index = header.index(myargs['bp']) p_index = header.index(myargs['pval']) # Define window of interest around query SNP if myargs['window'] == None: if region == "variant": window = 500000 elif region == "gene": window = 100000 else: window = 0 else: window = myargs['window'] if region == "variant": coord1 = int(org_coord) - window if coord1 < 0: coord1 = 0 coord2 = int(org_coord) + window elif region == "gene": if myargsName == "None": return None def get_coords_gene(gene_raw, db): gene = gene_raw.upper() mongoResult = db.genes_name_coords.find_one({"name": gene}) #format mongo output if mongoResult != None: geneResult = [ mongoResult["name"], mongoResult[genome_build_vars[genome_build]['chromosome']], mongoResult[genome_build_vars[genome_build]['gene_begin']], mongoResult[genome_build_vars[genome_build]['gene_end']] ] return geneResult else: return None # Find RS number in snp database gene_coord = get_coords_gene(myargsName, db) if gene_coord == None or gene_coord[2] == 'NA' or gene_coord == 'NA': return None # Define search coordinates coord1 = int(gene_coord[2]) - window if coord1 < 0: coord1 = 0 coord2 = int(gene_coord[3]) + window # Run with --origin option if myargsOrigin != "None": if gene_coord[1] != chromosome: return None if coord1 > int(org_coord) or int(org_coord) > coord2: return None else: chromosome = gene_coord[1] elif region == "region": if myargs['start'] == None: return None if myargs['end'] == None: return None # Parse out chr and positions for --region option if len(myargs['start'].split(":")) != 2: return None if len(myargs['end'].split(":")) != 2: return None chr_s = myargs['start'].strip("chr").split(":")[0] coord_s = myargs['start'].split(":")[1] chr_e = myargs['end'].strip("chr").split(":")[0] coord_e = myargs['end'].split(":")[1] if chr_s not in chrs: return None if chr_e not in chrs: return None if chr_s != chr_e: return None if coord_s >= coord_e: return None coord1 = int(coord_s) - window if coord1 < 0: coord1 = 0 coord2 = int(coord_e) + window # Run with --origin option if myargsOrigin != "None": if chr_s != chromosome: return None if coord1 > int(org_coord) or int(org_coord) > coord2: return None else: chromosome = chr_s # Generate coordinate list and P-value dictionary max_window = 3000000 if coord2 - coord1 > max_window: return None assoc_coords = [] a_pos = [] assoc_dict = {} assoc_list = [] with open(file) as fp: for line in fp: col = line.strip().split() if len(col) == len_head: if col[chr_index].strip("chr") == chromosome: try: int(col[pos_index]) except ValueError: continue else: if coord1 <= int(col[pos_index]) <= coord2: try: float(col[p_index]) except ValueError: continue else: coord_i = genome_build_vars[genome_build][ '1000G_chr_prefix'] + col[chr_index].strip( "chr") + ":" + col[ pos_index] + "-" + col[pos_index] assoc_coords.append(coord_i) a_pos.append(col[pos_index]) assoc_dict[coord_i] = [col[p_index]] assoc_list.append( [coord_i, float(col[p_index])]) # Coordinate list checks if len(assoc_coords) == 0: return None # Get population ids from population output file from LDassoc.py pop_list = open(tmp_dir + "pops_" + request + ".txt").readlines() ids = [] for i in range(len(pop_list)): ids.append(pop_list[i].strip()) pop_ids = list(set(ids)) # Define LD origin coordinate try: org_coord except NameError: for var_p in sorted(assoc_list, key=operator.itemgetter(1)): snp = "chr" + var_p[0].split("-")[0] # Extract lowest P SNP phased genotypes vcf_filePath = "%s/%s%s/%s" % ( config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]["1000G_dir"], genome_build_vars[genome_build]["1000G_file"] % (chromosome)) vcf_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath) checkS3File(aws_info, config['aws']['bucket'], vcf_filePath) tabix_snp_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format( vcf_file, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) head = [ x.decode('utf-8') for x in subprocess.Popen( tabix_snp_h, shell=True, stdout=subprocess.PIPE).stdout.readlines() ][0].strip().split() # Check lowest P SNP is in the 1000G population and not monoallelic from LDassoc.py output file vcf = open(tmp_dir + "snp_no_dups_" + request + ".vcf").readlines() if len(vcf) == 0: continue elif len(vcf) > 1: geno = vcf[0].strip().split() geno[0] = geno[0].lstrip('chr') else: geno = vcf[0].strip().split() geno[0] = geno[0].lstrip('chr') if "," in geno[3] or "," in geno[4]: continue index = [] for i in range(9, len(head)): if head[i] in pop_ids: index.append(i) genotypes = {"0": 0, "1": 0} for i in index: sub_geno = geno[i].split("|") for j in sub_geno: if j in genotypes: genotypes[j] += 1 else: genotypes[j] = 1 if genotypes["0"] == 0 or genotypes["1"] == 0: continue org_coord = var_p[0].split("-")[1] break else: if genome_build_vars[genome_build][ '1000G_chr_prefix'] + chromosome + ":" + org_coord + "-" + org_coord not in assoc_coords: return None # Extract query SNP phased genotypes vcf_filePath = "%s/%s%s/%s" % ( config['aws']['data_subfolder'], genotypes_dir, genome_build_vars[genome_build]["1000G_dir"], genome_build_vars[genome_build]["1000G_file"] % (chromosome)) vcf_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath) checkS3File(aws_info, config['aws']['bucket'], vcf_filePath) tabix_snp_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format( vcf_file, data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir']) head = [ x.decode('utf-8') for x in subprocess.Popen(tabix_snp_h, shell=True, stdout=subprocess.PIPE).stdout.readlines() ][0].strip().split() # Check query SNP is in the 1000G population, has the correct RS number, and not monoallelic vcf = open(tmp_dir + "snp_no_dups_" + request + ".vcf").readlines() if len(vcf) == 0: subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True) subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True) return None elif len(vcf) > 1: geno = [] for i in range(len(vcf)): if vcf[i].strip().split()[2] == snp: geno = vcf[i].strip().split() geno[0] = geno[0].lstrip('chr') if geno == []: subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True) subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True) return None else: geno = vcf[0].strip().split() geno[0] = geno[0].lstrip('chr') if geno[2] != snp and snp[0:2] == "rs" and "rs" in geno[2]: snp = geno[2] if "," in geno[3] or "," in geno[4]: subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True) subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True) return None index = [] for i in range(9, len(head)): if head[i] in pop_ids: index.append(i) genotypes = {"0": 0, "1": 0} for i in index: sub_geno = geno[i].split("|") for j in sub_geno: if j in genotypes: genotypes[j] += 1 else: genotypes[j] = 1 if genotypes["0"] == 0 or genotypes["1"] == 0: subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True) subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True) return None # Calculate proxy LD statistics in parallel if len(assoc_coords) < 60: num_subprocesses = 1 # else: # threads=4 assoc_coords_subset_chunks = np.array_split(assoc_coords, num_subprocesses) # block=len(assoc_coords) // num_subprocesses commands = [] # for i in range(num_subprocesses): # if i==min(range(num_subprocesses)) and i==max(range(num_subprocesses)): # command="python3 LDassoc_sub.py "+snp+" "+chromosome+" "+"_".join(assoc_coords)+" "+request+" "+str(i) # elif i==min(range(num_subprocesses)): # command="python3 LDassoc_sub.py "+snp+" "+chromosome+" "+"_".join(assoc_coords[:block])+" "+request+" "+str(i) # elif i==max(range(num_subprocesses)): # command="python3 LDassoc_sub.py "+snp+" "+chromosome+" "+"_".join(assoc_coords[(block*i)+1:])+" "+request+" "+str(i) # else: # command="python3 LDassoc_sub.py "+snp+" "+chromosome+" "+"_".join(assoc_coords[(block*i)+1:block*(i+1)])+" "+request+" "+str(i) # commands.append(command) for subprocess_id in range(num_subprocesses): subprocessArgs = " ".join([ str(snp), str(chromosome), str("_".join(assoc_coords_subset_chunks[subprocess_id])), str(request), str(genome_build), str(subprocess_id) ]) commands.append("python3 LDassoc_sub.py " + subprocessArgs) processes = [ subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) for command in commands ] # collect output in parallel def get_output(process): return process.communicate()[0].splitlines() pool = Pool(len(processes)) out_raw = pool.map(get_output, processes) pool.close() pool.join() # Aggregate output out_prox = [] for i in range(len(out_raw)): for j in range(len(out_raw[i])): col = out_raw[i][j].decode('utf-8').strip().split("\t") col[6] = int(col[6]) col[7] = float(col[7]) col[8] = float(col[8]) col.append(abs(int(col[6]))) pos_i_j = col[5].split(":")[1] coord_i_j = genome_build_vars[genome_build][ '1000G_chr_prefix'] + chromosome + ":" + pos_i_j + "-" + pos_i_j if coord_i_j in assoc_dict: col.append(float(assoc_dict[coord_i_j][0])) out_prox.append(col) out_dist_sort = sorted(out_prox, key=operator.itemgetter(14)) out_p_sort = sorted(out_dist_sort, key=operator.itemgetter(15), reverse=False) # Organize scatter plot data q_rs = [] q_allele = [] q_coord = [] q_maf = [] p_rs = [] p_allele = [] p_coord = [] p_pos = [] p_maf = [] dist = [] d_prime = [] d_prime_round = [] r2 = [] r2_round = [] corr_alleles = [] regdb = [] funct = [] color = [] alpha = [] size = [] p_val = [] neg_log_p = [] for i in range(len(out_p_sort)): q_rs_i, q_allele_i, q_coord_i, p_rs_i, p_allele_i, p_coord_i, dist_i, d_prime_i, r2_i, corr_alleles_i, regdb_i, q_maf_i, p_maf_i, funct_i, dist_abs, p_val_i = out_p_sort[ i] q_rs.append(q_rs_i) q_allele.append(q_allele_i) q_coord.append(float(q_coord_i.split(":")[1]) / 1000000) q_maf.append(str(round(float(q_maf_i), 4))) if p_rs_i == ".": p_rs_i = p_coord_i p_rs.append(p_rs_i) p_allele.append(p_allele_i) p_coord.append(float(p_coord_i.split(":")[1]) / 1000000) p_pos.append(p_coord_i.split(":")[1]) p_maf.append(str(round(float(p_maf_i), 4))) dist.append(str(round(dist_i / 1000000.0, 4))) d_prime.append(float(d_prime_i)) d_prime_round.append(str(round(float(d_prime_i), 4))) r2.append(float(r2_i)) r2_round.append(str(round(float(r2_i), 4))) corr_alleles.append(corr_alleles_i) # P-value p_val.append(p_val_i) neg_log_p.append(-log10(p_val_i)) # Correct Missing Annotations if regdb_i == ".": regdb_i = "" regdb.append(regdb_i) if funct_i == ".": funct_i = "" if funct_i == "NA": funct_i = "none" funct.append(funct_i) # Set Color reds = [ "#FFCCCC", "#FFCACA", "#FFC8C8", "#FFC6C6", "#FFC4C4", "#FFC2C2", "#FFC0C0", "#FFBEBE", "#FFBCBC", "#FFBABA", "#FFB8B8", "#FFB6B6", "#FFB4B4", "#FFB1B1", "#FFAFAF", "#FFADAD", "#FFABAB", "#FFA9A9", "#FFA7A7", "#FFA5A5", "#FFA3A3", "#FFA1A1", "#FF9F9F", "#FF9D9D", "#FF9B9B", "#FF9999", "#FF9797", "#FF9595", "#FF9393", "#FF9191", "#FF8F8F", "#FF8D8D", "#FF8B8B", "#FF8989", "#FF8787", "#FF8585", "#FF8383", "#FF8181", "#FF7E7E", "#FF7C7C", "#FF7A7A", "#FF7878", "#FF7676", "#FF7474", "#FF7272", "#FF7070", "#FF6E6E", "#FF6C6C", "#FF6A6A", "#FF6868", "#FF6666", "#FF6464", "#FF6262", "#FF6060", "#FF5E5E", "#FF5C5C", "#FF5A5A", "#FF5858", "#FF5656", "#FF5454", "#FF5252", "#FF5050", "#FF4E4E", "#FF4B4B", "#FF4949", "#FF4747", "#FF4545", "#FF4343", "#FF4141", "#FF3F3F", "#FF3D3D", "#FF3B3B", "#FF3939", "#FF3737", "#FF3535", "#FF3333", "#FF3131", "#FF2F2F", "#FF2D2D", "#FF2B2B", "#FF2929", "#FF2727", "#FF2525", "#FF2323", "#FF2121", "#FF1F1F", "#FF1D1D", "#FF1B1B", "#FF1818", "#FF1616", "#FF1414", "#FF1212", "#FF1010", "#FF0E0E", "#FF0C0C", "#FF0A0A", "#FF0808", "#FF0606", "#FF0404", "#FF0202", "#FF0000" ] if q_coord_i == p_coord_i: color_i = "#0000FF" alpha_i = 0.7 else: if myargs['dprime'] == True: color_i = reds[int(d_prime_i * 100.0)] alpha_i = 0.7 elif myargs['dprime'] == False: color_i = reds[int(r2_i * 100.0)] alpha_i = 0.7 color.append(color_i) alpha.append(alpha_i) # Set Size size_i = 9 + float(p_maf_i) * 14.0 size.append(size_i) # Pull out SNPs from association file not found in 1000G p_plot_pos = [] p_plot_pval = [] p_plot_pos2 = [] p_plot_pval2 = [] p_plot_dist = [] index_var_pos = float(q_coord_i.split(":")[1]) / 1000000 for input_pos in a_pos: if input_pos not in p_pos: p_plot_pos.append(float(input_pos) / 1000000) p_plot_pval.append(-log10( float(assoc_dict[chromosome + ":" + input_pos + "-" + input_pos][0]))) p_plot_pos2.append("chr" + chromosome + ":" + input_pos) p_plot_pval2.append( float(assoc_dict[chromosome + ":" + input_pos + "-" + input_pos][0])) p_plot_dist.append( str(round(float(input_pos) / 1000000 - index_var_pos, 4))) # Begin Bokeh Plotting from collections import OrderedDict from bokeh.embed import components, file_html from bokeh.layouts import gridplot from bokeh.models import HoverTool, LinearAxis, Range1d from bokeh.plotting import ColumnDataSource, curdoc, figure, output_file, reset_output, save from bokeh.resources import CDN from bokeh.io import export_svgs import svgutils.compose as sg reset_output() data_p = { 'p_plot_posX': p_plot_pos, 'p_plot_pvalY': p_plot_pval, 'p_plot_pos2': p_plot_pos2, 'p_plot_pval2': p_plot_pval2, 'p_plot_dist': p_plot_dist } source_p = ColumnDataSource(data_p) # Assoc Plot x = p_coord y = neg_log_p data = { 'x': x, 'y': y, 'qrs': q_rs, 'q_alle': q_allele, 'q_maf': q_maf, 'prs': p_rs, 'p_alle': p_allele, 'p_maf': p_maf, 'dist': dist, 'r': r2_round, 'd': d_prime_round, 'alleles': corr_alleles, 'regdb': regdb, 'funct': funct, 'p_val': p_val, 'size': size, 'color': color, 'alpha': alpha } source = ColumnDataSource(data) whitespace = 0.01 xr = Range1d(start=coord1 / 1000000.0 - whitespace, end=coord2 / 1000000.0 + whitespace) yr = Range1d(start=-0.03, end=max(y) * 1.03) sup_2 = "\u00B2" assoc_plot = figure( title="P-values and Regional LD for " + snp + " in " + pop, min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False, plot_width=900, plot_height=600, x_range=xr, y_range=yr, tools= "tap,pan,box_zoom,wheel_zoom,box_select,undo,redo,reset,previewsave", logo=None, toolbar_location="above") assoc_plot.title.align = "center" # Add recombination rate from LDassoc.py output file recomb_file = tmp_dir + "recomb_" + request + ".json" recomb_raw = open(recomb_file).readlines() recomb_x = [] recomb_y = [] for recomb_raw_obj in recomb_raw: recomb_obj = json.loads(recomb_raw_obj) recomb_x.append( int(recomb_obj[genome_build_vars[genome_build]['position']]) / 1000000.0) recomb_y.append(float(recomb_obj['rate']) / 100 * max(y)) assoc_plot.line(recomb_x, recomb_y, line_width=1, color="black", alpha=0.5) # Add genome-wide significance a = [coord1 / 1000000.0 - whitespace, coord2 / 1000000.0 + whitespace] b = [-log10(0.00000005), -log10(0.00000005)] assoc_plot.line(a, b, color="blue", alpha=0.5) assoc_points_not1000G = assoc_plot.circle(x='p_plot_posX', y='p_plot_pvalY', size=9 + float("0.25") * 14.0, source=source_p, line_color="gray", fill_color="white") assoc_points = assoc_plot.circle(x='x', y='y', size='size', color='color', alpha='alpha', source=source) assoc_plot.add_tools( HoverTool(renderers=[assoc_points_not1000G], tooltips=OrderedDict([("Variant", "@p_plot_pos2"), ("P-value", "@p_plot_pval2"), ("Distance (Mb)", "@p_plot_dist")]))) hover = HoverTool(renderers=[assoc_points]) hover.tooltips = OrderedDict([ ("Variant", "@prs @p_alle"), ("P-value", "@p_val"), ("Distance (Mb)", "@dist"), ("MAF", "@p_maf"), ("R" + sup_2 + " (" + q_rs[0] + ")", "@r"), ("D\' (" + q_rs[0] + ")", "@d"), ("Correlated Alleles", "@alleles"), ("RegulomeDB", "@regdb"), ("Functional Class", "@funct"), ]) assoc_plot.add_tools(hover) # Annotate RebulomeDB scores if myargs['annotate'] == True: assoc_plot.text(x, y, text=regdb, alpha=1, text_font_size="7pt", text_baseline="middle", text_align="center", angle=0) assoc_plot.yaxis.axis_label = "-log10 P-value" assoc_plot.extra_y_ranges = {"y2_axis": Range1d(start=-3, end=103)} assoc_plot.add_layout( LinearAxis(y_range_name="y2_axis", axis_label="Combined Recombination Rate (cM/Mb)"), "right") ## Need to confirm units # Rug Plot y2_ll = [-0.03] * len(x) y2_ul = [1.03] * len(x) yr_rug = Range1d(start=-0.03, end=1.03) data_rug = { 'x': x, 'y': y, 'y2_ll': y2_ll, 'y2_ul': y2_ul, 'qrs': q_rs, 'q_alle': q_allele, 'q_maf': q_maf, 'prs': p_rs, 'p_alle': p_allele, 'p_maf': p_maf, 'dist': dist, 'r': r2_round, 'd': d_prime_round, 'alleles': corr_alleles, 'regdb': regdb, 'funct': funct, 'p_val': p_val, 'size': size, 'color': color, 'alpha': alpha } source_rug = ColumnDataSource(data_rug) rug = figure(x_range=xr, y_range=yr_rug, border_fill_color='white', y_axis_type=None, title="", min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False, plot_width=900, plot_height=50, tools="xpan,tap,wheel_zoom", logo=None) rug.segment(x0='x', y0='y2_ll', x1='x', y1='y2_ul', source=source_rug, color='color', alpha='alpha', line_width=1) rug.toolbar_location = None # Gene Plot (All Transcripts) if myargs['transcript'] == True: # Get genes from LDassoc.py output file genes_file = tmp_dir + "genes_" + request + ".json" genes_raw = open(genes_file).readlines() genes_plot_start = [] genes_plot_end = [] genes_plot_y = [] genes_plot_name = [] exons_plot_x = [] exons_plot_y = [] exons_plot_w = [] exons_plot_h = [] exons_plot_name = [] exons_plot_id = [] exons_plot_exon = [] message = ["Too many genes to plot."] lines = [0] gap = 80000 tall = 0.75 if genes_raw != None and len(genes_raw) > 0: for gene_raw_obj in genes_raw: gene_obj = json.loads(gene_raw_obj) bin = gene_obj["bin"] name_id = gene_obj["name"] chrom = gene_obj["chrom"] strand = gene_obj["strand"] txStart = gene_obj["txStart"] txEnd = gene_obj["txEnd"] cdsStart = gene_obj["cdsStart"] cdsEnd = gene_obj["cdsEnd"] exonCount = gene_obj["exonCount"] exonStarts = gene_obj["exonStarts"] exonEnds = gene_obj["exonEnds"] score = gene_obj["score"] name2 = gene_obj["name2"] cdsStartStat = gene_obj["cdsStartStat"] cdsEndStat = gene_obj["cdsEndStat"] exonFrames = gene_obj["exonFrames"] name = name2 id = name_id e_start = exonStarts.split(",") e_end = exonEnds.split(",") # Determine Y Coordinate i = 0 y_coord = None while y_coord == None: if i > len(lines) - 1: y_coord = i + 1 lines.append(int(txEnd)) elif int(txStart) > (gap + lines[i]): y_coord = i + 1 lines[i] = int(txEnd) else: i += 1 genes_plot_start.append(int(txStart) / 1000000.0) genes_plot_end.append(int(txEnd) / 1000000.0) genes_plot_y.append(y_coord) genes_plot_name.append(name + " ") for i in range(len(e_start) - 1): if strand == "+": exon = i + 1 else: exon = len(e_start) - 1 - i width = (int(e_end[i]) - int(e_start[i])) / 1000000.0 x_coord = int(e_start[i]) / 1000000.0 + (width / 2) exons_plot_x.append(x_coord) exons_plot_y.append(y_coord) exons_plot_w.append(width) exons_plot_h.append(tall) exons_plot_name.append(name) exons_plot_id.append(id) exons_plot_exon.append(exon) n_rows = len(lines) genes_plot_yn = [n_rows - x + 0.5 for x in genes_plot_y] exons_plot_yn = [n_rows - x + 0.5 for x in exons_plot_y] yr2 = Range1d(start=0, end=n_rows) data_gene_plot = { 'exons_plot_x': exons_plot_x, 'exons_plot_yn': exons_plot_yn, 'exons_plot_w': exons_plot_w, 'exons_plot_h': exons_plot_h, 'exons_plot_name': exons_plot_name, 'exons_plot_id': exons_plot_id, 'exons_plot_exon': exons_plot_exon } source_gene_plot = ColumnDataSource(data_gene_plot) max_genes = 40 # if len(lines) < 3 or len(genes_raw) > max_genes: if len(lines) < 3: plot_h_pix = 250 else: plot_h_pix = 250 + (len(lines) - 2) * 50 gene_plot = figure( min_border_top=2, min_border_bottom=0, min_border_left=100, min_border_right=5, x_range=xr, y_range=yr2, border_fill_color='white', title="", h_symmetry=False, v_symmetry=False, logo=None, plot_width=900, plot_height=plot_h_pix, tools= "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave") # if len(genes_raw) <= max_genes: gene_plot.segment(genes_plot_start, genes_plot_yn, genes_plot_end, genes_plot_yn, color="black", alpha=1, line_width=2) gene_plot.rect(x='exons_plot_x', y='exons_plot_yn', width='exons_plot_w', height='exons_plot_h', source=source_gene_plot, fill_color="grey", line_color="grey") gene_plot.text(genes_plot_start, genes_plot_yn, text=genes_plot_name, alpha=1, text_font_size="7pt", text_font_style="bold", text_baseline="middle", text_align="right", angle=0) hover = gene_plot.select(dict(type=HoverTool)) hover.tooltips = OrderedDict([ ("Gene", "@exons_plot_name"), ("Transcript ID", "@exons_plot_id"), ("Exon", "@exons_plot_exon"), ]) # else: # x_coord_text = coord1/1000000.0 + (coord2/1000000.0 - coord1/1000000.0) / 2.0 # gene_plot.text(x_coord_text, n_rows / 2.0, text=message, alpha=1, # text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0) gene_plot.xaxis.axis_label = "Chromosome " + chromosome + " Coordinate (Mb)(" + genome_build_vars[ genome_build]['title'] + ")" gene_plot.yaxis.axis_label = "Genes (All Transcripts)" gene_plot.ygrid.grid_line_color = None gene_plot.yaxis.axis_line_color = None gene_plot.yaxis.minor_tick_line_color = None gene_plot.yaxis.major_tick_line_color = None gene_plot.yaxis.major_label_text_color = None gene_plot.toolbar_location = "below" # Change output backend to SVG temporarily for headless export assoc_plot.output_backend = "svg" rug.output_backend = "svg" gene_plot.output_backend = "svg" export_svgs(assoc_plot, filename=tmp_dir + "assoc_plot_1_" + request + ".svg") export_svgs(gene_plot, filename=tmp_dir + "gene_plot_1_" + request + ".svg") # 1 pixel = 0.0264583333 cm svg_height = str(20.00 + (0.0264583333 * plot_h_pix)) + "cm" svg_height_scaled = str(100.00 + (0.1322916665 * plot_h_pix)) + "cm" # Concatenate svgs sg.Figure( "24.59cm", svg_height, sg.SVG(tmp_dir + "assoc_plot_1_" + request + ".svg"), sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").move( -40, 630)).save(tmp_dir + "assoc_plot_" + request + ".svg") sg.Figure( "122.95cm", svg_height_scaled, sg.SVG(tmp_dir + "assoc_plot_1_" + request + ".svg").scale(5), sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(5).move( -200, 3150)).save(tmp_dir + "assoc_plot_scaled_" + request + ".svg") # Export to PDF subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_" + request + ".svg " + tmp_dir + "assoc_plot_" + request + ".pdf", shell=True) # Export to PNG subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_scaled_" + request + ".svg " + tmp_dir + "assoc_plot_" + request + ".png", shell=True) # Export to JPEG subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_scaled_" + request + ".svg " + tmp_dir + "assoc_plot_" + request + ".jpeg", shell=True) # Remove individual SVG files after they are combined subprocess.call("rm " + tmp_dir + "assoc_plot_1_" + request + ".svg", shell=True) subprocess.call("rm " + tmp_dir + "gene_plot_1_" + request + ".svg", shell=True) # Remove scaled SVG file after it is converted to png and jpeg subprocess.call("rm " + tmp_dir + "assoc_plot_scaled_" + request + ".svg", shell=True) # Gene Plot (Collapsed) else: # Get genes from LDassoc.py output file genes_c_file = tmp_dir + "genes_c_" + request + ".json" genes_c_raw = open(genes_c_file).readlines() genes_c_plot_start = [] genes_c_plot_end = [] genes_c_plot_y = [] genes_c_plot_name = [] exons_c_plot_x = [] exons_c_plot_y = [] exons_c_plot_w = [] exons_c_plot_h = [] exons_c_plot_name = [] exons_c_plot_id = [] message_c = ["Too many genes to plot."] lines_c = [0] gap = 80000 tall = 0.75 if genes_c_raw != None and len(genes_c_raw) > 0: for gene_raw_obj in genes_c_raw: gene_c_obj = json.loads(gene_raw_obj) chrom = gene_c_obj["chrom"] txStart = gene_c_obj["txStart"] txEnd = gene_c_obj["txEnd"] exonStarts = gene_c_obj["exonStarts"] exonEnds = gene_c_obj["exonEnds"] name2 = gene_c_obj["name2"] transcripts = gene_c_obj["transcripts"] name = name2 e_start = exonStarts.split(",") e_end = exonEnds.split(",") e_transcripts = transcripts.split(",") # Determine Y Coordinate i = 0 y_coord = None while y_coord == None: if i > len(lines_c) - 1: y_coord = i + 1 lines_c.append(int(txEnd)) elif int(txStart) > (gap + lines_c[i]): y_coord = i + 1 lines_c[i] = int(txEnd) else: i += 1 genes_c_plot_start.append(int(txStart) / 1000000.0) genes_c_plot_end.append(int(txEnd) / 1000000.0) genes_c_plot_y.append(y_coord) genes_c_plot_name.append(name + " ") # for i in range(len(e_start)): for i in range(len(e_start) - 1): width = (int(e_end[i]) - int(e_start[i])) / 1000000.0 x_coord = int(e_start[i]) / 1000000.0 + (width / 2) exons_c_plot_x.append(x_coord) exons_c_plot_y.append(y_coord) exons_c_plot_w.append(width) exons_c_plot_h.append(tall) exons_c_plot_name.append(name) exons_c_plot_id.append(e_transcripts[i].replace("-", ",")) n_rows_c = len(lines_c) genes_c_plot_yn = [n_rows_c - x + 0.5 for x in genes_c_plot_y] exons_c_plot_yn = [n_rows_c - x + 0.5 for x in exons_c_plot_y] yr2_c = Range1d(start=0, end=n_rows_c) data_gene_c_plot = { 'exons_c_plot_x': exons_c_plot_x, 'exons_c_plot_yn': exons_c_plot_yn, 'exons_c_plot_w': exons_c_plot_w, 'exons_c_plot_h': exons_c_plot_h, 'exons_c_plot_name': exons_c_plot_name, 'exons_c_plot_id': exons_c_plot_id } source_gene_c_plot = ColumnDataSource(data_gene_c_plot) max_genes_c = 40 # if len(lines_c) < 3 or len(genes_c_raw) > max_genes_c: if len(lines_c) < 3: plot_c_h_pix = 250 else: plot_c_h_pix = 250 + (len(lines_c) - 2) * 50 gene_c_plot = figure( min_border_top=2, min_border_bottom=0, min_border_left=100, min_border_right=5, x_range=xr, y_range=yr2_c, border_fill_color='white', title="", h_symmetry=False, v_symmetry=False, logo=None, plot_width=900, plot_height=plot_c_h_pix, tools= "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave") # if len(genes_c_raw) <= max_genes_c: gene_c_plot.segment(genes_c_plot_start, genes_c_plot_yn, genes_c_plot_end, genes_c_plot_yn, color="black", alpha=1, line_width=2) gene_c_plot.rect(x='exons_c_plot_x', y='exons_c_plot_yn', width='exons_c_plot_w', height='exons_c_plot_h', source=source_gene_c_plot, fill_color="grey", line_color="grey") gene_c_plot.text(genes_c_plot_start, genes_c_plot_yn, text=genes_c_plot_name, alpha=1, text_font_size="7pt", text_font_style="bold", text_baseline="middle", text_align="right", angle=0) hover = gene_c_plot.select(dict(type=HoverTool)) hover.tooltips = OrderedDict([ ("Gene", "@exons_c_plot_name"), ("Transcript IDs", "@exons_c_plot_id"), ]) # else: # x_coord_text = coord1/1000000.0 + (coord2/1000000.0 - coord1/1000000.0) / 2.0 # gene_c_plot.text(x_coord_text, n_rows_c / 2.0, text=message_c, alpha=1, # text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0) gene_c_plot.xaxis.axis_label = "Chromosome " + chromosome + " Coordinate (Mb)(" + genome_build_vars[ genome_build]['title'] + ")" gene_c_plot.yaxis.axis_label = "Genes (Transcripts Collapsed)" gene_c_plot.ygrid.grid_line_color = None gene_c_plot.yaxis.axis_line_color = None gene_c_plot.yaxis.minor_tick_line_color = None gene_c_plot.yaxis.major_tick_line_color = None gene_c_plot.yaxis.major_label_text_color = None gene_c_plot.toolbar_location = "below" # Change output backend to SVG temporarily for headless export assoc_plot.output_backend = "svg" rug.output_backend = "svg" gene_c_plot.output_backend = "svg" export_svgs(assoc_plot, filename=tmp_dir + "assoc_plot_1_" + request + ".svg") export_svgs(gene_c_plot, filename=tmp_dir + "gene_plot_1_" + request + ".svg") # 1 pixel = 0.0264583333 cm svg_height = str(20.00 + (0.0264583333 * plot_c_h_pix)) + "cm" svg_height_scaled = str(100.00 + (0.1322916665 * plot_c_h_pix)) + "cm" # Concatenate svgs sg.Figure( "24.59cm", svg_height, sg.SVG(tmp_dir + "assoc_plot_1_" + request + ".svg"), sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").move( -40, 630)).save(tmp_dir + "assoc_plot_" + request + ".svg") sg.Figure( "122.95cm", svg_height_scaled, sg.SVG(tmp_dir + "assoc_plot_1_" + request + ".svg").scale(5), sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(5).move( -200, 3150)).save(tmp_dir + "assoc_plot_scaled_" + request + ".svg") # Export to PDF subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_" + request + ".svg " + tmp_dir + "assoc_plot_" + request + ".pdf", shell=True) # Export to PNG subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_scaled_" + request + ".svg " + tmp_dir + "assoc_plot_" + request + ".png", shell=True) # Export to JPEG subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_scaled_" + request + ".svg " + tmp_dir + "assoc_plot_" + request + ".jpeg", shell=True) # Remove individual SVG files after they are combined subprocess.call("rm " + tmp_dir + "assoc_plot_1_" + request + ".svg", shell=True) subprocess.call("rm " + tmp_dir + "gene_plot_1_" + request + ".svg", shell=True) # Remove scaled SVG file after it is converted to png and jpeg subprocess.call("rm " + tmp_dir + "assoc_plot_scaled_" + request + ".svg", shell=True) reset_output() # Remove temporary files subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True) subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True) subprocess.call("rm " + tmp_dir + "genes_*" + request + "*.json", shell=True) subprocess.call("rm " + tmp_dir + "recomb_" + request + ".json", shell=True) subprocess.call("rm " + tmp_dir + "assoc_args" + request + ".json", shell=True) print("Bokeh high quality image export complete!") # Return plot output return None