def calculate_matrix_svg(snplst, pop, request, genome_build, r2_d="r2", collapseTranscript=True):
    """Render the pairwise LD matrix for a variant list and export it as image files.

    The function reads the variant list, resolves each variant to a genomic
    position through the ``dbsnp`` MongoDB collection, pulls phased genotypes
    for the selected populations with ``tabix``, computes D' and R2 for every
    variant pair, draws the matrix, connector, rug and gene-track plots with
    bokeh, and finally writes ``matrix_plot_<request>.svg/.pdf/.png/.jpeg``
    into the configured tmp directory.

    Args:
        snplst: Path of a text file with one variant per line. The first
            whitespace-separated token of a line is the variant: an ``rs``
            identifier or a ``chr<N>:<pos>`` coordinate.
        pop: Population codes joined with ``+`` (for example ``"CEU+YRI"``).
            Codes outside the built-in whitelist are ignored.
        request: String embedded in every temporary/output file name.
        genome_build: Key into the module-level ``genome_build_vars`` mapping.
        r2_d: ``"r2"`` or ``"d"``; any other value falls back to ``"r2"``.
            The value is validated but not otherwise used by this function.
        collapseTranscript: The all-transcripts gene track is drawn only when
            this equals the *string* ``"false"``; every other value (including
            the default ``True``) draws the collapsed-transcript track.

    Returns:
        None. All results are written to files in the tmp directory.

    Raises:
        IndexError: If no variant can be resolved to a position, or if the
            tabix output contains no header line (unchanged from the original
            behaviour).
    """
    import glob

    def _run_shell_lines(command):
        """Run a shell command and return its stdout as decoded lines."""
        proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
        try:
            return [line.decode('utf-8') for line in proc.stdout.readlines()]
        finally:
            # Close the pipe and reap the child so neither is leaked.
            proc.stdout.close()
            proc.wait()

    def _place_on_row(row_ends, tx_start, tx_end, min_gap):
        """Return the 1-based gene-track row for a transcript, updating row_ends in place."""
        for row, last_end in enumerate(row_ends):
            if tx_start > min_gap + last_end:
                row_ends[row] = tx_end
                return row + 1
        row_ends.append(tx_end)
        return len(row_ends)

    def _remove_quietly(path):
        """Best-effort file removal (mirrors the previous fire-and-forget `rm`)."""
        try:
            os.remove(path)
        except OSError:
            pass

    # Set data directories using config.yml
    # NOTE(review): yaml.load() without an explicit Loader can build arbitrary
    # Python objects; recent PyYAML releases appear to require a Loader
    # argument. yaml.safe_load() is the usual replacement -- confirm against
    # the pinned PyYAML version before switching.
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file)
    env = config['env']
    api_mongo_addr = config['api']['api_mongo_addr']
    population_samples_dir = config['data']['population_samples_dir']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    genotypes_dir = config['data']['genotypes_dir']
    aws_info = config['aws']
    mongo_username = config['database']['mongo_user_readonly']
    mongo_password = config['database']['mongo_password']
    mongo_port = config['database']['mongo_port']
    export_s3_keys = retrieveAWSCredentials()

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Open SNP list file
    with open(snplst) as snp_file:
        snps_raw = snp_file.readlines()

    # Remove duplicate RS numbers. Blank lines are dropped here: they used to
    # survive as empty token lists and crash the coordinate replacement below.
    snps = []
    for snp_raw in snps_raw:
        snp = snp_raw.strip().split()
        if snp and snp not in snps:
            snps.append(snp)

    build_vars = genome_build_vars[genome_build]
    position_key = build_vars['position']

    # Select desired ancestral populations
    known_pops = [
        "ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX",
        "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS",
        "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU",
        "TSI", "YRI"
    ]
    pop_dirs = [
        data_dir + population_samples_dir + pop_i + ".txt"
        for pop_i in pop.split("+") if pop_i in known_pops
    ]
    get_pops = "cat " + " ".join(pop_dirs)
    # A set gives O(1) membership tests when scanning the VCF header below.
    pop_ids = {sample.strip() for sample in _run_shell_lines(get_pops)}

    # Connect to Mongo snp database
    if env == 'local':
        mongo_host = api_mongo_addr
    else:
        mongo_host = 'localhost'
    client = MongoClient(
        'mongodb://' + mongo_username + ':' + mongo_password + '@' +
        mongo_host + '/admin', mongo_port)
    db = client["LDLink"]

    def get_coords(db, rsid):
        """Look up one variant by id (without the "rs" prefix) in dbsnp."""
        rsid = rsid.strip("rs")
        query_results = db.dbsnp.find_one({"id": rsid})
        return json.loads(json_util.dumps(query_results))

    # Query genomic coordinates
    def get_rsnum(db, coord):
        """Return all dbsnp records located at a "chr<N>:<pos>" coordinate."""
        temp_coord = coord.strip("chr").split(":")
        if len(temp_coord) < 2:
            # Not a coordinate at all; nothing to look up.
            return []
        chro = temp_coord[0]
        pos = temp_coord[1]
        query_results = db.dbsnp.find({
            "chromosome": chro.upper() if chro == 'x' or chro == 'y' else str(chro),
            position_key: str(pos)
        })
        return json.loads(json_util.dumps(query_results))

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coords_rsid(db, snp_lst):
        """Swap coordinate entries for rsids; unresolved entries pass through unchanged."""
        new_snp_lst = []
        for snp_raw_i in snp_lst:
            if snp_raw_i[0][0:2] == "rs":
                new_snp_lst.append(snp_raw_i)
                continue
            snp_info_lst = get_rsnum(db, snp_raw_i[0])
            print("snp_info_lst")
            print(snp_info_lst)
            if not snp_info_lst:
                new_snp_lst.append(snp_raw_i)
                continue
            var_id = "rs" + snp_info_lst[0]['id']
            if len(snp_info_lst) > 1:
                # Several records share the position: prefer the first one
                # that is its own reference id, else keep the first record.
                ref_variants = [
                    snp_info['id'] for snp_info in snp_info_lst
                    if snp_info['id'] == snp_info['ref_id']
                ]
                if ref_variants:
                    var_id = "rs" + ref_variants[0]
            new_snp_lst.append([var_id])
        return new_snp_lst

    snps = replace_coords_rsid(db, snps)

    # Find RS numbers in snp database
    rs_nums = []
    snp_pos = []
    snp_coords = []
    for snp_i in snps:
        if len(snp_i) == 0 or len(snp_i[0]) <= 2:
            continue
        variant = snp_i[0]
        if not ((variant[0:2] == "rs" or variant[0:3] == "chr") and variant[-1].isdigit()):
            continue
        snp_coord = get_coords(db, variant)
        if snp_coord is None or snp_coord[position_key] == "NA":
            continue
        # check if variant is on chrY for genome build = GRCh38
        if snp_coord['chromosome'] == "Y" and (
                genome_build == "grch38" or genome_build == "grch38_high_coverage"):
            continue
        rs_nums.append(variant)
        snp_pos.append(snp_coord[position_key])
        snp_coords.append([variant, snp_coord['chromosome'], snp_coord[position_key]])

    # All database lookups are done; release the connection.
    client.close()

    # Sort coordinates and make tabix formatted coordinates
    snp_pos_int = sorted(int(i) for i in snp_pos)
    snp_coord_str = [
        build_vars['1000G_chr_prefix'] + snp_coords[0][1] + ":" + str(i) + "-" + str(i)
        for i in snp_pos_int
    ]
    tabix_coords = " " + " ".join(snp_coord_str)

    # Extract 1000 Genomes phased genotypes
    vcf_filePath = "%s/%s%s/%s" % (
        config['aws']['data_subfolder'], genotypes_dir,
        build_vars['1000G_dir'],
        build_vars['1000G_file'] % (snp_coords[0][1]))
    vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)

    # Define function to correct indel alleles
    def set_alleles(a1, a2):
        """Drop the shared leading base of indel alleles; "-" marks the empty allele."""
        if len(a1) == 1 and len(a2) == 1:
            a1_n = a1
            a2_n = a2
        elif len(a1) == 1 and len(a2) > 1:
            a1_n = "-"
            a2_n = a2[1:]
        elif len(a1) > 1 and len(a2) == 1:
            a1_n = a1[1:]
            a2_n = "-"
        elif len(a1) > 1 and len(a2) > 1:
            a1_n = a1[1:]
            a2_n = a2[1:]
        return (a1_n, a2_n)

    # Import SNP VCF files
    tabix_snps = export_s3_keys + " cd {2}; tabix -fhD {0}{1} | grep -v -e END".format(
        vcf_query_snp_file, tabix_coords,
        data_dir + genotypes_dir + build_vars['1000G_dir'])
    vcf = _run_shell_lines(tabix_snps)

    # Skip the "##" meta lines; the next line is the column header.
    header_idx = 0
    while vcf[header_idx][0:2] == "##":
        header_idx += 1
    head = vcf[header_idx].strip().split()

    # Extract haplotypes
    index = [col for col in range(9, len(head)) if head[col] in pop_ids]

    # One list per selected sample; at least one list is kept even when no
    # sample matches, as before.
    hap1 = [[] for _ in range(max(len(index), 1))]
    hap2 = [[] for _ in range(max(len(index), 1))]
    rsnum_lst = []
    allele_lst = []
    pos_lst = []
    for g in range(header_idx + 1, len(vcf)):
        geno = vcf[g].strip().split()
        geno[0] = geno[0].lstrip('chr')
        if geno[1] not in snp_pos:
            continue

        # Pick the query variant for this position. Reset on every row so a
        # value from an earlier row can never be attributed to this one.
        rs_query = None
        if snp_pos.count(geno[1]) == 1:
            rs_query = rs_nums[snp_pos.index(geno[1])]
        else:
            for p, pos in enumerate(snp_pos):
                if pos == geno[1] and rs_nums[p] not in rsnum_lst:
                    rs_query = rs_nums[p]
                    break

        if rs_query is None or rs_query in rsnum_lst:
            continue

        rs_1000g = geno[2]

        if rs_query == rs_1000g:
            rsnum = rs_1000g
        else:
            # The VCF id differs; if the query id appears on a neighbouring
            # row (two before through two after), leave it for that row.
            count = -2
            found = False
            while count <= 2 and count + g < len(vcf):
                geno_next = vcf[g + count].strip().split()
                if len(geno_next) >= 3 and rs_query == geno_next[2]:
                    found = True
                    break
                count += 1
            if found:
                continue
            # Keep the queried id rather than the id stored in the VCF.
            rsnum = rs_query

        # Only bi-allelic rows are used.
        if "," not in geno[3] and "," not in geno[4]:
            a1, a2 = set_alleles(geno[3], geno[4])
            calls = {
                "0|0": (a1, a1),
                "0|1": (a1, a2),
                "1|0": (a2, a1),
                "1|1": (a2, a2),
                "0": (a1, "."),
                "1": (a2, "."),
            }
            for i, col in enumerate(index):
                first, second = calls.get(geno[col], (".", "."))
                hap1[i].append(first)
                hap2[i].append(second)

            rsnum_lst.append(rsnum)
            pos_lst.append("chr" + geno[0] + ":" + geno[1] + "-" + geno[1])
            allele_lst.append(a1 + "/" + a2)

    # Calculate Pairwise LD Statistics
    all_haps = hap1 + hap2
    n_variants = len(all_haps[0])
    # Every cell is filled below (upper triangle plus its mirror).
    ld_matrix = [[None] * n_variants for _ in range(n_variants)]

    for i in range(n_variants):
        for j in range(i, n_variants):
            # Count two-variant haplotypes across all chromosomes
            hap = {}
            for chromosome_haps in all_haps:
                hap_k = chromosome_haps[i] + chromosome_haps[j]
                hap[hap_k] = hap.get(hap_k, 0) + 1

            # Remove Missing Haplotypes
            hap = {key: n for key, n in hap.items() if "." not in key}

            # Check all haplotypes are present
            if len(hap) != 4:
                snp_i_a = allele_lst[i].split("/")
                snp_j_a = allele_lst[j].split("/")
                for expected in (snp_i_a[0] + snp_j_a[0],
                                 snp_i_a[0] + snp_j_a[1],
                                 snp_i_a[1] + snp_j_a[0],
                                 snp_i_a[1] + snp_j_a[1]):
                    hap.setdefault(expected, 0)

            # Perform LD calculations
            keys = sorted(hap)
            A = hap[keys[0]]
            B = hap[keys[1]]
            C = hap[keys[2]]
            D = hap[keys[3]]
            delta = float(A * D - B * C)
            Ms = float((A + C) * (B + D) * (A + B) * (C + D))
            if Ms != 0:
                # D prime
                if delta < 0:
                    D_prime = round(
                        abs(delta / min((A + C) * (A + B), (B + D) * (C + D))), 3)
                else:
                    D_prime = round(
                        abs(delta / min((A + C) * (C + D), (A + B) * (B + D))), 3)

                # R2
                r2 = round((delta**2) / Ms, 3)

                # Find Correlated Alleles
                if r2 > 0.1:
                    if (A * D) / max((B * C), 0.01) > 1:
                        match = (keys[0][0] + "=" + keys[0][1] + "," +
                                 keys[3][0] + "=" + keys[3][1])
                    else:
                        match = (keys[1][0] + "=" + keys[1][1] + "," +
                                 keys[2][0] + "=" + keys[2][1])
                else:
                    match = " = , = "
            else:
                D_prime = "NA"
                r2 = "NA"
                match = " = , = "

            snp1 = rsnum_lst[i]
            snp2 = rsnum_lst[j]
            pos1 = pos_lst[i].split("-")[0]
            pos2 = pos_lst[j].split("-")[0]
            allele1 = allele_lst[i]
            allele2 = allele_lst[j]
            # "corr" is "match" with both sides of each pair swapped, for the
            # transposed cell.
            pairs = match.split(",")
            corr = (pairs[0].split("=")[1] + "=" + pairs[0].split("=")[0] + "," +
                    pairs[1].split("=")[1] + "=" + pairs[1].split("=")[0])
            corr_f = match

            ld_matrix[i][j] = [
                snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2
            ]
            ld_matrix[j][i] = [
                snp2, snp1, allele2, allele1, corr_f, pos2, pos1, D_prime, r2
            ]

    # Generate Plot Variables
    out = [cell for matrix_row in ld_matrix for cell in matrix_row]
    xnames = []
    ynames = []
    xA = []
    yA = []
    corA = []
    xpos = []
    ypos = []
    d_prime_labels = []
    r2_labels = []
    box_color = []
    box_trans = []

    if r2_d not in ["r2", "d"]:
        r2_d = "r2"

    # Side length of the (square) matrix; zero only when "out" is empty, in
    # which case the loop below does not run.
    n_side = math.floor(math.sqrt(len(out)))

    for i, cell in enumerate(out):
        snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2 = cell
        xnames.append(snp1)
        ynames.append(snp2)
        xA.append(allele1)
        yA.append(allele2)
        corA.append(corr)
        xpos.append(pos1)
        ypos.append(pos2)

        if r2 == "NA" or D_prime == "NA":
            d_prime_labels.append("NA")
            r2_labels.append("NA")
            box_color.append("gray")
            box_trans.append(0.1)
            continue

        d_prime_labels.append(str(round(float(D_prime), 4)))
        r2_labels.append(str(round(float(r2), 4)))
        col, row = i % n_side, i // n_side
        if col < row:
            # One triangle is shaded by D'
            box_color.append("blue")
            box_trans.append(abs(D_prime))
        elif col > row:
            # The other triangle is shaded by R2
            box_color.append("red")
            box_trans.append(r2)
        else:
            box_color.append("purple")
            box_trans.append(r2)

    # Import plotting modules
    from collections import OrderedDict
    from bokeh.models import HoverTool, Range1d
    from bokeh.plotting import ColumnDataSource, figure, reset_output
    from bokeh.io import export_svgs
    import svgutils.compose as sg
    from math import pi

    reset_output()

    # Aggregate Plotting Data (one entry per variant: the first cell of each row)
    x = []
    coord_snps_plot = []
    snp_id_plot = []
    alleles_snp_plot = []
    for i in range(0, len(xpos), n_side):
        x.append(int(xpos[i].split(":")[1]) / 1000000.0)
        coord_snps_plot.append(xpos[i])
        snp_id_plot.append(xnames[i])
        alleles_snp_plot.append(xA[i])
    y = [0.5] * len(x)
    w = [0.00003] * len(x)
    rug_heights = [1.06] * len(x)

    buffer = (x[-1] - x[0]) * 0.025
    xr = Range1d(start=x[0] - buffer, end=x[-1] + buffer)
    yr = Range1d(start=-0.03, end=1.03)

    # Evenly spaced matrix columns across the plotted coordinate range
    spacing = (x[-1] - x[0] + buffer + buffer) / (len(x) * 1.0)
    x2 = [x[0] - buffer + spacing * (i + 0.5) for i in range(len(x))]
    y0 = [0] * len(x)
    y1 = [0.20] * len(x)
    y2 = [0.80] * len(x)
    y3 = [1] * len(x)
    y4 = [1.15] * len(x)

    xname_pos = [column_x for column_x in x2 for _ in x2]

    data = {
        'xname': xnames,
        'xname_pos': xname_pos,
        'yname': ynames,
        'xA': xA,
        'yA': yA,
        'xpos': xpos,
        'ypos': ypos,
        'R2': r2_labels,
        'Dp': d_prime_labels,
        'corA': corA,
        'box_color': box_color,
        'box_trans': box_trans
    }

    source = ColumnDataSource(data)

    # Variant labels are only drawn when the list is short enough to read.
    threshold = 70
    show_labels = len(snps) < threshold

    matrix_kwargs = dict(
        outline_line_color="white",
        min_border_top=0,
        min_border_bottom=2,
        min_border_left=100,
        min_border_right=5,
        x_range=xr,
        y_range=list(reversed(rsnum_lst)),
        h_symmetry=False,
        v_symmetry=False,
        border_fill_color='white',
        x_axis_type=None,
        logo=None,
        tools="hover,undo,redo,reset,pan,box_zoom,previewsave",
        title=" ",
        plot_width=800,
        plot_height=700)
    if not show_labels:
        matrix_kwargs['y_axis_type'] = None
    matrix_plot = figure(**matrix_kwargs)

    matrix_plot.rect(x='xname_pos', y='yname', width=0.95 * spacing, height=0.95,
                     source=source, color="box_color", alpha="box_trans",
                     line_color=None)

    matrix_plot.grid.grid_line_color = None
    matrix_plot.axis.axis_line_color = None
    matrix_plot.axis.major_tick_line_color = None
    if show_labels:
        matrix_plot.axis.major_label_text_font_size = "8pt"
        matrix_plot.xaxis.major_label_orientation = "vertical"

    matrix_plot.axis.major_label_text_font_style = "normal"
    matrix_plot.xaxis.major_label_standoff = 0

    sup_2 = "\u00B2"

    hover = matrix_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Variant 1", " " + "@yname (@yA)"),
        ("Variant 2", " " + "@xname (@xA)"),
        ("D\'", " " + "@Dp"),
        ("R" + sup_2, " " + "@R2"),
        ("Correlated Alleles", " " + "@corA"),
    ])

    # Connecting and Rug Plots
    # Connector Plot (taller, with rotated variant labels, for short lists)
    connector = figure(
        outline_line_color="white",
        y_axis_type=None,
        x_axis_type=None,
        x_range=xr,
        y_range=Range1d(start=0, end=3.8) if show_labels else Range1d(start=0, end=1),
        border_fill_color='white',
        title="",
        min_border_left=100,
        min_border_right=5,
        min_border_top=0,
        min_border_bottom=0,
        h_symmetry=False,
        v_symmetry=False,
        plot_width=800,
        plot_height=90 if show_labels else 30,
        tools="xpan,tap")
    connector.segment(x, y0, x, y1, color="black")
    connector.segment(x, y1, x2, y2, color="black")
    connector.segment(x2, y2, x2, y3, color="black")
    if show_labels:
        connector.text(x2, y4, text=snp_id_plot, alpha=1, angle=pi / 2,
                       text_font_size="8pt", text_baseline="middle",
                       text_align="left")

    connector.yaxis.major_label_text_color = None
    connector.yaxis.minor_tick_line_alpha = 0  # Option does not work
    connector.yaxis.axis_label = " "
    connector.grid.grid_line_color = None
    connector.axis.axis_line_color = None
    connector.axis.major_tick_line_color = None
    connector.axis.minor_tick_line_color = None

    connector.toolbar_location = None

    data_rug = {
        'x': x,
        'y': y,
        'w': w,
        'h': rug_heights,
        'coord_snps_plot': coord_snps_plot,
        'snp_id_plot': snp_id_plot,
        'alleles_snp_plot': alleles_snp_plot
    }

    source_rug = ColumnDataSource(data_rug)

    # Rug Plot
    rug = figure(x_range=xr, y_range=yr, y_axis_type=None, title="",
                 min_border_top=1, min_border_bottom=0, min_border_left=100,
                 min_border_right=5, h_symmetry=False, v_symmetry=False,
                 plot_width=800, plot_height=50, tools="hover,xpan,tap")
    rug.rect(x='x', y='y', width='w', height='h', fill_color='red',
             dilate=True, line_color=None, fill_alpha=0.6, source=source_rug)

    hover = rug.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("SNP", "@snp_id_plot (@alleles_snp_plot)"),
        ("Coord", "@coord_snps_plot"),
    ])

    rug.toolbar_location = None

    gap = 80000   # minimum bp gap before two genes may share a row
    tall = 0.75   # exon box height in row units

    # The chromosome is taken from the first resolved variant, as everywhere
    # else in this function (index 1 failed for single-variant requests).
    x_axis_label = ("Chromosome " + snp_coords[0][1] + " Coordinate (Mb)(" +
                    build_vars['title'] + ")")

    if collapseTranscript == "false":
        # Gene Plot (All Transcripts)
        genes_file = tmp_dir + "genes_" + request + ".json"
        with open(genes_file) as genes_handle:
            genes_raw = genes_handle.readlines()

        genes_plot_start = []
        genes_plot_end = []
        genes_plot_y = []
        genes_plot_name = []
        exons_plot_x = []
        exons_plot_y = []
        exons_plot_w = []
        exons_plot_h = []
        exons_plot_name = []
        exons_plot_id = []
        exons_plot_exon = []
        lines = [0]

        # One JSON object per line
        for gene_raw_obj in genes_raw:
            gene_obj = json.loads(gene_raw_obj)
            transcript_id = gene_obj["name"]
            strand = gene_obj["strand"]
            txStart = gene_obj["txStart"]
            txEnd = gene_obj["txEnd"]
            name = gene_obj["name2"]
            e_start = gene_obj["exonStarts"].split(",")
            e_end = gene_obj["exonEnds"].split(",")

            # Determine Y Coordinate
            y_coord = _place_on_row(lines, int(txStart), int(txEnd), gap)

            genes_plot_start.append(int(txStart) / 1000000.0)
            genes_plot_end.append(int(txEnd) / 1000000.0)
            genes_plot_y.append(y_coord)
            genes_plot_name.append(name + " ")

            # The last split element is skipped (presumably the empty string
            # after a trailing comma -- verify against the genes file).
            n_exons = len(e_start) - 1
            for i in range(n_exons):
                # Exons are numbered in transcription order
                if strand == "+":
                    exon = i + 1
                else:
                    exon = n_exons - i

                width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                exons_plot_x.append(x_coord)
                exons_plot_y.append(y_coord)
                exons_plot_w.append(width)
                exons_plot_h.append(tall)
                exons_plot_name.append(name)
                exons_plot_id.append(transcript_id)
                exons_plot_exon.append(exon)

        n_rows = len(lines)
        genes_plot_yn = [n_rows - row + 0.5 for row in genes_plot_y]
        exons_plot_yn = [n_rows - row + 0.5 for row in exons_plot_y]
        gene_y_range = Range1d(start=0, end=n_rows)

        # Only per-exon columns belong here; the per-variant columns that
        # used to be included had a different length and were not referenced
        # by any glyph or tooltip of this plot.
        data_gene_plot = {
            'exons_plot_x': exons_plot_x,
            'exons_plot_yn': exons_plot_yn,
            'exons_plot_w': exons_plot_w,
            'exons_plot_h': exons_plot_h,
            'exons_plot_name': exons_plot_name,
            'exons_plot_id': exons_plot_id,
            'exons_plot_exon': exons_plot_exon
        }

        source_gene_plot = ColumnDataSource(data_gene_plot)

        if len(lines) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines) - 2) * 50

        gene_plot = figure(
            min_border_top=2,
            min_border_bottom=0,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=gene_y_range,
            border_fill_color='white',
            title="",
            h_symmetry=False,
            v_symmetry=False,
            logo=None,
            plot_width=800,
            plot_height=plot_h_pix,
            tools="hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        gene_plot.segment(genes_plot_start, genes_plot_yn, genes_plot_end,
                          genes_plot_yn, color="black", alpha=1, line_width=2)
        gene_plot.rect(x='exons_plot_x', y='exons_plot_yn', width='exons_plot_w',
                       height='exons_plot_h', source=source_gene_plot,
                       fill_color='grey', line_color="grey")
        gene_plot.text(genes_plot_start, genes_plot_yn, text=genes_plot_name,
                       alpha=1, text_font_size="7pt", text_font_style="bold",
                       text_baseline="middle", text_align="right", angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_plot_name"),
            ("ID", "@exons_plot_id"),
            ("Exon", "@exons_plot_exon"),
        ])

        gene_plot.xaxis.axis_label = x_axis_label
        gene_plot.yaxis.axis_label = "Genes (All Transcripts)"
        gene_plot.ygrid.grid_line_color = None
        gene_plot.yaxis.axis_line_color = None
        gene_plot.yaxis.minor_tick_line_color = None
        gene_plot.yaxis.major_tick_line_color = None
        gene_plot.yaxis.major_label_text_color = None

        gene_plot.toolbar_location = "below"

    # Gene Plot (Collapsed)
    else:
        genes_c_file = tmp_dir + "genes_c_" + request + ".json"
        with open(genes_c_file) as genes_c_handle:
            genes_c_raw = genes_c_handle.readlines()

        genes_c_plot_start = []
        genes_c_plot_end = []
        genes_c_plot_y = []
        genes_c_plot_name = []
        exons_c_plot_x = []
        exons_c_plot_y = []
        exons_c_plot_w = []
        exons_c_plot_h = []
        exons_c_plot_name = []
        exons_c_plot_id = []
        lines_c = [0]

        # One JSON object per line
        for gene_c_raw_obj in genes_c_raw:
            gene_c_obj = json.loads(gene_c_raw_obj)
            txStart = gene_c_obj["txStart"]
            txEnd = gene_c_obj["txEnd"]
            name = gene_c_obj["name2"]
            e_start = gene_c_obj["exonStarts"].split(",")
            e_end = gene_c_obj["exonEnds"].split(",")
            e_transcripts = gene_c_obj["transcripts"].split(",")

            # Determine Y Coordinate
            y_coord = _place_on_row(lines_c, int(txStart), int(txEnd), gap)

            genes_c_plot_start.append(int(txStart) / 1000000.0)
            genes_c_plot_end.append(int(txEnd) / 1000000.0)
            genes_c_plot_y.append(y_coord)
            genes_c_plot_name.append(name + " ")

            # The last split element is skipped (presumably the empty string
            # after a trailing comma -- verify against the genes file).
            for i in range(len(e_start) - 1):
                width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                exons_c_plot_x.append(x_coord)
                exons_c_plot_y.append(y_coord)
                exons_c_plot_w.append(width)
                exons_c_plot_h.append(tall)
                exons_c_plot_name.append(name)
                exons_c_plot_id.append(e_transcripts[i].replace("-", ","))

        n_rows_c = len(lines_c)
        genes_c_plot_yn = [n_rows_c - row + 0.5 for row in genes_c_plot_y]
        exons_c_plot_yn = [n_rows_c - row + 0.5 for row in exons_c_plot_y]
        gene_y_range = Range1d(start=0, end=n_rows_c)

        data_gene_c_plot = {
            'exons_c_plot_x': exons_c_plot_x,
            'exons_c_plot_yn': exons_c_plot_yn,
            'exons_c_plot_w': exons_c_plot_w,
            'exons_c_plot_h': exons_c_plot_h,
            'exons_c_plot_name': exons_c_plot_name,
            'exons_c_plot_id': exons_c_plot_id
        }
        source_gene_c_plot = ColumnDataSource(data_gene_c_plot)

        if len(lines_c) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines_c) - 2) * 50

        gene_plot = figure(
            min_border_top=2,
            min_border_bottom=0,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=gene_y_range,
            border_fill_color='white',
            title="",
            h_symmetry=False,
            v_symmetry=False,
            logo=None,
            plot_width=900,
            plot_height=plot_h_pix,
            tools="hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        gene_plot.segment(genes_c_plot_start, genes_c_plot_yn, genes_c_plot_end,
                          genes_c_plot_yn, color="black", alpha=1, line_width=2)
        gene_plot.rect(x='exons_c_plot_x', y='exons_c_plot_yn',
                       width='exons_c_plot_w', height='exons_c_plot_h',
                       source=source_gene_c_plot, fill_color="grey",
                       line_color="grey")
        gene_plot.text(genes_c_plot_start, genes_c_plot_yn, text=genes_c_plot_name,
                       alpha=1, text_font_size="7pt", text_font_style="bold",
                       text_baseline="middle", text_align="right", angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_c_plot_name"),
            ("Transcript IDs", "@exons_c_plot_id"),
        ])

        gene_plot.xaxis.axis_label = x_axis_label
        gene_plot.yaxis.axis_label = "Genes (Transcripts Collapsed)"
        gene_plot.ygrid.grid_line_color = None
        gene_plot.yaxis.axis_line_color = None
        gene_plot.yaxis.minor_tick_line_color = None
        gene_plot.yaxis.major_tick_line_color = None
        gene_plot.yaxis.major_label_text_color = None

        gene_plot.toolbar_location = "below"

    # Change output backend to SVG temporarily for headless export
    # Will be changed back to canvas in LDlink.js
    matrix_plot.output_backend = "svg"
    connector.output_backend = "svg"
    rug.output_backend = "svg"
    gene_plot.output_backend = "svg"

    matrix_svg = tmp_dir + "matrix_plot_1_" + request + ".svg"
    connector_svg = tmp_dir + "connector_1_" + request + ".svg"
    rug_svg = tmp_dir + "rug_1_" + request + ".svg"
    gene_svg = tmp_dir + "gene_plot_1_" + request + ".svg"
    combined_svg = tmp_dir + "matrix_plot_" + request + ".svg"
    scaled_svg = tmp_dir + "matrix_plot_scaled_" + request + ".svg"

    export_svgs(matrix_plot, filename=matrix_svg)
    export_svgs(connector, filename=connector_svg)
    export_svgs(rug, filename=rug_svg)
    export_svgs(gene_plot, filename=gene_svg)

    # 1 pixel = 0.0264583333 cm
    svg_height = str(25.00 + (0.0264583333 * plot_h_pix)) + "cm"
    svg_height_scaled = str(110.00 + (0.1322916665 * plot_h_pix)) + "cm"

    # Concatenate svgs
    sg.Figure(
        "21.59cm", svg_height,
        sg.SVG(matrix_svg),
        sg.SVG(connector_svg).scale(.97).move(0, 700),
        sg.SVG(rug_svg).scale(.97).move(0, 790),
        sg.SVG(gene_svg).scale(.97).move(0, 840)).save(combined_svg)

    sg.Figure(
        "107.95cm", svg_height_scaled,
        sg.SVG(matrix_svg).scale(5),
        sg.SVG(connector_svg).scale(4.85).move(0, 3500),
        sg.SVG(rug_svg).scale(4.85).move(0, 3930),
        sg.SVG(gene_svg).scale(4.85).move(0, 4160)).save(scaled_svg)

    # Export to PDF / PNG / JPEG. Argument lists (no shell) keep the
    # request string from being interpreted by a shell.
    rasterize_jobs = [
        (combined_svg, tmp_dir + "matrix_plot_" + request + ".pdf"),
        (scaled_svg, tmp_dir + "matrix_plot_" + request + ".png"),
        (scaled_svg, tmp_dir + "matrix_plot_" + request + ".jpeg"),
    ]
    for svg_in, image_out in rasterize_jobs:
        subprocess.call(["phantomjs", "./rasterize.js", svg_in, image_out])

    # Remove individual SVG files after they are combined
    for part_svg in (matrix_svg, gene_svg, rug_svg, connector_svg):
        _remove_quietly(part_svg)
    # Remove scaled SVG file after it is converted to png and jpeg
    _remove_quietly(scaled_svg)
    # Remove temporary file(s)
    for genes_json in glob.glob(tmp_dir + "genes_*" + request + "*.json"):
        _remove_quietly(genes_json)

    reset_output()

    return None
def calculate_matrix_svg(snplst, pop, request, r2_d="r2"):
    """Render the pairwise-LD matrix and gene track for a SNP list to image files.

    Steps, in order:
      1. Read directories from ``config.yml`` and the SNP ids from ``snplst``.
      2. Look each ``rs`` id up in the SQLite SNP database (``snp_dir``).
      3. Pull phased genotypes for those positions with ``tabix`` and keep
         the samples listed in the selected population files.
      4. Compute D' and R2 for every SNP pair.
      5. Build Bokeh matrix / connector / rug / gene figures, export the
         matrix and gene figures as SVG, stitch them with ``svgutils`` and
         rasterize to PDF, PNG and JPEG with ``phantomjs``.

    Output files are written to ``./tmp/`` as ``matrix_plot_<request>.svg``,
    ``.pdf``, ``.png`` and ``.jpeg``; intermediate SVGs are deleted.

    NOTE(review): this module defines another ``calculate_matrix_svg`` (taking
    ``genome_build``) earlier in the file; this later definition rebinds the
    name. Confirm which one callers are meant to reach.

    Args:
        snplst: Path to a text file; the first whitespace-separated token of
            each line is treated as a variant id.
        pop: One or more population codes joined by ``+`` (unknown codes are
            ignored).
        request: Token embedded in every temporary / output file name.
        r2_d: ``"r2"`` or ``"d"``; selects which statistic drives cell
            transparency. Any other value falls back to ``"r2"``.

    Returns:
        None. Results are the files written to ``./tmp/``.
    """
    # Set data directories using config.yml
    with open('config.yml', 'r') as f:
        # safe_load: the config is plain data, and calling yaml.load()
        # without an explicit Loader is rejected by newer PyYAML releases.
        config = yaml.safe_load(f)
    gene_dir = config['data']['gene_dir']
    snp_dir = config['data']['snp_dir']
    pop_dir = config['data']['pop_dir']
    vcf_dir = config['data']['vcf_dir']
    tmp_dir = "./tmp/"

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Open SNP list file (closed deterministically)
    with open(snplst) as snp_file:
        snps_raw = snp_file.readlines()

    # Remove duplicate RS numbers (order of first appearance is kept)
    snps = []
    for snp_raw in snps_raw:
        snp = snp_raw.strip().split()
        if snp not in snps:
            snps.append(snp)

    # Select desired ancestral populations
    known_pops = [
        "ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX",
        "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS",
        "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU",
        "TSI", "YRI"
    ]
    pop_dirs = [pop_dir + pop_i + ".txt"
                for pop_i in pop.split("+") if pop_i in known_pops]

    get_pops = "cat " + " ".join(pop_dirs)
    proc = subprocess.Popen(get_pops, shell=True, stdout=subprocess.PIPE)
    # Pipe output is bytes on Python 3; decode so ids compare equal to the
    # (decoded) VCF header names below.
    pop_list = [line.decode('utf-8') for line in proc.stdout.readlines()]
    pop_ids = list(set(line.strip() for line in pop_list))

    # Connect to snp database
    conn = sqlite3.connect(snp_dir)
    conn.text_factory = str
    cur = conn.cursor()

    def get_coords(rs):
        # The table name is chosen by the id's last character; the caller
        # only passes ids whose last character is a digit.
        rs_id = rs.strip("rs")
        cur.execute("SELECT * FROM tbl_" + rs_id[-1] + " WHERE id=?", (rs_id,))
        return cur.fetchone()

    # Find RS numbers in snp database
    rs_nums = []
    snp_pos = []
    snp_coords = []
    try:
        for snp_i in snps:
            if len(snp_i) == 0 or len(snp_i[0]) <= 2:
                continue
            if not (snp_i[0][0:2] == "rs" and snp_i[0][-1].isdigit()):
                continue
            snp_coord = get_coords(snp_i[0])
            if snp_coord is not None:
                rs_nums.append(snp_i[0])
                snp_pos.append(snp_coord[2])
                snp_coords.append([snp_i[0], snp_coord[1], snp_coord[2]])
    finally:
        # Close snp connection even if a lookup raised
        cur.close()
        conn.close()

    # Sort coordinates and make tabix formatted coordinates
    snp_pos_int = sorted(int(pos) for pos in snp_pos)
    snp_coord_str = [snp_coords[0][1] + ":" + str(pos) + "-" + str(pos)
                     for pos in snp_pos_int]
    tabix_coords = " " + " ".join(snp_coord_str)

    # Extract 1000 Genomes phased genotypes
    vcf_file = vcf_dir + snp_coords[0][1] + \
        ".phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz"
    tabix_snps = "tabix -h {0}{1} | grep -v -e END".format(
        vcf_file, tabix_coords)
    proc = subprocess.Popen(tabix_snps, shell=True, stdout=subprocess.PIPE)

    # Define function to correct indel alleles
    def set_alleles(a1, a2):
        if len(a1) == 1 and len(a2) == 1:
            a1_n = a1
            a2_n = a2
        elif len(a1) == 1 and len(a2) > 1:
            a1_n = "-"
            a2_n = a2[1:]
        elif len(a1) > 1 and len(a2) == 1:
            a1_n = a1[1:]
            a2_n = "-"
        elif len(a1) > 1 and len(a2) > 1:
            a1_n = a1[1:]
            a2_n = a2[1:]
        return (a1_n, a2_n)

    # Import SNP VCF files (decoded so the "##" test and all later string
    # comparisons work on Python 3 as well)
    vcf = [line.decode('utf-8') for line in proc.stdout.readlines()]

    # Skip the "##" meta lines; vcf[header_row] is the column header line
    header_row = 0
    while vcf[header_row][0:2] == "##":
        header_row += 1

    head = vcf[header_row].strip().split()

    # Extract haplotypes: columns of the samples in the chosen populations
    index = [col for col in range(9, len(head)) if head[col] in pop_ids]

    # One haplotype list per selected sample (at least one list, as before)
    hap1 = [[] for _ in range(max(1, len(index)))]
    hap2 = [[] for _ in range(max(1, len(index)))]

    rsnum_lst = []
    allele_lst = []
    pos_lst = []

    for g in range(header_row + 1, len(vcf)):
        geno = vcf[g].strip().split()
        if geno[1] not in snp_pos:
            continue

        if snp_pos.count(geno[1]) == 1:
            rs_query = rs_nums[snp_pos.index(geno[1])]
        else:
            # Several queried ids share this position: take the first one
            # that has not been emitted yet.
            pos_index = [p for p in range(len(snp_pos))
                         if snp_pos[p] == geno[1]]
            for p in pos_index:
                if rs_nums[p] not in rsnum_lst:
                    rs_query = rs_nums[p]
                    break

        if rs_query in rsnum_lst:
            continue

        rs_1000g = geno[2]

        if rs_query == rs_1000g:
            rsnum = rs_1000g
        else:
            # Look two lines either side for a record carrying the queried id
            count = -2
            found = "false"
            while count <= 2 and count + g < len(vcf):
                geno_next = vcf[g + count].strip().split()
                # Length guard: the window can reach "##" meta lines, which
                # may have fewer than three whitespace-separated tokens.
                if len(geno_next) > 2 and rs_query == geno_next[2]:
                    found = "true"
                    break
                count += 1

            if found == "false":
                # Keep the queried id for this record. (The former write-back
                # into `snps` stored the value it had just looked up, so it
                # changed nothing, yet raised IndexError whenever the input
                # file contained a blank line.)
                rsnum = rs_query
            else:
                # The matching record is a neighbour; it is handled on its
                # own iteration.
                continue

        if "," not in geno[3] and "," not in geno[4]:
            a1, a2 = set_alleles(geno[3], geno[4])
            phased = {
                "0|0": (a1, a1),
                "0|1": (a1, a2),
                "1|0": (a2, a1),
                "1|1": (a2, a2),
                "0": (a1, "."),
                "1": (a2, "."),
            }
            for i in range(len(index)):
                first, second = phased.get(geno[index[i]], (".", "."))
                hap1[i].append(first)
                hap2[i].append(second)

            rsnum_lst.append(rsnum)
            pos_lst.append("chr" + geno[0] + ":" + geno[1] + "-" + geno[1])
            allele_lst.append(a1 + "/" + a2)

    # Calculate Pairwise LD Statistics
    all_haps = hap1 + hap2
    n_snps = len(all_haps[0])
    ld_matrix = [[[None for v in range(2)] for i in range(n_snps)]
                 for j in range(n_snps)]

    for i in range(n_snps):
        for j in range(i, n_snps):
            # Count two-locus haplotypes
            hap = {}
            for k in range(len(all_haps)):
                hap_k = all_haps[k][i] + all_haps[k][j]
                if hap_k in hap:
                    hap[hap_k] += 1
                else:
                    hap[hap_k] = 1

            # Remove Missing Haplotypes. Iterate over a snapshot of the keys:
            # popping while iterating the live view raises on Python 3.
            for key in list(hap):
                if "." in key:
                    hap.pop(key, None)

            # Check all haplotypes are present
            if len(hap) != 4:
                snp_i_a = allele_lst[i].split("/")
                snp_j_a = allele_lst[j].split("/")
                expected = [snp_i_a[0] + snp_j_a[0], snp_i_a[0] + snp_j_a[1],
                            snp_i_a[1] + snp_j_a[0], snp_i_a[1] + snp_j_a[1]]
                for hap_name in expected:
                    if hap_name not in hap:
                        hap[hap_name] = 0

            # Perform LD calculations on the 2x2 table, keys in sorted order
            hap_keys = sorted(hap)
            A = hap[hap_keys[0]]
            B = hap[hap_keys[1]]
            C = hap[hap_keys[2]]
            D = hap[hap_keys[3]]

            tmax = max(A, B, C, D)
            delta = float(A * D - B * C)
            Ms = float((A + C) * (B + D) * (A + B) * (C + D))
            if Ms != 0:
                # D prime
                if delta < 0:
                    D_prime = round(
                        abs(delta / min((A + C) * (A + B), (B + D) * (C + D))), 3)
                else:
                    D_prime = round(
                        abs(delta / min((A + C) * (C + D), (A + B) * (B + D))), 3)

                # R2
                r2 = round((delta**2) / Ms, 3)

                # Find Correlated Alleles
                if r2 > 0.1:
                    N = A + B + C + D
                    # Expected Cell Counts
                    eA = (A + B) * (A + C) / N
                    eB = (B + A) * (B + D) / N
                    eC = (C + A) * (C + D) / N
                    eD = (D + C) * (D + B) / N

                    # Calculate Deltas
                    dA = (A - eA)**2
                    dB = (B - eB)**2
                    dC = (C - eC)**2
                    dD = (D - eD)**2
                    dmax = max(dA, dB, dC, dD)

                    if dA == dB == dC == dD:
                        same_phase = tmax == dA or tmax == dD
                    else:
                        same_phase = dmax == dA or dmax == dD

                    if same_phase:
                        match = hap_keys[0][0] + "=" + hap_keys[0][1] + "," + \
                            hap_keys[2][0] + "=" + hap_keys[1][1]
                    else:
                        match = hap_keys[0][0] + "=" + hap_keys[1][1] + "," + \
                            hap_keys[2][0] + "=" + hap_keys[0][1]
                else:
                    match = " = , = "
            else:
                D_prime = "NA"
                r2 = "NA"
                match = " = , = "

            snp1 = rsnum_lst[i]
            snp2 = rsnum_lst[j]
            pos1 = pos_lst[i].split("-")[0]
            pos2 = pos_lst[j].split("-")[0]
            allele1 = allele_lst[i]
            allele2 = allele_lst[j]
            # Mirror-image cell gets each "x=y" pair swapped to "y=x"
            pair_1 = match.split(",")[0].split("=")
            pair_2 = match.split(",")[1].split("=")
            corr = pair_1[1] + "=" + pair_1[0] + "," + pair_2[1] + "=" + pair_2[0]
            corr_f = match

            ld_matrix[i][j] = [snp1, snp2, allele1, allele2,
                               corr, pos1, pos2, D_prime, r2]
            ld_matrix[j][i] = [snp2, snp1, allele2, allele1,
                               corr_f, pos2, pos1, D_prime, r2]

    # Generate Plot Variables (matrix flattened row by row)
    out = [cell for row in ld_matrix for cell in row]
    xnames = []
    ynames = []
    xA = []
    yA = []
    corA = []
    xpos = []
    ypos = []
    d_labels = []
    r_labels = []
    box_color = []
    box_trans = []

    if r2_d not in ["r2", "d"]:
        r2_d = "r2"

    for cell in out:
        snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2 = cell
        xnames.append(snp1)
        ynames.append(snp2)
        xA.append(allele1)
        yA.append(allele2)
        corA.append(corr)
        xpos.append(pos1)
        ypos.append(pos2)
        if r2_d == "r2" and r2 != "NA":
            d_labels.append(str(round(float(D_prime), 4)))
            r_labels.append(str(round(float(r2), 4)))
            box_color.append("red")
            box_trans.append(r2)
        elif r2_d == "d" and D_prime != "NA":
            d_labels.append(str(round(float(D_prime), 4)))
            r_labels.append(str(round(float(r2), 4)))
            box_color.append("red")
            box_trans.append(abs(D_prime))
        else:
            d_labels.append("NA")
            r_labels.append("NA")
            box_color.append("blue")
            box_trans.append(0.1)

    # Import plotting modules
    from collections import OrderedDict
    from bokeh.embed import components, file_html
    from bokeh.layouts import gridplot
    from bokeh.models import HoverTool, LinearAxis, Range1d
    from bokeh.plotting import ColumnDataSource, curdoc, figure, output_file, reset_output, save
    from bokeh.resources import CDN
    from bokeh.io import export_svgs
    import svgutils.compose as sg
    from math import pi

    reset_output()

    # Aggregate Plotting Data: one entry per SNP, taken from the first cell
    # of each matrix row (stride = matrix side length)
    x = []
    y = []
    w = []
    h = []
    coord_snps_plot = []
    snp_id_plot = []
    alleles_snp_plot = []
    for i in range(0, len(xpos), int(len(xpos)**0.5)):
        x.append(int(xpos[i].split(":")[1]) / 1000000.0)
        y.append(0.5)
        w.append(0.00003)
        h.append(1.06)
        coord_snps_plot.append(xpos[i])
        snp_id_plot.append(xnames[i])
        alleles_snp_plot.append(xA[i])

    buffer = (x[-1] - x[0]) * 0.025
    xr = Range1d(start=x[0] - buffer, end=x[-1] + buffer)
    yr = Range1d(start=-0.03, end=1.03)
    yr2 = Range1d(start=0, end=3.8)
    yr3 = Range1d(start=0, end=1)

    # Evenly spaced column centres across the padded x range
    spacing = (x[-1] - x[0] + buffer + buffer) / (len(x) * 1.0)
    x2 = []
    y0 = []
    y1 = []
    y2 = []
    y3 = []
    y4 = []
    for i in range(len(x)):
        x2.append(x[0] - buffer + spacing * (i + 0.5))
        y0.append(0)
        y1.append(0.20)
        y2.append(0.80)
        y3.append(1)
        y4.append(1.15)

    xname_pos = []
    for centre in x2:
        for _ in range(len(x2)):
            xname_pos.append(centre)

    data = {
        'xname': xnames,
        'xname_pos': xname_pos,
        'yname': ynames,
        'xA': xA,
        'yA': yA,
        'xpos': xpos,
        'ypos': ypos,
        'R2': r_labels,
        'Dp': d_labels,
        'corA': corA,
        'box_color': box_color,
        'box_trans': box_trans
    }

    source = ColumnDataSource(data)

    threshold = 70
    few_snps = len(snps) < threshold

    matrix_kwargs = dict(
        outline_line_color="white", min_border_top=0, min_border_bottom=2,
        min_border_left=100, min_border_right=5, x_range=xr,
        y_range=list(reversed(rsnum_lst)), h_symmetry=False, v_symmetry=False,
        border_fill_color='white', x_axis_type=None, logo=None,
        tools="hover,undo,redo,reset,pan,box_zoom,previewsave", title=" ",
        plot_width=800, plot_height=700)
    if not few_snps:
        # Too many rows for readable labels: drop the y axis as well
        matrix_kwargs['y_axis_type'] = None
    matrix_plot = figure(**matrix_kwargs)

    matrix_plot.rect(x='xname_pos', y='yname', width=0.95 * spacing, height=0.95,
                     source=source, color="box_color", alpha="box_trans",
                     line_color=None)

    matrix_plot.grid.grid_line_color = None
    matrix_plot.axis.axis_line_color = None
    matrix_plot.axis.major_tick_line_color = None
    if few_snps:
        matrix_plot.axis.major_label_text_font_size = "8pt"
        matrix_plot.xaxis.major_label_orientation = "vertical"

    matrix_plot.axis.major_label_text_font_style = "normal"
    matrix_plot.xaxis.major_label_standoff = 0

    sup_2 = u"\u00B2"

    hover = matrix_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Variant 1", " " + "@yname (@yA)"),
        ("Variant 2", " " + "@xname (@xA)"),
        ("D\'", " " + "@Dp"),
        ("R" + sup_2, " " + "@R2"),
        ("Correlated Alleles", " " + "@corA"),
    ])

    # Connecting and Rug Plots
    # Connector Plot
    if few_snps:
        connector = figure(outline_line_color="white", y_axis_type=None,
                           x_axis_type=None, x_range=xr, y_range=yr2,
                           border_fill_color='white', title="",
                           min_border_left=100, min_border_right=5,
                           min_border_top=0, min_border_bottom=0,
                           h_symmetry=False, v_symmetry=False,
                           plot_width=800, plot_height=90, tools="xpan,tap")
        connector.segment(x, y0, x, y1, color="black")
        connector.segment(x, y1, x2, y2, color="black")
        connector.segment(x2, y2, x2, y3, color="black")
        connector.text(x2, y4, text=snp_id_plot, alpha=1, angle=pi / 2,
                       text_font_size="8pt", text_baseline="middle",
                       text_align="left")
    else:
        connector = figure(outline_line_color="white", y_axis_type=None,
                           x_axis_type=None, x_range=xr, y_range=yr3,
                           border_fill_color='white', title="",
                           min_border_left=100, min_border_right=5,
                           min_border_top=0, min_border_bottom=0,
                           h_symmetry=False, v_symmetry=False,
                           plot_width=800, plot_height=30, tools="xpan,tap")
        connector.segment(x, y0, x, y1, color="black")
        connector.segment(x, y1, x2, y2, color="black")
        connector.segment(x2, y2, x2, y3, color="black")

    connector.yaxis.major_label_text_color = None
    connector.yaxis.minor_tick_line_alpha = 0  # Option does not work
    connector.yaxis.axis_label = " "
    connector.grid.grid_line_color = None
    connector.axis.axis_line_color = None
    connector.axis.major_tick_line_color = None
    connector.axis.minor_tick_line_color = None

    connector.toolbar_location = None

    data_rug = {
        'x': x,
        'y': y,
        'w': w,
        'h': h,
        'coord_snps_plot': coord_snps_plot,
        'snp_id_plot': snp_id_plot,
        'alleles_snp_plot': alleles_snp_plot
    }

    source_rug = ColumnDataSource(data_rug)

    # Rug Plot
    rug = figure(x_range=xr, y_range=yr, y_axis_type=None, title="",
                 min_border_top=1, min_border_bottom=0, min_border_left=100,
                 min_border_right=5, h_symmetry=False, v_symmetry=False,
                 plot_width=800, plot_height=50, tools="hover,xpan,tap")
    rug.rect(x='x', y='y', width='w', height='h', fill_color='red',
             dilate=True, line_color=None, fill_alpha=0.6, source=source_rug)

    hover = rug.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("SNP", "@snp_id_plot (@alleles_snp_plot)"),
        ("Coord", "@coord_snps_plot"),
    ])

    rug.toolbar_location = None

    # Gene Plot
    # Argument list + stdout redirect instead of a shell string, so the
    # request token is never interpreted by a shell.
    genes_path = tmp_dir + "genes_" + request + ".txt"
    gene_region = "{0}:{1}-{2}".format(
        snp_coords[1][1],
        int((x[0] - buffer) * 1000000),
        int((x[-1] + buffer) * 1000000))
    with open(genes_path, "w") as genes_out:
        subprocess.call(["tabix", "-fh", gene_dir, gene_region],
                        stdout=genes_out)
    with open(genes_path) as genes_in:
        genes_raw = genes_in.readlines()

    genes_plot_start = []
    genes_plot_end = []
    genes_plot_y = []
    genes_plot_name = []
    exons_plot_x = []
    exons_plot_y = []
    exons_plot_w = []
    exons_plot_h = []
    exons_plot_name = []
    exons_plot_id = []
    exons_plot_exon = []
    message = ["Too many genes to plot."]
    # lines[r] holds the end coordinate of the last transcript placed on row r
    lines = [0]
    gap = 80000
    tall = 0.75
    for gene_line in genes_raw:
        # Exactly 16 whitespace-separated columns are expected per record
        (_bin, name_id, chrom, strand, txStart, txEnd, cdsStart, cdsEnd,
         exonCount, exonStarts, exonEnds, score, name2, cdsStartStat,
         cdsEndStat, exonFrames) = gene_line.strip().split()
        name = name2
        transcript_id = name_id
        e_start = exonStarts.split(",")
        e_end = exonEnds.split(",")

        # Determine Y Coordinate: first row whose last transcript ends more
        # than `gap` before this one starts, else open a new row
        row = 0
        y_coord = None
        while y_coord is None:
            if row > len(lines) - 1:
                y_coord = row + 1
                lines.append(int(txEnd))
            elif int(txStart) > (gap + lines[row]):
                y_coord = row + 1
                lines[row] = int(txEnd)
            else:
                row += 1

        genes_plot_start.append(int(txStart) / 1000000.0)
        genes_plot_end.append(int(txEnd) / 1000000.0)
        genes_plot_y.append(y_coord)
        genes_plot_name.append(name + " ")

        # The last element of the split lists is skipped, as before
        for e in range(len(e_start) - 1):
            if strand == "+":
                exon = e + 1
            else:
                exon = len(e_start) - 1 - e

            width = (int(e_end[e]) - int(e_start[e])) / 1000000.0
            x_coord = int(e_start[e]) / 1000000.0 + (width / 2)

            exons_plot_x.append(x_coord)
            exons_plot_y.append(y_coord)
            exons_plot_w.append(width)
            exons_plot_h.append(tall)
            exons_plot_name.append(name)
            exons_plot_id.append(transcript_id)
            exons_plot_exon.append(exon)

    n_rows = len(lines)
    genes_plot_yn = [n_rows - row_y + 0.5 for row_y in genes_plot_y]
    exons_plot_yn = [n_rows - row_y + 0.5 for row_y in exons_plot_y]
    yr2 = Range1d(start=0, end=n_rows)

    data_gene_plot = {
        'exons_plot_x': exons_plot_x,
        'exons_plot_yn': exons_plot_yn,
        'exons_plot_w': exons_plot_w,
        'exons_plot_h': exons_plot_h,
        'exons_plot_name': exons_plot_name,
        'exons_plot_id': exons_plot_id,
        'exons_plot_exon': exons_plot_exon,
        'coord_snps_plot': coord_snps_plot,
        'snp_id_plot': snp_id_plot,
        'alleles_snp_plot': alleles_snp_plot
    }

    source_gene_plot = ColumnDataSource(data_gene_plot)

    max_genes = 40
    if len(lines) < 3 or len(genes_raw) > max_genes:
        plot_h_pix = 150
    else:
        plot_h_pix = 150 + (len(lines) - 2) * 50

    gene_plot = figure(min_border_top=2, min_border_bottom=0,
                       min_border_left=100, min_border_right=5,
                       x_range=xr, y_range=yr2, border_fill_color='white',
                       title="", h_symmetry=False, v_symmetry=False, logo=None,
                       plot_width=800, plot_height=plot_h_pix,
                       tools="hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

    if len(genes_raw) <= max_genes:
        gene_plot.segment(genes_plot_start, genes_plot_yn, genes_plot_end,
                          genes_plot_yn, color="black", alpha=1, line_width=2)
        gene_plot.rect(x='exons_plot_x', y='exons_plot_yn',
                       width='exons_plot_w', height='exons_plot_h',
                       source=source_gene_plot, fill_color='grey',
                       line_color="grey")
        gene_plot.text(genes_plot_start, genes_plot_yn, text=genes_plot_name,
                       alpha=1, text_font_size="7pt", text_font_style="bold",
                       text_baseline="middle", text_align="right", angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_plot_name"),
            ("ID", "@exons_plot_id"),
            ("Exon", "@exons_plot_exon"),
        ])
    else:
        x_coord_text = x[0] + (x[-1] - x[0]) / 2.0
        gene_plot.text(x_coord_text, n_rows / 2.0, text=message, alpha=1,
                       text_font_size="12pt", text_font_style="bold",
                       text_baseline="middle", text_align="center", angle=0)

    gene_plot.xaxis.axis_label = "Chromosome " + \
        snp_coords[1][1] + " Coordinate (Mb)(GRCh37)"
    gene_plot.yaxis.axis_label = "Genes"
    gene_plot.ygrid.grid_line_color = None
    gene_plot.yaxis.axis_line_color = None
    gene_plot.yaxis.minor_tick_line_color = None
    gene_plot.yaxis.major_tick_line_color = None
    gene_plot.yaxis.major_label_text_color = None

    gene_plot.toolbar_location = "below"

    # Change output backend to SVG temporarily for headless export
    # Will be changed back to canvas in LDlink.js
    matrix_plot.output_backend = "svg"
    rug.output_backend = "svg"
    gene_plot.output_backend = "svg"

    matrix_svg = tmp_dir + "matrix_plot_1_" + request + ".svg"
    gene_svg = tmp_dir + "gene_plot_1_" + request + ".svg"
    combined_svg = tmp_dir + "matrix_plot_" + request + ".svg"
    scaled_svg = tmp_dir + "matrix_plot_scaled_" + request + ".svg"

    export_svgs(matrix_plot, filename=matrix_svg)
    export_svgs(gene_plot, filename=gene_svg)

    # Concatenate svgs
    sg.Figure("21.59cm", "27.94cm",
              sg.SVG(matrix_svg),
              sg.SVG(gene_svg).move(0, 720)
              ).save(combined_svg)

    sg.Figure("107.95cm", "139.70cm",
              sg.SVG(matrix_svg).scale(5),
              sg.SVG(gene_svg).scale(5).move(0, 3600)
              ).save(scaled_svg)

    def rasterize(src, dst):
        # Same phantomjs invocation as before, passed as an argument list so
        # no shell parses the file names.
        subprocess.call(["phantomjs", "./rasterize.js", src, dst])

    def remove_quietly(path):
        # Best-effort delete: the former `rm` calls ignored failures too.
        try:
            os.remove(path)
        except OSError:
            pass

    # Export to PDF
    rasterize(combined_svg, tmp_dir + "matrix_plot_" + request + ".pdf")
    # Export to PNG
    rasterize(scaled_svg, tmp_dir + "matrix_plot_" + request + ".png")
    # Export to JPEG
    rasterize(scaled_svg, tmp_dir + "matrix_plot_" + request + ".jpeg")

    # Remove individual SVG files after they are combined
    remove_quietly(matrix_svg)
    remove_quietly(gene_svg)
    # Remove scaled SVG file after it is converted to png and jpeg
    remove_quietly(scaled_svg)

    reset_output()
    return None
def calculate_assoc_svg(file, region, pop, request, myargs, myargsName, myargsOrigin): # Set data directories using config.yml with open('config.yml', 'r') as f: config = yaml.load(f) gene_dir2 = config['data']['gene_dir2'] vcf_dir = config['data']['vcf_dir'] tmp_dir = "./tmp/" # Ensure tmp directory exists if not os.path.exists(tmp_dir): os.makedirs(tmp_dir) chrs=["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","X","Y"] # Define parameters for --variant option if region=="variant": if myargsOrigin=="None": return None if myargsOrigin!="None": # Find coordinates (GRCh37/hg19) for SNP RS number if myargsOrigin[0:2]=="rs": snp=myargsOrigin # Connect to Mongo snp database client = MongoClient('mongodb://'+username+':'+password+'@localhost/admin', port) db = client["LDLink"] def get_coords_var(db, rsid): rsid = rsid.strip("rs") query_results = db.dbsnp151.find_one({"id": rsid}) query_results_sanitized = json.loads(json_util.dumps(query_results)) return query_results_sanitized # Find RS number in snp database var_coord=get_coords_var(db, snp) if var_coord==None: return None elif myargsOrigin.split(":")[0].strip("chr") in chrs and len(myargsOrigin.split(":"))==2: snp=myargsOrigin var_coord=[None,myargsOrigin.split(":")[0].strip("chr"),myargsOrigin.split(":")[1]] else: return None chromosome = var_coord['chromosome'] org_coord = var_coord['position'] # Open Association Data header_list=[] header_list.append(myargs['chr']) header_list.append(myargs['bp']) header_list.append(myargs['pval']) # Load input file with open(file) as fp: header = fp.readline().strip().split() first = fp.readline().strip().split() if len(header)!=len(first): return None # Check header for item in header_list: if item not in header: return None len_head=len(header) chr_index=header.index(myargs['chr']) pos_index=header.index(myargs['bp']) p_index=header.index(myargs['pval']) # Define window of interest around query SNP if myargs['window']==None: 
if region=="variant": window=500000 elif region=="gene": window=100000 else: window=0 else: window=myargs['window'] if region=="variant": coord1=int(org_coord)-window if coord1<0: coord1=0 coord2=int(org_coord)+window elif region=="gene": if myargsName=="None": return None # Connect to gene database conn=sqlite3.connect(gene_dir2) conn.text_factory=str cur=conn.cursor() def get_coords_gene(gene_raw): gene=gene_raw.upper() t=(gene,) cur.execute("SELECT * FROM genes WHERE name=?", t) return cur.fetchone() # Find RS number in snp database gene_coord=get_coords_gene(myargsName) # Close snp connection cur.close() conn.close() if gene_coord==None: return None # Define search coordinates coord1=int(gene_coord[2])-window if coord1<0: coord1=0 coord2=int(gene_coord[3])+window # Run with --origin option if myargsOrigin!="None": if gene_coord[1]!=chromosome: return None if coord1>int(org_coord) or int(org_coord)>coord2: return None else: chromosome=gene_coord[1] elif region=="region": if myargs['start']==None: return None if myargs['end']==None: return None # Parse out chr and positions for --region option if len(myargs['start'].split(":"))!=2: return None if len(myargs['end'].split(":"))!=2: return None chr_s=myargs['start'].strip("chr").split(":")[0] coord_s=myargs['start'].split(":")[1] chr_e=myargs['end'].strip("chr").split(":")[0] coord_e=myargs['end'].split(":")[1] if chr_s not in chrs: return None if chr_e not in chrs: return None if chr_s!=chr_e: return None if coord_s>=coord_e: return None coord1=int(coord_s)-window if coord1<0: coord1=0 coord2=int(coord_e)+window # Run with --origin option if myargsOrigin!="None": if chr_s!=chromosome: return None if coord1>int(org_coord) or int(org_coord)>coord2: return None else: chromosome=chr_s # Generate coordinate list and P-value dictionary max_window=3000000 if coord2-coord1>max_window: return None assoc_coords=[] a_pos=[] assoc_dict={} assoc_list=[] with open(file) as fp: for line in fp: col=line.strip().split() if 
len(col)==len_head: if col[chr_index].strip("chr")==chromosome: try: int(col[pos_index]) except ValueError: continue else: if coord1<=int(col[pos_index])<=coord2: try: float(col[p_index]) except ValueError: continue else: coord_i=col[chr_index].strip("chr")+":"+col[pos_index]+"-"+col[pos_index] assoc_coords.append(coord_i) a_pos.append(col[pos_index]) assoc_dict[coord_i]=[col[p_index]] assoc_list.append([coord_i,float(col[p_index])]) # Coordinate list checks if len(assoc_coords)==0: return None # Get population ids from population output file from LDassoc.py pop_list=open(tmp_dir+"pops_"+request+".txt").readlines() ids=[] for i in range(len(pop_list)): ids.append(pop_list[i].strip()) pop_ids=list(set(ids)) # Define LD origin coordinate try: org_coord except NameError: for var_p in sorted(assoc_list, key=operator.itemgetter(1)): snp="chr"+var_p[0].split("-")[0] # Extract lowest P SNP phased genotypes vcf_file=vcf_dir+chromosome+".phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz" tabix_snp_h="tabix -H {0} | grep CHROM".format(vcf_file) proc_h=subprocess.Popen(tabix_snp_h, shell=True, stdout=subprocess.PIPE) head=[x.decode('utf-8') for x in proc_h.stdout.readlines()][0].strip().split() # Check lowest P SNP is in the 1000G population and not monoallelic from LDassoc.py output file vcf=open(tmp_dir+"snp_no_dups_"+request+".vcf").readlines() if len(vcf)==0: continue elif len(vcf)>1: geno=vcf[0].strip().split() else: geno=vcf[0].strip().split() if "," in geno[3] or "," in geno[4]: continue index=[] for i in range(9,len(head)): if head[i] in pop_ids: index.append(i) genotypes={"0":0, "1":0} for i in index: sub_geno=geno[i].split("|") for j in sub_geno: if j in genotypes: genotypes[j]+=1 else: genotypes[j]=1 if genotypes["0"]==0 or genotypes["1"]==0: continue org_coord=var_p[0].split("-")[1] break else: if chromosome+":"+org_coord+"-"+org_coord not in assoc_coords: return None # Extract query SNP phased genotypes 
vcf_file=vcf_dir+chromosome+".phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz" tabix_snp_h="tabix -H {0} | grep CHROM".format(vcf_file) proc_h=subprocess.Popen(tabix_snp_h, shell=True, stdout=subprocess.PIPE) head=[x.decode('utf-8') for x in proc_h.stdout.readlines()][0].strip().split() tabix_snp="tabix {0} {1}:{2}-{2} | grep -v -e END > {3}".format(vcf_file, chromosome, org_coord, tmp_dir+"snp_no_dups_"+request+".vcf") subprocess.call(tabix_snp, shell=True) # Check query SNP is in the 1000G population, has the correct RS number, and not monoallelic vcf=open(tmp_dir+"snp_no_dups_"+request+".vcf").readlines() if len(vcf)==0: subprocess.call("rm "+tmp_dir+"pops_"+request+".txt", shell=True) subprocess.call("rm "+tmp_dir+"*"+request+"*.vcf", shell=True) return None elif len(vcf)>1: geno=[] for i in range(len(vcf)): if vcf[i].strip().split()[2]==snp: geno=vcf[i].strip().split() if geno==[]: subprocess.call("rm "+tmp_dir+"pops_"+request+".txt", shell=True) subprocess.call("rm "+tmp_dir+"*"+request+"*.vcf", shell=True) return None else: geno=vcf[0].strip().split() if geno[2]!=snp and snp[0:2]=="rs": snp=geno[2] if "," in geno[3] or "," in geno[4]: subprocess.call("rm "+tmp_dir+"pops_"+request+".txt", shell=True) subprocess.call("rm "+tmp_dir+"*"+request+"*.vcf", shell=True) return None index=[] for i in range(9,len(head)): if head[i] in pop_ids: index.append(i) genotypes={"0":0, "1":0} for i in index: sub_geno=geno[i].split("|") for j in sub_geno: if j in genotypes: genotypes[j]+=1 else: genotypes[j]=1 if genotypes["0"]==0 or genotypes["1"]==0: subprocess.call("rm "+tmp_dir+"pops_"+request+".txt", shell=True) subprocess.call("rm "+tmp_dir+"*"+request+"*.vcf", shell=True) return None # Calculate proxy LD statistics in parallel if len(assoc_coords)<60: threads=1 else: threads=4 block=len(assoc_coords)/threads commands=[] for i in range(threads): if i==min(range(threads)) and i==max(range(threads)): command="python LDassoc_sub.py "+snp+" "+chromosome+" 
"+"_".join(assoc_coords)+" "+request+" "+str(i) elif i==min(range(threads)): command="python LDassoc_sub.py "+snp+" "+chromosome+" "+"_".join(assoc_coords[:block])+" "+request+" "+str(i) elif i==max(range(threads)): command="python LDassoc_sub.py "+snp+" "+chromosome+" "+"_".join(assoc_coords[(block*i)+1:])+" "+request+" "+str(i) else: command="python LDassoc_sub.py "+snp+" "+chromosome+" "+"_".join(assoc_coords[(block*i)+1:block*(i+1)])+" "+request+" "+str(i) commands.append(command) processes=[subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) for command in commands] # collect output in parallel def get_output(process): return process.communicate()[0].splitlines() pool = Pool(len(processes)) out_raw=pool.map(get_output, processes) pool.close() pool.join() # Aggregate output out_prox=[] for i in range(len(out_raw)): for j in range(len(out_raw[i])): col=out_raw[i][j].decode('utf-8').strip().split("\t") col[6]=int(col[6]) col[7]=float(col[7]) col[8]=float(col[8]) col.append(abs(int(col[6]))) pos_i_j=col[5].split(":")[1] coord_i_j=chromosome+":"+pos_i_j+"-"+pos_i_j if coord_i_j in assoc_dict: col.append(float(assoc_dict[coord_i_j][0])) out_prox.append(col) out_dist_sort=sorted(out_prox, key=operator.itemgetter(14)) out_p_sort=sorted(out_dist_sort, key=operator.itemgetter(15), reverse=False) # Organize scatter plot data q_rs=[] q_allele=[] q_coord=[] q_maf=[] p_rs=[] p_allele=[] p_coord=[] p_pos=[] p_maf=[] dist=[] d_prime=[] d_prime_round=[] r2=[] r2_round=[] corr_alleles=[] regdb=[] funct=[] color=[] alpha=[] size=[] p_val=[] neg_log_p=[] for i in range(len(out_p_sort)): q_rs_i,q_allele_i,q_coord_i,p_rs_i,p_allele_i,p_coord_i,dist_i,d_prime_i,r2_i,corr_alleles_i,regdb_i,q_maf_i,p_maf_i,funct_i,dist_abs,p_val_i=out_p_sort[i] q_rs.append(q_rs_i) q_allele.append(q_allele_i) q_coord.append(float(q_coord_i.split(":")[1])/1000000) q_maf.append(str(round(float(q_maf_i),4))) if p_rs_i==".": p_rs_i=p_coord_i p_rs.append(p_rs_i) p_allele.append(p_allele_i) 
p_coord.append(float(p_coord_i.split(":")[1])/1000000) p_pos.append(p_coord_i.split(":")[1]) p_maf.append(str(round(float(p_maf_i),4))) dist.append(str(round(dist_i/1000000.0,4))) d_prime.append(float(d_prime_i)) d_prime_round.append(str(round(float(d_prime_i),4))) r2.append(float(r2_i)) r2_round.append(str(round(float(r2_i),4))) corr_alleles.append(corr_alleles_i) # P-value p_val.append(p_val_i) neg_log_p.append(-log10(p_val_i)) # Correct Missing Annotations if regdb_i==".": regdb_i="" regdb.append(regdb_i) if funct_i==".": funct_i="" if funct_i=="NA": funct_i="none" funct.append(funct_i) # Set Color reds=["#FFCCCC","#FFCACA","#FFC8C8","#FFC6C6","#FFC4C4","#FFC2C2","#FFC0C0","#FFBEBE","#FFBCBC","#FFBABA","#FFB8B8","#FFB6B6","#FFB4B4","#FFB1B1","#FFAFAF","#FFADAD","#FFABAB","#FFA9A9","#FFA7A7","#FFA5A5","#FFA3A3","#FFA1A1","#FF9F9F","#FF9D9D","#FF9B9B","#FF9999","#FF9797","#FF9595","#FF9393","#FF9191","#FF8F8F","#FF8D8D","#FF8B8B","#FF8989","#FF8787","#FF8585","#FF8383","#FF8181","#FF7E7E","#FF7C7C","#FF7A7A","#FF7878","#FF7676","#FF7474","#FF7272","#FF7070","#FF6E6E","#FF6C6C","#FF6A6A","#FF6868","#FF6666","#FF6464","#FF6262","#FF6060","#FF5E5E","#FF5C5C","#FF5A5A","#FF5858","#FF5656","#FF5454","#FF5252","#FF5050","#FF4E4E","#FF4B4B","#FF4949","#FF4747","#FF4545","#FF4343","#FF4141","#FF3F3F","#FF3D3D","#FF3B3B","#FF3939","#FF3737","#FF3535","#FF3333","#FF3131","#FF2F2F","#FF2D2D","#FF2B2B","#FF2929","#FF2727","#FF2525","#FF2323","#FF2121","#FF1F1F","#FF1D1D","#FF1B1B","#FF1818","#FF1616","#FF1414","#FF1212","#FF1010","#FF0E0E","#FF0C0C","#FF0A0A","#FF0808","#FF0606","#FF0404","#FF0202","#FF0000"] if q_coord_i==p_coord_i: color_i="#0000FF" alpha_i=0.7 else: if myargs['dprime']==True: color_i=reds[int(d_prime_i*100.0)] alpha_i=0.7 elif myargs['dprime']==False: color_i=reds[int(r2_i*100.0)] alpha_i=0.7 color.append(color_i) alpha.append(alpha_i) # Set Size size_i=9+float(p_maf_i)*14.0 size.append(size_i) # Pull out SNPs from association file not found in 1000G 
p_plot_pos=[] p_plot_pval=[] p_plot_pos2=[] p_plot_pval2=[] p_plot_dist=[] index_var_pos=float(q_coord_i.split(":")[1])/1000000 for input_pos in a_pos: if input_pos not in p_pos: p_plot_pos.append(float(input_pos)/1000000) p_plot_pval.append(-log10(float(assoc_dict[chromosome+":"+input_pos+"-"+input_pos][0]))) p_plot_pos2.append("chr"+chromosome+":"+input_pos) p_plot_pval2.append(float(assoc_dict[chromosome+":"+input_pos+"-"+input_pos][0])) p_plot_dist.append(str(round(float(input_pos)/1000000-index_var_pos,4))) # Begin Bokeh Plotting from collections import OrderedDict from bokeh.embed import components,file_html from bokeh.layouts import gridplot from bokeh.models import HoverTool,LinearAxis,Range1d from bokeh.plotting import ColumnDataSource,curdoc,figure,output_file,reset_output,save from bokeh.resources import CDN from bokeh.io import export_svgs import svgutils.compose as sg reset_output() data_p = {'p_plot_posX': p_plot_pos, 'p_plot_pvalY': p_plot_pval, 'p_plot_pos2': p_plot_pos2, 'p_plot_pval2': p_plot_pval2, 'p_plot_dist': p_plot_dist} source_p = ColumnDataSource(data_p) # Assoc Plot x=p_coord y=neg_log_p data = {'x': x, 'y': y, 'qrs': q_rs, 'q_alle': q_allele, 'q_maf': q_maf, 'prs': p_rs, 'p_alle': p_allele, 'p_maf': p_maf, 'dist': dist, 'r': r2_round, 'd': d_prime_round, 'alleles': corr_alleles, 'regdb': regdb, 'funct': funct, 'p_val': p_val, 'size': size, 'color': color, 'alpha': alpha} source = ColumnDataSource(data) whitespace=0.01 xr=Range1d(start=coord1/1000000.0-whitespace, end=coord2/1000000.0+whitespace) yr=Range1d(start=-0.03, end=max(y)*1.03) sup_2="\u00B2" assoc_plot=figure( title="P-values and Regional LD for "+snp+" in "+pop, min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False, plot_width=900, plot_height=600, x_range=xr, y_range=yr, tools="tap,pan,box_zoom,wheel_zoom,box_select,undo,redo,reset,previewsave", logo=None, toolbar_location="above") assoc_plot.title.align="center" # 
Add recombination rate from LDassoc.py output file filename=tmp_dir+"recomb_"+request+".txt" recomb_raw=open(filename).readlines() recomb_x=[] recomb_y=[] for i in range(len(recomb_raw)): chr,pos,rate=recomb_raw[i].strip().split() recomb_x.append(int(pos)/1000000.0) recomb_y.append(float(rate)/100*max(y)) assoc_plot.line(recomb_x, recomb_y, line_width=1, color="black", alpha=0.5) # Add genome-wide significance a = [coord1/1000000.0-whitespace,coord2/1000000.0+whitespace] b = [-log10(0.00000005),-log10(0.00000005)] assoc_plot.line(a, b, color="blue", alpha=0.5) assoc_points_not1000G=assoc_plot.circle(x='p_plot_posX', y='p_plot_pvalY', size=9+float("0.25")*14.0, source=source_p, line_color="gray", fill_color="white") assoc_points=assoc_plot.circle(x='x', y='y', size='size', color='color', alpha='alpha', source=source) assoc_plot.add_tools(HoverTool(renderers=[assoc_points_not1000G], tooltips=OrderedDict([("Variant", "@p_plot_pos2"), ("P-value", "@p_plot_pval2"), ("Distance (Mb)", "@p_plot_dist")]))) hover=HoverTool(renderers=[assoc_points]) hover.tooltips=OrderedDict([ ("Variant", "@prs @p_alle"), ("P-value", "@p_val"), ("Distance (Mb)", "@dist"), ("MAF", "@p_maf"), ("R"+sup_2+" ("+q_rs[0]+")", "@r"), ("D\' ("+q_rs[0]+")", "@d"), ("Correlated Alleles", "@alleles"), ("RegulomeDB", "@regdb"), ("Functional Class", "@funct"), ]) assoc_plot.add_tools(hover) # Annotate RebulomeDB scores if myargs['annotate']==True: assoc_plot.text(x, y, text=regdb, alpha=1, text_font_size="7pt", text_baseline="middle", text_align="center", angle=0) assoc_plot.yaxis.axis_label="-log10 P-value" assoc_plot.extra_y_ranges = {"y2_axis": Range1d(start=-3, end=103)} assoc_plot.add_layout(LinearAxis(y_range_name="y2_axis", axis_label="Combined Recombination Rate (cM/Mb)"), "right") ## Need to confirm units # Rug Plot y2_ll=[-0.03]*len(x) y2_ul=[1.03]*len(x) yr_rug=Range1d(start=-0.03, end=1.03) data_rug = {'x': x, 'y': y, 'y2_ll': y2_ll, 'y2_ul': y2_ul,'qrs': q_rs, 'q_alle': q_allele, 'q_maf': 
q_maf, 'prs': p_rs, 'p_alle': p_allele, 'p_maf': p_maf, 'dist': dist, 'r': r2_round, 'd': d_prime_round, 'alleles': corr_alleles, 'regdb': regdb, 'funct': funct, 'p_val': p_val, 'size': size, 'color': color, 'alpha': alpha} source_rug = ColumnDataSource(data_rug) rug=figure( x_range=xr, y_range=yr_rug, border_fill_color='white', y_axis_type=None, title="", min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False, plot_width=900, plot_height=50, tools="xpan,tap,wheel_zoom", logo=None) rug.segment(x0='x', y0='y2_ll', x1='x', y1='y2_ul', source=source_rug, color='color', alpha='alpha', line_width=1) rug.toolbar_location=None # Gene Plot (All Transcripts) if myargs['transcript']==True: # Get genes from LDassoc.py output file filename=tmp_dir+"genes_"+request+".txt" genes_raw=open(filename).readlines() genes_plot_start=[] genes_plot_end=[] genes_plot_y=[] genes_plot_name=[] exons_plot_x=[] exons_plot_y=[] exons_plot_w=[] exons_plot_h=[] exons_plot_name=[] exons_plot_id=[] exons_plot_exon=[] message = ["Too many genes to plot."] lines=[0] gap=80000 tall=0.75 if genes_raw!=None: for i in range(len(genes_raw)): bin,name_id,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames=genes_raw[i].strip().split() name=name2 id=name_id e_start=exonStarts.split(",") e_end=exonEnds.split(",") # Determine Y Coordinate i=0 y_coord=None while y_coord==None: if i>len(lines)-1: y_coord=i+1 lines.append(int(txEnd)) elif int(txStart)>(gap+lines[i]): y_coord=i+1 lines[i]=int(txEnd) else: i+=1 genes_plot_start.append(int(txStart)/1000000.0) genes_plot_end.append(int(txEnd)/1000000.0) genes_plot_y.append(y_coord) genes_plot_name.append(name+" ") for i in range(len(e_start)-1): if strand=="+": exon=i+1 else: exon=len(e_start)-1-i width=(int(e_end[i])-int(e_start[i]))/1000000.0 x_coord=int(e_start[i])/1000000.0+(width/2) exons_plot_x.append(x_coord) 
exons_plot_y.append(y_coord) exons_plot_w.append(width) exons_plot_h.append(tall) exons_plot_name.append(name) exons_plot_id.append(id) exons_plot_exon.append(exon) n_rows=len(lines) genes_plot_yn=[n_rows-x+0.5 for x in genes_plot_y] exons_plot_yn=[n_rows-x+0.5 for x in exons_plot_y] yr2=Range1d(start=0, end=n_rows) data_gene_plot = {'exons_plot_x': exons_plot_x, 'exons_plot_yn': exons_plot_yn, 'exons_plot_w': exons_plot_w, 'exons_plot_h': exons_plot_h,'exons_plot_name': exons_plot_name, 'exons_plot_id': exons_plot_id, 'exons_plot_exon': exons_plot_exon} source_gene_plot=ColumnDataSource(data_gene_plot) max_genes = 40 # if len(lines) < 3 or len(genes_raw) > max_genes: if len(lines) < 3: plot_h_pix = 150 else: plot_h_pix = 150 + (len(lines) - 2) * 50 gene_plot = figure(min_border_top=2, min_border_bottom=0, min_border_left=100, min_border_right=5, x_range=xr, y_range=yr2, border_fill_color='white', title="", h_symmetry=False, v_symmetry=False, logo=None, plot_width=900, plot_height=plot_h_pix, tools="hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave") # if len(genes_raw) <= max_genes: gene_plot.segment(genes_plot_start, genes_plot_yn, genes_plot_end, genes_plot_yn, color="black", alpha=1, line_width=2) gene_plot.rect(x='exons_plot_x', y='exons_plot_yn', width='exons_plot_w', height='exons_plot_h', source=source_gene_plot, fill_color="grey", line_color="grey") gene_plot.text(genes_plot_start, genes_plot_yn, text=genes_plot_name, alpha=1, text_font_size="7pt", text_font_style="bold", text_baseline="middle", text_align="right", angle=0) hover = gene_plot.select(dict(type=HoverTool)) hover.tooltips = OrderedDict([ ("Gene", "@exons_plot_name"), ("Transcript ID", "@exons_plot_id"), ("Exon", "@exons_plot_exon"), ]) # else: # x_coord_text = coord1/1000000.0 + (coord2/1000000.0 - coord1/1000000.0) / 2.0 # gene_plot.text(x_coord_text, n_rows / 2.0, text=message, alpha=1, # text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", 
angle=0) gene_plot.xaxis.axis_label = "Chromosome " + chromosome + " Coordinate (Mb)(GRCh37)" gene_plot.yaxis.axis_label = "Genes (All Transcripts)" gene_plot.ygrid.grid_line_color = None gene_plot.yaxis.axis_line_color = None gene_plot.yaxis.minor_tick_line_color = None gene_plot.yaxis.major_tick_line_color = None gene_plot.yaxis.major_label_text_color = None gene_plot.toolbar_location = "below" # Change output backend to SVG temporarily for headless export assoc_plot.output_backend = "svg" rug.output_backend = "svg" gene_plot.output_backend = "svg" export_svgs(assoc_plot, filename=tmp_dir + "assoc_plot_1_" + request + ".svg") export_svgs(gene_plot, filename=tmp_dir + "gene_plot_1_" + request + ".svg") # 1 pixel = 0.0264583333 cm svg_height = str(20.00 + (0.0264583333 * plot_h_pix)) + "cm" svg_height_scaled = str(100.00 + (0.1322916665 * plot_h_pix)) + "cm" # Concatenate svgs sg.Figure("24.59cm", svg_height, sg.SVG(tmp_dir + "assoc_plot_1_" + request + ".svg"), sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").move(-40, 630) ).save(tmp_dir + "assoc_plot_" + request + ".svg") sg.Figure("122.95cm", svg_height_scaled, sg.SVG(tmp_dir + "assoc_plot_1_" + request + ".svg").scale(5), sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(5).move(-200, 3150) ).save(tmp_dir + "assoc_plot_scaled_" + request + ".svg") # Export to PDF subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_" + request + ".svg " + tmp_dir + "assoc_plot_" + request + ".pdf", shell=True) # Export to PNG subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_scaled_" + request + ".svg " + tmp_dir + "assoc_plot_" + request + ".png", shell=True) # Export to JPEG subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_scaled_" + request + ".svg " + tmp_dir + "assoc_plot_" + request + ".jpeg", shell=True) # Remove individual SVG files after they are combined subprocess.call("rm " + tmp_dir + "assoc_plot_1_" + request + ".svg", shell=True) subprocess.call("rm 
" + tmp_dir + "gene_plot_1_" + request + ".svg", shell=True) # Remove scaled SVG file after it is converted to png and jpeg subprocess.call("rm " + tmp_dir + "assoc_plot_scaled_" + request + ".svg", shell=True) # Gene Plot (Collapsed) else: # Get genes from LDassoc.py output file filename_c=tmp_dir+"genes_c_"+request+".txt" genes_c_raw=open(filename_c).readlines() genes_c_plot_start=[] genes_c_plot_end=[] genes_c_plot_y=[] genes_c_plot_name=[] exons_c_plot_x=[] exons_c_plot_y=[] exons_c_plot_w=[] exons_c_plot_h=[] exons_c_plot_name=[] exons_c_plot_id=[] message_c = ["Too many genes to plot."] lines_c=[0] gap=80000 tall=0.75 if genes_c_raw!=None: for i in range(len(genes_c_raw)): chrom,txStart,txEnd,name,exonStarts,exonEnds,transcripts=genes_c_raw[i].strip().split() e_start=exonStarts.split(",") e_end=exonEnds.split(",") e_transcripts=transcripts.split(",") # Determine Y Coordinate i=0 y_coord=None while y_coord==None: if i>len(lines_c)-1: y_coord=i+1 lines_c.append(int(txEnd)) elif int(txStart)>(gap+lines_c[i]): y_coord=i+1 lines_c[i]=int(txEnd) else: i+=1 genes_c_plot_start.append(int(txStart)/1000000.0) genes_c_plot_end.append(int(txEnd)/1000000.0) genes_c_plot_y.append(y_coord) genes_c_plot_name.append(name+" ") for i in range(len(e_start)): width=(int(e_end[i])-int(e_start[i]))/1000000.0 x_coord=int(e_start[i])/1000000.0+(width/2) exons_c_plot_x.append(x_coord) exons_c_plot_y.append(y_coord) exons_c_plot_w.append(width) exons_c_plot_h.append(tall) exons_c_plot_name.append(name) exons_c_plot_id.append(e_transcripts[i].replace("-",",")) n_rows_c=len(lines_c) genes_c_plot_yn=[n_rows_c-x+0.5 for x in genes_c_plot_y] exons_c_plot_yn=[n_rows_c-x+0.5 for x in exons_c_plot_y] yr2_c=Range1d(start=0, end=n_rows_c) data_gene_c_plot = {'exons_c_plot_x': exons_c_plot_x, 'exons_c_plot_yn': exons_c_plot_yn, 'exons_c_plot_w': exons_c_plot_w, 'exons_c_plot_h': exons_c_plot_h, 'exons_c_plot_name': exons_c_plot_name, 'exons_c_plot_id': exons_c_plot_id} 
source_gene_c_plot=ColumnDataSource(data_gene_c_plot) max_genes_c = 40 # if len(lines_c) < 3 or len(genes_c_raw) > max_genes_c: if len(lines_c) < 3: plot_c_h_pix = 150 else: plot_c_h_pix = 150 + (len(lines_c) - 2) * 50 gene_c_plot = figure(min_border_top=2, min_border_bottom=0, min_border_left=100, min_border_right=5, x_range=xr, y_range=yr2_c, border_fill_color='white', title="", h_symmetry=False, v_symmetry=False, logo=None, plot_width=900, plot_height=plot_c_h_pix, tools="hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave") # if len(genes_c_raw) <= max_genes_c: gene_c_plot.segment(genes_c_plot_start, genes_c_plot_yn, genes_c_plot_end, genes_c_plot_yn, color="black", alpha=1, line_width=2) gene_c_plot.rect(x='exons_c_plot_x', y='exons_c_plot_yn', width='exons_c_plot_w', height='exons_c_plot_h', source=source_gene_c_plot, fill_color="grey", line_color="grey") gene_c_plot.text(genes_c_plot_start, genes_c_plot_yn, text=genes_c_plot_name, alpha=1, text_font_size="7pt", text_font_style="bold", text_baseline="middle", text_align="right", angle=0) hover = gene_c_plot.select(dict(type=HoverTool)) hover.tooltips = OrderedDict([ ("Gene", "@exons_c_plot_name"), ("Transcript IDs", "@exons_c_plot_id"), ]) # else: # x_coord_text = coord1/1000000.0 + (coord2/1000000.0 - coord1/1000000.0) / 2.0 # gene_c_plot.text(x_coord_text, n_rows_c / 2.0, text=message_c, alpha=1, # text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0) gene_c_plot.xaxis.axis_label = "Chromosome " + chromosome + " Coordinate (Mb)(GRCh37)" gene_c_plot.yaxis.axis_label = "Genes (Transcripts Collapsed)" gene_c_plot.ygrid.grid_line_color = None gene_c_plot.yaxis.axis_line_color = None gene_c_plot.yaxis.minor_tick_line_color = None gene_c_plot.yaxis.major_tick_line_color = None gene_c_plot.yaxis.major_label_text_color = None gene_c_plot.toolbar_location = "below" # Change output backend to SVG temporarily for headless export assoc_plot.output_backend = 
"svg" rug.output_backend = "svg" gene_c_plot.output_backend = "svg" export_svgs(assoc_plot, filename=tmp_dir + "assoc_plot_1_" + request + ".svg") export_svgs(gene_c_plot, filename=tmp_dir + "gene_plot_1_" + request + ".svg") # 1 pixel = 0.0264583333 cm svg_height = str(20.00 + (0.0264583333 * plot_c_h_pix)) + "cm" svg_height_scaled = str(100.00 + (0.1322916665 * plot_c_h_pix)) + "cm" # Concatenate svgs sg.Figure("24.59cm", svg_height, sg.SVG(tmp_dir + "assoc_plot_1_" + request + ".svg"), sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").move(-40, 630) ).save(tmp_dir + "assoc_plot_" + request + ".svg") sg.Figure("122.95cm", svg_height_scaled, sg.SVG(tmp_dir + "assoc_plot_1_" + request + ".svg").scale(5), sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(5).move(-200, 3150) ).save(tmp_dir + "assoc_plot_scaled_" + request + ".svg") # Export to PDF subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_" + request + ".svg " + tmp_dir + "assoc_plot_" + request + ".pdf", shell=True) # Export to PNG subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_scaled_" + request + ".svg " + tmp_dir + "assoc_plot_" + request + ".png", shell=True) # Export to JPEG subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "assoc_plot_scaled_" + request + ".svg " + tmp_dir + "assoc_plot_" + request + ".jpeg", shell=True) # Remove individual SVG files after they are combined subprocess.call("rm " + tmp_dir + "assoc_plot_1_" + request + ".svg", shell=True) subprocess.call("rm " + tmp_dir + "gene_plot_1_" + request + ".svg", shell=True) # Remove scaled SVG file after it is converted to png and jpeg subprocess.call("rm " + tmp_dir + "assoc_plot_scaled_" + request + ".svg", shell=True) reset_output() # Remove temporary files subprocess.call("rm "+tmp_dir+"pops_"+request+".txt", shell=True) subprocess.call("rm "+tmp_dir+"*"+request+"*.vcf", shell=True) subprocess.call("rm "+tmp_dir+"genes_*"+request+"*.txt", shell=True) subprocess.call("rm 
"+tmp_dir+"recomb_"+request+".txt", shell=True) subprocess.call("rm "+tmp_dir+"assoc_args"+request+".json", shell=True) print("Bokeh high quality image export complete!") # Return plot output return None
def calculate_proxy_svg(snp, pop, request, r2_d="r2"):
    """Build the LDproxy regional plot for ``snp`` and export it as image files.

    The function recomputes proxy LD statistics in a +/-500 kb window around
    the query variant (by launching ``LDproxy_sub.py`` workers), draws the
    proxy scatter plot and the gene track with Bokeh, stitches both SVGs
    together and rasterizes the result with phantomjs.

    Args:
        snp: Query variant, either an rsID ("rs...") or a genomic coordinate
            of the form "chr<chrom>:<pos>"; coordinates are mapped to an rsID
            through the dbsnp151 collection when a match exists.
        pop: Population label; only used in the plot title here.
        request: Token embedded in every temporary/output file name. When it
            is ``False`` a timestamp string is generated instead.
        r2_d: "r2" to plot/sort by R2, "d" to plot/sort by D'. Any other
            value falls back to "r2".

    Returns:
        Always ``None``. On success ``./tmp/proxy_plot_<request>.svg``,
        ``.pdf``, ``.png`` and ``.jpeg`` have been written. The function
        returns early (after removing its temporary inputs) when the variant
        is not found in the database or VCF, is multi-allelic, or is
        monoallelic in the selected samples.

    Notes:
        Expects ``pops_<request>.txt``, ``recomb_<request>.txt`` and
        ``genes_<request>.txt`` to already exist in ``./tmp/`` (the original
        comments say they are produced by LDproxy.py); they are deleted at
        the end.
    """
    # Set data directories using config.yml.  safe_load is used because
    # yaml.load() without an explicit Loader is deprecated/rejected by
    # current PyYAML and the config only needs plain YAML types.
    with open('config.yml', 'r') as f:
        config = yaml.safe_load(f)
    vcf_dir = config['data']['vcf_dir']
    tmp_dir = "./tmp/"

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    if request is False:
        request = str(time.strftime("%I%M%S"))

    def remove_query_tmp_files():
        """Delete the per-request population list and VCF extracts."""
        subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True)
        subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True)

    # Find coordinates (GRCh37/hg19) for SNP RS number
    # Connect to Mongo snp database
    # NOTE(review): username, password and port are not defined in this
    # function; presumably module-level names -- confirm.
    client = MongoClient(
        'mongodb://' + username + ':' + password + '@localhost/admin', port)
    db = client["LDLink"]

    def get_coords(db, rsid):
        """Return the dbsnp151 record for an rsID, or None when absent."""
        rsid = rsid.strip("rs")
        query_results = db.dbsnp151.find_one({"id": rsid})
        return json.loads(json_util.dumps(query_results))

    # Query genomic coordinates
    def get_rsnum(db, coord):
        """Return the list of dbsnp151 records located at "chr<c>:<pos>"."""
        temp_coord = coord.strip("chr").split(":")
        chro = temp_coord[0]
        pos = temp_coord[1]
        query_results = db.dbsnp151.find({
            "chromosome": chro.upper() if chro == 'x' or chro == 'y' else chro,
            "position": pos
        })
        return json.loads(json_util.dumps(query_results))

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coord_rsid(db, snp):
        """Map a coordinate query to an rsID; leave it unchanged if unmapped."""
        if snp[0:2] == "rs":
            return snp
        snp_info_lst = get_rsnum(db, snp)
        print("snp_info_lst")
        print(snp_info_lst)
        if not snp_info_lst:
            return snp
        if len(snp_info_lst) == 1:
            return "rs" + snp_info_lst[0]['id']
        # Several records share the position: prefer one that is its own
        # reference id, otherwise fall back to the first record.
        ref_variants = [
            info['id'] for info in snp_info_lst if info['id'] == info['ref_id']
        ]
        if ref_variants:
            return "rs" + ref_variants[0]
        return "rs" + snp_info_lst[0]['id']

    snp = replace_coord_rsid(db, snp)

    # Find RS number in snp database
    snp_coord = get_coords(db, snp)
    if snp_coord is None:
        # Unknown variant: bail out like the other "nothing to plot" paths
        # instead of failing on snp_coord['chromosome'] below.
        remove_query_tmp_files()
        return None

    # Get population ids from LDproxy.py tmp output files
    with open(tmp_dir + "pops_" + request + ".txt") as pop_file:
        pop_ids = set(line.strip() for line in pop_file)

    # Extract query SNP phased genotypes
    vcf_file = vcf_dir + \
        snp_coord['chromosome'] + \
        ".phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz"

    # universal_newlines=True makes the pipe yield text, so the sample ids
    # compare equal to the (text) ids read from the pops file.
    tabix_snp_h = "tabix -H {0} | grep CHROM".format(vcf_file)
    proc_h = subprocess.Popen(tabix_snp_h, shell=True,
                              stdout=subprocess.PIPE,
                              universal_newlines=True)
    head = proc_h.communicate()[0].splitlines()[0].strip().split()

    snp_vcf_path = tmp_dir + "snp_no_dups_" + request + ".vcf"
    tabix_snp = "tabix {0} {1}:{2}-{2} | grep -v -e END > {3}".format(
        vcf_file, snp_coord['chromosome'], snp_coord['position'],
        snp_vcf_path)
    subprocess.call(tabix_snp, shell=True)

    # Check SNP is in the 1000G population, has the correct RS number, and not
    # monoallelic
    with open(snp_vcf_path) as vcf_fh:
        vcf = vcf_fh.readlines()

    if len(vcf) == 0:
        remove_query_tmp_files()
        return None
    elif len(vcf) > 1:
        # Several records at this position: keep the one whose ID column
        # matches the query (the last match wins, as before).
        geno = []
        for vcf_line in vcf:
            fields = vcf_line.strip().split()
            if fields[2] == snp:
                geno = fields
        if geno == []:
            remove_query_tmp_files()
            return None
    else:
        geno = vcf[0].strip().split()

    if geno[2] != snp:
        snp = geno[2]

    # Multi-allelic records are not plotted
    if "," in geno[3] or "," in geno[4]:
        remove_query_tmp_files()
        return None

    # Columns 9+ of the VCF header are sample ids
    index = [i for i in range(9, len(head)) if head[i] in pop_ids]

    genotypes = {"0": 0, "1": 0}
    for i in index:
        for allele in geno[i].split("|"):
            genotypes[allele] = genotypes.get(allele, 0) + 1

    if genotypes["0"] == 0 or genotypes["1"] == 0:
        remove_query_tmp_files()
        return None

    # Define window of interest around query SNP
    window = 500000
    coord1 = max(int(snp_coord['position']) - window, 0)
    coord2 = int(snp_coord['position']) + window

    # Calculate proxy LD statistics in parallel
    threads = 4
    # Floor division keeps the sub-window bounds integral on Python 3.
    block = (2 * window) // threads
    processes = []
    for i in range(threads):
        # First chunk starts at coord1, last chunk ends at coord2; interior
        # boundaries are offset by one so chunks do not overlap.
        start = coord1 if i == 0 else coord1 + (block * i) + 1
        end = coord2 if i == threads - 1 else coord1 + (block * (i + 1))
        # Argument list (no shell) so the query id cannot be interpreted as
        # shell syntax.
        command = [
            "python", "LDproxy_sub.py", "True", snp,
            snp_coord['chromosome'], str(start), str(end), request, str(i)
        ]
        processes.append(
            subprocess.Popen(command, stdout=subprocess.PIPE,
                             universal_newlines=True))

    # collect output in parallel
    def get_output(process):
        return process.communicate()[0].splitlines()

    # NOTE(review): presumably a workaround so a thread-based Pool can be
    # created from a non-main thread -- confirm which Pool is imported.
    if not hasattr(threading.current_thread(), "_children"):
        threading.current_thread()._children = weakref.WeakKeyDictionary()

    pool = Pool(len(processes))
    out_raw = pool.map(get_output, processes)
    pool.close()
    pool.join()

    # Aggregate output
    out_prox = []
    for chunk in out_raw:
        for line in chunk:
            col = line.strip().split("\t")
            col[6] = int(col[6])
            col[7] = float(col[7])
            col[8] = float(col[8])
            # Extra trailing column (index 14): absolute distance
            col.append(abs(int(col[6])))
            out_prox.append(col)

    # Sort output: by absolute distance first, then (stable) by the chosen
    # LD statistic in descending order.
    if r2_d not in ["r2", "d"]:
        r2_d = "r2"

    out_dist_sort = sorted(out_prox, key=operator.itemgetter(14))
    ld_column = 8 if r2_d == "r2" else 7
    out_ld_sort = sorted(out_dist_sort,
                         key=operator.itemgetter(ld_column),
                         reverse=True)

    # Organize scatter plot data
    q_rs = []
    q_allele = []
    q_maf = []
    p_rs = []
    p_allele = []
    p_coord = []
    p_maf = []
    dist = []
    d_prime = []
    d_prime_round = []
    r2 = []
    r2_round = []
    corr_alleles = []
    regdb = []
    funct = []
    color = []
    size = []
    for i, row in enumerate(out_ld_sort):
        (q_rs_i, q_allele_i, q_coord_i, p_rs_i, p_allele_i, p_coord_i,
         dist_i, d_prime_i, r2_i, corr_alleles_i, regdb_i, q_maf_i, p_maf_i,
         funct_i, _dist_abs) = row

        if float(r2_i) > 0.01:
            q_rs.append(q_rs_i)
            q_allele.append(q_allele_i)
            q_maf.append(str(round(float(q_maf_i), 4)))
            if p_rs_i == ".":
                p_rs_i = p_coord_i
            p_rs.append(p_rs_i)
            p_allele.append(p_allele_i)
            p_coord.append(float(p_coord_i.split(":")[1]) / 1000000)
            p_maf.append(str(round(float(p_maf_i), 4)))
            dist.append(str(round(dist_i / 1000000.0, 4)))
            d_prime.append(float(d_prime_i))
            d_prime_round.append(str(round(float(d_prime_i), 4)))
            r2.append(float(r2_i))
            r2_round.append(str(round(float(r2_i), 4)))
            corr_alleles.append(corr_alleles_i)

            # Correct Missing Annotations
            if regdb_i == ".":
                regdb_i = ""
            regdb.append(regdb_i)
            if funct_i == ".":
                funct_i = ""
            if funct_i == "NA":
                funct_i = "none"
            funct.append(funct_i)

            # Set Color: top-ranked row is blue, annotated variants red,
            # everything else orange.
            if i == 0:
                color_i = "blue"
            elif funct_i != "none" and funct_i != "":
                color_i = "red"
            else:
                color_i = "orange"
            color.append(color_i)

            # Set Size (scaled by proxy MAF)
            size_i = 9 + float(p_maf_i) * 14.0
            size.append(size_i)

    # Begin Bokeh Plotting
    from collections import OrderedDict
    from bokeh.models import HoverTool, LinearAxis, Range1d
    from bokeh.plotting import ColumnDataSource, figure, reset_output
    from bokeh.io import export_svgs
    import svgutils.compose as sg

    reset_output()

    # Proxy Plot
    x = p_coord
    if r2_d == "r2":
        y = r2
    else:
        y = d_prime
    whitespace = 0.01
    xr = Range1d(start=coord1 / 1000000.0 - whitespace,
                 end=coord2 / 1000000.0 + whitespace)
    yr = Range1d(start=-0.03, end=1.03)
    sup_2 = u"\u00B2"

    proxy_plot = figure(
        title="Proxies for " + snp + " in " + pop,
        min_border_top=2,
        min_border_bottom=2,
        min_border_left=60,
        min_border_right=60,
        h_symmetry=False,
        v_symmetry=False,
        plot_width=900,
        plot_height=600,
        x_range=xr,
        y_range=yr,
        tools="hover,tap,pan,box_zoom,box_select,undo,redo,reset,previewsave",
        logo=None,
        toolbar_location="above")

    proxy_plot.title.align = "center"

    # Get recomb from LDproxy.py tmp output files
    recomb_x = []
    recomb_y = []
    with open(tmp_dir + "recomb_" + request + ".txt") as recomb_file:
        for recomb_line in recomb_file:
            _recomb_chrom, pos, rate = recomb_line.strip().split()
            recomb_x.append(int(pos) / 1000000.0)
            # Divided by 100 so the rate shares the plot's primary y range
            recomb_y.append(float(rate) / 100.0)

    data = {
        'x': x,
        'y': y,
        'qrs': q_rs,
        'q_alle': q_allele,
        'q_maf': q_maf,
        'prs': p_rs,
        'p_alle': p_allele,
        'p_maf': p_maf,
        'dist': dist,
        'r': r2_round,
        'd': d_prime_round,
        'alleles': corr_alleles,
        'regdb': regdb,
        'funct': funct,
        'size': size,
        'color': color
    }
    source = ColumnDataSource(data)

    proxy_plot.line(recomb_x, recomb_y, line_width=1, color="black", alpha=0.5)

    proxy_plot.circle(x='x', y='y', size='size', color='color', alpha=0.5,
                      source=source)

    hover = proxy_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Query Variant", "@qrs @q_alle"),
        ("Proxy Variant", "@prs @p_alle"),
        ("Distance (Mb)", "@dist"),
        ("MAF (Query,Proxy)", "@q_maf,@p_maf"),
        ("R" + sup_2, "@r"),
        ("D'", "@d"),
        ("Correlated Alleles", "@alleles"),
        ("RegulomeDB", "@regdb"),
        ("Functional Class", "@funct"),
    ])

    proxy_plot.text(x, y, text=regdb, alpha=1, text_font_size="7pt",
                    text_baseline="middle", text_align="center", angle=0)

    if r2_d == "r2":
        proxy_plot.yaxis.axis_label = "R" + sup_2
    else:
        proxy_plot.yaxis.axis_label = "D'"

    proxy_plot.extra_y_ranges = {"y2_axis": Range1d(start=-3, end=103)}
    proxy_plot.add_layout(
        LinearAxis(y_range_name="y2_axis",
                   axis_label="Combined Recombination Rate (cM/Mb)"),
        "right")

    # Rug Plot (configured for SVG below but, as before, not exported)
    y2_ll = [-0.03] * len(x)
    y2_ul = [1.03] * len(x)
    yr_rug = Range1d(start=-0.03, end=1.03)
    data_rug = {
        'x': x,
        'y': y,
        'y2_ll': y2_ll,
        'y2_ul': y2_ul,
        'qrs': q_rs,
        'q_alle': q_allele,
        'q_maf': q_maf,
        'prs': p_rs,
        'p_alle': p_allele,
        'p_maf': p_maf,
        'dist': dist,
        'r': r2_round,
        'd': d_prime_round,
        'alleles': corr_alleles,
        'regdb': regdb,
        'funct': funct,
        'size': size,
        'color': color
    }
    source_rug = ColumnDataSource(data_rug)

    rug = figure(x_range=xr, y_range=yr_rug, border_fill_color='white',
                 y_axis_type=None, title="", min_border_top=2,
                 min_border_bottom=2, min_border_left=60, min_border_right=60,
                 h_symmetry=False, v_symmetry=False, plot_width=900,
                 plot_height=50, tools="xpan,tap", logo=None)

    rug.segment(x0='x', y0='y2_ll', x1='x', y1='y2_ul', source=source_rug,
                color='color', alpha=0.5, line_width=1)
    rug.toolbar_location = None

    # Gene Plot
    # Get genes from LDproxy.py tmp output files
    with open(tmp_dir + "genes_" + request + ".txt") as genes_file:
        genes_raw = genes_file.readlines()

    genes_plot_start = []
    genes_plot_end = []
    genes_plot_y = []
    genes_plot_name = []
    exons_plot_x = []
    exons_plot_y = []
    exons_plot_w = []
    exons_plot_h = []
    exons_plot_name = []
    exons_plot_id = []
    exons_plot_exon = []
    # lines[k] holds the end coordinate of the last gene placed on row k
    lines = [0]
    gap = 80000
    tall = 0.75
    for gene_line in genes_raw:
        (_bin, name_id, _chrom, strand, txStart, txEnd, _cdsStart, _cdsEnd,
         _exonCount, exonStarts, exonEnds, _score, name2, _cdsStartStat,
         _cdsEndStat, _exonFrames) = gene_line.strip().split()
        name = name2
        transcript_id = name_id
        e_start = exonStarts.split(",")
        e_end = exonEnds.split(",")

        # Determine Y Coordinate: first row whose last gene ends more than
        # `gap` bases before this gene starts, else open a new row.
        row_idx = 0
        y_coord = None
        while y_coord is None:
            if row_idx > len(lines) - 1:
                y_coord = row_idx + 1
                lines.append(int(txEnd))
            elif int(txStart) > (gap + lines[row_idx]):
                y_coord = row_idx + 1
                lines[row_idx] = int(txEnd)
            else:
                row_idx += 1

        genes_plot_start.append(int(txStart) / 1000000.0)
        genes_plot_end.append(int(txEnd) / 1000000.0)
        genes_plot_y.append(y_coord)
        genes_plot_name.append(name + " ")

        # The last element of e_start is skipped (range stops at len - 1)
        n_exons = len(e_start) - 1
        for exon_idx in range(n_exons):
            if strand == "+":
                exon = exon_idx + 1
            else:
                exon = n_exons - exon_idx

            width = (int(e_end[exon_idx]) - int(e_start[exon_idx])) / 1000000.0
            x_coord = int(e_start[exon_idx]) / 1000000.0 + (width / 2)

            exons_plot_x.append(x_coord)
            exons_plot_y.append(y_coord)
            exons_plot_w.append(width)
            exons_plot_h.append(tall)
            exons_plot_name.append(name)
            exons_plot_id.append(transcript_id)
            exons_plot_exon.append(exon)

    n_rows = len(lines)
    genes_plot_yn = [n_rows - row + 0.5 for row in genes_plot_y]
    exons_plot_yn = [n_rows - row + 0.5 for row in exons_plot_y]
    yr2 = Range1d(start=0, end=n_rows)

    data_gene_plot = {
        'exons_plot_x': exons_plot_x,
        'exons_plot_yn': exons_plot_yn,
        'exons_plot_w': exons_plot_w,
        'exons_plot_h': exons_plot_h,
        'exons_plot_name': exons_plot_name,
        'exons_plot_id': exons_plot_id,
        'exons_plot_exon': exons_plot_exon
    }
    source_gene_plot = ColumnDataSource(data_gene_plot)

    if len(lines) < 3:
        plot_h_pix = 150
    else:
        plot_h_pix = 150 + (len(lines) - 2) * 50

    gene_plot = figure(
        x_range=xr,
        y_range=yr2,
        border_fill_color='white',
        title="",
        min_border_top=2,
        min_border_bottom=2,
        min_border_left=60,
        min_border_right=60,
        h_symmetry=False,
        v_symmetry=False,
        plot_width=900,
        plot_height=plot_h_pix,
        tools="hover,tap,xpan,box_zoom,undo,redo,reset,previewsave",
        logo=None)

    gene_plot.segment(genes_plot_start, genes_plot_yn, genes_plot_end,
                      genes_plot_yn, color="black", alpha=1, line_width=2)

    gene_plot.rect(x='exons_plot_x', y='exons_plot_yn', width='exons_plot_w',
                   height='exons_plot_h', source=source_gene_plot,
                   fill_color="grey", line_color="grey")
    gene_plot.xaxis.axis_label = "Chromosome " + \
        snp_coord['chromosome'] + " Coordinate (Mb)(GRCh37)"
    gene_plot.yaxis.axis_label = "Genes"
    gene_plot.ygrid.grid_line_color = None
    gene_plot.yaxis.axis_line_color = None
    gene_plot.yaxis.minor_tick_line_color = None
    gene_plot.yaxis.major_tick_line_color = None
    gene_plot.yaxis.major_label_text_color = None

    hover = gene_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Gene", "@exons_plot_name"),
        ("ID", "@exons_plot_id"),
        ("Exon", "@exons_plot_exon"),
    ])

    gene_plot.text(genes_plot_start, genes_plot_yn, text=genes_plot_name,
                   alpha=1, text_font_size="7pt", text_font_style="bold",
                   text_baseline="middle", text_align="right", angle=0)

    gene_plot.toolbar_location = "below"

    # Change output backend to SVG temporarily for headless export
    # Will be changed back to canvas in LDlink.js
    proxy_plot.output_backend = "svg"
    rug.output_backend = "svg"
    gene_plot.output_backend = "svg"

    proxy_part_svg = tmp_dir + "proxy_plot_1_" + request + ".svg"
    gene_part_svg = tmp_dir + "gene_plot_1_" + request + ".svg"
    combined_svg = tmp_dir + "proxy_plot_" + request + ".svg"
    scaled_svg = tmp_dir + "proxy_plot_scaled_" + request + ".svg"

    export_svgs(proxy_plot, filename=proxy_part_svg)
    export_svgs(gene_plot, filename=gene_part_svg)

    # 1 pixel = 0.0264583333 cm
    svg_height = str(20.00 + (0.0264583333 * plot_h_pix)) + "cm"
    svg_height_scaled = str(100.00 + (0.1322916665 * plot_h_pix)) + "cm"

    # Concatenate svgs (gene track placed below the proxy plot)
    sg.Figure("24.59cm", svg_height,
              sg.SVG(proxy_part_svg),
              sg.SVG(gene_part_svg).move(0, 630)).save(combined_svg)

    # 5x scaled copy used as the source of the raster formats
    sg.Figure("122.95cm", svg_height_scaled,
              sg.SVG(proxy_part_svg).scale(5),
              sg.SVG(gene_part_svg).scale(5).move(0, 3150)).save(scaled_svg)

    # Export to PDF
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "proxy_plot_" +
                    request + ".svg " + tmp_dir + "proxy_plot_" + request +
                    ".pdf", shell=True)
    # Export to PNG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                    "proxy_plot_scaled_" + request + ".svg " + tmp_dir +
                    "proxy_plot_" + request + ".png", shell=True)
    # Export to JPEG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                    "proxy_plot_scaled_" + request + ".svg " + tmp_dir +
                    "proxy_plot_" + request + ".jpeg", shell=True)

    # Remove individual SVG files after they are combined
    subprocess.call("rm " + tmp_dir + "proxy_plot_1_" + request + ".svg",
                    shell=True)
    subprocess.call("rm " + tmp_dir + "gene_plot_1_" + request + ".svg",
                    shell=True)
    # Remove scaled SVG file after it is converted to png and jpeg
    subprocess.call("rm " + tmp_dir + "proxy_plot_scaled_" + request + ".svg",
                    shell=True)

    reset_output()

    # Remove temporary files
    remove_query_tmp_files()
    subprocess.call("rm " + tmp_dir + "genes_" + request + ".txt", shell=True)
    subprocess.call("rm " + tmp_dir + "recomb_" + request + ".txt", shell=True)

    # Return plot output
    return None
def _sorted_fragment_columns(old_columns):
    """Pair every matrix column with its display label, ordered by sort key.

    ``'Total'`` keeps its name and gets a large sentinel key so it sorts last.
    Any other label is split on ``'-'``:

    * two parts   -> parts joined without separator, key = last part as float
    * three parts -> last two parts joined by ``'-'``, key = mean of the last
      two parts as floats

    Args:
        old_columns: iterable of column labels (strings).

    Returns:
        List of ``(new_label, old_label)`` tuples in ascending key order.

    Raises:
        ValueError: if a label is neither ``'Total'`` nor made of two or three
            ``'-'``-separated parts. Silently skipping such a label would
            shift every following column onto the wrong name.
    """
    entries = []
    for old_label in old_columns:
        if old_label == 'Total':
            entries.append((100000, 'Total', old_label))
            continue
        parts = old_label.split('-')
        if len(parts) == 2:
            entries.append((float(parts[-1]), ''.join(parts), old_label))
        elif len(parts) == 3:
            midpoint = 0.5 * (float(parts[-1]) + float(parts[-2]))
            entries.append((midpoint, '-'.join(parts[1:]), old_label))
        else:
            raise ValueError(
                'Unexpected column label %r: expected "Total" or 2-3 '
                '"-"-separated fields' % (old_label,))
    # Sort old and new labels together so they can never get out of step,
    # even when two columns share the same sort key.
    entries.sort()
    return [(new_label, old_label) for _, new_label, old_label in entries]


def fsapt_analyze(lig_dir, mode, ene_type):
    """Average the FSAPT energy matrices of one ligand and build a summary figure.

    Steps, in order:

    1. Parse every ``FSAPT*out`` file in ``lig_dir`` with ``_get_ene_matrix``
       and compute the element-wise mean and standard deviation.
    2. For ``mode`` ``'prolig'`` / ``'proliglig'``, rename and reorder the
       columns (see ``_sorted_fragment_columns``).
    3. Plot the annotated matrix via ``plot_matrix`` and write the mean/std
       tables as CSV files into ``lig_dir``.
    4. Draw the ligand read from ``MD/<lig_name>/cmp_sybyl.mol2`` (path is
       relative to the current working directory) with per-atom fragment
       labels.
    5. Stack both panels into ``<lig_dir>/<lig_name>_<mode>_<ene_type>.svg``,
       convert it to PDF with the external ``convert`` command, delete the
       two intermediate SVGs and write a PDB through ``write_pymol_pdb``.

    Args:
        lig_dir: directory holding the ``FSAPT*out`` files; its base name is
            used as the ligand name.
        mode: analysis mode; ``'prolig'`` and ``'proliglig'`` trigger column
            relabelling, and only ``'prolig'`` gets the protein-ligand title.
        ene_type: energy component name, forwarded to ``_get_ene_matrix`` and
            ``plot_matrix`` and embedded in output file names.

    Returns:
        None. All results are written to disk.

    Raises:
        ValueError: if no ``FSAPT*out`` file yields an energy matrix, or if a
            column label has an unexpected format during relabelling.
    """
    import shlex  # local import: only needed to quote paths for the shell

    lig_name = os.path.basename(os.path.abspath(lig_dir))

    matrix_dfs = []
    for outfile in glob('%s/FSAPT*out' % lig_dir):
        df = _get_ene_matrix(outfile, ene_type)
        if df is not None:
            matrix_dfs.append(df)
    if not matrix_dfs:
        # pd.concat would fail with an opaque message; say what is missing.
        raise ValueError('No usable FSAPT*out files found in %s' % lig_dir)

    all_df = pd.concat(matrix_dfs, axis=1)
    stacked = all_df.stack()
    mean_df = stacked.groupby(level=[0, 1]).mean().unstack()
    std_df = stacked.groupby(level=[0, 1]).std().unstack()

    if mode in ['prolig', 'proliglig']:
        new_mean_df = pd.DataFrame()
        new_std_df = pd.DataFrame()
        for new_col, old_col in _sorted_fragment_columns(mean_df.columns):
            new_mean_df[new_col] = mean_df[old_col]
            new_std_df[new_col] = std_df[old_col]
        mean_df = new_mean_df
        std_df = new_std_df

    # Cell annotation: signed mean on the first line, "+/-std" on the second.
    mean_anno = mean_df.applymap(lambda x: '%+.2f\n' % x)
    std_anno = std_df.applymap(lambda x: r'+/-%.2f' % x)
    all_anno = mean_anno + std_anno
    matrix_svg = '%s/ene_matrix_%s.svg' % (lig_dir, ene_type)
    plot_matrix(mean_df, all_anno, matrix_svg, mode, ene_type)
    mean_df.to_csv('%s/ene_mean_%s_%s_%s.csv'
                   % (lig_dir, lig_name, mode, ene_type))
    std_df.to_csv('%s/ene_std_%s_%s_%s.csv'
                  % (lig_dir, lig_name, mode, ene_type))

    # Plot the ligand
    dpi = 96
    width = len(mean_df.columns) + 2
    height = 4
    ligmol = cs._RdkitMolBase.from_file('MD/%s/cmp_sybyl.mol2' % lig_name)
    ligmol._init_atominfo(reset=False)
    ligmol.charged_mol2file = 'MD/%s/cmp_sybyl.mol2' % lig_name
    ligmol.get_noh_mol()
    AllChem.Compute2DCoords(ligmol.noh_mol, canonOrient=True, bondLength=1.5)
    drawer = rdMolDraw2D.MolDraw2DSVG(width * dpi, height * dpi)
    opts = drawer.drawOptions()
    opts.additionalAtomLabelPadding = 0.1
    frag_dict, _ = fragment_mol(ligmol, 'L1')
    for noha in ligmol.noh_mol.GetAtoms():
        noh_idx = noha.GetIdx()
        h_idx = ligmol.noh_to_h_atom_mapping[noh_idx]
        frag_label = str(frag_dict[h_idx]['resid'])
        # Only label atoms whose fragment is a row of the energy matrix.
        if 'L1-%02d' % int(frag_label) not in mean_df.index:
            continue
        if noha.GetAtomicNum() == 6:
            # Carbon: fragment number only.
            opts.atomLabels[noh_idx] = '%02d' % int(frag_label)
        else:
            # Other elements: prefix taken from the Tripos atom type.
            elem = ligmol.GetAtomWithIdx(h_idx).GetProp(
                '_TriposAtomType').split('.')[0]
            opts.atomLabels[noh_idx] = '%s/%02d' % (elem, int(frag_label))
    drawer.DrawMolecule(ligmol.noh_mol)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText().replace('svg:', '')
    struct_svg = '%s/lig_frag_%s.svg' % (lig_dir, ene_type)
    with open(struct_svg, 'w') as fh:
        fh.write(svg)

    # Consolidate the panels
    if mode == 'prolig':
        mat_title = 'Protein-Ligand %s Interaction' % ene_type.capitalize()
    else:
        mat_title = 'Ligand-Ligand %s Interaction' % ene_type.capitalize()
    mat_title = sc.Panel(sc.Text(mat_title, size=24)).move(20, 20)
    mat_panel = sc.Panel(sc.SVG(matrix_svg).scale(1.4)).move(0, 20)
    # The structure panel sits below the matrix: one dpi unit per matrix row.
    struct_offset = dpi * len(mean_df) + 20
    struct_title = sc.Panel(
        sc.Text('Ligand %s' % lig_name, size=24)).move(20, struct_offset)
    struct_panel = sc.Panel(sc.SVG(struct_svg)).move(0, struct_offset)
    final_figure = sc.Figure(dpi * width,
                             dpi * (len(mean_df) + height) + 40,
                             mat_panel, mat_title, struct_panel, struct_title)
    final_name = '%s/%s_%s_%s' % (lig_dir, lig_name, mode, ene_type)
    final_figure.save('%s.svg' % final_name)

    # Quote every path so directories containing spaces or shell
    # metacharacters do not break (or hijack) the commands.
    os.system('convert -density 100 %s %s'
              % (shlex.quote('%s.svg' % final_name),
                 shlex.quote('%s.pdf' % final_name)))
    os.system('rm -f %s %s'
              % (shlex.quote(matrix_svg), shlex.quote(struct_svg)))

    # Write pdb for pymol
    inpdb = '%s/frame0/fsapt.pdb' % lig_dir
    outpdb = '%s_pymol.pdb' % final_name
    write_pymol_pdb(inpdb, outpdb, mean_df)
import svgutils.compose as cg
from tqdm import tqdm

# For each grid size c, tile c*c copies of img/cpu.svg into a c-by-c layout
# and save it as img/cpugrids/cpu1-<c>.svg. The figure is square, with side
# length 16*c/12 cm.
for c in tqdm([1, 2, 4, 9, 18]):
    wh = "%scm" % (16 * c / 12)
    # Each tile is loaded separately from disk, exactly c*c times.
    tiles = [cg.SVG('img/cpu.svg').scale(3) for __ in range(c * c)]
    grid = cg.Figure(wh, wh, *tiles).tile(c, c)
    grid.save("img/cpugrids/cpu1-" + str(c) + ".svg")
def put_list_of_figs_to_svg_fig( FIGS, fig_name="fig.svg", initial_guess=True, visualize=False, export_as_png=False, Props=None, figsize=None, fontsize=9, SCALING_FACTOR=1.34, # needed to get the right cm size ... with_top_left_letter=False, transparent=True): """ take a list of figures and make a multi panel plot""" label = list(string.ascii_uppercase)[:len(FIGS)] SIZE = [] for fig in FIGS: if type(fig) == str: SIZE.append([1., 1.]) else: SIZE.append(fig.get_size_inches()) width = np.max([s[0] for s in SIZE]) height = np.max([s[1] for s in SIZE]) if Props is None: LABELS, XCOORD, YCOORD = [], [], [] # saving as svg for i in range(len(FIGS)): LABELS.append(label[i]) XCOORD.append((i % 3) * width * 100) YCOORD.append(int(i / 3) * height * 100) XCOORD_LABELS,\ YCOORD_LABELS = XCOORD, YCOORD else: XCOORD, YCOORD = Props['XCOORD'],\ Props['YCOORD'], if 'LABELS' in Props: LABELS = Props['LABELS'] else: LABELS = ['' for x in XCOORD] if 'XCOORD_LABELS' in Props: XCOORD_LABELS,\ YCOORD_LABELS = Props['XCOORD_LABELS'],\ Props['YCOORD_LABELS'] else: XCOORD_LABELS,\ YCOORD_LABELS = XCOORD, YCOORD LOCATIONS = [] for i in range(len(FIGS)): if type(FIGS[i]) is str: LOCATIONS.append(FIGS[i]) else: LOCATIONS.append(os.path.join(gettempdir(), str(i) + '.svg')) FIGS[i].savefig(LOCATIONS[-1], format='svg', transparent=transparent) PANELS = [] for i in range(len(FIGS)): PANELS.append(sg.Panel(\ sg.SVG(LOCATIONS[i]).move(XCOORD[i],YCOORD[i]))) for i in range(len(LABELS)): PANELS.append(sg.Panel(\ sg.Text(LABELS[i], 15, 10, size=fontsize, weight='bold').move(\ XCOORD_LABELS[i],YCOORD_LABELS[i]))\ ) sg.Figure("21cm", "29.7cm", *PANELS).scale(SCALING_FACTOR).save(fig_name) # if figsize is None: # sg.Figure("21cm", "29.7cm", *PANELS).save(fig_name) # else: # sg.Figure(str(inch2cm(figsize[0]*A0_format['width'])[0])+"cm",\ # str(inch2cm(figsize[1]*A0_format['height'])[0])+"cm",\ # *PANELS).scale(SCALING_FACTOR).save(fig_name) if visualize: os.system('open ' + fig_name) # works well with 
'Gapplin' on OS-X