def panel4x4(fn0, fn1, fn2, fn3, fn_out):
    """Arrange four SVG files in a 2x2 grid labelled A-D and save the result.

    The canvas is ``plot_config.single_figure_size`` scaled by 150 px per
    unit; each input file is placed at the origin of one quadrant.

    Parameters
    ----------
    fn0, fn1, fn2, fn3
        Paths of the SVG files for the top-left, top-right, bottom-left and
        bottom-right quadrants, in that order.
    fn_out
        Path the composed SVG is written to.
    """
    single_size = plot_config.single_figure_size
    canvas_w = single_size[0] * 150
    canvas_h = single_size[1] * 150
    mid_x = canvas_w / 2.
    mid_y = canvas_h / 2.
    label_kwargs = dict(size=plot_config.fontsize_xhuge, weight='bold')

    elements = [
        sc.Panel(sc.SVG(fn0).scale(1.0).move(0, 0)),
        sc.Text("A", 5, 25, **label_kwargs),
        sc.Panel(sc.SVG(fn1).scale(1.0).move(mid_x, 0)),
        # the "B" label divides by the integer 2, exactly as before
        sc.Text("B", canvas_w / 2 + 5., 25, **label_kwargs),
        sc.Panel(sc.SVG(fn2).scale(1.0).move(0, mid_y)),
        sc.Text("C", 5, mid_y + 25, **label_kwargs),
        sc.Panel(sc.SVG(fn3).scale(1.0).move(mid_x, mid_y)),
        sc.Text("D", mid_x + 5, mid_y + 25, **label_kwargs),
    ]

    figure = sc.Figure(str(canvas_w) + "px", str(canvas_h) + "px", *elements)
    figure.save(fn_out)
def plot_analysis(self, display=True):
    """Combine the three analysis plots into one side-by-side SVG.

    The singular-value, eigenvalue and power-spectrum plots are each saved
    by their own plotting method, loaded back as SVG, deleted from disk and
    laid out left to right with the labels (a), (b), (c).

    Parameters
    ----------
    display : bool
        When true, show the combined SVG through ``IPython.display``.
    """
    figdir = set_figdir(verbose=False)
    figure_name = f"{figdir}/{timestamp()}-analysis.svg"

    plotters = (self.plot_singular_vals, self.plot_eigs,
                self.plot_power_spectrum)
    # each plotter's last return item is the saved file name without ".svg"
    fig_handles = [
        plotter(savefig=True, verbose=False, display=False)[-1]
        for plotter in plotters
    ]

    svgs = []
    for handle in fig_handles:
        svgs.append(sc.SVG(handle + '.svg', fix_mpl=True))
        # the individual file is no longer needed once it is loaded
        os.remove(f"{handle}.svg")

    total_width = sum(svg.width for svg in svgs)
    tallest = max([svg.height for svg in svgs])

    panels = []
    x_offset = 0
    for tag, svg in zip(("(a)", "(b)", "(c)"), svgs[:3]):
        panels.append(
            sc.Panel(svg, sc.Text(tag, 6, 16, size=11)).move(x_offset, 0))
        x_offset = x_offset + svg.width

    sc.Figure(total_width, tallest, *panels).save(figure_name)

    if display:
        IPython.display.display(IPython.display.SVG(figure_name))
def plot(self, gr1=None, gr2=None):
    """
    Draw the center plot and its surrounding frames as one figure.

    Parameters
    ----------
    gr1 : {str, GenomeRange}
        First genome range
    gr2 : {str, GenomeRange}, optional
        Second genome range

    Returns
    -------
    The composed ``sc.Figure`` holding the center panel followed by one
    panel per frame.
    """
    frame_ranges = self.frame_granges(gr1, gr2)
    # from here on work with the ranges held in self.current_range
    gr1, gr2 = self.current_range
    frame_layout = self.properties['sub_frames']

    side_svgs = self.plot_frames(frame_ranges)
    middle_svg = self.plot_center(gr1, gr2)

    offsets = self.__get_center_offsets(frame_layout)
    middle_svg.move(*self.cm2px(offsets))
    self.__transform_sub_svgs(side_svgs, frame_layout, offsets)

    figsize = self.cm2px(self.__get_figsize(frame_layout))
    width_px = f"{figsize[0]}px"
    height_px = f"{figsize[1]}px"

    panels = [sc.Panel(middle_svg)]
    panels.extend(sc.Panel(svg) for svg in side_svgs.values())
    return sc.Figure(width_px, height_px, *panels)
def layout(codes, filename):
    """Export PDF A4 pages filled with these barcodes, ready to print as
    stickers.

    Each code is drawn to a temporary ``<code.id>.svg`` file, placed on a
    grid on the current sheet, and the sheet is converted to PDF. When more
    than one sheet is needed, the sheets are merged into ``<filename>.pdf``
    and the per-sheet PDFs are removed.

    Parameters
    ----------
    codes
        Sized iterable of barcode objects exposing ``id`` and
        ``draw(path)``.
    filename
        Prefix for the temporary files produced and removed during the
        process, and for the final ``.pdf``; it should not have an
        extension.

    TODO: it is probably possible to achieve this without creating so many
    intermediate files.
    """
    # A4
    sheet_size = WH(210., 297.) * XY.mm
    sticker_size = EAN13Data.full_size

    # so how many codes fit on one sheet?
    n_stickers = sheet_size / sticker_size
    n_stickers = np.floor(n_stickers).astype(int)
    stickers_per_sheet = n_stickers.w * n_stickers.h

    # so how many sheets do we need?
    n_sheets = ceil(len(codes) / stickers_per_sheet)

    sheets = []  # store produced .pdf sheets filenames here

    # iterate until they are all consumed
    stickers = iter(codes)
    for n in range(n_sheets):
        panels = []  # according to sc logic
        try:
            for i in range(n_stickers.w):
                for j in range(n_stickers.h):
                    # export this code as svg temp file
                    code = next(stickers)
                    tpfile = code.id + '.svg'
                    code.draw(tpfile)
                    try:
                        panels.append(
                            sc.Panel(
                                sc.SVG(tpfile).scale(1.).move(
                                    i * sticker_size.w,
                                    j * sticker_size.h)))
                    finally:
                        # cleanup, even when the svg could not be loaded
                        os.remove(tpfile)
        except StopIteration:
            pass  # no more stickers to print

        sheetname = filename + ('-' + str(n + 1) if n_sheets > 1 else '')
        ssvg = sheetname + '.svg'
        spdf = sheetname + '.pdf'
        sc.Figure(sheet_size.w, sheet_size.h, *panels).save(ssvg)
        try:
            # convert to .pdf
            renderPDF.drawToFile(svg2rlg(ssvg), spdf)
        finally:
            # remove .svg file
            os.remove(ssvg)
        # remember this other temp file
        sheets.append(spdf)

    # bring all sheets together into pdf pages
    if n_sheets > 1:
        final = PdfFileWriter()
        for sheet in sheets:
            append_pdf(PdfFileReader(sheet), final)
        # close the output handle deterministically so the merged file is
        # fully flushed before the per-sheet files are deleted
        with open(filename + '.pdf', 'wb') as merged:
            final.write(merged)
        # the per-sheet files are intermediates only when a merge happened;
        # with a single sheet, <filename>.pdf IS the result and must stay
        while sheets:
            os.remove(sheets.pop())
def test_embedded_svg():
    """An SVG embedded in a Figure keeps its polygon; its size stays unset."""
    logo = sc.SVG("examples/files/svg_logo.svg")
    figure = sc.Figure("5cm", "5cm", logo)

    polygon_path = ".//{0}polygon".format(SVG)
    polygon = figure.root.find(polygon_path)

    ok_(polygon.get("id") == "V")
    ok_(logo.height is None)
    ok_(logo.width is None)
def panel2x2(fn0, fn1, fn_out, single_size=plot_config.single_figure_size):
    """Place two SVG files side by side, labelled A and B, and save them.

    Parameters
    ----------
    fn0, fn1
        Paths of the left and right SVG files.
    fn_out
        Path the composed SVG is written to.
    single_size
        Two-item size; the canvas is ``single_size[0] * 150`` px wide and
        ``single_size[1] * 80`` px high.
    """
    canvas_w = single_size[0] * 150
    canvas_h = single_size[1] * 80
    label_kwargs = dict(size=plot_config.fontsize_large, weight='bold',
                        font='serif')

    left_panel = sc.Panel(sc.SVG(fn0).scale(1.0).move(0, 15))
    left_label = sc.Text("A", 4, 12, **label_kwargs)
    right_panel = sc.Panel(sc.SVG(fn1).scale(1.0).move(canvas_w / 2., 15))
    # the "B" label divides by the integer 2, exactly as before
    right_label = sc.Text("B", canvas_w / 2 + 4., 12, **label_kwargs)

    figure = sc.Figure(str(canvas_w) + "px", str(canvas_h) + "px",
                       left_panel, left_label, right_panel, right_label)
    figure.save(fn_out)
def compose_svg(svg_board, svg_plot, svg_combined):
    """
    Write a combined SVG with the board image drawn behind the plot image,
    scaled and shifted to sit in the axes area of the plot.

    :param svg_board: filename of existing board image
    :type svg_board: str
    :param svg_plot: filename of existing plot image
    :type svg_plot: str
    :param svg_combined: filename of combined image, to be written
    :type svg_combined: str
    """
    # the board is shrunk to the span between the two margins
    board_scale = TOP_RIGHT_MARGIN - LEFT_BOTTOM_MARGIN
    offset_x = LEFT_BOTTOM_MARGIN * SIZE
    offset_y = (1.0 - TOP_RIGHT_MARGIN) * SIZE

    board_svg = compose.SVG(svg_board).scale(board_scale).move(offset_x,
                                                               offset_y)
    board_layer = compose.Panel(board_svg)
    plot_layer = compose.Panel(compose.SVG(svg_plot))

    # board first so that the plot is drawn on top of it
    combined = compose.Figure(SIZE, SIZE, board_layer, plot_layer)
    combined.save(svg_combined)
def make_multipanel_fig(FIGS, CAP_SIZE=14,
                        fig_name="fig.svg",
                        transparent=True, correc_factor=70., DPI=100.):
    """Take a list of figures and make a multi-panel plot.

    Every figure is saved to ``/tmp/<index>.svg``, placed on a grid that is
    three panels wide, labelled with a capital letter and written to
    ``fig_name``.

    Parameters
    ----------
    FIGS
        Sequence of figures exposing ``get_size_inches()`` and ``savefig()``.
    CAP_SIZE, DPI
        Accepted for compatibility; not used in this function.
    fig_name
        Path of the SVG file to write.
    transparent
        Forwarded to ``savefig``.
    correc_factor
        Multiplier turning figure inches into panel offsets.
    """
    n_cols = 3
    n_figs = len(FIGS)
    labels = list(string.ascii_uppercase)[:n_figs]

    sizes = [fig.get_size_inches() for fig in FIGS]
    # every grid cell is as large as the largest figure
    width = np.max([s[0] for s in sizes])
    height = np.max([s[1] for s in sizes])

    PANELS = []
    for i, fig in enumerate(FIGS):
        svg_path = '/tmp/' + str(i) + '.svg'
        fig.savefig(svg_path, format='svg', transparent=transparent)
        # panels that had to be converted to a bitmap are drawn at 70 %
        if translate_to_bitmap_if_too_big(fig, svg_path):
            scale = .7
        else:
            scale = 1.
        x = (i % n_cols) * width * correc_factor
        y = (i // n_cols) * height * correc_factor
        PANELS.append(sg.Panel(
            sg.SVG(svg_path).move(x, y).scale(scale),
            sg.Text(labels[i], 25, 20, size=22, weight='bold').move(x - 15, y)))

    # Canvas size: the number of columns actually occupied is
    # min(n_figs, 3). The former `len(FIGS) % 3` produced a zero-width
    # canvas for 3, 6, 9... figures and a one-column canvas for 4 figures.
    cols_used = min(n_figs, n_cols)
    rows_used = max(1, -(-n_figs // n_cols))  # ceiling division
    sg.Figure(str(cols_used * inch_to_cm(width)) + "cm",
              str(inch_to_cm(height) * rows_used) + "cm",
              *PANELS).save(fig_name)
def draw_plot(data, output):
    """Draws piecharts from given data on lab012 background

    The pies are rendered with matplotlib into an intermediate ``out.svg``
    in the working directory, which is then composed over
    ``./assets/012-base.svg`` and removed.

    Args:
        data - list of lists of three points with experiment results
        output - name of generated svg file
    """
    positions = [[1.6, 1], [1.6, 3], [1.6, 5],
                 [5, 1], [5, 3], [5, 5],
                 [8.7, 1], [8.7, 3], [8.7, 5],
                 [12, 1], [12, 3], [12, 5]]

    # prepare background
    fig, ax = plt.subplots()
    try:
        ax.imshow([[[0, 0, 0, 0]], [[0, 0, 0, 0]]],
                  extent=[-1, 13.25, -1, 7.3])
        ax.patch.set_alpha(0.0)
        ax.axis('off')

        # robot blob
        draw_single_pie([1.3, 5.7], 0.3, [1, 1, 1], ax, ('k', 'k', 'k'),
                        plot_labels=False)
        ax.text(1.6, 5.9, "Mikrofon")

        # draw points; zip stops at the shorter of positions and data
        for i, (position, values) in enumerate(zip(positions, data)):
            draw_single_pie(position, 0.5, values, ax,
                            ('blue', 'orange', 'green'), str(i + 1))

        # save
        fig.tight_layout()
        fig.savefig("out.svg", dpi=200, transparent=True)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly
        plt.close(fig)

    # merge with proper background
    try:
        sc.Figure("247.59521mm", "129.31232mm",
                  sc.Panel(sc.SVG("./assets/012-base.svg").scale(0.352)),
                  sc.Panel(sc.SVG("out.svg").scale(0.6).move(-25, -40))
                  ).save(output)
    finally:
        # drop the intermediate file even when composing fails
        os.remove("./out.svg")
def multipanel_figure(graph_env,
                      FIGS,
                      X = None, Y = None,
                      Labels=None,
                      LABELS = None, X_LABELS = None, Y_LABELS = None,
                      width=85.,# mm
                      height=None, # mm
                      grid=False,
                      autoposition=False,
                      SCALING_FACTOR = 1.34,
                      fontsize=None, fontweight='bold',
                      export_to_png=False, bg='white',
                      fig_name='fig.svg'):
    """
    Assemble figures (or existing SVG files) into one multi-panel SVG.

    Parameters
    ----------
    graph_env
        Object read for ``fontsize`` (default label size is that value + 1)
        and ``transparency`` (forwarded to ``savefig``).
    FIGS
        A single matplotlib figure, a flat list of figures or of SVG file
        names, or a list of such lists (one inner list per row).
    X, Y
        Nested lists of panel offsets, indexed like ``FIGS``; default to 0
        everywhere. Overwritten when ``autoposition`` is true.
    Labels
        Not used in this function body.
    LABELS, X_LABELS, Y_LABELS
        Nested lists of label texts and label offsets; empty labels are
        skipped, and the offsets default to ``X`` / ``Y``.
    width, height
        Canvas size in mm (written to the SVG in cm). ``width`` may also be
        one of 'single-column', 'one-and-a-half-column',
        'one-column-and-a-half' or 'double-column'.
    grid
        When true, a 40x40 ``sg.Grid`` is added on top of the panels.
    autoposition
        When true, ``X`` and ``Y`` are derived from the figure sizes and
        printed.
    SCALING_FACTOR
        Passed to ``Figure.scale`` before saving.
    fontsize, fontweight
        Styling of the labels.
    export_to_png, bg
        PNG export switch and background passed to ``export_as_png``.
    fig_name
        Output path; a '.png' name makes the SVG an intermediate file that
        is removed after the PNG export.
    """

    # building the figure matrix if not explicited
    if type(FIGS) is mpl.figure.Figure:
        FIGS = [[FIGS]]
    elif type(FIGS) is list:
        if (len(FIGS)>0) and (type(FIGS[0]) is mpl.figure.Figure):
            FIGS = [FIGS]
        elif (len(FIGS)>0) and (type(FIGS[0]) is str):
            FIGS = [FIGS]
    # else should be list of list

    # NOTE(review): the nesting of this section was reconstructed from a
    # flattened source -- confirm against the original layout.
    if autoposition:
        X, Y = [], []
        y = [0]
        for i, lfig in enumerate(FIGS):
            # every panel of a row starts at the largest y seen so far
            Y.append([np.max(y) for fig in lfig])
            x = []
            for fig in lfig:
                if type(fig) is not str:
                    # figure size in inches times 72
                    x.append(72.*fig.get_size_inches()[0])
                    y.append(72.*fig.get_size_inches()[1])
                else:
                    # fixed size assumed for panels given as file names
                    x.append(120)
                    y.append(80)
            X.append([0]+list(np.cumsum(x)))
            y = [dy+Y[-1][0] for dy in y]
        # extra trailing row; Y[-1][-1] is read for the default height below
        Y.append([np.max(y)])
        print('X = ', X)
        print('Y = ', Y)

    if X is None:
        X = [[0 for fig in lfig] for lfig in FIGS]
    if Y is None:
        Y = [[0 for fig in lfig] for lfig in FIGS]
    if LABELS is None:
        LABELS = [['' for fig in lfig] for lfig in FIGS]
    if X_LABELS is None:
        X_LABELS = X
    if Y_LABELS is None:
        Y_LABELS = Y

    if height is None:
        try:
            height = np.max([50, Y[-1][-1]])*0.27 # TO BE SET UP
        except IndexError:
            height = 50

    # size
    if width=='single-column':
        width = 85.
    elif width=='one-and-a-half-column':
        width = 114.
    elif width=='one-column-and-a-half':
        width = 114.
    elif width=='double-column':
        width = 174.

    if fontsize is None:
        fontsize = graph_env.fontsize+1

    LOCATIONS, PANELS = [], []
    for i, lfig in enumerate(FIGS):
        LOCATIONS.append([])
        for j, fig in enumerate(lfig):
            if type(FIGS[i][j]) is str:
                # already an SVG file on disk: use it as is
                LOCATIONS[i].append(FIGS[i][j])
                # 1.26625 -- NEW SCALING FACTOR
            else:
                # save the figure to <tempdir>/<row>_<col>.svg first
                LOCATIONS[i].append(os.path.join(gettempdir(), '%i_%i.svg' % (i,j)))
                FIGS[i][j].savefig(LOCATIONS[i][j], format='svg',
                                   transparent=graph_env.transparency)
            PANELS.append(sg.Panel(sg.SVG(LOCATIONS[i][j]).move(X[i][j], Y[i][j])))

    for i, labels in enumerate(LABELS):
        for j, label in enumerate(labels):
            if label!='':
                PANELS.append(sg.Panel(sg.Text(label, 3, 10,
                                               size=fontsize, weight=fontweight).move(\
                                               X_LABELS[i][j],Y_LABELS[i][j])))

    # width and height are divided by 10 to go from mm to cm; the figure is
    # always written as .svg, even when a .png name was requested
    if grid:
        sg.Figure("%.1fcm" % (width/10.), "%.1fcm" % (height/10.),
                  *PANELS, sg.Grid(40,40)).scale(SCALING_FACTOR).save(fig_name.replace('.png', '.svg'))
    else:
        sg.Figure("%.1fcm" % (width/10.), "%.1fcm" % (height/10.),
                  *PANELS).scale(SCALING_FACTOR).save(fig_name.replace('.png', '.svg'))

    if fig_name.endswith('.png'):
        # the svg was only an intermediate: convert it, then delete it
        export_as_png(fig_name.replace('.png', '.svg'), dpi=300, background=bg)
        os.remove(fig_name.replace('.png', '.svg'))
        print('[ok] removed %s' % fig_name.replace('.png', '.svg'))
    elif export_to_png:
        export_as_png(fig_name, dpi=300, background=bg)
def calculate_matrix_svg(snplst, pop, request, r2_d="r2"):
    """Compute a pairwise LD matrix for a SNP list and export it as images.

    Steps, in order: read data locations from ``config.yml``; read and
    de-duplicate the SNP list; collect the sample ids of the requested
    populations; look the SNPs up in the sqlite database; pull their
    genotypes with ``tabix``; build two haplotype lists per selected sample;
    compute D' and R2 for every SNP pair; build the bokeh matrix, connector,
    rug and gene plots; export the matrix and gene plots to SVG; combine
    them with svgutils; convert to PDF/PNG/JPEG through ``phantomjs``; and
    remove the intermediate SVG files.

    Parameters
    ----------
    snplst
        Path of a text file; the first whitespace-separated token of each
        line is taken as the SNP id.
    pop
        Population codes joined with "+"; codes not in the built-in list are
        ignored.
    request
        String embedded in the names of all files written under ``./tmp/``.
    r2_d
        "r2" or "d": which statistic drives the box transparency. Any other
        value falls back to "r2".

    Returns
    -------
    None. All results are files under ``./tmp/``.

    NOTE(review): this body appears to target Python 2 (iterating
    ``hap.keys()`` while popping, ``/`` on integer counts, subprocess output
    compared with ``str``) -- confirm the interpreter before porting.
    NOTE(review): ``request`` and the config paths are interpolated into
    ``shell=True`` command strings; make sure callers never pass untrusted
    text.
    """

    # Set data directories using config.yml
    # NOTE(review): yaml.load is called without an explicit Loader.
    with open('config.yml', 'r') as f:
        config = yaml.load(f)
    gene_dir=config['data']['gene_dir']
    snp_dir=config['data']['snp_dir']
    pop_dir=config['data']['pop_dir']
    vcf_dir=config['data']['vcf_dir']

    tmp_dir = "./tmp/"

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Open SNP list file
    snps_raw = open(snplst).readlines()

    # Remove duplicate RS numbers
    snps = []
    for snp_raw in snps_raw:
        snp = snp_raw.strip().split()
        if snp not in snps:
            snps.append(snp)

    # Select desired ancestral populations
    pops = pop.split("+")
    pop_dirs = []
    for pop_i in pops:
        if pop_i in ["ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU", "TSI", "YRI"]:
            pop_dirs.append(pop_dir + pop_i + ".txt")

    # Concatenate the per-population sample files through the shell
    get_pops = "cat " + " ".join(pop_dirs)
    proc = subprocess.Popen(get_pops, shell=True, stdout=subprocess.PIPE)
    pop_list = proc.stdout.readlines()

    ids = [i.strip() for i in pop_list]
    pop_ids = list(set(ids))

    # Connect to snp database
    conn = sqlite3.connect(snp_dir)
    conn.text_factory = str
    cur = conn.cursor()

    def get_coords(rs):
        """Return the database row for an rs id, or None when not found."""
        id = rs.strip("rs")
        t = (id,)
        # the table is chosen by the last digit of the numeric id
        cur.execute("SELECT * FROM tbl_" + id[-1] + " WHERE id=?", t)
        return cur.fetchone()

    # Find RS numbers in snp database
    rs_nums = []
    snp_pos = []
    snp_coords = []
    tabix_coords = ""
    for snp_i in snps:
        if len(snp_i) > 0:
            if len(snp_i[0]) > 2:
                if snp_i[0][0:2] == "rs" and snp_i[0][-1].isdigit():
                    snp_coord = get_coords(snp_i[0])
                    if snp_coord != None:
                        rs_nums.append(snp_i[0])
                        # row columns 1 and 2 are used as chromosome and position
                        snp_pos.append(snp_coord[2])
                        temp = [snp_i[0], snp_coord[1], snp_coord[2]]
                        snp_coords.append(temp)

    # Close snp connection
    cur.close()
    conn.close()

    # Check max distance between SNPs
    # (distance_bp is filled but not read again in this function)
    distance_bp = []
    for i in range(len(snp_coords)):
        distance_bp.append(int(snp_coords[i][2]))

    # Sort coordinates and make tabix formatted coordinates
    # (the chromosome of the first SNP found is used for every region)
    snp_pos_int = [int(i) for i in snp_pos]
    snp_pos_int.sort()
    snp_coord_str = [snp_coords[0][1] + ":" +
                     str(i) + "-" + str(i) for i in snp_pos_int]
    tabix_coords = " " + " ".join(snp_coord_str)

    # Extract 1000 Genomes phased genotypes
    vcf_file = vcf_dir + \
        snp_coords[0][
            1] + ".phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz"
    tabix_snps = "tabix -h {0}{1} | grep -v -e END".format(
        vcf_file, tabix_coords)
    proc = subprocess.Popen(tabix_snps, shell=True, stdout=subprocess.PIPE)

    # Define function to correct indel alleles
    def set_alleles(a1, a2):
        """Drop the first base of multi-base alleles; a lone base paired
        with a multi-base allele becomes "-"."""
        if len(a1) == 1 and len(a2) == 1:
            a1_n = a1
            a2_n = a2
        elif len(a1) == 1 and len(a2) > 1:
            a1_n = "-"
            a2_n = a2[1:]
        elif len(a1) > 1 and len(a2) == 1:
            a1_n = a1[1:]
            a2_n = "-"
        elif len(a1) > 1 and len(a2) > 1:
            a1_n = a1[1:]
            a2_n = a2[1:]
        return(a1_n, a2_n)

    # Import SNP VCF files
    vcf = proc.stdout.readlines()

    # Skip the "##" meta lines; vcf[h] is then the column header line
    h = 0
    while vcf[h][0:2] == "##":
        h += 1

    head = vcf[h].strip().split()

    # Extract haplotypes
    # (sample columns start at index 9; keep those in the chosen populations)
    index = []
    for i in range(9, len(head)):
        if head[i] in pop_ids:
            index.append(i)

    # One list per selected sample for each of its two haplotypes
    hap1 = [[]]
    for i in range(len(index) - 1):
        hap1.append([])
    hap2 = [[]]
    for i in range(len(index) - 1):
        hap2.append([])

    rsnum_lst = []
    allele_lst = []
    pos_lst = []

    for g in range(h + 1, len(vcf)):
        geno = vcf[g].strip().split()
        if geno[1] not in snp_pos:
            continue

        if snp_pos.count(geno[1]) == 1:
            rs_query = rs_nums[snp_pos.index(geno[1])]

        else:
            # several queried ids share this position: take the first one
            # that has not been recorded yet
            pos_index = []
            for p in range(len(snp_pos)):
                if snp_pos[p] == geno[1]:
                    pos_index.append(p)
            for p in pos_index:
                if rs_nums[p] not in rsnum_lst:
                    rs_query = rs_nums[p]
                    break

        if rs_query in rsnum_lst:
            continue

        rs_1000g = geno[2]

        if rs_query == rs_1000g:
            rsnum = rs_1000g
        else:
            # look at the neighbouring VCF lines (offsets -2..2) for the
            # queried id; when it is found there, this line is skipped
            count = -2
            found = "false"
            while count <= 2 and count + g < len(vcf):
                geno_next = vcf[g + count].strip().split()
                if rs_query == geno_next[2]:
                    found = "true"
                    break
                count += 1

            if found == "false":
                indx = [i[0] for i in snps].index(rs_query)
                # snps[indx][0] = geno[2]
                # rsnum = geno[2]
                snps[indx][0]=rs_query
                rsnum=rs_query
            else:
                continue

        # Only variants without a comma in the REF and ALT fields are used
        if "," not in geno[3] and "," not in geno[4]:
            a1, a2 = set_alleles(geno[3], geno[4])
            for i in range(len(index)):
                if geno[index[i]] == "0|0":
                    hap1[i].append(a1)
                    hap2[i].append(a1)
                elif geno[index[i]] == "0|1":
                    hap1[i].append(a1)
                    hap2[i].append(a2)
                elif geno[index[i]] == "1|0":
                    hap1[i].append(a2)
                    hap2[i].append(a1)
                elif geno[index[i]] == "1|1":
                    hap1[i].append(a2)
                    hap2[i].append(a2)
                elif geno[index[i]] == "0":
                    hap1[i].append(a1)
                    hap2[i].append(".")
                elif geno[index[i]] == "1":
                    hap1[i].append(a2)
                    hap2[i].append(".")
                else:
                    # anything else is recorded as missing
                    hap1[i].append(".")
                    hap2[i].append(".")

            rsnum_lst.append(rsnum)

            position = "chr" + geno[0] + ":" + geno[1] + "-" + geno[1]
            pos_lst.append(position)
            alleles = a1 + "/" + a2
            allele_lst.append(alleles)

    # Calculate Pairwise LD Statistics
    all_haps = hap1 + hap2
    ld_matrix = [[[None for v in range(2)] for i in range(
        len(all_haps[0]))] for j in range(len(all_haps[0]))]

    for i in range(len(all_haps[0])):
        for j in range(i, len(all_haps[0])):
            hap = {}
            for k in range(len(all_haps)):
                # Extract haplotypes
                hap_k = all_haps[k][i] + all_haps[k][j]
                if hap_k in hap:
                    hap[hap_k] += 1
                else:
                    hap[hap_k] = 1

            # Remove Missing Haplotypes
            # NOTE(review): on Python 3 keys() is a live view and popping
            # inside this loop raises RuntimeError -- confirm interpreter.
            keys = hap.keys()
            for key in keys:
                if "." in key:
                    hap.pop(key, None)

            # Check all haplotypes are present
            if len(hap) != 4:
                snp_i_a = allele_lst[i].split("/")
                snp_j_a = allele_lst[j].split("/")
                haps = [snp_i_a[0] + snp_j_a[0], snp_i_a[0] + snp_j_a[1],
                        snp_i_a[1] + snp_j_a[0], snp_i_a[1] + snp_j_a[1]]
                # note: this loop rebinds the name h used for the header index
                for h in haps:
                    if h not in hap:
                        hap[h] = 0

            # Perform LD calculations
            # (A..D are the counts of the four haplotypes in sorted key order)
            A = hap[sorted(hap)[0]]
            B = hap[sorted(hap)[1]]
            C = hap[sorted(hap)[2]]
            D = hap[sorted(hap)[3]]

            tmax = max(A, B, C, D)
            delta = float(A * D - B * C)
            Ms = float((A + C) * (B + D) * (A + B) * (C + D))
            if Ms != 0:

                # D prime
                if delta < 0:
                    D_prime = round(
                        abs(delta / min((A + C) * (A + B), (B + D) * (C + D))), 3)
                else:
                    D_prime = round(
                        abs(delta / min((A + C) * (C + D), (A + B) * (B + D))), 3)

                # R2
                r2 = round((delta**2) / Ms, 3)

                # Find Correlated Alleles
                if r2 > 0.1:
                    N = A + B + C + D
                    # Expected Cell Counts
                    # NOTE(review): "/" on integer counts floors on Python 2
                    # -- confirm this is intended.
                    eA = (A + B) * (A + C) / N
                    eB = (B + A) * (B + D) / N
                    eC = (C + A) * (C + D) / N
                    eD = (D + C) * (D + B) / N

                    # Calculate Deltas
                    dA = (A - eA)**2
                    dB = (B - eB)**2
                    dC = (C - eC)**2
                    dD = (D - eD)**2
                    dmax = max(dA, dB, dC, dD)

                    if dA == dB == dC == dD:
                        # NOTE(review): tmax (a count) is compared with the
                        # squared deviations dA/dD -- confirm intent.
                        if tmax == dA or tmax == dD:
                            match = sorted(hap)[0][
                                0] + "=" + sorted(hap)[0][1] + "," + sorted(hap)[2][0] + "=" + sorted(hap)[1][1]
                        else:
                            match = sorted(hap)[0][
                                0] + "=" + sorted(hap)[1][1] + "," + sorted(hap)[2][0] + "=" + sorted(hap)[0][1]
                    elif dmax == dA or dmax == dD:
                        match = sorted(hap)[0][
                            0] + "=" + sorted(hap)[0][1] + "," + sorted(hap)[2][0] + "=" + sorted(hap)[1][1]
                    else:
                        match = sorted(hap)[0][
                            0] + "=" + sorted(hap)[1][1] + "," + sorted(hap)[2][0] + "=" + sorted(hap)[0][1]
                else:
                    match = " = , = "
            else:
                D_prime = "NA"
                r2 = "NA"
                match = " = , = "

            snp1 = rsnum_lst[i]
            snp2 = rsnum_lst[j]
            pos1 = pos_lst[i].split("-")[0]
            pos2 = pos_lst[j].split("-")[0]
            allele1 = allele_lst[i]
            allele2 = allele_lst[j]
            # corr is match with both sides of each "=" swapped
            corr = match.split(",")[0].split("=")[1] + "=" + match.split(",")[0].split("=")[
                0] + "," + match.split(",")[1].split("=")[1] + "=" + match.split(",")[1].split("=")[0]
            corr_f = match

            # Fill both triangles of the matrix
            ld_matrix[i][j] = [snp1, snp2, allele1, allele2,
                               corr, pos1, pos2, D_prime, r2]
            ld_matrix[j][i] = [snp2, snp1, allele2, allele1,
                               corr_f, pos2, pos1, D_prime, r2]

    # Generate Plot Variables
    out = [j for i in ld_matrix for j in i]
    xnames = []
    ynames = []
    xA = []
    yA = []
    corA = []
    xpos = []
    ypos = []
    D = []
    R = []
    box_color = []
    box_trans = []

    if r2_d not in ["r2", "d"]:
        r2_d = "r2"

    for i in range(len(out)):
        snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2 = out[i]
        xnames.append(snp1)
        ynames.append(snp2)
        xA.append(allele1)
        yA.append(allele2)
        corA.append(corr)
        xpos.append(pos1)
        ypos.append(pos2)

        if r2_d == "r2" and r2 != "NA":
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("red")
            box_trans.append(r2)

        elif r2_d == "d" and D_prime != "NA":
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("red")
            box_trans.append(abs(D_prime))

        else:
            # no statistic available: faint blue box
            D.append("NA")
            R.append("NA")
            box_color.append("blue")
            box_trans.append(0.1)

    # Import plotting modules
    from collections import OrderedDict
    from bokeh.embed import components, file_html
    from bokeh.layouts import gridplot
    from bokeh.models import HoverTool, LinearAxis, Range1d
    from bokeh.plotting import ColumnDataSource, curdoc, figure, output_file, reset_output, save
    from bokeh.resources import CDN
    from bokeh.io import export_svgs
    import svgutils.compose as sg
    from math import pi

    reset_output()

    # Aggregate Plotting Data
    # (stepping by sqrt(len(xpos)) visits one entry per matrix row)
    x = []
    y = []
    w = []
    h = []
    coord_snps_plot = []
    snp_id_plot = []
    alleles_snp_plot = []
    for i in range(0, len(xpos), int(len(xpos)**0.5)):
        x.append(int(xpos[i].split(":")[1]) / 1000000.0)
        y.append(0.5)
        w.append(0.00003)
        h.append(1.06)
        coord_snps_plot.append(xpos[i])
        snp_id_plot.append(xnames[i])
        alleles_snp_plot.append(xA[i])

    # Pad the x range by 2.5 % of the covered span on each side
    buffer = (x[-1] - x[0]) * 0.025
    xr = Range1d(start=x[0] - buffer, end=x[-1] + buffer)
    yr = Range1d(start=-0.03, end=1.03)
    y2_ll = [-0.03] * len(x)
    y2_ul = [1.03] * len(x)

    yr_pos = Range1d(start=(x[-1] + buffer) * -1, end=(x[0] - buffer) * -1)
    yr0 = Range1d(start=0, end=1)
    yr2 = Range1d(start=0, end=3.8)
    yr3 = Range1d(start=0, end=1)

    # Evenly spaced matrix columns across the padded x range
    spacing = (x[-1] - x[0] + buffer + buffer) / (len(x) * 1.0)
    x2 = []
    y0 = []
    y1 = []
    y2 = []
    y3 = []
    y4 = []
    for i in range(len(x)):
        x2.append(x[0] - buffer + spacing * (i + 0.5))
        y0.append(0)
        y1.append(0.20)
        y2.append(0.80)
        y3.append(1)
        y4.append(1.15)

    # Repeat each column position once per row so it lines up with `out`
    xname_pos = []
    for i in x2:
        for j in range(len(x2)):
            xname_pos.append(i)

    data = {
        'xname': xnames,
        'xname_pos': xname_pos,
        'yname': ynames,
        'xA': xA,
        'yA': yA,
        'xpos': xpos,
        'ypos': ypos,
        'R2': R,
        'Dp': D,
        'corA': corA,
        'box_color': box_color,
        'box_trans': box_trans
    }

    source = ColumnDataSource(data)

    # From `threshold` SNPs upwards the y axis and the id labels are omitted
    threshold = 70
    if len(snps) < threshold:
        matrix_plot = figure(outline_line_color="white", min_border_top=0, min_border_bottom=2, min_border_left=100, min_border_right=5,
                             x_range=xr, y_range=list(reversed(rsnum_lst)),
                             h_symmetry=False, v_symmetry=False, border_fill_color='white', x_axis_type=None, logo=None,
                             tools="hover,undo,redo,reset,pan,box_zoom,previewsave", title=" ", plot_width=800, plot_height=700)

    else:
        matrix_plot = figure(outline_line_color="white", min_border_top=0, min_border_bottom=2, min_border_left=100, min_border_right=5,
                             x_range=xr, y_range=list(reversed(rsnum_lst)),
                             h_symmetry=False, v_symmetry=False, border_fill_color='white', x_axis_type=None, y_axis_type=None, logo=None,
                             tools="hover,undo,redo,reset,pan,box_zoom,previewsave", title=" ", plot_width=800, plot_height=700)

    matrix_plot.rect(x='xname_pos', y='yname', width=0.95 * spacing, height=0.95, source=source,
                     color="box_color", alpha="box_trans", line_color=None)

    matrix_plot.grid.grid_line_color = None
    matrix_plot.axis.axis_line_color = None
    matrix_plot.axis.major_tick_line_color = None
    if len(snps) < threshold:
        matrix_plot.axis.major_label_text_font_size = "8pt"
        matrix_plot.xaxis.major_label_orientation = "vertical"

    matrix_plot.axis.major_label_text_font_style = "normal"
    matrix_plot.xaxis.major_label_standoff = 0

    sup_2 = u"\u00B2"

    hover = matrix_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Variant 1", " " + "@yname (@yA)"),
        ("Variant 2", " " + "@xname (@xA)"),
        ("D\'", " " + "@Dp"),
        ("R" + sup_2, " " + "@R2"),
        ("Correlated Alleles", " " + "@corA"),
    ])

    # Connecting and Rug Plots
    # Connector Plot
    # (connector and rug are built here, but only matrix_plot and gene_plot
    # are passed to export_svgs further down)
    if len(snps) < threshold:
        connector = figure(outline_line_color="white", y_axis_type=None, x_axis_type=None,
                           x_range=xr, y_range=yr2, border_fill_color='white',
                           title="", min_border_left=100, min_border_right=5, min_border_top=0, min_border_bottom=0, h_symmetry=False, v_symmetry=False,
                           plot_width=800, plot_height=90, tools="xpan,tap")
        connector.segment(x, y0, x, y1, color="black")
        connector.segment(x, y1, x2, y2, color="black")
        connector.segment(x2, y2, x2, y3, color="black")
        connector.text(x2, y4, text=snp_id_plot, alpha=1, angle=pi / 2,
                       text_font_size="8pt", text_baseline="middle", text_align="left")
    else:
        connector = figure(outline_line_color="white", y_axis_type=None, x_axis_type=None,
                           x_range=xr, y_range=yr3, border_fill_color='white',
                           title="", min_border_left=100, min_border_right=5, min_border_top=0, min_border_bottom=0, h_symmetry=False, v_symmetry=False,
                           plot_width=800, plot_height=30, tools="xpan,tap")
        connector.segment(x, y0, x, y1, color="black")
        connector.segment(x, y1, x2, y2, color="black")
        connector.segment(x2, y2, x2, y3, color="black")

    connector.yaxis.major_label_text_color = None
    connector.yaxis.minor_tick_line_alpha = 0  # Option does not work
    connector.yaxis.axis_label = " "
    connector.grid.grid_line_color = None
    connector.axis.axis_line_color = None
    connector.axis.major_tick_line_color = None
    connector.axis.minor_tick_line_color = None

    connector.toolbar_location = None

    data_rug = {
        'x': x,
        'y': y,
        'w': w,
        'h': h,
        'coord_snps_plot': coord_snps_plot,
        'snp_id_plot': snp_id_plot,
        'alleles_snp_plot': alleles_snp_plot
    }

    source_rug = ColumnDataSource(data_rug)

    # Rug Plot
    rug = figure(x_range=xr, y_range=yr, y_axis_type=None,
                 title="", min_border_top=1, min_border_bottom=0, min_border_left=100, min_border_right=5, h_symmetry=False, v_symmetry=False,
                 plot_width=800, plot_height=50, tools="hover,xpan,tap")
    rug.rect(x='x', y='y', width='w', height='h', fill_color='red',
             dilate=True, line_color=None, fill_alpha=0.6, source=source_rug)

    hover = rug.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("SNP", "@snp_id_plot (@alleles_snp_plot)"),
        ("Coord", "@coord_snps_plot"),
    ])

    rug.toolbar_location = None

    # Gene Plot
    # NOTE(review): snp_coords[1] is indexed here and for the axis label
    # below, so at least two SNPs must have been found in the database.
    tabix_gene = "tabix -fh {0} {1}:{2}-{3} > {4}".format(gene_dir, snp_coords[1][1], int(
        (x[0] - buffer) * 1000000), int((x[-1] + buffer) * 1000000), tmp_dir + "genes_" + request + ".txt")
    subprocess.call(tabix_gene, shell=True)
    filename = tmp_dir + "genes_" + request + ".txt"
    genes_raw = open(filename).readlines()

    genes_plot_start = []
    genes_plot_end = []
    genes_plot_y = []
    genes_plot_name = []
    exons_plot_x = []
    exons_plot_y = []
    exons_plot_w = []
    exons_plot_h = []
    exons_plot_name = []
    exons_plot_id = []
    exons_plot_exon = []
    message = ["Too many genes to plot."]
    # lines[k] holds the end coordinate of the last gene placed on row k
    lines = [0]
    gap = 80000
    tall = 0.75
    if genes_raw != None:
        for i in range(len(genes_raw)):
            bin, name_id, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, score, name2, cdsStartStat, cdsEndStat, exonFrames = genes_raw[
                i].strip().split()
            name = name2
            id = name_id
            e_start = exonStarts.split(",")
            e_end = exonEnds.split(",")

            # Determine Y Coordinate
            # (first row whose last gene ends more than `gap` bp before this
            # gene starts; otherwise a new row is opened)
            i = 0
            y_coord = None
            while y_coord == None:
                if i > len(lines) - 1:
                    y_coord = i + 1
                    lines.append(int(txEnd))
                elif int(txStart) > (gap + lines[i]):
                    y_coord = i + 1
                    lines[i] = int(txEnd)
                else:
                    i += 1

            genes_plot_start.append(int(txStart) / 1000000.0)
            genes_plot_end.append(int(txEnd) / 1000000.0)
            genes_plot_y.append(y_coord)
            genes_plot_name.append(name + " ")

            # the last element of the split lists is not visited
            for i in range(len(e_start) - 1):
                if strand == "+":
                    exon = i + 1
                else:
                    exon = len(e_start) - 1 - i

                width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                exons_plot_x.append(x_coord)
                exons_plot_y.append(y_coord)
                exons_plot_w.append(width)
                exons_plot_h.append(tall)
                exons_plot_name.append(name)
                exons_plot_id.append(id)
                exons_plot_exon.append(exon)

    n_rows = len(lines)
    # flip the row numbers so that row 1 is drawn at the top
    genes_plot_yn = [n_rows - w + 0.5 for w in genes_plot_y]
    exons_plot_yn = [n_rows - w + 0.5 for w in exons_plot_y]
    yr2 = Range1d(start=0, end=n_rows)

    data_gene_plot = {
        'exons_plot_x': exons_plot_x,
        'exons_plot_yn': exons_plot_yn,
        'exons_plot_w': exons_plot_w,
        'exons_plot_h': exons_plot_h,
        'exons_plot_name': exons_plot_name,
        'exons_plot_id': exons_plot_id,
        'exons_plot_exon': exons_plot_exon,
        'coord_snps_plot': coord_snps_plot,
        'snp_id_plot': snp_id_plot,
        'alleles_snp_plot': alleles_snp_plot
    }

    source_gene_plot = ColumnDataSource(data_gene_plot)

    # Gene track height grows by 50 px per row beyond the second
    max_genes = 40
    if len(lines) < 3 or len(genes_raw) > max_genes:
        plot_h_pix = 150
    else:
        plot_h_pix = 150 + (len(lines) - 2) * 50

    gene_plot = figure(min_border_top=2, min_border_bottom=0, min_border_left=100, min_border_right=5,
                       x_range=xr, y_range=yr2, border_fill_color='white',
                       title="", h_symmetry=False, v_symmetry=False, logo=None,
                       plot_width=800, plot_height=plot_h_pix, tools="hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

    if len(genes_raw) <= max_genes:
        gene_plot.segment(genes_plot_start, genes_plot_yn, genes_plot_end,
                          genes_plot_yn, color="black", alpha=1, line_width=2)
        gene_plot.rect(x='exons_plot_x', y='exons_plot_yn', width='exons_plot_w', height='exons_plot_h',
                       source=source_gene_plot, fill_color='grey', line_color="grey")
        gene_plot.text(genes_plot_start, genes_plot_yn, text=genes_plot_name, alpha=1, text_font_size="7pt",
                       text_font_style="bold", text_baseline="middle", text_align="right", angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_plot_name"),
            ("ID", "@exons_plot_id"),
            ("Exon", "@exons_plot_exon"),
        ])

    else:
        # too many genes: show only the message, centred in the plot
        x_coord_text = x[0] + (x[-1] - x[0]) / 2.0
        gene_plot.text(x_coord_text, n_rows / 2.0, text=message, alpha=1,
                       text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0)

    gene_plot.xaxis.axis_label = "Chromosome " + \
        snp_coords[1][1] + " Coordinate (Mb)(GRCh37)"
    gene_plot.yaxis.axis_label = "Genes"
    gene_plot.ygrid.grid_line_color = None
    gene_plot.yaxis.axis_line_color = None
    gene_plot.yaxis.minor_tick_line_color = None
    gene_plot.yaxis.major_tick_line_color = None
    gene_plot.yaxis.major_label_text_color = None

    gene_plot.toolbar_location = "below"

    # Change output backend to SVG temporarily for headless export
    # Will be changed back to canvas in LDlink.js
    matrix_plot.output_backend = "svg"
    rug.output_backend = "svg"
    gene_plot.output_backend = "svg"
    export_svgs(matrix_plot, filename=tmp_dir +
                "matrix_plot_1_" + request + ".svg")
    export_svgs(gene_plot, filename=tmp_dir +
                "gene_plot_1_" + request + ".svg")

    # Concatenate svgs
    # (gene plot placed 720 units below the top of the matrix plot)
    sg.Figure("21.59cm", "27.94cm",
              sg.SVG(tmp_dir + "matrix_plot_1_" + request + ".svg"),
              sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").move(0, 720)
              ).save(tmp_dir + "matrix_plot_" + request + ".svg")

    # Same layout at 5x scale; used for the PNG and JPEG exports below
    sg.Figure("107.95cm", "139.70cm",
              sg.SVG(tmp_dir + "matrix_plot_1_" + request + ".svg").scale(5),
              sg.SVG(tmp_dir + "gene_plot_1_" + request +
                     ".svg").scale(5).move(0, 3600)
              ).save(tmp_dir + "matrix_plot_scaled_" + request + ".svg")

    # Export to PDF
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "matrix_plot_" +
                    request + ".svg " + tmp_dir + "matrix_plot_" + request + ".pdf", shell=True)
    # Export to PNG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "matrix_plot_scaled_" +
                    request + ".svg " + tmp_dir + "matrix_plot_" + request + ".png", shell=True)
    # Export to JPEG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "matrix_plot_scaled_" +
                    request + ".svg " + tmp_dir + "matrix_plot_" + request + ".jpeg", shell=True)
    # Remove individual SVG files after they are combined
    subprocess.call("rm " + tmp_dir + "matrix_plot_1_" +
                    request + ".svg", shell=True)
    subprocess.call("rm " + tmp_dir + "gene_plot_1_" +
                    request + ".svg", shell=True)
    # Remove scaled SVG file after it is converted to png and jpeg
    subprocess.call("rm " + tmp_dir + "matrix_plot_scaled_" +
                    request + ".svg", shell=True)

    reset_output()

    return None
import svgutils.compose as cg
from tqdm import tqdm

# Write one c x c grid of the CPU icon per grid size, with a progress bar.
for c in tqdm([1, 2, 4, 9, 18]):
    # the square canvas edge grows with the number of tiles per side
    wh = str(16*c/12) + "cm"
    # the icon file is loaded once per tile, each copy scaled by 3
    tiles = [cg.SVG('img/cpu.svg').scale(3) for __ in range(c*c)]
    target = "img/cpugrids/cpu1-" + str(c) + ".svg"
    cg.Figure(wh, wh, *tiles).tile(c, c).save(target)
def calculate_matrix_svg(snplst, pop, request, genome_build, r2_d="r2", collapseTranscript=True):
    """Compute pairwise LD for a list of variants and export the matrix plot.

    The function reads its settings from ``config.yml``, resolves the input
    variants against the ``dbsnp`` Mongo collection, pulls phased genotypes
    with ``tabix``, computes D' and R2 for every variant pair and draws a
    Bokeh matrix plot with connector, rug and gene tracks.  The tracks are
    exported as SVG, stitched together with svgutils and rasterized with
    phantomjs.

    Parameters
    ----------
    snplst : str
        Path of a text file with one variant per line.  Each line is split on
        whitespace and the first token is used (an ``rs`` identifier or a
        ``chr<N>:<pos>`` coordinate).  Blank lines are ignored.
    pop : str
        Population codes joined with ``+``; codes outside the built-in list
        are dropped.
    request : str
        Identifier embedded in every temporary/output file name.
    genome_build : str
        Key into the module-level ``genome_build_vars`` mapping.
    r2_d : str, optional
        Normalised to ``"r2"`` unless it is ``"r2"`` or ``"d"``.  It is not
        otherwise read by this function.
    collapseTranscript : optional
        The all-transcripts gene track is drawn only when this equals the
        string ``"false"``; any other value (including the default ``True``)
        draws the collapsed-transcript track.

    Returns
    -------
    None
        Results are written to ``tmp_dir`` as ``matrix_plot_<request>`` with
        the extensions ``.svg``, ``.pdf``, ``.png`` and ``.jpeg``.
    """
    # Set data directories using config.yml
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # Python objects and is rejected by recent PyYAML releases; consider
    # yaml.safe_load once it is confirmed config.yml uses only plain YAML.
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file)
    env = config['env']
    api_mongo_addr = config['api']['api_mongo_addr']
    population_samples_dir = config['data']['population_samples_dir']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    genotypes_dir = config['data']['genotypes_dir']
    aws_info = config['aws']
    mongo_username = config['database']['mongo_user_readonly']
    mongo_password = config['database']['mongo_password']
    mongo_port = config['database']['mongo_port']

    export_s3_keys = retrieveAWSCredentials()

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Open SNP list file (closed again as soon as it has been read)
    with open(snplst) as snp_file:
        snps_raw = snp_file.readlines()

    # Remove duplicate RS numbers.  Blank lines split to an empty list, which
    # would break the snp[0] lookups further down, so they are skipped here.
    snps = []
    for snp_raw in snps_raw:
        snp = snp_raw.strip().split()
        if snp and snp not in snps:
            snps.append(snp)

    # Select desired ancestral populations
    known_pops = [
        "ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX",
        "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS",
        "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU",
        "TSI", "YRI"
    ]
    pop_dirs = [
        data_dir + population_samples_dir + pop_i + ".txt"
        for pop_i in pop.split("+") if pop_i in known_pops
    ]

    # Only whitelisted codes reach this shell command.
    get_pops = "cat " + " ".join(pop_dirs)
    pop_list = [
        x.decode('utf-8') for x in subprocess.Popen(
            get_pops, shell=True, stdout=subprocess.PIPE).stdout.readlines()
    ]
    pop_ids = list(set(i.strip() for i in pop_list))

    # Connect to Mongo snp database
    if env == 'local':
        mongo_host = api_mongo_addr
    else:
        mongo_host = 'localhost'
    client = MongoClient(
        'mongodb://' + mongo_username + ':' + mongo_password + '@' +
        mongo_host + '/admin', mongo_port)
    db = client["LDLink"]

    def get_coords(db, rsid):
        """Return the dbsnp document for an rsID (JSON-sanitized) or None."""
        rsid = rsid.strip("rs")
        query_results = db.dbsnp.find_one({"id": rsid})
        return json.loads(json_util.dumps(query_results))

    # Query genomic coordinates
    def get_rsnum(db, coord):
        """Return the dbsnp documents located at a ``chr<N>:<pos>`` coordinate."""
        temp_coord = coord.strip("chr").split(":")
        chro = temp_coord[0]
        pos = temp_coord[1]
        query_results = db.dbsnp.find({
            "chromosome": chro.upper() if chro == 'x' or chro == 'y' else str(chro),
            genome_build_vars[genome_build]['position']: str(pos)
        })
        return json.loads(json_util.dumps(query_results))

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coords_rsid(db, snp_lst):
        """Map coordinate entries to ``[rsID]``; keep entries that cannot be mapped."""
        new_snp_lst = []
        for snp_raw_i in snp_lst:
            if snp_raw_i[0][0:2] == "rs":
                new_snp_lst.append(snp_raw_i)
                continue
            snp_info_lst = get_rsnum(db, snp_raw_i[0])
            print("snp_info_lst")
            print(snp_info_lst)
            if snp_info_lst is None or len(snp_info_lst) == 0:
                new_snp_lst.append(snp_raw_i)
            elif len(snp_info_lst) == 1:
                new_snp_lst.append(["rs" + snp_info_lst[0]['id']])
            else:
                # Several records share the coordinate: prefer the first one
                # whose id equals its ref_id, else fall back to the first hit.
                ref_variants = [
                    snp_info['id'] for snp_info in snp_info_lst
                    if snp_info['id'] == snp_info['ref_id']
                ]
                if ref_variants:
                    var_id = "rs" + ref_variants[0]
                else:
                    var_id = "rs" + snp_info_lst[0]['id']
                new_snp_lst.append([var_id])
        return new_snp_lst

    snps = replace_coords_rsid(db, snps)

    # Find RS numbers in snp database
    rs_nums = []
    snp_pos = []
    snp_coords = []
    for snp_i in snps:
        if len(snp_i) == 0 or len(snp_i[0]) <= 2:
            continue
        variant = snp_i[0]
        looks_valid = (variant[0:2] == "rs" or variant[0:3] == "chr") \
            and variant[-1].isdigit()
        if not looks_valid:
            continue
        snp_coord = get_coords(db, variant)
        if snp_coord is None:
            continue
        pos_key = genome_build_vars[genome_build]['position']
        if snp_coord[pos_key] == "NA":
            continue
        # check if variant is on chrY for genome build = GRCh38
        if snp_coord['chromosome'] == "Y" and \
                genome_build in ("grch38", "grch38_high_coverage"):
            continue
        rs_nums.append(variant)
        snp_pos.append(snp_coord[pos_key])
        snp_coords.append([variant, snp_coord['chromosome'], snp_coord[pos_key]])

    # Sort coordinates and make tabix formatted coordinates
    snp_pos_int = sorted(int(i) for i in snp_pos)
    snp_coord_str = [
        genome_build_vars[genome_build]['1000G_chr_prefix'] +
        snp_coords[0][1] + ":" + str(i) + "-" + str(i) for i in snp_pos_int
    ]
    tabix_coords = " " + " ".join(snp_coord_str)

    # Extract 1000 Genomes phased genotypes
    vcf_filePath = "%s/%s%s/%s" % (
        config['aws']['data_subfolder'], genotypes_dir,
        genome_build_vars[genome_build]['1000G_dir'],
        genome_build_vars[genome_build]['1000G_file'] % (snp_coords[0][1]))
    vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)

    # Define function to correct indel alleles
    def set_alleles(a1, a2):
        """Drop the shared leading base of indel alleles; "-" marks the empty allele."""
        if len(a1) == 1 and len(a2) == 1:
            a1_n = a1
            a2_n = a2
        elif len(a1) == 1 and len(a2) > 1:
            a1_n = "-"
            a2_n = a2[1:]
        elif len(a1) > 1 and len(a2) == 1:
            a1_n = a1[1:]
            a2_n = "-"
        elif len(a1) > 1 and len(a2) > 1:
            a1_n = a1[1:]
            a2_n = a2[1:]
        return (a1_n, a2_n)

    # Import SNP VCF files
    tabix_snps = export_s3_keys + " cd {2}; tabix -fhD {0}{1} | grep -v -e END".format(
        vcf_query_snp_file, tabix_coords,
        data_dir + genotypes_dir + genome_build_vars[genome_build]['1000G_dir'])
    vcf = [
        x.decode('utf-8') for x in subprocess.Popen(
            tabix_snps, shell=True, stdout=subprocess.PIPE).stdout.readlines()
    ]

    # Skip the "##" meta lines; the next line is the column header
    h = 0
    while vcf[h][0:2] == "##":
        h += 1

    head = vcf[h].strip().split()

    # Extract haplotypes: columns from index 9 on are matched against the
    # selected population sample ids
    index = [i for i in range(9, len(head)) if head[i] in pop_ids]

    # One haplotype list per selected sample (at least one list is always
    # created, even when no sample column matched)
    hap1 = [[] for _ in range(max(1, len(index)))]
    hap2 = [[] for _ in range(max(1, len(index)))]

    rsnum_lst = []
    allele_lst = []
    pos_lst = []

    for g in range(h + 1, len(vcf)):
        geno = vcf[g].strip().split()
        geno[0] = geno[0].lstrip('chr')
        if geno[1] not in snp_pos:
            continue

        if snp_pos.count(geno[1]) == 1:
            rs_query = rs_nums[snp_pos.index(geno[1])]
        else:
            # Several queried variants share this position: take the first
            # one that has not been emitted yet.
            for p in range(len(snp_pos)):
                if snp_pos[p] == geno[1] and rs_nums[p] not in rsnum_lst:
                    rs_query = rs_nums[p]
                    break

        if rs_query in rsnum_lst:
            continue

        rs_1000g = geno[2]

        if rs_query == rs_1000g:
            rsnum = rs_1000g
        else:
            # Look at the records within two lines of this one for a record
            # carrying the queried id; if one exists this record is skipped.
            count = -2
            found = False
            while count <= 2 and count + g < len(vcf):
                geno_next = vcf[g + count].strip().split()
                if len(geno_next) >= 3 and rs_query == geno_next[2]:
                    found = True
                    break
                count += 1
            if found:
                continue
            # No neighbouring record has the queried id: keep the queried id
            # for this position.
            rsnum = rs_query

        # Multi-allelic records (comma in REF/ALT) are not used
        if "," not in geno[3] and "," not in geno[4]:
            a1, a2 = set_alleles(geno[3], geno[4])
            phased = {
                "0|0": (a1, a1),
                "0|1": (a1, a2),
                "1|0": (a2, a1),
                "1|1": (a2, a2),
                "0": (a1, "."),
                "1": (a2, "."),
            }
            for i in range(len(index)):
                first, second = phased.get(geno[index[i]], (".", "."))
                hap1[i].append(first)
                hap2[i].append(second)

            rsnum_lst.append(rsnum)
            pos_lst.append("chr" + geno[0] + ":" + geno[1] + "-" + geno[1])
            allele_lst.append(a1 + "/" + a2)

    # Calculate Pairwise LD Statistics
    all_haps = hap1 + hap2
    n_variants = len(all_haps[0])
    ld_matrix = [[[None for v in range(2)] for i in range(n_variants)]
                 for j in range(n_variants)]

    for i in range(n_variants):
        for j in range(i, n_variants):
            # Count two-locus haplotypes
            hap = {}
            for k in range(len(all_haps)):
                hap_k = all_haps[k][i] + all_haps[k][j]
                hap[hap_k] = hap.get(hap_k, 0) + 1

            # Remove Missing Haplotypes
            for key in list(hap.keys()):
                if "." in key:
                    hap.pop(key, None)

            # Check all haplotypes are present
            if len(hap) != 4:
                snp_i_a = allele_lst[i].split("/")
                snp_j_a = allele_lst[j].split("/")
                expected = [
                    snp_i_a[0] + snp_j_a[0], snp_i_a[0] + snp_j_a[1],
                    snp_i_a[1] + snp_j_a[0], snp_i_a[1] + snp_j_a[1]
                ]
                for hap_key in expected:
                    if hap_key not in hap:
                        hap[hap_key] = 0

            # Perform LD calculations
            hap_keys = sorted(hap)
            A = hap[hap_keys[0]]
            B = hap[hap_keys[1]]
            C = hap[hap_keys[2]]
            D = hap[hap_keys[3]]
            delta = float(A * D - B * C)
            Ms = float((A + C) * (B + D) * (A + B) * (C + D))
            if Ms != 0:
                # D prime
                if delta < 0:
                    D_prime = round(
                        abs(delta / min((A + C) * (A + B), (B + D) * (C + D))), 3)
                else:
                    D_prime = round(
                        abs(delta / min((A + C) * (C + D), (A + B) * (B + D))), 3)

                # R2
                r2 = round((delta**2) / Ms, 3)

                # Find Correlated Alleles
                if r2 > 0.1:
                    if (A * D) / max((B * C), 0.01) > 1:
                        match = hap_keys[0][0] + "=" + hap_keys[0][1] + "," + \
                            hap_keys[3][0] + "=" + hap_keys[3][1]
                    else:
                        match = hap_keys[1][0] + "=" + hap_keys[1][1] + "," + \
                            hap_keys[2][0] + "=" + hap_keys[2][1]
                else:
                    match = " = , = "
            else:
                D_prime = "NA"
                r2 = "NA"
                match = " = , = "

            snp1 = rsnum_lst[i]
            snp2 = rsnum_lst[j]
            pos1 = pos_lst[i].split("-")[0]
            pos2 = pos_lst[j].split("-")[0]
            allele1 = allele_lst[i]
            allele2 = allele_lst[j]
            # corr is match with both sides of each "=" swapped
            left = match.split(",")[0]
            right = match.split(",")[1]
            corr = left.split("=")[1] + "=" + left.split("=")[0] + "," + \
                right.split("=")[1] + "=" + right.split("=")[0]
            corr_f = match

            ld_matrix[i][j] = [
                snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2
            ]
            ld_matrix[j][i] = [
                snp2, snp1, allele2, allele1, corr_f, pos2, pos1, D_prime, r2
            ]

    # Generate Plot Variables
    out = [j for i in ld_matrix for j in i]
    xnames = []
    ynames = []
    xA = []
    yA = []
    corA = []
    xpos = []
    ypos = []
    D = []
    R = []
    box_color = []
    box_trans = []

    if r2_d not in ["r2", "d"]:
        r2_d = "r2"

    # `out` is the flattened square matrix, so its side length is sqrt(len)
    side = math.floor(math.sqrt(len(out)))
    for i, cell in enumerate(out):
        snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2 = cell
        xnames.append(snp1)
        ynames.append(snp2)
        xA.append(allele1)
        yA.append(allele2)
        corA.append(corr)
        xpos.append(pos1)
        ypos.append(pos2)
        col = i % side
        row = i // side
        if col < row and r2 != "NA":
            # cells with col < row are blue and shaded by |D'|
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("blue")
            box_trans.append(abs(D_prime))
        elif col > row and D_prime != "NA":
            # cells with col > row are red and shaded by R2
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("red")
            box_trans.append(r2)
        elif col == row and D_prime != "NA":
            # diagonal cells are purple
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("purple")
            box_trans.append(r2)
        else:
            D.append("NA")
            R.append("NA")
            box_color.append("gray")
            box_trans.append(0.1)

    # Import plotting modules
    import glob
    from collections import OrderedDict
    from bokeh.embed import components, file_html
    from bokeh.layouts import gridplot
    from bokeh.models import HoverTool, LinearAxis, Range1d
    from bokeh.plotting import ColumnDataSource, curdoc, figure, output_file, reset_output, save
    from bokeh.resources import CDN
    from bokeh.io import export_svgs
    import svgutils.compose as sg
    from math import pi

    reset_output()

    # Aggregate Plotting Data (one entry per variant: every side-th cell)
    x = []
    y = []
    w = []
    h = []
    coord_snps_plot = []
    snp_id_plot = []
    alleles_snp_plot = []
    for i in range(0, len(xpos), int(len(xpos)**0.5)):
        x.append(int(xpos[i].split(":")[1]) / 1000000.0)
        y.append(0.5)
        w.append(0.00003)
        h.append(1.06)
        coord_snps_plot.append(xpos[i])
        snp_id_plot.append(xnames[i])
        alleles_snp_plot.append(xA[i])

    buffer = (x[-1] - x[0]) * 0.025
    xr = Range1d(start=x[0] - buffer, end=x[-1] + buffer)
    yr = Range1d(start=-0.03, end=1.03)
    yr2 = Range1d(start=0, end=3.8)
    yr3 = Range1d(start=0, end=1)

    # Evenly spaced matrix columns across the plotted coordinate range
    spacing = (x[-1] - x[0] + buffer + buffer) / (len(x) * 1.0)
    x2 = [x[0] - buffer + spacing * (i + 0.5) for i in range(len(x))]
    y0 = [0] * len(x)
    y1 = [0.20] * len(x)
    y2 = [0.80] * len(x)
    y3 = [1] * len(x)
    y4 = [1.15] * len(x)

    # Every column position repeated once per row, matching the order of `out`
    xname_pos = [col_x for col_x in x2 for _ in range(len(x2))]

    data = {
        'xname': xnames,
        'xname_pos': xname_pos,
        'yname': ynames,
        'xA': xA,
        'yA': yA,
        'xpos': xpos,
        'ypos': ypos,
        'R2': R,
        'Dp': D,
        'corA': corA,
        'box_color': box_color,
        'box_trans': box_trans
    }

    source = ColumnDataSource(data)

    # From this many input variants on, the y axis and the per-variant
    # connector labels are left out
    threshold = 70
    show_labels = len(snps) < threshold

    matrix_kwargs = dict(
        outline_line_color="white", min_border_top=0, min_border_bottom=2,
        min_border_left=100, min_border_right=5,
        x_range=xr, y_range=list(reversed(rsnum_lst)),
        h_symmetry=False, v_symmetry=False, border_fill_color='white',
        x_axis_type=None, logo=None,
        tools="hover,undo,redo,reset,pan,box_zoom,previewsave",
        title=" ", plot_width=800, plot_height=700)
    if not show_labels:
        matrix_kwargs['y_axis_type'] = None
    matrix_plot = figure(**matrix_kwargs)

    matrix_plot.rect(x='xname_pos', y='yname', width=0.95 * spacing, height=0.95,
                     source=source, color="box_color", alpha="box_trans",
                     line_color=None)

    matrix_plot.grid.grid_line_color = None
    matrix_plot.axis.axis_line_color = None
    matrix_plot.axis.major_tick_line_color = None
    if show_labels:
        matrix_plot.axis.major_label_text_font_size = "8pt"
        matrix_plot.xaxis.major_label_orientation = "vertical"

    matrix_plot.axis.major_label_text_font_style = "normal"
    matrix_plot.xaxis.major_label_standoff = 0

    sup_2 = "\u00B2"

    hover = matrix_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Variant 1", " " + "@yname (@yA)"),
        ("Variant 2", " " + "@xname (@xA)"),
        ("D\'", " " + "@Dp"),
        ("R" + sup_2, " " + "@R2"),
        ("Correlated Alleles", " " + "@corA"),
    ])

    # Connecting and Rug Plots
    # Connector Plot
    connector = figure(
        outline_line_color="white", y_axis_type=None, x_axis_type=None,
        x_range=xr, y_range=yr2 if show_labels else yr3,
        border_fill_color='white', title="",
        min_border_left=100, min_border_right=5,
        min_border_top=0, min_border_bottom=0,
        h_symmetry=False, v_symmetry=False,
        plot_width=800, plot_height=90 if show_labels else 30,
        tools="xpan,tap")
    connector.segment(x, y0, x, y1, color="black")
    connector.segment(x, y1, x2, y2, color="black")
    connector.segment(x2, y2, x2, y3, color="black")
    if show_labels:
        connector.text(x2, y4, text=snp_id_plot, alpha=1, angle=pi / 2,
                       text_font_size="8pt", text_baseline="middle",
                       text_align="left")

    connector.yaxis.major_label_text_color = None
    connector.yaxis.minor_tick_line_alpha = 0  # Option does not work
    connector.yaxis.axis_label = " "
    connector.grid.grid_line_color = None
    connector.axis.axis_line_color = None
    connector.axis.major_tick_line_color = None
    connector.axis.minor_tick_line_color = None

    connector.toolbar_location = None

    data_rug = {
        'x': x,
        'y': y,
        'w': w,
        'h': h,
        'coord_snps_plot': coord_snps_plot,
        'snp_id_plot': snp_id_plot,
        'alleles_snp_plot': alleles_snp_plot
    }

    source_rug = ColumnDataSource(data_rug)

    # Rug Plot
    rug = figure(x_range=xr, y_range=yr, y_axis_type=None, title="",
                 min_border_top=1, min_border_bottom=0,
                 min_border_left=100, min_border_right=5,
                 h_symmetry=False, v_symmetry=False,
                 plot_width=800, plot_height=50, tools="hover,xpan,tap")
    rug.rect(x='x', y='y', width='w', height='h', fill_color='red',
             dilate=True, line_color=None, fill_alpha=0.6, source=source_rug)

    hover = rug.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("SNP", "@snp_id_plot (@alleles_snp_plot)"),
        ("Coord", "@coord_snps_plot"),
    ])

    rug.toolbar_location = None

    gap = 80000
    tall = 0.75

    if collapseTranscript == "false":
        # Gene Plot (All Transcripts)
        genes_file = tmp_dir + "genes_" + request + ".json"
        with open(genes_file) as genes_handle:
            genes_raw = genes_handle.readlines()

        genes_plot_start = []
        genes_plot_end = []
        genes_plot_y = []
        genes_plot_name = []
        exons_plot_x = []
        exons_plot_y = []
        exons_plot_w = []
        exons_plot_h = []
        exons_plot_name = []
        exons_plot_id = []
        exons_plot_exon = []
        # lines[r] holds the txEnd of the last gene placed on row r
        lines = [0]

        for gene_raw_obj in genes_raw:
            gene_obj = json.loads(gene_raw_obj)
            transcript_id = gene_obj["name"]
            strand = gene_obj["strand"]
            txStart = gene_obj["txStart"]
            txEnd = gene_obj["txEnd"]
            name = gene_obj["name2"]
            e_start = gene_obj["exonStarts"].split(",")
            e_end = gene_obj["exonEnds"].split(",")

            # Determine Y Coordinate: first row whose last gene ends more
            # than `gap` bases before this one starts, else a new row
            i = 0
            y_coord = None
            while y_coord is None:
                if i > len(lines) - 1:
                    y_coord = i + 1
                    lines.append(int(txEnd))
                elif int(txStart) > (gap + lines[i]):
                    y_coord = i + 1
                    lines[i] = int(txEnd)
                else:
                    i += 1

            genes_plot_start.append(int(txStart) / 1000000.0)
            genes_plot_end.append(int(txEnd) / 1000000.0)
            genes_plot_y.append(y_coord)
            genes_plot_name.append(name + " ")

            # The last element of the split exon lists is not iterated
            for i in range(len(e_start) - 1):
                if strand == "+":
                    exon = i + 1
                else:
                    exon = len(e_start) - 1 - i

                width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                exons_plot_x.append(x_coord)
                exons_plot_y.append(y_coord)
                exons_plot_w.append(width)
                exons_plot_h.append(tall)
                exons_plot_name.append(name)
                exons_plot_id.append(transcript_id)
                exons_plot_exon.append(exon)

        n_rows = len(lines)
        genes_plot_yn = [n_rows - row_no + 0.5 for row_no in genes_plot_y]
        exons_plot_yn = [n_rows - row_no + 0.5 for row_no in exons_plot_y]
        yr2 = Range1d(start=0, end=n_rows)

        data_gene_plot = {
            'exons_plot_x': exons_plot_x,
            'exons_plot_yn': exons_plot_yn,
            'exons_plot_w': exons_plot_w,
            'exons_plot_h': exons_plot_h,
            'exons_plot_name': exons_plot_name,
            'exons_plot_id': exons_plot_id,
            'exons_plot_exon': exons_plot_exon,
            'coord_snps_plot': coord_snps_plot,
            'snp_id_plot': snp_id_plot,
            'alleles_snp_plot': alleles_snp_plot
        }

        source_gene_plot = ColumnDataSource(data_gene_plot)

        if len(lines) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines) - 2) * 50

        gene_plot = figure(
            min_border_top=2, min_border_bottom=0,
            min_border_left=100, min_border_right=5,
            x_range=xr, y_range=yr2, border_fill_color='white',
            title="", h_symmetry=False, v_symmetry=False, logo=None,
            plot_width=800, plot_height=plot_h_pix,
            tools="hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        gene_plot.segment(genes_plot_start, genes_plot_yn, genes_plot_end,
                          genes_plot_yn, color="black", alpha=1, line_width=2)
        gene_plot.rect(x='exons_plot_x', y='exons_plot_yn', width='exons_plot_w',
                       height='exons_plot_h', source=source_gene_plot,
                       fill_color='grey', line_color="grey")
        gene_plot.text(genes_plot_start, genes_plot_yn, text=genes_plot_name,
                       alpha=1, text_font_size="7pt", text_font_style="bold",
                       text_baseline="middle", text_align="right", angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_plot_name"),
            ("ID", "@exons_plot_id"),
            ("Exon", "@exons_plot_exon"),
        ])

        gene_plot.yaxis.axis_label = "Genes (All Transcripts)"

    # Gene Plot (Collapsed)
    else:
        genes_c_file = tmp_dir + "genes_c_" + request + ".json"
        with open(genes_c_file) as genes_c_handle:
            genes_c_raw = genes_c_handle.readlines()

        genes_c_plot_start = []
        genes_c_plot_end = []
        genes_c_plot_y = []
        genes_c_plot_name = []
        exons_c_plot_x = []
        exons_c_plot_y = []
        exons_c_plot_w = []
        exons_c_plot_h = []
        exons_c_plot_name = []
        exons_c_plot_id = []
        # lines_c[r] holds the txEnd of the last gene placed on row r
        lines_c = [0]

        for gene_c_raw_obj in genes_c_raw:
            gene_c_obj = json.loads(gene_c_raw_obj)
            txStart = gene_c_obj["txStart"]
            txEnd = gene_c_obj["txEnd"]
            name = gene_c_obj["name2"]
            e_start = gene_c_obj["exonStarts"].split(",")
            e_end = gene_c_obj["exonEnds"].split(",")
            e_transcripts = gene_c_obj["transcripts"].split(",")

            # Determine Y Coordinate (same row packing as the all-transcripts
            # track)
            i = 0
            y_coord = None
            while y_coord is None:
                if i > len(lines_c) - 1:
                    y_coord = i + 1
                    lines_c.append(int(txEnd))
                elif int(txStart) > (gap + lines_c[i]):
                    y_coord = i + 1
                    lines_c[i] = int(txEnd)
                else:
                    i += 1

            genes_c_plot_start.append(int(txStart) / 1000000.0)
            genes_c_plot_end.append(int(txEnd) / 1000000.0)
            genes_c_plot_y.append(y_coord)
            genes_c_plot_name.append(name + " ")

            # The last element of the split exon lists is not iterated
            for i in range(len(e_start) - 1):
                width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                exons_c_plot_x.append(x_coord)
                exons_c_plot_y.append(y_coord)
                exons_c_plot_w.append(width)
                exons_c_plot_h.append(tall)
                exons_c_plot_name.append(name)
                exons_c_plot_id.append(e_transcripts[i].replace("-", ","))

        n_rows_c = len(lines_c)
        genes_c_plot_yn = [n_rows_c - row_no + 0.5 for row_no in genes_c_plot_y]
        exons_c_plot_yn = [n_rows_c - row_no + 0.5 for row_no in exons_c_plot_y]
        yr2_c = Range1d(start=0, end=n_rows_c)

        data_gene_c_plot = {
            'exons_c_plot_x': exons_c_plot_x,
            'exons_c_plot_yn': exons_c_plot_yn,
            'exons_c_plot_w': exons_c_plot_w,
            'exons_c_plot_h': exons_c_plot_h,
            'exons_c_plot_name': exons_c_plot_name,
            'exons_c_plot_id': exons_c_plot_id
        }

        source_gene_c_plot = ColumnDataSource(data_gene_c_plot)

        if len(lines_c) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines_c) - 2) * 50

        # NOTE(review): plot_width is 900 here but 800 for every other track,
        # including the all-transcripts gene track; confirm this is intended.
        gene_plot = figure(
            min_border_top=2, min_border_bottom=0,
            min_border_left=100, min_border_right=5,
            x_range=xr, y_range=yr2_c, border_fill_color='white',
            title="", h_symmetry=False, v_symmetry=False, logo=None,
            plot_width=900, plot_height=plot_h_pix,
            tools="hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        gene_plot.segment(genes_c_plot_start, genes_c_plot_yn, genes_c_plot_end,
                          genes_c_plot_yn, color="black", alpha=1, line_width=2)
        gene_plot.rect(x='exons_c_plot_x', y='exons_c_plot_yn',
                       width='exons_c_plot_w', height='exons_c_plot_h',
                       source=source_gene_c_plot, fill_color="grey",
                       line_color="grey")
        gene_plot.text(genes_c_plot_start, genes_c_plot_yn,
                       text=genes_c_plot_name, alpha=1, text_font_size="7pt",
                       text_font_style="bold", text_baseline="middle",
                       text_align="right", angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_c_plot_name"),
            ("Transcript IDs", "@exons_c_plot_id"),
        ])

        gene_plot.yaxis.axis_label = "Genes (Transcripts Collapsed)"

    # Axis styling shared by both gene tracks.  The chromosome is read from
    # the first resolved variant (the same entry the tabix query uses), so a
    # single-variant request no longer fails on snp_coords[1].
    gene_plot.xaxis.axis_label = "Chromosome " + snp_coords[0][1] + \
        " Coordinate (Mb)(" + genome_build_vars[genome_build]['title'] + ")"
    gene_plot.ygrid.grid_line_color = None
    gene_plot.yaxis.axis_line_color = None
    gene_plot.yaxis.minor_tick_line_color = None
    gene_plot.yaxis.major_tick_line_color = None
    gene_plot.yaxis.major_label_text_color = None

    gene_plot.toolbar_location = "below"

    # Change output backend to SVG temporarily for headless export
    # Will be changed back to canvas in LDlink.js
    matrix_plot.output_backend = "svg"
    connector.output_backend = "svg"
    rug.output_backend = "svg"
    gene_plot.output_backend = "svg"

    matrix_svg = tmp_dir + "matrix_plot_1_" + request + ".svg"
    connector_svg = tmp_dir + "connector_1_" + request + ".svg"
    rug_svg = tmp_dir + "rug_1_" + request + ".svg"
    gene_svg = tmp_dir + "gene_plot_1_" + request + ".svg"
    combined_base = tmp_dir + "matrix_plot_" + request
    scaled_svg = tmp_dir + "matrix_plot_scaled_" + request + ".svg"

    export_svgs(matrix_plot, filename=matrix_svg)
    export_svgs(connector, filename=connector_svg)
    export_svgs(rug, filename=rug_svg)
    export_svgs(gene_plot, filename=gene_svg)

    # 1 pixel = 0.0264583333 cm
    svg_height = str(25.00 + (0.0264583333 * plot_h_pix)) + "cm"
    svg_height_scaled = str(110.00 + (0.1322916665 * plot_h_pix)) + "cm"

    # Concatenate svgs
    sg.Figure(
        "21.59cm", svg_height,
        sg.SVG(matrix_svg),
        sg.SVG(connector_svg).scale(.97).move(0, 700),
        sg.SVG(rug_svg).scale(.97).move(0, 790),
        sg.SVG(gene_svg).scale(.97).move(0, 840)
    ).save(combined_base + ".svg")

    sg.Figure(
        "107.95cm", svg_height_scaled,
        sg.SVG(matrix_svg).scale(5),
        sg.SVG(connector_svg).scale(4.85).move(0, 3500),
        sg.SVG(rug_svg).scale(4.85).move(0, 3930),
        sg.SVG(gene_svg).scale(4.85).move(0, 4160)
    ).save(scaled_svg)

    def rasterize(src, dst):
        """Convert *src* to *dst* with phantomjs (format chosen by extension)."""
        # An argv list is used instead of a shell string so that `request`
        # (part of both paths) is never interpreted by a shell.
        try:
            subprocess.call(["phantomjs", "./rasterize.js", src, dst])
        except OSError as err:
            # The previous shell invocation did not raise when phantomjs was
            # missing; keep going, but say why no output was produced.
            print("phantomjs could not be started: " + str(err))

    def remove_quietly(path):
        """Delete *path*; a file that cannot be removed is not an error."""
        try:
            os.remove(path)
        except OSError:
            # Mirrors the former `rm`, whose failure was never checked
            pass

    # Export to PDF
    rasterize(combined_base + ".svg", combined_base + ".pdf")
    # Export to PNG
    rasterize(scaled_svg, combined_base + ".png")
    # Export to JPEG
    rasterize(scaled_svg, combined_base + ".jpeg")

    # Remove individual SVG files after they are combined
    remove_quietly(matrix_svg)
    remove_quietly(gene_svg)
    remove_quietly(rug_svg)
    remove_quietly(connector_svg)
    # Remove scaled SVG file after it is converted to png and jpeg
    remove_quietly(scaled_svg)
    # Remove temporary file(s)
    for genes_json in glob.glob(tmp_dir + "genes_*" + request + "*.json"):
        remove_quietly(genes_json)

    reset_output()

    return None
def calculate_assoc_svg(file, region, pop, request, myargs, myargsName, myargsOrigin):
    """Export the regional association plot for ``request`` as SVG/PDF/PNG/JPEG.

    The function re-reads the association file, re-runs the LD worker
    script, rebuilds the Bokeh figures with the SVG backend and stitches
    them together with svgutils.  Results are written to
    ``./tmp/assoc_plot_<request>.{svg,pdf,png,jpeg}``.

    Parameters
    ----------
    file : str
        Path of the whitespace-delimited association file.  Its first line
        is a header that must contain the column names given in ``myargs``.
    region : str
        One of ``"variant"``, ``"gene"`` or ``"region"``; any other value
        makes the function return early.
    pop : str
        Population label; only used in the plot title.
    request : str
        Token embedded in every temporary/output file name.
    myargs : dict
        Reads the keys ``'chr'``, ``'bp'``, ``'pval'`` (column names),
        ``'window'``, ``'start'``, ``'end'``, ``'dprime'``, ``'annotate'``
        and ``'transcript'``.
    myargsName : str
        Gene name for ``region == "gene"``; the literal string ``"None"``
        means "not supplied".
    myargsOrigin : str
        Index variant as an rs number or ``chr:pos``; the literal string
        ``"None"`` means "not supplied".

    Returns
    -------
    None
        Always.  Invalid or unusable input also returns ``None`` (without
        writing images); the outcome is only observable through the files.

    Notes
    -----
    Reads ``pops_<request>.txt``, ``snp_no_dups_<request>.vcf``,
    ``recomb_<request>.txt`` and ``genes_<request>.txt`` /
    ``genes_c_<request>.txt`` from ``./tmp/`` and deletes them (plus
    ``assoc_args<request>.json``) on successful completion.
    """

    def read_lines(path):
        # Read a whole text file and make sure the handle is closed.
        with open(path) as handle:
            return handle.readlines()

    def read_vcf_header(vcf_path):
        # Column names of the "#CHROM ..." header line of a tabix-indexed VCF.
        # communicate() waits for the child and closes the pipe.
        proc = subprocess.Popen("tabix -H {0} | grep CHROM".format(vcf_path),
                                shell=True, stdout=subprocess.PIPE)
        out = proc.communicate()[0]
        return out.splitlines()[0].decode('utf-8').strip().split()

    def count_alleles(geno, head, sample_ids):
        # Tally phased allele codes over the VCF sample columns (index >= 9)
        # whose header name is one of the selected sample ids.
        counts = {"0": 0, "1": 0}
        for column in range(9, len(head)):
            if head[column] in sample_ids:
                for allele in geno[column].split("|"):
                    counts[allele] = counts.get(allele, 0) + 1
        return counts

    def assign_row(lanes, tx_start, tx_end, gap):
        # First display row whose last gene ends more than `gap` bp before
        # this gene starts; opens a new row when none fits.  Rows are 1-based.
        for lane, lane_end in enumerate(lanes):
            if tx_start > gap + lane_end:
                lanes[lane] = tx_end
                return lane + 1
        lanes.append(tx_end)
        return len(lanes)

    # Set data directories using config.yml
    # safe_load: yaml.load() without a Loader can construct arbitrary objects
    # and is rejected by PyYAML >= 6.
    with open('config.yml', 'r') as f:
        config = yaml.safe_load(f)
    gene_dir2 = config['data']['gene_dir2']
    vcf_dir = config['data']['vcf_dir']

    tmp_dir = "./tmp/"

    # NOTE(review): `request` (and later `snp`) are interpolated into shell
    # command strings below -- confirm callers only pass sanitized tokens.
    def discard_ld_inputs():
        # Remove the population list and per-request VCF extracts.
        subprocess.call("rm "+tmp_dir+"pops_"+request+".txt", shell=True)
        subprocess.call("rm "+tmp_dir+"*"+request+"*.vcf", shell=True)

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    chrs = ["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","X","Y"]

    # Define parameters for --variant option
    if region == "variant" and myargsOrigin == "None":
        return None

    # Stays None when no origin was supplied; the lowest-P variant is then
    # chosen further down.
    org_coord = None

    if myargsOrigin != "None":
        origin_parts = myargsOrigin.split(":")
        # Find coordinates (GRCh37/hg19) for SNP RS number
        if myargsOrigin[0:2] == "rs":
            snp = myargsOrigin

            # Connect to Mongo snp database and look the RS number up
            client = MongoClient('mongodb://'+username+':'+password+'@localhost/admin', port)
            try:
                query_results = client["LDLink"].dbsnp151.find_one({"id": snp.strip("rs")})
                # Round-trip through JSON to turn BSON types into plain ones.
                var_coord = json.loads(json_util.dumps(query_results))
            finally:
                client.close()

            if var_coord is None:
                return None

        elif origin_parts[0].strip("chr") in chrs and len(origin_parts) == 2:
            snp = myargsOrigin
            # Same keys as the database record so the lookups below work for
            # both origin formats (this used to be a list, which failed on
            # the string-key access).
            var_coord = {'chromosome': origin_parts[0].strip("chr"),
                         'position': origin_parts[1]}

        else:
            return None

        chromosome = var_coord['chromosome']
        org_coord = var_coord['position']

    # Open Association Data
    header_list = [myargs['chr'], myargs['bp'], myargs['pval']]

    # Load input file
    with open(file) as fp:
        header = fp.readline().strip().split()
        first = fp.readline().strip().split()

    if len(header) != len(first):
        return None

    # Check header
    for item in header_list:
        if item not in header:
            return None

    len_head = len(header)

    chr_index = header.index(myargs['chr'])
    pos_index = header.index(myargs['bp'])
    p_index = header.index(myargs['pval'])

    # Define window of interest around query SNP
    if myargs['window'] is None:
        if region == "variant":
            window = 500000
        elif region == "gene":
            window = 100000
        else:
            window = 0
    else:
        window = myargs['window']

    if region == "variant":
        coord1 = max(int(org_coord)-window, 0)
        coord2 = int(org_coord)+window

    elif region == "gene":
        if myargsName == "None":
            return None

        # Connect to gene database and look the gene up by upper-cased name
        conn = sqlite3.connect(gene_dir2)
        conn.text_factory = str
        cur = conn.cursor()
        try:
            cur.execute("SELECT * FROM genes WHERE name=?", (myargsName.upper(),))
            gene_coord = cur.fetchone()
        finally:
            cur.close()
            conn.close()

        if gene_coord is None:
            return None

        # Define search coordinates
        coord1 = max(int(gene_coord[2])-window, 0)
        coord2 = int(gene_coord[3])+window

        # Run with --origin option
        if myargsOrigin != "None":
            if gene_coord[1] != chromosome:
                return None
            if coord1 > int(org_coord) or int(org_coord) > coord2:
                return None
        else:
            chromosome = gene_coord[1]

    elif region == "region":
        if myargs['start'] is None:
            return None
        if myargs['end'] is None:
            return None

        # Parse out chr and positions for --region option
        if len(myargs['start'].split(":")) != 2:
            return None
        if len(myargs['end'].split(":")) != 2:
            return None

        chr_s = myargs['start'].strip("chr").split(":")[0]
        coord_s = myargs['start'].split(":")[1]
        chr_e = myargs['end'].strip("chr").split(":")[0]
        coord_e = myargs['end'].split(":")[1]

        if chr_s not in chrs:
            return None
        if chr_e not in chrs:
            return None
        if chr_s != chr_e:
            return None
        # Compare numerically: as strings "9000" >= "10000" is true, which
        # rejected valid regions.
        if int(coord_s) >= int(coord_e):
            return None

        coord1 = max(int(coord_s)-window, 0)
        coord2 = int(coord_e)+window

        # Run with --origin option
        if myargsOrigin != "None":
            if chr_s != chromosome:
                return None
            if coord1 > int(org_coord) or int(org_coord) > coord2:
                return None
        else:
            chromosome = chr_s

    else:
        # Unknown region type: no search window can be defined.
        return None

    # Generate coordinate list and P-value dictionary
    max_window = 3000000
    if coord2-coord1 > max_window:
        return None

    assoc_coords = []
    a_pos = []
    assoc_dict = {}
    assoc_list = []
    with open(file) as fp:
        for line in fp:
            col = line.strip().split()
            if len(col) != len_head:
                continue
            if col[chr_index].strip("chr") != chromosome:
                continue
            # Rows with a non-numeric position or P-value (e.g. the header)
            # are skipped.
            try:
                position = int(col[pos_index])
            except ValueError:
                continue
            if not coord1 <= position <= coord2:
                continue
            try:
                p_value = float(col[p_index])
            except ValueError:
                continue
            coord_i = col[chr_index].strip("chr")+":"+col[pos_index]+"-"+col[pos_index]
            assoc_coords.append(coord_i)
            a_pos.append(col[pos_index])
            assoc_dict[coord_i] = [col[p_index]]
            assoc_list.append([coord_i, p_value])

    # Coordinate list checks
    if len(assoc_coords) == 0:
        return None

    # Get population ids from population output file from LDassoc.py
    pop_ids = {entry.strip() for entry in read_lines(tmp_dir+"pops_"+request+".txt")}

    vcf_file = vcf_dir+chromosome+".phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz"
    snp_vcf_path = tmp_dir+"snp_no_dups_"+request+".vcf"

    # Define LD origin coordinate
    if org_coord is None:
        # No origin supplied: walk the variants from lowest P-value upwards.
        # The header and the VCF extract (LDassoc.py output file) do not
        # depend on the candidate, so they are read once.
        head = read_vcf_header(vcf_file)
        vcf = read_lines(snp_vcf_path)

        for var_p in sorted(assoc_list, key=operator.itemgetter(1)):
            snp = "chr"+var_p[0].split("-")[0]

            # Check lowest P SNP is in the 1000G population and not monoallelic
            if len(vcf) == 0:
                continue
            geno = vcf[0].strip().split()

            if "," in geno[3] or "," in geno[4]:
                continue

            genotypes = count_alleles(geno, head, pop_ids)
            if genotypes["0"] == 0 or genotypes["1"] == 0:
                continue

            org_coord = var_p[0].split("-")[1]
            break

    else:
        if chromosome+":"+org_coord+"-"+org_coord not in assoc_coords:
            return None

        # Extract query SNP phased genotypes
        head = read_vcf_header(vcf_file)

        tabix_snp = "tabix {0} {1}:{2}-{2} | grep -v -e END > {3}".format(vcf_file, chromosome, org_coord, snp_vcf_path)
        subprocess.call(tabix_snp, shell=True)

        # Check query SNP is in the 1000G population, has the correct RS number, and not monoallelic
        vcf = read_lines(snp_vcf_path)

        if len(vcf) == 0:
            discard_ld_inputs()
            return None
        elif len(vcf) > 1:
            # Several records at this position: keep the last one whose ID
            # column matches the query.
            geno = []
            for record in vcf:
                fields = record.strip().split()
                if fields[2] == snp:
                    geno = fields
            if geno == []:
                discard_ld_inputs()
                return None
        else:
            geno = vcf[0].strip().split()

        if geno[2] != snp and snp[0:2] == "rs":
            snp = geno[2]

        if "," in geno[3] or "," in geno[4]:
            discard_ld_inputs()
            return None

        genotypes = count_alleles(geno, head, pop_ids)
        if genotypes["0"] == 0 or genotypes["1"] == 0:
            discard_ld_inputs()
            return None

    # Calculate proxy LD statistics in parallel
    if len(assoc_coords) < 60:
        threads = 1
    else:
        threads = 4

    # Floor division: a float block size cannot be used as a slice index.
    block = len(assoc_coords)//threads
    last_worker = threads-1
    commands = []
    for i in range(threads):
        # NOTE(review): the "+1" start offsets skip the element at index
        # block*i for every worker after the first.  Left unchanged here --
        # confirm against the interactive pipeline before altering.
        if i == 0 and i == last_worker:
            chunk = assoc_coords
        elif i == 0:
            chunk = assoc_coords[:block]
        elif i == last_worker:
            chunk = assoc_coords[(block*i)+1:]
        else:
            chunk = assoc_coords[(block*i)+1:block*(i+1)]
        commands.append("python LDassoc_sub.py "+snp+" "+chromosome+" "+"_".join(chunk)+" "+request+" "+str(i))

    processes = [subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) for command in commands]

    # collect output in parallel
    def get_output(process):
        return process.communicate()[0].splitlines()

    pool = Pool(len(processes))
    out_raw = pool.map(get_output, processes)
    pool.close()
    pool.join()

    # Aggregate output
    out_prox = []
    for worker_lines in out_raw:
        for raw_line in worker_lines:
            col = raw_line.decode('utf-8').strip().split("\t")
            col[6] = int(col[6])
            col[7] = float(col[7])
            col[8] = float(col[8])
            col.append(abs(int(col[6])))  # column 14: absolute distance
            pos_i_j = col[5].split(":")[1]
            coord_i_j = chromosome+":"+pos_i_j+"-"+pos_i_j
            # Keep only variants that also occur in the association file.
            if coord_i_j in assoc_dict:
                col.append(float(assoc_dict[coord_i_j][0]))  # column 15: P-value
                out_prox.append(col)

    # Stable two-pass sort: by P-value, ties broken by absolute distance.
    out_dist_sort = sorted(out_prox, key=operator.itemgetter(14))
    out_p_sort = sorted(out_dist_sort, key=operator.itemgetter(15), reverse=False)

    # Colour ramp, light to saturated red, indexed by int(statistic*100).
    reds = ["#FFCCCC","#FFCACA","#FFC8C8","#FFC6C6","#FFC4C4","#FFC2C2","#FFC0C0","#FFBEBE","#FFBCBC","#FFBABA","#FFB8B8","#FFB6B6","#FFB4B4","#FFB1B1","#FFAFAF","#FFADAD","#FFABAB","#FFA9A9","#FFA7A7","#FFA5A5","#FFA3A3","#FFA1A1","#FF9F9F","#FF9D9D","#FF9B9B","#FF9999","#FF9797","#FF9595","#FF9393","#FF9191","#FF8F8F","#FF8D8D","#FF8B8B","#FF8989","#FF8787","#FF8585","#FF8383","#FF8181","#FF7E7E","#FF7C7C","#FF7A7A","#FF7878","#FF7676","#FF7474","#FF7272","#FF7070","#FF6E6E","#FF6C6C","#FF6A6A","#FF6868","#FF6666","#FF6464","#FF6262","#FF6060","#FF5E5E","#FF5C5C","#FF5A5A","#FF5858","#FF5656","#FF5454","#FF5252","#FF5050","#FF4E4E","#FF4B4B","#FF4949","#FF4747","#FF4545","#FF4343","#FF4141","#FF3F3F","#FF3D3D","#FF3B3B","#FF3939","#FF3737","#FF3535","#FF3333","#FF3131","#FF2F2F","#FF2D2D","#FF2B2B","#FF2929","#FF2727","#FF2525","#FF2323","#FF2121","#FF1F1F","#FF1D1D","#FF1B1B","#FF1818","#FF1616","#FF1414","#FF1212","#FF1010","#FF0E0E","#FF0C0C","#FF0A0A","#FF0808","#FF0606","#FF0404","#FF0202","#FF0000"]

    # Organize scatter plot data
    q_rs = []
    q_allele = []
    q_maf = []
    p_rs = []
    p_allele = []
    p_coord = []
    p_pos = []
    p_maf = []
    dist = []
    d_prime_round = []
    r2_round = []
    corr_alleles = []
    regdb = []
    funct = []
    color = []
    alpha = []
    size = []
    p_val = []
    neg_log_p = []
    for row in out_p_sort:
        q_rs_i,q_allele_i,q_coord_i,p_rs_i,p_allele_i,p_coord_i,dist_i,d_prime_i,r2_i,corr_alleles_i,regdb_i,q_maf_i,p_maf_i,funct_i,dist_abs,p_val_i = row
        q_rs.append(q_rs_i)
        q_allele.append(q_allele_i)
        q_maf.append(str(round(float(q_maf_i),4)))
        if p_rs_i == ".":
            p_rs_i = p_coord_i
        p_rs.append(p_rs_i)
        p_allele.append(p_allele_i)
        p_coord.append(float(p_coord_i.split(":")[1])/1000000)
        p_pos.append(p_coord_i.split(":")[1])
        p_maf.append(str(round(float(p_maf_i),4)))
        dist.append(str(round(dist_i/1000000.0,4)))
        d_prime_round.append(str(round(float(d_prime_i),4)))
        r2_round.append(str(round(float(r2_i),4)))
        corr_alleles.append(corr_alleles_i)

        # P-value
        p_val.append(p_val_i)
        neg_log_p.append(-log10(p_val_i))

        # Correct Missing Annotations
        if regdb_i == ".":
            regdb_i = ""
        regdb.append(regdb_i)
        if funct_i == ".":
            funct_i = ""
        if funct_i == "NA":
            funct_i = "none"
        funct.append(funct_i)

        # Set Color: the query variant itself is blue, everything else is
        # shaded by D' or R2 depending on myargs['dprime'].
        if q_coord_i == p_coord_i:
            color_i = "#0000FF"
            alpha_i = 0.7
        else:
            if myargs['dprime'] == True:
                color_i = reds[int(d_prime_i*100.0)]
                alpha_i = 0.7
            elif myargs['dprime'] == False:
                color_i = reds[int(r2_i*100.0)]
                alpha_i = 0.7
        color.append(color_i)
        alpha.append(alpha_i)

        # Set Size
        size.append(9+float(p_maf_i)*14.0)

    # Pull out SNPs from association file not found in 1000G
    p_plot_pos = []
    p_plot_pval = []
    p_plot_pos2 = []
    p_plot_pval2 = []
    p_plot_dist = []
    # Query coordinate as left by the last row of the loop above.
    index_var_pos = float(q_coord_i.split(":")[1])/1000000
    plotted_positions = set(p_pos)
    for input_pos in a_pos:
        if input_pos not in plotted_positions:
            raw_p = float(assoc_dict[chromosome+":"+input_pos+"-"+input_pos][0])
            p_plot_pos.append(float(input_pos)/1000000)
            p_plot_pval.append(-log10(raw_p))
            p_plot_pos2.append("chr"+chromosome+":"+input_pos)
            p_plot_pval2.append(raw_p)
            p_plot_dist.append(str(round(float(input_pos)/1000000-index_var_pos,4)))

    # Begin Bokeh Plotting
    from collections import OrderedDict
    from bokeh.models import HoverTool,LinearAxis,Range1d
    from bokeh.plotting import ColumnDataSource,figure,reset_output
    from bokeh.io import export_svgs
    import svgutils.compose as sg

    reset_output()

    data_p = {'p_plot_posX': p_plot_pos, 'p_plot_pvalY': p_plot_pval, 'p_plot_pos2': p_plot_pos2, 'p_plot_pval2': p_plot_pval2, 'p_plot_dist': p_plot_dist}
    source_p = ColumnDataSource(data_p)

    # Assoc Plot
    x = p_coord
    y = neg_log_p
    y_max = max(y)

    data = {'x': x, 'y': y, 'qrs': q_rs, 'q_alle': q_allele, 'q_maf': q_maf, 'prs': p_rs, 'p_alle': p_allele, 'p_maf': p_maf, 'dist': dist, 'r': r2_round, 'd': d_prime_round, 'alleles': corr_alleles, 'regdb': regdb, 'funct': funct, 'p_val': p_val, 'size': size, 'color': color, 'alpha': alpha}
    source = ColumnDataSource(data)

    whitespace = 0.01
    xr = Range1d(start=coord1/1000000.0-whitespace, end=coord2/1000000.0+whitespace)
    yr = Range1d(start=-0.03, end=y_max*1.03)
    sup_2 = "\u00B2"

    assoc_plot = figure(
        title="P-values and Regional LD for "+snp+" in "+pop,
        min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False,
        plot_width=900,
        plot_height=600,
        x_range=xr, y_range=yr,
        tools="tap,pan,box_zoom,wheel_zoom,box_select,undo,redo,reset,previewsave", logo=None,
        toolbar_location="above")

    assoc_plot.title.align = "center"

    # Add recombination rate from LDassoc.py output file
    recomb_x = []
    recomb_y = []
    for record in read_lines(tmp_dir+"recomb_"+request+".txt"):
        _recomb_chrom, pos, rate = record.strip().split()
        recomb_x.append(int(pos)/1000000.0)
        # Rate is rescaled so that 100 maps to the top of the P-value axis.
        recomb_y.append(float(rate)/100*y_max)

    assoc_plot.line(recomb_x, recomb_y, line_width=1, color="black", alpha=0.5)

    # Add genome-wide significance
    a = [coord1/1000000.0-whitespace,coord2/1000000.0+whitespace]
    b = [-log10(0.00000005),-log10(0.00000005)]
    assoc_plot.line(a, b, color="blue", alpha=0.5)

    assoc_points_not1000G = assoc_plot.circle(x='p_plot_posX', y='p_plot_pvalY', size=9+float("0.25")*14.0, source=source_p, line_color="gray", fill_color="white")
    assoc_points = assoc_plot.circle(x='x', y='y', size='size', color='color', alpha='alpha', source=source)
    assoc_plot.add_tools(HoverTool(renderers=[assoc_points_not1000G], tooltips=OrderedDict([("Variant", "@p_plot_pos2"), ("P-value", "@p_plot_pval2"), ("Distance (Mb)", "@p_plot_dist")])))

    hover = HoverTool(renderers=[assoc_points])
    hover.tooltips = OrderedDict([
        ("Variant", "@prs @p_alle"),
        ("P-value", "@p_val"),
        ("Distance (Mb)", "@dist"),
        ("MAF", "@p_maf"),
        ("R"+sup_2+" ("+q_rs[0]+")", "@r"),
        ("D\' ("+q_rs[0]+")", "@d"),
        ("Correlated Alleles", "@alleles"),
        ("RegulomeDB", "@regdb"),
        ("Functional Class", "@funct"),
    ])

    assoc_plot.add_tools(hover)

    # Annotate RegulomeDB scores
    if myargs['annotate'] == True:
        assoc_plot.text(x, y, text=regdb, alpha=1, text_font_size="7pt", text_baseline="middle", text_align="center", angle=0)

    assoc_plot.yaxis.axis_label = "-log10 P-value"

    assoc_plot.extra_y_ranges = {"y2_axis": Range1d(start=-3, end=103)}
    assoc_plot.add_layout(LinearAxis(y_range_name="y2_axis", axis_label="Combined Recombination Rate (cM/Mb)"), "right")  ## Need to confirm units

    # Rug Plot
    y2_ll = [-0.03]*len(x)
    y2_ul = [1.03]*len(x)
    yr_rug = Range1d(start=-0.03, end=1.03)
    data_rug = {'x': x, 'y': y, 'y2_ll': y2_ll, 'y2_ul': y2_ul,'qrs': q_rs, 'q_alle': q_allele, 'q_maf': q_maf, 'prs': p_rs, 'p_alle': p_allele, 'p_maf': p_maf, 'dist': dist, 'r': r2_round, 'd': d_prime_round, 'alleles': corr_alleles, 'regdb': regdb, 'funct': funct, 'p_val': p_val, 'size': size, 'color': color, 'alpha': alpha}
    source_rug = ColumnDataSource(data_rug)

    rug = figure(
        x_range=xr, y_range=yr_rug, border_fill_color='white', y_axis_type=None,
        title="", min_border_top=2, min_border_bottom=2, min_border_left=60, min_border_right=60, h_symmetry=False, v_symmetry=False,
        plot_width=900, plot_height=50, tools="xpan,tap,wheel_zoom", logo=None)

    rug.segment(x0='x', y0='y2_ll', x1='x', y1='y2_ul', source=source_rug, color='color', alpha='alpha', line_width=1)
    rug.toolbar_location = None

    # Gene track.  Both variants stack genes into rows so that genes closer
    # than `gap` bp do not share a row.  (A "Too many genes to plot." cutoff
    # existed in earlier revisions but was already disabled.)
    gap = 80000
    tall = 0.75
    lanes = [0]
    genes_plot_start = []
    genes_plot_end = []
    genes_plot_y = []
    genes_plot_name = []
    exons_plot_x = []
    exons_plot_y = []
    exons_plot_w = []
    exons_plot_h = []
    exons_plot_name = []
    exons_plot_id = []

    if myargs['transcript'] == True:
        # Gene Plot (All Transcripts)
        # Get genes from LDassoc.py output file
        exons_plot_exon = []
        for record in read_lines(tmp_dir+"genes_"+request+".txt"):
            (_bin,name_id,_chrom,strand,txStart,txEnd,_cdsStart,_cdsEnd,_exonCount,exonStarts,exonEnds,_score,name2,_cdsStartStat,_cdsEndStat,_exonFrames) = record.strip().split()
            e_start = exonStarts.split(",")
            e_end = exonEnds.split(",")

            # Determine Y Coordinate
            y_coord = assign_row(lanes, int(txStart), int(txEnd), gap)

            genes_plot_start.append(int(txStart)/1000000.0)
            genes_plot_end.append(int(txEnd)/1000000.0)
            genes_plot_y.append(y_coord)
            genes_plot_name.append(name2+" ")

            # The last comma-separated element is not an exon, hence the -1.
            n_exons = len(e_start)-1
            for k in range(n_exons):
                # Exons are numbered along the transcript's own direction.
                exon = k+1 if strand == "+" else n_exons-k

                width = (int(e_end[k])-int(e_start[k]))/1000000.0
                x_coord = int(e_start[k])/1000000.0+(width/2)

                exons_plot_x.append(x_coord)
                exons_plot_y.append(y_coord)
                exons_plot_w.append(width)
                exons_plot_h.append(tall)
                exons_plot_name.append(name2)
                exons_plot_id.append(name_id)
                exons_plot_exon.append(exon)

        n_rows = len(lanes)
        genes_plot_yn = [n_rows-row+0.5 for row in genes_plot_y]
        exons_plot_yn = [n_rows-row+0.5 for row in exons_plot_y]

        source_gene_plot = ColumnDataSource({'exons_plot_x': exons_plot_x, 'exons_plot_yn': exons_plot_yn, 'exons_plot_w': exons_plot_w, 'exons_plot_h': exons_plot_h,'exons_plot_name': exons_plot_name, 'exons_plot_id': exons_plot_id, 'exons_plot_exon': exons_plot_exon})
        rect_columns = dict(x='exons_plot_x', y='exons_plot_yn', width='exons_plot_w', height='exons_plot_h')
        gene_tooltips = OrderedDict([
            ("Gene", "@exons_plot_name"),
            ("Transcript ID", "@exons_plot_id"),
            ("Exon", "@exons_plot_exon"),
        ])
        gene_axis_label = "Genes (All Transcripts)"

    else:
        # Gene Plot (Collapsed)
        # Get genes from LDassoc.py output file
        for record in read_lines(tmp_dir+"genes_c_"+request+".txt"):
            _chrom,txStart,txEnd,name,exonStarts,exonEnds,transcripts = record.strip().split()
            e_start = exonStarts.split(",")
            e_end = exonEnds.split(",")
            e_transcripts = transcripts.split(",")

            # Determine Y Coordinate
            y_coord = assign_row(lanes, int(txStart), int(txEnd), gap)

            genes_plot_start.append(int(txStart)/1000000.0)
            genes_plot_end.append(int(txEnd)/1000000.0)
            genes_plot_y.append(y_coord)
            genes_plot_name.append(name+" ")

            for k in range(len(e_start)):
                width = (int(e_end[k])-int(e_start[k]))/1000000.0
                x_coord = int(e_start[k])/1000000.0+(width/2)

                exons_plot_x.append(x_coord)
                exons_plot_y.append(y_coord)
                exons_plot_w.append(width)
                exons_plot_h.append(tall)
                exons_plot_name.append(name)
                exons_plot_id.append(e_transcripts[k].replace("-",","))

        n_rows = len(lanes)
        genes_plot_yn = [n_rows-row+0.5 for row in genes_plot_y]
        exons_plot_yn = [n_rows-row+0.5 for row in exons_plot_y]

        source_gene_plot = ColumnDataSource({'exons_c_plot_x': exons_plot_x, 'exons_c_plot_yn': exons_plot_yn, 'exons_c_plot_w': exons_plot_w, 'exons_c_plot_h': exons_plot_h, 'exons_c_plot_name': exons_plot_name, 'exons_c_plot_id': exons_plot_id})
        rect_columns = dict(x='exons_c_plot_x', y='exons_c_plot_yn', width='exons_c_plot_w', height='exons_c_plot_h')
        gene_tooltips = OrderedDict([
            ("Gene", "@exons_c_plot_name"),
            ("Transcript IDs", "@exons_c_plot_id"),
        ])
        gene_axis_label = "Genes (Transcripts Collapsed)"

    # Track height grows by 50 px per row beyond the second.
    if len(lanes) < 3:
        plot_h_pix = 150
    else:
        plot_h_pix = 150 + (len(lanes) - 2) * 50

    gene_plot = figure(min_border_top=2, min_border_bottom=0, min_border_left=100, min_border_right=5,
                       x_range=xr, y_range=Range1d(start=0, end=n_rows), border_fill_color='white',
                       title="", h_symmetry=False, v_symmetry=False, logo=None,
                       plot_width=900, plot_height=plot_h_pix, tools="hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

    gene_plot.segment(genes_plot_start, genes_plot_yn, genes_plot_end, genes_plot_yn, color="black", alpha=1, line_width=2)
    gene_plot.rect(source=source_gene_plot, fill_color="grey", line_color="grey", **rect_columns)
    gene_plot.text(genes_plot_start, genes_plot_yn, text=genes_plot_name, alpha=1, text_font_size="7pt",
                   text_font_style="bold", text_baseline="middle", text_align="right", angle=0)
    hover = gene_plot.select(dict(type=HoverTool))
    hover.tooltips = gene_tooltips

    gene_plot.xaxis.axis_label = "Chromosome " + chromosome + " Coordinate (Mb)(GRCh37)"
    gene_plot.yaxis.axis_label = gene_axis_label
    gene_plot.ygrid.grid_line_color = None
    gene_plot.yaxis.axis_line_color = None
    gene_plot.yaxis.minor_tick_line_color = None
    gene_plot.yaxis.major_tick_line_color = None
    gene_plot.yaxis.major_label_text_color = None

    gene_plot.toolbar_location = "below"

    assoc_part_svg = tmp_dir + "assoc_plot_1_" + request + ".svg"
    gene_part_svg = tmp_dir + "gene_plot_1_" + request + ".svg"
    output_base = tmp_dir + "assoc_plot_" + request
    combined_svg = output_base + ".svg"
    scaled_svg = tmp_dir + "assoc_plot_scaled_" + request + ".svg"

    # Change output backend to SVG temporarily for headless export
    # (the rug figure is switched as well but is not exported here)
    assoc_plot.output_backend = "svg"
    rug.output_backend = "svg"
    gene_plot.output_backend = "svg"
    export_svgs(assoc_plot, filename=assoc_part_svg)
    export_svgs(gene_plot, filename=gene_part_svg)

    # 1 pixel = 0.0264583333 cm
    svg_height = str(20.00 + (0.0264583333 * plot_h_pix)) + "cm"
    svg_height_scaled = str(100.00 + (0.1322916665 * plot_h_pix)) + "cm"

    # Concatenate svgs
    sg.Figure("24.59cm", svg_height,
              sg.SVG(assoc_part_svg),
              sg.SVG(gene_part_svg).move(-40, 630)
              ).save(combined_svg)

    # 5x copy used as the source for the raster formats
    sg.Figure("122.95cm", svg_height_scaled,
              sg.SVG(assoc_part_svg).scale(5),
              sg.SVG(gene_part_svg).scale(5).move(-200, 3150)
              ).save(scaled_svg)

    # Export to PDF
    subprocess.call("phantomjs ./rasterize.js " + combined_svg + " " + output_base + ".pdf", shell=True)
    # Export to PNG
    subprocess.call("phantomjs ./rasterize.js " + scaled_svg + " " + output_base + ".png", shell=True)
    # Export to JPEG
    subprocess.call("phantomjs ./rasterize.js " + scaled_svg + " " + output_base + ".jpeg", shell=True)
    # Remove individual SVG files after they are combined
    subprocess.call("rm " + assoc_part_svg, shell=True)
    subprocess.call("rm " + gene_part_svg, shell=True)
    # Remove scaled SVG file after it is converted to png and jpeg
    subprocess.call("rm " + scaled_svg, shell=True)

    reset_output()

    # Remove temporary files
    discard_ld_inputs()
    subprocess.call("rm "+tmp_dir+"genes_*"+request+"*.txt", shell=True)
    subprocess.call("rm "+tmp_dir+"recomb_"+request+".txt", shell=True)
    subprocess.call("rm "+tmp_dir+"assoc_args"+request+".json", shell=True)

    print("Bokeh high quality image export complete!")

    # Return plot output
    return None
def calculate_proxy_svg(snp, pop, request, r2_d="r2"):
    """Export the LDproxy scatter/gene plot for a query variant as image files.

    The query variant is looked up in the SQLite SNP database, its phased
    genotypes are extracted from the per-chromosome VCF with ``tabix``, proxy
    LD statistics are computed by four ``LDproxy_sub.py`` subprocesses, and
    the resulting Bokeh figures are combined with svgutils and written to
    ``./tmp/`` as ``proxy_plot_<request>.svg``; ``phantomjs ./rasterize.js``
    then produces the matching ``.pdf``, ``.png`` and ``.jpeg`` files.
    Intermediate files in ``./tmp/`` are removed before returning.

    Parameters
    ----------
    snp : str
        RS number of the query variant.
    pop : str
        ``+``-separated population codes. Codes that are not in the built-in
        list of known populations are ignored.
    request : str or False
        Token embedded in every temporary/output file name. ``False`` selects
        a token derived from the current time. NOTE(review): the token is
        interpolated into shell commands, so it must come from a trusted
        source.
    r2_d : {"r2", "d"}, optional
        LD statistic plotted on the y axis and used for sorting; any other
        value falls back to ``"r2"``.

    Returns
    -------
    None
        Always. ``None`` is also returned early (after removing the files
        created so far) when the variant is not found in the database, no
        valid population is given, the variant is absent from the VCF, is
        multi-allelic, or is monoallelic in the selected populations.
    """
    # Set data directories using config.yml
    with open('config.yml', 'r') as f:
        # safe_load: the config is plain data, and yaml.load() without an
        # explicit Loader raises on PyYAML >= 6.
        config = yaml.safe_load(f)
    gene_dir = config['data']['gene_dir']
    recomb_dir = config['data']['recomb_dir']
    snp_dir = config['data']['snp_dir']
    pop_dir = config['data']['pop_dir']
    vcf_dir = config['data']['vcf_dir']

    tmp_dir = "./tmp/"

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    if request is False:
        request = str(time.strftime("%I%M%S"))

    def remove_tmp(pattern):
        # "rm" runs through the shell so that glob patterns are expanded.
        subprocess.call("rm " + tmp_dir + pattern, shell=True)

    def remove_query_files():
        # Files that exist once the query variant has been extracted.
        remove_tmp("pops_" + request + ".txt")
        remove_tmp("*" + request + "*.vcf")

    # Find coordinates (GRCh37/hg19) for SNP RS number
    # Connect to snp database
    conn = sqlite3.connect(snp_dir)
    conn.text_factory = str
    cur = conn.cursor()

    def get_coords(rs):
        rs_id = rs.strip("rs")
        # The last character selects the table and is concatenated into the
        # SQL text, so refuse anything that is not a plain alphanumeric.
        if not rs_id or not rs_id[-1].isalnum():
            return None
        cur.execute("SELECT * FROM tbl_" + rs_id[-1] + " WHERE id=?",
                    (rs_id, ))
        return cur.fetchone()

    # Find RS number in snp database; always release the connection
    try:
        snp_coord = get_coords(snp)
    finally:
        cur.close()
        conn.close()

    # Unknown variant: nothing to plot (previously a TypeError further down)
    if snp_coord is None:
        return None

    # Select desired ancestral populations
    known_pops = {
        "ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX",
        "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS",
        "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU",
        "TSI", "YRI"
    }
    pop_dirs = [
        pop_dir + pop_i + ".txt" for pop_i in pop.split("+")
        if pop_i in known_pops
    ]
    # Without any valid population "cat > file" would read from stdin.
    if not pop_dirs:
        return None

    pops_file = tmp_dir + "pops_" + request + ".txt"
    get_pops = "cat " + " ".join(pop_dirs) + " > " + pops_file
    subprocess.call(get_pops, shell=True)

    # Get population ids (a set: only used for membership tests below)
    with open(pops_file) as fh:
        pop_ids = set(line.strip() for line in fh)

    # Extract query SNP phased genotypes
    vcf_file = vcf_dir + snp_coord[1] + \
        ".phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.vcf.gz"

    tabix_snp_h = "tabix -H {0} | grep CHROM".format(vcf_file)
    # universal_newlines=True yields text on both Python 2 and 3, so the
    # sample names can be compared with the (text) population ids.
    proc_h = subprocess.Popen(tabix_snp_h,
                              shell=True,
                              stdout=subprocess.PIPE,
                              universal_newlines=True)
    head = proc_h.communicate()[0].splitlines()[0].strip().split()

    snp_vcf = tmp_dir + "snp_no_dups_" + request + ".vcf"
    tabix_snp = "tabix {0} {1}:{2}-{2} | grep -v -e END > {3}".format(
        vcf_file, snp_coord[1], snp_coord[2], snp_vcf)
    subprocess.call(tabix_snp, shell=True)

    # Check SNP is in the 1000G population, has the correct RS number, and not
    # monoallelic
    with open(snp_vcf) as fh:
        vcf = fh.readlines()

    if not vcf:
        remove_query_files()
        return None
    elif len(vcf) > 1:
        # Several records at this position: keep the (last) one whose ID
        # column matches the query RS number.
        geno = []
        for record in vcf:
            fields = record.strip().split()
            if fields[2] == snp:
                geno = fields
        if not geno:
            remove_query_files()
            return None
    else:
        geno = vcf[0].strip().split()

    # A single record with a different ID replaces the query name
    if geno[2] != snp:
        snp = geno[2]

    # Multi-allelic REF/ALT is not supported
    if "," in geno[3] or "," in geno[4]:
        remove_query_files()
        return None

    # Sample columns start at index 9 of the VCF header line
    index = [i for i in range(9, len(head)) if head[i] in pop_ids]

    genotypes = {"0": 0, "1": 0}
    for i in index:
        for allele in geno[i].split("|"):
            genotypes[allele] = genotypes.get(allele, 0) + 1

    if genotypes["0"] == 0 or genotypes["1"] == 0:
        remove_query_files()
        return None

    # Define window of interest around query SNP
    window = 500000
    coord1 = int(snp_coord[2]) - window
    if coord1 < 0:
        coord1 = 0
    coord2 = int(snp_coord[2]) + window

    # Calculate proxy LD statistics in parallel
    threads = 4
    # Integer division keeps the coordinates integral on Python 3 as well.
    block = (2 * window) // threads
    commands = []
    for i in range(threads):
        # First chunk starts at coord1, last chunk ends at coord2; interior
        # boundaries are block-aligned and do not overlap.
        chunk_start = coord1 if i == 0 else coord1 + (block * i) + 1
        chunk_end = coord2 if i == threads - 1 else coord1 + (block * (i + 1))
        commands.append("python LDproxy_sub.py " + snp + " " + snp_coord[1] +
                        " " + str(chunk_start) + " " + str(chunk_end) + " " +
                        request + " " + str(i))

    processes = [
        subprocess.Popen(command,
                         shell=True,
                         stdout=subprocess.PIPE,
                         universal_newlines=True) for command in commands
    ]

    # collect output in parallel
    def get_output(process):
        return process.communicate()[0].splitlines()

    if not hasattr(threading.current_thread(), "_children"):
        threading.current_thread()._children = weakref.WeakKeyDictionary()

    pool = Pool(len(processes))
    out_raw = pool.map(get_output, processes)
    pool.close()
    pool.join()

    # Aggregate output
    out_prox = []
    for chunk_lines in out_raw:
        for out_line in chunk_lines:
            col = out_line.strip().split("\t")
            col[6] = int(col[6])
            col[7] = float(col[7])
            col[8] = float(col[8])
            col.append(abs(int(col[6])))  # absolute distance, column 14
            out_prox.append(col)

    # Sort output: by LD statistic (descending), ties by absolute distance
    if r2_d not in ["r2", "d"]:
        r2_d = "r2"

    out_dist_sort = sorted(out_prox, key=operator.itemgetter(14))
    if r2_d == "r2":
        out_ld_sort = sorted(out_dist_sort,
                             key=operator.itemgetter(8),
                             reverse=True)
    else:
        out_ld_sort = sorted(out_dist_sort,
                             key=operator.itemgetter(7),
                             reverse=True)

    # Organize scatter plot data
    q_rs = []
    q_allele = []
    q_coord = []
    q_maf = []
    p_rs = []
    p_allele = []
    p_coord = []
    p_maf = []
    dist = []
    d_prime = []
    d_prime_round = []
    r2 = []
    r2_round = []
    corr_alleles = []
    regdb = []
    funct = []
    color = []
    size = []
    for i, row in enumerate(out_ld_sort):
        (q_rs_i, q_allele_i, q_coord_i, p_rs_i, p_allele_i, p_coord_i, dist_i,
         d_prime_i, r2_i, corr_alleles_i, regdb_i, q_maf_i, p_maf_i, funct_i,
         dist_abs) = row

        if float(r2_i) > 0.01:
            q_rs.append(q_rs_i)
            q_allele.append(q_allele_i)
            q_coord.append(float(q_coord_i.split(":")[1]) / 1000000)
            q_maf.append(str(round(float(q_maf_i), 4)))
            if p_rs_i == ".":
                p_rs_i = p_coord_i
            p_rs.append(p_rs_i)
            p_allele.append(p_allele_i)
            p_coord.append(float(p_coord_i.split(":")[1]) / 1000000)
            p_maf.append(str(round(float(p_maf_i), 4)))
            dist.append(str(round(dist_i / 1000000.0, 4)))
            d_prime.append(float(d_prime_i))
            d_prime_round.append(str(round(float(d_prime_i), 4)))
            r2.append(float(r2_i))
            r2_round.append(str(round(float(r2_i), 4)))
            corr_alleles.append(corr_alleles_i)

            # Correct Missing Annotations
            if regdb_i == ".":
                regdb_i = ""
            regdb.append(regdb_i)
            if funct_i == ".":
                funct_i = ""
            if funct_i == "NA":
                funct_i = "none"
            funct.append(funct_i)

            # Set Color: first sorted row is blue, annotated rows are red
            if i == 0:
                color_i = "blue"
            elif funct_i != "none" and funct_i != "":
                color_i = "red"
            else:
                color_i = "orange"
            color.append(color_i)

            # Set Size
            size.append(9 + float(p_maf_i) * 14.0)

    # Begin Bokeh Plotting
    from collections import OrderedDict
    from bokeh.embed import components, file_html
    from bokeh.layouts import gridplot
    from bokeh.models import HoverTool, LinearAxis, Range1d
    from bokeh.plotting import ColumnDataSource, curdoc, figure, output_file, reset_output, save
    from bokeh.resources import CDN
    from bokeh.io import export_svgs
    import svgutils.compose as sg

    reset_output()

    # Proxy Plot
    x = p_coord
    if r2_d == "r2":
        y = r2
    else:
        y = d_prime
    whitespace = 0.01
    xr = Range1d(start=coord1 / 1000000.0 - whitespace,
                 end=coord2 / 1000000.0 + whitespace)
    yr = Range1d(start=-0.03, end=1.03)
    sup_2 = u"\u00B2"

    proxy_plot = figure(
        title="Proxies for " + snp + " in " + pop,
        min_border_top=2,
        min_border_bottom=2,
        min_border_left=60,
        min_border_right=60,
        h_symmetry=False,
        v_symmetry=False,
        plot_width=900,
        plot_height=600,
        x_range=xr,
        y_range=yr,
        tools="hover,tap,pan,box_zoom,box_select,undo,redo,reset,previewsave",
        logo=None,
        toolbar_location="above")

    proxy_plot.title.align = "center"

    # NOTE(review): "whitespace" is the plot margin in Mb but is applied to
    # base-pair coordinates here, producing fractional region bounds; kept
    # unchanged to preserve the existing query.
    recomb_file = tmp_dir + "recomb_" + request + ".txt"
    tabix_recomb = "tabix -fh {0} {1}:{2}-{3} > {4}".format(
        recomb_dir, snp_coord[1], coord1 - whitespace, coord2 + whitespace,
        recomb_file)
    subprocess.call(tabix_recomb, shell=True)
    with open(recomb_file) as fh:
        recomb_raw = fh.readlines()

    recomb_x = []
    recomb_y = []
    for recomb_line in recomb_raw:
        _chrom, pos, rate = recomb_line.strip().split()
        recomb_x.append(int(pos) / 1000000.0)
        recomb_y.append(float(rate) / 100.0)

    data = {
        'x': x,
        'y': y,
        'qrs': q_rs,
        'q_alle': q_allele,
        'q_maf': q_maf,
        'prs': p_rs,
        'p_alle': p_allele,
        'p_maf': p_maf,
        'dist': dist,
        'r': r2_round,
        'd': d_prime_round,
        'alleles': corr_alleles,
        'regdb': regdb,
        'funct': funct,
        'size': size,
        'color': color
    }
    source = ColumnDataSource(data)

    proxy_plot.line(recomb_x, recomb_y, line_width=1, color="black", alpha=0.5)

    proxy_plot.circle(x='x',
                      y='y',
                      size='size',
                      color='color',
                      alpha=0.5,
                      source=source)

    hover = proxy_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Query Variant", "@qrs @q_alle"),
        ("Proxy Variant", "@prs @p_alle"),
        ("Distance (Mb)", "@dist"),
        ("MAF (Query,Proxy)", "@q_maf,@p_maf"),
        ("R" + sup_2, "@r"),
        ("D\'", "@d"),
        ("Correlated Alleles", "@alleles"),
        ("RegulomeDB", "@regdb"),
        ("Functional Class", "@funct"),
    ])

    proxy_plot.text(x,
                    y,
                    text=regdb,
                    alpha=1,
                    text_font_size="7pt",
                    text_baseline="middle",
                    text_align="center",
                    angle=0)

    if r2_d == "r2":
        proxy_plot.yaxis.axis_label = "R" + sup_2
    else:
        proxy_plot.yaxis.axis_label = "D\'"

    proxy_plot.extra_y_ranges = {"y2_axis": Range1d(start=-3, end=103)}
    proxy_plot.add_layout(
        LinearAxis(y_range_name="y2_axis",
                   axis_label="Combined Recombination Rate (cM/Mb)"), "right")

    # Rug Plot (same columns as the scatter data plus the segment extents)
    yr_rug = Range1d(start=-0.03, end=1.03)
    data_rug = dict(data)
    data_rug['y2_ll'] = [-0.03] * len(x)
    data_rug['y2_ul'] = [1.03] * len(x)
    source_rug = ColumnDataSource(data_rug)

    rug = figure(x_range=xr,
                 y_range=yr_rug,
                 border_fill_color='white',
                 y_axis_type=None,
                 title="",
                 min_border_top=2,
                 min_border_bottom=2,
                 min_border_left=60,
                 min_border_right=60,
                 h_symmetry=False,
                 v_symmetry=False,
                 plot_width=900,
                 plot_height=50,
                 tools="xpan,tap",
                 logo=None)

    rug.segment(x0='x',
                y0='y2_ll',
                x1='x',
                y1='y2_ul',
                source=source_rug,
                color='color',
                alpha=0.5,
                line_width=1)
    rug.toolbar_location = None

    # Gene Plot
    genes_file = tmp_dir + "genes_" + request + ".txt"
    tabix_gene = "tabix -fh {0} {1}:{2}-{3} > {4}".format(
        gene_dir, snp_coord[1], coord1, coord2, genes_file)
    subprocess.call(tabix_gene, shell=True)
    with open(genes_file) as fh:
        genes_raw = fh.readlines()

    genes_plot_start = []
    genes_plot_end = []
    genes_plot_y = []
    genes_plot_name = []
    exons_plot_x = []
    exons_plot_y = []
    exons_plot_w = []
    exons_plot_h = []
    exons_plot_name = []
    exons_plot_id = []
    exons_plot_exon = []
    # row_ends[k] is the txEnd of the last gene placed on display row k + 1
    row_ends = [0]
    gap = 80000
    tall = 0.75
    for gene_line in genes_raw:
        (bin_, name_id, chrom, strand, txStart, txEnd, cdsStart, cdsEnd,
         exonCount, exonStarts, exonEnds, score, name2, cdsStartStat,
         cdsEndStat, exonFrames) = gene_line.strip().split()
        e_start = exonStarts.split(",")
        e_end = exonEnds.split(",")

        # Determine Y Coordinate: first row whose last gene ends more than
        # "gap" before this gene starts, otherwise open a new row.
        row = 0
        y_coord = None
        while y_coord is None:
            if row > len(row_ends) - 1:
                y_coord = row + 1
                row_ends.append(int(txEnd))
            elif int(txStart) > (gap + row_ends[row]):
                y_coord = row + 1
                row_ends[row] = int(txEnd)
            else:
                row += 1

        genes_plot_start.append(int(txStart) / 1000000.0)
        genes_plot_end.append(int(txEnd) / 1000000.0)
        genes_plot_y.append(y_coord)
        genes_plot_name.append(name2 + " ")

        # The last element of the split exon list is skipped
        # (presumably an empty string after a trailing comma - verify).
        n_exons = len(e_start) - 1
        for k in range(n_exons):
            # Exons are numbered in transcription direction
            exon = k + 1 if strand == "+" else n_exons - k

            width = (int(e_end[k]) - int(e_start[k])) / 1000000.0
            x_coord = int(e_start[k]) / 1000000.0 + (width / 2)

            exons_plot_x.append(x_coord)
            exons_plot_y.append(y_coord)
            exons_plot_w.append(width)
            exons_plot_h.append(tall)
            exons_plot_name.append(name2)
            exons_plot_id.append(name_id)
            exons_plot_exon.append(exon)

    n_rows = len(row_ends)
    genes_plot_yn = [n_rows - row_no + 0.5 for row_no in genes_plot_y]
    exons_plot_yn = [n_rows - row_no + 0.5 for row_no in exons_plot_y]
    yr2 = Range1d(start=0, end=n_rows)

    data_gene_plot = {
        'exons_plot_x': exons_plot_x,
        'exons_plot_yn': exons_plot_yn,
        'exons_plot_w': exons_plot_w,
        'exons_plot_h': exons_plot_h,
        'exons_plot_name': exons_plot_name,
        'exons_plot_id': exons_plot_id,
        'exons_plot_exon': exons_plot_exon
    }
    source_gene_plot = ColumnDataSource(data_gene_plot)

    # 150 px for up to two gene rows, 50 px for every additional row
    if n_rows < 3:
        plot_h_pix = 150
    else:
        plot_h_pix = 150 + (n_rows - 2) * 50

    gene_plot = figure(
        x_range=xr,
        y_range=yr2,
        border_fill_color='white',
        title="",
        min_border_top=2,
        min_border_bottom=2,
        min_border_left=60,
        min_border_right=60,
        h_symmetry=False,
        v_symmetry=False,
        plot_width=900,
        plot_height=plot_h_pix,
        tools="hover,tap,xpan,box_zoom,undo,redo,reset,previewsave",
        logo=None)

    gene_plot.segment(genes_plot_start,
                      genes_plot_yn,
                      genes_plot_end,
                      genes_plot_yn,
                      color="black",
                      alpha=1,
                      line_width=2)

    gene_plot.rect(x='exons_plot_x',
                   y='exons_plot_yn',
                   width='exons_plot_w',
                   height='exons_plot_h',
                   source=source_gene_plot,
                   fill_color="grey",
                   line_color="grey")
    gene_plot.xaxis.axis_label = "Chromosome " + \
        snp_coord[1] + " Coordinate (Mb)(GRCh37)"
    gene_plot.yaxis.axis_label = "Genes"
    gene_plot.ygrid.grid_line_color = None
    gene_plot.yaxis.axis_line_color = None
    gene_plot.yaxis.minor_tick_line_color = None
    gene_plot.yaxis.major_tick_line_color = None
    gene_plot.yaxis.major_label_text_color = None

    hover = gene_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Gene", "@exons_plot_name"),
        ("ID", "@exons_plot_id"),
        ("Exon", "@exons_plot_exon"),
    ])

    gene_plot.text(genes_plot_start,
                   genes_plot_yn,
                   text=genes_plot_name,
                   alpha=1,
                   text_font_size="7pt",
                   text_font_style="bold",
                   text_baseline="middle",
                   text_align="right",
                   angle=0)

    gene_plot.toolbar_location = "below"

    # Change output backend to SVG temporarily for headless export
    # Will be changed back to canvas in LDlink.js
    proxy_plot.output_backend = "svg"
    rug.output_backend = "svg"
    gene_plot.output_backend = "svg"
    export_svgs(proxy_plot,
                filename=tmp_dir + "proxy_plot_1_" + request + ".svg")
    export_svgs(gene_plot,
                filename=tmp_dir + "gene_plot_1_" + request + ".svg")

    # Concatenate svgs
    sg.Figure(
        "24.59cm", "27.94cm",
        sg.SVG(tmp_dir + "proxy_plot_1_" + request + ".svg"),
        sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").move(
            0, 630)).save(tmp_dir + "proxy_plot_" + request + ".svg")

    # 5x scaled copy used as the source for the raster exports
    sg.Figure(
        "122.95cm", "139.70cm",
        sg.SVG(tmp_dir + "proxy_plot_1_" + request + ".svg").scale(5),
        sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(5).move(
            0, 3150)).save(tmp_dir + "proxy_plot_scaled_" + request + ".svg")

    # Export to PDF
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "proxy_plot_" +
                    request + ".svg " + tmp_dir + "proxy_plot_" + request +
                    ".pdf",
                    shell=True)
    # Export to PNG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                    "proxy_plot_scaled_" + request + ".svg " + tmp_dir +
                    "proxy_plot_" + request + ".png",
                    shell=True)
    # Export to JPEG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                    "proxy_plot_scaled_" + request + ".svg " + tmp_dir +
                    "proxy_plot_" + request + ".jpeg",
                    shell=True)

    # Remove individual SVG files after they are combined
    remove_tmp("proxy_plot_1_" + request + ".svg")
    remove_tmp("gene_plot_1_" + request + ".svg")
    # Remove scaled SVG file after it is converted to png and jpeg
    remove_tmp("proxy_plot_scaled_" + request + ".svg")

    reset_output()

    # Remove temporary files
    remove_query_files()
    remove_tmp("genes_" + request + ".txt")
    remove_tmp("recomb_" + request + ".txt")

    # Return plot output
    return None
def composite(
        fig_spec: FigureSpec,
        memoize_panels: bool = False,
        recompute_panels: bool = True,
        delete_png: bool = True,
) -> None:
    """
    Assemble all panels of a FigureSpec into one SVG and optionally rasterize it.

    Parameters
    ----------
    fig_spec : FigureSpec
    memoize_panels : bool
    recompute_panels : bool
    delete_png : bool
        See the pubfig.compositor decorator for a description of the parameters.

    Returns
    -------
    None
    """
    import tempfile

    # Normalise the output path: Path object, ".svg" suffix, "~" expanded.
    requested = fig_spec.output_file
    svg_path = Path(requested) if isinstance(requested, str) else requested
    assert not svg_path.is_dir(), "The output file name you provided is a directory"
    if svg_path.suffix != ".svg":
        svg_path = svg_path.with_suffix(".svg")
    svg_path = svg_path.expanduser()

    out_dir = svg_path.parent
    if not out_dir.exists():
        out_dir.mkdir(parents=True, exist_ok=True)

    # Panel content goes next to the figure when memoizing, else to the
    # system temp directory.
    if not memoize_panels:
        panels_path = Path(tempfile.gettempdir())
    else:
        panels_path = out_dir / ".panels"
        if not panels_path.exists():
            panels_path.mkdir()

    panels = []
    if fig_spec.plot_grid_every > 0:
        grid = _generate_grid(fig_spec.figure_size, fig_spec.plot_grid_every, font_size=8)
        panels.append(grid)

    auto_label = fig_spec.auto_label_options
    label_generator = auto_label.label_generator(auto_label.first_char.text)

    for name in fig_spec.panels._fields:
        panel = getattr(fig_spec.panels, name)
        assert isinstance(panel, Panel)
        assert isinstance(panel.fig, (plt.Figure, VectorImage, RasterImage))

        offset_units = panel.location.units or fig_spec.figure_size.units
        content_offset = _location_to_str(offset_units, panel.content_offset)

        # Main content of the panel, depending on what it wraps.
        if isinstance(panel, PanelFig):
            svg = _get_panel_content(panels_path, panel, name, memoize_panels, recompute_panels)
            content = svg.move(*content_offset)
        elif isinstance(panel.fig, VectorImage):
            scale = panel.scale or panel.fig.scale
            print(f"Scaling vector image {panel.fig.file.absolute()} by {scale:.3f}")
            content = panel.fig.svg.scale(scale).move(*content_offset)
        elif isinstance(panel.fig, RasterImage):
            img_size = panel.fig.img_size
            scale = panel.scale or 1.
            img = sc.Image(
                img_size.units.to_px(img_size.width),
                img_size.units.to_px(img_size.height),
                f"{panel.fig.file}",
            )
            content = img.scale(scale).move(*content_offset)
        else:
            raise TypeError(f"Unknown type of panel content {type(panel.fig)} for panel {name}")
        panel_elements = [content]

        # Free-text annotations, positioned relative to the panel content.
        if panel.text is not None:
            for spec in panel.text:
                text_pos = _location_to_str(
                    fig_spec.figure_size.units,
                    Location(spec.x, spec.y, panel.location.units),
                )
                text_el = sc.Text(spec.text, *text_pos, **spec.kwargs)
                # rotate doesn't return the Text element, so keep our own
                # reference instead of chaining into the list.
                text_el.move(*content_offset).rotate(spec.angle)
                panel_elements.append(text_el)

        # Automatic panel letter taken from the label generator.
        if panel.auto_label:
            label_pos = _location_to_str(
                fig_spec.figure_size.units,
                Location(auto_label.first_char.x, auto_label.first_char.y),
            )
            panel_elements.append(
                sc.Text(next(label_generator), *label_pos, **auto_label.first_char.kwargs)
            )

        location = _location_to_str(fig_spec.figure_size.units, panel.location)
        panels.append(sc.Panel(*panel_elements).move(*location))

    fs = fig_spec.figure_size
    width_px = f"{fs.units.to_px(fs.width):.2f}px"
    height_px = f"{fs.units.to_px(fs.height):.2f}px"
    sc.Figure(width_px, height_px, *panels).save(svg_path)

    if fig_spec.generate_image == ImageType.none:
        return

    # Rasterization follows this shell recipe (SVG -> PNG -> PLOS-ready TIFF):
    #
    #   inkscape --without-gui --export-png="$BN.png" --export-dpi 400 $i
    #   convert -compress LZW -alpha remove $BN.png $BN.tiff
    #   mogrify -alpha off $BN.tiff
    #   rm $BN.png
    #
    # Requires Inkscape and ImageMagick 6.8 (doesn't work with 6.6.9).
    basename = f"{svg_path}"[:-4]  # strip ".svg"
    image_name = f"{basename}.png"
    _run(f"inkscape --without-gui --export-png='{image_name}' --export-dpi {fig_spec.image_dpi} {svg_path}")

    if fig_spec.generate_image == ImageType.tiff:
        tiff_name = f"{basename}.tiff"
        _run(f"convert -compress LZW -alpha remove {image_name} {tiff_name}")
        _run(f"mogrify -alpha off {tiff_name}")
        if delete_png:
            _run(f"rm {image_name}")
        image_name = tiff_name

    # Open the final image in the "eog" viewer.
    _run(f"eog {image_name}")
def main(self):
    """Compose the substrate-scope figure and write it to SVG files.

    The molecule is drawn via ``self.draw_smiles()``. For every dict entry in
    ``self.plots`` a radial plot (plus colorbar) is produced with
    ``self.plot_figure_and_colorbar`` and placed at the draw coordinates of
    the atom given by its ``attach_atom_id``. The composition is saved as
    ``substrate_scope.svg``. Group labels that start with ``~`` are swapped
    for ``~<n>`` placeholders, which are afterwards substituted through
    ``self.replace_label_with_smiles``; the result is written to
    ``substrate_scope_replaced.svg`` in the current working directory.

    Only the colorbar of the first plot is shown.

    NOTE: the ``value_groups`` list of each plot in ``self.plots`` is
    modified in place (``~...`` entries become placeholders).
    """
    settings = self.settings
    settings['SMILESSTRING']  # fail early if the SMILES setting is missing
    resulting_plots = []
    pRList = []
    mol_svg, d2d, dm = self.draw_smiles()
    replace_index = []

    for scope_plot in self.plots:
        if not isinstance(scope_plot, dict):
            continue
        no_wedges = scope_plot['no_wedges']

        # The first wedge is a transparent filler covering the part of the
        # circle that is not used; the remaining wedges share the cover angle.
        sizes = [360 - scope_plot['coverangle_wedges']] + \
            [scope_plot['coverangle_wedges'] / no_wedges] * no_wedges
        label_inner_circle = [''] * (no_wedges + 1)
        label_outer_circle = [''] * (no_wedges + 1)

        if (len(scope_plot['value_inner_circle']) != no_wedges
                or len(scope_plot['value_outer_circle']) != no_wedges):
            print('not equal')
        value_inner_circle = scope_plot['value_inner_circle']
        value_outer_circle = scope_plot['value_outer_circle']
        rounding_boundary = scope_plot['rounding_boundary']
        value_groups = scope_plot['value_groups']

        # Labels: values at or above the rounding boundary get a ">" prefix
        # when rounding is enabled.
        for i in range(no_wedges):
            for values, labels in ((value_inner_circle, label_inner_circle),
                                   (value_outer_circle, label_outer_circle)):
                text = str(values[i])
                if scope_plot['rounding'] and values[i] >= rounding_boundary:
                    text = ">" + text
                labels[i + 1] = text

        # Replace "~<smiles>" group labels by numbered placeholders and
        # remember which SMILES belongs to which placeholder.
        j = 0
        for i, item in enumerate(value_groups):
            if item[:1] == '~':  # slice: safe for empty labels
                placeholder = '~' + str(j)
                replace_index.append((placeholder, item[1:]))
                value_groups[i] = placeholder
                j = j + 1

        vals = [
            sizes,  # size of the wedges, the first wedge is transparent and will not be shown
            [0] + value_inner_circle,  # colormap values for the inner circle; first entry is for the transparent wedge and should stay 0
            [0] + value_outer_circle,  # colormap values for the outer circle; first entry is for the transparent wedge and should stay 0
            label_inner_circle,  # labels for the inner circle
            label_outer_circle,  # labels for the outer circle
            [""] + value_groups,  # groups
        ]
        resulting_plots.append(self.plot_figure_and_colorbar(scope_plot, vals))

        # get the atom id from the settings and save its position
        rIdx = scope_plot['attach_atom_id']
        pRList.append(
            d2d.GetDrawCoords(
                Geometry.Point2D(dm.GetConformer().GetAtomPosition(rIdx))))

    # take colorbar from first plot  # TODO: extension to multiple colorbars
    colorbar = compose.Panel(
        strSVG(resulting_plots[0][1]).scale(0.8).move(-350, 400))

    # Shift each plot so that it sits on its attachment atom
    panels = [
        strSVG(plot[0]).move(-369, -358).scale(1).move(pRList[i].x,
                                                         pRList[i].y)
        for i, plot in enumerate(resulting_plots)
    ]

    compose.Figure(
        "600",
        "600",
        compose.Panel(strSVG(mol_svg).scale(1).move(0, 0)),
        colorbar,
        *panels).move(350, 100).scale(
            settings['scalefactor']).save("substrate_scope.svg")

    new_svg = SVG('substrate_scope.svg')._data
    for search_index, smiles in replace_index:
        new_svg = self.replace_label_with_smiles(svg_file=new_svg,
                                                 smiles=smiles,
                                                 search_index=search_index)

    if settings['use_bold_font']:
        # str.replace returns a new string; the result must be kept,
        # otherwise the bold setting has no effect.
        new_svg = new_svg.replace('font-weight:normal', 'font-weight:bold')

    with open("substrate_scope_replaced.svg", "w") as f:
        f.write(new_svg)
    print('File written to:', os.getcwd() + '/substrate_scope_replaced.svg')
def put_list_of_figs_to_svg_fig(
        FIGS,
        fig_name="fig.svg",
        initial_guess=True,
        visualize=False,
        export_as_png=False,
        Props=None,
        figsize=None,
        fontsize=9,
        SCALING_FACTOR=1.34,  # needed to get the right cm size ...
        with_top_left_letter=False,
        transparent=True):
    """Arrange a list of figures as panels of one multi-panel SVG file.

    Each entry of ``FIGS`` is either a path to an existing SVG file (str) or
    a figure object, which is first saved as ``<i>.svg`` in the system temp
    directory. Without ``Props`` the panels are laid out on a grid with three
    columns and labelled with consecutive upper-case letters. With ``Props``
    the positions come from ``Props['XCOORD']`` / ``Props['YCOORD']``, and
    optionally ``'LABELS'`` and ``'XCOORD_LABELS'`` / ``'YCOORD_LABELS'``.
    The result is saved to ``fig_name`` and, if ``visualize`` is set, opened
    with the ``open`` command.

    ``initial_guess``, ``export_as_png``, ``figsize`` and
    ``with_top_left_letter`` are accepted but not used.
    """
    n_figs = len(FIGS)
    letters = list(string.ascii_uppercase)[:n_figs]

    # Figure sizes in inches; SVG files given by path count as 1 x 1.
    sizes = [[1., 1.] if type(fig) == str else fig.get_size_inches()
             for fig in FIGS]
    width = np.max([wh[0] for wh in sizes])
    height = np.max([wh[1] for wh in sizes])

    if Props is not None:
        XCOORD, YCOORD = Props['XCOORD'], Props['YCOORD']
        LABELS = Props['LABELS'] if 'LABELS' in Props else ['' for x in XCOORD]
        if 'XCOORD_LABELS' in Props:
            XCOORD_LABELS = Props['XCOORD_LABELS']
            YCOORD_LABELS = Props['YCOORD_LABELS']
        else:
            XCOORD_LABELS, YCOORD_LABELS = XCOORD, YCOORD
    else:
        # Default layout: three panels per row, cell size from the largest figure.
        LABELS = [letters[k] for k in range(n_figs)]
        XCOORD = [(k % 3) * width * 100 for k in range(n_figs)]
        YCOORD = [int(k / 3) * height * 100 for k in range(n_figs)]
        XCOORD_LABELS, YCOORD_LABELS = XCOORD, YCOORD

    # Make sure every figure exists as an SVG file on disk.
    LOCATIONS = []
    for k, fig in enumerate(FIGS):
        if type(fig) is str:
            LOCATIONS.append(fig)
            continue
        svg_file = os.path.join(gettempdir(), str(k) + '.svg')
        LOCATIONS.append(svg_file)
        fig.savefig(svg_file, format='svg', transparent=transparent)

    # Figure panels first, then the label panels on top.
    PANELS = [
        sg.Panel(sg.SVG(LOCATIONS[k]).move(XCOORD[k], YCOORD[k]))
        for k in range(n_figs)
    ]
    for k, text in enumerate(LABELS):
        letter = sg.Text(text, 15, 10, size=fontsize, weight='bold')
        PANELS.append(
            sg.Panel(letter.move(XCOORD_LABELS[k], YCOORD_LABELS[k])))

    # The page is fixed to A4 (21cm x 29.7cm).
    page = sg.Figure("21cm", "29.7cm", *PANELS)
    page.scale(SCALING_FACTOR).save(fig_name)

    if visualize:
        os.system('open ' + fig_name)  # works well with 'Gapplin' on OS-X
def fsapt_analyze(lig_dir, mode, ene_type):
    """Summarise the F-SAPT energies of one ligand as tables and a figure.

    Every ``FSAPT*out`` file in ``lig_dir`` is parsed with
    ``_get_ene_matrix``; the mean and standard deviation over all parsed
    matrices are written to CSV, drawn as an annotated matrix and combined
    with a 2D drawing of the ligand (atoms labelled by fragment number) into
    ``<lig_dir>/<lig_name>_<mode>_<ene_type>.svg`` and ``.pdf``. Finally a
    PDB file for PyMOL is written with ``write_pymol_pdb``.

    Args:
        lig_dir: directory of the ligand; its base name is the ligand name.
        mode: analysis mode; ``'prolig'`` and ``'proliglig'`` trigger the
            column relabelling, ``'prolig'`` selects the protein-ligand title.
        ene_type: energy term, passed on to the parser and used in file
            names and titles.
    """
    lig_name = os.path.basename(os.path.abspath(lig_dir))

    # Collect one energy matrix per output file; unparsable files yield None.
    parsed = [
        _get_ene_matrix(out_file, ene_type)
        for out_file in glob('%s/FSAPT*out' % lig_dir)
    ]
    matrix_dfs = [frame for frame in parsed if frame is not None]

    all_df = pd.concat(matrix_dfs, axis=1)
    stacked_groups = all_df.stack().groupby(level=[0, 1])
    mean_df = stacked_groups.mean().unstack()
    std_df = all_df.stack().groupby(level=[0, 1]).std().unstack()

    if mode in ['prolig', 'proliglig']:
        # Shorten the column labels and order the columns by the number
        # embedded in the label; 'Total' always sorts last.
        old_columns = mean_df.columns[:]
        new_labels = []
        numbering = []
        for old_label in old_columns:
            if old_label == 'Total':
                new_labels.append('Total')
                numbering.append(100000)
                continue
            parts = old_label.split('-')
            if len(parts) == 2:
                new_labels.append(''.join(parts))
                numbering.append(float(parts[-1]))
            elif len(parts) == 3:
                # Two numbers in the label: sort by their midpoint
                new_labels.append('-'.join(parts[1:]))
                numbering.append(0.5 * (float(parts[-1]) + float(parts[-2])))
        new_columns = [pair[1] for pair in sorted(zip(numbering, new_labels))]
        old_columns = [pair[1] for pair in sorted(zip(numbering, old_columns))]

        new_mean_df = pd.DataFrame()
        new_std_df = pd.DataFrame()
        for new_col, old_col in zip(new_columns, old_columns):
            new_mean_df[new_col] = mean_df[old_col]
            new_std_df[new_col] = std_df[old_col]
        mean_df, std_df = new_mean_df, new_std_df

    # Cell annotation: signed mean on the first line, +/- std on the second
    all_anno = (mean_df.applymap(lambda x: '%+.2f\n' % x) +
                std_df.applymap(lambda x: r'+/-%.2f' % x))

    matrix_svg = '%s/ene_matrix_%s.svg' % (lig_dir, ene_type)
    plot_matrix(mean_df, all_anno, matrix_svg, mode, ene_type)
    name_parts = (lig_dir, lig_name, mode, ene_type)
    mean_df.to_csv('%s/ene_mean_%s_%s_%s.csv' % name_parts)
    std_df.to_csv('%s/ene_std_%s_%s_%s.csv' % name_parts)

    # Plot the ligand
    dpi = 96
    width = len(mean_df.columns) + 2
    height = 4
    mol2_file = 'MD/%s/cmp_sybyl.mol2' % lig_name
    ligmol = cs._RdkitMolBase.from_file(mol2_file)
    ligmol._init_atominfo(reset=False)
    ligmol.charged_mol2file = 'MD/%s/cmp_sybyl.mol2' % lig_name
    ligmol.get_noh_mol()
    AllChem.Compute2DCoords(ligmol.noh_mol, canonOrient=True, bondLength=1.5)
    drawer = rdMolDraw2D.MolDraw2DSVG(width * dpi, height * dpi)
    opts = drawer.drawOptions()
    opts.additionalAtomLabelPadding = 0.1

    # Label every heavy atom whose fragment appears in the energy matrix:
    # carbons get the fragment number, other atoms "<element>/<number>".
    frag_dict, _ = fragment_mol(ligmol, 'L1')
    for heavy_atom in ligmol.noh_mol.GetAtoms():
        noh_idx = heavy_atom.GetIdx()
        h_idx = ligmol.noh_to_h_atom_mapping[noh_idx]
        frag_no = int(str(frag_dict[h_idx]['resid']))
        if 'L1-%02d' % frag_no not in mean_df.index:
            continue
        if heavy_atom.GetAtomicNum() == 6:
            atom_label = '%02d' % frag_no
        else:
            tripos_type = ligmol.GetAtomWithIdx(h_idx).GetProp('_TriposAtomType')
            elem = tripos_type.split('.')[0]
            atom_label = '%s/%02d' % (elem, frag_no)
        opts.atomLabels[noh_idx] = atom_label

    drawer.DrawMolecule(ligmol.noh_mol)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText().replace('svg:', '')
    struct_svg = '%s/lig_frag_%s.svg' % (lig_dir, ene_type)
    with open(struct_svg, 'w') as fh:
        fh.write(svg)

    # Consolidate the panels: matrix on top, ligand drawing below it
    if mode == 'prolig':
        title_text = 'Protein-Ligand %s Interaction' % ene_type.capitalize()
    else:
        title_text = 'Ligand-Ligand %s Interaction' % ene_type.capitalize()
    struct_top = dpi * len(mean_df) + 20

    mat_title = sc.Panel(sc.Text(title_text, size=24)).move(20, 20)
    mat_panel = sc.Panel(sc.SVG(matrix_svg).scale(1.4)).move(0, 20)
    struct_title = sc.Panel(sc.Text('Ligand %s' % lig_name,
                                    size=24)).move(20, struct_top)
    struct_panel = sc.Panel(sc.SVG(struct_svg)).move(0, struct_top)

    final_figure = sc.Figure(dpi * width,
                             dpi * (len(mean_df) + height) + 40, mat_panel,
                             mat_title, struct_panel, struct_title)
    final_name = '%s/%s_%s_%s' % (lig_dir, lig_name, mode, ene_type)
    final_figure.save('%s.svg' % final_name)
    os.system('convert -density 100 %s.svg %s.pdf' % (final_name, final_name))
    # The individual panel files are no longer needed
    os.system('rm -f %s %s' % (matrix_svg, struct_svg))

    # Write pdb for pymol
    inpdb = '%s/frame0/fsapt.pdb' % lig_dir
    outpdb = '%s_pymol.pdb' % final_name
    write_pymol_pdb(inpdb, outpdb, mean_df)
def calculate_proxy_svg(snp, pop, request, genome_build, r2_d="r2", window=500000, collapseTranscript=True):
    """Build the LD proxy plot for a query variant and export it as image files.

    The function reads ``config.yml``, resolves the query variant in the
    ``LDLink.dbsnp`` MongoDB collection, extracts its genotype row from the
    1000G VCF with ``tabix``, runs ``LDproxy_sub.py`` once per window chunk,
    and draws a Bokeh scatter plot plus a gene track.  Both are exported to
    SVG, stacked with svgutils and rasterized with ``phantomjs``.

    Files read from ``tmp_dir``: ``pops_<request>.txt``,
    ``recomb_<request>.json`` and ``genes_<request>.json`` or
    ``genes_c_<request>.json``.  Files written to ``tmp_dir``:
    ``proxy_plot_<request>.svg``, ``.pdf``, ``.png`` and ``.jpeg``.

    Parameters
    ----------
    snp : str
        Query variant: an rsID (``rs...``) or a ``chr<N>:<pos>`` coordinate,
        which is replaced by an rsID when the database knows one.
    pop : str
        Population label; only used in the plot title.
    request : str or False
        Identifier embedded in all temporary file names.  ``False`` selects a
        time-based identifier.
    genome_build : str
        Key into ``genome_build_vars``.
    r2_d : str
        ``"r2"`` or ``"d"``: statistic used for sorting and the y axis.  Any
        other value is treated as ``"r2"``.
    window : int
        Half-width, in base pairs, of the region around the query variant.
    collapseTranscript
        The string ``"false"`` selects the all-transcripts gene track; every
        other value (including the default) selects the collapsed track.
        NOTE(review): a boolean ``False`` therefore still yields the
        collapsed track -- confirm callers always pass strings.

    Returns
    -------
    None
        Also returned early, after removing the temporary inputs, when the
        variant has no VCF record, is multi-allelic, or is monoallelic in the
        selected populations.
    """
    # Set data directories using config.yml
    with open('config.yml', 'r') as yml_file:
        # safe_load: the config is plain data, and yaml.load() without an
        # explicit Loader is rejected by PyYAML >= 6.
        config = yaml.safe_load(yml_file)
    env = config['env']
    connect_external = config['database']['connect_external']
    api_mongo_addr = config['database']['api_mongo_addr']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    genotypes_dir = config['data']['genotypes_dir']
    mongo_username = config['database']['mongo_user_readonly']
    mongo_password = config['database']['mongo_password']
    mongo_port = config['database']['mongo_port']
    aws_info = config['aws']
    num_subprocesses = config['performance']['num_subprocesses']

    export_s3_keys = retrieveAWSCredentials()

    # Ensure tmp directory exists
    os.makedirs(tmp_dir, exist_ok=True)

    if request is False:
        request = str(time.strftime("%I%M%S"))

    build_vars = genome_build_vars[genome_build]
    position_key = build_vars['position']

    def remove_query_inputs():
        # Drop the population list and every VCF extract of this request.
        subprocess.call("rm " + tmp_dir + "pops_" + request + ".txt", shell=True)
        subprocess.call("rm " + tmp_dir + "*" + request + "*.vcf", shell=True)

    # Find coordinates (GRCh37/hg19) or (GRCh38/hg38) for SNP RS number
    # Connect to Mongo snp database
    if env == 'local' or connect_external:
        mongo_host = api_mongo_addr
    else:
        mongo_host = 'localhost'
    client = MongoClient(
        'mongodb://' + mongo_username + ':' + mongo_password + '@' +
        mongo_host + '/admin', mongo_port)
    db = client["LDLink"]

    def get_coords(db, rsid):
        # The collection stores ids without the "rs" prefix.
        rsid = _strip_prefix(rsid, "rs")
        query_results = db.dbsnp.find_one({"id": rsid})
        return json.loads(json_util.dumps(query_results))

    # Query genomic coordinates
    def get_rsnum(db, coord):
        temp_coord = _strip_prefix(coord, "chr").split(":")
        chro = temp_coord[0]
        pos = temp_coord[1]
        query_results = db.dbsnp.find({
            "chromosome": chro.upper() if chro == 'x' or chro == 'y' else str(chro),
            position_key: str(pos)
        })
        return json.loads(json_util.dumps(query_results))

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coord_rsid(db, snp):
        if snp[0:2] == "rs":
            return snp
        snp_info_lst = get_rsnum(db, snp)
        print("snp_info_lst")
        print(snp_info_lst)
        if not snp_info_lst:
            # No record at this position: keep the coordinate as given.
            return snp
        if len(snp_info_lst) == 1:
            return "rs" + snp_info_lst[0]['id']
        # Several records: prefer one that is its own reference id, else
        # fall back to the first record.
        ref_variants = [
            snp_info['id'] for snp_info in snp_info_lst
            if snp_info['id'] == snp_info['ref_id']
        ]
        if ref_variants:
            return "rs" + ref_variants[0]
        return "rs" + snp_info_lst[0]['id']

    snp = replace_coord_rsid(db, snp)

    # Find RS number in snp database
    snp_coord = get_coords(db, snp)

    # Get population ids from LDproxy.py tmp output files
    with open(tmp_dir + "pops_" + request + ".txt") as pops_fh:
        pop_ids = {line.strip() for line in pops_fh}

    # Extract query SNP phased genotypes
    vcf_filePath = "%s/%s%s/%s" % (
        config['aws']['data_subfolder'], genotypes_dir,
        build_vars['1000G_dir'],
        build_vars['1000G_file'] % (snp_coord['chromosome']))
    vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)

    genotype_work_dir = data_dir + genotypes_dir + build_vars['1000G_dir']
    tabix_snp_h = export_s3_keys + " cd {1}; tabix -HD {0} | grep CHROM".format(
        vcf_query_snp_file, genotype_work_dir)
    header_proc = subprocess.Popen(tabix_snp_h, shell=True,
                                   stdout=subprocess.PIPE)
    # communicate() drains the pipe and waits for the shell to finish.
    head = header_proc.communicate()[0].splitlines()[0].decode(
        'utf-8').strip().split()

    tabix_snp = export_s3_keys + " cd {4}; tabix -D {0} {1}:{2}-{2} | grep -v -e END > {3}".format(
        vcf_query_snp_file,
        build_vars['1000G_chr_prefix'] + snp_coord['chromosome'],
        snp_coord[position_key],
        tmp_dir + "snp_no_dups_" + request + ".vcf", genotype_work_dir)
    subprocess.call(tabix_snp, shell=True)

    # Check SNP is in the 1000G population, has the correct RS number, and not
    # monoallelic
    with open(tmp_dir + "snp_no_dups_" + request + ".vcf") as vcf_fh:
        vcf = vcf_fh.readlines()

    # When several records match, the last one is used.
    geno = vcf[-1].strip().split() if vcf else []
    if not geno:
        remove_query_inputs()
        return None
    geno[0] = _strip_prefix(geno[0], 'chr')

    if geno[2] != snp and snp[0:2] == "rs" and "rs" in geno[2]:
        snp = geno[2]

    # A comma in REF or ALT marks a multi-allelic record.
    if "," in geno[3] or "," in geno[4]:
        remove_query_inputs()
        return None

    # VCF sample columns start at index 9.
    index = [i for i in range(9, len(head)) if head[i] in pop_ids]

    genotypes = {"0": 0, "1": 0}
    for i in index:
        for allele in geno[i].split("|"):
            genotypes[allele] = genotypes.get(allele, 0) + 1

    if genotypes["0"] == 0 or genotypes["1"] == 0:
        remove_query_inputs()
        return None

    # Define window of interest around query SNP
    snp_position = int(snp_coord[position_key])
    coord1 = max(snp_position - window, 0)
    coord2 = snp_position + window

    # Calculate proxy LD statistics in parallel
    windowChunkRanges = chunkWindow(snp_position, window, num_subprocesses)

    # Argument lists (no shell) so the variant id and request id can never be
    # interpreted by a shell.
    commands = [[
        "python3", "LDproxy_sub.py", "True",
        str(snp),
        str(snp_coord['chromosome']),
        str(windowChunkRanges[subprocess_id][0]),
        str(windowChunkRanges[subprocess_id][1]),
        str(request), genome_build,
        str(subprocess_id)
    ] for subprocess_id in range(num_subprocesses)]

    processes = [
        subprocess.Popen(command, stdout=subprocess.PIPE)
        for command in commands
    ]

    # collect output in parallel
    def get_output(process):
        return process.communicate()[0].splitlines()

    if not hasattr(threading.current_thread(), "_children"):
        threading.current_thread()._children = weakref.WeakKeyDictionary()

    pool = Pool(len(processes))
    out_raw = pool.map(get_output, processes)
    pool.close()
    pool.join()

    # Aggregate output
    out_prox = []
    for chunk_lines in out_raw:
        for raw_line in chunk_lines:
            col = raw_line.decode('utf-8').strip().split("\t")
            col[6] = int(col[6])
            col[7] = float(col[7])
            col[8] = float(col[8])
            # Column 14: absolute distance, used as the secondary sort key.
            col.append(abs(int(col[6])))
            out_prox.append(col)

    # Sort output
    if r2_d not in ["r2", "d"]:
        r2_d = "r2"

    # Stable sort: by distance first, then by the LD statistic (descending).
    out_dist_sort = sorted(out_prox, key=operator.itemgetter(14))
    ld_column = 8 if r2_d == "r2" else 7
    out_ld_sort = sorted(out_dist_sort, key=operator.itemgetter(ld_column),
                         reverse=True)

    # Organize scatter plot data
    q_rs = []
    q_allele = []
    q_coord = []
    q_maf = []
    p_rs = []
    p_allele = []
    p_coord = []
    p_maf = []
    dist = []
    d_prime = []
    d_prime_round = []
    r2 = []
    r2_round = []
    corr_alleles = []
    regdb = []
    funct = []
    color = []
    size = []
    for i, record in enumerate(out_ld_sort):
        (q_rs_i, q_allele_i, q_coord_i, p_rs_i, p_allele_i, p_coord_i, dist_i,
         d_prime_i, r2_i, corr_alleles_i, regdb_i, q_maf_i, p_maf_i, funct_i,
         dist_abs) = record

        if float(r2_i) > 0.01:
            q_rs.append(q_rs_i)
            q_allele.append(q_allele_i)
            q_coord.append(float(q_coord_i.split(":")[1]) / 1000000)
            q_maf.append(str(round(float(q_maf_i), 4)))
            if p_rs_i == ".":
                p_rs_i = p_coord_i
            p_rs.append(p_rs_i)
            p_allele.append(p_allele_i)
            p_coord.append(float(p_coord_i.split(":")[1]) / 1000000)
            p_maf.append(str(round(float(p_maf_i), 4)))
            dist.append(str(round(dist_i / 1000000.0, 4)))
            d_prime.append(float(d_prime_i))
            d_prime_round.append(str(round(float(d_prime_i), 4)))
            r2.append(float(r2_i))
            r2_round.append(str(round(float(r2_i), 4)))
            corr_alleles.append(corr_alleles_i)

            # Correct Missing Annotations
            if regdb_i == ".":
                regdb_i = ""
            regdb.append(regdb_i)
            if funct_i == ".":
                funct_i = ""
            if funct_i == "NA":
                funct_i = "none"
            funct.append(funct_i)

            # Set Color
            if i == 0:
                color_i = "blue"
            elif funct_i != "none" and funct_i != "":
                color_i = "red"
            else:
                color_i = "orange"
            color.append(color_i)

            # Set Size
            size_i = 9 + float(p_maf_i) * 14.0
            size.append(size_i)

    # Begin Bokeh Plotting
    from collections import OrderedDict
    from bokeh.models import HoverTool, LinearAxis, Range1d
    from bokeh.plotting import ColumnDataSource, figure, reset_output
    from bokeh.io import export_svgs
    import svgutils.compose as sg

    reset_output()

    # Proxy Plot
    x = p_coord
    if r2_d == "r2":
        y = r2
    else:
        y = d_prime
    whitespace = 0.01
    xr = Range1d(start=coord1 / 1000000.0 - whitespace,
                 end=coord2 / 1000000.0 + whitespace)
    yr = Range1d(start=-0.03, end=1.03)
    sup_2 = "\u00B2"

    proxy_plot = figure(
        title="Proxies for " + snp + " in " + pop,
        min_border_top=2,
        min_border_bottom=2,
        min_border_left=60,
        min_border_right=60,
        h_symmetry=False,
        v_symmetry=False,
        plot_width=900,
        plot_height=600,
        x_range=xr,
        y_range=yr,
        tools="hover,tap,pan,box_zoom,box_select,undo,redo,reset,previewsave",
        logo=None,
        toolbar_location="above")

    proxy_plot.title.align = "center"

    # Add recombination rate from LDproxy.py output file
    recomb_file = tmp_dir + "recomb_" + request + ".json"
    recomb_x = []
    recomb_y = []
    with open(recomb_file) as recomb_fh:
        for recomb_raw_obj in recomb_fh:
            recomb_obj = json.loads(recomb_raw_obj)
            recomb_x.append(int(recomb_obj[position_key]) / 1000000.0)
            recomb_y.append(float(recomb_obj['rate']) / 100.0)

    data = {
        'x': x,
        'y': y,
        'qrs': q_rs,
        'q_alle': q_allele,
        'q_maf': q_maf,
        'prs': p_rs,
        'p_alle': p_allele,
        'p_maf': p_maf,
        'dist': dist,
        'r': r2_round,
        'd': d_prime_round,
        'alleles': corr_alleles,
        'regdb': regdb,
        'funct': funct,
        'size': size,
        'color': color
    }
    source = ColumnDataSource(data)

    proxy_plot.line(recomb_x, recomb_y, line_width=1, color="black", alpha=0.5)

    proxy_plot.circle(x='x', y='y', size='size', color='color', alpha=0.5,
                      source=source)

    hover = proxy_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Query Variant", "@qrs @q_alle"),
        ("Proxy Variant", "@prs @p_alle"),
        ("Distance (Mb)", "@dist"),
        ("MAF (Query,Proxy)", "@q_maf,@p_maf"),
        ("R" + sup_2, "@r"),
        ("D\'", "@d"),
        ("Correlated Alleles", "@alleles"),
        ("RegulomeDB", "@regdb"),
        ("Functional Class", "@funct"),
    ])

    proxy_plot.text(x, y, text=regdb, alpha=1, text_font_size="7pt",
                    text_baseline="middle", text_align="center", angle=0)

    if r2_d == "r2":
        proxy_plot.yaxis.axis_label = "R" + sup_2
    else:
        proxy_plot.yaxis.axis_label = "D\'"

    proxy_plot.extra_y_ranges = {"y2_axis": Range1d(start=-3, end=103)}
    proxy_plot.add_layout(
        LinearAxis(y_range_name="y2_axis",
                   axis_label="Combined Recombination Rate (cM/Mb)"),
        "right")

    # Rug Plot
    # NOTE: the rug figure is configured but not exported below.
    yr_rug = Range1d(start=-0.03, end=1.03)
    data_rug = dict(data, y2_ll=[-0.03] * len(x), y2_ul=[1.03] * len(x))
    source_rug = ColumnDataSource(data_rug)

    rug = figure(x_range=xr, y_range=yr_rug, border_fill_color='white',
                 y_axis_type=None, title="", min_border_top=2,
                 min_border_bottom=2, min_border_left=60, min_border_right=60,
                 h_symmetry=False, v_symmetry=False, plot_width=900,
                 plot_height=50, tools="xpan,tap", logo=None)

    rug.segment(x0='x', y0='y2_ll', x1='x', y1='y2_ul', source=source_rug,
                color='color', alpha=0.5, line_width=1)
    rug.toolbar_location = None

    gap = 80000
    tall = 0.75
    chromosome_axis_label = "Chromosome " + snp_coord[
        'chromosome'] + " Coordinate (Mb)(" + build_vars['title'] + ")"

    if collapseTranscript == "false":
        # Gene Plot (All Transcripts)
        genes_file = tmp_dir + "genes_" + request + ".json"
        with open(genes_file) as genes_fh:
            genes_raw = genes_fh.readlines()

        genes_plot_start = []
        genes_plot_end = []
        genes_plot_y = []
        genes_plot_name = []
        exons_plot_x = []
        exons_plot_y = []
        exons_plot_w = []
        exons_plot_h = []
        exons_plot_name = []
        exons_plot_id = []
        exons_plot_exon = []
        # End coordinate of the last transcript placed on each row.
        lines = [0]

        for gene_raw_obj in genes_raw:
            gene_obj = json.loads(gene_raw_obj)
            transcript_id = gene_obj["name"]
            strand = gene_obj["strand"]
            txStart = gene_obj["txStart"]
            txEnd = gene_obj["txEnd"]
            name = gene_obj["name2"]
            e_start = gene_obj["exonStarts"].split(",")
            e_end = gene_obj["exonEnds"].split(",")

            # Determine Y Coordinate
            y_coord = _assign_gene_row(lines, int(txStart), int(txEnd), gap)

            genes_plot_start.append(int(txStart) / 1000000.0)
            genes_plot_end.append(int(txEnd) / 1000000.0)
            genes_plot_y.append(y_coord)
            genes_plot_name.append(name + " ")

            # The last split element is skipped (presumably empty because of
            # a trailing comma -- TODO confirm).
            for i in range(len(e_start) - 1):
                # Exons are numbered along the direction of transcription.
                if strand == "+":
                    exon = i + 1
                else:
                    exon = len(e_start) - 1 - i

                width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                exons_plot_x.append(x_coord)
                exons_plot_y.append(y_coord)
                exons_plot_w.append(width)
                exons_plot_h.append(tall)
                exons_plot_name.append(name)
                exons_plot_id.append(transcript_id)
                exons_plot_exon.append(exon)

        n_rows = len(lines)
        # Flip rows so row 1 is drawn at the top.
        genes_plot_yn = [n_rows - row + 0.5 for row in genes_plot_y]
        exons_plot_yn = [n_rows - row + 0.5 for row in exons_plot_y]

        yr2 = Range1d(start=0, end=n_rows)

        data_gene_plot = {
            'exons_plot_x': exons_plot_x,
            'exons_plot_yn': exons_plot_yn,
            'exons_plot_w': exons_plot_w,
            'exons_plot_h': exons_plot_h,
            'exons_plot_name': exons_plot_name,
            'exons_plot_id': exons_plot_id,
            'exons_plot_exon': exons_plot_exon
        }
        source_gene_plot = ColumnDataSource(data_gene_plot)

        if len(lines) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines) - 2) * 50

        gene_plot = figure(
            x_range=xr,
            y_range=yr2,
            border_fill_color='white',
            title="",
            min_border_top=2,
            min_border_bottom=2,
            min_border_left=60,
            min_border_right=60,
            h_symmetry=False,
            v_symmetry=False,
            plot_width=900,
            plot_height=plot_h_pix,
            tools="hover,tap,xpan,box_zoom,undo,redo,reset,previewsave",
            logo=None)

        gene_plot.segment(genes_plot_start, genes_plot_yn, genes_plot_end,
                          genes_plot_yn, color="black", alpha=1, line_width=2)

        gene_plot.rect(x='exons_plot_x', y='exons_plot_yn',
                       width='exons_plot_w', height='exons_plot_h',
                       source=source_gene_plot, fill_color="grey",
                       line_color="grey")
        gene_plot.xaxis.axis_label = chromosome_axis_label
        gene_plot.yaxis.axis_label = "Genes (All Transcripts)"
        gene_plot.ygrid.grid_line_color = None
        gene_plot.yaxis.axis_line_color = None
        gene_plot.yaxis.minor_tick_line_color = None
        gene_plot.yaxis.major_tick_line_color = None
        gene_plot.yaxis.major_label_text_color = None

        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_plot_name"),
            ("ID", "@exons_plot_id"),
            ("Exon", "@exons_plot_exon"),
        ])

        gene_plot.text(genes_plot_start, genes_plot_yn, text=genes_plot_name,
                       alpha=1, text_font_size="7pt", text_font_style="bold",
                       text_baseline="middle", text_align="right", angle=0)

        gene_plot.toolbar_location = "below"

    # Gene Plot (Collapsed)
    else:
        genes_c_file = tmp_dir + "genes_c_" + request + ".json"
        with open(genes_c_file) as genes_c_fh:
            genes_c_raw = genes_c_fh.readlines()

        genes_c_plot_start = []
        genes_c_plot_end = []
        genes_c_plot_y = []
        genes_c_plot_name = []
        exons_c_plot_x = []
        exons_c_plot_y = []
        exons_c_plot_w = []
        exons_c_plot_h = []
        exons_c_plot_name = []
        exons_c_plot_id = []
        # End coordinate of the last gene placed on each row.
        lines_c = [0]

        for gene_c_raw_obj in genes_c_raw:
            gene_c_obj = json.loads(gene_c_raw_obj)
            txStart = gene_c_obj["txStart"]
            txEnd = gene_c_obj["txEnd"]
            name = gene_c_obj["name2"]
            e_start = gene_c_obj["exonStarts"].split(",")
            e_end = gene_c_obj["exonEnds"].split(",")
            e_transcripts = gene_c_obj["transcripts"].split(",")

            # Determine Y Coordinate
            y_coord = _assign_gene_row(lines_c, int(txStart), int(txEnd), gap)

            genes_c_plot_start.append(int(txStart) / 1000000.0)
            genes_c_plot_end.append(int(txEnd) / 1000000.0)
            genes_c_plot_y.append(y_coord)
            genes_c_plot_name.append(name + " ")

            # The last split element is skipped (presumably empty because of
            # a trailing comma -- TODO confirm).
            for i in range(len(e_start) - 1):
                width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                exons_c_plot_x.append(x_coord)
                exons_c_plot_y.append(y_coord)
                exons_c_plot_w.append(width)
                exons_c_plot_h.append(tall)
                exons_c_plot_name.append(name)
                exons_c_plot_id.append(e_transcripts[i].replace("-", ","))

        n_rows_c = len(lines_c)
        # Flip rows so row 1 is drawn at the top.
        genes_c_plot_yn = [n_rows_c - row + 0.5 for row in genes_c_plot_y]
        exons_c_plot_yn = [n_rows_c - row + 0.5 for row in exons_c_plot_y]

        yr2_c = Range1d(start=0, end=n_rows_c)

        data_gene_c_plot = {
            'exons_c_plot_x': exons_c_plot_x,
            'exons_c_plot_yn': exons_c_plot_yn,
            'exons_c_plot_w': exons_c_plot_w,
            'exons_c_plot_h': exons_c_plot_h,
            'exons_c_plot_name': exons_c_plot_name,
            'exons_c_plot_id': exons_c_plot_id
        }
        source_gene_c_plot = ColumnDataSource(data_gene_c_plot)

        if len(lines_c) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines_c) - 2) * 50

        gene_plot = figure(
            min_border_top=2,
            min_border_bottom=0,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=yr2_c,
            border_fill_color='white',
            title="",
            h_symmetry=False,
            v_symmetry=False,
            logo=None,
            plot_width=900,
            plot_height=plot_h_pix,
            tools=
            "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        gene_plot.segment(genes_c_plot_start, genes_c_plot_yn,
                          genes_c_plot_end, genes_c_plot_yn, color="black",
                          alpha=1, line_width=2)
        gene_plot.rect(x='exons_c_plot_x', y='exons_c_plot_yn',
                       width='exons_c_plot_w', height='exons_c_plot_h',
                       source=source_gene_c_plot, fill_color="grey",
                       line_color="grey")
        gene_plot.text(genes_c_plot_start, genes_c_plot_yn,
                       text=genes_c_plot_name, alpha=1, text_font_size="7pt",
                       text_font_style="bold", text_baseline="middle",
                       text_align="right", angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_c_plot_name"),
            ("Transcript IDs", "@exons_c_plot_id"),
        ])

        gene_plot.xaxis.axis_label = chromosome_axis_label
        gene_plot.yaxis.axis_label = "Genes (Transcripts Collapsed)"
        gene_plot.ygrid.grid_line_color = None
        gene_plot.yaxis.axis_line_color = None
        gene_plot.yaxis.minor_tick_line_color = None
        gene_plot.yaxis.major_tick_line_color = None
        gene_plot.yaxis.major_label_text_color = None

        gene_plot.toolbar_location = "below"

    # Change output backend to SVG temporarily for headless export
    # Will be changed back to canvas in LDlink.js
    proxy_plot.output_backend = "svg"
    rug.output_backend = "svg"
    gene_plot.output_backend = "svg"
    proxy_part_svg = tmp_dir + "proxy_plot_1_" + request + ".svg"
    gene_part_svg = tmp_dir + "gene_plot_1_" + request + ".svg"
    combined_svg = tmp_dir + "proxy_plot_" + request + ".svg"
    scaled_svg = tmp_dir + "proxy_plot_scaled_" + request + ".svg"
    export_svgs(proxy_plot, filename=proxy_part_svg)
    export_svgs(gene_plot, filename=gene_part_svg)

    # 1 pixel = 0.0264583333 cm
    svg_height = str(20.00 + (0.0264583333 * plot_h_pix)) + "cm"
    svg_height_scaled = str(100.00 + (0.1322916665 * plot_h_pix)) + "cm"

    # Concatenate svgs: the gene track goes below the proxy plot; the second
    # figure is the same layout at 5x scale.
    sg.Figure("24.59cm", svg_height, sg.SVG(proxy_part_svg),
              sg.SVG(gene_part_svg).move(0, 630)).save(combined_svg)

    sg.Figure("122.95cm", svg_height_scaled,
              sg.SVG(proxy_part_svg).scale(5),
              sg.SVG(gene_part_svg).scale(5).move(0, 3150)).save(scaled_svg)

    # Export to PDF
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "proxy_plot_" +
                    request + ".svg " + tmp_dir + "proxy_plot_" + request +
                    ".pdf", shell=True)
    # Export to PNG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                    "proxy_plot_scaled_" + request + ".svg " + tmp_dir +
                    "proxy_plot_" + request + ".png", shell=True)
    # Export to JPEG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                    "proxy_plot_scaled_" + request + ".svg " + tmp_dir +
                    "proxy_plot_" + request + ".jpeg", shell=True)
    # Remove individual SVG files after they are combined
    subprocess.call("rm " + tmp_dir + "proxy_plot_1_" + request + ".svg",
                    shell=True)
    subprocess.call("rm " + tmp_dir + "gene_plot_1_" + request + ".svg",
                    shell=True)
    # Remove scaled SVG file after it is converted to png and jpeg
    subprocess.call("rm " + tmp_dir + "proxy_plot_scaled_" + request + ".svg",
                    shell=True)

    reset_output()

    # Remove temporary files
    remove_query_inputs()
    subprocess.call("rm " + tmp_dir + "genes_*" + request + "*.json",
                    shell=True)
    # NOTE(review): the recombination file read above ends in ".json", but
    # this removes a ".txt" name -- confirm which one is intended.
    subprocess.call("rm " + tmp_dir + "recomb_" + request + ".txt", shell=True)

    # Return plot output
    return None


def _strip_prefix(text, prefix):
    """Return ``text`` without a leading ``prefix``.

    Unlike ``str.strip``/``str.lstrip``, which remove any of the given
    characters, this removes the exact prefix once.
    """
    if text.startswith(prefix):
        return text[len(prefix):]
    return text


def _assign_gene_row(row_ends, tx_start, tx_end, gap):
    """Place a gene on the first row with enough room and return its 1-based row.

    ``row_ends`` holds the end coordinate of the last gene on each row and is
    updated in place; a new row is appended when the gene fits on none.
    """
    for row, last_end in enumerate(row_ends):
        if tx_start > gap + last_end:
            row_ends[row] = tx_end
            return row + 1
    row_ends.append(tx_end)
    return len(row_ends)
def add_title(self, fig, title, y_pos=-0.2, w=None):
    """
    Add a numbered title to a figure and advance the figure counter.

    `fig`: a plotly/svgutils Figure object, or the path of a png file.
    `title`: figure title; it is prefixed with `self.prefix` and
        `self.current_num`.
    `y_pos`: vertical position of the title. For a plotly figure it is the
        annotation's `y` in paper coordinates. For an svgutils figure the
        default places the title 5px above the bottom edge; any other value
        must lie in [0, 1] and is measured as a fraction of the height from
        the bottom.
    `w`: output image width in px; falls back to `self.output_width`.

    Returns the updated plotly Figure, or a new svgutils Figure. Returns
    None (after printing a message) when `fig` is a path that does not
    exist or is not a `.png` file.
    """
    if isinstance(fig, str):
        if not os.path.exists(fig):
            print('File %s does not exist!' % (fig))
            return
        if os.path.splitext(os.path.abspath(fig))[-1] != '.png':
            print('Only png image supported.')
            return
        # Only the pixel size is needed; close the file handle right away.
        with Image.open(fig) as im:
            im_width, im_height = im.width, im.height
        _w = w if w else self.output_width
        _scalar = _w * 1.0 / im_width
        # Wrap the png in an svgutils figure scaled to the output width.
        fig = sc.Figure(_w,
                        ceil(_scalar * im_height),
                        sc.Image(im_width, im_height, fig).scale(_scalar))

    assert isinstance(fig, go.Figure) or isinstance(fig, sc.Figure)

    title_txt = '%s%s ' % (self.prefix, self.current_num) + title
    _w = w if w else self.output_width

    if isinstance(fig, go.Figure):
        # compute image size, keeping the aspect ratio of the layout
        if fig.layout.width is not None:
            _scalar = _w * 1.0 / fig.layout.width
        else:
            # default width is 700px
            _scalar = _w * 1.0 / 700
        if fig.layout.height is not None:
            _h = fig.layout.height * _scalar
        else:
            # default height is 450px
            _h = 450.0 * _scalar

        title_annotation = go.layout.Annotation(
            xref='paper',
            yref='paper',
            x=0.5,
            y=y_pos,
            xanchor='center',
            yanchor='top',
            text=title_txt,
            font=dict(
                family=self.font_family,
                size=self.font_size,
                color="#000000",
            ),
            showarrow=False,
        )
        fig.update_layout(
            width=_w,
            height=_h,
            annotations=list(fig.layout['annotations']) + [title_annotation],
        )
        # increasing counter number
        self.current_num += 1
        return fig

    # svgutils figure: scale to the requested width and reserve 25px of
    # extra height for the title line.
    _scalar = _w * 1.0 / fig.width.value
    _h = fig.height.value * _scalar + 25
    if y_pos == -0.2:
        text_y = _h - 5
    else:
        assert y_pos <= 1 and y_pos >= 0
        text_y = int(_h * (1 - y_pos))

    # if w < default output width, add margin so the figure is centred
    _outw = max(_w, self.output_width)
    _move_x = int((_outw - _w) / 2)

    # add title
    new_figure = sc.Figure(
        _outw,
        _h,
        fig.scale(_scalar).move(_move_x, 0),
        sc.Text(
            title_txt,
            _outw / 2,
            text_y,
            anchor='middle',
            size=self.font_size,
            font=self.font_family,
        ))
    # increasing counter number
    self.current_num += 1
    return new_figure