コード例 #1
0
def calculate_matrix_svg(snplst,
                         pop,
                         request,
                         genome_build,
                         r2_d="r2",
                         collapseTranscript=True):

    # Set data directories using config.yml
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file)
    env = config['env']
    connect_external = config['database']['connect_external']
    api_mongo_addr = config['database']['api_mongo_addr']
    population_samples_dir = config['data']['population_samples_dir']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    genotypes_dir = config['data']['genotypes_dir']
    aws_info = config['aws']
    mongo_username = config['database']['mongo_user_readonly']
    mongo_password = config['database']['mongo_password']
    mongo_port = config['database']['mongo_port']

    export_s3_keys = retrieveAWSCredentials()

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Open SNP list file
    snps_raw = open(snplst).readlines()

    # Remove duplicate RS numbers
    snps = []
    for snp_raw in snps_raw:
        snp = snp_raw.strip().split()
        if snp not in snps:
            snps.append(snp)

    # Select desired ancestral populations
    pops = pop.split("+")
    pop_dirs = []
    for pop_i in pops:
        if pop_i in [
                "ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB",
                "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH",
                "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL",
                "PJL", "PUR", "STU", "TSI", "YRI"
        ]:
            pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt")

    get_pops = "cat " + " ".join(pop_dirs)
    pop_list = [
        x.decode('utf-8') for x in subprocess.Popen(
            get_pops, shell=True, stdout=subprocess.PIPE).stdout.readlines()
    ]

    ids = [i.strip() for i in pop_list]
    pop_ids = list(set(ids))

    # Connect to Mongo snp database
    if env == 'local' or connect_external:
        mongo_host = api_mongo_addr
    else:
        mongo_host = 'localhost'
    client = MongoClient(
        'mongodb://' + mongo_username + ':' + mongo_password + '@' +
        mongo_host + '/admin', mongo_port)
    db = client["LDLink"]

    def get_coords(db, rsid):
        """Look up a single dbSNP record by RS number.

        Args:
            db: Database handle exposing a ``dbsnp`` collection.
            rsid: Variant identifier, with or without a leading "rs".

        Returns:
            The result of ``db.dbsnp.find_one`` round-tripped through
            ``json_util.dumps`` / ``json.loads`` so it contains only plain
            JSON types. Callers compare the result against ``None`` to
            detect a missing record.
        """
        # Remove only the literal "rs" prefix. str.strip("rs") treats its
        # argument as a set of characters and would also eat any other
        # leading or trailing 'r' / 's'.
        if rsid.startswith("rs"):
            rsid = rsid[2:]
        query_results = db.dbsnp.find_one({"id": rsid})
        return json.loads(json_util.dumps(query_results))

    # Query genomic coordinates
    def get_rsnum(db, coord):
        """Find dbSNP records located at a genomic coordinate.

        Args:
            db: Database handle exposing a ``dbsnp`` collection.
            coord: Coordinate string shaped like "chr<chrom>:<pos>"; the
                "chr" prefix is optional.

        Returns:
            A list of matching records reduced to plain JSON types. The
            list is empty when ``coord`` does not contain a ":" separator,
            so malformed input is reported as "no match" instead of
            raising IndexError.
        """
        # Remove only the literal "chr" prefix; str.strip("chr") would
        # treat the argument as a character set.
        if coord.startswith("chr"):
            coord = coord[3:]
        temp_coord = coord.split(":")
        if len(temp_coord) < 2:
            # Not a "<chrom>:<pos>" token, so there is nothing to query.
            return []
        chro = temp_coord[0]
        pos = temp_coord[1]
        # Lowercase sex chromosomes are upper-cased for the query; the
        # position field name depends on the selected genome build.
        query_results = db.dbsnp.find({
            "chromosome":
            chro.upper() if chro in ('x', 'y') else str(chro),
            genome_build_vars[genome_build]['position']:
            str(pos)
        })
        return json.loads(json_util.dumps(query_results))

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coords_rsid(db, snp_lst):
        """Swap genomic-coordinate entries for their dbSNP RS numbers.

        Args:
            db: Database handle passed through to ``get_rsnum``.
            snp_lst: List of token lists, one per input line; the first
                token of each entry is the variant identifier.

        Returns:
            A new list of the same length. Entries that are empty, that
            already start with "rs", or that have no match from
            ``get_rsnum`` are kept unchanged. Matched entries are replaced
            by ``["rs<id>"]``.
        """
        new_snp_lst = []
        for snp_raw_i in snp_lst:
            # A blank input line tokenizes to []; keep it as-is rather than
            # failing on snp_raw_i[0]. Later stages skip empty entries.
            if not snp_raw_i or snp_raw_i[0].startswith("rs"):
                new_snp_lst.append(snp_raw_i)
                continue

            snp_info_lst = get_rsnum(db, snp_raw_i[0])
            if not snp_info_lst:
                # No record at this coordinate: leave the entry untouched.
                new_snp_lst.append(snp_raw_i)
                continue

            if len(snp_info_lst) == 1:
                var_id = snp_info_lst[0]['id']
            else:
                # Several records share the position: prefer the first one
                # whose id equals its own ref_id, otherwise fall back to
                # the first record returned.
                ref_variants = [
                    snp_info['id'] for snp_info in snp_info_lst
                    if snp_info['id'] == snp_info['ref_id']
                ]
                if ref_variants:
                    var_id = ref_variants[0]
                else:
                    var_id = snp_info_lst[0]['id']
            new_snp_lst.append(["rs" + var_id])
        return new_snp_lst

    snps = replace_coords_rsid(db, snps)

    # Find RS numbers in snp database
    rs_nums = []
    snp_pos = []
    snp_coords = []
    tabix_coords = ""
    for snp_i in snps:
        if len(snp_i) > 0:
            if len(snp_i[0]) > 2:
                if (snp_i[0][0:2] == "rs"
                        or snp_i[0][0:3] == "chr") and snp_i[0][-1].isdigit():
                    snp_coord = get_coords(db, snp_i[0])
                    if snp_coord != None and snp_coord[genome_build_vars[
                            genome_build]['position']] != "NA":
                        # check if variant is on chrY for genome build = GRCh38
                        if not (snp_coord['chromosome'] == "Y" and
                                (genome_build == "grch38"
                                 or genome_build == "grch38_high_coverage")):
                            rs_nums.append(snp_i[0])
                            snp_pos.append(snp_coord[
                                genome_build_vars[genome_build]['position']])
                            temp = [
                                snp_i[0], snp_coord['chromosome'],
                                snp_coord[genome_build_vars[genome_build]
                                          ['position']]
                            ]
                            snp_coords.append(temp)

    # Check max distance between SNPs
    distance_bp = []
    for i in range(len(snp_coords)):
        distance_bp.append(int(snp_coords[i][2]))

    # Sort coordinates and make tabix formatted coordinates
    snp_pos_int = [int(i) for i in snp_pos]
    snp_pos_int.sort()
    # keep track of rs and snp postion after sort
    rs_snp_pos = []
    for i in snp_pos_int:
        rs_snp_pos.append(snp_pos.index(str(i)))
    snp_coord_str = [
        genome_build_vars[genome_build]['1000G_chr_prefix'] +
        snp_coords[0][1] + ":" + str(i) + "-" + str(i) for i in snp_pos_int
    ]
    tabix_coords = " " + " ".join(snp_coord_str)

    # Extract 1000 Genomes phased genotypes
    vcf_filePath = "%s/%s%s/%s" % (
        config['aws']['data_subfolder'], genotypes_dir,
        genome_build_vars[genome_build]['1000G_dir'],
        genome_build_vars[genome_build]['1000G_file'] % (snp_coords[0][1]))
    vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)
    vcf, h = retrieveTabix1000GData(
        vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir +
        genome_build_vars[genome_build]['1000G_dir'])

    # Define function to correct indel alleles
    def set_alleles(a1, a2):
        """Normalize a pair of allele strings for display.

        When both alleles are a single character they are returned
        unchanged. Otherwise the first character of each allele is dropped,
        and an allele that becomes empty is written as "-".

        Args:
            a1: First allele string (non-empty).
            a2: Second allele string (non-empty).

        Returns:
            Tuple ``(a1_n, a2_n)`` of the normalized alleles.

        Raises:
            ValueError: If either allele is empty. Previously this case
                fell through every branch and failed with
                UnboundLocalError.
        """
        if not a1 or not a2:
            raise ValueError(
                "alleles must be non-empty, got %r and %r" % (a1, a2))
        if len(a1) == 1 and len(a2) == 1:
            return (a1, a2)
        # At least one allele is longer than one character: drop the first
        # character of each, using "-" for an allele left with nothing.
        return (a1[1:] or "-", a2[1:] or "-")

    head = vcf[h].strip().split()

    # Extract haplotypes
    index = []
    for i in range(9, len(head)):
        if head[i] in pop_ids:
            index.append(i)

    hap1 = [[]]
    for i in range(len(index) - 1):
        hap1.append([])
    hap2 = [[]]
    for i in range(len(index) - 1):
        hap2.append([])

    snp_dict, missing_snp = parse_vcf(vcf[h + 1:], snp_coords, True)

    rsnum_lst = []
    allele_lst = []
    pos_lst = []

    for s_key in snp_dict:
        # parse snp_key such as chr7:pos_rs4
        snp_keys = s_key.split("_")
        snp_key = snp_keys[0].split(':')[1]
        rs_input = snp_keys[1]
        geno_list = snp_dict[s_key]
        g = -1
        for geno in geno_list:
            g = g + 1
            geno = geno.strip().split()
            geno[0] = geno[0].lstrip('chr')
            # if 1000G position does not match dbSNP position for variant, use dbSNP position
            if geno[1] != snp_key:
                geno[1] = snp_key

            if "," not in geno[3] and "," not in geno[4]:
                a1, a2 = set_alleles(geno[3], geno[4])
                for i in range(len(index)):
                    if geno[index[i]] == "0|0":
                        hap1[i].append(a1)
                        hap2[i].append(a1)
                    elif geno[index[i]] == "0|1":
                        hap1[i].append(a1)
                        hap2[i].append(a2)
                    elif geno[index[i]] == "1|0":
                        hap1[i].append(a2)
                        hap2[i].append(a1)
                    elif geno[index[i]] == "1|1":
                        hap1[i].append(a2)
                        hap2[i].append(a2)
                    elif geno[index[i]] == "0":
                        hap1[i].append(a1)
                        hap2[i].append(".")
                    elif geno[index[i]] == "1":
                        hap1[i].append(a2)
                        hap2[i].append(".")
                    else:
                        hap1[i].append(".")
                        hap2[i].append(".")

                rsnum_lst.append(rs_input)

                position = "chr" + geno[0] + ":" + geno[1] + "-" + geno[1]
                pos_lst.append(position)
                alleles = a1 + "/" + a2
                allele_lst.append(alleles)

    # Calculate Pairwise LD Statistics
    all_haps = hap1 + hap2
    ld_matrix = [[[None for v in range(2)] for i in range(len(all_haps[0]))]
                 for j in range(len(all_haps[0]))]

    for i in range(len(all_haps[0])):
        for j in range(i, len(all_haps[0])):
            hap = {}
            for k in range(len(all_haps)):
                # Extract haplotypes
                hap_k = all_haps[k][i] + all_haps[k][j]
                if hap_k in hap:
                    hap[hap_k] += 1
                else:
                    hap[hap_k] = 1

            # Remove Missing Haplotypes
            keys = list(hap.keys())
            for key in keys:
                if "." in key:
                    hap.pop(key, None)

            # Check all haplotypes are present
            if len(hap) != 4:
                snp_i_a = allele_lst[i].split("/")
                snp_j_a = allele_lst[j].split("/")
                haps = [
                    snp_i_a[0] + snp_j_a[0], snp_i_a[0] + snp_j_a[1],
                    snp_i_a[1] + snp_j_a[0], snp_i_a[1] + snp_j_a[1]
                ]
                for h in haps:
                    if h not in hap:
                        hap[h] = 0

            # Perform LD calculations
            A = hap[sorted(hap)[0]]
            B = hap[sorted(hap)[1]]
            C = hap[sorted(hap)[2]]
            D = hap[sorted(hap)[3]]
            tmax = max(A, B, C, D)
            delta = float(A * D - B * C)
            Ms = float((A + C) * (B + D) * (A + B) * (C + D))
            if Ms != 0:
                # D prime
                if delta < 0:
                    D_prime = round(
                        abs(delta / min((A + C) * (A + B), (B + D) * (C + D))),
                        3)
                else:
                    D_prime = round(
                        abs(delta / min((A + C) * (C + D), (A + B) * (B + D))),
                        3)

                # R2
                r2 = round((delta**2) / Ms, 3)

                # Find Correlated Alleles
                if str(r2) != "NA" and float(r2) > 0.1:
                    Ac = hap[sorted(hap)[0]]
                    Bc = hap[sorted(hap)[1]]
                    Cc = hap[sorted(hap)[2]]
                    Dc = hap[sorted(hap)[3]]

                    if ((Ac * Dc) / max((Bc * Cc), 0.01) > 1):
                        match = sorted(hap)[0][0] + "=" + sorted(
                            hap)[0][1] + "," + sorted(
                                hap)[3][0] + "=" + sorted(hap)[3][1]
                    else:
                        match = sorted(hap)[1][0] + "=" + sorted(
                            hap)[1][1] + "," + sorted(
                                hap)[2][0] + "=" + sorted(hap)[2][1]
                else:
                    match = "  =  ,  =  "
            else:
                D_prime = "NA"
                r2 = "NA"
                match = "  =  ,  =  "

            snp1 = rsnum_lst[i]
            snp2 = rsnum_lst[j]
            pos1 = pos_lst[i].split("-")[0]
            pos2 = pos_lst[j].split("-")[0]
            allele1 = allele_lst[i]
            allele2 = allele_lst[j]
            corr = match.split(",")[0].split("=")[1] + "=" + match.split(
                ",")[0].split("=")[0] + "," + match.split(",")[1].split(
                    "=")[1] + "=" + match.split(",")[1].split("=")[0]
            corr_f = match

            ld_matrix[i][j] = [
                snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2
            ]
            ld_matrix[j][i] = [
                snp2, snp1, allele2, allele1, corr_f, pos2, pos1, D_prime, r2
            ]

    # Generate Plot Variables
    out = [j for i in ld_matrix for j in i]
    xnames = []
    ynames = []
    xA = []
    yA = []
    corA = []
    xpos = []
    ypos = []
    D = []
    R = []
    box_color = []
    box_trans = []

    if r2_d not in ["r2", "d"]:
        r2_d = "r2"

    for i in range(len(out)):
        snp1, snp2, allele1, allele2, corr, pos1, pos2, D_prime, r2 = out[i]
        xnames.append(snp1)
        ynames.append(snp2)
        xA.append(allele1)
        yA.append(allele2)
        corA.append(corr)
        xpos.append(pos1)
        ypos.append(pos2)
        sqrti = math.floor(math.sqrt(len(out)))
        if sqrti == 0:
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("red")
            box_trans.append(r2)
        elif i % sqrti < i // sqrti and r2 != "NA":
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("blue")
            box_trans.append(abs(D_prime))
        elif i % sqrti > i // sqrti and D_prime != "NA":
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("red")
            box_trans.append(r2)
        elif i % sqrti == i // sqrti and D_prime != "NA":
            D.append(str(round(float(D_prime), 4)))
            R.append(str(round(float(r2), 4)))
            box_color.append("purple")
            box_trans.append(r2)
        else:
            D.append("NA")
            R.append("NA")
            box_color.append("gray")
            box_trans.append(0.1)
    # Import plotting modules
    from collections import OrderedDict
    from bokeh.embed import components, file_html
    from bokeh.layouts import gridplot
    from bokeh.models import HoverTool, LinearAxis, Range1d
    from bokeh.plotting import ColumnDataSource, curdoc, figure, output_file, reset_output, save
    from bokeh.resources import CDN
    from bokeh.io import export_svgs
    import svgutils.compose as sg
    from math import pi

    reset_output()

    # Aggregate Plotting Data
    x = []
    y = []
    w = []
    h = []
    coord_snps_plot = []
    snp_id_plot = []
    alleles_snp_plot = []
    for i in range(0, len(xpos), int(len(xpos)**0.5)):
        x.append(int(xpos[i].split(":")[1]) / 1000000.0)
        y.append(0.5)
        w.append(0.00003)
        h.append(1.06)
        coord_snps_plot.append(xpos[i])
        snp_id_plot.append(xnames[i])
        alleles_snp_plot.append(xA[i])

    buffer = (x[-1] - x[0]) * 0.025
    xr = Range1d(start=x[0] - buffer, end=x[-1] + buffer)
    yr = Range1d(start=-0.03, end=1.03)
    y2_ll = [-0.03] * len(x)
    y2_ul = [1.03] * len(x)

    yr_pos = Range1d(start=(x[-1] + buffer) * -1, end=(x[0] - buffer) * -1)
    yr0 = Range1d(start=0, end=1)
    yr2 = Range1d(start=0, end=3.8)
    yr3 = Range1d(start=0, end=1)

    spacing = (x[-1] - x[0] + buffer + buffer) / (len(x) * 1.0)
    x2 = []
    y0 = []
    y1 = []
    y2 = []
    y3 = []
    y4 = []
    for i in range(len(x)):
        x2.append(x[0] - buffer + spacing * (i + 0.5))
        y0.append(0)
        y1.append(0.20)
        y2.append(0.80)
        y3.append(1)
        y4.append(1.15)

    xname_pos = []
    for i in x2:
        for j in range(len(x2)):
            xname_pos.append(i)

    data = {
        'xname': xnames,
        'xname_pos': xname_pos,
        'yname': ynames,
        'xA': xA,
        'yA': yA,
        'xpos': xpos,
        'ypos': ypos,
        'R2': R,
        'Dp': D,
        'corA': corA,
        'box_color': box_color,
        'box_trans': box_trans
    }

    source = ColumnDataSource(data)

    threshold = 70
    if len(snps) < threshold:
        matrix_plot = figure(
            outline_line_color="white",
            min_border_top=0,
            min_border_bottom=2,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=list(reversed(rsnum_lst)),
            h_symmetry=False,
            v_symmetry=False,
            border_fill_color='white',
            x_axis_type=None,
            logo=None,
            tools="hover,undo,redo,reset,pan,box_zoom,previewsave",
            title=" ",
            plot_width=800,
            plot_height=700)

    else:
        matrix_plot = figure(
            outline_line_color="white",
            min_border_top=0,
            min_border_bottom=2,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=list(reversed(rsnum_lst)),
            h_symmetry=False,
            v_symmetry=False,
            border_fill_color='white',
            x_axis_type=None,
            y_axis_type=None,
            logo=None,
            tools="hover,undo,redo,reset,pan,box_zoom,previewsave",
            title=" ",
            plot_width=800,
            plot_height=700)

    matrix_plot.rect(x='xname_pos',
                     y='yname',
                     width=0.95 * spacing,
                     height=0.95,
                     source=source,
                     color="box_color",
                     alpha="box_trans",
                     line_color=None)

    matrix_plot.grid.grid_line_color = None
    matrix_plot.axis.axis_line_color = None
    matrix_plot.axis.major_tick_line_color = None
    if len(snps) < threshold:
        matrix_plot.axis.major_label_text_font_size = "8pt"
        matrix_plot.xaxis.major_label_orientation = "vertical"

    matrix_plot.axis.major_label_text_font_style = "normal"
    matrix_plot.xaxis.major_label_standoff = 0

    sup_2 = "\u00B2"

    hover = matrix_plot.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("Variant 1", " " + "@yname (@yA)"),
        ("Variant 2", " " + "@xname (@xA)"),
        ("D\'", " " + "@Dp"),
        ("R" + sup_2, " " + "@R2"),
        ("Correlated Alleles", " " + "@corA"),
    ])

    # Connecting and Rug Plots
    # Connector Plot
    if len(snps) < threshold:
        connector = figure(outline_line_color="white",
                           y_axis_type=None,
                           x_axis_type=None,
                           x_range=xr,
                           y_range=yr2,
                           border_fill_color='white',
                           title="",
                           min_border_left=100,
                           min_border_right=5,
                           min_border_top=0,
                           min_border_bottom=0,
                           h_symmetry=False,
                           v_symmetry=False,
                           plot_width=800,
                           plot_height=90,
                           tools="xpan,tap")
        connector.segment(x, y0, x, y1, color="black")
        connector.segment(x, y1, x2, y2, color="black")
        connector.segment(x2, y2, x2, y3, color="black")
        connector.text(x2,
                       y4,
                       text=snp_id_plot,
                       alpha=1,
                       angle=pi / 2,
                       text_font_size="8pt",
                       text_baseline="middle",
                       text_align="left")
    else:
        connector = figure(outline_line_color="white",
                           y_axis_type=None,
                           x_axis_type=None,
                           x_range=xr,
                           y_range=yr3,
                           border_fill_color='white',
                           title="",
                           min_border_left=100,
                           min_border_right=5,
                           min_border_top=0,
                           min_border_bottom=0,
                           h_symmetry=False,
                           v_symmetry=False,
                           plot_width=800,
                           plot_height=30,
                           tools="xpan,tap")
        connector.segment(x, y0, x, y1, color="black")
        connector.segment(x, y1, x2, y2, color="black")
        connector.segment(x2, y2, x2, y3, color="black")

    connector.yaxis.major_label_text_color = None
    connector.yaxis.minor_tick_line_alpha = 0  # Option does not work
    connector.yaxis.axis_label = " "
    connector.grid.grid_line_color = None
    connector.axis.axis_line_color = None
    connector.axis.major_tick_line_color = None
    connector.axis.minor_tick_line_color = None

    connector.toolbar_location = None

    data_rug = {
        'x': x,
        'y': y,
        'w': w,
        'h': h,
        'coord_snps_plot': coord_snps_plot,
        'snp_id_plot': snp_id_plot,
        'alleles_snp_plot': alleles_snp_plot
    }

    source_rug = ColumnDataSource(data_rug)

    # Rug Plot
    rug = figure(x_range=xr,
                 y_range=yr,
                 y_axis_type=None,
                 title="",
                 min_border_top=1,
                 min_border_bottom=0,
                 min_border_left=100,
                 min_border_right=5,
                 h_symmetry=False,
                 v_symmetry=False,
                 plot_width=800,
                 plot_height=50,
                 tools="hover,xpan,tap")
    rug.rect(x='x',
             y='y',
             width='w',
             height='h',
             fill_color='red',
             dilate=True,
             line_color=None,
             fill_alpha=0.6,
             source=source_rug)

    hover = rug.select(dict(type=HoverTool))
    hover.tooltips = OrderedDict([
        ("SNP", "@snp_id_plot (@alleles_snp_plot)"),
        ("Coord", "@coord_snps_plot"),
    ])

    rug.toolbar_location = None

    if collapseTranscript == "false":
        # Gene Plot (All Transcripts)
        genes_file = tmp_dir + "genes_" + request + ".json"
        genes_raw = open(genes_file).readlines()

        genes_plot_start = []
        genes_plot_end = []
        genes_plot_y = []
        genes_plot_name = []
        exons_plot_x = []
        exons_plot_y = []
        exons_plot_w = []
        exons_plot_h = []
        exons_plot_name = []
        exons_plot_id = []
        exons_plot_exon = []
        message = ["Too many genes to plot."]
        lines = [0]
        gap = 80000
        tall = 0.75
        if genes_raw != None and len(genes_raw) > 0:
            for gene_raw_obj in genes_raw:
                gene_obj = json.loads(gene_raw_obj)
                bin = gene_obj["bin"]
                name_id = gene_obj["name"]
                chrom = gene_obj["chrom"]
                strand = gene_obj["strand"]
                txStart = gene_obj["txStart"]
                txEnd = gene_obj["txEnd"]
                cdsStart = gene_obj["cdsStart"]
                cdsEnd = gene_obj["cdsEnd"]
                exonCount = gene_obj["exonCount"]
                exonStarts = gene_obj["exonStarts"]
                exonEnds = gene_obj["exonEnds"]
                score = gene_obj["score"]
                name2 = gene_obj["name2"]
                cdsStartStat = gene_obj["cdsStartStat"]
                cdsEndStat = gene_obj["cdsEndStat"]
                exonFrames = gene_obj["exonFrames"]
                name = name2
                id = name_id
                e_start = exonStarts.split(",")
                e_end = exonEnds.split(",")

                # Determine Y Coordinate
                i = 0
                y_coord = None
                while y_coord == None:
                    if i > len(lines) - 1:
                        y_coord = i + 1
                        lines.append(int(txEnd))
                    elif int(txStart) > (gap + lines[i]):
                        y_coord = i + 1
                        lines[i] = int(txEnd)
                    else:
                        i += 1

                genes_plot_start.append(int(txStart) / 1000000.0)
                genes_plot_end.append(int(txEnd) / 1000000.0)
                genes_plot_y.append(y_coord)
                genes_plot_name.append(name + "  ")

                for i in range(len(e_start) - 1):
                    if strand == "+":
                        exon = i + 1
                    else:
                        exon = len(e_start) - 1 - i

                    width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                    x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                    exons_plot_x.append(x_coord)
                    exons_plot_y.append(y_coord)
                    exons_plot_w.append(width)
                    exons_plot_h.append(tall)
                    exons_plot_name.append(name)
                    exons_plot_id.append(id)
                    exons_plot_exon.append(exon)

        n_rows = len(lines)
        genes_plot_yn = [n_rows - w + 0.5 for w in genes_plot_y]
        exons_plot_yn = [n_rows - w + 0.5 for w in exons_plot_y]
        yr2 = Range1d(start=0, end=n_rows)

        data_gene_plot = {
            'exons_plot_x': exons_plot_x,
            'exons_plot_yn': exons_plot_yn,
            'exons_plot_w': exons_plot_w,
            'exons_plot_h': exons_plot_h,
            'exons_plot_name': exons_plot_name,
            'exons_plot_id': exons_plot_id,
            'exons_plot_exon': exons_plot_exon,
            'coord_snps_plot': coord_snps_plot,
            'snp_id_plot': snp_id_plot,
            'alleles_snp_plot': alleles_snp_plot
        }

        source_gene_plot = ColumnDataSource(data_gene_plot)

        max_genes = 40
        # if len(lines) < 3 or len(genes_raw) > max_genes:
        if len(lines) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines) - 2) * 50

        gene_plot = figure(
            min_border_top=2,
            min_border_bottom=0,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=yr2,
            border_fill_color='white',
            title="",
            h_symmetry=False,
            v_symmetry=False,
            logo=None,
            plot_width=800,
            plot_height=plot_h_pix,
            tools=
            "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        # if len(genes_raw) <= max_genes:
        gene_plot.segment(genes_plot_start,
                          genes_plot_yn,
                          genes_plot_end,
                          genes_plot_yn,
                          color="black",
                          alpha=1,
                          line_width=2)
        gene_plot.rect(x='exons_plot_x',
                       y='exons_plot_yn',
                       width='exons_plot_w',
                       height='exons_plot_h',
                       source=source_gene_plot,
                       fill_color='grey',
                       line_color="grey")
        gene_plot.text(genes_plot_start,
                       genes_plot_yn,
                       text=genes_plot_name,
                       alpha=1,
                       text_font_size="7pt",
                       text_font_style="bold",
                       text_baseline="middle",
                       text_align="right",
                       angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_plot_name"),
            ("ID", "@exons_plot_id"),
            ("Exon", "@exons_plot_exon"),
        ])

        # else:
        #     x_coord_text = x[0] + (x[-1] - x[0]) / 2.0
        #     gene_plot.text(x_coord_text, n_rows / 2.0, text=message, alpha=1,
        #                    text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0)

        gene_plot.xaxis.axis_label = "Chromosome " + \
            snp_coords[1][1] + " Coordinate (Mb)(" + genome_build_vars[genome_build]['title'] + ")"
        gene_plot.yaxis.axis_label = "Genes (All Transcripts)"
        gene_plot.ygrid.grid_line_color = None
        gene_plot.yaxis.axis_line_color = None
        gene_plot.yaxis.minor_tick_line_color = None
        gene_plot.yaxis.major_tick_line_color = None
        gene_plot.yaxis.major_label_text_color = None

        gene_plot.toolbar_location = "below"

    # Gene Plot (Collapsed)
    else:
        genes_c_file = tmp_dir + "genes_c_" + request + ".json"
        genes_c_raw = open(genes_c_file).readlines()

        genes_c_plot_start = []
        genes_c_plot_end = []
        genes_c_plot_y = []
        genes_c_plot_name = []
        exons_c_plot_x = []
        exons_c_plot_y = []
        exons_c_plot_w = []
        exons_c_plot_h = []
        exons_c_plot_name = []
        exons_c_plot_id = []
        message_c = ["Too many genes to plot."]
        lines_c = [0]
        gap = 80000
        tall = 0.75
        if genes_c_raw != None and len(genes_c_raw) > 0:
            for gene_c_raw_obj in genes_c_raw:
                gene_c_obj = json.loads(gene_c_raw_obj)
                chrom = gene_c_obj["chrom"]
                txStart = gene_c_obj["txStart"]
                txEnd = gene_c_obj["txEnd"]
                exonStarts = gene_c_obj["exonStarts"]
                exonEnds = gene_c_obj["exonEnds"]
                name2 = gene_c_obj["name2"]
                transcripts = gene_c_obj["transcripts"]
                name = name2
                e_start = exonStarts.split(",")
                e_end = exonEnds.split(",")
                e_transcripts = transcripts.split(",")

                # Determine Y Coordinate
                i = 0
                y_coord = None
                while y_coord == None:
                    if i > len(lines_c) - 1:
                        y_coord = i + 1
                        lines_c.append(int(txEnd))
                    elif int(txStart) > (gap + lines_c[i]):
                        y_coord = i + 1
                        lines_c[i] = int(txEnd)
                    else:
                        i += 1

                genes_c_plot_start.append(int(txStart) / 1000000.0)
                genes_c_plot_end.append(int(txEnd) / 1000000.0)
                genes_c_plot_y.append(y_coord)
                genes_c_plot_name.append(name + "  ")

                # for i in range(len(e_start)):
                for i in range(len(e_start) - 1):
                    width = (int(e_end[i]) - int(e_start[i])) / 1000000.0
                    x_coord = int(e_start[i]) / 1000000.0 + (width / 2)

                    exons_c_plot_x.append(x_coord)
                    exons_c_plot_y.append(y_coord)
                    exons_c_plot_w.append(width)
                    exons_c_plot_h.append(tall)
                    exons_c_plot_name.append(name)
                    exons_c_plot_id.append(e_transcripts[i].replace("-", ","))

        n_rows_c = len(lines_c)
        genes_c_plot_yn = [n_rows_c - x + 0.5 for x in genes_c_plot_y]
        exons_c_plot_yn = [n_rows_c - x + 0.5 for x in exons_c_plot_y]
        yr2_c = Range1d(start=0, end=n_rows_c)

        data_gene_c_plot = {
            'exons_c_plot_x': exons_c_plot_x,
            'exons_c_plot_yn': exons_c_plot_yn,
            'exons_c_plot_w': exons_c_plot_w,
            'exons_c_plot_h': exons_c_plot_h,
            'exons_c_plot_name': exons_c_plot_name,
            'exons_c_plot_id': exons_c_plot_id
        }
        source_gene_c_plot = ColumnDataSource(data_gene_c_plot)
        max_genes_c = 40
        # if len(lines_c) < 3 or len(genes_c_raw) > max_genes_c:
        if len(lines_c) < 3:
            plot_h_pix = 250
        else:
            plot_h_pix = 250 + (len(lines_c) - 2) * 50

        gene_plot = figure(
            min_border_top=2,
            min_border_bottom=0,
            min_border_left=100,
            min_border_right=5,
            x_range=xr,
            y_range=yr2_c,
            border_fill_color='white',
            title="",
            h_symmetry=False,
            v_symmetry=False,
            logo=None,
            plot_width=900,
            plot_height=plot_h_pix,
            tools=
            "hover,xpan,box_zoom,wheel_zoom,tap,undo,redo,reset,previewsave")

        # if len(genes_c_raw) <= max_genes_c:
        gene_plot.segment(genes_c_plot_start,
                          genes_c_plot_yn,
                          genes_c_plot_end,
                          genes_c_plot_yn,
                          color="black",
                          alpha=1,
                          line_width=2)
        gene_plot.rect(x='exons_c_plot_x',
                       y='exons_c_plot_yn',
                       width='exons_c_plot_w',
                       height='exons_c_plot_h',
                       source=source_gene_c_plot,
                       fill_color="grey",
                       line_color="grey")
        gene_plot.text(genes_c_plot_start,
                       genes_c_plot_yn,
                       text=genes_c_plot_name,
                       alpha=1,
                       text_font_size="7pt",
                       text_font_style="bold",
                       text_baseline="middle",
                       text_align="right",
                       angle=0)
        hover = gene_plot.select(dict(type=HoverTool))
        hover.tooltips = OrderedDict([
            ("Gene", "@exons_c_plot_name"),
            ("Transcript IDs", "@exons_c_plot_id"),
        ])

        # else:
        # 	x_coord_text = coord1/1000000.0 + (coord2/1000000.0 - coord1/1000000.0) / 2.0
        # 	gene_c_plot.text(x_coord_text, n_rows_c / 2.0, text=message_c, alpha=1,
        # 				   text_font_size="12pt", text_font_style="bold", text_baseline="middle", text_align="center", angle=0)

        gene_plot.xaxis.axis_label = "Chromosome " + snp_coords[1][
            1] + " Coordinate (Mb)(" + genome_build_vars[genome_build][
                'title'] + ")"
        gene_plot.yaxis.axis_label = "Genes (Transcripts Collapsed)"
        gene_plot.ygrid.grid_line_color = None
        gene_plot.yaxis.axis_line_color = None
        gene_plot.yaxis.minor_tick_line_color = None
        gene_plot.yaxis.major_tick_line_color = None
        gene_plot.yaxis.major_label_text_color = None

        gene_plot.toolbar_location = "below"

    # Change output backend to SVG temporarily for headless export
    # Will be changed back to canvas in LDlink.js
    matrix_plot.output_backend = "svg"
    connector.output_backend = "svg"
    rug.output_backend = "svg"
    gene_plot.output_backend = "svg"
    export_svgs(matrix_plot,
                filename=tmp_dir + "matrix_plot_1_" + request + ".svg")
    export_svgs(connector,
                filename=tmp_dir + "connector_1_" + request + ".svg")
    export_svgs(rug, filename=tmp_dir + "rug_1_" + request + ".svg")
    export_svgs(gene_plot,
                filename=tmp_dir + "gene_plot_1_" + request + ".svg")

    # 1 pixel = 0.0264583333 cm
    svg_height = str(25.00 + (0.0264583333 * plot_h_pix)) + "cm"
    svg_height_scaled = str(110.00 + (0.1322916665 * plot_h_pix)) + "cm"

    # Concatenate svgs
    sg.Figure(
        "21.59cm", svg_height,
        sg.SVG(tmp_dir + "matrix_plot_1_" + request + ".svg"),
        sg.SVG(tmp_dir + "connector_1_" + request + ".svg").scale(.97).move(
            0, 700),
        sg.SVG(tmp_dir + "rug_1_" + request + ".svg").scale(.97).move(0, 790),
        sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(.97).move(
            0, 840)).save(tmp_dir + "matrix_plot_" + request + ".svg")

    sg.Figure(
        "107.95cm", svg_height_scaled,
        sg.SVG(tmp_dir + "matrix_plot_1_" + request + ".svg").scale(5),
        sg.SVG(tmp_dir + "connector_1_" + request + ".svg").scale(4.85).move(
            0, 3500),
        sg.SVG(tmp_dir + "rug_1_" + request + ".svg").scale(4.85).move(
            0, 3930),
        sg.SVG(tmp_dir + "gene_plot_1_" + request + ".svg").scale(4.85).move(
            0, 4160)).save(tmp_dir + "matrix_plot_scaled_" + request + ".svg")

    # Export to PDF
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir + "matrix_plot_" +
                    request + ".svg " + tmp_dir + "matrix_plot_" + request +
                    ".pdf",
                    shell=True)
    # Export to PNG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                    "matrix_plot_scaled_" + request + ".svg " + tmp_dir +
                    "matrix_plot_" + request + ".png",
                    shell=True)
    # Export to JPEG
    subprocess.call("phantomjs ./rasterize.js " + tmp_dir +
                    "matrix_plot_scaled_" + request + ".svg " + tmp_dir +
                    "matrix_plot_" + request + ".jpeg",
                    shell=True)
    # Remove individual SVG files after they are combined
    subprocess.call("rm " + tmp_dir + "matrix_plot_1_" + request + ".svg",
                    shell=True)
    subprocess.call("rm " + tmp_dir + "gene_plot_1_" + request + ".svg",
                    shell=True)
    subprocess.call("rm " + tmp_dir + "rug_1_" + request + ".svg", shell=True)
    subprocess.call("rm " + tmp_dir + "connector_1_" + request + ".svg",
                    shell=True)
    # Remove scaled SVG file after it is converted to png and jpeg
    subprocess.call("rm " + tmp_dir + "matrix_plot_scaled_" + request + ".svg",
                    shell=True)
    # Remove temporary file(s)
    subprocess.call("rm " + tmp_dir + "genes_*" + request + "*.json",
                    shell=True)

    reset_output()

    return None
コード例 #2
0
ファイル: LDhap.py プロジェクト: machiela-lab/LDlink
def calculate_hap(snplst, pop, request, web, genome_build):
    """Compute haplotype counts and frequencies for a list of query variants.

    Reads the variant list from ``snplst``, resolves every entry to a dbSNP
    record through MongoDB, fetches the matching genotype rows with
    ``retrieveTabix1000GData`` and tallies the haplotypes carried by the
    samples of the requested populations. The result is returned as JSON and
    also written to ``snps_<request>.txt`` and ``haplotypes_<request>.txt``
    inside the configured tmp directory.

    Args:
        snplst: Path to a text file with one variant per line, either an rs
            number or a ``chr<N>:<pos>`` coordinate. At most 30 lines.
        pop: Population codes joined with "+", e.g. ``"CEU+YRI"``.
        request: Identifier embedded in the names of the two output files.
        web: Passed through unchanged to ``connectMongoDBReadOnly``.
        genome_build: Key into ``genome_build_vars``; must be one of
            ``genome_build_vars['vars']``.

    Returns:
        A JSON string. On success it holds ``"haplotypes"`` and ``"snps"``
        (plus ``"warning"`` when warnings were collected); on a validation
        failure it holds ``"error"``.
    """
    # Set data directories using config.yml
    # NOTE(review): yaml.load without an explicit Loader is deprecated since
    # PyYAML 5.1 and rejected by PyYAML 6. Left unchanged so this block keeps
    # matching the rest of the file; yaml.safe_load is the usual replacement
    # for a plain configuration file.
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file)
    dbsnp_version = config['data']['dbsnp_version']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    population_samples_dir = config['data']['population_samples_dir']
    genotypes_dir = config['data']['genotypes_dir']
    aws_info = config['aws']

    # Create JSON output
    output = {}

    def add_warning(message):
        # Warnings accumulate into one string; create it on first use.
        output["warning"] = output.get("warning", "") + message

    def fail(message):
        # Every error message carries the warnings gathered so far.
        output["error"] = message + str(output.get("warning", ""))
        return json.dumps(output, sort_keys=True, indent=2)

    # Validate genome build param
    if genome_build not in genome_build_vars['vars']:
        return fail("Invalid genome build. Please specify either " +
                    ", ".join(genome_build_vars['vars']) + ". ")
    build_vars = genome_build_vars[genome_build]
    position_key = build_vars['position']

    # Open Inputted SNPs list file (closed again as soon as it is read)
    with open(snplst) as snp_file:
        snps_raw = snp_file.readlines()
    if len(snps_raw) > 30:
        return fail(
            "Maximum variant list is 30 RS numbers or coordinates. Your list contains "
            + str(len(snps_raw)) + " entries. ")

    # Remove duplicate RS numbers and cast to lower case
    snps = []
    for snp_raw in snps_raw:
        snp = snp_raw.lower().strip().split()
        if snp not in snps:
            snps.append(snp)

    # Select desired ancestral populations
    valid_pops = (
        "ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB",
        "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH",
        "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL",
        "PJL", "PUR", "STU", "TSI", "YRI",
    )
    pop_dirs = []
    for pop_i in pop.split("+"):
        if pop_i not in valid_pops:
            return fail(
                pop_i +
                " is not an ancestral population. Choose one of the following ancestral populations: AFR, AMR, EAS, EUR, or SAS; or one of the following sub-populations: ACB, ASW, BEB, CDX, CEU, CHB, CHS, CLM, ESN, FIN, GBR, GIH, GWD, IBS, ITU, JPT, KHV, LWK, MSL, MXL, PEL, PJL, PUR, STU, TSI, or YRI. "
            )
        pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt")

    # Only whitelisted population codes reach this shell command.
    get_pops = "cat " + " ".join(pop_dirs)
    pop_list = [
        x.decode('utf-8') for x in subprocess.Popen(
            get_pops, shell=True, stdout=subprocess.PIPE).stdout.readlines()
    ]

    # A set, because it is only used for membership tests on VCF columns.
    pop_ids = set(i.strip() for i in pop_list)

    db = connectMongoDBReadOnly(web)

    def get_coords(db, rsid):
        # Look up one dbSNP record by rs number; None when nothing matches.
        rsid = rsid.strip("rs")
        query_results = db.dbsnp.find_one({"id": rsid})
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Query genomic coordinates
    def get_rsnum(db, coord):
        # Caller guarantees coord contains ":" so both parts exist.
        temp_coord = coord.strip("chr").split(":")
        chro = temp_coord[0]
        pos = temp_coord[1]
        query_results = db.dbsnp.find({
            "chromosome":
            chro.upper() if chro == 'x' or chro == 'y' else str(chro),
            position_key:
            str(pos)
        })
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coords_rsid(db, snp_lst):
        new_snp_lst = []
        for snp_raw_i in snp_lst:
            # Blank lines, rs numbers and entries that are not chr:pos are
            # passed through untouched; the validation loop further down
            # skips the blanks and reports the malformed ones as warnings.
            if (not snp_raw_i or snp_raw_i[0][0:2] == "rs"
                    or ":" not in snp_raw_i[0]):
                new_snp_lst.append(snp_raw_i)
                continue

            snp_info_lst = get_rsnum(db, snp_raw_i[0])
            if not snp_info_lst:
                new_snp_lst.append(snp_raw_i)
                continue

            var_id = "rs" + snp_info_lst[0]['id']
            if len(snp_info_lst) > 1:
                # Prefer a record that is its own reference id.
                ref_variants = [
                    snp_info['id'] for snp_info in snp_info_lst
                    if snp_info['id'] == snp_info['ref_id']
                ]
                if ref_variants:
                    var_id = "rs" + ref_variants[0]
                if len(ref_variants) != 1:
                    # Ambiguous mapping: name the candidates. With no
                    # reference record at all, list every rsID found so the
                    # warning is never printed with an empty "()".
                    candidates = ref_variants or [
                        snp_info['id'] for snp_info in snp_info_lst
                    ]
                    add_warning(
                        "Multiple rsIDs (" +
                        ", ".join(["rs" + ref_id for ref_id in candidates]) +
                        ") map to genomic coordinates " + snp_raw_i[0] + ". ")
            new_snp_lst.append([var_id])
        return new_snp_lst

    snps = replace_coords_rsid(db, snps)

    # Find RS numbers and genomic coords in snp database
    rs_nums = []
    snp_pos = []
    snp_coords = []
    warn = []
    for snp_i in snps:
        if len(snp_i) == 0:
            continue
        query = snp_i[0]

        # Must be longer than two characters, start with "rs" or "chr" and
        # end in a digit.
        well_formed = (len(query) > 2
                       and (query[0:2] == "rs" or query[0:3] == "chr")
                       and query[-1].isdigit())
        if not well_formed:
            warn.append(query)
            continue

        snp_coord = get_coords(db, query)
        if snp_coord is None or snp_coord[position_key] == "NA":
            warn.append(query)
            continue

        # check if variant is on chrY for genome build = GRCh38
        if snp_coord['chromosome'] == "Y" and genome_build in (
                "grch38", "grch38_high_coverage"):
            add_warning(
                "Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 ("
                + "rs" + snp_coord['id'] + " = chr" +
                snp_coord['chromosome'] + ":" + snp_coord[position_key] +
                "). ")
            warn.append(query)
            continue

        rs_nums.append(query)
        snp_pos.append(snp_coord[position_key])
        snp_coords.append(
            [query, snp_coord['chromosome'], snp_coord[position_key]])

    if warn != []:
        add_warning(
            "The following RS number(s) or coordinate(s) inputs have warnings: "
            + ", ".join(warn) + ". ")

    if len(rs_nums) == 0:
        return fail(
            "Input variant list does not contain any valid RS numbers or coordinates. "
        )

    # Check SNPs are all on the same chromosome; report the first variant
    # that leaves the chromosome together with its predecessor.
    for previous, current in zip(snp_coords, snp_coords[1:]):
        if snp_coords[0][1] != current[1]:
            return fail(
                "Not all input variants are on the same chromosome: " +
                previous[0] + "=chr" + str(previous[1]) + ":" +
                str(previous[2]) + ", " + current[0] + "=chr" +
                str(current[1]) + ":" + str(current[2]) + ". ")

    # Check max distance between SNPs
    distance_bp = [int(coord[2]) for coord in snp_coords]
    distance_max = max(distance_bp) - min(distance_bp)
    if distance_max > 1000000:
        add_warning(
            "Switch rate errors become more common as distance between query variants increases (Query range = "
            + str(distance_max) + " bp). ")

    # Sort coordinates and make tabix formatted coordinates
    snp_pos_int = sorted(int(i) for i in snp_pos)
    snp_coord_str = [
        build_vars['1000G_chr_prefix'] + snp_coords[0][1] + ":" + str(i) +
        "-" + str(i) for i in snp_pos_int
    ]
    tabix_coords = " " + " ".join(snp_coord_str)

    # Extract 1000 Genomes phased genotypes
    vcf_filePath = "%s/%s%s/%s" % (
        config['aws']['data_subfolder'], genotypes_dir,
        build_vars['1000G_dir'],
        build_vars['1000G_file'] % (snp_coords[0][1]))
    vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)
    vcf, h = retrieveTabix1000GData(
        vcf_query_snp_file, tabix_coords,
        data_dir + genotypes_dir + build_vars['1000G_dir'])

    # Define function to correct indel alleles
    def set_alleles(a1, a2):
        # Multi-base alleles lose their first base; a single-base allele
        # paired with a multi-base one is shown as "-".
        if len(a1) == 1 and len(a2) == 1:
            a1_n = a1
            a2_n = a2
        elif len(a1) == 1 and len(a2) > 1:
            a1_n = "-"
            a2_n = a2[1:]
        elif len(a1) > 1 and len(a2) == 1:
            a1_n = a1[1:]
            a2_n = "-"
        elif len(a1) > 1 and len(a2) > 1:
            a1_n = a1[1:]
            a2_n = a2[1:]
        return (a1_n, a2_n)

    head = vcf[h].strip().split()

    # Extract haplotypes: sample columns start at index 9 of the header row
    index = [i for i in range(9, len(head)) if head[i] in pop_ids]

    # One growing allele list per selected sample and per chromosome copy.
    hap1 = [[] for _ in index]
    hap2 = [[] for _ in index]

    # parse vcf
    snp_dict, missing_snp = parse_vcf(vcf[h + 1:], snp_coords, True)
    # throw error if no data is returned from 1000G
    if len(missing_snp.split()) == len(snp_pos):
        return fail(
            "Input variant list does not contain any valid RS numbers or coordinates. "
        )

    if len(missing_snp) > 0:
        # This message is placed in front of the warnings collected so far.
        output["warning"] = "Query variant " + str(
            missing_snp) + " is missing from 1000G (" + build_vars[
                'title'] + ") data. " + str(output.get("warning", ""))

    rsnum_lst = []
    allele_lst = []
    pos_lst = []

    for s_key in snp_dict:
        # parse snp_key such as chr7:pos_rs4
        snp_keys = s_key.split("_")
        snp_key = snp_keys[0].split(':')[1]
        rs_input = snp_keys[1]
        for geno in snp_dict[s_key]:
            geno = geno.strip().split()
            geno[0] = geno[0].lstrip('chr')
            # if 1000G position does not match dbSNP position for variant,
            # use dbSNP position
            if geno[1] != snp_key:
                add_warning(
                    "Genomic position (" + geno[1] +
                    ") in 1000G data does not match dbSNP" + dbsnp_version +
                    " (" + build_vars['title'] +
                    ") search coordinates for query variant " + rs_input +
                    ". ")
                geno[1] = snp_key

            # Only biallelic rows are used.
            if "," in geno[3] or "," in geno[4]:
                continue

            a1, a2 = set_alleles(geno[3], geno[4])
            # genotype -> (copy-1 allele, copy-2 allele, a1 count, a2 count)
            calls = {
                "0|0": (a1, a1, 2, 0),
                "0|1": (a1, a2, 1, 1),
                "1|0": (a2, a1, 1, 1),
                "1|1": (a2, a2, 0, 2),
                "0": (a1, ".", 1, 0),
                "1": (a2, ".", 0, 1),
            }
            missing_call = (".", ".", 0, 0)
            count0 = 0
            count1 = 0
            for slot, column in enumerate(index):
                first, second, n0, n1 = calls.get(geno[column], missing_call)
                hap1[slot].append(first)
                hap2[slot].append(second)
                count0 += n0
                count1 += n1

            rsnum_lst.append(rs_input)
            pos_lst.append("chr" + geno[0] + ":" + geno[1])

            # With no selected sample, or only missing genotypes, nothing
            # was counted; report 0.0 instead of dividing by zero.
            called = count0 + count1
            if called:
                f0 = round(float(count0) / called, 4)
                f1 = round(float(count1) / called, 4)
            else:
                f0 = f1 = 0.0

            # More frequent allele is listed first.
            if f0 >= f1:
                alleles = a1+"="+str(round(f0, 3))+", " + \
                    a2+"="+str(round(f1, 3))
            else:
                alleles = a2+"="+str(round(f1, 3))+", " + \
                    a1+"="+str(round(f0, 3))
            allele_lst.append(alleles)

    # Count every chromosome copy of every selected sample
    haps = {}
    for copy1, copy2 in zip(hap1, hap2):
        for alleles_on_copy in (copy1, copy2):
            hap_key = "_".join(alleles_on_copy)
            haps[hap_key] = haps.get(hap_key, 0) + 1

    # Remove Missing Haplotypes
    haps = {key: n for key, n in haps.items() if "." not in key}

    total_haps = sum(haps.values())

    # Sort results: highest count first, ties broken by haplotype string
    ranked = sorted(haps.items(), key=lambda item: (-item[1], item[0]))

    # Generate JSON output; keys are zero-padded so they sort numerically
    digits = len(str(len(ranked)))
    haps_out = {}
    for rank, (hap, count) in enumerate(ranked, start=1):
        haps_out["haplotype_" + str(rank).zfill(digits)] = {
            "Haplotype": hap,
            "Count": count,
            "Frequency": round(float(count) / total_haps, 4),
        }
    output["haplotypes"] = haps_out

    digits = len(str(len(rsnum_lst)))
    snps_out = {}
    for rank, (rs_id, alleles, coord) in enumerate(
            zip(rsnum_lst, allele_lst, pos_lst), start=1):
        snps_out["snp_" + str(rank).zfill(digits)] = {
            "RS": rs_id,
            "Alleles": alleles,
            "Coord": coord,
        }
    output["snps"] = snps_out

    # Create SNP File
    with open(tmp_dir + "snps_" + request + ".txt", "w") as snp_out:
        print("RS_Number\tPosition (" + build_vars['title_hg'] +
              ")\tAllele Frequency",
              file=snp_out)
        for k in sorted(output["snps"].keys()):
            rs_k = output["snps"][k]["RS"]
            coord_k = output["snps"][k]["Coord"]
            alleles_k0 = output["snps"][k]["Alleles"].strip(" ").split(",")
            # Right-pad both frequencies with zeros to a fixed width.
            alleles_k1 = alleles_k0[0].ljust(7, "0") + \
                "," + alleles_k0[1].ljust(8, "0")
            print("\t".join([rs_k, coord_k, alleles_k1]), file=snp_out)

    # Create Haplotype File
    with open(tmp_dir + "haplotypes_" + request + ".txt", "w") as hap_out:
        print("Haplotype\tCount\tFrequency", file=hap_out)
        for k in sorted(output["haplotypes"].keys()):
            hap_k = output["haplotypes"][k]["Haplotype"]
            count_k = str(output["haplotypes"][k]["Count"])
            freq_k = str(output["haplotypes"][k]["Frequency"])
            print("\t".join([hap_k, count_k, freq_k]), file=hap_out)

    # Return JSON output
    return (json.dumps(output, sort_keys=True, indent=2))
コード例 #3
0
def calculate_clip(snplst, pop, request, web, genome_build, r2_threshold=0.1, maf_threshold=0.01):
    """Thin a list of query variants by minor allele frequency and pairwise R2.

    The queries in ``snplst`` are looked up in the ``dbsnp`` Mongo collection,
    their genotypes are fetched with ``retrieveTabix1000GData``, variants whose
    MAF in the selected samples is below ``maf_threshold`` are dropped, and the
    survivors are thinned greedily in VCF order: each kept variant removes
    every later variant whose R2 with it is at least ``r2_threshold``.

    Any ``error`` / ``warning`` text collected along the way is written as JSON
    to ``<tmp_dir>clip<request>.json`` before the function returns.

    Args:
        snplst: Path of a text file with one query per line. Lines are split on
            whitespace and the first token is used as the query (an ``rs``
            number, or a coordinate that ``get_rsnum`` can resolve).
        pop: One or more population codes joined with ``+``.
        request: Identifier used to build the output JSON file name.
        web: When truthy, Mongo is always contacted with credentials; otherwise
            credentials are only used when ``env`` is ``'local'``.
        genome_build: Key into ``genome_build_vars``.
        r2_threshold: Variants with R2 >= this value to an already kept variant
            are removed.
        maf_threshold: Variants with MAF below this value are removed.

    Returns:
        ``(snps, rsnum_lst, details)`` on success, where ``snps`` is the
        de-duplicated query list (coordinates replaced by rsIDs where
        possible), ``rsnum_lst`` the kept variant ids and ``details`` an
        ``OrderedDict`` mapping each variant id to
        ``[position, allele frequencies, comment]``.
        ``("", "", "")`` when an error was written to the JSON file instead.
    """

    max_list = 5000

    # Set data directories using config.yml
    # NOTE(review): yaml.load() is called without an explicit Loader. PyYAML
    # deprecates that form and recommends yaml.safe_load() for plain config
    # files -- confirm against the installed PyYAML version before switching.
    with open('config.yml', 'r') as yml_file:
        config = yaml.load(yml_file)
    env = config['env']
    api_mongo_addr = config['api']['api_mongo_addr']
    dbsnp_version = config['data']['dbsnp_version']
    population_samples_dir = config['data']['population_samples_dir']
    data_dir = config['data']['data_dir']
    tmp_dir = config['data']['tmp_dir']
    genotypes_dir = config['data']['genotypes_dir']
    aws_info = config['aws']
    mongo_username = config['database']['mongo_user_readonly']
    mongo_password = config['database']['mongo_password']
    mongo_port = config['database']['mongo_port']

    # The return value is not used in this function; the call is kept in case
    # the helper has side effects -- TODO confirm.
    retrieveAWSCredentials()

    # Ensure tmp directory exists
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    # Create JSON output
    out_json = open(tmp_dir+"clip"+request+".json", "w")
    output = {}

    def finish(result):
        # Write the collected error/warning JSON, close the file, hand back result.
        json_output = json.dumps(output, sort_keys=True, indent=2)
        print(json_output, file=out_json)
        out_json.close()
        return result

    def add_warning(message):
        # Append message to output["warning"], separating entries with ". ".
        if "warning" in output:
            output["warning"] = output["warning"] + ". " + message
        else:
            output["warning"] = message

    # Validate genome build param
    print("genome_build " + genome_build)
    if genome_build not in genome_build_vars['vars']:
        output["error"] = "Invalid genome build. Please specify either " + ", ".join(genome_build_vars['vars']) + "."
        return finish(("", "", ""))
    build_vars = genome_build_vars[genome_build]

    # Open SNP list file
    with open(snplst) as snp_file:
        snps_raw = snp_file.readlines()
    if len(snps_raw) > max_list:
        output["error"] = "Maximum SNP list is " + \
            str(max_list)+" RS numbers. Your list contains " + \
            str(len(snps_raw))+" entries."
        return finish(("", "", ""))

    # Remove duplicate RS numbers (first occurrence wins, order preserved)
    snps = []
    seen_entries = set()
    for snp_raw in snps_raw:
        snp = snp_raw.strip().split()
        entry_key = tuple(snp)
        if entry_key not in seen_entries:
            seen_entries.add(entry_key)
            snps.append(snp)

    # Select desired ancestral populations
    valid_pops = ["ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX", "CEU", "CHB", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS", "ITU", "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU", "TSI", "YRI"]
    pops = pop.split("+")
    pop_dirs = []
    for pop_i in pops:
        if pop_i in valid_pops:
            pop_dirs.append(data_dir + population_samples_dir + pop_i + ".txt")
        else:
            output["error"] = pop_i+" is not an ancestral population. Choose one of the following ancestral populations: AFR, AMR, EAS, EUR, or SAS; or one of the following sub-populations: ACB, ASW, BEB, CDX, CEU, CHB, CHS, CLM, ESN, FIN, GBR, GIH, GWD, IBS, ITU, JPT, KHV, LWK, MSL, MXL, PEL, PJL, PUR, STU, TSI, or YRI."
            return finish(("", "", ""))

    # The command is unchanged; the context manager waits for the child process
    # and closes its pipe instead of leaving both dangling.
    get_pops = "cat " + " ".join(pop_dirs)
    with subprocess.Popen(get_pops, shell=True, stdout=subprocess.PIPE) as proc:
        pop_list = [x.decode('utf-8') for x in proc.stdout.readlines()]

    # Only membership tests are done on the sample ids, so keep them as a set.
    pop_ids = set(i.strip() for i in pop_list)

    # Connect to Mongo snp database
    if env == 'local':
        mongo_host = api_mongo_addr
    else:
        mongo_host = 'localhost'
    if web or env == 'local':
        client = MongoClient('mongodb://' + mongo_username + ':' + mongo_password + '@' + mongo_host+'/admin', mongo_port)
    else:
        client = MongoClient('localhost', mongo_port)
    db = client["LDLink"]

    def get_coords(db, rsid):
        # Look up one dbsnp document by its id with the "rs" prefix removed.
        rsid = rsid.strip("rs")
        query_results = db.dbsnp.find_one({"id": rsid})
        # Round-trip through json_util so the result holds plain JSON types.
        query_results_sanitized = json.loads(json_util.dumps(query_results))
        return query_results_sanitized

    # Replace input genomic coordinates with variant ids (rsids)
    def replace_coords_rsid(db, snp_lst):
        new_snp_lst = []
        for snp_raw_i in snp_lst:
            # Blank input lines give an empty token list; pass them through so
            # the validation loop below reports them instead of crashing here.
            if len(snp_raw_i) == 0 or snp_raw_i[0][0:2] == "rs":
                new_snp_lst.append(snp_raw_i)
                continue

            snp_info_lst = get_rsnum(db, snp_raw_i[0], genome_build)
            print("snp_info_lst")
            print(snp_info_lst)
            if snp_info_lst is None or len(snp_info_lst) == 0:
                new_snp_lst.append(snp_raw_i)
                continue

            if len(snp_info_lst) == 1:
                new_snp_lst.append(["rs" + snp_info_lst[0]['id']])
                continue

            # Several records share the coordinate: prefer those whose id is
            # also their ref_id.
            ref_variants = [snp_info['id'] for snp_info in snp_info_lst if snp_info['id'] == snp_info['ref_id']]
            if len(ref_variants) > 1:
                var_id = "rs" + ref_variants[0]
                add_warning("Multiple rsIDs (" + ", ".join(["rs" + ref_id for ref_id in ref_variants]) + ") map to genomic coordinates " + snp_raw_i[0])
            elif len(ref_variants) == 0:
                var_id = "rs" + snp_info_lst[0]['id']
                # No reference id to prefer: name every candidate in the
                # warning (ref_variants is empty here).
                add_warning("Multiple rsIDs (" + ", ".join(["rs" + snp_info['id'] for snp_info in snp_info_lst]) + ") map to genomic coordinates " + snp_raw_i[0])
            else:
                var_id = "rs" + ref_variants[0]
            new_snp_lst.append([var_id])
        return new_snp_lst

    snps = replace_coords_rsid(db, snps)

    # Find RS numbers in snp database
    details = collections.OrderedDict()
    rs_nums = []
    snp_pos = []
    snp_coords = []
    warn = []
    tabix_coords = ""
    for snp_i in snps:
        if len(snp_i) == 0:
            output["error"] = "Input list of RS numbers is empty"
            return finish(("", "", ""))

        query = snp_i[0]
        well_formed = len(query) > 2 and (query[0:2] == "rs" or query[0:3] == "chr") and query[-1].isdigit()
        if not well_formed:
            warn.append(query)
            details[query] = ["NA", "NA",
                              "Not a RS number, query removed."]
            continue

        snp_coord = get_coords(db, query)
        if snp_coord is None or snp_coord[build_vars['position']] == "NA":
            warn.append(query)
            details[query] = ["NA", "NA", "Variant not found in dbSNP" + dbsnp_version + " (" + build_vars['title'] + "), variant removed."]
            continue

        # check if variant is on chrY for genome build = GRCh38
        if snp_coord['chromosome'] == "Y" and (genome_build == "grch38" or genome_build == "grch38_high_coverage"):
            add_warning("Input variants on chromosome Y are unavailable for GRCh38, only available for GRCh37 (" + "rs" + snp_coord['id'] + " = chr" + snp_coord['chromosome'] + ":" + snp_coord[build_vars['position']] + ")")
            warn.append(query)
            details[query] = ["NA", "NA", "Chromosome Y variants are unavailable for GRCh38, only available for GRCh37."]
            continue

        rs_nums.append(query)
        snp_pos.append(snp_coord[build_vars['position']])
        snp_coords.append([query, snp_coord['chromosome'], snp_coord[build_vars['position']]])

    if warn != []:
        add_warning("The following RS number(s) or coordinate(s) inputs have warnings: " + ", ".join(warn))

    if len(rs_nums) == 0:
        # An empty input file reaches this point without any warning set.
        output["error"] = ("Input SNP list does not contain any valid RS numbers or coordinates. " + output.get("warning", "")).rstrip()
        return finish(("", "", ""))

    # Check SNPs are all on the same chromosome
    for i in range(len(snp_coords)):
        if snp_coords[0][1] != snp_coords[i][1]:
            output["error"] = "Not all input variants are on the same chromosome: "+snp_coords[i-1][0]+"=chr" + \
                str(snp_coords[i-1][1])+":"+str(snp_coords[i-1][2])+", "+snp_coords[i][0] + \
                "=chr"+str(snp_coords[i][1])+":"+str(snp_coords[i][2])+"."
            return finish(("", "", ""))

    # Make tabix formatted coordinates
    snp_coord_str = [build_vars['1000G_chr_prefix'] + snp_coords[0][1]+":"+i+"-"+i for i in snp_pos]
    tabix_coords = " "+" ".join(snp_coord_str)

    # Extract 1000 Genomes phased genotypes
    vcf_filePath = "%s/%s%s/%s" % (config['aws']['data_subfolder'], genotypes_dir, build_vars['1000G_dir'], build_vars['1000G_file'] % (snp_coords[0][1]))
    vcf_query_snp_file = "s3://%s/%s" % (config['aws']['bucket'], vcf_filePath)

    checkS3File(aws_info, config['aws']['bucket'], vcf_filePath)

    vcf = retrieveTabix1000GData(vcf_query_snp_file, tabix_coords, data_dir + genotypes_dir + build_vars['1000G_dir'])

    # Make MAF function
    def calc_maf(genos):
        # Count alleles over diploid ("a|b") and haploid ("a") calls; any other
        # genotype string is ignored.
        vals = {"0|0": 0, "0|1": 0, "1|0": 0, "1|1": 0, "0": 0, "1": 0}
        for geno_call in genos:
            if geno_call in vals:
                vals[geno_call] += 1

        zeros = vals["0|0"]*2+vals["0|1"]+vals["1|0"]+vals["0"]
        ones = vals["1|1"]*2+vals["0|1"]+vals["1|0"]+vals["1"]
        total = zeros+ones

        # No recognised genotype at all: report zero frequencies rather than
        # dividing by zero.
        if total == 0:
            return 0.0, 0.0, 0.0

        f0 = zeros*1.0/total
        f1 = ones*1.0/total
        maf = min(f0, f1)

        return f0, f1, maf

    # Define function to correct indel alleles
    def set_alleles(a1, a2):
        # Multi-character alleles lose their first character; a one-character
        # allele paired with a longer one is shown as "-".
        if len(a1) == 1 and len(a2) == 1:
            a1_n = a1
            a2_n = a2
        elif len(a1) == 1 and len(a2) > 1:
            a1_n = "-"
            a2_n = a2[1:]
        elif len(a1) > 1 and len(a2) == 1:
            a1_n = a1[1:]
            a2_n = "-"
        elif len(a1) > 1 and len(a2) > 1:
            a1_n = a1[1:]
            a2_n = a2[1:]
        return(a1_n, a2_n)

    # Make R2 function
    def calc_r2(var1, var2):
        # Returns R2 between two genotype lists, or None when the denominator
        # is zero.
        hap_vals = {"0|0-0|0": 0, "0|0-0|1": 0, "0|0-1|0": 0, "0|0-1|1": 0, "0|1-0|0": 0, "0|1-0|1": 0, "0|1-1|0": 0, "0|1-1|1": 0, "1|0-0|0": 0,
                    "1|0-0|1": 0, "1|0-1|0": 0, "1|0-1|1": 0, "1|1-0|0": 0, "1|1-0|1": 0, "1|1-1|0": 0, "1|1-1|1": 0, "0-0": 0, "0-1": 0, "1-0": 0, "1-1": 0}
        for i in range(len(var1)):
            ind_geno = var1[i]+"-"+var2[i]
            if ind_geno in hap_vals:
                hap_vals[ind_geno] += 1

        # A, B, C, D are the counts of the 0-0, 0-1, 1-0 and 1-1 haplotypes.
        A = hap_vals["0|0-0|0"]*2+hap_vals["0|0-0|1"]+hap_vals["0|0-1|0"]+hap_vals["0|1-0|0"] + \
            hap_vals["0|1-0|1"]+hap_vals["1|0-0|0"] + \
            hap_vals["1|0-1|0"]+hap_vals["0-0"]
        B = hap_vals["0|0-0|1"]+hap_vals["0|0-1|0"]+hap_vals["0|0-1|1"]*2+hap_vals["0|1-1|0"] + \
            hap_vals["0|1-1|1"]+hap_vals["1|0-0|1"] + \
            hap_vals["1|0-1|1"]+hap_vals["0-1"]
        C = hap_vals["0|1-0|0"]+hap_vals["0|1-1|0"]+hap_vals["1|0-0|0"]+hap_vals["1|0-0|1"] + \
            hap_vals["1|1-0|0"]*2+hap_vals["1|1-0|1"] + \
            hap_vals["1|1-1|0"]+hap_vals["1-0"]
        D = hap_vals["0|1-0|1"]+hap_vals["0|1-1|1"]+hap_vals["1|0-1|0"]+hap_vals["1|0-1|1"] + \
            hap_vals["1|1-0|1"]+hap_vals["1|1-1|0"] + \
            hap_vals["1|1-1|1"]*2+hap_vals["1-1"]

        delta = float(A*D-B*C)
        Ms = float((A+C)*(B+D)*(A+B)*(C+D))
        if Ms != 0:
            r2 = (delta**2)/Ms
        else:
            r2 = None

        return(r2)

    # Import SNP VCF file
    hap_dict = {}
    h = 0
    while vcf[h][0:2] == "##":
        h += 1

    head = vcf[h].strip().split()

    # Extract population specific haplotypes
    pop_index = [i for i in range(9, len(head)) if head[i] in pop_ids]

    rsnum_lst = []

    for g in range(h+1, len(vcf)):
        geno = vcf[g].strip().split()
        geno[0] = geno[0].lstrip('chr')
        if geno[1] not in snp_pos:
            add_warning("Genomic position ("+geno[1]+") in VCF file does not match db" +
                        dbsnp_version + " (" + build_vars['title'] + ") search coordinates for query variant")
            continue

        if snp_pos.count(geno[1]) == 1:
            rs_query = rs_nums[snp_pos.index(geno[1])]
        else:
            # Several queries share this position: take the first one that has
            # not been kept yet. Reset first so a value left over from an
            # earlier VCF row can never be reused.
            rs_query = None
            for p in range(len(snp_pos)):
                if snp_pos[p] == geno[1] and rs_nums[p] not in rsnum_lst:
                    rs_query = rs_nums[p]
                    break

        if rs_query is None or rs_query in rsnum_lst:
            continue

        rs_1000g = geno[2]

        if rs_query == rs_1000g:
            rsnum = rs_1000g
        else:
            # Look at the rows from two before to two after this one for a
            # record that carries the query id; if there is one, this row is
            # skipped.
            found = False
            for offset in range(-2, 3):
                if g + offset >= len(vcf):
                    break
                geno_next = vcf[g+offset].strip().split()
                if len(geno_next) >= 3 and rs_query == geno_next[2]:
                    found = True
                    break
            if found:
                continue

            if "rs" in rs_1000g:
                add_warning("Genomic position for query variant ("+rs_query +
                            ") does not match RS number at 1000G position (chr" +
                            geno[0]+":"+geno[1]+" = "+rs_1000g+")")
            rsnum = rs_query

        details[rsnum] = ["chr"+geno[0]+":"+geno[1]]

        if "," not in geno[3] and "," not in geno[4]:
            temp_genos = [geno[idx] for idx in pop_index]
            f0, f1, maf = calc_maf(temp_genos)
            a0, a1 = set_alleles(geno[3], geno[4])
            details[rsnum].append(
                a0+"="+str(round(f0, 3))+", "+a1+"="+str(round(f1, 3)))
            if maf_threshold <= maf:
                hap_dict[rsnum] = [temp_genos]
                rsnum_lst.append(rsnum)
            else:
                details[rsnum].append(
                    "Variant MAF is "+str(round(maf, 4))+", variant removed.")
        else:
            details[rsnum].append(geno[3]+"=NA, "+geno[4]+"=NA")
            details[rsnum].append("Variant is not biallelic, variant removed.")

    # Queries that never produced a VCF record still get a details entry.
    for index_i, rs_i in enumerate(rs_nums):
        if rs_i not in rsnum_lst and rs_i not in details:
            details[rs_i] = ["chr"+snp_coords[index_i][1]+":"+snp_coords[index_i][2]+"-" +
                             snp_coords[index_i][2], "NA", "Variant not in 1000G VCF file, variant removed."]

    # Thin the SNPs
    sup_2 = "2"
    i = 0
    while i < len(rsnum_lst):
        details[rsnum_lst[i]].append("Variant kept.")
        remove_list = []
        for j in range(i+1, len(rsnum_lst)):
            r2 = calc_r2(hap_dict[rsnum_lst[i]][0], hap_dict[rsnum_lst[j]][0])
            # calc_r2 returns None when R2 is undefined; such pairs are never
            # treated as being in LD.
            if r2 is not None and r2_threshold <= r2:
                snp = rsnum_lst[j]
                details[snp].append("Variant in LD with "+rsnum_lst[i] +
                                    " (R"+sup_2+"="+str(round(r2, 4))+"), variant removed.")
                remove_list.append(snp)
        for snp in remove_list:
            rsnum_lst.remove(snp)
        i += 1

    # Return output
    return finish((snps, rsnum_lst, details))