Esempio n. 1
0
def write_matrix(outfile, matrix):
    """Write matrix to outfile with the writer that matches its type.

    AnnotationMatrix instances are written with AnnotationMatrix.write;
    every other object is passed to write_express.
    """
    from genomicode import AnnotationMatrix as AM

    if not isinstance(matrix, AM.AnnotationMatrix):
        write_express(outfile, matrix)
        return
    AM.write(outfile, matrix)
Esempio n. 2
0
def make_matrix(samples, callers, annot_header, annot_data, named_data,
                call_data):
    """Assemble a SimpleVariantMatrix from its raw components.

    annot_header  list of headers for annot_data; must start with
                  Chrom, Pos, Ref, Alt.
    annot_data    list of tuples:  chrom, pos, ref, alt[, more]
    named_data    list of (name, headers, all_annots)
    call_data     list of tuples: chrom, pos, ref, alt, sample, caller, call
                  (chrom, ref, alt, sample, caller are strings, pos is an
                  int, call is a Call object)
    """
    from genomicode import AnnotationMatrix

    # The leading four columns identify the variant, and each variant
    # may appear only once.
    assert annot_header[:4] == ["Chrom", "Pos", "Ref", "Alt"]
    coords_seen = set()
    for row in annot_data:
        coord = tuple(row[:4])
        assert coord not in coords_seen, "Duplicate"
        coords_seen.add(coord)

    # Every row needs exactly one value per header.
    for row in annot_data:
        assert len(row) == len(annot_header)
    # Turn the row-major data into one list of values per header.
    columns = [
        [row[j] for row in annot_data] for j in range(len(annot_header))
    ]
    annot_matrix = AnnotationMatrix.create_from_annotations(
        annot_header, columns)

    # One (name, matrix) pair per named data set.
    named_matrices = []
    for name, named_headers, named_annots in named_data:
        named_matrix = AnnotationMatrix.create_from_annotations(
            named_headers, named_annots)
        named_matrices.append((name, named_matrix))

    call_matrix = SparseCallMatrix(call_data)

    return SimpleVariantMatrix(samples, callers, annot_matrix, named_matrices,
                               call_matrix)
Esempio n. 3
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        """Write the distinct (chrom, pos) pairs of a VCF file.

        Reads the file at in_data.identifier and writes one
        "<chrom>\\t<pos>" line to out_filename for each distinct pair, in
        order of first appearance.  An empty input file produces an empty
        output file.  The other parameters are not used here.

        Raises AssertionError if the first two headers are not "#CHROM"
        and "POS".
        """
        import os

        # If the file is empty, then just create an empty positions file.
        if os.stat(in_data.identifier).st_size == 0:
            # The with-block closes the handle right away.
            with open(out_filename, 'w'):
                pass
            return

        # Imported only here: the empty-file path above needs no parser.
        from genomicode import AnnotationMatrix

        M = AnnotationMatrix.read(in_data.identifier, header_char="##")

        # Headers are:
        # #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT [Samples...]
        # Pull out the #CHROM and POS columns.
        assert M.num_headers()
        assert M.headers[0] == "#CHROM"
        assert M.headers[1] == "POS"

        lines = []
        seen = set()
        for chrom, pos in zip(M["#CHROM"], M["POS"]):
            coord = chrom.strip(), pos.strip()
            if coord in seen:
                continue
            seen.add(coord)
            lines.append("\t".join(coord) + "\n")
        # Close the output deterministically instead of relying on
        # garbage collection to flush it.
        with open(out_filename, 'w') as handle:
            handle.writelines(lines)
Esempio n. 4
0
def filter_min_gene_expression_in_every_sample(MATRIX, gxp):
    """Keep the rows whose expression reaches gxp in every sample.

    Looks at every column whose header starts with "Gene Expression".
    A cell may hold one number or several separated by commas; the
    largest one is compared against gxp.  A row with a blank cell in any
    of these columns is dropped.  gxp must be a float.
    """
    from genomicode import AnnotationMatrix

    assert type(gxp) is type(0.0)

    expr_headers = [
        h for h in MATRIX.headers if h.startswith("Gene Expression")
    ]
    assert expr_headers, 'Missing: "Gene Expression" columns'

    def is_expressed(row):
        for h in expr_headers:
            cell = MATRIX[h][row]
            if not cell:
                return False
            # Cells look like "5.3" or "0,0.379".
            if max([float(v) for v in cell.split(",")]) < gxp:
                return False
        return True

    I_keep = [i for i in range(MATRIX.num_annots()) if is_expressed(i)]
    return AnnotationMatrix.rowslice(MATRIX, I_keep)
Esempio n. 5
0
def filter_min_coverage_in_every_sample(MATRIX, coverage):
    """Keep the rows with at least `coverage` reads in every sample.

    Looks at every column whose header starts with "Coverage".  Each
    cell has the form Ref/Alt/VAF, and the coverage of a cell is
    Ref + Alt.  A row with a blank cell in any of these columns is
    dropped.  coverage must be an int.
    """
    from genomicode import AnnotationMatrix

    assert type(coverage) is type(0)

    cov_headers = [h for h in MATRIX.headers if h.startswith("Coverage")]
    assert cov_headers, 'Missing: "Coverage" columns'

    def is_covered(row):
        for h in cov_headers:
            cell = MATRIX[h][row]
            if not cell:
                return False
            fields = cell.split("/")
            assert len(fields) == 3
            if int(fields[0]) + int(fields[1]) < coverage:
                return False
        return True

    I_keep = [i for i in range(MATRIX.num_annots()) if is_covered(i)]
    return AnnotationMatrix.rowslice(MATRIX, I_keep)
Esempio n. 6
0
def filter_min_callers_in_any_sample(MATRIX, num_callers):
    """Keep the rows where some sample has at least num_callers callers.

    Looks at every column whose header starts with "Num Callers"; the
    cells are parsed as integers and blank cells are skipped.  A row is
    kept as soon as one of these columns reaches num_callers.

    num_callers must be an int.  Raises AssertionError if it is not, or
    if MATRIX has no "Num Callers" columns.
    """
    assert type(num_callers) is type(0)

    callers_h = [h for h in MATRIX.headers if h.startswith("Num Callers")]
    # The message names the columns that are actually required here.
    assert callers_h, 'Missing: "Num Callers" columns'

    # Imported after validation so bad input fails before the project
    # module is loaded.
    from genomicode import AnnotationMatrix

    I_keep = []
    for i in range(MATRIX.num_annots()):
        for h in callers_h:
            cell = MATRIX[h][i]
            if cell and int(cell) >= num_callers:
                I_keep.append(i)
                break

    return AnnotationMatrix.rowslice(MATRIX, I_keep)
Esempio n. 7
0
def sort_vcf_file(filename):
    """Sort the records of a VCF file in place by chromosome and position.

    The file is rewritten only when two adjacent records on the same
    chromosome have decreasing positions; otherwise it is left alone.
    """
    from genomicode import vcflib
    from genomicode import jmath
    from genomicode import AnnotationMatrix

    vcf = vcflib.read(filename)
    chroms = vcf.matrix["#CHROM"]
    positions = [int(p) for p in vcf.matrix["POS"]]

    # Only neighbours on the same chromosome are compared.
    out_of_order = any(
        chroms[i] == chroms[i + 1] and positions[i + 1] < positions[i]
        for i in range(len(chroms) - 1))
    if not out_of_order:
        return

    # Order by "<chrom>:<pos>" keys using a natural sort.
    keys = ["%s:%d" % (c, positions[i]) for i, c in enumerate(chroms)]
    order = jmath.order_list(keys, natural=True)
    vcf.matrix = AnnotationMatrix.rowslice(vcf.matrix, order)
    vcflib.write(filename, vcf)
def add_snpeff_to_svm(svm_file, snpeff_file, outfile):
    """Insert SnpEff annotation columns into a SimpleVariantMatrix file.

    Rows of snpeff_file are matched to rows of svm_file on
    (Chrom, Pos, Ref, Alt).  The SnpEff columns after the first four are
    inserted after the first four SVM columns with the header prefix
    "SnpEff______"; variants without a match get blank values.  If
    snpeff_file fails filelib.exists_nz, svm_file is copied to outfile
    unchanged.
    """
    import shutil
    from genomicode import filelib
    from genomicode import SimpleVariantMatrix
    from genomicode import AnnotationMatrix

    if not filelib.exists_nz(snpeff_file):
        shutil.copy2(svm_file, outfile)
        return

    # Index the SnpEff rows by coordinate.  The header (which includes
    # Chrom, Pos, Ref, Alt) is taken from the first row read.
    header = None
    coord2d = {}
    for d in filelib.read_row(snpeff_file, header=1):
        if header is None:
            header = d._header
        coord2d[(d.Chrom, d.Pos, d.Ref, d.Alt)] = d

    svm = SimpleVariantMatrix.read_as_am(svm_file)
    CHROM = svm.header2annots["______Chrom"]
    POS = svm.header2annots["______Pos"]
    REF = svm.header2annots["______Ref"]
    ALT = svm.header2annots["______Alt"]

    snpeff_header = header[4:]
    # Build the SnpEff values row by row, in SVM row order.
    snpeff_matrix = []
    for i in range(len(CHROM)):
        d = coord2d.get((CHROM[i], POS[i], REF[i], ALT[i]))
        if d:
            row = d._cols[4:]
        else:
            row = [""] * len(snpeff_header)
        assert len(row) == len(snpeff_header)
        snpeff_matrix.append(row)
    assert len(snpeff_matrix) == len(CHROM)
    # AnnotationMatrix wants one list per column, so transpose.
    snpeff_annots = [
        [row[j] for row in snpeff_matrix] for j in range(len(snpeff_header))
    ]
    # Convert the headers to SVM format.
    snpeff_header = ["SnpEff______%s" % h for h in snpeff_header]
    # Splice the SnpEff columns in after Chrom, Pos, Ref, Alt.
    headers = svm.headers[:4] + snpeff_header + svm.headers[4:]
    svm_annots = [svm.header2annots[h] for h in svm.headers_h]
    all_annots = svm_annots[:4] + snpeff_annots + svm_annots[4:]
    merged = AnnotationMatrix.create_from_annotations(
        headers, all_annots, headerlines=svm.headerlines)
    SimpleVariantMatrix.write_from_am(outfile, merged)
Esempio n. 9
0
def align_annot(matrix, indexes, null_string):
    """Return a copy of matrix with its rows rearranged by indexes.

    indexes has one entry per output row.  Each entry is either an index
    into the existing annotation lists or None; None produces a row
    filled with null_string.  The headers are carried over unchanged.
    """
    from genomicode import AnnotationMatrix as AM

    name2annots_new = {}
    # items() works on Python 2 and 3 (iteritems() is Python 2 only).
    # NOTE(review): assumes header2annots is a dict-like with items().
    for name, annots in matrix.header2annots.items():
        # Test identity against None so that index 0 is never confused
        # with a missing row.
        name2annots_new[name] = [
            annots[i_annot] if i_annot is not None else null_string
            for i_annot in indexes
        ]
    return AM.AnnotationMatrix(
        matrix.headers, matrix.headers_h, name2annots_new)
Esempio n. 10
0
def check_matrix(X):
    """Validate a GCT-format matrix, repairing bad sample names.

    Asserts that X is a GCT matrix, that its first row-name header is
    NAME, and that the gene IDs are non-empty and unique.  If any sample
    name is blank or contains a character outside [a-zA-Z0-9_-], all
    sample names are hashed and made unique on a deep copy of X.
    Returns X, or the repaired copy.
    """
    import re
    import arrayio
    import copy
    from genomicode import hashlib
    from genomicode import AnnotationMatrix

    assert arrayio.gct_format.is_matrix(X)

    # Gene IDs (NAME) must be present and unique.
    assert X.row_names()[0].upper() == "NAME", \
           "Header of first column should be: NAME"
    gene_ids = set()
    for row_num, gene_id in enumerate(X.row_names("NAME"), 1):
        assert gene_id.strip(), "Empty gene ID in row %d." % row_num
        assert gene_id not in gene_ids, "Duplicate gene ID: %s" % gene_id
        gene_ids.add(gene_id)

    # Collect sample names with spaces or other punctuation.  GSEA seems
    # to be sensitive to these things.
    sample_names = X.col_names(arrayio.tdf.SAMPLE_NAME)
    bad_names = [
        name or "<blank>" for name in sample_names
        if not name or re.search("[^a-zA-Z0-9_-]", name)
    ]

    # Repair on a copy so the caller's matrix is not modified.
    if bad_names:
        X = copy.deepcopy(X)
        hashed = [hashlib.hash_var(name) for name in sample_names]
        sample_names = AnnotationMatrix.uniquify_headers(hashed)
        header = X._resolve_synonym(arrayio.tdf.SAMPLE_NAME, X.col_names,
                                    X._synonyms)
        X._col_names[header] = sample_names

    # The (possibly repaired) sample names must be unique.
    names_seen = set()
    for name in sample_names:
        assert name not in names_seen, "Duplicate sample name: %s" % name
        names_seen.add(name)

    return X
Esempio n. 11
0
def filter_sift_polyphen_damaging(MATRIX):
    """Keep the rows predicted damaging by SIFT and both Polyphen2 models.

    A row is kept when the SIFT_pred value is "D" and both the
    Polyphen2_HDIV_pred and Polyphen2_HVAR_pred values are "D" or "P".
    Exactly one header must end with each of those three names.
    """
    from genomicode import AnnotationMatrix

    def column_ending_with(suffix):
        matches = [h for h in MATRIX.headers if h.endswith(suffix)]
        assert len(matches) == 1
        return MATRIX[matches[0]]

    SIFT_pred = column_ending_with("SIFT_pred")
    hdiv_pred = column_ending_with("Polyphen2_HDIV_pred")
    hvar_pred = column_ending_with("Polyphen2_HVAR_pred")

    polyphen_hits = ["D", "P"]
    I_keep = [
        i for i, (sift, hdiv, hvar)
        in enumerate(zip(SIFT_pred, hdiv_pred, hvar_pred))
        if sift == "D" and hdiv in polyphen_hits and hvar in polyphen_hits
    ]
    return AnnotationMatrix.rowslice(MATRIX, I_keep)
Esempio n. 12
0
def filter_linked_perc(MATRIX, args):
    """Keep the rows whose "Perc Linked" value is at most args.

    args is converted to float and must lie in [0, 100].  Rows with a
    blank value are always kept.  When args is None, MATRIX is returned
    unchanged.
    """
    if args is None:
        return MATRIX
    from genomicode import AnnotationMatrix

    filter_perc = float(args)
    assert 0 <= filter_perc <= 100

    perc_linked = MATRIX["Linkage______Perc Linked"]
    I = [
        i for i, perc in enumerate(perc_linked)
        if perc == "" or float(perc) <= filter_perc
    ]
    return AnnotationMatrix.rowslice(MATRIX, I)
Esempio n. 13
0
def filter_min_callers(MATRIX, args, germline):
    """Keep the rows called by at least args callers in some sample.

    Looks at the "Num Callers" columns, skipping any whose header ends
    with one of the names in germline.  A row is kept when at least one
    remaining column holds an integer >= args; blank cells are skipped.
    When args is None, MATRIX is returned unchanged.
    """
    if args is None:
        return MATRIX
    from genomicode import AnnotationMatrix

    num_callers = args
    assert num_callers >= 1 and num_callers < 20

    # Positions are found in MATRIX.headers; the names used for lookup
    # come from MATRIX.headers_h at the same positions.
    candidates = [
        MATRIX.headers_h[i] for i, h in enumerate(MATRIX.headers)
        if h.startswith("Num Callers")
    ]
    headers_nc = [
        h for h in candidates
        if not any(h.endswith(g) for g in germline) and h
    ]

    def has_enough_callers(row):
        for h in headers_nc:
            cell = MATRIX.header2annots[h][row]
            if cell.strip() and int(cell) >= num_callers:
                return True
        return False

    I_keep = [
        i for i in range(MATRIX.num_annots()) if has_enough_callers(i)
    ]
    return AnnotationMatrix.rowslice(MATRIX, I_keep)
Esempio n. 14
0
def exonic_only(MATRIX, args):
    """Keep the rows whose Annovar Func.refGene value includes "exonic".

    The value is split on ";" and one of the parts must be exactly
    "exonic", so "exonic;splicing" is kept and "ncRNA_exonic;splicing"
    is not.  When args is false, MATRIX is returned unchanged.
    """
    if not args:
        return MATRIX
    from genomicode import AnnotationMatrix

    header = "Annovar______Func.refGene"
    assert header in MATRIX.headers_h

    func = MATRIX.header2annots[header]
    I_keep = [
        i for i, value in enumerate(func) if "exonic" in value.split(";")
    ]
    return AnnotationMatrix.rowslice(MATRIX, I_keep)
Esempio n. 15
0
def filter_nonsynonymous(MATRIX):
    """Keep only the rows with a protein-changing ExonicFunc.refGene value.

    Kept values: "nonsynonymous SNV", "stopgain", "stoploss" and
    "frameshift substitution".  Raises AssertionError if the Annovar
    column is missing or holds a value this function does not know.
    """
    from genomicode import AnnotationMatrix

    # Make sure annotated with Annovar.
    HEADER = "Annovar______ExonicFunc.refGene"
    assert HEADER in MATRIX.headers, "Missing: ExonicFunc.refGene"

    known_funcs = [
        "", "nonsynonymous SNV", "synonymous SNV",
        "stopgain", "stoploss",
        "frameshift substitution", "nonframeshift substitution",
        "unknown",
    ]
    kept_funcs = [
        "nonsynonymous SNV", "stopgain", "stoploss",
        "frameshift substitution",
    ]

    I_keep = []
    for i, efunc in enumerate(MATRIX[HEADER]):
        assert efunc in known_funcs, "Unknown exonic_func: %s" % efunc
        if efunc in kept_funcs:
            I_keep.append(i)
    return AnnotationMatrix.rowslice(MATRIX, I_keep)
Esempio n. 16
0
def annotate_linked_variants(MATRIX, args):
    """Add a "Linkage______Score" column read from a linkage file.

    args is the path of a file with Chrom, Pos and Perc_Linked columns.
    Each row of MATRIX gets the Perc_Linked value (as a float) recorded
    for its (chrom, pos), or "" if the file has none.  The new column is
    inserted after the first four columns.  When args is false, MATRIX
    is returned unchanged.
    """
    if not args:
        return MATRIX
    from genomicode import filelib
    from genomicode import AnnotationMatrix

    link_file = args
    filelib.assert_exists_nz(link_file)
    coord2perc = {}
    for d in filelib.read_row(link_file, header=1):
        coord = d.Chrom, int(d.Pos)
        coord2perc[coord] = float(d.Perc_Linked)

    chroms = MATRIX.header2annots["______Chrom"]
    positions = [int(p) for p in MATRIX.header2annots["______Pos"]]
    link_score = [
        coord2perc.get((c, positions[i]), "") for i, c in enumerate(chroms)
    ]

    # The new column goes right after Chrom, Pos, Ref, Alt.
    header = "Linkage______Score"
    assert header not in MATRIX.headers
    headers = MATRIX.headers[:4] + [header] + MATRIX.headers[4:]
    all_annots = [link_score if h == header else MATRIX[h] for h in headers]
    return AnnotationMatrix.create_from_annotations(headers, all_annots,
                                                    MATRIX.headerlines)
Esempio n. 17
0
def main():
    """Command-line entry point: plot a histogram of one column of a file.

    Reads the column named by the "header" argument from "datafile",
    converts its values to floats and draws them with R's hist() into
    "plot_file".  If --prism_file is given, the hist() result is also
    passed to write_prism_file.
    """
    import os
    import argparse
    
    from genomicode import jmath
    from genomicode import AnnotationMatrix

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("datafile", help="Tab-delimited data file.")
    parser.add_argument("header", help="Which column contains data to plot.")
    parser.add_argument(
        "plot_file", help="Name of image file, e.g. outfile.png.  "
        "Will generate PNG format by default.  If this file name ends with "
        ".pdf, will generate a PDF file instead.")
    parser.add_argument(
        "--prism_file", help="Write Prism-formatted results to this file.")
    parser.add_argument(
        "--ignore_missing_values", action="store_true",
        help="Ignore missing values in the file.")

    group = parser.add_argument_group(title="Calculations")
    group.add_argument(
        "--breaks_seq",
        help="Set the breakpoints.  Format: <start>,<stop>,<skip>.")
    group.add_argument(
        "--num_breaks", type=int, help="Number of breakpoints.")
    group.add_argument(
        "--ymax", type=int,
        help="Set the maximum value for the Y axis.")
    
    group = parser.add_argument_group(title="Plot Labels")
    group.add_argument("--title", help="Put a title on the plot.")
    group.add_argument("--xlab", help="Label the X-axis.")
    group.add_argument(
        "--xlabel_size", default=1.0, type=float,
        help="Scale the size of the labels on X-axis.  Default 1.0.")
    group.add_argument(
        "--xlabel_off", action="store_true", help="Do not label the X axis.")
    group.add_argument(
        "--ylabel_off", action="store_true", help="Do not label the Y axis.")
    group.add_argument(
        "--xtick_label_off", action="store_true",
        help="Do not draw the tick labels on the X axis.")

    group = parser.add_argument_group(title="Colors")
    group.add_argument(
        "--bar_color",  help="Set the color of the bars.  Default #FFFFFF")
    x = _fmt_palettes()
    group.add_argument(
        "--bar_palette", help="Color the bars according to a palette: %s." % x)
    group.add_argument(
        "--symmetric_palette", action="store_true",
        help="Make the color symmetric.")

    group = parser.add_argument_group(title="Appearance")
    group.add_argument(
        "--height", type=int, help="Height (in pixels) of the plot.")
    group.add_argument(
        "--width", type=int, help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left", default=1.0, type=float,
        help="Scale margin at left of plot.  Default 1.0 (no scaling).")
    group.add_argument(
        "--mar_bottom", default=1.0, type=float,
        help="Scale margin at bottom of plot.  Default 1.0.")
    group.add_argument(
        "--xaxis_off", action="store_true", help="Do not show the X axis.")
    group.add_argument(
        "--yaxis_off", action="store_true", help="Do not show the Y axis.")


    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.datafile):
        parser.error("File not found: %s" % args.datafile)
    # --breaks_seq and --num_breaks are mutually exclusive.
    assert not (args.breaks_seq and args.num_breaks)
    if args.num_breaks:
        assert args.num_breaks >= 2 and args.num_breaks <= 1000
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096*16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096*16, "height too big"
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10
    assert args.xlabel_size > 0 and args.xlabel_size < 10
    # A single bar color and a palette are mutually exclusive, and
    # --symmetric_palette only makes sense together with a palette.
    assert not (args.bar_color and args.bar_palette)
    assert not args.symmetric_palette or args.bar_palette
    assert args.ymax is None or args.ymax > 0


    # Default plot size, in pixels, when not given on the command line.
    height = args.height or 2400
    width = args.width or 3200

    MATRIX = AnnotationMatrix.read(args.datafile, False)
    assert MATRIX.num_headers() and MATRIX.num_annots(), "Empty matrix."
    assert args.header in MATRIX.headers, "header not found: %s" % args.header

    # Pull out the values for the histogram.
    x = MATRIX[args.header]
    if args.ignore_missing_values:
        x = [x for x in x if x.strip()]
    # NOTE(review): on Python 3 map() returns an iterator, so the
    # "assert values" further down would always pass -- confirm which
    # interpreter this script targets.
    values = map(float, x)

    value_min = value_max = None

    # Start R and set up the environment.
    R = jmath.start_R()

    main = jmath.R_var("NA")
    if args.title:
        main = args.title
    sub = ""
    xlab = ""
    if args.xlab:
        xlab = args.xlab
    ylab = "Frequency"
    xtick_labels = jmath.R_var("TRUE")
    ytick_labels = jmath.R_var("TRUE")

    if args.xlabel_off:
        xlab = ""
    if args.ylabel_off:
        ylab = ""
    if args.xtick_label_off:
        xtick_labels = jmath.R_var("FALSE")

    # "Sturges" is handed to hist() unless breaks are given explicitly.
    breaks = "Sturges"
    if args.breaks_seq:
        breaks = _parse_breaks_seq(args.breaks_seq)
        value_min, value_max = min(breaks), max(breaks)
        # Store the breakpoints in an R variable and refer to it by name.
        jmath.R_equals(breaks, "breaks")
        breaks = jmath.R_var("breaks")
    if args.num_breaks:
        breaks = args.num_breaks

    # With explicit breakpoints, keep only the values in
    # [min(breaks), max(breaks)).
    if value_min is not None:
        values = [x for x in values if x >= value_min]
    if value_max is not None:
        values = [x for x in values if x < value_max]

    lwd = 2
    cex_lab = 1.5
    cex_main = 2.0
    cex_sub = 1.5
    ylim = jmath.R_var("NULL")
    if args.ymax is not None:
        ylim = [0, args.ymax]

    assert values
    jmath.R_equals(values, "X")

    # Figure out the colors.  Do it after X is assigned.
    col = jmath.R_var("NULL")
    if args.bar_color:
        assert args.bar_color.startswith("#")
        col = args.bar_color
    elif args.bar_palette:
        # Figure out how many breaks there are.  Number of bars is num
        # breaks + 1.
        # NOTE(review): R's hist() normally draws one bar fewer than it
        # has breakpoints, so "+ 1" may request more colors than bars --
        # confirm against _make_col_palette.
        jmath.R_fn(
            "hist", jmath.R_var("X"), breaks=breaks, plot=jmath.R_var("FALSE"),
            RETVAL="x")
        # From here on "breaks" holds the breakpoints hist() computed.
        breaks = [x for x in R["x"].rx2("breaks")]
        num_bars = len(breaks) + 1
        col = _make_col_palette(
            args.bar_palette, num_bars, args.symmetric_palette)

    # Output type passed to R's bitmap(): PNG unless the file name ends
    # with .pdf.
    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn(
        "bitmap", args.plot_file, type=bm_type, 
        height=height, width=width, units="px", res=300)
    
    # Set the margins.
    x = 5*1.2*args.mar_bottom, 4*1.2*args.mar_left, 4, 2
    mar = [x+0.1 for x in x]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    # Draw the bars without titles or axes; those are added below.
    jmath.R_fn(
        "hist", jmath.R_var("X"), breaks=breaks, main="", xlab="", ylab="",
        ylim=ylim, axes=jmath.R_var("FALSE"), col=col, RETVAL="x")
    # Make plot area solid white.
    #jmath.R('usr <- par("usr")')
    #jmath.R('rect(usr[1], usr[3], usr[2], usr[4], col="#FFFFFF")')
    #jmath.R_fn(
    #    "hist", jmath.R_var("X"), plot=jmath.R_var("FALSE"),
    #    main=main, xlab="", ylab="", axes=jmath.R_var("FALSE"),
    #    add=jmath.R_var("TRUE"))
    
    #jmath.R_fn("box", lwd=lwd)
    # x-axis
    if not args.xaxis_off:
        jmath.R_fn(
            "axis", 1, lwd=lwd, labels=xtick_labels, **{ "cex.axis" : 1.5 })
    # y-axis
    if not args.yaxis_off:
        jmath.R_fn(
            "axis", 2, lwd=lwd, labels=ytick_labels, **{ "cex.axis" : 1.5 })
    jmath.R_fn(
        "title", main=main, sub=sub, xlab=xlab, ylab=ylab,
        **{ "cex.lab" : cex_lab, "cex.main" : cex_main, "cex.sub" : cex_sub })
    # Restore the graphics parameters saved in "op" and close the device.
    R("par(op)")
    jmath.R_fn("dev.off")

    if args.prism_file:
        write_prism_file(args.prism_file, R["x"])
Esempio n. 18
0
def read_as_am(filename, is_csv=False):
    """Read a file in SVM format and return it as an AnnotationMatrix.

    Columns get no special processing (no parsing as integers or Call
    objects); everything stays a string.  The file is tab-delimited
    unless is_csv is true, in which case it is comma-delimited.

    The first three rows are headers.  They are combined into one header
    per column of the form <header0>___<header1>___<header2>, with the
    blanks filled in first, e.g. "Annovar" occurs in each Annovar column
    of header0.  Resulting headers:
        ______Chrom
        ______Pos
        ______Ref
        ______Alt
        Num Callers______<Sample>
        ...

    The returned matrix also carries the unique samples and callers, in
    order of appearance, as matrix.samples and matrix.callers.
    """
    from genomicode import filelib
    from genomicode import AnnotationMatrix

    delimiter = "," if is_csv else "\t"

    rows = list(filelib.read_cols(filename, delimiter=delimiter))
    assert len(rows) >= 3  # at least 3 rows for the header
    for row in rows[1:]:
        assert len(row) == len(rows[0])
    assert len(rows[0]) >= 4  # Chrom, Pos, Ref, Alt
    assert len(rows[0]) >= 5, "No calls"

    header0, header1, header2 = rows[0], rows[1], rows[2]
    assert header2[:4] == ["Chrom", "Pos", "Ref", "Alt"]

    # Fill in the blanks for header1.  A blank header1 under a non-blank
    # header0 starts a new "block" and stays blank instead of inheriting
    # the value to its left.
    for i in range(1, len(header1)):
        if not header1[i] and not header0[i]:
            header1[i] = header1[i - 1]
    # Fill in the blanks for header0 from the left.
    for i in range(1, len(header0)):
        if not header0[i]:
            header0[i] = header0[i - 1]

    def unique_nonblank(values):
        # Drop blanks and duplicates, preserving order.
        kept = []
        for value in values:
            if value and value not in kept:
                kept.append(value)
        return kept

    # Samples and callers are taken from the "Ref/Alt/VAF" columns.
    I = [i for (i, h) in enumerate(header2) if h == "Ref/Alt/VAF"]
    assert I
    samples = unique_nonblank([header0[i] for i in I])
    callers = unique_nonblank([header1[i] for i in I])

    headers = ["___".join(parts) for parts in zip(header0, header1, header2)]
    # One list of values per column, taken from the rows after the
    # three header rows.
    data_rows = rows[3:]
    all_annots = [
        [row[j] for row in data_rows] for j in range(len(headers))
    ]
    matrix = AnnotationMatrix.create_from_annotations(headers, all_annots)
    matrix.samples = samples
    matrix.callers = callers
    return matrix
Esempio n. 19
0
def main():
    """Align a set of matrices by sample name and write the aligned copies.

    Command-line entry point.  Every --express_file / --annot_file is
    paired, in command-line order, with one positional "outfile"; an
    outfile of "_" means that aligned matrix is not written.  The inputs
    are read, the samples in each one are located, the list of samples to
    keep is chosen (inner join by default, or --left_join / --outer_join),
    and each matrix is aligned to that list before being written out.

    Uses the module-level names sys and os, plus helper functions defined
    elsewhere in this file.  Argument problems are reported by raising
    AssertionError; giving --header for an --express_file raises
    NotImplementedError.
    """
    import argparse
    from genomicode import AnnotationMatrix as AM

    # An outfile with this name is skipped when writing results.
    SKIP_OUTFILE = "_"

    parser = argparse.ArgumentParser(
        description="Align a set of matrices.  Preserve the order of the "
        "first file given.")
    parser.add_argument("outfile", nargs="+")

    parser.add_argument(
        "--express_file", default=[], action="append", help="")
    parser.add_argument(
        "--annot_file", default=[], action="append", help="")
    parser.add_argument(
        "--header", default=[], action="append",
        help="Specify the header for an annotation file.  Should come "
        "after the --annot_file that it refers to.")
    parser.add_argument(
        "--annot_path",
        help="Align all the annotation files in a path.  "
        "If using this argument, no --annot_file or --express_file should "
        "be given.  "
        "--header is still required, and should apply to at least one file.  "
        'Only one "outfile" should be given, and it should refer to a path '
        "in which to store the aligned files.")

    #parser.add_argument(
    #    "--first_annot_header", help="If only aligning annotation files, "
    #    "find the samples to be matched under this header in the first "
    #    "annotation file.")
    parser.add_argument(
        "--clobber", default=False, action="store_true",
        help="Overwrite output files, if they already exist.")

    group = parser.add_argument_group(title="Comparisons")
    group.add_argument(
        "--case_insensitive", default=False, action="store_true",
        help="Do a case insensitive search of sample names.")
    group.add_argument(
        "--hash", default=False, action="store_true",
        help="Hash the sample names to [a-zA-Z0-9_] before comparison.")
    group.add_argument(
        "--ignore_nonalnum", default=False, action="store_true",
        help="Ignore non-alphanumeric characters in the IDs.")
    group.add_argument(
        "--ignore_blank", default=False, action="store_true",
        help="Ignore IDs that are blank (don't align them).")

    group = parser.add_argument_group(title="Joins")
    group.add_argument(
        "--strict", default=False, action="store_true",
        help="Complain if a file is missing a sample.")
    group.add_argument(
        "--left_join", default=False, action="store_true",
        help='By default, does an "inner join" and keeps only the '
        'records that are present in all files.  A "left join" will '
        'keep all records that occur in the first file.')
    group.add_argument(
        "--outer_join", default=False, action="store_true",
        help='By default, does an "inner join" and keeps only the '
        'records that are present in all files.  An "outer join" will '
        'also keep records that occur in any file.')

    group = parser.add_argument_group(title="Output")
    group.add_argument(
        "--null_string", default="",
        help='For left_join or outer_join, what to give the missing values.')
    group.add_argument(
        "--unaligned_only", action="store_true",
        help="Show only the rows that are not aligned.")
    group.add_argument(
        "--dont_add_missing_samples", action="store_true",
        help="If a matrix does not have a sample, don't fill in the value "
        "from another matrix.")

    group = parser.add_argument_group(title="Debug")
    group.add_argument(
        "--debug_nrows", type=int,
        help="Debugging: Only read this many rows from the annotation files.")

    args = parser.parse_args()
    # If the user specified an --annot_path, revise args to
    # contain --annot_files instead.
    sys.argv, args = _handle_annot_path(sys.argv, args)

    ni, no = len(args.express_file)+len(args.annot_file), len(args.outfile)
    assert ni == no, "Mismatch: %d inputs and %d outputs" % (ni, no)

    for x in args.express_file + args.annot_file:
        assert os.path.exists(x), "I could not find file: %s" % x
    for x in args.outfile:
        if x == SKIP_OUTFILE:
            continue
        assert args.clobber or not os.path.exists(x), "File exists: %s" % x
    assert not (args.left_join and args.outer_join)
    if args.null_string:
        assert args.outer_join or args.left_join, \
               "null_string given, but only used for left_join or outer_join"

    # Align the outfiles to the expression and annotation files.
    # argparse collects each option into its own list, so the relative
    # order of --express_file and --annot_file is recovered from sys.argv.
    express_file = args.express_file[:]
    annot_file = args.annot_file[:]
    outfile = args.outfile[:]
    matrix_data = []  # list of (infile, outfile, is_express_file)
    for arg in sys.argv:
        if arg not in ["--express_file", "--annot_file"]:
            continue
        assert outfile
        if arg == "--express_file":
            assert express_file
            x = express_file.pop(0), outfile.pop(0), True
        else:
            assert annot_file
            x = annot_file.pop(0), outfile.pop(0), False
        matrix_data.append(x)
    assert not express_file
    assert not annot_file
    assert not outfile

    # Align the --header arguments to the annotation files.  A --header
    # belongs to the most recent --express_file/--annot_file before it.
    headers = [None] * len(matrix_data)
    header_i = -1
    for i, arg in enumerate(sys.argv):
        if arg == "--header":
            assert header_i >= 0, \
                   "--header given before an --express_file or --annot_file."
            assert headers[header_i] is None, "Two --header for one file."
            headers[header_i] = sys.argv[i+1]
        elif arg in ["--express_file", "--annot_file"]:
            header_i += 1

    # Add the headers to the matrix_data.
    new_matrix_data = []  # list of (infile, outfile, is_express_file, header)
    for i in range(len(matrix_data)):
        infile, outfile, is_express_file = matrix_data[i]
        if is_express_file and headers[i]:
            raise NotImplementedError("No headers for --express_file.")
        x = infile, outfile, is_express_file, headers[i]
        new_matrix_data.append(x)
    matrix_data = new_matrix_data

    # Read each of the files.
    new_matrix_data = []  # list of (infile, outfile, matrix, header)
    for x in matrix_data:
        infile, outfile, is_express_file, header = x
        if is_express_file:
            data = read_express(infile)
        else:
            data = AM.read(infile, nrows=args.debug_nrows)
        x = infile, outfile, data, header
        new_matrix_data.append(x)
    matrix_data = new_matrix_data

    # Find the samples in each matrix.
    new_matrix_data = []  # list of (infile, outfile, matrix, header, samples)
    samples_hint = peek_samples_hint(matrix_data)
    # The same for every file, so compute it once outside the loop.
    headers_hint = [h for h in headers if h]
    for x in matrix_data:
        infile, outfile, matrix, header = x
        x = get_samples(
            matrix, header, samples_hint, headers_hint,
            args.case_insensitive, args.hash, args.ignore_nonalnum)
        assert x, "I could not find the samples for %s" % infile
        header, samples = x
        x = infile, outfile, matrix, header, samples
        new_matrix_data.append(x)
    matrix_data = new_matrix_data

    if args.left_join:
        assert not args.strict, "Can't do a strict left join."
        # No duplicates.
        samples = list_all_samples(
            matrix_data[:1], args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        assert samples, "No samples."
    elif args.outer_join:
        assert not args.strict, "Can't do a strict outer join."
        samples = list_all_samples(
            matrix_data, args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        assert samples, "No samples."
    else:  # inner join
        samples = list_common_samples(
            matrix_data, args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        assert samples, "No common samples found."

    if args.strict:
        # Strict mode: every sample must be present in every file.
        all_samples = list_all_samples(
            matrix_data, args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        common_samples = list_common_samples(
            matrix_data, args.case_insensitive, args.hash,
            args.ignore_nonalnum)
        if sorted(all_samples) != sorted(common_samples):
            missing_samples = []
            for x in all_samples:
                i = find_sample(
                    common_samples, x, args.case_insensitive, args.hash,
                    args.ignore_nonalnum, args.ignore_blank)
                if i >= 0:
                    continue
                missing_samples.append(x)
            # Show at most 10 of the missing samples in the message.
            short = missing_samples
            if len(short) > 10:
                short = short[:10] + ["..."]
            short = "\n".join(short)
            raise AssertionError(
                "%d samples not in all data sets.\n%s" %
                (len(missing_samples), short))

    # Align each of the matrices.
    matrix_data = align_matrices(
        matrix_data, samples, args.case_insensitive, args.hash,
        args.ignore_nonalnum, args.ignore_blank,
        args.left_join, args.outer_join, args.unaligned_only,
        args.null_string)

    # Add the missing samples back to the matrix.
    if not args.dont_add_missing_samples:
        matrix_data = add_missing_samples(matrix_data, args.null_string)

    # Write out each of the matrices.
    for x in matrix_data:
        infile, outfile, matrix, header, samples = x
        if outfile == SKIP_OUTFILE:
            continue
        write_matrix(outfile, matrix)
Esempio n. 20
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """Add "Linkage" columns to a simple variant matrix.

        Reads the variant matrix at in_data.identifier and the file named
        by the "linked_variants_file" user option, matches the two on
        (Chrom, Pos), and writes a copy of the matrix to outfile with two
        extra columns, "Perc Linked" and "Score", inserted after the first
        four columns.  Variants without a linked record get blank values.

        network, out_attributes and num_cores are not used here.
        """
        import math
        from genomicode import filelib
        from genomicode import jmath
        from genomicode import AnnotationMatrix
        from genomicode import SimpleVariantMatrix
        from Betsy import module_utils as mlib

        variant_file = in_data.identifier
        filelib.assert_exists_nz(variant_file)

        linked_file = mlib.get_user_option(
            user_options, "linked_variants_file", not_empty=True,
            check_file=True)

        # Load the variants and collect their (chrom, pos) coordinates.
        SVM = SimpleVariantMatrix.read_as_am(variant_file)
        chroms = SVM["______Chrom"]
        positions = [int(p) for p in SVM["______Pos"]]
        variant_coords = list(zip(chroms, positions))
        known_coords = set(variant_coords)

        # Load the linked variants, keeping only the records that fall on
        # a coordinate present in the matrix.
        # Columns:  Chrom  Pos  Perc Linked  p
        coord2info = {}  # (chrom, pos) -> record
        for record in filelib.read_row(linked_file, header=1):
            position = int(record.Pos)
            key = record.Chrom, position
            if key in known_coords:
                coord2info[key] = record

        # Build one row of annotations per variant.  The score is
        # -10*log10(p), capped at MAX_SCORE for p-values below min_p.
        MAX_SCORE = 1000
        min_p = 10**-(MAX_SCORE / 10)
        linked_headers = ["Perc Linked", "Score"]
        num_columns = len(linked_headers)
        annotations = []
        for coord in variant_coords:
            if coord not in coord2info:
                annotations.append([""] * num_columns)
                continue
            record = coord2info[coord]
            p_value = float(record.p)
            if p_value >= min_p:
                score = -10 * math.log(p_value, 10)
            else:
                score = MAX_SCORE
            row = record.Perc_Linked, score
            assert len(row) == num_columns
            annotations.append(row)

        # Switch to the SVM layout: prefixed headers, one list per column.
        linked_headers = ["Linkage______%s" % h for h in linked_headers]
        linked_annotations = jmath.transpose(annotations)

        # Splice the new columns in at a fixed position, right after the
        # first four columns (presumably Chrom, Pos, Ref, Alt).
        INDEX = 4
        merged_headers = (
            SVM.headers[:INDEX] + linked_headers + SVM.headers[INDEX:])
        old_annots = [SVM.header2annots[h] for h in SVM.headers_h]
        merged_annots = (
            old_annots[:INDEX] + linked_annotations + old_annots[INDEX:])
        merged = AnnotationMatrix.create_from_annotations(
            merged_headers, merged_annots, headerlines=SVM.headerlines)

        SimpleVariantMatrix.write_from_am(outfile, merged)
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
        """Add "COSMIC" columns to a simple variant matrix.

        Reads the variant matrix at in_data.identifier and the file named
        by the "cosmic_variants_file" user option, and writes a copy of
        the matrix to outfile with seven COSMIC columns added.  The new
        columns go after the first four columns, or after the last column
        whose header starts (ignoring case) with "ANNOVAR" or "SNPEFF" if
        that is further right.  Variants without a COSMIC record get
        blank values.

        network, out_attributes and num_cores are not used here.
        """
        from genomicode import filelib
        from genomicode import jmath
        from genomicode import AnnotationMatrix
        from genomicode import SimpleVariantMatrix
        from Betsy import module_utils as mlib

        variant_file = in_data.identifier
        filelib.assert_exists_nz(variant_file)

        cosmic_file = mlib.get_user_option(
            user_options, "cosmic_variants_file", not_empty=True,
            check_file=True)

        # Load the variants and collect their (chrom, pos) coordinates.
        SVM = SimpleVariantMatrix.read_as_am(variant_file)
        chroms = SVM["______Chrom"]
        positions = [int(p) for p in SVM["______Pos"]]
        variant_coords = list(zip(chroms, positions))
        known_coords = set(variant_coords)

        # Load the COSMIC records.
        # Columns:  Chrom  Start  End  GRCh  Count  SNP
        #           Mutation CDS  Mutation AA
        #           FATHMM prediction  FATHMM score  Mutation somatic status
        # Each record covers Start..End inclusive and is filed under the
        # first position in that range that is present in the matrix.
        # NOTE(review): later matrix positions inside the same range are
        # not annotated by that record -- confirm this is intended.
        coord2info = {}  # (chrom, pos) -> record
        for record in filelib.read_row(cosmic_file, header=1):
            first, last = int(record.Start), int(record.End)
            hit = None
            for candidate in range(first, last + 1):
                if (record.Chrom, candidate) in known_coords:
                    hit = candidate
                    break
            if hit is not None:
                coord2info[(record.Chrom, hit)] = record

        # Build one row of annotations per variant.  "Num Tumors" is
        # filled from the Count column.
        cosmic_headers = [
            "SNP", "Num Tumors", "Mutation CDS", "Mutation AA",
            "FATHMM prediction", "FATHMM score", "Mutation somatic status"]
        num_columns = len(cosmic_headers)
        annotations = []
        for coord in variant_coords:
            if coord not in coord2info:
                annotations.append([""] * num_columns)
                continue
            record = coord2info[coord]
            annotations.append((
                record.SNP, record.Count, record.Mutation_CDS,
                record.Mutation_AA, record.FATHMM_prediction,
                record.FATHMM_score, record.Mutation_somatic_status))

        # Switch to the SVM layout: prefixed headers, one list per column.
        cosmic_headers = ["COSMIC______%s" % h for h in cosmic_headers]
        cosmic_annotations = jmath.transpose(annotations)

        # Work out the insertion point: after the 4 leading columns and
        # after any Annovar / SnpEff columns.
        INDEX = 4
        preceding = [
            i for (i, h) in enumerate(SVM.headers)
            if h.upper().startswith(("ANNOVAR", "SNPEFF"))]
        if preceding:
            INDEX = max(INDEX, max(preceding) + 1)

        merged_headers = (
            SVM.headers[:INDEX] + cosmic_headers + SVM.headers[INDEX:])
        old_annots = [SVM.header2annots[h] for h in SVM.headers_h]
        merged_annots = (
            old_annots[:INDEX] + cosmic_annotations + old_annots[INDEX:])
        merged = AnnotationMatrix.create_from_annotations(
            merged_headers, merged_annots, headerlines=SVM.headerlines)

        SimpleVariantMatrix.write_from_am(outfile, merged)
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        """Convert a file of per-call records into a simple variant matrix.

        Reads every row of in_data.identifier into memory.  Each row is
        expected to provide Chrom, Pos, Ref, Alt, Sample, Caller, Source,
        Num_Ref, Num_Alt and VAF.  The output, written to out_filename,
        has one row per distinct (Chrom, Pos, Ref, Alt) and three header
        lines joined with "___":

          - the four coordinate columns;
          - one "Num Callers" column per sample, counting the distinct
            callers with a DNA call at that coordinate;
          - one "Ref/Alt/VAF" column per (sample, caller) pair.

        Only rows whose Source is "DNA" contribute calls.  network,
        out_attributes, user_options and num_cores are not used here.

        Returns an empty metadata dict.
        """
        from genomicode import filelib
        from genomicode import SimpleVariantMatrix
        from genomicode import AnnotationMatrix

        simple_file = in_data.identifier
        metadata = {}

        # Read all in memory.  Hopefully, not too big.
        ds = list(filelib.read_row(simple_file, header=-1))

        # MuSE sometimes has alternates.
        # Alt       A,C
        # Num_Alt  13,0
        # VAF      0.19,0.0
        # Detect this and fix it.  Take the alternate with the highest VAF
        # (the first one, if there is a tie).
        # NOTE(review): d.Alt is left listing every alternate allele --
        # confirm that this is intended.
        for d in ds:
            if "," not in d.Num_Alt:
                continue
            num_alts = d.Num_Alt.split(",")
            vafs = d.VAF.split(",")
            assert len(num_alts) == len(vafs)
            # Build real lists so that they can be indexed on Python 3,
            # where map() returns an iterator.
            num_alts = [int(x) for x in num_alts]
            vafs = [float(x) for x in vafs]
            max_i = max(range(len(vafs)), key=lambda i: vafs[i])
            d.Num_Alt = str(num_alts[max_i])
            d.VAF = str(vafs[max_i])

        # Sorted, de-duplicated lists of the callers, the samples, and
        # the (chrom, pos, ref, alt) coordinates.
        callers = sorted(set(d.Caller for d in ds))
        samples = sorted(set(d.Sample for d in ds))
        coord_data = sorted(
            set((d.Chrom, int(d.Pos), d.Ref, d.Alt) for d in ds))

        # Make a list of all DNA calls.  Rows with no read counts and no
        # VAF carry no call and are dropped.
        call_data = []
        for d in ds:
            assert d.Source in ["DNA", "RNA"]
            if d.Source != "DNA":
                continue
            num_ref = num_alt = vaf = None
            if d.Num_Ref:
                num_ref = int(d.Num_Ref)
            if d.Num_Alt:
                num_alt = int(d.Num_Alt)
            if d.VAF:
                vaf = float(d.VAF)
            if num_ref is None and num_alt is None and vaf is None:
                continue
            call = SimpleVariantMatrix.Call(num_ref, num_alt, vaf)
            x = d.Chrom, int(d.Pos), d.Ref, d.Alt, d.Sample, d.Caller, call
            call_data.append(x)

        # sample -> caller -> chrom, pos, ref, alt -> call
        samp2caller2coord2call = {}
        for x in call_data:
            chrom, pos, ref, alt, sample, caller, call = x
            coord = chrom, pos, ref, alt
            caller2coord2call = samp2caller2coord2call.setdefault(sample, {})
            coord2call = caller2coord2call.setdefault(caller, {})
            # A (sample, caller, coord) may have multiple calls.  For
            # example, for germline samples that are called with each
            # tumor sample.  If this is the case, then take the call
            # with the highest coverage.
            if coord in coord2call:
                old_call = coord2call[coord]
                cov = old_cov = None
                if call.num_ref is not None and call.num_alt is not None:
                    cov = call.num_ref + call.num_alt
                if old_call.num_ref is not None and \
                       old_call.num_alt is not None:
                    old_cov = old_call.num_ref + old_call.num_alt
                # Keep the earlier call only when its coverage is known
                # and the new call's is unknown or lower.
                if old_cov is not None and (cov is None or cov < old_cov):
                    call = old_call
            coord2call[coord] = call

        # Count the number of callers that called a variant at each
        # position for each sample.  Collect the callers into sets first,
        # so that a caller that reports the same (sample, coord) several
        # times (e.g. for germline samples) is counted only once.
        samp2coord2callers = {}  # sample -> chrom, pos, ref, alt -> callers
        for x in call_data:
            chrom, pos, ref, alt, sample, caller, call = x
            coord = chrom, pos, ref, alt
            coord2callers = samp2coord2callers.setdefault(sample, {})
            coord2callers.setdefault(coord, set()).add(caller)
        samp2coord2nc = {}  # sample -> chrom, pos, ref, alt -> num_callers
        for sample, coord2callers in samp2coord2callers.items():
            coord2nc = {}
            for coord, called_by in coord2callers.items():
                coord2nc[coord] = len(called_by)
            samp2coord2nc[sample] = coord2nc

        # Format everything into an annotation matrix.  The three header
        # lists hold, per column, the three parts of the final header.
        headers0 = []
        headers1 = []
        headers2 = []
        all_annots = []

        # Add the positions.
        headers0 += ["", "", "", ""]
        headers1 += ["", "", "", ""]
        headers2 += ["Chrom", "Pos", "Ref", "Alt"]
        for i in range(4):
            all_annots.append([str(coord[i]) for coord in coord_data])

        # Add the number of callers information.
        headers0 += ["Num Callers"] * len(samples)
        headers1 += [""] * len(samples)
        headers2 += samples
        for sample in samples:
            coord2nc = samp2coord2nc.get(sample, {})
            all_annots.append([coord2nc.get(coord, "") for coord in coord_data])

        # Add information about calls.
        for sample in samples:
            caller2coord2call = samp2caller2coord2call.get(sample, {})
            for i, caller in enumerate(callers):
                # The sample name only labels the first column of its
                # block; the remaining columns leave it blank.
                h0 = ""
                if not i:
                    h0 = sample
                headers0.append(h0)
                headers1.append(caller)
                headers2.append("Ref/Alt/VAF")

                coord2call = caller2coord2call.get(caller, {})
                annots = []
                for coord in coord_data:
                    x = ""
                    call = coord2call.get(coord)
                    if call:
                        x = SimpleVariantMatrix._format_call(call)
                    annots.append(x)
                all_annots.append(annots)

        # Set the headers.
        assert len(headers0) == len(headers1)
        assert len(headers0) == len(headers2)
        assert len(headers0) == len(all_annots)
        headers = [
            "___".join(x) for x in zip(headers0, headers1, headers2)]
        matrix = AnnotationMatrix.create_from_annotations(headers, all_annots)
        SimpleVariantMatrix.write_from_am(out_filename, matrix)

        return metadata
Esempio n. 23
0
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
        """Add "Cancer Genes" columns to a simple variant matrix.

        Reads the file named by the "cancer_genes_file" user option.  It
        must have a "Gene Symbol" column; every column other than
        "Gene ID" and "Gene Symbol" is treated as a gene set whose values
        are "" or "1".  For each variant in the matrix at
        in_data.identifier, the comma-separated genes in the
        "Annovar______Gene.refGene" column are looked up, and a gene set
        column is set to 1 if any of those genes is flagged in it.

        The new columns are written after the first four columns, or
        after the last column whose header starts (ignoring case) with
        "ANNOVAR", "SNPEFF" or "COSMIC" if that is further right.

        network, out_attributes and num_cores are not used here.

        Raises AssertionError if the cancer genes file has no rows or no
        "Gene Symbol" column, if the Annovar gene column is missing, or
        if a gene set holds a value other than "" or "1".
        """
        from genomicode import filelib
        from genomicode import hashlib
        from genomicode import jmath
        from genomicode import AnnotationMatrix
        from genomicode import SimpleVariantMatrix
        from Betsy import module_utils as mlib

        simple_node = in_data
        filelib.assert_exists_nz(simple_node.identifier)

        gene_file = mlib.get_user_option(
            user_options, "cancer_genes_file", not_empty=True, check_file=True)

        # Read the cancer genes file.
        # <Gene ID>  <Gene Symbol>  <Dataset>  ...
        symbol2info = {}  # symbol -> d
        header = None     # names of the gene set columns
        for d in filelib.read_row(gene_file, header=1):
            assert "Gene Symbol" in d._header
            if header is None:
                header = [
                    x for x in d._header
                    if x not in ["Gene ID", "Gene Symbol"]]
            if not d.Gene_Symbol:
                continue
            symbol2info[d.Gene_Symbol] = d
        # Without any row the column names are unknown.  Fail here with
        # a clear message rather than with a TypeError further down.
        assert header is not None, "No genes found in file: %s" % gene_file

        # Read the variant file.
        SVM = SimpleVariantMatrix.read_as_am(simple_node.identifier)

        GENE_H = "Annovar______Gene.refGene"
        assert GENE_H in SVM.headers, "Missing annotation: %s" % GENE_H
        GENES = SVM[GENE_H]

        # Align the matrix to the simple variant matrix.
        gene_headers = header
        # The attribute name for each gene set column.  This does not
        # depend on the variant or the gene, so compute it only once.
        gene_attrs = [hashlib.hash_var(h) for h in gene_headers]
        gene_annotations = []
        for gene_str in GENES:
            # Format of genes:
            # PFN1P2
            # PMS2P2,PMS2P7
            values = [""] * len(gene_headers)
            for gene in gene_str.split(","):
                if gene not in symbol2info:
                    continue
                d = symbol2info[gene]
                for j, attr in enumerate(gene_attrs):
                    assert hasattr(d, attr)
                    x = getattr(d, attr)
                    assert x in ["", "1"]
                    if x == "1":
                        values[j] = 1
            gene_annotations.append(values)
        # Convert the headers and annotations to SVM format.
        gene_headers = ["Cancer Genes______%s" % x for x in gene_headers]
        gene_annotations = jmath.transpose(gene_annotations)

        # Make the new SimpleVariantMatrix.
        # Figure out where to put these annotations:  after the 4 leading
        # columns, and after any Annovar, SnpEff, or COSMIC columns.
        INDEX = 4
        I = [i for (i, x) in enumerate(SVM.headers)
             if x.upper().startswith(("ANNOVAR", "SNPEFF", "COSMIC"))]
        if I:
            INDEX = max(INDEX, max(I)+1)
        headers = SVM.headers[:INDEX] + gene_headers + SVM.headers[INDEX:]
        x = [SVM.header2annots[x] for x in SVM.headers_h]
        all_annots = x[:INDEX] + gene_annotations + x[INDEX:]
        merged = AnnotationMatrix.create_from_annotations(
            headers, all_annots, headerlines=SVM.headerlines)

        SimpleVariantMatrix.write_from_am(outfile, merged)
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_filename):
        """Attach gene expression values to a simple variant matrix.

        antecedents is a pair (simple_node, signal_node): the variant
        matrix and the gene expression file.  For every variant, the
        comma-separated genes in its "Gene.refGene" annotation are looked
        up in the expression data, and a named matrix "Gene Expression"
        with one column per expression sample is appended to the variant
        matrix, which is then written to out_filename.  A cell holds the
        comma-joined expression of each gene that was found; a gene with
        several rows contributes its maximum value.

        Also writes the gene-filtered expression matrix to
        "expression.txt" (a relative path) for debugging.

        network, out_attributes, user_options and num_cores are not used
        here.  Raises AssertionError if none of the variant matrix's
        samples occur in the expression file, or if no named matrix has
        a "Gene.refGene" header.
        """
        import arrayio
        from genomicode import filelib
        from genomicode import AnnotationMatrix
        from genomicode import SimpleVariantMatrix

        simple_node, signal_node = antecedents
        filelib.assert_exists_nz(simple_node.identifier)
        filelib.assert_exists_nz(signal_node.identifier)

        # Load both inputs.
        SVM = SimpleVariantMatrix.read(simple_node.identifier)
        GXP = arrayio.read(signal_node.identifier)

        # At least one variant sample must also be an expression sample.
        # Not all of them need to be: an expression profile may not
        # exist for, e.g., the germline sample.
        GXP_samples = GXP.col_names(arrayio.COL_ID)
        missing = [s for s in SVM.samples if s not in GXP_samples]
        assert len(missing) < len(SVM.samples), (
            "SimpleVariantMatrix and gene expression file have "
            "no common samples.")

        # The output has a column for every expression sample.
        SAMPLES = GXP_samples

        # Locate the per-variant gene annotation: the first named matrix
        # that has the gene header.
        GENE_H = "Gene.refGene"
        annovar_matrix = None
        for (name, candidate) in SVM.named_matrices:
            if GENE_H in candidate.headers:
                annovar_matrix = candidate
                break
        assert annovar_matrix, "Missing annotation: %s" % GENE_H
        GENES = annovar_matrix[GENE_H]

        # Collect every distinct gene name.  An annotation may list
        # several genes separated by commas, e.g. "PMS2P2,PMS2P7".
        unique_genes = {}
        for gene_str in GENES:
            for gene in gene_str.split(","):
                unique_genes[gene] = 1
        unique_genes = sorted(unique_genes)

        # Restrict the expression data to those genes, keeping every
        # sample, and save a copy for debugging purposes.
        GXP_a = GXP.matrix(unique_genes, None)
        arrayio.write(GXP_a, "expression.txt")

        # Record which rows of the restricted matrix belong to each gene.
        # Genes without any row are left out.
        gene2I = {}   # gene -> list of row indexes
        for gene in unique_genes:
            I_row, i_col = GXP_a._index(row=gene)
            if I_row:
                gene2I[gene] = I_row

        # Build the expression text for each (variant, sample) cell.
        # Format of genes:     Format of output
        # PFN1P2                  5.2
        # PMS2P2,PMS2P7           2.2,8.6
        # Genes missing from the expression data are skipped.
        num_samples = len(SAMPLES)
        rows = []
        for gene_str in GENES:
            found = [g for g in gene_str.split(",") if g in gene2I]
            row = []
            for j in range(num_samples):
                # A gene with multiple rows is represented by its
                # maximum expression in this sample.
                best = [
                    max([GXP_a._X[r][j] for r in gene2I[g]]) for g in found]
                row.append(",".join([_pretty_gxp(v) for v in best]))
            rows.append(row)

        # Turn the rows into one list per sample and attach them to the
        # variant matrix as a new named matrix.
        headers = SAMPLES
        all_annots = [
            [row[j] for row in rows] for j in range(len(headers))]
        gxp_matrix = AnnotationMatrix.create_from_annotations(
            headers, all_annots)
        SVM.named_matrices.append(("Gene Expression", gxp_matrix))

        # Write to file.
        SimpleVariantMatrix.write(out_filename, SVM)
Esempio n. 25
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        """Filter the calls of a SimpleVariantMatrix file by read support.

        Reads the matrix named by in_data.identifier and tests every
        non-empty <sample>___<caller>___Ref/Alt/VAF call against the
        user options filter_by_min_alt_reads, filter_by_min_total_reads
        and filter_by_min_vaf.  A threshold of 0 (or a VAF threshold
        below 1E-6) disables that filter; at least one filter must be
        enabled.  A call whose value is missing (None) fails an enabled
        filter.

        Rows that are left without any passing call are written to
        "discarded.txt" in the current directory.  They are written
        before any call is blanked, so they keep their original call
        strings.  In the remaining rows the failing calls are replaced
        with the empty string and the result is written to
        out_filename.

        network, out_attributes and num_cores are not used.

        Returns the metadata dictionary (always empty here).

        """
        import itertools
        from genomicode import SimpleVariantMatrix
        from genomicode import AnnotationMatrix
        from Betsy import module_utils as mlib

        summary_file = in_data.identifier
        metadata = {}

        min_alt_reads = mlib.get_user_option(user_options,
                                             "filter_by_min_alt_reads",
                                             not_empty=True,
                                             type=int)
        assert min_alt_reads >= 0 and min_alt_reads < 10000

        min_total_reads = mlib.get_user_option(user_options,
                                               "filter_by_min_total_reads",
                                               not_empty=True,
                                               type=int)
        assert min_total_reads >= 0 and min_total_reads < 10000

        min_vaf = mlib.get_user_option(user_options,
                                       "filter_by_min_vaf",
                                       not_empty=True,
                                       type=float)
        assert min_vaf >= 0.0 and min_vaf < 1.0
        # A VAF threshold this close to zero is treated as "no VAF filter".
        filter_by_vaf = min_vaf >= 1E-6

        # The VAF filter counts as a filter too; a VAF-only configuration
        # is valid because the loop below applies it independently.
        assert min_total_reads or min_alt_reads or filter_by_vaf, "No filter"

        matrix = SimpleVariantMatrix.read_as_am(summary_file)

        # copy.deepcopy is very slow.  Try to avoid it.
        # Strategy:
        # 1.  Make a list of the changes to be made.
        # 2.  Save the filtered rows.
        # 3.  Make the changes.
        # 4.  Save the non-filtered rows.
        I_remove = {}  # i -> 1
        call_remove = {}  # i -> (sample, caller) -> 1

        # Optimization: enumerate the (sample, caller) pairs and
        # normalize their headers only once, not once per row.
        sample_callers = list(
            itertools.product(matrix.samples, matrix.callers))
        sc2header = {}  # (sample, caller) -> header_h
        for sc in sample_callers:
            sample, caller = sc
            header = "%s___%s___Ref/Alt/VAF" % (sample, caller)
            header_h = matrix.normalize_header(header)
            assert header_h
            sc2header[sc] = header_h

        for i in range(matrix.num_annots()):
            has_calls = False  # whether this row keeps at least one call.
            for sc in sample_callers:
                call_str = matrix.header2annots[sc2header[sc]][i]
                if not call_str:
                    continue
                call = SimpleVariantMatrix._parse_call(call_str)

                filt = False
                # filter_by_min_alt_reads
                if min_alt_reads > 0 and \
                   (call.num_alt is None or call.num_alt < min_alt_reads):
                    filt = True
                # filter_by_min_total_reads
                if min_total_reads > 0 and \
                   (call.total is None or call.total < min_total_reads):
                    filt = True
                # filter_by_min_vaf
                if filter_by_vaf and \
                   (call.vaf is None or call.vaf < min_vaf):
                    filt = True

                if filt:
                    call_remove.setdefault(i, {})[sc] = 1
                else:
                    has_calls = True

            # If this coordinate has no more calls, then remove the
            # whole row.
            if not has_calls:
                I_remove[i] = 1
        I_remove = sorted(I_remove)

        # Write out a matrix of the discarded rows.
        filtered_matrix = AnnotationMatrix.rowslice(matrix, I_remove)
        SimpleVariantMatrix.write_from_am("discarded.txt", filtered_matrix)

        # Remove the calls.
        for i in call_remove:
            for sc in call_remove[i]:
                header_h = sc2header[sc]
                assert matrix.header2annots[header_h][i]
                matrix.header2annots[header_h][i] = ""

        # Which rows to keep.
        removed = set(I_remove)
        I_keep = [i for i in range(matrix.num_annots()) if i not in removed]
        filtered_matrix = AnnotationMatrix.rowslice(matrix, I_keep)
        SimpleVariantMatrix.write_from_am(out_filename, filtered_matrix)

        return metadata
Esempio n. 26
0
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_filename):
        """Merge per-sample RSEM result files into one expression table.

        in_data.identifier must be a directory; the RSEM result files in
        it are located with alignlib.find_rsem_result_files.  Depending
        on out_attributes["expression_of"] ("gene" or "isoform") the gene
        or the isoform result file of each sample is read, and depending
        on out_attributes["preprocess"] ("tpm" or "fpkm") its TPM or FPKM
        column is used.  All files must list identical gene and
        transcript IDs, in the same order.

        The output is a tab-delimited file with a header line, the
        columns gene_id and transcript_id(s) (transcript_id for
        isoforms), followed by one column per sample in sorted order.

        network, user_options and num_cores are not used.

        Returns the metadata dictionary (always empty here).

        """
        import os
        from genomicode import jmath
        from genomicode import AnnotationMatrix
        from genomicode import alignlib

        rsem_path = in_data.identifier
        assert os.path.exists(rsem_path)
        assert os.path.isdir(rsem_path)
        result_files = alignlib.find_rsem_result_files(rsem_path)
        assert result_files, "No .results files found."
        metadata = {}

        preprocess = out_attributes.get("preprocess")
        assert preprocess in ["tpm", "fpkm"]

        # Figure out whether to report gene or isoform expression.
        x = out_attributes["expression_of"]
        assert x in ["gene", "isoform"]
        get_genes = x == "gene"

        # The two kinds of result file name this column differently.
        transcript_header = "transcript_id(s)"
        if not get_genes:
            transcript_header = "transcript_id"

        # For each of the result files, get the expression data.
        sample2matrix = {}  # sample -> AnnotationMatrix
        for x in result_files:
            sample, gene_filename, isoform_filename = x
            filename = gene_filename
            if not get_genes:
                filename = isoform_filename
            # Name the sample; the filename itself is None here.
            assert filename is not None, \
                "Missing results file for sample: %s" % sample
            assert os.path.exists(filename)
            matrix = AnnotationMatrix.read(filename)
            # Do some checking on the matrix.
            assert "gene_id" in matrix.headers
            assert transcript_header in matrix.headers
            assert "TPM" in matrix.headers
            assert "FPKM" in matrix.headers
            sample2matrix[sample] = matrix
        assert sample2matrix, "No samples"

        # Pull out the gene and transcript IDs, and make sure every
        # sample reports the same ones.
        gene_id = transcript_id = None
        for matrix in sample2matrix.values():
            x1 = matrix["gene_id"]
            x2 = matrix[transcript_header]
            if gene_id is None:
                gene_id = x1
            if transcript_id is None:
                transcript_id = x2
            assert x1 == gene_id
            assert x2 == transcript_id
        assert gene_id
        assert transcript_id
        assert len(gene_id) == len(transcript_id)

        # Assemble into a gene expression matrix.
        header = "TPM"
        if preprocess == "fpkm":
            header = "FPKM"
        # Transposed matrix: the two ID columns, then one row per sample.
        t_data = [gene_id, transcript_id]
        samples = []
        for sample in sorted(sample2matrix):
            exp = sample2matrix[sample][header]
            assert len(exp) == len(gene_id)
            t_data.append(exp)
            samples.append(sample)

        data = jmath.transpose(t_data)
        header = ["gene_id", transcript_header] + samples
        data = [header] + data

        # Write out the data file.  The with block closes the file even
        # if a row fails to format.
        with open(out_filename, 'w') as handle:
            for x in data:
                handle.write("\t".join(map(str, x)) + "\n")

        return metadata
def add_coverage_to_svm(svm_file, coverage_file, outfile, is_rna_cov):
    """Append a per-sample coverage matrix to a SimpleVariantMatrix file.

    svm_file       SimpleVariantMatrix file to read.
    coverage_file  Table with a header line and the columns
                   Chrom  Pos  <Sample>  [<Sample> ...]; Pos is 1-based.
    outfile        Where the augmented SimpleVariantMatrix is written.
    is_rna_cov     If true, the new matrix is named "RNA Coverage"
                   rather than "Coverage".

    Only the samples present in both files are reported.  A variant
    with no coverage value for a sample gets an empty string.  The
    coverage values are copied through as read, without conversion.

    """
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix

    # Load the variants and pull out their coordinates.
    svm = SimpleVariantMatrix.read(svm_file)
    annots = svm.annot_matrix
    for required in ("Chrom", "Pos"):
        assert required in annots.headers
    chroms = annots["Chrom"]
    positions = [int(p) for p in annots["Pos"]]

    # Index the coverage table.
    coverage_by_coord = {}  # (chrom, pos) -> sample -> coverage string
    samples_with_cov = {}   # sample -> 1
    for row in filelib.read_row(coverage_file, header=1):
        key = (row.Chrom, int(row.Pos))
        per_sample = coverage_by_coord.setdefault(key, {})
        # The first two columns are Chrom and Pos; the rest are samples.
        for col in range(2, len(row._header)):
            name = row._header[col]
            value = row._cols[col]
            if value:
                per_sample[name] = value
                samples_with_cov[name] = 1

    # If the samples aren't sequenced at high coverage, it's possible
    # they just don't have reads at these positions.  Be lenient and
    # only insist that at least one sample is shared by both files.
    not_covered = [s for s in svm.samples if s not in samples_with_cov]
    assert len(not_covered) < len(svm.samples), (
        "SimpleVariantMatrix and coverage file have "
        "no common samples.")
    # Report the coverage for the samples at the intersection.
    shared_samples = [s for s in svm.samples if s in samples_with_cov]

    # One row per variant, one column per shared sample.
    rows = []
    for idx in range(annots.num_annots()):
        found = coverage_by_coord.get((chroms[idx], positions[idx]), {})
        rows.append([found.get(s, "") for s in shared_samples])

    # Add the matrix back to the simple variant matrix, column-wise.
    matrix_name = "RNA Coverage" if is_rna_cov else "Coverage"
    coverage_matrix = AnnotationMatrix.create_from_annotations(
        shared_samples, jmath.transpose(rows))
    svm.named_matrices.append((matrix_name, coverage_matrix))

    # Write to file.
    SimpleVariantMatrix.write(outfile, svm)
Esempio n. 28
0
def main():
    """Draw a scatter plot of columns from a tab-delimited file with R.

    Parses the command line, reads the data file as an
    AnnotationMatrix, and plots every --series (<x_header>;<y_header>)
    into plot_file (PNG by default, PDF if the name ends with ".pdf").
    Rows where either value of a series is empty are left out.
    Optional extras: connecting lines, an identity line, a regression
    line, point labels, a legend, and per-cluster colors.

    Prints the correlation coefficient of the plotted X and Y values
    and its p-value to standard output.

    """
    import os
    import argparse

    from genomicode import jmath
    from genomicode import AnnotationMatrix
    from genomicode import colorlib
    from genomicode import pcalib

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("datafile", help="Tab-delimited data file.")
    parser.add_argument(
        "plot_file",
        help="Name of image file, e.g. outfile.png.  "
        "Will generate PNG format by default.  If this file name ends with "
        ".pdf, will generate a PDF file instead.")

    group = parser.add_argument_group(title="Data Series")
    group.add_argument(
        "--series",
        action="append",
        help="Add a data series to the plot.  At least one series must be "
        "plotted.  Format: <x_header>;<y_header>")

    group = parser.add_argument_group(title="General Appearance")
    group.add_argument("--no_box",
                       action="store_true",
                       help="Turn off the box around the plot.")
    group.add_argument("--height",
                       type=int,
                       help="Height (in pixels) of the plot.")
    group.add_argument("--width",
                       type=int,
                       help="Width (in pixels) of the plot.")
    group.add_argument(
        "--mar_left",
        default=1.0,
        type=float,
        help="Scale margin at left of plot.  Default 1.0 (no scaling).")
    group.add_argument("--mar_bottom",
                       default=1.0,
                       type=float,
                       help="Scale margin at bottom of plot.  Default 1.0.")
    group.add_argument("--log_x",
                       action="store_true",
                       help="Plot the X-axis on a log scale.")
    group.add_argument("--log_y",
                       action="store_true",
                       help="Plot the Y-axis on a log scale.")
    group.add_argument(
        "--qq",
        action="store_true",
        help="Make a QQ-plot.  Will sort the values to be plotted.")

    group = parser.add_argument_group(title="Plot Labels")
    group.add_argument("--title", help="Put a title on the plot.")
    group.add_argument("--xlab", help="Label the X-axis.")
    group.add_argument("--ylab", help="Label the Y-axis.")
    group.add_argument("--add_regression",
                       action="store_true",
                       help="Put a regression line on the plot.")

    group = parser.add_argument_group(title="Legend")
    group.add_argument("--add_legend",
                       action="store_true",
                       help="Add a legend to the plot.")
    group.add_argument("--legend_inset", type=float, default=0.05, help="")
    LEGEND_LOCATIONS = [
        "bottomright",
        "bottom",
        "bottomleft",
        "left",
        "topleft",
        "top",
        "topright",
        "right",
        "center",
    ]
    group.add_argument("--legend_loc",
                       choices=LEGEND_LOCATIONS,
                       help="Where to draw the legend.")

    group = parser.add_argument_group(title="Point Appearance")
    group.add_argument("--scale_points",
                       default=1.0,
                       type=float,
                       help="Scale the size of the points.  Default 1.0")
    group.add_argument("--label_header",
                       help="Label each point with the values in this column.")
    group.add_argument("--label_size",
                       type=float,
                       help="Scale the size of the labels by this value.")
    group.add_argument("--label_pos",
                       default="top",
                       choices=["top", "bottom", "left", "right"],
                       help="Where to label the points.")

    group = parser.add_argument_group(title="Line Appearance")
    group.add_argument("--add_lines",
                       action="store_true",
                       help="Add lines that connect the points.")
    group.add_argument("--scale_lines",
                       default=1.0,
                       type=float,
                       help="Scale the thickness of the lines.  Default 1.0")

    group = parser.add_argument_group(title="Identity Line")
    group.add_argument("--add_identity_line",
                       action="store_true",
                       help="Add an identity line to the plot.")

    group = parser.add_argument_group(title="Colors")
    group.add_argument(
        "-c",
        "--cluster",
        action="append",
        help="Group samples into a cluster (e.g. -c 1-5); 1-based, inclusive.")
    group.add_argument(
        "--indexes_include_headers",
        "--iih",
        action="store_true",
        help="If not given (default), then index 1 is the first row "
        "with data.  If given, then index 1 is the very first row "
        "in the file, including the headers.")
    group.add_argument("--default_color",
                       help="Default color of points.  Format: #000000.")

    # Parse the input arguments.
    args = parser.parse_args()
    if not os.path.exists(args.datafile):
        parser.error("File not found: %s" % args.datafile)
    if args.width is not None:
        assert args.width > 10, "too small"
        assert args.width < 4096 * 16, "width too big"
    if args.height is not None:
        assert args.height > 10, "too small"
        assert args.height < 4096 * 16, "height too big"
    assert args.mar_bottom > 0 and args.mar_bottom < 10
    assert args.mar_left > 0 and args.mar_left < 10

    assert args.legend_inset >= 0 and args.legend_inset < 10
    if args.legend_loc is None:
        args.legend_loc = "bottomright"

    if args.default_color:
        assert len(args.default_color) == 7
        assert args.default_color[0] == "#"

    MATRIX = AnnotationMatrix.read(args.datafile, False)
    assert MATRIX.num_headers() and MATRIX.num_annots(), "Empty matrix."
    assert args.series, "Need to add a data --series to plot."
    if args.label_header:
        assert args.label_header in MATRIX.headers, \
               "header not found: %s" % args.label_header
    if args.label_size is not None:
        assert args.label_size > 0 and args.label_size <= 20
    assert args.scale_points > 0 and args.scale_points < 20
    assert args.scale_lines > 0 and args.scale_lines < 20

    series = _parse_series(MATRIX, args.series)
    cluster = None
    if args.cluster:
        cluster = _parse_cluster(args.cluster, args.indexes_include_headers,
                                 MATRIX)

    if len(series) > 1:
        assert not cluster, "Series and cluster not implemented."

    height = args.height or 2400
    width = args.width or 3200

    # Pull out the values and colors for the plot.
    default_color = "#000000"
    if args.default_color:
        default_color = args.default_color

    assert len(series) < len(colorlib.BREWER_QUALITATIVE_SET1)
    series_data = []  # list of (x_values, y_values, col) for each series
    series_rows = []  # list of the MATRIX row indexes kept for each series
    series_colors = []  # one color per series (used by the legend)
    for i in range(len(series)):
        x_header, y_header = series[i]
        x = MATRIX[x_header]
        y = MATRIX[y_header]
        # Keep only the rows where both X and Y have a value.
        I = [j for (j, (a, b)) in enumerate(zip(x, y)) if a and b]
        x = [float(x[j]) for j in I]
        y = [float(y[j]) for j in I]
        assert len(x) == len(y)
        c = default_color
        if len(series) > 1:
            rgb = colorlib.BREWER_QUALITATIVE_SET1[i]
            c = colorlib.rgb2hex(rgb, prefix="#")
        series_colors.append(c)
        series_rows.append(I)
        series_data.append((x, y, [c] * len(x)))

    # Merge all the data point for each series.
    x_values = []
    y_values = []
    col = []
    point_rows = []  # MATRIX row index of each plotted point
    for ((x, y, c), I) in zip(series_data, series_rows):
        x_values.extend(x)
        y_values.extend(y)
        col.extend(c)
        point_rows.extend(I)
    assert len(x_values) == len(y_values)
    assert len(x_values) == len(col)

    if args.qq:
        O = jmath.order(x_values)
        x_values = [x_values[i] for i in O]
        y_values = [y_values[i] for i in O]
        col = [col[i] for i in O]
        point_rows = [point_rows[i] for i in O]

    if cluster is not None:
        # NOTE(review): this presumably yields one color per matrix row;
        # if so it only lines up with the points when no row was dropped
        # and --qq did not reorder them.  Confirm against _parse_cluster.
        col_rgb = pcalib.choose_colors(cluster)
        col = [default_color] * len(col_rgb)
        for i in range(len(col_rgb)):
            if col_rgb[i] is None:
                continue
            col[i] = colorlib.rgb2hex(col_rgb[i], prefix="#")
        assert len(col) == len(x_values)

    # Start R and set up the environment.
    R = jmath.start_R()

    main = jmath.R_var("NA")
    if args.title:
        main = args.title
    sub = ""
    # With a single series, the column headers are the default axis
    # labels.  Each axis label is overridden by its own option.
    xlab = ""
    if len(series) == 1:
        xlab = x_header
    if args.xlab:
        xlab = args.xlab
    ylab = ""
    if len(series) == 1:
        ylab = y_header
    if args.ylab:
        ylab = args.ylab

    lwd_box = 2
    lwd_axis = 2
    lwd_regr = 3
    cex = 1.0 * args.scale_points
    cex_lab = 1.5
    cex_main = 2.0
    cex_sub = 1.0

    plot_log = ""
    if args.log_x:
        plot_log += "x"
    if args.log_y:
        plot_log += "y"

    assert x_values
    assert y_values
    jmath.R_equals(x_values, "X")
    jmath.R_equals(y_values, "Y")

    bm_type = "png16m"
    if args.plot_file.lower().endswith(".pdf"):
        bm_type = "pdfwrite"
    jmath.R_fn("bitmap",
               args.plot_file,
               type=bm_type,
               height=height,
               width=width,
               units="px",
               res=300)

    # Set the margins (bottom, left, top, right).
    base_mar = 5 * 1.2 * args.mar_bottom, 4 * 1.2 * args.mar_left, 4, 2
    mar = [m + 0.1 for m in base_mar]
    jmath.R_fn("par", mar=mar, RETVAL="op")

    # Draw the points only.  The axes, box, and labels are added at the
    # end.
    jmath.R_fn("plot",
               jmath.R_var("X"),
               jmath.R_var("Y"),
               main="",
               xlab="",
               ylab="",
               pch=19,
               cex=cex,
               log=plot_log,
               col=col,
               axes=jmath.R_var("FALSE"),
               RETVAL="x")

    if args.add_lines:
        lwd = 4 * args.scale_lines
        i = 0
        for (x, y, c) in series_data:
            # Cannot use c for the color.  It might've been changed by
            # --cluster.
            assert col and i < len(col)
            c = col[i:i + len(x)]
            i += len(x)

            # The "lines" function takes a scalar for col (except for
            # type=h, histogram vertical lines).  If there are
            # multiple colors, then split up the points based on the
            # colors.
            l_x, l_y, l_c = [], [], None
            for j in range(len(x)):
                if c[j] != l_c and l_x:
                    jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)
                    # Keep the previous point so that the segments
                    # will connect.
                    l_x = [l_x[-1]]
                    l_y = [l_y[-1]]
                l_x.append(x[j])
                l_y.append(y[j])
                l_c = c[j]
            if l_x:
                jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)

    if args.add_identity_line:
        lwd = 4

        x_min, x_max = min(x_values), max(x_values)
        y_min, y_max = min(y_values), max(y_values)

        # Draw y=x over the range that X and Y have in common.
        iden_min = max(x_min, y_min)
        iden_max = min(x_max, y_max)

        l_x = [iden_min, iden_max]
        l_y = l_x
        l_c = "#FF0000"
        jmath.R_fn("lines", l_x, l_y, lwd=lwd, col=l_c)

    if args.label_header:
        label_cex = 1
        if args.label_size is not None:
            label_cex = args.label_size
        pos2specifier = {
            "top": 3,
            "bottom": 1,
            "left": 2,
            "right": 4,
        }
        pos = pos2specifier[args.label_pos]
        # Take the label of the row that each plotted point came from,
        # so the labels stay aligned when rows were dropped or reordered.
        all_labels = MATRIX[args.label_header]
        point_labels = [all_labels[j] for j in point_rows]
        jmath.R_fn("text",
                   jmath.R_var("X"),
                   jmath.R_var("Y"),
                   labels=point_labels,
                   cex=label_cex,
                   pos=pos)

    # Calculate correlation, and other statistics.
    # TODO: Should calculate this for each series.
    r = jmath.R("cor(X, Y)")
    p_value = jmath.R("cor.test(X, Y)$p.value")
    r = r[0]
    p_value = p_value[0]
    print("R = %.2f" % r)
    print("p = %.2g" % p_value)

    # Add a regression line.
    if args.add_regression:
        jmath.R("fit <- lm(Y ~ X)")
        coef = jmath.R("fit$coefficients")
        assert len(coef) == 2
        b, m = coef  # intercept, slope
        x1 = min(x_values)
        y1 = x1 * m + b
        x2 = max(x_values)
        y2 = x2 * m + b
        jmath.R_fn("lines", [x1, x2], [y1, y2],
                   lwd=lwd_regr,
                   lty=2,
                   col="#C63F31")
        sub = "R=%.2f (p=%.2g)" % (r, p_value)
        header = "X", "Y", "R", "p"
        print("\t".join(header))
        x = xlab, ylab, r, p_value
        print("\t".join(map(str, x)))

    if args.add_legend:
        # One legend entry per series: its Y header and its color.
        leg = [x[1] for x in series]
        fill = series_colors
        # alpha does not seem to be supported here.
        jmath.R_fn("legend",
                   args.legend_loc,
                   legend=leg,
                   fill=fill,
                   inset=args.legend_inset)

    if not args.no_box:
        jmath.R_fn("box", lwd=lwd_box)
    jmath.R_fn("axis", 1, lwd=lwd_axis, **{"cex.axis": 1.5})
    jmath.R_fn("axis", 2, lwd=lwd_axis, **{"cex.axis": 1.5})
    jmath.R_fn("title",
               main=main,
               sub=sub,
               xlab=xlab,
               ylab=ylab,
               **{
                   "cex.lab": cex_lab,
                   "cex.main": cex_main,
                   "cex.sub": cex_sub
               })
    # Restore the graphics parameters saved when the margins were set.
    R("par(op)")
    jmath.R_fn("dev.off")
Esempio n. 29
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """Add missing gene ID / gene symbol columns to an expression matrix.

        The matrix at in_data.identifier is scored against the known
        platforms.  For each category in CATEGORIES that no confidently
        identified platform already covers, the first platform of that
        category for the same organism is queued, and the external
        "annotate_matrix" script is run to add the columns.  If nothing
        needs to be added, the input file is passed through unchanged.
        In both cases the header row is prettified and made unique
        before the result is written to outfile.

        Returns a metadata dict.  It contains a "commands" entry only
        when the annotation script was run.

        Raises AssertionError if no platform scores at least 0.75, or if
        the annotation script's output contains a traceback.
        """
        import StringIO
        import arrayio
        from genomicode import arrayplatformlib
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import AnnotationMatrix
        from Betsy import module_utils as mlib

        M = arrayio.read(in_data.identifier)
        metadata = {}

        # Add GENE_ID, GENE_SYMBOL, and DESCRIPTION.  Figure out which
        # platforms provide each one of these.
        CATEGORIES = [
            arrayplatformlib.GENE_ID,
            arrayplatformlib.GENE_SYMBOL,
            # biomaRt doesn't convert description.  So just ignore it
            # for now.
            # TODO: implement DESCRIPTION.
            #arrayplatformlib.DESCRIPTION,
        ]

        # Only keep the platforms that match the matrix with a high
        # enough score.
        scores = arrayplatformlib.score_matrix(M)
        scores = [x for x in scores if x.max_score >= 0.75]
        assert scores, "I could not identify any platforms."

        # Find all the categories not already in the matrix.
        platforms = [
            arrayplatformlib.find_platform_by_name(x.platform_name)
            for x in scores
        ]
        categories = [x.category for x in platforms]
        missing = [x for x in CATEGORIES if x not in categories]

        # The first score/platform pair is used as the reference for
        # the organism and for the header to annotate from.
        score = scores[0]
        platform = platforms[0]
        to_add = []  # list of platform names
        for category in missing:
            x = arrayplatformlib.PLATFORMS
            x = [x for x in x if x.category == category]
            x = [x for x in x if x.bm_organism == platform.bm_organism]
            x = [x for x in x if x.name != score.platform_name]
            # Take the first one, if any.
            if x:
                to_add.append(x[0].name)

        if to_add:
            annotate = mlib.get_config("annotate_matrix",
                                       which_assert_file=True)
            sq = parallel.quote
            cmd = [
                "python",
                sq(annotate),
                "--no_na",
                "--header",
                sq(score.header),
            ]
            for x in to_add:
                x = ["--platform", sq(x)]
                cmd.extend(x)
            cmd.append(in_data.identifier)
            cmd = " ".join(cmd)
            data = parallel.sshell(cmd)
            metadata["commands"] = [cmd]
            # The script's output is captured as text, so a failure
            # shows up as a traceback embedded in the data.
            assert data.find("Traceback") < 0, data
        else:
            # Nothing to add.  Pass the input through, closing the file
            # as soon as it has been read.
            with open(in_data.identifier) as in_handle:
                data = in_handle.read()

        # Clean up the headers.
        platform2pretty = {
            "Entrez_ID_human": "Gene ID",
            "Entrez_Symbol_human": "Gene Symbol",
            "Entrez_ID_mouse": "Gene ID",
            "Entrez_Symbol_mouse": "Gene Symbol",
        }
        # Use a context manager so the output is flushed and closed even
        # if reading or writing a row fails.
        with open(outfile, 'w') as handle:
            header_written = False
            for cols in filelib.read_cols(StringIO.StringIO(data)):
                if not header_written:
                    # Only the first row is a header.
                    cols = [platform2pretty.get(x, x) for x in cols]
                    cols = AnnotationMatrix.uniquify_headers(cols)
                    header_written = True
                handle.write("\t".join(cols) + "\n")

        return metadata
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        """Annotate the variants of a simple variant matrix with Annovar.

        The variant coordinates of the summary file at in_data.identifier
        are written to an Annovar input file in the current directory,
        Annovar is run on it, and the resulting annotation columns are
        attached to the matrix as a named matrix called "Annovar"
        (inserted first).  The annotated matrix is written to
        out_filename.

        Variants that Annovar did not report get empty strings for every
        annotation column.

        Returns a metadata dict whose "commands" entry holds the Annovar
        command that was run.

        Raises AssertionError if the Annovar log or output file is
        missing or empty, if the output has no rows or an unexpected
        header, or if it reports the same variant twice.
        """
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import SimpleVariantMatrix
        from genomicode import AnnotationMatrix
        from Betsy import module_utils as mlib

        summary_node = in_data
        summary_filename = summary_node.identifier
        metadata = {}

        buildver = mlib.get_user_option(user_options,
                                        "annovar_buildver",
                                        allowed_values=["hg19"],
                                        not_empty=True)

        # Name files.
        p, root, ext = mlib.splitpath(summary_filename)
        annovar_infile = "pos.txt"
        log_filename = "annovar.log"
        # Annovar takes a filestem, without the ".vcf".
        annovar_outstem = "annotations"
        # Produces file:
        # <annovar_outstem>.hg19_multianno.txt
        multianno_file = "%s.hg19_multianno.txt" % annovar_outstem

        # Make the infile for Annovar.
        # <chrom> <start> <end> <ref> <alt>
        # The context manager closes the file even if reading a row of
        # the summary file fails.
        with open(annovar_infile, 'w') as handle:
            for d in filelib.read_row(summary_filename, skip=2, header=1):
                # Start and end are both set to the variant position.
                x = d.Chrom, d.Pos, d.Pos, d.Ref, d.Alt
                handle.write("\t".join(x) + "\n")

        cmd = alignlib.make_annovar_command(annovar_infile,
                                            log_filename,
                                            annovar_outstem,
                                            buildver,
                                            vcf_input=False)
        parallel.sshell(cmd)
        metadata["commands"] = [cmd]

        filelib.assert_exists_nz(log_filename)
        filelib.assert_exists_nz(multianno_file)

        matrix = SimpleVariantMatrix.read(summary_filename)
        annot_matrix = matrix.annot_matrix
        chrom, pos = annot_matrix["Chrom"], annot_matrix["Pos"]
        ref, alt = annot_matrix["Ref"], annot_matrix["Alt"]
        # Positions are compared as ints, to match the keys of pos2d.
        pos = [int(x) for x in pos]

        # Read in the multianno output file.
        pos2d = {}  # (chrom, start, ref, alt) -> d
        anno_header = None
        for d in filelib.read_row(multianno_file, header=1):
            key = d.Chr, int(d.Start), d.Ref, d.Alt
            assert key not in pos2d, "Duplicate pos: %s" % str(key)
            pos2d[key] = d
            if not anno_header:
                anno_header = d._header
        assert anno_header

        # Multianno starts with:
        # Chr Start End Ref Alt
        # Ignore these.
        assert anno_header[:5] == ["Chr", "Start", "End", "Ref", "Alt"]
        headers = anno_header[5:]

        # Look up the Annovar row for each variant once, in matrix
        # order, rather than once per annotation column.  Entries are
        # None for variants missing from the Annovar output.
        rows = [pos2d.get(coord) for coord in zip(chrom, pos, ref, alt)]

        # Build one list of annotations per Annovar column.
        all_annots = []
        for i in range(5, len(anno_header)):
            annots = [d._cols[i] if d else "" for d in rows]
            all_annots.append(annots)
        x = AnnotationMatrix.create_from_annotations(headers, all_annots)
        matrix.named_matrices.insert(0, ("Annovar", x))

        SimpleVariantMatrix.write(out_filename, matrix)

        return metadata