Beispiel #1
0
 def __init__(
     self,
     path,
     lastcol=None,
     lastrow=None,
     colmeta=None,
     rowmeta=None,
     eps_noise=False,
 ):
     """Load a table and split it into data, headers and metadata tracks.

     path      -- passed straight to table() to load the data
     lastcol   -- if given, forwarded to tbl.head(..., invert=True,
                  transposed=True); presumably the last metadata column
     lastrow   -- if given, forwarded to tbl.head(..., invert=True);
                  presumably the last metadata row
     colmeta   -- iterable of column-metadata sources, or None
     rowmeta   -- iterable of row-metadata sources, or None
     eps_noise -- when true, convert entries to float and add
                  c_eps * random.random() to each before building the array

     Each metadata source is either an existing file path (read with
     col2dict) or a header name looked up in the untouched copy of the
     table. Headers without a value get c_str_none.
     """

     def collect(sources, heads, notice, lookup):
         """Return (values, names) for the given sources, or (None, None)."""
         if sources is None:
             return None, None
         values, names = [], []
         for source in sources:
             from_file = os.path.exists(source)
             if from_file:
                 warn(notice, source)
                 mapping = col2dict(source, value=1)
             else:
                 mapping = lookup(source)
             values.append([mapping.get(head, c_str_none) for head in heads])
             names.append(path2name(source) if from_file else source)
         return values, names

     self.tbl = table(path)
     # pristine copy: metadata is looked up here after self.tbl is trimmed
     self.bak = self.tbl.copy()

     # strip / save metadata
     if lastrow is not None:
         self.tbl.head(lastrow, invert=True)
     if lastcol is not None:
         self.tbl.head(lastcol, invert=True, transposed=True)
     self.row = self.tbl.rowheads[:]
     self.col = self.tbl.colheads[:]

     if eps_noise:
         self.tbl.float()
         self.tbl.apply_entries(lambda x: x + c_eps * random.random())
     self.dat = self.tbl.table2array()

     # colmetas from file / table (a column's metadata lives in a table row)
     self.colmeta, self.colmetaname = collect(
         colmeta,
         self.col,
         "Loading col metadata from file:",
         lambda field: self.bak.rowdict(field),
     )

     # rowmetas from file / table (a row's metadata lives in a table column)
     self.rowmeta, self.rowmetaname = collect(
         rowmeta,
         self.row,
         "Loading row metadata from file:",
         lambda field: self.bak.coldict(field),
     )
Beispiel #2
0
def main( ):
    """Ordinate the table's data and save a 2-D scatter plot of the result.

    The first two columns of the embedding are plotted. Marker shapes and
    colors optionally come from table rows named via args.shapeby and
    args.colorby, and optional biplots are drawn on the same axes.
    """

    args = get_args( )
    tbl = table( args.table )
    samples = tbl.table2array( args.last_metadata ).transpose( )
    dissim = distance_matrix( samples, args.dissimilarity )

    # "manual" uses ordinate_cmdscale; every other method goes to sklearn
    if args.method != "manual":
        embedding, varexp, goodness = ordinate_sklearn( dissim, method=args.method )
    else:
        embedding, varexp, goodness = ordinate_cmdscale( dissim )

    fig = plt.figure()
    fig.set_size_inches( 10, 6 )
    ax = plt.subplot( 111 )
    xs = embedding[:, 0]
    ys = embedding[:, 1]
    npoints = len( xs )

    # default marker is a circle unless a shape field was requested
    if args.shapeby is None:
        shapes = ["o"] * npoints
    else:
        field, path = args.shapeby
        shapes = shapeize( tbl.row( field ), path, ax )

    # default color is black unless a color field was requested
    if args.colorby is None:
        colors = ["black"] * npoints
    else:
        field, path = args.colorby
        colors = colorize( tbl.row( field ), path, ax )

    # one scatter call per point so each can carry its own marker
    for px, py, pcolor, pshape in zip( xs, ys, colors, shapes ):
        ax.scatter( px, py, color=pcolor, marker=pshape )
    ax.set_xlim( min( xs ), max( xs ) )
    ax.set_ylim( min( ys ), max( ys ) )
    mu.funcMargin( ax )

    if args.title is None:
        title = path2name( args.table )
    else:
        title = args.title
    ax.set_title( "%s | Goodness of fit = %.3f" % ( title, goodness ) )
    ax.set_xlabel( "Dimension 1 (%.1f%%)" % (100 * varexp[0] ) )
    ax.set_ylabel( "Dimension 2 (%.1f%%)" % (100 * varexp[1] ) )
    mu.funcSetTickParams( ax )

    for name in args.level_biplot:
        level_biplot( ax, embedding, tbl.row( name ) )
    for name in args.quant_biplot:
        quant_biplot( ax, embedding, tbl.row( name ), name )

    # narrow the axes to 70% of their width to leave room for the legend
    box = ax.get_position( )
    ax.set_position( [box.x0, box.y0, box.width * 0.7, box.height] )

    legend_opts = {
        "scatterpoints": 1,
        "fontsize": 8,
        "loc": 'center left',
        "bbox_to_anchor": (1, 0.5),
    }
    ax.legend( **legend_opts )

    plt.savefig( args.outfile )
)
args = parser.parse_args()

# ---------------------------------------------------------------
# load all data
# ---------------------------------------------------------------

dictTableData = {}
# modified for faster looking up 4/2/2015
dictFeatureIndex = {}

for iDex, strPath in enumerate(args.input):
    # progress goes to stderr; write() works the same on Python 2 and 3,
    # unlike the old "print >> sys.stderr" statement
    sys.stderr.write("Loading %s of %s : %s\n" %
                     (iDex + 1, len(args.input), strPath))
    # collect [key, value] pairs from the two configured columns
    aastrData = []
    # default column head is derived from the file path
    strColhead = path2name(strPath)
    with open(strPath) as fh:
        for astrItems in reader(fh):
            aastrData.append(
                [astrItems[args.key_col], astrItems[args.val_col]])
    if args.strip_comments:
        # startswith() tolerates an empty key, where key[0] raised IndexError
        aastrData = [
            astrRow for astrRow in aastrData
            if not astrRow[0].startswith("#")
        ]
    if args.use_headers:
        # the value cell of the first remaining row names the column
        strColhead = aastrData[0][1]
    if args.strip_headers:
        aastrData = aastrData[1:]
    if args.key_pattern:
        # keep only rows whose key matches the pattern
        aastrData = [
            astrRow for astrRow in aastrData
            if re.search(args.key_pattern, astrRow[0])
        ]
Beispiel #4
0
# openpyxl is a third-party dependency; exit with a readable message if it
# is missing. Only ImportError is caught so unrelated failures (including
# KeyboardInterrupt) are not misreported as a missing module.
try:
    import openpyxl as xl
except ImportError:
    die("This script requires the OPENPYXL module")

# argument parsing (python argparse)
parser = argparse.ArgumentParser()
parser.add_argument("xlsx", help="")
args = parser.parse_args()

# open the workbook named on the command line
wb = xl.load_workbook(filename=args.xlsx)

for ws in wb:

    basename = path2name(args.xlsx)
    sheet = ws.title
    sheet = re.sub("[^A-Za-z0-9]+", "_", sheet)
    newname = "{}.{}.tsv".format(basename, sheet)
    fh = open(newname, "w")
    ww = csv.writer(fh, csv.excel_tab)

    for row in ws.iter_rows():
        row2 = []
        for cell in row:
            value = cell.value
            if value is None:
                continue
            try:
                value = value.encode("utf-8")
            except:
Beispiel #5
0
# report how many inputs were given directly on the command line
say("Will load:", len(args.inputs), "gathered from command line")

# optionally extend the input list with one path per line of args.file
if args.file is not None:
    nFromCommandLine = len(args.inputs)
    with open(args.file) as fhList:
        args.inputs.extend(strLine.strip() for strLine in fhList)
    nFromFile = len(args.inputs) - nFromCommandLine
    say("Will load:", nFromFile, "additional files gathered from:",
        args.file)

for iDex, strPath in enumerate(args.inputs):
    # progress message; sys.stderr is no longer passed as a message item,
    # matching the other say() calls in this script
    say("Loading", iDex + 1, "of", len(args.inputs), ":", strPath)
    # collect [key, value] pairs from the two configured columns
    aastrData = []
    # column head: bare file name if requested, otherwise path2name()
    if args.use_full_names:
        strColhead = os.path.split(strPath)[1]
    else:
        strColhead = path2name(strPath)
    with open(strPath) as fh:
        for astrItems in reader(fh):
            # startswith() tolerates an empty first field, where
            # astrItems[0][0] raised IndexError
            if args.strip_comments and astrItems[0].startswith("#"):
                continue
            aastrData.append(
                [astrItems[args.key_col], astrItems[args.val_col]])
    if args.use_headers:
        # the value cell of the first remaining row names the column
        strColhead = aastrData[0][1]
    if args.strip_headers:
        aastrData = aastrData[1:]
    if args.key_pattern:
        # keep only rows whose key matches the pattern
        aastrData = [
            astrRow for astrRow in aastrData
            if re.search(args.key_pattern, astrRow[0])
        ]
Beispiel #6
0
def main():
    """Draw a heatmap of a table with optional trees, metadata and legend.

    Reads all options from get_args(), then in order: loads the DataFrame,
    sizes the layout (Dimensions), sorts/clusters columns and rows, applies
    the value transform, draws the heatmap and color bar, adds metadata
    tracks, break lines, grids, title, dots and the legend, labels the
    axes, saves the figure to args.output and optionally dumps the final
    column/row order next to it.
    """

    args = get_args()
    fig = plt.figure()
    dims = Dimensions()
    df = DataFrame(
        args.table,
        lastcol=args.lastcol,
        lastrow=args.lastrow,
        colmeta=args.colmeta,
        rowmeta=args.rowmeta,
        eps_noise=args.eps_noise,
    )

    # force labeling all features
    if args.force_labels:
        global c_max_lab
        c_max_lab = 1e6

    # dim overrides
    vscale = 1
    hscale = 1
    if args.colmeta is not None and args.metascale:
        dims.colmeta_r = len(args.colmeta)
    if args.rowmeta is not None and args.metascale:
        dims.rowmeta_c = len(args.rowmeta)
    if args.vscale:
        old = dims.heat_r
        # half the number of rows, rounded up, but at least 8
        new = int(len(df.row) / 2.0) + len(df.row) % 2
        new = max(new, 8)
        dims.heat_r = new
        vscale = new / float(old)
    if args.hscale:
        old = dims.heat_c
        new = int(len(df.col) / 2.0) + len(df.col) % 2
        # NOTE(review): unlike the vscale branch, heat_c is assigned BEFORE
        # the minimum of 12 is applied, so the clamp only affects hscale.
        # Left as-is because the intent is not visible here -- confirm.
        dims.heat_c = new
        new = max(new, 12)
        hscale = new / float(old)
    if args.cbar_extent is not None:
        dims.cbar_extent = args.cbar_extent
    if not args.debug:
        if args.title is None:
            dims.title_r = 0
        # no tree axes if last sort is on 1) file or 2) metadata or 3) nothing
        if os.path.exists(args.colsort[-1]) \
           or re.search("none|names|mean|metadata", args.colsort[-1]):
            dims.coltree_r = 0
        if os.path.exists(args.rowsort[-1]) \
           or re.search("none|names|mean|metadata", args.rowsort[-1]):
            dims.rowtree_c = 0
        if args.colmeta is None:
            dims.colmeta_r = 0
        if args.rowmeta is None:
            dims.rowmeta_c = 0
        if len(df.col) > c_max_lab and not args.hscale:
            dims.colnames_r = 1
        if len(df.row) > c_max_lab and not args.vscale:
            dims.rownames_c = 1
    dims.update()

    # manual overrides, each given as "attribute:integer"
    for o in args.overrides:
        p, v = o.split(":")
        v = int(v)
        setattr(dims, p, v)
    dims.update()

    # define figure
    fig.set_size_inches(
        args.hstretch * dims.csize * args.grid_inches / dims.scale,
        args.vstretch * dims.rsize * args.grid_inches / dims.scale)

    # setup axes
    axes = HeatmapAxes(dims)

    # cluster cols; only the result of the LAST sort is kept for the tree
    Z = None
    for metric in args.colsort:
        Z = df.colsort(metric, linkage=args.linkage)
    if Z is not None:
        sch.dendrogram(Z, ax=axes.coltree,
                       above_threshold_color="0.75",
                       color_threshold=0, )

    # cluster rows; only the result of the LAST sort is kept for the tree
    Z = None
    for metric in args.rowsort:
        Z = df.rowsort(metric, linkage=args.linkage)
    if Z is not None:
        sch.dendrogram(Z, ax=axes.rowtree, orientation="left",
                       above_threshold_color="0.75",
                       color_threshold=0, )

    # apply transform
    df.transform(args.transform)

    # check limits: warn when data fall outside user-specified limits
    poormin = False
    poormax = False
    vmin, vmax = (None, None) if args.limits is None else args.limits
    dmin, dmax = np.min(df.dat), np.max(df.dat)
    if vmin is None:
        vmin = dmin
    elif dmin < vmin:
        poormin = True
        n, p = acheck(df.dat, lambda x: x < vmin)
        warn("{} values ({:.2f}%) < vmin ({}), extreme: {}".format(
            n, 100 * p, vmin, dmin))
    if vmax is None:
        vmax = dmax
    elif dmax > vmax:
        poormax = True
        n, p = acheck(df.dat, lambda x: x > vmax)
        warn("{} values ({:.2f}%) > vmax ({}), extreme: {}".format(
            n, 100 * p, vmax, dmax))

    # add heatmap
    axes.heatmap.set_xlim(0, len(df.col))
    axes.heatmap.set_ylim(0, len(df.row))
    # imshow is similar to pcolorfast, but better centered
    if args.engine == "imshow":
        nr = len(df.row)
        nc = len(df.col)
        kwargs = {
            "interpolation": "none",
            "origin": "lower",
            "aspect": "auto",
            "extent": [0, nc, 0, nr]
        }
        pc = axes.heatmap.imshow(df.dat,
                                 cmap=args.cmap,
                                 vmin=vmin,
                                 vmax=vmax,
                                 **kwargs)
    # probably no reason to use this
    elif args.engine == "pcolorfast":
        pc = axes.heatmap.pcolorfast(df.dat,
                                     cmap=args.cmap,
                                     vmin=vmin,
                                     vmax=vmax)
    # use this if you want the individual heatmap cells to be editable shapes
    elif args.engine == "pcolormesh":
        pc = axes.heatmap.pcolormesh(df.dat,
                                     cmap=args.cmap,
                                     vmin=vmin,
                                     vmax=vmax)

    # add cmap bar; the label names the transform unless it is "none"
    fig.colorbar(pc, cax=axes.cbar)
    if args.transform == "none":
        cbar_label = args.units
    else:
        cbar_label = "{}( {} )".format(args.transform, args.units)
    axes.cbar.set_ylabel(cbar_label, size=c_font3)
    set_cbar_ticks(axes.cbar, pc.get_clim(), poormin=poormin, poormax=poormax)

    # add column metadata
    if df.colmeta is not None:
        colmeta_cmaps = axes.colmetaplot(df, args.colmeta_colors,
                                         args.max_levels)

    # add row metadata
    if df.rowmeta is not None:
        rowmeta_cmaps = axes.rowmetaplot(df, args.rowmeta_colors,
                                         args.max_levels)

    # column transition lines; a final metadata sort implies breaks on it
    if "metadata" in args.colsort[-1]:
        args.colbreaks = args.colsort[-1]
    if args.colbreaks is not None:
        lastsort = args.colbreaks
        # "name:K" selects the K-th (1-based) metadata track, default first
        index = 0 if ":" not in lastsort else \
                (int(lastsort.split(":")[1]) - 1)
        pos = []
        for i, value in enumerate(df.colmeta[index]):
            if i > 0 and df.colmeta[index][i - 1] != value:
                pos.append(i)
        for i in pos:
            mu.vline(axes.colmeta, i, color="black")
            mu.vline(axes.heatmap, i, color=args.break_color)

    # add row transition lines if ending on a metasort
    if "metadata" in args.rowsort[-1]:
        args.rowbreaks = args.rowsort[-1]
    if args.rowbreaks is not None:
        lastsort = args.rowbreaks
        index = 0 if ":" not in lastsort else \
                (int(lastsort.split(":")[1]) - 1)
        pos = []
        for i, value in enumerate(df.rowmeta[index]):
            if i > 0 and df.rowmeta[index][i - 1] != value:
                pos.append(i)
        for i in pos:
            mu.hline(axes.rowmeta, i, color="black")
            mu.hline(axes.heatmap, i, color=args.break_color)

    # add generic grids
    if "x" in args.grid:
        for i in range(1, len(df.col)):
            mu.vline(axes.heatmap, i, color=args.break_color)
    if "y" in args.grid:
        for i in range(1, len(df.row)):
            mu.hline(axes.heatmap, i, color=args.break_color)

    # title
    if args.title is not None:
        axes.set_title(args.title)

    # add dots
    dots_added = []
    if args.dots is not None:
        for p in args.dots:
            dots_added.append(add_dots(axes, df, p))

    # legend
    L = mu.Legendizer(axes.legend, vscale=0.7 / vscale)
    # col sort legend
    L.subhead("Col sort")
    for m in args.colsort:
        if "metadata" in m:
            # show the metadata track's name instead of its index
            i = 0
            if ":" in m:
                i = int(m.split(":")[1]) - 1
            m = "metadata:" + df.colmetaname[i]
        L.element("_", color="0.75", label=m)
    # row sort legend
    L.subhead("Row sort")
    for m in args.rowsort:
        if "metadata" in m:
            i = 0
            if ":" in m:
                i = int(m.split(":")[1]) - 1
            m = "metadata:" + df.rowmetaname[i]
        L.element("_", color="0.75", label=m)
    # col metadata legend; "other" and "none" levels sort after real levels
    levelorder = {c_str_other: 1, c_str_none: 2}
    if df.colmeta is not None:
        for n, c in zip(df.colmetaname[::-1], colmeta_cmaps[::-1]):
            L.subhead(n)
            for l in sorted(c, key=lambda x: [levelorder.get(x, 0), x]):
                color = c[l]
                L.element("s", color=color, label=l)
    # row metadata legend
    if df.rowmeta is not None:
        for n, c in zip(df.rowmetaname[::-1], rowmeta_cmaps[::-1]):
            L.subhead(n)
            for l in sorted(c, key=lambda x: [levelorder.get(x, 0), x]):
                color = c[l]
                L.element("s", color=color, label=l)
    if len(dots_added) > 0:
        L.subhead("Dots")
        for p, kwargs in dots_added:
            marker = kwargs.get("marker", "o")
            # size and marker are handled separately from the other kwargs
            kwargs = {
                k: v
                for k, v in kwargs.items() if k not in "s marker".split()
            }
            L.element(marker, label=path2name(p), **kwargs)
    # finalize
    L.draw()

    # cleanup
    # "-" disables labeling; compare by value, since identity ("is not")
    # against a string literal depends on interning and is unreliable
    if args.override_colnames != "-":
        axes.collabel(df,
                      args.collabel,
                      scaled=args.hscale,
                      path=args.override_colnames)
    if args.override_rownames != "-":
        axes.rowlabel(df,
                      args.rowlabel,
                      scaled=args.vscale,
                      path=args.override_rownames)
    if not args.debug:
        axes.clean()
    plt.subplots_adjust(wspace=0.3, hspace=0.3)
    plt.savefig(args.output, bbox_inches="tight")

    # logging: one header per line; write() runs on both Python 2 and 3,
    # unlike the "print >> fh" statement
    if args.dump_colsort_order:
        with open(args.output + ".colsort", "w") as fh:
            for item in df.col:
                fh.write("{}\n".format(item))
    if args.dump_rowsort_order:
        with open(args.output + ".rowsort", "w") as fh:
            for item in df.row:
                fh.write("{}\n".format(item))
Beispiel #7
0
#! /usr/bin/env python

import os, sys, re, glob, argparse
from zopy.table2 import table
from zopy.dictation import col2dict
from zopy.utils import path2name

# command line: 1) two-column mapping file, 2) input table, 3) output path
dictRename = col2dict( sys.argv[1], key=0, value=1 )
tableInput = table( sys.argv[2] )

# rewrite the column heads in three passes: reduce each to path2name(),
# keep the text before the first ".", then look the result up in the mapping
# (presumably a plain dict, so an unmapped head would raise -- verify)
for funcRename in (
    path2name,
    lambda strHead: strHead.partition( "." )[0],
    lambda strHead: dictRename[strHead],
):
    tableInput.apply_colheads( funcRename )

tableInput.dump( sys.argv[3] )
Beispiel #8
0
    die("This script requires the OPENPYXL module")

# argument parsing (python argparse)
parser = argparse.ArgumentParser()
parser.add_argument("tsv_files", nargs="+", help="")
parser.add_argument("--outfile", default=None)
args = parser.parse_args()

# one worksheet per input file, titled after the file
wb = xl.Workbook()
sheets = []
for i, p in enumerate(args.tsv_files):
    # the first file reuses the workbook's active sheet; later files get
    # freshly created sheets
    if i == 0:
        sheets.append(wb.active)
    else:
        sheets.append(wb.create_sheet())
    sheets[-1].title = path2name(p)

# copy every cell; anything that parses as a number is stored as a float
for p, ws in zip(args.tsv_files, sheets):
    for i, row in enumerate(iter_rows(p)):
        for j, val in enumerate(row):
            try:
                val = float(val)
            except (TypeError, ValueError):
                # not numeric: keep the original value. Only conversion
                # errors are swallowed, so real failures still surface.
                pass
            # cell coordinates are passed 1-based
            ws.cell(row=i + 1, column=j + 1, value=val)

if args.outfile is not None:
    outfile = args.outfile
elif len(args.tsv_files) == 1:
    outfile = "{}.xlsx".format(path2name(args.tsv_files[0]))
else:
Beispiel #9
0
    parser.add_argument('-i',
                        '--input',
                        nargs="+",
                        help='One or more MetaPhlAn clade profiles')
    parser.add_argument('-o', '--output', help='Marker PCL file')
    parser.add_argument('-e',
                        '--headers',
                        action="store_true",
                        help='File has headers')
    parser.add_argument('-g', '--grep', default=None, help='grep on clades')
    parser.add_argument('-x',
                        '--extension_groups',
                        default=1,
                        type=int,
                        help='.txt is 1, .cp.txt is 2, etc.')
    args = parser.parse_args()
    # load everything as nested dict [sample][marker]=value
    nesteddictData = {}
    for i, path in enumerate(args.input):
        print >> sys.stderr, "loading", i + 1, "of", len(args.input)
        name = path2name(path, args.extension_groups)
        nesteddictData[name] = funcLoadCladeProfile(path,
                                                    grep=args.grep,
                                                    headers=args.headers)
    # convert to a table, substituting 0 for missing values
    tableData = nesteddict2table(nesteddictData, empty=0)
    # transpose to get markers on the rows, unfloat, save as pcl
    tableData.transpose()
    tableData.unfloat()
    tableData.dump(args.output)
Beispiel #10
0
# convert every non-missing entry to float; None entries stay None
t.apply_entries(lambda x: None if x is None else float(x))

# derived features
# one Dark2 color per column head, sampled at i / (number of column heads)
nColheads = len(t.colheads)
aColors = [plt.cm.Dark2(i / float(nColheads)) for i in range(nColheads)]
# keep only the row values; the row heads are not needed for plotting
aaData = []
for rowhead, row in t.iter_rows():
    aaData.append(row)
logmin = args.logmin
logmax = args.logmax

# make plot
fig = plt.figure()
axes = plt.subplot(111)
stepplot(axes, aaData, colors=aColors)

# configure
axes.set_yscale("log")
axes.set_ylim(logmin, logmax)
# slanted, right-anchored tick labels taken from the column heads
dictTickStyle = {"rotation": 35, "rotation_mode": "anchor", "ha": "right"}
axes.xaxis.set_ticklabels(t.colheads, **dictTickStyle)
axes.set_title(utils.path2name(args.input))
axes.set_ylabel("Relative abundance")
mu.funcGrid(axes, xaxis=False, color="gray", linestyle=":")
mu.funcSetTickParams(axes)

# done
plt.tight_layout()
plt.savefig(args.output)