def rowdex(self, index):
    """Resolve <index> to an integer row position.

    Accepts either an in-range integer position or a rowhead name known
    to self.rowmap; any other value is reported as a fatal error.
    """
    if isinstance(index, int) and 0 <= index < len(self.rowheads):
        return index
    if index in self.rowmap:
        return self.rowmap[index]
    # unknown index: report fatally (die=True), then hard-stop as a backstop
    self.report("Bad row index <{}>.".format(index), die=True)
    zu.die()
    return None
def verify_dict(d):
    """Validate that <d> can back a Frame: string keys mapped to
    equal-length list values.

    Calls die() with a message if any check fails; returns None on success.
    """
    bad = False
    ll = set()
    # BUG FIX: iterate key/value pairs; `for k, v in d` iterates the KEYS
    # and tries to unpack each key, raising ValueError on normal dicts
    for k, v in d.items():
        if type(k) is not str:
            bad = True
        elif type(v) is not list:
            bad = True
        ll.add(len(v))
    # all value lists must share a single length
    if len(ll) != 1:
        bad = True
    if bad:
        die("Can't convert dict to Frame")
def __init__(
    self,
    path=None,
    fh=None,
    matrix=None,
    data=None,
    rowheads=None,
    colheads=None,
    headers=True,
    name=None,
    verbose=True,
    numeric=False,
):
    """Build a table from one of: a file path, an open handle, a raw
    matrix, or a data array with optional row/col headers.

    Exactly one loading option is expected; die()s when none is given.
    When <headers> is True the matrix's first row/column become the
    col/row headers and its [0][0] cell becomes the origin label.
    """
    # default attribute values, refined below
    self.data = None
    self.rowheads = None
    self.colheads = None
    self.axheads = []
    self.origin = c_default_origin
    self.name = name
    self.shape = []
    self.is_verbose = verbose
    self.is_numeric = numeric
    if data is not None:
        # data supplied directly; missing headers default to index ranges
        self.data = coerce(data)
        self.rowheads = rowheads if rowheads is not None \
            else range( self.data.shape[0] )
        self.colheads = colheads if colheads is not None \
            else range( self.data.shape[1] )
    else:
        # otherwise obtain a full matrix (headers included) from a source
        if path is not None:
            matrix = load_from_path(path)
        elif fh is not None:
            matrix = load_from_handle(fh)
        elif matrix is not None:
            matrix = coerce(matrix)
        else:
            die("No loading option.")
        if not headers:
            self.data = matrix
            self.rowheads = range(self.data.shape[0])
            self.colheads = range(self.data.shape[1])
        else:
            # first row / first column of the matrix are the headers
            self.data = matrix[1:, 1:]
            self.rowheads = list(matrix[1:, 0])
            self.colheads = list(matrix[0, 1:])
            self.origin = matrix[0][0]
    # NOTE: axheads ALIASES the rowheads/colheads objects (not copies)
    self.axheads = [self.rowheads, self.colheads]
    self.index()
def verify_list_of_lists(aa):
    """Validate that <aa> can back a Frame: equal-length list rows whose
    first (header) row holds only strings.

    Calls die() with a message if any check fails; returns None on success.
    """
    problems = False
    row_lengths = set()
    for row in aa:
        if type(row) is not list:
            problems = True
        row_lengths.add(len(row))
    # only the header (first) row's entries are required to be strings
    if any(type(item) is not str for item in aa[0]):
        problems = True
    if len(row_lengths) != 1:
        problems = True
    if problems:
        die("Can't convert lists to Frame")
def apfilter(self, minabund=0, minprev=1, **kwargs):
    """Abundance/prevalence filter: keep vectors with at least <minprev>
    entries >= <minabund>.

    A float <minprev> is interpreted per-vector as a fraction of the
    vector's length. Extra kwargs pass through to self.filter.
    """
    if not self.is_numeric:
        die("Can't apfilter non-numeric table:", self.name)
    if type(minprev) is float:
        self.report("will interpret minimum prevalence", minprev, "as a fraction of samples")
    transposed = kwargs.get("t", False) or kwargs.get("transposed", False)
    if not transposed:
        self.report("using apfilter on non-transposed table is not standard")
    def passes(vector, minabund=minabund, minprev=minprev):
        # resolve fractional prevalence against this vector's length
        if type(minprev) is float:
            minprev = int(len(vector) * minprev)
        hits = list(vector >= minabund).count(True)
        return hits >= minprev
    return self.filter(passes, vectors=True, **kwargs)
def __init__(self, source=None, data=None, colheads=None, rowheads=None,
             origin="#", missing="#N/A", headless=False, transposed=False,
             verbose=True):
    """Construct a table from one of several source types.

    <source> may be None (supply data/rowheads/colheads directly), a
    list of lists, a dict of dicts, an open file handle, or a path to
    an existing file. Anything else is a fatal error.
    """
    # set up object attributes
    self.source = source
    self.data = data
    self.rowheads = rowheads
    self.colheads = colheads
    self.origin = origin
    self.missing = missing
    self.transposed = transposed
    self.headless = headless
    self.verbose = verbose
    self.sourcename = None
    self.rowmap = None
    self.colmap = None
    # decide how to load table
    if source is None:
        self.sourcename = "<runtime>"
        # BUG FIX: identity test instead of `None in [...]`, which uses
        # elementwise == and misbehaves on numpy-like data arguments
        if any(k is None for k in (data, rowheads, colheads)):
            zu.die("If no <source> then <data/rowheads/colheads> required")
    elif isinstance(source, list):
        self.sourcename = "<list of lists>"
        self.load_from_nested_lists(source)
    elif isinstance(source, dict):
        self.sourcename = "<dict of dicts>"
        self.load_from_nested_dicts(source)
    elif hasattr(source, "read"):
        # BUG FIX: duck-typed handle check; `isinstance(source, file)`
        # is a NameError on python3 (`file` is py2-only)
        self.sourcename = "<file handle>"
        self.load_from_file_handle(source)
    elif os.path.exists(source):
        self.sourcename = "<{}>".format(source)
        self.load_from_file(source)
    else:
        # previously fell through silently and failed confusingly in remap()
        zu.die("Can't load table from source: {}".format(source))
    # set up other table elements
    self.remap()
    self.report("New table with size {}.".format(self.size()))
def limit(self, header, criterion, **kwargs):
    """ keep only rows where field value satisfies numerical criterion

    <criterion> is a string like ">= 3" or "< 0.5"; supported operators
    are <, <=, >, >=, ==. Extra kwargs pass through to self.filter.
    """
    M = re.search("([<>=]+) *(.*)", criterion)
    if M is None:
        zu.die(criterion, "is not a valid limit criterion")
    op, threshold = M.groups()
    threshold = float(threshold)
    choices = {
        "<": lambda x: float(x) < threshold,
        "<=": lambda x: float(x) <= threshold,
        ">": lambda x: float(x) > threshold,
        ">=": lambda x: float(x) >= threshold,
        # BUG FIX: the regex accepts "==" but it previously raised a raw
        # KeyError; equality is now supported
        "==": lambda x: float(x) == threshold,
    }
    if op not in choices:
        # BUG FIX: junk operators like "<>" matched the regex and raised
        # a bare KeyError; die with a clear message instead
        zu.die(criterion, "is not a valid limit criterion")
    selector = choices[op]
    function = lambda r: selector(self.get(r, header))
    self.report("applying <limit>, requiring", header, "to be", op, threshold, kwargs)
    return self.filter(function, **kwargs)
def row_stats(row, as_strings=True, engin=False):
    """Summarize a numeric vector as a dict of descriptive statistics.

    The key set must match c_props exactly; values are pretty-printed
    strings unless <as_strings> is False.
    """
    q1, q2, q3 = mquantiles(row)
    n = len(row)
    zeroes = len([k for k in row if abs(k) < c_eps])
    mu = mean(row)
    sigma = std(row)
    stats = {
        "N": n,
        "#0s": zeroes,
        "%0s": zeroes / float(n),
        "Sum": sum(row),
        "Min": min(row),
        "Q1": q1,
        "Q2_Med": q2,
        "Q3": q3,
        "Max": max(row),
        "Mean": mu,
        "StDev": sigma,
        # coefficient of variation; guarded against a zero mean
        "CfVar": sigma / mu if mu != 0 else 0,
    }
    # guard against drift between this function and the c_props listing
    if set(stats.keys()) != set(c_props):
        die("Inconsistent stat lists. Check code.")
    if as_strings:
        stats = {k: pretty(v, engin) for k, v in stats.items()}
    return stats
def __init__(self, gff_row, counter):
    """Parse one GFF row into a locus object.

    <gff_row> must align with c_gff_fields; the attributes column is
    exploded into a dict of system -> value.
    """
    # unique tag for locus based on position in GFF
    self.index = counter
    # gff fields
    if len(gff_row) != len(c_gff_fields):
        zu.die("Bad GFF row:", gff_row)
    # "." marks a missing value and is kept verbatim (no type coercion)
    for [fname, ftype], value in zip(c_gff_fields, gff_row):
        setattr(self, fname, ftype(value) if value != "." else value)
    # attributes
    temp = {}
    for item in self.attributes.split(";"):
        if "=" not in item:
            continue
        item = item.strip()
        # BUG FIX: split on the FIRST "=" only; attribute values may
        # themselves contain "=" (old code raised ValueError on them)
        system, value = item.split("=", 1)
        if system in temp:
            zu.say("Warning: Multiple definitions for system", system)
        temp[system] = value
    self.attributes = temp
    # no name by default
    self.name = self.attributes.get("ID", None)
    self.code = ":".join([str(self.start), str(self.end), self.strand])
def __init__(
    self,
    row,
    config="6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore",
):
    """Parse one tabular BLAST row according to <config> (outfmt spec).

    Derived fields (qcov/scov/mcov/strength) are computed when the
    required columns are present; every field is also set as an
    attribute on the object.
    """
    # drop the leading outfmt code ("6 ") if present
    if config[0:2] == "6 ":
        config = config[2:]
    fields = config.split(" ")
    if len(fields) != len(row):
        die("config doesn't match row")
    d = {}
    for raw, field in zip(row, fields):
        d[field] = format[field](raw)
    self.data = d
    # qcov: fractional query coverage
    d["qcov"] = None
    if contains("qstart qend qlen".split(), d):
        span = abs(d["qend"] - d["qstart"]) + 1
        d["qcov"] = span / float(d["qlen"])
    # scov: fractional subject coverage
    d["scov"] = None
    if contains("sstart send slen".split(), d):
        span = abs(d["send"] - d["sstart"]) + 1
        d["scov"] = span / float(d["slen"])
    # mcov: the smaller of the two coverages
    d["mcov"] = None
    if d["qcov"] is not None and d["scov"] is not None:
        d["mcov"] = min(d["qcov"], d["scov"])
    # strength: coverage-weighted percent identity
    d["strength"] = None
    if contains("pident mcov".split(), d):
        d["strength"] = d["mcov"] * d["pident"] / 100.0
    # mirror dict entries as attributes
    for f, v in d.items():
        setattr(self, f, v)
def remap(self):
    """Re-check table integrity and rebuild the head -> index maps."""
    # integrity checks (skipped entirely for an empty table)
    if len(self.data) > 0:
        widths = set([len(row) for row in self.data])
        if len(widths) != 1:
            zu.die("Table has inconsistent row lengths")
        if len(self.colheads) != len(self.data[0]):
            zu.die("Colheads do not align to data")
        if len(self.rowheads) != len(self.data):
            zu.die("Rowheads do not align to data")
    # auto-replace duplicate field names (modifies the lists in place)
    deduplicate(self.rowheads)
    deduplicate(self.colheads)
    # build fast lookup maps
    self.rowmap = dict((k, i) for i, k in enumerate(self.rowheads))
    self.colmap = dict((k, i) for i, k in enumerate(self.colheads))
#!/usr/bin/env python import os import sys import re import argparse import csv from zopy.utils import path2name, die try: import openpyxl as xl except: die("This script requires the OPENPYXL module") # argument parsing (python argparse) parser = argparse.ArgumentParser() parser.add_argument("xlsx", help="") args = parser.parse_args() wb = xl.load_workbook(filename=args.xlsx) for ws in wb: basename = path2name(args.xlsx) sheet = ws.title sheet = re.sub("[^A-Za-z0-9]+", "_", sheet) newname = "{}.{}.tsv".format(basename, sheet) fh = open(newname, "w") ww = csv.writer(fh, csv.excel_tab)
def __getitem__(self, field):
    """Subscript access to the parsed field dict; an unknown field name
    is a fatal error."""
    known = field in self.data
    if not known:
        die("Non-existing field:", field)
    return self.data[field]
def curve_check(labels, scores):
    """Sanity-check inputs for curve computation: labels must be drawn
    from {0, 1} and aligned one-to-one with scores."""
    allowed = set([0, 1])
    if not set(labels) <= allowed:
        die("non-binary [0, 1] labels vector")
    if len(labels) != len(scores):
        die("labels and scores have non-equal lengths")
    return None
def report(self, *args, **kwargs):
    """Emit a message prefixed with this table's source name; with
    die=True the message is fatal, otherwise it is printed only when
    the table is verbose."""
    message = " ".join([str(k) for k in args])
    items = [self.sourcename, "::", message]
    if kwargs.get("die", False):
        zu.die(*items)
    elif self.verbose:
        zu.say(*items)
for items in csv.reader(fh, dialect="excel-tab"): lengths2.append(len(items)) if headers2 is None and args.head2: headers2 = c_sep.join(items) continue key = items[args.key2] d.setdefault(key, {})["\t".join(items)] = 1 print("finished loading file2", file=sys.stderr) # make dummy line to add when join fails if len(set(lengths2)) != 1: warn("file2 lines have unequal lengths") if args.het: dummyline2 = c_na else: die() else: dummyline2 = "\t".join(c_na for k in range(lengths2[0])) if not args.head2: headers2 = dummyline2 # load first file, print join counts = Counter() lengths1 = [] hits = {} headers1 = None with (try_open(args.file1) if args.file1 != "-" else sys.stdin) as fh: for items in csv.reader(fh, dialect="excel-tab"): line = "\t".join(items) lengths1.append(len(items)) if headers1 is None and args.head1: