def save_tsv(p, vs):
    'Write sheet `vs` to Path `p` as TSV, using options.delimiter between fields.'
    delim = options.delimiter
    trdict = tsv_trdict()

    save_tsv_header(p, vs)  # header row written first; data rows appended below

    with p.open_text(mode='a') as fp:
        for r in Progress(vs.rows):
            dispvals = []
            for col in vs.visibleCols:
                v = col.getDisplayValue(r)
                if isinstance(v, TypedWrapper):  # error/exceptional cell value
                    if not options.save_errors:
                        # NOTE(review): skipping the cell shifts later fields left
                        # in this row's output — confirm this is intended.
                        continue
                    v = str(v)
                if trdict:
                    # translate delimiter/newline chars so fields stay intact
                    v = str(v).translate(trdict)
                dispvals.append(v)
            fp.write(delim.join(dispvals))
            fp.write('\n')

    status('%s save finished' % p)
def save_json(p, vs):
    'Save sheet `vs` to path `p` as one JSON array of row objects.'
    with p.open_text(mode='w') as fp:
        visible = vs.visibleCols
        encoder = _vjsonEncoder(indent=options.json_indent)
        rowdicts = [_rowdict(visible, row) for row in Progress(vs.rows, 'saving')]
        # iterencode streams the output instead of building one giant string
        for piece in encoder.iterencode(rowdicts):
            fp.write(piece)
def setValuesFromExpr(self, rows, expr):
    'Set this column in each of `rows` to the result of evaluating `expr` per row.'
    code = compile(expr, '<expr>', 'eval')  # compile once, evaluate per row
    vd.addUndoSetValues([self], rows)
    for row in Progress(rows, 'setting'):
        self.setValueSafe(row, self.sheet.evalexpr(code, row))
    self.recalc()
    status('set %d values = %s' % (len(rows), expr))
def genAllValues(rows, cols, trdict=None, format=True):
    '''Yield one list of display values per row in `rows`, with an entry per
    column in `cols`.

    Each raw value passes through, in order:
      1. the column's type constructor,
      2. (if `format`) the type's formatter with the column's fmtstr,
      3. (if `trdict`) str.translate with `trdict`.
    A None value becomes '' and skips the remaining transforms.  Errors while
    getting or transforming a value fall back to options.safe_error (if set)
    or the string form of the value/exception.
    '''
    if trdict is None:  # BUGFIX: avoid a mutable default argument
        trdict = {}

    transformers = collections.OrderedDict()  # list of transformers for each column in order
    for col in cols:
        transformers[col] = [col.type]
        if format:
            # bind formatter/fmtstr as defaults now, avoiding the late-binding
            # closure pitfall (all lambdas would otherwise see the last col)
            transformers[col].append(
                lambda v, fmtfunc=getType(col.type).formatter, fmtstr=col.fmtstr:
                    fmtfunc(fmtstr, '' if v is None else v))
        if trdict:
            transformers[col].append(lambda v, trdict=trdict: v.translate(trdict))

    options_safe_error = options.safe_error  # hoisted out of the row loop

    for r in Progress(rows):
        dispvals = []
        for col, transforms in transformers.items():
            try:
                dispval = col.getValue(r)
            except Exception as e:
                exceptionCaught(e)
                dispval = options_safe_error or str(e)

            try:
                for t in transforms:
                    if dispval is None:
                        dispval = ''
                        break
                    dispval = t(dispval)
            except Exception:
                # transform failed: fall back to the raw value's string form
                dispval = str(dispval)

            dispvals.append(dispval)
        yield dispvals
def gen_identify_duplicates(sheet):
    '''Yield `(row, is_dupe)` for every row of `sheet`, in order.

    A row is a duplicate when its tuple of values over the key columns (or
    over all visible columns, if the sheet has no key columns) was already
    seen on an earlier row.
    '''
    if sheet.keyCols:
        cols_to_check = sheet.keyCols
    else:
        warning("No key cols specified. Using all columns.")
        cols_to_check = sheet.visibleCols

    seen = set()
    for row in Progress(sheet.rows):
        key = tuple(col.getValue(row) for col in cols_to_check)
        if key in seen:
            yield (row, True)
        else:
            seen.add(key)
            yield (row, False)
def fillNullValues(col, rows):
    '''Fill null cells in `col` with the previous non-null value.

    Only rows in `rows` are filled, but the scan covers every row of the
    sheet so the "previous value" carries across unselected rows.  Registers
    an undo that restores the original cell values.
    '''
    _MISSING = object()   # sentinel: "no previous value seen yet"
    lastval = _MISSING
    oldvals = []  # (col, row, oldval) triples for undo
    nullfunc = isNullFunc()
    n = 0

    rowsToFill = list(rows)
    for r in Progress(col.sheet.rows, 'filling'):  # loop over all rows
        try:
            val = col.getValue(r)
        except Exception as e:
            val = e

        if nullfunc(val) and r in rowsToFill:
            # BUGFIX: was `if lastval:`, which skipped filling whenever the
            # previous non-null value was falsy (0, '', False).
            if lastval is not _MISSING:
                oldvals.append((col, r, val))
                col.setValue(r, lastval)
                n += 1
        else:
            lastval = val

    def _undo():
        for c, r, v in oldvals:
            c.setValue(r, v)
    vd.addUndo(_undo)

    col.recalc()
    status("filled %d values" % n)
def _reload(self=vs):
    # Rebuild the dedup sheet: keep only the first occurrence of each row.
    # NOTE: `vs` (bound as the default for self) and `sheet` are free
    # variables captured from the enclosing scope where this is defined.
    self.rows = []
    gen = gen_identify_duplicates(sheet)
    prog = Progress(gen, gerund="deduplicating", total=sheet.nRows)
    for row, is_dupe in prog:
        if not is_dupe:
            self.addRow(row)
def deleteBy(self, func):
    'Delete rows for which func(row) is true. Returns number of deleted rows.'
    oldrows = copy(self.rows)  # snapshot for undo
    oldidx = self.cursorRowIndex
    ndeleted = 0

    # find the first surviving row at/after the cursor, to re-place the cursor on it
    row = None
    while oldidx < len(oldrows):
        if not func(oldrows[oldidx]):
            row = self.rows[oldidx]
            break
        oldidx += 1

    # clear in place (not reassign) so other references to self.rows stay valid
    self.rows.clear()
    for r in Progress(oldrows, 'deleting'):
        if not func(r):
            self.rows.append(r)
            if r is row:  # identity check: this is the remembered cursor row
                self.cursorRowIndex = len(self.rows) - 1
        else:
            ndeleted += 1

    vd.addUndo(setattr, self, 'rows', oldrows)

    status('deleted %s %s' % (ndeleted, self.rowtype))
    return ndeleted
def save_jsonl(p, vs):
    'Write sheet `vs` to path `p` as JSON Lines: one JSON object per row.'
    with p.open_text(mode='w') as fp:
        cols = vs.visibleCols
        enc = _vjsonEncoder()
        for row in Progress(vs.rows):
            fp.write(enc.encode(_rowdict(cols, row)) + '\n')
def addRegexColumns(regexMaker, vs, colIndex, origcol, regexstr):
    '''Add new columns to `vs` after column index `colIndex`, one per capture
    produced by the row transform built by `regexMaker(regex, origcol)`.

    A sample of rows (options.default_sample_size, always including the
    cursor row) is probed to discover how many capture columns are needed.
    '''
    regexstr or vd.fail('regex required')
    regex = re.compile(regexstr, vs.regex_flags())
    func = regexMaker(regex, origcol)

    n = options.default_sample_size
    if n and n < len(vs.rows):
        exampleRows = random.sample(vs.rows, max(0, n - 1))  # -1 to account for included cursorRow
    else:
        exampleRows = vs.rows

    ncols = 0  # number of new columns added already
    for r in Progress(exampleRows + [vs.cursorRow]):
        try:
            m = func(r)
            if not m:
                continue
        except Exception as e:
            vd.exceptionCaught(e)
            continue  # BUGFIX: `m` is unbound when func(r) raises; skip this row

        for _ in range(len(m) - ncols):
            # bind ncols/func as defaults so each getter sees its own index
            c = Column(origcol.name + '_re' + str(ncols),
                       getter=lambda col, row, i=ncols, func=func: func(row)[i],
                       origCol=origcol)
            vs.addColumn(c, index=colIndex + ncols + 1)
            ncols += 1
def setValuesFromRegex(cols, rows, rex):
    'Apply the regex substitution `rex` to each of `cols` for every row in `rows`.'
    transforms = [regexTransform(c, rex) for c in cols]
    vd.addUndoSetValues(cols, rows)
    for row in Progress(rows, 'replacing'):
        for c, xform in zip(cols, transforms):
            c.setValueSafe(row, xform(c, row))
    for c in cols:
        c.recalc()
def unselect(self, rows, status=True, progress=True):
    "Unselect given rows. Don't show progress if progress=False; don't show status if status=False."
    self.addUndoSelection()
    before = self.nSelected
    iterable = Progress(rows, 'unselecting') if progress else rows
    for row in iterable:
        self.unselectRow(row)
    if status:
        vd.status('unselected %s/%s %s' % (before - self.nSelected, before, self.rowtype))
def reload(self):
    'Rebuild rows and columns from self.source; column layout comes from the first row.'
    self.rows = []
    self.columns = []
    if len(self.source) == 0:
        return
    first = True
    for row in Progress(self.source, total=len(self.source)):
        if first:
            self.set_columns_from_row(row)
            first = False
        self.addRow(row)
def gatherBy(self, func, gerund='gathering'):
    'Generate only rows for which the given func returns True.'
    # walk indices starting just before the cursor, wrapping around the sheet
    indices = rotateRange(self.nRows, self.cursorRowIndex - 1)
    for idx in Progress(indices, total=self.nRows, gerund=gerund):
        try:
            candidate = self.rows[idx]
            if func(candidate):
                yield candidate
        except Exception:
            pass  # deliberately skip rows whose predicate errors
def getValueRows(self, rows):
    'Generate (val, row) for the given `rows` at this Column, excluding errors and nulls.'
    isnull = isNullFunc()
    for row in Progress(rows, 'calculating'):
        try:
            val = self.getTypedValue(row)
            if not isnull(val):
                yield val, row
        except Exception:
            pass  # errors are deliberately excluded
def to_tabulate_table(sheet, fmt):
    'Render `sheet` as a text table via tabulate, using table format `fmt`.'
    if fmt not in SUPPORTED_FORMATS:
        fail(f"'{fmt}' is not a supported 'tabulate' format")

    headers = [c.name for c in sheet.visibleCols]
    rows = ([c.getDisplayValue(r) for c in sheet.visibleCols]
            for r in Progress(sheet.rows))
    return tabulate(rows, headers, tablefmt=fmt)
def addNewRows(sheet, n, idx):
    'Insert `n` new blank rows into `sheet` just after row index `idx`, with undo.'
    addedRows = {}
    for _ in Progress(range(n), 'adding'):
        newrow = sheet.newRow()
        addedRows[sheet.rowid(newrow)] = newrow
        sheet.addRow(newrow, idx + 1)

    @asyncthread
    def _removeRows():
        # bind sheet/addedRows as defaults so the lambda is self-contained
        sheet.deleteBy(lambda r, sheet=sheet, addedRows=addedRows: sheet.rowid(r) in addedRows)

    vd.addUndo(_removeRows)
def reload_json(self):
    'Load self.source as JSON: a dict becomes a single row; a list becomes one row per element.'
    self.rows = []
    with self.source.open_text() as fp:
        data = json.load(fp)
        if isinstance(data, dict):
            self.rows = [data]
            # derive one column per top-level key of the single row
            self.columns = []
            for key in self.rows[0]:
                self.addColumn(ColumnItem(key, type=deduceType(self.rows[0][key])))
        else:
            self.rows = []
            for row in Progress(data):
                self.addRow(row)
def reload(self): sheets = self.sources # first item in joined row is the key tuple from the first sheet. # first columns are the key columns from the first sheet, using its row (0) self.columns = [] for i, c in enumerate(sheets[0].keyCols): self.addColumn( SubrowColumn(c.name, ColumnItem(c.name, i, type=c.type, width=c.width), 0)) self.setKeys(self.columns) for sheetnum, vs in enumerate(sheets): # subsequent elements are the rows from each source, in order of the source sheets ctr = collections.Counter(c.name for c in vs.nonKeyVisibleCols) for c in vs.nonKeyVisibleCols: newname = c.name if ctr[c.name] == 1 else '%s_%s' % (vs.name, c.name) self.addColumn(SubrowColumn(newname, c, sheetnum + 1)) rowsBySheetKey = {} rowsByKey = {} groupRowsByKey(sheets, rowsBySheetKey, rowsByKey) self.rows = [] with Progress(gerund='joining', total=len(rowsByKey)) as prog: for k, combinedRows in rowsByKey.items(): prog.addProgress(1) if self.jointype == 'full': # keep all rows from all sheets for combinedRow in combinedRows: self.addRow(combinedRow) elif self.jointype == 'inner': # only rows with matching key on all sheets for combinedRow in combinedRows: if all(combinedRow): self.addRow(combinedRow) elif self.jointype == 'outer': # all rows from first sheet for combinedRow in combinedRows: if combinedRow[1]: self.addRow(combinedRow) elif self.jointype == 'diff': # only rows without matching key on all sheets for combinedRow in combinedRows: if not all(combinedRow): self.addRow(combinedRow)
def save_dot(vd, p, vs):
    # Save graph sheet `vs` (keyCols[0] = source node, keyCols[1] = dest node)
    # to path `p` in Graphviz DOT format, coloring edges by the joined
    # non-numeric column values and emitting a legend subgraph.
    unusedColors = 'orange green purple cyan red blue black'.split()
    assignedColors = {}  # edgetype string -> color name

    srccol = vs.keyCols[0]
    dstcol = vs.keyCols[1]
    with p.open_text(mode='w') as fp:
        print('graph { concentrate=true;', file=fp)
        for row in Progress(vs.rows, 'saving'):
            src = srccol.getTypedValue(row)
            dst = dstcol.getTypedValue(row)
            if not is_valid(src) or not is_valid(dst):
                continue  # skip edges with missing endpoints

            # node ids must be DOT-safe; fall back to the raw value if cleaning empties it
            downsrc = clean_to_id(str(src)) or src
            downdst = clean_to_id(str(dst)) or dst
            # edge "type" is all non-numeric non-key values joined; drives coloring
            edgenotes = [c.getTypedValue(row) for c in vs.nonKeyVisibleCols if not isNumeric(c)]
            edgetype = '-'.join(str(x) for x in edgenotes if is_valid(x))
            color = assignedColors.get(edgetype, None)
            if not color:
                color = unusedColors.pop() if unusedColors else 'black'
                assignedColors[edgetype] = color

            if options.graphviz_edge_labels:
                # numeric values (SI-formatted) become the edge label
                nodelabels = [wrapply(SI, c.getTypedValue(row)) for c in vs.nonKeyVisibleCols if isNumeric(c)]
                label = '/'.join(str(x) for x in nodelabels if is_valid(x))
            else:
                label = ''
            print('\t%s[label="%s"];' % (downsrc, src), file=fp)
            print('\t%s[label="%s"];' % (downdst, dst), file=fp)
            print('\t%s -- %s[label="%s", color=%s];' % (downsrc, downdst, label, color), file=fp)

        print('label="%s"' % vs.name, file=fp)
        print('node[shape=plaintext];', file=fp)
        print('subgraph cluster_legend {', file=fp)
        print('label="Legend";', file=fp)
        for i, (k, color) in enumerate(assignedColors.items()):
            print('key%d[label="%s", fontcolor=%s];' % (i, k, color), file=fp)
        print('}', file=fp)  # legend subgraph
        print('}', file=fp)
def select(self, rows, status=True, progress=True):
    "Bulk select given rows. Don't show progress if progress=False; don't show status if status=False."
    self.addUndoSelection()
    before = self.nSelected
    if options.bulk_select_clear:
        self.clearSelected()
    iterable = Progress(rows, 'selecting') if progress else rows
    for row in iterable:
        self.selectRow(row)
    if status:
        if options.bulk_select_clear:
            msg = 'selected %s %s%s' % (self.nSelected, self.rowtype, ' instead' if before > 0 else '')
        else:
            msg = 'selected %s%s %s' % (self.nSelected - before, ' more' if before > 0 else '', self.rowtype)
        vd.status(msg)
def rotateRange(n, idx, reverse=False):
    'Wraps an iter starting from idx. Yields indices from idx to n and then 0 to idx.'
    if reverse:
        first = range(idx - 1, -1, -1)
        second = range(n - 1, idx - 1, -1)
    else:
        first = range(idx + 1, n)
        second = range(0, idx + 1)
    wrapped = False
    with Progress(total=n) as prog:
        for i in itertools.chain(first, second):
            prog.addProgress(1)
            # announce the wrap-around exactly once, on entering the second range
            if not wrapped and i in second:
                status('search wrapped')
                wrapped = True
            yield i
def iterload(self):
    'Yield one list of field values per row parsed from self.source.'
    fielddelim = self.options.delimiter
    rowdelim = self.options.row_delimiter
    with self.source.open_text() as fp:
        with Progress(total=filesize(self.source)) as prog:
            for line in splitter(fp, rowdelim):
                if not line:
                    continue
                prog.addProgress(len(line))
                fields = list(line.split(fielddelim))
                # pad short rows so every row has at least nVisibleCols entries
                missing = self.nVisibleCols - len(fields)
                if missing > 0:
                    fields.extend([None] * missing)
                yield fields
def normalize_column_names(sheet):
    '''Normalize the names of all non-hidden columns on the active sheet.'''
    originals = []  # prior names, saved for undo
    names = gen_normalize_names(c.name for c in sheet.visibleCols)
    prog = Progress(names, gerund="normalizing", total=sheet.nVisibleCols)
    for idx, newname in enumerate(prog):
        column = sheet.visibleCols[idx]
        originals.append(column.name)
        column.name = newname

    @asyncthread
    def undo():
        for idx, oldname in enumerate(originals):
            sheet.visibleCols[idx].name = oldname

    vd.addUndo(undo)
def sort(self):
    'Sort rows according to the current self._ordering.'
    try:
        with Progress(gerund='sorting', total=self.nRows) as prog:
            def sortkey(r):
                # Build a composite key across all ordering columns;
                # Reversor inverts comparisons for descending columns.
                ret = []
                for col, reverse in self._ordering:
                    if isinstance(col, str):
                        col = self.column(col)  # resolve a column name to its Column
                    val = col.getTypedValue(r)
                    ret.append(Reversor(val) if reverse else val)
                prog.addProgress(1)  # progress advances once per keyed row
                return ret

            # must not reassign self.rows: use .sort() instead of sorted()
            self.rows.sort(key=sortkey)
    except TypeError as e:
        # mixed/incomparable types in a column abort the sort midway
        vd.warning('sort incomplete due to TypeError; change column type')
        vd.exceptionCaught(e, status=False)
def reload_sync(self):
    'Perform synchronous loading of TSV file, discarding header lines.'
    header_lines = options.get('header', self)

    with self.source.open_text() as fp:
        # get one line anyway to determine number of columns
        lines = list(getlines(fp, int(header_lines) or 1))
        headers = [L.split(options.delimiter) for L in lines]
        if header_lines <= 0:
            # no headers: anonymous columns, one per field of the probe line
            self.columns = [ColumnItem('', i) for i in range(len(headers[0]))]
        else:
            # column name = that position's header cells joined by literal '\n'
            self.columns = [
                ColumnItem('\\n'.join(x), i)
                for i, x in enumerate(zip(*headers[:header_lines]))
            ]
        lines = lines[header_lines:]  # in case of header_lines == 0

        self._rowtype = namedlist('tsvobj', [c.name for c in self.columns])
        self.recalc()

        delim = options.delimiter
        self.rows = []
        with Progress(total=self.source.filesize) as prog:
            # consume the leftover probe lines, then the rest of the file
            for L in itertools.chain(lines, getlines(fp)):
                row = L.split(delim)
                ncols = self._rowtype.length()  # current number of cols
                if len(row) > ncols:
                    # row is wider than the rowtype: grow it with anonymous columns
                    newcols = [ColumnItem('', len(row) + i, width=8)
                               for i in range(len(row) - ncols)]
                    self._rowtype = namedlist(self._rowtype.__name__,
                                              list(self._rowtype._fields) + ['_' for c in newcols])
                self.addRow(self._rowtype(row))
                prog.addProgress(len(L))
def select_duplicate_rows(sheet, duplicates=True):
    '''Select every row of `sheet` that duplicates a prior row; with
    `duplicates=False`, select the non-duplicate rows instead.'''
    before = len(sheet.selectedRows)
    pairs = Progress(gen_identify_duplicates(sheet), gerund="selecting", total=sheet.nRows)
    for row, is_dupe in pairs:
        if is_dupe == duplicates:
            sheet.selectRow(row)
    sel_count = len(sheet.selectedRows) - before
    more_str = " more" if before > 0 else ""
    vd.status(f"selected {sel_count}{more_str} {sheet.rowtype}")
def groupRowsByKey(sheets, rowsBySheetKey, rowsByKey):
    # Populate two mappings (both mutated in place for the caller):
    #   rowsBySheetKey[sheet][key] -> list of that sheet's rows with that key
    #   rowsByKey[key] -> list of combined rows [key, row_sheet0, row_sheet1, ...]
    # Progress total is 2x row count: one pass to tally, one pass to combine.
    with Progress(total=sum(len(vs.rows) for vs in sheets) * 2) as prog:
        for vs in sheets:
            # tally rows by keys for each sheet
            rowsBySheetKey[vs] = collections.defaultdict(list)
            for r in vs.rows:
                prog.addProgress(1)
                key = joinkey(vs, r)
                rowsBySheetKey[vs][key].append(r)

        for vs in sheets:
            for r in vs.rows:
                prog.addProgress(1)
                key = joinkey(vs, r)
                if key not in rowsByKey:  # gather for this key has not been done yet
                    # multiplicative for non-unique keys: cartesian product of
                    # each sheet's rows for this key (None when a sheet lacks it)
                    rowsByKey[key] = []
                    for crow in itertools.product(*[rowsBySheetKey[vs2].get(key, [None])
                                                    for vs2 in sheets]):
                        rowsByKey[key].append([key] + list(crow))
def ExtendedSheet_reload(self, sheets):
    '''"Extend" join: keep all rows of the first sheet, appending columns from
    the other sheets, looked up by key via ExtendedColumn.'''
    self.joinSources = sheets

    # first item in joined row is the key tuple from the first sheet.
    # first columns are the key columns from the first sheet, using its row (0)
    self.columns = []
    for c in sheets[0].keyCols:
        self.addColumn(copy(c))
    self.setKeys(self.columns)

    for c in sheets[0].nonKeyVisibleCols:
        self.addColumn(copy(c))

    for sheetnum, vs in enumerate(sheets[1:]):
        # subsequent elements are the rows from each source, in order of the source sheets;
        # names are always sheet-prefixed, so no name-collision counting is needed
        # (removed an unused Counter over column names)
        for c in vs.nonKeyVisibleCols:
            newname = '%s_%s' % (vs.name, c.name)
            newcol = ExtendedColumn(newname, sheetnum=sheetnum + 1, sourceCol=c)
            self.addColumn(newcol)

    self.rowsBySheetKey = {}  # [srcSheet][key] -> list(rowobjs from sheets[0])
    rowsByKey = {}  # [key] -> [key, rows0, rows1, ...]

    groupRowsByKey(sheets, self.rowsBySheetKey, rowsByKey)

    self.rows = []

    with Progress(gerund='joining', total=len(rowsByKey)) as prog:
        for k, combinedRows in rowsByKey.items():
            prog.addProgress(1)
            for combinedRow in combinedRows:
                if combinedRow[1]:  # keep only rows present on the first sheet
                    self.addRow(combinedRow[1])
def toggle(self, rows):
    'Toggle selection of given `rows`.'
    self.addUndoSelection()
    total = len(self.rows)
    for row in Progress(rows, 'toggling', total=total):
        # unselectRow returns falsy when the row was not selected; select it then
        if not self.unselectRow(row):
            self.selectRow(row)