def aggregate(self, aggregations=None, parentKey=None):
    '''Collapse this leaf into a single-row HierarchyLeaf.

    Parameters:
      aggregations - dict of field name -> method taking (parentKey, leaf)
          and returning that field's aggregate value. Falsy/None yields an
          empty HierarchyLeaf.
      parentKey - AttributeDict of key values accumulated from the
          enclosing Hierarchy levels, passed through to each aggregator.
    '''
    # None sentinels replace the original mutable defaults ({} and
    # AttributeDict()), which would be shared across every call.
    if parentKey is None:
        parentKey = AttributeDict()
    if not aggregations:
        return HierarchyLeaf()
    return HierarchyLeaf([
        AttributeDict((field, aggMethod(parentKey, self))
                      for field, aggMethod in aggregations.items())
    ])
def join(self, other, joinParams=None, otherFieldPrefix='', joinType=JoinType.LEFT_OUTER_JOIN):
    '''dataTable.join(otherTable, joinParams, otherFieldPrefix='')
    returns a new table with rows in the first table joined with rows in the second table,
    using joinParams to map fields in the first to fields in the second
    Parameters:
      other - the table to join
      joinParams - a dictionary of <field in self> to <field in other>.
          Defaults to "natural join", merging common headers
      otherFieldPrefix - a string to prepend to the fields added from the second table
      joinType - the instance of JoinType which indicates if items should be
          included in one data table which aren't in the other
    '''
    if joinParams is None:
        # Natural join: match on every header the two tables share.
        joinParams = {h: h for h in self.headers() if h in other.headers()}
    elif not isinstance(joinParams, dict):
        raise Exception(
            "joinParams must be a dictionary of <field in self> to <field in other>"
        )
    # BUGFIX: self's join fields are the KEYS of joinParams (the original
    # took the values, so joinParams[h] raised KeyError for any mapping
    # whose values weren't also keys). The two lists are index-aligned so
    # bucket keys and row keys are built in the same field order.
    selfJoinHeaders = list(joinParams.keys())
    otherJoinHeaders = [joinParams[h] for h in selfJoinHeaders]
    # BUGFIX: headers contributed by `other` must be derived from ALL of
    # other's headers: join fields keep their name, the rest get the
    # prefix. (The original iterated otherJoinHeaders, which made the
    # condition vacuous and projected the prefixed fields away.)
    newOtherHeaders = {
        (v if v in otherJoinHeaders else otherFieldPrefix + v)
        for v in other.headers()
    }
    otherBuckets = other.extend(
        lambda row: {
            otherFieldPrefix + v: row[v]
            for v in other.headers() if v not in otherJoinHeaders
        }).project(newOtherHeaders).bucket(*otherJoinHeaders)
    # Template rows used to pad the side that has no match.
    emptyOtherRow = AttributeDict({
        otherFieldPrefix + v: None
        for v in other.headers() if v not in otherJoinHeaders
    })
    emptySelfRow = AttributeDict({
        header: None
        for header in self.headers() if header not in selfJoinHeaders
    })
    otherKeysSeen = set()

    def it():
        for row in self:
            # BUGFIX: iterate the aligned header list — iterating the dict
            # itself yields bare keys and the 2-tuple unpacking failed.
            rowKey = tuple(row[selfHeader] for selfHeader in selfJoinHeaders)
            otherKeysSeen.add(rowKey)
            if rowKey in otherBuckets:
                for otherRow in otherBuckets[rowKey]:
                    yield row + otherRow
            elif joinType.leftOuter:
                yield emptyOtherRow + row
        if joinType.rightOuter:
            # Emit other-side rows whose key never appeared in self.
            for otherKey, otherBucket in otherBuckets.items():
                if otherKey not in otherKeysSeen:
                    for row in otherBucket:
                        yield emptySelfRow + row

    return DataTableStream(it(), set(self.headers()).union(newOtherHeaders))
def originalToRows(self, keyFields):
    '''Rebuild full "to" rows: diff field values merged with this
    instance's key values mapped onto the given keyFields.'''
    keyValues = dict(zip(keyFields, self.key))
    rows = []
    for toRow in (self.toRow or []):
        diffValues = AttributeDict({
            field: toRow[idx]
            for field, idx in self.diffFields.items()
        })
        rows.append(diffValues + keyValues)
    return rows
def aggregate(self, groupBy, aggregations=None):
    '''return an aggregation of the data grouped by a given set of fields.
    Parameters:
      groupBy - the set of fields to group
      aggregations - a dict of field name -> aggregate method, where the
          method takes an intermediate DataTable and returns the value for
          that field for that row.
    '''
    # None sentinel replaces the original mutable default argument ({}),
    # which would be shared across every call.
    if not aggregations:
        # No aggregators: grouping degenerates to the distinct key tuples.
        return self.project(groupBy).distinct()
    accumulatedRows = {}
    for row in self:
        key = tuple(row[field] for field in groupBy)
        if key not in accumulatedRows:
            # First row for this key: open a fresh bucket per aggregator.
            accumulatedRows[key] = {
                a: agg.newBucket(row)
                for a, agg in aggregations.items()
            }
        accRow = accumulatedRows[key]
        for a, agg in aggregations.items():
            accRow[a] = agg.addRow(row, accRow[a])
    newData = []
    # Emit one row per group, in sorted key order.
    for key, accRow in sorted(accumulatedRows.items()):
        newData.append(
            AttributeDict(zip(groupBy, key)) + {
                a: agg.finalize(accRow[a])
                for a, agg in aggregations.items()
            })
    return DataTable(newData)
def bucket(self, *fields):
    '''Returns a dict of bucket -> DataTable of rows matching that bucket

    The bucket key is the tuple of the row's values for the given fields.
    '''
    # defaultdict(list) is the idiomatic form of defaultdict(lambda: []).
    buckets = defaultdict(list)
    for data in self.__data:
        key = tuple(data[field] for field in fields)
        buckets[key].append(data)
    return AttributeDict(
        (key, DataTable(bucket)) for key, bucket in buckets.items())
def renameColumn(self, column, newName):
    '''rename the column in place'''
    def swapHeader(header):
        # Map the old column name to the new one; leave everything else alone.
        return newName if header == column else header

    def renameKeys(row):
        return AttributeDict(
            (swapHeader(key), value) for key, value in row.items())

    newHeaders = {swapHeader(header) for header in self.__headers}
    return self.transform(renameKeys, newHeaders)
def __init__(self, data=None, parseMethod=None):
    '''Create a data table from the given data
    data may be one of the following:
    A sequence of dictionaries, where all of the dictionaries share common keys
    A sequence of sequences where the first item is the list of headers
    Another DataTable instance, which will create a deep copy
    A string which may be parsed into one of the previous by calling parseMethod on the string.
    '''
    if isinstance(data, DataTable):
        # Deep copy: fresh columns and fresh row dicts.
        self.__headers = {
            h: DataColumn(self, c)
            for h, c in data.__headers.items()
        }
        self.__data = [
            AttributeDict((h.header, row[h.header])
                          for h in self.__headers.values()) for row in data
        ]
        return
    if isinstance(data, str):
        data = parseMethod(data)
    if not data:
        self.__data = []
        self.__headers = {}
        return
    # Materialize (data may be a generator) before inspecting the first row.
    data = [row for row in data]
    if not data:
        self.__data = []
        self.__headers = {}
        return
    if isinstance(data[0], dict):
        headers = {k for row in data for k in row.keys()}
        self.__headers = {h: DataColumn(self, h) for h in headers}
        # BUGFIX: copy each row BEFORE filling in missing fields so the
        # caller's dicts are not mutated (the original assigned None keys
        # into the input dictionaries).
        self.__data = [AttributeDict(row) for row in data]
        for row in self.__data:
            for header in self.__headers.keys():
                if header not in row:
                    row[header] = None
    else:
        # First row is the header list; remaining rows are value sequences.
        headers = data.pop(0)
        self.__headers = {h: DataColumn(self, h) for h in headers}
        self.__data = [AttributeDict(zip(headers, row)) for row in data]
def addValues(self, values):
    '''Adds the values from the values dict to this hierarchy'''
    key = values[self.keyHeaders[0]]
    if len(self.keyHeaders) == 1:
        # Bottom level: append a row holding only the leaf fields.
        if key not in self._data:
            self._data[key] = HierarchyLeaf()
        leafRow = AttributeDict(
            (field, values[field]) for field in self.leafHeaders)
        self._data[key].append(leafRow)
        return
    # Intermediate level: descend, creating the child hierarchy on demand.
    if key not in self._data:
        self._data[key] = Hierarchy(self.keyHeaders[1:], self.leafHeaders)
    self[key].addValues(values)
def checkRemove_multiField(self, filterMethod, *fields):
    '''remove the set of fields from the result if filterMethod returns
    true for those entries
    filterMethod is a method which takes two dicts: fromRow and toRow,
    with those fields specified by the fields parameter and returns if
    those values can be removed from the result
    fields is a list of fields to check and possibly remove
    '''
    indexByField = tuple(
        (field, self.diffFields[field]) for field in fields)
    # All requested fields must still be present in the diff data.
    for _, idx in indexByField:
        if idx not in self.__data:
            return
    fromRow = AttributeDict(
        (field, self.__data[idx][0]) for field, idx in indexByField)
    toRow = AttributeDict(
        (field, self.__data[idx][1]) for field, idx in indexByField)
    if filterMethod(fromRow, toRow):
        for _, idx in indexByField:
            del self.__data[idx]
def aggregate(self, aggregations=None, parentKey=None):
    '''return an aggregation of the hiararchy leaf tables
    the resulting Hierarchy will have the same structure, except that the
    leaf tables will be collapsed to single rows containing the results of
    applying the aggregations to the original leaf tables
    Parameters:
      aggregations - a dict of field name -> aggregate method, where the
          method takes an intermediate HierarchyLeaf and returns the value
          for that field for that row.
    '''
    # None sentinels replace the original mutable defaults ({} and
    # AttributeDict()), which would be shared across every call.
    if parentKey is None:
        parentKey = AttributeDict()
    if not aggregations:
        # Nothing to aggregate: keep the key structure, drop the leaves.
        return self.reindex(self.keyHeaders, ())
    new = Hierarchy(self.keyHeaders, aggregations.keys())
    for key, child in self:
        # Extend the accumulated parent key with this level's key value.
        new._data[key] = child.aggregate(
            aggregations, parentKey=parentKey + {self.keyHeaders[0]: key})
    return new
def tempIterRows():
    # Bucket every row by its groupBy key, feeding each row into all
    # of the aggregators for that key.
    buckets = {}
    for row in self:
        key = tuple(row[field] for field in groupBy)
        bucket = buckets.get(key)
        if bucket is None:
            bucket = {
                name: agg.newBucket(row)
                for name, agg in aggregations.items()
            }
            buckets[key] = bucket
        for name, agg in aggregations.items():
            bucket[name] = agg.addRow(row, bucket[name])
    # Emit one finalized row per key, in sorted key order.
    for key, bucket in sorted(buckets.items()):
        yield AttributeDict(zip(groupBy, key)) + {
            name: agg.finalize(bucket[name])
            for name, agg in aggregations.items()
        }
def tempIterRows():
    # Transpose: emit one row per column, pairing each cell with its
    # row identifier and recording the source column name under 'Field'.
    for col in self.columns():
        transposed = AttributeDict(zip(rowIDs, col))
        transposed['Field'] = col.header
        yield transposed
def project(self, newLeafHeaders):
    '''Return a new HierarchyLeaf keeping only the requested fields.'''
    keep = set(newLeafHeaders)
    projected = []
    for row in self:
        projected.append(
            AttributeDict((field, value) for field, value in row.items()
                          if field in keep))
    return HierarchyLeaf(projected)
def copy(self):
    '''Return a row-by-row copy of this leaf.'''
    duplicateRows = [AttributeDict(row) for row in self]
    return HierarchyLeaf(duplicateRows)
def parseCsv(f, headers=None, sep=',', quot='"'): return DataTable( AttributeDict(line) for line in csv.DictReader( f, fieldnames=headers, delimiter=sep, quotechar=quot))
def asDict(self):
    '''Collect this object's iterated (key, value) pairs into an
    AttributeDict.'''
    pairs = iter(self)
    return AttributeDict(pairs)
def parse():
    # Fixed-width parser: headers holds (fieldName, startCol, endCol)
    # specs; slice each line of f into one record per spec.
    for line in f:
        record = AttributeDict()
        for spec in headers:
            record[spec[0]] = line[spec[1]:spec[2]]
        yield record