def read_csv(self, name, use_whole_file=False, names=None, skiprows=0,
             *args, **kwargs):
    """Read a CSV file in and parse it into Pandas DataFrames.

    If no names is provided we use the first row for the names.
    header=0 is the default unless names is provided in which case
    header=None is the default. skiprows indicates how many rows of
    input to skip. This will only be applied to the first partition of
    the data (so if #skiprows > #rows in first partition this will not
    work). Generally this shouldn't be an issue for small values of
    skiprows. No other values of header is supported. All additional
    parameters are passed to the read_csv function.
    """
    def csv_file(partitionNumber, files):
        # files is an iterator of (filename, contents) pairs from
        # wholeTextFiles; yield one DataFrame per file.
        file_count = 0
        for filename, contents in files:
            # Only skip lines on the first file of the first partition.
            if partitionNumber == 0 and file_count == 0 and _skiprows > 0:
                yield pandas.read_csv(
                    StringIO(contents), *args, header=None,
                    names=mynames, skiprows=_skiprows, **kwargs)
            else:
                yield pandas.read_csv(
                    StringIO(contents), *args, header=None,
                    names=mynames, **kwargs)
            # Count every file (previously only counted in the else
            # branch, so skiprows was re-applied to later files).
            file_count += 1

    def csv_rows(partitionNumber, rows):
        # Join this partition's lines back into one CSV text blob.
        # (Previously read from an undefined name `row` and returned a
        # bare DataFrame; mapPartitionsWithIndex needs an iterator.)
        inputStr = "\n".join(rows)
        if partitionNumber == 0:
            return iter([
                pandas.read_csv(StringIO(inputStr), *args, header=None,
                                names=mynames, skiprows=_skiprows,
                                **kwargs)])
        else:
            return iter([
                pandas.read_csv(StringIO(inputStr), *args, header=None,
                                names=mynames, **kwargs)])

    # If we need to peek at the first partition and determine the
    # column names.
    mynames = None
    _skiprows = skiprows
    if names:
        mynames = names
    else:
        # In the future we could avoid this expensive call.
        first_line = self.sc.textFile(name).first()
        frame = pandas.read_csv(StringIO(first_line))
        mynames = list(frame.columns.values)
        # The header line itself must be skipped in partition 0.
        _skiprows += 1

    # Do the actual load.
    if use_whole_file:
        return PRDD.fromRDD(
            self.sc.wholeTextFiles(name).mapPartitionsWithIndex(csv_file))
    else:
        return PRDD.fromRDD(
            self.sc.textFile(name).mapPartitionsWithIndex(csv_rows))
def DataFrame(self, elements, *args, **kwargs):
    """Wraps the pandas.DataFrame operation."""
    def _load_partitions(partition):
        """Convert partitions of (index, element) tuples."""
        rows = list(partition)
        if not rows:
            return iter([])
        idx, data = zip(*rows)
        frame = pandas.DataFrame(data=list(data), index=list(idx),
                                 *args, **kwargs)
        return iter([frame])

    # Zip with the index so we have consistent indexing as if it was
    # operated on locally
    index = range(len(elements))
    # TODO(holden): test this issue #13
    if 'index' in kwargs:
        index = kwargs['index']
    keyed = zip(index, elements)
    return PRDD.fromRDD(
        self.sc.parallelize(keyed).mapPartitions(_load_partitions))
def aggregate(self, f):
    """Apply the aggregation function.

    Note: This implementation does not take advantage of partial
    aggregation.
    """
    frames = self._regroup_mergedRDD().values()
    return PRDD.fromRDD(frames.map(lambda frame: frame.aggregate(f)))
def median(self):
    """Compute median of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.
    """
    per_group = self._regroup_mergedRDD().values()
    return PRDD.fromRDD(per_group.map(lambda frame: frame.median()))
def DataFrame(self, elements, *args, **kwargs):
    """Wraps the pandas.DataFrame operation."""
    def _load_partitions(partition):
        """Convert partitions of (index, element) tuples."""
        pairs = list(partition)
        if len(pairs) == 0:
            return iter([])
        indices, values = zip(*pairs)
        return iter([pandas.DataFrame(data=list(values),
                                      index=list(indices),
                                      *args, **kwargs)])

    # Zip with the index so we have consistent indexing as if it was
    # operated on locally
    index = range(len(elements))
    # TODO(holden): test this issue #13
    if 'index' in kwargs:
        index = kwargs['index']
    elementsWithIndex = zip(index, elements)
    return PRDD.fromRDD(
        self.sc.parallelize(elementsWithIndex).mapPartitions(
            _load_partitions))
def aggregate(self, f):
    """Apply the aggregation function.

    Note: This implementation does not take advantage of partial
    aggregation.
    """
    grouped = self._regroup_mergedRDD().values()
    aggregated = grouped.map(lambda frame: frame.aggregate(f))
    return PRDD.fromRDD(aggregated)
def median(self):
    """Compute median of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.
    """
    grouped = self._regroup_mergedRDD().values()
    medians = grouped.map(lambda frame: frame.median())
    return PRDD.fromRDD(medians)
def nth(self, n, *args, **kwargs):
    """Take the nth element of each groupby.

    Additional arguments are forwarded to pandas' GroupBy.nth.
    """
    # TODO: Stop collecting the entire frame for each key.
    # (Removed unused locals myargs/mykwargs — nth never used them.)
    nthRDD = self._regroup_mergedRDD().mapValues(
        lambda r: r.nth(n, *args, **kwargs)).values()
    return PRDD.fromRDD(nthRDD)
def mean(self):
    """Compute mean of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.
    """
    # TODO(holden): use stats counter
    groups = self._regroup_mergedRDD().values()
    means = groups.map(lambda frame: frame.mean())
    return PRDD.fromRDD(means)
def var(self, ddof=1):
    """Compute variance of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.
    ddof is the delta degrees of freedom forwarded to pandas' var.
    """
    # TODO(holden): use stats counter
    grouped = self._regroup_mergedRDD().values()
    return PRDD.fromRDD(grouped.map(lambda frame: frame.var(ddof=ddof)))
def nth(self, n, *args, **kwargs):
    """Take the nth element of each groupby.

    Additional arguments are forwarded to pandas' GroupBy.nth.
    """
    # TODO: Stop collecting the entire frame for each key.
    # (Removed unused locals myargs/mykwargs — nth never used them.)
    nthRDD = self._regroup_mergedRDD().mapValues(
        lambda r: r.nth(n, *args, **kwargs)).values()
    return PRDD.fromRDD(nthRDD)
def mean(self):
    """Compute mean of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.
    """
    # TODO(holden): use stats counter
    per_group = self._regroup_mergedRDD().values()
    return PRDD.fromRDD(per_group.map(lambda frame: frame.mean()))
def var(self, ddof=1):
    """Compute variance of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.
    ddof is the delta degrees of freedom forwarded to pandas' var.
    """
    # TODO(holden): use stats counter
    per_group = self._regroup_mergedRDD().values()
    variances = per_group.map(lambda frame: frame.var(ddof=ddof))
    return PRDD.fromRDD(variances)
def from_schema_rdd(self, schemaRDD):
    """Convert a schema RDD to a L{PRDD}."""
    def _load_kv_partitions(partition):
        """Convert a partition where each row is key/value data."""
        rows = list(partition)
        if rows:
            return iter([pandas.DataFrame(data=rows)])
        return iter([])

    return PRDD.fromRDD(schemaRDD.mapPartitions(_load_kv_partitions))
def from_schema_rdd(self, schemaRDD):
    """Convert a schema RDD to a L{PRDD}."""
    def _load_kv_partitions(partition):
        """Convert a partition where each row is key/value data."""
        materialized = list(partition)
        if not materialized:
            return iter([])
        return iter([pandas.DataFrame(data=materialized)])

    return PRDD.fromRDD(schemaRDD.mapPartitions(_load_kv_partitions))
def read_json(self, name, *args, **kwargs):
    """Read a json file in and parse it into Pandas DataFrames.

    If no names is provided we use the first row for the names.
    Currently, it is not possible to skip the first n rows of a file.
    Headers are provided in the json file and not specified separately.
    """
    def json_file(partitionNumber, files):
        # partitionNumber is unused but required by the
        # mapPartitionsWithIndex signature.
        for _filename, contents in files:
            yield pandas.read_json(sio(contents), *args, **kwargs)

    return PRDD.fromRDD(
        self.sc.wholeTextFiles(name).mapPartitionsWithIndex(json_file))
def csvfile(self, name, use_whole_file=True, *args, **kwargs):
    """Read a CSV file in and parse it into pandas DataFrames.

    Note this uses wholeTextFiles by default underneath the hood so as
    to support multi-line CSV records, so many small input files are
    preferred. All additional parameters are passed to the read_csv
    function.
    """
    # TODO(holden): string IO stuff
    def csv_file(contents, *args, **kwargs):
        # One whole file -> one DataFrame; first row is the header.
        return pandas.read_csv(StringIO(contents), *args, header=0,
                               **kwargs)

    def csv_rows(rows, *args, **kwargs):
        # One DataFrame per input line.
        for row in rows:
            yield pandas.read_csv(StringIO(row), *args, header=0,
                                  **kwargs)

    if use_whole_file:
        # Index into the (filename, contents) pair explicitly instead
        # of the Python 2-only tuple-unpacking lambda (removed by
        # PEP 3113), so this also parses on Python 3.
        return PRDD.fromRDD(self.sc.wholeTextFiles(name).map(
            lambda pair: csv_file(pair[1], *args, **kwargs)))
    else:
        return PRDD.fromRDD(self.sc.textFile(name).mapPartitions(
            lambda x: csv_rows(x, *args, **kwargs)))
def max(self):
    """Compute the max for each group."""
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        # Per-partition seed: group the frame and take each group's max.
        return x.groupby(*myargs, **mykwargs).max()

    def merge_value(x, y):
        # Fold a new frame into the running per-group max.
        # NOTE(review): this uses .max() while merge_combiner uses
        # .max(level=0); confirm the missing level=0 here is intended.
        return x.append(create_combiner(y)).max()

    def merge_combiner(x, y):
        # Combine two partial results; group keys live in index level 0
        # after groupby(...).max(), so reduce within that level.
        return x.append(y).max(level=0)

    rddOfMax = self._sortIfNeeded(
        self._distributedRDD.combineByKey(create_combiner,
                                          merge_value,
                                          merge_combiner)).values()
    return PRDD.fromRDD(rddOfMax)
def sum(self):
    """Compute the sum for each group."""
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        # Per-partition seed: group the frame and sum each group.
        return x.groupby(*myargs, **mykwargs).sum()

    def merge_value(x, y):
        # Stack the accumulated sums with the new frame's group sums
        # (duplicate group labels are kept; no reduction happens here).
        return pandas.concat([x, create_combiner(y)])

    def merge_combiner(x, y):
        # NOTE(review): '+' aligns on index labels; groups present on
        # only one side come out NaN rather than keeping the other
        # side's value — confirm this is the intended semantics.
        return x + y

    rddOfSum = self._sortIfNeeded(
        self._distributedRDD.combineByKey(create_combiner,
                                          merge_value,
                                          merge_combiner)).values()
    return PRDD.fromRDD(rddOfSum)
def last(self):
    """Pull out the last from each group."""
    groupargs = self._myargs
    groupkwargs = self._mykwargs

    def create_combiner(frame):
        return frame.groupby(*groupargs, **groupkwargs).last()

    def merge_value(current, new):
        # The newest frame always supersedes the accumulator.
        return create_combiner(new)

    def merge_combiner(left, right):
        return right

    combined = self._distributedRDD.combineByKey(
        create_combiner, merge_value, merge_combiner)
    return PRDD.fromRDD(self._sortIfNeeded(combined).values())
def from_data_frame(self, df):
    """Make a distributed dataframe from a local dataframe.

    The intended use is for testing.
    Note: dtypes are re-inferred, so they may not match.
    """
    # (Removed unused local mydtype = df.dtypes.)
    mycols = df.columns

    def loadFromKeyRow(partition):
        # Rebuild one DataFrame per non-empty partition of
        # (index, row-tuple) pairs.
        pll = list(partition)
        if len(pll) > 0:
            index, data = zip(*pll)
            return iter([
                pandas.DataFrame(list(data), columns=mycols,
                                 index=index)])
        else:
            return iter([])

    indexedData = zip(df.index, df.itertuples(index=False))
    rdd = self.sc.parallelize(indexedData).mapPartitions(loadFromKeyRow)
    return PRDD.fromRDD(rdd)
def from_data_frame(self, df):
    """Make a distributed dataframe from a local dataframe.

    The intended use is for testing.
    Note: dtypes are re-inferred, so they may not match.
    """
    # (Removed unused local mydtype = df.dtypes.)
    mycols = df.columns

    def loadFromKeyRow(partition):
        # Rebuild one DataFrame per non-empty partition of
        # (index, row-tuple) pairs.
        pll = list(partition)
        if len(pll) > 0:
            index, data = zip(*pll)
            return iter([
                pandas.DataFrame(list(data), columns=mycols,
                                 index=index)])
        else:
            return iter([])

    indexedData = zip(df.index, df.itertuples(index=False))
    rdd = self.sc.parallelize(indexedData).mapPartitions(loadFromKeyRow)
    return PRDD.fromRDD(rdd)
def sum(self):
    """Compute the sum for each group."""
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        # Per-partition seed: group the frame and sum each group.
        return x.groupby(*myargs, **mykwargs).sum()

    def merge_value(x, y):
        # Stack the accumulated sums with the new frame's group sums
        # (duplicate group labels are kept; no reduction happens here).
        return pandas.concat([x, create_combiner(y)])

    def merge_combiner(x, y):
        # NOTE(review): '+' aligns on index labels; groups present on
        # only one side come out NaN — confirm this is intended.
        return x + y

    rddOfSum = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner, merge_value, merge_combiner)).values()
    return PRDD.fromRDD(rddOfSum)
def last(self):
    """Pull out the last from each group."""
    args = self._myargs
    kwargs = self._mykwargs

    def create_combiner(frame):
        return frame.groupby(*args, **kwargs).last()

    def merge_value(acc, new):
        # For "last", the most recently seen frame replaces the
        # accumulator entirely.
        return create_combiner(new)

    def merge_combiner(a, b):
        return b

    merged = self._distributedRDD.combineByKey(
        create_combiner, merge_value, merge_combiner)
    return PRDD.fromRDD(self._sortIfNeeded(merged).values())
def max(self):
    """Compute the max for each group."""
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        # Per-partition seed: group the frame and take each group's max.
        return x.groupby(*myargs, **mykwargs).max()

    def merge_value(x, y):
        # Fold a new frame into the running per-group max.
        # NOTE(review): this uses .max() while merge_combiner uses
        # .max(level=0); confirm the missing level=0 here is intended.
        return x.append(create_combiner(y)).max()

    def merge_combiner(x, y):
        # Combine two partial results; group keys live in index level 0
        # after groupby(...).max(), so reduce within that level.
        return x.append(y).max(level=0)

    rddOfMax = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner, merge_value, merge_combiner)).values()
    return PRDD.fromRDD(rddOfMax)
def first(self):
    """
    Pull out the first from each group. Note: this is different than
    Spark's first.
    """
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        # Per-partition seed: group the frame and keep each group's
        # first row.
        return x.groupby(*myargs, **mykwargs).first()

    def merge_value(x, y):
        # Keep the existing accumulator; y is discarded for "first".
        # NOTE(review): x is already a combined (grouped) result, so
        # re-running create_combiner on it looks redundant and may
        # misbehave if the group keys moved into the index — confirm
        # whether this should simply return x.
        return create_combiner(x)

    def merge_combiner(x, y):
        # The left combiner wins for "first".
        return x

    rddOfFirst = self._sortIfNeeded(
        self._distributedRDD.combineByKey(create_combiner,
                                          merge_value,
                                          merge_combiner)).values()
    return PRDD.fromRDD(rddOfFirst)
def first(self):
    """
    Pull out the first from each group. Note: this is different than
    Spark's first.
    """
    myargs = self._myargs
    mykwargs = self._mykwargs

    def create_combiner(x):
        # Per-partition seed: group the frame and keep each group's
        # first row.
        return x.groupby(*myargs, **mykwargs).first()

    def merge_value(x, y):
        # Keep the existing accumulator; y is discarded for "first".
        # NOTE(review): x is already a combined (grouped) result, so
        # re-running create_combiner on it looks redundant and may
        # misbehave if the group keys moved into the index — confirm
        # whether this should simply return x.
        return create_combiner(x)

    def merge_combiner(x, y):
        # The left combiner wins for "first".
        return x

    rddOfFirst = self._sortIfNeeded(self._distributedRDD.combineByKey(
        create_combiner, merge_value, merge_combiner)).values()
    return PRDD.fromRDD(rddOfFirst)
def apply(self, func, *args, **kwargs):
    """Apply the provided function and combine the results together in
    the same way as apply from groupby in pandas.

    This returns a PRDD.
    """
    def key_by_index(data):
        """Key each row by its index."""
        # TODO: Is there a better way to do this?
        for key, row in data.iterrows():
            # Wrap each row back into a one-row DataFrame keyed by its
            # original index so downstream sorting can restore order.
            yield (key, pandas.DataFrame.from_dict(dict([(key, row)]),
                                                   orient='index'))

    myargs = self._myargs
    mykwargs = self._mykwargs
    # Re-group each per-key frame, apply func to each group, then
    # re-key the result rows by their index and sort if needed.
    regroupedRDD = self._distributedRDD.mapValues(
        lambda data: data.groupby(*myargs, **mykwargs))
    appliedRDD = regroupedRDD.map(
        lambda key_data: key_data[1].apply(func, *args, **kwargs))
    reKeyedRDD = appliedRDD.flatMap(key_by_index)
    prdd = self._sortIfNeeded(reKeyedRDD).values()
    return PRDD.fromRDD(prdd)
def DataFrame(self, elements, *args, **kwargs):
    """Wraps the pandas.DataFrame operation."""
    def _as_frame(element):
        # Each element becomes its own single-row DataFrame.
        return pandas.DataFrame(data=[element], *args, **kwargs)

    return PRDD.fromRDD(self.sc.parallelize(elements).map(_as_frame))
def sql(self, query):
    """Perform a SQL query and create a L{PRDD} of the result."""
    # from_schema_rdd already returns a PRDD; wrapping it in
    # PRDD.fromRDD again double-wrapped the result.
    return self.from_schema_rdd(self._get_sqlctx().sql(query))
def sql(self, query):
    """Perform a SQL query and create a L{PRDD} of the result."""
    # from_schema_rdd already returns a PRDD; wrapping it in
    # PRDD.fromRDD again double-wrapped the result.
    return self.from_schema_rdd(self._get_sqlctx().sql(query))
def read_csv(self, name, use_whole_file=False, names=None, skiprows=0,
             *args, **kwargs):
    """Read a CSV file in and parse it into Pandas DataFrames.

    If no names is provided we use the first row for the names.
    header=0 is the default unless names is provided in which case
    header=None is the default. skiprows indicates how many rows of
    input to skip. This will only be applied to the first partition of
    the data (so if #skiprows > #rows in first partition this will not
    work). Generally this shouldn't be an issue for small values of
    skiprows. No other values of header is supported. All additional
    parameters are passed to the read_csv function.
    """
    def csv_file(partitionNumber, files):
        # files is an iterator of (filename, contents) pairs from
        # wholeTextFiles; yield one DataFrame per file.
        file_count = 0
        for filename, contents in files:
            # Only skip lines on the first file of the first partition.
            if partitionNumber == 0 and file_count == 0 and _skiprows > 0:
                yield pandas.read_csv(
                    sio(contents), *args, header=None,
                    names=mynames, skiprows=_skiprows, **kwargs)
            else:
                yield pandas.read_csv(
                    sio(contents), *args, header=None,
                    names=mynames, **kwargs)
            # Count every file (previously only counted in the else
            # branch, so skiprows was re-applied to later files).
            file_count += 1

    def csv_rows(partitionNumber, rows):
        # Join this partition's lines back into one CSV text blob.
        inputStr = "\n".join(rows)
        if partitionNumber == 0:
            return iter([
                pandas.read_csv(sio(inputStr), *args, header=None,
                                names=mynames, skiprows=_skiprows,
                                **kwargs)])
        else:
            # could use .iterows instead?
            return iter([
                pandas.read_csv(sio(inputStr), *args, header=None,
                                names=mynames, **kwargs)])

    # If we need to peek at the first partition and determine the
    # column names.
    mynames = None
    _skiprows = skiprows
    if names:
        mynames = names
    else:
        # In the future we could avoid this expensive call.
        first_line = self.sc.textFile(name).first()
        frame = pandas.read_csv(sio(first_line), **kwargs)
        mynames = list(frame.columns.values)
        # The header line itself must be skipped in partition 0.
        _skiprows += 1

    # Do the actual load.
    if use_whole_file:
        return PRDD.fromRDD(
            self.sc.wholeTextFiles(name).mapPartitionsWithIndex(csv_file))
    else:
        return PRDD.fromRDD(
            self.sc.textFile(name).mapPartitionsWithIndex(csv_rows))