Code Example #1
File: pcontext.py Project: 0asa/sparklingpandas
    def read_csv(self, name, use_whole_file=False, names=None, skiprows=0,
                 *args, **kwargs):
        """Read a CSV file in and parse it into Pandas DataFrames.
        If no names are provided we use the first row for the names.
        header=0 is the default unless names is provided, in which case
        header=None is the default.
        skiprows indicates how many rows of input to skip. This is
        only applied to the first partition of the data (so if skiprows
        is larger than the number of rows in the first partition it will
        not work). Generally this shouldn't be an issue for small values
        of skiprows.
        No other values of header are supported.
        All additional parameters are passed to the read_csv function.
        """
        def csv_file(partitionNumber, files):
            file_count = 0
            for filename, contents in files:
                # Only skip lines on the first file of the first partition
                if partitionNumber == 0 and file_count == 0 and _skiprows > 0:
                    yield pandas.read_csv(StringIO(contents), *args,
                                          header=None,
                                          names=mynames,
                                          skiprows=_skiprows, **kwargs)
                else:
                    yield pandas.read_csv(StringIO(contents), *args,
                                          header=None,
                                          names=mynames,
                                          **kwargs)
                file_count += 1

        def csv_rows(partitionNumber, rows):
            # Rebuild this partition's lines into a single CSV blob and
            # return an iterator of one DataFrame, as mapPartitions expects.
            inputStr = "\n".join(rows)
            if partitionNumber == 0:
                return iter([pandas.read_csv(StringIO(inputStr), *args,
                                             header=None, names=mynames,
                                             skiprows=_skiprows, **kwargs)])
            else:
                return iter([pandas.read_csv(StringIO(inputStr), *args,
                                             header=None, names=mynames,
                                             **kwargs)])

        # If needed, peek at the first partition to determine the column
        # names
        mynames = None
        _skiprows = skiprows
        if names:
            mynames = names
        else:
            # In the future we could avoid this expensive call.
            first_line = self.sc.textFile(name).first()
            frame = pandas.read_csv(StringIO(first_line))
            mynames = list(frame.columns.values)
            _skiprows += 1

        # Do the actual load
        if use_whole_file:
            return PRDD.fromRDD(
                self.sc.wholeTextFiles(name).mapPartitionsWithIndex(csv_file))
        else:
            return PRDD.fromRDD(
                self.sc.textFile(name).mapPartitionsWithIndex(csv_rows))
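
A brief usage sketch (not part of the project file): it assumes the method above lives on sparklingpandas' PSparkContext and that the context can be wrapped around a plain SparkContext; the constructor form, input path, and column names are assumptions.

from pyspark import SparkContext
from sparklingpandas.pcontext import PSparkContext

sc = SparkContext("local[2]", "read_csv-demo")
psc = PSparkContext(sc)                       # assumed constructor form
# Let the first row of the file supply the column names:
people = psc.read_csv("data/people.csv")      # hypothetical input path
# Or supply names explicitly and skip the header line in the data:
people = psc.read_csv("data/people.csv", names=["name", "age"], skiprows=1)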
Code Example #2
File: pcontext.py Project: snouhaud/sparklingpandas
    def DataFrame(self, elements, *args, **kwargs):
        """Wraps the pandas.DataFrame operation."""
        def _load_partitions(partition):
            """Convert partitions of tuples."""
            partitionList = list(partition)
            if len(partitionList) > 0:
                (indices, elements) = zip(*partitionList)
                return iter([
                    pandas.DataFrame(data=list(elements),
                                     index=list(indices),
                                     *args,
                                     **kwargs)
                ])
            else:
                return iter([])

        # Zip with the index so we have consistent indexing as if it was
        # operated on locally
        index = range(len(elements))
        # TODO(holden): test this issue #13
        if 'index' in kwargs:
            index = kwargs['index']
        elementsWithIndex = zip(index, elements)
        return PRDD.fromRDD(
            self.sc.parallelize(elementsWithIndex).mapPartitions(
                _load_partitions))
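
To see what _load_partitions builds out of the zipped tuples, here is a purely local sketch (no Spark) that mimics one partition's worth of (index, element) pairs; the data values are invented.

import pandas

# One partition's worth of (index, element) tuples, as produced by
# zip(range(len(elements)), elements) before parallelize().
partition = [(0, {"a": 1, "b": 2}), (1, {"a": 3, "b": 4})]
indices, elements = zip(*partition)
frame = pandas.DataFrame(data=list(elements), index=list(indices))
print(frame)
#    a  b
# 0  1  2
# 1  3  4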
Code Example #3
 def aggregate(self, f):
     """Apply the aggregation function.
     Note: This implementation does not take advantage of partial
     aggregation.
     """
     return PRDD.fromRDD(
         self._regroup_mergedRDD().values().map(lambda g: g.aggregate(f)))
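
Each value of the regrouped RDD supports .aggregate, .nth, and .median in the examples here, so it behaves like a pandas GroupBy object; a local equivalent of the per-group call, with invented data and an invented aggregation function, looks like this.

import pandas

df = pandas.DataFrame({"key": ["a", "a", "b"], "value": [1, 2, 10]})
f = lambda s: s.max() - s.min()        # example aggregation function
# This mirrors what g.aggregate(f) computes for each group in the map above.
print(df.groupby("key").aggregate(f))
#      value
# key
# a        1
# b        0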
Code Example #4
    def median(self):
        """Compute median of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        return PRDD.fromRDD(
            self._regroup_mergedRDD().values().map(lambda x: x.median()))
Code Example #5
File: pcontext.py Project: MeethuM/sparklingpandas
 def DataFrame(self, elements, *args, **kwargs):
     """Wraps the pandas.DataFrame operation."""
     def _load_partitions(partition):
         """Convert partitions of tuples."""
         partitionList = list(partition)
         if len(partitionList) > 0:
             (indices, elements) = zip(*partitionList)
             return iter([
                 pandas.DataFrame(
                     data=list(elements),
                     index=list(indices),
                     *args,
                     **kwargs)])
         else:
             return iter([])
     # Zip with the index so we have consistent indexing as if it was
     # operated on locally
     index = range(len(elements))
     # TODO(holden): test this issue #13
     if 'index' in kwargs:
         index = kwargs['index']
     elementsWithIndex = zip(index, elements)
     return PRDD.fromRDD(
         self.sc.parallelize(elementsWithIndex).mapPartitions(
             _load_partitions))
Code Example #6
File: groupby.py Project: 0asa/sparklingpandas
 def aggregate(self, f):
     """Apply the aggregation function.
     Note: This implementation does not take advantage of partial
     aggregation.
     """
     return PRDD.fromRDD(
         self._regroup_mergedRDD().values().map(
             lambda g: g.aggregate(f)))
Code Example #7
File: groupby.py Project: 0asa/sparklingpandas
    def median(self):
        """Compute median of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        return PRDD.fromRDD(
            self._regroup_mergedRDD().values().map(
                lambda x: x.median()))
Code Example #8
 def nth(self, n, *args, **kwargs):
     """Take the nth element of each grouby."""
     # TODO: Stop collecting the entire frame for each key.
     myargs = self._myargs
     mykwargs = self._mykwargs
     nthRDD = self._regroup_mergedRDD().mapValues(
         lambda r: r.nth(n, *args, **kwargs)).values()
     return PRDD.fromRDD(nthRDD)
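
GroupBy.nth, which each regrouped value delegates to above, picks the nth row of every group; a local reference point with invented data:

import pandas

df = pandas.DataFrame({"key": ["a", "a", "b"], "value": [1, 2, 10]})
# nth(0) returns the first row of each group; nth(1) the second, and so on.
print(df.groupby("key").nth(0))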
Code Example #9
    def mean(self):
        """Compute mean of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        # TODO(holden): use stats counter
        return PRDD.fromRDD(
            self._regroup_mergedRDD().values().map(lambda x: x.mean()))
Code Example #10
    def var(self, ddof=1):
        """Compute standard deviation of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        # TODO(holden): use stats counter
        return PRDD.fromRDD(
            self._regroup_mergedRDD().values().map(lambda x: x.var(ddof=ddof)))
Code Example #11
File: groupby.py Project: 0asa/sparklingpandas
 def nth(self, n, *args, **kwargs):
     """Take the nth element of each grouby."""
     # TODO: Stop collecting the entire frame for each key.
     myargs = self._myargs
     mykwargs = self._mykwargs
     nthRDD = self._regroup_mergedRDD().mapValues(
         lambda r: r.nth(
             n, *args, **kwargs)).values()
     return PRDD.fromRDD(nthRDD)
Code Example #12
File: groupby.py Project: 0asa/sparklingpandas
    def mean(self):
        """Compute mean of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        # TODO(holden): use stats counter
        return PRDD.fromRDD(
            self._regroup_mergedRDD().values().map(
                lambda x: x.mean()))
Code Example #13
File: groupby.py Project: 0asa/sparklingpandas
    def var(self, ddof=1):
        """Compute standard deviation of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.
        """
        # TODO(holden): use stats counter
        return PRDD.fromRDD(
            self._regroup_mergedRDD().values().map(
                lambda x: x.var(
                    ddof=ddof)))
Code Example #14
File: pcontext.py Project: snouhaud/sparklingpandas
    def from_schema_rdd(self, schemaRDD):
        """Convert a schema RDD to a L{PRDD}."""
        def _load_kv_partitions(partition):
            """Convert a partition where each row is key/value data."""
            partitionList = list(partition)
            if len(partitionList) > 0:
                return iter([pandas.DataFrame(data=partitionList)])
            else:
                return iter([])

        return PRDD.fromRDD(schemaRDD.mapPartitions(_load_kv_partitions))
Code Example #15
File: pcontext.py Project: MeethuM/sparklingpandas
 def from_schema_rdd(self, schemaRDD):
     """Convert a schema RDD to a L{PRDD}."""
     def _load_kv_partitions(partition):
         """Convert a partition where each row is key/value data."""
         partitionList = list(partition)
         if len(partitionList) > 0:
             return iter([
                 pandas.DataFrame(data=partitionList)
             ])
         else:
             return iter([])
     return PRDD.fromRDD(schemaRDD.mapPartitions(_load_kv_partitions))
Code Example #16
    def read_json(self, name,
                  *args, **kwargs):
        """Read a json file in and parse it into Pandas DataFrames.
        If no names is provided we use the first row for the names.
        Currently, it is not possible to skip the first n rows of a file.
        Headers are provided in the json file and not specified separately.
        """
        def json_file(partitionNumber, files):
            for filename, contents in files:
                yield pandas.read_json(sio(contents), *args, **kwargs)

        return PRDD.fromRDD(
            self.sc.wholeTextFiles(name).mapPartitionsWithIndex(json_file))
Code Example #17
File: pcontext.py Project: MLnick/sparklingpandas
    def csvfile(self, name, use_whole_file=True, *args, **kwargs):
        """
        Read a CSV file in and parse it into panda data frames. Note this uses
        wholeTextFiles by default underneath the hood so as to support
        multi-line CSV records so many small input files are preferred.
        All additional parameters are passed to the read_csv function
        """
        # TODO(holden): string IO stuff

        def csv_file(contents, *args, **kwargs):
            return pandas.read_csv(StringIO(contents), *args, header=0,
                                   **kwargs)

        def csv_rows(rows, *args, **kwargs):
            for row in rows:
                yield pandas.read_csv(StringIO(row), *args, header=0, **kwargs)

        if use_whole_file:
            return PRDD.fromRDD(self.sc.wholeTextFiles(name).map(
                lambda name_contents: csv_file(name_contents[1],
                                               *args, **kwargs)))
        else:
            return PRDD.fromRDD(self.sc.textFile(name).mapPartitions(
                lambda x: csv_rows(x, *args, **kwargs)))
Code Example #18
    def max(self):
        """Compute the max for each group."""
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).max()

        def merge_value(x, y):
            return x.append(create_combiner(y)).max()

        def merge_combiner(x, y):
            return x.append(y).max(level=0)

        rddOfMax = self._sortIfNeeded(
            self._distributedRDD.combineByKey(create_combiner, merge_value,
                                              merge_combiner)).values()
        return PRDD.fromRDD(rddOfMax)
Code Example #19
    def sum(self):
        """Compute the sum for each group."""
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).sum()

        def merge_value(x, y):
            return pandas.concat([x, create_combiner(y)])

        def merge_combiner(x, y):
            return x + y

        rddOfSum = self._sortIfNeeded(
            self._distributedRDD.combineByKey(create_combiner, merge_value,
                                              merge_combiner)).values()
        return PRDD.fromRDD(rddOfSum)
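
To make the three combineByKey callbacks concrete, here is a local, Spark-free sketch of how per-key values from two partitions would flow through them for sum(); the data is invented and the grouping column "k" is hypothetical.

import pandas

# Raw per-key values, as if they arrived in two different partitions.
part1_value = pandas.DataFrame({"k": ["a", "a"], "v": [1, 2]})
part2_value = pandas.DataFrame({"k": ["a", "b"], "v": [10, 20]})

def create_combiner(x):
    return x.groupby("k").sum()

# Within each partition, create_combiner seeds a combiner from the first value.
combiner1 = create_combiner(part1_value)      # k=a -> 3
combiner2 = create_combiner(part2_value)      # k=a -> 10, k=b -> 20

# Across partitions, merge_combiner adds the per-partition combiners.
# Note that plain `x + y` yields NaN for keys present on only one side.
print(combiner1 + combiner2)                  # k=a -> 13, k=b -> NaN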
Code Example #20
    def last(self):
        """Pull out the last from each group."""
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).last()

        def merge_value(x, y):
            return create_combiner(y)

        def merge_combiner(x, y):
            return y

        rddOfLast = self._sortIfNeeded(
            self._distributedRDD.combineByKey(create_combiner, merge_value,
                                              merge_combiner)).values()
        return PRDD.fromRDD(rddOfLast)
Code Example #21
File: pcontext.py Project: MeethuM/sparklingpandas
    def from_data_frame(self, df):
        """Make a distributed dataframe from a local dataframe. The intend use
        is for testing. Note: dtypes are re-infered, so they may not match."""
        mydtype = df.dtypes
        mycols = df.columns

        def loadFromKeyRow(partition):
            pll = list(partition)
            if len(pll) > 0:
                index, data = zip(*pll)
                return iter([
                    pandas.DataFrame(list(data),
                                     columns=mycols,
                                     index=index)])
            else:
                return iter([])
        indexedData = zip(df.index, df.itertuples(index=False))
        rdd = self.sc.parallelize(indexedData).mapPartitions(loadFromKeyRow)
        return PRDD.fromRDD(rdd)
Code Example #22
File: pcontext.py Project: snouhaud/sparklingpandas
    def from_data_frame(self, df):
        """Make a distributed dataframe from a local dataframe. The intend use
        is for testing. Note: dtypes are re-infered, so they may not match."""
        mydtype = df.dtypes
        mycols = df.columns

        def loadFromKeyRow(partition):
            pll = list(partition)
            if len(pll) > 0:
                index, data = zip(*pll)
                return iter([
                    pandas.DataFrame(list(data), columns=mycols, index=index)
                ])
            else:
                return iter([])

        indexedData = zip(df.index, df.itertuples(index=False))
        rdd = self.sc.parallelize(indexedData).mapPartitions(loadFromKeyRow)
        return PRDD.fromRDD(rdd)
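
A short local sketch (no Spark) of the zip/rebuild round trip inside from_data_frame; the input frame is invented.

import pandas

df = pandas.DataFrame({"name": ["ann", "bob"], "age": [34, 42]},
                      index=["r1", "r2"])
# This is what gets parallelized: (index label, row tuple) pairs.
indexedData = list(zip(df.index, df.itertuples(index=False)))
# And this is what loadFromKeyRow rebuilds on each partition:
index, data = zip(*indexedData)
rebuilt = pandas.DataFrame(list(data), columns=df.columns, index=index)
print(rebuilt.equals(df))    # True: same data, columns, and index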
Code Example #23
File: groupby.py Project: 0asa/sparklingpandas
    def sum(self):
        """Compute the sum for each group."""
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).sum()

        def merge_value(x, y):
            return pandas.concat([x, create_combiner(y)])

        def merge_combiner(x, y):
            return x + y

        rddOfSum = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return PRDD.fromRDD(rddOfSum)
Code Example #24
File: groupby.py Project: 0asa/sparklingpandas
    def last(self):
        """Pull out the last from each group."""
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).last()

        def merge_value(x, y):
            return create_combiner(y)

        def merge_combiner(x, y):
            return y

        rddOfLast = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return PRDD.fromRDD(rddOfLast)
Code Example #25
File: groupby.py Project: 0asa/sparklingpandas
    def max(self):
        """Compute the max for each group."""
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).max()

        def merge_value(x, y):
            return x.append(create_combiner(y)).max()

        def merge_combiner(x, y):
            return x.append(y).max(level=0)

        rddOfMax = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return PRDD.fromRDD(rddOfMax)
Code Example #26
    def first(self):
        """
        Pull out the first from each group. Note: this is different than
        Spark's first.
        """
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).first()

        def merge_value(x, y):
            return create_combiner(x)

        def merge_combiner(x, y):
            return x

        rddOfFirst = self._sortIfNeeded(
            self._distributedRDD.combineByKey(create_combiner, merge_value,
                                              merge_combiner)).values()
        return PRDD.fromRDD(rddOfFirst)
Code Example #27
File: groupby.py Project: 0asa/sparklingpandas
    def first(self):
        """
        Pull out the first from each group. Note: this is different than
        Spark's first.
        """
        myargs = self._myargs
        mykwargs = self._mykwargs

        def create_combiner(x):
            return x.groupby(*myargs, **mykwargs).first()

        def merge_value(x, y):
            return create_combiner(x)

        def merge_combiner(x, y):
            return x

        rddOfFirst = self._sortIfNeeded(self._distributedRDD.combineByKey(
            create_combiner,
            merge_value,
            merge_combiner)).values()
        return PRDD.fromRDD(rddOfFirst)
Code Example #28
File: groupby.py Project: 0asa/sparklingpandas
    def apply(self, func, *args, **kwargs):
        """Apply the provided function and combine the results together in the
        same way as apply from groupby in pandas.

        This returns a PRDD.
        """
        def key_by_index(data):
            """Key each row by its index.
            """
            # TODO: Is there a better way to do this?
            for key, row in data.iterrows():
                yield (key, pandas.DataFrame.from_dict(dict([(key, row)]),
                                                       orient='index'))

        myargs = self._myargs
        mykwargs = self._mykwargs
        regroupedRDD = self._distributedRDD.mapValues(
            lambda data: data.groupby(*myargs, **mykwargs))
        appliedRDD = regroupedRDD.map(
            lambda key_data: key_data[1].apply(func, *args, **kwargs))
        reKeyedRDD = appliedRDD.flatMap(key_by_index)
        prdd = self._sortIfNeeded(reKeyedRDD).values()
        return PRDD.fromRDD(prdd)
Code Example #29
    def apply(self, func, *args, **kwargs):
        """Apply the provided function and combine the results together in the
        same way as apply from groupby in pandas.

        This returns a PRDD.
        """
        def key_by_index(data):
            """Key each row by its index.
            """
            # TODO: Is there a better way to do this?
            for key, row in data.iterrows():
                yield (key,
                       pandas.DataFrame.from_dict(dict([(key, row)]),
                                                  orient='index'))

        myargs = self._myargs
        mykwargs = self._mykwargs
        regroupedRDD = self._distributedRDD.mapValues(
            lambda data: data.groupby(*myargs, **mykwargs))
        appliedRDD = regroupedRDD.map(
            lambda key_data: key_data[1].apply(func, *args, **kwargs))
        reKeyedRDD = appliedRDD.flatMap(key_by_index)
        prdd = self._sortIfNeeded(reKeyedRDD).values()
        return PRDD.fromRDD(prdd)
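
The distributed apply above mirrors pandas' own GroupBy.apply; a local reference point, with invented data and an invented func, looks like this.

import pandas

df = pandas.DataFrame({"key": ["a", "a", "b"], "value": [1, 2, 10]})

def spread(group):
    # Example func: range of the "value" column within each group.
    return group["value"].max() - group["value"].min()

print(df.groupby("key").apply(spread))
# key
# a    1
# b    0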
Code Example #30
File: pcontext.py Project: MLnick/sparklingpandas
 def DataFrame(self, elements, *args, **kwargs):
     """
     Wraps the pandas.DataFrame operation.
     """
     return PRDD.fromRDD(self.sc.parallelize(elements).map(
         lambda element: pandas.DataFrame(data=[element], *args, **kwargs)))
Code Example #31
File: pcontext.py Project: MeethuM/sparklingpandas
 def sql(self, query):
     """Perform a SQL query and create a L{PRDD} of the result."""
     return PRDD.fromRDD(
         self.from_schema_rdd(
             self._get_sqlctx().sql(query)))
Code Example #32
File: pcontext.py Project: snouhaud/sparklingpandas
 def sql(self, query):
     """Perform a SQL query and create a L{PRDD} of the result."""
     return PRDD.fromRDD(self.from_schema_rdd(
         self._get_sqlctx().sql(query)))
Code Example #33
File: pcontext.py Project: snouhaud/sparklingpandas
    def read_csv(self,
                 name,
                 use_whole_file=False,
                 names=None,
                 skiprows=0,
                 *args,
                 **kwargs):
        """Read a CSV file in and parse it into Pandas DataFrames.
        If no names are provided we use the first row for the names.
        header=0 is the default unless names is provided, in which case
        header=None is the default.
        skiprows indicates how many rows of input to skip. This is
        only applied to the first partition of the data (so if skiprows
        is larger than the number of rows in the first partition it will
        not work). Generally this shouldn't be an issue for small values
        of skiprows.
        No other values of header are supported.
        All additional parameters are passed to the read_csv function.
        """
        def csv_file(partitionNumber, files):
            file_count = 0
            for filename, contents in files:
                # Only skip lines on the first file of the first partition
                if partitionNumber == 0 and file_count == 0 and _skiprows > 0:
                    yield pandas.read_csv(sio(contents),
                                          *args,
                                          header=None,
                                          names=mynames,
                                          skiprows=_skiprows,
                                          **kwargs)
                else:
                    yield pandas.read_csv(sio(contents),
                                          *args,
                                          header=None,
                                          names=mynames,
                                          **kwargs)
                file_count += 1

        def csv_rows(partitionNumber, rows):
            inputStr = "\n".join(rows)
            if partitionNumber == 0:
                return iter([
                    pandas.read_csv(sio(inputStr),
                                    *args,
                                    header=None,
                                    names=mynames,
                                    skiprows=_skiprows,
                                    **kwargs)
                ])
            else:
                # could use .iterrows instead?
                return iter([
                    pandas.read_csv(sio(inputStr),
                                    *args,
                                    header=None,
                                    names=mynames,
                                    **kwargs)
                ])

        # If needed, peek at the first partition to determine the column
        # names
        mynames = None
        _skiprows = skiprows
        if names:
            mynames = names
        else:
            # In the future we could avoid this expensive call.
            first_line = self.sc.textFile(name).first()
            frame = pandas.read_csv(sio(first_line), **kwargs)
            mynames = list(frame.columns.values)
            _skiprows += 1

        # Do the actual load
        if use_whole_file:
            return PRDD.fromRDD(
                self.sc.wholeTextFiles(name).mapPartitionsWithIndex(csv_file))
        else:
            return PRDD.fromRDD(
                self.sc.textFile(name).mapPartitionsWithIndex(csv_rows))