Example #1
0
def get_inspect(self,
                n=10,
                offset=0,
                columns=None,
                wrap=inspect_settings._unspecified,
                truncate=inspect_settings._unspecified,
                round=inspect_settings._unspecified,
                width=inspect_settings._unspecified,
                margin=inspect_settings._unspecified,
                with_types=inspect_settings._unspecified):
    """
    Builds an ATable from a slice of the frame's rows --see frame.inspect()

    :param n: number of rows handed to take_rich
    :param offset: number of rows to skip; also forwarded to the ATable
    :param columns: column filter handed to take_rich
    :param wrap, truncate, round, width, margin, with_types: formatting overrides which are
           passed (in that order) to inspect_settings.copy
    :return: (ATable) built from the taken data, its schema, the offset and the format settings
    """
    from sparktk.frame.ops.take import take_rich

    # Resolve the formatting options first, then fetch the rows.
    settings = inspect_settings.copy(wrap, truncate, round, width, margin, with_types)
    taken = take_rich(self, n, offset, columns)

    return ATable(taken.data, taken.schema, offset=offset, format_settings=settings)
Example #2
0
def inspect(self,
            n=10,
            offset=0,
            columns=None,
            wrap=inspect_settings._unspecified,
            truncate=inspect_settings._unspecified,
            round=inspect_settings._unspecified,
            width=inspect_settings._unspecified,
            margin=inspect_settings._unspecified,
            with_types=inspect_settings._unspecified):
    """
    Pretty-print of the frame data

    Essentially returns a string, but technically returns a RowsInspection object which renders a string.
    The RowsInspection object naturally converts to a str when needed, like when printed or when displayed
    by python REPL (i.e. using the object's __repr__).  If running in a script and the inspect output
    should be printed, then it must be explicitly printed, i.e. `print frame.inspect()`

    Parameters
    ----------
    :param n: (Optional[int]) The number of rows to print
    :param offset: (Optional[int]) The number of rows to skip before printing.
    :param columns: (Optional[List[str]]) Filter columns to be included.  By default, all columns are included.
    :param wrap: (Optional[int or 'stripes']) If set to 'stripes' then inspect prints rows in stripes; if set to an
                 integer N, rows will be printed in clumps of N columns, where the columns are wrapped.
    :param truncate: (Optional[int]) If set to integer N, all strings will be truncated to length N, including all
                     tagged ellipses.
    :param round: (Optional[int]) If set to integer N, all floating point numbers will be rounded and truncated to
                  N digits.
    :param width: (Optional[int]) If set to integer N, the print out will try to honor a max line width of N.
    :param margin: (Optional[int]) Applies to 'stripes' mode only.  If set to integer N, the margin for printing names
                   in a stripe will be limited to N characters.
    :param with_types: (Optional[bool]) If set to True, header will include the data_type of each column.
    :return: (RowsInspection) An object which naturally converts to a pretty-print string.

    Examples
    --------
    To look at the first 4 rows of data in a frame:

    <skip>
        >>> frame.inspect(4)
        [#]  animal    name    age  weight
        ==================================
        [0]  human     George    8   542.5
        [1]  human     Ursula    6   495.0
        [2]  ape       Ape      41   400.0
        [3]  elephant  Shep      5  8630.0
    </skip>

    # For other examples, see :ref:`example_frame.inspect`.

    Note: if the frame data contains unicode characters, this method may raise a Unicode exception when
    running in an interactive REPL or otherwise which triggers the standard python repr().  To get around
    this problem, explicitly print the unicode of the returned object:

    <skip>
        >>> print unicode(frame.inspect())
    </skip>


    **Global Settings**

    If not specified, the arguments that control formatting receive default values from
    'sparktk.inspect_settings'.  Make changes there to affect all calls to inspect.

        >>> import sparktk
        >>> sparktk.inspect_settings
        wrap             20
        truncate       None
        round          None
        width            80
        margin         None
        with_types    False
        >>> sparktk.inspect_settings.width = 120  # changes inspect to use 120 width globally
        >>> sparktk.inspect_settings.truncate = 16  # changes inspect to always truncate strings to 16 chars
        >>> sparktk.inspect_settings
        wrap             20
        truncate         16
        round          None
        width           120
        margin         None
        with_types    False
        >>> sparktk.inspect_settings.width = None  # return value back to default
        >>> sparktk.inspect_settings
        wrap             20
        truncate         16
        round          None
        width            80
        margin         None
        with_types    False
        >>> sparktk.inspect_settings.reset()  # set everything back to default
        >>> sparktk.inspect_settings
        wrap             20
        truncate       None
        round          None
        width            80
        margin         None
        with_types    False

    """
    from sparktk.frame.ops.take import take_rich

    # Per-call formatting: the arguments are handed to inspect_settings.copy in its positional order.
    settings = inspect_settings.copy(wrap, truncate, round, width, margin, with_types)

    # Fetch the requested slice of rows along with the schema that describes them.
    taken = take_rich(self, n, offset, columns)

    return RowsInspection(taken.data,
                          taken.schema,
                          offset=offset,
                          format_settings=settings)
def to_pandas(self, n=None, offset=0, columns=None):
    """
    Brings data into a local pandas dataframe.

    Similar to the 'take' function, but puts the data into a pandas dataframe.  Values of type long found in
    datetime columns are treated as milliseconds since epoch and converted to datetime.datetime objects
    (local time, see datetime.fromtimestamp) before the dataframe is built.  Each column is then cast to the
    pandas dtype which corresponds to its spark-tk data type; integer columns that cannot be cast (e.g. because
    of missing values) are kept as 'object' columns and a warning is printed.

    Raises a RuntimeError if the pandas module cannot be imported.

    Parameters
    ----------

    :param n: (Optional(int)) The number of rows to get from the frame (warning: do not overwhelm the python session
                    by taking too much)
    :param offset: (Optional(int)) The number of rows to skip before copying.  Defaults to 0.
    :param columns: (Optional(List[str])) Column filter.  The list of names to be included.  Default is all columns.
    :return: (pandas.DataFrame) A new pandas dataframe object containing the taken frame data.

    Examples
    --------

        <hide>
        >>> data = [["Fred", "555-1234"],["Susan", "555-0202"],["Thurston","555-4510"],["Judy","555-2183"]]
        >>> column_names = ["name", "phone"]
        >>> frame = tc.frame.create(data, column_names)
        </hide>

    Consider the following spark-tk frame, where we have columns for name and phone number:

        >>> frame.inspect()
        [#]  name      phone
        =======================
        [0]  Fred      555-1234
        [1]  Susan     555-0202
        [2]  Thurston  555-4510
        [3]  Judy      555-2183

        >>> frame.schema
        [('name', <type 'str'>), ('phone', <type 'str'>)]

    The frame to_pandas() method is used to get a pandas DataFrame that contains the data from the spark-tk frame.  Note
    that since no parameters are provided when to_pandas() is called, the default values are used for the number of
    rows, the row offset, and the columns.

        >>> pandas_frame = frame.to_pandas()
        >>> pandas_frame
               name     phone
        0      Fred  555-1234
        1     Susan  555-0202
        2  Thurston  555-4510
        3      Judy  555-2183

    """
    try:
        import pandas
    except ImportError:
        # Only a failed import means pandas is missing; any other error is left to propagate untouched.
        raise RuntimeError(
            "pandas module not found, unable to download.  Install pandas or try the take command."
        )
    from sparktk.frame.ops.take import take_rich

    result = take_rich(self, n, offset, columns)
    headers, data_types = zip(*result.schema)
    frame_data = result.data

    from sparktk import dtypes
    import datetime

    # Positions must come from the schema of the *taken* data (result.schema), not self.schema:
    # when a column filter is given, the rows only hold the selected columns, so indices computed
    # from the full frame schema would point at the wrong cell (or past the end of the row).
    date_time_columns = [
        position for position, data_type in enumerate(data_types)
        if data_type in (dtypes.datetime, datetime.datetime)
    ]

    # translate our datetime long to datetime, so that it gets into the pandas df as a datetime column
    def long_to_date_time(row):
        for i in date_time_columns:
            if isinstance(row[i], long):
                millis = row[i]
                # whole seconds build the datetime; the leftover milliseconds become microseconds
                row[i] = datetime.datetime.fromtimestamp(
                    millis // 1000).replace(microsecond=millis % 1000 * 1000)
        return row

    if date_time_columns:
        # a list comprehension (rather than map) so frame_data is always a concrete list
        frame_data = [long_to_date_time(row) for row in frame_data]

    # create pandas df
    pandas_df = pandas.DataFrame(frame_data, columns=headers)

    for i, dtype in enumerate(data_types):
        dtype_str = _sparktk_dtype_to_pandas_str(dtype)
        try:
            pandas_df[[headers[i]]] = pandas_df[[headers[i]]].astype(dtype_str)
        except (TypeError, ValueError):
            if dtype_str.startswith("int"):
                # DataFrame does not handle missing values in int columns. If we get this error, use the 'object' datatype instead.
                # print(...) with a single pre-formatted string emits the same output as the print statement
                print("WARNING - Encountered problem casting column %s to %s, possibly due to missing values (i.e. presence of None).  Continued by casting column %s as 'object'" % (
                    headers[i], dtype_str, headers[i]))
                pandas_df[[headers[i]]] = pandas_df[[headers[i]]].astype("object")
            else:
                raise
    return pandas_df
Example #4
0
def to_pandas(self, n=None, offset=0, columns=None):
    """
    Brings data into a local pandas dataframe.

    Similar to the 'take' function, but puts the data into a pandas dataframe.  Values of type long found in
    datetime columns are treated as milliseconds since epoch and converted to datetime.datetime objects
    (local time, see datetime.fromtimestamp) before the dataframe is built.  Each column is then cast to the
    pandas dtype which corresponds to its spark-tk data type; integer columns that cannot be cast (e.g. because
    of missing values) are kept as 'object' columns and a warning is printed.

    Raises a RuntimeError if the pandas module cannot be imported.

    Parameters
    ----------

    :param n: (Optional(int)) The number of rows to get from the frame (warning: do not overwhelm the python session
                    by taking too much)
    :param offset: (Optional(int)) The number of rows to skip before copying.  Defaults to 0.
    :param columns: (Optional(List[str])) Column filter.  The list of names to be included.  Default is all columns.
    :return: (pandas.DataFrame) A new pandas dataframe object containing the taken frame data.

    Examples
    --------

        <hide>
        >>> data = [["Fred", "555-1234"],["Susan", "555-0202"],["Thurston","555-4510"],["Judy","555-2183"]]
        >>> column_names = ["name", "phone"]
        >>> frame = tc.frame.create(data, column_names)
        </hide>

    Consider the following spark-tk frame, where we have columns for name and phone number:

        >>> frame.inspect()
        [#]  name      phone
        =======================
        [0]  Fred      555-1234
        [1]  Susan     555-0202
        [2]  Thurston  555-4510
        [3]  Judy      555-2183

        >>> frame.schema
        [('name', <type 'str'>), ('phone', <type 'str'>)]

    The frame to_pandas() method is used to get a pandas DataFrame that contains the data from the spark-tk frame.  Note
    that since no parameters are provided when to_pandas() is called, the default values are used for the number of
    rows, the row offset, and the columns.

        >>> pandas_frame = frame.to_pandas()
        >>> pandas_frame
               name     phone
        0      Fred  555-1234
        1     Susan  555-0202
        2  Thurston  555-4510
        3      Judy  555-2183

    """
    try:
        import pandas
    except ImportError:
        # A bare except would also relabel unrelated failures as "pandas not found"; catch only the missing-module case.
        raise RuntimeError("pandas module not found, unable to download.  Install pandas or try the take command.")
    from sparktk.frame.ops.take import take_rich

    result = take_rich(self, n, offset, columns)
    headers, data_types = zip(*result.schema)
    frame_data = result.data

    from sparktk import dtypes
    import datetime

    # Index the datetime columns against the taken data's own schema.  Using self.schema here would
    # misalign the indices with the row cells whenever the 'columns' filter selects a subset of columns.
    date_time_types = (dtypes.datetime, datetime.datetime)
    date_time_columns = [i for i, data_type in enumerate(data_types) if data_type in date_time_types]

    # translate our datetime long to datetime, so that it gets into the pandas df as a datetime column
    def long_to_date_time(row):
        for i in date_time_columns:
            value = row[i]
            if isinstance(value, long):
                # seconds go through fromtimestamp; the millisecond remainder is restored as microseconds
                row[i] = datetime.datetime.fromtimestamp(value // 1000).replace(microsecond=value % 1000 * 1000)
        return row

    if date_time_columns:
        # build a concrete list so the DataFrame constructor never receives a lazy iterator
        frame_data = [long_to_date_time(row) for row in frame_data]

    # create pandas df
    pandas_df = pandas.DataFrame(frame_data, columns=headers)

    for i, dtype in enumerate(data_types):
        dtype_str = _sparktk_dtype_to_pandas_str(dtype)
        try:
            pandas_df[[headers[i]]] = pandas_df[[headers[i]]].astype(dtype_str)
        except (TypeError, ValueError):
            if dtype_str.startswith("int"):
                # DataFrame does not handle missing values in int columns. If we get this error, use the 'object' datatype instead.
                # Parenthesized single-argument print gives identical output under the print statement and the print function.
                print("WARNING - Encountered problem casting column %s to %s, possibly due to missing values (i.e. presence of None).  Continued by casting column %s as 'object'" % (headers[i], dtype_str, headers[i]))
                pandas_df[[headers[i]]] = pandas_df[[headers[i]]].astype("object")
            else:
                raise
    return pandas_df