Example #1
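# Setup assumed by this snippet (it begins mid-session); the `data`
# frame below is a hypothetical stand-in so the example runs:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})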
result = data.apply(pd.value_counts).fillna(0)
result
## Handling Missing Data
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull()
string_data[0] = None
string_data.isnull()
string_data
## Filtering out Missing Data
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data.dropna()
data[data.notnull()]
data = DataFrame([[1., 6.5, 3.], [1., NA, NA],
[NA, NA, NA], [NA, 6.5, 3.]])
data
cleaned = data.dropna()  # drops every row that contains at least one NA value
cleaned
data.dropna(how='all')  # drops only rows in which every value is NA
data[4] = NA
data
data.dropna(axis=1, how='all')
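# dropna can also keep rows holding at least a minimum number of
# non-NA values via thresh (a quick sketch on the same data):
data.dropna(thresh=2)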
## Filling in Missing Data
df = DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA  # introduce some missing values to fill in
df.iloc[:2, 2] = NA
df
df.fillna(0)
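# fillna also accepts a per-column mapping or a fill method
# (minimal sketches using the df defined above):
df.fillna({1: 0.5, 2: -1})   # fill column 1 with 0.5, column 2 with -1
df.fillna(method='ffill')    # propagate the last valid value forward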
Example #2
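# Imports assumed by this class (the source begins mid-module);
# data_model is the project's MagIC data-model module:
import os
import re
import pandas as pd
from pandas import DataFrame
import data_model  # assumption: project-local module providing DataModel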
class MagicDataFrame(object):

    """
    Each MagicDataFrame corresponds to one MagIC table.
    The MagicDataFrame object consists of a pandas DataFrame,
    and assorted methods for manipulating that DataFrame.
    """

    def __init__(self, magic_file=None, columns=None, dtype=None, groups=None, dmodel=None, df=None):
        """
        Provide either a magic_file or a dtype.
        List of columns is optional,
        and will only be used if magic_file == None.
        Instead of a list of columns, you can also provide
        a list of group-names, and the specific col_names
        will be filled in by the data model.
        If provided, col_names takes precedence.
        """
        if isinstance(df, pd.DataFrame):
            self.df = df
            if dtype:
                self.dtype = dtype
            else:
                print "-W- Please provide data type..."
        # make sure all required arguments are present
        if not magic_file and not dtype and not isinstance(df, pd.DataFrame):
            print "-W- To make a MagicDataFrame, you must provide either a filename or a datatype"
            self.df = None
            return
        # fetch data model if not provided
        if dmodel is None:
            self.data_model = data_model.DataModel()
        else:
            self.data_model = dmodel

        if isinstance(df, pd.DataFrame):
            pass
        # if no file is provided, make an empty dataframe of the appropriate type
        elif not magic_file:
            self.dtype = dtype
            if columns is not None:
                self.df = DataFrame(columns=columns)
            else:
                self.df = DataFrame()
                self.df.index.name = dtype[:-1] if dtype.endswith("s") else dtype
        # if there is a file provided, read in the data and ascertain dtype
        else:
            ## old way of reading in data using pmag.magic_read
            # data, dtype, keys = pmag.magic_read(magic_file, return_keys=True)
            ## create dataframe, maintaining column order:
            # self.df = DataFrame(data, columns=keys)
            # if dtype == 'bad_file':
            #    print "-W- Bad file {}".format(magic_file)
            #    self.dtype = 'empty'
            #    return

            ## new way of reading in data using pd.read_table
            with open(magic_file) as f:
                try:
                    delim, dtype = f.readline().split("\t")[:2]
                except ValueError:
                    print "-W- Empty file {}".format(magic_file)
                    self.df = DataFrame()
                    return
            self.df = pd.read_table(magic_file, skiprows=[0])
            self.dtype = dtype.strip()
            if self.dtype == "measurements":
                ###self.df['measurement_name'] = self.df['experiment_name'] + self.df['measurement_number']
                self.df["measurement"] = self.df["experiment"] + self.df["number"].astype(str)
                name = "measurement"
            elif self.dtype.endswith("s"):
                # dtype = dtype[:-1]
                name = "{}".format(self.dtype[:-1])
            elif self.dtype == "contribution":
                name = "doi"
                # **** this is broken at the moment, fix it!
                return
            else:
                name = self.dtype
            # fix these:
            if self.dtype == "images":
                self.df = pd.DataFrame()
                return
            if self.dtype == "criteria":
                # self.df = pd.DataFrame()
                self.df.index = self.df["table_column"]
                return
            if len(self.df) and self.dtype != "ages":
                self.df.index = self.df[name].astype(str)
            elif self.dtype == "ages":
                self.df.index = self.df.index.astype(str)
            # del self.df[name]
            # self.dtype = dtype
            # replace '' with None, so you can use isnull(), notnull(), etc.
            # can always switch back with DataFrame.fillna('')
            self.df = self.df.where(self.df.notnull(), None)

            # drop any completely blank columns
            # this is not necessarily a good idea....
            # self.df.dropna(axis=1, how='all', inplace=True)
            #
            # add df columns that were passed in but weren't in the file
            if columns:
                for col in columns:
                    if col not in self.df.columns:
                        self.df[col] = None

        # add col_names by group
        if groups and not columns:
            columns = []
            for group_name in groups:
                columns.extend(list(self.data_model.get_group_headers(self.dtype, group_name)))
            for col in columns:
                if col not in self.df.columns:
                    self.df[col] = None
            self.df = self.df[columns]

    ## Methods to change self.df inplace

    def update_row(self, ind, row_data):
        """
        Update a row with data.
        Must provide the specific numeric index (not row label).
        If any new keys are present in row_data dictionary,
        that column will be added to the dataframe.
        This is done inplace.
        """
        if sorted(row_data.keys()) != sorted(self.df.columns):
            # add any new column names
            for key in row_data:
                if key not in self.df.columns:
                    self.df[key] = None
            # add missing column names into row_data
            for col_label in self.df.columns:
                if col_label not in row_data.keys():
                    row_data[col_label] = None
        try:
            self.df.iloc[ind] = pd.Series(row_data)
        except IndexError:
            return False
        return self.df

    def add_row(self, label, row_data, columns=""):
        """
        Add a row with data.
        If any new keys are present in row_data dictionary,
        that column will be added to the dataframe.
        This is done inplace
        """
        # use provided column order, making sure you don't lose any values
        # from self.df.columns
        if len(columns):
            if sorted(self.df.columns) == sorted(columns):
                # same set of columns: reorder to the provided order
                self.df = self.df[list(columns)]
            else:
                # keep the provided order, then append any leftover columns
                new_columns = list(columns)
                for col in self.df.columns:
                    if col not in new_columns:
                        new_columns.append(col)
                self.df = self.df.reindex(columns=new_columns)
        # make sure all columns have data or None
        if sorted(row_data.keys()) != sorted(self.df.columns):
            # add any new column names
            for key in row_data:
                if key not in self.df.columns:
                    self.df[key] = None
            # add missing column names into row_data
            for col_label in self.df.columns:
                if col_label not in row_data.keys():
                    row_data[col_label] = None

        # (make sure you are working with strings)
        self.df.index = self.df.index.astype(str)
        label = str(label)

        # create a new row with suffix "new"
        # (this ensures that you get a unique, new row,
        #  instead of adding on to an existing row with the same label)
        self.df.loc[label + "new"] = pd.Series(row_data)
        # rename it to be correct
        self.df.rename(index={label + "new": label}, inplace=True)
        # use next line to sort index inplace
        # self.df.sort_index(inplace=True)
        return self.df

    def add_blank_row(self, label):
        """
        Add a blank row with only an index value to self.df.
        This is done inplace.
        """
        col_labels = self.df.columns
        blank_item = pd.Series({}, index=col_labels, name=label)
        # use .loc to add in place (append won't do that)
        self.df.loc[blank_item.name] = blank_item
        return self.df

    def delete_row(self, ind):
        """
        Remove the self.df row at (numeric) position ind.
        Inplace.
        """
        self.df = pd.concat([self.df.iloc[:ind], self.df.iloc[ind + 1:]])
        return self.df

    def delete_rows(self, condition):
        """
        Delete all rows where condition == True.
        Inplace.
        """
        self.df["num"] = range(len(self.df))
        df_data = self.df
        # delete all records that meet condition
        if len(df_data[condition]) > 0:  # we have one or more records to delete
            inds = df_data[condition]["num"]  # list of all rows where condition is TRUE
            for ind in inds[::-1]:
                df_data = self.delete_row(ind)
                print "deleting row {}".format(str(ind))
        # sort so that all rows for an item are together
        df_data.sort_index(inplace=True)
        # redo temporary index
        df_data["num"] = range(len(df_data))
        self.df = df_data
        return df_data

    def update_record(self, name, new_data, condition, update_only=False, debug=False):
        """
        Find the first row in self.df with index == name
        and condition == True.
        Update that record with new_data, then delete any
        additional records where index == name and condition == True.
        Change is inplace
        """
        # add numeric index column temporarily
        self.df["num"] = range(len(self.df))
        df_data = self.df
        # edit first of existing data that meets condition
        if len(df_data[condition]) > 0:  # we have one or more records to update or delete
            condition2 = df_data.index == name
            # list of all rows where condition is true and index == name
            inds = df_data[condition & condition2]["num"]
            # inds = df_data[condition]['num'] # list of all rows where condition is true
            existing_data = dict(df_data.iloc[inds.iloc[0]])  # get first record of existing_data from dataframe
            existing_data.update(new_data)  # update existing data with new interpretations
            # update row
            self.update_row(inds.iloc[0], existing_data)
            # now remove all the remaining records of same condition
            if len(inds) > 1:
                for ind in inds[1:]:
                    print "deleting redundant records for:", name
                    df_data = self.delete_row(ind)
        else:
            if update_only:
                print "no record found for that condition, not updating ", name
            else:
                print "no record found - creating new one for ", name
                # add new row
                df_data = self.add_row(name, new_data)
        # sort so that all rows for an item are together
        df_data.sort_index(inplace=True)
        # redo temporary index
        df_data["num"] = range(len(df_data))
        self.df = df_data
        return df_data

    ## Methods that take self.df and extract some information from it

    def convert_to_pmag_data_list(self, lst_or_dict="lst", df=None):
        """
        Take MagicDataFrame and turn it into a list of dictionaries.
        This will have the same format as reading in a 2.5 file
        with pmag.magic_read(), i.e.:
        if "lst":
          [{"sample": "samp_name", "azimuth": 12, ...}, {...}]
        if "dict":
          {"samp_name": {"azimuth": 12, ...}, "samp_name2": {...}, ...}
        NOTE: "dict" not recommended with 3.0, as one sample can have
        many rows, which means that dictionary items can be overwritten
        """
        if df is None:
            df = self.df
        if lst_or_dict == "lst":
            return list(df.T.apply(dict))
        else:
            return {str(i[df.index.name]): dict(i) for i in list(df.T.apply(dict))}
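    # usage sketch (hypothetical container and values):
    #   specimens.convert_to_pmag_data_list("lst")
    #     -> [{'specimen': 'spec1', ...}, {'specimen': 'spec2', ...}]
    #   specimens.convert_to_pmag_data_list("dict")
    #     -> {'spec1': {...}, 'spec2': {...}}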

    def get_name(self, col_name, df_slice="", index_names=""):
        """
        Takes in a column name, and either a DataFrame slice or
        a list of index_names to slice self.df using fancy indexing.
        Then return the value for that column in the relevant slice.
        (Assumes that all values for column will be the same in the
         chosen slice, so return the first one.)
        """
        # if a DataFrame slice is provided, use it
        if isinstance(df_slice, pd.DataFrame):
            pass
        # if given index_names, grab a slice using fancy indexing
        elif index_names:
            df_slice = self.df.loc[index_names]
        # otherwise, use the full DataFrame
        else:
            df_slice = self.df
        # if the slice is empty, return ""
        if len(df_slice) == 0:
            return ""
        # if the column name isn't present in the slice, return ""
        if col_name not in df_slice.columns:
            return ""
        # otherwise, return the first value from that column
        first_val = df_slice[col_name].dropna()
        if len(first_val):
            return first_val.iloc[0]
        else:
            return ""

    def get_di_block(self, df_slice=None, do_index=False, item_names=None, tilt_corr="100", excl=None):
        """
        Input either a DataFrame slice
        or
        do_index=True and a list of index_names.
        Optional arguments:
        Provide tilt_corr (default 100).
        Excl is a list of method codes to exclude.
        Output dec/inc from the slice in this format:
        [[dec1, inc1], [dec2, inc2], ...].
        Not inplace
        """
        tilt_corr = int(tilt_corr)
        if isinstance(df_slice, str):
            if df_slice.lower() == "all":
                # use entire DataFrame
                df_slice = self.df
        elif do_index:
            # use fancy indexing (but note this will give duplicates)
            df_slice = self.df.loc[item_names]
        # otherwise, use the provided slice as-is

        # once you have the slice, fix up the data
        # tilt correction must match
        if tilt_corr != 0:
            df_slice = df_slice[df_slice["dir_tilt_correction"] == tilt_corr]
        else:
            # if geographic ("0"),
            # use records with no tilt_corr and assume geographic
            cond1 = df_slice["dir_tilt_correction"] == None
            cond2 = df_slice["dir_tilt_correction"] == tilt_corr
            df_slice = df_slice[cond1 | cond2]
        # exclude data with unwanted codes
        if excl:
            for ex in excl:
                df_slice = self.get_records_for_code(ex, incl=False, use_slice=True, sli=df_slice)

        df_slice = df_slice[df_slice["dir_inc"].notnull() & df_slice["dir_dec"].notnull()]
        # possible add in:
        # split out di_block from this study from di_block from other studies (in citations column)
        # for now, just use "This study"
        if "citations" in df_slice.columns:
            df_slice = df_slice[df_slice["citations"] == "This study"]

        # convert values into DIblock format
        di_block = [[float(row["dir_dec"]), float(row["dir_inc"])] for ind, row in df_slice.iterrows()]
        return di_block

    def get_records_for_code(self, meth_code, incl=True, use_slice=False, sli=None, strict_match=True):
        """
        Use regex to see if meth_code is in the method_codes ":" delimited list.
        If incl == True, return all records WITH meth_code.
        If incl == False, return all records WITHOUT meth_code.
        If strict_match == True, return only records with the exact meth_code.
        If strict_match == False, return records that contain the meth_code
        partial string (e.g., "DE-").
        Not inplace
        """
        # (must use fillna to replace np.nan with False for indexing)
        if use_slice:
            df = sli.copy()
        else:
            df = self.df.copy()
        # if meth_code not provided, return unchanged dataframe
        if not meth_code:
            return df
        # get regex
        if not strict_match:
            # grab any record that contains any part of meth_code
            cond = df["method_codes"].str.contains(meth_code).fillna(False)
        else:
            # grab only an exact match
            pattern = re.compile(r"{}(?=:|\s|\Z)".format(meth_code))
            cond = df["method_codes"].str.contains(pattern).fillna(False)
        if incl:
            # return a copy of records with that method code:
            return df[cond]
        else:
            # return a copy of records without that method code
            return df[~cond]
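    # usage sketch, exact vs. partial matching (hypothetical calls):
    #   self.get_records_for_code("LP-DIR")                   # records WITH exactly LP-DIR
    #   self.get_records_for_code("DE-", strict_match=False)  # records containing any DE- code
    #   self.get_records_for_code("LP-PI", incl=False)        # records WITHOUT LP-PI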

    ## Combining multiple DataFrames

    def merge_dfs(self, df1, replace_dir_or_int):
        """
        Take new calculated directional data, intensity data, or both,
        and replace the corresponding data in self.df with the new input
        data, preserving any data that is not replaced.

        @param: df1 - first DataFrame, whose data will preferentially be used.
        @param: replace_dir_or_int - must be the string 'dir', 'int', or 'full';
            acts as a flag telling the function whether to replace directional
            data, intensity data, or everything in the current table.
            If there is not enough data in the current table to split by dir
            or int, the two DataFrames will be fully merged.
            (Note: for tables other than specimens.txt you should likely use
            'full', as that is the only option tested on other tables.)
        """

        if self.df.empty:
            return df1
        elif df1.empty:
            return self.df

        # copy to prevent mutation
        cdf2 = self.df.copy()

        # split data into types and decide which to replace
        if replace_dir_or_int == "dir" and "method_codes" in cdf2.columns:
            cdf2 = cdf2[cdf2["method_codes"].notnull()]
            acdf2 = cdf2[cdf2["method_codes"].str.contains("LP-PI")]
            mcdf2 = cdf2[cdf2["method_codes"].str.contains("LP-DIR")]
        elif replace_dir_or_int == "int" and "method_codes" in cdf2.columns:
            cdf2 = cdf2[cdf2["method_codes"].notnull()]
            mcdf2 = cdf2[cdf2["method_codes"].str.contains("LP-PI")]
            acdf2 = cdf2[cdf2["method_codes"].str.contains("LP-DIR")]
        else:
            mcdf2 = cdf2
            acdf2 = pd.DataFrame(columns=mcdf2.columns)

        # drop columns from mcdf2 that would be duplicated by the join with df1
        mcdf2.drop([cx for cx in mcdf2.columns if cx in df1.columns],
                   inplace=True, axis=1)

        # join the new calculated data with the old data of same type
        if self.dtype.endswith("s"):
            dtype = self.dtype[:-1]
        else:
            dtype = self.dtype
        mdf = df1.join(mcdf2, how="left", rsuffix="_remove", on=dtype)
        # drop duplicate columns if they are created
        mdf.drop([col for col in mdf.columns if col.endswith("_remove")],
                 inplace=True, axis=1)
        # the join can duplicate rows; drop exact duplicates
        # (ignoring the free-form "description" column)
        mdf.drop_duplicates(inplace=True, subset=[col for col in mdf.columns if col != "description"])
        # merge the data of the other type with the new data
        mdf = mdf.merge(acdf2, how="outer")
        if dtype in mdf.columns:
            # set_index moves the name column into the index;
            # restore it as a regular column as well
            mdf = mdf.set_index(dtype)
            mdf[dtype] = mdf.index
            mdf.sort_index(inplace=True)

        return mdf

    ## Methods for writing self.df out to tab-delimited file

    def write_magic_file(self, custom_name=None, dir_path=".", append=False):
        """
        Write self.df out to tab-delimited file.
        By default will use standard MagIC filenames (specimens.txt, etc.),
        or you can provide a custom_name to write to instead.
        By default will write to current directory,
        or provide dir_path to write out to instead.
        """
        # *** maybe add some logical order to the column names, here?
        # *** i.e., alphabetical...  see grid_frame3.GridBuilder.make_grid
        df = self.df
        # if indexing column was put in, remove it
        if "num" in self.df.columns:
            self.df.drop("num", axis=1, inplace=True)
        dir_path = os.path.realpath(dir_path)
        if custom_name:
            fname = os.path.join(dir_path, custom_name)
        else:
            fname = os.path.join(dir_path, self.dtype + ".txt")
        # add to existing file
        if append:
            print "-I- appending {} data to {}".format(self.dtype, fname)
            mode = "a"
        # overwrite existing file
        elif os.path.exists(fname):
            print "-I- overwriting {}".format(fname)
            mode = "w"
        # or create new file
        else:
            print "-I- writing {} data to {}".format(self.dtype, fname)
            mode = "w"
        f = open(fname, mode)
        f.write("tab\t{}\n".format(self.dtype))
        df.to_csv(f, sep="\t", header=True, index=False)
        f.close()
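# A minimal usage sketch (assumes a MagIC-format 'sites.txt' in the
# current directory; the names here are hypothetical):
# sites = MagicDataFrame(magic_file='sites.txt')
# sites.add_row('new_site', {'site': 'new_site', 'location': 'some_location'})
# sites.write_magic_file(dir_path='.')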