Example #1
 def test_partial_boolean_frame_indexing(self):
     # GH 17170
     df = DataFrame(np.arange(9.).reshape(3, 3),
                    index=list('abc'),
                    columns=list('ABC'))
     index_df = DataFrame(1, index=list('ab'), columns=list('AB'))
     result = df[index_df.notnull()]
     expected = DataFrame(np.array([[0., 1., np.nan], [3., 4., np.nan],
                                    [np.nan] * 3]),
                          index=list('abc'),
                          columns=list('ABC'))
     tm.assert_frame_equal(result, expected)
Example #2
 def test_partial_boolean_frame_indexing(self):
     # GH 17170
     df = DataFrame(np.arange(9.).reshape(3, 3),
                    index=list('abc'), columns=list('ABC'))
     index_df = DataFrame(1, index=list('ab'), columns=list('AB'))
     result = df[index_df.notnull()]
     expected = DataFrame(np.array([[0., 1., np.nan],
                                    [3., 4., np.nan],
                                    [np.nan] * 3]),
                          index=list('abc'),
                          columns=list('ABC'))
     tm.assert_frame_equal(result, expected)
Example #3
 def test_partial_boolean_frame_indexing(self):
     # GH 17170
     df = DataFrame(np.arange(9.0).reshape(3, 3),
                    index=list("abc"),
                    columns=list("ABC"))
     index_df = DataFrame(1, index=list("ab"), columns=list("AB"))
     result = df[index_df.notnull()]
     expected = DataFrame(
         np.array([[0.0, 1.0, np.nan], [3.0, 4.0, np.nan], [np.nan] * 3]),
         index=list("abc"),
         columns=list("ABC"),
     )
     tm.assert_frame_equal(result, expected)
Example #4
def _build_seq_len_table(qscores: pd.DataFrame) -> str:
    sequence_lengths = qscores.notnull().sum(axis=1).copy()
    stats = _compute_stats_of_df(sequence_lengths)

    stats[stats.index != 'count'] = \
        stats[stats.index != 'count'].astype(int).apply('{} nts'.format)

    stats.rename(index={
        '50%': '50% (Median)',
        'count': 'Total Sequences Sampled'
    },
                 inplace=True)
    frame = stats.to_frame(name="")
    return q2templates.df_to_html(frame)
Example #5
def plot_first_heatmap(heatmap: pd.DataFrame, ax: plt.Axes) -> None:
    """Plots the top heatmap in the output figure."""
    expression = 'expression'
    if LOG2_TRANSFORM and LOG2_FIRST:
        expression = 'log2(expression)'
        heatmap = np.log2(heatmap)
    label = f'raw {expression}'
    if NORMALIZE:
        label = f'{expression} (normalized for each marker)'
        if GLOBAL_NORMALIZE:
            label = f'{expression} (normalized across all markers)'
            heat_max = np.nanmax(heatmap.values)
            heat_min = np.nanmin(heatmap.values)
            heatmap = (2 * (heatmap - heat_min)) / (heat_max - heat_min) - 1
        else:
            heatmap = heatmap.apply(lambda x: (2 * (x - x.min())) /
                                    (x.max() - x.min()) - 1)

    sns.heatmap(data=heatmap,
                ax=ax,
                cmap=CMAP,
                square=False,
                xticklabels=1,
                linewidths=0.1,
                vmin=-1,
                vmax=1)
    if LABEL_NANS:
        data_labels = heatmap.isnull().replace({True: 'no data', False: ''})
        nan_data = pd.DataFrame(1.0,
                                index=data_labels.index,
                                columns=data_labels.columns)
        sns.heatmap(data=nan_data,
                    ax=ax,
                    cmap='binary',
                    center=1.0,
                    linewidths=0.1,
                    mask=heatmap.notnull(),
                    cbar=False,
                    annot=data_labels,
                    annot_kws={
                        'color': 'k',
                        'size': 6
                    },
                    fmt='',
                    xticklabels=1)
    cbar = ax.collections[0].colorbar
    cbar.set_label(label)
    cbar.set_ticks([-0.99, 0, 1.0])
    cbar.set_ticklabels(['-1.0', '0.0', '1.0'])
    ax.set_ylabel('')
Example #6
def convert_types(df: DataFrame, metric: str) -> DataFrame:
    df = (
        df
        .where(df.notnull(), None)
        .rename(columns={
            f'{metric}RollingSum': "rollingSum",
            f'{metric}Change': "change",
            f'{metric}Direction': "direction",
            f'{metric}ChangePercentage': "changePercentage",
            f'{metric}RollingRate': "rollingRate"
        })
        .to_dict("index")
    )

    return DataFrame(df.items(), columns=["date", "payload"])
Example #7
def not_nulls(df: pd.DataFrame, *columns) -> pd.DataFrame:
    '''
        Filter out null values.
        If no columns are passed, the notnull() mask is applied to the
        whole data frame; if columns are passed, their notnull() masks
        are combined with the & operator and used to filter rows.
        return:
            Pandas DataFrame
    '''
    if not columns:
        return df[df.notnull()]
    filter_ = df[columns[0]].notnull()
    for column in columns[1:]:
        filter_ &= df[column].notnull()  # avoiding null values
    return df[filter_]
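A small usage sketch for the helper above; the toy frame and column names are illustrative:
import numpy as np
import pandas as pd

toy = pd.DataFrame({'a': [1, np.nan, 3], 'b': [4, 5, np.nan]})
subset = not_nulls(toy, 'a', 'b')  # keeps only row 0, the only row with both 'a' and 'b' present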
Example #8
 def encapsulation(df_: pd.DataFrame) -> Iterable:
     df_sub = df_.where(df_.notnull(), None)
     i = 1
     for index_, row_ in df_sub.iterrows():
         # i += 1
         # if i > 2300:
         #     break
         R = FactorRetData()
         R.date = dt.datetime.strptime(index_, "%Y-%m-%d")
         R.factor_T = row_['T'] if ret_type == 'Pearson' else None
         R.holding_period = hp
         R.factor_return = row_['factor_return']
         R.factor_name = self.fact_name
         R.factor_name_chinese = self.factor_mapping[self.fact_name]
         R.ret_type = ret_type
         yield R
Example #9
 def upload_bi_dataset(self,
                       df: pd.DataFrame,
                       table_name,
                       dsid=None,
                       primary_keys=None,
                       replace=False):
     """
     :param df: 上传的pd.DataFrame()
     :param dsid: BI上的数据集的dsId
     :param table_name: BI上的数据集的表名
     :param primary_keys: 主键列表;如果设置了主键,则上传时会根据主键覆盖更新
     :param replace: False表示在BI的数据集上追加上传df, True表示使用df全量替换BI上的数据集
     """
     df = df.where(df.notnull(), None)
     data = df.to_dict(orient='records')
     url = f"{self.home_url}/public-api/upload-dataset"
     headers = {
         "Content-Type": "application/json",
         "X-Auth-Token": self.get_user_auth_token()
     }
     for i in range(math.ceil(len(data) / 1000)):
         start = 1000 * i
         end = 1000 * (i + 1) if 1000 * (i + 1) < len(data) else len(data)
         logging.info(f'uploading index {start} to {end}')
         finish = False if end < len(data) else True
         if replace:
             overwrite = i == 0
         else:
             overwrite = False
         body = {
             "tableName": table_name,
             "overwriteExistingData": overwrite,
             "data": data[start:end],
             "batchFinish": finish
         }
         if primary_keys:
             primary_columns = list()
             for key in primary_keys:
                 primary_columns.append({"name": key, "isPrimaryKey": True})
             body["columns"] = primary_columns
         if dsid:
             body['dsId'] = dsid
         res = requests.post(url, json=body, headers=headers).json()
         if res['result'] != 'ok':
             logging.error(f'UPLOAD ERROR, ERROR MESSAGE:{res}')
             break
     return None
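A hedged usage sketch; `client`, the table name, dsid, and data below are placeholders, assuming `client` is an instance of the (not shown) class this method belongs to:
import pandas as pd

payload = pd.DataFrame({"id": [1, 2], "value": [3.5, None]})
# `client` is a hypothetical instance of the surrounding BI client class
client.upload_bi_dataset(payload,
                         table_name="demo_table",
                         dsid="123",
                         primary_keys=["id"],
                         replace=True)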
Example #10
 def encapsulation(df: pd.DataFrame) -> Iterable:
     df_sub = df.where(df.notnull(), None)
     i = 1
     for index_, row_ in df_sub.iterrows():
         i += 1
         if i > 2300:
             break
         G = GroupData()
         G.stock_id = row_[KN.STOCK_ID.value]
         G.date = index_[0].to_pydatetime()
         G.stock_return = row_[KN.STOCK_RETURN.value]
         G.factor_value = row_[self.fact_name]
         G.factor_name = self.fact_name
         G.holding_period = hp
         G.factor_name_chinese = self.factor_mapping[self.fact_name]
         G.group = index_[1]
         G.industry = row_[SN.INDUSTRY_FLAG.value]
         G.factor_type = self.factor_dict[self.fact_name].factor_type
         yield G
Example #11
def beta_diversity(X: pd.DataFrame,
                   y: pd.Series,
                   label_intra="Intra",
                   label_inter="Inter",
                   label_metric="Distance",
                   class_type="Class"):
    """
    X: a symmetric pd.DataFrame distance matrix with a diagonal of 0
    y: a pd.Series of class labels
    """
    if isinstance(X, Symmetric):
        X = X.to_dense()
    # Assertions
    assert is_symmetrical(X, tol=1e-10), "X must be symmetric"
    assert np.all(X.index == X.columns), "X.index != X.columns"
    assert set(X.index) <= set(
        y.index), "Not all elements in X.index are in y.index"
    assert np.all(X.notnull(
    )), "X cannot contain any NaN.  Please preprocess the data accordingly"

    # Get indexable arrays
    idx_labels = X.index
    loc_labels = np.arange(X.shape[0])
    A = X.values

    # Calculate the diversity
    diversity_data = defaultdict(dict)
    for id_class, idx_class in pd_series_collapse(
            y[idx_labels], type_collection=pd.Index).items():
        loc_class = idx_class.map(lambda x: idx_labels.get_loc(x)).values
        loc_nonclass = np.delete(loc_labels, loc_class)
        # Intra
        intra_distances = squareform(A[loc_class, :][:, loc_class])
        diversity_data[(id_class, label_intra)] = pd.Series(intra_distances)
        # Inter
        inter_distances = A[loc_class, :][:, loc_nonclass].ravel()
        diversity_data[(id_class, label_inter)] = pd.Series(inter_distances)

    # Create output
    df_beta_diversity = pd.DataFrame(diversity_data)
    df_beta_diversity.columns.names = [class_type, "Diversity"]
    return df_beta_diversity
Example #12
def get_tree(df: pd.DataFrame, index_key, parent_key):
    """
    界面权限树构造
    """
    df = df.where(df.notnull(), None)
    # drop rows where index_key and parent_key are both null; NaN has already been replaced with None
    result = df.dropna(how='all', subset=[index_key, parent_key])
    # drop the weight column
    result = result.drop(columns='weight')
    result["index"] = result[index_key]
    result.set_index("index", inplace=True)

    # remove rows whose index_key value is empty
    result = result[~(result[index_key].isnull())]
    # convert records to a dict: {"index1": row1_dict, "index2": row2_dict, ...}
    result_dict = result.to_dict(orient="index", into=OrderedDict)
    # build the list of root nodes
    # root_key = [i for i in result[result[parent_key].isna()].index]
    root_key = []
    for index, row in result_dict.items():
        if not row[parent_key] in result_dict:
            root_key.append(index)

    # group children by parent: {"parent1": children_list, "parent2": children_list}
    parent_groups = result.groupby(parent_key).groups
    for group, childrens in parent_groups.items():
        # maintain the parent-child relationship in result_dict
        for children in childrens:
            if result_dict.get(group):
                result_dict[group].setdefault("children",
                                              []).append(result_dict[children])
            else:
                break
    content = []
    # collect the root nodes from the result_dict that now carries the parent-child links
    for i in root_key:
        content.append(result_dict[i])

    return content, result_dict
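A small usage sketch for get_tree, assuming a toy permissions frame; the id/parent/weight column names are illustrative:
import pandas as pd

menu = pd.DataFrame({
    'id': ['a', 'b', 'd'],
    'parent': [None, 'a', None],
    'weight': [1, 2, 3],
})
content, lookup = get_tree(menu, index_key='id', parent_key='parent')
# content holds the two roots ('a' with child 'b', and 'd'); lookup maps each id to its row dict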
Example #13
def clean_null(data: pd.DataFrame):
    tlogg.info('clean_null')

    # Force all null values to None rather than mixed type with np.nan, NaT
    def _replace_nat(series: pd.Series) -> pd.Series:
        new_series = series.mask(series.isnull(), None)
        return new_series

    def _replace_empty_strings(x):
        if x == '':
            return None
        return x

    temp_data = data.copy()
    temp_data = temp_data.where(data.notnull(), None)
    for col in temp_data.columns[temp_data.dtypes == object]:
        temp_data[col] = temp_data[col].apply(_replace_empty_strings)
    # Date cleanup
    date_columns = [column for column in temp_data.columns if "date" in column]
    for col in date_columns:
        temp_data[col] = temp_data[col].astype('object')
        tlogg.info(f'Cleaning Null Dates from Column: {col}')
        temp_data[col] = _replace_nat(temp_data[col])
    return temp_data
Example #14
## Search for missing data using:
df1.isnull()  #this is the pandas method to find missing data
np.isnan(df1) #numpy way # returns a boolean array; True means missing
## to see the "verbs" (the list of all methods a DataFrame offers), type "DataFrame." and then hit tab

## subset of columns to find missing values
cols = ['a', 'c']
df1[cols]
df1[cols].isnull()

## For series
df1['b'].isnull()  #this will actually help us extract data

## find non-missing values
df1.notnull() #now true means it is not a missing value
df1.isnull()

## FILLING IN OR DROPPING MISSING VALUES -----------------------------------

##FILLING
## pandas method fillna

df1.fillna(9999) #if missing, it'll fill it with 9999
df1
df2 = df1.fillna(999)
df2

## DROPPING empty/missing values
## pandas method dropna
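A short sketch of the dropna calls the notes above refer to (same df1 as before):
df1.dropna()                # drops ROWS that contain ANY missing value
df1.dropna(axis=1)          # drops COLUMNS that contain ANY missing value
df1.dropna(how='all')       # drops ROWS only when every value is missing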
Example #15
class MagicDataFrame(object):

    """
    Each MagicDataFrame corresponds to one MagIC table.
    The MagicDataFrame object consists of a pandas DataFrame,
    and assorted methods for manipulating that DataFrame.
    """

    def __init__(self, magic_file=None, columns=None, dtype=None,
                 groups=None, dmodel=None, df=None):
        """
        Provide either a magic_file or a dtype.
        List of columns is optional,
        and will only be used if magic_file == None.
        Instead of a list of columns, you can also provide
        a list of group-names, and the specific col_names
        will be filled in by the data model.
        If provided, col_names takes precedence.
        """
        if isinstance(df, pd.DataFrame):
            self.df = df
            if dtype:
                self.dtype = dtype
            else:
                print('-W- Please provide data type...')
        # make sure all required arguments are present
        if not magic_file and not dtype and not isinstance(df, pd.DataFrame):
            print("-W- To make a MagicDataFrame, you must provide either a filename or a datatype")
            return
        # fetch data model if not provided
        if isinstance(dmodel, type(None)):
            self.data_model = data_model.DataModel()
        else:
            self.data_model = dmodel

        if isinstance(df, pd.DataFrame):
            pass
        # if no file is provided, make an empty dataframe of the appropriate type
        elif not magic_file:
            self.dtype = dtype
            if not isinstance(columns, type(None)):
                self.df = DataFrame(columns=columns)
            else:
                self.df = DataFrame()
                self.df.index.name = dtype[:-1] if dtype.endswith("s") else dtype
        # if there is a file provided, read in the data and ascertain dtype
        else:
            ## old way of reading in data using pmag.magic_read
            #data, dtype, keys = pmag.magic_read(magic_file, return_keys=True)
            ## create dataframe, maintaining column order:
            #self.df = DataFrame(data, columns=keys)
            #if dtype == 'bad_file':
            #    print "-W- Bad file {}".format(magic_file)
            #    self.dtype = 'empty'
            #    return

            ## new way of reading in data using pd.read_table
            with open(magic_file) as f:
                delim, dtype = f.readline().split('\t')[:2]
            self.df = pd.read_table(magic_file, skiprows=[0])
            self.dtype = dtype.strip()
            if self.dtype == 'measurements':
                ###self.df['measurement_name'] = self.df['experiment_name'] + self.df['measurement_number']
                self.df['measurement'] = self.df['experiment'] + self.df['number'].astype(str)
                name = 'measurement'
            elif self.dtype.endswith('s'):
                #dtype = dtype[:-1]
                name = '{}'.format(self.dtype[:-1])
            elif self.dtype == 'contribution':
                name = 'doi'
                # **** this is broken at the moment, fix it!
                return
            else:
                name = self.dtype
            # fix these:
            if self.dtype == 'images':
                self.df = pd.DataFrame()
                return
            if self.dtype == 'criteria':
                #self.df = pd.DataFrame()
                self.df.index = self.df['table_column']
                return
            if len(self.df) and self.dtype != 'ages':
                self.df.index = self.df[name].astype(str)
            elif self.dtype == 'ages':
                self.df.index = self.df.index.astype(str)
            #del self.df[name]
            #self.dtype = dtype
            # replace '' with None, so you can use isnull(), notnull(), etc.
            # can always switch back with DataFrame.fillna('')
            self.df = self.df.where(self.df.notnull(), None)

            # drop any completely blank columns
            # this is not necessarily a good idea....
            #self.df.dropna(axis=1, how='all', inplace=True)
            #
            # add df columns that were passed in but weren't in the file
            if columns:
                for col in columns:
                    if col not in self.df.columns:
                        self.df[col] = None

        # add col_names by group
        if groups and not columns:
            columns = []
            for group_name in groups:
                columns.extend(list(self.data_model.get_group_headers(self.dtype, group_name)))
            for col in columns:
                if col not in self.df.columns:
                    self.df[col] = None
            self.df = self.df[columns]



    ## Methods to change self.df inplace

    def update_row(self, ind, row_data):
        """
        Update a row with data.
        Must provide the specific numeric index (not row label).
        If any new keys are present in row_data dictionary,
        that column will be added to the dataframe.
        This is done inplace.
        """
        if sorted(row_data.keys()) != sorted(self.df.columns):
            # add any new column names
            for key in row_data:
                if key not in self.df.columns:
                    self.df[key] = None
            # add missing column names into row_data
            for col_label in self.df.columns:
                if col_label not in row_data.keys():
                    row_data[col_label] = None
        try:
            self.df.iloc[ind] = pd.Series(row_data)
        except IndexError:
            return False
        return self.df


    def add_row(self, label, row_data, columns=""):
        """
        Add a row with data.
        If any new keys are present in row_data dictionary,
        that column will be added to the dataframe.
        This is done inplace
        """
        # use provided column order, making sure you don't lose any values
        # from self.df.columns
        if len(columns):
            if sorted(self.df.columns) == sorted(columns):
                self.df.columns = columns
            else:
                new_columns = []
                new_columns.extend(columns)
                for col in self.df.columns:
                    if col not in new_columns:
                        new_columns.append(col)
        # makes sure all columns have data or None
        if sorted(row_data.keys()) != sorted(self.df.columns):
            # add any new column names
            for key in row_data:
                if key not in self.df.columns:
                    self.df[key] = None
            # add missing column names into row_data
            for col_label in self.df.columns:
                if col_label not in row_data.keys():
                    row_data[col_label] = None

        # (make sure you are working with strings)
        self.df.index = self.df.index.astype(str)
        label = str(label)

        # create a new row with suffix "new"
        # (this ensures that you get a unique, new row,
        #  instead of adding on to an existing row with the same label)
        self.df.loc[label + "new"] = pd.Series(row_data)
        # rename it to be correct
        self.df.rename(index={label + "new": label}, inplace=True)
        # use next line to sort index inplace
        #self.df.sort_index(inplace=True)
        return self.df


    def add_blank_row(self, label):
        """
        Add a blank row with only an index value to self.df.
        This is done inplace.
        """
        col_labels = self.df.columns
        blank_item = pd.Series({}, index=col_labels, name=label)
        # use .loc to add in place (append won't do that)
        self.df.loc[blank_item.name] = blank_item
        return self.df


    def delete_row(self, ind):
        """
        remove self.df row at ind
        inplace
        """
        self.df = pd.concat([self.df[:ind], self.df[ind+1:]])
        return self.df

    def delete_rows(self,condition):
        """
        delete all rows with  condition==True
        inplace
        """
        self.df['num'] = range(len(self.df))
        df_data = self.df
        # delete all records that meet condition
        if len(df_data[condition]) > 0:  #we have one or more records to delete
            inds = df_data[condition]['num'] # list of all rows where condition is true
            for ind in inds:
                df_data = self.delete_row(ind)
                print('deleting row where: ', condition)
        # sort so that all rows for an item are together
        df_data.sort_index(inplace=True)
        # redo temporary index
        df_data['num'] = range(len(df_data))
        self.df = df_data
        return df_data


    def update_record(self, name, new_data, condition, update_only=False,
                      debug=False):
        """
        Find the first row in self.df with index == name
        and condition == True.
        Update that record with new_data, then delete any
        additional records where index == name and condition == True.
        Change is inplace
        """
        # add numeric index column temporarily
        self.df['num'] = range(len(self.df))
        df_data = self.df
        # edit first of existing data that meets condition
        if len(df_data[condition]) > 0:  #we have one or more records to update or delete
            #print "updating:", name
            inds = df_data[condition]['num'] # list of all rows where condition is true
            existing_data = dict(df_data.iloc[inds[0]]) # get first record of existing_data from dataframe
            existing_data.update(new_data) # update existing data with new interpretations
            # update row
            self.update_row(inds[0], existing_data)
            # now remove all the remaining records of same condition
            if len(inds) > 1:
                for ind in inds[1:]:
                    print("deleting redundant records for:", name)
                    df_data = self.delete_row(ind)
        else:
            if update_only:
                print("no record found for that condition, not updating ", name)
            else:
                print('no record found - creating new one for ', name)
                # add new row
                df_data = self.add_row(name, new_data)
        # sort so that all rows for an item are together
        df_data.sort_index(inplace=True)
        # redo temporary index
        df_data['num'] = range(len(df_data))
        self.df = df_data
        return df_data


    ## Methods that take self.df and extract some information from it

    def convert_to_pmag_data_list(self, lst_or_dict="lst", df=None):

        """
        Take MagicDataFrame and turn it into a list of dictionaries.
        This will have the same format as reading in a 2.5 file
        with pmag.magic_read(), i.e.:
        if "lst":
          [{"sample": "samp_name", "azimuth": 12, ...}, {...}]
        if "dict":
          {"samp_name": {"azimuth": 12, ...}, "samp_name2": {...}, ...}
        """
        if isinstance(df, type(None)):
            df = self.df
        dictionary = dict(df.T)
        if lst_or_dict == "lst":
            return [dict(dictionary[key]) for key in dictionary]
        else:
            return {key: dict(dictionary[key]) for key in dictionary}


    def get_name(self, col_name, df_slice="", index_names=""):
        """
        Takes in a column name, and either a DataFrame slice or
        a list of index_names to slice self.df using fancy indexing.
        Then return the value for that column in the relevant slice.
        (Assumes that all values for column will be the same in the
         chosen slice, so return the first one.)
        """
        # if slice is provided, use it
        if any(df_slice):
            df_slice = df_slice
        # if given index_names, grab a slice using fancy indexing
        elif index_names:
            df_slice = self.df.ix[index_names]
        # otherwise, use the full DataFrame
        else:
            df_slice = self.df
        # if the slice is empty, return ""
        if len(df_slice) == 0:
            return ""
        # if the column name isn't present in the slice, return ""
        if col_name not in df_slice.columns:
            return ""
        # otherwise, return the first value from that column
        first_val = df_slice[col_name].dropna()
        if any(first_val):
            return first_val[0]
        else:
            return ""
        #return df_slice[col_name].dropna()[0]


    def get_di_block(self, df_slice=None, do_index=False,
                     item_names=None, tilt_corr='100',
                     excl=None):
        """
        Input either a DataFrame slice
        or
        do_index=True and a list of index_names.
        Output dec/inc from the slice in this format:
        [[dec1, inc1], [dec2, inc2], ...].
        Not inplace
        """
        tilt_corr = int(tilt_corr)
        if isinstance(df_slice, str):
            if df_slice.lower() == "all":
                # use entire DataFrame
                df_slice = self.df
        elif do_index:
            # use fancy indexing (but note this will give duplicates)
            df_slice = self.df.ix[item_names]
        elif not do_index:
            # otherwise use the provided slice
            df_slice = df_slice

        # once you have the slice, fix up the data
        # tilt correction must match
        if tilt_corr != 0:
            df_slice = df_slice[df_slice['dir_tilt_correction'] == tilt_corr]
        else:
            # if geographic ("0"),
            # use records with no tilt_corr and assume geographic
            cond1 = df_slice['dir_tilt_correction'] == None
            cond2 = df_slice['dir_tilt_correction'] == tilt_corr
            df_slice = df_slice[cond1 | cond2]
        # exclude data with unwanted codes
        if excl:
            for ex in excl:
                df_slice = self.get_records_for_code(ex, incl=False,
                                                     use_slice=True,
                                                     sli=df_slice)

        df_slice = df_slice[df_slice['dir_inc'].notnull() & df_slice['dir_dec'].notnull()]
        # possible add in:
        # split out di_block from this study from di_block from other studies (in citations column)
        # for now, just use "This study"
        if 'citations' in df_slice.columns:
            df_slice = df_slice[df_slice['citations'] == "This study"]

        # convert values into DIblock format
        di_block = [[float(row['dir_dec']), float(row['dir_inc'])] for ind, row in df_slice.iterrows()]
        return di_block


    def get_records_for_code(self, meth_code, incl=True, use_slice=False,
                             sli=None, strict_match=True):
        """
        Use regex to see if meth_code is in the method_codes ":" delimited list.
        If incl == True, return all records WITH meth_code.
        If incl == False, return all records WITHOUT meth_code.
        If strict_match == True, return only records with the exact meth_code.
        If strict_match == False, return records that contain the meth_code partial string,
        (i.e., "DE-").
        Not inplace
        """
        # (must use fillna to replace np.nan with False for indexing)
        if use_slice:
            df = sli.copy()
        else:
            df = self.df.copy()
        # if meth_code not provided, return unchanged dataframe
        if not meth_code:
            return df
        # get regex
        if not strict_match:
            # grab any record that contains any part of meth_code
            cond = df['method_codes'].str.contains(meth_code).fillna(False)
        else:
            # grab only an exact match
            pattern = re.compile(r'{}(?=:|\s|\Z)'.format(meth_code))
            cond = df['method_codes'].str.contains(pattern).fillna(False)
        if incl:
            # return a copy of records with that method code:
            return df[cond]
        else:
            # return a copy of records without that method code
            return df[~cond]


    ## Combining multiple DataFrames

    def merge_dfs(self, df1, replace_dir_or_int):
        """
        Description: takes new calculated directional, intensity data, or both and replaces the corresponding data in self.df with the new input data preserving any data that is not replaced.

        @param: df1 - first DataFrame whose data will preferentially be used.
        @param: replace_dir_or_int - must be the string 'dir', 'int', or 'full'; acts as a flag telling the function whether to replace directional data, intensity data, or everything in the current table. If there is not enough data in the current table to split by dir or int, the two dfs will be fully merged. (Note: if you are dealing with tables other than specimens.txt you should likely use 'full', as that is the only option tested on other tables.)
        """

        if self.df.empty: return df1
        elif df1.empty: return self.df

        #copy to prevent mutation
        cdf2 = self.df.copy()

        #split data into types and decide which to replace
        if replace_dir_or_int == 'dir' and 'method_codes' in cdf2.columns:
            cdf2 = cdf2[cdf2['method_codes'].notnull()]
            acdf2 = cdf2[cdf2['method_codes'].str.contains('LP-PI')]
            mcdf2 = cdf2[cdf2['method_codes'].str.contains('LP-DIR')]
        elif replace_dir_or_int == 'int' and 'method_codes' in cdf2.columns:
            cdf2 = cdf2[cdf2['method_codes'].notnull()]
            mcdf2 = cdf2[cdf2['method_codes'].str.contains('LP-PI')]
            acdf2 = cdf2[cdf2['method_codes'].str.contains('LP-DIR')]
        else:
            mcdf2 = cdf2
            acdf2 = pd.DataFrame(columns=mcdf2.columns)

        #get rid of stupid duplicates
        for c in [cx for cx in mcdf2.columns if cx in df1.columns]:
            del mcdf2[c]

        #join the new calculated data with the old data of same type
        mdf = df1.join(mcdf2, how='inner', lsuffix='__remove')
        #duplicates rows for some freaking reason
        mdf.drop_duplicates(inplace=True,subset=[col for col in mdf.columns if col != 'description'])
        #merge the data of the other type with the new data
        mdf = mdf.merge(acdf2, how='outer')
        if self.dtype.endswith('s'): dtype = self.dtype[:-1]
        else: dtype = self.dtype
        if dtype in mdf.columns:
            #fix freaking indices because pandas
            mdf = mdf.set_index(dtype)
            #really? I wanted the index changed not a column deleted?!?
            mdf[dtype] = mdf.index
            mdf.sort_index(inplace=True)

        return mdf


    ## Methods for writing self.df out to tab-delimited file

    def write_magic_file(self, custom_name=None, dir_path=".", append=False):
        """
        Write self.df out to tab-delimited file.
        By default will use standard MagIC filenames (specimens.txt, etc.),
        or you can provide a custom_name to write to instead.
        By default will write to current directory,
        or provide dir_path to write out to instead.
        """
        # *** maybe add some logical order to the column names, here?
        # *** i.e., alphabetical...  see grid_frame3.GridBuilder.make_grid
        df = self.df
        # if indexing column was put in, remove it
        if "num" in self.df.columns:
            self.df.drop("num", axis=1, inplace=True)
        dir_path = os.path.realpath(dir_path)
        if custom_name:
            fname = os.path.join(dir_path, custom_name)
        else:
            fname = os.path.join(dir_path, self.dtype + ".txt")
        # add to existing file
        if append:
            print('-I- appending {} data to {}'.format(self.dtype, fname))
            mode = "a"
        # overwrite existing file
        elif os.path.exists(fname):
            print('-I- overwriting {}'.format(fname))
            mode = "w"
        # or create new file
        else:
            print('-I- writing {} data to {}'.format(self.dtype, fname))
            mode = "w"
        f = open(fname, mode)
        f.write('tab\t{}\n'.format(self.dtype))
        df.to_csv(f, sep="\t", header=True, index=False)
        f.close()
Example #16
frame.tz.value_counts(dropna=False)

#see counts of browser details

browsers = Series([x.split()[0] for x in frame['a'].dropna()])

##More modern approach to the above
browser2 = Series(frame['a'] \
            .str.split(' ') \
            .str.get(0))


operating_system = np.where(frame['a'].str.contains('Windows'),
                            'Windows', 'Not Windows')

frame=frame[frame.notnull()]

agg_counts = frame.groupby(['tz', operating_system]) \
            .size() \
            .unstack() \
            .fillna(0)

agg_counts[:10]

# used to sort in ascending order
# argsort is used just to get the indices of the sort by the summed columns
## of course, a simpler way would just be to derive a 'total' column and then sort by it (see the sketch below)
## take the indices of the sorted row totals
indexer = agg_counts.sum(axis=1).argsort()
indexer[:10]
count_subset = agg_counts.take(indexer)[-10:]
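A sketch of the simpler "total column" alternative mentioned above (same agg_counts frame; avoids argsort entirely):
totals = agg_counts.sum(axis=1)
count_subset2 = agg_counts.loc[totals.sort_values().index[-10:]]  # rows with the 10 largest totals, ascending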
Example #17
# count value frequencies for each column separately
data = DataFrame({'Qu1':[1,3,4,3,4],
	'Qu2':[2,3,1,2,3],
	'Qu3':[2,3,1,2,3]})
data.apply(pd.value_counts)

## Handling missing data
string_data = Series(['aardvark','artichoke',np.nan,'avocado'])
# None in Python is treated as NA
string_data[0] = None
string_data.isnull()
# Filtering out missing data
from numpy import nan as NA
data = Series([1,NA,3.5,NA,7])
data.dropna()
data[data.notnull()]
# For a DataFrame
data = DataFrame([[1.,6.5,3.],[1.,NA,NA],
                  [NA,NA,NA],[NA,6.5,3.]])
cleaned = data.dropna()
# drop rows that are entirely NA
data.dropna(how='all')
# drop columns that are entirely NA
data.dropna(axis=1, how='all')
# Filling in missing data
df.fillna(0)
# fill different columns with different values
df.fillna({'one':0.5,'two':-1})
# modify the existing object in place
_ = df.fillna({'one':0.5,'two':-1}, inplace=True)
# forward fill
Example #18
 def valid_for(self, prediction_range):
     """confirm whether the model can predict for a given range of months"""
     index = date_range(start=prediction_range.start_date, end=prediction_range.end_date, freq="MS")
     result = DataFrame(index=index).join(self.parameters, on=index.month).value
     return result.notnull().all()
Example #19
# 1    False
# 2     True
# 3    False

print('####################################')

from numpy import nan as NA

data = Series([1, NA, 3.5, NA, 7])
print(data.dropna())
# 0    1.0
# 2    3.5
# 4    7.0

# this can also be achieved with boolean indexing
print(data[data.notnull()])
# 0    1.0
# 2    3.5
# 4    7.0

data = DataFrame([[1, 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])
print(data)
#     0    1   2
# 0   1  6.5   3
# 1   1  NaN NaN
# 2 NaN  NaN NaN
# 3 NaN  6.5   3
# dropna drops any row that contains a missing value by default
cleaned = data.dropna()
print(cleaned)
#    0    1  2
Example #20
 def _handle_dataframe_nans(df: pd.DataFrame):
     return df.where(df.notnull(), None)
Example #21
result = data.apply(pd.value_counts).fillna(0)
result
## Handling Missing Data
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull
string_data.isnull()
string_data[0] = None
string_data.isnull()
string_data
# Filtering out Missing Data
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data.dropna()
data[data.notnull()]
data = DataFrame([[1., 6.5, 3.], [1., NA, NA],
                  [NA, NA, NA], [NA, 6.5, 3.]])
data
cleaned = data.dropna()
cleaned
cleaned = data.dropna() # drops every row that contains at least one NA value
data.dropna(how='all') # only drops rows where every value is NA
data[4] = NA
data
data.dropna(axis=1, how='all')
## Filling in Missing Data
df
df.fillna(0)
df = DataFrame(np.random.randn(7, 3))
df
Example #22
# search a whole dataframe
df1.isnull()
np.isnan(df1)

# search specific columns
cols = ['a', 'c'] # create a list of column keys
df1[cols]
df1[cols].isnull()

# also works on a series
df1['b']
df1['b'].isnull()

# pandas also has a negation of `isnull`, `notnull`
df1.isnull()
df1.notnull()
df1.isnull() == df1.notnull() # all false! perfectly opposite

# FILLING IN OR DROPPING MISSING VALUES ----------------------------

## using pandas method `fillna`
df1.fillna(999) # put the value you want to fill missing values with
df2 = df1.fillna(999)
df2

## we can also drop columns or rows with missing values using `dropna`
## `dropna` also has two options:
##  - `axis` where `axis = 0` is rows and = 1 is columns
##  - `how` where ` how = 'any' ` means drop if row/col has ANY missing values
##          and ` how = 'all' ` means drop if row/col has ALL missing values
##  for more, type 'DataFrame.dropna?' in the console
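For reference, a short sketch combining the two dropna options described above (same df1):
df1.dropna(axis=0, how='any')   # drop ROWS that have ANY missing values
df1.dropna(axis=1, how='all')   # drop COLUMNS where ALL values are missing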
Example #23
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

n = np.nan
print(type(n))
print(1 + n)

s1 = Series([1, 2, n, 3, 4], index=list('ABCDE'))
print(s1)
print(s1.isnull(), s1.notnull())

s1 = s1.dropna()
print(s1)
# isnull, notnull, and dropna work on DataFrame just as they do on Series
df = DataFrame([[1, 2, n], [1, n, 3], [n, n, 3], [n, n, n]])
print(df)
print(df.isnull(), df.notnull())
print(
    df.dropna(axis=0, how='all', thresh=1)
)  # axis=0 works row-wise; how='any' drops a row containing any NaN, how='all' drops only rows that are all NaN; thresh=1 keeps rows with at least one non-NaN value
df2 = df.fillna(value={0: 'A', 1: 'B', 2: 'C', 3: "D"})  # fill NaN in column 0 with 'A', column 1 with 'B', etc.
print(df2)
Example #24
# isnull(): returns True where the value is NaN
print(stringData.isnull())
stringData[0] = None
print(stringData.isnull())

from numpy import nan as NA

# Filtering out missing data
# using dropna is a handy way to do this; it returns a Series as the result

# dropna(): excludes NA values
data = Series([1, NA, 3.4, NA, 8])
print(data.dropna())

# or compute a boolean mask directly and use it to select the values
print(data.notnull())
print(data[data.notnull()])

# Filtering missing data out of a DataFrame
# by default, dropna drops any row that contains even a single NA value
data = DataFrame([[1, 5.5, 3], [1, NA, NA], [NA, NA, NA], [NA, 3.3, 3]])
print(data)
print(data.dropna())

# with the how='all' option, only rows whose values are all NA are dropped
print(data.dropna(how='all'))

data[4] = NA
print(data)

# to drop only the columns whose values are all NA, likewise pass axis=1
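# A sketch of the call described in the comment above, on the same data:
print(data.dropna(axis=1, how='all'))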
Example #25
# ### Vectorized string functions in pandas

# In[ ]:


data = {'Dave': '*****@*****.**', 'Steve': '*****@*****.**',
        'Rob': '*****@*****.**', 'Wes': np.nan}
data = Series(data)
data


# In[ ]:


data.isnull()
data[data.notnull()]


# In[ ]:


data.str.contains('gmail')


# In[ ]:


pattern


# In[ ]:
Example #26
    def fit(self,
            X: pd.DataFrame,
            Y: pd.DataFrame,
            fixed_effects_variables: list,
            formula: str = None,
            add_attributes_to_fixed_variables=False,
            categorical_variables="infer",
            residual_type="resid_response",
            fit_kws=dict(),
            assert_missing_values=True):
        start_time = time.time()
        # Check inputs
        if assert_missing_values:
            assert np.all(X.notnull(
            )), "X has missing values and it will throw downstream errors."
            assert np.all(Y.notnull(
            )), "Y has missing values and it will throw downstream errors."
        assert len(set(X.columns) & set(Y.columns)
                   ) == 0, "There can't be an overlap of columns in X and Y."
        assert set(X.index) == set(
            Y.index), f"X.index and Y.index must have same values"
        Y = Y.loc[X.index, :]
        self.X_ = X.copy()
        self.Y_ = Y.copy()
        self.formula = formula
        self.residual_type = residual_type

        # Adding attributes to fixed variables
        self.add_attributes_to_fixed_variables = add_attributes_to_fixed_variables
        if self.add_attributes_to_fixed_variables:
            warnings.warn("Experimental: `add_attributes_to_fixed_variables`")

        # Categorical variables
        if categorical_variables == "infer":
            categorical_variables = list()
            for id_variable in self.Y_.columns:
                if self.Y_[id_variable].dtype == object:
                    categorical_variables.append(id_variable)
            if self.verbose:
                if bool(categorical_variables):
                    print("Inferred categorical variables from `Y`: {}".format(
                        categorical_variables),
                          file=sys.stderr)

        if bool(categorical_variables):
            assert is_nonstring_iterable(
                categorical_variables
            ), "`categorical_variables` must be a non-string iterable if it is not None"
            for id_variable in categorical_variables:
                assert id_variable in self.Y_.columns, "Not all variables from `categorical_variables` are in `Y.columns`"
                self.Y_[id_variable] = self.Y_[id_variable].astype("category")

        # Encode the variables
        X_relabeled, encoding_X, decoding_X = self.relabel_attributes(
            X, attribute_prefix="attr_")
        Y_relabeled, encoding_Y, decoding_Y = self.relabel_attributes(
            Y, attribute_prefix="meta_")
        self.encoding_ = {**encoding_X, **encoding_Y}
        self.decoding_ = {**decoding_X, **decoding_Y}

        # Check variables
        self.attributes_ = X.index
        self.fixed_effect_variables_ = [
            *map(lambda x: self.encoding_[x], fixed_effects_variables)
        ]
        variables = fixed_effects_variables
        assert set(variables) <= set(
            Y.columns), f"{set(variables) - set(Y.columns)} not in `Y.columns`"

        # Run regression models
        args = {
            "X": X_relabeled,
            "Y": Y_relabeled,
            "formula": formula,
            "multiple_comparison_method": self.multiple_comparison_method,
            "residual_type": self.residual_type,
            "fit_kws": fit_kws,
        }
        if self.verbose:
            data = Parallel(n_jobs=self.n_jobs)(
                delayed(self._run_glm)(query_attr, **args)
                for query_attr in tqdm(X_relabeled.columns,
                                       "Modeling each attribute"))
        else:
            data = Parallel(n_jobs=self.n_jobs)(
                delayed(self._run_glm)(query_attr, **args)
                for query_attr in X_relabeled.columns)

        # Model results
        self.models_ = OrderedDict(zip(X.columns, map(lambda x: x[0], data)))
        self.model_results_ = OrderedDict(
            zip(X.columns, map(lambda x: x[1], data)))
        self.synopsis_ = pd.DataFrame([*map(lambda x: x[2], data)])
        self.synopsis_.index.name = "id_attribute"
        self.residuals_ = pd.DataFrame(
            OrderedDict(zip(X.columns, map(lambda x: x[3], data))))

        # Decode the variables
        self.fixed_effect_variables_ = [
            *map(lambda x: self.decoding_[x], self.fixed_effect_variables_)
        ]

        # Duration
        self.fitted = True
        self.duration_ = format_duration(start_time)
        return self
Example #27
def gantt_to_excel(
    data: pd.DataFrame,
    start_col: str,
    end_col: str,
    duration_col: str,
    description: str,
    output: str,
    date_format: str = "d-m-yyyy",
    colour: str = "f79646",
    symbol: str = "",
    only_workdays: bool = True,
    holidays: typing.Iterable[str] = {},
):
    """Function that converts a DataFrame into Gantt chart in an Excel spreadsheet.

    data: pd.DataFrame
        Input dataframe containing start and end dates, durations and descriptions of
        every task

    start_col: str
        Column that contains the start dates of every task

    end_col: str
        Column that contains the end dates of every task

    duration_col: str
        Column that contains the duration (in days) for every task

    output: str
        Name of the output file to be generated. Typically has the extension of .xls / .xlsx

    date_format: str, default="d-m-yyyy"
        The formatting of the dates in the output. The supported options can be found at
        https://xlsxwriter.readthedocs.io/working_with_dates_and_time.html#working-with-dates-and-time

    colour: str, default="f79646"
        Sets the colour of the bars in the generated Gantt chart. The default is a shade of
        orange. A colour palette for reference is http://wordfaqs.ssbarnhill.com/Word%202007%20Color%20Swatches.pdf

    symbol: str, default=""
        Displays symbols in the bars populated in the Gantt chart. Blank by default. Can be
        used to simulate ASCII-like output

    only_workdays: bool, default=True
        Flag for whether only workdays are displayed on the chart

    holidays: Iterable[str], default={}
        Optional list of holidays
    """
    assert {start_col, end_col, duration_col, description}.issubset(
        data.columns
    ), "Some of the columns are not present in the data"
    assert data.notnull().all(None), "Nulls are not permitted in the data."

    data = data.copy()  # Don't mutate the original dataframe
    data[start_col] = pd.to_datetime(data[start_col])
    data[end_col] = pd.to_datetime(data[end_col])

    row_nums = {
        desc: row
        for row, desc in enumerate(
            data.groupby(description)
            .apply(lambda x: x[start_col].min())
            .sort_values()
            .index
        )
    }
    data.index = data[description].map(row_nums)

    # Setting up the workbook object
    workbook = xlsxwriter.Workbook(output)
    # Formats
    # https://xlsxwriter.readthedocs.io/working_with_dates_and_time.html#working-with-dates-and-time
    date_format = workbook.add_format({"num_format": date_format, "bold": True})
    bold_format = workbook.add_format({"bold": True})
    cell_colour = workbook.add_format()
    # Pick colours from http://wordfaqs.ssbarnhill.com/Word%202007%20Color%20Swatches.pdf
    cell_colour.set_bg_color(colour)
    worksheet = workbook.add_worksheet("Chart")

    min_date, max_date = data[start_col].min(), data[end_col].max()
    if only_workdays:
        date_range = generate_date_series(min_date, max_date, holidays=holidays)
    else:
        date_range = generate_date_series(min_date, max_date, {}, {})

    for col, day in enumerate(date_range):
        worksheet.write(0, col + 1, day, date_format)

    endpoints = zip(data[start_col], data[end_col], data.index)
    for task in data[description]:
        start, end, row = next(endpoints)
        worksheet.write(row + 1, 0, task, bold_format)
        start_index = where(start, date_range) + 1
        end_index = where(end, date_range) + 2

        for col in range(
            start_index,
            end_index,
        ):
            worksheet.write(row + 1, col, symbol, cell_colour)

    workbook.close()
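A hedged usage sketch for the function above; the column names, dates, and output file are illustrative, and the helpers it relies on (generate_date_series, where) plus xlsxwriter are assumed to be importable from the same module:
import pandas as pd

# Illustrative only: a three-task plan written to gantt.xlsx
tasks = pd.DataFrame({
    "start": ["2021-01-04", "2021-01-06", "2021-01-11"],
    "end": ["2021-01-08", "2021-01-12", "2021-01-15"],
    "days": [5, 5, 5],
    "task": ["Design", "Build", "Test"],
})
gantt_to_excel(tasks, start_col="start", end_col="end", duration_col="days",
               description="task", output="gantt.xlsx")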
Example #28
print(df.duplicated(subset='name'))  # only the name column is checked: rows that match there count as duplicates, whatever the other columns hold
# keep=False: every duplicated row is marked True, including the first. keep='first' (default)/'last': all rows except the first/last occurrence are marked True
print(df.duplicated(subset='age',
                    keep=False))  # only the age column is checked, regardless of the other columns
# drop duplicate rows, keeping only one of each
print(df.drop_duplicates())
print(df.drop_duplicates(['name']))  # only check the name column

# (2) Handling missing values
# 1. Identifying missing data
# Pandas uses NaN for missing data in both float and non-float arrays; use .isnull() / .notnull() to test for it
filename = r'\rz.xlsx'
df = read_excel(data_source_path + filename, sheet_name='Sheet2')
print(df)
print(df.isnull())
print(df.notnull())

# 2. Dealing with missing data
# Options: fill in the data, drop the affected rows, or leave it untouched
# 1. Drop the affected rows: dropna
newDf = df.dropna()  # drop rows containing NaN
print(newDf)
print(len(newDf))  # number of rows
print(newDf.columns)  # an Index holding the column names
newDf = df.dropna(how='all')  # a row is dropped only when all of its columns are empty
print(newDf)
print(df.dropna(axis=1))  # drop by column
print(df.dropna(how='all', axis=1))  # drop by column, only all-empty columns
# 2. Fill in the data: fillna
print(df.fillna('?'))
df.at[0, '数分'] = None
Example #29
## search for missing data using
df1.isnull()  #pandas method to find missing data
np.isnan(df1)  # numpy way

## subset of columns
cols = ['a', 'c']
df1[cols]
df1[cols].isnull()

## for series
df1['b'].isnull()

## find non-missing values
df1.isnull()
df1.notnull()
df1.isnull() == df1.notnull()

# FILLING IN OR DROPPING VALUES

## pandas method 'fillna'
df1
df1.fillna(999)
df2 = df1.fillna(999)

## pandas method 'dropna'
# in R, 'null' and 'n/a' are 2 different values
# in python, think of 'n/a', 'null', and 'NaN' as interchangeable
df1.dropna()  # drops ROWS with ANY missing values
df1.dropna(axis=0, how='any')  # drop ROWS with ANY missing values
df1.dropna(axis=1, how='any')  # drop COLUMNS with ANY missing values
Example #30
## search for missing data using
df1.isnull() # pandas method to find missing data
np.isnan(df1) # numpy way

## subset of columns
cols = ['a', 'c']
df1[cols]
df1[cols].isnull()

## for series
df1['b'].isnull()

## find non-missing values
df1.isnull()
df1.notnull()
df1.isnull() == df1.notnull()


# FILLING IN OR DROPPING VALUES

## pandas method `fillna`
df1.fillna(999)
df2 = df1.fillna(999)

## pandas method `dropna`
df1.dropna() # drops ROWS with ANY missing values
df1.dropna(axis = 0, how = 'any') # drop ROWS with ANY missing values
df1.dropna(axis = 1, how = 'any') # drop COLS with ANY missing values
df1.dropna(axis = 0, how = 'all') # drop ROWS with ALL missing values
Example #31
t0 = time.perf_counter()
np.arange(1e5, dtype=int).sum()
t1 = time.perf_counter()
print("Total running time: %s s" % (str(t1 - t0)))
'''7.4 Test functions: isnull() and notnull()'''
df = DataFrame({
    'Python': [88, 104, 113],
    '数学': [118, 132, 119]
},
               columns=['Python', '数学', '英语'])
df.loc[2, '英语'] = 127

# 7.4.1 Test whether each element is null
pd.isnull(df)  # returns True where null; exactly the same result as the line below
df.isnull()  # returns True where null
df.notnull()  # returns False where null, True where not null
~df.notnull()  # returns True where null
df.isnull().all(axis=0)  # per column: True only if the entire column is null
df.isnull().all(axis=1)  # per row: True only if the entire row is null
df.isnull().any(axis=0)  # per column: True if the column has any null
df.isnull().any(axis=1)  # per row: True if the row has any null
df.loc[df.isnull().any(axis=1)]  # filter rows containing nulls

# 7.4.2 Test whole columns/rows: True only if nothing is null, False if anything is null
df.notnull().all(axis=0)  # per column
df.notnull().all(axis=1)  # per row
df[df.notnull().all(axis=1)]  # output every row that has no nulls

# 7.4.3 Test whole columns/rows: True as long as at least one value is not null, False only if everything is null
df.notnull().any(axis=0)  # per column
df.notnull().any(axis=1)  # per row
Example #32
def validate_schedule_data(available_schedule: pd.DataFrame):
    assert (available_schedule.notnull().all().all()
            ), f"Some task schedule data is missing: {available_schedule}"
Example #33
def find_not_null(rate: float, df: pd.DataFrame) -> list:
    se = df.notnull().sum()
    return [x for x in se.index if se[x] / len(df) >= rate]
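A quick usage sketch for the helper above (toy frame; the 0.8 threshold is illustrative):
import numpy as np
import pandas as pd

toy = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [1, np.nan, np.nan, np.nan]})
dense_columns = find_not_null(0.8, toy)  # ['a'] -- 'b' is only 25% non-null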
Example #34
def validate_single_column_int_data(dataframe: pd.DataFrame):
    assert (dataframe.notnull().all().all()
            ), f"Some task durations are missing: {dataframe}"
    assert (len(dataframe.columns) == 1
            ), f"Multiple columns for task durations data: {dataframe}"
Example #35
import pandas
import numpy as np

from pandas import Series, DataFrame

df = DataFrame(data=np.random.randint(0, 150, size=(100, 5)),
               index=np.arange(100, 200),
               columns=["Python", "En", "Math", "Java", "MySQL"])
# print(df)

# Check whether the DataFrame contains any missing data, column by column
print(df.isnull())  # True where data is missing, False where it is not
print(df.isnull().any())  # any() returns True if there is at least one True, otherwise (all False) it returns False
# df.isnull().any() returning all False means there is no missing data

print(df.notnull())  # False where data is missing, True where it is not
print(df.notnull().all())  # all() returns False if there is at least one False, otherwise (all True) it returns True
# df.notnull().all() returning all True means there is no missing data

# Put some missing values into the DataFrame
for i in range(30):
    # row index
    index = np.random.randint(100, 200, size=1)[0]
    cols = df.columns
    # column index
    col = np.random.choice(cols)  # choice() picks one at random

    # set a missing value
    df.loc[index, col] = None
    # df.loc[index,col] = np.NAN
Example #36
def replace_nan_with_none_in_dataframe(dataframe: pd.DataFrame) -> pd.DataFrame:
    dataframe = dataframe.where(dataframe.notnull(), None).dropna(axis=0, how="all")
    return dataframe.replace({np.nan: None})
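A brief usage sketch for the helper above (toy data; behavior assumes a reasonably recent pandas):
import numpy as np
import pandas as pd

raw = pd.DataFrame({"x": [1.0, np.nan], "y": [np.nan, np.nan]})
cleaned = replace_nan_with_none_in_dataframe(raw)
# the all-NaN second row is dropped; the remaining NaN in 'y' becomes None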