def test_fillna_categorical_nan(self):
    # GH 14021
    # np.nan should always be a valid filler
    cat = Categorical([np.nan, 2, np.nan])
    val = Categorical([np.nan, np.nan, np.nan])
    df = DataFrame({"cats": cat, "vals": val})
    res = df.fillna(df.median())
    v_exp = [np.nan, np.nan, np.nan]
    df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
                       dtype='category')
    tm.assert_frame_equal(res, df_exp)

    result = df.cats.fillna(np.nan)
    tm.assert_series_equal(result, df.cats)
    result = df.vals.fillna(np.nan)
    tm.assert_series_equal(result, df.vals)

    idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
                            '2011-01-01 09:00', pd.NaT, pd.NaT])
    df = DataFrame({'a': Categorical(idx)})
    tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

    idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01',
                          pd.NaT, pd.NaT], freq='M')
    df = DataFrame({'a': Categorical(idx)})
    tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

    idx = pd.TimedeltaIndex(['1 days', '2 days', '1 days',
                             pd.NaT, pd.NaT])
    df = DataFrame({'a': Categorical(idx)})
    tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
def test_fillna_skip_certain_blocks(self):
    # don't try to fill boolean, int blocks
    df = DataFrame(np.random.randn(10, 4).astype(int))

    # it works!
    df.fillna(np.nan)
def test_merge_na_keys(self):
    data = [[1950, "A", 1.5],
            [1950, "B", 1.5],
            [1955, "B", 1.5],
            [1960, "B", np.nan],
            [1970, "B", 4.],
            [1950, "C", 4.],
            [1960, "C", np.nan],
            [1965, "C", 3.],
            [1970, "C", 4.]]
    frame = DataFrame(data, columns=["year", "panel", "data"])

    other_data = [[1960, 'A', np.nan],
                  [1970, 'A', np.nan],
                  [1955, 'A', np.nan],
                  [1965, 'A', np.nan],
                  [1965, 'B', np.nan],
                  [1955, 'C', np.nan]]
    other = DataFrame(other_data, columns=['year', 'panel', 'data'])

    result = frame.merge(other, how='outer')

    expected = frame.fillna(-999).merge(other.fillna(-999), how='outer')
    expected = expected.replace(-999, np.nan)

    tm.assert_frame_equal(result, expected)
def getPostData(fbGraph, entry):
    global CHART_LIMIT
    retrieved = False
    i = 0
    while not retrieved:
        i += 1
        try:
            posts = fbGraph.get_object(entry['page'] + '/posts',
                                       limit=CHART_LIMIT * 15)['data']
            retrieved = True
        except facebook.GraphAPIError:
            print "Failed retrieving Graph object from facebook, retrying..."
        if i > 14:
            print "Giving up"
            return None
    frame = DataFrame(posts)  # Later, maybe output this frame for further study
    postData = DataFrame(columns=('Date', 'Likes', 'Shares'))
    postData['Shares'] = frame['shares'].map(fmtShares)
    postData['Likes'] = frame['id'].map(fmtLikes)
    postData['Date'] = frame['created_time'].map(fmtDate)
    postData = postData.groupby(by='Date', sort=False).mean()
    postData = postData.head(n=CHART_LIMIT)
    # fillna returns a new frame; assign it back so the zeros are kept
    postData = postData.fillna(value=0)
    return postData
def test_operators_none_as_na(self):
    df = DataFrame({"col1": [2, 5.0, 123, None],
                    "col2": [1, 2, 3, 4]}, dtype=object)

    ops = [operator.add, operator.sub, operator.mul, operator.truediv]

    # since filling converts dtypes from object, changed expected to be
    # object
    for op in ops:
        filled = df.fillna(np.nan)
        result = op(df, 3)
        expected = op(filled, 3).astype(object)
        expected[com.isnull(expected)] = None
        assert_frame_equal(result, expected)

        result = op(df, df)
        expected = op(filled, filled).astype(object)
        expected[com.isnull(expected)] = None
        assert_frame_equal(result, expected)

        result = op(df, df.fillna(7))
        assert_frame_equal(result, expected)

        result = op(df.fillna(7), df)
        assert_frame_equal(result, expected, check_dtype=False)
def timeseries_to_supervised(data, lag=1):
    df = DataFrame(data)
    columns = [df.shift(i) for i in range(1, lag + 1)]
    columns.append(df)
    df = concat(columns, axis=1)
    df.fillna(0, inplace=True)
    return df
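# A minimal usage sketch for timeseries_to_supervised above (assumes the same
# environment as the function, i.e. `from pandas import DataFrame, concat`).
# With lag=2, column i holds the series shifted i steps back and the
# rightmost column is the unshifted target; history-less leading rows are
# zero-filled by the fillna(0) call.
supervised = timeseries_to_supervised([10.0, 20.0, 30.0, 40.0], lag=2)
print(supervised)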
def test_fillna_dtype_conversion(self):
    # make sure that fillna on an empty frame works
    df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
    result = df.get_dtype_counts().sort_values()
    expected = Series({'object': 5})
    assert_series_equal(result, expected)

    result = df.fillna(1)
    expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
    result = result.get_dtype_counts().sort_values()
    expected = Series({'int64': 5})
    assert_series_equal(result, expected)

    # empty block
    df = DataFrame(index=lrange(3), columns=['A', 'B'], dtype='float64')
    result = df.fillna('nan')
    expected = DataFrame('nan', index=lrange(3), columns=['A', 'B'])
    assert_frame_equal(result, expected)

    # equiv of replace
    df = DataFrame(dict(A=[1, np.nan], B=[1., 2.]))
    for v in ['', 1, np.nan, 1.0]:
        expected = df.replace(np.nan, v)
        result = df.fillna(v)
        assert_frame_equal(result, expected)
def pad_smooth(sample, window_len):
    nr_frames = sample.frame.max() + 40
    out = DataFrame({'frame': np.arange(nr_frames)})
    out['sample_id'] = sample.sample_id.unique()[0]
    out = pd.merge(out, sample, how='outer', on=['sample_id', 'frame'])
    out.fillna(method='ffill', inplace=True, limit=2)
    out.fillna(method='bfill', inplace=True)
    out.fillna(method='ffill', inplace=True)
    for gesture in range(21):
        out[gesture] = smooth(out[gesture], window_len=window_len,
                              window='hanning')
    return out
def test_fillna_columns(self):
    df = DataFrame(np.random.randn(10, 10))
    df.values[:, ::2] = np.nan
    result = df.fillna(method='ffill', axis=1)
    expected = df.T.fillna(method='pad').T
    assert_frame_equal(result, expected)

    df.insert(6, 'foo', 5)
    result = df.fillna(method='ffill', axis=1)
    expected = df.astype(float).fillna(method='ffill', axis=1)
    assert_frame_equal(result, expected)
class Fillna(object):

    params = ([True, False], ['pad', 'bfill'])
    param_names = ['inplace', 'method']

    def setup(self, inplace, method):
        values = np.random.randn(10000, 100)
        values[::2] = np.nan
        self.df = DataFrame(values)

    def time_frame_fillna(self, inplace, method):
        self.df.fillna(inplace=inplace, method=method)
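# The class above follows the asv (airspeed velocity) benchmark convention,
# where asv expands `params` into every (inplace, method) combination. A
# hedged sketch of driving one combination by hand outside asv:
bench = Fillna()
bench.setup(inplace=False, method='pad')
bench.time_frame_fillna(inplace=False, method='pad')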
def makeguesses(morceaux, collectionname='grenoble', databasename='local'):
    with MongoClient() as client:
        coll = client[databasename][collectionname]
        dbvecs = DataFrame({d['T']: titlevector(d['T'])
                            for d in coll.find() if d['T'] is not None})
        dbvecs.fillna(0, inplace=True)
        dbvecs.sort_index(inplace=True)
        dbkeys = {d['T']: d['_id'] for d in coll.find() if d['T'] is not None}
        # get the best fit
        guesses = (np.abs(dbvecs.subtract(titlevector(m), axis=0)).sum(axis=0).idxmin()
                   for m in morceaux)
        return [[m, g, dbkeys[g]] for m, g in zip(morceaux, guesses)]
def calc_distance_matrix(G, max_distance=None):
    """Returns a matrix containing the shortest distance between all nodes
    in a network

    Parameters
    ----------
    G : graph
        A NetworkX graph

    max_distance : float or None, optional (default=None)
        The maximum possible distance value in the network. If None,
        max_distance is the longest shortest path between two nodes of the
        network (the graph diameter)

    Returns
    -------
    dist_matrix : NumPy array
        An NxN numpy array.

    Notes
    -----
    Along the diagonal, the values are all 0.
    Unconnected nodes have a distance of max_distance to other nodes.
    """
    # Network (collaborator) Distance
    dist_matrix = nx.all_pairs_shortest_path_length(G)
    dist_matrix = DataFrame(dist_matrix, index=G.nodes(), columns=G.nodes())
    if max_distance is None:
        max_distance = float(dist_matrix.max().max())
    # The unconnected ones are infinitely far from the rest
    dist_matrix = dist_matrix.fillna(max_distance)
    diag_idx = np.diag_indices(len(dist_matrix), ndim=2)
    dist_matrix.values[diag_idx] = 0
    return dist_matrix
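# A small usage sketch, assuming networkx < 2.0, where
# all_pairs_shortest_path_length returns the dict of dicts the function
# expects. Nodes 1-2-3 form a path and node 4 is isolated, so row/column 4
# is filled with max_distance by the fillna call.
G = nx.Graph()
G.add_edges_from([(1, 2), (2, 3)])
G.add_node(4)
print(calc_distance_matrix(G, max_distance=10))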
def get_date_trend(self, mode_date):
    """
    :param mode_date: date granularity; aggregate to the given time unit.
                      0-day, 1-week, 2-month, 3-quarter (default 2)
    """
    axisLabels = self.oriDate[:]
    pointVals = [{copy.deepcopy(oriValue): 1} for oriValue in self.oriValues]
    rule_mode = {'0': 'D', '1': 'W', '2': 'M', '3': 'Q'}
    df = DataFrame(pointVals, index=axisLabels)
    df = df.resample(rule_mode[str(mode_date)], how='sum')
    df = df.fillna(0)

    # Overall totals:
    # cols_name = []
    # for name, col in df.iteritems():
    #     cols_name.append(name)
    # df['SUM'] = 0
    # for i in xrange(len(cols_name)):
    #     df['SUM'] += df[cols_name[i]]

    # Dorm share:
    # df['PER_DORM'] = df['dorm']/df['SUM'] if 'dorm' in df else 0
    # (only compute the dorm share when a dorm value exists; otherwise 0)

    # Pull the date index out of the dataframe as the axis labels
    axisLabels = map(lambda x: x.strftime('%Y-%m-%d'), df.index.tolist())
    seriesData = []
    legendLabels = []
    for colName, col in df.iteritems():
        legendLabels.append(colName)
        data = map(lambda x: 0.0 if isnan(x) else float(x), col.tolist())
        seriesData.append({'name': colName, 'data': data})
    json_dateTrend = {'axisLabels': axisLabels,
                      'legendLabels': legendLabels,
                      'seriesData': seriesData}
    return json_dateTrend
def twitter_daily_aggregate(retrievaldate):
    # Date Retrieval
    d = []
    dt = parser.parse(retrievaldate) + timedelta(days=-1)
    d.append(dt)
    d.append(d[-1] + timedelta(days=1))

    # DataFrame Init
    ctrend = DataFrame()

    while d[-1] < datetime.utcnow():
        print 'processing ', d[-1], ' ..........'

        # Daily Mention Count
        mnts = twitter_count(d, mentions)

        # User Follower Count
        usrs = twitter_follower(d, users)

        # Join
        trend = mnts.join(usrs)
        trend['Date'] = Period(d[-1], 'D')

        # Append to DataFrame
        ctrend = concat([ctrend, trend])

        # Extend Dates
        d.append(d[-1] + timedelta(days=1))

    # Join DataFrames and Fill NAs
    ctrend = ctrend.fillna(0)

    # Save
    print 'printing the file'
    ctrend.to_csv('twitter_trend.csv')
    return ctrend
def pandas_fillna(df: pd.DataFrame,
                  value: float = None,
                  method: str = None,
                  limit: int = None,
                  **kwargs) -> pd.DataFrame:
    """
    Return a new dataframe with NaN values filled according to the given value
    or method.

    This is a wrapper for the ``pandas.fillna()`` function. For additional
    keyword arguments and information refer to the pandas documentation at
    http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html

    :param df: The dataframe to fill
    :param value: Value to fill
    :param method: Method according to which to fill NaN. ffill/pad will
           propagate the last valid observation forward to the next valid
           observation. backfill/bfill will propagate the next valid
           observation back to the last valid observation.
    :param limit: Maximum number of NaN values to forward/backward fill.
    :return: A dataframe with NaN values filled with the given value or
             according to the given method.
    """
    # Only forward keywords that were actually set: pandas treats any kw
    # present in kwargs as given, even if it is just None. Test against None
    # explicitly so legitimate falsy values such as 0 are still passed on.
    kwargs = dict(kwargs)
    if value is not None:
        kwargs.update(value=value)
    if method is not None:
        kwargs.update(method=method)
    if limit is not None:
        kwargs.update(limit=limit)

    return df.fillna(**kwargs)
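# A minimal usage sketch of the wrapper above: only the keywords that were
# actually supplied reach DataFrame.fillna, so the others keep their pandas
# defaults.
frame = pd.DataFrame({"a": [1.0, None, None, 4.0]})
print(pandas_fillna(frame, method="ffill", limit=1))  # forward-fill at most one NaN per gap
print(pandas_fillna(frame, value=0.0))                # plain constant fill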
def main():
    """ Handling of not applicable values """
    string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
    print string_data
    print string_data.isnull()
    string_data[0] = None
    print string_data.isnull()
    print None is np.nan, None == np.nan  # not the same

    # Exclude N/A
    print '', ''
    NA = np.nan
    data = Series([1, NA, 3.5, NA, 7])
    print data.dropna()
    print data[data.notnull()]

    data = DataFrame([
        [1., 6.5, 3.],
        [1., NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.]
    ])
    cleaned = data.dropna()  # keeps only rows with no NA at all
    print data
    print cleaned
    print data.dropna(how='all')
    data[4] = None
    print data.dropna(axis=1, how='all')
    print data.dropna(thresh=2)  # keep rows with at least 2 non-NA values

    # Fill NA
    print '', ''
    print data.fillna(0)
    print data.fillna({1: 0.5, 2: -1})
    _ = data.fillna(0, inplace=True)
    print data

    print '', ''
    df = DataFrame(np.arange(18).reshape((6, 3)))
    df.ix[2:, 1] = NA
    df.ix[4:, 2] = NA
    print df
    print df.fillna(method='ffill')
    print df.fillna(method='ffill', limit=2)
    data = Series([1., NA, 3.5, NA, 7])
    print data.fillna(data.mean())
def parallel_cumulative_blame(self, branch='master', limit=None, skip=None,
                              num_datapoints=None, committer=True,
                              workers=1, ignore_globs=None, include_globs=None):
    """
    Returns the blame at every revision of interest. Index is a datetime,
    column per committer, with number of lines blamed to each committer at
    each timestamp as data.

    :param branch: (optional, default 'master') the branch to work in
    :param limit: (optional, default None), the maximum number of revisions
           to return, None for no limit
    :param skip: (optional, default None), the number of revisions to skip.
           Ex: skip=2 returns every other revision, None for no skipping.
    :param num_datapoints: (optional, default=None) if limit and skip are
           None, and this isn't, then num_datapoints evenly spaced revs will
           be used
    :param committer: (optional, default=True) true if committer should be
           reported, false if author
    :param ignore_globs: (optional, default=None) a list of globs to ignore,
           default None excludes nothing
    :param include_globs: (optional, default=None) a list of globs to
           include, default of None includes everything.
    :param workers: (optional, default=1) integer, the number of workers to
           use in the threadpool, -1 for one per core.
    :return: DataFrame
    """
    if not _has_joblib:
        raise ImportError('''Must have joblib installed to use
parallel_cumulative_blame(), please use cumulative_blame() instead.''')

    revs = self.revs(branch=branch, limit=limit, skip=skip,
                     num_datapoints=num_datapoints)

    if self.verbose:
        print('Beginning processing for cumulative blame:')

    revisions = json.loads(revs.to_json(orient='index'))
    revisions = [revisions[key] for key in revisions]

    ds = Parallel(n_jobs=workers, backend='threading', verbose=5)(
        delayed(_parallel_cumulative_blame_func)
        (self, x, committer, ignore_globs, include_globs) for x in revisions
    )

    revs = DataFrame(ds)
    del revs['rev']

    revs['date'] = to_datetime(revs['date'].map(datetime.datetime.fromtimestamp))
    revs.set_index(keys=['date'], drop=True, inplace=True)
    revs = revs.fillna(0.0)

    # drop 0 cols
    for col in revs.columns.values:
        if col != 'col':
            if revs[col].sum() == 0:
                del revs[col]

    # drop 0 rows
    keep_idx = []
    committers = [x for x in revs.columns.values if x != 'date']
    for idx, row in revs.iterrows():
        if sum([row[x] for x in committers]) > 0:
            keep_idx.append(idx)

    revs = revs.ix[keep_idx]
    revs.sort_index(ascending=False, inplace=True)

    return revs
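# Hypothetical usage sketch: assuming the method above lives on a git-pandas
# style Repository wrapping a local clone, workers=4 fans the revisions out
# over the joblib thread pool (the path below is made up for illustration).
#
#     repo = Repository(working_dir='/path/to/some/repo')
#     blame = repo.parallel_cumulative_blame(branch='master',
#                                            num_datapoints=50, workers=4)
#     blame.plot()  # one line per committer over time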
def prepare_pop_data(population_data: pd.DataFrame, cols=None) -> pd.DataFrame:
    pop_data = population_data.fillna(value=0)
    if not cols:
        cols = ['plot.number', 'total.men', 'total.women', 'orthodox',
                'other.christian', 'other.religion']
    pop_data.loc[:, cols] = pop_data.loc[:, cols].astype(int)
    pop_data['lutheran'] = pop_data['total.men'] + pop_data['total.women'] \
        - pop_data['orthodox'] - pop_data['other.christian'] \
        - pop_data['other.religion']
    return pop_data
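# A toy usage sketch for prepare_pop_data, assuming the default column names;
# the NaN men count becomes 0, the counts are cast to int, and the derived
# 'lutheran' column is total.men + total.women minus the other groups.
raw = pd.DataFrame({'plot.number': [1, 2], 'total.men': [10.0, None],
                    'total.women': [12.0, 8.0], 'orthodox': [1.0, 0.0],
                    'other.christian': [0.0, 1.0], 'other.religion': [0.0, 0.0]})
print(prepare_pop_data(raw))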
def test_fill_value_when_combine_const(self):
    # GH12723
    dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
    df = DataFrame({'foo': dat}, index=range(6))

    exp = df.fillna(0).add(2)
    res = df.add(2, fill_value=0)
    assert_frame_equal(res, exp)
def test_fillna_nat(self):
    series = Series([0, 1, 2, NaT], dtype="M8[us]")

    filled = series.fillna(method="pad")
    filled2 = series.fillna(value=series[2])

    expected = series.copy()
    expected[3] = expected[2]

    assert_series_equal(filled, expected)
    assert_series_equal(filled2, expected)

    df = DataFrame({"A": series})
    filled = df.fillna(method="pad")
    filled2 = df.fillna(value=series[2])
    expected = DataFrame({"A": expected})
    assert_frame_equal(filled, expected)
    assert_frame_equal(filled2, expected)
def calculate_ground_truth(self):
    gt = DataFrame(copy.deepcopy(self.events_consuming))
    gt = gt.fillna(-1)
    for x in gt.columns:
        l = len(gt[x])
        for y in range(1, l):
            if gt[x][y] == -1:
                gt[x][y] = gt[x][y - 1]
    self.appliances_status = gt
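# The element-wise loop above amounts to a forward fill: each missing event
# inherits the previous row's status, and leading gaps stay -1. Assuming -1
# never occurs as a real event value, a vectorized equivalent would be:
#
#     gt = DataFrame(copy.deepcopy(self.events_consuming)).ffill().fillna(-1)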
def fillna_dict(cls, prop):
    """Use trade history, then fill empty cells with the value of the row above."""
    df = DataFrame(prop)
    df = df.replace(['', 'DEBIT', 'CREDIT'], numpy.nan)
    df = df.fillna(method='ffill')

    return [r.to_dict() for k, r in df.iterrows()]
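# A small usage sketch (the method is a classmethod on its owning class,
# called SomeImporter here purely for illustration): blanks and DEBIT/CREDIT
# markers become NaN, then the forward fill copies the row above.
#
#     rows = {'symbol': ['AAPL', '', ''], 'side': ['buy', 'DEBIT', 'sell']}
#     SomeImporter.fillna_dict(rows)
#     # -> [{'symbol': 'AAPL', 'side': 'buy'},
#     #     {'symbol': 'AAPL', 'side': 'buy'},
#     #     {'symbol': 'AAPL', 'side': 'sell'}]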
def test_fillna_nat(self):
    series = Series([0, 1, 2, NaT], dtype='M8[ns]')

    filled = series.fillna(method='pad')
    filled2 = series.fillna(value=series.values[2])

    expected = series.copy()
    expected.values[3] = expected.values[2]

    assert_series_equal(filled, expected)
    assert_series_equal(filled2, expected)

    df = DataFrame({'A': series})
    filled = df.fillna(method='pad')
    filled2 = df.fillna(value=series.values[2])
    expected = DataFrame({'A': expected})
    assert_frame_equal(filled, expected)
    assert_frame_equal(filled2, expected)
def seriesData_fitTimeFrame(dfList, colName, ts):
    dataList = []
    # ts = TimeSeries(pd.DateRange(datetime(1998,1,1), datetime(2013, 3, 20)))
    for i in range(len(dfList)):
        tf = DataFrame(index=ts)
        df = dfList[i]
        tf[colName] = df[colName]
        if colName in ('TradeID', 'Symbol', 'Year', 'Buy', 'Sell',
                       'Strat', 'StartTrading', 'EndTrading'):
            tf = tf.fillna(method='bfill')
            tf = tf.fillna(-99)
        elif colName in ('Close', 'IsPeriod', 'PosSize', 'PosDir',
                         'DynamicPNL', 'DynamicDollarPNL'):
            tf = tf.fillna(method='ffill')
            tf = tf.fillna(0)
        elif colName in ('RealizedPNL', 'RealizedDollarPNL', 'DailyPNLChange'):
            tf = tf.fillna(0)
        dataList.append(np.asarray(tf[colName]))
    return dataList
def test_replace_datetimetz(self):
    # GH 11326
    # behaving poorly when presented with a datetime64[ns, tz]
    df = DataFrame({'A': date_range('20130101', periods=3, tz='US/Eastern'),
                    'B': [0, np.nan, 2]})
    result = df.replace(np.nan, 1)
    expected = DataFrame({'A': date_range('20130101', periods=3,
                                          tz='US/Eastern'),
                          'B': Series([0, 1, 2], dtype='float64')})
    assert_frame_equal(result, expected)

    result = df.fillna(1)
    assert_frame_equal(result, expected)

    result = df.replace(0, np.nan)
    expected = DataFrame({'A': date_range('20130101', periods=3,
                                          tz='US/Eastern'),
                          'B': [np.nan, np.nan, 2]})
    assert_frame_equal(result, expected)

    result = df.replace(Timestamp('20130102', tz='US/Eastern'),
                        Timestamp('20130104', tz='US/Eastern'))
    expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                Timestamp('20130104', tz='US/Eastern'),
                                Timestamp('20130103', tz='US/Eastern')],
                          'B': [0, np.nan, 2]})
    assert_frame_equal(result, expected)

    result = df.copy()
    result.iloc[1, 0] = np.nan
    result = result.replace({'A': pd.NaT},
                            Timestamp('20130104', tz='US/Eastern'))
    assert_frame_equal(result, expected)

    # coerce to object
    result = df.copy()
    result.iloc[1, 0] = np.nan
    result = result.replace({'A': pd.NaT},
                            Timestamp('20130104', tz='US/Pacific'))
    expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                Timestamp('20130104', tz='US/Pacific'),
                                Timestamp('20130103', tz='US/Eastern')],
                          'B': [0, np.nan, 2]})
    assert_frame_equal(result, expected)

    result = df.copy()
    result.iloc[1, 0] = np.nan
    result = result.replace({'A': np.nan}, Timestamp('20130104'))
    expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                Timestamp('20130104'),
                                Timestamp('20130103', tz='US/Eastern')],
                          'B': [0, np.nan, 2]})
    assert_frame_equal(result, expected)
def test_na_actions_categorical(self):
    cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
    vals = ["a", "b", np.nan, "d"]
    df = DataFrame({"cats": cat, "vals": vals})

    cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
    vals2 = ["a", "b", "b", "d"]
    df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})

    cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
    vals3 = ["a", "b", np.nan]
    df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})

    cat4 = Categorical([1, 2], categories=[1, 2, 3])
    vals4 = ["a", "b"]
    df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})

    # fillna
    res = df.fillna(value={"cats": 3, "vals": "b"})
    tm.assert_frame_equal(res, df_exp_fill)

    with pytest.raises(ValueError, match=("fill value must "
                                          "be in categories")):
        df.fillna(value={"cats": 4, "vals": "c"})

    res = df.fillna(method='pad')
    tm.assert_frame_equal(res, df_exp_fill)

    # dropna
    res = df.dropna(subset=["cats"])
    tm.assert_frame_equal(res, df_exp_drop_cats)

    res = df.dropna()
    tm.assert_frame_equal(res, df_exp_drop_all)

    # make sure that fillna takes missing values into account
    c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
    df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})

    cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
    df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})

    res = df.fillna("a")
    tm.assert_frame_equal(res, df_exp)
def _create(self, data):
    dct = self.get_prep_data(data)
    data = get_single_column(data)
    docs = [dct.doc2bow(d) for d in data]
    ids = dct.keys()
    df = DataFrame([dict(row) for row in docs], columns=ids,
                   index=data.index)
    df.columns = ["%s_%s" % (dct[i], data.name) for i in ids]
    df = df.fillna(0)
    if self.bool_:
        df = df.astype(bool).astype(int)
    return df
def _apply(self, data, fitted_feature):
    docs = list(data)
    dct = self.dictionary.get_dict(self.context, docs)
    tfidf = self.dictionary.get_tfidf(self.context, docs)
    docs = [dct.doc2bow(d) for d in docs]
    vecs = tfidf[docs]
    df = DataFrame([dict(row) for row in vecs], index=data.index)
    df.columns = ['%s_%s' % (dct[i], data.name) for i in df.columns]
    df = df.fillna(0)
    logging.debug(df)
    return df
def _create(self, data):
    docs = list(data)
    dct = self.dictionary.get_dict(self.context, docs)
    tfidf = self.dictionary.get_tfidf(self.context, docs)
    docs = [dct.doc2bow(d) for d in docs]
    vecs = tfidf[docs]
    df = DataFrame([dict(row) for row in vecs], index=data.index)
    df.columns = ['%s_%s' % (dct[i], data.name) for i in df.columns]
    df = df.fillna(0)
    print df
    return df
def merge_dataset(files):
    test = DataFrame(columns=['file'])
    for csvfile in files:
        if csvfile != '.DS_Store':
            dataframe = pd.read_csv(textdir + csvfile)
            test = pd.concat([test, dataframe], ignore_index=True)
            # label the rows that came from this csv with its numeric file id
            test = test.fillna(float(re.match(r'(.+)\.csv', csvfile).group(1)))
    test = test[['file', 'position', 'number_of_stopwords', 'length',
                 'highlight_marker', 'sum_of_word_weigth',
                 'scoreline_s1', 'scoreline_s2', 'scoreline_s3',
                 'specific_timestamp', 'similarity1', 'similarity2',
                 'f_score']]
    return test
obj.notnull()
pd.notnull(obj)
obj[obj.notnull()]
obj.dropna()

df = DataFrame([[1, 2, 3], [1, NA, NA], [NA, NA, NA], [NA, 2, 3]])
df.dropna()              # drop any row containing a NaN
df.dropna(how='all')     # drop only rows where every value is NaN
df[4] = NA
df.dropna(how='all', axis=1)  # drop only columns where every value is NaN
df.fillna(0)
df[0].fillna(0)
df.fillna({0: 0, 1: 1, 2: 2, 4: 4})
df.fillna(0, inplace=True)    # modify df in place
df

df = DataFrame([[1, 2, 5], [NA, NA, 4], [3, 2, NA], [2, NA, 3]])
df.fillna(method='ffill')
df.fillna(method='bfill')

# [Problem 139] Print the names and commissions of employees whose
# commission is null.
emp = pd.read_csv("c:/r/emp.csv",
                  names=["empid", "name", "job", "mgr", "hire_date",
                         "sal", "comm", "deptno"])
def timeseries_to_supervised(data, column_name):
    df = DataFrame(data)
    df['output'] = df[column_name]
    df['output'] = df['output'].shift(1)
    df.fillna(0, inplace=True)
    return df
def test_replace_datetimetz(self):
    # GH 11326
    # behaving poorly when presented with a datetime64[ns, tz]
    df = DataFrame(
        {
            "A": date_range("20130101", periods=3, tz="US/Eastern"),
            "B": [0, np.nan, 2],
        }
    )
    result = df.replace(np.nan, 1)
    expected = DataFrame(
        {
            "A": date_range("20130101", periods=3, tz="US/Eastern"),
            "B": Series([0, 1, 2], dtype="float64"),
        }
    )
    tm.assert_frame_equal(result, expected)

    result = df.fillna(1)
    tm.assert_frame_equal(result, expected)

    result = df.replace(0, np.nan)
    expected = DataFrame(
        {
            "A": date_range("20130101", periods=3, tz="US/Eastern"),
            "B": [np.nan, np.nan, 2],
        }
    )
    tm.assert_frame_equal(result, expected)

    result = df.replace(
        Timestamp("20130102", tz="US/Eastern"),
        Timestamp("20130104", tz="US/Eastern"),
    )
    expected = DataFrame(
        {
            "A": [
                Timestamp("20130101", tz="US/Eastern"),
                Timestamp("20130104", tz="US/Eastern"),
                Timestamp("20130103", tz="US/Eastern"),
            ],
            "B": [0, np.nan, 2],
        }
    )
    tm.assert_frame_equal(result, expected)

    result = df.copy()
    result.iloc[1, 0] = np.nan
    result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Eastern"))
    tm.assert_frame_equal(result, expected)

    # coerce to object
    result = df.copy()
    result.iloc[1, 0] = np.nan
    result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Pacific"))
    expected = DataFrame(
        {
            "A": [
                Timestamp("20130101", tz="US/Eastern"),
                Timestamp("20130104", tz="US/Pacific"),
                Timestamp("20130103", tz="US/Eastern"),
            ],
            "B": [0, np.nan, 2],
        }
    )
    tm.assert_frame_equal(result, expected)

    result = df.copy()
    result.iloc[1, 0] = np.nan
    result = result.replace({"A": np.nan}, Timestamp("20130104"))
    expected = DataFrame(
        {
            "A": [
                Timestamp("20130101", tz="US/Eastern"),
                Timestamp("20130104"),
                Timestamp("20130103", tz="US/Eastern"),
            ],
            "B": [0, np.nan, 2],
        }
    )
    tm.assert_frame_equal(result, expected)
def get_qq_nums(self, user, password, qq_group):
    try:
        # cf = ConfigParser.ConfigParser()
        # cf.read('conf.ini')
        # chromedriver = cf.get('main', 'path')
        # chromedriver = "/Users/Homosum/Downloads/chromedriver"
        # driver = webdriver.Chrome(self.chromedirverPath)
        driver = webdriver.Firefox()
        driver.get("http://qun.qq.com/member.html#gid={}".format(qq_group))
        IframeElement = driver.find_element_by_name("login_frame")
        driver.switch_to_frame(IframeElement)
        # login form
        driver.find_element_by_xpath("//*[@id='bottom_qlogin']/a[1]").click()
        driver.find_element_by_xpath("//*[@id='u']").send_keys(user)
        driver.find_element_by_xpath("//*[@id='p']").send_keys(password)
        # click the login button
        driver.find_element_by_xpath("//*[@id='login_button']").click()
        time.sleep(1.5)
        # switch back out of the iframe, otherwise "TypeError: can't access
        # dead object" occurs -- this step is important
        driver.switch_to_default_content()
        time.sleep(1.5)
        web_data = driver.page_source
        selector = etree.HTML(web_data)
        try:
            # number of members in the group
            people_num = selector.xpath("//*[@id='groupMemberNum']/text()")
            print('people_num:%s' % people_num)
            if len(people_num) == 0:
                people_nums = 0
            else:
                people_nums = int(people_num[0])
        except Exception as e:
            logger.warning("network problem")
            driver.close()
        count = 1
        logger.info('group member count: %d' % people_nums)
        for _ in range(int(people_nums / 20)):
            # scroll to the bottom to load more members
            js = "var q=document.documentElement.scrollTop=500000"
            # js = "var q=document.body.scrollTop=500000"
            driver.execute_script(js)
            time.sleep(2)
            count += 1
        web_data = driver.page_source  # re-fetch the page source
        selector = etree.HTML(web_data)
        # nicknames
        people_nicks = selector.xpath("//tbody[@class='list']/tr/td[3]/span/text()")
        people_nicks = get_freshList(people_nicks)
        # group cards (per-group display names)
        people_names = selector.xpath("//tbody[@class='list']/tr/td[4]/span/text()")
        people_names = get_freshList(people_names)
        # QQ numbers
        people_QQs = selector.xpath("//tbody[@class='list']/tr/td[5]/text()")
        people_QQs = get_freshList(people_QQs)
        # genders
        people_sexs = selector.xpath("//tbody[@class='list']/tr/td[6]/text()")
        people_sexs = get_freshList(people_sexs)
        # QQ ages
        people_ages = selector.xpath("//tbody[@class='list']/tr/td[7]/text()")
        people_ages = get_freshList(people_ages)
        # activity levels
        people_grades = selector.xpath("//tbody[@class='list']/tr/td[9]/text()")
        people_grades = get_freshList(people_grades)

        result_array = []
        # name_ = driver.find_element_by_xpath("//*[@id='groupTit']").text
        name_A = selector.xpath("//*[@id='groupTit']/text()")
        print('name_%s' % name_A)
        name_ = name_A[0]
        logger.info('user: %s, group: %s, members scraped: %d'
                    % (user, name_, len(people_QQs)))
        for countS in range(len(people_QQs)):
            member = QQ_Member()
            member.name = people_nicks[countS]
            member.sex = people_sexs[countS]
            member.qq_age = people_ages[countS]
            member.num = people_QQs[countS]
            member.source = name_
            dic = classToDict(member)
            result_array.append(dic)
        frame = DataFrame(result_array)
        # fillna returns a new frame; keep the filled result
        frame = frame.fillna('NA')
        filePath = my_web.qqSavePath
        path = ('%s/%s.csv' % (filePath, qq_group))
        frame.to_csv(path, encoding='utf-8')
        print('saved %s' % path)
        logger.info('saved %s' % path)
        driver.close()
    except Exception as e:
        driver.close()
# print(s1.isnull())   # is each value NaN?
# print(s1.notnull())  # is each value non-NaN?
# print(s1.dropna())   # drop NaN values

# NaN handling in a DataFrame
df1 = DataFrame([[1, 2, 3], [np.nan, 5, 6], [7, 8, np.nan],
                 [np.nan, np.nan, np.nan]])
print(df1)
# print(df1.isnull())
# print(df1.notnull())

df2 = df1.dropna(axis=0)  # drop rows containing NaN
# print(df2)
df3 = df1.dropna(axis=1)  # drop columns containing NaN
# print(df3)
df4 = df1.dropna(axis=0, how='any')  # how='any': drop if any value is NaN
# print(df4)
df5 = df1.dropna(axis=0, how='all')  # how='all': drop only if all values are NaN
# print(df5)

dframe = DataFrame([[1, 2, 3], [np.nan, 5, 6], [7, np.nan, np.nan],
                    [np.nan, np.nan, np.nan]])
# print(dframe)
df6 = dframe.dropna(axis=0, thresh=2)  # thresh: keep rows with at least 2 non-NaN values
# print(df6)
# print(dframe.fillna(value=1))  # fill NaN with a constant
print(dframe.fillna(value={
    0: 0,
    1: 1,
    2: 2,
    3: 3
}))  # per-column fill: column 0 gets 0, column 1 gets 1, and so on
def create_plots(df: pd.DataFrame, output_folder: PathLike,
                 max_combsize: int = 1, xpaxis: str = "budget") -> None:
    """Saves all representing plots to the provided folder

    Parameters
    ----------
    df: pd.DataFrame
        the experiment data
    output_folder: PathLike
        path of the folder where the plots should be saved
    max_combsize: int
        maximum number of parameters to fix (combinations) when creating
        experiment plots
    xpaxis: str
        x-axis for xp plots (either budget or pseudotime)
    """
    assert xpaxis in ["budget", "pseudotime"]
    df = remove_errors(df)
    df.loc[:, "loss"] = pd.to_numeric(df.loc[:, "loss"])
    df = tools.Selector(df.fillna("N-A"))  # remove NaN in non score values
    assert not any("Unnamed: " in x for x in df.columns), \
        f"Remove the unnamed index column: {df.columns}"
    assert "error " not in df.columns, "Remove error rows before plotting"
    required = {"optimizer_name", "budget", "loss", "elapsed_time",
                "elapsed_budget"}
    missing = required - set(df.columns)
    assert not missing, f"Missing fields: {missing}"
    output_folder = Path(output_folder)
    os.makedirs(output_folder, exist_ok=True)
    # check which descriptors do vary
    descriptors = sorted(set(df.columns) - (required | {"seed", "pseudotime"}))  # all other columns are descriptors
    to_drop = [x for x in descriptors if len(df.unique(x)) == 1]
    df = tools.Selector(df.loc[:, [x for x in df.columns if x not in to_drop]])
    descriptors = sorted(set(df.columns) - (required | {"seed", "pseudotime"}))  # now those should be actual interesting descriptors
    print(f"Descriptors: {descriptors}")
    print("# Fight plots")
    #
    # fight plot
    # choice of the combination variables to fix
    fight_descriptors = descriptors + ["budget"]  # budget can be used as a descriptor for fight plots
    combinable = [x for x in fight_descriptors if len(df.unique(x)) > 1]  # should be all now
    num_rows = 6
    for fixed in list(itertools.chain.from_iterable(
            itertools.combinations(combinable, order)
            for order in range(max_combsize + 1))):
        # choice of the cases with values for the fixed variables
        for case in df.unique(fixed):
            print("\n# new case #", fixed, case)
            casedf = df.select(**dict(zip(fixed, case)))
            data_df = FightPlotter.winrates_from_selection(
                casedf, fight_descriptors, num_rows=num_rows)
            fplotter = FightPlotter(data_df)
            # save
            name = "fight_" + ",".join("{}{}".format(x, y)
                                       for x, y in zip(fixed, case)) + ".png"
            name = "fight_all.png" if name == "fight_.png" else name
            fplotter.save(str(output_folder / name), dpi=_DPI)
    plt.close("all")
    #
    # xp plots
    # plot mean loss / budget for each optimizer for 1 context
    print("# Xp plots")
    name_style = NameStyle()  # keep the same style for each algorithm
    cases = df.unique(descriptors)
    for case in cases:
        subdf = df.select_and_drop(**dict(zip(descriptors, case)))
        description = ",".join("{}:{}".format(x, y)
                               for x, y in zip(descriptors, case))
        out_filepath = output_folder / "xpresults{}{}.png".format(
            "_" if description else "", description.replace(":", ""))
        data = XpPlotter.make_data(subdf)
        xpplotter = XpPlotter(data, title=description,
                              name_style=name_style, xaxis=xpaxis)
        xpplotter.save(out_filepath)
    plt.close("all")
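# Hypothetical driver for create_plots above: load an experiment CSV and
# write every fight/xp plot next to it ("exp.csv" and the folder name are
# made up for illustration).
#
#     df = pd.read_csv("exp.csv")
#     create_plots(df, output_folder="exp_plots", max_combsize=1, xpaxis="budget")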
    temp = df['Close'].astype('float64').pct_change().fillna(0.)
    return temp


secIDs = ['000300.ZICN', '000905.ZICN', '399006.ZICN', 'SPX.ZIUS',
          '000012.ZICN', '000013.ZICN']
rtn_table = DataFrame()
for secID in secIDs:
    cp = get_return(secID)
    cp.name = secID
    rtn_table = pd.concat([rtn_table, cp], axis=1)
rtn_table.fillna(0, inplace=True)
# rtn_table.head(5)
# rtn_table.mean()*250
# rtn_table.corr()
print(rtn_table.mean() * 250)
print(rtn_table.corr())
print("*************************************************")

from cvxopt import matrix, solvers

portfolio1 = [0, 1, 2, 4, 5]
portfolio2 = range(6)
def create_head_traces(dict):
    heads = DataFrame(dict, columns=get_names())
    heads = heads.fillna(0)
    return heads
def create_plots(
    df: pd.DataFrame,
    output_folder: tp.PathLike,
    max_combsize: int = 1,
    xpaxis: str = "budget",
    competencemaps: bool = False,
) -> None:
    """Saves all representing plots to the provided folder

    Parameters
    ----------
    df: pd.DataFrame
        the experiment data
    output_folder: PathLike
        path of the folder where the plots should be saved
    max_combsize: int
        maximum number of parameters to fix (combinations) when creating
        experiment plots
    xpaxis: str
        x-axis for xp plots (either budget or pseudotime)
    """
    assert xpaxis in ["budget", "pseudotime"]
    df = remove_errors(df)
    df.loc[:, "loss"] = pd.to_numeric(df.loc[:, "loss"])
    # If we have a descriptor "instrum_str",
    # we assume that it describes the instrumentation as a string,
    # that we should include the various instrumentations as distinct curves in the same plot.
    # So we concat it at the end of the optimizer name, and we remove "parametrization"
    # from the descriptor.
    if "instrum_str" in set(df.columns):
        df.loc[:, "optimizer_name"] = df.loc[:, "optimizer_name"] + df.loc[:, "instrum_str"]
        df = df.drop(columns="instrum_str")
        df = df.drop(columns="dimension")
        if "parametrization" in set(df.columns):
            df = df.drop(columns="parametrization")
    df = utils.Selector(df.fillna("N-A"))  # remove NaN in non score values
    assert not any("Unnamed: " in x for x in df.columns), \
        f"Remove the unnamed index column: {df.columns}"
    assert "error " not in df.columns, "Remove error rows before plotting"
    required = {"optimizer_name", "budget", "loss", "elapsed_time",
                "elapsed_budget"}
    missing = required - set(df.columns)
    assert not missing, f"Missing fields: {missing}"
    output_folder = Path(output_folder)
    os.makedirs(output_folder, exist_ok=True)
    # check which descriptors do vary
    descriptors = sorted(set(df.columns) - (required | {"instrum_str", "seed", "pseudotime"}))  # all other columns are descriptors
    to_drop = [x for x in descriptors if len(df.unique(x)) == 1]
    df = utils.Selector(df.loc[:, [x for x in df.columns if x not in to_drop]])
    # now those should be actual interesting descriptors
    all_descriptors = sorted(set(df.columns) - (required | {"instrum_str", "seed", "pseudotime"}))
    print(f"Descriptors: {all_descriptors}")
    print("# Fight plots")
    #
    # fight plot
    # choice of the combination variables to fix
    fight_descriptors = all_descriptors + ["budget"]  # budget can be used as a descriptor for fight plots
    combinable = [x for x in fight_descriptors if len(df.unique(x)) > 1]  # should be all now
    # We remove descriptors which have only one value for each budget.
    descriptors = []
    for d in all_descriptors:
        acceptable = False
        for b in df.budget.unique():
            if len(df.loc[df["budget"] == b][d].unique()) > 1:
                acceptable = True
                break
        if acceptable:
            descriptors += [d]
    num_rows = 6
    # For the competence map case we must consider pairs of attributes, hence maxcomb_size >= 2.
    # A competence map shows for each value of each of two attributes which algorithm was best.
    if competencemaps:
        max_combsize = max(max_combsize, 2)
    for fixed in list(
        itertools.chain.from_iterable(
            itertools.combinations(combinable, order)
            for order in range(max_combsize + 1)
        )
    ):
        orders = [len(c) for c in df.unique(fixed)]
        if orders:
            assert min(orders) == max(orders)
            order = min(orders)
        else:
            order = 0
        best_algo: tp.List[tp.List[str]] = []
        if competencemaps and order == 2:  # With order 2 we can create a competence map.
            print("\n#trying to competence-map")
            # Let us try if data are adapted to competence maps.
            # This is not always the case, as some attribute1/value1 + attribute2/value2 might be empty
            # (typically when attribute1 and attribute2 are correlated).
            if all([len(c) > 1 for c in df.unique(fixed)]):
                try:
                    xindices = sorted(set(c[0] for c in df.unique(fixed)))
                except TypeError:
                    xindices = list(set(c[0] for c in df.unique(fixed)))
                try:
                    yindices = sorted(set(c[1] for c in df.unique(fixed)))
                except TypeError:
                    yindices = list(set(c[1] for c in df.unique(fixed)))
                for _ in range(len(xindices)):
                    best_algo += [[]]
                for i in range(len(xindices)):
                    for _ in range(len(yindices)):
                        best_algo[i] += ["none"]

        # Let us loop over all combinations of variables.
        for case in df.unique(fixed) if fixed else [()]:
            print("\n# new case #", fixed, case)
            casedf = df.select(**dict(zip(fixed, case)))
            data_df = FightPlotter.winrates_from_selection(
                casedf, fight_descriptors, num_rows=num_rows)
            fplotter = FightPlotter(data_df)
            # Competence maps: we find out the best algorithm for each attribute1=valuei/attribute2=valuej.
            if order == 2 and competencemaps and best_algo:
                print("\n#storing data for competence-map")
                best_algo[xindices.index(case[0])][yindices.index(case[1])] = \
                    fplotter.winrates.index[0]
            # save
            name = "fight_" + ",".join("{}{}".format(x, y)
                                       for x, y in zip(fixed, case)) + ".png"
            name = "fight_all.png" if name == "fight_.png" else name
            if name == "fight_all.png":
                with open(str(output_folder / name) + ".cp.txt", "w") as f:
                    f.write("ranking:\n")
                    for i, algo in enumerate(data_df.columns[:8]):
                        f.write(f"  algo {i}: {algo}\n")
            if len(name) > 240:
                hashcode = hashlib.md5(bytes(name, "utf8")).hexdigest()
                name = re.sub(r"\([^()]*\)", "", name)
                mid = 120
                name = name[:mid] + hashcode + name[-mid:]
            fplotter.save(str(output_folder / name), dpi=_DPI)

        if order == 2 and competencemaps and best_algo:  # With order 2 we can create a competence map.
            print("\n# Competence map")
            name = "competencemap_" + ",".join("{}".format(x) for x in fixed) + ".tex"
            export_table(str(output_folder / name), xindices, yindices, best_algo)
            print("Competence map data:", fixed, case, best_algo)

    plt.close("all")
    #
    # xp plots: for each experimental setup, we plot curves with budget in x-axis.
    # plot mean loss / budget for each optimizer for 1 context
    print("# Xp plots")
    name_style = NameStyle()  # keep the same style for each algorithm
    cases = df.unique(descriptors)
    if not cases:
        cases = [()]
    # Average normalized plot with everything.
    out_filepath = output_folder / "xpresults_all.png"
    data = XpPlotter.make_data(df, normalized_loss=True)
    xpplotter = XpPlotter(data, title=os.path.basename(output_folder),
                          name_style=name_style, xaxis=xpaxis)
    xpplotter.save(out_filepath)
    # Now one xp plot per case.
    for case in cases:
        subdf = df.select_and_drop(**dict(zip(descriptors, case)))
        description = ",".join("{}:{}".format(x, y)
                               for x, y in zip(descriptors, case))
        if len(description) > 280:
            hash_ = hashlib.md5(bytes(description, "utf8")).hexdigest()
            description = description[:140] + hash_ + description[-140:]
        out_filepath = output_folder / "xpresults{}{}.png".format(
            "_" if description else "", description.replace(":", ""))
        data = XpPlotter.make_data(subdf)
        try:
            xpplotter = XpPlotter(data, title=description,
                                  name_style=name_style, xaxis=xpaxis)
        except Exception as e:  # pylint: disable=broad-except
            warnings.warn(f"Bypassing error in xpplotter:\n{e}", RuntimeWarning)
        else:
            xpplotter.save(out_filepath)
    plt.close("all")
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from numpy.random import randn

df1 = pd.read_clipboard()  # read a table from the clipboard
df1.sum()                  # column sums
df1.sum(axis=1)            # row sums
df1.max()                  # max of each column
df1.idxmax()               # index of the max in each column
df1.cumsum()               # running sum down each column:
                           # row2 = r1 + r2; row3 = r1 + r2 + r3
df1.describe()             # summary statistics of the table

nd = np.nan  # an alias for NaN
A = [1, 2, 3]
B = [4, nd, 6]
C = [nd, 8, nd]
D = [nd, nd, nd]
df2 = DataFrame([A, B, C, D])
df2.dropna()              # drop any row containing a NaN cell
df2.dropna(how='all')     # drop rows where every cell is NaN
df2.dropna(thresh=1)      # keep rows with at least 1 non-NaN value
df2.fillna(100)           # replace NaN cells with 100
df2.fillna(24, inplace=True)  # replace NaN cells with 24, modifying df2 in place
df2
            df1.ix[row, column_name] = 1
            # add the first 3 digits of the phone number as a feature
            column_name = 'head' + '_' + head
            df1.ix[row, column_name] = 1
            # add the last digit of the phone number as a feature
            column_name = 'tail' + '_' + tail
            df1.ix[row, column_name] = 1
            if subroots != "\N":
                keyword_list = subroots.split(unicode(';', 'utf-8'))
                for element in keyword_list:
                    column_name = element
                    df2.ix[row, column_name] = 1
    fin.close()
    df1.fillna(0, inplace=True)  # default to 0 instead of NaN
    df2.fillna(0, inplace=True)  # default to 0 instead of NaN

    columns_path = os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                "column_names1.txt")
    columns_fout = open(columns_path, 'w')
    for column_name in df1.columns:  # write the column names to a file
        columns_fout.write(column_name + '\n')
    columns_fout.close()

    A1_path = os.path.join(os.path.split(os.path.realpath(__file__))[0], "A1.txt")
    df1.to_csv(A1_path, index=False)  # write the table to a file
    # print df1.columns
    print df1.shape  # print the table's (rows, columns) shape

    columns_path = os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                "column_names2.txt")
df = DataFrame([[1, 2, 3], [1, NA, NA], [NA, NA, NA], [NA, 2, 3]])
df
df.isnull()
pd.isnull(df)
df.dropna()             # drop a row if it contains any NaN
df.dropna(how='all')    # drop only rows that are entirely NaN
df[4] = NA              # add a new column, filled with NaN
df
df.dropna(how='all', axis=1)  # drop only columns that are entirely NaN
df.fillna(0)            # replace NaN with 0
df[0].fillna(0)         # column 0 only
df.fillna({0: 0, 1: 1, 2: 2, 4: 4})  # {column: value}, a different fill value per column
df.fillna(0, inplace=True)  # inplace=True applies the change directly
df

df = DataFrame([[1, 2, 5], [NA, NA, 4], [3, 2, NA], [2, NA, 3]])
df
df.fillna(method="ffill")  # fill NaN with the preceding value
df.ffill()
df.fillna(method="bfill")  # fill NaN with the following value
def ReportGenerator(df: pd.DataFrame, ClusteringVariables: np.array,
                    FillMissingReport=None) -> pd.DataFrame:
    """
    Function generates an easy-reading clustering report. It takes 2 arguments
    as an input:
        DataFrame - dataframe with a predicted cluster column;
        FillMissingReport - dictionary of rules on how to fill missing values
            for the final report (not included in modeling);
    in order to run the function the following libraries must be imported:
        import pandas as pd
        import numpy as np

    >>> data = pd.DataFrame()
    >>> data['numbers'] = [1, 2, 3]
    >>> data['col1'] = [0.5, 2.5, 4.5]
    >>> data['col2'] = [100, 200, 300]
    >>> data['col3'] = [10, 20, 30]
    >>> data['Cluster'] = [1, 1, 2]
    >>> ReportGenerator(data, ['col1', 'col2'], 0)
               Features               Type   Mark           1           2
    0    # of Customers        ClusterSize  False    2.000000    1.000000
    1    % of Customers  ClusterProportion  False    0.666667    0.333333
    2              col1    mean_with_zeros   True    1.500000    4.500000
    3              col2    mean_with_zeros   True  150.000000  300.000000
    4           numbers    mean_with_zeros  False    1.500000    3.000000
    ..              ...                ...    ...         ...         ...
    99            dummy                 5%  False    1.000000    1.000000
    100           dummy                95%  False    1.000000    1.000000
    101           dummy              stdev  False    0.000000         NaN
    102           dummy               mode  False    1.000000    1.000000
    103           dummy             median  False    1.000000    1.000000
    <BLANKLINE>
    [104 rows x 5 columns]
    """
    # Fill missing values with given rules
    if FillMissingReport:
        df.fillna(value=FillMissingReport, inplace=True)
    df["dummy"] = 1
    numeric_cols = df.select_dtypes(np.number).columns
    report = (
        df.groupby(["Cluster"])[numeric_cols]  # construct report dataframe, grouped by cluster number
        .agg(
            [
                ("sum", np.sum),
                ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))),
                ("mean_without_zeros", lambda x: x.replace(0, np.NaN).mean()),
                (
                    "mean_25-75",
                    lambda x: np.mean(
                        np.nan_to_num(
                            sorted(x)[round(len(x) * 25 / 100):round(len(x) * 75 / 100)]
                        )
                    ),
                ),
                ("mean_with_na", np.mean),
                ("min", lambda x: x.min()),
                ("5%", lambda x: x.quantile(0.05)),
                ("25%", lambda x: x.quantile(0.25)),
                ("50%", lambda x: x.quantile(0.50)),
                ("75%", lambda x: x.quantile(0.75)),
                ("95%", lambda x: x.quantile(0.95)),
                ("max", lambda x: x.max()),
                ("count", lambda x: x.count()),
                ("stdev", lambda x: x.std()),
                ("mode", lambda x: x.mode()[0]),
                ("median", lambda x: x.median()),
                ("# > 0", lambda x: (x > 0).sum()),
            ]
        )
        .T.reset_index()
        .rename(index=str, columns={"level_0": "Features", "level_1": "Type"})
    )  # rename columns

    # calculate the size of each cluster (count of client IDs)
    # (.copy() avoids a SettingWithCopyWarning)
    clustersize = report[(report["Features"] == "dummy")
                         & (report["Type"] == "count")].copy()
    # rename the created cluster df to match report column names
    clustersize.Type = "ClusterSize"
    clustersize.Features = "# of Customers"
    # calculate the proportion of each cluster
    clusterproportion = pd.DataFrame(
        clustersize.iloc[:, 2:].values / clustersize.iloc[:, 2:].values.sum()
    )
    # rename the created cluster df to match report column names
    clusterproportion["Type"] = "% of Customers"
    clusterproportion["Features"] = "ClusterProportion"
    cols = clusterproportion.columns.tolist()
    cols = cols[-2:] + cols[:-2]
    clusterproportion = clusterproportion[cols]  # rearrange columns to match report
    clusterproportion.columns = report.columns
    # generate a df with the count of nan values
    a = pd.DataFrame(
        abs(report[report["Type"] == "count"].iloc[:, 2:].values
            - clustersize.iloc[:, 2:].values)
    )
    a["Features"] = 0
    a["Type"] = "# of nan"
    # fill in values to match the report
    a.Features = report[report["Type"] == "count"].Features.tolist()
    cols = a.columns.tolist()
    cols = cols[-2:] + cols[:-2]
    a = a[cols]  # rearrange columns to match report
    a.columns = report.columns  # rename columns to match report
    # drop count values except cluster size
    report = report.drop(report[report.Type == "count"].index)
    # concat report with cluster size and nan values
    report = pd.concat([report, a, clustersize, clusterproportion], axis=0)
    report["Mark"] = report["Features"].isin(ClusteringVariables)
    cols = report.columns.tolist()
    cols = cols[0:2] + cols[-1:] + cols[2:-1]
    report = report[cols]
    sorter1 = {
        "ClusterSize": 9,
        "ClusterProportion": 8,
        "mean_with_zeros": 7,
        "mean_with_na": 6,
        "max": 5,
        "50%": 4,
        "min": 3,
        "25%": 2,
        "75%": 1,
        "# of nan": 0,
        "# > 0": -1,
        "sum_with_na": -2,
    }
    report = (
        report.assign(
            Sorter1=lambda x: x.Type.map(sorter1),
            Sorter2=lambda x: list(reversed(range(len(x)))),
        )
        .sort_values(["Sorter1", "Mark", "Sorter2"], ascending=False)
        .drop(["Sorter1", "Sorter2"], axis=1)
    )
    report.columns.name = ""
    report = report.reset_index()
    report.drop(columns=["index"], inplace=True)
    return report
def test_fillna_integer_limit(self, type):
    df = DataFrame(np.random.randn(10, 4)).astype(type)

    msg = "Limit must be an integer"
    with pytest.raises(ValueError, match=msg):
        df.fillna(0, limit=0.5)
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# reading data
kingcnt = pd.read_csv("data/King_County_House_prices_dataset.csv")
df_kingcnt = DataFrame(kingcnt,
                       columns=['id', 'date', 'price', 'bedrooms', 'bathrooms',
                                'sqft_living', 'sqft_lot', 'floors',
                                'waterfront', 'view', 'condition', 'grade',
                                'sqft_above', 'sqft_basement', 'yr_built',
                                'yr_renovated', 'zipcode', 'lat', 'long',
                                'sqft_living15', 'sqft_lot15'])

# dealing with missing values
df_kingcnt.fillna({'waterfront': 0, 'view': 0}, inplace=True)

# splitting data
print("----- Splitting the data in train and test ----")
train, test = train_test_split(df_kingcnt, test_size=0.33, random_state=42)

# cleaning training data
train = train[train.bedrooms != 33]
train = train[train.sqft_living < 12000]
train = train[train.sqft_lot < 1100000]
train = train[train.sqft_above < 9000]
train = train[train.sqft_lot15 < 500000]
print(df.isnull())
print(df.notnull())

# (2) Handling missing data
# Strategies: fill in the data, drop the affected rows, or leave it as is

# 1. Dropping rows: dropna
newDf = df.dropna()  # drop rows containing NaN
print(newDf)
print(len(newDf))      # number of rows
print(newDf.columns)   # Index holding the column names
newDf = df.dropna(how='all')  # drop a row only when every column is NaN
print(newDf)
print(df.dropna(axis=1))             # drop by column
print(df.dropna(how='all', axis=1))  # drop by column

# 2. Filling in data: fillna
print(df.fillna('?'))
df.at[0, '数分'] = None
print(df.fillna(method='pad'))    # fill with the previous value in the column;
                                  # stays NaN when the row has no predecessor
print(df.fillna(method='bfill'))  # fill with the next value in the column;
                                  # stays NaN when the row has no successor

# Replace NaN with the mean or another statistic
print(df.fillna(df.mean()))  # fill each column's NaN with that column's mean
print(df.fillna(df.mean()['高代':'解几']))  # fill NaN only in columns '高代'
                                            # through '解几', using their means

# Different fill values per column
print(df.fillna({'数分': 100, '高代': 0}))  # columns not listed stay unchanged

# strip()/lstrip()/rstrip(): remove the given characters (whitespace by
# default) from both ends of string data
df2 = DataFrame({
    'age': Series([26, 34, 27, 34, 88, 21, 27]),
    'name': Series([' Tom', 'Lee ', ' Jon', ' Lee', 'James ', 'Curry ',
                    ' Curryy'])
})
def transform(df: pd.DataFrame, year: int, fillgps: bool = False,
              naninvalid: bool = False, dropnan: bool = False,
              masknan: float = None, fillnan: float = None,
              aqsnumerical: bool = False, sites=[]) -> pd.DataFrame:
    if len(sites) > 0:
        df.drop(df[~df['AQS_Code'].isin(list(sites.keys()))].index,
                inplace=True)

    # This is probably not needed anymore after changes Data_structure_3 (level3_data)
    if naninvalid:
        if year < 2014:
            val = 'VAL'
        if year >= 2014:
            val = 'K'
        # use .loc so the assignment hits the original frame instead of a copy
        df.loc[df['nox_flag'] != val, 'nox_flag'] = np.nan
        df.loc[df['no_flag'] != val, 'no_flag'] = np.nan
        df.loc[df['no2_flag'] != val, 'no2_flag'] = np.nan
        df.loc[df['o3_flag'] != val, 'o3_flag'] = np.nan

    # This is probably not needed anymore after changes Data_structure_3 (level3_data)
    if fillgps:
        unique = df['AQS_Code'].unique()
        for site in HOUSTON:
            if site in unique:
                df.loc[df['AQS_Code'] == site, 'Longitude'] = HOUSTON[site]['Longitude']
                df.loc[df['AQS_Code'] == site, 'Latitude'] = HOUSTON[site]['Latitude']

    if dropnan:
        if year < 2014:
            val = 'VAL'
        if year >= 2014:
            val = 'K'
        df.dropna(inplace=True)

    if aqsnumerical:
        # str.replace returns a new Series; assign it back
        df['AQS_Code'] = df['AQS_Code'].str.replace('_', '')
        df['AQS_Code'] = df['AQS_Code'].astype(int)

    df['wind_x_dir'] = df['windspd'] * np.cos(df['winddir'] * (np.pi / 180))
    df['wind_y_dir'] = df['windspd'] * np.sin(df['winddir'] * (np.pi / 180))
    df['hour'] = pd.to_datetime(df['epoch'], unit='s').dt.hour
    df['day_of_year'] = pd.Series(pd.to_datetime(df['epoch'], unit='s'))
    df['day_of_year'] = df['day_of_year'].dt.dayofyear

    if masknan is not None:
        s = df['AQS_Code']
        df[df.isnull().any(axis=1)] = 1000
        df['AQS_Code'] = s
    elif fillnan is not None:
        df.fillna(fillnan, inplace=True)

    return df
def test_fillna(self):
    tf = self.tsframe
    tf.loc[tf.index[:5], 'A'] = nan
    tf.loc[tf.index[-5:], 'A'] = nan

    zero_filled = self.tsframe.fillna(0)
    assert (zero_filled.loc[zero_filled.index[:5], 'A'] == 0).all()

    padded = self.tsframe.fillna(method='pad')
    assert np.isnan(padded.loc[padded.index[:5], 'A']).all()
    assert (padded.loc[padded.index[-5:], 'A'] ==
            padded.loc[padded.index[-5], 'A']).all()

    # mixed type
    mf = self.mixed_frame
    mf.loc[mf.index[5:20], 'foo'] = nan
    mf.loc[mf.index[-10:], 'A'] = nan
    result = self.mixed_frame.fillna(value=0)
    result = self.mixed_frame.fillna(method='pad')

    pytest.raises(ValueError, self.tsframe.fillna)
    pytest.raises(ValueError, self.tsframe.fillna, 5, method='ffill')

    # mixed numeric (but no float16)
    mf = self.mixed_float.reindex(columns=['A', 'B', 'D'])
    mf.loc[mf.index[-10:], 'A'] = nan
    result = mf.fillna(value=0)
    _check_mixed_float(result, dtype=dict(C=None))

    result = mf.fillna(method='pad')
    _check_mixed_float(result, dtype=dict(C=None))

    # empty frame (GH #2778)
    df = DataFrame(columns=['x'])
    for m in ['pad', 'backfill']:
        df.x.fillna(method=m, inplace=True)
        df.x.fillna(method=m)

    # with different dtype (GH3386)
    df = DataFrame([['a', 'a', np.nan, 'a'],
                    ['b', 'b', np.nan, 'b'],
                    ['c', 'c', np.nan, 'c']])

    result = df.fillna({2: 'foo'})
    expected = DataFrame([['a', 'a', 'foo', 'a'],
                          ['b', 'b', 'foo', 'b'],
                          ['c', 'c', 'foo', 'c']])
    assert_frame_equal(result, expected)

    df.fillna({2: 'foo'}, inplace=True)
    assert_frame_equal(df, expected)

    # limit and value
    df = DataFrame(np.random.randn(10, 3))
    df.iloc[2:7, 0] = np.nan
    df.iloc[3:5, 2] = np.nan

    expected = df.copy()
    expected.iloc[2, 0] = 999
    expected.iloc[3, 2] = 999
    result = df.fillna(999, limit=1)
    assert_frame_equal(result, expected)

    # with datelike
    # GH 6344
    df = DataFrame({
        'Date': [pd.NaT, Timestamp("2014-1-1")],
        'Date2': [Timestamp("2013-1-1"), pd.NaT]
    })

    expected = df.copy()
    expected['Date'] = expected['Date'].fillna(df.loc[df.index[0], 'Date2'])
    result = df.fillna(value={'Date': df['Date2']})
    assert_frame_equal(result, expected)

    # with timezone
    # GH 15855
    df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
                             pd.NaT]})
    exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
                              pd.Timestamp('2012-11-11 00:00:00+01:00')]})
    assert_frame_equal(df.fillna(method='pad'), exp)

    df = pd.DataFrame({'A': [pd.NaT,
                             pd.Timestamp('2012-11-11 00:00:00+01:00')]})
    exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
                              pd.Timestamp('2012-11-11 00:00:00+01:00')]})
    assert_frame_equal(df.fillna(method='bfill'), exp)
def test_fillna_downcast_dict(self):
    # GH#40809
    df = DataFrame({"col1": [1, np.nan]})
    result = df.fillna({"col1": 2}, downcast={"col1": "int64"})
    expected = DataFrame({"col1": [1, 2]})
    tm.assert_frame_equal(result, expected)
def test_fillna_positive_limit(self, type):
    df = DataFrame(np.random.randn(10, 4)).astype(type)

    msg = "Limit must be greater than 0"
    with pytest.raises(ValueError, match=msg):
        df.fillna(0, limit=-5)
def test_fillna_col_reordering(self):
    cols = ["COL." + str(i) for i in range(5, 0, -1)]
    data = np.random.rand(20, 5)
    df = DataFrame(index=lrange(20), columns=cols, data=data)
    filled = df.fillna(method='ffill')
    assert df.columns.tolist() == filled.columns.tolist()
tmp_df = daily_df.filter(['movieNm', 'audiCnt'])
daily_rank_df = tmp_df.rename(index=tmp_df['movieNm'],
                              columns={'audiCnt': yesterday_str})
daily_rank_df.drop('movieNm', axis=1, inplace=True)
daily_rank_df[yesterday_str] = daily_rank_df[yesterday_str].astype(int)

df = pd.merge(df, daily_rank_df, left_index=True, right_index=True,
              how='outer')
final_df = df.fillna(0)  # fill in missing values

yesterday = dt.datetime.now() + dt.timedelta(days=-1)
yesterday_str = yesterday.strftime('%Y%m%d')
final_df = final_df.sort_values(yesterday_str, ascending=False).head(5)

pyplot.rcParams['font.family'] = 'Malgun Gothic'
pyplot.rcParams['font.size'] = 17
pyplot.rcParams['figure.figsize'] = (20, 10)

final_df.plot.bar(rot=45)
pyplot.title('날짜별 관람객 빈도')  # "audience counts by date"
pyplot.show()

final_df.T.plot(rot=45)
def _extract_weather_power_plants(_dataset):
    _dataset_list = []
    df_average = DataFrame()
    for col in ['Global_tavg', 'Global_tmin', 'Global_tmax', 'Global_prcp',
                'Global_snow', 'Global_wdir', 'Global_wspd', 'Global_wpgt',
                'Global_pres', 'Global_tsun']:
        df_average[col] = ""
    counter = 0
    for i in range(len(_dataset)):
        print(i)
        latitude = _dataset.at[i, 'latitude']
        longitude = _dataset.at[i, 'longitude']
        name = _dataset.at[i, 'country']
        weather_data = DataFrame(_weather_data_extraction(
            latitude, longitude)).add_prefix(name + "_" + str(i) + "_")
        weather_data.columns = ['tavg', 'tmin', 'tmax', 'prcp', 'snow',
                                'wdir', 'wspd', 'wpgt', 'pres', 'tsun']
        weather_data = weather_data.fillna(0.0)
        set_option('display.max_columns', 50)
        if i == 0:
            # accumulate tavg as well, so the division below yields a true average
            df_average['Global_tavg'] = weather_data['tavg']
            df_average['Global_tmin'] = weather_data['tmin']
            df_average['Global_tmax'] = weather_data['tmax']
            df_average['Global_prcp'] = weather_data['prcp']
            df_average['Global_snow'] = weather_data['snow']
            df_average['Global_wdir'] = weather_data['wdir']
            df_average['Global_wspd'] = weather_data['wspd']
            df_average['Global_wpgt'] = weather_data['wpgt']
            df_average['Global_pres'] = weather_data['pres']
            df_average['Global_tsun'] = weather_data['tsun']
        else:
            df_average['Global_tavg'] += weather_data['tavg']
            df_average['Global_tmin'] += weather_data['tmin']
            df_average['Global_tmax'] += weather_data['tmax']
            df_average['Global_prcp'] += weather_data['prcp']
            df_average['Global_snow'] += weather_data['snow']
            df_average['Global_wdir'] += weather_data['wdir']
            df_average['Global_wspd'] += weather_data['wspd']
            df_average['Global_wpgt'] += weather_data['wpgt']
            df_average['Global_pres'] += weather_data['pres']
            df_average['Global_tsun'] += weather_data['tsun']
        counter += 1
        # print(weather_data)
        # _dataset_list.append(weather_data)

    df_average['Global_tavg'] = df_average['Global_tavg'] / counter
    df_average['Global_tmin'] = df_average['Global_tmin'] / counter
    df_average['Global_tmax'] = df_average['Global_tmax'] / counter
    df_average['Global_prcp'] = df_average['Global_prcp'] / counter
    df_average['Global_snow'] = df_average['Global_snow'] / counter
    df_average['Global_wdir'] = df_average['Global_wdir'] / counter
    df_average['Global_wspd'] = df_average['Global_wspd'] / counter
    df_average['Global_wpgt'] = df_average['Global_wpgt'] / counter
    df_average['Global_pres'] = df_average['Global_pres'] / counter
    df_average['Global_tsun'] = df_average['Global_tsun'] / counter

    # return concat(_dataset_list, join='inner', axis=1)
    return df_average
# Compare salary.dropna().mean() with salary.fillna(0).mean()
# With a DataFrame things get a bit messier - do you want to drop all the
# rows with NAs or just some of them?
salary = DataFrame({'salary': [53215, 112454, 22365, np.nan, 30493, None],
                    'grade': [5, 7, None, np.nan, 2, 9]},
                   index=['Margaret', 'Stephen', 'Joanne',
                          'Joe', 'Matthew', 'Nelson'])

# dropna() drops all rows with NAs - this permits complete-case analysis
salary.dropna()

#3 - Wrangling ******************************************************************

# Impute with the column mean
salary.fillna(salary.mean())

# Create a design matrix X with 5 columns, the first of which is all 1s;
# the subsequent ones are x_1, x_2, etc.
X = np.array(([1] * n, x_1, x_2, x_3, x_4)).T

# The code below separates the data set into two parts. The first, a
# DataFrame called X, contains just the covariate information (the first
# 191 columns) standardised to have mean 0 and standard deviation 1. The
# second, y, contains just the RockOrNot variable.
covariates = music.drop('RockOrNot', axis=1)
X = (covariates - covariates.mean()) / covariates.std()
y = music.RockOrNot

# Use corrwith and np.where to find which variable has the biggest
# correlation with y
cors = X.corrwith(y)
np.where(cors == max(cors))

# So we now want each row to contain their rating and all their details
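# A small aside, not part of the original exercise: Series.idxmax returns
# the label of the largest value directly, which is tidier than the
# positional np.where lookup above. `cors` is the correlation Series
# computed just before this.
cors.idxmax()        # column most positively correlated with y
cors.abs().idxmax()  # strongest correlation in either direction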
def prepare_data(test, traces, options):
    std_out('Preparing data for plot')
    # Dataframe to return
    df = DataFrame()
    # Check if there are different subplots
    n_subplots = 1
    for trace in traces:
        if 'subplot' in traces[trace].keys():
            n_subplots = max(n_subplots, traces[trace]['subplot'])
        else:
            std_out(f'Trace {trace} not assigned to a subplot', 'WARNING')
    std_out(f'Making {n_subplots} subplots')
    # Generate list of subplots
    subplots = [[] for x in range(n_subplots)]
    # Put data in the df
    for trace in traces.keys():
        if 'subplot' not in traces[trace].keys():
            std_out(f'The trace {trace} was not placed in any subplot. '
                    f'Assuming subplot #1', 'WARNING')
            traces[trace]['subplot'] = 1
        ndevs = traces[trace]['devices']
        nchans = traces[trace]['channel']
        # Make them lists always
        if ndevs == 'all':
            devices = list(test.devices.keys())
        elif type(ndevs) == str or type(ndevs) == int:
            devices = [ndevs]
        else:
            devices = ndevs
        for device in devices:
            ndev = str(device)
            # Make them lists always
            if nchans == 'all':
                channels = list(test.devices[ndev].readings.columns)
            elif type(nchans) == str:
                channels = [nchans]
            else:
                channels = nchans
            for channel in channels:
                # Check if device is in columns
                if channel not in test.devices[ndev].readings.columns:
                    std_out(f'The device {ndev} does not contain {channel}. Ignoring',
                            'WARNING')
                    continue
                # Put channel in subplots
                subplots[traces[trace]['subplot'] - 1].append(channel + '_' + ndev)
                column_orig = [channel]
                columns_add = [channel + '_' + ndev]
                # Add filtering name to dfdev
                if 'filter' in traces[trace]:
                    col_name = traces[trace]['filter']['col']
                    if col_name not in test.devices[ndev].readings.columns:
                        std_out(f'Column {col_name} not in dataframe. Ignoring filtering',
                                'WARNING')
                    else:
                        column_orig.append(col_name)
                        columns_add.append(col_name)
                # Device dataframe
                dfdev = DataFrame(test.devices[ndev].readings[column_orig].values,
                                  columns=columns_add,
                                  index=test.devices[ndev].readings.index)
                # Add filtering function
                if 'filter' in traces[trace]:
                    value = traces[trace]['filter']['value']
                    relationship = traces[trace]['filter']['relationship']
                    if col_name in dfdev.columns:
                        # Keep only the rows that satisfy the filter; the
                        # result has to be assigned back for it to stick
                        if relationship == '==':
                            dfdev = dfdev.loc[dfdev[col_name] == value]
                        elif relationship == '<=':
                            dfdev = dfdev.loc[dfdev[col_name] <= value]
                        elif relationship == '>=':
                            dfdev = dfdev.loc[dfdev[col_name] >= value]
                        elif relationship == '<':
                            dfdev = dfdev.loc[dfdev[col_name] < value]
                        elif relationship == '>':
                            dfdev = dfdev.loc[dfdev[col_name] > value]
                        else:
                            std_out("Not a valid relationship. "
                                    "Valid options: '==', '<=', '>=', '<', '>'", 'ERROR')
                            continue
                        # Remove the filtering column from dfdev
                        dfdev.drop(columns=[col_name], inplace=True)
                # Combine it in the df
                df = df.combine_first(dfdev)
                # Add average or other extras
                # TODO Check this to simplify
                # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.resample.Resampler.aggregate.html
                if 'extras' in traces[trace]:
                    for extra in traces[trace]['extras']:
                        extra_name = channel + f'-{extra.upper()}'
                        sbl = subplots[traces[trace]['subplot'] - 1]
                        if extra == 'max':
                            df[extra_name] = df.loc[:, sbl].max(axis=1)
                        if extra == 'mean':
                            df[extra_name] = df.loc[:, sbl].mean(axis=1)
                        if extra == 'min':
                            df[extra_name] = df.loc[:, sbl].min(axis=1)
                        subplots[traces[trace]['subplot'] - 1].append(extra_name)
    # Trim data
    if options['min_date'] is not None:
        df = df[df.index > options['min_date']]
    if options['max_date'] is not None:
        df = df[df.index < options['max_date']]
    # Make sure everything is numeric before resampling
    # https://stackoverflow.com/questions/34257069/resampling-pandas-dataframe-is-deleting-column#34270422
    # df = df.apply(to_numeric, errors='coerce')
    df = df.astype(float, errors='ignore')
    # Resample it
    if options['frequency'] is not None:
        std_out(f"Resampling at {options['frequency']}", "INFO")
        if 'resample' in options:
            if options['resample'] == 'max':
                df = df.resample(options['frequency']).max()
            if options['resample'] == 'min':
                df = df.resample(options['frequency']).min()
            if options['resample'] == 'mean':
                df = df.resample(options['frequency']).mean()
        else:
            df = df.resample(options['frequency']).mean()
    # Clean na
    if options['clean_na'] is not None:
        if options['clean_na'] == 'fill':
            df = df.fillna(method='ffill')
        if options['clean_na'] == 'drop':
            # dropna returns a new frame, so assign the result back
            df = df.dropna(axis=0, how='any')
    if df.empty:
        std_out('Dataframe for selected options is empty', 'WARNING')
    return df, subplots
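# A side sketch, not part of prepare_data: the relationship if/elif chain
# above can be collapsed into a lookup table built on the operator module.
# The keys mirror the relationship strings the function accepts.
import operator
_OPS = {'==': operator.eq, '<=': operator.le, '>=': operator.ge,
        '<': operator.lt, '>': operator.gt}
# dfdev = dfdev.loc[_OPS[relationship](dfdev[col_name], value)]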
xLastTimeFeature = xTotalGroup['time'].min().unstack('behavior_type')
xLastTimeFeature.fillna(value=10, inplace=True)
xLastTimeFeature.sort_index(axis=1, inplace=True)
xLastTimeFeature = xLastTimeFeature.add_prefix('x_last_time_')
xLastTimeFeature13 = xLastTimeFeature.loc[:, ['x_last_time_1', 'x_last_time_3']]

# time relative
xTimeGroup = u.groupby(['user_id', 'item_id', 'time', 'behavior_type'])
xTimeFeature = xTimeGroup['item_category'].count()
xTimeFeature = xTimeFeature.unstack(['time', 'behavior_type'])
xTimeFeature.sort_index(axis=1, inplace=True)
xTimeFeature.fillna(value=0, inplace=True)

# 3 not 4: behaviour 3 happened on the last day but behaviour 4 did not
x_last1_3not4 = DataFrame(index=xTimeFeature[0].index)
x_last1_3not4_index = np.logical_and(xTimeFeature[0][4] == 0, xTimeFeature[0][3] != 0)
x_last1_3not4.loc[x_last1_3not4_index, 'x_last1_3not4'] = 1
x_last1_3not4.fillna(0, inplace=True)

# x_last2_3not4 = DataFrame(index=xTimeFeature[0].index)
# x_last2_3not4_index = np.logical_and(xTimeFeature[1][4] == 0, xTimeFeature[1][3] != 0)
# x_last2_3not4.loc[x_last2_3not4_index, 'x_last2_3not4'] = 1
# x_last2_3not4.fillna(0, inplace=True)

# every two days: sum consecutive pairs of daily frames
xevery2 = [(xTimeFeature[i] + xTimeFeature[i + 1]).add_prefix(str(i) + '_')
           for i in range(0, 10, 2)]
xevery2 = pd.concat(xevery2, axis=1).add_prefix('x_every2_')

# last 1, 3 and 5 days
xlast1 = xTimeFeature[0].add_prefix('x_last_1')
xlast3 = (xTimeFeature[0] + xTimeFeature[1] + xTimeFeature[2]).add_prefix('x_last_3')
xlast5 = (xTimeFeature[0] + xTimeFeature[1] + xTimeFeature[2]
          + xTimeFeature[3] + xTimeFeature[4]).add_prefix('x_last_5')
xT = pd.concat([xlast1, xlast3, xlast5], axis=1)
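# A toy sketch of the "3 not 4" flag above, with made-up counts and the
# usual encoding assumed (behaviour 3 = add-to-cart, 4 = purchase): mark
# rows where 3 occurred but 4 did not, then fill the rest with 0.
counts = DataFrame({3: [2, 0, 1], 4: [0, 0, 3]})
flag = DataFrame(index=counts.index)
flag.loc[np.logical_and(counts[4] == 0, counts[3] != 0), '3not4'] = 1
flag.fillna(0, inplace=True)  # flag['3not4'] -> [1.0, 0.0, 0.0]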
def test_fillna_other(self):
    # empty frame (GH #2778)
    df = DataFrame(columns=['x'])
    for m in ['pad', 'backfill']:
        df.x.fillna(method=m, inplace=True)
        df.x.fillna(method=m)

    # with different dtype (GH3386)
    df = DataFrame([['a', 'a', np.nan, 'a'],
                    ['b', 'b', np.nan, 'b'],
                    ['c', 'c', np.nan, 'c']])
    result = df.fillna({2: 'foo'})
    expected = DataFrame([['a', 'a', 'foo', 'a'],
                          ['b', 'b', 'foo', 'b'],
                          ['c', 'c', 'foo', 'c']])
    assert_frame_equal(result, expected)

    df.fillna({2: 'foo'}, inplace=True)
    assert_frame_equal(df, expected)

    # limit and value
    df = DataFrame(np.random.randn(10, 3))
    df.iloc[2:7, 0] = np.nan
    df.iloc[3:5, 2] = np.nan
    expected = df.copy()
    expected.iloc[2, 0] = 999
    expected.iloc[3, 2] = 999
    result = df.fillna(999, limit=1)
    assert_frame_equal(result, expected)

    # with datelike
    # GH 6344
    df = DataFrame({'Date': [pd.NaT, Timestamp("2014-1-1")],
                    'Date2': [Timestamp("2013-1-1"), pd.NaT]})
    expected = df.copy()
    expected['Date'] = expected['Date'].fillna(df.loc[df.index[0], 'Date2'])
    result = df.fillna(value={'Date': df['Date2']})
    assert_frame_equal(result, expected)

    # with timezone
    # GH 15855
    df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]})
    exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
                              pd.Timestamp('2012-11-11 00:00:00+01:00')]})
    assert_frame_equal(df.fillna(method='pad'), exp)

    df = pd.DataFrame({'A': [pd.NaT, pd.Timestamp('2012-11-11 00:00:00+01:00')]})
    exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
                              pd.Timestamp('2012-11-11 00:00:00+01:00')]})
    assert_frame_equal(df.fillna(method='bfill'), exp)

    # with timezone in another column
    # GH 15522
    df = pd.DataFrame({'A': pd.date_range('20130101', periods=4, tz='US/Eastern'),
                       'B': [1, 2, np.nan, np.nan]})
    result = df.fillna(method='pad')
    expected = pd.DataFrame({'A': pd.date_range('20130101', periods=4, tz='US/Eastern'),
                             'B': [1., 2., 2., 2.]})
    assert_frame_equal(result, expected)
print(df1.dropna(how='all'))

# Column-wise dropping of null values
print(df1.dropna(axis=1))  # dropna() works the same way for row and column NaN drops
print()

# Threshold property of dropna() [thresh]
df2 = DataFrame([[1, 2, 3, np.nan],
                 [4, 5, 6, 7],
                 [8, 9, np.nan, np.nan],
                 [12, np.nan, np.nan, np.nan]])
# Drop all rows with fewer than 3 non-NaN data values
print(df2.dropna(thresh=3))

#3. Filling NaN values with chosen numerical values - the fillna() function
print(df2.fillna({0: 0, 1: 50, 2: 100, 3: 200}))
print()

#----------- NEXT LECTURE -------------#
# Selecting and modifying data in pandas
series1 = Series([100, 200, 300], index=['a', 'b', 'c'])
# A Series always has built-in positional indices 0, 1, 2, ... even when we
# explicitly give it other labels, so its elements can also be accessed by
# position (e.g. series1.iloc[0])

# Conditional indexing
print(series1[series1 > 150])
print(series1[series1 == 300])

df1 = DataFrame(np.arange(9).reshape(3, 3),
                index=['car', 'bike', 'cycle'],
                columns=['A', 'B', 'C'])  # column labels assumed for completeness
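# A hedged continuation sketch, not in the lecture code: with string row
# labels, .loc selects by label and .iloc by position; both work on the
# df1 defined just above (whose column labels 'A', 'B', 'C' were assumed).
print(df1.loc['bike'])    # row by label
print(df1.iloc[0])        # row by position
print(df1[df1['A'] > 2])  # conditional indexing works on DataFrames too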
def test_fillna_dtype_conversion_equiv_replace(self, val):
    df = DataFrame({"A": [1, np.nan], "B": [1.0, 2.0]})
    expected = df.replace(np.nan, val)
    result = df.fillna(val)
    tm.assert_frame_equal(result, expected)