Example #1
    def test_fillna_categorical_nan(self):
        # GH 14021
        # np.nan should always be a valid filler
        cat = Categorical([np.nan, 2, np.nan])
        val = Categorical([np.nan, np.nan, np.nan])
        df = DataFrame({"cats": cat, "vals": val})
        res = df.fillna(df.median())
        v_exp = [np.nan, np.nan, np.nan]
        df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
                           dtype='category')
        tm.assert_frame_equal(res, df_exp)

        result = df.cats.fillna(np.nan)
        tm.assert_series_equal(result, df.cats)
        result = df.vals.fillna(np.nan)
        tm.assert_series_equal(result, df.vals)

        idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
                                '2011-01-01 09:00', pd.NaT, pd.NaT])
        df = DataFrame({'a': Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

        idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01',
                              pd.NaT, pd.NaT], freq='M')
        df = DataFrame({'a': Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

        idx = pd.TimedeltaIndex(['1 days', '2 days',
                                 '1 days', pd.NaT, pd.NaT])
        df = DataFrame({'a': Categorical(idx)})
        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
Example #2
    def test_fillna_skip_certain_blocks(self):
        # don't try to fill boolean, int blocks

        df = DataFrame(np.random.randn(10, 4).astype(int))

        # it works!
        df.fillna(np.nan)
Example #3
    def test_merge_na_keys(self):
        data = [[1950, "A", 1.5],
                [1950, "B", 1.5],
                [1955, "B", 1.5],
                [1960, "B", np.nan],
                [1970, "B", 4.],
                [1950, "C", 4.],
                [1960, "C", np.nan],
                [1965, "C", 3.],
                [1970, "C", 4.]]

        frame = DataFrame(data, columns=["year", "panel", "data"])

        other_data = [[1960, 'A', np.nan],
                      [1970, 'A', np.nan],
                      [1955, 'A', np.nan],
                      [1965, 'A', np.nan],
                      [1965, 'B', np.nan],
                      [1955, 'C', np.nan]]
        other = DataFrame(other_data, columns=['year', 'panel', 'data'])

        result = frame.merge(other, how='outer')

        expected = frame.fillna(-999).merge(other.fillna(-999), how='outer')
        expected = expected.replace(-999, np.nan)

        tm.assert_frame_equal(result, expected)
Example #4
def getPostData(fbGraph, entry):
    global CHART_LIMIT
    retrieved = False
    i=0
    while not retrieved:
        i += 1
        try:
            posts = fbGraph.get_object(entry['page'] + '/posts',
                                       limit=CHART_LIMIT*15)['data']
            retrieved = True
        except facebook.GraphAPIError:
            print "Failed retrieving Graph object from facebook, retrying..."
            pass
        if i > 14:
            print "Giving up"
            return None
        
    frame = DataFrame(posts)
    ##Later, maybe output this frame for further study
    
    postData = DataFrame(columns=('Date', 'Likes', 'Shares'))
    postData['Shares'] = frame['shares'].map(fmtShares)
    postData['Likes']  = frame['id'].map(fmtLikes)
    postData['Date']   = frame['created_time'].map(fmtDate)
    
    postData = postData.groupby(by='Date', sort=False).mean()
    postData = postData.head(n=CHART_LIMIT)
    postData = postData.fillna(value=0)  # fillna is not in-place by default; keep the result
    return postData
Example #5
    def test_operators_none_as_na(self):
        df = DataFrame({"col1": [2, 5.0, 123, None],
                        "col2": [1, 2, 3, 4]}, dtype=object)

        ops = [operator.add, operator.sub, operator.mul, operator.truediv]

        # since filling converts dtypes from object, changed expected to be
        # object
        for op in ops:
            filled = df.fillna(np.nan)
            result = op(df, 3)
            expected = op(filled, 3).astype(object)
            expected[com.isnull(expected)] = None
            assert_frame_equal(result, expected)

            result = op(df, df)
            expected = op(filled, filled).astype(object)
            expected[com.isnull(expected)] = None
            assert_frame_equal(result, expected)

            result = op(df, df.fillna(7))
            assert_frame_equal(result, expected)

            result = op(df.fillna(7), df)
            assert_frame_equal(result, expected, check_dtype=False)
Example #6
def timeseries_to_supervised(data, lag=1):
    df = DataFrame(data)
    columns = [df.shift(i) for i in range(1, lag+1)]
    columns.append(df)
    df = concat(columns, axis=1)
    df.fillna(0, inplace=True)
    return df
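A quick usage sketch (hypothetical numbers, assuming the same DataFrame/concat imports the function relies on): with lag=2 the result carries the series shifted by one and two steps, then the original, with the leading NaNs zero-filled.

supervised = timeseries_to_supervised([10, 20, 30, 40], lag=2)
print(supervised)  # three columns: shift(1), shift(2), then the unshifted series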
Example #7
    def test_fillna_dtype_conversion(self):
        # make sure that fillna on an empty frame works
        df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
        result = df.get_dtype_counts().sort_values()
        expected = Series({'object': 5})
        assert_series_equal(result, expected)

        result = df.fillna(1)
        expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
        result = result.get_dtype_counts().sort_values()
        expected = Series({'int64': 5})
        assert_series_equal(result, expected)

        # empty block
        df = DataFrame(index=lrange(3), columns=['A', 'B'], dtype='float64')
        result = df.fillna('nan')
        expected = DataFrame('nan', index=lrange(3), columns=['A', 'B'])
        assert_frame_equal(result, expected)

        # equiv of replace
        df = DataFrame(dict(A=[1, np.nan], B=[1., 2.]))
        for v in ['', 1, np.nan, 1.0]:
            expected = df.replace(np.nan, v)
            result = df.fillna(v)
            assert_frame_equal(result, expected)
Example #8
def pad_smooth(sample, window_len):
    nr_frames = sample.frame.max() + 40
    out = DataFrame({'frame': np.arange(nr_frames)})
    out['sample_id'] = sample.sample_id.unique()[0]

    out = pd.merge(out, sample, how='outer', on=['sample_id', 'frame'])
    out.fillna(method='ffill', inplace=True, limit=2)
    out.fillna(method='bfill', inplace=True)
    out.fillna(method='ffill', inplace=True)
    for gesture in range(21):
        out[gesture] = smooth(out[gesture], window_len=window_len, window='hanning')
    return out
Example #9
    def test_fillna_columns(self):
        df = DataFrame(np.random.randn(10, 10))
        df.values[:, ::2] = np.nan

        result = df.fillna(method='ffill', axis=1)
        expected = df.T.fillna(method='pad').T
        assert_frame_equal(result, expected)

        df.insert(6, 'foo', 5)
        result = df.fillna(method='ffill', axis=1)
        expected = df.astype(float).fillna(method='ffill', axis=1)
        assert_frame_equal(result, expected)
Example #10
class Fillna(object):

    params = ([True, False], ['pad', 'bfill'])
    param_names = ['inplace', 'method']

    def setup(self, inplace, method):
        values = np.random.randn(10000, 100)
        values[::2] = np.nan
        self.df = DataFrame(values)

    def time_frame_fillna(self, inplace, method):
        self.df.fillna(inplace=inplace, method=method)
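Outside of an asv run, the benchmark can be exercised by hand; a minimal sketch, assuming numpy and DataFrame are imported as in the other examples (asv itself would normally drive the parameter grid):

bench = Fillna()
bench.setup(inplace=False, method='pad')
bench.time_frame_fillna(inplace=False, method='pad')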
Example #11
def makeguesses(morceaux,
                collectionname='grenoble',
                databasename='local'):
    with MongoClient() as client:
        coll = client[databasename][collectionname]
        dbvecs = DataFrame({d['T']: titlevector(d['T'])
                            for d in coll.find()
                            if d['T'] is not None})
        dbvecs.fillna(0, inplace=True)
        dbvecs.sort_index(inplace=True)
        dbkeys = {d['T']: d['_id']
                  for d in coll.find() if d['T'] is not None}
        # get the best fit
        guesses = (np.abs(dbvecs.subtract(titlevector(m), axis=0)).sum(axis=0).idxmin() for m in morceaux)
    return [[m, g, dbkeys[g]] for m, g in zip(morceaux, guesses)]
Example #12
def calc_distance_matrix(G, max_distance=None):
    """Returns a matrix containing the shortest distance
    between all nodes in a network

    Parameters
    ----------
    G : graph
       A NetworkX graph

    max_distance : float or None, optional (default='None')
       The maximum possible distance value in the network.
       If None, max_distance is the longest shortest path between
       two nodes of the network (the graph eccentricity)

    Returns
    -------
    dist_matrix : NumPy array
      An NxN numpy array.

    Notes
    -----
    Along the diagonal, the values are all 0.
    Unconnected nodes have a distance of max_distance to other nodes.
    """

    # Network (collaborator) Distance
    dist_matrix = nx.all_pairs_shortest_path_length(G)
    dist_matrix = DataFrame(dist_matrix, index=G.nodes(), columns=G.nodes())
    if max_distance is None:
        max_distance = float(dist_matrix.max().max())
    # The unconnected ones are infinitely far from the rest
    dist_matrix = dist_matrix.fillna(max_distance)
    diag_idx = np.diag_indices(len(dist_matrix), ndim=2)
    dist_matrix.values[diag_idx] = 0
    return dist_matrix
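A small usage sketch, assuming NetworkX 1.x, where all_pairs_shortest_path_length returns the dict of dicts this function expects (under 2.x it returns an iterator that would need dict() around it):

import networkx as nx

G = nx.Graph([('a', 'b'), ('b', 'c')])
G.add_node('d')  # isolated node
dm = calc_distance_matrix(G, max_distance=10)
print(dm.loc['a', 'c'])  # 2: shortest path a-b-c
print(dm.loc['a', 'd'])  # 10: unreachable, so filled with max_distance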
Example #13
    def get_date_trend(self, mode_date):
        """
        :param mode_date: date granularity; data are merged down to this smallest time unit. 0-day, 1-week, 2-month, 3-quarter. (default 2)
        """
        axisLabels = self.oriDate[:]
        pointVals = [{copy.deepcopy(oriValue): 1} for oriValue in self.oriValues]

        rule_mode = {'0': 'D', '1': 'W', '2': 'M', '3': 'Q'}

        df = DataFrame(pointVals, index=axisLabels)
        df = df.resample(rule_mode[str(mode_date)], how='sum')
        df = df.fillna(0)

        """各项总和"""
        # cols_name = []
        # for name, col in df.iteritems():
        #     cols_name.append(name)
        # df['SUM'] = 0
        # for i in xrange(len(cols_name)):
        #     df['SUM'] += df[cols_name[i]]

        """宿舍比重"""
        # df['PER_DORM'] = df['dorm']/df['SUM'] if 'dorm' in df else 0  # 仅当存在宿舍值时才计算宿舍比重,否则设为0

        axisLabels = map(lambda x: x.strftime('%Y-%m-%d'), df.index.tolist())  # turn the dates used as the index into a list of labels
        seriesData = []
        legendLabels = []
        for colName, col in df.iteritems():
            legendLabels.append(colName)
            data = map(lambda x: 0.0 if isnan(x) else float(x), col.tolist())
            seriesData.append({'name': colName, 'data': data})

        json_dateTrend = {'axisLabels': axisLabels, 'legendLabels': legendLabels, 'seriesData': seriesData}
        return json_dateTrend
Example #14
def twitter_daily_aggregate(retrievaldate):

	#Date Retrieval
	d=[]
	dt = parser.parse(retrievaldate) + timedelta(days=-1)
	d.append(dt)
	d.append(d[-1] + timedelta(days=1))

	#DataFrame Init
	ctrend = DataFrame()
	while d[-1] < datetime.utcnow(): 
		print 'processing ', d[-1], ' ..........'
		#Daily Mention Count
		mnts = twitter_count(d, mentions)

		#User Follower Count
		usrs =  twitter_follower(d,users)
		#Join
		trend = mnts.join(usrs)
		trend['Date'] = Period(d[-1],'D')
		#Append to DataFrame
		ctrend = concat([ctrend,trend])
		#Extend Dates
		d.append(d[-1] + timedelta(days=1))
	#Join DataFrames and Fill NAs
	ctrend =  ctrend.fillna(0)
	#Save
	print 'printing the file'
	ctrend.to_csv('twitter_trend.csv')
	return ctrend
Example #15
def pandas_fillna(df: pd.DataFrame,
                  value: float = None,
                  method: str = None,
                  limit: int = None,
                  **kwargs) -> pd.DataFrame:
    """
    Return a new dataframe with NaN values filled according to the given value
    or method.

    This is a wrapper for the ``pandas.DataFrame.fillna()`` method. For additional
    keyword arguments and information refer to the pandas documentation at
    http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html

    :param df: The dataframe to fill
    :param value: Value to fill
    :param method: Method according to which to fill NaN. ffill/pad will
           propagate the last valid observation to the next valid observation.
           backfill/bfill will propagate the next valid observation back to the last
           valid observation.
    :param limit: Maximum number of NaN values to forward/backward fill.
    :return: A dataframe with nan values filled with the given value or according to the given method.
    """
    # The following code is needed because Pandas treats any kw given in kwargs as being set, even if just None.
    kwargs = dict(kwargs)
    if value is not None:  # compare against None, so a legitimate value of 0 is still forwarded
        kwargs.update(value=value)
    if method is not None:
        kwargs.update(method=method)
    if limit is not None:
        kwargs.update(limit=limit)

    return df.fillna(**kwargs)
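A brief usage sketch with hypothetical data; note that with the `is not None` checks above, a legitimate `value=0.0` is forwarded correctly:

frame = pd.DataFrame({'x': [1.0, None, None, 4.0]})
print(pandas_fillna(frame, value=0.0))                # constant fill
print(pandas_fillna(frame, method='ffill', limit=1))  # forward-fill at most one NaN per gap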
Example #16
def main():
    """
    Handling of missing / not-available (NA) values
    """

    string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
    print string_data
    print string_data.isnull()
    string_data[0] = None
    print string_data.isnull()
    print None is np.nan, None == np.nan # not same

    # Exclude N/A
    print '',''
    NA = np.nan
    data = Series([1, NA, 3.5, NA, 7])
    print data.dropna()
    print data[data.notnull()]

    data = DataFrame([
        [1., 6.5, 3.],
        [1., NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.]
    ])
    cleaned = data.dropna() # row that all value is not NA
    print data
    print cleaned
    print data.dropna(how='all')
    data[4] = None
    print data.dropna(axis=1, how='all')
    print data.dropna(thresh=2) # non NA is more 2

    # Fill NA
    print '',''
    print data.fillna(0)
    print data.fillna({1: 0.5, 2: -1})
    _ = data.fillna(0, inplace=True)
    print data
    print '',''
    df = DataFrame(np.arange(18).reshape((6, 3)))
    df.ix[2:, 1] = NA; df.ix[4:, 2] = NA
    print df
    print df.fillna(method='ffill')
    print df.fillna(method='ffill', limit=2)
    data = Series([1., NA, 3.5, NA, 7])
    print data.fillna(data.mean())
Example #17
    def parallel_cumulative_blame(self, branch='master', limit=None, skip=None, num_datapoints=None, committer=True,
                                  workers=1, ignore_globs=None, include_globs=None):
        """
        Returns the blame at every revision of interest. Index is a datetime, column per committer, with number of lines
        blamed to each committer at each timestamp as data.

        :param branch: (optional, default 'master') the branch to work in
        :param limit: (optional, default None), the maximum number of revisions to return, None for no limit
        :param skip: (optional, default None), the number of revisions to skip. Ex: skip=2 returns every other revision, None for no skipping.
        :param num_datapoints: (optional, default=None) if limit and skip are none, and this isn't, then num_datapoints evenly spaced revs will be used
        :param committer: (optional, default=True) true if committer should be reported, false if author
        :param ignore_globs: (optional, default=None) a list of globs to ignore, default none excludes nothing
        :param include_globs: (optional, default=None) a list of globs to include, default of None includes everything.
        :param workers: (optional, default=1) integer, the number of workers to use in the threadpool, -1 for one per core.
        :return: DataFrame

        """

        if not _has_joblib:
            raise ImportError('''Must have joblib installed to use parallel_cumulative_blame(), please use
            cumulative_blame() instead.''')

        revs = self.revs(branch=branch, limit=limit, skip=skip, num_datapoints=num_datapoints)

        if self.verbose:
            print('Beginning processing for cumulative blame:')

        revisions = json.loads(revs.to_json(orient='index'))
        revisions = [revisions[key] for key in revisions]

        ds = Parallel(n_jobs=workers, backend='threading', verbose=5)(
            delayed(_parallel_cumulative_blame_func)
            (self, x, committer, ignore_globs, include_globs) for x in revisions
        )

        revs = DataFrame(ds)
        del revs['rev']

        revs['date'] = to_datetime(revs['date'].map(datetime.datetime.fromtimestamp))
        revs.set_index(keys=['date'], drop=True, inplace=True)
        revs = revs.fillna(0.0)

        # drop 0 cols
        for col in revs.columns.values:
            if col != 'col':
                if revs[col].sum() == 0:
                    del revs[col]

        # drop 0 rows
        keep_idx = []
        committers = [x for x in revs.columns.values if x != 'date']
        for idx, row in revs.iterrows():
            if sum([row[x] for x in committers]) > 0:
                keep_idx.append(idx)

        revs = revs.ix[keep_idx]
        revs.sort_index(ascending=False, inplace=True)

        return revs
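A hedged usage sketch: the self.revs(...) call suggests this method lives on git-pandas' Repository, so invoking it on a local checkout might look like this (path and parameters are illustrative assumptions):

from gitpandas import Repository  # assumption: git-pandas is installed

repo = Repository(working_dir='.')  # any local git clone
blame = repo.parallel_cumulative_blame(branch='master', num_datapoints=20, workers=4)
print(blame.head())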
Example #18
def prepare_pop_data(population_data: pd.DataFrame, cols=None) -> pd.DataFrame:
    pop_data = population_data.fillna(value=0)
    if not cols:
        cols = ['plot.number', 'total.men', 'total.women', 'orthodox', 'other.christian', 'other.religion']
    pop_data.loc[:, cols] = pop_data.loc[:, cols].astype(int)
    pop_data['lutheran'] = pop_data['total.men'] + pop_data['total.women'] \
                           - pop_data['orthodox'] - pop_data['other.christian'] - pop_data['other.religion']
    return pop_data
Example #19
    def test_fill_value_when_combine_const(self):
        # GH12723
        dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
        df = DataFrame({'foo': dat}, index=range(6))

        exp = df.fillna(0).add(2)
        res = df.add(2, fill_value=0)
        assert_frame_equal(res, exp)
Example #20
    def test_fillna_nat(self):
        series = Series([0, 1, 2, NaT], dtype="M8[us]")

        filled = series.fillna(method="pad")
        filled2 = series.fillna(value=series[2])

        expected = series.copy()
        expected[3] = expected[2]

        assert_series_equal(filled, expected)
        assert_series_equal(filled2, expected)

        df = DataFrame({"A": series})
        filled = df.fillna(method="pad")
        filled2 = df.fillna(value=series[2])
        expected = DataFrame({"A": expected})
        assert_frame_equal(filled, expected)
        assert_frame_equal(filled2, expected)
Example #21
 def calculate_ground_truth(self):
     gt = DataFrame(copy.deepcopy(self.events_consuming))
     gt = gt.fillna(-1)
     for x in gt.columns:
         l = len(gt[x])
         for y in range(1,l):
             if (gt[x][y] == -1):
                 gt[x][y] = gt[x][y-1]
     self.appliances_status = gt
Example #22
    def fillna_dict(cls, prop):
        """
        Use trade history then fill empty with value row above
        """
        df = DataFrame(prop)
        df = df.replace(['', 'DEBIT', 'CREDIT'], numpy.nan)
        df = df.fillna(method='ffill')

        return [r.to_dict() for k, r in df.iterrows()]
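The core pattern, shown standalone on hypothetical trade rows (assuming the same DataFrame/numpy imports the method uses): placeholder strings become NaN, then the forward fill copies each value down from the row above.

rows = [{'date': '2019-01-02', 'amount': 100.0},
        {'date': '', 'amount': 'DEBIT'},
        {'date': '2019-01-03', 'amount': 50.0}]
df = DataFrame(rows).replace(['', 'DEBIT', 'CREDIT'], numpy.nan)
df = df.fillna(method='ffill')
print([r.to_dict() for k, r in df.iterrows()])  # the middle row inherits row 0's values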
Example #23
    def test_fillna_nat(self):
        series = Series([0, 1, 2, NaT], dtype='M8[ns]')

        filled = series.fillna(method='pad')
        filled2 = series.fillna(value=series.values[2])

        expected = series.copy()
        expected.values[3] = expected.values[2]

        assert_series_equal(filled, expected)
        assert_series_equal(filled2, expected)

        df = DataFrame({'A': series})
        filled = df.fillna(method='pad')
        filled2 = df.fillna(value=series.values[2])
        expected = DataFrame({'A': expected})
        assert_frame_equal(filled, expected)
        assert_frame_equal(filled2, expected)
Example #24
def seriesData_fitTimeFrame(dfList, colName, ts):
	dataList = []
	#ts = TimeSeries(pd.DateRange(datetime(1998,1,1), datetime(2013, 3, 20)))
	for i in range(len(dfList)):
		tf = DataFrame(index = ts)
		df = dfList[i]
		tf[colName] = df[colName]
		if colName in ('TradeID', 'Symbol', 'Year', 'Buy', 'Sell',
					   'Strat', 'StartTrading', 'EndTrading'):
			tf = tf.fillna(method = 'bfill')
			tf = tf.fillna(-99)
		elif colName in ('Close', 'IsPeriod', 'PosSize', 'PosDir',
						 'DynamicPNL', 'DynamicDollarPNL'):
			tf = tf.fillna(method = 'ffill')
			tf = tf.fillna(0)
		elif colName in ('RealizedPNL', 'RealizedDollarPNL', 'DailyPNLChange'):
			tf = tf.fillna(0)
		dataList.append(np.asarray(tf[colName]))
	return dataList	
Example #25
    def test_replace_datetimetz(self):

        # GH 11326
        # behaving poorly when presented with a datetime64[ns, tz]
        df = DataFrame({'A': date_range('20130101', periods=3,
                                        tz='US/Eastern'),
                        'B': [0, np.nan, 2]})
        result = df.replace(np.nan, 1)
        expected = DataFrame({'A': date_range('20130101', periods=3,
                                              tz='US/Eastern'),
                              'B': Series([0, 1, 2], dtype='float64')})
        assert_frame_equal(result, expected)

        result = df.fillna(1)
        assert_frame_equal(result, expected)

        result = df.replace(0, np.nan)
        expected = DataFrame({'A': date_range('20130101', periods=3,
                                              tz='US/Eastern'),
                              'B': [np.nan, np.nan, 2]})
        assert_frame_equal(result, expected)

        result = df.replace(Timestamp('20130102', tz='US/Eastern'),
                            Timestamp('20130104', tz='US/Eastern'))
        expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                    Timestamp('20130104', tz='US/Eastern'),
                                    Timestamp('20130103', tz='US/Eastern')],
                              'B': [0, np.nan, 2]})
        assert_frame_equal(result, expected)

        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace(
            {'A': pd.NaT}, Timestamp('20130104', tz='US/Eastern'))
        assert_frame_equal(result, expected)

        # coerce to object
        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace(
            {'A': pd.NaT}, Timestamp('20130104', tz='US/Pacific'))
        expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                    Timestamp('20130104', tz='US/Pacific'),
                                    Timestamp('20130103', tz='US/Eastern')],
                              'B': [0, np.nan, 2]})
        assert_frame_equal(result, expected)

        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace({'A': np.nan}, Timestamp('20130104'))
        expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'),
                                    Timestamp('20130104'),
                                    Timestamp('20130103', tz='US/Eastern')],
                              'B': [0, np.nan, 2]})
        assert_frame_equal(result, expected)
Example #26
    def test_na_actions_categorical(self):

        cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
        vals = ["a", "b", np.nan, "d"]
        df = DataFrame({"cats": cat, "vals": vals})
        cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
        vals2 = ["a", "b", "b", "d"]
        df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
        cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
        vals3 = ["a", "b", np.nan]
        df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
        cat4 = Categorical([1, 2], categories=[1, 2, 3])
        vals4 = ["a", "b"]
        df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})

        # fillna
        res = df.fillna(value={"cats": 3, "vals": "b"})
        tm.assert_frame_equal(res, df_exp_fill)

        with pytest.raises(ValueError, match=("fill value must "
                                              "be in categories")):
            df.fillna(value={"cats": 4, "vals": "c"})

        res = df.fillna(method='pad')
        tm.assert_frame_equal(res, df_exp_fill)

        # dropna
        res = df.dropna(subset=["cats"])
        tm.assert_frame_equal(res, df_exp_drop_cats)

        res = df.dropna()
        tm.assert_frame_equal(res, df_exp_drop_all)

        # make sure that fillna takes missing values into account
        c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
        df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})

        cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
        df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})

        res = df.fillna("a")
        tm.assert_frame_equal(res, df_exp)
Example #27
 def _create(self, data):
     dct = self.get_prep_data(data)
     data = get_single_column(data)
     docs = [dct.doc2bow(d) for d in data]
     ids = dct.keys()
     df = DataFrame([dict(row) for row in docs], columns=ids, index=data.index)
     df.columns = ["%s_%s" % (dct[i], data.name) for i in ids]
     df = df.fillna(0)
     if self.bool_:
         df = df.astype(bool).astype(int)
     return df
Example #28
File: text.py  Project: Afey/ramp
 def _apply(self, data, fitted_feature):
     docs = list(data)
     dct = self.dictionary.get_dict(self.context, docs)
     tfidf = self.dictionary.get_tfidf(self.context, docs)
     docs = [dct.doc2bow(d) for d in docs]
     vecs = tfidf[docs]
     df = DataFrame([dict(row) for row in vecs], index=data.index)
     df.columns = ['%s_%s' % (dct[i], data.name) for i in df.columns]
     df = df.fillna(0)
     logging.debug(df)
     return df
Example #29
 def _create(self, data):
     docs = list(data)
     dct = self.dictionary.get_dict(self.context, docs)
     tfidf = self.dictionary.get_tfidf(self.context, docs)
     docs = [dct.doc2bow(d) for d in docs]
     vecs = tfidf[docs]
     df = DataFrame([dict(row) for row in vecs], index=data.index)
     df.columns = ['%s_%s' % (dct[i], data.name) for i in df.columns]
     df = df.fillna(0)
     print df
     return df
Example #30
def merge_dataset(files):
    test = DataFrame(columns = ['file'])
    for csvfile in files:
        if csvfile != '.DS_Store':
            dataframe = pd.read_csv(textdir + csvfile)
            test = pd.concat([test, dataframe], ignore_index = True)
            test = test.fillna(np.float(re.match(r'(.+)\.csv', csvfile).group(1)))
    test = test[
        ['file', 'position', 'number_of_stopwords', 'length', 'highlight_marker', 'sum_of_word_weigth', 'scoreline_s1',
         'scoreline_s2', 'scoreline_s3', 'specific_timestamp', 'similarity1', 'similarity2', 'f_score']]
    return test
Example #31
obj.notnull()
pd.notnull(obj)

obj[obj.notnull()]
obj.dropna()

df = DataFrame([[1,2,3,],[1,NA,NA],[NA,NA,NA],[NA,2,3]])

df.dropna()  # drop a row if it contains any NaN
df.dropna(how = 'all') # drop rows where every value is NaN

df[4] = NA

df.dropna(how = 'all',axis = 1) # drop only columns whose values are all NaN

df.fillna(0)
df[0].fillna(0)

df.fillna({0:0,1:1,2:2,4:4})
df.fillna(0,inplace = True)  # modifies df immediately, in place
df

df = DataFrame([[1,2,5],[NA,NA,4],[3,2,NA],[2,NA,3]])
df.fillna(method='ffill')
df.fillna(method='bfill')



[Problem 139] Print the name and commission of the employees whose commission is null.
emp = pd.read_csv("c:/r/emp.csv",names = ["empid","name","job","mgr","hire_date","sal","comm","deptno"])
Example #32
    def test_replace_datetimetz(self):

        # GH 11326
        # behaving poorly when presented with a datetime64[ns, tz]
        df = DataFrame({
            'A': date_range('20130101', periods=3, tz='US/Eastern'),
            'B': [0, np.nan, 2]
        })
        result = df.replace(np.nan, 1)
        expected = DataFrame({
            'A':
            date_range('20130101', periods=3, tz='US/Eastern'),
            'B':
            Series([0, 1, 2], dtype='float64')
        })
        assert_frame_equal(result, expected)

        result = df.fillna(1)
        assert_frame_equal(result, expected)

        result = df.replace(0, np.nan)
        expected = DataFrame({
            'A':
            date_range('20130101', periods=3, tz='US/Eastern'),
            'B': [np.nan, np.nan, 2]
        })
        assert_frame_equal(result, expected)

        result = df.replace(Timestamp('20130102', tz='US/Eastern'),
                            Timestamp('20130104', tz='US/Eastern'))
        expected = DataFrame({
            'A': [
                Timestamp('20130101', tz='US/Eastern'),
                Timestamp('20130104', tz='US/Eastern'),
                Timestamp('20130103', tz='US/Eastern')
            ],
            'B': [0, np.nan, 2]
        })
        assert_frame_equal(result, expected)

        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace({'A': pd.NaT},
                                Timestamp('20130104', tz='US/Eastern'))
        assert_frame_equal(result, expected)

        # coerce to object
        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace({'A': pd.NaT},
                                Timestamp('20130104', tz='US/Pacific'))
        expected = DataFrame({
            'A': [
                Timestamp('20130101', tz='US/Eastern'),
                Timestamp('20130104', tz='US/Pacific'),
                Timestamp('20130103', tz='US/Eastern')
            ],
            'B': [0, np.nan, 2]
        })
        assert_frame_equal(result, expected)

        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace({'A': np.nan}, Timestamp('20130104'))
        expected = DataFrame({
            'A': [
                Timestamp('20130101', tz='US/Eastern'),
                Timestamp('20130104'),
                Timestamp('20130103', tz='US/Eastern')
            ],
            'B': [0, np.nan, 2]
        })
        assert_frame_equal(result, expected)
Example #33
def timeseries_to_supervised(data, column_name):
    df = DataFrame(data)
    df['output'] = df[column_name]
    df['output'] = df['output'].shift(1)
    df.fillna(0, inplace=True)
    return df
Example #34
    def test_replace_datetimetz(self):

        # GH 11326
        # behaving poorly when presented with a datetime64[ns, tz]
        df = DataFrame(
            {
                "A": date_range("20130101", periods=3, tz="US/Eastern"),
                "B": [0, np.nan, 2],
            }
        )
        result = df.replace(np.nan, 1)
        expected = DataFrame(
            {
                "A": date_range("20130101", periods=3, tz="US/Eastern"),
                "B": Series([0, 1, 2], dtype="float64"),
            }
        )
        tm.assert_frame_equal(result, expected)

        result = df.fillna(1)
        tm.assert_frame_equal(result, expected)

        result = df.replace(0, np.nan)
        expected = DataFrame(
            {
                "A": date_range("20130101", periods=3, tz="US/Eastern"),
                "B": [np.nan, np.nan, 2],
            }
        )
        tm.assert_frame_equal(result, expected)

        result = df.replace(
            Timestamp("20130102", tz="US/Eastern"),
            Timestamp("20130104", tz="US/Eastern"),
        )
        expected = DataFrame(
            {
                "A": [
                    Timestamp("20130101", tz="US/Eastern"),
                    Timestamp("20130104", tz="US/Eastern"),
                    Timestamp("20130103", tz="US/Eastern"),
                ],
                "B": [0, np.nan, 2],
            }
        )
        tm.assert_frame_equal(result, expected)

        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Eastern"))
        tm.assert_frame_equal(result, expected)

        # coerce to object
        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Pacific"))
        expected = DataFrame(
            {
                "A": [
                    Timestamp("20130101", tz="US/Eastern"),
                    Timestamp("20130104", tz="US/Pacific"),
                    Timestamp("20130103", tz="US/Eastern"),
                ],
                "B": [0, np.nan, 2],
            }
        )
        tm.assert_frame_equal(result, expected)

        result = df.copy()
        result.iloc[1, 0] = np.nan
        result = result.replace({"A": np.nan}, Timestamp("20130104"))
        expected = DataFrame(
            {
                "A": [
                    Timestamp("20130101", tz="US/Eastern"),
                    Timestamp("20130104"),
                    Timestamp("20130103", tz="US/Eastern"),
                ],
                "B": [0, np.nan, 2],
            }
        )
        tm.assert_frame_equal(result, expected)
Example #35
    def get_qq_nums(self, user, password, qq_group):
        try:
            #            cf = ConfigParser.ConfigParser()
            #            cf.read('conf.ini')
            # chromedriver = cf.get('main', 'path')
            #            chromedriver = "/Users/Homosum/Downloads/chromedriver"
            #             driver = webdriver.Chrome(self.chromedirverPath)

            driver = webdriver.Firefox()
            driver.get("http://qun.qq.com/member.html#gid={}".format(qq_group))
            IframeElement = driver.find_element_by_name("login_frame")
            driver.switch_to_frame(IframeElement)

            driver.find_element_by_xpath(
                "//*[@id='bottom_qlogin']/a[1]").click()  # 登录界面
            driver.find_element_by_xpath("//*[@id='u']").send_keys(user)
            driver.find_element_by_xpath("//*[@id='p']").send_keys(password)

            driver.find_element_by_xpath(
                "//*[@id='login_button']").click()  # 点击登录
            time.sleep(1.5)

            driver.switch_to_default_content(
            )  # crucial: prevents "TypeError: can't access dead object"
            time.sleep(1.5)
            web_data = driver.page_source
            selector = etree.HTML(web_data)
            try:
                #print('999')
                people_num = selector.xpath(
                    "//*[@id='groupMemberNum']/text()")  # 获取群组人数量
                print('people_num:%s' % people_num)
                if len(people_num) == 0:
                    people_nums = 0
                else:
                    people_nums = int(people_num[0])
#print('777')
            except Exception as e:
                logger.warning("网络问题")
                driver.close()

            count = 1

            logger.info('QQ group member count: %d' % (people_nums))

            for _ in range(int(people_nums / 20)):
                js = "var q=document.documentElement.scrollTop=500000"
                #            js = "var q=document.body.scrollTop=500000"
                driver.execute_script(js)
                time.sleep(2)
                count += 1
#print('666')
            web_data = driver.page_source  # re-fetch the page source
            selector = etree.HTML(web_data)
            people_nicks = selector.xpath(
                "//tbody[@class='list']/tr/td[3]/span/text()")  # 获取昵称
            people_nicks = get_freshList(people_nicks)

            people_names = selector.xpath(
                "//tbody[@class='list']/tr/td[4]/span/text()"
            )  # get group card names
            people_names = get_freshList(people_names)

            people_QQs = selector.xpath(
                "//tbody[@class='list']/tr/td[5]/text()")  # 获取qq号
            people_QQs = get_freshList(people_QQs)

            people_sexs = selector.xpath(
                "//tbody[@class='list']/tr/td[6]/text()")  # 获取性别
            people_sexs = get_freshList(people_sexs)

            people_ages = selector.xpath(
                "//tbody[@class='list']/tr/td[7]/text()")  # 获取Q龄
            people_ages = get_freshList(people_ages)

            people_grades = selector.xpath(
                "//tbody[@class='list']/tr/td[9]/text()")  # 获取活跃度
            people_grades = get_freshList(people_grades)
            #print('555')
            result_array = []
            countS = 0
            #name_ = driver.find_element_by_xpath("//*[@id='groupTit']").text
            name_A = selector.xpath("//*[@id='groupTit']/text()")
            print('name_%s' % name_A)
            name_ = name_A[0]
            logger.info('user: %s, group: %s, members scraped: %d' % (user, name_, len(people_QQs)))
            #print('user: %s, group: %s, members scraped: %d' % (user, name_, len(people_QQs)))
            #print('444')
            for countS in range(len(people_QQs)):
                member = QQ_Member()
                member.name = people_nicks[countS]
                member.sex = people_sexs[countS]
                member.qq_age = people_ages[countS]
                member.num = people_QQs[countS]
                member.source = name_
                dic = classToDict(member)
                result_array.append(dic)

#print('333')
            frame = DataFrame(result_array)
            frame = frame.fillna('NA')  # assign the result; fillna is not in-place by default
            filePath = my_web.qqSavePath
            path = ('%s/%s.csv' % (filePath, qq_group))
            frame.to_csv(path, encoding='utf-8')
            #print('222')
            print('saved %s' % path)
            logger.info('saved %s' % path)
            driver.close()
            #print('111')
            pass
        except Exception as e:
            driver.close()
Example #36
# print(s1.isnull())  # test for NaN
# print(s1.notnull())  # test for non-NaN
# print(s1.dropna())  # drop NaN values

df1 = DataFrame([[1, 2, 3], [np.nan, 5, 6], [7, 8, np.nan],
                 [np.nan, np.nan, np.nan]])  # using NaN in a DataFrame
print(df1)
# print(df1.isnull())
# print(df1.notnull())
df2 = df1.dropna(axis=0)  # drop rows containing NaN
# print(df2)
df3 = df1.dropna(axis=1)  # drop columns containing NaN
# print(df3)
df4 = df1.dropna(axis=0, how='any')  # how='any': drop if any value is NaN
# print(df4)
df5 = df1.dropna(axis=0, how='all')  # how='all': drop only if every value is NaN
# print(df5)

dframe = DataFrame([[1, 2, 3], [np.nan, 5, 6], [7, np.nan, np.nan],
                    [np.nan, np.nan, np.nan]])
# print(dframe)
df6 = dframe.dropna(axis=0, thresh=2)  # thresh=2: keep rows with at least 2 non-NaN values
# print(df6)
# print(dframe.fillna(value=1))  # fill NaN
print(dframe.fillna(value={
    0: 0,
    1: 1,
    2: 2,
    3: 3
}))  # per-column fills: column 0 gets 0, column 1 gets 1, and so on
Example #37
def create_plots(df: pd.DataFrame, output_folder: PathLike, max_combsize: int = 1, xpaxis: str = "budget") -> None:
    """Saves all representing plots to the provided folder

    Parameters
    ----------
    df: pd.DataFrame
        the experiment data
    output_folder: PathLike
        path of the folder where the plots should be saved
    max_combsize: int
        maximum number of parameters to fix (combinations) when creating experiment plots
    xpaxis: str
        x-axis for xp plots (either budget or pseudotime)
    """
    assert xpaxis in ["budget", "pseudotime"]
    df = remove_errors(df)
    df.loc[:, "loss"] = pd.to_numeric(df.loc[:, "loss"])
    df = tools.Selector(df.fillna("N-A"))  # remove NaN in non score values
    assert not any("Unnamed: " in x for x in df.columns), f"Remove the unnamed index column:  {df.columns}"
    assert "error " not in df.columns, f"Remove error rows before plotting"
    required = {"optimizer_name", "budget", "loss", "elapsed_time", "elapsed_budget"}
    missing = required - set(df.columns)
    assert not missing, f"Missing fields: {missing}"
    output_folder = Path(output_folder)
    os.makedirs(output_folder, exist_ok=True)
    # check which descriptors do vary
    descriptors = sorted(set(df.columns) - (required | {"seed", "pseudotime"}))  # all other columns are descriptors
    to_drop = [x for x in descriptors if len(df.unique(x)) == 1]
    df = tools.Selector(df.loc[:, [x for x in df.columns if x not in to_drop]])
    descriptors = sorted(set(df.columns) - (required | {"seed", "pseudotime"}))  # now those should be actual interesting descriptors
    print(f"Descriptors: {descriptors}")
    print("# Fight plots")
    #
    # fight plot
    # choice of the combination variables to fix
    fight_descriptors = descriptors + ["budget"]  # budget can be used as a descriptor for fight plots
    combinable = [x for x in fight_descriptors if len(df.unique(x)) > 1]  # should be all now
    num_rows = 6
    for fixed in list(itertools.chain.from_iterable(itertools.combinations(combinable, order) for order in range(max_combsize + 1))):
        # choice of the cases with values for the fixed variables
        for case in df.unique(fixed):
            print("\n# new case #", fixed, case)
            casedf = df.select(**dict(zip(fixed, case)))
            data_df = FightPlotter.winrates_from_selection(casedf, fight_descriptors, num_rows=num_rows)
            fplotter = FightPlotter(data_df)
            # save
            name = "fight_" + ",".join("{}{}".format(x, y) for x, y in zip(fixed, case)) + ".png"
            name = "fight_all.png" if name == "fight_.png" else name
            fplotter.save(str(output_folder / name), dpi=_DPI)
    plt.close("all")
    #
    # xp plots
    # plot mean loss / budget for each optimizer for 1 context
    print("# Xp plots")
    name_style = NameStyle()  # keep the same style for each algorithm
    cases = df.unique(descriptors)
    for case in cases:
        subdf = df.select_and_drop(**dict(zip(descriptors, case)))
        description = ",".join("{}:{}".format(x, y) for x, y in zip(descriptors, case))
        out_filepath = output_folder / "xpresults{}{}.png".format("_" if description else "", description.replace(":", ""))
        data = XpPlotter.make_data(subdf)
        xpplotter = XpPlotter(data, title=description, name_style=name_style, xaxis=xpaxis)
        xpplotter.save(out_filepath)
    plt.close("all")
Example #38
def get_return(secID):
    # NOTE: the function header and data-loading body were lost in the source
    # scrape; this def line is reconstructed from the call `get_return(secID)`
    # below, and df is assumed to hold the security's price history.
    temp = df['Close'].astype('float64').pct_change().fillna(0.)
    return temp


secIDs = [
    '000300.ZICN', '000905.ZICN', '399006.ZICN', 'SPX.ZIUS', '000012.ZICN',
    '000013.ZICN'
]
rtn_table = DataFrame()

for secID in secIDs:
    cp = get_return(secID)
    cp.name = secID
    rtn_table = pd.concat([rtn_table, cp], axis=1)

rtn_table.fillna(0, inplace=True)

#rtn_table.head(5)

#rtn_table.mean()*250

#rtn_table.corr()
print(rtn_table.mean() * 250)
print(rtn_table.corr())

print("*************************************************")

from cvxopt import matrix, solvers

portfolio1 = [0, 1, 2, 4, 5]
portfolio2 = range(6)
Example #39
def create_head_traces(dict):
    heads = DataFrame(dict, columns=get_names())
    heads = heads.fillna(0)
    return heads
Example #40
def create_plots(
    df: pd.DataFrame,
    output_folder: tp.PathLike,
    max_combsize: int = 1,
    xpaxis: str = "budget",
    competencemaps: bool = False,
) -> None:
    """Saves all representing plots to the provided folder

    Parameters
    ----------
    df: pd.DataFrame
        the experiment data
    output_folder: PathLike
        path of the folder where the plots should be saved
    max_combsize: int
        maximum number of parameters to fix (combinations) when creating experiment plots
    xpaxis: str
        x-axis for xp plots (either budget or pseudotime)
    """
    assert xpaxis in ["budget", "pseudotime"]
    df = remove_errors(df)
    df.loc[:, "loss"] = pd.to_numeric(df.loc[:, "loss"])
    # If we have a descriptor "instrum_str",
    # we assume that it describes the instrumentation as a string,
    # that we should include the various instrumentations as distinct curves in the same plot.
    # So we concat it at the end of the optimizer name, and we remove "parametrization"
    # from the descriptor.
    if "instrum_str" in set(df.columns):
        df.loc[:, "optimizer_name"] = (
            df.loc[:, "optimizer_name"] + df.loc[:, "instrum_str"])
        df = df.drop(columns="instrum_str")
        df = df.drop(columns="dimension")
        if "parametrization" in set(df.columns):
            df = df.drop(columns="parametrization")
    df = utils.Selector(df.fillna("N-A"))  # remove NaN in non score values
    assert not any(
        "Unnamed: " in x
        for x in df.columns), f"Remove the unnamed index column:  {df.columns}"
    assert "error " not in df.columns, f"Remove error rows before plotting"
    required = {
        "optimizer_name", "budget", "loss", "elapsed_time", "elapsed_budget"
    }
    missing = required - set(df.columns)
    assert not missing, f"Missing fields: {missing}"
    output_folder = Path(output_folder)
    os.makedirs(output_folder, exist_ok=True)
    # check which descriptors do vary
    descriptors = sorted(
        set(df.columns) - (required | {"instrum_str", "seed", "pseudotime"
                                       }))  # all other columns are descriptors
    to_drop = [x for x in descriptors if len(df.unique(x)) == 1]
    df = utils.Selector(df.loc[:, [x for x in df.columns if x not in to_drop]])
    # now those should be actual interesting descriptors
    all_descriptors = sorted(
        set(df.columns) - (required | {"instrum_str", "seed", "pseudotime"}))
    print(f"Descriptors: {all_descriptors}")
    print("# Fight plots")
    #
    # fight plot
    # choice of the combination variables to fix
    fight_descriptors = all_descriptors + [
        "budget"
    ]  # budget can be used as a descriptor for fight plots
    combinable = [x for x in fight_descriptors
                  if len(df.unique(x)) > 1]  # should be all now
    # We remove descriptors which have only one value for each budget.
    descriptors = []
    for d in all_descriptors:
        acceptable = False
        for b in df.budget.unique():
            if len(df.loc[df["budget"] == b][d].unique()) > 1:
                acceptable = True
                break
        if acceptable:
            descriptors += [d]
    num_rows = 6

    # For the competence map case we must consider pairs of attributes, hence maxcomb_size >= 2.
    # A competence map shows for each value of each of two attributes which algorithm was best.
    if competencemaps:
        max_combsize = max(max_combsize, 2)
    for fixed in list(
            itertools.chain.from_iterable(
                itertools.combinations(combinable, order)
                for order in range(max_combsize + 1))):
        orders = [len(c) for c in df.unique(fixed)]
        if orders:
            assert min(orders) == max(orders)
            order = min(orders)
        else:
            order = 0
        best_algo: tp.List[tp.List[str]] = []
        if competencemaps and order == 2:  # With order 2 we can create a competence map.
            print("\n#trying to competence-map")
            if all([len(c) > 1 for c in df.unique(fixed)
                    ]):  # Let us try if data are adapted to competence maps.
                # This is not always the case, as some attribute1/value1 + attribute2/value2 might be empty
                # (typically when attribute1 and attribute2 are correlated).
                try:
                    xindices = sorted(set(c[0] for c in df.unique(fixed)))
                except TypeError:
                    xindices = list(set(c[0] for c in df.unique(fixed)))
                try:
                    yindices = sorted(set(c[1] for c in df.unique(fixed)))
                except TypeError:
                    yindices = list(set(c[1] for c in df.unique(fixed)))
                for _ in range(len(xindices)):
                    best_algo += [[]]
                for i in range(len(xindices)):
                    for _ in range(len(yindices)):
                        best_algo[i] += ["none"]

        # Let us loop over all combinations of variables.
        for case in df.unique(fixed) if fixed else [()]:
            print("\n# new case #", fixed, case)
            casedf = df.select(**dict(zip(fixed, case)))
            data_df = FightPlotter.winrates_from_selection(casedf,
                                                           fight_descriptors,
                                                           num_rows=num_rows)
            fplotter = FightPlotter(data_df)
            # Competence maps: we find out the best algorithm for each attribute1=valuei/attribute2=valuej.
            if order == 2 and competencemaps and best_algo:
                print("\n#storing data for competence-map")
                best_algo[xindices.index(case[0])][yindices.index(
                    case[1])] = fplotter.winrates.index[0]
            # save
            name = "fight_" + ",".join("{}{}".format(x, y)
                                       for x, y in zip(fixed, case)) + ".png"
            name = "fight_all.png" if name == "fight_.png" else name

            if name == "fight_all.png":
                with open(str(output_folder / name) + ".cp.txt", "w") as f:
                    f.write("ranking:\n")
                    for i, algo in enumerate(data_df.columns[:8]):
                        f.write(f"  algo {i}: {algo}\n")
            if len(name) > 240:
                hashcode = hashlib.md5(bytes(name, "utf8")).hexdigest()
                name = re.sub(r"\([^()]*\)", "", name)
                mid = 120
                name = name[:mid] + hashcode + name[-mid:]
            fplotter.save(str(output_folder / name), dpi=_DPI)

            if order == 2 and competencemaps and best_algo:  # With order 2 we can create a competence map.
                print("\n# Competence map")
                name = "competencemap_" + ",".join("{}".format(x)
                                                   for x in fixed) + ".tex"
                export_table(str(output_folder / name), xindices, yindices,
                             best_algo)
                print("Competence map data:", fixed, case, best_algo)

    plt.close("all")
    # xp plots: for each experimental setup, we plot curves with budget in x-axis.
    # plot mean loss / budget for each optimizer for 1 context
    print("# Xp plots")
    name_style = NameStyle()  # keep the same style for each algorithm
    cases = df.unique(descriptors)
    if not cases:
        cases = [()]
    # Average normalized plot with everything.
    out_filepath = output_folder / "xpresults_all.png"
    data = XpPlotter.make_data(df, normalized_loss=True)
    xpplotter = XpPlotter(data,
                          title=os.path.basename(output_folder),
                          name_style=name_style,
                          xaxis=xpaxis)
    xpplotter.save(out_filepath)
    # Now one xp plot per case.
    for case in cases:
        subdf = df.select_and_drop(**dict(zip(descriptors, case)))
        description = ",".join("{}:{}".format(x, y)
                               for x, y in zip(descriptors, case))
        if len(description) > 280:
            hash_ = hashlib.md5(bytes(description, "utf8")).hexdigest()
            description = description[:140] + hash_ + description[-140:]
        out_filepath = output_folder / "xpresults{}{}.png".format(
            "_" if description else "", description.replace(":", ""))
        data = XpPlotter.make_data(subdf)
        try:
            xpplotter = XpPlotter(data,
                                  title=description,
                                  name_style=name_style,
                                  xaxis=xpaxis)
        except Exception as e:  # pylint: disable=broad-except
            warnings.warn(f"Bypassing error in xpplotter:\n{e}",
                          RuntimeWarning)
        else:
            xpplotter.save(out_filepath)
    plt.close("all")
Example #41
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from numpy.random import randn
df1 = pd.read_clipboard()  # read a table from the clipboard
df1.sum()  # column sums
df1.sum(axis=1)  # row sums
df1.max()  # largest value in each column
df1.idxmax()  # index of the max in each column
df1.cumsum()  # running sum, top to bottom:
# row2 = r1 + r2; row3 = r1 + r2 + r3
df1.describe()  # summary statistics

nd = np.nan  # shorthand for NaN
A = [1, 2, 3]
B = [4, nd, 6]
C = [nd, 8, nd]
D = [nd, nd, nd]
df2 = DataFrame([A, B, C, D])
df2.dropna()  # drop any row containing a NaN cell
df2.dropna(how='all')  # drop rows where every cell is NaN
df2.dropna(thresh=1)  # keep rows with at least 1 non-NaN value (drops the all-NaN row)
df2.fillna(100)  # replace NaN cells with 100
df2.fillna(24, inplace=True)  # replace NaN cells with 24, in place
df2
Example #42
    df1.ix[row, column_name] = 1
    # add the first 3 digits of the phone number
    column_name = 'head' + '_' + head
    df1.ix[row, column_name] = 1
    # add the last digit of the phone number
    column_name = 'tail' + '_' + tail
    df1.ix[row, column_name] = 1

    if subroots != "\N":
        keyword_list = subroots.split(unicode(';', 'utf-8'))
        for element in keyword_list:
            column_name = element
            df2.ix[row, column_name] = 1
fin.close()

df1.fillna(0, inplace=True)  # default to 0 rather than NaN
df2.fillna(0, inplace=True)  # default to 0 rather than NaN

columns_path = os.path.join(
    os.path.split(os.path.realpath(__file__))[0], "column_names1.txt")
columns_fout = open(columns_path, 'w')
for column_name in df1.columns:  # write the column names to a file
    columns_fout.write(column_name + '\n')
columns_fout.close()
A1_path = os.path.join(os.path.split(os.path.realpath(__file__))[0], "A1.txt")
df1.to_csv(A1_path, index=False)  # write the table to a file
# print df1.columns
print df1.shape  # print the table's row/column counts

columns_path = os.path.join(
    os.path.split(os.path.realpath(__file__))[0], "column_names2.txt")
Example #43
df = DataFrame([[1,2,3,],[1,NA,NA],[NA,NA,NA],[NA,2,3]])
df

df.isnull()
pd.isnull(df)

df.dropna()  # drop a row if it contains any NaN
df.dropna(how = 'all')  # drop rows that are entirely NaN

df[4] = NA  # adding a new column fills it with NaN
df

df.dropna(how = 'all', axis = 1)  # drop columns that are entirely NaN


df.fillna(0)  # replace NaN with 0
df[0].fillna(0) # column 0 only

df.fillna({0:0,1:1,2:2,4:4})  # {column: value}, a different fill per column

df.fillna(0, inplace = True)  # inplace = True applies immediately
df


df = DataFrame([[1,2,5],[NA,NA,4],[3,2,NA],[2,NA,3]])
df

df.fillna(method = "ffill")  # fill NaN from the value above
df.ffill()

df.fillna(method = "bfill")  # fill NaN from the value below
Example #44
def ReportGenerator(df: pd.DataFrame,
                    ClusteringVariables: np.array,
                    FillMissingReport=None) -> pd.DataFrame:
    """
    Function generates an easy-reading clustering report. It takes as input:
        DataFrame - dataframe with a predicted cluster column;
        ClusteringVariables - array of the feature names used for clustering
            (flagged in the report's Mark column);
        FillMissingReport - dictionary of rules for filling in missing
        values for the final report generation (not included in modeling);
    in order to run the function the following libraries must be imported:
        import pandas as pd
        import numpy as np
    >>> data = pd.DataFrame()
    >>> data['numbers'] = [1, 2, 3]
    >>> data['col1'] = [0.5, 2.5, 4.5]
    >>> data['col2'] = [100, 200, 300]
    >>> data['col3'] = [10, 20, 30]
    >>> data['Cluster'] = [1, 1, 2]
    >>> ReportGenerator(data, ['col1', 'col2'], 0)
               Features               Type   Mark           1           2
    0    # of Customers        ClusterSize  False    2.000000    1.000000
    1    % of Customers  ClusterProportion  False    0.666667    0.333333
    2              col1    mean_with_zeros   True    1.500000    4.500000
    3              col2    mean_with_zeros   True  150.000000  300.000000
    4           numbers    mean_with_zeros  False    1.500000    3.000000
    ..              ...                ...    ...         ...         ...
    99            dummy                 5%  False    1.000000    1.000000
    100           dummy                95%  False    1.000000    1.000000
    101           dummy              stdev  False    0.000000         NaN
    102           dummy               mode  False    1.000000    1.000000
    103           dummy             median  False    1.000000    1.000000
    <BLANKLINE>
    [104 rows x 5 columns]
    """
    # Fill missing values with given rules
    if FillMissingReport:
        df.fillna(value=FillMissingReport, inplace=True)
    df["dummy"] = 1
    numeric_cols = df.select_dtypes(np.number).columns
    report = (
        df.groupby(["Cluster"])[  # construct report dataframe
            numeric_cols]  # group by cluster number
        .agg([
            ("sum", np.sum),
            ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))),
            ("mean_without_zeros", lambda x: x.replace(0, np.NaN).mean()),
            (
                "mean_25-75",
                lambda x: np.mean(
                    np.nan_to_num(
                        sorted(x)[round(
                            (len(x) * 25 / 100)):round(len(x) * 75 / 100)])),
            ),
            ("mean_with_na", np.mean),
            ("min", lambda x: x.min()),
            ("5%", lambda x: x.quantile(0.05)),
            ("25%", lambda x: x.quantile(0.25)),
            ("50%", lambda x: x.quantile(0.50)),
            ("75%", lambda x: x.quantile(0.75)),
            ("95%", lambda x: x.quantile(0.95)),
            ("max", lambda x: x.max()),
            ("count", lambda x: x.count()),
            ("stdev", lambda x: x.std()),
            ("mode", lambda x: x.mode()[0]),
            ("median", lambda x: x.median()),
            ("# > 0", lambda x: (x > 0).sum()),
        ]).T.reset_index().rename(index=str,
                                  columns={
                                      "level_0": "Features",
                                      "level_1": "Type"
                                  }))  # rename columns
    # calculate the size of cluster(count of clientID's)
    clustersize = report[(report["Features"] == "dummy") & (
        report["Type"] == "count")].copy()  # avoid SettingWithCopyWarning
    clustersize.Type = (
        "ClusterSize"  # rename created cluster df to match report column names
    )
    clustersize.Features = "# of Customers"
    clusterproportion = pd.DataFrame(clustersize.iloc[:, 2:].values /
                                     clustersize.iloc[:, 2:].values.sum(
                                     )  # calculating the proportion of cluster
                                     )
    clusterproportion[
        "Type"] = "% of Customers"  # rename created cluster df to match report column names
    clusterproportion["Features"] = "ClusterProportion"
    cols = clusterproportion.columns.tolist()
    cols = cols[-2:] + cols[:-2]
    clusterproportion = clusterproportion[
        cols]  # rearrange columns to match report
    clusterproportion.columns = report.columns
    a = pd.DataFrame(
        abs(report[report["Type"] == "count"].iloc[:, 2:].values -
            clustersize.iloc[:, 2:].values)
    )  # generating df with count of nan values
    a["Features"] = 0
    a["Type"] = "# of nan"
    a.Features = report[report["Type"] == "count"].Features.tolist(
    )  # filling values in order to match report
    cols = a.columns.tolist()
    cols = cols[-2:] + cols[:-2]
    a = a[cols]  # rearrange columns to match report
    a.columns = report.columns  # rename columns to match report
    report = report.drop(report[
        report.Type == "count"].index)  # drop count values except cluster size
    report = pd.concat(
        [report, a, clustersize, clusterproportion],
        axis=0)  # concat report with clustert size and nan values
    report["Mark"] = report["Features"].isin(ClusteringVariables)
    cols = report.columns.tolist()
    cols = cols[0:2] + cols[-1:] + cols[2:-1]
    report = report[cols]
    sorter1 = {
        "ClusterSize": 9,
        "ClusterProportion": 8,
        "mean_with_zeros": 7,
        "mean_with_na": 6,
        "max": 5,
        "50%": 4,
        "min": 3,
        "25%": 2,
        "75%": 1,
        "# of nan": 0,
        "# > 0": -1,
        "sum_with_na": -2,
    }
    report = (report.assign(
        Sorter1=lambda x: x.Type.map(sorter1),
        Sorter2=lambda x: list(reversed(range(len(x)))),
    ).sort_values(["Sorter1", "Mark", "Sorter2"],
                  ascending=False).drop(["Sorter1", "Sorter2"], axis=1))
    report.columns.name = ""
    report = report.reset_index()
    report.drop(columns=["index"], inplace=True)
    return report
Beispiel #45
0
    def test_fillna_integer_limit(self, type):
        df = DataFrame(np.random.randn(10, 4)).astype(type)

        msg = "Limit must be an integer"
        with pytest.raises(ValueError, match=msg):
            df.fillna(0, limit=0.5)
Beispiel #46
0
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from pandas import DataFrame


#reading data
kingcnt = pd.read_csv("data/King_County_House_prices_dataset.csv")

df_kingcnt = DataFrame(kingcnt,columns=['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15']) 


#dealing with missing values
df_kingcnt.fillna({'waterfront':0, 'view':0}, inplace=True)
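
# Added sanity check (not part of the original script): the dict-based fillna
# above should leave no NaN in the two imputed columns.
assert df_kingcnt[['waterfront', 'view']].isnull().sum().sum() == 0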


#splitting data
print("-----  Splitting the data in train and test ----")
train, test = train_test_split(df_kingcnt, test_size=0.33, random_state=42)


#cleaning training data
train = train[train.bedrooms != 33]
train = train[train.sqft_living < 12000]
train = train[train.sqft_lot < 1100000]
train = train[train.sqft_above < 9000]
train = train[train.sqft_lot15 < 500000]

Beispiel #47
0
print(df.isnull())
print(df.notnull())

# ② Handling missing data
# Options: fill in the data, drop the affected rows, or leave it as-is
# 1. Dropping rows: dropna
newDf = df.dropna()  # drop rows that contain NaN
print(newDf)
print(len(newDf))  # number of rows
print(newDf.columns)  # Index holding the column names
newDf = df.dropna(how='all')  # a row is dropped only when every column in it is NaN
print(newDf)
print(df.dropna(axis=1))  # drop by column
print(df.dropna(how='all', axis=1))  # drop columns that are entirely NaN
# 2. Filling in data: fillna
print(df.fillna('?'))
df.at[0, '数分'] = None
print(df.fillna(method='pad'))  # fill with the previous value in the column; stays NaN when there is no preceding row
print(df.fillna(method='bfill'))  # fill with the next value in the column; stays NaN when there is no following row
# Replace NaN with the mean or another statistic
print(df.fillna(df.mean()))  # fill each column's NaN with that column's mean
print(df.fillna(df.mean()['高代':'解几']))  # fill only the columns from '高代' through '解几', each with its own mean
# Different fill values for different columns
print(df.fillna({'数分': 100, '高代': 0}))  # columns not listed are left unchanged
# strip()/lstrip()/rstrip(): remove the given characters (whitespace by default) from both/left/right ends of string data
df2 = DataFrame({
    'age':
    Series([26, 34, 27, 34, 88, 21, 27]),
    'name':
    Series([' Tom', 'Lee ', ' Jon', ' Lee', 'James ', 'Curry ', ' Curryy'])
})
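
# Completing the thought above (added sketch): the str accessor applies strip()
# element-wise to clean the padded names.
df2['name'] = df2['name'].str.strip()  # ' Tom' -> 'Tom', 'Lee ' -> 'Lee'
print(df2)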
Beispiel #48
0
def transform(df: pd.DataFrame,
              year: int,
              fillgps: bool = False,
              naninvalid: bool = False,
              dropnan: bool = False,
              masknan: float = None,
              fillnan: float = None,
              aqsnumerical: bool = False,
              sites=[]) -> pd.DataFrame:

    if len(sites) > 0:
        df.drop(df[~df['AQS_Code'].isin(list(sites.keys()))].index,
                inplace=True)

    # This is probably not needed anymore after the Data_structure_3 changes (level3_data)
    if naninvalid:
        if year < 2014:
            val = 'VAL'
        if year >= 2014:
            val = 'K'

        # .loc avoids chained assignment, which would silently write to a copy
        df.loc[df['nox_flag'] != val, 'nox_flag'] = np.nan
        df.loc[df['no_flag'] != val, 'no_flag'] = np.nan
        df.loc[df['no2_flag'] != val, 'no2_flag'] = np.nan
        df.loc[df['o3_flag'] != val, 'o3_flag'] = np.nan

    # This is probably not needed anymore after the Data_structure_3 changes (level3_data)
    if fillgps:
        unique = df['AQS_Code'].unique()
        for site in HOUSTON:
            if site in unique:
                df.loc[df['AQS_Code'] == site, 'Longitude'] = HOUSTON[site]['Longitude']
                df.loc[df['AQS_Code'] == site, 'Latitude'] = HOUSTON[site]['Latitude']

    if dropnan:
        if year < 2014:
            val = 'VAL'
        if year >= 2014:
            val = 'K'

        df.dropna(inplace=True)

    if aqsnumerical:
        df['AQS_Code'] = df['AQS_Code'].str.replace('_', '')  # assign the result; str.replace is not in-place
        df['AQS_Code'] = df['AQS_Code'].astype(int)

    df['wind_x_dir'] = df['windspd'] * np.cos(df['winddir'] * (np.pi / 180))
    df['wind_y_dir'] = df['windspd'] * np.sin(df['winddir'] * (np.pi / 180))
    df['hour'] = pd.to_datetime(df['epoch'], unit='s').dt.hour
    df['day_of_year'] = pd.Series(pd.to_datetime(df['epoch'], unit='s'))
    df['day_of_year'] = df['day_of_year'].dt.dayofyear

    if masknan is not None:
        s = df['AQS_Code']
        df[df.isnull().any(axis=1)] = 1000
        df['AQS_Code'] = s
    elif fillnan is not None:
        df.fillna(fillnan, inplace=True)

    return df
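
# A minimal usage sketch (added; synthetic one-row input, values illustrative only):
_demo = pd.DataFrame({'AQS_Code': ['48_201_1034'],
                      'windspd': [3.2], 'winddir': [90.0],
                      'epoch': [1420070400]})
_demo = transform(_demo, year=2015)
print(_demo[['wind_x_dir', 'wind_y_dir', 'hour', 'day_of_year']])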
Beispiel #49
0
    def test_fillna(self):
        tf = self.tsframe
        tf.loc[tf.index[:5], 'A'] = nan
        tf.loc[tf.index[-5:], 'A'] = nan

        zero_filled = self.tsframe.fillna(0)
        assert (zero_filled.loc[zero_filled.index[:5], 'A'] == 0).all()

        padded = self.tsframe.fillna(method='pad')
        assert np.isnan(padded.loc[padded.index[:5], 'A']).all()
        assert (padded.loc[padded.index[-5:], 'A'] ==
                padded.loc[padded.index[-5], 'A']).all()

        # mixed type
        mf = self.mixed_frame
        mf.loc[mf.index[5:20], 'foo'] = nan
        mf.loc[mf.index[-10:], 'A'] = nan
        result = self.mixed_frame.fillna(value=0)
        result = self.mixed_frame.fillna(method='pad')

        pytest.raises(ValueError, self.tsframe.fillna)
        pytest.raises(ValueError, self.tsframe.fillna, 5, method='ffill')

        # mixed numeric (but no float16)
        mf = self.mixed_float.reindex(columns=['A', 'B', 'D'])
        mf.loc[mf.index[-10:], 'A'] = nan
        result = mf.fillna(value=0)
        _check_mixed_float(result, dtype=dict(C=None))

        result = mf.fillna(method='pad')
        _check_mixed_float(result, dtype=dict(C=None))

        # empty frame (GH #2778)
        df = DataFrame(columns=['x'])
        for m in ['pad', 'backfill']:
            df.x.fillna(method=m, inplace=True)
            df.x.fillna(method=m)

        # with different dtype (GH3386)
        df = DataFrame([['a', 'a', np.nan, 'a'], [
            'b', 'b', np.nan, 'b'], ['c', 'c', np.nan, 'c']])

        result = df.fillna({2: 'foo'})
        expected = DataFrame([['a', 'a', 'foo', 'a'],
                              ['b', 'b', 'foo', 'b'],
                              ['c', 'c', 'foo', 'c']])
        assert_frame_equal(result, expected)

        df.fillna({2: 'foo'}, inplace=True)
        assert_frame_equal(df, expected)

        # limit and value
        df = DataFrame(np.random.randn(10, 3))
        df.iloc[2:7, 0] = np.nan
        df.iloc[3:5, 2] = np.nan

        expected = df.copy()
        expected.iloc[2, 0] = 999
        expected.iloc[3, 2] = 999
        result = df.fillna(999, limit=1)
        assert_frame_equal(result, expected)

        # with datelike
        # GH 6344
        df = DataFrame({
            'Date': [pd.NaT, Timestamp("2014-1-1")],
            'Date2': [Timestamp("2013-1-1"), pd.NaT]
        })

        expected = df.copy()
        expected['Date'] = expected['Date'].fillna(
            df.loc[df.index[0], 'Date2'])
        result = df.fillna(value={'Date': df['Date2']})
        assert_frame_equal(result, expected)

        # with timezone
        # GH 15855
        df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
                                 pd.NaT]})
        exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
                                  pd.Timestamp('2012-11-11 00:00:00+01:00')]})
        assert_frame_equal(df.fillna(method='pad'), exp)

        df = pd.DataFrame({'A': [pd.NaT,
                                 pd.Timestamp('2012-11-11 00:00:00+01:00')]})
        exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
                                  pd.Timestamp('2012-11-11 00:00:00+01:00')]})
        assert_frame_equal(df.fillna(method='bfill'), exp)
Beispiel #50
0
    def test_fillna_downcast_dict(self):
        # GH#40809
        df = DataFrame({"col1": [1, np.nan]})
        result = df.fillna({"col1": 2}, downcast={"col1": "int64"})
        expected = DataFrame({"col1": [1, 2]})
        tm.assert_frame_equal(result, expected)
Beispiel #51
0
    def test_fillna_positive_limit(self, type):
        df = DataFrame(np.random.randn(10, 4)).astype(type)

        msg = "Limit must be greater than 0"
        with pytest.raises(ValueError, match=msg):
            df.fillna(0, limit=-5)
Beispiel #52
0
    def test_fillna_col_reordering(self):
        cols = ["COL." + str(i) for i in range(5, 0, -1)]
        data = np.random.rand(20, 5)
        df = DataFrame(index=lrange(20), columns=cols, data=data)
        filled = df.fillna(method='ffill')
        assert df.columns.tolist() == filled.columns.tolist()
Beispiel #53
0
    tmp_df = daily_df.filter(['movieNm', 'audiCnt'])

    daily_rank_df = tmp_df.rename(index=tmp_df['movieNm'],
                                  columns={'audiCnt': yesterday_str})

    daily_rank_df.drop('movieNm', axis=1, inplace=True)

    daily_rank_df[yesterday_str] = daily_rank_df[yesterday_str].astype(int)

    df = pd.merge(df,
                  daily_rank_df,
                  left_index=True,
                  right_index=True,
                  how='outer')

final_df = df.fillna(0)  # fill missing values with 0

yesterday = dt.datetime.now() + dt.timedelta(days=-1)
yesterday_str = yesterday.strftime('%Y%m%d')

final_df = final_df.sort_values(yesterday_str, ascending=False).head(5)

pyplot.rcParams['font.family'] = 'Malgun Gothic'
pyplot.rcParams['font.size'] = 17
pyplot.rcParams['figure.figsize'] = (20, 10)

final_df.plot.bar(rot=45)
pyplot.title('날짜별 관람객 빈도')  # "audience counts by date"
pyplot.show()

final_df.T.plot(rot=45)
Beispiel #54
0
def _extract_weather_power_plants(_dataset):

    _dataset_list = []
    df_average = DataFrame()
    df_average['Global_tavg'] = ""
    df_average['Global_tmin'] = ""
    df_average['Global_tmax'] = ""
    df_average['Global_prcp'] = ""
    df_average['Global_snow'] = ""
    df_average['Global_wdir'] = ""
    df_average['Global_wspd'] = ""
    df_average['Global_wpgt'] = ""
    df_average['Global_pres'] = ""
    df_average['Global_tsun'] = ""

    counter = 0

    for i in range(len(_dataset)):
        print(i)
        latitude = _dataset.at[i, 'latitude']
        longitude = _dataset.at[i, 'longitude']
        name = _dataset.at[i, 'country']

        weather_data = DataFrame(_weather_data_extraction(
            latitude, longitude)).add_prefix(name + "_" + str(i) + "_")
        weather_data.columns = [
            'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt',
            'pres', 'tsun'
        ]

        weather_data = weather_data.fillna(0.0)

        set_option('display.max_columns', 50)

        if i == 0:
            # seed every field, including tavg, so the division below yields true means
            df_average['Global_tavg'] = weather_data['tavg']
            df_average['Global_tmin'] = weather_data['tmin']
            df_average['Global_tmax'] = weather_data['tmax']
            df_average['Global_prcp'] = weather_data['prcp']
            df_average['Global_snow'] = weather_data['snow']
            df_average['Global_wdir'] = weather_data['wdir']
            df_average['Global_wspd'] = weather_data['wspd']
            df_average['Global_wpgt'] = weather_data['wpgt']
            df_average['Global_pres'] = weather_data['pres']
            df_average['Global_tsun'] = weather_data['tsun']
        else:
            df_average['Global_tavg'] += weather_data['tavg']
            df_average['Global_tmin'] += weather_data['tmin']
            df_average['Global_tmax'] += weather_data['tmax']
            df_average['Global_prcp'] += weather_data['prcp']
            df_average['Global_snow'] += weather_data['snow']
            df_average['Global_wdir'] += weather_data['wdir']
            df_average['Global_wspd'] += weather_data['wspd']
            df_average['Global_wpgt'] += weather_data['wpgt']
            df_average['Global_pres'] += weather_data['pres']
            df_average['Global_tsun'] += weather_data['tsun']

        # print(weather_data['tavg'])
        # print("avg")
        # print(df_average)
        # print("---")

        counter += 1

        # print(weather_data)
        # _dataset_list.append(weather_data)

    df_average['Global_tavg'] = df_average['Global_tavg'] / counter
    df_average['Global_tmin'] = df_average['Global_tmin'] / counter
    df_average['Global_tmax'] = df_average['Global_tmax'] / counter
    df_average['Global_prcp'] = df_average['Global_prcp'] / counter
    df_average['Global_snow'] = df_average['Global_snow'] / counter
    df_average['Global_wdir'] = df_average['Global_wdir'] / counter
    df_average['Global_wspd'] = df_average['Global_wspd'] / counter
    df_average['Global_wpgt'] = df_average['Global_wpgt'] / counter
    df_average['Global_pres'] = df_average['Global_pres'] / counter
    df_average['Global_tsun'] = df_average['Global_tsun'] / counter

    #return concat(_dataset_list, join='inner', axis=1)
    return df_average
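
# An equivalent, shorter sketch (added): collect the per-site frames in a list
# and let pandas average them by timestamp. Assumes every frame shares the same
# index and the same ten weather columns.
def _average_weather(frames):
    stacked = concat(frames, keys=range(len(frames)))  # MultiIndex (site, timestamp)
    return stacked.groupby(level=1).mean().add_prefix('Global_')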
Beispiel #55
0
# Compare
salary.dropna().mean()
# with
salary.fillna(0).mean()

# When you have a DataFrame things get a bit messier - do you want to drop all the rows with NAs or just some of them?
salary = DataFrame({'salary': [53215, 112454, 22365, np.nan, 30493, None],
                    'grade': [5, 7, None, np.nan, 2, 9]},
                   index=['Margaret', 'Stephen', 'Joanne', 'Joe', 'Matthew', 'Nelson'])

# Drop NA drops all rows with NAs - this permits complete case analysis
salary.dropna()

#3 - Wrangling ******************************************************************

#Impute with column mean
salary.fillna(salary.mean())
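
# Added variant (not in the original notes): the median is more robust than the
# mean when a few large salaries skew the distribution.
salary.fillna(salary.median())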

# Create a design matrix X with 5 columns, the first of which is all 1 the subsequent ones are x_1, x_2, etc
X = np.array(([1]*n,x_1,x_2,x_3,x_4)).T
              
# The below code separates out the data set into two parts. The first part, a DataFrame called X, should contain 
# just the covariate information (i.e. the first 191 columns) standardised to have 
# mean 0 and standard deviation 1. The second part, called y, should contain just
# the RockOrNot variable.
X = (music.drop('RockOrNot',axis=1) - music.drop('RockOrNot',axis=1).mean())/music.drop('RockOrNot',axis=1).std()
y = music.RockOrNot
# Use corrwith and np.where to find which variable has the biggest correlation with y
cors = X.corrwith(y)
np.where(cors==max(cors))

 # So we now want each row to contain their rating and all their details
Beispiel #56
0
def prepare_data(test, traces, options):

    std_out('Preparing data for plot')

    # Dataframe to return
    df = DataFrame()

    # Check if there are different subplots
    n_subplots = 1

    for trace in traces:
        if 'subplot' in traces[trace].keys():
            n_subplots = max(n_subplots, traces[trace]['subplot'])
        else:
            std_out(f'Trace {trace} not assigned to subplot. Skipping',
                    'WARNING')

    std_out(f'Making {n_subplots} subplots')

    # Generate list of subplots
    subplots = [[] for x in range(n_subplots)]

    # Put data in the df
    for trace in traces.keys():

        if 'subplot' not in traces[trace].keys():
            std_out(
                f'The trace {trace} was not placed in any subplot. Assuming subplot #1',
                'WARNING')
            traces[trace]['subplot'] = 1

        ndevs = traces[trace]['devices']
        nchans = traces[trace]['channel']

        # Make them lists always
        if ndevs == 'all': devices = list(test.devices.keys())
        elif type(ndevs) == str or type(ndevs) == int: devices = [ndevs]
        else: devices = ndevs

        for device in devices:

            ndev = str(device)

            # Make them lists always
            if nchans == 'all':
                channels = list(test.devices[ndev].readings.columns)
            elif type(nchans) == str:
                channels = [nchans]
            else:
                channels = nchans

            for channel in channels:
                # Check if device is in columns
                if channel not in test.devices[ndev].readings.columns:
                    std_out(
                        f'The device {ndev} does not contain {channel}. Ignoring',
                        'WARNING')
                    continue

                # Put channel in subplots
                subplots[traces[trace]['subplot'] - 1].append(channel + '_' +
                                                              ndev)

                column_orig = [channel]
                columns_add = [channel + '_' + ndev]

                # Add filtering name to dfdev
                if 'filter' in traces[trace]:
                    col_name = traces[trace]['filter']['col']

                    if col_name not in test.devices[ndev].readings.columns:
                        std_out(
                            f'Column {col_name} not in dataframe. Ignoring filtering',
                            'WARNING')
                    else:
                        column_orig.append(col_name)
                        columns_add.append(col_name)

                # Device dataframe
                dfdev = DataFrame(
                    test.devices[ndev].readings[column_orig].values,
                    columns=columns_add,
                    index=test.devices[ndev].readings.index)

                # Add filtering function
                if 'filter' in traces[trace]:
                    value = traces[trace]['filter']['value']
                    relationship = traces[trace]['filter']['relationship']

                    if col_name in dfdev.columns:
                        # assign the filtered frame back; a bare .loc expression has no effect
                        if relationship == '==':
                            dfdev = dfdev.loc[dfdev[col_name] == value]
                        elif relationship == '<=':
                            dfdev = dfdev.loc[dfdev[col_name] <= value]
                        elif relationship == '>=':
                            dfdev = dfdev.loc[dfdev[col_name] >= value]
                        elif relationship == '<':
                            dfdev = dfdev.loc[dfdev[col_name] < value]
                        elif relationship == '>':
                            dfdev = dfdev.loc[dfdev[col_name] > value]
                        else:
                            std_out(
                                f"Not valid relationship. Valid options: '==', '<=', '>=', '<', '>'",
                                'ERROR')
                            continue
                        # Remove column for filtering from dfdev
                        dfdev.drop(columns=[col_name], inplace=True)

                # Combine it in the df
                df = df.combine_first(dfdev)

        # Add average or other extras
        # TODO Check this to simplify
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.resample.Resampler.aggregate.html
        if 'extras' in traces[trace]:
            for extra in traces[trace]['extras']:

                extra_name = channel + f'-{extra.upper()}'
                sbl = subplots[traces[trace]['subplot'] - 1]

                if extra == 'max':
                    df[extra_name] = df.loc[:, sbl].max(axis=1)

                if extra == 'mean':
                    df[extra_name] = df.loc[:, sbl].mean(axis=1)

                if extra == 'min':
                    df[extra_name] = df.loc[:, sbl].min(axis=1)

                subplots[traces[trace]['subplot'] - 1].append(extra_name)

    # Trim data
    if options['min_date'] is not None: df = df[df.index > options['min_date']]
    if options['max_date'] is not None: df = df[df.index < options['max_date']]

    # Make sure everything is numeric before resampling
    # https://stackoverflow.com/questions/34257069/resampling-pandas-dataframe-is-deleting-column#34270422
    # df = df.apply(to_numeric, errors='coerce')
    df = df.astype(float, errors='ignore')

    # Resample it
    if options['frequency'] is not None:
        std_out(f"Resampling at {options['frequency']}", "INFO")

        if 'resample' in options:

            if options['resample'] == 'max':
                df = df.resample(options['frequency']).max()
            if options['resample'] == 'min':
                df = df.resample(options['frequency']).min()
            if options['resample'] == 'mean':
                df = df.resample(options['frequency']).mean()

        else:
            df = df.resample(options['frequency']).mean()

    # Clean na
    if options['clean_na'] is not None:
        if options['clean_na'] == 'fill':
            df = df.fillna(method='ffill')
        if options['clean_na'] == 'drop':
            df = df.dropna(axis=0, how='any')  # assign back; dropna is not in-place here

    if df.empty: std_out('Dataframe for selected options is empty', 'WARNING')

    return df, subplots
Beispiel #57
0
xLastTimeFeature = xTotalGroup['time'].min().unstack('behavior_type')
xLastTimeFeature.fillna(value = 10, inplace = True)
xLastTimeFeature.sort_index(axis = 1, inplace = True)
xLastTimeFeature = xLastTimeFeature.add_prefix('x_last_time_')
xLastTimeFeature13 = xLastTimeFeature.loc[ : , ['x_last_time_1', 'x_last_time_3']]  # .loc replaces the removed .ix indexer
#time relative
xTimeGroup = u.groupby(['user_id', 'item_id', 'time', 'behavior_type'])
xTimeFeature = xTimeGroup['item_category'].count()
xTimeFeature = xTimeFeature.unstack(['time', 'behavior_type'])
xTimeFeature.sort_index(axis = 1, inplace = True)
xTimeFeature.fillna(value = 0, inplace = True)
# rows where behavior type 3 occurred but behavior type 4 did not
x_last1_3not4 = DataFrame(index = xTimeFeature[0].index)
x_last1_3not4_index = np.logical_and(xTimeFeature[0][4] == 0, xTimeFeature[0][3] != 0)
x_last1_3not4.loc[x_last1_3not4_index, 'x_last1_3not4'] = 1
x_last1_3not4.fillna(0, inplace = True)
# x_last2_3not4 = DataFrame(index = xTimeFeature[0].index)
# x_last2_3not4_index = np.logical_and(xTimeFeature[1][4] == 0, xTimeFeature[1][3] != 0)
# x_last2_3not4.ix[x_last2_3not4_index, 'x_last2_3not4'] = 1
# x_last2_3not4.fillna(0, inplace = True)

#every
xevery2= [(xTimeFeature[i] + xTimeFeature[i + 1]).add_prefix(str(i) + '_') 
            for i in range(0, 10, 2)]
xevery2 = pd.concat(xevery2, axis = 1).add_prefix('x_every2_')
#last
xlast1 = (xTimeFeature[0]).add_prefix('x_last_1')
xlast3 = (xTimeFeature[0] + xTimeFeature[1] + xTimeFeature[2]).add_prefix('x_last_3')
xlast5 = (xTimeFeature[0] + xTimeFeature[1] + xTimeFeature[2] 
            + xTimeFeature[3] + xTimeFeature[4]).add_prefix('x_last_5')
xT = pd.concat([xlast1, xlast3, xlast5], axis = 1)
Beispiel #58
0
    def test_fillna_other(self):
        # empty frame (GH #2778)
        df = DataFrame(columns=['x'])
        for m in ['pad', 'backfill']:
            df.x.fillna(method=m, inplace=True)
            df.x.fillna(method=m)

        # with different dtype (GH3386)
        df = DataFrame([['a', 'a', np.nan, 'a'], ['b', 'b', np.nan, 'b'],
                        ['c', 'c', np.nan, 'c']])

        result = df.fillna({2: 'foo'})
        expected = DataFrame([['a', 'a', 'foo', 'a'], ['b', 'b', 'foo', 'b'],
                              ['c', 'c', 'foo', 'c']])
        assert_frame_equal(result, expected)

        df.fillna({2: 'foo'}, inplace=True)
        assert_frame_equal(df, expected)

        # limit and value
        df = DataFrame(np.random.randn(10, 3))
        df.iloc[2:7, 0] = np.nan
        df.iloc[3:5, 2] = np.nan

        expected = df.copy()
        expected.iloc[2, 0] = 999
        expected.iloc[3, 2] = 999
        result = df.fillna(999, limit=1)
        assert_frame_equal(result, expected)

        # with datelike
        # GH 6344
        df = DataFrame({
            'Date': [pd.NaT, Timestamp("2014-1-1")],
            'Date2': [Timestamp("2013-1-1"), pd.NaT]
        })

        expected = df.copy()
        expected['Date'] = expected['Date'].fillna(df.loc[df.index[0],
                                                          'Date2'])
        result = df.fillna(value={'Date': df['Date2']})
        assert_frame_equal(result, expected)

        # with timezone
        # GH 15855
        df = pd.DataFrame(
            {'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]})
        exp = pd.DataFrame({
            'A': [
                pd.Timestamp('2012-11-11 00:00:00+01:00'),
                pd.Timestamp('2012-11-11 00:00:00+01:00')
            ]
        })
        assert_frame_equal(df.fillna(method='pad'), exp)

        df = pd.DataFrame(
            {'A': [pd.NaT, pd.Timestamp('2012-11-11 00:00:00+01:00')]})
        exp = pd.DataFrame({
            'A': [
                pd.Timestamp('2012-11-11 00:00:00+01:00'),
                pd.Timestamp('2012-11-11 00:00:00+01:00')
            ]
        })
        assert_frame_equal(df.fillna(method='bfill'), exp)

        # with timezone in another column
        # GH 15522
        df = pd.DataFrame({
            'A':
            pd.date_range('20130101', periods=4, tz='US/Eastern'),
            'B': [1, 2, np.nan, np.nan]
        })
        result = df.fillna(method='pad')
        expected = pd.DataFrame({
            'A':
            pd.date_range('20130101', periods=4, tz='US/Eastern'),
            'B': [1., 2., 2., 2.]
        })
        assert_frame_equal(result, expected)
Beispiel #59
0
print(df1.dropna(how='all'))

#Column wise dropping null values
print(df1.dropna(
    axis=1))  # dropna() works the same way whether dropping rows or columns
print()

#Threshold property of dropna() [thresh]
df2 = DataFrame([[1, 2, 3, np.nan], [4, 5, 6, 7], [8, 9, np.nan, np.nan],
                 [12, np.nan, np.nan, np.nan]])
print(
    df2.dropna(thresh=3)
)  # drops every row that has fewer than 3 non-NaN values
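
# Added variant (sketch): thresh combines with axis=1 to keep only columns that
# have at least the given number of non-NaN values.
print(df2.dropna(axis=1, thresh=3))  # keeps only columns with >= 3 real values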

#3. Filling NaN values with chosen numerical values - fillna() function
print(df2.fillna({0: 0, 1: 50, 2: 100, 3: 200}))
print()

#----------- NEXT LECTURE -------------#

#Selecting and modifying data in pandas
series1 = Series([100, 200, 300], index=['a', 'b', 'c'])
# A Series always carries built-in positional indices 0, 1, 2, ... even when other
# labels are given explicitly, so elements can also be accessed by position

#Conditional Indexing
print(series1[series1 > 150])
print(series1[series1 == 300])

df1 = DataFrame(np.arange(9).reshape(3, 3),
                index=['car', 'bike', 'cycle'],
Beispiel #60
0
    def test_fillna_dtype_conversion_equiv_replace(self, val):
        df = DataFrame({"A": [1, np.nan], "B": [1.0, 2.0]})
        expected = df.replace(np.nan, val)
        result = df.fillna(val)
        tm.assert_frame_equal(result, expected)