import numpy as np
import pandas as pd
from pandas import Series, DataFrame


def slide_7():
    a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
               index=['f', 'e', 'd', 'c', 'b', 'a'])
    b = Series(np.arange(len(a), dtype=np.float64),
               index=['f', 'e', 'd', 'c', 'b', 'a'])
    print('***a***')
    print(a)
    print('***b***')
    print(b)
    b.iloc[-1] = np.nan  # positional access; plain b[-1] breaks with a label index
    print('***a***')
    print(a)
    print('***b***')
    print(b)
    print(np.where(pd.isnull(a), b, a))

    print('#####combine_first#####')
    print('***b[:-2]***')
    print(b[:-2])
    print('***a[2:]***')
    print(a[2:])
    print('b[:-2].combine_first(a[2:])')
    print(b[:-2].combine_first(a[2:]))

    df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                     'b': [np.nan, 2., np.nan, 6.],
                     'c': range(2, 18, 4)})
    df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                     'b': [np.nan, 3., 4., 6., 8.]})
    print('***df1***')
    print(df1)
    print('***df2***')
    print(df2)
    print(df1.combine_first(df2))
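    # For reference, the expected result (computed by hand): df1's nulls are
    # patched from df2 and the row index is unioned; row 4 exists only in
    # df2, so column 'c' is NaN there.
    #      a    b     c
    # 0  1.0  NaN   2.0
    # 1  4.0  2.0   6.0
    # 2  5.0  4.0  10.0
    # 3  3.0  6.0  14.0
    # 4  7.0  8.0   NaN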
Example #2
    def test_combine_first(self):
        # disjoint
        head, tail = self.frame[:5], self.frame[5:]

        combined = head.combine_first(tail)
        reordered_frame = self.frame.reindex(combined.index)
        assert_frame_equal(combined, reordered_frame)
        assert tm.equalContents(combined.columns, self.frame.columns)
        assert_series_equal(combined['A'], reordered_frame['A'])

        # same index
        fcopy = self.frame.copy()
        fcopy['A'] = 1
        del fcopy['C']

        fcopy2 = self.frame.copy()
        fcopy2['B'] = 0
        del fcopy2['D']

        combined = fcopy.combine_first(fcopy2)

        assert (combined['A'] == 1).all()
        assert_series_equal(combined['B'], fcopy['B'])
        assert_series_equal(combined['C'], fcopy2['C'])
        assert_series_equal(combined['D'], fcopy['D'])

        # overlap
        head, tail = reordered_frame[:10].copy(), reordered_frame
        head['A'] = 1

        combined = head.combine_first(tail)
        assert (combined['A'][:10] == 1).all()

        # reverse overlap
        tail['A'][:10] = 0
        combined = tail.combine_first(head)
        assert (combined['A'][:10] == 0).all()

        # no overlap
        f = self.frame[:10]
        g = self.frame[10:]
        combined = f.combine_first(g)
        assert_series_equal(combined['A'].reindex(f.index), f['A'])
        assert_series_equal(combined['A'].reindex(g.index), g['A'])

        # corner cases
        comb = self.frame.combine_first(self.empty)
        assert_frame_equal(comb, self.frame)

        comb = self.empty.combine_first(self.frame)
        assert_frame_equal(comb, self.frame)

        comb = self.frame.combine_first(DataFrame(index=["faz", "boo"]))
        assert "faz" in comb.index

        # #2525
        df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)])
        df2 = DataFrame({}, columns=['b'])
        result = df.combine_first(df2)
        assert 'b' in result
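# A minimal sketch of the Series semantics exercised above (assumes only
# pandas and numpy): combine_first takes the caller's non-null values and
# fills the holes from the argument, over the union of both indexes.
import numpy as np
import pandas as pd

s1 = pd.Series([1.0, np.nan], index=['x', 'y'])
s2 = pd.Series([9.0, 9.0, 9.0], index=['x', 'y', 'z'])
result = s1.combine_first(s2)
# roughly the same as reindexing to the union and filling from s2:
manual = s1.reindex(s1.index.union(s2.index)).fillna(s2)
assert result.equals(manual)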
Example #3
    def update(self, df_in: pd.DataFrame, symbol: str = None, datatype: str = None,
               barsize: str = None, tz: str = None, standardize_index=True):
        """
        Input data is combined with self.df. Overlapping values are
        overwritten by non-null values of the input data. Indexes and
        columns are unioned.
        """
        # Check input data type
        if not isinstance(df_in, pd.DataFrame):
            raise TypeError('Input data must be a pandas.DataFrame.')

        # Check empty data
        if df_in.empty:
            return self

        # Standardize index
        if standardize_index:
            df_in = self._standardize_index(
                df_in.copy(), symbol=symbol, datatype=datatype,
                barsize=barsize, tz=tz)

        # Combine input DataFrame with internal self.df
        if self.df.empty:  # Initialize self.df
            self.df = df_in.sort_index()
        else:
            df_in = df_in.tz_convert(self.tzinfo, level=self.__class__.dtlevel)
            self.df = df_in.combine_first(self.df).sort_index()

        # Post-combination processing
        # Fill NaN, and enforce barcount and volume columns dtype to int64
        self.df.fillna(-1, inplace=True)
        for col in self.df.columns:
            if col.lower() in ('barcount', 'volume'):
                self.df[col] = self.df[col].astype(np.int64)
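# A hypothetical usage sketch of the pattern above (the frames and column
# names are illustrative, not from this class): new bars overwrite old ones
# wherever the incoming frame has non-null values, and indexes are unioned.
import pandas as pd

old = pd.DataFrame({'close': [10.0, 11.0]},
                   index=pd.to_datetime(['2020-01-01', '2020-01-02']))
new = pd.DataFrame({'close': [11.5, 12.0]},
                   index=pd.to_datetime(['2020-01-02', '2020-01-03']))
merged = new.combine_first(old).sort_index()
# 2020-01-01 -> 10.0 (kept), 2020-01-02 -> 11.5 (overwritten),
# 2020-01-03 -> 12.0 (appended)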
Example #4
def table_OD(list_coordsO, list_idsO, list_coordsD, list_idsD,
             OSRM_max_table=100, host='http://localhost:5000'):
    """
    Function wrapping OSRM 'table' function in order to get a matrix of
    time distance between different origins and destinations (N:M)
    Params :
        list_coordsO: list
            A list of coord as [x, y] for the origins, like :
                 list_coords = [[21.3224, 45.2358],
                                [21.3856, 42.0094],
                                [20.9574, 41.5286]] (coords have to be float)
        list_idsO: list
            A list of the corresponding unique id for the origins, like :
                     list_ids = ['name1',
                                 'name2',
                                 'name3'] (id can be str, int or float)
        list_coordsD: list
            A list of coord as [x, y] for the destinations (same kind as the
            origins)
        list_idsD: list
            A list of the corresponding unique id for the destinations (same
            kind as the origins)
        OSRM_max_table: int, default=100
            The --max-table-size defined when lauching osrm-routed (default is
            100). It will be used to clip the request in many 'table' requests
            and reconstruct the matrix.
        host: str, default 'http://localhost:5000'
            Url and port of the OSRM instance (no final bakslash)

    Output:
        A labeled DataFrame containing the time matrix in minutes
            (or NaN when OSRM encounter an error to compute a route)

        -1 or an empty DataFrame is return in case of any other error
            (wrong list of coords/ids, unknow host,
            wrong response from the host, etc.)
    """
    if list_coordsO == list_coordsD and list_idsO == list_idsD:
        list_coords, list_ids = list_coordsO, list_idsO
    else:
        list_coords = list_coordsO + list_coordsD
        list_ids = list_idsO + list_idsD

    if len(list_coords) > OSRM_max_table:
        gpd_coords = list(chunk(list_coords, OSRM_max_table//2))
        gpd_ids = list(chunk(list_ids, OSRM_max_table//2))
        df = DataFrame(index=list_ids, columns=list_ids, dtype=float)
        for lcoord, lid in zip(mat_range2d(gpd_coords), mat_range2d(gpd_ids)):
            df = df.combine_first(table(list(lcoord), list(lid), host=host))
    else:
        df = table(list_coords, list_ids, host=host)

    try:
        return df[list_idsO].filter(list_idsD, axis=0)
    except Exception as err:
        print(err)
        return -1
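# 'chunk' and 'mat_range2d' above are helpers from the same module; a
# plausible sketch of 'chunk' (an assumption, not the project's code), and
# why combine_first works here: each sub-request fills one block of the
# full id x id matrix, and since the blocks cover disjoint label pairs,
# df.combine_first(block) accumulates them without overwriting anything.
def chunk(seq, n):
    """Yield successive n-sized slices of seq."""
    for i in range(0, len(seq), n):
        yield seq[i:i + n]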
Example #5
    def test_combine_first_mixed(self):
        a = Series(['a', 'b'], index=lrange(2))
        b = Series(lrange(2), index=lrange(2))
        f = DataFrame({'A': a, 'B': b})

        a = Series(['a', 'b'], index=lrange(5, 7))
        b = Series(lrange(2), index=lrange(5, 7))
        g = DataFrame({'A': a, 'B': b})

        # TODO(wesm): no verification?
        combined = f.combine_first(g)  # noqa
Example #6
    def test_combine_first_mixed(self):
        a = Series(['a', 'b'], index=lrange(2))
        b = Series(lrange(2), index=lrange(2))
        f = DataFrame({'A': a, 'B': b})

        a = Series(['a', 'b'], index=lrange(5, 7))
        b = Series(lrange(2), index=lrange(5, 7))
        g = DataFrame({'A': a, 'B': b})

        exp = pd.DataFrame({'A': list('abab'), 'B': [0., 1., 0., 1.]},
                           index=[0, 1, 5, 6])
        combined = f.combine_first(g)
        tm.assert_frame_equal(combined, exp)
Example #7
def cal_idxrel_sym(f, s, cal_self=False, self_val=0, *args, **kwargs):
    # Build a symmetric element-by-element relation matrix: f is evaluated
    # only on the upper triangle, then mirrored via df.combine_first(df.T).
    slen = len(s)
    df = {}
    for i in range(slen):
        if np.isscalar(s[i]) and np.isnan(s[i]):
            df[s.index[i]] = Series(np.nan, index=s.index)
            continue
        res = []
        if cal_self:
            res.append(f(s[i], s[i]))
        else:
            res.append(self_val)
        for j in range(i+1, slen):
            if np.isscalar(s[j]) and np.isnan(s[j]):
                res.append(np.nan)
            else:
                res.append(f(s[i], s[j], *args, **kwargs))
        df[s.index[i]] = Series(res, index=s.index[i:])
    df = DataFrame(df)
    df = df.combine_first(df.T)
    return df
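# The df.combine_first(df.T) step above mirrors the computed upper triangle
# into the empty lower triangle. A minimal standalone demonstration:
import numpy as np
import pandas as pd

tri = pd.DataFrame([[0.0, 1.0], [np.nan, 0.0]],
                   index=['x', 'y'], columns=['x', 'y'])
sym = tri.combine_first(tri.T)
# sym.loc['y', 'x'] is now 1.0, filled from the transpose: the matrix is
# symmetric without recomputing f on the lower triangle.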
Example #8
    def interpolate(self, X, force_interpolation=True, **kwargs):
        # force_interpolation: if False and ALL interpolant variables are
        # already present in X, skip the interpolation
        do_interpolation = force_interpolation
        for key in self.y_keys:
            # if a y_key is not present, force the interpolation
            if key not in X:
                do_interpolation = True

        if do_interpolation:
            interpolated = {}
            # for key in X.keys():#self.x_keys:
            #     interpolated[key] = X[key]
            for key in self.y_keys:
                interpolated[key] = self.knn[key].predict(X[self.x_keys])
            interpolated = DataFrame(interpolated, index=X.index)
            # predictions take precedence over any preexisting values in X
            X_return = interpolated.combine_first(X)
        else:
            # do nothing
            X_return = X
        return X_return
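# Note the direction of the call above: interpolated.combine_first(X) keeps
# every non-null prediction and only falls back to X, so freshly predicted
# y-columns overwrite preexisting ones. A tiny sketch (column names are
# illustrative):
import pandas as pd

X = pd.DataFrame({'y': [1.0, 2.0], 'extra': [5.0, 6.0]})
pred = pd.DataFrame({'y': [10.0, 20.0]}, index=X.index)
out = pred.combine_first(X)
# out['y'] is [10.0, 20.0] (predictions win); 'extra' survives from X.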
Example #9
    def process(self, start_time: datetime, end_time: datetime, input: DataFrame):
        if (self._args is not None and len(self._args) > 2) or \
           (len(self._args) != 0 and not isinstance(self._args[0], QueryFunction)):
            raise ValueError('Invalid argument to absolute value function')

        # get the data
        data = input if len(self._args) == 0 else self._args[0].process(start_time, end_time, input)

        ret = None

        # go through each column and take the absolute value of every row
        for col in data.columns:
            abs_col = data[col].abs()  # absolute value of every entry in the column
            abs_col.name = 'abs ' + col  # update the name

            if ret is None:
                ret = DataFrame(abs_col)
            else:
                ret = ret.combine_first(DataFrame(abs_col))  # add it as a new column

        print(ret.head())

        return ret
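# In the method above combine_first simply accumulates disjoint columns:
# every 'abs <col>' Series becomes its own column, and since the frames
# share an index but never a column name, nothing is overwritten.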
Example #10
    def process(self, start_time: datetime, end_time: datetime, input: DataFrame):
        if str(self.name) not in '+-*/':
            raise ValueError("Unknown math function: " + str(self.name))

        ret = DataFrame()

        # two args means we're doing A + B
        if len(self._args) == 2:
            left = self._args[0].process(start_time, end_time, input) if isinstance(self._args[0], QueryFunction) else self._args[0]
            right = self._args[1].process(start_time, end_time, input) if isinstance(self._args[1], QueryFunction) else self._args[1]

            for l_col in left.columns:
                for r_col in right.columns:
                    if self.name == '+':
                        t = left[l_col] + right[r_col]
                    elif self.name == '-':
                        t = left[l_col] - right[r_col]
                    elif self.name == '*':
                        t = left[l_col] * right[r_col]
                    elif self.name == '/':
                        t = left[l_col] / right[r_col]
                    else:
                        raise ValueError("Unknown operator: " + str(self.name))

                    t = DataFrame(t)
                    t.columns = [l_col + self.name + r_col]

                    print(left.head())
                    print(right.head())
                    print(t.head())
                    ret = ret.combine_first(t)

        else:  # everything is in the input DataFrame
            ret = DataFrame(input.sum(axis=0))
            ret.columns = [' + '.join(input.columns)]

        return ret
Example #11
    def test_combine_first_mixed_bug(self):
        idx = Index(['a', 'b', 'c', 'e'])
        ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
        ser2 = Series(['a', 'b', 'c', 'e'], index=idx)
        ser3 = Series([12, 4, 5, 97], index=idx)

        frame1 = DataFrame({"col0": ser1,
                            "col2": ser2,
                            "col3": ser3})

        idx = Index(['a', 'b', 'c', 'f'])
        ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
        ser2 = Series(['a', 'b', 'c', 'f'], index=idx)
        ser3 = Series([12, 4, 5, 97], index=idx)

        frame2 = DataFrame({"col1": ser1,
                            "col2": ser2,
                            "col5": ser3})

        combined = frame1.combine_first(frame2)
        assert len(combined.columns) == 5

        # gh 3016 (same as in update)
        df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
                       columns=['A', 'B', 'bool1', 'bool2'])

        other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
        result = df.combine_first(other)
        assert_frame_equal(result, df)

        df.loc[0, 'A'] = np.nan
        result = df.combine_first(other)
        df.loc[0, 'A'] = 45
        assert_frame_equal(result, df)

        # doc example
        df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan],
                         'B': [np.nan, 2., 3., np.nan, 6.]})

        df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.],
                         'B': [np.nan, np.nan, 3., 4., 6., 8.]})

        result = df1.combine_first(df2)
        expected = DataFrame(
            {'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]})
        assert_frame_equal(result, expected)

        # GH3552, return object dtype with bools
        df1 = DataFrame(
            [[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]])
        df2 = DataFrame(
            [[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2])

        result = df1.combine_first(df2)[2]
        expected = Series([True, True, False], name=2)
        assert_series_equal(result, expected)

        # GH 3593, converting datetime64[ns] incorrectly
        df0 = DataFrame({"a": [datetime(2000, 1, 1),
                               datetime(2000, 1, 2),
                               datetime(2000, 1, 3)]})
        df1 = DataFrame({"a": [None, None, None]})
        df2 = df1.combine_first(df0)
        assert_frame_equal(df2, df0)

        df2 = df0.combine_first(df1)
        assert_frame_equal(df2, df0)

        df0 = DataFrame({"a": [datetime(2000, 1, 1),
                               datetime(2000, 1, 2),
                               datetime(2000, 1, 3)]})
        df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
        df2 = df1.combine_first(df0)
        result = df0.copy()
        result.iloc[0, :] = df1.iloc[0, :]
        assert_frame_equal(df2, result)

        df2 = df0.combine_first(df1)
        assert_frame_equal(df2, df0)
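# A note on the GH3552 case above: combine_first keeps a boolean column as
# bool when the row union introduces no missing values into it; if NaNs do
# have to be inserted into a bool column, pandas upcasts it to object.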
Example #12
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a), dtype=np.float64),
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b.iloc[-1] = np.nan
# numpy's vectorized if-else: where a is null, take the value from b
np.where(pd.isnull(a), b, a)
# the pandas counterpart: fill b's null values from a
b[:-2].combine_first(a[2:])
# usage with DataFrame
df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                 'b': [np.nan, 2., np.nan, 6.],
                 'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                 'b': [np.nan, 3., 4., 6., 8.]})
df1.combine_first(df2)
## Removing duplicate rows
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
data.duplicated()
# drop duplicates, keeping the first occurrence by default
data.drop_duplicates()
# drop duplicates judged by a single column
data['v1'] = range(7)
data.drop_duplicates(['k1'])
# keep the last occurrence instead (take_last=True in older pandas)
data.drop_duplicates(['k1', 'k2'], keep='last')

## Transforming data with a function or mapping
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox']})
Example #13
File: io.py Project: greole/owls
def import_foam_folder(
        path,
        search,
        files,
        skiplines=1,
        maxlines=0,
        skiptimes=slice(0,None),
        exclude=None,
        times_slice=None,
        ):
    """ returns a Dataframe for every file in fileList """
    #import StringIO
    fileList = find_datafiles(
        path, search=search, files=files,
        exclude=exclude, times_slice=times_slice
    )
    if not fileList:
        print("no files found")
        return None, DataFrame()
    p_bar = ProgressBar(n_tot=sum([len(l) for l in fileList.values()]))
    df = DataFrame()
    #df.index = MultiIndex.from_tuples(zip([],[]),names=['Loc',0])
    from pandas import concat
    origins = Origins()
    els = list(fileList.items())[skiptimes]
    for fullpath, files in els:
        time = strip_time(fullpath, path)
        df_tmp = DataFrame()

        # for fn in files:
        #     #ret = read_table(StringIO.StringIO(foam_to_csv(fn)))
        #     ret = read_data_file(fn, skiplines, maxlines)
        #     p_bar.next()

        args = [(fn, skiplines, maxlines, p_bar) for fn in files]
        if MULTIPROCESS:
            with multiprocessing.Pool(processes=MULTIPROCESS) as pool:
                rets = pool.map(read_data_file_args, args)
        else:
            rets = map(read_data_file_args, args)


        for fn, ret in zip(files, rets):
            if not ret or ret[1].empty:
                continue
            field_names, x, hashes = ret
            loc = x.index.values[-1][0]
            if df_tmp.empty:
                df_tmp = x
            else:
                try:
                    df_tmp = df_tmp.combine_first(x)
                except Exception as e:
                    print("failed to concat: ",
                            df_tmp, "and", x, "new_loc ",
                            x.index.levels[0][0], " existing_locs ",
                            df_tmp.index.levels[0] )
                    print(e)
            field_names = field_names if isinstance(field_names, list) else [field_names]
            for field in field_names:
                if field == "Pos":
                    continue
                origins.insert(time, loc, field, fn, hashes[field])
        df_tmp['Time'] = time
        if df.empty:
            df = df_tmp
        else:
            df = concat([df, df_tmp])  # DataFrame.append was removed in pandas 2.0
    df.set_index('Time', append=True, inplace=True)
    if not "Loc" in  df.index.names:
        print(df)
        # df = df.reorder_levels(['Time', ])
    else:
        df = df.reorder_levels(['Time', 'Loc', 'Pos'])
    p_bar.done()
    return origins, df
Example #14
# encoding=utf-8

import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# Combining overlapping data

a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a), dtype=np.float64),
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b.iloc[-1] = np.nan
print(a)
print(b)
print(pd.isnull(a))
# where True, keep b's element; where False, substitute the element from a
print(np.where(pd.isnull(a), b, a))

# Series has the corresponding combine_first method
print(b[:-2].combine_first(a[2:]))

df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                 'b': [np.nan, 2, np.nan, 6.],
                 'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                 'b': [np.nan, 3., 4., 6., 8.]})
print(df1)
print(df2)
print(df1.combine_first(df2))
Example #15
df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
print(df1)
print(df2)
print(pd.concat([df1, df2]))
print(pd.concat([df1, df2], ignore_index=True))

a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a)),
           dtype=np.float64,
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b.iloc[-1] = np.nan

print(a)
print(b)
print(np.where(pd.isnull(a), b, a))
print(b[:-2].combine_first(a[2:]))

df1 = DataFrame({'a': [1, np.nan, 5, np.nan],
                 'b': [np.nan, 2, np.nan, 6],
                 'c': range(2, 18, 4)})

df2 = DataFrame({'a': [5, 4, np.nan, 3, 7],
                 'b': [np.nan, 3, 4, 6, 8]})

print(df1.combine_first(df2))
Example #16
File: io.py Project: ALGe9/owls
def import_foam_folder(
        path,
        search,
        files,
        skiplines=1,
        maxlines=0,
        skiptimes=1,
        exclude=None
        ):
    """ returns a Dataframe for every file in fileList """
    #import StringIO
    from pandas import concat
    fileList = find_datafiles(
        path, search=search, files=files, exclude=exclude)
    if not fileList:
        print("no files found")
        return None, DataFrame()
    p_bar = ProgressBar(n_tot=sum([len(l) for l in fileList.values()]))
    df = DataFrame()
    #df.index = MultiIndex.from_tuples(zip([],[]),names=['Loc',0])
    from collections import defaultdict
    origins = Origins()
    els = list(fileList.items())[::skiptimes]
    for fullpath, files in els:
        time = strip_time(fullpath, path)
        df_tmp = DataFrame()
        for fn in files:
            #ret = read_table(StringIO.StringIO(foam_to_csv(fn)))
            ret = read_data_file(fn, skiplines, maxlines)
            p_bar.next()
            if not ret:
                continue
            field_names, x, hashes = ret
            loc = x.index.values[-1][0]
            if df_tmp.empty:
                df_tmp = x
            else:
                try:
                    # use combine_first for frames at an already-existing Loc,
                    # or when no Loc is specified (Eul or Lag fields)
                    if x.index.levels[0][0] in df_tmp.index.levels[0]:
                        df_tmp = df_tmp.combine_first(x)
                        #df_tmp = concat([df_tmp, x], axis=1)
                    else:
                        df_tmp = concat([df_tmp, x])
                except Exception as e:
                    print(x)
                    print(e)
            field_names = field_names if isinstance(field_names, list) else [field_names]
            for field in field_names:
                origins.insert(time, loc, field, fn, hashes[field])
        df_tmp['Time'] = time
        if df.empty:
            df = df_tmp
        else:
            df = concat([df, df_tmp])  # DataFrame.append was removed in pandas 2.0
    df.set_index('Time', append=True, inplace=True)
    df = df.reorder_levels(['Time','Loc','Id'])
    p_bar.done()
    return origins, df
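# The branch above picks combine_first when the incoming frame's Loc level
# already exists (fill in columns at matching rows) and concat when it does
# not (stack new rows). A minimal sketch of the difference, using plain
# integer indexes for illustration:
import pandas as pd

a = pd.DataFrame({'u': [1.0]}, index=[0])
b = pd.DataFrame({'v': [2.0]}, index=[0])
c = pd.DataFrame({'u': [3.0]}, index=[1])
print(a.combine_first(b))   # one row: columns u and v side by side
print(pd.concat([a, c]))    # two rows: column u stacked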