Code example #1
    def test_column_dups2(self):

        # drop buggy GH 6240
        df = DataFrame({'A': np.random.randn(5),
                        'B': np.random.randn(5),
                        'C': np.random.randn(5),
                        'D': ['a', 'b', 'c', 'd', 'e']})

        expected = df.take([0, 1, 1], axis=1)
        df2 = df.take([2, 0, 1, 2, 1], axis=1)
        result = df2.drop('C', axis=1)
        assert_frame_equal(result, expected)

        # dropna
        df = DataFrame({'A': np.random.randn(5),
                        'B': np.random.randn(5),
                        'C': np.random.randn(5),
                        'D': ['a', 'b', 'c', 'd', 'e']})
        df.iloc[2, [0, 1, 2]] = np.nan
        df.iloc[0, 0] = np.nan
        df.iloc[1, 1] = np.nan
        df.iloc[:, 3] = np.nan
        expected = df.dropna(subset=['A', 'B', 'C'], how='all')
        expected.columns = ['A', 'A', 'B', 'C']

        df.columns = ['A', 'A', 'B', 'C']

        result = df.dropna(subset=['A', 'C'], how='all')
        assert_frame_equal(result, expected)
Code example #2
    def test_unstack_fill_frame(self):

        # From a dataframe
        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
        df = DataFrame(rows, columns=list("AB"), dtype=np.int32)
        df.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")])

        result = df.unstack(fill_value=-1)

        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
        expected = DataFrame(rows, index=list("xyz"), dtype=np.int32)
        expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")])
        assert_frame_equal(result, expected)

        # From a mixed type dataframe
        df["A"] = df["A"].astype(np.int16)
        df["B"] = df["B"].astype(np.float64)

        result = df.unstack(fill_value=-1)
        expected["A"] = expected["A"].astype(np.int16)
        expected["B"] = expected["B"].astype(np.float64)
        assert_frame_equal(result, expected)

        # From a dataframe with incorrect data type for fill_value
        result = df.unstack(fill_value=0.5)

        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
        expected = DataFrame(rows, index=list("xyz"), dtype=np.float64)
        expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")])
        assert_frame_equal(result, expected)
Code example #3
    def test_pivot_index_none(self):
        # gh-3962
        data = {
            "index": ["A", "B", "C", "C", "B", "A"],
            "columns": ["One", "One", "One", "Two", "Two", "Two"],
            "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
        }

        frame = DataFrame(data).set_index("index")
        result = frame.pivot(columns="columns", values="values")
        expected = DataFrame({"One": {"A": 1.0, "B": 2.0, "C": 3.0}, "Two": {"A": 1.0, "B": 2.0, "C": 3.0}})

        expected.index.name, expected.columns.name = "index", "columns"
        assert_frame_equal(result, expected)

        # omit values
        result = frame.pivot(columns="columns")

        expected.columns = pd.MultiIndex.from_tuples([("values", "One"), ("values", "Two")], names=[None, "columns"])
        expected.index.name = "index"
        assert_frame_equal(result, expected, check_names=False)
        self.assertEqual(result.index.name, "index")
        self.assertEqual(result.columns.names, (None, "columns"))
        expected.columns = expected.columns.droplevel(0)

        data = {
            "index": range(7),
            "columns": ["One", "One", "One", "Two", "Two", "Two"],
            "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
        }

        result = frame.pivot(columns="columns", values="values")

        expected.columns.name = "columns"
        assert_frame_equal(result, expected)
Code example #4
def datatype_records_to_subset_and_migrate(likechars):
    stmt_for_pkeys = conn_popler_2.execute(
        select(
            from_obj=Maintable,
            columns=[
                column('lter_proj_site'),
                column('samplingprotocol')
            ]).
        where(
            column('samplingprotocol').like(
                '%{}%'.format(likechars))
        )
    )
    data = DataFrame(stmt_for_pkeys.fetchall())
    data.columns = stmt_for_pkeys.keys()

    records_to_get = data['lter_proj_site'].values.tolist()

    stmt_for_records = conn_popler_2.execute(
        select(
            from_obj=Rawtable,
        ).
        where(column('lter_proj_site').in_(records_to_get)).
        order_by('sampleid')
    )
    data2 = DataFrame(stmt_for_records.fetchall())
    data2.columns = stmt_for_records.keys()
    data2.drop('individ', axis=1, inplace=True)
Code example #5
File: clustering.py Project: takeru-nitta/auction
    def clustering(self, X, NUM_CLUSTERS, MINIBATCH):
        '''
        Cluster the items with k-means.
        '''
        
        if MINIBATCH:
            km = MiniBatchKMeans(n_clusters = NUM_CLUSTERS,
                                 init='k-means++', batch_size=1000,
                                 n_init=10, max_no_improvement=10)
        else:
            km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1)
        
        km.fit(X)
        transformed = km.transform(X)  # distance from each item to every cluster center
        labels = km.labels_
        
        dists = []
        for i in range(len(labels)):
            dists.append(transformed[i, labels[i]])  # distance to the center of the item's own cluster

        labels = DataFrame(labels)
        dists = DataFrame(dists)
        labels.columns = ['label']
        dists.columns = ['dists']
        self.data = pd.concat([labels, dists, self.data], axis=1)  # append the labels and distances to the original data
        
        return km
Code example #6
File: test_reshape.py Project: dmjvictory/pandas
    def test_unstack_fill_frame(self):

        # From a dataframe
        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
        df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
        df.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = df.unstack(fill_value=-1)

        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
        expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
        expected.columns = MultiIndex.from_tuples(
            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
        assert_frame_equal(result, expected)

        # From a mixed type dataframe
        df['A'] = df['A'].astype(np.int16)
        df['B'] = df['B'].astype(np.float64)

        result = df.unstack(fill_value=-1)
        expected['A'] = expected['A'].astype(np.int16)
        expected['B'] = expected['B'].astype(np.float64)
        assert_frame_equal(result, expected)

        # From a dataframe with incorrect data type for fill_value
        result = df.unstack(fill_value=0.5)

        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
        expected = DataFrame(rows, index=list('xyz'), dtype=np.float64)
        expected.columns = MultiIndex.from_tuples(
            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
        assert_frame_equal(result, expected)
Code example #7
File: test_reshape.py Project: dmjvictory/pandas
    def test_pivot_index_none(self):
        # gh-3962
        data = {
            'index': ['A', 'B', 'C', 'C', 'B', 'A'],
            'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
            'values': [1., 2., 3., 3., 2., 1.]
        }

        frame = DataFrame(data).set_index('index')
        result = frame.pivot(columns='columns', values='values')
        expected = DataFrame({
            'One': {'A': 1., 'B': 2., 'C': 3.},
            'Two': {'A': 1., 'B': 2., 'C': 3.}
        })

        expected.index.name, expected.columns.name = 'index', 'columns'
        assert_frame_equal(result, expected)

        # omit values
        result = frame.pivot(columns='columns')

        expected.columns = pd.MultiIndex.from_tuples([('values', 'One'),
                                                      ('values', 'Two')],
                                                     names=[None, 'columns'])
        expected.index.name = 'index'
        tm.assert_frame_equal(result, expected, check_names=False)
        assert result.index.name == 'index'
        assert result.columns.names == (None, 'columns')
        expected.columns = expected.columns.droplevel(0)
        result = frame.pivot(columns='columns', values='values')

        expected.columns.name = 'columns'
        tm.assert_frame_equal(result, expected)
Code example #8
def retrieve_from_db_usa():
    """imports model, pulls mwh production data from db, and places into pandas df.
    Also pulls state for each plant_name, and places into dict."""

    # add parent directory to the path, so can import model.py
    #  need model in order to update the database when this task is activated by cron
    import os
    parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    os.sys.path.insert(0,parentdir)

    import model
    s = model.connect()

    # retrive DECEMBER production data, for all turbines at all power plants in California
    USA_gen_dec13_obj = s.execute('SELECT plant_name, state, fuel_type, dec_mwh_gen FROM "ProdGensDec2013" ')
    USA_gen_dec13_data = USA_gen_dec13_obj.fetchall()
    df_dec2013 = DataFrame(USA_gen_dec13_data)
    df_dec2013.columns = ['plant_name', 'state', 'fuel_type', 'dec_mwh_gen']

    # retrive JAN-NOV 2014 production data, for all turbines at all power plants in USA
    USA_gen_2014_obj = s.execute('SELECT plant_name, state, fuel_type, jan_mwh_gen, feb_mwh_gen, mar_mwh_gen, apr_mwh_gen, may_mwh_gen, jun_mwh_gen, jul_mwh_gen, aug_mwh_gen, sep_mwh_gen, oct_mwh_gen, nov_mwh_gen FROM "ProdGens" ')
    USA_gen_2014_data = USA_gen_2014_obj.fetchall()
    df_2014 = DataFrame(USA_gen_2014_data)
    df_2014.columns = ['plant_name', 'state', 'fuel_type', 'jan_mwh_gen', 'feb_mwh_gen', 'mar_mwh_gen', 'apr_mwh_gen', 'may_mwh_gen', 'jun_mwh_gen', 'jul_mwh_gen', 'aug_mwh_gen', 'sep_mwh_gen', 'oct_mwh_gen', 'nov_mwh_gen']

    return df_dec2013, df_2014
Code example #9
File: ground_truth.py Project: t7reyeslua/NILM-Loc
 def save_to_file(self, fn):
     gg = DataFrame(self.power_series_apps_table)
     try:
         del gg['diff1']
         del gg['diff2']
     except Exception:
         print('')
         
     gg['Loc Events'] = self.loc.events_apps_1min['Apps']
     apps = self.loc.metadata.get_channels()
     sd = {}
     #Initialize series with 0s
     for app in apps:
         sd[app] = Series(0, index=gg.index)
         
     #Count location events for each appliance
     for index, row in gg.iterrows():
         try:
             if len(row['Loc Events']) > 0:
                 for app in apps:
                     n = row['Loc Events'].count(app)
                     sd[app][index] = n
         except Exception:
             continue
     
     if self.loc.name == 'REDD':
         sd[(3,4)] = sd[3]
         sd[(10,20)] = sd[10]
         del sd[3]
         del sd[4]
         del sd[10]
         del sd[20]
       
     #Change column names and append them to gral table
     locevents = DataFrame(sd)
     locevents.columns = [(str(col) + ' locEv') for col in locevents]        
     for locEv in locevents:
         gg[locEv] = locevents[locEv]
         
     
     #Get power values of each appliance and resample for 1min
     act = DataFrame(self.loc.appliances_consuming_times)
     act = act.resample('1Min')
            
     if self.loc.name == 'REDD':
         del act[3]
         del act[10]
         act.columns = [(3,4), 5,6,7,8,9,11,12,13,14,15,16,17,18,19,(10,20)]
     act.columns = [(str(col) + ' conEv') for col in act]
     
     for app in act:
         gg[app] = act[app]        
     gg.columns = [str(col) for col in gg]
     gg = gg[sorted(gg.columns)]
     gg.to_csv(fn)   
     return
Code example #10
File: tools.py Project: dengemann/statsmodels
def _series_add_constant(data, prepend):
    const = np.ones_like(data)
    const.name = 'const'
    if not prepend:
        results = DataFrame([data, const]).T
        results.columns = [data.name, 'const']
    else:
        results = DataFrame([const, data]).T
        results.columns = ['const', data.name]
    return results
Code example #11
File: test_conversions.py Project: ardunn/MatMiner
    def test_conversion_multiindex(self):
        d = {'comp_str': ["Fe2", "MnO2"]}

        df_1lvl = DataFrame(data=d)

        df_1lvl = StrToComposition().featurize_dataframe(
            df_1lvl, 'comp_str', multiindex=True)
        self.assertEqual(df_1lvl[("StrToComposition", "composition")].tolist(),
                         [Composition("Fe2"), Composition("MnO2")])

        df_2lvl = DataFrame(data=d)
        df_2lvl.columns = MultiIndex.from_product((["custom"],
                                                   df_2lvl.columns.values))

        df_2lvl = StrToComposition().featurize_dataframe(
            df_2lvl, ("custom", "comp_str"), multiindex=True)
        self.assertEqual(df_2lvl[("StrToComposition", "composition")].tolist(),
                         [Composition("Fe2"), Composition("MnO2")])

        df_2lvl = DataFrame(data=d)
        df_2lvl.columns = MultiIndex.from_product((["custom"],
                                                   df_2lvl.columns.values))

        sto = StrToComposition(target_col_id='test')
        df_2lvl = sto.featurize_dataframe(
            df_2lvl, ("custom", "comp_str"), multiindex=True)
        self.assertEqual(df_2lvl[("StrToComposition", "test")].tolist(),
                         [Composition("Fe2"), Composition("MnO2")])

        # if two level multiindex provided as target, it should be written there
        # here we test converting multiindex in place
        df_2lvl = DataFrame(data=d)
        df_2lvl.columns = MultiIndex.from_product((["custom"],
                                                   df_2lvl.columns.values))

        sto = StrToComposition(target_col_id=None, overwrite_data=True)

        df_2lvl = sto.featurize_dataframe(
            df_2lvl, ("custom", "comp_str"), multiindex=True, inplace=False)
        self.assertEqual(df_2lvl[("custom", "comp_str")].tolist(),
                         [Composition("Fe2"), Composition("MnO2")])

        # Try inplace multiindex conversion with return errors
        df_2lvl = DataFrame(data=d)
        df_2lvl.columns = MultiIndex.from_product((["custom"],
                                                   df_2lvl.columns.values))

        sto = StrToComposition(target_col_id=None, overwrite_data=True)
        df_2lvl = sto.featurize_dataframe(
            df_2lvl, ("custom", "comp_str"), multiindex=True,
            return_errors=True, ignore_errors=True)

        self.assertTrue(
            all(df_2lvl[("custom", "StrToComposition Exceptions")].isnull()))
Code example #12
File: test_to_csv.py Project: RahulHP/pandas
    def test_to_csv_dups_cols(self):

        df = DataFrame(np.random.randn(1000, 30), columns=lrange(
            15) + lrange(15), dtype='float64')

        with ensure_clean() as filename:
            df.to_csv(filename)  # single dtype, fine
            result = read_csv(filename, index_col=0)
            result.columns = df.columns
            assert_frame_equal(result, df)

        df_float = DataFrame(np.random.randn(1000, 3), dtype='float64')
        df_int = DataFrame(np.random.randn(1000, 3), dtype='int64')
        df_bool = DataFrame(True, index=df_float.index, columns=lrange(3))
        df_object = DataFrame('foo', index=df_float.index, columns=lrange(3))
        df_dt = DataFrame(Timestamp('20010101'),
                          index=df_float.index, columns=lrange(3))
        df = pd.concat([df_float, df_int, df_bool, df_object,
                        df_dt], axis=1, ignore_index=True)

        cols = []
        for i in range(5):
            cols.extend([0, 1, 2])
        df.columns = cols

        from pandas import to_datetime
        with ensure_clean() as filename:
            df.to_csv(filename)
            result = read_csv(filename, index_col=0)

            # date cols
            for i in ['0.4', '1.4', '2.4']:
                result[i] = to_datetime(result[i])

            result.columns = df.columns
            assert_frame_equal(result, df)

        # GH3457
        from pandas.util.testing import makeCustomDataframe as mkdf

        N = 10
        df = mkdf(N, 3)
        df.columns = ['a', 'a', 'b']

        with ensure_clean() as filename:
            df.to_csv(filename)

            # read_csv will rename the dups columns
            result = read_csv(filename, index_col=0)
            result = result.rename(columns={'a.1': 'a'})
            assert_frame_equal(result, df)
Code example #13
File: pandas.py Project: chdoig/blaze
def compute_one(t, df, **kwargs):
    if t.grouper.iscolumn:
        grouper = compute(t.grouper, {t.child: df}) # a Series
    elif isinstance(t.grouper, Projection) and t.grouper.child is t.child:
        grouper = t.grouper.columns  # list of column names

    if isinstance(t.apply, Summary):
        names = t.apply.names
        preapply = DataFrame(dict(zip(
            names,
            [compute(v.child, {t.child: df}) for v in t.apply.values])))

        df2 = concat_nodup(df, preapply)

        groups = df2.groupby(grouper)

        d = defaultdict(list)
        for name, v in zip(names, t.apply.values):
            d[name].append(getattr(Series, v.symbol))

        result = groups.agg(dict(d))

        # Rearrange columns to match names order
        result = result[sorted(list(result.columns),
                               key=lambda t: names.index(t[0]))]
        result.columns = t.apply.names  # flatten down multiindex

    if isinstance(t.apply, Reduction):
        names = t.apply.dshape[0].names
        preapply = compute(t.apply.child, {t.child: df})
        # Pandas and Blaze column naming schemes differ
        # Coerce DataFrame column names to match Blaze's names
        preapply = preapply.copy()
        if isinstance(preapply, Series):
            preapply.name = names[0]
        else:
            preapply.columns = names

        df2 = concat_nodup(df, preapply)

        if t.apply.child.iscolumn:
            groups = df2.groupby(grouper)[names[0]]
        else:
            groups = df2.groupby(grouper)[names]

        result = compute_one(t.apply, groups) # do reduction

    result = DataFrame(result).reset_index()
    result.columns = t.columns
    return result
Code example #14
File: numpy_records.py Project: manahl/arctic
    def deserialize(self, item, force_bytes_to_unicode=False):
        index = self._index_from_records(item)
        column_fields = [x for x in item.dtype.names if x not in item.dtype.metadata['index']]
        multi_column = item.dtype.metadata.get('multi_column')
        if len(item) == 0:
            rdata = item[column_fields] if len(column_fields) > 0 else None
            if multi_column is not None:
                columns = MultiIndex.from_arrays(multi_column["values"], names=multi_column["names"])
                return DataFrame(rdata, index=index, columns=columns)
            else:
                return DataFrame(rdata, index=index)

        columns = item.dtype.metadata['columns']
        df = DataFrame(data=item[column_fields], index=index, columns=columns)

        if multi_column is not None:
            df.columns = MultiIndex.from_arrays(multi_column["values"], names=multi_column["names"])

        if force_bytes_to_unicode:
            # This is needed due to 'str' type in py2 when read back in py3 is 'bytes' which breaks the workflow
            # of people migrating to py3. # https://github.com/manahl/arctic/issues/598
            # This should not be used in a normal flow; instead, write unicode strings
            # if you want to work with str in py3.

            for c in df.select_dtypes(object):
                # The conversion is not using astype similar to the index as pandas has a bug where it tries to convert
                # the data columns to a unicode string, and the object in this case would be bytes, eg. b'abc'
                # which is converted to u"b'abc'" i.e it includes the b character as well! This generally happens
                # when there is a str conversion without specifying the encoding. eg. str(b'abc') -> "b'abc'" and the
                # fix for this is to tell it the encoding to use: i.e str(b'abc', 'utf-8') -> "abc"
                if type(df[c].iloc[0]) == bytes:
                    df[c] = df[c].str.decode('utf-8')

            if isinstance(df.index, MultiIndex):
                unicode_indexes = []
                # MultiIndex requires a conversion at each level.
                for level in range(len(df.index.levels)):
                    _index = df.index.get_level_values(level)
                    if isinstance(_index[0], bytes):
                        _index = _index.astype('unicode')
                    unicode_indexes.append(_index)
                df.index = unicode_indexes
            else:
                if type(df.index[0]) == bytes:
                    df.index = df.index.astype('unicode')

            if type(df.columns[0]) == bytes:
                df.columns = df.columns.astype('unicode')

        return df
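
A minimal sketch (assuming Python 3 and plain pandas; not part of the arctic code) of why the decode above is explicit rather than a str cast:

import pandas as pd

s = pd.Series([b"abc", b"def"])
print(s.astype(str).tolist())          # ["b'abc'", "b'def'"] -- keeps the b'' prefix
print(s.str.decode("utf-8").tolist())  # ['abc', 'def']       -- the intended text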
Code example #15
    def test_set_value_by_index(self):
        # See gh-12344
        df = DataFrame(np.arange(9).reshape(3, 3).T)
        df.columns = list('AAA')
        expected = df.iloc[:, 2]

        df.iloc[:, 0] = 3
        assert_series_equal(df.iloc[:, 2], expected)

        df = DataFrame(np.arange(9).reshape(3, 3).T)
        df.columns = [2, float(2), str(2)]
        expected = df.iloc[:, 1]

        df.iloc[:, 0] = 3
        assert_series_equal(df.iloc[:, 1], expected)
Code example #16
File: tdi.py Project: AbnerZheng/pandas
    def components(self):
        """
        Return a dataframe of the components (days, hours, minutes,
        seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas.

        Returns
        -------
        a DataFrame
        """
        from pandas import DataFrame

        columns = ['days', 'hours', 'minutes', 'seconds',
                   'milliseconds', 'microseconds', 'nanoseconds']
        hasnans = self.hasnans
        if hasnans:
            def f(x):
                if isnull(x):
                    return [np.nan] * len(columns)
                return x.components
        else:
            def f(x):
                return x.components

        result = DataFrame([f(x) for x in self])
        result.columns = columns
        if not hasnans:
            result = result.astype('int64')
        return result
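
For reference, a minimal usage sketch of the components accessor defined above, assuming a recent pandas:

import pandas as pd

tdi = pd.timedelta_range(start="1 days 02:30:00", periods=3, freq="12H")
parts = tdi.components  # one row per Timedelta: days, hours, ..., nanoseconds
print(parts[["days", "hours", "minutes"]])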
Code example #17
File: test_reshape.py Project: Zando2011/pandas
    def test_include_na(self):
        s = ['a', 'b', np.nan]
        res = get_dummies(s, sparse=self.sparse)
        exp = DataFrame({'a': {0: 1.0,
                               1: 0.0,
                               2: 0.0},
                         'b': {0: 0.0,
                               1: 1.0,
                               2: 0.0}})
        assert_frame_equal(res, exp)

        # Sparse dataframes do not allow nan labelled columns, see #GH8822
        res_na = get_dummies(s, dummy_na=True, sparse=self.sparse)
        exp_na = DataFrame({nan: {0: 0.0,
                                  1: 0.0,
                                  2: 1.0},
                            'a': {0: 1.0,
                                  1: 0.0,
                                  2: 0.0},
                            'b': {0: 0.0,
                                  1: 1.0,
                                  2: 0.0}}).reindex_axis(
                                      ['a', 'b', nan], 1)
        # hack (NaN handling in assert_index_equal)
        exp_na.columns = res_na.columns
        assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse)
        exp_just_na = DataFrame(Series(1.0, index=[0]), columns=[nan])
        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
Code example #18
File: test_apply.py Project: MattRijk/pandas
    def test_applymap(self):
        applied = self.frame.applymap(lambda x: x * 2)
        assert_frame_equal(applied, self.frame * 2)
        result = self.frame.applymap(type)

        # GH #465, function returning tuples
        result = self.frame.applymap(lambda x: (x, x))
        tm.assertIsInstance(result['A'][0], tuple)

        # GH 2909, object conversion to float in constructor?
        df = DataFrame(data=[1, 'a'])
        result = df.applymap(lambda x: x)
        self.assertEqual(result.dtypes[0], object)

        df = DataFrame(data=[1., 'a'])
        result = df.applymap(lambda x: x)
        self.assertEqual(result.dtypes[0], object)

        # GH2786
        df = DataFrame(np.random.random((3, 4)))
        df2 = df.copy()
        cols = ['a', 'a', 'a', 'a']
        df.columns = cols

        expected = df2.applymap(str)
        expected.columns = cols
        result = df.applymap(str)
        assert_frame_equal(result, expected)

        # datetime/timedelta
        df['datetime'] = Timestamp('20130101')
        df['timedelta'] = pd.Timedelta('1 min')
        result = df.applymap(str)
        for f in ['datetime', 'timedelta']:
            self.assertEqual(result.loc[0, f], str(df.loc[0, f]))
Code example #19
File: test_iloc.py Project: ankostis/pandas
    def test_iloc_setitem_dups(self):

        # GH 6766
        # iloc with a mask aligning from another iloc
        df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}])
        df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}])
        df = concat([df1, df2], axis=1)

        expected = df.fillna(3)
        expected['A'] = expected['A'].astype('float64')
        inds = np.isnan(df.iloc[:, 0])
        mask = inds[inds].index
        df.iloc[mask, 0] = df.iloc[mask, 2]
        tm.assert_frame_equal(df, expected)

        # del a dup column across blocks
        expected = DataFrame({0: [1, 2], 1: [3, 4]})
        expected.columns = ['B', 'B']
        del df['A']
        tm.assert_frame_equal(df, expected)

        # assign back to self
        df.iloc[[0, 1], [0, 1]] = df.iloc[[0, 1], [0, 1]]
        tm.assert_frame_equal(df, expected)

        # reversed x 2
        df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(
            drop=True)
        df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(
            drop=True)
        tm.assert_frame_equal(df, expected)
Code example #20
    def time_regex(data, col, form, nulls):
        '''
        Method to format the date columns in the raw data
        based on user input. Returns 3 formatted columns
        i.e. (year, month, day) including nulls
        '''

        fields = ['month', 'day', 'year']
        if any(isinstance(i, list) for i in col):
            col = list(chain.from_iterable(col))
        else:
            pass
        print(type(col))
        print(col)

        if len(nulls) > 0:
            nulldf = hlp.produce_null_df(
                len(nulls), nulls, len(data), 'NaN')


        else:
            nulldf = DataFrame()            
        try:
            if col[0] is not None:
                time_list_re = hlp.strip_time(data, col)
            else:
                time_list_re = []

        except Exception as e:
            print(str(e))
            raise AttributeError('Could not strip time format')            
        notnull = [x for x in fields if x not in nulls]

        for i,item in enumerate(form):
            try:
                time_form_list = []
                for j in time_list_re:
                    time_form_list.append(
                        [
                            to_datetime(
                                x, format=form[i]) for x in
                            j
                        ]
                    )
                if len(time_form_list) > 1:
                    timedf = DataFrame(
                        [list(x) for x in zip(
                            *time_form_list)])

                else:
                    timedf = DataFrame(time_form_list[0])                    
                    if len(notnull) == 1:
                        timedf.columns = notnull
                    else:
                        pass
                final = {'formatted': timedf, 'null': nulldf}
                return final
            except Exception as e:
                print(str(e))
                print('Trying different format')
Code example #21
File: parking.py Project: lrei/carpark_prediction
def create_df(db='parking.min.db', save_as='parking.df.pickle'):
    conn = sqlite3.connect(db)
    rows = conn.execute('''select updated, park_id, free_places
                        from parking_min''').fetchall()
    ids = list(set([t[1] for t in rows]))
    data = {}
    for x in ids:
        dates = [np.datetime64(r[0], 's')
                 for r in rows if r[1] == x]   # updated
        y = [r[2] for r in rows if r[1] == x]  # free_places (target)
        data[x] = Series(y, index=dates)

    # convert data to DataFrame
    df = DataFrame(data)
    # get the names
    nr = conn.execute('''SELECT DISTINCT name
                      FROM parking ORDER BY park_id''').fetchall()
    # replace non ascii chars
    names = [unicodedata.normalize('NFKD', x[0]).encode('ascii', 'ignore')
             for x in nr]
    # remove dots
    names = [x.replace(u'.', '') for x in names]
    # assign to columns
    df.columns = names

    # destroy where there all are NaNs
    df = df[pd.notnull(df).any(axis=1)]

    # save
    if save_as is not None:
        df.to_pickle(save_as)

    return df
Code example #22
File: test_pandas.py Project: FedericoCeratto/pandas
    def test_blocks_compat_GH9037(self):
        index = pd.date_range('20000101', periods=10, freq='H')
        df_mixed = DataFrame(OrderedDict(
            float_1=[-0.92077639, 0.77434435, 1.25234727, 0.61485564,
                     -0.60316077, 0.24653374, 0.28668979, -2.51969012,
                     0.95748401, -1.02970536],
            int_1=[19680418, 75337055, 99973684, 65103179, 79373900,
                   40314334, 21290235,  4991321, 41903419, 16008365],
            str_1=['78c608f1', '64a99743', '13d2ff52', 'ca7f4af2', '97236474',
                   'bde7e214', '1a6bde47', 'b1190be5', '7a669144', '8d64d068'],
            float_2=[-0.0428278, -1.80872357,  3.36042349, -0.7573685,
                     -0.48217572, 0.86229683, 1.08935819, 0.93898739,
                     -0.03030452, 1.43366348],
            str_2=['14f04af9', 'd085da90', '4bcfac83', '81504caf', '2ffef4a9',
                   '08e2f5c4', '07e1af03', 'addbd4a7', '1f6a09ba', '4bfc4d87'],
            int_2=[86967717, 98098830, 51927505, 20372254, 12601730, 20884027,
                   34193846, 10561746, 24867120, 76131025]
        ), index=index)

        # JSON deserialisation always creates unicode strings
        df_mixed.columns = df_mixed.columns.astype('unicode')

        df_roundtrip = pd.read_json(df_mixed.to_json(orient='split'),
                                    orient='split')
        assert_frame_equal(df_mixed, df_roundtrip,
                           check_index_type=True,
                           check_column_type=True,
                           check_frame_type=True,
                           by_blocks=True,
                           check_exact=True)
Code example #23
File: test_reshape.py Project: TomAugspurger/pandas
    def test_include_na(self, sparse, dtype):
        if sparse:
            pytest.xfail(reason='nan in index is problematic (GH 16894)')

        s = ['a', 'b', np.nan]
        res = get_dummies(s, sparse=sparse, dtype=dtype)
        exp = DataFrame({'a': [1, 0, 0],
                         'b': [0, 1, 0]},
                        dtype=self.effective_dtype(dtype))
        assert_frame_equal(res, exp)

        # Sparse dataframes do not allow nan labelled columns, see #GH8822
        res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
        exp_na = DataFrame({nan: [0, 0, 1],
                            'a': [1, 0, 0],
                            'b': [0, 1, 0]},
                           dtype=self.effective_dtype(dtype))
        exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
        # hack (NaN handling in assert_index_equal)
        exp_na.columns = res_na.columns
        assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([nan], dummy_na=True,
                                  sparse=sparse, dtype=dtype)
        exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
                                dtype=self.effective_dtype(dtype))
        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
Code example #24
File: GraphtecAnalysis.py Project: moomns/python
def prepare_dataset(filename):
    """
    csvファイルを読み込んで、扱える形のデータフレームに整形する関数
    秒以上の列とms以下の列が分離しているために、前処理が必要

    Argument
    filename->str:          ファイル名のパス

    return
    Data->pandas.DataFrame: 日時と温度を保持するデータフレーム
    """
    #load dataset
    data = pd.read_csv(filename, skiprows=17, encoding="shift-jis")
    data.columns=(["No", "Time", "ms", "Temp", "1", "A12345678", "A1234", "A1"])

    #reshape dataset
    index = np.arange(0,len(data))
    #data["Time"](yy/MM/DD hh:mm:ss)->hh:mm:ss
    #hh:mm:ss + ms->datetime
    date = [str(data["Time"][i]).split(" ") for i in index]
    date = [date[i][1] + str(":") + str(data["ms"][i]*10**3) for i in index]
    date = [date[i].split(":") for i in index]
    date = [datetime.time(int(date[i][0]), int(date[i][1]), int(date[i][2]), int(date[i][3])) for i in index]

    #make dataset
    Data = DataFrame(np.c_[date, data["Temp"]])
    Data.columns=(["date", "temperature"])
    return Data
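
A hypothetical call of prepare_dataset; "GL820_log.csv" is a placeholder for a Graphtec logger CSV export:

Data = prepare_dataset("GL820_log.csv")
print(Data.head())  # columns: date, temperature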
Code example #25
File: test_scalar.py Project: changhiskhan/pandas
    def test_at_to_fail(self):
        # at should not fallback
        # GH 7814
        s = Series([1, 2, 3], index=list('abc'))
        result = s.at['a']
        assert result == 1
        pytest.raises(ValueError, lambda: s.at[0])

        df = DataFrame({'A': [1, 2, 3]}, index=list('abc'))
        result = df.at['a', 'A']
        assert result == 1
        pytest.raises(ValueError, lambda: df.at['a', 0])

        s = Series([1, 2, 3], index=[3, 2, 1])
        result = s.at[1]
        assert result == 3
        pytest.raises(ValueError, lambda: s.at['a'])

        df = DataFrame({0: [1, 2, 3]}, index=[3, 2, 1])
        result = df.at[1, 0]
        assert result == 3
        pytest.raises(ValueError, lambda: df.at['a', 0])

        # GH 13822, incorrect error string with non-unique columns when missing
        # column is accessed
        df = DataFrame({'x': [1.], 'y': [2.], 'z': [3.]})
        df.columns = ['x', 'x', 'z']

        # Check that we get the correct value in the KeyError
        with pytest.raises(KeyError, match=r"\['y'\] not in index"):
            df[['x', 'y', 'z']]
Code example #26
def splitMNIST(data,random_state):

    print("\n####################")
    print("splitMNIST():\n")

    nrow = data["data"].shape[0]
    ncol = data["data"].shape[1]

    label_features = np.hstack(
        tup = (
            np.arange(nrow).reshape((nrow,1)),
            data['target'].reshape((nrow,1)),
            data['data']
            )
        )

    label_features = DataFrame(data=label_features)
    label_features.columns = ['index','label'] + getColnames(ncolSquared = ncol)

    simpleTrainSet, simpleTestSet = train_test_split(
        label_features,
        test_size    = 1/7,
        random_state = random_state
        )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    del label_features
    print("\nexiting: splitMNIST()")
    print("####################")
    return( simpleTrainSet, simpleTestSet )
Code example #27
def query_CAISODemand_hrly_Series():
    """specifically gets demand data"""

    import os
    parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    os.sys.path.insert(0,parentdir)

    import model
    s = model.connect()

    demand_obj = s.execute('SELECT time_start, mw_demand FROM "HistoricCAISODemands" WHERE caiso_tac=\'CA ISO-TAC\' and time_start between \'2014-01-01 07:00:00.000000\' and \'2015-01-01 00:00:00.000000\' ')
    demand_entry = demand_obj.fetchall()
    demand_df = DataFrame(demand_entry)
    demand_df.columns = ['time_start','mw_demand']

    dict_with_datetime_keys = { }

    for idx,row in enumerate(demand_df.values):
        time_start = row[0]

        # check date, since logs show we're missing a few
        if check_if_bad_date(time_start)!=True:

            # turn dict into a series.  will auto-index on dict keys
            mw_demand = row[1]
            dict_with_datetime_keys[time_start] = mw_demand

    return Series(dict_with_datetime_keys)
Code example #28
def query_CAISONetImports_hrly_Series():
    """specifically gets import data"""

    import os
    parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    os.sys.path.insert(0,parentdir)

    import model
    s = model.connect()

    imports_obj = s.execute('SELECT time_start, sum(mw_imports) FROM "HistoricCAISONetImports" where time_start between \'2014-01-01 07:00:00.000000\' and \'2015-01-01 00:00:00.000000\' GROUP BY time_start ')
    imports_entry = imports_obj.fetchall()
    imports_df = DataFrame(imports_entry)
    imports_df.columns = ['time_start','mw_demand']

    dict_with_datetime_keys = { }

    for idx,row in enumerate(imports_df.values):
        time_start = row[0]

        # check date, since logs show we're missing a few
        if check_if_bad_date(time_start)!=True:

            # turn dict into a series.  will auto-index on dict keys
            mw_imports = row[1]
            dict_with_datetime_keys[time_start] = mw_imports

    return Series(dict_with_datetime_keys)
Code example #29
File: webmodels.py Project: gitter-badger/cameo
def index_models_minho(host="http://darwin.di.uminho.pt/models"):
    """
    Retrieves a summary of all models in the database.

    Parameters
    ----------
    host: the service host (optional, default: http://darwin.di.uminho.pt/models)

    Returns
    -------
    pandas.DataFrame
        summary of the models in the database
    """
    uri = host + "/models.json"
    try:
        response = requests.get(uri)
    except requests.ConnectionError as e:
        logger.error("Cannot reach %s. Are you sure that you are connected to the internet?" % host)
        raise e
    if response.ok:
        try:
            json = response.json()
        except Exception as e:
            logger.error('No json could be decoded from server response coming from {}.'.format(host))
            raise e
        else:
            index = DataFrame(json, columns=["id", "name", "doi", "author",
                                             "year", "formats", "organism",
                                             "taxonomy", "optflux_validated"])
            index.columns = ["id", "name", "doi", "author", "year", "formats", "organism", "taxonomy", "validated"]
            return index
    else:
        raise Exception("Could not index available models. %s returned status code %d" % (host, response.status_code))
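
A hypothetical call of index_models_minho (it needs network access to the Minho models service):

models = index_models_minho()
print(models[["id", "name", "organism", "validated"]].head())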
Code example #30
File: nba.py Project: iswdp/nba-forest
def arrange_aggregates(cumsums, symbols, aggs):
    for i in symbols:
        cumsums[i] = cumsums[i].ix[:,0:5]

    cols = cumsums['ATL'].columns.tolist()
    cols2 = aggs['ATL'].columns.tolist()
    cols3 = (aggs['ATL'].columns + '1').tolist()
    cols.extend(cols2)
    cols.extend(cols3)
    ATL = DataFrame(columns = cols)

    for team in symbols:
        for Date in cumsums[team]['Date']:
            Opponent = cumsums[team].ix[cumsums[team]['Date'] == Date, 'Opponent'].all()
            cumsums_temp = cumsums[team].ix[cumsums[team]['Date'] == Date]
            cumsums_temp = cumsums_temp.reset_index()
            team_temp = aggs[team]
            oppenent_temp = DataFrame(aggs[Opponent])
            oppenent_temp.columns = cols3
            atl = pd.concat([cumsums_temp, team_temp, oppenent_temp], axis = 1)
            atl = atl.drop('index', axis=1)
            atl.columns = cols
            ATL = pd.concat([ATL, atl], axis = 0)

        print(team)

    ATL.to_csv('final.csv', sep=',', index=False)
Code example #31
File: test_readers.py Project: CSCD01/pandas-team24
    def test_read_excel_multiindex(self, read_ext):
        # see gh-4679
        if pd.read_excel.keywords["engine"] == "pyxlsb":
            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")

        mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]])
        mi_file = "testmultiindex" + read_ext

        # "mi_column" sheet
        expected = DataFrame(
            [
                [1, 2.5, pd.Timestamp("2015-01-01"), True],
                [2, 3.5, pd.Timestamp("2015-01-02"), False],
                [3, 4.5, pd.Timestamp("2015-01-03"), False],
                [4, 5.5, pd.Timestamp("2015-01-04"), True],
            ],
            columns=mi,
        )

        actual = pd.read_excel(mi_file,
                               "mi_column",
                               header=[0, 1],
                               index_col=0)
        tm.assert_frame_equal(actual, expected)

        # "mi_index" sheet
        expected.index = mi
        expected.columns = ["a", "b", "c", "d"]

        actual = pd.read_excel(mi_file, "mi_index", index_col=[0, 1])
        tm.assert_frame_equal(actual, expected, check_names=False)

        # "both" sheet
        expected.columns = mi

        actual = pd.read_excel(mi_file,
                               "both",
                               index_col=[0, 1],
                               header=[0, 1])
        tm.assert_frame_equal(actual, expected, check_names=False)

        # "mi_index_name" sheet
        expected.columns = ["a", "b", "c", "d"]
        expected.index = mi.set_names(["ilvl1", "ilvl2"])

        actual = pd.read_excel(mi_file, "mi_index_name", index_col=[0, 1])
        tm.assert_frame_equal(actual, expected)

        # "mi_column_name" sheet
        expected.index = list(range(4))
        expected.columns = mi.set_names(["c1", "c2"])
        actual = pd.read_excel(mi_file,
                               "mi_column_name",
                               header=[0, 1],
                               index_col=0)
        tm.assert_frame_equal(actual, expected)

        # see gh-11317
        # "name_with_int" sheet
        expected.columns = mi.set_levels([1, 2],
                                         level=1).set_names(["c1", "c2"])

        actual = pd.read_excel(mi_file,
                               "name_with_int",
                               index_col=0,
                               header=[0, 1])
        tm.assert_frame_equal(actual, expected)

        # "both_name" sheet
        expected.columns = mi.set_names(["c1", "c2"])
        expected.index = mi.set_names(["ilvl1", "ilvl2"])

        actual = pd.read_excel(mi_file,
                               "both_name",
                               index_col=[0, 1],
                               header=[0, 1])
        tm.assert_frame_equal(actual, expected)

        # "both_skiprows" sheet
        actual = pd.read_excel(mi_file,
                               "both_name_skiprows",
                               index_col=[0, 1],
                               header=[0, 1],
                               skiprows=2)
        tm.assert_frame_equal(actual, expected)
Code example #32
def _set(df: pd.DataFrame,
         area: Optional[str] = None,
         currency: Optional[str] = None,
         inf_adj: Optional[str] = None,
         unit: Optional[str] = None,
         seas_adj: Optional[str] = None,
         ts_type: Optional[str] = None,
         cumperiods: Optional[int] = None):
    """Add a multiindex to a dataframe's columns.

    Characterize a dataframe by adding metadata to its column names by
    use of multiindexes.

    Parameters
    ----------
    df : Pandas dataframe
    area : str or None (default is None)
        Topic to which the data relates to.
    currency : str or None (default is None)
        Currency denomination.
    inf_adj : str or None (default is None)
        Whether the data is in constant prices.
    unit : str or None (default is None)
        Units in which data is defined.
    seas_adj : str or None (default is None)
        Whether the data is seasonally adjusted.
    ts_type : str or None (default is None)
        Time series type, generally 'Stock' or 'Flujo'.
    cumperiods : int or None (default is None)
        Number of periods accumulated per period.

    Returns
    -------
    None

    See also
    --------
    Modifies the dataframe's column names in place.

    """
    colnames = df.columns
    try:
        inferred_freq = pd.infer_freq(df.index)
    except ValueError:
        warnings.warn(
            "ValueError: Need at least 3 dates to infer frequency. "
            "Setting to '-'.", UserWarning)
        inferred_freq = "-"
    if inferred_freq is None:
        warnings.warn(
            "Metadata: frequency could not be inferred "
            "from the index. Setting to '-'.", UserWarning)
        inferred_freq = "-"
    names = [
        "Indicador", "Área", "Frecuencia", "Moneda", "Inf. adj.", "Unidad",
        "Seas. Adj.", "Tipo", "Acum. períodos"
    ]
    if not isinstance(df.columns, pd.MultiIndex):
        df.columns = pd.MultiIndex.from_product([
            colnames, [area], [inferred_freq], [currency], [inf_adj], [unit],
            [seas_adj], [ts_type], [cumperiods]
        ],
                                                names=names)
    else:
        arrays = []
        for level in range(0, 9):
            arrays.append(list(df.columns.get_level_values(level)))

        arrays[2] = [inferred_freq] * len(df.columns)
        if area is not None:
            arrays[1] = [area] * len(df.columns)
        if currency is not None:
            arrays[3] = [currency] * len(df.columns)
        if inf_adj is not None:
            arrays[4] = [inf_adj] * len(df.columns)
        if unit is not None:
            arrays[5] = [unit] * len(df.columns)
        if seas_adj is not None:
            arrays[6] = [seas_adj] * len(df.columns)
        if ts_type is not None:
            arrays[7] = [ts_type] * len(df.columns)
        if cumperiods is not None:
            arrays[8] = [cumperiods] * len(df.columns)

        try:
            arrays[8] = list(map(int, arrays[8]))
        except ValueError:
            pass

        tuples = list(zip(*arrays))
        df.columns = pd.MultiIndex.from_tuples(tuples, names=names)
        return
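
A hypothetical call of _set on a small monthly frame; the metadata strings below are made-up placeholders, not values taken from the original project:

import pandas as pd

df = pd.DataFrame({"gdp": [100.0, 102.0, 101.5]},
                  index=pd.date_range("2020-01-31", periods=3, freq="M"))
_set(df, area="Activity", currency="UYU", inf_adj="Const.",
     unit="Millions", seas_adj="NSA", ts_type="Flujo", cumperiods=1)
print(df.columns.names)  # ['Indicador', 'Área', 'Frecuencia', ..., 'Acum. períodos']
print(df.columns[0])     # e.g. ('gdp', 'Activity', 'M', 'UYU', 'Const.', 'Millions', 'NSA', 'Flujo', 1)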
Code example #33
    print(cities)
    print(group)

n = 2
data = [[] for x in range(n)]
z1 = cities.get_group(1)
count = 0
for i in range(0, 20):
    count = count + 1
    z3 = z1[z1['Zones_id'] == count]
    a = len(z3.index)
    data[0].append(count)
    data[1].append(a)

df = DataFrame(data).transpose()
df.columns = ['Zone_id', 'Average_Calorie']
df.to_csv('zonesorders1.csv', index=False)

#city2
n = 2
data = [[] for x in range(n)]
z1 = cities.get_group(2)
count = 0
for i in range(0, 20):
    count = count + 1
    z3 = z1[z1['Zones_id'] == count]
    a = len(z3.index)
    data[0].append(count)
    data[1].append(a)

df = DataFrame(data).transpose()
Code example #34
def get_variance_accumulated(path_dataset, range_=(1, 11)):
    """Variance computed in parallel.
    
    Calculation of accumulated variance in channels and files.
    Parameter receives the path where the folder with files is located.
    Calculates the variance only in the first ten people.

    On the tested computer it took about 10 minutes going
    through all the files and accumulating the variance.
    We filter warnings.
    """
    fold_variance = Path(path_dataset) / "variance_accumulated"

    if not check_exist(path_dataset, "variance_accumulated"):

        print("Loading the files to calculate variance.")

        filterwarnings("ignore")

        accumulate_count = 0
        accumulate_avg = 0
        accumulate_var = 0

        selected_channels = [
            "time",
            "FP1-F7",
            "F7-T7",
            "T7-P7",
            "P7-O1",
            "FP1-F3",
            "F3-C3",
            "C3-P3",
            "P3-O1",
            "FP2-F4",
            "F4-C4",
            "C4-P4",
            "P4-O2",
            "FP2-F8",
            "F8-T8",
            "T8-P8-0",
            "P8-O2",
            "FZ-CZ",
            "CZ-PZ",
            "P7-T7",
            "T7-FT9",
            "FT9-FT10",
            "FT10-T8",
            "T8-P8-1",
        ]

        for id_patient in tqdm_notebook(range(range_[0], range_[1]),
                                        desc="Patient"):

            path_files = join(path_dataset,
                              "chb{0:0=2d}/*.edf".format(id_patient))

            files_in_folder = glob(path_files)
            for enum, file in enumerate(
                    tqdm_notebook(files_in_folder, desc="Files", leave=False)):

                variance_file = read_raw_edf(input_fname=file,
                                             verbose=0).to_data_frame(
                                                 picks=["eeg"],
                                                 time_format="ms")

                # Removing channels that are not present in all files.
                variance_file = variance_file[
                    variance_file.columns.intersection(selected_channels)]
                # Sorting the channels
                variance_file.sort_index(axis=1, inplace=True)

                if (enum == 0) & (id_patient == range_[0]):
                    accumulate_count = len(variance_file)
                    accumulate_avg = variance_file.mean()
                    accumulate_var = variance_file.var()

                else:
                    (
                        accumulate_count,
                        accumulate_avg,
                        accumulate_var,
                    ) = parallel_variance(
                        accumulate_count,
                        accumulate_avg,
                        accumulate_var,
                        len(variance_file),
                        variance_file.mean(),
                        variance_file.var(),
                    )

        accumulate_var = DataFrame(accumulate_var)
        accumulate_var.columns = accumulate_var.columns.astype(str)

        accumulate_var.to_parquet(fold_variance /
                                  "variance_accumulated.parquet",
                                  engine="pyarrow")
        return accumulate_var
    else:
        print("Reading the variance already calculated.")
        variance = read_parquet(fold_variance / "variance_accumulated.parquet",
                                engine="pyarrow")
        return variance
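
parallel_variance is used above but not shown; below is a minimal sketch of such a helper, assuming the standard Chan et al. formula for merging two (count, mean, sample variance) chunks. It works elementwise on pandas Series as well as on scalars:

def parallel_variance(count_a, avg_a, var_a, count_b, avg_b, var_b):
    # Counts add, means are count-weighted, and the sums of squared
    # deviations (M2 = var * (n - 1)) combine with a cross term.
    count = count_a + count_b
    delta = avg_b - avg_a
    avg = avg_a + delta * count_b / count
    m2 = (var_a * (count_a - 1) + var_b * (count_b - 1)
          + delta ** 2 * count_a * count_b / count)
    return count, avg, m2 / (count - 1)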
Code example #35
File: test_operators.py Project: zheewang/pandas
    def test_boolean_comparison(self):

        # GH 4576
        # boolean comparisons with a tuple/list give unexpected results
        df = DataFrame(np.arange(6).reshape((3, 2)))
        b = np.array([2, 2])
        b_r = np.atleast_2d([2, 2])
        b_c = b_r.T
        l = (2, 2, 2)
        tup = tuple(l)

        # gt
        expected = DataFrame([[False, False], [False, True], [True, True]])
        result = df > b
        assert_frame_equal(result, expected)

        result = df.values > b
        assert_numpy_array_equal(result, expected.values)

        result = df > l
        assert_frame_equal(result, expected)

        result = df > tup
        assert_frame_equal(result, expected)

        result = df > b_r
        assert_frame_equal(result, expected)

        result = df.values > b_r
        assert_numpy_array_equal(result, expected.values)

        pytest.raises(ValueError, df.__gt__, b_c)
        pytest.raises(ValueError, df.values.__gt__, b_c)

        # ==
        expected = DataFrame([[False, False], [True, False], [False, False]])
        result = df == b
        assert_frame_equal(result, expected)

        result = df == l
        assert_frame_equal(result, expected)

        result = df == tup
        assert_frame_equal(result, expected)

        result = df == b_r
        assert_frame_equal(result, expected)

        result = df.values == b_r
        assert_numpy_array_equal(result, expected.values)

        pytest.raises(ValueError, lambda: df == b_c)
        assert df.values.shape != b_c.shape

        # with alignment
        df = DataFrame(np.arange(6).reshape((3, 2)),
                       columns=list('AB'),
                       index=list('abc'))
        expected.index = df.index
        expected.columns = df.columns

        result = df == l
        assert_frame_equal(result, expected)

        result = df == tup
        assert_frame_equal(result, expected)
Code example #36
    pickle.dump(content_WS_list,fp)
fp.close()

with open('reference_WS_list.pkl', 'wb') as fp:
    pickle.dump(reference_WS_list,fp)
fp.close()
all_list = []
all_list.append(chi_paper_name_WS_list)
all_list.append(chi_keyword_WS_list)
all_list.append(abstract_WS_list)
all_list.append(content_WS_list)
all_list.append(reference_WS_list)
print(len(all_list))

df = DataFrame(all_list).transpose()
df.columns =["chi_paper_name_WS","chi_keyword_WS","abstract_WS","content_WS","reference_WS"]
print(df.info())
print(df.head(5))
df.to_csv("all_data_WS.csv")
with open('All_WS_list.pkl', 'wb') as fp:
    pickle.dump(all_list,fp)
fp.close()

'''
def add_field_to_mongodb(collection_name, chi_paper_name_WS_list, chi_keyword_WS_list, abstract_WS_list, content_WS_list,reference_WS_list):
    client = MongoClient('localhost', 27017)
    db = client['110_conference']
    collection = db[collection_name]

    AllFields_array = np.array(All_list)
    # print(AllFields_list[0][2], AllFields_list[1][2])
Code example #37
# returns one or otherwise None as a tuple
print(c.fetchone())

# returns one or otherwise None as a tuple
print(c.fetchmany(2))

# returns a list of tuples
print(c.fetchall())

# Since now the cursor has read all the rows and we are at End
# So again fetching the records from the database
c.execute("SELECT * FROM employees")

# STEP 5
df = DataFrame(c.fetchall())  # putting the result into Dataframe
df.columns = ["id", "first", "last", "pay"]

# STEP 6
# commits the current transaction
conn.commit()

# STEP 7
# closing the connection
conn.close()
"""
Database handling using MySQL on Local Machine
"""
#use below command in anaconda prompt
# pip install mysql-connector-python

from pandas import DataFrame
Code example #38
import requests
import csv
from bs4 import BeautifulSoup
import pandas as pd
from pandas import Series, DataFrame

df = pd.read_csv('gymnasium-names2.csv')

#pd.concat([Series(row['gymnasium'], row['fach'].split(','))
#            for _, row in df.iterrows()]).reset_index()

b = DataFrame(df.fach.str.split(', ').tolist(), index=df.gymnasium).stack()
b = b.reset_index()[[0, 'gymnasium']]  # var1 variable is currently labeled 0
b.columns = ['fach', 'gymnasium']  # renaming var1

b.to_csv('dim_fach.csv', index=False)

b = DataFrame(df.sprachen.str.split(', ').tolist(), index=df.gymnasium).stack()
b = b.reset_index()[[0, 'gymnasium']]  # var1 variable is currently labeled 0
b.columns = ['sprachen', 'gymnasium']  # renaming var1

b.to_csv('dim_sprachen.csv', index=False)
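
The same normalisation can also be written with DataFrame.explode (pandas >= 0.25); a sketch with a made-up frame standing in for gymnasium-names2.csv:

import pandas as pd

df = pd.DataFrame({"gymnasium": ["A-Gymnasium", "B-Gymnasium"],
                   "fach": ["Mathe, Physik", "Kunst"]})
dim_fach = (df.assign(fach=df["fach"].str.split(", "))
              .explode("fach")[["fach", "gymnasium"]])
print(dim_fach)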
Code example #39
from flask_sqlalchemy import SQLAlchemy
from datetime import datetime
import time
from urllib.parse import unquote

app = Flask(__name__)
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///mydb.db'
db = SQLAlchemy(app)
db.create_all()

# Load the data from the db
# books df
books_db = db.session.execute('select * from Books LIMIT 1000')
books = DataFrame(books_db.fetchall())
books.columns = books_db.keys()
#print(books.head())
#print(books.shape)

# ratings df
ratings_db = db.session.execute('select * from ratings LIMIT 1000')
ratings = DataFrame(ratings_db.fetchall())
ratings.columns = ratings_db.keys()
#print(ratings.head())
#print(ratings.shape)

# book_tags df
book_tags_db = db.session.execute('select * from book_tags LIMIT 1000')
book_tags = DataFrame(book_tags_db.fetchall())
book_tags.columns = book_tags_db.keys()
#print(book_tags.head())
Code example #40
def test_to_frame():
    tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")]

    index = MultiIndex.from_tuples(tuples)
    result = index.to_frame(index=False)
    expected = DataFrame(tuples)
    tm.assert_frame_equal(result, expected)

    result = index.to_frame()
    expected.index = index
    tm.assert_frame_equal(result, expected)

    tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")]
    index = MultiIndex.from_tuples(tuples, names=["first", "second"])
    result = index.to_frame(index=False)
    expected = DataFrame(tuples)
    expected.columns = ["first", "second"]
    tm.assert_frame_equal(result, expected)

    result = index.to_frame()
    expected.index = index
    tm.assert_frame_equal(result, expected)

    # See GH-22580
    index = MultiIndex.from_tuples(tuples)
    result = index.to_frame(index=False, name=["first", "second"])
    expected = DataFrame(tuples)
    expected.columns = ["first", "second"]
    tm.assert_frame_equal(result, expected)

    result = index.to_frame(name=["first", "second"])
    expected.index = index
    expected.columns = ["first", "second"]
    tm.assert_frame_equal(result, expected)

    msg = "'name' must be a list / sequence of column names."
    with pytest.raises(TypeError, match=msg):
        index.to_frame(name="first")

    msg = "'name' should have same length as number of levels on index."
    with pytest.raises(ValueError, match=msg):
        index.to_frame(name=["first"])

    # Tests for datetime index
    index = MultiIndex.from_product(
        [range(5), pd.date_range("20130101", periods=3)])
    result = index.to_frame(index=False)
    expected = DataFrame({
        0: np.repeat(np.arange(5, dtype="int64"), 3),
        1: np.tile(pd.date_range("20130101", periods=3), 5),
    })
    tm.assert_frame_equal(result, expected)

    result = index.to_frame()
    expected.index = index
    tm.assert_frame_equal(result, expected)

    # See GH-22580
    result = index.to_frame(index=False, name=["first", "second"])
    expected = DataFrame({
        "first":
        np.repeat(np.arange(5, dtype="int64"), 3),
        "second":
        np.tile(pd.date_range("20130101", periods=3), 5),
    })
    tm.assert_frame_equal(result, expected)

    result = index.to_frame(name=["first", "second"])
    expected.index = index
    tm.assert_frame_equal(result, expected)
コード例 #41
0
def create_data():
    """create the pickle data"""
    data = {
        "A": [0.0, 1.0, 2.0, 3.0, np.nan],
        "B": [0, 1, 0, 1, 0],
        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
        "D": date_range("1/1/2009", periods=5),
        "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
    }

    scalars = {
        "timestamp": Timestamp("20130101"),
        "period": Period("2012", "M")
    }

    index = {
        "int": Index(np.arange(10)),
        "date": date_range("20130101", periods=10),
        "period": period_range("2013-01-01", freq="M", periods=10),
        "float": Index(np.arange(10, dtype=np.float64)),
        "uint": Index(np.arange(10, dtype=np.uint64)),
        "timedelta": timedelta_range("00:00:00", freq="30T", periods=10),
    }

    index["range"] = RangeIndex(10)

    index["interval"] = interval_range(0, periods=10)

    mi = {
        "reg2":
        MultiIndex.from_tuples(
            tuple(
                zip(*[
                    ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
                    ["one", "two", "one", "two", "one", "two", "one", "two"],
                ])),
            names=["first", "second"],
        )
    }

    series = {
        "float":
        Series(data["A"]),
        "int":
        Series(data["B"]),
        "mixed":
        Series(data["E"]),
        "ts":
        Series(np.arange(10).astype(np.int64),
               index=date_range("20130101", periods=10)),
        "mi":
        Series(
            np.arange(5).astype(np.float64),
            index=MultiIndex.from_tuples(tuple(
                zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                                         names=["one", "two"]),
        ),
        "dup":
        Series(np.arange(5).astype(np.float64),
               index=["A", "B", "C", "D", "A"]),
        "cat":
        Series(Categorical(["foo", "bar", "baz"])),
        "dt":
        Series(date_range("20130101", periods=5)),
        "dt_tz":
        Series(date_range("20130101", periods=5, tz="US/Eastern")),
        "period":
        Series([Period("2000Q1")] * 5),
    }

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")
    frame = {
        "float":
        DataFrame({
            "A": series["float"],
            "B": series["float"] + 1
        }),
        "int":
        DataFrame({
            "A": series["int"],
            "B": series["int"] + 1
        }),
        "mixed":
        DataFrame({k: data[k]
                   for k in ["A", "B", "C", "D"]}),
        "mi":
        DataFrame(
            {
                "A": np.arange(5).astype(np.float64),
                "B": np.arange(5).astype(np.int64)
            },
            index=MultiIndex.from_tuples(
                tuple(
                    zip(*[
                        ["bar", "bar", "baz", "baz", "baz"],
                        ["one", "two", "one", "two", "three"],
                    ])),
                names=["first", "second"],
            ),
        ),
        "dup":
        DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                  columns=["A", "B", "A"]),
        "cat_onecol":
        DataFrame({"A": Categorical(["foo", "bar"])}),
        "cat_and_float":
        DataFrame({
            "A": Categorical(["foo", "bar", "baz"]),
            "B": np.arange(3).astype(np.int64),
        }),
        "mixed_dup":
        mixed_dup_df,
        "dt_mixed_tzs":
        DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
            },
            index=range(5),
        ),
        "dt_mixed2_tzs":
        DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
                "C": Timestamp("20130603", tz="UTC"),
            },
            index=range(5),
        ),
    }

    cat = {
        "int8": Categorical(list("abcdefg")),
        "int16": Categorical(np.arange(1000)),
        "int32": Categorical(np.arange(10000)),
    }

    timestamp = {
        "normal": Timestamp("2011-01-01"),
        "nat": NaT,
        "tz": Timestamp("2011-01-01", tz="US/Eastern"),
    }

    timestamp["freq"] = Timestamp("2011-01-01", freq="D")
    timestamp["both"] = Timestamp("2011-01-01", tz="Asia/Tokyo", freq="M")

    off = {
        "DateOffset": DateOffset(years=1),
        "DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824),
        "BusinessDay": BusinessDay(offset=timedelta(seconds=9)),
        "BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"),
        "CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"),
        "SemiMonthBegin": SemiMonthBegin(day_of_month=9),
        "SemiMonthEnd": SemiMonthEnd(day_of_month=24),
        "MonthBegin": MonthBegin(1),
        "MonthEnd": MonthEnd(1),
        "QuarterBegin": QuarterBegin(1),
        "QuarterEnd": QuarterEnd(1),
        "Day": Day(1),
        "YearBegin": YearBegin(1),
        "YearEnd": YearEnd(1),
        "Week": Week(1),
        "Week_Tues": Week(2, normalize=False, weekday=1),
        "WeekOfMonth": WeekOfMonth(week=3, weekday=4),
        "LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3),
        "FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
        "Easter": Easter(),
        "Hour": Hour(1),
        "Minute": Minute(1),
    }

    return {
        "series": series,
        "frame": frame,
        "index": index,
        "scalars": scalars,
        "mi": mi,
        "sp_series": {
            "float": _create_sp_series(),
            "ts": _create_sp_tsseries()
        },
        "sp_frame": {
            "float": _create_sp_frame()
        },
        "cat": cat,
        "timestamp": timestamp,
        "offsets": off,
    }
コード例 #42
0
import pandas as pd
from pandas import DataFrame
csvfile = pd.read_csv('./cloth_shop.csv', header=None)

df = DataFrame(csvfile)

df.columns = [
    'Name', 'Age', 'Weight', 'm0006', 'm0612', 'm1218', 'f0006', 'f0612',
    'f1218'
]

# drop rows where every value is missing
df.dropna(inplace=True, how='all')

# fill missing ages with the most frequent value
age_maxf = df['Age'].value_counts().index[0]
df['Age'].fillna(age_maxf, inplace=True)

# split the Name column into first and last name
df[['First_Name', 'Last_Name']] = df['Name'].str.split(expand=True)
#df.insert(0,['First_Name','Last_Name'],df['Name'].str.split(expand=True))
df.drop('Name', axis=1, inplace=True)
# remove non-ASCII characters; assign back, since replace() on a column subset returns a copy
df[['First_Name', 'Last_Name']] = df[['First_Name', 'Last_Name']].replace(
    {r'[^\x00-\x7F]+': ''}, regex=True)

df.drop_duplicates(['First_Name', 'Last_Name'], inplace=True)

# select the rows whose Weight is given in lbs
rows_with_lbs = df['Weight'].str.contains('lbs').fillna(False)
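The cleaning example breaks off here; one plausible continuation (only a sketch, assuming weights are stored as strings such as '140lbs' and should be normalised to kilograms) could be:

# sketch: convert the lbs rows to kilograms (1 lb is roughly 0.4536 kg)
for i, lbs_row in df[rows_with_lbs].iterrows():
    weight = int(float(lbs_row['Weight'][:-3]) * 0.4536)
    df.at[i, 'Weight'] = '{}kgs'.format(weight)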
コード例 #43
0
# -*- coding: utf-8 -*-
"""
Created on Thu May 16 16:03:57 2019

@author: abhin
"""

import sqlite3
from pandas import DataFrame

coonect1 = sqlite3.connect("univercity.db")
curser1 = coonect1.cursor()
curser1.execute("""CREATE TABLE univercity(
          Student_Name TEXT,
          Student_Age INTEGER, 
          Student_Roll_no INTEGER,
          Student_Branch TEXT
          )""")
curser1.execute("INSERT INTO univercity VALUES ('abhi',20,1,'cse')")
curser1.execute("INSERT INTO univercity VALUES ('abhi2',21,2,'cse')")
curser1.execute("INSERT INTO univercity VALUES ('abhi3',23,3,'cse')")
curser1.execute("INSERT INTO univercity VALUES ('abhi4',25,1,'it')")
curser1.execute("SELECT * FROM univercity")
df = DataFrame(curser1.fetchall())
df.columns = [
    "Student_Name", "Student_Age", "Student_Roll_no", "Student_Branch"
]
coonect1.commit()
coonect1.close()
コード例 #44
0
File: test_repr_info.py Project: thekensta/pandas
    def test_info_memory_usage(self):
        # Ensure memory usage is displayed, when asserted, on the last line
        dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
                  'complex128', 'object', 'bool']
        data = {}
        n = 10
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        buf = StringIO()
        # display memory usage case
        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        self.assertTrue("memory usage: " in res[-1])
        # do not display memory usage case
        df.info(buf=buf, memory_usage=False)
        res = buf.getvalue().splitlines()
        self.assertTrue("memory usage: " not in res[-1])

        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        # memory usage is a lower bound, so print it as XYZ+ MB
        self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))

        df.iloc[:, :5].info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        # excluded column with object dtype, so estimate is accurate
        self.assertFalse(re.match(r"memory usage: [^+]+\+", res[-1]))

        df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
        df_with_object_index.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))

        df_with_object_index.info(buf=buf, memory_usage='deep')
        res = buf.getvalue().splitlines()
        self.assertTrue(re.match(r"memory usage: [^+]+$", res[-1]))

        self.assertTrue(df_with_object_index.memory_usage(index=True,
                                                          deep=True).sum()
                        > df_with_object_index.memory_usage(index=True).sum())

        df_object = pd.DataFrame({'a': ['a']})
        self.assertTrue(df_object.memory_usage(deep=True).sum()
                        > df_object.memory_usage().sum())

        # Test a DataFrame with duplicate columns
        dtypes = ['int64', 'int64', 'int64', 'float64']
        data = {}
        n = 100
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        df.columns = dtypes
        # Ensure df size is as expected
        df_size = df.memory_usage().sum()
        exp_size = (len(dtypes) + 1) * n * 8  # (cols + index) * rows * bytes
        self.assertEqual(df_size, exp_size)
        # Ensure number of cols in memory_usage is the same as df
        size_df = np.size(df.columns.values) + 1  # index=True; default
        self.assertEqual(size_df, np.size(df.memory_usage()))

        # assert deep works only on object
        self.assertEqual(df.memory_usage().sum(),
                         df.memory_usage(deep=True).sum())

        # test for validity
        DataFrame(1, index=['a'], columns=['A']
                  ).memory_usage(index=True)
        DataFrame(1, index=['a'], columns=['A']
                  ).index.nbytes
        df = DataFrame(
            data=1,
            index=pd.MultiIndex.from_product(
                [['a'], range(1000)]),
            columns=['A']
        )
        df.index.nbytes
        df.memory_usage(index=True)
        df.index.values.nbytes

        # sys.getsizeof will call the .memory_usage with
        # deep=True, and add on some GC overhead
        diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df)
        self.assertTrue(abs(diff) < 100)
コード例 #45
0
def create_data():
    """ create the pickle data """
    data = {
        "A": [0.0, 1.0, 2.0, 3.0, np.nan],
        "B": [0, 1, 0, 1, 0],
        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
        "D": date_range("1/1/2009", periods=5),
        "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
    }

    scalars = dict(timestamp=Timestamp("20130101"), period=Period("2012", "M"))

    index = dict(
        int=Index(np.arange(10)),
        date=date_range("20130101", periods=10),
        period=period_range("2013-01-01", freq="M", periods=10),
        float=Index(np.arange(10, dtype=np.float64)),
        uint=Index(np.arange(10, dtype=np.uint64)),
        timedelta=timedelta_range("00:00:00", freq="30T", periods=10),
    )

    index["range"] = RangeIndex(10)

    if _loose_version >= LooseVersion("0.21"):
        from pandas import interval_range

        index["interval"] = interval_range(0, periods=10)

    mi = dict(
        reg2=MultiIndex.from_tuples(
            tuple(
                zip(
                    *[
                        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
                        ["one", "two", "one", "two", "one", "two", "one", "two"],
                    ]
                )
            ),
            names=["first", "second"],
        )
    )

    series = dict(
        float=Series(data["A"]),
        int=Series(data["B"]),
        mixed=Series(data["E"]),
        ts=Series(
            np.arange(10).astype(np.int64), index=date_range("20130101", periods=10)
        ),
        mi=Series(
            np.arange(5).astype(np.float64),
            index=MultiIndex.from_tuples(
                tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=["one", "two"]
            ),
        ),
        dup=Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]),
        cat=Series(Categorical(["foo", "bar", "baz"])),
        dt=Series(date_range("20130101", periods=5)),
        dt_tz=Series(date_range("20130101", periods=5, tz="US/Eastern")),
        period=Series([Period("2000Q1")] * 5),
    )

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")
    frame = dict(
        float=DataFrame({"A": series["float"], "B": series["float"] + 1}),
        int=DataFrame({"A": series["int"], "B": series["int"] + 1}),
        mixed=DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}),
        mi=DataFrame(
            {"A": np.arange(5).astype(np.float64), "B": np.arange(5).astype(np.int64)},
            index=MultiIndex.from_tuples(
                tuple(
                    zip(
                        *[
                            ["bar", "bar", "baz", "baz", "baz"],
                            ["one", "two", "one", "two", "three"],
                        ]
                    )
                ),
                names=["first", "second"],
            ),
        ),
        dup=DataFrame(
            np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"]
        ),
        cat_onecol=DataFrame({"A": Categorical(["foo", "bar"])}),
        cat_and_float=DataFrame(
            {
                "A": Categorical(["foo", "bar", "baz"]),
                "B": np.arange(3).astype(np.int64),
            }
        ),
        mixed_dup=mixed_dup_df,
        dt_mixed_tzs=DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
            },
            index=range(5),
        ),
        dt_mixed2_tzs=DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
                "C": Timestamp("20130603", tz="UTC"),
            },
            index=range(5),
        ),
    )

    cat = dict(
        int8=Categorical(list("abcdefg")),
        int16=Categorical(np.arange(1000)),
        int32=Categorical(np.arange(10000)),
    )

    timestamp = dict(
        normal=Timestamp("2011-01-01"),
        nat=NaT,
        tz=Timestamp("2011-01-01", tz="US/Eastern"),
    )

    timestamp["freq"] = Timestamp("2011-01-01", freq="D")
    timestamp["both"] = Timestamp("2011-01-01", tz="Asia/Tokyo", freq="M")

    off = {
        "DateOffset": DateOffset(years=1),
        "DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824),
        "BusinessDay": BusinessDay(offset=timedelta(seconds=9)),
        "BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"),
        "CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"),
        "SemiMonthBegin": SemiMonthBegin(day_of_month=9),
        "SemiMonthEnd": SemiMonthEnd(day_of_month=24),
        "MonthBegin": MonthBegin(1),
        "MonthEnd": MonthEnd(1),
        "QuarterBegin": QuarterBegin(1),
        "QuarterEnd": QuarterEnd(1),
        "Day": Day(1),
        "YearBegin": YearBegin(1),
        "YearEnd": YearEnd(1),
        "Week": Week(1),
        "Week_Tues": Week(2, normalize=False, weekday=1),
        "WeekOfMonth": WeekOfMonth(week=3, weekday=4),
        "LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3),
        "FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
        "Easter": Easter(),
        "Hour": Hour(1),
        "Minute": Minute(1),
    }

    return dict(
        series=series,
        frame=frame,
        index=index,
        scalars=scalars,
        mi=mi,
        sp_series=dict(float=_create_sp_series(), ts=_create_sp_tsseries()),
        sp_frame=dict(float=_create_sp_frame()),
        cat=cat,
        timestamp=timestamp,
        offsets=off,
    )
コード例 #46
0
File: daily.py Project: peijiche/pandas-datareader
    def _read_one_data(self, url, params):
        """ read one data from specified symbol """

        symbol = params['symbol']
        del params['symbol']
        url = url.format(symbol)

        resp = self._get_response(url, params=params)
        ptrn = r'root\.App\.main = (.*?);\n}\(this\)\);'
        try:
            j = json.loads(re.search(ptrn, resp.text, re.DOTALL).group(1))
            data = j['context']['dispatcher']['stores']['HistoricalPriceStore']
        except KeyError:
            msg = 'No data fetched for symbol {} using {}'
            raise RemoteDataError(msg.format(symbol, self.__class__.__name__))

        # price data
        prices = DataFrame(data['prices'])
        prices.columns = [col.capitalize() for col in prices.columns]
        prices['Date'] = to_datetime(
            to_datetime(prices['Date'], unit='s').dt.date)

        if 'Data' in prices.columns:
            prices = prices[prices['Data'].isnull()]
        prices = prices[[
            'Date', 'High', 'Low', 'Open', 'Close', 'Volume', 'Adjclose'
        ]]
        prices = prices.rename(columns={'Adjclose': 'Adj Close'})

        prices = prices.set_index('Date')
        prices = prices.sort_index().dropna(how='all')

        if self.ret_index:
            prices['Ret_Index'] = \
                _calc_return_index(prices['Adj Close'])
        if self.adjust_price:
            prices = _adjust_prices(prices)

        # dividends & splits data
        if self.get_actions and data['eventsData']:

            actions = DataFrame(data['eventsData'])
            actions.columns = [col.capitalize() for col in actions.columns]
            actions['Date'] = to_datetime(
                to_datetime(actions['Date'], unit='s').dt.date)

            types = actions['Type'].unique()
            if 'DIVIDEND' in types:
                divs = actions[actions.Type == 'DIVIDEND'].copy()
                divs = divs[['Date', 'Amount']].reset_index(drop=True)
                divs = divs.set_index('Date')
                divs = divs.rename(columns={'Amount': 'Dividends'})
                prices = prices.join(divs, how='outer')

            if 'SPLIT' in types:
                splits = actions[actions.Type == 'SPLIT'].copy()
                splits['SplitRatio'] = splits.apply(
                    lambda row: eval(row['Splitratio'])
                    if float(row['Numerator']) > 0 else 1,
                    axis=1)
                splits = splits.reset_index(drop=True)
                splits = splits.set_index('Date')
                splits['Splits'] = splits['SplitRatio']
                prices = prices.join(splits['Splits'], how='outer')

                if 'DIVIDEND' in types and self.adjust_dividends:
                    # Adjust dividends to deal with splits
                    adj = prices['Splits'].sort_index(
                        ascending=False).fillna(1).cumprod()
                    prices['Dividends'] = prices['Dividends'] * adj

        return prices
コード例 #47
0
File: test_repr_info.py Project: ywy0318/pandas
    def test_info_memory_usage(self):
        # Ensure memory usage is displayed, when asserted, on the last line
        dtypes = [
            'int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
            'complex128', 'object', 'bool'
        ]
        data = {}
        n = 10
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        buf = StringIO()

        # display memory usage case
        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        assert "memory usage: " in res[-1]

        # do not display memory usage case
        df.info(buf=buf, memory_usage=False)
        res = buf.getvalue().splitlines()
        assert "memory usage: " not in res[-1]

        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()

        # memory usage is a lower bound, so print it as XYZ+ MB
        assert re.match(r"memory usage: [^+]+\+", res[-1])

        df.iloc[:, :5].info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()

        # excluded column with object dtype, so estimate is accurate
        assert not re.match(r"memory usage: [^+]+\+", res[-1])

        # Test a DataFrame with duplicate columns
        dtypes = ['int64', 'int64', 'int64', 'float64']
        data = {}
        n = 100
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        df.columns = dtypes

        df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
        df_with_object_index.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        assert re.match(r"memory usage: [^+]+\+", res[-1])

        df_with_object_index.info(buf=buf, memory_usage='deep')
        res = buf.getvalue().splitlines()
        assert re.match(r"memory usage: [^+]+$", res[-1])

        # Ensure df size is as expected
        # (cols * rows * bytes) + index size
        df_size = df.memory_usage().sum()
        exp_size = len(dtypes) * n * 8 + df.index.nbytes
        assert df_size == exp_size

        # Ensure number of cols in memory_usage is the same as df
        size_df = np.size(df.columns.values) + 1  # index=True; default
        assert size_df == np.size(df.memory_usage())

        # assert deep works only on object
        assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

        # test for validity
        DataFrame(1, index=['a'], columns=['A']).memory_usage(index=True)
        DataFrame(1, index=['a'], columns=['A']).index.nbytes
        df = DataFrame(data=1,
                       index=pd.MultiIndex.from_product([['a'],
                                                         range(1000)]),
                       columns=['A'])
        df.index.nbytes
        df.memory_usage(index=True)
        df.index.values.nbytes

        mem = df.memory_usage(deep=True).sum()
        assert mem > 0
コード例 #48
0
def train_old_net():
    learning_rate = 0.001
    L1_reg = 0.00
    L2_reg = 0.0001
    n_epochs = 100
    batch_size = 20
    n_hidden = 500
    datasets = load_data(path)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x = datasets[2]

    #compute number of minibatches
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    print '...building the model'
    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')
    rng = np.random.RandomState(1234567890)
    #construct the MLP class
    #Attention: this call sets p_drop_perceptron and p_drop_logistic;
    #if no dropout is used, lower the early-stopping threshold
    #(improvement_threshold) defined further down
    classifier = MLP(rng=rng,
                     input=x,
                     n_in=28 * 28,
                     n_hidden=n_hidden,
                     n_out=10,
                     p_drop_perceptron=0,
                     p_drop_logistic=0)

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = classifier.negative_log_likelihood(
        y) + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr

    #compile a Theano function that computes the error rate
    #made on the validation set, per minibatch
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    #symbolically compute the gradient of cost with respect to params;
    #the resulting gradients are stored in the list gparams
    gparams = []
    for param in classifier.params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    #use RMSprop (scale the gradient by a running average of its magnitude)
    #to build the model's parameter updates as a list of (variable, update expression) pairs
    def RMSprop(gparams, params, learning_rate, rho=0.9, epsilon=1e-6):
        """
        param rho: the fraction of the previous gradient accumulator that is kept
        """
        updates = []
        for p, g in zip(params, gparams):
            acc = theano.shared(p.get_value() * 0.)
            acc_new = rho * acc + (1 - rho) * g**2
            gradient_scaling = T.sqrt(acc_new + epsilon)
            g = g / gradient_scaling
            updates.append((acc, acc_new))
            updates.append((p, p - learning_rate * g))
        return updates

    #compile a Theano function 'train_model' that returns the cost
    #and at the same time updates the model parameters according to
    #the rules defined in 'updates'
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=RMSprop(gparams, classifier.params, learning_rate=0.001),
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    ##############
    #Train Model##
    ##############
    print '...training'

    #early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this many times longer when a new best is found
    # a relative improvement of this much on the validation set is
    # considered significant (i.e. not overfitting);
    # if dropout noise has been added, the value can be increased
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience / 2)
    # check on the validation set at this interval to see if the net is overfitting;
    # patience/2 so that we check at least twice before running out of patience,
    # and n_train_batches so that we check at least once every epoch
    best_validation_error_rate = np.inf
    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            #iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                #validation
                validation_error_rate = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_error_rate = np.mean(validation_error_rate)
                print('epoch %i,validation error %f %%' %
                      (epoch, this_validation_error_rate * 100.))

                #if we got the best validation score until now
                if this_validation_error_rate < best_validation_error_rate:
                    #improve the patience if error rate is good enough
                    if this_validation_error_rate < best_validation_error_rate * improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    best_validation_error_rate = this_validation_error_rate
            if patience <= iter:
                done_looping = True
                break

    ###########################################
    # Predict with trained parameters(nonoise)#
    ###########################################
    classifier.p_drop_perceptron = 0
    classifier.p_drop_logistic = 0
    y_x = classifier.predict()
    model_predict = theano.function(
        inputs=[index],
        outputs=y_x,
        givens={x: test_set_x[index * batch_size:(index + 1) * batch_size]})
    digit_preds = Series(
        np.concatenate([model_predict(i) for i in xrange(n_test_batches)]))
    image_ids = Series(np.arange(1, len(digit_preds) + 1))
    submission = DataFrame([image_ids, digit_preds]).T
    submission.columns = ['ImageId', 'Label']
    submission.to_csv(path + 'submission_sample.csv', index=False)
コード例 #49
0
File: window.py Project: wjt/pandas
def dataframe_from_int_dict(data, frame_template):
    result = DataFrame(data, index=frame_template.index)
    if len(result.columns) > 0:
        result.columns = frame_template.columns[result.columns]
    return result
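For illustration only, a minimal usage sketch of this helper (frame_template and data below are made-up values, not taken from pandas itself):

from pandas import DataFrame

frame_template = DataFrame({"x": [1.0, 2.0], "y": [3.0, 4.0]})
data = {0: [0.5, 0.6], 1: [0.7, 0.8]}   # integer keys refer to column positions

result = dataframe_from_int_dict(data, frame_template)
print(result.columns)   # Index(['x', 'y'], dtype='object'), relabelled from the template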
コード例 #50
0
def get_variance_by_person(path_dataset, range_=(1, 11)):
    """Calculate the variance by person."""
    fold_variance = Path(path_dataset) / "variance_person"

    if not check_exist(path_dataset, "variance_person"):
        print("Loading the files to calculate variance.")

        filterwarnings("ignore")

        var_pearson = []

        selected_channels = [
            "time",
            "FP1-F7",
            "F7-T7",
            "T7-P7",
            "P7-O1",
            "FP1-F3",
            "F3-C3",
            "C3-P3",
            "P3-O1",
            "FP2-F4",
            "F4-C4",
            "C4-P4",
            "P4-O2",
            "FP2-F8",
            "F8-T8",
            "T8-P8-0",
            "P8-O2",
            "FZ-CZ",
            "CZ-PZ",
            "P7-T7",
            "T7-FT9",
            "FT9-FT10",
            "FT10-T8",
            "T8-P8-1",
        ]

        for id_patient in tqdm_notebook(range(range_[0], range_[1]),
                                        desc="Patient"):

            accumulate_count = 0
            accumulate_avg = 0
            accumulate_var = 0

            path_files = join(path_dataset,
                              "chb{0:0=2d}/*.edf".format(id_patient))

            files_in_folder = glob(path_files)
            for enum, file in enumerate(
                    tqdm_notebook(files_in_folder, desc="Files", leave=False)):

                variance_file = read_raw_edf(input_fname=file,
                                             verbose=0).to_data_frame(
                                                 picks=["eeg"],
                                                 time_format="ms")

                # Removing channels that are not present in all files.
                variance_file = variance_file[
                    variance_file.columns.intersection(selected_channels)]
                # Sorting the channels
                variance_file.sort_index(axis=1, inplace=True)

                if enum == 0:
                    accumulate_count = len(variance_file)
                    accumulate_avg = variance_file.mean()
                    accumulate_var = variance_file.var()

                else:
                    (
                        accumulate_count,
                        accumulate_avg,
                        accumulate_var,
                    ) = parallel_variance(
                        accumulate_count,
                        accumulate_avg,
                        accumulate_var,
                        len(variance_file),
                        variance_file.mean(),
                        variance_file.var(),
                    )

            var_pearson.append(accumulate_var)

        variance_df = DataFrame(var_pearson).drop("time", axis=1)

        variance = DataFrame([
            file.sort_index().sort_values().index[-1]
            for ind, file in variance_df.iterrows()
        ])

        variance.columns = variance.columns.astype(str)

        variance.to_parquet(fold_variance / "variance_person.parquet",
                            engine="pyarrow")

        return variance
    else:
        print("Reading the variance already calculated.")
        variance = read_parquet(fold_variance / "variance_person.parquet",
                                engine="pyarrow")
        return variance
コード例 #51
0
logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

logreg.score(X_train, Y_train)

random_forest = RandomForestClassifier(n_estimators=100)

random_forest.fit(X_train, Y_train)

Y_pred = random_forest.predict(X_test)

random_forest.score(X_train, Y_train)

coeff_df = DataFrame(titanic_df.columns.delete(0))
coeff_df.columns = ['Features']
coeff_df["Coefficient Estimate"] = pd.Series(logreg.coef_[0])

# preview
coeff_df

コード例 #52
0
File: glofasdata.py Project: lvanbrussel/IBF-system
    def extract(self):
        print('\nExtracting Glofas Data\n')

        files = [
            f for f in listdir(self.inputPath)
            if isfile(join(self.inputPath, f)) and f.endswith('.nc')
        ]

        df_thresholds = DataFrame(self.GLOFAS_STATIONS)
        df_thresholds.columns = self.glofas_cols
        df_thresholds = df_thresholds.set_index("station_code", drop=False)
        df_district_mapping = DataFrame(self.DISTRICT_MAPPING)
        df_district_mapping.columns = self.district_cols
        df_district_mapping = df_district_mapping.set_index(
            "station_code_7day", drop=False)

        stations = []
        trigger_per_day = {
            1: 0,
            2: 0,
            3: 0,
            4: 0,
            5: 0,
            6: 0,
            7: 0,
        }
        for i in range(0, len(files)):
            logging.info("Extracting glofas data from %s", i)
            Filename = os.path.join(self.inputPath, files[i])
            station = {}
            station['code'] = files[i].split('_')[2]

            data = xr.open_dataset(Filename)

            # Get threshold for this specific station
            if station['code'] in df_thresholds['station_code'] and station[
                    'code'] in df_district_mapping['station_code_7day']:
                print(Filename)
                threshold = df_thresholds[df_thresholds.station_code ==
                                          station['code']]['trigger_level'][0]

                # Set dimension-values
                time = 0

                for step in range(1, 8):

                    # Loop through 51 ensembles, get forecast (for 3 or 7 day) and compare to threshold
                    ensemble_options = 51
                    count = 0
                    dis_sum = 0
                    for ensemble in range(0, ensemble_options):

                        discharge = data['dis'].sel(ensemble=ensemble,
                                                    step=step).values[time][0]

                        # DUMMY OVERWRITE DEPENDING ON COUNTRY SETTING
                        if SETTINGS[
                                self.country_code]['dummy_trigger'] == True:
                            if step < 5:
                                discharge = 0
                            elif station[
                                    'code'] == 'G1361':  # ZMB dummy flood station 1
                                discharge = 8000
                            elif station[
                                    'code'] == 'G1328':  # ZMB dummy flood station 2
                                discharge = 9000
                            elif station[
                                    'code'] == 'G5200':  # UGA dummy flood station
                                discharge = 700
                            elif station[
                                    'code'] == 'G1067':  # ETH dummy flood station
                                discharge = 1000
                            elif station[
                                    'code'] == 'G1904':  # ETH dummy flood station
                                discharge = 2000
                            elif station[
                                    'code'] == 'G5194':  # KEN dummy flood station
                                discharge = 2000
                            else:
                                discharge = 0

                        if discharge >= threshold:
                            count = count + 1
                        dis_sum = dis_sum + discharge

                    prob = count / ensemble_options
                    dis_avg = dis_sum / ensemble_options
                    station['fc_' + self.fcStep] = dis_avg
                    station['fc_' + self.fcStep + '_prob'] = prob
                    station['fc_' + self.fcStep +
                            '_trigger'] = 1 if prob > TRIGGER_LEVELS[
                                'minimum'] else 0

                    if station['fc_' + self.fcStep + '_trigger'] == 1:
                        trigger_per_day[step] = 1

                    if step == self.days:
                        stations.append(station)
                    station = {}
                    station['code'] = files[i].split('_')[2]

            data.close()

        # Add 'no_station' and all currently unavailable glofas-stations manually for now
        for station_code in [
                'no_station'
        ]:  #,'F0043','F0044','F0045','F0046','F0047','F0048','F0049','F0050','F0051','F0052','F0053','F0054','F0055','F0056','G5696']:
            station = {}
            station['code'] = station_code
            station['fc_' + self.fcStep] = 0
            station['fc_' + self.fcStep + '_prob'] = 0
            station['fc_' + self.fcStep + '_trigger'] = 0
            stations.append(station)

        with open(self.extractedGlofasPath, 'w') as fp:
            json.dump(stations, fp)
            print('Extracted Glofas data - File saved')

        with open(self.triggerPerDay, 'w') as fp:
            json.dump([trigger_per_day], fp)
            print('Extracted Glofas data - Trigger per day File saved')
コード例 #53
0
    def test_dups_fancy_indexing(self):

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf
        df = mkdf(10, 3)
        df.columns = ['a', 'a', 'b']
        result = df[['b', 'a']].columns
        expected = Index(['b', 'a', 'a'])
        tm.assert_index_equal(result, expected)

        # across dtypes
        df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                       columns=list('aaaaaaa'))
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        result.columns = list('aaaaaaa')

        # TODO(wesm): unused?
        df_v = df.iloc[:, 4]  # noqa
        res_v = result.iloc[:, 4]  # noqa

        tm.assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        df = DataFrame(
            {
                'test': [5, 7, 9, 11],
                'test1': [4., 5, 6, 7],
                'other': list('abcd')
            },
            index=['A', 'A', 'B', 'C'])
        rows = ['C', 'B']
        expected = DataFrame(
            {
                'test': [11, 9],
                'test1': [7., 6],
                'other': ['d', 'c']
            },
            index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        result = df.loc[Index(rows)]
        tm.assert_frame_equal(result, expected)

        rows = ['C', 'B', 'E']
        expected = DataFrame(
            {
                'test': [11, 9, np.nan],
                'test1': [7., 6, np.nan],
                'other': ['d', 'c', np.nan]
            },
            index=rows)

        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # see GH5553, make sure we use the right indexer
        rows = ['F', 'G', 'H', 'C', 'B', 'E']
        expected = DataFrame(
            {
                'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
                'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
                'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan]
            },
            index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # inconsistent returns for unique/duplicate indices when values are
        # missing
        df = DataFrame(np.random.randn(4, 3), index=list('ABCD'))
        expected = df.reindex(['E'])

        dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
        with catch_warnings(record=True):
            result = dfnu.ix[['E']]
        tm.assert_frame_equal(result, expected)

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        df = DataFrame({"A": [0, 1, 2]})
        result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        df = DataFrame({"A": list('abc')})
        result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # non unique with non unique selector
        df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
        expected = DataFrame({'test': [5, 7, 5, 7, np.nan]},
                             index=['A', 'A', 'A', 'A', 'E'])
        result = df.loc[['A', 'A', 'E']]
        tm.assert_frame_equal(result, expected)

        # GH 5835
        # dups on index and missing values
        df = DataFrame(np.random.randn(5, 5),
                       columns=['A', 'B', 'B', 'B', 'A'])

        expected = pd.concat([
            df.loc[:, ['A', 'B']],
            DataFrame(np.nan, columns=['C'], index=df.index)
        ],
                             axis=1)
        result = df.loc[:, ['A', 'B', 'C']]
        tm.assert_frame_equal(result, expected)

        # GH 6504, multi-axis indexing
        df = DataFrame(np.random.randn(9, 2),
                       index=[1, 1, 1, 2, 2, 2, 3, 3, 3],
                       columns=['a', 'b'])

        expected = df.iloc[0:6]
        result = df.loc[[1, 2]]
        tm.assert_frame_equal(result, expected)

        expected = df
        result = df.loc[:, ['a', 'b']]
        tm.assert_frame_equal(result, expected)

        expected = df.iloc[0:6, :]
        result = df.loc[[1, 2], ['a', 'b']]
        tm.assert_frame_equal(result, expected)
コード例 #54
0
    def test_dups_fancy_indexing(self):

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf
        df = mkdf(10, 3)
        df.columns = ['a', 'a', 'b']
        result = df[['b', 'a']].columns
        expected = Index(['b', 'a', 'a'])
        tm.assert_index_equal(result, expected)

        # across dtypes
        df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                       columns=list('aaaaaaa'))
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        result.columns = list('aaaaaaa')

        # TODO(wesm): unused?
        df_v = df.iloc[:, 4]  # noqa
        res_v = result.iloc[:, 4]  # noqa

        tm.assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        df = DataFrame(
            {'test': [5, 7, 9, 11],
             'test1': [4., 5, 6, 7],
             'other': list('abcd')}, index=['A', 'A', 'B', 'C'])
        rows = ['C', 'B']
        expected = DataFrame(
            {'test': [11, 9],
             'test1': [7., 6],
             'other': ['d', 'c']}, index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        result = df.loc[Index(rows)]
        tm.assert_frame_equal(result, expected)

        rows = ['C', 'B', 'E']
        expected = DataFrame(
            {'test': [11, 9, np.nan],
             'test1': [7., 6, np.nan],
             'other': ['d', 'c', np.nan]}, index=rows)

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # see GH5553, make sure we use the right indexer
        rows = ['F', 'G', 'H', 'C', 'B', 'E']
        expected = DataFrame({'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
                              'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
                              'other': [np.nan, np.nan, np.nan,
                                        'd', 'c', np.nan]},
                             index=rows)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # List containing only missing label
        dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
        with pytest.raises(KeyError):
            dfnu.loc[['E']]

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        df = DataFrame({"A": [0, 1, 2]})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        df = DataFrame({"A": list('abc')})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # non unique with non unique selector
        df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
        expected = DataFrame(
            {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E'])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[['A', 'A', 'E']]
        tm.assert_frame_equal(result, expected)
コード例 #55
0
                                        position1_weight=1,position2_weight=0.9,
                                        position3_weight=0.8,
                                        context_word_count_weight=1,topic_weight=1)
    for topic_index in range(0,total_topics_num):  # save this cluster's topic terms and their context keywords in the dataframe
        topic_keywords_num=len(topics_keywords_context_weights[topic_index])
        # If a topic has fewer terms than the display count set earlier, pad it with "None".
        if topic_keywords_num<topic_feature_display_num:
            for add_index in range(0,topic_feature_display_num-topic_keywords_num):
                topics_keywords_context_weights[topic_index].append('None')
                nmf_topic_detail_list[topic_index].append(('None',0))
        cluster_topic_df[str(cluster_index+1)+str(topic_index)]=\
            [word for (word,weight) in nmf_topic_detail_list[topic_index]]
        cluster_topic_with_context[str(cluster_index+1)+str(topic_index)]=\
            topics_keywords_context_weights[topic_index]

#Generate the two-layer column names of the dataframe
clusters_name_columns=[]
topics_name_columns=[]
for cluster_index in range(1,clusters_num+1):
    for topic_index in range(1,total_topics_num+1):
        clusters_name_columns.append(str(cluster_index))
        topics_name_columns.append(chr(64+topic_index))
cluster_topic_with_context.columns=[clusters_name_columns,topics_name_columns]
cluster_topic_with_context.columns.names=['clusters','topics']
cluster_topic_df.columns=[clusters_name_columns,topics_name_columns]
cluster_topic_df.columns.names=['clusters','topics']

cluster_topic_with_context.to_csv('topics_context_'+file_name+'_'+
                                  topic_model_name+'.csv')
cluster_topic_df.to_csv(file_name+'_'+topic_model_name+'.csv')
コード例 #56
0
File: untitled1.py Project: bajaj-ankita/forskdata
import sqlalchemy
from pandas import DataFrame

engine = sqlalchemy.create_engine("mysql://root:@localhost/data science")
query = "select * from job_satisfaction"  #query Database
resoverall = engine.execute(query)  #execute Query
df = DataFrame(resoverall.fetchall())  # put the result set into a DataFrame
df.columns = resoverall.keys()  # set the column names as they are in the database

from pandas import DataFrame
import mysql.connector

# connect to the MySQL server, specifying the database name
conn = mysql.connector.connect(user='******',
                               password='',
                               host='localhost',
                               database='job_satisfaction')

# Creating cursor Object from connection object
cursor = conn.cursor()

query = ("SELECT * FROM job_satisfaction;")  # query Database
cursor.execute(query)  # execute Query
df = DataFrame(cursor.fetchall())  # put the result set into a DataFrame
df.columns = cursor.column_names  # set the column names as they are in the database
コード例 #57
0
File: classifier_dump.py Project: lucashm/tcc-1
import arff
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from pandas import DataFrame
import pickle

data = arff.load(open('./OffComBR3.arff'))
df = DataFrame(data['data'])
df.columns = ['hate', 'sentence']
df['hate'] = df['hate'].apply(lambda x: 1 if x == 'yes' else 0)

X = df['sentence'].tolist()
y = df['hate'].tolist()

cl = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1, 4))),
               ('clf',
                RandomForestClassifier(n_estimators=100,
                                       max_depth=None,
                                       min_samples_leaf=1,
                                       min_samples_split=2,
                                       min_weight_fraction_leaf=0))])

cl.fit(X, y)

cl_filename = 'randomforest.sav'
df_filename = 'data.sav'

with open(cl_filename, 'wb') as f:
    pickle.dump(cl, f)
コード例 #58
0
File: _export.py Project: mariiabilous/besca
def pseudobulk(adata,
             outpath = None,
             column = 'celltype0',
             label  = 'celltype0',
             split_condition  = 'donor',
             todrop =['CELL','input.path','percent_mito','n_counts','n_genes','leiden','celltype0','celltype1','celltype2','celltype3','dblabel'], main_condition='CONDITION'):
    """export pseudobulk profiles of cells to .gct files

    This is a function with which any type of labeling (e.g. celltype annotation, louvain
    clustering, etc.) can be written out to several .gct files as well as a single metadata file.

    To ensure FAIR compatibility, the label and file name should not be changed.

    parameters
    ----------
    adata: `AnnData`
        the AnnData object containing the labeling 
    outpath: `str` | default = current working directory
        filepath to the directory in which the results should be written; if no directory is
        specified, the results are written to the current working directory.
    column: `str` | default = 'celltype0'
        Name of the column in adata.obs that is to be mapped to cell barcodes and written out to file.
    label: `str` | default = 'celltype0'
        label above the column when it is written out to several files
    split_condition: `str` | default = 'donor'
        the experimental unit, e.g. sample ID
    todrop: `list` 
        Several column headers to be excluded from metadata
    main_condition: `str` | default = 'CONDITION'
        main condition to be written to the metadata file
    returns
    -------
    dfmerge: `pd.DataFrame`
        merged dataframe

    """
    if outpath is None:
        outpath = os.getcwd()
    
    data = adata.obs.get(column)
    if data is None:
        sys.exit('please specify a column name that is present in adata.obs')
    
    data = adata.obs.get(column).to_frame(name=label)
        
    data = adata.obs.get(main_condition)
    if data is None:
        sys.exit('please specify a condition name that is present in adata.obs')
    
    ### check if the output directory exists; if not, create it
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    
    ### create adata subsets for each column value
    adata.obs[split_condition]=adata.obs[split_condition].astype('str')
    adata.obs[split_condition]=adata.obs[split_condition].astype('category')
    adata.obs[column]=adata.obs[column].astype('category')
    
    bulks={}
    myset=list(set(adata.obs[column]))
    for i in myset:
        ii=i.replace(" ", "_") ## to avoid spaces in cell names
        bulks[ii]=adata[adata.obs[column].isin([i])].copy()
    bulks['all']=adata.copy()
    
    ### go through each adata subset and export pseudobulk
    dfbulks={}
    for x in bulks.keys():   
        # sum expression
        auxdata=bulks[x].copy()
        myexp=list(auxdata.obs[split_condition].cat.categories) ### these are all different levels for experiments
        mysums=zeros((len(auxdata.raw.var.index),len(myexp)))
        for i in range(len(myexp)):
            mysums[:,i]=expm1(auxdata[auxdata.obs[split_condition]==myexp[i]].raw.X).sum(axis=0)
        mysums=DataFrame(mysums)
        mysums.index=adata.raw.var.index
        mysums.columns=[x+'.'+y for y in myexp]
        dfbulks[x]=mysums
    
        mydat = auxdata.raw.var.loc[:,['SYMBOL', 'ENSEMBL']]
        mydat.rename(columns={'SYMBOL':'Description'}, inplace=True)
        gct = mydat.merge(dfbulks[x], how='right', left_index=True, right_index=True)
        gct.set_index('ENSEMBL', inplace=True)
        gct.index.names = ['NAME']
        gct.columns=['Description']+myexp
    
        #write out average expression
        gctFile_pseudo = outpath+ 'Pseudobulk-'+label+'-'+x+'.gct'
        with open(gctFile_pseudo, "w") as fp:  # the with-block closes the file
            fp.write("#1.2"+"\n")
            fp.write(str(gct.shape[0])+'\t'+str(gct.shape[1] - 1)+'\n') # "description" already merged in as a column
        #...and then the matrix
        gct.to_csv(gctFile_pseudo, sep = '\t', index=True, index_label='NAME', header=True, mode = 'a',  float_format='%.3f')
        print('Pseudobulk-'+label+'-'+x+'.gct exported successfully to file')

    #### Output into single .tsv file
    dfmerge=concat(dfbulks,axis=1)
    dfmerge.columns = dfmerge.columns.droplevel()
    dfmerge.to_csv(outpath+ 'Pseudobulk-'+label+'.tsv',sep='\t',index_label=False)

    ### Export one metadata file
    myexp=list(adata.obs[split_condition].cat.categories) 
    colindex=range(0,len(adata.obs.columns)) ### replace if only a subset of metadata should be used
    mysums=[]
    for i in range(len(myexp)):
        mysums.append(list(adata[adata.obs[split_condition]==myexp[i]].obs.iloc[:,colindex].iloc[0,:]))
    mysums=DataFrame(mysums).transpose()
    mysums.index=adata.obs.iloc[:,colindex].columns
    mysums.columns=myexp
    mysums=mysums.transpose().drop(labels=todrop,axis=1,errors='ignore')
    mysums['ID']=list(mysums.index)
    colorder = ['ID',main_condition] + (mysums.columns.drop(['ID',main_condition]).tolist())
    mysums.loc[:,colorder].to_csv(outpath+ 'Pseudobulk.meta',sep='\t',index=False)

    return dfmerge
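
A minimal usage sketch of the export function above. The function name `export_pseudobulk` is a placeholder (the real name and signature appear earlier in the file), and the input file, the 'celltype' and 'sample' columns and the output directory are illustrative assumptions only:

import scanpy as sc

adata = sc.read_h5ad('mydata.h5ad')                       # hypothetical input file
dfmerge = export_pseudobulk(adata=adata,                  # placeholder name for the function above
                            column='celltype',            # assumed cell-grouping column in adata.obs
                            label='celltype0',
                            split_condition='sample',     # experimental unit, e.g. sample ID
                            todrop=['barcode'],           # metadata columns to leave out
                            main_condition='CONDITION',
                            outpath='./pseudobulk/')      # trailing separator, since paths are concatenated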
Code example #59
0
File: multi_reg.py  Project: hiroshijsc/Udemy2
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_boston

boston = load_boston()  # note: load_boston is deprecated and was removed in scikit-learn 1.2
boston_df = DataFrame(boston.data)
boston_df.columns = boston.feature_names
boston_df['Price'] = boston.target

X_multi = boston_df.drop('Price', axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X_multi, boston_df.Price)
lreg = LinearRegression()
lreg.fit(X_train, Y_train)   # fit the model on the training split
pred_train = lreg.predict(X_train)
pred_test = lreg.predict(X_test)

# residual plot: training residuals in blue, test residuals in red
train = plt.scatter(pred_train, (pred_train - Y_train), c='b', alpha=0.5)
test = plt.scatter(pred_test, (pred_test - Y_test), c='r', alpha=0.5)
plt.hlines(y=0, xmin=1.0, xmax=50)
plt.show()
Code example #60
0
File: flatten.py  Project: dodopizza/superset
def flatten(
    df: pd.DataFrame,
    reset_index: bool = True,
    drop_levels: Union[Sequence[int], Sequence[str]] = (),
) -> pd.DataFrame:
    """
    Convert N-dimensional DataFrame to a flat DataFrame

    :param df: N-dimensional DataFrame.
    :param reset_index: Convert the index into a column when df.index isn't a RangeIndex
    :param drop_levels: level indexes or level names to drop from the columns
                        when df has MultiIndex columns
    :return: a flat DataFrame

    Examples
    --------

    Convert DatetimeIndex into columns.

    >>> index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03",])
    >>> index.name = "__timestamp"
    >>> df = pd.DataFrame(index=index, data={"metric": [1, 2, 3]})
    >>> df
                 metric
    __timestamp
    2021-01-01        1
    2021-01-02        2
    2021-01-03        3
    >>> df = flatten(df)
    >>> df
      __timestamp  metric
    0  2021-01-01       1
    1  2021-01-02       2
    2  2021-01-03       3

    Convert a DatetimeIndex and MultiIndex columns into flat columns

    >>> iterables = [["foo", "bar"], ["one", "two"]]
    >>> columns = pd.MultiIndex.from_product(iterables, names=["level1", "level2"])
    >>> df = pd.DataFrame(index=index, columns=columns, data=1)
    >>> df
    level1      foo     bar
    level2      one two one two
    __timestamp
    2021-01-01    1   1   1   1
    2021-01-02    1   1   1   1
    2021-01-03    1   1   1   1
    >>> flatten(df)
      __timestamp foo, one foo, two bar, one bar, two
    0  2021-01-01        1        1        1        1
    1  2021-01-02        1        1        1        1
    2  2021-01-03        1        1        1        1
    """
    if _is_multi_index_on_columns(df):
        df.columns = df.columns.droplevel(drop_levels)
        _columns = []
        for series in df.columns.to_flat_index():
            _cells = []
            for cell in series if is_sequence(series) else [series]:
                if pd.notnull(cell):
                    # every cell should be converted to string
                    _cells.append(str(cell))
            _columns.append(FLAT_COLUMN_SEPARATOR.join(_cells))

        df.columns = _columns

    if reset_index and not isinstance(df.index, pd.RangeIndex):
        df = df.reset_index(level=0)
    return df
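
A short, hedged usage sketch for flatten(). It assumes the snippet runs in the same module as the function above (so the helpers it relies on, `_is_multi_index_on_columns`, `is_sequence` and `FLAT_COLUMN_SEPARATOR`, resolve) and that `FLAT_COLUMN_SEPARATOR` is the ", " visible in the doctest output:

import pandas as pd

# rebuild the MultiIndex-columns frame from the second doctest
index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])
index.name = "__timestamp"
columns = pd.MultiIndex.from_product([["foo", "bar"], ["one", "two"]],
                                     names=["level1", "level2"])
df = pd.DataFrame(index=index, columns=columns, data=1)

# keep the DatetimeIndex as the index instead of turning it into a column
flat = flatten(df, reset_index=False)
print(flat.columns.tolist())  # expected: ['foo, one', 'foo, two', 'bar, one', 'bar, two']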