Example #1
    def test_pivot_index_none(self):
        # gh-3962
        data = {
            'index': ['A', 'B', 'C', 'C', 'B', 'A'],
            'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
            'values': [1., 2., 3., 3., 2., 1.]
        }

        frame = DataFrame(data).set_index('index')
        result = frame.pivot(columns='columns', values='values')
        expected = DataFrame({
            'One': {'A': 1., 'B': 2., 'C': 3.},
            'Two': {'A': 1., 'B': 2., 'C': 3.}
        })

        expected.index.name, expected.columns.name = 'index', 'columns'
        assert_frame_equal(result, expected)

        # omit values
        result = frame.pivot(columns='columns')

        expected.columns = pd.MultiIndex.from_tuples([('values', 'One'),
                                                      ('values', 'Two')],
                                                     names=[None, 'columns'])
        expected.index.name = 'index'
        tm.assert_frame_equal(result, expected, check_names=False)
        assert result.index.name == 'index'
        assert result.columns.names == (None, 'columns')
        expected.columns = expected.columns.droplevel(0)
        result = frame.pivot(columns='columns', values='values')

        expected.columns.name = 'columns'
        tm.assert_frame_equal(result, expected)
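
A minimal standalone sketch (synthetic data, pandas only) of the behavior this test exercises: when pivot is called without an index argument, the frame's existing index is reused as the row labels.

import pandas as pd

frame = pd.DataFrame(
    {'columns': ['One', 'One', 'Two', 'Two'], 'values': [1., 2., 2., 1.]},
    index=pd.Index(['A', 'B', 'A', 'B'], name='index'),
)
# index=None, so the current index supplies the rows
print(frame.pivot(columns='columns', values='values'))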
Example #2
def datatype_records_to_subset_and_migrate(likechars):
    stmt_for_pkeys = conn_popler_2.execute(
        select(
            from_obj=Maintable,
            columns=[
                column('lter_proj_site'),
                column('samplingprotocol')
            ]).
        where(
            column('samplingprotocol').like(
                '%{}%'.format(likechars))
        )
    )
    data = DataFrame(stmt_for_pkeys.fetchall())
    data.columns = stmt_for_pkeys.keys()

    records_to_get = data['lter_proj_site'].values.tolist()

    stmt_for_records = conn_popler_2.execute(
        select(
            from_obj=Rawtable,
        ).
        where(column('lter_proj_site').in_(records_to_get)).
        order_by('sampleid')
    )
    data2 = DataFrame(stmt_for_records.fetchall())
    data2.columns = stmt_for_records.keys()
    data2.drop('individ', axis=1, inplace=True)
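
The key pattern above is assigning the statement's result keys as the DataFrame's column labels. A minimal sketch of that step using the standard-library sqlite3 module in place of the SQLAlchemy connection (table name and values are hypothetical):

import sqlite3
from pandas import DataFrame

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE maintable (lter_proj_site INTEGER, samplingprotocol TEXT)')
conn.execute("INSERT INTO maintable VALUES (1, 'plot survey')")
cur = conn.execute('SELECT * FROM maintable')
data = DataFrame(cur.fetchall())
data.columns = [d[0] for d in cur.description]  # column names straight from the cursor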
Example #3
def retrieve_from_db_usa():
    """imports model, pulls mwh production data from db, and places into pandas df.
    Also pulls state for each plant_name, and places into dict."""

    # add parent directory to the path, so can import model.py
    #  need model in order to update the database when this task is activated by cron
    import os
    parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    os.sys.path.insert(0,parentdir)

    import model
    s = model.connect()

    # retrieve DECEMBER 2013 production data, for all turbines at all power plants in the USA
    USA_gen_dec13_obj = s.execute('SELECT plant_name, state, fuel_type, dec_mwh_gen FROM "ProdGensDec2013" ')
    USA_gen_dec13_data = USA_gen_dec13_obj.fetchall()
    df_dec2013 = DataFrame(USA_gen_dec13_data)
    df_dec2013.columns = ['plant_name', 'state', 'fuel_type', 'dec_mwh_gen']

    # retrieve JAN-NOV 2014 production data, for all turbines at all power plants in the USA
    USA_gen_2014_obj = s.execute('SELECT plant_name, state, fuel_type, jan_mwh_gen, feb_mwh_gen, mar_mwh_gen, apr_mwh_gen, may_mwh_gen, jun_mwh_gen, jul_mwh_gen, aug_mwh_gen, sep_mwh_gen, oct_mwh_gen, nov_mwh_gen FROM "ProdGens" ')
    USA_gen_2014_data = USA_gen_2014_obj.fetchall()
    df_2014 = DataFrame(USA_gen_2014_data)
    df_2014.columns = ['plant_name', 'state', 'fuel_type', 'jan_mwh_gen', 'feb_mwh_gen', 'mar_mwh_gen', 'apr_mwh_gen', 'may_mwh_gen', 'jun_mwh_gen', 'jul_mwh_gen', 'aug_mwh_gen', 'sep_mwh_gen', 'oct_mwh_gen', 'nov_mwh_gen']

    return df_dec2013, df_2014
Example #4
    def clustering(self, X, NUM_CLUSTERS, MINIBATCH):
        '''
        Cluster the data into groups with k-means
        '''
        
        if MINIBATCH:
            km = MiniBatchKMeans(n_clusters = NUM_CLUSTERS,
                                 init='k-means++', batch_size=1000,
                                 n_init=10, max_no_improvement=10)
        else:
            km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1)
        
        km.fit(X)
        transformed = km.transform(X)  # distance from each item to every cluster center
        labels = km.labels_
        
        dists = []
        for i in range(len(labels)):
            dists.append(transformed[i, labels[i]])  # distance to the center of the item's own cluster

        labels = DataFrame(labels)
        dists = DataFrame(dists)
        labels.columns = ['label']
        dists.columns = ['dists']
        self.data = pd.concat([labels, dists, self.data], axis=1)  # prepend the labels to the original data
        
        return km
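
A usage sketch of the distance bookkeeping above (random data, scikit-learn assumed): transform() returns each sample's distance to every cluster center, so indexing it with labels_ picks out the distance to the sample's own cluster.

import numpy as np
from sklearn.cluster import KMeans

X = np.random.rand(100, 5)
km = KMeans(n_clusters=3, init='k-means++', n_init=1).fit(X)
# row i, column labels_[i]: distance from sample i to its assigned center
dists = km.transform(X)[np.arange(len(X)), km.labels_]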
Example #5
    def test_pivot_index_none(self):
        # gh-3962
        data = {
            "index": ["A", "B", "C", "C", "B", "A"],
            "columns": ["One", "One", "One", "Two", "Two", "Two"],
            "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
        }

        frame = DataFrame(data).set_index("index")
        result = frame.pivot(columns="columns", values="values")
        expected = DataFrame({"One": {"A": 1.0, "B": 2.0, "C": 3.0}, "Two": {"A": 1.0, "B": 2.0, "C": 3.0}})

        expected.index.name, expected.columns.name = "index", "columns"
        assert_frame_equal(result, expected)

        # omit values
        result = frame.pivot(columns="columns")

        expected.columns = pd.MultiIndex.from_tuples([("values", "One"), ("values", "Two")], names=[None, "columns"])
        expected.index.name = "index"
        assert_frame_equal(result, expected, check_names=False)
        self.assertEqual(result.index.name, "index")
        self.assertEqual(result.columns.names, (None, "columns"))
        expected.columns = expected.columns.droplevel(0)

        result = frame.pivot(columns="columns", values="values")

        expected.columns.name = "columns"
        assert_frame_equal(result, expected)
Example #6
    def test_unstack_fill_frame(self):

        # From a dataframe
        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
        df = DataFrame(rows, columns=list("AB"), dtype=np.int32)
        df.index = MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")])

        result = df.unstack(fill_value=-1)

        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
        expected = DataFrame(rows, index=list("xyz"), dtype=np.int32)
        expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")])
        assert_frame_equal(result, expected)

        # From a mixed type dataframe
        df["A"] = df["A"].astype(np.int16)
        df["B"] = df["B"].astype(np.float64)

        result = df.unstack(fill_value=-1)
        expected["A"] = expected["A"].astype(np.int16)
        expected["B"] = expected["B"].astype(np.float64)
        assert_frame_equal(result, expected)

        # From a dataframe with incorrect data type for fill_value
        result = df.unstack(fill_value=0.5)

        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
        expected = DataFrame(rows, index=list("xyz"), dtype=np.float64)
        expected.columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")])
        assert_frame_equal(result, expected)
Example #7
    def test_unstack_fill_frame(self):

        # From a dataframe
        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
        df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
        df.index = MultiIndex.from_tuples(
            [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

        result = df.unstack(fill_value=-1)

        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
        expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
        expected.columns = MultiIndex.from_tuples(
            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
        assert_frame_equal(result, expected)

        # From a mixed type dataframe
        df['A'] = df['A'].astype(np.int16)
        df['B'] = df['B'].astype(np.float64)

        result = df.unstack(fill_value=-1)
        expected['A'] = expected['A'].astype(np.int16)
        expected['B'] = expected['B'].astype(np.float64)
        assert_frame_equal(result, expected)

        # From a dataframe with incorrect data type for fill_value
        result = df.unstack(fill_value=0.5)

        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
        expected = DataFrame(rows, index=list('xyz'), dtype=np.float64)
        expected.columns = MultiIndex.from_tuples(
            [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
        assert_frame_equal(result, expected)
Example #8
 def save_to_file(self, fn):
     gg = DataFrame(self.power_series_apps_table)
     try:
         del gg['diff1']
         del gg['diff2']
     except KeyError:
         pass  # the diff columns may not be present
         
     gg['Loc Events'] = self.loc.events_apps_1min['Apps']
     apps = self.loc.metadata.get_channels()
     sd = {}
     #Initialize series with 0s
     for app in apps:
         sd[app] = Series(0, index=gg.index)
         
     #Count location events for each appliance
     for index, row in gg.iterrows():
         try:
             if len(row['Loc Events']) > 0:
                 for app in apps:
                     n = row['Loc Events'].count(app)
                     sd[app][index] = n
         except Exception:
             continue
     
     if self.loc.name == 'REDD':
         sd[(3,4)] = sd[3]
         sd[(10,20)] = sd[10]
         del sd[3]
         del sd[4]
         del sd[10]
         del sd[20]
       
     #Change column names and append them to the general table
     locevents = DataFrame(sd)
     locevents.columns = [(str(col) + ' locEv') for col in locevents]        
     for locEv in locevents:
         gg[locEv] = locevents[locEv]
         
     
     #Get power values of each appliance and resample for 1min
     act = DataFrame(self.loc.appliances_consuming_times)
     act = act.resample('1Min')
            
     if self.loc.name == 'REDD':
         del act[3]
         del act[10]
         act.columns = [(3,4), 5,6,7,8,9,11,12,13,14,15,16,17,18,19,(10,20)]
     act.columns = [(str(col) + ' conEv') for col in act]
     
     for app in act:
         gg[app] = act[app]        
     gg.columns = [str(col) for col in gg]
     gg = gg[sorted(gg.columns)]
     gg.to_csv(fn)   
     return
Example #9
def _series_add_constant(data, prepend):
    # np.ones_like on a modern pandas Series returns a plain ndarray, which
    # has no .name attribute; build the constant as a named Series instead,
    # aligned on the input's index
    const = Series(np.ones(len(data)), index=data.index, name='const')
    if not prepend:
        results = DataFrame([data, const]).T
        results.columns = [data.name, 'const']
    else:
        results = DataFrame([const, data]).T
        results.columns = ['const', data.name]
    return results
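
A usage sketch for the helper above (synthetic Series, assuming the Series-based constant from the fix):

from pandas import Series

y = Series([1., 2., 3.], name='y')
print(_series_add_constant(y, prepend=True))
# columns come out as ['const', 'y'], with a 1.0 constant in every row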
Example #10
    def test_conversion_multiindex(self):
        d = {'comp_str': ["Fe2", "MnO2"]}

        df_1lvl = DataFrame(data=d)

        df_1lvl = StrToComposition().featurize_dataframe(
            df_1lvl, 'comp_str', multiindex=True)
        self.assertEqual(df_1lvl[("StrToComposition", "composition")].tolist(),
                         [Composition("Fe2"), Composition("MnO2")])

        df_2lvl = DataFrame(data=d)
        df_2lvl.columns = MultiIndex.from_product((["custom"],
                                                   df_2lvl.columns.values))

        df_2lvl = StrToComposition().featurize_dataframe(
            df_2lvl, ("custom", "comp_str"), multiindex=True)
        self.assertEqual(df_2lvl[("StrToComposition", "composition")].tolist(),
                         [Composition("Fe2"), Composition("MnO2")])

        df_2lvl = DataFrame(data=d)
        df_2lvl.columns = MultiIndex.from_product((["custom"],
                                                   df_2lvl.columns.values))

        sto = StrToComposition(target_col_id='test')
        df_2lvl = sto.featurize_dataframe(
            df_2lvl, ("custom", "comp_str"), multiindex=True)
        self.assertEqual(df_2lvl[("StrToComposition", "test")].tolist(),
                         [Composition("Fe2"), Composition("MnO2")])

        # if two level multiindex provided as target, it should be written there
        # here we test converting multiindex in place
        df_2lvl = DataFrame(data=d)
        df_2lvl.columns = MultiIndex.from_product((["custom"],
                                                   df_2lvl.columns.values))

        sto = StrToComposition(target_col_id=None, overwrite_data=True)

        df_2lvl = sto.featurize_dataframe(
            df_2lvl, ("custom", "comp_str"), multiindex=True, inplace=False)
        self.assertEqual(df_2lvl[("custom", "comp_str")].tolist(),
                         [Composition("Fe2"), Composition("MnO2")])

        # Try inplace multiindex conversion with return errors
        df_2lvl = DataFrame(data=d)
        df_2lvl.columns = MultiIndex.from_product((["custom"],
                                                   df_2lvl.columns.values))

        sto = StrToComposition(target_col_id=None, overwrite_data=True)
        df_2lvl = sto.featurize_dataframe(
            df_2lvl, ("custom", "comp_str"), multiindex=True,
            return_errors=True, ignore_errors=True)

        self.assertTrue(
            all(df_2lvl[("custom", "StrToComposition Exceptions")].isnull()))
Example #11
    def test_to_csv_dups_cols(self):

        df = DataFrame(np.random.randn(1000, 30), columns=lrange(
            15) + lrange(15), dtype='float64')

        with ensure_clean() as filename:
            df.to_csv(filename)  # single dtype, fine
            result = read_csv(filename, index_col=0)
            result.columns = df.columns
            assert_frame_equal(result, df)

        df_float = DataFrame(np.random.randn(1000, 3), dtype='float64')
        df_int = DataFrame(np.random.randn(1000, 3), dtype='int64')
        df_bool = DataFrame(True, index=df_float.index, columns=lrange(3))
        df_object = DataFrame('foo', index=df_float.index, columns=lrange(3))
        df_dt = DataFrame(Timestamp('20010101'),
                          index=df_float.index, columns=lrange(3))
        df = pd.concat([df_float, df_int, df_bool, df_object,
                        df_dt], axis=1, ignore_index=True)

        cols = []
        for i in range(5):
            cols.extend([0, 1, 2])
        df.columns = cols

        from pandas import to_datetime
        with ensure_clean() as filename:
            df.to_csv(filename)
            result = read_csv(filename, index_col=0)

            # date cols
            for i in ['0.4', '1.4', '2.4']:
                result[i] = to_datetime(result[i])

            result.columns = df.columns
            assert_frame_equal(result, df)

        # GH3457
        from pandas.util.testing import makeCustomDataframe as mkdf

        N = 10
        df = mkdf(N, 3)
        df.columns = ['a', 'a', 'b']

        with ensure_clean() as filename:
            df.to_csv(filename)

            # read_csv will rename the dups columns
            result = read_csv(filename, index_col=0)
            result = result.rename(columns={'a.1': 'a'})
            assert_frame_equal(result, df)
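
A minimal sketch of the duplicate-column renaming the last block relies on: read_csv disambiguates repeated headers by suffixing them, which is why the test renames 'a.1' back to 'a'.

import io
import pandas as pd

csv = io.StringIO('a,a,b\n1,2,3\n')
print(pd.read_csv(csv).columns.tolist())  # ['a', 'a.1', 'b']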
Example #12
def compute_one(t, df, **kwargs):
    if t.grouper.iscolumn:
        grouper = compute(t.grouper, {t.child: df}) # a Series
    elif isinstance(t.grouper, Projection) and t.grouper.child is t.child:
        grouper = t.grouper.columns  # list of column names

    if isinstance(t.apply, Summary):
        names = t.apply.names
        preapply = DataFrame(dict(zip(
            names,
            [compute(v.child, {t.child: df}) for v in t.apply.values])))

        df2 = concat_nodup(df, preapply)

        groups = df2.groupby(grouper)

        d = defaultdict(list)
        for name, v in zip(names, t.apply.values):
            d[name].append(getattr(Series, v.symbol))

        result = groups.agg(dict(d))

        # Rearrange columns to match names order
        result = result[sorted(list(result.columns),
                               key=lambda t: names.index(t[0]))]
        result.columns = t.apply.names  # flatten down multiindex

    if isinstance(t.apply, Reduction):
        names = t.apply.dshape[0].names
        preapply = compute(t.apply.child, {t.child: df})
        # Pandas and Blaze column naming schemes differ
        # Coerce DataFrame column names to match Blaze's names
        preapply = preapply.copy()
        if isinstance(preapply, Series):
            preapply.name = names[0]
        else:
            preapply.columns = names

        df2 = concat_nodup(df, preapply)

        if t.apply.child.iscolumn:
            groups = df2.groupby(grouper)[names[0]]
        else:
            groups = df2.groupby(grouper)[names]

        result = compute_one(t.apply, groups) # do reduction

    result = DataFrame(result).reset_index()
    result.columns = t.columns
    return result
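
A minimal sketch (toy frame, pandas only) of the column flattening done in the Summary branch: aggregating with a dict of lists yields MultiIndex columns, which are collapsed by assigning a flat list.

import pandas as pd

df = pd.DataFrame({'g': ['a', 'a', 'b'], 'x': [1, 2, 3]})
result = df.groupby('g').agg({'x': ['sum']})  # columns: MultiIndex [('x', 'sum')]
result.columns = ['x_sum']  # flatten down the multiindex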
Example #13
    def deserialize(self, item, force_bytes_to_unicode=False):
        index = self._index_from_records(item)
        column_fields = [x for x in item.dtype.names if x not in item.dtype.metadata['index']]
        multi_column = item.dtype.metadata.get('multi_column')
        if len(item) == 0:
            rdata = item[column_fields] if len(column_fields) > 0 else None
            if multi_column is not None:
                columns = MultiIndex.from_arrays(multi_column["values"], names=multi_column["names"])
                return DataFrame(rdata, index=index, columns=columns)
            else:
                return DataFrame(rdata, index=index)

        columns = item.dtype.metadata['columns']
        df = DataFrame(data=item[column_fields], index=index, columns=columns)

        if multi_column is not None:
            df.columns = MultiIndex.from_arrays(multi_column["values"], names=multi_column["names"])

        if force_bytes_to_unicode:
            # This is needed because a py2 'str', when read back in py3, comes out as 'bytes', which
            # breaks the workflow of people migrating to py3. https://github.com/manahl/arctic/issues/598
            # This should not be used in a normal flow; instead, write unicode strings
            # if you want to work with str in py3.

            for c in df.select_dtypes(object):
                # The conversion is not using astype similar to the index as pandas has a bug where it tries to convert
                # the data columns to a unicode string, and the object in this case would be bytes, eg. b'abc'
                # which is converted to u"b'abc'" i.e it includes the b character as well! This generally happens
                # when there is a str conversion without specifying the encoding. eg. str(b'abc') -> "b'abc'" and the
                # fix for this is to tell it the encoding to use: i.e str(b'abc', 'utf-8') -> "abc"
                if type(df[c].iloc[0]) == bytes:
                    df[c] = df[c].str.decode('utf-8')

            if isinstance(df.index, MultiIndex):
                unicode_indexes = []
                # MultiIndex requires a conversion at each level.
                for level in range(len(df.index.levels)):
                    _index = df.index.get_level_values(level)
                    if isinstance(_index[0], bytes):
                        _index = _index.astype('unicode')
                    unicode_indexes.append(_index)
                df.index = unicode_indexes
            else:
                if type(df.index[0]) == bytes:
                    df.index = df.index.astype('unicode')

            if type(df.columns[0]) == bytes:
                df.columns = df.columns.astype('unicode')

        return df
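
A minimal sketch (synthetic bytes data) of the decode logic above: str() on bytes embeds the b prefix, so values and labels are decoded explicitly instead.

import pandas as pd

df = pd.DataFrame({b'col': [b'abc', b'def']})
df[b'col'] = df[b'col'].str.decode('utf-8')  # b'abc' -> 'abc', not "b'abc'"
df.columns = [c.decode('utf-8') if isinstance(c, bytes) else c for c in df.columns]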
Example #14
    def test_set_value_by_index(self):
        # See gh-12344
        df = DataFrame(np.arange(9).reshape(3, 3).T)
        df.columns = list('AAA')
        expected = df.iloc[:, 2]

        df.iloc[:, 0] = 3
        assert_series_equal(df.iloc[:, 2], expected)

        df = DataFrame(np.arange(9).reshape(3, 3).T)
        df.columns = [2, float(2), str(2)]
        expected = df.iloc[:, 1]

        df.iloc[:, 0] = 3
        assert_series_equal(df.iloc[:, 1], expected)
Example #15
    def test_at_to_fail(self):
        # at should not fallback
        # GH 7814
        s = Series([1, 2, 3], index=list('abc'))
        result = s.at['a']
        assert result == 1
        pytest.raises(ValueError, lambda: s.at[0])

        df = DataFrame({'A': [1, 2, 3]}, index=list('abc'))
        result = df.at['a', 'A']
        assert result == 1
        pytest.raises(ValueError, lambda: df.at['a', 0])

        s = Series([1, 2, 3], index=[3, 2, 1])
        result = s.at[1]
        assert result == 3
        pytest.raises(ValueError, lambda: s.at['a'])

        df = DataFrame({0: [1, 2, 3]}, index=[3, 2, 1])
        result = df.at[1, 0]
        assert result == 3
        pytest.raises(ValueError, lambda: df.at['a', 0])

        # GH 13822, incorrect error string with non-unique columns when missing
        # column is accessed
        df = DataFrame({'x': [1.], 'y': [2.], 'z': [3.]})
        df.columns = ['x', 'x', 'z']

        # Check that we get the correct value in the KeyError
        with pytest.raises(KeyError, match=r"\['y'\] not in index"):
            df[['x', 'y', 'z']]
Example #16
def prepare_dataset(filename):
    """
    csvファイルを読み込んで、扱える形のデータフレームに整形する関数
    秒以上の列とms以下の列が分離しているために、前処理が必要

    Argument
    filename->str:          ファイル名のパス

    return
    Data->pandas.DataFrame: 日時と温度を保持するデータフレーム
    """
    #load dataset
    data = pd.read_csv(filename, skiprows=17, encoding="shift-jis")
    data.columns=(["No", "Time", "ms", "Temp", "1", "A12345678", "A1234", "A1"])

    #reshape dataset
    index = np.arange(0,len(data))
    #data["Time"](yy/MM/DD hh:mm:ss)->hh:mm:ss
    #hh:mm:ss + ms->datetime
    date = [str(data["Time"][i]).split(" ") for i in index]
    date = [date[i][1] + str(":") + str(data["ms"][i]*10**3) for i in index]
    date = [date[i].split(":") for i in index]
    date = [datetime.time(int(date[i][0]), int(date[i][1]), int(date[i][2]), int(date[i][3])) for i in index]

    #make dataset
    Data = DataFrame(np.c_[date, data["Temp"]])
    Data.columns = ["date", "temperature"]
    return Data
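
A one-step sketch of the ms-to-microsecond conversion above (values are hypothetical): datetime.time takes microseconds as its fourth argument, so the ms column is scaled by 10**3.

import datetime

h, m, s = '12:34:56'.split(':')
t = datetime.time(int(h), int(m), int(s), 78 * 10**3)  # 78 ms -> 78000 us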
Example #17
def create_df(db='parking.min.db', save_as='parking.df.pickle'):
    conn = sqlite3.connect(db)
    rows = conn.execute('''select updated, park_id, free_places
                        from parking_min''').fetchall()
    ids = list(set([t[1] for t in rows]))
    data = {}
    for x in ids:
        dates = [np.datetime64(r[0], 's')
                 for r in rows if r[1] == x]   # updated
        y = [r[2] for r in rows if r[1] == x]  # free_places (target)
        data[x] = Series(y, index=dates)

    # convert data to DataFrame
    df = DataFrame(data)
    # get the names
    nr = conn.execute('''SELECT DISTINCT name
                      FROM parking ORDER BY park_id''').fetchall()
    # replace non-ascii chars (decode back to str so the names are usable in py3)
    names = [unicodedata.normalize('NFKD', x[0]).encode('ascii', 'ignore').decode('ascii')
             for x in nr]
    # remove dots
    names = [x.replace('.', '') for x in names]
    # assign to columns
    df.columns = names

    # drop rows where all values are NaN
    df = df[pd.notnull(df).any(axis=1)]

    # save
    if save_as is not None:
        df.to_pickle(save_as)

    return df
Example #18
def query_CAISODemand_hrly_Series():
    """specifically gets demand data"""

    import os
    parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    os.sys.path.insert(0,parentdir)

    import model
    s = model.connect()

    demand_obj = s.execute('SELECT time_start, mw_demand FROM "HistoricCAISODemands" WHERE caiso_tac=\'CA ISO-TAC\' and time_start between \'2014-01-01 07:00:00.000000\' and \'2015-01-01 00:00:00.000000\' ')
    demand_entry = demand_obj.fetchall()
    demand_df = DataFrame(demand_entry)
    demand_df.columns = ['time_start','mw_demand']

    dict_with_datetime_keys = { }

    for idx,row in enumerate(demand_df.values):
        time_start = row[0]

        # check date, since logs show we're missing a few
        if check_if_bad_date(time_start)!=True:

            # turn dict into a series.  will auto-index on dict keys
            mw_demand = row[1]
            dict_with_datetime_keys[time_start] = mw_demand

    return Series(dict_with_datetime_keys)
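
A minimal sketch (synthetic values) of the final dict-to-Series step: the dict keys become the index automatically.

from datetime import datetime
from pandas import Series

d = {datetime(2014, 1, 1, 7): 21500.0, datetime(2014, 1, 1, 8): 22100.0}
s = Series(d)  # auto-indexed on the dict keys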
Example #19
    def test_column_dups2(self):

        # drop buggy GH 6240
        df = DataFrame({'A': np.random.randn(5),
                        'B': np.random.randn(5),
                        'C': np.random.randn(5),
                        'D': ['a', 'b', 'c', 'd', 'e']})

        expected = df.take([0, 1, 1], axis=1)
        df2 = df.take([2, 0, 1, 2, 1], axis=1)
        result = df2.drop('C', axis=1)
        assert_frame_equal(result, expected)

        # dropna
        df = DataFrame({'A': np.random.randn(5),
                        'B': np.random.randn(5),
                        'C': np.random.randn(5),
                        'D': ['a', 'b', 'c', 'd', 'e']})
        df.iloc[2, [0, 1, 2]] = np.nan
        df.iloc[0, 0] = np.nan
        df.iloc[1, 1] = np.nan
        df.iloc[:, 3] = np.nan
        expected = df.dropna(subset=['A', 'B', 'C'], how='all')
        expected.columns = ['A', 'A', 'B', 'C']

        df.columns = ['A', 'A', 'B', 'C']

        result = df.dropna(subset=['A', 'C'], how='all')
        assert_frame_equal(result, expected)
Example #20
def query_CAISONetImports_hrly_Series():
    """specifically gets import data"""

    import os
    parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    os.sys.path.insert(0,parentdir)

    import model
    s = model.connect()

    imports_obj = s.execute('SELECT time_start, sum(mw_imports) FROM "HistoricCAISONetImports" where time_start between \'2014-01-01 07:00:00.000000\' and \'2015-01-01 00:00:00.000000\' GROUP BY time_start ')
    imports_entry = imports_obj.fetchall()
    imports_df = DataFrame(imports_entry)
    imports_df.columns = ['time_start','mw_imports']

    dict_with_datetime_keys = { }

    for idx,row in enumerate(imports_df.values):
        time_start = row[0]

        # check date, since logs show we're missing a few
        if check_if_bad_date(time_start)!=True:

            # turn dict into a series.  will auto-index on dict keys
            mw_imports = row[1]
            dict_with_datetime_keys[time_start] = mw_imports

    return Series(dict_with_datetime_keys)
Example #21
    def test_blocks_compat_GH9037(self):
        index = pd.date_range('20000101', periods=10, freq='H')
        df_mixed = DataFrame(OrderedDict(
            float_1=[-0.92077639, 0.77434435, 1.25234727, 0.61485564,
                     -0.60316077, 0.24653374, 0.28668979, -2.51969012,
                     0.95748401, -1.02970536],
            int_1=[19680418, 75337055, 99973684, 65103179, 79373900,
                   40314334, 21290235,  4991321, 41903419, 16008365],
            str_1=['78c608f1', '64a99743', '13d2ff52', 'ca7f4af2', '97236474',
                   'bde7e214', '1a6bde47', 'b1190be5', '7a669144', '8d64d068'],
            float_2=[-0.0428278, -1.80872357,  3.36042349, -0.7573685,
                     -0.48217572, 0.86229683, 1.08935819, 0.93898739,
                     -0.03030452, 1.43366348],
            str_2=['14f04af9', 'd085da90', '4bcfac83', '81504caf', '2ffef4a9',
                   '08e2f5c4', '07e1af03', 'addbd4a7', '1f6a09ba', '4bfc4d87'],
            int_2=[86967717, 98098830, 51927505, 20372254, 12601730, 20884027,
                   34193846, 10561746, 24867120, 76131025]
        ), index=index)

        # JSON deserialisation always creates unicode strings
        df_mixed.columns = df_mixed.columns.astype('unicode')

        df_roundtrip = pd.read_json(df_mixed.to_json(orient='split'),
                                    orient='split')
        assert_frame_equal(df_mixed, df_roundtrip,
                           check_index_type=True,
                           check_column_type=True,
                           check_frame_type=True,
                           by_blocks=True,
                           check_exact=True)
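
A minimal round-trip sketch (toy frame) of the orient='split' serialisation being tested; newer pandas may prefer a buffer over a raw JSON string:

import pandas as pd

df = pd.DataFrame({'a': [1, 2]}, index=pd.date_range('20000101', periods=2, freq='H'))
roundtrip = pd.read_json(df.to_json(orient='split'), orient='split')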
Example #22
    def test_iloc_setitem_dups(self):

        # GH 6766
        # iloc with a mask aligning from another iloc
        df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}])
        df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}])
        df = concat([df1, df2], axis=1)

        expected = df.fillna(3)
        expected['A'] = expected['A'].astype('float64')
        inds = np.isnan(df.iloc[:, 0])
        mask = inds[inds].index
        df.iloc[mask, 0] = df.iloc[mask, 2]
        tm.assert_frame_equal(df, expected)

        # del a dup column across blocks
        expected = DataFrame({0: [1, 2], 1: [3, 4]})
        expected.columns = ['B', 'B']
        del df['A']
        tm.assert_frame_equal(df, expected)

        # assign back to self
        df.iloc[[0, 1], [0, 1]] = df.iloc[[0, 1], [0, 1]]
        tm.assert_frame_equal(df, expected)

        # reversed x 2
        df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(
            drop=True)
        df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(
            drop=True)
        tm.assert_frame_equal(df, expected)
Example #23
    def time_regex(data, col, form, nulls):
        '''
        Method to format the date columns in the raw data
        based on user input. Returns 3 formatted columns
        i.e. (year, month, day) including nulls
        '''

        fields = ['month', 'day', 'year']
        if any(isinstance(i, list) for i in col):
            col = list(chain.from_iterable(col))
        else:
            pass
        print(type(col))
        print(col)

        if len(nulls) > 0:
            nulldf = hlp.produce_null_df(
                len(nulls), nulls, len(data), 'NaN')
        else:
            nulldf = DataFrame()
        try:
            if col[0] is not None:
                time_list_re = hlp.strip_time(data, col)
            else:
                time_list_re = []

        except Exception as e:
            print(str(e))
            raise AttributeError('Could not strip time format')            
        notnull = [x for x in fields if x not in nulls]

        for i,item in enumerate(form):
            try:
                time_form_list = []
                for j in time_list_re:
                    time_form_list.append(
                        [
                            to_datetime(
                                x, format=form[i]) for x in
                            j
                        ]
                    )
                if len(time_form_list) > 1:
                    timedf = DataFrame(
                        [list(x) for x in zip(
                            *time_form_list)])

                else:
                    timedf = DataFrame(time_form_list[0])                    
                    if len(notnull) == 1:
                        timedf.columns = notnull
                    else:
                        pass
                final = {'formatted': timedf, 'null': nulldf}
                return final
            except Exception as e:
                print(str(e))
                print('Trying different format')
Example #24
def index_models_minho(host="http://darwin.di.uminho.pt/models"):
    """
    Retrieves a summary of all models in the database.

    Parameters
    ----------
    host: the service host (optional, default: http://darwin.di.uminho.pt/models)

    Returns
    -------
    pandas.DataFrame
        summary of the models in the database
    """
    uri = host + "/models.json"
    try:
        response = requests.get(uri)
    except requests.ConnectionError as e:
        logger.error("Cannot reach %s. Are you sure that you are connected to the internet?" % host)
        raise e
    if response.ok:
        try:
            json = response.json()
        except Exception as e:
            logger.error('No json could be decoded from server response coming from {}.'.format(host))
            raise e
        else:
            index = DataFrame(json, columns=["id", "name", "doi", "author",
                                             "year", "formats", "organism",
                                             "taxonomy", "optflux_validated"])
            index.columns = ["id", "name", "doi", "author", "year", "formats", "organism", "taxonomy", "validated"]
            return index
    else:
        raise Exception("Could not index available models. %s returned status code %d" % (host, response.status_code))
Example #25
    def test_include_na(self):
        s = ['a', 'b', np.nan]
        res = get_dummies(s, sparse=self.sparse)
        exp = DataFrame({'a': {0: 1.0,
                               1: 0.0,
                               2: 0.0},
                         'b': {0: 0.0,
                               1: 1.0,
                               2: 0.0}})
        assert_frame_equal(res, exp)

        # Sparse dataframes do not allow nan labelled columns, see #GH8822
        res_na = get_dummies(s, dummy_na=True, sparse=self.sparse)
        exp_na = DataFrame({nan: {0: 0.0,
                                  1: 0.0,
                                  2: 1.0},
                            'a': {0: 1.0,
                                  1: 0.0,
                                  2: 0.0},
                            'b': {0: 0.0,
                                  1: 1.0,
                                  2: 0.0}}).reindex_axis(
                                      ['a', 'b', nan], 1)
        # hack (NaN handling in assert_index_equal)
        exp_na.columns = res_na.columns
        assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse)
        exp_just_na = DataFrame(Series(1.0, index=[0]), columns=[nan])
        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
Example #26
def splitMNIST(data,random_state):

    print("\n####################")
    print("splitMNIST():\n")

    nrow = data["data"].shape[0]
    ncol = data["data"].shape[1]

    label_features = np.hstack(
        tup = (
            np.arange(nrow).reshape((nrow,1)),
            data['target'].reshape((nrow,1)),
            data['data']
            )
        )

    label_features = DataFrame(data=label_features)
    label_features.columns = ['index','label'] + getColnames(ncolSquared = ncol)

    simpleTrainSet, simpleTestSet = train_test_split(
        label_features,
        test_size    = 1/7,
        random_state = random_state
        )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    del label_features
    print("\nexiting: splitMNIST()")
    print("####################")
    return( simpleTrainSet, simpleTestSet )
Example #27
    def test_applymap(self):
        applied = self.frame.applymap(lambda x: x * 2)
        assert_frame_equal(applied, self.frame * 2)
        result = self.frame.applymap(type)

        # GH #465, function returning tuples
        result = self.frame.applymap(lambda x: (x, x))
        tm.assertIsInstance(result['A'][0], tuple)

        # GH 2909, object conversion to float in constructor?
        df = DataFrame(data=[1, 'a'])
        result = df.applymap(lambda x: x)
        self.assertEqual(result.dtypes[0], object)

        df = DataFrame(data=[1., 'a'])
        result = df.applymap(lambda x: x)
        self.assertEqual(result.dtypes[0], object)

        # GH2786
        df = DataFrame(np.random.random((3, 4)))
        df2 = df.copy()
        cols = ['a', 'a', 'a', 'a']
        df.columns = cols

        expected = df2.applymap(str)
        expected.columns = cols
        result = df.applymap(str)
        assert_frame_equal(result, expected)

        # datetime/timedelta
        df['datetime'] = Timestamp('20130101')
        df['timedelta'] = pd.Timedelta('1 min')
        result = df.applymap(str)
        for f in ['datetime', 'timedelta']:
            self.assertEqual(result.loc[0, f], str(df.loc[0, f]))
Example #28
def arrange_aggregates(cumsums, symbols, aggs):
    for i in symbols:
        cumsums[i] = cumsums[i].ix[:,0:5]

    cols = cumsums['ATL'].columns.tolist()
    cols2 = aggs['ATL'].columns.tolist()
    cols3 = (aggs['ATL'].columns + '1').tolist()
    cols.extend(cols2)
    cols.extend(cols3)
    ATL = DataFrame(columns = cols)

    for team in symbols:
        for Date in cumsums[team]['Date']:
            Opponent = cumsums[team].ix[cumsums[team]['Date'] == Date, 'Opponent'].all()
            cumsums_temp = cumsums[team].ix[cumsums[team]['Date'] == Date]
            cumsums_temp = cumsums_temp.reset_index()
            team_temp = aggs[team]
            opponent_temp = DataFrame(aggs[Opponent])
            opponent_temp.columns = cols3
            atl = pd.concat([cumsums_temp, team_temp, opponent_temp], axis = 1)
            atl = atl.drop('index', axis=1)
            atl.columns = cols
            ATL = pd.concat([ATL, atl], axis = 0)

        print(team)

    ATL.to_csv('final.csv', sep=',', index=False)
Example #29
    def components(self):
        """
        Return a dataframe of the components (days, hours, minutes,
        seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas.

        Returns
        -------
        a DataFrame
        """
        from pandas import DataFrame

        columns = ['days', 'hours', 'minutes', 'seconds',
                   'milliseconds', 'microseconds', 'nanoseconds']
        hasnans = self.hasnans
        if hasnans:
            def f(x):
                if isnull(x):
                    return [np.nan] * len(columns)
                return x.components
        else:
            def f(x):
                return x.components

        result = DataFrame([f(x) for x in self])
        result.columns = columns
        if not hasnans:
            result = result.astype('int64')
        return result
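
A quick usage sketch (hypothetical values): components is exposed on timedelta-like indexes and splits each value into the unit columns listed above.

import pandas as pd

tdi = pd.TimedeltaIndex(['1 days 02:03:04', pd.NaT])
print(tdi.components)  # NaT rows come back as NaN, hence the dtype handling above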
Example #30
    def test_include_na(self, sparse, dtype):
        if sparse:
            pytest.xfail(reason='nan in index is problematic (GH 16894)')

        s = ['a', 'b', np.nan]
        res = get_dummies(s, sparse=sparse, dtype=dtype)
        exp = DataFrame({'a': [1, 0, 0],
                         'b': [0, 1, 0]},
                        dtype=self.effective_dtype(dtype))
        assert_frame_equal(res, exp)

        # Sparse dataframes do not allow nan labelled columns, see #GH8822
        res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
        exp_na = DataFrame({nan: [0, 0, 1],
                            'a': [1, 0, 0],
                            'b': [0, 1, 0]},
                           dtype=self.effective_dtype(dtype))
        exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
        # hack (NaN handling in assert_index_equal)
        exp_na.columns = res_na.columns
        assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies([nan], dummy_na=True,
                                  sparse=sparse, dtype=dtype)
        exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
                                dtype=self.effective_dtype(dtype))
        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
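
For reference, a minimal standalone sketch of the dummy_na behavior both versions of this test cover:

import numpy as np
import pandas as pd

print(pd.get_dummies(['a', 'b', np.nan], dummy_na=True))  # adds a NaN-labelled column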