Example #1
0
    def test_consistent_coerce_for_shapes(self):
        # we want column names to NOT be propagated
        # just because the shape matches the input shape
        df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C'])

        result = df.apply(lambda x: [1, 2, 3], axis=1)
        expected = Series([[1, 2, 3] for t in df.itertuples()])
        assert_series_equal(result, expected)

        result = df.apply(lambda x: [1, 2], axis=1)
        expected = Series([[1, 2] for t in df.itertuples()])
        assert_series_equal(result, expected)
Example #2
0
class Iteration(object):
    # ASV benchmark suite timing the DataFrame iteration protocols
    # (iteritems / itertuples / iterrows).  Bodies are left exactly as
    # written: the statements here ARE what gets measured.

    def setup(self):
        # Three frames shaped to stress different iteration styles:
        #   df  -- long and wide (10000 x 1000) for column iteration
        #   df2 -- very long, narrow (50000 x 10) for itertuples
        #   df3 -- extremely wide (1000 x 5000) for per-column indexing
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=['C' + str(c) for c in range(N * 5)])

    def time_iteritems(self):
        # (monitor no-copying behaviour)
        # clearing _item_cache forces each column Series to be rebuilt,
        # so this times the cold (uncached) path
        if hasattr(self.df, '_item_cache'):
            self.df._item_cache.clear()
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_cached(self):
        # same loop, but the item cache is warm from previous runs
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_indexing(self):
        # per-column __getitem__ lookup across a very wide frame
        for col in self.df3:
            self.df3[col]

    def time_itertuples(self):
        # tuple-per-row iteration over the long, narrow frame
        for row in self.df2.itertuples():
            pass

    def time_iterrows(self):
        # Series-per-row iteration (the slowest protocol by design)
        for row in self.df.iterrows():
            pass
Example #3
0
    def test_with_dictlike_columns(self):
        # GH 17602
        df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
        result = df.apply(lambda x: {'s': x['a'] + x['b']},
                          axis=1)
        expected = Series([{'s': 3} for t in df.itertuples()])
        assert_series_equal(result, expected)

        df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'),
                    pd.Timestamp('2017-05-02 00:00:00')]
        result = df.apply(lambda x: {'s': x['a'] + x['b']},
                          axis=1)
        assert_series_equal(result, expected)

        # compose a series
        result = (df['a'] + df['b']).apply(lambda x: {'s': x})
        expected = Series([{'s': 3}, {'s': 3}])
        assert_series_equal(result, expected)

        # GH 18775
        df = DataFrame()
        df["author"] = ["X", "Y", "Z"]
        df["publisher"] = ["BBC", "NBC", "N24"]
        df["date"] = pd.to_datetime(['17-10-2010 07:15:30',
                                     '13-05-2011 08:20:35',
                                     '15-01-2013 09:09:09'])
        result = df.apply(lambda x: {}, axis=1)
        expected = Series([{}, {}, {}])
        assert_series_equal(result, expected)
Example #4
0
    def test_infer_output_shape_columns(self):
        # GH 18573

        df = DataFrame({'number': [1., 2.],
                        'string': ['foo', 'bar'],
                        'datetime': [pd.Timestamp('2017-11-29 03:30:00'),
                                     pd.Timestamp('2017-11-29 03:45:00')]})
        result = df.apply(lambda row: (row.number, row.string), axis=1)
        expected = Series([(t.number, t.string) for t in df.itertuples()])
        assert_series_equal(result, expected)
Example #5
0
    def test_itertuples(self):
        # every yielded tuple must round-trip back to the matching row
        for pos, row_tup in enumerate(self.frame.itertuples()):
            ser = Series(row_tup[1:])
            ser.name = row_tup[0]
            assert_series_equal(
                ser, self.frame.iloc[pos, :].reset_index(drop=True))

        mixed = DataFrame({'floats': np.random.randn(5),
                           'ints': lrange(5)}, columns=['floats', 'ints'])

        # the int column must keep its numpy integer type per row
        for row_tup in mixed.itertuples(index=False):
            assert isinstance(row_tup[1], np.integer)

        small = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]})
        self.assertEqual(list(small[['a', 'a']].itertuples()), [
                         (0, 1, 1), (1, 2, 2), (2, 3, 3)])

        self.assertEqual(repr(list(small.itertuples(name=None))),
                         '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')

        named = next(small.itertuples(name='TestName'))

        # Python 2.6 cannot rename namedtuple fields, so plain tuples
        # are returned there; the field checks only apply on 2.7+
        if sys.version >= LooseVersion('2.7'):
            self.assertEqual(named._fields, ('Index', 'a', 'b'))
            self.assertEqual((named.Index, named.a, named.b), named)
            self.assertEqual(type(named).__name__, 'TestName')

        # keyword column names force positional _1/_2 field names
        small.columns = ['def', 'return']
        keyword_named = next(small.itertuples(name='TestName'))
        self.assertEqual(keyword_named, (0, 1, 4))

        if sys.version >= LooseVersion('2.7'):
            self.assertEqual(keyword_named._fields, ('Index', '_1', '_2'))

        wide = DataFrame({'f' + str(i): [i] for i in range(1024)})
        # will raise SyntaxError if trying to create namedtuple, so a
        # plain tuple must be produced for very wide frames
        wide_tup = next(wide.itertuples())
        self.assertFalse(hasattr(wide_tup, '_fields'))
        assert isinstance(wide_tup, tuple)
Example #6
0
    def test_infer_output_shape_listlike_columns(self):
        # GH 16353

        df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])

        result = df.apply(lambda x: [1, 2, 3], axis=1)
        expected = Series([[1, 2, 3] for t in df.itertuples()])
        assert_series_equal(result, expected)

        result = df.apply(lambda x: [1, 2], axis=1)
        expected = Series([[1, 2] for t in df.itertuples()])
        assert_series_equal(result, expected)

        # GH 17970
        df = DataFrame({"a": [1, 2, 3]}, index=list('abc'))

        result = df.apply(lambda row: np.ones(1), axis=1)
        expected = Series([np.ones(1) for t in df.itertuples()],
                          index=df.index)
        assert_series_equal(result, expected)

        result = df.apply(lambda row: np.ones(2), axis=1)
        expected = Series([np.ones(2) for t in df.itertuples()],
                          index=df.index)
        assert_series_equal(result, expected)

        # GH 17892
        df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'),
                                 pd.Timestamp('2010-02-04'),
                                 pd.Timestamp('2010-02-05'),
                                 pd.Timestamp('2010-02-06')],
                           'b': [9, 5, 4, 3],
                           'c': [5, 3, 4, 2],
                           'd': [1, 2, 3, 4]})

        def fun(x):
            return (1, 2)

        result = df.apply(fun, axis=1)
        expected = Series([(1, 2) for t in df.itertuples()])
        assert_series_equal(result, expected)
Example #7
0
    def test_consistency_for_boxed(self, box):
        # passing an array or list should not affect the output shape
        df = DataFrame(
            np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1,
            columns=['A', 'B', 'C'])

        result = df.apply(lambda x: box([1, 2]), axis=1)
        expected = Series([box([1, 2]) for t in df.itertuples()])
        assert_series_equal(result, expected)

        result = df.apply(lambda x: box([1, 2]), axis=1, result_type='expand')
        expected = DataFrame(
            np.tile(np.arange(2, dtype='int64'), 6).reshape(6, -1) + 1)
        assert_frame_equal(result, expected)
Example #8
0
    def job_status(self, df: pd.DataFrame, job_opts: JobOpts, progressbar=True):
        """Collect the status / result record of every submitted job.

        Notes:
            - Multithreading does not make it faster :(.
        """
        # Force an NFS cache refresh so freshly written result files
        # are visible before we read them.
        os.listdir(job_opts.working_dir.joinpath(job_opts.job_id))  # type: ignore
        rows = tqdm(df.itertuples(), total=len(df), ncols=100,
                    disable=not progressbar)
        records = [self._read_results(row, job_opts) for row in rows]
        if records:
            return pd.DataFrame(records).set_index('Index')
        # no jobs at all -> empty frame with the expected columns
        return pd.DataFrame(columns=['status', 'Index'])
Example #9
0
    def test_itertuples(self):
        # every yielded tuple must round-trip back to the matching row
        for pos, row_tup in enumerate(self.frame.itertuples()):
            ser = self.klass._constructor_sliced(row_tup[1:])
            ser.name = row_tup[0]
            self._assert_series_equal(
                ser, self.frame.iloc[pos, :].reset_index(drop=True))

        mixed = self.klass({'floats': np.random.randn(5),
                            'ints': lrange(5)}, columns=['floats', 'ints'])

        for row_tup in mixed.itertuples(index=False):
            assert isinstance(row_tup[1], (int, long))

        small = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]})

        # duplicate column labels must still yield one value per column
        assert (list(small[['a', 'a']].itertuples()) ==
                [(0, 1, 1), (1, 2, 2), (2, 3, 3)])

        # repr with be int/long on 32-bit/windows
        if not (compat.is_platform_windows() or compat.is_platform_32bit()):
            assert (repr(list(small.itertuples(name=None))) ==
                    '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')

        named = next(small.itertuples(name='TestName'))

        # namedtuple field handling is only available on Python 2.7+
        if sys.version >= LooseVersion('2.7'):
            assert named._fields == ('Index', 'a', 'b')
            assert (named.Index, named.a, named.b) == named
            assert type(named).__name__ == 'TestName'

        # keyword column names force positional _1/_2 field names
        small.columns = ['def', 'return']
        keyword_named = next(small.itertuples(name='TestName'))
        assert keyword_named == (0, 1, 4)

        if sys.version >= LooseVersion('2.7'):
            assert keyword_named._fields == ('Index', '_1', '_2')

        wide = DataFrame({'f' + str(i): [i] for i in range(1024)})
        # will raise SyntaxError if trying to create namedtuple, so a
        # plain tuple must come back for very wide frames
        wide_tup = next(wide.itertuples())
        assert not hasattr(wide_tup, '_fields')
        assert isinstance(wide_tup, tuple)
Example #10
0
    def test_sequence_like_with_categorical(self):

        # GH 7839
        # make sure can iterate
        df = DataFrame({"id": [1, 2, 3, 4, 5, 6],
                        "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
        df['grade'] = Categorical(df['raw_grade'])

        # basic sequencing testing
        result = list(df.grade.values)
        expected = np.array(df.grade.values).tolist()
        tm.assert_almost_equal(result, expected)

        # iteration
        for t in df.itertuples(index=False):
            str(t)

        for row, s in df.iterrows():
            str(s)

        for c, col in df.iteritems():
            str(s)
Example #11
0
class Iteration:
    # ASV benchmark suite for DataFrame iteration (iteritems /
    # itertuples / iterrows), including memory (mem_*) and peak-memory
    # (peakmem_*) variants.  Bodies are left exactly as written: the
    # statements here ARE what gets measured.
    # mem_itertuples_* benchmarks are slow
    timeout = 120

    def setup(self):
        # Frames shaped to stress different iteration styles:
        #   df  -- long and wide (10000 x 1000) for column iteration
        #   df2 -- very long, narrow (50000 x 10)
        #   df3 -- extremely wide (1000 x 5000) for per-column indexing
        #   df4 -- extra long (1000000 x 10) for the itertuples suite
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=['C' + str(c) for c in range(N * 5)])
        self.df4 = DataFrame(np.random.randn(N * 1000, 10))

    def time_iteritems(self):
        # (monitor no-copying behaviour)
        # clearing _item_cache forces each column Series to be rebuilt,
        # so this times the cold (uncached) path
        if hasattr(self.df, '_item_cache'):
            self.df._item_cache.clear()
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_cached(self):
        # same loop, but the item cache is warm from previous runs
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_indexing(self):
        # per-column __getitem__ lookup across a very wide frame
        for col in self.df3:
            self.df3[col]

    # -- itertuples with the default namedtuple rows ------------------
    # *_start: cost of creating the generator only
    # *_read_first: generator creation plus one row
    # plain / *_to_list: full iteration, lazily vs materialized

    def time_itertuples_start(self):
        self.df4.itertuples()

    def time_itertuples_read_first(self):
        next(self.df4.itertuples())

    def time_itertuples(self):
        for row in self.df4.itertuples():
            pass

    def time_itertuples_to_list(self):
        list(self.df4.itertuples())

    def mem_itertuples_start(self):
        return self.df4.itertuples()

    def peakmem_itertuples_start(self):
        self.df4.itertuples()

    def mem_itertuples_read_first(self):
        return next(self.df4.itertuples())

    def peakmem_itertuples(self):
        for row in self.df4.itertuples():
            pass

    def mem_itertuples_to_list(self):
        return list(self.df4.itertuples())

    def peakmem_itertuples_to_list(self):
        list(self.df4.itertuples())

    # -- itertuples "raw" variants: plain tuples, no index, which
    # skips the namedtuple construction overhead -----------------------

    def time_itertuples_raw_start(self):
        self.df4.itertuples(index=False, name=None)

    def time_itertuples_raw_read_first(self):
        next(self.df4.itertuples(index=False, name=None))

    def time_itertuples_raw_tuples(self):
        for row in self.df4.itertuples(index=False, name=None):
            pass

    def time_itertuples_raw_tuples_to_list(self):
        list(self.df4.itertuples(index=False, name=None))

    def mem_itertuples_raw_start(self):
        return self.df4.itertuples(index=False, name=None)

    def peakmem_itertuples_raw_start(self):
        self.df4.itertuples(index=False, name=None)

    def peakmem_itertuples_raw_read_first(self):
        next(self.df4.itertuples(index=False, name=None))

    def peakmem_itertuples_raw(self):
        for row in self.df4.itertuples(index=False, name=None):
            pass

    def mem_itertuples_raw_to_list(self):
        return list(self.df4.itertuples(index=False, name=None))

    def peakmem_itertuples_raw_to_list(self):
        list(self.df4.itertuples(index=False, name=None))

    def time_iterrows(self):
        # Series-per-row iteration (the slowest protocol by design)
        for row in self.df.iterrows():
            pass
 def load_years(years):
     """Load population-by-gender estimates for each year into the DB.

     For every year, pulls state-level total/male/female population
     estimates from the raw Django models, derives gender percentages,
     and upserts one PopulationGenderState row per state.  Stops and
     returns 0 as soon as a year has no data, since later years in the
     decade cannot have any either.

     Relies on outer-scope names: args, states, db, Sum, pd, np and the
     Django models -- unchanged from the original.
     """
     for year in years:
         print('loading %s' % year)
         column = 'popestimate%s' % year

         def fetch(gender_code, label):
             # Build and evaluate one Django ORM aggregation query and
             # return it as a single-column DataFrame indexed by state.
             # NOTE(review): eval of a constructed string is dangerous if
             # args[0] can ever come from user input; prefer resolving
             # the model class via getattr on the models module.
             # NOTE(review): the model name uses args[0] while the
             # summed column uses `year` -- confirm that is intentional.
             query = ("PopulationEst%sRaw.objects.values('state')"
                      ".filter(gender='%s',ethnic_origin='0')"
                      ".annotate(population=Sum(column))"
                      % (args[0], gender_code))
             frame = DataFrame.from_records(eval(query), index=['state'])
             frame.columns = [label]
             return frame

         total_pop = fetch('0', 'total')
         # scalar sum of the single 'total' column; NaN means the raw
         # table has no rows for this year yet.  (The original summed
         # the whole frame, which is ambiguous on newer pandas.)
         if np.isnan(total_pop['total'].sum()):
             # no data for the current year means no data for the rest
             # of the decade either, so stop the load entirely
             print('No data for year %s. Stopping load.' % year)
             return 0
         male_pop = fetch('1', 'male')
         female_pop = fetch('2', 'female')

         # merge the total, male, and female frames into the master df
         pop = DataFrame(index=['state', 'county'])
         for piece in (total_pop, male_pop, female_pop):
             pop = pd.merge(pop, piece, how='right',
                            left_index=True, right_index=True)

         # calculate male and female percentages and merge them in too
         # (column order stays total, male, female, male_percent,
         # female_percent so the positional p[1..5] reads below hold)
         for label in ('male', 'female'):
             percent = DataFrame(pop.apply(
                 lambda row: row[label] * 1.0 / row['total'] * 100,
                 axis=1), columns=['%s_percent' % label])
             pop = pd.merge(pop, percent,
                            left_index=True, right_index=True)

         # upsert one row per state; p[0] is the state code (the index)
         for p in pop.itertuples():
             state_id = states['id'][p[0]]
             try:
                 record = PopulationGenderState.objects.get(
                     state=state_id,
                     year=year)
             except PopulationGenderState.DoesNotExist:
                 # narrowed from a bare except: only "no such row"
                 # should trigger creation of a fresh record
                 record = PopulationGenderState()
                 record.state_id = state_id
                 record.year = year
             record.total = p[1]
             record.male = p[2]
             record.female = p[3]
             record.male_percent = str(p[4])
             record.female_percent = str(p[5])
             record.save()
             db.reset_queries()
Example #13
0
    def load_guess_score_map(guess_df: pd.DataFrame) -> defaultdict:
        """Index guess scores by guesser, then by (qnum, sentence, token, guess)."""
        score_map = defaultdict(dict)
        for record in guess_df.itertuples():
            key = (record.qnum, record.sentence, record.token, record.guess)
            score_map[record.guesser][key] = record.score
        return score_map