Beispiel #1
0
def subset(data):
    """
    Summarise the selected countries since 1980 and export the result as CSV.

    Rows of ``data['table']`` whose ``origin`` is in ``SELECTED_COUNTRIES``
    and whose ``year`` is at least 1980 are grouped per year and origin, and
    every population column is summed. The grouped table is written to
    ``subset.csv``; a pivot of ``total`` by year and origin is written to
    ``subset_pivot.csv``.

    :param data:
        Mapping whose ``'table'`` entry is the table to summarise.
    """
    def is_selected(row):
        return row['origin'] in SELECTED_COUNTRIES and row['year'] >= 1980

    def year_and_origin(row):
        return '/'.join([str(row['year']), row['origin']])

    def key_part(index):
        # The group key has the form "<year>/<origin>"; extract one part.
        return agate.Formula(
            agate.Text(),
            lambda row: row['year_and_origin'].split('/')[index])

    summed_columns = (
        'refugees',
        'asylum_seekers',
        'returned_refugees',
        'idps',
        'returned_idps',
        'stateless_persons',
        'others',
        'total',
    )

    grouped = data['table'].where(is_selected).group_by(
        year_and_origin, key_name='year_and_origin')

    totals = grouped.aggregate(
        [(name, agate.Sum(name)) for name in summed_columns])
    totals = totals.order_by('year_and_origin', reverse=True)

    totals = totals.compute([
        ('year', key_part(0)),
        ('origin', key_part(1)),
    ])

    # 'returned_refugees' is summed above but is not part of the export.
    totals = totals.select([
        'origin',
        'year',
        'refugees',
        'asylum_seekers',
        'idps',
        'returned_idps',
        'stateless_persons',
        'others',
        'total',
    ])

    totals.to_csv('subset.csv')

    pivoted = totals.pivot('year', 'origin', agate.Sum('total'))
    pivoted.order_by('year').to_csv('subset_pivot.csv')
    def test_load(self):
        """Load the exonerations CSV with three column types forced."""
        forced_types = {
            'last_name': agate.Text(),
            'first_name': agate.Text(),
            'age': agate.Number(),
        }
        type_tester = agate.TypeTester(force=forced_types)

        exonerations = agate.Table.from_csv(
            '../../../data/exonerations-20150828.csv',
            column_types=type_tester)

        # Print a description of the table.
        print(exonerations)
Beispiel #3
0
def get_types(example_row):
    """
    Map each cell of a spreadsheet row to an agate data type.

    :param example_row:
        Iterable of cells; each cell's ``ctype`` is looked up in
        ``xlrd.sheet.ctype_text``.
    :returns:
        A list with one agate data type instance per cell. Cell kinds other
        than ``text``, ``number`` and ``xldate`` are treated as text.
    """
    type_for_kind = {
        'text': agate.Text,
        'number': agate.Number,
        'xldate': agate.Date,
    }

    return [
        type_for_kind.get(xlrd.sheet.ctype_text[cell.ctype], agate.Text)()
        for cell in example_row
    ]
Beispiel #4
0
    def test_distinct_values(self):
        """
        Compare ``BaseHound.distinct_values('size')`` with agate's own
        ``distinct('size')`` on an equivalent in-memory table.
        """
        schema = [
            ('id', agate.Number()),
            ('name', agate.Text()),
            ('dob', agate.Date()),
            ('last seen', agate.DateTime()),
            ('size', agate.Text()),
            ('active', agate.Boolean()),
        ]

        # Every fixture row shares the same "last seen" timestamp.
        last_seen = '06-30-2019 12:12:00'
        people = [
            (1, 'Alvin Cotton', '03-01-1980', 'L', True),
            (2, 'Usmaan Rojas', '01-12-1978', 'S', False),
            (3, 'Kingston Odling', '04-09-1990', 'M', True),
            (3, 'Pooja Gillespie', '10-07-1985', 'S', True),
            (4, 'Hal Blake', '08-17-1989', 'L', True),
            (5, 'Shannen Blevins', '06-10-1981', 'M', False),
            (5, 'Courteney Weston', '04-23-1992', 'M', False),
            (6, 'Conner Calhoun', '05-16-1977', 'XL', True),
            (7, 'Susie Rasmussen', '02-08-1987', 'L', False),
            (8, 'Cassie Beltran', '12-15-1982', 'M', True),
        ]
        rows = [
            (ident, name, dob, last_seen, size, active)
            for ident, name, dob, size, active in people
        ]

        model = csvhound.core.BaseHound()
        # The returned table is not used; presumably this call loads the
        # file into the model for distinct_values() -- TODO confirm.
        model.get_table_from_file('sample-data/test-distinct.csv')
        distinct = model.distinct_values('size')

        column_names = [name for name, _ in schema]
        column_types = [column_type for _, column_type in schema]
        reference = agate.Table(rows, column_names, column_types)
        expected = reference.select('size').distinct('size')

        # now do the testing
        self.assertColumnNames(distinct, ('size', ))
        # NOTE(review): the expected types are derived from `distinct`
        # itself, so this assertion compares the table with its own types.
        own_types = [type(c) for c in distinct.column_types]
        self.assertColumnTypes(distinct, own_types)
        self.assertRows(distinct, expected)
Beispiel #5
0
    def test_create_if_not_exists(self):
        """Write two agate tables into the same SQL table."""
        column_names = ['id', 'name']
        column_types = [agate.Number(), agate.Text()]
        first_batch = ((1, 'Jake'), (2, 'Howard'))
        second_batch = ((3, 'Liz'), (4, 'Tim'))

        tables = [
            agate.Table(batch, column_names, column_types)
            for batch in (first_batch, second_batch)
        ]

        connection = create_engine(self.connection_string).connect()

        # Both writes ask for the table to be created; the second one relies
        # on create_if_not_exists because the table is already there.
        for table in tables:
            table.to_sql(
                connection,
                'create_if_not_exists_test',
                create=True,
                create_if_not_exists=True,
                insert=True,
            )
Beispiel #6
0
    def setUp(self):
        """Build a five-column fixture table and an in-memory SQLite URL."""
        schema = (
            ('number', agate.Number()),
            ('text', agate.Text()),
            ('boolean', agate.Boolean()),
            ('date', agate.Date()),
            ('datetime', agate.DateTime()),
        )

        # Includes a non-integer number, a non-ASCII text value, a repeated
        # number and a row that is null in every column except 'text'.
        self.rows = (
            (1.123, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
            (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
            (2, u'c', False, '11/5/2015', '11/4/2015 12:45 PM'),
            (None, 'b', None, None, None),
        )
        self.column_names = [name for name, _ in schema]
        self.column_types = [column_type for _, column_type in schema]

        self.table = agate.Table(
            self.rows, self.column_names, self.column_types)
        self.connection_string = 'sqlite:///:memory:'
Beispiel #7
0
def load_data(data):
    """
    Load the exonerations CSV into ``data['exonerations']``.

    :param data:
        Mapping that receives the loaded table under the key
        ``'exonerations'``.
    """
    text_type = agate.Text()
    number_type = agate.Number()
    boolean_type = agate.Boolean()

    columns = (
        ('last_name', text_type),
        ('first_name', text_type),
        ('age', number_type),
        ('race', text_type),
        ('state', text_type),
        ('tags', text_type),
        ('crime', text_type),
        ('sentence', text_type),
        ('convicted', number_type),
        ('exonerated', number_type),
        ('dna', boolean_type),
        ('dna_essential', text_type),
        ('mistaken_witness', boolean_type),
        ('false_confession', boolean_type),
        ('perjury', boolean_type),
        ('false_evidence', boolean_type),
        ('official_misconduct', boolean_type),
        ('inadequate_defense', boolean_type),
    )

    with open('examples/realdata/exonerations-20150828.csv') as f:
        # Create a csv reader
        reader = csv.reader(f)

        # Skip the header through the reader rather than the raw file, so
        # the header is consumed as one CSV record even if a quoted field
        # contains a line break.
        next(reader)

        # Create the table while the file is still open: the reader is
        # consumed by the Table constructor.
        data['exonerations'] = agate.Table(reader, columns)
Beispiel #8
0
def load_year_killed_data(year):
    """
    Download the ``accidents_killed_<year>.csv`` table from S3.

    :param year:
        Value substituted into the file name.
    :returns:
        The result of ``agate.Table.from_url`` with ``killed`` and
        ``injured`` typed as numbers and ``date_hour`` as text.
    """
    column_types = {
        'killed': agate.Number(),
        'injured': agate.Number(),
        'date_hour': agate.Text(),
    }
    url = 'https://s3.amazonaws.com/traffic-sd/accidents_killed_{}.csv'.format(
        year)

    return agate.Table.from_url(url, column_types=column_types)
Beispiel #9
0
    def setUp(self):
        """
        Load ``examples/heights.csv`` into ``self.table`` and remove any
        leftover ``TEST_FILENAME`` file from a previous run.
        """
        text_type = agate.Text()
        number_type = agate.Number()

        columns = (
            ('gender', text_type),
            ('month', number_type),
            ('median', number_type),
            ('stdev', number_type),
            ('1st', number_type),
            ('3rd', number_type),
            ('5th', number_type),
            ('15th', number_type),
            ('25th', number_type),
            ('50th', number_type),
            ('75th', number_type),
            ('85th', number_type),
            ('95th', number_type),
            ('97th', number_type),
            ('99th', number_type)
        )

        with open('examples/heights.csv') as f:
            # Create a csv reader
            reader = csv.reader(f)

            # Skip the header through the reader rather than the raw file,
            # so the header is consumed as one CSV record.
            next(reader)

            # Create the table while the file is still open: the reader is
            # consumed by the Table constructor.
            self.table = agate.Table(reader, columns)

        if os.path.exists(TEST_FILENAME):
            os.remove(TEST_FILENAME)
def add_full_hour_date(data):
    """
    Add a ``date_hour`` text column derived from ``date_time``.

    The new column holds ``date_time`` formatted with minutes and seconds
    fixed to ``00``. ``data['table']`` is replaced by the extended table.

    :param data:
        Mapping whose ``'table'`` entry is read and then overwritten.
    """
    def to_full_hour(row):
        return row['date_time'].strftime("%Y-%m-%d %H:00:00")

    hour_formula = agate.Formula(agate.Text(), to_full_hour)
    data['table'] = data['table'].compute([('date_hour', hour_formula)])
Beispiel #11
0
def load_data(data):
    """
    Read the persons-of-concern CSV export into ``data['table']``.

    Cells containing only ``*`` are loaded as ``None``.

    :param data:
        Mapping that receives the loaded table under the key ``'table'``.
    """
    text_type = agate.Text()
    number_type = agate.Number()

    columns = OrderedDict([
        ('year', number_type),
        ('residence', text_type),
        ('origin', text_type),
    ])
    # Every remaining column is numeric.
    for name in ('refugees', 'asylum_seekers', 'returned_refugees', 'idps',
                 'returned_idps', 'stateless_persons', 'others', 'total'):
        columns[name] = number_type

    filename = 'unhcr_popstats_export_persons_of_concern_2016_01_12_192533.csv'

    with open(filename) as f:
        reader = csvkit.reader(f)

        # Skip the header row.
        next(reader)

        rows = [
            [None if cell == '*' else cell for cell in record]
            for record in reader
        ]

        data['table'] = agate.Table(rows, columns.keys(), columns.values())
Beispiel #12
0
    def test_create_if_not_exists(self):
        """
        Two tables written to the same SQL table with
        ``create_if_not_exists=True`` are read back as one combined table.
        """
        column_names = ['id', 'name']
        column_types = [agate.Number(), agate.Text()]
        rows1 = (
            (1, 'Jake'),
            (2, 'Howard'),
        )
        rows2 = (
            (3, 'Liz'),
            (4, 'Tim'),
        )

        table1 = agate.Table(rows1, column_names, column_types)
        table2 = agate.Table(rows2, column_names, column_types)

        engine = create_engine(self.connection_string)
        connection = engine.connect()

        # Write two agate tables into the same SQL table
        table1.to_sql(connection, 'create_if_not_exists_test', create=True, create_if_not_exists=True, insert=True)
        table2.to_sql(connection, 'create_if_not_exists_test', create=True, create_if_not_exists=True, insert=True)

        table = agate.Table.from_sql(connection, 'create_if_not_exists_test')
        self.assertSequenceEqual(table.column_names, column_names)
        self.assertIsInstance(table.column_types[0], agate.Number)
        self.assertIsInstance(table.column_types[1], agate.Text)
        # The combined table must hold the rows of BOTH source tables.
        self.assertEqual(len(table.rows), len(table1.rows) + len(table2.rows))
        self.assertSequenceEqual(table.rows[0], table1.rows[0])
    def setUp(self):
        """Build a three-row, five-column fixture table."""
        schema = (
            ('number', agate.Number()),
            ('text', agate.Text()),
            ('boolean', agate.Boolean()),
            ('date', agate.Date()),
            ('datetime', agate.DateTime()),
        )

        self.rows = (
            (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
            (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
            (None, 'b', None, None, None),
        )

        self.column_names = [name for name, _ in schema]
        # Same names, but kept as a separate list object.
        self.user_provided_column_names = list(self.column_names)
        self.column_types = [column_type for _, column_type in schema]

        self.table = agate.Table(
            self.rows, self.column_names, self.column_types)
Beispiel #14
0
    def get_column_types(self):
        """
        Build the ``agate.TypeTester`` described by the command-line options.

        With ``no_inference`` only text is tried. Otherwise the number type
        is tried second, right after boolean, unless a date or datetime
        format was given; in that case it is tried after the date types and
        just before text.
        """
        args = self.args
        type_kwargs = {'null_values': ()} if getattr(args, 'blanks', None) else {}

        text_type = agate.Text(**type_kwargs)

        if args.no_inference:
            return agate.TypeTester(types=[text_type])

        number_type = agate.Number(locale=args.locale, **type_kwargs)
        boolean_type = agate.Boolean(**type_kwargs)
        temporal_types = [
            agate.TimeDelta(**type_kwargs),
            agate.Date(date_format=args.date_format, **type_kwargs),
            agate.DateTime(datetime_format=args.datetime_format,
                           **type_kwargs),
        ]

        # See the order in the `agate.TypeTester` class.
        if args.date_format or args.datetime_format:
            # In order to parse dates like "20010101".
            types = [boolean_type] + temporal_types + [number_type, text_type]
        else:
            types = [boolean_type, number_type] + temporal_types + [text_type]

        return agate.TypeTester(types=types)
Beispiel #15
0
def from_sql(cls, connection_or_string, table_name):
    """
    Create a new :class:`agate.Table` from a given SQL table. Types will be
    inferred from the database schema.

    Monkey patched as class method :meth:`Table.from_sql`.

    :param connection_or_string:
        An existing sqlalchemy connection or connection string.
    :param table_name:
        The name of a table in the referenced database.
    :returns:
        A new :class:`agate.Table` holding every row of the SQL table.
    :raises ValueError:
        If a column's Python type is not one of the supported number,
        boolean, text, date, datetime or timedelta types.
    """
    engine, connection = get_engine_and_connection(connection_or_string)

    # Everything that uses the connection runs inside the try block, so a
    # failure during reflection, type mapping or the query itself still
    # releases a connection that was opened here.
    try:
        metadata = MetaData(connection)
        sql_table = Table(table_name,
                          metadata,
                          autoload=True,
                          autoload_with=connection)

        column_names = []
        column_types = []

        for sql_column in sql_table.columns:
            column_names.append(sql_column.name)

            # Interval column types are mapped to timedelta directly
            # instead of asking the column type for its python_type.
            if type(sql_column.type) in INTERVAL_MAP.values():
                py_type = datetime.timedelta
            else:
                py_type = sql_column.type.python_type

            if py_type in [int, float, decimal.Decimal]:
                if py_type is float:
                    # Flag float columns as decimal on the SQL column type.
                    sql_column.type.asdecimal = True
                column_types.append(agate.Number())
            elif py_type is bool:
                column_types.append(agate.Boolean())
            elif issubclass(py_type, six.string_types):
                column_types.append(agate.Text())
            elif py_type is datetime.date:
                column_types.append(agate.Date())
            elif py_type is datetime.datetime:
                column_types.append(agate.DateTime())
            elif py_type is datetime.timedelta:
                column_types.append(agate.TimeDelta())
            else:
                raise ValueError('Unsupported sqlalchemy column type: %s' %
                                 type(sql_column.type))

        s = select([sql_table])

        rows = connection.execute(s)

        return agate.Table(rows, column_names, column_types)
    finally:
        # NOTE(review): engine is presumably None when the caller passed in
        # an existing connection, which is then left open -- confirm against
        # get_engine_and_connection.
        if engine is not None:
            connection.close()
            engine.dispose()
Beispiel #16
0
    def get_column_types(self):
        """
        Build the ``agate.TypeTester`` described by the command-line options.

        With ``no_inference`` only text is tried; otherwise boolean, number,
        timedelta, date, datetime and finally text.
        """
        keep_blanks = getattr(self.args, 'blanks', None)
        text_type = agate.Text(cast_nulls=False) if keep_blanks else agate.Text()

        if self.args.no_inference:
            candidates = [text_type]
        else:
            candidates = [
                agate.Boolean(),
                agate.Number(locale=self.args.locale),
                agate.TimeDelta(),
                agate.Date(date_format=self.args.date_format),
                agate.DateTime(datetime_format=self.args.datetime_format),
                text_type,
            ]

        return agate.TypeTester(types=candidates)
Beispiel #17
0
    def test_lookup_multiple_keys(self):
        """Look up ``population`` using the ``usps`` and ``year`` columns."""
        keys = ['usps', 'year']
        table = agate.Table(
            (('AZ', '1985'), ('WY', '2014'), ('SC', '1994')),
            keys,
            [agate.Text(), agate.Text()],
        )

        result = table.lookup(['usps', 'year'], 'population',
                              source=self._source)

        self.assertColumnNames(result, ['usps', 'year', 'population'])
        self.assertColumnTypes(result, [agate.Text, agate.Text, agate.Number])

        second_row = result.rows[1].values()
        self.assertSequenceEqual(second_row, ['WY', '2014', 584153])
Beispiel #18
0
    def _analyze_date(self, event):
        """
        Print one calendar heatmap per year for the ``fields/date`` column.

        Rows without a date are ignored. The remaining rows are counted per
        reduced date within each year, and each year's counts are passed to
        ``calendar_heatmap``.

        :param event:
            Object whose ``set()`` method is called once the per-year tables
            are built and before anything is printed.
        """
        def date_formula(data_type, reducer):
            # Apply one of the helpers' reducers to the row's date field.
            return agate.Formula(
                data_type, lambda row: reducer(row["fields/date"]))

        dated = self.table.where(lambda row: row["fields/date"] is not None)
        table = dated.compute([
            ("reduce_to_date",
             date_formula(agate.Text(), helpers.reduce_to_date)),
            ("reduce_to_year",
             date_formula(agate.Number(), helpers.reduce_to_year)),
            ("reduce_to_time",
             date_formula(agate.Number(), helpers.reduce_to_time)),
        ])

        year_column = table.distinct("reduce_to_year").columns["reduce_to_year"]
        years = year_column.values()

        _data = {}

        for year in years:
            in_year = table.where(lambda row: row["reduce_to_year"] == year)
            per_date = in_year.select("reduce_to_date").pivot("reduce_to_date")
            _data[year] = per_date.order_by("reduce_to_date")

        event.set()

        print(f"\n\n{helpers.h1_icn} Date\n")

        for year in years:
            year_columns = _data[year].columns
            data_keys = list(year_columns["reduce_to_date"].values())
            _counts = [int(count) for count in year_columns["Count"].values()]
            _sum = sum(_counts)
            # calendar_heatmap is given one single-element list per date.
            data_count = [[count] for count in _counts]

            args = {
                "color": False,
                "custom_tick": False,
                "start_dt": f"{year}-01-01"
            }

            print(f"\n{helpers.h2_icn} Year {year} ({_sum:,} emails)\n")
            calendar_heatmap(data=data_count, args=args, labels=data_keys)
Beispiel #19
0
    def test_make_sql_table_min_col_len(self):
        """
        With ``min_col_len=20`` the ``name`` column, whose longest value has
        10 characters, gets a length of 20.
        """
        rows = ((1, 'x' * 10), (2, ''))
        column_names = ['id', 'name']
        column_types = [agate.Number(), agate.Text()]
        table = agate.Table(rows, column_names, column_types)

        sql_table = agatesql.table.make_sql_table(table, 'test_table', dialect='mysql', db_schema='test_schema',
                                                  constraints=True, min_col_len=20)

        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(sql_table.columns.get('name').type.length, 20)
Beispiel #20
0
    def test_to_sql_create_statement_wide_width(self):
        """
        A 21845-character text value produces a ``TEXT`` column in the MySQL
        ``CREATE TABLE`` statement.
        """
        table = agate.Table(
            ((1, 'x' * 21845), (2, '')),
            ['id', 'name'],
            [agate.Number(), agate.Text()],
        )

        statement = table.to_sql_create_statement(
            'test_table', db_schema='test_schema', dialect='mysql')

        # Spelled out line by line: the space after "NOT NULL," is part of
        # the expected text.
        expected = (
            'CREATE TABLE test_schema.test_table (\n'
            '  id DECIMAL(38, 0) NOT NULL, \n'
            '  name TEXT\n'
            ');'
        )

        self.assertEqual(statement.replace('\t', '  '), expected)
Beispiel #21
0
    def test_lookup_require_match(self):
        """``lookup`` with ``require_match=True`` is expected to raise."""
        table = agate.Table(
            (('WA', ), ('VA', ), ('FA', )),
            ['usps'],
            [agate.Text()],
        )

        # The return value is irrelevant: the call itself must raise.
        with self.assertRaises(ValueError):
            table.lookup('usps', 'state', require_match=True,
                         source=self._source)
Beispiel #22
0
    def test_lookup_no_match(self):
        """A key without a match yields ``None`` in the looked-up column."""
        table = agate.Table(
            (('WA', ), ('VA', ), ('FA', )),
            ['usps'],
            [agate.Text()],
        )

        result = table.lookup('usps', 'state', source=self._source)

        self.assertColumnNames(result, ['usps', 'state'])
        self.assertColumnTypes(result, [agate.Text, agate.Text])

        third_row = result.rows[2].values()
        self.assertSequenceEqual(third_row, ['FA', None])
def main():
    """
    Print a bar chart of how often each ``date`` value occurs in the CSV at
    ``SRC_PATH``, limited to the first ten and last ten dates.
    """
    frame = pd.read_csv(SRC_PATH, dtype=str)
    per_date = frame['date'].value_counts().sort_index()

    # Keep only the first 10 and last 10 rows, with a '...' placeholder row
    # in between.
    placeholder = pd.Series({'...': None}, name='date')
    trimmed = pd.concat([per_date.head(10), placeholder, per_date.tail(10)])

    # Turn the series into a list of [date, count] rows for agate.
    vals = [list(pair) for pair in trimmed.to_dict().items()]

    column_names = ['date', 'count']
    column_types = [agate.Text(), agate.Number()]
    table = agate.Table(vals, column_names, column_types)
    table.print_bars('date', 'count')
 def test_grouping(self):
     """
     Print the ten states with the most exonerations.

     States prefixed with ``F-`` are flagged in a new ``federal`` column and
     have the prefix removed before grouping.
     """
     def is_federal(row):
         return row['state'].startswith('F-')

     def bare_state(row):
         state = row['state']
         return state[2:] if state.startswith('F-') else state

     exonerations = agate.Table.from_csv(
         '../../../data/exonerations-20150828.csv')

     # replace=True lets the computed 'state' overwrite the original column.
     cleaned = exonerations.compute(
         [
             ('federal', agate.Formula(agate.Boolean(), is_federal)),
             ('state', agate.Formula(agate.Text(), bare_state)),
         ],
         replace=True)

     per_state = cleaned.group_by('state').aggregate(
         [('count', agate.Count())])
     ranked = per_state.order_by('count', reverse=True)
     ranked.print_table(max_rows=10)
Beispiel #25
0
    def test_lookup_key(self):
        """Look up ``state`` from ``postal`` using ``lookup_key='usps'``."""
        table = agate.Table(
            (('WA', ), ('VA', ), ('TX', )),
            ['postal'],
            [agate.Text()],
        )

        result = table.lookup('postal', 'state', lookup_key='usps',
                              source=self._source)

        self.assertColumnNames(result, ['postal', 'state'])
        self.assertColumnTypes(result, [agate.Text, agate.Text])

        second_row = result.rows[1].values()
        self.assertSequenceEqual(second_row, ['VA', 'Virginia'])
Beispiel #26
0
    def test_to_sql_create_statement_zero_width(self):
        """
        A text column holding only empty strings is declared as
        ``VARCHAR(1)`` in the MySQL ``CREATE TABLE`` statement.
        """
        table = agate.Table(
            ((1, ''), (2, '')),
            ['id', 'name'],
            [agate.Number(), agate.Text()],
        )

        statement = table.to_sql_create_statement(
            'test_table', db_schema='test_schema', dialect='mysql')

        expected_fragments = (
            'CREATE TABLE test_schema.test_table',
            'id DECIMAL(38, 0) NOT NULL,',
            'name VARCHAR(1)',
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, statement)
Beispiel #27
0
    def get_column_types(self):
        """
        Build the ``agate.TypeTester`` described by the command-line options.

        With ``no_inference`` only text is tried; otherwise boolean, number,
        timedelta, date, datetime and finally text.
        """
        args = self.args
        type_kwargs = {'null_values': ()} if getattr(args, 'blanks', None) else {}

        text_type = agate.Text(**type_kwargs)

        if args.no_inference:
            return agate.TypeTester(types=[text_type])

        return agate.TypeTester(types=[
            agate.Boolean(**type_kwargs),
            agate.Number(locale=args.locale, **type_kwargs),
            agate.TimeDelta(**type_kwargs),
            agate.Date(date_format=args.date_format, **type_kwargs),
            agate.DateTime(datetime_format=args.datetime_format, **type_kwargs),
            text_type,
        ])
Beispiel #28
0
    def test_lookup_version(self):
        """Look up ``description`` from ``naics`` with ``version='2012'``."""
        table = agate.Table(
            (('1111', ), ('313320', ), ('522310', )),
            ['naics'],
            [agate.Text()],
        )

        result = table.lookup('naics', 'description', version='2012',
                              source=self._source)

        self.assertColumnNames(result, ['naics', 'description'])
        self.assertColumnTypes(result, [agate.Text, agate.Text])

        second_row = result.rows[1].values()
        self.assertSequenceEqual(second_row,
                                 ['313320', 'Fabric Coating Mills'])
Beispiel #29
0
    def get_column_types(self):
        """
        Build the ``agate.TypeTester`` described by the command-line options.

        With ``no_inference`` only text is tried; otherwise boolean,
        timedelta, date, datetime, number and finally text.
        """
        args = self.args
        type_kwargs = {'null_values': ()} if getattr(args, 'blanks', None) else {}

        text_type = agate.Text(**type_kwargs)

        if args.no_inference:
            return agate.TypeTester(types=[text_type])

        return agate.TypeTester(types=[
            agate.Boolean(**type_kwargs),
            agate.TimeDelta(**type_kwargs),
            agate.Date(date_format=args.date_format, **type_kwargs),
            agate.DateTime(datetime_format=args.datetime_format,
                           **type_kwargs),
            # This is a different order than agate's default, in order to
            # parse dates like "20010101".
            agate.Number(locale=args.locale, **type_kwargs),
            text_type,
        ])
Beispiel #30
0
    def setUp(self):
        """Build a three-row fixture table with one column per agate type."""
        schema = (
            ('number', agate.Number()),
            ('text', agate.Text()),
            ('boolean', agate.Boolean()),
            ('date', agate.Date()),
            ('datetime', agate.DateTime()),
            ('timedelta', agate.TimeDelta()),
        )

        self.rows = (
            (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM', '4:15'),
            (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM', '6:18'),
            (None, 'b', None, None, None, None),
        )

        self.column_names = [name for name, _ in schema]
        self.column_types = [column_type for _, column_type in schema]

        self.table = agate.Table(
            self.rows, self.column_names, self.column_types)