Exemple #1
0
    def get_column_types(self):
        """Build the ``agate.TypeTester`` used to infer column types.

        When ``self.args.blanks`` is set, every data type is created with an
        empty ``null_values`` tuple. When ``self.args.no_inference`` is set,
        ``agate.Text`` is the only candidate type.
        """
        keep_blanks = getattr(self.args, 'blanks', None)
        shared_kwargs = {'null_values': ()} if keep_blanks else {}

        text = agate.Text(**shared_kwargs)

        # Without inference, Text is the only candidate.
        if self.args.no_inference:
            return agate.TypeTester(types=[text])

        number = agate.Number(locale=self.args.locale, **shared_kwargs)

        # See the order in the `agate.TypeTester` class.
        candidates = [
            agate.Boolean(**shared_kwargs),
            agate.TimeDelta(**shared_kwargs),
            agate.Date(date_format=self.args.date_format, **shared_kwargs),
            agate.DateTime(datetime_format=self.args.datetime_format,
                           **shared_kwargs),
            text,
        ]

        # With an explicit date or datetime format, Number goes just before
        # Text (after the date types) in order to parse dates like
        # "20010101"; otherwise it goes right after Boolean.
        has_explicit_format = self.args.date_format or self.args.datetime_format
        number_position = -1 if has_explicit_format else 1
        candidates.insert(number_position, number)

        return agate.TypeTester(types=candidates)
Exemple #2
0
    def get_column_types(self):
        """Build the ``agate.TypeTester`` used to infer column types.

        The Text type is created with ``cast_nulls=False`` when
        ``self.args.blanks`` is set. With ``self.args.no_inference`` set,
        Text is the only candidate type.
        """
        keep_blanks = getattr(self.args, 'blanks', None)
        text_type = agate.Text(cast_nulls=False) if keep_blanks else agate.Text()

        candidates = [text_type]

        if not self.args.no_inference:
            # Candidate types listed ahead of the Text type.
            candidates = [
                agate.Boolean(),
                agate.Number(locale=self.args.locale),
                agate.TimeDelta(),
                agate.Date(date_format=self.args.date_format),
                agate.DateTime(datetime_format=self.args.datetime_format),
            ] + candidates

        return agate.TypeTester(types=candidates)
Exemple #3
0
def load_data(data):
    """Load the DPI CSV file into ``data['dpi']`` as an agate table.

    Column types are inferred from the module-level ``boolean``, ``number``
    and ``text`` types only.
    """
    column_types = agate.TypeTester(types=[boolean, number, text])

    data['dpi'] = agate.Table.from_csv(
        'DPI2015_basefile.v5.csv',
        column_types=column_types,
    )
    def test_load(self):
        """Load the exonerations CSV with three forced column types and print it."""
        # Columns whose types are forced rather than inferred.
        forced_types = {
            'last_name': agate.Text(),
            'first_name': agate.Text(),
            'age': agate.Number()
        }
        tester = agate.TypeTester(force=forced_types)

        csv_path = '../../../data/exonerations-20150828.csv'
        exonerations = agate.Table.from_csv(csv_path, column_types=tester)
        print(exonerations)  # description of the table
Exemple #5
0
def make_type_tester(meta):
    """
    Uses parsed lookup table metadata to create a :class:`.agate.TypeTester`
    that will always use correct types for the table columns. (And avoid
    the overhead of type inference.)

    Each entry of ``meta['columns']`` maps a column name to a dict whose
    ``'type'`` value names an attribute of the ``agate`` module; that
    attribute is called with no arguments to create the forced type.
    """
    forced_types = {
        column_name: getattr(agate, column_meta['type'])()
        for column_name, column_meta in meta['columns'].items()
    }

    return agate.TypeTester(force=forced_types)
Exemple #6
0
    def get_column_types(self):
        """Build the ``agate.TypeTester`` used to infer column types.

        When ``self.args.blanks`` is set, every data type is created with an
        empty ``null_values`` tuple. With ``self.args.no_inference`` set,
        Text is the only candidate type.
        """
        keep_blanks = getattr(self.args, 'blanks', None)
        null_kwargs = {'null_values': ()} if keep_blanks else {}

        text_type = agate.Text(**null_kwargs)

        if self.args.no_inference:
            return agate.TypeTester(types=[text_type])

        # Text comes last in the list of candidate types.
        return agate.TypeTester(types=[
            agate.Boolean(**null_kwargs),
            agate.Number(locale=self.args.locale, **null_kwargs),
            agate.TimeDelta(**null_kwargs),
            agate.Date(date_format=self.args.date_format, **null_kwargs),
            agate.DateTime(datetime_format=self.args.datetime_format, **null_kwargs),
            text_type,
        ])
Exemple #7
0
def build_type_tester(text_columns: Iterable[str]) -> agate.TypeTester:
    """Create a type tester that forces the given columns to Text.

    Every data type is constructed with ``('null', '')`` as its null values.
    Columns named in ``text_columns`` are forced to Text; the ``types`` list
    is passed for the tester's inference.
    """
    nulls = ('null', '')
    data_types = agate.data_types

    candidates = [
        data_types.Number(null_values=nulls),
        data_types.Date(null_values=nulls, date_format='%Y-%m-%d'),
        data_types.DateTime(null_values=nulls,
                            datetime_format='%Y-%m-%d %H:%M:%S'),
        ISODateTime(null_values=nulls),
        data_types.Boolean(true_values=('true',),
                           false_values=('false',),
                           null_values=nulls),
        data_types.Text(null_values=nulls),
    ]

    # One separate Text instance per forced column.
    forced = {}
    for column in text_columns:
        forced[column] = data_types.Text(null_values=nulls)

    return agate.TypeTester(force=forced, types=candidates)
Exemple #8
0
    def get_column_types(self):
        """Build the ``agate.TypeTester`` used to infer column types.

        When ``self.args.blanks`` is set, every data type is created with an
        empty ``null_values`` tuple. With ``self.args.no_inference`` set,
        Text is the only candidate type.
        """
        keep_blanks = getattr(self.args, 'blanks', None)
        null_kwargs = {'null_values': ()} if keep_blanks else {}

        text_type = agate.Text(**null_kwargs)

        if self.args.no_inference:
            candidates = []
        else:
            candidates = [
                agate.Boolean(**null_kwargs),
                agate.TimeDelta(**null_kwargs),
                agate.Date(date_format=self.args.date_format, **null_kwargs),
                agate.DateTime(datetime_format=self.args.datetime_format,
                               **null_kwargs),
                # This is a different order than agate's default, in order to
                # parse dates like "20010101".
                agate.Number(locale=self.args.locale, **null_kwargs),
            ]

        # Text always goes last.
        candidates.append(text_type)

        return agate.TypeTester(types=candidates)
Exemple #9
0
def extract_xlsx_to_sql(filename, sqlurl):
    """Copy every sheet of an XLSX workbook into its own SQL table.

    Each sheet is loaded with agate with every column forced to Text, its
    column names are lowercased, and it is written to a table named after
    the lowercased sheet name. Progress and elapsed time are printed per
    sheet.

    :param filename: Path of the XLSX file to read.
    :param sqlurl: Connection URL passed through to ``Table.to_sql``.
    """
    # The workbook is only needed for its sheet names. A read-only workbook
    # keeps the underlying file open until it is explicitly closed, so
    # release it as soon as the names have been read.
    workbook = openpyxl.load_workbook(filename, read_only=True, data_only=True)
    try:
        sheetnames = workbook.sheetnames
    finally:
        workbook.close()

    # Agate makes an educated guess at data types, but sometimes it
    # guesses wrong.  This is especially possible when dealing with
    # multiple data files like we are in this project.  We're able
    # to override the type inference system here
    #
    # Many columns are sparsely populated: mostly blank, with maybe
    # a 1 or a 0 once in a while.  Agate will assume those are boolean
    # columns, but once in a while there'll be a number.
    #
    # Since the data is spread across so many input files, we are ignoring
    # the type testing capabilities altogether and forcing everything
    # to Text()

    type_tester = agate.TypeTester(types=[agate.Text()])
    for sheetname in sheetnames:
        print("%s :: %s" % (filename, sheetname))
        start = datetime.now()
        t = agate.Table.from_xlsx(filename,
                                  sheet=sheetname,
                                  column_types=type_tester)
        # we create a duplicate table with lowercase column names because
        # uppercase names require quoting in resulting SQL statements
        # and we lowercase the SQL table name for the same reason in the following
        # line
        t = t.rename(column_names=[name.lower() for name in t.column_names])
        t.to_sql(
            sqlurl,
            sheetname.lower(),
            constraints=False,
            create=True,
            create_if_not_exists=True,
        )
        # total_seconds() covers the whole duration; the `.seconds` attribute
        # alone excludes the days component of the timedelta.
        elapsed = datetime.now() - start
        print("%s :: %s :: %d elapsed"
              % (filename, sheetname, elapsed.total_seconds()))
Exemple #10
0
 def get_column_types(self):
     """Return a ``TypeTester`` with ``limit=0`` when inference is disabled, else ``None``."""
     return agate.TypeTester(limit=0) if self.args.no_inference else None
Exemple #11
0
import agate

# Null markers shared by every data type in the default tester.
_NULL_VALUES = ('null', '')

# Type tester used when building tables from Python objects. Boolean accepts
# only 'true' / 'false' as its true and false values.
DEFAULT_TYPE_TESTER = agate.TypeTester(types=[
    agate.data_types.Number(null_values=_NULL_VALUES),
    agate.data_types.TimeDelta(null_values=_NULL_VALUES),
    agate.data_types.Date(null_values=_NULL_VALUES),
    agate.data_types.DateTime(null_values=_NULL_VALUES),
    agate.data_types.Boolean(
        true_values=('true', ),
        false_values=('false', ),
        null_values=_NULL_VALUES,
    ),
    agate.data_types.Text(null_values=_NULL_VALUES),
])


def table_from_data(data, column_names):
    """Convert a list of dictionaries into an Agate table.

    The table is generated from a list of dicts, so the column order from
    ``data`` is not preserved; ``select`` is used to put the columns in the
    order given by ``column_names``. If ``data`` is empty, an empty table
    with the specified columns is returned instead.
    """
    if len(data) > 0:
        unordered = agate.Table.from_object(
            data, column_types=DEFAULT_TYPE_TESTER)
        return unordered.select(column_names)

    # No data: create an empty table with the requested columns.
    return agate.Table([], column_names=column_names)


def empty_table():
    "Returns an empty Agate table. To be used in place of None"
Exemple #12
0
#!/usr/bin/env python

import agate

# Example script: load the Kansas 1033 CSV and answer two questions about
# the 'total_cost' column, grouped by 'county'.

# Force the 'fips' column to Text; the other columns are left to the tester.
tester = agate.TypeTester(force={'fips': agate.Text()})

table = agate.Table.from_csv('examples/realdata/ks_1033_data.csv',
                             column_types=tester)

# Question 1: What was the total cost to Kansas City area counties?

# Filter to counties containing Kansas City
kansas_city = table.where(lambda r: r['county'] in
                          ('JACKSON', 'CLAY', 'CASS', 'PLATTE'))

# Sum total_cost of four counties
print('Total for Kansas City area: %i' %
      kansas_city.columns['total_cost'].aggregate(agate.Sum()))

# Question 2: Which counties spent the most?

# Group by counties
counties = table.group_by('county')

# Aggregate totals for all counties
totals = counties.aggregate([('total_cost', agate.Sum(), 'total_cost_sum')])

# Sort by the summed cost, largest first, and chart the first 20 rows.
totals = totals.order_by('total_cost_sum', reverse=True)
totals.limit(20).print_bars('county', 'total_cost_sum', width=80)

print('Five most spendy counties:')
Exemple #13
0
import agate
import agatecharts

# Example script: prepare the output directory, load the EPA emissions CSV
# and derive the columns used by the aggregates that follow.
agatecharts.patch()

OUTPUT_DIR = 'docs/samples'

# NOTE(review): `os` is used below but is not imported in the visible
# lines -- confirm it is imported elsewhere in this file.
if not os.path.exists(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)

# Remove every existing entry in the output directory before running.
for filename in os.listdir(OUTPUT_DIR):
    os.remove(os.path.join(OUTPUT_DIR, filename))

# The column name ' Date' starts with a space, as do the other column names
# read below.
tester = agate.TypeTester(force={
    ' Date': agate.Date('%Y-%m-%d')
})

# NOTE(review): `tester` is passed positionally here; confirm that the
# second positional parameter of `Table.from_csv` in the agate version in
# use is the one that accepts a type tester.
emissions = agate.Table.from_csv('examples/epa-emissions-20150910.csv', tester)

# Add computed columns: the day taken from ' Date', and the three emission
# columns with falsy values replaced by 0.
emissions = emissions.compute([
    (agate.Formula(agate.Number(), lambda r: r[' Date'].day), 'day'),
    (agate.Formula(agate.Number(), lambda r: r[' SO2 (tons)'] or 0), 'so2'),
    (agate.Formula(agate.Number(), lambda r: r[' NOx (tons)'] or 0), 'noX'),
    (agate.Formula(agate.Number(), lambda r: r[' CO2 (short tons)'] or 0), 'co2')
])

# Group the rows by the 'State' column.
states = emissions.group_by('State')
state_totals = states.aggregate([
    ('so2', agate.Sum(), 'so2'),
    ('co2', agate.Sum(), 'co2'),
Exemple #14
0
import agate, os, itertools, time, datetime, glob, csv
from datetime import date

# Script setup: shared agate types, a date stamp, and the Kentucky candidate
# contributions table.

# Neither of these two names is referenced again in the visible lines.
text_type = agate.Text()
datetime_type = agate.DateTime()

# Force the 'contb_receipt_dt' column to Text.
tester = agate.TypeTester(force={'contb_receipt_dt': agate.Text()})

today = date.today()
# NOTE(review): month and day are not zero-padded, so different dates can
# produce the same stamp (2016-01-11 and 2016-11-01 both give '2016111').
datestamp = str(today.year) + str(today.month) + str(today.day)

# Use the first path matching the pattern; the [0] index raises IndexError
# when no file matches.
ky_candidates_file = str(
    glob.glob('data/csv/process/*ky-candidate-contributions.csv')[0])

ky_candidate_contributions = agate.Table.from_csv(ky_candidates_file,
                                                  column_types=tester)

# Committee IDs; per the name/ID list below these are Trump and Clinton.
current_candidate_cmte_ids = ['C00580100', 'C00575795']

#Trump, Donald J. = C00580100
#Sanders, Bernard = C00577130
#Kasich, John R. = C00581876
#Clinton, Hillary Rodham = C00575795
#Cruz, Rafael Edward 'Ted' = C00574624


def candidate_brackets(contributions):
    #brackets
    #bracket1 = 200 and under
    #bracket2 = 200.01 - 499.99
    #bracket3 = 500 - 999.99
Exemple #15
0
import agate

# Type tester used when building tables from Python objects; each candidate
# type is constructed with its default arguments, in the order listed.
DEFAULT_TYPE_TESTER = agate.TypeTester(types=[
    data_type()
    for data_type in (
        agate.data_types.Number,
        agate.data_types.Date,
        agate.data_types.DateTime,
        agate.data_types.Boolean,
        agate.data_types.Text,
    )
])


def table_from_data(data):
    """Convert a list of dictionaries into an Agate table.

    Column types are determined by ``DEFAULT_TYPE_TESTER``.
    """
    table = agate.Table.from_object(data, column_types=DEFAULT_TYPE_TESTER)
    return table


def empty_table():
    """Return an empty Agate table, to be used in place of ``None``."""
    no_rows = []
    return agate.Table(rows=no_rows)


def as_matrix(table):
    """Return an agate table as a matrix of data, without the columns.

    The result is a list holding ``row.values()`` for each row in
    ``table.rows.values()``.
    """
    matrix = []
    for row in table.rows.values():
        matrix.append(row.values())
    return matrix


def from_csv(abspath):
Exemple #16
0
    def main(self):
        """Convert the input file to CSV and write it to ``self.output_file``.

        Steps: determine the input format, open the input (and the schema
        file when one is given), assemble reader keyword arguments, then
        dispatch to the converter for that format.

        Raises:
            ValueError: if the format is "fixed" and no schema was given, or
                if a DBF input has no ``name`` attribute.

        NOTE(review): the input and schema files are closed only at the end
        of this method; there is no ``try``/``finally``, so they stay open
        if any step in between raises.
        """
        # Determine the file type.
        # Precedence: explicit filetype argument, then a schema (implies
        # 'fixed'), then a key (implies 'json'), then a guess from the path.
        if self.args.filetype:
            filetype = self.args.filetype
            if filetype not in SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' %
                                     self.args.filetype)
        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            # There is no path to guess from when no input path is given or
            # it is '-'.
            if not self.args.input_path or self.args.input_path == '-':
                self.argparser.error(
                    'You must specify a format when providing data via STDIN (pipe).'
                )
            filetype = convert.guess_format(self.args.input_path)
            if not filetype:
                self.argparser.error(
                    'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
                )

        # True for CSV input, and for any format when inference is enabled.
        self.buffers_input = filetype == 'csv' or not self.args.no_inference

        # Set the input file.
        # Excel formats are opened directly in binary mode.
        if filetype in ('xls', 'xlsx'):
            self.input_file = open(self.args.input_path, 'rb')
        else:
            self.input_file = self._open_input_file(self.args.input_path)

        # Set the reader's arguments.
        kwargs = {}

        # `schema` is only bound when a schema argument was given; the
        # 'fixed' branch below relies on this check having passed.
        if self.args.schema:
            schema = self._open_input_file(self.args.schema)
        elif filetype == 'fixed':
            raise ValueError('schema must not be null when format is "fixed"')

        if self.args.sheet:
            kwargs['sheet'] = self.args.sheet

        if filetype == 'csv':
            kwargs.update(self.reader_kwargs)
            # Streaming CSV mustn't set sniff_limit, but non-streaming should.
            if not self.args.no_inference:
                kwargs['sniff_limit'] = self.args.sniff_limit
            if self.args.no_header_row:
                kwargs['header'] = False
        elif self.args.no_inference:
            # Streaming CSV mustn't set column_types, but other formats should.
            kwargs['column_types'] = agate.TypeTester(limit=0)

        # Convert the file.
        if filetype == 'csv' and self.args.no_inference:
            # Streaming path: rows are copied from reader to writer without
            # building an agate table.
            reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
            writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
            writer.writerows(reader)
        elif filetype == 'fixed':
            self.output_file.write(
                fixed2csv(self.input_file,
                          schema,
                          output=self.output_file,
                          **kwargs))
        elif filetype == 'geojson':
            self.output_file.write(geojson2csv(self.input_file, **kwargs))
        elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
            # Table-based path: load into an agate table, then write as CSV.
            if filetype == 'csv':
                table = agate.Table.from_csv(self.input_file, **kwargs)
            elif filetype == 'json':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              **kwargs)
            elif filetype == 'ndjson':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              newline=True,
                                              **kwargs)
            # NOTE(review): the two Excel branches forward only `sheet`, so
            # a `column_types` entry set above for no_inference is not
            # passed on -- confirm this is intended.
            elif filetype == 'xls':
                table = agate.Table.from_xls(self.input_file,
                                             sheet=kwargs.get('sheet'))
            elif filetype == 'xlsx':
                table = agate.Table.from_xlsx(self.input_file,
                                              sheet=kwargs.get('sheet'))
            elif filetype == 'dbf':
                # from_dbf is given the file's name rather than the file
                # object, so the input must have a `name` attribute.
                if not hasattr(self.input_file, 'name'):
                    raise ValueError(
                        'DBF files can not be converted from stdin. You must pass a filename.'
                    )
                table = agate.Table.from_dbf(self.input_file.name, **kwargs)
            table.to_csv(self.output_file)

        self.input_file.close()

        if self.args.schema:
            schema.close()
Exemple #17
0
 def DumbTypeTester():
     """Return a ``TypeTester`` whose types are all built with ``null_values=('',)``.

     The candidate types are taken from a default ``agate.TypeTester`` and
     each one is re-created from its class with that single argument.
     """
     # We get agate's normal possible types
     default_types = agate.TypeTester()._possible_types
     blank_only_types = [
         data_type.__class__(null_values=('',)) for data_type in default_types
     ]
     return agate.TypeTester(types=blank_only_types)
Exemple #18
0
import agate

# Columns whose types are forced instead of inferred, grouped by type.
_text_columns = (
    'Physician_First_Name',
    'Physician_Last_Name',
    'Recipient_Primary_Business_Street_Address_Line1',
    'Recipient_City',
    'Recipient_Zip_Code',
    'Physician_Specialty',
)
_number_columns = (
    'Physician_Profile_ID',
    'Total_Amount_of_Payment_USDollars',
    'General',
    'Research',
)

# Every forced column gets its own type instance.
_forced_types = {name: agate.Text() for name in _text_columns}
_forced_types.update({name: agate.Number() for name in _number_columns})

tester = agate.TypeTester(force=_forced_types)

# Short replacement names for the columns above.
column_renames = dict(
    Physician_First_Name='fn',
    Physician_Last_Name='ln',
    Recipient_Primary_Business_Street_Address_Line1='add',
    Recipient_City='city',
    Recipient_Zip_Code='zip',
    Physician_Specialty='spec',
    Physician_Profile_ID='id',
    Total_Amount_of_Payment_USDollars='d',
    General='g',
    Research='r',
)

table = agate.Table.from_csv(
    'edits/payments/most-paid-02-trim.csv',