Beispiel #1
0
    def get_column_types(self):
        """
        Build the :class:`agate.TypeTester` used for column type inference.

        When ``self.args.blanks`` is truthy every data type is created with
        ``null_values=()``. When ``self.args.no_inference`` is truthy the
        tester only contains the text type. Otherwise the number type is
        placed second, unless a date or datetime format was supplied, in
        which case it is placed just before the text type.

        :returns:
            An :class:`agate.TypeTester` configured from ``self.args``.
        """
        keep_blanks = getattr(self.args, 'blanks', None)
        common = {'null_values': ()} if keep_blanks else {}

        fallback = agate.Text(**common)

        if self.args.no_inference:
            return agate.TypeTester(types=[fallback])

        numeric = agate.Number(locale=self.args.locale, **common)

        # See the order in the `agate.TypeTester` class.
        candidates = [
            agate.Boolean(**common),
            agate.TimeDelta(**common),
            agate.Date(date_format=self.args.date_format, **common),
            agate.DateTime(datetime_format=self.args.datetime_format,
                           **common),
            fallback,
        ]

        # With an explicit format, the date types must be tried before
        # Number in order to parse dates like "20010101".
        explicit_format = self.args.date_format or self.args.datetime_format
        candidates.insert(-1 if explicit_format else 1, numeric)

        return agate.TypeTester(types=candidates)
    def setUp(self):
        """
        Create the fixture data: three rows, five named columns, their agate
        types, and the :class:`agate.Table` built from them.
        """
        names = ['number', 'text', 'boolean', 'date', 'datetime']

        self.rows = (
            (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
            (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
            (None, 'b', None, None, None),
        )

        self.column_names = names
        # Same content as ``column_names`` but a separate list object.
        self.user_provided_column_names = list(names)

        self.column_types = [
            agate.Number(),
            agate.Text(),
            agate.Boolean(),
            agate.Date(),
            agate.DateTime(),
        ]

        self.table = agate.Table(
            self.rows,
            self.column_names,
            self.column_types,
        )
Beispiel #3
0
    def setUp(self):
        """
        Create a four-row, five-column fixture table and store the SQLite
        in-memory connection string on the test case.
        """
        schema = (
            ('number', agate.Number()),
            ('text', agate.Text()),
            ('boolean', agate.Boolean()),
            ('date', agate.Date()),
            ('datetime', agate.DateTime()),
        )

        self.rows = (
            (1.123, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
            (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
            (2, u'c', False, '11/5/2015', '11/4/2015 12:45 PM'),
            (None, 'b', None, None, None),
        )
        self.column_names = [name for name, _ in schema]
        self.column_types = [data_type for _, data_type in schema]

        self.table = agate.Table(
            self.rows,
            self.column_names,
            self.column_types,
        )
        self.connection_string = 'sqlite:///:memory:'
Beispiel #4
0
def load_data(data):
    """
    Load the exonerations CSV into ``data['exonerations']``.

    The file ``examples/realdata/exonerations-20150828.csv`` is read with
    :func:`csv.reader`, its first record (the header) is discarded, and the
    remaining records are passed to :class:`agate.Table` together with the
    explicit column definitions below.

    :param data:
        A mutable mapping; the resulting table is stored under the key
        ``'exonerations'``.
    """
    text_type = agate.Text()
    number_type = agate.Number()
    boolean_type = agate.Boolean()

    columns = (
        ('last_name', text_type),
        ('first_name', text_type),
        ('age', number_type),
        ('race', text_type),
        ('state', text_type),
        ('tags', text_type),
        ('crime', text_type),
        ('sentence', text_type),
        ('convicted', number_type),
        ('exonerated', number_type),
        ('dna', boolean_type),
        ('dna_essential', text_type),
        ('mistaken_witness', boolean_type),
        ('false_confession', boolean_type),
        ('perjury', boolean_type),
        ('false_evidence', boolean_type),
        ('official_misconduct', boolean_type),
        ('inadequate_defense', boolean_type),
    )

    with open('examples/realdata/exonerations-20150828.csv') as f:
        # Create a csv reader
        reader = csv.reader(f)

        # Skip the header through the reader rather than the raw file, so a
        # complete CSV record is consumed even if a quoted header field
        # spans more than one physical line.
        next(reader)

        # Create the table
        data['exonerations'] = agate.Table(reader, columns)
    def _overall_stats(self):
        """
        Compute package and resource statistics and store them on ``self``.

        Always sets ``overall_package_stats`` and ``resource_stats``. When
        the package/resource table has at least one row, ``resource_stats``
        is replaced by its aggregate, ``open_datasets``,
        ``open_format_count`` and ``format_count`` are set, and
        ``compute_dates()`` is called.
        """
        def open_license_total(licenses):
            # Each membership test is a bool, so the sum is the number of
            # values found in ``utils.OPEN_LICENSES``.
            return sum(license_id in utils.OPEN_LICENSES
                       for license_id in licenses.values())

        open_license_summary = agate.Summary(
            'license_id', agate.Number(), open_license_total)

        self.overall_package_stats = self._package_table().aggregate(
            [('open_data_count', open_license_summary)])

        self.resource_stats = self._package_resource_table().compute(
            [('open_format',
              agate.Formula(agate.Boolean(), open_formats_count))])

        # Nothing left to aggregate when there are no resources.
        if len(self._package_resource_table()) == 0:
            return

        self.resource_stats = self.resource_stats.aggregate([
            ('open_format_count', agate.Count('open_format', True)),
            ('min_date', agate.Min('created')),
            ('max_date', agate.Max('created')),
        ])

        per_format = self._package_resource_table().group_by("format")
        per_format_counts = per_format.aggregate([('count', agate.Count())])
        format_totals = per_format_counts.aggregate(
            [('different_formats', agate.Count())])

        self.open_datasets = self.overall_package_stats.get(
            "open_data_count", 0)
        self.open_format_count = self.resource_stats.get(
            "open_format_count", 0)
        self.format_count = format_totals.get("different_formats", 0)
        self.compute_dates()
Beispiel #6
0
def from_sql(cls, connection_or_string, table_name):
    """
    Create a new :class:`agate.Table` from a given SQL table. Types will be
    inferred from the database schema.

    Monkey patched as class method :meth:`Table.from_sql`.

    :param connection_or_string:
        An existing sqlalchemy connection or connection string.
    :param table_name:
        The name of a table in the referenced database.
    :returns:
        An :class:`agate.Table` holding every row of the SQL table.
    :raises ValueError:
        If a column's SQL type maps to a Python type that has no agate
        equivalent in the chain below.
    """
    engine, connection = get_engine_and_connection(connection_or_string)

    # Everything that uses the connection lives inside the ``try`` so that a
    # failure during reflection, type mapping or query execution releases
    # the connection and engine just as the success path does.
    try:
        metadata = MetaData(connection)
        sql_table = Table(table_name,
                          metadata,
                          autoload=True,
                          autoload_with=connection)

        column_names = []
        column_types = []

        for sql_column in sql_table.columns:
            column_names.append(sql_column.name)

            # Interval types are mapped to timedelta explicitly instead of
            # going through ``python_type``.
            if type(sql_column.type) in INTERVAL_MAP.values():
                py_type = datetime.timedelta
            else:
                py_type = sql_column.type.python_type

            if py_type in [int, float, decimal.Decimal]:
                if py_type is float:
                    sql_column.type.asdecimal = True
                column_types.append(agate.Number())
            elif py_type is bool:
                column_types.append(agate.Boolean())
            elif issubclass(py_type, six.string_types):
                column_types.append(agate.Text())
            elif py_type is datetime.date:
                column_types.append(agate.Date())
            elif py_type is datetime.datetime:
                column_types.append(agate.DateTime())
            elif py_type is datetime.timedelta:
                column_types.append(agate.TimeDelta())
            else:
                raise ValueError('Unsupported sqlalchemy column type: %s' %
                                 type(sql_column.type))

        s = select([sql_table])

        rows = connection.execute(s)

        return agate.Table(rows, column_names, column_types)
    finally:
        # NOTE(review): ``engine`` is presumably None when the caller passed
        # in an existing connection, which is then left open -- confirm
        # against ``get_engine_and_connection``.
        if engine is not None:
            connection.close()
            engine.dispose()
 def get_org_format_aggregates(self, package_table):
     """
     Aggregate format statistics for ``package_table``.

     The returned aggregate holds ``different_formats`` (number of groups
     after grouping by "format"), ``open_formats`` (groups flagged by
     ``open_formats_count``) and ``open_formats_datasets`` (rows of
     ``package_table`` flagged by ``open_formats_count``).
     """
     # NOTE(review): the original author questioned whether the "format"
     # column is always present ("format can not exist!?!").
     per_format = package_table.group_by("format").aggregate(
         [('count', agate.Count())])

     # Row-level view: how many rows of the input have an open format.
     row_level = package_table.compute(
         [('open_format',
           agate.Formula(agate.Boolean(), open_formats_count))]
     ).aggregate([('open_formats', agate.Count('open_format', True))])

     # Group-level view: how many distinct formats exist, and how many of
     # those are open.
     totals = per_format.compute(
         [('open_format',
           agate.Formula(agate.Boolean(), open_formats_count))]
     ).aggregate([
         ('different_formats', agate.Count()),
         ('open_formats', agate.Count('open_format', True)),
     ])

     totals["open_formats_datasets"] = row_level["open_formats"]
     return totals
Beispiel #8
0
    def test_distinct_values(self):
        """
        Check ``BaseHound.distinct_values('size')`` against the result agate
        itself produces for the same data.

        The expected table is built in memory from ``rows`` and reduced with
        ``select('size').distinct('size')``; the table under test comes from
        ``sample-data/test-distinct.csv``.
        """
        column_names: List = [
            'id',
            'name',
            'dob',
            'last seen',
            'size',
            'active',
        ]
        column_types: List = [
            agate.Number(),
            agate.Text(),
            agate.Date(),
            agate.DateTime(),
            agate.Text(),
            agate.Boolean(),
        ]

        rows = [(1, 'Alvin Cotton', '03-01-1980', '06-30-2019 12:12:00', 'L',
                 True),
                (2, 'Usmaan Rojas', '01-12-1978', '06-30-2019 12:12:00', 'S',
                 False),
                (3, 'Kingston Odling', '04-09-1990', '06-30-2019 12:12:00',
                 'M', True),
                (3, 'Pooja Gillespie', '10-07-1985', '06-30-2019 12:12:00',
                 'S', True),
                (4, 'Hal Blake', '08-17-1989', '06-30-2019 12:12:00', 'L',
                 True),
                (5, 'Shannen Blevins', '06-10-1981', '06-30-2019 12:12:00',
                 'M', False),
                (5, 'Courteney Weston', '04-23-1992', '06-30-2019 12:12:00',
                 'M', False),
                (6, 'Conner Calhoun', '05-16-1977', '06-30-2019 12:12:00',
                 'XL', True),
                (7, 'Susie Rasmussen', '02-08-1987', '06-30-2019 12:12:00',
                 'L', False),
                (8, 'Cassie Beltran', '12-15-1982', '06-30-2019 12:12:00', 'M',
                 True)]

        model = csvhound.core.BaseHound()
        # The return value is not needed here; presumably the call also
        # stores the table on ``model`` for ``distinct_values`` -- confirm
        # against csvhound.
        model.get_table_from_file('sample-data/test-distinct.csv')
        distinct = model.distinct_values('size')

        agate_table = agate.Table(rows, column_names, column_types)
        distinct_agate = agate_table.select('size').distinct('size')

        # now do the testing
        self.assertColumnNames(distinct, ('size', ))
        # Compare against the expected table's types. Comparing ``distinct``
        # with its own types, as before, could never fail.
        self.assertColumnTypes(
            distinct, [type(c) for c in distinct_agate.column_types])
        # Pass the expected rows rather than the Table object, so the
        # comparison indexes plain row data.
        self.assertRows(distinct, distinct_agate.rows)
 def test_grouping(self):
     """
     Print up to ten states ordered by descending exoneration count.

     Before grouping, a boolean ``federal`` column is computed and the
     ``state`` column is replaced by a version without the ``F-`` prefix.
     """
     def is_federal(row):
         return row['state'].startswith('F-')

     def bare_state(row):
         # Drop the two-character "F-" prefix when present.
         state = row['state']
         return state[2:] if state.startswith('F-') else state

     exonerations = agate.Table.from_csv(
         '../../../data/exonerations-20150828.csv')

     computations = [
         ('federal', agate.Formula(agate.Boolean(), is_federal)),
         ('state', agate.Formula(agate.Text(), bare_state)),
     ]
     cleaned = exonerations.compute(computations, replace=True)

     totals = cleaned.group_by('state').aggregate(
         [('count', agate.Count())])
     ranked = totals.order_by('count', reverse=True)
     ranked.print_table(max_rows=10)
Beispiel #10
0
    def get_column_types(self):
        """
        Build the :class:`agate.TypeTester` used for column type inference.

        When ``self.args.blanks`` is truthy the text type is created with
        ``cast_nulls=False``. When ``self.args.no_inference`` is truthy the
        tester only contains the text type; otherwise Boolean, Number,
        TimeDelta, Date and DateTime are tried before it, in that order.
        """
        text_options = (
            {'cast_nulls': False} if getattr(self.args, 'blanks', None)
            else {}
        )
        candidates = [agate.Text(**text_options)]

        if not self.args.no_inference:
            candidates = [
                agate.Boolean(),
                agate.Number(locale=self.args.locale),
                agate.TimeDelta(),
                agate.Date(date_format=self.args.date_format),
                agate.DateTime(datetime_format=self.args.datetime_format),
            ] + candidates

        return agate.TypeTester(types=candidates)
Beispiel #11
0
    def get_column_types(self):
        """
        Build the :class:`agate.TypeTester` used for column type inference.

        When ``self.args.blanks`` is truthy every data type is created with
        ``null_values=()``. When ``self.args.no_inference`` is truthy only
        the text type is offered; otherwise Boolean, Number, TimeDelta, Date
        and DateTime are tried before it, in that order.
        """
        blanks_requested = getattr(self.args, 'blanks', None)
        null_kwargs = {'null_values': ()} if blanks_requested else {}

        text_type = agate.Text(**null_kwargs)

        if self.args.no_inference:
            return agate.TypeTester(types=[text_type])

        return agate.TypeTester(types=[
            agate.Boolean(**null_kwargs),
            agate.Number(locale=self.args.locale, **null_kwargs),
            agate.TimeDelta(**null_kwargs),
            agate.Date(date_format=self.args.date_format, **null_kwargs),
            agate.DateTime(datetime_format=self.args.datetime_format,
                           **null_kwargs),
            text_type,
        ])
Beispiel #12
0
    def get_column_types(self):
        """
        Build the :class:`agate.TypeTester` used for column type inference.

        When ``self.args.blanks`` is truthy every data type is created with
        ``null_values=()``. When ``self.args.no_inference`` is truthy only
        the text type is offered; otherwise Boolean, TimeDelta, Date,
        DateTime and Number are tried before it, in that order.
        """
        shared = (
            {'null_values': ()} if getattr(self.args, 'blanks', None) else {}
        )

        text_type = agate.Text(**shared)
        if self.args.no_inference:
            return agate.TypeTester(types=[text_type])

        inferred = [
            agate.Boolean(**shared),
            agate.TimeDelta(**shared),
            agate.Date(date_format=self.args.date_format, **shared),
            agate.DateTime(datetime_format=self.args.datetime_format,
                           **shared),
        ]
        # This is a different order than agate's default: Number comes after
        # the date types in order to parse dates like "20010101".
        inferred.append(agate.Number(locale=self.args.locale, **shared))
        inferred.append(text_type)

        return agate.TypeTester(types=inferred)
Beispiel #13
0
    def setUp(self):
        """
        Create a three-row, six-column fixture table covering the number,
        text, boolean, date, datetime and timedelta types.
        """
        self.rows = (
            (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM', '4:15'),
            (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM', '6:18'),
            (None, 'b', None, None, None, None),
        )

        schema = [
            ('number', agate.Number()),
            ('text', agate.Text()),
            ('boolean', agate.Boolean()),
            ('date', agate.Date()),
            ('datetime', agate.DateTime()),
            ('timedelta', agate.TimeDelta()),
        ]
        self.column_names = [name for name, _ in schema]
        self.column_types = [data_type for _, data_type in schema]

        self.table = agate.Table(
            self.rows,
            self.column_names,
            self.column_types,
        )
Beispiel #14
0
    def setUp(self):
        """
        Create a fixture table from all-string rows (the last row holds only
        empty strings) and keep each agate type on the test case.
        """
        self.rows = (
            ('1', 'a', 'True', '10/01/2015', '10/01/2015 12:30 PM', '4h45m'),
            ('2', 'b', 'False', '11/01/2015', '11/01/2015 12:45 PM',
             '3h25m'),
            ('', '', '', '', '', ''),
        )

        self.number_type = agate.Number()
        self.text_type = agate.Text()
        self.boolean_type = agate.Boolean()
        self.date_type = agate.Date()
        self.datetime_type = agate.DateTime()
        self.timedelta_type = agate.TimeDelta()

        self.column_names = (
            'number',
            'text',
            'boolean',
            'date',
            'datetime',
            'timedelta',
        )
        self.column_types = (
            self.number_type,
            self.text_type,
            self.boolean_type,
            self.date_type,
            self.datetime_type,
            self.timedelta_type,
        )

        # Columns are handed to the table as (name, type) pairs.
        column_info = zip(self.column_names, self.column_types)
        self.table = agate.Table(self.rows, column_info)
Beispiel #15
0
# Pair the values of sheet rows 4 and 5 position by position.
# NOTE(review): presumably these two rows form a two-line column header, and
# `sheet` is an xlrd sheet defined earlier -- confirm against the workbook.
title_rows = zip(sheet.row_values(4), sheet.row_values(5))
print(title_rows)

# Join each pair with a single space, then strip surrounding whitespace.
titles = [t[0] + ' ' + t[1] for t in title_rows]
titles = [t.strip() for t in titles]
# Bare expression; presumably left over from an interactive session.
titles

# Collect the values of sheet rows 6 through 113.
country_rows = [sheet.row_values(r) for r in range(6, 114)]
country_rows

from xlrd.sheet import ctype_text
import agate

# One instance of each agate data type used below.
text_type = agate.Text()
number_type = agate.Number()
boolean_type = agate.Boolean()
date_type = agate.Date()

# Inspect the first cell of row 6 and the ctype -> name mapping.
# NOTE(review): the statements below use Python 2 print syntax while
# `print(title_rows)` above uses call syntax; under Python 3 these lines are
# syntax errors.
example_row = sheet.row(6)
print example_row
print example_row[0].ctype
print example_row[0].value
print ctype_text

# Filled in by the loop that follows.
types = []

for v in example_row:
    value_type = ctype_text[v.ctype]
    if value_type == 'text':
        types.append(text_type)
    elif value_type == 'number':
Beispiel #16
0
        print(impressions)
        return None, None


# Upsert statement for creative_stats: inserts one named-parameter value per
# entry in KEYS and, on an ad_id conflict, overwrites every listed column.
INSERT_QUERY = (
    "INSERT INTO creative_stats ({}) VALUES ({}) ON CONFLICT (ad_id) DO UPDATE SET {}"
).format(
    ', '.join(KEYS),
    ', '.join(f":{key}" for key in KEYS),
    ', '.join(f"{key} = :{key}" for key in KEYS),
)

def chunks(l, n):
    """
    Yield successive n-sized chunks from l.

    Each chunk is the slice ``l[i:i + n]``, so the final chunk may be
    shorter than ``n``. ``l`` must support ``len()`` and slicing.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))


# Explicit column types for the creative-stats CSV. Specifying them up front
# roughly halves the CSV load time (about 30 min without, 15 min with).
# NOTE(review): 'Ad_Campaigns_List' is Boolean here but Text in
# OLD_CREATIVE_STATS_COLUMN_TYPES -- confirm this is intentional.
CREATIVE_STATS_COLUMN_TYPES = {
    'Ad_ID': agate.Text(),
    'Ad_URL': agate.Text(),
    'Ad_Type': agate.Text(),
    'Regions': agate.Text(),
    'Advertiser_ID': agate.Text(),
    'Advertiser_Name': agate.Text(),
    'Ad_Campaigns_List': agate.Boolean(),
    'Date_Range_Start': agate.Date(),
    'Date_Range_End': agate.Date(),
    'Num_of_Days': agate.Number(),
    'Impressions': agate.Text(),
    'Spend_USD': agate.Text(),
    'First_Served_Timestamp': agate.DateTime(),
    'Last_Served_Timestamp': agate.DateTime(),
    'Age_Targeting': agate.Text(),
    'Gender_Targeting': agate.Text(),
    'Geo_Targeting_Included': agate.Text(),
    'Geo_Targeting_Excluded': agate.Text(),
    'Spend_Range_Min_USD': agate.Number(),
    'Spend_Range_Max_USD': agate.Number(),
    'Spend_Range_Min_EUR': agate.Number(),
    'Spend_Range_Max_EUR': agate.Number(),
    'Spend_Range_Min_INR': agate.Number(),
    'Spend_Range_Max_INR': agate.Number(),
    'Spend_Range_Min_BGN': agate.Number(),
    'Spend_Range_Max_BGN': agate.Number(),
    'Spend_Range_Min_HRK': agate.Number(),
    'Spend_Range_Max_HRK': agate.Number(),
    'Spend_Range_Min_CZK': agate.Number(),
    'Spend_Range_Max_CZK': agate.Number(),
    'Spend_Range_Min_DKK': agate.Number(),
    'Spend_Range_Max_DKK': agate.Number(),
    'Spend_Range_Min_HUF': agate.Number(),
    'Spend_Range_Max_HUF': agate.Number(),
    'Spend_Range_Min_PLN': agate.Number(),
    'Spend_Range_Max_PLN': agate.Number(),
    'Spend_Range_Min_RON': agate.Number(),
    'Spend_Range_Max_RON': agate.Number(),
    'Spend_Range_Min_SEK': agate.Number(),
    'Spend_Range_Max_SEK': agate.Number(),
    'Spend_Range_Min_GBP': agate.Number(),
    'Spend_Range_Max_GBP': agate.Number(),
    'Spend_Range_Min_NZD': agate.Number(),
    'Spend_Range_Max_NZD': agate.Number(),
}


# Column types for the older, shorter form of the same CSV.
OLD_CREATIVE_STATS_COLUMN_TYPES = {
    'Ad_ID': agate.Text(),
    'Ad_URL': agate.Text(),
    'Ad_Type': agate.Text(),
    'Regions': agate.Text(),
    'Advertiser_ID': agate.Text(),
    'Advertiser_Name': agate.Text(),
    'Ad_Campaigns_List': agate.Text(),
    'Date_Range_Start': agate.Date(),
    'Date_Range_End': agate.Date(),
    'Num_of_Days': agate.Number(),
    'Impressions': agate.Text(),
    'Spend_USD': agate.Text(),
}


# Approximate date on which the schema changes; the exact day is not known.
CREATIVE_STATS_SCHEMA_CHANGE_DATE = date(2020, 7, 1)
# Name:   Shani Kumar
# Date:   02/23/2020
# Course: DSC-540 - Data Preparation
# Desc:   Practice joining multiple datasets – an activity you will likely run into frequently. Following the
#         example on pages 229–233 of Data Wrangling with Python, work through the steps to bring
#         two datasets together.
# Usage:  This program completes the requirements of assignment 11.2.
#
# Import required packages
import xlrd
import agate
from xlrd.sheet import ctype_text

# Module-level agate data type instances: text, number, boolean and date.
text_type, number_type, boolean_type, date_type = (
    agate.Text(),
    agate.Number(),
    agate.Boolean(),
    agate.Date(),
)


def remove_bad_chars(val):
    """ Replace the '-' placeholder with None; pass every other value through.
    :param val: input value
    :return: None when val equals '-', otherwise val unchanged
    """
    return None if val == '-' else val


def get_types(example_row):
    """
Beispiel #18
0
#!/usr/bin/env python
"""
This module contains the XLS extension to :class:`Table <agate.table.Table>`.
"""

import datetime
from collections import OrderedDict

import agate
import six
import xlrd

# Maps each xlrd cell-type constant to the agate data type instance used for
# cells of that type.
# NOTE(review): empty and blank cells map to Boolean and error cells to Text;
# the reason is not visible here -- confirm before changing.
EXCEL_TO_AGATE_TYPE = {
    xlrd.biffh.XL_CELL_EMPTY: agate.Boolean(),
    xlrd.biffh.XL_CELL_TEXT: agate.Text(),
    xlrd.biffh.XL_CELL_NUMBER: agate.Number(),
    xlrd.biffh.XL_CELL_DATE: agate.DateTime(),
    xlrd.biffh.XL_CELL_BOOLEAN: agate.Boolean(),
    xlrd.biffh.XL_CELL_ERROR: agate.Text(),
    xlrd.biffh.XL_CELL_BLANK: agate.Boolean(),
}


def from_xls(cls,
             path,
             sheet=None,
             skip_lines=0,
             header=True,
             encoding_override=None,
             **kwargs):
    """
Beispiel #19
0
    'New Zealand',
    'Norway',
    'Poland',
    'Portugal',
    'Slovakia',
    'Slovenia',
    'Spain',
    'Sweden',
    'Switzerland',
    'Turkey',
    'UK',
    'USA'
]
# Strings treated as null by every data type below.
NULL_VALUES = ['', '-999']

# The three data types share the same NULL_VALUES list.
boolean, number, text = (
    agate.Boolean(null_values=NULL_VALUES),
    agate.Number(null_values=NULL_VALUES),
    agate.Text(null_values=NULL_VALUES),
)


def load_data(data):
    """
    Load ``DPI2015_basefile.v5.csv`` into ``data['dpi']``.

    Column types are inferred by an :class:`agate.TypeTester` that tries the
    module-level ``boolean``, ``number`` and ``text`` types, in that order.

    :param data:
        A mutable mapping; the resulting table is stored under ``'dpi'``.
    """
    inference_order = [boolean, number, text]
    type_tester = agate.TypeTester(types=inference_order)

    data['dpi'] = agate.Table.from_csv(
        'DPI2015_basefile.v5.csv',
        column_types=type_tester,
    )


def add_value(data):
 def get_computed_data_type(self, table):
     """Return a new ``agate.Boolean`` instance; ``table`` is not used."""
     return agate.Boolean()