コード例 #1
0
ファイル: cli.py プロジェクト: vishalbelsare/csvkit
    def get_column_types(self):
        """
        Build the :class:`agate.TypeTester` used to infer column types.

        When the ``blanks`` argument is set, every data type is created with
        an empty ``null_values`` tuple. With ``no_inference`` only the text
        type is tested. Otherwise the number type is tested right after the
        boolean type, unless a date or datetime format was supplied, in which
        case it is tested just before the text type.
        """
        options = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}
        fallback = agate.Text(**options)

        if self.args.no_inference:
            return agate.TypeTester(types=[fallback])

        numeric = agate.Number(locale=self.args.locale, **options)
        boolean = agate.Boolean(**options)
        temporal = [
            agate.TimeDelta(**options),
            agate.Date(date_format=self.args.date_format, **options),
            agate.DateTime(datetime_format=self.args.datetime_format,
                           **options),
        ]

        # See the order in the `agate.TypeTester` class. Numbers are moved
        # behind the date types when a format is given, in order to parse
        # dates like "20010101".
        if self.args.date_format or self.args.datetime_format:
            candidates = [boolean] + temporal + [numeric, fallback]
        else:
            candidates = [boolean, numeric] + temporal + [fallback]

        return agate.TypeTester(types=candidates)
コード例 #2
0
    def setUp(self):
        """
        Create the fixture: four rows, five typed columns, the resulting
        agate table and an in-memory SQLite connection string.
        """
        # Column name paired with the agate type used for that column.
        schema = (
            ('number', agate.Number()),
            ('text', agate.Text()),
            ('boolean', agate.Boolean()),
            ('date', agate.Date()),
            ('datetime', agate.DateTime()),
        )

        self.rows = (
            (1.123, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
            (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
            (2, u'c', False, '11/5/2015', '11/4/2015 12:45 PM'),
            (None, 'b', None, None, None),
        )
        self.column_names = [label for label, _ in schema]
        self.column_types = [data_type for _, data_type in schema]

        self.table = agate.Table(self.rows, self.column_names,
                                 self.column_types)
        self.connection_string = 'sqlite:///:memory:'
コード例 #3
0
    def setUp(self):
        """
        Create the fixture: three rows, five typed columns, a separate list
        of user-provided column names and the resulting agate table.
        """
        # Column name paired with the agate type used for that column.
        schema = (
            ('number', agate.Number()),
            ('text', agate.Text()),
            ('boolean', agate.Boolean()),
            ('date', agate.Date()),
            ('datetime', agate.DateTime()),
        )

        self.rows = (
            (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
            (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
            (None, 'b', None, None, None),
        )

        self.column_names = [label for label, _ in schema]
        # Same names, but held in an independent list object.
        self.user_provided_column_names = list(self.column_names)
        self.column_types = [data_type for _, data_type in schema]

        self.table = agate.Table(self.rows, self.column_names,
                                 self.column_types)
コード例 #4
0
def from_sql(cls, connection_or_string, table_name):
    """
    Load a SQL table into a new :class:`agate.Table`, choosing each column's
    agate type from the reflected database schema.

    Monkey patched as class method :meth:`Table.from_sql`.

    :param connection_or_string:
        An existing sqlalchemy connection or connection string.
    :param table_name:
        The name of a table in the referenced database.
    :raises ValueError:
        If a column's Python type has no agate equivalent handled here.
    """
    engine, connection = get_engine_and_connection(connection_or_string)

    reflected = Table(table_name,
                      MetaData(connection),
                      autoload=True,
                      autoload_with=connection)

    names = []
    kinds = []

    for column in reflected.columns:
        names.append(column.name)
        db_type = column.type

        # Types listed in INTERVAL_MAP are treated as timedeltas without
        # consulting their `python_type`.
        is_interval = type(db_type) in INTERVAL_MAP.values()
        native = datetime.timedelta if is_interval else db_type.python_type

        if native in (int, float, decimal.Decimal):
            if native is float:
                # Ask sqlalchemy to hand floats back as decimals.
                db_type.asdecimal = True
            kind = agate.Number()
        elif native is bool:
            kind = agate.Boolean()
        elif issubclass(native, six.string_types):
            kind = agate.Text()
        elif native is datetime.date:
            kind = agate.Date()
        elif native is datetime.datetime:
            kind = agate.DateTime()
        elif native is datetime.timedelta:
            kind = agate.TimeDelta()
        else:
            raise ValueError('Unsupported sqlalchemy column type: %s' %
                             type(db_type))

        kinds.append(kind)

    result = connection.execute(select([reflected]))

    try:
        return agate.Table(result, names, kinds)
    finally:
        # Clean up only when get_engine_and_connection returned an engine.
        if engine is not None:
            connection.close()
            engine.dispose()
コード例 #5
0
ファイル: test_clihound.py プロジェクト: christi3k/csv-hound
    def test_distinct_values(self):
        """
        Check that ``BaseHound.distinct_values('size')`` matches the distinct
        ``size`` values of an equivalent table built directly with agate.
        """
        column_names: List = [
            'id',
            'name',
            'dob',
            'last seen',
            'size',
            'active',
        ]
        column_types: List = [
            agate.Number(),
            agate.Text(),
            agate.Date(),
            agate.DateTime(),
            agate.Text(),
            agate.Boolean(),
        ]

        # Reference data; presumably mirrors sample-data/test-distinct.csv
        # (TODO confirm against the fixture file).
        rows = [(1, 'Alvin Cotton', '03-01-1980', '06-30-2019 12:12:00', 'L',
                 True),
                (2, 'Usmaan Rojas', '01-12-1978', '06-30-2019 12:12:00', 'S',
                 False),
                (3, 'Kingston Odling', '04-09-1990', '06-30-2019 12:12:00',
                 'M', True),
                (3, 'Pooja Gillespie', '10-07-1985', '06-30-2019 12:12:00',
                 'S', True),
                (4, 'Hal Blake', '08-17-1989', '06-30-2019 12:12:00', 'L',
                 True),
                (5, 'Shannen Blevins', '06-10-1981', '06-30-2019 12:12:00',
                 'M', False),
                (5, 'Courteney Weston', '04-23-1992', '06-30-2019 12:12:00',
                 'M', False),
                (6, 'Conner Calhoun', '05-16-1977', '06-30-2019 12:12:00',
                 'XL', True),
                (7, 'Susie Rasmussen', '02-08-1987', '06-30-2019 12:12:00',
                 'L', False),
                (8, 'Cassie Beltran', '12-15-1982', '06-30-2019 12:12:00', 'M',
                 True)]

        model = csvhound.core.BaseHound()
        # The returned table is not needed here; the call is kept because
        # distinct_values() presumably works on the table loaded by it.
        model.get_table_from_file('sample-data/test-distinct.csv')
        distinct = model.distinct_values('size')
        agate_table = agate.Table(rows, column_names, column_types)
        distinct_agate = agate_table.select('size').distinct('size')

        # now do the testing
        self.assertColumnNames(distinct, ('size', ))
        # Compare against the reference table's types. Checking `distinct`
        # against its own column types could never fail.
        self.assertColumnTypes(distinct,
                               [type(c) for c in distinct_agate.column_types])
        self.assertRows(distinct, distinct_agate)
コード例 #6
0
    def get_column_types(self):
        """
        Build the :class:`agate.TypeTester` used to infer column types.

        The text type is created with ``cast_nulls=False`` when the
        ``blanks`` argument is set. With ``no_inference`` it is the only type
        tested; otherwise it is tested last, after the boolean, number,
        timedelta, date and datetime types.
        """
        keep_blanks = getattr(self.args, 'blanks', None)
        fallback = agate.Text(cast_nulls=False) if keep_blanks else agate.Text()

        candidates = []
        if not self.args.no_inference:
            candidates = [
                agate.Boolean(),
                agate.Number(locale=self.args.locale),
                agate.TimeDelta(),
                agate.Date(date_format=self.args.date_format),
                agate.DateTime(datetime_format=self.args.datetime_format),
            ]
        candidates.append(fallback)

        return agate.TypeTester(types=candidates)
コード例 #7
0
ファイル: cli.py プロジェクト: leonqli/csvkit
    def get_column_types(self):
        """
        Build the :class:`agate.TypeTester` used to infer column types.

        Every type receives an empty ``null_values`` tuple when the
        ``blanks`` argument is set. With ``no_inference`` only the text type
        is tested; otherwise the boolean, number, timedelta, date and
        datetime types are tested before it.
        """
        options = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}
        fallback = agate.Text(**options)

        if self.args.no_inference:
            return agate.TypeTester(types=[fallback])

        inferred = [
            agate.Boolean(**options),
            agate.Number(locale=self.args.locale, **options),
            agate.TimeDelta(**options),
            agate.Date(date_format=self.args.date_format, **options),
            agate.DateTime(datetime_format=self.args.datetime_format, **options),
        ]
        inferred.append(fallback)

        return agate.TypeTester(types=inferred)
コード例 #8
0
ファイル: cli.py プロジェクト: v838/csvkit
    def get_column_types(self):
        """
        Build the :class:`agate.TypeTester` used to infer column types.

        Every type receives an empty ``null_values`` tuple when the
        ``blanks`` argument is set. With ``no_inference`` only the text type
        is tested; otherwise the boolean, timedelta, date, datetime and
        number types are tested before it, in that order.
        """
        options = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}
        fallback = agate.Text(**options)

        if self.args.no_inference:
            return agate.TypeTester(types=[fallback])

        inferred = [
            agate.Boolean(**options),
            agate.TimeDelta(**options),
            agate.Date(date_format=self.args.date_format, **options),
            agate.DateTime(datetime_format=self.args.datetime_format,
                           **options),
        ]
        # This is a different order than agate's default, in order to parse
        # dates like "20010101".
        inferred.append(agate.Number(locale=self.args.locale, **options))
        inferred.append(fallback)

        return agate.TypeTester(types=inferred)
コード例 #9
0
    def setUp(self):
        """
        Create the fixture: three rows, six typed columns (including a
        timedelta column) and the resulting agate table.
        """
        # Column name paired with the agate type used for that column.
        schema = (
            ('number', agate.Number()),
            ('text', agate.Text()),
            ('boolean', agate.Boolean()),
            ('date', agate.Date()),
            ('datetime', agate.DateTime()),
            ('timedelta', agate.TimeDelta()),
        )

        self.rows = (
            (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM', '4:15'),
            (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM', '6:18'),
            (None, 'b', None, None, None, None),
        )

        self.column_names = [label for label, _ in schema]
        self.column_types = [data_type for _, data_type in schema]

        self.table = agate.Table(self.rows, self.column_names,
                                 self.column_types)
コード例 #10
0
ファイル: metrics.py プロジェクト: icqw1983/gmail_analyzer
    def _analyze_count(self, event):
        """
        Print summary statistics about the e-mails in ``self.table``.

        Reports the total number of rows, the number of distinct
        ``fields/from`` values, the earliest ``fields/date`` value and, when
        a latest date is available, the average number of e-mails per day.

        :param event: object whose ``set()`` method is called once the
            aggregation is finished, before anything is printed.
        """
        total = self.table.aggregate([("total", agate.Count())])["total"]
        total_senders = (
            self.table.distinct("fields/from").select("fields/from").aggregate(
                [("total", agate.Count())])["total"])

        # Defaults used when there is nothing to date: no rows at all, or
        # only rows whose "fields/date" is None.
        first_email_date = ""
        last_email_date = None

        if total != 0:
            date_data = self.table.where(
                lambda row: row["fields/date"] is not None).compute([(
                    "reduce_to_datetime",
                    agate.Formula(
                        agate.DateTime(datetime_format="%Y-%m-%d %H:%M:%S"),
                        lambda row: helpers.reduce_to_datetime(row[
                            "fields/date"]),
                    ),
                )])
            # The filter above can remove every row; indexing [0] on an
            # empty column would raise IndexError.
            if len(date_data.rows) > 0:
                first_email_date = (
                    date_data.order_by("reduce_to_datetime").limit(
                        1).columns["fields/date"].values()[0])
                last_email_date = (date_data.order_by(
                    "reduce_to_datetime",
                    reverse=True).limit(1).columns["fields/date"].values()[0])
        event.set()

        metrics = [
            ["Total emails", total],
            ["Senders", total_senders],
            ["First Email Date", first_email_date],
        ]

        if last_email_date:
            date_delta = helpers.convert_date(
                last_email_date) - helpers.convert_date(first_email_date)
            # A span shorter than one day has days == 0; count it as a
            # single day instead of dividing by zero.
            span_days = max(date_delta.days, 1)
            avg_email_per_day = total / span_days
            metrics.append(["Avg. Emails/Day", f"{avg_email_per_day:.2f}"])

        print(f"\n\n{helpers.h1_icn} Stats\n")
        print(termtables.to_string(metrics))
コード例 #11
0
    def setUp(self):
        """
        Create the fixture: three all-string rows (the last one made of
        empty strings), one agate type instance per column and the table
        built from (name, type) pairs.
        """
        self.column_names = ('number', 'text', 'boolean', 'date', 'datetime',
                             'timedelta')

        self.number_type = agate.Number()
        self.text_type = agate.Text()
        self.boolean_type = agate.Boolean()
        self.date_type = agate.Date()
        self.datetime_type = agate.DateTime()
        self.timedelta_type = agate.TimeDelta()

        # Each column "<name>" uses the instance stored as "<name>_type".
        self.column_types = tuple(
            getattr(self, label + '_type') for label in self.column_names)

        self.rows = (
            ('1', 'a', 'True', '10/01/2015', '10/01/2015 12:30 PM', '4h45m'),
            ('2', 'b', 'False', '11/01/2015', '11/01/2015 12:45 PM', '3h25m'),
            ('', '', '', '', '', ''),
        )

        column_info = zip(self.column_names, self.column_types)
        self.table = agate.Table(self.rows, column_info)
コード例 #12
0

# Upsert into creative_stats keyed on ad_id: the column list, the named
# placeholders (":key") and the "key = :key" update list all come from KEYS.
INSERT_QUERY = "INSERT INTO creative_stats ({}) VALUES ({}) ON CONFLICT (ad_id) DO UPDATE SET {}".format(
    ', '.join(KEYS),
    ', '.join(":" + k for k in KEYS),
    ', '.join(f"{k} = :{k}" for k in KEYS),
)

def chunks(l, n):
    """Lazily produce consecutive slices of ``l`` holding at most ``n`` items each."""
    yield from (l[start:start + n] for start in range(0, len(l), n))


# Explicit agate column types for the creative-stats CSV, keyed by column name.
# specifying column types saves 50% of time in loading the CSV! (30min w/o, 15min w/)
# NOTE(review): 'Ad_Campaigns_List' is Boolean here but Text in
# OLD_CREATIVE_STATS_COLUMN_TYPES - confirm this is intentional.
CREATIVE_STATS_COLUMN_TYPES = {'Ad_ID': agate.Text(), 'Ad_URL': agate.Text(), 'Ad_Type': agate.Text(), 
                    'Regions': agate.Text(), 'Advertiser_ID': agate.Text(), 'Advertiser_Name': agate.Text(), 
                    'Ad_Campaigns_List': agate.Boolean(), 'Date_Range_Start': agate.Date(), 'Date_Range_End': agate.Date(), 
                    'Num_of_Days': agate.Number(), 'Impressions': agate.Text(), 'Spend_USD': agate.Text(), 
                    'First_Served_Timestamp': agate.DateTime(), 'Last_Served_Timestamp': agate.DateTime(), 
                    'Age_Targeting': agate.Text(), 'Gender_Targeting': agate.Text(), 'Geo_Targeting_Included': agate.Text(), 'Geo_Targeting_Excluded': agate.Text(), 
                    'Spend_Range_Min_USD': agate.Number(), 'Spend_Range_Max_USD': agate.Number(), 'Spend_Range_Min_EUR': agate.Number(), 'Spend_Range_Max_EUR': agate.Number(), 'Spend_Range_Min_INR': agate.Number(), 'Spend_Range_Max_INR': agate.Number(), 'Spend_Range_Min_BGN': agate.Number(), 'Spend_Range_Max_BGN': agate.Number(), 'Spend_Range_Min_HRK': agate.Number(), 'Spend_Range_Max_HRK': agate.Number(), 'Spend_Range_Min_CZK': agate.Number(), 'Spend_Range_Max_CZK': agate.Number(), 'Spend_Range_Min_DKK': agate.Number(), 'Spend_Range_Max_DKK': agate.Number(), 'Spend_Range_Min_HUF': agate.Number(), 'Spend_Range_Max_HUF': agate.Number(), 'Spend_Range_Min_PLN': agate.Number(), 'Spend_Range_Max_PLN': agate.Number(), 'Spend_Range_Min_RON': agate.Number(), 'Spend_Range_Max_RON': agate.Number(), 'Spend_Range_Min_SEK': agate.Number(), 'Spend_Range_Max_SEK': agate.Number(), 'Spend_Range_Min_GBP': agate.Number(), 'Spend_Range_Max_GBP': agate.Number(), 'Spend_Range_Min_NZD': agate.Number(), 'Spend_Range_Max_NZD': agate.Number()}


# Explicit agate column types for the shorter, twelve-column layout.
# Presumably applies to reports dated before
# CREATIVE_STATS_SCHEMA_CHANGE_DATE - verify against the loader.
OLD_CREATIVE_STATS_COLUMN_TYPES = {'Ad_ID': agate.Text(), 'Ad_URL': agate.Text(), 'Ad_Type': agate.Text(), 
                    'Regions': agate.Text(), 'Advertiser_ID': agate.Text(), 'Advertiser_Name': agate.Text(), 
                    'Ad_Campaigns_List': agate.Text(), 'Date_Range_Start': agate.Date(), 'Date_Range_End': agate.Date(), 
                    'Num_of_Days': agate.Number(), 'Impressions': agate.Text(), 'Spend_USD': agate.Text(), 
                    }


# Approximate date on which the CSV schema changed; the exact day is not known.
CREATIVE_STATS_SCHEMA_CHANGE_DATE = date(2020, 7, 1) # it's sometime around here, I don't know for sure, that the schema changes

def load_creative_stats_to_db(csvfn, report_date):
    # should log: duration, total rows; New today; here yesterday, gone today.
コード例 #13
0
# Maps number-format strings (keys written as raw strings with backslash
# escapes) to the agate data type used for cells carrying that format.
# Presumably the keys are `mso-number-format` style values - TODO confirm
# against the code that reads this mapping.
MSO_NUMBER_FORMAT_TO_AGATE_TYPE = {
    # Plain, decimal, scientific, percent and fraction formats are numbers.
    r'0': agate.Number(),
    r'0\.0': agate.Number(),
    r'0\.00': agate.Number(),
    r'0\.000': agate.Number(),
    r'0\.0000': agate.Number(),
    r'0\.E+00': agate.Number(),
    r'0%': agate.Number(),
    r'Percent': agate.Number(),
    r'\#\ ?\/?': agate.Number(),
    r'\#\ ??\/??': agate.Number(),
    r'\#\ ???\/???': agate.Number(),
    # Named date formats.
    r'Short Date': agate.Date(date_format='%d/%m/%Y'),
    r'Medium Date': agate.Date(date_format='%d-%b-%y'),
    # NOTE(review): empty date_format for 'Long Date' - confirm whether this
    # is a placeholder or meant to fall back to automatic parsing.
    r'Long Date': agate.Date(date_format=''),
    # Named time formats are parsed as datetimes with a time-only pattern.
    r'Short Time': agate.DateTime(datetime_format='%H:%M'),
    r'Medium Time': agate.DateTime(datetime_format='%I:%M %p'),
    r'Long Time': agate.DateTime(datetime_format='%H:%M:%S:%f'),
    r'\@': agate.Text(),
    # TODO add mm\/dd\/yy and so on...
}


def from_html(cls,
              path,
              table_identifier=0,
              header=True,
              encoding='utf-8',
              mso_number_formats_override=None,
              row_limit=None,
              **kwargs):
コード例 #14
0
"""
This module contains the XLS extension to :class:`Table <agate.table.Table>`.
"""

import datetime
from collections import OrderedDict

import agate
import six
import xlrd

# Maps xlrd cell-type constants to the agate data type used for such cells.
# NOTE(review): empty and blank cells map to Boolean - presumably a
# placeholder type for cells without a value; confirm with the caller.
EXCEL_TO_AGATE_TYPE = {
    xlrd.biffh.XL_CELL_EMPTY: agate.Boolean(),
    xlrd.biffh.XL_CELL_TEXT: agate.Text(),
    xlrd.biffh.XL_CELL_NUMBER: agate.Number(),
    xlrd.biffh.XL_CELL_DATE: agate.DateTime(),
    xlrd.biffh.XL_CELL_BOOLEAN: agate.Boolean(),
    xlrd.biffh.XL_CELL_ERROR: agate.Text(),
    xlrd.biffh.XL_CELL_BLANK: agate.Boolean(),
}


def from_xls(cls,
             path,
             sheet=None,
             skip_lines=0,
             header=True,
             encoding_override=None,
             **kwargs):
    """
    Parse an XLS file.
コード例 #15
0
import agate, os, itertools, time, datetime, glob, csv
from datetime import date

# Shared agate type instances.
text_type = agate.Text()
datetime_type = agate.DateTime()

# Infer all column types except 'contb_receipt_dt', which is forced to Text.
tester = agate.TypeTester(force={'contb_receipt_dt': agate.Text()})

today = date.today()
# NOTE(review): month and day are not zero-padded, so different dates can
# produce the same stamp (Jan 11 and Nov 1 both give "<year>111").
datestamp = str(today.year) + str(today.month) + str(today.day)

# First file matching the pattern; raises IndexError at import time when
# no file matches.
ky_candidates_file = str(
    glob.glob('data/csv/process/*ky-candidate-contributions.csv')[0])

# Loaded once at import time with the type tester defined above.
ky_candidate_contributions = agate.Table.from_csv(ky_candidates_file,
                                                  column_types=tester)

# Committee IDs; per the ID list in the comments that follow, these are
# Trump and Clinton.
current_candidate_cmte_ids = ['C00580100', 'C00575795']

#Trump, Donald J. = C00580100
#Sanders, Bernard = C00577130
#Kasich, John R. = C00581876
#Clinton, Hillary Rodham = C00575795
#Cruz, Rafael Edward 'Ted' = C00574624


def candidate_brackets(contributions):
    #brackets
    #bracket1 = 200 and under
    #bracket2 = 200.01 - 499.99
    #bracket3 = 500 - 999.99
コード例 #16
0
def candidate_time_charts():
    """
    Write per-candidate monthly contribution series to
    ``app/data/candidate_charts.js``.

    The file is rewritten from scratch. It receives one ``count_labels``
    line listing the month/year labels (taken from contributions whose
    computed date compares greater than ``'2015-02-28 00:00:00'``), followed
    by a ``<label>_count_series`` and a ``<label>_sum_series`` line per
    candidate, where ``<label>`` is the lower-cased part of the candidate
    name before the first comma. Months without a matching aggregate row are
    recorded with the value ``'0'``.

    Reads the module-level ``ky_candidate_contributions`` table.
    """
    text_type = agate.Text()

    candidate_contribs_with_monthyear = ky_candidate_contributions.compute([
        ('month_year',
         agate.Formula(text_type, lambda r: r['contb_receipt_dt'][-6:])),
        # The parsed datetime is stored in a Text column; the filter below
        # compares it against a string (presumably relying on the
        # "YYYY-MM-DD HH:MM:SS" rendering - TODO confirm).
        ('date',
         agate.Formula(
             text_type, lambda r: datetime.datetime.strptime(
                 r['contb_receipt_dt'], '%d-%b-%y')))
    ])

    date_sorted_candidat_contribs = candidate_contribs_with_monthyear.order_by(
        'date')
    restricted_date_candidate_contribs = date_sorted_candidat_contribs.where(
        lambda r: r['date'] > '2015-02-28 00:00:00')

    by_candidate_contribs = candidate_contribs_with_monthyear.group_by(
        'cand_nm')

    # Unique month_years in date order; these are the chart labels.
    # labels = ['FEB-15', 'MAR-15', etc...]
    month_years = []
    seen_month_years = set()
    for row in restricted_date_candidate_contribs.rows:
        month_year = row['month_year']
        if month_year not in seen_month_years:
            seen_month_years.add(month_year)
            month_years.append(str(month_year))

    # Unique candidates, in order of first appearance.
    candidates = []
    seen_candidates = set()
    for row in candidate_contribs_with_monthyear.rows:
        candidate = row['cand_nm']
        if candidate not in seen_candidates:
            seen_candidates.add(candidate)
            candidates.append(candidate)

    candidate_month_year_groups = by_candidate_contribs.group_by(
        lambda r: r['month_year'], key_name='month_year_group')

    month_year_counts = candidate_month_year_groups.aggregate([
        ('contribution_count', agate.Count()),
        ('contribution_sum', agate.Sum('contb_receipt_amt'))
    ])

    # Mode 'w' creates or truncates the file, so a missing file no longer
    # makes the function fail, and the handle is closed even on errors.
    with open('app/data/candidate_charts.js', 'w') as chart_js:
        chart_js.write('count_labels = ' + str(month_years) + '\n')

        # For each candidate, each month, we want one value for count and
        # one value for sum. If no aggregate row exists for that pair, a
        # '0' entry is recorded instead.
        for candidate in candidates:
            # Computed up front so it is always bound for this candidate.
            series_label = candidate.split(',')[0].lower()
            candidate_rows = [
                row for row in month_year_counts.rows
                if row['cand_nm'] == candidate
            ]

            count_value_list = []
            sum_value_list = []

            for month in month_years:
                matched = False
                for row in candidate_rows:
                    if month != row['month_year_group']:
                        continue
                    matched = True
                    count_value_list.append({
                        'meta': str('Contributions to ' + candidate +
                                    ' for ' + month),
                        'value': str(row['contribution_count']),
                    })
                    sum_value_list.append({
                        'meta': str('Amt. contributed to ' + candidate +
                                    ' for ' + month),
                        'value': str(row['contribution_sum']),
                    })

                if not matched:
                    count_value_list.append({
                        'meta': str('Contributions to ' + candidate +
                                    ' for ' + month),
                        'value': '0',
                    })
                    sum_value_list.append({
                        'meta': str('Amount contributed to ' + candidate +
                                    ' for ' + month),
                        'value': '0',
                    })

            chart_js.write(series_label + '_count_series = ' +
                           str(count_value_list) + '\n')
            chart_js.write(series_label + '_sum_series = ' +
                           str(sum_value_list) + '\n')