Example #1
0
def _test_dbo(write_dbo, read_dbo=None):
    if read_dbo is None:
        read_dbo = write_dbo

    expect_empty = (('foo', 'bar'),)
    expect = (('foo', 'bar'),
              ('a', 1),
              ('b', 2))
    expect_appended = (('foo', 'bar'),
                       ('a', 1),
                       ('b', 2),
                       ('a', 1),
                       ('b', 2))
    actual = etl.fromdb(read_dbo, 'SELECT * FROM test')

    debug('verify empty to start with...')
    debug(etl.look(actual))
    ieq(expect_empty, actual)

    debug('write some data and verify...')
    etl.todb(expect, write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect, actual)

    debug('append some data and verify...')
    etl.appenddb(expect, write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect_appended, actual)

    debug('overwrite and verify...')
    etl.todb(expect, write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect, actual)

    debug('cut, overwrite and verify')
    etl.todb(etl.cut(expect, 'bar', 'foo'), write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect, actual)

    debug('cut, append and verify')
    etl.appenddb(etl.cut(expect, 'bar', 'foo'), write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect_appended, actual)

    debug('try a single row')
    etl.todb(etl.head(expect, 1), write_dbo, 'test')
    debug(etl.look(actual))
    ieq(etl.head(expect, 1), actual)
Example #2
0
def transform(data,data_set):
	data = data['observations']
	data = etl.fromdicts(data, header=['value','realtime_start','realtime_end','date'])
	data = etl.cut(data,'date','value')	
	data = etl.rename(data,{'date':'date','value': data_set.lower()})
	data = etl.convert(data,data_set.lower(),lambda val: 0 if val == '.' else val)
	return data
Example #3
0
def test_dictlookupone():

    t1 = (('foo', 'bar'), ('a', 1), ('b', 2), ('b', 3))

    try:
        dictlookupone(t1, 'foo', strict=True)
    except DuplicateKeyError:
        pass  # expected
    else:
        assert False, 'expected error'

    # relax
    actual = dictlookupone(t1, 'foo', strict=False)
    # first wins
    expect = {'a': {'foo': 'a', 'bar': 1}, 'b': {'foo': 'b', 'bar': 2}}
    eq_(expect, actual)
    # key only
    actual = dictlookupone(cut(t1, 'foo'), 'foo')
    expect = {'a': {'foo': 'a'},
              'b': {'foo': 'b'}}
    eq_(expect, actual)

    t2 = (('foo', 'bar', 'baz'),
          ('a', 1, True),
          ('b', 2, False),
          ('b', 3, True),
          ('b', 3, False))

    # test compound key
    actual = dictlookupone(t2, ('foo', 'bar'), strict=False)
    expect = {('a', 1): {'foo': 'a', 'bar': 1, 'baz': True},
              ('b', 2): {'foo': 'b', 'bar': 2, 'baz': False},
              ('b', 3): {'foo': 'b', 'bar': 3, 'baz': True}}  # first wins
    eq_(expect, actual)
Example #4
0
def test_dictlookup():

    t1 = (('foo', 'bar'), ('a', 1), ('b', 2), ('b', 3))

    actual = dictlookup(t1, 'foo')
    expect = {'a': [{'foo': 'a', 'bar': 1}],
              'b': [{'foo': 'b', 'bar': 2}, {'foo': 'b', 'bar': 3}]}
    eq_(expect, actual)
    # key only
    actual = dictlookup(cut(t1, 'foo'), 'foo')
    expect = {'a': [{'foo': 'a'}],
              'b': [{'foo': 'b'}, {'foo': 'b'}]}
    eq_(expect, actual)

    t2 = (('foo', 'bar', 'baz'),
          ('a', 1, True),
          ('b', 2, False),
          ('b', 3, True),
          ('b', 3, False))

    # test compound key
    actual = dictlookup(t2, ('foo', 'bar'))
    expect = {('a', 1): [{'foo': 'a', 'bar': 1, 'baz': True}],
              ('b', 2): [{'foo': 'b', 'bar': 2, 'baz': False}],
              ('b', 3): [{'foo': 'b', 'bar': 3, 'baz': True},
                         {'foo': 'b', 'bar': 3, 'baz': False}]}
    eq_(expect, actual)
Example #5
0
def test_lookup():

    t1 = (('foo', 'bar'), ('a', 1), ('b', 2), ('b', 3))

    # lookup one column on another
    actual = lookup(t1, 'foo', 'bar')
    expect = {'a': [1], 'b': [2, 3]}
    eq_(expect, actual)

    # test default value - tuple of whole row
    actual = lookup(t1, 'foo')  # no value selector
    expect = {'a': [('a', 1)], 'b': [('b', 2), ('b', 3)]}
    eq_(expect, actual)
    # test default value - key only
    actual = lookup(cut(t1, 'foo'), 'foo')
    expect = {'a': [('a',)], 'b': [('b',), ('b',)]}
    eq_(expect, actual)

    t2 = (('foo', 'bar', 'baz'),
          ('a', 1, True),
          ('b', 2, False),
          ('b', 3, True),
          ('b', 3, False))

    # test value selection
    actual = lookup(t2, 'foo', ('bar', 'baz'))
    expect = {'a': [(1, True)], 'b': [(2, False), (3, True), (3, False)]}
    eq_(expect, actual)

    # test compound key
    actual = lookup(t2, ('foo', 'bar'), 'baz')
    expect = {('a', 1): [True], ('b', 2): [False], ('b', 3): [True, False]}
    eq_(expect, actual)
Example #6
0
def extract_column(table, column):
    '''
    Return a list of all values minus the header row for a given petl table
    :param table: A  petl data table
    :param column: The name of the column to extract
    '''
    a = petl.cut(table, column)
    return [i[0] for i in a][1:]
Example #7
0
def test_outerjoin_novaluefield():
    table1 = (('id', 'colour'),
              (0, 'black'),
              (1, 'blue'),
              (2, 'red'),
              (3, 'purple'),
              (5, 'yellow'),
              (7, 'white'))
    table2 = (('id', 'shape'),
              (1, 'circle'),
              (3, 'square'),
              (4, 'ellipse'))
    expect = (('id', 'colour', 'shape'),
              (0, 'black', None),
              (1, 'blue', 'circle'),
              (2, 'red', None),
              (3, 'purple', 'square'),
              (4, None, 'ellipse'),
              (5, 'yellow', None),
              (7, 'white', None))
    actual = outerjoin(table1, table2, key='id')
    ieq(expect, actual)
    actual = outerjoin(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id', 'shape'), actual)
    actual = outerjoin(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id', 'colour'), actual)
    actual = outerjoin(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)
Example #8
0
def _test_rightjoin_novaluefield(rightjoin_impl):
    table1 = (('id', 'colour'),
              (1, 'blue'),
              (2, 'red'),
              (3, 'purple'))
    table2 = (('id', 'shape'),
              (0, 'triangle'),
              (1, 'circle'),
              (3, 'square'),
              (4, 'ellipse'),
              (5, 'pentagon'))
    expect = (('id', 'colour', 'shape'),
              (0, None, 'triangle'),
              (1, 'blue', 'circle'),
              (3, 'purple', 'square'),
              (4, None, 'ellipse'),
              (5, None, 'pentagon'))

    actual = rightjoin_impl(table1, table2, key='id')
    ieq(expect, actual)
    actual = rightjoin_impl(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id', 'shape'), actual)
    actual = rightjoin_impl(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id', 'colour'), actual)
    actual = rightjoin_impl(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)
Example #9
0
def _test_leftjoin_novaluefield(leftjoin_impl):

    table1 = (('id', 'colour'),
              (1, 'blue'),
              (2, 'red'),
              (3, 'purple'),
              (5, 'yellow'),
              (7, 'orange'))
    table2 = (('id', 'shape'),
              (1, 'circle'),
              (3, 'square'),
              (4, 'ellipse'))
    expect = (('id', 'colour', 'shape'),
              (1, 'blue', 'circle'),
              (2, 'red', None),
              (3, 'purple', 'square'),
              (5, 'yellow', None,),
              (7, 'orange', None))
    
    actual = leftjoin_impl(table1, table2, key='id')
    ieq(expect, actual)
    actual = leftjoin_impl(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id', 'shape'), actual)
    actual = leftjoin_impl(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id', 'colour'), actual)
    actual = leftjoin_impl(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)
def createDimMedium(events):
    try:
        dim_medium_cut = etl.cut(events, 'utm_medium')
        dim_medium_rename = etl.rename(dim_medium_cut, {'utm_medium': 'medium'})
        dim_medium = etl.distinct(dim_medium_rename)
        # Export as csv to load folder
        etl.tocsv(dim_medium, 'load/dim_medium.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
def createDimSubscriptions(events):
    try:
        dim_subscriptions_cut = etl.cut(events, 'type')
        dim_subscriptions_rename = etl.rename(dim_subscriptions_cut, {'type': 'subscription_name'})
        dim_subscriptions = etl.distinct(dim_subscriptions_rename)
        # Export as csv to load folder
        etl.tocsv(dim_subscriptions, 'load/dim_subscriptions.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
def createDimCampaignType(events):
    try:
        dim_campaigntype_cut = etl.cut(events, 'utm_campaign')
        dim_campaigntype_rename = etl.rename(dim_campaigntype_cut, {'utm_campaign': 'campaign_type'})
        dim_campaigntype = etl.distinct(dim_campaigntype_rename)
        # export as csv to load folder
        etl.tocsv(dim_campaigntype, 'load/dim_campaigntype.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
Example #13
0
def _test_dbo(write_dbo, read_dbo=None):
    if read_dbo is None:
        read_dbo = write_dbo

    expect_empty = (('foo', 'bar'), )
    expect = (('foo', 'bar'), ('a', 1), ('b', 2))
    expect_appended = (('foo', 'bar'), ('a', 1), ('b', 2), ('a', 1), ('b', 2))
    actual = etl.fromdb(read_dbo, 'SELECT * FROM test')

    debug('verify empty to start with...')
    debug(etl.look(actual))
    ieq(expect_empty, actual)

    debug('write some data and verify...')
    etl.todb(expect, write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect, actual)

    debug('append some data and verify...')
    etl.appenddb(expect, write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect_appended, actual)

    debug('overwrite and verify...')
    etl.todb(expect, write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect, actual)

    debug('cut, overwrite and verify')
    etl.todb(etl.cut(expect, 'bar', 'foo'), write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect, actual)

    debug('cut, append and verify')
    etl.appenddb(etl.cut(expect, 'bar', 'foo'), write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect_appended, actual)

    debug('try a single row')
    etl.todb(etl.head(expect, 1), write_dbo, 'test')
    debug(etl.look(actual))
    ieq(etl.head(expect, 1), actual)
Example #14
0
def transform_data(data):
    tbl_data = petl.fromdicts(data)
    tbl_data = petl.convert(
        tbl_data, {k: v['value']
                   for k, v in fields_to_transform.items()})
    tbl_data = petl.rename(
        tbl_data, {k: v['key']
                   for k, v in fields_to_transform.items()})
    tbl_data_allowed = petl.cut(tbl_data, *allowed_fields)

    return tbl_data_allowed
def anyServices():
    # reading the csv file
    csv = pt.fromcsv('services.csv')
    # json content type declaration
    response.headers['Content-type'] = 'application/json'
    response.headers['Access-Control-Allow-Origin'] = '*'
    # cutting out the required column names
    jsonData = pt.cut(csv, 'ServiceID', 'Service')
    # convert the dictionary data into json data
    jsonData = json.JSONEncoder().encode(list(pt.dicts(jsonData)))
    # returning the json data
    return jsonData
Example #16
0
 def get_relationships(self):
     "Parses a list of `Relationship` objects."
     core_file = _find_loinc_table_core_file(self.uri.path)
     core = etl.fromcsv(core_file, delimiter=',')
     core = etl.cut(core, ['LOINC_NUM', 'LONG_COMMON_NAME'])
     hierarchy_file = _find_multi_axial_hierarchy_file(self.uri.path)
     hierarchy = etl.fromcsv(hierarchy_file, delimiter=',')
     hierarchy = etl.leftjoin(hierarchy, core, lkey='CODE', rkey='LOINC_NUM')
     hierarchy = etl.cut(hierarchy, ['IMMEDIATE_PARENT', 'CODE', 'CODE_TEXT', 'LONG_COMMON_NAME'])
     hierarchy = etl.fillright(hierarchy)
     hierarchy = etl.cut(hierarchy, ['IMMEDIATE_PARENT', 'CODE', 'LONG_COMMON_NAME'])
     hierarchy = etl.rename(hierarchy, 'LONG_COMMON_NAME', 'CODE_TEXT')
     parents = etl.cut(hierarchy, ['CODE', 'CODE_TEXT'])
     hierarchy = etl.selectne(hierarchy, 'IMMEDIATE_PARENT', '')
     hierarchy = etl.leftjoin(hierarchy, parents, lkey='IMMEDIATE_PARENT', rkey='CODE', lprefix='source.', rprefix='target.')
     hierarchy = etl.distinct(hierarchy)
     if self.versioned:
         version = _parse_version(hierarchy_file)
         hierarchy = etl.addfield(hierarchy, 'version', version)
     hierarchy = etl.rowmapmany(hierarchy, _to_json, ['relationship'])
     return hierarchy
Example #17
0
def exercise_ss_cursor(setup_dbo, ss_dbo):
    print '=' * len(repr(ss_dbo))
    print 'EXERCISE WITH SERVER-SIDE CURSOR'
    print repr(ss_dbo)
    print '=' * len(repr(ss_dbo))
    print

    expect_empty = (('foo', 'bar'),)
    expect = (('foo', 'bar'), ('a', 1), ('b', 1))
    expect_appended = (('foo', 'bar'), ('a', 1), ('b', 1), ('a', 1), ('b', 1))
    actual = fromdb(ss_dbo, 'SELECT * FROM test')

    print 'verify empty to start with...'
    ieq(expect_empty, actual)
    print look(actual)

    print 'write some data and verify...'
    todb(expect, setup_dbo, 'test')
    ieq(expect, actual)
    print look(actual)

    print 'append some data and verify...'
    appenddb(expect, setup_dbo, 'test')
    ieq(expect_appended, actual)
    print look(actual)

    print 'overwrite and verify...'
    todb(expect, setup_dbo, 'test')
    ieq(expect, actual)
    print look(actual)

    print 'cut, overwrite and verify'
    todb(cut(expect, 'bar', 'foo'), setup_dbo, 'test')
    ieq(expect, actual)
    print look(actual)

    print 'cut, append and verify'
    appenddb(cut(expect, 'bar', 'foo'), setup_dbo, 'test')
    ieq(expect_appended, actual)
    print look(actual)
Example #18
0
def test_issue_231():

    table = [['foo', 'bar'], ['a', '1'], ['b', '2']]
    t = cut(table, 'foo')
    totsv(t, 'tmp/issue_231.tsv')
    u = fromtsv('tmp/issue_231.tsv')
    ieq(t, u)
    tocsv(t, 'tmp/issue_231.csv')
    u = fromcsv('tmp/issue_231.csv')
    ieq(t, u)
    topickle(t, 'tmp/issue_231.pickle')
    u = frompickle('tmp/issue_231.pickle')
    ieq(t, u)
Example #19
0
def exercise(dbo):
    print '=' * len(repr(dbo))
    print repr(dbo)
    print '=' * len(repr(dbo))
    print
    
    expect_empty = (('foo', 'bar'),)
    expect = (('foo', 'bar'), ('a', 1), ('b', 1))
    expect_appended = (('foo', 'bar'), ('a', 1), ('b', 1), ('a', 1), ('b', 1))
    actual = fromdb(dbo, 'SELECT * FROM test')

    print 'verify empty to start with...'
    ieq(expect_empty, actual)
    print look(actual)
    
    print 'write some data and verify...'
    todb(expect, dbo, 'test')
    ieq(expect, actual)
    print look(actual)
    
    print 'append some data and verify...'
    appenddb(expect, dbo, 'test')
    ieq(expect_appended, actual)
    print look(actual)
    
    print 'overwrite and verify...'
    todb(expect, dbo, 'test')
    ieq(expect, actual)
    print look(actual)
    
    print 'cut, overwrite and verify'
    todb(cut(expect, 'bar', 'foo'), dbo, 'test')
    ieq(expect, actual)
    print look(actual)

    print 'cut, append and verify'
    appenddb(cut(expect, 'bar', 'foo'), dbo, 'test')
    ieq(expect_appended, actual)
    print look(actual)
def print_table(ctx):
    """Output a list of pipelines as table."""

    rows = [dict(source.state) for source in ctx.obj['sources']]
    message = '\nNumber of pipelines = {}\n'
    secho(message.format(len(rows)), **SUCCESS)

    subset = [
        'id', 'pipeline_status', 'validation_status', 'nb_validation_errors',
        'scraper_required', 'resource_type', 'extension'
    ]
    sorted_rows = sort(cut(fromdicts(rows), *subset), key='id')
    echo(look(sorted_rows, limit=None))
Example #21
0
def exercise(dbo):
    print '=' * len(repr(dbo))
    print repr(dbo)
    print '=' * len(repr(dbo))
    print
    
    expect_empty = (('foo', 'bar'),)
    expect = (('foo', 'bar'), ('a', 1), ('b', 1))
    expect_appended = (('foo', 'bar'), ('a', 1), ('b', 1), ('a', 1), ('b', 1))
    actual = fromdb(dbo, 'SELECT * FROM test')

    print 'verify empty to start with...'
    ieq(expect_empty, actual)
    print look(actual)
    
    print 'write some data and verify...'
    todb(expect, dbo, 'test')
    ieq(expect, actual)
    print look(actual)
    
    print 'append some data and verify...'
    appenddb(expect, dbo, 'test')
    ieq(expect_appended, actual)
    print look(actual)
    
    print 'overwrite and verify...'
    todb(expect, dbo, 'test')
    ieq(expect, actual)
    print look(actual)
    
    print 'cut, overwrite and verify'
    todb(cut(expect, 'bar', 'foo'), dbo, 'test')
    ieq(expect, actual)
    print look(actual)

    print 'cut, append and verify'
    appenddb(cut(expect, 'bar', 'foo'), dbo, 'test')
    ieq(expect_appended, actual)
    print look(actual)
Example #22
0
def test_issue_231():

    table = [['foo', 'bar'], ['a', '1'], ['b', '2']]
    t = cut(table, 'foo')
    totsv(t, 'tmp/issue_231.tsv')
    u = fromtsv('tmp/issue_231.tsv')
    ieq(t, u)
    tocsv(t, 'tmp/issue_231.csv')
    u = fromcsv('tmp/issue_231.csv')
    ieq(t, u)
    topickle(t, 'tmp/issue_231.pickle')
    u = frompickle('tmp/issue_231.pickle')
    ieq(t, u)
Example #23
0
def test_recordlookup():

    t1 = (('foo', 'bar'), ('a', 1), ('b', 2), ('b', 3))

    lkp = recordlookup(t1, 'foo')
    eq_(['a'], [r.foo for r in lkp['a']])
    eq_(['b', 'b'], [r.foo for r in lkp['b']])
    eq_([1], [r.bar for r in lkp['a']])
    eq_([2, 3], [r.bar for r in lkp['b']])

    # key only
    lkp = recordlookup(cut(t1, 'foo'), 'foo')
    eq_(['a'], [r.foo for r in lkp['a']])
    eq_(['b', 'b'], [r.foo for r in lkp['b']])
Example #24
0
    def cut(self, *columns):
        """
        Return a table of selection of columns

        `Args:`
            \*columns: str
                Columns in the parsons table
        `Returns:`
            A new parsons table containing the selected columnns
        """  # noqa: W605

        from parsons.etl.table import Table

        return Table(petl.cut(self.table, *columns))
Example #25
0
def test_crossjoin_novaluefield():
    table1 = (('id', 'colour'), (1, 'blue'), (2, 'red'))
    table2 = (('id', 'shape'), (1, 'circle'), (3, 'square'))
    expect = (('id', 'colour', 'id', 'shape'), (1, 'blue', 1, 'circle'),
              (1, 'blue', 3, 'square'), (2, 'red', 1, 'circle'), (2, 'red', 3,
                                                                  'square'))
    actual = crossjoin(table1, table2, key='id')
    ieq(expect, actual)
    actual = crossjoin(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 0, 2, 'shape'), actual)
    actual = crossjoin(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 0, 'colour', 2), actual)
    actual = crossjoin(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 0, 2), actual)
Example #26
0
def test_outerjoin_novaluefield():
    table1 = (('id', 'colour'), (0, 'black'), (1, 'blue'), (2, 'red'),
              (3, 'purple'), (5, 'yellow'), (7, 'white'))
    table2 = (('id', 'shape'), (1, 'circle'), (3, 'square'), (4, 'ellipse'))
    expect = (('id', 'colour', 'shape'), (0, 'black', None),
              (1, 'blue', 'circle'), (2, 'red', None), (3, 'purple', 'square'),
              (4, None, 'ellipse'), (5, 'yellow', None), (7, 'white', None))
    actual = outerjoin(table1, table2, key='id')
    ieq(expect, actual)
    actual = outerjoin(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id', 'shape'), actual)
    actual = outerjoin(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id', 'colour'), actual)
    actual = outerjoin(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)
Example #27
0
def test_basics():

    t1 = (('foo', 'bar'), ('A', 1), ('B', 2))
    w1 = FluentWrapper(t1)

    eq_(('foo', 'bar'), w1.header())
    eq_(petl.header(w1), w1.header())
    iassertequal((('A', 1), ('B', 2)), w1.data())
    iassertequal(petl.data(w1), w1.data())

    w2 = w1.cut('bar', 'foo')
    expect2 = (('bar', 'foo'), (1, 'A'), (2, 'B'))
    iassertequal(expect2, w2)
    iassertequal(petl.cut(w1, 'bar', 'foo'), w2)

    w3 = w1.cut('bar', 'foo').cut('foo', 'bar')
    iassertequal(t1, w3)
Example #28
0
def test_basics():

    t1 = (('foo', 'bar'), ('A', 1), ('B', 2))
    w1 = etl.wrap(t1)

    eq_(('foo', 'bar'), w1.header())
    eq_(etl.header(w1), w1.header())
    ieq((('A', 1), ('B', 2)), w1.data())
    ieq(etl.data(w1), w1.data())

    w2 = w1.cut('bar', 'foo')
    expect2 = (('bar', 'foo'), (1, 'A'), (2, 'B'))
    ieq(expect2, w2)
    ieq(etl.cut(w1, 'bar', 'foo'), w2)

    w3 = w1.cut('bar', 'foo').cut('foo', 'bar')
    ieq(t1, w3)
Example #29
0
def _test_rightjoin_novaluefield(rightjoin_impl):
    table1 = (('id', 'colour'), (1, 'blue'), (2, 'red'), (3, 'purple'))
    table2 = (('id', 'shape'), (0, 'triangle'), (1, 'circle'), (3, 'square'),
              (4, 'ellipse'), (5, 'pentagon'))
    expect = (('id', 'colour', 'shape'), (0, None, 'triangle'),
              (1, 'blue', 'circle'), (3, 'purple', 'square'),
              (4, None, 'ellipse'), (5, None, 'pentagon'))

    actual = rightjoin_impl(table1, table2, key='id')
    ieq(expect, actual)
    actual = rightjoin_impl(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id', 'shape'), actual)
    actual = rightjoin_impl(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id', 'colour'), actual)
    actual = rightjoin_impl(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)
Example #30
0
def _test_join_novaluefield(join_impl):

    table1 = (('id', 'colour'), (1, 'blue'), (2, 'red'), (3, 'purple'))
    table2 = (('id', 'shape'), (1, 'circle'), (3, 'square'), (4, 'ellipse'))

    expect = (('id', 'colour', 'shape'), (1, 'blue', 'circle'), (3, 'purple',
                                                                 'square'))

    actual = join_impl(table1, table2, key='id')
    ieq(expect, actual)
    actual = join_impl(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id', 'shape'), actual)
    actual = join_impl(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id', 'colour'), actual)
    actual = join_impl(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)
Example #31
0
def print_table(ctx):
    """Output a list of pipelines as table."""

    rows = [dict(source.state) for source in ctx.obj['sources']]
    message = '\nNumber of pipelines = {}\n'
    secho(message.format(len(rows)), **SUCCESS)

    subset = [
        'id',
        'pipeline_status',
        'validation_status',
        'nb_validation_errors',
        'scraper_required',
        'resource_type',
        'extension'
    ]
    sorted_rows = sort(cut(fromdicts(rows), *subset), key='id')
    echo(look(sorted_rows, limit=None))
Example #32
0
def _test_lookupjoin_novaluefield(lookupjoin_impl):
    table1 = (('id', 'color', 'cost'), (1, 'blue', 12), (2, 'red', 8),
              (3, 'purple', 4))
    table2 = (('id', 'shape', 'size'), (1, 'circle', 'big'),
              (2, 'square', 'tiny'), (3, 'ellipse', 'small'))
    expect = (('id', 'color', 'cost', 'shape', 'size'), (1, 'blue', 12,
                                                         'circle', 'big'),
              (2, 'red', 8, 'square', 'tiny'), (3, 'purple', 4, 'ellipse',
                                                'small'))
    actual = lookupjoin_impl(table1, table2, key='id')
    ieq(expect, actual)
    actual = lookupjoin_impl(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id', 'shape', 'size'), actual)
    actual = lookupjoin_impl(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id', 'color', 'cost'), actual)
    actual = lookupjoin_impl(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)
Example #33
0
def _test_antijoin_novaluefield(antijoin_impl):
    table1 = (('id', 'colour'), (0, 'black'), (1, 'blue'), (2, 'red'),
              (4, 'yellow'), (5, 'white'))
    table2 = (('id', 'shape'), (1, 'circle'), (3, 'square'))
    expect = (('id', 'colour'), (0, 'black'), (2, 'red'), (4, 'yellow'),
              (5, 'white'))
    actual = antijoin_impl(table1, table2, key='id')
    ieq(expect, actual)
    actual = antijoin_impl(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id'), actual)
    actual = antijoin_impl(table1, cut(table2, 'id'), key='id')
    ieq(expect, actual)
    actual = antijoin_impl(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)
Example #34
0
def xref_symbol_reports():
    symbol_reports = [
        f for f in os.listdir()
        if re.match('OCLC Datasync Unresolved.*\.csv', f)
    ]

    today = str(date.today())

    for report in symbol_reports:

        symbol_split = re.split('^.*processing.(M[A-Z]{2}).*$', report)
        symbol = symbol_split[1]
        xlsx_outfile = symbol + '_datasync_unresolved_' + today + '.xlsx'
        xls_outfile = symbol + '_datasync_unresolved_' + today + '.xls'
        txt_outfile = symbol + '_staging_OCNs_' + today + '.txt'

        symbol_table_raw = etl.fromcsv(report, encoding='utf-8')
        symbol_table = etl.rename(symbol_table_raw, '\ufeffMMS Id', 'MMS ID')
        symbol_table2 = etl.select(symbol_table, "{MMS ID} is not None")
        symbol_table_sorted = etl.sort(symbol_table2, 'MMS ID')

        xref_table = etl.fromcsv('unresxref.csv')
        xref_table2 = etl.select(xref_table, "{MMS ID} is not None")
        xref_table_sorted = etl.sort(xref_table2, 'MMS ID')

        symbol_xref_table = etl.join(symbol_table_sorted,
                                     xref_table_sorted,
                                     presorted=True,
                                     lkey="MMS ID",
                                     rkey="MMS ID")

        try:
            etl.toxlsx(symbol_xref_table, xlsx_outfile, encoding='utf-8')
        except TypeError:
            etl.toxls(symbol_xref_table,
                      xls_outfile,
                      'Sheet1',
                      encoding='utf-8')

        staging_ocns_table = etl.cut(symbol_xref_table, 'Staging OCN')
        template = '{Staging OCN}\n'
        etl.totext(staging_ocns_table, txt_outfile, template=template)
Example #35
0
def test_lookupone():

    t1 = (('foo', 'bar'), ('a', 1), ('b', 2), ('b', 3))

    # lookup one column on another under strict mode
    try:
        lookupone(t1, 'foo', 'bar', strict=True)
    except DuplicateKeyError:
        pass  # expected
    else:
        assert False, 'expected error'

    # lookup one column on another under, not strict
    actual = lookupone(t1, 'foo', 'bar', strict=False)
    expect = {'a': 1, 'b': 2}  # first value wins
    eq_(expect, actual)

    # test default value - tuple of whole row
    actual = lookupone(t1, 'foo', strict=False)  # no value selector
    expect = {'a': ('a', 1), 'b': ('b', 2)}  # first wins
    eq_(expect, actual)
    # test default value - key only
    actual = lookupone(cut(t1, 'foo'), 'foo')
    expect = {'a': ('a',), 'b': ('b',)}
    eq_(expect, actual)

    t2 = (('foo', 'bar', 'baz'),
          ('a', 1, True),
          ('b', 2, False),
          ('b', 3, True),
          ('b', 3, False))

    # test value selection
    actual = lookupone(t2, 'foo', ('bar', 'baz'), strict=False)
    expect = {'a': (1, True), 'b': (2, False)}
    eq_(expect, actual)

    # test compound key
    actual = lookupone(t2, ('foo', 'bar'), 'baz', strict=False)
    expect = {('a', 1): True, ('b', 2): False, ('b', 3): True}  # first wins
    eq_(expect, actual)
Example #36
0
def collection_count(request, pk):

    collection = Collection.objects.get(pk=pk)
    table = collection.data

    if request.method == "GET":
        fields = []
        button_states = {}
        header, table = table[0], table[1:]
    else:
        fields = [f for f in FIELDS if f in request.POST]

        if fields:
            table = petl.cut(table, *fields)
            header, counts = list(table[0]) + ["Count"], petl.valuecounter(
                table, *fields)
            table = []
            for k, v in counts.items():
                if isinstance(k, str):
                    table.append([k] + [v])
                else:
                    table.append(list(k) + [v])
        else:
            header, table = table[0], table[1:]

        button_states = {
            f"{f}_checked": True
            for f in FIELDS if f in request.POST
        }

    return render(
        request,
        "api/collection_detail.html",
        {
            "header": header,
            "table": table,
            "collection_id": collection.id,
            "show_load_more": False,
            **button_states,
        },
    )
Example #37
0
def test_basics():
    
    t1 = (('foo', 'bar'),
         ('A', 1),
         ('B', 2))
    w1 = FluentWrapper(t1)
    
    eq_(('foo', 'bar'), w1.header())
    eq_(petl.header(w1), w1.header())
    iassertequal((('A', 1), ('B', 2)), w1.data())
    iassertequal(petl.data(w1), w1.data())
    
    w2 = w1.cut('bar', 'foo')
    expect2 = (('bar', 'foo'),
               (1, 'A'),
               (2, 'B'))
    iassertequal(expect2, w2)
    iassertequal(petl.cut(w1, 'bar', 'foo'), w2)
    
    w3 = w1.cut('bar', 'foo').cut('foo', 'bar')
    iassertequal(t1, w3)
Example #38
0
def test_basics():
    
    t1 = (('foo', 'bar'),
         ('A', 1),
         ('B', 2))
    w1 = etl(t1)
    
    eq_(('foo', 'bar'), w1.header())
    eq_(petl.header(w1), w1.header())
    ieq((('A', 1), ('B', 2)), w1.data())
    ieq(petl.data(w1), w1.data())
    
    w2 = w1.cut('bar', 'foo')
    expect2 = (('bar', 'foo'),
               (1, 'A'),
               (2, 'B'))
    ieq(expect2, w2)
    ieq(petl.cut(w1, 'bar', 'foo'), w2)
    
    w3 = w1.cut('bar', 'foo').cut('foo', 'bar')
    ieq(t1, w3)
Example #39
0
def kcmo_convert(filepath, xtrapath):
    """
    Takes the file path to a csv in the format used by Kansas City proper
    converts to universal format 
    outputs csv.
    """
    kcmo = etl.fromcsv(filepath)
    kcx = etl.fromxlsx(xtrapath)
    kcjoin = etl.join(kcmo, kcx, lkey='POLEID', rkey='IDNumber')
    del kcmo
    del kcx

    kcjoin = etl.addfield(kcjoin, 'PoleID', lambda x: x['POLEID'])
    kcjoin = etl.addfield(kcjoin, 'Longitude',
                          lambda x: geom_to_tuple(x['the_geom'])[0])
    kcjoin = etl.addfield(kcjoin, 'Latitude',
                          lambda x: geom_to_tuple(x['the_geom'])[1])
    kcjoin = etl.addfield(kcjoin, 'LightbulbType',
                          lambda x: x['LUMINAIRE TYPE'])
    kcjoin = etl.addfield(kcjoin, 'Wattage', lambda x: x['WATTS'])
    kcjoin = etl.addfield(kcjoin, 'Lumens', None)
    kcjoin = etl.addfield(
        kcjoin, 'LightAttributes', lambda x: make_a_list(
            x['ATTACHMENT 10'], x['ATTACHMENT 9'], x['ATTACHMENT 8'], x[
                'ATTACHMENT 7'], x['ATTACHMENT 6'], x['ATTACHMENT 5'], x[
                    'ATTACHMENT 4'], x['ATTACHMENT 3'], x['ATTACHMENT 2'], x[
                        'ATTACHMENT 1'], x['SPECIAL_N2'], x['SPECIAL_NO']))
    kcjoin = etl.addfield(kcjoin, 'AttachedTech',
                          lambda x: bool(x['LightAttributes']))
    kcjoin = etl.addfield(
        kcjoin, 'FiberWiFiEnable', lambda x: find_wifi(*x[
            'LightAttributes'], x['SPECIAL_N2'], x['SPECIAL_NO']))
    kcjoin = etl.addfield(kcjoin, 'PoleType', lambda x: x['POLE TYPE'])
    kcjoin = etl.addfield(kcjoin, 'PoleOwner', lambda x: x['POLE OWNER'])
    kcjoin = etl.addfield(kcjoin, 'DataSource', 'Kansas City')
    kcjoin = etl.cut(kcjoin, 'PoleID', 'Longitude', 'Latitude',
                     'LightbulbType', 'Wattage', 'Lumens', 'AttachedTech',
                     'LightAttributes', 'FiberWiFiEnable', 'PoleType',
                     'PoleOwner', 'DataSource')
    etl.tocsv(kcjoin, 'data/kcmo_clean.csv')
Example #40
0
def test_crossjoin_novaluefield():
    table1 = (('id', 'colour'),
              (1, 'blue'),
              (2, 'red'))
    table2 = (('id', 'shape'),
              (1, 'circle'),
              (3, 'square'))
    expect = (('id', 'colour', 'id', 'shape'),
              (1, 'blue', 1, 'circle'),
              (1, 'blue', 3, 'square'),
              (2, 'red', 1, 'circle'),
              (2, 'red', 3, 'square'))
    actual = crossjoin(table1, table2, key='id')
    ieq(expect, actual)
    actual = crossjoin(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 0, 2, 'shape'), actual)
    actual = crossjoin(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 0, 'colour', 2), actual)
    actual = crossjoin(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 0, 2), actual)
Example #41
0
def _test_leftjoin_novaluefield(leftjoin_impl):

    table1 = (('id', 'colour'), (1, 'blue'), (2, 'red'), (3, 'purple'),
              (5, 'yellow'), (7, 'orange'))
    table2 = (('id', 'shape'), (1, 'circle'), (3, 'square'), (4, 'ellipse'))
    expect = (('id', 'colour', 'shape'), (1, 'blue', 'circle'),
              (2, 'red', None), (3, 'purple', 'square'), (
                  5,
                  'yellow',
                  None,
              ), (7, 'orange', None))

    actual = leftjoin_impl(table1, table2, key='id')
    ieq(expect, actual)
    actual = leftjoin_impl(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id', 'shape'), actual)
    actual = leftjoin_impl(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id', 'colour'), actual)
    actual = leftjoin_impl(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)
Example #42
0
def xls_tidy(xls,qvalue):
    d=etl.fromtsv(xls)
    sd=etl.select(d,lambda x: float(x.PepQValue) <=float(qvalue))
    psmsummary=sd

    ssd=etl.cut(sd, 'Peptide', 'Protein', 'PepQValue')
    #remove the mod info in peptide.
    ssd=etl.transform.regex.sub(ssd,'Peptide', r'^[\w-]\.(.+)\.[\w-]$', r'\1')
    ssd=etl.transform.regex.sub(ssd,'Peptide', r'[\d\.\+]+', r'')

    aggregation = OrderedDict()
    aggregation['SpecCount'] = len
    cssd=etl.aggregate(ssd, 'Peptide', aggregation)

    fssd=etl.groupselectfirst(ssd, key=('Peptide','Protein',"PepQValue"))
    aggregation = OrderedDict()
    aggregation['Protein'] = 'Protein', etl.strjoin(';')
    aggregation['PepQValue'] = 'PepQValue', etl.strjoin(';')
    assd=etl.aggregate(fssd, 'Peptide', aggregation)
    pepsummary=etl.join(assd, cssd, key='Peptide')

    return (psmsummary, pepsummary)
Example #43
0
def test_recordlookupone():

    t1 = (('foo', 'bar'), ('a', 1), ('b', 2), ('b', 3))

    try:
        recordlookupone(t1, 'foo', strict=True)
    except DuplicateKeyError:
        pass  # expected
    else:
        assert False, 'expected error'

    # relax
    lkp = recordlookupone(t1, 'foo', strict=False)
    eq_('a', lkp['a'].foo)
    eq_('b', lkp['b'].foo)
    eq_(1, lkp['a'].bar)
    eq_(2, lkp['b'].bar)  # first wins

    # key only
    lkp = recordlookupone(cut(t1, 'foo'), 'foo', strict=False)
    eq_('a', lkp['a'].foo)
    eq_('b', lkp['b'].foo)
def createFacts(events, users):
    try:
        events_uid = etl.cutout(events, 'tracking_id', 'utm_medium', 'utm_campaign')
        events_tui = etl.cutout(events, 'user_id')

        stage_uid = etl.join(users, events_uid, key='user_id')
        stage_tui = etl.join(users, events_tui, key='tracking_id')

        stage_utm = etl.cut(stage_tui, 'user_id', 'utm_medium', 'utm_campaign')
        stage_uid_utm = etl.join(stage_uid, stage_utm, key='user_id')
        stage_m_s = etl.mergesort(stage_uid_utm, stage_tui, key=['created_at', 'email'])

        mappings = OrderedDict()
        mappings['tid'] = 'tracking_id'
        mappings['uid'] = 'user_id'
        mappings['utm_medium'] = 'utm_medium'
        mappings['utm_campaign'] = 'utm_campaign', {'audio': 'none', 'social': 'none'}
        mappings['utm_campaigntype'] = 'utm_campaign'
        mappings['email'] = 'email'
        mappings['subscription'] = 'type'
        mappings['sub_order'] = 'type', {'Signup Completed': '1', 'Trial Started': '2', 'Subscription Started': '3', 'Subscription Ended': '4'}
        mappings['created_at'] = 'created_at'

        # Mapping
        stage_mapping = etl.fieldmap(stage_m_s, mappings)

        # Sort
        stage_mapping_ordered = etl.sort(stage_mapping, key=['created_at', 'email', 'sub_order'])

        # Datetime split
        t1 = etl.split(stage_mapping_ordered, 'created_at', 'T', ['date', 'time'], include_original=True)
        t2 = etl.split(t1, 'date', '-', ['year', 'month', 'day'])
        stage_ready = etl.split(t2, 'time', ':', ['hour', 'minute', 'second'])

        # Export as csv to load folder
        etl.tocsv(stage_ready, 'load/facts.csv')

    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
Example #45
0
def _test_lookupjoin_novaluefield(lookupjoin_impl):
    table1 = (('id', 'color', 'cost'),
              (1, 'blue', 12),
              (2, 'red', 8),
              (3, 'purple', 4))
    table2 = (('id', 'shape', 'size'),
              (1, 'circle', 'big'),
              (2, 'square', 'tiny'),
              (3, 'ellipse', 'small'))
    expect = (('id', 'color', 'cost', 'shape', 'size'),
              (1, 'blue', 12, 'circle', 'big'),
              (2, 'red', 8, 'square', 'tiny'),
              (3, 'purple', 4, 'ellipse', 'small'))
    actual = lookupjoin_impl(table1, table2, key='id')
    ieq(expect, actual)
    actual = lookupjoin_impl(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id', 'shape', 'size'), actual)
    actual = lookupjoin_impl(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id', 'color', 'cost'), actual)
    actual = lookupjoin_impl(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)
Example #46
0
    def import_process(self):
        self.data = self.hourly_data
        [
            list(
                map(
                    lambda value: value.update({
                        'location':
                        key.PositionName,
                        'position':
                        (key.Latitude, key.Longitude, key.Elevation)
                    }), values)) for key, values in self.data.items()
        ]

        weather_data = [
            value for key, values in self.data.items() for value in values
        ]

        self.data = petl.fromdicts(weather_data)

        self.data = petl.cut(self.data, 'location', 'position', 'time',
                             'summary', 'temperature', 'pressure', 'humidity')

        return True
Example #47
0
def _test_join_novaluefield(join_impl):

    table1 = (('id', 'colour'),
              (1, 'blue'),
              (2, 'red'),
              (3, 'purple'))
    table2 = (('id', 'shape'),
              (1, 'circle'),
              (3, 'square'),
              (4, 'ellipse'))

    expect = (('id', 'colour', 'shape'),
              (1, 'blue', 'circle'),
              (3, 'purple', 'square'))

    actual = join_impl(table1, table2, key='id')
    ieq(expect, actual)
    actual = join_impl(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id', 'shape'), actual)
    actual = join_impl(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id', 'colour'), actual)
    actual = join_impl(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)
Example #48
0
def _test_antijoin_novaluefield(antijoin_impl):
    table1 = (('id', 'colour'),
              (0, 'black'),
              (1, 'blue'),
              (2, 'red'),
              (4, 'yellow'),
              (5, 'white'))
    table2 = (('id', 'shape'),
              (1, 'circle'),
              (3, 'square'))
    expect = (('id', 'colour'),
              (0, 'black'),
              (2, 'red'),
              (4, 'yellow'),
              (5, 'white'))
    actual = antijoin_impl(table1, table2, key='id')
    ieq(expect, actual)
    actual = antijoin_impl(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id'), actual)
    actual = antijoin_impl(table1, cut(table2, 'id'), key='id')
    ieq(expect, actual)
    actual = antijoin_impl(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)
def add_bbreflink(rec):
    bid = rec['bbrefID']
    initial = bid[0]
    return "http://www.baseball-reference.com/players/" + initial + "/" + bid + ".shtml"



# Load Master.csv from the Lahman database.
table = etl.fromcsv(sys.argv[1])

# Use US births only
table2 = etl.select(table, lambda rec: rec.birthCountry == 'USA')

# Only use these fields
table3 = etl.cut(table2, 'nameFirst', 'nameLast', 'debut', 'bbrefID', 'weight', 'height', 'finalGame', 'birthCity', 'birthState', 'birthYear')

# Remove null birth city and birth year
table4 = etl.select(table3, lambda rec: rec.birthCity != "" and rec.birthYear != "")

# Add Baseball Reference URL
table5 = etl.addfield(table4, 'baseball_ref_url', add_bbreflink)
# Remove unnecessary bbrefid
table6 = etl.cutout(table5, "bbrefID")

# Load city,state lat long table.
city = etl.fromcsv(sys.argv[2])
# Only use these fields
city2 = etl.cut(city, "city", "state", "lat", "long")

# Join tables by two keys
Example #50
0
table4['age'] = 'age_years'
look(table4)


# cut

table1 = [['foo', 'bar', 'baz'],
          ['A', 1, 2.7],
          ['B', 2, 3.4],
          ['B', 3, 7.8],
          ['D', 42, 9.0],
          ['E', 12]]

from petl import look, cut    
look(table1)
table2 = cut(table1, 'foo', 'baz')
look(table2)
# fields can also be specified by index, starting from zero
table3 = cut(table1, 0, 2)
look(table3)
# field names and indices can be mixed
table4 = cut(table1, 'bar', 0)
look(table4)
# select a range of fields
table5 = cut(table1, *range(0, 2))
look(table5)    


# cutout

table1 = [['foo', 'bar', 'baz'],
Example #51
0
def attendance_file2table(filename, output_csv_filebase, add_extra_fields):
    global full_name2sk_indiv_id

    print "*** Parsing file: " + filename
    print

    attendance_dicts = []

    # CCB's Worship Service event IDs...
    event_ids = {}
    event_ids["8"] = 6
    event_ids["9"] = 7
    event_ids["10"] = 8
    event_ids["11:15"] = 9
    event_ids["Christmas"] = 13

    # The following are used to create CSV output filenames and to emit human-readable event name if add_extra_fields
    # flag is on
    event_names = {}
    event_names[6] = "08am"
    event_names[7] = "09am"
    event_names[8] = "10am"
    event_names[9] = "11_15am"
    event_names[13] = "Christmas Eve"

    # Time of event in Excel-parseable format
    event_times = {}
    event_times[6] = "08:00 AM"
    event_times[7] = "09:00 AM"
    event_times[8] = "10:00 AM"
    event_times[9] = "11:15 AM"
    event_times[13] = "04:00 PM"

    # Starting state...
    prior_line = None
    matched_month_year = None
    matched_service_time = None
    month = None
    year = None
    service_time = None
    line_number = 1
    total_row_dict = None
    event_id = None
    accumulated_row_totals_dict = {"week1": 0, "week2": 0, "week3": 0, "week4": 0, "week5": 0, "week6": 0, "total": 0}
    full_name = None
    phone = None
    num_processed_lines = 0

    for line in open(filename):

        # First pick off line at front of file indicating month and year that this attendance file is for...
        if not matched_month_year:
            matched_month_year = re.search("For the month of ([A-Z][a-z]+), ([0-9]{4})", line)
            if matched_month_year:
                month = string2monthnum(matched_month_year.group(1))
                year = string2yearnum(matched_month_year.group(2))
                if not (month and year):
                    print >> sys.stderr, "*** Filename: " + filename + ", line number: " + str(line_number)
                    print >> sys.stderr, "*** ERROR! Invalid month or year found"
                    print >> sys.stderr, line
                    print >> sys.stderr
                    sys.exit(1)
                first_day_in_month, num_days_in_month = calendar.monthrange(year, month)

                # Create list of 6 date objects, month_sundays, representing week1, week2, ... week6 Sunday dates
                # If a week has no Sunday, it is None
                day_countup = 1
                day_countup += 6 - first_day_in_month
                month_sundays = []
                if first_day_in_month != 6:
                    month_sundays.append(None)
                while day_countup <= num_days_in_month:
                    month_sundays.append(datetime.date(year, month, day_countup))
                    day_countup += 7
                while len(month_sundays) < 6:
                    month_sundays.append(None)
                christmas_eve_date = datetime.date(year, month, 24)

        # Second pick off line at front of file indicating worship service time that this attendance file is for...
        elif not matched_service_time:
            matched_service_time = re.search("Worship Service - (Sunday |Summer )?([^ ]*)", line)
            if matched_service_time:
                service_time = matched_service_time.group(2)
                if service_time in event_ids:
                    event_id = event_ids[service_time]
                    event_name = event_names[event_id]
                else:
                    print >> sys.stderr, "*** Filename: " + filename + ", line number: " + str(line_number)
                    print >> sys.stderr, '*** ERROR! Unrecognized service_time: "' + service_time + '"'
                    print >> sys.stderr
                    sys.exit(1)

        # ...then match attendance (row per person with weeks they attended) and total (summary at bottom) rows
        else:

            # Once we found row with totals...we're done, that's last line in attendance file we need to parse
            matched_total_line = re.search("^ {18}Total: {13}(?P<attendance>( +[0-9]+)+)\r?$", line)
            if matched_total_line:
                totals_attendance_dict = attendance_str2dict(
                    matched_total_line.group("attendance"), [-3, -9, -15, -20, -24, -29, -35], 3
                )
                break

            matched_attendance_line = re.search(
                "^ {6}"
                + "(?P<full_name>(?P<last_name>[A-Za-z]+([ \-'][A-Za-z]+)*), "
                + "(?P<first_name>([A-Za-z]+\.?)+([\-' ][A-Za-z]+)*)( \((?P<nick_name>[A-Za-z]+)\))?\.?)?\r?"
                + "(?P<phone>( +)?([0-9]{3}-[0-9]{3}-[0-9]{4}|Unlisted))?"
                + "(?P<attendance> +(1 +)+[1-6])?\r?$",
                line,
            )
            if matched_attendance_line:
                if matched_attendance_line.group("full_name"):
                    full_name = matched_attendance_line.group("full_name").strip()
                if matched_attendance_line.group("phone"):
                    phone = matched_attendance_line.group("phone").strip()
                if matched_attendance_line.group("attendance"):
                    if full_name:
                        attendance = matched_attendance_line.group("attendance").strip()
                        row_dict = attendance_str2dict(attendance, [-1, -7, -13, -18, -22, -27, -33], 1)
                        row_dict["full_name"] = full_name
                        if phone:
                            row_dict["phone"] = phone
                        else:
                            row_dict["phone"] = ""
                        num_processed_lines += 1
                        full_name = None
                        phone = None
                        if row_dict["total"] != (
                            row_dict["week1"]
                            + row_dict["week2"]
                            + row_dict["week3"]
                            + row_dict["week4"]
                            + row_dict["week5"]
                            + row_dict["week6"]
                        ):
                            print >> sys.stderr, "*** Filename: " + filename + ", line number: " + str(line_number)
                            print >> sys.stderr, "*** ERROR! Bad row total, doesn't match sum of weeks 1-6"
                            print >> sys.stderr, row_dict
                            print >> sys.stderr
                            break

                        for key in accumulated_row_totals_dict:
                            accumulated_row_totals_dict[key] += row_dict[key]
                        attendance_dicts.append(row_dict)

            # Buffer the current line for line folding if needed (see 'line folding' above)
            prior_line = line
            line_number += 1

    print "*** Number of attendance lines processed: " + str(num_processed_lines)
    print "*** Number of attendees: " + str(accumulated_row_totals_dict["total"])
    print

    if output_csv_filebase and event_id:
        output_csv_filename = (
            output_csv_filebase + "/" + str(year) + format(month, "02d") + "_" + str(event_names[event_id]) + ".csv"
        )
        all_columns_table = petl.fromdicts(attendance_dicts)
        petl.tocsv(all_columns_table, output_csv_filename)

    # Build 2nd list of dicts, where each list item is dict of individual date/event attendance.  I.e. a row per
    # worship service date vs original attendance dicts format of a row per attendee across all weeks in month.
    # This is the actual one returned and eventually emitted into output file
    attendance_dicts2 = []
    for attendance_dict in attendance_dicts:
        for key in attendance_dict:
            if key[:4] == "week" and attendance_dict[key] != 0:
                week_index = int(key[4:5]) - 1
                if month_sundays[week_index] is not None:
                    attendance_dict2 = {}
                    full_name = attendance_dict["full_name"]
                    if full_name in full_name2sk_indiv_id:
                        attendance_dict2["Individual ID"] = full_name2sk_indiv_id[full_name]
                        if event_name == "Christmas Eve":
                            attendance_dict2["Date"] = christmas_eve_date
                        else:
                            attendance_dict2["Date"] = month_sundays[week_index]
                        attendance_dict2["Event ID"] = event_id
                        if add_extra_fields:
                            attendance_dict2["Time"] = event_times[event_id]
                            attendance_dict2["Full Name"] = full_name
                            attendance_dict2["Event Name"] = event_name
                            attendance_dict2["Week Num"] = week_index + 1
                        attendance_dicts2.append(attendance_dict2)
                    else:
                        print >> sys.stderr, '*** WARNING! Cannot find "' + full_name + '" in map'
                        print >> sys.stderr
                else:
                    print >> sys.stderr, '*** WARNING! Cannot find Sunday date for week index "' + str(week_index) + '"'
                    print >> sys.stderr

    # Check if numbers on Servant Keeper's reported Total: line match the totals we've been accumulating
    # per attendance row entry.  If they don't match, show WARNING (not ERROR, since via manual checks, it appears
    # that Servant Keeper totals are buggy)
    if totals_attendance_dict:
        for key in accumulated_row_totals_dict:
            if accumulated_row_totals_dict[key] != totals_attendance_dict[key]:
                pp = pprint.PrettyPrinter(stream=sys.stderr)
                print >> sys.stderr, "*** WARNING! Servant Keeper reported totals do not match data totals"
                print >> sys.stderr, "Servant Keeper Totals:"
                pp.pprint(totals_attendance_dict)
                print >> sys.stderr, "Data Totals:"
                pp.pprint(accumulated_row_totals_dict)
                print >> sys.stderr
                break

    return_table = petl.fromdicts(attendance_dicts2)
    header = petl.header(return_table)
    if "Event Name" in header:
        return_table = petl.cut(
            return_table, "Full Name", "Event Name", "Time", "Week Num", "Date", "Event ID", "Individual ID"
        )
    else:
        return_table = petl.cut(return_table, "Date", "Event ID", "Individual ID")

    return return_table
Example #52
0
def typeInference(table):
	for h in etl.header(table):
		col =  etl.cut(table, h)
		print etl.nrows(col)
Example #53
0
from __future__ import division, print_function, absolute_import


# cut()
#######

import petl as etl
table1 = [['foo', 'bar', 'baz'],
          ['A', 1, 2.7],
          ['B', 2, 3.4],
          ['B', 3, 7.8],
          ['D', 42, 9.0],
          ['E', 12]]
table2 = etl.cut(table1, 'foo', 'baz')
table2
# fields can also be specified by index, starting from zero
table3 = etl.cut(table1, 0, 2)
table3
# field names and indices can be mixed
table4 = etl.cut(table1, 'bar', 0)
table4
# select a range of fields
table5 = etl.cut(table1, *range(0, 2))
table5


# cutout()
##########

import petl as etl
table1 = [['foo', 'bar', 'baz'],