def _process_sheet(self, tabular_data):
    """Store one sheet of tabular data as a PesaTable with its expenditures.

    The sheet is read from ``tabular_data.data`` (a list of rows, each a
    list of cells):

    * ``cells[0][0]`` is used as the table title.
    * Trailing rows with at most two cells are treated as footnotes; their
      first cell is collected (in bottom-up order, as encountered) and
      stored JSON-encoded on ``table.footnotes``.
    * Rows from index 6 onwards whose second cell is non-empty are data
      series: the first cell is the series name and cells 1..9 are paired
      with the years 2002..2010.

    One ``db.Expenditure`` is created per (series, year) cell and the
    session is flushed at the end.
    """
    import db

    # Kept bound for the whole call. NOTE(review): presumably constructing
    # the Repository is what binds db.Session to the database -- confirm.
    repo = db.Repository('sqlite:///%s' % dbpath)
    years = range(2002, 2011)
    cells = tabular_data.data
    title = cells[0][0]
    table = db.PesaTable(title=title)

    # Walk up from the bottom of the sheet until a data row is reached.
    footnotes = []
    for lastrow in reversed(cells):
        if len(lastrow) > 2:  # into the data
            break
        if not lastrow:
            # Completely empty row: nothing to index, nothing to record.
            continue
        foot = lastrow[0].strip()
        if foot:
            footnotes.append(foot)
    table.footnotes = simplejson.dumps(footnotes)

    for row in cells[6:]:
        # Subheadings and footnotes have no value in the second column;
        # footnote rows may be too short to have a second column at all,
        # so check the length before indexing.
        if len(row) > 1 and row[1]:
            series_name = row[0]
            for year, cell in zip(years, row[1:10]):
                db.Expenditure(
                    title=series_name,
                    date=unicode(year),
                    amount=swiss.floatify(cell),
                    pesatable=table,
                )
    db.Session.flush()
def load(self):
    """Load the expenditure CSV into Area / Expenditure records.

    The file is fetched with ``cache.retrieve(url)``. The heading row is
    skipped; each remaining row with a non-blank department becomes one
    ``db.Area`` whose ``expenditures`` hold the non-zero amounts found in
    columns 10 onwards, assigned to consecutive years starting at 2003.
    The session is committed (and removed) every 5000 input rows and once
    more at the end.

    Design notes
    ------------
    Looks like LA is very limited and is always associated with a given
    "department" -- so this is really a classifier for the account.

    Simplest normalization:

    * years
    * dept FK

    Expenditure
      * subfunc
      * year
      * caporcur
      * region: usual ones ... (ID or Non-ID not needed ...)
      * programme FK  # does the same programme ever occur within two
        different departments?

    Programme
      * department

    Department?

    What questions do I want to ask:

    * Basically we want to browse in by facets
    * Region, func, subfunc, ...
    """
    import db

    fp = cache.retrieve(url)
    # Theoretically we'd have distributions to dept from CG as well, but
    # only the 'LA' account is handled here.
    #
    # dept -> account
    # Tag accounts: subfunc
    # Tags relate to other tags ...

    _clean = lambda _str: unicode(_str.strip())

    # Open the file in a context manager so the handle is released even if
    # a row fails to parse part-way through the load.
    with open(fp) as fileobj:
        reader = csv.reader(fileobj)
        # Kept bound for the whole call. NOTE(review): presumably this is
        # what binds db.Session to the database -- confirm.
        repo = db.Repository(dburi)
        # skip headings
        next(reader)
        for count, row in enumerate(reader):
            # A truly empty line is yielded by csv.reader as []; treat it
            # like the blank rows at the end of the file.
            if len(row) < 2:
                continue
            deptcode = _clean(row[0])
            dept = _clean(row[1])
            # have some blank rows at end
            if not dept:
                continue
            function = _clean(row[2])
            subfunction = _clean(row[3])
            pog = _clean(row[4])
            poga = _clean(row[5])  # the verbose form; used as the title
            caporcur = _clean(row[7])
            region = _clean(row[9])
            exps = row[10:]
            area = db.Area(
                title=poga,
                deptcode=deptcode,
                department=dept,
                function=function,
                subfunction=subfunction,
                pog=pog,
                cap_or_cur=caporcur,
                region=region,
            )
            for ii, exp in enumerate(exps):
                amount = swiss.floatify(exp)
                if amount:  # do not bother with null or zero amounts
                    area.expenditures.append(
                        db.Expenditure(amount=amount, year=2003 + ii))
            # Commit in batches so the session does not grow unbounded.
            if count % 5000 == 0:
                print('Completed: %s' % count)
                db.Session.commit()
                db.Session.remove()
    db.Session.commit()
def load(self):
    """Load the expenditure CSV into Area / Expenditure records.

    The file is fetched with ``cache.retrieve(url)``. The heading row is
    skipped; each remaining row with a non-blank department becomes one
    ``db.Area`` (title, department, cap_or_cur, region) whose
    ``expenditures`` hold the non-zero amounts found in columns 10
    onwards, assigned to consecutive years starting at 2003. The session
    is committed (and removed) every 5000 input rows and once more at the
    end.

    Design notes
    ------------
    Looks like LA is very limited and is always associated with a given
    "department" -- so this is really a classifier for the account.

    Simplest normalization:

    * years
    * dept FK

    Expenditure
      * subfunc
      * year
      * caporcur
      * region: usual ones ... (ID or Non-ID not needed ...)
      * programme FK  # does the same programme ever occur within two
        different departments?

    Programme
      * department

    Department?

    What questions do I want to ask:

    * Basically we want to browse in by facets
    * Region, func, subfunc, ...
    """
    import db

    fp = cache.retrieve(url)
    # Theoretically we'd have distributions to dept from CG as well, but
    # only the 'LA' account is handled here.
    #
    # dept -> account
    # Tag accounts: subfunc
    # Tags relate to other tags ...

    _clean = lambda _str: unicode(_str.strip())

    # Open the file in a context manager so the handle is released even if
    # a row fails to parse part-way through the load.
    with open(fp) as fileobj:
        reader = csv.reader(fileobj)
        # Kept bound for the whole call. NOTE(review): presumably this is
        # what binds db.Session to the database -- confirm.
        repo = db.Repository(dburi)
        # skip headings
        next(reader)
        for count, row in enumerate(reader):
            # A truly empty line is yielded by csv.reader as []; treat it
            # like the blank rows at the end of the file.
            if len(row) < 2:
                continue
            dept = _clean(row[1])
            # have some blank rows at end
            if not dept:
                continue
            pog = _clean(row[5])  # the verbose form; used as the title
            caporcur = _clean(row[7])
            region = _clean(row[9])
            exps = row[10:]
            area = db.Area(
                title=pog,
                department=dept,
                cap_or_cur=caporcur,
                region=region,
            )
            for ii, exp in enumerate(exps):
                amount = swiss.floatify(exp)
                if amount:  # do not bother with null or zero amounts
                    area.expenditures.append(
                        db.Expenditure(amount=amount, year=2003 + ii))
            # Commit in batches so the session does not grow unbounded.
            if count % 5000 == 0:
                print("Completed: %s" % count)
                db.Session.commit()
                db.Session.remove()
    db.Session.commit()