def pesa_chapter_1(self):
    """Process every data sheet of the PESA 2008-09 chapter 1 workbook.

    Sheets 1-15 are read in turn (sheet 0 is skipped -- presumably a
    contents/notes page; TODO confirm) and each resulting table is handed
    to self._process_sheet.
    """
    fp = retriever.filepath('pesa0809chapter1.xls')
    reader = T.XlsReader()
    for sheet_index in range(1, 16):
        # Lazy %-args: the message is only formatted if INFO is enabled.
        logger.info('Processing sheet: %s', sheet_index)
        # A fresh handle per read, closed promptly -- the original opened
        # a new file object each iteration and never closed any of them.
        with open(fp) as fileobj:
            td = reader.read(fileobj, sheet_index)
        self._process_sheet(td)
def extract(self):
    """Extract the house-price time series from the downloaded XLS to data.csv.

    Pulls the date column plus four price series (all, new, modern, older)
    and writes them out as CSV with a quarterly-fraction date column.
    """
    logger.info('Extracting data from Xls')
    self.download()
    fp = self.localfps[0]
    reader = T.XlsReader(fp)
    # notes are on 2nd page but ignore for time being
    tdata = reader.read()
    data = tdata.data
    out = T.TabularData()
    out.header = [
        'Date', 'Price (All)', 'Price (New)', 'Price (Modern)',
        'Price (Older)'
    ]
    # First 6 rows are headers/metadata; transpose the rest to column-major.
    # list() is required: on Python 3 zip() returns an iterator, and the
    # original's bare zip(*data) cannot be indexed (data[0], data[2], ...).
    cols = list(zip(*data[6:]))

    def fix_date(indate):
        # e.g. 'Q1 1952' -> 1952.25 (quarter expressed as a year fraction)
        q, year = indate.split()
        return int(year) + int(q[1]) / 4.0

    selected = [
        [fix_date(x) for x in cols[0]],
        cols[2], cols[5], cols[8], cols[11]
    ]
    out.data = list(zip(*selected))
    outfp = 'data.csv'
    writer = T.CsvWriter()
    # Close the output deterministically; the original leaked the handle,
    # risking an unflushed/truncated file.
    with open(outfp, 'w') as outfile:
        writer.write(out, outfile)
    logger.info('Data successfully extracted to: %s' % outfp)
def load_all(self):
    """Load and process every sheet of PESA 2008-09 chapters 1-4.

    For each chapter workbook: an initial read of sheet 0 populates the
    reader's ``book`` attribute so the sheet count is known, then sheets
    1..n-1 are read and passed to ``self._process_sheet``.
    """
    for ii in range(1, 5):
        fp = retriever.filepath('pesa0809chapter%s.xls' % ii)
        logger.info('Processing file: %s' % fp)
        # NOTE(review): three separate open(fp) handles are created per
        # file (constructor, priming read, per-sheet read) and none is
        # ever closed -- a handle leak. Not fixed here because
        # T.XlsReader's constructor stores its file object for later use
        # (see department_and_function, where read() takes no fileobj),
        # so closing early could break the reader. TODO: confirm
        # T.XlsReader semantics and close handles appropriately.
        r = T.XlsReader(open(fp))
        # Priming read: makes r.book available for nsheets below.
        r.read(open(fp), 0)
        num_sheets = r.book.nsheets
        for sheet_index in range(1, num_sheets):
            logger.info('Processing sheet: %s' % sheet_index)
            td = r.read(open(fp), sheet_index)
            self._process_sheet(td)
def extract_dept_spend(self):
    """Return {department: total spend} from PESA 2008 chapter 5, sheet 1.

    The table's last row and last column are grand totals: the last row is
    excluded from iteration, and each department is mapped to row[-1]
    (its own total column).
    """
    fp = retriever.filepath('pesa_2008_chapter5_tables.xls')
    # print() form is valid on Python 2 and 3; the original's
    # 'print fp' statement is a SyntaxError on Python 3.
    print(fp)
    r = T.XlsReader()
    sheet_index = 1
    # Close the workbook handle (the original leaked it).
    with open(fp) as fileobj:
        td = r.read(fileobj, sheet_index)
    cells = td.data
    # Data rows start at index 4; skip the trailing totals row.
    # (Unused locals 'title' and 'headings' from the original removed.)
    return {row[0]: row[-1] for row in cells[4:-1]}
def extract_simple(self):
    """Return (entries, years) from PESA 2008-09 chapter 1, sheet 2.

    entries maps each row label to its cells for columns 1-9; rows whose
    second cell is empty are treated as subheadings and skipped. years is
    the matching range 2002..2010.
    """
    fp = retriever.filepath('pesa0809chapter1.xls')
    # print() form is valid on Python 2 and 3; the original's
    # 'print fp' statement is a SyntaxError on Python 3.
    print(fp)
    r = T.XlsReader()
    sheet_index = 2
    # Close the workbook handle (the original leaked it).
    with open(fp) as fileobj:
        td = r.read(fileobj, sheet_index)
    cells = td.data
    entries = {}
    # Data rows start at index 6.
    for row in cells[6:]:
        if row[1]:  # not a subheading
            entries[row[0]] = row[1:10]
    years = range(2002, 2011)
    return entries, years
def department_and_function(self, order='department', levels=1):
    '''Build a spend tree (JSON) from PESA 2008 chapter 5, sheet 1.

    @param order: value is 'department' or 'function' (determines ordering
        in tree).
    @param levels: no of levels to show (1 or 2)
    @return: JSON string of nested {label, data, children} nodes.
    '''
    fp = retriever.filepath('pesa_2008_chapter5_tables.xls')
    # The reader stores its file object and reads from it in read(), so
    # the handle must stay open until after read(); try/finally closes it
    # afterwards (the original leaked it).
    fileobj = open(fp)
    try:
        r = T.XlsReader(fileobj)
        td = r.read(sheet_index=1)
    finally:
        fileobj.close()
    cells = td.data
    # Last row and last column are totals; exclude both from the matrix.
    functions = [x.strip() for x in cells[3][1:-1]]
    depts = [row[0].strip() for row in cells[4:-1]]
    ourdata = [row[1:-1] for row in cells[4:-1]]

    def nodesum(nodes):
        # Sum the $area values already attached to child nodes.
        return sum(node['data']['$area'] for node in nodes)

    if order == 'department':
        labels1, labels2 = depts, functions
    else:
        labels1, labels2 = functions, depts
        # When ordered by function, transpose so rows follow labels1.
        ourdata = list(zip(*ourdata))
    rootchildren = []
    for label1, row in zip(labels1, ourdata):
        if label1.startswith('of which'):  # skip subfunctions
            continue
        children = []
        for cell, label2 in zip(row, labels2):
            if label2.startswith('of which'):  # skip subfunctions
                continue
            # some have -ve numbers which mess stuff up ...
            val = max(0, int(cell))
            children.append(self.makenode(label2, val))
        deptnode = self.makenode(label1, nodesum(children))
        if levels >= 2:
            deptnode['children'] = children
        rootchildren.append(deptnode)
    root = self.makenode('Total', nodesum(rootchildren))
    root['children'] = rootchildren
    import simplejson
    return simplejson.dumps(root, indent=2)
def _extract_summary(self):
    """Summarize every XLS in self.xls_urls and persist to the info file.

    For each workbook, records the set title (row 4, first cell) and the
    list of table titles found in subsequent rows, then writes the updated
    self.info to infopath as JSON.

    @return: {url: {'title': ..., 'tables': [...]}}
    """
    results = {}
    for url in self.xls_urls:
        logger.info(url)
        fp = retriever.filepath(url)
        # Keep the handle open through read() (the reader stores it),
        # then close it -- the original leaked one handle per URL.
        fileobj = open(fp)
        try:
            r = T.XlsReader(fileobj)
            td = r.read()
        finally:
            fileobj.close()
        # 4th line has title
        set_title = td.data[3][0]

        # Great! sometimes in row 1 sometimes in row 2
        def gettitle(row):
            if len(row) >= 2 and row[1]:
                return row[1]
            return row[0]

        table_titles = [gettitle(row) for row in td.data[4:] if row[0]]
        results[url] = {'title': set_title, 'tables': table_titles}
    self.info['summary'] = results
    # with-block guarantees the JSON is flushed and the handle closed;
    # the original leaked the write handle.
    with open(infopath, 'w') as info_file:
        simplejson.dump(self.info, info_file, indent=4)
    return results