Beispiel #1
0
 def pesa_chapter_1(self):
     """Process sheets 1 through 15 of ``pesa0809chapter1.xls``.

     Each sheet is read with ``T.XlsReader`` and the resulting table is
     handed to ``self._process_sheet``. Sheet index 0 is not processed.
     """
     fp = retriever.filepath('pesa0809chapter1.xls')
     r = T.XlsReader()
     for sheet_index in range(1, 16):
         logger.info('Processing sheet: %s', sheet_index)
         # .xls is a binary format, so open in binary mode. The context
         # manager closes the handle after each sheet instead of leaking
         # one open file per iteration.
         with open(fp, 'rb') as fileobj:
             td = r.read(fileobj, sheet_index)
             self._process_sheet(td)
Beispiel #2
0
    def extract(self):
        """Download the source spreadsheet and write five columns to CSV.

        Calls ``self.download()``, reads the first local file with
        ``T.XlsReader``, skips the first six rows, converts the first column
        from quarter strings to fractional years and writes it, together
        with columns 2, 5, 8 and 11, to ``data.csv`` in the current
        directory.
        """
        logger.info('Extracting data from Xls')
        self.download()
        fp = self.localfps[0]
        reader = T.XlsReader(fp)
        # notes are on 2nd page but ignore for time being
        tdata = reader.read()
        out = T.TabularData()
        out.header = [
            'Date', 'Price (All)', 'Price (New)', 'Price (Modern)',
            'Price (Older)'
        ]

        def fix_date(indate):  # e.g Q1 1952
            # Returns year + quarter_number / 4.0, so 'Q1 1952' -> 1952.25.
            q, year = indate.split()
            return int(year) + int(q[1]) / 4.0

        # Transpose rows into columns. zip() has to be materialised because
        # on Python 3 it returns a lazy iterator that cannot be indexed.
        columns = list(zip(*tdata.data[6:]))
        selected = [
            [fix_date(x) for x in columns[0]],
            columns[2],
            columns[5],
            columns[8],
            columns[11],
        ]
        # Transpose back to one tuple per output row.
        out.data = list(zip(*selected))
        outfp = 'data.csv'
        writer = T.CsvWriter()
        # Close (and therefore flush) the output file deterministically.
        with open(outfp, 'w') as outfile:
            writer.write(out, outfile)
        logger.info('Data successfully extracted to: %s', outfp)
Beispiel #3
0
 def load_all(self):
     """Process every sheet after the first in PESA chapter files 1-4.

     For each of ``pesa0809chapter1.xls`` .. ``pesa0809chapter4.xls`` the
     sheet count is taken from ``r.book.nsheets`` and sheets
     ``1 .. nsheets - 1`` are passed to ``self._process_sheet``.
     """
     for ii in range(1, 5):
         fp = retriever.filepath('pesa0809chapter%s.xls' % ii)
         logger.info('Processing file: %s', fp)
         # Every handle is opened in binary mode (.xls is binary) and
         # closed by its context manager; previously each open() leaked.
         with open(fp, 'rb') as book_file:
             r = T.XlsReader(book_file)
             # The result of reading sheet 0 is discarded; the read is
             # presumably done so that r.book is populated -- confirm
             # against the reader implementation.
             with open(fp, 'rb') as first_sheet_file:
                 r.read(first_sheet_file, 0)
             num_sheets = r.book.nsheets
             for sheet_index in range(1, num_sheets):
                 logger.info('Processing sheet: %s', sheet_index)
                 with open(fp, 'rb') as sheet_file:
                     td = r.read(sheet_file, sheet_index)
                     self._process_sheet(td)
Beispiel #4
0
 def extract_dept_spend(self):
     """Map each row label to the last cell of its row.

     Reads sheet 1 of ``pesa_2008_chapter5_tables.xls`` and, for every row
     from index 4 up to but excluding the final row, stores
     ``row[0] -> row[-1]``.

     :return: dict keyed by the first cell of each row.
     """
     fp = retriever.filepath('pesa_2008_chapter5_tables.xls')
     # Parenthesised form prints the same text on Python 2 and Python 3.
     print(fp)
     r = T.XlsReader()
     sheet_index = 1
     # Binary mode for the binary .xls format; handle closed on exit.
     with open(fp, 'rb') as fileobj:
         td = r.read(fileobj, sheet_index)
         cells = td.data
     # The final row is skipped (the original note describes it as totals).
     # The value kept per row is its LAST cell, i.e. the last column is
     # retained rather than dropped.
     return {row[0]: row[-1] for row in cells[4:-1]}
Beispiel #5
0
 def extract_simple(self):
     """Extract labelled rows from sheet 2 of ``pesa0809chapter1.xls``.

     Rows from index 6 onwards whose second cell is truthy are kept; each
     contributes ``row[0] -> row[1:10]`` (nine values).

     :return: ``(entries, years)`` where ``entries`` is that dict and
         ``years`` is the list 2002..2010 inclusive.
     """
     fp = retriever.filepath('pesa0809chapter1.xls')
     # Parenthesised form prints the same text on Python 2 and Python 3.
     print(fp)
     r = T.XlsReader()
     sheet_index = 2
     # Binary mode for the binary .xls format; handle closed on exit.
     with open(fp, 'rb') as fileobj:
         td = r.read(fileobj, sheet_index)
         cells = td.data
     # An empty second cell marks a subheading row, which is skipped.
     entries = {row[0]: row[1:10] for row in cells[6:] if row[1]}
     # list() keeps the return type a real list on Python 3 as on Python 2.
     years = list(range(2002, 2011))
     return entries, years
Beispiel #6
0
    def department_and_function(self, order='department', levels=1):
        '''
        Build a two-level tree from sheet 1 of
        ``pesa_2008_chapter5_tables.xls`` and return it as JSON.

        Column labels are taken from row 3 and row labels from the first
        cell of rows 4 onwards; the last row and last column are excluded.
        Labels starting with 'of which' are skipped at both levels, and
        negative cell values are clamped to 0.

        @param order: value is 'department' or 'function' (determines ordering in
        tree). Any value other than 'department' is treated as 'function'.
        @param levels: no of levels to show (1 or 2)
        @return: JSON string of the root node (``simplejson.dumps`` with
        ``indent=2``).
        '''
        fp = retriever.filepath('pesa_2008_chapter5_tables.xls')
        # The reader is handed the file object at construction and read()
        # is called afterwards, so both stay inside the with-block; the
        # handle is closed once the cell data has been extracted.
        with open(fp, 'rb') as fileobj:
            r = T.XlsReader(fileobj)
            td = r.read(sheet_index=1)
            cells = td.data
        functions = [x.strip() for x in cells[3][1:-1]]
        depts = [row[0].strip() for row in cells[4:-1]]
        ourdata = [row[1:-1] for row in cells[4:-1]]

        def nodesum(nodes):
            # Each node carries its value under node['data']['$area'].
            return sum(node['data']['$area'] for node in nodes)

        rootchildren = []

        if order == 'department':
            labels1 = depts
            labels2 = functions
        else:
            labels1 = functions
            labels2 = depts
            # Transpose so that each row corresponds to one function.
            ourdata = list(zip(*ourdata))
        for label1, row in zip(labels1, ourdata):
            if label1.startswith('of which'):  # skip subfunctions
                continue
            children = []
            for cell, label2 in zip(row, labels2):
                if label2.startswith('of which'):  # skip subfunctions
                    continue
                # some have -ve numbers which mess stuff up ...
                val = max(0, int(cell))
                children.append(self.makenode(label2, val))
            node = self.makenode(label1, nodesum(children))
            if levels >= 2:
                node['children'] = children
            rootchildren.append(node)
        root = self.makenode('Total', nodesum(rootchildren))
        root['children'] = rootchildren
        import simplejson
        return simplejson.dumps(root, indent=2)
Beispiel #7
0
    def _extract_summary(self):
        """Collect the set title and table titles for every spreadsheet URL.

        For each entry of ``self.xls_urls`` the default sheet is read, the
        set title is taken from ``data[3][0]`` and one table title is taken
        from every later row whose first cell is truthy. The mapping is
        stored in ``self.info['summary']``, ``self.info`` is dumped as JSON
        to ``infopath``, and the mapping is returned.

        :return: dict ``url -> {'title': ..., 'tables': [...]}``.
        """
        def gettitle(row):
            # The title is in the second cell when that cell exists and is
            # non-empty, otherwise in the first cell.
            if len(row) >= 2 and row[1]:
                return row[1]
            return row[0]

        results = {}
        for url in self.xls_urls:
            logger.info(url)
            fp = retriever.filepath(url)
            # The reader is handed the file object at construction and
            # read() is called afterwards, so both stay inside the
            # with-block; the handle is closed once the rows are extracted.
            with open(fp, 'rb') as fileobj:
                r = T.XlsReader(fileobj)
                td = r.read()
                rows = td.data
            # 4th line has title
            set_title = rows[3][0]
            table_titles = [gettitle(row) for row in rows[4:] if row[0]]
            results[url] = {'title': set_title, 'tables': table_titles}
        self.info['summary'] = results
        # Close the output file explicitly so the JSON is flushed to disk.
        with open(infopath, 'w') as info_file:
            simplejson.dump(self.info, info_file, indent=4)
        return results