#!/usr/bin/env python
# data.py
# David Jones, [email protected], 2010-05-25

"""
Scrapes the output of the OCR'd documents received from HM Treasury,
and outputs machine readable POG data.
"""

# http://docs.python.org/release/2.5.4/lib/module-csv.html
import csv
# http://docs.python.org/release/2.5.4/lib/module-re.html
import re

# okfn
import swiss

# Cache object created with the argument 'cache' (presumably the cache
# directory -- confirm against the swiss documentation).
cache = swiss.Cache('cache')
# URL of the POG-to-PO zip archive.
url = "http://www.archive.org/download/PogAndPo_138/POG-to-PO.zip"

# Various regular expressions used by the parser.
# All patterns are raw strings so that \w and \s reach the regex engine
# untouched instead of being parsed as (invalid) Python string escapes.

# RE for removing OCR dirt from the beginnings or ends of lines:
# Extremely pragmatic.
# A piece of "dirt" is one of: an optional word character followed by one
# or more non-word characters; a single word character; or the text "to".
dirt = r'(\w?[^\w]+|\w|to)'
# Dirt at the start of a line (followed by end-of-line or spaces), or
# whitespace followed by dirt at the end of a line.
dirtre = '(^' + dirt + r'($| +))|(\s+' + dirt + '$)'

# Regular expression for 10 character code. Example:
# "P01 S100101"
# (These are POG codes, but this program doesn't really care).
# The space is optional and "5" is accepted in place of "S".
code10re = r'P\w\w ?[S5]\w{6}'

# RE for 8 character code. Example:
# "P0110001"
def command(self):
    """Run the command: load configuration, then fetch every argument.

    Reads the config file named by ``self.options.config_file``, creates
    ``self.cache`` from the ``cache_dir`` config entry (falling back to
    ``"data"``), attaches the ``"fetch"`` logger as ``self.log``, and
    finally calls ``self.fetch`` once per entry in ``self.args``.
    """
    self.parse_config(self.options.config_file)

    cache_dir = self.config.get("cache_dir", "data")
    self.cache = swiss.Cache(cache_dir)
    self.log = logging.getLogger("fetch")

    for target in self.args:
        self.fetch(target)
import os
import shutil
from collections import deque

import swiss

# Cache object created from the test storage path.
storage_path = '/tmp/pdfator-test/storage'
storage = swiss.Cache(storage_path)


class Queue(deque):
    """A ``deque`` subclass that adds no behaviour of its own."""


# Names for which no stream was available yet.
queue = Queue()


def get_text(name):
    """Return the stored stream for ``name``, or queue it and return a notice.

    ``storage.stream`` is asked for the entry keyed by ``name``.  When it
    returns ``None`` the name is appended to ``queue`` and a
    human-readable "check back later" message is returned instead.
    """
    # No name-to-id mapping is applied; the name itself is the storage key.
    key = name
    text_stream = storage.stream(key)
    if text_stream is not None:
        return text_stream

    queue.append(name)
    return 'No PDF text yet, added to queue, please check back in 10m'


class TestItAll:
    name = '1609'
url = 'http://www.econ.yale.edu/~shiller/data/ie_data.xls'
cache = 'cache'

import swiss
import swiss.tabular

# ``cache`` is rebound here from the string above to the cache object
# built from it.
cache = swiss.Cache(cache)


class Extractor(object):
    """Turn the spreadsheet at ``url`` into a cleaned-up ``data.csv``."""

    def execute(self):
        """Retrieve the spreadsheet, tidy its header and columns, write CSV.

        The result is written to ``data.csv`` in the current working
        directory; nothing is returned.
        """
        fp = cache.retrieve(url)
        reader = swiss.tabular.XlsReader(fp)
        tabdata = reader.read()

        # clean up data
        rows = tabdata.data
        # headings spread across rows 5-8: join each column's four heading
        # cells into a single header string.
        headings = zip(*rows[4:8])
        tabdata.header = [' '.join(cols).strip() for cols in headings]

        # The remaining rows, minus the final one, are the data.  zip() is
        # wrapped in list() because the columns are indexed, deleted and
        # reassigned below, which a Python 3 zip iterator does not support.
        transposed = list(zip(*rows[8:-1]))

        # get rid of odd date e.g. 1871.01 and replace with date fraction
        fraction = transposed[5]
        del transposed[5]
        del transposed[-1]
        transposed[0] = fraction
        tabdata.data = list(zip(*transposed))

        # Keep the header in step with the columns removed above.
        del tabdata.header[5]
        del tabdata.header[-1]

        writer = swiss.tabular.CsvWriter()
        # Context manager so the output file is flushed and closed even if
        # writing fails.
        with open('data.csv', 'w') as out:
            writer.write(tabdata, out)