Ejemplo n.º 1
0
#!/usr/bin/env python
# data.py
# David Jones, [email protected], 2010-05-25
"""
Scrapes the output of the OCR'd documents received from HM Treasury, and
outputs machine readable POG data.
"""

# http://docs.python.org/release/2.5.4/lib/module-csv.html
import csv
# http://docs.python.org/release/2.5.4/lib/module-re.html
import re
# okfn
import swiss
# swiss.Cache rooted at the relative path 'cache' (presumably the local
# directory where downloaded files are kept -- confirm against swiss.Cache).
cache = swiss.Cache('cache')

# Zip archive hosted on archive.org; presumably the OCR'd source documents
# described in the module docstring -- its use is not visible from here.
url = "http://www.archive.org/download/PogAndPo_138/POG-to-PO.zip"

# Various regular expressions used by the parser.

# RE for removing OCR dirt from the beginnings or ends of lines:
# Extremely pragmatic.
# `dirt` matches a single junk token: an optional word character followed
# by a run of non-word characters, or one lone word character, or the
# stray word "to".  Raw strings are used so that `\w` and `\s` reach the
# regex engine untouched instead of being read as (invalid) string escapes.
dirt = r'(\w?[^\w]+|\w|to)'
# Either a junk token at the start of the line (followed by spaces or the
# end of the line), or whitespace plus a junk token at the end of the line.
dirtre = '(^' + dirt + r'($| +))|(\s+' + dirt + '$)'

# Regular expression for 10 character code.  Example:
# "P01 S100101"
# (These are POG codes, but this program doesn't really care).
# The space is optional and "5" is accepted in place of "S".
code10re = r'P\w\w ?[S5]\w{6}'
# RE for 8 character code.  Example:
# "P0110001"
Ejemplo n.º 2
0
	def command(self):
		"""Load the configuration, prepare cache and logger, then fetch every argument.

		The cache directory comes from the ``cache_dir`` config entry and
		falls back to ``"data"`` when that entry is absent.
		"""
		self.parse_config(self.options.config_file)
		cache_dir = self.config.get("cache_dir", "data")
		self.cache = swiss.Cache(cache_dir)
		self.log = logging.getLogger("fetch")
		# Each positional argument is handed to fetch() in turn.
		for target in self.args:
			self.fetch(target)
Ejemplo n.º 3
0
import os
import shutil

import swiss

# Filesystem path handed to swiss.Cache for the test storage.
storage_path = '/tmp/pdfator-test/storage'
# Cache object that get_text() queries through storage.stream().
storage = swiss.Cache(storage_path)
from collections import deque


class Queue(deque):
    """A ``collections.deque`` under its own name; adds no behaviour."""


# Module-level queue instance; get_text() appends names to it when no
# stored stream is available yet.
queue = Queue()


def get_text(name):
    """Return the stored stream for *name*, or enqueue it and return a notice.

    The lookup goes through the module-level ``storage``.  When it yields
    ``None`` the name is appended to the module-level ``queue`` and a
    human-readable "check back later" message is returned instead.
    """
    # There is no name -> id mapping yet, so the name is the storage key.
    key = name
    found = storage.stream(key)
    if found is not None:
        return found
    queue.append(name)
    return 'No PDF text yet, added to queue, please check back in 10m'


class TestItAll:
    name = '1609'
Ejemplo n.º 4
0
# Spreadsheet that Extractor.execute() retrieves.
url = 'http://www.econ.yale.edu/~shiller/data/ie_data.xls'
# Path passed to swiss.Cache below; the name is rebound to the Cache object.
cache = 'cache'

import swiss
import swiss.tabular
# From here on `cache` is the swiss.Cache instance, not the path string.
cache = swiss.Cache(cache)


class Extractor(object):
    """Fetch the spreadsheet at the module-level ``url`` and export it as CSV."""

    def execute(self):
        """Retrieve, tidy and write the spreadsheet to ``data.csv``.

        Steps:
          * read the workbook retrieved through the module-level ``cache``;
          * build the header by joining the pieces found in rows 5-8;
          * keep the rows after the headings, minus the final row;
          * overwrite column 0 with column 5 (the date fraction), then drop
            column 5 and the last column, from data and header alike;
          * write the result to ``data.csv`` in the current directory.
        """
        fp = cache.retrieve(url)
        reader = swiss.tabular.XlsReader(fp)
        tabdata = reader.read()
        # clean up data
        data = tabdata.data
        # headings spread across rows 5-8
        headings = zip(*data[4:8])
        tabdata.header = [' '.join(cols).strip() for cols in headings]
        data = tabdata.data[8:-1]
        # list(): the columns are indexed and deleted below, which needs a
        # real list (zip() is lazy on Python 3; this is a no-op on Python 2).
        transposed = list(zip(*data))
        # get rid of odd date e.g. 1871.01 and replace with date fraction
        fraction = transposed[5]
        del transposed[5]
        del transposed[-1]
        transposed[0] = fraction
        tabdata.data = list(zip(*transposed))
        # Keep the header aligned with the two columns removed above.
        del tabdata.header[5]
        del tabdata.header[-1]
        writer = swiss.tabular.CsvWriter()
        # Context manager guarantees the CSV is flushed and closed.
        with open('data.csv', 'w') as out:
            writer.write(tabdata, out)