Example #1
    trs = html.xpath(
        '//table[@style="border-collapse: collapse; width: 100%;"]/descendant::tr'
    )

    def do_row(tr):
        try:
            return l.parse_row(tr)
        except Exception:
            # Print the offending row's HTML before re-raising so the
            # failure is easy to locate.
            print tostring(tr)
            raise

    return map(do_row, trs[2:])


# Schema: DumpTruck infers the column types from this sample row.
dt = DumpTruck(dbname='/tmp/finalip.db')
dt.create_table({u'DA Number': u'NAE-2009-01067'},
                'finalip',
                if_not_exists=True)
dt.create_index(['DA Number'], 'finalip', unique=True, if_not_exists=True)

# Skip finished stuff
pages = set([(row['Year'], row['Month'], row['Page'])
             for row in dt.execute('SELECT Year, Month, Page FROM finalip')])

# Populate
for dirname, subdirnames, filenames in os.walk(
        os.path.join(os.environ['READER_ROOT'], '..', 'finalips')):
    if subdirnames:
        # Only process leaf directories (those with no subdirectories).
        continue
    for filename in filenames:
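
The loop body is cut off above. A minimal sketch of one plausible continuation, assuming a hypothetical parse_finalip_page helper that turns a saved page into row dicts:

        # Hypothetical continuation, not the original code: parse each saved
        # page and upsert its rows; the unique index on "DA Number" keeps
        # re-runs idempotent.
        with open(os.path.join(dirname, filename)) as fp:
            rows = parse_finalip_page(fp.read())  # assumed helper
        dt.upsert(rows, 'finalip')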
Example #2
def _connect(dbname='scraperwiki.sqlite'):
    'Initialize the database (again). This is mainly for testing'
    global dt
    dt = DumpTruck(dbname=dbname, adapt_and_convert=False)
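
A hedged usage sketch: a test can point the module-level connection at a scratch database before exercising it (the module name scraperlib is illustrative):

import scraperlib

# Use an in-memory database so tests leave no files behind.
scraperlib._connect(dbname=':memory:')
scraperlib.dt.execute('SELECT 1')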
Example #3
# coding: utf-8
from dumptruck import DumpTruck
import csv, os, glob

_here = os.path.split(__file__)[0]

store = DumpTruck(dbname="db/documents.db")

# Key previously scraped files on a fixed 10-character slice of the filename.
already = dict([(a[26:36], a) for a in [
    os.path.split(a)[1]
    for a in glob.glob("/home/martin/Dropbox/blackrock-scraper/data/*")
]])

already_downloaded = dict([(a[26:36], a) for a in [
    os.path.split(a)[1] for a in glob.glob(os.path.join(_here, "data/html/*"))
]])

print len(already_downloaded), len(already)
# List all companies with the most recent report date and the number of reports.
writer = csv.DictWriter(open(os.path.join(_here, "data/tables", "ciks.csv"), "w"),
                        [
                            'act', 'num', 'cik', 'name', 'filename', 'already',
                            'already_downloaded', 'exists', 'link'
                        ],
                        delimiter=";")
# Header row; the labels are German ("most recent report", "number of reports").
writer.writerow(
    dict(act="Aktuellster Bericht",
         num="Anzahl der Berichte",
         cik="Central Index Key",
Example #4
import logging, sys, os
from lxml import etree
import requests
from dumptruck import DumpTruck, Pickle

_here = os.path.split(__file__)[0]

store = DumpTruck(dbname=os.path.join(_here, "db/documents.db"))

parser = etree.HTMLParser()


def getTree(url):
    # Reuse the module-level HTML parser rather than building a new one.
    return etree.parse(url, parser)


logger = logging.getLogger(os.path.split(__file__)[1])
logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)


def get_nq_for_cik(cik):
    try:
        tree = getTree(
            "http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=N-Q%%25&dateb=&owner=include&start=0&count=40&output=atom"
            % cik)
    except Exception, e:
        logger.error("Error searching for CIK %s: %s" % (cik, e))
        return  # 'tree' is never bound on failure, so bail out here
    for entry in tree.xpath("//entry"):
        link = entry.xpath("link/@href")[0]
        date = entry.xpath("updated/text()")[0]
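
The loop is truncated above. A hedged sketch of how each Atom entry might be persisted with the store opened earlier; the filings table and its columns are assumptions:

        # Hypothetical continuation: one row per N-Q filing found for the CIK.
        store.insert({u'cik': cik, u'link': link, u'date': date}, 'filings')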
Example #5
import datetime
from dumptruck import DumpTruck


def main():
    dt = DumpTruck(dbname='metrics.db')
    dt.create_table({'portal': 'abc', 'date': datetime.date.today()}, 'series')
    # The index must be unique for upsert to replace rather than duplicate.
    dt.create_index(['portal', 'date'], 'series', unique=True)
    dt.upsert(list(table()), 'series')
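
The table() helper is elided. A minimal sketch of what the call assumes, a callable yielding one dict per (portal, date) observation; the field names and values are illustrative:

def table():
    # Hypothetical stand-in for the real metrics source.
    yield {'portal': 'example.gov', 'date': datetime.date(2013, 1, 1), 'count': 12}
    yield {'portal': 'example.gov', 'date': datetime.date(2013, 2, 1), 'count': 19}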
Example #6
    def open_spider(self, spider):
        # Open the database when the spider starts and cache the ids of
        # auctions already stored, so duplicates can be skipped later.
        self.dt = DumpTruck(dbname=settings['DB_PATH'], auto_commit=True)

        id_data = self.dt.execute('SELECT id FROM auctions')
        self.ids = [x['id'] for x in id_data]
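
A hedged sketch of the process_item method this open_spider implies; dropping duplicates via Scrapy's DropItem is an assumption, not shown in the original:

    def process_item(self, item, spider):
        from scrapy.exceptions import DropItem
        # Skip auctions already stored; otherwise insert and remember the id.
        if item['id'] in self.ids:
            raise DropItem('duplicate auction: %s' % item['id'])
        self.dt.insert(dict(item), 'auctions')
        self.ids.append(item['id'])
        return item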
Example #7
#!/usr/bin/env python2
import os, json
from dumptruck import DumpTruck

dt = DumpTruck(dbname='/tmp/catalog.db')

# Create the table with a composite primary key on (portal, identifier).
dt.execute('''
CREATE TABLE IF NOT EXISTS "catalog" (
  "portal" TEXT NOT NULL,
  "identifier" TEXT NOT NULL,
  PRIMARY KEY ("portal", "identifier")
);''')

for data_json in os.listdir('catalogs'):
    # Load into memory.
    data = json.load(open(os.path.join('catalogs', data_json)))[1:]

    # Add the portal.
    portal = data_json.replace('.json', '')
    for row in data:
        row['portal'] = portal

    # Put in the database; the primary key rejects duplicate (portal, identifier) pairs.
    dt.insert(data, 'catalog')
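
Because of the primary key, re-running the script raises sqlite3.IntegrityError for rows that are already loaded. A hedged variant that makes the load idempotent is DumpTruck's upsert, which replaces conflicting rows:

    # Idempotent alternative: replace rows that collide on (portal, identifier).
    dt.upsert(data, 'catalog')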
Example #8
def _connect(dbname=DATABASE_NAME, timeout=DATABASE_TIMEOUT):
    'Initialize the database (again). This is mainly for testing'
    global dt
    dt = DumpTruck(dbname=dbname, adapt_and_convert=False, timeout=timeout)
Example #9
#!/usr/bin/env python
import json
from dumptruck import DumpTruck
dt = DumpTruck(dbname='applications.db')


def scott_data():
    sql = '''
    SELECT "parish", sum("acreage") AS 'acreage'
    FROM application
    WHERE "type" = 'impact' AND "parish" != ''
    GROUP BY "parish";
    '''

    # Key on the uppercase parish name with 'SAINT' collapsed to 'ST' so the
    # keys match the GeoJSON COUNTY property used below.
    return {
        row['parish'].upper().replace('SAINT', 'ST'):
        (row['parish'], row['acreage'])
        for row in dt.execute(sql)
    }


scott = scott_data()
parishes = json.load(open('parishes.json'))

max_impacted_acres = max([v[1] for v in scott.values()])
for feature in parishes['features']:
    feature['properties']['impacted_acres'] = scott.get(
        feature['properties']['COUNTY'], (None, 0))[1]
    # float() guards against Python 2 integer division truncating to 0.
    feature['properties']['impacted_acres_prop_max'] = scott.get(
        feature['properties']['COUNTY'], (None, 0))[1] / float(max_impacted_acres)
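
A sketch of the step that presumably follows, writing the enriched GeoJSON back out; the output filename is an assumption:

with open('parishes-impacted.json', 'w') as fp:
    json.dump(parishes, fp)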