import os

from dumptruck import DumpTruck
from lxml.html import tostring

# Fragment: `html` (the parsed page) and `l` (a helper module providing
# parse_row) are defined elsewhere; the block through the `return` is
# excerpted from inside a page-parsing function.
trs = html.xpath(
    '//table[@style="border-collapse: collapse; width: 100%;"]/descendant::tr')

def do_row(tr):
    try:
        return l.parse_row(tr)
    except:
        print tostring(tr)  # show the row that failed to parse, then re-raise
        raise

return map(do_row, trs[2:])

# Schema
dt = DumpTruck(dbname='/tmp/finalip.db')
dt.create_table({u'DA Number': u'NAE-2009-01067'}, 'finalip', if_not_exists=True)
dt.create_index([u'DA Number'], 'finalip', unique=True, if_not_exists=True)

# Skip finished stuff
pages = set([(row['Year'], row['Month'], row['Page'])
             for row in dt.execute('SELECT Year, Month, Page FROM finalip')])

# Populate
for dirname, subdirnames, filenames in os.walk(
        os.path.join(os.environ['READER_ROOT'], '..', 'finalips')):
    if subdirnames != []:
        continue
    for filename in filenames:
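# A minimal sketch (assumptions: in-memory database, illustrative values;
# not part of the scraper above) of the DumpTruck pattern its schema block
# relies on: create_table() infers the column from an example row, and the
# unique index is what lets upsert() overwrite instead of duplicating.
from dumptruck import DumpTruck

_dt = DumpTruck(dbname=':memory:')
_dt.create_table({u'DA Number': u'NAE-2009-01067'}, 'finalip', if_not_exists=True)
_dt.create_index([u'DA Number'], 'finalip', unique=True, if_not_exists=True)
_dt.upsert({u'DA Number': u'NAE-2009-01067'}, 'finalip')
_dt.upsert({u'DA Number': u'NAE-2009-01067'}, 'finalip')  # replaces the first row
assert len(_dt.execute('SELECT * FROM finalip')) == 1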
def _connect(dbname='scraperwiki.sqlite'):
    'Initialize the database (again). This is mainly for testing.'
    global dt
    dt = DumpTruck(dbname=dbname, adapt_and_convert=False)
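# Usage sketch (assumed, for illustration): a test can repoint the
# module-level connection at a throwaway in-memory database before
# exercising the rest of the module.
_connect(dbname=':memory:')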
# coding: utf-8
from dumptruck import DumpTruck
import csv, os, glob

_here = os.path.split(__file__)[0]
store = DumpTruck(dbname="db/documents.db")

# Index existing files by a ten-character key sliced from the filename.
already = dict([(a[26:36], a) for a in [
    os.path.split(a)[1]
    for a in glob.glob("/home/martin/Dropbox/blackrock-scraper/data/*")
]])
already_downloaded = dict([(a[26:36], a) for a in [
    os.path.split(a)[1]
    for a in glob.glob(os.path.join(_here, "data/html/*"))
]])
print len(already_downloaded), len(already)

# List of all companies with the most recent report date and number of reports
dt = csv.DictWriter(
    open(os.path.join(_here, "data/tables", "ciks.csv"), "w"),
    ['act', 'num', 'cik', 'name', 'filename', 'already',
     'already_downloaded', 'exists', 'link'],
    delimiter=";")
# Header row; the labels are German: "Aktuellster Bericht" = "most recent
# report", "Anzahl der Berichte" = "number of reports".
dt.writerow(
    dict(act="Aktuellster Bericht", num="Anzahl der Berichte",
         cik="Central Index Key",
import logging, sys, os
from lxml import etree
import requests
from dumptruck import DumpTruck, Pickle

_here = os.path.split(__file__)[0]
store = DumpTruck(dbname=os.path.join(_here, "db/documents.db"))
parser = etree.HTMLParser()


def getTree(url):
    return etree.parse(url, parser)


logger = logging.getLogger(os.path.split(__file__)[1])
logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)


def get_nq_for_cik(cik):
    try:
        tree = getTree(
            "http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=N-Q%%25&dateb=&owner=include&start=0&count=40&output=atom" % cik)
    except Exception, e:
        logger.error("Error searching for CIK %s: %s" % (cik, e))
        return  # no tree to parse, so skip this CIK
    for entry in tree.xpath("//entry"):
        link = entry.xpath("link/@href")[0]
        date = entry.xpath("updated/text()")[0]
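# The fragment stops mid-loop; a hypothetical continuation (the table name
# `filings` and its columns are invented here, not from the source) would
# persist each entry through the DumpTruck store opened above, e.g.:
#
#         store.insert({'cik': cik, 'link': link, 'updated': date}, 'filings')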
import datetime

from dumptruck import DumpTruck


def main():
    dt = DumpTruck(dbname='metrics.db')
    # The example row gives DumpTruck the column names and types.
    dt.create_table({'portal': 'abc', 'date': datetime.date.today()}, 'series')
    dt.create_index(['portal', 'date'], 'series')
    dt.upsert(list(table()), 'series')
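# `table` is defined elsewhere in the script; a hypothetical stub (any
# column beyond the two shown above is invented) with the shape the
# create_table()/upsert() calls imply:
def table():
    yield {'portal': 'example.gov', 'date': datetime.date.today(), 'datasets': 0}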
def open_spider(self, spider):
    self.dt = DumpTruck(dbname=settings['DB_PATH'], auto_commit=True)
    # Preload the ids of auctions already in the database.
    id_data = self.dt.execute('SELECT id FROM auctions')
    self.ids = [x['id'] for x in id_data]
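# Hypothetical companion method (not in the source): a process_item that
# uses the preloaded ids to drop auctions the pipeline has already stored.
from scrapy.exceptions import DropItem

def process_item(self, item, spider):
    if item['id'] in self.ids:
        raise DropItem('auction %s already stored' % item['id'])
    self.dt.insert(dict(item), 'auctions')
    self.ids.append(item['id'])
    return item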
#!/usr/bin/env python2
import os, json
from dumptruck import DumpTruck

dt = DumpTruck(dbname='/tmp/catalog.db')

# Create the table with a composite primary key on (portal, identifier).
dt.execute('''
CREATE TABLE IF NOT EXISTS "catalog" (
    "portal" TEXT NOT NULL,
    "identifier" TEXT NOT NULL,
    PRIMARY KEY ("portal", "identifier")
);''')

for data_json in os.listdir('catalogs'):
    # Load into memory, skipping the first element.
    data = json.load(open(os.path.join('catalogs', data_json)))[1:]
    # Add the portal, taken from the file name.
    portal = data_json.replace('.json', '')
    for row in data:
        row['portal'] = portal
    # Put in the database.
    dt.insert(data, 'catalog')
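# Re-run note (an assumption, not in the original): because "catalog" has a
# composite primary key, running the loop a second time over the same files
# would raise sqlite3.IntegrityError on insert(). DumpTruck's upsert(),
# which issues INSERT OR REPLACE, makes the load idempotent:
#
#     dt.upsert(data, 'catalog')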
def _connect(dbname=DATABASE_NAME, timeout=DATABASE_TIMEOUT):
    'Initialize the database (again). This is mainly for testing.'
    global dt
    dt = DumpTruck(dbname=dbname, adapt_and_convert=False, timeout=timeout)
#!/usr/bin/env python
import json

from dumptruck import DumpTruck

dt = DumpTruck(dbname='applications.db')


def scott_data():
    # Total impacted acreage per parish, keyed the way the GeoJSON spells
    # county names (upper case, with "SAINT" abbreviated to "ST").
    sql = '''
    SELECT "parish", sum("acreage") AS 'acreage'
    FROM application
    WHERE "type" = 'impact' AND "parish" != ''
    GROUP BY "parish";
    '''
    return {
        row['parish'].upper().replace('SAINT', 'ST'): (row['parish'], row['acreage'])
        for row in dt.execute(sql)
    }


scott = scott_data()
parishes = json.load(open('parishes.json'))
max_impacted_acres = max([v[1] for v in scott.values()])
for feature in parishes['features']:
    feature['properties']['impacted_acres'] = scott.get(
        feature['properties']['COUNTY'], (None, 0))[1]
    feature['properties']['impacted_acres_prop_max'] = scott.get(
        feature['properties']['COUNTY'], (None, 0))[1] / max_impacted_acres
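# The snippet ends without persisting the result; a plausible final step
# (assumed; the output filename is invented) writes the enriched GeoJSON
# back out:
with open('parishes-impacted.json', 'w') as fp:
    json.dump(parishes, fp)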