Example #1
  def test_create_if_exists(self):
    dt = DumpTruck(dbname = '/tmp/test.db')
    dt.execute('create table pineapple (bar integer, baz integer);')
    dt.create_index(['bar', 'baz'], 'pineapple')

    with self.assertRaises(sqlite3.OperationalError):
      dt.create_index(['bar', 'baz'], 'pineapple', if_not_exists = False)
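A note on the default: create_index apparently defaults to if_not_exists=True, so re-creating an existing index is a quiet no-op rather than an error. A minimal sketch under that assumption (the papaya table and test name are made up for illustration):

  def test_create_default(self):
    dt = DumpTruck(dbname = '/tmp/test.db')
    dt.execute('create table papaya (bar integer, baz integer);')
    dt.create_index(['bar', 'baz'], 'papaya')

    # Presumed default if_not_exists=True: no sqlite3.OperationalError here.
    dt.create_index(['bar', 'baz'], 'papaya')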
Example #2
  def test_non_unique(self):
    dt = DumpTruck(dbname = '/tmp/test.db')
    dt.execute('create table tomato (bar integer, baz integer);')
    dt.create_index(['bar', 'baz'], 'tomato')
    observed = dt.execute('PRAGMA index_info(tomato_bar_baz)')

    # Indexness
    self.assertIsNotNone(observed)

    # Indexed columns
    expected = [
      {u'seqno': 0, u'cid': 0, u'name': u'bar'},
      {u'seqno': 1, u'cid': 1, u'name': u'baz'},
    ]
    self.assertListEqual(observed, expected)

    # Uniqueness
    indices = dt.execute('PRAGMA index_list(tomato)')
    for index in indices:
      if index[u'name'] == u'tomato_bar_baz':
        break
    else:
      index = {}

    self.assertEqual(index[u'unique'], 0)
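The same PRAGMA bookkeeping can verify the unique flag: an index created with unique=True should report unique = 1 in PRAGMA index_list. A minimal sketch under that assumption (the cucumber table and test name are made up):

  def test_unique(self):
    dt = DumpTruck(dbname = '/tmp/test.db')
    dt.execute('create table cucumber (bar integer, baz integer);')
    dt.create_index(['bar', 'baz'], 'cucumber', unique = True)

    # Locate the index by its generated name and check SQLite's unique flag.
    indices = dt.execute('PRAGMA index_list(cucumber)')
    for index in indices:
      if index[u'name'] == u'cucumber_bar_baz':
        break
    else:
      index = {}

    self.assertEqual(index[u'unique'], 1)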
Example #3
    def test_create_if_exists(self):
        dt = DumpTruck(dbname="/tmp/test.db")
        dt.execute("create table pineapple (bar integer, baz integer);")
        dt.create_index(["bar", "baz"], "pineapple")

        with self.assertRaises(sqlite3.OperationalError):
            dt.create_index(["bar", "baz"], "pineapple", if_not_exists=False)
Example #4
    def test_create_if_not_exists(self):
        dt = DumpTruck(dbname="/tmp/test.db")
        dt.execute("create table mango (bar integer, baz integer);")
        dt.create_index(["bar", "baz"], "mango")

        # This should not raise an error.
        dt.create_index(["bar", "baz"], "mango", if_not_exists=True)
Example #5
  def test_create_if_not_exists(self):
    dt = DumpTruck(dbname = '/tmp/test.db')
    dt.execute('create table mango (bar integer, baz integer);')
    dt.create_index(['bar', 'baz'], 'mango')

    # This should not raise an error.
    dt.create_index(['bar', 'baz'], 'mango', if_not_exists = True)
Example #6
def apis():
    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit = False)
    dt.create_table({'catalog':'abc.def'}, 'socrata_apis')
    dt.create_index(['catalog'], 'socrata_apis', unique = True, if_not_exists = True)

    socrata_catalogs = filter(lambda x: x[0] == 'socrata', catalogs())
    for _, catalog in socrata_catalogs:
        dt.upsert({
            'catalog': catalog.split('://')[-1],
            'apis': count_apis(catalog),
        }, 'socrata_apis')
Example #7
def main():
    edges = build_network()['edges']

    dt = DumpTruck(dbname = '/tmp/open-data.sqlite', adapt_and_convert = True)
    datasets_in = dt.execute('SELECT * FROM socrata')

    dt.create_table({'id': 'blah-blah'}, 'socrata_deduplicated')
    dt.create_index(['id'], 'socrata_deduplicated', if_not_exists = True, unique = True)

    for dataset in dedupe(datasets_in, edges):
        dt.upsert(dataset, 'socrata_deduplicated')
Example #8
def to_sqlite3():
    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit = False)

    dummyrow = dict(zip(['software','catalog','identifier'], ['blah']*3))
    dt.create_table(dummyrow, 'datasets', if_not_exists = True)
    dt.create_index(['software','catalog','identifier'], 'datasets', if_not_exists = True, unique = True)

    for table in ['ckan','socrata']:
        dt.create_table({'catalog':'blah','identifier':'blah'}, table, if_not_exists = True)
        dt.create_index(['catalog','identifier'], table, if_not_exists = True, unique = True)

    dt.create_table({'view_id':'abc','table_id':123}, 'socrata_tables')
    dt.create_index(['view_id'], 'socrata_tables', if_not_exists = True, unique = True)
    dt.create_index(['table_id'], 'socrata_tables', if_not_exists = True)

    for dataset in datasets():
        row = {
            'software': dataset['software'],
            'catalog': dataset['catalog'],
            'identifier': dataset[SOFTWARE_MAP['identifier'][dataset['software']]],
        }
        sql = 'SELECT * FROM datasets WHERE software = ? AND catalog = ? AND identifier = ?'
        if dt.execute(sql, [row['software'],row['catalog'],row['identifier']]) != []:
            continue
        dt.upsert(row, 'datasets')
        if dataset['software'] == 'socrata':
            socrata_table = {
                'view_id': row['identifier'],
                'table_id': dataset['tableId'],
            }
            dt.upsert(socrata_table, 'socrata_tables')
        dt.upsert(dataset, dataset['software'])
        dt.commit()
Example #9
def extract_dataset_table_info():
    dt = DumpTruck(dbname = '/tmp/table_info.db')
    dt.create_table({'portal': 'abc', 'id': 'abcd-efgh'}, 'table_info')
    dt.create_index(['portal', 'id'], 'table_info', unique = True)
    dt.create_index(['tableId'], 'table_info', unique = False)
    done = set((row['portal'], row['id']) for row in dt.execute('SELECT portal, id FROM table_info'))
    for portal in os.listdir('data'):
        for viewid in os.listdir(os.path.join('data', portal, 'views')):
            if (portal, viewid) in done:
                continue
            d = _dataset_table_info(portal, viewid)
            if d is None:
                continue
            dt.upsert(d, 'table_info')
Example #10
def users():
    dt = DumpTruck(dbname = '/tmp/socrata.db')
    dt.create_table({'id': 'abcd-efgh'}, 'user')
    dt.create_index(['id'], 'user', unique = True)

    _users = {}
    for portal in os.listdir('data'):
        for viewid in os.listdir(os.path.join('data', portal, 'views')):
            with open(os.path.join('data', portal, 'views', viewid), 'r') as handle:
                try:
                    view = json.load(handle)
                except ValueError:
                    # Skip files that do not contain valid JSON.
                    continue

            if view['owner']['id'] in _users:
                _users[view['owner']['id']]['views'].add(view['id'])
                try:
                    _users[view['owner']['id']]['publicationDates'].add((view['id'], view['publicationDate']))
                except KeyError:
                    # Debugging escape hatch: surface the offending view.
                    return view
            else:
                _users[view['owner']['id']] = view['owner']
                _users[view['owner']['id']]['views'] = {view['id']}
                _users[view['owner']['id']]['tables'] = set()
                _users[view['owner']['id']]['publicationDates'] = set()

            if view['tableAuthor']['id'] in _users:
                _users[view['tableAuthor']['id']]['tables'].add(view['tableId'])
            else:
                _users[view['tableAuthor']['id']] = view['tableAuthor']
                _users[view['tableAuthor']['id']]['views'] = set()
                _users[view['tableAuthor']['id']]['tables'] = {view['tableId']}
                _users[view['tableAuthor']['id']]['publicationDates'] = set()

    for uid in _users.keys():
        for key in ['views', 'rights', 'tables']:
            if key in _users[uid]:
                _users[uid]['n_' + key] = len(_users[uid][key])
                del _users[uid][key]

    dt.insert(list(_users.values()), 'user')
    for uid, user in _users.items():
        for viewid, publicationDate in user['publicationDates']:
            dt.insert({'userid': user['id'], 'viewid': viewid, 'publicationDate': publicationDate}, 'publications', commit = False)
    dt.commit()
    return _users
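Example #10 batches its per-publication inserts with commit=False and finishes with a single dt.commit(); several of the scrapers here construct DumpTruck with auto_commit=False for the same effect. A minimal sketch of that batching pattern (the numbers table and row data are made up):

def batch_insert():
    dt = DumpTruck(dbname = '/tmp/batch.db', auto_commit = False)
    for n in range(1000):
        dt.insert({'n': n}, 'numbers', commit = False)
    dt.commit()  # one transaction instead of one per row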
Example #11
def get_links(softwares = ['ckan','socrata']):
    dt = DumpTruck('/tmp/open-data.sqlite')

    dummyrow = dict(zip(
        ['software', 'catalog', 'identifier', 'status_code', 'headers', 'error'],
        ['blah'] * 3 + [234] + [{'a': 'b'}] * 2,
    ))
    dt.create_table(dummyrow, 'links', if_not_exists = True)
    dt.create_index(['software','catalog','identifier'], 'links', if_not_exists = True, unique = True)

    for software in softwares:
        for catalog in read.catalogs(software):
            if SOCRATA_FIX.get(catalog, 'this is a string, not None') is None:
                continue
            try:
                for row in _check_catalog(software, catalog):
                    dt.upsert(row, 'links')
            except:
                print(os.path.join('downloads',software,catalog))
                raise
Example #12
def check_links():
    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit = False)
    dt.create_index(['url'], 'links', if_not_exists = True, unique = False)
    dt.create_index(['status_code'], 'links', if_not_exists = True, unique = False)

    # Source
    urls = Queue()
    sql = '''
SELECT DISTINCT url
FROM links
WHERE (status_code = -42 OR status_code IS NULL) AND is_link AND url NOT NULL
ORDER BY status_code, substr(url, 30, 100);
'''
    # Order by the substring so that we randomly bounce around catalogs

    url_list = [row['url'] for row in dt.execute(sql)]
    for url in url_list:
        urls.put(url)

    # Sink to the database
    def _db(queue):
        dt = DumpTruck('/tmp/open-data.sqlite')
        while True:
            dt.execute(*queue.get())
    db_updates = Queue()
    db_thread = Thread(None, target = _db, args = (db_updates,))
    db_thread.start()

    # Check links
    def _check_link(url_queue):
        while not url_queue.empty():
            url = url_queue.get()
            if url is None:
                raise ValueError('url is None')
            status_code, headers, error = links.is_alive(url)
            sql = 'UPDATE links SET status_code = ?, headers = ?, error = ? WHERE is_link = 1 AND url = ?'
            db_updates.put((sql, (status_code, headers, error, url)))
            print(url)

    threads = {}
    for i in range(100):
        threads[i] = Thread(None, target = _check_link, args = (urls,))

    for thread in threads.values():
        thread.start()
Example #13
    def test_non_unique(self):
        dt = DumpTruck(dbname="/tmp/test.db")
        dt.execute("create table tomato (bar integer, baz integer);")
        dt.create_index(["bar", "baz"], "tomato")
        observed = dt.execute("PRAGMA index_info(tomato_bar_baz)")

        # Indexness
        self.assertIsNotNone(observed)

        # Indexed columns
        expected = [{u"seqno": 0, u"cid": 0, u"name": u"bar"}, {u"seqno": 1, u"cid": 1, u"name": u"baz"}]
        self.assertListEqual(observed, expected)

        # Uniqueness
        indices = dt.execute("PRAGMA index_list(tomato)")
        for index in indices:
            if index[u"name"] == u"tomato_bar_baz":
                break
        else:
            index = {}

        self.assertEqual(index[u"unique"], 0)
Example #14
    def do_row(tr):
        try:
            return l.parse_row(tr)
        except:
            print(tostring(tr))
            raise

    return map(do_row, trs[2:])


# Schema
dt = DumpTruck(dbname='/tmp/finalip.db')
dt.create_table({u'DA Number': u'NAE-2009-01067'},
                'finalip',
                if_not_exists=True)
dt.create_index(['DA Number'], 'finalip', unique=True, if_not_exists=True)

# Skip finished stuff
pages = set([(row['Year'], row['Month'], row['Page'])
             for row in dt.execute('SELECT Year, Month, Page FROM finalip')])

# Populate
for dirname, subdirnames, filenames in os.walk(
        os.path.join(os.environ['READER_ROOT'], '..', 'finalips')):
    if subdirnames != []:
        continue
    for filename in filenames:
        year, month = map(int, dirname.split('/')[-2:])
        page = (year, month, filename)
        if page in pages:
            continue
Example #15
def read_finalip(path):
    html = parse(path)
    trs = html.xpath('//table[@style="border-collapse: collapse; width: 100%;"]/descendant::tr')
    def do_row(tr):
        try:
            return l.parse_row(tr)
        except:
            print(tostring(tr))
            raise
    return map(do_row, trs[2:])

# Schema
dt = DumpTruck(dbname = '/tmp/finalip.db')
dt.create_table({u'DA Number': u'NAE-2009-01067'}, 'finalip', if_not_exists = True)
dt.create_index(['DA Number'], 'finalip', unique = True, if_not_exists = True)

# Skip finished stuff
pages = set([(row['Year'], row['Month'], row['Page']) for row in dt.execute('SELECT Year, Month, Page FROM finalip')])

# Populate
for dirname, subdirnames, filenames in os.walk(os.path.join(os.environ['READER_ROOT'], '..', 'finalips')):
    if subdirnames != []:
        continue
    for filename in filenames:
        year, month = map(int, dirname.split('/')[-2:])
        page = (year, month, filename)
        if page in pages:
            continue

        path = os.path.join(dirname, filename)
Example #16
    # We initialize DumpTruck with dummy data that won't be inserted.
    dump_truck.create_table(
        {
            "stored_filename": "filename.20130227205616.tgz",
            "size": 1,
            "metadata": {"is_enc": False},
            "backup_date": 1361994976,
            "filename": "filename",
            "backend": "s3",
            "is_deleted": False,
            "last_updated": 1361994976,
            "tags": [],
            "backend_hash": "backendhash",
        },
        "backups",
    )
    dump_truck.create_index(["stored_filename"], "backups", unique=True)

if not "inventory" in dump_truck.tables():
    dump_truck.create_table({"filename": "filename", "archive_id": "glacier-archive-id"}, "inventory")
    dump_truck.create_index(["filename"], "inventory", unique=True)

if not "jobs" in dump_truck.tables():
    dump_truck.create_table({"filename": "filename", "job_id": "job_id"}, "jobs")
    dump_truck.create_index(["filename"], "jobs", unique=True)

if not "config" in dump_truck.tables():
    dump_truck.save_var("client_id", "")
    dump_truck.save_var("sync_ts", 0)
    dump_truck.save_var("tags", set())
Example #17
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16677&mon=jul", "Latest releases"],
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16676&mon=aug", "Latest releases"],
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16678&mon=sep", "Latest releases"],
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16679&mon=oct", "Latest releases"],
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16680&mon=nov", "Latest releases"],
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/10804.146.html", "Archive releases"], # 2005
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/10805.145.html", "Archive releases"], # 2006
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/10806.144.html", "Archive releases"], # 2007
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/10807.143.html", "Archive releases"], # 2008
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/13342.html", "Archive releases"], # 2009
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/13661.html", "Archive releases"], # 2010
    ["http://www.scotlandoffice.gov.uk/scotlandoffice/15263.html", "Archive releases"], # 2011
]

dt = DumpTruck(dbname="scotland.db")
dt.create_table({"Title": "",
                 "Publication date": "",
                 "Old URL": "",
                 "Summary": "",
                 "Attachments": "",
                 "Type": "",
                 "Associated organisations": ""}, "publications")
dt.create_index(["Title", "Old URL"], "publications", unique=True)

for url, page_type in URLS:
    for publication in scrape_list_page(url):
        publication['Type'] = page_type
        dt.upsert(publication, "publications")

dumptruck_to_csv(dt, "publications", "/home/http/scotland/publications.csv")
Example #18
def main():
    dt = DumpTruck(dbname='metrics.db')
    dt.create_table({'portal': 'abc', 'date': datetime.date.today()}, 'series')
    dt.create_index(['portal', 'date'], 'series')
    dt.upsert(list(table()), 'series')
Example #19
from common import dumptruck_to_csv

URL = "http://www.scotlandoffice.gov.uk/scotlandoffice/14463.363.html"

req = requests.get(URL)
doc = lxml.html.fromstring(req.text)

dt = DumpTruck(dbname="scotland.db")
dt.create_table({"Title": "",
                 "Publication date": "",
                 "Old URL": "",
                 "Body": "",
                 "Attachments": "",
                 "Associated organisations": "",
                 "Associated Document Series": ""}, "statistics")
dt.create_index(["Title", "Old URL"], "statistics", unique=True)

for link in doc.xpath("//div[@class='wrapper']/ul/li/a"):
    series_title, series_url = link.text, urlparse.urljoin(URL, link.attrib["href"])
    print(series_title)

    series_req = requests.get(series_url)
    series_doc = lxml.html.fromstring(series_req.text)

    for table_line in series_doc.xpath("//tr[not(@bgcolor) or @bgcolor!='#004093']"):
        file_pub_date = table_line.xpath("./td[3]")[0].text

        for file_node in table_line.xpath("./td[2]//a"):
            file_title = etree.tostring(file_node, method="text", encoding="utf8")
            file_link = file_node.attrib["href"]
            if not file_link.startswith("http"):
Example #20
def scrape_main_article(url):
    req = requests.get(url)
    doc = lxml.html.fromstring(req.text)

    div = doc.xpath("//*[@class='wrapper']")[0]
    div.remove(div.find("h1"))
    for para in div.findall("p"):
        if para.find("strong") is not None:
            div.remove(para)
    return htmlize(etree.tostring(div))

dt = DumpTruck(dbname="scotland.db")
dt.create_table({"Title": "",
                 "Publication date": "",
                 "Old URL": "",
                 "Summary": "",
                 "Body": "",
                 "Associated organisations": ""}, "news")
dt.create_index(["Title", "Old URL"], "news", unique=True)

for url in URLS:
    for news_item in scrape_list_page(url):
        attachments = json.loads(news_item.pop("Attachments"))
        link = attachments[0]["link"]
        news_item["Old URL"] = link
        news_item["Body"] = scrape_main_article(link)
        dt.upsert(news_item, "news")

dumptruck_to_csv(dt, "news", "/home/http/scotland/news.csv")
Example #21
def main():
    dt = DumpTruck(dbname = 'metrics.db')
    dt.create_table({'portal': 'abc', 'date': datetime.date.today()}, 'series')
    dt.create_index(['portal', 'date'], 'series')
    dt.upsert(list(table()), 'series')
Example #22
from dumptruck import DumpTruck
import random
import os, sys
import logging
from sqlite3 import OperationalError
import datetime
import copy, types, time

logging.basicConfig(level=logging.DEBUG, stream=sys.stdout,
                    format='%(levelname)s %(asctime)-15s %(filename)s %(lineno)s %(message)s')
logger = logging.getLogger(os.path.split(__file__)[1])

_here = os.path.split(__file__)[0]

dt = DumpTruck(dbname=os.path.join(_here, "data/events.sqlite"))
try:
    dt.create_index(["id"], "events", unique=True)
    dt.create_index(["url"], "events", unique=True)
except OperationalError:
    # The 'events' table does not exist yet on the first run.
    pass

def scrape_now():
    nn = 0
    for l in listevents():
        try:
            # Parameterized query: safer than interpolating scraped URLs.
            d = dt.execute("select count(*) as c from events where url = ?", [l["url"]])
        except OperationalError:
            d = [{"c": 0}]
        if d[0]["c"] == 0:
            l.update(eventdata(l["url"]))
            logger.info("found new %s %s" % (l.get("id", l.get("threadid", "")), l.get("title", "")))
Example #23
    'http://www.nwo.usace.army.mil',
    'http://www.nws.usace.army.mil',
    'http://www.nww.usace.army.mil',
    'http://www.pof.usace.army.mil',
    'http://www.poj.usace.army.mil',
    'http://www.saw.usace.army.mil',
    'http://www.spa.usace.army.mil',
    'http://www.spk.usace.army.mil',
    'http://www.spl.usace.army.mil',
    'http://www.swf.usace.army.mil',
    'http://www.swg.usace.army.mil',
    'http://www.tam.usace.army.mil',
}

if __name__ == '__main__':
    dt = DumpTruck(dbname = 'usace.db')
    dt.create_table({'permit_application_number': 'abcd'}, 'notice')
    dt.create_index(['permit_application_number'], 'notice')
    for division in parse.locations(get('http://www.usace.army.mil/Locations.aspx')):
        for district in division['districts']:
            domain = re.sub(r'.usace.army.mil.*$', '.usace.army.mil', district['href'])
            path = '/Missions/Regulatory/PublicNotices.aspx'
            if domain in SKIPPED_DISTRICTS:
                continue

            pn_list = None
            while pn_list is None or pn_list['last_page'] > pn_list['current_page']:
                pn_list = parse.public_notice_list(get(domain + path))
                dt.upsert(list(pn_list['notices']), 'notice')

Example #24
#!/usr/bin/env python

from dumptruck import DumpTruck

dt = DumpTruck(dbname='scotland.db')
dt.create_table({'title': '', 'date': ''}, 'publications')
dt.create_index(['title'], 'publications', unique=True)


dt.upsert({'title': 'one', 'date': 'today'}, 'publications')
dt.upsert({'title': 'one', 'date': 'yesterday'}, 'publications')

data = dt.execute('SELECT * FROM `publications`')
print(data)
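Because of the unique index on title, the second upsert presumably replaces the first row rather than inserting a duplicate (dumptruck's upsert appears to map onto SQLite's INSERT OR REPLACE), so the SELECT should print a single row with date 'yesterday'.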