Example #1
0
 def setUp(self):
     self.cleanUp()
     h = DumpTruck(dbname=u"/tmp/test.db")
     h.save_var(u"birthday", u"November 30, 1888")
     h.close()
     connection = sqlite3.connect(u"/tmp/test.db")
     self.cursor = connection.cursor()
Example #2
0
 def setUp(self):
   self.cleanUp()
   h = DumpTruck(dbname = u'/tmp/test.db')
   h.save_var(u'birthday', u'November 30, 1888')
   h.close()
   connection=sqlite3.connect(u'/tmp/test.db')
   self.cursor=connection.cursor()
Example #3
0
def build_table_from_db(dbname = '/tmp/table_info.db'):
    dt = DumpTruck(dbname = dbname)
    tableIds = [row['tableId'] for row in dt.execute('''
SELECT tableId, count(*)
FROM table_info
GROUP BY tableId
ORDER BY count(*) DESC
limit 10;
''')]

    try:
        os.mkdir('geneology')
    except OSError:
        pass
    json.dump(tableIds, open(os.path.join('geneology', 'index.json'), 'w'))

    for tableId in tableIds:
        result = {
            'source': dt.execute('SELECT * FROM table_info WHERE tableId = ? ORDER BY createdAt ASC LIMIT 1', [tableId])[0],
            'datasets': {},
        }
        for dataset in dt.execute('SELECT * FROM table_info WHERE tableId = ?', [tableId]):
            if dataset['id'] not in result['datasets']:
                result['datasets'][dataset['id']] = dataset
                result['datasets'][dataset['id']]['portals'] = []

            result['datasets'][dataset['id']]['portals'].append(dataset['portal'])

        result['datasets'] = result['datasets'].values()
        for dataset in result['datasets']:
            del dataset['portal']
        json.dump(result, open(os.path.join('geneology', '%d.json' % tableId), 'w'))
Example #4
0
 def test_select(self):
   shutil.copy(u'fixtures/landbank_branches.sqlite', u'.')
   h = DumpTruck(dbname = u'landbank_branches.sqlite')
   data_observed = h.execute(u'SELECT * FROM `branches` WHERE Fax is not null ORDER BY Fax LIMIT 3;')
   data_expected = [{'town': u'\r\nCenturion', 'date_scraped': 1327791915.618461, 'Fax': u' (012) 312 3647', 'Tel': u' (012) 686 0500', 'address_raw': u'\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001\n (012) 686 0500\n (012) 312 3647', 'blockId': 14, 'street-address': None, 'postcode': u'\r\n0001', 'address': u'\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001', 'branchName': u'Head Office'}, {'town': u'\r\nCenturion', 'date_scraped': 1327792245.787187, 'Fax': u' (012) 312 3647', 'Tel': u' (012) 686 0500', 'address_raw': u'\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001\n (012) 686 0500\n (012) 312 3647', 'blockId': 14, 'street-address': u'\r\n420 Witch Hazel Ave\n\r\nEcopark', 'postcode': u'\r\n0001', 'address': u'\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001', 'branchName': u'Head Office'}, {'town': u'\r\nMiddelburg', 'date_scraped': 1327791915.618461, 'Fax': u' (013) 282 6558', 'Tel': u' (013) 283 3500', 'address_raw': u'\r\n184 Jan van Riebeeck Street\n\r\nMiddelburg\n\r\n1050\n (013) 283 3500\n (013) 282 6558', 'blockId': 17, 'street-address': None, 'postcode': u'\r\n1050', 'address': u'\r\n184 Jan van Riebeeck Street\n\r\nMiddelburg\n\r\n1050', 'branchName': u'Middelburg'}]
   self.assertListEqual(data_observed, data_expected)
   os.remove('landbank_branches.sqlite')
Example #5
0
def check_timeouts():
    import requests
    from unidecode import unidecode

    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit = False)
    dt.execute('''
CREATE TABLE IF NOT EXISTS link_speeds (
  url TEXT NOT NULL,
  elapsed FLOAT,
  error_type TEXT NOT NULL,
  error TEXT NOT NULL,
  UNIQUE(url)
);''')
    urls = Queue()
    url_list = [row['url'] for row in dt.execute('SELECT DISTINCT url FROM links WHERE status_code = -42 and URL NOT IN (SELECT url from link_speeds)')]
    for url in url_list:
        urls.put(url)

    # Sink to the database
    def _db(queue):
        dt = DumpTruck('/tmp/open-data.sqlite')
        while True:
            dt.execute(*queue.get())
    db_updates = Queue()
    db_thread = Thread(None, target = _db, args = (db_updates,))
    db_thread.start()

#   def signal_handler(signal, frame):
#       db_thread.terminate()
#       sys.exit(0)
#   signal.signal(signal.SIGINT, signal_handler)

    # Check links
    def _check_link(url_queue):
        while not urls.empty():
            url = url_queue.get()
            if url == None:
                raise ValueError('url is None')
            try:
                r = requests.head(url, allow_redirects=True, timeout = 30)
            except Exception as e:
                try:
                    msg = unicode(e)
                except:
                    msg = ''
                sql = 'INSERT INTO link_speeds (url, error_type, error) VALUES (?,?,?)'
                db_updates.put((sql, (url, unicode(type(e)), msg))) # ew python 2
            else:
                sql = 'INSERT INTO link_speeds (url, elapsed, error_type, error) VALUES (?,?,\'\',\'\')'
                db_updates.put((sql, (url, r.elapsed.total_seconds())))

    threads = {}
    for i in range(500):
        threads[i] = Thread(None, target = _check_link, args = (urls,))

    for thread in threads.values():
        thread.start()
Example #6
0
  def save_and_check(self, dataIn, tableIn, dataOut, tableOut = None, twice = True):
    if tableOut == None:
      tableOut = quote(tableIn)

    # Insert
    h = DumpTruck(dbname = '/tmp/test.db')
    h.insert(dataIn, tableIn)
    h.close()

    # Observe with pysqlite
    connection=sqlite3.connect('/tmp/test.db')
    cursor=connection.cursor()
    cursor.execute(u'SELECT * FROM %s' % tableOut)
    observed1 = cursor.fetchall()
    connection.close()

    if twice:
      # Observe with DumpTruck
      h = DumpTruck(dbname = '/tmp/test.db')
      observed2 = h.execute(u'SELECT * FROM %s' % tableOut)
      h.close()
 
      #Check
      expected1 = dataOut
      expected2 = [dataIn] if type(dataIn) in (dict, OrderedDict) else dataIn
 
      self.assertListEqual(observed1, expected1)
      self.assertListEqual(observed2, expected2)
Example #7
0
 def savegetvar(self, var):
     h = DumpTruck(dbname="/tmp/test.db")
     h.save_var(u"weird", var)
     h.close()
     h = DumpTruck(dbname="/tmp/test.db")
     t = os.stat("/tmp/test.db").st_mtime
     self.assertEqual(h.get_var(u"weird"), var)
     h.close()
     assert os.stat("/tmp/test.db").st_mtime == t
Example #8
0
 def savegetvar(self, var):
   h = DumpTruck(dbname = '/tmp/test.db')
   h.save_var(u'weird', var)
   h.close()
   h = DumpTruck(dbname = '/tmp/test.db')
   t=os.stat('/tmp/test.db').st_mtime
   self.assertEqual(h.get_var(u'weird'), var)
   h.close()
   assert os.stat('/tmp/test.db').st_mtime==t
Example #9
0
  def test_create_table(self):
    h = DumpTruck(dbname = '/tmp/test.db')
    h.create_table({'foo': 0, 'bar': 1, 'baz': 2}, 'zombies')
    h.close()

    connection=sqlite3.connect('/tmp/test.db')
    cursor=connection.cursor()
    cursor.execute('SELECT foo, bar, baz FROM zombies')
    observed = cursor.fetchall()
    connection.close()

    expected = []
    self.assertListEqual(observed, expected)
Example #10
0
    def test_create_table(self):
        h = DumpTruck(dbname="/tmp/test.db")
        h.create_table({"foo": 0, "bar": 1, "baz": 2}, "zombies")
        h.close()

        connection = sqlite3.connect("/tmp/test.db")
        cursor = connection.cursor()
        cursor.execute("SELECT foo, bar, baz FROM zombies")
        observed = cursor.fetchall()
        connection.close()

        expected = []
        self.assertListEqual(observed, expected)
Example #11
0
def extract_dataset_table_info():
    dt = DumpTruck(dbname = '/tmp/table_info.db')
    dt.create_table({'portal': 'abc', 'id': 'abcd-efgh'}, 'table_info')
    dt.create_index(['portal', 'id'], 'table_info', unique = True)
    dt.create_index(['tableId'], 'table_info', unique = False)
    done = set([tuple(row.keys()) for row in dt.execute('SELECT portal, id FROM table_info')])
    for portal in os.listdir('data'):
        for viewid in os.listdir(os.path.join('data', portal, 'views')):
            if (portal, viewid) in done:
                continue
            d = _dataset_table_info(portal, viewid)
            if d == None:
                continue
            dt.upsert(d, 'table_info')
Example #12
0
 def test_no_rows_second_insert(self):
     "Nothing happens if no rows are inserted to a table that is there."
     dt = DumpTruck(dbname="/tmp/test.db")
     dt.create_table({"foo": "uhtnh", "bar": "aoue"}, "ninety")
     dt.insert([], "ninety")
     c = dt.execute("select count(*) as c from ninety")[0]["c"]
     dt.close()
     self.assertEqual(c, 0)
Example #13
0
 def test_empty_row_second_insert(self):
     "An empty row acts like any other row."
     dt = DumpTruck(dbname="/tmp/test.db")
     dt.create_table({"foo": "uhtnh", "bar": "aoue"}, "nine")
     dt.insert({}, "nine")
     c = dt.execute("select count(*) as c from nine")[0]["c"]
     dt.close()
     self.assertEqual(c, 1)
Example #14
0
 def test_second_insert(self):
   "Inserting a second row that is all null adds an empty row."
   dt = DumpTruck(dbname = '/tmp/test.db')
   dt.create_table({'foo': 'uhtnh', 'bar': 'aoue'}, 'three')
   dt.insert({'foo': None, 'bar': None}, 'three')
   c = dt.execute('select count(*) as c from three')[0]['c']
   dt.close()
   self.assertEqual(c, 1)
Example #15
0
 def test_empty_row_second_insert(self):
   "An empty row acts like any other row."
   dt = DumpTruck(dbname = '/tmp/test.db')
   dt.create_table({'foo': 'uhtnh', 'bar': 'aoue'}, 'nine')
   dt.insert({}, 'nine')
   c = dt.execute('select count(*) as c from nine')[0]['c']
   dt.close()
   self.assertEqual(c, 1)
Example #16
0
 def test_second_insert(self):
     "Inserting a second row that is all null adds an empty row."
     dt = DumpTruck(dbname="/tmp/test.db")
     dt.create_table({"foo": "uhtnh", "bar": "aoue"}, "three")
     dt.insert({"foo": None, "bar": None}, "three")
     c = dt.execute("select count(*) as c from three")[0]["c"]
     dt.close()
     self.assertEqual(c, 1)
Example #17
0
 def test_no_rows_second_insert(self):
   "Nothing happens if no rows are inserted to a table that is there."
   dt = DumpTruck(dbname = '/tmp/test.db')
   dt.create_table({'foo': 'uhtnh', 'bar': 'aoue'}, 'ninety')
   dt.insert([], 'ninety')
   c = dt.execute('select count(*) as c from ninety')[0]['c']
   dt.close()
   self.assertEqual(c, 0)
Example #18
0
class AuctionsPipeline(object):
    def open_spider(self, spider):
        self.dt = DumpTruck(dbname=settings['DB_PATH'], auto_commit=True)

        id_data = self.dt.execute('SELECT id FROM auctions')
        self.ids = [x['id'] for x in id_data]

    def process_item(self, item, spider):
        if 'auctions' not in getattr(spider, 'pipelines', []):
            return item

        item['id'] = int(item['id'][0])
        item['auctioneer'] = ' '.join(item['auctioneer'])
        item['contact_number'] = ' '.join(item['contact_number'])
        item['date'] = '%s %s' % (' '.join(item['date']), ' '.join(
            item['time']))
        item['location'] = ' '.join(item['location'])
        item['link'] = ' '.join(item['link'])
        item['listing'] = ' '.join(item['listing'])

        #format phonenumber
        parsed_number = phonenumbers.parse(item['contact_number'], 'US')
        item['contact_number'] = phonenumbers.format_number(
            parsed_number, phonenumbers.PhoneNumber())

        # format listing / remove any html cludge
        soup_listing = BeautifulSoup(item['listing'])
        item['listing'] = soup_listing.get_text()

        # format date and time to standard format
        dt = parse(item['date'])
        item['date'] = dt.datetime.strftime('%Y-%m-%d %H:%M:%S')

        if item['id'] in self.ids:
            raise DropItem('Dupe auction stored, ignoring listing: %s' % item)
        else:
            self.dt.insert(
                {
                    'id': item['id'],
                    'auctioneer': item['auctioneer'],
                    'contact_number': item['contact_number'],
                    'date': item['date'],
                    'location': item['location'],
                    'link': item['link'],
                    'listing': item['listing'],
                }, 'auctions')

            return item
def main():
    dt = DumpTruck(dbname='/tmp/yoga.db')
    dt.execute('''
CREATE TABLE IF NOT EXISTS page_source (
  page_number INTEGER NOT NULL,
  page_source TEXT NOT NULL,
  UNIQUE(page_number)
)''')

    print('Running the search')
    # Get the search page
    driver = _driver_setup()
    driver.get(url)

    # 100 per page
    option = driver.find_elements_by_xpath('id("ctl00_TemplateBody_ucTeacherDirectory_ddhowmany")/option[@value="100"]')[0]
    option.click()

    # Search
    button = driver.find_elements_by_id('ctl00_TemplateBody_ucTeacherDirectory_imgSearch')[0]
    button.click()

    while True:
        print('Pausing for a few seconds to let the page load and be polite')
        sleep(8)
        randomsleep()

        # Current page number
        page_numbers = driver.find_elements_by_css_selector('#ctl00_TemplateBody_ucTeacherDirectory_gvTeacherDirectory tr td tr span')

        # Fast forward to new pages: Ellipsis
        ellipses = driver.find_elements_by_xpath('//a[text()="..."]')

        # Fast forward to new pages: Maximum page
        max_page_numbers = driver.find_elements_by_xpath('//td[a[text()="..."]]/preceding-sibling::td[position()=1]')
        if max_page_numbers == []:
            max_page_numbers = [page_number.find_elements_by_xpath('../td[position()=last()]') for page_number in page_numbers]

        for nodes in [page_numbers, ellipses, max_page_numbers]:
            print(nodes)
            print [n.text for n in nodes]
            if len(nodes) == 1:
                import pdb; pdb.set_trace()
                raise ValueError('Only one navigation row')
            elif nodes[0].text != nodes[1].text:
                import pdb; pdb.set_trace()
                raise ValueError('Page navigation rows don\'t match.')
Example #20
0
    def test_create_if_not_exists(self):
        dt = DumpTruck(dbname="/tmp/test.db")
        dt.execute("create table mango (bar integer, baz integer);")
        dt.create_index(["bar", "baz"], "mango")

        # This should not raise an error.
        dt.create_index(["bar", "baz"], "mango", if_not_exists=True)
Example #21
0
  def test_create_if_exists(self):
    dt = DumpTruck(dbname = '/tmp/test.db')
    dt.execute('create table pineapple (bar integer, baz integer);')
    dt.create_index(['bar', 'baz'], 'pineapple')

    with self.assertRaises(sqlite3.OperationalError):
      dt.create_index(['bar', 'baz'], 'pineapple', if_not_exists = False)
Example #22
0
  def test_create_if_not_exists(self):
    dt = DumpTruck(dbname = '/tmp/test.db')
    dt.execute('create table mango (bar integer, baz integer);')
    dt.create_index(['bar', 'baz'], 'mango')

    # This should not raise an error.
    dt.create_index(['bar', 'baz'], 'mango', if_not_exists = True)
Example #23
0
  def test_non_unique(self):
    dt = DumpTruck(dbname = '/tmp/test.db')
    dt.execute('create table tomato (bar integer, baz integer);')
    dt.create_index(['bar', 'baz'], 'tomato')
    observed = dt.execute('PRAGMA index_info(tomato_bar_baz)')

    # Indexness
    self.assertIsNotNone(observed)

    # Indexed columns
    expected = [
      {u'seqno': 0, u'cid': 0, u'name': u'bar'},
      {u'seqno': 1, u'cid': 1, u'name': u'baz'},
    ]
    self.assertListEqual(observed, expected)

    # Uniqueness
    indices = dt.execute('PRAGMA index_list(tomato)')
    for index in indices:
      if index[u'name'] == u'tomato_bar_baz':
        break
    else:
      index = {}

    self.assertEqual(index[u'unique'], 0)
Example #24
0
class AuctionsPipeline(object):
    def open_spider(self, spider):
        self.dt = DumpTruck(dbname=settings['DB_PATH'],auto_commit=True)

        id_data = self.dt.execute('SELECT id FROM auctions')
        self.ids = [x['id'] for x in id_data]

    def process_item(self, item, spider):
        if 'auctions' not in getattr(spider,'pipelines',[]):
            return item

        item['id'] = int(item['id'][0])
        item['auctioneer'] = ' '.join(item['auctioneer'])
        item['contact_number'] = ' '.join(item['contact_number'])
        item['date'] = '%s %s' % (' '.join(item['date']), ' '.join(item['time']))
        item['location'] = ' '.join(item['location'])
        item['link'] = ' '.join(item['link'])
        item['listing'] = ' '.join(item['listing'])

        #format phonenumber
        parsed_number = phonenumbers.parse(item['contact_number'],'US')
        item['contact_number'] = phonenumbers.format_number(parsed_number, phonenumbers.PhoneNumber())

        # format listing / remove any html cludge
        soup_listing = BeautifulSoup(item['listing'])
        item['listing'] = soup_listing.get_text()

        # format date and time to standard format
        dt = parse(item['date'])
        item['date'] = dt.datetime.strftime('%Y-%m-%d %H:%M:%S')

        if item['id'] in self.ids:
            raise DropItem('Dupe auction stored, ignoring listing: %s' % item)
        else:
            self.dt.insert({
                'id': item['id'],
                'auctioneer': item['auctioneer'],
                'contact_number': item['contact_number'],
                'date': item['date'],
                'location': item['location'],
                'link': item['link'],
                'listing': item['listing'],
            }, 'auctions')

            return item
Example #25
0
    def test_create_if_exists(self):
        dt = DumpTruck(dbname="/tmp/test.db")
        dt.execute("create table pineapple (bar integer, baz integer);")
        dt.create_index(["bar", "baz"], "pineapple")

        with self.assertRaises(sqlite3.OperationalError):
            dt.create_index(["bar", "baz"], "pineapple", if_not_exists=False)
def main():
    edges = build_network()['edges']

    dt = DumpTruck(dbname = '/tmp/open-data.sqlite', adapt_and_convert = True)
    datasets_in = dt.execute('SELECT * FROM socrata')

    dt.create_table({'id': 'blah-blah'}, 'socrata_deduplicated')
    dt.create_index(['id'], 'socrata_deduplicated', if_not_exists = True, unique = True)

    for dataset in dedupe(datasets_in, edges):
        dt.upsert(dataset, 'socrata_deduplicated')
Example #27
0
 def test_select(self):
     shutil.copy(u"fixtures/landbank_branches.sqlite", u".")
     h = DumpTruck(dbname=u"landbank_branches.sqlite")
     data_observed = h.execute(u"SELECT * FROM `branches` WHERE Fax is not null ORDER BY Fax LIMIT 3;")
     data_expected = [
         {
             "town": u"\r\nCenturion",
             "date_scraped": 1327791915.618461,
             "Fax": u" (012) 312 3647",
             "Tel": u" (012) 686 0500",
             "address_raw": u"\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001\n (012) 686 0500\n (012) 312 3647",
             "blockId": 14,
             "street-address": None,
             "postcode": u"\r\n0001",
             "address": u"\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001",
             "branchName": u"Head Office",
         },
         {
             "town": u"\r\nCenturion",
             "date_scraped": 1327792245.787187,
             "Fax": u" (012) 312 3647",
             "Tel": u" (012) 686 0500",
             "address_raw": u"\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001\n (012) 686 0500\n (012) 312 3647",
             "blockId": 14,
             "street-address": u"\r\n420 Witch Hazel Ave\n\r\nEcopark",
             "postcode": u"\r\n0001",
             "address": u"\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001",
             "branchName": u"Head Office",
         },
         {
             "town": u"\r\nMiddelburg",
             "date_scraped": 1327791915.618461,
             "Fax": u" (013) 282 6558",
             "Tel": u" (013) 283 3500",
             "address_raw": u"\r\n184 Jan van Riebeeck Street\n\r\nMiddelburg\n\r\n1050\n (013) 283 3500\n (013) 282 6558",
             "blockId": 17,
             "street-address": None,
             "postcode": u"\r\n1050",
             "address": u"\r\n184 Jan van Riebeeck Street\n\r\nMiddelburg\n\r\n1050",
             "branchName": u"Middelburg",
         },
     ]
     self.assertListEqual(data_observed, data_expected)
     os.remove("landbank_branches.sqlite")
Example #28
0
def avail_within(db, start=0, end=2**32):
    '''
    Given a database filename, uid and a time range (POSIX times),
    return how long everyone was online, in seconds.
    '''

    # Database connection
    dt = DumpTruck(db)
    updates = dt.execute('''
        SELECT uid, ts, status
        FROM log_status
        WHERE ts > ? AND ts < ?
        ORDER BY uid, ts
        ;''', [start, end])

    # This loop depends on the order by clause above
    total_time = {}
    for u in updates:

        if u['status'] == 'notavail' and (u['uid'] not in total_time):
            # First time we see person, but it's 'notavail', so skip it.
            continue

        elif u['status'] == 'avail' and (u['uid'] not in total_time):
            # First time we see person becoming available
            total_time[u['uid']] = 0

        elif u['status'] == 'avail':
            # The person went avail. We don't need to do anything;
            # prev_time is recorded below the if statements
            continue

        elif u['status'] == 'notavail':
            # The person went notavail, so record the time when the user was available.
            total_time[u['uid']] += u['ts'] - prev_time

        else:
            raise ValueError('The update\'s status "%s" is neither "avail" nor "notavail."' % u['status'])

        # Record the current timestamp as prev_time
        prev_time = u['ts']

    return total_time
Example #29
0
def apis():
    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit = False)
    dt.create_table({'catalog':'abc.def'}, 'socrata_apis')
    dt.create_index(['catalog'], 'socrata_apis', unique = True, if_not_exists = True)

    socrata_catalogs = filter(lambda x: x[0] == 'socrata', catalogs())
    for _, catalog in socrata_catalogs:
        dt.upsert({
            'catalog': catalog.split('://')[-1],
            'apis': count_apis(catalog),
        }, 'socrata_apis')
Example #30
0
    def test_special_type_date(self):
        """Adapters and converters should not be enabled."""
        dt = DumpTruck(dbname="/tmp/test.db", adapt_and_convert=False)
        dt.execute("CREATE TABLE pork_sales (week date);")
        dt.execute("INSERT INTO pork_sales VALUES ('2012-10-08')")

        observedData = dt.execute("SELECT week FROM pork_sales")
        self.assertListEqual(observedData, [{u"week": u"2012-10-08"}])
Example #31
0
  def test_special_type_list(self):
    """Adapters and converters should not be enabled."""
    dt = DumpTruck(dbname = '/tmp/test.db', adapt_and_convert = False)
    dt.execute('CREATE TABLE pork_sales (week json);')
    dt.execute("INSERT INTO pork_sales VALUES ('[12,3,4]')")

    observedData = dt.execute('SELECT week FROM pork_sales')
    self.assertListEqual(observedData, [{u"week": u"[12,3,4]"}])
Example #32
0
#!/usr/bin/env python
import json
from dumptruck import DumpTruck
dt = DumpTruck(dbname='applications.db')


def scott_data():
    sql = '''
    SELECT "parish", sum("acreage") AS 'acreage'
    FROM application
    WHERE "type" = 'impact' AND "parish" != ''
    GROUP BY "parish";
    '''

    return {
        row['parish'].upper().replace('SAINT', 'ST'):
        (row['parish'], row['acreage'])
        for row in dt.execute(sql)
    }


scott = scott_data()
parishes = json.load(open('parishes.json'))

max_impacted_acres = max([v[1] for v in scott.values()])
for feature in parishes['features']:
    feature['properties']['impacted_acres'] = scott.get(
        feature['properties']['COUNTY'], (None, 0))[1]
    feature['properties']['impacted_acres_prop_max'] = scott.get(
        feature['properties']['COUNTY'], (None, 0))[1] / max_impacted_acres
Example #33
0
def _connect(dbname=DATABASE_NAME, timeout=DATABASE_TIMEOUT):
    'Initialize the database (again). This is mainly for testing'
    global dt
    dt = DumpTruck(dbname=dbname, adapt_and_convert=False, timeout=timeout)
Example #34
0
def main():
    dt = DumpTruck(dbname='metrics.db')
    dt.create_table({'portal': 'abc', 'date': datetime.date.today()}, 'series')
    dt.create_index(['portal', 'date'], 'series')
    dt.upsert(list(table()), 'series')
Example #35
0
    def open_spider(self, spider):
        self.dt = DumpTruck(dbname=settings['DB_PATH'], auto_commit=True)

        id_data = self.dt.execute('SELECT id FROM auctions')
        self.ids = [x['id'] for x in id_data]
Example #36
0
# coding: utf-8
from dumptruck import DumpTruck
import csv, os, glob

_here = os.path.split(__file__)[0]

store = DumpTruck(dbname="db/documents.db")

already = dict([(a[26:36], a) for a in [
    os.path.split(a)[1]
    for a in glob.glob("/home/martin/Dropbox/blackrock-scraper/data/*")
]])

already_downloaded = dict([(a[26:36], a) for a in [
    os.path.split(a)[1] for a in glob.glob(os.path.join(_here, "data/html/*"))
]])

print len(already_downloaded.items()), len(already.items())
#s
#
# Liste aller Gesellschaften mit aktuellstem Berichtsdatum und Anzahl der Berichte
dt = csv.DictWriter(open(os.path.join(_here, "data/tables", "ciks.csv"), "w"),
                    [
                        'act', 'num', 'cik', 'name', 'filename', 'already',
                        'already_downloaded', 'exists', 'link'
                    ],
                    delimiter=";")
dt.writerow(
    dict(act="Aktuellster Bericht",
         num="Anzahl der Berichte",
         cik="Central Index Key",
Example #37
0
#!/usr/bin/env python2
import os, json
from dumptruck import DumpTruck

dt = DumpTruck(dbname='/tmp/catalog.db')

# Create a unique index on `identifier`.
dt.execute('''
CREATE TABLE IF NOT EXISTS "catalog" (
  "portal" TEXT NOT NULL,
  "identifier" TEXT NOT NULL,
  PRIMARY KEY ("portal", "identifier")
);''')

for data_json in os.listdir('catalogs'):
    # Load into memory.
    data = json.load(open(os.path.join('catalogs', data_json)))[1:]

    # Add the portal.
    portal = data_json.replace('.json', '')
    for row in data:
        row['portal'] = portal

    # Put in the database.
    dt.insert(data, 'catalog')
Example #38
0
import logging, sys, os
from lxml import etree
import requests
from dumptruck import DumpTruck, Pickle

_here = os.path.split(__file__)[0]

store = DumpTruck(dbname=os.path.join(_here, "db/documents.db"))

parser = etree.HTMLParser()


def getTree(url):
    return etree.parse(url, etree.HTMLParser())


logger = logging.getLogger(os.path.split(__file__)[1])
logging.basicConfig(level=logging.DEBUG, file=sys.stderr)


def get_nq_for_cik(cik):
    try:
        tree = getTree(
            "http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=N-Q%%25&dateb=&owner=include&start=0&count=40&output=atom"
            % cik)
    except Exception, e:
        logger.error("Error searching for CIK %s:%s" % (cik, e))
        pass
    for entry in tree.xpath("//entry"):
        link = entry.xpath("link/@href")[0]
        date = entry.xpath("updated/text()")[0]
Example #39
0
def _connect(dbname='scraperwiki.sqlite'):
    'Initialize the database (again). This is mainly for testing'
    global dt
    dt = DumpTruck(dbname=dbname, adapt_and_convert=False)
Example #40
0
    trs = html.xpath(
        '//table[@style="border-collapse: collapse; width: 100%;"]/descendant::tr'
    )

    def do_row(tr):
        try:
            return l.parse_row(tr)
        except:
            print tostring(tr)
            raise

    return map(do_row, trs[2:])


# Schema
dt = DumpTruck(dbname='/tmp/finalip.db')
dt.create_table({u'DA Number': u'NAE-2009-01067'},
                'finalip',
                if_not_exists=True)
dt.create_index(['Da Number'], 'finalip', unique=True, if_not_exists=True)

# Skip finished stuff
pages = set([(row['Year'], row['Month'], row['Page'])
             for row in dt.execute('SELECT Year, Month, Page FROM finalip')])

# Populate
for dirname, subdirnames, filenames in os.walk(
        os.path.join(os.environ['READER_ROOT'], '..', 'finalips')):
    if subdirnames != []:
        continue
    for filename in filenames: