Ejemplo n.º 1
0
def build_table_from_db(dbname='/tmp/table_info.db'):
    """Export the ten most frequent tableIds from ``table_info`` as JSON.

    Writes geneology/index.json (the list of tableIds) plus one
    geneology/<tableId>.json per table, containing the earliest record
    as 'source' and every dataset seen, with the portals it appeared on.
    """
    dt = DumpTruck(dbname=dbname)
    tableIds = [row['tableId'] for row in dt.execute('''
SELECT tableId, count(*)
FROM table_info
GROUP BY tableId
ORDER BY count(*) DESC
limit 10;
''')]

    try:
        os.mkdir('geneology')
    except OSError:
        pass  # directory already exists
    # Use context managers so file handles are closed promptly; the
    # original json.dump(..., open(...)) leaked the handle until GC.
    with open(os.path.join('geneology', 'index.json'), 'w') as fp:
        json.dump(tableIds, fp)

    for tableId in tableIds:
        result = {
            # The earliest record is treated as the canonical source.
            'source': dt.execute('SELECT * FROM table_info WHERE tableId = ? ORDER BY createdAt ASC LIMIT 1', [tableId])[0],
            'datasets': {},
        }
        for dataset in dt.execute('SELECT * FROM table_info WHERE tableId = ?', [tableId]):
            if dataset['id'] not in result['datasets']:
                result['datasets'][dataset['id']] = dataset
                result['datasets'][dataset['id']]['portals'] = []

            result['datasets'][dataset['id']]['portals'].append(dataset['portal'])

        # list() keeps this JSON-serializable on Python 3 as well
        # (dict.values() is a view there, which json cannot dump).
        result['datasets'] = list(result['datasets'].values())
        for dataset in result['datasets']:
            # Per-dataset 'portal' is superseded by the 'portals' list.
            del dataset['portal']
        with open(os.path.join('geneology', '%d.json' % tableId), 'w') as fp:
            json.dump(result, fp)
Ejemplo n.º 2
0
    def test_create_if_not_exists(self):
        """Re-creating an existing index with if_not_exists=True must not raise."""
        truck = DumpTruck(dbname="/tmp/test.db")
        truck.execute("create table mango (bar integer, baz integer);")
        truck.create_index(["bar", "baz"], "mango")
        # The guarded second creation is a silent no-op, not an error.
        truck.create_index(["bar", "baz"], "mango", if_not_exists=True)
Ejemplo n.º 3
0
  def test_create_if_exists(self):
    """Re-creating an index without the guard raises OperationalError."""
    truck = DumpTruck(dbname = '/tmp/test.db')
    truck.execute('create table pineapple (bar integer, baz integer);')
    truck.create_index(['bar', 'baz'], 'pineapple')

    # An unguarded duplicate creation must fail.
    with self.assertRaises(sqlite3.OperationalError):
      truck.create_index(['bar', 'baz'], 'pineapple', if_not_exists = False)
Ejemplo n.º 4
0
  def test_non_unique(self):
    """create_index without unique=True builds a non-unique two-column index."""
    truck = DumpTruck(dbname = '/tmp/test.db')
    truck.execute('create table tomato (bar integer, baz integer);')
    truck.create_index(['bar', 'baz'], 'tomato')
    observed = truck.execute('PRAGMA index_info(tomato_bar_baz)')

    # The index exists at all.
    self.assertIsNotNone(observed)

    # It covers exactly bar, then baz.
    expected = [
      {u'seqno': 0, u'cid': 0, u'name': u'bar'},
      {u'seqno': 1, u'cid': 1, u'name': u'baz'},
    ]
    self.assertListEqual(observed, expected)

    # And it is flagged non-unique. A missing index yields {} and hence
    # a KeyError below, same as the original for/else formulation.
    all_indices = truck.execute('PRAGMA index_list(tomato)')
    index = next((entry for entry in all_indices if entry[u'name'] == u'tomato_bar_baz'), {})
    self.assertEqual(index[u'unique'], 0)
Ejemplo n.º 5
0
    def test_create_if_exists(self):
        """Creating a duplicate index without the guard raises OperationalError."""
        truck = DumpTruck(dbname="/tmp/test.db")
        truck.execute("create table pineapple (bar integer, baz integer);")
        truck.create_index(["bar", "baz"], "pineapple")

        # An unguarded duplicate creation must fail.
        with self.assertRaises(sqlite3.OperationalError):
            truck.create_index(["bar", "baz"], "pineapple", if_not_exists=False)
Ejemplo n.º 6
0
  def test_create_if_not_exists(self):
    """if_not_exists=True makes a duplicate create_index a silent no-op."""
    truck = DumpTruck(dbname = '/tmp/test.db')
    truck.execute('create table mango (bar integer, baz integer);')
    truck.create_index(['bar', 'baz'], 'mango')
    # Must not raise although the index is already there.
    truck.create_index(['bar', 'baz'], 'mango', if_not_exists = True)
Ejemplo n.º 7
0
    def test_special_type_date(self):
        """Adapters and converters should not be enabled."""
        truck = DumpTruck(dbname="/tmp/test.db", adapt_and_convert=False)
        truck.execute("CREATE TABLE pork_sales (week date);")
        truck.execute("INSERT INTO pork_sales VALUES ('2012-10-08')")

        # With conversion off, the date column comes back as plain text.
        rows = truck.execute("SELECT week FROM pork_sales")
        self.assertListEqual(rows, [{u"week": u"2012-10-08"}])
Ejemplo n.º 8
0
  def test_special_type_list(self):
    """Adapters and converters should not be enabled."""
    truck = DumpTruck(dbname = '/tmp/test.db', adapt_and_convert = False)
    truck.execute('CREATE TABLE pork_sales (week json);')
    truck.execute("INSERT INTO pork_sales VALUES ('[12,3,4]')")

    # With conversion off, the json column comes back as raw text.
    rows = truck.execute('SELECT week FROM pork_sales')
    self.assertListEqual(rows, [{u"week": u"[12,3,4]"}])
Ejemplo n.º 9
0
def check_timeouts():
    """Re-measure links that previously timed out (status_code = -42).

    Issues HEAD requests from many worker threads and funnels all
    results through a single writer thread into ``link_speeds``.
    """
    import requests
    from unidecode import unidecode  # NOTE(review): imported but unused in this function

    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit = False)
    dt.execute('''
CREATE TABLE IF NOT EXISTS link_speeds (
  url TEXT NOT NULL,
  elapsed FLOAT,
  error_type TEXT NOT NULL,
  error TEXT NOT NULL,
  UNIQUE(url)
);''')
    # Source queue: timed-out URLs not yet present in link_speeds.
    urls = Queue()
    url_list = [row['url'] for row in dt.execute('SELECT DISTINCT url FROM links WHERE status_code = -42 and URL NOT IN (SELECT url from link_speeds)')]
    for url in url_list:
        urls.put(url)

    # Sink to the database
    def _db(queue):
        # Single writer thread: serializes all SQLite writes so the many
        # worker threads never share a connection.
        dt = DumpTruck('/tmp/open-data.sqlite')
        while True:
            dt.execute(*queue.get())
    db_updates = Queue()
    db_thread = Thread(None, target = _db, args = (db_updates,))
    db_thread.start()

#   def signal_handler(signal, frame):
#       db_thread.terminate()
#       sys.exit(0)
#   signal.signal(signal.SIGINT, signal_handler)

    # Check links
    def _check_link(url_queue):
        # Worker: HEAD each URL, pushing either the elapsed time or the
        # exception details onto the writer queue.
        while not urls.empty():
            url = url_queue.get()
            if url == None:
                raise ValueError('url is None')
            try:
                r = requests.head(url, allow_redirects=True, timeout = 30)
            except Exception as e:
                try:
                    msg = unicode(e)  # Python 2 only
                except:
                    msg = ''
                sql = 'INSERT INTO link_speeds (url, error_type, error) VALUES (?,?,?)'
                db_updates.put((sql, (url, unicode(type(e)), msg))) # ew python 2
            else:
                sql = 'INSERT INTO link_speeds (url, elapsed, error_type, error) VALUES (?,?,\'\',\'\')'
                db_updates.put((sql, (url, r.elapsed.total_seconds())))

    # NOTE(review): workers are started but never joined, and the writer
    # thread never exits — the process must be stopped externally.
    threads = {}
    for i in range(500):
        threads[i] = Thread(None, target = _check_link, args = (urls,))

    for thread in threads.values():
        thread.start()
Ejemplo n.º 10
0
def to_sqlite3():
    """Copy datasets() output into /tmp/open-data.sqlite.

    Builds 'datasets' (one row per software/catalog/identifier), the
    per-software tables ('ckan', 'socrata'), and a socrata view-to-table
    mapping. Commits once per dataset since auto_commit is off.
    ``datasets`` and ``SOFTWARE_MAP`` are module-level names defined
    elsewhere in this file.
    """
    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit = False)

    # Seed each table's schema from a dummy row so the indices can be built.
    dummyrow = dict(zip(['software','catalog','identifier'], ['blah']*3))
    dt.create_table(dummyrow, 'datasets', if_not_exists = True)
    dt.create_index(['software','catalog','identifier'], 'datasets', if_not_exists = True, unique = True)

    for table in ['ckan','socrata']:
        dt.create_table({'catalog':'blah','identifier':'blah'}, table, if_not_exists = True)
        dt.create_index(['catalog','identifier'], table, if_not_exists = True, unique = True)

    dt.create_table({'view_id':'abc','table_id':123}, 'socrata_tables')
    dt.create_index(['view_id'], 'socrata_tables', if_not_exists = True, unique = True)
    dt.create_index(['table_id'], 'socrata_tables', if_not_exists = True)

    for dataset in datasets():
        row = {
            'software': dataset['software'],
            'catalog': dataset['catalog'],
            # The identifier field name varies by software; SOFTWARE_MAP resolves it.
            'identifier': dataset[SOFTWARE_MAP['identifier'][dataset['software']]],
        }
        sql = 'SELECT * FROM datasets WHERE software = ? AND catalog = ? AND identifier = ?'
        # Skip datasets copied on a previous run.
        if dt.execute(sql, [row['software'],row['catalog'],row['identifier']]) != []:
            continue
        dt.upsert(row, 'datasets')
        if dataset['software'] == 'socrata':
            socrata_table = {
                'view_id': row['identifier'],
                'table_id': dataset['tableId'],
            }
            dt.upsert(socrata_table, 'socrata_tables')
        dt.upsert(dataset,dataset['software'])
        dt.commit()
Ejemplo n.º 11
0
 def test_select(self):
   """SELECT through DumpTruck returns rows as dicts matching the fixture exactly."""
   shutil.copy(u'fixtures/landbank_branches.sqlite', u'.')
   h = DumpTruck(dbname = u'landbank_branches.sqlite')
   data_observed = h.execute(u'SELECT * FROM `branches` WHERE Fax is not null ORDER BY Fax LIMIT 3;')
   # Expected rows copied verbatim from the fixture database.
   data_expected = [{'town': u'\r\nCenturion', 'date_scraped': 1327791915.618461, 'Fax': u' (012) 312 3647', 'Tel': u' (012) 686 0500', 'address_raw': u'\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001\n (012) 686 0500\n (012) 312 3647', 'blockId': 14, 'street-address': None, 'postcode': u'\r\n0001', 'address': u'\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001', 'branchName': u'Head Office'}, {'town': u'\r\nCenturion', 'date_scraped': 1327792245.787187, 'Fax': u' (012) 312 3647', 'Tel': u' (012) 686 0500', 'address_raw': u'\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001\n (012) 686 0500\n (012) 312 3647', 'blockId': 14, 'street-address': u'\r\n420 Witch Hazel Ave\n\r\nEcopark', 'postcode': u'\r\n0001', 'address': u'\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001', 'branchName': u'Head Office'}, {'town': u'\r\nMiddelburg', 'date_scraped': 1327791915.618461, 'Fax': u' (013) 282 6558', 'Tel': u' (013) 283 3500', 'address_raw': u'\r\n184 Jan van Riebeeck Street\n\r\nMiddelburg\n\r\n1050\n (013) 283 3500\n (013) 282 6558', 'blockId': 17, 'street-address': None, 'postcode': u'\r\n1050', 'address': u'\r\n184 Jan van Riebeeck Street\n\r\nMiddelburg\n\r\n1050', 'branchName': u'Middelburg'}]
   self.assertListEqual(data_observed, data_expected)
   os.remove('landbank_branches.sqlite')
Ejemplo n.º 12
0
  def save_and_check(self, dataIn, tableIn, dataOut, tableOut = None, twice = True):
    """Insert *dataIn* into *tableIn* and compare what comes back out.

    *dataOut* is the row list expected when reading back through raw
    pysqlite; when *twice* is true the table is also re-read through
    DumpTruck and compared with the original input.
    """
    if tableOut is None:  # 'is None', not '== None'
      tableOut = quote(tableIn)

    # Insert through DumpTruck.
    h = DumpTruck(dbname = '/tmp/test.db')
    h.insert(dataIn, tableIn)
    h.close()

    # Observe with pysqlite directly; close the connection even if the
    # SELECT raises (the original leaked it on error).
    connection = sqlite3.connect('/tmp/test.db')
    try:
      cursor = connection.cursor()
      cursor.execute(u'SELECT * FROM %s' % tableOut)
      observed1 = cursor.fetchall()
    finally:
      connection.close()

    if twice:
      # Observe with DumpTruck
      h = DumpTruck(dbname = '/tmp/test.db')
      observed2 = h.execute(u'SELECT * FROM %s' % tableOut)
      h.close()

      # Check. NOTE(review): both assertions sit inside 'if twice', so
      # nothing at all is verified when twice is False — confirm intended.
      expected1 = dataOut
      # Exact type check on purpose: a bare dict means "one row".
      expected2 = [dataIn] if type(dataIn) in (dict, OrderedDict) else dataIn

      self.assertListEqual(observed1, expected1)
      self.assertListEqual(observed2, expected2)
Ejemplo n.º 13
0
def main():
    """Scrape the teacher-directory search results into /tmp/yoga.db.

    Drives the site with Selenium: sets the page size to 100, runs the
    search, then walks the result pages. ``url``, ``_driver_setup`` and
    ``randomsleep`` are module-level names defined elsewhere.
    """
    dt = DumpTruck(dbname='/tmp/yoga.db')
    dt.execute('''
CREATE TABLE IF NOT EXISTS page_source (
  page_number INTEGER NOT NULL,
  page_source TEXT NOT NULL,
  UNIQUE(page_number)
)''')

    print('Running the search')
    # Get the search page
    driver = _driver_setup()
    driver.get(url)

    # 100 per page
    option = driver.find_elements_by_xpath('id("ctl00_TemplateBody_ucTeacherDirectory_ddhowmany")/option[@value="100"]')[0]
    option.click()

    # Search
    button = driver.find_elements_by_id('ctl00_TemplateBody_ucTeacherDirectory_imgSearch')[0]
    button.click()

    while True:
        print('Pausing for a few seconds to let the page load and be polite')
        sleep(8)
        randomsleep()

        # Current page number
        page_numbers = driver.find_elements_by_css_selector('#ctl00_TemplateBody_ucTeacherDirectory_gvTeacherDirectory tr td tr span')

        # Fast forward to new pages: Ellipsis
        ellipses = driver.find_elements_by_xpath('//a[text()="..."]')

        # Fast forward to new pages: Maximum page
        max_page_numbers = driver.find_elements_by_xpath('//td[a[text()="..."]]/preceding-sibling::td[position()=1]')
        if max_page_numbers == []:
            # Fall back to the last cell in each page-number row.
            max_page_numbers = [page_number.find_elements_by_xpath('../td[position()=last()]') for page_number in page_numbers]

        # Sanity-check that the navigation lookups agree with each other.
        for nodes in [page_numbers, ellipses, max_page_numbers]:
            print(nodes)
            print [n.text for n in nodes]  # Python 2 print statement
            if len(nodes) == 1:
                # NOTE(review): pdb.set_trace() left in — debugging leftovers.
                import pdb; pdb.set_trace()
                raise ValueError('Only one navigation row')
            elif nodes[0].text != nodes[1].text:
                import pdb; pdb.set_trace()
                raise ValueError('Page navigation rows don\'t match.')
Ejemplo n.º 14
0
 def test_empty_row_second_insert(self):
     """An empty row acts like any other row."""
     truck = DumpTruck(dbname="/tmp/test.db")
     truck.create_table({"foo": "uhtnh", "bar": "aoue"}, "nine")
     truck.insert({}, "nine")
     row_count = truck.execute("select count(*) as c from nine")[0]["c"]
     truck.close()
     self.assertEqual(row_count, 1)
Ejemplo n.º 15
0
 def test_second_insert(self):
     """Inserting a second row that is all null adds an empty row."""
     truck = DumpTruck(dbname="/tmp/test.db")
     truck.create_table({"foo": "uhtnh", "bar": "aoue"}, "three")
     truck.insert({"foo": None, "bar": None}, "three")
     row_count = truck.execute("select count(*) as c from three")[0]["c"]
     truck.close()
     self.assertEqual(row_count, 1)
Ejemplo n.º 16
0
 def test_no_rows_second_insert(self):
     """Nothing happens if no rows are inserted to a table that is there."""
     truck = DumpTruck(dbname="/tmp/test.db")
     truck.create_table({"foo": "uhtnh", "bar": "aoue"}, "ninety")
     truck.insert([], "ninety")
     row_count = truck.execute("select count(*) as c from ninety")[0]["c"]
     truck.close()
     self.assertEqual(row_count, 0)
Ejemplo n.º 17
0
 def test_no_rows_second_insert(self):
   """Nothing happens if no rows are inserted to a table that is there."""
   truck = DumpTruck(dbname = '/tmp/test.db')
   truck.create_table({'foo': 'uhtnh', 'bar': 'aoue'}, 'ninety')
   truck.insert([], 'ninety')
   row_count = truck.execute('select count(*) as c from ninety')[0]['c']
   truck.close()
   self.assertEqual(row_count, 0)
Ejemplo n.º 18
0
 def test_second_insert(self):
   """Inserting a second row that is all null adds an empty row."""
   truck = DumpTruck(dbname = '/tmp/test.db')
   truck.create_table({'foo': 'uhtnh', 'bar': 'aoue'}, 'three')
   truck.insert({'foo': None, 'bar': None}, 'three')
   row_count = truck.execute('select count(*) as c from three')[0]['c']
   truck.close()
   self.assertEqual(row_count, 1)
Ejemplo n.º 19
0
 def test_empty_row_second_insert(self):
   """An empty row acts like any other row."""
   truck = DumpTruck(dbname = '/tmp/test.db')
   truck.create_table({'foo': 'uhtnh', 'bar': 'aoue'}, 'nine')
   truck.insert({}, 'nine')
   row_count = truck.execute('select count(*) as c from nine')[0]['c']
   truck.close()
   self.assertEqual(row_count, 1)
def main():
    """Deduplicate the 'socrata' table into 'socrata_deduplicated'.

    ``build_network`` and ``dedupe`` are module-level names defined
    elsewhere; 'edges' describes which datasets duplicate each other.
    """
    edges = build_network()['edges']

    dt = DumpTruck(dbname = '/tmp/open-data.sqlite', adapt_and_convert = True)
    datasets_in = dt.execute('SELECT * FROM socrata')

    # Seed the schema so the unique index on id can be created.
    dt.create_table({'id': 'blah-blah'}, 'socrata_deduplicated')
    dt.create_index(['id'], 'socrata_deduplicated', if_not_exists = True, unique = True)

    for dataset in dedupe(datasets_in, edges):
        dt.upsert(dataset, 'socrata_deduplicated')
Ejemplo n.º 21
0
    def test_non_unique(self):
        """create_index without unique=True builds a non-unique two-column index."""
        truck = DumpTruck(dbname="/tmp/test.db")
        truck.execute("create table tomato (bar integer, baz integer);")
        truck.create_index(["bar", "baz"], "tomato")
        observed = truck.execute("PRAGMA index_info(tomato_bar_baz)")

        # The index exists at all.
        self.assertIsNotNone(observed)

        # It covers exactly bar, then baz.
        expected = [
            {u"seqno": 0, u"cid": 0, u"name": u"bar"},
            {u"seqno": 1, u"cid": 1, u"name": u"baz"},
        ]
        self.assertListEqual(observed, expected)

        # And it is flagged non-unique. A missing index yields {} and
        # hence a KeyError below, same as the original for/else form.
        all_indices = truck.execute("PRAGMA index_list(tomato)")
        index = next((entry for entry in all_indices if entry[u"name"] == u"tomato_bar_baz"), {})
        self.assertEqual(index[u"unique"], 0)
Ejemplo n.º 22
0
def extract_dataset_table_info():
    """Walk data/<portal>/views/<viewid> and upsert table info rows.

    Skips (portal, viewid) pairs already present in the database;
    ``_dataset_table_info`` is defined elsewhere in this file.
    """
    dt = DumpTruck(dbname = '/tmp/table_info.db')
    dt.create_table({'portal': 'abc', 'id': 'abcd-efgh'}, 'table_info')
    dt.create_index(['portal', 'id'], 'table_info', unique = True)
    dt.create_index(['tableId'], 'table_info', unique = False)
    # BUG FIX: the original built the set from tuple(row.keys()) — the
    # column NAMES — so 'done' was just {('portal', 'id')} and no work
    # was ever skipped. We need the column values.
    done = set((row['portal'], row['id']) for row in dt.execute('SELECT portal, id FROM table_info'))
    for portal in os.listdir('data'):
        for viewid in os.listdir(os.path.join('data', portal, 'views')):
            if (portal, viewid) in done:
                continue
            d = _dataset_table_info(portal, viewid)
            if d is None:  # helper returns None when there is nothing to record
                continue
            dt.upsert(d, 'table_info')
Ejemplo n.º 23
0
def get_uid_from_uid_or_nick(db, uid_or_nick):
    """Resolve *uid_or_nick* against log_status, returning a uid.

    Raises ValueError when the nick maps to zero or multiple uids.
    """
    dt = DumpTruck(db)

    # If the value appears as a uid, it already is one.
    sql = 'SELECT count(*) FROM log_status WHERE uid = ?'
    if dt.execute(sql, [uid_or_nick])[0]['count(*)'] > 0:
        return uid_or_nick

    # Otherwise treat it as a nick; it must map to exactly one uid.
    sql = 'SELECT count(DISTINCT uid) FROM log_status WHERE nick = ?'
    if dt.execute(sql, [uid_or_nick])[0]['count(DISTINCT uid)'] != 1:
        # Zero or several people use this name.
        raise ValueError('The name you specified does not refer uniquely to one user.')

    sql = 'SELECT uid FROM log_status WHERE nick = ? LIMIT 1'
    return dt.execute(sql, [uid_or_nick])[0]['uid']
Ejemplo n.º 24
0
class AuctionsPipeline(object):
    """Scrapy pipeline: normalizes auction items and stores them in SQLite."""

    def open_spider(self, spider):
        # Cache already-stored ids so duplicates can be dropped later.
        self.dt = DumpTruck(dbname=settings['DB_PATH'], auto_commit=True)

        id_data = self.dt.execute('SELECT id FROM auctions')
        self.ids = [x['id'] for x in id_data]

    def process_item(self, item, spider):
        """Flatten, clean and insert one auction; drop known duplicates."""
        # Only handle spiders that opted into this pipeline.
        if 'auctions' not in getattr(spider, 'pipelines', []):
            return item

        # Scrapy fields arrive as lists of strings; flatten each one.
        item['id'] = int(item['id'][0])
        item['auctioneer'] = ' '.join(item['auctioneer'])
        item['contact_number'] = ' '.join(item['contact_number'])
        item['date'] = '%s %s' % (' '.join(item['date']), ' '.join(
            item['time']))
        item['location'] = ' '.join(item['location'])
        item['link'] = ' '.join(item['link'])
        item['listing'] = ' '.join(item['listing'])

        #format phonenumber
        parsed_number = phonenumbers.parse(item['contact_number'], 'US')
        # NOTE(review): format_number's second argument is normally a
        # PhoneNumberFormat constant (e.g. PhoneNumberFormat.NATIONAL);
        # passing a PhoneNumber() instance looks wrong — confirm.
        item['contact_number'] = phonenumbers.format_number(
            parsed_number, phonenumbers.PhoneNumber())

        # format listing / remove any html cludge
        soup_listing = BeautifulSoup(item['listing'])
        item['listing'] = soup_listing.get_text()

        # format date and time to standard format
        dt = parse(item['date'])
        item['date'] = dt.datetime.strftime('%Y-%m-%d %H:%M:%S')

        if item['id'] in self.ids:
            raise DropItem('Dupe auction stored, ignoring listing: %s' % item)
        else:
            self.dt.insert(
                {
                    'id': item['id'],
                    'auctioneer': item['auctioneer'],
                    'contact_number': item['contact_number'],
                    'date': item['date'],
                    'location': item['location'],
                    'link': item['link'],
                    'listing': item['listing'],
                }, 'auctions')

            return item
Ejemplo n.º 25
0
def check_links():
    """Check liveness of unchecked links, writing results back to 'links'.

    Spawns 100 worker threads that call links.is_alive() and one writer
    thread that serializes all SQLite updates.
    """
    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit = False)
    dt.create_index(['url'], 'links', if_not_exists = True, unique = False)
    dt.create_index(['status_code'], 'links', if_not_exists = True, unique = False)

    # Source
    urls = Queue()
    # NOTE(review): substr(30, 100) takes a substring of the literal 30,
    # not of url — presumably substr(url, 30, 100) was meant; confirm.
    sql = '''
SELECT DISTINCT url
FROM links
WHERE (status_code = -42 OR status_code IS NULL) AND is_link AND url NOT NULL
ORDER BY status_code, substr(30, 100);
'''
    # Order by the substring so that we randomly bounce around catalogs

    url_list = [row['url'] for row in dt.execute(sql)]
    for url in url_list:
        urls.put(url)

    # Sink to the database
    def _db(queue):
        # Single writer thread: serializes all SQLite writes.
        dt = DumpTruck('/tmp/open-data.sqlite')
        while True:
            dt.execute(*queue.get())
    db_updates = Queue()
    db_thread = Thread(None, target = _db, args = (db_updates,))
    db_thread.start()

    # Check links
    def _check_link(url_queue):
        # Worker: probe each URL and queue the resulting status update.
        while not urls.empty():
            url = url_queue.get()
            if url == None:
                raise ValueError('url is None')
            status_code, headers, error = links.is_alive(url)
            sql = 'UPDATE links SET status_code = ?, headers = ?, error = ? WHERE is_link = 1 AND url = ?'
            db_updates.put((sql, (status_code, headers, error, url)))
            print(url)

    # NOTE(review): workers are never joined and the writer never exits.
    threads = {}
    for i in range(100):
        threads[i] = Thread(None, target = _check_link, args = (urls,))

    for thread in threads.values():
        thread.start()
Ejemplo n.º 26
0
class AuctionsPipeline(object):
    """Scrapy pipeline: normalizes auction items and stores them in SQLite."""

    def open_spider(self, spider):
        # Cache already-stored ids so duplicates can be dropped later.
        self.dt = DumpTruck(dbname=settings['DB_PATH'],auto_commit=True)

        id_data = self.dt.execute('SELECT id FROM auctions')
        self.ids = [x['id'] for x in id_data]

    def process_item(self, item, spider):
        """Flatten, clean and insert one auction; drop known duplicates."""
        # Only handle spiders that opted into this pipeline.
        if 'auctions' not in getattr(spider,'pipelines',[]):
            return item

        # Scrapy fields arrive as lists of strings; flatten each one.
        item['id'] = int(item['id'][0])
        item['auctioneer'] = ' '.join(item['auctioneer'])
        item['contact_number'] = ' '.join(item['contact_number'])
        item['date'] = '%s %s' % (' '.join(item['date']), ' '.join(item['time']))
        item['location'] = ' '.join(item['location'])
        item['link'] = ' '.join(item['link'])
        item['listing'] = ' '.join(item['listing'])

        #format phonenumber
        parsed_number = phonenumbers.parse(item['contact_number'],'US')
        # NOTE(review): format_number's second argument is normally a
        # PhoneNumberFormat constant; a PhoneNumber() instance looks wrong.
        item['contact_number'] = phonenumbers.format_number(parsed_number, phonenumbers.PhoneNumber())

        # format listing / remove any html cludge
        soup_listing = BeautifulSoup(item['listing'])
        item['listing'] = soup_listing.get_text()

        # format date and time to standard format
        dt = parse(item['date'])
        item['date'] = dt.datetime.strftime('%Y-%m-%d %H:%M:%S')

        if item['id'] in self.ids:
            raise DropItem('Dupe auction stored, ignoring listing: %s' % item)
        else:
            self.dt.insert({
                'id': item['id'],
                'auctioneer': item['auctioneer'],
                'contact_number': item['contact_number'],
                'date': item['date'],
                'location': item['location'],
                'link': item['link'],
                'listing': item['listing'],
            }, 'auctions')

            return item
Ejemplo n.º 27
0
 def test_select(self):
     """SELECT through DumpTruck returns rows as dicts matching the fixture exactly."""
     shutil.copy(u"fixtures/landbank_branches.sqlite", u".")
     h = DumpTruck(dbname=u"landbank_branches.sqlite")
     data_observed = h.execute(u"SELECT * FROM `branches` WHERE Fax is not null ORDER BY Fax LIMIT 3;")
     # Expected rows copied verbatim from the fixture database.
     data_expected = [
         {
             "town": u"\r\nCenturion",
             "date_scraped": 1327791915.618461,
             "Fax": u" (012) 312 3647",
             "Tel": u" (012) 686 0500",
             "address_raw": u"\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001\n (012) 686 0500\n (012) 312 3647",
             "blockId": 14,
             "street-address": None,
             "postcode": u"\r\n0001",
             "address": u"\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001",
             "branchName": u"Head Office",
         },
         {
             "town": u"\r\nCenturion",
             "date_scraped": 1327792245.787187,
             "Fax": u" (012) 312 3647",
             "Tel": u" (012) 686 0500",
             "address_raw": u"\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001\n (012) 686 0500\n (012) 312 3647",
             "blockId": 14,
             "street-address": u"\r\n420 Witch Hazel Ave\n\r\nEcopark",
             "postcode": u"\r\n0001",
             "address": u"\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001",
             "branchName": u"Head Office",
         },
         {
             "town": u"\r\nMiddelburg",
             "date_scraped": 1327791915.618461,
             "Fax": u" (013) 282 6558",
             "Tel": u" (013) 283 3500",
             "address_raw": u"\r\n184 Jan van Riebeeck Street\n\r\nMiddelburg\n\r\n1050\n (013) 283 3500\n (013) 282 6558",
             "blockId": 17,
             "street-address": None,
             "postcode": u"\r\n1050",
             "address": u"\r\n184 Jan van Riebeeck Street\n\r\nMiddelburg\n\r\n1050",
             "branchName": u"Middelburg",
         },
     ]
     self.assertListEqual(data_observed, data_expected)
     os.remove("landbank_branches.sqlite")
Ejemplo n.º 28
0
def avail_within(db, start=0, end=2**32):
    '''
    Given a database filename, uid and a time range (POSIX times),
    return how long everyone was online, in seconds.
    '''

    # Database connection
    dt = DumpTruck(db)
    updates = dt.execute('''
        SELECT uid, ts, status
        FROM log_status
        WHERE ts > ? AND ts < ?
        ORDER BY uid, ts
        ;''', [start, end])

    # This loop depends on the ORDER BY above: each uid's updates arrive
    # as one contiguous, time-sorted run.
    total_time = {}
    prev_time = None
    for u in updates:

        if u['status'] == 'notavail' and (u['uid'] not in total_time):
            # First time we see person, but it's 'notavail', so skip it.
            continue

        elif u['status'] == 'avail' and (u['uid'] not in total_time):
            # First time we see person becoming available
            total_time[u['uid']] = 0

        elif u['status'] == 'avail':
            # The person went avail (again). BUG FIX: the original
            # 'continue'd here and thus skipped the prev_time update
            # below (despite its comment saying otherwise), so an avail
            # following a notavail counted the offline gap as online
            # time. Fall through so prev_time advances.
            pass

        elif u['status'] == 'notavail':
            # The person went notavail, so record the time when the user was available.
            total_time[u['uid']] += u['ts'] - prev_time

        else:
            raise ValueError('The update\'s status "%s" is neither "avail" nor "notavail."' % u['status'])

        # Record the current timestamp as prev_time
        prev_time = u['ts']

    return total_time
Ejemplo n.º 29
0
        (row['parish'], row['acreage'])
        for row in dt.execute(sql)
    }


# scott appears to map parish name -> (raw parish string, acreage);
# see scott_data() above — confirm against its return value.
scott = scott_data()
parishes = json.load(open('parishes.json'))

# Annotate every parish feature with its impacted acreage, both raw and
# as a proportion of the hardest-hit parish.
max_impacted_acres = max([v[1] for v in scott.values()])
for feature in parishes['features']:
    feature['properties']['impacted_acres'] = scott.get(
        feature['properties']['COUNTY'], (None, 0))[1]
    feature['properties']['impacted_acres_prop_max'] = scott.get(
        feature['properties']['COUNTY'], (None, 0))[1] / max_impacted_acres

    # NOTE(review): 'dt' is defined earlier in this file, outside this view.
    sql = '''
SELECT "permitApplicationNumber", "projectDescription", "acreage", "parish"
FROM application
WHERE "parish" = ? AND "type" = 'impact'
'''
    if feature['properties']['COUNTY'] in scott:
        applications = dt.execute(sql,
                                  [scott[feature['properties']['COUNTY']][0]])
        for a in applications:
            # Normalize e.g. 'SAINT BERNARD' to 'ST BERNARD'.
            a['parish'] = a['parish'].upper().replace('SAINT', 'ST')
        feature['properties']['applications'] = applications
    else:
        feature['properties']['applications'] = []

json.dump(parishes, open('impacts.json', 'w'))
Ejemplo n.º 30
0
            print tostring(tr)
            raise

    return map(do_row, trs[2:])


# Schema
dt = DumpTruck(dbname='/tmp/finalip.db')
dt.create_table({u'DA Number': u'NAE-2009-01067'},
                'finalip',
                if_not_exists=True)
# NOTE(review): 'Da Number' vs the column's 'DA Number' — SQLite treats
# ASCII identifiers case-insensitively, so this still matches.
dt.create_index(['Da Number'], 'finalip', unique=True, if_not_exists=True)

# Skip finished stuff. (Year, Month, Page columns are presumably added
# by later inserts of read_finalip rows — verify.)
pages = set([(row['Year'], row['Month'], row['Page'])
             for row in dt.execute('SELECT Year, Month, Page FROM finalip')])

# Populate
for dirname, subdirnames, filenames in os.walk(
        os.path.join(os.environ['READER_ROOT'], '..', 'finalips')):
    # Only leaf directories (year/month) contain page files.
    if subdirnames != []:
        continue
    for filename in filenames:
        # Directory layout is .../<year>/<month>/<page-file>.
        year, month = map(int, dirname.split('/')[-2:])
        page = (year, month, filename)
        if page in pages:
            continue

        path = os.path.join(dirname, filename)
        try:
            data = read_finalip(path)
0
    trs = html.xpath('//table[@style="border-collapse: collapse; width: 100%;"]/descendant::tr')
    def do_row(tr):
        try:
            return l.parse_row(tr)
        except:
            print tostring(tr)
            raise
    return map(do_row, trs[2:])

# Schema
dt = DumpTruck(dbname = '/tmp/finalip.db')
dt.create_table({u'DA Number': u'NAE-2009-01067'}, 'finalip', if_not_exists = True)
# NOTE(review): 'Da Number' vs 'DA Number' — SQLite identifiers are
# case-insensitive for ASCII, so the index still targets the right column.
dt.create_index(['Da Number'], 'finalip', unique = True, if_not_exists = True)

# Skip finished stuff
pages = set([(row['Year'], row['Month'], row['Page']) for row in dt.execute('SELECT Year, Month, Page FROM finalip')])

# Populate
for dirname, subdirnames, filenames in os.walk(os.path.join(os.environ['READER_ROOT'], '..', 'finalips')):
    # Only leaf directories (year/month) contain page files.
    if subdirnames != []:
        continue
    for filename in filenames:
        # Directory layout is .../<year>/<month>/<page-file>.
        year, month = map(int, dirname.split('/')[-2:])
        page = (year, month, filename)
        if page in pages:
            continue

        path = os.path.join(dirname, filename)
        try:
            data = read_finalip(path)
        except:
Ejemplo n.º 32
0
 def test_question_mark(self):
   """'?' placeholder binding round-trips a value through SQLite."""
   truck = DumpTruck(dbname = '/tmp/test.db')
   truck.execute('CREATE TABLE foo (bar TEXT)')
   truck.execute('INSERT INTO foo(bar) VALUES (?)', ['baz'])
   rows = truck.execute('SELECT bar from foo')
   self.assertListEqual([record['bar'] for record in rows], ['baz'])
Ejemplo n.º 33
0
                        'act', 'num', 'cik', 'name', 'filename', 'already',
                        'already_downloaded', 'exists', 'link'
                    ],
                    delimiter=";")
dt.writerow(
    dict(act="Aktuellster Bericht",
         num="Anzahl der Berichte",
         cik="Central Index Key",
         name="Name",
         already='frühere analysiert?',
         already_downloaded='frühere heruntergeladen?',
         exists='vorhanden?',
         filename='Aktuellster Bericht (lokal)',
         link="Aktuellster Bericht (Internet)"))
for o in store.execute(
        "select cik,name,count(date) as num,max(date) as act from docs group by cik order by act desc"
):
    details = store.execute(
        "select date,cik,doc from docs where cik='%s' order by date desc limit 1"
        % o["cik"])[0]
    o["link"] = details["doc"]
    details["fn"] = os.path.split(details["doc"])[1]
    o["filename"] = "%(date)s-%(cik)s-%(fn)s" % details
    o["exists"] = os.path.exists(
        os.path.join("/home/martin/Dropbox/blackrock-scraper/data",
                     o["filename"]))
    if not o["exists"]:
        o["already"] = "ea:%s" % already.get(o["cik"], 'none analyzed')
        o["already_downloaded"] = "ed:%s" % already.get(
            o["cik"], 'none downloaded')
    else:
# <codecell>
# Re-fetch events whose stored error mentions JSON, until interrupted.
import os

# NOTE(review): "__file__" is a string literal here, so _here is always
# '' — presumably __file__ (no quotes) was meant; confirm.
_here=os.path.split("__file__")[0]

from dumptruck import DumpTruck
dt=DumpTruck(dbname=os.path.join(_here,"data/events.sqlite"))
from collections import defaultdict
from slscraper import eventdata 

# NOTE(review): d is never set to False — the loop only ends via an
# unhandled exception or Ctrl-C.
d=True
c=1


while d: 
	# Pick one random failed event and retry it.
	r=dt.execute("select id from events where error like '%JSON%' order by random() limit 1")
	if len(r)>0 :
		rr=r[0]["id"]
		try :	
			dd=eventdata("%s" % rr)
			if 'error' not in dd :
				dd["error"]="--"
			dt.upsert(dd,"events")
			print "%(id)s %(title)s: %(error)s" % defaultdict(lambda : "-", dd)
			c+=1
			# Every 10 successes, report how many failures remain.
			if (c % 10)==0 :
				r=dt.execute("select count(*) as c from events where error like '%JSON%'")
				if len(r)>0 :
					print "still %s to go" % r[0]["c"]
		except Exception, e:
			print "ERROR %s %s" % (rr,e)
Ejemplo n.º 35
0
#!/usr/bin/env python2
"""Load every catalogs/*.json file into the 'catalog' SQLite table."""
import os, json
from dumptruck import DumpTruck

dt = DumpTruck(dbname='/tmp/catalog.db')

# Composite primary key makes (portal, identifier) unique.
dt.execute('''
CREATE TABLE IF NOT EXISTS "catalog" (
  "portal" TEXT NOT NULL,
  "identifier" TEXT NOT NULL,
  PRIMARY KEY ("portal", "identifier")
);''')

for data_json in os.listdir('catalogs'):
    # Load into memory, dropping the first (header) element. The 'with'
    # closes the handle promptly; the original leaked it until GC.
    with open(os.path.join('catalogs', data_json)) as fp:
        data = json.load(fp)[1:]

    # Tag every row with the portal it came from (the filename stem).
    portal = data_json.replace('.json', '')
    for row in data:
        row['portal'] = portal

    # Put in the database.
    dt.insert(data, 'catalog')