forked from netguy204/scan_server
/
dbm_to_mongo.py
56 lines (42 loc) · 1.34 KB
/
dbm_to_mongo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python
# Copyright 2010, Brian Taylor
# Distributed under the GNU General Public License
# Utility for migrating data from a dbm-based store to a MongoDB-based store.
import os
import sys
import dbm
import scan_data
import pymongo
import json
def _page_keys_on_disk(root):
    """Return the set of "page-*" file stems found directly inside *root*."""
    found = set()
    for _base, dirs, files in os.walk(root, topdown=True):
        # Emptying ``dirs`` in place during a top-down walk stops os.walk
        # from descending.  The previous ``del dirs[0:-1]`` left the last
        # subdirectory in the list, so the walk still recursed into it.
        del dirs[:]

        for fname in files:
            stem = os.path.splitext(fname)[0]
            if stem.startswith("page-"):
                found.add(stem)
    return found


def main(args):
    """Copy documents and pages from the "scan_data" dbm store into MongoDB.

    Steps, in order:

    1. Open the dbm file "scan_data" read-only and connect to MongoDB via
       ``pymongo.Connection()`` with its default arguments.
    2. Insert every document returned by ``scan_data.get_documents`` into
       the ``scanserver.documents`` collection, collecting the key of every
       page those documents reference.
    3. Read each collected page from the dbm store and insert it into the
       ``scanserver.pages`` collection.
    4. Report how many "page-*" files sitting directly in the "static"
       directory have no matching page key among the migrated pages.

    Progress is reported on stdout.  The single-argument ``print(...)``
    form is used so the output is identical under the Python 2 print
    statement while the code also parses on Python 3.

    Args:
        args: command-line arguments (without the program name).
            Currently unused.

    Returns:
        None, which ``sys.exit`` treats as a successful exit status.
    """
    db = dbm.open("scan_data", "r")
    try:
        print("loaded database")

        mdb = pymongo.Connection()
        print("connected to mongo")

        docs = scan_data.get_documents(db)
        print("loaded %d documents" % len(docs))

        mdb_docs = mdb.scanserver.documents

        # Go through each document, remember its page keys and store it.
        pagekeys = set()
        for doc in docs:
            # NOTE(review): despite the name, doc2json presumably returns
            # something insert() accepts (a mapping) -- confirm in scan_data.
            docstr = scan_data.doc2json(doc)
            pagekeys.update(page.key() for page in doc.pages())
            mdb_docs.insert(docstr)

        print("found %d pages" % len(pagekeys))

        mdb_pages = mdb.scanserver.pages
        for pk in pagekeys:
            page = scan_data.read_page(pk, db)
            pagestr = scan_data.page2json(page)
            mdb_pages.insert(pagestr)
    finally:
        # Release the dbm handle even if the migration fails part-way.
        db.close()

    notindb = _page_keys_on_disk("static") - pagekeys
    print("found %d page keys on disk that aren't in the database" % len(notindb))
# Script entry point: run the migration only when this file is executed
# directly, and hand main()'s result to the interpreter as the exit status.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    exit_status = main(cli_args)
    sys.exit(exit_status)