/
Source.py
168 lines (137 loc) · 4.62 KB
/
Source.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
"""Base class for Parser (and unimplemented Downloader)"""
#Libs
import codecs
import os
import re
import sys
import time
from tempfile import mktemp
#3rd party libs
from rdflib import RDFS
from rdflib.Graph import Graph
from genshi.template import TemplateLoader
#Own libs
import config
import Util
# Working directory of the process at import time. Parser.__init__ appends
# "/etc/authrec.n3" to this to locate the authority record file.
__scriptDir__ = os.getcwd()
class ParseError(Exception):
    """Exception type for errors raised while parsing a document."""
class Downloader(object):
    """Placeholder base class for downloaders; it defines no behaviour."""
class Parser(object):
    """Abstract base class for a document parser.

    Subclasses implement Parse(). This base class renders a document
    through a Genshi template (generateXhtml) and maps labels to URIs
    using authority records loaded from an N3 file (loadAuthRec,
    findAuthRec).
    """

    # Bound ``sub`` method of a compiled whitespace-run pattern; call it as
    # reNormalizedSpace(replacement, text).
    reNormalizedSpace = re.compile(r'\s+',).sub

    def __init__(self):
        """Load the authority records from etc/authrec.n3 below
        __scriptDir__ into self.authRec."""
        self.authRec = self.loadAuthRec(__scriptDir__ + "/etc/authrec.n3")

    def Parse(self):
        """Parse a document. Must be implemented by subclasses."""
        raise NotImplementedError

    def generateXhtml(self, meta, body, registry, module, globals):
        """Create an XHTML representation of the document.

        ``meta``, ``body`` and ``registry`` are passed to the template under
        those names, and every entry of the ``globals`` mapping is passed as
        an additional template variable. ``module`` selects the template
        file etc/<module>.template.xht2, which is looked up in the current
        directory and in the directory of this source file.

        Returns the rendered document. If the output contains an element
        opened with class="warning">, its text up to the next "</" is
        written to stderr as a warning; the rendered document is returned
        either way.
        """
        loader = TemplateLoader(['.', os.path.dirname(__file__)],
                                variable_lookup='lenient')
        template = loader.load('etc/%s.template.xht2' % module)
        stream = template.generate(meta=meta, body=body, registry=registry,
                                   **globals)
        res = stream.render()

        # Search for the exact marker that delimits the message, so that a
        # class="warning" attribute followed by other attributes (or an
        # unterminated element) cannot raise ValueError here.
        marker = 'class="warning">'
        start = res.find(marker)
        if start != -1:
            start += len(marker)
            end = res.find('</', start)
            if end != -1:
                # NOTE(review): assumes res is a UTF-8 byte string and that
                # Util.normalizedSpace returns a unicode string -- confirm.
                msg = Util.normalizedSpace(res[start:end].decode('utf-8'))
                sys.stderr.write("WARNING: %s\n" % msg.encode('utf-8'))
        return res

    def loadAuthRec(self, n3File):
        """Load an RDF graph with authority posts in N3 format.

        Returns a dict that maps every rdfs:label value in the graph to the
        URI of its subject, both converted with unicode().
        """
        graph = Graph()
        graph.load(Util.relpath(n3File), format='n3')
        labelToUri = {}
        for uri, label in graph.subject_objects(RDFS.label):
            labelToUri[unicode(label)] = unicode(uri)
        return labelToUri

    def findAuthRec(self, label):
        """Given a string that refers to some type of organisation, person
        etc., return a URI for it.

        A record matches when ``label`` starts with the record's key,
        compared case-insensitively. When several keys match, the longest
        one wins (ties are broken alphabetically) so the result does not
        depend on dict iteration order. Returns None if nothing matches.
        """
        wanted = label.lower()
        candidates = [key for key in self.authRec
                      if wanted.startswith(key.lower())]
        if not candidates:
            #TODO: Add 'fuzz' to find close matches.
            #fuzz = difflib.get_close_matches(label, keys, 1, 0.8) ...
            return None
        best = sorted(candidates, key=lambda key: (-len(key), key))[0]
        return self.storageUri(self.authRec[best])

    def storageUri(self, value):
        """Return ``value`` with every space replaced by an underscore."""
        return value.replace(" ", '_')
class Controller(object):
    """Base class that runs Parse/Generate over the files of a module.

    Subclasses must implement Parse() and Generate(). __init__ also calls
    self._get_module_dir(), which this class does not define.
    """

    def __init__(self):
        """Set moduleDir and baseDir (the config.datadir directory next to
        this source file)."""
        self.moduleDir = self._get_module_dir()
        # dirname() is '' when __file__ has no directory part; without the
        # fallback baseDir would become the absolute path "/<datadir>".
        sourceDir = os.path.dirname(__file__) or os.curdir
        self.baseDir = sourceDir + os.path.sep + config.datadir

    ## Controller Interface def. Subclasses must implement these ##
    def Parse(self, f):
        """Parse the file identified by ``f``. Must be overridden."""
        raise NotImplementedError

    def ParseAll(self):
        """Parse all the legal documents that we have downloaded"""
        #TODO: Fixme
        # NOTE(review): the suffix here is 'html' while GenerateAll passes
        # '.xht2' (with a dot) -- confirm which form Util.listDirs expects.
        dlDir = os.path.sep.join([self.baseDir, self.moduleDir, u'dl'])
        self._runMethod(dlDir, 'html', self.Parse)

    def Generate(self, f):
        """Generate HTML from the parsed files"""
        raise NotImplementedError

    def GenerateAll(self):
        """Run Generate on the .xht2 files below <baseDir>/<moduleDir>/parsed."""
        parsed = os.path.sep.join([self.baseDir, self.moduleDir, u'parsed'])
        self._runMethod(parsed, '.xht2', self.Generate)

    ## Useable functions for subclasses, can be overriden ##
    def _trimFileName(self, files):
        """Transforms a filename to a id, foo/bar/sfs/01.txt becomes sfs/01

        Generator: yields one id per entry of ``files``, built from the last
        two path components with the file extension removed.
        """
        for f in files:
            tail = os.path.normpath(Util.relpath(f)).split(os.sep)[-2:]
            stem = os.path.splitext(os.sep.join(tail))[0]
            fileName = "/".join(os.path.split(stem))
            if fileName:
                yield fileName

    def _runMethod(self, dir, suffix, method):
        """Call ``method`` with the id of every file that Util.listDirs
        returns for ``dir`` and ``suffix``.

        With config.debug set, each file and the method are printed before
        the call. With config.benchmark set, a line "<count> , <file> ,
        <seconds>" is printed per file, where seconds is the time elapsed
        since this run started (not the time for the single file).
        Exceptions raised by ``method`` propagate to the caller.
        """
        files = self._trimFileName(Util.listDirs(dir, suffix, reverse=True))
        nrOfFiles = 0
        startTime = time.time()
        for f in files:
            if config.debug:
                print("Running file:  %s  with method:  %s" % (f, method))
            if config.benchmark:
                nrOfFiles += 1
                # No newline yet: the elapsed time is appended after the call.
                sys.stdout.write("%d , %s" % (nrOfFiles, f))
            method(f)
            if config.benchmark:
                sys.stdout.write(" , %s\n" % (time.time() - startTime))

    def _fileUpToDate(self, infiles, outfile):
        """Check if the outfile is up-to-date, then there's no need to regenerate.

        Returns False when ``outfile`` does not exist or when any existing
        file in ``infiles`` has a newer modification time; True otherwise.
        Input files that do not exist are ignored.
        """
        if not os.path.exists(outfile):
            return False
        #TODO: Add lib for timeing!
        outTime = os.stat(outfile).st_mtime
        for infile in infiles:
            if os.path.exists(infile) and os.stat(infile).st_mtime > outTime:
                return False
        return True

    def _htmlName(self, f):
        """Return a XHTML file name for the given file

        Raises Exception if ``f`` is not a unicode object.
        """
        if not isinstance(f, unicode):
            raise Exception("WARNING: _htmlName called with non unicode name")
        return u'%s/%s/generated/%s.html' % (self.baseDir, self.moduleDir, f)

    def _xmlName(self, f):
        """Returns a XML file name for the given file

        Raises Exception if ``f`` is not a unicode object.
        """
        if not isinstance(f, unicode):
            raise Exception("WARNING: _xmlName called with non unicode name")
        return u'%s/%s/parsed/%s.xht2' % (self.baseDir, self.moduleDir, f)

    def _dependName(self, f):
        """Return the name of the .deps file for the given file."""
        return u'%s/%s/intermediate/%s.deps' % (self.baseDir, self.moduleDir, f)

    def _loadDepends(self, f):
        """Return the lines of the .deps file for ``f`` as a list.

        The file is read as UTF-8 and each line is stripped of surrounding
        whitespace. Returns an empty list when the file does not exist.
        """
        dependFile = self._dependName(f)
        depends = []
        if os.path.exists(dependFile):
            fp = codecs.open(dependFile, encoding='utf-8')
            # try/finally so the handle is closed even if decoding fails.
            try:
                for dep in fp:
                    depends.append(dep.strip())
            finally:
                fp.close()
        return depends