forked from mit-dig/air-reasoner
/
cwm_string.py
executable file
·592 lines (459 loc) · 18.9 KB
/
cwm_string.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
#! /usr/bin/python
"""
$Id: cwm_string.py,v 1.36 2007/06/26 02:36:15 syosi Exp $
String built-ins for cwm
This started as http://www.w3.org/2000/10/swap/string.py
See cwm.py
"""
import string
import re
from diag import verbosity, progress
import urllib # for hasContent
from term import LightBuiltIn, ReverseFunction, Function
from local_decimal import Decimal
# Presumably the prefix of a data: URI that carries an N3 literal.
# NOTE(review): not referenced anywhere else in the visible part of this file.
LITERAL_URI_prefix = "data:text/rdf+n3;"
# Namespace of the string built-ins; register() drops the trailing "#"
# before interning the individual fragment names.
STRING_NS_URI = "http://www.w3.org/2000/10/swap/string#"
###############################################################################################
#
# S T R I N G B U I L T - I N s
#
# This should be in a separate module, imported and called once by the user
# to register the code with the store
#
# Light Built-in classes
class BI_GreaterThan(LightBuiltIn):
    """Holds when the subject string compares greater than the object string."""
    def eval(self, subj, obj, queue, bindings, proof, query):
        left = subj.string
        right = obj.string
        return left > right
class BI_NotGreaterThan(LightBuiltIn):
    """Holds when the subject string compares less than or equal to the object string."""
    def eval(self, subj, obj, queue, bindings, proof, query):
        left = subj.string
        right = obj.string
        return left <= right
class BI_LessThan(LightBuiltIn):
    """Holds when the subject string compares less than the object string."""
    def eval(self, subj, obj, queue, bindings, proof, query):
        left = subj.string
        right = obj.string
        return left < right
class BI_NotLessThan(LightBuiltIn):
    """Holds when the subject string compares greater than or equal to the object string."""
    def eval(self, subj, obj, queue, bindings, proof, query):
        left = subj.string
        right = obj.string
        return left >= right
class BI_StartsWith(LightBuiltIn):
    """Holds when the subject string begins with the object string."""
    def eval(self, subj, obj, queue, bindings, proof, query):
        whole = subj.string
        prefix = obj.string
        return whole.startswith(prefix)
class BI_EndsWith(LightBuiltIn):
    """Holds when the subject string finishes with the object string."""
    def eval(self, subj, obj, queue, bindings, proof, query):
        whole = subj.string
        suffix = obj.string
        return whole.endswith(suffix)
# Added, SBP 2001-11:-
class BI_Contains(LightBuiltIn):
    """Holds when the object string occurs somewhere inside the subject string."""
    def eval(self, subj, obj, queue, bindings, proof, query):
        haystack = subj.string
        needle = obj.string
        return needle in haystack
class BI_ContainsIgnoringCase(LightBuiltIn):
    """Like BI_Contains, but both strings are lower-cased before the test."""
    def eval(self, subj, obj, queue, bindings, proof, query):
        haystack = subj.string.lower()
        needle = obj.string.lower()
        return needle in haystack
class BI_ContainsRoughly(LightBuiltIn):
    """Containment test that ignores case and differences in whitespace runs.

    Both strings are lower-cased and passed through normalizeWhitespace()
    before the object is looked for inside the subject.
    """
    def eval(self, subj, obj, queue, bindings, proof, query):
        haystack = normalizeWhitespace(subj.string.lower())
        needle = normalizeWhitespace(obj.string.lower())
        return needle in haystack
class BI_DoesNotContain(LightBuiltIn):  # Converse of the above
    """Holds when the object string occurs nowhere in the subject string."""
    def eval(self, subj, obj, queue, bindings, proof, query):
        haystack = subj.string
        needle = obj.string
        return needle not in haystack
class BI_equalIgnoringCase(LightBuiltIn):
    """Holds when subject and object are equal once both are lower-cased."""
    def eval(self, subj, obj, queue, bindings, proof, query):
        left = subj.string.lower()
        right = obj.string.lower()
        return left == right
class BI_notEqualIgnoringCase(LightBuiltIn):
    """Holds when subject and object still differ after both are lower-cased."""
    def eval(self, subj, obj, queue, bindings, proof, query):
        # Use the string method, as BI_equalIgnoringCase does, instead of the
        # deprecated string.lower() module function.
        return (subj.string.lower() != obj.string.lower())
def normalizeWhitespace(s):
    """Collapse each run of space, tab, CR or LF in s into one space.

    Leading and trailing runs are collapsed as well, not stripped.
    """
    out = []
    in_gap = False      # True while we are inside a whitespace run
    for ch in s:
        if ch not in " \t\r\n":
            out.append(ch)
            in_gap = False
        elif not in_gap:
            out.append(" ")
            in_gap = True
    return "".join(out)
# String Constructors - more light built-ins
# Coercion applied when a long or Decimal has to be turned into text by the
# built-ins below; aliased so the target string type is named in one place.
make_string = unicode
class BI_concat(LightBuiltIn, ReverseFunction):
    """Reverse function: the subject is the object list's strings joined together.

    Gives no answer (None) as soon as a non-string member is found.
    """
    def evaluateSubject(self, obj_py):
        if verbosity() > 80: progress("Concat input:"+repr(obj_py))
        pieces = []
        for item in obj_py:
            if not isString(item):
                return None     # Can't
            pieces.append(item)
        return "".join(pieces)
class BI_concatenation(LightBuiltIn, Function):
    """Function: the object is the subject list's members joined into one string.

    Non-string members are coerced: long and Decimal through make_string,
    anything else through repr() (with a warning at verbosity above 34).
    """
    def evaluateObject(self, subj_py):
        if verbosity() > 80: progress("Concatenation input:"+repr(subj_py))
        pieces = []
        for item in subj_py:
            if not isString(item):
                if type(item) == type(long()) or isinstance(item, Decimal):
                    item = make_string(item)
                else:
                    item = repr(item)
                    if verbosity() > 34: progress("Warning: Coercing to string for concat:"+repr(item))
            pieces.append(item)
        return "".join(pieces)
class BI_scrape(LightBuiltIn, Function):
    """a built-in for scraping using regexps.
    takes a list of 2 strings; the first is the
    input data, and the second is a regex with one () group.
    Returns the data matched by the () group, or None when the
    regex does not match.
    see also: test/includes/scrape1.n3
    Hmm... negative tests don't seem to work.
    """
    def evaluateObject(self, subj_py):
        if verbosity() > 80: progress("scrape input:"+repr(subj_py))
        text, pattern = subj_py
        match = re.compile(pattern).search(text)
        if match is None:
            if verbosity() > 80: progress("scrape didn't match")
            return None
        captured = match.group(1)
        # repr() keeps the debug line from failing when group 1 took no
        # part in the match (captured is None).
        if verbosity() > 80: progress("scrape matched:"+repr(captured))
        return captured
class BI_search(LightBuiltIn, Function):
    """a more powerful built-in for scraping using regexps.
    takes a list of 2 strings; the first is the
    input data, and the second is a regex with one or more () group.
    Returns the tuple of data matched by the () groups, or None when the
    regex does not match.
    see also: test/includes/search.n3
    """
    def evaluateObject(self, subj_py):
        if verbosity() > 80: progress("search input:"+repr(subj_py))
        text, pattern = subj_py
        match = re.compile(pattern).search(text)
        if match is None:
            if verbosity() > 80: progress("search didn't match")
            return None
        found = match.groups()
        # Log the whole tuple: match.group(1) raises when the pattern has no
        # group, and concatenating it fails when the group is None.
        if verbosity() > 80: progress("search matched:"+repr(found))
        return found
class BI_split(LightBuiltIn, Function):
    """split a string into a list of strings
    takes a list of 2 strings and an integer; the first is the
    input data, the second is a regex, and the third is the maximum
    number of splits passed on to the compiled pattern's split()
    see re.split in http://docs.python.org/lib/node46.html
    """
    def evaluateObject(self, subj_py):
        # The unused 'store' local is gone and the builtin name 'str' is
        # no longer shadowed.
        text, pattern, maxsplit = subj_py
        return re.compile(pattern).split(text, maxsplit)
class BI_tokenize(LightBuiltIn, Function):
    """like split without the max arg
    takes a list of 2 strings: the input data and a regex
    """
    def evaluateObject(self, subj_py):
        # The unused 'store' local is gone and the builtin name 'str' is
        # no longer shadowed.
        text, pattern = subj_py
        return re.compile(pattern).split(text)
class BI_normalize_space(LightBuiltIn, Function):
    """Returns the value of $arg with whitespace normalized by
    stripping leading and trailing whitespace and replacing sequences
    of one or more than one whitespace character with a single space,
    #x20 -- http://www.w3.org/2006/xpath-functions#normalize-space
    """
    def evaluateObject(self, subj_py):
        # split() with no argument drops leading/trailing whitespace and
        # treats every whitespace run as one separator.
        words = subj_py.split()
        return ' '.join(words)
class BI_stringToList(LightBuiltIn, Function, ReverseFunction):
    """You need nothing else. Makes a string a list of characters, and vice versa.
    """
    def evaluateObject(self, subj_py):
        """Return the characters of subj_py as a list; None if it is not iterable."""
        # A leftover debugging print to stdout was removed here.
        try:
            return list(subj_py)
        except TypeError:
            return None
    def evaluateSubject(self, obj_py):
        """Return the members of obj_py joined into one string; None if that fails."""
        try:
            return "".join(obj_py)
        except TypeError:
            return None
class BI_format(LightBuiltIn, Function):
    """String formatting in the manner of python's % operator, C's sprintf
    or common-lisp's format.
    The subject is a list: the format string first, then its arguments.
    see also: test/@@
    """
    def evaluateObject(self, subj_py):
        template = subj_py[0]
        arguments = tuple(subj_py[1:])
        return template % arguments
class BI_matches(LightBuiltIn):
    """Holds when the object, taken as a regex, matches somewhere in the subject.

    The value returned is the match object itself (or None), not a bool.
    """
    def eval(self, subj, obj, queue, bindings, proof, query):
        pattern = re.compile(obj.string)
        return pattern.search(subj.string)
class BI_notMatches(LightBuiltIn):
    """Holds when the object, taken as a regex, matches nowhere in the subject."""
    def eval(self, subj, obj, queue, bindings, proof, query):
        pattern = re.compile(obj.string)
        return pattern.search(subj.string) is None
# Characters that xmlEscape() turns into numeric character references when
# escaping XML character data.
dataEsc = re.compile(r"[\r<>&]") # timbl removed \n as can be in data
# Same set plus both quote characters and newline, for attribute values.
attrEsc = re.compile(r"[\r<>&'\"\n]")
class BI_xmlEscapeData(LightBuiltIn, Function):
    """Take a unicode string and return it escaped for use as XML character data.
    Quotes are left alone; use BI_xmlEscapeAttribute for attribute values."""
    def evaluateObject(self, subj_py):
        escaped = xmlEscape(subj_py, dataEsc)
        return escaped
class BI_xmlEscapeAttribute(LightBuiltIn, Function):
    """Take a unicode string and return it escaped for use in an XML attribute
    value: the characters matched by attrEsc, quotes and newline included,
    are replaced."""
    def evaluateObject(self, subj_py):
        escaped = xmlEscape(subj_py, attrEsc)
        return escaped
def xmlEscape(subj_py, markupChars):
    """Replace markup characters in subj_py by numeric character references.

    markupChars is a compiled regex; the character at the start of each
    match becomes "&#N;" where N is its ordinal.
    from toXML.py """
    pieces = []
    pos = 0
    end = len(subj_py)
    while pos < end:
        hit = markupChars.search(subj_py, pos)
        if hit is None:
            # nothing left to escape: keep the tail verbatim
            pieces.append(subj_py[pos:])
            break
        at = hit.start()
        pieces.append(subj_py[pos:at])
        pieces.append("&#%d;" % (ord(subj_py[at]),))
        pos = at + 1
    return "".join(pieces)
class BI_encodeForURI(LightBuiltIn, Function):
    """Take a unicode string and return it encoded so as to pass in an
    URI path segment. See
    http://www.w3.org/TR/2005/CR-xpath-functions-20051103/#func-encode-for-uri"""
    def evaluateObject(self, subj_py):
        # urllib.quote() works octet by octet and raises KeyError on
        # non-ASCII unicode input, so percent-encode the UTF-8 octets.
        if isinstance(subj_py, unicode):
            subj_py = subj_py.encode("utf-8")
        return urllib.quote(subj_py, "#!~*'()")
class BI_encodeForFragID(LightBuiltIn, Function):
    """Take a unicode string and return it encoded so as to pass in
    a URI fragment identifier."""
    def evaluateObject(self, subj_py):
        # urllib.quote() works octet by octet and raises KeyError on
        # non-ASCII unicode input, so percent-encode the UTF-8 octets.
        if isinstance(subj_py, unicode):
            subj_py = subj_py.encode("utf-8")
        return urllib.quote(subj_py)
class BI_resolve_uri(LightBuiltIn, Function):
    """see http://www.w3.org/2006/xpath-functions#resolve-uri

    The subject is a two-member list: the reference to resolve, then the base.
    """
    def evaluateObject(self, subj_py):
        import uripath
        reference, base_uri = subj_py
        resolved = uripath.join(base_uri, reference)
        return resolved
class BI_codepoints_to_string(LightBuiltIn, Function, ReverseFunction):
    """see http://www.w3.org/2006/xpath-functions#codepoints-to-string

    Note the parameter names are the reverse of what they hold:
    evaluateSubject receives a string, evaluateObject a list of integers.
    """
    def evaluateSubject(self, subj_py):
        """Return the ordinals of the characters given; None on a TypeError."""
        try:
            # What about unicode?
            return list(map(ord, subj_py))
        except TypeError:
            return None
    def evaluateObject(self, obj_py):
        """Return the unicode string built from the ordinals given; None on a TypeError."""
        try:
            characters = [unichr(point) for point in obj_py]
            return u"".join(characters)
        except TypeError:
            return None
class BI_string_to_codepoints(LightBuiltIn, Function, ReverseFunction):
    """see http://www.w3.org/2006/xpath-functions#string-to-codepoints"""
    def evaluateObject(self, subj_py):
        """Return the ordinals of the subject's characters; None on a TypeError."""
        try:
            # What about unicode?
            return list(map(ord, subj_py))
        except TypeError:
            return None
    def evaluateSubject(self, obj_py):
        """Return the unicode string built from the object's ordinals; None on a TypeError."""
        try:
            characters = [unichr(point) for point in obj_py]
            return u"".join(characters)
        except TypeError:
            return None
class BI_compare(LightBuiltIn, Function):
    """see http://www.w3.org/2006/xpath-functions#compare

    The subject is a list whose first two members are the strings to
    compare.  Returns -1, 0 or 1 as the first string sorts before, equal
    to or after the second under Python's own string ordering; returns
    None when the subject does not have that shape.
    NOTE(review): a third (collation) member is accepted but ignored.
    """
    def evaluateObject(self, subj_py):
        # The previous body was a copy of string-to-codepoints and never
        # compared anything.
        try:
            first, second = subj_py[0], subj_py[1]
        except (TypeError, IndexError):
            return None
        if not (isString(first) and isString(second)):
            return None
        if first < second:
            return -1
        if first > second:
            return 1
        return 0
class BI_codepoint_equal(LightBuiltIn):
    """see http://www.w3.org/2006/xpath-functions#codepoint-equal

    True when every member of the subject list, after coercion to a
    string, equals the first one (an empty list gives True).
    NOTE(review): this class does not derive from Function although it
    only defines evaluateObject -- confirm that it is actually dispatched.
    """
    def evaluateObject(self, subj_py):
        reference = None
        for item in subj_py:
            if not isString(item):
                if type(item) == type(long()) or isinstance(item, Decimal):
                    item = make_string(item)
                else:
                    item = repr(item)
                    if verbosity() > 34: progress("Warning: Coercing to string for codepoint-equal:"+repr(item))
            if reference is None:
                reference = item
            elif reference != item:
                return False
        return True
class BI_string_join(LightBuiltIn, Function):
    """see http://www.w3.org/2006/xpath-functions#string-join

    The subject is a two-member list: the sequence of items to join and
    the separator string.  Non-string items are coerced the same way
    BI_concatenation does it.  Raises ValueError on any other list length.
    """
    def evaluateObject(self, subj_py):
        if len(subj_py) != 2:
            # 'Error' was never defined in this module; raise a real exception
            raise ValueError("fn:string-join expects exactly 2 arguments, got %d" % len(subj_py))
        items, separator = subj_py
        strs = []
        for x in items:
            if not isString(x):
                if type(x) == type(long()) or isinstance(x, Decimal):
                    x = make_string(x)
                else:
                    x = repr(x)
            # every item is collected, not only the coerced ones
            strs.append(x)
        # join is a method of the separator string, not of the list
        return separator.join(strs)
class BI_substring(LightBuiltIn, Function):
    """see http://www.w3.org/2006/xpath-functions#substring

    The subject is a list: source string, 1-based starting position and
    an optional length.  Returns the characters at positions p with
    round(start) <= p < round(start) + round(length), or through the end
    of the string when no length is given.
    Raises ValueError unless 2 or 3 members are given.
    NOTE(review): NaN and infinite arguments are not handled.
    """
    def evaluateObject(self, subj_py):
        import math
        if len(subj_py) < 2 or len(subj_py) > 3:
            # 'Error' was never defined in this module; raise a real exception
            raise ValueError("fn:substring expects 2 or 3 arguments, got %d" % len(subj_py))
        sourceString = subj_py[0]
        if not isString(sourceString):
            if type(sourceString) == type(long()) or isinstance(sourceString, Decimal):
                sourceString = make_string(sourceString)
            else:
                sourceString = repr(sourceString)
        # XPath fn:round rounds halves up; floor(x + 0.5) does that and,
        # unlike round(), the int() result is usable as a slice index.
        first = int(math.floor(float(subj_py[1]) + 0.5))
        if len(subj_py) == 3:
            stop = first + int(math.floor(float(subj_py[2]) + 0.5))
        else:
            stop = len(sourceString) + 1
        # Positions before the first character select nothing; a negative
        # length must not turn into a negative slice bound.
        first = max(first, 1)
        stop = max(stop, first)
        return sourceString[first - 1:stop - 1]
class BI_string_length(LightBuiltIn, Function):
    """see http://www.w3.org/2006/xpath-functions#string-length

    Accepts either the string itself or a list whose first member is the
    value to measure; non-string values are coerced to a string first.
    """
    def evaluateObject(self, subj_py):
        # A bare string used to be indexed, so only its first character
        # was measured; take it whole instead.
        if isString(subj_py):
            value = subj_py
        else:
            value = subj_py[0]
        if not isString(value):
            # the old code appended to an undefined list here (NameError)
            if type(value) == type(long()) or isinstance(value, Decimal):
                value = make_string(value)
            else:
                value = repr(value)
        return len(value)
class BI_normalize_unicode(LightBuiltIn, Function):
    """see http://www.w3.org/2006/xpath-functions#normalize-unicode

    The subject is a list: the string and an optional normalization form
    (default "NFC").  The form is stripped and upper-cased before use; an
    empty form returns the string unchanged.
    Raises ValueError when more than 2 members are given.
    """
    def evaluateObject(self, subj_py):
        # unicodedata was never imported at module level
        import unicodedata
        if len(subj_py) > 2:
            # 'Error' was never defined in this module; raise a real exception
            raise ValueError("fn:normalize-unicode expects 1 or 2 arguments, got %d" % len(subj_py))
        arg = subj_py[0]
        if len(subj_py) == 2:
            normalizationForm = subj_py[1].strip().upper()
        else:
            normalizationForm = "NFC"
        if normalizationForm == "":
            return arg
        if not isinstance(arg, unicode):
            # unicodedata.normalize() only accepts unicode objects
            arg = make_string(arg)
        # signature is normalize(form, unistr); the arguments were swapped
        return unicodedata.normalize(normalizationForm, arg)
class BI_upper_case(LightBuiltIn, Function):
    """see http://www.w3.org/2006/xpath-functions#upper-case

    Returns the subject string converted with its upper() method.
    """
    def evaluateObject(self, subj_py):
        shouted = subj_py.upper()
        return shouted
class BI_lower_case(LightBuiltIn, Function):
    """see http://www.w3.org/2006/xpath-functions#lower-case

    Returns the subject string converted with its lower() method.
    """
    def evaluateObject(self, subj_py):
        lowered = subj_py.lower()
        return lowered
class BI_translate(LightBuiltIn, Function):
    """see http://www.w3.org/2006/xpath-functions#translate

    The subject is a three-member list: the string, the map string and
    the translation string.  A character found at position N of the map
    string is replaced by the character at position N of the translation
    string, or dropped when the translation string is shorter.  When a
    character is repeated in the map string its first occurrence wins.
    Raises ValueError on any other list length.
    """
    def evaluateObject(self, subj_py):
        if len(subj_py) != 3:
            # 'Error' was never defined in this module; raise a real exception
            raise ValueError("fn:translate expects exactly 3 arguments, got %d" % len(subj_py))
        arg, mapString, transString = subj_py
        # string.maketrans() and the two-argument translate() only work on
        # byte strings; a plain dict works for str and unicode alike.
        mapping = {}
        for position, ch in enumerate(mapString):
            if ch in mapping:
                continue
            if position < len(transString):
                mapping[ch] = transString[position]
            else:
                mapping[ch] = ""
        return "".join([mapping.get(ch, ch) for ch in arg])
class BI_encode_for_uri(LightBuiltIn, Function, ReverseFunction):
    """see http://www.w3.org/2006/xpath-functions#encode-for-uri

    NOTE(review): register() binds fn:encode-for-uri to BI_encodeForURI,
    not to this class.
    """
    def evaluateObject(self, subj_py):
        """Return uripath.canonical() of the subject."""
        # uripath is not imported at module level; import it here as
        # BI_resolve_uri does.
        import uripath
        return uripath.canonical(subj_py)
    def evaluateSubject(self, obj_py):
        """Return the object with its %XX escapes decoded."""
        return urllib.unquote(obj_py)
class BI_iri_to_uri(LightBuiltIn, Function):
    """see http://www.w3.org/2006/xpath-functions#iri-to-uri

    Returns uripath.canonical() of the subject.
    """
    def evaluateObject(self, subj_py):
        # uripath is not imported at module level; import it here as
        # BI_resolve_uri does.
        import uripath
        return uripath.canonical(subj_py)
class BI_escape_html_uri(LightBuiltIn, Function):
    """see http://www.w3.org/2006/xpath-functions#escape-html-uri

    Returns uripath.canonical() of the subject with every "%20" turned
    back into a space.
    """
    # TODO: Fix me
    unescape_re = re.compile('%20')
    def evaluateObject(self, subj_py):
        # uripath is not imported at module level; import it here as
        # BI_resolve_uri does.
        import uripath
        # unescape_re is a class attribute, so it has to be reached through
        # self; the bare name raised NameError.
        return self.unescape_re.sub(' ', uripath.canonical(subj_py))
class BI_substring_before(LightBuiltIn, Function):
    """see http://www.w3.org/2006/xpath-functions#substring-before

    The subject is a two-member list.  Returns the part of the first
    string that precedes the first occurrence of the second, or "" when
    the second string does not occur.
    Raises ValueError on any other list length.
    """
    def evaluateObject(self, subj_py):
        if len(subj_py) != 2:
            # 'Error' was never defined in this module; raise a real exception
            raise ValueError("fn:substring-before expects exactly 2 arguments, got %d" % len(subj_py))
        arg1, arg2 = subj_py
        cut = arg1.find(arg2)       # search once, reuse the index
        if cut < 0:
            return ""
        return arg1[:cut]
class BI_substring_after(LightBuiltIn, Function):
    """see http://www.w3.org/2006/xpath-functions#substring-after

    The subject is a two-member list.  Returns the part of the first
    string that follows the first occurrence of the second, or "" when
    the second string does not occur.
    Raises ValueError on any other list length.
    """
    def evaluateObject(self, subj_py):
        if len(subj_py) != 2:
            # 'Error' was never defined in this module; raise a real exception
            raise ValueError("fn:substring-after expects exactly 2 arguments, got %d" % len(subj_py))
        arg1, arg2 = subj_py
        cut = arg1.find(arg2)       # search once, reuse the index
        if cut < 0:
            return ""
        return arg1[cut + len(arg2):]
class BI_replace(LightBuiltIn, Function):
    """see http://www.w3.org/2006/xpath-functions#replace

    The subject is a three-member list: input string, regex pattern and
    replacement.  Returns the input with every match of the pattern
    replaced.  Raises ValueError on any other list length.
    NOTE(review): the replacement uses Python's re syntax (\\1), not the
    $1 form of the XPath specification -- confirm what callers expect.
    """
    def evaluateObject(self, subj_py):
        if len(subj_py) != 3:
            # 'Error' was never defined in this module; raise a real exception
            raise ValueError("fn:replace expects exactly 3 arguments, got %d" % len(subj_py))
        source, pattern, replacement = subj_py
        # compiled patterns have sub(repl, string); there is no replace()
        return re.compile(pattern).sub(replacement, source)
# Register the string built-ins with the store
def isString(x):
    """True when x is exactly of the type of '' or of u'' (subclasses excluded)."""
    # in 2.2, evidently we can test for isinstance(types.StringTypes)
    string_types = (type(''), type(u''))
    return type(x) in string_types
def register(store):
    """Intern every built-in of this module with the given store.

    The swap string namespace is filled first, then the XPath functions
    namespace; several classes are bound under both.
    """
    string_ns = store.symbol(STRING_NS_URI[:-1])
    for fragment, builtin in (
            ("greaterThan", BI_GreaterThan),
            ("notGreaterThan", BI_NotGreaterThan),
            ("lessThan", BI_LessThan),
            ("notLessThan", BI_NotLessThan),
            ("startsWith", BI_StartsWith),
            ("endsWith", BI_EndsWith),
            ("concat", BI_concat),
            ("concatenation", BI_concatenation),
            ("scrape", BI_scrape),
            ("search", BI_search),
            ("split", BI_split),
            ("stringToList", BI_stringToList),
            ("format", BI_format),
            ("matches", BI_matches),
            ("notMatches", BI_notMatches),
            ("contains", BI_Contains),
            ("containsIgnoringCase", BI_ContainsIgnoringCase),
            ("containsRoughly", BI_ContainsRoughly),
            ("doesNotContain", BI_DoesNotContain),
            ("equalIgnoringCase", BI_equalIgnoringCase),
            ("notEqualIgnoringCase", BI_notEqualIgnoringCase),
            ("xmlEscapeAttribute", BI_xmlEscapeAttribute),
            ("xmlEscapeData", BI_xmlEscapeData),
            ("encodeForURI", BI_encodeForURI),
            ("encodeForFragID", BI_encodeForFragID),
            ):
        string_ns.internFrag(fragment, builtin)

    xpath_ns = store.symbol("http://www.w3.org/2006/xpath-functions")
    for fragment, builtin in (
            ("resolve-uri", BI_resolve_uri),
            ("tokenize", BI_tokenize),
            ("normalize-space", BI_normalize_space),
            ("codepoints-to-string", BI_codepoints_to_string),
            ("string-to-codepoints", BI_string_to_codepoints),
            ("compare", BI_compare),
            ("codepoint-equal", BI_codepoint_equal),
            ("concat", BI_concatenation),
            ("string-join", BI_string_join),
            ("substring", BI_substring),
            ("string-length", BI_string_length),
            ("normalize-unicode", BI_normalize_unicode),
            ("upper-case", BI_upper_case),
            ("lower-case", BI_lower_case),
            ("translate", BI_translate),
            ("encode-for-uri", BI_encodeForURI),
            ("iri-to-uri", BI_iri_to_uri),
            ("escape-html-uri", BI_escape_html_uri),
            ("contains", BI_Contains),
            ("starts-with", BI_StartsWith),
            ("ends-with", BI_EndsWith),
            ("substring-before", BI_substring_before),
            ("substring-after", BI_substring_after),
            ("matches", BI_matches),
            ("replace", BI_replace),
            ):
        xpath_ns.internFrag(fragment, builtin)