/
derived.py
674 lines (535 loc) · 21.6 KB
/
derived.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
# The contents of this file are subject to the Mozilla Public License
# (MPL) Version 1.1 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License
# at http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
# the License for the specific language governing rights and
# limitations under the License.
#
# The Original Code is LEPL (http://www.acooke.org/lepl)
# The Initial Developer of the Original Code is Andrew Cooke.
# Portions created by the Initial Developer are Copyright (C) 2009-2010
# Andrew Cooke (andrew@acooke.org). All Rights Reserved.
#
# Alternatively, the contents of this file may be used under the terms
# of the LGPL license (the GNU Lesser General Public License,
# http://www.gnu.org/licenses/lgpl.html), in which case the provisions
# of the LGPL License are applicable instead of those above.
#
# If you wish to allow use of your version of this file only under the
# terms of the LGPL License and not to allow others to use your version
# of this file under the MPL, indicate your decision by deleting the
# provisions above and replace them with the notice and other provisions
# required by the LGPL License. If you do not delete the provisions
# above, a recipient may use your version of this file under either the
# MPL or the LGPL License.
'''
Matchers that are defined in terms of other matchers (ie the majority).
'''
from string import whitespace, digits, ascii_letters, \
ascii_uppercase, ascii_lowercase, printable, punctuation
from operator import __add__
from lepl.matchers.combine import And, DepthFirst, BreadthFirst, \
OrderByResultCount, Or, Limit
from lepl.matchers.core import Lookahead, Any, Eof, Literal, Regexp
from lepl.matchers.operators import BREADTH_FIRST, DEPTH_FIRST, GREEDY, \
NON_GREEDY
from lepl.matchers.support import coerce_
from lepl.matchers.transform import TransformationWrapper, Transform, \
ApplyArgs, ApplyRaw
from lepl.regexp.matchers import NfaRegexp, DfaRegexp
from lepl.support.lib import assert_type, lmap, fmt, basestring, reduce
from lepl.support.warn import warn_on_use
# pylint: disable-msg=C0103
# (consistent interface)
# pylint: disable-msg=W0142
# (*args)
# pylint: disable-msg=W0105
# (string comments)
# pylint: disable-msg=W0141
# (map)
def Repeat(matcher, start=0, stop=None, limit=None, algorithm=DEPTH_FIRST,
separator=None, add_=False, reduce=None):
'''
This is called by the [] operator. It repeats the given matcher between
`start` and `stop` number of times (inclusive).
If `limit` is given it is an upper limit on the number of different
results returned on backtracking.
`algorithm` selects the repeat algorithm to use.
If `separator` is given then each repetition is separated by that matcher.
If `add_` is true then the results are joined with `Add` (once all
results are obtained).
If `reduce` is given it should be a pair (zero, join) where
`join(results, next)` is used to accumulate results and `zero` is the
initial value of `results`. This is implemented via `Reduce`.
`reduce` and `add_` cannot be given together.
'''
first = coerce_(matcher)
if separator is None:
rest = first
else:
rest = And(coerce_(separator, Regexp), first)
if start is None:
start = 0
# allow duck typing (mutable values - IntVar etc)
# assert_type('The start index for Repeat or [...]', start, int)
# assert_type('The stop index for Repeat or [...]', stop, int, none_ok=True)
# assert_type('The limit value (step index) for Repeat or [...]', limit, int, none_ok=True)
# assert_type('The algorithm (step index) for Repeat or [...]', algorithm, str)
# if start < 0:
# raise ValueError('Repeat or [...] cannot have a negative start.')
# if stop is not None and stop < start:
# raise ValueError('Repeat or [...] must have a stop '
# 'value greater than or equal to the start.')
# if 'dbgn'.find(algorithm) == -1:
# raise ValueError('Repeat or [...] must have a step (algorithm) '
# 'of d, b, g or n.')
if add_ and reduce:
raise ValueError('Repeat cannot apply both add_ and reduce')
elif add_:
process = Add
elif reduce:
process = lambda r: Reduce(r, reduce[0], reduce[1])
else:
process = Identity
matcher = {DEPTH_FIRST:
process(DepthFirst(first=first, start=start, stop=stop, rest=rest)),
BREADTH_FIRST:
process(BreadthFirst(first=first, start=start, stop=stop, rest=rest)),
GREEDY:
process(OrderByResultCount(
BreadthFirst(first=first, start=start, stop=stop, rest=rest))),
NON_GREEDY:
process(OrderByResultCount(
BreadthFirst(first=first, start=start, stop=stop, rest=rest),
False))
}[algorithm]
if limit is not None:
matcher = Limit(matcher, count=limit)
return matcher
def Apply(matcher, function, raw=False, args=False):
'''
Apply an arbitrary function to the results of the matcher
(**>**, **>=**).
Apply can be used via the standard operators by placing ``>``
(or ``>=`` to set ``raw=True``, or ``*`` to set ``args=True``)
to the right of a matcher.
If the function is a `TransformationWrapper` it is used directly.
Otherwise a `TransformationWrapper` is constructed via the `raw` and
`args` parameters, as described below.
**Note:** The creation of named pairs (when a string argument is
used) behaves more like a mapping than a single function invocation.
If several values are present, several pairs will be created.
**Note:** There is an asymmetry in the default values of *raw*
and *args*. If the identity function is used with the default settings
then a list of results is passed as a single argument (``args=False``).
That is then returned (by the identity) as a list, which is wrapped
in an additional list (``raw=False``), giving an extra level of
grouping. This is necessary because Python's ``list()`` is an
identity for lists, but we want it to add an extra level of grouping
so that nested S-expressions are easy to generate.
See also `Map`.
:Parameters:
matcher
The matcher whose results will be modified.
function
The modification to apply.
If a `Transformation`, this is used directly.
If a string is given, named pairs will be created (and raw and args
ignored).
Otherwise the function should expect a list of results (unless
``args=True`` in which case the list is supplied as ``*args``)
and can return any value (unless ``raw=True`` in which case it should
return a list).
raw
If True the results are used directly. Otherwise they are wrapped in
a list. The default is False --- a list is added. This is set to
true if the target function is an `ApplyRaw` instance.
args
If True, the results are passed to the function as separate
arguments (Python's '*args' behaviour). The default is False ---
the results are passed inside a list). This is set to true if the
target function is an `ApplyArgs` instance.
'''
raw = raw or (type(function) is type and issubclass(function, ApplyRaw))
args = args or (type(function) is type
and issubclass(function, ApplyArgs))
if isinstance(function, TransformationWrapper):
apply = function
else:
if isinstance(function, basestring):
function = lambda results, f=function: \
lmap(lambda x: (f, x), results)
raw = True
args = False
if args:
if raw:
function = lambda results, f=function: f(*results)
else:
function = lambda results, f=function: [f(*results)]
else:
if not raw:
function = lambda results, f=function: [f(results)]
def apply(stream_in, matcher):
(results, stream_out) = matcher()
return (function(results), stream_out)
return Transform(matcher, apply)
def args(function):
'''
A decorator that has the same effect as ApplyArgs for functions/methods.
'''
def wrapper(args_):
'''
Apply args as *args.
'''
return function(*args_)
return wrapper
def KApply(matcher, function, raw=False):
'''
Apply an arbitrary function to named arguments (**\****).
The function should typically expect and return a list.
It can be used indirectly by placing ``**=`` to the right of the matcher.
The function will be applied with the following keyword arguments:
stream_out
The stream returned from the matcher.
results
A list of the results returned.
:Parameters:
matcher
The matcher whose results will be modified.
function
The modification to apply.
raw
If false (the default), the final return value from the function
will be placed in a list and returned in a pair together with the
new stream returned from the matcher (ie the function returns a
single new result).
If true, the final return value from the function is used directly
and so should match the ``([results], stream)`` type expected by
other matchers.
'''
def fun(stream_in, matcher):
'''
Apply args as **kargs.
'''
(results, stream_out) = matcher()
kargs = {'results': results,
'stream_in': stream_in,
'stream_out': stream_out}
if raw:
return function(**kargs)
else:
return ([function(**kargs)], stream_out)
return Transform(matcher, fun)
def AnyBut(exclude=None):
'''
Match any character except those specified (or, if a matcher is used as
the exclude, if the matcher fails).
The argument should be a list of tokens (or a string of suitable
characters) to exclude, or a matcher. If omitted all tokens are accepted.
'''
return And(~Lookahead(coerce_(exclude, Any)), Any())
def Optional(matcher):
'''
Match zero or one instances of a matcher (**[0:1]**).
'''
return Repeat(coerce_(matcher), stop=1)
#return Or(coerce_(matcher), Empty())
def Star(matcher):
'''
Match zero or more instances of a matcher (**[0:]**)
'''
return Repeat(coerce_(matcher))
ZeroOrMore = Star
'''
Match zero or more instances of a matcher (**[0:]**)
'''
def Plus(matcher):
'''
Match one or more instances of a matcher (**[1:]**)
'''
return Repeat(coerce_(matcher), start=1)
OneOrMore = Plus
'''
Match one or more instances of a matcher (**[1:]**)
'''
def Map(matcher, function):
'''
Apply an arbitrary function to each of the tokens in the result of the
matcher (**>>**). If the function is a name, named pairs are created
instead. It can be used indirectly by placing ``>>`` to the right of the
matcher.
See also `Apply`.
'''
# list() necessary so we can use '+' on result
if isinstance(function, basestring):
return Apply(matcher, lambda l: list(map(lambda x: (function, x), l)),
raw=True)
else:
return Apply(matcher, lambda l: list(map(function, l)), raw=True)
def Reduce(matcher, zero, join=__add__):
'''
Combine the results from the matcher using `reduce(join, results, zero)`.
Unlike `Add` this will return a value (`zero`) when there are no matches.
'''
def reduce_(_stream, matcher):
(results, stream_out) = matcher()
return ([reduce(join, results, zero)], stream_out)
return Apply(matcher, TransformationWrapper(reduce_))
def add(_stream, matcher):
'''
The transformation used in `Add` - we carefully use "+" in as generic
a manner as possible.
'''
(results, stream_out) = matcher()
if results:
result = results[0]
for extra in results[1:]:
try:
result = result + extra
except TypeError:
raise TypeError(
fmt('An attempt was made to add two results '
'that do not have consistent types: {0!r} + {1!r}',
result, extra))
result = [result]
else:
result = []
return (result, stream_out)
def Add(matcher):
'''
Join tokens in the result using the "+" operator (**+**).
This joins strings and merges lists.
Unlike `Reduce` this will have no effect of there are no matches.
'''
return Apply(matcher, TransformationWrapper(add))
def Join(*matchers):
'''
Combine many matchers together with Add(And(...)).
It can be used indirectly by placing ``+`` between matchers.
'''
return Add(And(*matchers))
def Drop(matcher):
'''
Do the match, but return nothing (**~**). The ~ prefix is equivalent.
'''
return Apply(matcher, lambda l: [], raw=True)
def Substitute(matcher, value):
'''
Replace each return value with that given.
'''
return Map(matcher, lambda x: value)
def Name(matcher, name):
'''
Name the result of matching (**> name**)
This replaces each value in the match with a tuple whose first value is
the given name and whose second value is the matched token. The Node
class recognises this form and associates such values with named attributes.
'''
return Map(matcher, name)
Eos = Eof
'''Match the end of a stream. Returns nothing.'''
def Identity(matcher):
'''Functions identically to the matcher given as an argument.'''
return coerce_(matcher)
def Newline():
'''Match newline (Unix) or carriage return newline (Windows)'''
return Or(Literal('\n'), Literal('\r\n'))
def Space(space=' \t'):
'''Match a single space (by default space or tab).'''
return Any(space)
def Whitespace(space=whitespace):
'''
Match a single space (by default from string.whitespace,
which includes newlines).
'''
return Any(space)
def Digit():
'''Match any single digit.'''
return Any(digits)
def Letter():
'''Match any ASCII letter (A-Z, a-z).'''
return Any(ascii_letters)
def Upper():
'''Match any ASCII uppercase letter (A-Z).'''
return Any(ascii_uppercase)
def Lower():
'''Match any ASCII lowercase letter (A-Z).'''
return Any(ascii_lowercase)
def Printable():
'''Match any printable character (string.printable).'''
return Any(printable)
def Punctuation():
'''Match any punctuation character (string.punctuation).'''
return Any(punctuation)
def UnsignedInteger():
'''Match a simple sequence of digits.'''
return Repeat(Digit(), start=1, add_=True)
def SignedInteger():
'''Match a sequence of digits with an optional initial sign.'''
return Add(And(Optional(Any('+-')), UnsignedInteger()))
Integer = SignedInteger
'''
The default integer is a signed one.
'''
def UnsignedReal(decimal='.'):
'''
Match a sequence of digits that may include a decimal point. This
will match both integer and float values.
'''
return Or(Join(Optional(UnsignedInteger()),
Any(decimal), UnsignedInteger()),
Join(UnsignedInteger(), Optional(Any(decimal))))
def SignedReal(decimal='.'):
'''
Match a signed sequence of digits that may include a decimal point.
This will match both integer and float values.
'''
return Join(Optional(Any('+-')), UnsignedReal(decimal))
def UnsignedEReal(decimal='.', exponent='eE'):
'''
Match an `UnsignedReal` followed by an optional exponent
(e+02 etc). This will match both integer and float values.
'''
return Join(UnsignedReal(decimal),
Optional(And(Any(exponent), SignedInteger())))
def SignedEReal(decimal='.', exponent='eE'):
'''
Match a `SignedReal` followed by an optional exponent
(e+02 etc). This will match both integer and float values.
'''
if decimal == '.' and exponent == 'eE':
# hack to faster direct implementation for now
return NfaRegexp(r'[\+\-]?(?:[0-9]*\.[0-9]+|[0-9]+\.|[0-9]+)(?:[eE][\+\-]?[0-9]+)?')
else:
return Join(SignedReal(decimal),
Optional(Join(Any(exponent), SignedInteger())))
Real = SignedEReal
'''
The default float is signed with exponents.
'''
_FLOAT_WARN = '''WARNING: The definition of the Float matchers changed in Lepl 4.4.0
(you may want to use Real instead).'''
@warn_on_use(_FLOAT_WARN)
def UnsignedFloat(decimal='.'):
'''
Match a sequence of digits that must include a decimal point. This
will match real values that are not integers.
'''
return Or(Join(Optional(UnsignedInteger()),
Any(decimal), UnsignedInteger()),
Join(UnsignedInteger(), Any(decimal)))
@warn_on_use(_FLOAT_WARN)
def SignedFloat(decimal='.'):
'''
Match a signed sequence of digits that must include a decimal point. This
will match real values that are not integers.
'''
return Join(Optional(Any('+-')), UnsignedFloat(decimal))
@warn_on_use(_FLOAT_WARN)
def UnsignedEFloat(decimal='.', exponent='eE'):
'''
As `UnsignedEReal`, but must contain a decimal or exponent. This
will match real values that are not integers.
'''
return Or(Join(UnsignedReal(decimal), Any(exponent), SignedInteger()),
UnsignedFloat(decimal))
@warn_on_use(_FLOAT_WARN)
def SignedEFloat(decimal='.', exponent='eE'):
'''
As `SignedEReal`, but must contain a decimal or exponent. This
will match real values that are not integers.
'''
if decimal == '.' and exponent == 'eE':
# hack to faster direct implementation for now
return NfaRegexp(r'[\+\-]?(?:[0-9]*\.[0-9]+(?:[eE][\+\-]?[0-9]+)?|[0-9]+\.(?:[eE][\+\-]?[0-9]+)?|[0-9]+[eE][\+\-]?[0-9]+)')
else:
return Or(Join(SignedReal(decimal), Any(exponent), SignedInteger()),
SignedFloat(decimal))
Float = SignedEFloat
'''
The default float matcher accepts signed values with optional exponents.
This will not match integers - see `Real` for that.
'''
def Word(chars=NfaRegexp('[^%s]' % whitespace), body=None):
'''
Match a sequence of non-space characters, joining them together.
chars and body, if given as strings, define possible characters to use
for the first and rest of the characters in the word, respectively.
If body is not given, then chars is used for the entire word.
They can also specify matchers, which typically should match only a
single character.
So ``Word(Upper(), Lower())`` would match names that being with an upper
case letter, for example, while ``Word(AnyBut(Space()))`` (the default)
matches any sequence of non-space characters.
'''
chars = coerce_(chars, Any)
body = chars if body is None else coerce_(body, Any)
return Add(And(chars, Star(body)))
def DropEmpty(matcher):
'''
Drop results if they are empty (ie if they are ``False`` in Python).
This will drop empty strings and lists. It will also drop
`Node` instances if they are empty (since the length is then
zero).
'''
def drop(results):
'''
Only drop "False" values.
'''
return [result for result in results if result]
return Apply(matcher, drop, raw=True)
def Literals(*matchers):
'''
A series of literals, joined with `Or`.
'''
# I considered implementing this by extending Literal() itself, but
# that would have meant putting "Or-like" functionality in Literal,
# and I felt it better to keep the base matchers reasonably orthogonal.
return Or(*lmap(Literal, matchers))
def String(quote='"', escape='\\', empty='', join=__add__):
'''
Match a string with quotes that can be escaped. This will match across
newlines (see `SingleLineString` for an alternative).
More generally, a string is a grouping of results. Setting `empty` and
`join` correctly will allow this matcher to work with a variety of types.
'''
q = Literal(quote)
content = AnyBut(q)
if escape:
content = Or(And(Drop(escape), q), content)
content = Repeat(content, reduce=(empty, join))
return And(Drop(q), content, Drop(q))
def SingleLineString(quote='"', escape='\\', exclude='\n', empty='', join=__add__):
'''
Like `String`, but will not match across multiple lines.
'''
q = Literal(quote)
content = AnyBut(Or(q, Any(exclude)))
if escape:
content = Or(content, And(Drop(escape), q))
content = Repeat(content, reduce=(empty, join))
return And(Drop(q), content, Drop(q))
def SkipString(quote='"', escape='\\', ignore='\n', empty='', join=__add__):
'''
Like `String`, matching across multiple lines, but will silently
drop newlines.
'''
q = Literal(quote)
content = AnyBut(Or(q, Any(ignore)))
if escape:
content = Or(content, And(Drop(escape), q))
content = Or(content, Drop(Any(ignore)))
content = Repeat(content, reduce=(empty, join))
return And(Drop(q), content, Drop(q))
def SkipTo(matcher, include=True):
'''
Consume everything up to (and including, if include is True, as it is by
default) the matcher. Returns all the skipped data, joined.
'''
if include:
return Add(And(Star(AnyBut(matcher)), matcher))
else:
return And(Add(Star(AnyBut(matcher))), Lookahead(matcher))