def __init__(self, input, decoders = None):
    """Prepare the table of type decoders and the line iterator, then call move().

    'input': a string (split into lines, with newline characters kept), an Iterator
    (used as is), or any other iterable (wrapped with iter()).

    'decoders': optional dict {typename: decoder} whose entries override the defaults
    taken from Decoder.decoders. A decoder can be a function that's fed with all
    arguments read from the file: decoder(*args, **kwargs). OR, a decoder can be
    a class that's instantiated with __new__(*args) - only unnamed arguments passed! -
    and then the object's __dict__ is updated with kwargs.
    A decoder given as a string is resolved to an object with _import().
    """
    # private shallow copy: the table may get modified during operation
    registry = Decoder.decoders.copy()
    self.decoders = registry
    if decoders:
        registry.update(decoders)

    # Normalize every entry to a pair (decoder, isclass); names of classes/functions
    # are first replaced with the actual objects. Keys are snapshotted up front,
    # because values are rewritten inside the loop.
    for typename in list(registry):
        decoder = registry[typename]
        if isstring(decoder):
            decoder = _import(decoder)
        registry[typename] = (decoder, isinstance(decoder, type))

    # turn 'input' into an iterator over lines
    if isstring(input):
        lines = iter(input.splitlines(True))        # True: keep newline characters
    elif isinstance(input, Iterator):
        lines = input
    else:
        lines = iter(input)
    self.input = lines

    self.parser = Analyzer(self.decodeType)

    # 'line' is the next line to be decoded, in a parsed form; a client can read it
    # directly for a preview, but must explicitly call move() afterwards
    self.line = None
    # no. of the current line ('line'), counting from 1
    self.linenum = 0
    self.move()
def stopwords(words, *stopLists, **params):
    """From a given list of words or a string, filter out stopwords, numbers and single characters.

    words: either a string, which is split on runs of non-word characters (re.UNICODE),
        or an iterable of words.
    *stopLists: collections of stopwords, all combined into one set; each can be
        a whitespace-separated string or any iterable of words (list, set, ...).
        If none is given, the module-level STOP is used.
    **params:
        numbers  (default True) - drop numeric tokens: digit-only tokens (str.isdigit)
                                  and decimal literals like "3.14", "-.5", "1.5e3";
        singles  (default True) - drop tokens shorter than 2 characters;
        asstring (default None) - if true, return the result as a space-joined string,
                                  otherwise as a list; None means "same kind as the input":
                                  a string for string input, a list otherwise.

    Comparison with stopwords is done in lowercase, but original case is left in the result.

    >>> stopwords("This is an example string.")
    'example string'
    >>> stopwords(u"Echte of neppatiënten")      # unicode characters recognized correctly inside words
    u'Echte neppati\\xc3 nten'
    """
    numbers = params.get('numbers', True)
    singles = params.get('singles', True)
    asstring = params.get('asstring', None)

    if stopLists:
        stop = set()
        for s in stopLists:
            # update() accepts any iterable, so lists, tuples and sets are handled uniformly
            stop.update(s.split() if isstring(s) else s)
    else:
        stop = STOP

    if isstring(words):
        if asstring is None: asstring = True
        # re.split() returns empty strings when the text starts or ends with a separator;
        # they are artifacts of splitting, not words, so drop them regardless of 'singles'
        words = [w for w in re.split(r'\W+', words, flags = re.UNICODE) if w]

    # decimal literal: optional sign, digits with a mandatory decimal point, optional exponent;
    # plain integers are left to isdigit()
    isfloat = re.compile(r'[+-]?(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)?$').match

    res = []
    for w in words:
        if singles and len(w) < 2: continue
        if numbers and (w.isdigit() or isfloat(w)): continue
        if w.lower() in stop: continue
        res.append(w)

    if asstring: return ' '.join(res)           # get back to a concatenated text
    return res
def __init__(self, input, decoders=None):
    """Initialize the decoder: build the decoders table, wrap 'input' in an iterator, call move().

    'input' can be a string (it is split into lines, keeping newline characters),
    an Iterator (taken as is) or any other iterable (passed through iter()).

    'decoders' is a dict of type decoders, {typename: decoder}, that overrides
    entries of the default Decoder.decoders. A decoder can be a function that's
    fed with all arguments read from the file: decoder(*args, **kwargs).
    OR, a decoder can be a class that's instantiated with __new__(*args) - only
    unnamed arguments passed! - and then the object's __dict__ is updated with
    kwargs. String values are names, replaced with actual objects via _import().
    """
    # the dict of decoders may get modified during operation, thus shallow-copying
    self.decoders = table = Decoder.decoders.copy()
    if decoders:
        table.update(decoders)

    # After this loop every value in the table is a pair: (decoder, isclass).
    # Iterating over a snapshot, since the table is rewritten on the way.
    for typename, dec in list(table.items()):
        if isstring(dec):
            dec = _import(dec)
        table[typename] = (dec, isinstance(dec, type))

    # make an iterator from 'input'
    if isstring(input):
        source = iter(input.splitlines(True))  # 'True': newline characters must be kept
    elif isinstance(input, Iterator):
        source = input
    else:
        source = iter(input)
    self.input = source

    self.parser = Analyzer(self.decodeType)

    # the next line to be decoded, in a parsed form; client can read it directly
    # for a preview of the next line; must explicitly call move() afterwards
    self.line = None
    # no. of the current line ('line'), counting from 1
    self.linenum = 0
    self.move()
def __init__(self, text = None, ast = None, upto = None):
    """Create a Tree from source 'text' or from an already parsed raw AST.

    If 'text' is non-empty, it is parsed with self.parse() and the resulting raw AST
    is kept in self.ast; otherwise the given 'ast' is stored as is.
    Unless upto == "parsing", the AST is then rewritten with self.rewrite()
    and the result is stored in self.root.
    The _ignore_ and _reduce_ settings, when given as strings, are split
    on whitespace into lists.
    """
    # normalize string-valued settings into lists of names
    for setting in ('_ignore_', '_reduce_'):
        value = getattr(self, setting)
        if isstring(value):
            setattr(self, setting, value.split())

    self.text = text

    # keep the raw AST for reference by the client
    if text:
        self.ast = self.parse(text)
    else:
        self.ast = ast

    if upto != "parsing":
        self.root = self.rewrite(self.ast)      # rewrite AST to a tree of Tree.node's
def rewrite(self, waxnode):
    """Convert a raw AST node 'waxnode' into a node of this tree.

    - a string is wrapped in self.static;
    - a waxeye.AST is passed to the inner class named 'x' + <node type>, looked up on self;
    - anything else yields self.empty.
    """
    if isstring(waxnode):
        node = self.static(self, None, waxnode)
    elif isinstance(waxnode, waxeye.AST):
        # regular non-terminal: find the corresponding inner class of the tree and
        # instantiate it (this will recursively rewrite nodes down the tree)
        nodeclass = getattr(self, 'x' + waxnode.type)
        node = nodeclass(self, waxnode)
    else:
        node = self.empty(self, None)
    return node
def resnext_unit(y, bottleneck, channels_out, paths, strides = 1, dilation_rate = 1, activation = 'relu'):
    """
    Append a ResNeXt residual unit to tensor `y` and return the resulting tensor.
    The unit can optionally be extended with spatial (vertical+horizontal) paths:
    if `paths` is a triple (A,B,C), A is the no. of 1x9 (horizontal) paths, B: 9x1 (vertical), C: 3x3.

    `bottleneck` is the number of filters of the first 1x1 convolution, `channels_out`
    of the last one. `strides` and `dilation_rate` are passed to grouped_convolution();
    `strides` is also used by the 1x1 convolution on the shortcut, when one is needed.
    `activation` is a name (wrapped in an Activation layer) or a callable applied to tensors.
    """
    if isstring(activation):
        activation = Activation(activation)

    channels_in = y.shape[-1]
    residual = y

    # The residual branch is laid out as: bottleneck + grouped convolution + rev-bottleneck.
    # This is equivalent to the original formulation as a collection of paths,
    # but makes the network more economical.

    # (1) bottleneck
    y = Conv2D(bottleneck, (1, 1), padding = 'same')(y)
    y = BatchNormalization()(y)
    y = activation(y)

    # (2) grouped convolution (the middle element of paths)
    y = grouped_convolution(y, bottleneck, paths, strides = strides, dilation_rate = dilation_rate)
    y = BatchNormalization()(y)
    y = activation(y)

    # (3) rev-bottleneck; batch normalization goes after aggregating
    # the transformations and before adding to the shortcut
    y = Conv2D(channels_out, (1, 1), padding = 'same')(y)
    y = BatchNormalization()(y)

    # When input and output differ in dimensions - spatially, because of a stride,
    # or in depth - the shortcut gets an extra 1x1 convolution to perform the adjustment.
    strided = strides not in [None, 1, (1, 1)]
    if strided or channels_in != channels_out:
        residual = Conv2D(channels_out, (1, 1), strides = strides, padding = 'same')(residual)
        residual = BatchNormalization()(residual)

    merged = layers_add([residual, y])
    return activation(merged)
def alternative(strings, escape = True, compile = False, flags = 0):           #@ReservedAssignment
    """Regex that matches the logical alternative of given strings.

    strings: a list of strings, or a single string that is split on whitespace.
    escape:  if true (default), every string is passed through re.escape(),
             so that it is matched literally.
    compile: if true, a compiled regex object is returned; otherwise (default)
             the pattern is returned as a string.
    flags:   regex flags, used only when compile is true.
    """
    if isstring(strings): strings = strings.split()
    # each string must be escaped individually, before joining with '|';
    # NOTE: map, not filter - filter() would only drop empty strings and escape nothing
    if escape: strings = [re.escape(s) for s in strings]
    pat = "|".join(strings)
    if not compile: return pat
    return re.compile(pat, flags = flags)
def tags(names):
    """Build a regex pattern (string) that matches only the tags with given names,
    both opening and closing ones.

    'names' is a list of names, a string of space-separated names, or a single name.
    The matched tag name is available in the 1st (opening) or 4th (closing) group.
    """
    if isstring(names) and ' ' in names:
        names = names.split()
    if islist(names):
        names = "|".join(names)

    # the alternative of names is inserted twice: for the opening and for the closing tag
    template = r"""<(?:(%s)(\s(?:\s*[a-zA-Z][\w:\-]*(?:\s*=(?:\s*"(?:\\"|[^"])*"|\s*'(?:\\'|[^'])*'|[^\s>]+))?)*)?(\s*[\/\?]?)|\/(%s)\s*)>"""
    return template % (names, names)
def tags_except(names, special = True):
    """Build a regex pattern (string) that matches all tags _except_ the given names.

    'names' is a list of names, a string of space-separated names, or a single name.
    If special=True (default), special tags are included: <!-- --> <? ?> <![CDATA
    """
    if isstring(names) and ' ' in names:
        names = names.split()
    if islist(names):
        names = "|".join(names)
    # a word boundary (\b) must follow the tag name, to avoid prefix matching of other tags
    excluded = r"(?:%s)\b" % names

    parts = [r"""<(?:(?!%s)([a-zA-Z\?][\w:\-]*)(\s(?:\s*[a-zA-Z][\w:\-]*(?:\s*=(?:\s*"(?:\\"|[^"])*"|\s*'(?:\\'|[^'])*'|[^\s>]+))?)*)?(\s*[\/\?]?)|\/(?!%s)([a-zA-Z][\w:\-]*)\s*"""]
    if special:
        parts.append(r"|!--((?:[^\-]|-(?!->))*)--|!\[CDATA\[((?:[^\]]|\](?!\]>))*)\]\]")
    parts.append(r")>")

    # the negative lookahead is filled in twice: for opening and for closing tags
    return "".join(parts) % (excluded, excluded)
def get_random(cls, randomness="__random__", rand=None, **params):
    """
    Return a single random item.

    'randomness' can be the name of an attribute of cls, which is then looked up
    with getattr(). If the resulting object is a RandomInstance, its get_random(rand)
    is returned. Otherwise attributes are computed with cls._get_attributes(randomness, params)
    and passed to cls._get_random_instance() together with 'rand'.

    In subclasses, always override _get_random_instance() instead of this method.
    """
    source = getattr(cls, randomness) if isstring(randomness) else randomness

    if not isinstance(source, RandomInstance):
        attributes = cls._get_attributes(source, params)
        return cls._get_random_instance(attributes, rand)

    return source.get_random(rand)
def tags_pair(names = None):
    """Build a regex pattern (string) that matches a whole element: from an opening tag
    to the closing tag of the same name. The pattern consists of:
    (1) an opening tag with a name from 'names', or any name if 'names' is empty/None;
    (2) the "body": any number of characters, matched lazy (as few characters as possible);
    (3) a closing tag with the same name as the opening tag.

    'names' is a list of names or a string of whitespace-separated names.
    The matched tag name is available in the 1st group.
    Self-closing tags <.../> are NOT matched.
    """
    if names:
        if isstring(names):
            names = names.split()
        if islist(names):
            names = "|".join(names)
    else:
        # "any tag name"; for XML matching this regex is too strict, as XML allows for other names, too
        names = r"[a-zA-Z\?][\w:\-]*"

    opening = r"""<(%s)(\s(?:\s*[a-zA-Z][\w:\-]*(?:\s*=(?:\s*"(?:\\"|[^"])*"|\s*'(?:\\'|[^'])*'|[^\s>]+))?)*)?\s*>"""
    body = r".*?"                       # lazy "match all"
    closing = r"<\/(\1)\s*>"            # \1: the name captured in the opening tag

    template = "".join([opening, body, closing])
    return template % names
def generate_random(cls, randomness="__random__", rand=None):
    """Generate an infinite stream of random items.

    'randomness' can be the name of an attribute of cls, which is then looked up
    with getattr(). If the resulting object is a RandomInstance, items come from its
    get_random(rand). Otherwise attributes are computed once, with cls._get_attributes(),
    and every item is created with cls._get_random_instance(attributes, rand).
    """
    source = getattr(cls, randomness) if isstring(randomness) else randomness

    if isinstance(source, RandomInstance):
        while True:
            yield source.get_random(rand)
    else:
        attributes = cls._get_attributes(source)        # computed once, reused for all items
        while True:
            yield cls._get_random_instance(attributes, rand)
def conv2D_BN(y, *args, **kwargs):
    """Apply Conv2D followed by BatchNormalization to tensor `y`, then optionally
    add other tensors and apply an activation. Returns the output tensor.

    *args and **kwargs are passed to Conv2D, except for these extra keyword arguments:
    - activation: (optional) name of an activation, wrapped in an Activation layer,
                  or a callable applied to the output tensor
    - add: (optional) tensor or a list of tensors (typically a shortcut connection) to be added
           to the output right after BatchNormalization, but before activation
    """
    activation = kwargs.pop('activation', None)
    if isstring(activation): activation = Activation(activation)

    # Normalize 'add' to a plain list. A single tensor must be detected with
    # 'is None' / islist() checks only: truth-testing a tensor ("if add") is not
    # allowed for symbolic tensors and raises an exception.
    add = kwargs.pop('add', None)
    if add is None:
        add = []
    elif islist(add):
        add = list(add)                 # copy; also makes "[y] + add" safe if a tuple was given
    else:
        add = [add]

    y = Conv2D(*args, **kwargs)(y)
    y = BatchNormalization()(y)
    if add: y = layers_add([y] + add)   # 'add' is a list here, so truth-testing is safe
    if activation: y = activation(y)
    return y
def resnext_unit(y, bottleneck, channels_out, paths, strides=1, dilation_rate=1, activation='relu'):
    """
    Build a ResNeXt residual unit on top of tensor `y`; return the output tensor.
    The unit is optionally extended with spatial (vertical+horizontal) paths:
    if `paths` is a triple (A,B,C), A is the no. of 1x9 (horizontal) paths, B: 9x1 (vertical), C: 3x3.

    `bottleneck` and `channels_out` are the numbers of filters of the first and the last
    1x1 convolution, respectively. `strides` and `dilation_rate` go to grouped_convolution();
    `strides` is also applied on the shortcut connection if it needs an adjustment.
    `activation` is either a name, wrapped in an Activation layer, or a callable.
    """
    if isstring(activation):
        activation = Activation(activation)

    input_depth = y.shape[-1]
    skip = y

    # The residual block is built as a bottleneck + grouped-convolution + rev-bottleneck,
    # which is equivalent to the original formulation as a collection of paths,
    # but makes the network more economical.

    # (1) bottleneck
    y = Conv2D(bottleneck, (1, 1), padding='same')(y)
    y = BatchNormalization()(y)
    y = activation(y)

    # (2) grouped convolution (the middle element of paths)
    y = grouped_convolution(y, bottleneck, paths, strides=strides, dilation_rate=dilation_rate)
    y = BatchNormalization()(y)
    y = activation(y)

    # (3) rev-bottleneck, followed by batch normalization: it is employed after
    # aggregating the transformations and before adding to the shortcut
    y = Conv2D(channels_out, (1, 1), padding='same')(y)
    y = BatchNormalization()(y)

    # If input/output have different dimensions - because of a stride (spatial dimension)
    # or a different depth - an extra 1x1 convolution on the shortcut does the adjustment.
    has_stride = strides not in [None, 1, (1, 1)]
    if has_stride or input_depth != channels_out:
        skip = Conv2D(channels_out, (1, 1), strides=strides, padding='same')(skip)
        skip = BatchNormalization()(skip)

    total = layers_add([skip, y])
    return activation(total)
def load(self, input):
    """Yields consecutive objects decoded from 'input', via self.decode().

    'input' is either a file object, or a name of file to be opened.
    A file object is passed to decode() directly and is left open - closing it is up
    to the caller. A file opened here from a name is closed automatically when
    the returned generator is exhausted or closed.
    If you have a string with encoded data, not a file, use decode() instead.
    """
    if not isstring(input):
        return self.decode(input)

    # open eagerly, so that errors with the file are raised right at the call
    stream = open(input, 'rt')

    def decode_and_close():
        # the file must stay open for as long as objects are being decoded from it
        with stream:
            for obj in self.decode(stream):
                yield obj

    return decode_and_close()
def info(node, depth = 0):
    """Describe 'node' and its subtree, indenting each level by 2 spaces per 'depth'.

    For a string node, returns the indented string itself. For any other node, returns
    a pair: (indented node.info(), list of info() results for node.children).
    """
    indent = ' ' * depth
    if not isstring(node):
        header = indent + node.info()
        nested = [info(child, depth + 1) for child in node.children]
        return header, nested
    return indent + node