Ejemplo n.º 1
0
def stopwords(words, *stopLists, **params):
    """Filter stopwords, numbers and single characters out of a list of words or a string.

    Arguments:
    - words: a list (iterable) of words, or a string; a string is split on runs of non-word
             characters (regex \\W+, with re.UNICODE) before filtering.
    - *stopLists: collections of stopwords that get combined into one set. Each can be a string
             (split on whitespace), a list (converted to a set) or any other object that can be
             merged into a set with |=. When none is given, the global STOP is used.
    Keyword options (passed through **params):
    - numbers: if True (default), drop tokens that are numbers: digit-only tokens and
             optionally signed decimal numbers such as '-5', '3.14', '.5', '2.'.
             Note that a string input is split on non-word characters first, so '3.14' inside
             a string arrives here as two separate tokens '3' and '14' (both dropped as digits).
    - singles: if True (default), drop tokens shorter than 2 characters (this includes the empty
             tokens produced by splitting a string).
    - asstring: if true, return the result joined with single spaces; defaults to True
             when 'words' is a string, and to a list result otherwise.

    Comparison with the stop list is done in lowercase, but original case is left in the result.
    >>> stopwords("This is an example string.")
    'example string'
    >>> stopwords(u"Echte of neppatiënten")         # unicode characters recognized correctly inside words
    u'Echte neppati\\xc3 nten'
    """
    numbers = params.get('numbers', True)
    singles = params.get('singles', True)
    asstring = params.get('asstring', None)

    if not stopLists:
        stop = STOP
    else:
        stop = set()                                            # fresh set, so the caller's collections are never mutated
        for s in stopLists:
            stop |= set(s.split()) if isstring(s) else set(s) if islist(s) else s

    if isstring(words):
        if asstring is None: asstring = True                    # string in -> string out, unless requested otherwise
        words = re.split(r'\W+', words, flags = re.UNICODE)

    # optionally signed integer or decimal number; isdigit() alone misses '3.14', '-5', '.5'
    isnumber = re.compile(r'[+-]?(?:\d+(?:\.\d*)?|\.\d+)\Z', re.UNICODE).match

    res = []
    for w in words:
        if singles and len(w) < 2: continue
        if numbers and (w.isdigit() or isnumber(w)): continue   # isdigit() kept for digit characters that \d does not cover
        if w.lower() in stop: continue
        res.append(w)

    if asstring: return ' '.join(res)                           # get back to a concatenated text
    return res
Ejemplo n.º 2
0
    def join(self, iterable):
        """Join the items of 'iterable' using this text as the separator and return a new Text.

        The language of the result is computed by folding Text.combine over all the items,
        starting from self.language; that initial value can be None, in which case
        the items themselves determine the resulting language.
        """
        # the items are traversed twice (language check + actual joining),
        # so anything that is not already a list has to be materialized first
        items = iterable if islist(iterable) else list(iterable)

        # verifies that languages of all the items are compatible and yields the combined language
        language = reduce(Text.combine, items, self.language)

        joined = unicode.join(self, items)
        return Text(joined, language)
Ejemplo n.º 3
0
 def tags(names):
     """Build a regex pattern (as a string) that matches only the tags with given names,
     both the opening and the closing ones.

     'names' can be a single tag name, a string of space-separated names, or a list of names;
     multiple names are combined into a regex alternation.
     Groups of the resulting pattern: 1st - name of an opening tag, 2nd - its attributes,
     3rd - optional trailing '/' or '?' of an opening tag, 4th - name of a closing tag.
     """
     # normalize 'names' to a regex alternation: "a b" or ["a", "b"]  ->  "a|b"
     if isstring(names) and ' ' in names:
         names = names.split()
     if islist(names):
         names = "|".join(names)

     # the same alternation is inserted twice: for the opening-tag and for the closing-tag branch
     template = r"""<(?:(%s)(\s(?:\s*[a-zA-Z][\w:\-]*(?:\s*=(?:\s*"(?:\\"|[^"])*"|\s*'(?:\\'|[^'])*'|[^\s>]+))?)*)?(\s*[\/\?]?)|\/(%s)\s*)>"""
     return template % (names, names)
Ejemplo n.º 4
0
 def tags_except(names, special = True):
     """Returns a regex pattern (as a string) matching all tags _except_ the ones with given names.

     Arguments:
     - names: a single tag name, a string of space-separated names, or a list of names to be excluded.
              If empty (empty string, empty list, None), no tag is excluded and all tags are matched.
     - special: if True (default), the pattern matches also comments <!-- --> and <![CDATA[ ]]> sections.

     Groups of the resulting pattern: 1st - name of an opening tag, 2nd - its attributes,
     3rd - optional trailing '/' or '?' of an opening tag, 4th - name of a closing tag;
     and when special=True: 5th - contents of a comment, 6th - contents of a CDATA section.
     """
     pat = r"""<(?:(?!%s)([a-zA-Z\?][\w:\-]*)(\s(?:\s*[a-zA-Z][\w:\-]*(?:\s*=(?:\s*"(?:\\"|[^"])*"|\s*'(?:\\'|[^'])*'|[^\s>]+))?)*)?(\s*[\/\?]?)|\/(?!%s)([a-zA-Z][\w:\-]*)\s*"""
     if special: pat += r"|!--((?:[^\-]|-(?!->))*)--|!\[CDATA\[((?:[^\]]|\](?!\]>))*)\]\]"
     pat += r")>"
     if isstring(names) and ' ' in names: names = names.split()
     if islist(names): names = "|".join(names)
     if names:
         excluded = r"(?:%s)\b" % names       # must check for word boundary (\b) at the end of a tag name, to avoid prefix matching of other tags
     else:
         # With no names, "(?:)\b" would succeed in front of every tag name and so reject ALL regular tags.
         # "(?!)" can never match, hence the enclosing negative lookahead always passes and nothing is excluded.
         excluded = r"(?!)"
     return pat % (excluded, excluded)
Ejemplo n.º 5
0
    def tags_pair(names = None):
        """Build a regex pattern (as a string) that matches a complete element:
        (1) an opening tag whose name is one of 'names' - or any name, if 'names' is empty/None;
        (2) the "body": any number of characters, matched lazily (as few as possible);
        (3) a closing tag with the same name as the opening one (enforced by a backreference).
        The tag name is captured in the 1st group. Self-closing tags <.../> are NOT matched.

        'names' can be a string of whitespace-separated names or a list of names.
        """
        if names:
            # "a b" or ["a", "b"]  ->  regex alternation "a|b"
            if isstring(names): names = names.split()
            if islist(names): names = "|".join(names)
        else:
            # "any tag name"; for XML matching this regex is too strict, as XML allows for other names, too
            names = r"[a-zA-Z\?][\w:\-]*"

        opening = r"""<(%s)(\s(?:\s*[a-zA-Z][\w:\-]*(?:\s*=(?:\s*"(?:\\"|[^"])*"|\s*'(?:\\'|[^'])*'|[^\s>]+))?)*)?\s*>"""
        body = r".*?"                                   # lazy "match all"
        closing = r"<\/(\1)\s*>"                        # \1 = the name captured by the opening tag

        # only 'opening' contains a %s placeholder, so the names land in the opening tag
        return (opening + body + closing) % names
Ejemplo n.º 6
0
def conv2D_BN(y, *args, **kwargs):
    """Apply Conv2D followed by BatchNormalization to 'y', then an optional addition and activation.

    All positional and keyword arguments other than the ones listed below are passed to Conv2D.
    Extra arguments:
    - activation: (optional) a callable applied to the output as the last step, or a string,
           which is wrapped up in Activation(...)
    - add: (optional) tensor or a list of tensors (typically a shortcut connection) to be added to the output
           right after BatchNormalization, but before activation
    Returns the output of the last step applied.
    """
    activation = kwargs.pop('activation', None)
    if isstring(activation): activation = Activation(activation)

    # Normalize 'add' to a list. The check must be "is None" rather than truthiness:
    # symbolic tensors don't allow evaluation of their truth value, so "if add" fails for a single tensor.
    add = kwargs.pop('add', None)
    if add is None: add = []
    elif islist(add): add = list(add)
    else: add = [add]

    y = Conv2D(*args, **kwargs)(y)
    y = BatchNormalization()(y)
    if add: y = layers_add([y] + add)               # 'add' is always a list here; empty list means nothing to add
    if activation: y = activation(y)

    return y
Ejemplo n.º 7
0
def conv2D_BN(y, *args, **kwargs):
    """Apply Conv2D followed by BatchNormalization to 'y', then an optional addition and activation.

    All positional and keyword arguments other than the ones listed below are passed to Conv2D.
    Extra arguments:
    - activation: (optional) a callable applied to the output as the last step, or a string,
           which is wrapped up in Activation(...)
    - add: (optional) tensor or a list of tensors (typically a shortcut connection) to be added to the output
           right after BatchNormalization, but before activation
    Returns the output of the last step applied.
    """
    activation = kwargs.pop('activation', None)
    if isstring(activation): activation = Activation(activation)

    # Normalize 'add' to a list. The check must be "is None" rather than truthiness:
    # symbolic tensors don't allow evaluation of their truth value, so "if add" fails for a single tensor.
    add = kwargs.pop('add', None)
    if add is None: add = []
    elif islist(add): add = list(add)
    else: add = [add]

    y = Conv2D(*args, **kwargs)(y)
    y = BatchNormalization()(y)
    if add: y = layers_add([y] + add)               # 'add' is always a list here; empty list means nothing to add
    if activation: y = activation(y)

    return y