Example #1
def output(self, delimiter='\t', addnormalised=False):
    """Yield a string representation of the frequency list, one line per type"""
    #u() and isstring() are string helpers from pynlpl.common
    for type, count in self: #each entry is a (type, count) pair
        if isinstance(type, (tuple, list)):
            #n-gram: join its parts with spaces
            if addnormalised:
                yield " ".join(u(x) for x in type) + delimiter + str(count) + delimiter + str(count / self.total)
            else:
                yield " ".join(u(x) for x in type) + delimiter + str(count)
        elif isstring(type):
            if addnormalised:
                yield type + delimiter + str(count) + delimiter + str(count / self.total)
            else:
                yield type + delimiter + str(count)
        else:
            if addnormalised:
                yield str(type) + delimiter + str(count) + delimiter + str(count / self.total)
            else:
                yield str(type) + delimiter + str(count)
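
Since output() is a generator, a caller iterates over the yielded lines rather than expecting anything to be printed. A minimal usage sketch, assuming the method belongs to pynlpl's FrequencyList; the import path and the append() call reflect pynlpl.statistics and are assumptions, not part of the snippet above:

from pynlpl.statistics import FrequencyList

freqlist = FrequencyList()
freqlist.append("to be or not to be".split()) #count each token (assumed interface)
for line in freqlist.output(addnormalised=True):
    print(line) #e.g. "be<TAB>2<TAB>0.3333..."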
Example #2
import re
import string

#isstring, WHITESPACE and TOKENIZERRULES are module-level names from
#pynlpl.textprocessors (isstring is imported there from pynlpl.common)
def tokenize(text, regexps=TOKENIZERRULES):
    """Tokenizes a string and returns a list of tokens

    :param text: The text to tokenise
    :type text: string
    :param regexps: Regular expressions to use as tokeniser rules in tokenisation (default=_pynlpl.textprocessors.TOKENIZERRULES_)
    :type regexps: Tuple/list of regular expressions
    :rtype: list of tokens

    Examples:

    >>> for token in tokenize("This is a test."):
    ...    print(token)
    This
    is
    a
    test
    .


    """

    #compile any rules that are still plain strings; build a new list so the
    #shared default argument is not mutated (and a tuple argument also works)
    regexps = [re.compile(regexp) if isstring(regexp) else regexp for regexp in regexps]

    tokens = []
    begin = 0
    for i, c in enumerate(text):
        if begin > i:
            continue
        elif i == begin:
            m = False
            for regexp in regexps:
                #try each tokeniser rule on a bounded lookahead window;
                #rules are expected to match at the start of the window
                m = regexp.findall(text[i:i+300])
                if m:
                    tokens.append(m[0])
                    begin = i + len(m[0])
                    break
            if m: continue

        if c in string.punctuation or c in WHITESPACE:
            prevchar = text[i-1] if i > 0 else ""
            nextchar = text[i+1] if i < len(text)-1 else ""

            if (c == '.' or c == ',') and prevchar.isdigit() and nextchar.isdigit():
                #punctuation in between numbers, keep as one token
                pass
            elif (c == "'" or c == "`") and prevchar.isalpha() and nextchar.isalpha():
                #quote in between characters, keep as one token
                pass
            elif c not in WHITESPACE and nextchar == c: #group clusters of identical punctuation together
                continue
            elif c == '\r' and prevchar == '\n':
                #ignore
                begin = i+1
                continue
            else:
                token = text[begin:i]
                if token: tokens.append(token)

                if c not in WHITESPACE:
                    tokens.append(c) #anything but spaces and newlines (i.e. punctuation) counts as a token too
                begin = i + 1 #set the begin cursor

    if begin <= len(text) - 1: #flush any trailing token
        token = text[begin:]
        tokens.append(token)

    return tokens
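
A quick usage sketch for reference, assuming tokenize is imported from pynlpl.textprocessors (the module that also defines TOKENIZERRULES and WHITESPACE):

from pynlpl.textprocessors import tokenize

for token in tokenize("Dr. O'Brien paid 3.50 euros"):
    print(token)
#per the pass-branches above, the apostrophe between letters and the
#decimal point between digits do not split their tokens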
Example #3
def element2rst(element, retaintokenisation=False, _previousdelimiter=""):
    """Get the text associated with this element (of the specified class), will always be a unicode instance.
    If no text is directly associated with the element, it will be obtained from the children. If that doesn't result
    in any text either, a NoSuchText exception will be raised.

    If retaintokenisation is True, the space attribute on words will be ignored, otherwise it will be adhered to and text will be detokenised as much as possible.
    """


    prefix = suffix = ""
    indent = ""


    if element.TEXTCONTAINER:
        if isinstance(element, folia.TextMarkupStyle):
            #we guess how possible class names may be mapped to RST directly, set-agnostic
            if element.href:
                prefix = "`"
                suffix = " <" + element.href + ">`_"
            elif element.cls and (element.cls == 'strong' or element.cls[:4] == 'bold' or element.cls == 'b'):
                prefix = suffix = "**"
            elif element.cls and (element.cls[:2] == 'em' or element.cls[:6] == 'italic' or element.cls == 'i' or element.cls[:5] == 'slant'):
                prefix = suffix = "*"
            elif element.cls and (element.cls[:3] == 'lit' or element.cls[:4] == 'verb' or element.cls[:4] == 'code'):
                prefix = suffix = "``"
        s = prefix
        for e in element:
            if isstring(e):
                s += e
            else:
                if s: s += e.TEXTDELIMITER #for AbstractMarkup, will usually be ""
                s += element2rst(e)
        return s + suffix
    if not element.PRINTABLE: #only printable elements can hold text
        raise folia.NoSuchText

    if isinstance(element, folia.ListItem):
        if element.n:
            prefix = element.n + ") "
        else:
            prefix = "- "
    elif isinstance(element, folia.Head):
        level = 0
        for div in element.ancestors(folia.Division):
            if div.count(folia.Head, None, [folia.Division]):
                level += 1
        suffix = "\n" + ADORNMENT[level-1] * (len(element.text()) + 10) + "\n\n"
    elif isinstance(element, folia.Figure) and element.src:
        prefix = ".. figure::" + element.src + "\n\n"
    elif isinstance(element, folia.Note):
        #TODO
        pass
    elif isinstance(element, folia.Caption):
        indent = "    "
    elif isinstance(element, folia.Quote) and not isinstance(element.parent, folia.Sentence) and not isinstance(element.parent, folia.Paragraph):
        indent = "    " #block quote
    elif isinstance(element, folia.Gap) and not isinstance(element.parent, folia.Sentence) and not isinstance(element.parent, folia.Paragraph):
        prefix = "\n\n::\n\n" + element.content() + "\n\n" #literal block
    elif isinstance(element, folia.List):
        suffix = "\n\n"

    if element.hastext():
        if indent:
            #indent every line of the text, accumulating rather than overwriting
            s = ""
            for i, ss in enumerate(element2rst(element.textcontent()).split("\n")):
                if i == 0:
                    s += indent + prefix + ss + "\n"
                else:
                    s += indent + ss + "\n"
        else:
            s = prefix + element2rst(element.textcontent())
    else:
        #Not found, descend into children
        delimiter = ""
        s = ""
        for e in element:
            if e.PRINTABLE and not isinstance(e, folia.TextContent):
                try:
                    if indent:
                        for ss in element2rst(e, retaintokenisation, delimiter).split("\n"):
                            if not s:
                                s += indent + prefix + ss + "\n"
                            else:
                                s += indent + ss + "\n"
                    else:
                        if not s: s += prefix
                        s += element2rst(e, retaintokenisation, delimiter)
                    delimiter = e.gettextdelimiter(retaintokenisation)
                    #delimiter will be buffered and only printed upon next iteration, this prevents the delimiter being output at the end of a sequence
                except folia.NoSuchText:
                    continue

    if s and _previousdelimiter:
        return _previousdelimiter + s + suffix
    elif s:
        return s + suffix
    else:
        #No text found at all :`(
        raise folia.NoSuchText
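
A usage sketch for this converter, assuming it lives in a script that imports pynlpl.formats.folia and defines ADORNMENT, the sequence of RST section-underline characters used for Head elements (the adornment string and file name below are placeholders):

from pynlpl.formats import folia

ADORNMENT = "=-~+^" #assumed adornment characters, one per section depth

doc = folia.Document(file="document.folia.xml")
for text in doc.data: #the document's top-level Text elements
    print(element2rst(text))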