def output(self, delimiter='\t', addnormalised=False):
    """Generate one delimited line per type in the frequency list.

    :param delimiter: Field separator between the type, its count and
        (optionally) its normalised frequency (default: tab)
    :param addnormalised: If True, append ``count / self.total`` as a
        third field on every line
    :rtype: generator over strings

    Note: despite the historical name, this is a generator — it yields
    lines rather than printing them.
    """
    for item, count in self:
        # Render the type to a single string regardless of its form:
        # sequences are space-joined, strings pass through, anything
        # else is stringified.
        if isinstance(item, (tuple, list)):
            entry = " ".join(u(x) for x in item)
        elif isstring(item):
            entry = item
        else:
            entry = str(item)
        if addnormalised:
            # assumes self.total is non-zero when addnormalised is
            # requested — TODO confirm against the enclosing class
            yield entry + delimiter + str(count) + delimiter + str(count / self.total)
        else:
            yield entry + delimiter + str(count)
def output(self, delimiter='\t', addnormalised=False):
    """Print a representation of the frequency list"""
    for key, freq in self:
        # Turn the type into its textual label first, then assemble
        # the output fields in one place.
        if isinstance(key, (tuple, list)):
            label = " ".join(u(part) for part in key)
        elif isstring(key):
            label = key
        else:
            label = str(key)
        fields = [label, str(freq)]
        if addnormalised:
            fields.append(str(freq / self.total))
        yield delimiter.join(fields)
def tokenize(text, regexps=TOKENIZERRULES):
    """Tokenizes a string and returns a list of tokens

    :param text: The text to tokenise
    :type text: string
    :param regexps: Regular expressions to use as tokeniser rules in tokenisation (default=_pynlpl.textprocessors.TOKENIZERRULES_)
    :type regexps: Tuple/list of regular expressions to use in tokenisation
    :rtype: Returns a list of tokens

    Examples:

    >>> for token in tokenize("This is a test."):
    ...     print(token)
    This
    is
    a
    test
    .
    """
    # Compile any patterns still given as strings into a LOCAL list.
    # The previous code assigned compiled patterns back into ``regexps``,
    # which silently mutated the shared module-level default list (and
    # raised TypeError when a tuple was passed).
    regexps = [re.compile(r) if isstring(r) else r for r in regexps]
    tokens = []
    begin = 0  # start index of the token currently being scanned
    for i, c in enumerate(text):
        if begin > i:
            continue  # still inside a span consumed by a rule match below
        elif i == begin:
            # At a token boundary: give the special tokeniser rules first
            # shot, matching against a bounded lookahead window.
            m = False
            for regexp in regexps:
                m = regexp.findall(text[i:i + 300])
                if m:
                    tokens.append(m[0])
                    begin = i + len(m[0])
                    break
            if m:
                continue
        if c in string.punctuation or c in WHITESPACE:
            prev = text[i - 1] if i > 0 else ""
            nextchar = text[i + 1] if i < len(text) - 1 else ""
            if (c == '.' or c == ',') and prev.isdigit() and nextchar.isdigit():
                #punctuation in between numbers, keep as one token
                pass
            elif (c == "'" or c == "`") and prev.isalpha() and nextchar.isalpha():
                #quote in between chars, keep...
                pass
            elif c not in WHITESPACE and nextchar == c:
                #group clusters of identical punctuation together
                continue
            elif c == '\r' and prev == '\n':
                #ignore
                begin = i + 1
                continue
            else:
                # Flush the pending token (if any), then emit the
                # punctuation character itself as its own token.
                token = text[begin:i]
                if token:
                    tokens.append(token)
                if c not in WHITESPACE:
                    tokens.append(c)  #anything but spaces and newlines (i.e. punctuation) counts as a token too
                begin = i + 1  #set the begin cursor
    # Flush any trailing token left after the final delimiter
    if begin <= len(text) - 1:
        token = text[begin:]
        tokens.append(token)
    return tokens
def element2rst(element, retaintokenisation=False, _previousdelimiter=""):
    """Get the text associated with this element (of the specified class),
    will always be a unicode instance.

    If no text is directly associated with the element, it will be obtained
    from the children. If that doesn't result in any text either, a
    NoSuchText exception will be raised.

    If retaintokenisation is True, the space attribute on words will be
    ignored, otherwise it will be adhered to and text will be detokenised
    as much as possible.
    """
    prefix = suffix = ""  # RST markup wrapped around this element's text
    indent = ""           # per-line indent for block-level constructs
    if element.TEXTCONTAINER:
        if isinstance(element, folia.TextMarkupStyle):
            #we guess how possible class names may be mapped to RST directly, set-agnostic
            if element.href:
                # hyperlink: `text <url>`_
                prefix = "`"
                suffix = " <" + element.href + ">`_"
            elif element.cls and (element.cls == 'strong' or element.cls[:4] == 'bold' or element.cls == 'b'):
                prefix = suffix = "**"  # bold
            elif element.cls and (element.cls[:2] == 'em' or element.cls[:6] == 'italic' or element.cls == 'i' or element.cls[:5] == 'slant'):
                prefix = suffix = "*"   # emphasis
            elif element.cls and (element.cls[:3] == 'lit' or element.cls[:4] == 'verb' or element.cls[:4] == 'code'):
                prefix = suffix = "``"  # inline literal
        # Text containers hold a mix of raw strings and nested markup
        # elements; concatenate them in document order.
        s = prefix
        for e in element:
            if isstring(e):
                s += e
            else:
                if s:
                    s += e.TEXTDELIMITER  #for AbstractMarkup, will usually be ""
                s += element2rst(e)
        return s + suffix
    if not element.PRINTABLE:  #only printable elements can hold text
        raise folia.NoSuchText
    # Map block-level FoLiA elements onto RST constructs
    if isinstance(element, folia.ListItem):
        if element.n:
            prefix = element.n + ") "  # enumerated list item
        else:
            prefix = "- "              # bullet list item
    elif isinstance(element, folia.Head):
        # Heading depth = number of ancestor Divisions that carry a Head;
        # adornment character is chosen per level.
        level = 0
        for div in element.ancestors(folia.Division):
            if div.count(folia.Head, None, [folia.Division]):
                level += 1
        suffix = "\n" + ADORNMENT[level - 1] * (len(element.text()) + 10) + "\n\n"
    elif isinstance(element, folia.Figure) and element.src:
        prefix = ".. figure::" + element.src + "\n\n"
    elif isinstance(element, folia.Note):
        #TODO
        pass
    elif isinstance(element, folia.Caption):
        indent = " "
    elif isinstance(element, folia.Quote) and not isinstance(element.parent, folia.Sentence) and not isinstance(element.parent, folia.Paragraph):
        indent = " "  #block quote
    elif isinstance(element, folia.Gap) and not isinstance(element.parent, folia.Sentence) and not isinstance(element.parent, folia.Paragraph):
        prefix = "\n\n::\n\n" + element.content() + "\n\n"  #literal block
    elif isinstance(element, folia.List):
        suffix = "\n\n"
    if element.hastext():
        if indent:
            # Indent every line of the element's own text; the prefix goes
            # on the first line only. FIX: accumulate with ``s +=`` — the
            # previous code reassigned ``s`` each iteration, dropping all
            # but the last line of multi-line text.
            s = ""
            for i, ss in enumerate(element2rst(element.textcontent()).split("\n")):
                if i == 0:
                    s += indent + prefix + ss + "\n"
                else:
                    s += indent + ss + "\n"
        else:
            s = prefix + element2rst(element.textcontent())
    else:
        #Not found, descend into children
        delimiter = ""
        s = ""
        for e in element:
            if e.PRINTABLE and not isinstance(e, folia.TextContent):
                try:
                    if indent:
                        for ss in element2rst(e, retaintokenisation, delimiter).split("\n"):
                            if not s:
                                s += indent + prefix + ss + "\n"
                            else:
                                s += indent + ss + "\n"
                    else:
                        if not s:
                            s += prefix
                        s += element2rst(e, retaintokenisation, delimiter)
                    delimiter = e.gettextdelimiter(retaintokenisation)  #delimiter will be buffered and only printed upon next iteration, this prevents the delimiter being output at the end of a sequence
                except folia.NoSuchText:
                    continue
    if s and _previousdelimiter:
        return _previousdelimiter + s + suffix
    elif s:
        return s + suffix
    else:
        #No text found at all :`(
        raise folia.NoSuchText
def element2rst(element, retaintokenisation=False, _previousdelimiter=""):
    """Get the text associated with this element (of the specified class),
    will always be a unicode instance.

    If no text is directly associated with the element, it will be obtained
    from the children. If that doesn't result in any text either, a
    NoSuchText exception will be raised.

    If retaintokenisation is True, the space attribute on words will be
    ignored, otherwise it will be adhered to and text will be detokenised
    as much as possible.
    """
    # prefix/suffix: RST markup wrapped around this element's text;
    # indent: per-line indent for block-level constructs (captions, quotes)
    prefix = suffix = ""
    indent = ""
    if element.TEXTCONTAINER:
        if isinstance(element, folia.TextMarkupStyle):
            #we guess how possible class names may be mapped to RST directly, set-agnostic
            if element.href:
                # hyperlink: `text <url>`_
                prefix = "`"
                suffix = " <" + element.href + ">`_"
            elif element.cls and (element.cls == 'strong' or element.cls[:4] == 'bold' or element.cls == 'b'):
                prefix = suffix = "**"  # bold
            elif element.cls and (element.cls[:2] == 'em' or element.cls[:6] == 'italic' or element.cls == 'i' or element.cls[:5] == 'slant'):
                prefix = suffix = "*"   # emphasis
            elif element.cls and (element.cls[:3] == 'lit' or element.cls[:4] == 'verb' or element.cls[:4] == 'code'):
                prefix = suffix = "``"  # inline literal
        # Text containers hold a mix of raw strings and nested markup
        # elements; concatenate them in document order.
        s = prefix
        for e in element:
            if isstring(e):
                s += e
            else:
                if s:
                    s += e.TEXTDELIMITER  #for AbstractMarkup, will usually be ""
                s += element2rst(e)
        return s + suffix
    if not element.PRINTABLE:  #only printable elements can hold text
        raise folia.NoSuchText
    # Map block-level FoLiA elements onto RST constructs
    if isinstance(element, folia.ListItem):
        if element.n:
            prefix = element.n + ") "  # enumerated list item
        else:
            prefix = "- "              # bullet list item
    elif isinstance(element, folia.Head):
        # Heading depth is derived from the number of ancestor Divisions
        # that carry a Head element of their own
        level = 0
        for div in element.ancestors(folia.Division):
            if div.count(folia.Head, None, [folia.Division]):
                level += 1
        suffix = "\n" + ADORNMENT[level - 1] * (len(element.text()) + 10) + "\n\n"
    elif isinstance(element, folia.Figure) and element.src:
        prefix = ".. figure::" + element.src + "\n\n"
    elif isinstance(element, folia.Note):
        #TODO
        pass
    elif isinstance(element, folia.Caption):
        indent = " "
    elif isinstance(element, folia.Quote) and not isinstance(
            element.parent, folia.Sentence) and not isinstance(
                element.parent, folia.Paragraph):
        indent = " "  #block quote
    elif isinstance(element, folia.Gap) and not isinstance(
            element.parent, folia.Sentence) and not isinstance(
                element.parent, folia.Paragraph):
        prefix = "\n\n::\n\n" + element.content() + "\n\n"  #literal block
    elif isinstance(element, folia.List):
        suffix = "\n\n"
    if element.hastext():
        if indent:
            # NOTE(review): ``s`` is reassigned (not appended) on every
            # iteration, so for multi-line text only the LAST line survives;
            # this looks like it should accumulate with ``s +=`` — confirm
            # intent before changing.
            for i, ss in enumerate(
                    element2rst(element.textcontent()).split("\n")):
                if i == 0:
                    s = indent + prefix + ss + "\n"
                else:
                    s = indent + ss + "\n"
        else:
            s = prefix + element2rst(element.textcontent())
    else:
        #Not found, descend into children
        delimiter = ""
        s = ""
        for e in element:
            if e.PRINTABLE and not isinstance(e, folia.TextContent):
                try:
                    if indent:
                        # prefix is emitted on the first produced line only
                        for ss in element2rst(e, retaintokenisation,
                                              delimiter).split("\n"):
                            if not s:
                                s += indent + prefix + ss + "\n"
                            else:
                                s += indent + ss + "\n"
                    else:
                        if not s:
                            s += prefix
                        s += element2rst(e, retaintokenisation, delimiter)
                    delimiter = e.gettextdelimiter(retaintokenisation)  #delimiter will be buffered and only printed upon next iteration, this prevents the delimiter being output at the end of a sequence
                except folia.NoSuchText:
                    continue
    if s and _previousdelimiter:
        return _previousdelimiter + s + suffix
    elif s:
        return s + suffix
    else:
        #No text found at all :`(
        raise folia.NoSuchText