def _clean(s):
    """Normalize a query string for parsing.

    Strips accents, blanks out the operator characters ``<>+*`` and
    collapses all whitespace runs into single spaces.

    Returns None when *s* is None.
    """
    if s is None:
        return
    s = unicode(s)
    s = stripAccents(s)
    # Raw strings for regex patterns (same bytes as before, but avoids
    # invalid-escape deprecation warnings and is the idiomatic form).
    s = re.sub(r"[<>+*]", " ", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()
def parse_to_terms(s, simplify_terms=True, strip_accents=True): if strip_accents: s = stripAccents(s) try: terms = get_grammar().parseString(s, parseAll=True)[0] except Exception, e: raise QueryParseError("{e.__class__.__name__}: {e}".format(**locals()))
def _clean(s):
    """Return *s* with accents stripped, the characters <>+* blanked out,
    and whitespace normalized; None is passed through unchanged."""
    if s is None:
        return
    cleaned = stripAccents(unicode(s))
    cleaned = re.sub("[<>+*]", " ", cleaned)
    return re.sub("\s+", " ", cleaned).strip()
def _sanitize(self, input):
    """Prepare *input* for Alpino: strip accents, remove characters that
    confuse the parser, and force the text into latin-1."""
    text = toolkit.stripAccents(input, latin1=True)
    # Alpino stops parsing at a line break, and '|' is its field
    # separator (and we don't care about it anyway).
    text = text.replace("\n", " ").replace("|", "-")
    # Drop anything latin-1 cannot represent.
    return text.encode('latin-1', 'ignore').decode('latin-1')
def tokenizeRawText(self, text):
    """Tokenize (and optionally POS-tag) *text*, yielding (word, pos) pairs.

    pos is None for every token when neither POS filtering nor POS tagging
    is enabled on this instance.
    """
    sent = stripAccents(text)
    if self.zeropunctuation == True:
        # BUG FIX: previously this cleaned the raw `text`, silently
        # discarding the accent-stripped `sent` computed just above.
        sent = clean(sent, 25)
    sent = self.tokenizer.tokenize(sent)
    if self.posfilter or (self.postagging == True):
        tokens = self.tagger.tag(sent)
    else:
        tokens = [(w, None) for w in sent]
    for word, pos in tokens:
        yield (word, pos)
def stripText(text, removeSpecial=False, stripAccents=True): if not text: return text for regExp, replacement in stripRegExpTuple: #print regExp text = regExp.sub(replacement, text) if removeSpecial: text = re.sub(ur'[^\w \-,\.\!\?\:/]+', '', text) text = toolkit.unescapeHtml(text) if stripAccents: text = toolkit.stripAccents(text) return text.strip()
def get_text(article):
    """Build one plain-ASCII text string from an article's headline and body.

    Paragraphs are forced to end in sentence punctuation, whitespace is
    collapsed, accents and non-ASCII characters are dropped, and very long
    texts are truncated at the first sentence boundary after 10000 chars.
    """
    text = u"{article.headline}\n\n{article.text}".format(**locals())
    text = text.replace("\r\n", "\n")
    text = text.replace("\r", "")
    text = stripAccents(text)
    # Ensure every paragraph ends with sentence-final punctuation so that
    # joined paragraphs read as separate sentences downstream.
    pars = re.split(r"\n\n+", text)
    for i, par in enumerate(pars):
        if par and par[-1] not in ".:?!":
            pars[i] = par + "."
    text = " ".join(pars)
    text = re.sub(r"\s+", " ", text)
    text = text.encode('ascii', 'ignore')
    if len(text) > 10000:
        # BUG FIX: if no '.' occurs after position 10000, find() returns -1
        # and the old slice text[:0] destroyed the entire text. Only
        # truncate when a sentence boundary is actually found.
        end = text.find(".", 10000)
        if end != -1:
            text = text[:end + 1]
    return text
def __init__(self, query, label=None):
    """Store the accent-stripped query and derive a display label:
    the declared label when given, otherwise the query itself."""
    self.query = stripAccents(query)
    self.declared_label = stripAccents(label)
    self.label = self.declared_label if self.declared_label else self.query
def __init__(self, query, label=None):
    """Remember the accent-stripped query plus a label: the cleaned
    declared label when present, else the cleaned query."""
    self.query = stripAccents(query)
    declared = _clean(label)
    self.declared_label = declared
    self.label = declared or _clean(self.query)
def _sanitize(self, input):
    """Make *input* safe to feed to Alpino (latin-1 only, no newlines,
    no '|' field separators)."""
    replacements = (
        ("\n", " "),  # a line break makes Alpino stop parsing
        ("|", "-"),   # '|' is the field separator and we don't care anyway
    )
    text = toolkit.stripAccents(input, latin1=True)
    for old, new in replacements:
        text = text.replace(old, new)
    return text.encode('latin-1', 'ignore').decode('latin-1')
def _chunks_to_text(chunks):
    """Join text chunks into one accent-stripped, entity-decoded string."""
    joined = "\n".join(chunks)
    # Literal backslash-n sequences in the source become real newlines.
    joined = joined.replace("\\n", "\n")
    decoded = decode_html_entities(joined)
    return toolkit.stripAccents(decoded).strip()