Example #1
import re

def _clean(s):
    if s is None:
        return None
    s = unicode(s)
    s = stripAccents(s)
    # Replace query syntax characters with spaces, then collapse whitespace.
    s = re.sub(r"[<>+*]", " ", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()
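
All of the examples on this page lean on a stripAccents helper (the later examples import it from a toolkit module). A minimal stand-in, assuming it only removes diacritics, could look like this; the project's real implementation may do more:

import unicodedata

def stripAccents(s):
    # Sketch only: NFKD-decompose and drop the combining marks, so that
    # for example u"caf\xe9" becomes u"cafe".
    return u"".join(c for c in unicodedata.normalize("NFKD", unicode(s))
                    if not unicodedata.combining(c))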
Example #2
def parse_to_terms(s, simplify_terms=True, strip_accents=True):
    if strip_accents:
        s = stripAccents(s)
    try:
        terms = get_grammar().parseString(s, parseAll=True)[0]
    except Exception as e:
        # Re-raise any pyparsing error as a domain-specific QueryParseError.
        raise QueryParseError("{e.__class__.__name__}: {e}".format(**locals()))
    return terms
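
A hypothetical call site for parse_to_terms; get_grammar and QueryParseError are assumed to come from the surrounding query-parser module:

try:
    terms = parse_to_terms(u"(foo AND bar*) OR baz")
except QueryParseError as e:
    print "could not parse query: %s" % e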
Example #5
def _sanitize(self, input):
    input = toolkit.stripAccents(input, latin1=True)
    input = input.replace("\n", " ")  # Alpino stops parsing on a line break
    input = input.replace("|", "-")   # | is the field separator, so replace it
    # Drop any remaining characters that latin-1 cannot represent.
    input = input.encode('latin-1', 'ignore').decode('latin-1')
    return input
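
The final encode/decode round-trip silently drops any character that latin-1 cannot represent, for example:

u"Tsch\u00fcss \u2013 tot ziens".encode('latin-1', 'ignore').decode('latin-1')
# -> u'Tsch\xfcss  tot ziens' (the en dash is dropped)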
Example #6
def tokenizeRawText(self, text):
    """
    Sentences are tokenized (and tagged).
    """
    sent = stripAccents(text)
    if self.zeropunctuation:
        sent = clean(sent, 25)
    sent = self.tokenizer.tokenize(sent)
    if self.posfilter or self.postagging:
        tokens = self.tagger.tag(sent)
    else:
        tokens = [(w, None) for w in sent]
    for word, pos in tokens:
        yield (word, pos)
Example #7
import re

def stripText(text, removeSpecial=False, stripAccents=True):
    if not text:
        return text

    for regExp, replacement in stripRegExpTuple:
        text = regExp.sub(replacement, text)

    if removeSpecial:
        # Keep only word characters, spaces, and basic punctuation.
        text = re.sub(ur'[^\w \-,\.\!\?\:/]+', '', text)

    text = toolkit.unescapeHtml(text)
    if stripAccents:
        text = toolkit.stripAccents(text)

    return text.strip()
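
The removeSpecial branch whitelists word characters, spaces and a few punctuation marks; a quick illustration:

import re
print re.sub(ur'[^\w \-,\.\!\?\:/]+', '', u'Price: $5 (approx.) 100%!')
# -> Price: 5 approx. 100!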
Example #8
import re

def get_text(article):
    text = u"{article.headline}\n\n{article.text}".format(**locals())
    text = text.replace("\r\n", "\n")
    text = text.replace("\r", "")
    text = stripAccents(text)

    # Make sure every paragraph ends in sentence-final punctuation,
    # then join the paragraphs into a single line.
    pars = re.split(r"\n\n+", text)
    for i, par in enumerate(pars):
        if par and par[-1] not in ".:?!":
            pars[i] = par + "."
    text = " ".join(pars)

    text = re.sub(r"\s+", " ", text)
    text = text.encode('ascii', 'ignore')

    if len(text) > 10000:
        # Truncate at the first sentence boundary after 10000 characters.
        end = text.find(".", 10000)
        if end != -1:
            text = text[:end + 1]

    return text
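
A worked example with a hypothetical stand-in for the article object (the real one is presumably a model instance with headline and text attributes):

class FakeArticle:
    headline = u"Kabinet valt"
    text = u"Den Haag - Het kabinet is gevallen\n\nMeer nieuws volgt"

print get_text(FakeArticle())
# -> Kabinet valt. Den Haag - Het kabinet is gevallen. Meer nieuws volgt.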
Example #9
def __init__(self, query, label=None):
    self.query = stripAccents(query)
    self.declared_label = stripAccents(label)
    # Fall back to the query itself when no label was declared.
    self.label = self.declared_label or self.query
Example #10
def __init__(self, query, label=None):
    self.query = stripAccents(query)
    self.declared_label = _clean(label)
    self.label = self.declared_label or _clean(self.query)
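
Usage sketch for this constructor (the class name Query is assumed here, and stripAccents is assumed to remove diacritics as in the sketch under Example #1):

q = Query(u"caf\xe9*")
# q.query == u"cafe*" (accents stripped); with no declared label,
# q.label falls back to the cleaned query: u"cafe"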
Example #12
def _chunks_to_text(chunks):
    text = "\n".join(chunks)
    # The chunks may contain literal "\n" escapes; turn them into real newlines.
    text = text.replace("\\n", "\n")
    text = decode_html_entities(text)
    text = toolkit.stripAccents(text)
    return text.strip()
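
A hypothetical call, assuming decode_html_entities and toolkit.stripAccents behave as their names suggest:

_chunks_to_text([u"r\u00e9sum\u00e9 &amp; more", u"second\\nchunk"])
# -> u"resume & more\nsecond\nchunk"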