def _clean(s):
    """Normalize a query string for parsing.

    Strips accents, blanks out the operator characters ``<>+*`` and
    collapses all whitespace runs into single spaces.

    Returns None when *s* is None.
    """
    if s is None:
        return
    s = unicode(s)
    s = stripAccents(s)
    # Raw strings for regex patterns (same bytes as before, but avoids
    # invalid-escape deprecation warnings and is the idiomatic form).
    s = re.sub(r"[<>+*]", " ", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()
def parse_to_terms(s, simplify_terms=True, strip_accents=True): if strip_accents: s = stripAccents(s) try: terms = get_grammar().parseString(s, parseAll=True)[0] except Exception, e: raise QueryParseError("{e.__class__.__name__}: {e}".format(**locals()))
def _clean(s):
    """Return *s* with accents stripped, the characters <>+* blanked out,
    and whitespace normalized; None is passed through unchanged."""
    if s is None:
        return
    cleaned = stripAccents(unicode(s))
    cleaned = re.sub("[<>+*]", " ", cleaned)
    return re.sub("\s+", " ", cleaned).strip()
def _sanitize(self, input):
    """Prepare *input* for Alpino: strip accents, remove characters that
    confuse the parser, and force the text into latin-1."""
    text = toolkit.stripAccents(input, latin1=True)
    # Alpino stops parsing at a line break, and '|' is its field
    # separator (and we don't care about it anyway).
    text = text.replace("\n", " ").replace("|", "-")
    # Drop anything latin-1 cannot represent.
    return text.encode('latin-1', 'ignore').decode('latin-1')
def tokenizeRawText(self, text):
    """Tokenize (and optionally POS-tag) *text*, yielding (word, pos) pairs.

    pos is None for every token when neither POS filtering nor POS tagging
    is enabled on this instance.
    """
    sent = stripAccents(text)
    if self.zeropunctuation == True:
        # BUG FIX: previously this cleaned the raw `text`, silently
        # discarding the accent-stripped `sent` computed just above.
        sent = clean(sent, 25)
    sent = self.tokenizer.tokenize(sent)
    if self.posfilter or (self.postagging == True):
        tokens = self.tagger.tag(sent)
    else:
        tokens = [(w, None) for w in sent]
    for word, pos in tokens:
        yield (word, pos)
def stripText(text, removeSpecial=False, stripAccents=True): if not text: return text for regExp, replacement in stripRegExpTuple: #print regExp text = regExp.sub(replacement, text) if removeSpecial: text = re.sub(ur'[^\w \-,\.\!\?\:/]+', '', text) text = toolkit.unescapeHtml(text) if stripAccents: text = toolkit.stripAccents(text) return text.strip()
def get_text(article):
    """Build one plain-ASCII text string from an article's headline and body.

    Paragraphs are forced to end in sentence punctuation, whitespace is
    collapsed, accents and non-ASCII characters are dropped, and very long
    texts are truncated at the first sentence boundary after 10000 chars.
    """
    text = u"{article.headline}\n\n{article.text}".format(**locals())
    text = text.replace("\r\n", "\n")
    text = text.replace("\r", "")
    text = stripAccents(text)
    # Ensure every paragraph ends with sentence-final punctuation so that
    # joined paragraphs read as separate sentences downstream.
    pars = re.split(r"\n\n+", text)
    for i, par in enumerate(pars):
        if par and par[-1] not in ".:?!":
            pars[i] = par + "."
    text = " ".join(pars)
    text = re.sub(r"\s+", " ", text)
    text = text.encode('ascii', 'ignore')
    if len(text) > 10000:
        # BUG FIX: if no '.' occurs after position 10000, find() returns -1
        # and the old slice text[:0] destroyed the entire text. Only
        # truncate when a sentence boundary is actually found.
        end = text.find(".", 10000)
        if end != -1:
            text = text[:end + 1]
    return text
def __init__(self, query, label=None):
    """Store the accent-stripped query and derive a display label:
    the declared label when given, otherwise the query itself."""
    self.query = stripAccents(query)
    self.declared_label = stripAccents(label)
    self.label = self.declared_label if self.declared_label else self.query
def __init__(self, query, label=None):
    """Remember the accent-stripped query plus a label: the cleaned
    declared label when present, else the cleaned query."""
    self.query = stripAccents(query)
    declared = _clean(label)
    self.declared_label = declared
    self.label = declared or _clean(self.query)
def _sanitize(self, input):
    """Make *input* safe to feed to Alpino (latin-1 only, no newlines,
    no '|' field separators)."""
    replacements = (
        ("\n", " "),  # a line break makes Alpino stop parsing
        ("|", "-"),   # '|' is the field separator and we don't care anyway
    )
    text = toolkit.stripAccents(input, latin1=True)
    for old, new in replacements:
        text = text.replace(old, new)
    return text.encode('latin-1', 'ignore').decode('latin-1')
def _chunks_to_text(chunks):
    """Join text chunks into one accent-stripped, entity-decoded string."""
    joined = "\n".join(chunks)
    # Literal backslash-n sequences in the source become real newlines.
    joined = joined.replace("\\n", "\n")
    decoded = decode_html_entities(joined)
    return toolkit.stripAccents(decoded).strip()