Beispiel #1
0
def clean_steam():
    """Re-normalise ``steamed_content`` on every stored ``Document``.

    Each document whose ``steamed_content`` is truthy has that text run
    through the module's cleaning helpers, written back to the same
    attribute and saved. Documents with empty content are left untouched.
    ``avance`` is called once per document, including skipped ones;
    presumably it reports progress -- confirm against its definition.
    """
    documents = Document.objects.all()
    total = len(documents)
    goal = 0
    current = 0

    for document in documents:
        goal, current = avance(current, total, goal)
        if not document.steamed_content:
            continue

        text = unicode(document.steamed_content)

        # Per the original notes: drops <>, [], <!-- --> and line breaks.
        text = strip_tags(text)

        # Trim the edges and lowercase; per the original notes the helper
        # also removes accents. The result is padded with one space on
        # each side before the next steps.
        padded = ' ' + remove_non_unicode(text.strip().lower()) + ' '

        # Per the original notes: strip digits/special characters, then
        # tidy up spaces.
        cleaned = remove_spaces(remove_non_alphanumeric(padded))

        document.steamed_content = cleaned
        document.save()
Beispiel #2
0
    def clean_content(self, stopwords=None, s_t=True):
        """Fill ``self.cleaned_content`` from ``self.original_content``.

        Nothing happens when ``cleaned_content`` is already truthy. The
        result is only assigned to the attribute; this method does not
        save the instance and returns ``None``.

        Args:
            stopwords: Stop-word specification handed to ``remove_words``.
                When ``None`` it is built from every ``Stopword`` row as
                ``" word "`` entries joined with ``"|"``. A falsy value
                (after that lookup) skips stop-word removal and prints a
                notice instead.
            s_t: When true, the text is first passed through
                ``strip_tags``.
        """
        if self.cleaned_content:
            return

        text = unicode(self.original_content)

        # Per the original notes: drops <>, [], <!-- --> and line breaks.
        if s_t:
            text = strip_tags(text)

        # Trim the edges and lowercase; per the original notes the helper
        # also removes accents. The result is padded with one space on
        # each side before the next steps.
        text = " " + remove_non_unicode(text.strip().lower()) + " "

        # Per the original notes: strip digits/special characters, then
        # tidy up spaces.
        text = remove_spaces(remove_non_alphanumeric(text))

        # Stop-word removal: fall back to the stored stop words when the
        # caller supplied none.
        if stopwords is None:
            stopwords = "|".join(
                " " + str(word) + " " for word in Stopword.objects.all()
            )

        if not stopwords:
            # Single parenthesised argument: prints the same text as the
            # bare print statement under Python 2.
            print("Document %s: There aren't any stop words!" % self.id)
        else:
            text = remove_words(text, stopwords)

        # NOTE(review): one replace pass turns each pair of spaces into a
        # single space, so a run of three or more is not fully collapsed.
        self.cleaned_content = text.replace("  ", " ").strip()