Example #1
0
    def clean_content(self, stopwords=None, s_t=True):
        if not self.cleaned_content:
            aux = unicode(self.original_content)

            # Quito <>,[], <!-- --> y saltos de linea
            if s_t:
                aux = strip_tags(aux)

            # quito espacios en bordes, llevo a lowercase y saco tildes
            aux = " " + remove_non_unicode(aux.strip().lower()) + " "

            # quito Numeros y Caracteres
            aux = remove_non_alphanumeric(aux)

            # quito espacios
            aux = remove_spaces(aux)

            # quito Stop Words
            if stopwords is None:
                sw = Stopword.objects.all()
                stopwords = "|".join([" " + str(x) + " " for x in sw])

            if stopwords:
                aux = remove_words(aux, stopwords)
            else:
                print "Document %s: There aren't any stop words!" % self.id

            aux = aux.replace("  ", " ")

            self.cleaned_content = aux.strip()
Example #2
0
def get_article_content(htmlcode):
    """Extract the plain-text article body from a page's HTML.

    Finds the first ``<section class="bodytext">...</section>`` block,
    collects the text of its ``<p>`` elements (with "Link:"/"Links:"
    markers removed), and returns it as a single whitespace-trimmed
    string.  Returns "" when no body section is present.
    """
    # BUG FIX: flags must be given to re.compile().  The original code did
    # regex.search(htmlcode, re.IGNORECASE); on a compiled pattern the
    # second positional argument is *pos*, so the search was actually
    # case-sensitive and silently started at character index 2.
    regex = re.compile(r'<section class="bodytext">((\t|\n|.)*?)</section>',
                       re.IGNORECASE)
    match = regex.search(htmlcode)
    if match is None:
        # No body section: return empty text instead of crashing with
        # AttributeError on None.group(1).
        return ""
    section = match.group(1)

    article_div = BeautifulSoup(section, 'lxml', from_encoding="utf-8")
    ps = []
    if article_div:
        ps = article_div.findAll("p")

    contents = []
    for p in ps:
        # Drop "Link:"/"Links:" markers, then strip remaining HTML tags.
        text = str(p).replace('Link:', '').replace('Links:', '')
        contents.append(str(strip_tags(text)))

    content = " ".join(contents)
    content = re.sub("\n", "", content)
    return content.strip()