# strip_tags, remove_non_unicode, remove_non_alphanumeric, remove_spaces
# and remove_words are project helpers defined elsewhere; Stopword is a
# Django model.
def clean_content(self, stopwords=None, s_t=True):
    if not self.cleaned_content:
        aux = unicode(self.original_content)
        # Strip <...> tags, [...] blocks, <!-- --> comments and line breaks
        if s_t:
            aux = strip_tags(aux)
        # Trim edge whitespace, lowercase, and remove accents
        aux = " " + remove_non_unicode(aux.strip().lower()) + " "
        # Remove numbers and special characters
        aux = remove_non_alphanumeric(aux)
        # Collapse extra spaces
        aux = remove_spaces(aux)
        # Remove stop words (load them from the Stopword model if none given)
        if stopwords is None:
            sw = Stopword.objects.all()
            stopwords = "|".join([" " + str(x) + " " for x in sw])
        if stopwords:
            aux = remove_words(aux, stopwords)
        else:
            print "Document %s: There aren't any stop words!" % self.id
        # Collapse the double spaces left behind by stop-word removal
        aux = aux.replace("  ", " ")
        self.cleaned_content = aux.strip()
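# The helpers used by clean_content are not defined in this listing. The
# versions below are only a sketch of what they might look like, inferred
# from the comments above (accent stripping, punctuation removal,
# whitespace collapsing, stop-word deletion); the project's real
# implementations may differ.
import re
import unicodedata

def remove_non_unicode(text):
    # Assumed to strip accents: decompose each character (NFKD) and drop
    # the combining marks, so e.g. u"canción" becomes u"cancion".
    nfkd = unicodedata.normalize('NFKD', text)
    return u''.join(c for c in nfkd if not unicodedata.combining(c))

def remove_non_alphanumeric(text):
    # Assumed to replace digits and punctuation with spaces, keeping only
    # lowercase letters (the text is already lowercased by the caller).
    return re.sub(r'[^a-z ]', ' ', text)

def remove_spaces(text):
    # Assumed to collapse runs of whitespace into single spaces.
    return re.sub(r'\s+', ' ', text)

def remove_words(text, pattern):
    # Assumed to delete every stop word matched by the "|"-joined pattern
    # built in clean_content, leaving a single space in its place.
    return re.sub(pattern, ' ', text)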
import re
from bs4 import BeautifulSoup

def get_article_content(htmlcode):
    # Pull the <section class="bodytext"> block out of the raw HTML;
    # the IGNORECASE flag goes in re.compile(), not in search().
    regex = re.compile(r'<section class="bodytext">((\t|\n|.)*?)</section>',
                       re.IGNORECASE)
    section = regex.search(htmlcode).group(1)
    article_div = BeautifulSoup(section, 'lxml', from_encoding="utf-8")
    ps = []
    if article_div:
        ps = article_div.findAll("p")
    contents = []
    for p in ps:
        # Drop "Link:"/"Links:" labels and strip any remaining markup
        contents.append(strip_tags(str(p).replace('Link:', '').replace('Links:', '')))
    content = " ".join(contents)
    content = re.sub("\n", "", content)
    return content.strip()
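# A minimal usage sketch for get_article_content, assuming the page is
# fetched with urllib2 (the URL below is a placeholder, not one from the
# source).
import urllib2

html = urllib2.urlopen("http://example.com/some-article").read()
print get_article_content(html)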