def get_body(self, content):
    """Clean wiki markup from *content* and return the processed body tokens.

    Strips <ref>...</ref> tags and the "i.e" abbreviation, turns periods
    into spaces, removes all non-alphanumeric characters, collapses runs
    of spaces, then lowercases, tokenizes, stems, and drops stop words.
    """
    # Drop reference tags; non-greedy so adjacent <ref> blocks are not merged.
    content = re.sub(r"<ref.*?</ref>", ' ', content)
    # Remove the "i.e" abbreviation. The dot must be escaped: the original
    # pattern "i.e" matched ANY i<char>e trigram (e.g. "ine" in "line"),
    # silently deleting fragments of ordinary words.
    content = re.sub(r"i\.e", '', content)
    # Replace periods with spaces so sentence-adjacent words do not fuse.
    content = re.sub(r"\.", ' ', content)
    # Keep only ASCII alphanumerics and spaces, then collapse repeats.
    content = re.sub(r'[^a-zA-Z0-9 ]', '', content)
    content = re.sub(r' +', ' ', content)
    return remove_stop_words(stem_tokens(tokenize(content.lower())))
def extract_external_links(self, content):
    """Index the description words of external-link bullet lines.

    Scans *content* line by line for external-link bullets ("* [" or
    "*["), keeps the human-readable description tokens (discarding the
    URL tokens themselves), and extends
    self.article.token['external_links'] with the tokenized, stemmed,
    stop-word-filtered words. Lines whose text cannot be processed are
    skipped (best effort).
    """
    for line in content.split("\n"):
        if '* [' not in line and '*[' not in line:
            continue
        # Filter per token. The original tested `'http' not in temp`
        # (membership of the string 'http' in the LIST), which evaluated
        # the same for every key and therefore never removed URL tokens;
        # the intent is to drop each token that contains 'http'.
        words = [tok for tok in line.split(' ') if 'http' not in tok]
        try:
            text = ' '.join(words).encode('utf-8')
            self.article.token['external_links'].extend(
                remove_stop_words(stem_tokens(tokenize(text))))
        except Exception:
            # Best-effort indexing: skip lines that fail to encode or
            # tokenize, but no longer swallow KeyboardInterrupt/SystemExit
            # the way the original bare `except:` did.
            pass