Ejemplo n.º 1
0
	def get_body(self, content):
		"""Strip wiki markup from *content* and return its processed tokens.

		Removes <ref>...</ref> blocks, the abbreviation "i.e", periods and
		all non-alphanumeric characters, collapses repeated spaces, then
		lowercases, tokenizes, stems and removes stop words.
		"""
		# Drop inline reference tags (non-greedy so each pair is matched).
		content = re.sub(r"<ref.*?</ref>", ' ', content)
		# BUG FIX: the dot must be escaped — the original pattern "i.e"
		# matched 'i' + ANY character + 'e' (e.g. the "ife" in "life").
		content = re.sub(r"i\.e", '', content)
		content = re.sub(r"\.", ' ', content)
		# Keep only letters, digits and spaces.
		content = re.sub(r'[^a-zA-Z0-9 ]', '', content)
		# Collapse runs of spaces introduced by the substitutions above.
		content = re.sub(r' +', ' ', content)

		return remove_stop_words(stem_tokens(tokenize(content.lower())))
Ejemplo n.º 2
0
	def extract_external_links(self, content):
		"""Harvest tokens from the "External links" bullet lines of *content*.

		For each line containing a wiki bullet link ('* [' or '*['), drops
		tokens that contain 'http' and appends the stemmed, stop-word-free
		tokens of the remaining words to self.article.token['external_links'].
		"""
		# Plain iteration replaces the Python-2-only xrange index loop.
		for line in content.split("\n"):
			if '* [' in line or '*[' in line:
				tokens = line.split(' ')
				# BUG FIX: the original tested `'http' not in temp` against
				# the whole token LIST, which is true unless a token equals
				# "http" exactly — so URLs were never filtered. Test each
				# token individually instead.
				words = [tok for tok in tokens if 'http' not in tok]
				try:
					joined = ' '.join(words).encode('utf-8')
					self.article.token['external_links'].extend(
						remove_stop_words(stem_tokens(tokenize(joined))))
				except Exception:
					# Best-effort, as in the original: skip lines whose
					# tokens cannot be encoded/processed.
					pass