Example #1
	def get_body(self, content):
		# Strip <ref>...</ref> citation markup, the abbreviation "i.e",
		# stray periods, and all remaining non-alphanumeric characters,
		# then collapse runs of spaces. Requires "import re" at module level.
		content = re.sub(r"<ref.*?</ref>", ' ', content)
		content = re.sub(r"i\.e", '', content)
		content = re.sub(r"\.", ' ', content)
		content = re.sub(r'[^a-zA-Z0-9 ]', '', content)
		content = re.sub(r' +', ' ', content)

		return remove_stop_words(stem_tokens(tokenize(content.lower())))
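All of these examples lean on tokenize, stem_tokens, and remove_stop_words, none of which appear on this page. A minimal sketch of what they might look like, assuming NLTK (the original project may implement them differently):

	from nltk.stem import PorterStemmer
	from nltk.corpus import stopwords

	_stemmer = PorterStemmer()
	_stops = set(stopwords.words('english'))  # needs nltk.download('stopwords') once

	def tokenize(text):
		# Plain whitespace split; the real helpers may use a smarter tokenizer.
		return text.split()

	def stem_tokens(tokens):
		return [_stemmer.stem(t) for t in tokens]

	def remove_stop_words(tokens):
		return [t for t in tokens if t not in _stops]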
Example #2
	def extract_external_links(self, content):
		# Lines in a wiki "External links" section look like
		# "* [http://... anchor text]"; keep the anchor text, drop the URL.
		for line in content.split("\n"):
			if '* [' in line or '*[' in line:
				words = [token for token in line.split(' ') if 'http' not in token]
				text = ' '.join(words)
				self.article.token['external_links'].extend(
					remove_stop_words(stem_tokens(tokenize(text))))
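The key step is the list-comprehension filter that drops URL tokens. In isolation (plain Python, no class needed):

	line = "* [http://example.org Official site of the project]"
	kept = [token for token in line.split(' ') if 'http' not in token]
	print(' '.join(kept))  # -> "* Official site of the project]"
	# Leftover wiki markup ("*" and the trailing "]") survives this filter.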
Example #3
	def get_tokens(self, content, title):
		# Tokenize each article field separately and store it under a
		# per-field key (lowercase keys, consistent with the other fields).
		self.article.token['title'] = tokenize(title)
		self.article.token['headings'] = tokenize(self.get_headings())
		self.article.token['references'] = tokenize(self.get_references(content))
		self.article.token['text'] = self.get_body(self.article.content)
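self.article.token is evidently a dict mapping field names to token lists. A hypothetical sketch of the Article container these methods assume (the name and shape are guesses from usage, not from the source):

	class Article:
		def __init__(self, content):
			self.content = content
			# One token list per indexed field, matching the keys used above.
			self.token = {
				'title': [],
				'headings': [],
				'references': [],
				'text': [],
				'external_links': [],
			}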
Example #4
def processTitle(title):
	"""
	The title is converted to lower case, tokenized, and stemmed.
	"""
	# "stemmer" is expected to be defined at module level; note that
	# stem_tokens takes it as an explicit argument here, unlike Example #1.
	return stem_tokens(tokenize(title.lower()), stemmer)
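A self-contained usage sketch for this variant, with an NLTK PorterStemmer standing in for the module-level stemmer (an assumption, not shown in the source):

	from nltk.stem import PorterStemmer

	stemmer = PorterStemmer()

	def tokenize(text):
		return text.split()

	def stem_tokens(tokens, stemmer):
		return [stemmer.stem(t) for t in tokens]

	print(processTitle("Conservation of Angular Momentum"))
	# -> ['conserv', 'of', 'angular', 'momentum'] with the Porter stemmer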