class BreakerUpWords(object):
    """Break known compound tokens into their component words.

    A table of ``word -> segmentation code`` pairs is loaded from a CSV file
    when the object is constructed.  Tokens found in that table are split
    according to their code; every other token is passed through unchanged.

    Attributes:
        worker: ``FileWorker`` used for all file reads and writes.
        words_codes: dict with two parallel lists, ``"words"`` and
            ``"codes"``; ``codes[i]`` is the segmentation code of
            ``words[i]``.
        stemmingWords: dict with two parallel lists, ``"words"`` and
            ``"matches"``.  Only present after ``load_stemming_words()``
            has been called.
    """

    # File locations, kept in one place so they can be overridden.
    WORDS_CODES_PATH = "/home/rodrigo/Twitter Analysis Library/lib/db/hashtagsDataBase.csv"
    # NOTE(review): the table is saved under a bare file name but loaded from
    # an absolute path -- confirm how FileWorker.write resolves this name.
    WORDS_CODES_SAVE_NAME = "hashtagsDataBase.csv"
    STEMMING_WORDS_PATH = "/home/rodrigo/Twitter Analysis Library/lib/db/stemmingDataBase.csv"

    def __init__(self):
        """Create the file worker and load the word/code table."""
        super(BreakerUpWords, self).__init__()
        self.worker = FileWorker()
        self.load_words_codes()

    def load_words_codes(self):
        """(Re)load ``self.words_codes`` from the word/code CSV file.

        Each non-blank line is split on ``','``; the first field is the word
        and the second its segmentation code.  Blank lines are skipped.

        Raises:
            IndexError: if a non-blank line contains no comma.
        """
        words = []
        codes = []
        self.words_codes = {"words": words, "codes": codes}
        for item in self.worker.read(self.WORDS_CODES_PATH):
            if not item.strip():
                # e.g. a trailing empty line at the end of the file
                continue
            attr = item.split(',')
            words.append(attr[0])
            # NOTE(review): the code field is stored unstripped, so it may
            # carry a line terminator if FileWorker.read keeps them.  It is
            # left as-is because save_words_codes() may rely on it; segment()
            # ignores every character other than '1'.
            codes.append(attr[1])

    def save_words_codes(self):
        """Write the word/code table back out as ``word,code`` lines."""
        pairs = zip(self.words_codes["words"], self.words_codes["codes"])
        lines = ["%s,%s" % (word, code) for word, code in pairs]
        self.worker.write(self.WORDS_CODES_SAVE_NAME, lines)

    def break_up_words(self, tokens):
        """Return a new flat list with every known token in *tokens* split.

        Args:
            tokens: iterable of token strings.

        Returns:
            A list in which each known token is replaced, in place, by its
            component words; unknown tokens are kept unchanged.
        """
        new_tokens = []
        for token in tokens:
            # extend() avoids re-copying the accumulated list on every step
            new_tokens.extend(self.break_up(token))
        return new_tokens

    def break_up(self, token):
        """Split a single token if it appears in the word/code table.

        Args:
            token: the token to look up.

        Returns:
            The list of component words when *token* is a known word,
            otherwise ``[token]``.
        """
        try:
            # a single scan instead of ``in`` followed by ``index``
            index = self.words_codes["words"].index(token)
        except ValueError:
            return [token]
        code = self.words_codes["codes"][index]
        return self.segment(token, code)

    def segment(self, text, segs):
        """Cut *text* into words at the positions flagged in *segs*.

        Args:
            text: the string to split.
            segs: string of flags; a ``'1'`` at index ``i`` ends a word right
                after ``text[i]``.  Any other character means "no cut".

        Returns:
            The list of pieces, in order.  Empty pieces (produced when a
            ``'1'`` sits at or beyond the last character of *text*) are
            dropped; an empty *text* yields ``[""]``.
        """
        pieces = []
        last = 0
        for i, flag in enumerate(segs):
            if flag == '1':
                pieces.append(text[last:i + 1])
                last = i + 1
        pieces.append(text[last:])
        words = [piece for piece in pieces if piece]
        # the pieces always add up to text, so this fallback only triggers
        # for an empty text
        return words or [text]

    def load_stemming_words(self):
        """Load ``self.stemmingWords`` from the stemming CSV file.

        Each non-blank line is split on ``','``; the first field is the word
        and the second, with trailing whitespace removed, is its match.
        Blank lines are skipped.

        Raises:
            IndexError: if a non-blank line contains no comma.
        """
        words = []
        matches = []
        self.stemmingWords = {"words": words, "matches": matches}
        # reuse the instance's worker rather than building another one
        for line in self.worker.read(self.STEMMING_WORDS_PATH):
            if not line.strip():
                continue
            attr = line.split(',')
            words.append(attr[0])
            matches.append(attr[1].rstrip())
# Example no. 3
# 0
def read(name):
    """Print whatever ``FileWorker.read`` returns for *name*."""
    contents = FileWorker.read(name)
    print(contents)
# Example no. 4
# 0
 def get_humans(self):
     """Return the result of ``FileWorker.read`` applied to ``self.file``."""
     reader = FileWorker.read
     return reader(self.file)