class BreakerUpWords(object):
    """Break concatenated tokens (e.g. hashtags) into words using a code database.

    Loads (word, segmentation-code) pairs from a CSV file. A code is a string
    of '0'/'1' flags parallel to the word's characters, where '1' at position
    i means "cut after character i" (see segment()).
    """

    def __init__(self):
        super(BreakerUpWords, self).__init__()
        self.worker = FileWorker()
        self.load_words_codes()

    def load_words_codes(self, path="/home/rodrigo/Twitter Analysis Library/lib/db/hashtagsDataBase.csv"):
        """Load word/code pairs from *path* into self.words_codes.

        ``path`` defaults to the original hard-coded database location so
        existing callers are unaffected.
        """
        self.words_codes = {"words": [], "codes": []}
        for item in self.worker.read(path):
            attr = item.split(',')
            word = attr[0]
            # rstrip() drops the trailing newline that would otherwise be
            # stored inside the code string and written back by
            # save_words_codes (matches load_stemming_words' handling).
            code = attr[1].rstrip()
            self.words_codes["words"].append(word)
            self.words_codes["codes"].append(code)

    def save_words_codes(self):
        """Write the in-memory word/code pairs back out as CSV lines."""
        lines = ["%s,%s" % (word, code)
                 for word, code in zip(self.words_codes["words"],
                                       self.words_codes["codes"])]
        self.worker.write("hashtagsDataBase.csv", lines)

    def break_up_words(self, tokens):
        """Return a flat list where every known token is broken into words."""
        new_tokens = []
        for token in tokens:
            new_tokens.extend(self.break_up(token))
        return new_tokens

    def break_up(self, token):
        """Break *token* using its stored code; unknown tokens pass through.

        Returns a list of word pieces, or ``[token]`` when the token is not
        in the database.
        """
        if token in self.words_codes["words"]:
            index = self.words_codes["words"].index(token)
            code = self.words_codes["codes"][index]
            return self.segment(token, code)
        return [token]

    def segment(self, text, segs):
        """Split *text* per *segs*: a '1' at index i cuts after text[i]."""
        words = []
        last = 0
        for i, flag in enumerate(segs):
            if flag == '1':
                words.append(text[last:i + 1])
                last = i + 1
        # Remainder after the last cut (the whole text when segs is all '0').
        words.append(text[last:])
        return words
def load_stemming_words(self, path="/home/rodrigo/Twitter Analysis Library/lib/db/stemmingDataBase.csv"):
    """Load known (word, match) stemming pairs from *path*.

    Populates ``self.stemmingWords`` with parallel ``"words"`` /
    ``"matches"`` lists. ``path`` defaults to the original hard-coded
    database location so existing callers are unaffected.
    """
    self.stemmingWords = {"words": [], "matches": []}
    fileWorker = FileWorker()
    for line in fileWorker.read(path):
        attr = line.split(',')
        self.stemmingWords["words"].append(attr[0])
        # rstrip() removes the trailing newline left on the last CSV field.
        self.stemmingWords["matches"].append(attr[1].rstrip())
def read(name):
    """Read *name* through FileWorker and print whatever it returns."""
    contents = FileWorker.read(name)
    print(contents)
def get_humans(self):
    """Return the data FileWorker reads from ``self.file``.

    NOTE(review): the exact return shape depends on FileWorker.read —
    presumably a list of lines; confirm against that class.
    """
    data = FileWorker.read(self.file)
    return data