def _preprocessCaption(self, cap):
    """Tokenize a caption into a ``{word: count}`` dictionary.

    Processing steps, in order:

    1. ``@mentions`` are removed: the ``@`` and every following character
       up to (but not including) the next space, tab or ``#``.
    2. The text is split on case boundaries and non-letters: a space is
       inserted before the first letter of each upper-case run, upper-case
       letters are lower-cased, and any character that is neither upper-
       nor lower-case is replaced by a space.
    3. The result is split on whitespace; words shorter than three
       characters are dropped, as are words found in
       ``Stopwords.stopwords()`` when ``self._stopword_removal`` is truthy.

    Args:
        cap: Caption text to process.

    Returns:
        A dict mapping each kept word to its number of occurrences.
    """
    # NOTE(review): this method is defined twice in the file with the same
    # logic; if both live in the same scope the later definition shadows
    # the earlier one. Consider deleting one copy.

    def removeAt(text):
        # Strip "@name" mentions, e.g. "@eddie".
        mention_end = (' ', '\t', '#')
        kept = []
        in_mention = False
        for ch in text:
            if ch == '@':
                in_mention = True
                continue
            if in_mention and ch in mention_end:
                # The terminator ends the mention and is itself kept.
                in_mention = False
            if not in_mention:
                kept.append(ch)
        return ''.join(kept)

    pieces = []
    prev_upper = False
    for ch in removeAt(cap):
        if ch.isupper():
            # Only the first capital of a run starts a new word, so
            # "HelloWorld" -> "hello world" while "NYC" stays "nyc".
            if not prev_upper:
                pieces.append(' ')
            pieces.append(ch.lower())
            prev_upper = True
            continue
        # Digits, punctuation and whitespace all act as separators.
        pieces.append(ch if ch.islower() else ' ')
        prev_upper = False

    # Only load the stopword list when it is actually going to be used.
    stopword_list = Stopwords.stopwords() if self._stopword_removal else ()

    counts = {}
    # split() with no arguments already discards surrounding whitespace,
    # so the words need no further stripping.
    for word in ''.join(pieces).split():
        if len(word) < 3 or word in stopword_list:
            continue
        counts[word] = counts.get(word, 0) + 1
    return counts
def _preprocessCaption(self, cap):
    """Tokenize a caption into a ``{word: count}`` dictionary.

    Processing steps, in order:

    1. ``@mentions`` are removed: the ``@`` and every following character
       up to (but not including) the next space, tab or ``#``.
    2. The text is split on case boundaries and non-letters: a space is
       inserted before the first letter of each upper-case run, upper-case
       letters are lower-cased, and any character that is neither upper-
       nor lower-case is replaced by a space.
    3. The result is split on whitespace; words shorter than three
       characters are dropped, as are words found in
       ``Stopwords.stopwords()`` when ``self._stopword_removal`` is truthy.

    Args:
        cap: Caption text to process.

    Returns:
        A dict mapping each kept word to its number of occurrences.
    """
    # NOTE(review): this method is defined twice in the file with the same
    # logic; if both live in the same scope the later definition shadows
    # the earlier one. Consider deleting one copy.

    def removeAt(text):
        # Strip "@name" mentions, e.g. "@eddie".
        mention_end = (' ', '\t', '#')
        kept = []
        in_mention = False
        for ch in text:
            if ch == '@':
                in_mention = True
                continue
            if in_mention and ch in mention_end:
                # The terminator ends the mention and is itself kept.
                in_mention = False
            if not in_mention:
                kept.append(ch)
        return ''.join(kept)

    pieces = []
    prev_upper = False
    for ch in removeAt(cap):
        if ch.isupper():
            # Only the first capital of a run starts a new word, so
            # "HelloWorld" -> "hello world" while "NYC" stays "nyc".
            if not prev_upper:
                pieces.append(' ')
            pieces.append(ch.lower())
            prev_upper = True
            continue
        # Digits, punctuation and whitespace all act as separators.
        pieces.append(ch if ch.islower() else ' ')
        prev_upper = False

    # Only load the stopword list when it is actually going to be used.
    stopword_list = Stopwords.stopwords() if self._stopword_removal else ()

    counts = {}
    # split() with no arguments already discards surrounding whitespace,
    # so the words need no further stripping.
    for word in ''.join(pieces).split():
        if len(word) < 3 or word in stopword_list:
            continue
        counts[word] = counts.get(word, 0) + 1
    return counts
def _preprocessText(self, cap):
    """Tokenize free text into a ``{word: count}`` dictionary.

    The text is passed through ``tool.textPreprocessor`` (defined outside
    this block; its exact behavior is not visible here) and the result is
    split on whitespace. Words shorter than three characters are dropped,
    as are words found in ``Stopwords.stopwords()`` when
    ``self._stopword_removal`` is truthy.

    Args:
        cap: Text to process.

    Returns:
        A dict mapping each kept word to its number of occurrences.
    """
    # NOTE(review): assumes tool.textPreprocessor returns a str, so that
    # split() with no arguments yields words without surrounding
    # whitespace and no further stripping is needed -- confirm.
    words = tool.textPreprocessor(cap).split()

    # Only load the stopword list when it is actually going to be used.
    stopword_list = Stopwords.stopwords() if self._stopword_removal else ()

    counts = {}
    for word in words:
        if len(word) < 3 or word in stopword_list:
            continue
        counts[word] = counts.get(word, 0) + 1
    return counts