Ejemplo n.º 1
0
 def main(self):
     """
     Sort every raw record into the cleaned or the trash collection.

     Only complex applications need to override this method.

     ``get_contents()`` and ``get_keywords()`` are called first. Each record
     of ``self.raw_list`` is then handled in turn: the field at
     ``self.data_column_index`` is converted with ``to_unicode`` and stored
     in ``self.current_string``, and ``sources_finder()`` is called. If
     ``self.current_result`` is truthy afterwards, the record and its index
     are appended to ``cleaned_list`` / ``clean_index``; otherwise they are
     appended to ``trash_list`` / ``trash_index``.

     :return: None
     """
     self.get_contents()
     self.get_keywords()
     # NOTE(review): presumably the two calls above populate raw_list and
     # keywords_list -- confirm against their definitions.
     self.count_list = [0] * len(self.keywords_list)
     for index, record in enumerate(self.raw_list):
         self.current_string = to_unicode(record[self.data_column_index])
         self.sources_finder()
         try:
             if self.current_result:
                 self.clean_index.append(index)
                 self.cleaned_list.append(record)
             else:
                 self.trash_index.append(index)
                 self.trash_list.append(record)
         except Exception:
             # Best effort: a record that cannot be classified is skipped.
             # Catching Exception rather than using a bare ``except`` lets
             # KeyboardInterrupt / SystemExit still stop a long batch run.
             continue
     # NOTE: when several data files or several keyword files are processed
     # in one batch, raw_list and keywords_list need to be reset between
     # runs; this method does not reset them.
Ejemplo n.º 2
0
 def main(self):
     """
     Flag records that match any keyword and optionally print statistics.

     Only complex applications need to override this method.

     ``get_contents()`` and ``get_keywords()`` are called first. For each
     record of ``self.raw_list`` the field at ``self.data_column_index`` is
     converted with ``to_unicode`` into ``self.current_string``; every entry
     of ``self.keywords_list`` is then assigned to ``self.current_keywords``
     and ``keywords_finder()`` is called. Each time ``self.current_result``
     is truthy, the record is appended to ``trash_list``, the result to
     ``result_list``, and that keyword's slot in ``count_list`` is
     incremented. With ``self.one_hit_strategy`` set, the keyword scan stops
     at the first hit. A record with at least one hit has its index added
     to ``trash_index``; a record with none goes to ``clean_index`` /
     ``cleaned_list``.

     When ``self.show_process`` is truthy, a summary of hit counts and
     their share of all records is printed.

     :return: None
     """
     self.get_contents()
     self.get_keywords()
     self.count_list = [0] * len(self.keywords_list)
     for index, record in enumerate(self.raw_list):
         self.current_string = to_unicode(record[self.data_column_index])
         hit = False
         try:
             for keywords_index, keywords in enumerate(self.keywords_list):
                 self.current_keywords = keywords
                 self.keywords_finder()
                 if not self.current_result:
                     continue
                 # trash_list and result_list grow together, one entry per
                 # (record, keyword) hit -- so without one_hit_strategy a
                 # record can appear in trash_list more than once.
                 self.trash_list.append(record)
                 self.result_list.append(self.current_result)
                 hit = True
                 self.count_list[keywords_index] += 1
                 if self.one_hit_strategy:
                     break
             if hit:
                 self.trash_index.append(index)
             else:
                 self.clean_index.append(index)
                 self.cleaned_list.append(record)
         except Exception:
             # Best effort: a record that fails during matching is skipped.
             # Catching Exception rather than using a bare ``except`` lets
             # KeyboardInterrupt / SystemExit still stop a long batch run.
             pass
     if self.show_process:
         total_length = float(len(self.raw_list))

         def percent_of_total(count):
             # An empty data set would otherwise raise ZeroDivisionError.
             if not total_length:
                 return 0.0
             return count / total_length * 100

         # Sum of per-keyword hits; when one_hit_strategy is off a single
         # record may be counted once per matching keyword.
         keyword_count = sum(self.count_list)
         print(u'關鍵詞標記微博數量為 ' + str(keyword_count) + u' 占' +
               str(percent_of_total(keyword_count)) + '%')
         print(u"{0}以下是關鍵詞標記的水贴{0}".format(u"-" * 30))
         for count_index, count in enumerate(self.count_list):
             print(u'关键词 "' + self.keywords_list[count_index] + u'" 匹配的微博数量为 '
                   + str(count) + u'  占'
                   + str(percent_of_total(count)) + '%')
     # NOTE: when several data files or several keyword files are processed
     # in one batch, raw_list and keywords_list need to be reset between
     # runs; this method does not reset them.