Example 1
 def filter_into_temp(self):
     from bs4 import BeautifulSoup
     from collections import Counter  # used below to report topic frequencies
     files = iu.list_children(self.orgn_file, full_path=True)
     array = list()
     for fidx, file in enumerate(files):
         print(fidx)
         tree = BeautifulSoup(''.join(iu.read_lines(file)), "html.parser")
         for article in tree.find_all("reuters"):
             topics = list(article.topics.children)
             if len(topics) != 1:  # keep only articles tagged with exactly one topic
                 continue
             topic = topics[0].text.encode('ascii', 'ignore').decode('ascii')
             text = article.find('text')
             if text is None or text.body is None:
                 continue
             title = (text.title.text.encode('utf-8', 'ignore').decode('utf-8')
                      if text.title is not None else '')
             title = ' '.join(pu.tokenize(title, pu.tokenize_pattern))
             body = text.body.text.encode('utf-8', 'ignore').decode('utf-8')
             body = ' '.join(pu.tokenize(body, pu.tokenize_pattern))
             array.append((topic, '{}, {}'.format(title, body)))
     docarr = du.make_docarr([(idx, topic, body) for idx, (topic, body) in enumerate(array)])
     print(len(docarr))
     print(Counter([d.topic for d in docarr]))
     print(len(sorted(set([d.topic for d in docarr]))))
     du.dump_docarr(self.temp_file, docarr)
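This first variant walks Reuters-21578-style SGML dumps with BeautifulSoup and keeps only articles tagged with exactly one topic. A rough, self-contained sketch of the same parsing step, with one inline record instead of the files read via iu.read_lines and a plain regex standing in for pu.tokenize (both stand-ins are assumptions, not the project's own helpers):

    import re
    from bs4 import BeautifulSoup

    # One inline record; the real method concatenates whole SGML files.
    record = (
        "<REUTERS>"
        "<TOPICS><D>earn</D></TOPICS>"
        "<TEXT><TITLE>Oil prices rise</TITLE>"
        "<BODY>Crude oil prices rose on Monday.</BODY></TEXT>"
        "</REUTERS>"
    )

    tree = BeautifulSoup(record, "html.parser")
    for article in tree.find_all("reuters"):
        topics = list(article.topics.children)
        if len(topics) != 1:  # keep only single-topic articles
            continue
        topic = topics[0].text
        text = article.find("text")
        title = text.title.text if text.title is not None else ""
        body = text.body.text
        # naive word tokenizer in place of pu.tokenize
        tokens = re.findall(r"[A-Za-z0-9']+", "{} {}".format(title, body))
        print(topic, " ".join(tokens))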
Example 2
 def filter_into_temp(self):
     json_list = iu.load_array(self.orgn_file)
     item_list = list()
     for i, o in enumerate(json_list):
         text = ' '.join(pu.tokenize(o['text'], pu.tokenize_pattern)[:1200])  # cap each document at its first 1200 tokens
         # text = ' '.join(pu.tokenize(o['text'], pu.tokenize_pattern)[:3000])
         # text = o['text']
         item_list.append((i, o['cluster'], text))
     docarr = du.make_docarr(item_list)
     du.dump_docarr(self.temp_file, docarr)
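The second variant reloads a JSON array in which each record carries a raw text and a cluster label, tokenizes the text, and keeps at most its first 1200 tokens. A minimal stand-alone sketch of that step, with inline records in place of iu.load_array and a whitespace regex in place of pu.tokenize (both assumptions):

    import re

    # Assumed record shape: one object per document with 'text' and 'cluster' keys.
    records = [
        {"cluster": 3, "text": "First document about storms and flooding."},
        {"cluster": 7, "text": "Second document about an election result."},
    ]

    item_list = []
    for i, o in enumerate(records):
        tokens = re.findall(r"\S+", o["text"])[:1200]  # keep at most the first 1200 tokens
        item_list.append((i, o["cluster"], " ".join(tokens)))

    print(item_list)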
Example 3
 def filter_into_temp(self):
     twarr = iu.load_array(self.orgn_file)
     outrows = list()
     for idx, tw in enumerate(twarr):
         if tw['relevance'] > 1:
             continue
         docid, topic, text = tw['tweetId'], tw['clusterNo'], tw['text']
         if len(' '.join(pu.tokenize(text, pu.tokenize_pattern))) <= 10:  # skip very short texts
             continue
         outrows.append([docid, topic, text])
     topics = Counter([r[1] for r in outrows])
     print('get {} rows'.format(len(outrows)))
     print('{} topics, {}'.format(len(topics), topics))
     du.dump_docarr(self.temp_file, du.make_docarr(outrows))
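The third variant filters a tweet collection: records judged irrelevant (relevance > 1) or whose tokenized text is 10 characters or shorter are dropped, and the surviving rows are counted per cluster. A self-contained sketch of that filter, again with inline records in place of iu.load_array and a whitespace regex in place of pu.tokenize (assumptions):

    import re
    from collections import Counter

    # Field names come from the snippet above; the records themselves are made up.
    twarr = [
        {"tweetId": 1, "clusterNo": 5, "relevance": 1, "text": "Flooding reported across the coast tonight"},
        {"tweetId": 2, "clusterNo": 5, "relevance": 2, "text": "unrelated chatter"},
        {"tweetId": 3, "clusterNo": 9, "relevance": 0, "text": "ok"},
    ]

    outrows = []
    for tw in twarr:
        if tw["relevance"] > 1:  # drop tweets judged irrelevant
            continue
        tokens = re.findall(r"\S+", tw["text"])
        if len(" ".join(tokens)) <= 10:  # drop very short texts
            continue
        outrows.append([tw["tweetId"], tw["clusterNo"], tw["text"]])

    topics = Counter(r[1] for r in outrows)
    print("get {} rows".format(len(outrows)))
    print("{} topics, {}".format(len(topics), topics))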