def filter_into_temp(self):
    """ Parse the raw Reuters SGML files and keep only single-topic articles. """
    from bs4 import BeautifulSoup
    from collections import Counter
    files = iu.list_children(self.orgn_file, full_path=True)
    array = list()
    for fidx, file in enumerate(files):
        print(fidx)
        tree = BeautifulSoup(''.join(iu.read_lines(file)), "html.parser")
        for article in tree.find_all("reuters"):
            topics = list(article.topics.children)
            # keep only articles labeled with exactly one topic
            if len(topics) != 1:
                continue
            # strip non-ascii characters; note that str(s.encode(...)) would
            # yield a "b'...'" literal under Python 3, hence the explicit decode
            topic = topics[0].text.encode('ascii', 'ignore').decode('ascii')
            text = article.find('text')
            if text is None or text.body is None:
                continue
            title = text.title.text if text.title is not None else ''
            title = ' '.join(pu.tokenize(title, pu.tokenize_pattern))
            body = ' '.join(pu.tokenize(text.body.text, pu.tokenize_pattern))
            array.append((topic, '{}, {}'.format(title, body)))
    docarr = du.make_docarr([(idx, topic, body) for idx, (topic, body) in enumerate(array)])
    print(len(docarr))
    print(Counter([d.topic for d in docarr]))
    print(len(sorted(set([d.topic for d in docarr]))))
    du.dump_docarr(self.temp_file, docarr)
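# For reference, a minimal self-contained sketch of the SGML structure the
# parser above expects. The markup below is invented sample data, not taken
# from the corpus; "html.parser" lowercases tag names, which is why the
# uppercase <REUTERS>/<TOPICS>/<TEXT> tags are matched in lowercase.
from bs4 import BeautifulSoup

sample = """
<REUTERS><TOPICS><D>grain</D></TOPICS>
<TEXT><TITLE>SAMPLE TITLE</TITLE><BODY>Sample body text.</BODY></TEXT>
</REUTERS>"""
tree = BeautifulSoup(sample, "html.parser")
for article in tree.find_all("reuters"):
    print([t.text for t in article.topics.children])  # ['grain']
    text = article.find('text')
    print(text.title.text, '|', text.body.text)  # SAMPLE TITLE | Sample body text.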
def filter_into_temp(self):
    """ Load the raw json array and keep the first 1200 tokens of each text. """
    json_list = iu.load_array(self.orgn_file)
    item_list = list()
    for i, o in enumerate(json_list):
        # truncate long documents to their first 1200 tokens
        text = ' '.join(pu.tokenize(o['text'], pu.tokenize_pattern)[:1200])
        # text = ' '.join(pu.tokenize(o['text'], pu.tokenize_pattern)[:3000])
        # text = o['text']
        item_list.append((i, o['cluster'], text))
    docarr = du.make_docarr(item_list)
    du.dump_docarr(self.temp_file, docarr)
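# The helpers pu.tokenize and pu.tokenize_pattern are defined elsewhere; a
# plausible regex-based stand-in (purely an assumption for illustration, the
# real tokenizer may differ) shows the effect of the [:1200] truncation:
import re

tokenize_pattern = r"[A-Za-z0-9']+"  # assumed word pattern

def tokenize(text, pattern):
    # assumed behavior: lowercase, then split on the word pattern
    return re.findall(pattern, text.lower())

sample_text = "Soviet grain exports rose sharply in March, traders said."
print(' '.join(tokenize(sample_text, tokenize_pattern)[:1200]))
# -> soviet grain exports rose sharply in march traders said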
def filter_into_temp(self):
    """ Keep relevant tweets whose tokenized text exceeds 10 characters. """
    from collections import Counter
    twarr = iu.load_array(self.orgn_file)
    outrows = list()
    for tw in twarr:
        # skip tweets whose relevance score exceeds 1
        if tw['relevance'] > 1:
            continue
        docid, topic, text = tw['tweetId'], tw['clusterNo'], tw['text']
        # drop tweets whose tokenized text is too short to be informative
        if len(' '.join(pu.tokenize(text, pu.tokenize_pattern))) <= 10:
            continue
        outrows.append([docid, topic, text])
    topics = Counter([r[1] for r in outrows])
    print('get {} rows'.format(len(outrows)))
    print('{} topics, {}'.format(len(topics), topics))
    du.dump_docarr(self.temp_file, du.make_docarr(outrows))
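# All three variants share one output contract: (docid, topic, text) rows are
# wrapped by du.make_docarr and written to self.temp_file by du.dump_docarr.
# Those helpers live elsewhere; the sketch below is a hypothetical reading of
# that contract, assuming a plain record type and one JSON object per line
# (both assumptions, not the actual du implementation).
import json
from collections import namedtuple

Document = namedtuple('Document', ['docid', 'topic', 'text'])  # assumed record

def make_docarr(rows):
    # each row is a (docid, topic, text) triple, as produced above
    return [Document(*row) for row in rows]

def dump_docarr(path, docarr):
    with open(path, 'w', encoding='utf8') as f:
        for doc in docarr:
            f.write(json.dumps(doc._asdict()) + '\n')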