def filter_into_temp(self):
    """Project the original tweet array onto three fields and dump it.

    Loads the array stored at ``self.orgn_file``, keeps the ``tweetId``,
    ``clusterNo`` and ``textCleaned`` entries of every tweet (in that
    order), turns the rows into a docarr with ``du.make_docarr`` and
    writes the result to ``self.temp_file``.

    Raises:
        IndexError: if the loaded array is empty (the diagnostic print
            below inspects its first element).
        KeyError: if a tweet lacks one of the three fields.
    """
    wanted_fields = ('tweetId', 'clusterNo', 'textCleaned')
    tweets = iu.load_array(self.orgn_file)
    # Diagnostic output: number of tweets and the type of one element.
    print(len(tweets), type(tweets[0]))

    rows = []
    for tweet in tweets:
        # Each row stays a list, exactly as the docarr builder received before.
        rows.append([tweet[field] for field in wanted_fields])

    du.dump_docarr(self.temp_file, du.make_docarr(rows))
def filter_tw_from_file(file):
    """Load tweets from ``file`` and strip them down to a fixed field set.

    Args:
        file: location handed unchanged to ``iu.load_array``.

    Returns:
        A ``(profile, tweets)`` tuple. ``profile`` is a dict holding the
        selected user fields taken from the ``user`` entry of the LAST
        tweet in the file. ``tweets`` is a list with one reduced dict per
        input tweet: the selected top-level fields plus an ``entities``
        dict restricted to ``symbols`` and ``hashtags``.

    Raises:
        IndexError: if the loaded array is empty (there is no last tweet
            to read the profile from).
        KeyError: if a tweet, its ``entities`` or the last tweet's
            ``user`` lacks one of the selected fields.
    """
    tweet_fields = ('created_at', 'id_str', 'retweet_count', 'text')
    user_fields = (
        'followers_count', 'friends_count', 'statuses_count', 'time_zone',
        'verified', 'id_str', 'description', 'name',
    )
    entity_fields = ('symbols', 'hashtags')

    def slim_down(tweet):
        # Copy the top-level fields first, then attach the reduced entities.
        reduced = {field: tweet[field] for field in tweet_fields}
        entities = tweet['entities']
        reduced['entities'] = {field: entities[field] for field in entity_fields}
        return reduced

    tweets = iu.load_array(file)
    slim_tweets = [slim_down(tweet) for tweet in tweets]

    # The profile is read from the final tweet only.
    last_user = tweets[-1]['user']
    profile = {field: last_user[field] for field in user_fields}
    return profile, slim_tweets
def filter_into_temp(self):
    """Merge per-topic tweet files into a single docarr and dump it.

    ``self.orgn_file`` is listed with ``iu.list_children``; every child
    is loaded as a tweet array. The position of a file in that listing is
    used as the topic id of all tweets it contains. Each tweet becomes a
    ``(str(id), topic_id, text)`` tuple, where every ``#`` character has
    been removed from the text. The rows are converted with
    ``du.make_docarr`` and written to ``self.temp_file``.

    NOTE(review): topic ids follow whatever order ``iu.list_children``
    returns -- confirm that order is stable if ids must be reproducible.
    """
    child_paths = iu.list_children(self.orgn_file, full_path=True)
    # Load every file up front, before any row is built.
    tweets_per_topic = [iu.load_array(path) for path in child_paths]

    rows = []
    for topic_id, tweets in enumerate(tweets_per_topic):
        rows.extend(
            (str(tweet['id']), topic_id, tweet['text'].replace('#', ''))
            for tweet in tweets
        )

    du.dump_docarr(self.temp_file, du.make_docarr(rows))
def filter_into_temp(self, max_tokens=1200):
    """Tokenize, truncate and dump the documents of ``self.orgn_file``.

    Every object loaded from ``self.orgn_file`` is tokenized with
    ``pu.tokenize`` (using ``pu.tokenize_pattern``), cut to at most
    ``max_tokens`` tokens and re-joined with single spaces. The rows
    ``(index, cluster, text)`` -- where ``index`` is the position of the
    object in the loaded array -- are converted with ``du.make_docarr``
    and written to ``self.temp_file``.

    Args:
        max_tokens: upper bound on the number of tokens kept per
            document. Defaults to 1200, the value that used to be
            hard-coded. Pass ``None`` to keep every token.

    Raises:
        KeyError: if an object lacks the ``text`` or ``cluster`` entry.
    """
    documents = iu.load_array(self.orgn_file)
    rows = []
    for index, document in enumerate(documents):
        tokens = pu.tokenize(document['text'], pu.tokenize_pattern)
        # Slicing with None as the stop keeps the full token sequence.
        text = ' '.join(tokens[:max_tokens])
        rows.append((index, document['cluster'], text))
    du.dump_docarr(self.temp_file, du.make_docarr(rows))
def filter_into_temp(self):
    """Filter the original tweets by relevance and length, then dump them.

    A tweet from ``self.orgn_file`` is kept only when

    * its ``relevance`` value is not greater than 1, and
    * its text, after tokenizing with ``pu.tokenize`` and re-joining the
      tokens with single spaces, is longer than 10 characters.

    Kept tweets are stored as ``[tweetId, clusterNo, text]`` rows; the
    text written out is the ORIGINAL text, not the tokenized form. The
    number of rows and the per-topic counts are printed, and the rows
    are converted with ``du.make_docarr`` and written to
    ``self.temp_file``.
    """
    tweets = iu.load_array(self.orgn_file)
    kept_rows = []
    for tweet in tweets:
        if tweet['relevance'] > 1:
            continue
        docid, topic, text = tweet['tweetId'], tweet['clusterNo'], tweet['text']
        # The length check runs on the tokenized-and-rejoined string.
        rejoined = ' '.join(pu.tokenize(text, pu.tokenize_pattern))
        if len(rejoined) <= 10:
            continue
        kept_rows.append([docid, topic, text])

    topic_counts = Counter(row[1] for row in kept_rows)
    print('get {} rows'.format(len(kept_rows)))
    print('{} topics, {}'.format(len(topic_counts), topic_counts))
    du.dump_docarr(self.temp_file, du.make_docarr(kept_rows))