Example #1
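Loads the tweet array from the original file and builds a document array from the tweetId, clusterNo and textCleaned fields of each tweet, then dumps it to the temp file.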
def filter_into_temp(self):
    twarr = iu.load_array(self.orgn_file)
    print(len(twarr), type(twarr[0]))
    docarr = du.make_docarr(
        [[tw[k] for k in ('tweetId', 'clusterNo', 'textCleaned')]
         for tw in twarr])
    du.dump_docarr(self.temp_file, docarr)
Example #2
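Reads a tweet file and keeps only the desired tweet, entity and user fields; returns the user profile (taken from the last tweet) together with the slimmed-down tweet list.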
def filter_tw_from_file(file):
    desire_tw_keys = [
        'created_at',
        'id_str',
        'retweet_count',
        'text',
    ]
    desire_user_keys = [
        'followers_count',
        'friends_count',
        'statuses_count',
        'time_zone',
        'verified',
        'id_str',
        'description',
        'name',
    ]
    desire_ent_keys = [
        'symbols',
        'hashtags',
    ]
    twarr = iu.load_array(file)
    new_twarr = list()
    for tw in twarr:
        new_tw = {k: tw[k] for k in desire_tw_keys}
        new_tw['entities'] = {k: tw['entities'][k] for k in desire_ent_keys}
        new_twarr.append(new_tw)
    profile = {k: twarr[-1]['user'][k] for k in desire_user_keys}
    return profile, new_twarr
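A minimal usage sketch, assuming the input file is in whatever array format iu.load_array expects; the path below is hypothetical, not taken from the repository:

# hypothetical input path for illustration
profile, slim_twarr = filter_tw_from_file('/path/to/user_timeline.json')
print(profile['name'], profile['followers_count'], len(slim_twarr))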
Example #3
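Treats each child file of the original directory as one topic: every tweet becomes a document made of the tweet id, the topic index and the text with '#' characters removed.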
def filter_into_temp(self):
    file_list = iu.list_children(self.orgn_file, full_path=True)
    twarr_list = [iu.load_array(file) for file in file_list]
    doclist = list()
    for topic_id, twarr in enumerate(twarr_list):
        for tw in twarr:
            doclist.append((str(tw['id']), topic_id, tw['text'].replace('#', '')))
    docarr = du.make_docarr(doclist)
    du.dump_docarr(self.temp_file, docarr)
Example #4
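Turns a list of JSON objects into documents, truncating each text to its first 1200 tokens and using the object's cluster field as the topic label.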
def filter_into_temp(self):
    json_list = iu.load_array(self.orgn_file)
    item_list = list()
    for i, o in enumerate(json_list):
        text = ' '.join(pu.tokenize(o['text'], pu.tokenize_pattern)[:1200])
        # text = ' '.join(pu.tokenize(o['text'], pu.tokenize_pattern)[:3000])
        # text = o['text']
        item_list.append((i, o['cluster'], text))
    docarr = du.make_docarr(item_list)
    du.dump_docarr(self.temp_file, docarr)
Example #5
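Drops tweets whose relevance label is greater than 1 or whose tokenized text is very short, reports the row and topic counts, and dumps the remaining documents.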
def filter_into_temp(self):
    # requires: from collections import Counter
    twarr = iu.load_array(self.orgn_file)
    outrows = list()
    for tw in twarr:
        # skip tweets whose relevance label is greater than 1
        if tw['relevance'] > 1:
            continue
        docid, topic, text = tw['tweetId'], tw['clusterNo'], tw['text']
        # skip texts of 10 or fewer characters after tokenization
        if len(' '.join(pu.tokenize(text, pu.tokenize_pattern))) <= 10:
            continue
        outrows.append([docid, topic, text])
    topics = Counter([r[1] for r in outrows])
    print('get {} rows'.format(len(outrows)))
    print('{} topics, {}'.format(len(topics), topics))
    du.dump_docarr(self.temp_file, du.make_docarr(outrows))
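All filter_into_temp variants above share one skeleton: build a list of (document id, topic label, text) triples, wrap it with du.make_docarr, and write it out with du.dump_docarr. The sketch below only restates that skeleton under the same assumptions as the examples (the iu/du helpers and the orgn_file/temp_file attributes); the record fields 'label' and 'content' are hypothetical:

def filter_into_temp(self):
    records = iu.load_array(self.orgn_file)  # raw records from the original file
    rows = list()
    for idx, rec in enumerate(records):
        # 'label' and 'content' are placeholder field names for illustration
        rows.append((str(idx), rec['label'], rec['content']))
    du.dump_docarr(self.temp_file, du.make_docarr(rows))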