def exec_pre_test(test_data_path):
    subfiles = fi.listchildren(test_data_path, children_type='file')
    # file_list = fu.split_multi_format(
    #     [(test_data_path + file) for file in subfiles if file.endswith('.json')], process_num=6)
    # twarr_blocks = fu.multi_process(fi.summary_unzipped_tweets_multi,
    #                                 [(file_list_slice,) for file_list_slice in file_list])
    twarr_blocks = filter_twarr(
        [fu.load_array(test_data_path + file) for file in subfiles if file.endswith('.json')])
    twarr = au.merge_array(twarr_blocks)

    tu.start_ner_service(pool_size=16)
    tu.twarr_ner(twarr)
    tu.end_ner_service()

    all_ids = set(fu.load_array(test_data_path + 'test_ids_all.csv'))
    pos_ids = set(fu.load_array(test_data_path + 'test_ids_pos.csv'))
    non_pos_ids = all_ids.difference(pos_ids)

    pos_twarr = list()
    non_pos_twarr = list()
    for tw in twarr:
        twid = tw[tk.key_id]
        if twid in pos_ids:
            pos_twarr.append(tw)
        elif twid in non_pos_ids:
            non_pos_twarr.append(tw)

    fu.dump_array(getcfg().pos_data_file, pos_twarr)
    fu.dump_array(getcfg().non_pos_data_file, non_pos_twarr)
Example #2
def parse_query_list(from_path, into_path, query_list, n_process):
    from_path = fi.add_sep_if_needed(from_path)
    into_path = fi.add_sep_if_needed(into_path)
    all_sub_files = fi.listchildren(from_path, children_type=fi.TYPE_FILE, pattern='.sum$')
    tw_num_sum = 0
    for query in query_list:
        query = SeedQuery(*query)
        query_sub_files = [
            os.path.join(from_path, f) for f in all_sub_files
            if query.is_time_desired(
                tw_ymd=query.time_of_tweet(f, source='filename'))
        ]
        print('{} files from {} to {}'.format(
            len(query_sub_files),
            os.path.basename(query_sub_files[0]),
            os.path.basename(query_sub_files[-1]),
        ))
        twarr = query_from_files_multi(query_sub_files, query, n_process)
        tw_num_sum += len(twarr)
        file_name = query.to_string() + '.json'
        if len(twarr) > 20:
            print('file {} written\n'.format(file_name))
            fu.dump_array(os.path.join(into_path, file_name), twarr)
        else:
            print('twarr not long enough')
        for tw in twarr:
            print(tw[tk.key_text], '\n')
    print('total tweet number: {}'.format(tw_num_sum))
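
SeedQuery.time_of_tweet(f, source='filename') and is_time_desired are not shown in these examples; the '.sum' files they inspect are written by summary_files_in_path below with underscore-joined year/month/day/hour digits in the name. A hedged, hypothetical sketch of that kind of filename-based date filtering (the helper names and window semantics are assumptions, not the repository's SeedQuery):

import re
from datetime import date

def ymd_of_sum_filename(file_name):
    # Hypothetical helper: recover (year, month, day) from a summary file
    # named like '2016_11_26_01.sum'.
    y, m, d = (int(part) for part in re.findall(r'\d+', file_name)[:3])
    return date(y, m, d)

def is_time_desired_sketch(tw_ymd, since, until):
    # Hypothetical stand-in for SeedQuery.is_time_desired: keep files whose
    # date falls inside the query's [since, until] window.
    return since <= tw_ymd <= until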
Example #3
 def dump_dict(self, file_name):
     self.reset_id()
     for word in self.vocabulary():
         if type(word) is not str:
             self.drop_word(word)
     word_id_freq_arr = [(word.strip(), int(self.word2id(word)),
                          int(self.freq_of_word(word)))
                         for word in sorted(self.vocabulary())]
     fu.dump_array(file_name, word_id_freq_arr)
Example #4
 def test(self, test_file):
     textarr, labelarr = file2label_text_array(test_file)
     featurearr = self.textarr2featurearr(textarr)
     probarr = self.predict_proba(featurearr)
     au.precision_recall_threshold(
         labelarr,
         probarr,
         file="performance.csv",
         thres_range=[i / 100 for i in range(1, 10)] +
         [i / 20 for i in range(2, 20)])
     fu.dump_array("result.json", (labelarr, probarr))
Example #5
 def dump_dict(self, file_name):
     self.reset_id()
     for word in self.vocabulary():
         if type(word) is not str:
             self.drop_word(word)
     fu.dump_array(file_name, [
         K_DELIMITER.join(['{}'] * 3).format(word.strip(),
                                             int(self.freq_of_word(word)),
                                             int(self.word2id(word)))
         for word in sorted(self.vocabulary()) if type(word) is str
     ])
Example #6
 def test(self, test_file):
     textarr, labelarr = file2label_text_array(test_file)
     """"""
     # docarr = su.textarr_nlp(textarr, self.get_nlp())
     # featurearr = self.textarr2featurearr(textarr, docarr)
     featurearr = self.textarr2featurearr_no_gpe(textarr)
     """"""
     probarr = self.predict_proba(featurearr)
     au.precision_recall_threshold(labelarr, probarr,
         thres_range=[i / 100 for i in range(1, 10)] + [i / 20 for i in range(2, 20)])
     fu.dump_array("result.json", (labelarr, probarr))
Example #7
def merge_events_2016():
    base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/'
    subs = fi.listchildren(base, fi.TYPE_FILE)
    twarr_list = []
    for sub in subs:
        twarr = fu.load_array(base + sub)
        # twarr = tu.twarr_ner(twarr)
        # twarr = ark.twarr_ark(twarr)
        twarr_list.append(twarr)
    fu.dump_array('/home/nfs/cdong/tw/seeding/Terrorist/queried/event2016.txt',
                  twarr_list)
Example #8
def extract_bad_tweets_into(files, output_file):
    total_tw_num = 0
    neg_twarr = list()
    for file in files:
        twarr = fu.load_array(file)
        total_tw_num += len(twarr)
        for tw in twarr:
            text = tw[tk.key_text]
            if len(text) < 20 or not pu.has_enough_alpha(text, 0.6):
                neg_twarr.append(tw)
    fu.dump_array(output_file, neg_twarr)
    return len(neg_twarr), total_tw_num
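
pu.has_enough_alpha(text, 0.6) is only referenced above; a minimal sketch of the kind of ratio check it suggests, under the assumption that it tests the fraction of alphabetic characters in the text (this is not the repository's implementation):

def has_enough_alpha_sketch(text, ratio):
    # Assumed semantics: accept a text when at least `ratio` of its
    # non-whitespace characters are alphabetic.
    chars = [c for c in text if not c.isspace()]
    if not chars:
        return False
    return sum(c.isalpha() for c in chars) / len(chars) >= ratio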
Example #9
def summary_files_in_path_into_blocks(from_path, into_path, file_name):
    from_path = fi.add_sep_if_needed(from_path)
    sub_files = fi.listchildren(from_path, children_type=fi.TYPE_FILE, pattern='.json$')
    into_file = fi.add_sep_if_needed(into_path) + file_name
    twarr_block = list()
    for idx, file in enumerate(sub_files):
        from_file = from_path + file
        twarr = fu.load_array_catch(from_file)
        if len(twarr) <= 0:
            continue
        twarr = tflt.filter_twarr(twarr, tflt.FILTER_LEVEL_HIGH)
        twarr_block.append(twarr)
    print(sorted([('id'+str(idx), len(twarr)) for idx, twarr in enumerate(twarr_block)], key=lambda x: x[1]))
    print('event number in total: {}'.format(len(twarr_block)))
    fu.dump_array(into_file, twarr_block)
Example #10
def summary_files_in_path(from_path, into_path=None):
    """ Read all .json under file_path, extract tweets from them into a file under summary_path. """
    # [-13:] -> hour, [-13:-3] -> day, [-13:-5] -> month; "ymdh" is short for "year-month-day-hour"
    from_path = fi.add_sep_if_needed(from_path)
    file_ymdh_arr = pu.split_digit_arr(fi.get_parent_path(from_path)[-13:])
    if not is_target_ymdh(file_ymdh_arr):
        return
    
    into_file = '{}{}'.format(fi.add_sep_if_needed(into_path), '_'.join(file_ymdh_arr) + '.sum')
    fi.remove_file(into_file)
    subfiles = fi.listchildren(from_path, children_type=fi.TYPE_FILE)
    file_block = mu.split_multi_format([(from_path + subfile) for subfile in subfiles], process_num=20)
    twarr_blocks = mu.multi_process(sum_files, [(file_list, tflt.FILTER_LEVEL_LOW) for file_list in file_block])
    twarr = au.merge_array(twarr_blocks)
    if twarr:
        fu.dump_array(into_file, twarr, overwrite=True)
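
summary_files_in_path processes one hourly directory at a time (the ymdh is recovered from the directory name). A hedged usage sketch that drives it over a tree of such directories with the standard library; the summary output path and the hourly directory layout (names like 2016_11_26_01) are assumptions, only the origin root appears elsewhere in these examples:

import os

_origin_root = '/home/nfs/cdong/tw/origin/'      # appears in a later example
_summary_root = '/home/nfs/cdong/tw/summary/'    # assumed output location
for _dir_name in sorted(os.listdir(_origin_root)):
    _hour_dir = os.path.join(_origin_root, _dir_name)
    if os.path.isdir(_hour_dir):
        summary_files_in_path(_hour_dir, into_path=_summary_root)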
Example #11
 def test(self, test_file):
     """
     给定带标记和文本的文件,读取其中的文本-标记对,调用向量化接口以及分类器,评估分类器在测试集上的性能
     :param test_file: str,测试用文本文件的路径
     :return:
     """
     textarr, labelarr = file2label_text_array(test_file)
     """"""
     # docarr = su.textarr_nlp(textarr, self.get_nlp())
     # featurearr = self.textarr2featurearr(textarr, docarr)
     featurearr = self.textarr2featurearr_no_gpe(textarr)
     """"""
     probarr = self.predict_proba(featurearr)
     au.precision_recall_threshold(
         labelarr,
         probarr,
         thres_range=[i / 100 for i in range(1, 10)] +
         [i / 20 for i in range(2, 20)])
     fu.dump_array("result.json", (labelarr, probarr))
Example #12
def refilter_twarr(in_file, out_file):
    twarr = fu.load_array(in_file)[:200000]
    origin_len = len(twarr)
    print(origin_len)
    clf_filter = ClassifierTerror()

    # for idx in range(len(twarr) - 1, -1, -1):
    #     text = twarr[idx][tk.key_text]
    #     if not pu.has_enough_alpha(text, 0.6):
    #         print(text)
    #         twarr.pop(idx)
    # text_filter_len = len(twarr)
    # print("delta by text =", origin_len - text_filter_len)

    tmu.check_time("refilter_twarr")
    twarr = clf_filter.filter(twarr, 0.2)
    tmu.check_time("refilter_twarr")
    print(len(twarr))
    fu.dump_array(out_file, twarr[:100000])
Example #13
 def make_tw_batches(self, batch_size):
     ordered_twarr = self.order_twarr_through_time()
     tw_batches = split_array_into_batches(ordered_twarr, batch_size)
     self.twarr_info(au.merge_array(tw_batches))
     fu.dump_array(self.labelled_batch_file, tw_batches)
    exit()
    """ 文本数量小于30时关键词的质量已经相当低,应尽量使进入的文本数量大于一定阈值 """
    """ __main__里面的内容保持不变,是最终的接口形式 """
    _keyword_file = 'keyword_results.json'

    _file_name_keywords_list = fu.load_array(_keyword_file)

    # for filename, keyword in _file_name_keywords_list:
    #     print(filename)
    #     print(filter_keywords(keyword, 20), '\n')
    exit()

    _base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive/"
    _files = fi.listchildren(_base, fi.TYPE_FILE, concat=True)
    _twarr_list = [fu.load_array(file) for file in _files]
    tmu.check_time()
    _file_name_list = [fi.get_name(file) for file in _files]
    _keywords_list = autophrase_multi(_twarr_list,
                                      process_num=8)  # mainly to verify that the output is correct and that multiprocessing gives a speedup
    tmu.check_time()
    _res = list(zip(_file_name_list, _keywords_list))
    fu.dump_array(_keyword_file, _res)
    # _keywords_list = fu.load_array(_keyword_file)
    # print(len(_keywords_list))
    # print([len(_keywords) for _keywords in _keywords_list])
    # assert len(_twarr_list) == len(_keywords_list)
    # for _idx, _keywords in enumerate(_keywords_list):
    #     # print('word num', len(_keywords), ', tw num', len(_twarr_list[idx]))
    #     if len(_keywords) < 20:
    #         print(len(_twarr_list[_idx]))
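
The first comment in this block notes that keyword quality is already poor when a twarr holds fewer than 30 texts. A hedged sketch of enforcing that floor on the lists built above before calling autophrase_multi; the cutoff constant and the filtering step are assumptions, not part of the original script:

_MIN_TW_NUM = 30  # assumed cutoff, taken from the comment above
_kept_pairs = [(name, twarr) for name, twarr in zip(_file_name_list, _twarr_list)
               if len(twarr) >= _MIN_TW_NUM]
_file_name_list = [name for name, _ in _kept_pairs]
_twarr_list = [twarr for _, twarr in _kept_pairs]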
    def clusters_tfidf_similarity(self, file_name):
        import pandas as pd
        import utils.function_utils as fu
        # print('clusters_tfidf_similarity')
        # tmu.check_time()
        # """ construct tf vector for every cluster """
        # cluid_arr = sorted(self.cludict.keys())
        # valid_corpus_token_set = self.valid_corpus_token_set
        # cluid2vec = dict([(cluid, None) for cluid in cluid_arr])
        # for cluid, cluster in self.cludict.items():
        #     if cluster.twnum == 0:
        #         raise ValueError('cluster should have at least one document to make sense')
        #     clu_vec = np.array([])
        #     for k_type in TokenSet.KEY_LIST:
        #         valid_ifd = valid_corpus_token_set.get(k_type)
        #         vocab_size = valid_ifd.vocabulary_size()
        #         type_tf_vec = np.zeros([vocab_size])
        #         clu_ifd = cluster.token_set.get(k_type)
        #         for word, freq in clu_ifd.word_freq_enumerate():
        #             type_tf_vec[valid_ifd.word2id(word)] = freq
        #         clu_vec = np.concatenate([clu_vec, type_tf_vec])
        #     cluid2vec[cluid] = clu_vec
        # """ make idf """
        # vec_len = sum([valid_corpus_token_set.get(k_type).vocabulary_size() for k_type in TokenSet.KEY_LIST])
        # print('vector length sum({})={}'.format(
        #     [valid_corpus_token_set.get(k_type).vocabulary_size() for k_type in TokenSet.KEY_LIST], vec_len))
        # d = len(cluid2vec)
        # for i in range(vec_len):
        #     df = 1
        #     for clu_vec in cluid2vec.values():
        #         if clu_vec[i] > 0:
        #             df += 1
        #     idf = np.log(d / df)
        #     for clu_vec in cluid2vec.values():
        #         clu_vec[i] *= idf
        # # tmu.check_time(print_func=lambda dt: print('construct tf-idf vector dt={}'.format(dt)))
        #
        # """ cosine similarity matrix """
        # cosine_matrix = au.cosine_matrix_multi([cluid2vec[cluid].reshape([-1]) for cluid in cluid_arr], process_num=16)
        # sim_matrix = pd.DataFrame(index=cluid_arr, columns=cluid_arr, data=0.0, dtype=np.float32)
        # for i in range(len(cluid_arr)):
        #     cluidi = cluid_arr[i]
        #     for j in range(i + 1, len(cluid_arr)):
        #         cluidj = cluid_arr[j]
        #         cos_sim = au.cosine_similarity(cluid2vec[cluidi], cluid2vec[cluidj])
        #         sim_matrix.loc[cluidi, cluidj] = sim_matrix.loc[cluidj, cluidi] = cos_sim
        # tmu.check_time(print_func=lambda dt: print('cosine similarity single dt={}'.format(dt)))

        # TODO for each cluster, get the four similarities with other clusters,
        # use them as features to make classification whether two clusters are of same label
        """ one matrix per type """
        type2vecarr = dict([(k_type, None) for k_type in TokenSet.KEY_LIST])
        cluid_arr = sorted(self.cludict.keys())
        valid_corpus_token_set = self.valid_corpus_token_set
        for k_type in TokenSet.KEY_LIST:
            valid_ifd = valid_corpus_token_set.get(k_type)
            vec_len = valid_ifd.vocabulary_size()
            for cluid in cluid_arr:
                clu_vec = np.zeros([vec_len])
                clu_ifd = self.cludict[cluid].token_set.get(k_type)
                for word, freq in clu_ifd.word_freq_enumerate():
                    clu_vec[valid_ifd.word2id(word)] = freq
                type2vecarr[k_type] = np.concatenate([type2vecarr[k_type], clu_vec.reshape([1, -1])]) \
                    if type2vecarr[k_type] is not None else clu_vec.reshape([1, -1])
            print(k_type, type2vecarr[k_type].shape)
        """ a matrix per type """
        w_dict = {
            su.pos_prop: 0.4,
            su.pos_comm: 0.3,
            su.pos_verb: 0.2,
            su.pos_hstg: 0.1
        }
        cosine_matrix = np.zeros([len(cluid_arr), len(cluid_arr)])
        for k_type in TokenSet.KEY_LIST:
            if 0 in type2vecarr[k_type].shape:
                continue
            cosmtx = au.cosine_similarity(
                [vec.reshape([-1]) for vec in type2vecarr[k_type]],
                process_num=16)
            cosine_matrix += cosmtx * w_dict[k_type]
        """ ###  ### """
        """    ||    """
        """    __    """
        sim_matrix = pd.DataFrame(index=cluid_arr,
                                  columns=cluid_arr,
                                  data=cosine_matrix,
                                  dtype=np.float32)
        # tmu.check_time(print_func=lambda dt: print('cosine similarity multiple dt={}'.format(dt)))
        """ for each cluster, find top k similar clusters """
        top_k = 3
        cluid2topsim = dict()
        for cluid, row in sim_matrix.iterrows():
            top_sim_cluids = row.index[np.argsort(row.values)[::-1][:top_k]]
            cluid2topsim[cluid] = {
                'cluidarr': top_sim_cluids,
                'scorearr': row[top_sim_cluids].tolist()
            }
        # tmu.check_time(print_func=lambda dt: print('find top 5 similar dt={}'.format(dt)))
        """ find representative label for every cluster """
        cluid2label = dict()
        rep_score = 0.7
        df = cs.cluid_label_table([int(i) for i in self.label],
                                  [int(i) for i in self.z])
        for cluid, row in df.iterrows():
            clu_twnum = sum(row.values)
            assert clu_twnum == self.cludict[cluid].twnum
            rep_label = int(row.index[np.argmax(row.values)])
            rep_twnum = row[rep_label]
            if rep_twnum == 0 or rep_twnum < clu_twnum * rep_score:
                cluid2label[cluid] = -1
            else:
                cluid2label[cluid] = rep_label
        # tmu.check_time(print_func=lambda dt: print('find representative label dt={}'.format(dt)))
        """ verify top sim. and rep. label """
        assert len(set(cluid2topsim.keys()).difference(set(cluid_arr))) == 0
        assert len(set(cluid2label.keys()).difference(set(cluid_arr))) == 0
        sim_info = list()
        for cluid in cluid_arr:
            clu_replb = cluid2label[cluid]
            clu_twnum = self.cludict[cluid].twnum
            sim_cluid_arr = cluid2topsim[cluid]['cluidarr']
            sim_score_arr = cluid2topsim[cluid]['scorearr']
            top_sim_cluid = sim_cluid_arr[0]
            top_sim_replb = cluid2label[top_sim_cluid]
            top_sim_twnum = self.cludict[top_sim_cluid].twnum
            top_sim_score = sim_score_arr[0]
            if top_sim_score < 0.4 or cluid >= top_sim_cluid:
                continue
            # print('\ncid {}, lb [{}], twnum {}'.format(cluid, clu_replb, clu_twnum))
            info = 'lb {:3} tw {:3}  <->  lb {:3} tw {:3}, score {}'.format(
                clu_replb, clu_twnum, top_sim_replb, top_sim_twnum,
                round(top_sim_score, 2))
            sim_info.append(info)
            # for idx in range(top_k):
            #     sim_cluid = sim_cluid_arr[idx]
            #     if sim_cluid <= cluid:
            #         continue
            #     sim_clu_twnum = self.cludict[sim_cluid].twnum
            #     print('    cid {:4}, lb [{:3}], score {}, twnum {}'.format(
            #         sim_cluid, cluid2label[sim_cluid], round(sim_score_arr[idx], 2), sim_clu_twnum))
        fu.dump_array(file_name, sim_info, False)
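
The weighted combination above builds one cosine-similarity matrix per token type and sums them with the weights in w_dict. A self-contained numpy sketch of that combination step on raw count matrices (this is not the repository's au.cosine_similarity; the function and argument names here are illustrative):

import numpy as np

def weighted_cosine_matrix(type2mat, type2weight):
    # type2mat maps token type -> (n_clusters x vocab_size) count matrix;
    # type2weight maps token type -> its weight in the combined matrix.
    combined = None
    for k_type, mat in type2mat.items():
        norms = np.linalg.norm(mat, axis=1, keepdims=True)
        norms[norms == 0] = 1.0                 # guard against all-zero rows
        unit = mat / norms
        cos = unit @ unit.T                     # pairwise cosine similarities
        weighted = cos * type2weight[k_type]
        combined = weighted if combined is None else combined + weighted
    return combined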
     probarr = my_filter.predict_proba(fu.load_array(file))
     pos_probarr.extend(probarr)
     # post_twarr = list()
 
     # for idx in range(len(probarr)):
     #     if probarr[idx] >= 0.35:
     #         post_twarr.append(twarr[idx])
     #     else:
     #         print(twarr[idx][tk.key_text])
     # post_twarr = [tw for idx, tw in enumerate(twarr) if probarr[idx] >= 0.4]
     # post_total_len += len(post_twarr)
     # print(len(post_twarr) / len(twarr), '\n\n\n')
 tmu.check_time()
 lblarr = [1 for _ in range(len(pos_probarr))] + [0 for _ in range(len(neg_probarr))]
 prbarr = pos_probarr + neg_probarr
 fu.dump_array("prb_lbl_arr.txt", (lblarr, prbarr))
 lblarr, prbarr = fu.load_array("prb_lbl_arr.txt")
 au.precision_recall_threshold(lblarr, prbarr)
 # print('total portion = {} / {} = {}'.format(post_total_len, pre_total_len, post_total_len / pre_total_len))
 tmu.check_time()
 exit()
 
 sub_files = fi.listchildren('/home/nfs/cdong/tw/origin/', fi.TYPE_FILE, concat=True)[18:19]
 twarr = au.merge_array([fu.load_array(file) for file in sub_files])
 print(len(twarr))
 tmu.check_time(print_func=None)
 for idx, tw in enumerate(twarr[14000:15000]):
     if (idx + 1) % 1000 == 0:
         print(idx)
     try:
         my_filter.get_features(tw)
     except Exception as e:
         # assumed handling; the original except clause is not shown in this snippet
         print(idx, e)
Example #17
 def dump_cluidarr(self, cluidarr):
     fu.dump_array(self.filtered_cluidarr_file, cluidarr)