Code Example #1
File: main.py Project: locta66/TweetEventDetection
def main(args):
    input_base = getcfg().origin_path
    output_base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/positive/'
    import utils.timer_utils as tmu
    tmu.check_time()  # start the stopwatch
    parse_query_list(input_base, output_base, seed_queries, n_process=15)
    tmu.check_time()  # report seconds elapsed for the query pass
    return
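The paired tmu.check_time() calls above act as a named stopwatch: the first call records a timestamp, the second reports (and returns) the seconds elapsed since it. utils.timer_utils itself is not shown in these examples, so the following is only a minimal sketch of a helper with that interface, inferred from the call sites; the project's real module may differ.

import time

# Hypothetical sketch of a check_time-style stopwatch keyed by name.
_timestamps = {}

def check_time(name='default', print_func=print):
    # First call for `name` records a timestamp and returns None;
    # later calls report and return the seconds since the previous call.
    now = time.time()
    if name not in _timestamps:
        _timestamps[name] = now
        return None
    delta = now - _timestamps[name]
    _timestamps[name] = now
    if print_func is not None:  # pass print_func=None to silence output
        print_func(delta)
    return delta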
Code Example #2
File: main.py Project: leeyanghaha/my_merge
def main():
    input_base = '/home/nfs/cdong/tw/origin/'
    # output_base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/positive/'
    output_base = '/home/nfs/cdong/tw/seeding/NaturalDisaster/positive'
    import utils.timer_utils as tmu
    tmu.check_time()
    parse_query_list(input_base, output_base, seed_queries, process_num=20)
    tmu.check_time()
    return
Code Example #3
def refilter_twarr(in_file, out_file):
    twarr = fu.load_array(in_file)[:200000]
    origin_len = len(twarr)
    print(origin_len)
    clf_filter = ClassifierTerror()

    # for idx in range(len(twarr) - 1, -1, -1):
    #     text = twarr[idx][tk.key_text]
    #     if not pu.has_enough_alpha(text, 0.6):
    #         print(text)
    #         twarr.pop(idx)
    # text_filter_len = len(twarr)
    # print("delta by text =", origin_len - text_filter_len)

    tmu.check_time("refilter_twarr")
    twarr = clf_filter.filter(twarr, 0.2)
    tmu.check_time("refilter_twarr")
    print(len(twarr))
    fu.dump_array(out_file, twarr[:100000])
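The commented-out block references pu.has_enough_alpha(text, 0.6), which by name and usage keeps only tweets whose text is sufficiently alphabetic. A plausible implementation of such a check, assumed from the call site rather than taken from the project:

def has_enough_alpha(text, threshold):
    # Hypothetical helper: True when at least `threshold` of the
    # non-whitespace characters in `text` are alphabetic.
    chars = [c for c in text if not c.isspace()]
    if not chars:
        return False
    return sum(c.isalpha() for c in chars) / len(chars) >= threshold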
Code Example #4
def main():
    """
    启动各进程(池),遍历 _sub_files 中的文件名,逐个读取文件内容,
    每读取一个文件,输入过滤&聚类模块,并尝试从分类器读取返回结果后输入聚类模块;
    每过指定时间,向聚类模块发送聚类指令;
    随后尝试从聚类模块读取返回结果,并输入聚类信息提取模块
    :return:
    """
    tmu.check_time('qwertyui')
    tmu.check_time('main line 116', print_func=None)
    bflt.start_pool(flt_pool_size, ne_threshold, clf_threshold, event_type)
    bclu.start_pool(max_window_size, full_interval, alpha, beta)
    bext.start_pool(ext_pool_size, event_type)

    last_check_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('sleeping.......')
    time.sleep(time_interval)
    _twarr, new_time = dbu.read_after_last_check(dbu.tweet_db, dbu.lxp_dataset,
                                                 last_check_time)
    _twarr = fu.change_from_lxp_format(_twarr)
    while True:
        alarm = tmu.Alarm()
        print('main: {} tweets to filter.'.format(len(_twarr)))

        # _sub_files = fi.listchildren("/home/nfs/cdong/tw/origin/", fi.TYPE_FILE, concat=True)[-4000:]
        # positive_twarr = fu.load_array('/home/nfs/yangl/dc/calling/filtered_twarr.json')[:5000]
        # _sub_files = fi.listchildren("/home/nfs/yangl/dc/input", fi.TYPE_FILE, concat=True)
        # if (_idx + 1) % 1000 == 0:
        #     dt = tmu.check_time('main line 116', print_func=None)
        #     emu.send_email('file {}/{}'.format(_idx + 1, len(_sub_files)), '{}s from last 1000 file'.format(dt))
        # if _idx > 0 and _idx % 10 == 0:
        #     print("main: {} th twarr to filter, len: {}".format(_idx, len(_twarr)))
        tmu.check_time('start running')
        twarr2filter(_twarr)
        filter2cluster()
        if alarm.is_time_elapse_bigger_than(check_every_sec):
            alarm.initialize_timestamp()
            filter2cluster(5)
            bclu.execute_cluster()
        cluster2extractor()
        end_it()
        time_left = time_interval - tmu.check_time(
            'start running',
            print_func=lambda dt: print('time epoch {}s'.format(dt)))
        print('sleeping.......')
        time.sleep(time_left if time_left > 0 else 0)
        _twarr, new_time = dbu.read_after_last_check(dbu.tweet_db,
                                                     dbu.lxp_dataset, new_time)
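The tmu.Alarm object used in the loop exposes two calls: is_time_elapse_bigger_than(sec), which tests how long it has been since the last reset, and initialize_timestamp(), which resets the reference point. A minimal class with that interface might look like this (an inference from the call sites, not the library's actual code):

import time

class Alarm:
    # Hypothetical periodic-trigger helper matching the usage above.
    def __init__(self):
        self.initialize_timestamp()

    def initialize_timestamp(self):
        # Reset the reference point to now.
        self._last = time.time()

    def is_time_elapse_bigger_than(self, seconds):
        # True when at least `seconds` have passed since the last reset.
        return time.time() - self._last > seconds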
Code Example #5
File: my_main.py Project: leeyanghaha/my_merge
def main():
    """
    启动各进程(池),遍历 _sub_files 中的文件名,逐个读取文件内容,
    每读取一个文件,输入过滤&聚类模块,并尝试从分类器读取返回结果后输入聚类模块;
    每过指定时间,向聚类模块发送聚类指令;
    随后尝试从聚类模块读取返回结果,并输入聚类信息提取模块
    :return:
    """
    tmu.check_time('qwertyui')
    tmu.check_time('main line 116', print_func=None)

    bflt.start_pool(flt_pool_size, ne_threshold, clf_threshold, event_type)
    bclu.start_pool(max_window_size, full_interval, alpha, beta)
    bext.start_pool(ext_pool_size, event_type)

    alarm = tmu.Alarm()
    # _sub_files = fi.listchildren("/home/nfs/cdong/tw/origin/", fi.TYPE_FILE, concat=True)[-4000:]
    # positive_twarr = fu.load_array('/home/nfs/yangl/dc/calling/filtered_twarr.json')[:5000]
    # _sub_files = fi.listchildren("/home/nfs/yangl/merge/lxp_data", fi.TYPE_FILE, concat=True)
    # _twarr = fu.load_array(_sub_files[0])
    # _twarr = fu.change_from_lxp_format(_twarr)
    last_check_time = datetime.datetime.now()
    count = 0
    while True:
        time.sleep(5 * 60)
        _twarr, new_time = dbu.read_after_last_check(dbu.nd_db, dbu.nd,
                                                     last_check_time)
        print('************* len(_twarr) = {} **********'.format(len(_twarr)))
        if len(_twarr) == 0:
            print('no new data has arrived....')
            break
        if config.using_api_format == 'False':
            _twarr = fu.change_from_lxp_format(_twarr)
        # if (_idx + 1) % 1000 == 0:
        #     dt = tmu.check_time('main line 116', print_func=None)
        #     emu.send_email('file {}/{}'.format(_idx + 1, len(_sub_files)), '{}s from last 1000 file'.format(dt))
        # if _idx > 0 and _idx % 10 == 0:
        #     print("main: {} th twarr to filter, len: {}".format(_idx, len(_twarr)))
        print('main: {} tweets to filter'.format(len(_twarr)))
        count += len(_twarr)
        twarr2filter(_twarr)
        filter2cluster()
        if alarm.is_time_elapse_bigger_than(check_every_sec):
            alarm.initialize_timestamp()
            filter2cluster(5)
            bclu.execute_cluster()
            time.sleep(20)
        cluster2extractor()
        last_check_time = new_time
    print('waiting for the main process to finish......')
    time.sleep(600)
    end_it()
    tmu.check_time('qwertyui')
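dbu.read_after_last_check implements a watermark pattern: each poll returns only the records newer than the previous check, plus a new watermark for the next poll. Assuming a MongoDB-like store with a timestamp field (an assumption; the dbu module is not shown), the core of the pattern reduces to:

import datetime

def read_after_last_check(collection, last_check_time):
    # Hypothetical sketch: fetch only documents created after the
    # watermark, and hand back a new watermark for the next call.
    new_time = datetime.datetime.now()
    docs = list(collection.find({'created_at': {'$gt': last_check_time}}))
    return docs, new_time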
Code Example #6
def main():
    """
    启动各进程(池),遍历 _sub_files 中的文件名,逐个读取文件内容,
    每读取一个文件,输入过滤&聚类模块,并尝试从分类器读取返回结果后输入聚类模块;
    每过指定时间,向聚类模块发送聚类指令;
    随后尝试从聚类模块读取返回结果,并输入聚类信息提取模块
    :return:
    """
    tmu.check_time('qwertyui')
    tmu.check_time('main line 116', print_func=None)

    bflt.start_pool(flt_pool_size, ne_threshold, clf_threshold, event_type)
    bclu.start_pool(max_window_size, full_interval, alpha, beta)
    bext.start_pool(ext_pool_size, event_type)
    alarm = tmu.Alarm()
    # _sub_files = fi.listchildren("/home/nfs/cdong/tw/origin/", fi.TYPE_FILE, concat=True)[-4000:]
    _sub_files = fi.listchildren(
        "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive",
        fi.TYPE_FILE,
        concat=True)
    # _twarr = fu.load_array(_sub_files[0])
    # _twarr = fu.change_from_lxp_format(_twarr)
    for _idx, _file in enumerate(_sub_files):
        _twarr = fu.load_array(_file)
        if config.using_api_format == 'False':
            _twarr = fu.change_from_lxp_format(_twarr)
        if (_idx + 1) % 1000 == 0:
            dt = tmu.check_time('main line 116', print_func=None)
            emu.send_email('file {}/{}'.format(_idx + 1, len(_sub_files)),
                           '{}s for the last 1000 files'.format(dt))
        if _idx > 0 and _idx % 10 == 0:
            print("main: {} th twarr to filter, len: {}".format(
                _idx, len(_twarr)))
        print("{} th twarr to filter, len: {}".format(_idx, len(_twarr)))
        twarr2filter(_twarr)
        filter2cluster()
        if alarm.is_time_elapse_bigger_than(check_every_sec):
            alarm.initialize_timestamp()
            filter2cluster(5)
            bclu.execute_cluster()
            time.sleep(60)
        cluster2extractor()
    # time.sleep(300)
    end_it()
    tmu.check_time('qwertyui')
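Example #6 differs from #5 mainly in its input (files rather than the database) and in the progress e-mail sent every 1000 files through emu.send_email. The emu module is not shown; a bare-bones standard-library stand-in, with hypothetical addresses and host, could be:

import smtplib
from email.message import EmailMessage

def send_email(subject, body, sender='monitor@example.com',
               receiver='admin@example.com', host='localhost'):
    # Hypothetical stand-in for emu.send_email: one plain-text message
    # per call, relayed through a local SMTP server.
    msg = EmailMessage()
    msg['Subject'] = subject
    msg['From'] = sender
    msg['To'] = receiver
    msg.set_content(body)
    with smtplib.SMTP(host) as server:
        server.send_message(msg)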
Code Example #7
    # p_textarr = fu.read_lines(pos_text_file)
    # tokens_list = [pu.tokenize('[\_\w\-]{2,}', text) for text in p_textarr]
    # cnt = Counter(au.merge_array(tokens_list))
    # print(cnt.most_common(100))
    # exit()

    word_list = [
        'attack', 'bomb', 'bombing', 'kill', 'killed', 'explode', 'explosion',
        'terrorist', 'suicide'
    ]

    _nlp = su.get_nlp()
    vocab = _nlp.vocab
    matrix_pre = np.array([vocab.get_vector(w) for w in word_list])
    np.save('matrix_pre', matrix_pre)  # snapshot of the vectors before training

    tmu.check_time()
    _nlp = train_spacy_model(_nlp)
    tmu.check_time()
    # dump_nlp(trained_spacy_path, _nlp)
    # tmu.check_time()
    # test_nlp(trained_spacy_path)

    vocab = _nlp.vocab
    matrix_post = np.array([vocab.get_vector(w) for w in word_list])
    np.save('matrix_post', matrix_post)

    diff = matrix_pre - matrix_post
    diff = np.square(diff)
    print("diff:", np.sum(diff))
Code Example #8
    return featurearr, labelarr


def coef_of_lr_model(file):
    from sklearn.externals import joblib  # on recent scikit-learn, use "import joblib"
    lr_model = joblib.load(file)
    print(len(lr_model.coef_[0]))
    print(lr_model.coef_)


if __name__ == "__main__":
    from classifying.terror.data_maker import fasttext_train, fasttext_test, ft_data_pattern
    ft_model = "/home/nfs/cdong/tw/src/models/classify/terror/ft_no_gpe_model"
    lr_model = "/home/nfs/cdong/tw/src/models/classify/terror/lr_no_gpe_model"
    clf_model = lr_model
    tmu.check_time('all')
    tmu.check_time()

    # coef_of_lr_model("/home/nfs/cdong/tw/src/models/classify/terror/lr_no_gpe_model")
    # clf_filter = ClassifierAddFeature(None, None)
    # for file in fi.listchildren("/home/nfs/cdong/tw/seeding/Terrorist/queried/positive", concat=True):
    #     twarr = fu.load_array(file)
    #     print(file, clf_filter.predict_mean_proba(twarr))
    # tmu.check_time()
    # exit()

    batch_num = 20
    fi.mkdir(ft_data_pattern.format('matrices_no_add'), remove_previous=True)
    train_mtx_ptn = ft_data_pattern.format(
        'matrices_no_add/train_feature_mtx_{}.npy')
    train_lbl_ptn = ft_data_pattern.format(
Code Example #9
from calling.back_filter import filter_twarr_text
# from classifying.terror.classifier_terror import file2label_text_array
# textarr, labelarr = file2label_text_array("/home/nfs/cdong/tw/seeding/Terrorist/data/test")
pos_base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive"
neg_base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/negative"
pos_files, neg_files = fi.listchildren(pos_base, concat=True), fi.listchildren(neg_base, concat=True, pattern='2012')

base = "/home/nfs/yangl/event_detection/testdata/event2012/relevant"
pos_files = fi.listchildren(base, concat=True)
print(len(pos_files))
print(sum([len(fu.read_lines(f)) for f in pos_files]))
exit()

my_filter = EffectCheck()
pos_probarr, neg_probarr = list(), list()
tmu.check_time()
for file in neg_files:
    twarr = filter_twarr_text(fu.load_array(file))
    probarr = my_filter.predict_proba(twarr)
    neg_probarr.extend(probarr)
tmu.check_time()
for file in pos_files:
    probarr = my_filter.predict_proba(fu.load_array(file))
    pos_probarr.extend(probarr)
    # post_twarr = list()

    # for idx in range(len(probarr)):
    #     if probarr[idx] >= 0.35:
    #         post_twarr.append(twarr[idx])
    #     else:
    #         print(twarr[idx][tk.key_text])
Code Example #10
        # positive_twarr = fu.load_array('/home/nfs/yangl/dc/calling/filtered_twarr.json')[:5000]
        # _sub_files = fi.listchildren("/home/nfs/yangl/dc/input", fi.TYPE_FILE, concat=True)
        # if (_idx + 1) % 1000 == 0:
        #     dt = tmu.check_time('main line 116', print_func=None)
        #     emu.send_email('file {}/{}'.format(_idx + 1, len(_sub_files)), '{}s from last 1000 file'.format(dt))
        # if _idx > 0 and _idx % 10 == 0:
        #     print("main: {} th twarr to filter, len: {}".format(_idx, len(_twarr)))
        tmu.check_time('start running')
        twarr2filter(_twarr)
        filter2cluster()
        if alarm.is_time_elapse_bigger_than(check_every_sec):
            alarm.initialize_timestamp()
            filter2cluster(5)
            bclu.execute_cluster()
        cluster2extractor()
        end_it()
        time_left = time_interval - tmu.check_time(
            'start running',
            print_func=lambda dt: print('time epoch {}s'.format(dt)))
        print('sleeping.......')
        time.sleep(time_left if time_left > 0 else 0)
        _twarr, new_time = dbu.read_after_last_check(dbu.tweet_db,
                                                     dbu.lxp_dataset, new_time)
        # tmu.check_time('qwertyui')


if __name__ == '__main__':
    tmu.check_time()
    main()
    tmu.check_time(
        print_func=lambda dt: print("total time elapsed {}s".format(dt)))