Code Example #1
def make_train_test():
    p_file = ft_data_pattern.format("pos_2016.txt")
    n_bad_files = fi.listchildren(ft_data_pattern.format(''),
                                  fi.TYPE_FILE,
                                  concat=True,
                                  pattern='2016_bad')
    n_2017_files = fi.listchildren(ft_data_pattern.format(''),
                                   fi.TYPE_FILE,
                                   concat=True,
                                   pattern='2017')
    # n_2012_fulls = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2012_full')[:12]
    n_2012_fulls = fi.listchildren(ft_data_pattern.format(''),
                                   fi.TYPE_FILE,
                                   concat=True,
                                   pattern='2012_full')
    n_2016_files = fi.listchildren(ft_data_pattern.format(''),
                                   fi.TYPE_FILE,
                                   concat=True,
                                   pattern='2016_queried')
    print(len(n_bad_files), len(n_2017_files), len(n_2012_fulls),
          len(n_2016_files))

    n_files = n_bad_files + n_2017_files + n_2012_fulls + n_2016_files

    p_txtarr = fu.read_lines(p_file)
    p_prefix_txtarr = prefix_textarr(label_t, p_txtarr)
    n_txtarr_blocks = [fu.read_lines(file) for file in n_files]
    n_prefix_txtarr_blocks = [
        prefix_textarr(label_f, txtarr) for txtarr in n_txtarr_blocks
    ]

    train_test = list()
    bad = len(n_bad_files)
    bad_blocks, n_blocks = n_prefix_txtarr_blocks[:bad], n_prefix_txtarr_blocks[bad:]
    train_test.append(split_train_test(p_prefix_txtarr))
    train_test.extend([split_train_test(block) for block in n_blocks])
    print("len(train_test)", len(train_test))
    train_list, test_list = zip(*train_test)
    train_list = list(train_list) + bad_blocks

    train_txtarr = au.merge_array(train_list)
    test_txtarr = au.merge_array(test_list)
    fu.write_lines(fasttext_train, train_txtarr)
    fu.write_lines(fasttext_test, test_txtarr)
    print("len(train_list)", len(train_list), "len(train_txtarr)",
          len(train_txtarr), "len(test_txtarr)", len(test_txtarr))
Code Example #2
File: summarization.py Project: leeyanghaha/my_merge
def get_semantic_tokens_multi(file_path):
    pos_type_info = {
        ark.prop_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_prop_dict_file},
        ark.comm_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_comm_dict_file},
        ark.verb_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_verb_dict_file},
        ark.hstg_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_hstg_dict_file},
    }
    total_doc_num = 0
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 40)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_semantic_tokens, [(file_list,) for file_list in file_list_block])
    for res_type_info, doc_num in res_list:
        total_doc_num += doc_num
        for label in res_type_info.keys():
            pos_type_info[label][K_IFD].merge_freq_from(res_type_info[label][K_IFD])
    print('total_doc_num', total_doc_num)
    for label in pos_type_info.keys():
        ifd, file_name = pos_type_info[label][K_IFD], pos_type_info[label][K_FILE]
        ifd.drop_words_by_condition(3)
        if label != ark.hstg_label:
            ifd.drop_words_by_condition(lambda word, _: word.startswith('#'))
        ifd.dump_dict(file_name)
        print('{}; vocab size:{}'.format(file_name, ifd.vocabulary_size()))
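Examples #2 and #9 share the same fan-out idiom: cut the file list into blocks with mu.split_multi_format, run a worker over each block via mu.multi_process, then merge the per-block results. Those helpers live in the project's multiprocess utilities, which are not shown here; the snippet below is only a rough sketch of the same pattern using the standard library, under the assumption that each helper wraps an ordinary process pool.

# Rough sketch (assumption): split/fan-out/merge with multiprocessing, mirroring the calls above.
from multiprocessing import Pool

def split_multi_format(items, process_num):
    # Deal the items into at most process_num blocks, round-robin.
    blocks = [[] for _ in range(process_num)]
    for i, item in enumerate(items):
        blocks[i % process_num].append(item)
    return [block for block in blocks if block]

def multi_process(func, args_list):
    # Run func(*args) for every argument tuple in args_list, in parallel, preserving order.
    with Pool(processes=len(args_list)) as pool:
        async_results = [pool.apply_async(func, args) for args in args_list]
        return [res.get() for res in async_results]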
Code Example #3
def parse_query_list(from_path, into_path, query_list, n_process):
    from_path = fi.add_sep_if_needed(from_path)
    into_path = fi.add_sep_if_needed(into_path)
    all_sub_files = fi.listchildren(from_path, children_type=fi.TYPE_FILE, pattern='.sum$')
    tw_num_sum = 0
    for query in query_list:
        query = SeedQuery(*query)
        query_sub_files = [
            os.path.join(from_path, f) for f in all_sub_files
            if query.is_time_desired(
                tw_ymd=query.time_of_tweet(f, source='filename'))
        ]
        print('{} files from {} to {}'.format(
            len(query_sub_files),
            query_sub_files[0][query_sub_files[0].rfind('/') + 1:],
            query_sub_files[-1][query_sub_files[-1].rfind('/') + 1:],
        ))
        twarr = query_from_files_multi(query_sub_files, query, n_process)
        tw_num_sum += len(twarr)
        file_name = query.to_string() + '.json'
        if len(twarr) > 20:
            print('file {} written\n'.format(file_name))
            fu.dump_array(os.path.join(into_path, file_name), twarr)
        else:
            print('twarr not long enough')
        for tw in twarr:
            print(tw[tk.key_text], '\n')
    print('total tweet number: {}'.format(tw_num_sum))
Code Example #4
def exec_pre_test(test_data_path):
    subfiles = fi.listchildren(test_data_path, children_type='file')
    # file_list = fu.split_multi_format(
    #     [(test_data_path + file) for file in subfiles if file.endswith('.json')], process_num=6)
    # twarr_blocks = fu.multi_process(fi.summary_unzipped_tweets_multi,
    #                                 [(file_list_slice,) for file_list_slice in file_list])
    twarr_blocks = filter_twarr(
        [fu.load_array(file) for file in subfiles if file.endswith('.json')])
    twarr = au.merge_array(twarr_blocks)

    tu.start_ner_service(pool_size=16)
    tu.twarr_ner(twarr)
    tu.end_ner_service()

    all_ids = set(fu.load_array(test_data_path + 'test_ids_all.csv'))
    pos_ids = set(fu.load_array(test_data_path + 'test_ids_pos.csv'))
    non_pos_ids = all_ids.difference(pos_ids)

    pos_twarr = list()
    non_pos_twarr = list()
    for tw in twarr:
        twid = tw[tk.key_id]
        if twid in pos_ids:
            pos_twarr.append(tw)
        elif twid in non_pos_ids:
            non_pos_twarr.append(tw)

    fu.dump_array(getcfg().pos_data_file, pos_twarr)
    fu.dump_array(getcfg().non_pos_data_file, non_pos_twarr)
Code Example #5
File: main.py Project: locta66/TweetEventDetection
def main():
    bflt.start_pool(flt_pool_size, ne_threshold, clf_threshold)
    bclu.start_pool(hold_batch_num, batch_size, alpha, beta)
    # bext.start_pool(ext_pool_size)

    sub_files = fi.listchildren("/home/nfs/cdong/tw/origin/",
                                fi.TYPE_FILE,
                                concat=True)[-4000:]
    for _idx, _file in enumerate(sub_files):
        _twarr = fu.load_array(_file)
        print("1-- {} th twarr to filter, len: {}".format(_idx, len(_twarr)))
        twarr2filter(_twarr)
        # if _idx > 0 and (_idx + 1) % 1000 == 0:
        #     dt = tmu.check_time('if_idx>0and(_idx+1)%1000==0:', print_func=None)
        #     emu.send_email('notification', '{}/{} file, {}s from last 1000 file'.format(_idx+1, len(sub_files), dt))
        # if _idx % 50 == 0:
        #     tmu.check_time('_idx, _file', print_func=lambda dt: print("{} s from last 50".format(dt)))
        if _idx > 0 and _idx % 10 != 0:
            continue
        try_filter2cluster()

        # cluid_twarr_list = bclu.get_cluid_twarr_list()
        # print(len(cluid_twarr_list) if cluid_twarr_list else '--not ready')
        # if cluid_twarr_list:
        #     print(len(cluid_twarr_list))

    ensure_filter_workload()
Code Example #6
def merge_events_2016():
    base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/'
    subs = fi.listchildren(base, fi.TYPE_FILE)
    twarr_list = []
    for sub in subs:
        twarr = fu.load_array(base + sub)
        # twarr = tu.twarr_ner(twarr)
        # twarr = ark.twarr_ark(twarr)
        twarr_list.append(twarr)
    fu.dump_array('/home/nfs/cdong/tw/seeding/Terrorist/queried/event2016.txt',
                  twarr_list)
Code Example #7
def make_neg_event_bad_text_2016():
    files = fi.listchildren("/home/nfs/cdong/tw/origin/",
                            fi.TYPE_FILE,
                            concat=True)
    files_blocks = mu.split_multi_format(files, 4)
    output_file = neg_event_pattern.format("neg_2016_bad_text_{}.json")
    args_list = [(block, output_file.format(idx))
                 for idx, block in enumerate(files_blocks)]
    res_list = mu.multi_process(extract_bad_tweets_into, args_list)
    n_num_list, tw_num_list = zip(*res_list)
    total_n, total_tw = sum(n_num_list), sum(tw_num_list)
    print(n_num_list, tw_num_list, total_n, total_tw,
          round(total_n / total_tw, 6))
Code Example #8
def rename_files_2016():
    sub_files = fi.listchildren(event_2016_pattern.format(''), fi.TYPE_FILE)
    for origin in sub_files:
        splits = origin.split('_')
        date = splits[0]
        date_splits = date.split('-')
        date_splits_ = ['{:0>2}'.format(s) for s in date_splits]
        date_ = '-'.join(date_splits_)
        splits[0] = date_
        name_ = '_'.join(splits)
        from_name = event_2016_pattern.format(origin)
        to_name = event_2016_pattern.format(name_)
        os.popen('mv {} {}'.format(from_name, to_name)).close()
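The renaming above only zero-pads the date components at the front of each file name; '{:0>2}' pads a string to width 2 with leading zeros and leaves longer strings untouched, so a prefix such as 2016-3-7 becomes 2016-03-07. A quick check:

# '{:0>2}' left-pads to width 2, so single-digit month/day parts gain a leading zero.
parts = '2016-3-7'.split('-')
print('-'.join('{:0>2}'.format(s) for s in parts))  # prints 2016-03-07

os.rename(from_name, to_name) would perform the same move without shelling out to mv, though the original code above uses os.popen.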
Code Example #9
File: summarization.py Project: leeyanghaha/my_merge
def get_tokens_multi(file_path):
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 20)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_tokens, [(file_list,) for file_list in file_list_block])
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for ifd, doc_num in res_list:
        total_doc_num += doc_num
        id_freq_dict.merge_freq_from(ifd)
    print('total_doc_num', total_doc_num, 'total vocabulary_size', id_freq_dict.vocabulary_size())
    id_freq_dict.drop_words_by_condition(3)
    id_freq_dict.dump_dict(getcfg().post_dict_file)
Code Example #10
File: summarization.py Project: leeyanghaha/my_merge
def summary_files_in_path_into_blocks(from_path, into_path, file_name):
    from_path = fi.add_sep_if_needed(from_path)
    sub_files = fi.listchildren(from_path, children_type=fi.TYPE_FILE, pattern='.json$')
    into_file = fi.add_sep_if_needed(into_path) + file_name
    twarr_block = list()
    for idx, file in enumerate(sub_files):
        from_file = from_path + file
        twarr = fu.load_array_catch(from_file)
        if len(twarr) <= 0:
            continue
        twarr = tflt.filter_twarr(twarr, tflt.FILTER_LEVEL_HIGH)
        twarr_block.append(twarr)
    print(sorted([('id'+str(idx), len(twarr)) for idx, twarr in enumerate(twarr_block)], key=lambda x: x[1]))
    print('event number in total: {}'.format(len(twarr_block)))
    fu.dump_array(into_file, twarr_block)
Code Example #11
File: summarization.py Project: leeyanghaha/my_merge
def summary_files_in_path(from_path, into_path=None):
    """ Read all .json under file_path, extract tweets from them into a file under summary_path. """
    # [-13:]--hour [-13:-3]--day [-13:-5]--month,ymdh refers to the short of "year-month-date-hour"
    from_path = fi.add_sep_if_needed(from_path)
    file_ymdh_arr = pu.split_digit_arr(fi.get_parent_path(from_path)[-13:])
    if not is_target_ymdh(file_ymdh_arr):
        return
    
    into_file = '{}{}'.format(fi.add_sep_if_needed(into_path), '_'.join(file_ymdh_arr) + '.sum')
    fi.remove_file(into_file)
    subfiles = fi.listchildren(from_path, children_type=fi.TYPE_FILE)
    file_block = mu.split_multi_format([(from_path + subfile) for subfile in subfiles], process_num=20)
    twarr_blocks = mu.multi_process(sum_files, [(file_list, tflt.FILTER_LEVEL_LOW) for file_list in file_block])
    twarr = au.merge_array(twarr_blocks)
    if twarr:
        fu.dump_array(into_file, twarr, overwrite=True)
Code Example #12
def main():
    """
    Start each worker process (pool), iterate over the file names in _sub_files, and read each file's content in turn.
    For every file read, feed it to the filtering & clustering modules, and try to read results back from the
    classifier and pass them on to the clustering module. At a fixed interval, send a clustering command to the
    clustering module, then try to read the clustering results and feed them to the cluster-information extraction module.
    :return:
    """
    tmu.check_time('qwertyui')
    tmu.check_time('main line 116', print_func=None)

    bflt.start_pool(flt_pool_size, ne_threshold, clf_threshold, event_type)
    bclu.start_pool(max_window_size, full_interval, alpha, beta)
    bext.start_pool(ext_pool_size, event_type)
    alarm = tmu.Alarm()
    # _sub_files = fi.listchildren("/home/nfs/cdong/tw/origin/", fi.TYPE_FILE, concat=True)[-4000:]
    _sub_files = fi.listchildren(
        "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive",
        fi.TYPE_FILE,
        concat=True)
    # _twarr = fu.load_array(_sub_files[0])
    # _twarr = fu.change_from_lxp_format(_twarr)
    for _idx, _file in enumerate(_sub_files):
        _twarr = fu.load_array(_file)
        if config.using_api_format == 'False':
            _twarr = fu.change_from_lxp_format(_twarr)
        if (_idx + 1) % 1000 == 0:
            dt = tmu.check_time('main line 116', print_func=None)
            emu.send_email('file {}/{}'.format(_idx + 1, len(_sub_files)),
                           '{}s from last 1000 file'.format(dt))
        if _idx > 0 and _idx % 10 == 0:
            print("main: {} th twarr to filter, len: {}".format(
                _idx, len(_twarr)))
        print("{} th twarr to filter, len: {}".format(_idx, len(_twarr)))
        twarr2filter(_twarr)
        filter2cluster()
        if alarm.is_time_elapse_bigger_than(check_every_sec):
            alarm.initialize_timestamp()
            filter2cluster(5)
            bclu.execute_cluster()
            time.sleep(60)
        cluster2extractor()
    # time.sleep(300)
    end_it()
    tmu.check_time('qwertyui')
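main() throttles the periodic clustering command through tmu.Alarm, a small timer helper from the project's timer utilities that is not shown on this page. The class below is only a guess at a minimal equivalent, based solely on the two methods used above.

# Minimal sketch (assumption): an elapsed-time alarm exposing the two methods used in main().
import time

class Alarm:
    def __init__(self):
        self.initialize_timestamp()

    def initialize_timestamp(self):
        # Reset the reference point to "now".
        self.timestamp = time.time()

    def is_time_elapse_bigger_than(self, seconds):
        # True once more than `seconds` have elapsed since the last reset.
        return time.time() - self.timestamp > seconds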
Code Example #13
        cluster_file = fi.join(path, '{}_cluid:{}.json'.format(idx, cluid))
        fu.write_lines(cluster_file, [json_str])

        # textarr_file = fi.join(path, '{}_text.json'.format(idx))
        # textarr = [tw[tk.key_text] for tw in cic.twarr]
        # fu.write_lines(textarr_file, textarr)
    print('    bext: output into files over')


def cic_format(cic_list):
    cic_list = sorted(cic_list, key=lambda item: len(item.twarr), reverse=True)
    res = []
    print('    bext: output cic list, len={}'.format(len(cic_list)))
    for idx, cic in enumerate(cic_list):
        cic.twarr = ClusterInfoGetter.group_similar_tweets(cic.twarr,
                                                           process_num=10)
        od = cic.construct_od()
        res.append(od)
    return res


if __name__ == '__main__':
    dir = "/home/nfs/yangl/merge/lxp_data"
    # dir2 = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive"
    files = fi.listchildren(dir, concat=True)[:2]
    getter = ClusterInfoGetter(cci.event_t)
    for idx, file in enumerate(files):
        twarr = fu.load_array(file)
        twarr = fu.change_from_lxp_format(twarr)
        getter.cluid_twarr2cic(idx, twarr, 1)
Code Example #14

def merge_events_2016():
    base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/'
    subs = fi.listchildren(base, fi.TYPE_FILE)
    twarr_list = []
    for sub in subs:
        twarr = fu.load_array(base + sub)
        # twarr = tu.twarr_ner(twarr)
        # twarr = ark.twarr_ark(twarr)
        twarr_list.append(twarr)
    fu.dump_array('/home/nfs/cdong/tw/seeding/Terrorist/queried/event2016.txt',
                  twarr_list)


if __name__ == '__main__':
    # merge_events_2016()
    import utils.pattern_utils as pu
    base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/"
    files = fi.listchildren(base, fi.TYPE_FILE, concat=True)
    for file in files:
        twarr = fu.load_array(file)
        len_pre = len(twarr)
        for idx in range(len(twarr) - 1, -1, -1):
            text = twarr[idx][tk.key_text]
            if not pu.has_enough_alpha(text, 0.6):
                print(text)
                twarr.pop(idx)
        print(len_pre, '->', len(twarr), '\n\n')
        # fu.dump_array(file, twarr)
Code Example #15
    # if earliest_time is None:
    #     earliest_time = now
    # if latest_time is None:
    #     latest_time = now
    return earliest_time, latest_time


# if __name__ == '__main__':
#     test_case = u'I need a desk for tomorrow from 2pm to 3pm'
#     print(json.dumps(sutime.parse(test_case), sort_keys=True, indent=4))
if __name__ == '__main__':
    import utils.function_utils as fu
    import utils.file_iterator as fi

    base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/positive'
    pos_files = fi.listchildren(base, concat=True)
    for file in pos_files:
        twarr = fu.load_array(file)

        text_times, utc_time = get_text_time(twarr)
        earliest_time, latest_time = get_earlist_latest_post_time(twarr)

        print(earliest_time.isoformat())
        print(latest_time.isoformat())
        print(utc_time.isoformat())

        table = PrettyTable(["推测时间", "推文文本", "时间词", "推文创建时间", "utc_offset"])
        table.padding_width = 1
        for time in text_times:
            table.add_row(time)
        print(table)
Code Example #16
    extract_sub_process.set_input(END_PROCESS)
    extract_sub_process.get_output()


def input_cluid_twarr_list(cluid_twarr_list):
    """
    Feed the sub-process the list of clusters it should handle; do not wait for a result.
    :param cluid_twarr_list: list, each element a tuple;
        see clustering.gsdpmm.gsdpmm_stream_ifd_dynamic.GSDPMMStreamIFDDynamic#get_cluid_twarr_list
    :return:
    """
    if cluid_twarr_list:
        extract_sub_process.set_input(INPUT_LIST)
        extract_sub_process.set_input(cluid_twarr_list)


if __name__ == '__main__':
    import utils.timer_utils as tmu
    _base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive"
    files = fi.listchildren(_base, concat=True)
    _cluid_twarr_list = [(idx, fu.load_array(file)[:1000])
                         for idx, file in enumerate(files)]

    start_pool(10, 'terrorist_attack')
    tmu.check_time()
    for i in range(2):
        input_cluid_twarr_list(_cluid_twarr_list)
    end_pool()
    tmu.check_time()
    exit()
Code Example #17
File: matplot_utils.py Project: leeyanghaha/my_merge
    plt.plot(X, Y, color="blue", linewidth=1)
    plt.xlim([-0.03, 1.03])
    plt.ylim([-0.03, 1.03])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.title("roc curve")
    plt.legend(loc='lower right')
    plt.savefig(fig_name, format='png')


if __name__ == '__main__':
    import re
    import numpy as np
    from pathlib import Path
    import utils.file_iterator as fi
    files = fi.listchildren('/home/nfs/cdong/tw/testdata/output2', children_type=fi.TYPE_DIR, concat=True)
    ls = list()
    for f in files:
        p = Path(f)
        s = p.stat()
        digits = re.findall(r'\d+', p.name)
        cluid, clunum = list(map(int, digits))
        ls.append((cluid, clunum, s.st_mtime))
    
    ls = sorted(ls, key=lambda item: item[0])
    print(ls)
    dt = [(ls[0][0], ls[0][1], 0)] + [(ls[i][0], ls[i][1], int(ls[i][2] - ls[i - 1][2])) for i in range(1, len(ls))]
    print(dt)

    # x, y_cnum, y_dt = list(zip(*dt[:50]))
    x, y_cnum, y_dt = list(zip(*dt))
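The plotting code at the top of this example receives the false-positive and true-positive rates as ready-made X/Y arrays. Assuming binary labels and positive-class probabilities like those used elsewhere on this page, the rates can be computed with scikit-learn's roc_curve; plot_roc below is a hypothetical, self-contained version of the same plot, not the project's matplot_utils function.

# Self-contained ROC plot: compute FPR/TPR with scikit-learn, then draw the curve as above.
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve

def plot_roc(labels, probas, fig_name):
    fpr, tpr, _ = roc_curve(labels, probas)  # labels: 0/1, probas: positive-class scores
    plt.figure()
    plt.plot(fpr, tpr, color="blue", linewidth=1, label='AUC = {:.3f}'.format(auc(fpr, tpr)))
    plt.xlim([-0.03, 1.03])
    plt.ylim([-0.03, 1.03])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.title("roc curve")
    plt.legend(loc='lower right')
    plt.savefig(fig_name, format='png')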
Code Example #18
        outq.put([idx, len(twarr)])


def read2(idx, file, nothing='p'):
    twarr = fu.load_array(file)
    return [idx, len(twarr)]


if __name__ == '__main__':
    # dp = CustomDaemonPool()
    # dp = ProxyDaemonPool()
    # dp.set_parameters(read2, 8)
    # base = '/home/nfs/cdong/tw/testdata/yying/2016_04/'
    # files = [base + sub for sub in subs][:40]
    base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/'
    subs = fi.listchildren(base, children_type=fi.TYPE_FILE)
    files = [base + sub for sub in subs]

    tmu.check_time()
    res = multi_process_batch(read2,
                              args_list=[(idx, file)
                                         for idx, file in enumerate(files)])
    # dp.set_batch_input([(idx, file) for idx, file in enumerate(files)],
    #                    [{'nothing': str(idx)+file} for idx, file in enumerate(files)])
    # res = dp.get_batch_output()
    # print(sum(([length for idx, length in res])))
    # print(res)
    tmu.check_time()
    print([[idx, len(fu.load_array(file))] for idx, file in enumerate(files)])
    tmu.check_time()
Code Example #19
    #     print('total:', len(predict), 'filtered ratio:', table.loc["data"]['被过滤'] / len(predict))


def perfomance_analysis():
    labal, proba = fu.load_array('label_proba')
    print(len(labal), len(proba))
    au.precision_recall_threshold(labal, proba)


if __name__ == '__main__':
    from calling.back_filter import filter_twarr_text
    # from classifying.terror.classifier_terror import file2label_text_array
    # textarr, labelarr = file2label_text_array("/home/nfs/cdong/tw/seeding/Terrorist/data/test")
    pos_base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive"
    neg_base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/negative"
    pos_files, neg_files = fi.listchildren(pos_base, concat=True), fi.listchildren(neg_base, concat=True, pattern='2012')
    
    base = "/home/nfs/yangl/event_detection/testdata/event2012/relevant"
    pos_files = fi.listchildren(base, concat=True)
    print(len(pos_files))
    print(sum([len(fu.read_lines(f)) for f in pos_files]))
    exit()
    
    my_filter = EffectCheck()
    pos_probarr, neg_probarr = list(), list()
    tmu.check_time()
    for file in neg_files:
        twarr = filter_twarr_text(fu.load_array(file))
        probarr = my_filter.predict_proba(twarr)
        neg_probarr.extend(probarr)
    tmu.check_time()
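au.precision_recall_threshold is another project helper whose implementation is not shown; presumably it sweeps a decision threshold over the predicted probabilities and reports precision and recall at each point. Below is a hedged stand-in for that idea using scikit-learn (the function name only mirrors the project helper; it is not its code).

# Sketch (assumption): inspect precision/recall across thresholds via scikit-learn.
from sklearn.metrics import precision_recall_curve

def precision_recall_threshold(labels, probas):
    precision, recall, thresholds = precision_recall_curve(labels, probas)
    for p, r, t in zip(precision, recall, thresholds):
        print('threshold={:.3f}  precision={:.4f}  recall={:.4f}'.format(t, p, r))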
Code Example #20
            text = pu.text_normalization(text)
        if pu.is_empty_string(text):
            continue
        textarr.append(text)
    return textarr


def split_train_test(array):
    split = int(len(array) * 0.8)
    return array[:split], array[split:]


pos_event_pattern = '/home/nfs/cdong/tw/seeding/Terrorist/queried/positive/{}'
neg_event_pattern = '/home/nfs/cdong/tw/seeding/Terrorist/queried/negative/{}'
pos_files = fi.listchildren(pos_event_pattern.format(''),
                            fi.TYPE_FILE,
                            concat=True)
neg_files = fi.listchildren(neg_event_pattern.format(''),
                            fi.TYPE_FILE,
                            concat=True)
""" -------- for fasttext -------- """
label_t, label_f = ftu.label_t, ftu.label_f
ft_data_pattern = "/home/nfs/cdong/tw/seeding/Terrorist/data/fasttext/{}"
fasttext_train = ft_data_pattern.format("train")
fasttext_test = ft_data_pattern.format("test")

neg_2012_full_pattern = "/home/nfs/cdong/tw/seeding/Terrorist/queried/negative/neg_2012_full_1/{}"
neg_2012_full_files = fi.listchildren(neg_2012_full_pattern.format(''),
                                      concat=True)
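Code Example #1 writes label-prefixed lines into fasttext_train and fasttext_test. The training step itself does not appear on this page; assuming prefix_textarr produces the usual __label__ prefixes expected by fastText, the files could be consumed with the official fasttext package roughly as follows (the hyperparameters are purely illustrative).

# Sketch (assumption): train and evaluate a fastText classifier on the files written by make_train_test.
import fasttext

model = fasttext.train_supervised(input=fasttext_train, epoch=25, lr=0.5, wordNgrams=2)
n_samples, precision_at_1, recall_at_1 = model.test(fasttext_test)
print(n_samples, precision_at_1, recall_at_1)
model.save_model(ft_data_pattern.format("model.bin"))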