Example #1
def parse_query_list(from_path, into_path, query_list, process_num):
    all_sub_files = list(fi.listchildren(
        from_path, children_type=fi.TYPE_FILE, pattern='.sum$'))
    tw_num_sum = 0
    for query in query_list:
        query = SeedQuery(*query)
        # keep only the files whose filename timestamp falls inside the query's time window
        query_sub_files = [
            fi.join(from_path, f) for f in all_sub_files
            if query.is_time_desired(
                tw_ymd=query.time_of_tweet(f, source='filename'))
        ]
        # report the matched file range (assumes at least one file matched)
        print('{} files from {} to {}'.format(
            len(query_sub_files),
            query_sub_files[0][query_sub_files[0].rfind('/') + 1:],
            query_sub_files[-1][query_sub_files[-1].rfind('/') + 1:],
        ))
        twarr = query_from_files_multi(query_sub_files, query, process_num)
        tw_num_sum += len(twarr)
        file_name = query.to_string() + '.json'
        # only persist queries that matched more than 20 tweets
        if len(twarr) > 20:
            print('file {} written\n'.format(file_name))
            fu.dump_array(fi.join(into_path, file_name), twarr)
        else:
            print('twarr not long enough')
        for tw in twarr:
            print(tw[tk.key_text], '\n')
    print('total tweet number: {}'.format(tw_num_sum))
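A hypothetical invocation sketch; SeedQuery's constructor arguments are assumed from how the tuples are unpacked above, and every path and date below is made up:

query_list = [
    # assumed shape: whatever SeedQuery(*query) accepts; keywords plus a date range here
    (['earthquake'], '2016-01-01', '2016-01-31'),
    (['flood', 'rescue'], '2016-02-01', '2016-02-29'),
]
parse_query_list('/data/tweets/summaries', '/data/tweets/queried',
                 query_list, process_num=16)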
Example #2
def copy_into_process_base(process_base):
    # give the process a private copy of the AutoPhrase runtime so parallel runs do not collide
    if not fi.exists(process_base):
        fi.mkdir(process_base)
        os.system("cp %s %s" % (command, process_base))
        os.system("cp %s %s" % (fi.join(autophrase_base, 'phrasal_segmentation.sh'), process_base))
        os.system("cp -r %s %s" % (fi.join(autophrase_base, 'bin'), process_base))
        os.system("cp -r %s %s" % (fi.join(autophrase_base, 'data'), process_base))
        os.system("cp -r %s %s" % (fi.join(autophrase_base, 'tools'), process_base))
def autophrase_wrapper(process_code, textarr):
    # process_code identifies the directory this process occupies; textarr is a list of texts
    process_base = fi.join(autophrase_output_base, str(process_code))
    copy_into_process_base(process_base)
    commander = fi.join(process_base, "auto_phrase.sh")
    input_text_file = fi.join(process_base, "raw_train.txt")
    output_keyword_file = fi.join(process_base, "AutoPhrase.txt")
    # write the text list to a file, then run AutoPhrase
    fu.write_lines(input_text_file, textarr)
    min_sup = determine_min_sup(len(textarr))
    autophrase(input_text_file, process_base, commander, process_base, min_sup)
    # read the AutoPhrase results
    lines = fu.read_lines(output_keyword_file)
    conf_word_list = list()
    for line in lines:
        conf, word = line.split(maxsplit=1)
        conf_word_list.append((float(conf), word))
    # fi.rmtree(os.path.join(process_base, 'tmp'))
    return conf_word_list
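A minimal usage sketch with made-up texts; the process code simply names a scratch directory under OUTPUTS/, and the confidence cutoff is arbitrary:

sample_texts = [
    'heavy flooding reported across the downtown area',
    'rescue teams deployed after the magnitude 6 earthquake',
]
for conf, word in autophrase_wrapper(0, sample_texts):
    if conf > 0.5:  # arbitrary threshold for this sketch
        print('{:.2f}\t{}'.format(conf, word))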
Example #4
def extract_sub_main(inq, outq):
    """
    聚类信息提取模块子进程的主函数,该进程还有下属的若干子进程用于执行实际操作,
    负责读取主进程输入,调用子进程组进行一次地点提取,根据返回结果,对多个聚类依地点进行合并操作,
    并再次调用子进程组对合并结果进行完整处理,以该结果为最终结果输出到文件
    :param inq: mp.Queue,主进程向子进程的输入队列
    :param outq: mp.Queue,子进程向主进程的输出队列
    :return:
    """
    pool_size, event_type = inq.get()
    extract_pool = CustomDaemonPool()
    extract_pool.start(extract_pool_main, pool_size)
    # hand each worker its index and the event type as an initialization message
    extract_pool.set_batch_input([(pidx, event_type)
                                  for pidx in range(pool_size)])
    extract_pool.get_batch_output()

    counter = 0
    while True:
        command = inq.get()
        if command == INPUT_LIST:
            cluid_twarr_list = inq.get()
            print('    bext: remain task={}, len(cluid_twarr_list)={}'.format(
                inq.qsize(), len(cluid_twarr_list)))
            if inq.qsize() > 6:
                # drop this batch when the backlog grows too long
                print('    bext: too many tasks undone, skipping this batch')
                continue
            # tmu.check_time('extract_sub_main', print_func=lambda dt: print('lline 41', dt))
            # print("    bext: get new cluid_twarr_list, len {}".format(len(cluid_twarr_list)))
            extract_pool.set_batch_input(cluid_twarr_list)
            cic_list = extract_pool.get_batch_output()
            # tmu.check_time('extract_sub_main', print_func=lambda dt: print('lline 43', dt))
            # merge clusters that were assigned the same location, then run
            # full extraction once more on the merged clusters
            new_cluid_twarr_list = merge_cic_list2cluid_twarr_list(cic_list)
            new_cluid_twarr_list = [(cluid, twarr, CIG.TAR_FULL)
                                    for cluid, twarr in new_cluid_twarr_list]
            extract_pool.set_batch_input(new_cluid_twarr_list)
            n_cic_list = extract_pool.get_batch_output()
            # format = cic_format(n_cic_list)
            # tmu.check_time('extract_sub_main', print_func=lambda dt: print('lline 49', dt))
            # print("    bext: get merged cic list, len {}".format(len(n_cic_list)))
            # collection = '{}_{}clusters'.format(counter,len(n_cic_list))
            # dbu.insert_many(dbu.clu_db,collection,format)
            write_cic_list(
                fi.join(OUT_BASE, "{}_{}clusters/".format(counter, len(n_cic_list))),
                n_cic_list)
            counter += 1
        elif command == END_PROCESS:
            print('ending extract_sub_main')
            outq.put('ending extract_sub_main')
            return
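A hedged driver sketch for the parent side of this protocol, assuming the INPUT_LIST and END_PROCESS constants used above; the pool size, event type, and batch payload are illustrative:

import multiprocessing as mp

inq, outq = mp.Queue(), mp.Queue()
sub = mp.Process(target=extract_sub_main, args=(inq, outq))
sub.start()
inq.put((8, 'earthquake'))      # first message: (pool_size, event_type)
cluid_twarr_list = []           # illustrative batch; real payloads carry (cluid, twarr) pairs
inq.put(INPUT_LIST)             # announce a batch...
inq.put(cluid_twarr_list)       # ...then send it
inq.put(END_PROCESS)            # ask the subprocess to shut down
print(outq.get())               # 'ending extract_sub_main'
sub.join()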
Example #5
def write_cic_list(path, cic_list):
    """
    调用 cic_list 中的各元素的 construct_od返回一个OrderedDIct,将每个OrderedDIct持久化到指定路径下的文件中
    :param path: str,输出路径
    :param cic_list: list,每个元素为 ClusterInfoCarrier
    :return:
    """
    fi.mkdir(path, remove_previous=True)
    # write larger clusters first
    cic_list = sorted(cic_list, key=lambda item: len(item.twarr), reverse=True)
    print('    bext: output cic list, len={}'.format(len(cic_list)))
    for idx, cic in enumerate(cic_list):
        cluid = cic.cluid
        cic.twarr = ClusterInfoGetter.group_similar_tweets(cic.twarr,
                                                           process_num=10)
        od = cic.construct_od()
        json_str = fu.dumps(od)
        cluster_file = fi.join(path, '{}_cluid:{}.json'.format(idx, cluid))
        fu.write_lines(cluster_file, [json_str])

        # textarr_file = fi.join(path, '{}_text.json'.format(idx))
        # textarr = [tw[tk.key_text] for tw in cic.twarr]
        # fu.write_lines(textarr_file, textarr)
    print('    bext: output into files over')
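write_cic_list touches only three members of each carrier: cluid, twarr, and construct_od(). A hypothetical minimal stand-in showing that interface (the real ClusterInfoCarrier serializes far richer cluster info):

from collections import OrderedDict

class MinimalCarrier:
    # hypothetical stand-in; not the real ClusterInfoCarrier
    def __init__(self, cluid, twarr):
        self.cluid = cluid
        self.twarr = twarr

    def construct_od(self):
        return OrderedDict([('cluid', self.cluid), ('tweet_num', len(self.twarr))])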
Example #6

import os
import shutil
from subprocess import Popen, PIPE

from config.configure import getcfg
import utils.function_utils as fu
import utils.array_utils as au
import utils.file_iterator as fi
import utils.multiprocess_utils as mu
import utils.tweet_keys as tk
import utils.timer_utils as tmu

autophrase_base = getcfg().autophrase_path
autophrase_output_base = fi.join(
    autophrase_base, "OUTPUTS/")  # confine every AutoPhrase output to somewhere under output_base
command = fi.join(autophrase_base, "auto_phrase.sh")
fi.mkdir(autophrase_output_base)


def autophrase(input_text_file, output_path, commander, process_base, min_sup):
    p = Popen(commander,
              shell=True,
              bufsize=1,
              stdin=PIPE,
              stdout=PIPE,
              stderr=PIPE,
              cwd=process_base)
    # feed the script the three inputs it reads from stdin, then wait for it to
    # finish so the output files exist before the caller reads them
    p.stdin.write((input_text_file + '\n').encode("utf8"))
    p.stdin.write((output_path + '\n').encode("utf8"))
    p.stdin.write((str(min_sup) + '\n').encode("utf8"))
    p.stdin.flush()
    p.stdin.close()
    p.wait()
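The script consumes its three parameters from standard input, one per line. A self-contained sketch of the same stdin-feeding pattern against a plain cat, so it runs without the AutoPhrase tree (paths and values are illustrative):

from subprocess import Popen, PIPE

p = Popen('cat -', shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
out, _ = p.communicate(b'raw_train.txt\n/tmp/proc0\n10\n')  # three lines, one per parameter
print(out.decode('utf8'))  # cat simply echoes the three parameter lines back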
Example #7
 def get_param_path(self):
     return fi.join(self.get_theme_path(), 'params')
Example #8
 def get_queried_path(self):
     return fi.join(self.get_theme_path(), 'queried')
Example #9
 def get_theme_path(self):
     return fi.join(self.get_base_path(), self.theme)
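Taken together, the three getters imply a per-theme directory layout rooted at get_base_path(); a sketch with made-up values:

import os.path

# hypothetical values; the real base path comes from get_base_path()
base, theme = '/data/themes', 'earthquake'
theme_path = os.path.join(base, theme)
print(theme_path)                             # /data/themes/earthquake
print(os.path.join(theme_path, 'params'))     # /data/themes/earthquake/params
print(os.path.join(theme_path, 'queried'))    # /data/themes/earthquake/queried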