Example #1
def copy_into_process_base(process_base):
    # Copy the AutoPhrase entry script, the phrasal segmentation script, and the
    # bin/data/tools directories into a per-process working directory, so parallel
    # runs do not interfere with each other. `command` and `autophrase_base` are
    # module-level globals (defined where the autophrase() helper appears further down this page).
    if not fi.exists(process_base):
        fi.mkdir(process_base)
        os.system("cp %s %s" % (command, process_base))
        os.system("cp %s %s" % (fi.join(autophrase_base, 'phrasal_segmentation.sh'), process_base))
        os.system("cp -r %s %s" % (fi.join(autophrase_base, 'bin'), process_base))
        os.system("cp -r %s %s" % (fi.join(autophrase_base, 'data'), process_base))
        os.system("cp -r %s %s" % (fi.join(autophrase_base, 'tools'), process_base))
Example #2
    def set_base_path(self, base_path):
        # Remember the separator-terminated base path and make sure every derived directory exists.
        self.base_path = fi.add_sep_if_needed(base_path)
        for path in [
                self.get_base_path(),
                self.get_theme_path(),
                self.get_queried_path(),
                self.get_param_path(),
        ]:
            fi.mkdir(path)
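The fi module itself is not shown on this page; the following is a minimal sketch of what fi.add_sep_if_needed and fi.mkdir could look like, inferred purely from how they are called in these examples (including the remove_previous keyword used in later snippets) and not taken from the project source:

import os
import shutil

def add_sep_if_needed(path):
    # Make sure the path ends with the platform separator.
    return path if path.endswith(os.sep) else path + os.sep

def mkdir(path, remove_previous=False):
    # Optionally wipe an existing directory, then create it (and any missing parents).
    if remove_previous and os.path.exists(path):
        shutil.rmtree(path)
    if not os.path.exists(path):
        os.makedirs(path)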
Example #3
def write_cic_list(path, cic_list):
    """
    Call construct_od() on each element of cic_list to obtain an OrderedDict, and persist each OrderedDict to a file under the given path.
    :param path: str, output directory
    :param cic_list: list, each element a ClusterInfoCarrier
    :return:
    """
    fi.mkdir(path, remove_previous=True)
    cic_list = sorted(cic_list, key=lambda item: len(item.twarr), reverse=True)
    print('    bext: output cic list, len={}'.format(len(cic_list)))
    for idx, cic in enumerate(cic_list):
        cluid = cic.cluid
        cic.twarr = ClusterInfoGetter.group_similar_tweets(cic.twarr,
                                                           process_num=10)
        od = cic.construct_od()
        json_str = fu.dumps(od)
        cluster_file = fi.join(path, '{}_cluid:{}.json'.format(idx, cluid))
        fu.write_lines(cluster_file, [json_str])

        # textarr_file = fi.join(path, '{}_text.json'.format(idx))
        # textarr = [tw[tk.key_text] for tw in cic.twarr]
        # fu.write_lines(textarr_file, textarr)
    print('    bext: output into files over')
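fu.dumps and fu.write_lines are also project helpers not shown here; a plausible minimal version of each, assuming they are thin wrappers over json and a plain file write:

import json

def dumps(obj):
    # Serialize a dict / OrderedDict to a JSON string.
    return json.dumps(obj)

def write_lines(file, lines):
    # Write each string in `lines` as one line of the target file.
    with open(file, 'w', encoding='utf8') as fp:
        fp.write('\n'.join(lines) + '\n')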
    ft_model = "/home/nfs/cdong/tw/src/models/classify/terror/ft_no_gpe_model"
    lr_model = "/home/nfs/cdong/tw/src/models/classify/terror/lr_no_gpe_model"
    clf_model = lr_model
    tmu.check_time('all')
    tmu.check_time()

    # coef_of_lr_model("/home/nfs/cdong/tw/src/models/classify/terror/lr_no_gpe_model")
    # clf_filter = ClassifierAddFeature(None, None)
    # for file in fi.listchildren("/home/nfs/cdong/tw/seeding/Terrorist/queried/positive", concat=True):
    #     twarr = fu.load_array(file)
    #     print(file, clf_filter.predict_mean_proba(twarr))
    # tmu.check_time()
    # exit()

    batch_num = 20
    fi.mkdir(ft_data_pattern.format('matrices_no_add'), remove_previous=True)
    train_mtx_ptn = ft_data_pattern.format(
        'matrices_no_add/train_feature_mtx_{}.npy')
    train_lbl_ptn = ft_data_pattern.format(
        'matrices_no_add/train_lblarr_mtx_{}.npy')
    train_file_list = [(train_mtx_ptn.format(idx), train_lbl_ptn.format(idx))
                       for idx in range(batch_num)]

    # _clf = ClassifierAddFeature(None, None)
    # _ft_args = dict(epoch=150, lr=1.5, wordNgrams=2, verbose=2, minCount=2, thread=20, dim=300)
    # tmu.check_time()

    # _clf.train_ft(fasttext_train, _ft_args, ft_model)
    # tmu.check_time(print_func=lambda dt: print('train ft time: {}s'.format(dt)))
    # generate_train_matrices(ft_model, fasttext_train, train_file_list)
    # tmu.check_time()
Example #5
import shutil
from subprocess import Popen, PIPE

from config.configure import getcfg
import utils.function_utils as fu
import utils.array_utils as au
import utils.file_iterator as fi
import utils.multiprocess_utils as mu
import utils.tweet_keys as tk
import utils.timer_utils as tmu

autophrase_base = getcfg().autophrase_path
autophrase_output_base = fi.join(
    autophrase_base, "OUTPUTS/")  # confine any AutoPhrase output to somewhere under output_base
command = fi.join(autophrase_base, "auto_phrase.sh")
fi.mkdir(autophrase_output_base)


def autophrase(input_text_file, output_path, commander, process_base, min_sup):
    p = Popen(commander,
              shell=True,
              bufsize=1,
              stdin=PIPE,
              stdout=PIPE,
              stderr=PIPE,
              cwd=process_base)
    p.stdin.flush()
    # Feed the three parameters (input file, output path, minimum support) to the script via stdin.
    p.stdin.write((input_text_file + '\n').encode("utf8"))
    p.stdin.write((output_path + '\n').encode("utf8"))
    p.stdin.write((min_sup + '\n').encode("utf8"))
    p.stdin.flush()
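A hypothetical call of the function above, assuming a working directory prepared by copy_into_process_base from the first example; the input path and the minimum-support value are placeholders, and min_sup must be a string because it is written to stdin as text. Note that autophrase() as shown neither waits for nor returns the subprocess:

proc_base = fi.join(autophrase_output_base, 'process_0')
out_path = fi.join(autophrase_output_base, 'process_0_result')
copy_into_process_base(proc_base)
autophrase('/path/to/cleaned_tweets.txt', out_path, command, proc_base, '10')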
Example #6
            return
        print('new list generated, hbn={}, rbn={}'.format(hbn, rbn))
        cluid_twarr_list = self.gsdpmm.get_cluid_twarr_list(twnum_thres)
        if cluid_twarr_list:
            out_channel.put(cluid_twarr_list)


if __name__ == '__main__':
    import utils.tweet_keys as tk
    import utils.array_utils as au
    import utils.pattern_utils as pu
    import utils.timer_utils as tmu
    import calling.back_extractor as bext
    import utils.file_iterator as fi
    import utils.function_utils as fu
    fi.mkdir('/home/nfs/cdong/tw/src/calling/tmp', remove_previous=True)

    tmu.check_time()
    _hold_batch_num = 100
    _batch_size = 100
    _alpha, _beta = 30, 0.01
    # _alpha, _beta = 50, 0.005
    _file = "./filtered_twarr.json"
    _twarr = fu.load_array(_file)[:10200]
    start_pool(_hold_batch_num, _batch_size, _alpha, _beta)
    input_twarr_batch(_twarr)

    print('---> waiting for _cluid_cluster_list')
    while True:
        _cluid_cluster_list = cluster_daemon.outq2.get()
        print('     - something returned, type: {}'.format(type(_cluid_cluster_list)))
Example #7
class ExtractSubProcess(DaemonProcess):
    def start(self, func):
        self.process = mp.Process(target=func, args=(self.inq, self.outq))
        self.process.daemon = False
        self.process.start()
        # print('ExtractSubProcess', self.process.pid)
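DaemonProcess is defined elsewhere in the project; below is a minimal sketch of a base class that would support the override above, assuming it owns the inq/outq queues passed to mp.Process and that the override exists to flip the daemon flag (inferred from the usage, not the actual source; import multiprocessing as mp is assumed):

class DaemonProcess:
    def __init__(self):
        self.inq, self.outq = mp.Queue(), mp.Queue()
        self.process = None

    def start(self, func):
        # Default behaviour: run func(inq, outq) in a daemonized child process.
        self.process = mp.Process(target=func, args=(self.inq, self.outq))
        self.process.daemon = True
        self.process.start()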


extract_sub_process = ExtractSubProcess()

END_PROCESS = -1
SET_PARAMS = 0
INPUT_LIST = 1
OUTPUT_LIST = 3
OUT_BASE = getcfg().output_path
fi.mkdir(OUT_BASE, remove_previous=True)
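The integer constants above read like message tags for the queues; the following is a hypothetical parent-side sequence, where only the very first message's layout (pool_size, event_type) is grounded in extract_sub_main below and everything else is an assumption:

extract_sub_process.start(extract_sub_main)
extract_sub_process.inq.put((8, 'terror_attack'))             # (pool_size, event_type), read first
extract_sub_process.inq.put((INPUT_LIST, cluid_twarr_list))   # hypothetical: one batch of clusters
extract_sub_process.inq.put((END_PROCESS, None))              # hypothetical: shut the subprocess down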


def extract_sub_main(inq, outq):
    """
    Main function of the cluster-information extraction subprocess. This process owns a group of worker subprocesses that do the actual work:
    it reads input from the parent process, calls the worker group once to extract locations, merges clusters that share a location based on the returned results,
    then calls the worker group again to fully process the merged clusters, and writes that final result to file.
    :param inq: mp.Queue, input queue from the parent process to this subprocess
    :param outq: mp.Queue, output queue from this subprocess to the parent process
    :return:
    """
    pool_size, event_type = inq.get()
    extract_pool = CustomDaemonPool()
    extract_pool.start(extract_pool_main, pool_size)
    extract_pool.set_batch_input([(pidx, event_type) for pidx in range(pool_size)])
Example #8
def recover_train_matrix(train_file_list):
    # Reload the per-batch feature / label matrices (presumably written by generate_train_matrices)
    # and stack them into one training set. Parameter name assumed; the function is called
    # with nd_train_mtx_files below.
    mtx_list, lbl_list = list(), list()
    for mtx_file, lbl_file in train_file_list:
        mtx_list.append(np.load(mtx_file))
        lbl_list.append(np.load(lbl_file))
    featurearr = np.concatenate(mtx_list, axis=0)
    labelarr = np.concatenate(lbl_list, axis=0)
    return featurearr, labelarr
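Downstream, the two arrays returned above feed the logistic-regression training whose argument dict is cut off at the end of this snippet; a minimal sketch with scikit-learn, where only n_jobs=20 comes from the truncated _lr_args and everything else is a placeholder choice:

from sklearn.linear_model import LogisticRegression

def fit_lr(featurearr, labelarr):
    # Placeholder training step mirroring the _lr_args dict started at the bottom of this example.
    lr = LogisticRegression(n_jobs=20, max_iter=300)
    lr.fit(featurearr, labelarr)
    return lr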


if __name__ == "__main__":
    from classifying.natural_disaster.nd_data_maker import nd_train, nd_test, nd_data_pattern
    nd_ft_model = "/home/nfs/cdong/tw/src/models/classify/natural_disaster/ft_model"
    nd_lr_model = "/home/nfs/cdong/tw/src/models/classify/natural_disaster/lr_model"
    tmu.check_time('all')
    tmu.check_time()

    batch_num = 20
    fi.mkdir(nd_data_pattern.format('matrices'))
    nd_train_mtx = nd_data_pattern.format('matrices/train_feature_mtx_{}.npy')
    nd_train_lbl = nd_data_pattern.format('matrices/train_lblarr_mtx_{}.npy')
    nd_train_mtx_files = [(nd_train_mtx.format(idx), nd_train_lbl.format(idx))
                          for idx in range(batch_num)]

    _clf = ClassifierNaturalDisaster(None, None)
    # _ft_args = dict(epoch=150, lr=1.5, wordNgrams=2, verbose=2, minCount=2, thread=20, dim=250)
    # _clf.train_ft(nd_train, _ft_args, nd_ft_model)
    # tmu.check_time(print_func=lambda dt: print('train ft time: {}s'.format(dt)))
    generate_train_matrices(nd_ft_model, nd_train, nd_train_mtx_files)
    tmu.check_time()
    _featurearr, _labelarr = recover_train_matrix(nd_train_mtx_files)
    tmu.check_time()

    _lr_args = dict(n_jobs=20,