def generate_train_matrices(ft_model_file, lbl_txt_file, mtx_lbl_file_list):
    """Split a label-text file into blocks and convert each block in a worker.

    The lines of ``lbl_txt_file`` are read with ``fu.read_lines`` and divided
    by ``mu.split_multi_format`` into ``len(mtx_lbl_file_list)`` blocks.  The
    block at position ``i`` is paired with the ``i``-th ``(mtx_file, lbl_file)``
    entry, and ``_generate_matrices`` is dispatched once per pair through
    ``mu.multi_process_batch`` with a batch size of 10.

    :param ft_model_file: forwarded unchanged as the first argument of every
        ``_generate_matrices`` call
    :param lbl_txt_file: file whose lines are read and split into blocks
    :param mtx_lbl_file_list: iterable of ``(mtx_file, lbl_file)`` pairs, one
        per block
    :return: None
    """
    lines = fu.read_lines(lbl_txt_file)
    blocks = mu.split_multi_format(lines, len(mtx_lbl_file_list))

    jobs = []
    for position, (mtx_file, lbl_file) in enumerate(mtx_lbl_file_list):
        # Index (rather than zip) so a block/file count mismatch still raises.
        jobs.append((ft_model_file, blocks[position], mtx_file, lbl_file))

    # Show how many lines each worker will receive.
    print([len(block) for block in blocks])
    mu.multi_process_batch(_generate_matrices, 10, jobs)
def run_glda_multi():
    """Run the GLDA executable over a parameter grid for Reuters and 20ng.

    For each dataset the output directory ``out_dir_.format(d.name)`` is
    (re)created, a per-dataset parameter specification is expanded with
    ``au.grid_params``, and the per-dataset grids are combined with
    ``au.merge``.  Every grid point becomes one command line: ``execute_``
    followed by the point's values joined by single spaces, paired with
    ``glda_base_``.  The commands are handed to ``mu.multi_process_batch``
    with ``run_glda`` as the worker.

    :return: None
    """
    datasets = (DataReuters(), Data20ng())

    # NOTE(review): rm_prev=True presumably wipes earlier results -- confirm.
    for dataset in datasets:
        iu.mkdir(out_dir_.format(dataset.name), rm_prev=True)

    # One specification per dataset; only the values (not the names) end up
    # on the command line, so the entry order here is the argument order.
    name_value_list = []
    for dataset in datasets:
        name_value_list.append((
            ('f1', [dataset.glda_embed_file]),
            ('f2', [dataset.glda_corpus_file]),
            ('f3', [out_dir_.format(dataset.name)]),
            ('D', [dimension]),
            ('K', [dataset.topic_num]),
            ('numIterations', [num_iter]),
            ('round_index', list(range(1))),
            ('alpha', [0]),
            ('k_0', [1.0, 0.1, 0.01, 0.001]),
        ))

    grid = au.merge(au.grid_params(spec) for spec in name_value_list)

    args_list = []
    for point in grid:
        command = execute_ + ' '.join(str(value) for value in point.values())
        args_list.append((command, glda_base_))

    # Preview the first few commands before launching.
    print(args_list[:4])
    mu.multi_process_batch(
        run_glda,
        batch_size=Nodes.select(ncpu=32, ngpu=4),
        args_list=args_list,
    )
def generate_train_matrices(ft_model_file, lbl_txt_file, mtx_lbl_file_list):
    """
    Given the path of a fasttext model file, read the text-label file, split
    its content into blocks and pass the blocks to several subprocesses.
    Each subprocess converts the texts and the labels into lists of vectors
    (i.e. matrices) and writes them to the files named in mtx_lbl_file_list.
    With a large amount of text this avoids regenerating the vectors for the
    texts every time a classifier is trained.

    :param ft_model_file: str, path of the fasttext model file
    :param lbl_txt_file: str, path of the text-label file
    :param mtx_lbl_file_list: each element is a tuple of two str; the first
        names the file that stores the matrix, the second names the file that
        stores the label list belonging to that matrix
    :return: None
    """
    n_blocks = len(mtx_lbl_file_list)
    all_lines = fu.read_lines(lbl_txt_file)
    line_blocks = mu.split_multi_format(all_lines, n_blocks)

    # The i-th block goes with the i-th (matrix file, label file) pair.
    worker_args = [
        (ft_model_file, line_blocks[i], matrix_path, label_path)
        for i, (matrix_path, label_path) in enumerate(mtx_lbl_file_list)
    ]

    block_sizes = [len(block) for block in line_blocks]
    print(block_sizes)
    mu.multi_process_batch(_generate_matrices, 10, worker_args)
def run():
    """Launch a hyper-parameter sweep of ``short_cnn.py`` across the GPUs.

    The search space is expanded with ``au.grid_params``.  Grid point ``i``
    is pinned to ``gpus[i % len(gpus)]`` via ``CUDA_VISIBLE_DEVICES`` and its
    entries are appended to the command using ``entries2name`` with single
    spaces as separators.  The first six commands are then dropped, the next
    five are printed as a preview, and the remainder is dispatched to
    ``run_short_attention`` through ``mu.multi_process_batch``.

    :return: None
    """
    cmd_template = 'CUDA_VISIBLE_DEVICES={} python3.6 short_cnn.py '

    # Earlier single-point settings kept for reference (from the file history):
    #   Kaggle: -do 0.7 -ed 256 -bs 100 -ws 5, -id 0..9
    #   Event:  -do 0.7 -ed 256 -bs 100 -ws 3, -id 0..11
    #   Google: -do 0.5 -ed 256 -bs 100 -ws 5, -id 0..11
    # Current search space, labelled "for DataSnippets" by the original author.
    search_space = (
        ('-do', [0.3, 0.5, 0.7]),
        ('-ed', [200, 300]),
        ('-bs', [50, 100]),
        ('-ws', [3, 5, 7]),
        ('-id', list(range(2))),
        ('-dn', [d.name for d in d_classes]),
        ('-gp', [gpu_frac]),
    )

    commands = []
    for idx, point in enumerate(au.grid_params(search_space)):
        device = gpus[idx % len(gpus)]
        flags = entries2name(point, inter=' ', intra=' ', postfix='')
        # Each job's argument list is a one-element list holding the command.
        commands.append([cmd_template.format(device) + flags])

    # NOTE(review): the first six grid points are skipped -- presumably they
    # were already run; confirm before reusing.  The slice happens after GPU
    # assignment, so the remaining jobs keep their original device mapping.
    args_list = commands[6:]

    print(args_list[:5])
    mu.multi_process_batch(run_short_attention, batch_size=batch_size, args_list=args_list)
def run_multi_and_output(func, batch_size, args_list, result_file):
    """Run ``func`` over ``args_list`` in batches and dump the results.

    :param func: worker passed to ``mu.multi_process_batch``
    :param batch_size: batch size passed to ``mu.multi_process_batch``
    :param args_list: per-call arguments passed to ``mu.multi_process_batch``
    :param result_file: destination given to ``fu.dump_array``
    :return: None
    """
    fu.dump_array(result_file, mu.multi_process_batch(func, batch_size, args_list))