def read_all_reports(report_file, trace_folder, process_num):
    file_list = os.listdir(trace_folder)
    file_num = len(file_list)
    trace_collection = []

    # read every trace file in parallel; each worker returns [trace_hash, unique_insns]
    pool = Pool(process_num)
    for file_no in range(file_num):
        pool.apply_async(
            read_single_trace,
            args=(trace_folder, file_list[file_no], file_no),
            callback=trace_collection.append
        )
    pool.close()
    pool.join()
    print('Finished reading all the traces')

    trace_dict = {}
    for item in trace_collection:
        trace_dict[item[0]] = item[1]

    # read reports
    reports = utils.read_pkl(report_file)
    # split reports into the two categories ('m' and 'b')
    report_dict = {
        'm': [],
        'b': []
    }
    for item in reports:
        report_dict[item[1]].append(item[0])
    print('Finished splitting the reports into two categories!')
    return trace_dict, report_dict
def read_single_trace(folder_path, file_name, file_no):
    if file_no % 100 == 0:
        print('Reading %d-th trace' % file_no)
    file_path = os.path.join(folder_path, file_name)
    if NPZTag:
        tmp = np.load(file_path)
        content = tmp['trace']
        trace_hash = file_name.split('.')[0]
    else:
        content = utils.read_pkl(file_path)
        trace_hash = file_name
    unique_insns = np.unique(np.asarray(content))
    temp = [trace_hash, unique_insns]
    return temp
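# Usage sketch (not part of the original file): the two functions above rely on
# module-level `os`, `np` (numpy), `Pool` from multiprocessing, `utils.read_pkl`,
# and the `NPZTag` flag. The report file, trace folder, and process count below
# are illustrative placeholders; the report pickle is assumed to hold
# (id, category) pairs with category 'm' or 'b', as read_all_reports expects.
if __name__ == '__main__':
    trace_dict, report_dict = read_all_reports('reports.pkl', './traces', process_num=8)
    print(len(trace_dict), len(report_dict['m']), len(report_dict['b']))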
def main(period, time_limit, max_vids, min_score, no_download):
    compiler = Compiler(priv_utils.get_yt(), priv_utils.get_reddit(), MAIN_DIR)

    # for debugging
    fetch = False
    write = True

    vid_pkl_path = MAIN_DIR + '/src/vid_info.pkl'

    # vid info will either be fetched or read from file
    if fetch:
        print("fetching")  # TODO: remove
        vid_info, total_duration = compiler.fetch_vid_info(
            period=period, time_limit=time_limit,
            max_vids=max_vids, min_score=min_score)
        # only write if new values have been fetched
        if write:
            print("writing")  # TODO: remove
            utils.write_pkl(vid_pkl_path, (vid_info, total_duration))
    else:
        print("reading")  # TODO: remove
        #vid_info, total_duration = utils.read_vid_info(vid_pkl_path)
        vid_info, total_duration = utils.read_pkl(vid_pkl_path)

    print("total duration: " + str(total_duration) + "\n")
    for i in range(len(vid_info)):
        vid_info[i].print_info()

    # generate name of compilation for file naming
    comp_name = compiler.comp_name_gen(period, max_vids)

    # create a directory for files related to compilation
    os.system("mkdir " + MAIN_DIR + "/final/" + comp_name)

    # shuffle vid order so that worst vids aren't always last
    random.shuffle(vid_info)

    description = compiler.gen_description(vid_info)
    compiler.write_description(comp_name, description)
    compiler.write_tags(comp_name)
    compiler.write_title(comp_name, period, max_vids)

    if not no_download:
        compiler.download_vids(vid_info)
    compiler.create_compilation(comp_name, vid_info)
def process_poc_trace(poc_trace_path, bin_path, target_src_str):
    if NPZTag:
        tmp = np.load(poc_trace_path)
        poc_trace = tmp['trace']
    else:
        poc_trace = utils.read_pkl(poc_trace_path)
    poc_trace = np.asarray(poc_trace)

    if len(target_src_str) == 0:
        return poc_trace
    else:
        insn_list = parse_dwarf.get_bin_line(bin_path, target_src_str)
        insn_idx_list = []
        for insn in insn_list:
            insn_idx_list += list(np.where(poc_trace == insn)[0])
        if len(insn_idx_list) == 0:
            raise Exception("ERROR: Cannot find the instructions for source -> %s" % target_src_str)
        max_id = max(insn_idx_list)
        return poc_trace[:max_id+1]
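# Usage sketch (illustrative only): trim a PoC trace to the last instruction that
# parse_dwarf.get_bin_line maps back to the given source-location string. Both
# paths and the "file:line" format of the source string are placeholders /
# assumptions, not taken from the original code.
if __name__ == '__main__':
    trimmed = process_poc_trace('./poc_trace.npz', './target_binary', 'vuln.c:123')
    print('Trimmed PoC trace length: %d' % len(trimmed))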
parser.add_argument('--qid_list', required=False, default=None)
args = parser.parse_args()
os.makedirs(args.output_path, exist_ok=True)
logging.basicConfig(format='%(levelname)s :: %(asctime)s - %(message)s',
                    level=args.log_level, datefmt='%d/%m/%Y %I:%M:%S %p',
                    filename=os.path.join(args.output_path, "GET_TURK_DATA_LOG"),
                    filemode='w')
data_root = os.path.join(args.data_repo_root, args.dataset)
distmult_dump = utils.read_pkl(args.model_weights)
logging.info("Read Model Dump")
data = utils.read_data(args.test_file)
qids = set(range(len(data)))
if args.qid_list is not None:
    qids = set(np.loadtxt(args.qid_list, dtype=int).ravel().tolist())
    assert max(qids) < len(data)
#
mapped_data = np.array(
    utils.map_data(data, distmult_dump['entity_to_id'],
                   distmult_dump['relation_to_id'])).astype(np.int32)
logging.info("Loaded test file from %s" % (args.test_file))
if (args.template_pred is None and args.rule_pred is None):
op.add_option('--nw', dest='nb_work', default=4, type='int',
              help='number of worker threads for data loading')
op.add_option('--pr', '--path_result', dest='path_result', default='./result.txt',
              type='str', help='path to save the prediction results')
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)

# initialize data parameters
root_idx = opts.root_idx
path_num = os.path.join(root_idx, 'instance.csv')
max_len = opts.max_len
root_voc = opts.root_voc
word2id_dict = read_pkl(os.path.join(root_voc, 'word2id.pkl'))
label2id_dict = read_pkl(os.path.join(root_voc, 'label2id.pkl'))
has_label = False
batch_size = opts.batch_size
use_cuda = opts.cuda
num_worker = opts.nb_work
path_result = opts.path_result

t0 = time()

# initialize the dataset
dataset = SentenceDataUtil(path_num, root_idx, max_len, word2id_dict, has_label,
              action='store_true', default=False, help='whether to use GPU acceleration')
op.add_option('--nw', dest='nb_work', default=4, type='int',
              help='number of worker threads for data loading')
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)

if not opts.nb_class:
    op.print_help()
    exit()

# initialize data parameters
root_idx = opts.root_idx
path_num = os.path.join(root_idx, 'instance.csv')
max_len = opts.max_len
root_voc = opts.root_voc
word2id_dict = read_pkl(os.path.join(root_voc, 'word2id.pkl'))
label2id_dict = read_pkl(os.path.join(root_voc, 'label2id.pkl'))
has_label = True
dev_size = opts.dev_size
batch_size = opts.batch_size
num_worker = opts.nb_work

# initialize the dataset
dataset = SentenceDataUtil(path_num, root_idx, max_len, word2id_dict, has_label,
                           label2id_dict, shuffle=True)
dataset_train, dataset_dev = dataset.split_train_and_dev(dev_size=dev_size)
import argparse
import logging
import os
import pickle
import string
import time

import numpy as np
import pandas as pd

import utils  # local helper module providing read_pkl

data_repo_root = "../data/fb15k/"
model_weights = "dumps/fb15k237_distmult_dump_norm.pkl"
wiki_file = os.path.join(data_repo_root, "mid2wikipedia.tsv")
orig_file = os.path.join(data_repo_root, "entity_mid_name_type_typeid.txt")
intersection_file = os.path.join(data_repo_root, "mid2wikipedia_cleaned.tsv")
distmult_dump = utils.read_pkl(model_weights)


def read_data(path):
    mapping_name = {}
    mapping_url = {}
    with open(path, "r") as f:
        for line in f:
            # strip the trailing newline so the URL column stays clean
            line_arr = line.rstrip("\n").split("\t")
            mapping_name[line_arr[0]] = line_arr[1]
            mapping_url[line_arr[0]] = line_arr[2]
    return mapping_name, mapping_url


mapping_name, mapping_url = read_data(wiki_file)
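# Example lookup (not part of the original file): read_data assumes each row of
# mid2wikipedia.tsv is "<mid>\t<name>\t<url>", so both dicts are keyed by MID.
# The lookup below just picks an arbitrary MID from the loaded mapping.
some_mid = next(iter(mapping_name))
print(some_mid, mapping_name[some_mid], mapping_url[some_mid])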
parser.add_argument('--qid_list', required=False, default=None)
args = parser.parse_args()
os.makedirs(args.output_path, exist_ok=True)
logging.basicConfig(format='%(levelname)s :: %(asctime)s - %(message)s',
                    level=args.log_level, datefmt='%d/%m/%Y %I:%M:%S %p',
                    filename=os.path.join(args.output_path, "GET_TURK_DATA_LOG"),
                    filemode='w')
data_root = os.path.join(args.data_repo_root, args.dataset)
distmult_dump = utils.read_pkl(args.model_weights)
logging.info("Read Model Dump")
data = utils.read_data(args.test_file)
qids = set(range(len(data)))
if args.qid_list is not None:
    qids = set(np.loadtxt(args.qid_list, dtype=int).ravel().tolist())
    assert max(qids) < len(data)
#
mapped_data = np.array(
    utils.map_data(data, distmult_dump['entity_to_id'],
                   distmult_dump['relation_to_id'])).astype(np.int32)
logging.info("Loaded test file from %s" % (args.test_file))
# if(args.template_pred is None and args.rule_pred is None):
import os

import numpy as np
import cv2
import torch

from config import *

make_dir(reconstruction_path)
make_dir(generate_samples_path)
make_dir(mean_var_path)


def gen_data(mean, cov, num):
    # draw `num` samples from a multivariate Gaussian centred on `mean`
    mean = mean.cpu()
    data = np.random.multivariate_normal(mean, cov, num)
    return np.round(data, 4)


map_dict = read_pkl()
mean = map_dict[0].float()
num = 40

net1 = torch.load(
    os.path.join(model_path, 'encoder_sigma_' + str(num) + '.pth'))
net2 = torch.load(
    os.path.join(model_path, 'decoder_sigma_' + str(num) + '.pth'))

for i in net1.parameters():
    i.requires_grad = False
for i in net2.parameters():
    i.requires_grad = False

net1 = net1.cuda()
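# Sketch (not from the original script): draw `num` samples around the loaded
# class mean with gen_data. The identity covariance is an assumption made purely
# for illustration, and `mean` is assumed to be the 1-D tensor loaded above.
samples = gen_data(mean, np.eye(mean.shape[0]), num)
print(samples.shape)  # expected: (num, feat_dim)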
def process_data(self, meta_file, feat_file, phn_file, wrd_file, slb_file):
    if self.num_all_utts == -1:
        meta_data = read_pkl(meta_file)['prefix']  # {'prefix': [num_of_utts x drID_spkID_uttID]}
        feat = read_pkl(feat_file)  # [num_of_utts x num_of_frames x feat_dim]
        phn_data = read_pkl(phn_file)  # [num_of_utts x num_of_phns x [phn, start, end]] ** include 'h#' **
        wrd_data = read_pkl(wrd_file)  # [num_of_utts x num_of_wrds x [wrd, start, end]]
        # slb_data = read_pkl(slb_file)  # [num_of_utts x num_of_slbs x [slb, start, end]]
    else:
        meta_data = read_pkl(meta_file)['prefix'][:self.num_all_utts]  # {'prefix': [num_of_utts x drID_spkID_uttID]}
        feat = read_pkl(feat_file)[:self.num_all_utts]  # [num_of_utts x num_of_frames x feat_dim]
        phn_data = read_pkl(phn_file)[:self.num_all_utts]  # [num_of_utts x num_of_phns x [phn, start, end]] ** include 'h#' **
        wrd_data = read_pkl(wrd_file)[:self.num_all_utts]  # [num_of_utts x num_of_wrds x [wrd, start, end]]
        # slb_data = read_pkl(slb_file)[:self.num_all_utts]  # [num_of_utts x num_of_slbs x [slb, start, end]]

    phn_wrd_data = self.make_phn_wrd(phn_data, wrd_data)
    self.n_utts = len(feat)
    print("Read %s utterances" % self.n_utts)

    for i, phn_utt in enumerate(phn_data):
        # Process each phn in utt
        phn_meta_utt = []
        for j, (phn, phn_start, phn_end) in enumerate(phn_utt):
            self.n_total_phns += 1
            self.phn2cnt[phn] += 1
            phn_meta_utt.append((phn, i, phn_start, phn_end))
        self.phn_meta.append(phn_meta_utt)

    for i, (utt, feat_utt, wrd_utt, phn_wrd_utt) \
            in enumerate(zip(meta_data, feat, wrd_data, phn_wrd_data)):
        spk = utt.split('_')[1]
        if not spk in self.spk2utt_idx:
            self.spks.append(spk)
            self.spk2utt_idx[spk] = []
            self.spk2wrd_idx[spk] = []
        self.spk2utt_idx[spk].append(i)
        self.utt_idx2spk[i] = spk

        # Process each wrd in utt
        wrd_meta_utt = []
        phn_wrd_meta_utt = []
        for j, ((wrd, wrd_start, wrd_end), phn_wrd) in enumerate(zip(wrd_utt, phn_wrd_utt)):
            wrd = wrd.lower()
            if j != 0 and wrd_start >= wrd_utt[j - 1][1] and wrd_end <= wrd_utt[j - 1][2]:
                # print ('Words overlap:')
                # print (i, j-1, j, wrd_utt[j-1], wrd_utt[j])
                continue
            if j != len(wrd_utt) - 1 and wrd_start >= wrd_utt[j + 1][1] and wrd_end <= wrd_utt[j + 1][2]:
                # print ('Words overlap:')
                # print (i, j, j+1, wrd_utt[j], wrd_utt[j+1])
                continue
            if wrd_end - wrd_start == 0:
                # print (i, wrd)
                continue
            if not len(phn_wrd):
                # print (i, j)
                continue
            self.n_total_wrds += 1
            if not wrd in self.wrd2idx:
                self.wrd2idx[wrd] = self.n_wrds
                self.idx2wrd[self.n_wrds] = wrd
                self.n_wrds += 1
            self.wrd2cnt[wrd] += 1
            wrd_meta_utt.append((wrd, i, wrd_start, wrd_end))
            if not (wrd, phn_wrd) in self.phn_wrd2idx:
                self.phn_wrd2idx[(wrd, phn_wrd)] = self.n_phn_wrds
                self.idx2phn_wrd[self.n_phn_wrds] = (wrd, phn_wrd)
                self.n_phn_wrds += 1
            self.phn_wrd2cnt[(wrd, phn_wrd)] += 1
            phn_wrd_meta_utt.append(((wrd, phn_wrd), i, wrd_start, wrd_end))
            self.spk2wrd_idx[spk].append(self.n_total_wrds - 1)
            self.wrd_idx2spk[self.n_total_wrds - 1] = spk
            self.feat.append(feat[i][wrd_start:wrd_end])
            phn_idx_array = np.array([self.phn2idx[phn] - 1 for phn in phn_wrd])
            self.phn_idx_arrays.append(phn_idx_array)
            self.txt_feat.append(self.make_one_hot_feat(phn_idx_array, self.n_phns - 1))
            char_idx_array = np.array([self.char2idx[char] for char in wrd])
            self.char_idx_arrays.append(char_idx_array)
            self.txt_feat_char.append(self.make_one_hot_feat(char_idx_array, self.n_chars))
        self.wrd_meta.append(wrd_meta_utt)
        self.phn_wrd_meta.append(phn_wrd_meta_utt)

        # Process each slb in utt
        # slb_meta_utt = []
        # for j, (slb, slb_start, slb_end) in enumerate(slb_utt):
        #     if slb_start >= slb_utt[j-1][1] and slb_end <= slb_utt[j-1][2]:
        #         # print ('Syllables overlap:')
        #         # print (i, j-1, j, slb_utt[j-1], slb_utt[j])
        #         continue
        #     self.n_total_slbs += 1
        #     if not slb in self.slb2idx:
        #         self.slb2idx[slb] = self.n_slbs
        #         self.idx2slb[self.n_slbs] = slb
        #         self.n_slbs += 1
        #     self.slb2cnt[slb] += 1
        #     # slb_meta_utt update
        #     slb_meta_utt.append((slb, i, slb_start, slb_end))
        # self.slb_meta.append(slb_meta_utt)

    self.feat = np.array(self.feat)
    self.txt_feat = np.array(self.txt_feat)
    self.txt_feat_char = np.array(self.txt_feat_char)

    self.n_batches = len(self.feat) // self.batch_size
    if len(self.feat) % self.batch_size != 0:
        self.n_batches += 1

    for wrd_meta_utt in self.wrd_meta:
        self.wrds.extend([w[0] for w in wrd_meta_utt])
    for phn_wrd_meta_utt in self.phn_wrd_meta:
        self.phn_wrds.extend([w[0] for w in phn_wrd_meta_utt])

    print('Num of total words: ', len(self.feat), len(self.txt_feat),
          len(self.txt_feat_char), self.n_total_wrds)
    print('Num of distinct words (with different phonemes): ', self.n_phn_wrds)
    print('Num of distinct words: ', self.n_wrds)
    print('Num of batches: ', self.n_batches)
    return
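# Usage sketch (all file names below are placeholders; the surrounding dataset
# class and its constructor are defined elsewhere in this repo):
# loader.process_data('meta.pkl', 'feat.pkl', 'phn.pkl', 'wrd.pkl', 'slb.pkl')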