def _create_partial_indices(self):
    # for logging
    start_time = datetime.now().strftime('%H:%M:%S')
    # get the location of the documents and get all of the files inside
    file_names = get_file_names(self._index_config.get_input_dir())
    for file in file_names:
        try:
            self._add_document_to_index(file)
            # offload the index if need be
            if self._time_to_offload():
                # offload to the partial index
                self._offload_to_partial_index()
            # logging
            if self._doc_id % 100 == 0:
                print(f"Current memory size: {sys.getsizeof(self._inverted_index)}")
                print(f"Completed 100 files... current file: {file}")
        except JSONDecodeError:
            print(f"JSONDecodeError: Skipping file {file}")
            continue
        except UnicodeDecodeError:
            print(f"UnicodeDecodeError: Skipping file {file}")
            continue
    # offload the index
    self._offload_to_partial_index()
    # dump the document id mapping
    dump_json_to_file(self._doc_id_map, self._index_config.get_doc_id_map_path())
    # logging
    print(f"Started at: {start_time}")
    print(f"Partial indices!: {self._partial_index_file_names}")
    print(f"Completed!: {datetime.now().strftime('%H:%M:%S')}")
# attributes of the agent
# setting the cmd mode or the visual mode
if not gui:
    sumoBinary = 'sumo.exe'
else:
    sumoBinary = 'sumo-gui.exe'

# initializations
max_steps = 5400  # seconds = 1 h 30 min each episode
total_episodes = 100
num_experiments = 1
learn = False

traffic_gen = TrafficGenerator(max_steps)
qmodel_filename, stats_filename = utils.get_file_names()
init_experiment, init_epoch = utils.get_init_epoch(stats_filename, total_episodes)
print('init_experiment={} init_epoch={}'.format(init_experiment, init_epoch))
stats = utils.get_stats(stats_filename, num_experiments, total_episodes)

for experiment in range(init_experiment, num_experiments):
    env = SumoEnv(sumoBinary, max_steps)
    tl = TLAgent(env, traffic_gen, max_steps, num_experiments, total_episodes,
                 qmodel_filename, stats, init_epoch, learn)
    init_epoch = 0  # reset init_epoch after first experiment
    if learn:
        tl.train(experiment)
    else:
        seeds = np.load('seed.npy')
def refine_raw_data(raw_dataset_dir, refined_dataset_dir):
    start_time = time.time()
    map_tag_info = {}
    # map_category_tag_post = {}
    raw_names = utils.get_file_names(raw_dataset_dir)
    for i, raw_name in enumerate(raw_names):
        map_pair_tag_occurrence = {}
        category = raw_name[:raw_name.find(".")]
        # map_tag_post = defaultdict(list)
        posts = utils.load_json(os.path.join(raw_dataset_dir, raw_name))
        print("{}/{} Refine {} ({} posts) ...".format(i + 1, len(raw_names), category, len(posts)))
        # print(posts[0])
        for post_id, post in enumerate(posts):
            tags = post.get("Tags", [])

            # Update tag info
            # for tag in tags:
            #     map_tag_post[tag].append(post_id)
            #     map_cat_ids = map_tag_info.get(tag)
            #     if map_cat_ids is None:
            #         map_cat_ids = {}
            #         map_tag_info[tag] = map_cat_ids
            #     ids = map_cat_ids.get(category)
            #     if ids is None:
            #         ids = []
            #         map_cat_ids[category] = ids
            #     ids.append(post_id)

            # Update tag relationship
            for tag1, tag2 in get_all_pairs(tags):
                pred_tag, succ_tag = min(tag1, tag2), max(tag1, tag2)
                occurrence = map_pair_tag_occurrence.get((pred_tag, succ_tag), 0)
                map_pair_tag_occurrence.update({(pred_tag, succ_tag): occurrence + 1})

        # map_category_tag_post[category] = dict(map_tag_post)
        save_path = os.path.join(refined_dataset_dir, "Pair_Tag", "{}.csv".format(category))
        df = []
        for (tag1, tag2), num_occ in map_pair_tag_occurrence.items():
            df.append((tag1, tag2, num_occ))
        df = pd.DataFrame(df, columns=["Tag1", "Tag2", "Num_Occurrence"])
        utils.save_csv(df, save_path)
        print("Total pair_tags : ", len(map_pair_tag_occurrence))
        # break

    # Save result
    # save_path = os.path.join(refined_dataset_dir, "category_tag_post.json")
    # utils.save_json(map_category_tag_post, save_path)
    # save_path = os.path.join(refined_dataset_dir, "tags_info.json")
    # utils.save_json(map_tag_info, save_path)
    #
    # save_path = os.path.join(refined_dataset_dir, "tags_relationship.csv")
    # df = []
    # for (tag1, tag2), occ in map_pair_tag_occurrence.items():
    #     df.append((tag1, tag2, occ))
    # df = pd.DataFrame(df, columns=["Tag1", "Tag2", "Occurrence"])
    # utils.save_csv(df, save_path)

    print("Total tags all category : ", len(map_tag_info))
    exec_time = time.time() - start_time
    print("Time : {:.2f} seconds".format(exec_time))
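# A minimal usage sketch for refine_raw_data, assuming a hypothetical layout where
# raw_dataset/ holds one JSON file of posts per category and the pair-tag CSVs are
# written under refined_dataset/Pair_Tag/. The directory names are illustrative,
# not part of the original script.
if __name__ == "__main__":
    raw_dataset_dir = "raw_dataset"          # e.g. raw_dataset/python.json, raw_dataset/java.json
    refined_dataset_dir = "refined_dataset"  # output CSVs land in refined_dataset/Pair_Tag/
    refine_raw_data(raw_dataset_dir, refined_dataset_dir)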
import utils

# Test get_file_names and write the result to output.txt
utils.get_file_names("/home/jovyan/my_notebooks")

# Test get_all_file_names and write the result to output2.txt
utils.get_all_file_names("/home/jovyan/my_notebooks", "output2.txt")

print("#### Testing the print first line of files ")
# Test print_line_one on the generated file list
utils.print_line_one("output.txt")
print("####")

print("#### Testing the print emails ")
# Test print_emails
utils.print_emails("emails.txt")

# Test write_headlines on a Markdown file
utils.write_headlines("md_test_file.md", "output3.txt")
def load_data(args, retriever, tokenizer, retriever_tokenizer):
    print("Loading data...")
    start_time = time.time()
    data_dir = args.data_dir + '_' + args.experiment_name if args.data_name == 'synthetic' else args.data_dir
    train_name, dev_name, test_name = utils.get_file_names(args.data_name)
    train_path = os.path.join(data_dir, train_name)
    dev_path = os.path.join(data_dir, dev_name)
    test_path = os.path.join(data_dir, test_name)
    make_data_function = get_make_data_function(args.data_name)
    train_dataset, train_info = make_data_function(args, retriever, tokenizer, retriever_tokenizer, file_path=train_path)
    dev_dataset, dev_info = make_data_function(args, None, tokenizer, retriever_tokenizer, file_path=dev_path)
    test_dataset, test_info = make_data_function(args, None, tokenizer, retriever_tokenizer, file_path=test_path)

    load_time = (time.time() - start_time) / 60
    print(f"Loading data took {load_time:.2f} minutes")
    print("Data info:")
    for split_name, info in zip(['train', 'dev', 'test'], [train_info, dev_info, test_info]):
        n, n_classes, label_dist = info['n'], info['n_classes'], [round(100 * x, 2) for x in info['label_dist'].values()]
        print(f'  {split_name}: {n} points | {n_classes} classes | label distribution : {label_dist}')

    train_dataloader = DataLoader(TensorDataset(*train_dataset), shuffle=True,
                                  batch_size=args.train_batch_size, num_workers=4, pin_memory=True)
    dev_dataloader = DataLoader(TensorDataset(*dev_dataset), shuffle=False,
                                batch_size=args.test_batch_size, num_workers=4, pin_memory=True)
    test_dataloader = DataLoader(TensorDataset(*test_dataset), shuffle=False,
                                 batch_size=args.test_batch_size, num_workers=4, pin_memory=True)
    if args.eval_on_train:
        dev_dataloader, test_dataloader = train_dataloader, test_dataloader

    # load separate explanation data for RE into retriever
    if args.task_type == 'RE' and args.use_retrieval:
        exp_file_path = os.path.join(args.data_dir,
                                     'semeval_exp.json' if args.data_name == 'semeval' else 'tacred_exp_orig.json')
        _, exp_info = utils.make_RE_data(args, retriever, tokenizer, retriever_tokenizer, exp_file_path, train_path)
        n, n_classes, label_dist = exp_info['n'], exp_info['n_classes'], [round(100 * x, 2) for x in exp_info['label_dist'].values()]
        print(f'  Exp info: {n} points | {n_classes} classes | label distribution : {label_dist}')

    return train_dataloader, dev_dataloader, test_dataloader
def processConcat(self, dc, astr=''):
    setTitle = self.setTitle2
    list1 = dc["list1"]
    list2 = dc["list2"]
    fast_mode_select = dc["fast_mode_select"]
    outputDir = dc["output_dir"] + os.sep
    tempDir = outputDir + 'tempDir' + os.sep

    # keep the two lists the same length
    minLen = min(len(list1), len(list2))
    list1 = list1[0:minLen]
    list2 = list2[0:minLen]

    finalMP4 = ""
    pStr = ""
    # set param=-c:v libx264 -s 1920x1080 -r 24 -b:v 6144k -b:a 128k -ar 44100 -ac 2 -preset slower -threads 8
    FFStr = '''ffmpeg -y -i "{input}" -c:v libx264 -s {v_size} -crf 18 -r {fps} -b:a 128k -ar 44100 -ac 2 -threads 8 "{output}"'''
    FFConcat = '''ffmpeg -y -f concat -safe 0 -i "{0}" -c copy "{1}"'''
    seq = ('input', 'output', 'v_size', 'fps')

    utils.make_dir(tempDir)
    utils.hide_file(tempDir)

    total = len(list1)
    count = 0
    msgStr = " ({0}/{1}) {2}"
    print(list1)
    status = []
    for i in range(len(list1)):
        count = count + 1
        status.append('')
        fileA = list1[i]
        fileB = list2[i]

        arr = utils.get_file_names(fileA)
        fnameA = arr[1]
        # ftypeA = arr[2]
        ftempA = tempDir + "-" + fnameA + ".mp4"

        arr = utils.get_file_names(fileB)
        fnameB = arr[1]
        # ftypeB = arr[2]
        ftempB = tempDir + "-" + fnameB + ".mp4"

        fullName = fnameA + '__' + fnameB
        finalMP4 = outputDir + fullName + ".mp4"
        subTxt = tempDir + "concat_" + fullName + ".txt"

        # task info
        mstr = msgStr.format(count, total, fullName)
        setTitle(mstr)

        # read the size and frame rate of the first video as the baseline
        # !!! every video goes through one transcode pass
        dc = dict.fromkeys(seq, "")
        dcinfo = ff.get_video_info(fileA, False)
        dc['fps'] = dcinfo['fps'] if dcinfo['fps'] else '24'
        dc['v_size'] = dcinfo['v_size'] if dcinfo['v_size'] else '1920x1080'

        # check whether the two videos already share the same parameters
        isSame = False
        if fast_mode_select:
            isSame = ff.compare_video(fileA, fileB)

        # generate concat.txt and convert the two clips if needed
        subs = []
        sub = "file '{0}'\n"
        if not isSame:
            # convert the first video
            mstr = msgStr.format(count, total, "转换 第一个视频……")
            setTitle(mstr)
            status[i] = '10%'
            self.updateCenter(status)
            dc['input'] = fileA
            dc['output'] = ftempA
            pStr = FFStr.format(**dc)
            ff.execute(pStr)

            # convert the second video
            mstr = msgStr.format(count, total, "转换 第二个视频……")
            setTitle(mstr)
            status[i] = '50%'
            self.updateCenter(status)
            dc['input'] = fileB
            dc['output'] = ftempB
            pStr = FFStr.format(**dc)
            ff.execute(pStr)

            subs.append(sub.format(ftempA))
            subs.append(sub.format(ftempB))
        else:
            mstr = msgStr.format(count, total, "参数相同,跳过转换,直接拼接!")
            setTitle(mstr)
            subs.append(sub.format(fileA))
            subs.append(sub.format(fileB))

        # write the concat list file
        utils.write_txt(subTxt, subs)

        # concatenate the videos
        mstr = msgStr.format(count, total, "拼接中……")
        setTitle(mstr)
        status[i] = '90%'
        self.updateCenter(status)
        pStr = FFConcat.format(subTxt, finalMP4)
        ff.execute(pStr)
        # print(pStr)

        sstr = '成功' if os.path.exists(finalMP4) else '失败'
        status[i] = sstr
        self.updateCenter(status)

        # remove concat.txt and the temporary mp4 files
        utils.remove_file(subTxt)
        utils.remove_file(ftempA)
        utils.remove_file(ftempB)

    setTitle("操作结束!")
    setTitle("")
    # automatically open the output directory
    if finalMP4:
        utils.open_dir(outputDir)
    self.t1 = ""
    self.lockBtn(False)
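# For reference, a minimal sketch of what processConcat feeds to ffmpeg's concat
# demuxer: utils.write_txt writes one "file '<path>'" line per clip into the
# concat_*.txt list, and FFConcat then stream-copies the listed clips into one
# output file. The paths below are illustrative only.
#
#   # contents of concat_A__B.txt (hypothetical)
#   file 'tempDir/-A.mp4'
#   file 'tempDir/-B.mp4'
#
#   # resulting command
#   ffmpeg -y -f concat -safe 0 -i "concat_A__B.txt" -c copy "output/A__B.mp4"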
def once_complex(self, dc, one_dc):
    set_title = self.start_btn.update_query
    update_status = self.update_status

    need_number = one_dc['need_number']
    num_file = one_dc["number_file"]
    num_size = one_dc['number_size']
    num_join_str = one_dc['number_join_str']
    num_join_short_str = one_dc['number_join_short_str']
    if not num_join_short_str:
        num_join_short_str = ''
    else:
        num_join_short_str = " " + num_join_short_str
    num_second = 0
    is_iqy = True if num_join_str == '爱奇艺备案号' else False

    raw_mp4 = one_dc['rawMP4']
    i = one_dc['index']
    number_second = int(dc["number_second"])
    total = one_dc['total']
    out_dir = one_dc['output_dir']
    temp_dir = one_dc['temp_dir']
    pt_second = one_dc['pt_second']
    pw_second = one_dc['pw_second']
    pt_out_file = one_dc['pt_out_file']
    pw_out_file = one_dc['pw_out_file']
    frame_size = one_dc['frame_size']
    water_size = one_dc['water_size']

    rad_var = dc['fps']
    if rad_var == 2:
        fps = '24'
    elif rad_var == 3:
        fps = '25'
    elif rad_var == 4:
        fps = '30'
    else:
        fps = '0'
    target_fps = fps

    radio_select_var = dc["bit"]
    pt_file = dc["pt_file"]
    pw_file = dc["pw_file"]
    frame_file = dc["frame_file"]
    watermark_file = dc["watermark_file"]
    pt_select = dc['pt_select']
    pw_select = dc['pw_select']
    need_frame = dc["frame_select"]
    need_watermark = dc["watermark_select"]
    double_fix_select = utils.str_to_bool(dc["select_double_fix"])
    select_30m = utils.str_to_bool(dc["select_30m"])
    fast_mode_select = False
    # fast_mode_select = dc['fast_mode_select']
    # skip_content_mp4 = False

    count = i + 1
    set_title("")
    format_str = "(%d/%d)" % (count, total) + ' %s'

    arr = utils.get_file_names(raw_mp4)
    f_name = arr[1]
    f_type = arr[2]
    f_full_name = f_name + f_type

    out_file_type = ".mpg" if select_30m else ".mp4"
    temp_video = temp_dir + "-" + f_name + out_file_type
    final_video = out_dir + f_name + out_file_type
    if need_number and num_join_str:
        temp_path = Path(out_dir) / num_join_str
        temp_path = str(temp_path) + os.sep
        utils.make_dir(temp_path)
        final_video = temp_path + f_name + out_file_type

    vb_str = ""
    need_same_bit_rate = False

    # 1) convert the main feature video
    set_title(format_str % f_full_name)
    update_status(i, '10%' + num_join_short_str)

    # match size and fps
    tdc = ff.get_video_info(raw_mp4, False)
    v_size = tdc["v_size"] if tdc["v_size"] else "1920x1080"
    tdc["v_size"] = v_size
    fps = tdc["fps"] if tdc["fps"] else "24"
    tdc["fps"] = fps if target_fps == '0' else target_fps
    duration = tdc['duration'] if tdc["duration"] else '0'
    duration = float(duration)

    if is_iqy:
        vb_str = "8M"
    else:
        # bit-rate selection
        if radio_select_var == 1:
            # keep the source bit rate
            need_same_bit_rate = True
            # tdc["crf"] = 1
            vb_str = ''
        elif radio_select_var == 2:
            # automatic
            tdc["crf"] = 18
            vb_str = ''
        if radio_select_var == 3:
            vb_str = "4M"
        elif radio_select_var == 4:
            vb_str = "6M"
        elif radio_select_var == 5:
            vb_str = "8M"
        elif radio_select_var == 6:
            vb_str = "10M"
        elif radio_select_var == 7:
            vb_str = "30M"

    obj = ff.create_obj()
    obj.input_file = raw_mp4
    obj.output_file = temp_video
    obj.need_same_bit_rate = need_same_bit_rate
    obj.need_30m = select_30m
    # obj.set_video_info(tdc)
    # obj.fps = fps
    # obj.size = v_size
    obj.set_video_info(tdc, vb_str)

    if need_number:
        if number_second == -1:
            num_second = duration + pt_second + pw_second
        else:
            num_second = number_second

    if double_fix_select and duration:
        obj.time_start = 0
        obj.time_to = duration
        duration_string = ff.millisecond_to_str(int(duration * 1000))
        set_title(format_str % ("*[双倍时长修正]该视频时长:" + duration_string))

    png_list = []
    msg_str = '正在转换 正片('
    if need_frame:
        png_list.append(["加幕布", frame_file, frame_size, 0])
    if need_watermark:
        png_list.append([" 加水印", watermark_file, water_size, 0])
    if need_number:
        t = num_second - pt_second
        png_list.append([" 加备案号", num_file, num_size, t])
    if len(png_list):
        sizes = []
        times = []
        npngs = []
        for p in png_list:
            msg_str += p[0]
            npngs.append(p[1])
            sizes.append(p[2])
            times.append(p[3])
        png_list = npngs
        obj.set_overlay(png_list, sizes, times)
    msg_str += ')……'
    msg_str = msg_str.replace('()', '')
    set_title(format_str % msg_str)

    # cases where the conversion of the feature can be skipped:
    # when no compositing option is selected, the feature only needs a single
    # transcode pass, which is handled further down
    if not need_frame and not need_watermark and not need_number and not double_fix_select:
        skip_content_mp4 = True
    else:
        skip_content_mp4 = False
        update_status(i, '20%' + num_join_short_str)
        obj.execute()

    # 2) an opening or ending clip needs to be concatenated
    if pt_select or pw_select:
        # generate concat.txt and convert the opening/ending clips
        subs = []
        # 1
        if pt_select:
            nobj = ff.create_obj()
            nobj.input_file = pt_file
            nobj.output_file = pt_out_file
            nobj.need_30m = select_30m
            nobj.need_same_bit_rate = need_same_bit_rate
            # nobj.fps = fps
            # nobj.size = v_size
            nobj.set_video_info(tdc, vb_str)
            # the filing number may need to be overlaid
            msg_str = "正在转换 片头"
            if need_number and num_second:
                msg_str += '(加备案号)'
                if pt_second < num_second:
                    nobj.set_overlay([num_file], [num_size])
                else:
                    nobj.set_overlay([num_file], [num_size], [pt_second])
            msg_str += '……'
            set_title(format_str % msg_str)
            update_status(i, '40%' + num_join_short_str)
            nobj.execute()
            subs.append(pt_out_file)
        # 2
        if skip_content_mp4:
            if fast_mode_select and ff.compare_video(raw_mp4, pt_out_file):
                subs.append(raw_mp4)  # let the feature join the final concat, but never delete it
                msg_str = "没有水印等,不转换正片,直接进行合并"
                set_title(format_str % msg_str)
            else:
                # parameters differ from the opening clip, so transcode once
                obj.set_video_info(tdc, vb_str)  # this restores most of the previous parameters
                msg_str = "正在转换 正片"
                msg_str += '……'
                set_title(format_str % msg_str)
                update_status(i, '50%' + num_join_short_str)
                obj.execute()
                subs.append(temp_video)
        else:
            subs.append(temp_video)
        # 3
        if pw_select:
            nobj = ff.create_obj()
            nobj.input_file = pw_file
            nobj.output_file = pw_out_file
            nobj.need_same_bit_rate = need_same_bit_rate
            nobj.need_30m = select_30m
            # nobj.fps = fps
            # nobj.size = v_size
            nobj.set_video_info(tdc, vb_str)
            # the filing number may need to be overlaid
            msg_str = "正在转换 片尾"
            t = pt_second + duration
            if need_number and t < num_second:
                msg_str += '(加备案号)'
                new_t = num_second - t
                nobj.set_overlay([num_file], [num_size], [new_t])
            msg_str += "……"
            set_title(format_str % msg_str)
            update_status(i, '60%' + num_join_short_str)
            nobj.execute()
            subs.append(pw_out_file)

        # concatenate the clips
        set_title(format_str % "拼接中……")
        update_status(i, '90%' + num_join_short_str)
        sub_txt = temp_dir + "concat_" + f_name + ".txt"
        ff.concat(subs, final_video, sub_txt)

        # remove concat.txt and the temporary mp4
        utils.remove_file(sub_txt)
        if not skip_content_mp4:
            utils.remove_file(temp_video)
    else:
        # no option selected: only a single transcode pass of the feature
        if skip_content_mp4:
            obj.execute()
            utils.move_file(temp_video, final_video)
        else:
            utils.move_file(temp_video, final_video)

    self.final_video = final_video
    update_status(i, 'OK')
import utils as u

u.get_file_names('./')
u.get_all_file_names("./demo_dir")
u.print_line_one(['./output.txt'])
u.print_emails(['./demo_dir/demo1.txt', './demo_dir/demo2.txt'])
u.write_headlines(['../README.md'], '../headlines.txt')
import utils

# Test get_file_names and write the result to output.txt
utils.get_file_names("/")

# Test get_all_file_names and write the result to output2.txt
utils.get_all_file_names("/", "output2.txt")

print("#### Testing the print first line of files ")
# Test print_line_one on the generated file list
utils.print_line_one("output.txt")

print("#### Testing the print emails ")
# Test print_emails
utils.print_emails("emails.txt")

# Test write_headlines on a Markdown file
utils.write_headlines("mdfile.md", "output3.txt")
                                 scale, clf, dec_thresh=0.99)
    cars, heatmap = car_tracker.update(heatmap, threshold=2.0)
    # for p1, p2 in itertools.chain(cars):
    #     # Draw SVC boxes
    #     cv2.rectangle(svc_img, p1, p2, (255, 255, 0), 3)
    svc_img = cv2.addWeighted(svc_img, 1.0, heatmap, 0.8, 0.0)
    return svc_img


if __name__ == "__main__":
    # Import car and not-car images
    cars = get_file_names('./data/vehicles', pattern='*.png')
    not_cars = get_file_names('./data/non-vehicles', pattern='*.png')

    # Calculate car features & not-car features
    car_features = get_feature(cars, workers=4)
    not_car_features = get_feature(not_cars, workers=4)

    # Create data set
    x = np.vstack((car_features, not_car_features)).astype(np.float64)
    y = np.concatenate((np.ones(len(car_features)), np.zeros(len(not_car_features))))

    # SVC classifier
    clf = SupportVectorMachineClassifier()
    clf.train(x, y)