Example #1
    def _create_partial_indices(self):
        # for logging
        start_time = datetime.now().strftime('%H:%M:%S')

        # get the location of the documents and get all of the files inside
        file_names = get_file_names(self._index_config.get_input_dir())
        for file in file_names:
            try:
                self._add_document_to_index(file)

                # offload the in-memory index to a partial index on disk
                # once it grows past the configured threshold
                if self._time_to_offload():
                    self._offload_to_partial_index()

                # log progress every 100 documents; sys.getsizeof reports
                # only the shallow size of the dict, not its contents
                if self._doc_id % 100 == 0:
                    print(
                        f"Current memory size: {sys.getsizeof(self._inverted_index)}"
                    )
                    print(f"Completed 100 files... current file: {file}")

            except JSONDecodeError:
                print(f"JSONDecodeError: Skipping file {file}")
            except UnicodeDecodeError:
                print(f"UnicodeDecodeError: Skipping file {file}")

        # offload whatever remains in the in-memory index
        self._offload_to_partial_index()

        # persist the document id mapping to disk
        dump_json_to_file(self._doc_id_map,
                          self._index_config.get_doc_id_map_path())

        # logging
        print(f"Started at: {start_time}")
        print(f"Partial indices: {self._partial_index_file_names}")
        print(f"Completed at: {datetime.now().strftime('%H:%M:%S')}")
Example #2
    # select the command-line or the visual (GUI) SUMO binary
    sumoBinary = 'sumo-gui.exe' if gui else 'sumo.exe'

    # initializations
    max_steps = 5400  # seconds = 1 h 30 min each episode
    total_episodes = 100
    num_experiments = 1
    learn = False
    traffic_gen = TrafficGenerator(max_steps)
    qmodel_filename, stats_filename = utils.get_file_names()
    init_experiment, init_epoch = utils.get_init_epoch(stats_filename,
                                                       total_episodes)
    print('init_experiment={} init_epoch={}'.format(init_experiment,
                                                    init_epoch))
    stats = utils.get_stats(stats_filename, num_experiments, total_episodes)

    for experiment in range(init_experiment, num_experiments):
        env = SumoEnv(sumoBinary, max_steps)
        tl = TLAgent(env, traffic_gen, max_steps, num_experiments,
                     total_episodes, qmodel_filename, stats, init_epoch, learn)
        init_epoch = 0  # reset init_epoch after first experiment
        if learn:
            tl.train(experiment)
        else:
            seeds = np.load('seed.npy')
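
Note that utils.get_file_names() here takes no arguments and returns a pair of filenames for the Q-model checkpoint and the training statistics, unlike the directory-listing variants in the other examples. A plausible sketch, with the concrete filenames as assumptions:

def get_file_names():
    """Return (qmodel_filename, stats_filename) for the agent.

    Sketch only; the actual filenames live in the project's utils module.
    """
    return 'qmodel.h5', 'stats.npy'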
Example #3
def refine_raw_data(raw_dataset_dir, refined_dataset_dir):
    start_time = time.time()

    map_tag_info = {}
    # map_category_tag_post = {}

    raw_names = utils.get_file_names(raw_dataset_dir)
    for i, raw_name in enumerate(raw_names):
        map_pair_tag_occurrence = {}
        category = raw_name[:raw_name.find(".")]  # name up to the first dot
        # map_tag_post = defaultdict(list)
        posts = utils.load_json(os.path.join(raw_dataset_dir, raw_name))
        print("{}/{} Refine {} ({} posts) ...".format(i + 1, len(raw_names),
                                                      category, len(posts)))

        # print(posts[0])
        for post_id, post in enumerate(posts):
            tags = post.get("Tags", [])

            # Update tag info
            # for tag in tags:
            #     map_tag_post[tag].append(post_id)

            # map_cat_ids = map_tag_info.get(tag)
            # if map_cat_ids is None:
            #     map_cat_ids = {}
            #     map_tag_info[tag] = map_cat_ids
            # ids = map_cat_ids.get(category)
            # if ids is None:
            #     ids = []
            #     map_cat_ids[category] = ids
            # ids.append(post_id)

            # Update tag co-occurrence counts (pair keys are order-normalized)
            for tag1, tag2 in get_all_pairs(tags):
                pair = (min(tag1, tag2), max(tag1, tag2))
                map_pair_tag_occurrence[pair] = \
                    map_pair_tag_occurrence.get(pair, 0) + 1

        # map_category_tag_post[category] = dict(map_tag_post)
        save_path = os.path.join(refined_dataset_dir, "Pair_Tag",
                                 "{}.csv".format(category))
        rows = [(tag1, tag2, num_occ)
                for (tag1, tag2), num_occ in map_pair_tag_occurrence.items()]
        df = pd.DataFrame(rows, columns=["Tag1", "Tag2", "Num_Occurrence"])
        utils.save_csv(df, save_path)
        print("Total pair_tags : ", len(map_pair_tag_occurrence))
        # break

    # Save result
    # save_path = os.path.join(refined_dataset_dir, "category_tag_post.json")
    # utils.save_json(map_category_tag_post, save_path)

    # save_path = os.path.join(refined_dataset_dir, "tags_info.json")
    # utils.save_json(map_tag_info, save_path)
    #
    # save_path = os.path.join(refined_dataset_dir, "tags_relationship.csv")
    # df = []
    # for (tag1, tag2), occ in map_pair_tag_occurrence.items():
    #     df.append((tag1, tag2, occ))
    # df = pd.DataFrame(df, columns=["Tag1", "Tag2", "Occurrence"])
    # utils.save_csv(df, save_path)

    print("Total tags  all category : ", len(map_tag_info))

    exec_time = time.time() - start_time
    print("Time : {:.2f} seconds".format(exec_time))
Example #4
import utils

# Test get_file_names: writes the file names to output.txt
utils.get_file_names("/home/jovyan/my_notebooks")

# Test get_all_file_names: writes all file names to output2.txt
utils.get_all_file_names("/home/jovyan/my_notebooks", "output2.txt")

print("#### Testing the print first line of files ")
# Testing the print first line of files
utils.print_line_one("output.txt")

print("####")

print("#### Testing the print emails ")
# Testing the print emails
utils.print_emails("emails.txt")

# Test write_headlines: extracts Markdown headlines into output3.txt
utils.write_headlines("md_test_file.md", "output3.txt")
Example #5
def load_data(args, retriever, tokenizer, retriever_tokenizer):
    print("Loading data...")
    start_time = time.time()
    data_dir = (args.data_dir + '_' + args.experiment_name
                if args.data_name == 'synthetic' else args.data_dir)
    train_name, dev_name, test_name = utils.get_file_names(args.data_name)
    train_path = os.path.join(data_dir, train_name)
    dev_path = os.path.join(data_dir, dev_name)
    test_path = os.path.join(data_dir, test_name)
    make_data_function = get_make_data_function(args.data_name)
    train_dataset, train_info = make_data_function(args,
                                                   retriever,
                                                   tokenizer,
                                                   retriever_tokenizer,
                                                   file_path=train_path)
    dev_dataset, dev_info = make_data_function(args,
                                               None,
                                               tokenizer,
                                               retriever_tokenizer,
                                               file_path=dev_path)
    test_dataset, test_info = make_data_function(args,
                                                 None,
                                                 tokenizer,
                                                 retriever_tokenizer,
                                                 file_path=test_path)
    load_time = (time.time() - start_time) / 60
    print(f"Loading data took {load_time:.2f} minutes")
    print("Data info:")
    for split_name, info in zip(['train', 'dev', 'test'],
                                [train_info, dev_info, test_info]):
        n, n_classes = info['n'], info['n_classes']
        label_dist = [round(100 * x, 2) for x in info['label_dist'].values()]
        print(
            f'  {split_name}: {n} points | {n_classes} classes | label distribution : {label_dist}'
        )
    train_dataloader = DataLoader(TensorDataset(*train_dataset),
                                  shuffle=True,
                                  batch_size=args.train_batch_size,
                                  num_workers=4,
                                  pin_memory=True)
    dev_dataloader = DataLoader(TensorDataset(*dev_dataset),
                                shuffle=False,
                                batch_size=args.test_batch_size,
                                num_workers=4,
                                pin_memory=True)
    test_dataloader = DataLoader(TensorDataset(*test_dataset),
                                 shuffle=False,
                                 batch_size=args.test_batch_size,
                                 num_workers=4,
                                 pin_memory=True)
    if args.eval_on_train:
        # evaluate on the training split; note that test_dataloader is
        # reassigned to itself here, so only dev_dataloader actually changes
        dev_dataloader, test_dataloader = train_dataloader, test_dataloader
    # load separate explanation data for RE into retriever
    if args.task_type == 'RE' and args.use_retrieval:
        exp_file_path = os.path.join(
            args.data_dir, 'semeval_exp.json'
            if args.data_name == 'semeval' else 'tacred_exp_orig.json')
        _, exp_info = utils.make_RE_data(args, retriever, tokenizer,
                                         retriever_tokenizer, exp_file_path,
                                         train_path)
        n, n_classes = exp_info['n'], exp_info['n_classes']
        label_dist = [round(100 * x, 2)
                      for x in exp_info['label_dist'].values()]
        print(
            f'  Exp info: {n} points | {n_classes} classes | label distribution : {label_dist}'
        )
    return train_dataloader, dev_dataloader, test_dataloader
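
In this example utils.get_file_names maps a dataset name to its three split filenames. A plausible sketch; the concrete naming scheme is an assumption, not the project's real layout:

def get_file_names(data_name):
    """Return (train, dev, test) filenames for a dataset name.

    Sketch only; assumes JSON splits named after the dataset.
    """
    return ('{}_train.json'.format(data_name),
            '{}_dev.json'.format(data_name),
            '{}_test.json'.format(data_name))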
Example #6
    def processConcat(self, dc, astr=''):
        setTitle = self.setTitle2

        list1 = dc["list1"]
        list2 = dc["list2"]
        fast_mode_select = dc["fast_mode_select"]
        outputDir = dc["output_dir"] + os.sep

        tempDir = outputDir + 'tempDir' + os.sep

        # keep the two lists the same length
        minLen = min(len(list1), len(list2))
        list1 = list1[0:minLen]
        list2 = list2[0:minLen]

        finalMP4 = ""
        pStr = ""

        # set param=-c:v libx264 -s 1920x1080 -r 24 -b:v 6144k -b:a 128k -ar 44100 -ac 2 -preset slower -threads 8
        FFStr = '''ffmpeg -y -i "{input}" -c:v libx264 -s {v_size} -crf 18 -r {fps} -b:a 128k -ar 44100 -ac 2 -threads 8 "{output}"'''
        FFConcat = '''ffmpeg -y -f concat -safe 0 -i "{0}" -c copy "{1}"'''
        seq = ('input', 'output', 'v_size', 'fps')

        utils.make_dir(tempDir)
        utils.hide_file(tempDir)
        total = len(list1)
        count = 0
        msgStr = " ({0}/{1}) {2}"
        print(list1)  # debug: show the pending file list
        status = []
        for i in range(len(list1)):
            count += 1
            status.append('')

            fileA = list1[i]
            fileB = list2[i]

            arr = utils.get_file_names(fileA)
            fnameA = arr[1]
            # ftypeA = arr[2]
            ftempA = tempDir + "-" + fnameA + ".mp4"

            arr = utils.get_file_names(fileB)
            fnameB = arr[1]
            # ftypeB = arr[2]
            ftempB = tempDir + "-" + fnameB + ".mp4"

            fullName = fnameA + '__' + fnameB
            finalMP4 = outputDir + fullName + ".mp4"
            subTxt = tempDir + "concat_" + fullName + ".txt"

            # task progress message
            mstr = msgStr.format(count, total, fullName)
            setTitle(mstr)

            # use the first video's size and frame rate as the baseline
            # !!! every video gets transcoded once
            # note: rebinding dc here shadows the dc parameter, which has
            # already been fully unpacked above
            dc = dict.fromkeys(seq, "")
            dcinfo = ff.get_video_info(fileA, False)
            dc['fps'] = dcinfo['fps'] if dcinfo['fps'] else '24'
            dc['v_size'] = dcinfo['v_size'] if dcinfo['v_size'] else '1920x1080'

            # check whether the two videos already share the same parameters
            isSame = False
            if fast_mode_select:
                isSame = ff.compare_video(fileA, fileB)
            # build the concat.txt entries, converting both clips when needed
            subs = []
            sub = "file '{0}'\n"
            if not isSame:
                # convert the first video
                mstr = msgStr.format(count, total, "Converting the first video...")
                setTitle(mstr)
                status[i] = '10%'
                self.updateCenter(status)

                dc['input'] = fileA
                dc['output'] = ftempA
                pStr = FFStr.format(**dc)
                ff.execute(pStr)

                # convert the second video
                mstr = msgStr.format(count, total, "Converting the second video...")
                setTitle(mstr)
                status[i] = '50%'
                self.updateCenter(status)

                dc['input'] = fileB
                dc['output'] = ftempB
                pStr = FFStr.format(**dc)
                ff.execute(pStr)

                subs.append(sub.format(ftempA))
                subs.append(sub.format(ftempB))
            else:
                mstr = msgStr.format(count, total, "参数相同,跳过转换,直接拼接!")
                setTitle(mstr)

                subs.append(sub.format(fileA))
                subs.append(sub.format(fileB))

            # write the concat file
            utils.write_txt(subTxt, subs)

            # concatenate the two videos
            mstr = msgStr.format(count, total, "Concatenating...")
            setTitle(mstr)
            status[i] = '90%'
            self.updateCenter(status)

            pStr = FFConcat.format(subTxt, finalMP4)
            ff.execute(pStr)
            # print(pStr)

            sstr = 'Success' if os.path.exists(finalMP4) else 'Failed'
            status[i] = sstr
            self.updateCenter(status)

            # remove concat.txt and the temporary mp4 files
            utils.remove_file(subTxt)
            utils.remove_file(ftempA)
            utils.remove_file(ftempB)

        setTitle("操作结束!")
        setTitle("")

        # automatically open the output directory
        if finalMP4:
            utils.open_dir(outputDir)

        self.t1 = ""
        self.lockBtn(False)
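
In this example and the next, utils.get_file_names splits a single path rather than listing a directory: index 1 is the bare file name and index 2 the extension. A sketch consistent with those accesses, assuming index 0 holds the directory:

import os

def get_file_names(path):
    """Split a path into [directory, bare name, extension].

    Sketch matching the arr[1]/arr[2] accesses above; the real helper
    may return additional fields.
    """
    directory, full_name = os.path.split(path)
    name, ext = os.path.splitext(full_name)
    return [directory, name, ext]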
Example #7
    def once_complex(self, dc, one_dc):
        set_title = self.start_btn.update_query
        update_status = self.update_status

        need_number = one_dc['need_number']
        num_file = one_dc["number_file"]
        num_size = one_dc['number_size']
        num_join_str = one_dc['number_join_str']
        num_join_short_str = one_dc['number_join_short_str']
        if not num_join_short_str:
            num_join_short_str = ''
        else:
            num_join_short_str = " " + num_join_short_str

        num_second = 0
        # the literal below means "iQIYI registration number"
        is_iqy = (num_join_str == '爱奇艺备案号')
        raw_mp4 = one_dc['rawMP4']
        i = one_dc['index']
        number_second = int(dc["number_second"])
        total = one_dc['total']

        out_dir = one_dc['output_dir']
        temp_dir = one_dc['temp_dir']
        pt_second = one_dc['pt_second']
        pw_second = one_dc['pw_second']
        pt_out_file = one_dc['pt_out_file']
        pw_out_file = one_dc['pw_out_file']
        frame_size = one_dc['frame_size']
        water_size = one_dc['water_size']

        # map the fps radio-button value to a frame rate ('0' = keep source)
        rad_var = dc['fps']
        fps = {2: '24', 3: '25', 4: '30'}.get(rad_var, '0')
        target_fps = fps
        radio_select_var = dc["bit"]

        pt_file = dc["pt_file"]
        pw_file = dc["pw_file"]
        frame_file = dc["frame_file"]
        watermark_file = dc["watermark_file"]

        pt_select = dc['pt_select']
        pw_select = dc['pw_select']
        need_frame = dc["frame_select"]
        need_watermark = dc["watermark_select"]

        double_fix_select = utils.str_to_bool(dc["select_double_fix"])
        select_30m = utils.str_to_bool(dc["select_30m"])
        fast_mode_select = False
        # fast_mode_select = dc['fast_mode_select']

        # skip_content_mp4 = False
        count = i + 1

        set_title("")
        format_str = "(%d/%d)" % (count, total) + ' %s'

        arr = utils.get_file_names(raw_mp4)
        f_name = arr[1]
        f_type = arr[2]
        f_full_name = f_name + f_type

        out_file_type = ".mpg" if select_30m else ".mp4"
        temp_video = temp_dir + "-" + f_name + out_file_type
        final_video = out_dir + f_name + out_file_type

        if need_number and num_join_str:
            temp_path = Path(out_dir) / num_join_str
            temp_path = str(temp_path) + os.sep
            utils.make_dir(temp_path)
            final_video = temp_path + f_name + out_file_type

        vb_str = ""
        need_same_bit_rate = False
        # 1) convert the main feature
        set_title(format_str % f_full_name)
        update_status(i, '10%' + num_join_short_str)

        # match the source size and fps
        tdc = ff.get_video_info(raw_mp4, False)
        v_size = tdc["v_size"] if tdc["v_size"] else "1920x1080"
        tdc["v_size"] = v_size
        fps = tdc["fps"] if tdc["fps"] else "24"
        tdc["fps"] = fps if target_fps == '0' else target_fps
        duration = tdc['duration'] if tdc["duration"] else '0'
        duration = float(duration)

        if is_iqy:
            vb_str = "8M"
        else:
            # bit-rate selection
            if radio_select_var == 1:  # keep the source bit rate
                need_same_bit_rate = True
                # tdc["crf"] = 1
                vb_str = ''

            elif radio_select_var == 2:  # automatic (CRF-based)
                tdc["crf"] = 18
                vb_str = ''

            elif radio_select_var == 3:
                vb_str = "4M"

            elif radio_select_var == 4:
                vb_str = "6M"

            elif radio_select_var == 5:
                vb_str = "8M"

            elif radio_select_var == 6:
                vb_str = "10M"

            elif radio_select_var == 7:
                vb_str = "30M"

        obj = ff.create_obj()
        obj.input_file = raw_mp4
        obj.output_file = temp_video
        obj.need_same_bit_rate = need_same_bit_rate
        obj.need_30m = select_30m
        # obj.set_video_info(tdc)
        # obj.fps = fps
        # obj.size = v_size
        obj.set_video_info(tdc, vb_str)

        if need_number:
            if number_second == -1:
                num_second = duration + pt_second + pw_second
            else:
                num_second = number_second

        if double_fix_select and duration:
            obj.time_start = 0
            obj.time_to = duration
            duration_string = ff.millisecond_to_str(int(duration * 1000))
            set_title(format_str % ("*[双倍时长修正]该视频时长:" + duration_string))

        png_list = []
        msg_str = 'Converting the main feature ('
        if need_frame:
            png_list.append(["加幕布", frame_file, frame_size, 0])
        if need_watermark:
            png_list.append([" 加水印", watermark_file, water_size, 0])
        if need_number:
            t = num_second - pt_second
            png_list.append([" 加备案号", num_file, num_size, t])
        if len(png_list):
            sizes = []
            times = []
            npngs = []
            for p in png_list:
                msg_str += p[0]
                npngs.append(p[1])
                sizes.append(p[2])
                times.append(p[3])
            png_list = npngs
            obj.set_overlay(png_list, sizes, times)

            msg_str += ')...'
            msg_str = msg_str.replace('()', '')
            set_title(format_str % msg_str)

        # cases where the main feature itself need not be transcoded:
        # when no compositing option is selected, the feature is passed
        # through here and handled during the concat step below
        if not need_frame and not need_watermark and not need_number and not double_fix_select:
            skip_content_mp4 = True
        else:
            skip_content_mp4 = False
            update_status(i, '20%' + num_join_short_str)
            obj.execute()

        # 2) an intro or an outro needs to be merged in
        if pt_select or pw_select:
            # build concat.txt and convert the intro/outro
            subs = []
            # 1
            if pt_select:
                nobj = ff.create_obj()
                nobj.input_file = pt_file
                nobj.output_file = pt_out_file
                nobj.need_30m = select_30m
                nobj.need_same_bit_rate = need_same_bit_rate
                # nobj.fps = fps
                # nobj.size = v_size
                nobj.set_video_info(tdc, vb_str)
                # add the registration number overlay if needed
                msg_str = "Converting the intro"
                if need_number and num_second:
                    msg_str += ' (adding registration number)'
                    if pt_second < num_second:
                        nobj.set_overlay([num_file], [num_size])
                    else:
                        nobj.set_overlay([num_file], [num_size], [pt_second])

                msg_str += '...'
                set_title(format_str % msg_str)
                update_status(i, '40%' + num_join_short_str)
                nobj.execute()
                subs.append(pt_out_file)
            # 2
            if skip_content_mp4:
                if fast_mode_select and ff.compare_video(raw_mp4, pt_out_file):
                    subs.append(raw_mp4)  # joins the final concat; must not be deleted
                    msg_str = "No overlays; merging the main feature without transcoding"
                    set_title(format_str % msg_str)
                else:
                    # parameters differ from the intro, so transcode once;
                    # set_video_info restores most of the earlier settings
                    obj.set_video_info(tdc, vb_str)
                    msg_str = "Converting the main feature..."
                    set_title(format_str % msg_str)
                    update_status(i, '50%' + num_join_short_str)
                    obj.execute()
                    subs.append(temp_video)
            else:
                subs.append(temp_video)

            # 3
            if pw_select:
                nobj = ff.create_obj()
                nobj.input_file = pw_file
                nobj.output_file = pw_out_file
                nobj.need_same_bit_rate = need_same_bit_rate
                nobj.need_30m = select_30m
                # nobj.fps = fps
                # nobj.size = v_size
                nobj.set_video_info(tdc, vb_str)

                # add the registration number overlay if needed
                msg_str = "Converting the outro"
                t = pt_second + duration
                if need_number and t < num_second:
                    msg_str += ' (adding registration number)'
                    new_t = num_second - t
                    nobj.set_overlay([num_file], [num_size], [new_t])
                msg_str += "……"
                set_title(format_str % msg_str)
                update_status(i, '60%' + num_join_short_str)
                nobj.execute()
                subs.append(pw_out_file)

            # concatenate the videos
            set_title(format_str % "Concatenating...")
            update_status(i, '90%' + num_join_short_str)
            sub_txt = temp_dir + "concat_" + f_name + ".txt"
            ff.concat(subs, final_video, sub_txt)
            # remove concat.txt and the temporary mp4
            utils.remove_file(sub_txt)
            if not skip_content_mp4:
                utils.remove_file(temp_video)
        else:
            # no intro/outro selected: just transcode the main feature once
            if skip_content_mp4:
                # nothing was transcoded earlier, so run the conversion now
                obj.execute()
                utils.move_file(temp_video, final_video)
            else:
                utils.move_file(temp_video, final_video)
        self.final_video = final_video
        update_status(i, 'OK')
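
This example relies on utils.str_to_bool to decode checkbox state stored as strings. A one-line sketch, assuming the usual truthy spellings; the helper's real contract belongs to the project:

def str_to_bool(value):
    """Interpret common truthy spellings of a flag stored as a string."""
    return str(value).strip().lower() in ('1', 'true', 'yes', 'on')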
Example #8
import utils as u

u.get_file_names('./')                      # list files in the current directory
u.get_all_file_names("./demo_dir")          # list files under demo_dir
u.print_line_one(['./output.txt'])          # first line of each listed file
u.print_emails(['./demo_dir/demo1.txt', './demo_dir/demo2.txt'])
u.write_headlines(['../README.md'], '../headlines.txt')
Example #9
import utils

# Test get_file_names: writes the file names to output.txt
utils.get_file_names("/")

# Test get_all_file_names: writes all file names to output2.txt
utils.get_all_file_names("/", "output2.txt")

print("#### Testing the print first line of files ")
# Testing the print first line of files
utils.print_line_one("output.txt")

print("#### Testing the print emails ")
# Testing the print emails
utils.print_emails("emails.txt")

# Test write_headlines: extracts Markdown headlines into output3.txt
utils.write_headlines("mdfile.md", "output3.txt")
Example #10
                                 scale,
                                 clf,
                                 dec_thresh=0.99)
    cars, heatmap = car_tracker.update(heatmap, threshold=2.0)

    # for p1, p2 in itertools.chain(cars):
    #     # Draw SVC boxes
    #     cv2.rectangle(svc_img, p1, p2, (255, 255, 0), 3)

    svc_img = cv2.addWeighted(svc_img, 1.0, heatmap, 0.8, 0.0)
    return svc_img


if __name__ == "__main__":
    # Load car and non-car image paths
    cars = get_file_names('./data/vehicles', pattern='*.png')
    not_cars = get_file_names('./data/non-vehicles', pattern='*.png')

    # Calculate car features & not-car features
    car_features = get_feature(cars, workers=4)
    not_car_features = get_feature(not_cars, workers=4)

    # Create data set
    x = np.vstack((car_features, not_car_features)).astype(np.float64)
    y = np.concatenate(
        (np.ones(len(car_features)), np.zeros(len(not_car_features))))

    # SVC classifier
    clf = SupportVectorMachineClassifier()
    clf.train(x, y)
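
Here get_file_names takes a directory and a filename pattern. A sketch built on glob; recursing into subdirectories is an assumption:

import glob
import os

def get_file_names(directory, pattern='*.png'):
    """Return all paths under directory whose names match pattern."""
    return glob.glob(os.path.join(directory, '**', pattern), recursive=True)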