    def kfold_validation(self, k=10):

        # use at most ~90% of the currently available system memory, converted from bytes to GB
        available_ram = psutil.virtual_memory()[1]
        available_ram = int(int(available_ram) * .9 * 1e-9)

        if available_ram > 5:
            jvm.start(max_heap_size='5g')
        else:
            print(
                'It seems your machine has less than 5 GB of RAM available:\n'
            )
            print('Cannot start JVM.')
            sys.exit()

        ###

        print('\nLoading ' + self.input_file + ' with options -f' +
              str(self.features_number) + ' -c' + self.classifier_name + '\n')
        # load .arff file
        dataset = arff.load(open(self.input_file, 'r'))
        data = np.array(dataset['data'])

        self.features_names = [x[0] for x in dataset['attributes']]

        self.attributes_number = data.shape[1]
        self.dataset_features_number = self.attributes_number - self.levels_number

        # Factorization of Nominal features_index
        encoder = CategoricalEncoder(encoding='ordinal')
        nominal_features_index = [
            i for i in range(len(dataset['attributes'][:-self.levels_number]))
            if dataset['attributes'][i][1] != u'NUMERIC'
        ]
        if len(nominal_features_index) > 0:
            data[:, nominal_features_index] = encoder.fit_transform(
                data[:, nominal_features_index])

        prediction = []
        probability = []
        oracle = []

        print('\n***\nStart testing with ' + str(k) +
              '-fold cross-validation -f' + str(self.features_number) + ' -c' +
              self.classifier_name + '\n***\n')

        bar = progressbar.ProgressBar(maxval=k,
                                      widgets=[
                                          progressbar.Bar('=', '[', ']'), ' ',
                                          progressbar.Percentage()
                                      ])
        bar.start()

        temp_metrics = []

        skf = StratifiedKFold(n_splits=k, shuffle=True)
        bar_cnt = 0
        for train_index, test_index in skf.split(
                data,
                data[:, self.dataset_features_number + self.tag_under_test]):

            self.training_set = data[
                train_index, :self.dataset_features_number]
            self.testing_set = data[test_index, :self.dataset_features_number]
            self.ground_through = data[train_index,
                                       self.dataset_features_number +
                                       self.tag_under_test]
            self.oracle = data[test_index, self.dataset_features_number +
                               self.tag_under_test]
            self.prediction = np.ndarray(shape=[len(test_index), 1],
                                         dtype='<U24')
            self.probability = np.ndarray(
                shape=[len(test_index),
                       len(set(self.ground_through))],
                dtype='<U24')

            classifier_to_call = getattr(
                self, supported_classifiers[self.classifier_name])
            classifier_to_call()

            prediction.append(self.prediction)
            probability.append(self.probability)
            oracle.append(self.oracle)

            bar_cnt += 1
            bar.update(bar_cnt)

        bar.finish()

        relations = []
        relations.append({  # Lv2:Lv1
            u'Tor': u'Tor',
            u'TorPT': u'Tor',
            u'TorApp': u'Tor',
            u'I2PApp80BW': u'I2P',
            u'I2PApp0BW': u'I2P',
            u'I2PApp': u'I2P',
            u'JonDonym': u'JonDonym'
        })

        relations.append({  # Lv3:Lv2
            u'JonDonym': u'JonDonym',
            u'I2PSNARK_App80BW': u'I2PApp80BW',
            u'IRC_App80BW': u'I2PApp80BW',
            u'Eepsites_App80BW': u'I2PApp80BW',
            u'I2PSNARK_App0BW': u'I2PApp0BW',
            u'IRC_App0BW': u'I2PApp0BW',
            u'Eepsites_App0BW': u'I2PApp0BW',
            u'I2PSNARK_App': u'I2PApp',
            u'IRC_App': u'I2PApp',
            u'Eepsites_App': u'I2PApp',
            u'ExploratoryTunnels_App': u'I2PApp',
            u'ParticipatingTunnels_App': u'I2PApp',
            u'Tor': u'Tor',
            u'Streaming': u'TorApp',
            u'Torrent': u'TorApp',
            u'Browsing': u'TorApp',
            u'Flashproxy': u'TorPT',
            u'FTE': u'TorPT',
            u'Meek': u'TorPT',
            u'Obfs3': u'TorPT',
            u'scramblesuit': u'TorPT'
        })

        oracle_inferred = []
        prediction_inferred = []

        for i in range(self.tag_under_test):
            oracle_inferred.append(list())
            prediction_inferred.append(list())

        # Inferring superior levels
        for i in range(k):
            # Copy predictions and oracle into temporaries so the consecutive label swaps do not alter the originals
            inferred_prediction = prediction[i].copy()
            inferred_oracle = oracle[i].copy()
            for j in reversed(range(self.tag_under_test)):
                inferred_oracle = np.vectorize(relations[j].get)(
                    list(inferred_oracle))
                inferred_prediction = np.vectorize(relations[j].get)(
                    list(inferred_prediction))
                oracle_inferred[j].append(inferred_oracle)
                prediction_inferred[j].append(inferred_prediction)
        print('\n***\nStart testing with incremental gamma threshold\n***\n')

        bar = progressbar.ProgressBar(maxval=9,
                                      widgets=[
                                          progressbar.Bar('=', '[', ']'), ' ',
                                          progressbar.Percentage()
                                      ])
        bar.start()

        oracle_gamma = []
        prediction_gamma = []
        classified_ratio = []

        # gamma thresholds 0.1 .. 0.9: samples whose maximum class probability
        # is below gamma are left unclassified (rejected)
        for i in range(9):
            gamma = float(i + 1) / 10.0

            oracle_gamma.append(list())
            prediction_gamma.append(list())
            classified_ratio.append(list())

            for j in range(k):
                indexes = []
                p_cnt = 0
                for p in probability[j]:
                    if max(p) < gamma:
                        indexes.append(p_cnt)
                    p_cnt += 1
                gamma_oracle = np.delete(oracle[j], indexes)
                gamma_prediction = np.delete(prediction[j], indexes)
                oracle_gamma[i].append(gamma_oracle)
                prediction_gamma[i].append(gamma_prediction)
                classified_ratio[i].append(
                    float(len(gamma_prediction)) / float(len(prediction[j])))

            bar.update(i + 1)

        bar.finish()

        data_folder = './data_' + self.classifier_name + '/material/'

        if not os.path.exists('./data_' + self.classifier_name):
            os.makedirs('./data_' + self.classifier_name)
            os.makedirs(data_folder)
        elif not os.path.exists(data_folder):
            os.makedirs(data_folder)

        if self.packets_number != 0:
            file = open(
                data_folder + 'flat_early_level_' + str(self.level_target) +
                '_p_' + str(self.packets_number) + '.dat', 'w+')
        else:
            file = open(
                data_folder + 'flat_flow_level_' + str(self.level_target) +
                '_f_' + str(self.features_number) + '.dat', 'w+')

        for i in range(k):
            file.write('@fold\n')
            for o, p in zip(oracle[i], prediction[i]):
                file.write(str(o) + ' ' + str(p) + '\n')

        file.close()

        for i in range(self.tag_under_test):

            if self.packets_number != 0:
                file = open(
                    data_folder + 'flat_early_level_' +
                    str(self.level_target) + '_p_' + str(self.packets_number) +
                    '_inferred_' + str(i + 1) + '.dat', 'w+')
            else:
                file = open(
                    data_folder + 'flat_flow_level_' + str(self.level_target) +
                    '_f_' + str(self.features_number) + '_inferred_' +
                    str(i + 1) + '.dat', 'w+')

            for j in range(k):
                file.write('@fold\n')
                for o, p in zip(oracle_inferred[i][j],
                                prediction_inferred[i][j]):
                    file.write(str(o) + ' ' + str(p) + '\n')

            file.close()

        for i in range(9):
            if self.packets_number != 0:
                file = open(
                    data_folder + 'flat_early_level_' +
                    str(self.level_target) + '_p_' + str(self.packets_number) +
                    '_gamma_' + str(float(i + 1) / 10.0) + '.dat', 'w+')
            else:
                file = open(
                    data_folder + 'flat_flow_level_' + str(self.level_target) +
                    '_f_' + str(self.features_number) + '_gamma_' +
                    str(float(i + 1) / 10.0) + '.dat', 'w+')

            for j in range(k):
                file.write('@fold_cr\n')
                file.write(str(classified_ratio[i][j]) + '\n')
                for o, p in zip(oracle_gamma[i][j], prediction_gamma[i][j]):
                    file.write(str(o) + ' ' + str(p) + '\n')

            file.close()

        ###

        jvm.stop()
    def draw_progress_bar(self, value):
        bar = progressbar.ProgressBar(maxval=value,
                                      widgets=[
                                          progressbar.Bar('=', '[', ']'), ' ',
                                          progressbar.Percentage()
                                      ])
        bar.start()
        for i in range(value):
            bar.update(i + 1)
            sleep(0.2)
        bar.finish()
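
# A small, hypothetical helper (not part of the original code) showing how the
# '@fold'-delimited .dat files written by kfold_validation() above could be read
# back. It covers the plain per-fold files ("oracle prediction" pairs), not the
# '@fold_cr' gamma variant, and the path argument is a placeholder.
def read_fold_file(path):
    folds, current = [], None
    with open(path) as fh:
        for line in fh:
            line = line.strip()
            if line == '@fold':
                current = []
                folds.append(current)
            elif line:
                oracle_label, predicted_label = line.split(' ', 1)
                current.append((oracle_label, predicted_label))
    return folds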
Example #3
def retrack_reuse_data_association(
    h5_filename=None,
    output_h5_filename=None,
    kalman_filename=None,
    start=None,
    stop=None,
    less_ram=False,
    show_progress=False,
    show_progress_json=False,
):
    if os.path.exists(output_h5_filename):
        raise RuntimeError("will not overwrite old file '%s'" %
                           output_h5_filename)

    ca = core_analysis.get_global_CachingAnalyzer()
    with ca.kalman_analysis_context(kalman_filename,
                                    data2d_fname=h5_filename) as h5_context:
        R = h5_context.get_reconstructor()
        if less_ram:
            ML_estimates_2d_idxs = h5_context.get_pytable_node(
                "ML_estimates_2d_idxs")
        else:
            ML_estimates_2d_idxs = h5_context.load_entire_table(
                "ML_estimates_2d_idxs")
        use_obj_ids = h5_context.get_unique_obj_ids()
        extra = h5_context.get_extra_info()
        dt = 1.0 / extra["frames_per_second"]
        dynamic_model_name = extra["dynamic_model_name"]
        kalman_model = dynamic_models.get_kalman_model(name=dynamic_model_name,
                                                       dt=dt)
        kalman_model["max_frames_skipped"] = 2**62  # close to max i64

        fps = extra["frames_per_second"]
        camn2cam_id, cam_id2camns = h5_context.get_caminfo_dicts()

        parsed = h5_context.read_textlog_header()
        if "trigger_CS3" not in parsed:
            parsed["trigger_CS3"] = "unknown"

        textlog_save_lines = [
            "retrack_reuse_data_association running at %s fps, (top %s, trigger_CS3 %s, flydra_version %s)"
            % (
                str(fps),
                str(parsed.get("top", "unknown")),
                str(parsed["trigger_CS3"]),
                flydra_analysis.version.__version__,
            ),
            "original file: %s" % (kalman_filename, ),
            "dynamic model: %s" % (dynamic_model_name, ),
            "reconstructor file: %s" % (kalman_filename, ),
        ]

        with open_file_safe(
                output_h5_filename,
                mode="w",
                title="tracked Flydra data file",
                delete_on_error=True,
        ) as output_h5:

            h5saver = KalmanSaver(
                output_h5,
                R,
                cam_id2camns=cam_id2camns,
                min_observations_to_save=0,
                textlog_save_lines=textlog_save_lines,
                dynamic_model_name=dynamic_model_name,
                dynamic_model=kalman_model,
            )

            # associate framenumbers with timestamps using 2d .h5 file
            if less_ram:
                data2d = h5_context.get_pytable_node("data2d_distorted",
                                                     from_2d_file=True)
                h5_framenumbers = data2d.cols.frame[:]
            else:
                data2d = h5_context.load_entire_table("data2d_distorted",
                                                      from_2d_file=True)
                h5_framenumbers = data2d["frame"]
            h5_frame_qfi = result_utils.QuickFrameIndexer(h5_framenumbers)

            if show_progress:
                string_widget = StringWidget()
                objs_per_sec_widget = progressbar.FileTransferSpeed(
                    unit="obj_ids ")
                widgets = [
                    string_widget,
                    objs_per_sec_widget,
                    progressbar.Percentage(),
                    progressbar.Bar(),
                    progressbar.ETA(),
                ]
                pbar = progressbar.ProgressBar(
                    widgets=widgets, maxval=len(use_obj_ids)).start()

            for obj_id_enum, obj_id in enumerate(use_obj_ids):
                if show_progress:
                    string_widget.set_string("[obj_id: % 5d]" % obj_id)
                    pbar.update(obj_id_enum)
                if show_progress_json and obj_id_enum % 100 == 0:
                    rough_percent_done = float(obj_id_enum) / len(
                        use_obj_ids) * 100.0
                    result_utils.do_json_progress(rough_percent_done)

                tro = None
                first_frame_per_obj = True
                obj_3d_rows = h5_context.load_dynamics_free_MLE_position(
                    obj_id)
                for this_3d_row in obj_3d_rows:
                    # iterate over each sample in the current camera
                    framenumber = this_3d_row["frame"]
                    if start is not None and framenumber < start:
                        continue
                    if stop is not None and framenumber > stop:
                        continue
                    h5_2d_row_idxs = h5_frame_qfi.get_frame_idxs(framenumber)
                    if len(h5_2d_row_idxs) == 0:
                        # At the start, there may be 3d data without 2d data.
                        continue

                    # If there was a 3D ML estimate, there must be 2D data.

                    frame2d = data2d[h5_2d_row_idxs]

                    obs_2d_idx = this_3d_row["obs_2d_idx"]
                    kobs_2d_data = ML_estimates_2d_idxs[int(obs_2d_idx)]

                    # Parse VLArray.
                    this_camns = kobs_2d_data[0::2]
                    this_camn_idxs = kobs_2d_data[1::2]

                    # Now, for each camera viewing this object at this
                    # frame, extract images.
                    observation_camns = []
                    observation_idxs = []
                    data_dict = {}
                    used_camns_and_idxs = []
                    cam_ids_and_points2d = []

                    for camn, frame_pt_idx in zip(this_camns, this_camn_idxs):
                        try:
                            cam_id = camn2cam_id[camn]
                        except KeyError:
                            warnings.warn("camn %d not found" % (camn, ))
                            continue

                        # find 2D point corresponding to object
                        cond = (frame2d["camn"] == camn) & (
                            frame2d["frame_pt_idx"] == frame_pt_idx)
                        idxs = np.nonzero(cond)[0]
                        if len(idxs) == 0:
                            # no frame for that camera (start or stop of file)
                            continue
                        elif len(idxs) > 1:
                            print(
                                "MEGA WARNING MULTIPLE 2D POINTS\n",
                                camn,
                                frame_pt_idx,
                                "\n\n",
                            )
                            continue

                        idx = idxs[0]

                        frame2d_row = frame2d[idx]
                        x2d_real = frame2d_row["x"], frame2d_row["y"]
                        pt_undistorted = R.undistort(cam_id, x2d_real)
                        x2d_area = frame2d_row["area"]

                        observation_camns.append(camn)
                        observation_idxs.append(idx)
                        candidate_point_list = []
                        data_dict[camn] = candidate_point_list
                        used_camns_and_idxs.append((camn, frame_pt_idx, None))

                        # with no orientation
                        observed_2d = (pt_undistorted[0], pt_undistorted[1],
                                       x2d_area)

                        cam_ids_and_points2d.append((cam_id, observed_2d))

                    if first_frame_per_obj:
                        if len(cam_ids_and_points2d) < 2:
                            warnings.warn(
                                "some 2D data seems to be missing, cannot completely reconstruct"
                            )
                        else:
                            X3d = R.find3d(
                                cam_ids_and_points2d,
                                return_line_coords=False,
                                simulate_via_tracking_dynamic_model=
                                kalman_model,
                            )

                            # first frame
                            tro = TrackedObject(
                                R,
                                obj_id,
                                framenumber,
                                X3d,  # obs0_position
                                None,  # obs0_Lcoords
                                observation_camns,  # first_observation_camns
                                observation_idxs,  # first_observation_idxs
                                kalman_model=kalman_model,
                            )
                            del X3d
                            first_frame_per_obj = False
                    else:
                        tro.calculate_a_posteriori_estimate(
                            framenumber,
                            data_dict,
                            camn2cam_id,
                            skip_data_association=True,
                            original_camns_and_idxs=used_camns_and_idxs,
                            original_cam_ids_and_points2d=cam_ids_and_points2d,
                        )

                # done with all data for this obj_id
                if tro is not None:
                    tro.kill()
                    h5saver.save_tro(tro, force_obj_id=obj_id)
    if show_progress_json:
        result_utils.do_json_progress(100)
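
# A minimal, hypothetical usage sketch for retrack_reuse_data_association()
# above. The file names are placeholders (not from the original example) and
# assume flydra_analysis and its dependencies are installed.
retrack_reuse_data_association(
    h5_filename="session_2d.h5",                 # 2D detections file (placeholder)
    output_h5_filename="session_retracked.h5",   # must not exist yet
    kalman_filename="session_kalmanized.h5",     # existing kalmanized file (placeholder)
    less_ram=True,        # stream tables from disk instead of loading them fully
    show_progress=True,
)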
Example #4
def run(df: pd.DataFrame,
        expected_imbalance_window: int = 100,
        num_prev_bars: int = 100,
        expected_num_ticks: int = 100,
        expected_num_ticks_min_max: list = [80, 200],
        run_type: str = 'tick') -> pd.DataFrame:
    """런 바를 구한다
    df:                                   틱 데이터의 pandas.DataFrame 객체 입력
    expected_imbalance_window:            기대 불균형의 최대 윈도우 크기
    num_prev_bars:                        E[T]의 지수가중평균을 구할 때의 window 및 span 크기
    expected_num_ticks:
    expected_num_ticks_min_max:           제한을 두지 않을 경우는 [0, np.inf]로 설정
    run_type:                             tick->틱 불균형바, volume->거래량 불균형바, dollar->달러(원) 불균형바
    """

    print(f'(*) generating {run_type} run bars.')

    # check the bar extraction type
    # assert run_type in ('tick', 'volume', 'dollar'), 'wrong run_type'
    _run_type = ('tick', 'volume', 'dollar').index(run_type)

    # variables that are NOT reset after a bar is extracted
    signs_sell = []
    signs_buy = []
    list_bars = []
    num_ticks_bar = []
    tick_num = 0
    prv_sign = 0
    expected_imbalance_buy = expected_imbalance_sell = None
    buy_ticks_proportion = []

    # variables that are reset after each bar is extracted
    price_open = price_close = prv_price = None
    exp_buy_ticks_proportion = None
    exp_sell_ticks_proportion = None
    price_high, price_low = -np.inf, np.inf
    cum_theta = cum_tick = cum_dollar = cum_volume = cum_theta_buy = cum_theta_sell = buy_tick_num = 0

    sample_size = len(df)
    data = df.values
    data_len = len(data)

    # create a progress bar to track progress
    bar = progressbar.ProgressBar(maxval=data_len,
                                  widgets=[
                                      progressbar.Bar('=', '[', ']'), ' ',
                                      progressbar.Percentage()
                                  ])
    bar.start()

    same_time_idx = 0
    prv_date_time = None
    for d in data:
        tick_num += 1
        bar.update(tick_num)
        date_time = _date_time = d[0]
        # timestamps have one-second resolution, so duplicate values can appear when daily volatility is computed later
        # when timestamps collide within the same second, append a microsecond suffix in tick order to keep them unique
        if prv_date_time == _date_time:
            same_time_idx += 1
            date_time += '.{:06d}'.format(same_time_idx)
        else:
            same_time_idx = 0
            date_time += '.{:06d}'.format(same_time_idx)
        prv_date_time = _date_time
        price = d[1]
        volume = d[2]
        dollar = price * volume
        ## make ohlc ##
        if price_open is None: price_open = price  # open
        if price > price_high: price_high = price  # high
        if price < price_low: price_low = price  # low
        price_close = price  # close

        ## cumulative tick / dollar / volume
        cum_tick += 1
        cum_dollar += dollar
        cum_volume += volume

        ############## set the initial tick_delta #################
        if prv_price is not None:
            tick_delta = price - prv_price  # a previous price exists, so compute the delta
        else:
            tick_delta = 0  # no previous price, so set to 0

        ############## compute the imbalance ####################
        if tick_delta != 0:
            _sign = 0
            if tick_delta > 0: _sign = 1
            elif tick_delta < 0: _sign = -1
        else:
            _sign = prv_sign

        if _run_type == 0:  # tick type
            imbalance = _sign
        elif _run_type == 1:  # volume type
            imbalance = _sign * volume
        elif _run_type == 2:  # dollar type
            imbalance = _sign * dollar

        if imbalance > 0:
            cum_theta_buy += imbalance
            buy_tick_num += 1
            signs_buy.append(imbalance)  # list of buy imbalances
        elif imbalance < 0:
            _imbalance = abs(imbalance)
            cum_theta_sell += _imbalance
            signs_sell.append(_imbalance)  # list of sell imbalances

        prv_price = price
        prv_sign = _sign

        ############# set the initial expected imbalance values #################
        if expected_imbalance_buy is None or expected_imbalance_sell is None:
            expected_imbalance_buy = _get_expected_imbalance(
                signs_buy,
                expected_num_ticks,
                expected_imbalance_window,
                warm_up=True)
            expected_imbalance_sell = _get_expected_imbalance(
                signs_sell,
                expected_num_ticks,
                expected_imbalance_window,
                warm_up=True)

            if expected_imbalance_buy is not None and expected_imbalance_sell is not None:
                exp_buy_ticks_proportion = buy_tick_num / cum_tick
                exp_sell_ticks_proportion = (1 - exp_buy_ticks_proportion)

        if exp_buy_ticks_proportion is None: max_proportion = None
        else:
            max_proportion = max(
                expected_imbalance_buy * exp_buy_ticks_proportion,
                expected_imbalance_sell * exp_sell_ticks_proportion)

        max_theta = max(cum_theta_buy, cum_theta_sell)

        ############# bar extraction #############
        if max_proportion is not None and max_theta > expected_num_ticks * max_proportion:
            #### create the bar ####
            bar_info = dict(date_time=date_time,
                            tick_num=tick_num,
                            open=price_open,
                            high=price_high,
                            low=price_low,
                            close=price_close,
                            cum_vol=cum_volume,
                            cum_dollar=cum_dollar)

            # # treat observations that are excessively large or small as outliers and ignore them
            # if _run_type:
            #     if imbalance > 0 and is_outlier(signs_buy[:-1], signs_buy[-1]):
            #         _ = signs_buy.pop()          # drop the outlier already appended
            #         continue
            #     elif imbalance < 0 and is_outlier(signs_sell[:-1], signs_sell[-1]):
            #         _ = signs_sell.pop()         # drop the outlier already appended
            #         continue

            list_bars.append(bar_info)
            num_ticks_bar.append(cum_tick)
            buy_ticks_proportion.append(buy_tick_num / cum_tick)

            # expected buy ticks proportion based on the formed bars
            exp_buy_ticks_proportion = ewm_mean(
                buy_ticks_proportion[-num_prev_bars:], num_prev_bars)[-1]
            exp_sell_ticks_proportion = (1 - exp_buy_ticks_proportion)

            #### compute the expectations ####
            expected_num_ticks = _get_exp_num_ticks(
                num_ticks_bar, num_prev_bars,
                expected_num_ticks_min_max)  # expected size of E[T]
            expected_imbalance_buy = _get_expected_imbalance(
                signs_buy, expected_num_ticks,
                expected_imbalance_window)  # expected imbalance
            expected_imbalance_sell = _get_expected_imbalance(
                signs_sell, expected_num_ticks,
                expected_imbalance_window)  # expected imbalance

            # variables reset after the bar is extracted
            price_open = price_close = None
            price_high, price_low = -np.inf, np.inf
            cum_theta_buy = cum_theta_sell = cum_tick = cum_dollar = cum_volume = buy_tick_num = 0

    bar.finish()
    df = pd.DataFrame(list_bars)
    df['date_time'] = pd.to_datetime(df['date_time'])
    return df
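
# A minimal, hypothetical usage sketch of run() above. It assumes the tick
# DataFrame has three columns in the order the loop reads them: timestamp
# string, price, volume. The file name and column names are placeholders.
ticks = pd.read_csv('ticks.csv', usecols=['date_time', 'price', 'volume'])
ticks = ticks[['date_time', 'price', 'volume']]  # enforce the expected column order
dollar_run_bars = run(ticks,
                      expected_imbalance_window=100,
                      num_prev_bars=3,
                      expected_num_ticks=1000,
                      expected_num_ticks_min_max=[0, np.inf],
                      run_type='dollar')
print(dollar_run_bars.head())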
Example #5
    def train(self):
        with tf.Graph().as_default():

            logging.info("add model")

            var = self.add_model()

            saver = tf.train.Saver()

            # config = tf.ConfigProto(allow_soft_placement=True)
            # config.gpu_options.allow_growth = True
            # sess = tf.Session(config=config)

            sess = tf.Session()

            sess.run(tf.initialize_all_variables())

            total_batch = int(
                np.ceil(len(self.train_data) / float(self.args.batch)))

            for epoch in xrange(self.args.epochs):

                total_loss = 0.0
                total_acc_sum = 0.0
                total_count = 0
                pbar = pb.ProgressBar(widgets=[
                    "[TRAIN] ",
                    pb.DynamicMessage('loss'), " ",
                    pb.DynamicMessage('acc'), " ",
                    pb.FileTransferSpeed(unit="batchs"),
                    pb.Percentage(),
                    pb.Bar(),
                    pb.Timer(), " ",
                    pb.ETA()
                ],
                                      maxval=total_batch).start()

                for i in xrange(total_batch):
                    batchx, batchy = self.next_batch(self.args.batch)
                    _, loss, acc_sum = sess.run(
                        [var['opt'], var['cost'], var['acc_sum']],
                        feed_dict={
                            var['x']: batchx,
                            var['y']: batchy,
                            var['keep_prob']: 0.7
                        })
                    total_loss += loss
                    total_acc_sum += acc_sum
                    total_count += len(batchx)
                    pbar.update(i,
                                loss=total_loss / total_count,
                                acc=total_acc_sum / total_count)
                pbar.finish()

                v_loss, v_acc_sum = self.eval(sess, var)
                v_size = len(self.val_data)

                logging.info(
                    "Epoch {}: tr_loss: {}, tr_acc: {}\n{}v_loss: {}, v_acc: {}"
                    .format(epoch, total_loss / total_count,
                            total_acc_sum / total_count, "                   ",
                            v_loss / v_size, v_acc_sum / v_size))

            logging.info("save model")
            save_path = saver.save(sess, self.args.model)
            logging.info("save model in path: {}".format(save_path))
labels = [dict_data[i] for i in ids]

# encode the labels
le = LabelEncoder()
labels = le.fit_transform(labels)

# initialize the HDF5 dataset writer, then store the class label names in the dataset
dataset = HDF5DatasetWriter(
    (len(imagePaths), config.INPUT_SIZE, config.INPUT_SIZE, 3),
    config.TRAIN_HDF5)
dataset.storeClassLabels(le.classes_)

# initialize the progress bar
widgets = [
    "Saving Images: ",
    progressbar.Percentage(), " ",
    progressbar.Bar(), " ",
    progressbar.ETA()
]
pbar = progressbar.ProgressBar(maxval=len(imagePaths), widgets=widgets).start()

# loop over the images in batches
for i in np.arange(0, len(imagePaths)):
    # Grab values
    imagePath = imagePaths[i]
    label = labels[i]
    _id = ids[i]

    # load the input image using the Keras helper utility
    # while ensuring the image is resized
    image = load_img(imagePath,
def scan_regionset(regionset, options):
    """ This function scans all te region files in a regionset object
    and fills the ScannedRegionFile obj with the results
    """

    total_regions = len(regionset.regions)
    total_chunks = 0
    corrupted_total = 0
    wrong_total = 0
    entities_total = 0
    too_small_total = 0
    unreadable = 0

    # init progress bar
    if not options.verbose:
        pbar = progressbar.ProgressBar(widgets=[
            'Scanning: ',
            FractionWidget(), ' ',
            progressbar.Percentage(), ' ',
            progressbar.Bar(left='[', right=']'), ' ',
            progressbar.ETA()
        ],
                                       maxval=total_regions)

    # queue used by processes to pass finished stuff
    q = queues.SimpleQueue()
    pool = multiprocessing.Pool(processes=options.processes,
                                initializer=_mp_pool_init,
                                initargs=(regionset, options, q))

    if not options.verbose:
        pbar.start()

    # start the pool
    # Note to self: every child process has its own memory space,
    # which means every obj received by it will be a copy of the
    # main obj
    result = pool.map_async(multithread_scan_regionfile,
                            regionset.list_regions(None),
                            max(1, total_regions // options.processes))

    # printing status
    region_counter = 0

    while not result.ready() or not q.empty():
        time.sleep(0.01)
        if not q.empty():
            r = q.get()
            if r is None:  # something went wrong scanning this region file
                # probably a bug... don't know if it's a good
                # idea to skip it
                continue
            if not isinstance(r, world.ScannedRegionFile):
                raise ChildProcessException(r)
            else:
                corrupted, wrong, entities_prob, shared_offset, num_chunks = r.get_counters(
                )
                filename = r.filename
                # the obj returned is a copy, overwrite it in regionset
                regionset[r.get_coords()] = r
                corrupted_total += corrupted
                wrong_total += wrong
                total_chunks += num_chunks
                entities_total += entities_prob
                if r.status == world.REGION_TOO_SMALL:
                    too_small_total += 1
                elif r.status == world.REGION_UNREADABLE:
                    unreadable += 1
                region_counter += 1
                if options.verbose:
                    if r.status == world.REGION_OK:
                        stats = "(c: {0}, w: {1}, tme: {2}, so: {3}, t: {4})".format(
                            corrupted, wrong, entities_prob, shared_offset,
                            num_chunks)
                    elif r.status == world.REGION_TOO_SMALL:
                        stats = "(Error: not a region file)"
                    elif r.status == world.REGION_UNREADABLE:
                        stats = "(Error: unreadable region file)"
                    print "Scanned {0: <12} {1:.<43} {2}/{3}".format(
                        filename, stats, region_counter, total_regions)
                else:
                    pbar.update(region_counter)

    if not options.verbose: pbar.finish()

    regionset.scanned = True
Example #8
ap.add_argument("-d", "--dataset", required=True,
	help="path to input directory of images")
ap.add_argument("-o", "--output", required=True,
	help="path to output directory of rotated iamges")
args = vars(ap.parse_args())

# grab the paths to the input images (limiting ourselves to 10,000
# images) and shuffle them to make creating a training and testing
# split easier
imagePaths = list(paths.list_images(args["dataset"]))[:10000]
random.shuffle(imagePaths)

# initialize a dictionary to keep track of the number of each angle
# chosen so far, then initialize the progress bar
angles = {}
widgets = ["Building Dataset: ", progressbar.Percentage(), " ",
	progressbar.Bar(), " ", progressbar.ETA()]
pbar = progressbar.ProgressBar(maxval=len(imagePaths),
	widgets=widgets).start()

# loop over the image paths
for (i, imagePath) in enumerate(imagePaths):
	# determine the rotation angle, and load the image
	angle = np.random.choice([0, 90, 180, 270])
	image = cv2.imread(imagePath)

	# if the image is None (meaning there was an issue loading the
	# image from disk), simply skip it
	if image is None:
		continue
Example #9
def plot_raydensity(
    map_object,
    station_events: List[Tuple[dict, dict]],
    domain: object,
    projection: cp.crs.Projection,
):
    """
    Create a ray-density plot for all events and all stations.

    This function is potentially expensive and will use all CPUs available.
    Does require geographiclib to be installed.

    :param map_object: The cartopy domain plot object
    :type map_object: cp.mpl.geoaxes.GeoAxes
    :param station_events: A list of tuples with two dictionaries
    :type station_events: List[Tuple[dict, dict]]
    :param domain: An object with the domain plot
    :type domain: object
    :param projection: cartopy projection object
    :type projection: cp.crs.Projection
    """
    import ctypes as C
    from lasif.tools.great_circle_binner import GreatCircleBinner
    from lasif.utils import Point
    import multiprocessing
    import progressbar
    from scipy.stats import scoreatpercentile

    # Merge everything so that a list with coordinate pairs is created. This
    # list is then distributed among all processors.
    station_event_list = []
    for event, stations in station_events:

        e_point = Point(event["latitude"], event["longitude"])
        for station in stations.values():

            p = Point(station["latitude"], station["longitude"])
            station_event_list.append((e_point, p))

    circle_count = len(station_event_list)

    # The granularity of the latitude/longitude discretization for the
    # raypaths. Attempt to get a somewhat meaningful result in any case.
    if circle_count < 1000:
        lat_lng_count = 1000
    elif circle_count < 10000:
        lat_lng_count = 2000
    else:
        lat_lng_count = 3000

    cpu_count = multiprocessing.cpu_count()

    def to_numpy(raw_array, dtype, shape):
        data = np.frombuffer(raw_array.get_obj())
        data.dtype = dtype
        return data.reshape(shape)

    print("\nLaunching %i great circle calculations on %i CPUs..." %
          (circle_count, cpu_count))

    widgets = [
        "Progress: ",
        progressbar.Percentage(),
        progressbar.Bar(),
        "",
        progressbar.ETA(),
    ]
    pbar = progressbar.ProgressBar(widgets=widgets,
                                   maxval=circle_count).start()

    def great_circle_binning(sta_evs, bin_data_buffer, bin_data_shape, lock,
                             counter):
        new_bins = GreatCircleBinner(
            domain.min_lat,
            domain.max_lat,
            lat_lng_count,
            domain.min_lon,
            domain.max_lon,
            lat_lng_count,
        )
        for event, station in sta_evs:
            with lock:
                counter.value += 1
            if not counter.value % 25:
                pbar.update(counter.value)
            new_bins.add_greatcircle(event, station)

        bin_data = to_numpy(bin_data_buffer, np.uint32, bin_data_shape)
        with bin_data_buffer.get_lock():
            bin_data += new_bins.bins

    # Split the data in cpu_count parts.
    def chunk(seq, num):
        avg = len(seq) / float(num)
        out = []
        last = 0.0
        while last < len(seq):
            out.append(seq[int(last):int(last + avg)])
            last += avg
        return out

    chunks = chunk(station_event_list, cpu_count)

    # One instance that collects everything.
    collected_bins = GreatCircleBinner(
        domain.min_lat,
        domain.max_lat,
        lat_lng_count,
        domain.min_lon,
        domain.max_lon,
        lat_lng_count,
    )

    # Use a multiprocessing shared memory array and map it to a numpy view.
    collected_bins_data = multiprocessing.Array(C.c_uint32,
                                                collected_bins.bins.size)
    collected_bins.bins = to_numpy(collected_bins_data, np.uint32,
                                   collected_bins.bins.shape)

    # Create, launch and join one process per CPU. Use a shared value as a
    # counter and a lock to avoid race conditions.
    processes = []
    lock = multiprocessing.Lock()
    counter = multiprocessing.Value("i", 0)
    for _i in range(cpu_count):
        processes.append(
            multiprocessing.Process(
                target=great_circle_binning,
                args=(
                    chunks[_i],
                    collected_bins_data,
                    collected_bins.bins.shape,
                    lock,
                    counter,
                ),
            ))
    for process in processes:
        process.start()
    for process in processes:
        process.join()

    pbar.finish()

    stations = chain.from_iterable(
        (_i[1].values() for _i in station_events if _i[1]))
    # Remove duplicates
    stations = [(_i["latitude"], _i["longitude"]) for _i in stations]
    stations = set(stations)
    title = "%i Events, %i unique raypaths, " "%i unique stations" % (
        len(station_events),
        circle_count,
        len(stations),
    )
    plt.title(title, size="xx-large")

    data = collected_bins.bins.transpose()

    if data.max() >= 10:
        data = np.log10(np.clip(data, a_min=0.5, a_max=data.max()))
        data[data >= 0.0] += 0.1
        data[data < 0.0] = 0.0
        max_val = scoreatpercentile(data.ravel(), 99)
    else:
        max_val = data.max()

    cmap = cm.get_cmap("gist_heat")
    cmap._init()
    cmap._lut[:120, -1] = np.linspace(0, 1.0, 120)**2

    lngs, lats = collected_bins.coordinates
    ln, la = project_points(projection, lngs, lats)

    map_object.pcolormesh(ln,
                          la,
                          data,
                          cmap=cmap,
                          vmin=0,
                          vmax=max_val,
                          zorder=10)
    # Draw the coastlines so they appear over the rays. Otherwise things are
    # sometimes hard to see.
    map_object.add_feature(cp.feature.COASTLINE, zorder=13)
    map_object.add_feature(cp.feature.BORDERS, linestyle=":", zorder=13)
Example #10
def plot_raydensity(map_object, station_events, domain):
    """
    Create a ray-density plot for all events and all stations.

    This function is potentially expensive and will use all CPUs available.
    Does require geographiclib to be installed.
    """
    import ctypes as C
    from lasif import rotations
    from lasif.domain import RectangularSphericalSection
    from lasif.tools.great_circle_binner import GreatCircleBinner
    from lasif.utils import Point
    import multiprocessing
    import progressbar
    from scipy.stats import scoreatpercentile

    if not isinstance(domain, RectangularSphericalSection):
        raise NotImplementedError(
            "Raydensity currently only implemented for rectangular domains. "
            "Should be easy to implement for other domains. Let me know.")

    # Merge everything so that a list with coordinate pairs is created. This
    # list is then distributed among all processors.
    station_event_list = []
    for event, stations in station_events:
        if domain.rotation_angle_in_degree:
            # Rotate point to the non-rotated domain.
            e_point = Point(*rotations.rotate_lat_lon(
                event["latitude"], event["longitude"], domain.rotation_axis,
                -1.0 * domain.rotation_angle_in_degree))
        else:
            e_point = Point(event["latitude"], event["longitude"])
        for station in stations.values():
            # Rotate point to the non-rotated domain if necessary.
            if domain.rotation_angle_in_degree:
                p = Point(*rotations.rotate_lat_lon(
                    station["latitude"], station["longitude"],
                    domain.rotation_axis, -1.0 *
                    domain.rotation_angle_in_degree))
            else:
                p = Point(station["latitude"], station["longitude"])
            station_event_list.append((e_point, p))

    circle_count = len(station_event_list)

    # The granularity of the latitude/longitude discretization for the
    # raypaths. Attempt to get a somewhat meaningful result in any case.
    if circle_count < 1000:
        lat_lng_count = 1000
    elif circle_count < 10000:
        lat_lng_count = 2000
    else:
        lat_lng_count = 3000

    cpu_count = multiprocessing.cpu_count()

    def to_numpy(raw_array, dtype, shape):
        data = np.frombuffer(raw_array.get_obj())
        data.dtype = dtype
        return data.reshape(shape)

    print("\nLaunching %i greatcircle calculations on %i CPUs..." %
          (circle_count, cpu_count))

    widgets = [
        "Progress: ",
        progressbar.Percentage(),
        progressbar.Bar(), "",
        progressbar.ETA()
    ]
    pbar = progressbar.ProgressBar(widgets=widgets,
                                   maxval=circle_count).start()

    def great_circle_binning(sta_evs, bin_data_buffer, bin_data_shape, lock,
                             counter):
        new_bins = GreatCircleBinner(domain.min_latitude, domain.max_latitude,
                                     lat_lng_count, domain.min_longitude,
                                     domain.max_longitude, lat_lng_count)
        for event, station in sta_evs:
            with lock:
                counter.value += 1
            if not counter.value % 25:
                pbar.update(counter.value)
            new_bins.add_greatcircle(event, station)

        bin_data = to_numpy(bin_data_buffer, np.uint32, bin_data_shape)
        with bin_data_buffer.get_lock():
            bin_data += new_bins.bins

    # Split the data in cpu_count parts.
    def chunk(seq, num):
        avg = len(seq) / float(num)
        out = []
        last = 0.0
        while last < len(seq):
            out.append(seq[int(last):int(last + avg)])
            last += avg
        return out

    chunks = chunk(station_event_list, cpu_count)

    # One instance that collects everything.
    collected_bins = GreatCircleBinner(domain.min_latitude,
                                       domain.max_latitude, lat_lng_count,
                                       domain.min_longitude,
                                       domain.max_longitude, lat_lng_count)

    # Use a multiprocessing shared memory array and map it to a numpy view.
    collected_bins_data = multiprocessing.Array(C.c_uint32,
                                                collected_bins.bins.size)
    collected_bins.bins = to_numpy(collected_bins_data, np.uint32,
                                   collected_bins.bins.shape)

    # Create, launch and join one process per CPU. Use a shared value as a
    # counter and a lock to avoid race conditions.
    processes = []
    lock = multiprocessing.Lock()
    counter = multiprocessing.Value("i", 0)
    for _i in range(cpu_count):
        processes.append(
            multiprocessing.Process(target=great_circle_binning,
                                    args=(chunks[_i], collected_bins_data,
                                          collected_bins.bins.shape, lock,
                                          counter)))
    for process in processes:
        process.start()
    for process in processes:
        process.join()

    pbar.finish()

    stations = chain.from_iterable(
        (list(_i[1].values()) for _i in station_events if _i[1]))
    # Remove duplicates
    stations = [(_i["latitude"], _i["longitude"]) for _i in stations]
    stations = set(stations)
    title = "%i Events, %i unique raypaths, "\
            "%i unique stations" % (len(station_events), circle_count,
                                    len(stations))
    plt.title(title, size="xx-large")

    data = collected_bins.bins.transpose()

    if data.max() >= 10:
        data = np.log10(np.clip(data, a_min=0.5, a_max=data.max()))
        data[data >= 0.0] += 0.1
        data[data < 0.0] = 0.0
        max_val = scoreatpercentile(data.ravel(), 99)
    else:
        max_val = data.max()

    cmap = cm.get_cmap("gist_heat")
    cmap._init()
    cmap._lut[:120, -1] = np.linspace(0, 1.0, 120)**2

    # Slightly change the appearance of the map so it suits the rays.
    map_object.fillcontinents(color='#dddddd', lake_color='#dddddd', zorder=2)

    lngs, lats = collected_bins.coordinates
    # Rotate back if necessary!
    if domain.rotation_angle_in_degree:
        for lat, lng in zip(lats, lngs):
            lat[:], lng[:] = rotations.rotate_lat_lon(
                lat, lng, domain.rotation_axis,
                domain.rotation_angle_in_degree)
    ln, la = map_object(lngs, lats)
    map_object.pcolormesh(ln,
                          la,
                          data,
                          cmap=cmap,
                          vmin=0,
                          vmax=max_val,
                          zorder=10)
    # Draw the coastlines so they appear over the rays. Otherwise things are
    # sometimes hard to see.
    map_object.drawcoastlines(zorder=3)
    map_object.drawcountries(linewidth=0.2, zorder=3)
    def handle(self, *args, **options):
        if not os.path.exists(DATA_DIR):
            self.logger.info('Creating %s' % DATA_DIR)
            os.mkdir(DATA_DIR)

        translation_hack_path = os.path.join(DATA_DIR, 'translation_hack')

        self.noinsert = options.get('noinsert', False)
        self.widgets = [
            'RAM used: ',
            MemoryUsageWidget(),
            ' ',
            progressbar.ETA(),
            ' Done: ',
            progressbar.Percentage(),
            progressbar.Bar(),
        ]

        for url in SOURCES:
            destination_file_name = url.split('/')[-1]

            force = options.get('force_all', False)
            if not force:
                for f in options['force']:
                    if f in destination_file_name or f in url:
                        force = True

            geonames = Geonames(url, force=force)
            downloaded = geonames.downloaded

            force_import = options.get('force_import_all', False)

            if not force_import:
                for f in options['force_import']:
                    if f in destination_file_name or f in url:
                        force_import = True

            if downloaded or force_import:
                self.logger.info('Importing %s' % destination_file_name)

                if url in TRANSLATION_SOURCES:
                    if options.get('hack_translations', False):
                        if os.path.exists(translation_hack_path):
                            self.logger.debug(
                                'Using translation parsed data: %s' %
                                translation_hack_path)
                            continue

                i = 0
                progress = progressbar.ProgressBar(maxval=geonames.num_lines(),
                                                   widgets=self.widgets)

                for items in geonames.parse():
                    if url in CITY_SOURCES:
                        self.city_import(items)
                    elif url in REGION_SOURCES:
                        self.region_import(items)
                    elif url in COUNTRY_SOURCES:
                        self.country_import(items)
                    elif url in TRANSLATION_SOURCES:
                        # free some memory
                        if getattr(self, '_country_codes', False):
                            del self._country_codes
                        if getattr(self, '_region_codes', False):
                            del self._region_codes
                        self.translation_parse(items)

                    reset_queries()

                    i += 1
                    progress.update(i)

                progress.finish()

                if url in TRANSLATION_SOURCES and options.get(
                        'hack_translations', False):
                    with open(translation_hack_path, 'w+') as f:
                        pickle.dump(self.translation_data, f)

        if options.get('hack_translations', False):
            with open(translation_hack_path, 'r') as f:
                self.translation_data = pickle.load(f)

        self.logger.info('Importing parsed translation in the database')
        self.translation_import()
def make_progress_bar(text=None):
    widgets = (['%s: ' % text] if text else []) + [progressbar.Percentage(), ' ', 
                                                   progressbar.Bar(), ' ', 
                                                   progressbar.ETA()]
    return progressbar.ProgressBar(widgets=widgets)
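
# A small, hypothetical usage sketch of make_progress_bar() above. It assumes
# the classic progressbar API, where a ProgressBar instance can wrap an
# iterable with a known length directly.
for item in make_progress_bar("Processing")(range(1000)):
    pass  # do the per-item work here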
Example #13
    def loadMSRA(self, seqName, mode='train', replace=False, tApp=False):
        '''seqName: P0 - P8
           mode: if train, only save the cropped image
           replace: replace the previous cache file if exists
           tApp: append to previous loaded file if True
        '''
        if not hasattr(self, 'frmList'):
            self.frmList = []
        if not tApp:
            self.frmList = []

        pickleCachePath = '{}/msra_{}.pkl'.format(self.cache_base_path,
                                                  seqName)
        if os.path.isfile(pickleCachePath) and not replace:
            print 'direct load from the cache'
            t1 = time.time()
            f = open(pickleCachePath, 'rb')
            (self.frmList) += cPickle.load(f)
            t1 = time.time() - t1
            print 'loaded with {}s'.format(t1)
            return self.frmList

        Camera.setCamera('INTEL')
        pbar = pb.ProgressBar(
            maxval=500 * len(self.msra_pose_list),
            widgets=['Loading MSRA | ',
                     pb.Percentage(),
                     pb.Bar()])
        pbar.start()
        pbIdx = 0

        seqPath = '/'.join([self.msra_base_path, seqName])
        for pose_name in self.msra_pose_list:
            curPath = '/'.join([seqPath, pose_name, 'joint.txt'])
            f = open(curPath, 'r')
            frmNum = int(f.readline()[:-1])
            for frmIdx in range(frmNum):
                frmPath = '/'.join(
                    [seqPath, pose_name,
                     '%06i_depth.bin' % (frmIdx)])
                dm = DepthMap('MSRA', frmPath)
                skel = f.readline().split()
                skel = np.asarray([float(pt) for pt in skel])

                def cvtMSRA_skel(init_skel):
                    skel = init_skel.copy()
                    for i in range(len(skel)):
                        if i % 3 == 2:
                            skel[i] *= -1.0
                    return skel

                skel = cvtMSRA_skel(skel)
                self.frmList.append(Frame(dm, skel))
                if mode == 'train':
                    self.frmList[-1].saveOnlyForTrain()
                pbar.update(pbIdx)
                pbIdx += 1
        pbar.finish()

        if not os.path.exists(self.cache_base_path):
            os.makedirs(self.cache_base_path)
        f = open(pickleCachePath, 'wb')
        cPickle.dump((self.frmList), f, protocol=cPickle.HIGHEST_PROTOCOL)
        f.close()
        print 'loaded with {} frames'.format(len(self.frmList))
Example #14
    def loadNYU(self, frmStartNum, cameraIdx = 1, tFlag = 'train', tApp =\
                False, isReplace=False):
        '''frmStartNum: starting frame index
           cameraIdx: [1,3]
           tFlag: save only the cropped image if is 'train'
           tApp: append to the previously loaded file if True
        '''
        Camera.setCamera('KINECT')
        if cameraIdx not in [1]:
            raise ValueError(
                'invalid cameraIdx, current only support view from 1')

        if tFlag not in ['train', 'test']:
            raise ValueError('invalid tFlag, can be only train or test')

        # load the annotation file
        matPath = '{}/{}/joint_data.mat'.format(self.nyu_base_path, tFlag)
        joint = sio.loadmat(matPath)
        joint_xyz = joint['joint_xyz'][cameraIdx - 1]
        joint_uvd = joint['joint_uvd'][cameraIdx - 1]
        matPath = './data/center_uvd_{}.mat'.format(tFlag)
        center = sio.loadmat(matPath)
        center = center['center_uvd']

        # determine the start and end frame
        if frmStartNum >= len(joint_xyz):
            raise ValueError(
                'invalid start frame, should be lower than {}'.format(
                    len(joint_xyz)))

        fileIdx = int(frmStartNum / self.nyu_frm_perfile)
        frmStartNum = fileIdx * self.nyu_frm_perfile
        if tFlag == 'train':
            frmEndNum = min(frmStartNum + self.nyu_frm_perfile, len(joint_xyz))
        elif tFlag == 'test':
            frmEndNum = len(joint_xyz)
        print 'frmStartNum={}, frmEndNum={}, fileIdx={}'.format(
            frmStartNum, frmEndNum, fileIdx)

        pickleCachePath = '{}/nyu_{}_{}_{}.pkl'.format(self.cache_base_path,
                                                       tFlag, cameraIdx,
                                                       fileIdx)
        if not hasattr(self, 'frmList'):
            self.frmList = []
        if not tApp:
            self.frmList = []

        if os.path.isfile(pickleCachePath) and not isReplace:
            print 'direct load from the cache'
            print 'cache dir ={}'.format(pickleCachePath)
            t1 = time.time()
            f = open(pickleCachePath, 'rb')
            self.frmList += cPickle.load(f)
            t1 = time.time() - t1
            print 'loaded with {}s'.format(t1)
            return

        pbar = pb.ProgressBar(
            maxval=frmEndNum - frmStartNum,
            widgets=['Loading NYU | ',
                     pb.Percentage(),
                     pb.Bar()])
        pbar.start()
        pbIdx = 0

        for frmIdx in range(frmStartNum, frmEndNum):
            frmPath = '{}/{}/depth_{}_{:07d}.png'.format(
                self.nyu_base_path, tFlag, cameraIdx, frmIdx + 1)
            dm = DepthMap('NYU', frmPath)
            skel = joint_xyz[frmIdx]
            skel = np.reshape(skel, (-1))
            com_uvd = center[frmIdx]
            self.frmList.append(Frame(dm, skel, com_uvd))
            self.frmList[-1].saveOnlyForTrain()
            pbar.update(pbIdx)
            pbIdx += 1
        pbar.finish()

        if not os.path.exists(self.cache_base_path):
            os.makedirs(self.cache_base_path)
        f = open(pickleCachePath, 'wb')
        cPickle.dump((self.frmList), f, protocol=cPickle.HIGHEST_PROTOCOL)
        f.close()
        print 'loaded with {} frames'.format(len(self.frmList))
Example #15
    def computeByDay(self):
        self.loadOriginal()
        exp_res = pd.read_csv(
            "haikou-experiments/results/SIMULATION_RESULTS_ALL_DIDI_CHUXING_HAIKOU.csv"
        )
        exp_res["real_start_time"] = pd.to_datetime(exp_res["real_start_time"])
        self.all_ODs = {}
        bar = progressbar.ProgressBar(widgets=[
            'Days ',
            progressbar.Percentage(),
            ' (',
            progressbar.SimpleProgress(),
            ') ',
            ' (',
            progressbar.AbsoluteETA(),
            ') ',
        ])
        all_days_str = exp_res["real_start_date"].unique()
        all_days = []
        print("共计天数:", len(all_days_str))
        for cur_date in bar(all_days_str):
            if self.date_week[cur_date] >= 5: continue
            sta_res = self.computeOneDay(
                exp_res[exp_res["real_start_date"] == cur_date], cur_date)
            all_days.append(sta_res)

        for sta_day in all_days:
            for period_index in range(len(PERIODS_MINUTES)):
                for key in sta_day[period_index].keys():
                    if sta_day[period_index][key]["num"] == 0: continue
                    self.all_ODs[key][period_index]["num"].append(
                        sta_day[period_index][key]["num"])
                    self.all_ODs[key][period_index]["matching_num"].append(
                        sta_day[period_index][key]["matching_num"])
                    self.all_ODs[key][period_index][
                        "matching_probability"].append(
                            sta_day[period_index][key]["matching_probability"])
                    self.all_ODs[key][period_index][
                        "aver_shared_distance"].append(
                            sta_day[period_index][key]["aver_shared_distance"])
                    self.all_ODs[key][period_index][
                        "aver_final_distance"].append(
                            sta_day[period_index][key]["aver_final_distance"])

        with open("haikou-experiments/results/SIMULATION_STATISTIC.csv",
                  "w") as csvfile:
            writer = csv.writer(csvfile)
            row = ["start_ver", "end_ver", "original_num", "original_days"]
            for i in range(len(PERIODS_MINUTES)):
                row += [
                    "num%s" % i,
                    "matching_num%s" % i,
                    "days%s" % i,
                    "matching_probability%s" % i,
                    "aver_shared_distance%s" % i,
                    "aver_final_distance%s" % i
                ]
            writer.writerow(row)
            for i, key in enumerate(self.all_ODs.keys()):
                combined_id = getID(self.all_ODs[key][0]["start_ver"],
                                    self.all_ODs[key][0]["end_ver"])
                if combined_id not in self.origianl_days: continue
                detail = [
                    self.all_ODs[key][0]["start_ver"],
                    self.all_ODs[key][0]["end_ver"],
                    self.origianl_orders[combined_id],
                    self.origianl_days[combined_id]
                ]
                for j in range(len(PERIODS_MINUTES)):
                    detail += [sum(self.all_ODs[key][j]["num"]),sum(self.all_ODs[key][j]["matching_num"]),len(self.all_ODs[key][j]["num"]),\
                        np.mean(self.all_ODs[key][j]["matching_probability"]), np.mean(self.all_ODs[key][j]["aver_shared_distance"]),\
                            np.mean(self.all_ODs[key][j]["aver_final_distance"])]
                writer.writerow(detail)
Example #16
def main():
    common = common_cli.GetCommonArguments()
    device = common_cli.GetDeviceArguments()
    device.add_argument(
        '--chunk_kb', type=int, default=1024, metavar='1024',
        help='Size of packets to write in Kb. For older devices, it may be '
             'required to use 4.')
    parents = [common, device]

    parser = argparse.ArgumentParser(
        description=sys.modules[__name__].__doc__, parents=[common])
    subparsers = parser.add_subparsers(title='Commands', dest='command_name')

    subparser = subparsers.add_parser(
        name='help', help='Prints the commands available')
    subparser = subparsers.add_parser(
        name='devices', help='Lists the available devices', parents=[common])
    common_cli.MakeSubparser(
        subparsers, parents, fastboot.FastbootCommands.Continue)

    common_cli.MakeSubparser(
        subparsers, parents, fastboot.FastbootCommands.Download,
        {'source_file': 'Filename on the host to push'})
    common_cli.MakeSubparser(
        subparsers, parents, fastboot.FastbootCommands.Erase)
    common_cli.MakeSubparser(
        subparsers, parents, fastboot.FastbootCommands.Flash)
    common_cli.MakeSubparser(
        subparsers, parents, fastboot.FastbootCommands.Getvar)
    common_cli.MakeSubparser(
        subparsers, parents, fastboot.FastbootCommands.Oem)
    common_cli.MakeSubparser(
        subparsers, parents, fastboot.FastbootCommands.Reboot)

    if len(sys.argv) == 1:
        parser.print_help()
        return 2

    args = parser.parse_args()
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    if args.command_name == 'devices':
        return Devices(args)
    if args.command_name == 'help':
        parser.print_help()
        return 0

    kwargs = {}
    argspec = inspect.getargspec(args.method)
    if 'info_cb' in argspec.args:
        kwargs['info_cb'] = _InfoCb
    if 'progress_callback' in argspec.args and progressbar:
        bar = progressbar.ProgressBar(
            widgets=[progressbar.Bar(), progressbar.Percentage()])
        bar.start()

        def SetProgress(current, total):
            bar.update(current / total * 100.0)
            if current == total:
                bar.finish()

        kwargs['progress_callback'] = SetProgress

    return common_cli.StartCli(
        args,
        fastboot.FastbootCommands,
        chunk_kb=args.chunk_kb,
        extra=kwargs)
Example #17
def download_files(fileList, urlBase, outDir, verify=True):
    '''
    Download a list of files from a URL to a directory
    '''
    # Authors
    # -------
    # Milena Veneziani
    # Xylar Asay-Davis

    session = requests.Session()
    if not verify:
        session.verify = False

    for fileName in fileList:
        outFileName = '{}/{}'.format(outDir, fileName)
        # outFileName contains full path, so we need to make the relevant
        # subdirectories if they do not exist already
        directory = os.path.dirname(outFileName)
        try:
            os.makedirs(directory)
        except OSError:
            pass

        url = '{}/{}'.format(urlBase, fileName)
        try:
            response = session.get(url, stream=True)
            totalSize = response.headers.get('content-length')
        except requests.exceptions.RequestException:
            print('  {} could not be reached!'.format(url))
            continue

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print('ERROR while downloading {}:'.format(fileName))
            print(e)
            continue

        if totalSize is None:
            # no content length header
            if not os.path.exists(outFileName):
                with open(outFileName, 'wb') as f:
                    print('Downloading {}...'.format(fileName))
                    try:
                        f.write(response.content)
                    except requests.exceptions.RequestException:
                        print('  {} failed!'.format(fileName))
                    else:
                        print('  {} done.'.format(fileName))
        else:
            # we can do the download in chunks and use a progress bar, yay!

            totalSize = int(totalSize)
            if os.path.exists(outFileName) and \
                    totalSize == os.path.getsize(outFileName):
                # we already have the file, so just continue
                continue

            print('Downloading {} ({})...'.format(fileName,
                                                  sizeof_fmt(totalSize)))
            widgets = [
                progressbar.Percentage(), ' ',
                progressbar.Bar(), ' ',
                progressbar.ETA()
            ]
            bar = progressbar.ProgressBar(widgets=widgets,
                                          maxval=totalSize).start()
            size = 0
            with open(outFileName, 'wb') as f:
                try:
                    for data in response.iter_content(chunk_size=4096):
                        size += len(data)
                        f.write(data)
                        bar.update(size)
                    bar.finish()
                except requests.exceptions.RequestException:
                    print('  {} failed!'.format(fileName))
                else:
                    print('  {} done.'.format(fileName))
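
# Usage sketch (assumption): the base URL, file names and output directory below
# are placeholders, not taken from the original project; they only show how
# download_files() is expected to be called.
if __name__ == '__main__':
    file_list = ['README.txt', 'data/example_grid.nc']
    download_files(file_list,
                   urlBase='https://example.com/datasets',
                   outDir='./downloads',
                   verify=True)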
Example #18
    def run_halted_queue(self, params, frame_chunks):
        """Runs a queue with params for each of the frame chunks. The program halts while
        awaiting the completion of tasks in the queue and shows a progress bar meanwhile. Any
        frame chunks that have been previously completed will be marked as complete unless
        running with force_recompute.

        Args:
            params (dict[str, _]): Message to be published to RabbitMQ.
            frame_chunks (list[dict[str, str]]): List of frame chunks with keys
                "first" and "last" corresponding to the appropriate frame names for the chunk.
        """
        connection = pika.BlockingConnection(
            pika.ConnectionParameters(self.master_ip, heartbeat=0))
        channel = connection.channel()
        channel.queue_declare(queue=config.QUEUE_NAME)
        channel.queue_declare(queue=config.RESPONSE_QUEUE_NAME)

        self.purge_queue(config.QUEUE_NAME)
        self.purge_queue(config.RESPONSE_QUEUE_NAME)

        # force_recompute can be specified over the entire pipeline or particular stages
        frame_chunks = self._get_missing_chunks(params, frame_chunks)
        if len(frame_chunks) == 0:
            return

        for frame_chunk in frame_chunks:
            params.update(frame_chunk)
            msg = json.dumps(params)
            channel.basic_publish(
                exchange="",
                routing_key=config.QUEUE_NAME,
                body=msg,
                properties=pika.BasicProperties(
                    delivery_mode=2),  # make message persistent
            )

        # Waits until the queue is empty before returning for next step
        queue_state = channel.queue_declare(config.RESPONSE_QUEUE_NAME)
        queue_size = queue_state.method.message_count

        progress = "█"
        widgets = [
            f"{progress} ",
            f"{params['app']}:",
            progressbar.Bar(progress, "|", "|"),
            progressbar.Percentage(),
            " (Workers: ",
            progressbar.FormatLabel("0"),
            ") (",
            progressbar.FormatLabel("%(elapsed)s"),
            ")",
        ]
        bar = progressbar.ProgressBar(maxval=len(frame_chunks),
                                      widgets=widgets)
        bar.start()
        no_worker_period = None
        while queue_size != len(frame_chunks):
            time.sleep(1.0)
            queue_size = channel.queue_declare(
                config.RESPONSE_QUEUE_NAME).method.message_count
            num_workers = channel.queue_declare(
                config.QUEUE_NAME).method.consumer_count
            widgets[5] = str(num_workers)

            if num_workers != 0:
                no_worker_period = None
            if num_workers == 0:
                if no_worker_period is None:
                    no_worker_period = time.time()
                if time.time() - no_worker_period > config.NO_WORKER_TIMEOUT:
                    raise Exception(
                        "No workers for extended time! Check worker logs for errors..."
                    )
            bar.update(queue_size)
        bar.finish()
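
# Worker-side sketch (assumption): this is not the project's actual worker, only an
# illustration of the contract run_halted_queue relies on -- every consumed
# frame-chunk message must produce exactly one message on config.RESPONSE_QUEUE_NAME,
# which is what the progress bar above counts. `process_chunk` is a hypothetical
# per-chunk handler; pika, json and config are assumed to be imported as above.
def run_worker_sketch(master_ip):
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(master_ip, heartbeat=0))
    channel = connection.channel()
    channel.queue_declare(queue=config.QUEUE_NAME)
    channel.queue_declare(queue=config.RESPONSE_QUEUE_NAME)

    def handle(ch, method, properties, body):
        process_chunk(json.loads(body))  # hypothetical per-chunk work
        ch.basic_publish(exchange="",
                         routing_key=config.RESPONSE_QUEUE_NAME,
                         body=body)
        ch.basic_ack(delivery_tag=method.delivery_tag)

    channel.basic_consume(queue=config.QUEUE_NAME, on_message_callback=handle)
    channel.start_consuming()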
Example #19
    def download(self):
        """Download the specified file"""

        def total_seconds(td):
            # Keep backward compatibility with Python 2.6 which doesn't have
            # this method
            if hasattr(td, 'total_seconds'):
                return td.total_seconds()
            else:
                return (td.microseconds +
                        (td.seconds + td.days * 24 * 3600) * 10 ** 6) / 10 ** 6

        attempt = 0

        if not os.path.isdir(self.directory):
            os.makedirs(self.directory)

        # Don't re-download the file
        if os.path.isfile(os.path.abspath(self.target)):
            self.logger.info("File has already been downloaded: %s" %
                             (self.target))
            return

        self.logger.info('Downloading from: %s' %
                         (urllib.unquote(self.final_url)))
        self.logger.info('Saving as: %s' % self.target)

        tmp_file = self.target + ".part"

        while True:
            attempt += 1
            try:
                start_time = datetime.now()

                # Enable streaming mode so we can download content in chunks
                r = requests.get(self.final_url, stream=True,
                                 auth=self.authentication)
                r.raise_for_status()

                content_length = r.headers.get('Content-length')
                # ValueError: Value out of range if only total_size given
                if content_length:
                    total_size = int(content_length.strip())
                    max_value = ((total_size / CHUNK_SIZE) + 1) * CHUNK_SIZE

                bytes_downloaded = 0

                log_level = self.logger.getEffectiveLevel()
                if log_level <= mozlog.INFO and content_length:
                    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(),
                               ' ', pb.FileTransferSpeed()]
                    pbar = pb.ProgressBar(widgets=widgets,
                                          maxval=max_value).start()

                with open(tmp_file, 'wb') as f:
                    for chunk in iter(lambda: r.raw.read(CHUNK_SIZE), ''):
                        f.write(chunk)
                        bytes_downloaded += CHUNK_SIZE

                        if log_level <= mozlog.INFO and content_length:
                            pbar.update(bytes_downloaded)

                        t1 = total_seconds(datetime.now() - start_time)
                        if self.timeout_download and \
                                t1 >= self.timeout_download:
                            raise TimeoutError

                if log_level <= mozlog.INFO and content_length:
                    pbar.finish()
                break
            except (requests.exceptions.RequestException, TimeoutError), e:
                if tmp_file and os.path.isfile(tmp_file):
                    os.remove(tmp_file)
                if self.retry_attempts > 0:
                    # Log only if multiple attempts are requested
                    self.logger.warning('Download failed: "%s"' % str(e))
                    self.logger.info('Will retry in %s seconds...' %
                                     (self.retry_delay))
                    time.sleep(self.retry_delay)
                    self.logger.info("Retrying... (attempt %s)" % attempt)
                if attempt >= self.retry_attempts:
                    raise
                time.sleep(self.retry_delay)
Example #20
def analyse_all_genomes(genomes, dbpath, tmp_path, nbn, soft, logger, quiet=False):
    """

    Parameters
    ----------
    genomes : dict
        {genome: spegenus.date}
    dbpath : str
        path to folder containing genomes
    tmp_path : str
        path to put out files
    nbn : int
        minimum number of 'N' required to cut into a new contig
    soft : str
        soft used (prokka, prodigal, or None if called by prepare module)
    logger : logging.Logger
        logger object used to write log information. It is passed explicitly because this
        function can be called from the prepare module, where the sub-logger name is
        different
    quiet : bool
        True if nothing must be written to stdout/stderr, False otherwise

    Returns
    -------
    int
        0 on success; `genomes` is updated in place to
        {genome: [spegenus.date, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}

    """
    cut = nbn > 0
    pat = None  # pattern with which sequences will be cut (set below if cutting is enabled)
    if cut:
        pat = 'N' * nbn + "+"
    nbgen = len(genomes)
    bar = None
    curnum = None
    if cut:
        logger.info(("Cutting genomes at each time there are at least {} 'N' in a row, "
                     "and then, calculating genome size, number of contigs and L90.").format(nbn))
    else:
        logger.info("Calculating genome size, number of contigs, L90")
    if not quiet:
        # Create progressbar
        widgets = ['Analysis: ', progressbar.Bar(marker='█', left='', right=''),
                   ' ', progressbar.Counter(), "/{}".format(nbgen), ' (',
                   progressbar.Percentage(), ') - ', progressbar.Timer(), ' - ',
                   progressbar.ETA()
                   ]
        bar = progressbar.ProgressBar(widgets=widgets, max_value=nbgen, term_width=79).start()
        curnum = 1
    toremove = []
    # Analyse genomes 1 by 1
    for genome, name in genomes.items():
        # If not quiet option, show progress bar
        if not quiet:
            bar.update(curnum)
            curnum += 1
        # analyse genome, and check everything went well.
        # exception if binary file
        try:
            res = analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger=logger)
        except UnicodeDecodeError:
            logger.warning(f"'{genome}' does not seem to be a fasta file. It will be ignored.")
            res = False
        # Problem while analysing genome -> genome ignored
        if not res:
            toremove.append(genome)
    # If there are some genomes to remove (analysis failed), remove them from genomes dict.
    if toremove:
        for gen in toremove:
            del genomes[gen]
    if not genomes:
        logger.error(f"No genome was found in the database folder {dbpath}. See logfile "
                     "for more information.")
        sys.exit(1)
    if not quiet:
        bar.finish()
    return 0
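
# Usage sketch (assumption): the genome file names, folders and logger name below are
# placeholders; they only illustrate the expected argument shapes for
# analyse_all_genomes().
if __name__ == '__main__':
    import logging
    logging.basicConfig(level=logging.INFO)
    genomes = {"genome1.fasta": "ESCO.0421", "genome2.fasta": "KLPN.0421"}
    analyse_all_genomes(genomes, dbpath="./db", tmp_path="./tmp", nbn=5,
                        soft="prokka", logger=logging.getLogger("annotate"),
                        quiet=False)
    # on success, `genomes` has been updated in place with
    # [spegenus.date, orig_name, path_to_seq, size, nbcont, l90] per genome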
Example #21
def gitsearch():
    # This part contains the main code.

    path_place = '/home/shaaran/Downloads/Obama_out_-_President_Barack_Obama_s_hilarious_final_White_House_correspondents_dinner_speech-youtube-NxFkEj7KPC0-43-0-301.mp4'  #file destination
    video_capture = cv2.VideoCapture(
        path_place
    )  # opens the video; for an external webcam use index 1 or 2 (trial and error), otherwise pass the downloaded video via path_place
    detector = dlib.get_frontal_face_detector(
    )  # pretrained model for detecting frontal faces
    predict_path = '/home/shaaran/PycharmProjects/shape_predictor_68_face_landmarks.dat'
    predictor = dlib.shape_predictor(predict_path)  # initializing the predictor
    count = 0  # counter for loop
    tfms = tfms_from_model(
        resnet34, sz, aug_tfms=transforms_side_on, max_zoom=1.1
    )  # transformations for getting a large and varied dataset from a small dataset
    data = ImageClassifierData.from_paths(PATH,
                                          tfms=tfms)  # apply transforms to data
    print(data.classes)  #prints the available emotions

    learn = ConvLearner.pretrained(
        arch, data, precompute=True)  # uses pretrained, precomputed activations first
    print('loading requirements......')
    print(
        'This has been made by shaaran alias devshaaran, if you are using this code anywhere for research or educational purposes, please give reference. ENJOY!'
    )
    learn.precompute = False  # disable precomputed activations so augmented data is used
    #learn.fit(1e-1, 1)
    learn.fit(1e-1, 3, cycle_len=1)  #model is fit
    learn.load('224_all')
    print('loading done !')

    #progress bar for all emotions *Incomplete*
    bar_happy = progressbar.ProgressBar(maxval=1,
                                        widgets=[
                                            progressbar.Bar('=', '[', ']'),
                                            'happy',
                                            progressbar.Percentage()
                                        ])
    bar_neutral = progressbar.ProgressBar(maxval=1,
                                          widgets=[
                                              progressbar.Bar('=', '[', ']'),
                                              'neutral',
                                              progressbar.Percentage()
                                          ])
    bar_sad = progressbar.ProgressBar(maxval=1,
                                      widgets=[
                                          progressbar.Bar('=', '[', ']'),
                                          'sad',
                                          progressbar.Percentage()
                                      ])
    bar_surprise = progressbar.ProgressBar(maxval=1,
                                           widgets=[
                                               progressbar.Bar('=', '[', ']'),
                                               'surprise',
                                               progressbar.Percentage()
                                           ])
    bar_happy.start()
    bar_neutral.start()
    bar_sad.start()
    bar_surprise.start()

    # Initialize some variables
    face_locations = []

    while True:
        # Grab a single frame of video
        ret, frame = video_capture.read()

        # Resize frame of video to half size for faster face detection processing
        small_frame = cv2.resize(frame, (0, 0), fx=0.50, fy=0.50)
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        # detect faces in the grayscale image
        rects = detector(gray, 1)

        for (i, rect) in enumerate(rects):
            # determine the facial landmarks for the face region, then
            # convert the facial landmark (x, y)-coordinates to a NumPy
            # array
            shape = predictor(gray, rect)
            shape = face_utils.shape_to_np(shape)
            for (x, y) in shape:
                cv2.circle(frame, (x, y), 1, (0, 0, 255), -1)

        # Find all the faces and face encodings in the current frame of video
        face_locations = face_recognition.face_locations(small_frame,
                                                         model="cnn")
        counts = 0
        counts += 1

        # Display the results
        for top, right, bottom, left in face_locations:
            # Scale back up face locations since the frame we detected in was scaled down
            top *= 2
            right *= 2
            bottom *= 2
            left *= 2

            lower_red = np.array([0, 0, 253])
            upper_red = np.array([0, 0, 255])

            # Extract the region of the image that contains the face
            face_image = frame[top:bottom, left:right]
            mask = cv2.inRange(face_image, lower_red, upper_red)
            res = cv2.bitwise_and(face_image, face_image, mask=mask)
            cv2.imshow('vid', face_image)
            cv2.imshow('res', res)
            count += 1
            cv2.imwrite('0.jpg', res)
            #cv2.imwrite((output_loc + '\\' + str(count)+ str(counts) + '.jpg'), res)

            try:

                # learn = ConvLearner.pretrained(arch, data, precompute=True)
                trn_tfms, val_tfms = tfms_from_model(arch, sz)
                im = val_tfms(open_image('0.jpg'))
                learn.precompute = False
                preds = learn.predict_array(im[None])

                #Use below only for debuggng !
                #print(preds)
                #print(np.exp(preds)[0][0])
                #qprint(data.classes[np.argmax(preds)])

                #updating the percentages

                bar_happy.update(np.exp(preds[0][0]))
                bar_sad.update(np.exp(preds[0][2]))
                bar_neutral.update(np.exp(preds[0][1]))
                bar_surprise.update(np.exp(preds[0][3]))

                #put text on video
                cv2.putText(
                    frame,
                    'happy : ' + str(int(np.exp(preds[0][0]) * 100)) + '%',
                    (top - 40, left - 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                    (0, 255, 0), 1)
                cv2.putText(
                    frame,
                    'neutral : ' + str(int(np.exp(preds[0][1]) * 100)) + '%',
                    (top - 40, left), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                    (0, 255, 0), 1)
                cv2.putText(
                    frame,
                    'sad : ' + str(int(np.exp(preds[0][2]) * 100)) + '%',
                    (top - 40, left + 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                    (0, 255, 0), 1)
                cv2.putText(
                    frame,
                    'surprise : ' + str(int(np.exp(preds[0][3]) * 100)) + '%',
                    (top - 40, left + 60), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                    (0, 255, 0), 1)

            except Exception as e:
                print(e)

        cv2.imshow('Video', frame)  #shows image
        if cv2.waitKey(1) & 0xFF == ord('q'):
            bar_surprise.finish()
            bar_neutral.finish()
            bar_sad.finish()
            bar_happy.finish()
            break
    video_capture.release()
    cv2.destroyAllWindows()
Example #22
    def generate_fill(self, max_length=None, no_duration=False, verbose=1):
        """

        :param max_length:
        :param no_duration:
        :param verbose:
        :return:
        """
        # ----- Parameters -----
        max_length = 300 / self.step_length if max_length is None else max_length

        # ----- Variables -----
        if self.data_transformed_path is None and self.data_test_transformed_path is None:
            raise Exception(
                'Some data need to be loaded before comparing the generation')
        path = self.data_transformed_path if self.data_test_transformed_path is None else self.data_test_transformed_path
        sequence = Sequences.KerasSequence(
            path=path,
            nb_steps=self.nb_steps,
            batch_size=1,
            work_on=self.work_on
        )  # Return array instead of list (for instruments)
        max_length = int(min(max_length, len(sequence)))
        nb_instruments = sequence.nb_instruments
        # ----- Seeds -----
        truth = sequence[0][0]
        filled_list = [np.copy(truth) for inst in range(nb_instruments)]
        mask = np.ones((nb_instruments, nb_instruments, self.nb_steps))
        for inst in range(nb_instruments):
            filled_list[inst][inst] = 0
            mask[inst, inst] = 0

        # ----- Generation -----
        cprint('Start generating (fill) ...', 'blue')
        bar = progressbar.ProgressBar(maxval=max_length,
                                      widgets=[
                                          progressbar.Bar('=', '[', ']'), ' ',
                                          progressbar.Percentage(), ' ',
                                          progressbar.ETA()
                                      ])
        bar.start()  # To see it working
        for l in range(max_length):
            s_input, s_output = sequence[l]
            to_fill_list = [np.copy(s_input) for inst in range(nb_instruments)]
            for inst in range(nb_instruments):
                to_fill_list[inst][inst] = 0
            nn_input = np.concatenate(
                tuple(to_fill_list), axis=1
            )  # (nb_instruments, batch=nb_instruments, nb_steps, step_size, input_size, channels)
            preds = self.keras_nn.generate(input=list(nn_input) + [mask])

            preds = np.asarray(preds).astype(
                'float64'
            )  # (nb_instruments, bath=nb_instruments, nb_steps=1, step_size, input_size, channels)
            if len(preds.shape
                   ) == 5:  # Only one instrument : output of nn not a list
                preds = np.expand_dims(preds, axis=0)
            if len(s_output.shape
                   ) == 5:  # Only one instrument : output of nn not a list
                s_output = np.expand_dims(s_output, axis=0)
            preds = midi.create.normalize_activation(
                preds, mono=self.mono, use_binary=self.use_binary)
            truth = np.concatenate((truth, s_output), axis=2)
            for inst in range(nb_instruments):
                p = np.copy(s_output)
                p[inst] = np.take(preds, axis=1, indices=[inst])[inst]
                filled_list[inst] = np.concatenate(
                    (filled_list[inst], p), axis=2
                )  # (nb_instruments, batch=1, nb_steps, step_size, input_size, channels)
            bar.update(l + 1)
        bar.finish()

        # -------------------- Compute notes list --------------------
        # ----- Reshape -----
        truth = self.reshape_generated_array(truth)
        for inst in range(nb_instruments):
            filled_list[inst] = self.reshape_generated_array(filled_list[inst])
        self.save_midis_path.mkdir(parents=True, exist_ok=True)
        accuracies, accuracies_inst = self.compute_generated_array(
            generated_array=truth,
            folder_path=self.save_midis_path,
            name='generated_fill_truth',
            no_duration=no_duration,
            verbose=verbose,
            save_images=True)
        accuracies, accuracies_inst = [accuracies], [accuracies_inst]
        for inst in range(nb_instruments):
            acc, acc_inst = self.compute_generated_array(
                generated_array=filled_list[inst],
                folder_path=self.save_midis_path,
                name=f'generated_fill_{inst}',
                no_duration=no_duration,
                array_truth=truth,
                verbose=verbose,
                save_truth=False,
                save_images=True)
            accuracies.append(acc)
            accuracies_inst.append(acc_inst)

        # Save the image of all in a subplot to allow easier comparisons
        self.save_generated_arrays_cross_images(
            generated_arrays=[truth] + filled_list,
            folder_path=self.save_midis_path,
            name=f'generated_fill_all',
            replicate=False,
            titles=['Truth'] +
            [f'Fill Inst {i}' for i in range(nb_instruments)],
            subtitles=[
                f'Acc: {accuracies_inst[i][int(max(0, i - 1))]}'
                for i in range(nb_instruments + 1)
            ]  # Truth is in it
        )

        # Save the summary of the generation
        summary.summarize(
            # Function parameters
            path=self.save_midis_path,
            title=self.full_name,
            file_name='generate_fill_summary.txt',
            # Summary parameters
            length=max_length,
            no_duration=no_duration,
            # Generic Summary
            **self.summary_dict)

        cprint('Done generating (fill)', 'green')
Example #23
    def runStrategy(self):

        if (debug):
            print 'Started computing trades'

        dataList = self.reader.dataList

        # Hardcoded due to the strategy used
        # Trading starts in January
        # startId = int(3 * 30 * jumpSize * 0.7)
        startId = 0
        endId = dataList['open'].shape[0]
        # endId   = startId + 50
        currPos = 0 * dataList['open'].iloc[0]
        self.currQty = 0.0 * dataList['open'].iloc[0]
        self.currQty.fillna(0, inplace=True)
        logList = []

        print dataList['open'].shape, startId, endId

        print 'Current trading progress:'
        bar = progressbar.ProgressBar(maxval = endId, \
                          widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
        bar.start()

        tradeId = startId

        while (tradeId < endId):

            # Since the first column (and index) is time
            currTime = dataList['open'].iloc[tradeId].name

            if (not (self.startTime <= currTime <= self.endTime)):
                tradeId += self.tradeFreq
                continue

            currTimeStamp = datetime.datetime.fromtimestamp(currTime)
            currDay = currTimeStamp.date()
            currHour = currTimeStamp.time().hour
            currMins = currTimeStamp.time().minute

            if (tradeId % 25000 == 0):
                print currTimeStamp
                # print 'Trade Cost:', self.tradeCost
                # print 'Current Qty:', np.array(self.currQty)
                print 'Current Budget:', self.currBudget
                print 'Current Val:', self.currVal
                print 'NumTrades:', self.cntTrades
                print 'Avg. Profit:', self.currVal, self.initBudget
                print 'Avg. Profit:', ((
                    (self.currVal - self.initBudget) / 1000.0) /
                                       (self.cntTrades + eps)) * 1e4

            if (self.dailyTrade):
                lowTime = deepcopy(currTimeStamp).replace(hour=9,
                                                          minute=(15 +
                                                                  self.window))
                highTime = deepcopy(lowTime) + datetime.timedelta(
                    minutes=(self.holdPeriod * self.tradeFreq))
                # print currTimeStamp, lowTime, highTime, (lowTime <= currTimeStamp <= highTime)
                if (not (lowTime <= currTimeStamp <= highTime)):
                    tradeId += 1
                    continue

            if (traceFlag):
                print 'generateSignal: START'
            currOrder = self.strategy.generateSignal(currTime)
            if (traceFlag):
                print 'generateSignal: END'

            # Saving the current day price
            currPrice = pd.DataFrame(0,
                                     index=dataList['open'].columns,
                                     columns=['open', 'close'])
            currPrice['open'] = dataList['open'].iloc[tradeId]
            currPrice['close'] = dataList['close'].iloc[tradeId]

            self.cntTrades += (np.sum(np.abs(currOrder['qty']) > eps))

            if (debug):
                print currOrder

            # If we want to manipulate positions and other quantities
            # currOrder = self.processOrders(currOrder, currPrice, currPos)
            currOrder = self.processOrdersOptimized(currOrder, currPrice,
                                                    currPos)
            currPos = currOrder['position']

            if (traceFlag):
                print 'loggingComputation: START'

            # Compute step statistics
            self.tradeCost = (currOrder['qty'] * currOrder['signal'] *
                              currPrice['open']).sum()
            self.currQty += currOrder['qty'] * currOrder['signal']
            self.currBudget -= self.tradeCost
            self.currVal = (self.currQty *
                            currPrice['close']).sum() + self.currBudget

            if (debug):
                # print currTime
                # print currPrice
                # print currOrder
                # print 'Trade Cost:', self.tradeCost
                # print 'Current Qty:', self.currQty
                print 'Current Budget:', self.currBudget
                print 'Current Val:', self.currVal
                # raw_input('WAIT')

            logList.append([currTime, currOrder, currPrice])

            bar.update(tradeId)

            tradeId += self.tradeFreq

            # Denotes whether we are currently holding any positions or not if daily trade
            # Helps in deciding how much to move at the end of the hold period
            if (self.dailyTrade):
                if (currTimeStamp == highTime):
                    tradeId -= self.tradeFreq
                    tradeId += 1

            if (traceFlag):
                print 'loggingComputation: END'

        # Flush i.e. empty your positions
        if (self.flush):
            currTime = dataList['open'].iloc[endId - 1].name

            currPrice = pd.DataFrame(0,
                                     index=dataList['open'].columns,
                                     columns=['open', 'close'])
            currPrice['open'] = dataList['open'].iloc[endId - 1]
            currPrice['close'] = dataList['close'].iloc[endId - 1]

            currOrder = pd.DataFrame(0,
                                     index=dataList['open'].columns,
                                     columns=['signal', 'qty'])
            currOrder['signal'] = np.sign(-self.currQty)
            currOrder['qty'] = np.abs(-self.currQty)

            logList.append([currTime, currOrder, currPrice])

        bar.finish()

        print 'Finished running the strategy on', len(logList), 'timestamps'

        self.logOrders(logList)

        print 'FINAL STATS:'
        # print 'Trade Cost:', self.tradeCost
        # print 'Current Qty:', np.array(self.currQty)
        print 'Current Budget:', self.currBudget
        print 'Current Val:', self.currVal
        print 'NumTrades:', self.cntTrades
        print 'Avg. Profit:', (((self.currVal - self.initBudget) / 100.0) /
                               (self.cntTrades + eps)) * 1e4

        if (debug):
            print 'Finished computing trades'
            raw_input('Finished Logging computed trades (Enter to continue):')
Example #24
    def compare_generation(self,
                           max_length=None,
                           no_duration=False,
                           verbose=1):
        """

        :return:
        """
        # -------------------- Find informations --------------------
        if self.data_transformed_path is None and self.data_test_transformed_path is None:
            raise Exception(
                'Some data need to be loaded before comparing the generation')
        path = self.data_transformed_path if self.data_test_transformed_path is None else self.data_test_transformed_path
        self.sequence = Sequences.AllInstSequence(path=path,
                                                  nb_steps=self.nb_steps,
                                                  batch_size=1,
                                                  work_on=self.work_on,
                                                  noise=0)
        max_length = len(self.sequence) if max_length is None else min(
            max_length, len(self.sequence))
        max_length = min(max_length, 10)

        # -------------------- Construct seeds --------------------
        generated = np.array(
            self.sequence[0][0]
        )  # (nb_instrument, 1, nb_steps, step_size, input_size, 2) (1=batch)
        generated_helped = np.copy(
            generated)  # Each step will take the truth as an input
        generated_truth = np.copy(generated)  # The truth

        mask = self.get_mask(self.sequence.nb_instruments, batch_size=2)

        # -------------------- Generation --------------------
        cprint('Start comparing generation ...', 'blue')
        bar = progressbar.ProgressBar(maxval=max_length,
                                      widgets=[
                                          progressbar.Bar('=', '[', ']'), ' ',
                                          progressbar.Percentage(), ' ',
                                          progressbar.ETA()
                                      ])
        bar.start()  # To see it working
        for l in range(max_length):
            ms_input, ms_output = self.sequence[l]
            sample = np.concatenate(
                (generated[:, :, l:], np.array(ms_input)), axis=1
            )  # (nb_instruments, 2, nb_steps, step_size, input_size, 2)

            # Generation
            preds = self.keras_nn.generate(input=list(sample) + mask)

            # Reshape
            preds = np.asarray(preds).astype(
                'float64'
            )  # (nb_instruments, batch=2, nb_steps=1, length, 88, 2)
            preds_truth = np.array(
                ms_output)  # (nb_instruments, 1, 1, step_size, input_size, 2)
            # if only one instrument
            if len(preds.shape
                   ) == 5:  # Only one instrument : output of nn not a list
                preds = np.expand_dims(preds, axis=0)
            if len(preds_truth.shape
                   ) == 5:  # Only one instrument : output of nn not a list
                preds_truth = np.expand_dims(preds_truth, axis=0)
            preds = midi.create.normalize_activation(
                preds, mono=self.mono,
                use_binary=self.use_binary)  # Normalize the activation part
            preds_helped = preds[:,
                                 [1]]  # (nb_instruments, 1, 1, length, 88, 2)
            preds = preds[:, [0]]

            # Concatenation
            generated = np.concatenate(
                (generated, preds),
                axis=2)  # (nb_instruments, 1, nb_steps, length, 88, 2)
            generated_helped = np.concatenate(
                (generated_helped, preds_helped),
                axis=2)  # (nb_instruments, 1, nb_steps, length, 88, 2)
            generated_truth = np.concatenate((generated_truth, preds_truth),
                                             axis=2)
            bar.update(l + 1)
        bar.finish()

        # -------------------- Compute notes list --------------------
        # Generated
        generated_midi_final = self.reshape_generated_array(generated)
        # Helped
        generated_midi_final_helped = self.reshape_generated_array(
            generated_helped)
        # Truth
        generated_midi_final_truth = self.reshape_generated_array(
            generated_truth)

        # ---------- find the name for the midi_file ----------
        self.save_midis_path.mkdir(parents=True, exist_ok=True)

        accuracies, accuracies_inst = [], []
        # Generated
        acc, acc_inst = self.compute_generated_array(
            generated_array=generated_midi_final,
            folder_path=self.save_midis_path,
            name='compare_generation_alone',
            no_duration=no_duration,
            array_truth=generated_midi_final_truth,
            verbose=verbose,
            save_truth=False,
            save_images=True)
        accuracies.append(acc)
        accuracies_inst.append(acc_inst)
        # Helped
        acc, acc_inst = self.compute_generated_array(
            generated_array=generated_midi_final_helped,
            folder_path=self.save_midis_path,
            name='compare_generation_helped',
            no_duration=no_duration,
            array_truth=generated_midi_final_truth,
            verbose=verbose,
            save_truth=False,
            save_images=True)
        accuracies.append(acc)
        accuracies_inst.append(acc_inst)
        # Truth
        self.compute_generated_array(
            generated_array=generated_midi_final_truth,
            folder_path=self.save_midis_path,
            name='compare_generation_truth',
            no_duration=no_duration,
            array_truth=None,
            verbose=verbose,
            save_truth=False,
            save_images=True)
        accuracies.append(acc)
        accuracies_inst.append(acc_inst)

        # Save the image of all in a subplot to allow easier comparisons
        self.save_generated_arrays_cross_images(
            generated_arrays=[
                generated_midi_final_truth, generated_midi_final_helped,
                generated_midi_final
            ],
            folder_path=self.save_midis_path,
            name=f'compare_generation_all',
            replicate=False,
            titles=['Truth', 'Helped', 'Alone'],
            subtitles=[
                'Acc : 1',
                f'Acc: {accuracies[1]:.3}, Acc_inst: [{", ".join([f"{a:.3}" for a in accuracies_inst[1]])}]',
                f'Acc: {accuracies[0]:.3}, Acc_inst: [{", ".join([f"{a:.3}" for a in accuracies_inst[0]])}]'
            ]  # Truth is in it
        )

        # ----- Summarize the generation -----

        # Creation of the summary .txt file
        summary.summarize(
            # Function parameters
            path=self.save_midis_path,
            title=self.full_name,
            file_name='compare_generation_summary.txt',
            # Summary paramters,
            length=max_length,
            no_duration=no_duration,
            generated_accuracy=accuracies[0],
            generated_accuracies=accuracies_inst[0],
            helped_accuracy=accuracies[1],
            helped_accuracies=accuracies_inst[1],
            # Generic Summary
            **self.summary_dict)

        cprint('Done comparing generation', 'green')
Example #25
args1 = parser.parse_args()
print(args1)

dt = [1, 3, 4, 6, 12, 24, 48]  # [24, 48]  # discretization unit in hours

dw = [0, 1, 3, 5, 7]  # length of the sliding window of previous features, in units of dt

create = False

all_combs = list(itertools.product(dt, dw))
pdb.set_trace()
bar = progressbar.ProgressBar(
    maxval=len(all_combs),
    widgets=[progressbar.Bar('=', '[', ']'), ' ',
             progressbar.Percentage()])
bar.start()
n_comb = 0
args = Args(0, 0, "", 0, "", False, "")

for comb in all_combs:
    dt = comb[0]
    dw = comb[1]
    bar.update(n_comb + 1)
    n_comb += 1

    if dt == 1 and dw == 0:
        continue
    #if dw==0: #and dt!=1:
    #    create=True
    #if dt!=1:
Example #26
    def redo_song_generate(self,
                           song_number=None,
                           instrument_order=None,
                           no_duration=False,
                           save_images=True,
                           noise=0):
        """

        :param instrument_order: The order of the instruments to replace
        :param song_number: The number of the song in the dataset
        :param no_duration:
        :param save_images:
        :param noise:
        :return:
        """
        path = self.data_transformed_path if self.data_test_transformed_path is None else self.data_test_transformed_path
        self.sequence = Sequences.KerasSequence(
            path=path,
            nb_steps=self.nb_steps,
            batch_size=1,
            work_on=self.work_on,
            noise=noise,
            replicate=False,
            predict_offset=self.predict_offset)
        song_number = np.random.randint(
            self.sequence.nb_songs) if song_number is None else song_number
        instrument_order = np.random.permutation(
            self.nb_instruments
        ) if instrument_order is None else instrument_order
        all_arrays = []
        # Construct the truth array
        x, y = self.sequence.get_all_song(song_number=song_number,
                                          in_batch_format=False)
        # x: (nb_instruments, batch=1, nb_steps, step_size, input_size, channels)]
        # y: (nb_instruments, batch=1, nb_steps, step_size, input_size, channels)]
        # x and y are the same except that in x, there is some noise
        length = self.sequence.get_song_len(song_number)
        if length == 0:
            # It means the len of the song is < nb_steps
            shape = (*(x.shape[:2]), self.nb_steps + 1, *(x.shape[3:]))
            zeros = np.zeros(
                shape
            )  # (nb_instruments, batch, nb_steps, step_size, input_size, channels)
            zeros[:, :, -x.shape[2]:] = x
            x = zeros
            length = 1
        truth = x
        # truth: (nb_instruments, batch=1, len_song, step_size, input_size, channels)

        all_arrays.append(truth)
        cprint('Start redoing song (generate) ...', 'blue')
        bar = progressbar.ProgressBar(maxval=length * self.nb_instruments,
                                      widgets=[
                                          progressbar.Bar('=', '[', ']'), ' ',
                                          progressbar.Percentage(), ' ',
                                          progressbar.ETA()
                                      ])
        bar.start()  # To see it working
        for instrument in range(len(instrument_order)):
            # We replace the instruments one by one
            instrument_to_remove = instrument_order[instrument]
            generated = np.copy(all_arrays[-1])
            # generated : (nb_instruments, batch=1, nb_steps, step_size, input_size, channels)
            generated[instrument_to_remove] = 0
            for step in range(length):
                inputs = np.take(generated,
                                 axis=2,
                                 indices=range(step, step + self.nb_steps))
                # inputs = (nb_instruments, batch=1, nb_steps, step_size, input_size, channels)]
                mask = self.get_mask()  # (batch=1, nb_instruments, nb_steps)
                # Remove the instrument from the input
                mask[0][:, instrument_to_remove] = 0
                inputs[instrument_to_remove] = 0
                preds = np.asarray(
                    self.keras_nn.generate(input=list(inputs) +
                                           mask)).astype('float64')
                # preds: (nb_instruments, batch=1, nb_steps=1, step_size, input_size, channels)
                preds = midi.create.normalize_activation(
                    preds, mono=self.mono, use_binary=self.use_binary)
                preds_index = step + self.nb_steps + (self.predict_offset - 1)
                generated[instrument_to_remove, :, preds_index:preds_index +
                          1] = preds[instrument_to_remove]

                bar.update(instrument * length + step)
            if self.mono:
                # If mono we say the first measures have no notes
                generated[instrument_to_remove, :, :self.nb_steps, :, -1] = 1
                # generated : (nb_instruments, batch=1, nb_steps, step_size, input_size, channels)

            all_arrays.append(generated)
            # all_arrays: List(nb_instruments + 1)[(nb_instruments, batch=1, nb_steps, step_size, input_size, channels)]
        bar.finish()

        self.save_midis_path.mkdir(exist_ok=True, parents=True)
        generated_midi = [
            self.reshape_generated_array(arr) for arr in all_arrays
        ]
        # Save the truth
        accuracies, accuracies_inst = self.compute_generated_array(
            generated_array=generated_midi[0],
            folder_path=self.save_midis_path,
            name='redo_song_generate_truth',
            no_duration=no_duration,
            save_images=save_images,
        )
        accuracies, accuracies_inst = [accuracies], [accuracies_inst]
        for inst in range(self.nb_instruments):
            acc, acc_inst = self.compute_generated_array(
                generated_array=generated_midi[inst + 1],
                folder_path=self.save_midis_path,
                name=
                f'redo_song_generate_{inst}_(inst_{instrument_order[inst]})',
                no_duration=no_duration,
                array_truth=generated_midi[0],
                save_images=save_images,
                save_truth=False,
            )
            accuracies.append(acc)
            accuracies_inst.append(acc_inst)

        if self.batch is not None:
            self.sequence.batch_size = self.batch

        self.save_generated_arrays_cross_images(
            generated_arrays=generated_midi,
            folder_path=self.save_midis_path,
            name='redo_song_all',
            titles=['Truth'] + [
                f'Iteration {i}: change inst {instrument_order[i]}'
                for i in range(self.nb_instruments)
            ],
            subtitles=[
                f'Acc: {accuracies[i]}, Acc inst: {accuracies_inst[i]}'
                for i in range(self.nb_instruments + 1)
            ])

        summary.summarize(
            # Function params
            path=self.save_midis_path,
            title=self.full_name,
            file_name='redo_song_replicate_summary.txt',
            # Summary params
            song_number=song_number,
            instrument_order=instrument_order,
            no_duration=no_duration,
            noise=noise,
            # Generic summary
            **self.summary_dict)

        cprint('Done redo song generate', 'green')
Example #27
def main():
    browser = create_client()

    conn = sqlite3.connect('links.db')
    conn.row_factory = sqlite3.Row
    videos_info = conn.execute(
        f'select * from videos where downloaded = 0 and download_forbidden isnull'
    ).fetchall()
    widgets = [
        progressbar.Percentage(), ' ',
        progressbar.Counter(), ' ',
        progressbar.Bar(), ' ',
        progressbar.FileTransferSpeed()
    ]
    pbar = progressbar.ProgressBar(widgets=widgets,
                                   max_value=len(videos_info)).start()

    for i, video_info in enumerate(videos_info):
        pbar.update(i)
        video_info = dict(video_info)
        video_id = video_info['video_id']
        browser.visit(video_info['video_url'])

        while browser.is_element_present_by_css(
                '.recaptchaContent'):  # sometimes wild captcha appears
            print("CAPTCHA NEEDED")
            sleep(60)

        if browser.is_element_present_by_css('.removed'):
            # video has been removed
            print('video has been removed\n')
            with conn:
                conn.execute(
                    f'UPDATE videos SET download_forbidden = 1 where video_id = "{video_id}"'
                )
            continue

        if not browser.is_element_present_by_css(
                '.premiumIconTitleOnVideo:visible'
        ) and not browser.is_element_present_by_css('#videoTitle'):
            # video is broken and not premium
            print('video is somehow broken and not premium\n')
            with conn:
                conn.execute(
                    f'UPDATE videos SET download_forbidden = 1 where video_id = "{video_id}"'
                )
            continue

        video_title = browser.find_by_css('#videoTitle').text  # type: str
        # strip characters that are not allowed in Windows file names
        video_title = video_title.replace(':', '').replace('?', '').replace('*', '').replace('"', '').replace('/', '') \
            .replace('\\', '')
        browser.find_by_id('player').click()  # pausing video
        browser.find_by_tag('body')._element.send_keys('M')  # muting video

        file_name = f'videos/{video_id}-{video_title}.mp4'
        if osp.exists(file_name):
            with conn:
                conn.execute(
                    f'UPDATE videos SET downloaded = 1 where video_id = "{video_id}"'
                )
            continue

        if browser.is_element_present_by_css(
                '.tab-menu-item.js-paidDownload[data-tab="download-tab"]'):
            # video download requires payment
            print('video download is paid\n')
            with conn:
                conn.execute(
                    f'UPDATE videos SET download_forbidden = 1 where video_id = "{video_id}"'
                )
            continue

        download_tab_button_sel = '.tab-menu-item[data-tab="download-tab"]'
        vr_tab_button_sel = '.tab-menu-item[data-tab="vr-tab"]'
        if not browser.is_element_present_by_css(download_tab_button_sel) \
                and browser.is_element_present_by_css(vr_tab_button_sel):
            # VR video, no download tab available
            print('video is vr, no download\n')
            with conn:
                conn.execute(
                    f'UPDATE videos SET download_forbidden = 1 where video_id = "{video_id}"'
                )
            continue

        click_download_tab(browser, download_tab_button_sel)

        if is_download_forbidden(browser, conn, video_id):
            continue

        download_link = get_download_link(browser)
        # headers must be set here, otherwise it behaves like the API and does not serve the video
        for _ in range(5):
            try:
                request.urlretrieve(download_link, file_name)
                break
            except URLError:
                print('connection failed, trying again\n')

        print(file_name, 'downloaded\n')
        with conn:
            conn.execute(
                f'UPDATE videos SET downloaded = 1 where video_id = "{video_id}"'
            )

    pbar.finish()
    print('done')
# Print input file in human-readable format
print("Input (.csv) file size: ", sizeof_fmt(statinfo_in.st_size))

# Determine number of lines in input file to be used for progress bar
fname = inFile
num_lines = 0
with open(fname, 'r') as f:
    for line in f:
        num_lines += 1
print("Number of lines in csv file:", num_lines)

# Set widgets for progress bar
widgets = [
    'Converting csv to json. Percentage completed:',
    pb.Percentage(), ' ',
    pb.Bar(marker='█'), ' ',
    pb.ETA()
]

# Create progress bar and initialize
bar = pb.ProgressBar(widgets=widgets, maxval=num_lines).start()


def csv_to_json(csv_path, json_path):
    # Open connection to csv file
    with open(csv_path, 'r') as csv_file:
        # Create a DictReader that yields one dict per csv row
        reader = csv.DictReader(csv_file)
        # Open output json file
        with open(json_path, 'w') as json_file:
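            # NOTE: a minimal sketch of the remaining conversion (assumed, not
            # the original code); it would collect the rows, update the
            # module-level progress bar, and dump them as a JSON array
            # (requires `import json`):
            #   rows = []
            #   for i, row in enumerate(reader, start=1):
            #       rows.append(row)
            #       bar.update(i)
            #   json.dump(rows, json_file, indent=4)
            #   bar.finish()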
START_IDX = 0
VERBOSE = False
TEST = True

output = 'auto_spaced_review_' + str(START_IDX) + '-%s.json'

texts = texts[START_IDX:]

start_idx = 0
data = ""
next_text = ""

import progressbar
bar = progressbar.ProgressBar(maxval=len(texts),
                              widgets=[progressbar.Bar('=', '[', ']'), ' ',
                                       progressbar.Percentage()])

is_saved = False
try:
    for idx, text in enumerate(texts):
        bar.update(idx + 1)

        data += text + '\r\n'
        if len(data) < 1000000:
            continue

        new_text = auto_spacing(data).split('<br>')
        if len(new_text) == 1:
            new_text = new_text[0].split('<br/>')

        for i, review in enumerate(reviews[start_idx:idx + 1]):
    def search_query(self):
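        # scroll continuation requests are retried on ConnectionError up to
        # TIMES_TO_TRY times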
        @retry(elasticsearch.exceptions.ConnectionError, tries=TIMES_TO_TRY)
        def next_scroll(scroll_id):
            return self.es_conn.scroll(scroll=self.scroll_time,
                                       scroll_id=scroll_id)

        search_args = dict(index=','.join(self.opts.index_prefixes),
                           scroll=self.scroll_time,
                           size=self.opts.scroll_size,
                           terminate_after=self.opts.max_results)

        if self.opts.doc_types:
            search_args['doc_type'] = self.opts.doc_types

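        # a query beginning with '@' names a file whose contents are used as the query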
        if self.opts.query.startswith('@'):
            query_file = self.opts.query[1:]
            if os.path.exists(query_file):
                with open(query_file, 'r') as f:
                    self.opts.query = f.read()
            else:
                print('No such file: %s' % query_file)
                exit(1)
        if self.opts.raw_query:
            try:
                query = json.loads(self.opts.query)
            except ValueError as e:
                print('Invalid JSON syntax in query. %s' % e)
                exit(1)
            search_args['body'] = query
        else:
            query = self.opts.query if not self.opts.tags else '%s AND tags:%s' % (
                self.opts.query, '(%s)' % ' AND '.join(self.opts.tags))
            search_args['q'] = query

        if '_all' not in self.opts.fields:
            search_args['_source_include'] = ','.join(self.opts.fields)
            self.csv_headers.extend(
                [field for field in self.opts.fields if '*' not in field])

        if self.opts.debug_mode:
            print('Using these indices: %s' %
                  ', '.join(self.opts.index_prefixes))
            print('Query[%s]: %s' %
                  (('Query DSL', json.dumps(query)) if self.opts.raw_query else
                   ('Lucene', query)))
            print('Output field(s): %s' % ', '.join(self.opts.fields))

        res = self.es_conn.search(**search_args)

        self.num_results = res['hits']['total']

        print('Found %s results' % self.num_results)
        if self.opts.debug_mode:
            print(json.dumps(res))

        if self.num_results > 0:
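            # truncate any previous output and temporary files before writing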
            open(self.opts.output_file, 'w').close()
            open(self.tmp_file, 'w').close()

            hit_list = []
            total_lines = 0

            widgets = [
                'Run query ',
                progressbar.Bar(left='[', marker='#', right=']'),
                progressbar.FormatLabel(' [%(value)i/%(max)i] ['),
                progressbar.Percentage(),
                progressbar.FormatLabel('] [%(elapsed)s] ['),
                progressbar.ETA(), '] [',
                progressbar.FileTransferSpeed(unit='docs'), ']'
            ]
            bar = progressbar.ProgressBar(widgets=widgets,
                                          maxval=self.num_results).start()

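            # page through the result set with the scroll API, flushing every
            # FLUSH_BUFFER hits to the temporary file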
            while total_lines != self.num_results:
                if res['_scroll_id'] not in self.scroll_ids:
                    self.scroll_ids.append(res['_scroll_id'])

                if not res['hits']['hits']:
                    print(
                        'Scroll[%s] expired (multiple reads?). Saving loaded data.'
                        % res['_scroll_id'])
                    break
                for hit in res['hits']['hits']:
                    total_lines += 1
                    bar.update(total_lines)
                    hit_list.append(hit)
                    if len(hit_list) == FLUSH_BUFFER:
                        self.flush_to_file(hit_list)
                        hit_list = []
                    if self.opts.max_results:
                        if total_lines == self.opts.max_results:
                            self.flush_to_file(hit_list)
                            print('Hit max result limit: %s records' %
                                  self.opts.max_results)
                            return
                res = next_scroll(res['_scroll_id'])
            self.flush_to_file(hit_list)
            bar.finish()