def __load_dir(dir_path):
        """
        Load all the data in "dir_path", and complement the data in the dates that no transaction happened
        :return
            data: (list)
            e.g. [ # include transactions happen in many days
                ['bond_a', 'bond_b', ...], # represent transaction happen in one day
                ['bond_a', 'bond_b', ...],
                ...
            ]
        """
        data = []

        # load the date list
        date_list = os.listdir(dir_path)
        date_list.sort()

        # generate a date dict so that we can check quickly whether any transaction happened on a given date
        date_dict = utils.list_2_dict(date_list)

        # find out the start and end dates of all the transactions
        # (file names look like doc_YYYY-MM-DD.json)
        start_date = date_list[0][len('doc_'):-len('.json')]
        end_date = date_list[-1][len('doc_'):-len('.json')]

        # convert the dates to timestamps
        cur_timestamp = utils.date_2_timestamp(start_date)
        # add one day (86400 s) so that the end date itself is covered by the loop
        end_timestamp = utils.date_2_timestamp(end_date) + 86400

        # traverse all the dates between the start and end date, skipping holidays
        while cur_timestamp < end_timestamp:
            _date = utils.timestamp_2_date(cur_timestamp)
            file_name = f'doc_{_date}.json'

            # check if there is any transaction
            if file_name in date_dict:
                file_path = os.path.join(dir_path, file_name)

                # drop the nan entries (non-string values) and empty strings in the doc
                tmp_doc = [
                    x for x in utils.load_json(file_path)
                    if isinstance(x, str) and x
                ]

                data.append(tmp_doc)

            # skip holidays
            elif date.is_holiday(_date):
                pass

            # no transaction happened on that date
            else:
                data.append([])

            # move to the next day
            cur_timestamp += 86400

        return data
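A minimal usage sketch of the loader (the directory path is a hypothetical
example; files are expected to be named doc_YYYY-MM-DD.json, as parsed above):

    daily_docs = __load_dir('data/bonds_by_dealer/2015/dealer_0')
    print(len(daily_docs))      # one entry per non-holiday day in the date range
    print(daily_docs[0][:5])    # bond ids traded on the first day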
def __generate_date_structure(len_bonds,
                              start_date='2015-01-02',
                              end_date='2015-12-31',
                              with_day_off=True,
                              buy_sell_plan=2):
    """
    return:
        date_matrix: np.array, shape: (date_num, len_bonds + 3) or (date_num, len_bonds * 2 + 3),
                    values are all ones. Therefore, a mask for the input is needed.
                    "+ 3" is because the last 3 tokens are for <start> <end> <day-off>
                    <pad> uses all zeros, thus, it does not need a place in vocabulary
        date_mask: np.array, shape: (date_num, len_bonds + 3) or (date_num, len_bonds * 2 + 3),
                    values are all zeros. The mask would be changed when traversing the transaction history.
        dict_date_2_input_m_index: dict, map the date to the index of date_matrix
    """
    dict_date_2_input_m_index = {}

    # convert timestamp
    start_timestamp = utils.date_2_timestamp(start_date)
    end_timestamp = utils.date_2_timestamp(end_date, True)

    # temporary variables
    day_flags = []  # 1 for a business day, 0 for a day off
    cur_timestamp = start_timestamp

    # generate the dict_date_2_input_m_index and calculate the date_num
    while cur_timestamp <= end_timestamp:
        _date = utils.timestamp_2_date(cur_timestamp)
        cur_timestamp += 86400

        if date.is_holiday(_date):
            if with_day_off:
                day_flags.append(0)
            continue

        dict_date_2_input_m_index[_date] = len(day_flags)
        day_flags.append(1)

    # calculate the shape
    date_num = len(day_flags)
    extra_token_num = 3 if with_day_off else 2
    vocab_len = len_bonds * 2 + extra_token_num if buy_sell_plan == 2 else len_bonds + extra_token_num

    # generate variables; each row of date_matrix enumerates all token indices
    date_matrix = np.ones((date_num, 1)) * np.arange(vocab_len)
    date_mask = np.zeros((date_num, vocab_len))
    # set the <day-off> position (the last token) to 1 for each day off
    date_mask[np.where(np.array(day_flags) == 0)[0], -1] = 1

    return date_matrix, date_mask, dict_date_2_input_m_index
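To make the shapes concrete, here is a self-contained sketch of the same
matrix/mask construction with plain numpy (toy sizes: 2 bonds, buy_sell_plan=2,
a day off on the third day; all values are hypothetical):

    import numpy as np

    day_flags = [1, 1, 0, 1]           # 1 = business day, 0 = day off
    len_bonds, extra_token_num = 2, 3  # <start> <end> <day-off>
    vocab_len = len_bonds * 2 + extra_token_num  # 7

    # each row enumerates all token indices; the mask marks the <day-off> token
    date_matrix = np.ones((len(day_flags), 1)) * np.arange(vocab_len)
    date_mask = np.zeros((len(day_flags), vocab_len))
    date_mask[np.where(np.array(day_flags) == 0)[0], -1] = 1

    print(date_matrix.shape, date_mask.shape)  # (4, 7) (4, 7)
    print(date_mask[2])                        # [0. 0. 0. 0. 0. 0. 1.]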
Example #3
    for i, v in enumerate(data):
        if i % 20 == 0:
            progress = float(i) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')

        bond_id = v['bond_id']
        offering_date = v['offering_date']
        report_dealer_index = str(v['report_dealer_index'])
        contra_party_index = str(v['contra_party_index'])
        date = v['date']
        volume = v['volume']

        if str(offering_date)[0] != '2':
            continue

        offering_timestamp = utils.date_2_timestamp(str(offering_date).split(' ')[0])
        if offering_timestamp >= bound_timestamp:
            continue

        if report_dealer_index not in dict_first_num_dealers and contra_party_index not in dict_first_num_dealers:
            continue

        in_bond = False
        for _dealer in tmp_dealers:
            no_below_0_val = _dealer[3]
            no_below_dict_0 = no_below_0_val[-1]
            # gensim Dictionary.doc2idx returns -1 for tokens missing from the vocabulary
            if no_below_dict_0.doc2idx([bond_id])[0] != -1:
                in_bond = True
                break

        # skip bonds that some dealer's vocabulary already contains
        # (assumed continuation; the snippet breaks off here)
        if in_bond:
            continue
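The doc2idx membership test above comes from gensim; a minimal sketch of the
same check on a toy vocabulary (gensim assumed to be installed):

    from gensim.corpora import Dictionary

    vocab = Dictionary([['bond_a', 'bond_b']])
    print(vocab.doc2idx(['bond_a', 'bond_x']))  # [0, -1]; -1 marks an unknown token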
Example #4

    tmp_data = utils.load_json(os.path.join(dir_path, file_name))

    # group the transaction datetimes by bond id
    d_bonds = {}
    for v in tmp_data:
        bond_id = v[0]
        if bond_id not in d_bonds:
            d_bonds[bond_id] = []
        d_bonds[bond_id].append(v[2])

    l_bonds = []
    for bond_id, date_list in d_bonds.items():
        date_list.sort()
        # strip the ' HH:MM:SS' suffix to keep only the date part
        first_date = date_list[0][:-9]
        last_date = date_list[-1][:-9]

        first_date_timestamp = utils.date_2_timestamp(first_date)
        last_date_timestamp = utils.date_2_timestamp(last_date)
        # inclusive interval in days between the first and the last transaction
        interval = int((last_date_timestamp - first_date_timestamp) / 86400)
        if interval > 0:
            interval += 1

        # count the distinct days on which the bond was traded
        l_dates = list(set(map(lambda x: x[:-9], date_list)))
        count = len(l_dates)
        if interval == 0:
            count = 0

        # share of days in the interval that saw at least one transaction, capped at 1
        count_divide_interval = min(float(count) /
                                    interval, 1.0) if interval > 0 else 0

        l_bonds.append([bond_id, interval, count_divide_interval, count])
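For illustration, the interval/coverage computation above reduces to the
following sketch (made-up dates; the real code derives the timestamps via
utils.date_2_timestamp):

    from datetime import date as dt_date

    trade_days = ['2015-01-02', '2015-01-05', '2015-01-05', '2015-01-09']
    unique_days = set(trade_days)
    # inclusive span: 2015-01-02 .. 2015-01-09 -> 8 days
    interval = (dt_date(2015, 1, 9) - dt_date(2015, 1, 2)).days + 1
    coverage = min(len(unique_days) / interval, 1.0)  # 3 / 8 = 0.375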
Example #5

    def __init__(self, input_days=20, force_refresh=False):
        self.__input_days = input_days
        self.__output_days = input_days // 10
        self.__input_mode = 0

        # get the path of the cache data
        data_subset = load.DATA_SUBSET_FOR_TEMPORAL_INPUT
        subset = os.path.split(data_subset)[1]
        year = load.YEAR_FOR_TEMPORAL_INPUT
        volume_level = load.VOLUME_LEVEL_FOR_TEMPORAL_INPUT
        no_below = load.NO_BELOW_FOR_TEMPORAL_INPUT
        data_index = load.DATA_INDEX_FOR_TEMPORAL_INPUT

        data_all_root_dir = os.path.join(data_subset, year, volume_level)
        all_level = os.path.split(data_all_root_dir)[1]

        self.__data_pkl_path = os.path.join(
            path.PATH_TMP_DIR,
            f'temporal_input_interval_output_for_pretrain_same_volume_{all_level}_{subset}_{year}_{volume_level}_{data_index}_no_below_{no_below}_input_days_{input_days}.pkl'
        )

        if os.path.isfile(self.__data_pkl_path) and not force_refresh:
            self.__train_X, self.__train_y, self.__test_X, self.__test_y, self.dict, self.voc_size = \
                utils.load_pkl(self.__data_pkl_path)

        else:
            print(f'\nStart loading data from {data_all_root_dir} ...')

            train_start_timestamp = utils.date_2_timestamp('2015-01-02')
            train_end_timestamp = utils.date_2_timestamp('2015-10-14', True)

            test_start_timestamp = utils.date_2_timestamp('2015-10-14')
            test_end_timestamp = utils.date_2_timestamp('2015-12-31', True)

            data_all_pkl_path = os.path.join(
                path.PATH_TMP_DIR,
                f'all_doc_list_for_pretrain_{subset}_{year}_{all_level}.pkl')

            if os.path.isfile(data_all_pkl_path):
                train_all_doc_list, test_all_doc_list = utils.load_pkl(
                    data_all_pkl_path)

            else:
                train_all_doc_list = self.__load_dir_all(
                    data_all_root_dir, train_start_timestamp,
                    train_end_timestamp, 'train')
                test_all_doc_list = self.__load_dir_all(
                    data_all_root_dir, test_start_timestamp,
                    test_end_timestamp, 'test')

                # train_all_doc_list = []
                # test_all_doc_list = []
                # for _volume in os.listdir(data_all_root_dir):
                #     sub_all_root_dir = os.path.join(data_all_root_dir, _volume)
                #     sub_train_all_doc_list = self.__load_dir_all(sub_all_root_dir, train_start_timestamp,
                #                                                  train_end_timestamp,
                #                                                  'train')
                #     sub_test_all_doc_list = self.__load_dir_all(sub_all_root_dir, test_start_timestamp,
                #                                                 test_end_timestamp,
                #                                                 'test')
                #
                #     train_all_doc_list += sub_train_all_doc_list
                #     test_all_doc_list += sub_test_all_doc_list

                utils.write_pkl(data_all_pkl_path,
                                [train_all_doc_list, test_all_doc_list])

            print('Finished loading\n\nStart processing data ...')

            # flatten the per-day doc lists into flat lists of docs
            train_all_docs = []
            for v in train_all_doc_list:
                train_all_docs += v
            test_all_docs = []
            for v in test_all_doc_list:
                test_all_docs += v
            del train_all_doc_list
            del test_all_doc_list

            # note: the dictionary is generated with a hard-coded no_below of 150
            # instead of the configured NO_BELOW_FOR_TEMPORAL_INPUT value
            self.dict, self.voc_size = self.__gen_dict(train_all_docs, 150)

            # # generate the dictionary which maps the bond_id to index
            # self.dict, self.voc_size = self.__gen_dict(train_all_doc_list, no_below)

            print(f'voc_size: {self.voc_size}')
            # print(self.voc_size_all)
            print(
                'Finished generating dict\n\nStart converting input output ...')

            # convert doc list to trainable interval summed one-hot vector
            self.__train_X = self.__convert_input(train_all_docs, self.dict,
                                                  self.voc_size,
                                                  'allow_unknown')
            self.__train_y = self.__convert_output(train_all_docs)
            self.__test_X = self.__convert_input(test_all_docs, self.dict,
                                                 self.voc_size,
                                                 'allow_unknown')
            self.__test_y = self.__convert_output(test_all_docs)

            # self.__train_X = 0.
            # self.__test_X = 0.
            # self.__train_y = 0.
            # self.__test_y = 0.
            #
            # for doc_list in train_all_doc_list:
            #     self.__train_X += self.__convert_input(doc_list, self.dict, self.voc_size, 'allow_unknown')
            #     self.__train_y += self.__convert_output(doc_list)
            # self.__train_X /= len(train_all_doc_list)
            #
            # for doc_list in test_all_doc_list:
            #     self.__test_X += self.__convert_input(doc_list, self.dict, self.voc_size, 'allow_unknown')
            #     self.__test_y += self.__convert_output(doc_list)
            # self.__test_X /= len(test_all_doc_list)

            # self.__train_all_X = np.array(
            #     list(map(lambda x: self.__convert_input(x, self.dict_all, self.voc_size_all, 'allow_unknown'),
            #              train_all_doc_list)))
            # self.__train_all_y = np.array(list(map(self.__convert_output, train_all_doc_list)))
            # self.__test_all_X = np.array(
            #     list(map(lambda x: self.__convert_input(x, self.dict_all, self.voc_size_all, 'allow_unknown'),
            #              test_all_doc_list)))
            # self.__test_all_y = np.array(list(map(self.__convert_output, test_all_doc_list)))

            # self.__train_all_X = np.mean(self.__train_all_X, axis=0)
            # self.__train_all_y = np.mean(self.__train_all_y, axis=0)
            # self.__test_all_X = np.mean(self.__test_all_X, axis=0)
            # self.__test_all_y = np.mean(self.__test_all_y, axis=0)

            # # convert doc list to trainable interval summed one-hot vector
            # self.__train_X = self.__convert_input(train_doc_list, self.dict, self.voc_size, 'allow_unknown')
            # self.__train_y = self.__convert_output(train_doc_list)
            # self.__test_X = self.__convert_input(test_doc_list, self.dict, self.voc_size, 'allow_unknown')
            # self.__test_y = self.__convert_output(test_doc_list)

            # self.__train_X = np.vstack([self.__train_X.transpose([2, 0, 1]), self.__train_all_X.transpose([2, 0, 1])])
            # self.__test_X = np.vstack([self.__test_X.transpose([2, 0, 1]), self.__test_all_X.transpose([2, 0, 1])])
            # self.__train_X = self.__train_X.transpose([1, 2, 0])
            # self.__test_X = self.__test_X.transpose([1, 2, 0])

            print('Finished processing')

            # cache data for faster processing next time
            utils.write_pkl(self.__data_pkl_path, [
                self.__train_X, self.__train_y, self.__test_X, self.__test_y,
                self.dict, self.voc_size
            ])

        self.__gen_topics_mask()

        self.__statistic()
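Stripped of the data specifics, the branch above is a plain load-or-rebuild
cache. A sketch of the pattern (using pickle directly, whereas the original
goes through utils.load_pkl / utils.write_pkl):

    import os
    import pickle

    def load_or_build(pkl_path, build_fn):
        # reuse the cached result when present, otherwise build and cache it
        if os.path.isfile(pkl_path):
            with open(pkl_path, 'rb') as f:
                return pickle.load(f)
        result = build_fn()
        with open(pkl_path, 'wb') as f:
            pickle.dump(result, f)
        return result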
Example #6
force_no_direction = True

dir_path = os.path.join(path.PREDICTION_DATE_DIR, dir_name, '2015', f'dealer_{dealer_index}')
name = dir_name.replace('bonds_by_dealer_', '').replace('bonds_by_dealer', '')
name = name_map[name]

save_dir = os.path.join(path.ROOT_DIR, 'runtime', 'test')
if not os.path.isdir(save_dir):
    os.mkdir(save_dir)

path_dict_bond_id_offering_date = os.path.join(path.DATA_ROOT_DIR, 'dict_bond_id_offering_date.json')
dict_bond_id_offering_date = utils.load_json(path_dict_bond_id_offering_date)
remove_bond_list = []

bound_date = '2014-06-01'
bound_timestamp = utils.date_2_timestamp(bound_date)
# collect the bonds whose offering date is malformed or later than the bound date
for bond_id, offering_date in dict_bond_id_offering_date.items():
    # bonds offered in the 1900s are kept as-is
    if offering_date[:2] == '19':
        continue
    # anything not starting with '2' (or '19') is a malformed date
    elif offering_date[:1] != '2':
        remove_bond_list.append(bond_id)
        continue

    # some offering dates use '00' as the day; normalize it to '01'
    offering_date = offering_date.replace('-00', '-01')

    offering_timestamp = utils.date_2_timestamp(offering_date)
    if offering_timestamp >= bound_timestamp:
        remove_bond_list.append(bond_id)

data = __load_dir(dir_path)
dates, voc_size, original_bond_size = __process(data, remove_bond_list, dir_name, no_below, force_no_direction)
# the head of this statement was cut off; the reconstruction below is an
# assumption that follows the field accesses later in the script ('bond_id'
# at x[0] mirrors the record layout used in Example #4)
data = list(
    map(lambda x: {
        'bond_id': x[0],
        'offering_date': x[15],
        'report_dealer_index': int(x[10]),
        'contra_party_index': int(x[11]),
        'date': x[9],
        'volume': float(x[3]),
    }, data))

print('Finished converting\n\nStart traversing data ...')

d_new_bonds = {}
d_dealers = {}

total_volume = 0.
total_transaction_count = len(data)

bound_timestamp = utils.date_2_timestamp('2014-06-01')

length = len(data)
for i, v in enumerate(data):
    if i % 20 == 0:
        progress = float(i) / length * 100.
        print('\rprogress: %.2f%% ' % progress, end='')

    bond_id = v['bond_id']
    offering_date = v['offering_date']
    report_dealer_index = str(v['report_dealer_index'])
    contra_party_index = str(v['contra_party_index'])
    date = v['date']
    volume = v['volume']

    if str(offering_date)[0] != '2':
        continue  # assumed continuation; mirrors the identical check in Example #3