Example #1
    def default_setting(self):
        """
        A default setting for data loading
        :return:
        """
        unknown_as_zero = False  # using original labels, e.g., w.r.t. semi-supervised dataset
        binary_rele = False  # using original labels
        train_presort, validation_presort, test_presort = True, True, True
        train_batch_size, validation_batch_size, test_batch_size = 1, 1, 1
        scale_data, scaler_id, scaler_level = get_default_scaler_setting(
            data_id=self.data_id)

        # more data settings that are rarely changed
        self.data_dict = dict(data_id=self.data_id,
                              dir_data=self.dir_data,
                              min_docs=10,
                              min_rele=1,
                              scale_data=scale_data,
                              scaler_id=scaler_id,
                              scaler_level=scaler_level,
                              train_presort=train_presort,
                              validation_presort=validation_presort,
                              test_presort=test_presort,
                              train_batch_size=train_batch_size,
                              validation_batch_size=validation_batch_size,
                              test_batch_size=test_batch_size,
                              unknown_as_zero=unknown_as_zero,
                              binary_rele=binary_rele)

        data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
        self.data_dict.update(data_meta)

        return self.data_dict
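The pattern above (fixed flags, a scaler triple from get_default_scaler_setting, a data_dict, and finally an update with get_data_meta) can be exercised in isolation. Below is a minimal, self-contained sketch; ToyDataSetting and the two stub helpers are hypothetical stand-ins written for this illustration, not the project's real implementations.

def get_default_scaler_setting(data_id):
    # stub: assume no feature scaling for the toy dataset id
    return False, None, None

def get_data_meta(data_id):
    # stub: made-up meta-information for the toy dataset id
    return dict(max_rele_level=4, num_features=46, fold_num=5)

class ToyDataSetting:
    def __init__(self, data_id, dir_data):
        self.data_id, self.dir_data = data_id, dir_data

    def default_setting(self):
        scale_data, scaler_id, scaler_level = get_default_scaler_setting(data_id=self.data_id)
        self.data_dict = dict(data_id=self.data_id, dir_data=self.dir_data,
                              min_docs=10, min_rele=1,
                              scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level)
        self.data_dict.update(get_data_meta(data_id=self.data_id))  # add meta-information
        return self.data_dict

if __name__ == '__main__':
    print(ToyDataSetting(data_id='ToyLTR', dir_data='/tmp/toy_ltr/').default_setting())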
Example #2
    def default_setting(self):
        """
        A default setting for data loading when performing adversarial LTR
        """
        unknown_as_zero = False
        binary_rele = False  # using the original values
        train_presort, validation_presort, test_presort = True, True, True
        train_rough_batch_size, validation_rough_batch_size, test_rough_batch_size = 1, 100, 100
        scale_data, scaler_id, scaler_level = get_scaler_setting(
            data_id=self.data_id)

        # more data settings that are rarely changed
        self.data_dict = dict(
            data_id=self.data_id,
            dir_data=self.dir_data,
            min_docs=10,
            min_rele=1,
            unknown_as_zero=unknown_as_zero,
            binary_rele=binary_rele,
            train_presort=train_presort,
            validation_presort=validation_presort,
            test_presort=test_presort,
            train_rough_batch_size=train_rough_batch_size,
            validation_rough_batch_size=validation_rough_batch_size,
            test_rough_batch_size=test_rough_batch_size,
            scale_data=scale_data,
            scaler_id=scaler_id,
            scaler_level=scaler_level)

        data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
        if self.debug: data_meta['fold_num'] = 2
        self.data_dict.update(data_meta)

        return self.data_dict
Example #3
    def default_setting(self):
        """
        A default setting for data loading when running lambdaMART
        """
        unknown_as_zero = self.data_id in MSLETOR_SEMI  # since lambdaMART is a supervised method
        binary_rele = False  # using the original values
        train_presort, validation_presort, test_presort = False, False, False
        train_batch_size, validation_batch_size, test_batch_size = 1, 1, 1

        scale_data, scaler_id, scaler_level = get_default_scaler_setting(
            data_id=self.data_id)

        # more data settings that are rarely changed
        self.data_dict = dict(data_id=self.data_id,
                              dir_data=self.dir_data,
                              min_docs=10,
                              min_rele=1,
                              unknown_as_zero=unknown_as_zero,
                              binary_rele=binary_rele,
                              train_presort=train_presort,
                              validation_presort=validation_presort,
                              test_presort=test_presort,
                              train_batch_size=train_batch_size,
                              validation_batch_size=validation_batch_size,
                              test_batch_size=test_batch_size,
                              scale_data=scale_data,
                              scaler_id=scaler_id,
                              scaler_level=scaler_level)

        data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
        self.data_dict.update(data_meta)

        return self.data_dict
Example #4
    def default_setting(self):
        """
        A default setting for data loading when running lambdaMART
        :return:
        """
        unknown_as_zero = self.data_id in MSLETOR_SEMI  # since lambdaMART is a supervised method
        binary_rele = False  # using the original values
        presort = True  # this setting makes no difference for lambdaMART, but it can be altered to reuse buffered data

        scale_data, scaler_id, scaler_level = get_default_scaler_setting(
            data_id=self.data_id)

        # more data settings that are rarely changed
        self.data_dict = dict(data_id=self.data_id,
                              dir_data=self.dir_data,
                              min_docs=10,
                              min_rele=1,
                              sample_rankings_per_q=1,
                              unknown_as_zero=unknown_as_zero,
                              binary_rele=binary_rele,
                              presort=presort,
                              scale_data=scale_data,
                              scaler_id=scaler_id,
                              scaler_level=scaler_level)

        data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
        self.data_dict.update(data_meta)

        return self.data_dict
Example #5
    def default_setting(self):
        """
        A default setting for data loading
        :return:
        """
        unknown_as_zero = self.data_id in MSLETOR_SEMI  # treat unknown labels as zero only for semi-supervised data
        binary_rele = False  # using the original values
        presort = True  # a default setting

        scale_data, scaler_id, scaler_level = get_default_scaler_setting(
            data_id=self.data_id)

        # more data settings that are rarely changed
        self.data_dict = dict(data_id=self.data_id,
                              dir_data=self.dir_data,
                              min_docs=10,
                              min_rele=1,
                              sample_rankings_per_q=1,
                              unknown_as_zero=unknown_as_zero,
                              binary_rele=binary_rele,
                              presort=presort,
                              scale_data=scale_data,
                              scaler_id=scaler_id,
                              scaler_level=scaler_level)

        data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
        self.data_dict.update(data_meta)

        return self.data_dict
Example #6
	def grid_search(self):
		"""
		Iterator of settings for data loading when performing adversarial ltr
		:param debug:
		:param data_id:
		:param dir_data:
		:return:
		"""
		''' common settings without grid-search '''
		binary_rele, unknown_as_zero = False, False
		common_data_dict = dict(data_id=self.data_id, dir_data=self.dir_data, min_docs=10, min_rele=1,
								unknown_as_zero=unknown_as_zero, binary_rele=binary_rele)

		data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
		common_data_dict.update(data_meta)

		''' some settings for grid-search '''
		choice_presort = [True]
		choice_sample_rankings_per_q = [1]  # number of sample rankings per query
		choice_scale_data, choice_scaler_id, choice_scaler_level = get_default_scaler_setting(data_id=self.data_id, grid_search=True)

		for scale_data, scaler_id, scaler_level, presort, sample_rankings_per_q in product(choice_scale_data,
																						   choice_scaler_id,
																						   choice_scaler_level,
																						   choice_presort,
																						   choice_sample_rankings_per_q):

			self.data_dict = dict(presort=presort, sample_rankings_per_q=sample_rankings_per_q,
								  scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level)
			self.data_dict.update(common_data_dict)
			yield self.data_dict
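All of the grid_search variants in this collection follow the same shape: build the lists of candidate values, cross them with itertools.product, and yield one data_dict per combination. The following self-contained sketch reproduces that shape; the choice lists and the data_id are toy placeholders, not the project's defaults.

from itertools import product

def toy_grid_search():
    common_data_dict = dict(data_id='ToyLTR', dir_data='/tmp/toy_ltr/', min_docs=10, min_rele=1)
    choice_scaler_id = ['StandardScaler', 'RobustScaler']
    choice_presort = [True, False]
    for scaler_id, presort in product(choice_scaler_id, choice_presort):
        data_dict = dict(scaler_id=scaler_id, presort=presort)
        data_dict.update(common_data_dict)
        yield data_dict

for setting in toy_grid_search():
    print(setting)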
Example #7
    def grid_search(self):
        if self.use_json:
            scaler_id = self.json_dict['scaler_id']
            choice_min_docs = self.json_dict['min_docs']
            choice_min_rele = self.json_dict['min_rele']
            choice_binary_rele = self.json_dict['binary_rele']
            choice_unknown_as_zero = self.json_dict['unknown_as_zero']
            choice_tr_batch_size = self.json_dict[
                'tr_batch_size']  # train_rough_batch_size
            # hard-coding for rarely changed settings
            base_data_dict = dict(data_id=self.data_id,
                                  dir_data=self.json_dict["dir_data"],
                                  train_presort=True,
                                  test_presort=True,
                                  validation_presort=True,
                                  validation_rough_batch_size=100,
                                  test_rough_batch_size=100)
        else:
            scaler_id = None
            choice_min_docs = [10]
            choice_min_rele = [1]
            choice_binary_rele = [False]
            choice_unknown_as_zero = [False]
            choice_tr_batch_size = [100]
            base_data_dict = dict(data_id=self.data_id,
                                  dir_data=self.dir_data,
                                  train_presort=True,
                                  test_presort=True,
                                  validation_presort=True,
                                  validation_rough_batch_size=100,
                                  test_rough_batch_size=100)

        data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
        if self.debug: data_meta['fold_num'] = 1
        base_data_dict.update(data_meta)

        scale_data, scaler_id, scaler_level = get_scaler_setting(
            data_id=self.data_id, scaler_id=scaler_id)

        for min_docs, min_rele, tr_batch_size in product(
                choice_min_docs, choice_min_rele, choice_tr_batch_size):
            threshold_dict = dict(min_docs=min_docs,
                                  min_rele=min_rele,
                                  train_rough_batch_size=tr_batch_size)

            for binary_rele, unknown_as_zero in product(
                    choice_binary_rele, choice_unknown_as_zero):
                custom_dict = dict(binary_rele=binary_rele,
                                   unknown_as_zero=unknown_as_zero)
                scale_dict = dict(scale_data=scale_data,
                                  scaler_id=scaler_id,
                                  scaler_level=scaler_level)

                self.data_dict = dict()
                self.data_dict.update(base_data_dict)
                self.data_dict.update(threshold_dict)
                self.data_dict.update(custom_dict)
                self.data_dict.update(scale_dict)
                yield self.data_dict
Example #8
    def grid_search(self):
        """
        Iterator of settings for data loading when performing adversarial LTR
        """
        if self.use_json:
            scaler_id = self.json_dict['scaler_id']
            choice_min_docs = self.json_dict['min_docs']
            choice_min_rele = self.json_dict['min_rele']
            choice_binary_rele = self.json_dict['binary_rele']
            choice_unknown_as_zero = self.json_dict['unknown_as_zero']
            base_data_dict = dict(data_id=self.data_id,
                                  dir_data=self.json_dict["dir_data"],
                                  train_presort=True,
                                  test_presort=True,
                                  validation_presort=True,
                                  train_rough_batch_size=1,
                                  validation_rough_batch_size=100,
                                  test_rough_batch_size=100)
        else:
            scaler_id = None
            choice_min_docs = [10]
            choice_min_rele = [1]
            choice_binary_rele = [False]
            choice_unknown_as_zero = [False]
            base_data_dict = dict(data_id=self.data_id,
                                  dir_data=self.dir_data,
                                  train_presort=True,
                                  test_presort=True,
                                  validation_presort=True,
                                  train_rough_batch_size=1,
                                  validation_rough_batch_size=100,
                                  test_rough_batch_size=100)

        data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
        base_data_dict.update(data_meta)

        scale_data, scaler_id, scaler_level = get_scaler_setting(
            data_id=self.data_id, scaler_id=scaler_id)

        for min_docs, min_rele in product(choice_min_docs, choice_min_rele):
            threshold_dict = dict(min_docs=min_docs, min_rele=min_rele)

            for binary_rele, unknown_as_zero in product(
                    choice_binary_rele, choice_unknown_as_zero):
                custom_dict = dict(binary_rele=binary_rele,
                                   unknown_as_zero=unknown_as_zero)
                scale_dict = dict(scale_data=scale_data,
                                  scaler_id=scaler_id,
                                  scaler_level=scaler_level)

                self.data_dict = dict()
                self.data_dict.update(base_data_dict)
                self.data_dict.update(threshold_dict)
                self.data_dict.update(custom_dict)
                self.data_dict.update(scale_dict)
                yield self.data_dict
Example #9
    def grid_search(self):
        if self.data_json is not None:  # using json file
            choice_presort = self.json_dict['presort']
            choice_min_docs = self.json_dict['min_docs']
            choice_min_rele = self.json_dict['min_rele']
            choice_binary_rele = self.json_dict['binary_rele']
            choice_unknown_as_zero = self.json_dict['unknown_as_zero']
            choice_sample_rankings_per_q = self.json_dict[
                'sample_rankings_per_q']
            base_data_dict = dict(data_id=self.data_id,
                                  dir_data=self.json_dict["dir_data"])
        else:
            choice_min_docs = [10]
            choice_min_rele = [1]
            choice_presort = [True]
            choice_binary_rele = [False]
            choice_unknown_as_zero = [True]
            choice_sample_rankings_per_q = [1]  # number of sample rankings per query

            base_data_dict = dict(data_id=self.data_id, dir_data=self.dir_data)

        data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
        base_data_dict.update(data_meta)

        choice_scale_data, choice_scaler_id, choice_scaler_level = get_default_scaler_setting(
            data_id=self.data_id, grid_search=True)

        for min_docs, min_rele, sample_rankings_per_q in product(
                choice_min_docs, choice_min_rele,
                choice_sample_rankings_per_q):
            threshold_dict = dict(min_docs=min_docs,
                                  min_rele=min_rele,
                                  sample_rankings_per_q=sample_rankings_per_q)

            for binary_rele, unknown_as_zero, presort in product(
                    choice_binary_rele, choice_unknown_as_zero,
                    choice_presort):
                custom_dict = dict(binary_rele=binary_rele,
                                   unknown_as_zero=unknown_as_zero,
                                   presort=presort)

                for scale_data, scaler_id, scaler_level in product(
                        choice_scale_data, choice_scaler_id,
                        choice_scaler_level):
                    scale_dict = dict(scale_data=scale_data,
                                      scaler_id=scaler_id,
                                      scaler_level=scaler_level)

                    self.data_dict = dict()
                    self.data_dict.update(base_data_dict)
                    self.data_dict.update(threshold_dict)
                    self.data_dict.update(custom_dict)
                    self.data_dict.update(scale_dict)
                    yield self.data_dict
Example #10
    def set_model_setting(self,
                          model_id=None,
                          data_id=None,
                          dir_json=None,
                          debug=False):
        """
        Initialize the parameter class for a specified model
        :param model_id:
        :param data_id:
        :param dir_json:
        :param debug:
        :return:
        """
        if model_id in [
                'RankMSE', 'RankNet', 'ListNet', 'ListMLE', 'RankCosine'
        ]:
            # the 1st type with model_id, where ModelParameter is sufficient
            self.model_parameter = ModelParameter(model_id=model_id)
        elif model_id in [
                'LambdaRank', 'ApproxNDCG', 'DirectOpt', 'MarginLambdaLoss'
        ]:
            # the 2nd type, where the type of the relevance labels is required
            data_meta = get_data_meta(data_id=data_id)  # add meta-information
            if data_meta['multi_level_rele']:
                if dir_json is not None:
                    para_json = dir_json + model_id + "Parameter.json"
                    self.model_parameter = globals()[model_id + "Parameter"](
                        para_json=para_json, std_rele_is_permutation=False)
                else:
                    self.model_parameter = globals()[model_id + "Parameter"](
                        debug=debug, std_rele_is_permutation=False)
            else:  # the case like MSLETOR_LIST
                if dir_json is not None:
                    para_json = dir_json + model_id + "Parameter.json"
                    self.model_parameter = globals()[model_id + "Parameter"](
                        para_json=para_json, std_rele_is_permutation=True)
                else:
                    self.model_parameter = globals()[model_id + "Parameter"](
                        debug=debug, std_rele_is_permutation=True)
        else:
            # the 3rd type, where debug-mode enables a quick test
            if dir_json is not None:
                para_json = dir_json + model_id + "Parameter.json"
                self.model_parameter = globals()[model_id + "Parameter"](
                    para_json=para_json)
            else:
                self.model_parameter = globals()[model_id + "Parameter"](debug=debug)
Example #11
	def load_setting(self):
		if self.use_json:
			choice_min_docs = self.json_dict['min_docs']
			choice_min_rele = self.json_dict['min_rele']
			choice_binary_rele = self.json_dict['binary_rele']
			choice_unknown_as_zero = self.json_dict['unknown_as_zero']
			choice_train_presort = self.json_dict['train_presort']
			choice_train_batch_size = self.json_dict['train_batch_size']
			# hard-coding for rarely changed settings
			base_data_dict = dict(data_id=self.data_id, dir_data=self.json_dict["dir_data"], test_presort=True,
								  validation_presort=True, validation_batch_size=1, test_batch_size=1)
		else:
			choice_min_docs = [10]
			choice_min_rele = [1]
			choice_binary_rele = [False]
			choice_unknown_as_zero = [False]
			choice_train_presort = [True]
			choice_train_batch_size = [1]  # training batch size

			base_data_dict = dict(data_id=self.data_id, dir_data=self.dir_data, test_presort=True,
								  validation_presort=True, validation_batch_size=1, test_batch_size=1)

		data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
		base_data_dict.update(data_meta)

		choice_scale_data, choice_scaler_id, choice_scaler_level = get_default_scaler_setting(data_id=self.data_id,
																							  grid_search=True)

		for min_docs, min_rele, train_batch_size in product(choice_min_docs, choice_min_rele, choice_train_batch_size):
			threshold_dict = dict(min_docs=min_docs, min_rele=min_rele, train_batch_size=train_batch_size)

			for binary_rele, unknown_as_zero, train_presort in product(choice_binary_rele, choice_unknown_as_zero,
																	   choice_train_presort):
				custom_dict = dict(binary_rele=binary_rele, unknown_as_zero=unknown_as_zero,
								   train_presort=train_presort)

				for scale_data, scaler_id, scaler_level in product(choice_scale_data, choice_scaler_id,
																   choice_scaler_level):
					scale_dict = dict(scale_data=scale_data, scaler_id=scaler_id, scaler_level=scaler_level)

					self.data_dict = dict()
					self.data_dict.update(base_data_dict)
					self.data_dict.update(threshold_dict)
					self.data_dict.update(custom_dict)
					self.data_dict.update(scale_dict)
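					# note: returning inside the innermost loop produces only the first combination of the grid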
					return self.data_dict
Example #12
def check_dataset_statistics(data_id, dir_data, buffer=False):
    '''
    Get the basic statistics on the specified dataset
    '''
    if data_id in YAHOO_LTR:
        data_prefix = dir_data + data_id.lower() + '.'
        file_train, file_vali, file_test = data_prefix + 'train.txt', data_prefix + 'valid.txt', data_prefix + 'test.txt'

    elif data_id in ISTELLA_LTR:
        data_prefix = dir_data + data_id + '/'
        if data_id == 'Istella_X' or data_id == 'Istella_S':
            file_train, file_vali, file_test = data_prefix + 'train.txt', data_prefix + 'vali.txt', data_prefix + 'test.txt'
        else:
            file_train, file_test = data_prefix + 'train.txt', data_prefix + 'test.txt'
    else:
        fold_k = 1
        fold_k_dir = dir_data + 'Fold' + str(fold_k) + '/'
        file_train, file_vali, file_test = fold_k_dir + 'train.txt', fold_k_dir + 'vali.txt', fold_k_dir + 'test.txt'

    # common
    if data_id == 'Istella':
        train_dataset = LTRDataset(split_type=SPLIT_TYPE.Train,
                                   file=file_train,
                                   data_id=data_id,
                                   shuffle=False,
                                   buffer=buffer)
        test_dataset = LTRDataset(split_type=SPLIT_TYPE.Test,
                                  file=file_test,
                                  data_id=data_id,
                                  shuffle=False,
                                  buffer=buffer)

        num_queries = len(train_dataset) + len(test_dataset)
        print('Dataset:\t', data_id)
        print('Total queries:\t', num_queries)
        print('\tTrain:', len(train_dataset), 'Test:', len(test_dataset))

        num_docs = get_doc_num(train_dataset) + get_doc_num(test_dataset)
        print('Total docs:\t', num_docs)

        min_doc, max_doc, sum_rele = get_min_max_docs(
            train_dataset=train_dataset,
            vali_dataset=None,
            test_dataset=test_dataset)
        data_meta = get_data_meta(data_id=data_id)
        max_rele_label = data_meta['max_rele_level']
        sum_bin_cnts = get_label_distribution(train_dataset=train_dataset,
                                              test_dataset=test_dataset,
                                              semi_supervised=False,
                                              max_lavel=max_rele_label)
    else:
        train_dataset = LTRDataset(split_type=SPLIT_TYPE.Train,
                                   file=file_train,
                                   data_id=data_id,
                                   shuffle=False,
                                   buffer=buffer)
        vali_dataset = LTRDataset(split_type=SPLIT_TYPE.Validation,
                                  file=file_vali,
                                  data_id=data_id,
                                  shuffle=False,
                                  buffer=buffer)
        test_dataset = LTRDataset(split_type=SPLIT_TYPE.Test,
                                  file=file_test,
                                  data_id=data_id,
                                  shuffle=False,
                                  buffer=buffer)

        num_queries = len(train_dataset) + len(vali_dataset) + len(test_dataset)
        print('Dataset:\t', data_id)
        print('Total queries:\t', num_queries)
        print('\tTrain:', len(train_dataset), 'Vali:', len(vali_dataset),
              'Test:', len(test_dataset))

        num_docs = get_doc_num(train_dataset) + get_doc_num(
            vali_dataset) + get_doc_num(test_dataset)
        print('Total docs:\t', num_docs)

        if data_id in MSLETOR_SEMI:
            min_doc, max_doc, sum_rele, sum_unknown = \
                get_min_max_docs(train_dataset=train_dataset, vali_dataset=vali_dataset, test_dataset=test_dataset, semi_supervised=True)
        else:
            min_doc, max_doc, sum_rele = get_min_max_docs(
                train_dataset=train_dataset,
                vali_dataset=vali_dataset,
                test_dataset=test_dataset)
            data_meta = get_data_meta(data_id=data_id)
            max_rele_label = data_meta['max_rele_level']
            sum_bin_cnts = get_label_distribution(train_dataset=train_dataset,
                                                  vali_dataset=vali_dataset,
                                                  test_dataset=test_dataset,
                                                  semi_supervised=False,
                                                  max_lavel=max_rele_label)

    print('min, max documents per query', min_doc, max_doc)
    print('total relevant documents', sum_rele)
    print('avg rele documents per query', sum_rele * 1.0 / num_queries)
    print('avg documents per query', num_docs * 1.0 / num_queries)
    if data_id in MSLETOR_SEMI:
        print('total unlabeled documents', sum_unknown)
    else:  # the label distribution is only computed for fully labeled datasets
        print('label distribution: ', sum_bin_cnts)
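For orientation, the sketch below reproduces on toy data the kind of per-query aggregation that get_min_max_docs and get_label_distribution perform (minimum/maximum documents per query, relevant-document count, label histogram). The arrays and the helper are illustrative only, not the project's code.

import numpy as np

toy_labels_per_query = [np.array([0, 1, 0, 2]),      # 4 docs, 2 relevant
                        np.array([0, 0, 0]),         # 3 docs, 0 relevant
                        np.array([3, 1, 0, 0, 2])]   # 5 docs, 3 relevant

def toy_statistics(labels_per_query, max_rele_label=4):
    doc_counts = [len(labels) for labels in labels_per_query]
    all_labels = np.concatenate(labels_per_query)
    sum_bin_cnts = np.bincount(all_labels, minlength=max_rele_label + 1)
    return min(doc_counts), max(doc_counts), int((all_labels > 0).sum()), sum_bin_cnts

min_doc, max_doc, sum_rele, sum_bin_cnts = toy_statistics(toy_labels_per_query)
print('min, max documents per query', min_doc, max_doc)
print('total relevant documents', sum_rele)
print('label distribution: ', sum_bin_cnts)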
Example #13
    def grid_search(self):
        """
        Iterator of settings for data loading when performing adversarial LTR
        """
        if self.ad_data_json is not None:  # using json file
            choice_min_docs = self.json_dict['min_docs']
            choice_min_rele = self.json_dict['min_rele']
            choice_binary_rele = self.json_dict['binary_rele']
            choice_unknown_as_zero = self.json_dict['unknown_as_zero']
            choice_train_presort = self.json_dict['train_presort']
            choice_train_batch_size = self.json_dict['train_batch_size']
            base_data_dict = dict(data_id=self.data_id,
                                  dir_data=self.json_dict["dir_data"],
                                  test_presort=True,
                                  validation_presort=True,
                                  validation_batch_size=1,
                                  test_batch_size=1)
        else:
            choice_min_docs = [10]
            choice_min_rele = [1]
            choice_binary_rele = [False]
            choice_unknown_as_zero = [False]
            choice_train_presort = [True]
            choice_train_batch_size = [1]  # training batch size

            base_data_dict = dict(data_id=self.data_id,
                                  dir_data=self.dir_data,
                                  test_presort=True,
                                  validation_presort=True,
                                  validation_batch_size=1,
                                  test_batch_size=1)

        data_meta = get_data_meta(data_id=self.data_id)  # add meta-information
        base_data_dict.update(data_meta)

        choice_scale_data, choice_scaler_id, choice_scaler_level = get_default_scaler_setting(
            data_id=self.data_id, grid_search=True)

        for min_docs, min_rele, train_batch_size in product(
                choice_min_docs, choice_min_rele, choice_train_batch_size):
            threshold_dict = dict(min_docs=min_docs,
                                  min_rele=min_rele,
                                  train_batch_size=train_batch_size)

            for binary_rele, unknown_as_zero, train_presort in product(
                    choice_binary_rele, choice_unknown_as_zero,
                    choice_train_presort):
                custom_dict = dict(binary_rele=binary_rele,
                                   unknown_as_zero=unknown_as_zero,
                                   train_presort=train_presort)

                for scale_data, scaler_id, scaler_level in product(
                        choice_scale_data, choice_scaler_id,
                        choice_scaler_level):
                    scale_dict = dict(scale_data=scale_data,
                                      scaler_id=scaler_id,
                                      scaler_level=scaler_level)

                    self.data_dict = dict()
                    self.data_dict.update(base_data_dict)
                    self.data_dict.update(threshold_dict)
                    self.data_dict.update(custom_dict)
                    self.data_dict.update(scale_dict)
                    yield self.data_dict
Example #14
    def default_setting(self):
        """
        A default setting for data loading
        :return:
        """
        if self.use_json:
            scaler_id = self.json_dict['scaler_id']
            min_docs = self.json_dict['min_docs'][0]
            min_rele = self.json_dict['min_rele'][0]
            binary_rele = self.json_dict['binary_rele'][0]
            unknown_as_zero = self.json_dict['unknown_as_zero'][0]
            tr_batch_size = self.json_dict['tr_batch_size'][
                0]  # train_rough_batch_size

            scale_data, scaler_id, scaler_level = get_scaler_setting(
                data_id=self.data_id, scaler_id=scaler_id)

            # hard-coding for rarely changed settings
            self.data_dict = dict(data_id=self.data_id,
                                  dir_data=self.json_dict["dir_data"],
                                  train_presort=True,
                                  test_presort=True,
                                  validation_presort=True,
                                  validation_rough_batch_size=100,
                                  test_rough_batch_size=100,
                                  min_docs=min_docs,
                                  min_rele=min_rele,
                                  train_rough_batch_size=tr_batch_size,
                                  scale_data=scale_data,
                                  scaler_id=scaler_id,
                                  scaler_level=scaler_level,
                                  unknown_as_zero=unknown_as_zero,
                                  binary_rele=binary_rele)
        else:
            unknown_as_zero = False  # using original labels, e.g., w.r.t. semi-supervised dataset
            binary_rele = False  # using original labels
            train_presort, validation_presort, test_presort = True, True, True
            #train_rough_batch_size, validation_rough_batch_size, test_rough_batch_size = 1, 100, 100
            train_rough_batch_size, validation_rough_batch_size, test_rough_batch_size = 100, 100, 100
            scale_data, scaler_id, scaler_level = get_scaler_setting(
                data_id=self.data_id)

            # more data settings that are rarely changed
            self.data_dict = dict(
                data_id=self.data_id,
                dir_data=self.dir_data,
                min_docs=10,
                min_rele=1,
                scale_data=scale_data,
                scaler_id=scaler_id,
                scaler_level=scaler_level,
                train_presort=train_presort,
                validation_presort=validation_presort,
                test_presort=test_presort,
                train_rough_batch_size=train_rough_batch_size,
                validation_rough_batch_size=validation_rough_batch_size,
                test_rough_batch_size=test_rough_batch_size,
                unknown_as_zero=unknown_as_zero,
                binary_rele=binary_rele)

        data_meta = get_data_meta(data_id=self.data_id)  # add meta-information

        if self.debug: data_meta['fold_num'] = 2
        self.data_dict.update(data_meta)

        return self.data_dict
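In the JSON-driven branch above, entries such as min_docs, min_rele, binary_rele, unknown_as_zero and tr_batch_size are lists of candidate values, and default_setting() takes the first element of each, while dir_data and scaler_id are plain values. The following self-contained sketch illustrates that convention; the file contents, keys, and values are illustrative assumptions, not the project's actual configuration.

import json

json_text = '''{
    "dir_data": "/tmp/toy_ltr/",
    "scaler_id": "StandardScaler",
    "min_docs": [10], "min_rele": [1],
    "binary_rele": [false], "unknown_as_zero": [false],
    "tr_batch_size": [100]
}'''
json_dict = json.loads(json_text)

min_docs = json_dict['min_docs'][0]
tr_batch_size = json_dict['tr_batch_size'][0]  # train_rough_batch_size
print(min_docs, tr_batch_size, json_dict['scaler_id'])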