def __init__(
    self,
    inputs: datapack.DataPack,
    num_neg: int = 1,
    num_dup: int = 4,
    batch_size: int = 32,
    stage: str = 'train',
    shuffle: bool = True
):
    """
    Build a pair-wise generator on top of a data pack.

    :param inputs: a :class:`DataPack` whose ``left``, ``right`` and
        ``relation`` members feed the generator.
    :param num_neg: how many negative samples accompany each positive
        sample.
    :param num_dup: how many times each positive sample is duplicated.
        Negatives usually far outnumber positives, so duplication is used
        to balance the two.
    :param batch_size: number of instances per batch.
    :param stage: current phase, either 'train' or 'test'.
    :param shuffle: whether instances are shuffled when a batch is built.
    """
    # Sampling settings and the task are stored before the relation is
    # transformed, exactly as the transformation step may rely on them.
    self._num_neg = num_neg
    self._num_dup = num_dup
    self._left = inputs.left
    self._right = inputs.right
    self._task = tasks.Ranking()
    self._relation = self.transform_relation(inputs.relation)

    # One "pair" is a positive row together with its `num_neg` negatives.
    group_size = self._num_neg + 1
    num_pairs = len(self._relation) // group_size
    super().__init__(batch_size, num_pairs, stage, shuffle)
def guess_and_fill_missing_params(self, verbose=1):
    """
    Fill every still-unset entry of :attr:`params` with a guessed default.

    The values are guesses and may be wrong for a given data set. For
    example, the task defaults to `Ranking`; if the data pack was prepared
    for classification and the task is not manually set to
    `Classification`, the model output shape will not match the data.

    :param verbose: Verbosity, forwarded to each ``set_default`` call.
    """
    params = self._params

    # Defaults shared by every model.
    params.get('name').set_default(self.__class__.__name__, verbose)
    params.get('task').set_default(tasks.Ranking(), verbose)
    params.get('input_shapes').set_default([(30, ), (30, )], verbose)
    params.get('optimizer').set_default('adam', verbose)

    # Embedding defaults, only when the model declares the embedding flag.
    if 'with_embedding' in params:
        params.get('embedding_input_dim').set_default(300, verbose)
        params.get('embedding_output_dim').set_default(300, verbose)
        params.get('embedding_trainable').set_default(True, verbose)

    # MLP defaults, only when the model declares the MLP flag.
    if 'with_multi_layer_perceptron' in params:
        params.get('mlp_num_layers').set_default(3, verbose)
        params.get('mlp_num_units').set_default(64, verbose)
        params.get('mlp_num_fan_out').set_default(32, verbose)
        params.get('mlp_activation_func').set_default('relu', verbose)
def guess_and_fill_missing_params(self, verbose=1):
    """
    Guess defaults for the hyper parameters still missing in :attr:`params`.

    Because the values are guessed, they can be wrong. The task, for
    instance, defaults to `Ranking`; a data pack prepared for
    classification needs the task set to `Classification` manually,
    otherwise the model output shape and the data will not match.

    :param verbose: Verbosity, forwarded to each ``set_default`` call.
    """
    params = self._params
    params.get('task').set_default(tasks.Ranking(), verbose)

    # Embedding sizes are only guessed for models that declare the flag.
    if 'with_embedding' in params:
        for dim_key in ('embedding_input_dim', 'embedding_output_dim'):
            params.get(dim_key).set_default(300, verbose)
def predict(config: Config,
            model: engine.BaseModel,
            query: str,
            nlargest: int = 5
            ) -> typing.List[typing.Tuple[str, float, str]]:
    """
    Score every stored document against ``query`` and return the top hits.

    The document corpus is loaded from
    ``<preprocess_dir>/<net_name>_documents.dill``, paired with the query,
    run through the saved preprocessor and scored by the model backend.

    :param config: configuration providing ``net_name`` and
        ``paths['preprocess_dir']``.
    :param model: model whose ``_backend.predict_generator`` produces the
        scores.
    :param query: the question text matched against every document.
    :param nlargest: maximum number of results to return.
    :return: list of ``(document id, score, document text)`` tuples for the
        highest-scoring documents, best first.
    """
    logger.info('Running predictions...')

    net_name = config.net_name
    pp_dir = config.paths['preprocess_dir']
    corpus_d_path = os.path.join(pp_dir, net_name + "_documents.dill")

    # Use a context manager so the corpus file handle is closed promptly,
    # even if deserialization fails.
    # NOTE(review): dill.load can execute arbitrary code while unpickling;
    # this assumes the preprocess directory is trusted - confirm.
    with open(corpus_d_path, 'rb') as corpus_file:
        docs = dill.load(corpus_file)

    doc_lookup = list(docs.keys())
    num_docs = len(doc_lookup)

    docs_df = pd.DataFrame.from_dict(docs, orient='index',
                                     columns=['Document'])
    docs_df['QID'] = 'Q'

    task = tasks.Ranking()
    pre = engine.load_preprocessor(dirpath=pp_dir, name=net_name)

    # Pair the same query with every document row.
    query_df = docs_df.copy()
    query_df['Question'] = query
    inputs = pre.transform(list(query_df.itertuples()), stage='predict')

    # shuffle=False keeps prediction rows aligned with `doc_lookup`.
    gen_predict = generators.PointGenerator(inputs, task,
                                            shuffle=False, stage='test')
    predictions = model._backend.predict_generator(gen_predict, verbose=1)

    # Pick the indices of the highest scores; `take` maps index -> score.
    idx = heapq.nlargest(nlargest, range(num_docs),
                         predictions.ravel().take)

    results = []
    for candidate in idx:
        did = doc_lookup[candidate]
        d = docs[did]
        score = predictions[candidate][0]
        results.append((did, score, d))
    return results
# Each entry is (model class, optional dict of parameter overrides).
model_setups = [
    (models.NaiveModel, None),
    (models.DenseBaselineModel, None),
    (models.DSSMModel, None),
]


@pytest.fixture(scope='module', params=[1, 32])
def num_samples(request):
    """Yield each sample count the tests are parametrized over."""
    return request.param


@pytest.fixture(scope='module', params=[
    tasks.Classification(num_classes=2),
    tasks.Classification(num_classes=16),
    tasks.Ranking(),
])
def task(request):
    """Yield each task instance the tests are parametrized over."""
    return request.param


@pytest.fixture(params=model_setups)
def raw_model(request):
    """Instantiate a model from `model_setups` and apply its overrides."""
    model_class, custom_kwargs = request.param
    model = model_class()
    # A missing or empty override mapping leaves the model untouched.
    for key, val in (custom_kwargs or {}).items():
        model.params[key] = val
    return model
def _guess_task(train_pack):
    """
    Guess the task from the dtype of the ``label`` column.

    :param train_pack: object whose ``relation['label']`` column holds the
        training labels.
    :return: ``tasks.Ranking()`` for numeric labels;
        ``tasks.Classification(n)`` when the dtype matches the ``list``
        check, with ``n`` the largest ``len`` over all labels; ``None``
        when neither check matches.
    """
    label_dtype = train_pack.relation['label'].dtype

    if np.issubdtype(label_dtype, np.number):
        return tasks.Ranking()

    if np.issubdtype(label_dtype, list):
        # The class count is taken from the longest label.
        label_lengths = train_pack.relation['label'].apply(len)
        return tasks.Classification(int(label_lengths.max()))

    return None