Example 1
    def __init__(self,
                 inputs: datapack.DataPack,
                 num_neg: int = 1,
                 num_dup: int = 4,
                 batch_size: int = 32,
                 stage: str = 'train',
                 shuffle: bool = True):
        """Construct the pair generator.

        :param inputs: the output generated by :class:`DataPack`.
        :param num_neg: the number of negative samples associated with each
            positive sample.
        :param num_dup: the number of duplicates for each positive sample.
            This variable is used to balance the samples: since there are
            always many more negative samples than positive samples,
            ``num_dup`` duplicates the positive samples.
        :param batch_size: number of instances in a batch.
        :param stage: the current phase; either 'train' or 'test'.
        :param shuffle: whether to shuffle the instances while generating a
            batch.
        """
        self._num_neg = num_neg
        self._num_dup = num_dup
        self._left = inputs.left
        self._right = inputs.right
        self._task = tasks.Ranking()
        self._relation = self.transform_relation(inputs.relation)
        num_pairs = len(self._relation) // (self._num_neg + 1)
        super().__init__(batch_size, num_pairs, stage, shuffle)
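
The constructor above only stores the pairing hyper-parameters and the three tables of the DataPack. A minimal usage sketch might look like the following; the `PairGenerator` class name, the `matchzoo` import path and the prepared `train_pack` variable are assumptions for illustration, not taken from the example itself.

# Hypothetical usage sketch: build a pair generator from a prepared DataPack.
# The class name, import path and `train_pack` are assumed for illustration.
from matchzoo import generators

pair_gen = generators.PairGenerator(
    inputs=train_pack,   # a DataPack with `left`, `right` and `relation`
    num_neg=1,           # one negative sample per positive sample
    num_dup=4,           # duplicate positives to balance the classes
    batch_size=32,
    stage='train',
    shuffle=True)

x, y = pair_gen[0]       # assumed to be indexable batch by batch
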
Example 2
    def guess_and_fill_missing_params(self, verbose=1):
        """
        Guess and fill missing parameters in :attr:`params`.

        Use this method to automatically fill in missing hyper-parameters.
        This involves some guessing, so the parameters it fills could be
        wrong. For example, the default task is `Ranking`; if it is not set
        to `Classification` manually for data packs prepared for
        classification, the shapes of the model output and the data will
        not match.

        :param verbose: Verbosity.
        """
        self._params.get('name').set_default(self.__class__.__name__, verbose)
        self._params.get('task').set_default(tasks.Ranking(), verbose)
        self._params.get('input_shapes').set_default([(30, ), (30, )], verbose)
        self._params.get('optimizer').set_default('adam', verbose)
        if 'with_embedding' in self._params:
            self._params.get('embedding_input_dim').set_default(300, verbose)
            self._params.get('embedding_output_dim').set_default(300, verbose)
            self._params.get('embedding_trainable').set_default(True, verbose)
        if 'with_multi_layer_perceptron' in self._params:
            self._params.get('mlp_num_layers').set_default(3, verbose)
            self._params.get('mlp_num_units').set_default(64, verbose)
            self._params.get('mlp_num_fan_out').set_default(32, verbose)
            self._params.get('mlp_activation_func').set_default(
                'relu', verbose)
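
Because only missing parameters receive defaults, this method is typically called after any explicit settings and right before building the model. A minimal sketch, assuming the `DenseBaselineModel` class used in the test fixtures below and a dict-style `params` table:

# Hypothetical usage sketch; the model class and the build step are assumptions.
model = models.DenseBaselineModel()
model.params['task'] = tasks.Classification(num_classes=3)  # set explicitly
model.guess_and_fill_missing_params(verbose=1)              # fill in the rest
model.build()
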
Example 3
    def guess_and_fill_missing_params(self, verbose=1):
        """
        Guess and fill missing parameters in :attr:`params`.

        Use this method to automatically fill in other missing
        hyper-parameters. This involves some guessing, so the parameters it
        fills could be wrong. For example, the default task is `Ranking`; if
        it is not set to `Classification` manually for data packs prepared
        for classification, the shapes of the model output and the data will
        not match.

        :param verbose: Verbosity.
        """
        self._params.get('task').set_default(tasks.Ranking(), verbose)
        if 'with_embedding' in self._params:
            self._params.get('embedding_input_dim').set_default(300, verbose)
            self._params.get('embedding_output_dim').set_default(300, verbose)
Example 4
def predict(config: Config,
            model: engine.BaseModel,
            query: str,
            nlargest: int = 5) -> typing.List[typing.Tuple[str, float, str]]:
    logger.info('Running predictions...')

    net_name = config.net_name
    pp_dir = config.paths['preprocess_dir']
    corpus_d_path = os.path.join(pp_dir, net_name + "_documents.dill")

    with open(corpus_d_path, 'rb') as corpus_file:
        docs = dill.load(corpus_file)
    doc_lookup = list(docs.keys())
    num_docs = len(doc_lookup)
    docs_df = pd.DataFrame.from_dict(docs,
                                     orient='index',
                                     columns=['Document'])
    docs_df['QID'] = 'Q'
    task = tasks.Ranking()
    pre = engine.load_preprocessor(dirpath=pp_dir, name=net_name)

    query_df = docs_df.copy()
    query_df['Question'] = query
    inputs = pre.transform(list(query_df.itertuples()), stage='predict')
    gen_predict = generators.PointGenerator(inputs,
                                            task,
                                            shuffle=False,
                                            stage='test')
    predictions = model._backend.predict_generator(gen_predict, verbose=1)
    idx = heapq.nlargest(nlargest, range(num_docs), predictions.ravel().take)
    results = []
    for candidate in idx:
        did = doc_lookup[candidate]
        d = docs[did]
        score = predictions[candidate][0]
        results.append((did, score, d))

    return results
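
A hedged call sketch for `predict`: it scores every document in the pre-processed corpus against a single query and returns the `nlargest` (document id, score, document text) tuples. The `load_config` and `load_model` helpers below are hypothetical stand-ins for whatever loading utilities the project actually provides.

# Hypothetical usage; `load_config` and `load_model` are illustrative stand-ins.
config = load_config('configs/dssm.json')
model = load_model(config)
for doc_id, score, text in predict(config, model, 'how do I reset my password?',
                                   nlargest=3):
    print(f'{doc_id}\t{score:.4f}\t{text[:80]}')
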
Example 5
model_setups = [
    (models.NaiveModel, None),
    (models.DenseBaselineModel, None),
    (models.DSSMModel, None)
]


@pytest.fixture(scope='module', params=[1, 32])
def num_samples(request):
    return request.param


@pytest.fixture(scope='module', params=[
    tasks.Classification(num_classes=2),
    tasks.Classification(num_classes=16),
    tasks.Ranking()
])
def task(request):
    return request.param


@pytest.fixture(params=model_setups)
def raw_model(request):
    model_class, custom_kwargs = request.param
    model = model_class()
    if custom_kwargs:
        for key, val in custom_kwargs.items():
            model.params[key] = val
    return model
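
These fixtures can be combined into a smoke test that builds every model against every task. A minimal sketch, assuming the `guess_and_fill_missing_params`/`build`/`compile` workflow used in the other examples:

def test_model_builds(raw_model, task):
    # Hypothetical smoke test; the exact build/compile API is an assumption.
    raw_model.params['task'] = task
    raw_model.guess_and_fill_missing_params(verbose=0)
    raw_model.build()
    raw_model.compile()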

Example 6
def _guess_task(train_pack):
    if np.issubdtype(train_pack.relation['label'].dtype, np.number):
        return tasks.Ranking()
    elif np.issubdtype(train_pack.relation['label'].dtype, list):
        num_classes = int(train_pack.relation['label'].apply(len).max())
        return tasks.Classification(num_classes)
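
`_guess_task` dispatches on the dtype of the relation's `label` column: numeric labels imply ranking, while list-valued labels (e.g. one-hot vectors) imply classification with as many classes as the longest label. A minimal sketch of the numeric case, using a bare namespace as a stand-in for a real `DataPack`:

import types

import pandas as pd

# Hypothetical illustration; a real DataPack is produced by the library's
# packing utilities, so a bare namespace stands in for it here.
fake_pack = types.SimpleNamespace(relation=pd.DataFrame({
    'id_left': ['q1', 'q1'],
    'id_right': ['d1', 'd2'],
    'label': [1.0, 0.0],               # numeric labels -> ranking
}))

print(_guess_task(fake_pack))          # a tasks.Ranking() instance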