Example 1
    def id2vec(self, qid, posid, negid=None, **kwargs):
        query = self.qid2toks[qid]

        # TODO find a way to calculate qlen/doclen stats earlier, so we can log them and check sanity of our values
        qlen, doclen = self.config["maxqlen"], self.config["maxdoclen"]
        posdoc = self.get_doc_tokens(posid)
        if not posdoc:
            raise MissingDocError(qid, posid)

        idfs = padlist(self._get_idf(query), qlen, 0)
        query = self._tok2vec(padlist(query, qlen, self.pad_tok))
        posdoc = self._tok2vec(padlist(posdoc, doclen, self.pad_tok))

        # TODO determine whether pin_memory is happening. may not be because we don't place the strings in a np or torch object
        data = {
            "qid": qid,
            "posdocid": posid,
            "idfs": np.array(idfs, dtype=np.float32),
            "query": np.array(query, dtype=np.long),
            "posdoc": np.array(posdoc, dtype=np.long),
            "query_idf": np.array(idfs, dtype=np.float32),
            "negdocid": "",
            "negdoc": np.zeros(self.config["maxdoclen"], dtype=np.long),
        }

        if negid:
            negdoc = self.get_doc_tokens(negid)
            if not negdoc:
                raise MissingDocError(qid, negid)

            negdoc = self._tok2vec(padlist(negdoc, doclen, self.pad_tok))
            data["negdocid"] = negid
            data["negdoc"] = np.array(negdoc, dtype=np.long)

        return data
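
All four variants lean on a padlist helper that is not shown in the excerpt. A minimal sketch consistent with the call sites above (truncate a token list or right-pad it to a fixed length; the signature and the default pad value of 0 are assumptions, not the library's actual implementation) could look like:

def padlist(seq, target_len, pad_token=0):
    # Truncate seq to target_len, then right-pad with pad_token.
    padded = list(seq[:target_len])
    padded.extend([pad_token] * (target_len - len(padded)))
    return padded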
Example 2
    def id2vec(self, qid, posid, negid=None, label=None):
        """
        See parent class for docstring
        """
        assert label is not None

        maxseqlen = self.config["maxseqlen"]
        numpassages = self.config["numpassages"]

        query_toks = self.qid2toks[qid]
        pos_bert_inputs = []
        pos_bert_masks = []
        pos_bert_segs = []

        # N.B.: the passages in self.docid2passages are not BERT-tokenized
        pos_passages = self.docid2passages[posid]
        for tokenized_passage in pos_passages:
            inp, mask, seg = self._prepare_bert_input(query_toks,
                                                      tokenized_passage)
            pos_bert_inputs.append(inp)
            pos_bert_masks.append(mask)
            pos_bert_segs.append(seg)

        data = {
            "qid": qid,
            "posdocid": posid,
            "pos_bert_input": np.array(pos_bert_inputs, dtype=np.int64),
            "pos_mask": np.array(pos_bert_masks, dtype=np.int64),
            "pos_seg": np.array(pos_bert_segs, dtype=np.int64),
            "negdocid": "",
            "neg_bert_input": np.zeros((numpassages, maxseqlen),
                                       dtype=np.int64),
            "neg_mask": np.zeros((numpassages, maxseqlen), dtype=np.int64),
            "neg_seg": np.zeros((numpassages, maxseqlen), dtype=np.int64),
            "label": np.array(label, dtype=np.float32),
        }

        if not negid:
            return data

        neg_bert_inputs, neg_bert_masks, neg_bert_segs = [], [], []
        neg_passages = self.docid2passages[negid]
        for tokenized_passage in neg_passages:
            inp, mask, seg = self._prepare_bert_input(query_toks,
                                                      tokenized_passage)
            neg_bert_inputs.append(inp)
            neg_bert_masks.append(mask)
            neg_bert_segs.append(seg)

        if not neg_bert_inputs:
            raise MissingDocError(qid, negid)

        data["negdocid"] = negid
        data["neg_bert_input"] = np.array(neg_bert_inputs, dtype=np.long)
        data["neg_mask"] = np.array(neg_bert_masks, dtype=np.long)
        data["neg_seg"] = np.array(neg_bert_segs, dtype=np.long)

        return data
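
Example 2 delegates the per-passage work to _prepare_bert_input, which is not shown. A hypothetical sketch of such a helper is below; the [CLS] query [SEP] passage [SEP] layout, the self.tokenizer attribute, and the assumption that both inputs are already WordPiece tokens are all guesses from the call sites, not the library's actual implementation.

    def _prepare_bert_input(self, query_toks, passage_toks):
        maxseqlen = self.config["maxseqlen"]
        # Reserve room for [CLS] and the two [SEP] tokens.
        passage_toks = passage_toks[: maxseqlen - len(query_toks) - 3]
        toks = ["[CLS]"] + query_toks + ["[SEP]"] + passage_toks + ["[SEP]"]
        inp = self.tokenizer.convert_tokens_to_ids(toks)
        mask = [1] * len(inp)  # attend to real tokens only
        seg = [0] * (len(query_toks) + 2) + [1] * (len(passage_toks) + 1)
        padding = [0] * (maxseqlen - len(inp))
        return inp + padding, mask + padding, seg + padding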
Example 3
    def id2vec(self, qid, posid, negid=None, query=None):
        if query is not None:
            if qid is not None:
                raise RuntimeError("received both a qid and query, but only one can be passed")
            query = self["tokenizer"].tokenize(query)
        else:
            query = self.qid2toks[qid]

        # TODO find a way to calculate qlen/doclen stats earlier, so we can log them and check sanity of our values
        qlen, doclen = self.cfg["maxqlen"], self.cfg["maxdoclen"]
        posdoc = self.docid2toks.get(posid, None)
        if not posdoc:
            raise MissingDocError(qid, posid)

        idfs = padlist(self._get_idf(query), qlen, 0)
        query = self._tok2vec(padlist(query, qlen, self.pad_tok))
        posdoc = self._tok2vec(padlist(posdoc, doclen, self.pad_tok))

        # TODO determine whether pin_memory is happening. may not be because we don't place the strings in a np or torch object
        data = {
            "qid": qid,
            "posdocid": posid,
            "idfs": np.array(idfs, dtype=np.float32),
            "query": np.array(query, dtype=np.long),
            "posdoc": np.array(posdoc, dtype=np.long),
            "query_idf": np.array(idfs, dtype=np.float32),
        }

        if not negid:
            logger.debug(f"missing negtive doc id for qid {qid}")
            return data

        negdoc = self.docid2toks.get(negid, None)
        if not negdoc:
            raise MissingDocError(qid, negid)

        negdoc = self._tok2vec(padlist(negdoc, doclen, self.pad_tok))
        data["negdocid"] = negid
        data["negdoc"] = np.array(negdoc, dtype=np.long)

        return data
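
Unlike the other variants, Example 3 accepts either a qid or a raw query string, but never both. A hypothetical call site (the extractor instance and all ids here are made up for illustration):

# look up a known, already-tokenized query by id
features = extractor.id2vec(qid="301", posid="d1", negid="d2")

# or tokenize a free-form query string on the fly (qid must be None)
features = extractor.id2vec(qid=None, posid="d1", query="hubble telescope achievements")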
Example 4
    def id2vec(self, qid, posid, negid=None):
        tokenizer = self.tokenizer
        qlen, doclen = self.config["maxqlen"], self.config["maxdoclen"]

        query_toks = tokenizer.convert_tokens_to_ids(self.qid2toks[qid])
        query_mask = self.get_mask(query_toks, qlen)
        query = padlist(query_toks, qlen)

        posdoc_toks = tokenizer.convert_tokens_to_ids(self.docid2toks[posid])
        posdoc_mask = self.get_mask(posdoc_toks, doclen)
        posdoc = padlist(posdoc_toks, doclen)

        data = {
            "qid": qid,
            "posdocid": posid,
            "idfs": np.zeros(qlen, dtype=np.float32),
            "query": np.array(query, dtype=np.int64),
            "query_mask": np.array(query_mask, dtype=np.int64),
            "posdoc": np.array(posdoc, dtype=np.int64),
            "posdoc_mask": np.array(posdoc_mask, dtype=np.int64),
            # note: query token ids are reused here, cast to float, as
            # placeholder idf values
            "query_idf": np.array(query, dtype=np.float32),
            "negdocid": "",
            "negdoc": np.zeros(doclen, dtype=np.int64),
            "negdoc_mask": np.zeros(doclen, dtype=np.int64),
        }

        if negid:
            # a missing document must be caught before converting and padding:
            # convert_tokens_to_ids cannot handle None, and a padded list is
            # never empty
            negdoc_toks = self.docid2toks.get(negid)
            if not negdoc_toks:
                raise MissingDocError(qid, negid)

            negdoc_toks = tokenizer.convert_tokens_to_ids(negdoc_toks)
            negdoc_mask = self.get_mask(negdoc_toks, doclen)
            negdoc = padlist(negdoc_toks, doclen)

            data["negdocid"] = negid
            data["negdoc"] = np.array(negdoc, dtype=np.int64)
            data["negdoc_mask"] = np.array(negdoc_mask, dtype=np.int64)

        return data
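
Example 4 builds attention masks with a get_mask helper that is not shown. A minimal sketch matching its usage above (1 for real tokens, 0 for padding, clipped to the target length; the exact semantics are an assumption):

    def get_mask(self, toks, target_len):
        # 1 marks a real token, 0 marks padding; clip at target_len.
        mask = [1] * min(len(toks), target_len)
        mask.extend([0] * (target_len - len(mask)))
        return mask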