Esempio n. 1
0
 def to_scored_spans(self, probs, score_mask):
     """Collect the unmasked, positively scored spans of every batch row.

     Both inputs are moved to the CPU first. ``probs`` must be
     two-dimensional (its size is unpacked as ``batch_size, num_spans``)
     and ``score_mask`` is indexed with the same ``[row, column]`` pairs.

     Returns:
         A list with one entry per batch row. Each entry is a list of
         ``(Span(start, end), score)`` tuples, in the order produced by
         ``self.start_end_range(num_spans)``, where ``score`` is the
         corresponding element of ``probs``. A span is kept only when its
         mask entry equals 1 and its score is strictly greater than 0.
     """
     cpu_probs = probs.data.cpu()
     cpu_mask = score_mask.data.cpu()
     n_rows, n_spans = cpu_probs.size()

     # start_end_range is queried anew for every row, so it may safely be
     # a one-shot iterator.
     return [
         [
             (Span(first, last), cpu_probs[row, col])
             for first, last, col in self.start_end_range(n_spans)
             if cpu_mask[row, col] == 1 and cpu_probs[row, col] > 0
         ]
         for row in range(n_rows)
     ]
    def _read(self, file_list: str):
        """Yield one instance per verb that retains at least one QA annotation.

        ``file_list`` is a comma-separated string of dataset paths;
        whitespace around each path is ignored and empty entries are
        skipped. Every path is resolved through ``cached_path`` and read as
        JSON lines: ``.gz`` files through ``gzip`` and ``.jsonl`` files as
        UTF-8 text. A path with any other extension is skipped with a
        warning.

        For each verb entry of each sentence, a question label is kept when

        * ``self._question_sources`` is ``None``, or at least one of the
          label's ``questionSources`` starts with one of those prefixes;
        * it has at least ``self._min_answers`` answer judgments; and
        * at least ``self._min_valid_answers`` of them are valid.

        Verbs with one or more kept questions are passed to
        ``self._make_instance_from_text`` and the result is yielded.

        Side effects: the counters ``_num_verbs``, ``_qa_pairs``,
        ``_instances``, ``_no_ann``, ``_not_enough_answers`` and
        ``_not_enough_valid_answers`` on ``self`` are incremented while
        reading. They are not reset here, and the summary is logged only
        once the generator has been fully consumed.
        """
        for raw_path in file_list.split(","):
            # Tolerate whitespace around the separators ("a.jsonl, b.jsonl")
            # so the extension checks and cached_path see the real path.
            file_path = raw_path.strip()
            if not file_path:
                continue

            logger.info("Reading QASRL instances from dataset file at: %s",
                        file_path)
            if file_path.endswith('.gz'):
                # Lines are bytes in this mode; json.loads accepts bytes.
                with gzip.open(cached_path(file_path), 'r') as f:
                    data = [json.loads(line) for line in f]
            elif file_path.endswith(".jsonl"):
                with codecs.open(cached_path(file_path), 'r',
                                 encoding='utf8') as f:
                    data = [json.loads(line) for line in f]
            else:
                # Previously such paths were dropped without any notice.
                logger.warning(
                    "Skipping %s: expected a .gz or .jsonl file", file_path)
                continue

            for item in data:
                sent_id = item["sentenceId"]
                sentence_tokens = item["sentenceTokens"]

                for verb_entry in item["verbEntries"].values():
                    verb_index = verb_entry["verbIndex"]
                    self._num_verbs += 1

                    annotations = []
                    for question_label in verb_entry[
                            "questionLabels"].values():
                        judgments = question_label["answerJudgments"]
                        valid_judgments = [
                            ans for ans in judgments if ans["isValid"]
                        ]

                        # The source filter runs before the answer-count
                        # checks, so filtered-out questions do not touch
                        # the "not enough answers" counters.
                        if self._question_sources is not None and not any(
                                source.startswith(prefix)
                                for source in
                                question_label["questionSources"]
                                for prefix in self._question_sources):
                            continue

                        if len(judgments) < self._min_answers:
                            self._not_enough_answers += 1
                            continue
                        if len(valid_judgments) < self._min_valid_answers:
                            self._not_enough_valid_answers += 1
                            continue

                        slots = [
                            question_label["questionSlots"][label]
                            for label in self._slot_labels
                        ]

                        # Only the first listed source is recorded.
                        provinence = list(
                            question_label["questionSources"])[0]

                        # NOTE(review): the "- 1" presumably converts an
                        # exclusive end index into an inclusive one --
                        # confirm against the dataset format.
                        spans = [
                            Span(s[0], s[1] - 1)
                            for ans in valid_judgments
                            for s in ans["spans"]
                        ]

                        self._qa_pairs += 1
                        annotations.append(
                            AnnotatedSpan(slots=slots,
                                          all_spans=spans,
                                          provinence=provinence))

                    if annotations:
                        self._instances += 1
                        yield self._make_instance_from_text(
                            sentence_tokens,
                            verb_index,
                            annotations=annotations,
                            sent_id=sent_id)
                    else:
                        self._no_ann += 1

        # Lazy %-style arguments: the logger formats only if INFO is enabled.
        logger.info("Produced %d instances", self._instances)
        logger.info("\t%d Verbs", self._num_verbs)
        logger.info("\t%d QA pairs", self._qa_pairs)
        logger.info("\t%d no annotation", self._no_ann)
        logger.info("\t%d not enough answers", self._not_enough_answers)
        logger.info("\t%d not enough valid answers",
                    self._not_enough_valid_answers)