# Imports assumed by the methods below. `Span` and `AnnotatedSpan` are
# project-local types defined elsewhere in this codebase; `cached_path`
# is AllenNLP's download-and-cache helper.
import codecs
import gzip
import json
import logging

from allennlp.common.file_utils import cached_path

logger = logging.getLogger(__name__)


def to_scored_spans(self, probs, score_mask):
    # Move tensors off the GPU so we can iterate over them cheaply.
    probs = probs.data.cpu()
    score_mask = score_mask.data.cpu()
    batch_size, num_spans = probs.size()
    spans = []
    for b in range(batch_size):
        batch_spans = []
        # start_end_range yields a (start, end, flat_index) triple for each
        # candidate span, where flat_index addresses probs[b, i].
        for start, end, i in self.start_end_range(num_spans):
            # Keep only spans that are unmasked and have nonzero probability.
            if score_mask[b, i] == 1 and probs[b, i] > 0:
                batch_spans.append((Span(start, end), probs[b, i]))
        spans.append(batch_spans)
    return spans
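# Usage sketch (hypothetical names, not from this codebase): pairing each
# surviving span with its probability lets a decoding step rank or threshold
# them. `span_logits` and `span_mask` stand in for whatever the surrounding
# model computes.
#
#     span_probs = torch.sigmoid(span_logits)              # (batch, num_spans)
#     scored = self.to_scored_spans(span_probs, span_mask)
#     for batch_spans in scored:
#         top_spans = sorted(batch_spans, key=lambda p: p[1], reverse=True)[:5]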
def _read(self, file_list: str):
    for file_path in file_list.split(","):
        if file_path.strip() == "":
            continue
        logger.info("Reading QASRL instances from dataset file at: %s", file_path)
        data = []
        if file_path.endswith(".gz"):
            with gzip.open(cached_path(file_path), "r") as f:
                for line in f:
                    data.append(json.loads(line))
        elif file_path.endswith(".jsonl"):
            with codecs.open(cached_path(file_path), "r", encoding="utf8") as f:
                for line in f:
                    data.append(json.loads(line))
        for item in data:
            sent_id = item["sentenceId"]
            sentence_tokens = item["sentenceTokens"]
            for _, verb_entry in item["verbEntries"].items():
                verb_index = verb_entry["verbIndex"]
                self._num_verbs += 1
                annotations = []
                for _, question_label in verb_entry["questionLabels"].items():
                    answers = len(question_label["answerJudgments"])
                    valid_answers = len([
                        ans for ans in question_label["answerJudgments"]
                        if ans["isValid"]
                    ])
                    # Optionally keep only questions whose source matches one of
                    # the configured prefixes.
                    if self._question_sources is not None:
                        if not any(
                                source.startswith(prefix)
                                for source in question_label["questionSources"]
                                for prefix in self._question_sources):
                            continue
                    if answers < self._min_answers:
                        self._not_enough_answers += 1
                        continue
                    if valid_answers < self._min_valid_answers:
                        self._not_enough_valid_answers += 1
                        continue
                    slots = [question_label["questionSlots"][slot_name]
                             for slot_name in self._slot_labels]
                    # NB: "provinence" [sic] matches the AnnotatedSpan keyword below.
                    provinence = list(question_label["questionSources"])[0]
                    # Collect all valid answer spans; the data stores an exclusive
                    # end index, while Span uses an inclusive one.
                    spans = []
                    for ans in question_label["answerJudgments"]:
                        if ans["isValid"]:
                            for s in ans["spans"]:
                                spans.append(Span(s[0], s[1] - 1))
                    self._qa_pairs += 1
                    annotations.append(
                        AnnotatedSpan(slots=slots, all_spans=spans,
                                      provinence=provinence))
                if annotations:
                    self._instances += 1
                    yield self._make_instance_from_text(
                        sentence_tokens, verb_index,
                        annotations=annotations, sent_id=sent_id)
                else:
                    self._no_ann += 1
    logger.info("Produced %d instances", self._instances)
    logger.info("\t%d Verbs", self._num_verbs)
    logger.info("\t%d QA pairs", self._qa_pairs)
    logger.info("\t%d no annotation", self._no_ann)
    logger.info("\t%d not enough answers", self._not_enough_answers)
    logger.info("\t%d not enough valid answers", self._not_enough_valid_answers)
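# Shape of one input record, inferred from the fields _read accesses above
# (the values here are illustrative placeholders, not real QA-SRL data, and
# only a subset of the question slots is shown):
#
#     {
#       "sentenceId": "sent-0",
#       "sentenceTokens": ["The", "cat", "slept", "."],
#       "verbEntries": {
#         "2": {
#           "verbIndex": 2,
#           "questionLabels": {
#             "Who slept?": {
#               "questionSlots": {"wh": "who", "verb": "slept"},
#               "questionSources": ["turk-0"],
#               "answerJudgments": [{"isValid": true, "spans": [[0, 2]]}]
#             }
#           }
#         }
#       }
#     }
#
# Note that answer spans arrive with an exclusive end index ([0, 2] covers
# tokens 0-1), which is why _read subtracts 1 when constructing Span objects.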