import logging
import os
import random
import xml.etree.ElementTree as ET
from typing import Dict, List

import numpy as np
import pandas as pd
import stanza


def read_medquad_raw_dataset() -> List[Dict]:
    logging.basicConfig(level=logging.INFO)
    tokenizer = stanza.Pipeline(lang='en', processors='tokenize')
    ds = []
    nb_generate_data = 0
    for subset_dir in os.listdir(MEDQUAD_RAW_DIR):
        dirpath = f"{MEDQUAD_RAW_DIR}/{subset_dir}"
        if os.path.isdir(dirpath):
            for xml_file in os.listdir(dirpath):
                filepath = f"{dirpath}/{xml_file}"
                if os.path.isfile(filepath) and xml_file.endswith(".xml"):
                    parsed = ET.parse(filepath)
                    qa_pairs = parsed.getroot().find('QAPairs')
                    pair_tag = "QAPair"
                    q_tag = "Question"
                    a_tag = "Answer"
                    if qa_pairs is None:
                        # Some documents use lowercase tag names but share the same structure
                        qa_pairs = parsed.getroot().find('qaPairs')
                        pair_tag = "pair"
                        q_tag = "question"
                        a_tag = "answer"
                    if qa_pairs is None:
                        logging.warning(f"No QAPairs tag in {ET.tostring(parsed.getroot())}")
                        continue
                    for qa in qa_pairs.findall(pair_tag):
                        question = qa.find(q_tag).text
                        answer = qa.find(a_tag).text
                        if not isinstance(question, str) or not isinstance(answer, str) \
                                or len(question) == 0 or len(answer) == 0:
                            logging.warning(f"Issue with QA pair: \n'{question}' \n'{answer}'")
                            continue
                        question_tokens = tokenizer.process(question).sentences[0].tokens
                        paragraph = tokenizer.process(answer)
                        for i in range(0, len(paragraph.sentences), 2):
                            # Takes 2 sentences at a time
                            if i + 1 < len(paragraph.sentences):
                                tokens = paragraph.sentences[i].tokens + paragraph.sentences[i + 1].tokens
                            else:
                                tokens = paragraph.sentences[i].tokens
                            answer_content = array_to_string(list(tok.text for tok in tokens))
                            question_content = array_to_string(list(tok.text for tok in question_tokens)).lower()
                            ds.append({
                                'question': question_content,
                                'answer': answer_content,
                                'sub_dataset': subset_dir,
                                'filename': xml_file
                            })
                            nb_generate_data += 1
                            if nb_generate_data % 10 == 0:
                                logging.info(f"Processed {nb_generate_data}")
    random.shuffle(ds)
    return ds

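# The functions in this module rely on project-level names defined elsewhere
# (MEDQUAD_RAW_DIR, MEDQA_HANDMADE_FILEPATH, NQG_DATA_HOME) and on the helper
# `array_to_string`. A minimal sketch of that helper, assuming it simply joins
# tokens with single spaces (hypothetical reconstruction, not the project's
# actual definition):
def array_to_string(tokens) -> str:
    return " ".join(str(token) for token in tokens)
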
def create_pos_sequences(self):
    pos_sequences = []
    for passage in self.passages:
        # Creates the POS sequence
        pos_sequence = []
        for word in passage:
            pos_sequence.append(word.xpos)
        pos_sequences.append(array_to_string(pos_sequence))
    return np.array(pos_sequences)

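# Illustration of the POS extraction above (comment-only example): with the
# stanza `pos` processor enabled, each word exposes `word.xpos`, a Penn
# Treebank tag, e.g.:
#
#   nlp = stanza.Pipeline(lang='en', processors='tokenize,pos')
#   doc = nlp("The patient was discharged .")
#   [word.xpos for word in doc.sentences[0].words]  # -> ['DT', 'NN', 'VBD', 'VBN', '.']
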
def generate_medqa_handmade_dataset(ds_path):
    ds_raw = pd.read_csv(ds_path, sep='|')
    tokenizer = stanza.Pipeline(lang='en', processors='tokenize')
    ds = []
    for question, answer in zip(ds_raw['question'], ds_raw['answer']):
        question_tokens = tokenizer.process(question).sentences[0].tokens
        paragraph = tokenizer.process(answer)
        for i in range(0, len(paragraph.sentences), 2):
            # Takes 2 sentences at a time
            if i + 1 < len(paragraph.sentences):
                tokens = paragraph.sentences[i].tokens + paragraph.sentences[i + 1].tokens
            else:
                tokens = paragraph.sentences[i].tokens
            answer_content = array_to_string(list(tok.text for tok in tokens))
            question_content = array_to_string(list(tok.text for tok in question_tokens)).lower()
            ds.append({
                'question': question_content,
                'answer': answer_content,
            })
    pd.DataFrame(ds).to_csv(MEDQA_HANDMADE_FILEPATH, index=False, sep="|")

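# Example invocation (hypothetical path): the CSV is expected to contain
# '|'-separated 'question' and 'answer' columns, as read above.
#
#   generate_medqa_handmade_dataset("data/medqa_handmade/qa_pairs.csv")
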
def create_case_sequences(self) -> np.ndarray:
    """
    :return: The casing sequence for each passage ('UP' when the word's first letter
             is capitalized, 'LOW' otherwise).
    """
    case_seqs = []
    for passage in self.passages:
        case_seq = np.array(list("LOW" for _ in range(len(passage))))
        case_indices = np.where(list(str.isupper(word.text[0]) for word in passage))
        case_seq[case_indices] = "UP"
        case_seqs.append(array_to_string(case_seq))
    return np.array(case_seqs)

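# Worked example for the casing feature above (comment-only): the passage
# "Aspirin lowers Fever" yields the sequence "UP LOW UP".
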
def create_ner_sequence(enhanced_ner, passage, ner_mapping=None):
    # Takes care of creating the NER sequence
    ner_sequence = np.full(shape=len(passage), fill_value='O', dtype=object)
    i = 0
    for word in passage:
        # Strips the BIO prefix ('B-', 'I-') unless the tag is a bare 'O'
        token_ner = word.parent._ner if len(word.parent._ner) == 1 else word.parent._ner[2:]
        # Keeps either the most recent NER tag set or the one used in the original NQG paper
        ner_sequence[i] = token_ner if enhanced_ner else ner_mapping(token_ner)
        i += 1
    return array_to_string(ner_sequence)

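# `ner_mapping` is expected to be a callable collapsing stanza's fine-grained
# entity types into the coarse tag set used by the original NQG paper. A
# minimal sketch of such a mapping (hypothetical; the project's actual
# mapping is defined elsewhere):
def nqg_ner_mapping(token_ner: str) -> str:
    coarse = {"PERSON": "PER", "GPE": "LOC", "LOC": "LOC", "ORG": "ORG"}
    if token_ner == "O":
        return "O"
    return coarse.get(token_ner, "MISC")
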
def create_bio_sequences(self, answer_starts: np.ndarray, answer_lengths: np.ndarray) -> np.ndarray:
    """
    :param answer_starts: Indices of where the answers start for each passage.
    :param answer_lengths: The lengths (number of words) of each answer.
    :return: The BIO sequence of each passage.
    """
    bio_seqs = []
    for passage, answer_start, answer_length in zip(self.passages, answer_starts, answer_lengths):
        bio = list('O' for _ in range(len(passage)))
        bio[answer_start] = 'B'
        if answer_length > 1:
            for i in range(answer_start + 1, answer_start + answer_length):
                bio[i] = 'I'
        bio_seqs.append(array_to_string(bio))
    return np.array(bio_seqs)

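# Worked example for the BIO tagging above (comment-only): a 6-word passage
# with answer_start=2 and answer_length=3 yields "O O B I I O".
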
def create_ner_sequences(self, enhanced_ner):
    ner_sequences = []
    for passage in self.passages:
        # Takes care of creating the NER sequence
        ner_sequence = np.full(shape=len(passage), fill_value='O', dtype=object)
        i = 0
        for word in passage:
            # Strips the BIO prefix ('B-', 'I-') unless the tag is a bare 'O'
            token_ner = word.parent._ner if len(word.parent._ner) == 1 else word.parent._ner[2:]
            # Keeps either the most recent NER tag set or the one used in the original NQG paper
            ner_sequence[i] = token_ner if enhanced_ner else self._ner_mapping(token_ner)
            i += 1
        ner_sequences.append(array_to_string(ner_sequence))
    return np.array(ner_sequences)

def generate_bio_features(mode: str, ds_name: str, answer_mode: str):
    assert answer_mode in ("none", "guess")
    source_dir = f"{NQG_DATA_HOME}/{ds_name}/{mode}"
    target_dir = f"{NQG_DATA_HOME}/{ds_name}"
    if answer_mode == "none":
        target_dir += "_NA"
    else:
        target_dir += "_GA"
    assert os.path.exists(source_dir) and os.path.isdir(source_dir)
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)
    if not os.path.exists(f"{target_dir}/{mode}"):
        os.mkdir(f"{target_dir}/{mode}")

    if answer_mode == "none":
        # No answer supervision: tag the whole passage as the answer span
        bios = []
        source_passages = np.loadtxt(f"{source_dir}/data.txt.source.txt", dtype=str,
                                     delimiter='\n', comments=None)
        for passage in source_passages:
            bio = ["I" for _ in range(len(passage.split(" ")))]
            bio[0] = "B"
            bios.append(array_to_string(bio))

    if answer_mode == "guess":
        # Guesses the answer span: longest named-entity run, else the first
        # noun phrase, else the full passage
        corpus_named_entities = np.loadtxt(f"{source_dir}/data.txt.ner", dtype=str,
                                           delimiter='\n', comments=None)
        corpus_pos_tags = np.loadtxt(f"{source_dir}/data.txt.pos", dtype=str,
                                     delimiter='\n', comments=None)
        bios = []
        for named_entities, pos_tags in zip(corpus_named_entities, corpus_pos_tags):
            named_entities = named_entities.split(' ')
            longest_ne_seq = []
            current_seq_length = []
            for i in range(len(named_entities)):
                ne = named_entities[i]
                if ne != 'O':
                    current_seq_length.append(i)
                else:
                    if len(current_seq_length) > len(longest_ne_seq):
                        longest_ne_seq = current_seq_length
                    current_seq_length = []
            # Handles a named-entity run that reaches the end of the passage
            if len(current_seq_length) > len(longest_ne_seq):
                longest_ne_seq = current_seq_length
            if len(longest_ne_seq) == 0:
                # No named entities in this passage, so we take the first noun phrase
                pos_tags = pos_tags.split(' ')
                bio = ["O" for _ in range(len(pos_tags))]
                i = 0
                while i < len(pos_tags):
                    if pos_tags[i].startswith("NN"):
                        bio[i] = "B"
                        i += 1
                        break
                    i += 1
                while i < len(pos_tags) and pos_tags[i].startswith("NN"):
                    bio[i] = "I"
                    i += 1
                if "B" not in bio:
                    # No noun either; we fall back on using the full passage as the answer
                    bio = ['B'] + ['I' for _ in range(len(named_entities) - 1)]
            else:
                bio = ['O' for _ in range(len(named_entities))]
                bio[longest_ne_seq[0]] = "B"
                for i in longest_ne_seq[1:]:
                    bio[i] = "I"
            bios.append(array_to_string(bio))
    np.savetxt(f"{target_dir}/{mode}/data.txt.bio", bios, fmt="%s")

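# Example invocation (hypothetical dataset name): assumes the NQG-formatted
# files data.txt.source.txt, data.txt.ner and data.txt.pos already exist
# under {NQG_DATA_HOME}/{ds_name}/{mode}.
#
#   generate_bio_features(mode="train", ds_name="medquad", answer_mode="guess")
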
def uncased_sequences(self):
    return list(array_to_string(list(word.text.lower() for word in sequence))
                for sequence in self.passages)