def test_extract_phrases_from_formatted_text():
    extractor = PhraseExtractor(["two-token phrase"])
    phrases = list(
        extractor.parse(
            "main.tex",
            r"In this \textbf{two-token phrase}, something happens."))
    assert len(phrases) == 1

def test_extract_phrases_starting_with_symbol():
    # This example is from arXiv paper 1811.11889.
    extractor = PhraseExtractor(["+D&M"])
    phrases = list(
        extractor.parse("main.tex", r"This sentence contains +D\&M."))
    assert len(phrases) == 1
    assert phrases[0].text == "+D&M"

def test_extract_phrases_containing_ampersands():
    # This example is from arXiv paper 1811.11889.
    extractor = PhraseExtractor(["D&M"])
    phrases = list(extractor.parse("main.tex", r"This sentence contains D\&M."))
    assert len(phrases) == 1
    assert phrases[0].text == "D&M"
    assert phrases[0].tex == r"D\&M"

def test_extract_phrases():
    extractor = PhraseExtractor(["word", "two-token phrase"])
    phrases = list(
        extractor.parse("main.tex",
                        "This sentence contains word and a two-token phrase."))

    phrase1 = phrases[0]
    assert phrase1.start == 23
    assert phrase1.end == 27
    assert phrase1.text == "word"

    phrase2 = phrases[1]
    assert phrase2.start == 34
    assert phrase2.end == 50
    assert phrase2.text == "two-token phrase"

def parse(self, tex_path: str, tex: str) -> Iterator[Term]:
    phrase_extractor = PhraseExtractor(list(self.glossary.keys()))
    for i, phrase in enumerate(phrase_extractor.parse(tex_path, tex)):
        entries = self.glossary[phrase.text]
        definitions = [e.definition for e in entries]
        sources = [e.source for e in entries]
        yield Term(
            id_=f"glossary-term-{i}",
            start=phrase.start,
            end=phrase.end,
            tex=phrase.tex,
            text=phrase.text,
            type_=None,
            tex_path=tex_path,
            context_tex=phrase.context_tex,
            definitions=definitions,
            sources=sources,
            sentence_id=None,
        )

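# A minimal, hypothetical usage sketch (not from the original source): the class that
# owns `parse` above is assumed to be constructed with a glossary mapping phrase text
# to entries exposing `.definition` and `.source`. `GlossaryTermExtractor` and
# `GlossaryEntry` are illustrative names only, not the module's real identifiers.
#
#     glossary = {
#         "two-token phrase": [GlossaryEntry(definition="...", source="manual")],
#     }
#     extractor = GlossaryTermExtractor(glossary)
#     for term in extractor.parse("main.tex", r"A \textbf{two-token phrase} appears."):
#         print(term.text, term.start, term.end, term.definitions, term.sources)
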
def test_extract_phrase_containing_single_letter():
    extractor = PhraseExtractor(["T"])
    phrases = list(
        extractor.parse("main.tex", "This sentence contains the letter T."))
    assert len(phrases) == 1
    assert phrases[0].text == "T"

def process(
        self, item: DetectDefinitionsTask
) -> Iterator[Union[Definiendum, Definition, TermReference]]:
    sentences_ordered = sorted(item.sentences, key=lambda s: s.start)
    num_sentences = len(sentences_ordered)

    if len(item.sentences) == 0:
        logging.warning(  # pylint: disable=logging-not-lazy
            "No sentences found for arXiv ID %s. Skipping detection of sentences "
            + "that contain entities.",
            item.arxiv_id,
        )
        return

    end_position_of_last_sentence = sentences_ordered[-1].end

    # Load the pre-trained definition detection model.
    prediction_type = "DocDef2+AI2020+W00"
    model = DefinitionDetectionModel(prediction_type)

    definition_index = 0
    features = []
    sentences: List[EmbellishedSentence] = []

    definiendums: Dict[TermName, List[Definiendum]] = defaultdict(list)
    term_phrases: List[str] = []
    abbreviations: List[str] = []
    symbol_nicks: List[str] = []
    definitions: Dict[DefinitionId, Definition] = {}

    with tqdm(total=num_sentences,
              disable=(not self.args.show_progress)) as progress:
        for sentence_index, sentence in enumerate(sentences_ordered):
            progress.update(1)

            # Only attempt to process sentences that have been marked as likely to be proper
            # plaintext. Note that this means some sentences may be skipped that didn't pass
            # heuristics in the sentence extractor.
            if not sentence.validity_guess:
                continue

            # Extract features from raw text.
            featurized_text = model.featurize(
                sentence.legacy_definition_input)
            features.append(featurized_text)
            sentences.append(sentence)

            # Process sentences in batches.
            if (len(features) >= self.args.batch_size
                    or sentence_index == num_sentences - 1):

                # Detect terms and definitions in each sentence with a pre-trained definition
                # extraction model, from the featurized text.
                (_, slots, slots_confidence) = model.predict_batch(
                    cast(List[Dict[Any, Any]], features))

                # Package extracted terms and definitions into a representation that's
                # easier to process.
                for (
                        s,
                        sentence_features,
                        termdef_sentence_slots,
                        termdef_sentence_slots_confidence,
                        abbrexp_sentence_slots,
                        abbrexp_sentence_slots_confidence,
                        symnick_sentence_slots,
                        symnick_sentence_slots_confidence,
                ) in zip(
                        sentences,
                        features,
                        slots["W00"],
                        slots_confidence["W00"],
                        slots["AI2020"],
                        slots_confidence["AI2020"],
                        slots["DocDef2"],
                        slots_confidence["DocDef2"],
                ):
                    # Extract TeX for each symbol from a parallel representation of the
                    # sentence, so that the TeX for symbols can be saved.
                    # Types of [term and definition] pairs:
                    # [nickname and definition] for symbols,
                    # [abbreviation and expansion] for abbreviations,
                    # [term and definition] for other types.
                    symbol_texs = get_symbol_texs(
                        s.legacy_definition_input, s.with_formulas_marked)

                    # Only process slots when they include both 'TERM' and 'DEFINITION'.
                    if ("TERM" not in termdef_sentence_slots
                            or "DEF" not in termdef_sentence_slots):
                        term_definition_pairs = []
                    else:
                        term_definition_pairs = consolidate_keyword_definitions(
                            s.legacy_definition_input,
                            sentence_features["tokens"],
                            termdef_sentence_slots,
                            termdef_sentence_slots_confidence,
                            "W00",
                        )

                    if ("TERM" not in abbrexp_sentence_slots
                            or "DEF" not in abbrexp_sentence_slots):
                        abbreviation_expansion_pairs = []
                    else:
                        abbreviation_expansion_pairs = consolidate_keyword_definitions(
                            s.legacy_definition_input,
                            sentence_features["tokens"],
                            abbrexp_sentence_slots,
                            abbrexp_sentence_slots_confidence,
                            "AI2020",
                        )

                    if ("TERM" not in symnick_sentence_slots
                            or "DEF" not in symnick_sentence_slots):
                        symbol_nickname_pairs = []
                    else:
                        symbol_nickname_pairs = consolidate_keyword_definitions(
                            s.legacy_definition_input,
                            sentence_features["tokens"],
                            symnick_sentence_slots,
                            symnick_sentence_slots_confidence,
                            "DocDef2",
                        )

                    pairs = (term_definition_pairs + symbol_nickname_pairs +
                             abbreviation_expansion_pairs)
                    for pair in pairs:
                        tex_path = s.tex_path
                        definiendum_id = (
                            f"definiendum-{tex_path}-{definition_index}")
                        definition_id = f"definition-{tex_path}-{definition_index}"
                        definiendum_text = pair.term_text
                        definiendum_type = pair.term_type
                        definition_type = pair.definition_type
                        definiendum_confidence = pair.term_confidence
                        definition_confidence = pair.definition_confidence

                        # Map definiendum and definition start and end positions back to
                        # their original positions in the TeX.
                        offsets = s.legacy_definition_input_journal.initial_offsets(
                            pair.term_start, pair.term_end)
                        if offsets[0] is None or offsets[1] is None:
                            logging.warning(  # pylint: disable=logging-not-lazy
                                "Could not find offsets of definiendum %s in original TeX "
                                + "(from sentence %s, file %s, arXiv ID %s). Definiendum will not be saved.",
                                pair.term_text,
                                s.id_,
                                s.tex_path,
                                item.arxiv_id,
                            )
                            continue
                        definiendum_start = s.start + offsets[0]
                        definiendum_end = s.start + offsets[1]

                        offsets = s.legacy_definition_input_journal.initial_offsets(
                            pair.definition_start, pair.definition_end)
                        if offsets[0] is None or offsets[1] is None:
                            logging.warning(  # pylint: disable=logging-not-lazy
                                "Could not find offsets of definition %s in original TeX "
                                + "(from sentence %s, file %s, arXiv ID %s). Definiendum will not be saved.",
                                pair.definition_text,
                                s.id_,
                                s.tex_path,
                                item.arxiv_id,
                            )
                            continue
                        definition_start = s.start + offsets[0]
                        definition_end = s.start + offsets[1]

                        # Extract document-level features from sentence.
                        position_ratio = (definiendum_start /
                                          end_position_of_last_sentence)
                        section_name = s.section_name

                        try:
                            tex = item.tex_by_file[tex_path]
                        except KeyError:
                            logging.warning(  # pylint: disable=logging-not-lazy
                                "Could not find TeX for %s. TeX will not be included in "
                                + "the output data for definition '%s' for term '%s'",
                                tex_path,
                                pair.definition_text,
                                definiendum_text,
                            )
                            definiendum_tex = "NOT AVAILABLE"
                            definition_tex = "NOT AVAILABLE"
                        else:
                            if (definiendum_type == "symbol"
                                    and symbol_texs is not None
                                    and pair.term_start in symbol_texs):
                                definiendum_tex = symbol_texs[pair.term_start]
                                definiendum_text = definiendum_tex
                            else:
                                definiendum_tex = tex.contents[
                                    definiendum_start:definiendum_end]
                            definition_tex = tex.contents[
                                definition_start:definition_end]

                        # Save the definition to file.
                        definition = Definition(
                            id_=definition_id,
                            start=definition_start,
                            end=definition_end,
                            definiendum=definiendum_text,
                            type_=definition_type,
                            tex_path=tex_path,
                            tex=definition_tex,
                            text=pair.definition_text,
                            context_tex=s.context_tex,
                            sentence_id=s.id_,
                            intent=True,
                            confidence=definition_confidence,
                        )
                        definitions[definition_id] = definition
                        yield definition

                        # Don't save the definiendum to file yet. Save it in memory first, and then
                        # save it to file once it's done being processed. It will need
                        # to be associated with other definitions. Also, other references
                        # to the term will be detected before this method is over.
                        definiendum = Definiendum(
                            id_=definiendum_id,
                            text=definiendum_text,
                            type_=definiendum_type,
                            confidence=definiendum_confidence,
                            # Link the definiendum to the text that defined it.
                            definition_id=definition_id,
                            # Because a term can be defined multiple places in the paper, these
                            # three lists of definition data will be filled out once all of the
                            # definitions have been found.
                            definition_ids=[],
                            definitions=[],
                            definition_texs=[],
                            sources=[],
                            start=definiendum_start,
                            end=definiendum_end,
                            tex_path=tex_path,
                            tex=definiendum_tex,
                            context_tex=s.context_tex,
                            sentence_id=s.id_,
                            # Document-level features below.
                            position_ratio=position_ratio,
                            position_ratios=[],
                            section_name=section_name,
                            section_names=[],
                        )
                        definiendums[definiendum_text].append(definiendum)
                        if definiendum.type_ == "term":
                            term_phrases.append(definiendum.text)
                        if definiendum.type_ == "abbreviation":
                            abbreviations.append(definiendum.text)
                        if definiendum.type_ == "symbol":
                            symbol_nicks.append(definiendum.text)
                        definition_index += 1

                features = []
                sentences = []

    logging.debug(  # pylint: disable=logging-not-lazy
        "Finished detecting definitions for paper %s. Now finding references to defined terms.",
        item.arxiv_id,
    )

    all_definiendums: List[Definiendum] = []
    for _, definiendum_list in definiendums.items():
        all_definiendums.extend(definiendum_list)
    definition_ids: Dict[TermName, List[DefinitionId]] = {}
    definition_texs: Dict[TermName, List[str]] = {}
    definition_texts: Dict[TermName, List[str]] = {}
    sources: Dict[TermName, List[str]] = {}
    position_ratios: Dict[TermName, List[float]] = {}
    section_names: Dict[TermName, List[str]] = {}

    # Associate terms with all definitions that apply to them.
    for term, definiendum_list in definiendums.items():
        definition_ids[term] = [
            definiendum.definition_id for definiendum in definiendum_list
        ]
        definition_texs[term] = [
            definitions[definiendum.definition_id].tex
            for definiendum in definiendum_list
        ]
        definition_texts[term] = [
            definitions[definiendum.definition_id].text
            for definiendum in definiendum_list
        ]
        sources[term] = ["model"] * len(definition_ids[term])
        position_ratios[term] = [
            definiendum.position_ratio for definiendum in definiendum_list
        ]
        section_names[term] = [
            definiendum.section_name for definiendum in definiendum_list
            if definiendum.section_name is not None
        ]

    # Associate each definiendum with all applicable definitions, and save them to file.
    for _, definiendum_list in definiendums.items():
        for definiendum in definiendum_list:
            definiendum.definition_ids.extend(
                definition_ids[definiendum.text])
            definiendum.definition_texs.extend(
                definition_texs[definiendum.text])
            definiendum.definitions.extend(
                definition_texts[definiendum.text])
            definiendum.sources.extend(sources[definiendum.text])
            definiendum.position_ratios.extend(
                position_ratios[definiendum.text])
            definiendum.section_names.extend(
                section_names[definiendum.text])
            yield definiendum

    # Detect all other references to the defined terms. Detect references to textual
    # terms and abbreviations. References to symbols need not be found here; they
    # will be detected automatically in the symbol extraction code.
    term_index = 0
    for tex_path, file_contents in item.tex_by_file.items():
        term_extractor = PhraseExtractor(term_phrases + abbreviations)
        for t in term_extractor.parse(tex_path, file_contents.contents):
            # Don't save term references if they are already in the definiendums.
            if any([
                    overlaps(definiendum, t)
                    for definiendum in all_definiendums
            ]):
                continue
            logging.debug(
                "Found reference to term %s at (%d, %d) in %s for arXiv ID %s",
                t.text,
                t.start,
                t.end,
                t.tex_path,
                item.arxiv_id,
            )
            type_ = ("abbreviation" if t.text in abbreviations else
                     "term" if t.text in term_phrases else
                     "symbol" if t.text in symbol_nicks else "unknown")
            yield TermReference(
                id_=f"term-{t.tex_path}-{term_index}",
                text=t.text,
                type_=type_,
                definition_ids=definition_ids[t.text],
                definitions=definition_texts[t.text],
                definition_texs=definition_texs[t.text],
                sources=sources[t.text],
                position_ratios=position_ratios[t.text],
                section_names=section_names[t.text],
                start=t.start,
                end=t.end,
                tex_path=t.tex_path,
                tex=t.tex,
                context_tex=t.context_tex,
            )
            term_index += 1

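# Hypothetical sketch, not the project's actual helper: the `overlaps` call above is
# assumed to test whether two extracted entities cover intersecting character spans
# in the same TeX file. A minimal version might look like the following
# (`overlaps_sketch` is an illustrative name).
def overlaps_sketch(entity1, entity2) -> bool:
    # Spans in different files can never overlap.
    if entity1.tex_path != entity2.tex_path:
        return False
    # Two character ranges intersect if each starts before the other ends.
    return entity1.start < entity2.end and entity2.start < entity1.end
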
def process(
    self, item: DetectDefinitionsTask
) -> Iterator[Union[Definiendum, Definition, TermReference]]:
    sentences_ordered = sorted(item.sentences, key=lambda s: s.start)
    num_sentences = len(sentences_ordered)

    if len(item.sentences) == 0:
        logging.warning(  # pylint: disable=logging-not-lazy
            "No sentences found for arXiv ID %s. Skipping detection of sentences "
            + "that contain entities.",
            item.arxiv_id,
        )
        return

    # Load the pre-trained definition detection model.
    model = DefinitionDetectionModel()

    definition_index = 0
    features = []
    sentences = []

    definiendums: Dict[TermName, List[Definiendum]] = defaultdict(list)
    definitions: Dict[DefinitionId, Definition] = {}
    with tqdm(
        total=num_sentences, disable=(not self.args.show_progress)
    ) as progress:
        for si, sentence in enumerate(sentences_ordered):
            progress.update(1)

            # Only attempt to process sentences that have been marked as likely to be proper
            # plaintext. Note that this means some sentences may be skipped that didn't pass
            # heuristics in the sentence extractor.
            if not sentence.validity_guess:
                continue

            # Extract features from raw text.
            featurized_text = model.featurize(sentence.legacy_definition_input)
            features.append(featurized_text)
            sentences.append(sentence)

            # Process sentences in batches.
            if len(features) >= self.args.batch_size or si == num_sentences - 1:

                # Detect terms and definitions in each sentence with a pre-trained definition
                # extraction model, from the featurized text.
                intents, slots = model.predict_batch(
                    cast(List[Dict[Any, Any]], features)
                )

                for s, sentence_features, intent, sentence_slots in zip(
                    sentences, features, intents, slots
                ):
                    # Only process slots when they include both 'TERM' and 'DEFINITION'.
                    if "TERM" not in sentence_slots or "DEF" not in sentence_slots:
                        continue

                    # Package extracted terms and definitions into a representation that's
                    # easier to process.
                    pairs = get_term_definition_pairs(
                        s.legacy_definition_input,
                        sentence_features,
                        sentence_slots,
                    )

                    # Extract TeX for each symbol from a parallel representation of the
                    # sentence, so that the TeX for symbols can be saved.
                    symbol_texs = get_symbol_texs(
                        s.legacy_definition_input, s.with_equation_tex
                    )

                    for pair in pairs:
                        tex_path = s.tex_path
                        definiendum_id = (
                            f"definiendum-{tex_path}-{definition_index}"
                        )
                        definition_id = f"definition-{tex_path}-{definition_index}"
                        definiendum_text = pair.term_text
                        definiendum_type = (
                            "symbol" if "SYMBOL" in definiendum_text else "term"
                        )

                        # Map definiendum and definition start and end positions back to
                        # their original positions in the TeX.
                        offsets = s.legacy_definition_input_journal.initial_offsets(
                            pair.term_start, pair.term_end
                        )
                        if offsets[0] is None or offsets[1] is None:
                            logging.warning(  # pylint: disable=logging-not-lazy
                                "Could not find offsets of definiendum %s in original TeX "
                                + "(from sentence %s, file %s, arXiv ID %s). Definiendum will not be saved.",
                                pair.term_text,
                                s.id_,
                                s.tex_path,
                                item.arxiv_id,
                            )
                            continue
                        definiendum_start = s.start + offsets[0]
                        definiendum_end = s.start + offsets[1]

                        offsets = s.legacy_definition_input_journal.initial_offsets(
                            pair.definition_start, pair.definition_end
                        )
                        if offsets[0] is None or offsets[1] is None:
                            logging.warning(  # pylint: disable=logging-not-lazy
                                "Could not find offsets of definition %s in original TeX "
                                + "(from sentence %s, file %s, arXiv ID %s). Definiendum will not be saved.",
                                pair.definition_text,
                                s.id_,
                                s.tex_path,
                                item.arxiv_id,
                            )
                            continue
                        definition_start = s.start + offsets[0]
                        definition_end = s.start + offsets[1]

                        try:
                            tex = item.tex_by_file[tex_path]
                        except KeyError:
                            logging.warning(  # pylint: disable=logging-not-lazy
                                "Could not find TeX for %s. TeX will not be included in "
                                + "the output data for definition '%s' for term '%s'",
                                tex_path,
                                pair.definition_text,
                                definiendum_text,
                            )
                            definiendum_tex = "NOT AVAILABLE"
                            definition_tex = "NOT AVAILABLE"
                        else:
                            if (
                                definiendum_type == "symbol"
                                and symbol_texs is not None
                                and pair.term_start in symbol_texs
                            ):
                                definiendum_tex = symbol_texs[pair.term_start]
                                definiendum_text = definiendum_tex
                            else:
                                definiendum_tex = tex.contents[
                                    definiendum_start:definiendum_end
                                ]
                            definition_tex = tex.contents[
                                definition_start:definition_end
                            ]

                        # Save the definition to file.
                        definition = Definition(
                            id_=definition_id,
                            start=definition_start,
                            end=definition_end,
                            definiendum=definiendum_text,
                            type_=None,
                            tex_path=tex_path,
                            tex=definition_tex,
                            text=pair.definition_text,
                            context_tex=s.context_tex,
                            sentence_id=s.id_,
                            intent=bool(intent),
                            confidence=None,
                        )
                        definitions[definition_id] = definition
                        yield definition

                        # Don't save the definiendum to file yet. Save it in memory first, and then
                        # save it to file once it's done being processed. It will need
                        # to be associated with other definitions. Also, other references
                        # to the term will be detected before this method is over.
                        definiendums[definiendum_text].append(
                            Definiendum(
                                id_=definiendum_id,
                                text=definiendum_text,
                                type_=definiendum_type,
                                confidence=None,
                                # Link the definiendum to the text that defined it.
                                definition_id=definition_id,
                                # Because a term can be defined multiple places in the paper, these
                                # three lists of definition data will be filled out once all of the
                                # definitions have been found.
                                definition_ids=[],
                                definitions=[],
                                definition_texs=[],
                                sources=[],
                                start=definiendum_start,
                                end=definiendum_end,
                                tex_path=tex_path,
                                tex=definiendum_tex,
                                context_tex=s.context_tex,
                                sentence_id=s.id_,
                            )
                        )
                        definition_index += 1

                features = []
                sentences = []

    logging.debug(
        "Finished detecting definitions for paper %s. Now finding references to defined terms.",
        item.arxiv_id,
    )

    all_definiendums: List[Definiendum] = []
    for _, definiendum_list in definiendums.items():
        all_definiendums.extend(definiendum_list)
    term_phrases: List[TermName] = list(definiendums.keys())
    definition_ids: Dict[TermName, List[DefinitionId]] = {}
    definition_texs: Dict[TermName, List[str]] = {}
    definition_texts: Dict[TermName, List[str]] = {}
    sources: Dict[TermName, List[str]] = {}

    # Associate terms with all definitions that apply to them.
    for term, definiendum_list in definiendums.items():
        definition_ids[term] = [d.definition_id for d in definiendum_list]
        definition_texs[term] = [
            definitions[d.definition_id].tex for d in definiendum_list
        ]
        definition_texts[term] = [
            definitions[d.definition_id].text for d in definiendum_list
        ]
        sources[term] = ["model"] * len(definition_ids[term])

    # Associate each definiendum with all applicable definitions, and save them to file.
    for _, definiendum_list in definiendums.items():
        for d in definiendum_list:
            d.definition_ids.extend(definition_ids[d.text])
            d.definition_texs.extend(definition_texs[d.text])
            d.definitions.extend(definition_texts[d.text])
            d.sources.extend(sources[d.text])
            yield d

    # Detect all other references to the defined terms.
    term_index = 0
    sentence_entities: List[SerializableEntity] = cast(
        List[SerializableEntity], item.sentences
    )

    for tex_path, file_contents in item.tex_by_file.items():
        term_extractor = PhraseExtractor(term_phrases)
        for t in term_extractor.parse(tex_path, file_contents.contents):
            t_sentence = get_containing_entity(t, sentence_entities)

            # Don't save term references if they are already in the definiendums.
            if any([overlaps(d, t) for d in all_definiendums]):
                continue

            logging.debug(
                "Found reference to term %s at (%d, %d) in %s for arXiv ID %s",
                t.text,
                t.start,
                t.end,
                t.tex_path,
                item.arxiv_id,
            )
            yield TermReference(
                id_=f"term-{t.tex_path}-{term_index}",
                text=t.text,
                type_=None,
                definition_ids=definition_ids[t.text],
                definitions=definition_texts[t.text],
                definition_texs=definition_texs[t.text],
                sources=sources[t.text],
                start=t.start,
                end=t.end,
                tex_path=t.tex_path,
                tex=t.tex,
                context_tex=t.context_tex,
                sentence_id=t_sentence.id_ if t_sentence is not None else None,
            )
            term_index += 1

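# Hypothetical sketch, not the project's actual helper: `get_containing_entity` above
# is assumed to return the sentence entity whose span fully contains the term in the
# same TeX file, or None when the term falls outside every known sentence.
def get_containing_entity_sketch(term, entities):
    for entity in entities:
        if (
            entity.tex_path == term.tex_path
            and entity.start <= term.start
            and term.end <= entity.end
        ):
            return entity
    return None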