Example #1
    def __init__(
        self,
        ner_model: str = None,
        rel_model: str = None,
        qg_model: str = None,
        qa_model: str = None,
        bert_score_model: str = None,
    ):
        """
        FactSumm object used to calculate the factual consistency score of an abstractive summarization model

        Args:
            ner_model (str, optional): NER model to be used (Flair or HuggingFace). Defaults to None.
            rel_model (str, optional): RE model to be used (HuggingFace). Defaults to None.
            qg_model (str, optional): QG model to be used (HuggingFace). Defaults to None.
            qa_model (str, optional): QA model to be used (HuggingFace). Defaults to None.
            bert_score_model (str, optional): BERTScore model to be used (HuggingFace). Defaults to None.

        """
        self.config = Config()
        self.segmenter = pysbd.Segmenter(language="en", clean=False)
        self.rouge = RougeCalculator(stopwords=True, lang="en")

        # NER, RE, QG, QA models supported by HuggingFace can be used (defaults can be found in `config.py`)
        self.ner = ner_model if ner_model is not None else self.config.NER_MODEL
        self.rel = rel_model if rel_model is not None else self.config.REL_MODEL
        self.qg = qg_model if qg_model is not None else self.config.QG_MODEL
        self.qa = qa_model if qa_model is not None else self.config.QA_MODEL
        self.bert_score = bert_score_model if bert_score_model is not None else self.config.BERT_SCORE_MODEL
        self.ie = None
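
A minimal instantiation sketch for the constructor above; the HuggingFace model name is an illustrative assumption, and any omitted argument falls back to the default defined in config.py.

# Instantiation sketch (not from the original source). The model name is
# an illustrative assumption; omitted arguments fall back to config.py defaults.
factsumm = FactSumm()
custom = FactSumm(ner_model="dslim/bert-base-NER")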
Example #2
def test_sentence_segmentation(examples):
    seg = pysbd.Segmenter(language="en", clean=False, char_span=True)
    for example in examples:
        # pySBD
        sentences = [{"start": span.start, "end": span.end}
                     for span in seg.segment(example)]
        if validate_segmentation(sentences):
            print("\n --> segmentation is valid")
        else:
            print("\n --> segmentation is not valid")
        print("nb. sentences:", len(sentences))
        for span in sentences:
            print(span["start"], span["end"])
            # print(example[span["start"]:span["end"]])
        # NLTK for comparison
        sentences = [{"start": start, "end": end}
                     for start, end in PunktSentenceTokenizer().span_tokenize(example)]
        if validate_segmentation(sentences):
            print("\n --> segmentation is valid")
        else:
            print("\n --> segmentation is not valid")
        print("nb. sentences:", str(len(sentences)))
        for span in sentences:
            print(span["start"], span["end"])
Example #3
def test_issue(issue_no, text, expected_sents):
    """pySBD issues tests from https://github.com/nipunsadvilkar/pySBD/issues/"""
    seg = pysbd.Segmenter(language="en", clean=False)
    segments = seg.segment(text)
    assert segments == expected_sents
    # clubbing sentences and matching with original text
    assert text == " ".join(segments)
Example #4
def pysbd_sentencizer(doc: Doc) -> Doc:
    """Adds sentence boundaries to a Doc.
    Intended to be used as a pipe in a spaCy pipeline.
    Uses https://github.com/nipunsadvilkar/pySBD to get proper sentences and
    their respective char spans.

    Handles special cases:
    New lines cannot be end-of-sentence tokens.
    New lines that separate sentences will be added to the
    beginning of the next sentence.

    @param doc: the spaCy document to be annotated with sentence boundaries
    """
    segmenter = pysbd.Segmenter(language="en", clean=False, char_span=True)
    sents_char_spans: List[TextSpan] = segmenter.segment(doc.text)

    char_spans = [
        doc.char_span(sent_span.start, sent_span.end) for sent_span in sents_char_spans
    ]
    start_token_char_offsets = [span[0].idx for span in char_spans if span is not None]
    for token in doc:
        prev_token = token.nbor(-1) if token.i != 0 else None
        if token.idx in start_token_char_offsets:
            if prev_token and prev_token.text in ABBREVIATIONS:
                token.is_sent_start = False
            else:
                token.is_sent_start = True
        # check if the previous token contains at least two newline chars
        elif prev_token and prev_token.i != 0 and prev_token.text.count("\n") >= 2:
            token.is_sent_start = True
        else:
            token.is_sent_start = False
    return doc
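
A sketch of wiring this component into a pipeline, assuming the spaCy v2 API that the function-style component implies (spaCy v3 would require registering it with @Language.component first):

# Pipeline wiring sketch, assuming spaCy v2 (function-style components).
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(pysbd_sentencizer, before="parser")  # must run before the parser
doc = nlp("My name is Jonas E. Smith. Please turn to p. 55.")
print([sent.text for sent in doc.sents])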
Example #5
def test_exception_with_both_clean_and_span_true():
    """Test to not allow clean=True and char_span=True
    """
    with pytest.raises(ValueError) as e:
        seg = pysbd.Segmenter(language="en", clean=True, char_span=True)
    assert str(e.value) == "char_span must be False if clean is True. "\
                            "Since `clean=True` will modify original text."
Example #6
    def quebra_em_sentences(self):
        """
        Splits the cleaned content
        into a list of logical sentences.
        """
        seg = pysbd.Segmenter(language="en", clean=False)
        quebrado = seg.segment(self.conteudo_limpo)
        self.sentences = quebrado[:self.dados["input"]["tamanhoSlide"]]
Example #7
def test_issues_with_char_spans(issue_no, text, expected_sents_w_spans):
    """pySBD issues tests from https://github.com/nipunsadvilkar/pySBD/issues/"""
    seg = pysbd.Segmenter(language="en", clean=False, char_span=True)
    segments = seg.segment(text)
    expected_text_spans = [TextSpan(sent_w_span[0], sent_w_span[1], sent_w_span[2])
                           for sent_w_span in expected_sents_w_spans]
    assert segments == expected_text_spans
    # clubbing sentences and matching with original text
    assert text == "".join([seg.sent for seg in segments])
Example #8
def test_exception_with_both_clean_and_span_true():
    """Test to not allow clean=True and char_span=True
    """
    with pytest.raises(ValueError) as e:
        seg = pysbd.Segmenter(language="en", clean=True, char_span=True)
        text = "<h2 class=\"lined\">Hello</h2>\n<p>This is a test. Another test.</p>"
        seg.segment(text)
    assert str(e.value) == "char_span must be False if clean is True. "\
                            "Since `clean=True` will modify original text."
Example #9
def test_exception_with_doc_type_pdf_and_clean_false():
    """
    Test to force clean=True when doc_type="pdf"
    """
    with pytest.raises(ValueError) as e:
        seg = pysbd.Segmenter(language="en", clean=False, doc_type='pdf')
    assert str(e.value) == ("`doc_type='pdf'` should have `clean=True` & "
                            "`char_span` should be False since original"
                            "text will be modified.")
Example #10
    def _get_segmenter(lang: str):
        """get the sentence segmenter for the given language.

        Args:
            lang (str): target language code.

        Returns:
            [type]: [description]
        """
        return pysbd.Segmenter(language=lang, clean=True)
Example #11
    def get_titles_paras(self, sentence_threshold=5):
        no_of_para = len(self.list_para)
        seg = pysbd.Segmenter(language="en", clean=True)
        sent = seg.segment(' '.join(
            self.list_para))  # List of sentences as string
        no_of_sent = len(sent)

        print("\nNumber of paragraphs : ", no_of_para)
        # print("Number of sentences : ",no_of_sent)

        i = 0
        in_para = ''
        title = []
        while i < no_of_para:
            in_para = in_para + ' ' + self.list_para[i]
            res = seg.segment(in_para)  # List of sentences as string

            if len(res) >= sentence_threshold:
                # print(len(res))
                # heading = self.generate_title(in_para,PASSES = training, NUM_HEADING = heading_threshold).strip().upper()
                heading = self.GetHeadings(in_para).strip().upper()
                if heading != '':
                    title.append((heading, in_para))
                in_para = ''

            i += 1

        if in_para != '':
            # print(len(res))
            # heading = self.generate_title(in_para,PASSES = training, NUM_HEADING = heading_threshold).strip().upper()
            heading = self.GetHeadings(in_para).strip().upper()
            title.append((heading, in_para))

        with open(os.path.join('res', "paragraph_headings.txt"),
                  "w",
                  encoding="utf-8") as f:
            num = 1
            for i, j in title:
                f.write(str(num) + ".) " + i + " $ " + j.strip() + "\n")
                num += 1

        return title
Example #12
def pysbd_sentence_boundaries(doc):
    seg = pysbd.Segmenter(language="en", clean=False, char_span=True)
    sents_char_spans = seg.segment(doc.text)
    char_spans = [
        doc.char_span(sent_span.start, sent_span.end)
        for sent_span in sents_char_spans
    ]
    start_token_ids = [span[0].idx for span in char_spans if span is not None]
    for token in doc:
        token.is_sent_start = token.idx in start_token_ids
    return doc
Example #13
def test_exception_with_doc_type_pdf_and_both_clean_char_span_true():
    """
    Test to raise ValueError exception when doc_type="pdf" and
    both clean=True and char_span=True
    """
    with pytest.raises(ValueError) as e:
        seg = pysbd.Segmenter(language="en",
                              clean=True,
                              doc_type='pdf',
                              char_span=True)
    assert str(e.value) == "char_span must be False if clean is True. "\
                            "Since `clean=True` will modify original text."
Example #14
def breakContentIntoSentences(content):
    seg = pysbd.Segmenter(language="en", clean=False)
    sentences = seg.segment(content["sourceContentSanitized"])

    content["sentences"] = []

    for sentence in sentences:
        content["sentences"].append({
            "text": sentence,
            "keywords": [],
            "images": []
        })
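
Note that the function mutates content in place rather than returning a value; a quick usage sketch:

# Usage sketch (not from the original source): the dict is mutated in place.
content = {"sourceContentSanitized": "First sentence. Second sentence."}
breakContentIntoSentences(content)
print([s["text"] for s in content["sentences"]])
# -> ['First sentence.', 'Second sentence.']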
Example #15
    def break_into_sentences(self):
        text = self.sanitized_content
        seg = pysbd.Segmenter(language="en", clean=False)
        sentences = []
        for sentence in seg.segment(text)[0:self.limit_sentences]:
            content = {
                "text": sentence,
                "keywords": self.fetch_keywords_sentences(sentence),
                "images": []
            }
            sentences.append(content)

        return sentences
Example #16
def get_titles_paras(list_para, sentence_threshold=5):
    no_of_para = len(list_para)
    seg = pysbd.Segmenter(language="en", clean=True)
    sent = seg.segment(' '.join(list_para))  # List of sentences as string
    no_of_sent = len(sent)

    # print("\nNumber of paragraphs : ",no_of_para)
    # print("Number of sentences : ",no_of_sent)

    i = 0
    in_para = ''
    title = []
    while i < no_of_para:
        in_para = in_para + ' ' + list_para[i]
        res = seg.segment(in_para)  # List of sentences as string

        if len(res) >= sentence_threshold:
            # print(len(res))
            heading = GenerateTitle(in_para).strip().upper()
            if heading != '':
                title.append((heading, in_para))
            in_para = ''

        i += 1

    if in_para != '':
        # print(len(res))
        heading = GenerateTitle(in_para).strip().upper()
        title.append((heading, in_para))

    with open("paragraph_headings.txt", "w", encoding="utf-8") as f:
        for i, j in title:
            f.write(i + " $ " + j.strip() + "\n")

    return title
Example #17
    def paragraph(self,
                  similarity_threshold=0.35,
                  word_threshold=20,
                  percent_reduce=0.6):
        # Sentence Boundary Detection
        seg = pysbd.Segmenter(language="en", clean=True)
        sentences = seg.segment(self.text)  # List of sentences as string
        # print("Number of sentences : ",len(sentences))
        # Sentence Similarity
        res_similar = self.sentence_similarity(sentences,
                                               percent=percent_reduce)

        para = ''
        n = len(res_similar)
        second = sentences[0]
        for i in range(n - 1):
            first = sentences[i]
            second = sentences[i + 1]
            similar = res_similar[i]
            similar = round(similar, 2)
            # print("Sentence ",i,',',i+1," : ",similar)

            if similar < 0:
                continue

            if similar >= similarity_threshold:
                para += first.strip() + ' '

            else:
                para += first.strip() + '\n'

        para += second.strip()

        # Merge Small Sentences with the previous para
        p = para.split('\n')
        final = ''
        for i in range(1, len(p)):
            small = len(p[i - 1].split(' '))

            if small >= word_threshold:
                final += p[i - 1] + '\n'

            else:
                final += p[i - 1] + ' '

        final += p[len(p) - 1]
        return final.split('\n')  # List of paragraphs
Example #18
def docx_parser(filepath: str, sliding_window: int = 3, max_words: int = 100):
    """
    Parse a given docx file.
    Args:
        :param filepath: path to the docx file for parsing
        :param sliding_window: maximum number of sentences in each segment
        :param max_words: maximum number of words in each segment (split by ' ')

    Returns: a list of list of sentences [["sent1", "sent2", "sent3", ...], # Segment1
                                          ["sent1", "sent2", "sent3", ...], # Segment2
                                          ...]

    requirements (03/22):
        - Header as a single segment
        - Three sentences per segment
        - Split paragraphs
        - Maximum 100 words per segment (delete sentences if it is longer)

    """
    document = Document(filepath)
    segmenter = pysbd.Segmenter(language="en", clean=True)
    segs = []

    for paragraph in document.paragraphs:

        sentences = segmenter.segment(paragraph.text)
        sentences = sents_preprocessor(sentences)

        if sentences:
            # header & one sentence paragraph
            if len(sentences) <= sliding_window:
                segs.append(sentences)
            else:
                # long paragraph
                end_idx = len(sentences) - 1
                for i in range(len(sentences)):
                    if i + sliding_window <= end_idx:
                        seg = segment_clipper(sentences[i: i+sliding_window], max_words)
                        segs.append(seg)

                        if (i + sliding_window == end_idx) and (len(seg) == sliding_window):
                            break
                    else:
                        seg = segment_clipper(sentences[i:], max_words)
                        segs.append(seg)

    return segs
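
Neither segment_clipper nor sents_preprocessor is shown here. A plausible sketch of the clipper, assuming it enforces the docstring's "maximum 100 words per segment, delete sentences if it is longer" requirement:

def segment_clipper(sentences, max_words):
    # Hypothetical helper (not in the original snippet): drop trailing
    # sentences until the segment fits within max_words words in total.
    clipped = list(sentences)
    while clipped and sum(len(s.split(' ')) for s in clipped) > max_words:
        clipped.pop()
    return clipped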
Example #19
def convert_to_sentence_segments(json):
    new_json = OrderedDict()
    if json is None:
        print("WARNING: json is empty")
        return new_json
    # the abstract is empty for softcite corpus
    if "id" in json:
        new_json["id"] = json["id"]
    new_json["level"] = "sentence"
    new_json["abstract"] = []
    new_json["body_text"] = []
    seg = pysbd.Segmenter(language="en", clean=False, char_span=True)
    # ['My name is Jonas E. Smith.', 'Please turn to p. 55.']
    if "abstract" in json:
        process_text_list(seg, json["abstract"], new_json, "abstract")
    if "body_text" in json:
        process_text_list(seg, json["body_text"], new_json, "body_text")
    return new_json
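
process_text_list is defined elsewhere; a minimal sketch consistent with how it is called here, assuming each list entry is a paragraph dict with a "text" field (CORD-19-style JSON):

def process_text_list(seg, text_list, new_json, field):
    # Hypothetical helper (not in the original snippet): re-emit each
    # paragraph as one entry per sentence, using pySBD char spans.
    for paragraph in text_list:
        for span in seg.segment(paragraph["text"]):
            new_entry = dict(paragraph)
            new_entry["text"] = span.sent
            new_json[field].append(new_entry)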
Example #20
def hl1(raw_text, ratio, highlighter = ['<b>','</b>']):
    #segment text to sentences
    seg = pysbd.Segmenter(language="fr", clean=True)
    sentences = seg.segment(raw_text)
    nb_sentences = len(sentences)
    ref_text = ''.join(sentences)
    mess = "Finding " + str(nb_sentences) + " sentences"
    st.code(mess)

    #compute sentence embedding
    embedder = SentenceTransformer('distiluse-base-multilingual-cased-v1', device="cpu")
    embeddings = embedder.encode(sentences, convert_to_tensor=True)

    #Compute the pair-wise cosine similarities
    cos_scores = util.pytorch_cos_sim(embeddings, embeddings).numpy()

    #Compute the centrality for each sentence
    centrality_scores = degree_centrality_scores(cos_scores, threshold=None)

    #We argsort so that the first element is the sentence with the highest score
    most_central_sentence_indices = np.argsort(-centrality_scores)

    lc=len(most_central_sentence_indices)
    rc = round(lc*ratio)
    hl = []
    mess = "indices:"
    for i in most_central_sentence_indices[:rc]:
        mess =  mess + str(most_central_sentence_indices[i]) + ","
        hl.append(most_central_sentence_indices[i])
    st.code(mess)

    highlighted = []
    for i in range(nb_sentences):
        if i in hl:
            #st.code(str(i))
            highlighted.append(highlighter[0]+sentences[i]+highlighter[1]+' ')
        else:
            highlighted.append(sentences[i]+' ')
    html = '\n'.join(highlighted)

    ret = {}
    ret["raw_text"] = raw_text
    ret["highlighted_html"] = html
    return ret
Example #21
def combined_rule_sentence_segmenter(doc: Doc) -> Doc:
    """Adds sentence boundaries to a Doc. Intended to be used as a pipe in a spaCy pipeline.
       New lines cannot be end of sentence tokens. New lines that separate sentences will be
       added to the beginning of the next sentence.

    @param doc: the spaCy document to be annotated with sentence boundaries
    """
    segmenter = pysbd.Segmenter(language='en', clean=False)
    segments = segmenter.segment(doc.text)
    segments = merge_segments(segments)

    # pysbd splits raw text into sentences, so we have to do our best to align those
    # segments with spacy tokens
    segment_index = 0
    current_segment = segments[segment_index]
    built_up_sentence = ""
    for i, token in enumerate(doc):
        if i == 0 and (token.is_space or token.text == '.'):
            token.is_sent_start = True
            continue
        if token.text.replace('\n', '').replace('\r', '') == '':
            token.is_sent_start = False
        elif len(built_up_sentence) >= len(current_segment):
            token.is_sent_start = True

            # handle the rare (impossible?) case where spacy tokenizes over a
            # sentence boundary that pysbd finds: carry the overshoot as padding
            built_up_sentence = ' ' * (
                len(built_up_sentence) - len(current_segment)) + token.text_with_ws
            segment_index += 1
            current_segment = segments[segment_index]
        else:
            built_up_sentence += token.text_with_ws
            token.is_sent_start = False

    return doc
Example #22
    - only files with the extension .software-mention.xml are considered

    A report is generated with the total number of tokens, total number of annotated tokens,
    and total annotated tokens per annotation field
'''

import os
import argparse
import ntpath
import xml
import regex as re
from xml.sax import make_parser, handler
from collections import OrderedDict
import pysbd

segmenter = pysbd.Segmenter(language='en')

# software recognizer analyzer regex
DELIMITERS = " \n\r\t(([ ^%‰°•*,:;?.!/))-–−‐«»„=≈<>+~\"“”‘’'`$®]*\u2666\u2665\u2663\u2660\u00A0"
regex_delimiters = '([' + '|'.join(map(re.escape, DELIMITERS)) + '])'
REGEX = r"(?<=[a-zA-Z])(?=\d)|(?<=\d)(?=\D)"


class TEIContentHandler(xml.sax.ContentHandler):
    # working variables
    accumulated = ''
    current_annotation_type = None
    has_annotation = False
    open_paragraph = False
    paragraph_content = ''
Example #23
def test_es_pdf_type(text, expected_sents):
    """Spanish SBD tests from Pragmatic Segmenter for doctype:pdf"""
    seg = pysbd.Segmenter(language="es", clean=True, doc_type='pdf')
    segments = seg.segment(text)
    assert segments == expected_sents
Example #24
def test_da_pdf_type(text, expected_sents):
    """SBD tests from Pragmatic Segmenter for doctype:pdf"""
    seg = pysbd.Segmenter(language="da", clean=True, doc_type='pdf')
    segments = seg.segment(text)
    segments = [s.strip() for s in segments]
    assert segments == expected_sents
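
For reference, doc_type='pdf' merges lines that PDF extraction broke mid-sentence, which is why it requires clean=True; this sketch mirrors the example in the pySBD README:

# Usage sketch mirroring the pySBD README's doc_type="pdf" example.
seg = pysbd.Segmenter(language="en", clean=True, doc_type="pdf")
print(seg.segment("This is a sentence\ncut off in the middle because pdf."))
# -> ['This is a sentence cut off in the middle because pdf.']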
Example #25
    def get_segmenter(lang):
        return pysbd.Segmenter(language=lang, clean=True)
Example #26
def pysbd_sentencizer(sentence: str, language="en"):
    seg = pysbd.Segmenter(language=language, clean=False)
    return seg.segment(sentence)
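
A quick usage check, with the example text and expected output taken from the pySBD README:

# Quick usage check (text and expected output from the pySBD README).
print(pysbd_sentencizer("My name is Jonas E. Smith. Please turn to p. 55."))
# -> ['My name is Jonas E. Smith.', 'Please turn to p. 55.']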
Example #27
text = "1 Introduction The publication rate in the medical and biomedical sciences is growing at an exponential rate (Bornmann and Mutz, 2014). The information overload problem is widespread across academia, but is particularly apparent in the biomedical sciences, where individual papers may contain specific discoveries relating to a dizzying variety of genes, drugs, and proteins. In order to cope with the sheer volume of new scientific knowledge, there have been many attempts to automate the process of extracting entities, relations, protein interactions and other structured knowledge from scientific papers (Wei et al., 2016; Ammar et al., 2018; Poon et al., 2014). Although there exists a wealth of tools for processing biomedical text, many focus primarily on entity linking, negation detection and abbreviation detection. MetaMap and MetaMapLite (Aronson, 2001; Demner-Fushman et al., 2017), the two most widely used and supported tools for biomedical text processing, consider additional features, such as negation detection and acronym resolution. However, tools which cover more classical natural language processing (NLP) tasks such as the GENIA tagger (Tsuruoka et al., 2005; Tsuruoka and Tsujii, 2005) and phrase structure parsers such as those presented in (McClosky and Charniak, 2008) typically do not make use of new research innovations such as word representations or neural networks. In this paper, we introduce scispaCy, a specialized NLP library for processing biomedical texts which builds on the robust spaCy library,1 and document its performance relative to state of the art models for part of speech (POS) tagging, dependency parsing, named entity recognition (NER) and sentence segmentation. Specifically, we: • Release a reformatted version of the GENIA 1.0 (Kim et al., 2003) corpus converted into Universal Dependencies v1.0 and aligned 1spacy.io ar X iv :1 90 2. 07 66 9v 2 [ cs .C L ] 2 1 Fe b 20 19 with the original text from the PubMed abstracts. • Benchmark 9 named entity recognition models for more specific entity extraction applications demonstrating competitive performance when compared to strong baselines. • Release and evaluate two fast and convenient pipelines for biomedical text, which include tokenization, part of speech tagging, dependency parsing and named entity recognition. 2 Overview of (sci)spaCy In this section, we briefly describe the models used in the spaCy library and describe how we build on them in scispaCy. spaCy. The spaCy library (Honnibal and Montani, 2017)2 provides a variety of practical tools for text processing in multiple languages. Their models have emerged as the defacto standard for practical NLP due to their speed, robustness and close to state of the art performance. As the spaCy models are popular and the spaCy API is widely known to many potential users, we choose to build upon the spaCy library for creating a biomedical text processing pipeline. scispaCy. Our goal is to develop scispaCy as a robust, efficient and performant NLP library to satisfy the primary text processing needs in the biomedical domain. In this release of scispaCy, we retrain spaCy3 models for POS tagging, dependency parsing, and NER using datasets relevant to biomedical text, and enhance the tokenization module with additional rules. scispaCy contains two core released packages: en core sci sm and en core sci md. 
Models in the en core sci md package have a larger vocabulary and include word vectors, while those in en core sci sm have a smaller vocabulary and do not include word vectors, as shown in Table 1. Processing Speed. To emphasize the efficiency and practical utility of the end-to-end pipeline provided by scispaCy packages, we perform a speed comparison with several other publicly available processing pipelines for biomedical text using 10k randomly selected PubMed abstracts. We report 2Source code at https://github.com/ explosion/spaCy 3scispaCy models are based on spaCy version 2.0.18 results with and without segmenting the abstracts into sentences since some of the libraries (e.g., GENIA tagger) are designed to operate on sentences. As shown in Table 2, both models released in scispaCy demonstrate competitive speed to pipelines written in C++ and Java, languages designed for production settings. Whilst scispaCy is not as fast as pipelines designed for purely production use-cases (e.g., NLP4J), it has the benefit of straightforward integration with the large ecosystem of Python libraries for machine learning and text processing. Although the comparison in Table 2 is not an apples to apples comparison with other frameworks (different tasks, implementation languages etc), it is useful to understand scispaCy’s runtime in the context of other pipeline components. Running scispaCy models in addition to standard Entity Linking software such as MetaMap would result in only a marginal increase in overall runtime. In the following section, we describe the POS taggers and dependency parsers in scispaCy. 3 POS Tagging and Dependency Parsing The joint POS tagging and dependency parsing model in spaCy is an arc-eager transition-based parser trained with a dynamic oracle, similar to (Goldberg and Nivre, 2012). Features are CNN representations of token features and shared across all pipeline models (Kiperwasser and Goldberg, 2016; Zhang and Weiss, 2016). Next, we describe the data we used to train it in scispaCy. 3.1 Datasets GENIA 1.0 Dependencies. To train the dependency parser and part of speech tagger in both released models, we convert the treebank of (McClosky and Charniak, 2008),4 which is based on the GENIA 1.0 corpus (Kim et al., 2003), to Universal Dependencies v1.0 using the Stanford Dependency Converter (Schuster and Manning, 2016). As this dataset has POS tags annotated, we use it to train the POS tagger jointly with the dependency parser in both released models. As we believe the Universal Dependencies converted from the original GENIA 1.0 corpus are generally useful, we have released them as a separate contribution of this paper.5 In this data release, we also align the converted dependency parses to their original text spans in the raw, untokenized abstracts from the original release,6 and include the PubMed metadata for the abstracts which was discarded in the GENIA corpus released by McClosky and Charniak (2008). We hope that this raw format can emerge as a resource for practical evaluation in the biomedical domain of core NLP tasks such as tokenization, sentence segmentation and joint models of syntax. Finally, we also retrieve from PubMed the original metadata associated with each abstract. This includes relevant named entities linked to their Medical Subject Headings (MeSH terms) as well as chemicals and drugs linked to a variety of ontologies, as well as author metadata, publication dates, citation statistics and journal metadata. 
We hope that the community can find interesting problems for which such natural supervision can be used. 4https://nlp.stanford.edu/˜mcclosky/ biomedical.html 5Available at https://github.com/allenai/ genia-dependency-trees 6Available at http://www.geniaproject.org/ OntoNotes 5.0. To increase the robustness of the dependency parser and POS tagger to generic text, we make use of the OntoNotes 5.0 corpus7 when training the dependency parser and part of speech tagger (Weischedel et al., 2011; Hovy et al., 2006). The OntoNotes corpus consists of multiple genres of text, annotated with syntactic and semantic information, but we only use POS and dependency parsing annotations in this work. 3.2 Experiments We compare our models to the recent survey study of dependency parsing and POS tagging for biomedical data (Nguyen and Verspoor, 2018) in Tables 3 and 4. POS tagging results show that both models released in scispaCy are competitive with state of the art systems, and can be considered of equivalent practical value. In the case of dependency parsing, we find that the Biaffine parser of (Dozat and Manning, 2016) outperforms the scispaCy models by a margin of 2-3%. However, as demonstrated in Table 2, the scispaCy models are 7Instructions for download at http://cemantix. org/data/ontonotes.html approximately 9x faster due to the speed optimizations in spaCy. Robustness to Web Data. A core principle of the scispaCy models is that they are useful on a wide variety of types of text with a biomedical focus, such as clinical notes, academic papers, clinical trials reports and medical records. In order to make our models robust across a wider range of domains more generally, we experiment with incorporating training data from the OntoNotes 5.0 corpus when training the dependency parser and POS tagger. Figure 2 demonstrates the effectiveness of adding increasing percentages of web data, showing substantially improved performance on OntoNotes, at no reduction in performance on biomedical text. Note that mixing in web text during training has been applied to previous systems - the GENIA Tagger (Tsuruoka et al., 2005) also employs this technique. 4 Named Entity Recognition The NER model in spaCy is a transition-based system based on the chunking model from (Lample et al., 2016). Tokens are represented as hashed, embedded representations of the prefix, suffix, shape and lemmatized features of individual words. Next, we describe the data we used to train NER models in scispaCy. 4.1 Datasets The main NER model in both released packages in scispaCy is trained on the mention spans in the MedMentions dataset (Murty et al., 2018). Since the MedMentions dataset was originally designed for entity linking, this model recognizes a wide variety of entity types, as well as non-standard syntactic phrases such as verbs and modifiers, but the model does not predict the entity type. In order to provide for users with more specific requirements around entity types, we release four additional packages en ner {bc5cdr|craft |jnlpba|bionlp13cg} md with finer-grained NER models trained on BC5CDR (for chemicals and diseases; Li et al., 2016), CRAFT (for cell types, chemicals, proteins, genes; Bada et al., 2011), JNLPBA (for cell lines, cell types, DNAs, RNAs, proteins; Collier and Kim, 2004) and BioNLP13CG (for cancer genetics; Pyysalo et al., 2015), respectively. 
4.2 Experiments As NER is a key task for other biomedical text processing tasks, we conduct a through evaluation of the suitability of scispaCy to provide baseline performance across a wide variety of datasets. In particular, we retrain the spaCy NER model on each of the four datasets mentioned earlier (BC5CDR, CRAFT, JNLPBA, BioNLP13CG) as well as five more datasets in Crichton et al. (2017): AnatEM, BC2GM, BC4CHEMD, Linnaeus, NCBI-Disease. These datasets cover a wide variety of entity types required by different biomedical domains, including cancer genetics, disease-drug interactions, pathway analysis and trial population extraction. Additionally, they vary considerably in size and number of entities. For example, BC4CHEMD (Krallinger et al., 2015) has 84,310 annotations while Linnaeus (Gerner et al., 2009) only has 4,263. BioNLP13CG (Pyysalo et al., 2015) annotates 16 entity types while five of the datasets only annotate a single entity type.8 Table 5 provides a through comparison of the scispaCy NER models compared to a variety of models. In particular, we compare the models to strong baselines which do not consider the use of 1) multi-task learning across multiple datasets and 2) semi-supervised learning via large pretrained language models. Overall, we find that the scispaCy models are competitive baselines for 5 of the 9 datasets. Additionally, in Table 6 we evaluate the recall of the pipeline mention detector available in both 8For a detailed discussion of the datasets and their creation, we refer the reader to https://github.com/ cambridgeltl/MTL-Bioinformatics-2016/ blob/master/Additional%20file%201.pdf scispaCy models (trained on the MedMentions dataset) against all 9 specialised NER datasets. Overall, we observe a modest drop in average recall when compared directly to the MedMentions results in Table 7, but considering the diverse domains of the 9 specialised NER datasets, achieving this level of recall across datasets is already nontrivial. 5 Sentence Segmentation and Citation Handling Accurate sentence segmentation is required for many practical applications of natural language processing. Biomedical data presents many difficulties for standard sentence segmentation algorithms: abbreviated names and noun compounds containing punctuation are more common, whilst the wide range of citation styles can easily be misidentified as sentence boundaries. We evaluate sentence segmentation using both sentence and full-abstract accuracy when segmenting PubMed abstracts from the raw, untokenized GENIA development set (the Sent/Abstract columns in Table 8). Additionally, we examine the ability of the segmentation learned by our model to generalise to the body text of PubMed articles. Body text is typically more complex than abstract text, but in particular, it contains citations, which are considerably less frequent in abstract text. In order to examine the effectiveness of our models in this scenario, we design the following synthetic experiment. Given sentences from (Anonymous, 2019)9 which were originally designed for citation intent prediction, we run these sentences individually through our models. As we know that these sentences should be single sentences, we can simply count the frequency with which our models segment the individual sentences containing citations into multiple sentences (the Citation column in Table 8). 
As demonstrated by Table 8, training the dependency parser on in-domain data (both the scispaCy models) completely obviates the need for rule-based sentence segmentation. This is a positive result - rule based sentence segmentation is a brittle, time consuming process, which we have replaced with a domain specific version of an existing pipeline component. Both scispaCy models are released with the custom tokeniser, but without a custom sentence segmenter by default. 6 Related Work Apache cTakes (Savova et al., 2010) was designed specifically for clinical notes rather than the broader biomedical domain. MetaMap and MetaMapLite (Aronson, 2001; Demner-Fushman et al., 2017) from the National Library of 9Paper currently under review. Medicine focus specifically on entity linking using the Unified Medical Language System (UMLS) (Bodenreider, 2004) as a knowledge base. (Buyko et al.) adapt Apache OpenNLP using the GENIA corpus, but their system is not openly available and is less suitable for modern, Python-based workflows. The GENIA Tagger (Tsuruoka et al., 2005) provides the closest comparison to scispaCy due to it’s multi-stage pipeline, integrated research contributions and production quality runtime. We improve on the GENIA Tagger by adding a full dependency parser rather than just noun chunking, as well as improved results for NER without compromising significantly on speed. In more fundamental NLP research, the GENIA corpus (Kim et al., 2003) has been widely used to evaluate transfer learning and domain adaptation. (McClosky et al., 2006) demonstrate the effectiveness of self-training and parse re-ranking for domain adaptation. (Rimell and Clark, 2008) adapt a CCG parser using only POS and lexical categories, while (Joshi et al., 2018) extend a neural phrase structure parser trained on web text to the biomedical domain with a small number of partially annotated examples. These papers focus mainly of the problem of domain adaptation itself, rather than the objective of obtaining a robust, high-performance parser using existing resources. NLP techniques, and in particular, distant supervision have been employed to assist the curation of large, structured biomedical resources. (Poon et al., 2015) extract 1.5 million cancer path- way interactions from PubMed abstracts, leading to the development of Literome (Poon et al., 2014), a search engine for genic pathway interactions and genotype-phenotype interactions. A fundamental aspect of (Valenzuela-Escarcega et al., 2018; Poon et al., 2014) is the use of hand-written rules and triggers for events based on dependency tree paths; the connection to the application of scispaCy is quite apparent. 7 Conclusion In this paper we presented several robust model pipelines for a variety of natural language processing tasks focused on biomedical text. The scispaCy models are fast, easy to use, scalable, and achieve close to state of the art performance. We hope that the release of these models enables new applications in biomedical information extraction whilst making it easy to leverage high quality syntactic annotation for downstream tasks. Additionally, we released a reformatted GENIA 1.0 corpus augmented with automatically produced Universal Dependency annotations and recovered and aligned original abstract metadata. 1 Introduction The publication rate in the medical and biomedical sciences is growing at an exponential rate (Bornmann and Mutz, 2014). 
The information overload problem is widespread across academia, but is particularly apparent in the biomedical sciences, where individual papers may contain specific discoveries relating to a dizzying variety of genes, drugs, and proteins. In order to cope with the sheer volume of new scientific knowledge, there have been many attempts to automate the process of extracting entities, relations, protein interactions and other structured knowledge from scientific papers (Wei et al., 2016; Ammar et al., 2018; Poon et al., 2014). Although there exists a wealth of tools for processing biomedical text, many focus primarily on entity linking, negation detection and abbreviation detection. MetaMap and MetaMapLite (Aronson, 2001; Demner-Fushman et al., 2017), the two most widely used and supported tools for biomedical text processing, consider additional features, such as negation detection and acronym resolution. However, tools which cover more classical natural language processing (NLP) tasks such as the GENIA tagger (Tsuruoka et al., 2005; Tsuruoka and Tsujii, 2005) and phrase structure parsers such as those presented in (McClosky and Charniak, 2008) typically do not make use of new research innovations such as word representations or neural networks. In this paper, we introduce scispaCy, a specialized NLP library for processing biomedical texts which builds on the robust spaCy library,1 and document its performance relative to state of the art models for part of speech (POS) tagging, dependency parsing, named entity recognition (NER) and sentence segmentation. Specifically, we: • Release a reformatted version of the GENIA 1.0 (Kim et al., 2003) corpus converted into Universal Dependencies v1.0 and aligned 1spacy.io ar X iv :1 90 2. 07 66 9v 2 [ cs .C L ] 2 1 Fe b 20 19 with the original text from the PubMed abstracts. • Benchmark 9 named entity recognition models for more specific entity extraction applications demonstrating competitive performance when compared to strong baselines. • Release and evaluate two fast and convenient pipelines for biomedical text, which include tokenization, part of speech tagging, dependency parsing and named entity recognition. 2 Overview of (sci)spaCy In this section, we briefly describe the models used in the spaCy library and describe how we build on them in scispaCy. spaCy. The spaCy library (Honnibal and Montani, 2017)2 provides a variety of practical tools for text processing in multiple languages. Their models have emerged as the defacto standard for practical NLP due to their speed, robustness and close to state of the art performance. As the spaCy models are popular and the spaCy API is widely known to many potential users, we choose to build upon the spaCy library for creating a biomedical text processing pipeline. scispaCy. Our goal is to develop scispaCy as a robust, efficient and performant NLP library to satisfy the primary text processing needs in the biomedical domain. In this release of scispaCy, we retrain spaCy3 models for POS tagging, dependency parsing, and NER using datasets relevant to biomedical text, and enhance the tokenization module with additional rules. scispaCy contains two core released packages: en core sci sm and en core sci md. Models in the en core sci md package have a larger vocabulary and include word vectors, while those in en core sci sm have a smaller vocabulary and do not include word vectors, as shown in Table 1. Processing Speed. 
To emphasize the efficiency and practical utility of the end-to-end pipeline provided by scispaCy packages, we perform a speed comparison with several other publicly available processing pipelines for biomedical text using 10k randomly selected PubMed abstracts. We report 2Source code at https://github.com/ explosion/spaCy 3scispaCy models are based on spaCy version 2.0.18 results with and without segmenting the abstracts into sentences since some of the libraries (e.g., GENIA tagger) are designed to operate on sentences. As shown in Table 2, both models released in scispaCy demonstrate competitive speed to pipelines written in C++ and Java, languages designed for production settings. Whilst scispaCy is not as fast as pipelines designed for purely production use-cases (e.g., NLP4J), it has the benefit of straightforward integration with the large ecosystem of Python libraries for machine learning and text processing. Although the comparison in Table 2 is not an apples to apples comparison with other frameworks (different tasks, implementation languages etc), it is useful to understand scispaCy’s runtime in the context of other pipeline components. Running scispaCy models in addition to standard Entity Linking software such as MetaMap would result in only a marginal increase in overall runtime. In the following section, we describe the POS taggers and dependency parsers in scispaCy. 3 POS Tagging and Dependency Parsing The joint POS tagging and dependency parsing model in spaCy is an arc-eager transition-based parser trained with a dynamic oracle, similar to (Goldberg and Nivre, 2012). Features are CNN representations of token features and shared across all pipeline models (Kiperwasser and Goldberg, 2016; Zhang and Weiss, 2016). Next, we describe the data we used to train it in scispaCy. 3.1 Datasets GENIA 1.0 Dependencies. To train the dependency parser and part of speech tagger in both released models, we convert the treebank of (McClosky and Charniak, 2008),4 which is based on the GENIA 1.0 corpus (Kim et al., 2003), to Universal Dependencies v1.0 using the Stanford Dependency Converter (Schuster and Manning, 2016). As this dataset has POS tags annotated, we use it to train the POS tagger jointly with the dependency parser in both released models. As we believe the Universal Dependencies converted from the original GENIA 1.0 corpus are generally useful, we have released them as a separate contribution of this paper.5 In this data release, we also align the converted dependency parses to their original text spans in the raw, untokenized abstracts from the original release,6 and include the PubMed metadata for the abstracts which was discarded in the GENIA corpus released by McClosky and Charniak (2008). We hope that this raw format can emerge as a resource for practical evaluation in the biomedical domain of core NLP tasks such as tokenization, sentence segmentation and joint models of syntax. Finally, we also retrieve from PubMed the original metadata associated with each abstract. This includes relevant named entities linked to their Medical Subject Headings (MeSH terms) as well as chemicals and drugs linked to a variety of ontologies, as well as author metadata, publication dates, citation statistics and journal metadata. We hope that the community can find interesting problems for which such natural supervision can be used. 
4https://nlp.stanford.edu/˜mcclosky/ biomedical.html 5Available at https://github.com/allenai/ genia-dependency-trees 6Available at http://www.geniaproject.org/ OntoNotes 5.0. To increase the robustness of the dependency parser and POS tagger to generic text, we make use of the OntoNotes 5.0 corpus7 when training the dependency parser and part of speech tagger (Weischedel et al., 2011; Hovy et al., 2006). The OntoNotes corpus consists of multiple genres of text, annotated with syntactic and semantic information, but we only use POS and dependency parsing annotations in this work. 3.2 Experiments We compare our models to the recent survey study of dependency parsing and POS tagging for biomedical data (Nguyen and Verspoor, 2018) in Tables 3 and 4. POS tagging results show that both models released in scispaCy are competitive with state of the art systems, and can be considered of equivalent practical value. In the case of dependency parsing, we find that the Biaffine parser of (Dozat and Manning, 2016) outperforms the scispaCy models by a margin of 2-3%. However, as demonstrated in Table 2, the scispaCy models are 7Instructions for download at http://cemantix. org/data/ontonotes.html approximately 9x faster due to the speed optimizations in spaCy. Robustness to Web Data. A core principle of the scispaCy models is that they are useful on a wide variety of types of text with a biomedical focus, such as clinical notes, academic papers, clinical trials reports and medical records. In order to make our models robust across a wider range of domains more generally, we experiment with incorporating training data from the OntoNotes 5.0 corpus when training the dependency parser and POS tagger. Figure 2 demonstrates the effectiveness of adding increasing percentages of web data, showing substantially improved performance on OntoNotes, at no reduction in performance on biomedical text. Note that mixing in web text during training has been applied to previous systems - the GENIA Tagger (Tsuruoka et al., 2005) also employs this technique. 4 Named Entity Recognition The NER model in spaCy is a transition-based system based on the chunking model from (Lample et al., 2016). Tokens are represented as hashed, embedded representations of the prefix, suffix, shape and lemmatized features of individual words. Next, we describe the data we used to train NER models in scispaCy. 4.1 Datasets The main NER model in both released packages in scispaCy is trained on the mention spans in the MedMentions dataset (Murty et al., 2018). Since the MedMentions dataset was originally designed for entity linking, this model recognizes a wide variety of entity types, as well as non-standard syntactic phrases such as verbs and modifiers, but the model does not predict the entity type. In order to provide for users with more specific requirements around entity types, we release four additional packages en ner {bc5cdr|craft |jnlpba|bionlp13cg} md with finer-grained NER models trained on BC5CDR (for chemicals and diseases; Li et al., 2016), CRAFT (for cell types, chemicals, proteins, genes; Bada et al., 2011), JNLPBA (for cell lines, cell types, DNAs, RNAs, proteins; Collier and Kim, 2004) and BioNLP13CG (for cancer genetics; Pyysalo et al., 2015), respectively. 4.2 Experiments As NER is a key task for other biomedical text processing tasks, we conduct a through evaluation of the suitability of scispaCy to provide baseline performance across a wide variety of datasets. 
In particular, we retrain the spaCy NER model on each of the four datasets mentioned earlier (BC5CDR, CRAFT, JNLPBA, BioNLP13CG) as well as five more datasets in Crichton et al. (2017): AnatEM, BC2GM, BC4CHEMD, Linnaeus, NCBI-Disease. These datasets cover a wide variety of entity types required by different biomedical domains, including cancer genetics, disease-drug interactions, pathway analysis and trial population extraction. Additionally, they vary considerably in size and number of entities. For example, BC4CHEMD (Krallinger et al., 2015) has 84,310 annotations while Linnaeus (Gerner et al., 2009) only has 4,263. BioNLP13CG (Pyysalo et al., 2015) annotates 16 entity types while five of the datasets only annotate a single entity type.8 Table 5 provides a through comparison of the scispaCy NER models compared to a variety of models. In particular, we compare the models to strong baselines which do not consider the use of 1) multi-task learning across multiple datasets and 2) semi-supervised learning via large pretrained language models. Overall, we find that the scispaCy models are competitive baselines for 5 of the 9 datasets. Additionally, in Table 6 we evaluate the recall of the pipeline mention detector available in both 8For a detailed discussion of the datasets and their creation, we refer the reader to https://github.com/ cambridgeltl/MTL-Bioinformatics-2016/ blob/master/Additional%20file%201.pdf scispaCy models (trained on the MedMentions dataset) against all 9 specialised NER datasets. Overall, we observe a modest drop in average recall when compared directly to the MedMentions results in Table 7, but considering the diverse domains of the 9 specialised NER datasets, achieving this level of recall across datasets is already nontrivial. 5 Sentence Segmentation and Citation Handling Accurate sentence segmentation is required for many practical applications of natural language processing. Biomedical data presents many difficulties for standard sentence segmentation algorithms: abbreviated names and noun compounds containing punctuation are more common, whilst the wide range of citation styles can easily be misidentified as sentence boundaries. We evaluate sentence segmentation using both sentence and full-abstract accuracy when segmenting PubMed abstracts from the raw, untokenized GENIA development set (the Sent/Abstract columns in Table 8). Additionally, we examine the ability of the segmentation learned by our model to generalise to the body text of PubMed articles. Body text is typically more complex than abstract text, but in particular, it contains citations, which are considerably less frequent in abstract text. In order to examine the effectiveness of our models in this scenario, we design the following synthetic experiment. Given sentences from (Anonymous, 2019)9 which were originally designed for citation intent prediction, we run these sentences individually through our models. As we know that these sentences should be single sentences, we can simply count the frequency with which our models segment the individual sentences containing citations into multiple sentences (the Citation column in Table 8). As demonstrated by Table 8, training the dependency parser on in-domain data (both the scispaCy models) completely obviates the need for rule-based sentence segmentation. This is a positive result - rule based sentence segmentation is a brittle, time consuming process, which we have replaced with a domain specific version of an existing pipeline component. 
Both scispaCy models are released with the custom tokeniser, but without a custom sentence segmenter by default. 6 Related Work Apache cTakes (Savova et al., 2010) was designed specifically for clinical notes rather than the broader biomedical domain. MetaMap and MetaMapLite (Aronson, 2001; Demner-Fushman et al., 2017) from the National Library of 9Paper currently under review. Medicine focus specifically on entity linking using the Unified Medical Language System (UMLS) (Bodenreider, 2004) as a knowledge base. (Buyko et al.) adapt Apache OpenNLP using the GENIA corpus, but their system is not openly available and is less suitable for modern, Python-based workflows. The GENIA Tagger (Tsuruoka et al., 2005) provides the closest comparison to scispaCy due to it’s multi-stage pipeline, integrated research contributions and production quality runtime. We improve on the GENIA Tagger by adding a full dependency parser rather than just noun chunking, as well as improved results for NER without compromising significantly on speed. In more fundamental NLP research, the GENIA corpus (Kim et al., 2003) has been widely used to evaluate transfer learning and domain adaptation. (McClosky et al., 2006) demonstrate the effectiveness of self-training and parse re-ranking for domain adaptation. (Rimell and Clark, 2008) adapt a CCG parser using only POS and lexical categories, while (Joshi et al., 2018) extend a neural phrase structure parser trained on web text to the biomedical domain with a small number of partially annotated examples. These papers focus mainly of the problem of domain adaptation itself, rather than the objective of obtaining a robust, high-performance parser using existing resources. NLP techniques, and in particular, distant supervision have been employed to assist the curation of large, structured biomedical resources. (Poon et al., 2015) extract 1.5 million cancer path- way interactions from PubMed abstracts, leading to the development of Literome (Poon et al., 2014), a search engine for genic pathway interactions and genotype-phenotype interactions. A fundamental aspect of (Valenzuela-Escarcega et al., 2018; Poon et al., 2014) is the use of hand-written rules and triggers for events based on dependency tree paths; the connection to the application of scispaCy is quite apparent. 7 Conclusion In this paper we presented several robust model pipelines for a variety of natural language processing tasks focused on biomedical text. The scispaCy models are fast, easy to use, scalable, and achieve close to state of the art performance. We hope that the release of these models enables new applications in biomedical information extraction whilst making it easy to leverage high quality syntactic annotation for downstream tasks. Additionally, we released a reformatted GENIA 1.0 corpus augmented with automatically produced Universal Dependency annotations and recovered and aligned original abstract metadata."

import pysbd
import time
import cProfile
from tqdm import tqdm

segmenter = pysbd.Segmenter(language='en', clean=False)

n_trials = 10
times = []
for i in tqdm(range(n_trials)):
    start = time.time()
    # segments = cProfile.run('segmenter.segment(text)')
    segments = segmenter.segment(text)
    end = time.time()
    times.append(end - start)

print("Total seconds {}".format(sum(times)))
print("Num trials {}".format(n_trials))
print("Average second {}".format(sum(times) / n_trials))
Example #28
def test_sbd_clean_char_span():
    """Test to not allow clean=True and char_span=True
    """
    text = "<h2 class=\"lined\">Hello</h2>\n<p>This is a test. Another test.</p>"
    with pytest.raises(ValueError):
        seg = pysbd.Segmenter(language="en", clean=True, char_span=True)
        seg.segment(text)
Example #29
    def parse(self, tex_path: str, tex: str) -> Iterator[Sentence]:
        check_for_pysbd_reserved_characters(tex)

        # Extract plaintext from TeX.
        plaintext = extract_plaintext(tex_path, tex)

        # Segment the plaintext. Returns offsets for each sentence relative to the TeX input.
        segmenter = pysbd.Segmenter(language="en", clean=False, char_span=True)

        # As each sentence is scanned, keep track of what sections and environments the
        # sentence appears within.
        section_name = None
        in_figure = False
        in_table = False
        in_itemize = False

        # The pysbd module has several open bugs and issues which are addressed below.
        # As of 3/23/20 we know the module will fail in the following ways:
        # 1. pysbd will not break up the sentence when it starts with a punctuation mark or space.
        #    ex: ". hello. world. hi."
        #    sol: check for sentences being longer than 1000 characters. Also, see the
        #         plaintext extraction function, which attempts to clean up the text so that
        #         consecutive periods are removed before segmentation.
        # 2. pysbd uses reserved characters for splitting sentences
        #    ex: see PYSBD_RESERVED_CHARACTERS list.
        #    sol: throw a warning if the sentence contains any of these characters.
        sentence_ranges: List[CharacterRange] = []
        sentence_start: Optional[int] = None

        for span in segmenter.segment(str(plaintext)):
            if sentence_start is None:
                # Strip leading whitespace from sentence.
                sentence_start = span.start + regex.search(
                    r"^(\s*)", span.sent).end()

            # Don't detect a sentence boundary in the middle of an equation.
            is_boundary_in_equation = regex.search(
                r"EQUATION_DEPTH_0_START(?!.*EQUATION_DEPTH_0_END)",
                str(plaintext[sentence_start:span.end]),
                flags=regex.DOTALL,
            )
            if not is_boundary_in_equation:
                # Strip trailing whitespace from sentence.
                end = span.start + regex.search(r"(\s*)$", span.sent).start()
                sentence_ranges.append(CharacterRange(sentence_start, end))
                sentence_start = None

        for i, sentence_range in enumerate(sentence_ranges):
            tex_start, tex_end = plaintext.initial_offsets(
                sentence_range.start, sentence_range.end)
            if tex_start is None or tex_end is None:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "The span bounds (%d, %d) from pysbd for a sentence could not be mapped "
                    +
                    "back to character offsets in the LaTeX for an unknown reason.",
                    sentence_range.start,
                    sentence_range.end,
                )
                continue

            sentence_tex = tex[tex_start:tex_end]

            # Save the sentence as a journaled string, which will allow the mapping of the cleaned
            # sentence text to the original TeX.
            sentence = plaintext.substring(
                sentence_range.start,
                sentence_range.end,
                # These truncation options are important for preserving the mapping from offsets in
                # the edited sentence to the initial offsets before the edits.
                include_truncated_left=False,
                include_truncated_right=False,
            )
            if len(sentence) > 1000:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "Exceptionally long sentence (length %d). This might indicate the sentence "
                    +
                    "extractor failed to properly split text into sentences.",
                    len(sentence),
                )

            # Extract TeX around sentence to understand the environment in which it appears
            context_tex = get_context(tex, tex_start, tex_end)

            # Detect features describing the context the sentence appears in (i.e., the section it's in,
            # or if it's in a figure, etc.) using regular expressions.
            section = regex.findall(
                r"\\(?:sub)*section[*]*\{[A-Za-z0-9 \{\}\\_.,:-]*\}",
                context_tex)
            abstract_begin = regex.findall(r"\\begin\{abstract\}", context_tex)
            abstract_end = regex.findall(r"\\end\{abstract\}", context_tex)
            table_begin = regex.findall(r"\\begin\{tabular\}", context_tex)
            table_end = regex.findall(r"\\end\{tabular\}", context_tex)
            figure_begin = regex.findall(r"\\begin\{figure[*]*\}", context_tex)
            figure_end = regex.findall(r"\\end\{figure[*]*\}", context_tex)
            itemize_begin = regex.findall(r"\\begin\{itemize[*]*\}",
                                          context_tex)
            itemize_end = regex.findall(r"\\end\{itemize[*]*\}", context_tex)
            cite = regex.findall(
                r"\\cite[A-Za-z0-9 \\_\[\].,:-]*\{[A-Za-z0-9 \\_.,:-]*\}",
                context_tex)
            url = regex.findall(r"\\url\{[A-Za-z0-9 \{\}/\\_.,:-]*\}",
                                context_tex,
                                overlapped=False)
            label = regex.findall(r"\\label\{[A-Za-z0-9 \\_.,:-]*\}",
                                  context_tex)
            ref = regex.findall(r"\\ref\{[A-Za-z0-9 \\_.,:-]*\}", context_tex)
            tex_macros = set(
                regex.findall(
                    r"\\[A-Za-z0-9\\\[\]_.,:-]*[\{[A-Za-z0-9 \\_.,:-]*\}]*",
                    context_tex))

            # Save a list of other TeX macros that aren't captured by any of the other
            # categories: { any } - { section, label, ... }.
            other_tex_macros: List[str] = []
            named_macros = {
                m
                for l in [
                    abstract_begin,
                    abstract_end,
                    table_begin,
                    table_end,
                    figure_begin,
                    figure_end,
                    itemize_begin,
                    itemize_end,
                    cite,
                ] for m in l
            }
            other_tex_macros = list(tex_macros - named_macros)

            # Save section name.
            if abstract_begin:
                section_name = "ABSTRACT"
            if abstract_end:
                section_name = None
            if section:
                section_name = extract_text_from_tex_group(section[0])

            # Save information about whether a sentence is in a figure, table, or other environment.
            # TODO(dykang): consider using \label{} in table/figure to improve matching.
            if figure_begin:
                in_figure = True
            if figure_end:
                in_figure = False
            if table_begin:
                in_table = True
            if table_end:
                in_table = False
            if itemize_begin:
                in_itemize = True
            if itemize_end:
                in_itemize = False

            # Use heuristics about the surrounding text to determine whether or not this
            # sentence is valid. These heuristics have a number of limitations, and should be
            # replaced with more mature rules for detecting whether the sentence is indeed in
            # a named section, the abstract, a figure, a table, etc. See documentation of its
            # limitations here: https://github.com/allenai/scholar-reader/issues/138#issue-678432430
            validity_guess = all([
                # Sentence should appear in a named section.
                (not self.from_named_sections_only) or section_name,
                # Sentence should not appear in a figure or table.
                # TODO(dykang, andrewhead): eventually, this should be rewritten to permit the
                # extraction of sentences from captions.
                not in_figure,
                not in_table,
                # If the sentence contained regular expression patterns for the start or end of
                # an environment, it's probably not a sentence, but rather just TeX macros.
                not abstract_begin,
                not abstract_end,
                not section,
                not table_end,
                not figure_end,
                not itemize_begin,
                not itemize_end,
            ])

            tokens = regex.split(r"[\s,;.!?()]+", str(sentence))
            contains_common_english_word = any([
                len(t) > 1 and t.lower() in self.english_words for t in tokens
            ])
            ends_with_stop = bool(regex.search(r"[,.:;!?]\s*$", str(sentence)))
            is_clean = contains_common_english_word and ends_with_stop

            # Sanitize the text, replacing macros and unwanted TeX with text that will be easier
            # for the text processing algorithms to process.
            sanitized = sentence
            replace_patterns: List[Tuple[str, str]] = []

            # Replace citations with "CITATION".
            for citation in cite:
                citation_text = extract_text_from_tex_group(citation)
                for key in citation_text.split(","):
                    replace_patterns.append((key, "CITATION"))

            # Replace URLs with "URL".
            for url_item in url:
                url_text = extract_text_from_tex_group(url_item)
                replace_patterns.append((url_text, "URL"))

            # Replace references to text elements like figures and tables with a single
            # known word for each type of element. Currently depends on idiomatic patterns
            # for naming elements, like \ref{{fig,tab,sec,eq}:XXX}, to distinguish between
            # element types. Also, the code keeps the token ahead of the reference (e.g.,
            # the word "Table" in "Table\ref{...}"), although it might duplicate the
            # information in the replaced label.
            for reference in ref:
                reference_text = extract_text_from_tex_group(reference)
                for r in reference_text.split(","):
                    # Check the extracted label itself (e.g., "tab:results"),
                    # not the full "\ref{...}" match, which always starts
                    # with "\ref" and would never match these prefixes.
                    if r.lower().startswith("tab"):
                        replace_patterns.append((r, "TABLE"))
                    if r.lower().startswith("fig"):
                        replace_patterns.append((r, "FIGURE"))
                    if r.lower().startswith("sec"):
                        replace_patterns.append((r, "SECTION"))
                    if r.lower().startswith("eq"):
                        replace_patterns.append((r, "EQUATION"))

            # Substitute patterns with replacements.
            for pattern, replacement in replace_patterns:
                if pattern == "":
                    continue
                match_start = 0
                while True:
                    match_offset = sanitized.find(pattern, match_start)
                    if match_offset == -1:
                        break
                    sanitized = sanitized.edit(match_offset,
                                               match_offset + len(pattern),
                                               replacement)
                    match_start = match_offset + len(pattern)

            yield Sentence(
                id_=str(i),
                tex_path=tex_path,
                start=tex_start,
                end=tex_end,
                text=str(sentence),
                text_journal=sentence,
                sanitized=str(sanitized),
                sanitized_journal=sanitized,
                tex=sentence_tex,
                context_tex=context_tex,
                validity_guess=validity_guess,
                is_clean=is_clean,
                section_name=section_name,
                in_figure=in_figure,
                in_table=in_table,
                in_itemize=in_itemize,
                label=label,
                ref=ref,
                cite=cite,
                url=url,
                others=other_tex_macros,
            )
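A minimal, self-contained illustration of the equation-boundary guard used in parse above. The EQUATION_DEPTH_0_* sentinels stand for the placeholder strings that the plaintext extraction step inserts around equations; the sample strings here are invented for the demo:

import regex

GUARD = r"EQUATION_DEPTH_0_START(?!.*EQUATION_DEPTH_0_END)"

# Equation opened but not closed within the span: the candidate sentence
# boundary falls inside the equation, so segmentation should be deferred.
inside = "We minimise EQUATION_DEPTH_0_START L(x) = x"
# Equation fully closed within the span: the boundary is safe.
closed = "We minimise EQUATION_DEPTH_0_START L(x) EQUATION_DEPTH_0_END here."

print(bool(regex.search(GUARD, inside, flags=regex.DOTALL)))  # True
print(bool(regex.search(GUARD, closed, flags=regex.DOTALL)))  # False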
Ejemplo n.º 30
def test_sbd_char_span(text, expected):
    """Test sentences with character offsets"""
    seg = pysbd.Segmenter(language="en", clean=False, char_span=True)
    segments = seg.segment(text)
    assert segments == expected
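For reference, a quick sketch of what char_span=True yields at runtime: pysbd returns TextSpan objects whose sent, start, and end attributes map each sentence back to offsets in the original string (sample text invented for the demo; the offset round-trip check reflects pysbd's documented behaviour):

import pysbd

seg = pysbd.Segmenter(language="en", clean=False, char_span=True)
text = "My name is Jonas E. Smith. Please turn to p. 55."
for span in seg.segment(text):
    # Offsets index directly back into the original string.
    assert text[span.start:span.end] == span.sent
    print(span.start, span.end, repr(span.sent))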