Esempio n. 1
0
def get_parser(disable: Set[str] = None,
               lang: str = 'en',
               merge_terms: Optional[Set] = None,
               max_sent_len: Optional[int] = None) -> Callable:
    """Build a spaCy pipeline for clinical text with custom sentence splitting.

    The model is loaded with ``spacy.load``, its tokenizer is replaced by the
    result of ``ct_tokenizer(nlp)``, and a ``SentenceSegmenter`` driven by
    ``ct_sbd_rules`` is appended to the pipeline.

    Parameters
    ----------
    disable
        Pipeline component names handed to ``spacy.load(..., disable=...)``.
        ``None`` selects ``{"ner", "parser", "tagger", "lemmatizer"}``.
    lang
        Model name passed as the first argument of ``spacy.load``.
    merge_terms
        Forwarded to ``ct_sbd_rules`` as ``merge_terms``. Any falsy value
        (``None`` or an empty collection) is replaced by ``{}``.
    max_sent_len
        Forwarded unchanged to ``ct_sbd_rules`` as ``max_sent_len``.

    Returns
    -------
    The configured spaCy language object.
    """
    if disable is None:
        disable = {"ner", "parser", "tagger", "lemmatizer"}
    if not merge_terms:
        # NOTE(review): this is an empty dict although the annotation says
        # Set; kept as-is because ct_sbd_rules is not visible here.
        merge_terms = {}

    pipeline = spacy.load(lang, disable=disable)
    pipeline.tokenizer = ct_tokenizer(pipeline)

    # Bind the rule parameters once so the segmenter can call strategy(doc).
    boundary_rules = partial(
        ct_sbd_rules,
        merge_terms=merge_terms,
        max_sent_len=max_sent_len,
    )
    pipeline.add_pipe(SentenceSegmenter(pipeline.vocab, strategy=boundary_rules))
    return pipeline
Esempio n. 2
0
def separate(nlp, s, stop_file=None):
    """Parse ``s`` with a fresh Chinese model and log its clause-like segments.

    A new ``zh_core_web_sm`` pipeline is loaded on every call (the ``nlp``
    argument is overwritten), a punctuation-based ``SentenceSegmenter`` is
    added to it, and each resulting segment is written to ``separate._log``
    at debug level.

    Args:
        nlp: Ignored; immediately replaced by a freshly loaded pipeline.
        s: Text to parse.
        stop_file: Not used by this function.

    Returns:
        The parsed document produced by the freshly loaded pipeline.
    """
    separate._log.debug("\nThe outcomes of separation '{}' are:".format(s))
    # The model has to be re-imported and re-loaded here rather than passed in
    # from outside: nlp.remove_pipe() / nlp.add_pipe(SS) are needed, and reusing
    # an external pipeline raises ValueError. (Tracking this down cost the
    # original author a whole morning.)
    import zh_core_web_sm
    nlp = zh_core_web_sm.load()

    from spacy.pipeline import SentenceSegmenter

    def split_on_punctuation(doc):
        """Yield spans of ``doc``, cutting after delimiters and at whitespace."""
        delimiters = ",;,;、和与"
        span_start = 0
        cut_pending = False
        for token in doc:
            if not (cut_pending or token.is_space):
                # Ordinary token: only remember whether it is a delimiter, so
                # that the cut happens in front of the *next* token.
                if token.text in delimiters:
                    cut_pending = True
                continue
            yield doc[span_start:token.i]
            span_start = token.i
            cut_pending = False
        # Emit whatever is left after the last cut.
        if span_start < len(doc):
            yield doc[span_start:len(doc)]

    segmenter = SentenceSegmenter(nlp.vocab, strategy=split_on_punctuation)
    nlp.add_pipe(segmenter)
    parsed = nlp(s)
    for segment in parsed.sents:
        separate._log.debug("\t{}".format(segment.text))
    return parsed
Esempio n. 3
0
def Parsing():
    """Demonstrate sentence splitting and dependency children on Chinese text.

    Uses the module-level ``nlp`` pipeline. First prints the sentences the
    pipeline yields as-is, then registers a punctuation-based
    ``SentenceSegmenter`` on ``nlp`` (a lasting change to that pipeline) and
    prints the sentences again, and finally prints every token of a sample
    sentence together with its syntactic children.
    """
    print("\nThe outcomes of Parsing are:")

    # Chinese text cannot be split into sentences directly!
    paragraph = nlp(u"京东CEO刘强东, 在美国明尼苏达,涉嫌性侵女大学生. 奶茶妹妹。遇见VP就有90%的几率1位出道……")
    for sent in paragraph.sents:
        print("\t{}".format(sent.text))

    # Custom Chinese sentence splitting.
    from spacy.pipeline import SentenceSegmenter

    # Characters after which a new sentence starts. A token counts as a
    # delimiter when its text occurs in this string (substring test).
    punctuation = ",.:;?!,。:;?!"

    def split_on_punctuation(doc):
        """Yield spans of ``doc``, cutting before the first non-space token
        that follows a delimiter."""
        start = 0
        whether_segmenter = False
        for word in doc:
            if whether_segmenter and not word.is_space:
                yield doc[start:word.i]
                start = word.i
                whether_segmenter = False
            elif word.text in punctuation:
                whether_segmenter = True
        # Emit whatever is left after the last cut.
        if start < len(doc):
            yield doc[start:len(doc)]

    SS = SentenceSegmenter(nlp.vocab, strategy=split_on_punctuation)
    nlp.add_pipe(SS)
    paragraph = nlp(u"京东CEO刘强东, 在美国明尼苏达,涉嫌性侵女大学生. 奶茶妹妹……遇见VP就有90%的几率1位出道。")
    for sent in paragraph.sents:
        print("\t{}".format(sent.text))

    print("\n")
    sentence = nlp("京东CEO刘强东在美国明尼苏达涉嫌性侵女大学生。")
    for word in sentence:
        print("\t{}: {}".format(word, str(list(word.children))))
    def __init__(self, text=""):
        """Store ``text``, prepare the result container and build the pipeline.

        The pipeline is ``en_core_web_sm`` with this object's private custom
        tokenizer installed and a ``SentenceSegmenter`` using the private
        custom segmenter inserted as the first component.
        """
        self.text = text
        # One list per annotation layer.
        self.output = dict(tokens=[], types=[], lemma=[], pos=[])

        # Define the pipeline.
        self.nlp = spacy.load("en_core_web_sm")
        self.nlp.tokenizer = self.__custom_tokenizer
        self.nlp.add_pipe(
            SentenceSegmenter(self.nlp.vocab,
                              strategy=self.__custom_segmenter),
            first=True,
        )
Esempio n. 5
0
 def __init__(self, language='en'):
     """Create a blank French or English pipeline with custom sentence splitting.

     ``language == 'fr'`` selects ``French()``; every other value selects
     ``English()``. A ``SentenceSegmenter`` using ``split_sents`` is added
     and the pipeline is stored on ``self.nlp``.
     """
     self.exclude = EXCLUDE
     self.language = language
     pipeline = French() if language == 'fr' else English()
     # The built-in 'sentencizer' pipe is deliberately not used here;
     # split_sents decides the sentence boundaries instead.
     pipeline.add_pipe(SentenceSegmenter(pipeline.vocab, strategy=split_sents))
     self.nlp = pipeline
Esempio n. 6
0
    u'Managment ist die Sachen richtig zu machen - Führung ist die richtigen Sachen zu machen. - Peter Drucker'
)
for sent in doc.sents:
    print(sent)


# Splits on \n
def split_on_new_lines(doc):
    """Yield consecutive slices of ``doc``, cutting after newline tokens.

    A token whose text starts with ``'\\n'`` closes the current slice; the
    cut itself is made when the following token is reached, so the newline
    token stays at the end of its slice. The token at which a cut is made is
    not itself examined for a newline. The remaining tokens are always
    yielded at the end, even when that slice is empty.
    """
    span_start = 0
    cut_pending = False

    for token in doc:
        if not cut_pending:
            # Only remember the newline; the cut happens on the next token.
            cut_pending = token.text.startswith('\n')
            continue
        yield doc[span_start:token.i]
        span_start = token.i
        cut_pending = False
    yield doc[span_start:]


# Wrap the newline-based strategy in a SentenceSegmenter and add it to the
# nlp_de pipeline, then show the resulting component names.
sbd = SentenceSegmenter(nlp_de.vocab, strategy=split_on_new_lines)
nlp_de.add_pipe(sbd)
print(nlp_de.pipe_names)

# Sample text containing both periods and line breaks, so the printed
# sentences show which of the two the pipeline now splits on.
doc = nlp_de(
    u'Dies ist ein Satz. Dies ist ein anderer.\n\nDies ist ein \ndritter Satz.'
)

for sent in doc.sents:
    print(sent)
Esempio n. 7
0
 def __init__(self, lang):
     """Load model ``lang`` without parser, tagger and NER and append a
     ``SentenceSegmenter`` (constructed with its default strategy)."""
     skipped_components = ['parser', 'tagger', 'ner']
     self.nlp = spacy.load(lang, disable=skipped_components)
     self.nlp.add_pipe(SentenceSegmenter(self.nlp.vocab))
Esempio n. 8
0
def split_on_breaks(doc):
    """Yield slices of ``doc`` separated by ``sentence_boundary`` tokens.

    When a token's text equals the module-level ``sentence_boundary``, the
    slice ending just before that marker is yielded once the following token
    is reached, and the next slice starts at that following token. Tokens
    left over after the last cut are yielded as a final slice (which
    includes a trailing marker if the document ends with one).
    """
    span_start = 0
    cut_pending = False
    for token in doc:
        if not cut_pending:
            if token.text == sentence_boundary:
                cut_pending = True
            continue
        # token.i - 1 is the marker itself, so the slice stops before it.
        yield doc[span_start:token.i - 1]
        span_start = token.i
        cut_pending = False
    if span_start < len(doc):
        yield doc[span_start:len(doc)]


# Wrap split_on_breaks in a SentenceSegmenter and insert it as the first
# component of the nlp pipeline.
sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_breaks)
nlp.add_pipe(sbd, first=True)

def clean_text(sent):
    """Return ``sent`` with all non-ASCII characters and backslashes removed.

    Args:
        sent: The text to clean.

    Returns:
        The cleaned string; characters that cannot be encoded as ASCII are
        dropped rather than replaced.
    """
    ascii_only = sent.encode("ascii", errors="ignore").decode()
    return ascii_only.replace('\\', '')


def inject_sentence_boundaries(paragraph, boundary_type="multirc"):
    """Insert the ``sentence_boundary`` marker between sentences of ``paragraph``.

    Args:
        paragraph: Text to annotate.
        boundary_type: ``"multirc"`` replaces each ``<b>Sent N: </b>`` header
            (and an optional preceding ``<br>``) with the marker, then strips
            a trailing ``<br>`` and a leading marker. ``"spacy"`` splits the
            text with ``raw_nlp`` and joins the stripped sentences with the
            marker. Any other value leaves the text unchanged.

    Returns:
        The annotated paragraph.
    """
    if boundary_type == "multirc":
        marker = ' {} '.format(sentence_boundary)
        # A function replacement inserts the marker literally, so backslashes
        # in it are never interpreted as regex replacement escapes.
        paragraph = re.sub(r'(<br>)?(<b>Sent \d+: *<\/b>)',
                           lambda _match: marker, paragraph)
        paragraph = re.sub(r'<br> *$', '', paragraph)
        # Escape the marker so that characters such as '[' or '|' are matched
        # literally instead of being read as regex syntax.
        paragraph = re.sub(r'^ *{} *'.format(re.escape(sentence_boundary)),
                           '', paragraph)
    elif boundary_type == "spacy":
        paragraph = " {} ".format(sentence_boundary).join(
            [str(sent).strip() for sent in raw_nlp(paragraph).sents])
    # Hand the rewritten text back; without this the work above is discarded.
    return paragraph
def main():
    """Run a notebook-style tour of spaCy (plus NLTK stemmers) step by step.

    The body is a sequence of ``#%%`` cells that print their results:
    tokens with POS/dependency labels, named entities (including manually
    added ones), displaCy visualisations, noun chunks, stemming,
    lemmatization, stop-word editing, rule-based and phrase matching,
    sentence segmentation (default, custom boundary rule, custom segmenter)
    and POS/TAG/DEP frequency counts.

    Reads ``reaganomics.txt`` and ``owlcreek.txt`` from ``config.DATA_DIR``.
    Several bare expressions (``len(...)``, ``type(...)``, ``.is_stop``) are
    evaluated without using the result; they only show something when the
    cells are executed interactively.
    """
    import nltk
    from nltk.stem.porter import PorterStemmer
    from nltk.stem.snowball import SnowballStemmer
    import spacy
    from spacy import displacy
    from spacy.matcher import Matcher
    from spacy.matcher import PhraseMatcher
    from spacy.tokens import Span
    from spacy.pipeline import SentenceSegmenter

    import config

    nlp = spacy.load('en_core_web_sm')

    #%%
    print(nlp.pipeline)
    print(nlp.pipe_names)

    #%%
    print("Data string examples \n")
    mystring = '"As of last quarter autonomous cars have shifted insurance liability toward manufacturers. ' \
               'There\'s a car factory in LA! About 5km away. ' \
               'Here is the Apple snail-mail: [email protected] or visit http://www.oursite.com."'

    mystring2 = 'I am a runner running in a race because I love to run since I ran today.'
    words = [
        'run', 'ran', 'runner', 'runs', 'fairly', 'fairness', 'generous',
        'generously', 'generate', 'generation'
    ]

    #%%
    print(
        "Print each word in the string with it's corresponding POS, dependency:"
    )
    doc1 = nlp(mystring)
    print("The vocab size for our small lang. lib. is: ", len(doc1.vocab))
    for token in doc1:
        print(token.text, token.pos, token.pos_, token.dep_)

    #%%

    print("Print the named entities:")
    for token in doc1.ents:
        print(
            f"{token.text} {10*'.'}\t {token.label_} {5*'.'}\t {spacy.explain(token.label_)}\n"
        )

    #%%
    print("A function to display basic entity info.")

    def show_ents(doc):
        # Print "text - label - explanation" for every entity, or a notice
        # when the document has none.
        if doc.ents:
            for ent in doc.ents:
                print(ent.text + ' - ' + ent.label_ + ' - ' +
                      str(spacy.explain(ent.label_)))
        else:
            print('\n No named entities found. \n')

    doc4 = nlp(
        u'May I go to Washington, DC next May to see the Washington Monument and buy Tesla stocks? The flight ticket is only 500 dollars.'
    )
    doc5 = nlp(u'Hi, Hope you are well.')
    show_ents(doc4)
    show_ents(doc5)

    #%%
    print("Adding a single term as an NER")
    from spacy.tokens import Span

    doc = nlp(u'Tesla to build a U.K. factory for $6 million')

    # Get the hash value of the ORG entity label
    ORG = doc.vocab.strings[u'ORG']
    print(ORG)
    # Create a Span for the new entity
    # doc: Name of document object
    # 0: start position of the span,
    # 1: stop position of the span (exclusive: not including 1)
    # Label: ORG is the label assigned to the entity
    new_ent = Span(doc, 0, 1, label=ORG)

    # Add the entity to the existing Doc object
    doc.ents = list(doc.ents) + [new_ent]

    show_ents(doc)

    #%%
    print("Adding multiple phrases as NERs")

    doc = nlp(u'Our company plans to introduce a new vacuum cleaner. '
              u'If successful, the vacuum cleaner will be our first product.')
    show_ents(doc)

    # Import PhraseMatcher and create a matcher object:
    from spacy.matcher import PhraseMatcher
    matcher = PhraseMatcher(nlp.vocab)

    # Create the desired phrase patterns:
    phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
    phrase_patterns = [nlp(text) for text in phrase_list]

    # Apply the patterns to our matcher object:
    matcher.add('newproduct', None, *phrase_patterns)

    # Apply the matcher to our Doc object:
    found_matches = matcher(doc)

    # See what matches occur:
    print(found_matches)

    # Here we create Spans from each match, and create named entities from them:
    from spacy.tokens import Span

    PROD = doc.vocab.strings[u'PRODUCT']

    # Each match is a (match_id, start, end) tuple; use start/end as the span.
    new_ents = [
        Span(doc, match[1], match[2], label=PROD) for match in found_matches
    ]

    doc.ents = list(doc.ents) + new_ents

    show_ents(doc)
    #%%
    print("Counting Named Entities occurrences")

    doc = nlp(
        u'Originally priced at $29.50, the sweater was marked down to five dollars.'
    )

    show_ents(doc)

    # Number of MONEY entities; the value is computed but not used here.
    len([ent for ent in doc.ents if ent.label_ == 'MONEY'])

    # For more on **Named Entity Recognition** visit https://spacy.io/usage/linguistic-features#101

    #%%
    print("Visualizing NER")
    doc = nlp(
        u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
        u'By contrast, Sony sold only 7 thousand Walkman music players.')

    displacy.render(doc, style='ent', jupyter=True)
    # NOTE(review): displacy.serve presumably starts a local web server and
    # blocks until it is stopped -- confirm; the same applies to every
    # displacy.serve call below.
    displacy.serve(doc1, style='ent')

    print('Viewing Sentences Line by Line')
    for sent in doc.sents:
        displacy.render(nlp(sent.text), style='ent', jupyter=True)

    print("Viewing Specific Entities, and customizing the visualization")
    options = {'ents': ['ORG', 'PRODUCT']}

    colors = {
        'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)',
        'PRODUCT': 'radial-gradient(yellow, green)'
    }

    # Supersedes the options dict above by adding the custom colours.
    options = {'ents': ['ORG', 'PRODUCT'], 'colors': colors}

    print('display entities on jupiter notebook')
    displacy.render(doc, style='ent', jupyter=True, options=options)
    print('Display entities on browser: http://127.0.0.1:5000 ')
    displacy.serve(doc1, style='ent', options=options)
    # For more on applying CSS background colors and gradients, visit https://www.w3schools.com/css/css3_gradients.asp
    # https://spacy.io/usage/visualizers
    #%%
    print("Visualize entity recognizer with Spacy (line by line)")

    doc1 = nlp(mystring)
    spans = list(doc1.sents)

    # print('display entities on jupiter notebook')
    # displacy.render(spans,style='ent',jupyter=True,options={'distance':80})

    print('Display entities on browser: http://127.0.0.1:5000 ')
    displacy.serve(spans, style='ent', options=options)

    #%%
    print("Visualize entity recognizer with Spacy (whole paragraph)")

    # print('display entities on jupiter notebook')
    # displacy.render(doc1,style='ent',jupyter=True,options={'distance':80})

    print('Display entities on browser: http://127.0.0.1:5000 ')
    displacy.serve(doc1, style='ent', options=options)

    #%%

    print("List name Chunks:")
    for token in doc1.noun_chunks:
        print(token)
    # For more on **noun_chunks** visit https://spacy.io/usage/linguistic-features#noun-chunks
    #%%
    print("Dependency visualization with Spacy ")

    # style 'dep': shows pos tags and syntactic dependencies
    # NOTE(review): 'compact' is given as the string 'True', not the boolean
    # True -- confirm that this is what displacy expects.
    options = {
        'distance': 80,
        'compact': 'True',
        'color': 'yellow',
        'bg': '#09a3d5',
        'font': 'Times'
    }

    print('display dependencies on jupiter notebook')
    displacy.render(doc1, style='dep', jupyter=True, options=options)

    print('Display dependencies on browser: http://127.0.0.1:5000 ')
    displacy.serve(doc1, style='dep', options=options)

    #%%
    print(
        "Spacy doesn't include a Stemmer. Instead it relies on lemmatization entirely. \n"
        "We use NLTK Porter and Snowball Stemmers here.")

    p_stemmer = PorterStemmer()

    for word in words:
        print(f"{word}, {10*'.'}, {p_stemmer.stem(word)}")

    s_stemmer = SnowballStemmer(language='english')
    for word in words:
        print(f"{word}, {10*'.'}, {s_stemmer.stem(word)}")

    #%%
    print("Perform Lemmatization with Spacy")
    text = nlp(mystring2)

    def show_lemmas(text):
        # One aligned row per token: text, coarse POS, lemma hash, lemma
        # string, fine-grained tag and the tag's explanation.
        for token in text:
            print(
                f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{12}} {token.lemma_:{6}} {token.tag_:{6}} {spacy.explain(token.tag_)}'
            )

    show_lemmas(text)

    #%%
    print("Remove/Add stopwords with Spacy")
    print(nlp.Defaults.stop_words)  # Print the List of Spacy stopwords
    len(nlp.Defaults.stop_words)  # Number of default stopwords in Spacy
    nlp.vocab[
        'is'].is_stop  # Tells if the vocab is among Spacy stopwords or not
    nlp.vocab['mystery'].is_stop
    nlp.Defaults.stop_words.add(
        'btw')  # Adding to the Spacy's list of stopwords
    nlp.vocab['btw'].is_stop = True  # set it to True
    nlp.Defaults.stop_words.remove(
        'six')  # Removing from the Spacy's list of stopwords
    nlp.vocab['six'].is_stop = False

    #%%
    print(
        "RuleBased Vocabulary Matching.\n More powerful version of the regular expressions"
    )

    # looking for 3 different forms of the same pattern here
    matcher = Matcher(nlp.vocab)
    # a single token whose lowercase text reads 'solarpower'
    pattern1 = [{'LOWER': 'solarpower'}]
    # two adjacent tokens that read 'solar' and 'power' in that order
    pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
    # three adjacent tokens, with a middle token that can be any punctuation
    pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]
    # Option (OP) key '*' : allows pattern 0 or more times
    pattern4 = [{
        'LOWER': 'solar'
    }, {
        'IS_PUNCT': True,
        'OP': '*'
    }, {
        'LOWER': 'power'
    }]

    # add patterns to matcher labeled 'SolarPowerMatcherName'
    matcher.add('SolarPowerMatcherName', None, pattern1, pattern2, pattern3,
                pattern4)

    doc = nlp(u'The Solar Power industry continues to grow as demand \
    for solarpower increases. Solar-power cars are gaining popularity as solar--power shows more strength'
              )

    found_matches = matcher(doc)
    print(
        found_matches)  # gives you tuples with match_id, start, and end index

    for match_id, start, end in found_matches:  # grabs raw matched-vocab with match_id, start, and end index
        string_id = nlp.vocab.strings[match_id]  # get string representation
        span = doc[start:end]  # get the matched span
        print(match_id, string_id, start, end, span.text)

    # remove the patterns identified under 'SolarPowerMatcherName' label to avoid duplicates in next search
    matcher.remove('SolarPowerMatcherName')

    #%%
    print(
        "RuleBased Phrase Matching.\n More powerful version of the regular expressions"
    )
    matcher = PhraseMatcher(nlp.vocab)
    # if your file gave you utf8 file error run this on terminal:
    # iconv -f iso-8859-1 -t utf-8  original_file > new_file
    # The path is built by plain concatenation, so config.DATA_DIR must
    # already end with a path separator.
    doc2_path = config.DATA_DIR + 'reaganomics.txt'
    with open(doc2_path) as f:
        doc2 = nlp(f.read())

    # First, create a list of match phrases:
    phrase_list = [
        'voodoo economics', 'supply-side economics', 'trickle-down economics',
        'free-market economics'
    ]

    # Next, convert each phrase to a Doc object:
    phrase_patterns = [nlp(text) for text in phrase_list]

    # Pass each Doc object into matcher (note the use of the asterisk!):
    matcher.add('VoodooEconomics', None, *phrase_patterns)

    # Build a list of matches:
    found_matches = matcher(doc2)

    for match_id, start, end in found_matches:  # grabs raw matched-vocab with match_id, start, and end index
        string_id = nlp.vocab.strings[match_id]  # get string representation
        span = doc2[start:end]  # get the matched span
        print(match_id, string_id, start, end, span.text)
    #%%
    print("going through doc sentences")

    with open(config.DATA_DIR + 'owlcreek.txt') as f:
        doc = nlp(f.read())

    sents = [sent for sent in doc.sents]
    len(sents)

    #%%
    print("sentense segmentation")
    '''
    It is important to note that `doc.sents`
    is a *generator*. That is, a Doc is not segmented 
    until `doc.sents` is called. This means that, 
    where you could print the second Doc token with
    `print(doc[1])`, you can't call the' 
     "second Doc sentence" with `print(doc.sents[1])`
     However, you *can* build a sentence collection by
      running `doc.sents` and saving the result to a list
    '''

    doc = nlp(
        u'This is the first sentence. This is another sentence. This is the last sentence.'
    )

    for sent in doc.sents:
        print(sent)

    print(doc[1])
    type(list(doc.sents)[0])  # it is a span type not string
    # print(doc.sents[1]) gives you error, you should use the following instead
    print(list(doc.sents)[0])

    doc_sents = [sent for sent in doc.sents]

    # Now you can access individual sentences
    print(doc_sents[1])

    # At first glance it looks like each `sent` contains text from the original Doc object. In fact they're just Spans
    # with start and end token pointers.

    type(doc_sents[1])
    print(doc_sents[1].start, doc_sents[1].end)
    #%%
    print("Spacy's built-in `sentencizer` for sentense segmentation")
    """
    spaCy's built-in `sentencizer` relies on the dependency
     parse and end-of-sentence punctuation to determine 
     segmentation rules. We can add rules of our own, 
     but they have to be added *before* the creation of 
     the Doc object, as that is where the parsing of segment 
     start tokens happens
    
    """

    # Parsing the segmentation start tokens happens during the nlp pipeline
    doc2 = nlp(u'This is a sentence; This is a sentence. This is a sentence.')

    for token in doc2:
        print(token.is_sent_start, ' ' + token.text)

    for sent in doc2.sents:
        print(sent)
    #%%
    print("ADD A NEW SEGMENTATION RULE TO THE PIPELINE-part2")

    def set_custom_boundaries(doc):
        # Mark the token after every ';' as a sentence start. The last token
        # is skipped because there is no following token to mark.
        for token in doc[:-1]:
            if token.text == ';':
                doc[token.i + 1].is_sent_start = True
        return doc

    nlp.add_pipe(set_custom_boundaries, before='parser')

    print(
        nlp.pipe_names)  # ['tagger', 'set_custom_boundaries', 'parser', 'ner']

    # Re-run the Doc object creation:
    doc4 = nlp(
        u'"Management is doing things right; leadership is doing the right things." -Peter Drucker'
    )

    for sent in doc4.sents:  # separates sentences on semicolon
        print(sent)

    # And yet the new rule doesn't apply to the older Doc object:
    for sent in doc2.sents:
        print(sent)
    #%%
    print("ADD CHANGE SEGMENTATION RULES TO THE PIPELINE-part2")
    """
    Why not simply set the `.is_sent_start` value to 
    True on existing tokens?
    
    In some cases we want to *replace* spaCy's default 
    sentencizer with our own set of rules. 
    In this section we'll see how the default 
    sentencizer breaks on periods. We'll then replace 
    this behavior with a sentencizer that breaks on linebreaks.
    
    """

    nlp = spacy.load('en_core_web_sm')  # reset to the original

    mystring = u"This is a sentence. This is another.\n\nThis is a \nthird sentence."

    # SPACY DEFAULT BEHAVIOR:
    doc = nlp(mystring)

    for sent in doc.sents:
        print([token.text for token in sent])

    def split_on_newlines(doc):  #split on newlines instead of `.`
        start = 0
        seen_newline = False
        for word in doc:
            if seen_newline:
                yield doc[start:
                          word.i]  #word.i --> current word index position
                start = word.i
                seen_newline = False
            elif word.text.startswith('\n'):  # handles multiple occurrences
                seen_newline = True
        yield doc[start:]  # handles the last group of tokens

    sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
    nlp.add_pipe(sbd)

    doc = nlp(mystring)
    for sent in doc.sents:
        print([token.text for token in sent])

    #%%
    print("Perform POS with Spacy")
    text = nlp(u"I read books on NLP.")
    text2 = nlp(u"I read a book on NLP.")
    word = text[1]
    print(f'{word} : {type(word)}')
    print(f'{word.text} : {type(word.text)}')

    def show_pos(text):
        # One aligned row per token: text, coarse POS, fine-grained tag and
        # the tag's explanation.
        for token in text:
            print(
                f'{token.text:{12}} {token.pos_:{6}} {token.tag_:{6}} {spacy.explain(token.tag_)}'
            )

    # the pos shows 'read' is past/present tense
    print('\n read (present tense)\n')
    show_pos(text)
    print(f'\n read (past tense)\n')
    show_pos(text2)
    #%%
    print("Count different coarse-grained POS codes\n")
    doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
    POS_counts = doc.count_by(spacy.attrs.POS)
    print('POS_counts:', POS_counts)
    # NOTE(review): 83 is a hard-coded attribute ID looked up in the vocab;
    # which POS it maps to presumably depends on the spaCy version -- confirm.
    print('Associated `item` for the POS `key #`: ', doc.vocab[83].text)

    print(
        'Creat frequency list of POS tags since `POS_counts` returns a dictionary  with `POS_counts.items()\n'
    )

    for k, v in sorted(POS_counts.items()):
        print(f'{k}. {doc.vocab[k].text:{5}}: {v}')
    #%%
    print("Count different coarse-grained Tag codes\n")

    TAG_counts = doc.count_by(spacy.attrs.TAG)

    for k, v in sorted(TAG_counts.items()):
        print(f'{k}. {doc.vocab[k].text:{4}}: {v}')
    #%%
    print('Count the different dependencies (DEP) codes\n')

    DEP_counts = doc.count_by(spacy.attrs.DEP)

    for k, v in sorted(DEP_counts.items()):
        print(f'{k}. {doc.vocab[k].text:{4}}: {v}')