def nlp_model(negation_language="en"):
    """Load the scispaCy pipeline used for negation handling.

    First attempts to import ``en_core_sci_sm`` and extend it with a
    sentencizer and a ``Negex`` component. When the package is missing,
    ``download_model`` is called once and, on a zero return status, the
    model is loaded without those extra components. Finally every word in
    ``remove_from_stops`` (and its capitalised form) is un-flagged as a
    stop word in the model vocabulary.

    Args:
        negation_language: Forwarded to ``Negex`` as ``language``.

    Returns:
        The loaded pipeline, or ``None`` when ``download_model`` returned a
        non-zero status.
    """
    model_url = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz"

    try:
        import en_core_sci_sm

        nlp = en_core_sci_sm.load(disable=["tagger", "parser", "lemmatizer"])
        sentencizer = nlp.create_pipe('sentencizer')
        nlp.add_pipe(sentencizer)
        nlp.add_pipe(
            Negex(nlp, language=negation_language, chunk_prefix=["no"]),
            last=True,
        )
        Token.set_extension('negex', default=False, force=True)
    except ModuleNotFoundError:
        # Model package not installed: fetch it, then retry the import.
        download_status = download_model(model_url)
        if download_status != 0:
            logger.info('Negation model could not be loaded\n')
            nlp = None
        else:
            import en_core_sci_sm

            nlp = en_core_sci_sm.load(disable=["tagger", "parser"])

    if not nlp:
        return nlp

    # These words must not be treated as stop words, in either casing.
    for word in remove_from_stops.split(" "):
        for variant in (word, word.capitalize()):
            nlp.vocab[variant].is_stop = False
    return nlp
def search_canonicalNameFlesh(text=None):
    """Return the distinct keywords the global keyword processor finds in *text*.

    The module-level ``keyprocessor`` and ``nlpbif`` are (re)loaded when the
    processor is still ``None`` or when the function is called without text.
    With text, the entities detected by ``nlpbif`` are lower-cased, split on
    single spaces, re-joined and passed to ``keyprocessor.extract_keywords``.

    Args:
        text: Text to analyse; ``None`` only triggers the loading step.

    Returns:
        List of matched keywords in first-seen order, without duplicates
        (empty when *text* is ``None``).
    """
    global keyprocessor, nlpbif

    if keyprocessor is None or text is None:
        keyprocessor = load_gbif_categories()
        nlpbif = en_core_sci_sm.load()
        print("- Keyprocessor and nlp, loaded")

    if text is None:
        return []

    doc = nlpbif(text)
    print("nb entities:", len(doc.ents))

    # Flatten every entity into its non-empty, lower-cased words.
    words = [
        word
        for entity in doc.ents
        for word in str(entity).lower().split(" ")
        if word
    ]

    # Each match looks like ('pterocladiophilaceae', 735, 755).
    matches = keyprocessor.extract_keywords(" ".join(words), span_info=True)

    unique_names = []
    for match in matches:
        name = match[0]
        if name not in unique_names:
            unique_names.append(name)
    return unique_names
def search_canonicalName(
        text=None,
        gbif_extract_file="../data/gbif_extract_canonicalName_short.csv"):
    """Look up the words of detected entities in the GBIF canonical-name table.

    The module-level ``df_canonicalName`` table and ``nlpbif`` model are
    (re)loaded when the table is still ``None`` or when no text is given.

    Args:
        text: Text to analyse; ``None`` only triggers the loading step.
        gbif_extract_file: Path of the ``;``-separated CSV read into
            ``df_canonicalName``.

    Returns:
        A list of ``{"name": word, "key": tab_key}`` dicts, one per entity
        word found in the ``canonicalName_word`` column, or ``None`` when
        *text* is ``None``.
    """
    global df_canonicalName, nlpbif

    needs_loading = df_canonicalName is None or text is None
    if needs_loading:
        df_canonicalName = pd.read_csv(gbif_extract_file, sep=";")
        columns = list(df_canonicalName.columns)
        print(gbif_extract_file, ", loaded", df_canonicalName.shape, columns)
        print(df_canonicalName.shape)
        nlpbif = en_core_sci_sm.load()

    if text is None or df_canonicalName is None:
        return

    matches = []
    doc = nlpbif(text)
    print("nb entities:", len(doc.ents))
    for entity in doc.ents:
        for word in str(entity).lower().split(" "):
            print(entity, "-", word)
            rows = df_canonicalName[df_canonicalName["canonicalName_word"] == word]
            if len(rows) == 0:
                continue
            # Only the first matching row's key is reported.
            matches.append({"name": word, "key": rows["tab_key"].values[0]})
    return matches
def __init__(self, add_id, add_section, all_lines):
    """Compile the annotation regexes, build the lookup tables and load the model.

    Args:
        add_id: Stored unchanged as ``self.add_id``.
        add_section: Stored unchanged as ``self.add_section``.
        all_lines: Stored unchanged as ``self.all_lines``.
    """
    # An annotation that opens and closes on one line: text<<<<span>>>>##[label]text
    self.single_annotation = re.compile(r'(.*?)<<<<(.*?)>>>>##\[(.*?)\](.*?)')
    # A line holding only the opening marker "<<<<".
    self.sequence_annotation_start = re.compile(r'(.*?)<<<<(.*)')
    # A line holding the closing marker ">>>>##[label]".
    self.sequence_annotation_end = re.compile(r'(.*?)>>>>##\[(.*?)\](.*?)')
    # The complete "<<<<span>>>>##[label]" markup with no surrounding context groups.
    self.replace = re.compile(r'<<<<(.*?)>>>>##\[(.*?)\]')
    # Character/string replacements; punctuation entries pad the symbol with spaces.
    # NOTE(review): some keys appear as empty strings or map a string to itself
    # (e.g. u'ff' -> 'ff'); presumably non-ASCII characters were lost somewhere
    # along the way — verify these keys against the original file.
    self.SUBSTITUTIONS = {
        u'ff': 'ff', u'fi': 'fi', u'fl': 'fl', u'“': "``", u'”': "''",
        u'⫺': "-", u'−' : "-", u"…" : "...", u"⫽" : "=", u'~' : "(",
        u"!" : ")", u'共' : "(", u'兲' : ')', u'' : '-', u'' : '',
        u'': " ", u'¼' : '=', u'1⁄4' : '=', u';' : ' ; ', u'.' : ' . ',
        u':' : ' : ', u',' : ' , ', u')' : ' ) ', u'(' : ' ( ', u'-' : ' - ',
        u'\\' : ' \\ '
    }
    # self.braces = re.compile(r'~(.*?)!')
    # self.charsub_empty = re.compile(r"(['\\\-\(\);])")
    # self.charsub_space = re.compile(r"([\.:,])")
    # self.hyphen = re.compile(r"−")
    # self.equals = re.compile(r"¼")
    # self.equals2 = re.compile(r"1⁄4")
    # Maps lower-case label spellings (including the misspelling 'meterial')
    # to upper-case tag names; the empty label maps to 'INVALID'.
    self.mapper = {'': 'INVALID', 'result': 'RESULT', 'method':'METHOD',
                   'parameters':'PARAMETER', 'parameter':'PARAMETER','parameters l':'PARAMETER',
                   'material':'MATERIAL', 'meterial':'MATERIAL', 'materials':'MATERIAL',
                   'xc':'METHOD', 'code':'CODE', 'structure':'STRUCTURE'}
    self.add_id = add_id
    self.add_section = add_section
    self.all_lines = all_lines
    # The scispaCy small model is loaded once per instance.
    self.nlp = en_core_sci_sm.load()
from pprint import pprint
import itertools
import pytest
import json
import en_core_web_sm
import en_core_sci_sm
from spacy.tokens import Token
from role_pattern_nlp import RolePatternBuilder, RolePatternSet
from role_pattern_nlp.exceptions import FeaturesNotInFeatureDictError
from role_pattern_nlp import util
import visualise_spacy_tree

# Short local alias for the helper in role_pattern_nlp.util.
idxs_to_tokens = util.idxs_to_tokens

# Module-level pipeline, loaded once at import time. The general-English
# model is kept below as a commented-out alternative.
# nlp = en_core_web_sm.load()
nlp = en_core_sci_sm.load()

# Register two custom Token attributes, both defaulting to False.
Token.set_extension('valence', default=False)
Token.set_extension('has_valence', default=False)

# Sample sentences — presumably inputs for the tests in this module
# (pytest is imported above); confirm against their usage.
text1 = 'We introduce efficient methods for fitting Boolean models to molecular data, successfully demonstrating their application to synthetic time courses generated by a number of established clock models, as well as experimental expression levels measured using luciferase imaging.'
text2 = 'The amyloid-beta oligomer hypothesis was introduced in 1998.'
text3 = 'L-theanine alone improved self-reported relaxation, tension, and calmness starting at 200 mg.'
text4 = 'These include maintaining a consistent bedtime routine, establishing healthy eating habits and exercise, avoiding caffeine and other substances that can exacerbate RLS, and stretching before bedtime.'
text5 = 'Smoking and heavy alcohol consumption were associated with increased risks.'
text6 = 'In both CC and AA adults, greater adherence to a Prudent dietary pattern was associated with better cognitive outcomes.'
text7 = 'However, expectancy and the related psychological permutations that are associated with oral CAF ingestion are generally not considered in most experimental designs and these could be important in understanding if/how CAF elicits an ergogenic effect.'
base = max_sc**(1 / top_val) return base def log_top_scale(arr_per_topic, top_val=9): base = base2scale(arr_per_topic, top_val) return np.array([scale_score(score, base) for score in arr_per_topic]) ###################################################### ####################### CORPUS ####################### ###################################################### # # SciSpacy model to tokenize text print("-------- Loading scispacy en_core_sci_sm model --------") nlp = en_core_sci_sm.load(disable=['ner', 'tagger']) nlp.max_length = 2000000 # # Corpus print("-------- Building corpus --------") df_docs.title = df_docs.title.fillna("") df_docs.abstract = df_docs.abstract.fillna("") df_docs.fulltext = df_docs.fulltext.fillna("") corpus_list = [] name_corpus_list = [] if options.fulltext: fulltext_corpus = df_docs.fulltext.to_list() corpus_list.append(fulltext_corpus) name_corpus_list.append("fulltext") if options.abstract:
) # Collecting the common stop_words (in English language) imported from spacy.lang.en.stop_words # Extra stop_words which frequently appear in medical articles custom_stop_words = [ 'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'Elsevier', 'PMC', 'CZI', 'www' ] # Appending the extra stop words to the eng language stop words for w in custom_stop_words: if w not in stopwords: stopwords.append(w) # Parser for parsing the text in the article parser = en_core_sci_sm.load( disable=["tagger", "ner"]) # Loading the parse from en_core_sci_sm package parser.max_length = 3000000 ## Helper function to tokenize the full text in an article def spacy_tokenizer(text): all_tokens = parser(text) ## Parse the article using parser defined above lem_tokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in all_tokens ] ##Lemmatization filtered_tokens = [ word for word in lem_tokens if word not in stopwords and word not in punctuations ] ## Filtering stop words and punctuations tokens = [token for token in filtered_tokens]
import gensim, spacy, warnings import gensim.corpora as corpora from gensim.utils import lemmatize, simple_preprocess from gensim.models import TfidfModel from gensim import corpora, models, similarities warnings.filterwarnings("ignore", category=DeprecationWarning) # Plotting tools import pyLDAvis import pyLDAvis.gensim #pip install scispacy # pip install <Model URL> : pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_md-0.2.5.tar.gz # For the parser, we will use en_core_sci_lg. This is a model for processing biomedical, scientific or clinical text. import en_core_sci_sm nlp = en_core_sci_sm.load(disable=["tagger", "ner"]) # Initialize the stopwords stop_words_ = stop_words.get_stop_words('fr') stop_words_nltk = stopwords.words('english') custom_stop_words = [] stopwords = list( set( list(stop_words_nltk) + stop_words_ + list(stop_words_spacy) + custom_stop_words)) # Initialize ponctuation punctuations = string.punctuation + "«" + "»" + "’" + '—' def find_AAV_terms(text):
# explore__text_byScispacy(text=None): exploratory demo of scispaCy on one text.
#
# When `text` is None a built-in sample abstract is used. The function then:
#   1. prints the input via the global pretty-printer `pp`;
#   2. loads `en_core_sci_sm`, runs it on the text and prints the elapsed time;
#   3. prints the number of entities, each entity, and a {entity text: label} dict;
#   4. adds an `AbbreviationDetector` pipe to the same pipeline, re-runs it
#      (timed again) and prints each abbreviation with its start/end offsets
#      and long form;
#   5. calls `displacy.render(doc, jupyter=True, style="ent")` and prints the result.
# It has no explicit return statement.
#
# NOTE(review): three `text = ...` assignments follow one another; if they all sit
# under `if text is None`, only the last sample is ever used — confirm.
# NOTE(review): the sample strings use backslash line continuations whose exact
# whitespace cannot be recovered from this view, so the code below is left
# untouched — verify against the original file before reformatting.
def explore__text_byScispacy(text=None): global pp if text is None: text = "Spinal and bulbar muscular atrophy (SBMA) is an \ inherited motor neuron disease caused by the expansion \ of a polyglutamine tract within the androgen receptor (AR). \ SBMA can be caused by this easily." text = "Passive acoustic monitoring is an efficient way to provide insights \ on the ecology of large whales. This approach allows for long-term and \ species-specific monitoring over large areas. In this study, \ we examined six years (2010 to 2015) of continuous acoustic \ recordings at up to seven different locations in the Central and Southern Indian Basin to assess the peak periods of presence, seasonality and migration movements of Antarctic blue whales (Balaenoptera musculus intermedia). An automated method is used to detect the Antarctic blue whale stereotyped call, known as Z-call. Detection results are analyzed in terms of distribution, seasonal presence and diel pattern of emission at each site. Z-calls are detected year-round at each site, except for one located in the equatorial Indian Ocean, and display highly seasonal distribution. This seasonality is stable across years for every site, but varies between sites. Z-calls are mainly detected during autumn and spring at the subantarctic locations, suggesting that these sites are on the Antarctic blue whale migration routes, and mostly during winter at the subtropical sites. In addition to these seasonal trends, there is a significant diel pattern in Z-call emission, with more Z-calls in daytime than in nighttime. This diel pattern may be related to the blue whale feeding ecology." text = "Distribution and movement patterns of Antarctic blue whales Balaenoptera \ musculus intermedia at large temporal and spatial scales are still \ poorly understood. 
The objective of this study was \ to explore spatio-temporal distribution patterns of\ Antarctic blue whales in the Atlantic sector of the Southern Ocean,\ using passive acoustic monitoring data. Multi-year dat\ a were collected between 2008 and 2013 by\ 11 recorders deployed in the Weddell Sea and along \ the Greenwich meridian. \ Antarctic blue whale Z-calls were detected via spectrogram cross-correlation.\ A Blue Whale Index was developed to quantify the proportion of time \ during which acoustic energy from Antarctic blue whales dominated\ over background noise. Our results show that Antarctic \ blue whales were acoustically present year-round, \ with most call detections between January and April.\ During austral summer, the number of detected calls \ peaked synchronously throughout the study area in most\ years, and hence, no directed meridional movement \ pattern was detectable. During austral winter,\ vocalizations were recorded at latitudes \ as high as 69°S, with sea ice cover exceeding 90%,\ suggesting that some Antarctic blue whales overwinter\ in Antarctic waters. Polynyas likely \ serve as an important habitat for baleen whales during\ austral winter, providing food and reliable access \ to open water for breathing. Overall, our results \ support increasing evidence of a complex \ and non-obligatory migratory behavior of Antarctic blue whales,\ potentially involving temporally and spatially dynamic \ migration routes and destinations, as well as variable \ timing of migration to and from the feeding grounds." 
print("Input text:") pp.pprint(text) print("- load model: en_core_sci_sm") nlp = en_core_sci_sm.load() ts = time.time() doc = nlp(text) print(time.time() - ts) print("- returned object type: <class 'spacy.tokens.doc.Doc'>") print(type(doc)) # print(list(doc.sents)) print("nb entities:", len(doc.ents)) for x in doc.ents: print(str(x)) ent_bc = {} for x in doc.ents: ent_bc[x.text] = x.label_ pp.pprint(ent_bc) # trans_df = pd.DataFrame(table) # Add the abbreviation pipe to the spacy pipeline. abbreviation_pipe = AbbreviationDetector(nlp) nlp.add_pipe(abbreviation_pipe) print("Abbreviations FOUND", "\t", "Definition") ts = time.time() doc = nlp(text) print(time.time() - ts) for abrv in doc._.abbreviations: print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form}") displacy_image = displacy.render(doc, jupyter=True, style="ent") print(displacy_image)
def InitSciSpacy():
    """Load and return a fresh ``en_core_sci_sm`` pipeline with default settings."""
    return en_core_sci_sm.load()
nlp = en_core_sci_sm.load(disable=["tagger", "parser"]) else: logger.info('Negation model could not be loaded\n') nlp = None if nlp: for not_a_stop in remove_from_stops.split(" "): nlp.vocab[not_a_stop].is_stop = False nlp.vocab[not_a_stop.capitalize()].is_stop = False return nlp try: import en_core_sci_sm nlp_sans_ner = en_core_sci_sm.load( disable=["tagger", "parser", "ner", "lemmatizer"]) logger.info('Using scispaCy language model\n') except ModuleNotFoundError: rl = download_model( "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz" ) if rl == 0: import en_core_sci_sm nlp_sans_ner = en_core_sci_sm.load(disable=["tagger", "parser", "ner"]) logger.info('Using scispaCy language model\n') else: logger.info('scispaCy language model could not be loaded\n') logger.info( 'Performing a one-time download of an English language model\n') from spacy.cli import download
def get_label_list(input_summary, max_length=max_length): """ get list of labels based on summary string """ wv = get_words_from_text(input_summary, max_length) new_preds = model.predict(np.reshape(wv, list(wv.shape)), batch_size=1) predicted_labels = np.array([vec_label(i) for i in new_preds[0]]) return predicted_labels # nlp_reader is pickled bc it takes forever nlp_reader = en_core_sci_sm.load() cui_linker = UmlsEntityLinker(resolve_abbreviations=True) nlp_reader.add_pipe(cui_linker) def get_summary_scispacy(input_summary, *args): tagged_sent = nlp_reader(input_summary["original"]) # BAD_STRING = """<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">ENTITY</span>\n</mark>\n""" #replace all BAD_Strings with a link # tagged_sent.replace(BAD_STRING, "") #find all mark tags, add class tooltipped, and add cui code # tooltipped" data-position="bottom" data-tooltip="I am a tooltip" #or we could deconstruct the output, reconstruct it. want to add tooltip showing cui