Example #1
def nlp_model(negation_language="en"):
    try:
        import en_core_sci_sm
        nlp = en_core_sci_sm.load(disable=["tagger", "parser", "lemmatizer"])
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        negex = Negex(nlp, language=negation_language, chunk_prefix=["no"])
        nlp.add_pipe(negex, last=True)
        Token.set_extension('negex', default=False, force=True)

    except ModuleNotFoundError:
        rl = download_model(
            "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz"
        )
        if rl == 0:
            import en_core_sci_sm
            nlp = en_core_sci_sm.load(disable=["tagger", "parser"])
        else:
            logger.info('Negation model could not be loaded\n')
            nlp = None

    if nlp:
        for not_a_stop in remove_from_stops.split(" "):
            nlp.vocab[not_a_stop].is_stop = False
            nlp.vocab[not_a_stop.capitalize()].is_stop = False

    return nlp
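A minimal usage sketch (not part of the original snippet): it assumes the enclosing module provides the names used inside nlp_model (Negex from negspacy, Token from spacy.tokens, download_model, logger, remove_from_stops) and that negspacy flags each entity span with a boolean ._.negex attribute.

# Hypothetical call; the input sentence is made up.
nlp = nlp_model(negation_language="en")
if nlp is not None:
    doc = nlp("There is no evidence of pulmonary embolism.")
    for ent in doc.ents:
        # negspacy marks negated entities, e.g. ("pulmonary embolism", True)
        print(ent.text, ent._.negex)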
Example #2
def search_canonicalNameFlesh(text=None):
    global keyprocessor
    global nlpbif

    resp = []
    if keyprocessor is None or text is None:
        keyprocessor = load_gbif_categories()
        nlpbif = en_core_sci_sm.load()
        print("- Keyprocessor and nlp, loaded")

    if text is not None:
        doc = nlpbif(text)
        print("nb entities:", len(doc.ents))
        data = []
        for x in doc.ents:
            x = str(x).lower()
            tab = x.split(" ")
            for y in tab:
                if len(y) > 0:
                    data.append(y)
        keywords_found = keyprocessor.extract_keywords(" ".join(data),
                                                       span_info=True)
        if len(keywords_found) > 0:
            for k in keywords_found:  # ('pterocladiophilaceae', 735, 755)]
                if k[0] not in resp:
                    resp.append(k[0])
    return resp
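A hedged usage sketch: load_gbif_categories() and the flashtext-style keyprocessor are defined in the enclosing module and are assumptions here, as is the sample sentence.

# Hypothetical calls of the function above.
search_canonicalNameFlesh()  # a first call without text only primes the keyprocessor and the nlp model
names = search_canonicalNameFlesh("Gelidium corneum is a red alga described from the North Atlantic.")
print(names)  # list of lower-cased canonical-name keywords found among the entities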
Example #3
def search_canonicalName(
        text=None,
        gbif_extract_file="../data/gbif_extract_canonicalName_short.csv"):
    """ search on gbif canonical names """
    global df_canonicalName, nlpbif
    if df_canonicalName is None or text is None:
        df_canonicalName = pd.read_csv(gbif_extract_file, sep=";")
        print(
            gbif_extract_file,
            ", loaded",
            df_canonicalName.shape,
            list(df_canonicalName.columns),
        )
        print(df_canonicalName.shape)
        nlpbif = en_core_sci_sm.load()

    if text is not None and df_canonicalName is not None:
        resp = []
        doc = nlpbif(text)
        print("nb entities:", len(doc.ents))
        for x in doc.ents:
            # print(x)
            tabw = str(x).lower().split(" ")
            for w in tabw:
                print(x, "-", w)
                df = df_canonicalName[df_canonicalName["canonicalName_word"] ==
                                      w]
                # print(df)
                if len(df) > 0:
                    resp.append({"name": w, "key": df["tab_key"].values[0]})
        return resp
    return
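A hedged usage sketch; the CSV path and its columns (canonicalName_word, tab_key) are taken from the function above, while the input sentence is made up.

# Hypothetical call of the function above.
matches = search_canonicalName("Sightings of Balaenoptera musculus intermedia were recorded near the ice edge.")
if matches:
    for m in matches:
        print(m["name"], "->", m["key"])  # matched word and its GBIF key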
Example #4
    def __init__(self, add_id, add_section, all_lines):

        self.single_annotation = re.compile(r'(.*?)<<<<(.*?)>>>>##\[(.*?)\](.*?)')
        self.sequence_annotation_start = re.compile(r'(.*?)<<<<(.*)')
        self.sequence_annotation_end = re.compile(r'(.*?)>>>>##\[(.*?)\](.*?)')
        
        self.replace = re.compile(r'<<<<(.*?)>>>>##\[(.*?)\]')

        self.SUBSTITUTIONS = {
            u'ﬀ': 'ff',
            u'ﬁ': 'fi',
            u'ﬂ': 'fl',
            u'“': "``",
            u'”': "''",
            u'⫺': "-",
            u'−' : "-",
            u"…" : "...",
            u"⫽" : "=",
            u'~' : "(",
            u"!" : ")",
            u'共' : "(",
            u'兲' : ')',
            u'' : '-',
            u'' : '',
            u'': " ",
            u'¼' : '=',
            u'1⁄4' : '=',
            u';' : ' ; ',
            u'.' : ' . ',
            u':' : ' : ',
            u',' : ' , ',
            u')' : ' ) ',
            u'(' : ' ( ',
            u'-' : ' - ',
            u'\\' : ' \\ '
        }


        # self.braces = re.compile(r'~(.*?)!')
        # self.charsub_empty = re.compile(r"(['\\\-\(\);])")
        # self.charsub_space = re.compile(r"([\.:,])")
        # self.hyphen = re.compile(r"−")
        # self.equals = re.compile(r"¼")
        # self.equals2 = re.compile(r"1⁄4")
        
        self.mapper = {'': 'INVALID', 'result': 'RESULT', 'method': 'METHOD',
                       'parameters': 'PARAMETER', 'parameter': 'PARAMETER', 'parameters l': 'PARAMETER',
                       'material': 'MATERIAL', 'meterial': 'MATERIAL', 'materials': 'MATERIAL',
                       'xc': 'METHOD', 'code': 'CODE', 'structure': 'STRUCTURE'}

        self.add_id = add_id
        self.add_section = add_section
        self.all_lines = all_lines

        self.nlp = en_core_sci_sm.load()
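To make the patterns above concrete, here is a small standalone sketch (the names reused from __init__ are copied for illustration only): the replace regex strips the <<<<...>>>>##[label] annotation markup while keeping the annotated text, and SUBSTITUTIONS normalizes PDF ligatures and odd glyphs.

import re

# Hypothetical standalone illustration of the patterns defined in __init__ above.
replace = re.compile(r'<<<<(.*?)>>>>##\[(.*?)\]')
SUBSTITUTIONS = {u'ﬁ': 'fi', u'−': '-', u'“': '``', u'”': "''"}

line = 'The lattice parameter was <<<<3.57 Angstrom>>>>##[parameters] at the ﬁrst step.'
clean = replace.sub(r'\1', line)        # keep the annotated text, drop the markup
for src, dst in SUBSTITUTIONS.items():  # normalize ligatures and odd glyphs
    clean = clean.replace(src, dst)
print(clean)  # -> 'The lattice parameter was 3.57 Angstrom at the first step.'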
Example #5
from pprint import pprint
import itertools
import pytest
import json
import en_core_web_sm
import en_core_sci_sm
from spacy.tokens import Token
from role_pattern_nlp import RolePatternBuilder, RolePatternSet
from role_pattern_nlp.exceptions import FeaturesNotInFeatureDictError
from role_pattern_nlp import util
import visualise_spacy_tree

idxs_to_tokens = util.idxs_to_tokens
# nlp = en_core_web_sm.load()
nlp = en_core_sci_sm.load()
Token.set_extension('valence', default=False)
Token.set_extension('has_valence', default=False)

text1 = 'We introduce efficient methods for fitting Boolean models to molecular data, successfully demonstrating their application to synthetic time courses generated by a number of established clock models, as well as experimental expression levels measured using luciferase imaging.'

text2 = 'The amyloid-beta oligomer hypothesis was introduced in 1998.'

text3 = 'L-theanine alone improved self-reported relaxation, tension, and calmness starting at 200 mg.'

text4 = 'These include maintaining a consistent bedtime routine, establishing healthy eating habits and exercise, avoiding caffeine and other substances that can exacerbate RLS, and stretching before bedtime.'

text5 = 'Smoking and heavy alcohol consumption were associated with increased risks.'

text6 = 'In both CC and AA adults, greater adherence to a Prudent dietary pattern was associated with better cognitive outcomes.'

text7 = 'However, expectancy and the related psychological permutations that are associated with oral CAF ingestion are generally not considered in most experimental designs and these could be important in understanding if/how CAF elicits an ergogenic effect.'
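A quick sanity check of the fixture above (a sketch, not part of the original test module): parse one of the sample sentences and inspect the Token extensions registered earlier.

# Hypothetical check; 'valence' defaults to False until a pattern sets it.
doc = nlp(text2)
print([(token.text, token._.valence) for token in doc])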
Example #6
    base = max_sc**(1 / top_val)
    return base


def log_top_scale(arr_per_topic, top_val=9):
    base = base2scale(arr_per_topic, top_val)
    return np.array([scale_score(score, base) for score in arr_per_topic])
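scale_score is not shown in this fragment; if it is a plain logarithm with the base computed above, the effect is to map the maximum raw score to top_val. A hedged standalone sketch with made-up stand-ins for the missing helpers:

import math
import numpy as np

def base2scale_sketch(arr_per_topic, top_val=9):
    # assumed: the base is derived from the maximum score, as in the fragment above
    return max(arr_per_topic) ** (1 / top_val)

def scale_score_sketch(score, base):
    # assumed: logarithm with that base
    return math.log(score, base)

scores = [1, 3, 81]
base = base2scale_sketch(scores, top_val=4)  # 81 ** (1/4) == 3
print(np.array([scale_score_sketch(s, base) for s in scores]))  # [0. 1. 4.] -> max score maps to top_val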


######################################################
####################### CORPUS #######################
######################################################

# # SciSpacy model to tokenize text
print("-------- Loading scispacy en_core_sci_sm model --------")
nlp = en_core_sci_sm.load(disable=['ner', 'tagger'])
nlp.max_length = 2000000

# # Corpus
print("-------- Building corpus --------")
df_docs.title = df_docs.title.fillna("")
df_docs.abstract = df_docs.abstract.fillna("")
df_docs.fulltext = df_docs.fulltext.fillna("")

corpus_list = []
name_corpus_list = []
if options.fulltext:
    fulltext_corpus = df_docs.fulltext.to_list()
    corpus_list.append(fulltext_corpus)
    name_corpus_list.append("fulltext")
if options.abstract:
Example #7
)  # Collect the common English stop words imported from spacy.lang.en.stop_words

# Extra stop_words which frequently appear in medical articles
custom_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et',
    'al', 'author', 'figure', 'rights', 'reserved', 'permission', 'used',
    'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'Elsevier',
    'PMC', 'CZI', 'www'
]
# Append the extra stop words to the English stop word list
for w in custom_stop_words:
    if w not in stopwords:
        stopwords.append(w)

# Parser for parsing the text in the article
parser = en_core_sci_sm.load(
    disable=["tagger", "ner"])  # Loading the parse from en_core_sci_sm package
parser.max_length = 3000000


## Helper function to tokenize the full text in an article
def spacy_tokenizer(text):
    all_tokens = parser(text)  ## Parse the article using parser defined above
    lem_tokens = [
        word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_
        for word in all_tokens
    ]  ##Lemmatization
    filtered_tokens = [
        word for word in lem_tokens
        if word not in stopwords and word not in punctuations
    ]  ## Filtering stop words and punctuations
    tokens = [token for token in filtered_tokens]
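The snippet above is cut off by the listing; assuming the helper finishes by returning tokens, it could be used like this (the sample sentence is made up):

# Hypothetical call, assuming spacy_tokenizer() ends with `return tokens`.
print(spacy_tokenizer("The spike glycoprotein binds to the ACE2 receptor on host cells."))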
Example #8
import gensim, spacy, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import TfidfModel
from gensim import corpora, models, similarities
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim

#pip install scispacy
# pip install <Model URL> :  pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_md-0.2.5.tar.gz
# For the parser, we use en_core_sci_sm, a model for processing biomedical, scientific or clinical text.
import en_core_sci_sm
nlp = en_core_sci_sm.load(disable=["tagger", "ner"])

# Initialize the stopwords
stop_words_ = stop_words.get_stop_words('fr')
stop_words_nltk = stopwords.words('english')
custom_stop_words = []
stopwords = list(
    set(
        list(stop_words_nltk) + stop_words_ + list(stop_words_spacy) +
        custom_stop_words))

# Initialize punctuation
punctuations = string.punctuation + "«" + "»" + "’" + '—'


def find_AAV_terms(text):
Example #9
def explore__text_byScispacy(text=None):
    global pp

    if text is None:
        text = "Spinal and bulbar muscular atrophy (SBMA) is an \
                   inherited motor neuron disease caused by the expansion \
                   of a polyglutamine tract within the androgen receptor (AR). \
                   SBMA can be caused by this easily."

    text = "Passive acoustic monitoring is an efficient way to provide insights \
            on the ecology of large whales. This approach allows for long-term and \
            species-specific monitoring over large areas. In this study, \
            we examined six years (2010 to 2015) of continuous acoustic \
            recordings at up to seven different locations in the Central and Southern Indian Basin to assess the peak periods of presence, seasonality and migration movements of Antarctic blue whales (Balaenoptera musculus intermedia). An automated method is used to detect the Antarctic blue whale stereotyped call, known as Z-call. Detection results are analyzed in terms of distribution, seasonal presence and diel pattern of emission at each site. Z-calls are detected year-round at each site, except for one located in the equatorial Indian Ocean, and display highly seasonal distribution. This seasonality is stable across years for every site, but varies between sites. Z-calls are mainly detected during autumn and spring at the subantarctic locations, suggesting that these sites are on the Antarctic blue whale migration routes, and mostly during winter at the subtropical sites. In addition to these seasonal trends, there is a significant diel pattern in Z-call emission, with more Z-calls in daytime than in nighttime. This diel pattern may be related to the blue whale feeding ecology."

    text = "Distribution and movement patterns of Antarctic blue whales Balaenoptera \
        musculus intermedia at large temporal and spatial scales are still \
        poorly understood. The objective of this study was \
        to explore spatio-temporal distribution patterns of\
        Antarctic blue whales in the Atlantic sector of the Southern Ocean,\
        using passive acoustic monitoring data. Multi-year dat\
        a were collected between 2008 and 2013 by\
        11 recorders deployed in the Weddell Sea and along \
        the Greenwich meridian. \
        Antarctic blue whale Z-calls were detected via spectrogram cross-correlation.\
        A Blue Whale Index was developed to quantify the proportion of time \
        during which acoustic energy from Antarctic blue whales dominated\
        over background noise. Our results show that Antarctic \
        blue whales were acoustically present year-round, \
        with most call detections between January and April.\
        During austral summer, the number of detected calls \
        peaked synchronously throughout the study area in most\
        years, and hence, no directed meridional movement \
        pattern was detectable. During austral winter,\
        vocalizations were recorded at latitudes \
        as high as 69°S, with sea ice cover exceeding 90%,\
        suggesting that some Antarctic blue whales overwinter\
        in Antarctic waters. Polynyas likely \
        serve as an important habitat for baleen whales during\
        austral winter, providing food and reliable access \
        to open water for breathing. Overall, our results \
        support increasing evidence of a complex \
        and non-obligatory migratory behavior of Antarctic blue whales,\
        potentially involving temporally and spatially dynamic \
        migration routes and destinations, as well as variable \
        timing of migration to and from the feeding grounds."

    print("Input text:")
    pp.pprint(text)
    print("- load model: en_core_sci_sm")
    nlp = en_core_sci_sm.load()
    ts = time.time()
    doc = nlp(text)
    print(time.time() - ts)
    print("- returned object type: <class 'spacy.tokens.doc.Doc'>")
    print(type(doc))
    # print(list(doc.sents))

    print("nb entities:", len(doc.ents))
    for x in doc.ents:
        print(str(x))
    ent_bc = {}
    for x in doc.ents:
        ent_bc[x.text] = x.label_
    pp.pprint(ent_bc)
    # trans_df = pd.DataFrame(table)

    # Add the abbreviation pipe to the spacy pipeline.
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)
    print("Abbreviations FOUND", "\t", "Definition")
    ts = time.time()
    doc = nlp(text)
    print(time.time() - ts)
    for abrv in doc._.abbreviations:
        print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form}")

    displacy_image = displacy.render(doc, jupyter=True, style="ent")
    print(displacy_image)
Example #10
def InitSciSpacy():
    nlp = en_core_sci_sm.load()
    return nlp
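A minimal call of the wrapper above (the sample sentence is made up):

nlp = InitSciSpacy()
doc = nlp("Myocardial infarction was ruled out by serial troponin testing.")
print([ent.text for ent in doc.ents])  # en_core_sci_sm tags broad biomedical entity spans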
Example #11
            nlp = en_core_sci_sm.load(disable=["tagger", "parser"])
        else:
            logger.info('Negation model could not be loaded\n')
            nlp = None

    if nlp:
        for not_a_stop in remove_from_stops.split(" "):
            nlp.vocab[not_a_stop].is_stop = False
            nlp.vocab[not_a_stop.capitalize()].is_stop = False

    return nlp


try:
    import en_core_sci_sm
    nlp_sans_ner = en_core_sci_sm.load(
        disable=["tagger", "parser", "ner", "lemmatizer"])
    logger.info('Using scispaCy language model\n')

except ModuleNotFoundError:
    rl = download_model(
        "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz"
    )
    if rl == 0:
        import en_core_sci_sm
        nlp_sans_ner = en_core_sci_sm.load(disable=["tagger", "parser", "ner"])
        logger.info('Using scispaCy language model\n')
    else:
        logger.info('scispaCy language model could not be loaded\n')
        logger.info(
            'Performing a one-time download of an English language model\n')
        from spacy.cli import download
Example #12
def get_label_list(input_summary, max_length=max_length):
    """
	get list of labels based on summary string
	"""
    wv = get_words_from_text(input_summary, max_length)

    new_preds = model.predict(np.reshape(wv, list(wv.shape)), batch_size=1)

    predicted_labels = np.array([vec_label(i) for i in new_preds[0]])

    return predicted_labels


# nlp_reader is pickled because it takes forever to load

nlp_reader = en_core_sci_sm.load()
cui_linker = UmlsEntityLinker(resolve_abbreviations=True)
nlp_reader.add_pipe(cui_linker)
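A hedged peek at the linked entities (not part of the original file): this assumes the older scispacy UmlsEntityLinker API, in which each entity carries candidate (CUI, score) pairs on ent._.umls_ents; the sample sentence is made up.

# Hypothetical usage of the pipeline built above.
doc = nlp_reader("Abdominal pain and fever were reported on admission.")
for ent in doc.ents:
    for cui, score in ent._.umls_ents[:1]:  # top-ranked UMLS candidate, if any
        print(ent.text, cui, round(score, 3))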


def get_summary_scispacy(input_summary, *args):
    tagged_sent = nlp_reader(input_summary["original"])
    # BAD_STRING = """<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">ENTITY</span>\n</mark>\n"""

    # Replace all BAD_STRINGs with a link
    # tagged_sent.replace(BAD_STRING, "")

    # Find all mark tags, add the class "tooltipped", and add the CUI code
    # tooltipped" data-position="bottom" data-tooltip="I am a tooltip"

    # Or we could deconstruct the output and reconstruct it; we want to add a tooltip showing the CUI