def load_final_pipe():
    import en_core_sci_lg

    # Load the scispacy model (its return value is not kept here), then return the
    # pickled pipeline object; load_object and FINAL_PIPE_FILE_PATH are defined
    # elsewhere in the project.
    en_core_sci_lg.load()
    return load_object(FINAL_PIPE_FILE_PATH)
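load_object and FINAL_PIPE_FILE_PATH come from elsewhere in the source project. A minimal sketch of what they might look like, assuming the pipeline is stored as a pickle (names kept from the snippet, implementation hypothetical):

import pickle

FINAL_PIPE_FILE_PATH = "final_pipe.pkl"  # hypothetical path

def load_object(path):
    # Deserialize a previously saved object (e.g. a fitted pipeline) from disk.
    with open(path, "rb") as f:
        return pickle.load(f)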
Example 2
def dependency_parser_visualizer(text):
    import en_core_sci_lg
    from spacy import displacy

    nlp = en_core_sci_lg.load()
    doc = nlp(text)

    # Sentence segmentation.
    print(list(doc.sents))

    # Examine the entities extracted by the mention detector.
    print(doc.ents)

    # Render the dependency parse of the first sentence (inline in a notebook).
    displacy.render(next(doc.sents), style='dep', jupyter=True)
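A possible call, assuming a Jupyter notebook since displacy renders inline with jupyter=True; the sentence is illustrative only:

# Illustrative usage in a notebook cell:
dependency_parser_visualizer(
    "Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron disease.")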
    def __init__(self, data_dir: str):
        '''Initializes a CORD-19 data preprocessing class
        
        Args:
            data_dir: Raw data directory
        '''
        self.data_dir = data_dir

        # Initialize NLP model
        self.nlp = en_core_sci_lg.load(disable=["tagger", "ner"])
        self.nlp.max_length = 2000000
        self.nlp.add_pipe(LanguageDetector(),
                          name='language_detector',
                          last=True)
        self.nlp_words_to_check = 100
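The snippet above assumes LanguageDetector and the model are imported at module level; with the spaCy 2.x add_pipe(component) style used here, the imports would typically be:

# Assumed module-level imports for the snippet above (spaCy 2.x pipeline API):
import en_core_sci_lg
from spacy_langdetect import LanguageDetector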
Example 4
def similarize_tokens(words, df, feature):
    nlp = en_core_sci_lg.load()
    processed_text = nlp(df[feature])
    word_tokens = nlp(' '.join(words))
    similarities = []

    for token1 in word_tokens:
        for token2 in processed_text:
            if token1 == token2:
                continue
            # Vector similarity between the query word and the document token.
            similarity = token1.similarity(token2)
            # Keep moderately similar, non-identical pairs.
            if 0.5 < similarity < 1:
                similarities.append(
                    {'token1': token1.text, 'token2': token2.text, 'similarity': similarity})
    return similarities
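A minimal illustrative call, assuming en_core_sci_lg is imported at module level and that df is a single record (a dict or pandas row) whose feature field holds raw text, since nlp() expects a string:

# Hypothetical example: one record standing in for a DataFrame row.
record = {"abstract": "Glucose is metabolized to pyruvate during glycolysis."}
pairs = similarize_tokens(["sugar", "metabolism"], record, "abstract")
print(pairs[:3])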
Example 5
    def __init__(self):
        self.gbif_source_path = (
            "/Users/chloesekkat/Documents/batch8_ceebios/data/simplified_taxon_gbif.csv"
        )
        self.papers_data_dir = (
            "/Users/chloesekkat/Documents/batch8_ceebios/data_open_source"
        )

        self.to_keep = [
            "id",
            "title",
            "paperAbstract",
            "authors",
            "year",
            "fieldsOfStudy",
            "journalName",
            "doiUrl",
        ]
        self.keyword_processor = get_gbif_keyprocessor(self.gbif_source_path)
        self.nlp = en_core_sci_lg.load()
Example 6
import xx_sent_ud_sm
import en_core_sci_lg

nlp_uni = xx_sent_ud_sm.load()
nlp_sci = en_core_sci_lg.load()


# UNIVERSAL
def is_token_allowed_uni(token):
    '''
         Only allow valid tokens which are not stop words
         and punctuation symbols.
    '''
    if not token or not token.text.strip() or token.is_stop or token.is_punct:
        return False
    return True


def preprocesstoken_uni(token):
    # Reduce token to its lowercase lemma form
    return token.lemma_.strip().lower()


def tokenize_uni(x):
    try:
        return str([
            preprocesstoken_uni(token) for token in nlp_uni(x)
            if is_token_allowed_uni(token)
        ])
    except Exception:
        # Fall back to an empty token list if processing fails.
        return str([])
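An illustrative call; as in the original, the preprocessed tokens are returned serialized as a string:

# Illustrative call:
print(tokenize_uni("The spike proteins bind to the ACE2 receptors."))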
"""
This script converts the jsonlines data format to the csv format for annotation
usage: python convert_from_jsonl.py <inpath>
"""
import plac
import jsonlines
from pathlib import Path
import csv
from helperutilz import *
from ekphrasis_preprocess import text_processor
import pandas as pd
import en_core_sci_lg

nlp = en_core_sci_lg.load()


# plac.annotations(inpath=("inpath for ", "positional", "i", Path),
#                  outpath=("outpath for jsonlines for prodigy", "positional", "o", Path),
#                  process=("boolean", "option", "p", bool),
#                  label=("string ", "option", "l", str),
#                  )


def convert(inpath, outpath, process=True, label='fullname'):
    print(f"reading in {inpath}")
    Path(outpath).parent.mkdir(parents=True, exist_ok=True)
    unique_set = set()
    cnt = 0
    kept = 0
import scispacy
import spacy
import en_core_sci_lg
from scipy.spatial.distance import cosine
import joblib
from IPython.display import HTML, display
from ipywidgets import interact, Layout, HBox, VBox, Box
import ipywidgets as widgets
from IPython.display import clear_output
from tqdm import tqdm
from os.path import isfile
import seaborn as sb
import matplotlib.pyplot as plt
from joblib import dump, load

nlp = en_core_sci_lg.load(disable=["tagger", "parser", "ner"])
nlp.max_length = 2000000


def spacy_tokenizer(sentence):
    return [
        word.lemma_ for word in nlp(sentence)
        if not (word.like_num or word.is_stop or word.is_punct or word.is_space
                or len(word) == 1)
    ]
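A quick illustrative call (the scispacy model must be installed); it returns the lemmas of the content-bearing tokens:

# Numbers, stop words, punctuation, whitespace and single characters are dropped.
print(spacy_tokenizer("The spike proteins bind strongly to ACE2 receptors in 2020."))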


def print_top_words(model, vectorizer, n_top_words):
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        message = "\nTopic #%d: " % topic_idx
Example 9
    def preprocess(self, raw_data = DEFAULT_ROOT_PATH, output_file = DEFAULT_OUTPUT_FILE):
        metadata_path = f'{raw_data}/metadata.csv'
        meta_df = pd.read_csv(metadata_path, dtype={
            'pubmed_id': str,
            'Microsoft Academic Paper ID': str, 
            'doi': str
        })
        #print(meta_df.head())

        all_json = glob.glob(f'{raw_data}/pdf_json/**/*.json', recursive=True)
        #print(len(all_json))

        class FileReader:
            def __init__(self, file_path):
                with open(file_path) as file:
                    content = json.load(file)
                    self.paper_id = content['paper_id']
                    self.abstract = []
                    self.body_text = []
                    # Abstract
                    for entry in content['abstract']:
                        self.abstract.append(entry['text'])
                    # Body text
                    for entry in content['body_text']:
                        self.body_text.append(entry['text'])
                    self.abstract = '\n'.join(self.abstract)
                    self.body_text = '\n'.join(self.body_text)
            def __repr__(self):
                return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'

        first_row = FileReader(all_json[0])

        def get_breaks(content, length):
            data = ""
            words = content.split(' ')
            total_chars = 0

            # add break every length characters
            for i in range(len(words)):
                total_chars += len(words[i])
                if total_chars > length:
                    data = data + "<br>" + words[i]
                    total_chars = 0
                else:
                    data = data + " " + words[i]
            return data
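        # Note: get_breaks inserts an HTML <br> tag roughly every `length` characters so
        # that long strings (abstract summaries, authors, titles) wrap when shown in a plot.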


        dict_ = {'paper_id': [], 'doi':[], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}
        for idx, entry in enumerate(all_json):
            if idx % (len(all_json) // 10) == 0:
                print(f'Processing index: {idx} of {len(all_json)}')

            try:
                content = FileReader(entry)
            except Exception as e:
                continue  # invalid paper format, skip

            # get metadata information
            meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
            # no metadata, skip this paper
            if len(meta_data) == 0:
                continue

            dict_['abstract'].append(content.abstract)
            dict_['paper_id'].append(content.paper_id)
            dict_['body_text'].append(content.body_text)

            # also create a column for the summary of abstract to be used in a plot
            if len(content.abstract) == 0:
                # no abstract provided
                dict_['abstract_summary'].append("Not provided.")
            elif len(content.abstract.split(' ')) > 100:
                # abstract provided is too long for plot, take first 100 words append with ...
                info = content.abstract.split(' ')[:100]
                summary = get_breaks(' '.join(info), 40)
                dict_['abstract_summary'].append(summary + "...")
            else:
                # abstract is short enough
                summary = get_breaks(content.abstract, 40)
                dict_['abstract_summary'].append(summary)

            try:
                # if more than one author
                authors = meta_data['authors'].values[0].split(';')
                if len(authors) > 2:
                    # if more than 2 authors, take them all with html tag breaks in between
                    dict_['authors'].append(get_breaks('. '.join(authors), 40))
                else:
                    # authors will fit in plot
                    dict_['authors'].append(". ".join(authors))
            except Exception as e:
                # if only one author - or null value
                dict_['authors'].append(meta_data['authors'].values[0])

            # add the title information, add breaks when needed
            try:
                title = get_breaks(meta_data['title'].values[0], 40)
                dict_['title'].append(title)
            # if title was not provided
            except Exception as e:
                dict_['title'].append(meta_data['title'].values[0])

            # add the journal information
            dict_['journal'].append(meta_data['journal'].values[0])

            # add doi
            dict_['doi'].append(meta_data['doi'].values[0])

        df_covid = pd.DataFrame(dict_, columns=['paper_id', 'doi', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])
        df_covid.head()
        #df_covid.to_csv("/data/jilin/4225proj/df_covid.csv", index = False)


        df_covid['abstract_word_count'] = df_covid['abstract'].apply(lambda x: len(x.strip().split()))  # word count in abstract
        df_covid['body_word_count'] = df_covid['body_text'].apply(lambda x: len(x.strip().split()))  # word count in body
        df_covid['body_unique_words']=df_covid['body_text'].apply(lambda x:len(set(str(x).split())))  # number of unique words in body
        df_covid.head()

        #df_covid['abstract'].describe(include='all')

        df_covid.drop_duplicates(['abstract', 'body_text'], inplace=True)
        #df_covid['abstract'].describe(include='all')

        #df_covid['body_text'].describe(include='all')

        #print(df_covid.describe())

        df = df_covid
        df.dropna(inplace=True)

        from tqdm import tqdm
        from langdetect import detect
        from langdetect import DetectorFactory

        # set seed
        DetectorFactory.seed = 0

        # hold label - language
        languages = []

        # go through each text
        for ii in tqdm(range(0,len(df))):
            # split by space into a list, take the first x items, join with a space
            text = df.iloc[ii]['body_text'].split(" ")

            lang = "en"
            try:
                if len(text) > 50:
                    lang = detect(" ".join(text[:50]))
                elif len(text) > 0:
                    lang = detect(" ".join(text[:len(text)]))
            # the beginning of the document was not in a usable format
            except Exception as e:
                all_words = set(text)
                try:
                    lang = detect(" ".join(all_words))
                # still no luck; see if there is any usable text in the abstract...
                except Exception as e:

                    try:
                        # let's try to label it through the abstract then
                        lang = detect(df.iloc[ii]['abstract_summary'])
                    except Exception as e:
                        lang = "unknown"
                        pass

            # get the language
            languages.append(lang)

        from pprint import pprint

        languages_dict = {}
        for lang in set(languages):
            languages_dict[lang] = languages.count(lang)
            
        #print("Total: {}\n".format(len(languages)))
        pprint(languages_dict)

        df['language'] = languages
        df = df[df['language'] == 'en']
        #print(df.info())

        import string
        from spacy.lang.en.stop_words import STOP_WORDS

        punctuations = string.punctuation
        stopwords = list(STOP_WORDS)
        stopwords[:10]

        custom_stop_words = [
            'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure',
            'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.',
            'al.', 'Elsevier', 'PMC', 'CZI', 'www'
        ]

        for w in custom_stop_words:
            if w not in stopwords:
                stopwords.append(w)

        # Parser
        parser = en_core_sci_lg.load(disable=["tagger", "ner"])
        parser.max_length = 7000000

        def spacy_tokenizer(sentence):
            mytokens = parser(sentence)
            mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
            mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
            mytokens = " ".join([i for i in mytokens])
            return mytokens

        tqdm.pandas()
        df["processed_text"] = df["body_text"].progress_apply(spacy_tokenizer)
        #df['body_word_count'].describe()
        #df['body_unique_words'].describe()

        from sklearn.feature_extraction.text import TfidfVectorizer
        def vectorize(text, maxx_features):

            vectorizer = TfidfVectorizer(max_features=maxx_features)
            X = vectorizer.fit_transform(text)
            return X

        text = df['processed_text'].values
        X = vectorize(text, 2 ** 12)
        X.shape

        from sklearn.decomposition import PCA

        pca = PCA(n_components=0.8, random_state=42)
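        # With 0 < n_components < 1, scikit-learn keeps the smallest number of principal
        # components whose cumulative explained variance reaches that fraction (80% here).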
        X_reduced = pca.fit_transform(X.toarray())
        #print(X_reduced.shape)

        np.savetxt(output_file, X_reduced, delimiter = ",")
Example 10
# import scispacy
# import spacy
import en_core_sci_lg
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nlp_engine = en_core_sci_lg.load()
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
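A minimal sketch of how these three objects might be used together; clean_tokens is a hypothetical helper, not part of the original script, and the NLTK stopwords and wordnet corpora must already be downloaded:

def clean_tokens(text):
    # Hypothetical helper: tokenize with the scispacy model, drop NLTK English
    # stop words and punctuation, then lemmatize each token with WordNet.
    doc = nlp_engine(text)
    return [lemmatizer.lemmatize(tok.text.lower()) for tok in doc
            if tok.text.lower() not in stop_words and not tok.is_punct]

print(clean_tokens("Mutations in the BRCA1 gene increase cancer risk."))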
Example 11
import en_core_sci_lg
import de_core_news_lg
import os
import datetime
from pandarallel import pandarallel
import warnings

warnings.filterwarnings('ignore')
pandarallel.initialize(use_memory_fs=False)

# client = Elasticsearch([{'host': 'localhost'}, {'port': 9200}])

nlp_german = de_core_news_lg.load(
    exclude=["parser", "ner", "tok2vec", "textcat"])

nlp_sci = en_core_sci_lg.load(exclude=["parser", "ner", "tok2vec", "textcat"])

# GERMAN


def is_token_allowed_german(token):
    '''
         Only allow valid tokens which are not stop words
         and punctuation symbols.
    '''
    if not token or not token.text.strip() or token.is_stop or token.is_punct:
        return False
    return True


def preprocesstoken_german(token):
Example 12
#Part of the following code was obtained from here:
#https://github.com/allenai/scispacy
#Getting the annotations from each tweet's text
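#Assumed context for this excerpt (not shown above): pandas as pd, en_core_sci_lg,
#scispacy's EntityLinker (from scispacy.linking import EntityLinker) and a DataFrame
#of filtered tweets named df_filtered, all set up earlier in the original script.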

df_scispacy_annotations = pd.DataFrame(columns=[
    'Tweet_id', 'Text_section', 'Span_start', 'Span_end', 'Annotation_type',
    'Extras'
])
df_scispacy_tweets_tagged = pd.DataFrame(
    columns=['Tweet_id', 'Tweet_full_text'])

print("Configuring the Scispacy tagger. Please wait...")
nlp = {}
print("Configuring the UMLS linker. Please wait..")
#We setup the scispacy tagger using the UML linker first
nlp['umls'] = en_core_sci_lg.load()
linker = EntityLinker(resolve_abbreviations=True, name="umls")
nlp['umls'].add_pipe(linker)
linker_umls = nlp['umls'].get_pipe("EntityLinker")

print("Starting the tagging process. Please wait...")

for index, row in df_filtered.iterrows():

    annotation_umls = nlp['umls'](str(row['tweet_text']))

    #UMLS Linker
    count = 0
    if len(annotation_umls.ents) > 0:
        df_scispacy_tweets_tagged.loc[len(df_scispacy_tweets_tagged.index)] = [
            row['tweet_id'], row['tweet_text']