Code Example #1
 def __init__(self, txt_dir, annotation_dir=None, encoding="ISO-8859-1"):
     super(bratDataLoader, self).__init__(txt_dir)
     self.annotation_dir = annotation_dir
     self.detected_labels = set()
     self.spacy_model = en_core_sci_md.load()
     self.sent_dict = {}
     self.logger = logging.getLogger(__name__)
     self.encoding = encoding
Code Example #2
 def __init__(self):
     # spaCy NLP
     self.nlp = en_core_sci_md.load()
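
Both constructors above only load the model; for context, this is roughly what a loaded en_core_sci_md pipeline provides (the sample sentence and variable names below are illustrative, not taken from either project):

import en_core_sci_md

nlp = en_core_sci_md.load()
doc = nlp("Spinal and bulbar muscular atrophy is an X-linked recessive disease.")

# sentence segmentation, lemmas, and biomedical entity mentions come from the full pipeline
for sent in doc.sents:
    print(sent.text)
print([token.lemma_ for token in doc])
print([(ent.text, ent.label_) for ent in doc.ents])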
Code Example #3
corona_df.tail()

"""Extraímos o texto de uma das amostras (neste caso escolhi o primeiro artigo da amostra, mas poderia ser qualquer um)"""

sample_text = corona_df['text'][22895]
sample_text

"""# Função para Pré-processamento
Vamos criar uma função para pré-processar os dados
"""

# we use the standard scispacy model, since it is already tailored to the medical domain
# disable these components to speed up text processing
# tagger does part-of-speech tagging, parser builds the dependency parse, and ner does named-entity recognition, but we won't use any of the three here
nlp = en_core_sci_md.load(disable=['tagger', 'parser','ner'])
nlp.max_length = 2000000

# let's look at the stop words: stop words are words we won't use, common everyday words (my, your, where, then, etc.)
print(spacy.lang.en.stop_words.STOP_WORDS)

# when building the word cloud, we noticed some words that are not useful, such as et and al, for example
# for this reason, we extend the stop-word list so these words get removed
new_stop_words = ['et','al','doi','copyright','http','https','fig','table','result','show']
for word in new_stop_words:
  nlp.vocab[word].is_stop=True

def spacy_tokenizer(sentence):
  sentence = sentence.lower()
  # extract the lemma of each word
  list = []
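
The tokenizer above breaks off right after initializing its list; a minimal sketch of how such a preprocessing function typically continues, assuming lemma extraction plus stop-word and punctuation filtering with the nlp object defined above (this body is an assumption, not the notebook's code):

def spacy_tokenizer(sentence):
  sentence = sentence.lower()
  # extract the lemma of each word, skipping stop words and punctuation (assumed continuation)
  tokens = [token.lemma_ for token in nlp(sentence)
            if not token.is_stop and not token.is_punct]
  return tokens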
Code Example #4
File: utils.py Project: collabovid/collabovid
import en_core_sci_md
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

nlp = en_core_sci_md.load(disable=["tagger", "parser", "ner"])
nlp.max_length = 2000000

customize_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'review', 'reviewed', 'org',
    'https', 'et', 'al', 'author', 'figure', 'rights', 'reserved',
    'permission', 'used', 'using', 'biorxiv', 'fig', 'fig.', 'al.', 'di', 'la',
    'il', 'del', 'le', 'della', 'dei', 'delle', 'una', 'da', 'dell', 'non',
    'si', 'funding', 'covid-19', 'covid19', 'sars-cov-2', 'coronavirus',
    'method', 'study', 'infection', 'public', 'sars', 'datum', 'datum',
    'human', 'peer-reviewed', 'cc-by', 'the(which', 'cc-by-nc-nd', 'medrxiv',
    'wang', 'licenseauthor/funder', 'li', 'org/10', 'author/funder',
    'available', 'licenseit', 'sep2020', 'medrxiv', 'biorxiv', 'pp', 'paper',
    'research', 'license', '2019-ncov', 'i(t', 'grant', 'virus', 'health',
    'disease', 'infect', 'grant', 'show', 'yes', 'ratio', 'size', 'high',
    'low', 'large', '0(0', 'result', '\\\\r', 'investor', 'group', 'allow',
    'show', 'table', 'plot', 'betacov/zhejiang/wz-02/2020',
    'betacov/zhejiang/hangzhou-', 'david', 'betacov/shenzhen/szth-003/2020',
    'betacov/taiwan/2/2020', 'cov', 'include', 'use', 'licensewas',
    'whichthis', 'vf=0', 'set', 'patient', 'china', 'confirm', 'italy',
    'novel', 'need', 'pubmed', 'require', 'conclusion', 'average', 'december',
    'february', 'march', 'april', 'january', 'pandemic'
]

# Mark them as stop words
for w in customize_stop_words:
    nlp.vocab[w].is_stop = True
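
The CountVectorizer and MultinomialNB imports suggest the rest of utils.py vectorizes documents with this pipeline; a minimal sketch under that assumption (the tokenizer helper, documents, and labels below are illustrative, not the project's code):

def spacy_tokens(text):
    # tokenize with the scispacy pipeline, dropping stop words and punctuation (illustrative helper)
    return [t.lemma_ for t in nlp(text) if not t.is_stop and not t.is_punct]

vectorizer = CountVectorizer(tokenizer=spacy_tokens)
X = vectorizer.fit_transform(["Sample abstract about viral transmission.",
                              "Another sample abstract about vaccination trials."])
clf = MultinomialNB().fit(X, np.array([0, 1]))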
Code Example #5
File: lda.py Project: srravula1/topic-modelling-1
 def __init__(self):
     self.nlp = en_core_sci_md.load(disable=['tagger', 'parser', 'ner'])
     self.nlp.max_length = 2000000
     self.expandStopWords()
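
expandStopWords() is not shown on this page; given the pattern in Code Example #4, a plausible implementation marks a custom list of domain noise words as stop words (the body and word list below are assumptions, not the project's code):

 def expandStopWords(self):
     # assumed implementation: mark domain-specific noise words as stop words
     custom_stop_words = ['doi', 'preprint', 'copyright', 'et', 'al', 'fig', 'table']
     for w in custom_stop_words:
         self.nlp.vocab[w].is_stop = True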
Code Example #6
File: emb_utils.py Project: unik00/GraphLSTM
import en_core_sci_md

nlp = en_core_sci_md.load()
print("Finished loading en_core_sci_md")


def get_all_dep():
    for label in nlp.get_pipe("parser").labels:
        print(label)
    return nlp.get_pipe("parser").labels


def gen_dependency_tree(sent):
    doc = nlp(sent)
    return doc


def get_universal_POS():
    universal_pos = [
        "ADJ", "ADP", "ADV", "AUX", "CONJ", "CCONJ", "DET", "INTJ", "NOUN",
        "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X",
        "SPACE"
    ]

    POS_map = dict()
    cnt = 0
    for s in universal_pos:
        POS_map[s] = cnt
        cnt += 1

    return POS_map
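
A short usage sketch for the helpers above (the example sentence is illustrative):

pos_map = get_universal_POS()
doc = gen_dependency_tree("The virus binds to the ACE2 receptor.")
for token in doc:
    # map each token's coarse POS tag to its integer id and show its dependency relation and head
    print(token.text, pos_map.get(token.pos_), token.dep_, token.head.text)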
Code Example #7
File: i2b2DataLoading.py Project: FredHutch/HutchNER
 def __init__(self, txt_dir, annotation_dir=None):
     super(i2b2DataLoader, self).__init__(txt_dir)
     self.annotation_dir = annotation_dir
     self.detected_labels = set()
     self.spacy_model = en_core_sci_md.load()
     self.logger = logging.getLogger(__name__)
Code Example #8
    def __init__(self,
                 raw_string,
                 split_expression=r'\W+',
                 bow=True,
                 mask_string=None):
        """Initializer.

        Args:
            raw_string: string with raw text in it
            split_expression: Regex string or callable. If regex string, will be used with re.split.
                If callable, the function should return a list of tokens.
            bow: if True, a word is the same everywhere in the text - i.e. we
                 will index multiple occurrences of the same word. If False,
                 order matters, so that the same word will have different ids
                 according to position.
            mask_string: If not None, replace words with this if bow=False;
                if None, the default value is UNKWORDZ.
        """
        self.raw = raw_string
        self.mask_string = 'UNKWORDZ' if mask_string is None else mask_string
        # NLTK tokenizer
        # sentences_span = PunktSentenceTokenizer().span_tokenize(self.raw)
        # Scispacy tokenizer
        nlp_sentencizer = en_core_sci_md.load()
        nlp_tokens = nlp_sentencizer(self.raw)
        sentences_span = []
        for sent in nlp_tokens.sents:
            sentences_span.append((sent.start_char, sent.end_char))

        self.as_list = [self.raw[begin:end] for (begin, end) in sentences_span]
        self.as_np = np.array(self.as_list)
        non_word = re.compile(r'(%s)|$' % split_expression).match
        self.string_start = np.array(
            [begin for (begin, end) in sentences_span])
        '''
        if callable(split_expression):
            tokens = split_expression(self.raw)
            self.as_list = self._segment_with_tokens(self.raw, tokens)
            tokens = set(tokens)

            def non_word(string):
                return string not in tokens

        else:
            # with the split_expression as a non-capturing group (?:), we don't need to filter out
            # the separator character from the split results.
            splitter = re.compile(r'(%s)|$' % split_expression)
            self.as_list = [s for s in splitter.split(self.raw) if s]
            non_word = splitter.match
			
        '''

        # self.as_np = np.array(self.as_list)
        # self.string_start = np.hstack(
        # ([0], np.cumsum([len(x) for x in self.as_np[:-1]])))
        vocab = {}
        self.inverse_vocab = []
        self.positions = []
        self.bow = bow
        non_vocab = set()
        for i, word in enumerate(self.as_np):
            if word in non_vocab:
                continue
            if non_word(word):
                non_vocab.add(word)
                continue
            if bow:
                if word not in vocab:
                    vocab[word] = len(vocab)
                    self.inverse_vocab.append(word)
                    self.positions.append([])
                idx_word = vocab[word]
                self.positions[idx_word].append(i)
            else:
                self.inverse_vocab.append(word)
                self.positions.append(i)
        if not bow:
            self.positions = np.array(self.positions)
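
For reference, the scispacy sentence-splitting step used in this constructor can be run on its own; a minimal sketch (the input text is illustrative):

import en_core_sci_md

nlp_sentencizer = en_core_sci_md.load()
raw = "ACE2 is the entry receptor for SARS-CoV-2. Spike binding initiates membrane fusion."
doc = nlp_sentencizer(raw)
# character-offset spans for each detected sentence, as built in the constructor above
sentences_span = [(sent.start_char, sent.end_char) for sent in doc.sents]
as_list = [raw[begin:end] for (begin, end) in sentences_span]
print(sentences_span)
print(as_list)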