def __init__(self, txt_dir, annotation_dir=None, encoding="ISO-8859-1"):
    super(bratDataLoader, self).__init__(txt_dir)
    self.annotation_dir = annotation_dir
    self.detected_labels = set()
    self.spacy_model = en_core_sci_md.load()
    self.sent_dict = {}
    self.logger = logging.getLogger(__name__)
    self.encoding = encoding
def __init__(self):
    # spaCy NLP
    self.nlp = en_core_sci_md.load()
corona_df.tail()

"""We extract the text of one of the samples (in this case I picked the first article of the sample, but it could be any of them)"""

sample_text = corona_df['text'][22895]
sample_text

"""# Preprocessing Function

Let's create a function to preprocess the data
"""

# we use the default scispaCy model, since it is already tailored to the medical domain
# disable components to speed up text processing
# tagger is the POS tagger, parser does dependency parsing, and ner is named entity recognition; we will not use any of the three
nlp = en_core_sci_md.load(disable=['tagger', 'parser', 'ner'])
nlp.max_length = 2000000

# let's look at the stop words: stop words are words we will not use, mostly everyday words (my, your, where, then, etc.)
print(spacy.lang.en.stop_words.STOP_WORDS)

# When building the word cloud we identified some words that are not needed, such as "et" and "al", for example;
# for this reason, we extend the stop word list so these words are removed as well
new_stop_words = ['et', 'al', 'doi', 'copyright', 'http', 'https', 'fig', 'table', 'result', 'show']
for word in new_stop_words:
    nlp.vocab[word].is_stop = True

def spacy_tokenizer(sentence):
    sentence = sentence.lower()
    # extract the lemma of each word
    tokens = []
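# The spacy_tokenizer above ends abruptly. A minimal sketch of a plausible
# completion, assuming the intent is to lemmatize and drop stop words and
# punctuation (this body is an assumption, not the original code):
def spacy_tokenizer_sketch(sentence):
    sentence = sentence.lower()
    doc = nlp(sentence)
    # keep the lemma of each informative token
    lemmas = [tok.lemma_ for tok in doc
              if not tok.is_stop and not tok.is_punct and not tok.is_space]
    return " ".join(lemmas)

# Example usage on the sample article selected above:
# processed_text = spacy_tokenizer_sketch(sample_text)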
import en_core_sci_md
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

nlp = en_core_sci_md.load(disable=["tagger", "parser", "ner"])
nlp.max_length = 2000000

customize_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'review', 'reviewed', 'org', 'https',
    'et', 'al', 'author', 'figure', 'rights', 'reserved', 'permission', 'used',
    'using', 'biorxiv', 'fig', 'fig.', 'al.', 'di', 'la', 'il', 'del', 'le',
    'della', 'dei', 'delle', 'una', 'da', 'dell', 'non', 'si', 'funding',
    'covid-19', 'covid19', 'sars-cov-2', 'coronavirus', 'method', 'study',
    'infection', 'public', 'sars', 'datum', 'datum', 'human', 'peer-reviewed',
    'cc-by', 'the(which', 'cc-by-nc-nd', 'medrxiv', 'wang', 'licenseauthor/funder',
    'li', 'org/10', 'author/funder', 'available', 'licenseit', 'sep2020',
    'medrxiv', 'biorxiv', 'pp', 'paper', 'research', 'license', '2019-ncov',
    'i(t', 'grant', 'virus', 'health', 'disease', 'infect', 'grant', 'show',
    'yes', 'ratio', 'size', 'high', 'low', 'large', '0(0', 'result', '\\\\r',
    'investor', 'group', 'allow', 'show', 'table', 'plot',
    'betacov/zhejiang/wz-02/2020', 'betacov/zhejiang/hangzhou-', 'david',
    'betacov/shenzhen/szth-003/2020', 'betacov/taiwan/2/2020', 'cov', 'include',
    'use', 'licensewas', 'whichthis', 'vf=0', 'set', 'patient', 'china',
    'confirm', 'italy', 'novel', 'need', 'pubmed', 'require', 'conclusion',
    'average', 'december', 'february', 'march', 'april', 'january', 'pandemic'
]

# Mark them as stop words
for w in customize_stop_words:
    nlp.vocab[w].is_stop = True
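# A minimal sketch of how the imports above could be wired together, assuming
# the goal is bag-of-words classification over the customized pipeline;
# `texts`, `labels`, and `lemma_tokenizer` are hypothetical placeholders, not
# part of the original snippet:
def lemma_tokenizer(text):
    # tokenize with the scispaCy pipeline and keep lemmas of non-stop tokens
    return [t.lemma_ for t in nlp(text) if not t.is_stop and not t.is_punct]

texts = ["example abstract one", "example abstract two"]   # hypothetical documents
labels = [0, 1]                                             # hypothetical classes

vectorizer = CountVectorizer(tokenizer=lemma_tokenizer, lowercase=True)
X = vectorizer.fit_transform(texts)
clf = MultinomialNB().fit(X, labels)
print(clf.predict(vectorizer.transform(["another example abstract"])))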
def __init__(self):
    self.nlp = en_core_sci_md.load(disable=['tagger', 'parser', 'ner'])
    self.nlp.max_length = 2000000
    self.expandStopWords()
import en_core_sci_md

nlp = en_core_sci_md.load()
print("Finished loading en_core_sci_md")

def get_all_dep():
    for label in nlp.get_pipe("parser").labels:
        print(label)
    return nlp.get_pipe("parser").labels

def gen_dependency_tree(sent):
    doc = nlp(sent)
    return doc

def get_universal_POS():
    universal_pos = [
        "ADJ", "ADP", "ADV", "AUX", "CONJ", "CCONJ", "DET", "INTJ", "NOUN",
        "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB",
        "X", "SPACE"
    ]
    POS_map = dict()
    cnt = 0
    for s in universal_pos:
        POS_map[s] = cnt
        cnt += 1
    return POS_map
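# Example usage of the helpers above: parse one sentence and inspect its
# dependency arcs together with the mapped POS ids (the sentence itself is
# only an illustration, not from the original code).
pos_map = get_universal_POS()
doc = gen_dependency_tree("The spike protein binds the ACE2 receptor.")
for token in doc:
    # each token carries its dependency label, its syntactic head, and a POS id
    print(token.text, token.dep_, token.head.text, pos_map.get(token.pos_))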
def __init__(self, txt_dir, annotation_dir=None):
    super(i2b2DataLoader, self).__init__(txt_dir)
    self.annotation_dir = annotation_dir
    self.detected_labels = set()
    self.spacy_model = en_core_sci_md.load()
    self.logger = logging.getLogger(__name__)
def __init__(self, raw_string, split_expression=r'\W+', bow=True, mask_string=None):
    """Initializer.

    Args:
        raw_string: string with raw text in it
        split_expression: Regex string or callable. If regex string, will be used
            with re.split. If callable, the function should return a list of tokens.
        bow: if True, a word is the same everywhere in the text, i.e. we will index
            multiple occurrences of the same word together. If False, order matters,
            so the same word will have different ids according to position.
        mask_string: If not None, replace words with this when bow=False.
            If None, the default value is UNKWORDZ.
    """
    self.raw = raw_string
    self.mask_string = 'UNKWORDZ' if mask_string is None else mask_string

    # NLTK tokenizer
    # sentences_span = PunktSentenceTokenizer().span_tokenize(self.raw)

    # Scispacy tokenizer: use scispaCy sentence segmentation to get character spans
    nlp_sentencizer = en_core_sci_md.load()
    nlp_tokens = nlp_sentencizer(self.raw)
    sentences_span = []
    for sent in nlp_tokens.sents:
        sentences_span.append((sent.start_char, sent.end_char))

    self.as_list = [self.raw[begin:end] for (begin, end) in sentences_span]
    self.as_np = np.array(self.as_list)
    non_word = re.compile(r'(%s)|$' % split_expression).match
    self.string_start = np.array(
        [begin for (begin, end) in sentences_span])

    '''
    if callable(split_expression):
        tokens = split_expression(self.raw)
        self.as_list = self._segment_with_tokens(self.raw, tokens)
        tokens = set(tokens)

        def non_word(string):
            return string not in tokens
    else:
        # with the split_expression as a non-capturing group (?:), we don't need to filter out
        # the separator character from the split results.
        splitter = re.compile(r'(%s)|$' % split_expression)
        self.as_list = [s for s in splitter.split(self.raw) if s]
        non_word = splitter.match
    '''

    # self.as_np = np.array(self.as_list)
    # self.string_start = np.hstack(
    #     ([0], np.cumsum([len(x) for x in self.as_np[:-1]])))

    vocab = {}
    self.inverse_vocab = []
    self.positions = []
    self.bow = bow
    non_vocab = set()
    for i, word in enumerate(self.as_np):
        if word in non_vocab:
            continue
        if non_word(word):
            non_vocab.add(word)
            continue
        if bow:
            if word not in vocab:
                vocab[word] = len(vocab)
                self.inverse_vocab.append(word)
                self.positions.append([])
            idx_word = vocab[word]
            self.positions[idx_word].append(i)
        else:
            self.inverse_vocab.append(word)
            self.positions.append(i)
    if not bow:
        self.positions = np.array(self.positions)
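# A minimal standalone sketch of the sentence-span logic used in the
# initializer above, assuming only that en_core_sci_md is installed; the raw
# text is an illustrative example, and it shows how the character spans that
# the indexer treats as "words" are produced.
import en_core_sci_md

nlp_sentencizer = en_core_sci_md.load()
raw = "The patient was afebrile. Chest X-ray showed no infiltrates."
doc = nlp_sentencizer(raw)
spans = [(sent.start_char, sent.end_char) for sent in doc.sents]
sentences = [raw[begin:end] for begin, end in spans]
print(spans)       # character offsets of each sentence
print(sentences)   # the sentence strings themselves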