# Imports required by the classes below; the module paths assume the MedCAT
# package layout these classes come from.
from functools import partial

import numpy as np
import pandas

from medcat.cdb import CDB
from medcat.preprocessing.cleaners import clean_name, clean_umls, clean_def
from medcat.preprocessing.taggers import spacy_tag_punct
from medcat.preprocessing.tokenizers import spacy_split_all
from medcat.utils.spacy_pipe import SpacyPipe

# Module-level constants used by CDBMaker and PrepareUMLS further down; their
# original definitions are not part of this snippet, so the values here are
# assumptions that mirror the class-level constants of PrepareCDB.
SEPARATOR = ""
CONCEPT_LENGTH_LIMIT = 20


class PrepareCDB(object):
    """ Prepares CDB data in csv format for annotations;
    after everything is done the result is in the cdb field.
    """
    SEPARATOR = ""
    NAME_SEPARATOR = "|"
    CONCEPT_LENGTH_LIMIT = 20
    SKIP_STOPWORDS = False
    VERSIONS = ['CLEAN', 'RAW']

    def __init__(self, vocab=None, pretrained_cdb=None, word_tokenizer=None):
        self.vocab = vocab
        if pretrained_cdb is None:
            self.cdb = CDB()
        else:
            self.cdb = pretrained_cdb

        # Build the required spacy pipeline
        self.nlp = SpacyPipe(spacy_split_all, disable=['ner', 'parser'])
        self.nlp.add_punct_tagger(tagger=partial(spacy_tag_punct,
                                                 skip_stopwords=self.SKIP_STOPWORDS))

        # Get the tokenizer
        if word_tokenizer is not None:
            self.tokenizer = word_tokenizer
        else:
            self.tokenizer = self._tok

    def _tok(self, text):
        return [text]

    def prepare_csvs(self, csv_paths, sep=',', encoding=None, escapechar=None,
                     only_existing=False, add_cleaner=None, only_new=False):
        """ Compile one or multiple CSVs into an internal CDB class.

        csv_paths:  an array of paths to the csv files that should be processed
        sep:  if necessary, a custom separator for the csv files
        encoding:  forwarded to pandas.read_csv
        escapechar:  forwarded to pandas.read_csv
        only_existing:  if True, only add names for concepts already in the CDB
        add_cleaner:  an optional extra cleaning function applied to the CLEAN version
        only_new:  if True, skip concepts that already exist in the CDB

        return:  Compiled CDB class
        """
        _new_cuis = set()
        for csv_path in csv_paths:
            df = pandas.read_csv(csv_path, sep=sep, encoding=encoding,
                                 escapechar=escapechar)

            # Resolve the positions of the optional columns once per file
            cols = list(df.columns)
            str_ind = cols.index('str')
            cui_ind = cols.index('cui')
            tui_ind = cols.index('tui') if 'tui' in cols else -1
            tty_ind = cols.index('tty') if 'tty' in cols else -1
            desc_ind = cols.index('desc') if 'desc' in cols else -1
            onto_ind = cols.index('onto') if 'onto' in cols else -1
            is_unique_ind = cols.index('is_unique') if 'is_unique' in cols else -1
            examples_ind = cols.index('examples') if 'examples' in cols else -1

            for ind in range(len(df)):
                names = str(df.iat[ind, str_ind]).split(self.NAME_SEPARATOR)
                if ind % 10000 == 0:
                    print("Done: {}".format(ind))

                for _name in names:
                    skip_raw = False
                    for version in self.VERSIONS:
                        # Get the cui
                        cui = str(df.iat[ind, cui_ind])

                        if only_new:
                            # Add only new concepts, skip existing ones
                            if cui in self.cdb.cui2names and cui not in _new_cuis:
                                continue
                            elif cui not in self.cdb.cui2names:
                                _new_cuis.add(cui)

                        if (version == "RAW" and skip_raw) or \
                           (only_existing and cui not in self.cdb.cui2names):
                            continue

                        # Save originals
                        pretty_name = _name
                        original_name = _name
                        name = _name

                        if version == "CLEAN" and add_cleaner is not None:
                            name = add_cleaner(name)
                        name = clean_name(name)

                        # Clean and preprocess the name
                        sc_name = self.nlp(name)
                        if version == 'CLEAN':
                            tokens = [str(t.lemma_).lower() for t in sc_name
                                      if not t._.is_punct and not t._.to_skip]
                        elif version == 'RAW':
                            tokens = [str(t.lower_) for t in sc_name
                                      if not t._.is_punct and not t._.to_skip]
                        tokens_vocab = [t.lower_ for t in sc_name if not t._.is_punct]

                        # Don't allow concept names to be above the concept length limit
                        if len(tokens) > self.CONCEPT_LENGTH_LIMIT:
                            continue

                        name = self.SEPARATOR.join(tokens)
                        tmp_name = "".join(tokens)
                        if add_cleaner is None and name == self.SEPARATOR.join(tokens_vocab):
                            # Both names are the same, skip the raw version
                            skip_raw = True

                        is_pref_name = False
                        if 'tty' in df.columns:
                            _tmp = str(df.iat[ind, tty_ind])
                            if _tmp.lower().strip() == 'pn':
                                is_pref_name = True

                        # Skip concepts that are digits or where each token is a single letter
                        length_one = [len(x) < 2 for x in tokens]
                        if tmp_name.isdigit() or all(length_one):
                            continue

                        # Create snames (cumulative sub-names) of the name
                        snames = []
                        sname = ""
                        for token in tokens:
                            sname = sname + token + self.SEPARATOR
                            snames.append(sname.strip())

                        # Check is unique
                        is_unique = None
                        if 'is_unique' in df.columns:
                            _tmp = str(df.iat[ind, is_unique_ind]).strip()
                            if _tmp.lower() == '0':
                                is_unique = False
                            elif _tmp.lower() == '1':
                                is_unique = True

                        # Get the ontology: 'sab' in umls
                        onto = 'default'
                        if 'onto' in df.columns:
                            onto = str(df.iat[ind, onto_ind])

                        # Get the tui
                        tui = None
                        if 'tui' in df.columns:
                            _tui = str(df.iat[ind, tui_ind]).strip()
                            if len(_tui) > 0 and _tui != "nan":
                                tui = _tui
                                # TODO: If there are multiple tuis just take the first one
                                if len(tui.split(',')) > 1:
                                    tui = tui.split(',')[0]

                        # Get the concept description
                        desc = None
                        if 'desc' in df.columns:
                            _desc = str(df.iat[ind, desc_ind]).strip()
                            if len(_desc) > 0:
                                desc = _desc

                        # Add the concept
                        self.cdb.add_concept(cui, name, onto, tokens, snames,
                                             tui=tui, pretty_name=pretty_name,
                                             tokens_vocab=tokens_vocab,
                                             is_unique=is_unique, desc=desc,
                                             original_name=original_name,
                                             is_pref_name=is_pref_name)

                        # Process examples if we have them
                        examples = []
                        if 'examples' in df.columns:
                            tmp = str(df.iat[ind, examples_ind]).strip().split(
                                self.NAME_SEPARATOR)
                            for example in tmp:
                                example = example.strip()
                                if len(example) > 0:
                                    examples.append(example)

                        # If we have examples, average their word vectors into a
                        # medium-sized ('MED') context vector for this concept
                        for example in examples:
                            doc = self.nlp(example)
                            cntx = []
                            for word in doc:
                                if not word._.to_skip:
                                    for w in self.tokenizer(word.lower_):
                                        if w in self.vocab and self.vocab.vec(w) is not None:
                                            cntx.append(self.vocab.vec(w))
                            if len(cntx) > 1:
                                cntx = np.average(cntx, axis=0)
                                self.cdb.add_context_vec(cui, cntx, cntx_type='MED')

        return self.cdb
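# A minimal usage sketch (an assumption, not part of the original module): compile a
# concept CSV into a CDB. `cui` and `str` are the required columns; `tui`, `tty`,
# `onto`, `desc`, `is_unique` and `examples` are optional. The file name below is
# hypothetical. Alternative names inside `str` are separated by NAME_SEPARATOR ('|'):
#
#   cui,str,tui
#   C0018801,heart failure|cardiac failure,T047
if __name__ == '__main__':
    preparator = PrepareCDB()
    cdb = preparator.prepare_csvs(['concepts.csv'])
    print("Concepts in the CDB: {}".format(len(cdb.cui2names)))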
class PrepareCDB(object):
    """ Prepares CDB data in csv format for annotations;
    after everything is done the result is in the cdb field.
    """
    SEPARATOR = ""
    NAME_SEPARATOR = "|"
    CONCEPT_LENGTH_LIMIT = 8
    SKIP_STOPWORDS = True

    def __init__(self, vocab=None, pretrained_cdb=None, word_tokenizer=None):
        self.vocab = vocab
        if pretrained_cdb is None:
            self.cdb = CDB()
        else:
            self.cdb = pretrained_cdb

        # Build the required spacy pipeline
        self.nlp = SpacyPipe(spacy_split_all, disable=['ner', 'parser'])
        self.nlp.add_punct_tagger(tagger=partial(spacy_tag_punct,
                                                 skip_stopwords=self.SKIP_STOPWORDS))

        # Get the tokenizer
        if word_tokenizer is not None:
            self.tokenizer = word_tokenizer
        else:
            self.tokenizer = self._tok

    def _tok(self, text):
        return [text]

    def prepare_csvs(self, csv_paths, sep=','):
        """ Compile one or multiple CSVs into an internal CDB class.

        csv_paths:  an array of paths to the csv files that should be processed
        sep:  if necessary, a custom separator for the csv files

        return:  Compiled CDB class
        """
        for csv_path in csv_paths:
            df = pandas.read_csv(csv_path, sep=sep)
            for ind in range(len(df)):
                names = str(df.iloc[ind]['str']).split(self.NAME_SEPARATOR)
                for _name in names:
                    if ind % 10000 == 0:
                        print("Done: {}".format(ind))

                    pretty_name = _name
                    name = clean_name(_name)

                    # Clean and preprocess the name
                    sc_name = self.nlp(name)
                    tokens = [str(t.lemma_).lower() for t in sc_name
                              if not t._.is_punct and not t._.to_skip]
                    tokens_vocab = [t.lower_ for t in sc_name if not t._.is_punct]

                    # Don't allow concept names to be above the concept length limit
                    if len(tokens) > self.CONCEPT_LENGTH_LIMIT:
                        continue

                    name = self.SEPARATOR.join(tokens)
                    _name = "".join(tokens)
                    length_one = [len(x) < 2 for x in tokens]
                    # Skip concepts that are digits or where each token is a single letter
                    if _name.isdigit() or all(length_one):
                        continue

                    # Create snames (cumulative sub-names) of the name
                    snames = []
                    sname = ""
                    for token in tokens:
                        sname = sname + token + self.SEPARATOR
                        snames.append(sname.strip())

                    # Check is unique
                    unique = True
                    if 'unique' in df.columns:
                        _tmp = str(df.iloc[ind]['unique']).strip()
                        if _tmp.lower() == '0':
                            unique = False

                    onto = 'default'
                    if 'sab' in df.columns:
                        # Get the ontology
                        onto = df.iloc[ind]['sab']

                    # Get the cui
                    cui = df.iloc[ind]['cui']

                    # Get the tui
                    tui = None
                    if 'tui' in df.columns:
                        tui = str(df.iloc[ind]['tui'])
                        # TODO: If there are multiple tuis just take the first one
                        if len(tui.split(',')) > 1:
                            tui = tui.split(',')[0]

                    examples = []
                    if 'examples' in df.columns:
                        tmp = str(df.iloc[ind]['examples']).strip().split(
                            self.NAME_SEPARATOR)
                        for example in tmp:
                            example = example.strip()
                            if len(example) > 0:
                                examples.append(example)

                    self.cdb.add_concept(cui, name, onto, tokens, snames,
                                         tui=tui, pretty_name=pretty_name,
                                         tokens_vocab=tokens_vocab, unique=unique)

                    # If we have examples, average their word vectors into a
                    # medium-sized ('MED') context vector for this concept
                    for example in examples:
                        doc = self.nlp(example)
                        cntx = []
                        for word in doc:
                            if not word._.to_skip:
                                for w in self.tokenizer(word._.norm):
                                    if w in self.vocab and self.vocab.vec(w) is not None:
                                        cntx.append(self.vocab.vec(w))
                        if len(cntx) > 1:
                            cntx = np.average(cntx, axis=0)
                            self.cdb.add_context_vec(cui, cntx, cntx_type='MED')

        return self.cdb
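# The snames loop in both PrepareCDB versions builds cumulative prefixes of a
# concept name, which are used downstream for partial-name matching. A
# standalone sketch of that computation (a hypothetical helper, not part of the
# original code; SEPARATOR is the empty string in these classes):
def _build_snames(tokens, separator=""):
    """ Return the cumulative sub-names of a token list.

    >>> _build_snames(['heart', 'failure'])
    ['heart', 'heartfailure']
    """
    snames = []
    sname = ""
    for token in tokens:
        sname = sname + token + separator
        snames.append(sname.strip())
    return snames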
class CDBMaker(object):
    r''' Given a CSV as shown in https://github.com/CogStack/MedCAT/tree/master/examples/<example>
    it creates a CDB.

    Args:
        cdb (`medcat.cdb.CDB`, optional):
            If set, the `CDBMaker` will update the existing `CDB` with the new
            concepts in the CSV.
        vocab (optional):
            A vocabulary with word vectors, needed by `prepare_csvs` when
            building context vectors from concept descriptions (an assumed
            parameter, mirroring the sibling classes; the original referenced
            `self.vocab` without ever setting it).
        word_tokenizer (optional):
            A word tokenizer applied before the vocab lookup (same assumption).
    '''
    def __init__(self, cdb=None, vocab=None, word_tokenizer=None):
        if cdb is None:
            self.cdb = CDB()
        else:
            self.cdb = cdb

        self.vocab = vocab

        # Build the required spacy pipeline
        self.nlp = SpacyPipe(spacy_split_all)
        self.nlp.add_punct_tagger(tagger=partial(spacy_tag_punct,
                                                 skip_stopwords=False))

        # Get the tokenizer
        if word_tokenizer is not None:
            self.tokenizer = word_tokenizer
        else:
            self.tokenizer = self._tok

    def _tok(self, text):
        return [text]

    def prepare_csvs(self, csv_paths, sep=','):
        r''' Compile one or multiple CSVs into a CDB.

        Args:
            csv_paths (`List[str]`):
                An array of paths to the csv files that should be processed
            sep (`str`, optional, defaults to `,`):
                If necessary, a custom separator for the csv files

        Return:
            Compiled CDB class
        '''
        for csv_path in csv_paths:
            df = pandas.read_csv(csv_path, sep=sep)
            for ind in range(len(df)):
                names = str(df.iloc[ind]['str']).split("||")
                for _name in names:
                    if ind % 10000 == 0:
                        print("Done: {}".format(ind))

                    # Save originals
                    pretty_name = _name
                    original_name = _name
                    name = clean_umls(_name)

                    # Clean and preprocess the name
                    doc = self.nlp(name)
                    tokens = [str(t.lemma_).lower() for t in doc
                              if not t._.is_punct and not t._.to_skip]

                    # Don't allow concept names to be above the concept length limit
                    if len(tokens) > CONCEPT_LENGTH_LIMIT:
                        continue

                    isupper = False
                    if len(doc) == 1:
                        if doc[0].is_upper and len(doc[0]) > 1:
                            isupper = True

                    name = SEPARATOR.join(tokens)
                    _name = "".join(tokens)
                    length_one = [len(x) < 2 for x in tokens]
                    # Skip concepts that are digits or where each token is a single letter
                    if _name.isdigit() or all(length_one):
                        continue

                    # Create snames (cumulative sub-names) of the name
                    snames = []
                    sname = ""
                    for token in tokens:
                        sname = sname + token + SEPARATOR
                        snames.append(sname.strip())

                    # Check if this is the preferred name; it is if the column "TTY" equals PN
                    is_pref_name = False
                    if 'tty' in df.columns:
                        _tmp = str(df.iloc[ind]['tty'])
                        if _tmp.lower().strip() == 'pn':
                            is_pref_name = True

                    onto = 'default'
                    if 'sab' in df.columns:
                        # Get the ontology
                        onto = df.iloc[ind]['sab']

                    # Get the cui
                    cui = df.iloc[ind]['cui']

                    # Get the tui
                    tui = None
                    if 'tui' in df.columns:
                        tui = str(df.iloc[ind]['tui'])
                        # TODO: If there are multiple tuis just take the first one
                        if len(tui.split(',')) > 1:
                            tui = tui.split(',')[0]

                    desc = None
                    if 'def' in df.columns:
                        tmp = str(df.iloc[ind]['def']).strip()
                        if len(tmp) > 0:
                            desc = tmp

                    self.cdb.add_concept(cui, name, onto, tokens, snames,
                                         isupper=isupper, is_pref_name=is_pref_name,
                                         tui=tui, pretty_name=pretty_name, desc=desc)

                    # If we had a desc (and a vocab) we can also add vectors
                    if desc is not None and self.vocab is not None:
                        doc = self.nlp(clean_def(desc))
                        cntx = []
                        for word in doc:
                            if not word._.to_skip:
                                for w in self.tokenizer(word.lower_):
                                    if w in self.vocab and self.vocab.vec(w) is not None:
                                        cntx.append(self.vocab.vec(w))
                        if len(cntx) > 1:
                            cntx = np.average(cntx, axis=0)
                            self.cdb.add_context_vec(cui, cntx, cntx_type='LONG')

                            # Increase the cui count because we added a context
                            if cui in self.cdb.cui_count:
                                self.cdb.cui_count[cui] += 1
                            else:
                                self.cdb.cui_count[cui] = 1

        return self.cdb
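# A usage sketch (an assumption, not in the original file): per the docstring
# above, CDBMaker can update an existing CDB with new concepts. The paths are
# hypothetical and `load_dict` is assumed from the historical CDB API.
def extend_existing_cdb(cdb_path='cdb.dat', csv_path='new_concepts.csv'):
    cdb = CDB()
    cdb.load_dict(cdb_path)    # assumed CDB API for loading a saved dictionary
    maker = CDBMaker(cdb=cdb)  # update the existing CDB rather than start fresh
    return maker.prepare_csvs([csv_path])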
class PrepareUMLS(object):
    """ Prepares UMLS data in csv format for annotations;
    after everything is done the result is in the cdb field.
    """
    def __init__(self, vocab=None, pretrained_cdb=None, tokenizer=None):
        self.vocab = vocab
        if pretrained_cdb is None:
            self.cdb = CDB()
        else:
            self.cdb = pretrained_cdb

        # Build the required spacy pipeline
        self.nlp = SpacyPipe(spacy_split_all, disable=['ner', 'parser'])
        self.nlp.add_punct_tagger(tagger=partial(spacy_tag_punct,
                                                 skip_stopwords=False))

        # Get the tokenizer
        if tokenizer is not None:
            self.tokenizer = tokenizer
        else:
            # Could also be e.g. BertTokenizer.from_pretrained('bert-base-uncased')
            self.tokenizer = self._tok

    def _tok(self, text):
        return [text]

    def prepare_csvs(self, csv_paths, sep=','):
        """ Compile one or multiple CSVs into an internal UMLS class.

        csv_paths:  an array of paths to the csv files that should be processed
        sep:  if necessary, a custom separator for the csv files

        return:  Compiled UMLS class
        """
        for csv_path in csv_paths:
            df = pandas.read_csv(csv_path, sep=sep)
            for ind in range(len(df)):
                names = str(df.iloc[ind]['str']).split("||")
                for _name in names:
                    if ind % 10000 == 0:
                        print("Done: {}".format(ind))

                    pretty_name = _name
                    name = clean_umls(_name)

                    # Clean and preprocess the name
                    doc = self.nlp(name)
                    tokens = [str(t.lemma_).lower() for t in doc
                              if not t._.is_punct and not t._.to_skip]

                    # Don't allow concept names to be above the concept length limit
                    if len(tokens) > CONCEPT_LENGTH_LIMIT:
                        continue

                    isupper = False
                    if len(doc) == 1:
                        if doc[0].is_upper and len(doc[0]) > 1:
                            isupper = True

                    name = SEPARATOR.join(tokens)
                    _name = "".join(tokens)
                    length_one = [len(x) < 2 for x in tokens]
                    # Skip concepts that are digits or where each token is a single letter
                    if _name.isdigit() or all(length_one):
                        continue

                    # Create snames (cumulative sub-names) of the name
                    snames = []
                    sname = ""
                    for token in tokens:
                        sname = sname + token + SEPARATOR
                        snames.append(sname.strip())

                    # Check if this is the preferred name; it is if the column "TTY" equals PN
                    is_pref_name = False
                    if 'tty' in df.columns:
                        _tmp = str(df.iloc[ind]['tty'])
                        if _tmp.lower().strip() == 'pn':
                            is_pref_name = True

                    onto = 'default'
                    if 'sab' in df.columns:
                        # Get the ontology
                        onto = df.iloc[ind]['sab']

                    # Get the cui
                    cui = df.iloc[ind]['cui']

                    # Get the tui
                    tui = None
                    if 'tui' in df.columns:
                        tui = str(df.iloc[ind]['tui'])
                        # TODO: If there are multiple tuis just take the first one
                        if len(tui.split(',')) > 1:
                            tui = tui.split(',')[0]

                    desc = None
                    if 'def' in df.columns:
                        tmp = str(df.iloc[ind]['def']).strip()
                        if len(tmp) > 0:
                            desc = tmp

                    self.cdb.add_concept(cui, name, onto, tokens, snames,
                                         isupper=isupper, is_pref_name=is_pref_name,
                                         tui=tui, pretty_name=pretty_name, desc=desc)

                    # If we had a desc we can also add vectors
                    if desc is not None:
                        doc = self.nlp(clean_def(desc))
                        cntx = []
                        for word in doc:
                            if not word._.to_skip:
                                for w in self.tokenizer(word.lower_):
                                    if w in self.vocab and self.vocab.vec(w) is not None:
                                        cntx.append(self.vocab.vec(w))
                        if len(cntx) > 1:
                            cntx = np.average(cntx, axis=0)
                            self.cdb.add_context_vec(cui, cntx, cntx_type='LONG')

                            # Increase the cui count because we added a context
                            if cui in self.cdb.cui_count:
                                self.cdb.cui_count[cui] += 1
                            else:
                                self.cdb.cui_count[cui] = 1

        return self.cdb
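# A usage sketch (an assumption, not in the original file): PrepareUMLS with a
# word-vector Vocab so that `def` columns also produce 'LONG' context vectors.
# The import path, the `load_dict`/`save_dict` calls and the file names are
# assumed from the historical MedCAT API, not confirmed by this snippet.
if __name__ == '__main__':
    from medcat.utils.vocab import Vocab  # assumed import path

    vocab = Vocab()
    vocab.load_dict('vocab.dat')  # hypothetical vocab file with word vectors
    prep = PrepareUMLS(vocab=vocab)
    cdb = prep.prepare_csvs(['umls_concepts.csv'])
    cdb.save_dict('cdb.dat')  # hypothetical output path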