from nltk import pos_tag
from nltk.tokenize import ToktokTokenizer


def replace_proper_nouns_in_line(line, to_replace):
    toktok = ToktokTokenizer()
    tokenized = toktok.tokenize(line)
    tagged_sent = pos_tag(tokenized)
    for i in range(len(tokenized)):
        # For every proper noun (NNP), keep its first two characters and
        # append the replacement string.
        if tagged_sent[i][1] == "NNP":
            tokenized[i] = tokenized[i][:2] + to_replace
    return tokenized
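# Usage sketch for replace_proper_nouns_in_line (assumes the NLTK
# 'averaged_perceptron_tagger' model has been downloaded); tagger output can
# vary, so the result shown is indicative only.
print(replace_proper_nouns_in_line("Alice met Bob in Paris", "X"))
# e.g. ['AlX', 'met', 'BoX', 'in', 'PaX']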
from typing import List

import pymorphy2
from nltk.tokenize import ToktokTokenizer


def normalize_sentences(sentences: List[str]) -> List[str]:
    # RUSSIAN_ALPHABET is a module-level constant defined elsewhere.
    new_sentences = []
    tokenizer = ToktokTokenizer()
    morph = pymorphy2.MorphAnalyzer()
    for line in sentences:
        line = line.lower()
        # Keep only characters of the Russian alphabet; blank out the rest.
        line = ''.join(c if c in RUSSIAN_ALPHABET else ' ' for c in line)
        # Replace each token with its pymorphy2 lemma (normal form).
        line = ' '.join(morph.parse(word)[0].normal_form
                        for word in tokenizer.tokenize(line))
        new_sentences.append(line)
    return new_sentences
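# RUSSIAN_ALPHABET is not shown in the snippet; a plausible definition (an
# assumption) plus a usage sketch. Exact lemmas depend on the pymorphy2
# dictionaries, so the output is indicative only.
RUSSIAN_ALPHABET = set('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')
print(normalize_sentences(["Коты спали на крышах."]))
# e.g. ['кот спать на крыша']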
def __init__(self, synsets: List[List[str]], tokenizer=None):
    self.tokenizer = tokenizer or ToktokTokenizer()
    list_synsets = list()
    max_word_token_list_len = 0
    # Tokenize every word of every synset, tracking the longest token list.
    for synset in synsets:
        tokenized_synset = []
        for word in synset:
            tokenized_word = self.tokenizer.tokenize(word)
            max_word_token_list_len = max(max_word_token_list_len,
                                          len(tokenized_word))
            tokenized_synset.append(tokenized_word)
        list_synsets.append(tokenized_synset)
    # Map each remaining form (as a token tuple) to the canonical first form.
    dict_synsets = dict()
    for synset in list_synsets:
        tag = synset[0]
        for word in synset[1:]:
            dict_synsets[tuple(word)] = tuple(tag)
    self.synsets = dict_synsets
    self.max_word_token_list_len = max_word_token_list_len
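# Standalone illustration (not the original class) of the mapping the
# constructor builds: the first entry of each synset is the canonical form,
# and every other entry maps to it as a token tuple.
from nltk.tokenize import ToktokTokenizer

tok = ToktokTokenizer()
synsets = [["big", "large", "very big"]]
index = {}
for synset in [[tok.tokenize(w) for w in s] for s in synsets]:
    canonical = synset[0]
    for word in synset[1:]:
        index[tuple(word)] = tuple(canonical)
print(index)  # {('large',): ('big',), ('very', 'big'): ('big',)}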
from nltk import ToktokTokenizer
from readtxt import readtxt
from pattern.nl import attributive, predicative, parse

ConceptsFile = '/home/sander/Studie/Stage/fuzzy-octo-tribble/Entities/Concepts_UMLS.txt'
tokenize = ToktokTokenizer().tokenize


def IndexLastComma(Tokens):
    # Return the index of the last comma token.
    assert ',' in Tokens
    i = len(Tokens) - 1
    while Tokens[i] != ',':
        i -= 1
    return i


def Variants(Concept):
    TC = tokenize(Concept)
    # Lowercase all tokens except for capitalized abbreviations
    TokenizedConcept = []
    for Token in TC:
        if Token.upper() == Token:
            TokenizedConcept.append(Token)
        else:
            TokenizedConcept.append(Token.lower())
    Variants = [TokenizedConcept]
    # First variant removes specification between brackets.
    # But is this always sound???
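# Usage sketch for IndexLastComma: Toktok separates commas into their own
# tokens, so the last comma in this example lands at index 4.
print(IndexLastComma(tokenize('aspirin, oral tablet, 81 mg')))  # 4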
def __init__(self, tokenizer=None):
    self.tokenizer = tokenizer or ToktokTokenizer()
def spellfix(s: str, corpora: Corpora, fix_threshold: float) -> str:
    # Tokenize with Toktok, spell-correct each token, and re-join.
    # Corpora and spellfix_word are defined elsewhere in the project.
    tokenizer = ToktokTokenizer()
    return ' '.join(
        spellfix_word(word, corpora, fix_threshold)
        for word in tokenizer.tokenize(s))
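# spellfix_word is not shown above; a minimal illustrative stand-in (an
# assumption, not the project's implementation) that treats the corpora as a
# plain word list and applies the fix threshold as a difflib similarity cutoff:
import difflib

def spellfix_word(word, corpora, fix_threshold):
    matches = difflib.get_close_matches(word, corpora, n=1, cutoff=fix_threshold)
    return matches[0] if matches else word

print(spellfix_word('tokne', ['token', 'taken'], 0.8))  # 'token'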
def __init__(self, tokenizer=None):
    tokenizer = tokenizer or ToktokTokenizer()
    super(CoreNLPChunkingExtractor, self).__init__(tokenizer)
    # CoreNLPParser talks to a CoreNLP server (http://localhost:9000 by
    # default), so one must be running when the parser is used.
    self.parser = CoreNLPParser()
def _tokenize_text(s):
    tokenizer = ToktokTokenizer()
    return tokenizer.tokenize(s)
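# Quick demonstration: Toktok splits punctuation away from words.
print(_tokenize_text("Hello, world!"))  # ['Hello', ',', 'world', '!']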
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import ToktokTokenizer


# Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    text = re.sub(pattern, '', text)
    return text


# Stemming the text
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text


# Tokenization of text
tokenizer = ToktokTokenizer()
# set stopwords to english
stop_words = set(stopwords.words('english'))


# removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [
            token for token in tokens if token not in stop_words
        ]
    else:
        filtered_tokens = [
            token for token in tokens if token.lower() not in stop_words
        ]
    return ' '.join(filtered_tokens)
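# End-to-end sketch of the three helpers above (Porter stems are not always
# dictionary words, so the stemmed output is approximate):
sample = "The 3 runners were running quickly!"
cleaned = remove_special_characters(sample)  # 'The  runners were running quickly'
stemmed = simple_stemmer(cleaned)            # e.g. 'the runner were run quickli'
print(remove_stopwords(stemmed, is_lower_case=True))  # e.g. 'runner run quickli'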
def update_with_sentence(self, s: str):
    tokenizer = ToktokTokenizer()
    self.update_with_list(tokenizer.tokenize(s))
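# Minimal host-class sketch (hypothetical; only the method above appears in
# the snippet) showing how update_with_sentence could feed a token counter:
from collections import Counter
from nltk.tokenize import ToktokTokenizer

class TokenCounter:
    def __init__(self):
        self.counts = Counter()

    def update_with_list(self, tokens):
        self.counts.update(tokens)

    def update_with_sentence(self, s: str):
        tokenizer = ToktokTokenizer()
        self.update_with_list(tokenizer.tokenize(s))

tc = TokenCounter()
tc.update_with_sentence("to be or not to be")
print(tc.counts)  # Counter({'to': 2, 'be': 2, 'or': 1, 'not': 1})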