def lemmatize(data):
    output = []
    lemmatizerEn = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    lemmatizerEs = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES,
                              lookup=spacy.lang.es.LOOKUP)
    bar = ChargingBar('Lemmatizing\t\t\t\t', max=len(data))
    for instance in data:
        new_tweet = {}
        new_tweet['tweetid'] = instance['tweetid']
        new_tweet['tweet'] = instance['tweet']
        new_tweet['tokens'] = []
        new_tweet['langid'] = instance['langid']
        new_tweet['sentiment'] = instance['sentiment']
        for i, word in enumerate(instance['tokens']):
            if instance['langid'][i] == 'lang1':
                new_tweet['tokens'].append(lemmatizerEn.lookup(word))
            elif instance['langid'][i] == 'lang2':
                new_tweet['tokens'].append(lemmatizerEs.lookup(word))
            else:
                new_tweet['tokens'].append(word)
                # new_tweet['tokens'].append(lemmatizerEn.lookup(word))
        output.append(new_tweet)
        new_tweet = {}
        new_tweet['tweetid'] = instance['tweetid']
        new_tweet['tweet'] = instance['tweet']
        new_tweet['tokens'] = []
        new_tweet['langid'] = []
        new_tweet['sentiment'] = instance['sentiment']
        bar.next()
    bar.finish()
    return output
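# Hedged usage sketch (not from the original project): lemmatize() above expects
# a list of dicts with parallel 'tokens' and 'langid' lists, where 'lang1' marks
# English tokens and 'lang2' Spanish ones. The sample data below is hypothetical.
# sample = [{
#     'tweetid': '1',
#     'tweet': 'me gustan los dogs',
#     'tokens': ['me', 'gustan', 'los', 'dogs'],
#     'langid': ['lang2', 'lang2', 'lang2', 'lang1'],
#     'sentiment': 'positive',
# }]
# lemmatized = lemmatize(sample)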
def solve(word_list):
    wnl = stem.WordNetLemmatizer()
    porter = stem.porter.PorterStemmer()
    a = [porter.stem(word) for word in word_list]
    b = [wnl.lemmatize(word) for word in word_list]
    lemmatizer = Lemmatizer()
    c = [lemmatizer.lookup(word) for word in word_list]
    res = {}
    res['a'] = a
    res['b'] = b
    res['c'] = c
    return res
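# Hedged usage sketch (assumes `from nltk import stem` and, for spaCy 2.x,
# `from spacy.lemmatizer import Lemmatizer` are in scope; not part of the
# original snippet):
# result = solve(['studies', 'running', 'feet'])
# result['a'] holds Porter stems, result['b'] WordNet lemmas, and result['c']
# spaCy lookup lemmas (words typically come back unchanged here, since no
# lookup table is loaded).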
def transform(self):
    df2 = self.dataframe.withColumn(
        "_2", regexp_replace(col("_2"), "[\"'./§$&+,:;=?@#–|'<>.^*()%!-]", ""))
    df = df2.withColumn("_2", regexp_replace(col("_2"), "\\s{2,}", ""))
    language_detect = udf(lambda x: detect(x), returnType=StringType())
    df3 = df.withColumn("lang", language_detect('_2'))
    lemmatizer = Lemmatizer(lookup=delook)
    lemmatizer1 = Lemmatizer(lookup=enlook)
    tokenizer = Tokenizer(inputCol="_2", outputCol="words")
    tokenized = tokenizer.transform(df3)
    # print(tokenized)
    lemma = udf(
        lambda x, lang: " ".join([lemmatizer.lookup(i) for i in x])
        if lang == "de"
        else " ".join([lemmatizer1.lookup(i) for i in x]),
        returnType=StringType())
    lemmatized = tokenized.withColumn(
        "stemmed", lemma(col('words'), col('lang'))).drop('words').drop('_2')
    tokenizer = Tokenizer(inputCol="stemmed", outputCol="words")
    tokenized = tokenizer.transform(lemmatized)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    stopwords = remover.loadDefaultStopWords(
        "german") + remover.loadDefaultStopWords("english")
    remover = remover.setStopWords(stopwords)
    newDataSet = remover.transform(tokenized)
    test = newDataSet.withColumn("filtered", explode(col("filtered"))) \
        .groupBy("_1", "filtered") \
        .agg(func.count(func.lit(1)).alias("count")) \
        .sort(col("count").desc())
    return test
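# Hedged call sketch (column names taken from the method above, everything else
# invented): self.dataframe is expected to carry a document id in "_1" and raw
# text in "_2"; `delook`/`enlook` are German/English lookup tables and `detect`
# presumably comes from langdetect. The result is a per-document term-count
# table sorted by count.
# df = spark.createDataFrame([(1, "Die Hunde laufen schnell."),
#                             (2, "The dogs are running fast.")], ["_1", "_2"])
# counts = SomePreprocessor(df).transform()   # wrapping class name is hypothetical
# counts.show()                               # rows of (_1, filtered, count)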
def get_cleaned_text(self):
    '''
    This function cleans the text
    '''
    f = gzip.open(self.path, 'rb')
    self.text = f.read().decode('utf-8')

    # removing stop words and words with only one character
    nlp = spacy.load("en_core_web_sm")
    stop_words = set(nlp.Defaults.stop_words)
    self.text = self.text.lower().split(' ')
    self.text = [
        word for word in self.text
        if word not in stop_words if len(word) > 1
    ]

    # lemmatizing the words
    lemmatizer = Lemmatizer()
    self.text = [lemmatizer.lookup(word) for word in self.text]
    return self.text
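# Hedged usage sketch (the owning class is not shown above; names are
# hypothetical): any object with a .path pointing to a gzipped UTF-8 text file
# would work.
# reader = CorpusReader(path='corpus.txt.gz')
# tokens = reader.get_cleaned_text()
# Note that with a bare Lemmatizer() (older spaCy 2.x, no lookup table loaded),
# lookup() typically returns each word unchanged.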
        docx = nlp(document1)
    # Otherwise, send the heading search result through nlp
    else:
        docx = nlp(returnedSearch)

word_frequencies = {}  # how many times each word occurs in the document
words = []  # every word in the document, stored at the same index as its frequency

# spacy lemmatizer to get root words
lookups = Lookups()
lemmatizer = Lemmatizer(lookups)

for word in docx:  # go through every word in the document
    if word.text not in stopwords:  # as long as the word isn't a stop word
        if lemmatizer.lookup(word.text) not in word_frequencies.keys():
            # if we haven't come across the word yet, its frequency is one
            word_frequencies[lemmatizer.lookup(word.text)] = 1
            words.append(lemmatizer.lookup(word.text))  # add it to words
        else:
            # otherwise it is already in the dict, so increment it
            word_frequencies[lemmatizer.lookup(word.text)] += 1

# Sort through the array by bubble sort
def bubble_sort(arrNum, arrStr):
    def swapNum(i, j):
        arrNum[i], arrNum[j] = arrNum[j], arrNum[i]
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize input
file = open('./Books/text.txt', mode='r')
tokenized_text = tokenizer.tokenize(file.read())

vocabulary = pandas.read_csv("./Lexique382/lexique3_words_90_percentile.tsv",
                             sep='\t').lemme.to_list()

unknownVocab = {}
index = 0
for token in tokenized_text:
    lemma = lemmatizer.lookup(token)
    if lemma not in vocabulary and token.isalpha():
        unknownVocab[token] = index
        tokenized_text[index] = '[MASK]'
    index += 1

# Convert tokens to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])

# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
model.eval()
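# Hedged continuation sketch (not in the original script): with the tensors
# prepared above, the masked positions recorded in unknownVocab could be
# predicted roughly as follows (transformers-style output indexing assumed).
# with torch.no_grad():
#     predictions = model(tokens_tensor)[0]
# for token, pos in unknownVocab.items():
#     predicted_id = int(torch.argmax(predictions[0, pos]))
#     print(token, '->', tokenizer.convert_ids_to_tokens([predicted_id])[0])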
# python -m spacy download en_core_web_sm
# ```

# %%
import spacy

# %%
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
lemmatizer = Lemmatizer(lookups)

# %%
[lemmatizer.lookup(word) for word in word_list]

# %% [markdown]
# Spacy doesn't offer a stemmer (since lemmatization is considered better -- this is an example of being opinionated!)

# %% [markdown]
# Stop words vary from library to library

# %%
nlp = spacy.load("en_core_web_sm")

# %%
sorted(list(nlp.Defaults.stop_words))[:20]

# %% [markdown]
# #### Exercise: What stop words appear in spacy but not in sklearn?
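# %% [markdown]
# One possible approach to the exercise (a sketch, assuming scikit-learn is installed): compare spaCy's stop word set with sklearn's built-in English list.

# %%
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
sorted(nlp.Defaults.stop_words - set(ENGLISH_STOP_WORDS))[:20]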
class SpacyTokenizer:
    """
    Tokenize or lemmatize a list of documents.
    Return list of tokens or lemmas, without sentencizing.
    Works only for the English language.
    """

    def __init__(self, disable: list = None, stopwords: list = None,
                 batch_size: int = None, ngram_range: Tuple[int, int] = None,
                 lemmas=False, lowercase: bool = None,
                 alphas_only: bool = None):
        """
        :param disable: pipeline processors to omit; if nothing should be
            disabled, pass an empty list
        :param stopwords: a set of words to skip
        :param batch_size: a batch size for internal spaCy multi-threading
        :param ngram_range: range for producing ngrams, ex. for unigrams +
            bigrams should be set to (1, 2), for bigrams only should be set
            to (2, 2)
        :param lemmas: whether to perform lemmatization while tokenizing;
            currently works only for the English language
        :param lowercase: whether to lowercase tokens
        :param alphas_only: whether to keep only alphabetic tokens
        """
        if disable is None:
            disable = ['parser', 'ner']
        self._stopwords = stopwords or []
        self.model = spacy.load('en', disable=disable)
        self.tokenizer = Tokenizer(self.model.vocab)
        self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES, LOOKUP)
        self.batch_size = batch_size
        self.ngram_range = ngram_range
        self.lemmas = lemmas
        self.lowercase = lowercase
        self.alphas_only = alphas_only

    @property
    def stopwords(self):
        return self._stopwords

    @stopwords.setter
    def stopwords(self, stopwords: List[str]):
        self._stopwords = stopwords

    def tokenize(self, data: List[str], ngram_range=(1, 1),
                 lowercase=True) -> Generator[List[str], Any, None]:
        """
        Tokenize a list of documents.
        :param data: a list of documents to process
        :param ngram_range: range for producing ngrams, ex. for unigrams +
            bigrams should be set to (1, 2), for bigrams only should be set
            to (2, 2)
        :param lowercase: whether to perform lowercasing or not
        :return: a single processed doc generator
        """
        size = len(data)
        _ngram_range = self.ngram_range or ngram_range
        if self.lowercase is None:
            _lowercase = lowercase
        else:
            _lowercase = self.lowercase
        for i, doc in enumerate(data):
            spacy_doc = self.model(doc)
            logger.debug("Tokenize doc {} from {}".format(i, size))
            if _lowercase:
                tokens = [t.lower_ for t in spacy_doc]
            else:
                tokens = [t.text for t in spacy_doc]
            filtered = self._filter(tokens)
            processed_doc = ngramize(filtered, ngram_range=_ngram_range)
            yield from processed_doc

    def lemmatize(self, data: List[str], ngram_range=(1, 1)) -> \
            Generator[List[str], Any, None]:
        """
        Lemmatize a list of documents.
        :param data: a list of documents to process
        :param ngram_range: range for producing ngrams, ex. for unigrams +
            bigrams should be set to (1, 2), for bigrams only should be set
            to (2, 2)
        :return: a single processed doc generator
        """
        size = len(data)
        _ngram_range = self.ngram_range or ngram_range
        for i, doc in enumerate(data):
            spacy_doc = self.model(doc)
            logger.debug("Lemmatize doc {} from {}".format(i, size))
            tokens = [t.lower_ for t in spacy_doc]
            lemmas = [self.lemmatizer.lookup(word) for word in tokens]
            filtered = self._filter(lemmas)
            processed_doc = ngramize(filtered, ngram_range=_ngram_range)
            yield from processed_doc

    def _filter(self, items, alphas_only=True):
        """
        Filter a list of tokens/lemmas before ngramizing.
        :param items: list of tokens, lemmas or other strings
        :param alphas_only: whether to drop numeric and alpha-numeric items
        :return: filtered list of tokens/lemmas
        """
        _alphas_only = self.alphas_only or alphas_only
        if _alphas_only:
            filter_fn = lambda x: x.isalpha() and x not in self._stopwords
        else:
            filter_fn = lambda x: x not in self._stopwords
        return list(filter(filter_fn, items))
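# Hedged usage sketch (assumes the module's own `ngramize` helper, a configured
# `logger`, and a spaCy 2.x English model are available; not part of the
# original class):
# tokenizer = SpacyTokenizer(stopwords=['the', 'a', 'is'], lemmas=True)
# docs = ['The cats are sleeping.', 'Dogs were barking loudly.']
# lemmas = list(tokenizer.lemmatize(docs))    # lemma unigrams per document
# unigrams = list(tokenizer.tokenize(docs))   # lowercased, stop words removed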