def stem_words(words):
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
def porter_stemmer(texto):
    # Appends both the Porter stem and the Lancaster stem for every input word.
    text = []
    porter = PorterStemmer()
    lancaster = LancasterStemmer()
    for word in texto:
        text.append(porter.stem(word))
        text.append(lancaster.stem(word))
    return text
def stemming(df):
    new_df = []
    lancaster = LancasterStemmer()
    for i in range(len(df)):
        tokens = word_tokenize(df['Text'][i])
        stems = [lancaster.stem(word) for word in tokens]
        new_df.append([' '.join(stems), df['Label'][i]])
    new_df = pd.DataFrame(new_df, columns=['Text', 'Label'])
    return new_df
def stem_words(words): """Stem words in list of tokenized words""" stemmer = LancasterStemmer() stemmer = SnowballStemmer('english') stems = [] for word in words: stem = stemmer.stem(word) stems.append(stem) return stems
def stemWords(self, words, stemmer="lancaster"):
    """Stem words in list of tokenized words"""
    if stemmer == "lancaster":
        stemmer = LancasterStemmer()
    elif stemmer == "snowball":
        stemmer = SnowballStemmer("english")
    elif stemmer == "porter":
        stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in words]
    return stems
def WordStemming(sample):
    stemmer = LancasterStemmer()
    tokenWords = word_tokenize(sample)
    stem_sentence = []
    for word in tokenWords:
        stem_sentence.append(stemmer.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence), tokenWords
def stem_word_list(word_list):
    """Stem each word in a list of tokenized words

    Keyword arguments:
    word_list: list of words
    """
    stemmer = LancasterStemmer()
    stems = []
    for word in word_list:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
def stemming(word):
    # Use stemmers for removing morphological affixes from words.
    # Try Porter first, fall back to Lancaster, then to a simple regexp stemmer.
    Portst = PorterStemmer()
    Landst = LancasterStemmer()
    Regst = RegexpStemmer('ing|ed')
    new = Portst.stem(word)
    if new == word:
        new = Landst.stem(word)
    if new == word:
        new = Regst.stem(word)
    return new
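A minimal usage sketch for the fallback chain above, assuming the stemming() helper just defined is in scope; the example words are only illustrative:

from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer

# Each word is passed to Porter first; only words that Porter leaves
# unchanged reach the Lancaster and regexp stemmers.
for word in ['running', 'jumped', 'data']:
    print(word, '->', stemming(word))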
def word_frequence(al, rank):
    lst = LancasterStemmer()
    left = [
        lst.stem(word.lower()) for word in word_tokenize(al)
        if word.lower() not in stopwords.words('english') and len(word) > 2
    ]
    final = FreqDist(left)
    sort = sorted(list(set(final.values())))
    sort = [i for i in sort[::-1]]
    for i in sort[:rank]:  # print the top-ranked stems and their counts
        print([v for v, k in final.items() if k == i], i)
def __stem_words(self, words):
    """Stem words in list of tokenized words"""
    if isinstance(words, str):
        words = words.split(' ')
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
def stemming(text):
    """Return the set of Lancaster stems found in the text."""
    words_list = nltk.word_tokenize(text)
    words_set = set()
    stemmer = LancasterStemmer()
    for word in words_list:
        words_set.add(stemmer.stem(word))
    return words_set
def process_lancaster_stop(text):
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    tokens = word_tokenize(text)
    tokens = [w.lower() for w in tokens]
    stemmer = LancasterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    return words
def stem(word_list):
    '''
    Returns the stems for a list of words
    :param word_list: list of words
    :return: list of stemmed words
    '''
    result = []
    lstemmer = LancasterStemmer()
    for word in word_list:
        result.append(lstemmer.stem(word))
    return result
def string_stemmer(text):
    """
    :param text: words from email
    :return: string with stemmed words
    """
    stemmer = LancasterStemmer()
    stemmed_list = []
    for word in text.split(' '):
        stemmed_list.append(stemmer.stem(word))
    return ' '.join(stemmed_list)
def stem_words(words): """ Stem words in list of tokenized words :param words: :return: """ stemmer = LancasterStemmer() stems = [] for word in words: stem = stemmer.stem(word) stems.append(stem) return stems
def __init__(self, algorithm="TFIDF"):
    self.algorithm = algorithm
    self.stopWords = stopwords.words('english')
    self.wsTok = WhitespaceTokenizer()
    self.stemmer = LancasterStemmer()
    self.countVect = CountVectorizer()
    self.tfidfVect = TfidfVectorizer()
    self.queryData = []
    self.srcData = []
def list_stemmer(word_list):
    """
    :param word_list: list of strings
    :return: list with stemmed words
    """
    stemmer = LancasterStemmer()
    ret_list = []
    for word in word_list:
        ret_list.append(stemmer.stem(word))
    return ret_list
def create_stemming(wrd_tokens):
    print("\n==================== Stemming ====================")
    p_stemmer = PorterStemmer()
    l_stemmer = LancasterStemmer()
    s_stemmer = SnowballStemmer('english')
    wrd_token_count = 0
    for wrd_token in wrd_tokens:
        wrd_token_count += 1
        if wrd_token_count < 7:  # only show the first six tokens
            print(p_stemmer.stem(wrd_token), l_stemmer.stem(wrd_token), s_stemmer.stem(wrd_token))
def stem_words(text):
    """Combines the different forms of the verbs/adverbs/adjectives."""
    text = text.split()
    # LancasterStemmer is rule-based and needs no NLTK corpus download.
    stemmer = LancasterStemmer()
    stems = list()
    for word in text:
        stems.append(stemmer.stem(word))
    return ' '.join(stems)
def prepare_embedding_matrix(max_words, embedding_dim, word_index, embeddings_index,
                             hparams, lower_only=False):
    print('Preparing embedding matrix...')
    np.random.seed(hparams['random_state'])
    porter = PorterStemmer()
    lancaster = LancasterStemmer()
    snowball = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()
    count = 0
    embedding_matrix = np.zeros((max_words, embedding_dim))
    random_vector = np.random.random(embedding_dim)
    for word, i in word_index.items():
        if i >= max_words:
            continue
        if word in embeddings_index and word.lower() not in embeddings_index:
            embeddings_index[word.lower()] = embeddings_index[word]
        embedding_vector = embeddings_index.get(word.lower()) if lower_only else embeddings_index.get(word)
        # https://www.kaggle.com/wowfattie/3rd-place
        if embedding_vector is None:
            embedding_vector = embeddings_index.get(word.lower())
        if embedding_vector is None:
            embedding_vector = embeddings_index.get(word.upper())
        if embedding_vector is None:
            embedding_vector = embeddings_index.get(word.capitalize())
        if embedding_vector is None:
            embedding_vector = embeddings_index.get(porter.stem(word))
        if embedding_vector is None:
            embedding_vector = embeddings_index.get(lancaster.stem(word))
        if embedding_vector is None:
            embedding_vector = embeddings_index.get(snowball.stem(word))
        if embedding_vector is None:
            embedding_vector = embeddings_index.get(lemmatizer.lemmatize(word))
        if word == hparams['tokenizer_oov_token'] or embedding_vector is None:
            embedding_matrix[i] = random_vector
        else:
            embedding_matrix[i] = embedding_vector
            count += 1
    print('Word vectors coverage:', count / max_words)
    print('Embedding matrix shape:', embedding_matrix.shape)
    return embedding_matrix
def preprocess_text(df):
    stemmer = LancasterStemmer()
    lemmatizer = WordNetLemmatizer()
    p = inflect.engine()
    # remove special characters
    df['text'] = df['text'].apply(lambda x: re.sub("(\\W)+", " ", x))
    # remove punctuation
    df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
    # tokenize
    df['text'] = df['text'].apply(lambda x: nltk.word_tokenize(x))
    # to lower case
    df['text'] = df['text'].apply(lambda x: [word.lower() for word in x])
    # filter stopwords
    df['text'] = df['text'].apply(
        lambda x: [item for item in x if item not in stopwords.words('german')])
    # remove punctuation left inside tokens
    df['text'] = df['text'].apply(
        lambda x: [re.sub(r'[^\w\s]', '', word) for word in x if word != ''])
    # remove non-ASCII characters from list of tokenized words
    df['text'] = df['text'].apply(lambda x: [
        unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')
        .decode('utf-8', 'ignore') for word in x
    ])
    # replace all integer occurrences in list of tokenized words with a textual representation
    df['text'] = df['text'].apply(
        lambda x: [p.number_to_words(word) if word.isdigit() else word for word in x])
    # stemming
    df['text'] = df['text'].apply(lambda x: [stemmer.stem(word) for word in x])
    # lemmatizing
    df['text'] = df['text'].apply(
        lambda x: [lemmatizer.lemmatize(word, pos='v') for word in x])
    return df
def stemming(text, method='lancaster'):
    result = []
    if method == 'lancaster':
        stemmer = LancasterStemmer()
    elif method == 'porter':
        stemmer = PorterStemmer()
    else:
        raise ValueError('Unknown stemming method: {0}'.format(method))
    word_pattern = re.compile("(?:[a-zA-Z]+[-–’'`ʼ]?)*[a-zA-Z]+[’'`ʼ]?")
    words = word_pattern.findall(text)
    for word in words:
        result.append(stemmer.stem(word))
    return result
def set_content_based_on_intents(self):
    all_words = []
    self.steammer = LancasterStemmer()
    for intent in self.intents:
        for pattern in intent["patterns"]:
            words = nltk.word_tokenize(pattern)
            words = [self.steammer.stem(word.lower()) for word in words]
            all_words.extend(words)
            self.sentences.append(words)
            self.tags.append(intent["intent"])
    self.words_bag = sorted(list(set(all_words)))
    self.intents_bag = sorted(list(set(self.tags)))
def __getStems(words):
    #####################################################################################
    # This method returns stemmed words by applying nltk.LancasterStemmer.
    #####################################################################################
    try:
        if words:
            stemmer = LancasterStemmer()
            return [stemmer.stem(word) for word in words if word != ""]
        return words  # return words as is without any changes
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        err = "Error occurred while getting stems of the words '{0}'. Error is: {1}; {2}".format(
            " ".join(words), str(exc_type), str(exc_value))
        raise Exception(err)
def stem(self, input_text):
    tokenizer = RegexpTokenizer(r"\s+", gaps=True)
    stemmed_text = []
    stemmer = LancasterStemmer()
    text = tokenizer.tokenize(str(input_text))
    filtered_text = self.stopword(text)
    for word in filtered_text:
        if word.isalpha():
            stemmed_text.append(stemmer.stem(word).lower())
    return stemmed_text
def main():
    lancaster = LancasterStemmer()
    porter = PorterStemmer()
    snowball = SnowballStemmer("english")
    word_list = ["friend", "friendship", "friends", "friendships", "stabil", "destabilize",
                 "misunderstanding", "universe", "universal", "university", "union",
                 "railroad", "moonlight", "football", "going", "education"]
    print("{0:20}{1:20}{2:20}{3:20}".format("Word", "Porter Stemmer", "Lancaster Stemmer", "Snowball Stemmer"))
    for word in word_list:
        print("{0:20}{1:20}{2:20}{3:20}".format(word, porter.stem(word), lancaster.stem(word), snowball.stem(word)))
async def process_text_nltk_handle(request):
    logger = logging.getLogger("nltk-request")
    logger.info("Process text [NLTK] request")
    request_text = await request.read()
    raw_data = request_text.decode('utf8')
    raw_data_modified = ''
    for raw_string in raw_data.split('.'):
        buff = raw_string.strip().replace(r"\n", "")
        if len(buff) > 1:
            raw_data_modified += buff + '.'
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(raw_data_modified.lower())
    tokens = [x for x in tokens if x not in stop_words]
    result = {"tokens": tokens}
    fdist = FreqDist(tokens)
    result["fdist"] = dict(fdist)  # FreqDist itself is not JSON-serializable
    result["fdist10"] = fdist.most_common(10)
    porter_stemmer_result = []
    lancaster_stemmer_result = []
    pst = PorterStemmer()
    lst = LancasterStemmer()
    for token in tokens:
        porter_stemmer_result.append(pst.stem(token))
        lancaster_stemmer_result.append(lst.stem(token))
    result["porter_stemmer_result"] = porter_stemmer_result
    result["lancaster_stemmer_result"] = lancaster_stemmer_result
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    result["lemmas"] = lemmas
    result["pos_tags"] = nltk.pos_tag(tokens)
    result["ner"] = str(ne_chunk(result["pos_tags"]))  # Tree objects are serialized as strings
    return web.Response(text=json.dumps(result),
                        headers={"Access-Control-Allow-Origin": "*"},
                        content_type="application/json")
def obtain_the_tags_frequency(self):
    b = TextBlob(self.objectiveoutstring)
    # Pipeline: TextBlob -> frequency -> Lancaster stemming -> 850 basic words -> WordNet -> NN/VB flags
    # save dictionary
    wordtags = b.correct().tags
    # The first frequency saving
    stemmer = LancasterStemmer()
    first_frequency = {}
    for item in wordtags:
        temp = stemmer.stem(item[0])
        if Word(temp).synsets and temp not in self.basicwords:
            first_frequency.setdefault(temp, []).append(item[1])
        elif temp in self.basicwords:
            self.basicoverlap += 1
        else:
            self.weirdwords.append(item[0])
    for key in first_frequency.keys():
        length = len(first_frequency[key])
        tags_length = len(set(first_frequency[key]))
        t1 = 0  # stem was tagged as a noun
        t2 = 0  # stem was tagged as a verb
        t3 = 1  # stem was tagged as neither
        for it in set(first_frequency[key]):
            if 'NN' in it:
                t1 = 1
                t3 = 0
            elif 'VB' in it:
                t2 = 1
                t3 = 0
        self.second_frequency.append((key, length, tags_length, t1, t2, t3))
def stem_lem_words(word_list, engine="word_net"):
    """
    :param word_list: a list of words
    :param engine: variable to choose between the available stemmers/lemmatizer
    :return: stemmed/lemmatized list of words
    """
    if engine == "porter":
        porter = PorterStemmer()
        return [porter.stem(word) for word in word_list]
    elif engine == "lancaster":
        lancaster = LancasterStemmer()
        return [lancaster.stem(word) for word in word_list]
    elif engine == "word_net":
        lem = WordNetLemmatizer()
        return [lem.lemmatize(word) for word in word_list]
    else:
        raise ValueError("Unknown engine: {0}".format(engine))
def run(self):
    super().run()
    # stem words in input file
    stemmer = LancasterStemmer()
    output = open(self.output, 'w+')
    with open(self.input, mode='r') as input_file:
        for line in input_file:
            for word in line.split():
                output.write(stemmer.stem(word) + ' ')
            output.write('\n')
    output.close()
def stem_and_rem_stopwords(documents: list, additional_stopwords: list = []):
    """Returns a list of documents that have been stemmed and had stopwords removed."""
    s_words = set(stopwords.words('english') + additional_stopwords)
    stemmer = LancasterStemmer()
    processed_documents = []
    for document in documents:
        tokens = document.split()
        processed_documents.append(
            ' '.join([stemmer.stem(token) for token in tokens if token not in s_words])
        )
    return processed_documents
class Baseline(object):

    def __init__(self):
        self.stemmer = LancasterStemmer()
        self.stopwords = set([self.stemmer.stem(word) for word in stopwords])

    def stem(self, doc):
        return [self.stemmer.stem(word) for word in doc]

    def doc_similarity(self, s1, s2, pairId=None):
        s1 = s1.lower().split()
        s2 = s2.lower().split()
        s1 = self.stem(s1)
        s2 = self.stem(s2)
        s1 = set(s1) - self.stopwords
        s2 = set(s2) - self.stopwords
        return float(len(s1.intersection(s2))) / (len(s1) + len(s2))
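A possible way to exercise the Baseline class above; this sketch assumes `stopwords` is a plain list of stopword strings (taken here from NLTK) and that the class definition is in scope:

from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import LancasterStemmer

stopwords = nltk_stopwords.words('english')  # the module-level list the class expects

baseline = Baseline()
# Similarity is the size of the shared stemmed-token set divided by the sum of both set sizes.
print(baseline.doc_similarity("The cats were running home", "A cat ran home yesterday"))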
class Tokenizer():

    def __init__(self):
        self.stemmer = LancasterStemmer()

    def __call__(self, text):
        return [self.stemmer.stem(token) for token in word_tokenize(text)]
class Tokenizer(object):

    def __init__(self):
        self.tok = RegexpTokenizer(r'some_regular_expression')
        self.stemmer = LancasterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(token) for token in self.tok.tokenize(doc)]
def __init__(self, analyze_attributes_from_schema, does_process_similar_words, precision):
    # Load medium or big corpus of English words.
    self.nlp_corpus = spacy.load("en_core_web_md")
    self.analyze_attributes_from_schema = analyze_attributes_from_schema
    self.precision = precision
    self.does_process_similar_words = does_process_similar_words
    # Aggressive stemming preferred.
    self.lancester = LancasterStemmer()
    # Use WordNet for lemmas.
    self.wordnet_lemmatizer = WordNetLemmatizer()
    # Managers used throughout the program.
    self.noise_manager = NoiseManager()
    self.trait_extractor = TraitExtractor()
    self.trait_analyzer = TraitAnalyzer()
    # Extract traits from CDM Schema documents folder.
    self.trait_list = self.trait_extractor.extract_traits(
        'CDM.SchemaDocuments/', self.trait_files)
    # The list of stemmed trait features.
    self.stem_traits = self.trait_analyzer.stem_traits(
        self.trait_list, self.lancester, self.wordnet_lemmatizer, self.noise_manager)
class TFIDF:

    def __init__(self):
        self.pickle_docs = "tfidf_pickle_docs"
        self.pickle_corpus = "tfidf_pickle_corpus"
        self.lan = LancasterStemmer()
        self.construct()
        #print sorted(self.words.iteritems(), key=operator.itemgetter(1), reverse=True)[:20]

    def clean(self, word):
        '''cleans a word or returns None if it should not be considered'''
        word = word.strip(string.punctuation)
        word = self.lan.stem(word)
        return word

    def construct(self):
        corpus = {}
        # Check to see if we should simply load a pickle
        if os.path.isfile(self.pickle_docs):
            with open(self.pickle_docs) as docs_file:
                current_doclist = pickle.load(docs_file)
            if os.listdir('articles/') == current_doclist:
                # current article list is the same as pickled article list,
                # so we want to just load the stored pickled corpus data
                with open(self.pickle_corpus) as corpus_file:
                    self.words = pickle.load(corpus_file)
                self.n = len(current_doclist)
                return
        # If we don't load a pickle, build the corpus from the articles/ dir
        num_docs = 0.0
        for file_name in os.listdir('articles/'):
            num_docs += 1
            doc = {}
            with open("articles/" + file_name) as article:
                for line in article:
                    for word in tokenize(line, "word", return_spans=False):
                        word = self.clean(word)
                        doc[word] = 1
            for key in doc.keys():
                corpus[key] = corpus.get(key, 0) + 1
        self.words = corpus
        self.n = num_docs
        print "Pickling a new TFIDF corpus"
        # pickle corpus and document list
        with open(self.pickle_docs, "w") as docs_file:
            pickle.dump(os.listdir('articles/'), docs_file)
        with open(self.pickle_corpus, "w") as corpus_file:
            pickle.dump(self.words, corpus_file)

    def weight(self, word, count, debug=False):
        if debug:
            return (word, count, self.words.get(word, 1))
        return count * math.log(self.n / self.words.get(word, 1))
class StemTokenizer(object):

    def __init__(self, stemmer_type='Porter'):
        self.stemmer_type = stemmer_type
        if self.stemmer_type == 'Porter':
            self.stemmer = PorterStemmer()
        elif self.stemmer_type == 'Lancaster':
            self.stemmer = LancasterStemmer()
        else:
            raise Exception('Invalid stemmer_type = {0}'.format(stemmer_type))

    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]
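A sketch of how such a callable tokenizer is commonly plugged into scikit-learn, assuming scikit-learn and NLTK's punkt data are installed and the class above is in scope; the two documents are made up:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["The cats were running", "A cat ran"]  # toy corpus
vectorizer = TfidfVectorizer(tokenizer=StemTokenizer('Lancaster'))
X = vectorizer.fit_transform(docs)  # each document is tokenized and stemmed before TF-IDF weighting
print(vectorizer.get_feature_names_out())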
def word_refiner(*args):
    Portst = PorterStemmer()
    Landst = LancasterStemmer()
    Regst = RegexpStemmer('ing|ed|ly|lly')
    args = [i for i in args if isinstance(i, unicode)]
    for w in map(str, args):
        if w in dic1:
            yield w
        else:
            st1 = Portst.stem(w)
            if st1 in dic1:
                yield st1
            else:
                st2 = Landst.stem(w)
                if st2 in dic1:
                    yield st2
                else:
                    st3 = Regst.stem(w)
                    if st3 in dic1:
                        yield st3
                    else:
                        yield w
class LancasterTokenizer(object):

    def __init__(self):
        self.ls = LancasterStemmer()
        self.rx = RegexpTokenizer(r"(?u)\b\w\w+\b")

    def isNumber(self, s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    def __call__(self, doc):
        return [self.ls.stem(t) for t in self.rx.tokenize(doc) if not self.isNumber(t)]
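Called directly, the tokenizer above drops purely numeric tokens and stems the rest; a small illustration, assuming the class is in scope (the input sentence is made up):

tokenizer = LancasterTokenizer()
# '2024' matches the token regexp but is filtered out by isNumber();
# the remaining tokens are passed to the Lancaster stemmer.
print(tokenizer("Friendly runners finished the 2024 marathon"))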
def preprocess(sentence):
    output_list = []
    #CASE FOLDING [NOT COMPLETE]
    sentence = sentence.lower()
    #DATA CLEANING: strip URLs, mentions, hashtags, retweet markers and punctuation
    sentence = re.sub(r'https?://\S+|t\.co/\S+', '', sentence)
    sentence = sentence.replace('@', '')
    sentence = sentence.replace('#', '')
    sentence = re.sub(r'\brt\b', '', sentence)
    for ch in [',', '!', '?', '.', '\'', '\"', ':']:
        sentence = sentence.replace(ch, '')
    #REMOVE REPEATED CHARS
    #sentence = re.sub(r'(\w)\1+', r'\1', sentence)
    #TOKENIZE
    tt = TweetTokenizer()
    temp = tt.tokenize(sentence)
    #REMOVE STOP WORDS
    stop = stopwords.words('english')
    #STEMMING
    ls = LancasterStemmer()
    newtemp = [eachword for eachword in temp if eachword not in stop]
    for eachword in newtemp:
        output_list.append(ls.stem(eachword))
    return output_list
class Tokenizer():
    """ Tokenizes and stems text using NLTK libraries """

    def __init__(self):
        """ Constructs a tokenizer object """
        self.stemmer = LancasterStemmer()

    def __call__(self, text):
        """
        Tokenizes text

        :param text: the text to tokenize
        :type text: str or unicode
        :return: a list of tokens
        :rtype: list of (str or unicode)
        """
        return [self.stemmer.stem(token) for token in word_tokenize(text)]
def stem_document(document):
    from nltk.stem import LancasterStemmer
    stemmer = LancasterStemmer()
    # Note: LancasterStemmer.stem() expects a single word; passing a whole
    # document only applies the suffix rules to the end of the string.
    return stemmer.stem(document)
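Since LancasterStemmer.stem() works on one word at a time, a per-token variant is usually closer to what is intended; a sketch (assuming NLTK's punkt tokenizer data is available):

from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize

def stem_document_tokens(document):
    """Stem every token in the document and re-join the stems with spaces."""
    stemmer = LancasterStemmer()
    return ' '.join(stemmer.stem(token) for token in word_tokenize(document))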
class TFIDF(object):

    def __init__(self, tfidf_file, id2wordFile=None):
        self.model = models.TfidfModel.load(tfidf_file)
        self.stemmer = LancasterStemmer()
        self.stopwords = set([self._preprocess_word(word) for word in stopwords])
        #self.stem_model()
        print "done"

    def _preprocess_word(self, word):
        return self.stemmer.stem(word.lower())

    def stem(self, doc):
        return [self.stemmer.stem(word) for word in doc]

    def stem_model(self):
        print "stemming"
        new_id2word = corpora.Dictionary()
        # Create a new dictionary with the stemmed terms and summed document frequencies
        for termid, freq in self.model.dfs.iteritems():
            stemmed_word = self.stemmer.stem(self.model.id2word[termid])
            if stemmed_word in new_id2word.token2id:
                stemmed_id = new_id2word.token2id[stemmed_word]
            else:
                stemmed_id = len(new_id2word.token2id)
                new_id2word.token2id[stemmed_word] = stemmed_id
                new_id2word.dfs[stemmed_id] = 0
            new_id2word.dfs[stemmed_id] += freq  # add df from old dictionary
        new_id2word.num_docs = self.model.id2word.num_docs
        new_id2word.num_nnz = self.model.id2word.num_nnz
        new_id2word.num_pos = self.model.id2word.num_pos
        self.model.id2word = new_id2word
        self.model.dfs = self.model.id2word.dfs
        self.model.idfs = precompute_idfs(self.model.wglobal, self.model.dfs, self.model.num_docs)
        self.model.save('models/all_lancaster.tfidfmodel')
        print len(new_id2word)
        print "done stemming"

    def restrict_vocab(self, corpus):
        vocab = set()
        for doc in corpus:
            for idx, freq in doc:
                vocab.add(idx)
        dfs = {}
        idfs = {}
        for idx in vocab:
            dfs[idx] = self.model.dfs[idx]
            idfs[idx] = self.model.idfs[idx]
        self.model.dfs = dfs
        self.model.idfs = idfs

    def to_bow(self, doc):
        doc = [self._preprocess_word(word) for word in doc.lower().split() if word not in self.stopwords]
        return self.model.id2word.doc2bow(doc)

    def doc_similarity(self, s1, s2, pairId=None):
        tfidf1 = self.model[self.to_bow(s1)]
        tfidf2 = self.model[self.to_bow(s2)]
        common_terms = set(zip(*tfidf1)[0]) & set(zip(*tfidf2)[0])
        similarity = 0.
        tfidf_total = 0.
        for term, freq in tfidf1 + tfidf2:
            if term in common_terms:
                similarity += freq
            tfidf_total += freq
        val = math.sqrt(similarity / tfidf_total) * 5.
        if val < 1.:
            val += 1.
        return val
def LancasterTokenizer(s):
    from nltk import word_tokenize
    from nltk.stem import LancasterStemmer
    stemmer = LancasterStemmer()
    return [stemmer.stem(t) for t in word_tokenize(s)]
class LancasterTokenizer(object):

    def __init__(self):
        self.wnl = LancasterStemmer()

    def __call__(self, doc):
        return [self.wnl.stem(t) for t in wordpunct_tokenize(doc)]
__author__ = "pratap" # Porter Stemmer from nltk.stem import PorterStemmer stemmer = PorterStemmer() print stemmer.stem("cooking") print stemmer.stem("cookery") # Lancaster Stermmer from nltk.stem import LancasterStemmer lanc_stemmer = LancasterStemmer() print lanc_stemmer.stem("cooking") print lanc_stemmer.stem("cookery")
def __init__(self): """ Constructs a tokenizer object """ self.stemmer = LancasterStemmer()
import nltk
from nltk.stem import LancasterStemmer

stemmerlan = LancasterStemmer()
print(stemmerlan.stem('working'))
print(stemmerlan.stem('happiness'))
        Tokens2.append(w)
    # fix ascii again: drop characters outside the ASCII range
    Tokens2 = [''.join(char for char in w if ord(char) <= 128) for w in Tokens2]
    # stemming
    Tokens3 = []
    for w in Tokens2:
        Tokens3.append(lanStem.stem(w))
    return Tokens3


test = clean(tesSum1)

# read files
tesSum1 = teslaSummary1.read()
tesSum5 = teslaSummary5.read()

#stpWrds = set(stopwords.words("english"))
#print stpWrds

sum1SentTok = sentTok(tesSum1)
sum2SentTok = sentTok(tesSum2)
sum1WordTok = wordTok(tesSum1)
sum2WordTok = wordTok(tesSum2)

for w in sum1WordTok:
    print(LancasterStemmer().stem(w))
print remove_repeated_characters(sample_sentence_tokens)

# porter stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()
print ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped')
print ps.stem('lying')
print ps.stem('strange')

# lancaster stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()
print ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped')
print ls.stem('lying')
print ls.stem('strange')

# regex stemmer
from nltk.stem import RegexpStemmer
rs = RegexpStemmer('ing$|s$|ed$', min=4)
print rs.stem('jumping'), rs.stem('jumps'), rs.stem('jumped')
print rs.stem('lying')
def stemmed(self, word):
    stemmer = LancasterStemmer()
    return stemmer.stem(word)
# -*- coding: utf-8 -*-
# <nbformat>2</nbformat>

# <markdowncell>

# <h2>Stemming Words</h2>
# <p>Stemming is the process of removing <em>affixes</em> from a word to obtain its root, or <em>stem</em>. For example, the stem of
# <strong>growing</strong> is <strong>grow</strong>.</p>
# <p>NLTK includes 4 stemming algorithms, 3 of which are demonstrated below. The fourth, <em>Snowball</em>, which also supports
# languages other than English, is not covered here but is in the text.</p>

# <codecell>

from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer
porter = PorterStemmer()
lancaster = LancasterStemmer()
reg = RegexpStemmer('ing')
g = 'growing'
print 'Porter yields: ', porter.stem(g)
print 'Lancaster yields: ', lancaster.stem(g)
print 'Regexp yields: ', reg.stem(g)

# <markdowncell>

# <p>The output of various words can be different between stemmers:</p>

# <codecell>

g = 'cookery'
print 'Porter yields: ', porter.stem(g)
print 'Lancaster yields: ', lancaster.stem(g)
    return False


def isMedium(url):
    print(url)
    try:
        a = re.search(r'^http://thenextweb.+?', url)
    except UnicodeEncodeError:
        print('encode error')
    else:
        print('encode success')
    if a:
        return True
    else:
        return False


stem = LancasterStemmer()
wnl = WordNetLemmatizer()
wordStemed = ''
wordLemmatized = ''
urlBegin = 'http://thenextweb.com/section/tech/'
urlUnused.add(urlBegin)
urlUnused.add('http://thenextweb.com/apple/2016/10/13/apple-in-talks-with-australian-company-to-bring-dynamic-keyboard-tech-to-macbooks/')
i = 0
r = 800
# load url
while r < 1200:
    tempArticle = ''
    url = urlUnused.pop()
    # print url
    if url not in urlUsed:
        headers = {'User-Agent': user_agent}