def stemming(words_l, type="PorterStemmer", lang="english", encoding="utf8"):
    supported_stemmers = [
        "PorterStemmer", "SnowballStemmer", "LancasterStemmer", "WordNetLemmatizer"]
    if type is False or type not in supported_stemmers:
        return words_l
    else:
        l = []
        if type == "PorterStemmer":
            stemmer = PorterStemmer()
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        elif type == "SnowballStemmer":
            stemmer = SnowballStemmer(lang)
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        elif type == "LancasterStemmer":
            stemmer = LancasterStemmer()
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        elif type == "WordNetLemmatizer":  # TODO: context
            wnl = WordNetLemmatizer()
            for word in words_l:
                l.append(wnl.lemmatize(word).encode(encoding))
        return l
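# A minimal, hypothetical usage of stemming() above; it assumes the NLTK
# stemmer classes are imported (e.g. from nltk.stem import PorterStemmer,
# SnowballStemmer, LancasterStemmer, WordNetLemmatizer) and a Python 2
# runtime, where the .encode() calls return byte strings.
print(stemming(["running", "flies"], type="SnowballStemmer"))
# -> ['run', 'fli']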
def _log_likelihood(answer_text, stemmed_vocabulary, distrib_matrix):
    LL = 0
    if answer_text != '':
        tokens = word_tokenize(str(answer_text), language='english')
        porter_stemmer = PorterStemmer()
        unique_wordcount = len(stemmed_vocabulary)
        """
        for each unique w in words:
            Cw = count of w in answer_text
            PwM = self.distrib_matrix[stemmer(w)]
        unique_wordcount = len(tokenize(answer_text))
        """
        for w in tokens:
            _w = w.strip().lower()
            Cw = 0
            for _ in answer_text.split():
                if _w == _.strip().lower():
                    Cw += 1
            try:
                w_stem = porter_stemmer.stem(_w.decode('utf-8', 'replace').encode('ascii', 'replace'))
            except AttributeError:
                w_stem = porter_stemmer.stem(_w)
            try:
                PwM = distrib_matrix[w_stem]
            except KeyError:
                # A KeyError means the frequency is equal to the cutoff point, 1
                PwM = 1
            LL += (Cw * log(float(PwM)))
        try:
            LL = "{0:.2f}".format(LL / float(unique_wordcount))
        except ZeroDivisionError:
            LL = 0
    return LL
def get_ngram_features(self):
    stemmer = PorterStemmer()
    top_features = [(stemmer.stem(token) + "__TOP__", True) for token in self.top_text]
    bottom_features = [(stemmer.stem(token) + "__BOTTOM__", True) for token in self.bottom_text]
    all_features = [(stemmer.stem(token) + "__ALL__", True) for token in self.all_text]
    self.ngram_features = dict(top_features + bottom_features + all_features)
def stem(input):
    from nltk import PorterStemmer
    stemmer = PorterStemmer()
    stemmed_training_input = []
    stemmed_testing_input = []
    for training_example in input['training']:
        word_list = training_example.split()
        stemmed_training_input.append(' '.join([stemmer.stem(word) for word in word_list]))
    for testing_example in input['testing']:
        word_list = testing_example.split()
        stemmed_testing_input.append(' '.join([stemmer.stem(word) for word in word_list]))
    result = {'training': stemmed_training_input,
              'training_labels': input['training_labels'],
              'testing': stemmed_testing_input,
              'testing_labels': input['testing_labels']}
    return result
def openAndProcessingFiles(path, resultDict):  # Main function
    for filename in os.listdir(os.getcwd() + path):
        # Open the file and process each file
        thisFile = open(os.getcwd() + path + '/' + filename, 'r')
        # Store the file as a string for removing HTML tags
        currentTextString = " ".join(thisFile.read().split())
        # Remove HTML tags (string)
        textAfterHtmlRemovingString = re.sub('<[^>]*>', '', currentTextString)
        # Convert the string to a list so the text contains only words
        textAfterHtmlRemovingList = textAfterHtmlRemovingString.split()
        textRemovingUnnecessaryCharactersList = [removeUnnecessaryCharacters(word)
                                                 for word in textAfterHtmlRemovingList]
        textRemovingUnnecessaryCharactersList = [word for word in textRemovingUnnecessaryCharactersList
                                                 if word is not None]
        stop_words = set(stopwords.words('english'))
        # By analyzing the previous result set, continually add new stopwords
        stop_words.update(['texthtml', 'html', 'server', 'email', 'date', 'gmt', 'www'])
        # Remove stopwords
        textAfterStopwordsRemovingList = [word for word in textRemovingUnnecessaryCharactersList
                                          if word not in stop_words]
        # Stemming
        stemmer = PorterStemmer()
        for eachWord in textAfterStopwordsRemovingList:
            eachWord = stemmer.stem(eachWord)
            storeToResultDict(eachWord, resultDict)
        thisFile.close()
def stemm(cls, tokens):
    stemmer = PorterStemmer()
    for i, t in enumerate(tokens):
        tokens[i] = stemmer.stem(t)
    return tokens
def process_email(filename):
    f = open(filename, 'r')
    text = f.read()
    f.close()
    text = text.lower()
    # Replace HTML tags with a space
    text = re.sub(r'<[^<>]+>', ' ', text)
    # Replace numbers with the word "number"
    text = re.sub(r'[0-9]+', 'number', text)
    # Replace URLs with the word "httpaddr"
    text = re.sub(r'(http|https)://[^\s]*', 'httpaddr', text)
    # Replace email addresses with the word "emailaddr"
    text = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', text)
    # Replace dollar signs with the word "dollar"
    text = re.sub(r'[$]+', 'dollar', text)
    # Remove punctuation and non-words and separate the words
    words = re.split('[^a-z0-9]| ', text)
    # Remove empty strings left over from the split
    words = filter(lambda x: x != '', words)
    # Reduce words to their stems
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    return words
class Model(FileIO):
    def __init__(self, *args, **kwargs):
        FileIO.__init__(self, *args, **kwargs)
        self.data_list = []
        self.stemmer = PorterStemmer()
        # The original left these two assignments empty; empty containers are
        # assumed here so the class can at least be instantiated.
        self.score_map = {}
        self.ranges = []

    def isInt(self, val):
        try:
            val = int(val)
            return True
        except ValueError:
            return False

    def cleanString(self, word):
        if (word not in stopwords) and (word != " ") and (self.isInt(word) is False):
            word = word.lower()
            return self.stemmer.stem(word)
        else:
            return None

    def makeScoreList(self):
        '''Initialize a new array of 0s for each range'''
        s_list = [0] * len(self.ranges)
        return s_list
def main():
    with open("sentiment.txt", 'r') as _file:
        stemmer = PorterStemmer()
        features = []
        for words in _file:
            feature = []
            is_sentence = True
            # Skip the polarity label (the first token)
            for word in words.split()[1:]:
                try:
                    word = word.decode("utf-8")
                    if word not in [".", ",", ":", "?", "!"] \
                            and not has_stop_list(word):
                        feature.append(stemmer.stem(word))
                except UnicodeDecodeError:
                    # Ignore garbled (mis-encoded) lines
                    is_sentence = False
                    break
            if is_sentence:
                features.append(feature)
    return features
def review_to_words(raw_review, remove_stopwords=False):
    # BeautifulSoup pulls data out of an HTML file;
    # here it removes HTML tags and markup
    text = BeautifulSoup(raw_review).get_text()
    # Replace numbers with the word "number"
    text = re.sub(r'[0-9]+', 'number', text)
    # Remove punctuation (it could be analyzed for better results)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    # Make a list of words
    words_list = text.split()
    # Download the nltk text data sets, including stop words
    # nltk.download()
    if remove_stopwords:
        # Get stopwords; searching a set is faster than searching a list
        stops = set(stopwords.words('english'))
        # Remove stopwords
        words_list = [word for word in words_list if word not in stops]
    # Reduce words to their stems
    stemmer = PorterStemmer()
    words_list = [stemmer.stem(word) for word in words_list]
    # Return the list of words
    return words_list
def normalize(word):
    '''
    normalize the word for query or indexing
    :param word: unicode string
    :return: unicode string of the normalized term
    '''
    porter = PorterStemmer()
    return porter.stem(word) if word[0].isalpha() else ''
def processContent(self, content):
    stemmer = PorterStemmer()
    tokens = word_tokenize(content)
    tokens = filter(lambda x: len(x) < 20 and x.isalnum(), tokens)
    tokens = [stemmer.stem(token.lower()) for token in tokens]
    tokens = filter(lambda x: x not in stopwords.words('english'), tokens)
    tokens = [str(token) for token in tokens]
    bow = FreqDist(tokens)
    return bow
def main():
    # Use the file defined by BIOC_IN as default if no other is provided
    bioc_in = BIOC_IN
    if len(sys.argv) >= 2:
        bioc_in = sys.argv[1]

    # A BioCReader object is put in place to hold the example BioC XML
    # document
    bioc_reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE)

    # A BioCWriter object is prepared to write out the annotated data
    bioc_writer = BioCWriter(BIOC_OUT)

    # The NLTK Porter stemmer is used for stemming
    stemmer = PorterStemmer()

    # The example input file given above (by BIOC_IN) is fed into
    # a BioCReader object; validation is done by the BioC DTD
    bioc_reader.read()

    # Pass over basic data
    bioc_writer.collection = bioc_reader.collection

    # Get documents to manipulate
    documents = bioc_writer.collection.documents

    # Go through each document
    annotation_id = 0
    for document in documents:
        # Go through each passage of the document
        for passage in document:
            # Stem all the tokens found
            stems = [stemmer.stem(token)
                     for token in wordpunct_tokenize(passage.text)]
            # Add an annotation showing the stemmed version, in the
            # given order
            for stem in stems:
                annotation_id += 1
                # For each token an annotation is created, providing
                # the surface form of a 'stemmed token'.
                # (The annotations are collectively added following
                # a document passage with a <text> tag.)
                bioc_annotation = BioCAnnotation()
                bioc_annotation.text = stem
                bioc_annotation.id = str(annotation_id)
                bioc_annotation.put_infon('surface form', 'stemmed token')
                passage.add_annotation(bioc_annotation)

    # Print the file to screen w/o trailing newline
    # (can be redirected into a file, e.g. output_bioc.xml)
    sys.stdout.write(str(bioc_writer))
    # Write to disk
    bioc_writer.write()
def stemmingword(word_list, stemtype='porter'):
    if stemtype == 'porter':
        stemengine = PorterStemmer()
    else:
        stemengine = LancasterStemmer()
    try:
        filtered_words = [stemengine.stem(token).encode('latin-1', errors='ignore')
                          for token in word_list]
    except UnicodeDecodeError:
        # Translated from Spanish: character-type error, discarding text
        print('Character-type error, discarding text "{}"'.format(' '.join(word_list)))
        return []
    return filtered_words
class PorterStemmerTokenizer(object):
    """A tokenizer that also stems tokens using a porter stemmer"""

    def __init__(self):
        self.non_alphanum_regex = re.compile('[^ 0-9a-zA-Z]')
        self.porter = PorterStemmer()

    def __call__(self, doc):
        doc = self.non_alphanum_regex.sub(' ', doc)
        tokens_alpha = word_tokenize(doc.lower())
        return [self.porter.stem(t) for t in tokens_alpha]
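# Hypothetical usage sketch (not from the original source): a callable
# tokenizer like PorterStemmerTokenizer can be passed to a scikit-learn
# vectorizer via its `tokenizer` parameter; the toy corpus is made up.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=PorterStemmerTokenizer())
X = vectorizer.fit_transform(["the runner was running", "run, runners run"])
# Both documents now share the stemmed features "run" and "runner".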
class Tokenizer(object):
    def __init__(self):
        self.stem = PorterStemmer()
        self.punct = set(string.punctuation) | set(['·™', '..', '...', '....', '.....', '......'])
        self.punct = self.punct | set(["``", "·", "–", "--", "”", "—", "•"])

    def __call__(self, doc):
        return [t.lower() for t in word_tokenize(doc) if t not in self.punct]

    def stem_toke(self, doc):
        return [self.stem.stem(t.lower()) for t in word_tokenize(doc) if t not in self.punct]
def getPosWords():
    stemmer = PorterStemmer()
    stemmedPosTokens = []
    pos = open(r'pos.txt').read()
    pos = re.sub(r"\d", "", pos)
    posWords = nltk.word_tokenize(pos)
    for posWord in posWords:
        stemmedPosWord = stemmer.stem(posWord)
        stemmedPosTokens.append(stemmedPosWord.lower())
    return stemmedPosTokens
def stemmer(self, raw):
    """
    Use the Porter stemmer from the nltk library to stem tokens in raw text.
    """
    tokens = word_tokenize(raw)
    porter = PorterStemmer()
    # lancaster = LancasterStemmer()
    # stem_lancaster = [lancaster.stem(t) for t in tokens]
    stem_porter = [porter.stem(t) for t in tokens]
    return stem_porter
def getUncertainWords():
    stemmer = PorterStemmer()
    stemmedUnTokens = []
    un = open(r'uncertain.txt').read()
    un = re.sub(r"\d", "", un)
    unWords = nltk.word_tokenize(un)
    for unWord in unWords:
        stemmedUnWord = stemmer.stem(unWord)
        stemmedUnTokens.append(stemmedUnWord.lower())
    return stemmedUnTokens
def update_Porter_stemming():  # We use stems occasionally.
    print("Updating stems from Porter algorithm...")
    from nltk import PorterStemmer
    stemmer = PorterStemmer()
    cursor.execute("""SELECT word FROM words WHERE wordid <= 750000 and stem is null;""")
    words = cursor.fetchall()
    for local in words:
        word = ''.join(local)
        if re.match("^[A-Za-z]+$", word):
            query = """UPDATE words SET stem='""" + stemmer.stem(''.join(local)) + \
                    """' WHERE word='""" + ''.join(local) + """';"""
            z = cursor.execute(query)
def tokenize(self, sentence, do_stopwords, do_stemming, use_bigrams):
    words = word_tokenize(sentence)
    words = [w.lower() for w in words if len(w) > 2]
    if do_stopwords:
        words = [w for w in words if w not in stop_set]
    if do_stemming:
        stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]
    if use_bigrams:
        words = bigrams(words)
    return words
def getNegWords():
    stemmer = PorterStemmer()
    stemmedNegTokens = []
    neg = open(r'neg.txt').read()
    neg = re.sub(r"\d", "", neg)
    negWords = nltk.word_tokenize(neg)
    for negWord in negWords:
        stemmedNegWord = stemmer.stem(negWord)
        stemmedNegTokens.append(stemmedNegWord.lower())
    return stemmedNegTokens
def normalize_data(lines):
    norm_words = []
    punctuation = ['!', '.', ';', ':', '\'', '"', '`', '?']
    exceptions = ['\n', '\'s', '\'t', " "]
    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    mega_stop_list = list(itertools.chain(punctuation, exceptions))
    print(" Now Normalizing.......")
    for sentence in lines:
        # The original condition `word not in [stop, "not"]` never filtered
        # anything; dropping stopwords while keeping "not" for the negation
        # step below is the assumed intent.
        words = [stemmer.stem(word.lower())
                 for word in word_tokenize(sentence.rstrip("\n"))
                 if word not in stop or word == "not"]
        norm_words.extend([word for word in negate_Ngram(words)
                           if not re.match("[0-9]+", word)
                           if word.lower() not in mega_stop_list])
    return norm_words
def buildTrainTokensBigram(self):
    self.trainTokens = []
    with open(self.trainingData, 'r') as reviews:
        for review in reviews:
            data = json.loads(review)
            words = word_tokenize(data['text'])
            words = [norm(word) for word in words if norm(word)]
            words = [word for word in words if word not in stwords]
            stemmer = PorterStemmer()
            words = [stemmer.stem(word) for word in words]
            featureSet = self.buildWordFeatureSetBigram(words)
            self.trainTokens.append((featureSet, data['stars']))
def clean_data_to_feed_classifier(tweets):
    st = PorterStemmer()
    stop = stopwords.words('english')
    parsed_tweets = []
    for x in tweets:
        y = x[0]
        # Strip @mentions, non-alphanumerics, and URLs
        y = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", y).split())
        # Collapse repeated characters and drop stopwords
        y = ' '.join(re.sub(r'(.)\1+', r'\1\1', i.lower()) for i in y.split() if i not in stop)
        # Keep stems of real words longer than three characters
        y = ' '.join(st.stem(i) for i in y.split()
                     if len(i) > 3 and i.isalpha() and wordnet.synsets(i))
        # y = punctuations_repl(y)
        parsed_tweets.append(y)
    return parsed_tweets
class EnglishStemmer:
    """
    Stemmer wrapper on Sumy's Stemmer for compatibility reasons with the
    summarizer, but uses nltk's Porter stemmer to do the actual stemming.
    """

    def __init__(self):
        self.__stemmer = PorterStemmer()

    def __call__(self, word):
        return self.__stemmer.stem(word)
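# A hypothetical usage sketch for the wrapper above; the Sumy import is an
# assumption about how the class is meant to be consumed (Sumy summarizers
# accept a stemmer callable in their constructor).
from sumy.summarizers.lsa import LsaSummarizer

summarizer = LsaSummarizer(EnglishStemmer())
# The wrapper is also usable standalone:
print(EnglishStemmer()("running"))  # -> "run"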
def tokenize_normalize(raw):
    '''
    tokenize raw texts
    :param raw: unicode string
    :return: list[unicode]: a list of tokenized unicode

    Example:
        words = tokenize_normalize(line)
    '''
    # Don't use any token that is too long (like a genetic sequence)
    tokens = [t for t in word_tokenize(raw) if len(t) < 20]
    porter = PorterStemmer()
    # Only interested in words
    tokens_n = [porter.stem(t) for t in tokens if t[0].isalpha()]
    # Combine all numbers into one token
    tokens_n = ['NUMBER' if all(a.isdigit() for a in t) else t for t in tokens_n]
    return tokens_n
def update_Porter_stemming(self):  # We use stems occasionally.
    print("Updating stems from Porter algorithm...")
    from nltk import PorterStemmer
    stemmer = PorterStemmer()
    cursor = db.query("""SELECT word FROM words""")
    words = cursor.fetchall()
    for local in words:
        # Could probably take the first element of the tuple as well
        word = ''.join(local)
        # Apostrophes have the same stem as the word, if they're included
        word = word.replace("'s", "")
        if re.match("^[A-Za-z]+$", word):
            query = """UPDATE words SET stem='""" + stemmer.stem(''.join(local)) + \
                    """' WHERE word='""" + ''.join(local) + """';"""
            z = cursor.execute(query)
def review_mapper(self, _, data):
    review = data['text']
    rating = data['stars']
    business_id = data['business_id']
    category = data['category']
    words = word_tokenize(review)
    words = [norm(word) for word in words if norm(word)]
    words = [word for word in words if word not in stwords]
    tagged_words = tagger.tag(words)
    stemmer = PorterStemmer()
    tagged_words = [(stemmer.stem(tagged_word[0]), tagged_word[1])
                    for tagged_word in tagged_words]
    for tagged_word in tagged_words:
        yield (category, tagged_word), (business_id, rating, 1)
def text_preprocessing(text):
    # Lowercase everything
    text = text.lower()
    # Remove punctuation
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    text = regex.sub(" ", text)
    # Remove stopwords
    no_stopwords = [word for word in text.split() if word.lower() not in ext_stopwords]
    text = " ".join(no_stopwords)
    # Stem the words
    stemmer = PorterStemmer()
    text = " ".join([stemmer.stem(w) for w in text.split()])
    return text
def get_non_zero_count(list_of_wiki_files):
    non_zero_count = 0
    porter_stemmer = PorterStemmer()
    for fn in list_of_wiki_files:
        print(fn, non_zero_count)
        sys.stdout.flush()
        start_time = time.time()
        with open(wiki_files_path + fn, 'r') as openfile:
            for line in file_reader_generator(openfile):
                json_dict = json.loads(line)
                file_key = json_dict['id']
                if file_key:
                    text_data = json_dict['text']
                    # Tokenise
                    tokens = word_tokenize(text_data)
                    if tokens:
                        # Lowercase
                        tokens = list(map(lambda x: x.lower(), tokens))
                        # Remove stopwords
                        tokens = list(
                            filter(lambda l_ph: l_ph not in stop_words, tokens))
                        # Remove punctuation and stem
                        tokens = [
                            porter_stemmer.stem((val.translate(translator)))
                            for val in tokens
                        ]
                        # tokens = list(map(lambda val: PorterStemmer().stem(val), tokens))
                        tokens = set(tokens)
                        if '' in tokens:
                            tokens.remove('')
                        non_zero_count += len(tokens)
        end_time = time.time()
        print('time taken', end_time - start_time)
        sys.stdout.flush()
    return non_zero_count
def tokenize_and_normalize(file_name):
    """
    this function takes in a path to a song, reads the song file,
    tokenizes it into words, then stems and lowercases these words.

    INPUT:
        file_name - a path to a file as a string
    OUTPUT:
        normalized_song - a song represented as a list of stems.
    """
    ps = PorterStemmer()
    with open(file_name, 'r') as f:
        song = word_tokenize(f.read().lower())
    normalized_song = [None] * len(song)
    for i, word in enumerate(song):
        normalized_song[i] = ps.stem(word)
    return normalized_song
def frequencyMatrix(sentences):
    frequency_matrix = {}
    stop_words = stopwords.words('english')
    ps = PorterStemmer()
    for sentence in sentences:
        frequency_table = {}
        tokenized_words = word_tokenize(sentence)
        for word in tokenized_words:
            word = word.lower()
            word = ps.stem(word)
            if word in stop_words:
                continue
            if word in frequency_table:
                frequency_table[word] += 1
            else:
                frequency_table[word] = 1
        frequency_matrix[sentence[:15]] = frequency_table
    return frequency_matrix
def twitter_sentiment_analyze(test):
    """
    Pre-processes the data from the input dataset, going through
    tokenization, stemming, and other steps.
    """
    global bow
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    def remove_pattern(text, pattern):
        # re.findall() finds the pattern, i.e. @user, and puts it in a list
        r = re.findall(pattern, text)
        # re.sub() removes @user from the sentences in the dataset
        for i in r:
            text = re.sub(i, "", text)
        return text

    test['Tidy_Tweets'] = np.vectorize(remove_pattern)(test['tweet'], r"@[\w]*")
    test['Tidy_Tweets'] = test['Tidy_Tweets'].str.replace(r"[^a-zA-Z#]", " ")
    test['Tidy_Tweets'] = test['Tidy_Tweets'].apply(
        lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
    tokenized_tweet = test['Tidy_Tweets'].apply(lambda x: x.split())

    from nltk import PorterStemmer
    ps = PorterStemmer()
    tokenized_tweet = tokenized_tweet.apply(lambda x: [ps.stem(i) for i in x])
    for i in range(len(tokenized_tweet)):
        tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
    test['Tidy_Tweets'] = tokenized_tweet
    return test['Tidy_Tweets']
class Stemmer:
    def __init__(self):
        self.ps = PorterStemmer()

    def stem_term(self, token):
        """
        Stem a token.
        :param token: a (term, value) tuple whose first element is the term string
        :return: stemmed term
        """
        return self.ps.stem(token[0])

    def porter_stemmer(self, terms_list):
        for index, w in enumerate(terms_list):
            new_stem = self.stem_term(w)
            if new_stem != w[0]:
                terms_list[index] = (new_stem, terms_list[index][1])
        return terms_list
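# A hypothetical usage sketch for Stemmer above; the (term, position) tuples
# are made up to illustrate the expected input shape.
terms = [("running", 0), ("cats", 1), ("is", 2)]
print(Stemmer().porter_stemmer(terms))
# -> [('run', 0), ('cat', 1), ('is', 2)]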
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()
    # Here i denotes the index of the sentence
    for i, sent in enumerate(sentences):
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1
        frequency_matrix[sent[:15] + str(i)] = freq_table
    return frequency_matrix
def process_sentence(tokens, preprocessing_params):
    if preprocessing_params[1]:
        stopwordlist = set(stopwords.words("english"))
    else:
        stopwordlist = []
    # Create lemmatizer and stemmer.
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    processed_sentence = []
    partofspeech = []
    for word, tag in pos_tag(tokens):
        if len(word) > 1:
            if word not in stopwordlist:
                if tag.startswith('NN'):
                    pos = 'n'  # noun
                elif tag.startswith('VB'):
                    pos = 'v'  # verb
                elif tag.startswith('JJ'):
                    pos = 'a'  # adjective
                elif tag.startswith('RB'):
                    pos = 'r'  # adverb
                else:
                    pos = 'o'  # other
                if pos in ['n', 'v', 'a', 'r']:
                    word = lemmatizer.lemmatize(word, pos)
                else:
                    word = lemmatizer.lemmatize(word)
                # Now stem
                if preprocessing_params[0]:
                    word = stemmer.stem(word)
                processed_sentence.append(word)
                partofspeech.append(pos)
    final_text = ' '.join(processed_sentence)
    final_pos = ' '.join(partofspeech)
    return final_text, final_pos