def stem_words(self, words: List[str]) -> List[str]:
    """Stem list of words with PyStemmer.

    The words are first passed through decode_object_from_bytes_if_needed(),
    have the typographic apostrophe normalized to a plain one, and are then
    stemmed with a PyStemmer instance that is created on first use for the
    language returned by self.language_code(). Resulting stems are lowercased.

    :param words: Words to stem.
    :return: Lowercased stems as returned by PyStemmer's stemWords().
    :raises McLanguageException: If the language code or the word list is
        None, or if the PyStemmer instance cannot be initialized.
    """
    language_code = self.language_code()
    words = decode_object_from_bytes_if_needed(words)

    # Validate *before* iterating over the words: previously the None check on
    # "words" came after the normalization below, so a None argument surfaced as
    # a bare TypeError instead of the intended McLanguageException.
    if language_code is None:
        raise McLanguageException("Language code is None.")

    if words is None:
        raise McLanguageException("Words to stem is None.")

    # Normalize apostrophe so that "it’s" and "it's" get treated identically (it's being done in
    # _tokenize_with_spaces() too but let's not assume that all tokens that are to be stemmed go through sentence
    # tokenization first)
    words = [word.replace("’", "'") for word in words]

    # (Re-)initialize stemmer if needed
    if self.__pystemmer is None:
        try:
            self.__pystemmer = PyStemmer(language_code)
        except Exception as ex:
            # Chain the original exception so that the root cause stays visible in the traceback
            raise McLanguageException(
                "Unable to initialize PyStemmer for language '%s': %s" % (language_code, str(ex),)
            ) from ex

    stems = self.__pystemmer.stemWords(words)

    # A count mismatch is only reported, not treated as an error; the stems are returned as-is
    if len(words) != len(stems):
        log.warning("Stem count is not the same as word count; words: %s; stems: %s" % (str(words), str(stems),))

    # Perl's Snowball implementation used to return lowercase stems
    return [stem.lower() for stem in stems]
def __init__(self):
    """Set up the instance by creating its stemmer.

    A PyStemmer object is constructed with the algorithm name 'porter' and
    stored on the instance as ``self.stemmer``.
    """
    porter_stemmer = PyStemmer('porter')
    self.stemmer = porter_stemmer
from __future__ import print_function
import xml.etree.ElementTree as etree
import re, os, heapq, math, operator, string, time, sys
from collections import *
from Stemmer import Stemmer as PyStemmer
import glob

# Python 2 only: re-expose sys.setdefaultencoding() (removed from the sys module
# at interpreter start-up) and switch the implicit str/unicode codec to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# Module-wide stemmer built with the 'porter' algorithm name.
ps = PyStemmer('porter')

# The index directory is the only required command-line argument. A missing
# argument is a usage error, so report it on stderr and exit with a non-zero
# status (previously the script exited with status 0, i.e. "success").
if len(sys.argv[1:]) < 1:
    print("Needs 1 argument, the index directory", file=sys.stderr)
    sys.exit(1)

indexDirPth = sys.argv[1]
# qryTxtFlPth = sys.argv[2]
# outTxtFlPth = sys.argv[3]
# if not os.path.exists(outTxtFlPth):
#     with open(outTxtFlPth, 'w+'): pass
# else:
#     open(outTxtFlPth, 'w').close()

# Absolute path of the directory that contains the running script.
absltPthCurrPrgrm = os.path.abspath(os.path.dirname(sys.argv[0]))

###########################################################################

stopwords = dict()
inverted_index_file, mapping, doc_offset = list(), list(), list()

# The handle is intentionally left open and kept in the list.
# NOTE(review): presumably it is read by code further down the file -- confirm
# that it gets closed there.
inverted_index_file.append(
    open(os.path.join(indexDirPth, 'title/final.txt'), 'r'))
def __init__(self):
    # type: () -> None
    """Set up the instance by creating its stemmer.

    A PyStemmer object is constructed with the algorithm name 'porter' and
    stored on the instance as ``self.stemmer``.
    """
    porter_stemmer = PyStemmer('porter')
    self.stemmer = porter_stemmer