def token(X, words_only=False, word_normalize=True, emoji_normalize=True,
          remove_digits=True, lower_case=True, stop_words=None):
    '''
    Requires PyStemmer if word_normalize=True:
    use pip[env] install PyStemmer
    '''
    # eyes [nose] mouth | mouth [nose] eyes pattern
    emoticons = r"(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)"
    emoticon_re = re.compile(emoticons, re.VERBOSE | re.I | re.UNICODE)

    # Keep words only: digits are kept, emoticons are dropped
    if words_only:
        clean_text = re.sub(r'[\W]+', ' ', X)
    else:
        clean_text = '{}{}'.format(re.sub(r'[\W]+', ' ', X),
                                   ''.join(re.findall(emoticon_re, X)))

    # normalize emoticons (';' -> ':' and drop noses '-')?
    if emoji_normalize:
        clean_text = (re.sub(r'[\W]+', ' ', X) + ' '.join(
            re.findall(emoticon_re, X)).replace(';', ':').replace('-', ''))

    if remove_digits:
        clean_text = clean_text.translate(str.maketrans('', '', '0123456789'))

    if lower_case:
        clean_text = clean_text.lower()

    if word_normalize:
        try:
            import Stemmer
            stemmer = Stemmer.Stemmer('danish')
            clean_text = ' '.join(stemmer.stemWords(clean_text.split()))
        except ModuleNotFoundError:
            print('Stemmer was not found. Try "pip install PyStemmer"')
            print('Words were not normalized')
            # Continue without stemming

    if stop_words:
        return [word for word in clean_text.split() if word not in stop_words]
    else:
        return clean_text.split()
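# Minimal usage sketch for token() (added for illustration, not from the
# original source). It assumes `import re` at module level and PyStemmer
# installed; the Danish sample sentence is made up.
sample = 'Hundene legede i parken 2 gange :-)'
print(token(sample))
# -> lower-cased, digit-free, Danish-stemmed tokens plus the normalized emoticon ':)'
print(token(sample, word_normalize=False, emoji_normalize=False, remove_digits=False))
# -> raw lower-cased words, digits kept, emoticon appended unchanged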
def filter_stemmer(words=set(), lang="pt"):
    '''
    Calls the library that strips a word of all its unnecessary parts.

    @param words: a set of words to apply the stemmer to.
    @return: a list with the words after stemming.
    '''
    stemmer = Stemmer.Stemmer(lang)  # @UndefinedVariable
    text = []
    for word in words:
        stm = stemmer.stemWord(word)
        if len(stm) > 0:
            text.append(stm.lower())
    return text
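# Minimal usage sketch for filter_stemmer() (illustrative, not part of the
# original source); assumes `import Stemmer` (PyStemmer) at module level.
# The Portuguese words are made up and the exact stems depend on the
# Snowball rules for the chosen language.
print(filter_stemmer({"casas", "correndo", "gatos"}, lang="pt"))
# -> lower-cased Portuguese stems, e.g. something like ['cas', 'corr', 'gat']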
def __init__(self, index_path=""):
    self.path = index_path
    self.doccount = 0
    self.token_count = 0
    self.init_counts()
    self.ranker = bm25.BM25(index_path=self.path,
                            doccount=self.doccount,
                            tokcount=self.token_count)
    self.stopwords = set()
    self.init_stopwords()
    self.stemmer = Stemmer.Stemmer('english')
def __init__(self, index_path):
    super().__init__()
    self.index_path = index_path
    self.categories = sorted(
        ["references", "body", "infobox", "title", "category", "links"])
    self.query_categories = {
        "c:": "category",
        "b:": "body",
        "t:": "title",
        "i:": "infobox",
        "r:": "references",
        "e:": "links"
    }
    self.tokens_dict = self.get_tokens()
    self.stemmer = Stemmer.Stemmer("english")
def splitstringStemKazStop(str, stoplist):
    words = []
    str = str.lower()
    str = str.replace("ё", "е")
    stemmer = Stemmer.Stemmer('russian')
    # for i in re.split('[;,.,\n,\s,:,-,+,(,),=,/,«,»,\d,!,?,"]', str):
    # re.split("(?:(?:[^а-яА-Я]+')|(?:'[^a-zA-Z]+))|(?:[^a-zA-Z']+)"
    for i in re.split("(?:[^а-я0-9әғқңөұүһі]+)", str):
        if len(i) > 1 and len(i) <= 17:
            if i not in stoplist:
                stemmed = stemmer.stemWord(i)
                if len(stemmed) > 1:
                    words.append(stemmed)
                # words.append(i)  # without stemming
    return words
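# Minimal usage sketch for splitstringStemKazStop() (illustrative, not from the
# original source); assumes `import re` and `import Stemmer` at module level.
# The sample text and stop list are made up; note the function applies the
# Russian Snowball stemmer to the Kazakh/Russian tokens it keeps.
print(splitstringStemKazStop("Жаңа кітап оқыдым және оны ұнаттым", ["және"]))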
def __init__(self, xml_file_path, index_directory, stop_words_file):
    self.parser = etree.iterparse(xml_file_path, events=("start", "end"))

    # Reading stop words list
    with open(stop_words_file, "r") as fp:
        self.stop_words = fp.readlines()
    # print(self.stop_words)

    self.stop_words_dict = {}
    # self.stop_words = [word.strip("'") for word in self.stop_words]
    for word in self.stop_words:
        self.stop_words_dict[word.split("\n")[0]] = 1

    self.stemmer = Stemmer.Stemmer('english')
    self.postings_dictionary = dict()
    self.index_directory = index_directory
def __init__(self):
    self.seek_list = None
    self.comment_file = None
    self.index_file = None
    self.symbol_to_encoding_dict = None
    self.cids = None
    self.comment_offsets_cid = None
    self.comment_offsets = None
    self.comment_term_counts = None
    self.comment_csv_reader = None
    self.authors_list = None
    self.articles_list = None
    self.reply_to_index = None
    self.collection_term_count = 0
    self.stemmer = Stemmer.Stemmer('english')
    self.tokenizer = nltk.tokenize.ToktokTokenizer()
    self.report = Report()
def clean_portuguese_text(text):
    text = clean_text(text)

    # Portuguese stop words plus domain-specific terms to ignore
    stop_words = get_stop_words('portuguese')
    stop_words.extend([
        "rua", "estrada", "citada", "citado", "endereço", "endereco",
        "caminho", "período", "periodo", "próximo", "proximo", "próxima",
        "proxima", "mencionado", "mencionada", "altura", "complementa",
        "denuncia", "denúncia", "diariamente", "avenida", "município",
        "municipio"
    ])

    words = text.split(' ')
    content = ""
    stemmer = Stemmer.Stemmer('portuguese')
    for word in words:
        word = word.lower()
        if word not in stop_words and word.strip():
            content = content + stemmer.stemWord(word).upper() + " "

    # Removing leading/trailing spaces (strip returns a new string)
    content = content.strip()
    return content
def __init__(self, language):
    """Initialize a StemNormalizer.

    Args:
        language: a PyStemmer language. These can be seen by listing
            Stemmer.algorithms(), but current options are: danish, dutch,
            english, finnish, french, german, hungarian, italian, norwegian,
            portuguese, romanian, russian, spanish, swedish, turkish.
            You can also specify "porter" to get the classic Porter stemmer
            for English.
    """
    super(StemNormalizer, self).__init__()
    self.stemmer = Stemmer.Stemmer(language.lower())
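# Quick illustration (not part of the original source) of the PyStemmer calls
# the docstring above refers to: Stemmer.algorithms() lists the available
# languages, and Stemmer.Stemmer(lang) builds the stemmer that StemNormalizer wraps.
import Stemmer
print(Stemmer.algorithms())              # e.g. ['danish', 'dutch', 'english', ...]
print(Stemmer.Stemmer('english').stemWords(['running', 'cats']))  # ['run', 'cat']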
def __init__(self,
             enable_case_folding=True,
             enable_remove_stop_words=True,
             enable_stemmer=False,
             enable_lemmatizer=True,
             min_length=2):
    self.steps = []
    self.SPLIT_WORDS_PATTERN = re.compile(
        r'\s|\/|\\|\.|\:|\?|\(|\)|\[|\]|\{|\}|\<|\>|\'|\!|\"|\-|,|;|\$|\*|\%|#'
    )
    self.steps.append(self.__split_words)
    if enable_case_folding:
        self.steps.append(self.__case_folding)
    if enable_remove_stop_words:
        self.steps.append(self.__remove_stop_words)
        self.stop_words = {
            'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also',
            'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be',
            'because', 'been', 'but', 'by', 'can', 'cannot', 'could', 'dear',
            'did', 'do', 'does', 'either', 'else', 'ever', 'every', 'for',
            'from', 'get', 'got', 'had', 'has', 'have', 'he', 'her', 'hers',
            'him', 'his', 'how', 'however', 'i', 'if', 'in', 'into', 'is',
            'it', 'its', 'just', 'least', 'let', 'like', 'likely', 'may',
            'me', 'might', 'most', 'must', 'my', 'neither', 'no', 'nor',
            'not', 'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our',
            'own', 'rather', 'said', 'say', 'says', 'she', 'should', 'since',
            'so', 'some', 'than', 'that', 'the', 'their', 'them', 'then',
            'there', 'these', 'they', 'this', 'tis', 'to', 'too', 'twas',
            'us', 'wants', 'was', 'we', 'were', 'what', 'when', 'where',
            'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'would',
            'yet', 'you', 'your'
        }
    if enable_stemmer:
        self.steps.append(self.__stem)
        self.stemmer = Stemmer.Stemmer('english')
    if enable_lemmatizer:
        self.steps.append(self.__lemmatiza)
        self.lemmatizer = WordNetLemmatizer()
    if min_length:
        self.steps.append(
            lambda words: self.__remove_short_words(words, min_length))
def __init__(
    self,
    stem_threshold=STEM_THRESHOLD,
    max_token_length=MAX_TOKEN_LENGTH,
    min_split_length=MIN_SPLIT_LENGTH,
    single_shot=False,
    save_token_style=False,
    attach_upper=True,
    use_nn=False,
    nn_model=None,
):
    """
    Initialize a new TokenSplitter.

    :param stem_threshold: We do not stem split parts shorter than or equal to this size.
    :param max_token_length: We cut identifiers longer than this value.
    :param min_split_length: We do not split source code identifiers shorter than this value. \
        If you do not want to filter small tokens, set min_split_length=1.
    :param single_shot: True if we do not want to join small identifiers to the next one. \
        Example: 'sourced.ml.algorithms' → ["sourc", "sourcedml", "algorithm", "mlalgorithm"]. \
        If True we have only ["sourc", "algorithm"].
    :param save_token_style: value indicating whether to yield metadata that can be used to \
        reconstruct the initial identifier.
    :param attach_upper: True to attach the last of several uppercase letters in a row to \
        the next token. Example: 'HTMLResponse' -> ["html", "response"] if True, \
        'HTMLResponse' -> ["htmlr", "esponse"] if False.
    :param use_nn: value indicating whether to use the Neural Network-based splitter instead \
        of the heuristics.
    :param nn_model: IdentifierSplitterBiLSTM model UUID to load. None means the most recent.
    """
    self._stemmer = Stemmer.Stemmer("english")
    self._stemmer.maxCacheSize = 0
    self._stem_threshold = stem_threshold
    self._max_token_length = max_token_length
    self._min_split_length = min_split_length
    self._single_shot = single_shot
    self._save_token_style = save_token_style
    self._attach_upper = attach_upper
    self._id_splitter_nn = None
    if use_nn:
        self._init_nn(nn_model)
    if self._save_token_style and not self._single_shot:
        raise ValueError(
            "Only one of `single_shot`/`save_token_style` should be True"
        )
def __init__(self, stem_threshold=STEM_THRESHOLD, max_token_length=MAX_TOKEN_LENGTH,
             min_split_length=MIN_SPLIT_LENGTH, single_shot=DEFAULT_SINGLE_SHOT,
             save_token_style=SAVE_TOKEN_STYLE, attach_upper=ATTACH_UPPER):
    self._stemmer = Stemmer.Stemmer("english")
    self._stemmer.maxCacheSize = 0
    self._stem_threshold = stem_threshold
    self._max_token_length = max_token_length
    self._min_split_length = min_split_length
    self._single_shot = single_shot
    self._save_token_style = save_token_style
    self._attach_upper = attach_upper
    if self._save_token_style and not self._single_shot:
        raise ValueError(
            "Only one of `single_shot`/`save_token_style` should be True")
def transform(self, X):
    self.X = X

    # eyes [nose] mouth | mouth [nose] eyes pattern
    emoticons = r"(?:[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)"
    emoticon_re = re.compile(emoticons, re.VERBOSE | re.I | re.UNICODE)

    # Keep words only: digits are kept, emoticons are dropped
    if self.words_only:
        clean_text = self.X.apply(lambda x: (re.sub(r'[\W]+', ' ', x)))
    else:
        clean_text = self.X.apply(lambda x: ('{}{}'.format(
            re.sub(r'[\W]+', ' ', x), ''.join(re.findall(emoticon_re, x)))))

    # normalize emoticons (';' -> ':' and drop noses '-')?
    if self.emoji_normalize:
        clean_text = self.X.apply(lambda x: (re.sub(
            r'[\W]+', ' ', x) + ' '.join(re.findall(
                emoticon_re, x)).replace(';', ':').replace('-', '')))

    if self.remove_digits:
        clean_text = clean_text.apply(
            lambda x: x.translate(str.maketrans('', '', '0123456789')))

    if self.lower_case:
        clean_text = clean_text.str.lower()

    if self.word_normalize:
        try:
            import Stemmer
            stemmer = Stemmer.Stemmer('danish')
            clean_text = clean_text.apply(
                lambda x: ' '.join(stemmer.stemWords(x.split())))
        except ModuleNotFoundError:
            print('Stemmer was not found. Try "pip install PyStemmer"')
            print('Words were not normalized')
            # Continue without stemming

    if self.token:
        return clean_text.str.split()
    else:
        return clean_text
class Cleaner(object):
    REJECT_BOTH = {'.', '(', ')', '!', '?', ',', 'num',
                   'и', 'в', 'с', 'о', 'об', 'от', 'я', 'по', 'на', 'ты', 'он'}
    REJECT_POST = {'не'}
    stemmer = Stemmer.Stemmer('russian')

    def __init__(self, stemming=True, locale='ru', ngrams_size=1):
        assert locale == 'ru'
        self._stemming = stemming
        self._ngrams_size = ngrams_size

    def words(self, text):
        cleaned_words = self._clean(text).split(' ')
        if cleaned_words[0] == '':
            cleaned_words = cleaned_words[1:]
        if len(cleaned_words) > 0 and cleaned_words[-1] == '':
            cleaned_words = cleaned_words[:-1]
        words = self._stem(cleaned_words) if self._stemming is True else cleaned_words
        words += self._ngrams(words)
        return words

    def _clean(self, string):
        string = re.sub(r"[ёЁ]", "е", string)
        string = re.sub(r"[^А-Яа-я0-9(),!?.]", " ", string)
        string = re.sub(r"\d+(\.|,)?\d*", " num ", string)
        string = re.sub(r"\.", " . ", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " ( ", string)
        string = re.sub(r"\)", " ) ", string)
        string = re.sub(r"\?", " ? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip().lower()

    def _stem(self, words):
        return self.stemmer.stemWords(words)

    def _ngrams(self, words):
        if self._ngrams_size == 1:
            return []
        ngrams = [' '.join(words[i:i + self._ngrams_size])
                  for i in range(len(words) - self._ngrams_size + 1)
                  if words[i] not in self.REJECT_BOTH
                  and words[i + 1] not in self.REJECT_BOTH
                  and words[i + 1] not in self.REJECT_POST]
        return ngrams
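# Minimal usage sketch for Cleaner (illustrative, not from the original source);
# assumes `import re` and `import Stemmer` at module level. The Russian sample
# sentence is made up.
cleaner = Cleaner(stemming=True, ngrams_size=2)
print(cleaner.words("Я видел 3 собаки, и они бежали быстро!"))
# -> Russian stems for the unigrams plus the bigrams that pass the REJECT_* filters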
def tokenize(self, text, lang, rem_sw, let_stemming):
    sents_text, sents_offset, sents_start_end, sent_based_voc = [], [], [], {}
    text = text.replace(chr(0), ' ')
    text = text.replace('*', ' ')
    text = text.replace('(', ' ')
    text = text.replace(')', ' ')
    text = text.replace('|', ' ')
    text = text.replace('\ufeff', ' ')
    sent_detector = nltk.data.load('tokenizers/punkt/' + lang + '.pickle')
    stemmer = Stemmer.Stemmer(lang)
    word_detector = nltk.TreebankWordTokenizer()
    sent_spans = sent_detector.span_tokenize(text)
    if rem_sw == 0:
        stopwords = []
    elif rem_sw == 1:
        stopwords = copy.deepcopy(self.langstopwords[lang])
    sents_vect = []
    for span in sent_spans:  # For each sentence
        sent_dic = {}
        sents_text.append(text[span[0]:span[1]].lower())
        for word in word_detector.tokenize(sents_text[-1]):  # for each word in the sentence
            if len(word) > 2 and word not in stopwords:
                if let_stemming == 1:
                    word_pp = stemmer.stemWord(word)
                else:
                    word_pp = word
            else:
                continue
            if word_pp in sent_dic:
                sent_dic[word_pp] += 1
            else:
                sent_dic[word_pp] = 1
            if word_pp in sent_based_voc:
                sent_based_voc[word_pp] += 1
            else:
                sent_based_voc[word_pp] = 1
        sents_vect.append(sent_dic)
        sents_offset.append([span[0], span[1] - span[0]])
        sents_start_end.append([span[0], span[1]])
    return sents_text, sents_vect, sents_offset, sents_start_end, sent_based_voc
def tokenize(text, Type):
    global STOP_WORDS
    global current_token
    tokens = re.split(r'[^A-Za-z0-9]+', text)
    length = len(tokens)
    current_token += length
    stemmer = Stemmer.Stemmer('english')
    if Type == 1:
        if length > 0 and tokens[0] == 'redirect':
            return
        if length > 1 and tokens[1] == 'redirect':
            return
    for token in tokens:
        cur_token = stemmer.stemWord(token.lower().casefold())
        if cur_token != "" and cur_token not in STOP_WORDS and token.lower() not in STOP_WORDS:
            if cur_token not in index:
                index[cur_token] = [0, 0, 0, 0, 0, 0]
            index[cur_token][Type] += 1
def test_inaugural(self):
    # preparing data
    usprez = shorttext.data.inaugural()
    docids = sorted(usprez.keys())
    usprez = [' '.join(usprez[docid]) for docid in docids]
    usprezdf = pd.DataFrame({'yrprez': docids, 'speech': usprez})
    usprezdf = usprezdf[['yrprez', 'speech']]

    stemmer = Stemmer.Stemmer('english')

    # preprocessor defined
    pipeline = [
        lambda s: re.sub(r'[^\w\s]', '', s),
        lambda s: re.sub(r'[\d]', '', s),
        lambda s: s.lower(),
        lambda s: ' '.join([
            stemmer.stemWord(token) for token in shorttext.utils.tokenize(s)
        ])
    ]
    txtpreprocessor = shorttext.utils.text_preprocessor(pipeline)

    # corpus making
    docids = list(usprezdf['yrprez'])
    corpus = [
        txtpreprocessor(speech).split(' ') for speech in usprezdf['speech']
    ]

    # making DTM
    dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids, tfidf=True)

    # check results
    self.assertEqual(len(dtm.dictionary), 5252)
    self.assertAlmostEqual(
        dtm.get_token_occurences(stemmer.stemWord('change'))['2009-Obama'],
        0.013937471327928361)
    numdocs, numtokens = dtm.dtm.shape
    self.assertEqual(numdocs, 56)
    self.assertEqual(numtokens, 5252)
    self.assertAlmostEqual(dtm.get_total_termfreq('government'),
                           0.27875478870737563)
def __init__(self, outputdir, statfile, iter):
    self.outputdir = outputdir
    self.statfile = statfile
    self.iter = iter
    self.createOutputDir()
    # porter = PorterStemmer()
    # lancaster = LancasterStemmer()
    self.stemer = Stemmer.Stemmer('english')
    # self.englishStemmer = SnowballStemmer("english")
    self.wCount = {}
    self.storeStem = {}
    self.max = int(int(resource.getrlimit(resource.RLIMIT_NOFILE)[0]) / 2)
    if self.max < 1:
        self.max = 200
    self.stop_words = set(stopwords.words('english'))
    self.wStr = {}
    self.curDocID = 0
    self.docID = 0
    self.count = 0
def processQuery(query):
    # lower-case
    query = query.lower()

    # remove unnecessary characters
    re6 = re.compile(r'[\_]', re.DOTALL)
    query = re6.sub(' ', query)
    query = re.findall(r"\d+|[\w]+", str(query))

    # remove stopwords
    stop_word = set(stopwords.words('english'))
    query = [w for w in query if w not in stop_word]

    # stem
    stemmer = Stemmer.Stemmer('english')
    stemmed_data = []
    for words in query:
        stemmed_data.append(stemmer.stemWord(words))
    query = stemmed_data
    return query
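# Minimal usage sketch for processQuery() (illustrative, not part of the
# original source); assumes `import re`, `import Stemmer`, NLTK with its
# stopwords corpus downloaded, and `from nltk.corpus import stopwords`.
print(processQuery("What are the 10 largest cities_in the world?"))
# -> e.g. ['10', 'largest', 'citi', 'world']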
def improvedTokenizer(docdict):
    stopwords = []
    # ps = PorterStemmer()
    s = Stemmer.Stemmer('porter')

    # read stopword file
    fs = open("snowball_stopwords_EN.txt", 'r')
    for line in fs:
        stopwords.append(line.strip())

    # split each dictionary entry (of key TI) by word
    text = docdict["TI"]

    # creates list of words not in stopwords
    text = list(filter(lambda word: word not in stopwords, text))

    docdict["TI"] = s.stemWords(text)
    return docdict
def stem_data(self):
    """Stems the data, using Porter's algorithm"""
    stemmer = Stemmer.Stemmer('english')  # The stemming object

    def stem_string(string):
        """Input a string, returns a string with the words replaced by their
        stemmed equivalents"""
        stemmed_list = []
        for word in string.split():
            stemmed_word = stemmer.stemWord(word)
            stemmed_list.append(stemmed_word)
        stemmed_string = " ".join(stemmed_list)
        return stemmed_string

    self.body = stem_string(self.body)
    self.subject = stem_string(self.subject)
def getQueries():
    global userQueries, stemQueries
    doc = etree.parse('topics.xml', parser=etree.XMLParser())
    root = doc.getroot()
    for child in root.iter('topic'):
        query = child.find('query').text.strip()
        # same regex used for tokenizing
        query = re.split(r'\W+(\.?\W+)*', query, flags=re.IGNORECASE)
        userQueries[int(child.attrib['number'])] = query

    stopWords = getStopWords()
    stemmer = Stemmer.Stemmer('english')
    for id in sorted(userQueries):
        stemQueries[id] = []  # stemming using query id
        for query in userQueries[id]:
            if query and query is not None and query not in stopWords:
                token = query.lower()
                stem = stemmer.stemWord(token)
                stemQueries[id].append(stem)
def __init__(self):
    self.stemmer = Stemmer.Stemmer('english')

    # -------------------------------- Read and store irregular nouns
    self.irregularNounsPlural = []
    self.irregularNouns = []
    irregularNounsPluralFile = open("WordsList/irregularNounsPlural.txt", "r")
    irregularNounsFile = open("WordsList/irregularNouns.txt", "r")
    for pluralNoun in irregularNounsPluralFile:
        self.irregularNounsPlural.append(pluralNoun.replace('\n', ''))
    for noun in irregularNounsFile:
        self.irregularNouns.append(noun.replace('\n', ''))

    # ------------------------- Read and store irregular verb endings
    self.irregularEndings = []
    irregularEndingsFile = open("WordsList/irregularEndings.txt", "r")
    for suffix in irregularEndingsFile:
        self.irregularEndings.append(suffix.replace('\n', ''))

    # --------------------------------------------- Remove stop words
    self.stopList = []
    stopListFile = open("WordsList/stop.txt", "r")
    for stopWord in stopListFile.readlines():
        stop_word = stopWord.strip()
        self.stopList.append(stop_word)
def compare_stemmers(algorithm, words):
    """
    Make sure pystemmer and purestemmer return the same stems.

    ``algorithm`` is the name of the algorithm to be tested and
    ``words`` is a list of input words.
    """
    py = Stemmer.Stemmer(algorithm)
    pure = purestemmer.Stemmer(algorithm)
    for word in words:
        variants = _get_variants(word)
        for variant in variants:
            py_stem = py.stemWord(variant)
            pure_stem = pure.stemWord(variant)
            # Note: adjacent string literals (not '+') so that '%' formats the
            # whole message; the original concatenation raised a TypeError
            # whenever the assertion failed.
            assert py_stem == pure_stem, (
                'Different output for %r: pystemmer returned %r, '
                'purestemmer returned %r.' % (variant, py_stem, pure_stem))
            assert type(py_stem) == type(pure_stem), (
                'Different output types for %r: pystemmer returned %s, '
                'purestemmer returned %s.' % (variant, type(py_stem), type(pure_stem)))
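# Minimal usage sketch for compare_stemmers() (illustrative, not from the
# original source); it assumes the purestemmer package and the module-level
# helper _get_variants() from the original test suite are importable.
compare_stemmers('english', ['running', 'cats', 'happily'])
compare_stemmers('spanish', ['corriendo', 'gatos'])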
def get_data_from_file(self):
    stemmer = Stemmer.Stemmer('english')
    infile = open(self.filename, "r")
    self.data = json.load(infile)
    infile.close()
    for ids in self.data:
        tweet = self.data[ids][u'tags']
        tweet_text = [words[0] for words in tweet]
        # print tweet_text
        clean_tweet = ' '.join(tweet_text)
        regex_form = r'^rt\s+|@\w+:*|https?://[\w\.\/]*'
        clean_tweet = re.sub(regex_form, '', clean_tweet)
        clean_tweet = [stemmer.stemWord(x) for x in clean_tweet.split()]
        for item in clean_tweet:
            self.unigram_vocab[item] += 1
        for item in list(nltk.bigrams(clean_tweet)):
            self.bigram_vocab[item] += 1
        for item in list(nltk.trigrams(clean_tweet)):
            self.trigram_vocab[item] += 1
        # break
    temp = [k for k, v in self.unigram_vocab.iteritems() if v >= 5]
    self.unigram_vocab = temp
    temp = [k for k, v in self.bigram_vocab.iteritems() if v >= 5]
    self.bigram_vocab = temp
    temp = [k for k, v in self.trigram_vocab.iteritems() if v >= 3]
    self.trigram_vocab = temp
    self.features = self.unigram_vocab + self.bigram_vocab + self.trigram_vocab
    for index, item in enumerate(self.features):
        self.featureIndex[item] = index
    # infile.close()
    print "Finished composing the Lexical Features"
def tf_vector(tweet):
    """
    Transform a string into a term-frequency dictionary.

    Stop words, emoticons and emojis are loaded from the data files under
    DATAPATH (see __main__ in this script).

    :param tweet: text to process
    :return: a dict in the form {term: count}, with all terms preprocessed
    """
    path = os.path.dirname(os.path.abspath(__file__))
    stopwords = open(os.path.join(path, DATAPATH, STOPWORDS)).read().splitlines()
    emoticons = open(os.path.join(path, DATAPATH, EMOTICONS)).read().splitlines()
    emoj = pd.read_csv(os.path.join(path, DATAPATH, EMMOJIS))
    emojis = list(emoj['emoji'])
    token_list = ['URL', 'EMAIL', 'MENTION', 'HASHTAG', 'NUMBER', 'EMOTICON', 'EMOJI']

    x = tweet
    x = re.sub("(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", " URL ", x)
    x = re.sub("^[_A-Za-z0-9-\\\\+]+(\\\\.[_A-Za-z0-9-]+)*@[A-Za-z0-9-]+(\\\\.[A-Za-z0-9]+)*(\\\\.[A-Za-z]{2,})$", " EMAIL ", x)
    x = re.sub("@[A-Za-z0-9]+", " MENTION ", x)
    x = re.sub("#[A-Za-z0-9]+", " HASHTAG ", x)
    x = re.sub("\\d+(\\.\\d*)?°?", " NUMBER ", x)
    for em in emoticons:
        x = x.replace(em, ' EMOTICON ')
    for ej in emojis:
        x = x.replace(ej, ' EMOJI ')
    x = re.sub(u'['u'\U0001F300-\U0001F64F'u'\U0001F680-\U0001F6FF'u'\u2600-\u26FF\u2700-\u27BF]+', ' EMOJI ', x)
    x = re.sub("[\\\"\\$%&@\\.,:;\\(\\)¿\\?`+\\-_\\*=!¡\\\\/#{}\\[\\]]", " ", x)
    x = re.sub("\\s+", " ", x)
    x = x.strip()

    words = x.split(' ')
    words_nonstop = [w for w in words if w not in stopwords]
    words_nonstop_lower = [w.lower() if w not in token_list else w for w in words_nonstop]
    stemmer = Stemmer.Stemmer('spanish')
    words_nonstop_lower_stemmed = stemmer.stemWords(words_nonstop_lower)
    return dict(Counter(words_nonstop_lower_stemmed))
def wildcard_search(self, word):
    if word.find("*") == -1:
        # No wildcard: stem the word and look it up directly
        try:
            stemmer = Stemmer.Stemmer('spanish')
            stemmed_word = stemmer.stemWord(word)
            return list([self._btree[stemmed_word]])
        except KeyError:
            return None
    elif word[-1] == "*":
        # Trailing wildcard: range scan over the forward B-tree
        return list(
            self._btree.values(min=word[:-1] + self._alphabet[0],
                               max=word[:-1] + self._alphabet[26]))
    elif word[0] == "*":
        # Leading wildcard: range scan over the reversed-key B-tree
        print("From %s to %s" % (word[::-1][:-1] + self._alphabet[0],
                                 word[::-1][:-1] + self._alphabet[26]))
        return list(
            self._reverse_btree.values(
                min=word[::-1][:-1] + self._alphabet[0],
                max=word[::-1][:-1] + self._alphabet[26]))
    else:
        return None
def get_translation_and_statistics_parse_stemming_ext_snowball(
        self, snowball_language_shortenning, min_occurences=0):
    lifo_stack = []
    stemmers = dict()
    for language_shortening in snowball_language_shortenning:
        stemmers[language_shortening] = Stemmer.Stemmer(language_shortening)
    self.load_stop_words(snowball_language_shortenning)
    self.prepare_statistic_structures(snowball_language_shortenning)
    if min_occurences < 2:
        self.parse_stemmer_ext_snowball(lifo_stack, self.root_node,
                                        snowball_language_shortenning,
                                        stemmers)
    else:
        print("SUCCESS")
        self.parse_stemmer_ext_min_snowball(lifo_stack, self.root_node,
                                            snowball_language_shortenning,
                                            min_occurences, stemmers)
    self.print_statistics(snowball_language_shortenning)
def convert_src_vector(self, src_vect, lang, rem_sw, let_stemming, fs_knword):
    res_vect, res_voc = [], {}
    if rem_sw == 0:
        stopwords = []
    elif rem_sw == 1:
        stopwords = copy.deepcopy(self.langstopwords['english'])
    stemmer = Stemmer.Stemmer('english')
    for seg_vect in src_vect:
        temp_res_vect = {}
        for word, freq in seg_vect.items():
            # trans_w_lst = self.blc.get_nearest_token(word, lang, fs_knword)
            # if trans_w_lst:
            #     for trans_w, score in trans_w_lst.items():
            #         if trans_w not in stopwords:
            #             if let_stemming == 0:
            #                 temp_res_vect[trans_w] = freq
            #             else:
            #                 trans_w = stemmer.stemWord(trans_w)
            #                 temp_res_vect[trans_w] = freq
            trans_w = self.blc.get_nearest_token(word, lang, fs_knword)
            if trans_w:
                if trans_w not in stopwords:
                    if let_stemming == 0:
                        temp_res_vect[trans_w] = freq
                    else:
                        trans_w = stemmer.stemWord(trans_w)
                        temp_res_vect[trans_w] = freq
            else:
                trans_w = word
                temp_res_vect[trans_w] = freq
        res_vect.append(temp_res_vect)
        for tw in temp_res_vect.keys():
            if tw in res_voc:
                res_voc[tw] += 1
            else:
                res_voc[tw] = 1
    return res_vect, res_voc
def words_filter1(sourceword):
    """Filter using WordNet.

    This variant does not use stop-word filtering; instead each word is looked
    up in the WordNet dictionaries. If it is found it is kept; if not, the word
    is stemmed first and looked up again; if it is still not found it is dropped.
    """
    stemmer = Stemmer.Stemmer('english')
    # print stemmer.stemWords(sourceword)  # using the stemmer directly gets some words wrong
    # return words_filter(stemmer.stemWords(sourceword), StopWordList().words)
    stopwordlist = StopWordList().words
    purewords = []
    for oneword in sourceword:
        for d in Dictionaries:
            part = d.pos
            try:
                getWord(oneword, part)
                purewords.append(oneword)
                # stop as soon as one part of speech matches
                break
            except:
                pass
        else:
            # Fall back to the stemmer to reduce the word to its stem,
            # e.g. to look up 'dogs' as 'dog'
            oneword = stemmer.stemWord(oneword)
            for d in Dictionaries:
                part = d.pos
                try:
                    getWord(oneword, part)
                    purewords.append(oneword)
                    break
                except:
                    pass
            else:
                # Still not found after stemming: give up. (If the word is not
                # in the stop-word list we could still keep it.)
                pass
                # if oneword not in stopwordlist:
                #     purewords.append(oneword)
    return purewords
    # unreachable in the original source:
    # return words_filter(purewords, StopWordList().words)