def stem_words(iterable, language='english'):
    """Stem every word in iterable.

    Uses PyStemmer, which is based on the Porter stemming algorithm - an
    algorithm for suffix stripping.
    https://tartarus.org/martin/PorterStemmer/def.txt

    :rtype: list.
    """
    try:
        stemmer = Stemmer(language)
    except KeyError:
        stemmer = Stemmer('english')
    return stemmer.stemWords(iterable)
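# A minimal usage sketch for stem_words above (assumes
# `from Stemmer import Stemmer` at module level; PyStemmer raises KeyError
# for algorithm names it does not know, which is exactly what the english
# fallback catches):
print(stem_words(['running', 'easily', 'connections']))
# -> ['run', 'easili', 'connect']
print(stem_words(['running'], language='no-such-language'))  # falls back to english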
def stemmer(tokens):
    # ps = PorterStemmer()
    # tokens = [ps.stem(w) for w in tokens]
    ps = Stemmer('porter')
    tokens = [ps.stemWord(w) for w in tokens]
    return tokens
def textHandler(text):
    # Tokenizing: keep ASCII alphanumerics only
    text = text.encode('ascii', errors='ignore').decode()
    text = re.sub(r'[^A-Za-z0-9]+', r' ', text)

    # Stop-word removal
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filter_sentence = [w for w in word_tokens if w not in stop_words]

    # Stemming
    stemmer = Stemmer('porter')
    stem_text = []
    for word in filter_sentence:
        stem_text.append(stemmer.stemWord(word))
    return stem_text
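# A minimal usage sketch for textHandler above (assumes `import re`,
# `from nltk.corpus import stopwords`, `from nltk.tokenize import word_tokenize`,
# `from Stemmer import Stemmer`, and that the NLTK 'stopwords' and 'punkt'
# corpora are downloaded; note the function does not lowercase its input):
print(textHandler('the runners were running quickly'))
# stop words ('the', 'were') are dropped and the rest are Porter-stemmed,
# e.g. 'running' -> 'run', 'quickly' -> 'quickli'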
def __init__(self):
    self.lexicon = {}                  # lexicon for assisting in search
    self.titles = {}                   # document titles
    self.stop_words = {}
    self.stemmer = Stemmer("english")  # for stemming of words
    # Total count of all pages found in our document collection
    # (please update this count according to your dataset).
    self.totalDocs = 127467
    self.load()
def stem(datalist):
    # Stemming
    stemmer = Stemmer("english")
    tmp = []
    for x in datalist:
        y = stemmer.stemWord(x)
        tmp.append(y)
    return tmp
def text_cleaner(text):
    text = text.lower()                                      # convert to lowercase
    text = re.sub(r'https?://[\S]+', ' url ', text)          # replace web links
    text = re.sub(r'[\w\./]+\.[a-z]+', ' url ', text)
    text = re.sub(r'\d+[-/\.]\d+[-/\.]\d+', ' date ', text)  # replace dates and times
    text = re.sub(r'\d+ ?гг?', ' date ', text)
    text = re.sub(r'\d+:\d+(:\d+)?', ' time ', text)
    # text = re.sub(r'@\w+', ' tname ', text)                # replace Twitter names
    # text = re.sub(r'#\w+', ' htag ', text)                 # replace hashtags
    text = re.sub(r'<[^>]*>', ' ', text)                     # remove HTML tags
    text = re.sub(r'[\W]+', ' ', text)                       # remove leftover symbols
    stemmer = Stemmer('russian')
    text = ' '.join(stemmer.stemWords(text.split()))
    stw = ['в', 'по', 'на', 'из', 'и', 'или', 'не', 'но', 'за', 'над', 'под', 'то',
           'a', 'at', 'on', 'of', 'and', 'or', 'in', 'for']
    remove = r'\b(' + '|'.join(stw) + r')\b'
    text = re.sub(remove, ' ', text)
    text = re.sub(r'\b\w\b', ' ', text)                      # remove free-standing single letters
    text = re.sub(r'\b\d+\b', ' digit ', text)               # replace numbers
    return text
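# A minimal usage sketch for text_cleaner above (hypothetical input; assumes
# `import re` and `from Stemmer import Stemmer` at module level):
sample = 'Встреча 12.05.2021 в 10:30, подробности: https://example.com <b>тут</b>'
print(text_cleaner(sample))
# dates, times and links collapse to the placeholder tokens 'date', 'time'
# and 'url'; the remaining words are stemmed with the Russian Snowball rules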
def text_cleaner(text: str):
    text = text.lower()
    stemmer = Stemmer("russian")  # the language the input data is expected to be in
    text = " ".join(stemmer.stemWords(text.split()))
    text = re.sub(r"\b\d+\b", "digit", text)  # replace standalone numbers with the token "digit"
    return text
def tokenise(value, identifier, category, content_stop):
    token_list = []
    final_list = []
    value = re.sub(exclude1, " ", value)
    value = re.sub(exclude2, " ", value)
    value = re.sub(r'[^a-zA-Z]', " ", value)
    value = value.lower()
    if category == 'e':
        value = re.sub(r'(http|www|com)', " ", value)
    if category == 'c':
        value = re.sub(r'category', " ", value)
    token_list = value.split()
    for w in token_list:
        if w not in content_stop.keys():
            final_list.append(w)
    # stemmer = PorterStemmer()
    stemmer = Stemmer("english")
    final_list = [stemmer.stemWord(key) for key in final_list]
    # final_list = [stemmer.stem(plural, 0, len(plural) - 1) for plural in final_list]
    if final_list:
        # call next function here
        return final_list
    # after work of token_list is done
    token_list = []
    final_list = []
def _prepare_text(self, text):
    """Extracts and stems the words from some given text."""
    words = re.findall(r"[a-z0-9']+", text.lower())
    words = [word for word in words if word not in STOP_WORDS]
    stemmer = Stemmer('english')
    stemmed_words = stemmer.stemWords(words)
    return stemmed_words
def get_stemmer(language=None):
    """Return a stemmer for the given language. The default language is English."""
    return Stemmer(language or 'english')
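# A minimal usage sketch (assumes `from Stemmer import Stemmer` at module
# level; PyStemmer accepts Snowball algorithm names such as 'english',
# 'german' or 'russian'):
en = get_stemmer()            # defaults to the English stemmer
de = get_stemmer('german')
print(en.stemWord('stemming'))               # -> 'stem'
print(de.stemWords(['wörter', 'wörtern']))   # both reduce to the same German stem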
def main(inputFile): print("Start making emoticon map") map_emoticon = generateEmotMap('emoticon_id.txt') print("Finished") print("Start making senti map") map_senti = generateSentiMap(['boosterwords_id.txt', 'idioms_id.txt', 'negatingword.txt', 'sentiwords_id.txt']) print("Finished") print("Start making abbreviation dictionary for bahasa") corrector = Corrector('singkatan.dic') print("Finished") print("Start making stopword dictionary for bahasa") cutter = Cutter('stopword.txt') print("Finished") print("Start making stemmer for bahasa") stemmer = Stemmer() print("Finished") output_file = sys.argv[2] + '.txt' file_read = open(str(inputFile), "r", encoding='utf-8') file_write = open(output_file, "w", encoding='utf-8') start = timeit.default_timer() review_number = 0; for line in file_read.readlines(): review_number += 1 user_rating = line.split('<>')[0] sentence_number = 0 header_string = 'REVIEW-' + str(review_number) + ' ' + str(user_rating) body_string = '' # file_write.write('REVIEW-' + str(review_number) + ' [rating] ' + str(user_rating) + '\n') review = line.split('<>')[1] review = erase_question_sentence(review) # Erase question sentence for i, sentence in enumerate(re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', review)): print("Processing sentence " + str(i+1) + ' from review ' + str(review_number)) sentence = sentence.lower() sentence = corrector.correct(sentence, map_emoticon, map_senti).strip() sentence = cutter.cut(sentence, map_emoticon, map_senti).strip() sentence = stemmer.stem(sentence, map_emoticon, map_senti).strip() if (sentence != ''): # file_write.write(sentence + "\n") body_string = body_string + sentence + '\n' sentence_number += 1 header_string = header_string + ' ' + str(sentence_number) + '\n' output_string = header_string + body_string file_write.write(output_string) stop = timeit.default_timer() print("Running time: " + str(stop - start)) print("Finished.\nOutput file: " + output_file) file_read.close() file_write.close() duration = 1000 # millisecond freq = 440 # Hz winsound.Beep(freq, duration)
def train(self):
    '''Trains a bow vectorizer'''
    processors = dict()
    data = get_speech_text(folder=self.folder)
    text = chain.from_iterable(data.values())

    # stemming
    if 'stemming' in self.steps:
        print "Stemming"
        processors['stemmer'] = Stemmer('german')
        text = map(lambda y: ' '.join(
            processors['stemmer'].stemWords(y.split(' '))), text)

    # the count vectorizer of scikit-learn
    if 'hashing' in self.steps:
        print 'Hashing Bag-of-Words vectorizer'
        count_vect = HashingVectorizer(ngram_range=(1, 5),
                                       decode_error='ignore').fit(text)
    elif 'trigrams' in self.steps:
        print "Trigram Bag-of-Words"
        count_vect = CountVectorizer(ngram_range=(1, 3), min_df=2).fit(text)
    elif 'bigrams' in self.steps:
        print "Bigram Bag-of-Words"
        count_vect = CountVectorizer(ngram_range=(1, 2), min_df=2).fit(text)
    else:
        print "Unigram Bag-of-Words"
        count_vect = CountVectorizer(min_df=2).fit(text)
    processors['count_vectorizer'] = count_vect
    text = count_vect.transform(chain.from_iterable(data.values()))

    if 'tfidf' in self.steps:
        print "Tf-idf normalization"
        processors['tf_transformer'] = TfidfTransformer(use_idf=True).fit(text)
        text = processors['tf_transformer'].transform(text)

    # dump this vectorizer to pickle
    fn = self.folder + '/vectorizer_%s.pickle' % '_'.join(sorted(self.steps))
    cPickle.dump(processors, open(fn, 'wb'), -1)
    self.processors = processors

    fn = self.folder + '/bag_of_words_%s.pickle' % '_'.join(sorted(self.steps))
    for party in data.keys():
        data[party] = self.transform(data[party])
    cPickle.dump(data, open(fn, 'wb'), -1)
def processQueries(queries):
    queryList = []
    stemmer = Stemmer('english')  # build once instead of once per query
    for query in queries:
        filteredQuery = tokenize.filterToken(query, tokenize.getStopWords())
        if filteredQuery:
            queryStem = stemmer.stemWord(filteredQuery.lower())
            queryList.append(queryStem)
    return queryList
def process_text(s):
    s = re.sub('<[^>]+>', '', s)  # strip HTML tags
    s = re.sub('&.*?;', '', s)    # strip HTML entities
    words = simple_preprocess(s, deacc=True, max_len=99)
    words = [word for word in words if word not in stoplist]
    stemmer = Stemmer('english')
    words = stemmer.stemWords(words)
    return words
def get_stemmer(lang, allow_dummy=True):
    global _stemmers
    if lang:
        lang = locale_to_lang(lang)
    if lang not in known_languages:
        if not allow_dummy:
            return None
        _stemmers[lang] = DummyStemmer()
    elif lang not in _stemmers:
        _stemmers[lang] = Stemmer(lang)
    return _stemmers[lang]
def stem_snowball(doc, language):
    """Stem words in doc using the Snowball stemmer.

    Set the parameter ``language`` to a language code such as "de", "en",
    "nl", or the special string "porter" to get Porter's classic stemming
    algorithm for English.

    See also
    --------
    morphy: smarter approach to stemming (lemmatization), but only for English.
    """
    from Stemmer import Stemmer
    return Stemmer(language).stemWords(_tokenize_if_needed(doc))
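# A minimal usage sketch (assumes `_tokenize_if_needed` passes token lists
# through unchanged, so the hypothetical input below is already tokenized):
print(stem_snowball(['walking', 'walked', 'walks'], 'en'))
# -> ['walk', 'walk', 'walk']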
def parse_html(html):
    words = dehtml(html)
    s = Stemmer("danish")
    result = []
    for w in words.split():
        word = w.lower()
        if word in stop_words or len(word) < 2 or word.count('\\'):
            continue
        result.append(s.stemWord(word))
    return result
def __init__(self, language, min_sent_len):
    if language not in AVAILABLE_LANGUAGES:
        err = (f"Language '{language}' is not available, "
               f"choose from [{', '.join(AVAILABLE_LANGUAGES)}]")
        raise ValueError(err)
    self.min_sent_len = min_sent_len
    self.stem = Stemmer(language).stemWord
    stopwords = importlib.import_module(f'tldry.stopwords.{language}')
    self.stopwords = frozenset(self.stem(w) for w in stopwords.stopwords)
    self.raw_sentences = []
def getTerm(term):
    term_ids = {}
    # Stem the query term once, not once per line of the file.
    stemmer = Stemmer('english')
    # stemmer.maxCacheSize = 1
    termStem = stemmer.stemWord(term.lower())
    # Use a context manager so the file is closed even on the early return.
    with open(TERMIDSFILE, 'r') as term_ids_file:
        for line in term_ids_file:
            pieces = line.strip().split('\t')
            if termStem == pieces[1]:
                term_ids[pieces[1]] = int(pieces[0])
                return term_ids
    return term_ids
def stem_snowball(doc, language):
    """Stem words in doc using the Snowball stemmer.

    Set the parameter ``language`` to a language code such as "de", "en",
    "nl", or the special string "porter" to get Porter's classic stemming
    algorithm for English.

    See also
    --------
    morphy: smarter approach to stemming (lemmatization), but only for English.
    """
    from Stemmer import Stemmer

    # Build the Stemmer before fetching to force an exception for invalid
    # languages.
    stem = Stemmer(language).stemWords
    return pipe(doc, fetch, _tokenize_if_needed, stem)
def simple_tokenizer(text: str, stemmer: Stemmer = None) -> List[List[Tuple]]:
    if stemmer is None:
        stemmer = Stemmer('english')
    text = unicodedata.normalize(UNICODE_NORMALIZATION, text)
    text = normalize_quotation_marks(text)
    paragraphs = split_into_paragraphs(text)
    stopwords = get_stopwords()
    common_abbr = get_common_abbr()
    sentences = split_into_sentences(paragraphs, common_abbr)
    tagged_sentences = tag_sentences(sentences, stopwords, common_abbr)
    tagged_and_stemmed = apply_snowball_stemmer(tagged_sentences, stemmer)
    return tagged_and_stemmed
def __init__(self, rules: List[ExtendedRule], tokenizer_fn=tokenize):
    self.rules = rules
    self.intents = []
    self.resolvers = [StemmerResolver(Stemmer('russian'))]
    self.j2_env = NativeEnvironment()
    self.tokenizer_fn = tokenizer_fn
    for r in rules:
        if r.production.startswith('intent'):
            self.intents.append(r.production)
    self.intents = tuple(set(self.intents))
    self._parsing_tables = {}
def __init__(self, inputPath, outputPath):
    self.inputPath = inputPath
    self.outputPath = outputPath

    logging.setLoggerClass(ColoredLogger)
    self.logger = logging.getLogger('ArnetMinerDataImporter')

    # Get the stop words set & stemmer for text analysis
    self.stopWords = None
    with open(os.path.join(os.getcwd(), 'src', 'importer', 'stopWords.json')) as stopWordsFile:
        self.stopWords = set(json.load(stopWordsFile))
    self.stemmer = Stemmer('english')

    super(ArnetMinerDataImporter, self).__init__()
def summarize(text_file: str) -> Dict:
    en_stemmer = Stemmer('english')
    with open(text_file, 'r', encoding='utf-8') as file:
        sentences = simple_tokenizer(file.read(), en_stemmer)
    scored_terms = score_terms(sentences)
    scored_sentences = score_sentences(scored_terms, sentences)
    top_keywords = get_top_keywords(scored_terms)  # TODO: those are underlying reprs!
    reduced_text = reduce_sentences(scored_sentences)
    return {
        'top keywords': top_keywords,
        'original text': sentences,
        'reduced text': reduced_text,
        'reduced by': 1 - (len(reduced_text) / float(len(sentences))),
    }
def __init__(self, inputFolderPath, outputPath):
    self.inputFolderPath = inputFolderPath
    self.outputPath = outputPath

    logging.setLoggerClass(ColoredLogger)
    self.logger = logging.getLogger('SerializedDBLPDataImporter')

    # Get the stop words set & stemmer for text analysis
    self.stopWords = None
    with open(os.path.join(os.getcwd(), 'src', 'importer', 'stopWords.json')) as stopWordsFile:
        self.stopWords = set(json.load(stopWordsFile))
    self.stemmer = Stemmer('english')

    # Regex for stripping non-visible characters (Python 2)
    controlChars = ''.join(map(unichr, range(0, 32) + range(127, 160)))
    self.controlCharactersRegex = re.compile('[%s]' % re.escape(controlChars))

    super(SerializedDBLPDataImporter, self).__init__()
def stemming(tokens): """ Input = Tokens after tokenisation and removing stop words Function = Use stemmer to identify the root word """ newlist = [] for s in tokens: if s in stemmed_dict: str = stemmed_dict[s] else: str = Stemmer('english').stemWord(s) stemmed_dict[s] = str #if str not in newlist: newlist.append(stemmed_dict[s]) return newlist
def __init__(self, words_file=default_words, verbs_file=default_verbs, joined_verb_parts=True):
    self.verbs = {}
    self.stemmer = Stemmer()
    tokenizer = WordTokenizer(words_file=words_file, verbs_file=verbs_file)
    self.words = tokenizer.words

    if verbs_file:
        self.verbs['است'] = '#است'
        for verb in tokenizer.verbs:
            for tense in self.conjugations(verb):
                self.verbs[tense] = verb
        if joined_verb_parts:
            for verb in tokenizer.verbs:
                bon = verb.split('#')[0]
                for after_verb in tokenizer.after_verbs:
                    self.verbs[bon + 'ه_' + after_verb] = verb
                    self.verbs['ن' + bon + 'ه_' + after_verb] = verb
                for before_verb in tokenizer.before_verbs:
                    self.verbs[before_verb + '_' + bon] = verb
def __init__(self):
    self.NaiveBayesClassifier = NaiveBayesClassifier()

    # Sentence Splitters
    self.RuleBasedSentenceSplitter = RuleBasedSentenceSplitter()
    self.MLBasedSentenceSplitter = MLBasedSentenceSplitter()

    # Tokenizers
    self.RuleBasedTokenizer = RuleBasedTokenizer()
    self.MLBasedTokenizer = MLBasedTokenizer()

    # Normalizer
    self.Normalizer = Normalizer()

    # Stemmer
    self.Stemmer = Stemmer()

    # Stopword Eliminators
    self.StaticStopWordEliminator = StaticStopwordRemover()
    self.DynamicStopWordEliminator = DynamicStopWordEliminator()
def index(text, accepted_languages=None, langs=None):
    registry = get_current_registry()
    if accepted_languages is None:
        accepted_languages = [
            x.strip()
            for x in registry.settings["accepted_languages"].split(",")
        ]
    if langs is None:
        lang = guessLanguage(text)
        if lang not in accepted_languages:
            langs = accepted_languages
        else:
            langs = [lang]
    langs = list(set(langs).intersection(set(accepted_languages)))
    if not langs:
        langs = accepted_languages
    indexed_words = set()
    for lang in langs:
        stemmer = Stemmer(lang)
        indexed_words.update(stemmer.stemWord(x.value) for x in tokenize(text))
    return indexed_words
def nonField_query(path, text, secondary_index_list):
    text = text.lower()
    text = text.encode('ascii', errors='ignore').decode()
    text = re.sub(r'[^A-Za-z0-9]+', r' ', text)

    # Stop-word removal
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filter_sentence = [w for w in word_tokens if w not in stop_words]

    # Stemming
    stemmer = Stemmer('porter')
    stem_text = []
    for word in filter_sentence:
        stem_text.append(stemmer.stemWord(word))

    result_list = []
    for word in stem_text:
        result_list.append(Posting(secondary_index_list, word, path))
    return result_list