def stemming_and_stopwords(text):
    stemmer = SnowballStemmer("english")
    stop = stopwords.words("english")
    text = text.apply(lambda x: x.split())
    # text = text.apply(lambda word_list: [w for w in word_list if w not in stop])
    return text.apply(lambda word_list: " ".join(
        [stemmer.stem(w) for w in word_list if w not in stop]))
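A minimal usage sketch (not part of the original example): it assumes stemming_and_stopwords is applied to a pandas Series of raw strings and that the NLTK stopwords corpus has been downloaded.

import pandas as pd

# Hypothetical input Series; the output value is illustrative.
reviews = pd.Series(["the cats are running quickly"])
print(stemming_and_stopwords(reviews).tolist())  # e.g. ['cat run quick']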
class SnowballStemmerNormalizer(Normalizer):
    """
    A Normalizer that uses the NLTK SnowballStemmer to normalize tokens.
    """

    def __init__(self, language='english'):
        self.language = language
        self.stemmer = SnowballStemmer(self.language)

    def normalize(self, token):
        """
        Apply normalization techniques over the token to simplify its structure.

        :param token: The token to normalize.
        :return: A stem of the token, or the token itself if a stem could not be produced.
        """
        return self.stemmer.stem(token)

    def normalize_list(self, tokens):
        """
        Normalize an entire list of tokens.

        :param tokens: A list of tokens to be normalized.
        :return: Yields a normalized token for each input token.
        """
        for token in tokens:
            yield self.stemmer.stem(token)
def stemming_text(self, words):
    stemmer = SnowballStemmer("spanish")
    final_text = []
    for word in words:
        final_text.append(stemmer.stem(word))
    return final_text
def tokenize(text, stemming=True, stoplist=[], remove_digits=False, lang='en'):
    translator = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation))  # map punctuation to space
    text = text.translate(translator)
    text = text.lower()
    text = text.strip()
    table = str.maketrans({key: None for key in string.punctuation})
    text = text.translate(table)
    if stemming:
        if lang == 'en':
            stemmer = Stemmer()
        elif lang == 'it':
            stemmer = SnowballStemmer('italian')
        elif lang == 'de':
            stemmer = SnowballStemmer('german')
        elif lang == 'fa':
            stemmer = paStemmer()
        analyzer = StemmingAnalyzer(stoplist=stoplist, minsize=1,
                                    stemfn=stemmer.stem)
    else:
        analyzer = StandardAnalyzer(stoplist=stoplist, minsize=1)
    tokens = [token.text for token in analyzer(text)]
    if remove_digits:
        tokens = [
            word for word in tokens
            if not contains_digits(word) and 2 <= len(word)
        ]
    return tokens
def target_stemming_spanish(self, words):
    result = ""
    wordset = words.split(" ")
    stemmer = SnowballStemmer('spanish')
    for word in wordset:
        result += stemmer.stem(word) + "_"
    return result
def stem_text(text: str, lang_code: str) -> list:
    if lang_code in languages.languages.keys():
        tokens = word_tokenize(text)
        stemmer = SnowballStemmer(languages.languages[lang_code])
        stems = [stemmer.stem(token) for token in tokens]
        return stems
    return []
def text_process(self, text):
    # Remove punctuation
    no_punc = [
        char.lower() for char in text if char not in string.punctuation
    ]

    # Join the characters again to form the string.
    no_punc = ''.join(no_punc)

    # Remove any stopwords
    try:
        no_stopwords = [
            word for word in no_punc.split()
            if word.lower() not in stopwords.words(self.language)
        ]
    except LookupError:
        nltk.download('stopwords')
        no_stopwords = [
            word for word in no_punc.split()
            if word.lower() not in stopwords.words(self.language)
        ]

    result = no_stopwords

    if self.tagging:
        # Tag each word
        tagged_words = self._tag_text(result)
        # Remove unwanted tags
        extracted_tags = self._extract_tags(tagged_words)
        result = extracted_tags

    if self.stemming:
        # Stem it
        stemmer = SnowballStemmer(self.language)
        result = [stemmer.stem(word) for word in result]

    return result
def sentence_stemming(sentence):
    options = {
        "ar": "arabic", "da": "danish", "nl": "dutch", "en": "english",
        "fi": "finnish", "fr": "french", "de": "german", "hu": "hungarian",
        "it": "italian", "no": "norwegian", "pt": "portuguese",
        "ro": "romanian", "ru": "russian", "es": "spanish", "sw": "swedish"
    }
    c = detect(sentence)
    try:
        stemmer = SnowballStemmer(options[c])
    except KeyError:
        print("Language not supported")
        sys.exit()
    s = "".join(stemmer.stem(i) + " " for i in sentence.split())
    return "".join(s + " " for s in word_tokenize(s)
                   if s not in set(stopwords.words(options[c])))
def bag_of_words_spacy2(dataset):
    import spacy
    nlp = spacy.load('es_core_news_md')
    spanishstemmer = SnowballStemmer("spanish")
    all_stopwords = stopwords.words('spanish')
    all_stopwords.extend(
        ("saludo", "dia", "noche", "noches", "tardes", "buenos", "buenas",
         "atentamente", "dias", "estimado", "estimados", "estimada", "atte",
         "hola", "gracia", "caja", "respuesta", "adjunto", "mucha", "me",
         "cordoba", "buen", "ud"))
    removeList = ["no", "nunca"]
    all_stopwords = [e for e in all_stopwords if e not in removeList]
    corpus = []
    for i, value in dataset.items():
        review = str(html.unescape(dataset[i]))
        review = cleanhtml(review)
        # Strip combining diacritics, except the tilde of "ñ"
        review = re.sub(
            r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
            r"\1", normalize("NFD", review), 0, re.I)
        review = normalize('NFC', review)
        review = re.sub('[^a-zA-Zá-ú0-9]', ' ', review)
        review = review.lower()
        doc = nlp(review)
        # Compare the token text (a string) against the stop-word list before stemming
        stems = [
            spanishstemmer.stem(token.text) for token in doc
            if token.text not in set(all_stopwords)
        ]
        review = ' '.join(stems)
        corpus.append(review)
    return corpus
def __init__(self, lang):
    lang_ipa = {'es': 'spa-Latn', 'en': 'eng-Latn'}
    lang_stemm = {'es': 'spanish', 'en': 'english'}
    self.lang = lang
    self.stemmer = SnowballStemmer(language=lang_stemm[lang])
    self.epi = epitran.Epitran(lang_ipa[lang])
    self.nlp = self.load_sapcy(lang)
def quadratic(cls, language: 'model.Language'):
    """
    This estimator computes the ratio of new words for a given language.

    :param language: language of the text that needs to be estimated
    :rtype: WordHistoryDifficultyEstimator
    :return: WordHistoryDifficultyEstimator with an initialized language and
             word => score map, which can be used for determining scores for
             multiple articles in the same language
    """
    estimator = cls(language)

    freq_list = load_language_from_hermit(language.code)
    word_dict = dict()
    for k, v in freq_list.word_info_dict.items():
        word_dict[k] = v.frequency

    stemmer = SnowballStemmer(language.name.lower())
    score_map = defaultdict(int)
    for k, v in word_dict.items():
        score_map[stemmer.stem(k.lower())] += v

    max_freq = max(score_map.values())
    for k in score_map.keys():
        score_map[k] = (1 - score_map[k] / max_freq) ** 0.5

    estimator.score_map = score_map

    return estimator
def stemLine(text):
    snow = SnowballStemmer('english')
    text = [snow.stem(t) for t in text.split()]
    return ' '.join(text)
def bag_of_words_spacy(dataset):
    import spacy
    spanishstemmer = SnowballStemmer("spanish")
    nlp = spacy.load('es_core_news_md')
    nlp.Defaults.stop_words |= {
        "saludo", "dia", "noche", "noches", "tardes", "buenos", "buenas",
        "atentamente", "dias", "hola", "estimado", "estimados", "estimada",
        "atte"
    }
    nlp.Defaults.stop_words -= {"no", "nunca"}
    corpus = []
    for i, value in dataset.items():
        review = str(html.unescape(dataset[i]))
        review = cleanhtml(review)
        doc = nlp(review)
        words = [t.orth_.lower() for t in doc
                 if not t.is_punct | t.is_stop]  # remove punctuation and stopwords
        # lexical_tokens = [t.lower() for t in words if len(t) > 2 and t.isalpha()]
        # lowercase, drop two-letter words and numbers
        review = ' '.join(words)
        doc = nlp(review)
        lemmas = [tok.lemma_.lower() for tok in doc]
        stems = [spanishstemmer.stem(token) for token in lemmas]
        review = ' '.join(stems)
        corpus.append(review)
    return corpus
def __init__(self, min_occurrence=10, window=15, from_corpus=False):
    self.min_occurrence = min_occurrence
    self.window = window

    # map words to integers (more memory efficient and faster)
    self.word2int_count = count()
    self.word2int = defaultdict(self.word2int_count.__next__)

    # map city names also to ints
    self.city2int_count = count()
    self.city2int = defaultdict(self.city2int_count.__next__)

    self.stemmer = SnowballStemmer('german')
    self.stopwords = set(stopwords.words('german')).union(STOP_CITIES)
    self.stems = defaultdict(lambda: defaultdict(int))

    self.cores = multiprocessing.cpu_count()

    if from_corpus:
        print("loading spacy", file=sys.stderr, flush=True)
        self.nlp = spacy.load('de', parser=False, tagger=True, entity=False)
        print("done...", file=sys.stderr, flush=True)
def normalizeWords(text):
    '''Text preprocessing'''
    stemmer = SnowballStemmer(language='english')
    test = re.compile(r'\W+', re.UNICODE).split(text[0].lower())
    stop_words = [
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
        'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
        'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
        'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
        'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
        'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
        'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
        'with', 'about', 'against', 'between', 'into', 'through', 'during',
        'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
        'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
        'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
        'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
        'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
        's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'html'
    ]
    test = [
        stemmer.stem(word) for word in test
        if word not in stop_words and word.isalpha() and len(word) > 2
    ]
    return (test, text[1], len(test))
def embers_stem(x):
    """
    DESCRIPTION
        Stem the words in x, considering English, Spanish and Portuguese.
    INPUT
        x: a tweet text, or another sentence or paragraph
    OUTPUT
        The tweet text after stemming.
    """
    x = x.lower()
    if isinstance(x, bytes):
        x = x.decode('utf-8', 'ignore')
    try:
        stemmer = SnowballStemmer('spanish')
        x1 = FeatureCountVectorizer.preprocess_unicode_text(x, stemmer.stem)
        if x1 == '':
            x1 = x
        stemmer = SnowballStemmer('english')
        x2 = FeatureCountVectorizer.preprocess_unicode_text(x, stemmer.stem)
        if x2 == '':
            x2 = x
        stemmer = SnowballStemmer('portuguese')
        x3 = FeatureCountVectorizer.preprocess_unicode_text(x, stemmer.stem)
        if x3 == '':
            x3 = x
        # Return the shortest of the three stemmed variants
        return min(x1, x2, x3, key=lambda s: len(s))
    except Exception:
        return x
def delete_mark(request):
    args = {'status': 0}
    if request.POST:
        from_entity = request.POST.get('current_entity_id', '')
        to_entity = request.POST.get('to_entity', '')
        project_id = request.POST.get('project_id', '')

        stemmer = SnowballStemmer('russian')
        stem_entity = stemmer.stem(to_entity)
        entities = Entity.objects.filter(project_id=project_id)
        ent = LinkingEntities(entities, None)
        stem_entities = ent.get_stemmed_names_of_entity()
        to_id = 0
        for ent in stem_entities:
            if stem_entity == ent.get("stemmed_name"):
                to_id = ent.get("id")

        unmark_entity = LinksBetweenEntities.objects.filter(
            from_entity_id=from_entity, to_entity_id=to_id, is_unmarked=False)
        if unmark_entity:
            data = {'from_entity_id': from_entity, 'to_entity_id': to_id,
                    'is_unmarked': True}
            unmark_entity.update(**data)
            args['status'] = 1
            # Message: "Link deleted successfully"
            messages.add_message(request, messages.SUCCESS, "Связь успешно удалена")

    return JsonResponse(args)
def __init__(self,
             max_edit_distance_dictionary: int = 5,
             prefix_length: int = 10,
             count_threshold: int = 1,
             compact_level: int = 5):
    super().__init__(max_edit_distance_dictionary, prefix_length,
                     count_threshold, compact_level)
    self.stemmer = SnowballStemmer('german')
def get_stem(lang, sentence):
    stemmer = SnowballStemmer(lang)
    stemmed = ''
    for word in casual_tokenize(sentence):
        word = stemmer.stem(word)
        stemmed = stemmed + word + ' '
    return stemmed
def main():
    # get_synsets(parrot_primary)
    stemmer = SnowballStemmer('english')
    print(stemmer.stem("affectionate"))
    print(is_emotion("affection"))
    print(is_emotion("Affection"))
    print(is_emotion("haha"))
def __init__(self):
    # self._no_punct_pattern = re.compile('[a-zA-Z0-9- ]')
    self._tok = MosesTokenizer(lang='en')
    self._stemmer = SnowballStemmer('english')
    self._lemmatizer = TreeTagger(language='english')
    self._stopwords = set(open(STOPWORDS).read().splitlines())
    # stopwords.words('french')
    # self._porter_stemmer = nltk.stem.porter.PorterStemmer()
def stemLine(title, abstract):
    snow = SnowballStemmer('english')
    title = [snow.stem(t) for t in title.split()]
    abstract = [snow.stem(a) for a in abstract.split()]
    return ' '.join(title) + '\t' + ' '.join(abstract)
def stem(word):
    '''Supported languages: danish, dutch, english, finnish, french, german,
    hungarian, italian, norwegian, porter, portuguese, romanian, russian,
    spanish, swedish.'''
    stemmer = SnowballStemmer("english")
    try:
        word = stemmer.stem(word)
    except Exception:
        pass
    return word
def preprocess(question, spelling_correction, stemming, featurenames, numbers_feature):
    """
    Preprocess the questions.

    question - list of questions (each question row carries its text in the fifth field)
    spelling_correction - Boolean parameter which decides whether to use spelling correction, by default False
    stemming - Boolean parameter whether to do the stemming or not
    featurenames - set of feature names
    numbers_feature - Boolean parameter whether to include an indication about a number in the datapoint

    return: valid_questions - list of non-empty questions
            featurenames - list of feature names
    """
    # make a list with German stop words
    stop_words = stopwords.words('german')

    # create the stemmer
    stemmer = SnowballStemmer("german")

    # create a dictionary of German words for spelling correction
    if spelling_correction:
        german_dict = enchant.Dict("de_DE")

    valid_questions = []
    for i in question:
        # check if the question has a category
        if i[3] == 'N':
            continue

        contains_num = 0
        category_text = i[4]
        if re.search(r'\d+', category_text):
            contains_num = 1

        # remove the punctuation
        category_text = re.sub(r'[^a-zA-Z ]', ' ', category_text)

        # remove the stop words and split questions into words
        category_text = category_text.split()
        category_text = [w for w in category_text if w not in stop_words]

        for k in range(len(category_text)):
            # do the spelling correction, if specified
            if spelling_correction:
                if not german_dict.check(category_text[k]):
                    try:
                        category_text[k] = german_dict.suggest(category_text[k])[0]
                    except Exception:
                        pass
            # convert words to lowercase
            category_text[k] = category_text[k].lower()
            # stem the words
            if stemming:
                category_text[k] = stemmer.stem(category_text[k])
            # save the words as features
            if category_text[k]:
                featurenames.add(category_text[k])

        # if the text of the question is not empty, append it to the list of valid questions
        if category_text:
            i[4] = category_text
            if numbers_feature:
                i[4] += [contains_num]
            valid_questions.append(i)

    return valid_questions, list(featurenames)
def _remove_pattern_2(input_text_list):
    stoplist = read_stopwords()
    cleaned_text_list = []
    for text in input_text_list:
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Convert words to lower case
        text = text.lower()
        # text = " ".join(text)

        # Clean the text: drop characters other than A-Za-z0-9^,!./'+-= and normalise contractions
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)
        text = re.sub(r"https://t.co/[A-Za-z]{10}", " ", text)

        text = text.split()
        # Remove stopwords once before stemming
        text = [word for word in text if word not in stoplist]
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        # Remove stopwords again after stemming
        cleanwordlist = [
            word for word in stemmed_words if word not in stoplist
        ]
        text = " ".join(cleanwordlist)
        cleaned_text_list.append(text)
    return cleaned_text_list
def clean_text(text):
    import nltk
    nltk.download('stopwords')

    # strip punctuation
    translate_table = dict((ord(char), None) for char in string.punctuation)
    text = text.translate(translate_table)

    # replace URLs and IP addresses with placeholder tokens
    re_url = re.compile(
        r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}"
        r"([a-zA-Z0-9\.\&\/\?\:@\-_=#])*",
        re.MULTILINE | re.UNICODE)
    re_ip = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
    text = re_url.sub("URL", text)
    text = re_ip.sub("IPADDRESS", text)

    # lowercase, drop stopwords and short tokens
    text = text.lower().split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if w not in stops and len(w) >= 3]
    text = " ".join(text)

    # normalise contractions and common abbreviations
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    # stem what is left
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)

    return text
def preprocess(tweet):
    stop_words = stopwords.words("english")
    stemmer = SnowballStemmer("english")
    tweet = re.sub(TEXT_CLEANING_RE, ' ', str(tweet).lower()).strip()
    tokens = []
    for token in tweet.split():
        if token not in stop_words:
            tokens.append(stemmer.stem(token))
    return " ".join(tokens)
def stem_tweets(sentence):
    stemmer = SnowballStemmer("english")
    stem_sentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stem_sentence += stem
        stem_sentence += " "
    stem_sentence = stem_sentence.strip()
    return stem_sentence
def stemm(self, document):
    '''document stemming'''
    if len(self.tokens) == 0:  # empty list
        self.tokenize(document)
    spanishstemmer = SnowballStemmer('spanish')
    self.stems = [spanishstemmer.stem(token) for token in self.tokens]
def findOrigin(self):
    stemmer = SnowballStemmer("english")
    tokenList = word_tokenize(self.text)
    for token in tokenList:
        # Get the current token and stem it
        curStem = stemmer.stem(token)
        # if current token's stemmed version can be found in stemList
        if curStem in self.stemList and token not in self.stemDict[curStem]:
            self.stemDict[curStem].append(token)
def learn(corpus, epsilon=None):
    # set default smoothing parameter
    if epsilon is None:
        epsilon = 0.1

    prior_sense_dist = {}
    total_num_sense = 0
    word_sense_dist = {}
    sense_sum = {}
    snstmr = SnowballStemmer("english")
    # vocabulary = []
    senses = []

    # collect all of the senses
    for instance in corpus:
        sense = instance.get_sense()
        if sense not in senses:
            senses.append(sense)

    for instance in corpus:
        context = map(lambda x: snstmr.stem(x),
                      instance.get_context().lower().strip().split())

        # count all senses to estimate the distribution of senses
        sense = instance.get_sense()
        total_num_sense += 1
        try:
            prior_sense_dist[sense] += 1
        except KeyError:
            prior_sense_dist[sense] = 1

        # head = instance.get_head_word()

        # count all (word, sense) co-occurrences to estimate the conditional
        # distribution, namely P(word|sense)
        for word in context:
            try:
                word_sense_dist[(word, sense)] += 1
            # found a new word
            except KeyError:
                # put it in the matrix
                word_sense_dist[(word, sense)] = 1 + epsilon
                # also put it for all other possible senses
                for s in senses:
                    if s != sense:
                        word_sense_dist[(word, s)] = epsilon

    # sum over senses, will be used to calculate the probability
    for k in word_sense_dist.keys():
        try:
            sense_sum[k[1]] += word_sense_dist[k]
        except KeyError:
            sense_sum[k[1]] = word_sense_dist[k]

    for k in word_sense_dist.keys():
        word_sense_dist[k] = math.log(float(word_sense_dist[k]) / float(sense_sum[k[1]]))

    for k in prior_sense_dist.keys():
        prior_sense_dist[k] = math.log(float(prior_sense_dist[k]) / total_num_sense)

    return (prior_sense_dist, word_sense_dist)
def softClean(text, rmPunc=False, sentTok=False, rmNumber=False,
              stop_words=False, stem=False, lower_case=False, rm_char=False):
    if lower_case:
        text = text.lower()
    cleanText = re.sub(RVM_REPEATED_PUNC, '\\1', text)
    # Remove time
    cleanText = re.sub(r'[0-9]{1,2}:[0-9]{2}:[0-9]{2}', '', cleanText)
    # Remove date
    cleanText = re.sub(r'[0-9]{1,4}([/-])[0-9]{1,2}\1[0-9]{2,4}', '', cleanText)
    if sentTok:
        cleanText = '\n'.join(sent_tokenize(text))

    tokens = word_tokenize(cleanText) if stop_words or stem else None
    if stem:
        stop_word_set = set(stopwords.words('english') + ["n't"]) if stop_words else set()
        stemmer = SnowballStemmer('english', ignore_stopwords=True)
        old_tokens = tokens
        tokens = []
        for token in old_tokens:
            nt = stemmer.stem(token)
            if len(nt) > 0 and nt not in stop_word_set:
                tokens.append(nt)
    elif stop_words:
        stop_word_set = set(stopwords.words('english') + ["n't"])
        tokens = list(filter(lambda token: token not in stop_word_set, tokens))

    if tokens is not None:
        cleanText = ' '.join(tokens)

    if rmPunc:
        cleanText = re.sub(SPLIT_PUNCT_REGEX, ' ', cleanText)
    else:
        cleanText = re.sub(SPLIT_PUNCT_REGEX, ' \\1 ', cleanText)
    if rmNumber:
        cleanText = re.sub(r'\b[0-9]+\b', '', cleanText)
    if rm_char:
        cleanText = re.sub(r'\b\S\b', '', cleanText)
    return cleanText
def __init__(self, dictionary_loader: DictionaryLoader, stop_words_file: str,
             termination_terms_file: str, stemmer: StemmerI = None):
    super().__init__(stop_words_file, termination_terms_file, dictionary_loader)
    self.stemmer = stemmer
    self.unigram_stem_index = dict()  # record the stem and give an id
    self.concept_length_index = dict()  # record the stem and give the length of the expression
    if not stemmer:
        self.stemmer = SnowballStemmer("english")
    self.punctuation_remove = regex.compile(r'\p{C}', regex.UNICODE)
def __init__(self):
    self.stemmer = SnowballStemmer("english")
    self.stop_list = set(stopwords.words("english"))
    self.dv = DictVectorizer()
    self.vocab_index = {}
    self.index_counter = 0
    self.doc_index = {}
    self.doc_vectors = {}
    self.vocab_counter = Counter()
    self.loaded_vocab = False
def rss_news(rss, word):
    stemmer = SnowballStemmer("russian")
    w = stemmer.stem(word)
    news = []
    feed = feedparser.parse(rss)
    for post in feed.entries:
        lst = list(map(stemmer.stem, (post.title + post.description).split()))
        if w in lst:
            news.append(f"{post.title} \n{post.description}")
    return news
def numero_palabras_positivas(tweet):
    # count the number of positive words in the tweet
    res = 0
    twt = tweet.split()
    try:
        stemmer = SnowballStemmer("spanish")
        for w in twt:
            if stemmer.stem(w.strip()) in positivas_list:
                res += 1
    except Exception:
        pass
    return res
def query():
    stemmer = SnowballStemmer('english')
    raw_tokens = [w for w in re.split(r'\W+', request.args['q'])
                  if w.lower() not in set(stopwords.words('english'))]
    tokens = [stemmer.stem(w) for w in raw_tokens]
    token_s2r = dict(zip(tokens, raw_tokens))

    g = Graph()
    g.parse('../PopHR-ToyOnto2.rdf')

    results = []
    matched_tokens = []
    query_type = None
    for n in [3, 2]:  # important: begin with the most specific query
        if n > len(tokens):
            continue
        for p in product(['lower', 'higher'], permutations(tokens, n)):
            p = flatten(p)
            p3 = p[3] if n == 3 else None  # relation
            results = findLowHighDiseaseRelationships(g, p[0], p[1], p[2], p3)
            if results:
                query_type = 'disease_relationship'
                matched_tokens = [token_s2r[w] for w in p[1:]]  # first is lower/higher
                break
        if results:
            break

    if not results:
        for w in tokens:
            result = findTopAncestorConcept(g, w)
            if result:
                query_type = 'single_concept'
                results.append(result)
                matched_tokens.append(token_s2r[w])
                # don't break, there might be other single concepts!

    html_results = []
    if query_type == 'disease_relationship':
        for r in results:
            s = '<p>> %s is a %s in relation %s to %s</p>' % tuple(
                [concept(r[k]) for k in ['lower', 'higher', 'relation', 'disease']])
            html_results.append(s)
    elif query_type == 'single_concept':
        # s = '<p>> %s</p>' % ' and '.join(['%s is a %s' % (concept(r['lower']), concept(r['highest'])) for r in results])
        for r in results:
            s = '<p>> %s is a %s</p>' % tuple(
                [concept(r[k]) for k in ['lower', 'highest']])
            html_results.append(s)
    else:
        html_results.append('<p>> unable to process query</p>')

    html_query = request.args['q']
    for w in matched_tokens:
        html_query = re.sub(w, '<span class="highlighted">%s</span>' % w, html_query)

    return json.dumps({'html_results': html_results, 'html_query': html_query})
def get_stemmed_names_of_entity(self):
    # get stemmed names of entities
    stemmer = SnowballStemmer('russian')
    stemmed_names_in = self.get_names_of_entity()
    stemmed_names_out = []
    for item in stemmed_names_in:
        # stem a name_of_entity that consists of several words
        stemmed = " ".join([stemmer.stem(word.lower())
                            for word in item.get('name').split(" ")])
        if stemmed.endswith('ок'):
            stemmed = stemmed[:-2] + 'к'
        stemmed_names_out.append({'stemmed_name': str(stemmed).lower(),
                                  'id': item.get('id')})
    return stemmed_names_out
class MFSTagger(object):
    '''
    Most-frequent-sense tagger; used as a baseline system.
    '''

    def __init__(self, model):
        '''
        Constructor
        '''
        self.prior_sense_dist = model
        self.snstmr = SnowballStemmer("english")

    def tag(self, sentence):
        context = map(lambda x: self.snstmr.stem(x),
                      sentence.lower().strip().split())
        predict_sense = "init"
        prob = float('-inf')
        for candi_sense in self.prior_sense_dist.keys():
            current_prob = self.prior_sense_dist[candi_sense]
            if current_prob > prob:
                predict_sense = candi_sense
                # update the probability
                prob = current_prob
        return predict_sense
class SenseTagger(object):
    prior_sense_dist = {}
    word_sense_dist = {}

    def __init__(self, model):
        self.prior_sense_dist = model[0]
        self.word_sense_dist = model[1]
        self.snstmr = SnowballStemmer("english")

    def tag(self, sentence):
        # materialise the stemmed context as a list so it can be iterated
        # once per candidate sense
        context = [self.snstmr.stem(x)
                   for x in sentence.lower().strip().split()]
        predict_sense = "init"
        prob = float('-inf')
        for candi_sense in self.prior_sense_dist.keys():
            current_prob = self.prior_sense_dist[candi_sense]
            for word in context:
                try:
                    current_prob += self.word_sense_dist[(word, candi_sense)]
                except KeyError:
                    pass
            if current_prob > prob:
                predict_sense = candi_sense
                # update the probability
                prob = current_prob
        return predict_sense
class Tokenizer(object):
    """
    For a given language it
        - splits the text into tokens
        - applies the Snowball stemmer
    Main method = @get_tokens
    """

    def __init__(self, language='russian'):
        self.stopwords = set(stopwords.words(language)).union('. , ? ! ( )'.split())
        self.stemmer = SnowballStemmer('russian')

    def get_tokens(self, s):
        """
        :param s: str
        :return: list of str -- stemmed tokens
        """
        return list(map(self._process_token, self._str2tokens(s)))

    def _str2tokens(self, s):
        return list(set(word_tokenize(s.lower())).difference(self.stopwords))

    def _process_token(self, token):
        return self.stemmer.stem(token)
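A brief usage sketch (not from the original source), assuming the NLTK Russian stopword list and the punkt tokenizer data are installed.

# Hypothetical usage of the Tokenizer defined above; token order is not
# guaranteed because _str2tokens builds a set.
tok = Tokenizer(language='russian')
print(tok.get_tokens("Собака бежит по улице"))  # stemmed tokens, stop words and punctuation removed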
class nltk_tools:
    def __init__(self):
        self.stemmer = SnowballStemmer("english")

    def nltk_word_tokenize(self, text):
        return nltk.word_tokenize(text)

    def nltk_text(self, tokens):
        return nltk.Text(tokens)

    def nltk_text_collection(self, documents):
        return nltk.TextCollection(documents)

    def nltk_stopwords(self):
        return stopwords.words('english')

    def nltk_cosine_distance(self, u, v):
        # Closer to 1 the better
        return nltk.cluster.util.cosine_distance(u, v)

    def nltk_stemmer(self, input_string):
        return self.stemmer.stem(input_string)

    def nltk_pos(self, text):
        return nltk.pos_tag(text)

    def nltk_bigrams(self, text):
        return nltk.bigrams(text)
def wordlist(body, remove_stopwords=False, stem=False):
    """Convert a document to a sequence of words, optionally removing stop
    words. Returns a list of words."""
    # Remove non-letters
    text = re.sub("[^a-zA-Z]", " ", body)
    # Convert words to lower case and split them
    words = text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    if stem:
        stemmer = SnowballStemmer("english")
        words = [stemmer.stem(w) for w in words]
    return words
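A brief usage sketch (not from the original source), assuming the NLTK stopwords corpus is available.

# Hypothetical call to wordlist with both options enabled.
print(wordlist("The cats were running!", remove_stopwords=True, stem=True))  # -> ['cat', 'run']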
def mark_as_entity(request):
    args = {'status': 0}
    if request.POST:
        post_project_id = request.POST.get('project_id', '')
        post_entity_name = request.POST.get('entity_name', '')  # to_hovered_word
        # unmarked_entities
        from_entity = request.POST.get('current_entity_id', '')

        stemmer = SnowballStemmer('russian')
        stem_entity = stemmer.stem(post_entity_name)
        entities = Entity.objects.filter(project_id=post_project_id)
        ent = LinkingEntities(entities, None)
        stem_entities = ent.get_stemmed_names_of_entity()
        to_entity = 0
        for ent in stem_entities:
            if stem_entity == ent.get("stemmed_name"):
                to_entity = ent.get("id")

        if from_entity == to_entity:
            pass
        else:
            unmark_entity = LinksBetweenEntities.objects.filter(
                from_entity_id=from_entity, to_entity_id=to_entity)
            if unmark_entity:
                data = {'from_entity_id': from_entity, 'to_entity_id': to_entity,
                        'is_unmarked': False}
                unmark_entity.update(**data)
                args['status'] = 1
                # Message: "Word marked as an entity"
                messages.add_message(request, messages.SUCCESS,
                                     "Слово отмечено как сущность")
            # else:
            # add new entity
            morph = pymorphy2.MorphAnalyzer()
            new_entity = morph.parse(post_entity_name)[0].normal_form.title()
            args['new_entity'] = new_entity
            args['status'] = 2
            # new_entity = Entity(item_name=post_entity_name, description='',
            #                     project_id=post_project_id)
            # new_entity.save()
            # args['status'] = 1
    return JsonResponse(args)
def preprocess_data(data):
    """
    :param data: data to be pre-processed
    :return: pre-processed data
    """
    stemmer2 = SnowballStemmer("english")  # for stemming words
    stop_words = text.ENGLISH_STOP_WORDS  # omit stop words

    temp = data
    temp = re.sub("[,.-:/()?{}*$#&]", " ", temp)  # remove all symbols
    temp = "".join([ch for ch in temp if ch not in string.punctuation])  # remove all punctuation
    temp = "".join(ch for ch in temp if ord(ch) < 128)  # remove all non-ASCII characters
    temp = temp.lower()  # convert to lowercase
    words = temp.split()
    no_stop_words = [w for w in words if w not in stop_words]

    # stemming of words
    stemmed_data = [stemmer2.stem(plural) for plural in no_stop_words]
    return stemmed_data
def CreateTextVector():
    fileName = 'ocr_result _1.csv'
    wordDict = {}
    csvfile = open(fileName, 'r')
    reader = csv.reader(csvfile)
    regEx = re.compile(r'\W*')
    regNumber = re.compile(r'^\d+$')
    regStartNumber = re.compile(r'^\d+')
    regEndNumber = re.compile(r'\d+$')
    stemmer = SnowballStemmer("english")
    for line in reader:
        if len(line) > 0:
            currentFileName = line[0]
            content = line[1]
            rawWordList = regEx.split(content)
            wordList = [tok.lower() for tok in rawWordList
                        if len(tok) > 2 and len(tok) < 20 and regNumber.match(tok) is None]
            for currentWord in wordList:
                wordStem = currentWord
                # Remove numbers at the beginning and end of the word
                startNumberMatch = regStartNumber.match(wordStem)
                if startNumberMatch is not None:
                    wordStem = wordStem[startNumberMatch.span()[1]:]
                endNumberMatch = re.search(r'\d+$', wordStem)
                if endNumberMatch is not None:
                    wordStem = wordStem[:endNumberMatch.span()[0]]
                wordStem = stemmer.stem(wordStem)
                if len(wordStem) > 2:
                    if wordStem in wordDict:
                        wordDict[wordStem] = wordDict[wordStem] + 1
                    else:
                        wordDict[wordStem] = 1
    csvfile.close()

    sortedList = sorted(wordDict.items(), key=lambda d: d[1], reverse=False)
    keys = [[item[0]] for item in sortedList if item[1] > 2]
    print(len(keys))
    # Write the keys to a csv file
    CSVUtil.WriteCSV('../../data/all_tokens.csv', keys)
def tokenize_and_stem(txt, stem=True, remove_html=True, join=False,
                      remove_stopwords=True):
    '''Remove html and stopwords, tokenize and stem.'''
    lang = 'english'
    if remove_html:
        txt = clean_html(txt)
    words = tokenize(txt)
    if remove_stopwords:
        stop_words = stopwords.words(lang)
        words = [w for w in words if w.lower() not in stop_words]
    if stem:
        stemmer = SnowballStemmer(lang)
        words = [stemmer.stem(word) for word in words]
    if join:
        words = " ".join(words)
    return words
def main():
    parser = optparse.OptionParser()
    parser.add_option('-f', help='input word list file', type='string', dest='in_file')
    parser.add_option('-o', help='output file', type='string', dest='out_file')

    (opts, args) = parser.parse_args()

    mandatories = ['in_file', 'out_file']
    for m in mandatories:
        if not opts.__dict__[m]:
            print("Missing required argument")
            parser.print_help()
            exit(-1)

    lista = open(opts.in_file, 'r')
    output = open(opts.out_file, 'w')

    stemmer = SnowballStemmer("spanish")
    for word in lista:
        output.write(stemmer.stem(word.strip()) + "\n")

    lista.close()
    output.close()
class StemTransform(metl.transform.base.Transform):

    init = ['language']

    # void
    def __init__(self, language, *args, **kwargs):
        self.language = language.lower()
        self.stemmer = SnowballStemmer(self.language)
        super(StemTransform, self).__init__(*args, **kwargs)

    # Field
    def transform(self, field):
        if field.getValue() is None:
            return field

        field.setValue(u' '.join([
            self.stemmer.stem(word)
            for word in field.getValue().split()
            if word != u''
        ]))
        return field
class TweetNormalizer(object):
    def __init__(self, language):
        self.language = language
        self.stemmer = SnowballStemmer(language, ignore_stopwords=True)

    def clean_stopwords(self, text):
        # Cleaning language stopwords
        splitted = [i for i in text.split()
                    if i not in stopwords.words(self.language)]
        cleaned_splitted = []
        # Cleaning twitter stopwords
        for word in splitted:
            cleaned_splitted.append(word)
            for twitter_stopword in TWITTER_STOPWORDS:
                if word.startswith(twitter_stopword):
                    cleaned_splitted.remove(word)
        return ' '.join(cleaned_splitted)

    def stem(self, text):
        splitted = text.split()
        for i, word in enumerate(splitted):
            stem_word = self.stemmer.stem(unidecode(word))
            splitted[i] = stem_word
        return ' '.join(splitted)

    def normalize(self, text):
        text = self.clean_stopwords(text)
        text = self.stem(text)
        return text
def __init__(self, model):
    self.prior_sense_dist = model[0]
    self.word_sense_dist = model[1]
    self.snstmr = SnowballStemmer("english")
if __name__ == '__main__':
    snstmr = SnowballStemmer("english")
    print(snstmr.stem("stemming"))
import sys
import json

from nltk import SnowballStemmer

if __name__ == '__main__':
    for line in sys.stdin:
        try:
            tweet = json.loads(line, strict=False)
        except Exception:
            continue
        lang = tweet['lang_det']
        tweet['stemmed'] = []
        options = {'de': "german", 'en': "english", 'ro': "romanian",
                   'da': "danish", 'nl': "dutch", 'fi': "finnish",
                   'fr': "french", 'hu': "hungarian", 'it': "italian",
                   'no': "norwegian", 'pt': "portuguese", 'ru': "russian",
                   'es': "spanish", 'sv': "swedish"}
        try:
            stemmer = SnowballStemmer(options[lang])
        except Exception:
            print(json.dumps(tweet))
            continue
        tokens = tweet['tokens']
        for token in tokens:
            tweet['stemmed'].append(stemmer.stem(token))
        print(json.dumps(tweet))