import nltk
from collections import defaultdict
from nltk.stem.snowball import EnglishStemmer


class Index:
    """Inverted index data structure."""

    def __init__(self, tokenizer=None, stemmer=None, stopwords=None):
        """
        Parameters
        ----------
        tokenizer : callable, optional
            NLTK-compatible tokenizer function; defaults to nltk.word_tokenize.
        stemmer : object, optional
            NLTK-compatible stemmer; defaults to EnglishStemmer.
        stopwords : list, optional
            List of ignored words; defaults to the empty set.
        """
        # the original ignored all three parameters and, when a stopword list
        # was passed, loaded NLTK's English list instead of using it
        self.tokenizer = tokenizer or nltk.word_tokenize
        self.stemmer = stemmer or EnglishStemmer()
        self.index = defaultdict(list)
        self.documents = {}
        self.__unique_id = 0
        self.stopwords = set(stopwords) if stopwords else set()

    def lookup(self, word: str):
        """Look up a word in the index and return the matching documents."""
        word = word.lower()
        if self.stemmer:
            word = self.stemmer.stem(word)
        # default to [] so an unseen word returns no hits instead of crashing
        return [self.documents.get(id) for id in self.index.get(word, [])]

    def add(self, document):
        """Add a document string to the index."""
        for token in [t.lower() for t in self.tokenizer(document)]:
            if token in self.stopwords:
                continue
            if self.stemmer:
                token = self.stemmer.stem(token)
            if self.__unique_id not in self.index[token]:
                self.index[token].append(self.__unique_id)
        self.documents[self.__unique_id] = document
        self.__unique_id += 1
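# A quick usage sketch for the Index class above (assumes the NLTK 'punkt'
# tokenizer and 'stopwords' data are downloaded); documents are illustrative.
idx = Index(stopwords=["the", "a"])
idx.add("The quick brown fox jumps over the lazy dog")
idx.add("Foxes are quick and clever animals")
print(idx.lookup("foxes"))  # both documents match via the shared stem 'fox'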
from nltk.stem.snowball import EnglishStemmer


# The enclosing decorator is reconstructed here: `func` was an unbound name
# in the original wrapper, so it was clearly the inner function of one.
def stem_results(func):
    """Decorator: stem the words returned by ``func``.

    Handles both a plain list of words and a list of tuples, stemming the
    first element of each tuple.
    """
    def stem_wrapper(_input=None):
        stemmer = EnglishStemmer()
        result = func(_input) if _input else func()
        if isinstance(result, list):
            if isinstance(result[0], tuple):
                for i in range(len(result)):
                    item = list(result[i])
                    item[0] = stemmer.stem(item[0])
                    result[i] = tuple(item)
                return result
            return [stemmer.stem(word) for word in result]
    return stem_wrapper
def fix_lemma_problem(pred_scores, targets, space):
    from nltk.stem.snowball import EnglishStemmer
    es = EnglishStemmer()
    r = pred_scores.copy()
    lemmas = np.array([es.stem(v) for v in space.vocab])
    for i, t in enumerate(targets):
        g = es.stem(space.vocab[t])
        # mask out every vocabulary entry that shares the target's stem
        mask = (lemmas == g)
        r[i][mask] = -1e9
    return r
class Stemmer_Transformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.stemmer = EnglishStemmer()

    def fit(self, mots, y=None):
        return self

    def transform(self, mots, y=None):
        # keep the stemmed words; the original discarded stem()'s return value
        for i in range(len(mots)):
            mots[i] = [self.stemmer.stem(mot) for mot in mots[i]]
        return mots
def similarity_score(word1, word2):
    """
    See sections 2.3 and 2.4 of
    http://dx.doi.org.ezp-prod1.hul.harvard.edu/10.1109/TKDE.2003.1209005

    :type word1: string
    :type word2: string
    :return: float between 0 and 1; similarity between the two given words
    """
    stemmer = EnglishStemmer()
    if stemmer.stem(word1) == stemmer.stem(word2):
        return 1
    alpha = 0.2
    beta = 0.6
    l, h = get_path_length_and_subsumer_height(word1, word2)
    # exp(-alpha * l) * tanh(beta * h)
    return exp(-alpha * l) * ((exp(beta * h) - exp(-beta * h)) /
                              (exp(beta * h) + exp(-beta * h)))
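# A hedged numeric check of the Li et al. (2003) similarity formula used by
# similarity_score above; l = 2 and h = 5 are illustrative values, not output
# of get_path_length_and_subsumer_height.
from math import exp

alpha, beta, l, h = 0.2, 0.6, 2, 5
sim = exp(-alpha * l) * ((exp(beta * h) - exp(-beta * h)) /
                         (exp(beta * h) + exp(-beta * h)))
print(round(sim, 3))  # 0.667, i.e. exp(-0.4) * tanh(3.0)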
def stem_terms_list(terms_list: list) -> list:
    """Stem the list of terms in place using the Snowball stemmer.

    Updates terms_list and also returns the same, updated list.
    """
    stemmer = EnglishStemmer()
    for i in range(len(terms_list)):
        terms_list[i] = stemmer.stem(terms_list[i])
    return terms_list
def fit(self, X, D, seq_length=None, wgt=1, wgt_inverse=0):
    max_seq_length = 0
    word_counts = defaultdict(lambda: 0)
    doc_counts = defaultdict(lambda: 1)
    for doc in X:
        sl = 0
        tokens = self.tokenize(doc)
        for w in set(tokens):
            doc_counts[w] += 1
        for w in tokens:
            sl += 1
            word_counts[w] += 1
        max_seq_length = sl if sl > max_seq_length else max_seq_length
    self.max_seq_length = max_seq_length
    word_counts = sorted(dict(word_counts).items(), reverse=True,
                         key=operator.itemgetter(1))
    doc_words = doc_counts.keys()
    doc_ids = np.array(list(doc_counts.values())) / float(X.shape[0])
    self.doc_counts = dict(zip(doc_words, doc_ids))
    self.doc_counts["__PADDING__"] = 0
    self.doc_counts["__OOV_WORD__"] = 0
    self.word_index = dict()
    self.word_index["__PADDING__"] = self.pad_id
    self.word_index["__OOV_WORD__"] = self.oov_id
    self.word_counts = dict()
    for i, (w, c) in enumerate(word_counts):
        self.word_index[w] = i + 1 + self.base_word_id
        self.word_counts[i + 1 + self.base_word_id] = c
    self.inverse_word_index = {v: k for k, v in self.word_index.items()}

    from nltk.stem.snowball import EnglishStemmer
    stemmer = EnglishStemmer()
    V = []
    for i, x in enumerate(X):
        # D is assumed to be the dataframe carrying the dependency paths;
        # the original referenced an unbound name `df` here
        deplist = D["dependency_path"].values[i]
        tokens = word_tokenize(x)
        if seq_length is not None:
            v = np.ones(seq_length) * wgt_inverse
        else:
            v = np.ones(self.max_seq_length) * wgt_inverse
        for j, w in enumerate(tokens):
            if stemmer.stem(w.lower()) in deplist or w.lower() in deplist:
                v[j] = wgt
        V.append(v)
    self.weights = np.array(V)
    return V, self.weights
class Tokenizer(object):
    def __init__(self):
        self.cache = {}
        self.r_stemmer = RussianStemmer()
        self.e_stemmer = EnglishStemmer()

    def process_word(self, w):
        if w in self.cache:
            return self.cache[w]
        struct = check_structure(w)
        if struct == 'WORD':
            # ASCII-only words go to the English stemmer, the rest to Russian
            w_proc = self.e_stemmer.stem(w) if is_ascii(w) else self.r_stemmer.stem(w)
        elif struct == 'COMPLEX':
            w_proc = w
        else:
            # 'TRASH', 'NUMBER', and anything unexpected are dropped; the
            # original left w_proc unbound for unknown structure labels
            w_proc = ''
        self.cache[w] = w_proc
        return w_proc

    def tokenize(self, text):
        text = preprosess_text(text)
        tokens = [self.process_word(w) for w in text.split(' ')]
        return [t for t in tokens if len(t)]
def Granularity(sentenceArray):
    from nltk.corpus import stopwords
    stemmer = EnglishStemmer()
    stop = stopwords.words('english')
    for sentence in sentenceArray:
        try:
            # strip hashtags, @mentions, and pic.twitter links
            sentence = re.sub(r'\#.*?$', '', sentence)
            sentence = re.sub(r'\#.*? ', '', sentence)
            sentence = re.sub(r'\@.*?$', '', sentence)
            sentence = re.sub(r'\@.*? ', '', sentence)
            sentence = re.sub(r'pic.twitter.*?$', '', sentence)
            sentence = re.sub(r'pic.twitter.*? ', '', sentence)
            # expand common contractions
            sentence = re.sub(r'\'m', ' am', sentence)
            sentence = re.sub(r'\'d', ' would', sentence)
            sentence = re.sub(r'\'ll', ' will', sentence)
            sentence = re.sub(r'\&', 'and', sentence)
            sentence = re.sub(r'don\'t', 'do not', sentence)
            data = stemmer.stem(sentence)
            print(data)
            final = [i for i in str(data).split() if i not in stop]
            finalstring = ' '.join(final)
            # `word` is assumed to be defined in the enclosing module
            os.system("printf \"" + str(finalstring) + "\n\">> stemstop/" + word)
        except Exception as e:
            print(e)
def query(word):
    db = MySQLdb.connect("127.0.0.1", "dizing", "ynr3", "dizing")
    cursor = db.cursor()
    snowball_stemmer = EnglishStemmer()
    stem2 = snowball_stemmer.stem(word)
    cursor.execute("SELECT * FROM words WHERE original=%s OR stem1=%s OR stem2=%s",
                   (word, word, stem2))
    rows = cursor.fetchall()
    words1 = dict()
    words2 = dict()
    for row in rows:
        if row[1] == word or row[3] == word:
            words1[word] = row[0]
        else:
            words2[word] = row[0]
    scenes1 = []
    scenes2 = []
    for (i, words_dict) in [(1, words1), (2, words2)]:
        for wid in words_dict.values():
            sql = ("SELECT s.sentence, s.start, s.stop, s.ready, m.title "
                   "FROM scene AS s, words_scenes AS ws, movie as m "
                   "WHERE ws.wid=%d AND ws.sid=s.sid AND s.mid = m.mid" % int(wid))
            cursor.execute(sql)
            rows = cursor.fetchall()
            if i == 1:
                scenes1 += rows
            else:
                scenes2 += rows
    print(scenes1)
    print(scenes2)
    # close the connection before returning; the original closed it after
    # the return statement, which never ran
    db.close()
    return scenes1 + scenes2
def etape3_bis_get_counterWord2_with_sw_and_stemmer(data, stopword):
    '''Per-post word count that takes stop words into account.
    This variant also applies a stemmer.

    INPUT:
    ------
    - data

    OUTPUT:
    -------
    - data plus a counter column (chosen solution), OR
    - a list of counters (on hold: performance concerns to evaluate)

    CALL:
    -----
    - etape3(df2, sw)
    '''
    from nltk.stem.snowball import EnglishStemmer
    stemmer = EnglishStemmer()
    data_text = data['Body_clean']
    stopword = set(stopword)
    # initialise the new feature
    data['Counter_WORD2'] = None
    for m_id, m_text in data_text.items():
        # tokenize, drop stop words, stem, and store in the new column
        tokens = tokenizer.tokenize(m_text.lower())
        data.at[m_id, 'Counter_WORD2'] = [stemmer.stem(w) for w in tokens
                                          if w not in stopword]
def tokenize(self, file):
    stemmer = EnglishStemmer()
    url = open(file, encoding='utf-8')
    html = url.read()
    soup = BeautifulSoup(html, 'lxml')
    for script in soup(['script', 'style']):
        script.extract()
    word_weights = defaultdict(int)
    # weight tokens by the tag they occur in: title > headings/bold > body
    for i in soup.find_all(['title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                            'b', 'strong', 'p']):
        tokens = set()
        for t in re.findall(r'\w+', i.text):
            if len(t) >= 2:
                tokens.add(t.lower())
        weight = [1, 4, 10]
        for token in tokens:
            token = stemmer.stem(token)
            if i.name == 'title':
                word_weights[token] += weight[2]
            elif i.name == 'strong' or i.name == 'b' or i.name[0] == 'h':
                word_weights[token] += weight[1]
            else:
                word_weights[token] += weight[0]
    url.close()
    return word_weights
def generate_tokens(document_text):
    """Process text so it can be added to the index.

    The NLTK module provides a tokenizer, word stemmer, and a list of
    stopwords."""
    stemmer = EnglishStemmer()  # from NLTK module
    try:
        tokens = [additional_pruning(remove_hyphens(stemmer.stem(token.lower())))
                  for token in nltk.word_tokenize(document_text)
                  if len(token) > 2 and token]
    except Exception as e:
        print('error: {}\ndocument_text: {}'.format(e, document_text))
        tokens = []
    good_tokens = []
    for token in tokens:
        good_tokens.extend([unidecode(item) for item in process_url_prefixes(token)])
        good_tokens.extend([custom_youtube_stemmer(item)
                            for item in process_url_prefixes(token)
                            if custom_youtube_stemmer(item)])
    better_tokens = []
    for token in good_tokens:
        if has_alpha_char(token):
            better_tokens.extend(process_url_ends(token))
    return better_tokens
def getStem(*words):
    '''Return a list of the stems corresponding to the words provided as
    arguments.'''
    stemmer = EnglishStemmer()
    return [stemmer.stem(word) for word in words]
class StemmedCountVectorizer(CountVectorizer):
    def __init__(self, lang, strip_accents=None, ngram_range=(1, 1),
                 max_df=1.0, min_df=1, stop_words=None):
        if lang == 'de':
            self.stemmer = GermanStemmer()
        else:
            self.stemmer = EnglishStemmer()
        # super(self.__class__, ...) recurses forever under further
        # subclassing; plain super() is safe
        super().__init__(stop_words=stop_words, strip_accents=strip_accents,
                         ngram_range=ngram_range, max_df=max_df, min_df=min_df)

    def _stem_tokens(self, words):
        return [self.stemmer.stem(w) for w in words]

    def build_analyzer(self):
        preprocess = self.build_preprocessor()
        stop_words = self.get_stop_words()
        tokenize = self.build_tokenizer()
        return lambda doc: self._word_ngrams(
            self._stem_tokens(tokenize(preprocess(self.decode(doc)))),
            stop_words)
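# Minimal usage sketch for StemmedCountVectorizer (assumes scikit-learn and
# the NLTK stemmers are installed); the two documents are made up.
docs = ["studies show studying helps", "the student studied"]
vec = StemmedCountVectorizer(lang='en', stop_words='english')
X = vec.fit_transform(docs)
print(sorted(vec.vocabulary_))  # ['help', 'show', 'studi', 'student']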
def getAllStemEntities(entities):
    st = EnglishStemmer()
    q = [",", ".", "!", "?", ":", ";"]
    tmp = []
    sourceEntities = [x for x in entities if len(x) > 0]
    np.random.shuffle(entities)
    for i in range(len(entities)):
        if len(entities[i]) == 0:
            continue
        if i % 1000 == 0:
            print(i)
        entities[i] = entities[i].lower()
        # normalise dashes, parentheses, and apostrophes; str.replace with no
        # count argument already replaces every occurrence, and the original
        # passed entities.count(...) for the apostrophe case by mistake
        entities[i] = entities[i].replace(" - ", " \u2013 ")
        entities[i] = entities[i].replace(" -", " \u2013")
        entities[i] = entities[i].replace("- ", "\u2013 ")
        entities[i] = entities[i].replace("-", " - ")
        entities[i] = entities[i].replace(")", " )")
        entities[i] = entities[i].replace("(", "( ")
        entities[i] = entities[i].replace("\u0027", " \u0027")
        for w in q:
            entities[i] = entities[i].replace(w, " " + w)
        s = ""
        for w in entities[i].split(" "):
            s += st.stem(w) + " "
        tmp.append(s[:-1])
        if len(tmp) > 50:
            break
    return tmp, entities[:len(tmp)]
def get_unique_words_by_stemming(self):
    """Compare a starting word and a group of user-submitted words,
    using stemming to determine similarity.

    Return a list of unique words, with matching user.
    """
    stemmer = EnglishStemmer()
    # make the starting list, lower-cased for comparison
    list_of_words = [word.lower() for word in self.user_submissions.values()]
    # add the starting word to the list
    list_of_words.append(self.prompt)
    # drop words that were submitted more than once
    counter_of_words = Counter(list_of_words)
    list_of_words = [word for word in counter_of_words
                     if counter_of_words[word] == 1]
    stemmed_dict = {word: stemmer.stem(word) for word in list_of_words}
    unique_words = [key for key, value in stemmed_dict.items()
                    if list(stemmed_dict.values()).count(value) == 1]
    if self.prompt in unique_words:
        unique_words.remove(self.prompt)
    return unique_words
def stem_tokenizer(text):
    stemmer = EnglishStemmer(ignore_stopwords=True)
    # remove URL links, keep alphanumerics and hyphens, then stem
    words = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    words = re.sub(r"[^A-Za-z0-9\-]", " ", words).lower().split()
    return [stemmer.stem(word) for word in words]
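# Illustrative call to stem_tokenizer above; the URL and text are made up.
print(stem_tokenizer("Running faster! See https://example.com/results"))
# -> ['run', 'faster', 'see']  (URL stripped, lower-cased, stemmed)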
def stem_docs(docs):
    """Return stemmed documents."""
    stemmer = EnglishStemmer()
    return [" ".join(stemmer.stem(word) for word in sentence.split(" "))
            for sentence in docs]
def _tokenize_sentences(text):
    '''
    Tokenize sentences by performing the following:
      - convert to uniform case (lower)
      - numeric removal
      - punctuation removal
      - word stemming
      - stop word removal
    Token lists are converted to token strings for hashability.
    '''
    original_sentences = sent_tokenize(text)
    stops = set(stopwords.words('english'))
    # sentences to lower case
    tokenized_sentences = [s.lower() for s in original_sentences]
    # remove numbers
    regex = re.compile(r"[0-9]+")
    tokenized_sentences = [regex.sub("", s) for s in tokenized_sentences]
    # strip all punctuation
    regex = re.compile(str.format('([{0}])+', re.escape(punctuation)))
    tokenized_sentences = [regex.sub(" ", s) for s in tokenized_sentences]
    # strip stop words
    tokenized_sentences = [[w for w in s.split() if w not in stops]
                           for s in tokenized_sentences]
    # stem the sentences
    stemmer = EnglishStemmer()
    tokenized_sentences = [[stemmer.stem(word) for word in sentence]
                           for sentence in tokenized_sentences]
    # join the sentences back into strings
    tokenized_sentences = [' '.join(lst) for lst in tokenized_sentences]
    return _merge_sentences(original_sentences, tokenized_sentences)
def unsupervised_predict(post, lda_tags_df_scaled, dictionary, lda):
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    stemmer = EnglishStemmer()
    # lemmatize (verb POS), then stem each whitespace token
    stemmed = ' '.join(stemmer.stem(WordNetLemmatizer().lemmatize(w, pos='v'))
                       for w in w_tokenizer.tokenize(post))
    pattern = re.compile('[^A-Za-z +]')
    normalized = re.sub(pattern, ' ', stemmed)
    result = [token for token in gensim.utils.simple_preprocess(normalized)
              if token not in gensim.parsing.preprocessing.STOPWORDS]
    other_corpus = [dictionary.doc2bow(text) for text in [result]]
    unseen_doc = other_corpus[0]
    vector = lda[unseen_doc]
    topic = vector[0][0][0]
    perc = vector[0][0][1]
    tags = lda_tags_df_scaled[int(topic)]
    tags_output = tags.sort_values(ascending=False).head(5)
    return tags_output
def clean(toClean):
    """
    Clean the text before adding it to the JSON. Uses regex to remove
    non-alphabetical characters and extra spaces, then uses NLTK to remove
    stop words.

    Arguments:
        toClean {str} -- A string of text to be cleaned

    Returns:
        {str} -- A cleaned string of text
    """
    stemmer = EnglishStemmer()
    stopWords = set(stopwords.words('english'))
    clean = ""
    # force lowercase
    toClean = toClean.lower()
    # remove non-alphabetical chars; use [^a-zA-Z] if not forced lowercase
    toClean = re.sub('[^a-z,.!?]', ' ', toClean)
    # replace sequential spaces with a single space
    toClean = re.sub(' +', ' ', toClean)
    # tokenize words for stemming and stop-word removal
    tokens = word_tokenize(toClean)
    # stem each token and keep it only if it is not a stop word
    for token in tokens:
        token = stemmer.stem(token)
        if token not in stopWords:
            clean = clean + (str(token) + " ")
    return clean
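# Example run of clean() above (needs the NLTK 'punkt' and 'stopwords' data);
# the input sentence is illustrative.
print(clean("Running quickly through the 3 parks!"))
# -> "run quick park ! "  (digits dropped, stop words removed, tokens stemmed)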
def filter_misc(series, pos=None, stem=False):
    new_series = pd.Series(index=series.index)
    if stem:
        # does not stem stopwords
        stemmer = EnglishStemmer(ignore_stopwords=True)
    if pos is not None:
        for index, text in enumerate(series):
            new_series.iloc[index] = ' '.join(
                y for y, tag in nltk.pos_tag(nltk.word_tokenize(text))
                if tag in pos)
        pos_flag = True
    else:
        pos_flag = False
    if stem:
        # stem the POS-filtered text if available, the raw series otherwise
        use_series = new_series if pos_flag else series
        for index, text in enumerate(use_series):
            stemmed_text = [stemmer.stem(word) for word in text.split()]
            new_series.iloc[index] = ' '.join(stemmed_text)
    return new_series
def process_text(self, text, k):
    es = EnglishStemmer()
    stops = set(stopwords.words('english'))
    # stem and concatenate the non-stopword tokens, then emit character k-grams
    doc = "".join(es.stem(w.lower()) for w in text.split()
                  if w.lower() not in stops)
    return [doc[index:index + k] for index in range(len(doc) - k + 1)]
def pre_proc(in_str, removestop=True, alwayskeep=False, word_punc=False,
             unquote=False):
    # remove accents, wordify punctuation
    in_str = strip_accents(in_str, wordify=word_punc, unquote=unquote)
    en_stem = EnglishStemmer()
    # tokenize string, optionally removing stop words
    if removestop:
        tok_list = [x for x in wordpunct_tokenize(in_str)
                    if x not in stopwords.words('english')]
    else:
        tok_list = wordpunct_tokenize(in_str)
    new_tok_list = []
    for tok in tok_list:
        if tok not in WORD_PUNC_LIST:
            correct_spell = HOBJ.spell(tok)
            if not correct_spell:
                suggestions = [strip_accents(tmp_sug).lower()
                               for tmp_sug in HOBJ.suggest(tok)]
            else:
                suggestions = []
            if correct_spell or (tok.lower() in suggestions):
                new_tok_list.append(tok)
                tok_stem = en_stem.stem(tok)
                if tok_stem != tok:
                    new_tok_list.append(tok_stem)
            elif len(tok) >= 3:
                # fall back to the closest spelling suggestion
                tok_sug = None
                lev_perc = .34
                for sug in suggestions:
                    if not tok_sug and tok == sug[1:]:
                        tok_sug = sug
                if not tok_sug:
                    for sug in suggestions:
                        tmp_lev_perc = (float(lev_dist(tok, sug)) /
                                        float(max(len(tok), len(sug))))
                        if not tok_sug and tmp_lev_perc < lev_perc:
                            tok_sug = sug
                            lev_perc = tmp_lev_perc
                if tok_sug:
                    new_tok_list.append(tok_sug)
                    tok_stem = en_stem.stem(tok_sug)
                    if tok_stem != tok_sug:
                        new_tok_list.append(tok_stem)
                elif alwayskeep:
                    new_tok_list.append(tok)
            elif alwayskeep:
                new_tok_list.append(tok)
        else:
            new_tok_list.append(tok)
    # string.join() was Python 2 only
    out_str = ' '.join(new_tok_list)
    return out_str.lower()
def _execute(self):
    corpus = mongoExtractText(self.name)
    stemmer = EnglishStemmer()
    for item in corpus:
        line = item.replace(',', ' ')
        stemmed_line = stemmer.stem(line)
        self.sentiment.append((sentiment.sentiment(stemmed_line), stemmed_line))
def run(self, data):
    english = EnglishStemmer()
    for corpus in data:
        corpus.tokenized_contents = [english.stem(word)
                                     for word in corpus.tokenized_contents]
        # join with spaces; the original's ''.join glued the tokens together
        corpus.contents = ' '.join(corpus.tokenized_contents)
    return data
def tokenize_en(text):
    from nltk.stem.snowball import EnglishStemmer

    def is_ok(item):
        # keep lower-case, purely alphabetic ASCII tokens
        return (item.lower() == item and
                all(c.isalpha() and c in string.ascii_letters for c in item))

    stemmer = EnglishStemmer(ignore_stopwords=True)
    tokens = word_tokenize(text)
    return [stemmer.stem(item) for item in tokens if is_ok(item)]
def as_eng_postagged_doc(doc):
    '''Uses the NLTK default tagger.'''
    tags = [t for _, t in nltk.pos_tag(list(doc.word))]
    # note: the 'lemma' column actually holds Snowball stems, not true lemmata
    stemmer = EnglishStemmer()
    lemmata = [stemmer.stem(w) for w in list(doc.word)]
    doc['pos'] = Series(tags)
    doc['lemma'] = Series(lemmata)
    return doc
def stem_word(word):
    """Stem a word.

    :param word: (str) text word
    :returns: stemmed word
    """
    stemmer = EnglishStemmer()
    return stemmer.stem(word)
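# stem_word above builds a new EnglishStemmer on every call; when stemming
# many words, a shared instance avoids that cost. A minimal sketch (the
# _STEMMER name and stem_words helper are ours, not from the original):
_STEMMER = EnglishStemmer()

def stem_words(words):
    """Stem an iterable of words with one shared stemmer instance."""
    return [_STEMMER.stem(w) for w in words]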
def parse(self, text):
    stemmer = EnglishStemmer()
    text = text.decode('utf-8')
    # stem first, then drop stemmed tokens found in the stop set
    tokens = [stemmer.stem(token.lower()) for token in wordpunct_tokenize(text)]
    return [token for token in tokens if token not in self.stop_set]
def stemArticle(self, doc):
    stemmer_fr = FrenchStemmer()
    stemmer_en = EnglishStemmer()
    # apply the French stemmer first, then the English one
    stemmedArticle = [str(stemmer_fr.stem(w)) for w in doc]
    stemmedArticle = [str(stemmer_en.stem(w)) for w in stemmedArticle]
    return stemmedArticle
def tokenize(text, stem=False):
    tokens = [word for word in word_tokenize(text) if word.isalpha()]
    if stem:
        stemmer = EnglishStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)
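# Illustrative calls to tokenize() above, with and without stemming.
print(tokenize("The cats were running"))             # 'The cats were running'
print(tokenize("The cats were running", stem=True))  # 'the cat were run'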
def word_count(document):
    words = get_words(document["content"])
    stemmer = EnglishStemmer()
    words = [stemmer.stem(word) for word in words]
    fdist = FreqDist(words)
    for word, frequency in fdist.most_common(50):
        print(u'{};{}'.format(word, frequency))
    fdist.plot(30, cumulative=False)
class TextProcessor:
    def __init__(self):
        self._stemmer = EnglishStemmer()
        # load the stop list once; the original rebuilt it for every token
        self._stopwords = set(stopwords.words("english"))

    def process(self, text: str) -> list[str]:
        words = [word for word in word_tokenize(text)
                 if word not in self._stopwords and word.isalpha()]
        return [self._stemmer.stem(word) for word in words]
def text_processing(text, min_size=4, sep_char=' '):
    from nltk.stem.snowball import EnglishStemmer
    from nltk.corpus import stopwords as stwds
    stemmer = EnglishStemmer()
    stopwords = set(stwds.words('english') + contractions_without_punc)
    # stem every sufficiently long, non-stopword token; the original had an
    # unreachable second loop after this return
    return [stemmer.stem(w) for w in text.split(sep_char)
            if w not in stopwords and len(w) >= min_size]
def computeSentiment(tweet_text):
    pos_count = 0
    neg_count = 0
    pos_terms = []
    neg_terms = []
    st = EnglishStemmer()
    tokenized_tweet = tokenize(tweet_text)
    for t in tokenized_tweet:
        stem = st.stem(t.lower())
        if stem in negative_terms:
            neg_terms.append(t.lower())
            neg_count += 1
        elif stem in positive_terms:
            pos_terms.append(t.lower())
            pos_count += 1
    return pos_count, neg_count, set(pos_terms), set(neg_terms)
def use_snowball_stemmer(self, word):
    """Return the stemmed word, using the Snowball algorithm.

    :param word: word to stem
    :return: stemmed word
    """
    englishStemmer = EnglishStemmer()
    stemmed_word = englishStemmer.stem(word)
    return stemmed_word
class StemTokenizer(object):
    """Transform each word to its stemmed version, e.g. studies --> studi."""

    def __init__(self):
        self.st = EnglishStemmer()

    def __call__(self, doc):
        return [self.st.stem(t) for t in word_tokenize(doc)]
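# StemTokenizer is shaped like a scikit-learn tokenizer callable; a usage
# sketch assuming scikit-learn and the NLTK 'punkt' data are available.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=StemTokenizer())
vectorizer.fit(["studies studying studied"])
print(sorted(vectorizer.vocabulary_))  # ['studi'] - all three forms collapse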
def getLemmatizerInfo(pathArticle):
    with open(pathArticle, "r", encoding="utf-8") as data:
        text1 = data.read()
    sourceText = text1
    links1 = []
    l = 0
    for q in text1.split():
        if q == '\ufeff':
            continue
        links1.append([text1.find(q, l), q])
        l = len(q) + 1 + text1.find(q, l)
    # normalise dashes, parentheses, and apostrophes (replace with no count
    # argument already replaces every occurrence)
    text1 = text1.replace(' - ', ' \u2013 ')
    text1 = text1.replace(' -', ' \u2013')
    text1 = text1.replace('- ', '\u2013 ')
    text1 = text1.replace('-', ' - ')
    text1 = text1.replace('(', '( ')
    text1 = text1.replace(')', ' )')
    text1 = text1.replace(' \u0027', ' \u301e')
    text1 = text1.replace('\u0027', ' \u0027')
    text1 = text1.split()
    if text1[0] == '\ufeff':
        text1 = text1[1:]
    text = []
    for word in text1:
        text2 = []
        if len(word) == 0:
            continue
        # split trailing punctuation off the word, preserving its order
        while word[-1] in [',', '.', '!', '?', ':', ';']:
            text2.append(word[-1])
            word = word[:-1]
            if len(word) == 0:
                break
        text.append(word)
        for i in range(len(text2) - 1, -1, -1):
            text.append(text2[i])
    out = ''
    st = EnglishStemmer()
    l = 0
    links = []
    for word in text:
        if isOk(word):
            q = st.stem(word) + ' '
        else:
            q = word + ' '
        out += q.lower()
        links.append([l, q])
        l += len(q)
    return out, links, links1, sourceText
def get_query(vec_dict):
    dim = 300  # dimension of the GloVe vectors chosen
    # initialize stemmer for search in the GloVe vector space
    st = EnglishStemmer()
    query = input("Please enter search query:")
    query_vector = np.zeros(dim)
    numWords = 0
    for word in query.split():
        stem = st.stem(word)
        if stem in vec_dict:
            query_vector += vec_dict[stem].astype(np.float64)
            numWords += 1
        elif stem + "e" in vec_dict:
            query_vector += vec_dict[stem + "e"].astype(np.float64)
            numWords += 1
    query_vector /= numWords
    return query, query_vector
def stemming(tweet):
    tweets = tweet.split()
    wrdStemmer = EnglishStemmer()
    stemTweet = []
    try:
        for tweet in tweets:
            stemTweet.append(wrdStemmer.stem(tweet))
    except Exception:
        print("Error: Stemming")
    return " ".join(stemTweet)
class EnglishStemmer(PreProcessor):
    """Wraps the NLTK Snowball stemmer (imported as SnowballEnglishStemmer
    to avoid clashing with this class name)."""

    def __init__(self):
        self.stemmer = SnowballEnglishStemmer()

    def process_sentence(self, sentence):
        stemmed_sentence = []
        for token in wordpunct_tokenize(sentence):
            if len(token) > 1:
                stemmed_sentence.append(self.stemmer.stem(token))
        return ' '.join(stemmed_sentence)
def main(fname):
    e = EnglishStemmer()
    n, a = 0, 0
    # use the fname parameter; the original ignored it and read sys.argv[1]
    for line in open(fname):
        (title, body, tags, creationdate, acceptedanswerid,
         score, viewcount) = eval(line)
        # process text into tokens
        html_tags = RX_OPEN_TAGS.findall(body)
        body = RX_TAGS.sub("", body)
        print(" ".join(e.stem(s) for s in RX_NONWORD.split(body)))
    M = bayes.NaiveLearner(adjust_threshold=True, name="Adjusted Naive Bayes")
def get_stemmed_keywords(keywords):
    stemmer = EnglishStemmer()
    # split each keyword phrase into words, stem each word, then re-join
    stemmed_keywords = [keyword.split() for keyword in keywords]
    stemmed_keywords = [[stemmer.stem(word) for word in keyword]
                        for keyword in stemmed_keywords]
    return [' '.join(keyword).encode('ascii') for keyword in stemmed_keywords]
def stemmed(text, snowball=False):
    """Return stemmed text."""
    if snowball:
        st = EnglishStemmer()
    else:
        st = PorterStemmer()
    words = wordpunct_tokenize(text)
    words = [st.stem(w) for w in words]
    return ' '.join(words)
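# Quick comparison of the two stemmers selectable in stemmed() above;
# 'generously' is a standard example where Porter and Snowball differ.
print(stemmed("generously"))                 # Porter:   'gener'
print(stemmed("generously", snowball=True))  # Snowball: 'generous'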
def nltk_tokenizer(text, min_size=4, *args, **kwargs):
    from nltk.stem.snowball import EnglishStemmer
    from nltk.corpus import stopwords as stwds
    from nltk.tokenize import TreebankWordTokenizer
    stemmer = EnglishStemmer()
    stopwords = set(stwds.words('english'))
    return [stemmer.stem(w) for w in TreebankWordTokenizer().tokenize(text)
            if w not in stopwords and len(w) >= min_size]
def normalize_tags():
    cursor.execute('SELECT app_id, tag, times FROM tag_app_rel;')
    all_tag_data = defaultdict(dict)
    for r in cursor:
        all_tag_data[r[0]][r[1]] = r[2]
    from nltk.stem.snowball import EnglishStemmer
    stemmer = EnglishStemmer()
    for app_id, tag_to_times in all_tag_data.items():
        # merge tags that share a stem, summing their counts
        normalized_app_tag_dict = defaultdict(int)
        for tag, times in tag_to_times.items():
            normalized_app_tag_dict[stemmer.stem(tag)] += times
        for tag, times in normalized_app_tag_dict.items():
            cursor.execute(
                'INSERT INTO tag_app_relation (app_id, tag, times) '
                'VALUES (%s, %s, %s)', (app_id, tag, times))
def tokenize_documents(documents):
    # common words to be filtered
    stop_words = stopwords.words('english') + stopwords.words('spanish')
    english = EnglishStemmer()
    arabic = ISRIStemmer()
    punctuation = {ord(char): None for char in string.punctuation}

    def valid_word(token, filtered=stop_words):
        # returns False for common words, links, and strange patterns
        return not ((token in filtered) or (token[0:4] == u'http') or
                    (token in string.punctuation))

    for doc in documents:
        row = doc[0]
        doc = doc[1]
        if doc is not None:
            # remove trailing whitespace
            doc = doc.strip()
            # remove twitter handles (words in doc starting with @)
            doc = re.sub(r"@\w+|\b@\w+", "", doc)
            # lowercase letters
            doc = doc.lower()
            # remove punctuation
            doc = doc.translate(punctuation)
            # tokenization: handles documents with arabic or foreign characters
            tokens = nltk.tokenize.wordpunct_tokenize(doc)
            cleaned_tokens = []
            for token in tokens:
                # for valid words, correct spellings of gaddafi and stem words
                if valid_word(token):
                    if token in [u'gadhafi', u'gadafi', u'ghadhafi',
                                 u'kadhafi', u'khadafi', u'kaddafi']:
                        token = u'gaddafi'
                    else:
                        token = arabic.stem(english.stem(token))
                    cleaned_tokens.append(token)
            yield row
            yield cleaned_tokens
def stemVector(vector, method="lemmatize"):
    output = []
    if method == 'lemmatize':
        wnl = WordNetLemmatizer()
        output = [wnl.lemmatize(i) for i in vector]
    elif method == 'snowball':
        st = EnglishStemmer()
        output = [st.stem(i) for i in vector]
    elif method == 'porter':
        st = PorterStemmer()
        output = [st.stem(i) for i in vector]
    elif method == 'lancaster':
        st = LancasterStemmer()
        output = [st.stem(i) for i in vector]
    return output
def stem_sen(list_sentences):
    # mapping_back maps each stemmed token to up to three versions
    # (noun, adjective, verb), each holding counts of its unstemmed forms
    stemmer = EnglishStemmer()
    lem = WordNetLemmatizer()
    mapping_back = {}
    res_list = []
    res_sen = []  # we also want to return the list of sentences
    stops = set(stopwords.words('english'))
    for sent in list_sentences:
        tmp_list = []
        tok_pos = nltk.pos_tag(word_tokenize(sent))
        for tok, pos in tok_pos:
            if tok.lower() in stops or len(tok) == 1:
                continue
            tok = lem.lemmatize(tok)
            pos = pos[:2]
            if pos not in ('NN', 'JJ', 'VB'):
                continue
            stem_tok = stemmer.stem(tok)
            counts = mapping_back.setdefault(stem_tok, {}).setdefault(pos, {})
            # count this unstemmed form
            counts[tok] = counts.get(tok, 0) + 1
            tmp_list.append(stem_tok + '-' + pos)
        res_sen.append(tmp_list)
    res_map = {}
    # second pass: map each stemmed+tagged token to its most frequent
    # unstemmed form
    for tok in mapping_back:
        for pos in mapping_back[tok]:
            tmp_tok = tok + '-' + pos
            most_freq = max(mapping_back[tok][pos],
                            key=mapping_back[tok][pos].get)
            res_map[tmp_tok] = most_freq
            res_list.append(tmp_tok)
    return res_sen, res_list, res_map
def tokenize(self):
    terms = word_tokenize(self.text)
    self.tokens = []
    self.lemmas = []
    stemmer = EnglishStemmer()
    lemmatizer = WordNetLemmatizer()
    for term in terms:
        try:
            self.tokens.append(stemmer.stem(term).lower())
            self.lemmas.append(lemmatizer.lemmatize(term.lower()))
        except Exception as e:
            print('current text:', self.text)
            print('current term:', term)
            print(str(e))
            sys.exit(-1)
def computeSentiment(tweet_text):
    annotated = ''
    positive = 0
    negative = 0
    st = EnglishStemmer()
    tokenized_tweet = tokenize(tweet_text)
    for t in tokenized_tweet:
        stem = st.stem(t.lower())
        # no leading space right after an @ or # sign
        wsp = ' '
        if len(annotated) == 0 or annotated[-1] in '@#':
            wsp = ''
        if stem in negative_terms:
            annotated += wsp + '<span class="negative">' + t + '</span>'
            negative += 1
        elif stem in positive_terms:
            annotated += wsp + '<span class="positive">' + t + '</span>'
            positive += 1
        else:
            if len(t) == 1 and t not in '@#':
                annotated += t
            else:
                annotated += wsp + t
    return annotated, positive, negative
def exe_compress_word(argv):
    word_stat_path, comp_word_stat_path = argv
    stemmer = EnglishStemmer()
    word_stat = load_word_stat(word_stat_path)
    compress_word_stat = {}
    # fold counts of words sharing a stem, keeping the maximum; the original
    # consulted word_stat here instead, which defeated the merge
    for word, count in word_stat.items():
        if count <= 0:
            continue
        word = stemmer.stem(word.lower())
        compress_word_stat[word] = max(compress_word_stat.get(word, 0), count)
    with open(comp_word_stat_path, 'w') as f:
        for word in sorted(compress_word_stat):
            f.write('%s %d\n' % (word, compress_word_stat[word]))
def tokenizeTweet(tweet, unique=True):
    allWords = [word.lower() for word in word_tokenize(tweet)]
    # delete @users, RT and URLs; save #hashtags
    nWords, i = len(allWords), 0
    hashtags = []
    while i < nWords:
        if allWords[i] == '@':  # @users
            allWords[i:i + 2] = []
            nWords -= 2
        elif allWords[i] == 'rt':  # delete RT
            allWords[i:i + 1] = []
            nWords -= 1
        elif allWords[i] == '#':  # save the hashtag
            try:
                hashtags.append(allWords[i + 1])
                allWords[i:i + 2] = []
                nWords -= 2
            except IndexError:
                allWords[i:i + 1] = []
                nWords -= 1
        elif allWords[i] == "http":  # delete url starting with http:
            allWords[i:i + 3] = []
            nWords -= 3
        elif allWords[i][0:3] == 'www':  # delete urls starting with www.
            allWords[i:i + 1] = []
            nWords -= 1
        else:
            i += 1
    possibleWords = [x for x in allWords
                     if x not in ourStopWords and not x.isdigit()]
    stemmer = EnglishStemmer()
    tokens = []
    for word in possibleWords:
        aux = str(stemmer.stem(word))
        # with unique=True each token appears only once
        if not unique or aux not in tokens:
            tokens.append(aux)
    for tag in hashtags:
        if not unique or '#' + tag not in tokens:
            tokens.append('#' + tag)
    return tokens
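# Illustrative call to tokenizeTweet above; ourStopWords is expected from the
# surrounding module, so we bind an empty set here just for the demo.
ourStopWords = set()
print(tokenizeTweet("RT @user loving the #nltk stemmers"))
# -> ['love', 'the', 'stemmer', '#nltk']  (each token kept once)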