from nltk.stem import SnowballStemmer


class ModelBuilder(object):

    def __init__(self):
        self.model = {}
        self.stemmer = SnowballStemmer('english')

    def build(self):
        with open('data/candidate_synonyms.txt') as f:
            all_words = f.read().split('\n')
            for words in all_words:
                if words:
                    word, similar = words.split(',')
                    word, similar = self.stemmer.stem(word), self.stemmer.stem(similar)
                    if word not in self.model:
                        self.model[word] = {}
                    self.model[word][similar] = 1
        return self

    def condense(self):
        # Keep only symmetric pairs: word -> similar where similar -> word also exists.
        # (dict.has_key() from the original is Python 2 only; use "in" instead.)
        condensed_model = {}
        for word, similars in self.model.items():
            for similar in similars:
                if word in self.model.get(similar, {}):
                    if word in condensed_model:
                        condensed_model[word].append(similar)
                    else:
                        condensed_model[word] = [similar]
        self.model = condensed_model
        return self
def text_token_data_generator():
    global id_text_index_map
    # Map punctuation to spaces and uppercase to lowercase. str.maketrans and
    # string.ascii_* replace the Python 2 string.maketrans / string.uppercase
    # used in the original.
    translation_table = str.maketrans(
        string.punctuation + string.ascii_uppercase,
        " " * len(string.punctuation) + string.ascii_lowercase
    )
    snowball_stemmer = SnowballStemmer("english")
    for f in glob.glob("json/text/*.json"):
        for line in open(f):
            extract_row = json.loads(line)
            id_text_index_map[extract_row["file_id"]] = len(id_text_index_map)

            visible_text = extract_row["visible_text"].encode("ascii", "ignore").decode("ascii")
            visible_text = visible_text.translate(translation_table)
            visible_text = [
                snowball_stemmer.stem(word)
                for word in visible_text.split()
                if word not in ENGLISH_STOP_WORDS and len(word) > 1
            ]

            title = extract_row["title"].encode("ascii", "ignore").decode("ascii")
            title = title.translate(translation_table)
            title = [
                "t^{}".format(snowball_stemmer.stem(word))
                for word in title.split()
                if word not in ENGLISH_STOP_WORDS and len(word) > 1
            ]

            visible_text.extend(title)
            yield " ".join(visible_text)
def stemWordMatch2(question, sentence):
    question_tokens = set(nltk.word_tokenize(question))
    sentence_tokens = set(nltk.word_tokenize(sentence))

    stem_words_list = []
    question_words_list = []

    # Find matches between words sharing the same root, using the Snowball
    # stemmer. (A dead, commented-out Lancaster variant of the same loop has
    # been dropped from the original.)
    snowball_stemmer = SnowballStemmer('english')
    for i in sentence_tokens:
        stem_words_list.append(snowball_stemmer.stem(i))
    for i in question_tokens:
        question_words_list.append(snowball_stemmer.stem(i))

    stem_count = 0
    for i in stem_words_list:
        # An exact (case-insensitive) stem match scores 6 points.
        if i.lower() in [x.lower() for x in question_words_list]:
            stem_count = stem_count + 6
    return stem_count
def wordnet_sim(query, db):
    """
    This function implements a simple WordNet definition lookup and compares it
    with a different block of text. For every word match between a definition
    token and a text token the doc receives +1.
    INPUT:
        query -- string that represents the user query expanded with WordNet defs
        db -- dict representation of the database xml file
    OUTPUT:
        maxdoc -- the document with the highest score
    """
    # initializing SnowballStemmer from nltk
    sst = SnowballStemmer("english")
    # taking stopwords from nltk
    stop = stopwords.words("english")
    # creating translation table to remove punctuation
    transnone = {ord(c): None for c in string.punctuation}
    # first we remove any punctuation and concatenate specific nodes into one
    query_nopunct = query.lower().translate(transnone)
    query_stems = [sst.stem(token) for token in query_nopunct.split()
                   if token not in stop]
    doc_scores = defaultdict(float)
    for doc in db:
        for block, text in db[doc].items():
            # normalize block text
            if not text:
                continue
            text_nopunct = text.lower().translate(transnone)
            text = [sst.stem(t) for t in text_nopunct.split() if t not in stop]
            if len(text) == 0:
                # nothing left after stopword removal, so no stem can match
                continue
            # here we can finetune the block score multiplicators;
            # some blocks are more important than the others
            if block == "description":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 2
            elif block in ("trivia", "history"):
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text) * 0.5
            elif block == "comments":
                for s in query_stems:
                    doc_scores[doc] += text.count(s) / len(text)
    maxdoc = max(doc_scores, key=lambda x: doc_scores[x])
    debug = sorted(doc_scores.items(), key=lambda x: x[1])
    return (debug, maxdoc)
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]

    text = " ".join(text)

    # Remove special characters and replace numbers.
    # special_character_removal and replace_numbers are assumed to be
    # module-level compiled regexes, e.g.:
    #   special_character_removal = re.compile(r'[^a-z\d ]')
    #   replace_numbers = re.compile(r'\d+')
    text = special_character_removal.sub('', text)
    text = replace_numbers.sub('n', text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text as a single string
    return text
def des_extrect():
    filename_list = []
    # file() is Python 2 only; use open()
    with open('stopwords.txt', 'r') as file_stopwords:
        stopwords = [line.strip() for line in file_stopwords.readlines()]
    for file_name in os.listdir(DESCRIPTION_DIR):
        filename_list.append(file_name)
    stemmer = SnowballStemmer('english')
    for filename in filename_list:
        path = os.path.join(DESCRIPTION_DIR, filename)
        fr = open(path, 'r')
        fw = open(filename + '.des', 'w')
        soup = BeautifulSoup(fr.read())
        docs = soup.findAll('doc')
        for doc in docs:
            content = str(doc['title'] + doc.snippet.text)
            content = re.sub(r"[\.\@\,\:\;\!\?\(\)]", "", content)
            content = content.split()
            pro_content = ''
            for w in content:
                w = stemmer.stem(w)
                # remove stopwords
                if w not in stopwords:
                    pro_content += w + ' '
            fw.write(doc['rank'] + ' ' + pro_content + '\n')
        fw.close()
        fr.close()
class StemmedCorpus(DocumentCorpus):

    def __init__(self, documents=None, language="german"):
        DocumentCorpus.__init__(self, documents)
        with codecs.open("stopwords/" + language, "r", encoding=my_encoding) as f:
            self._stopwords = [sw.strip() for sw in f.readlines()]
        self._stemmer = SnowballStemmer(language)
        self._lemmatizer = WordNetLemmatizer()
        self._stemmed_documents = []

    def preprocess_documents(self, lemmatize=False, remove_stopwords=True):
        _highest_func = self._lemmatize_tokens if lemmatize else self._stemm_tokens
        _second_highest_func = self._remove_stopword if remove_stopwords else lambda x: x
        self._stemmed_documents = [
            (_highest_func(_second_highest_func(self._tokenize_document(doc[0].lower()))), doc[1])
            for doc in self._documents]

    def _tokenize_document(self, document):
        return regexp_tokenize(document, pattern_words)

    def _remove_stopword(self, tokens):
        return [token for token in tokens if token not in self._stopwords]

    def _stemm_tokens(self, tokens):
        return [self._stemmer.stem(token) for token in tokens]

    def _lemmatize_tokens(self, tokens):
        return [self._lemmatizer.lemmatize(token, trans_tag(tag))
                for token, tag in pos_tag(tokens)]
def stemmed(text, language):
    stemmer = SnowballStemmer(language)
    # join the stems directly instead of accumulating with a leading space
    return " ".join(stemmer.stem(word) for word in text.split())
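# A minimal usage sketch for stemmed() above; the sample sentence is invented
# for illustration and assumes NLTK's Snowball stemmer data is available.
example = stemmed("the cats were running", "english")
print(example)  # -> "the cat were run"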
def frequency_analysis(input_path, output_path, stopwords=None, n_most_common=50):
    recipes = []
    with open(input_path, 'r') as f:
        for i, line in enumerate(f):
            if line == '\n':
                break
            if i == 0:
                continue  # skip header
            fields = line.split('\t')
            recipes.append(fields[1].replace("\n", ""))
    recipe_text = re.sub("[^a-z ]", "", ' '.join(recipes))
    recipe_words = re.split(r"\s+", recipe_text)
    stemmer = SnowballStemmer("english")
    recipe_stems = [stemmer.stem(w) for w in recipe_words]
    if stopwords is not None:
        # drop stopwords and empty strings (the Python 2 filter(None, ...)
        # wrapper is folded into the comprehension)
        recipe_stems = [s for s in recipe_stems if s and s not in stopwords]
    top_words = Counter(recipe_stems).most_common(n_most_common)

    # write to a file; do a second pass over the recipes to determine
    # how many of the documents each term is in
    with open(output_path, 'w') as freq_table:  # 'wb' would require bytes in Python 3
        for elt in top_words:
            doc_freq = sum(elt[0] in recipe for recipe in recipes)
            freq_table.write(','.join(str(e) for e in elt) + ',' + str(doc_freq) + '\n')
class Cleaner(object):
    """
    The SQL query in get_reviews needs to be customized.
    """

    def __init__(self):
        self.sbstem = SnowballStemmer("english")
        replace = string.punctuation + string.digits
        # str.maketrans replaces the Python 2 string.maketrans of the original
        self.replace_punctuation = str.maketrans(replace, ' ' * len(replace))
        self.locations = []
        self.cached_stopwords = stopwords.words("english")

    def clean(self, txt):
        # removes stopwords and punctuation; round-trip through ASCII to
        # drop non-ASCII characters, then translate punctuation to spaces
        txt = txt.encode('ascii', 'ignore').decode('ascii')
        nopunct = txt.translate(self.replace_punctuation)
        no_locs = [x for x in nopunct.split()
                   if x.lower() not in self.cached_stopwords]
        stemmed = [self.sbstem.stem(x) for x in no_locs]
        return " ".join(stemmed)

    def make_loclist(self, locations):
        locations = list(locations)
        removelist = ['Ho Chi Minh City', 'Phu Quoc Island', 'Halong Bay']
        locations = [x.lower() for x in locations if x not in removelist]
        locations.extend(['ho chi minh', 'hoan', 'kiem', 'phu quoc',
                          'halong', 'vietnam', 'dong', 'vnd', 'vdn'])
        locations.extend(['vietnames', 'nhatrang', 'saigon', 'america',
                          'maryland', 'york'])
        loc_wordlist = [f.split() for f in locations]
        loc_wordlist = list(itertools.chain(*loc_wordlist))
        self.cached_stopwords.extend(loc_wordlist)
        return loc_wordlist
def norm_corpus(document_list):
    norm_doc_list = []
    # lowercase
    document_list = [word.lower() for word in document_list]
    # remove symbols in text
    symbols = ",.?!"
    for sym in symbols:
        document_list = [word.replace(sym, '') for word in document_list]
    # build the stopword set and stemmer once instead of per document
    stops = set(stopwords.words('english'))
    stemmer = SnowballStemmer("english")
    # loop through each string, i.e. review, in the column
    for doc in document_list:
        doc = nltk.word_tokenize(doc)
        # remove stopwords
        doc = [word for word in doc if word not in stops]
        # stem words
        doc = [stemmer.stem(word) for word in doc]
        # make tokenised text one string
        norm_doc = " ".join(doc)
        norm_doc_list.append(norm_doc)
    return norm_doc_list
def preprocessing(doc):
    # stopword removal is optional
    x = re.sub("[^a-zA-Z]", " ", doc)  # keep only letters
    x = x.lower().split()
    stemmer = SnowballStemmer("english")  # use Snowball
    stops = set(stopwords.words("english"))  # set lookup is faster than list
    x = [stemmer.stem(word) for word in x if word not in stops]
    return x
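# Illustrative call to preprocessing() above; the sentence is invented and
# the exact stems depend on NLTK's Snowball implementation:
tokens = preprocessing("The cats are running quickly!")
print(tokens)  # e.g. ['cat', 'run', 'quick']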
class VocKeyworder(BaseKeyworder):

    def __init__(self):
        super(VocKeyworder, self).__init__()
        self._vocs = engvoc.voc2000
        self._lemmatizer = WordNetLemmatizer()
        self._stemmer1 = LancasterStemmer()
        self._stemmer2 = SnowballStemmer('english')

    def add_keyword(self, gag_id, title):
        tokens = re.split(r' |\.|,|;|=', title)
        for token in tokens:
            token = re.sub(r"\W+$", '', token)
            token = re.sub(r"^\W+", '', token)
            # build candidate forms: raw, lemmatized, and two stemmed variants
            # (the Python 2 token.encode('utf8') / UnicodeDecodeError guard of
            # the original is unnecessary on Python 3 str)
            vocs = []
            vocs.append(re.sub(r"'\w+", '', token).lower())
            vocs.append(self._lemmatizer.lemmatize(vocs[0]))
            vocs.append(self._stemmer1.stem(vocs[0]))
            vocs.append(self._stemmer2.stem(vocs[0]))
            if vocs[0] == '':
                continue
            # skip plain numbers
            try:
                float(vocs[0])
                continue
            except ValueError:
                pass
            if not any(voc in self._vocs for voc in vocs):
                print('voc', vocs, token)
                self._add_keyword(gag_id, token)
def procesar(request, identificador):
    lmtzr = WordNetLemmatizer()
    d = Documento.objects.get(id=identificador)
    tokens = nltk.word_tokenize(d.contenido.replace('.', ' . '))
    words = tokens
    spanish_stemmer = SnowballStemmer('spanish')
    # This is the simple way to remove stop words
    important_words = []
    for word in words:
        if word not in stopwords.words('spanish'):
            important_words.append([word,
                                    lmtzr.lemmatize(word),
                                    spanish_stemmer.stem(word)])
    return render_to_response('templates/documentoProcesado.html', {
        'original': d.contenido,
        'tokens': tokens,
        'important_words': important_words,
    })
def normalized_token(token):
    """
    Use the stemmer to normalize the token.
    Called when building the graph, instead of storing inflected
    word forms in file_text.
    """
    stemmer = SnowballStemmer("english")
    return stemmer.stem(token.lower())
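# Illustrative call to normalized_token() above (example word invented):
print(normalized_token("Running"))  # -> "run"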
class Model:

    def __init__(self):
        self.model = ModelBuilder().build().condense().model
        self.stemmer = SnowballStemmer("english")

    def simset(self, word):
        stemmed_word = self.stemmer.stem(word)
        return self.model.get(stemmed_word, [])
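# Illustrative usage of Model above; this assumes data/candidate_synonyms.txt
# (read by ModelBuilder) exists with "word,synonym" lines:
model = Model()
print(model.simset("happy"))  # stems seen as mutual synonyms, or [] if none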
def stemLem(w):
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer("english")
    # prefer the lemma when it is shorter than the original word,
    # otherwise fall back to the Snowball stem
    lem = lemmatizer.lemmatize(w)
    if len(w) > len(lem):
        return lem
    return stemmer.stem(w)
def stemmed_top_user_words(usertxt, num=10):
    wl_usertxt = word_tokenize(usertxt.lower())
    num = min(num, len(wl_usertxt))
    snowball_stemmer = SnowballStemmer("english")
    # ewl is an externally defined exclusion word list (e.g. stopwords)
    stemmed_fl_usertxt = [snowball_stemmer.stem(w) for w in wl_usertxt
                          if (len(w) > 4 and w not in ewl)]
    fd_user_ls = [w[0] for w in FreqDist(Text(stemmed_fl_usertxt)).most_common(num)]
    return fd_user_ls
def __call__(self, doc):
    snowball_stemmer = SnowballStemmer('english')
    # words = [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
    words = [snowball_stemmer.stem(t) for t in word_tokenize(doc)]
    stop_words = set(stopwords.words('english'))
    stop_words.update(self.mystops)
    stop_words = list(stop_words)
    return [i.lower() for i in words if i not in stop_words]
def main(input_file, dbname):
    """
    Main function. Connects to a database and reads a CSV with the arousal
    and valence. Uses the sentiment library to compute the sentiment of a
    news item.

    :param input_file: the ANEW file
    :param dbname: the name of the database
    """
    # read ANEW file
    if not os.path.exists(input_file):
        logging.error('File %s does not exist', input_file)
        sys.exit(1)
    csvfile = open(input_file, 'r')
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # skip headers (reader.next() is Python 2 only)
    stemmer = SnowballStemmer('spanish')
    # rows are already str in Python 3, so no unicode() wrapper is needed
    anew = dict([(stemmer.stem(row[2]),
                  {'valence': float(row[3]), 'arousal': float(row[5])})
                 for row in reader])

    couch = couchdb.Server()
    database = couch[dbname]
    logging.info('Established connection with the db %s', dbname)

    for element in database:
        doc = database.get(element)
        comments = " ".join([comment['cleaned_summary']
                             for comment in doc['comments']])
        description = " ".join([database.get(element)['title'],
                                doc['description']])
        sentiment_comments = get_sentiment(anew, comments)
        sentiment_description = get_sentiment(anew, description)
        if sentiment_comments is not None and sentiment_description is not None:
            logging.info('%s val: %.2f - %.2f aro: %.2f - %.2f : %s',
                         doc.id,
                         sentiment_comments[0], sentiment_description[0],
                         sentiment_comments[1], sentiment_description[1],
                         doc['title'])
            doc['sentiments'] = {
                'comments': {'valence': sentiment_comments[0],
                             'arousal': sentiment_comments[1]},
                'description': {'valence': sentiment_description[0],
                                'arousal': sentiment_description[1]}}
            database.save(doc)
        else:
            logging.warning('%s could not be analyzed. skipping ...',
                            database.get(element)['title'])
def stemWordMatch(question, sentence):
    snowball_stemmer = SnowballStemmer('english')
    question_tokens = set(nltk.word_tokenize(question))
    sentence_tokens = set(nltk.word_tokenize(sentence))

    # stem the question tokens once instead of on every iteration
    question_stems = [snowball_stemmer.stem(x).lower() for x in question_tokens]
    question_words = [x.lower() for x in question_tokens]

    count = 0
    for i in sentence_tokens:
        # a stem match scores 6, a surface-form match scores 3
        if snowball_stemmer.stem(i).lower() in question_stems:
            count = count + 6
        elif i.lower() in question_words:
            count = count + 3
    return count
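# Illustrative scoring call for stemWordMatch() above; the sentences are
# invented and NLTK's punkt tokenizer data is assumed to be installed:
score = stemWordMatch("Who was running the race?", "She runs every race.")
print(score)  # 6 points per shared stem (e.g. run/running, race/race)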
def preprocess_tweets(tweets):
    stemmer = SnowballStemmer("english")
    stop = set(stopwords.words("english"))
    tweet_texts = [
        " ".join(
            stemmer.stem(i) if len(i) > 1 else i
            # inner generator strips punctuation from each lowercased word
            for i in ("".join(c for c in word if c not in string.punctuation)
                      for word in tweet["text"].lower().split())
            if i and i not in stop)
        for tweet in tweets
    ]
    # deduplicate the cleaned tweets
    return list(set(tweet_texts))
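# Illustrative call with invented tweet dicts shaped like the payloads the
# function expects (a "text" key per tweet); duplicates collapse after cleaning:
sample = [{"text": "Loving the new phones!!"}, {"text": "loving the new phones"}]
print(preprocess_tweets(sample))  # -> ['love new phone']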
def stem(self, content):
    import re
    original_string = content
    new_content = re.sub(r'[^a-zA-Z0-9\n\.]', ' ', original_string)
    words = new_content.split()
    stemmer = SnowballStemmer('english')
    singles = [stemmer.stem(word) for word in words]
    return ' '.join(singles)
def __init__(self, data="en600.468/aligner/data/hansards", num_sents=sys.maxsize):
    # sys.maxint is Python 2 only; sys.maxsize keeps the "all sentences" default
    f_data = "%s.%s" % (data, "f")
    e_data = "%s.%s" % (data, "e")
    # zip() is not subscriptable in Python 3, so materialize it first
    bitext = [[sentence.strip().split() for sentence in pair]
              for pair in list(zip(open(f_data), open(e_data)))[:num_sents]]

    # Stem words before model training
    french_stemmer = SnowballStemmer("french")
    english_stemmer = SnowballStemmer("english")
    bitext_stemmed = []
    for (n, (f, e)) in enumerate(bitext):
        # lines are already str in Python 3; no .decode("utf-8") needed
        f_stemmed = [french_stemmer.stem(word) for word in f]
        e_stemmed = [english_stemmer.stem(word) for word in e]
        bitext_stemmed.append([f_stemmed, e_stemmed])
    bitext = bitext_stemmed
    self._train(bitext)
    self._align(bitext)
def stem_text(self):
    '''
    Perform stemming.
    '''
    stemmer = SnowballStemmer("english")
    stemmed_sents = []
    for sent in self.tok_text:
        stemmed_sents.append([stemmer.stem(tok) for tok in sent])
    # store under a name that does not shadow this method
    # (the original assigned to self.stem_text, clobbering the method)
    self.stemmed_text = stemmed_sents
def filterFile(filein, fileout):
    # Turn to lowercase, remove spaces, replace or remove special symbols,
    # and stem words.
    fin = open(filein, "r")
    fout = open(fileout, "w")
    snowball = SnowballStemmer('english')
    end = "End of the Project Gutenberg"
    # remove the header
    lines = fin.readline()
    while re.match(r'^Title', lines) is None:
        lines = fin.readline()
    lines = re.sub(r'[:,]+', "", lines)
    words = lines.split()
    for word in words:
        fout.write(snowball.stem(word.lower()) + " ")
    # Filter and stem
    for line in fin:
        if end in line:
            break
        elif re.search(r'\S', line):
            line = line.lower()
            line = re.sub(r'\s+', " ", line)
            line = re.sub(r'&', "and", line)
            line = re.sub(r'[\[\]\'\"()@#$%^&*?\|!.,:;]+|(--)+', '', line)
            line = re.sub(r'[_-]+', " ", line)
            words = line.split()
            for word in words:
                # strings are already unicode in Python 3; unidecode strips
                # accents up front (the original only did this after catching
                # a ValueError from the Python 2 stemmer)
                word = unidecode(word)
                fout.write(snowball.stem(word) + " ")
    fin.close()
    fout.close()
def stemming(self, words):
    '''
    Make a stem for each word in the array.
    @return array of stemmed words
    '''
    russian_stemmer = SnowballStemmer('russian')
    stemming = list()
    for w in words:
        try:
            stemming.append(russian_stemmer.stem(w))
        except Exception:  # "except Exception, e" is Python 2 syntax
            pass
    return stemming  # the original was missing this return
def process_spanish_owned():
    from inflector import Inflector, Spanish
    inflector = Inflector(Spanish)
    from nltk.stem import SnowballStemmer
    stemmer = SnowballStemmer("spanish")

    file_valid = open('valid_words.txt', "r")
    lines = file_valid.readlines()
    valid_words = lines[0].split(' ')
    print(len(valid_words))
    file_valid.close()

    owned_words = ['cúster', 'custer', 'cústers', 'custers', 'combi',
                   'combis', 'susana', 'villaran', 'villarán', 'castañeda']

    file = open("raw_words.txt", 'r')
    fileout = open("spanish_words_owned.txt", 'w')
    fout_sing = open("spanish_words_sing.txt", 'w')
    fout_stem = open("spanish_words_stem.txt", 'w')
    nline = 0
    for line in file:
        nline += 1
        words = line.split(' ')
        processed = []
        ini_line = True
        for word in words:
            if word not in ('', '\n', 'servicio', 'servicio\n'):
                word = word.replace('\n', '')
                if (word in valid_words) or (word in owned_words):
                    processed.append(word)
                    if word != 'bus':
                        word_singular = inflector.singularize(word)
                    else:
                        word_singular = word
                    # str is already unicode in Python 3; no decode/encode
                    word_stemmed = stemmer.stem(word)
                    if ini_line:
                        fileout.write(word)
                        fout_sing.write(word_singular)
                        fout_stem.write(word_stemmed)
                        ini_line = False
                    else:
                        fileout.write(' ' + word)
                        fout_sing.write(' ' + word_singular)
                        fout_stem.write(' ' + word_stemmed)
                    print(nline, word, word_singular, word_stemmed)
        fileout.write('\n')
        fout_sing.write('\n')
        fout_stem.write('\n')
    file.close()
    fileout.close()
    fout_sing.close()
    fout_stem.close()
def tokenize(resultList1):
    entrada = []
    # build the stemmer and stopword set once, not per sentence
    stemmer = SnowballStemmer('spanish')
    stops = set(stopwords.words('spanish'))
    for sentence in resultList1:
        tokens = word_tokenize(sentence)
        filtered_words = [w for w in tokens if w not in stops]
        for word in filtered_words:
            entrada.append(stemmer.stem(word))
    return entrada
def tokenize(resultList1):
    entrada = []
    tokens = word_tokenize(resultList1)
    filtered_words = [w for w in tokens if w not in stopwords.words('spanish')]
    stemmer = SnowballStemmer('spanish')
    for i in filtered_words:
        # tokens are already unicode str in Python 3; the original's
        # unicode(i, errors='replace') wrapper is unnecessary
        entrada.append(stemmer.stem(i))
    return entrada
def stem_txt(text):
    # `text` is expected to be an iterable of word tokens, not a raw string
    # (iterating a string would stem single characters)
    ss = SnowballStemmer('english')
    return " ".join([ss.stem(w) for w in text])
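# Illustrative call with a pre-tokenized sentence (tokens invented):
print(stem_txt(["cats", "running", "quickly"]))  # e.g. "cat run quick"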
#####
# Stopwords
#####
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
# keep if not a stopword (norm_numbers is the token list produced in the
# preceding section of this script)
nostop = [t for t in norm_numbers if t not in stoplist]
print(nostop)

#####
# Stemming
#####
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('german')  # snowball stemmer, german
print(stemmer.stem("Autobahnen"))
stemmer = SnowballStemmer('english')  # snowball stemmer, english
# remake list of tokens, replaced with stemmed versions
tokens_stemmed = [stemmer.stem(t) for t in tokens]
print(tokens_stemmed)
# other options:
# from nltk.stem import PorterStemmer
# from nltk.stem import WordNetLemmatizer

#####
# Corpus statistics
#####
docs = df1['snippet']
print(len(sentences), 'sentences in corpus.')
def stemmatize(text):
    # note: SnowballStemmer.stem() expects a single word; pass tokens
    # individually for multi-word text
    sb = SnowballStemmer("english")
    return sb.stem(text)
class SnowballTokenizer(object):

    def __init__(self):
        self.sbs = SnowballStemmer('english')

    def __call__(self, doc):
        return [self.sbs.stem(t) for t in word_tokenize(doc)]
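# Illustrative use of SnowballTokenizer above as a custom tokenizer for
# scikit-learn's CountVectorizer; the corpus is invented, and
# get_feature_names_out() assumes scikit-learn >= 1.0:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(tokenizer=SnowballTokenizer())
X = vec.fit_transform(["cats run", "a cat was running"])
print(vec.get_feature_names_out())  # stems shared across both documents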
def lemmatize_stemming(text):
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
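# Illustrative call: the verb is lemmatized first, then the lemma is stemmed
# (example word invented; requires the NLTK wordnet data):
print(lemmatize_stemming("went"))  # lemmatize 'went' -> 'go', stem 'go' -> 'go'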
print(rs.stem('jumping'), rs.stem('jumps'), rs.stem('jumped'))
print(rs.stem('lying'))
print(rs.stem('strange'))

# snowball stemmer
from nltk.stem import SnowballStemmer
ss = SnowballStemmer("german")
print('Supported Languages:', SnowballStemmer.languages)

# the plural 'autobahnen' stems to 'autobahn'
ss.stem('autobahnen')

# 'springen' stems to 'spring'
ss.stem('springen')

# lemmatization
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

# lemmatize nouns
print(wnl.lemmatize('cars', 'n'))
print(wnl.lemmatize('men', 'n'))

# lemmatize verbs
class _AutoTag(object):
    """ A class to auto tag posts, using tf-idf. """

    WORDS = '([A-Za-z]+[A-Za-z-]*[A-Za-z]+|[A-Za-z]+)'

    def tag(self, post, count=5):
        """ Return a list of top tags, given a post.

        post: can either be a post object or the source path
        count: the number of tags to return

        """
        if isinstance(post, (bytes_str, unicode_str)):
            source_path = post
            post = self._get_post_from_source_path(source_path)
            if post is None:
                LOGGER.error('No post found for path: %s' % source_path)
                return
        return self._find_top_scoring_tags(post, count)

    # ### 'object' interface ###################################################

    def __init__(self, site, use_nltk=True):
        """ Set up a dictionary of documents.

        Each post is mapped to a list of words it contains.

        """
        self._site = site
        self._documents = {}
        self._stem_cache = {}
        self._use_nltk = use_nltk and self._nltk_available()
        self._tag_set = set([])

        if self._use_nltk:
            from nltk.corpus import stopwords
            from nltk.tokenize import word_tokenize
            from nltk.stem import SnowballStemmer

            self._tag_pattern = re.compile(self.WORDS + '$')
            self._tokenize = word_tokenize
            self._stem_word_mapping = defaultdict(set)
            self._stemmer = SnowballStemmer('porter')
            self._stopwords = set(stopwords.words())
        else:
            self._tag_pattern = re.compile(self.WORDS)

        self._process_tags()
        self._process_posts()
        self._document_count = len(self._documents)

    # ### 'Private' interface ##################################################

    def _find_stems_for_words_in_documents(self, text):
        """ Process text to get list of stems. """
        words = []
        for word in self._tokenize(text):
            if self._tag_pattern.match(word) is not None:
                if word not in self._stopwords:
                    words.append(self._get_stem_from_cache(word))
        return words

    def _find_top_scoring_tags(self, post, count):
        """ Return the tags with the top tf-idf score. """
        tf_idf_table = {}
        for word in self._documents[post.source_path]:
            tf_idf_table[word] = self._tf_idf(word, post)
        tags = sorted(tf_idf_table, key=lambda x: tf_idf_table[x], reverse=True)
        if self._use_nltk:
            tags = [sorted(self._stem_word_mapping[tag], key=len)[0]
                    for tag in tags[:count]]
        else:
            tags = tags[:count]
        return tags

    def _get_post_from_source_path(self, source):
        """ Return a post given the source path. """
        posts = [post for post in self._site.timeline
                 if post.source_path == source]
        post = posts[0] if len(posts) == 1 else None
        return post

    def _get_post_text(self, post):
        """ Return the text of a given post. """
        with codecs.open(post.source_path, 'r', 'utf-8') as post_file:
            post_text = post_file.read().lower()
        if not post.is_two_file:
            post_text = post_text.split('\n\n', 1)[-1]
        return post_text

    def _get_word_count(self, post):
        """ Get the count of all words in a given post. """
        word_counts = defaultdict(lambda: 0)
        for word in self._documents[post.source_path]:
            word_counts[word] += 1
        return word_counts

    def _get_stem_from_cache(self, word):
        """ Return the stem for a word, and cache it, if required. """
        if word not in self._stem_cache:
            stem = self._stemmer.stem(word)
            self._stem_cache[word] = stem
            self._stem_word_mapping[stem].add(word)
        else:
            stem = self._stem_cache[word]
        return stem

    def _modified_inverse_document_frequency(self, word):
        """ Get the inverse document frequency of a word.

        This departs from the normal inverse document frequency
        calculation, to give a higher score for words that are already
        being used as tags in other posts.

        """
        if word not in self._tag_set:
            count = sum(1 for doc in self._documents.values()
                        if word.lower() in doc)
        else:
            count = 0.25
        return math.log(self._document_count / float(count))

    @staticmethod
    def _nltk_available():
        """ Return True if we can import nltk. """
        try:
            import nltk
        except ImportError:
            nltk = None
        return nltk is not None

    def _process_posts(self):
        """ Tokenize the posts (and stem the words, if use_nltk). """
        for post in self._site.timeline:
            text = self._get_post_text(post)
            if not self._use_nltk:
                words = self._tag_pattern.findall(text)
            else:
                words = self._find_stems_for_words_in_documents(text)
            self._documents[post.source_path] = words

    def _process_tags(self):
        """ Create a tag set, to be used during tf-idf calculation. """
        tags = self._site.posts_per_tag.keys()
        if not self._use_nltk:
            self._tag_set = set(tags)
        else:
            self._tag_set = set(self._get_stem_from_cache(tag) for tag in tags)

    def _term_frequncy(self, word, post):
        """ Return the frequency of a word, given a post. """
        word_counts = self._get_word_count(post)
        # A mix of augmented, logarithmic frequency. We divide by the max
        # frequency to prevent a bias towards longer documents.
        tf = math.log(1 + float(word_counts[word]) / max(word_counts.values()))
        return tf

    def _tf_idf(self, word, post):
        """ Return tf-idf value of a word, in a specified post. """
        tf = self._term_frequncy(word, post)
        idf = self._modified_inverse_document_frequency(word)
        return tf * idf
def clean(text, remove_stopwords=False, stem_words=False):
    text = text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"b'rt", '', text)
    text = re.sub(r"http.+?\s", '', text)
    text = re.sub(r"@.+?\s", '', text)
    text = re.sub(r"[0-9]+", '', text)
    text = re.sub(r"x[a-z][0-9]", '', text)
    text = re.sub(r"x[a-z]", '', text)
    text = re.sub(r"x", '', text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # POS-tagging variant, kept disabled:
    # tokens = word_tokenize(text)
    # pos_text = pos_tag(tokens)
    # new_text = ' '.join([i[0]+'-'+i[1] for i in pos_text])

    return text
class process_list:

    def __init__(self, debug=False):
        # polarity / other lexicon lists
        self._polarity = []
        self._other = []
        # stopwords, plus an accent-free variant for matching
        self._stopwords = nltk.corpus.stopwords.words('spanish')
        self._stopwords_no_accents = []
        for w in self._stopwords:
            self._stopwords_no_accents.append(
                self.delete_special_characters(self.delete_accents(w)))
        # stemmer
        self._spanis_stemmer = SnowballStemmer('spanish')
        # debug print messages
        self.debug = debug

    def load_list(self, type_file_parm, _file_type="lexicons/politico.csv",
                  separator="\t"):
        if type_file_parm is type_file_enum.polarity:
            with open(_file_type, newline='') as csvFileBow:
                reader = csv.reader(csvFileBow, delimiter=separator)
                for row in reader:
                    new_row = [
                        row[0],
                        self.delete_accents(row[0]),
                        self.delete_special_characters(self.delete_accents(row[0])),
                        self._spanis_stemmer.stem(
                            self.delete_accents(self.delete_accents(row[0]))),
                        row[1]
                    ]
                    self._polarity.append(new_row)
            print("Summary Lexicon: ", _file_type, " #Words: ",
                  len(self._polarity))
        if type_file_parm is type_file_enum.other:
            with open(_file_type, newline='') as csvFileBow:
                reader = csv.reader(csvFileBow, delimiter=';')
                for row in reader:
                    new_row = [
                        row[0],
                        self.delete_accents(row[0]),
                        self.delete_special_characters(self.delete_accents(row[0])),
                        self._spanis_stemmer.stem(
                            self.delete_accents(self.delete_accents(row[0]))),
                        row[1] + "-" + row[2] + "-" + row[3]
                    ]
                    self._other.append(new_row)

    def filter_word(self, word, type_file_parm):
        # matching is attempted in four passes: exact form, accent-free,
        # accent- and punctuation-free, and finally the stem
        debug = True
        if type_file_parm is type_file_enum.polarity:
            original = word
            matching = [[original, s[0]] for s in self._polarity if word == s[0]]
            if debug:
                print("1.Characters ", original, "-", word, " - ", matching)
            if len(matching) == 0:
                word = self.delete_accents(word)
                matching = [[original, s[1]] for s in self._polarity if word == s[1]]
                if debug:
                    print("2.Characters ", original, "-", word, " - ", matching)
                if len(matching) == 0:
                    word = self.delete_special_characters(self.delete_accents(word))
                    matching = [[original, s[2]] for s in self._polarity if word == s[2]]
                    if debug:
                        print("3.Characters ", original, "-", word, " - ", matching)
                    if len(matching) == 0:
                        word = self._spanis_stemmer.stem(
                            self.delete_special_characters(self.delete_accents(word)))
                        matching = [[original, s[3]] for s in self._polarity if word == s[3]]
                        if debug:
                            print("4.Characters ", original, "-", word, " - ", matching)
                        if len(matching) == 0:
                            return "No identificado", 0
                        return "No personal", matching
                    return "No personal", matching
                return "No personal", matching
            return "No personal", matching
        return "No identificado", 0

    def process_text(self, text):
        result_words = []
        result_polarity = None
        polarity_word = None
        polarity_value = 0
        polarity_average = 0
        polarity_label = ""
        counter = 0
        neg = 0
        pos = 0
        value = 0
        try:
            words_text = nltk.word_tokenize(text)
            words_text = self.delete_stopword(words_text)
            for word in words_text:
                polarity_word = self.filter_word_generic(
                    word, type_file_enum.polarity)
                if polarity_word[1] != 0:
                    counter = counter + 1
                    if int(polarity_word[1][0][2]) < 0:
                        neg = neg - 1
                    elif int(polarity_word[1][0][2]) > 0:
                        pos = pos + 1
                    result_words.append(polarity_word)
                    if self.debug:
                        print(polarity_word)
            # the dominant side (positive or negative) sets the raw value
            if abs(neg) > pos:
                value = neg
            else:
                value = pos
            if counter != 0:
                polarity_average = value / counter
            else:
                polarity_average = 0
            polarity_value = round(polarity_average)
            if polarity_value == 1:
                polarity_label = "Positivo"
            elif polarity_value == -1:
                polarity_label = "Negativo"
            else:
                polarity_label = "Neutro"
        except Exception:
            print("Error - polarity : ", polarity_word)
        result_polarity = {
            'Polarity': polarity_value,
            'Average': polarity_average,
            'Label': polarity_label,
            'Words': result_words
        }
        return result_polarity

    def filter_word_generic(self, word, type_file_parm):
        if type_file_parm is type_file_enum.polarity:
            return self.internal_count_list(word, self._polarity,
                                            {"error": "No identificado",
                                             "successful": "polarity"})
        if type_file_parm is type_file_enum.occupation:
            return self.internal_count_list(word, self._other,
                                            {"error": "No identificado",
                                             "successful": "other"})
        return "ERROR 0001 - LIST - NOT FOUND", 0

    def internal_count_list(self, word, list_porcess, response_text):
        original = word
        matching = [[original, s[0], s[4]] for s in list_porcess if word == s[0]]
        if self.debug:
            print("-----------------------------------")
            print("1.Characters ", original, "-", word, " - ", matching)
        if len(matching) == 0:
            word = self.delete_accents(word)
            matching = [[original, s[1], s[4]] for s in list_porcess if word == s[1]]
            if self.debug:
                print("2.Characters ", original, "-", word, " - ", matching)
            if len(matching) == 0:
                word = self.delete_special_characters(self.delete_accents(word))
                matching = [[original, s[2], s[4]] for s in list_porcess if word == s[2]]
                if self.debug:
                    print("3.Characters ", original, "-", word, " - ", matching)
                if len(matching) == 0:
                    word = self._spanis_stemmer.stem(
                        self.delete_special_characters(self.delete_accents(word)))
                    matching = [[original, s[3], s[4]] for s in list_porcess if word == s[3]]
                    if self.debug:
                        print("4.Characters ", original, "-", word, " - ", matching)
                    if len(matching) == 0:
                        return response_text["error"], 0
                    return response_text["successful"], [matching[0]]
                return response_text["successful"], [matching[0]]
            return response_text["successful"], [matching[0]]
        return response_text["successful"], [matching[0]]

    # Remove accents
    def delete_accents(self, _word):
        return ''.join(c for c in unicodedata.normalize('NFD', _word)
                       if unicodedata.category(c) != 'Mn')

    # Remove special characters
    def delete_special_characters(self, lin):
        lin = re.sub(r'\/|\\|\.|\,|\;|\:|\n|\?|\'|\t', ' ', lin)  # punctuation
        lin = re.sub(r"\s+\w\s+", " ", lin)  # isolated single characters
        lin = re.sub(r"\.", "", lin)
        lin = re.sub(" ", "", lin)
        return lin.lower()

    # Remove stopwords
    def delete_stopword(self, text):
        return_data = []
        for word in text:
            if (word.lower() not in self._stopwords_no_accents
                    and word != "" and len(word) > 2):
                return_data.append(word.lower())
        return return_data
def preprocess_text(text, tokenize=False, ner=False, stem=False, stopw=False,
                    all_lower=False, strip_punct=True):
    """
    Preprocesses and cleans text
    :param tokenize: tokenize text and rejoin with single spaces
    :param ner: Do Named Entity Recognition and join each entity into one word
    :param stem: Stem text
    :param stopw: Remove stopwords
    :param all_lower: lowercase text
    :param strip_punct: strips punctuation
    :return: preprocessed text
    """
    # Clean the text
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"i\.e\.", "", text)
    text = re.sub(r"\.", " . ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r'"', " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r"^e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"^b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"^u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\b[a-zA-Z]\b", "", text)

    if ner:
        tokenized_text = word_tokenize(text)
        tagged_text = pos_tag(tokenized_text)
        chunked_text = ne_chunk(tagged_text, binary=True)
        named_entities = extract_entity_names(chunked_text)
        for named_entity in named_entities:
            entity = named_entity.replace(".", "")
            entity = re.sub(r'\s+', "_", entity)
            text = text.replace(named_entity, entity)

    if all_lower:
        text = text.lower()

    if stopw:
        global stops
        if stops is None:
            try:
                stops = set(stopwords.words("english"))
            except Exception as e:
                print("%s - Please download english stopwords from NLTK" % e)
                exit()
        text = [word.strip() for word in text.split() if word not in stops]
        text = " ".join(text)

    if tokenize:
        text = word_tokenize(text)
        text = " ".join(text)

    # shorten words to their stems
    if stem:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    if strip_punct:
        text = text.translate(str.maketrans('', '', string.punctuation))

    text = text.strip()
    # Empty string
    if text == '':
        return EMPTY_TOKEN
    return text
class Preprocess():

    def __init__(self, rootDir, word_dict, inv_words):
        self.rootDir_ = rootDir
        self.class_words_dict_ = word_dict
        self.inv_words_dict_ = inv_words
        self.imputer_ = KNN(k=1)
        self.enc_ = OrdinalEncoder()
        self.spanish_stemmer_ = SnowballStemmer('spanish')
        self.special_words_ = ['piez']
        self.stopwords_spanish_ = stopwords.words('spanish')
        self.df_ = pd.DataFrame(columns=[
            'Tipo', 'Tipo_2', 'Tipo_3', 'Tipo_4', 'Marca', 'Submarca',
            'Empaque', 'Contenido', 'UnidadMedida', 'LocalidadGeografica',
            'Fuente', 'precio', 'fecha'
        ])
        self.data_ = self.import_data()
        self.add_stop_words()
        self.preprocess('descripcion')
        self.categorize()
        self.append_df()
        self.join_marca_submarca_drop_null()
        self.imputation()
        self.inv_words_funct()
        self.drop_unused_columns()

    def import_data(self):
        '''
        Import all files in a directory without subfolders.
        '''
        data = {}
        path = self.rootDir_ + '*.csv'
        for fname in glob.glob(path):
            key = fname.split('\\')[1].split('.csv')[0]
            data[key] = pd.read_csv(fname, index_col=0)
            try:
                data.get(key)['fecha'] = pd.to_datetime(
                    data.get(key)['fecha'], format='%d-%m-%Y')
            except KeyError:
                print("Check datetime values, as I didn't find them.")
        return data

    def add_stop_words(self):
        new_stop_words = ['s']
        self.stopwords_spanish_.extend(new_stop_words)
        return self

    def tokenize(self, data):
        '''
        Input: the complete string
        Output: the tokenized string as a list of strings
        '''
        return word_tokenize(data)

    def remove_stopwords_punctuation(self, data):
        clean_description = []
        for word in data:
            if (word not in self.stopwords_spanish_
                    and word not in string.punctuation):
                clean_description.append(word)
        return clean_description

    def remove_accents(self, data):
        return [unidecode.unidecode(word) for word in data]

    def lowercasing(self, data):
        return [word.lower() for word in data]

    def stemming(self, data):
        return [self.spanish_stemmer_.stem(word) for word in data]

    def remove_duplicates(self, data):
        seen = set()
        result = []
        for item in data:
            if item not in seen:
                seen.add(item)
                result.append(item)
        return result

    def split_number_letter(self, data):
        result = []
        for word in data:
            match = re.match(r'([0-9]+)([a-z]+)', word, re.I)
            if match:
                for element in match.groups():
                    result.append(element)
            else:
                result.append(word)
        return result

    def remove_special_char(self, data):
        result = []
        for word in data:
            if word not in self.special_words_:
                result.append(word)
        return result

    def preprocess(self, column_name):
        for values in self.data_.values():
            values[column_name] = values.apply(
                lambda row: self.tokenize(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.remove_accents(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.lowercasing(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.split_number_letter(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.remove_stopwords_punctuation(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.stemming(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.remove_special_char(row[column_name]), axis=1)
            values[column_name] = values.apply(
                lambda row: self.remove_duplicates(row[column_name]), axis=1)
        return self

    def append_df(self):
        for element in self.data_.keys():
            self.df_ = self.df_.append(self.data_.get(element),
                                       ignore_index=True)
        return self

    def categorize(self):
        columns_to_add = ['Tipo', 'Tipo_2', 'Tipo_3', 'Tipo_4', 'Marca',
                          'Submarca', 'Empaque', 'Contenido', 'UnidadMedida']
        for base_key in self.data_.keys():
            self.data_.get(base_key).reset_index(drop=True, inplace=True)
            for i in columns_to_add:
                self.data_.get(base_key)[i] = np.nan
            self.data_.get(base_key)['Fuente'] = base_key
            for row in range(len(self.data_.get(base_key))):
                for element in self.data_.get(base_key)['descripcion'][row]:
                    # assign the token to every category whose word list
                    # contains it (equivalent to the original chain of ifs)
                    for col in columns_to_add:
                        if element in self.class_words_dict_.get(col):
                            self.data_.get(base_key)[col].loc[row] = element
        return self

    def join_marca_submarca_drop_null(self):
        self.df_['Submarca'].fillna('', inplace=True)
        self.df_['Marca'] = self.df_['Marca'] + self.df_['Submarca']
        self.df_.drop(['Submarca'], axis=1, inplace=True)
        self.df_.dropna(subset=['Tipo'], inplace=True)
        return self

    def imputation(self):
        self.df_.fillna('', inplace=True)
        self.df_.reset_index(drop=True, inplace=True)
        for row in range(len(self.df_)):
            if self.df_.Tipo.loc[row] == 'huev' and self.df_.UnidadMedida.loc[row] == '':
                self.df_['UnidadMedida'].loc[row] = 'pz'
            if self.df_.Tipo.loc[row] == 'tortill' and self.df_.UnidadMedida.loc[row] == '':
                self.df_['UnidadMedida'].loc[row] = 'pz'
            if self.df_.Tipo.loc[row] == 'papel' and self.df_.UnidadMedida.loc[row] == '':
                self.df_['UnidadMedida'].loc[row] = 'roll'
            if self.df_.Tipo.loc[row] == 'lech' and self.df_.UnidadMedida.loc[row] == '':
                self.df_['UnidadMedida'].loc[row] = 'l'
            if self.df_.Contenido.loc[row] == '':
                self.df_['Contenido'].loc[row] = '1'
            if self.df_.Marca.loc[row] == '':
                self.df_['Marca'].loc[row] = 'no_especificado'
            # Combine Tipo_2/Tipo_3/Tipo_4 into Tipo_2: join the non-empty
            # parts with '_' (equivalent to the original nested if/else),
            # falling back to 'no_especificado' when all three are empty.
            parts = [self.df_[col].loc[row]
                     for col in ('Tipo_2', 'Tipo_3', 'Tipo_4')
                     if self.df_[col].loc[row] != '']
            self.df_['Tipo_2'].loc[row] = '_'.join(parts) if parts else 'no_especificado'
        self.knn_imputer_for_empaque()
        return self

    def knn_imputer_for_empaque(self):
        data = self.df_.copy(deep=True)
        data['Empaque'][(data['Empaque'] == '')] = np.nan
        # initialize variables
        ordinal_enc_dict = {}
        columns_to_encode = ['Tipo', 'Tipo_2', 'Empaque']
        # loop over columns to encode
        for col_name in data[columns_to_encode]:
            # create ordinal encoder for the column
            ordinal_enc_dict[col_name] = OrdinalEncoder()
            # select the non-null values in the column
            col = data[col_name]
            col_not_null = col[col.notnull()]
            reshaped_vals = col_not_null.values.reshape(-1, 1)
            # encode the non-null values of the column
            encoded_vals = ordinal_enc_dict[col_name].fit_transform(reshaped_vals)
            # store the encoded values back into the non-null rows
            data.loc[col.notnull(), col_name] = np.squeeze(encoded_vals)
        # impute with KNN
        data.iloc[:, [data.columns.get_loc(col_) for col_ in columns_to_encode]] = \
            np.round(self.imputer_.fit_transform(data[columns_to_encode]))
        for col_name in data[columns_to_encode]:
            # reshape the data
            reshaped = data[col_name].values.reshape(-1, 1)
            # perform inverse transformation of the ordinally encoded columns
            data[col_name] = ordinal_enc_dict[col_name].inverse_transform(reshaped)
        self.df_ = data.copy(deep=True)
        return self

    def search_in_dict(self, data):
        for key, value in self.inv_words_dict_.items():
            for i in value:
                if i == data:
                    return key
        return data

    def inv_words_funct(self):
        column_name = ['Tipo', 'Tipo_2', 'Marca', 'Empaque', 'UnidadMedida',
                       'Contenido']
        for element in column_name:
            self.df_[element] = self.df_.apply(
                lambda row: self.search_in_dict(row[element]), axis=1)
        return self

    def drop_unused_columns(self):
        columns_to_drop = ['descripcion', 'producto', 'LocalidadGeografica',
                           'Tipo_3', 'Tipo_4']
        self.df_.drop(columns_to_drop, axis=1, inplace=True)
        return self
dir_name = './res_reddit/'
news_dir = ''
news2 = pd.read_csv(news_dir + 'cooc21weighted.csv', index_col=0)
news1 = pd.read_csv(news_dir + 'cooc11weighted.csv', index_col=0)
st = SnowballStemmer('english')

for filename in os.listdir(dir_name):
    if 'count_term_500' in filename:
        print(filename)
    else:
        continue
    term = filename.split('_child_')[1][:-4]
    term_clean = ' '.join(term.split('_'))
    term_stemmed = ' '.join([st.stem(x) for x in term.split('_')])
    red = pd.read_csv(dir_name + 'count_term_500_child_' + term + '.csv',
                      index_col=0)
    if len(term.split('_')) == 2:
        news_words = news2
    else:
        news_words = news1
    if term_stemmed + '_count_freq_weighted' not in news_words.columns:
        continue
    news_words = news_words[[term_stemmed + '_count_freq_weighted']]
    news_words.columns = ['news']
    news_words['news_rank'] = news_words['news'].rank(method='min',
                                                      ascending=False)
import nltk
from nltk.stem import RegexpStemmer
stemmerregexp = RegexpStemmer('ing')
stemmerregexp.stem('running')

# ### II. SNOWBALL STEMMER

# In[7]:

import nltk
from nltk.stem import SnowballStemmer
SnowballStemmer.languages
frstemmer = SnowballStemmer('french')
frstemmer.stem('manges')

# ### III. LANCASTER STEMMER

# In[8]:

import nltk
from nltk.stem import LancasterStemmer
lancaster = LancasterStemmer()
lancaster.stem('running')

# ### IV. PORTER STEMMER

# In[10]:
print(list(newsgroups_train.target_names))

# Let's look at some sample news
k = newsgroups_train.data[:2]
print(newsgroups_train.filenames.shape, newsgroups_train.target.shape)

print(WordNetLemmatizer().lemmatize('went', pos='v'))  # past tense to present tense

stemmer = SnowballStemmer("english")
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied', 'died',
                  'agreed', 'owned', 'humbled', 'sized', 'meeting', 'stating',
                  'siezing', 'itemization', 'sensational', 'traditional',
                  'reference', 'colonizer', 'plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data={'original word': original_words, 'stemmed': singles})


def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))


# Tokenize and lemmatize
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result
tokenized_sentences = nltk.sent_tokenize(text)
print(tokenized_sentences)

# Tokenize words
tokenized_words = [nltk.word_tokenize(sentence)
                   for sentence in tokenized_sentences]
print(tokenized_words)

# POS tagger
tokens_pos = [pos_tagger.tag(word) for word in tokenized_words]
print(tokens_pos)

# Stems
stemmas = [stemmer.stem(word) for word in tokenized_words[0]]
print(stemmas)

# NER tagger
entities = [ner_tagger.tag(word) for word in tokenized_words]
print(entities)

# Stop words
print(stop_words)

# Removing stop words
# for word in tokenized_words:
#     if word not in stop_words:
#         print(word)
def stemning(words):
    stemmer = SnowballStemmer('danish')
    return [stemmer.stem(word) for word in words]
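# Illustrative call to stemning() above with invented Danish tokens; the
# exact stems depend on NLTK's Danish Snowball rules:
print(stemning(["hundene", "husene"]))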
def stemming_tweets(tweet):
    # `tweet` is expected to be an iterable of tokens
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in tweet]
    return stemmed_words
for word in words:
    # dict.has_key() is Python 2 only; use "in" instead
    if word.encode('utf8') in dicts:
        tag = sorted(dicts[word.encode('utf8')].items(),
                     key=lambda x: x[1], reverse=True)[0][0]
        tagged_text.append(word + ' ' + tag)
    else:
        tagged_text.append(word + ' ' + 'NN')

# Create frequency list
freq_list = dict()
for word in tagged_text:
    tokens = nltk.word_tokenize(word)
    if (tokens[0] not in stopwords.words('swedish')
            and tokens[1] != 'MAD' and tokens[1] != 'MID'):
        stem = stemmer.stem(tokens[0])
        if stem in freq_list:
            freq_list[stem] += 1
        else:
            freq_list[stem] = 1

# Divide every value in the frequency list by the highest frequency
# to get values between 0 and 1
highest_value = sorted(freq_list.items(), key=lambda x: x[1], reverse=True)[0][1]
for word in freq_list:
    freq_list[word] = freq_list[word] / highest_value

# Score every sentence against the frequency list (with stemmed words)
ordered = []
for sentence in sentences:
    score = 0
    tokens = nltk.word_tokenize(sentence)
final_table["all_reviews_string"] = ''
snowball_stemmer = SnowballStemmer("english")
wordnet_lemmatizer = WordNetLemmatizer()

for i in range(len(final_table)):
    review = final_table["all_reviews_text"][i]
    if review.__str__() != 'nan':
        str1 = ''.join(review)
        # lemmatize it
        documents = " ".join(
            [wordnet_lemmatizer.lemmatize(word) for word in str1.split(" ")])
        # stem it
        documents = " ".join(
            [snowball_stemmer.stem(word) for word in documents.split(" ")])
        print(documents)  # Python 3 print
        final_table["all_reviews_string"][i] = documents

stop = stopwords.words('english')
stop.remove(u'no')
stop.remove(u'nor')
stop.remove(u'not')
stop.append(u'let')
stop.append(u'anyway')
stop.append(u'else')
stop.append(u'maybe')
stop.append(u'however')
stop.append(u'00')
stop.append(u'10')
stop.append(u'11')
for file in files:
    file_path = count_to_file[file]
    file_id = file_to_count[file_path]
    lines = ""
    f = open(file_path, 'r')
    local_freq_dic = {}  # store only the freq of words appearing in one file
    lines = f.read()
    words = tokenizer.tokenize(lines)  # tokenize
    words = [word.strip('_ ').lower() for word in words
             if word not in STOPWORDS]  # lowercase
    words = [stemmer.stem(word) for word in words]  # stemmer
    words = [word for word in words
             if word not in STOPWORDS and len(word) > 0]  # stopword removal
    for word in words:
        all_words[cls].add(word)
    for word in words:
        if word not in local_freq_dic:
            local_freq_dic[word] = 0  # initialize
        local_freq_dic[word] += 1
    for word in local_freq_dic:
        if word not in train_tfidf_dic:
            train_tfidf_dic[word] = {}  # initialize
        train_tfidf_dic[word][file_id] = local_freq_dic[word]
json_data = None
with open(
        'C:/Users/ABHISEK/Downloads/yelp_academic_dataset_review.json/yelp_academic_dataset_review.json'
) as data_file:
    lines = data_file.readlines()
    joined_lines = "[" + ",".join(lines) + "]"
    json_data = j.loads(joined_lines)

data = pd.DataFrame(json_data)

stemmer = SnowballStemmer('english')
words = stopwords.words("english")
data['cleaned'] = data['text'].apply(lambda x: " ".join([
    stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split()
    if i not in words
]).lower())

X_train, X_test, y_train, y_test = train_test_split(data['cleaned'],
                                                    data.stars,
                                                    test_size=0.2)

pipeline = Pipeline([('vect', TfidfVectorizer(ngram_range=(1, 2),
                                              stop_words="english",
                                              sublinear_tf=True)),
                     ('chi', SelectKBest(chi2, k=10000)),
                     ('clf', LinearSVC(C=1.0, penalty='l1',
                                       max_iter=3000, dual=False))])
def jamesLemmatize(tokens, doStemDic):
    '''
    This method is used to lemmatize and stem text for topic modeling.
    It is used by preProcess and preProcessSentence above.

    Parameters
    ----------
    tokens: list or str
        the input to be lemmatized and stemmed; either a string that has not
        yet been tokenized, or a list representing an already tokenized string
    doStemDic: bool
        whether a word-stem-to-word dictionary should be constructed during
        lemmatization and returned with the results

    Output
    ------
    dict
        a dictionary containing the results of lemmatization; if doStemDic is
        true, it has two keys: "lemmatized", whose value is a list representing
        the lemmatized input, and "stemDic", whose value is the word-stem-to-word
        dictionary; otherwise only the "lemmatized" key is present
    '''
    # Initialize the objects to be returned, if needed
    lemmatized = []
    if doStemDic:
        stemDic = {}
    # Initialize a WordNetLemmatizer, imported from nltk.stem
    lemmatizer = WordNetLemmatizer()
    # Initialize a SnowballStemmer in english, imported from nltk.stem
    stemmer = SnowballStemmer('english')
    # Tokenize the text using simple_preprocess, imported from gensim.utils, if needed
    if type(tokens) == str:
        tokens = simple_preprocess(tokens)
    # Remove apostrophes and the text following them before lemmatizing
    # (the original loop only rebound its loop variable and had no effect)
    tokens = [re.sub(r"\'[a-zA-Z]*", '', token) for token in tokens]
    # Tag each word using pos_tag, imported from nltk.tag, and iterate through each token and tag
    for token, tag in pos_tag(tokens):
        # Format it to lowercase
        token = token.lower()
        # Check whether the token is tagged as a noun, a verb, or other, and set pos appropriately
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        # Filter out each token that is punctuation, in STOPWORDS (imported from
        # gensim.parsing.preprocessing), or shorter than the minimum acceptable token length
        if (token not in STOPWORDS and token not in string.punctuation
                and len(token) >= cfg['mintokenlen']):
            # Lemmatize the token using WordNetLemmatizer
            lemma = lemmatizer.lemmatize(token, pos)
            # Stem the lemma using the SnowballStemmer
            lemma = stemmer.stem(lemma)
            # If a stem dictionary is needed, record the first token that
            # produced each stem
            if doStemDic:
                if lemma not in stemDic:
                    stemDic[lemma] = token
            # Add the stem to the lemmatized list
            lemmatized.append(lemma)
    # If a stem dictionary is required, return it along with the stems list
    if doStemDic:
        return {"lemmatized": lemmatized, "stemDic": stemDic}
    # Otherwise, return a dictionary with only the lemmatized list
    return {"lemmatized": lemmatized}
def text_to_wordlist(text, remove_stop_words=True, stem_words=False):
    # Clean the text, with the option to remove stop_words and to stem words.

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"What's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"india", "India", text)
    text = re.sub(r"switzerland", "Switzerland", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text)
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r"quora", "Quora", text)
    text = re.sub(r" dms ", "direct messages ", text)
    text = re.sub(r"demonitization", "demonetization", text)
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r"KMs", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text)
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iPhone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text)
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"gps", "GPS", text)
    text = re.sub(r"gst", "GST", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"dna", "DNA", text)
    text = re.sub(r"III", "3", text)
    text = re.sub(r"the US", "America", text)
    text = re.sub(r"Astrology", "astrology", text)
    text = re.sub(r"Method", "method", text)
    text = re.sub(r"Find", "find", text)
    text = re.sub(r"banglore", "Banglore", text)
    text = re.sub(r" J K ", " JK ", text)

    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])

    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text
    return text
class Processing:
    def __init__(self, stopwords_path='data/', tokenizer_path='models/', max_len=80):
        # It needs a stopwords file to init
        stop_words = pd.read_csv(stopwords_path + 'stopwords-es.txt', header=None)
        stop_words = stop_words[0].tolist() + ['secuela']
        self.stop_words = stop_words
        self.n_words = 8000
        self.max_len = max_len
        # self.aug = naf.Sequential([
        #     naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-cased', action="insert", aug_p=0.1),
        #     naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-cased', action="substitute", aug_p=0.9),
        #     naw.RandomWordAug(action="delete", aug_p=0.1)
        # ])
        try:
            self.stemmer = SnowballStemmer("spanish", ignore_stopwords=True)
        except LookupError:
            # ignore_stopwords needs the nltk stopwords corpus; fetch it on first use
            nltk.download("popular")
            self.stemmer = SnowballStemmer("spanish", ignore_stopwords=True)
        # Load the fitted tokenizer
        with open(tokenizer_path + 'tokenizer.pickle', 'rb') as handle:
            self.tokenizer = pickle.load(handle)
        self.__vocab_size = len(self.tokenizer.word_index) + 1

    @property
    def vocab_size(self):
        return self.__vocab_size

    def normalize(self, s):
        # Lowercase and strip Spanish accents and ñ
        s = s.lower()
        replacements = (("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"),
                        ("ú", "u"), ("ñ", "n"))
        for a, b in replacements:
            s = s.replace(a, b).replace(a.upper(), b.upper())
        return s

    def split_punt(self, x):
        # Separate punctuation into its own tokens and collapse repeated spaces
        words = WordPunctTokenizer().tokenize(x)
        x = str(' '.join(words))
        x = re.sub(' +', ' ', x)
        return x

    def delete_stop_words(self, x):
        # Drop punctuation, digits, and Spanish stopwords
        x = x.translate(str.maketrans('', '', string.punctuation))
        x = x.translate(str.maketrans('', '', '1234567890ªº¡¿'))
        words = x.split(' ')
        words = [word for word in words if word not in self.stop_words]
        x = str(' '.join(words))
        return x

    def stem_sentence(self, sentence):
        # Stem the sentence token by token
        stemmed_text = [
            self.stemmer.stem(word) for word in word_tokenize(sentence)
        ]
        return " ".join(stemmed_text)

    def augment(self, x):
        # self.aug is commented out in __init__, so this currently returns None
        try:
            return self.aug.augment(x)
        except Exception:
            return None

    def clean_overview(self, df):
        # Execute the full cleaning process on every overview,
        # then paste in cast and director names and clean again
        df['overview'] = df['overview'].apply(lambda x: self.normalize(x))
        df['overview'] = df['overview'].apply(lambda x: self.delete_stop_words(x))
        df['overview'] = df['overview'].apply(lambda x: self.stem_sentence(x))
        df['overview'] = df.apply(lambda x: self.get_actors(x['cast']) + ' ' + x['overview'], axis=1)
        df['overview'] = df.apply(lambda x: self.get_director(x['crew']) + x['overview'], axis=1)
        df['overview'] = df['overview'].apply(lambda x: self.normalize(x))
        df['overview'] = df['overview'].apply(lambda x: self.delete_stop_words(x))
        return df

    # Get staff and paste to overview
    @staticmethod
    def eval_cell(cell):
        try:
            cell_array = eval(cell)
        except Exception:
            cell_array = []
        return cell_array

    def get_actors(self, cast):
        # Take up to the first three billed actors, joined with underscores
        eval_cast = self.eval_cell(cast)
        up = 3 if len(eval_cast) > 2 else len(eval_cast)
        actors = ''
        for i in range(0, up):
            actor = eval_cast[i]['name']
            actor = self.normalize(actor.replace(' ', '_').lower())
            actors = actors + ' ' + actor
        return actors

    def get_director(self, crew):
        eval_crew = self.eval_cell(crew)
        directors = [
            member['name'] for member in eval_crew if member['job'] == 'Director'
        ]
        directors = [
            self.normalize(director.replace(' ', '_').lower()) for director in directors
        ]
        return str(' '.join(directors))

    def paste_cast(self, data):
        data['overview'] = data.apply(
            lambda x: self.get_actors(x['cast']) + ' ' + x['overview'], axis=1)
        data['overview'] = data.apply(
            lambda x: self.get_director(x['crew']) + x['overview'], axis=1)
        return data

    # Split train_test
    def split_data(self, data):
        overviews = data['overview'].values
        y = data['like'].values
        overviews_train, overviews_test, y_train, y_test = train_test_split(
            overviews, y, test_size=0.15, stratify=y, random_state=9)
        return overviews_train, overviews_test, y_train, y_test

    def fit_tokenizer(self, overviews_train, num_words):
        self.tokenizer = Tokenizer(num_words)
        self.tokenizer.fit_on_texts(overviews_train)
        # Adding 1 because of the reserved 0 index; assign the private field,
        # since vocab_size is a read-only property with no setter
        self.__vocab_size = len(self.tokenizer.word_index) + 1

    def tokenize_overview(self, overviews, max_len):
        X = self.tokenizer.texts_to_sequences(overviews)
        from keras.preprocessing.sequence import pad_sequences
        # Pad each sequence on the left to fit max_len
        X = pad_sequences(X, padding='pre', maxlen=max_len)
        return X

    def process(self, data, train_dev):
        # clean_overview already pastes cast and director into the overview,
        # so the original's extra paste_cast call here duplicated the names
        df = self.clean_overview(data)
        if train_dev:
            X_train, X_test, y_train, y_test = self.split_data(df)
            self.fit_tokenizer(X_train, self.n_words)
            X_train = self.tokenize_overview(X_train, self.max_len)
            X_test = self.tokenize_overview(X_test, self.max_len)
            # Return the labels too; the original computed and then dropped them
            return X_train, X_test, y_train, y_test
        else:
            X = df['overview'].values
            X = self.tokenize_overview(X, self.max_len)
            return X
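# Hypothetical end-to-end call (illustration only): assumes the stopword file
# and pickled tokenizer exist at the default paths, and that movies.csv is a
# stand-in dataset with 'overview', 'cast', 'crew', and 'like' columns. Uses
# the fixed process() above, which now returns the labels as well.
import pandas as pd

movies = pd.read_csv('data/movies.csv')
proc = Processing(stopwords_path='data/', tokenizer_path='models/', max_len=80)
X_train, X_test, y_train, y_test = proc.process(movies, train_dev=True)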
def lemmatize_stemming(text):
    # Lemmatize the word as a verb first, then stem the result
    stemmer = SnowballStemmer("english")
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
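# For example (illustration only), the lemmatize-then-stem chain handles
# irregular verb forms that a stemmer alone would miss:
print(lemmatize_stemming("went"))     # the lemmatizer maps the irregular 'went' -> 'go'; the stem stays 'go'
print(lemmatize_stemming("running"))  # 'running' -> 'run' -> 'run'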
# Importing the dataset
col_names = ['Rating', 'Review']
dataset = pd.read_csv('data/train.csv', names=col_names, header=None)
del col_names

#======================================================================================================

# Cleaning the texts
corpus = []
# Build the stemmer and the stopword set once, outside the loop,
# rather than re-creating them for every review as the original did
ps = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))
for i in range(len(dataset)):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)
del review, i

#======================================================================================================

X = corpus
y = dataset.iloc[0:len(corpus), 0].values
#X_train, X_val, y_train, y_val = train_test_split(corpus, y, test_size = 0.10, random_state = 0)


def decrement(values):
    # Parameter renamed from 'list' to avoid shadowing the builtin
    return [x - 1 for x in values]
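# Illustration (not from the original): decrement shifts 1-based star ratings
# to the 0-based class indices that most classifiers expect.
print(decrement([1, 2, 5]))  # -> [0, 1, 4]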
def stemTokenize_1(text):
    # Tokenize, then stem each token with the Danish Snowball stemmer
    stemmer = SnowballStemmer('danish')
    return [stemmer.stem(w) for w in word_tokenize(text)]
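# Quick check (illustration only; needs nltk's punkt tokenizer data):
print(stemTokenize_1("hunden løber hurtigt"))  # roughly ['hund', 'løb', 'hurtig']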
def stem_tokens(t_tokens, lang):
    # Stem every token with a Snowball stemmer for the given language
    stemmer = SnowballStemmer(language=lang)
    t_tokens = [stemmer.stem(item) for item in t_tokens]
    return t_tokens
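# The lang argument accepts any of SnowballStemmer's supported language names,
# for example (illustration only):
print(stem_tokens(["connection", "connected"], "english"))  # ['connect', 'connect']
print(stem_tokens(["corriendo", "corrieron"], "spanish"))   # roughly ['corr', 'corr']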
from nltk.stem import SnowballStemmer
from nltk.stem.snowball import EnglishStemmer

stemmer = SnowballStemmer("english")
# EnglishStemmer is the class SnowballStemmer("english") dispatches to,
# so this reassignment swaps in an equivalent stemmer
stemmer = EnglishStemmer()

TEXT = "Once upon a time there lived in a certain village a little country girl, " + \
       "the prettiest creature who was ever seen. Her mother was excessively fond of her; " + \
       "and her grandmother doted on her still more. This good woman had a little red riding hood made for her. " + \
       "It suited the girl so extremely well that everybody called her Little Red Riding Hood."

stemmed_text = [stemmer.stem(word) for word in TEXT.split()]
print(stemmed_text)
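# A quick sanity check (illustration only) that the two stemmers above really
# are interchangeable and produce identical stems:
for word in ["running", "prettiest", "doted"]:
    assert SnowballStemmer("english").stem(word) == EnglishStemmer().stem(word)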
# Fragment from a larger Naive Bayes trainer: this block runs once per training
# line, where words holds the line's tokens and subject (words[0]) its label.
if subject not in documents:
    documents[subject] = 1
else:
    documents[subject] += 1
if subject not in totals:
    totals[subject] = 0
if subject not in texts:
    texts[subject] = {}
for word in words[1:]:
    # Only count words longer than 2 letters; stem before counting
    if len(word) > 2:
        word = stemmer.stem(word)
        totals[subject] += 1
        vocabulary.add(word)
        if word not in texts[subject]:
            texts[subject][word] = 1
        else:
            texts[subject][word] += 1

# Also used in the Bayes equation
vocabularySize = len(vocabulary)
print('%i distinct words found in the training data' % vocabularySize)

print('Opening test data file')
with open(sys.argv[2]) as f:
    print('Reading test data file')
    testlines = f.readlines()
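# Sketch (not from the original): the counts gathered above plug into a Naive
# Bayes score with add-one (Laplace) smoothing. The dictionary names mirror the
# training fragment; classify() itself is a hypothetical helper.
import math

def classify(test_words, documents, totals, texts, vocabularySize, stemmer):
    totalDocs = sum(documents.values())
    best, bestScore = None, float('-inf')
    for subject in documents:
        # log P(subject): prior from per-subject document frequencies
        score = math.log(documents[subject] / totalDocs)
        for word in test_words:
            if len(word) > 2:  # mirror the training-time filter
                word = stemmer.stem(word)
                count = texts[subject].get(word, 0)
                # log P(word | subject) with Laplace smoothing over the vocabulary
                score += math.log((count + 1) / (totals[subject] + vocabularySize))
        if score > bestScore:
            best, bestScore = subject, score
    return best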