def AddTopicUnigram(self, feaName, comName, data=None):
    # need mapping first
    if data is None:
        data = self._data
    for i in range(len(data)):
        t_bigram = self.getEssayCollocation(data, i)
        t_uni = list()
        for (a, b) in t_bigram:
            t_uni.append(a)
            t_uni.append(b)
        t_uni = set(t_uni)
        comment = data[i][comName]
        tokens = nltk.wordpunct_tokenize(comment)
        tokens = [word.lower() for word in tokens]
        # stemming
        if self._stemoption == True:
            st = PorterStemmer()
            tokens = [st.stem(t) for t in tokens]
            t_uni = set([st.stem(t) for t in list(t_uni)])
        shared = [w for w in tokens if w in t_uni]
        # normalized
        data[i][feaName] = float(len(shared)) / (len(tokens) + 0.00001)
def getDomainUnigram(self, directory=None):
    collocations = set()   # collocation items
    ewordlists = list()    # list of lists of words
    # extract words from essays
    if directory is not None:
        doclist = os.listdir(directory)
        for essay in doclist:
            dir_essay = directory + '/' + essay
            etext = open(dir_essay, 'r').read()
            tokens = nltk.wordpunct_tokenize(etext)
            tokens = [word.lower() for word in tokens]
            # stemming
            if self._stemoption == True:
                st = PorterStemmer()
                tokens = [st.stem(t) for t in tokens]
            # extract the collocations for the given essay
            e_bigram = set(Mytext(tokens).collocations())
            collocations = collocations | e_bigram
            ewordlists.append(tokens)
    else:
        # use the mapped essays to calculate the candidate bigrams
        # (mapessay() must be called first)
        for ins in self._data:
            if ins['essay'] is not None:
                etext = open(ins['essay'], 'r').read()
                tokens = nltk.wordpunct_tokenize(etext)
                tokens = [word.lower() for word in tokens]
                # stemming
                if self._stemoption == True:
                    st = PorterStemmer()
                    tokens = [st.stem(t) for t in tokens]
                # extract the collocations for the given essay
                e_bigram = set(Mytext(tokens).collocations())
                collocations = collocations | e_bigram
                ewordlists.append(tokens)
    # build a collection from all essays under the specified directory / associated essays
    collection_text = TextCollection(ewordlists)
    itemlist = list()
    for (a, b) in collocations:
        itemlist.append(a)
        itemlist.append(b)
    itemlist = list(set(itemlist))
    word_idf = []
    for i in range(len(itemlist)):
        word_idf.append((collection_text.idf(itemlist[i]), itemlist[i]))
    word_idf = sorted(word_idf, key=operator.itemgetter(0))
    ave = 0
    if len(word_idf) != 0:
        ave = sum(map(operator.itemgetter(0), word_idf)) / len(word_idf)
    wlist = [j for (i, j) in word_idf if i < ave]
    return wlist
def extract_entities(doc):
    print 'extracting entities from %s...' % doc.getFilename()
    nps = list(set([re.sub(' \.', '', re.sub(' -[A-Z]{3}-', '', np).lower())
                    for np in doc.getAllNodesOfType('NP')]))
    p = PorterStemmer()
    entities = []
    for np in nps:
        try:
            response = json.loads(requests.get(host + 'select', params={
                'q': 'wam:[50 TO 100] AND iscontent:true AND lang:en AND (title_en:"%s" OR redirect_titles_mv_en:"%s")' % (np, np),
                'fl': 'title_en,redirect_titles_mv_en',
                'wt': 'json'}).content)
        except requests.exceptions.ConnectionError:
            while True:
                time.sleep(15)
                print 'retrying connection...'
                try:
                    response = json.loads(requests.get(host + 'select', params={
                        'q': 'wam:[50 TO 100] AND iscontent:true AND lang:en AND (title_en:"%s" OR redirect_titles_mv_en:"%s")' % (np, np),
                        'fl': 'title_en,redirect_titles_mv_en',
                        'wt': 'json'}).content)
                    break
                except requests.exceptions.ConnectionError:
                    continue
        docs = response[u'response'][u'docs']
        if len(docs) > 0:
            titles = [docs[0][u'title_en']] + docs[0].get(u'redirect_titles_mv_en', [])
        else:
            titles = []
        if len(titles) > 0:
            titles = [' '.join([p.stem(w.lower()) for w in t.split(' ')]) for t in titles]
            stem_np = ' '.join([p.stem(w) for w in np.split(' ')])
            for title in titles:
                if stem_np == title:
                    entities.append(np)
                    print np
                    break
    #print doc.getFilename(), entities
    return (doc.getFilename(), entities)
def compare_english_simple(article_title):
    """Given a title of an article, returns the number of tokens, types, and
    stems in both the English version and the simple English version."""
    english = extract_wikipedia_page(article_title, "en")
    simple = extract_wikipedia_page(article_title, "simple")
    num_tokens_english = len(english)
    num_tokens_simple = len(simple)
    types_english = count_words(get_words(english))
    types_simple = count_words(get_words(simple))
    porter_stemmer = PorterStemmer()
    stem_english = defaultdict(int)
    stem_simple = defaultdict(int)
    for key in types_english.keys():
        stem_english[porter_stemmer.stem(key)] += 1
    for key in types_simple.keys():
        stem_simple[porter_stemmer.stem(key)] += 1
    print("Number of Tokens in English " + article_title + ": %d" % num_tokens_english)
    print("Number of Tokens in Simple English " + article_title + ": %d" % num_tokens_simple)
    print("Number of Types in English " + article_title + ": %d" % len(types_english))
    print("Number of Types in Simple English " + article_title + ": %d" % len(types_simple))
    print("Number of Stems in English " + article_title + ": %d" % len(stem_english))
    print("Number of Stems in Simple English " + article_title + ": %d" % len(stem_simple))
def parse_questions(self):
    stemmer = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    for questions_key in self.rawSamples:
        # Stem the question text
        question_text = self.rawSamples[questions_key][0]
        words_array = tokenizer.tokenize(question_text)
        question_text = ""
        for word in words_array:
            if word.isnumeric():
                continue
            if word not in text.ENGLISH_STOP_WORDS:
                word = stemmer.stem(word)
                question_text += (word + " ")
        self.rawSamples[questions_key][0] = question_text

        # Stem the topic names
        topics_text = self.rawSamples[questions_key][2]
        words_array = tokenizer.tokenize(topics_text)
        topics_text = ""
        for word in words_array:
            if word.isnumeric():
                continue
            if word not in text.ENGLISH_STOP_WORDS:
                word = stemmer.stem(word)
                topics_text += (word + " ")
        self.rawSamples[questions_key][2] = topics_text
def extractFeatures(dataSet):
    vector1, vector2 = list(), list()
    stemmer = PorterStemmer()
    # Produces a list of all unique word stems in the titles in the dataset
    wordBag = list({stemmer.stem(word) for entry in dataSet
                    for word in entry[2].strip().split(" ")
                    if not word in stopwords.words('english')})
    for entry in dataSet:
        genre, isbn, title, authors = entry[0], entry[1].strip(), entry[2].strip(), entry[3].strip()
        wordList, authorList = [word for word in title.split(" ")], [author.strip() for author in authors.split(";")]
        sortedWords = sorted(wordList, key=lambda x: -1 * len(x))
        nonStopWords = [word for word in sortedWords if not word in stopwords.words('english')]
        stemmedWords = [stemmer.stem(word) for word in nonStopWords]
        # Quantitative data about the title
        shortestWord = len(nonStopWords[-1])
        longestWord = len(nonStopWords[0])
        meanWord = sum([len(word) for word in nonStopWords]) / len(nonStopWords)
        wordSD = (sum([(len(word) - meanWord) ** 2 for word in nonStopWords]) / len(nonStopWords)) ** .5
        vector1.append([(len(authorList), len(wordList), longestWord, shortestWord, meanWord, wordSD), genre])
        # Creates a vector storing whether each word in the dataset occurred in the title
        occurrences = tuple(1 if word in stemmedWords else 0 for word in wordBag)
        vector2.append([occurrences, genre])
    return (vector1, vector2)
class PostProcessor:

    def __init__(self):
        """Loads in Ed and Olivier's domainRules.json file, now converted to a
        big (7k+ entry) dict object"""
        # import domainRules.json
        from domain_rules import domain_rules
        from tldextract.tldextract import extract
        self.extract = extract
        from nltk.stem.porter import PorterStemmer as PorterStemmer
        self.domain_rules = domain_rules
        # create stemmer
        self.Stemmer = PorterStemmer()

    def rerank(self, url, text, results):
        """Processes classified results"""
        # check if the domain exists in domain_rules
        domain = self.extract(url)
        domain = domain.domain + "." + domain.suffix
        print "Extracted domain: {0}".format(domain)
        if domain in self.domain_rules:
            print "found domain"
            if "__ANY" in self.domain_rules[domain]:
                categories = self.domain_rules[domain]['__ANY']
                for cat in categories:
                    # stem it
                    matchers = [self.Stemmer.stem(cat)]
                    if "-" in matchers[0]:
                        matchers.append(matchers[0].replace("-", "_"))
                    for matcher in matchers:
                        for x in range(len(results)):
                            print "comparing {0} to {1}".format(matcher, results[x][0])
                            if matcher.lower() in results[x][0].lower():
                                print "{0} with score {1} contains {2}".format(results[x][0], results[x][1], matcher)
                                results[x][1] = results[x][1] + 1
                                print "score is now {0}".format(results[x][1])
        else:
            print "augmenting common words"
            # check for common words
            words = defaultdict(int)
            for result in results:
                tokens = re.findall("[a-z]+", result[0].lower())
                for token in tokens:
                    words[token] += 1
            # remove single entries
            for k, v in words.iteritems():
                if v > 1:
                    for x in range(len(results)):
                        matchers = [self.Stemmer.stem(k)]
                        if "-" in matchers[0]:
                            matchers.append(matchers[0].replace("-", "_"))
                        for matcher in matchers:
                            if matcher.lower() in results[x][0].lower():
                                print "{0} with score {1} contains {2} which has score {3}".format(results[x][0], results[x][1], matcher, v)
                                results[x][1] = results[x][1] + v
                                print "score is now {0}".format(results[x][1])
        return sorted(results, key=lambda x: x[1], reverse=True)
def search(ngrams, index, path, counts, id):
    print 'Searching {}'.format(path.split('/')[-1])
    # If 'Graph!' button was hit with nothing in box
    if ngrams == '':
        return None
    if len(ngrams) > 1:
        ngrams = ngrams.replace(', ', ',').encode('utf-8').lower().split(',')
    else:
        ngrams = ngrams.encode('utf-8').lower()
    ngram_count = {ngram: defaultdict(int) for ngram in ngrams}
    stemmer = PorterStemmer()
    for ngram in ngrams:
        transcripts = list()
        for word in ngram.split():
            # Get stem of word
            word = stemmer.stem(word)
            try:
                # Get set of books the word appears in
                transcripts.append(set([posting[0] for posting in index[word]]))
            except:
                # If the word is not in the index
                pass
        # Get the set of transcripts in which all words in the ngram appear
        transcripts = set.intersection(*transcripts) if len(transcripts) > 0 else set()
        for transcript in transcripts:
            year = int(transcript.split('-')[1])
            month = int(transcript.split('-')[2])
            day = int(transcript.split('-')[3])
            date = datetime(year, month, day)
            locs = []
            # For each transcript, get all of the locations where the words in the ngram appear
            for word in ngram.split():
                word = stemmer.stem(word)
                locs.extend([posting[1] for posting in index[word] if posting[0] == transcript])
            # Check if the words are next to each other
            # e.g. ngram = 'very high profit margin' and the positions of the words are [[2,10], [3], [4,8,12,29], [5]]
            # This line of code will shift the position of each word over by its distance from the
            # beginning of the ngram to produce new positions [[2,10], [2], [2,6,10,29], [2]]
            # Then take the intersection of these positions -- if it's not empty,
            # the ngram appears in the transcript
            locs = [set([int(pos) - i for pos in loc]) for i, loc in enumerate(locs)]
            ngram_count[ngram][date] += len(set.intersection(*locs))
    counts[id] = ngram_count
    print 'Finished searching {}'.format(path.split('/')[-1])
def get_bleu_similarity(reference_answers, student_answer):
    porter_stemmer = PorterStemmer()
    reference_answers_tokens = []
    for answer in reference_answers:
        reference_answers_tokens.append(map(lambda x: str(porter_stemmer.stem(x)), answer.split()))
    student_answer = map(lambda x: str(porter_stemmer.stem(x)), student_answer.split())
    weights = [0.25, 0.25]
    return bleu(student_answer, reference_answers_tokens, weights)
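# Usage sketch (an assumption, not part of the original snippet): the call above
# implies `bleu` is an NLTK-style scorer with a (candidate, references, weights)
# signature, so invocation would look roughly like:
# references = ["the cat sat on the mat", "a cat was sitting on the mat"]
# answer = "the cat is on the mat"
# score = get_bleu_similarity(references, answer)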
def stem(ts):
    global stemmer
    if stemmer is None:
        stemmer = PorterStemmer()
    if type(ts) is list:
        return [stemmer.stem(x) for x in ts]
    else:
        return stemmer.stem(ts)
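# Usage sketch (assumes the module-level `stemmer = None` this helper expects):
# stem("running")             -> "run"
# stem(["flies", "driving"])  -> ["fli", "drive"]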
class PropertyFinder(object):

    def __init__(self):
        self._stemmer = PorterStemmer()

    def __get_property_string_forms(self, property_subtree):
        words = stopwords.words('english')
        property_string_forms = set()
        property_string_forms.add((' '.join(property_subtree.leaves())).lower())
        property_string_forms.add((' '.join([self._stemmer.stem(word) for word in property_subtree.leaves()])).lower())
        property_string_forms.add((' '.join([word for word in property_subtree.leaves() if word not in words])).lower())
        property_string_forms.add((' '.join([self._stemmer.stem(word) for word in property_subtree.leaves() if word not in words])).lower())
        return property_string_forms

    def __fetch_from_wikibase(self, property_string):
        labels = DataBase().search_properties_name(property_string)
        if labels is None:
            return []
        return [label.lower() for label in labels]

    def __fetch_synonyms_and_hypernyms(self, property_string):
        words = set()
        synsets = wordnet.synsets(property_string)
        for synset in synsets:
            words.update([lemma.replace('_', ' ').lower() for lemma in synset.lemma_names()])
            for hypernym in synset.hypernyms():
                words.update([lemma.replace('_', ' ').lower() for lemma in hypernym.lemma_names()])
        return words

    def find_candidates(self, property_subtree):
        if not isinstance(property_subtree, ParentedTree):
            raise AttributeError
        candidates = set(self.__get_property_string_forms(property_subtree))
        new_candidates = set()
        for candidate in candidates:
            for label in self.__fetch_from_wikibase(candidate):
                new_candidates.add(label)
        candidates.update(new_candidates)
        new_candidates = set()
        for candidate in candidates:
            new_candidates.update(self.__fetch_synonyms_and_hypernyms(candidate))
        candidates.update(new_candidates)
        new_candidates = set()
        for candidate in candidates:
            for POS in [wordnet.ADJ, wordnet.ADV, wordnet.NOUN, wordnet.VERB]:
                morphy = wordnet.morphy(candidate, POS)
                if morphy is not None:
                    new_candidates.add(morphy)
        candidates.update(new_candidates)
        return candidates
def __weight_tokens(self, mid, nps, sentences, sent_id):
    st = PorterStemmer()
    sent_target = sentences[sent_id]
    token_id = [idx for idx, token in enumerate(sent_target.strip().split(" ")) if mid in token][0]
    sent_lengths = [len(s.split(" ")) for s in sentences]
    nps_base = {np: " ".join(st.stem(token) for token in np.split(" ")) for np in nps}
    nps_proc = {}
    for sent_idx, sent in enumerate(sentences):
        sent_stem = " ".join(st.stem(token) for token in sent.split(" "))
        for np_ori, np in nps_base.iteritems():
            if np_ori not in nps_proc:
                nps_proc[np_ori] = {}
            if "dist_sent" not in nps_proc[np_ori] or abs(sent_idx - sent_id) < nps_proc[np_ori]["dist_sent"]:
                # always update the info
                if np not in sent_stem:
                    continue
                np_idx = sent_stem.rindex(np)
                np_token_idx = len(sent_target[:np_idx].strip().split(" "))
                dist_start = len(sent_stem[:np_idx].strip().split(" "))
                dist_end = len(sent_stem[np_idx + len(np):].strip().split(" "))
                dist_sent = abs(sent_idx - sent_id)
                dist_token = -1
                if dist_sent == 0:
                    if mid in np_ori:
                        dist_token = 0
                    elif np_token_idx < token_id:
                        dist_token = token_id - np_token_idx - (len(np.split(" ")) - 1) - 1
                    elif np_token_idx > token_id:
                        dist_token = np_token_idx - token_id - 1
                elif sent_idx < sent_id:
                    dist_token = dist_end + sum(sent_lengths[sent_idx + 1:sent_id]) + token_id
                elif sent_idx > sent_id:
                    dist_token = (len(sent_target.strip().split(" ")) - 1 - token_id) + sum(sent_lengths[sent_id + 1:sent_idx]) + dist_start
                nps_proc[np_ori]["dist_sent"] = dist_sent
                nps_proc[np_ori]["dist_token"] = dist_token
            np_count = sent_stem.count(np)
            nps_proc[np_ori]["tf"] = (nps_proc[np_ori].get("tf") or 0) + np_count
    nps_weight = {}
    for np, vals in nps_proc.iteritems():
        term1 = self.__alpha * self.__gaussian_weight(vals["dist_token"], self.__var_d)
        term2 = self.__beta * self.__gaussian_weight(vals["dist_sent"], self.__var_s)
        term3 = self.__gamma * vals["tf"]
        nps_weight[np] = (term1 + term2 + term3) / (self.__alpha + self.__beta + self.__gamma)
    return nps_weight
def preProcessing(self, raw, fileName):
    cachedStopWords = stopwords.words("english")
    stemmer = PorterStemmer()
    text = ' '.join([word for word in raw.split() if word not in cachedStopWords])
    tokens = nltk.word_tokenize(text.lower())
    stemmed = []
    directory = os.getcwd() + "/pre-process/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    test = open(directory + re.sub('\.htm$', '', fileName) + ".txt", "w")
    for item in tokens:
        stemmed.append(stemmer.stem(item))
        test.write(stemmer.stem(item) + ' ')
    test.close()
    return stemmed
def search(dictionary_file, postings_file, query_file, output_file):
    """ Entry point to the program """
    stemmer = PorterStemmer()
    with open(dictionary_file, "rb") as dfile:
        dictionary = pickle.loads(dfile.read())
    with open(query_file, "rb") as qfile:
        with open(postings_file, "rb") as pfile:
            for query in qfile:
                print "query: ", query
                prefix = parser.to_polish_notation(query)
                print "prefix: ", prefix
                processed = []
                for token in prefix:
                    if parser.is_operand(token):
                        token = stemmer.stem(token).lower()
                    processed.append(token)
                print "processed: ", processed
                query = parser.process_query(processed)
                print "query: ", query
                result = execute_query(query, dictionary, pfile)
                print result
def createLDAModel(texts, n_topics, n_passes):
    """Generates an LDA model from an array of texts"""
    tokenizer = RegexpTokenizer(r'\w+')
    # create English stop words list
    en_stop = get_stop_words('en')
    # create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    texts_ = []
    # loop through document list
    for i in texts:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        # add tokens to list
        texts_.append(stemmed_tokens)
    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts_)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts_]
    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=n_topics, id2word=dictionary, passes=n_passes)
    return ldamodel
def tokenStem(words):
    words = words.strip('[').strip(']').lower()          # remove brackets and lowercase
    words = re.sub('[(){}<>:,.!?\'"]', '', words)        # strip punctuation
    stemmer = PorterStemmer()
    stops = stopwords.words('english')
    output = [stemmer.stem(token) for token in wordpunct_tokenize(words) if token not in stops]   # stem words
    return " ".join(output)                              # merge into a single string
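# Usage sketch on a hypothetical input (assumes NLTK's wordpunct_tokenize and
# stopwords are imported as above):
# tokenStem("[The foxes were running quickly]")  ->  roughly "fox run quickli"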
def main():
    rake = RAKE.Rake('SmartStoplist.txt')
    fp = open(input_file, 'r')
    text = fp.read()
    text = text_clean(text)
    """wnl=WordNetLemmatizer()
    text=' '.join([wnl.lemmatize(i.strip()) for i in nltk.word_tokenize(text)])"""
    porter_stemmer = PorterStemmer()
    text = ' '.join([porter_stemmer.stem(i.strip()) for i in nltk.word_tokenize(text)])
    keywords = rake.run(text)
    # print keywords
    with open(key_score_file, 'wb') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['KEYWORD', 'SCORE'])
        for row in keywords:
            if row[1] > 0:
                csv_out.writerow(row)
    unibitrigram_list = []
    unibitrigram_list = generate_unibitrigrams(key_score_file)
    # print unibitrigram_list
    # ngram_freq=[]
    ngram_freq = Counter(unibitrigram_list)
    sorted_ngram_freq = sorted(ngram_freq.items(), key=lambda x: x[1], reverse=True)
    print ngram_freq
    with open('bcom_ngramfr_stem.csv', 'wb') as nf_csv:
        csv_wr = csv.writer(nf_csv)
        for item in sorted_ngram_freq:
            if item[0] != '':
                csv_wr.writerow(item)
def Tokenize(TextData):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = list()
    # create English stop words list
    en_stop = get_stop_words('en')
    # create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    # clean and tokenize document string
    raw = TextData.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    tokens = stemmed_tokens
    TOKENIZEDTEXT_FILE = path.join(os.pardir, "Resources/TokenizedTextFiles/Personal-Narration/Unbroken - Motivational Video.txt")
    fp = open(TOKENIZEDTEXT_FILE, "w")
    print(TOKENIZEDTEXT_FILE)
    # pickle.dump(tokens, fp)
    fp.write(str(tokens))
    fp.close()
def get_stemmed_separate(indeed_reviews_db, glassdoor_reviews_db):
    separate = get_separate_reviews(indeed_reviews_db, glassdoor_reviews_db)
    stemmer = PorterStemmer()
    stemmed_reviews = []
    for review in separate:
        stemmed_reviews.append(' '.join([stemmer.stem(word)
                                         for sent in sent_tokenize(review)
                                         for word in word_tokenize(sent.lower())]))
    return stemmed_reviews
def create_bag_of_words(self):
    """Create a BagOfWords for the document.

    Performs named entity recognition, stemming and stopword removal.
    """
    stemmer = PorterStemmer()
    nes = []
    tagged_text = self.ner_tagger.get_entities(self.content.encode('utf-8'))
    for key in tagged_text.keys():
        if key != 'O':
            nes += tagged_text[key]
    for n in nes:
        self.bag_of_words.add_stem_word(n, n)
        Document.vocabulary.add_stem_word(n, n)
    wo_named = re.sub('|'.join(nes), '', self.content)
    words = re.findall(r'\w+', wo_named, flags=re.UNICODE | re.LOCALE)
    for wordo in words:
        word = wordo.rstrip(r'\n')
        if word.lower() not in stopwords:
            w = stemmer.stem(word.lower())
            self.bag_of_words.add_stem_word(w, word)
            Document.vocabulary.add_stem_word(w, word)
    for word in self.bag_of_words.get_all_words():
        if word in Document.document_word_frequency:
            Document.document_word_frequency[word] += 1
        else:
            Document.document_word_frequency[word] = 1
def evaluate(query):
    global DICTIONARY
    word_score = {}
    seek_pos = open(postings_file, 'r')
    seek_pos.seek(0, 0)
    words = query.split()
    stemmer = PorterStemmer()
    words = [element.lower() for element in words]
    for item in words:
        word = stemmer.stem(item)
        if word not in word_score:
            if word in DICTIONARY:
                seek_pointer = DICTIONARY[word]
                seek_pos.seek(int(seek_pointer))
                line = seek_pos.readline()
                seek_pos.seek(0, 0)
                post_list = line.split()
                score = score_documents(post_list)
                word_score[word] = score
            else:
                # not encountered, score of 0
                word_score[word] = []
        # else duplicate, skip word
    result = score_query(word_score)
    return result
def stemText(s):
    ps = PorterStemmer()
    stemmedText = []
    for word in s:
        stemmedText.append(ps.stem(word))
    return stemmedText
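# Usage sketch: the function expects an iterable of tokens, not a raw string.
# stemText(["cats", "played", "playing"])  ->  ['cat', 'play', 'play']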
def clean_split_stem(rawstring):
    stop = stopwords.words('english')
    out_str = rawstring.split()
    porter = PorterStemmer()
    out_str = [porter.stem(word) for word in out_str]
    out_str = [word for word in out_str if word not in stop]
    return out_str
def lda(data):
    data = get_only_text(data)
    only_tweet = data
    length = len(only_tweet)
    length = min(20, length)
    for i in xrange(0, length):
        print i
        print only_tweet[i]
    return  # debugging early exit: the LDA pipeline below is currently unreachable

    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = PorterStemmer()
    length = len(only_tweet)
    length = min(20, length)
    total_texts = []
    for i in xrange(0, length):
        print only_tweet[i]
        print
        to_lower = only_tweet[i].lower()
        tokens = tokenizer.tokenize(to_lower)
        stopped_tokens = [k for k in tokens if not k in en_stop]
        texts = [p_stemmer.stem(k) for k in stopped_tokens]
        total_texts.append(texts)
    dictionary = corpora.Dictionary(total_texts)
    corpus = [dictionary.doc2bow(text) for text in total_texts]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20)
    result = ldamodel.print_topics(num_topics=2, num_words=1)
    for i in result:
        print i
def PreProcessing(line):
    unigrams = line.split()
    word_list = [word.lower() for word in unigrams if word.lower() not in stopwords]
    st = PorterStemmer()
    word_list = [st.stem(word) for word in word_list if word]
    vocab = [word for word in word_list if word not in stopwords]
    return vocab
def preprocess_text(raw):
    lower_raw = raw.lower()
    tokens = nltk.word_tokenize(lower_raw)
    filtered_tokens = [word for word in tokens if word not in stopwords.words('english')]
    port = PorterStemmer()
    # This extracts the important root of a word, e.g. parsing -> pars
    stemmed = [port.stem(item) for item in filtered_tokens]
    return stemmed
def cleanData(doc_list):
    # tokenize
    tokens = []
    for doc in doc_list:
        text_l = []
        ws_split = re.split(split_on, doc)
        for w in ws_split:
            # remove URLs and empty strings
            if not (url_pat.match(w) or w == u''):
                text_l.append(w)
        # rejoin text and 'properly' tokenize
        text = " ".join(text_l)
        text_l = nltk.word_tokenize(text)
        # stop words
        text_l = [w.lower() for w in text_l if w.lower() not in stops]
        # stemming
        p_stemmer = PorterStemmer()
        text_l = [p_stemmer.stem(t) for t in text_l]
        # append cleaned text to list
        tokens.append(text_l)
    return tokens
def tokenize(docs, norm, stop, ne, central_per=None, central_loc=None, central_org=None):
    if stop:
        with open("stopwords.txt", "r") as f:
            sw = set([word.strip().decode("utf-8").lower() for word in f])
    if norm == "stem":
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()
    all_toks = []
    for doc in docs:
        toks = []
        for sent in doc:
            if norm == "lemma":
                stoks = [unicode(tok.lem).lower() for tok in sent]
            elif norm == "stem":
                stoks = [stemmer.stem(unicode(tok).lower()) for tok in sent]
            else:
                stoks = [unicode(tok).lower() for tok in sent]
            if stop:
                toks.extend([tok for tok in stoks if tok not in sw])
            else:
                toks.extend(stoks)
        toks = [tok for tok in toks if len(tok) < 50]
        #if len(toks) == 0: continue
        string = u" ".join(toks).encode("utf-8")
        #print string
        all_toks.append(string)
    return all_toks
def text_process(text):
    '''
    Takes in a string of text, then performs the following:
    1. Tokenizes and removes punctuation
    2. Removes stopwords
    3. Stems
    4. Returns the cleaned tokens joined back into a single string
    '''
    if pd.isnull(text):
        return []
    # Tokenize
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed = tokenizer.tokenize(text)
    # Removing any stopwords
    text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('english')]
    # Stemming
    porterStemmer = PorterStemmer()
    text_processed = [porterStemmer.stem(word) for word in text_processed]
    try:
        text_processed.remove('b')
    except:
        pass
    return " ".join(text_processed)
class StemmerTokenizer(object):
    def __init__(self):
        self.stemmer = PorterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(t) for t in word_tokenize(doc)]
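# Usage sketch (an assumption, not part of the original snippet): a callable
# tokenizer like this is typically plugged into a scikit-learn vectorizer.
# from sklearn.feature_extraction.text import CountVectorizer
# vec = CountVectorizer(tokenizer=StemmerTokenizer())
# X = vec.fit_transform(["The cats are running", "A cat ran"])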
def get_data():
    from collections import defaultdict as dd
    from nltk import word_tokenize
    from nltk.stem.porter import PorterStemmer
    import pickle

    stemmer = PorterStemmer()
    data = pickle.load(open("../data/person_pub_data.pkl", "rb"))
    title_df = dd(int)
    venue_df = dd(int)
    aff_df = dd(int)
    for d in data:
        for item in d["pubs"]:
            for w in word_tokenize(item["title"].lower()):
                title_df[stemmer.stem(w)] += 1
            if item["venue"]:
                for w in word_tokenize(item["venue"].lower()):
                    venue_df[stemmer.stem(w)] += 1
            for a in item["authors"].values():
                if a["aff"]:
                    for w in word_tokenize(a["aff"].lower()):
                        aff_df[stemmer.stem(w)] += 1
    with open("aff_vocab.pkl", "wb") as f_out:
        pickle.dump(list(aff_df.items()), f_out)
    with open("venue_vocab.pkl", "wb") as f_out:
        pickle.dump(list(venue_df.items()), f_out)
    with open("title_vocab.pkl", "wb") as f_out:
        pickle.dump(list(title_df.items()), f_out)

    pub_author_map = []
    authors_map = []
    for d in data:
        labels = dd(list)
        for item in d["pubs"]:
            labels[item["label"]].append(item["authors"][item["offset"]]["idx"])
            for a in item["authors"].values():
                pub_author_map.append((item["idx"], a["idx"]))
        for l in labels:
            for i in range(len(labels[l])):
                for j in range(i + 1, len(labels[l])):
                    authors_map.append((labels[l][i], labels[l][j]))
    with open("pub_author_map.pkl", "wb") as f_out:
        pickle.dump(pub_author_map, f_out)
    with open("authors_map.pkl", "wb") as f_out:
        pickle.dump(authors_map, f_out)

    attr = [None for i in range(25102)]
    for d in data:
        for pub in d["pubs"]:
            title, venue = pub["title"], pub["venue"]
            if title:
                title = [stemmer.stem(w) for w in word_tokenize(pub["title"].lower())]
            if venue:
                venue = [stemmer.stem(w) for w in word_tokenize(pub["venue"].lower())]
            attr[pub["idx"]] = ("pub", title, venue)
            for a in pub["authors"].values():
                name, aff = a["name"], a["aff"]
                if name:
                    name = [stemmer.stem(w) for w in word_tokenize(a["name"].lower())]
                if aff:
                    aff = [stemmer.stem(w) for w in word_tokenize(a["aff"].lower())]
                attr[a["idx"]] = ("author", name, aff)
    with open("attr.pkl", "wb") as f_out:
        pickle.dump(attr, f_out)
def LDA_Topic_Clustering(corp, reading_weight, new_model, class_num, LDA_passes, x, y): # ------------------- 1 Stop words---------------------- #raw = re.sub("\d+","",raw) #raw = raw.replace("’","'") English_stop_words = get_stop_words('en') My_list = [ ".'", ".']", "]']", "\'\'", 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', '://', 'http', 'www', 'com', 'don', 'pre', 'paid', 'must', 'tcan', 'twhen', 'twhat', 'via', 'are', 'will', 'said', 'can', 'near', 'and', 'the', 'i', 'a', 'to', 'it', 'was', 'he', 'of', 'in', 'you', 'that', 'but', 'so', 'on', 'up', 'we', 'all', 'for', 'out', 'me', 'him', 'they', 'says', 'got', 'then', 'there', 'no', 'his', 'as', 'with', 'them', 'she', 'said', 'down', 'see', 'had', 'when', 'about', 'what', 'my', 'well', 'if', 'at', 'come', 'would', 'by', 'one', 'do', 'be', 'her', "didn't", 'jim', 'get', "don't", 'time', 'or', 'right', 'could', 'is', 'went', "warn't", "ain't", 'good', 'off', 'over', 'go', 'just', 'way', 'like', 'old', 'around', 'know', 'de', 'now', 'this', 'along', 'en', 'done', 'because', 'back', "it's", 'tom', "couldn't", 'ever', 'why', 'going', 'little', 'some', 'your', 'man', 'never', 'too', 'more', 'say', 'says', 'again', 'how', 'here', 'tell', 'message', 'posted', 'need', 'needs', 'someone', 'government', 'intelligence', 'report' ] stoplist_1 = set( 'a b c d e f g h i j k l m n o p q r s t u v w x y z 1 2 3 4 5 6 7 8 9 0' .split(' ')) # Create a set of enlighs alphabets stoplist_2 = set(English_stop_words) stoplist_3 = set( 'es la . , . <br> <br><br> br > : >< < .< { } [ ] ( ) ,\'\' ." ` " ? ! - \u201d< \u201d .\u201d \u201d u201d \u2019 \xe9 !< >!' .split(' ')) # Create a set #stoplist_33 = set(' .' .'] '.split(' ')) # Create a set stoplist_4 = set(My_list) stoplist = stoplist_1 | stoplist_2 | stoplist_3 | stoplist_4 # ------------------- 2 tokenizer ---------------------- stopped_tokens = [ [ word for word in WordPunctTokenizer().tokenize(str(document).lower()) if ((word not in stoplist) & (word != u'.\u201d<') & (word != u'.\u201d') & (word != u'\u201c') & (len(word) > 2) & (is_int(word) == False)) ] # & (is_int(word) == False) & (len(word) > 3) & (len(word) == len(word.strip({0,1,2,3,4,5,6,7,8,9}))) )] #(re.search('\d+', word) == False) ) ] for document in corp ] # ------------------- 3 Stemming and Count word frequencies ------------------- p_stemmer = PorterStemmer() stemmer = {} texts = [] texts_set = [] #set() de_stemmer = {} for stopped_token in stopped_tokens: stemmed_texts = [p_stemmer.stem(i) for i in stopped_token] texts_set += [stemmed_texts] #texts_set = stopped_tokens # Without stemmer for j in range(0, len(texts_set)): for i in range(0, len(texts_set[j])): if not texts_set[j][i] in de_stemmer: de_stemmer[texts_set[j][i]] = stopped_tokens[j][ i] # Save it later for de_stemmer! 
frequency = defaultdict(int) for text in texts_set: for token in text: frequency[token] += 1 # Only keep words that appear more than once processed_corpus = [[token for token in text if frequency[token] > 0] for text in texts_set] #print processed_corpus #return 0 # ------------------- 4 Dictionary and TF-IDF Vectors ------------------- my_dictionary = corpora.Dictionary(processed_corpus) ids2words = my_dictionary.token2id bow_corpus = [my_dictionary.doc2bow(text) for text in processed_corpus] # ------------------- Add user interactions weights ---------------------- i = 0 new_corp = [] for each_doc in bow_corpus: new = [] for each_word in each_doc: new.append( (each_word[0], each_word[1] * (1 + reading_weight[i])) ) # new.append(each_word[0], float(each_word[1]) * (1+reading_weight[i])) j += 1 new_corp.append(new) i += 1 # train the model Text_tfidf = models.TfidfModel(new_corp) # all_vectors = Text_tfidf[new_corp] #bow_corpus] # Gives representative vectors # print "\n bow_corpus: ", bow_corpus # print "\n new weighted Docs: ", new_corp # print "\n TF-IDF vectors: ", all_vectors # print "\n TF-IDF size: ", len(all_vectors) # print "\n TF-IDF zero: ", all_vectors[0] # for doc in all_vectors: # print "each: ", doc # print "\n The End " # ------------------- 5 LDA Model and ------------------- if os.path.isfile("./LDAmodels/LDAmodel_dataset" + str(x) + "_P" + str(y) + "_class" + str(class_num) + ".lda") == 0 or ( new_model == 1): # Do you want to train the model? print "\n LDA Model Training..." Text_lda = models.LdaModel(new_corp, id2word=my_dictionary, num_topics=class_num, passes=LDA_passes) # with out TF-IDF model Text_lda.save("./LDAmodels/LDAmodel_dataset" + str(x) + "_P" + str(y) + "_class" + str(class_num) + ".lda") # same for tfidf, lsa, ... else: print "\n LDA Model Loading..." Text_lda = models.LdaModel.load("./LDAmodels/LDAmodel_dataset" + str(x) + "_P" + str(y) + "_class" + str(class_num) + ".lda") # ------------------- 6 Document Vectors and Classification ------------------- counter = [] doc_topics = [] for each in range(0, class_num): counter.append(0) for index, document in enumerate( all_vectors): # Each documents probability to calss # infer topic distribution for each document doc_topics.append(Text_lda.get_document_topics( document)) # , minimum_probability=0.19) # No_Topic = 1 new_list = [] for each_topic in doc_topics[-1]: new_list.append(each_topic[1]) t_index, value = max(enumerate(new_list), key=operator.itemgetter(1)) # print "\n index: ", t_index # print "\n doc_topics: ", doc_topics counter[t_index] += 1 # ------------------- 7 Create a bag for topic keywords ------------------- topicWordTags = [] topicWordTags2 = [] topicWordTags3 = [] finalBag = [] for each in range(0, class_num + 1): topicWordTags.append(set()) topicWordTags2.append([]) topicWordTags3.append([]) #set()) finalBag.append('') # print "\n Topic word empty: ", topicWordTags # ------------------- 8 Topic summary output ------------------- output_topics = Text_lda.show_topics( num_topics=class_num, num_words=15, formatted=False) # To review topics and terms individually return finalBag, topicWordTags, topicWordTags2, topicWordTags3, de_stemmer, ids2words, all_vectors, Text_lda, my_dictionary, Text_tfidf, output_topics, de_stemmer, doc_topics
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
for i in range(0, dataset.shape[0]):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
x = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

# Splitting the dataset into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
def Stem_words(self, words):
    stemmer = PorterStemmer()
    Stemmed_words = [stemmer.stem(w) for w in words]
    return " ".join(Stemmed_words)
sentences = sent_tokenize(data)

from nltk import word_tokenize
token = word_tokenize(data)
words = [word for word in token if word.isalpha()]

from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words)

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
words = [word for word in words if not word in stop_words]
stemmed = [porter.stem(word) for word in words]

from wordcloud import WordCloud
import matplotlib.pyplot as plt
wordcloud = WordCloud(width=1000, height=500).generate(" ".join(stemmed))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

str1 = ''.join(stemmed)
type(str1)

# bigram and trigram
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
        line__ = [data_.get(h) for h in header]
        data.append(line__)
    df = pd.DataFrame(data, columns=header)
    return df


df = open_file('Cell_Phones_and_Accessories_5.json')
text_list = df['reviewText'].values.tolist()

ttagged_list = []
ptagged_list = []
ltagged_list = []
stagged_list = []
for text in text_list:
    tokens = nltk.word_tokenize(text)
    portstemmed = [stemmer_porter.stem(token) for token in tokens]
    lancasterstemmed = [stemmer_lancaster.stem(token) for token in tokens]
    SBstemmed = [stemmer_snowball.stem(token) for token in tokens]
    ptagged = nltk.pos_tag(portstemmed)
    ltagged = nltk.pos_tag(lancasterstemmed)
    stagged = nltk.pos_tag(SBstemmed)
    ptagged_list.append(ptagged)
    ltagged_list.append(ltagged)
    stagged_list.append(stagged)

ptagged_list = np.expand_dims(np.asarray(ptagged_list), 0)
ltagged_list = np.expand_dims(np.asarray(ltagged_list), 0)
stagged_list = np.expand_dims(np.asarray(stagged_list), 0)
print(ptagged_list.shape)
print(ltagged_list.shape)
print(stagged_list.shape)
def lda(user_last_read_article): #word_tokenizing global sent_to_words def sent_to_words(sentences): for sentence in sentences: yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) data_words = list(sent_to_words(data)) p_stemmer = PorterStemmer() en_stop = get_stop_words('en') data_lemmatized = [] for i in data_words: tokens = i stopped_tokens = [i for i in tokens if not i in en_stop] stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens] data_lemmatized.append(' '.join(stemmed_tokens)) global vectorizer,data_vectorized,lda_model,lda_output,best_lda_model if training == 1: vectorizer = CountVectorizer(analyzer='word', #min_df=10, # minimum reqd occurences of a word stop_words='english', # remove stop words lowercase=True, # convert all words to lowercase token_pattern='[a-zA-Z0-9]{3,}', # num chars > 3 # max_features=30000, # max number of uniq words ) data_vectorized = vectorizer.fit_transform(data_lemmatized) #Building LDA model lda_model = LatentDirichletAllocation(n_components=8, # Number of topics max_iter=20, # Max learning iterations learning_method='online', random_state=100, # Random state batch_size=2, # n docs in each learning iter evaluate_every = -1, # compute perplexity every n iters, default: Don't n_jobs = -1, # Use all available CPUs ) lda_output = lda_model.fit_transform(data_vectorized) search_params = {'n_components': [3,5,7,9], 'learning_decay': [.5, .7, .9]} # Init the Model lda = LatentDirichletAllocation() # Init Grid Search Class model = GridSearchCV(lda, param_grid=search_params) # Do the Grid Search model.fit(data_vectorized) # Printing params for best model among all the generated ones # Best Model best_lda_model = model.best_estimator_ outfile = open('vectorizer.pickled','wb') pickle.dump(vectorizer,outfile) outfile.close() outfile = open('data_vectorized.pickled','wb') pickle.dump(data_vectorized,outfile) outfile.close() outfile = open('lda_output.pickled','wb') pickle.dump(lda_output,outfile) outfile.close() outfile = open('lda_model.pickled','wb') pickle.dump(lda_model,outfile) outfile.close() outfile = open('best_lda_model.pickled','wb') pickle.dump(best_lda_model,outfile) outfile.close() else : infile = open('vectorizer.pickled','rb') vectorizer = pickle.load(infile) infile.close() infile = open('data_vectorized.pickled','rb') data_vectorized = pickle.load(infile) infile.close() infile = open('lda_output.pickled','rb') lda_output = pickle.load(infile) infile.close() infile = open('lda_model.pickled','rb') lda_model = pickle.load(infile) infile.close() infile = open('best_lda_model.pickled','rb') best_lda_model = pickle.load(infile) infile.close() #dominant topic in each doc # Create Document - Topic Matrix lda_output = best_lda_model.transform(data_vectorized) # column names topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)] # index names docnames = ["Doc" + str(i) for i in range(len(data))] # Make the pandas dataframe df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames) # Get dominant topic for each document dominant_topic = np.argmax(df_document_topic.values, axis=1) df_document_topic['dominant_topic'] = dominant_topic df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents") # defining topic keywords # Topic-Keyword Matrix df_topic_keywords = pd.DataFrame(best_lda_model.components_) # Assign Column and Index df_topic_keywords.columns = vectorizer.get_feature_names() df_topic_keywords.index = topicnames # View 
df_topic_keywords.head() #get top 15 keywords for each doc # Show top n keywords for each topic def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20): keywords = np.array(vectorizer.get_feature_names()) topic_keywords = [] for topic_weights in lda_model.components_: top_keyword_locs = (-topic_weights).argsort()[:n_words] topic_keywords.append(keywords.take(top_keyword_locs)) return topic_keywords topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15) #Given a piece of text, predicting the topic in document def predict_topic(text): global sent_to_words mytext_2 = list(sent_to_words(text)) #print(mytext_2) mytext_3 =[] for i in mytext_2 : tokens=i stopped_tokens = [i for i in tokens if not i in en_stop] #print(stopped_tokens) stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens] #print(stemmed_tokens) mytext_3.append(' '.join(stemmed_tokens)) #print(mytext_3) mytext_4 = vectorizer.transform(mytext_3) topic_probability_scores = best_lda_model.transform(mytext_4) topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), :].values.tolist() return topic, topic_probability_scores #Given a piece of Text, predicting the documents that are related to it most closely from sklearn.metrics.pairwise import euclidean_distances def similar_documents(text, doc_topic_probs, documents = data, top_n=2, verbose=False): topic, x = predict_topic(text) dists = euclidean_distances(x.reshape(1, -1), doc_topic_probs)[0] doc_ids = np.argsort(dists)[:top_n] return doc_ids, np.take(documents, doc_ids) arr=[] arr.append(user_last_read_article) doc_ids, docs = similar_documents(text=arr, doc_topic_probs=lda_output, documents = data, top_n=2, verbose=True) result_api.append(doc_ids[0]) result_api.append(doc_ids[1]) print(result_api)
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

# Creating a Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
class BidafQaPredictor(Predictor):
    """
    Converts the QA JSON into an instance that is expected by the BiDAF model.
    """

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._stemmer = PorterStemmer()
        self._stop_words = set(stopwords.words('english'))

    @overrides
    def _json_to_instance(self,  # type: ignore
                          json_dict: JsonDict) -> Instance:
        # pylint: disable=arguments-differ
        """
        Expects JSON that looks like ``{"question": {"stem": "..."}, "para": "..."}``.
        """
        question_text = json_dict["question"]["stem"]
        passage_text = json_dict["para"]
        return self._dataset_reader.text_to_instance(question_text, passage_text)

    @overrides
    # def predict_json(self, inputs: JsonDict, cuda_device: int = -1):
    def predict_json(self, inputs: JsonDict):
        instance = self._json_to_instance(inputs)
        # outputs = self._model.forward_on_instance(instance, cuda_device)
        outputs = self._model.forward_on_instance(instance)
        json_output = inputs
        span_str = outputs["best_span_str"]
        # If the file has an answer key, calculate the score
        if "answerKey" in json_output:
            answer_choices = json_output["question"]["choices"]
            # Score each answer choice based on its overlap with the predicted span.
            for choice in answer_choices:
                choice_text = choice["text"]
                choice_score = self._overlap_score(choice_text, span_str)
                choice["score"] = choice_score
            # Get the maximum answer choice score
            max_choice_score = max(answer_choices, key=itemgetter("score"))["score"]
            # Collect all answer choices with the same score
            selected_answers = [choice["label"] for choice in answer_choices
                                if choice["score"] == max_choice_score]
            answer_key = json_output["answerKey"]
            if answer_key in selected_answers:
                question_score = 1 / len(selected_answers)
            else:
                question_score = 0
            json_output["selected_answers"] = ",".join(selected_answers)
            json_output["question_score"] = question_score
        json_output["best_span_str"] = span_str
        return sanitize(json_output)

    def _overlap_score(self, answer: str, predicted_span: str) -> float:
        """
        Scores the predicted span against the correct answer by calculating the
        proportion of the stopword-filtered stemmed words in the correct answer
        covered by the predicted span.

        :param answer: correct answer
        :param predicted_span: predicted span
        :return: overlap score
        """
        answer_tokens = self._get_tokens(answer)
        # Degenerate case: if the answer only has stopwords, we cannot score it.
        if not len(answer_tokens):
            return 0.0
        span_tokens = self._get_tokens(predicted_span)
        overlap = [tok for tok in answer_tokens if tok in span_tokens]
        score = len(overlap) / len(answer_tokens)
        return score

    def _get_tokens(self, phrase: str) -> List[str]:
        # Get the stopword-filtered lowercase stemmed tokens from the input phrase
        return [self._stemmer.stem(word) for word in word_tokenize(phrase)
                if word.lower() not in self._stop_words]
dataset = pd.read_csv('spam.csv', delimiter=',', encoding="ISO-8859-1", engine='python')
dataset = dataset.drop(dataset.columns[[2, 3, 4]], axis=1)
dataset['v1'] = dataset['v1'].map({'ham': 0, 'spam': 1}).astype(int)

# Cleaning the texts
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
for i in range(0, 5572):
    review = re.sub('[^a-zA-Z]', ' ', dataset['v2'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 0].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in older scikit-learn releases
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
# Tokenize before stemming and lemmatization
tokens = nltk.word_tokenize(text)

# Stemming: extract word stems
# Import the Porter and Lancaster stemmer modules
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

# Instantiate a PorterStemmer object
porter_stemmer = PorterStemmer()
# Instantiate a LancasterStemmer object
lancaster_stemmer = LancasterStemmer()

# Create stemmed_list and lancaster_list to hold the PorterStemmer and LancasterStemmer results
stemmed_list = []
lancaster_list = []
for token in tokens:
    stemmed_list.append(porter_stemmer.stem(token))
    lancaster_list.append(lancaster_stemmer.stem(token))
print("Stemming results:")
print("1. PorterStemmer:", stemmed_list)
print("2. LancasterStemmer:", lancaster_list)

# Lemmatization
# NLTK's lemmatization is based on WordNet, so import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
# Create lem_list to hold the lemmatization results
lem_list = []
for token in tokens:
    lem_list.append(wordnet_lemmatizer.lemmatize(token))
print("Lemmatization results:")
try:
    temp = ""
    for i in doc_set:
        # clean and tokenize document string
        if data[channels][i]['title'] != "No Title":
            raw = data[channels][i]['title'].lower()
            tokens = tokenizer.tokenize(raw)
            # remove stop words from tokens
            stopped_tokens = [word for word in tokens if not word in en_stop]
            # stem tokens
            stemmed_tokens = [p_stemmer.stem(word) for word in stopped_tokens]
            # add tokens to list
            texts.append(stemmed_tokens)
    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=20)
    ldaAns = ldamodel.print_topics(num_topics=3, num_words=3)
        appendFile.write("\n")
        appendFile.close()

no_punctuation = []
for word in no_stopwords:
    if word.isalpha():
        no_punctuation.append(word)
        appendFile = open('/pfs/out/no_punctuation.txt', 'a', encoding='utf-8')
        appendFile.write(word)
        appendFile.write("\n")
        appendFile.close()

port_stem = PorterStemmer()
stemmed = []
for word in no_punctuation:
    stemmed_word = port_stem.stem(word)
    stemmed.append(stemmed_word)
    appendFile = open('/pfs/out/stemmed.txt', 'a', encoding='utf-8')
    appendFile.write(stemmed_word)
    appendFile.write("\n")
    appendFile.close()

lemmatizer = WordNetLemmatizer()
lemmatized = []
for word in no_punctuation:
    l_text = lemmatizer.lemmatize(word)
    lemmatized.append(l_text)
    appendFile = open('/pfs/out/lemmatized.txt', 'a', encoding='utf-8')
    appendFile.write(l_text)
    appendFile.write("\n")
    appendFile.close()
    # Converting the entire review into lower case
    review = review.lower()
    # Tokenizing the review by words
    review_words = review.split()
    # Removing the stop words
    review_words = [word for word in review_words if not word in set(stopwords.words('english'))]
    # Stemming the words
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review_words]
    # Joining the stemmed words
    review = ' '.join(review)
    # Creating a corpus
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:, 1].values

# Creating a pickle file for the CountVectorizer
pickle.dump(cv, open('cv-transform.pkl', 'wb'))
dataset = pd.read_csv("Restaurant_Reviews.tsv", delimiter="\t") from nltk.corpus import stopwords from nltk.stem.porter import PorterStemmer ps = PorterStemmer() dataset['Review'][0] clean_review = [] for i in range(1000): Review = dataset['Review'][i] Review = re.sub('[^a-zA-Z]', ' ', Review) Review = Review.lower() Review = Review.split() Review = [ ps.stem(token) for token in Review if not token in stopwords.words('english') ] Review = ' '.join(Review) clean_review.append(Review) from sklearn.feature_extraction.text import CountVectorizer cv = CountVectorizer(max_features=4000) X = cv.fit_transform(clean_review) X = X.toarray() y = dataset['Liked'].values from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) print(cv.get_feature_names())
class TextConfidenceGenerator(object): def __init__(self, input_file_path, output_file_path): self.input_file_path = input_file_path self.output_file_path = output_file_path self.l2_1_list = [] self.l4_1_list = [] self.l5_1_list = [] self.l6_1_list = [] self.l2_2_list = [] self.l4_2_list = [] self.l5_2_list = [] self.l6_2_list = [] self.l7_list = [] self.title_match_list = [] # added, checking s_itemname in r_title # Added For UI self.mpn_check_list = [] self.upc_check_list = [] self.asin_check_list = [] self.gtin_check_list = [] self.text_confidence_score = [] self.stemmer = PorterStemmer() def check_field_contains(self, superset, subset): try: if type(superset) == float or type(subset) == float: return 0 elif superset == '' or subset == '': return 0 elif (str(superset).lower().find(str(subset).lower()) == -1) & ( str(subset).lower().find(str(superset).lower()) == -1): return 0 else: return 1 except: return 0 def check_field_variants(self, superset, variant_info): try: if (type(variant_info) == float) or variant_info == '': return 0 subsetlist = re.split(r"\s*\^\s*", variant_info) agg_score = 0 for x in subsetlist: agg_score = agg_score + self.check_field_contains( superset, str(x)) return agg_score except: return 0 def check_category_in_field(self, superset, category): if (type(superset) == float or superset == '') or (type(category) == float or category == ''): return 0 categories = category.split('>') superset = superset.lower() if (type(superset) == float) or (type(categories) == float): return 0 conf = 0 for cat in categories: stemmed_cat = self.stemmer.stem(cat) if (stemmed_cat in superset) or (superset in cat.lower()): conf = 1 return conf def check_field_in_another_field(self, superset, field): if (type(superset) == float or superset == '') or (type(field) == float or field == ''): return 0 field = field.lower() stemmed_field = self.stemmer.stem(field) superset = superset.lower() if stemmed_field in superset: return 1 else: field_subset = field.split() field_subset_match_count = 0 for part in field_subset: if (part in superset) or (self.stemmer.stem(part) in superset): field_subset_match_count += 1 field_in_another_field_weight = field_subset_match_count / len( field_subset) field_in_another_field_weight = "%.4f" % field_in_another_field_weight return (float(field_in_another_field_weight)) def check_unique_identifier_match(self, search_id, result_id): search_id = str(search_id) result_id = str(result_id) if (type(search_id) == float or search_id == '') or (type(result_id) == float or result_id == ''): return 0 elif search_id in result_id or result_id in search_id: # handles cases of multiple asin/upc/mpn/gtin return 1 else: return 0 def process(self, row): product_sku = row['s_sku'] l2_1 = self.check_category_in_field(row['r_item_name'], row['s_category']) self.l2_1_list.append(l2_1) l4_1 = self.check_field_variants(row['r_item_name'], row['s_variant_info']) self.l4_1_list.append(l4_1) l5_1 = self.check_field_in_another_field(row['r_item_name'], row['s_manufacturer']) self.l5_1_list.append(l5_1) l6_1 = self.check_field_contains(row['r_item_name'], row['s_mpn']) self.l6_1_list.append(l6_1) l2_2 = self.check_category_in_field(row['r_description'], row['s_category']) self.l2_2_list.append(l2_2) l4_2 = self.check_field_variants(row['r_description'], row['s_variant_info']) self.l4_2_list.append(l4_2) l5_2 = self.check_field_in_another_field(row['r_description'], row['s_manufacturer']) self.l5_2_list.append(l5_2) l6_2 = self.check_field_contains(row['r_description'], row['s_mpn']) 
self.l6_2_list.append(l6_2) l7 = self.check_field_contains(row['r_description'], row['s_item_name']) self.l7_list.append(l7) title_match = self.check_field_in_another_field( row['r_item_name'], row['s_item_name']) self.title_match_list.append(title_match) # additional checks mpn_check = self.check_unique_identifier_match(row['s_mpn'], row['r_mpn']) upc_check = self.check_unique_identifier_match(row['s_upc'], row['r_upc']) asin_check = self.check_unique_identifier_match( row['s_asin'], row['r_asin']) gtin_check = self.check_unique_identifier_match( row['s_gtin'], row['r_gtin']) self.mpn_check_list.append(mpn_check) self.upc_check_list.append(upc_check) self.asin_check_list.append(asin_check) self.gtin_check_list.append(gtin_check) total_row_confidence = l2_1 + l4_1 + l5_1 + l6_1 + l2_2 + l4_2 + l5_2 + l6_2 + l7\ + title_match + mpn_check + upc_check + gtin_check + asin_check # total_row_confidence = l2_1 + l4_1 + l5_1 + l6_1 + title_match self.text_confidence_score.append(total_row_confidence) def main(self): self.cpi_conf_df = pd.read_csv(self.input_file_path, sep='\t', encoding='ISO-8859-1', dtype=object) # self.cpi_conf_df.rename(columns={'s_image': 's_image_url', 's_link': 's_product_url','URL':'search_url', 's_title':'s_item_name'}, inplace=True) self.input_file_column_list = self.cpi_conf_df.columns.tolist() self.cpi_conf_df = self.cpi_conf_df.fillna(value='') search_columns = ['s_mpn','s_upc','s_asin','s_gtin','s_variant_info','s_manufacturer',\ 's_category','s_description'] result_columns = ['r_mpn','r_upc','r_asin','r_gtin','r_variant_info','r_manufacturer',\ 'r_category','r_description'] for col in search_columns + result_columns: # adding missing columns if col not in self.input_file_column_list: self.cpi_conf_df[col] = '' self.cpi_conf_df.apply(self.process, axis=1) column_list = ['sys_index','s_sku','s_product_url','s_item_name','s_category','s_description',\ 's_variant_info','s_manufacturer','s_mpn','s_upc','s_asin','s_gtin','SERP_URL','SERP_KEY',\ 'r_product_url','r_item_name','r_description','r_mpn','r_upc','r_asin','r_gtin',\ 'r_variant_info','r_manufacturer','r_category'] self.cpi_conf_df = self.cpi_conf_df[column_list] self.cpi_conf_df[ 's_vs_r_text_confidence_matrix'] = self.text_confidence_score # self.cpi_conf_df['s_category_in_r_title'] = self.l2_1_list # self.cpi_conf_df['s_variant_info_in_r_title'] = self.l4_1_list # self.cpi_conf_df['s_manufacturer_in_r_title'] = self.l5_1_list # self.cpi_conf_df['s_mpn_in_r_title'] = self.l6_1_list self.cpi_conf_df['mpn_match'] = self.mpn_check_list self.cpi_conf_df['upc_match'] = self.upc_check_list self.cpi_conf_df['asin_match'] = self.asin_check_list self.cpi_conf_df['gtin_match'] = self.gtin_check_list self.cpi_conf_df['title_match'] = self.title_match_list self.cpi_conf_df.to_csv(self.output_file_path, index=False, sep='\t', encoding='iso-8859-1')
def word_stem(tokens):
    # Stem each token with the Porter stemmer.
    porter = PorterStemmer()
    return [porter.stem(word) for word in tokens]
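# Small usage sketch for word_stem (hypothetical sentence, NLTK tokenizer):
from nltk.tokenize import word_tokenize

tokens = word_tokenize("the cats were running and jumping")
print(word_stem(tokens))
# roughly: ['the', 'cat', 'were', 'run', 'and', 'jump']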
import re

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Importing the dataset
data_set = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

# Cleaning the data
corpus = []
data_set_review = data_set['Review']
porter_stemmer = PorterStemmer()
english_stopwords = set(stopwords.words('english'))
for index in range(len(data_set_review)):
    review = data_set_review[index]
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = review.lower()
    review = review.split()
    review = [porter_stemmer.stem(word) for word in review
              if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

# Creating the bag of words model
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(corpus).toarray()
y = data_set.iloc[:, -1].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=0)
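# A typical continuation of this pipeline (a sketch, not part of the original
# snippet): fit a Naive Bayes classifier and evaluate it with the
# confusion_matrix imported above.
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred))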
# -*- coding: utf-8 -*-
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

input_words = [
    'writing', 'calves', 'be', 'branded', 'horse', 'randomize', 'possibly',
    'provision', 'hospital', 'kept', 'scratchy', 'code'
]

porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

# Table header: the input word plus one column per stemmer.
stemmer_names = ['INPUT WORD', 'PORTER', 'LANCASTER', 'SNOWBALL']
fmt = '{:>16}' * len(stemmer_names)
print(fmt.format(*stemmer_names))
print('=' * 68)

for word in input_words:
    output = [word, porter.stem(word), lancaster.stem(word), snowball.stem(word)]
    print(fmt.format(*output))
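# Note (illustrative): the three stemmers can disagree on the same word. For
# 'possibly', Porter and Snowball typically return 'possibl', while the more
# aggressive Lancaster stemmer truncates further (e.g. 'poss'); exact output
# depends on the NLTK version.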
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np

################################
abuseIndex = 0.00
stop_words = set(stopwords.words('english'))  # stop-word list used below

tokensInput = input("Enter your phrase : ")
tokens = word_tokenize(tokensInput)
tokens = [w for w in tokens if w not in stop_words]

porter = PorterStemmer()
stems = []
for t in tokens:
    stems.append(porter.stem(t))

##########
df = pd.read_csv('experiment.csv')
df = df.drop_duplicates(subset='word')
print(df.head(15))

##########
print(tokens)
print(stems)

###########
for part in tokens:
    df_a = df[df["word"] == part]
    df_aIndex = df_a.index
def LDA_Topic(Int_type, de_stemmer, corp, Text_lda1, my_dictionary, Text_tfidf): # Defines LDA topic number for search terms/notes/highlights/etc/etc. # ------------------- 1 Stop words---------------------- # <span class="highlight-pink">Cato</span>'] # print "Input: ", corp #raw = re.sub("\d+","",raw) #raw = raw.replace("’","'") English_stop_words = get_stop_words('en') My_list = [ 'span', 'highlight', 'pink', 'class', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', '://', 'http', 'www', 'com', 'don', 'pre', 'paid', 'must', 'tcan', 'twhen', 'twhat', 'via', 'are', 'will', 'said', 'can', 'near', 'and', 'the', 'i', 'a', 'to', 'it', 'was', 'he', 'of', 'in', 'you', 'that', 'but', 'so', 'on', 'up', 'we', 'all', 'for', 'out', 'me', 'him', 'they', 'says', 'got', 'then', 'there', 'no', 'his', 'as', 'with', 'them', 'she', 'said', 'down', 'see', 'had', 'when', 'about', 'what', 'my', 'well', 'if', 'at', 'come', 'would', 'by', 'one', 'do', 'be', 'her', "didn't", 'jim', 'get', "don't", 'time', 'or', 'right', 'could', 'is', 'went', "warn't", "ain't", 'good', 'off', 'over', 'go', 'just', 'way', 'like', 'old', 'around', 'know', 'de', 'now', 'this', 'along', 'en', 'done', 'because', 'back', "it's", 'tom', "couldn't", 'ever', 'why', 'going', 'little', 'some', 'your', 'man', 'never', 'too', 'more', 'say', 'says', 'again', 'how', 'here', 'tell', 'posted', 'need', 'needs', 'someone', 'government', 'intelligence', 'report' ] stoplist_1 = set( 'a b c d e f g h i j k l m n o p q r s t u v w x y z 1 2 3 4 5 6 7 8 9 0' .split(' ')) # Create a set of enlighs alphabets stoplist_2 = set() #English_stop_words) stoplist_3 = set( 'es la . , . <br> <br><br> br > : >< < .< { } [ ] ( ) .' '\' ` " “ ” ? ! - \u201d< \u201d .\u201d \u201d u201d \u2019 \xe9 !< >!' 
.split(' ')) # Create a set stoplist_4 = set(My_list) stoplist = stoplist_1 | stoplist_2 | stoplist_3 | stoplist_4 # ------------------- 2 tokenizer ---------------------- stopped_tokens = [ [ word for word in WordPunctTokenizer().tokenize(str(document).lower()) if ((word not in stoplist) & (word != u'.\u201d<') & (word != u'.\u201d') & (len(word) > 2) & (is_int(word) == False)) ] # & (is_int(word) == False) & (len(word) > 3) & (len(word) == len(word.strip({0,1,2,3,4,5,6,7,8,9}))) )] #(re.search('\d+', word) == False) ) ] for document in corp ] # stopped_tokens = [[word for word in WordPunctTokenizer().tokenize(str(document).lower()) if ((word not in stoplist) & (word != u'.\u201d<') &(word != u'.\u201d') & (len(word) > 2) & (is_int(word) == False) )]# & (is_int(word) == False) & (len(word) > 3) & (len(word) == len(word.strip({0,1,2,3,4,5,6,7,8,9}))) )] #(re.search('\d+', word) == False) ) ] # for document in corp] # ------------------- 3 Stemming and Count word frequencies ------------------- p_stemmer = PorterStemmer() stemmer = {} texts = [] texts_set = [] for stopped_token in stopped_tokens: stemmed_texts = [p_stemmer.stem(i) for i in stopped_token] texts_set += [stemmed_texts] frequency = defaultdict(int) for text in texts_set: for token in text: frequency[token] += 1 # Only keep words that appear more than once processed_corpus = [[token for token in text if frequency[token] > 0] for text in texts_set] # ------------------- 4 Dictionary and TF-IDF Vectors ------------------- ids2words = my_dictionary.token2id bow_corpus = [my_dictionary.doc2bow(text) for text in processed_corpus] # print "------------------>>>>: ", corp all_vectors = Text_tfidf[ bow_corpus] #bow_corpus] # Gives representative vectors # ------------------- 5 Document Vectors and Classification ------------------- counter = [] doc_topics = [] for each in range(0, class_num): counter.append(0) for index, document in enumerate( all_vectors): # Each documents probability to calss doc_topics.append(Text_lda1.get_document_topics( document)) # , minimum_probability=0.19) new_list = [] for each_topic in doc_topics[-1]: new_list.append(each_topic[1]) t_index, value = max(enumerate(new_list), key=operator.itemgetter(1)) # ------------------- 6 Word tags ------------------- new_list = [] key_words = [] i = 0 # Words from doc TF-IDF Vector # Sort word bag of each document if len(all_vectors[0]) > 3: new_list = sorted(all_vectors[0], key=lambda prob: prob[1], reverse=True) else: new_list = all_vectors[0] # print "Topic: ", t_index + 1 for i in range(0, len(new_list)): # Pick the firts 5 keywords in sorted list for key in ids2words: if ids2words[key] == new_list[i][0]: # bow_corpus[1][2][0]: if (i < 3): # first 3 keywords, no more term = de_stemmer[key] key_words.append(str(term)) topicWordTags[t_index + 1].add( str(term)) # Add this to the bag of words # print "summary: ", key_words # ------------------- 6 Final Word tags and sorting ------------------- # temp = [""] temp = corp[0] if Int_type == "Search": # finalBag[t_index + 1] = finalBag[ t_index + 1] + ' ' + temp[0] + ' ' + temp[0] + ' ' + temp[0] # print "Search: ", temp #finalBag[t_index + 1] elif Int_type == "Add note": finalBag[t_index + 1] = finalBag[t_index + 1] + ' ' + temp[0] + ' ' + temp[0] else: finalBag[t_index + 1] = finalBag[t_index + 1] + ' ' + temp[0] # finalBag[t_index + 1] = finalBag[t_index + 1] + ' ' + temp[0] #bow_corpus # Keep adding user interactions string entities...! # print "Bag: ", finalBag return t_index + 1
spec_chars = ["!", '"', "#", "%", "&", "'", "(", ")",
              "*", "+", ",", "-", ".", "/", ":", ";", "<",
              "=", ">", "?", "@", "[", "\\", "]", "^", "_",
              "`", "{", "|", "}", "~", "–"]

# text preprocess
stop = stopwords.words('english')
df_new['Answer_processed'] = df_new['Answer'].apply(
    lambda j: ' '.join([item for item in j.split() if item not in stop]))
for char in spec_chars:
    # regex=False: characters such as '*', '+' and '(' must be treated literally
    df_new['Answer_processed'] = df_new['Answer_processed'].str.replace(
        char, '', regex=False).str.lower()

stemmer = PorterStemmer()
df_new['Answer_processed'] = df_new['Answer_processed'].apply(
    lambda j: ' '.join([stemmer.stem(item) for item in j.split()]))

# drop rows whose processed answer is (almost) empty
length = df_new[df_new['Answer_processed'].map(len) < 2].index
df_new.drop(length, inplace=True)
df_new = df_new.reset_index(drop=True)

# vader
FinalResults_Vader = pd.DataFrame()
analyzer = SentimentIntensityAnalyzer()
df_new['scores'] = df_new['Answer'].apply(lambda ans: analyzer.polarity_scores(ans))
df_new['compound'] = df_new['scores'].apply(lambda score_dict: score_dict['compound'])
df_new['positive'] = df_new['scores'].apply(lambda score_dict: score_dict['pos'])
df_new['negative'] = df_new['scores'].apply(lambda score_dict: score_dict['neg'])
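# A common way to turn the VADER compound score into a label; the +/-0.05
# thresholds are a convention, not part of the original snippet.
df_new['sentiment'] = df_new['compound'].apply(
    lambda c: 'positive' if c >= 0.05 else ('negative' if c <= -0.05 else 'neutral'))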
#text preprocessing from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer nltk.download('stopwords') import re from nltk.corpus import stopwords from nltk.stem.porter import PorterStemmer stemmer = PorterStemmer() corpus = [] for i in range(len(msg)): review = re.sub('[^a-zA-Z]', ' ', msg['text'][i]) review = review.lower() review = review.split() review = [ stemmer.stem(word) for word in review if not word in stopwords.words('english') ] review = ' '.join(review) corpus.append(review) corpus #creating bag of words model from sklearn.feature_extraction.text import CountVectorizer cv = CountVectorizer(max_features=5000, ngram_range=(1, 3)) X = cv.fit_transform(corpus).toarray() X X.shape
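# TfidfVectorizer and HashingVectorizer are imported above but unused; a
# drop-in alternative to the CountVectorizer bag of words (sketch, reusing the
# same corpus and parameters):
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
X_tfidf = tfidf.fit_transform(corpus).toarray()
X_tfidf.shape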
def entity_summary(my_dictionary1, docs_number, ids2words, doc_vectors, output_topics, doc_topics, de_stemmer, class_num, keyword_num, filename, filename2, filename3): English_stop_words = get_stop_words('en') My_list = [ "u'\u201c'", 'span', 'highlight', 'pink', 'class', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', '://', 'http', 'www', 'com', 'don', 'pre', 'paid', 'must', 'tcan', 'twhen', 'twhat', 'via', 'are', 'will', 'said', 'can', 'near', 'and', 'the', 'i', 'a', 'to', 'it', 'was', 'he', 'of', 'in', 'you', 'that', 'but', 'so', 'on', 'up', 'we', 'all', 'for', 'out', 'me', 'him', 'they', 'says', 'got', 'then', 'there', 'no', 'his', 'as', 'with', 'them', 'she', 'said', 'down', 'see', 'had', 'when', 'about', 'what', 'my', 'well', 'if', 'at', 'come', 'would', 'by', 'one', 'do', 'be', 'her', "didn't", 'jim', 'get', "don't", 'time', 'or', 'right', 'could', 'is', 'went', "warn't", "ain't", 'good', 'off', 'over', 'go', 'just', 'way', 'like', 'old', 'around', 'know', 'de', 'now', 'this', 'along', 'en', 'done', 'because', 'back', "it's", 'tom', "couldn't", 'ever', 'why', 'going', 'little', 'some', 'your', 'man', 'never', 'too', 'more', 'say', 'says', 'again', 'how', 'here', 'tell', 'posted', 'need', 'needs', 'someone', 'government', 'intelligence', 'report' ] stoplist_1 = set( 'a b c d e guy f size styled g h also number details since due countries using selling sent given earlier completely owed full player numerous thus recovered number i j k unknown move l m n o p q r else s t u v w x y z first becomes able actually absolutely necessary officialise entire stage issued' .split(' ')) # Create a set of enlighs alphabets stoplist_2 = set(English_stop_words) stoplist_3 = set( 'es la . , . taken <br> however require ratio note illumination homeland give order possibly think questions event hour case occurred yet confirmed destination million want update arrived removed responsibility known claiming icon role display none stating closed work apply research provided additional closed caused showed month succeeded knowledge stop coroner style index enclosed sudden seeks wait last soon centers outside believed feet happened begins colors hour people airing large claims area getting blkd highly whose young information made year ptf create make public date text tried space found name run ome ngoki agree everyone caller identification <br><br> br > : >< < .< { } [ ] ( ) .' '\' ` " “ ” ? ! - \u2018 \xe9 \u201c \u201d< \u201d .\u201d \u201d u201d \u201c looking .\u201d< \u2019 worth realized facilitated \xe9 keeping !< >! 
ago note sending' .split(' ')) # Create a set stoplist_4 = set(My_list) stoplist = stoplist_1 | stoplist_2 | stoplist_3 | stoplist_4 p_stemmer = PorterStemmer() # ----------------- Process wordtags from user interactions ----------------------------------- timeList = [ 'date', 'jan', 'january', 'feb', 'february', 'march', 'april', 'may', 'present', 'jun', 'july', 'august', 'september', 'october', 'november', 'december', '1998' ] placeList = [ 'engstrom', 'gastech', 'abila', 'kronos', 'petra', 'jet', 'limousine', 'tethan', '', '', '', 'headquarters', 'tethys', 'elodis', 'airport', 'airports', 'vastopolis', 'terrorist', 'brotherhood', 'antarctica', 'washington', 'dhs', 'valujet', 'laboratory', 'dharan', 'bahrain', 'qatar', 'kuwait', 'airlines', 'vastpress', 'ibm', 'suburbia', 'bruno', 'lab', 'antarctica', 'nigeria', 'dubai', 'burj', 'syria', 'gaza', 'sanaa', 'ebilaead', 'tabriz', 'venezuela', 'pakistan', 'countries', 'saudi', 'arabia', 'kenya', 'iran', 'lebanon', 'russia', 'yemen', 'turkey', 'arkadi', 'barcelona', 'paris', 'cafe', 'mosque', 'exhibition', 'valley', 'moscow', 'downtown', 'mombasa', 'bangkok', 'sudan', 'usa', 'washington', 'milan', 'italy', 'hospital', 'british', 'soviet', 'antalya', 'malaysia', 'somalia', 'sana', 'lagos', 'pyongyang', 'uae', 'kiev', 'hotel' ] peopleList = [ 'edvard', 'employee', 'ipo', 'president', 'firemen', 'apa', 'silvia', 'protectors', '', 'wgo', 'torsten', 'juliana', 'dread', 'networks', 'sanjorge', 'vann', 'employees', 'pok', 'sten', 'cato', 'ceo', 'rebecca', 'karel', 'wfa', 'elian', 'carman', 'kapelou', 'nespola', 'torsten', 'trucco', 'douglas', 'eggleston', 'lark', 'mayor', 'afghan', 'philippines', 'paramurderers', 'bruno', 'psychobrotherhood', 'pakistani', 'hasidic', 'brothers', 'hate', 'george', 'dombrovski', 'columbia', 'mikhail', 'Kapolalum ', 'funsho', 'bukhari', 'ahmed', 'basra', 'khouri', 'kasem', 'leonid', 'nahid', 'otieno', 'owiti', 'leonid', 'baltasar', 'hombre', 'jhon', 'professor', 'saleh', 'tanya', 'mohammed', 'borodinski', 'kashfi', 'khemkhaeng', 'boonmee', 'ukrainian', 'german', 'italian', 'dutch', 'french', 'kapolalum', 'funsho', 'mai', 'korongi', 'lashkar', 'hosain', 'haq', 'maulana', 'bukhari', 'arab', 'ali', 'balochi', 'nicolai', 'aden', 'akram', 'shamsheer', 'jeddah', 'kiev', 'abdullah', 'carabobo', 'bolivar', 'bhutani', 'jumeirah', 'michieka', 'borodinski', 'otieno', 'wanjohi', 'onyango', 'kenyan', 'nairobi', 'jtomski', 'hakan', 'vwhombre', 'jorge', 'soltan', 'anka', 'green', 'joetomsk', 'igor', 'middleman' ] for j in range(1, len(finalBag)): # Each topic # print "\n \n \n " , finalBag[j] # print "topic#: ", j # ------------------- 2 tokenizer ---------------------- # print finalBag[j] # ------------------- 2 tokenizer ---------------------- stopped_tokens = [ [ word for word in WordPunctTokenizer().tokenize( str(document).lower()) if ((word not in stoplist) & (word != u'.\u201d<') & (word != u'\xe9') & (word != u'\u2018') & (word != u'.\u201d') & (word != u'\u201c') & (word != '\u201c') & (len(word) > 2) & (is_int(word) == False)) ] # & (is_int(word) == False) & (len(word) > 3) & (len(word) == len(word.strip({0,1,2,3,4,5,6,7,8,9}))) )] #(re.search('\d+', word) == False) ) ] for document in [finalBag[j]] ] # stopped_tokens = [[word for word in WordPunctTokenizer().tokenize(str(document).lower()) if ((word not in stoplist) & (word != u'.\u201d<') &(word != u'.\u201d') & (len(word) > 2) & (is_int(word) == False) )]# & (is_int(word) == False) & (len(word) > 3) & (len(word) == len(word.strip({0,1,2,3,4,5,6,7,8,9}))) )] 
#(re.search('\d+', word) == False) ) ] # for document in [finalBag[j]]] # print "Final stopped", stopped_tokens # ------------------- 3 Stemming and Count word frequencies ------------------- # p_stemmer = PorterStemmer() stemmer = {} texts = [] texts_set = [] #set() for stopped_token in stopped_tokens: stemmed_texts = [p_stemmer.stem(i) for i in stopped_token] texts_set += [stemmed_texts] frequency = defaultdict(int) for text in texts_set: for token in text: frequency[token] += 1 # Only keep words that appear more than once processed_corpus = [[token for token in text if frequency[token] > 0] for text in texts_set] # print "\n Final proceesd", processed_corpus # ------------------- 4 Dictionary and TF-IDF Vectors ------------------- ids2words = my_dictionary.token2id bow_corpus = [my_dictionary.doc2bow(text) for text in processed_corpus] # final_vectors = Text_tfidf[bow_corpus] # With TF-IDF final_vectors = bow_corpus # No TF-IDF # print "Input: ", bow_corpus # print "Input: ", final_vectors[0] new_list = [] key_words = [] # Words from doc TF-IDF Vector # Sort word bag of each document if len(final_vectors[0]) > 2: new_list = sorted(final_vectors[0], key=lambda prob: prob[1], reverse=True) else: new_list = final_vectors[0] # print "\n Bag of sorted: ", new_list # k = 0; # for i in range(0,len(new_list)): # Pick the firts 10 keywords in sorted list # for key in ids2words: # if ids2words[key] == new_list[i][0]: # bow_corpus[1][2][0]: # if (k<10): # first 3 keywords, no more # term = de_stemmer[key] # topicWordTags2[j].add(str(term)) # Add this to the bag of words # k = k+1 accu = 0 for each in new_list: accu += each[1] # print each, each[0], each[1] for i in range( 0, len(new_list)): # Pick the firts 10 keywords in sorted list for key in ids2words: if ids2words[key] == new_list[i][0]: # bow_corpus[1][2][0]: term = de_stemmer[key] if (term in timeList): group = 0 elif (term in placeList): group = 1 elif (term in peopleList): group = 2 else: group = 3 score = float(new_list[i][1]) / accu # print "score", float(new_list[i][1]), accu, score # if score > 0.01: topicWordTags2[j].append( [str(term), group, score]) # Add this to the bag of words print "\n Final Entites: ", len(new_list), topicWordTags2[j] # ----------------------- Process word tags from document vectors (to complete 10 minimum tags for each topic) for j in range(1, len(doc_topic_keywords)): # Each topic # ------------------- 2 tokenizer ---------------------- stopped_tokens = [ [ word for word in WordPunctTokenizer().tokenize( str(document).lower()) if ((word not in stoplist) & (word != u'.\u201d<') & (word != u'\xe9') & (word != u'\u2018') & (word != u'.\u201d') & (word != u'\u201c') & (word != '\u201c') & (len(word) > 2) & (is_int(word) == False)) ] # & (is_int(word) == False) & (len(word) > 3) & (len(word) == len(word.strip({0,1,2,3,4,5,6,7,8,9}))) )] #(re.search('\d+', word) == False) ) ] for document in [doc_topic_keywords[j]] ] # ------------------- 3 Stemming and Count word frequencies ------------------- stemmer = {} texts = [] texts_set = [] #set() for stopped_token in stopped_tokens: stemmed_texts = [p_stemmer.stem(i) for i in stopped_token] texts_set += [stemmed_texts] frequency = defaultdict(int) for text in texts_set: for token in text: frequency[token] += 1 # Only keep words that appear more than once processed_corpus = [[token for token in text if frequency[token] > 0] for text in texts_set] # print "\n Final proceesd", processed_corpus # ------------------- 4 Dictionary and TF-IDF Vectors ------------------- 
ids2words = my_dictionary.token2id bow_corpus = [my_dictionary.doc2bow(text) for text in processed_corpus] # final_vectors = Text_tfidf[bow_corpus] # With TF-IDF final_vectors = bow_corpus # No TF-IDF # print "Input: ", bow_corpus # print "Input: ", final_vectors[0] new_list = [] key_words = [] # Words from doc TF-IDF Vector # Sort word bag of each document if len(final_vectors[0]) > 2: new_list = sorted(final_vectors[0], key=lambda prob: prob[1], reverse=True) else: new_list = final_vectors[0] # print "\n Bag of sorted: ", new_list k = 0 for i in range( 0, len(new_list)): # Pick the firts 10 keywords in sorted list for key in ids2words: if ids2words[key] == new_list[i][0]: # bow_corpus[1][2][0]: if (k < 20): # first 3 keywords, no more term = de_stemmer[key] # topicWordTags3[j].add(str(term)) # Add this to the bag of words topicWordTags3[j].append( [str(term), 3, 0.1]) # Add this to the bag of words k = k + 1 # print "\n Final Entites: ", topicWordTags2[j] # ------------------------------ Add entities from user interactions ---------------------- topic_hash = [] for i in range(1, class_num + 1): # topicWordTags[0] is always empty, tagWords = [] temp_set = set() kk = 0 for eachTag in topicWordTags2[i]: if kk < 20: if not (eachTag[0] in temp_set): temp_set.add(eachTag[0]) tagWords.append(eachTag) kk = kk + 1 # ------------------------------ Add more entities from documents ----------------------- #print topicWordTags3[i] if kk < 20: for eachTag in topicWordTags3[i]: if kk < 20: if not (eachTag[0] in temp_set): temp_set.add(eachTag[0]) #print "set > ", temp_set tagWords.append(eachTag) #print "List > ", tagWords kk = kk + 1 tagWords = sorted(tagWords, key=lambda k: k[2], reverse=True) temp = {"TopicNum: ": i - 1, "keywords": tagWords} topic_hash.append(temp) fout = open(filename3, "w") fout.write(json.dumps(topic_hash, indent=1)) fout.close() # print "\n doc_topic_array: ", doc_topic_array # print "\n doc_topic_array: ", doc_key_word return
review1 # # Stemming: # - Convert word to its root word # # Eg: # loved ----> love # In[18]: # Use Stemming to take word it to its Root form from nltk.stem.porter import PorterStemmer ps = PorterStemmer() review1 = [ps.stem(word) for word in review1] review1 # In[19]: # Convert list to string review2 = ' '.join(review1) review2 # ### Count-Vectorizer( ) # - This will construct the vocabulary of the bag-of-words model and transform the sentences into sparse feature vectors # In[20]: corpus1 = []
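# Sketch of the step the heading above describes (assumes corpus1 has been
# filled with cleaned review strings such as review2; max_features is a free
# choice, not taken from this notebook):
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus1).toarray()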
for channels in data:
    # list for tokenized documents in loop
    texts = []
    doc_set = data[channels]

    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i[0].lower()
        tokens = tokenizer.tokenize(raw)

        # remove stop words from tokens
        stopped_tokens = [token for token in tokens if token not in en_stop]

        # stem tokens
        stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]

        # add tokens to list
        texts.append(stemmed_tokens)

    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3,
                                               id2word=dictionary, passes=20)
    ldaAns = ldamodel.print_topics(num_topics=3, num_words=3)
def token2features(sent, i, add_neighs=True): """Compute the features of a token. All the features are boolean, i.e. they appear or they do not. For the token, you have to return a set of strings that represent the features that *fire* for the token. See the code below. The token is at position i, and the rest of the sentence is provided as well. Try to make this efficient, since it is called on every token. One thing to note is that it is only called once per token, i.e. we do not call this function in the inner loops of training. So if your training is slow, it's not because of how long it's taking to run this code. That said, if your number of features is quite large, that will cause slowdowns for sure. add_neighs is a parameter that allows us to use this function itself in order to recursively add the same features, as computed for the neighbors. Of course, we do not want to recurse on the neighbors again, and then it is set to False (see code). """ porter = PorterStemmer() ftrs = [] # bias ftrs.append("BIAS") # position features if i == 0: ftrs.append("SENT_BEGIN") if i == len(sent) - 1: ftrs.append("SENT_END") # the word itself word = unicode(sent[i]) ftrs.append("WORD=" + word) ftrs.append("LCASE=" + word.lower()) # Adding stemmed version of word. ftrs.append("STEMMED=" + porter.stem(word)) # some features of the word if word.isalnum(): ftrs.append("IS_ALNUM") if word.isnumeric(): ftrs.append("IS_NUMERIC") if word.isdigit(): ftrs.append("IS_DIGIT") if word.isupper(): ftrs.append("IS_UPPER") if word.islower(): ftrs.append("IS_LOWER") # Additional features if word.startswith("http") or word.endswith(".com"): ftrs.append("IS_URL") if word in abbreviations: ftrs.append("IS_ABRV") if word.startswith("#"): ftrs.append("IS_HASHTAG") if word.startswith("@"): ftrs.append("IS_MENTION") # previous/next word feats if add_neighs: if i > 0: for pf in token2features(sent, i - 1, add_neighs=False): ftrs.append("PREV_" + pf) if i < len(sent) - 1: for pf in token2features(sent, i + 1, add_neighs=False): ftrs.append("NEXT_" + pf) # return it! return ftrs
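# Illustrative call on a hypothetical token sequence; `abbreviations` is
# assumed to be a set defined elsewhere in the module (an empty set suffices
# for this sketch).
abbreviations = set()
sent = ["We", "are", "running", "tests"]
ftrs = token2features(sent, 2)
# fires e.g. BIAS, WORD=running, LCASE=running, STEMMED=run, IS_ALNUM, IS_LOWER,
# plus PREV_*/NEXT_* copies of the neighbouring tokens' features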
# print(dataset[90:100]) # Cleaning text import re import nltk # nltk.download('stopwords') from nltk.corpus import stopwords from nltk.stem.porter import PorterStemmer corpus = [] for i in range(0, 1000): review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i]) review = review.lower() review = review.split() ps = PorterStemmer() review = [ ps.stem(x) for x in review if not x in set(stopwords.words('english')) ] review = ' '.join(review) corpus.append(review) # print(corpus) # Creating the Bag of Words Model from sklearn.feature_extraction.text import CountVectorizer cv = CountVectorizer(max_features=1500) X = cv.fit_transform(corpus).toarray() y = dataset.iloc[:, 1].values # Using NaiveBayes on dependent and independent variables from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y,
class Preprocess: def __init__(self, text): self.text = text self.STOPWORDS = set(stopwords.words('english')) self.spell = SpellChecker() self.p = inflect.engine() self.nlp = en_core_web_sm.load() #self.nlp = spacy.load('en_core_web_md') self.model = api.load("glove-twitter-25") self.lemmatizer = WordNetLemmatizer() self.stemmer = PorterStemmer() def strip_html_tags(self): """remove html tags from text""" soup = BeautifulSoup(self.text, "html.parser") stripped_text = soup.get_text(separator=" ") return stripped_text def remove_accented_chars(self): """remove accented characters from text, e.g. café""" text = unidecode.unidecode(self.text) return text '''def expand_contractions(self, text): """expand shortened words, e.g. don't to do not""" text = list(cont.expand_texts([text], precise=True))[0] return text''' def pos_tagging(self): word_tokens = word_tokenize(self.text) return pos_tag(word_tokens) def text_lowercase(self): return self.text.lower() def text_uppercase(self): return self.text.upper() def remove_numbers(self): result = re.sub(r'\d+', '', self.text) return result def convert_number(self): # split string into list of words temp_str = self.text.split() # initialise empty list new_string = [] for word in temp_str: # if word is a digit, convert the digit # to numbers and append into the new_string list if word.isdigit(): temp = p.number_to_words(word) new_string.append(temp) # append the word as it is else: new_string.append(word) # join the words of new_string to form a string temp_str = ' '.join(new_string) return temp_str def remove_punctuation(self): translator = str.maketrans('', '', string.punctuation) return self.text.translate(translator) def remove_whitespace(self): return " ".join(self.text.split()) def remove_stopwords(self): """custom function to remove the stopwords""" return " ".join([word for word in str(self.text).split() if word not in self.STOPWORDS]) def stem_words(self): return " ".join([self.stemmer.stem(word) for word in self.text.split()]) def lemmatize_words(self): return " ".join([self.lemmatizer.lemmatize(word) for word in self.text.split()]) def remove_freqwords(self, df, column_name): """custom function to remove the frequent words""" cnt = Counter() for self.text in df["text_wo_stop"].values: for word in self.text.split(): cnt[word] += 1 FREQWORDS = set([w for (w, wc) in cnt.most_common(10)]) return " ".join([word for word in str(self.text).split() if word not in FREQWORDS]) def remove_emoji(self): emoji_pattern = re.compile("[" u"\U0001F600-\U0001F64F" # emoticons u"\U0001F300-\U0001F5FF" # symbols & pictographs u"\U0001F680-\U0001F6FF" # transport & map symbols u"\U0001F1E0-\U0001F1FF" # flags (iOS) u"\U00002702-\U000027B0" u"\U000024C2-\U0001F251" "]+", flags=re.UNICODE) return emoji_pattern.sub(r'', self.text) def remove_emoticons(self): emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in EMOTICONS) + u')') return emoticon_pattern.sub(r'', self.text) def convert_emoticons(self): for emot in EMOTICONS: text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), self.text) return text def remove_urls(self): url_pattern = re.compile(r'https?://\S+|www\.\S+') return url_pattern.sub(r'', self.text) def remove_html(self): html_pattern = re.compile('<.*?>') return html_pattern.sub(r'', self.text) def correct_spellings(self): corrected_text = [] misspelled_words = self.spell.unknown(self.text.split()) for word in self.text.split(): if word in misspelled_words: corrected_text.append(self.spell.correction(word)) else: 
                corrected_text.append(word)
        return " ".join(corrected_text)

    def NER(self):
        doc = self.nlp(self.text)
        entity_label_map = dict()
        for entity in doc.ents:
            # spaCy entities expose their surface string as entity.text
            entity_label_map[entity.text] = entity.label_
        return entity_label_map
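# Hypothetical usage of the Preprocess helper above. Note that __init__ is
# expensive (it loads spaCy and a GloVe model via gensim's api.load), and each
# method reads self.text, so the text is updated between steps instead of
# re-instantiating the class.
pre = Preprocess("Visit <b>our café</b> at https://example.com")
pre.text = pre.remove_html()            # drop the <b> tags
pre.text = pre.remove_urls()            # drop the URL
cleaned = pre.remove_accented_chars()   # café -> cafe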
# list for tokenized documents in loop texts = [] # loop through document list for i in doc_set: # clean and tokenize document string raw = i.lower() tokens = tokenizer.tokenize(raw) # remove stop words from tokens stopped_tokens = [i for i in tokens if not i in en_stop] # stem tokens stemmed_tokens = [stemmer.stem(i) for i in stopped_tokens] # add tokens to list texts.append(stemmed_tokens) # turn our tokenized documents into a id <-> term dictionary dictionary = corpora.Dictionary(texts) # convert tokenized documents into a document-term matrix corpus = [dictionary.doc2bow(text) for text in texts] # generate LDA model ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20)
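# Inspecting the learned topics (sketch; num_words here is an arbitrary choice):
for topic_id, topic in ldamodel.print_topics(num_topics=2, num_words=4):
    print(topic_id, topic)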