def simplify_old(s):
    res = ''
    st = LancasterStemmer()
    text = nltk.word_tokenize(s)
    tags = nltk.pos_tag(text)
    for tag in tags:
        word = tag[0]
        if f.checkPos(tag[1]):
            if word in model:
                word_stem = st.stem(word)
                top_words = model.most_similar(positive=[word], topn=20)
                candidate_list = [w[0] for w in top_words]
                freq_list = [fdist[w] for w in candidate_list]
                c_f_list = zip(candidate_list, freq_list)
                ordered_list = sorted(c_f_list, key=lambda pair: pair[1], reverse=True)
                word_freq = fdist[word]
                # synonyms = f.getSynonmys(word)  ## get synonyms from wordnet
                # print synonyms
                for w in ordered_list:
                    if not f.freq_diff(word_freq, w[1]):
                        ## stop if the candidate frequency does not exceed the word frequency by the threshold
                        break
                    if st.stem(w[0]) != word_stem and f.samePos(word, w[0]):
                        ## exclude morphological derivations and require the same POS
                        word = w[0]
                        ### do not use wordnet
                        # if w[0] in synonyms:
                        #     word = w[0]
                        # else:
                        #     for syn in synonyms:
                        #         if st.stem(w[0]) == st.stem(syn):
                        #             word = w[0]
        res = res + word + ' '
    return res
def getstems(words):
    l = LancasterStemmer()
    stems = {}
    for word in words:
        if word in dicts.irregforms:
            stems[word] = l.stem(dicts.irregforms[word])
        else:
            stems[word] = l.stem(word)
    return stems
def filter_pos(text):
    st = LancasterStemmer()
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    nouns = list()
    verbs = list()
    for (word, tag) in tagged:
        if tag.startswith('N'):
            nouns.append(st.stem(word))
        elif tag.startswith('V'):
            verbs.append(st.stem(word))
    return nouns, verbs
def mapper(shard, doc_counter):
    st = LancasterStemmer()
    with open(shard, "r") as f:
        ohsu = json.JSONDecoder().decode(f.read())
    output_values = []
    doc_counter.add(len(ohsu))
    for article in ohsu:
        output_values += [(w, (article[".I"], 'a')) for w in article[".A"]]
        output_values += [(st.stem(w), (article[".I"], 't'))
                          for w in alphabet.findall(article[".T"].lower())]
        if article.get('.W') is not None:
            body_words = (w for w in alphabet.findall(article[".W"].lower()))
            output_values += [(st.stem(w), (article[".I"], 'w')) for w in body_words]
    return output_values
def poss_train(train_file, train_write, sw_file):
    """
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(train_file)
    reader = csv.reader(f)
    t = open(train_write, "w")
    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    #stopwords = sw
    # use nltk stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    print "stopword list length", len(stopwords)
    stopwords = set(stopwords)
    g = lambda x: x not in stopwords
    for row in reader:
        if a % 10000 == 0:
            print a
        a += 1
        title = row[1].lower()
        # clean html
        body = nltk.clean_html(row[2].lower())
        # word tokenize
        pattern = r"([a-z])\w+"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        # remove stopwords
        body = filter(g, body)
        title = filter(g, title)
        # light stem
        st = LancasterStemmer()
        title = set([st.stem(word) for word in title])
        body = set(body)
        body = set([st.stem(word) for word in body])
        # list to string
        body = ' '.join(body)
        title = ' '.join(title)
        t.write('"%s","%s","%s","%s"\n' % (row[0], title, body, row[3]))
def remove_stems(file):
    new_file = []
    punctuation = re.compile(r'[.,"?!:;]')
    lemmatizer = WordNetLemmatizer()
    stemmer = LancasterStemmer()
    for raw_post in file:
        post = raw_post[1]
        token = nltk.word_tokenize(post)
        token_tags = nltk.pos_tag(token)
        new_token = []
        for word in token_tags:
            # Remove punctuation and convert to lower case
            original_word = punctuation.sub("", word[0].lower())
            # Stem each word to its root: lemmatize first, then fall back to Lancaster
            stemmed_word = lemmatizer.lemmatize(original_word)
            if original_word == stemmed_word:
                stemmed_word = stemmer.stem(stemmed_word)
            # Remove stopwords defined in the nltk corpus
            if stemmed_word not in nltk.corpus.stopwords.words('english') and stemmed_word != '':
                new_token.append((stemmed_word, word[1]))
        new_file.append((raw_post[0], new_token))
    return new_file
def train_lsi_model(self, texts, num_of_toptics=10):
    texts_tokenized = [[word.lower() for word in word_tokenize(text)] for text in texts]
    # remove stop words and punctuation
    english_stop_words = stopwords.words('english')
    english_punctuations = [',', '.', ':', '?', '(', ')', '[', ']', '@', '&', '!', '*', '#', '$', '%']
    texts_filtered = [[word for word in text_tokenized
                       if (word not in english_punctuations) and (word not in english_stop_words)]
                      for text_tokenized in texts_tokenized]
    # stem the words
    st = LancasterStemmer()
    texts_stemmed = [[st.stem(word) for word in text_filtered] for text_filtered in texts_filtered]
    # drop stems that appear only once
    all_stems = sum(texts_stemmed, [])
    stem_once = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
    cleaned_texts = [[stem for stem in text if stem not in stem_once] for text in texts_stemmed]
    dictionary = corpora.Dictionary(cleaned_texts)
    corpus = [dictionary.doc2bow(text) for text in cleaned_texts]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_of_toptics)
    result = lsi[corpus]
    return result
def prepare_corpus(raw_documents):
    # remove punctuation
    print "Removing Punctuation"
    import string
    exclude = set(string.punctuation)
    raw_documents = [''.join(ch for ch in s if ch not in exclude) for s in raw_documents]
    # remove common words
    print "Calculating Stoplist"
    stoplist = set([x.rstrip() for x in codecs.open("stop_list.txt", encoding='utf-8')
                    if not x.startswith("#")])
    stoplist = stoplist.union(set(nltk.corpus.stopwords.words("english")))
    # print stoplist
    print "Removing Stoplist and Stemming"
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    texts = [[st.stem(word) for word in document.lower().split() if word not in stoplist]
             for document in raw_documents]
    # remove words that appear only once
    print "Removing Single Variables"
    all_tokens = sum(texts, [])
    tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once] for text in texts]
    return texts
def lemmstem(sentences):
    '''
    Performs lemmatization and stemming of the words.

    Input: a list of trees containing the sentences; all words are classified by their NE type
    Output: lemmatized/stemmed sentences
    '''
    lmtzr = WordNetLemmatizer()
    st = LancasterStemmer()
    dic = {'VB': wordnet.VERB, 'NN': wordnet.NOUN, 'JJ': wordnet.ADJ, 'RB': wordnet.ADV}
    for sent in sentences:
        lvsidx = sent.treepositions('leaves')
        for pos in lvsidx:
            word = sent[pos][0]
            tag = sent[pos][1]
            rtag = tag[0:2]
            if rtag in dic:
                lemm = lmtzr.lemmatize(word, dic[rtag])
                stem = st.stem(lemm)
                # print word, lemm, stem  # cursed line
                sent[pos] = (word, tag, stem)
            else:
                sent[pos] = (word, tag, word)
    return sentences
def word_stem_example(word="Amevive"):
    """
    Read: http://www.nltk.org/book/ch03.html, 3.6 Normalizing Text
    Per the NLTK book's recommendation, the Porter algorithm is the more robust
    choice and is the one to prefer.
    """
    stemmer = LancasterStemmer()
    print("Lancaster [%s => %s]" % (word, stemmer.stem(word)))
    stemmer = PorterStemmer()  # <=== recommended algorithm
    print("Porter [%s => %s]" % (word, stemmer.stem(word)))
    stemmer = RegexpStemmer('ing$|s$|e$', min=4)
    print("Regexp [%s => %s]" % (word, stemmer.stem(word)))
    stemmer = SnowballStemmer('english')  # Choose a language
    print("Snowball [%s => %s]" % (word, stemmer.stem(word)))
def process(reviews):
    # tokenize
    from nltk.tokenize import word_tokenize
    review_tokenized = [[word.lower() for word in word_tokenize(review.decode('utf-8'))]
                        for review in reviews]
    # remove stop words
    from nltk.corpus import stopwords
    english_stopwords = stopwords.words('english')
    review_filterd_stopwords = [[word for word in review if word not in english_stopwords]
                                for review in review_tokenized]
    # remove punctuation
    english_punctuations = [',', '.', '...', ':', ';', '?', '(', ')', '&', '!', '@', '#', '$', '%']
    review_filtered = [[word for word in review if word not in english_punctuations]
                       for review in review_filterd_stopwords]
    # stemming
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    review_stemmed = [[st.stem(word) for word in review] for review in review_filtered]
    # remove stems that appear only once
    all_stems = sum(review_stemmed, [])
    stems_lt_three = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
    final_review = [[stem for stem in text if stem not in stems_lt_three]
                    for text in review_stemmed]
    return final_review
class Stemmer():
    def __init__(self):
        self.stemmer = LancasterStemmer()

    def stem(self, word_to_be_stemmed):
        return self.stemmer.stem(word_to_be_stemmed)
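# A minimal usage sketch of the wrapper class above; the sample words and the
# expected Lancaster outputs noted in the comments are illustrative only.
stemmer = Stemmer()
print(stemmer.stem("maximum"))   # Lancaster typically yields "maxim"
print(stemmer.stem("running"))   # typically "run"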
def predict_category_subcategory(book_name):
    data_set1 = pandas.Series(book_name.encode('ascii'))
    # Data preprocessing
    data_set1 = data_set1.dropna(axis=0, how='any')
    data_set1 = data_set1.str.lower()
    # Manual removal list
    remove_list = ['edition', 'ed', 'edn', 'vol', 'vol.', '-', 'i']
    data_set1[0] = ' '.join([i for i in data_set1[0].split() if i not in remove_list])
    data_set1 = data_set1.apply(lambda x: re.sub(r'\w*\d\w*', '', x).strip())
    data_set1 = data_set1.apply(lambda x: re.sub(r'\([^)]*\)', ' ', x))
    data_set1 = data_set1.apply(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x))
    #data_set['Category ID'] = data_set['Category ID']+"|"+data_set['Subcategory ID']
    # Stem the book titles
    stemmer = LancasterStemmer()
    data_set1[0] = " ".join([stemmer.stem(i) for i in data_set1[0].split()])
    clf = joblib.load(os.path.join(BASE_DIR + "/learners/", 'category_predict.pkl'))
    ans = clf.predict(data_set1)
    sub_clf = joblib.load(os.path.join(BASE_DIR + "/learners/", 'subcategory_predict.pkl'))
    sub_ans = sub_clf.predict(data_set1)
    return [ans[0], sub_ans[0]]
def tweetTokenizer(tweet_text):
    st = LancasterStemmer()
    twitterWords = tweet_text.split()
    # remove stop words using the NLTK corpus
    twitterWords = [word.lower() for word in twitterWords]
    twitterWords = [w for w in twitterWords if w not in stopwords.words('english')]
    # remove a custom list of stop words found through experimentation
    noiseWords = ["i'm", "like", "get", "don't", "it's", "go", "lol", "got", "one", "know",
                  "@", "good", "want", "can't", "need", "see", "people", "going", "back",
                  "really", "u", "think", "right", "never", "day", "time", "never", "that's",
                  "even", ",", ".", "make", "wanna", "you're", "come", "-", "still", "much",
                  "someone", "today", "gonna", "new", "would", "take", "always", "im", "i'll",
                  "best", "'", "feel", "getting", "say", "tonight", "last", "ever", "better",
                  "i've", "look", "f*****g", "way", "could", "!", "oh", "tomorrow", "night",
                  "first", "miss", "ain't", "thank", "2", "bad", "little", "thanks",
                  "something", "wait", "&", "`", "oh", "make", "bad", "let", "stop",
                  "well", "tell"]
    twitterWords = [w for w in twitterWords if w not in noiseWords]
    twitterWords = [st.stem(w) for w in twitterWords]
    return twitterWords
def word_standardize(sentences):
    tokens = []
    sentences_st = []
    for sent in sentences:
        tokens.extend(word_tokenize(sent))
        sentences_st.append(word_tokenize(sent))
    words = tokens
    st = LancasterStemmer()
    words = [w.lower() for w in words]
    words = [w for w in words if w not in stopwords.words('english')]
    words = [w for w in words if w not in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
    st_words = [st.stem(w) for w in words]
    sent_result = []
    for sent in sentences_st:
        sent = [w.lower() for w in sent]
        sent = [w for w in sent if w not in stopwords.words('english')]
        sent = [w for w in sent if w not in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
        sent_result.append(sent)
    return st_words, sent_result
def score_sentence(sentence, weights, stop_words):
    """
    Score a sentence by summing the weights of its stemmed, lemmatized tokens.

    Parameters
    ----------
    sentence : string
    weights : Counter mapping roots to weights
    stop_words : collection of words to ignore
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = LancasterStemmer()
    sentence = strip_punc(sentence)
    tokens = word_tokenize(sentence)
    score = 0
    for token in tokens:
        root = stemmer.stem(lemmatizer.lemmatize(token))
        if token not in stop_words and root not in stop_words:
            score += weights[root]
    return score
def parse_raw_data(self, new_art):
    self.startClass = default_timer()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(new_art.body)
    stemmer = LancasterStemmer()
    article_dic = new_art.words
    global_dic = self.raw_dictionary
    for word in tokens:
        word = word.lower()
        if not self.is_stop_word(word) and not word.isnumeric():
            s_word = stemmer.stem(word)
            # s_word = word
            ## Not a stop word: check whether the word is already in the article
            ## dictionary. If yes, increment its count, else add it. When adding,
            ## also check whether it is in the big corpus; if yes, increment the
            ## count of articles containing that word.
            self.globalWordCount += 1
            new_art.doc_len = new_art.doc_len + 1
            if s_word in article_dic:
                article_dic[s_word].wrd_count += 1
                global_dic[s_word].wrd_count += 1
            else:
                article_dic[s_word] = local_word_attributes(1)
                if s_word in global_dic:
                    global_dic[s_word].art_count += 1
                    global_dic[s_word].wrd_count += 1
                else:
                    global_dic[s_word] = global_word_attributes(1, 1, 1, 0)
def preprocess(reviews):
    import nltk
    from nltk.tokenize import word_tokenize
    review_tokenized = [[word.lower() for word in word_tokenize(review.decode('utf-8'))]
                        for review in reviews]
    #print "review tokenize done"
    # remove stop words
    from nltk.corpus import stopwords
    english_stopwords = stopwords.words('english')
    review_filterd_stopwords = [[word for word in review if word not in english_stopwords]
                                for review in review_tokenized]
    #print 'remove stop words done'
    # remove punctuation
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '&', '!', '@', '#', '$', '%']
    review_filtered = [[word for word in review if word not in english_punctuations]
                       for review in review_filterd_stopwords]
    #print 'remove punctuations done'
    # stemming
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    review_stemmed = [[st.stem(word) for word in review] for review in review_filtered]
    #print 'stemming done'
    return review_stemmed
def stemming(words):
    wordsAfterStemming = []
    st = LancasterStemmer()
    for x in words:
        y = st.stem(x)
        wordsAfterStemming.append(y)
    return wordsAfterStemming
def lemmatizer_newsheadlines():
    lancaster_stemmer = LancasterStemmer()
    frl = open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/lemma1.csv", "rU")
    fr = open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/sample.csv", "rU")
    fw = open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/lemmaheadlines.csv", "w")
    for headline in fr:
        if len(headline) > 0:
            headlinelist = headline.split(",")
            if len(headlinelist) == 3:
                headlinewords = headlinelist[1].split(" ")
                print(headlinewords)
                for word in headlinewords:
                    wordcor = (((word.replace("?", "")).replace(":", "")).replace("\"", ""))
                    headlineword = (lancaster_stemmer.stem(wordcor)).lower()
                    print(headlineword)
                    # for line in frl:
                    #     crimelist = line.split(",")
                    #     crimeword = ((crimelist[1].replace("\"", "")).strip()).lower()
                    #     print(crimeword + str(i))
                    #     i += 1
                    dictcrime = lemmadict()
                    if headlineword in dictcrime:
                        print(headlineword + "yipee")
                        fw.write(headlineword + "," + headlinelist[0] + "," + headlinelist[1] + "\n")
                        break
    frl.close()
    fw.close()
    fr.close()
def stem_text(text):
    stm = LancasterStemmer()
    tokens = text.split()
    words = [stm.stem(w) for w in tokens]
    snt = " ".join(words)
    return snt
def readText(textFile):
    examples = []
    count = 0
    lexicon_en = {}
    lexicon_ge = {}
    stem_en = LancasterStemmer()
    stem_ge = nltk.stem.snowball.GermanStemmer()
    for line in open(textFile):
        count += 1
        if count % 1000 == 0:
            print count
        lans = line.lower().strip().split("|||")
        #german = [stem_ge.stem(x.decode('utf-8')) for x in lans[0].strip().split(" ")]
        german = lans[0].strip().split(" ")
        german = process(german)
        for wordx in german:
            for word in wordx:
                if word not in lexicon_ge:
                    lexicon_ge[word] = 1
                else:
                    lexicon_ge[word] += 1
        eng = [stem_en.stem(x.decode('utf-8')) for x in lans[1].strip().split(" ")]
        #parse_en = pattern.en.parse(" ".join(eng))
        eng = lans[1].strip().split(" ")
        for word in eng:
            if word not in lexicon_en:
                lexicon_en[word] = 1
            else:
                lexicon_en[word] += 1
        examples.append(Example(german, eng))
    return examples, lexicon_en, lexicon_ge
def filt(string):
    ret = string
    # Filter all punctuation from the string
    for p in punctuation:
        ret = ret.replace(p, '')
    # Replace hyphens with spaces
    ret = ret.replace('-', ' ')
    oldret = ret
    ret = ""
    # Filter all stop words from the string
    for word in oldret.split():
        if (word in allStopWords) or len(word) <= 1:
            pass
        else:
            ret += word.lower() + " "
    st = LancasterStemmer()
    stemmed = ""
    for word in ret.split():
        try:
            stemmed += str(st.stem(word)) + " "
        except UnicodeDecodeError:
            pass
    return stemmed
class VocKeyworder(BaseKeyworder):
    def __init__(self):
        super(VocKeyworder, self).__init__()
        self._vocs = engvoc.voc2000
        self._lemmatizer = WordNetLemmatizer()
        self._stemmer1 = LancasterStemmer()
        self._stemmer2 = SnowballStemmer('english')

    def add_keyword(self, gag_id, title):
        tokens = re.split(' |\.|,|;|=', title)
        for token in tokens:
            token = re.sub(r"\W+$", '', token)
            token = re.sub(r"^\W+", '', token)
            vocs = []
            try:
                token = token.encode('utf8')
                vocs.append(re.sub(r"'\w+", '', token).lower())
                vocs.append(self._lemmatizer.lemmatize(vocs[0]))
                vocs.append(self._stemmer1.stem(vocs[0]))
                vocs.append(self._stemmer2.stem(vocs[0]))
            except UnicodeDecodeError:
                continue
            if vocs[0] == '':
                continue
            try:
                float(vocs[0])
                continue
            except ValueError:
                pass
            if not any([voc in self._vocs for voc in vocs]):
                print 'voc', vocs, token
                self._add_keyword(gag_id, token)
def mapper():
    # List of fields in positional order expected in inbound forum node data.
    fieldnames = ['id', 'title', 'tag_names', 'author_id', 'body', 'node_type',
                  'parent_id', 'abs_parent_id', 'added_at', 'score', 'state_string',
                  'last_edited_id', 'last_activity_by_id', 'last_activity_at',
                  'active_revision_id', 'extra', 'extra_ref_id', 'extra_count', 'marked']
    reader = csv.DictReader(sys.stdin, delimiter='\t', fieldnames=fieldnames)
    stemmer = LancasterStemmer()
    stopw = stopwords.words('english')
    split_pattern = re.compile('[\W.!?:;"()<>[\]#$=\-/]')
    for line in reader:
        pid = line['id']
        body = line['body']
        # split body into words
        words = split_pattern.split(body)
        # map the stemmer function across all the words and use the Counter
        # to create a dict of counted stems; remove english stopwords
        stem_counts = Counter((stemmer.stem(x) for x in words if x not in stopw))
        # emit the stem, count and node id for reduction into the reverse index
        for stem, count in stem_counts.items():
            print "{stem}\t{node_id}\t{count}".format(stem=stem, node_id=pid, count=count)
class Stemmer:
    def __init__(self):
        self.st = LancasterStemmer()
        self.stop = stopwords.words('english')

    # Provides a list of stem words given a line
    def getStemmedCorpus(self, line):
        stemWords = list()
        data = line.strip().split(',')
        if len(data) < 2:
            return None
        stri = ' '.join(e for e in data[1].split(" ") if e.isalnum())
        for i in stri.split(" "):
            if self.st.stem(i) not in self.stop:
                stemWords.append(self.st.stem(i))
        return stemWords
def overlapping_text(text_1, text_2):
    st = LancasterStemmer()
    cachedStopWords = get_stopwords()
    text_1_list = [st.stem(word) for word in text_1.split() if word not in cachedStopWords]
    text_2_list = [st.stem(word) for word in text_2.split() if word not in cachedStopWords]
    return jaccard_dist(text_1_list, text_2_list)
class LemmaTokenizer(object):
    def __init__(self):
        #self.wnl = WordNetLemmatizer()
        self.stemmer = LancasterStemmer()

    def __call__(self, doc):
        #return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if re.match(r'[a-z]+', t, re.M|re.I)]
        return [self.stemmer.stem(t) for t in word_tokenize(doc)
                if re.match(r'[a-z]+', t, re.M | re.I)]
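# A callable tokenizer like LemmaTokenizer above is typically passed to a
# scikit-learn vectorizer so that documents are tokenized and Lancaster-stemmed
# before weighting. A minimal sketch, assuming scikit-learn is installed; the
# sample documents are illustrative only.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer())
X = vectorizer.fit_transform(["The cats are running.", "A dog ran home."])
print(X.shape)  # (2, number of distinct stems produced by the tokenizer)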
def preprocess(content):
    stopset = set(stopwords.words('english'))
    # replace punctuation and tags with spaces
    tokens = word_tokenize(re.sub(r'<p>|</p>|[^A-Za-z ]', ' ', content.lower()))
    pos_list = pos_tag(tokens)
    s_tokens = list()
    # nouns only (verb tags are commented out)
    for pos in pos_list:
        #print pos[1]
        #if pos[1] in ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        if pos[1] in ['NN', 'NNS']:
            s_tokens.append(pos[0])
    wordfreq = FreqDist(s_tokens)
    stemfreq = dict()
    st = LancasterStemmer()
    for word, freq in wordfreq.items():
        # stopwords
        if word in stopset:
            del wordfreq[word]
            continue
        # tiny words
        if len(word) <= 2:
            del wordfreq[word]
            continue
        # stemmer
        stem = st.stem(word)
        try:
            stemfreq[stem] += freq
        except:
            stemfreq[stem] = freq
    return stemfreq
def get_pretrained_vector(session, word2vec_model, vocab_path, vocab_size, vectors):
    print(vectors)
    with gfile.GFile(vocab_path, mode="r") as vocab_file:
        st = LancasterStemmer()
        counter = 0
        counter_w2v = 0.0
        while counter < vocab_size:
            vocab_w = vocab_file.readline().replace("\n", "")
            # vocab_w = st.stem(vocab_w)
            # For each word in the vocabulary check whether a w2v vector exists and
            # inject it; otherwise keep the randomly initialised value.
            if word2vec_model and vocab_w and word2vec_model.__contains__(vocab_w) and counter > 3:
                w2w_word_vector = word2vec_model.get_vector(vocab_w)
                print("word:%s c:%i w2v size %i" % (vocab_w, counter, w2w_word_vector.size))
                vectors[counter] = w2w_word_vector
                counter_w2v += 1
            else:
                vocab_w_st = st.stem(vocab_w)
                if word2vec_model and vocab_w_st and word2vec_model.__contains__(vocab_w_st):
                    w2w_word_vector = word2vec_model.get_vector(vocab_w_st)
                    print("st_word:%s c:%i w2v size %i" % (vocab_w_st, counter, w2w_word_vector.size))
                    vectors[counter] = w2w_word_vector
                    counter_w2v += 1
                else:
                    if not vocab_w:
                        print("no more words.")
                        break
            counter += 1
    print("injected %f per cent" % (100 * counter_w2v / counter))
    print(vectors)
    return vectors
def get_model(): ''' Function to train and load the model if the file is modified else just load the model and return it. INPUT: 1. NONE OUTPUT: 1. model(tflearn-model): DNN model ''' #call the method to check if the file was modified mod_val = Model.__check_file() if mod_val == "modified": #initialize stemmer stemmer = LancasterStemmer() #load json file with open("data/intents.json") as file: data = json.load(file) #lists to store values words = [] labels = [] docs_x = [] docs_y = [] #stemming for intent in data['intents']: for pattern in intent['patterns']: #tokenize wrds = nltk.word_tokenize(pattern) words.extend(wrds) docs_x.append(wrds) docs_y.append(intent['tag']) if intent['tag'] not in labels: labels.append(intent['tag']) #save the stemmed words and associated labels words = [stemmer.stem(w.lower()) for w in words if w != "?"] words = sorted(list(set(words))) labels = sorted(labels) #create bag of words training = [] output = [] #list with initial 0s out_empty = [0 for _ in range(len(labels))] #loop through doc_X for x, doc in enumerate(docs_x): bag = [] wrds = [stemmer.stem(w) for w in doc] for w in words: if w in wrds: bag.append(1) else: bag.append(0) out_row = out_empty[:] out_row[labels.index(docs_y[x])] = 1 training.append(bag) output.append(out_row) training = np.array(training) output = np.array(output) #save the files with open("data/model_data.pkl", "wb") as f: pickle.dump((words, labels, training, output), f) #train the model #get the dnn files words, labels, training, output = Model.__dnn_files() #clear the default graph stack and resets the global default graph. tf.compat.v1.reset_default_graph() #layers net = tflearn.input_data(shape = [None, len(training[0])]) net = tflearn.fully_connected(net, 8) net = tflearn.fully_connected(net, 8) net = tflearn.fully_connected(net, len(output[0]), activation = "softmax") net = tflearn.regression(net) #DNN Model model = tflearn.models.dnn.DNN(net) #train and save the model if the file is modified if mod_val == "modified": model.fit(training, output, batch_size = 8, n_epoch = 1000, show_metric = True) #save the model model.save("model/model.tflearn") #load the model model.load("model/model.tflearn") return model, words, labels
def test_view(db, view, options): #Getting the clusters data view_id = options.viewID query_id = options.queryID bu_id = options.bu collection = db[settings.get('Potential_CFD', 'proj_cluster')] cursor = collection.find({}) clusters = pd.DataFrame(list(cursor)) project_clusters = [] groups = clusters.groupby('Cluster') for name, group in groups: project_clusters.append(list(group['Project'])) print(project_clusters) #Fetch the data from the respective collection if (view): vi_col_name = settings.get( 'Potential_CFD', 'viewPrefix') + str(view_id) + '_' + str(query_id) tr_col_name = settings.get('Potential_CFD', 'trainPrefix') else: vi_col_name = settings.get( 'Potential_CFD', 'viewPrefix') + str(bu_id) + '_' + str(query_id) tr_col_name = settings.get('Potential_CFD', 'trainPrefix') collection = db[vi_col_name] print(vi_col_name) cursor = collection.find({}) test_df = pd.DataFrame(list(cursor)) if (test_df.shape[0] == 0): return if (options.cfd == "N"): test_df = test_df[test_df['CFD_INDIC'] == 0] if (test_df.shape[0] == 0): return req_cluster = list(test_df['PROJECT'].unique()) print(req_cluster) if (len(req_cluster) > 2): req_cluster = test_df['PROJECT'].value_counts().nlargest( 1).index.tolist() print(req_cluster) print(test_df.shape[0]) #Get the cluster number if it exists, else create new cluster status = False for a in [ 'CSC.sys-doc', 'CSC.autons', 'CSC.asics', 'CSC.hw', 'CSC.general', 'CSC.voice' ]: if a in req_cluster: req_cluster.remove(a) if req_cluster in project_clusters: status = True p = 0 cluster_id = 0 f_c = [] for cluster in project_clusters: p = p + 1 if set(req_cluster).issubset(cluster): cluster_id = p f_c = cluster status = True te_col_name = settings.get('Potential_CFD', 'testPrefix') + str(cluster_id) #status = True #cluster_id = 3 print(cluster_id) print(status) if (status == True): #Fetching the cut_off print("In test_view printing cutoff" + str(options.cutoff)) if (options.cutoff): cut_off = float(options.cutoff) else: collection = db[settings.get('Potential_CFD', 'testPrefix') + str(cluster_id)] cursor = collection.find({}) df = pd.DataFrame(list(cursor)) fpr, tpr, thresholds = roc_curve(df['IFD_CFD_INDIC'], df['Final_prediction']) roc_auc = auc(fpr, tpr) i = np.arange(len(tpr)) roc = pd.DataFrame({ 'fpr': pd.Series(fpr, index=i), 'tpr': pd.Series(tpr, index=i), '1-fpr': pd.Series(1 - fpr, index=i), 'tf': pd.Series(tpr - (1 - fpr), index=i), 'thresholds': pd.Series(thresholds, index=i) }) r = roc.ix[(roc.tf - 0).abs().argsort()[:1]] cut_off = list(r['thresholds'])[0] / 100 print(cut_off) # if(options.cutOff != ""): # cut_off = int(options.cutOff)/100 # #cut_off = 0.5 #print(cut_off) #del[df] #Get all the saved model paths model1 = str(settings.get("Potential_CFD", "temp_path_mod_potCFD") ) + '/cluster' + str(cluster_id) + '_' + str( settings.get("Potential_CFD", "xgboost_model")) model2 = str(settings.get("Potential_CFD", "temp_path_mod_potCFD") ) + '/cluster' + str(cluster_id) + '_' + str( settings.get("Potential_CFD", "cnn_lstm_model")) model3 = str(settings.get( "Potential_CFD", "temp_path_mod_potCFD")) + '/' + str( settings.get( "Potential_CFD", "dnn_model")) + '_cluster' + str(cluster_id) + '_ticketCNT' model4 = str(settings.get( "Potential_CFD", "temp_path_mod_potCFD")) + '/' + str( settings.get( "Potential_CFD", "dnn_model")) + '_cluster' + str(cluster_id) + '_days' feature_columns_to_use = [ 'DE_MANAGER_USERID', 'SEVERITY_CODE', 'LIFECYCLE_STATE_CODE', 'PROJECT', 'PRODUCT', 'COMPONENT', 'ENGINEER', 'SUBMITTER_ID', 'AGE', 'FEATURE', 'RELEASE_NOTE', 
'SA_ATTACHMENT_INDIC', 'CR_ATTACHMENT_INDIC', 'UT_ATTACHMENT_INDIC', 'IMPACT', 'ORIGIN', 'IS_CUSTOMER_VISIBLE', 'INCOMING_INDIC', 'BACKLOG_INDIC', 'DISPOSED_INDIC', 'TS_INDIC', 'SS_INDIC', 'OIB_INDIC', 'STATE_ASSIGN_INDIC', 'STATE_CLOSE_INDIC', 'STATE_DUPLICATE_INDIC', 'STATE_FORWARD_INDIC', 'STATE_HELD_INDIC', 'STATE_INFO_INDIC', 'STATE_JUNK_INDIC', 'STATE_MORE_INDIC', 'STATE_NEW_INDIC', 'STATE_OPEN_INDIC', 'STATE_POSTPONE_INDIC', 'STATE_RESOLVE_INDIC', 'STATE_SUBMIT_INDIC', 'STATE_UNREP_INDIC', 'STATE_VERIFY_INDIC', 'STATE_WAIT_INDIC', 'CFR_INDIC', 'S12RD_INDIC', 'S123RD_INDIC', 'MISSING_SS_EVAL_INDIC', 'S123_INDIC', 'S12_INDIC', 'RNE_INDIC', 'UPDATED_BY', 'DEV_ESCAPE_ACTIVITY', 'RELEASED_CODE', 'TEST_EDP_ACTIVITY', 'TEST_EDP_PHASE', 'RESOLVER_ANALYSIS_INDIC', 'SUBMITTER_ANALYSIS_INDIC', 'EDP_ANALYSIS_INDIC', 'RETI_ANALYSIS_INDIC', 'DESIGN_REVIEW_ESCAPE_INDIC', 'STATIC_ANALYSIS_ESCAPE_INDIC', 'FUNC_TEST_ESCAPE_INDIC', 'SELECT_REG_ESCAPE_INDIC', 'CODE_REVIEW_ESCAPE_INDIC', 'UNIT_TEST_ESCAPE_INDIC', 'DEV_ESCAPE_INDIC', 'FEATURE_TEST_ESCAPE_INDIC', 'REG_TEST_ESCAPE_INDIC', 'SYSTEM_TEST_ESCAPE_INDIC', 'SOLUTION_TEST_ESCAPE_INDIC', 'INT_TEST_ESCAPE_INDIC', 'GO_TEST_ESCAPE_INDIC', 'COMPLETE_ESCAPE_INDIC', 'SR_CNT', 'PSIRT_INDIC', 'BADCODEFLAG', 'RISK_OWNER', 'SIR', 'PSIRT_FLAG', 'URC_DISPOSED_INDIC', 'CLOSED_DISPOSED_INDIC', 'REGRESSION_BUG_FLAG' ] nonnumeric_columns = [ 'DE_MANAGER_USERID', 'LIFECYCLE_STATE_CODE', 'PROJECT', 'PRODUCT', 'COMPONENT', 'ENGINEER', 'SUBMITTER_ID', 'FEATURE', 'RELEASE_NOTE', 'IMPACT', 'ORIGIN', 'IS_CUSTOMER_VISIBLE', 'INCOMING_INDIC', 'BACKLOG_INDIC', 'DISPOSED_INDIC', 'UPDATED_BY', 'DEV_ESCAPE_ACTIVITY', 'RELEASED_CODE', 'TEST_EDP_ACTIVITY', 'TEST_EDP_PHASE', 'BADCODEFLAG', 'RISK_OWNER', 'SIR', 'PSIRT_FLAG', 'REGRESSION_BUG_FLAG' ] big_X = test_df[feature_columns_to_use] big_X = big_X.replace(np.nan, '', regex=True) big_X_imputed = DataFrameImputer().fit_transform(big_X) le = LabelEncoder() big_X_imputed["COMPONENT"] = big_X_imputed["COMPONENT"].astype(str) big_X_imputed["PRODUCT"] = big_X_imputed["PRODUCT"].astype(str) big_X_imputed["SUBMITTER_ID"] = big_X_imputed["SUBMITTER_ID"].astype( str) for feature in nonnumeric_columns: big_X_imputed[feature] = big_X_imputed[feature].astype(str) big_X_imputed[feature] = le.fit_transform(big_X_imputed[feature]) thefile = str(settings.get( "Potential_CFD", "temp_path_mod_potCFD")) + '/' + settings.get( 'Potential_CFD', 'potCFD_features') + str(cluster_id) + '.txt' with open(thefile, 'rb') as fp: feature_indices = pickle.load(fp) big_X_imputed = big_X_imputed.iloc[:, feature_indices] test_X = big_X_imputed.as_matrix() with open(model1, 'rb') as f: clf = pickle.load(f) test_X[test_X == ''] = 0 #print(test_X[3090:3100]) test_probs = clf.predict_proba(test_X)[:, 1] print("Model 1 ran") test_df["Prediction"] = test_probs ##################################SECOND MODEL################################ top_words = 10000 test_data = test_df[["ENCL-Description", "Headline", "ATTRIBUTE"]] stemmer = LancasterStemmer() i = 0 test_data['ATTRIBUTE'] = test_data["ATTRIBUTE"].replace(np.nan, ' ') test_data['Headline'] = test_data["Headline"].replace(np.nan, ' ') test_data["complete"] = test_data["ENCL-Description"].astype( str) + test_data["Headline"].astype( str) + " " + test_data["ATTRIBUTE"].astype(str) thefile = str(settings.get("Potential_CFD", "temp_path_mod_potCFD") ) + '/top_words_cluster_' + str(cluster_id) + '.txt' with open(thefile, 'rb') as fp: top_words = pickle.load(fp) f = str(settings.get("Potential_CFD", 
"temp_path_mod_potCFD") ) + '/indexes_cluster_' + str(cluster_id) + '.json' indexes = json.load(open(f, 'r')) testing_data = [] i = 0 for text in test_data["complete"]: #print(i) i = i + 1 text_list = [] if (not (pd.isnull(text))): for word in nltk.word_tokenize(text): if word.lower() not in [ "?", "'s", ">", "<", ",", ":", "'", "''", "--", "`", "``", "...", "", "!", "#", '"', '$', '%', '&', '(', ')', '*', '+', '-', '.', '/', ';', '=', '@', '[', '\\', ']', '^', '_', '{', '}', '|', '~', '\t', '\n', '' ] and '*' not in word.lower() and '=' not in word.lower( ) and '++' not in word.lower() and '___' not in word.lower( ) and (not word.isdigit()) and word.lower( ) not in stop_words and (len(word) > 1): stemmed_word = stemmer.stem(word.lower()) if stemmed_word not in top_words: text_list.append(0) else: text_list.append(indexes[stemmed_word]) testing_data.append(text_list) max_text_length = 150 X_test = sequence.pad_sequences(testing_data, maxlen=max_text_length) model = load_model(model2) prediction = model.predict(X_test) print("Model 2 ran") test_df["test_pred"] = prediction test_df["Final_prediction"] = stacking_test(test_df, cluster_id) ##############################Model3############################## print("Starting model 3") print(test_df[['Final_prediction', 'test_pred']]) print(cut_off) test_df1 = test_df[test_df['test_pred'] >= float(cut_off)] #CHnage it back to Final_prediction #print(test_df1) if (test_df1.shape[0] > 0): #test_df1['month_created'] = pd.to_datetime(test_df1['SUBMITTED_DATE']).dt.month #test_df1['year_created'] = pd.to_datetime(test_df1['SUBMITTED_DATE']).dt.year test_df1['COMPONENT'] = test_df1['COMPONENT'].astype(str) test_df1['PRODUCT'] = test_df1['PRODUCT'].astype(str) test_df1['SEVERITY_CODE'] = test_df1['SEVERITY_CODE'].astype(str) test_df1['SS_INDIC'] = test_df1['SS_INDIC'].astype(str) test_df1['TS_INDIC'] = test_df1['TS_INDIC'].astype(str) thefile = str(settings.get( "Potential_CFD", "temp_path_mod_potCFD")) + '/' + settings.get( 'Potential_CFD', 'potCFD_features') + 'dnn_' + str(cluster_id) + '.txt' with open(thefile, 'rb') as fp: new_feature_columns_to_use = pickle.load(fp) feature_columns_to_use = new_feature_columns_to_use #+ ['month_created', 'year_created'] categorical_features = new_feature_columns_to_use continuous_features = [] #['month_created', 'year_created'] for feature in categorical_features: test_df1[feature] = test_df1[feature].astype(str) new_test_df = test_df1[feature_columns_to_use] engineered_features = [] for continuous_feature in continuous_features: engineered_features.append( tf.contrib.layers.real_valued_column(continuous_feature)) for categorical_feature in categorical_features: sparse_column = tf.contrib.layers.sparse_column_with_hash_bucket( categorical_feature, hash_bucket_size=1000) engineered_features.append( tf.contrib.layers.embedding_column( sparse_id_column=sparse_column, dimension=16, combiner="sum")) regressor2 = tf.contrib.learn.DNNRegressor( feature_columns=engineered_features, hidden_units=[64, 32, 10], model_dir=model3) #TensorFlow input functions for Text Analysis def input_fn(df, training=True): continuous_cols = { k: tf.constant(df[k].values) for k in continuous_features } categorical_cols = { k: tf.SparseTensor(indices=[[i, 0] for i in range(df[k].size)], values=df[k].values, dense_shape=[df[k].size, 1]) for k in categorical_features } feature_cols = dict( list(continuous_cols.items()) + list(categorical_cols.items())) if training: label = tf.constant(df[LABEL_COLUMN].values) return feature_cols, label return 
feature_cols def train_input_fn(): return input_fn(train_df1) def eval_input_fn(): return input_fn(evaluate_df) def test_input_fn(): return input_fn(new_test_df, False) #Predicting SR tickets predicted_output = regressor2.predict( input_fn=test_input_fn) #input_fn(new_test_df, False)) test_df1['Ticket_Predictions'] = list(predicted_output) #Predicting days ahead regressor2 = tf.contrib.learn.DNNRegressor( feature_columns=engineered_features, hidden_units=[64, 32, 10], model_dir=model4) predicted_output = regressor2.predict( input_fn=test_input_fn) #input_fn(new_test_df, False)) test_df1['Days_Predictions'] = list(predicted_output) now = datetime.datetime.now() test_df1[test_df1['Ticket_Predictions'] == 0]['Ticket_Predictions'] = 1 test_df1[ test_df1['Ticket_Predictions'] < 0]['Ticket_Predictions'] = 0 test_df1[test_df1['Days_Predictions'] < 0]['Days_Predictions'] = 0 #test_df1['days_ahead'] = (pd.to_datetime(test_df1['SUBMITTED_DATE']) - now)/np.timedelta64(1, 'D') + test_df1['Days_Predictions'] test_df2 = test_df[[ 'IDENTIFIER', 'LIFECYCLE_STATE_CODE', 'DISPOSED_INDIC', 'CFD_INDIC', 'AGE', 'ATTRIBUTE', 'COMPONENT', 'DE_MANAGER_USERID', 'ENCL-Description', 'ENGINEER', 'Headline', 'IMPACT', 'PRIORITY_CODE', 'PRODUCT', 'PROJECT', 'SS_INDIC', 'TS_INDIC', 'SEVERITY_CODE', 'SUBMITTED_DATE', 'SUBMITTER_ID', 'TICKETS_COUNT', 'VERSION_TEXT', 'IFD_CFD_INDIC', 'Prediction', 'test_pred', 'Final_prediction' ]] #test_df2 = test_df[['IDENTIFIER', 'LIFECYCLE_STATE_CODE', 'DISPOSED_INDIC', 'CFD_INDIC', 'Prediction', 'test_pred', 'Final_prediction']] test_df3 = test_df1[[ 'IDENTIFIER', 'Ticket_Predictions', 'Days_Predictions' ]] final_test_df = pd.DataFrame() final_test_df = test_df2.join(test_df3.set_index('IDENTIFIER'), on='IDENTIFIER') final_test_df = final_test_df.drop_duplicates('IDENTIFIER') final_test_df['Prediction'] = final_test_df['Prediction'] * 100 final_test_df['Final_prediction'] = final_test_df[ 'test_pred'] * 100 #Change it back to Final_prediction final_test_df['test_pred'] = final_test_df['test_pred'] * 100 final_test_df['days_ahead'] = (pd.to_datetime( final_test_df['SUBMITTED_DATE']) - now) / np.timedelta64( 1, 'D') + final_test_df['Days_Predictions'] final_test_df['Cluster'] = cluster_id final_test_df['last_run_date'] = now.strftime("%Y-%m-%d") final_test_df = final_test_df[ final_test_df['test_pred'] >= cut_off * 100] #CHnage it back to Final_prediction print(final_test_df.shape) #print(test_df1.shape) #Inserting data to view results collection if (view): vi_col_name_results = settings.get( 'Potential_CFD', 'viewPrefix') + str(view_id) + '_' + str( query_id) + '_results' collection = db[vi_col_name_results] else: vi_col_name_results = settings.get( 'Potential_CFD', 'viewPrefix') + str(bu_id) + '_' + str( query_id) + '_results' collection = db[vi_col_name_results] records = json2.loads( final_test_df.T.to_json(date_format='iso')).values() collection.create_index([("IDENTIFIER", pymongo.ASCENDING), ("last_run_date", pymongo.ASCENDING)], unique=True) print(collection.index_information()) try: collection.insert(records) print("Inserted data to results collection") except pymongo.errors.DuplicateKeyError: print("Duplicates records in collection, so not inserting...") #Inserting data to View Mapper collection collection = db[settings.get('Potential_CFD', 'Pot_cfd_viewCluster')] df = pd.DataFrame(columns=[ 'viewSetCollectionName', 'trainedOnCollectionName', 'testCollectionName', 'clusterId', 'viewId', 'queryId', 'BU', 'projectList', 'csap_last_run_date', 'cutoff' ]) proj_list = ",".join(f_c) 
dat = now.strftime("%Y-%m-%d") #print(dat) if (view): df.loc[0] = [ vi_col_name_results, tr_col_name, te_col_name, int(cluster_id), int(view_id), int(query_id), bu_id, proj_list, dat, float(cut_off * 100) ] else: print("here") df.loc[0] = [ vi_col_name_results, tr_col_name, te_col_name, int(cluster_id), view_id, int(query_id), str(bu_id), proj_list, dat, float(cut_off * 100) ] records = json2.loads(df.T.to_json(date_format='iso')).values() collection.insert(records) print("Inserted data to View mapper collection") else: print("No predicted CFDs in this ViewSet")
words = []
labels = []
docs_x = []
docs_y = []

for intent in data["intents"]:
    for pattern in intent["patterns"]:
        wrds = nltk.word_tokenize(pattern)
        words.extend(wrds)
        docs_x.append(wrds)
        docs_y.append(intent["tag"])

    if intent["tag"] not in labels:
        labels.append(intent["tag"])

words = [stemmer.stem(w.lower()) for w in words if w not in "?"]
words = sorted(list(set(words)))
labels = sorted(labels)

training = []
output = []
out_empty = [0 for _ in range(len(labels))]

for x, doc in enumerate(docs_x):
    bag = []
    wrds = [stemmer.stem(w) for w in doc]
    for w in words:
        docs_y.append(intent["tag"])

    if intent["tag"] not in labels:
        labels.append(intent["tag"])

for intent2 in data["intents"]:
    for pattern2 in intent2["patterns"]:
        wrds2 = nltk.word_tokenize(pattern2)
        words2.extend(wrds2)
        docs_x2.append(wrds2)
        docs_y2.append(intent2["tag"])

    if intent2["tag"] not in labels2:
        labels2.append(intent2["tag"])

words = [stemmer.stem(w.lower()) for w in words if w != "?"]
words = sorted(list(set(words)))

words2 = [stemmer.stem(w2.lower()) for w2 in words2 if w2 != "?"]
words2 = sorted(list(set(words2)))

labels = sorted(labels)
labels2 = sorted(labels2)

training = []
output = []
training2 = []
output2 = []
words = []
labels = []
docs_x = []
docs_y = []

for command in data["commands"]:
    for pattern in command["patterns"]:
        words_list = nltk.word_tokenize(pattern)
        words.extend(words_list)
        docs_x.append(words_list)
        docs_y.append(command["tag"])

    if command["tag"] not in labels:
        labels.append(command["tag"])

words = [stemmer.stem(w.lower()) for w in words if w not in ("?", "!")]
words = sorted(list(set(words)))
labels = sorted(labels)

training = []
output = []
out_empty = [0 for _ in range(len(labels))]

for x, doc in enumerate(docs_x):
    bag = []
    words_list = [stemmer.stem(w) for w in doc]
    for w in words:
# Create various stemmer objects
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

# Create a list of stemmer names for display
stemmer_names = ['PORTER', 'LANCASTER', 'SNOWBALL']
formatted_text = '{:>16}' * (len(stemmer_names) + 1)
print('\n', formatted_text.format('INPUT WORD', *stemmer_names), '\n', '=' * 68)

# Stem each word and display the output
for word in input_words:
    output = [word, porter.stem(word), lancaster.stem(word), snowball.stem(word)]
    print(formatted_text.format(*output))

# Chunking: dividing input data into chunks. This is not the same as
# tokenization, since chunks are meant to be meaningful pieces of text.
import nltk
nltk.download('brown')
import numpy as np
from nltk.corpus import brown

# Split the input text into chunks, where each chunk contains N words
def chunker(input_data, N):
    input_words = input_data.split(' ')
    output = []
    cur_chunk = []
    count = 0
def ProcessData(data, train=False): check_tokenizer() if (train == False): try: with open(pickle_path, "rb") as f: words, labels, training, output = pickle.load(f) print("Loaded stemmed data from pickle") except: print("Stemming data from the intents.json file") stemmer = LancasterStemmer() words = [] labels = [] docs_X = [] docs_y = [] with open(classes_path, "w") as f: f.write("") for intent in data["intents"]: for pattern in intent["patterns"]: wrds = nltk.word_tokenize(pattern) words.extend(wrds) docs_X.append(wrds) docs_y.append(intent["tag"]) if intent["tag"] not in labels: labels.append(intent["tag"]) with open(classes_path, "a") as f: f.write(intent["tag"] + "\n") #List of non-redundant words the model has seen words = [stemmer.stem(w.lower()) for w in words if w != "?"] words = np.array(words) words = sorted(np.unique(words)) #labels (sorted) labels = sorted(labels) training = [] output = [] out_empty = [0 for _ in range(len(labels))] for x, doc in enumerate(docs_X): bag = np.array([]) wrds = [stemmer.stem(w.lower()) for w in doc if w != "?"] for w in words: if w in wrds: bag = np.append(bag, np.array([1])) else: bag = np.append(bag, np.array([0])) output_row = out_empty[:] output_row[labels.index(docs_y[x])] = 1 training.append(bag) output.append(np.argmax(output_row)) #into np arrays training = np.asarray(training) output = np.asarray(output) with open(pickle_path, "wb") as f: print("Stemmed data saved in pickle file...") pickle.dump((words, labels, training, output), f) else: with open(classes_path, "w") as f: f.write("") print("Stemming data from the intents.json file") stemmer = LancasterStemmer() words = [] labels = [] docs_X = [] docs_y = [] for intent in data["intents"]: for pattern in intent["patterns"]: wrds = nltk.word_tokenize(pattern) words.extend(wrds) docs_X.append(wrds) docs_y.append(intent["tag"]) if intent["tag"] not in labels: labels.append(intent["tag"]) with open(classes_path, "a") as f: f.write(intent["tag"] + "\n") #List of non-redundant words the model has seen words = [stemmer.stem(w.lower()) for w in words if w != "?"] words = np.array(words) words = sorted(np.unique(words)) #labels (sorted) labels = sorted(labels) training = [] output = [] out_empty = [0 for _ in range(len(labels))] for x, doc in enumerate(docs_X): bag = np.array([]) wrds = [stemmer.stem(w.lower()) for w in doc if w != "?"] for w in words: if w in wrds: bag = np.append(bag, np.array([1])) else: bag = np.append(bag, np.array([0])) output_row = out_empty[:] output_row[labels.index(docs_y[x])] = 1 training.append(bag) output.append(np.argmax(output_row)) #into np arrays training = np.asarray(training) output = np.asarray(output) with open(pickle_path, "wb") as f: print("Stemmed data saved in pickle file...") pickle.dump((words, labels, training, output), f) return words, labels, training, output
text2.concordance("monstrous")
text2.similar("monstrous")
text2.common_contexts(['monstrous', 'very'])

text4.dispersion_plot(['citizens', 'democracy', 'duties', 'America'])
text1.dispersion_plot(['happy', 'sad'])

text = "You have to ask yourself one question: Do I feel lucky? Well do ya, punk?"
sents = sent_tokenize(text)
print(sents)

words = [word_tokenize(sent) for sent in sents]
customStopWords = set(stopwords.words('english') + list(punctuation))
wordsWOStopwords = [word for word in word_tokenize(text) if word not in customStopWords]
print(wordsWOStopwords)

text2 = "this is the end of the world as we know it, and I feel fine!!!."
st = LancasterStemmer()
stemmedWords = [st.stem(word) for word in word_tokenize(text2)]
print(stemmedWords)

nltk.pos_tag(word_tokenize(text2))
documents = []
ignore_words = ['?']
# loop through each sentence in our training data
for pattern in training_data:
    # tokenize each word in the sentence
    w = nltk.word_tokenize(pattern['sentence'])
    # add to our words list
    words.extend(w)
    # add to documents in our corpus
    documents.append((w, pattern['class']))
    # add to our classes list
    if pattern['class'] not in classes:
        classes.append(pattern['class'])

# stem and lower each word and remove duplicates
words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
words = list(set(words))  # remove duplicates
classes = list(set(classes))

print(len(documents), "documents")
print(len(classes), "classes", classes)
print(len(words), "unique stemmed words", words)

# create our training data
training = []
output = []
# create an empty array for our output
output_empty = [0] * len(classes)
# Stemming

# Using the Porter algorithm
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
porter_stemmer.stem('maximum')
porter_stemmer.stem('presumably')
porter_stemmer.stem('multiply')
porter_stemmer.stem('provision')
porter_stemmer.stem('owed')
porter_stemmer.stem('ear')
porter_stemmer.stem('saying')

# Using the Lancaster algorithm
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
lancaster_stemmer.stem('maximum')
lancaster_stemmer.stem('presumably')
lancaster_stemmer.stem('multiply')
lancaster_stemmer.stem('provision')
lancaster_stemmer.stem('owed')
lancaster_stemmer.stem('ear')
lancaster_stemmer.stem('saying')

# Snowball stemmer
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')
snowball_stemmer.stem('maximum')
snowball_stemmer.stem('presumably')
snowball_stemmer.stem('multiply')
snowball_stemmer.stem('provision')
snowball_stemmer.stem('owed')
class Scrape:
    def __init__(self, file_name, tags, remove_hyphen=None, remove_apostrophe=None,
                 remove_stop_words=None):
        self.tags = tags
        with open(file_name, 'r') as file_handle:
            self.html = ''.join(file_handle.readlines())
        self.soup = BeautifulSoup(self.html, "html.parser")
        self.remove_hyphen = True if remove_hyphen is None else remove_hyphen
        self.remove_apostrophe = True if remove_apostrophe is None else remove_apostrophe
        self.remove_stop_words = True if remove_stop_words is None else remove_stop_words
        self.terms = []
        self.token_counter = 1
        self.stemmer = LancasterStemmer()
        self.doc_id = file_name.replace('WEBPAGES_RAW/', '')

    def parse_content(self, document):
        # Get a list of strings associated with each tag
        for x in self.soup.find_all(self.tags):
            tag = x.name
            if tag != 'p':
                tokens = self.tokenize(x.string)
            else:
                tokens = self.tokenize(x.text)
            if tokens is not None:
                for token in tokens:
                    # self.terms.append(Term(doc=document, term=token, tag_type=tag, position=self.token_counter))
                    self.terms.append({
                        'doc': self.doc_id,
                        'term': token,
                        'tag_type': tag,
                        'position': self.token_counter
                    })
                    self.token_counter += 1
        return self.terms

    def tokenize(self, text):
        # Check that the text is a string value and not None
        if text is None:
            return None
        # Convert to lowercase
        text = text.lower()
        # Remove hyphens and apostrophes if needed
        if self.remove_hyphen is True:
            text = text.replace('-', '')
        if self.remove_apostrophe is True:
            text = text.replace('\'', '')
        # Find all the alphanumeric tokens
        line_tokens = re.findall('\w+', text)
        # Stem each token (comment out the following line to disable stemming)
        line_tokens = [self.stemmer.stem(token) for token in line_tokens]
        if self.remove_stop_words is True:
            line_tokens = filter(
                lambda token: token not in stopwords.words('english'),
                line_tokens)
        return line_tokens
def stem(word):
    """Return the stemmed word using a Lancaster stemmer."""
    st = LancasterStemmer()
    return st.stem(word)
def stem_wordify(text):
    sw = nltk.corpus.stopwords.words('english')
    st = LancasterStemmer()
    return map(lambda x: st.stem(x),
               filter(lambda y: not (y in sw),
                      map(lambda z: st.stem(z), wordify(text))))
class PreProcessor: # nl_features : Sentences # text_labels : Python namespaces for the file to be mapped for respective sentence def __init__(self, nl_features, text_labels, stem_blacklist_words): self._nl_features = nl_features self._text_labels = text_labels self._stem_blacklist_words = stem_blacklist_words self._word_list = [] self._unique_labels_list = [] self._nl_feature_label_map = [] self._stemmer = LancasterStemmer() self._features = [] self._labels = [] def tokenize_and_stem(self): for i in range(0, len(self._nl_features)): nl_feature = self._nl_features[i] text_label = self._text_labels[i] tokenized_feature = nltk.word_tokenize(nl_feature) self._word_list.extend(tokenized_feature) self._nl_feature_label_map.append((tokenized_feature, text_label)) if text_label not in self._unique_labels_list: self._unique_labels_list.append(text_label) self._word_list = [self._stemmer.stem(word.lower()) for word in self._word_list if word not in self._stem_blacklist_words] self._word_list = list(set(self._word_list)) self._unique_labels_list = list(set(self._unique_labels_list)) def convert_to_patterns(self): encoded_label_template = [0] * len(self._unique_labels_list) for example in self._nl_feature_label_map: words_in_example = example[0] # Get the word list of the example label_of_example = example[1] # Get the label of the example words_in_example = [self._stemmer.stem(word_in_example.lower()) for word_in_example in words_in_example] # Stem each word pattern_for_words = [] for unique_word in self._word_list: if unique_word in words_in_example: pattern_for_words.append(1) else: pattern_for_words.append(0) encoded_label = list(encoded_label_template) encoded_label[self._unique_labels_list.index(label_of_example)] = 1 self._features.append(pattern_for_words) self._labels.append(encoded_label) def get_text_labels(self): return self._text_labels def get_unique_text_labels(self): return list(set(self._text_labels)) def get_processed_features_and_labels(self): return self._features, self._labels def get_processed_features(self): return self._features def get_processed_labels(self): return self._labels def get_feature_length(self): return len(self._word_list) def get_unique_label_count(self): return len(self._unique_labels_list) def get_unique_wordlist(self): return self._word_list def validate_tasks_directory(self, tasks_directory_path): if os.path.exists(tasks_directory_path): structure_definitions_filepath = tasks_directory_path + "/" + consts.TASKS_STRUCT_FILE_FILENAME if os.path.exists(structure_definitions_filepath): structure_definitions_file_data = open(structure_definitions_filepath).read() structure_definitions_file_data = json.loads(structure_definitions_file_data) if all(basic_prop in structure_definitions_file_data for basic_prop in consts.TASKS_STRUCT_FILE_BASIC_PROPERTY_KEYS): all_executors = structure_definitions_file_data[consts.TASKS_STRUCT_FILE_PROP_EXECUTORS] for executor in all_executors: executor_folder = executor[consts.TASKS_STRUCT_FILE_PROP_EXECUTORS_NAMESPACE] executor_folder_path = tasks_directory_path + "/" + executor_folder if os.path.exists(executor_folder_path): executor_name = executor[consts.TASKS_STRUCT_FILE_PROP_EXECUTORS_CLASS] executor_file_name = executor_name + ".py" executor_file_path = executor_folder_path + "/" + executor_file_name if not os.path.exists(executor_file_path): return False, "Class File: " + executor_name + " cannot be found inside the Namespace Folder: " + executor_folder + "." 
else: return False, "Namespace folder: " + executor_folder + " not found." return True, "Success" else: return False, "One or more of the following required entries are not found in the " + consts.TASKS_STRUCT_FILE_FILENAME + ".\n" + consts.TASKS_STRUCT_FILE_PROP_DEP_DIRS + ", " + consts.TASKS_STRUCT_FILE_PROP_EXECUTORS else: return False, consts.TASKS_STRUCT_FILE_FILENAME + " does not exist inside " + tasks_directory_path +".\nPlease make sure the tasks executors folder path you've given contains valid TaskExecutors." else: return False, tasks_directory_path + " is an invalid directory." @staticmethod def get_sentence_patterns(sentence, word_list): tokenized_words = nltk.word_tokenize(sentence) stemmer = LancasterStemmer() stemmed_words = [stemmer.stem(tokenized_word.lower()) for tokenized_word in tokenized_words] patterns = [0] * len(word_list) for stemmed_word in stemmed_words: for index, trainingSet_word in enumerate(word_list): if stemmed_word == trainingSet_word: patterns[index] = 1 return patterns
        words, labels, training, output = pickle.load(f)
except:
    words = []
    labels = []
    doc_x = []
    doc_y = []

    for value in data['intents']:
        for pattern in value['patterns']:
            wrds = nltk.word_tokenize(pattern)
            words.extend(wrds)
            doc_x.append(wrds)
            doc_y.append(value['tag'])

        if value['tag'] not in labels:
            labels.append(value['tag'])

    words = [stemmer.stem(x.lower()) for x in words if x not in '?']
    words = sorted(list(set(words)))
    labels = sorted(labels)

    training = []
    output = []
    output_empty = [0 for _ in range(len(labels))]

    for x, y in enumerate(doc_x):
        bag = []
        wrds = [stemmer.stem(w) for w in y]
        for w in words:
            if w in wrds:
                bag.append(1)
import nltk
from nltk.stem.lancaster import LancasterStemmer

stri = LancasterStemmer()
print(stri.stem('achievement'))
class Naive_bayes():
    def __init__(self):
        self.stemmer = LancasterStemmer()
        self.training_data = []
        self.base_path = "../../../conversation/"
        # Read in the data files
        for filename in os.listdir(self.base_path):
            self.read_data(filename)
        self.corpus_words = {}
        self.class_words = {}
        self.classes = list(set([a['class'] for a in self.training_data]))
        for c in self.classes:
            self.class_words[c] = []
        self.extract_data()

    def read_data(self, filename):
        with open(self.base_path + filename, encoding="utf-8") as f:
            while True:
                line = f.readline().strip()
                if not line:
                    break
                self.training_data.append({
                    "class": filename,
                    "sentence": line
                })

    def extract_data(self):
        for data in self.training_data:
            # Tokenize each sentence into words
            for word in nltk.word_tokenize(data['sentence']):
                if word not in ["?", "'s"]:
                    # stem and lowercase each word
                    stemmed_word = self.stemmer.stem(word.lower())
                    # Check whether the word has already been seen
                    if stemmed_word not in self.corpus_words:
                        self.corpus_words[stemmed_word] = 1
                    else:
                        self.corpus_words[stemmed_word] += 1
                    # Add the word to the class list
                    self.class_words[data['class']].extend([stemmed_word])

    def calculate_class_score_commonality(self, sentence, class_name, show_details=True):
        score = 0
        # Tokenize each word in the new sentence
        for word in nltk.word_tokenize(sentence):
            # Check whether the word's stem belongs to the class
            if self.stemmer.stem(word.lower()) in self.class_words[class_name]:
                # Add the relative weight
                score += (1 / self.corpus_words[self.stemmer.stem(word.lower())])
                if show_details:
                    print(" match: %s (%s)" % (
                        self.stemmer.stem(word.lower()),
                        1 / self.corpus_words[self.stemmer.stem(word.lower())]))
        return score

    def classify(self, sentence):
        high_class = None
        high_score = 0
        for c in self.class_words.keys():
            score = self.calculate_class_score_commonality(sentence, c, show_details=False)
            if score > high_score:
                high_class = c
                high_score = score
        return high_class, high_score
class AICodemaster(Codemaster):

    def __init__(self, brown_ic=None, glove_vecs=None, word_vectors=None):
        super().__init__()
        self.brown_ic = brown_ic
        self.glove_vecs = glove_vecs
        self.word_vectors = word_vectors
        self.wordnet_lemmatizer = WordNetLemmatizer()
        self.lancaster_stemmer = LancasterStemmer()
        self.cm_wordlist = []
        with open('players/cm_wordlist.txt') as infile:
            for line in infile:
                self.cm_wordlist.append(line.rstrip())

        self.bad_word_dists = None
        self.red_word_dists = None

    def set_game_state(self, words, maps):
        self.words = words
        self.maps = maps

    def get_clue(self):
        cos_dist = scipy.spatial.distance.cosine
        red_words = []
        bad_words = []

        # Creates Red-Labeled Word arrays, and everything-else arrays
        for i in range(25):
            if self.words[i][0] == '*':
                continue
            elif self.maps[i] == "Assassin" or self.maps[i] == "Blue" or self.maps[i] == "Civilian":
                bad_words.append(self.words[i].lower())
            else:
                red_words.append(self.words[i].lower())
        print("RED:\t", red_words)

        all_vectors = (self.glove_vecs,)
        bests = {}

        if not self.bad_word_dists:
            self.bad_word_dists = {}
            for word in bad_words:
                self.bad_word_dists[word] = {}
                for val in self.cm_wordlist:
                    b_dist = cos_dist(self.concatenate(val, all_vectors),
                                      self.concatenate(word, all_vectors))
                    self.bad_word_dists[word][val] = b_dist

            self.red_word_dists = {}
            for word in red_words:
                self.red_word_dists[word] = {}
                for val in self.cm_wordlist:
                    b_dist = cos_dist(self.concatenate(val, all_vectors),
                                      self.concatenate(word, all_vectors))
                    self.red_word_dists[word][val] = b_dist
        else:
            to_remove = set(self.bad_word_dists) - set(bad_words)
            for word in to_remove:
                del self.bad_word_dists[word]
            to_remove = set(self.red_word_dists) - set(red_words)
            for word in to_remove:
                del self.red_word_dists[word]

        for clue_num in range(1, 3 + 1):
            best_per_dist = np.inf
            best_per = ''
            best_red_word = ''
            for red_word in list(itertools.combinations(red_words, clue_num)):
                best_word = ''
                best_dist = np.inf
                for word in self.cm_wordlist:
                    if not self.arr_not_in_word(word, red_words + bad_words):
                        continue

                    bad_dist = np.inf
                    worst_bad = ''
                    for bad_word in self.bad_word_dists:
                        if self.bad_word_dists[bad_word][word] < bad_dist:
                            bad_dist = self.bad_word_dists[bad_word][word]
                            worst_bad = bad_word

                    worst_red = 0
                    for red in red_word:
                        dist = self.red_word_dists[red][word]
                        if dist > worst_red:
                            worst_red = dist

                    if worst_red < best_dist and worst_red < bad_dist:
                        best_dist = worst_red
                        best_word = word
                        # print(worst_red, red_word, word)

                if best_dist < best_per_dist:
                    best_per_dist = best_dist
                    best_per = best_word
                    best_red_word = red_word

            bests[clue_num] = (best_red_word, best_per, best_per_dist)

        print("BESTS: ", bests)
        li = []
        pi = []
        chosen_clue = bests[1]
        chosen_num = 1
        for clue_num, clue in bests.items():
            best_red_word, combined_clue, combined_score = clue
            worst = -np.inf
            best = np.inf
            worst_word = ''
            for word in best_red_word:
                dist = cos_dist(self.concatenate(word, all_vectors),
                                self.concatenate(combined_clue, all_vectors))
                if dist > worst:
                    worst_word = word
                    worst = dist
                if dist < best:
                    best = dist
            if worst < 0.3 and worst != -np.inf:
                print(worst, chosen_clue, chosen_num)
                chosen_clue = clue
                chosen_num = clue_num
            li.append((worst / best, best_red_word, worst_word, combined_clue,
                       combined_score, combined_score ** len(best_red_word)))

        if chosen_clue[2] == np.inf:
            chosen_clue = ('', li[0][3], 0)
            chosen_num = 1
        # print("The clue is: ", li[0][3])
        print('chosen_clue is:', chosen_clue)
        # return in array styled: ["clue", number]
        return chosen_clue[1], chosen_num  # [li[0][3], 1]

    def arr_not_in_word(self, word, arr):
        if word in arr:
            return False
        lemm = self.wordnet_lemmatizer.lemmatize(word)
        lancas = self.lancaster_stemmer.stem(word)
        for i in arr:
            if i == lemm or i == lancas:
                return False
            if i.find(word) != -1:
                return False
            if word.find(i) != -1:
                return False
        return True

    def combine(self, words, wordvecs):
        factor = 1.0 / float(len(words))
        new_word = self.concatenate(words[0], wordvecs) * factor
        for word in words[1:]:
            new_word += self.concatenate(word, wordvecs) * factor
        return new_word

    def concatenate(self, word, wordvecs):
        concatenated = wordvecs[0][word]
        for vec in wordvecs[1:]:
            concatenated = np.hstack((concatenated, vec[word]))
        return concatenated
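# A minimal sketch of what concatenate() and combine() compute, using made-up
# toy vectors (an illustration only, not part of the game framework): each
# vector source maps a word to a numpy array, concatenate() stacks one word's
# vectors side by side, and combine() averages the concatenations of several words.
import numpy as np

toy_vecs = ({'cat': np.array([1.0, 0.0]), 'dog': np.array([0.0, 1.0])},)
cat_vec = np.hstack([vec['cat'] for vec in toy_vecs])                        # like concatenate('cat', toy_vecs)
pair_vec = (cat_vec + np.hstack([vec['dog'] for vec in toy_vecs])) / 2.0     # like combine(['cat', 'dog'], toy_vecs)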
print("Categorias:" + str(categories)) for each_category in data.keys(): for each_sentence in data[each_category]: # remove any punctuation from the sentence each_sentence = remove_punctuation(each_sentence) print(each_sentence) # extract words from each sentence and append to the word list w = nltk.word_tokenize(each_sentence) print("tokenized words: ", w) words.extend(w) docs.append((w, each_category)) # stem and lower each word and remove duplicates words = [stemmer.stem(w.lower()) for w in words] words = sorted(list(set(words))) print(words) print("\n") print(docs) # create our training data training = [] output = [] # create an empty array for our output output_empty = [0] * len(categories) for doc in docs: # initialize our bag of words(bow) for each document in the list bow = []
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

input_words = [
    'writing', 'calves', 'be', 'branded', 'horse', 'randomize', 'possibly',
    'provision', 'hospital', 'kept', 'scratchy', 'code'
]

# Stemmer objects
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

# Create a list of stemmer names for display
stemmer_names = ['PORTER', 'LANCASTER', 'SNOWBALL']
formatted_text = '{:>16}' * (len(stemmer_names) + 1)
print(formatted_text.format('INPUT WORD', *stemmer_names))
print('=' * 75)

# Stem each word and display the output
for word in input_words:
    output = [
        word, porter.stem(word), lancaster.stem(word), snowball.stem(word)
    ]
    print(formatted_text.format(*output))
st = LancasterStemmer()

from nltk.stem import PorterStemmer
pt = PorterStemmer()

from nltk.stem.snowball import EnglishStemmer
sb = EnglishStemmer()

from nltk.stem.wordnet import WordNetLemmatizer
wn = WordNetLemmatizer()

## let's examine the word 'better'
st.stem('better')
pt.stem('better')
sb.stem('better')
wn.lemmatize('better', 'a')
wn.lemmatize('families', 'n')

## applying the Porter stemmer to the Gettysburg Address
text_5 = map(pt.stem, text_4)

## now creating a dictionary that will count the occurrence of the words
getty = {}
used = []
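## a minimal sketch of the counting step the comment above describes, assuming
## text_4 is the tokenized Gettysburg Address, getty maps each stem to its count,
## and used records the order of first appearance (illustration, not the original code)
for word in text_5:
    if word not in getty:
        getty[word] = 1
        used.append(word)
    else:
        getty[word] += 1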
class Corpus:
    def __init__(self, d):
        self.categories = None
        self.conversations = None
        for k, v in d.items():
            setattr(self, k, v)


data = []
for fn in listdir(PATH):
    with open(PATH + fn, 'r') as s:
        data.append(Corpus(yaml.safe_load(s)))

stemmer = LancasterStemmer()
clear_sentence = lambda sentence: ' '.join(
    [stemmer.stem(w) for w in nltk.word_tokenize(sentence)])

questions = []
classes = []
for item in data:
    cat = item.categories[0]
    for quest in item.conversations:
        for q in quest:
            questions.append(clear_sentence(q))
            classes.append(cat)

tfv = TfidfVectorizer(stop_words='english')
le = LabelEncoder()
X = tfv.fit_transform(questions)
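# A plausible next step (an assumption, not part of the original excerpt): the
# LabelEncoder created above is typically used to turn the class names into
# integer targets that pair with the TF-IDF matrix X.
y = le.fit_transform(classes)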
infile.close()
jsonList = json.loads(lines)

for tweet in jsonList:
    wordlist = clean_tweet(tweet)
    # print type(tweet)
    for word in wordlist:
        if len(word) == 1 or word in stopwords:
            continue
        if any(s in word.lower() for s in specialList):
            continue
        wordcloudlist += ' {}'.format(ls.stem(word))

# print text2
# out = text2.translate(string.maketrans("",""), string.punctuation)
# ls.stem(out)
# wnl.lemmatize(out)
# ss.stem(out)
# print out

# Generate a word cloud image
# wordcloud = WordCloud().generate(text)

# lower max_font_size
wordcloud1 = WordCloud(max_font_size=40).generate(wordcloudlist)

# Display the generated image:
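# A short sketch of the display step the trailing comment refers to, assuming
# matplotlib is used (an assumption; the original excerpt stops here).
import matplotlib.pyplot as plt
plt.imshow(wordcloud1, interpolation='bilinear')
plt.axis("off")
plt.show()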
docs_y = []

for intent in data["intents"]:  # loop over all question types
    # all patterns in a single question type, i.e. all the example sentences
    for pattern in intent["patterns"]:
        wrds = nltk.word_tokenize(pattern)  # split the sentence into word tokens
        words.extend(wrds)
        # docs_x holds every sentence as a list of words, i.e. a list of lists of words
        docs_x.append(wrds)
        docs_y.append(intent["tag"])

    if intent["tag"] not in labels:
        labels.append(intent["tag"])

# stemming collapses related word forms into a single root
words = [stemmer.stem(w.lower()) for w in words if w != "?"]
words = sorted(list(set(words)))  # set() removes duplicates

labels = sorted(labels)

training = []
output = []

out_empty = [0 for _ in range(len(labels))]

for x, doc in enumerate(docs_x):
    bag = []
    wrds = [stemmer.stem(w) for w in doc]
# sentiment analysis
import nltk

path = r'D:\projects\chat.txt'

from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

from trainsenti import training_data

corpus_words = {}
class_words = {}

classes = list(set([a['class'] for a in training_data]))
for c in classes:
    class_words[c] = []

for data in training_data:
    for word in nltk.word_tokenize(data['sentence']):
        if word not in ["?", "'s"]:
            stemmed_word = stemmer.stem(word.lower())
            if stemmed_word not in corpus_words:
                corpus_words[stemmed_word] = 1
            else:
                corpus_words[stemmed_word] += 1
            class_words[data['class']].extend([stemmed_word])


def calculate_class_score(sentence, class_name, show_details=True):
    score = 0
    for word in nltk.word_tokenize(sentence):
        if stemmer.stem(word.lower()) in class_words[class_name]:
            # weight rarer words more heavily (inverse corpus frequency)
            score += (1 / corpus_words[stemmer.stem(word.lower())])
    return score
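# A minimal usage sketch (illustrative, not from the original script): score a
# new sentence against every class built above and keep the highest-scoring one.
# The example sentence is made up.
sentence = "good day for us to have lunch?"
best_class, best_score = None, 0
for c in classes:
    s = calculate_class_score(sentence, c, show_details=False)
    if s > best_score:
        best_class, best_score = c, s
print(best_class, best_score)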
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

words = [
    'table', 'probably', 'wolves', 'playing', 'is', 'dog', 'the', 'beaches',
    'grounded', 'dreamt', 'envision'
]

# Compare different stemmers
stemmers = ['PORTER', 'LANCASTER', 'SNOWBALL']
stemmer_porter = PorterStemmer()
stemmer_lancaster = LancasterStemmer()
stemmer_snowball = SnowballStemmer('english')

formatted_row = '{:>16}' * (len(stemmers) + 1)
print(formatted_row.format('WORD', *stemmers))

for word in words:
    stemmed_words = [
        stemmer_porter.stem(word), stemmer_lancaster.stem(word),
        stemmer_snowball.stem(word)
    ]
    print(formatted_row.format(word, *stemmed_words))
for intent in data['intents']:
    for pattern in intent['patterns']:
        wrds = nltk.word_tokenize(pattern)
        words.extend(wrds)
        docs_x.append(wrds)
        docs_y.append(intent['tag'])

    if intent['tag'] not in labels:
        labels.append(intent['tag'])

words = [stemmer.stem(w.lower()) for w in words]
words = sorted(list(set(words)))

labels = sorted(labels)

training = []
output = []

# one slot per label for the one-hot output vector
out_empty = [0 for _ in range(len(labels))]

for x, doc in enumerate(docs_x):
    bag = []
    wrds = [stemmer.stem(w) for w in doc]
    for w in words:
        if w in wrds:
            bag.append(1)
        else:
            bag.append(0)

    # mark the position of this document's tag (one-hot encoding)
    output_row = out_empty[:]
    output_row[labels.index(docs_y[x])] = 1
def lancasterStem(features):
    lancasterStemmer = LancasterStemmer()
    return [lancasterStemmer.stem(feature) for feature in features]
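# A short usage sketch (illustrative, not part of the original): stem a small,
# made-up list of tokens with the helper above.
example_tokens = ['running', 'maximum', 'writing']
print(lancasterStem(example_tokens))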
words = []
labels = []
docs_x = []
docs_y = []

for intent in data["intents"]:
    for pattern in intent["patterns"]:
        wrds = nltk.word_tokenize(pattern)
        words.extend(wrds)
        docs_x.append(wrds)
        docs_y.append(intent["tag"])

    if intent["tag"] not in labels:
        labels.append(intent["tag"])

words = [stemmer.stem(w.lower()) for w in words if w != "?"]
words = sorted(list(set(words)))

labels = sorted(labels)

training = []
output = []

out_empty = [0 for _ in range(len(labels))]

for x, doc in enumerate(docs_x):
    bag = []
    wrds = [stemmer.stem(w.lower()) for w in doc]
    for w in words:
# scan through the document to take out patterns and tokenize them
# add all the tokenized words to a single list
# split the patterns and tags (data_x and data_y)
# take out distinct tags (labels)
# a lexicon is created (words)
for intent in data["intents"]:
    for pattern in intent["patterns"]:
        wrds = nltk.word_tokenize(pattern)
        words.extend(wrds)
        data_x.append(wrds)
        data_y.append(intent["tag"])

    if intent["tag"] not in labels:
        labels.append(intent["tag"])

words = [stemmer.stem(w) for w in words if w != "?"]
words = sorted(list(set(words)))

labels = sorted(labels)

training = []
output = []

out_empty = [0 for _ in range(len(labels))]

for x, doc in enumerate(data_x):
    bag = []
    wrds = [stemmer.stem(w) for w in doc]
    for w in words:
        if w in wrds:
            bag.append(1)
        else:
            bag.append(0)
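    # A sketch of the likely continuation (an assumption, not from the original
    # excerpt): one-hot encode this document's tag, the same layout used in the
    # other intent-training snippets above, and collect the rows.
    output_row = out_empty[:]
    output_row[labels.index(data_y[x])] = 1

    training.append(bag)
    output.append(output_row)

# training and output can then be converted to arrays for whatever model follows,
# e.g. numpy.array(training) and numpy.array(output).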