Example #1
def simplify_old(s):
    res = ''
    st = LancasterStemmer()

    text = nltk.word_tokenize(s)
    tags = nltk.pos_tag(text)

    for tag in tags:
        word = tag[0]
        if f.checkPos(tag[1]):
            if word in model:
                word_stem = st.stem(word)
                top_words = model.most_similar(positive=[word], topn = 20)
                candidate_list = [w[0] for w in top_words]
                freq_list = [fdist[w] for w in candidate_list]
                c_f_list = zip(candidate_list, freq_list)
                ordered_list = sorted(c_f_list, key=lambda pair: pair[1], reverse=True)
                word_freq = fdist[word]
                #  synonmys = f.getSynonmys(word)  ## get synonyms from WordNet
                # print synonmys
                for w in ordered_list:
                    if not f.freq_diff(word_freq, w[1]):  ## stop if the candidate's frequency does not exceed the word's frequency by the threshold
                        break
                    if st.stem(w[0]) != word_stem and f.samePos(word, w[0]):  ## exclude morphological derivations and require the same POS
                        word = w[0]  ### do not use WordNet
        # if w[0] in synonmys:
        # 	word = w[0]
        # else:
        # 	for syn in synonmys:
        # 		if st.stem(w[0]) == st.stem(syn):
        # 			word = w[0]

        res = res + word + ' '
    return res
def getstems(words):
    l = LancasterStemmer()
    stems = {}
    for word in words:
        if word in dicts.irregforms:
            stems[word] = l.stem(dicts.irregforms[word])
        else:
            stems[word] = l.stem(word)
    return stems                
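Both helpers above rely on module-level objects defined elsewhere: model (a word-vector model), fdist (a word-frequency distribution), and the project-specific helper modules f and dicts. A minimal, hypothetical setup sketch for the two standard pieces, assuming gensim word vectors and an NLTK FreqDist:

# Hypothetical setup for the globals used above; the helper modules f and
# dicts are project-specific and are not reconstructed here.
from nltk import FreqDist
from nltk.corpus import brown
from gensim.models import KeyedVectors

# Word frequencies taken from a reference corpus (assumption: Brown corpus).
fdist = FreqDist(w.lower() for w in brown.words())

# Pre-trained word vectors; the file name is a placeholder.
model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin",
                                          binary=True)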
Example #3
def filter_pos(text):
    st = LancasterStemmer()
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    nouns = list()
    verbs = list()
    for (word, tag) in tagged:
        if tag.startswith('N'):
            nouns.append(st.stem(word))
        elif tag.startswith('V'):
            verbs.append(st.stem(word))
    return nouns, verbs
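A hypothetical usage sketch for filter_pos (assumes the punkt and averaged_perceptron_tagger NLTK data packages are installed):

# Hypothetical usage: split a sentence into Lancaster-stemmed nouns and verbs.
nouns, verbs = filter_pos("The striped bats were hanging on their feet and eating insects.")
print(nouns)   # stemmed noun forms, e.g. of 'bats', 'feet', 'insects'
print(verbs)   # stemmed verb forms, e.g. of 'were', 'hanging', 'eating'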
Example #4
def mapper(shard, doc_counter):
    st = LancasterStemmer()
    with open(shard, "r") as f:
        ohsu = json.JSONDecoder().decode(f.read())
        output_values = []
        doc_counter.add(len(ohsu))
        for article in ohsu:
            output_values += [(w, (article[".I"], 'a')) for w in article[".A"]]
            output_values += [(st.stem(w), (article[".I"], 't')) for w in alphabet.findall(article[".T"].lower())]
            if article.get('.W') is not None:
                body_words = (w for w in alphabet.findall(article[".W"].lower()))
                output_values += [(st.stem(w), (article[".I"], 'w')) for w in body_words]
    return output_values
Example #5
def poss_train(train_file,train_write,sw_file):
    """
    
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(train_file)
    reader = csv.reader(f)

    t = open(train_write,"w")

    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    
    #stopwords = sw  # use nltk stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    print "停顿词表长度",len(stopwords)
    stopwords = set(stopwords)

    g = lambda x : x not in stopwords
    
    for row in reader:
        if a%10000 == 0:
            print a    
        a += 1
        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        
        #word tokenize
        pattern = r"([a-z])\w+"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        
        #remove stopwords
        body = filter(g,body)
        title = filter(g,title)

        #light stem
        st = LancasterStemmer()
        title = set([st.stem(word) for word in title])
        body = set(body)
        body = set([st.stem(word) for word in body])

        # list to string
        body = ' '.join(body)
        title = ' '.join(title)
        t.write('"%s","%s","%s","%s"\n'%(row[0], title,body,row[3]))
Example #6
def remove_stems(file):
    new_file = []
    punctuation = re.compile(r'[.,"?!:;]')
    lemmatizer = WordNetLemmatizer()
    stemmer = LancasterStemmer()

    for raw_post in file:
        post = raw_post[1]
        token = nltk.word_tokenize(post)
        token_tags = nltk.pos_tag(token)

        new_token = []
        for word in token_tags:
            # Remove punctuation and lowercase the word
            original_word = punctuation.sub("", word[0].lower())

            # Reduce each word to its root: lemmatize first, and Lancaster-stem only if the lemmatizer made no change
            stemmed_word = lemmatizer.lemmatize(original_word)
            if original_word == stemmed_word:
                stemmed_word = stemmer.stem(stemmed_word)

            # Removes stopwords that are defined in the nltk library
            if stemmed_word not in nltk.corpus.stopwords.words('english') and stemmed_word != '':
                new_token.append((stemmed_word, word[1]))

        new_file.append((raw_post[0], new_token))
    return new_file
    def train_lsi_model(self, texts, num_of_topics=10):
        texts_tokenized = [[word.lower()
                          for word in word_tokenize(text)]
                          for text in texts]
        # remove the stop words and punctuations
        english_stop_words = stopwords.words('english')
        english_punctuations = [',', '.', ':', '?', '(', ')', '[',
                                ']', '@', '&', '!', '*', '#', '$', '%']
        texts_filtered = [[word for word in text_tokenized
                         if (not word in english_punctuations) and
                         (not word in english_stop_words)]
                         for text_tokenized in texts_tokenized]
        # stem the words
        st = LancasterStemmer()
        texts_stemmed = [[st.stem(word) for word in text_filtered]
                         for text_filtered in texts_filtered]

        all_stems = sum(texts_stemmed, [])
        stem_once = set(stem for stem in set(all_stems)
                        if all_stems.count(stem) == 1)
        cleaned_texts = [[stem for stem in text if stem not in stem_once]
                         for text in texts_stemmed]

        dictionary = corpora.Dictionary(cleaned_texts)
        corpus = [dictionary.doc2bow(text) for text in cleaned_texts]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary,
                              num_topics=num_of_topics)
        result = lsi[corpus]
        return result
Example #8
def prepare_corpus(raw_documents):
    # remove punctuation
    print "Removing Punctuation"
    import string
    exclude = set(string.punctuation)
    raw_documents = [''.join(ch for ch in s if ch not in exclude) for s in raw_documents]

    # remove common words
    print "Calculating Stoplist"
    stoplist = set([x.rstrip() for x in codecs.open("stop_list.txt", encoding='utf-8') if not x.startswith("#")])
    stoplist = stoplist.union(set(nltk.corpus.stopwords.words("english")))
    # print stoplist

    print "Removing Stoplist and Stemming"

    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()

    texts = [[st.stem(word) for word in document.lower().split() if word not in stoplist]
             for document in raw_documents]

    # remove words that appear only once
    print "Removing Single Variables"
    all_tokens = sum(texts, [])
    tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once]
             for text in texts]

    return texts
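A hypothetical usage sketch for prepare_corpus; it assumes a stop_list.txt file (one stop word per line, lines starting with '#' ignored) sits next to the script, as the code above expects:

# Hypothetical usage; stop_list.txt must exist as described above.
raw_documents = ["Human machine interface for lab abc computer applications",
                 "A survey of user opinion of computer system response time"]
texts = prepare_corpus(raw_documents)   # lists of stems with singleton words removed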
def lemmstem(sentences):
    ''' This function performs the lemmatization and stemming of the words.
        Input: a list of trees containing the sentences, where every
               word is classified by its NE type.
        Output: the same trees with lemmatized/stemmed leaves.
    '''
    
    lmtzr = WordNetLemmatizer()
    st = LancasterStemmer()
    
    dic = {'VB' :wordnet.VERB,
            'NN': wordnet.NOUN,
            'JJ':wordnet.ADJ,
            'RB':wordnet.ADV }
    
    for sent in sentences:
      
        lvsidx=sent.treepositions('leaves') 
       
        for pos in lvsidx:
            word=sent[pos][0]
            tag = sent[pos][1]
            rtag = tag[0:2]
            if rtag in dic:
                lemm=lmtzr.lemmatize( word, dic[rtag] )
                stem=st.stem(lemm)
                #print word, lemm, stem  # cursed line (debug)
                sent[pos]=(word, tag, stem)
            else:
                sent[pos]=(word, tag, word)
    
    return sentences
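A minimal usage sketch for lemmstem, assuming the input trees come from NLTK's ne_chunk over POS-tagged tokens (so each leaf is a (word, tag) pair, as the function expects); requires the punkt, averaged_perceptron_tagger, maxent_ne_chunker and words NLTK data:

# Hypothetical usage: NE-chunk a sentence, then lemmatize/stem its leaves.
from nltk import word_tokenize, pos_tag, ne_chunk

sentences = [ne_chunk(pos_tag(word_tokenize("The cats were chasing mice in London.")))]
for tree in lemmstem(sentences):
    print(tree.leaves())   # each leaf becomes (word, tag, stem)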
def word_stem_example(word="Amevive"):
    """
    [EN]Read: http://www.nltk.org/book/ch03.html, 3.6 Normalizing Text
    [CN]根据NLTK in python书中的推荐Porter算法较为鲁棒, 推荐使用
    """
    stemmer = LancasterStemmer()
    print("Lancaster [%s => %s]" % (word, stemmer.stem(word)))
    
    stemmer = PorterStemmer() # <=== recommended algorithm
    print("Porter [%s => %s]" % (word, stemmer.stem(word)))
    
    stemmer = RegexpStemmer('ing$|s$|e$', min=4)
    print("Regexp [%s => %s]" % (word, stemmer.stem(word)))
    
    stemmer = SnowballStemmer('english') # Choose a language
    print("Snowball [%s => %s]" % (word, stemmer.stem(word)))
def process(reviews):
	#tokenize each review
	from nltk.tokenize import word_tokenize
	review_tokenized = [[word.lower() for word in word_tokenize(review.decode('utf-8'))] for review in reviews]

	#remove stop words
	from nltk.corpus import stopwords
	english_stopwords = stopwords.words('english')

	review_filterd_stopwords = [[word for word in review if not word in english_stopwords] for review in review_tokenized]

	#remove punctuations
	english_punctuations = [',','.','...', ':',';','?','(',')','&','!','@','#','$','%']
	review_filtered = [[word for word in review if not word in english_punctuations] for review in review_filterd_stopwords]

	#stemming
	from nltk.stem.lancaster import LancasterStemmer
	st = LancasterStemmer()
	review_stemmed = [[st.stem(word) for word in review] for review in review_filtered]

	#remove words that appear only once
	all_stems = sum(review_stemmed, [])
	stems_once = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
	final_review = [[stem for stem in text if stem not in stems_once] for text in review_stemmed]

	return final_review
class Stemmer():

    def __init__(self):
        self.stemmer = LancasterStemmer()

    def stem(self, word_to_be_stemmed):
        return self.stemmer.stem(word_to_be_stemmed)
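A minimal usage sketch of the wrapper class above:

# Hypothetical usage of the Stemmer wrapper.
stemmer = Stemmer()
print(stemmer.stem("running"))   # Lancaster stem, e.g. 'run'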
Example #13
def predict_category_subcategory(book_name):
	data_set1 = pandas.Series(book_name.encode('ascii'))

    #Data Preprocessing
	data_set1 = data_set1.dropna(axis=0,how='any')
	data_set1 = data_set1.str.lower()

    #Manual removal List
	remove_list = ['edition','ed','edn', 'vol' , 'vol.' , '-' ,'i']


	data_set1[0] =' '.join([i for i in data_set1[0].split() if i not in remove_list])

	data_set1 = data_set1.apply(lambda x :re.sub(r'\w*\d\w*', '', x).strip())
	data_set1 = data_set1.apply(lambda x :re.sub(r'\([^)]*\)', ' ', x))
	data_set1 = data_set1.apply(lambda x :re.sub('[^A-Za-z0-9]+', ' ', x))
    #data_set['Category ID'] = data_set['Category ID']+"|"+data_set['Subcategory ID']


    #Stemming the book titles
	stemmer = LancasterStemmer()
	data_set1[0]=" ".join([stemmer.stem(i) for i in  data_set1[0].split()])

	clf = joblib.load(os.path.join(BASE_DIR+"/learners/",'category_predict.pkl'))
	ans = clf.predict(data_set1)
	sub_clf = joblib.load(os.path.join(BASE_DIR+"/learners/",'subcategory_predict.pkl'))
	sub_ans = sub_clf.predict(data_set1)
	return [ans[0],sub_ans[0]]
def tweetTokenizer(tweet_text):
	st = LancasterStemmer()	
	twitterWords = tweet_text.split()

	#remove stop words using NLTK corpus
	twitterWords = [word.lower() for word in twitterWords]
	twitterWords = [w for w in twitterWords if not w in stopwords.words('english')]

	#remove custom list of stop words found through experimentation
	noiseWords = ["i'm", "like", "get", "don't", "it's", "go", "lol", "got",
		"one", "know", "@", "good", "want", "can't", "need", "see",
		"people", "going", "back", "really", "u", "think", "right",
		"never", "day", "time", "never", "that's", "even", ",", ".",
		"make", "wanna", "you're", "come", "-", "still", "much", "someone",
		"today", "gonna", "new", "would", "take", "always", "im", "i'll",
		"best", "'", "feel", "getting", "say", "tonight", "last", "ever",
		"better", "i've", "look", "f*****g", "way", "could", "!", "oh",
		"tomorrow", "night", "first", "miss", "ain't", "thank", "2", "bad",
		"little", "thanks", "something", "wait", "&amp;", "`", "oh", "make",
		"bad", "let", "stop", "well", "tell"]

	twitterWords = [w for w in twitterWords if not w in noiseWords]
	twitterWords = [st.stem(w) for w in twitterWords]

	return twitterWords
Example #15
def word_standardize(sentences): 	
    tokens = []
    sentences_st = []

    for sent in sentences:
        tokens.extend(word_tokenize(sent))
        sentences_st.append(word_tokenize(sent))
	
    words = tokens
    
    st = LancasterStemmer()

    words = [w.lower() for w in words]
    words = [w for w in words if not w in stopwords.words('english')]
    words = [w for w in words if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
    st_words = [st.stem(w) for w in words]

    sent_result = []
    for sent in sentences_st:
        sent = [w.lower() for w in sent]
        sent = [w for w in sent if not w in stopwords.words('english')]
        sent = [w for w in sent if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
        sent_result.append(sent)

    return st_words, sent_result
Example #16
def score_sentence(sentence, weights, stop_words):
	"""
	Parameters weights: Counter, sentence: string
	#I NEED SKIES DOCUMENTATION
	"""
	lemmatizer = WordNetLemmatizer()
	stemmer = LancasterStemmer()
	sentence = strip_punc(sentence)
	tokens = word_tokenize(sentence)
	score = 0
	for token in tokens:
		root = stemmer.stem(lemmatizer.lemmatize(token))
		if token not in stop_words and root not in stop_words:
			score += weights[root]
	return score
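score_sentence depends on a strip_punc helper that is not shown in this listing; a plausible minimal sketch (an assumption, not the original implementation; Python 3 only):

import string

def strip_punc(text):
    # Assumed helper: drop ASCII punctuation before tokenizing.
    return text.translate(str.maketrans('', '', string.punctuation))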
Example #17
	def parse_raw_data(self, new_art):
		self.startClass=default_timer()
		tokenizer = RegexpTokenizer(r'\w+')
		tokens = tokenizer.tokenize(new_art.body)
		stemmer = LancasterStemmer()
		article_dic = new_art.words
		global_dic = self.raw_dictionary

		for word in tokens:
			word = word.lower()
			if not self.is_stop_word(word) and not word.isnumeric():
				s_word = stemmer.stem(word)

			#	s_word = word
			## it is not a stop word, check if the word
			## is already part of the article dictionary.
			## if yes, increment the count else add it.
			## If you are adding check if it is part of
			## the big corpus, if yes increment the count
			## of number of articles with that word.
				self.globalWordCount+=1
				new_art.doc_len = new_art.doc_len + 1
				if(s_word in article_dic):
					article_dic[s_word].wrd_count+=1
					global_dic[s_word].wrd_count+=1
				else:
					article_dic[s_word] = local_word_attributes(1)

					if (s_word in global_dic):
						global_dic[s_word].art_count+=1
						global_dic[s_word].wrd_count+=1
					else:
						global_dic[s_word] = global_word_attributes(1,1, 1, 0)
def preprocess(reviews):
	import nltk
	from nltk.tokenize import word_tokenize

	review_tokenized = [[word.lower() for word in word_tokenize(review.decode('utf-8'))] for review in reviews] 
	#print "review tokenize done"

	#remove stop words
	from nltk.corpus import stopwords
	english_stopwords = stopwords.words('english')
	review_filterd_stopwords = [[word for word in review if not word in english_stopwords] for review in review_tokenized]
	#print 'remove stop words done'

	#remove punctuations
	english_punctuations = [',','.',':',';','?','(',')','&','!','@','#','$','%']
	review_filtered = [[word for word in review if not word in english_punctuations] for review in review_filterd_stopwords]
	#print 'remove punctuations done'

	#stemming
	from nltk.stem.lancaster import LancasterStemmer
	st = LancasterStemmer()
	review_stemmed = [[st.stem(word) for word in review] for review in review_filtered]
	#print 'stemming done'

	return review_stemmed
Example #19
def stemming(words):
    wordsAfterStemming=[]
    st=LancasterStemmer()
    for x in words:
        y=st.stem(x)
        wordsAfterStemming.append(y)
    return wordsAfterStemming
def lemmatizer_newsheadlines() :
    lancaster_stemmer = LancasterStemmer()
    frl=open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/lemma1.csv","rU")
    fr=open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/sample.csv","rU")
    fw=open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/lemmaheadlines.csv","w")
    for headline in fr:
        if len(headline)>0:
          headlinelist=headline.split(",")
        
          if len(headlinelist)==3:
            headlinewords=headlinelist[1].split(" ")
            print(headlinewords)
            for word in headlinewords:
              wordcor=(((word.replace("?","")).replace(":","")).replace("\"",""))    
               
              headlineword=(lancaster_stemmer.stem(wordcor)).lower()
              print(headlineword) 
     #         for line in frl:
      #          crimelist=line.split(",")
       #         crimeword=((crimelist[1].replace("\"","")).strip()).lower()
               
        #        print(crimeword+str(i))
         #       i+=1
              dictcrime=lemmadict()
              if headlineword in dictcrime:
                  print(headlineword+"yipee")
                  fw.write(headlineword+","+headlinelist[0]+","+headlinelist[1]+"\n")
                                    
                  break
    frl.close()     
    fw.close()
    fr.close()
def stem_text(text):
    stm = LancasterStemmer()
    tokens = text.split()
    words = [stm.stem(w) for w in tokens]
    snt = " ".join(words)

    return snt
def readText(textFile):			
	examples = []
	count = 0
	lexicon_en = {}
	lexicon_ge = {}
	stem_en = LancasterStemmer()
	stem_ge = nltk.stem.snowball.GermanStemmer()
	for line in open(textFile):
		count+=1
		if count % 1000 == 0:
			print count
		lans = line.lower().strip().split("|||")
		#german = [stem_ge.stem(x.decode('utf-8')) for x in lans[0].strip().split(" ")]
		german = lans[0].strip().split(" ")
		german = process(german)
		for wordx in german:
			for word in wordx:
				if word not in lexicon_ge:
					lexicon_ge[word]=1
				else:
					lexicon_ge[word]+=1
		eng = [stem_en.stem(x.decode('utf-8')) for x in lans[1].strip().split(" ")]
		#parse_en = pattern.en.parse(" ".join(eng))
		eng = lans[1].strip().split(" ")
		for word in eng:
			if word not in lexicon_en:
				lexicon_en[word]=1
			else:
				lexicon_en[word]+=1
		examples.append(Example(german,eng))
	return examples, lexicon_en, lexicon_ge
	def filt(string):

		ret = string

		#	Filter all punctuation from string
		for p in punctuation:
			ret = ret.replace(p, '')

		#	Replace hyphens with spaces
		ret = ret.replace('-', ' ')
		oldret = ret
		ret = ""

		#	Filter all stop words from string
		for word in oldret.split():
			if (word in allStopWords) or len (word) <= 1:
				pass
			else:
				ret += word.lower() +  " "

		st = LancasterStemmer()
		stemmed = ""

		for word in ret.split():
			try:
				stemmed += str(st.stem(word)) + " "

			except UnicodeDecodeError:
				pass

		return stemmed
Example #24
class VocKeyworder(BaseKeyworder):
    def __init__(self):
        super(VocKeyworder, self).__init__()
        self._vocs = engvoc.voc2000
        self._lemmatizer = WordNetLemmatizer()
        self._stemmer1 = LancasterStemmer()
        self._stemmer2 = SnowballStemmer('english')

    def add_keyword(self, gag_id, title):
        tokens = re.split(' |\.|,|;|=', title)
        for token in tokens:
            token = re.sub(r"\W+$", '', token)
            token = re.sub(r"^\W+", '', token)
            vocs = []
            try:
                token = token.encode('utf8')
                vocs.append(re.sub(r"'\w+", '', token).lower())
                vocs.append(self._lemmatizer.lemmatize(vocs[0]))
                vocs.append(self._stemmer1.stem(vocs[0]))
                vocs.append(self._stemmer2.stem(vocs[0]))
            except UnicodeDecodeError:
                continue
            if vocs[0] == '':
                continue
            try:
                float(vocs[0])
                continue
            except ValueError:
                pass
            if not any([voc in self._vocs for voc in vocs]):
                print 'voc', vocs, token
                self._add_keyword(gag_id, token)
Example #25
def mapper():

    #list of fields in positional order expected in inbound
    #forum node data.
    fieldnames = ['id', 'title', 'tag_names', 'author_id', 'body',
                    'node_type', 'parent_id', 'abs_parent_id', 
                    'added_at', 'score', 'state_string', 'last_edited_id',
                    'last_activity_by_id', 'last_activity_at', 
                    'active_revision_id', 'extra', 'extra_ref_id',
                    'extra_count', 'marked']

    reader = csv.DictReader(sys.stdin, delimiter='\t', fieldnames=fieldnames)
    stemmer = LancasterStemmer()
    stopw = stopwords.words('english')

    split_pattern = re.compile('[\W.!?:;"()<>[\]#$=\-/]')
    for line in reader:        
        
        pid = line['id']
        body = line['body']
        
        # split body into words
        words = split_pattern.split(body)
     
        # map the stemmer function across all the words.
        # and use the Counter to create a dict
        # of counted stems. Remove english stopwords.
        stem_counts = Counter((stemmer.stem(x) for x in words  if x not in stopw))        
        
        # emit the stem, count and node id
        # for reduction into the reverse index
        for stem, count in stem_counts.items():
        	print "{stem}\t{node_id}\t{count}".format(stem=stem, node_id=pid, count=count)
class Stemmer:
    def __init__(self):
        self.st = LancasterStemmer()
        self.stop = stopwords.words('english')

    #Provides list of stem words given a line
    def getStemmedCorpus(self, line):
        stemWords = list()
        data = line.strip().split(',')
        if len(data) < 2:
            return None
        stri = ' '.join(e for e in data[1].split(" ") if e.isalnum())
        for i in stri.split(" "):
            if self.st.stem(i) not in self.stop:
                stemWords.append(self.st.stem(i))
        return stemWords
Example #27
def overlapping_text(text_1, text_2):
	st = LancasterStemmer()
	cachedStopWords = get_stopwords()
	text_1_list = ([st.stem(word) for word in text_1.split() if word not in cachedStopWords])
	text_2_list = ([st.stem(word) for word in text_2.split() if word not in cachedStopWords])
	return jaccard_dist(text_1_list, text_2_list)
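overlapping_text relies on get_stopwords and jaccard_dist helpers defined elsewhere. A plausible sketch, under the assumption that jaccard_dist is the standard Jaccard distance over the two stem lists:

from nltk.corpus import stopwords

def get_stopwords():
    # Assumed helper: the NLTK English stop words as a set.
    return set(stopwords.words('english'))

def jaccard_dist(tokens_1, tokens_2):
    # Assumed helper: 1 - |intersection| / |union| of the unique stems.
    set_1, set_2 = set(tokens_1), set(tokens_2)
    if not set_1 and not set_2:
        return 0.0
    return 1.0 - len(set_1 & set_2) / float(len(set_1 | set_2))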
class LemmaTokenizer(object):
	def __init__(self):
		#self.wnl = WordNetLemmatizer()
		self.stemmer = LancasterStemmer()
	def __call__(self, doc):
		#return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if re.match(r'[a-z]+', t, re.M|re.I)]
		return [self.stemmer.stem(t) for t in word_tokenize(doc) if re.match(r'[a-z]+', t, re.M|re.I)]
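LemmaTokenizer follows the callable-tokenizer pattern accepted by scikit-learn vectorizers; a hypothetical usage sketch:

# Hypothetical usage with a scikit-learn vectorizer.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=LemmaTokenizer())
X = vectorizer.fit_transform(["The cats were running", "A cat runs daily"])
print(sorted(vectorizer.vocabulary_))   # Lancaster stems used as feature names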
Example #29
def preprocess(content):
	stopset = set(stopwords.words('english'))
	#replace punctuation and tag with space
	tokens = word_tokenize(re.sub(r'<p>|</p>|[^A-Za-z ]', ' ', content.lower())) 
	pos_list = pos_tag(tokens)
	s_tokens = list()

	#noun and verb only
	for pos in pos_list:
		#print pos[1]
		#if pos[1] in ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
		if pos[1] in ['NN', 'NNS']:
			s_tokens.append(pos[0])

	wordfreq = FreqDist(s_tokens)
	stemfreq = dict()
	st = LancasterStemmer()
	for word, freq in wordfreq.items():
		#stopwords
		if word in stopset:
			del wordfreq[word]
			continue
		#tiny words
		if len(word) <= 2:
			del wordfreq[word]
			continue
		#stemmer
		stem = st.stem(word)
		try:
			stemfreq[stem]+=freq
		except KeyError:
			stemfreq[stem]=freq
	return stemfreq
def get_pretrained_vector(session, word2vec_model, vocab_path, vocab_size, vectors):
    print(vectors)
    with gfile.GFile(vocab_path, mode="r") as vocab_file:
        st = LancasterStemmer()
        counter = 0
        counter_w2v = 0.0
        while counter < vocab_size:
            vocab_w = vocab_file.readline().replace("\n", "")

            # vocab_w = st.stem(vocab_w)
            # for each word in vocabulary check if w2v vector exist and inject.
            # otherwise dont change value initialise randomly.
            if word2vec_model and vocab_w and word2vec_model.__contains__(vocab_w) and counter > 3:
                w2w_word_vector = word2vec_model.get_vector(vocab_w)
                print("word:%s c:%i w2v size %i" % (vocab_w, counter, w2w_word_vector.size))
                vectors[counter] = w2w_word_vector
                counter_w2v += 1
            else:
                vocab_w_st = st.stem(vocab_w)
                if word2vec_model and vocab_w_st and word2vec_model.__contains__(vocab_w_st):
                    w2w_word_vector = word2vec_model.get_vector(vocab_w_st)
                    print("st_word:%s c:%i w2v size %i" % (vocab_w_st, counter, w2w_word_vector.size))
                    vectors[counter] = w2w_word_vector
                    counter_w2v += 1
                else:
                    if not vocab_w:
                        print("no more words.")
                        break

            counter += 1
        print("injected %f per cent" % (100 * counter_w2v / counter))
        print(vectors)
    return vectors
Example #31
    def get_model():
        '''
        Train and save the model if the intents file has been modified;
        otherwise just load the saved model and return it.

        INPUT:
            None

        OUTPUT:
            1. model (tflearn DNN model)
            2. words (list): stemmed vocabulary
            3. labels (list): sorted intent tags
        '''
        #call the method to check if the file was modified
        mod_val = Model.__check_file()

        if mod_val == "modified":
            #initialize stemmer
            stemmer = LancasterStemmer()
            #load json file
            with open("data/intents.json") as file:
                data = json.load(file)
            
            #lists to store values
            words = []
            labels = []
            docs_x = []
            docs_y = []

            #stemming
            for intent in data['intents']:
                for pattern in intent['patterns']:
                    #tokenize
                    wrds = nltk.word_tokenize(pattern)
                    words.extend(wrds)
                    docs_x.append(wrds)
                    docs_y.append(intent['tag'])

                if intent['tag'] not in labels:
                    labels.append(intent['tag'])
            
            #save the stemmed words and associated labels
            words = [stemmer.stem(w.lower()) for w in words if w != "?"]
            words = sorted(list(set(words)))
            labels = sorted(labels)

            #create bag of words
            training = []
            output = []
            #list with initial 0s
            out_empty = [0 for _ in range(len(labels))]
            #loop through doc_X
            for x, doc in enumerate(docs_x):
                bag = []
                
                wrds = [stemmer.stem(w) for w in doc]

                for w in words:
                    if w in wrds:
                        bag.append(1)
                    else:
                        bag.append(0)

                out_row = out_empty[:]
                out_row[labels.index(docs_y[x])] = 1

                training.append(bag)
                output.append(out_row)

            training = np.array(training)
            output = np.array(output)

            #save the files
            with open("data/model_data.pkl", "wb") as f:
                pickle.dump((words, labels, training, output), f)

        #train the model
        #get the dnn files
        words, labels, training, output = Model.__dnn_files()
        #clear the default graph stack and reset the global default graph
        tf.compat.v1.reset_default_graph()
        #layers
        net = tflearn.input_data(shape = [None, len(training[0])])
        net = tflearn.fully_connected(net, 8)
        net = tflearn.fully_connected(net, 8)
        net = tflearn.fully_connected(net, len(output[0]), activation = "softmax")
        net = tflearn.regression(net)
        #DNN Model
        model = tflearn.models.dnn.DNN(net)

        #train and save the model if the file is modified
        if mod_val == "modified":
            model.fit(training, output, batch_size = 8, n_epoch = 1000, show_metric = True)
            #save the model
            model.save("model/model.tflearn")

        #load the model
        model.load("model/model.tflearn")

        return model, words, labels
def test_view(db, view, options):
    #Getting the clusters data
    view_id = options.viewID
    query_id = options.queryID
    bu_id = options.bu

    collection = db[settings.get('Potential_CFD', 'proj_cluster')]
    cursor = collection.find({})
    clusters = pd.DataFrame(list(cursor))
    project_clusters = []
    groups = clusters.groupby('Cluster')

    for name, group in groups:
        project_clusters.append(list(group['Project']))

    print(project_clusters)

    #Fetch the data from the respective collection
    if (view):
        vi_col_name = settings.get(
            'Potential_CFD', 'viewPrefix') + str(view_id) + '_' + str(query_id)
        tr_col_name = settings.get('Potential_CFD', 'trainPrefix')

    else:
        vi_col_name = settings.get(
            'Potential_CFD', 'viewPrefix') + str(bu_id) + '_' + str(query_id)
        tr_col_name = settings.get('Potential_CFD', 'trainPrefix')

    collection = db[vi_col_name]
    print(vi_col_name)
    cursor = collection.find({})
    test_df = pd.DataFrame(list(cursor))
    if (test_df.shape[0] == 0):
        return

    if (options.cfd == "N"):
        test_df = test_df[test_df['CFD_INDIC'] == 0]

    if (test_df.shape[0] == 0):
        return

    req_cluster = list(test_df['PROJECT'].unique())
    print(req_cluster)
    if (len(req_cluster) > 2):
        req_cluster = test_df['PROJECT'].value_counts().nlargest(
            1).index.tolist()
    print(req_cluster)
    print(test_df.shape[0])

    #Get the cluster number if it exists, else create new cluster
    status = False
    for a in [
            'CSC.sys-doc', 'CSC.autons', 'CSC.asics', 'CSC.hw', 'CSC.general',
            'CSC.voice'
    ]:
        if a in req_cluster:
            req_cluster.remove(a)

    if req_cluster in project_clusters:
        status = True

    p = 0
    cluster_id = 0
    f_c = []
    for cluster in project_clusters:
        p = p + 1
        if set(req_cluster).issubset(cluster):
            cluster_id = p
            f_c = cluster
            status = True

    te_col_name = settings.get('Potential_CFD', 'testPrefix') + str(cluster_id)
    #status = True
    #cluster_id = 3
    print(cluster_id)
    print(status)

    if (status == True):
        #Fetching the cut_off
        print("In test_view printing cutoff" + str(options.cutoff))
        if (options.cutoff):
            cut_off = float(options.cutoff)
        else:
            collection = db[settings.get('Potential_CFD', 'testPrefix') +
                            str(cluster_id)]
            cursor = collection.find({})
            df = pd.DataFrame(list(cursor))
            fpr, tpr, thresholds = roc_curve(df['IFD_CFD_INDIC'],
                                             df['Final_prediction'])
            roc_auc = auc(fpr, tpr)
            i = np.arange(len(tpr))
            roc = pd.DataFrame({
                'fpr': pd.Series(fpr, index=i),
                'tpr': pd.Series(tpr, index=i),
                '1-fpr': pd.Series(1 - fpr, index=i),
                'tf': pd.Series(tpr - (1 - fpr), index=i),
                'thresholds': pd.Series(thresholds, index=i)
            })
            r = roc.ix[(roc.tf - 0).abs().argsort()[:1]]
            cut_off = list(r['thresholds'])[0] / 100
        print(cut_off)

        # if(options.cutOff != ""):
        #     cut_off = int(options.cutOff)/100
        # #cut_off = 0.5
        #print(cut_off)
        #del[df]

        #Get all the saved model paths
        model1 = str(settings.get("Potential_CFD", "temp_path_mod_potCFD")
                     ) + '/cluster' + str(cluster_id) + '_' + str(
                         settings.get("Potential_CFD", "xgboost_model"))
        model2 = str(settings.get("Potential_CFD", "temp_path_mod_potCFD")
                     ) + '/cluster' + str(cluster_id) + '_' + str(
                         settings.get("Potential_CFD", "cnn_lstm_model"))
        model3 = str(settings.get(
            "Potential_CFD", "temp_path_mod_potCFD")) + '/' + str(
                settings.get(
                    "Potential_CFD",
                    "dnn_model")) + '_cluster' + str(cluster_id) + '_ticketCNT'
        model4 = str(settings.get(
            "Potential_CFD", "temp_path_mod_potCFD")) + '/' + str(
                settings.get(
                    "Potential_CFD",
                    "dnn_model")) + '_cluster' + str(cluster_id) + '_days'

        feature_columns_to_use = [
            'DE_MANAGER_USERID', 'SEVERITY_CODE', 'LIFECYCLE_STATE_CODE',
            'PROJECT', 'PRODUCT', 'COMPONENT', 'ENGINEER', 'SUBMITTER_ID',
            'AGE', 'FEATURE', 'RELEASE_NOTE', 'SA_ATTACHMENT_INDIC',
            'CR_ATTACHMENT_INDIC', 'UT_ATTACHMENT_INDIC', 'IMPACT', 'ORIGIN',
            'IS_CUSTOMER_VISIBLE', 'INCOMING_INDIC', 'BACKLOG_INDIC',
            'DISPOSED_INDIC', 'TS_INDIC', 'SS_INDIC', 'OIB_INDIC',
            'STATE_ASSIGN_INDIC', 'STATE_CLOSE_INDIC', 'STATE_DUPLICATE_INDIC',
            'STATE_FORWARD_INDIC', 'STATE_HELD_INDIC', 'STATE_INFO_INDIC',
            'STATE_JUNK_INDIC', 'STATE_MORE_INDIC', 'STATE_NEW_INDIC',
            'STATE_OPEN_INDIC', 'STATE_POSTPONE_INDIC', 'STATE_RESOLVE_INDIC',
            'STATE_SUBMIT_INDIC', 'STATE_UNREP_INDIC', 'STATE_VERIFY_INDIC',
            'STATE_WAIT_INDIC', 'CFR_INDIC', 'S12RD_INDIC', 'S123RD_INDIC',
            'MISSING_SS_EVAL_INDIC', 'S123_INDIC', 'S12_INDIC', 'RNE_INDIC',
            'UPDATED_BY', 'DEV_ESCAPE_ACTIVITY', 'RELEASED_CODE',
            'TEST_EDP_ACTIVITY', 'TEST_EDP_PHASE', 'RESOLVER_ANALYSIS_INDIC',
            'SUBMITTER_ANALYSIS_INDIC', 'EDP_ANALYSIS_INDIC',
            'RETI_ANALYSIS_INDIC', 'DESIGN_REVIEW_ESCAPE_INDIC',
            'STATIC_ANALYSIS_ESCAPE_INDIC', 'FUNC_TEST_ESCAPE_INDIC',
            'SELECT_REG_ESCAPE_INDIC', 'CODE_REVIEW_ESCAPE_INDIC',
            'UNIT_TEST_ESCAPE_INDIC', 'DEV_ESCAPE_INDIC',
            'FEATURE_TEST_ESCAPE_INDIC', 'REG_TEST_ESCAPE_INDIC',
            'SYSTEM_TEST_ESCAPE_INDIC', 'SOLUTION_TEST_ESCAPE_INDIC',
            'INT_TEST_ESCAPE_INDIC', 'GO_TEST_ESCAPE_INDIC',
            'COMPLETE_ESCAPE_INDIC', 'SR_CNT', 'PSIRT_INDIC', 'BADCODEFLAG',
            'RISK_OWNER', 'SIR', 'PSIRT_FLAG', 'URC_DISPOSED_INDIC',
            'CLOSED_DISPOSED_INDIC', 'REGRESSION_BUG_FLAG'
        ]
        nonnumeric_columns = [
            'DE_MANAGER_USERID', 'LIFECYCLE_STATE_CODE', 'PROJECT', 'PRODUCT',
            'COMPONENT', 'ENGINEER', 'SUBMITTER_ID', 'FEATURE', 'RELEASE_NOTE',
            'IMPACT', 'ORIGIN', 'IS_CUSTOMER_VISIBLE', 'INCOMING_INDIC',
            'BACKLOG_INDIC', 'DISPOSED_INDIC', 'UPDATED_BY',
            'DEV_ESCAPE_ACTIVITY', 'RELEASED_CODE', 'TEST_EDP_ACTIVITY',
            'TEST_EDP_PHASE', 'BADCODEFLAG', 'RISK_OWNER', 'SIR', 'PSIRT_FLAG',
            'REGRESSION_BUG_FLAG'
        ]

        big_X = test_df[feature_columns_to_use]
        big_X = big_X.replace(np.nan, '', regex=True)
        big_X_imputed = DataFrameImputer().fit_transform(big_X)

        le = LabelEncoder()
        big_X_imputed["COMPONENT"] = big_X_imputed["COMPONENT"].astype(str)
        big_X_imputed["PRODUCT"] = big_X_imputed["PRODUCT"].astype(str)
        big_X_imputed["SUBMITTER_ID"] = big_X_imputed["SUBMITTER_ID"].astype(
            str)
        for feature in nonnumeric_columns:
            big_X_imputed[feature] = big_X_imputed[feature].astype(str)
            big_X_imputed[feature] = le.fit_transform(big_X_imputed[feature])

        thefile = str(settings.get(
            "Potential_CFD", "temp_path_mod_potCFD")) + '/' + settings.get(
                'Potential_CFD', 'potCFD_features') + str(cluster_id) + '.txt'
        with open(thefile, 'rb') as fp:
            feature_indices = pickle.load(fp)

        big_X_imputed = big_X_imputed.iloc[:, feature_indices]
        test_X = big_X_imputed.as_matrix()
        with open(model1, 'rb') as f:
            clf = pickle.load(f)
        test_X[test_X == ''] = 0
        #print(test_X[3090:3100])
        test_probs = clf.predict_proba(test_X)[:, 1]
        print("Model 1 ran")
        test_df["Prediction"] = test_probs

        ##################################SECOND MODEL################################

        top_words = 10000
        test_data = test_df[["ENCL-Description", "Headline", "ATTRIBUTE"]]
        stemmer = LancasterStemmer()
        i = 0

        test_data['ATTRIBUTE'] = test_data["ATTRIBUTE"].replace(np.nan, ' ')
        test_data['Headline'] = test_data["Headline"].replace(np.nan, ' ')
        test_data["complete"] = test_data["ENCL-Description"].astype(
            str) + test_data["Headline"].astype(
                str) + " " + test_data["ATTRIBUTE"].astype(str)

        thefile = str(settings.get("Potential_CFD", "temp_path_mod_potCFD")
                      ) + '/top_words_cluster_' + str(cluster_id) + '.txt'
        with open(thefile, 'rb') as fp:
            top_words = pickle.load(fp)

        f = str(settings.get("Potential_CFD", "temp_path_mod_potCFD")
                ) + '/indexes_cluster_' + str(cluster_id) + '.json'
        indexes = json.load(open(f, 'r'))

        testing_data = []
        i = 0
        for text in test_data["complete"]:
            #print(i)
            i = i + 1
            text_list = []
            if (not (pd.isnull(text))):
                for word in nltk.word_tokenize(text):
                    if word.lower() not in [
                            "?", "'s", ">", "<", ",", ":", "'", "''", "--",
                            "`", "``", "...", "", "!", "#", '"', '$', '%', '&',
                            '(', ')', '*', '+', '-', '.', '/', ';', '=', '@',
                            '[', '\\', ']', '^', '_', '{', '}', '|', '~', '\t',
                            '\n', ''
                    ] and '*' not in word.lower() and '=' not in word.lower(
                    ) and '++' not in word.lower() and '___' not in word.lower(
                    ) and (not word.isdigit()) and word.lower(
                    ) not in stop_words and (len(word) > 1):
                        stemmed_word = stemmer.stem(word.lower())
                        if stemmed_word not in top_words:
                            text_list.append(0)
                        else:
                            text_list.append(indexes[stemmed_word])
                testing_data.append(text_list)

        max_text_length = 150
        X_test = sequence.pad_sequences(testing_data, maxlen=max_text_length)

        model = load_model(model2)
        prediction = model.predict(X_test)
        print("Model 2 ran")
        test_df["test_pred"] = prediction
        test_df["Final_prediction"] = stacking_test(test_df, cluster_id)

        ##############################Model3##############################
        print("Starting model 3")
        print(test_df[['Final_prediction', 'test_pred']])
        print(cut_off)
        test_df1 = test_df[test_df['test_pred'] >=
                           float(cut_off)]  #Change it back to Final_prediction
        #print(test_df1)
        if (test_df1.shape[0] > 0):
            #test_df1['month_created'] = pd.to_datetime(test_df1['SUBMITTED_DATE']).dt.month
            #test_df1['year_created'] = pd.to_datetime(test_df1['SUBMITTED_DATE']).dt.year

            test_df1['COMPONENT'] = test_df1['COMPONENT'].astype(str)
            test_df1['PRODUCT'] = test_df1['PRODUCT'].astype(str)
            test_df1['SEVERITY_CODE'] = test_df1['SEVERITY_CODE'].astype(str)
            test_df1['SS_INDIC'] = test_df1['SS_INDIC'].astype(str)
            test_df1['TS_INDIC'] = test_df1['TS_INDIC'].astype(str)

            thefile = str(settings.get(
                "Potential_CFD", "temp_path_mod_potCFD")) + '/' + settings.get(
                    'Potential_CFD',
                    'potCFD_features') + 'dnn_' + str(cluster_id) + '.txt'
            with open(thefile, 'rb') as fp:
                new_feature_columns_to_use = pickle.load(fp)

            feature_columns_to_use = new_feature_columns_to_use  #+ ['month_created', 'year_created']
            categorical_features = new_feature_columns_to_use
            continuous_features = []  #['month_created', 'year_created']

            for feature in categorical_features:
                test_df1[feature] = test_df1[feature].astype(str)

            new_test_df = test_df1[feature_columns_to_use]

            engineered_features = []
            for continuous_feature in continuous_features:
                engineered_features.append(
                    tf.contrib.layers.real_valued_column(continuous_feature))

            for categorical_feature in categorical_features:
                sparse_column = tf.contrib.layers.sparse_column_with_hash_bucket(
                    categorical_feature, hash_bucket_size=1000)
                engineered_features.append(
                    tf.contrib.layers.embedding_column(
                        sparse_id_column=sparse_column,
                        dimension=16,
                        combiner="sum"))

            regressor2 = tf.contrib.learn.DNNRegressor(
                feature_columns=engineered_features,
                hidden_units=[64, 32, 10],
                model_dir=model3)

            #TensorFlow input functions for Text Analysis
            def input_fn(df, training=True):
                continuous_cols = {
                    k: tf.constant(df[k].values)
                    for k in continuous_features
                }
                categorical_cols = {
                    k: tf.SparseTensor(indices=[[i, 0]
                                                for i in range(df[k].size)],
                                       values=df[k].values,
                                       dense_shape=[df[k].size, 1])
                    for k in categorical_features
                }
                feature_cols = dict(
                    list(continuous_cols.items()) +
                    list(categorical_cols.items()))
                if training:
                    label = tf.constant(df[LABEL_COLUMN].values)
                    return feature_cols, label

                return feature_cols

            def train_input_fn():
                return input_fn(train_df1)

            def eval_input_fn():
                return input_fn(evaluate_df)

            def test_input_fn():
                return input_fn(new_test_df, False)

            #Predicting SR tickets
            predicted_output = regressor2.predict(
                input_fn=test_input_fn)  #input_fn(new_test_df, False))
            test_df1['Ticket_Predictions'] = list(predicted_output)

            #Predicting days ahead
            regressor2 = tf.contrib.learn.DNNRegressor(
                feature_columns=engineered_features,
                hidden_units=[64, 32, 10],
                model_dir=model4)
            predicted_output = regressor2.predict(
                input_fn=test_input_fn)  #input_fn(new_test_df, False))
            test_df1['Days_Predictions'] = list(predicted_output)

            now = datetime.datetime.now()
            test_df1.loc[test_df1['Ticket_Predictions'] == 0, 'Ticket_Predictions'] = 1
            test_df1.loc[test_df1['Ticket_Predictions'] < 0, 'Ticket_Predictions'] = 0
            test_df1.loc[test_df1['Days_Predictions'] < 0, 'Days_Predictions'] = 0
            #test_df1['days_ahead'] = (pd.to_datetime(test_df1['SUBMITTED_DATE']) - now)/np.timedelta64(1, 'D') + test_df1['Days_Predictions']

            test_df2 = test_df[[
                'IDENTIFIER', 'LIFECYCLE_STATE_CODE', 'DISPOSED_INDIC',
                'CFD_INDIC', 'AGE', 'ATTRIBUTE', 'COMPONENT',
                'DE_MANAGER_USERID', 'ENCL-Description', 'ENGINEER',
                'Headline', 'IMPACT', 'PRIORITY_CODE', 'PRODUCT', 'PROJECT',
                'SS_INDIC', 'TS_INDIC', 'SEVERITY_CODE', 'SUBMITTED_DATE',
                'SUBMITTER_ID', 'TICKETS_COUNT', 'VERSION_TEXT',
                'IFD_CFD_INDIC', 'Prediction', 'test_pred', 'Final_prediction'
            ]]
            #test_df2 = test_df[['IDENTIFIER', 'LIFECYCLE_STATE_CODE', 'DISPOSED_INDIC', 'CFD_INDIC', 'Prediction', 'test_pred', 'Final_prediction']]
            test_df3 = test_df1[[
                'IDENTIFIER', 'Ticket_Predictions', 'Days_Predictions'
            ]]

            final_test_df = pd.DataFrame()
            final_test_df = test_df2.join(test_df3.set_index('IDENTIFIER'),
                                          on='IDENTIFIER')
            final_test_df = final_test_df.drop_duplicates('IDENTIFIER')
            final_test_df['Prediction'] = final_test_df['Prediction'] * 100
            final_test_df['Final_prediction'] = final_test_df[
                'test_pred'] * 100  #Change it back to Final_prediction
            final_test_df['test_pred'] = final_test_df['test_pred'] * 100
            final_test_df['days_ahead'] = (pd.to_datetime(
                final_test_df['SUBMITTED_DATE']) - now) / np.timedelta64(
                    1, 'D') + final_test_df['Days_Predictions']
            final_test_df['Cluster'] = cluster_id
            final_test_df['last_run_date'] = now.strftime("%Y-%m-%d")

            final_test_df = final_test_df[
                final_test_df['test_pred'] >=
                cut_off * 100]  #Change it back to Final_prediction
            print(final_test_df.shape)
            #print(test_df1.shape)
            #Inserting data to view results collection
            if (view):
                vi_col_name_results = settings.get(
                    'Potential_CFD', 'viewPrefix') + str(view_id) + '_' + str(
                        query_id) + '_results'
                collection = db[vi_col_name_results]

            else:
                vi_col_name_results = settings.get(
                    'Potential_CFD', 'viewPrefix') + str(bu_id) + '_' + str(
                        query_id) + '_results'
                collection = db[vi_col_name_results]

            records = json2.loads(
                final_test_df.T.to_json(date_format='iso')).values()
            collection.create_index([("IDENTIFIER", pymongo.ASCENDING),
                                     ("last_run_date", pymongo.ASCENDING)],
                                    unique=True)
            print(collection.index_information())
            try:
                collection.insert(records)
                print("Inserted data to results collection")
            except pymongo.errors.DuplicateKeyError:
                print("Duplicates records in collection, so not inserting...")

            #Inserting data to View Mapper collection
            collection = db[settings.get('Potential_CFD',
                                         'Pot_cfd_viewCluster')]
            df = pd.DataFrame(columns=[
                'viewSetCollectionName', 'trainedOnCollectionName',
                'testCollectionName', 'clusterId', 'viewId', 'queryId', 'BU',
                'projectList', 'csap_last_run_date', 'cutoff'
            ])
            proj_list = ",".join(f_c)
            dat = now.strftime("%Y-%m-%d")
            #print(dat)
            if (view):
                df.loc[0] = [
                    vi_col_name_results, tr_col_name, te_col_name,
                    int(cluster_id),
                    int(view_id),
                    int(query_id), bu_id, proj_list, dat,
                    float(cut_off * 100)
                ]

            else:
                print("here")
                df.loc[0] = [
                    vi_col_name_results, tr_col_name, te_col_name,
                    int(cluster_id), view_id,
                    int(query_id),
                    str(bu_id), proj_list, dat,
                    float(cut_off * 100)
                ]

            records = json2.loads(df.T.to_json(date_format='iso')).values()
            collection.insert(records)
            print("Inserted data to View mapper collection")
        else:
            print("No predicted CFDs in this ViewSet")
Example #33
    words = []
    labels = []
    docs_x = []
    docs_y = []

    for intent in data["intents"]:
        for pattern in intent["patterns"]:
            wrds = nltk.word_tokenize(pattern)
            words.extend(wrds)
            docs_x.append(wrds)
            docs_y.append(intent["tag"])

            if intent["tag"] not in labels:
                labels.append(intent["tag"])

    words = [stemmer.stem(w.lower()) for w in words if w not in "?"]
    words = sorted(list(set(words)))

    labels = sorted(labels)

    training = []
    output = []

    out_empty = [0 for _ in range(len(labels))]

    for x, doc in enumerate(docs_x):
        bag = []

        wrds = [stemmer.stem(w) for w in doc]

        for w in words:
Example #34
        docs_y.append(intent["tag"])

    if intent["tag"] not in labels:
        labels.append(intent["tag"])

for intent2 in data["intents"]:
    for pattern2 in intent2["patterns"]:
        wrds2 = nltk.word_tokenize(pattern2)
        words2.extend(wrds2)
        docs_x2.append(wrds2)
        docs_y2.append(intent2["tag"])

    if intent2["tag"] not in labels2:
        labels2.append(intent2["tag"])

words = [stemmer.stem(w.lower()) for w in words if w != "?"]
words = sorted(list(set(words)))

words2 = [stemmer.stem(w2.lower()) for w2 in words2 if w2 != "?"]
words2 = sorted(list(set(words2)))

labels = sorted(labels)

labels2 = sorted(labels2)

training = []
output = []

training2 = []
output2 = []
Example #35
	words = []
	labels = []
	docs_x = []
	docs_y = []

	for command in data["commands"]:
		for pattern in command["patterns"]:
			words_list = nltk.word_tokenize(pattern)
			words.extend(words_list)
			docs_x.append(words_list)
			docs_y.append(command["tag"])

		if command["tag"] not in labels:
			labels.append(command["tag"])

	words = [stemmer.stem(w.lower()) for w in words if w not in ("?", "!")]
	words = sorted(list(set(words)))

	labels = sorted(labels)

	training = []
	output = []

	out_empty = [0 for _ in range(len(labels))]

	for x, doc in enumerate(docs_x):
		bag = []

		words_list = [stemmer.stem(w) for w in doc]

		for w in words:
Example #36
# Create various stemmer objects
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')
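# The loop below uses an input_words list that is not defined in this listing;
# a hypothetical definition so the comparison runs:
input_words = ['writing', 'calves', 'be', 'branded', 'horse', 'randomize',
               'possibly', 'provision', 'hospital', 'kept', 'scratchy', 'code']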

# Create a list of stemmer names for display
stemmer_names = ['PORTER', 'LANCASTER', 'SNOWBALL']
formatted_text = '{:>16}' * (len(stemmer_names) + 1)
print('\n', formatted_text.format('INPUT WORD', *stemmer_names), 
        '\n', '='*68)

# Stem each word and display the output
for word in input_words:
    output = [word, porter.stem(word), 
            lancaster.stem(word), snowball.stem(word)]
    print(formatted_text.format(*output))

#Chunking: dividing the input data into chunks. This is not the same as tokenization; chunks need to be meaningful units.
import nltk
nltk.download('brown')
import numpy as np
from nltk.corpus import brown
# Split the input text into chunks, where
# each chunk contains N words
def chunker(input_data, N):
    input_words = input_data.split(' ')
    output = []

    cur_chunk = []
    count = 0
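The chunker above is cut off by the listing. A minimal completion sketch of the word-count chunking loop it appears to set up (an assumption based on the variables it initializes):

# Sketch of a completed chunker(): group every N words into one chunk.
def chunker(input_data, N):
    input_words = input_data.split(' ')
    output = []

    cur_chunk = []
    count = 0
    for word in input_words:
        cur_chunk.append(word)
        count += 1
        if count == N:
            output.append(' '.join(cur_chunk))
            count, cur_chunk = 0, []

    if cur_chunk:
        output.append(' '.join(cur_chunk))
    return output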
Example #37
def ProcessData(data, train=False):
    check_tokenizer()
    if (train == False):
        try:
            with open(pickle_path, "rb") as f:
                words, labels, training, output = pickle.load(f)
            print("Loaded stemmed data from pickle")
        except Exception:
            print("Stemming data from the intents.json file")
            stemmer = LancasterStemmer()
            words = []
            labels = []
            docs_X = []
            docs_y = []

            with open(classes_path, "w") as f:
                f.write("")

            for intent in data["intents"]:
                for pattern in intent["patterns"]:
                    wrds = nltk.word_tokenize(pattern)
                    words.extend(wrds)
                    docs_X.append(wrds)
                    docs_y.append(intent["tag"])

                if intent["tag"] not in labels:
                    labels.append(intent["tag"])
                with open(classes_path, "a") as f:
                    f.write(intent["tag"] + "\n")

            #List of non-redundant words the model has seen
            words = [stemmer.stem(w.lower()) for w in words if w != "?"]
            words = np.array(words)
            words = sorted(np.unique(words))

            #labels (sorted)
            labels = sorted(labels)

            training = []
            output = []

            out_empty = [0 for _ in range(len(labels))]

            for x, doc in enumerate(docs_X):
                bag = np.array([])
                wrds = [stemmer.stem(w.lower()) for w in doc if w != "?"]

                for w in words:
                    if w in wrds:
                        bag = np.append(bag, np.array([1]))
                    else:
                        bag = np.append(bag, np.array([0]))

                output_row = out_empty[:]
                output_row[labels.index(docs_y[x])] = 1

                training.append(bag)
                output.append(np.argmax(output_row))

            #into np arrays
            training = np.asarray(training)
            output = np.asarray(output)

            with open(pickle_path, "wb") as f:
                print("Stemmed data saved in pickle file...")
                pickle.dump((words, labels, training, output), f)
    else:
        with open(classes_path, "w") as f:
            f.write("")
        print("Stemming data from the intents.json file")
        stemmer = LancasterStemmer()
        words = []
        labels = []
        docs_X = []
        docs_y = []

        for intent in data["intents"]:
            for pattern in intent["patterns"]:
                wrds = nltk.word_tokenize(pattern)
                words.extend(wrds)
                docs_X.append(wrds)
                docs_y.append(intent["tag"])

            if intent["tag"] not in labels:
                labels.append(intent["tag"])

            with open(classes_path, "a") as f:
                f.write(intent["tag"] + "\n")

        #List of non-redundant words the model has seen
        words = [stemmer.stem(w.lower()) for w in words if w != "?"]
        words = np.array(words)
        words = sorted(np.unique(words))

        #labels (sorted)
        labels = sorted(labels)

        training = []
        output = []

        out_empty = [0 for _ in range(len(labels))]

        for x, doc in enumerate(docs_X):
            bag = np.array([])
            wrds = [stemmer.stem(w.lower()) for w in doc if w != "?"]

            for w in words:
                if w in wrds:
                    bag = np.append(bag, np.array([1]))
                else:
                    bag = np.append(bag, np.array([0]))

            output_row = out_empty[:]
            output_row[labels.index(docs_y[x])] = 1

            training.append(bag)
            output.append(np.argmax(output_row))

        #into np arrays
        training = np.asarray(training)
        output = np.asarray(output)

        with open(pickle_path, "wb") as f:
            print("Stemmed data saved in pickle file...")
            pickle.dump((words, labels, training, output), f)
    return words, labels, training, output
Example #38
text2.concordance("monstrous")

text2.similar("monstrous")

text2.common_contexts(['monstrous', 'very'])
text4.dispersion_plot(['citizens', 'democracy', 'duties', 'America'])
text1.dispersion_plot(['happy', 'sad'])

text = "You have to ask yourself one question: Do I feel lucky? Well do ya, punk?"
sents = sent_tokenize(text)
print(sents)


words = [word_tokenize(sent) for sent in sents]

customStopWords = set(stopwords.words('english') + list(punctuation))

wordsWOStopwords = [
    word for word in word_tokenize(text) if word not in customStopWords
]
print(wordsWOStopwords)

text2 = "this is the end of the world as we know it, and I feel fine!!!."

st = LancasterStemmer()
stemmedWords = [st.stem(word) for word in word_tokenize(text2)]
print(stemmedWords)

nltk.pos_tag(word_tokenize(text2))
Exemple #39
0
words = []
classes = []
documents = []
ignore_words = ['?']
# loop through each sentence in our training data
for pattern in training_data:
    # tokenize each word in the sentence
    w = nltk.word_tokenize(pattern['sentence'])
    # add to our words list
    words.extend(w)
    # add to documents in our corpus
    documents.append((w, pattern['class']))
    # add to our classes list
    if pattern['class'] not in classes:
        classes.append(pattern['class'])

# stem and lower each word and remove duplicates
words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
words = list(set(words))

# remove duplicates
classes = list(set(classes))

print (len(documents), "documents")
print (len(classes), "classes", classes)
print (len(words), "unique stemmed words", words)


# create our training data
training = []
output = []
# create an empty array for our output
output_empty = [0] * len(classes)
Exemple #40
0
#Stemming
#Using the porter algorithm
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
porter_stemmer.stem('maximum')
porter_stemmer.stem('presumably')
porter_stemmer.stem('multiply')
porter_stemmer.stem('provision')
porter_stemmer.stem('owed')
porter_stemmer.stem('ear')
porter_stemmer.stem('saying')

#Using the Lancaster algorithm
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
lancaster_stemmer.stem('maximum')
lancaster_stemmer.stem('presumably')
lancaster_stemmer.stem('multiply')
lancaster_stemmer.stem('provision')
lancaster_stemmer.stem('owed')
lancaster_stemmer.stem('ear')
lancaster_stemmer.stem('saying')

#Snowball stemmer
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')
snowball_stemmer.stem('maximum')
snowball_stemmer.stem('presumably')
snowball_stemmer.stem('multiply')
snowball_stemmer.stem('provision')
snowball_stemmer.stem('owed')
Exemple #41
0
class Scrape:
    def __init__(self,
                 file_name,
                 tags,
                 remove_hyphen=None,
                 remove_apostrophe=None,
                 remove_stop_words=None):
        self.tags = tags
        with open(file_name, 'r') as file_handle:
            self.html = ''.join(file_handle.readlines())
            self.soup = BeautifulSoup(self.html, "html.parser")

        self.remove_hyphen = True if remove_hyphen is None else remove_hyphen
        self.remove_apostrophe = True if remove_apostrophe is None else remove_apostrophe
        self.remove_stop_words = True if remove_stop_words is None else remove_stop_words

        self.terms = []
        self.token_counter = 1

        self.stemmer = LancasterStemmer()

        self.doc_id = file_name.replace('WEBPAGES_RAW/', '')

    def parse_content(self, document):

        # Get a list of strings associated with each tag
        for x in self.soup.find_all(self.tags):
            tag = x.name
            if tag != 'p':
                tokens = self.tokenize(x.string)
            else:
                tokens = self.tokenize(x.text)
            if tokens is not None:
                for token in tokens:
                    # self.terms.append(Term(doc=document, term=token, tag_type=tag, position=self.token_counter))
                    self.terms.append({
                        'doc': self.doc_id,
                        'term': token,
                        'tag_type': tag,
                        'position': self.token_counter
                    })
                    self.token_counter += 1

        return self.terms

    def tokenize(self, text):

        # Skip elements that have no text content
        if text is None:
            return None

        # Convert to lowercase
        text = text.lower()

        # Removing hyphens and apostrophes if needed
        if self.remove_hyphen is True:
            text = text.replace('-', '')
        if self.remove_apostrophe is True:
            text = text.replace('\'', '')

        # Find all the alphanumeric tokens (the text is already lowercase)
        line_tokens = re.findall(r'\w+', text)

        # Stem each token with the Lancaster stemmer
        line_tokens = [self.stemmer.stem(token) for token in line_tokens]

        # Optionally drop English stop words
        if self.remove_stop_words is True:
            stop_words = set(stopwords.words('english'))
            line_tokens = [
                token for token in line_tokens if token not in stop_words
            ]

        return line_tokens
def stem(word):
    """Return the stemmed word using a Lancaster Stemmer."""
    st = LancasterStemmer()
    return st.stem(word)
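Constructing a LancasterStemmer on every call works, but a shared instance avoids the repeated setup cost. A minimal alternative sketch, assuming a module-level name _STEMMER that is not in the original:

from nltk.stem.lancaster import LancasterStemmer

_STEMMER = LancasterStemmer()  # assumed module-level instance, reused across calls

def stem(word):
    """Return the stemmed word using a shared Lancaster stemmer."""
    return _STEMMER.stem(word)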
Exemple #43
0
def stem_wordify(text):
    sw = nltk.corpus.stopwords.words('english')
    st = LancasterStemmer()
    # Stem every token, drop stop words, then stem again (preserves the original double-stemming)
    return [st.stem(w) for w in (st.stem(z) for z in wordify(text)) if w not in sw]
class PreProcessor:

    # nl_features : Sentences
    # text_labels : Python namespaces for the file to be mapped for respective sentence
    def __init__(self, nl_features, text_labels, stem_blacklist_words):
        self._nl_features = nl_features
        self._text_labels = text_labels
        self._stem_blacklist_words = stem_blacklist_words

        self._word_list = []
        self._unique_labels_list = []
        self._nl_feature_label_map = []

        self._stemmer = LancasterStemmer()

        self._features = []
        self._labels = []

    def tokenize_and_stem(self):
        for i in range(0, len(self._nl_features)):
            nl_feature = self._nl_features[i]
            text_label = self._text_labels[i]

            tokenized_feature = nltk.word_tokenize(nl_feature)
            self._word_list.extend(tokenized_feature)
            self._nl_feature_label_map.append((tokenized_feature, text_label))

            if text_label not in self._unique_labels_list:
                self._unique_labels_list.append(text_label)

        self._word_list = [self._stemmer.stem(word.lower()) for word in self._word_list if word not in self._stem_blacklist_words]
        self._word_list = list(set(self._word_list))

        self._unique_labels_list = list(set(self._unique_labels_list))

    def convert_to_patterns(self):
        encoded_label_template = [0] * len(self._unique_labels_list)

        for example in self._nl_feature_label_map:
            words_in_example = example[0] # Get the word list of the example
            label_of_example = example[1] # Get the label of the example

            words_in_example = [self._stemmer.stem(word_in_example.lower()) for word_in_example in words_in_example] # Stem each word

            pattern_for_words = []
            for unique_word in self._word_list:
                if unique_word in words_in_example:
                    pattern_for_words.append(1)
                else:
                    pattern_for_words.append(0)

            encoded_label = list(encoded_label_template)
            encoded_label[self._unique_labels_list.index(label_of_example)] = 1

            self._features.append(pattern_for_words)
            self._labels.append(encoded_label)

    def get_text_labels(self):
        return self._text_labels

    def get_unique_text_labels(self):
        return list(set(self._text_labels))

    def get_processed_features_and_labels(self):
        return self._features, self._labels

    def get_processed_features(self):
        return self._features

    def get_processed_labels(self):
        return self._labels

    def get_feature_length(self):
        return len(self._word_list)

    def get_unique_label_count(self):
        return len(self._unique_labels_list)

    def get_unique_wordlist(self):
        return self._word_list

    def validate_tasks_directory(self, tasks_directory_path):
        if os.path.exists(tasks_directory_path):
            structure_definitions_filepath = tasks_directory_path + "/" + consts.TASKS_STRUCT_FILE_FILENAME

            if os.path.exists(structure_definitions_filepath):
                structure_definitions_file_data = open(structure_definitions_filepath).read()
                structure_definitions_file_data = json.loads(structure_definitions_file_data)

                if all(basic_prop in structure_definitions_file_data for basic_prop in consts.TASKS_STRUCT_FILE_BASIC_PROPERTY_KEYS):

                    all_executors = structure_definitions_file_data[consts.TASKS_STRUCT_FILE_PROP_EXECUTORS]

                    for executor in all_executors:
                        executor_folder = executor[consts.TASKS_STRUCT_FILE_PROP_EXECUTORS_NAMESPACE]
                        executor_folder_path = tasks_directory_path + "/" + executor_folder

                        if os.path.exists(executor_folder_path):
                            executor_name = executor[consts.TASKS_STRUCT_FILE_PROP_EXECUTORS_CLASS]
                            executor_file_name = executor_name + ".py"
                            executor_file_path = executor_folder_path + "/" + executor_file_name

                            if not os.path.exists(executor_file_path):
                                return False, "Class File: " + executor_name + " cannot be found inside the Namespace Folder: " + executor_folder + "."
                        else:
                            return False, "Namespace folder: " + executor_folder + " not found."

                    return True, "Success"
                else:
                    return False, "One or more of the following required entries are not found in the " + consts.TASKS_STRUCT_FILE_FILENAME + ".\n" + consts.TASKS_STRUCT_FILE_PROP_DEP_DIRS + ", " + consts.TASKS_STRUCT_FILE_PROP_EXECUTORS
            else:
                return False, consts.TASKS_STRUCT_FILE_FILENAME + " does not exist inside " + tasks_directory_path +".\nPlease make sure the tasks executors folder path you've given contains valid TaskExecutors."
        else:
            return False, tasks_directory_path + " is an invalid directory."

    @staticmethod
    def get_sentence_patterns(sentence, word_list):
        tokenized_words = nltk.word_tokenize(sentence)
        stemmer = LancasterStemmer()
        stemmed_words = [stemmer.stem(tokenized_word.lower()) for tokenized_word in tokenized_words]

        patterns = [0] * len(word_list)
        for stemmed_word in stemmed_words:
            for index, trainingSet_word in enumerate(word_list):
                if stemmed_word == trainingSet_word:
                    patterns[index] = 1

        return patterns
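A hypothetical usage sketch for the class above; the toy sentences, labels, and blacklist are assumptions, and it presumes PreProcessor is importable and NLTK's punkt tokenizer data is installed:

nl_features = ["turn on the lights", "what time is it", "turn off the lights"]
text_labels = ["lights.LightsOn", "clock.TellTime", "lights.LightsOff"]

pre = PreProcessor(nl_features, text_labels, stem_blacklist_words=["?", "the"])
pre.tokenize_and_stem()    # builds the stemmed word list and the unique label list
pre.convert_to_patterns()  # encodes every sentence as a bag-of-words pattern

features, labels = pre.get_processed_features_and_labels()
patterns = PreProcessor.get_sentence_patterns("turn the lights on", pre.get_unique_wordlist())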
Exemple #45
0
        words, labels, training, output = pickle.load(f)
except:
    words = []
    labels = []
    doc_x = []
    doc_y = []
    for value in data['intents']:
        for pattern in value['patterns']:
            wrds = nltk.word_tokenize(pattern)
            words.extend(wrds)
            doc_x.append(wrds)
            doc_y.append(value['tag'])
        if value['tag'] not in labels:
            labels.append(value['tag'])
    words = [stemmer.stem(x.lower()) for x in words if x != '?']
    words = sorted(list(set(words)))

    labels = sorted(labels)

    training = []
    output = []

    output_empty = [0 for _ in range(len(labels))]

    for x, y in enumerate(doc_x):
        bag = []
        wrds = [stemmer.stem(w.lower()) for w in y]
        for w in words:
            if w in wrds:
                bag.append(1)
import nltk
from nltk.stem.lancaster import LancasterStemmer
stri = LancasterStemmer()
print(stri.stem('achievement'))
Exemple #47
0
class Naive_bayes():
    def __init__(self):
        self.stemmer = LancasterStemmer()
        self.training_data = []
        self.base_path = "../../../conversation/"

        # Read the training data files
        for filename in os.listdir(self.base_path):
            self.read_data(filename)

        self.corpus_words = {}
        self.class_words = {}
        self.classes = list(set([a['class'] for a in self.training_data]))
        for c in self.classes:
            self.class_words[c] = []
        self.extract_data()

    def read_data(self, filename):
        with open(self.base_path + filename, encoding="utf-8") as f:
            while True:
                line = f.readline().strip()
                if not line: break
                self.training_data.append({
                    "class": filename,
                    "sentence": line
                })

    def extract_data(self):
        for data in self.training_data:
            # Tokenize each sentence into words
            for word in nltk.word_tokenize(data['sentence']):
                if word not in ["?", "'s"]:
                    # stem and lowercase each word
                    stemmed_word = self.stemmer.stem(word.lower())

                    # Check whether the word has been seen before
                    if stemmed_word not in self.corpus_words:
                        self.corpus_words[stemmed_word] = 1
                    else:
                        self.corpus_words[stemmed_word] += 1

                    # Add the word to this class's word list
                    self.class_words[data['class']].extend([stemmed_word])

    def calculate_class_score_commonality(self,
                                          sentence,
                                          class_name,
                                          show_details=True):
        score = 0
        # Tokenize each word in the new sentence
        for word in nltk.word_tokenize(sentence):
            # Check whether the word's stem belongs to this class
            if self.stemmer.stem(word.lower()) in self.class_words[class_name]:
                # Add a weight inversely proportional to the word's corpus frequency
                score += (1 /
                          self.corpus_words[self.stemmer.stem(word.lower())])

                if show_details:
                    print("   match: %s (%s)" %
                          (self.stemmer.stem(word.lower()), 1 /
                           self.corpus_words[self.stemmer.stem(word.lower())]))
        return score

    def classify(self, sentence):
        high_class = None
        high_score = 0

        for c in self.class_words.keys():
            score = self.calculate_class_score_commonality(sentence,
                                                           c,
                                                           show_details=False)

            if score > high_score:
                high_class = c
                high_score = score
        return high_class, high_score
class AICodemaster(Codemaster):
    def __init__(self, brown_ic=None, glove_vecs=None, word_vectors=None):
        super().__init__()
        self.brown_ic = brown_ic
        self.glove_vecs = glove_vecs
        self.word_vectors = word_vectors
        self.wordnet_lemmatizer = WordNetLemmatizer()
        self.lancaster_stemmer = LancasterStemmer()
        self.cm_wordlist = []
        with open('players/cm_wordlist.txt') as infile:
            for line in infile:
                self.cm_wordlist.append(line.rstrip())

        self.bad_word_dists = None
        self.red_word_dists = None

    def set_game_state(self, words, maps):
        self.words = words
        self.maps = maps

    def get_clue(self):
        cos_dist = scipy.spatial.distance.cosine
        red_words = []
        bad_words = []

        # Creates Red-Labeled Word arrays, and everything else arrays
        for i in range(25):
            if self.words[i][0] == '*':
                continue
            elif self.maps[i] == "Assassin" or self.maps[
                    i] == "Blue" or self.maps[i] == "Civilian":
                bad_words.append(self.words[i].lower())
            else:
                red_words.append(self.words[i].lower())
        print("RED:\t", red_words)

        all_vectors = (self.glove_vecs, )
        bests = {}

        if not self.bad_word_dists:
            self.bad_word_dists = {}
            for word in bad_words:
                self.bad_word_dists[word] = {}
                for val in self.cm_wordlist:
                    b_dist = cos_dist(self.concatenate(val, all_vectors),
                                      self.concatenate(word, all_vectors))
                    self.bad_word_dists[word][val] = b_dist

            self.red_word_dists = {}
            for word in red_words:
                self.red_word_dists[word] = {}
                for val in self.cm_wordlist:
                    b_dist = cos_dist(self.concatenate(val, all_vectors),
                                      self.concatenate(word, all_vectors))
                    self.red_word_dists[word][val] = b_dist

        else:
            to_remove = set(self.bad_word_dists) - set(bad_words)
            for word in to_remove:
                del self.bad_word_dists[word]
            to_remove = set(self.red_word_dists) - set(red_words)
            for word in to_remove:
                del self.red_word_dists[word]

        for clue_num in range(1, 3 + 1):
            best_per_dist = np.inf
            best_per = ''
            best_red_word = ''
            for red_word in list(itertools.combinations(red_words, clue_num)):
                best_word = ''
                best_dist = np.inf
                for word in self.cm_wordlist:
                    if not self.arr_not_in_word(word, red_words + bad_words):
                        continue

                    bad_dist = np.inf
                    worst_bad = ''
                    for bad_word in self.bad_word_dists:
                        if self.bad_word_dists[bad_word][word] < bad_dist:
                            bad_dist = self.bad_word_dists[bad_word][word]
                            worst_bad = bad_word
                    worst_red = 0
                    for red in red_word:
                        dist = self.red_word_dists[red][word]
                        if dist > worst_red:
                            worst_red = dist

                    if worst_red < best_dist and worst_red < bad_dist:
                        best_dist = worst_red
                        best_word = word
                        # print(worst_red,red_word,word)

                        if best_dist < best_per_dist:
                            best_per_dist = best_dist
                            best_per = best_word
                            best_red_word = red_word
            bests[clue_num] = (best_red_word, best_per, best_per_dist)

        print("BESTS: ", bests)
        li = []
        pi = []
        chosen_clue = bests[1]
        chosen_num = 1
        for clue_num, clue in bests.items():
            best_red_word, combined_clue, combined_score = clue
            worst = -np.inf
            best = np.inf
            worst_word = ''
            for word in best_red_word:
                dist = cos_dist(self.concatenate(word, all_vectors),
                                self.concatenate(combined_clue, all_vectors))
                if dist > worst:
                    worst_word = word
                    worst = dist
                if dist < best:
                    best = dist
            if worst < 0.3 and worst != -np.inf:
                print(worst, chosen_clue, chosen_num)
                chosen_clue = clue
                chosen_num = clue_num

            li.append((worst / best, best_red_word, worst_word, combined_clue,
                       combined_score, combined_score**len(best_red_word)))

        if chosen_clue[2] == np.inf:
            chosen_clue = ('', li[0][3], 0)
            chosen_num = 1

        # print("The clue is: ", li[0][3])
        print('chosen_clue is:', chosen_clue)
        # return in array styled: ["clue", number]
        return chosen_clue[1], chosen_num  # [li[0][3], 1]

    def arr_not_in_word(self, word, arr):
        if word in arr:
            return False
        lemm = self.wordnet_lemmatizer.lemmatize(word)
        lancas = self.lancaster_stemmer.stem(word)
        for i in arr:
            if i == lemm or i == lancas:
                return False
            if i.find(word) != -1:
                return False
            if word.find(i) != -1:
                return False
        return True

    def combine(self, words, wordvecs):
        factor = 1.0 / float(len(words))
        new_word = self.concatenate(words[0], wordvecs) * factor
        for word in words[1:]:
            new_word += self.concatenate(word, wordvecs) * factor
        return new_word

    def concatenate(self, word, wordvecs):
        concatenated = wordvecs[0][word]
        for vec in wordvecs[1:]:
            concatenated = np.hstack((concatenated, vec[word]))
        return concatenated
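The clue search above hinges on cosine distance between word vectors. A minimal sketch of that measure with made-up 3-dimensional vectors standing in for real GloVe embeddings:

import numpy as np
from scipy.spatial.distance import cosine as cos_dist

clue_vec = np.array([0.2, 0.8, 0.1])   # made-up embedding for a candidate clue
red_vec = np.array([0.3, 0.7, 0.0])    # made-up embedding for a red word
bad_vec = np.array([-0.5, 0.1, 0.9])   # made-up embedding for a bad word

# A good clue is close to the red word (small distance) and far from the bad word
print(cos_dist(clue_vec, red_vec), cos_dist(clue_vec, bad_vec))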
Exemple #49
0
print("Categorias:" + str(categories))

for each_category in data.keys():
    for each_sentence in data[each_category]:
        # remove any punctuation from the sentence
        each_sentence = remove_punctuation(each_sentence)
        print(each_sentence)
        # extract words from each sentence and append to the word list
        w = nltk.word_tokenize(each_sentence)
        print("tokenized words: ", w)
        words.extend(w)
        docs.append((w, each_category))

# stem and lower each word and remove duplicates
words = [stemmer.stem(w.lower()) for w in words]
words = sorted(list(set(words)))

print(words)
print("\n")
print(docs)

# create our training data
training = []
output = []
# create an empty array for our output
output_empty = [0] * len(categories)

for doc in docs:
    # initialize our bag of words(bow) for each document in the list
    bow = []
Exemple #50
0
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

input_words = [
    'writing', 'calves', 'be', 'branded', 'horse', 'randomize', 'possibly',
    'provision', 'hospital', 'kept', 'scratchy', 'code'
]

# Stemmer objects
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

# Create a list of stemmer names for display
stemmer_names = ['PORTER', 'LANCASTER', 'SNOWBALL']
formatted_text = '{:>16}' * (len(stemmer_names) + 1)
print(formatted_text.format('INPUT WORD', *stemmer_names))
print('=' * 75)

# Stem each word and display the output
for word in input_words:
    output = [
        word,
        porter.stem(word),
        lancaster.stem(word),
        snowball.stem(word)
    ]
    print(formatted_text.format(*output))
Exemple #51
0
from nltk.stem.lancaster import LancasterStemmer

st = LancasterStemmer()

from nltk.stem import PorterStemmer

pt = PorterStemmer()

from nltk.stem.snowball import EnglishStemmer

sb = EnglishStemmer()

from nltk.stem.wordnet import WordNetLemmatizer

wn = WordNetLemmatizer()

##let's examine the word ``better"
st.stem('better')
pt.stem('better')
sb.stem('better')
wn.lemmatize('better', 'a')

wn.lemmatize('families', 'n')

##
##applying the porter stemmer to the gettysburg address

text_5 = [pt.stem(w) for w in text_4]

##now creating a dictionary that will count the occurrence of the words

getty = {}
used = []
Exemple #52
0
    def __init__(self, d):
        self.categories = None
        self.conversations = None
        for k, v in d.items():
            setattr(self, k, v)


data = []
for fn in listdir(PATH):
    with open(PATH + fn, 'r') as s:
        data.append(Corpus(yaml.safe_load(s)))

stemmer = LancasterStemmer()

clear_sentence = lambda sentence: ' '.join(
    [stemmer.stem(w) for w in nltk.word_tokenize(sentence)])

questions = []
classes = []

for item in data:
    cat = item.categories[0]
    for quest in item.conversations:
        for q in quest:
            questions.append(clear_sentence(q))
            classes.append(cat)

tfv = TfidfVectorizer(stop_words='english')
le = LabelEncoder()

X = tfv.fit_transform(questions)
Exemple #53
0
    infile.close()

    jsonList = json.loads(lines)

for tweet in jsonList:
    wordlist = clean_tweet(tweet)
    #print type(tweet)

    for word in wordlist:
        if len(word) == 1 or word in stopwords:
            continue

        if any(s in word.lower() for s in specialList):
            continue

        wordcloudlist += ' {}'.format(ls.stem(word))
#print text2
#out = text2.translate(string.maketrans("",""), string.punctuation)

#ls.stem(out)
#wnl.lemmatize(out)
#ss.stem(out)
#print out

# Generate a word cloud image
#wordcloud = WordCloud().generate(text)

# lower max_font_size
wordcloud1 = WordCloud(max_font_size=40).generate(wordcloudlist)

# Display the generated image:
docs_y = []

for intent in data["intents"]:  #Loop on All question types
    for pattern in intent[
            "patterns"]:  #All patterns in a single question type,i.e; all the sentences
        wrds = nltk.word_tokenize(pattern)  # divides based on space
        words.extend(wrds)
        docs_x.append(
            wrds
        )  #docs_x contains all sentences as a list of words,i.e;list of lists of words
        docs_y.append(intent["tag"])

    if intent["tag"] not in labels:
        labels.append(intent["tag"])

words = [stemmer.stem(w.lower()) for w in words
         if w != "?"]  #stemmer reduces similar words into a single word
words = sorted(list(set(words)))  #set makes sure it doesn't have duplicates

labels = sorted(labels)

training = []
output = []

out_empty = [0 for _ in range(len(labels))]

for x, doc in enumerate(docs_x):
    bag = []

    wrds = [stemmer.stem(w.lower()) for w in doc]
#sentiment analysis

import nltk
path = r'D:\projects\chat.txt'
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
from trainsenti import training_data
corpus_words = {}
class_words = {}
classes = list(set([a['class'] for a in training_data]))
for c in classes:
    class_words[c] = []
for data in training_data:
    for word in nltk.word_tokenize(data['sentence']):
        if word not in ["?", "'s"]:
            stemmed_word = stemmer.stem(word.lower())
            if stemmed_word not in corpus_words:
                corpus_words[stemmed_word] = 1
            else:
                corpus_words[stemmed_word] += 1
            class_words[data['class']].extend([stemmed_word])


def calculate_class_score(sentence, class_name, show_details=True):
    score = 0
    for word in nltk.word_tokenize(sentence):
        if stemmer.stem(word.lower()) in class_words[class_name]:
            score += (1 / corpus_words[stemmer.stem(word.lower())])
    return score
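A small follow-up sketch, not in the original file, that picks the class with the highest score from calculate_class_score above (the name classify_sentiment is assumed):

def classify_sentiment(sentence):
    best_class, best_score = None, 0
    for c in class_words.keys():
        score = calculate_class_score(sentence, c, show_details=False)
        if score > best_score:
            best_class, best_score = c, score
    return best_class, best_score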

Exemple #56
0
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

words = [
    'table', 'probably', 'wolves', 'playing', 'is', 'dog', 'the', 'beaches',
    'grounded', 'dreamt', 'envision'
]

# Compare different stemmers
stemmers = ['PORTER', 'LANCASTER', 'SNOWBALL']
stemmer_porter = PorterStemmer()
stemmer_lancaster = LancasterStemmer()
stemmer_snowball = SnowballStemmer('english')

formatted_row = '{:>16}' * (len(stemmers) + 1)
print(formatted_row.format('WORD', *stemmers))
for word in words:
    stemmed_words = [
        stemmer_porter.stem(word),
        stemmer_lancaster.stem(word),
        stemmer_snowball.stem(word)
    ]
    print(formatted_row.format(word, *stemmed_words))
Exemple #57
0
        wrds = nltk.word_tokenize(pattern)
        words.extend(wrds)
        docs_x.append(wrds)
        docs_y.append(intent['tag'])

        if intent['tag'] not in labels:
            labels.append(intent['tag'])


words = [stemmer.stem(w.lower()) for w in words]
words = sorted(list(set(words)))
labels = sorted(labels)

training = []
output = []

out_empty = [0 for _ in range(len(labels))]

for x,doc in enumerate(docs_x):
    bag = []

    wrds = [stemmer.stem(w.lower()) for w in doc]

    for w in words:
        if w in wrds:
            bag.append(1)
        else:
            bag.append(0)

    output_row = out_empty[:]
    output_row[labels.index(docs_y[x])] = 1
Exemple #58
0
def lancasterStem(features):
    lancasterStemmer = LancasterStemmer()
    return [lancasterStemmer.stem(feature) for feature in features]
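Hypothetical usage, assuming the LancasterStemmer import shown in the other examples; the token list is made up:

print(lancasterStem(['running', 'happily', 'maximum']))  # prints the Lancaster stem of each token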
Exemple #59
0
    words = []
    labels = []
    docs_x = []
    docs_y = []

    for intent in data["intents"]:
        for pattern in intent["patterns"]:
            wrds = nltk.word_tokenize(pattern)
            words.extend(wrds)
            docs_x.append(wrds)
            docs_y.append(intent["tag"])

        if intent["tag"] not in labels:
            labels.append(intent["tag"])

    words = [stemmer.stem(w.lower()) for w in words if w != "?"]
    words = sorted(list(set(words)))

    labels = sorted(labels)

    training = []
    output = []

    out_empty = [0 for _ in range(len(labels))]

    for x, doc in enumerate(docs_x):
        bag = []

        wrds = [stemmer.stem(w.lower()) for w in doc]

        for w in words:
    #scan through the document to take out patterns and tokenize them
    #add all the tokenized words in a single list
    #split the patterns and tags(data_x and data_y)
    #take out distinct tags (labels)
    #a lexicon is created(words)
    for intent in data["intents"]:
        for pattern in intent["patterns"]:
            wrds = nltk.word_tokenize(pattern)
            words.extend(wrds)
            data_x.append(wrds)
            data_y.append(intent["tag"])

        if (intent["tag"] not in labels):
            labels.append(intent["tag"])
    words = [stemmer.stem(w) for w in words if w != "?"]
    words = sorted(list(set(words)))
    labels = sorted(labels)

    training = []
    output = []
    out_empty = [0 for _ in range(len(labels))]

    for x, doc in enumerate(data_x):
        bag = []
        wrds = [stemmer.stem(w) for w in doc]
        for w in words:
            if w in wrds:
                bag.append(1)
            else:
                bag.append(0)