def getMaybeWords(self, text_ls):
      ignoreWords = ["","have","her","there","the","be","to","of","and","a","in","that","it","for","on","with","as","at","this","but","his","by","from","they","or","an","will","would","so","even","is","am","are"]

      word_ls = []
      for text in text_ls:
         word_ls += wordpunct_tokenize(text)
         
      frequencies = {}
      st = LancasterStemmer()
      for word in word_ls:
         if not word[0].isalpha():
            continue
         if word in ignoreWords:
            continue
         word_stem = st.stem(word)
         if word_stem in frequencies:
            frequencies[word_stem] += 1
         else:
            frequencies[word_stem] = 1

      sorted_frequencies = sorted(frequencies.iteritems(), key = operator.itemgetter(1), reverse =  True)
      #print sorted_frequencies

      max_words = 30
      if len(sorted_frequencies) < max_words:
         max_words = len(sorted_frequencies)
      word_tuples = sorted_frequencies[0:max_words]
      words = [pair[0] for pair in word_tuples]
      print words
      return words
Example #2
    def build_analyzer(self):
        """
        Return a callable that handles preprocessing and tokenization
        """
        preprocess = self.build_preprocessor()
        tokenize = self.build_tokenizer()
        stemmer = LancasterStemmer()

        filter_meta = lambda doc: ' '.join([w for w in doc.split() if not w.startswith('~')])
        parse_words = lambda doc: tokenize(preprocess(filter_meta(self.decode(doc))))
        stem_words = lambda doc: [stemmer.stem(t) for t in parse_words(doc)]
        meta_func = lambda prefix: lambda doc: (t for t in self.decode(doc).split() if t.startswith(prefix))

        feat_func_map = {
            'word': lambda doc: self._word_ngrams(parse_words(doc), self.get_stop_words()),
            'stem': lambda doc: self._word_ngrams(stem_words(doc), self.get_stop_words()),
            '1st': lambda doc: ('~T:1st' for i in parse_words(doc) if i in first_person_words),
            '3rd': lambda doc: ('~T:3rd' for i in parse_words(doc) if i in third_person_words),
            'tag': lambda doc: self._word_ngrams([t[1] for t in nltk.pos_tag(parse_words(doc))]),
            'length': lambda doc: ['~L:%d' % (len(parse_words(doc)) / 5)],
            'genre': meta_func('~G'),
            'rating': meta_func('~Ra'),
            'votes': meta_func('~V'),
            'lang': meta_func('~La'),
            'country': meta_func('~Co'),
            'year': meta_func('~Y'),
            'runtime': meta_func('~Rt'),
            'type': meta_func('~T')
        }
        func_list = [feat_func_map.get(flag.strip()) for flag in self.analyzer.split(':')] \
            if type(self.analyzer) is str else None
        if not func_list:
            raise ValueError('%s is not a valid tokenization scheme/analyzer' % self.analyzer)
        else:
            return lambda doc: itertools.chain.from_iterable(f(doc) for f in func_list if callable(f))
Example #3
def prepare_corpus(raw_documents):
    # remove punctuation
    print "Removing Punctuation"
    import string
    exclude = set(string.punctuation)
    raw_documents = [''.join(ch for ch in s if ch not in exclude) for s in raw_documents]

    # remove common words
    print "Calculating Stoplist"
    stoplist = set([x.rstrip() for x in codecs.open("stop_list.txt", encoding='utf-8') if not x.startswith("#")])
    stoplist = stoplist.union(set(nltk.corpus.stopwords.words("english")))
    # print stoplist

    print "Removing Stoplist and Stemming"

    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()

    texts = [[st.stem(word) for word in document.lower().split() if word not in stoplist]
             for document in raw_documents]

    # remove words that appear only once
    print "Removing Single Variables"
    all_tokens = sum(texts, [])
    tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once]
             for text in texts]

    return texts
Example #4
def tokenize_rest(text):
    wnl =  WordNetLemmatizer()
    st = LancasterStemmer()
    words = nltk.word_tokenize(text)
    postag = nltk.pos_tag(words)
    
    tokens = []
    whfound=False
    for word in words:
        if word[0:2].lower() == 'wh' and not whfound:
            tokens.append({word.lower():'wh'})
            whfound = True
            continue
        elem=wnl.lemmatize(word)
        stem = st.stem(elem)
        synd = wn.synsets(stem)
        if not synd:
            stem = stemmer(elem)  # falls back to a module-level stemming helper named "stemmer" (defined elsewhere)
            synd = wn.synsets(stem)
        if not synd:
            stem = elem
            synd = wn.synsets(stem)
        dbelement=detect(stem)
        if dbelement:
            for every_elem in dbelement:
                tokens.append({word:every_elem})
    print "\n Rest of possible Tokens"
    print tokens
    return tokens
def stem_tweet(tweet, stemmer_type = "lancaster"):
    """
    :param tweet: string representing tweet
    :param stemmer_type: type of stemmer used (default value is lancaster)
    :return: stemmed tweet
    :type tweet: str
    :type stemmer_type: str
    """
    tokens = nltk.word_tokenize(tweet)
    stemmed_tokens = []
    if stemmer_type == "lancaster":
        stemmer = LancasterStemmer()
    elif stemmer_type == "snowball":
        stemmer = SnowballStemmer("english")
    elif stemmer_type == "porter":
        stemmer = PorterStemmer()
    elif stemmer_type == "regexp":
        # RegexpStemmer expects a regular expression of suffixes to strip,
        # not a language name, so use a simple suffix pattern here
        stemmer = RegexpStemmer('ing$|s$|e$|able$', min=4)
    else:
        return None

    for token in tokens:
        stemmed_tokens.append(stemmer.stem(token))

    ret_tw = "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in stemmed_tokens]).strip()
    return ret_tw
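# A minimal usage sketch for stem_tweet above (an illustration, not part of the
# original snippet). It assumes nltk, string and the stemmer classes used by
# stem_tweet are importable, and that the "punkt" tokenizer data is installed.
if __name__ == "__main__":
    example_tweet = "Loving these rainy days, staying in and reading books!"
    for kind in ("lancaster", "snowball", "porter", "regexp"):
        print(kind, "->", stem_tweet(example_tweet, stemmer_type=kind))
    print(stem_tweet(example_tweet, stemmer_type="unknown"))  # unsupported types return None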
    def train_lsi_model(self, texts, num_of_topics=10):
        texts_tokenized = [[word.lower()
                          for word in word_tokenize(text)]
                          for text in texts]
        # remove the stop words and punctuations
        english_stop_words = stopwords.words('english')
        english_punctuations = [',', '.', ':', '?', '(', ')', '[',
                                ']', '@', '&', '!', '*', '#', '$', '%']
        texts_filtered = [[word for word in text_tokenized
                         if (not word in english_punctuations) and
                         (not word in english_stop_words)]
                         for text_tokenized in texts_tokenized]
        # stem the word
        st = LancasterStemmer()
        texts_stemed = [[st.stem(word) for word in text_filtered]
                       for text_filtered in texts_filtered]

        all_stems = sum(texts_stemed, [])
        stem_once = set(stem for stem in set(all_stems)
                        if all_stems.count(stem) == 1)
        cleaned_texts = [[stem for stem in text if stem not in stem_once]
                        for text in texts_stemed]

        dictionary = corpora.Dictionary(cleaned_texts)
        corpus = [dictionary.doc2bow(text) for text in cleaned_texts]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary,
                              num_topics=num_of_topics)
        result = lsi[corpus]
        return result
Example #7
	def parse_raw_data(self, new_art):
		self.startClass=default_timer()
		tokenizer = RegexpTokenizer(r'\w+')
		tokens = tokenizer.tokenize(new_art.body)
		stemmer = LancasterStemmer()
		article_dic = new_art.words
		global_dic = self.raw_dictionary

		for word in tokens:
			word = word.lower()
			if not self.is_stop_word(word) and not word.isnumeric():
				s_word = stemmer.stem(word)

			#	s_word = word
			## it is not a stop word, check if the word
			## is already part of the article dictionary.
			## if yes, increment the count else add it.
			## If you are adding check if it is part of
			## the big corpus, if yes increment the count
			## of number of articles with that word.
				self.globalWordCount+=1
				new_art.doc_len = new_art.doc_len + 1
				if(s_word in article_dic):
					article_dic[s_word].wrd_count+=1
					global_dic[s_word].wrd_count+=1
				else:
					article_dic[s_word] = local_word_attributes(1)

					if (s_word in global_dic):
						global_dic[s_word].art_count+=1
						global_dic[s_word].wrd_count+=1
					else:
						global_dic[s_word] = global_word_attributes(1,1, 1, 0)
Example #8
def word_standardize(sentences): 	
    tokens = []
    sentences_st = []

    for sent in sentences:
        tokens.extend(word_tokenize(sent))
        sentences_st.append(word_tokenize(sent))
	
    words = tokens
    
    st = LancasterStemmer()

    words = [w.lower() for w in words]
    words = [w for w in words if not w in stopwords.words('english')]
    words = [w for w in words if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
    st_words = [st.stem(w) for w in words]

    sent_result = []
    for sent in sentences_st:
        sent = [w.lower() for w in sent]
        sent = [w for w in sent if not w in stopwords.words('english')]
        sent = [w for w in sent if not w in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']
        sent_result.append(sent)

    return st_words, sent_result
def tweetTokenizer(tweet_text):
	st = LancasterStemmer()
	twitterWords = tweet_text.split()

	# remove stop words using the NLTK corpus
	twitterWords = [word.lower() for word in twitterWords]
	twitterWords = [w for w in twitterWords if not w in stopwords.words('english')]

	# remove a custom list of stop words found through experimentation
	noiseWords = ["i'm", "like", "get", "don't", "it's", "go", "lol", "got",
		"one", "know", "@", "good", "want", "can't", "need", "see",
		"people", "going", "back", "really", "u", "think", "right",
		"never", "day", "time", "that's", "even", ",", ".",
		"make", "wanna", "you're", "come", "-", "still", "much", "someone",
		"today", "gonna", "new", "would", "take", "always", "im", "i'll",
		"best", "'", "feel", "getting", "say", "tonight", "last", "ever",
		"better", "i've", "look", "f*****g", "way", "could", "!", "oh",
		"tomorrow", "night", "first", "miss", "ain't", "thank", "2", "bad",
		"little", "thanks", "something", "wait", "&amp;", "`", "let", "stop",
		"well", "tell"]

	twitterWords = [w for w in twitterWords if not w in noiseWords]
	twitterWords = [st.stem(w) for w in twitterWords]

	return twitterWords
Example #10
def predict_category_subcategory(book_name):
	data_set1 = pandas.Series(book_name.encode('ascii'))

    #Data Preprocessing
	data_set1 = data_set1.dropna(axis=0,how='any')
	data_set1 = data_set1.str.lower()

    #Manual removal List
	remove_list = ['edition','ed','edn', 'vol' , 'vol.' , '-' ,'i']


	data_set1[0] =' '.join([i for i in data_set1[0].split() if i not in remove_list])

	data_set1 = data_set1.apply(lambda x :re.sub(r'\w*\d\w*', '', x).strip())
	data_set1 = data_set1.apply(lambda x :re.sub(r'\([^)]*\)', ' ', x))
	data_set1 = data_set1.apply(lambda x :re.sub('[^A-Za-z0-9]+', ' ', x))
    #data_set['Category ID'] = data_set['Category ID']+"|"+data_set['Subcategory ID']


    #Stemming the book titles
	stemmer = LancasterStemmer()
	data_set1[0]=" ".join([stemmer.stem(i) for i in  data_set1[0].split()])

	clf = joblib.load(os.path.join(BASE_DIR+"/learners/",'category_predict.pkl'))
	ans = clf.predict(data_set1)
	sub_clf = joblib.load(os.path.join(BASE_DIR+"/learners/",'subcategory_predict.pkl'))
	sub_ans = sub_clf.predict(data_set1)
	return [ans[0],sub_ans[0]]
Example #11
def parse_validation(validation_path):
    validation_list = []
    with open(validation_path) as f:
        for line in f:
            strs = line.split('|')
            word_dict = {}
            validation_list.append(word_dict)
            word_dict["word"] = strs[0].strip()
            word_dict["real_sense"] = int(strs[1])
            sentence_list = []
            word_dict["sentence"] = sentence_list

            lmtzr = WordNetLemmatizer()
            ls = LancasterStemmer()
            single_words = re.findall("(\w+|%%)",strs[2])
            double_mod_found = False
            word_count = 0
            for single_word in single_words:
                if single_word == "%%":
                    if not double_mod_found:
                        word_dict["target_word_idx"] = word_count+1
                        double_mod_found = True
                    continue
                lemmed = lmtzr.lemmatize(single_word)
                stemmed = ls.stem(lemmed)
                if not stemmed in glob_Lucene:
                    sentence_list.append(stemmed)
                    word_count += 1

    return validation_list
def lemmstem(sentences):
    ''' This function is responsible for performing
        the lemmatization and stemming of the words.
        Input: A list of trees containing the sentences.
                All words are classified by their NE type
        Output: Lemmatized/stemmed sentences
    '''
    
    lmtzr = WordNetLemmatizer()
    st = LancasterStemmer()
    
    dic = {'VB' :wordnet.VERB,
            'NN': wordnet.NOUN,
            'JJ':wordnet.ADJ,
            'RB':wordnet.ADV }
    
    for sent in sentences:
      
        lvsidx=sent.treepositions('leaves') 
       
        for pos in lvsidx:
            word=sent[pos][0]
            tag = sent[pos][1]
            rtag = tag[0:2]
            if rtag in dic:
                lemm=lmtzr.lemmatize( word, dic[rtag] )
                stem=st.stem(lemm)
                #print word, lemm, stem #Linia maldita
                sent[pos]=(word, tag, stem)
            else:
                sent[pos]=(word, tag, word)
    
    return sentences
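# A hedged usage sketch for lemmstem above (illustration only): the expected
# input is a list of NE-chunked, POS-tagged sentence trees such as the output
# of nltk.ne_chunk, whose leaves are (word, tag) pairs; after the call each
# leaf becomes a (word, tag, stem) triple. Assumes the relevant nltk data
# (punkt, tagger, chunker, wordnet) is installed.
from nltk import word_tokenize, pos_tag, ne_chunk

example_sentences = ["Barack Obama was reading several reports in Washington."]
example_trees = [ne_chunk(pos_tag(word_tokenize(s))) for s in example_sentences]
for tree in lemmstem(example_trees):
    print(tree.leaves())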
Example #13
def stemming(words):
    wordsAfterStemming=[]
    st=LancasterStemmer()
    for x in words:
        y=st.stem(x)
        wordsAfterStemming.append(y)
    return wordsAfterStemming
def readText(textFile):			
	examples = []
	count = 0
	lexicon_en = {}
	lexicon_ge = {}
	stem_en = LancasterStemmer()
	stem_ge = nltk.stem.snowball.GermanStemmer()
	for line in open(textFile):
		count+=1
		if count % 1000 == 0:
			print count
		lans = line.lower().strip().split("|||")
		#german = [stem_ge.stem(x.decode('utf-8')) for x in lans[0].strip().split(" ")]
		german = lans[0].strip().split(" ")
		german = process(german)
		for wordx in german:
			for word in wordx:
				if word not in lexicon_ge:
					lexicon_ge[word]=1
				else:
					lexicon_ge[word]+=1
		# NOTE: the stemmed English tokens are immediately overwritten by the raw split below
		eng = [stem_en.stem(x.decode('utf-8')) for x in lans[1].strip().split(" ")]
		#parse_en = pattern.en.parse(" ".join(eng))
		eng = lans[1].strip().split(" ")
		for word in eng:
			if word not in lexicon_en:
				lexicon_en[word]=1
			else:
				lexicon_en[word]+=1
		examples.append(Example(german,eng))
	return examples, lexicon_en, lexicon_ge
Example #15
def remove_stems(file):
    new_file = []
    punctuation = re.compile(r'[.,"?!:;]')
    lemmatizer = WordNetLemmatizer()
    stemmer = LancasterStemmer()

    for raw_post in file:
        post = raw_post[1]
        token = nltk.word_tokenize(post)
        token_tags = nltk.pos_tag(token)

        new_token = []
        for word in token_tags:
            # Remove punctuation and convert to lower case
            original_word = punctuation.sub("", word[0].lower())

            # Stem each word to its root: try the lemmatizer first, then fall back to the Lancaster stemmer
            stemmed_word = lemmatizer.lemmatize(original_word)
            if original_word == stemmed_word:
                stemmed_word = stemmer.stem(stemmed_word)

            # Removes stopwords that are defined in the nltk library
            if stemmed_word not in nltk.corpus.stopwords.words('english') and stemmed_word != '':
                new_token.append((stemmed_word, word[1]))

        new_file.append((raw_post[0], new_token))
    return new_file
def get_pretrained_vector(session, word2vec_model, vocab_path, vocab_size, vectors):
    print(vectors)
    with gfile.GFile(vocab_path, mode="r") as vocab_file:
        st = LancasterStemmer()
        counter = 0
        counter_w2v = 0.0
        while counter < vocab_size:
            vocab_w = vocab_file.readline().replace("\n", "")

            # vocab_w = st.stem(vocab_w)
            # for each word in vocabulary check if w2v vector exist and inject.
            # otherwise dont change value initialise randomly.
            if word2vec_model and vocab_w and word2vec_model.__contains__(vocab_w) and counter > 3:
                w2w_word_vector = word2vec_model.get_vector(vocab_w)
                print("word:%s c:%i w2v size %i" % (vocab_w, counter, w2w_word_vector.size))
                vectors[counter] = w2w_word_vector
                counter_w2v += 1
            else:
                vocab_w_st = st.stem(vocab_w)
                if word2vec_model and vocab_w_st and word2vec_model.__contains__(vocab_w_st):
                    w2w_word_vector = word2vec_model.get_vector(vocab_w_st)
                    print("st_word:%s c:%i w2v size %i" % (vocab_w_st, counter, w2w_word_vector.size))
                    vectors[counter] = w2w_word_vector
                    counter_w2v += 1
                else:
                    if not vocab_w:
                        print("no more words.")
                        break

            counter += 1
        print("injected %f per cent" % (100 * counter_w2v / counter))
        print(vectors)
    return vectors
Example #17
 def processRawData(self, inputPath, outputPath):
   raw = pickle.load(open(inputPath, "r"))
   data = []
   genres = set([])
   count = 0
   st = LancasterStemmer()
   for key in raw.keys():
     movie = raw[key]
     # if no genre or synopsis data
     if 'genres' not in movie or 'synopsis' not in movie: continue
     if len(movie['genres'])==0 or movie['synopsis'] == '': continue
     temp = {}
     temp['genres'] = movie['genres']
     for g in temp['genres']:
       genres.add(g)
     # trim out the punctuation and transform to lowercase
     #replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))
     s = str(movie['synopsis'])
     s = s.translate(string.maketrans("",""), string.punctuation)
     s = re.sub(' +', ' ', s).strip()
     s = " ".join(st.stem(word) for word in s.split(" "))
     temp['synopsis'] = s.lower()
     data.append(temp)
     count += 1
   # output as a pickle file 
   file = open(outputPath, 'wb')
   pickle.dump(data, file)
   print 'processed ' + str(count) + ' movies'
   return genres
def preprocess(reviews):
	import nltk
	from nltk.tokenize import word_tokenize

	review_tokenized = [[word.lower() for word in word_tokenize(review.decode('utf-8'))] for review in reviews] 
	#print "review tokenize done"

	#remove stop words
	from nltk.corpus import stopwords
	english_stopwords = stopwords.words('english')
	review_filterd_stopwords = [[word for word in review if not word in english_stopwords] for review in review_tokenized]
	#print 'remove stop words done'

	#remove punctuations
	english_punctuations = [',','.',':',';','?','(',')','&','!','@','#','$','%']
	review_filtered = [[word for word in review if not word in english_punctuations] for review in review_filterd_stopwords]
	#print 'remove punctuations done'

	#stemming
	from nltk.stem.lancaster import LancasterStemmer
	st = LancasterStemmer()
	review_stemmed = [[st.stem(word) for word in review] for review in review_filtered]
	#print 'stemming done'

	return review_stemmed
Example #19
def overlapping_text(text_1, text_2):
	st = LancasterStemmer()
	cachedStopWords = get_stopwords()
	text_1_list = ([st.stem(word) for word in text_1.split() if word not in cachedStopWords])
	text_2_list = ([st.stem(word) for word in text_2.split() if word not in cachedStopWords])
	return jaccard_dist(text_1_list, text_2_list)
Example #20
def preprocess(content):
	stopset = set(stopwords.words('english'))
	#replace punctuation and tag with space
	tokens = word_tokenize(re.sub(r'<p>|</p>|[^A-Za-z ]', ' ', content.lower())) 
	pos_list = pos_tag(tokens)
	s_tokens = list()

	#noun and verb only
	for pos in pos_list:
		#print pos[1]
		#if pos[1] in ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
		if pos[1] in ['NN', 'NNS']:
			s_tokens.append(pos[0])

	wordfreq = FreqDist(s_tokens)
	stemfreq = dict()
	st = LancasterStemmer()
	# iterate over a copy, since entries are deleted from wordfreq below
	for word, freq in list(wordfreq.items()):
		#stopwords
		if word in stopset:
			del wordfreq[word]
			continue
		#tiny words
		if len(word) <= 2:
			del wordfreq[word]
			continue
		#stemmer
		stem = st.stem(word)
		try:
			stemfreq[stem]+=freq
		except KeyError:
			stemfreq[stem]=freq
	return stemfreq
Example #21
def simplify_old(s):
    res = ''
    st = LancasterStemmer()

    text = nltk.word_tokenize(s)
    tags = nltk.pos_tag(text)

    for tag in tags:
        word = tag[0]
        if f.checkPos(tag[1]):
            if word in model:
                word_stem = st.stem(word)
                top_words = model.most_similar(positive=[word], topn = 20)
                candidate_list = [w[0] for w in top_words]
                freq_list = [fdist[w] for w in candidate_list]
                c_f_list = zip(candidate_list, freq_list)
                ordered_list = sorted(c_f_list, key=lambda c_f_list:c_f_list[1], reverse=True)
                word_freq = fdist[word]
                #			synonmys = f.getSynonmys(word)  ## get synonmys from wordnet
                # print synonmys
                for w in ordered_list:
                    if not f.freq_diff(word_freq, w[1]):  ## break for loop if candidate word frequency does not exceed the word frequency by a threshold
                            break
                    if st.stem(w[0]) != word_stem and f.samePos(word, w[0]): ##exclude morphological derivations and same pos
                            word = w[0]  ### do not use wordnet
        # if w[0] in synonmys:
        # 	word = w[0]
        # else:
        # 	for syn in synonmys:
        # 		if st.stem(w[0]) == st.stem(syn):
        # 			word = w[0]

        res = res + word + ' '
    return res
Example #22
def mapper():

    #list of fields in positional order expected in inbound
    #forum node data.
    fieldnames = ['id', 'title', 'tag_names', 'author_id', 'body',
                    'node_type', 'parent_id', 'abs_parent_id', 
                    'added_at', 'score', 'state_string', 'last_edited_id',
                    'last_activity_by_id', 'last_activity_at', 
                    'active_revision_id', 'extra', 'extra_ref_id',
                    'extra_count', 'marked']

    reader = csv.DictReader(sys.stdin, delimiter='\t', fieldnames=fieldnames)
    stemmer = LancasterStemmer()
    stopw = stopwords.words('english')

    split_pattern = re.compile('[\W.!?:;"()<>[\]#$=\-/]')
    for line in reader:        
        
        pid = line['id']
        body = line['body']
        
        # split body into words
        words = split_pattern.split(body)
     
        # map the stemmer function across all the words.
        # and use the Counter to create a dict
        # of counted stems. Remove english stopwords.
        stem_counts = Counter((stemmer.stem(x) for x in words  if x not in stopw))        
        
        # emit the stem, count and node id
        # for reduction into the reverse index
        for stem, count in stem_counts.items():
        	print "{stem}\t{node_id}\t{count}".format(stem=stem, node_id=pid, count=count)
	def filt(string):

		ret = string

		#	Filter all punctuation from string
		for p in punctuation:
			ret = ret.replace(p, '')

		#	Replace hyphens with spaces
		ret = ret.replace('-', ' ')
		oldret = ret
		ret = ""

		#	Filter all stop words from string
		for word in oldret.split():
			if (word in allStopWords) or len (word) <= 1:
				pass
			else:
				ret += word.lower() +  " "

		st = LancasterStemmer()
		stemmed = ""

		for word in ret.split():
			try:
				stemmed += str(st.stem(word)) + " "

			except UnicodeDecodeError:
				pass

		return stemmed
def stem_text(text):
    stm = LancasterStemmer()
    tokens = text.split()
    words = [stm.stem(w) for w in tokens]
    snt = " ".join(words)

    return snt
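# A minimal usage sketch for stem_text above (illustration only); the Lancaster
# algorithm is aggressive, so the returned stems are often much shorter than
# the original words.
print(stem_text("maximum multiply presumably friendships"))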
def lemmatizer_newsheadlines() :
    lancaster_stemmer = LancasterStemmer()
    frl=open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/lemma1.csv","rU")
    fr=open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/sample.csv","rU")
    fw=open("C:/Users/rajas/Downloads/csv_files-2014-12-10/csv files/lemmaheadlines.csv","w")
    for headline in fr:
        if len(headline)>0:
          headlinelist=headline.split(",")
        
          if len(headlinelist)==3:
            headlinewords=headlinelist[1].split(" ")
            print(headlinewords)
            for word in headlinewords:
              wordcor=(((word.replace("?","")).replace(":","")).replace("\"",""))    
               
              headlineword=(lancaster_stemmer.stem(wordcor)).lower()
              print(headlineword) 
     #         for line in frl:
      #          crimelist=line.split(",")
       #         crimeword=((crimelist[1].replace("\"","")).strip()).lower()
               
        #        print(crimeword+str(i))
         #       i+=1
              dictcrime=lemmadict()
              if headlineword in dictcrime:
                  print(headlineword+"yipee")
                  fw.write(headlineword+","+headlinelist[0]+","+headlinelist[1]+"\n")
                                    
                  break;
    frl.close()     
    fw.close()
    fr.close()
def process(reviews):
	#tokenize each review into words
	from nltk.tokenize import word_tokenize
	review_tokenized = [[word.lower() for word in word_tokenize(review.decode('utf-8'))] for review in reviews]

	#remove stop words
	from nltk.corpus import stopwords
	english_stopwords = stopwords.words('english')

	review_filterd_stopwords = [[word for word in review if not word in english_stopwords] for review in review_tokenized]

	#remove punctuations
	english_punctuations = [',','.','...', ':',';','?','(',')','&','!','@','#','$','%']
	review_filtered = [[word for word in review if not word in english_punctuations] for review in review_filterd_stopwords]

	#stemming
	from nltk.stem.lancaster import LancasterStemmer
	st = LancasterStemmer()
	review_stemmed = [[st.stem(word) for word in review] for review in review_filtered]

	#remove words that appear only once
	all_stems = sum(review_stemmed, [])
	stems_once = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
	final_review = [[stem for stem in text if stem not in stems_once] for text in review_stemmed]

	return final_review
Example #27
def extractRawTrainingData(text, stopWords, stemming = None):
    st = LancasterStemmer()
    rawData = []
    text.readline()
    print("********** Extract From Raw Training Data **********")

    if stopWords:
        sign = 'ON'
    else:
        sign = 'OFF'
    print("Stopwords:" + sign)

    if stemming:
        sign = 'ON'
    else:
        sign = 'OFF'
    print("Stemming:" + sign)

    prevId = 0
    print("Extracting...")
    for line in text:

        lineTokens = line.strip('\n').split('\t')
        sentenceId = int(lineTokens[1])
        '''
        if sentenceId > prevId:

            prevId = sentenceId
            sentenceStr = lineTokens[2]
            sentiment = int(lineTokens[3])

            sentenceTokens = re.sub("\s+", " ", sentenceStr).split(' ')

            if stemming:
                sentenceTokens = map(lambda x:unicode(st.stem(x).lower()),sentenceTokens)
            else:
                sentenceTokens = map(lambda x:unicode(x.lower()),sentenceTokens)

            sentenceTokens = stripWords(sentenceTokens,stopWords)
            entry = {"sentenceId":sentenceId, "sentence":sentenceTokens, "sentiment":sentiment}
            rawData.append(entry)
        '''
        sentenceStr = lineTokens[2]
        sentiment = int(lineTokens[3])

        sentenceTokens = re.sub("\s+", " ", sentenceStr).split(' ')

        if stemming:
            sentenceTokens = map(lambda x:unicode(st.stem(x).lower()),sentenceTokens)
        else:
            sentenceTokens = map(lambda x:unicode(x.lower()),sentenceTokens)

        sentenceTokens = stripWords(sentenceTokens,stopWords)
        entry = {"sentenceId":sentenceId, "sentence":sentenceTokens, "sentiment":sentiment}
        rawData.append(entry)

    print("Done")
    print(len(rawData))
    return rawData
Example #28
def stem_funct(text):
    res = ''
    #Use NLTK's stemmer
    st = LancasterStemmer()
    #Stem each word and append the result to the string
    for word in text.split(' '):
        res += ' ' + st.stem(word)
    return res
def containKeywords(text, keywords):
    letters_only = re.sub("[^a-zA-Z0-9]", " ", text)
    lower_case = letters_only.lower()
    words = lower_case.split()
    words = [word for word in words if not word in stopwords.words("english")]
    st = LancasterStemmer()
    stemmed = [st.stem(word) for word in words]
    return (any(i in stemmed for i in keywords))
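# A minimal usage sketch for containKeywords above (illustration only): the
# keywords are matched against Lancaster stems of the lower-cased text, so they
# should themselves be stems produced by the same stemmer. Assumes re,
# nltk.corpus.stopwords and LancasterStemmer are imported as in the function.
_st = LancasterStemmer()
example_keywords = [_st.stem("running"), _st.stem("payment")]
print(containKeywords("She kept running after the payment cleared", example_keywords))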
 def tokenizeRawText(self):
     for each in self.rawtext:
         st = LancasterStemmer()
         try:
             ev = st.stem(each.lower())
             self.rawTokens.append(ev)
         except UnicodeDecodeError as e:
             self.unicodeErrors = self.unicodeErrors + 1
# things we need in general
import sys
import pickle
import json
import ijson.backends.yajl2 as ijson
# things we need for NLP
import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
# things we need for Tensorflow
import os
import numpy as np
import tflearn
import tensorflow as tf
import random

#get all the inputs
training_input    = sys.argv[1]
training_logs     = sys.argv[2]
model_output      = sys.argv[3]
training_data_file= sys.argv[4]
words_file        = str(training_data_file) + ".words"
classes_file      = str(training_data_file) + ".classes"
documents_file    = str(training_data_file) + ".documents"
training_text_file= str(training_data_file) + ".txt"

# helper methods
def load_json(filepath):
    return json.load(open(filepath, "r"))

def save_json(data, filepath):
Example #32
 def __init__(self):
     self.stemmer = LancasterStemmer()
     with open('intents_data.json', 'r') as json_data:
         self.intents = json.load(json_data)
Example #33
def text_to_wordlist(text, remove_stop_words=True, stem_words=False, lemma=True):
    # Clean the text, with the option to remove stop_words and to stem words.

    # Clean the text
    text = text.rstrip('?')
    text = text.rstrip(',')
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"What's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"india", "India", text)
    text = re.sub(r"switzerland", "Switzerland", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text) 
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r"quora", "Quora", text)
    text = re.sub(r" dms ", "direct messages ", text)  
    text = re.sub(r"demonitization", "demonetization", text) 
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r"KMs", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text) 
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iPhone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text) 
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"gps", "GPS", text)
    text = re.sub(r"gst", "GST", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"dna", "DNA", text)
    text = re.sub(r"III", "3", text) 
    text = re.sub(r"the US", "America", text)
    text = re.sub(r"Astrology", "astrology", text)
    text = re.sub(r"Method", "method", text)
    text = re.sub(r"Find", "find", text) 
    text = re.sub(r"banglore", "Banglore", text)
    text = re.sub(r" J K ", " JK ", text)
    
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])

    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        #stemmer = SnowballStemmer('english')
        #stemmed_words = [stemmer.stem(word) for word in text]
        stemmed_words = [nltk.PorterStemmer().stem(word.lower()) for word in text]
        text = " ".join(stemmed_words)
    
    # Optionally reduce words with the Lancaster stemmer (labelled "lemma" here,
    # although this is stemming rather than true lemmatization)
    if lemma:
        text = text.split()
        lancaster_stemmer = LancasterStemmer()
        lemma_words = [lancaster_stemmer.stem(word.lower()) for word in text]
        text = " ".join(lemma_words)
    return text
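# A minimal usage sketch for text_to_wordlist above (illustration only); assumes
# re, nltk, punctuation and the module-level stop_words referenced by the
# function are available.
example_question = "What's the best way to learn programing quikly?"
print(text_to_wordlist(example_question))                               # Lancaster-stemmed
print(text_to_wordlist(example_question, stem_words=True, lemma=False)) # Porter-stemmed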
Example #34
import nltk
# nltk.download('punkt')
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

import numpy
import tflearn
import tensorflow
import random
import json
import pickle

with open("intents.json") as file:
    data = json.load(file)

# skip preprocessing if it has already been done
try:
    with open("data.pickle", "rb") as f:
        words, labels, training, output = pickle.load(f)
except:
    words = [] # tokenized words in patterns of all tags
    labels = [] # distinct tags
    docs_x = [] # list of all words
    docs_y = [] # list of tags associated with the words in docs_x
    ignore_letters = [",", "!", ".", ":", "?"]

    for intent in data["intents"]:
        for pattern in intent["patterns"]:
            wrds = nltk.word_tokenize(pattern)
            words.extend(wrds)
            docs_x.append(wrds)
Example #35
'''
Stemming: reduce words to their stems so that different forms of the same word are treated as one token.
Part of Speech Tagging: tag each word as a noun, verb, adjective, etc.
'''
import nltk
from nltk.tokenize import word_tokenize

#Stemmer used: Lancaster Stemmer
text = "Mary closed on closing night when she was in the mood to close."
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
stemmedWords = [st.stem(word) for word in word_tokenize(text)]
print(stemmedWords)

#Tagging - Noun, Verb, Adverb, Adjective, etc
#NNP-Proper Noun, VBD-Verb, NN-Noun, PRP-Pronoun
#nltk.download('averaged_perceptron_tagger')
pos_tags = nltk.pos_tag(word_tokenize(text))
print(pos_tags)
for i in range(0, len(x)):
    text = x[i]
    num_char_w = len(text)
    feature_set[i].append(num_char_w)

#Number of characters without whitespace (6)
for i in range(0, len(x)):
    text = x[i]
    num_char = 0
    for j in range(0, len(text)):
        if text[j] != ' ':
            num_char = num_char + 1
    feature_set[i].append(num_char)

tkr = RegexpTokenizer('[a-zA-Z0-9@]+')
stemmer = LancasterStemmer()

tokenized_corpus = []

for i, news in enumerate(x):
    tokens = [stemmer.stem(t) for t in tkr.tokenize(news)]
    tokenized_corpus.append(tokens)

#Number of unique words (7)
for i in range(0, len(tokenized_corpus)):
    text = tokenized_corpus[i]
    s = set(text)
    unq = len(s)
    feature_set[i].append(len(s))

#Lexical Density or Complexity- Number of Unique Tokens Divided by total number of words (8)
Example #37
 def lanStem(self, token):
     ls = LancasterStemmer()
     for w in token:
         print(ls.stem(w))
Example #38
# use natural language toolkit
import json
import nltk
from nltk.stem.lancaster import LancasterStemmer
# word stemmer
stemmer = LancasterStemmer()
training_data = []
#with open("new.txt", "r") as read_file:
#   data = read_file.readlines()
#for i in data:
#    training_data.append(i)
#training_data=training_data.split(",")
#print(training_data[1])

training_data.append({
    "Class": "Description",
    "Question": "What is Filename injection Path traversel ?"
})
training_data.append({
    "Class":
    "Description",
    "Question":
    "What does Filename injection Path traversel mean ?"
})
training_data.append({
    "Class":
    "Description",
    "Question":
    "Tell me something about Filename injection Path traversel ?"
})
training_data.append({
Example #39
class ChatBot(object):

    instance = None

    @classmethod
    def getBot(cls):
        if cls.instance is None:
            cls.instance = ChatBot()
        return cls.instance

    def __init__(self):
        print("Init")
        if self.instance is not None:
            raise ValueError("Did you forget to call the getBot function?")

        self.stemmer = LancasterStemmer()
        data = pickle.load(open(path.getPath('trained_data'), "rb"))
        self.words = data['words']
        self.classes = data['classes']
        train_x = data['train_x']
        train_y = data['train_y']
        with open(path.getJsonPath()) as json_data:
            self.intents = json.load(json_data)
        net = tflearn.input_data(shape=[None, len(train_x[0])])
        net = tflearn.fully_connected(net, 8)
        net = tflearn.fully_connected(net, 8)
        net = tflearn.fully_connected(net,
                                      len(train_y[0]),
                                      activation='softmax')
        net = tflearn.regression(net)
        self.model = tflearn.DNN(net,
                                 tensorboard_dir=path.getPath('train_logs'))
        self.model.load(path.getPath('model.tflearn'))

    def clean_up_sentence(self, sentence):
        sentence_words = nltk.word_tokenize(sentence)
        sentence_words = [
            self.stemmer.stem(word.lower()) for word in sentence_words
        ]
        return sentence_words

    def bow(self, sentence, words, show_details=False):
        sentence_words = self.clean_up_sentence(sentence)
        bag = [0] * len(words)
        for s in sentence_words:
            for i, w in enumerate(words):
                if w == s:
                    bag[i] = 1
                    if show_details:
                        print("found in bag: %s" % w)
        return np.array(bag)

    def classify(self, sentence):
        ERROR_THRESHOLD = 0.25
        results = self.model.predict([self.bow(sentence, self.words)])[0]
        results = [[i, r] for i, r in enumerate(results)
                   if r > ERROR_THRESHOLD]
        results.sort(key=lambda x: x[1], reverse=True)
        return_list = []
        for r in results:
            return_list.append((self.classes[r[0]], r[1]))
        return return_list

    def response(self, sentence, userID='111', show_details=False):
        results = self.classify(sentence)
        context = {}
        if results:
            while results:
                for i in self.intents['intents']:
                    if i['tag'] == results[0][0]:
                        if 'context_set' in i:
                            if show_details:
                                print('context:', i['context_set'])
                            context[userID] = i['context_set']
                        if 'context_filter' not in i or \
                                (userID in context and i['context_filter'] == context[userID]):
                            if show_details: print('tag:', i['tag'])
                            return random.choice(i['responses'])
                return "I can't guess"
# To build the final list of words (for the inverted index) we remove the columns Preptime, Cooktime and Recipeyield
f = open("ricette.csv", "r", encoding='utf-8-sig')
ricette = []
for row in csv.reader(f, delimiter='\t'):
    if row:
        a = []
        a.extend(row[:3])
        a.extend(row[6:])
        ricette.append(a)
f.close()

#%%
# Define some objects that we will use later
stop = stopwords.words('english')
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
st = LancasterStemmer()
#%%
# We create a list of all the foods that contain lactose
Intol = []
f = open("intol.txt")
for row in csv.reader(f, delimiter='\t'):
    Intol.append(row[0])
f.close()
text_i = " ".join(Intol).lower()
# Tokenization and stemming
tokens_i = tokenizer.tokenize(text_i)
Intol_stem = []
for w in tokens_i:
    Intol_stem.append(st.stem(w))
Intol_stem = set(Intol_stem)
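# A hedged sketch (not from the original notebook) of how the stemmed
# intolerance set can be used: flag a recipe whose ingredient text shares at
# least one stem with Intol_stem.
def contains_lactose(ingredient_text):
    ingredient_stems = {st.stem(w) for w in tokenizer.tokenize(ingredient_text.lower())}
    return bool(ingredient_stems & Intol_stem)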
#%%


# LDA
repeat(2, 6, cleaned_corpus)

# remove words 'love' and 'like'
sw = stopwords.words('english') + ['love', 'Love', 'like', 'Like']
repeat('LDA', 2, 4, cleaned_corpus, sw=sw)

# stem
stemmed_corpus = []
stemmer = LancasterStemmer()  # create the stemmer once instead of once per word
for poem in cleaned_corpus:
    stemmed_poem = []
    for word in poem:
        stemmed_poem.append(stemmer.stem(word))
    stemmed_corpus.append(''.join(stemmed_poem))

repeat('LDA', 2, 4, stemmed_corpus, sw=sw)
repeat('LDA', 3, 5, stemmed_corpus, sw=(sw + ['one', 'know', 'would']))

# nouns only
nouns_corpus = []
for poem in df['POS']:
    poem_nouns = []
    for word in poem:
        if word[1] == 'NN':
            poem_nouns.append(word[0] + ' ')
    nouns_corpus.append(''.join(poem_nouns))

repeat('LDA', 2, 5, nouns_corpus, sw=sw)
Example #42
import nltk
from nltk.stem.lancaster import LancasterStemmer

stemmer = LancasterStemmer()
import numpy as np
import scipy
import pandas
import tflearn
import tensorflow as tf
import random
import json
import speech_recognition as sr

# restore all of our data structures

import pickle

data = pickle.load(open("training_data", "rb"))

words = data['words']

classes = data['classes']

train_x = data['train_x']

train_y = data['train_y']

# import our chat-bot intents file

import json
Example #43
import nltk
nltk.download('punkt')
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

import tensorflow as tf
import numpy as np
import tflearn
import random
import json

from google.colab import files
files.upload()


#import chat bot intents file
with open('intents.json') as json_data:
  intents = json.load(json_data)

#running intent file

words = []
classes = []
documents = []
ignore = ['?']

#looping through each sentence in the json file's pattern
for intent in intents['intents']:
  for pattern in intent['patterns']:
    #tokeninzing each word in the sentence
    w = nltk.word_tokenize(pattern)
Example #44
def train_model():
    with open('intents.json') as json_data:
        intents = json.load(json_data)

    words = []  #Design the Vocabulary (unique words)
    classes = []
    documents = []
    ignore_words = ['?']
    # loop through each sentence in our intents patterns
    for intent in intents['intents']:
        for pattern in intent['patterns']:
            # tokenize each word in the sentence
            w = nltk.word_tokenize(pattern)
            # add to our words list
            words.extend(w)
            # add to documents in our corpus
            documents.append((w, intent['tag']))
            # add to our classes list
            if intent['tag'] not in classes:
                classes.append(intent['tag'])

    stemmer = LancasterStemmer()

    # stem and lower each word and remove duplicates
    words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
    words = sorted(list(set(words)))

    # remove duplicates
    classes = sorted(list(set(classes)))

    # create our training data
    training = []

    # create an empty array for our output
    output_empty = [0] * len(classes)

    # training set, bag of words for each sentence
    for doc in documents:
        # initialize our bag of words
        bag = []
        # list of tokenized words for the pattern (pattern = what user says)
        pattern_words = doc[0]
        # stem each word
        pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]
        # create our bag of words array
        # mark the presence of words as a boolean value, 0 for absent, 1 for present.
        for w in words:
            bag.append(1) if w in pattern_words else bag.append(0)

        # output is a '0' for each tag and '1' for current tag
        output_row = list(output_empty)
        output_row[classes.index(doc[1])] = 1

        training.append([bag, output_row])

    # shuffle our features and turn into np.array
    random.shuffle(training)
    training = np.array(training)

    # create train and test lists
    train_x = list(training[:, 0])
    train_y = list(training[:, 1])

    # reset underlying graph data
    tf.reset_default_graph()
    # Build neural network
    net = tflearn.input_data(shape=[None, len(train_x[0])])
    net = tflearn.fully_connected(net, 8)
    net = tflearn.fully_connected(net, 8)
    net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')
    net = tflearn.regression(net)

    # Define model and setup tensorboard
    model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')
    # Start training (apply gradient descent algorithm)
    model.fit(train_x, train_y, n_epoch=1000, batch_size=8, show_metric=True)
    model.save('model.tflearn')

    # save all of our data structures
    import pickle
    pickle.dump(
        {
            'words': words,
            'classes': classes,
            'train_x': train_x,
            'train_y': train_y
        }, open("training_data", "wb"))


#train_model()
Example #45
import nltk

text = "strange lying saved discusses men builds"
print("原始文本:")
print(text)
# 词干提取与词形还原之前先进行分词
tokens = nltk.word_tokenize(text)

# Stemming: extract word stems
# Import the Porter and Lancaster stemmer modules
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
# Instantiate a PorterStemmer object
porter_stemmer = PorterStemmer()
# Instantiate a LancasterStemmer object
lancaster_stemmer = LancasterStemmer()
# Create stemmed_list and lancaster_list to hold the PorterStemmer and LancasterStemmer results respectively
stemmed_list = []
lancaster_list = []
for token in tokens:
    stemmed_list.append(porter_stemmer.stem(token))
    lancaster_list.append(lancaster_stemmer.stem(token))
print("提取词干结果:")
print("1.PorterStemmer:", stemmed_list)
print("2.LancasterStemmer:", lancaster_list)

# Lemmatization-词形还原
# nltk的Lemmatization是基于WordNet实现的,导入WordNetLemmatizer。
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
Example #46
for i in range(len(list_raw1)):
    list_tok1.append(nltk.word_tokenize(list_raw1[i].lower()))

for i in range(len(list_raw2)):
    list_tok2.append(nltk.word_tokenize(list_raw2[i].lower()))

list_pos1 = []
list_pos2 = []

for i in range(len(list_raw1)):
    list_pos1.append(nltk.pos_tag(list_tok1[i]))

for i in range(len(list_raw2)):
    list_pos2.append(nltk.pos_tag(list_tok2[i]))

st = LancasterStemmer()

#print (pos)
#grammar = "NP: {<DT>?<JJ>*<NN>}"
#cp = nltk.RegexpParser(grammar)
#result = cp.parse(pos)
#print (result)
#result.draw()
list1 = []
list2 = []
for j in range(len(list_pos1)):
    list1 = []
    for i in range(len(list_pos1[j])):
        if list_pos1[j][i][1] == 'NN':
            list1.append(list_pos1[j][i][0])
        if list_pos1[j][i][1] == 'JJ':
Example #47
    def fit(self, module=None):

        if not module:
            module = self.module

        intents = {}
        for intent in module.intents:
            if intent.patterns:
                intents[intent.name] = {"patterns": []}
                for pattern in intent.patterns:
                    intents[intent.name]['patterns'].append(pattern.text)

        garbage_training_intents = Intent().select().where(
            Intent.agent != module.id)
        intents['not_found'] = {"patterns": []}
        for intent in garbage_training_intents:
            if intent.patterns:
                for pattern in intent.patterns:
                    intents['not_found']['patterns'].append(pattern.text)

        vocabulary = []
        classes = []
        documents = []
        ignore_words = ['?']

        for intent_name in intents:
            intent = intents[intent_name]
            for pattern in intent['patterns']:
                w = nltk.word_tokenize(pattern)
                vocabulary.extend(w)
                documents.append((w, intent_name))
                if intent_name not in classes:
                    classes.append(intent_name)

        stemmer = LancasterStemmer()
        vocabulary = [
            stemmer.stem(w.lower()) for w in vocabulary
            if w not in ignore_words
        ]
        vocabulary = sorted(list(set(vocabulary)))

        classes = sorted(list(set(classes)))
        training = []
        output_empty = [0] * len(classes)

        for doc in documents:
            bag = []
            pattern_words = doc[0]
            pattern_words = [
                stemmer.stem(word.lower()) for word in pattern_words
            ]
            for word in vocabulary:
                bag.append(1) if word in pattern_words else bag.append(0)

            output_row = list(output_empty)
            output_row[classes.index(doc[1])] = 1
            training.append([bag, output_row])

        random.shuffle(training)
        training = np.array(training)
        train_x = list(training[:, 0])
        train_y = list(training[:, 1])

        tf_model = Sequential()
        tf_model.add(
            Dense(128, input_shape=(len(train_x[0]), ), activation='relu'))
        tf_model.add(Dropout(0.5))
        tf_model.add(Dense(64, activation='relu'))
        tf_model.add(Dropout(0.5))
        tf_model.add(Dense(len(train_y[0]), activation='softmax'))

        sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
        tf_model.compile(loss='categorical_crossentropy',
                         optimizer=sgd,
                         metrics=['accuracy'])

        tf_model.fit(np.array(train_x),
                     np.array(train_y),
                     epochs=200,
                     batch_size=5,
                     verbose=1)

        save_model(tf_model, 'chat/' + module.name + '.h5', True)
        #converter = tf.lite.TFLiteConverter.from_keras_model_file('chat/model.h5')
        #tflite_model = converter.convert()
        #open("chat/model.tflite", "wb").write(tflite_model);

        with open("chat/" + module.name + ".pkl", "wb") as dataFile:
            pickle.dump(
                {
                    'vocabulary': vocabulary,
                    'classes': classes,
                    'train_x': train_x,
                    'train_y': train_y
                }, dataFile)
Example #48
class ChatterBox(object):

    models = {}

    def __init__(self,
                 agent=None,
                 required_models=[],
                 error_threshold=0.25,
                 full_init=True):

        self.agent = agent
        self.required_models = required_models
        self.error_threshold = error_threshold
        self.stemmer = LancasterStemmer()
        self.graph = None
        self.session = None
        if full_init:
            self.initialize_agents()

    def initialize_agents(self):
        self.graph = tf.get_default_graph()
        self.session = tf.Session()
        set_session(self.session)

        for model in self.required_models:
            self.models[model] = ChatModule(model)

    def classify(self, model, sentence):

        with self.graph.as_default():

            set_session(self.session)
            bag = [0] * len(model.vocabulary)
            for s in [
                    self.stemmer.stem(word.lower())
                    for word in nltk.word_tokenize(sentence)
            ]:
                for i, w in enumerate(model.vocabulary):
                    if w == s:
                        bag[i] = 1

            results = model.model.predict(
                DataFrame([(np.array(bag))], dtype=float, index=['input']))[0]
            results = [[i, r] for i, r in enumerate(results)
                       if r > self.error_threshold]
            results.sort(key=lambda x: x[1])

        return results

    def intent(self, model, results):

        result = results.pop()
        intent_name = model.classes[result[0]]
        if intent_name == 'not_found':
            return ChatAgentResponse(intent_name, '', result[1])

        intent_model = Intent().select().where(
            Intent.name == intent_name).get()

        if not intent_model.dialogs:

            response = Application().handle_request(intent_name)
            if response:
                response = ChatAgentResponse(intent_name, response, result[1])
            else:
                intent_response = random.choice(intent_model.responses)
                response = ChatAgentResponse(intent_name, intent_response.text,
                                             result[1])

        else:

            dialogs = []
            for dialog in intent_model.dialogs:
                if not dialogs:
                    dialog_intent_model = Intent().select().where(
                        Intent.name == dialog.name).get()
                    response = ChatAgentResponse(
                        intent_name,
                        random.choice(dialog_intent_model.responses).text,
                        result[1], dialog.input_type)
                dialogs.append({
                    'name': dialog.name,
                    'slot': dialog.slot,
                    'value': None,
                    'input_type': dialog.input_type
                })
            session['intent'] = intent_model.name
            session['dialogs'] = dialogs
            session['dialog_step'] = 0

        if intent_model.contexts:
            contexts = []
            for context in intent_model.contexts:
                contexts.append(intent_model.text)
            session['context'] = " ".join(contexts)

        return response

    def dialog(self, input):

        self.store_input(input)

        if self.dialog_has_next_step():
            response = self.dialog_next_step()
        else:
            response = self.complete_dialog()

        return response

    def dialog_has_next_step(self):

        return session.get('dialog_step') + 1 < len(session.get('dialogs'))

    def dialog_next_step(self):

        dialogs = session.get('dialogs')
        session['dialog_step'] += 1
        intent_name = dialogs[session.get('dialog_step')]['name']
        intent_model = Intent().select().where(
            Intent.name == intent_name).get()

        return ChatAgentResponse(
            intent_name,
            random.choice(intent_model.responses).text,
            input_type=dialogs[session.get('dialog_step')]['input_type'])

    def store_input(self, input):

        dialogs = session.get('dialogs')
        dialogs[session.get('dialog_step')]['value'] = input
        session['dialogs'] = dialogs

    def complete_dialog(self):

        dialogs = session.get('dialogs')
        intent_model = Intent().select().where(
            Intent.name == session.get('intent')).get()
        slots = {}
        for dialog in dialogs:
            slots[dialog['slot']] = dialog['value']

        response = Application().handle_request(session.get('intent'), slots)

        response = ChatAgentResponse(session.get('intent'), response)

        self.clean_session()

        return response

    def chat(self, sentence):

        if not session.get('intent'):
            for model in self.models:
                results = self.classify(self.models[model], sentence)
                response = self.intent(self.models[model], results)
                if response.confidence > .85:
                    if response.classification == 'not_found':
                        continue
                    else:
                        break

        else:
            response = self.dialog(sentence)

        return response

    def clean_session(self):

        session['intent'] = None
        session['dialogs'] = None
        session['dialog_step'] = None
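
# Hedged illustration of the selection rule encoded in chat() above: walk the per-model results
# and keep the first classification that clears the 0.85 confidence threshold and is not
# 'not_found'. The (classification, confidence) pairs below are made-up stand-ins, not real output.
def pick_first_confident(results, threshold=0.85):
    for classification, confidence in results:
        if confidence > threshold and classification != 'not_found':
            return classification, confidence
    return None

print(pick_first_confident([('not_found', 0.91), ('greeting', 0.88), ('weather', 0.95)]))
# -> ('greeting', 0.88)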
Beispiel #49
0
class BllipParser(Parser):
    """
    Implementation of the bllipparser for parsing each sentence in each
    part separately, finding dependencies, part-of-speech tags, lemmas and
    head words for each entity.

    Uses preprocessed text

    :param nbest: the number of parse trees to obtain
    :type nbest: int
    :param overparsing: overparsing determines how much more time the parser
        will spend on a sentence relative to the time it took to find the
        first possible complete parse
    :type overparsing: int
    """
    def __init__(self,
                 nbest=10,
                 overparsing=10,
                 only_parse=False,
                 stop_words=None):
        try:
            from bllipparser import RerankingParser
            # WARNING if only_parse=False, BllipParser depends on PyStanfordDependencies: pip install PyStanfordDependencies
        except ImportError:
            raise ImportError(
                'BllipParser not installed, perhaps it is not supported on OS X yet'
            )

        self.parser = RerankingParser.fetch_and_load('GENIA+PubMed',
                                                     verbose=True)
        # WARNING this can take a long while. Install manually: `python -mbllipparser.ModelFetcher -i GENIA+PubMed`
        """create a Reranking Parser from BllipParser"""
        self.parser.set_parser_options(nbest=nbest, overparsing=overparsing)
        """set parser options"""
        self.only_parse = only_parse
        """whether features should be used from the BllipParser"""
        self.stemmer = LancasterStemmer()
        """an instance of LancasterStemmer from NLTK"""
        self.stop_words = stop_words
        if self.stop_words is None:
            self.stop_words = stopwords.words('english')

    def parse(self, dataset):
        outer_bar = Bar('Processing [Bllip]', max=len(list(dataset.parts())))
        for part in dataset.parts():
            outer_bar.next()
            if len(part.sentence_parse_trees) > 0:
                continue
            for index, sentence in enumerate(part.sentences):
                sentence = [token.word for token in part.sentences[index]]
                parse = self.parser.parse(sentence)
                parsed = parse[0]
                part.sentence_parse_trees.append(str(parsed.ptb_parse))
                if not self.only_parse:
                    tokens = parsed.ptb_parse.sd_tokens()
                    for token in tokens:
                        tok = part.sentences[index][token.index - 1]
                        is_stop = False
                        if tok.word.lower() in self.stop_words:
                            is_stop = True
                        tok.features = {
                            'id': token.index - 1,
                            'pos': token.pos,
                            'lemma': self.stemmer.stem(tok.word),
                            'is_punct': self._is_punct(tok.word),
                            'dep': token.deprel,
                            'is_stop': is_stop,
                            'dependency_from': None,
                            'dependency_to': [],
                            'is_root': False,
                        }

                    for token in tokens:
                        tok = part.sentences[index][token.index - 1]
                        self._dependency_path(token, tok, part, index)

            part.percolate_tokens_to_entities()
            part.calculate_token_scores()
            part.set_head_tokens()

        outer_bar.finish()

    def _dependency_path(self, bllip_token, token, part, index):
        if bllip_token.head - 1 >= 0:
            token.features['dependency_from'] = (
                part.sentences[index][bllip_token.head - 1],
                bllip_token.deprel)
        else:
            token.features['dependency_from'] = (
                part.sentences[index][token.features['id']],
                bllip_token.deprel)
        token_from = part.sentences[index][bllip_token.head - 1]
        if (bllip_token.index != bllip_token.head):
            token_from.features['dependency_to'].append(
                (token, bllip_token.deprel))
        else:
            token.features['is_root'] = True

    def _is_punct(self, text):
        if text in ['.', ',', '-']:
            return True
        return False
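
# A minimal, self-contained sketch of the underlying bllipparser calls used by the class above.
# It assumes the GENIA+PubMed model is available (fetch_and_load will download it otherwise);
# the example sentence is purely illustrative.
from bllipparser import RerankingParser

rrp = RerankingParser.fetch_and_load('GENIA+PubMed', verbose=True)
rrp.set_parser_options(nbest=5, overparsing=5)
best = rrp.parse(['The', 'protein', 'binds', 'the', 'promoter', '.'])[0]
print(best.ptb_parse)      # Penn Treebank style tree for the best-ranked parse
print(best.parser_score)   # log-probability score assigned by the parser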
Beispiel #50
0
import string
import unicodedata
import sys

# a translation table mapping every Unicode punctuation character to None
punct_tbl = dict.fromkeys(i for i in range(sys.maxunicode)
                          if unicodedata.category(chr(i)).startswith('P'))


# remove punctuation from a string using the translation table above
def remove_punctuation(text):
    return text.translate(punct_tbl)
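
# quick check of the translation-table approach above (the example string is illustrative)
print(remove_punctuation("Hello, world! Isn't this clean?"))  # -> Hello world Isnt this clean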


# init the stemmer
stemmer = LancasterStemmer()

# read the json file and load the training data
with open('data.json', 'r') as json_data:
    data = json.load(json_data)

# list of all the categories to train for
categories = list(data.keys())
words = []
# a list of tuples, each holding the words of a sentence and its category name
docs = []

for each_category in data.keys():
    for each_sentence in data[each_category]:
        # remove punctuation from sentence
        print(each_sentence)
Beispiel #51
0
class NLPImplementation:
    def __init__(self, intents_location):
        self.intents_ignore_words = ["?", "!", ".", ","]
        self.ERROR_THRESHOLD = 0.25
        self.model = None
        self.intents_location = intents_location
        self.stemmer = LancasterStemmer()
        self.intents_words, self.intents_documents, self.intents_classes = self.apply_tokenization_on_intents()
        self.model_save_name = "chatbot_model.h5"
        self.spacy = spacy.load("en_core_web_sm")

    def clean_up_sentence(self, sentence):
        # tokenize the pattern
        sentence_words = nltk.word_tokenize(sentence)
        # stem each word
        sentence_words = [self.stemmer.stem(word.lower()) for word in sentence_words]

        return sentence_words

    def bag_of_words(self, sentence):
        """ return bag of words array: 0 or 1 for each word in the bag that exists in the sentence"""
        # tokenize the pattern
        sentence_words = self.clean_up_sentence(sentence)
        # bag of words
        bag = [0] * len(self.intents_words)
        for sw in sentence_words:
            for index, word in enumerate(self.intents_words):
                if word == sw:
                    bag[index] = 1

        return np.array(bag)

    def spacy_retrieve_nouns(self, text):
        """ Explain what spacy is """
        doc = self.spacy(text)
        ents = []
        for ent in doc.ents:
            ents.append(ent)
        return ents

    @staticmethod
    async def get_weather(location):
        client = python_weather.Client(format=python_weather.METRIC)
        weather = await client.find(location)
        current_temperature = int((weather.current.temperature - 32) * 5/9)
        return_text = f"Current temperature in {location} is {current_temperature}°C" \
                      f"\n\nThe forecast temperature for the next 5 days will be: \n"

        for forecast in weather.forecasts:
            temp = int((forecast.temperature-32)*5/9)
            return_text += f"Date: {forecast.date.date()}, Sky: {forecast.sky_text}, Temperature: {temp}°C\n"

        await client.close()

        return return_text

    @staticmethod
    def get_time_by_city(city_location):
        g = Nominatim(user_agent='twitter_chat_bot')
        location = g.geocode(city_location)

        obj = TimezoneFinder()
        result = obj.timezone_at(lng=location.longitude, lat=location.latitude)
        t = pytz.timezone(result)
        time = datetime.now(t).strftime('%Y:%m:%d %H:%M:%S')

        return str(time)

    def response(self, sentence):
        with open(self.intents_location) as json_data:
            intents = json.load(json_data)
            json_data.close()

        results = self.classify(sentence)
        # if classification exists then find the matching intent tag and return a response from the respective tag
        if results:
            # loop as long as there are matches to process
            while results:
                for i in intents['intents']:
                    # find a tag matching the first result
                    if i['tag'] == results[0]["intent"]:
                        # return a random response from the intent
                        # If question is for specific data such as Time, Weather, Wikipedia, etc, return specified info
                        if i['tag'] == 'information':
                            topic = re.search('tell me about (.*)', sentence.lower())
                            if topic:
                                topic = topic.group(1)
                                try:
                                    wiki = wikipedia.summary(topic)
                                except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError) as e:
                                    wiki = str(e)
                                return wiki
                            return "For me to understand your wikipedia question, use the format 'tell me about *'"

                        if i['tag'] == 'time':
                            ents = self.spacy_retrieve_nouns(sentence)
                            if len(ents) > 0:
                                time = self.get_time_by_city(str(ents[0]))
                                return f"The current time in {str(ents[0])} is {time}"

                        if i['tag'] == 'weather':
                            ents = self.spacy_retrieve_nouns(sentence)
                            print(ents)
                            if len(ents) > 0:
                                loop = asyncio.get_event_loop()

                                data_weather = loop.run_until_complete(self.get_weather(str(ents[0])))

                                return data_weather

                        if i['tag'] == 'stocks':
                            ticker = reticker.TickerExtractor().extract(sentence.upper())
                            print(ticker)
                            return_text = ""
                            for tick in ticker:
                                yahoo_price = YahooFinancials(tick)
                                if yahoo_price.get_current_price() is None:
                                    continue
                                return_text += f"Current price of {tick} is {yahoo_price.get_currency()} " \
                                               f"{yahoo_price.get_current_price()}\n"
                            if len(return_text) > 0:
                                return return_text

                        return random.choice(i['response'])

                results.pop(0)

    def classify(self, sentence):
        # generate probabilities from the model
        self.load_model()

        bow = self.bag_of_words(sentence)
        results = self.model.predict(np.array([bow]))[0]

        # Filters out predictions below a threshold
        results = [[i, res] for i, res in enumerate(results) if res > self.ERROR_THRESHOLD]

        # sort by strength of probability
        results.sort(key=lambda x: x[1], reverse=True)
        return_list = []
        for r in results:
            return_list.append({"intent": self.intents_classes[r[0]], "probability": r[1]})

        # return dict of intent and probability
        print(return_list)
        return return_list

    def apply_tokenization_on_intents(self):
        documents = []
        words = []
        classes = []

        with open(self.intents_location) as json_data:
            intents = json.load(json_data)
            json_data.close()

        for intent in intents["intents"]:
            for pattern in intent["patterns"]:
                #  Tokenize each word
                word = nltk.word_tokenize(pattern)
                words.extend(word)

                # Add to documents in our corpus
                documents.append((word, intent["tag"]))

                # Add to classes list
                if intent["tag"] not in classes:
                    classes.append(intent["tag"])

        words = [self.stemmer.stem(w.lower()) for w in words if w not in self.intents_ignore_words]
        words = sorted(list(set(words)))

        # Removes duplicates
        classes = sorted(list(set(classes)))

        # print(f"Document Length: {len(documents)}")
        # print(f"Classes length: {len(classes)} contains: \n {classes}")
        # print(f"Number of unique stemmed words: {len(words)} contains: \n {words}")

        return words, documents, classes

    def create_training_data(self):
        training = []
        # create an empty array for our output
        output_empty = [0] * len(self.intents_classes)

        # training set, bag of words for each sentence
        for doc in self.intents_documents:
            # initialize our bag of words
            bag = []

            # list of tokenized words for the pattern
            pattern_words = doc[0]

            # stem each word
            pattern_words = [self.stemmer.stem(word.lower()) for word in pattern_words]

            # create our bag of words array
            for word in self.intents_words:
                bag.append(1 if word in pattern_words else 0)

            # output is a '0' for each tag and '1' for current tag
            output_row = list(output_empty)
            output_row[self.intents_classes.index(doc[1])] = 1

            training.append([bag, output_row])

        random.shuffle(training)
        training = np.array(training, dtype=object)  # rows hold lists of different lengths, so an object array is needed

        train_x = list(training[:, 0])
        train_y = list(training[:, 1])

        return [train_x, train_y]

    def train_model(self):
        # Build neural network

        train_x, train_y = self.create_training_data()
        model = Sequential()
        model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(len(train_y[0]), activation='softmax'))

        sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
        model.compile(loss="categorical_crossentropy", optimizer=sgd, metrics=["accuracy"])

        model_fit = model.fit(np.array(train_x), np.array(train_y), epochs=2000, batch_size=5, verbose=1)

        model.save(self.model_save_name, model_fit)
        print("Training Complete")

        pickle.dump(
            {
                'words': self.intents_words,
                'classes': self.intents_classes,
                'train_x': train_x,
                'train_y': train_y},
            open("training_data", "wb"),
        )

    def load_model(self):
        """Makes sure that self.model is loaded to be used for predictions"""
        try:
            data = pickle.load(open("training_data", "rb"))
            words = data['words']
            classes = data['classes']
            train_x = data['train_x']
            train_y = data['train_y']

            self.model = load_model(self.model_save_name)
        except FileNotFoundError as e:
            print("Model was not trained yet. Now training model")
            self.train_model()
            self.model = load_model(self.model_save_name)
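
# Illustrative driver for NLPImplementation above; the intents file name is an assumption based
# on the constructor's usage, so the call is guarded and only runs if such a file is present.
if __name__ == "__main__":
    import os
    if os.path.exists("intents.json"):
        bot = NLPImplementation("intents.json")  # load_model() trains on first use if needed
        print(bot.response("tell me about Alan Turing"))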
Beispiel #52
0
A quick explanation: stemming extracts the stem or root form of a word (which does not
necessarily carry the word's full meaning on its own).
NLTK provides the three most commonly used stemmer interfaces: the Porter Stemmer,
the Lancaster Stemmer and the Snowball Stemmer.
The Porter Stemmer is based on the Porter stemming algorithm; here is an example.
'''

from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()  
porter_stemmer.stem('maximum')  
porter_stemmer.stem('presumably')  
porter_stemmer.stem('multiply')  
porter_stemmer.stem('provision')  
porter_stemmer.stem('owed')

# The Lancaster Stemmer is based on the Lancaster stemming algorithm; here is an example
from nltk.stem.lancaster import LancasterStemmer  
lancaster_stemmer = LancasterStemmer()  
lancaster_stemmer.stem('maximum')  
lancaster_stemmer.stem('presumably')
lancaster_stemmer.stem('multiply')
lancaster_stemmer.stem('provision')
lancaster_stemmer.stem('owed')

# The Snowball Stemmer is based on the Snowball stemming algorithm; here is an example
from nltk.stem import SnowballStemmer  
snowball_stemmer = SnowballStemmer('english')  
snowball_stemmer.stem('maximum')  
snowball_stemmer.stem('presumably')  
snowball_stemmer.stem('multiply')  
snowball_stemmer.stem('provision')  
snowball_stemmer.stem('owed')
# -*- coding: utf-8 -*-

from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

input_words = [
    'writing', 'calves', 'be', 'branded', 'horse', 'randomize', 'possibly',
    'provision', 'hospital', 'kept', 'scratchy', 'code'
]

porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer('english')

stemmer_names = ['INPUT WORD', 'PORTER', 'LANCASTER', 'SNOWBALL']
fmt = '{:>16}' * len(stemmer_names)
print(fmt.format(*stemmer_names))
print('=' * 68)

for word in input_words:
    output = [
        word,
        porter.stem(word),
        lancaster.stem(word),
        snowball.stem(word)
    ]
    print(fmt.format(*output))
class Preprocessor:
    _stemmer = LancasterStemmer()

    @staticmethod
    def stem(word):
        return Preprocessor._stemmer.stem(word)
        matches = tool.check(sentence)
        if matches:
            for match in matches:
                grammer_error.append(match.context)
    relevence_dict["business"] = tmp_bus
    relevence_dict['nonbusiness'] = tmp_non
    return relevence_dict, sen, mom_data, grammer_error
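
# Trivial usage of the Preprocessor wrapper defined above; the Lancaster stemmer is aggressive,
# e.g. it reduces 'maximum' to 'maxim' and 'presumably' to 'presum'.
print(Preprocessor.stem("maximum"))
print(Preprocessor.stem("presumably"))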


stemmer = LancasterStemmer()

training_data = []
training_data.append({"class": "greeting", "sentence": "how are you?"})
training_data.append({"class": "greeting", "sentence": "how is your day?"})
training_data.append({"class": "greeting", "sentence": "Hi, Vilas"})
training_data.append({
    "class": "greeting",
    "sentence": "how is it going today?"
})
training_data.append({"class": "greeting", "sentence": "I am doing good"})

training_data.append({"class": "goodbye", "sentence": "have a nice day"})
training_data.append({"class": "goodbye", "sentence": "see you later"})
training_data.append({"class": "goodbye", "sentence": "have a nice day"})
training_data.append({"class": "goodbye", "sentence": "talk to you soon"})
import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

import numpy as np
import random
import json
import pickle

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD

from keras.models import load_model

###############################learn this
with open("intents.json") as file:
    data = json.load(file)
##############################
try:
    with open("data.pickle", "rb") as f:
        words, labels, training, output = pickle.load(f)
    error  # undefined name, presumably left in deliberately to trigger the except branch and rebuild the data

except:
    words = []
    labels = []
    docs_x = []
    docs_y = []
    for intent in data["intents"]:
        for pattern in intent["patterns"]:
Beispiel #57
0
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
import numpy
import tflearn
import tensorflow
import random
import json
import pickle

with open("intents.json") as file:
    data = json.load(file)
try:
    with open("data.pickle", "rb") as f:
        words, labels, training, output = pickle.load(f)

except:
    words = []
    labels = []
    docs_x = []
    docs_y = []

    for intent in data["intents"]:
        for pattern in intent["patterns"]:
            wrds = nltk.word_tokenize(pattern)
            words.extend(wrds)
            docs_x.append(wrds)
            docs_y.append(intent["tag"])

        if intent["tag"] not in labels:
Beispiel #58
0
import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

import time
import difflib
import numpy
import webbrowser
import tflearn
import tensorflow
import random
from flask import Flask, render_template, request
import json
import pickle
import os

app = Flask(__name__)
with open("intents.json") as file:
    data = json.load(file)

try:
    with open("data.pickle", "rb") as f:
        words, labels, training, output = pickle.load(f)
except:
    words = []
    labels = []
    docs_patt = []
    docs_tag = []

    for intent in data["intents"]:
        # below we fetch patterns from all intents in one place
Beispiel #59
0
#coding=utf-8
# Compare several different ways of reducing a word to its base form

from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer

string = "called"

porter_stemmer = PorterStemmer()
a = porter_stemmer.stem(string)
print a

wordnet_lemmatizer = WordNetLemmatizer()
b = wordnet_lemmatizer.lemmatize(string)
print b

snowball_stemmer = SnowballStemmer("english")
c = snowball_stemmer.stem(string)
print c

st = LancasterStemmer()
d = st.stem(string)
print d
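
# Note: WordNetLemmatizer defaults to treating the word as a noun, which is why "called" comes
# back unchanged above; passing the verb part of speech yields the base form instead.
print(wordnet_lemmatizer.lemmatize(string, pos='v'))  # -> call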
import json
import numpy as np
import pandas as pd
import nltk
from yellowbrick.text import PosTagVisualizer
# nltk.download('punkt')  # uncomment on first run; pos_tag also needs 'averaged_perceptron_tagger'
from nltk import tokenize
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import pos_tag

#Import Stemmers
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

stemmer_porter = PorterStemmer()
stemmer_lancaster = LancasterStemmer()
stemmer_snowball = SnowballStemmer('english')

#Load Reviews
def open_file(filename):
    header = ['overall', 'reviewTime', 'reviewText', 'summary', 'unixReviewTime']
    data = []
    for line in open(filename, 'r'):
        data_ = json.loads(line)
        data.append([data_.get(h) for h in header])
    # build the DataFrame once after all lines are read instead of on every iteration
    df = pd.DataFrame(data, columns=header)
    return df

df = open_file('Cell_Phones_and_Accessories_5.json')
text_list = df['reviewText'].values.tolist()
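
# Hedged continuation sketch: tokenize the first review and compare the three stemmers loaded
# above (the Amazon review file is an external dataset, so guard against an empty list).
if text_list:
    tokens = word_tokenize(str(text_list[0]))
    for t in tokens[:10]:
        print('{:>16}{:>16}{:>16}{:>16}'.format(
            t, stemmer_porter.stem(t), stemmer_lancaster.stem(t), stemmer_snowball.stem(t)))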