def exercise0():
    """Train and evaluate a Naive Bayes classifier on the movie review corpus.

    Builds ``(wordlist, category)`` documents from ``nltk.corpus.movie_reviews``,
    holds out 100 of them as a test set, trains on the rest and prints the
    accuracy followed by the 10 most informative features.

    Returns nothing; all results are printed.
    """
    import random  # local import: only needed for the train/test shuffle

    # part a
    # part b
    # reference https://github.com/mikeholler/CSC499-NLP/blob/master/ch_6/exercises/improved_movie_review_classifier.py
    # by mjholler
    reviews = nltk.corpus.movie_reviews

    # Retrieve all movie reviews in the form of (wordlist, category).
    documents = [(list(reviews.words(fileid)), category)
                 for category in reviews.categories()
                 for fileid in reviews.fileids(category)]

    # The documents are gathered category by category, so without shuffling
    # the first 100 would all come from whichever category is listed first
    # (assuming it has at least 100 files) and the accuracy would be
    # measured on a single class.
    random.shuffle(documents)

    train_set = apply_features(document_features, documents[100:])
    test_set = apply_features(document_features, documents[:100])

    print('training classifier')
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('accuracy %s' % nltk.classify.accuracy(classifier, test_set))

    # show_most_informative_features() prints its own table and returns None,
    # so it must be called on its own rather than passed to print.
    print('10 features')
    classifier.show_most_informative_features(10)
def get_labeled_examples():
    """Build lazily-featurised labeled examples from the files in ``filenames``.

    Every line of every file becomes one example labeled with ``labels[f]``.
    The feature extractor is created from the 2000 most frequent
    (lower-cased) words found across all files.

    Returns:
        A ``(labeled_instances, word_features)`` tuple: the lazy featureset
        sequence produced by ``apply_features`` and the list of words the
        extractor was built from.
    """
    for f in filenames:
        print("%s %s" % (f, labels[f]))

    words = []
    for f in filenames:
        # Close each file as soon as it has been read instead of leaking
        # the handle.
        with open(f) as handle:
            # sacrifice performance for memory
            words.extend(words_in_lines(handle.readlines()))

    all_words = FreqDist(w.lower() for w in words)
    # 2000 most frequent words. Sorting explicitly by count does not rely on
    # FreqDist.keys() being frequency-ordered (true only in old NLTK
    # releases) or sliceable (not the case on Python 3). The sort is stable,
    # so an already frequency-ordered FreqDist yields the same list as before.
    word_features = sorted(all_words, key=all_words.get, reverse=True)[:2000]
    get_features = get_get_features(word_features)  # create feature extractor

    # Pair sentences and labels.
    labeled_examples = []
    for f in filenames:
        with open(f) as handle:
            labeled_examples.extend(
                [(ex, labels[f]) for ex in handle.readlines()])
    shuffle(labeled_examples)

    # Extract features (lazy map).
    labeled_instances = apply_features(get_features, labeled_examples,
                                       labeled=True)
    return (labeled_instances, word_features)
def entry_reduce(key, raw_values):
    """Reduce step: train one Naive Bayes classifier for ``key``.

    Each item of ``raw_values`` is evaluated into a ``(text, sentiment)``
    pair. The texts are split into lower-cased words of at least three
    characters, a classifier is trained on them, tagged with ``key`` through
    its ``category`` attribute, and yielded as a pickled string.
    """
    values = []
    for raw in raw_values:
        logging.error(raw)
        logging.error(type(raw))
        # NOTE(review): eval() executes arbitrary code. If raw_values can
        # come from an untrusted source this must be replaced by a safe
        # parser (e.g. ast.literal_eval) -- confirm the data's origin.
        values.append(eval(raw))

    tweets = [
        ([token.lower() for token in text.split() if len(token) >= 3], label)
        for (text, label) in values
    ]

    word_features = get_word_features(get_words_in_tweets(tweets))

    def extract_features(document):
        # Map every known word to whether it occurs in the document.
        logging.error(document)
        present = set(document)
        features = {}
        for word in word_features:
            logging.error(word)
            features['contains(%s)' % word] = word in present
        return features

    training_set = apply_features(extract_features, tweets)
    logging.error(training_set)

    classifier = nltk.classify.NaiveBayesClassifier.train(training_set)
    classifier.category = key
    yield pickle.dumps(classifier)
def get_trained_classifier():
    """Train a Naive Bayes sentiment classifier from ``sentiment.csv``.

    The ``text`` column is cleaned into ``text_clean``, then ``lemmer`` and
    ``stemmer`` are applied to the frame. Rows whose ``sentiment`` is 0 are
    labeled ``'negative'`` and rows whose ``sentiment`` is 4 ``'positive'``;
    every other row is ignored. The module-level ``word_features`` is
    replaced as a side effect.

    Returns:
        The trained classifier (its most informative features are printed
        before returning).
    """
    global word_features

    df = pd.read_csv("sentiment.csv")
    df['text_clean'] = [tweet_cleaner_updated(raw) for raw in df['text']]
    lemmer(df)
    stemmer(df)

    variant = 'lemmed'
    text_column = 'text_' + variant
    word_features = get_word_features(get_words_in_tweets(df, variant))

    labeled_texts = []
    for _, row in df.iterrows():
        sentiment = row.sentiment
        if sentiment == 0:
            label = 'negative'
        elif sentiment == 4:
            label = 'positive'
        else:
            # Any other sentiment value is not used for training.
            continue
        labeled_texts.append((row[text_column], label))

    training_set = apply_features(extract_features, labeled_texts)
    classifier = NaiveBayesClassifier.train(training_set)
    classifier.show_most_informative_features()
    return classifier
def avaliar_Sentimento(message):
    """Print the most likely sentiment for ``message``.

    A new Naive Bayes classifier is trained from ``lista_feature_fell`` on
    every call. The known labels, the predicted label for ``message`` and
    the 32 most informative features are printed. Nothing is returned.
    """
    training_set = apply_features(extract_features, lista_feature_fell)
    classifier = nltk.NaiveBayesClassifier.train(training_set)

    print("\n\tPossibilidades: %s " % classifier.labels())
    print("\n\tSentimento Provavel: %s \n"
          % (classifier.classify(extract_features(message))))

    # show_most_informative_features() prints its own table and returns None,
    # so wrapping it in a print only added a stray "None" line.
    classifier.show_most_informative_features(32)
def train(raw_classifier, training_sets, feature_selection, transformer):
    """Train ``raw_classifier`` on the POS and NEG training sets.

    Args:
        raw_classifier: object exposing ``train(featuresets)``.
        training_sets: mapping indexed by ``POS`` and ``NEG`` giving the raw
            examples of each class.
        feature_selection: object whose ``select_features`` turns a
            transformed example into a feature dictionary.
        transformer: object whose ``transform`` pre-processes one raw example.

    Returns:
        A ``Classifier`` wrapping the trained model, the feature selection,
        the transformer and the number of training examples.
    """
    tuple_set = [(transformer.transform(x), cl)
                 for cl in [POS, NEG]
                 for x in training_sets[cl]]

    # Since we have a rather large amount of training data, build features
    # lazily to avoid running out of memory.
    train_set = apply_features(feature_selection.select_features, tuple_set)

    return Classifier(raw_classifier.train(train_set), feature_selection,
                      transformer, len(tuple_set))
def analisar_frase(self, widget):
    """Analyse the sentence typed by the user and display its sentiment.

    Reads the sentence from ``self.text_area``. When it is non-empty and
    ``self.opcao`` is one of the supported topics, the sentence is
    classified and the result is written to ``self.text_area2``; the image
    of the opposite mood is replaced by a black picture. Sentences not
    detected as Portuguese get a fixed message instead.

    NOTE(review): the nesting was reconstructed from a whitespace-collapsed
    source; the final fallback message is assumed to belong to the language
    check -- confirm against the original file.

    :param widget: widget that triggered the callback (not used).
    """
    global featureList
    print ("analisar_frase")

    frase = self.text_area.get_text()
    if frase == "":
        return
    self.text_area.set_text(frase)

    if self.opcao not in ('dilma', 'copa', 'palmeiras', 'fatec'):
        return
    print("Opcao depois: %s "%self.opcao)

    # Build the feature list used by extract_features.
    featureList = gera_lista_features(self.opcao)
    lista_feature_fell = get_lista_feature_fell()
    print("\n\tCaracteristica / Sentimento:\n\t %s"%lista_feature_fell)

    frase = self.text_area.get_text()
    frase_Normal = normalizar(frase)
    features_msg = getFeatureVector(frase_Normal)
    training_set = apply_features(extract_features,lista_feature_fell)
    self.fell = avaliar_Sentimento(features_msg,training_set)
    print ("\nFrase analisada: %s "%frase)
    print ("\n\tCaracteristicas da Msg - %s\n "%features_msg)

    language = detect_language(frase)
    print ("\n\tLingua: %s "%language)
    if language != 'portuguese':
        self.text_area2.set_text(" Ce ta de Brincaxxon comigo ?????")
        return

    print ("Sentimento: %s "%self.fell)
    # Text colour.
    self.text_area.modify_text(gtk.STATE_NORMAL, gtk.gdk.color_parse("#FF0000"))
    self.text_area2.modify_text(gtk.STATE_NORMAL, gtk.gdk.color_parse("#FF0000"))
    # Text font.
    self.text_area2.modify_font(pango.FontDescription("sans bold 16"))

    if self.fell == 'negativo':
        frase = self.text_area.get_text()
        self.text_area.set_text(frase)
        self.text_area2.set_text(" Sentimento: Negativo")
        self.image_happy.set_from_file('../imgs/black.jpg')
        # show must be *called*; the bare attribute access was a no-op.
        self.image_happy.show()
    else:
        self.text_area2.set_text(" Sentimento: Positivo")
        self.image_sad.set_from_file('../imgs/black.jpg')
        self.image_sad.show()
def classifier_for_training_set(positive, negative, blacklist=None):
    """
    Returns a Bayesian classifier for the given positive and negative
    sentences.

    :param positive: iterable of sentences labeled ``'positive'``.
    :param negative: iterable of sentences labeled ``'negative'``.
    :param blacklist: words handed to ``tokenize`` together with each
        sentence; defaults to an empty list.
    :return: the trained ``NaiveBayesClassifier``.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if blacklist is None:
        blacklist = []

    def labeled(sentences, label):
        # Distinct tokens of every sentence, paired with its label.
        return [(list(FreqDist(tokenize(s, blacklist)).keys()), label)
                for s in sentences]

    # Real lists (rather than map objects) keep the concatenation valid on
    # Python 3 as well as Python 2.
    training_set = apply_features(
        _classifier_features_for_document,
        labeled(positive, 'positive') + labeled(negative, 'negative'))

    return nltk.classify.NaiveBayesClassifier.train(training_set)
def train(self, text, labels):
    """
    Fit the Naive Bayes classifier on the given text and labels.

    The extracted tokens, the derived word features and the trained
    classifier are stored on the instance as ``corpus``, ``word_features``
    and ``classifier``.

    Returns the trained classifier together with the word features.
    """
    self.corpus = self.extract_tokens(text, labels)
    self.word_features = self.get_features(self.corpus)
    self.classifier = NaiveBayesClassifier.train(
        apply_features(self.extract_features, self.corpus))
    return (self.classifier, self.word_features)
def apply_features(self, documents, labeled=None):
    """
    Run this object's feature extractor over ``documents``.

    Thin wrapper around `nltk.classify.util.apply_features` that always
    uses ``self.extract_features`` as the feature function.

    With ``labeled=False`` the featuresets are equivalent to::

        [feature_func(doc) for doc in documents]

    With ``labeled=True`` they are equivalent to::

        [(feature_func(tok), label) for (tok, label) in toks]

    :param documents: a list of documents. If ``labeled=True``, the method
        expects a list of (words, label) tuples.
    :param labeled: forwarded unchanged to the wrapped function.
    :rtype: LazyMap
    """
    feature_func = self.extract_features
    return apply_features(feature_func, documents, labeled)
def train(self, text, labels):
    """
    Train the classifier and remember the artefacts on the instance.

    Returns a ``(classifier, word_features)`` pair: the trained model and
    the word features computed from the training corpus.
    """
    # Tokenise the labeled text, then derive the word features from it.
    self.corpus = self.extract_tokens(text, labels)
    self.word_features = self.get_features(self.corpus)

    # Build the featuresets and fit the Naive Bayes model on them.
    featuresets = apply_features(self.extract_features, self.corpus)
    model = NaiveBayesClassifier.train(featuresets)
    self.classifier = model

    return self.classifier, self.word_features
def train(self, train, test_ratio):
    """
    Train the chosen algorithm and print per-class and overall scores.

    :param train: classification algorithm; called with the training
        featuresets and expected to return the trained classifier
    :param test_ratio: ratio of test documents from dataset
    :return: None (the classifier is stored in ``self.classifier``)
    """
    dataset_dict = self.process(self.dataset_dict)

    # Label each document with the corresponding class.
    for class_label, documents in dataset_dict.items():
        dataset_dict[class_label] = [(document, class_label)
                                     for document in documents]

    # Split every class into training and test data and pool the results
    # per instance type.
    instances = defaultdict(list)
    for class_label, documents in dataset_dict.items():
        train_data, test_data = split_data(documents, test_ratio)
        instances[InstanceType.Train] = instances[InstanceType.Train] + train_data
        instances[InstanceType.Test] = instances[InstanceType.Test] + test_data

    tokenized_train_documents = list(instances[InstanceType.Train])
    training_words = list(find_all_words(tokenized_train_documents))

    unigram_feats = unigram_word_feats(training_words, top_n=2000)
    self.add_feat_extractor(
        self.feature_extractors_dict[FeatureExtractor.Unigram],
        unigrams=unigram_feats)

    training_set = apply_features(self.extract_features,
                                  tokenized_train_documents)
    self.classifier = train(training_set)

    # Evaluate on the held-out documents; documents with at most one token
    # are skipped.
    labels_total = defaultdict(int)
    errors = defaultdict(int)
    for instance in instances[InstanceType.Test]:
        if len(instance[0]) > 1:
            if self.classify(instance[0]) != instance[1]:
                errors[instance[1]] += 1
            labels_total[instance[1]] += 1

    # Iterate over every evaluated class (not only those with errors) so
    # that perfectly classified classes are reported and counted as well.
    correct_overall = 0
    evaluated_overall = 0
    for label, total in labels_total.items():
        correct = total - errors.get(label, 0)
        correct_overall += correct
        evaluated_overall += total
        # Share of this class's documents that were classified correctly
        # (the output keeps the original "precision" wording).
        precision = 100.0 * correct / total
        print("{} class precision: {}".format(label, precision))

    # Accuracy is correct / evaluated; guard against an empty test set.
    if evaluated_overall:
        accuracy = 100.0 * correct_overall / evaluated_overall
    else:
        accuracy = 0.0
    print("Accuracy: {}".format(accuracy))
def _get_trainingset(cls, source='db'):
    """Return the featuresets built from the data loaded from ``source``.

    :param source: forwarded to ``cls._get_data``; defaults to ``'db'``.
    :return: result of ``apply_features`` over the loaded data using
        ``cls.feature_extractor``.
    """
    records = cls._get_data(source)
    # Called only for its side effects; the return value is discarded.
    # Presumably it prepares the word frequencies the extractor relies on
    # -- TODO confirm.
    cls._get_word_freq(records)
    return apply_features(cls.feature_extractor, records)
#Create a dictionary of features (True for each feature present, implicit False for absent features). In this case, features are words, but they could be bigger or smaller, simpler or more complex. for word in words: # filter out words of less than 2 characters, see impact on accuracy if len(word) >= 3: features["contains_word_(%s)" % word] = True return features raw_input("\n\nHit enter to continue...") print "Extracting Features from Training Set" train_set = apply_features(feature_extracting_function, training_set_array) raw_input("\n\nHit enter to continue...") print "Gathering unknown data points (new data) to predict on (again, hand-coded, see script source)" raw_input("\n\nHit enter to continue...") #Train a Naive Bayes Classifier (simple but surprisingly effective). This isn't the only classifier one could use (dtree is another, and there are many, many more), but it's a good start. # print "Training Naive Bayes Classifier"
print("Writing Feature Extractor")


def feature_extracting_function(data_point):
    """Turn a description string into a ``{"contains_word_(w)": True}`` dict.

    Punctuation characters are removed, the text is split on whitespace and
    every word is lower-cased before being recorded as a feature.
    """
    # Build the punctuation set once per call instead of once per character.
    punctuation = set(string.punctuation)

    # Strip punctuation characters from the string. In Python, this happens
    # to be usually done with a .join on the string object, but don't be
    # thrown if you're used to other languages and this looks weird (hell, it
    # looks weird to me), all we're doing is stripping punctuation.
    data_point = ''.join(ch for ch in data_point if ch not in punctuation)

    # Split data_point on whitespace and convert all words to lowercase. The
    # [] syntax is a Python "list comprehension"; Google that phrase if
    # you're confused.
    words = [word.lower() for word in data_point.split()]

    # Create a dictionary of features (True for each feature present,
    # implicit False for absent features). A dictionary is roughly
    # equivalent to a hashtable in other languages. In this case, features
    # are words, but they could be bigger or smaller, simpler or more complex.
    features = {}
    for word in words:
        features["contains_word_(%s)" % word] = True
    return features


raw_input("\n\nHit enter to continue...")

print("Extracting Features from Training Set")
train_set = apply_features(feature_extracting_function, known_data_points)

raw_input("\n\nHit enter to continue...")

print("Gathering unknown data points (new data) to predict on (again, hand-coded, see script source)")
# Our query chocolate bars: we want to know whether or not they're matches.
unknown_1 = "milky light sweet nutty"
unknown_2 = "dark bitter plain"
unknown_3 = "dark dark bitter beyond belief organic"
unknown_4 = "organic minty sweet dark"

raw_input("\n\nHit enter to continue...")

# Train a Naive Bayes Classifier (simple but surprisingly effective). This
# isn't the only classifier one could use (dtree is another, and there are
# many, many more), but it's a good start.
print("Training Naive Bayes Classifier")
nb = nltk.NaiveBayesClassifier.train(train_set)
# Event # Hackbright Data Science Workshop. # Author # Daniel Wiesenthal. [email protected]. # What is this? # This is a simple script illustrating the usage of the Python NLTK classifier. It is written in Python, but the comments are intended to make it clear how to port to other languages. The flow isn't particularly well decomposed as a program; rather, it is intended to go along linearly with the associated talk/presentation. # The goal is to find out which chocolate a particular volunteer during the talk will like. We have a few examples of chocolate bars that we know are either matches or not (misses), and want to use that to make a guess about an unknown bar (we don't know if it will be a match, and want to guess). # Further reading: # http://www.stanford.edu/class/cs124/lec/naivebayes.pdf # http://nltk.googlecode.com/svn/trunk/doc/book/ch06.html # Software Setup # For this script to work, you'll need to have Python, NLTK (a Python package), and Numpy (upon which NLTK depends) installed. On a Mac (which all have numpy pre-installed these days), run: # sudo easy_install pip # sudo pip install nltk # <cd to directory with this file> # python classification_101.py #Required libraries from myclassifier import MyClassifier import pprint try: import nltk from nltk.classify.util import apply_features import string print "Great! Looks like you're all set re: NLTK and Python." except Exception, e: print "Bummer. Looks like you don't have NLTK and Python set up correctly. (Exception: " + str( e) + ")" quit() raw_input("\n\nHit enter to get started...") #Some example chocolate bars. The format is a tuple of {information about the chocolate bar} and a {value}, where "match" is a good match and "miss" is a poor/bad match.
def analisar_frase(self, widget):
    """Analyse the sentence typed by the user and display its sentiment.

    Clears the terminal, reads the sentence from ``self.text_area`` and,
    when it is non-empty and ``self.opcao`` is one of the supported topics,
    classifies it. For Portuguese sentences the result is written to
    ``self.text_area2`` and the mood image is updated; otherwise a fixed
    message is shown.

    NOTE(review): the nesting was reconstructed from a whitespace-collapsed
    source; the final fallback message is assumed to belong to the language
    check -- confirm against the original file.

    :param widget: widget that triggered the callback (not used).
    """
    global featureList

    # Clear the screen.
    subprocess.call("clear")
    print ("\n\tAnalise")

    frase = self.text_area.get_text()
    if frase == "":
        return
    self.text_area.set_text(frase)

    if self.opcao not in ('dilma', 'copa', 'palmeiras', 'fatec'):
        return

    # Build the feature list used by extract_features.
    featureList = gera_lista_features(self.opcao)
    lista_feature_fell = get_lista_feature_fell()

    frase = self.text_area.get_text()
    frase_Normal = normalizar(frase)
    features_msg = getFeatureVector(frase_Normal)
    print ("\n\tFrase analisada: %s "%frase)
    print ("\n\tCaracteristicas da Msg - %s\n "%features_msg)

    # Show the sentiments associated with the message features.
    show_relacao(obter_sentimento_associado(lista_feature_fell,features_msg))

    training_set = apply_features(extract_features,lista_feature_fell)
    self.fell = avaliar_Sentimento(features_msg,training_set)

    language = detect_language(frase)
    if language != 'portuguese':
        self.text_area2.set_text(" Ce ta de Brincaxxon comigo ?????")
        return

    # Text colour.
    self.text_area.modify_text(gtk.STATE_NORMAL, gtk.gdk.color_parse("#FF0000"))
    self.text_area2.modify_text(gtk.STATE_NORMAL, gtk.gdk.color_parse("#FF0000"))
    # Text font.
    self.text_area2.modify_font(pango.FontDescription("sans bold 16"))

    if self.fell == 'negativo':
        frase = self.text_area.get_text()
        self.text_area.set_text(frase)
        self.text_area2.set_text(" Sentimento: Negativo")
        self.image_black.set_from_file('glade/imgs/cry.png')
    else:
        self.text_area2.set_text(" Sentimento: Positivo")
        self.image_black.set_from_file('glade/imgs/happy.png')
    # show must be *called*; the bare attribute access was a no-op.
    self.image_black.show()

    # Both outcomes black out the two "doubt" images in the same way.
    self.image_doubtRight = self.builder.get_object("doubtRight")
    self.image_doubtRight.set_from_file('glade/imgs/black.png')
    self.image_doubtRight.show()
    self.image_doubtLeft = self.builder.get_object("doubtLeft")
    self.image_doubtLeft.set_from_file('glade/imgs/black.png')
    self.image_doubtLeft.show()
def _get_trainingset(cls, source='db'):
    """Load the data for ``source`` and return it as featuresets.

    ``cls._get_word_freq`` is invoked on the loaded data before the
    features are applied; its return value is not used here.
    """
    loaded = cls._get_data(source)
    cls._get_word_freq(loaded)
    featuresets = apply_features(cls.feature_extractor, loaded)
    return featuresets
def nltkshow(request):
    """Train a toy tweet sentiment classifier and return one prediction.

    A Naive Bayes classifier is trained on ten hard-coded tweets on every
    call. The label predicted for a fixed sample tweet is placed in front of
    a small HTML page showing the current time, and the result is returned
    as an ``HttpResponse``. ``request`` is not used.
    """
    def get_words_in_tweets(tweets):
        # Flatten the word lists of all (words, sentiment) pairs.
        return [word for (words, sentiment) in tweets for word in words]

    def get_word_features(wordlist):
        return FreqDist(wordlist).keys()

    def tokenize(labeled_texts):
        # Lower-case every word of at least three characters.
        return [([e.lower() for e in text.split() if len(e) >= 3], label)
                for (text, label) in labeled_texts]

    pos_tweets = [
        ('I love this car', 'positive'),
        ('This view is amazing', 'positive'),
        ('I feel great this morning', 'positive'),
        ('I am so excited about the concert', 'positive'),
        ('He is my best friend', 'positive'),
    ]
    neg_tweets = [
        ('I do not like this car', 'negative'),
        ('This view is horrible', 'negative'),
        ('I feel tired this morning', 'negative'),
        ('I am not looking forward to the concert', 'negative'),
        ('He is my enemy', 'negative'),
    ]
    tweets = tokenize(pos_tweets + neg_tweets)

    test_pos_tweets = [
        ('I feel happy this morning', 'positive'),
        ('Larry is my friend', 'positive'),
    ]
    test_neg_tweets = [
        ('I do not like that man', 'negative'),
        ('This view is horrible', 'negative'),
        ('The house is not great', 'negative'),
        ('Your song is annoying', 'negative'),
    ]
    test_tweets = tokenize(test_pos_tweets + test_neg_tweets)

    word_features = get_word_features(get_words_in_tweets(tweets))

    def extract_features(document):
        document_words = set(document)
        return {'contains(%s)' % word: (word in document_words)
                for word in word_features}

    training_set = apply_features(extract_features, tweets)
    test_training_set = apply_features(extract_features, test_tweets)
    classifier = nltk.classify.NaiveBayesClassifier.train(training_set)

    tweet = 'Your song is horrible'
    clas = classifier.classify(extract_features(tweet.split()))
    # Disabled diagnostics kept from the original:
    #   classifier.show_most_informative_features(5)
    #   + clas +" "+ class2
    #   class2 nltk.classify.util.accuracy(classifier,test_training_set)

    now = datetime.datetime.now()
    html = clas + "<html><body>It is 555 now %s.</body></html>" % now
    return HttpResponse(html)
features["has(%s)" % letter] = (letter in name.lower()) return features names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]) random.shuffle(names) # feature_sets = [(gender_features(n), g) for (n, g) in names] # train_set, test_set = feature_sets[500:], feature_sets[:500] train_names = names[1500:] dev_names = names[500:1500] test_names = names[:500] train_set = apply_features(gender_features, train_names) dev_set = apply_features(gender_features, dev_names) test_set = apply_features(gender_features, test_names) classifier = nltk.NaiveBayesClassifier.train(train_set) print classifier.classify(gender_features('Neo')) print classifier.classify(gender_features('Miya')) print nltk.classify.accuracy(classifier, test_set) errors = [] for (name, tag) in dev_names: guess = classifier.classify(gender_features(name)) if guess != tag: errors.append((tag, guess, name))
def nltkshow(request):
    """Return the sentiment predicted for a sample tweet, plus the time.

    Every call trains a Naive Bayes classifier on a small hard-coded set of
    labeled tweets, classifies ``'Your song is horrible'`` and returns an
    ``HttpResponse`` whose body is the predicted label followed by an HTML
    snippet with the current time. ``request`` is not used.
    """
    def get_words_in_tweets(tweets):
        all_words = []
        for (words, sentiment) in tweets:
            all_words += words
        return all_words

    def get_word_features(wordlist):
        frequencies = FreqDist(wordlist)
        return frequencies.keys()

    def prepare(samples):
        # Keep lower-cased words of at least three characters per sample.
        prepared = []
        for (text, label) in samples:
            kept = [e.lower() for e in text.split() if len(e) >= 3]
            prepared.append((kept, label))
        return prepared

    pos_tweets = [('I love this car', 'positive'),
                  ('This view is amazing', 'positive'),
                  ('I feel great this morning', 'positive'),
                  ('I am so excited about the concert', 'positive'),
                  ('He is my best friend', 'positive')]
    neg_tweets = [('I do not like this car', 'negative'),
                  ('This view is horrible', 'negative'),
                  ('I feel tired this morning', 'negative'),
                  ('I am not looking forward to the concert', 'negative'),
                  ('He is my enemy', 'negative')]
    test_pos_tweets = [('I feel happy this morning', 'positive'),
                       ('Larry is my friend', 'positive')]
    test_neg_tweets = [('I do not like that man', 'negative'),
                       ('This view is horrible', 'negative'),
                       ('The house is not great', 'negative'),
                       ('Your song is annoying', 'negative')]

    tweets = prepare(pos_tweets + neg_tweets)
    test_tweets = prepare(test_pos_tweets + test_neg_tweets)

    word_features = get_word_features(get_words_in_tweets(tweets))

    def extract_features(document):
        seen = set(document)
        features = {}
        for word in word_features:
            key = 'contains(%s)' % word
            features[key] = word in seen
        return features

    training_set = apply_features(extract_features, tweets)
    test_training_set = apply_features(extract_features, test_tweets)
    classifier = nltk.classify.NaiveBayesClassifier.train(training_set)

    tweet = 'Your song is horrible'
    clas = classifier.classify(extract_features(tweet.split()))
    # Disabled diagnostics kept from the original:
    #   classifier.show_most_informative_features(5)
    #   + clas +" "+ class2
    #   class2 nltk.classify.util.accuracy(classifier,test_training_set)

    now = datetime.datetime.now()
    page = "<html><body>It is 555 now %s.</body></html>" % now
    html = clas + page
    return HttpResponse(html)
# word_pairs.append(word_pair) #gen = (word for word in words if word not in nltk.corpus.stopwords.words('english')) #for word in gen: # features["contains_unigrams_(%s)" %(word)] = True #for word_pair in word_pairs: # features["contains_bigrams_(%s)" % (word_pair)] = True return features #Create proper test and training sets based on features print "Applying features to Training Set..." train_set = apply_features(feature_extracting_function, train_data_points) print "Applying features to Testing Set..." test_set = apply_features(feature_extracting_function, test_data_points) #Run the NLTK Naive Bayes Classifier on the training set print "Doing Naive Bayes...." nb = nltk.NaiveBayesClassifier.train(train_set) #NLTK Accuracy: Run trained model on the test set print "Accuracy: "+str(nltk.classify.accuracy(nb, test_set)) print "\n"+str(nb.show_most_informative_features(20))
def classify(self, instance):
    """Return the label ``self.classifier`` predicts for one instance.

    The instance is wrapped in a one-element list so that the unlabeled
    form of ``apply_features`` can produce its featureset.
    """
    featuresets = apply_features(self.extract_features, [instance],
                                 labeled=False)
    only_featureset = featuresets[0]
    return self.classifier.classify(only_featureset)
tweet = 'CS' + tweet tweets = mono_tweets + cs_tweets # this is only necessary if mono and cs are mixed random.shuffle(tweets) # Splitting into train and test, FOR 1000 TWEETS (CHANGE IT ACCORDINGLY) train = tweets[:900] print(train[:3]) test = tweets[900:] print(test[:3]) print(len(test)) # Applying features to our data. train_feat = apply_features(senti_features2, train) print(train_feat[:3]) test_feat = apply_features(senti_features2, test) print('Training...') # Training the classifier me = MaxentClassifier.train(train_feat, max_iter=10) print('Evaluating...') print(evaluate_classifier(me, test_feat)) print('Best features:') me.show_most_informative_features()
for u in jb_users: curr = db.query("SELECT tweet FROM tweets WHERE user=%s", u["id"]) curr = [hash.sub("", mtn.sub("", t['tweet'])).strip() for t in curr] [jb_tweets.append((t, "JB")) for t in curr] print "Singapore", len(sg_tweets) print "Johor", len(jb_tweets) db.close() #NLTK processing tweets = [] random.shuffle(sg_tweets) random.shuffle(jb_tweets) tweets = filter_words(sg_tweets[3500:] + jb_tweets[1100:]) test = filter_words(sg_tweets[-220:] + jb_tweets[-220:]) word_features = get_word_features(get_words_in_tweets(tweets))[:2000] training_set = apply_features(extract_features, tweets) test_set = apply_features(extract_features, test) #Train using nltk classifier = nltk.NaiveBayesClassifier.train(training_set) print classifier.show_most_informative_features(32) print "Accuracy: ", nltk.classify.accuracy(classifier, test_set)
#+ bcolors.ENDC) lista_feature_fell = get_lista_feature_fell() #print("\n\tCaracteristica / Sentimento:\n\t %s"%lista_feature_fell) tema = listaColecao msg=sys.argv[1] language = detect_language(msg) ##print ("\n\tLingua: %s "%language) if ( language == 'portuguese'): print("\n\tAnalisar Msg: %s "%msg.capitalize()) msg2 = normalizar(msg) print("\n\tNormalizado: %s "%msg2) features_msg=getFeatureVector(msg2) print ("\n\tCaracteristicas da Msg - %s "%features_msg) relacao = obter_sentimento_associado(lista_feature_fell,features_msg) #print("\n\tSentimentos associados : %s "%sentimentos_associados) valor = show_relacao(relacao) if(valor == 'true'): training_set = apply_features(extract_features,lista_feature_fell) ##print("\n\tTraining_set: %s "%training_set) # Avalia mensagem avaliar_Sentimento(features_msg,training_set) else: print(bcolors.FAIL+"\n\tAvaliação Impossível\n\n"+bcolors.ENDC) else: print ("\n\tPor favor insira o texto novamente\n\n") else: print ('\nUsage: python testarMsg.py msg fatec|dilma|copa|palmeiras\n')
def avaliar_Sentimento(message):
    """Train a Naive Bayes classifier from ``lista_feature_fell``.

    NOTE(review): in the visible code ``message`` is never used and nothing
    is returned -- this excerpt may be cut short; confirm against the full
    file.
    """
    # Featuresets are built from the labeled data with extract_features.
    training_set = apply_features(extract_features,lista_feature_fell)
    classifier= nltk.NaiveBayesClassifier.train(training_set)
return word_feature def naive_bayes(training_set,test_set): # build the trained set and save it in naive_classifier.p naive_classifier = 'naive_classifier.p' if not os.path.exists(naive_classifier): classifier = nltk.NaiveBayesClassifier.train(training_set) fileobject = open(naive_classifier, 'wb') pickle.dump(classifier, fileobject) fileobject.close() # load the ctrained data set in the previous step. fileobject = open(naive_classifier, 'rb') classifier = pickle.load(fileobject) fileobject.close() ####### print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, test_set))*100) classifier.show_most_informative_features(15) df = pd.read_csv("rating_sentiment3.csv") dataset = featureExtraction(df) all_words = bag_of_words(dataset) word_freq = word_frequency(all_words) words_features = get_words_features(word_freq) processed_record = 0 training_set = apply_features(find_word_feature, dataset[:-750]) test_set = apply_features(find_word_feature, dataset[-750:]) naive_bayes(training_set,test_set)
# Event # Hackbright Data Science Workshop. # Author # Daniel Wiesenthal. [email protected]. # What is this? # This is a simple script illustrating the usage of the Python NLTK classifier. It is written in Python, but the comments are intended to make it clear how to port to other languages. The flow isn't particularly well decomposed as a program; rather, it is intended to go along linearly with the associated talk/presentation. # The goal is to find out which chocolate a particular volunteer during the talk will like. We have a few examples of chocolate bars that we know are either matches or not (misses), and want to use that to make a guess about an unknown bar (we don't know if it will be a match, and want to guess). # Further reading: # http://www.stanford.edu/class/cs124/lec/naivebayes.pdf # http://nltk.googlecode.com/svn/trunk/doc/book/ch06.html # Software Setup # For this script to work, you'll need to have Python, NLTK (a Python package), and Numpy (upon which NLTK depends) installed. On a Mac (which all have numpy pre-installed these days), run: # sudo easy_install pip # sudo pip install nltk # <cd to directory with this file> # python classification_101.py #Required libraries from myclassifier import MyClassifier import pprint try: import nltk from nltk.classify.util import apply_features import string print "Great! Looks like you're all set re: NLTK and Python." except Exception, e:
def _word_tokens(tokenizer, word_pattern, text):
    """Return the lower-cased tokens of ``text`` that match ``word_pattern``."""
    return [token.lower() for token in tokenizer.tokenize(text)
            if word_pattern.match(token) is not None]


def _classify_csv(csv_path, json_path, tokenizer, word_pattern, classifier):
    """Classify the ``text`` column of ``csv_path`` and dump it to ``json_path``."""
    tweets = []
    with open(csv_path) as source:
        for row in csv.DictReader(source):
            text = row['text']
            features = _word_tokens(tokenizer, word_pattern, text)
            print(row['created_at'])
            tweets.append({
                "created_at": row['created_at'],
                "text": text,
                "classification": classifier.classify(extract_features(features))
            })
    with open(json_path, 'w+') as out:
        out.write(json.dumps(tweets, indent=2))


def twitterClass():
    """Train the tweet sentiment classifier, or use a saved one to classify.

    When ``semtiment_classifier.pickle`` does not exist in the current
    working directory, a Naive Bayes classifier is trained on the NLTK
    ``twitter_samples`` corpus and pickled. Otherwise the word features and
    the pickled classifier are loaded and the tweets in
    ``datafiles/trump.csv`` and ``datafiles/clinton.csv`` are classified
    into ``trumpClassified.json`` and ``clintonClassified.json``.

    The module-level ``wordFeatures`` is replaced as a side effect.

    NOTE(review): the nesting was reconstructed from a whitespace-collapsed
    source. Classification is kept in the branch that loads the pickle,
    the only reading in which ``classifier`` is always defined -- confirm.
    NOTE(review): nothing visible here writes ``wordFeatures.json``; verify
    that it is produced elsewhere before the load branch runs.
    """
    global wordFeatures
    tknzr = TweetTokenizer(strip_handles=True)
    onlyWords = re.compile('^[a-zA-Z]+$')

    if not os.path.exists(os.path.join(os.getcwd(), 'semtiment_classifier.pickle')):
        print(twitter_samples.fileids())

        labeledTweets = []
        corpus_files = (('negative_tweets.json', "negative"),
                        ('positive_tweets.json', "positive"))
        for fileid, label in corpus_files:
            for it in twitter_samples.docs(fileid):
                labeledTweets.append(
                    (_word_tokens(tknzr, onlyWords, it['text']), label))

        wordFeatures = get_word_features(get_words_in_tweets(labeledTweets))
        print("training")
        training = classUtil.apply_features(extract_features, labeledTweets)
        sentimentClassifier = NaiveBayesClassifier.train(training)
        print("done training")
        with open('semtiment_classifier.pickle', 'wb') as f:
            pickle.dump(sentimentClassifier, f)
    else:
        with open('wordFeatures.json', "r") as fin:
            wordFeatures = json.load(fin)
        print(wordFeatures)

        # The pickle is the file written by the training branch above; only
        # load pickles from a trusted location.
        with open('semtiment_classifier.pickle', 'rb') as f:
            classifier = pickle.load(f)  # type: nltk.classify.naivebayes.NaiveBayesClassifier

        # Input columns used: text, created_at.
        _classify_csv('datafiles/trump.csv', 'trumpClassified.json',
                      tknzr, onlyWords, classifier)
        _classify_csv('datafiles/clinton.csv', 'clintonClassified.json',
                      tknzr, onlyWords, classifier)
test_tweets = [] for (test_words, test_sentiment) in test_pos_tweets + test_neg_tweets: test_words_filtered = [ e.lower() for e in test_words.split() if len(e) >= 3 ] test_tweets.append((test_words_filtered, test_sentiment)) word_features = get_word_features(get_words_in_tweets(tweets)) def extract_features(document): document_words = set(document) features = {} for word in word_features: features['contains(%s)' % word] = (word in document_words) return features training_set = apply_features(extract_features, tweets) test_training_set = apply_features(extract_features, test_tweets) classifier = nltk.classify.NaiveBayesClassifier.train(training_set) tweet = 'Your song is annoying' print classifier.classify(extract_features(tweet.split())) classifier.show_most_informative_features(5) print nltk.classify.util.accuracy(classifier, test_training_set)
test_reviews=[] for(test_words,test_sentiment)in test: test_words_filtered=[e.lower() for e in test_words.split() if len(e)>=3] test_reviews.append((test_words_filtered,test_sentiment)) word_features = get_word_features(get_words_in_reviews(reviews)) def extract_features(document): document_words = set(document) features = {} for word in word_features: features['contains(%s)' % word] = (word in document_words) return features training_set = apply_features(extract_features, reviews) test_training_set=apply_features(extract_features, test_reviews) classifier = nltk.classify.NaiveBayesClassifier.train(training_set) tweet = 'she is awesome' print(classifier.classify(extract_features(tweet.split()))) print(nltk.classify.util.accuracy(classifier,test_training_set)) classifier.show_most_informative_features(40)
# print ("Sentimento - %s "%x) i+=1 while ( y < len(listaMsg)): features = getAllFeatures(listaMsg[y],features) # Todas palavras relevantes/caracteristicas da mensagem featureVector = getFeatureVector(listaMsg[y]) lista_feature_fell.append((featureVector,listaFell[y])) y+=1 return features def get_lista_feature_fell(): return lista_feature_fell def get_feature_list(): return featureList if __name__ == '__main__': if (len(sys.argv) == 2 and (sys.argv[1] == 'fatec' or sys.argv[1] == 'dilma' or sys.argv[1] == 'copa' or sys.argv[1] == 'palmeiras')): # Limpar a tela #subprocess.call("clear") listaColecao = sys.argv[1] print ("\n\t\tAnálise de Sentimento\n\t\tAssunto: %s "%listaColecao.upper()) # Gera a lista de caracteristicas usada no metodo extract_features featureList = gera_lista_features(listaColecao) print ("\n\tCaracteristicas: %s\n "%(featureList)) lista_feature_fell=get_lista_feature_fell() #print ("\n\tCaracteristicas e Sentimento: %s "%lista_feature_fell) training_set = apply_features(check_features,lista_feature_fell) print ("\n\tTraining_set: %s \n"%training_set) else: print ('\nUsage: python trainingSet.py fatec|dilma|copa|palmeiras\n')
return word_features review_features = get_features(get_all_words(reviews)) # print review_features def extract_features(document): document_words = set(document) features = {} for word in review_features: features["contains(%s)" % word] = word in document_words # print features return features training_set = apply_features(extract_features, reviews) classifier = nltk.classify.NaiveBayesClassifier.train(training_set) # test = ["berbat bir yer", "Muhteşem bir yer.","harika","mükemmel bir yer","vasat","rezalet","Başarılı","Kötü yemekler"] for r in test: print classifier.classify(extract_features(r.split())) test_set = apply_features(extract_features, test_reviews) print nltk.metrics.scores.accuracy(classifier, test_set) # print classifier.show_most_informative_features(1)