def SentimentAnalyzer(text="The movie was amazing"):
    """Train a Naive Bayes model on the NLTK movie reviews and classify ``text``.

    The first 80% of the positive and of the negative reviews are used for
    training; the remaining 20% of each class are used to measure accuracy.
    The model is retrained on every call.

    Args:
        text: Sentence to classify. It is split on whitespace before feature
            extraction. Defaults to the sentence that used to be hard-coded,
            so existing zero-argument calls behave as before.

    Returns:
        dict with the keys ``"accuracy"`` (accuracy on the held-out reviews),
        ``"predicted_sentiment"`` (``'Positive'`` or ``'Negative'``) and
        ``"probability"`` (probability of the predicted label, rounded to
        two decimals). The same dict is also printed.
    """
    def labelled_features(category, label):
        # One (feature dict, label) pair per review file in the category.
        # NOTE(review): extract_features is defined elsewhere; presumably it
        # builds a bag-of-words dict -- confirm.
        return [
            (extract_features(movie_reviews.words(fileids=[f])), label)
            for f in movie_reviews.fileids(category)
        ]

    features_pos = labelled_features('pos', 'Positive')
    features_neg = labelled_features('neg', 'Negative')

    # Train/test split point, applied to each class separately.
    threshold = 0.8
    num_pos = int(threshold * len(features_pos))
    num_neg = int(threshold * len(features_neg))

    features_train = features_pos[:num_pos] + features_neg[:num_neg]
    features_test = features_pos[num_pos:] + features_neg[num_neg:]

    classifier = NaiveBayesClassifier.train(features_train)

    probabilities = classifier.prob_classify(extract_features(text.split()))
    predicted_sentiment = probabilities.max()

    response = {
        "accuracy": nltk_accuracy(classifier, features_test),
        "predicted_sentiment": predicted_sentiment,
        "probability": round(probabilities.prob(predicted_sentiment), 2),
    }
    print(response)
    return response
def SentimentAnalyzer(text):
    """Classify ``text`` with a Naive Bayes model trained on NLTK movie reviews.

    For each class the first 80% of the reviews train the model and the rest
    are held out to measure accuracy. Accuracy, the predicted label and its
    rounded probability are printed.

    Args:
        text: Sentence to classify; it is split on whitespace.

    Returns:
        Tuple ``(predicted_sentiment, probability)`` where the probability is
        the unrounded probability of the predicted label.
    """
    # Load the movie reviews from the sample data, one class at a time.
    labelled = {}
    for category, tag in (('pos', 'Positive'), ('neg', 'Negative')):
        labelled[tag] = [
            (extract_features(movie_reviews.words(fileids=[fid])), tag)
            for fid in movie_reviews.fileids(category)
        ]

    # Create the training and testing data with a per-class 80/20 split.
    split_ratio = 0.8
    train_set, test_set = [], []
    for tag in ('Positive', 'Negative'):
        samples = labelled[tag]
        cut = int(split_ratio * len(samples))
        train_set.extend(samples[:cut])
        test_set.extend(samples[cut:])

    # Train the Naive Bayes classifier and report held-out accuracy.
    model = NaiveBayesClassifier.train(train_set)
    print('Accuracy:', nltk_accuracy(model, test_set))

    # Pick the label with the maximum probability.
    distribution = model.prob_classify(extract_features(text.split()))
    predicted_sentiment = distribution.max()
    confidence = distribution.prob(predicted_sentiment)

    print("Predicted sentiment:", predicted_sentiment)
    print("Probability:", round(confidence, 2))
    return predicted_sentiment, confidence
def SentimentAnalyzer(text):
    """Classify ``text`` with a Naive Bayes model trained on ``frame``.

    ``frame`` is not defined in this function; it is read from the enclosing
    (module) scope and must expose ``body_text`` and ``label`` columns.
    The first 2000 rows train the model and every later row is held out for
    the accuracy figure. The model is retrained on every call.

    Args:
        text: Sentence to classify; it is split on whitespace.

    Returns:
        The predicted label, i.e. the most probable one.
    """
    train_size = 2000

    # Build every (features, label) pair first and split afterwards. The
    # previous code truncated the list to 2000 items and then sliced the test
    # set from index 2000 of that truncated list, so it was always empty.
    # NOTE(review): rows are passed to extract_features as-is, while the query
    # text below is passed as text.split(); confirm that body_text holds the
    # same kind of token sequence.
    features_all = [
        (extract_features(feature), label)
        for feature, label in zip(frame.body_text, frame.label)
    ]
    features_train = features_all[:train_size]
    features_test = features_all[train_size:]

    print('\nNumber of training datapoints:', len(features_train))
    print('Number of test datapoints:', len(features_test))

    # Debug output describing the shape of the training data.
    print(type(features_train))
    print(type(features_train[0]))
    print(type(features_train[0][0]))

    # Train the Naive Bayes classifier.
    classifier = NaiveBayesClassifier.train(features_train)
    print('Accuracy:', nltk_accuracy(classifier, features_test))

    probabilities = classifier.prob_classify(extract_features(text.split()))

    # Pick the label with the maximum probability.
    predicted_sentiment = probabilities.max()
    print("Predicted sentiment:", predicted_sentiment)
    print("Probability:", round(probabilities.prob(predicted_sentiment), 2))
    return predicted_sentiment
'Positive') for f in fileids_pos] features_neg = [(extract_features(movie_reviews.words(fileids=[f])), 'Negative') for f in fileids_neg] threshold = 0.8 num_pos = int(threshold * len(features_pos)) num_neg = int(threshold * len(features_neg)) features_train = features_pos[:num_pos] + features_neg[:num_neg] features_test = features_pos[num_pos:] + features_neg[num_neg:] print('\nNumber of training datapoints:', len(features_train)) print('Number of test datapoints:', len(features_test)) classifier = NaiveBayesClassifier.train(features_train) print('\nAccuracy of the classifier:', nltk_accuracy(classifier, features_test)) N = 15 print('\nTop ' + str(N) + ' most informative words:') for i, item in enumerate(classifier.most_informative_features()): print(str(i + 1) + '. ' + item[0]) if i == N - 1: break input_reviews = [ "Everything about this movie is outstanding -- the performances, the way the true events are handled, the cinematography. In this day of digital news, this movie makes us stand back and realize what we may lose in the way of investigative journalism as we slowly kill off print media. The focus remains the child abuse scandal in the archdiocese in Boston. That reflects the conflict the characters face and deal with when events make them rethink the focus of their article. The movie is riveting, though we know the outcome." ] print("\nMovie review predictions:") for review in input_reviews: print("\nReview:", review)
threshold = 0.8 num_pos = int(threshold * len(features_pos)) num_neg = int(threshold * len(features_neg)) # Create training and training datasets features_train = features_pos[:num_pos] + features_neg[:num_neg] features_test = features_pos[num_pos:] + features_neg[num_neg:] # Print the number of datapoints used print('\nNumber of training datapoints:', len(features_train)) print('Number of test datapoints:', len(features_test)) # Train a Naive Bayes classifier classifier = NaiveBayesClassifier.train(features_train) print('\nAccuracy of the classifier:', nltk_accuracy(classifier, features_test)) N = 20 print('\nTop ' + str(N) + ' most informative words:') for i, item in enumerate(classifier.most_informative_features()): print(str(i + 1) + '. ' + item[0]) if i == N - 1: break # Test input movie reviews input_reviews = [ 'Im not sure theres a single unsuccessful moment in this entire film. This was the movie that reminded me how much I can still love a movie. ', 'However, although entertaining in parts, there is very little connective tissue between the two main running storylines, creating a disappointing disconnect which prevents the movie from truly coming together in the end.', 'While this has interesting moments, Foster seems unable to follow the story into as deep or dark a place as it should go and the ambiguity in the storytelling is unwarranted and frustrating to witness.', 'There is an appreciated sense of unconventionally to the film. However, the story quickly takes an overemotional and theatrical turn which diminish the many topics the story could have explored. ', "A sensational Korean trial makes for a fairly riveting cinematic ride, with its very own touches of that infamous gangnam style.",
def build_naive_bayes_model(self):
    """Train an NLTK Naive Bayes sentiment model on the reviews and label them.

    Each review is tokenized, lower-cased, stripped of punctuation,
    non-alphabetic tokens and English stop words; the cleaned text is written
    back to ``r['text']``. Reviews scored 4 or 5 are labelled ``"pos"``, all
    others ``"neg"``. The model is trained on a shuffled 80% of the documents
    and validated on the remaining 20%.

    Side effects:
        Extends ``self.all_words`` and ``self.documents``, mutates each
        review's ``'text'`` entry, prints the accuracy and the most
        informative features, and stores the predicted label, review id and
        app id of every document in ``self.nb_df``.
    """
    print('Processing Naive Bayes classification: \n')

    # Loop-invariant: build the stop-word set once, not once per review.
    stop_words = set(stopwords.words('english'))

    # NOTE(review): `reviews` is not defined in this method; it is read from
    # an enclosing scope. Confirm this is not meant to be an attribute.
    for r in reviews:
        # Tokenize the review text and lower-case the tokens.
        tokens = [w.lower() for w in word_tokenize(self.review_text(r))]
        # Remove punctuation.
        stripped = [w.translate(self.punc_table) for w in tokens]
        # Keep alphabetic words that are not stop words.
        words = [w for w in stripped if w.isalpha() and w not in stop_words]

        self.all_words.extend(words)

        # Store the processed text for model training later.
        r['text'] = ' '.join(words)

        # Label reviews with 4 or 5 ratings as pos and the rest as neg.
        label = "pos" if r['score'] in (4, 5) else "neg"
        self.documents.append((r, label))

    # Build the frequency distribution once, after all words are collected
    # (it used to be rebuilt for every review). Use the 3000 most frequent
    # words: FreqDist keys are not ordered by frequency in NLTK 3, so slicing
    # keys() only returned the 3000 first-seen words.
    fdist = FreqDist(self.all_words)
    word_features = [word for word, _ in fdist.most_common(3000)]

    def find_features(text):
        # Map every feature word to whether it occurs in the text.
        present = set(word_tokenize(text))
        return {w: (w in present) for w in word_features}

    # Compute each document's features once, in document order, so they can
    # be reused for the final labelling pass after the shuffle below.
    document_features = [find_features(rvw['text'])
                         for rvw, _ in self.documents]
    featuresets = [(features, sentiment)
                   for features, (_, sentiment)
                   in zip(document_features, self.documents)]
    random.shuffle(featuresets)

    threshold = 0.8
    split = int(threshold * len(featuresets))
    training_set = featuresets[:split]
    testing_set = featuresets[split:]

    # Prep done. Build the model and validate.
    classifier = NaiveBayesClassifier.train(training_set)
    print("Naive Bayes classifier accuracy percent:",
          (nltk_accuracy(classifier, testing_set)) * 100)
    print("\n")
    classifier.show_most_informative_features(15)

    # Build the list of predicted sentiments, one per document.
    labels = classifier.classify_many(document_features)
    self.nb_df = pd.DataFrame(labels, columns=['nb_label'])
    self.nb_df['review_id'] = [rvw['id'] for rvw, _ in self.documents]
    self.nb_df['appid'] = [rvw['appid'] for rvw, _ in self.documents]
def Analtsis_Movie(Movie_Reple):
    """Classify the reviews of a movie and chart the sentiment distribution.

    A Naive Bayes classifier is trained on the pickled training features and
    scored on the pickled test features. Each review returned by
    ``read_review(Movie_Reple)`` is classified from the output of
    ``pos_tagger.nouns``. The share of ``'Positive'``, ``'Negative'`` and
    ``'Nomal'`` predictions is printed and drawn as a pie chart.

    Args:
        Movie_Reple: Passed unchanged to ``read_review`` to obtain the
            reviews. NOTE(review): presumably a movie identifier or a review
            source -- confirm against ``read_review``.

    Returns:
        None. Results are printed and shown with matplotlib.
    """
    features_train = read_pickle('features_train.txt')
    features_test = read_pickle('features_test.txt')

    print('\n학습데이터의 수:', len(features_train))
    print('테스트 데이터의 수: ', len(features_test))

    classifier = NaiveBayesClassifier.train(features_train)
    print('\n정확도 :', nltk_accuracy(classifier, features_test))

    N = 15
    print('\nTop ' + str(N) + ' 결정적인 단어 :')
    for i, item in enumerate(classifier.most_informative_features()):
        print(str(i + 1) + '. ' + item[0])
        if i == N - 1:
            break

    input_reviews = read_review(Movie_Reple)
    print("\n영화 리뷰 예측:")

    # Only these three labels are tallied; any other prediction is ignored.
    # The spelling 'Nomal' is the label string the code compares against.
    counts = {'Positive': 0, 'Negative': 0, 'Nomal': 0}
    for review in input_reviews:
        probabilities = classifier.prob_classify(
            extract_features(pos_tagger.nouns(review)))

        # Pick the label with the maximum probability.
        predicted_sentiment = probabilities.max()

        print("\n리뷰 :", review)
        print("예측된 감정:", predicted_sentiment)
        print("정확도 :", round(probabilities.prob(predicted_sentiment), 2))

        if predicted_sentiment in counts:
            counts[predicted_sentiment] += 1

    positive_count = counts['Positive']
    negative_count = counts['Negative']
    nomal_count = counts['Nomal']
    total = positive_count + negative_count + nomal_count

    if total == 0:
        # Nothing was counted: the ratios below would divide by zero and the
        # pie chart would have no data, so report the empty total and stop.
        print('총 개수 : ', total)
        return

    print('긍정적인 비율 : ', positive_count / total)
    print('부정적인 비율 : ', negative_count / total)
    print('중립의 비율 : ', nomal_count / total)
    print('총 개수 : ', total)

    num = [positive_count, nomal_count, negative_count]
    vec = ['postive', 'nomal', 'negative']
    plt.pie(num, labels=vec,
            colors=['steelblue', 'lightskyblue', 'salmon'],
            startangle=90, shadow=True, autopct='%1.1f%%')
    plt.show()
def analyzeFile(input_file):
    """Find disability-related sentences in a PDF and classify their sentiment.

    Steps, all reported on stdout:

    1. Read every page of the PDF, drop newlines and delete the characters
       ``(){}<>,'`` and tab.
    2. Tokenize the text into words and sentences and group the sentences
       into chunks of ``chunksize`` sentences.
    3. Train a multinomial Naive Bayes model on the 20-newsgroups training
       subset and predict, per chunk, whether it is ``sci.med`` ("Medicine")
       or any other group ("Other").
    4. Stem the chunk tokens and flag the chunks that share a stem with the
       disability keyword list.
    5. Train an NLTK Naive Bayes sentiment model on ``sentences`` and
       classify every flagged chunk.

    ``sentences`` is not defined in this function; it is read from module
    scope and is indexed as ``(label, text)`` with label 0 -> "neutral",
    1 -> "positive", anything else -> "negative".

    Args:
        input_file: Path of the PDF document to analyse.

    Returns:
        None.
    """
    # Initialisation
    snowball = SnowballStemmer('english')
    healthplan = {"raw_text": "", "number_pages": 0}
    # NOTE(review): chunks are compared token by token, so multi-word entries
    # such as "learning disability" presumably never match -- confirm.
    keywords = [
        "autism", "prosthetics", "disability", "disabled",
        "learning disability", "prostheses", "physiotherapy", "deaf",
        "blind", "chronic conditions", "hearing loss", "physiotherapist",
        "mental health", "autistic spectrum disorder", "autistic",
        "aspergers", "ADHD", "attention defecit disorder", "speech therapy",
        "dyslexia", "dyspraxia", "learning disorders", "speech delay",
        "genetic screening", "cystic fibrosis", "visual impairment",
        "blindness", "deaf-blindness", "ABI", "acquired brain injury",
        "prosthesis",
    ]
    keywords_lem = [snowball.stem(k) for k in keywords]
    keyword_stems = set(keywords_lem)

    # Import pdf. The file is opened in binary mode and closed by the `with`
    # block; pages are read inside it because the reader uses the open handle.
    # NOTE(review): PdfFileReader/numPages/getPage/extractText look like the
    # legacy PyPDF2 API -- confirm the installed version still provides them.
    page_texts = []
    with open(input_file, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        healthplan["number_pages"] = pdfReader.numPages
        print("Number of pages in document:", healthplan["number_pages"])
        for i in range(healthplan["number_pages"]):
            page_texts.append(
                pdfReader.getPage(i).extractText().replace("\n", ""))
            # i + 1 pages are done at this point, so progress ends at 100%.
            print("Processed ",
                  round((i + 1) / healthplan["number_pages"] * 100, 2),
                  "% of documents")
    healthplan["raw_text"] = "".join(page_texts)

    # Tokenisation of the document. str.translate deletes all unwanted
    # characters in a single pass.
    print(healthplan["raw_text"][:500])
    table = str.maketrans(dict.fromkeys("(){}<>,'\t"))
    healthplan["raw_text"] = healthplan["raw_text"].translate(table)
    print(healthplan["raw_text"][:500])
    healthplan["words"] = word_tokenize(healthplan["raw_text"])
    print(healthplan["words"][:50])
    healthplan["sentences"] = sent_tokenize(healthplan["raw_text"])
    print(healthplan["sentences"][:10])

    # Perform stemming
    healthplan["words"] = [snowball.stem(w) for w in healthplan["words"]]

    # Combine `chunksize` consecutive sentences into one chunk.
    chunksize = 1
    print("Number Document sentences: ", len(healthplan["sentences"]))
    healthplan["chunks"] = [
        ' '.join(healthplan["sentences"][i:i + chunksize])
        for i in range(0, len(healthplan["sentences"]), chunksize)
    ]
    print("Number Document chunks: ", len(healthplan["chunks"]))

    if not healthplan["chunks"]:
        # No text was extracted; the percentages below would divide by zero.
        sys.stdout.flush()
        return

    # Sentence category prediction
    category_map = {'sci.med': 'Medicine'}
    categories = [
        'alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
        'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
        'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
        'sci.electronics', 'sci.space', 'soc.religion.christian',
        'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc',
        'talk.religion.misc'
    ]
    for c in categories:
        category_map[c] = "Other"

    # Get the training dataset
    training_data = fetch_20newsgroups(subset='train',
                                       categories=category_map.keys(),
                                       shuffle=True,
                                       random_state=5)

    # Extract term counts, then weight them with tf-idf.
    count_vectorizer = CountVectorizer()
    train_tc = count_vectorizer.fit_transform(training_data.data)
    tfidf = TfidfTransformer()
    train_tfidf = tfidf.fit_transform(train_tc)

    # Train a Multinomial Naive Bayes classifier
    classifier = MultinomialNB().fit(train_tfidf, training_data.target)

    # Run the chunks through the same vectorizer and tf-idf transformer.
    input_tc = count_vectorizer.transform(healthplan["chunks"])
    input_tfidf = tfidf.transform(input_tc)

    # Predict the output categories
    predictions = classifier.predict(input_tfidf)
    predicted_names = [
        category_map[training_data.target_names[p]] for p in predictions
    ]

    # Print the first few outputs
    no_outputs = 10
    for sent, name in zip(healthplan["chunks"][:no_outputs],
                          predicted_names[:no_outputs]):
        print('\nInput:', sent, '\nPredicted category:', name)

    # Count the chunks predicted as medicine. The previous code summed the
    # numeric class indices, which is not a count.
    medicine_count = predicted_names.count("Medicine")
    print("Percentage of document medicine related: ",
          round(medicine_count / len(predicted_names) * 100, 2), "% : ",
          medicine_count)

    # Stem the words in each chunk and compare them to the stemmed keywords.
    healthplan["chunks_lem"] = [[snowball.stem(s) for s in word_tokenize(c)]
                                for c in healthplan["chunks"]]
    print("Disability key words: ", keywords_lem)

    # One (index, matched stems) pair per chunk; None when nothing matched.
    healthplan["disabled_chunks_lem"] = []
    for i, c in enumerate(healthplan["chunks_lem"]):
        matched = keyword_stems.intersection(c)
        healthplan["disabled_chunks_lem"].append(
            (i, list(matched)) if matched else (i, None))

    # Entries are tuples, so list.count(None) never matched anything; count
    # the entries whose match list is None instead.
    num_disability = sum(
        1 for _, matched in healthplan["disabled_chunks_lem"]
        if matched is not None)
    print("Number of disability sentences: ", num_disability)
    print("Percentage of document disability related: ",
          round(num_disability / len(healthplan["disabled_chunks_lem"]) * 100,
                2), "%")

    healthplan["disabled_chunks"] = []
    for i, c in enumerate(healthplan["chunks"]):
        if healthplan["disabled_chunks_lem"][i][1] is not None:
            print("Sentence: ", c)
            healthplan["disabled_chunks"].append([int(i), c])
            print("Key words: ", healthplan["disabled_chunks_lem"][i])
            print("\n", "#" * 10, "\n")

    # Returned sentences that relate to disability topics
    for c in healthplan["disabled_chunks"]:
        print(c)

    print("\n", "#" * 20, "\n")
    # `sentences` is read from module scope. The stemmed copy gets its own
    # name: assigning to `sentences` here made it a local variable and the
    # print below raised UnboundLocalError.
    print(sentences)
    labelled_sentences = [
        (s[0], [snowball.stem(w) for w in word_tokenize(s[1])])
        for s in sentences
    ]
    print("\n", "#" * 20, "\n")

    # Convert sentence features using a bag of words model
    def extract_features(words):
        return {word: True for word in words}

    # Split sentences into pos, neg, neut training/testing
    trainsplit = 0.8
    features_pos, features_neg, features_neut = [], [], []
    for s in labelled_sentences:
        if s[0] == 0:
            # NOTE(review): the message says "positive" but label 0 is stored
            # as "neutral"; confirm which one is intended.
            print("I am a positive sentence:", s[1])
            features_neut.append((extract_features(s[1]), "neutral"))
        elif s[0] == 1:
            features_pos.append((extract_features(s[1]), "positive"))
        else:
            features_neg.append((extract_features(s[1]), "negative"))

    num_neut = int(len(features_neut) * trainsplit)
    num_pos = int(len(features_pos) * trainsplit)
    num_neg = int(len(features_neg) * trainsplit)

    features_test = (features_neut[num_neut:] + features_pos[num_pos:] +
                     features_neg[num_neg:])
    features_train = (features_neut[:num_neut] + features_pos[:num_pos] +
                      features_neg[:num_neg])

    # Sentence sentiment analysis: train a Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(features_train)
    print('\nAccuracy of the classifier:',
          nltk_accuracy(classifier, features_test))

    N = 15
    print('\nTop ' + str(N) + ' most informative words:')
    for i, item in enumerate(classifier.most_informative_features()):
        print(item)
        print(str(i + 1) + '. ' + item[0])
        if i == N - 1:
            break

    # Plain list of chunk indexes. The previous np.array(...)[:, 0] raised
    # IndexError when no chunk matched and turned the indexes into strings.
    disabled_indexes = [idx for idx, _ in healthplan["disabled_chunks"]]
    print("Indexes of disability related chunks in sentences:",
          disabled_indexes)

    # Go through all disability related chunks
    for i in disabled_indexes:
        dis_sentence = healthplan["chunks_lem"][i]
        print("Sentence trying to sentiment classify: ",
              healthplan["chunks"][i])

        # Compute the probabilities and pick the most probable label
        probabilities = classifier.prob_classify(
            extract_features(dis_sentence))
        predicted_sentiment = probabilities.max()

        # Print outputs
        print("Predicted sentiment:", predicted_sentiment)
        print("Probability:",
              round(probabilities.prob(predicted_sentiment), 2))

    sys.stdout.flush()
# Define the train and test split (80% and 20%)
threshold = 0.8
num_pos = int(len(features_pos) * threshold)
num_neg = int(len(features_neg) * threshold)

# Create training and test datasets: the leading 80% of each class trains
# the model, the remainder is held out for evaluation.
features_train = features_pos[:num_pos] + features_neg[:num_neg]
features_test = features_pos[num_pos:] + features_neg[num_neg:]

# Print the number of datapoints used
print('\nNumber of training datapoints:', len(features_train))
print('Number of test datapoints:', len(features_test))

# Train a Naive Bayes classifier and report its held-out accuracy
classifier = NaiveBayesClassifier.train(features_train)
print('\nAccuracy of the classifier:',
      nltk_accuracy(classifier, features_test))

# List the N most informative words; zip stops after N entries.
N = 15
print(f'\nTop {N} most informative words:')
for rank, feature in zip(range(1, N + 1),
                         classifier.most_informative_features()):
    print(f'{rank}. {feature[0]}')

# Test input movie reviews
input_reviews = [
    'The costumes in this movie were great',
    'I think the story was terrible and the characters were very weak',
    'People say that the director of the movie is amazing',
    'This is such an idiotic movie. I will not recommend it to anyone.'
]
# Train one scikit-learn model per algorithm on the same training features,
# each wrapped so it can be scored through the NLTK classifier interface.
MNB_classifier = SklearnClassifier(MultinomialNB(alpha=1)).train(features_train)
BNB_classifier = SklearnClassifier(BernoulliNB(alpha=1, binarize=0)).train(features_train)
LGR_classifier = SklearnClassifier(LogisticRegression()).train(features_train)
SDGC_classifier = SklearnClassifier(SGDClassifier(max_iter=1000, tol=1e-3)).train(features_train)
SVC_classifier = SklearnClassifier(SVC()).train(features_train)
LSVC_classifier = SklearnClassifier(LinearSVC()).train(features_train)
# Original author's note on NuSVC: nu <= 0 or nu > 1
NuSVC_classifier = SklearnClassifier(NuSVC()).train(features_train)

# Report the accuracy of every classifier on the held-out features.
# ONB_classifier is not created in this section; it must already exist.
for _name, _model in (
        ('ONB_classifier', ONB_classifier),
        ('MNB_classifier', MNB_classifier),
        ('BNB_classifier', BNB_classifier),
        ('LGR_classifier', LGR_classifier),
        ('SDGC_classifier', SDGC_classifier),
        ('SVC_classifier', SVC_classifier),
        ('LSVC_classifier', LSVC_classifier),
        ('NuSVC_classifier', NuSVC_classifier),
):
    print(_name + ' accuracy: ', nltk_accuracy(_model, features_test))

# Test input movie reviews: one entry per sentence of text.txt. The `with`
# block closes the file, so no explicit close() is needed.
with open('text.txt', 'r', encoding='utf-8') as f1:
    input_reviews = sent_tokenize(f1.read())

# NOTE(review): this handle is not closed in this section; the code that
# writes the results must close it.
f = open('result.txt', 'w', encoding='utf-8')
'Negative') for f in fileids_neg] threshold = 0.8 num_pos = int(threshold * len(features_pos)) num_neg = int(threshold * len(features_neg)) # creating training and testing data features_train = features_pos[:num_pos] + features_neg[:num_neg] features_test = features_pos[num_pos:] + features_neg[num_neg:] print('\nNumber of training datapoints:', len(features_train)) print('Number of test datapoints:', len(features_test)) # training a naive bayes classifier classifier = NaiveBayesClassifier.train(features_train) print('Accuracy:', nltk_accuracy(classifier, features_test)) # testing input_reviews = [ 'The costumes in this movie were great', 'I think the story was terrible and the characters were very weak', 'People say that the director of the movie is amazing', 'This is such an idiotic movie. I will not recommend it to anyone.' ] print('Movie review prediction:') for review in input_reviews: print('Review:', review) # computing the probabilities probabilities = classifier.prob_classify( extract_features(review.split()))