import csv
import json

def parseReviews(review_filepath):
    # time_normalize_factor is assumed to be a module-level constant.
    with open(review_filepath, 'r') as f_in, open("out.csv", "w") as f_out:
        csv_writer = csv.writer(f_out)
        count = 0
        sentiment_analyzer = Sentiment()
        for review in f_in:
            count += 1
            if count % 1000 == 0:
                print(count)
            review_obj = json.loads(review)
            text = review_obj["text"]
            date = review_obj["date"]
            votes = review_obj["votes"]
            stars = review_obj["stars"]
            funny = votes["funny"]
            useful = votes["useful"]
            cool = votes["cool"]
            year = int(date.split("-")[0])
            # Treat reviews written in 2016 as one year old so the
            # normalization below never divides by zero.
            normalize_factor = max(float(2016 - year), 1.0) / time_normalize_factor
            funny_norm = funny / normalize_factor
            useful_norm = useful / normalize_factor
            cool_norm = cool / normalize_factor
            sentiment = sentiment_analyzer.getSentiment(text)
            csv_writer.writerow([sentiment, funny, useful, cool,
                                 funny_norm, useful_norm, cool_norm, stars])
class SimpleNLP(object):
    def __init__(self, method=1, doc=None, datalist=None):
        self.doc = doc
        self.datalist = datalist
        self.seg = Seg()
        self.sentiment = Sentiment(method)
        self.method = method

    def seg_datalist(self):
        return self.seg.seg_from_datalist(self.datalist)

    def seg_doc(self):
        return self.seg.seg_from_doc(self.doc)

    def get_keyword_datalist(self):
        return dict(self.seg.get_keyword_from_datalist(self.datalist))

    def sentiment_analysis_doc(self):
        if self.method == 1:
            self.sentiment.load_model(root_path + '/data/naivebayes_model30000v3')
        elif self.method == 2:
            self.sentiment.load_model(root_path + '/data/svmmodel10000v4')
        return self.sentiment.predict_sentence_doc(self.doc)

    def sentiment_analysis_datalist(self):
        if self.method == 1:
            self.sentiment.load_model(root_path + '/data/naivebayes_model30000v3')
        elif self.method == 2:
            self.sentiment.load_model(root_path + '/data/svmmodel10000v4')
        return self.sentiment.predict_datalist(self.datalist)
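# A minimal usage sketch for SimpleNLP (hypothetical: it assumes the Seg and
# Sentiment helpers are importable and the model files under root_path/data
# exist as referenced above; the sample sentence is made up).
nlp = SimpleNLP(method=1, doc='东西很好，快递也快')  # "Great product, fast shipping"
print(nlp.seg_doc())                 # segmented tokens
print(nlp.sentiment_analysis_doc())  # polarity from the naive Bayes model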
def get_text():
    try:
        user_msg = request.form.get('user_msg', None)
        message = Message()
        print(user_msg)
        if user_msg:
            lang_detect = LanguageDetection(**conf)
            request_id = datetime.now().strftime('%Y%m%d%H%M%S')
            lang_detect_result = lang_detect.detect_language(id=request_id, text=user_msg)
            # "You spoke in %s."
            message.append({'text': "%s 로 말씀하셨습니다. " % lang_detect_result.name})
            if lang_detect_result.lang_code in conf['approval_lang_code']:
                sentiment = Sentiment(**conf)
                sentiment_score = sentiment.detect_sentiment(
                    id=request_id, text=user_msg,
                    language=lang_detect_result.lang_code)
                message.append({'text': sentiment_score.get_sentiment_text()})
            else:
                # "I cannot tell the sentiment in that language."
                message.append({'text': '해당언어로는 제가 기분을 알 수 없습니다.'})
                # "Please speak in one of these languages: %s."
                message.append({
                    'text': '%s 언어로 이야기 해주세요.' % (conf['approval_lang_code'].values())
                })
        return jsonify(message.to_dict()), 200
    except Exception:
        import traceback
        print(traceback.format_exc())
        # "I could not understand. Please say something else."
        d = {"messages": [{"text": "알아들을수가 없습니다. 다른애기를 해주세요."}]}
        return jsonify(d), 200
def test_sentiment(self):
    from sentiment import Sentiment
    s = Sentiment()
    score = s.parseRss("sentiment.testdata")
    #print(str(score))
    self.assertTrue(score < 0.0)
def sentiment_ensemble_lexi_ml(self, lexicon_predictions, ml_predictions,
                               classifiers={'GaussianNB': GaussianNB()},
                               n_folds=2):
    """
    Fusion classification for sentiment analysis.

    :type lexicon_predictions: dict with lexicon name as keys and lists of predicted values as values
    :type ml_predictions: dict with classifier name as keys and lists of predicted values as values
    :type classifiers: dict with name of classifier and classifier object
    :return: dict with measures and time for supervised learning process
    """
    ensemble_features = self.features_array(lexicon_predictions.values(),
                                            ml_predictions.values())
    self.feature_set = ensemble_features
    s = Sentiment()
    predictions = s.sentiment_classification(
        X=self.feature_set.T,
        y=self.classes,
        n_folds=n_folds,
        classifiers=classifiers)
    # TODO: add predictions to results
    return s.results
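# A sketch of calling sentiment_ensemble_lexi_ml with toy inputs (hypothetical:
# `ensemble` stands for whatever object hosts the method above, with
# features_array and self.classes already populated; the lexicon/classifier
# names and prediction lists are made up).
from sklearn.naive_bayes import GaussianNB

lexicon_preds = {'bing_liu': [1, 0, 1, 1], 'afinn': [1, 1, 0, 1]}
ml_preds = {'svm': [1, 0, 0, 1]}
results = ensemble.sentiment_ensemble_lexi_ml(
    lexicon_preds, ml_preds,
    classifiers={'GaussianNB': GaussianNB()},
    n_folds=2)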
def place_sentiment():
    wd_id = urllib.unquote(str(request.args.get('wd_id')))
    type = urllib.unquote(request.args.get('type')).encode('utf8')
    utility = Utility()
    sentiment = Sentiment()
    place = utility.get_place(wd_id=wd_id, type=type)
    place['label'] = place['uri'].split("/")[-1].replace("_", " ")
    if place['wikidata_id'] != "None":
        place['surfaceForm'] = utility.get_surfaceForm(place['wikidata_id'])
    elements = sentiment.get_place_word_frequency(wd_id=wd_id, type=type, pos_tag="a")
    words = elements['words']
    most = words.most_common()
    array = []
    for i in range(0, min(15, len(most))):
        new_dict = {}
        elem = most[i]
        new_dict['word'] = elem[0].decode('utf-8')
        new_dict['num'] = elem[1]
        array.append(new_dict)
    words['words'] = array
    pos_neg_words = sentiment.get_best_worst_words(wd_id, type, "a")
    return render_template('place_sentiment.html',
                           place=place,
                           words=words,
                           pos_words=pos_neg_words['pos'],
                           neg_words=pos_neg_words['neg'])
def get_tweet_sent(self, hashtag, time):
    # Historical twitter sentiment for the given time frame.
    sentiment = Sentiment()
    d = datetime.today() - timedelta(days=time)
    authentication = tweepy.OAuthHandler(
        self.getConfig('Tweet')['consumer_key'],
        self.getConfig('Tweet')['consumer_secret'])
    authentication.set_access_token(
        self.getConfig('Tweet')['access_token'],
        self.getConfig('Tweet')['access_token_secret'])
    api = tweepy.API(authentication,
                     wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    tweetsPerQry = 100
    maxTweets = self.getConfig('Tweet')['maxTweets']
    data = {}
    for h in hashtag:
        maxId = -1
        tweetCount = 0
        sent_total = []
        while tweetCount < maxTweets:
            if maxId < 0:
                newTweets = api.search(q=h, count=tweetsPerQry,
                                       result_type="recent",
                                       tweet_mode="extended",
                                       languages=["en"])
            else:
                newTweets = api.search(q=h, count=tweetsPerQry,
                                       max_id=str(maxId - 1),
                                       result_type="recent",
                                       tweet_mode="extended",
                                       languages=["en"])
            if not newTweets:
                break
            for tweet in newTweets:
                created_at = tweet.created_at
                text = tweet.full_text
                sentiment_score = sentiment.get_sentiment_score(text)
                # Drop neutral scores and keep only tweets inside the window.
                if created_at >= d and sentiment_score != 0:
                    sent_total.append(sentiment_score)
            tweetCount += len(newTweets)
            maxId = newTweets[-1].id
        # The median is more robust to outliers here than the mean; guard
        # against hashtags that yielded no scored tweets.
        if sent_total:
            data[h] = round(statistics.median(sent_total), 3)
    return data
def test_sentimentDict(self):
    from sentiment import Sentiment
    s = Sentiment()
    self.assertTrue(len(Sentiment.Dict) > 0)
    self.assertTrue(s.scoreTitle("stocks down") < 0.0)
    self.assertTrue(s.scoreTitle("opening up") > 0.0)
    self.assertTrue(s.scoreDate("Mon, 04 Jan 2017 07:54:27 -0400") == 0.0)
def GetSubjScoreForSingleSent(SentIndex, Sentence):
    T0 = time()
    try:
        SentAnalyzer = Sentiment()
        ResDict = SentAnalyzer.analyze([Sentence])
        pprint(ResDict)
        SentSentimentPol = {S: {'sentiment': ResDict['sentiments'][I],
                                'score': ResDict['scores'][I]}
                            for I, S in enumerate(ResDict['sentences'])}
        print 'processed sentence number {} in {} sec'.format(SentIndex, round(time() - T0))
        return SentSentimentPol
    except Exception:
        print 'failed to process sentence number ', SentIndex
        return {}
def get_scatter_data(timespan):
    """Send tweet sentiment to scatter plot"""
    print "In our JSON route" + session.get("ticker")
    ticker = session.get("ticker")
    current_stock = Stock.query.get(ticker)
    tweets = current_stock.get_tweets()
    stocks = Stock.query.all()
    result = []
    s = Sentiment(stocks)
    sentiment = None
    negative = ['0.0', '0.1', '0.2', '0.3', '0.4', '0.5']
    positive = ['0.6', '0.7', '0.8', '0.9', '1.0']
    for tweet in tweets:
        # Create a moment that represents now - 24 hours.
        day_ago = moment.utcnow().timezone("US/Eastern").subtract(hours=24)
        # Convert the unicode created_at to a string.
        created_at = unicodedata.normalize('NFKD', tweet.created_at).encode('ascii', 'ignore')
        # Format the created_at string as ISO 8601.
        created_at_str = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y'))
        # Create a moment from the string.
        created_at = moment.date(created_at_str, 'YYYY-MM-DD HH:mm:ss')
        # Convert the moment's timezone from UTC to Eastern time.
        created_at_final = created_at.utcnow().timezone("US/Eastern")
        print created_at_final > day_ago
        if tweet.text.count('$') == 1 and tweet.retweeted_status is None and created_at_final > day_ago:
            # Convert the tweet text from unicode to text.
            tweet_text = unicodedata.normalize('NFKD', tweet.text).encode('ascii', 'ignore')
            # Get the sentiment of the tweet, returned as 'positive' or 'negative'.
            sentiment_str = s.get_tweet_sentiment(tweet_text)
            if sentiment_str == 'positive':
                sentiment = random.choice(positive)
            if sentiment_str == 'negative':
                sentiment = random.choice(negative)
            # e.g. Sun Jun 05 17:09:07 +0000 2016
            created_at = unicodedata.normalize('NFKD', tweet.created_at).encode('ascii', 'ignore')
            created_at_str = time.strftime('%Y-%m-%d %H:%M:%S',
                                           time.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y'))
            # The lines below return duplicate timestamps; a way to convert
            # to the US/Eastern timezone is still needed:
            # created_at = moment.date(created_at_str, 'YYYY-MM-DD HH:mm:ss')
            # created_at_final = created_at.utcnow().timezone("US/Eastern")
            print created_at_str
            result.append({'datetime': created_at_str, 'sentiment': sentiment})
    # Sort the dictionaries by datetime.
    sorted_result = sorted(result, key=lambda k: k['datetime'])
    return json.dumps(sorted_result)
def ProcessSingleFile(FName, MailNumber):
    T0 = time()
    try:
        with open(FName) as f:
            EmailText = [f.read()]
        SentAnalyzer = Sentiment()
        Res = SentAnalyzer.analyze(EmailText)
        pprint(Res)
        SentSentimentPol = zip(Res['sentences'], Res['sentiments'], Res['scores'])
        SentSentimentPol = [ThreeTuple for ThreeTuple in SentSentimentPol
                            if ThreeTuple[1] != 'neutral']
        print 'processed email {} in {} sec'.format(MailNumber, round(time() - T0))
        return SentSentimentPol
    except Exception:
        print 'failed to process email ', MailNumber
        return []
def get_tweet_hist(self, hashtag):
    # `api` (a tweepy.API instance) is assumed to be initialized elsewhere.
    sentiment = Sentiment()
    tweetsPerQry = 100
    maxTweets = self.get_config()['maxTweets']
    for h in hashtag:
        maxId = -1
        tweetCount = 0
        while tweetCount < maxTweets:
            if maxId < 0:
                newTweets = api.search(q=h, count=tweetsPerQry,
                                       result_type="recent",
                                       tweet_mode="extended",
                                       languages=["en"])
            else:
                newTweets = api.search(q=h, count=tweetsPerQry,
                                       max_id=str(maxId - 1),
                                       result_type="recent",
                                       tweet_mode="extended",
                                       languages=["en"])
            if not newTweets:
                print("No more tweets to fetch.")
                break
            for tweet in newTweets:
                created_at = round(
                    calendar.timegm(
                        time.strptime(str(tweet.created_at),
                                      "%Y-%m-%d %H:%M:%S"))) * 1000
                text = tweet.full_text
                sentiment_score = sentiment.get_sentiment_score(text)
                followers = 0
                self.push_to_db(h, created_at, text, sentiment_score, followers)
                self.push_to_influx(h, created_at, text, sentiment_score, followers)
            tweetCount += len(newTweets)
            print(h, tweetCount)
            maxId = newTweets[-1].id
class Featurizer(object):
    def __init__(self):
        self.sentiment_analyzer = Sentiment('data/AFINN-111.txt')
        self.bow_vectorizer = None
        self.bow_analyzer = None

    def bag_of_words(self, body):
        return self.bow_vectorizer.transform([body]).toarray()

    def text_features(self, comment):
        num_chars = len(comment.get("body"))
        num_links = count_links(comment.get("body"))
        simple_tokens = comment.get("body").split(' ')
        num_words = 0
        avg_word_length = 0
        for token in simple_tokens:
            num_words += 1
            avg_word_length += len(token)
        avg_word_length = float(avg_word_length) / float(num_words)
        sentiment = self.sentiment_analyzer.analyze(
            self.bow_analyzer(comment.get("body")))
        score = comment.get("score")
        return [num_chars, num_links, num_words, num_words,
                avg_word_length, sentiment]

    def transform_comment(self, comment):
        return numpy.hstack((
            numpy.array([self.text_features(comment)], dtype='float_'),
            self.bag_of_words(comment.get("body"))))

    def score_comment(self, comment):
        return comment.get("score")

    def transform(self, comments):
        """ Returns a Nx(D+1) numpy matrix of features. The first D columns
        correspond to features, and the final column corresponds to the
        scores of each comment."""
        # If it's a single instance, return an array.
        if isinstance(comments, dict):
            return self.transform_comment(comments)
        # http://scikit-learn.org/stable/modules/feature_extraction.html
        self.bow_vectorizer = CountVectorizer(min_df=1)
        self.bow_vectorizer.fit([c.get("body") for c in comments])
        self.bow_analyzer = self.bow_vectorizer.build_analyzer()

        def features_and_label(comment):
            return numpy.hstack((
                self.transform_comment(comment),
                numpy.array([[self.score_comment(comment)]], dtype='float_')))

        return numpy.vstack([features_and_label(c) for c in comments])
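# A usage sketch for Featurizer on toy comments (hypothetical data: the dicts
# mirror the {"body", "score"} shape the methods above expect; count_links and
# data/AFINN-111.txt are assumed to exist in this project).
featurizer = Featurizer()
comments = [
    {"body": "great write-up, thanks for sharing", "score": 12},
    {"body": "this is wrong and lazy", "score": -3},
]
matrix = featurizer.transform(comments)  # N x (D+1); the last column is the score
print(matrix.shape)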
def __init__(self, file_in, max_articles=None, file_out=None):
    self.file_name = file_in
    self.max_articles = max_articles
    self.articles = []
    self.sentiment = Sentiment()
    self.results = []
    self.file_out = file_out
def partOne():
    _day = "今日"  # "today"
    # _day = "昨日"  # "yesterday"
    # Filter: not ST, not ChiNext, not STAR Market, not newly listed
    # (i.e. stocks capped at a 10% daily move).
    height_10cm = "非st 非创业板 非科创板 非新股"
    up_all = crawl_length(_day + "涨幅大于0")    # gainers
    down_all = crawl_length(_day + "涨幅小于0")  # decliners
    up_5 = crawl_length(_day + "涨幅大于5 " + height_10cm)    # up more than 5%
    down_5 = crawl_length(_day + "跌幅大于5 " + height_10cm)  # down more than 5%
    up_num = crawl_length(_day + "涨停" + height_10cm)        # limit-up count
    down_num = crawl_length(_day + "跌停 " + height_10cm)     # limit-down count
    up_10_2 = crawl_length(_day + "二连板 " + height_10cm)    # two consecutive limit-ups
    up_highest = crawl_highest(_day + "二连板以上 " + height_10cm)  # longest limit-up streak
    a = Sentiment(str(datetime.datetime.now().date()),
                  up_5=up_5, down_5=down_5,
                  up_num=up_num, down_num=down_num,
                  up_all=up_all, down_all=down_all,
                  up_10_2=up_10_2, up_highest=up_highest)
    # a = Sentiment('2021-11-30', 166, 1740, 64, 2, 2899, 1583, 9, 5)
    print(a)
    insert(a)
def main():
    # Instantiate all the useful classes.
    linker = EntityLinker()
    cleaner = Cleaner()
    sentiment = Sentiment()
    utility = Utility()
def __init__(self, available_intent, script_file="script.txt",
             intent_p_dict=INTENT_P_DICT, grammar_p_dict=GRAMMAR_P_DICT):
    self.episode_script = None
    # Init basic episode generator
    with open(os.path.join(DATA_ROOT, script_file), "rb") as f:
        script = json.load(f)
    if "pre_sales" in available_intent.keys():
        self.pre_sales = PreSales(script["pre_sales"],
                                  available_intent["pre_sales"],
                                  intent_p_dict["pre_sales"],
                                  grammar_p_dict["pre_sales"])
    else:
        self.pre_sales = None
    if "in_sales" in available_intent.keys():
        self.in_sales = InSales(script["in_sales"],
                                available_intent["in_sales"],
                                intent_p_dict["in_sales"],
                                grammar_p_dict["in_sales"])
    else:
        self.in_sales = None
    if "after_sales" in available_intent.keys():
        self.after_sales = AfterSales(script["after_sales"],
                                      available_intent["after_sales"],
                                      intent_p_dict["after_sales"])
    else:
        self.after_sales = None
    if "sentiment" in available_intent.keys():
        self.sentiment = Sentiment(script["sentiment"])
    else:
        self.sentiment = None
    # Get available episodes
    self.available_episode = list(available_intent.keys())
    if "pre_sales" in self.available_episode and "in_sales" in self.available_episode:
        self.available_episode.append(" ".join(["pre_sales", "in_sales"]))
    # Knowledge base
    self.kb_helper = KnowledgeBase()
    # Translates agent dialog actions according to KB results
    self.translator = Translator()
def train(): print("load data") d = Sentiment(data_name, 6) print("start_train") with tf.device('/gpu:'+FLAGS.gpu): m = Model(data_name, d.num_class, embeddings = d.embeddings, size = FLAGS.size, batch_size = FLAGS.batch, dropout = FLAGS.dropout, rnn_cell = FLAGS.cell, optimize = FLAGS.opt) print("start_fit") m.fit(d.train_set, d.test_set, d.train_one_hot, d.test_one_hot, FLAGS.epoch)
def analysis():
    payload = request.get_json()  # renamed from `json` to avoid shadowing the module
    params = payload.get("params")
    base64_html = payload.get("html")
    html = base64.b64decode(base64_html).decode('utf-8')
    response = Sentiment.run(html, params)
    return {"status": "ok", "message": response}
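# A hypothetical client call for the route above: the HTML payload is
# base64-encoded the way the handler expects (the URL, port, and params
# value are assumptions).
import base64
import requests

payload = {
    "params": {},
    "html": base64.b64encode(b"<p>I loved this product!</p>").decode("ascii"),
}
resp = requests.post("http://localhost:5000/analysis", json=payload)
print(resp.json())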
def __init__(self):
    super().__init__()
    self.storyGenerator = StoryGenerator()
    self.action_buttons = []
    self.inventory_labels = []
    self.out_path = ""
    self.storyGenerator.CHANCE_TO_REMEMBER_ITEM = 0.0
    self.storyGenerator.CHANCE_TO_REMEMBER_PERSON = 0.0
    self.total_nodes = 0
    self.current_node = 0
    self.EMPTY_ACTION = {
        "type": "",
        "action": "",
        "entity": "",
        "sentence": "",
        "simple": "",
        "probability": ""
    }
    self.action_generator = 0
    self.DEBUG_OUT = False
    self.REACT_TO_SENTIMENT = True
    self.MAX_CHARACTERS = -1
    self.sentiment = Sentiment()
    self.GENDER_A = ["unknown", "androgynous", "male", "mostly_male"]
    self.gender_detector = gender.Detector()
    logging.basicConfig(level=logging.DEBUG)
    logging.config.dictConfig({
        'version': 1,
        'disable_existing_loggers': True
    })
    self.logger = logging.getLogger(__name__)
    self.logger.info("Initialization done.")
def create_app(spark_context, dataset_path):
    global recommendation_engine
    global content_recommend
    global hot_words
    global sentiment
    recommendation_engine = RecommendationEngine(spark_context, dataset_path)
    content_recommend = ContentRecommend(spark_context, dataset_path)
    hot_words = HotWords(spark_context, dataset_path)
    sentiment = Sentiment(spark_context, dataset_path)
    img = Image(spark_context, dataset_path)
    app = Flask(__name__)
    app.config['JSON_AS_ASCII'] = False
    app.config['JSONIFY_MIMETYPE'] = "application/json;charset=utf-8"
    app.register_blueprint(main)
    return app
def api():
    if request.method == 'GET':
        return jsonify(text='This is how you should format your POST data.')
    if request.method == 'POST':
        try:
            f = request.get_json()
        except Exception as e:
            # Should log this error maybe?
            print(e)
            return Errors.bad_request()
        else:
            # NOTE: the return inside the loop means only the first entry
            # of the posted JSON is analyzed.
            for text in f:
                print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
                return jsonify(Sentiment.demo_vader_instance(f[text]))
def process(self, media_list_file):
    # Filter the tweets.
    print_green("Filtering movies in {} to {}".format(
        raw_movie_dir, filtered_movie_dir))
    filter.filterAll(media_list_file, raw_movie_dir, filtered_movie_dir)
    # Run the sentiment analysis.
    print_green("Sentiment of movies in {} to {}".format(
        filtered_movie_dir, scored_movie_dir))
    Sentiment().run(filtered_movie_dir, scored_movie_dir)
    # Get the critic scores.
    print_green("Critic score of movies")
    Omdb().critic_scores(media_list_file)
    # Merge the critic scores and sentiment analysis scores into
    # the final data.json output file.
    print_green("Merging sentiment and critic scores")
    merge_gen('critic_scores.json', 'sentiment_scores.json', 'data.json')
def mappa_brescia_filtered_sentiment():
    utility = Utility()
    sentiment = Sentiment()
    # Map each request flag to the aggregated_type it selects.
    filters = [
        ('elemento_stradale', 'elemento stradale'),
        ('sede_scolastica', 'sede scolastica/culturale'),
        ('luogo_turistico', 'luogo turistico/monumento'),
        ('edificio_religioso', 'edificio religioso'),
        ('suddivisione_cittadina', 'suddivisione cittadina'),
        ('edifici_civili', 'edificio civile'),
        ('fermata_tpl', 'fermata tpl'),
        ('edificio_sanitario', 'edificio sanitario'),
        ('edificio_sportivo', 'edificio sportivo'),
    ]
    query = ""
    for param, aggregated_type in filters:
        if urllib.unquote(str(request.args.get(param))) == "1":
            query += " or aggregated_type = '%s'" % aggregated_type
    n_citations = urllib.unquote(str(request.args.get('citations')))
    norm = urllib.unquote(str(request.args.get('normalize')))
    circle_size = urllib.unquote(str(request.args.get('circle_size')))
    places = utility.get_places_filtered(1, 0, n_citations, query)
    params = dict()
    params['zoom'] = 13
    params['circle_size'] = circle_size
    params['counters'] = "map.addLayer(circlesLayer);"
    params['sentiment'] = norm
    return render_template('mappa_sentiment.html', places=places, params=params)
def get_tweets(query):
    list_of_results = []
    negative_words = []
    positive_words = []
    token = tauth.get_bearer_token()
    response = requests.get('https://api.twitter.com/1.1/search/tweets.json?',
                            headers={"Authorization": "Bearer " + token},
                            params={
                                "q": query,
                                "tweet_mode": "extended",
                                "lang": "pt"
                            })
    if response.status_code != 200:
        raise Exception("Cannot get tweets (Status Code %d) Message: %s"
                        % (response.status_code, response.text))
    body = response.json()
    if not body['statuses']:
        raise Exception("We don't have tweets to show")
    for tweet in body['statuses']:
        text = tweet['full_text']
        language = tweet['metadata']['iso_language_code']
        sentiment = Sentiment(text, language)
        result = Result(
            tweet['id'],
            tweet['user']['name'],
            text,
            sentiment.analyze_feeling(),
            tweet['favorite_count'],
            tweet['retweet_count'],
            tweet['created_at'],
        )
        list_of_results.append(result)
        text_wordcloud = text_treatment.treat_for_wordcloud(text)
        token_space = tokenize.WhitespaceTokenizer()
        word_list = token_space.tokenize(text_wordcloud)
        for word in word_list:
            sentiment = Sentiment(word, language)
            analyzed_word = sentiment.analyze_feeling()
            if analyzed_word == 'pos':
                positive_words.append(word)
            else:
                negative_words.append(word)
    persist_search(query=query, results=list_of_results)
    return list_of_results, ' '.join(positive_words), ' '.join(negative_words)
def getText(self):
    sentDataList = []
    for item in self.jsonData:
        if item['dt'] not in self.quotes_dict:
            self.quotes_dict[item['dt']] = item['quote']
        # Reuse the existing text set for this date, or start a new one.
        textSet = self.text_dict.get(item['dt'], set())
        textSet.add(item['data']['t'])
        self.text_dict[item['dt']] = textSet
    for k, v in self.text_dict.items():
        quotes = self.quotes_dict[k]
        sentiData = Sentiment(k, v, quotes)
        sentDataList.append(sentiData)
    return sentDataList
def main(): print ("Generating language models....") trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE) trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE) trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE) trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE) testCleanLM = LanguageModel(CLEAN_TEST_FILE) testInsultLM = LanguageModel(INSULT_TEST_FILE) trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) + ([1] * trainABInsultLM.getDocCount())) testLabels = np.array(([0] * testCleanLM.getDocCount()) + ([1] * testInsultLM.getDocCount())) ### Just baseline probabilities print ("Running baseline....") NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM) print ("\tTraining NB....") NB.train() print ("\tTesting NB....") totalNBMatrix = np.array(NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents())) trainMatrix = totalNBMatrix testMatrix = np.array(NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents())) # clf = svm.SVC(kernel='linear') # print ("\tTraining SVM....") # clf.fit(trainMatrix, trainLabels) # print ("\tTesting SVM....") # output1 = clf.predict(testMatrix).tolist() ## Baseline + PoS Features print ("Running baseline + PoS Features....") cleanPosMatrix = trainABCleanLM.getPosMatrix() insultPosMatrix = trainABInsultLM.getPosMatrix() testCleanPosMatrix = testCleanLM.getPosMatrix() testInsultPosMatrix = testInsultLM.getPosMatrix() posFeatures = np.array(cleanPosMatrix + insultPosMatrix) testPosFeatures = np.array(testCleanPosMatrix + testInsultPosMatrix) trainMatrix = np.hstack((trainMatrix, posFeatures)) testMatrix = np.hstack((testMatrix, testPosFeatures)) # clf = svm.SVC(kernel='linear') # print ("\tTraining SVM....") # clf.fit(trainMatrix, trainLabels) # print ("\tTesting SVM....") # output2 = clf.predict(testMatrix).tolist() ### Baseline + PoS Features + TF-IDF Features (TODO Arun) print("Running baseline + PoS Features + TF-IDF Features") # generate list of features with TFIDF, using trainABCleanLM and trainABInsultLM # trainMatrix = np.hstack((trainMatrix, the new thing you just generated)) # do same for testMatrix # clf = svm.SVC() # print ("\tTraining SVM....") # clf.fit(trainMatrix, trainLabels) # print ("\tTesting SVM....") # output3 = clf.predict(testMatrix).tolist() # then update the output_file.txt thing below tfidf_train_features = tfidf.make_feature_vectors(trainAACleanLM, trainAAInsultLM, trainABCleanLM, trainABInsultLM) tfidf_test_features = tfidf.make_feature_vectors(trainAACleanLM, trainAAInsultLM, testCleanLM, testInsultLM) print tfidf_test_features.shape, tfidf_train_features.shape print testMatrix.shape, trainMatrix.shape trainMatrix = np.hstack((trainMatrix, tfidf_train_features)) testMatrix = np.hstack((testMatrix, tfidf_test_features)) # clf = svm.SVC(kernel='linear') # print ("\tTraining SVM....") # clf.fit(trainMatrix, trainLabels) # print ("\tTesting SVM....") # output3 = clf.predict(testMatrix).tolist() ### SENTIMENT ### print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features") s = Sentiment() clean_train = np.array(s.get_clean_train_vector()) insult_train = np.array(s.get_insult_train_vector()) sentiment_train_features = np.concatenate((clean_train, insult_train), axis=0) shape = sentiment_train_features.shape sentiment_train_features = sentiment_train_features.reshape((shape[0], 1)) print sentiment_train_features.shape clean_test = np.array(s.get_clean_test_vector()) insult_test = np.array(s.get_insult_test_vector()) sentiment_test_features = np.concatenate((clean_test, insult_test), axis=0) shape = 
sentiment_test_features.shape sentiment_test_features = sentiment_test_features.reshape((shape[0], 1)) print sentiment_test_features.shape trainMatrix = np.hstack((trainMatrix, sentiment_train_features)) testMatrix = np.hstack((testMatrix, sentiment_test_features)) # clf = svm.SVC(kernel='linear') # print ("\tTraining SVM....") # clf.fit(trainMatrix, trainLabels) # print ("\tTesting SVM....") # output4 = clf.predict(testMatrix).tolist() ### MISSPELLINGS ### print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features + Misspellings features") m = Misspellings() clean_train = np.array(m.get_clean_misspellings(False)) insult_train = np.array(m.get_insult_misspellings(False)) misspellings_train_features = np.concatenate((clean_train, insult_train), axis=0) shape = misspellings_train_features.shape misspellings_train_features = misspellings_train_features.reshape((shape[0], 1)) print misspellings_train_features.shape clean_test = np.array(m.get_clean_misspellings()) insult_test = np.array(m.get_insult_misspellings()) misspellings_test_features = np.concatenate((clean_test, insult_test), axis=0) shape = misspellings_test_features.shape misspellings_test_features = misspellings_test_features.reshape((shape[0], 1)) print misspellings_test_features.shape trainMatrix = np.hstack((trainMatrix, sentiment_train_features)) testMatrix = np.hstack((testMatrix, sentiment_test_features)) clf = svm.SVC(kernel='linear') print ("\tTraining SVM....") clf.fit(trainMatrix, trainLabels) print ("\tTesting SVM....") output5 = clf.predict(testMatrix).tolist() index_shuf = range(len(trainMatrix)) trainMatrix_shuf = [] trainLabel_shuf = [] shuffle(index_shuf) for i in index_shuf: trainMatrix_shuf.append(trainMatrix[i]) trainLabel_shuf.append(trainLabels[i]) train_sizes, train_scores, valid_scores = learning_curve(svm.SVC(), trainMatrix_shuf, trainLabel_shuf, train_sizes=[100, 300, 500, 700, 900], cv=2) average_train_scores = [sum(i)/float(len(i)) for i in train_scores] average_valid_scores = [sum(i)/float(len(i)) for i in valid_scores] plt.plot(train_sizes, average_train_scores) plt.plot(train_sizes, average_valid_scores) plt.legend(['Training score', 'Cross-validation score'], loc='center left', bbox_to_anchor=(0.85, 0.5)) plt.ylabel('Score') plt.xlabel('Training examples') plt.show() # with open('SVM_output_file_with_SB.txt', 'w+') as f: # f.write("Output 1\n") # f.write("{}\n".format(output1)) # interpret_results(output1, testLabels, f) # f.write("\nOutput 2\n") # f.write("{}\n".format(output2)) # interpret_results(output2, testLabels, f) # f.write("\nOutput 3\n") # f.write("{}\n".format(output3)) # interpret_results(output3, testLabels, f) # f.write("Output 4\n") # f.write("{}\n".format(output4)) # interpret_results(output4, testLabels, f) # f.write("Output 5\n") # f.write("{}\n".format(output5)) # interpret_results(output5, testLabels, f) get_pca_graph(trainMatrix, trainLabels, "train_pca.png", title="PCA of Training Set") get_pca_graph(testMatrix, testLabels, "test_pca.png", title="PCA of Test Set") get_pca_graph(trainMatrix, trainLabels, "train_pca2.png", title="PCA of Training Set (Insults Only)", plot_negative=False) get_pca_graph(testMatrix, testLabels, "test_pca2.png", title="PCA of Test Set (Insults Only)", plot_negative=False)
def main(): print("Generating language models....") trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE) trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE) trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE) trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE) testCleanLM = LanguageModel(CLEAN_TEST_FILE) testInsultLM = LanguageModel(INSULT_TEST_FILE) trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) + ([1] * trainABInsultLM.getDocCount())) testLabels = np.array(([0] * testCleanLM.getDocCount()) + ([1] * testInsultLM.getDocCount())) ### Just baseline probabilities print("Running baseline....") NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM) print("\tTraining NB....") NB.train() print("\tTesting NB....") totalNBMatrix = np.array( NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents())) trainMatrix = totalNBMatrix testMatrix = np.array( NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents())) clf = RandomForestClassifier() print("\tTraining random forest....") clf.fit(trainMatrix, trainLabels) print("\tTesting random forest....") output1 = clf.predict(testMatrix).tolist() ### Baseline + PoS Features print("Running baseline + PoS Features....") cleanPosMatrix = trainABCleanLM.getPosMatrix() insultPosMatrix = trainABInsultLM.getPosMatrix() testCleanPosMatrix = testCleanLM.getPosMatrix() testInsultPosMatrix = testInsultLM.getPosMatrix() posFeatures = np.array(cleanPosMatrix + insultPosMatrix) testPosFeatures = np.array(testCleanPosMatrix + testInsultPosMatrix) trainMatrix = np.hstack((trainMatrix, posFeatures)) testMatrix = np.hstack((testMatrix, testPosFeatures)) clf = RandomForestClassifier() print("\tTraining SVM....") clf.fit(trainMatrix, trainLabels) print("\tTesting SVM....") output2 = clf.predict(testMatrix).tolist() ### Baseline + PoS Features + TF-IDF Features (TODO Arun) print("Running baseline + PoS Features + TF-IDF Features") # generate list of features with TFIDF, using trainABCleanLM and trainABInsultLM # trainMatrix = np.hstack((trainMatrix, the new thing you just generated)) # do same for testMatrix # clf = svm.SVC() # print ("\tTraining SVM....") # clf.fit(trainMatrix, trainLabels) # print ("\tTesting SVM....") # output3 = clf.predict(testMatrix).tolist() # then update the output_file.txt thing below tfidf_train_features = tfidf.make_feature_vectors(trainAACleanLM, trainAAInsultLM, trainABCleanLM, trainABInsultLM) tfidf_test_features = tfidf.make_feature_vectors(trainAACleanLM, trainAAInsultLM, testCleanLM, testInsultLM) print tfidf_test_features.shape, tfidf_train_features.shape print testMatrix.shape, trainMatrix.shape trainMatrix = np.hstack((trainMatrix, tfidf_train_features)) testMatrix = np.hstack((testMatrix, tfidf_test_features)) clf = RandomForestClassifier() print("\tTraining random forest....") clf.fit(trainMatrix, trainLabels) print("\tTesting random forest....") output3 = clf.predict(testMatrix).tolist() ### SENTIMENT ### print( "Running baseline + PoS Features + TF-IDF Features + Sentiment Features" ) s = Sentiment() clean_train = np.array(s.get_clean_train_vector()) insult_train = np.array(s.get_insult_train_vector()) sentiment_train_features = np.concatenate((clean_train, insult_train), axis=0) shape = sentiment_train_features.shape sentiment_train_features = sentiment_train_features.reshape((shape[0], 1)) print sentiment_train_features.shape clean_test = np.array(s.get_clean_test_vector()) insult_test = np.array(s.get_insult_test_vector()) sentiment_test_features = np.concatenate((clean_test, insult_test), axis=0) shape = 
sentiment_test_features.shape sentiment_test_features = sentiment_test_features.reshape((shape[0], 1)) print sentiment_test_features.shape trainMatrix = np.hstack((trainMatrix, sentiment_train_features)) testMatrix = np.hstack((testMatrix, sentiment_test_features)) clf = RandomForestClassifier() print("\tTraining random forest....") clf.fit(trainMatrix, trainLabels) print("\tTesting random forest....") output4 = clf.predict(testMatrix).tolist() ### MISSPELLINGS ### print( "Running baseline + PoS Features + TF-IDF Features + Sentiment Features + Misspellings features" ) m = Misspellings() clean_train = np.array(m.get_clean_misspellings(False)) insult_train = np.array(m.get_insult_misspellings(False)) misspellings_train_features = np.concatenate((clean_train, insult_train), axis=0) shape = misspellings_train_features.shape misspellings_train_features = misspellings_train_features.reshape( (shape[0], 1)) print misspellings_train_features.shape clean_test = np.array(m.get_clean_misspellings()) insult_test = np.array(m.get_insult_misspellings()) misspellings_test_features = np.concatenate((clean_test, insult_test), axis=0) shape = misspellings_test_features.shape misspellings_test_features = misspellings_test_features.reshape( (shape[0], 1)) print misspellings_test_features.shape trainMatrix = np.hstack((trainMatrix, sentiment_train_features)) testMatrix = np.hstack((testMatrix, sentiment_test_features)) clf = RandomForestClassifier() print("\tTraining random forest....") clf.fit(trainMatrix, trainLabels) print("\tTesting forest....") output5 = clf.predict(testMatrix).tolist() with open('RANDOM_FOREST_output_file_without_SB.txt', 'w+') as f: f.write("Output 1\n") f.write("{}\n".format(output1)) interpret_results(output1, testLabels, f) f.write("\nOutput 2\n") f.write("{}\n".format(output2)) interpret_results(output2, testLabels, f) f.write("\nOutput 3\n") f.write("{}\n".format(output3)) interpret_results(output3, testLabels, f) f.write("Output 4\n") f.write("{}\n".format(output4)) interpret_results(output4, testLabels, f) f.write("Output 5\n") f.write("{}\n".format(output5)) interpret_results(output5, testLabels, f)
from bs4 import BeautifulSoup
import os
from sentiment import Sentiment
import csv

if __name__ == '__main__':
    files = os.listdir('./zagat')
    csvFile = open('out.csv', 'w')
    writer = csv.writer(csvFile)
    sentiment = Sentiment()
    for f in files:
        with open('./zagat/' + f) as review:
            soup = BeautifulSoup(review, 'html.parser')
            review_city = soup.find(itemprop='addressLocality').text
            review_state = soup.find(itemprop='addressRegion').text
            review_text = soup.find(itemprop='reviewBody').text
            review_sent = sentiment.getSentiment(review_text)
            writer.writerow([str(review_sent), review_state, review_city])
    csvFile.close()
def __init__(self):
    self.sentiment_analyzer = Sentiment('data/AFINN-111.txt')
    self.bow_vectorizer = None
    self.bow_analyzer = None
def create_classifier():
    dataset = load_dataset()
    X_train, Y_train = dataset
    classifier = Sentiment()
    classifier.fit(X_train, Y_train)
    return classifier
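# A sketch of using the classifier built above (assumptions: a predict()
# method matching the fit() interface implied by create_classifier, and a
# made-up sample sentence).
clf = create_classifier()
print(clf.predict(["the plot was dull but the acting was superb"]))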
    :param user: user name
    :param last_seen: last seen index of a message
    :return: list of messages and the latest message index
    """
    if user in self._history:
        user_messages = self._history[user]
        messages = user_messages[last_seen + 1:]
        return messages, len(user_messages) - 1
    else:
        return [], -1


"""
Singleton for in-RAM storage of all users' chat histories.
"""
g_message_history = MessageHistory()

"""
Sentiment analysis neural network.
"""
g_sentiment = Sentiment()


@Request.application
def application(request):
    """
    Werkzeug application to process web requests.

    :param request: inbound HTTP request
    :return: Response object
    """
    global g_message_history
    print("Got request", request)
    if request.method == 'POST':
        """
        This section parses a chat message sent by user.
        """
import os

from flask import Flask, render_template, request

from search import Search
from sentiment import Sentiment
import logger

log = logger.init_logger('sentiment.log')

template_folder = os.path.dirname(__file__)
template_folder = os.path.join(template_folder, 'templates')
app = Flask('OpinionRetrieval', template_folder=template_folder)

search_engine = Search('TwitterAuthToken.json')
classifier_path = 'sentiment_classifier.pickle'
sentiment = Sentiment()
sentiment.load(classifier_path)


@app.route('/')
def home():
    return render_template('index.html')


@app.route('/search', methods=['POST'])
def search():
    search_text = request.form['search_text']
    tweets = search_engine.query(search_text)
    prediction = []
    sentiment_prediction = sentiment.get_sentiment(
def run_quickstart(file_name):
    SpeechtoText = {}
    Output = []
    Chunkfile = []
    # [START speech_quickstart]
    import io
    import os
    # Imports the Google Cloud client library
    # [START speech_python_migration_imports]
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    # [END speech_python_migration_imports]
    # Instantiates a client
    # [START speech_python_migration_client]
    client = speech.SpeechClient()
    # [END speech_python_migration_client]
    # The name of the audio file to transcribe
    # file_name = os.path.join(os.path.dirname(__file__), 'resources', 'audio.raw')
    # file_name = 'Vaishali_1_Hate.mp3'

    # Load the audio and split it into 20-second chunks.
    from pydub import AudioSegment
    from pydub.utils import make_chunks
    myaudio = AudioSegment.from_file(file_name, "wav")
    chunk_length_ms = 20000  # pydub calculates in milliseconds
    chunks = make_chunks(myaudio, chunk_length_ms)
    # Export each chunk as a wav file.
    for i, chunk in enumerate(chunks):
        chunk_name = "chunk{0}.wav".format(i)
        print("exporting", chunk_name)
        Chunkfile.append('../CutAudio/' + chunk_name)
        chunk.export('../CutAudio/' + chunk_name, format="wav")
    for i in Chunkfile:
        with io.open(i, 'rb') as audio_file:
            content = audio_file.read()
        audio = types.RecognitionAudio(content=content)
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code='en-US',
            use_enhanced=True,
            model='phone_call')
        # Detects speech in the audio file
        response = client.recognize(config, audio)
        for result in response.results:
            print('Transcript: {}'.format(result.alternatives[0].transcript))
            output = result.alternatives[0].transcript
            Output.append(output)
    Chunkfile.clear()
    # from toxicCommentPrediction import toxicity_level
    from sentiment import Sentiment
    # from DisplayPrediction import DisplayOutput
    listToStr = ' '.join([str(elem) for elem in Output])
    prediction = Sentiment(listToStr)
    # prediction = toxicity_level(listToStr)
    SpeechtoText[listToStr] = prediction
    # ToxicityLevel = DisplayOutput(SpeechtoText)
    Output.clear()
    SpeechtoText.clear()
    return prediction, listToStr
def main(): print ("Generating language models....") trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE) trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE) trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE) trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE) testCleanLM = LanguageModel(CLEAN_TEST_FILE) testInsultLM = LanguageModel(INSULT_TEST_FILE) trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) + ([1] * trainABInsultLM.getDocCount())) testLabels = np.array(([0] * testCleanLM.getDocCount()) + ([1] * testInsultLM.getDocCount())) ### Just baseline probabilities print ("Running baseline....") NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM) print ("\tTraining NB....") NB.train() print ("\tTesting NB....") totalNBMatrix = np.array(NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents())) trainMatrix = totalNBMatrix testMatrix = np.array(NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents())) clf = LogisticRegression() print ("\tTraining SVM....") clf.fit(trainMatrix, trainLabels) print ("\tTesting SVM....") output1 = clf.predict(testMatrix).tolist() ### Baseline + PoS Features print ("Running baseline + PoS Features....") cleanPosMatrix = trainABCleanLM.getPosMatrix() insultPosMatrix = trainABInsultLM.getPosMatrix() testCleanPosMatrix = testCleanLM.getPosMatrix() testInsultPosMatrix = testInsultLM.getPosMatrix() posFeatures = np.array(cleanPosMatrix + insultPosMatrix) testPosFeatures = np.array(testCleanPosMatrix + testInsultPosMatrix) trainMatrix = np.hstack((trainMatrix, posFeatures)) testMatrix = np.hstack((testMatrix, testPosFeatures)) clf = LogisticRegression() print ("\tTraining SVM....") clf.fit(trainMatrix, trainLabels) print ("\tTesting SVM....") output2 = clf.predict(testMatrix).tolist() ### Baseline + PoS Features + TF-IDF Features (TODO Arun) print("Running baseline + PoS Features + TF-IDF Features") # generate list of features with TFIDF, using trainABCleanLM and trainABInsultLM # trainMatrix = np.hstack((trainMatrix, the new thing you just generated)) # do same for testMatrix # clf = svm.SVC() # print ("\tTraining SVM....") # clf.fit(trainMatrix, trainLabels) # print ("\tTesting SVM....") # output3 = clf.predict(testMatrix).tolist() # then update the output_file.txt thing below tfidf_train_features = tfidf.make_feature_vectors(trainAACleanLM, trainAAInsultLM, trainABCleanLM, trainABInsultLM) tfidf_test_features = tfidf.make_feature_vectors(trainAACleanLM, trainAAInsultLM, testCleanLM, testInsultLM) print tfidf_test_features.shape, tfidf_train_features.shape print testMatrix.shape, trainMatrix.shape trainMatrix = np.hstack((trainMatrix, tfidf_train_features)) testMatrix = np.hstack((testMatrix, tfidf_test_features)) clf = LogisticRegression() print ("\tTraining SVM....") clf.fit(trainMatrix, trainLabels) print ("\tTesting SVM....") output3 = clf.predict(testMatrix).tolist() ### SENTIMENT ### print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features") s = Sentiment() clean_train = np.array(s.get_clean_train_vector()) insult_train = np.array(s.get_insult_train_vector()) sentiment_train_features = np.concatenate((clean_train, insult_train), axis=0) shape = sentiment_train_features.shape sentiment_train_features = sentiment_train_features.reshape((shape[0], 1)) print sentiment_train_features.shape clean_test = np.array(s.get_clean_test_vector()) insult_test = np.array(s.get_insult_test_vector()) sentiment_test_features = np.concatenate((clean_test, insult_test), axis=0) shape = sentiment_test_features.shape 
sentiment_test_features = sentiment_test_features.reshape((shape[0], 1)) print sentiment_test_features.shape trainMatrix = np.hstack((trainMatrix, sentiment_train_features)) testMatrix = np.hstack((testMatrix, sentiment_test_features)) clf = LogisticRegression() print ("\tTraining SVM....") clf.fit(trainMatrix, trainLabels) print ("\tTesting SVM....") output4 = clf.predict(testMatrix).tolist() ### MISSPELLINGS ### print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features + Misspellings features") m = Misspellings() clean_train = np.array(m.get_clean_misspellings(False)) insult_train = np.array(m.get_insult_misspellings(False)) misspellings_train_features = np.concatenate((clean_train, insult_train), axis=0) shape = misspellings_train_features.shape misspellings_train_features = misspellings_train_features.reshape((shape[0], 1)) print misspellings_train_features.shape clean_test = np.array(m.get_clean_misspellings()) insult_test = np.array(m.get_insult_misspellings()) misspellings_test_features = np.concatenate((clean_test, insult_test), axis=0) shape = misspellings_test_features.shape misspellings_test_features = misspellings_test_features.reshape((shape[0], 1)) print misspellings_test_features.shape trainMatrix = np.hstack((trainMatrix, sentiment_train_features)) testMatrix = np.hstack((testMatrix, sentiment_test_features)) clf = LogisticRegression() print ("\tTraining SVM....") clf.fit(trainMatrix, trainLabels) print ("\tTesting SVM....") output5 = clf.predict(testMatrix).tolist() with open('LOG_REG_output_file_w_SB.txt', 'w+') as f: f.write("Output 1\n") f.write("{}\n".format(output1)) interpret_results(output1, testLabels, f) f.write("\nOutput 2\n") f.write("{}\n".format(output2)) interpret_results(output2, testLabels, f) f.write("\nOutput 3\n") f.write("{}\n".format(output3)) interpret_results(output3, testLabels, f) f.write("Output 4\n") f.write("{}\n".format(output4)) interpret_results(output4, testLabels, f) f.write("Output 5\n") f.write("{}\n".format(output5)) interpret_results(output5, testLabels, f)
from sentiment import Sentiment


def load_docs(neg_file, pos_file):
    neg = open(neg_file, 'r', encoding='utf-8').readlines()
    pos = open(pos_file, 'r', encoding='utf-8').readlines()
    neg_docs = []
    pos_docs = []
    for line in neg:
        neg_docs.append(line.rstrip("\r\n"))
    for line in pos:
        pos_docs.append(line.rstrip("\r\n"))
    return neg_docs, pos_docs


if __name__ == '__main__':
    sentiment = Sentiment()
    neg_docs, pos_docs = load_docs('neg.txt', 'pos.txt')
    sentiment.train(neg_docs, pos_docs)
    text = '这个东西真心很赞'  # "This thing is really great"
    prob = sentiment.classify(text)
    print('prob: {}'.format(prob))