Example #1
def parseReviews(review_filepath):
    f_in = open(review_filepath, 'r')
    f_out = open("out.csv", "w")
    csv_writer = csv.writer(f_out)
    count = 0
    sentiment_analyzer = Sentiment()
    for review in f_in:
        count += 1
        if count % 1000 == 0:
            print(count)
        review_obj = json.loads(review)
        text = review_obj["text"]
        date = review_obj["date"]
        votes = review_obj["votes"]
        stars = review_obj["stars"]
        funny = votes["funny"]
        useful = votes["useful"]
        cool = votes["cool"]
        year = int(date.split("-")[0])
        normalize_factor = float(2016 - year) / time_normalize_factor
        funny_norm = float(funny / normalize_factor)
        useful_norm = float(useful / normalize_factor)
        cool_norm = float(cool / normalize_factor)

        sentiment = sentiment_analyzer.getSentiment(text)

        csv_writer.writerow([sentiment, funny, useful, cool,
                             funny_norm, useful_norm, cool_norm, stars])

    f_in.close()
    f_out.close()
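The function above relies on names that are not defined in the excerpt. A minimal sketch of the module-level setup it assumes (the constant's value and the input path are placeholders):

import csv
import json

from sentiment import Sentiment  # assumed helper exposing getSentiment(text)

time_normalize_factor = 10.0  # placeholder constant used to scale review age

if __name__ == '__main__':
    parseReviews("reviews.json")  # placeholder path to a JSON-lines review dump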
Example #2
class SimpleNLP(object):
    def __init__(self, method=1, doc=None, datalist=None):
        self.doc = doc
        self.datalist = datalist
        self.seg = Seg()
        self.sentiment = Sentiment(method)
        self.method = method

    def seg_datalist(self):
        return self.seg.seg_from_datalist(self.datalist)

    def seg_doc(self):
        return self.seg.seg_from_doc(self.doc)

    def get_keyword_datalist(self):
        return dict(self.seg.get_keyword_from_datalist(self.datalist))

    def sentiment_analysis_doc(self):
        if self.method == 1:
            self.sentiment.load_model(root_path +
                                      '/data/naivebayes_model30000v3')
        elif self.method == 2:
            self.sentiment.load_model(root_path + '/data/svmmodel10000v4')
        return self.sentiment.predict_sentence_doc(self.doc)

    def sentiment_analysis_datalist(self):
        if self.method == 1:
            self.sentiment.load_model(root_path +
                                      '/data/naivebayes_model30000v3')
        elif self.method == 2:
            self.sentiment.load_model(root_path + '/data/svmmodel10000v4')
        return self.sentiment.predict_datalist(self.datalist)
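A hedged usage sketch of SimpleNLP, assuming `root_path` and the `Seg`/`Sentiment` helpers come from the surrounding project (they are not defined in this excerpt) and that the listed model files exist under data/:

# Illustrative only; the input texts are placeholders.
nlp = SimpleNLP(method=1, doc="这个产品很好用")
print(nlp.seg_doc())                  # word segmentation of the document
print(nlp.sentiment_analysis_doc())   # prediction from the naive Bayes model

# Batch variant over a list of short texts, using the SVM model (method=2).
nlp_batch = SimpleNLP(method=2, datalist=["质量不错", "太失望了"])
print(nlp_batch.sentiment_analysis_datalist())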
Example #3
def get_text():
    try:
        user_msg = request.form.get('user_msg', None)
        message = Message()
        print(user_msg)
        if user_msg:

            lang_detect = LanguageDetection(**conf)
            request_id = datetime.now().strftime('%Y%m%d%H%M%S')
            lang_detect_result = lang_detect.detect_language(id=request_id,
                                                             text=user_msg)
            message.append(
                {'text': "%s 로 말씀하셨습니다. " % lang_detect_result.name})

            if lang_detect_result.lang_code in conf['approval_lang_code']:
                sentiment = Sentiment(**conf)
                sentiment_score = sentiment.detect_sentiment(
                    id=request_id,
                    text=user_msg,
                    language=lang_detect_result.lang_code)
                message.append({'text': sentiment_score.get_sentiment_text()})
            else:
                message.append({'text': '해당언어로는 제가 기분을 알 수 없습니다.'})
                message.append({
                    'text':
                    '%s 언어로 이야기 해주세요.' % (conf['approval_lang_code'].values())
                })
        return jsonify(message.to_dict()), 200
    except Exception:
        import traceback
        print(traceback.format_exc())
        d = {"messages": [{"text": "알아들을수가 없습니다. 다른애기를 해주세요."}]}
        return jsonify(d), 200
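A hedged client-side sketch for the handler above, assuming it is registered on a Flask app under a hypothetical /get_text route and that `conf` is configured elsewhere:

# Illustrative request only; host and route are assumptions.
import requests

resp = requests.post("http://localhost:5000/get_text",
                     data={"user_msg": "I am very happy today"})
print(resp.status_code)  # the handler returns 200 even on internal errors
print(resp.json())       # messages built via Message.to_dict()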
Example #4
    def test_sentiment(self):
        from sentiment import Sentiment

        s = Sentiment()
        score = s.parseRss("sentiment.testdata")
        #print(str(score))
        self.assertTrue(score < 0.0)
    def sentiment_ensemble_lexi_ml(self,
                                   lexicon_predictions,
                                   ml_predictions,
                                   classifiers={'GaussianNB': GaussianNB()},
                                   n_folds=2):
        """ Fusion classification for s analysis
        :type lexicon_predictions: dict with lexicon name as keys and lists of
            predicted values as values
        :type ml_predictions: dict with classifiers name as keys and lists of
            predicted values as values
        :type classifiers: dict with name of classifier and classifier object
        :return: dict with measures and time for supervised learning process
        """
        ensemble_features = self.features_array(lexicon_predictions.values(),
                                                ml_predictions.values())
        self.feature_set = ensemble_features
        # temp_X = self.feature_set.T
        s = Sentiment()
        # print self.classes
        predictions = s.sentiment_classification(
            # X=self.feature_set,
            X=self.feature_set.T,
            # X=self.feature_set,
            y=self.classes,
            n_folds=n_folds,
            classifiers=classifiers)

        # print '+++++++++++++++++++++++ After ensemble +++++++++++++++++'
        # print
        # pprint(s.results)
        # TODO: add predictions to results

        return s.results
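A hedged sketch of how this method might be called, with toy inputs shaped as the docstring describes; the enclosing class, its features_array helper, and self.classes (the gold labels) are assumed to exist:

# Illustrative shapes only; names and values are made up.
lexicon_predictions = {
    'AFINN':    [1, -1, 1, -1],   # one predicted label per document
    'Bing Liu': [1, -1, -1, -1],
}
ml_predictions = {
    'LogisticRegression': [1, -1, 1, 1],
}
# ensemble is an instance of the (unshown) enclosing class, with
# ensemble.classes holding the gold labels for the same four documents.
results = ensemble.sentiment_ensemble_lexi_ml(lexicon_predictions,
                                              ml_predictions,
                                              n_folds=2)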
Example #6
def place_sentiment():
    wd_id = urllib.unquote(str(request.args.get('wd_id')))
    type = urllib.unquote(request.args.get('type')).encode('utf8')

    utility = Utility()
    sentiment = Sentiment()

    place = utility.get_place(wd_id=wd_id, type=type)
    place['label'] = place['uri'].split("/")[-1].replace("_", " ")
    if (place['wikidata_id'] != "None"):
        place['surfaceForm'] = utility.get_surfaceForm(place['wikidata_id'])
    elements = sentiment.get_place_word_frequency(wd_id=wd_id,
                                                  type=type,
                                                  pos_tag="a")

    words = elements['words']
    most = words.most_common()
    array = []
    for i in range(0, min(15, most.__len__())):
        new_dict = {}
        elem = most[i]
        new_dict['word'] = elem[0].decode('utf-8')
        new_dict['num'] = elem[1]
        array.append(new_dict)

    words['words'] = array
    pos_neg_words = sentiment.get_best_worst_words(wd_id, type, "a")

    return render_template('place_sentiment.html',
                           place=place,
                           words=words,
                           pos_words=pos_neg_words['pos'],
                           neg_words=pos_neg_words['neg'])
Example #7
    def get_tweet_sent(self, hashtag, time):
        #Historical twitter sentiment for time frame
        sentiment = Sentiment()
        d = datetime.today() - timedelta(days=time)

        authentication = tweepy.OAuthHandler(
            self.getConfig('Tweet')['consumer_key'],
            self.getConfig('Tweet')['consumer_secret'])
        authentication.set_access_token(
            self.getConfig('Tweet')['access_token'],
            self.getConfig('Tweet')['access_token_secret'])
        api = tweepy.API(authentication,
                         wait_on_rate_limit=True,
                         wait_on_rate_limit_notify=True)

        tweetsPerQry = 100
        maxTweets = self.getConfig('Tweet')['maxTweets']
        data = {}

        for h in hashtag:
            maxId = -1
            tweetCount = 0
            sent_total = []

            while tweetCount < maxTweets:

                if (maxId < 0):
                    newTweets = api.search(q=h,
                                           count=tweetsPerQry,
                                           result_type="recent",
                                           tweet_mode="extended",
                                           languages=["en"])
                else:
                    newTweets = api.search(q=h,
                                           count=tweetsPerQry,
                                           max_id=str(maxId - 1),
                                           result_type="recent",
                                           tweet_mode="extended",
                                           languages=["en"])

                if not newTweets:
                    #print("Aint no tweet anymore....")
                    break

                for tweet in newTweets:
                    created_at = tweet.created_at
                    text = tweet.full_text
                    sentiment_score = sentiment.get_sentiment_score(text)
                    followers = 0
                    # Skip scores of zero and only keep tweets within the date window
                    if created_at >= d and sentiment_score != 0:
                        sent_total.append(sentiment_score)

                tweetCount += len(newTweets)
                #print(h, tweetCount)
                maxId = newTweets[-1].id
            # This seems to be more accurate with median than mean
            data[h] = round(statistics.median(sent_total), 3)
            #print(maxId)
        return (data)
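A hedged sketch of the configuration shape this method expects from self.getConfig('Tweet'); the key names are taken from the lookups above, and all values are placeholders:

# Inferred from the getConfig('Tweet') calls above; values are placeholders.
tweet_config = {
    "consumer_key": "<key>",
    "consumer_secret": "<secret>",
    "access_token": "<token>",
    "access_token_secret": "<token secret>",
    "maxTweets": 500,   # upper bound on tweets fetched per hashtag
}
# Typical call: median sentiment per hashtag over the last 7 days, e.g.
# data = tracker.get_tweet_sent(["#bitcoin", "#tesla"], time=7)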
Example #8
    def test_sentimentDict(self):
        from sentiment import Sentiment
        s = Sentiment()
        self.assertTrue(len(Sentiment.Dict) > 0)

        self.assertTrue(s.scoreTitle("stocks down") < 0.0)
        self.assertTrue(s.scoreTitle("opening up") > 0.0)

        self.assertTrue(s.scoreDate("Mon, 04 Jan 2017 07:54:27 -0400") == 0.0)
Example #9
def GetSubjScoreForSingleSent(SentIndex, Sentence):
    T0 = time()
    try:
        SentAnalyzer = Sentiment()
        ResDict = SentAnalyzer.analyze([Sentence])
        pprint(ResDict)
        SentSentimentPol = {S: {'sentiment':ResDict['sentiments'][I], 'score': ResDict['scores'][I]} for I,S in enumerate(ResDict['sentences'])}
        print 'processed sentence number {} in {} sec'.format(SentIndex, round(time() - T0))
        return SentSentimentPol
    except:
        print 'failed to process sentence number ', SentIndex
        return {}
Example #10
def get_scatter_data(timespan):
    """Send tweet sentiment to scatter plot"""
    print "In our JSON route" + session.get("ticker")
    ticker = session.get("ticker")
    current_stock = Stock.query.get(ticker)
    tweets = current_stock.get_tweets()
    stocks = Stock.query.all()

    # tweets_json = json.dumps(tweets, default=lambda o: o.__dict__)

    # now = moment.utcnow().timezone("US/Eastern")
    result = []
    s = Sentiment(stocks)
    sentiment = None
    negative = ['0.0', '0.1', '0.2', '0.3', '0.4', '0.5']
    positive = ['0.6', '0.7', '0.8', '0.9', '1.0']

    for tweet in tweets:
        #create a moment that represents now - 24 hours
        day_ago = moment.utcnow().timezone("US/Eastern").subtract(hours=24)
        # convert unicode created_at to string
        created_at = unicodedata.normalize('NFKD', tweet.created_at).encode('ascii', 'ignore')
        # format created_at string to ISO 8601
        created_at_str = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y'))
        # create a moment from the string
        created_at = moment.date(created_at_str, 'YYYY-MM-DD HH:mm:ss')
        # convert timezone of moment from UTC to Eastern time
        created_at_final = created_at.utcnow().timezone("US/Eastern")
        print created_at_final > day_ago
        if tweet.text.count('$') == 1 and tweet.retweeted_status is None and created_at_final > day_ago:
            # Convert tweet text from unicode to text
            tweet_text = unicodedata.normalize('NFKD', tweet.text).encode('ascii', 'ignore')
            # Get the sentiment of the tweet, returned as either 'positive' or 'negative'
            sentiment_str = s.get_tweet_sentiment(tweet_text)
            if sentiment_str == 'positive':
                sentiment = random.choice(positive)
            if sentiment_str == 'negative':
                sentiment = random.choice(negative)
            created_at = unicodedata.normalize('NFKD', tweet.created_at).encode('ascii', 'ignore')
            # Sun Jun 05 17:09:07 +0000 2016
            created_at_str = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y'))
            # Below 4 lines returns duplicate timestamps... need a way to convert to US/EST timezone
            # create a moment from the string
            # created_at = moment.date(created_at_str, 'YYYY-MM-DD HH:mm:ss')
            # convert timezone of moment from UTC to Eastern time
            # created_at_final = created_at.utcnow().timezone("US/Eastern")
            print "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&"
            print created_at_str
            print "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&"
            result.append({'datetime': created_at_str, 'sentiment': sentiment})
    #sort dictionary by datetime
    sorted_result = sorted(result, key=lambda k: k['datetime'])
    return json.dumps(sorted_result)
Example #11
def ProcessSingleFile (FName, MailNumber):
    T0 = time()
    try:
        EmailText = [open(FName).read()]
        SentAnalyzer = Sentiment()
        Res = SentAnalyzer.analyze(EmailText)
        pprint (Res)
        SentSentimentPol = zip(Res['sentences'],Res['sentiments'],Res['scores'])
        SentSentimentPol = [ThreeTuple for ThreeTuple in SentSentimentPol if ThreeTuple[1] != 'neutral']
        print 'processed email {} in {} sec'.format(MailNumber, round(time()-T0))
        return SentSentimentPol
    except:
        print 'failed to process email ', MailNumber
        return []
    def get_tweet_hist(self, hashtag):
        sentiment = Sentiment()

        tweetsPerQry = 100
        maxTweets = self.get_config()['maxTweets']

        for h in hashtag:
            maxId = -1
            tweetCount = 0

            while tweetCount < maxTweets:

                if (maxId < 0):
                    newTweets = api.search(q=h,
                                           count=tweetsPerQry,
                                           result_type="recent",
                                           tweet_mode="extended",
                                           languages=["en"])
                else:
                    newTweets = api.search(q=h,
                                           count=tweetsPerQry,
                                           max_id=str(maxId - 1),
                                           result_type="recent",
                                           tweet_mode="extended",
                                           languages=["en"])

                if not newTweets:
                    print("Aint no tweet anymore....")
                    break

                for tweet in newTweets:
                    created_at = round(
                        calendar.timegm(
                            time.strptime(str(tweet.created_at),
                                          "%Y-%m-%d %H:%M:%S"))) * 1000
                    text = tweet.full_text
                    sentiment_score = sentiment.get_sentiment_score(text)
                    followers = 0

                    self.push_to_db(h, created_at, text, sentiment_score,
                                    followers)
                    self.push_to_influx(h, created_at, text, sentiment_score,
                                        followers)
                    #trainingData.append(tweet_tuple)

                tweetCount += len(newTweets)
                print(h, tweetCount)
                maxId = newTweets[-1].id
class Featurizer(object):
    def __init__(self):
        self.sentiment_analyzer = Sentiment('data/AFINN-111.txt')
        self.bow_vectorizer = None
        self.bow_analyzer = None

    def bag_of_words(self, body):
        return self.bow_vectorizer.transform([body]).toarray()

    def text_features(self, comment):
        num_chars = len(comment.get("body"))
        num_links = count_links(comment.get("body"))

        simple_tokens = comment.get("body").split(' ')
        num_words = 0
        avg_word_length = 0
        for token in simple_tokens:
            num_words += 1
            avg_word_length += len(token)
        avg_word_length = float(avg_word_length) / float(num_words)

        sentiment = self.sentiment_analyzer.analyze(
            self.bow_analyzer(comment.get("body")))

        score = comment.get("score")

        return [num_chars, num_links, num_words, num_words, 
                avg_word_length, sentiment]

    def transform_comment(self, comment):
        return numpy.hstack((
            numpy.array([self.text_features(comment)], 
                        dtype='float_'),
            self.bag_of_words(comment.get("body"))))

    def score_comment(self, comment):
        return comment.get("score")

    def transform(self, comments):
        """ Returns a Nx(D+1) numpy matrix of features. The first D columns
        correspond to features, where the final column corresponds to the
        scores of each comment"""

        # if it's a single instance, return an array
        if isinstance(comments, dict):
            return self.transform_comment(comments)

        # http://scikit-learn.org/stable/modules/feature_extraction.html
        self.bow_vectorizer = CountVectorizer(min_df=1)
        self.bow_vectorizer.fit([c.get("body") for c in comments])
        self.bow_analyzer = self.bow_vectorizer.build_analyzer()

        def features_and_label(comment):
            return numpy.hstack((
                self.transform_comment(comment),
                numpy.array([[self.score_comment(comment)]], 
                            dtype='float_')))

        return numpy.vstack([features_and_label(c) 
                             for c in comments])
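A hedged usage sketch of the Featurizer above, assuming numpy, CountVectorizer, the AFINN file at data/AFINN-111.txt, and a count_links helper are available (the helper below is a stand-in); comments are dicts with 'body' and 'score' keys, the shape the methods read:

# Illustrative only; count_links stands in for the helper the class expects.
def count_links(text):
    return text.count("http")

comments = [
    {"body": "Great write-up, thanks for sharing http://example.com", "score": 12},
    {"body": "This is wrong and badly argued", "score": -3},
]

featurizer = Featurizer()
matrix = featurizer.transform(comments)  # N x (D+1); last column holds the scores
print(matrix.shape)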
Example #14
    def __init__(self, file_in, max_articles=None, file_out=None):
        self.file_name = file_in
        self.max_articles = max_articles
        self.articles = []
        self.sentiment = Sentiment()
        self.results = []
        self.file_out = file_out
Example #15
def partOne():
    _day = "今日"
    # _day="昨日"
    height_10cm = "非st 非创业板 非科创板 非新股"
    up_all = crawl_length(_day + "涨幅大于0")
    down_all = crawl_length(_day + "涨幅小于0")
    up_5 = crawl_length(_day + "涨幅大于5 " + height_10cm)
    down_5 = crawl_length(_day + "跌幅大于5 " + height_10cm)
    up_num = crawl_length(_day + "涨停" + height_10cm)
    down_num = crawl_length(_day + "跌停 " + height_10cm)
    up_10_2 = crawl_length(_day + "二连板 " + height_10cm)
    up_highest = crawl_highest(_day + "二连板以上 " + height_10cm)

    a = Sentiment(str(datetime.datetime.now().date()),
                  up_5=up_5,
                  down_5=down_5,
                  up_num=up_num,
                  down_num=down_num,
                  up_all=up_all,
                  down_all=down_all,
                  up_10_2=up_10_2,
                  up_highest=up_highest)
    # a=Sentiment('2022-01-14',up_5=up_5,down_5=down_5,up_num=up_num,down_num=down_num,up_all=up_all,down_all=down_all,up_10_2=up_10_2,up_highest=up_highest)
    # a = Sentiment('2021-11-30', 166, 1740, 64, 2, 2899, 1583, 9, 5)
    print(a)
    insert(a)
Example #16
def main():

    # instantiate all the useful classes
    linker = EntityLinker()
    cleaner = Cleaner()
    sentiment = Sentiment()
    utility = Utility()
class Featurizer(object):
    def __init__(self):
        self.sentiment_analyzer = Sentiment('data/AFINN-111.txt')
        self.bow_vectorizer = None
        self.bow_analyzer = None

    def bag_of_words(self, body):
        return self.bow_vectorizer.transform([body]).toarray()

    def text_features(self, comment):
        num_chars = len(comment.get("body"))
        num_links = count_links(comment.get("body"))

        simple_tokens = comment.get("body").split(' ')
        num_words = 0
        avg_word_length = 0
        for token in simple_tokens:
            num_words += 1
            avg_word_length += len(token)
        avg_word_length = float(avg_word_length) / float(num_words)

        sentiment = self.sentiment_analyzer.analyze(
            self.bow_analyzer(comment.get("body")))

        score = comment.get("score")

        return [
            num_chars, num_links, num_words, num_words, avg_word_length,
            sentiment
        ]

    def transform_comment(self, comment):
        return numpy.hstack((numpy.array([self.text_features(comment)],
                                         dtype='float_'),
                             self.bag_of_words(comment.get("body"))))

    def score_comment(self, comment):
        return comment.get("score")

    def transform(self, comments):
        """ Returns a Nx(D+1) numpy matrix of features. The first D columns
        correspond to features, where the final column corresponds to the
        scores of each comment"""

        # if it's a single instance, return an array
        if isinstance(comments, dict):
            return self.transform_comment(comments)

        # http://scikit-learn.org/stable/modules/feature_extraction.html
        self.bow_vectorizer = CountVectorizer(min_df=1)
        self.bow_vectorizer.fit([c.get("body") for c in comments])
        self.bow_analyzer = self.bow_vectorizer.build_analyzer()

        def features_and_label(comment):
            return numpy.hstack((self.transform_comment(comment),
                                 numpy.array([[self.score_comment(comment)]],
                                             dtype='float_')))

        return numpy.vstack([features_and_label(c) for c in comments])
Example #18
    def __init__(self,
                 available_intent,
                 script_file="script.txt",
                 intent_p_dict=INTENT_P_DICT,
                 grammar_p_dict=GRAMMAR_P_DICT):
        self.episode_script = None

        # Init basic episode generator
        with open(os.path.join(DATA_ROOT, script_file), "rb") as f:
            script = json.load(f)

        if "pre_sales" in available_intent.keys():
            self.pre_sales = PreSales(script["pre_sales"],
                                      available_intent["pre_sales"],
                                      intent_p_dict["pre_sales"],
                                      grammar_p_dict["pre_sales"])
        else:
            self.pre_sales = None
        if "in_sales" in available_intent.keys():
            self.in_sales = InSales(script["in_sales"],
                                    available_intent["in_sales"],
                                    intent_p_dict["in_sales"],
                                    grammar_p_dict["in_sales"])
        else:
            self.in_sales = None
        if "after_sales" in available_intent.keys():
            self.after_sales = AfterSales(script["after_sales"],
                                          available_intent["after_sales"],
                                          intent_p_dict["after_sales"])
        else:
            self.after_sales = None
        if "sentiment" in available_intent.keys():
            self.sentiment = Sentiment(script["sentiment"])
        else:
            self.sentiment = None

        # Get available episodes
        self.available_episode = list(available_intent.keys())
        if "pre_sales" in self.available_episode and "in_sales" in self.available_episode:
            self.available_episode.append(" ".join(["pre_sales", "in_sales"]))

        # Knowledge Base
        self.kb_helper = KnowledgeBase()

        # translate agent dialog action according to KB results
        self.translator = Translator()
Example #19
def train():
	print("load data")
	d = Sentiment(data_name, 6)
	print("start_train")
	with tf.device('/gpu:'+FLAGS.gpu):
		m = Model(data_name, d.num_class, embeddings = d.embeddings, size = FLAGS.size, 
			batch_size = FLAGS.batch, dropout = FLAGS.dropout,
			rnn_cell = FLAGS.cell, optimize = FLAGS.opt)
		print("start_fit")
		m.fit(d.train_set, d.test_set, d.train_one_hot, d.test_one_hot, FLAGS.epoch)
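A hedged sketch of the flag definitions this entry point reads; the flag names come from the FLAGS attributes used above, while the TF1-style tf.app.flags module, all defaults, and the data_name value are assumptions:

# Inferred from the FLAGS attributes referenced in train(); defaults are placeholders.
import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_string('gpu', '0', 'GPU index used in tf.device')
flags.DEFINE_integer('size', 128, 'hidden size passed to Model')
flags.DEFINE_integer('batch', 64, 'batch size')
flags.DEFINE_float('dropout', 0.5, 'dropout rate passed to Model')
flags.DEFINE_string('cell', 'lstm', 'RNN cell type')
flags.DEFINE_string('opt', 'adam', 'optimizer name')
flags.DEFINE_integer('epoch', 10, 'number of training epochs')
FLAGS = flags.FLAGS

data_name = 'sst'  # placeholder dataset identifier passed to Sentiment(data_name, 6)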
Example #20
def analysis():
    json = request.get_json()

    params = json.get("params")
    base64_html = json.get("html")

    html = base64.b64decode(base64_html).decode('utf-8')

    response = Sentiment.run(html, params)

    return {"status": "ok", "message": response}
    def __init__(self):
        super().__init__()
        self.storyGenerator = StoryGenerator()
        self.action_buttons = []
        self.inventory_labels = []
        self.out_path = ""

        self.storyGenerator.CHANCE_TO_REMEMBER_ITEM = 0.0
        self.storyGenerator.CHANCE_TO_REMEMBER_PERSON = 0.0

        self.total_nodes = 0
        self.current_node = 0

        self.EMPTY_ACTION = {
            "type": "",
            "action": "",
            "entity": "",
            "sentence": "",
            "simple": "",
            "probability": ""
        }

        self.action_generator = 0

        self.DEBUG_OUT = False
        self.REACT_TO_SENTIMENT = True
        self.MAX_CHARACTERS = -1

        self.sentiment = Sentiment()

        self.GENDER_A = ["unknown", "androgynous", "male", "mostly_male"]

        self.gender_detector = gender.Detector()
        logging.basicConfig(level=logging.DEBUG)
        logging.config.dictConfig({
            'version': 1,
            'disable_existing_loggers': True
        })
        self.logger = logging.getLogger(__name__)
        self.logger.info("Initialization done.")
Example #22
def create_app(spark_context, dataset_path):
    global recommendation_engine
    global content_recommend
    global hot_words
    global sentiment
    recommendation_engine = RecommendationEngine(spark_context, dataset_path)
    content_recommend = ContentRecommend(spark_context, dataset_path)
    hot_words = HotWords(spark_context, dataset_path)
    sentiment = Sentiment(spark_context, dataset_path)
    img = Image(spark_context, dataset_path)
    app = Flask(__name__)
    app.config['JSON_AS_ASCII'] = False
    app.config['JSONIFY_MIMETYPE'] = "application/json;charset=utf-8"
    app.register_blueprint(main)
    return app
Example #23
def api():
    if request.method == 'GET':
        return jsonify(text='This is how you should format your POST data.')
    if request.method == 'POST':
        try:
            f = request.get_json()
        except Exception as e:
            # Should log this error maybe?
            print(e)
            return Errors.bad_request()
        else:
            for text in f:
                print(
                    datetime.datetime.fromtimestamp(timestamp).strftime(
                        '%Y-%m-%d %H:%M:%S'))
                return jsonify(Sentiment.demo_vader_instance(f[text]))
Example #24
    def process(self, media_list_file):
        # Filter the tweets.
        print_green("Filtering movies in {} to {}".format(
            raw_movie_dir, filtered_movie_dir))
        filter.filterAll(media_list_file, raw_movie_dir, filtered_movie_dir)
        # Run the sentiment analysis.
        print_green("Sentiment of movies in {} to {}".format(
            filtered_movie_dir, scored_movie_dir))
        Sentiment().run(filtered_movie_dir, scored_movie_dir)
        # Get the critic scores.
        print_green("Critic score of movies")
        Omdb().critic_scores(media_list_file)
        # Merge the critic scores and sentiment analysis scores into
        # the final data.json output file.
        print_green("Merging sentiment and critic scores")
        merge_gen('critic_scores.json', 'sentiment_scores.json', 'data.json')
Example #25
def mappa_brescia_filtered_sentiment():
    utility = Utility()
    sentiment = Sentiment()
    query = ""
    count = 0
    if (urllib.unquote(str(request.args.get('elemento_stradale'))) == "1"):
        query = query + " or aggregated_type = \'elemento stradale\'"

    if (urllib.unquote(str(request.args.get('sede_scolastica'))) == "1"):
        query = query + " or aggregated_type = \'sede scolastica/culturale\'"

    if (urllib.unquote(str(request.args.get('luogo_turistico'))) == "1"):
        query = query + " or aggregated_type = \'luogo turistico/monumento\'"

    if (urllib.unquote(str(request.args.get('edificio_religioso'))) == "1"):
        query = query + " or aggregated_type = \'edificio religioso\'"

    if (urllib.unquote(str(
            request.args.get('suddivisione_cittadina'))) == "1"):
        query = query + " or aggregated_type = \'suddivisione cittadina'"

    if (urllib.unquote(str(request.args.get('edifici_civili'))) == "1"):
        query = query + " or aggregated_type = \'edificio civile\'"

    if (urllib.unquote(str(request.args.get('fermata_tpl'))) == "1"):
        query = query + " or aggregated_type = \'fermata tpl\'"

    if (urllib.unquote(str(request.args.get('edificio_sanitario'))) == "1"):
        query = query + " or aggregated_type = \'edificio sanitario\'"

    if (urllib.unquote(str(request.args.get('edificio_sportivo'))) == "1"):
        query = query + " or aggregated_type = \'edificio sportivo\'"

    n_citations = urllib.unquote(str(request.args.get('citations')))
    norm = urllib.unquote(str(request.args.get('normalize')))
    circle_size = urllib.unquote(str(request.args.get('circle_size')))

    places = utility.get_places_filtered(1, 0, n_citations, query)

    params = dict()
    params['zoom'] = 13
    params['circle_size'] = circle_size
    params['counters'] = "map.addLayer(circlesLayer);"
    params['sentiment'] = norm
    return render_template('mappa_sentiment.html',
                           places=places,
                           params=params)
Example #26
def get_tweets(query):
    list_of_results = []
    negative_words = []
    positive_words = []
    token = tauth.get_bearer_token()

    response = requests.get('https://api.twitter.com/1.1/search/tweets.json?',
                            headers={"Authorization": "Bearer " + token},
                            params={
                                "q": query,
                                "tweet_mode": "extended",
                                "lang": "pt"
                            })

    if response.status_code != 200:
        raise Exception("Cannot get a tweets (Status Code %d) Message: %s" %
                        (response.status_code, response.text))

    body = response.json()
    if not body['statuses']:
        raise Exception("We don't have tweets to show")
    else:
        for tweet in body['statuses']:
            text = tweet['full_text']
            language = tweet['metadata']['iso_language_code']
            sentiment = Sentiment(text, language)

            result = Result(
                tweet['id'],
                tweet['user']['name'],
                text,
                sentiment.analyze_feeling(),
                tweet['favorite_count'],
                tweet['retweet_count'],
                tweet['created_at'],
            )
            list_of_results.append(result)
            text_wordcloud = text_treatment.treat_for_wordcloud(text)
            token_space = tokenize.WhitespaceTokenizer()
            word_list = token_space.tokenize(text_wordcloud)
            for word in word_list:
                sentiment = Sentiment(word, language)
                analyzed_word = sentiment.analyze_feeling()
                positive_words.append(
                    word) if analyzed_word == 'pos' else negative_words.append(
                        word)

    persist_search(query=query, results=list_of_results)
    return list_of_results, ' '.join(positive_words), ' '.join(negative_words)
Example #27
    def getText(self):
        sentDataList = []
        for item in self.jsonData:
            textSet = set()
            if (self.quotes_dict.keys().__contains__(item['dt'])):
                pass
            else:
                self.quotes_dict[item['dt']] = item['quote']

            if (self.text_dict.keys().__contains__(item['dt'])):
                textSet = self.text_dict[item['dt']]
                textSet.add(item['data']['t'])
                self.text_dict[item['dt']] = textSet
            else:
                textSet.add(item['data']['t'])
                self.text_dict[item['dt']] = textSet
        for k, v in self.text_dict.items():
            quotes = self.quotes_dict[k]
            sentiData = Sentiment(k, v, quotes)
            sentDataList.append(sentiData)

        return sentDataList
Example #28
def main():
    print ("Generating language models....")
    trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE)
    trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE)
        
    trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE)
    trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE)

    testCleanLM = LanguageModel(CLEAN_TEST_FILE)
    testInsultLM = LanguageModel(INSULT_TEST_FILE)

    trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) + ([1] * trainABInsultLM.getDocCount()))
    testLabels = np.array(([0] * testCleanLM.getDocCount()) + ([1] * testInsultLM.getDocCount()))


    ### Just baseline probabilities
    print ("Running baseline....")
    NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM)
    print ("\tTraining NB....") 
    NB.train()
    print ("\tTesting NB....")  
    totalNBMatrix = np.array(NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents()))

    trainMatrix = totalNBMatrix 

    testMatrix = np.array(NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents()))

    # clf = svm.SVC(kernel='linear')
    # print ("\tTraining SVM....")    
    # clf.fit(trainMatrix, trainLabels)
    # print ("\tTesting SVM....") 
    # output1 = clf.predict(testMatrix).tolist()

    ## Baseline + PoS Features
    print ("Running baseline + PoS Features....")
    cleanPosMatrix = trainABCleanLM.getPosMatrix()
    insultPosMatrix = trainABInsultLM.getPosMatrix()

    testCleanPosMatrix = testCleanLM.getPosMatrix()
    testInsultPosMatrix = testInsultLM.getPosMatrix()

    posFeatures = np.array(cleanPosMatrix + insultPosMatrix)
    testPosFeatures = np.array(testCleanPosMatrix + testInsultPosMatrix)
    trainMatrix = np.hstack((trainMatrix, posFeatures))
    testMatrix = np.hstack((testMatrix, testPosFeatures))

    # clf = svm.SVC(kernel='linear')
    # print ("\tTraining SVM....")    
    # clf.fit(trainMatrix, trainLabels)
    # print ("\tTesting SVM....") 
    # output2 = clf.predict(testMatrix).tolist()


    ### Baseline + PoS Features + TF-IDF Features (TODO Arun)
    print("Running baseline + PoS Features + TF-IDF Features")
    # generate list of features with TFIDF, using trainABCleanLM and trainABInsultLM
    # trainMatrix = np.hstack((trainMatrix, the new thing you just generated))
    # do same for testMatrix
    # clf = svm.SVC()
    # print ("\tTraining SVM....")  
    # clf.fit(trainMatrix, trainLabels)
    # print ("\tTesting SVM....")   
    # output3 = clf.predict(testMatrix).tolist()    
    # then update the output_file.txt thing below


    tfidf_train_features = tfidf.make_feature_vectors(trainAACleanLM,
            trainAAInsultLM, trainABCleanLM, trainABInsultLM)

    tfidf_test_features = tfidf.make_feature_vectors(trainAACleanLM,
            trainAAInsultLM, testCleanLM, testInsultLM)

    print tfidf_test_features.shape, tfidf_train_features.shape
    print testMatrix.shape, trainMatrix.shape

    trainMatrix = np.hstack((trainMatrix, tfidf_train_features))
    testMatrix = np.hstack((testMatrix, tfidf_test_features))


    # clf = svm.SVC(kernel='linear')
    # print ("\tTraining SVM....")  
    # clf.fit(trainMatrix, trainLabels)
    # print ("\tTesting SVM....")   
    # output3 = clf.predict(testMatrix).tolist()  

    ### SENTIMENT ###
    print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features")
    s = Sentiment()
    clean_train = np.array(s.get_clean_train_vector())
    insult_train = np.array(s.get_insult_train_vector())
    sentiment_train_features = np.concatenate((clean_train, insult_train), axis=0)
    shape = sentiment_train_features.shape
    sentiment_train_features = sentiment_train_features.reshape((shape[0], 1))
    print sentiment_train_features.shape

    clean_test = np.array(s.get_clean_test_vector())
    insult_test = np.array(s.get_insult_test_vector())
    sentiment_test_features = np.concatenate((clean_test, insult_test), axis=0)
    shape = sentiment_test_features.shape
    sentiment_test_features = sentiment_test_features.reshape((shape[0], 1))
    print sentiment_test_features.shape

    trainMatrix = np.hstack((trainMatrix, sentiment_train_features))
    testMatrix = np.hstack((testMatrix, sentiment_test_features))

    # clf = svm.SVC(kernel='linear')
    # print ("\tTraining SVM....")  
    # clf.fit(trainMatrix, trainLabels)
    # print ("\tTesting SVM....")   
    # output4 = clf.predict(testMatrix).tolist()  

    ### MISSPELLINGS ###
    print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features + Misspellings features")
    m = Misspellings()
    clean_train = np.array(m.get_clean_misspellings(False))
    insult_train = np.array(m.get_insult_misspellings(False))
    misspellings_train_features = np.concatenate((clean_train, insult_train), axis=0)
    shape = misspellings_train_features.shape
    misspellings_train_features = misspellings_train_features.reshape((shape[0], 1))
    print misspellings_train_features.shape

    clean_test = np.array(m.get_clean_misspellings())
    insult_test = np.array(m.get_insult_misspellings())
    misspellings_test_features = np.concatenate((clean_test, insult_test), axis=0)
    shape = misspellings_test_features.shape
    misspellings_test_features = misspellings_test_features.reshape((shape[0], 1))
    print misspellings_test_features.shape

    trainMatrix = np.hstack((trainMatrix, misspellings_train_features))
    testMatrix = np.hstack((testMatrix, misspellings_test_features))

    clf = svm.SVC(kernel='linear')
    print ("\tTraining SVM....")  
    clf.fit(trainMatrix, trainLabels)
    print ("\tTesting SVM....")   
    output5 = clf.predict(testMatrix).tolist()  


    index_shuf = range(len(trainMatrix))
    trainMatrix_shuf = []
    trainLabel_shuf = []
    shuffle(index_shuf)
    for i in index_shuf:
        trainMatrix_shuf.append(trainMatrix[i])
        trainLabel_shuf.append(trainLabels[i])

    train_sizes, train_scores, valid_scores = learning_curve(svm.SVC(), trainMatrix_shuf, trainLabel_shuf, train_sizes=[100, 300, 500, 700, 900], cv=2)
    average_train_scores = [sum(i)/float(len(i)) for i in train_scores]
    average_valid_scores = [sum(i)/float(len(i)) for i in valid_scores]
    plt.plot(train_sizes, average_train_scores)
    plt.plot(train_sizes, average_valid_scores)
    plt.legend(['Training score', 'Cross-validation score'], loc='center left', bbox_to_anchor=(0.85, 0.5))
    plt.ylabel('Score')
    plt.xlabel('Training examples')
    plt.show()
    
    # with open('SVM_output_file_with_SB.txt', 'w+') as f:
    #     f.write("Output 1\n")
    #     f.write("{}\n".format(output1))
    #     interpret_results(output1, testLabels, f)
    #     f.write("\nOutput 2\n") 
    #     f.write("{}\n".format(output2))
    #     interpret_results(output2, testLabels, f)
    #     f.write("\nOutput 3\n") 
    #     f.write("{}\n".format(output3))
    #     interpret_results(output3, testLabels, f)
    #     f.write("Output 4\n")
    #     f.write("{}\n".format(output4))
    #     interpret_results(output4, testLabels, f)
    #     f.write("Output 5\n")
    #     f.write("{}\n".format(output5))
    #     interpret_results(output5, testLabels, f)

    get_pca_graph(trainMatrix, trainLabels, "train_pca.png", title="PCA of Training Set")
    get_pca_graph(testMatrix, testLabels, "test_pca.png", title="PCA of Test Set")
    get_pca_graph(trainMatrix, trainLabels, "train_pca2.png", title="PCA of Training Set (Insults Only)", plot_negative=False)
    get_pca_graph(testMatrix, testLabels, "test_pca2.png", title="PCA of Test Set (Insults Only)", plot_negative=False)
Example #29
def main():
    print("Generating language models....")
    trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE)
    trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE)

    trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE)
    trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE)

    testCleanLM = LanguageModel(CLEAN_TEST_FILE)
    testInsultLM = LanguageModel(INSULT_TEST_FILE)

    trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) +
                           ([1] * trainABInsultLM.getDocCount()))
    testLabels = np.array(([0] * testCleanLM.getDocCount()) +
                          ([1] * testInsultLM.getDocCount()))

    ### Just baseline probabilities
    print("Running baseline....")
    NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM)
    print("\tTraining NB....")
    NB.train()
    print("\tTesting NB....")
    totalNBMatrix = np.array(
        NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents()))

    trainMatrix = totalNBMatrix

    testMatrix = np.array(
        NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents()))

    clf = RandomForestClassifier()
    print("\tTraining random forest....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting random forest....")
    output1 = clf.predict(testMatrix).tolist()

    ### Baseline + PoS Features
    print("Running baseline + PoS Features....")
    cleanPosMatrix = trainABCleanLM.getPosMatrix()
    insultPosMatrix = trainABInsultLM.getPosMatrix()

    testCleanPosMatrix = testCleanLM.getPosMatrix()
    testInsultPosMatrix = testInsultLM.getPosMatrix()

    posFeatures = np.array(cleanPosMatrix + insultPosMatrix)
    testPosFeatures = np.array(testCleanPosMatrix + testInsultPosMatrix)
    trainMatrix = np.hstack((trainMatrix, posFeatures))
    testMatrix = np.hstack((testMatrix, testPosFeatures))

    clf = RandomForestClassifier()
    print("\tTraining random forest....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting random forest....")
    output2 = clf.predict(testMatrix).tolist()

    ### Baseline + PoS Features + TF-IDF Features (TODO Arun)
    print("Running baseline + PoS Features + TF-IDF Features")
    # generate list of features with TFIDF, using trainABCleanLM and trainABInsultLM
    # trainMatrix = np.hstack((trainMatrix, the new thing you just generated))
    # do same for testMatrix
    # clf = svm.SVC()
    # print ("\tTraining SVM....")
    # clf.fit(trainMatrix, trainLabels)
    # print ("\tTesting SVM....")
    # output3 = clf.predict(testMatrix).tolist()
    # then update the output_file.txt thing below

    tfidf_train_features = tfidf.make_feature_vectors(trainAACleanLM,
                                                      trainAAInsultLM,
                                                      trainABCleanLM,
                                                      trainABInsultLM)

    tfidf_test_features = tfidf.make_feature_vectors(trainAACleanLM,
                                                     trainAAInsultLM,
                                                     testCleanLM, testInsultLM)

    print tfidf_test_features.shape, tfidf_train_features.shape
    print testMatrix.shape, trainMatrix.shape

    trainMatrix = np.hstack((trainMatrix, tfidf_train_features))
    testMatrix = np.hstack((testMatrix, tfidf_test_features))

    clf = RandomForestClassifier()
    print("\tTraining random forest....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting random forest....")
    output3 = clf.predict(testMatrix).tolist()

    ### SENTIMENT ###
    print(
        "Running baseline + PoS Features + TF-IDF Features + Sentiment Features"
    )
    s = Sentiment()
    clean_train = np.array(s.get_clean_train_vector())
    insult_train = np.array(s.get_insult_train_vector())
    sentiment_train_features = np.concatenate((clean_train, insult_train),
                                              axis=0)
    shape = sentiment_train_features.shape
    sentiment_train_features = sentiment_train_features.reshape((shape[0], 1))
    print sentiment_train_features.shape

    clean_test = np.array(s.get_clean_test_vector())
    insult_test = np.array(s.get_insult_test_vector())
    sentiment_test_features = np.concatenate((clean_test, insult_test), axis=0)
    shape = sentiment_test_features.shape
    sentiment_test_features = sentiment_test_features.reshape((shape[0], 1))
    print sentiment_test_features.shape

    trainMatrix = np.hstack((trainMatrix, sentiment_train_features))
    testMatrix = np.hstack((testMatrix, sentiment_test_features))

    clf = RandomForestClassifier()
    print("\tTraining random forest....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting random forest....")
    output4 = clf.predict(testMatrix).tolist()

    ### MISSPELLINGS ###
    print(
        "Running baseline + PoS Features + TF-IDF Features + Sentiment Features + Misspellings features"
    )
    m = Misspellings()
    clean_train = np.array(m.get_clean_misspellings(False))
    insult_train = np.array(m.get_insult_misspellings(False))
    misspellings_train_features = np.concatenate((clean_train, insult_train),
                                                 axis=0)
    shape = misspellings_train_features.shape
    misspellings_train_features = misspellings_train_features.reshape(
        (shape[0], 1))
    print misspellings_train_features.shape

    clean_test = np.array(m.get_clean_misspellings())
    insult_test = np.array(m.get_insult_misspellings())
    misspellings_test_features = np.concatenate((clean_test, insult_test),
                                                axis=0)
    shape = misspellings_test_features.shape
    misspellings_test_features = misspellings_test_features.reshape(
        (shape[0], 1))
    print misspellings_test_features.shape

    trainMatrix = np.hstack((trainMatrix, misspellings_train_features))
    testMatrix = np.hstack((testMatrix, misspellings_test_features))

    clf = RandomForestClassifier()
    print("\tTraining random forest....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting random forest....")
    output5 = clf.predict(testMatrix).tolist()

    with open('RANDOM_FOREST_output_file_without_SB.txt', 'w+') as f:
        f.write("Output 1\n")
        f.write("{}\n".format(output1))
        interpret_results(output1, testLabels, f)
        f.write("\nOutput 2\n")
        f.write("{}\n".format(output2))
        interpret_results(output2, testLabels, f)
        f.write("\nOutput 3\n")
        f.write("{}\n".format(output3))
        interpret_results(output3, testLabels, f)
        f.write("Output 4\n")
        f.write("{}\n".format(output4))
        interpret_results(output4, testLabels, f)
        f.write("Output 5\n")
        f.write("{}\n".format(output5))
        interpret_results(output5, testLabels, f)
Example #30
from bs4 import BeautifulSoup
import os
from sentiment import Sentiment
import csv

if __name__ == '__main__':
    files = os.listdir('./zagat')
    csvFile = open('out.csv','w')
    writer = csv.writer(csvFile)
    sentiment = Sentiment()
    for f in files:
        with open('./zagat/'+f) as review:
            soup = BeautifulSoup(review)
            review_city = soup.find(itemprop='addressLocality').text
            review_state = soup.find(itemprop='addressRegion').text
            review_text = soup.find(itemprop='reviewBody').text
            review_sent = sentiment.getSentiment(review_text)
            writer.writerow([str(review_sent),review_state,review_city])
    csvFile.close()

 def __init__(self):
     self.sentiment_analyzer = Sentiment('data/AFINN-111.txt')
     self.bow_vectorizer = None
     self.bow_analyzer = None
def create_classifier():
    dataset = load_dataset()
    X_train, Y_train = dataset
    classifier = Sentiment()
    classifier.fit(X_train, Y_train)
    return classifier
 def __init__(self):
     self.sentiment_analyzer = Sentiment('data/AFINN-111.txt')
     self.bow_vectorizer = None
     self.bow_analyzer = None
Example #34
        :param user: user name
        :param last_seen: last seen index of a message
        :return: list of messaged and the latest message index
        """
        if user in self._history:
            user_messages = self._history[user]
            messages = user_messages[last_seen + 1:]
            return messages, len(user_messages) - 1
        else:
            return [], -1


""" Singleton for in-RAM storage of all users' chat histories. """
g_message_history = MessageHistory()
""" Sentiment analysis neural network. """
g_sentiment = Sentiment()


@Request.application
def application(request):
    """
    Werkzeug application to process web requests.
    :param request: inbound HTTP request
    :return: Response object
    """

    global g_message_history

    print("Got request", request)
    if request.method == 'POST':
        """ This section parses a chat message sent by user. """
from search import Search
from sentiment import Sentiment
import logger

log = logger.init_logger('sentiment.log')

template_folder = os.path.dirname(__file__)
template_folder = os.path.join(template_folder, 'templates')

app = Flask('OpinionRetrieval', template_folder=template_folder)

search_engine = Search('TwitterAuthToken.json')

classifier_path = 'sentiment_classifier.pickle'
sentiment = Sentiment()
sentiment.load(classifier_path)


@app.route('/')
def home():
    return render_template('index.html')


@app.route('/search', methods=['POST'])
def search():
    search_text = request.form['search_text']
    tweets = search_engine.query(search_text)

    prediction = []
    sentiment_prediction = sentiment.get_sentiment(
def run_quickstart(file_name):
    SpeechtoText = {}
    Output = []
    Chunkfile = []
    # [START speech_quickstart]
    import io
    import os
    # Imports the Google Cloud client library
    # [START speech_python_migration_imports]
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    # [END speech_python_migration_imports]
    # Instantiates a client
    # [START speech_python_migration_client]
    client = speech.SpeechClient()
    # [END speech_python_migration_client]
    # The name of the audio file to transcribe
    # file_name = os.path.join(
    # os.path.dirname(__file__),
    # 'resources',
    # 'audio.raw')
    #file_name='Vaishali_1_Hate.mp3'
    # Loads the audio into memory
    from pydub import AudioSegment
    from pydub.utils import make_chunks
    myaudio = AudioSegment.from_file(file_name, "wav")
    chunk_length_ms = 20000  # pydub works in milliseconds
    chunks = make_chunks(myaudio, chunk_length_ms)  # make 20-second chunks
    #Export all of the individual chunks as wav files
    for i, chunk in enumerate(chunks):
        chunk_name = "chunk{0}.wav".format(i)
        print("exporting", chunk_name)
        Chunkfile.append('../CutAudio/' + chunk_name)
        chunk.export('../CutAudio/' + chunk_name, format="wav")
    #print("Chunkfile",Chunkfile)
    for i in Chunkfile:
        with io.open(i, 'rb') as audio_file:
            content = audio_file.read()
            audio = types.RecognitionAudio(content=content)
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code='en-US',
            use_enhanced=True,
            model='phone_call')
        #print("config",config)
        # Detects speech in the audio file
        response = client.recognize(config, audio)
        #print("response",type(response))
        for result in response.results:
            print('Transcript: {}'.format(result.alternatives[0].transcript))
            output = result.alternatives[0].transcript
            #print("output",output)
            Output.append(output)
    Chunkfile.clear()
    #print("Output",type(Output))
    #from toxicCommentPrediction import toxicity_level
    from sentiment import Sentiment
    #from DisplayPrediction import DisplayOutput
    listToStr = ' '.join([str(elem) for elem in Output])
    prediction = Sentiment(listToStr)
    #prediction=toxicity_level(listToStr)
    SpeechtoText[listToStr] = prediction
    #print("SpeechtoText",SpeechtoText)
    #ToxicityLevel=DisplayOutput(SpeechtoText)
    Output.clear()
    SpeechtoText.clear()
    return prediction, listToStr
Example #37
def main():
    print ("Generating language models....")
    trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE)
    trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE)
        
    trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE)
    trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE)

    testCleanLM = LanguageModel(CLEAN_TEST_FILE)
    testInsultLM = LanguageModel(INSULT_TEST_FILE)

    trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) + ([1] * trainABInsultLM.getDocCount()))
    testLabels = np.array(([0] * testCleanLM.getDocCount()) + ([1] * testInsultLM.getDocCount()))


    ### Just baseline probabilities
    print ("Running baseline....")
    NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM)
    print ("\tTraining NB....") 
    NB.train()
    print ("\tTesting NB....")  
    totalNBMatrix = np.array(NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents()))

    trainMatrix = totalNBMatrix 

    testMatrix = np.array(NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents()))

    clf = LogisticRegression()
    print("\tTraining logistic regression....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting logistic regression....")
    output1 = clf.predict(testMatrix).tolist()


    ### Baseline + PoS Features
    print ("Running baseline + PoS Features....")
    cleanPosMatrix = trainABCleanLM.getPosMatrix()
    insultPosMatrix = trainABInsultLM.getPosMatrix()

    testCleanPosMatrix = testCleanLM.getPosMatrix()
    testInsultPosMatrix = testInsultLM.getPosMatrix()

    posFeatures = np.array(cleanPosMatrix + insultPosMatrix)
    testPosFeatures = np.array(testCleanPosMatrix + testInsultPosMatrix)
    trainMatrix = np.hstack((trainMatrix, posFeatures))
    testMatrix = np.hstack((testMatrix, testPosFeatures))

    clf = LogisticRegression()
    print("\tTraining logistic regression....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting logistic regression....")
    output2 = clf.predict(testMatrix).tolist()


    ### Baseline + PoS Features + TF-IDF Features (TODO Arun)
    print("Running baseline + PoS Features + TF-IDF Features")
    # generate list of features with TFIDF, using trainABCleanLM and trainABInsultLM
    # trainMatrix = np.hstack((trainMatrix, the new thing you just generated))
    # do same for testMatrix
    # clf = svm.SVC()
    # print ("\tTraining SVM....")  
    # clf.fit(trainMatrix, trainLabels)
    # print ("\tTesting SVM....")   
    # output3 = clf.predict(testMatrix).tolist()    
    # then update the output_file.txt thing below


    tfidf_train_features = tfidf.make_feature_vectors(trainAACleanLM,
            trainAAInsultLM, trainABCleanLM, trainABInsultLM)

    tfidf_test_features = tfidf.make_feature_vectors(trainAACleanLM,
            trainAAInsultLM, testCleanLM, testInsultLM)

    print tfidf_test_features.shape, tfidf_train_features.shape
    print testMatrix.shape, trainMatrix.shape

    trainMatrix = np.hstack((trainMatrix, tfidf_train_features))
    testMatrix = np.hstack((testMatrix, tfidf_test_features))


    clf = LogisticRegression()
    print("\tTraining logistic regression....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting logistic regression....")
    output3 = clf.predict(testMatrix).tolist()  

    ### SENTIMENT ###
    print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features")
    s = Sentiment()
    clean_train = np.array(s.get_clean_train_vector())
    insult_train = np.array(s.get_insult_train_vector())
    sentiment_train_features = np.concatenate((clean_train, insult_train), axis=0)
    shape = sentiment_train_features.shape
    sentiment_train_features = sentiment_train_features.reshape((shape[0], 1))
    print sentiment_train_features.shape

    clean_test = np.array(s.get_clean_test_vector())
    insult_test = np.array(s.get_insult_test_vector())
    sentiment_test_features = np.concatenate((clean_test, insult_test), axis=0)
    shape = sentiment_test_features.shape
    sentiment_test_features = sentiment_test_features.reshape((shape[0], 1))
    print sentiment_test_features.shape

    trainMatrix = np.hstack((trainMatrix, sentiment_train_features))
    testMatrix = np.hstack((testMatrix, sentiment_test_features))

    clf = LogisticRegression()
    print("\tTraining logistic regression....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting logistic regression....")
    output4 = clf.predict(testMatrix).tolist()  

    ### MISSPELLINGS ###
    print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features + Misspellings features")
    m = Misspellings()
    clean_train = np.array(m.get_clean_misspellings(False))
    insult_train = np.array(m.get_insult_misspellings(False))
    misspellings_train_features = np.concatenate((clean_train, insult_train), axis=0)
    shape = misspellings_train_features.shape
    misspellings_train_features = misspellings_train_features.reshape((shape[0], 1))
    print misspellings_train_features.shape

    clean_test = np.array(m.get_clean_misspellings())
    insult_test = np.array(m.get_insult_misspellings())
    misspellings_test_features = np.concatenate((clean_test, insult_test), axis=0)
    shape = misspellings_test_features.shape
    misspellings_test_features = misspellings_test_features.reshape((shape[0], 1))
    print misspellings_test_features.shape

    trainMatrix = np.hstack((trainMatrix, misspellings_train_features))
    testMatrix = np.hstack((testMatrix, misspellings_test_features))

    clf = LogisticRegression()
    print("\tTraining logistic regression....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting logistic regression....")
    output5 = clf.predict(testMatrix).tolist()

    with open('LOG_REG_output_file_w_SB.txt', 'w+') as f:
        f.write("Output 1\n")
        f.write("{}\n".format(output1))
        interpret_results(output1, testLabels, f)
        f.write("\nOutput 2\n") 
        f.write("{}\n".format(output2))
        interpret_results(output2, testLabels, f)
        f.write("\nOutput 3\n") 
        f.write("{}\n".format(output3))
        interpret_results(output3, testLabels, f)
        f.write("Output 4\n")
        f.write("{}\n".format(output4))
        interpret_results(output4, testLabels, f)
        f.write("Output 5\n")
        f.write("{}\n".format(output5))
        interpret_results(output5, testLabels, f)
Example #38
from sentiment import Sentiment


def load_docs(neg_file, pos_file):
    neg = open(neg_file, 'r', encoding='utf-8').readlines()
    pos = open(pos_file, 'r', encoding='utf-8').readlines()
    neg_docs = []
    pos_docs = []

    for line in neg:
        neg_docs.append(line.rstrip("\r\n"))
    for line in pos:
        pos_docs.append(line.rstrip("\r\n"))

    return neg_docs, pos_docs


if __name__ == '__main__':
    sentiment = Sentiment()
    neg_docs, pos_docs = load_docs('neg.txt', 'pos.txt')

    sentiment.train(neg_docs, pos_docs)

    text = '这个东西真心很赞'
    prob = sentiment.classify(text)

    print('prob: {}'.format(prob))