class Featurizer(object):
    """Builds feature matrices for comment dicts: hand-crafted text
    features plus a bag-of-words representation, with the comment score
    appended as the final (label) column by transform()."""

    def __init__(self):
        # AFINN-based sentiment scorer; path is relative to the CWD.
        self.sentiment_analyzer = Sentiment('data/AFINN-111.txt')
        # Both are fitted lazily in transform() over the full corpus.
        self.bow_vectorizer = None
        self.bow_analyzer = None

    def bag_of_words(self, body):
        """Return a 1xV dense count vector for a single comment body.

        Requires transform() to have fitted self.bow_vectorizer first.
        """
        return self.bow_vectorizer.transform([body]).toarray()

    def text_features(self, comment):
        """Return a list of simple numeric features for one comment dict."""
        body = comment.get("body")
        num_chars = len(body)
        num_links = count_links(body)

        # Naive whitespace tokenization; split(' ') always yields at
        # least one token, so the division below cannot hit zero.
        simple_tokens = body.split(' ')
        num_words = len(simple_tokens)
        avg_word_length = (float(sum(len(token) for token in simple_tokens))
                           / float(num_words))

        sentiment = self.sentiment_analyzer.analyze(
            self.bow_analyzer(body))

        # NOTE(review): num_words appears twice; kept as-is to preserve
        # the feature-matrix width downstream consumers expect -- confirm
        # whether a different feature was intended here.
        return [num_chars, num_links, num_words, num_words,
                avg_word_length, sentiment]

    def transform_comment(self, comment):
        """Return a 1xD float row: text features then bag-of-words counts."""
        return numpy.hstack((
            numpy.array([self.text_features(comment)],
                        dtype='float_'),
            self.bag_of_words(comment.get("body"))))

    def score_comment(self, comment):
        """Return the comment's score (used as the label)."""
        return comment.get("score")

    def transform(self, comments):
        """ Returns a Nx(D+1) numpy matrix of features. The first D columns
        correspond to features, where the final column corresponds to the
        scores of each comment"""

        # Single instance: return its feature row (no label column).
        # BUG FIX: was `transform_comment(comments)` without self.,
        # which raised NameError whenever a dict was passed.
        if isinstance(comments, dict):
            return self.transform_comment(comments)

        # http://scikit-learn.org/stable/modules/feature_extraction.html
        self.bow_vectorizer = CountVectorizer(min_df=1)
        self.bow_vectorizer.fit([c.get("body") for c in comments])
        self.bow_analyzer = self.bow_vectorizer.build_analyzer()

        def features_and_label(comment):
            # One row: D feature columns plus the score as the last column.
            return numpy.hstack((
                self.transform_comment(comment),
                numpy.array([[self.score_comment(comment)]],
                            dtype='float_')))

        return numpy.vstack([features_and_label(c)
                             for c in comments])
class Featurizer(object):
    """Builds feature matrices for comment dicts: hand-crafted text
    features plus a bag-of-words representation, with the comment score
    appended as the final (label) column by transform()."""

    def __init__(self):
        # AFINN-based sentiment scorer; path is relative to the CWD.
        self.sentiment_analyzer = Sentiment('data/AFINN-111.txt')
        # Both are fitted lazily in transform() over the full corpus.
        self.bow_vectorizer = None
        self.bow_analyzer = None

    def bag_of_words(self, body):
        """Return a 1xV dense count vector for a single comment body.

        Requires transform() to have fitted self.bow_vectorizer first.
        """
        return self.bow_vectorizer.transform([body]).toarray()

    def text_features(self, comment):
        """Return a list of simple numeric features for one comment dict."""
        body = comment.get("body")
        num_chars = len(body)
        num_links = count_links(body)

        # Naive whitespace tokenization; split(' ') always yields at
        # least one token, so the division below cannot hit zero.
        simple_tokens = body.split(' ')
        num_words = len(simple_tokens)
        avg_word_length = (float(sum(len(token) for token in simple_tokens))
                           / float(num_words))

        sentiment = self.sentiment_analyzer.analyze(
            self.bow_analyzer(body))

        # NOTE(review): num_words appears twice; kept as-is to preserve
        # the feature-matrix width downstream consumers expect -- confirm
        # whether a different feature was intended here.
        return [
            num_chars, num_links, num_words, num_words, avg_word_length,
            sentiment
        ]

    def transform_comment(self, comment):
        """Return a 1xD float row: text features then bag-of-words counts."""
        return numpy.hstack((numpy.array([self.text_features(comment)],
                                         dtype='float_'),
                             self.bag_of_words(comment.get("body"))))

    def score_comment(self, comment):
        """Return the comment's score (used as the label)."""
        return comment.get("score")

    def transform(self, comments):
        """ Returns a Nx(D+1) numpy matrix of features. The first D columns
        correspond to features, where the final column corresponds to the
        scores of each comment"""

        # Single instance: return its feature row (no label column).
        # BUG FIX: was `transform_comment(comments)` without self.,
        # which raised NameError whenever a dict was passed.
        if isinstance(comments, dict):
            return self.transform_comment(comments)

        # http://scikit-learn.org/stable/modules/feature_extraction.html
        self.bow_vectorizer = CountVectorizer(min_df=1)
        self.bow_vectorizer.fit([c.get("body") for c in comments])
        self.bow_analyzer = self.bow_vectorizer.build_analyzer()

        def features_and_label(comment):
            # One row: D feature columns plus the score as the last column.
            return numpy.hstack((self.transform_comment(comment),
                                 numpy.array([[self.score_comment(comment)]],
                                             dtype='float_')))

        return numpy.vstack([features_and_label(c) for c in comments])
# --- Example 3 (0 votes) ---
def GetSubjScoreForSingleSent(SentIndex, Sentence):
    """Analyze one sentence; return {sentence: {'sentiment': ..., 'score': ...}}.

    SentIndex is used only for progress/error reporting. Returns an empty
    dict on any analysis failure (best-effort contract preserved).
    """
    T0 = time()
    try:
        SentAnalyzer = Sentiment()
        ResDict = SentAnalyzer.analyze([Sentence])
        pprint(ResDict)
        # Map each analyzed sentence to its sentiment label and score,
        # pairing parallel lists by index.
        SentSentimentPol = {
            S: {'sentiment': ResDict['sentiments'][I],
                'score': ResDict['scores'][I]}
            for I, S in enumerate(ResDict['sentences'])
        }
        # BUG FIX: Python 2 print statements converted to print() calls
        # (same output, now valid Python 3 as well).
        print('processed sentence number {} in {} sec'.format(
            SentIndex, round(time() - T0)))
        return SentSentimentPol
    except Exception:
        # BUG FIX: narrowed from a bare except so KeyboardInterrupt and
        # SystemExit still propagate; failures remain non-fatal.
        print('failed to process sentence number ', SentIndex)
        return {}
# --- Example 4 (0 votes) ---
def ProcessSingleFile(FName, MailNumber):
    """Analyze one email file; return non-neutral (sentence, sentiment, score) tuples.

    MailNumber is used only for progress/error reporting. Returns an empty
    list if reading or analysis fails (best-effort contract preserved).
    """
    T0 = time()
    try:
        # BUG FIX: the file handle was leaked via open(FName).read();
        # a context manager guarantees it is closed.
        with open(FName) as FHandle:
            EmailText = [FHandle.read()]
        SentAnalyzer = Sentiment()
        Res = SentAnalyzer.analyze(EmailText)
        pprint(Res)
        SentSentimentPol = zip(Res['sentences'], Res['sentiments'],
                               Res['scores'])
        # Keep only sentences whose sentiment label is not 'neutral'.
        SentSentimentPol = [ThreeTuple for ThreeTuple in SentSentimentPol
                            if ThreeTuple[1] != 'neutral']
        # BUG FIX: Python 2 print statements converted to print() calls
        # (same output, now valid Python 3 as well).
        print('processed email {} in {} sec'.format(
            MailNumber, round(time() - T0)))
        return SentSentimentPol
    except Exception:
        # BUG FIX: narrowed from a bare except so KeyboardInterrupt and
        # SystemExit still propagate; failures remain non-fatal.
        print('failed to process email ', MailNumber)
        return []