Ejemplo n.º 1
0
    def get_discourse_markers(self):
        """Add the ``<type>_DiscourseMarkers`` ratio feature to every instance.

        If a feature file for this model and function already exists under
        ``self.iC.featurePath`` it is loaded and nothing is recomputed.
        Otherwise, for each instance, the occurrences of every entry of
        ``self.discourseMarkersList`` inside ``instance.text`` are counted
        (substring matches via ``count``) and divided by the number of
        tokens; instances without tokens get 0.0.  The features are then
        written to the feature file.
        """
        featureNames = [self.type + "_DiscourseMarkers"]
        functionName = "get_discourse_markers"
        featureFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(featureFile):
            utils.load_features_from_file(featureFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        for instance in self.iC.instances:
            content = instance.text
            nwords = len(instance.tokens)
            nMarkers = sum(content.count(marker)
                           for marker in self.discourseMarkersList)

            ratio = 0.0
            if nwords > 0:
                # float() forces true division; a plain int / int would be
                # truncated wherever Python 2 division rules apply.
                ratio = nMarkers / float(nwords)

            instance.addFeature(self.type, self.type + "_DiscourseMarkers",
                                ratio)

        utils.save_features_to_file(featureFile, featureNames, self.iC,
                                    self.type)
Ejemplo n.º 2
0
    def get_interjections(self):
        """Add the ``<type>_Interjections`` ratio feature to every instance.

        A previously saved feature file for this model/function is loaded
        instead of recomputing when it exists.  Otherwise each entry of
        ``self.interjections`` is lower-cased and counted as a substring of
        ``instance.text``; the total is divided by the number of tokens
        (0.0 when there are no tokens) and the result is saved.
        """
        functionName = "get_interjections"
        featureLabel = self.type + "_Interjections"
        featureNames = [featureLabel]
        storedFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(storedFile):
            utils.load_features_from_file(storedFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        for instance in self.iC.instances:
            text = instance.text
            tokenCount = len(instance.tokens)

            # Entries that never occur contribute 0 to the sum.
            hits = sum(text.count(item.lower())
                       for item in self.interjections)

            if tokenCount > 0:
                value = hits / float(tokenCount)
            else:
                value = 0.0

            instance.addFeature(self.type, featureLabel, value)

        utils.save_features_to_file(storedFile, featureNames, self.iC,
                                    self.type)
Ejemplo n.º 3
0
    def get_numbers(self):
        """Add the ``<type>_Numbers`` feature: digits per character of text.

        A previously saved feature file for this model/function is loaded
        instead of recomputing when it exists.  Otherwise every single digit
        character (``[0-9]``) in ``instance.text`` is counted and divided by
        the text length; empty texts get 0.0.  The features are then saved.
        """
        featureNames = [self.type + "_Numbers"]
        functionName = "get_numbers"
        featureFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(featureFile):
            utils.load_features_from_file(featureFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        digitPattern = re.compile("[0-9]")
        for instance in self.iC.instances:
            text = instance.text
            nchars = len(text)
            ratio = 0.0

            if nchars > 0:
                # float() forces true division; int / int would be truncated
                # wherever Python 2 division rules apply.
                ratio = len(digitPattern.findall(text)) / float(nchars)

            instance.addFeature(self.type, self.type + "_Numbers", ratio)

        utils.save_features_to_file(featureFile, featureNames, self.iC,
                                    self.type)
    def compute_discourse_features(self):
        """Compute the discourse-tree features of every instance.

        A previously saved feature file for this model/function is loaded
        instead of recomputing when it exists.  Otherwise a
        ``DiscourseTreeOperations`` is built from ``instance.discourse`` and
        passed, together with the sentence count, to ``get_shape_features``
        and ``get_discourse_relation_usage``.  After all instances are done,
        ``adjust_features`` is called and the features named in
        ``self.allDeps`` are saved.
        """
        functionName = "compute_discourse_features"
        storedFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(storedFile):
            utils.load_features_from_file(storedFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        total = len(self.iC.instances)
        for done, instance in enumerate(self.iC.instances, 1):
            tree = DiscourseTreeOperations(instance.discourse)
            sentenceCount = len(instance.sentences)

            self.get_shape_features(tree, sentenceCount, instance)
            self.get_discourse_relation_usage(tree, sentenceCount, instance)

            # Progress report, one line per processed instance.
            print("processed " + str(done) + " of " + str(total))

        self.adjust_features()
        utils.save_features_to_file(storedFile, self.allDeps, self.iC,
                                    self.type)
Ejemplo n.º 5
0
    def get_chars_per_word(self):
        """Add the ``<type>_CharsPerWord`` feature: mean token length.

        A previously saved feature file for this model/function is loaded
        instead of recomputing when it exists.  Otherwise the lengths of all
        entries of ``instance.tokens`` are summed and divided by the number
        of tokens; instances without tokens get 0.0.  The features are then
        saved.
        """
        featureNames = [self.type + "_CharsPerWord"]
        functionName = "get_chars_per_word"
        featureFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(featureFile):
            utils.load_features_from_file(featureFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        for instance in self.iC.instances:
            lWords = instance.tokens
            nwords = len(lWords)
            ratio = 0.0

            ncharsword = sum(len(word) for word in lWords)

            if nwords > 0:
                # float() keeps the fractional part of the average; int / int
                # would be truncated wherever Python 2 division rules apply.
                ratio = ncharsword / float(nwords)

            instance.addFeature(self.type, self.type + "_CharsPerWord", ratio)

        utils.save_features_to_file(featureFile, featureNames, self.iC,
                                    self.type)
Ejemplo n.º 6
0
    def get_acronyms(self):
        """Add the ``<type>_Acronyms`` feature: acronym-like tokens per token.

        A previously saved feature file for this model/function is loaded
        instead of recomputing when it exists.  Otherwise a token is counted
        when it starts with an upper-case letter followed by three characters
        that are each a digit, an upper-case letter or a dot, and it does not
        end in ``:`` or ``,``.  The count is divided by the number of tokens
        (0.0 when there are none) and the features are saved.
        """
        featureNames = [self.type + "_Acronyms"]
        functionName = "get_acronyms"
        featureFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(featureFile):
            utils.load_features_from_file(featureFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        # Compiled once; raw string so the backslash reaches the regex engine
        # without relying on Python passing unknown escapes through.
        acronymPattern = re.compile(r'(^[A-Z]([0-9]|[A-Z]|\.){3})')

        for instance in self.iC.instances:
            words = instance.tokens
            nwords = len(words)
            nacr = 0
            ratio = 0.0

            for word in words:
                # A match implies at least four characters, so word[-1] is
                # always available here.
                if acronymPattern.match(word) and word[-1] not in (":", ","):
                    nacr = nacr + 1

            if nwords > 0:
                # float() forces true division; int / int would be truncated
                # wherever Python 2 division rules apply.
                ratio = nacr / float(nwords)

            instance.addFeature(self.type, self.type + "_Acronyms", ratio)

        utils.save_features_to_file(featureFile, featureNames, self.iC,
                                    self.type)
Ejemplo n.º 7
0
    def get_stopwords(self):
        """Add the ``<type>_Stopwords`` feature: stopwords per token.

        A previously saved feature file for this model/function is loaded
        instead of recomputing when it exists.  Otherwise each token is
        stripped, lower-cased and looked up in NLTK's English stopword list;
        the number of hits is divided by the number of tokens (0.0 when
        there are none) and the features are saved.
        """
        featureNames = [self.type + "_Stopwords"]
        functionName = "get_stopwords"
        featureFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(featureFile):
            utils.load_features_from_file(featureFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        # NOTE(review): English stopwords are used here while
        # get_firstperson_pronouns uses Spanish pronouns - confirm the
        # intended language.
        # A set gives constant-time lookups instead of scanning the list for
        # every token.
        stopwords = set(nltk.corpus.stopwords.words('english'))

        for instance in self.iC.instances:
            words = instance.tokens
            nwords = len(words)
            ratio = 0.0

            nstopwords = sum(1 for word in words
                             if word.strip().lower() in stopwords)

            if nwords > 0:
                # float() forces true division; int / int would be truncated
                # wherever Python 2 division rules apply.
                ratio = nstopwords / float(nwords)

            instance.addFeature(self.type, self.type + "_Stopwords", ratio)

        utils.save_features_to_file(featureFile, featureNames, self.iC,
                                    self.type)
Ejemplo n.º 8
0
    def get_symbols(self, symbols, featureName):
        """Add a ``<type>_<featureName>`` feature: symbol characters per char.

        A previously saved feature file for this model and
        ``get_symbols_<featureName>`` is loaded instead of recomputing when
        it exists.  Otherwise every character of ``instance.text`` that is
        contained in ``symbols`` is counted and the count is divided by the
        text length; empty texts get 0.0.  The features are then saved.

        symbols     -- container tested with ``char in symbols``.
        featureName -- suffix used for the feature name and the file name.
        """
        featureNames = [self.type + "_" + featureName]
        functionName = "get_symbols_" + featureName
        featureFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(featureFile):
            utils.load_features_from_file(featureFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        for instance in self.iC.instances:
            text = instance.text
            nChars = len(text)
            ratio = 0.0

            matches = sum(1 for char in text if char in symbols)

            if nChars > 0:
                # float() forces true division; int / int would be truncated
                # wherever Python 2 division rules apply.
                ratio = matches / float(nChars)

            instance.addFeature(self.type, self.type + "_" + featureName,
                                ratio)

        utils.save_features_to_file(featureFile, featureNames, self.iC,
                                    self.type)
    def get_wordsPerSentence_stdandrange(self):
        """Add sentence-length statistics (std, range, mean) to each instance.

        A previously saved feature file for this model/function is loaded
        instead of recomputing when it exists.  Otherwise every entry of
        ``instance.sentences`` is tokenized with ``word_tokenize`` and the
        standard deviation, range (max - min) and mean of the token counts
        are stored as ``<type>_STD``, ``<type>_Range`` and
        ``<type>_wordsPerSentence``.  Instances without sentences get 0.0
        for all three.  The features are then saved.
        """
        featureNames = [
            self.type + "_STD", self.type + "_Range",
            self.type + "_wordsPerSentence"
        ]
        functionName = "get_wordsPerSentence_stdandrange"
        featureFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(featureFile):
            utils.load_features_from_file(featureFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        for instance in self.iC.instances:
            lengths = [len(word_tokenize(sentence))
                       for sentence in instance.sentences]

            if lengths:
                std = np.std(lengths)
                mean = np.mean(lengths)
                rng = np.amax(lengths) - np.amin(lengths)
            else:
                # np.amax / np.amin raise ValueError on an empty sequence and
                # np.std / np.mean would yield nan; fall back to the same 0.0
                # default the other feature extractors use for empty input.
                std = 0.0
                mean = 0.0
                rng = 0.0

            instance.addFeature(self.type, self.type + "_STD", std)
            instance.addFeature(self.type, self.type + "_Range", rng)
            instance.addFeature(self.type, self.type + "_wordsPerSentence",
                                mean)

        utils.save_features_to_file(featureFile, featureNames, self.iC,
                                    self.type)
Ejemplo n.º 10
0
    def get_dict_count(self):
        """Add dictionary-based ratios (abbrev, curse, positive, negative).

        A previously saved feature file for this model/function is loaded
        instead of recomputing when it exists.  Otherwise each lower-cased
        token is looked up in ``self.abbreviationList``,
        ``self.badWordsList``, ``self.negList`` and ``self.posList`` (a token
        may count for several of them) and each count is divided by the
        number of tokens; instances without tokens get 0.0 everywhere.  The
        features are then saved.
        """
        featureNames = [
            self.type + "_Abbrev", self.type + "_Curse",
            self.type + "_Positive", self.type + "_Negative"
        ]
        functionName = "get_dict_count"
        featureFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(featureFile):
            utils.load_features_from_file(featureFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        for instance in self.iC.instances:
            lWords = instance.tokens
            nwords = len(lWords)
            nAbbrev = 0
            nCurse = 0
            nPos = 0
            nNeg = 0
            ratioAbbrev = 0.0
            ratioCurse = 0.0
            ratioPos = 0.0
            ratioNeg = 0.0

            for word in lWords:
                word = word.lower()
                if word in self.abbreviationList:
                    nAbbrev += 1
                if word in self.badWordsList:
                    nCurse += 1
                if word in self.negList:
                    nNeg += 1
                if word in self.posList:
                    nPos += 1

            if nwords > 0:
                # Float denominator forces true division; int / int would be
                # truncated wherever Python 2 division rules apply.
                total = float(nwords)
                ratioAbbrev = nAbbrev / total
                ratioCurse = nCurse / total
                ratioPos = nPos / total
                ratioNeg = nNeg / total

            instance.addFeature(self.type, self.type + "_Abbrev", ratioAbbrev)
            instance.addFeature(self.type, self.type + "_Curse", ratioCurse)
            instance.addFeature(self.type, self.type + "_Positive", ratioPos)
            instance.addFeature(self.type, self.type + "_Negative", ratioNeg)

        utils.save_features_to_file(featureFile, featureNames, self.iC,
                                    self.type)
Ejemplo n.º 11
0
    def get_firstperson_pronouns(self):
        """Add first-person singular/plural pronoun ratios to each instance.

        A previously saved feature file for this model/function is loaded
        instead of recomputing when it exists.  Otherwise each lower-cased
        token is compared against a fixed list of Spanish singular and
        plural first-person pronouns; the counts are divided by the number
        of tokens (0.0 when there are none) and stored as
        ``<type>_FirstSingular`` and ``<type>_FirstPlural``.  The features
        are then saved.
        """
        featureNames = [
            self.type + "_FirstSingular", self.type + "_FirstPlural"
        ]
        functionName = "get_firstperson_pronouns"
        featureFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(featureFile):
            utils.load_features_from_file(featureFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        # Spanish pronouns.  English counterparts, kept for reference:
        #   singular: "i", "me", "my", "mine"
        #   plural:   "we", "our", "ours"
        first_singular = ["yo", "mi", "mío"]
        first_plural = ["nos", "nosotros", "nuestro"]

        for instance in self.iC.instances:
            lWords = instance.tokens
            nwords = len(lWords)
            ratioFirstS = 0.0
            ratioFirstP = 0.0

            nFirstS = 0
            nFirstP = 0
            for word in lWords:
                word = word.lower()
                if word in first_singular:
                    nFirstS += 1
                elif word in first_plural:
                    nFirstP += 1

            if nwords > 0:
                # Float denominator forces true division; int / int would be
                # truncated wherever Python 2 division rules apply.
                total = float(nwords)
                ratioFirstS = nFirstS / total
                ratioFirstP = nFirstP / total

            instance.addFeature(self.type, self.type + "_FirstSingular",
                                ratioFirstS)
            instance.addFeature(self.type, self.type + "_FirstPlural",
                                ratioFirstP)

        utils.save_features_to_file(featureFile, featureNames, self.iC,
                                    self.type)
Ejemplo n.º 12
0
    def compute_syntactic_features(self):
        """Build syntactic trees for every instance and extract its features.

        A previously saved feature file for this model/function is loaded
        instead of recomputing when it exists.  Otherwise ``instance.conll``
        is split on blank lines, the last chunk is discarded and a
        ``SyntacticTreeOperations`` is built from each remaining chunk;
        chunks whose construction raises ``ValueError`` are printed and
        skipped.  The per-instance extractors are then run on the trees.
        Finally ``adjust_features`` is called and the features named in
        ``self.allRelationsPos`` are saved.
        """
        functionName = "compute_syntactic_features"
        storedFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(storedFile):
            utils.load_features_from_file(storedFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        total = len(self.iC.instances)
        print("Building Syntactic Trees")
        for done, instance in enumerate(self.iC.instances, 1):
            trees = []
            # The chunk after the final blank-line separator is dropped.
            for chunk in instance.conll.split("\n\n")[:-1]:
                try:
                    tree = SyntacticTreeOperations(chunk)
                except ValueError as err:
                    # Report the failing sentence and carry on with the rest.
                    print(err)
                else:
                    trees.append(tree)

            self.get_relation_usage(trees, instance)
            self.get_relationgroup_usage(trees, instance)
            self.get_pos_usage(trees, instance)
            self.get_posgroup_usage(trees, instance)

            self.get_shape_features(trees, instance)
            self.get_subcoord_features(trees, instance)
            self.get_verb_features(trees, instance)

            print("processed " + str(done) + " of " + str(total))

        self.adjust_features()
        utils.save_features_to_file(storedFile, self.allRelationsPos, self.iC,
                                    self.type)
Ejemplo n.º 13
0
    def get_in_parenthesis_stats(self):
        """Add mean chars and mean words per parenthesised span.

        A previously saved feature file for this model/function is loaded
        instead of recomputing when it exists.  Otherwise every shortest
        ``( ... )`` span in ``instance.text`` is extracted; the characters
        and the ``word_tokenize`` tokens inside all spans are summed and
        divided by the number of spans, giving ``<type>_charsinparenthesis``
        and ``<type>_wordsinparenthesis`` (0.0 when the text has no such
        span).  The features are then saved.
        """
        featureNames = [
            self.type + "_charsinparenthesis",
            self.type + "_wordsinparenthesis"
        ]
        functionName = "get_in_parenthesis_stats"
        featureFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(featureFile):
            utils.load_features_from_file(featureFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        # Raw string so the escaped parentheses reach the regex engine as-is.
        parenPattern = re.compile(r"\((.*?)\)")

        for instance in self.iC.instances:
            matches = parenPattern.findall(instance.text)
            npar = len(matches)
            totalchars = 0
            totalwords = 0

            for match in matches:
                totalchars += len(match)
                # Accumulate over all spans; assigning here would keep only
                # the word count of the last span.
                totalwords += len(word_tokenize(match))

            charsInParenthesis = 0.0
            wordsInParenthesis = 0.0
            if npar > 0:
                # Float denominator forces true division; int / int would be
                # truncated wherever Python 2 division rules apply.
                spans = float(npar)
                charsInParenthesis = totalchars / spans
                wordsInParenthesis = totalwords / spans

            instance.addFeature(self.type, self.type + "_charsinparenthesis",
                                charsInParenthesis)
            instance.addFeature(self.type, self.type + "_wordsinparenthesis",
                                wordsInParenthesis)

        utils.save_features_to_file(featureFile, featureNames, self.iC,
                                    self.type)
Ejemplo n.º 14
0
    def get_uppers(self):
        """Add the ``<type>_UpperCases`` feature: upper-case letters per char.

        A previously saved feature file for this model/function is loaded
        instead of recomputing when it exists.  Otherwise the ``[A-Z]``
        characters of ``instance.text`` are counted and divided by the text
        length; empty texts get 0.0.  The features are then saved.
        """
        featureNames = [self.type + "_UpperCases"]
        functionName = "get_uppers"
        featureFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(featureFile):
            utils.load_features_from_file(featureFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        for instance in self.iC.instances:
            text = instance.text
            nchars = len(text)
            ratio = 0.0

            # Guard against empty text, which would otherwise raise
            # ZeroDivisionError.
            if nchars > 0:
                # re.DOTALL is not needed: it only affects ".", which the
                # pattern does not contain.
                upperCases = len(re.findall("[A-Z]", text))
                # float() forces true division; int / int would be truncated
                # wherever Python 2 division rules apply.
                ratio = upperCases / float(nchars)

            instance.addFeature(self.type, self.type + "_UpperCases", ratio)

        utils.save_features_to_file(featureFile, featureNames, self.iC,
                                    self.type)
Ejemplo n.º 15
0
    def get_twothree_words(self):
        """Add ratios of two-character and three-character tokens.

        A previously saved feature file for this model/function is loaded
        instead of recomputing when it exists.  Otherwise the tokens of
        length 2 and of length 3 are counted and each count is divided by
        the number of tokens, giving ``<type>_twoWords`` and
        ``<type>_threeWords`` (0.0 when there are no tokens).  The features
        are then saved.
        """
        featureNames = [self.type + "_twoWords", self.type + "_threeWords"]
        functionName = "get_twothree_words"
        featureFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(featureFile):
            utils.load_features_from_file(featureFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        for instance in self.iC.instances:
            textTokenized = instance.tokens
            nwords = len(textTokenized)
            nTwo = 0
            nThree = 0
            twoWords = 0.0
            threeWords = 0.0

            for word in textTokenized:
                wordLength = len(word)
                if wordLength == 2:
                    nTwo += 1
                elif wordLength == 3:
                    nThree += 1

            if nwords > 0:
                # Float denominator forces true division; int / int would be
                # truncated wherever Python 2 division rules apply.
                total = float(nwords)
                twoWords = nTwo / total
                threeWords = nThree / total

            instance.addFeature(self.type, self.type + "_twoWords", twoWords)
            instance.addFeature(self.type, self.type + "_threeWords",
                                threeWords)

        utils.save_features_to_file(featureFile, featureNames, self.iC,
                                    self.type)
Ejemplo n.º 16
0
    def get_vocabulary_richness(self):
        """Add the ``<type>_VocabularyRichness`` feature: distinct / total.

        A previously saved feature file for this model/function is loaded
        instead of recomputing when it exists.  Otherwise the number of
        distinct entries in ``instance.tokens`` is divided by the total
        number of tokens; instances without tokens get 0.0.  The features
        are then saved.
        """
        featureNames = [self.type + "_VocabularyRichness"]
        functionName = "get_vocabulary_richness"
        featureFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(featureFile):
            utils.load_features_from_file(featureFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        for instance in self.iC.instances:
            lAllWords = instance.tokens
            nAllWords = len(lAllWords)
            ratio = 0.0
            if nAllWords > 0:
                # float() forces true division; with int / int the value
                # would collapse to 0 or 1 wherever Python 2 division rules
                # apply.
                ratio = len(set(lAllWords)) / float(nAllWords)
            instance.addFeature(self.type, self.type + "_VocabularyRichness",
                                ratio)

        utils.save_features_to_file(featureFile, featureNames, self.iC,
                                    self.type)
Ejemplo n.º 17
0
    def get_mean_mood(self):
        """Add emotion ratios based on the ``self.depecheMood`` lexicon.

        A previously saved feature file for this model/function is loaded
        instead of recomputing when it exists.  Otherwise the tokens of each
        instance are POS-tagged with ``nltk.pos_tag``; tags are mapped with
        ``self.getDepecheMoodPos`` (tokens mapped to ``None`` are skipped),
        each token is lemmatized and looked up in ``self.depecheMood`` under
        the key ``<lowercased lemma>#<pos>``.  For every hit the eight
        emotion scores of the entry are accumulated.

        Features stored per instance, in this order:
          * ``<type>_TokenRatio<Emotion>``   -- score sum / number of tokens
          * ``<type>_EmotionRatio``          -- lexicon hits / number of tokens
          * ``<type>_EmotionRatio<Emotion>`` -- score sum / lexicon hits

        A ratio whose denominator is zero is stored as 0.0.  The features
        are then saved.
        """
        # (key inside a lexicon entry, suffix used in the feature names)
        emotions = [
            ("afraid", "Afraid"), ("amused", "Amused"), ("angry", "Angry"),
            ("annoyed", "Annoyed"), ("dont_care", "DontCare"),
            ("happy", "Happy"), ("inspired", "Inspired"), ("sad", "Sad")
        ]
        tokenRatioNames = [
            self.type + "_TokenRatio" + suffix for key, suffix in emotions
        ]
        emotionRatioNames = [
            self.type + "_EmotionRatio" + suffix for key, suffix in emotions
        ]
        featureNames = (tokenRatioNames + [self.type + "_EmotionRatio"] +
                        emotionRatioNames)
        functionName = "get_mean_mood"
        featureFile = self.iC.featurePath + self.modelName + "_" + functionName

        if os.path.isfile(featureFile):
            utils.load_features_from_file(featureFile, self.iC, self.type)
            print("loaded " + functionName)
            return

        lmtzr = WordNetLemmatizer()

        for instance in self.iC.instances:
            tokens = instance.tokens
            totalTokens = len(tokens)
            text_tagged = nltk.pos_tag(tokens)

            # One running score sum per emotion, in the order of `emotions`.
            totals = [0.0] * len(emotions)
            totalEmotionTokens = 0

            for token, tag in text_tagged:
                pos = self.getDepecheMoodPos(tag)
                if pos is None:
                    continue

                if pos == "v":
                    lemma = lmtzr.lemmatize(token, "v")
                else:
                    lemma = lmtzr.lemmatize(token)

                idx = lemma.lower() + "#" + pos

                # Test membership on the mapping itself; going through
                # .keys() builds and scans a list per token on Python 2.
                if idx not in self.depecheMood:
                    continue

                scores = self.depecheMood[idx]
                totalEmotionTokens += 1
                for position, (key, suffix) in enumerate(emotions):
                    totals[position] += float(scores[key])

            tokenRatios = [0.0] * len(emotions)
            ratioEmotionTokens = 0.0
            if totalTokens > 0:
                # Float denominator forces true division; the hit count is
                # an int and int / int would be truncated wherever Python 2
                # division rules apply.
                nTokens = float(totalTokens)
                tokenRatios = [total / nTokens for total in totals]
                ratioEmotionTokens = totalEmotionTokens / nTokens

            emotionRatios = [0.0] * len(emotions)
            if totalEmotionTokens > 0:
                nEmotionTokens = float(totalEmotionTokens)
                emotionRatios = [total / nEmotionTokens for total in totals]

            for name, value in zip(tokenRatioNames, tokenRatios):
                instance.addFeature(self.type, name, value)

            instance.addFeature(self.type, self.type + "_EmotionRatio",
                                ratioEmotionTokens)

            for name, value in zip(emotionRatioNames, emotionRatios):
                instance.addFeature(self.type, name, value)

        utils.save_features_to_file(featureFile, featureNames, self.iC,
                                    self.type)