def get_discourse_markers(self):
    """Add the ``<type>_DiscourseMarkers`` feature to every instance.

    For each instance, the occurrences of every entry of
    ``self.discourseMarkersList`` in ``instance.text`` are counted with
    ``count`` (substring matches) and the total is divided by the number
    of tokens. Instances without tokens get ``0.0``.

    If a file already exists at ``featurePath + modelName + "_" +
    functionName`` it is handed to ``utils.load_features_from_file`` and
    nothing is recomputed; otherwise the computed features are passed to
    ``utils.save_features_to_file`` with that same path.
    """
    featureNames = [self.type + "_DiscourseMarkers"]
    functionName = "get_discourse_markers"
    cachePath = self.iC.featurePath + self.modelName + "_" + functionName
    if os.path.isfile(cachePath):
        utils.load_features_from_file(cachePath, self.iC, self.type)
        print("loaded " + functionName)
        return

    for instance in self.iC.instances:
        content = instance.text
        nwords = len(instance.tokens)
        nMarkers = sum(
            content.count(marker) for marker in self.discourseMarkersList)
        ratio = 0.0
        if nwords > 0:
            # float() prevents Python 2 integer division from truncating
            # the ratio (same approach as get_interjections).
            ratio = nMarkers / float(nwords)
        instance.addFeature(self.type, featureNames[0], ratio)

    utils.save_features_to_file(cachePath, featureNames, self.iC, self.type)
def get_interjections(self):
    """Add the ``<type>_Interjections`` feature to every instance.

    The feature is the summed number of occurrences in ``instance.text``
    of each lower-cased entry of ``self.interjections``, divided by the
    token count (``0.0`` when the instance has no tokens).

    When the feature file for this function already exists it is loaded
    through ``utils.load_features_from_file`` and the method returns;
    otherwise the features are computed and written with
    ``utils.save_features_to_file``.
    """
    functionName = "get_interjections"
    featureNames = [self.type + "_Interjections"]
    cacheFile = self.iC.featurePath + self.modelName + "_" + functionName

    if os.path.isfile(cacheFile):
        utils.load_features_from_file(cacheFile, self.iC, self.type)
        print("loaded " + functionName)
        return

    for instance in self.iC.instances:
        text = instance.text
        tokenCount = len(instance.tokens)
        # An interjection that does not occur contributes 0 to the sum,
        # so no explicit "> 0" guard is needed.
        hits = sum(text.count(entry.lower()) for entry in self.interjections)
        value = hits / float(tokenCount) if tokenCount > 0 else 0.0
        instance.addFeature(self.type, self.type + "_Interjections", value)

    utils.save_features_to_file(cacheFile, featureNames, self.iC, self.type)
def get_numbers(self):
    """Add the ``<type>_Numbers`` feature to every instance.

    The feature is the number of ASCII digit characters (``[0-9]``) in
    ``instance.text`` divided by the length of the text; empty texts get
    ``0.0``.

    If the feature file for this function already exists it is loaded via
    ``utils.load_features_from_file`` and nothing is recomputed; otherwise
    the result is stored with ``utils.save_features_to_file``.
    """
    featureNames = [self.type + "_Numbers"]
    functionName = "get_numbers"
    cachePath = self.iC.featurePath + self.modelName + "_" + functionName
    if os.path.isfile(cachePath):
        utils.load_features_from_file(cachePath, self.iC, self.type)
        print("loaded " + functionName)
        return

    for instance in self.iC.instances:
        text = instance.text
        nchars = len(text)
        ratio = 0.0
        if nchars > 0:
            ndigits = len(re.findall(r"[0-9]", text))
            # float() avoids Python 2 integer division, which would turn
            # every ratio below 1 into 0.
            ratio = ndigits / float(nchars)
        instance.addFeature(self.type, featureNames[0], ratio)

    utils.save_features_to_file(cachePath, featureNames, self.iC, self.type)
def compute_discourse_features(self):
    """Compute the discourse features of all instances, or load them.

    If the feature file for this function exists, it is loaded through
    ``utils.load_features_from_file`` and the method returns. Otherwise a
    ``DiscourseTreeOperations`` object is built from ``instance.discourse``
    for every instance and passed, together with the sentence count, to
    ``get_shape_features`` and ``get_discourse_relation_usage``. After all
    instances are processed ``adjust_features`` is called and the features
    named in ``self.allDeps`` are saved.
    """
    functionName = "compute_discourse_features"
    cacheFile = self.iC.featurePath + self.modelName + "_" + functionName

    if os.path.isfile(cacheFile):
        utils.load_features_from_file(cacheFile, self.iC, self.type)
        print("loaded " + functionName)
        return

    total = len(self.iC.instances)
    for done, instance in enumerate(self.iC.instances, 1):
        tree = DiscourseTreeOperations(instance.discourse)
        sentenceCount = len(instance.sentences)

        self.get_shape_features(tree, sentenceCount, instance)
        self.get_discourse_relation_usage(tree, sentenceCount, instance)

        # Progress report, one line per processed instance.
        print("processed " + str(done) + " of " + str(total))

    self.adjust_features()
    utils.save_features_to_file(cacheFile, self.allDeps, self.iC, self.type)
def get_chars_per_word(self):
    """Add the ``<type>_CharsPerWord`` feature to every instance.

    The feature is the mean token length: the summed ``len`` of all
    entries of ``instance.tokens`` divided by the number of tokens
    (``0.0`` when there are none).

    If the feature file for this function already exists it is loaded via
    ``utils.load_features_from_file`` and nothing is recomputed; otherwise
    the result is stored with ``utils.save_features_to_file``.
    """
    featureNames = [self.type + "_CharsPerWord"]
    functionName = "get_chars_per_word"
    cachePath = self.iC.featurePath + self.modelName + "_" + functionName
    if os.path.isfile(cachePath):
        utils.load_features_from_file(cachePath, self.iC, self.type)
        print("loaded " + functionName)
        return

    for instance in self.iC.instances:
        lWords = instance.tokens
        nwords = len(lWords)
        ratio = 0.0
        if nwords > 0:
            ncharsword = sum(len(word) for word in lWords)
            # float() keeps the fractional part of the mean; plain "/" on
            # two ints floors the result under Python 2.
            ratio = ncharsword / float(nwords)
        instance.addFeature(self.type, featureNames[0], ratio)

    utils.save_features_to_file(cachePath, featureNames, self.iC, self.type)
def get_acronyms(self):
    """Add the ``<type>_Acronyms`` feature to every instance.

    A token counts as an acronym when it starts with an upper-case ASCII
    letter followed by three characters that are each a digit, an
    upper-case ASCII letter or a dot, and when it does not end in ``:``
    or ``,``. The feature is the number of such tokens divided by the
    total number of tokens (``0.0`` when there are none).

    If the feature file for this function already exists it is loaded via
    ``utils.load_features_from_file`` and nothing is recomputed; otherwise
    the result is stored with ``utils.save_features_to_file``.
    """
    featureNames = [self.type + "_Acronyms"]
    functionName = "get_acronyms"
    cachePath = self.iC.featurePath + self.modelName + "_" + functionName
    if os.path.isfile(cachePath):
        utils.load_features_from_file(cachePath, self.iC, self.type)
        print("loaded " + functionName)
        return

    # Compiled once instead of once per token; the raw string spells the
    # same pattern without relying on "\." being an unknown escape.
    acronymPattern = re.compile(r'(^[A-Z]([0-9]|[A-Z]|\.){3})')

    for instance in self.iC.instances:
        words = instance.tokens
        nwords = len(words)
        ratio = 0.0
        if nwords > 0:
            nacr = sum(
                1 for word in words
                if acronymPattern.match(word)
                and not word.endswith((":", ",")))
            # float() avoids Python 2 integer division truncating to 0.
            ratio = nacr / float(nwords)
        instance.addFeature(self.type, featureNames[0], ratio)

    utils.save_features_to_file(cachePath, featureNames, self.iC, self.type)
def get_stopwords(self):
    """Add the ``<type>_Stopwords`` feature to every instance.

    The feature is the fraction of tokens whose stripped, lower-cased
    form is contained in NLTK's ``'english'`` stop-word list (``0.0``
    when the instance has no tokens).

    If the feature file for this function already exists it is loaded via
    ``utils.load_features_from_file`` and nothing is recomputed; otherwise
    the result is stored with ``utils.save_features_to_file``.
    """
    featureNames = [self.type + "_Stopwords"]
    functionName = "get_stopwords"
    cachePath = self.iC.featurePath + self.modelName + "_" + functionName
    if os.path.isfile(cachePath):
        utils.load_features_from_file(cachePath, self.iC, self.type)
        print("loaded " + functionName)
        return

    # A set gives constant-time membership tests; the list returned by
    # NLTK would be scanned linearly for every token.
    stopwords = set(nltk.corpus.stopwords.words('english'))

    for instance in self.iC.instances:
        words = instance.tokens
        totalWords = len(words)
        ratio = 0.0
        if totalWords > 0:
            nstopwords = sum(
                1 for word in words if word.strip().lower() in stopwords)
            # float() avoids Python 2 integer division truncating to 0.
            ratio = nstopwords / float(totalWords)
        instance.addFeature(self.type, featureNames[0], ratio)

    utils.save_features_to_file(cachePath, featureNames, self.iC, self.type)
def get_symbols(self, symbols, featureName):
    """Add the ``<type>_<featureName>`` feature to every instance.

    The feature is the fraction of characters of ``instance.text`` that
    are contained in ``symbols`` (``0.0`` for empty texts).

    Args:
        symbols: container tested with ``char in symbols`` for every
            character of the text.
        featureName: suffix used both for the feature name and for the
            name of the feature file (``"get_symbols_" + featureName``).

    If that feature file already exists it is loaded via
    ``utils.load_features_from_file`` and nothing is recomputed; otherwise
    the result is stored with ``utils.save_features_to_file``.
    """
    featureNames = [self.type + "_" + featureName]
    functionName = "get_symbols_" + featureName
    cachePath = self.iC.featurePath + self.modelName + "_" + functionName
    if os.path.isfile(cachePath):
        utils.load_features_from_file(cachePath, self.iC, self.type)
        print("loaded " + functionName)
        return

    for instance in self.iC.instances:
        text = instance.text
        nChars = len(text)
        ratio = 0.0
        if nChars > 0:
            matches = sum(1 for char in text if char in symbols)
            # float() avoids Python 2 integer division truncating to 0.
            ratio = matches / float(nChars)
        instance.addFeature(self.type, featureNames[0], ratio)

    utils.save_features_to_file(cachePath, featureNames, self.iC, self.type)
def get_wordsPerSentence_stdandrange(self):
    """Add sentence-length statistics to every instance.

    Each sentence of ``instance.sentences`` is tokenized with
    ``word_tokenize``; from the resulting token counts three features are
    derived: ``<type>_STD`` (standard deviation), ``<type>_Range``
    (maximum minus minimum) and ``<type>_wordsPerSentence`` (mean).
    Instances without sentences get ``0.0`` for all three.

    If the feature file for this function already exists it is loaded via
    ``utils.load_features_from_file`` and nothing is recomputed; otherwise
    the result is stored with ``utils.save_features_to_file``.
    """
    featureNames = [
        self.type + "_STD",
        self.type + "_Range",
        self.type + "_wordsPerSentence",
    ]
    functionName = "get_wordsPerSentence_stdandrange"
    cachePath = self.iC.featurePath + self.modelName + "_" + functionName
    if os.path.isfile(cachePath):
        utils.load_features_from_file(cachePath, self.iC, self.type)
        print("loaded " + functionName)
        return

    for instance in self.iC.instances:
        lengths = [
            len(word_tokenize(sentence)) for sentence in instance.sentences]
        if lengths:
            std = np.std(lengths)
            mean = np.mean(lengths)
            rng = np.amax(lengths) - np.amin(lengths)
        else:
            # np.amax/np.amin raise ValueError on an empty sequence and
            # np.mean/np.std yield nan; fall back to neutral values.
            std = mean = rng = 0.0
        instance.addFeature(self.type, self.type + "_STD", std)
        instance.addFeature(self.type, self.type + "_Range", rng)
        instance.addFeature(self.type, self.type + "_wordsPerSentence", mean)

    utils.save_features_to_file(cachePath, featureNames, self.iC, self.type)
def get_dict_count(self):
    """Add four dictionary-lookup ratio features to every instance.

    Every lower-cased token is looked up in ``self.abbreviationList``,
    ``self.badWordsList``, ``self.posList`` and ``self.negList``; the hit
    counts divided by the number of tokens are stored as
    ``<type>_Abbrev``, ``<type>_Curse``, ``<type>_Positive`` and
    ``<type>_Negative``. Instances without tokens get ``0.0`` everywhere.

    If the feature file for this function already exists it is loaded via
    ``utils.load_features_from_file`` and nothing is recomputed; otherwise
    the result is stored with ``utils.save_features_to_file``.
    """
    featureNames = [
        self.type + "_Abbrev",
        self.type + "_Curse",
        self.type + "_Positive",
        self.type + "_Negative",
    ]
    functionName = "get_dict_count"
    cachePath = self.iC.featurePath + self.modelName + "_" + functionName
    if os.path.isfile(cachePath):
        utils.load_features_from_file(cachePath, self.iC, self.type)
        print("loaded " + functionName)
        return

    for instance in self.iC.instances:
        lWords = instance.tokens
        nwords = len(lWords)
        nAbbrev = nCurse = nPos = nNeg = 0
        for word in lWords:
            word = word.lower()
            # Independent checks: a token may be counted in several lists.
            if word in self.abbreviationList:
                nAbbrev += 1
            if word in self.badWordsList:
                nCurse += 1
            if word in self.negList:
                nNeg += 1
            if word in self.posList:
                nPos += 1

        ratioAbbrev = ratioCurse = ratioPos = ratioNeg = 0.0
        if nwords > 0:
            # A float denominator avoids Python 2 integer division, which
            # would truncate every ratio below 1 to 0.
            total = float(nwords)
            ratioAbbrev = nAbbrev / total
            ratioCurse = nCurse / total
            ratioPos = nPos / total
            ratioNeg = nNeg / total

        instance.addFeature(self.type, self.type + "_Abbrev", ratioAbbrev)
        instance.addFeature(self.type, self.type + "_Curse", ratioCurse)
        instance.addFeature(self.type, self.type + "_Positive", ratioPos)
        instance.addFeature(self.type, self.type + "_Negative", ratioNeg)

    utils.save_features_to_file(cachePath, featureNames, self.iC, self.type)
def get_firstperson_pronouns(self):
    """Add first-person pronoun ratio features to every instance.

    Lower-cased tokens are matched against two hard-coded pronoun lists;
    the hit counts divided by the number of tokens are stored as
    ``<type>_FirstSingular`` and ``<type>_FirstPlural`` (``0.0`` when the
    instance has no tokens).

    If the feature file for this function already exists it is loaded via
    ``utils.load_features_from_file`` and nothing is recomputed; otherwise
    the result is stored with ``utils.save_features_to_file``.
    """
    featureNames = [
        self.type + "_FirstSingular",
        self.type + "_FirstPlural",
    ]
    functionName = "get_firstperson_pronouns"
    cachePath = self.iC.featurePath + self.modelName + "_" + functionName
    if os.path.isfile(cachePath):
        utils.load_features_from_file(cachePath, self.iC, self.type)
        print("loaded " + functionName)
        return

    # Spanish pronouns are active; the English lists previously used were
    # ["i", "me", "my", "mine"] and ["we", "our", "ours"].
    first_singular = ["yo", "mi", "mío"]
    first_plural = ["nos", "nosotros", "nuestro"]

    for instance in self.iC.instances:
        lWords = instance.tokens
        nwords = len(lWords)
        nFirstS = 0
        nFirstP = 0
        for word in lWords:
            word = word.lower()
            if word in first_singular:
                nFirstS += 1
            elif word in first_plural:
                nFirstP += 1

        ratioFirstS = 0.0
        ratioFirstP = 0.0
        if nwords > 0:
            # A float denominator avoids Python 2 integer division, which
            # would truncate every ratio below 1 to 0.
            total = float(nwords)
            ratioFirstS = nFirstS / total
            ratioFirstP = nFirstP / total

        instance.addFeature(self.type, self.type + "_FirstSingular",
                            ratioFirstS)
        instance.addFeature(self.type, self.type + "_FirstPlural",
                            ratioFirstP)

    utils.save_features_to_file(cachePath, featureNames, self.iC, self.type)
def compute_syntactic_features(self):
    """Compute the syntactic features of all instances, or load them.

    If the feature file for this function exists, it is loaded through
    ``utils.load_features_from_file`` and the method returns. Otherwise,
    for each instance, ``instance.conll`` is split on blank lines, the
    last chunk is discarded and a ``SyntacticTreeOperations`` object is
    built from each remaining chunk (chunks raising ``ValueError`` are
    printed and skipped). The trees are then handed to the individual
    feature methods. Finally ``adjust_features`` is called and the
    features named in ``self.allRelationsPos`` are saved.
    """
    functionName = "compute_syntactic_features"
    cacheFile = self.iC.featurePath + self.modelName + "_" + functionName

    if os.path.isfile(cacheFile):
        utils.load_features_from_file(cacheFile, self.iC, self.type)
        print("loaded " + functionName)
        return

    total = len(self.iC.instances)
    print("Building Syntactic Trees")

    for done, instance in enumerate(self.iC.instances, 1):
        # Everything after the final blank-line separator is dropped.
        chunks = instance.conll.split("\n\n")[:-1]

        trees = []
        for chunk in chunks:
            try:
                trees.append(SyntacticTreeOperations(chunk))
            except ValueError as err:
                # Report the unparsable sentence and move on to the next.
                print(err)

        self.get_relation_usage(trees, instance)
        self.get_relationgroup_usage(trees, instance)
        self.get_pos_usage(trees, instance)
        self.get_posgroup_usage(trees, instance)
        self.get_shape_features(trees, instance)
        self.get_subcoord_features(trees, instance)
        self.get_verb_features(trees, instance)

        print("processed " + str(done) + " of " + str(total))

    self.adjust_features()
    utils.save_features_to_file(cacheFile, self.allRelationsPos, self.iC,
                                self.type)
def get_in_parenthesis_stats(self):
    """Add mean parenthesis-content sizes to every instance.

    All non-greedy ``( ... )`` spans of ``instance.text`` are extracted.
    ``<type>_charsinparenthesis`` is the mean number of characters per
    span and ``<type>_wordsinparenthesis`` the mean number of
    ``word_tokenize`` tokens per span. Both are ``0.0`` when the text
    contains no parenthesized span.

    If the feature file for this function already exists it is loaded via
    ``utils.load_features_from_file`` and nothing is recomputed; otherwise
    the result is stored with ``utils.save_features_to_file``.
    """
    featureNames = [
        self.type + "_charsinparenthesis",
        self.type + "_wordsinparenthesis",
    ]
    functionName = "get_in_parenthesis_stats"
    cachePath = self.iC.featurePath + self.modelName + "_" + functionName
    if os.path.isfile(cachePath):
        utils.load_features_from_file(cachePath, self.iC, self.type)
        print("loaded " + functionName)
        return

    for instance in self.iC.instances:
        matches = re.findall(r"\((.*?)\)", instance.text)
        npar = len(matches)
        charsInParenthesis = 0.0
        wordsInParenthesis = 0.0
        if npar > 0:
            totalchars = sum(len(match) for match in matches)
            # Words are accumulated over ALL spans; the previous code
            # overwrote the total and kept only the last span's count.
            totalwords = sum(len(word_tokenize(match)) for match in matches)
            # float() avoids Python 2 integer division flooring the means.
            charsInParenthesis = totalchars / float(npar)
            wordsInParenthesis = totalwords / float(npar)
        instance.addFeature(self.type, self.type + "_charsinparenthesis",
                            charsInParenthesis)
        instance.addFeature(self.type, self.type + "_wordsinparenthesis",
                            wordsInParenthesis)

    utils.save_features_to_file(cachePath, featureNames, self.iC, self.type)
def get_uppers(self):
    """Add the ``<type>_UpperCases`` feature to every instance.

    The feature is the number of upper-case ASCII letters (``[A-Z]``) in
    ``instance.text`` divided by the length of the text. Empty texts get
    ``0.0`` instead of raising ``ZeroDivisionError``.

    If the feature file for this function already exists it is loaded via
    ``utils.load_features_from_file`` and nothing is recomputed; otherwise
    the result is stored with ``utils.save_features_to_file``.
    """
    featureNames = [self.type + "_UpperCases"]
    functionName = "get_uppers"
    cachePath = self.iC.featurePath + self.modelName + "_" + functionName
    if os.path.isfile(cachePath):
        utils.load_features_from_file(cachePath, self.iC, self.type)
        print("loaded " + functionName)
        return

    for instance in self.iC.instances:
        text = instance.text
        nchars = len(text)
        ratio = 0.0
        if nchars > 0:
            upperCases = len(re.findall(r"[A-Z]", text))
            # float() avoids Python 2 integer division truncating to 0.
            ratio = upperCases / float(nchars)
        instance.addFeature(self.type, featureNames[0], ratio)

    utils.save_features_to_file(cachePath, featureNames, self.iC, self.type)
def get_twothree_words(self):
    """Add short-token ratio features to every instance.

    ``<type>_twoWords`` is the fraction of tokens of length 2 and
    ``<type>_threeWords`` the fraction of tokens of length 3 (both
    ``0.0`` when the instance has no tokens).

    If the feature file for this function already exists it is loaded via
    ``utils.load_features_from_file`` and nothing is recomputed; otherwise
    the result is stored with ``utils.save_features_to_file``.
    """
    featureNames = [self.type + "_twoWords", self.type + "_threeWords"]
    functionName = "get_twothree_words"
    cachePath = self.iC.featurePath + self.modelName + "_" + functionName
    if os.path.isfile(cachePath):
        utils.load_features_from_file(cachePath, self.iC, self.type)
        print("loaded " + functionName)
        return

    for instance in self.iC.instances:
        textTokenized = instance.tokens
        nwords = len(textTokenized)
        nTwo = 0
        nThree = 0
        for word in textTokenized:
            wordLength = len(word)
            if wordLength == 2:
                nTwo += 1
            elif wordLength == 3:
                nThree += 1

        twoWords = 0.0
        threeWords = 0.0
        if nwords > 0:
            # A float denominator avoids Python 2 integer division, which
            # would truncate every ratio below 1 to 0.
            total = float(nwords)
            twoWords = nTwo / total
            threeWords = nThree / total

        instance.addFeature(self.type, self.type + "_twoWords", twoWords)
        instance.addFeature(self.type, self.type + "_threeWords", threeWords)

    utils.save_features_to_file(cachePath, featureNames, self.iC, self.type)
def get_vocabulary_richness(self):
    """Add the ``<type>_VocabularyRichness`` feature to every instance.

    The feature is the type/token ratio: the number of distinct entries
    of ``instance.tokens`` divided by the total number of tokens (``0.0``
    when the instance has no tokens).

    If the feature file for this function already exists it is loaded via
    ``utils.load_features_from_file`` and nothing is recomputed; otherwise
    the result is stored with ``utils.save_features_to_file``.
    """
    featureNames = [self.type + "_VocabularyRichness"]
    functionName = "get_vocabulary_richness"
    cachePath = self.iC.featurePath + self.modelName + "_" + functionName
    if os.path.isfile(cachePath):
        utils.load_features_from_file(cachePath, self.iC, self.type)
        print("loaded " + functionName)
        return

    for instance in self.iC.instances:
        lAllWords = instance.tokens
        nAllWords = len(lAllWords)
        ratio = 0.0
        if nAllWords > 0:
            # float() matters here: with Python 2 integer division the
            # ratio would be 1 only if all tokens differ and 0 otherwise.
            ratio = len(set(lAllWords)) / float(nAllWords)
        instance.addFeature(self.type, featureNames[0], ratio)

    utils.save_features_to_file(cachePath, featureNames, self.iC, self.type)
def get_mean_mood(self):
    """Add emotion-score features based on ``self.depecheMood``.

    Tokens are POS-tagged with ``nltk.pos_tag``; tags are mapped through
    ``self.getDepecheMoodPos`` (tokens mapped to ``None`` are skipped),
    the token is lemmatized with WordNet and looked up in
    ``self.depecheMood`` under the key ``"<lemma>#<pos>"``. For every
    entry found, its eight emotion scores are accumulated.

    Features written per instance, in this order:

    * ``<type>_TokenRatio<Emotion>``: summed score / number of tokens
    * ``<type>_EmotionRatio``: matched tokens / number of tokens
    * ``<type>_EmotionRatio<Emotion>``: summed score / matched tokens

    A ratio is ``0.0`` whenever its denominator is zero.

    If the feature file for this function already exists it is loaded via
    ``utils.load_features_from_file`` and nothing is recomputed; otherwise
    the result is stored with ``utils.save_features_to_file``.
    """
    # (key inside a depecheMood entry, suffix used in the feature names)
    emotions = [
        ("afraid", "Afraid"),
        ("amused", "Amused"),
        ("angry", "Angry"),
        ("annoyed", "Annoyed"),
        ("dont_care", "DontCare"),
        ("happy", "Happy"),
        ("inspired", "Inspired"),
        ("sad", "Sad"),
    ]
    tokenRatioNames = [
        self.type + "_TokenRatio" + label for _, label in emotions]
    emotionRatioNames = [
        self.type + "_EmotionRatio" + label for _, label in emotions]
    featureNames = (
        tokenRatioNames + [self.type + "_EmotionRatio"] + emotionRatioNames)

    functionName = "get_mean_mood"
    cachePath = self.iC.featurePath + self.modelName + "_" + functionName
    if os.path.isfile(cachePath):
        utils.load_features_from_file(cachePath, self.iC, self.type)
        print("loaded " + functionName)
        return

    lmtzr = WordNetLemmatizer()
    for instance in self.iC.instances:
        tokens = instance.tokens
        totalTokens = len(tokens)
        totals = dict((key, 0.0) for key, _ in emotions)
        totalEmotionTokens = 0

        for token, tag in nltk.pos_tag(tokens):
            pos = self.getDepecheMoodPos(tag)
            if pos is None:
                continue
            if pos == "v":
                lemma = lmtzr.lemmatize(token, "v")
            else:
                lemma = lmtzr.lemmatize(token)
            idx = lemma.lower() + "#" + pos
            # Direct membership test; ".keys()" would build a list of all
            # keys for every token under Python 2.
            if idx not in self.depecheMood:
                continue
            scores = self.depecheMood[idx]
            totalEmotionTokens += 1
            for key, _ in emotions:
                totals[key] += float(scores[key])

        # Score totals are floats, so these divisions are already exact.
        for (key, _), name in zip(emotions, tokenRatioNames):
            value = totals[key] / totalTokens if totalTokens > 0 else 0.0
            instance.addFeature(self.type, name, value)

        ratioEmotionTokens = 0.0
        if totalTokens > 0:
            # Both operands are ints: float() avoids Python 2 integer
            # division truncating this ratio to 0 or 1.
            ratioEmotionTokens = totalEmotionTokens / float(totalTokens)
        instance.addFeature(self.type, self.type + "_EmotionRatio",
                            ratioEmotionTokens)

        for (key, _), name in zip(emotions, emotionRatioNames):
            value = 0.0
            if totalEmotionTokens > 0:
                value = totals[key] / totalEmotionTokens
            instance.addFeature(self.type, name, value)

    utils.save_features_to_file(cachePath, featureNames, self.iC, self.type)