# Assumed module-level imports (not shown in this excerpt):
#   from collections import Counter
#   import feature_extractor
#   import word_category_counter

def get_local_features(self, text):
    """Extract surface-level features (uni/bigrams, initialisms, basic
    lengths, repeated punctuation, LIWC) from a raw text string."""
    feature_vector = dict()
    text_obj = TextObj(text)
    feature_extractor.get_ngrams(feature_vector, text_obj.tokens)       # unigrams
    feature_extractor.get_ngrams(feature_vector, text_obj.tokens, n=2)  # bigrams
    feature_extractor.get_initialisms(feature_vector, text_obj.tokens)
    feature_extractor.get_basic_lengths(feature_vector, text_obj.text,
                                        text_obj.sentences, text_obj.tokens)
    feature_extractor.get_repeated_punct(feature_vector, text_obj.text)
    feature_extractor.get_LIWC(feature_vector, text_obj.text)
    return feature_vector
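# measure_to_int is called in build_features below but is not defined in this
# excerpt; a minimal sketch, assuming feature names follow the
# 'unigram'/'bigram'/'trigram' convention used in the default feature list:
def measure_to_int(feat):
    # Map an n-gram feature name to its order n by stripping the 'gram' suffix.
    return {'uni': 1, 'bi': 2, 'tri': 3}[feat[:-4]]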
def build_features(self, feats=None, start=None, end=None):
    """Build features over the character span [start, end);
    feats=None selects the default feature set."""
    minIndex = self.occurrences[0].start
    maxIndex = self.occurrences[-1].end
    if start is None:
        start = minIndex
    if end is None:
        end = maxIndex
    assert minIndex <= start <= maxIndex, "Start index is out of bounds."
    assert minIndex <= end <= maxIndex, "End index is out of bounds."
    assert start <= end, "Start index is greater than end index."

    # First occurrence starting at or after `start`, and first occurrence
    # ending past `end`; the slice below excludes the latter.
    startInd = self.getIndex(self.occurrences, lambda x, y: x.start >= y, start)
    endInd = self.getIndex(self.occurrences, lambda x, y: x.end > y, end)
    if endInd is None:
        # No occurrence ends past `end`, so include them all
        # (was len(self.occurrences) - 1, which dropped the last occurrence).
        endInd = len(self.occurrences)
    occurrences = self.occurrences[startInd:endInd]
    text = self.text[start:end]
    tokens = [o.text for o in occurrences]

    # Dependencies fully inside the span, and those crossing its boundary.
    # The boundary test needs explicit parentheses: without them Python
    # chains the comparisons, which does not mean "exactly one endpoint
    # in range".
    feature_dependencies_in = [
        d for d in self.dependencies
        if startInd <= d.gov_index < endInd and startInd <= d.dep_index < endInd]
    feature_dependencies_boundary = [  # currently unused below
        d for d in self.dependencies
        if (startInd <= d.gov_index < endInd) != (startInd <= d.dep_index < endInd)]

    # MPQA &c
    if feats is None:
        self.features = dict()  # start fresh
        # default features
        feats = ['unigram', 'initialism', 'lengths', 'punctuation',
                 'quotes', 'liwc', 'dep']

    for feat in feats:
        if feat.endswith('gram'):
            n = measure_to_int(feat)
            feature_extractor.get_ngrams(self.features, tokens, n=n)
        elif feat.endswith('alism'):
            feature_extractor.get_initialisms(self.features, tokens,
                                              use_lowercase=True,
                                              finalism=(feat == 'finalism'))
        elif feat.startswith('lengths'):
            # Collect the sentences overlapping the span, clipping any
            # sentence that straddles a span boundary.
            sentences = []
            for i in range(len(self.sentstarts)):
                if self.sentstarts[i] > end:
                    break
                sStart = self.sentstarts[i]
                sEnd = self.sentends[i]
                if self.sentends[i] > start and self.sentstarts[i] < start:
                    sStart = start
                elif self.sentends[i] > end and self.sentstarts[i] < end:
                    sEnd = end
                sentences.append(self.text[sStart:sEnd])
            feature_extractor.get_basic_lengths(self.features, text,
                                                sentences, tokens)
        elif feat.startswith('punct'):
            feature_extractor.get_repeated_punct(self.features, text)
        elif feat.startswith('quot'):
            feature_extractor.get_quoted_terms(self.features, text)
        elif feat.lower() == 'liwc':
            # Aggregate per-occurrence LIWC counts, then normalize.
            text_scores = Counter()
            text_scores['Word Count'] = len(occurrences)
            for o in occurrences:
                text_scores.update(o.liwc)
            text_scores = word_category_counter.normalize(text_scores)
            for category, score in text_scores.items():
                self.features['LIWC:' + category] = score
        elif feat.lower() == 'dep':
            # Count governor-dependent relations fully inside the span.
            dep_scores = Counter()
            for d in feature_dependencies_in:
                dep_string = "%s(%s,%s)" % (d.relation,
                                            self.occurrences[d.gov_index].lemma,
                                            self.occurrences[d.dep_index].lemma)
                dep_scores[dep_string] += 1
            for dep, score in dep_scores.items():
                self.features['dep:' + dep] = score
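# getIndex is used in build_features above but is not defined in this excerpt;
# a minimal sketch, assuming it returns the index of the first item for which
# the comparator holds against `target`, or None if no item qualifies (this
# matches the None handling of the endInd lookup above):
def getIndex(self, items, cmp, target):
    for i, item in enumerate(items):
        if cmp(item, target):
            return i
    return None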