def extract_(self, post, topic, features=None):
    """Extract a single partition-labelled unigram feature vector for *post*.

    Older variant of ``extract`` (note the trailing underscore): it builds
    one combined vector instead of the all/collapsed/commitment triple.

    :param post: object exposing ``text`` and ``id``.
    :param topic: key into ``self.by_topic`` for per-topic counting.
    :param features: optional iterable of feature names; names missing from
        ``_featlists`` are dropped.  Empty/``None`` means "all features".
    :returns: ``defaultdict(int)`` mapping ``"<partition-label>:<stem>"`` to
        a count, or ``None`` when the post is rejected (fewer than two
        categories observed for the topic, or no boundaries found).
    """
    # None default avoids the shared-mutable-default pitfall of features=[].
    if not features:
        features = list(_featlists.keys())
    else:
        features = [f for f in features if f in _featlists]
    # span := [span-id, [start, stop, {"category": (major, minor),
    #          "environment_indicators": [(word, occurrence_id)]}]]
    spans = []
    for featpair in features:
        word_pairs, phrase_pairs = _featlists[featpair]
        occ_i = 0
        for word, rx in word_pairs:
            position = 0
            # FIX: was re.split(r'\.!?', ...), which splits only on "." or
            # ".!"; the character class splits on any sentence terminator.
            for sentence in re.split(r'[.!?]', post.text):
                # FIX: was rx.match(post.text), which re-matched the whole
                # post once per sentence (duplicate spans, offsets shifted by
                # the accumulated position).  Matching the sentence makes
                # position + match.start() a valid post-level offset.
                match = rx.match(sentence)
                if match:
                    spans.append(["%s-%s" % (str(post.id), len(spans) + 1),
                                  [position + match.start(),
                                   position + match.end(),
                                   {"category": featpair,
                                    "environment_indicators":
                                        [(word, str(occ_i))]}]])
                    occ_i += 1
                # FIX: advance the offset for every sentence (+1 for the
                # consumed delimiter), not only when a match was found.
                position += len(sentence) + 1
        # Phrase matches carry no occurrence counter (indicator id is -1).
        for phrase, rx in phrase_pairs:
            for m in rx.finditer(post.text):
                spans.append(["%s-%s" % (str(post.id), len(spans) + 1),
                              [m.start(),
                               m.end(),
                               {"category": featpair,
                                "environment_indicators":
                                    [(m.group(0), -1)]}]])
    # Count each distinct (major, minor) category once per post.
    for category in set(span[1][2]['category'] for span in spans):
        self.freq[category] += 1
        self.by_topic[topic][category] += 1
    # Reject until at least two categories have been seen for this topic.
    if len(self.by_topic[topic]) < 2:
        return
    #self.by_side[topic][post.topic_side][span] += 1

    tuples = [(span[1][0], span[1][1], '-'.join(span[1][2]['category']))
              for span in spans]
    b = Boundaries()
    b.initializeFromTuples(tuples)
    if len(b.boundaries) == 0:
        return
    self.by_topic[topic][POSTS_KEY] += 1
    # Walk up to the largest stop offset among the span tuples.
    b.walk(1, max(tuples, key=operator.itemgetter(1)))
    feature_vector = defaultdict(int)
    for partition in b.partitions[:-1]:
        # Stemmed, lower-cased unigrams of the partition's text slice.
        unigrams = [self.stemmer.stem(u.lower())
                    for u in re.split(r'\W', post.text[partition[0]:partition[1]])]
        for unigram in unigrams:
            feature_vector['{}:{}'.format(partition[2], unigram)] += 1
    return feature_vector
    def extract(self, post, topic, features=None):
        """Extract three partition-labelled unigram feature vectors for *post*.

        :param post: object exposing ``text``, ``id`` and ``topic_side``.
        :param topic: key into the per-topic counting tables.
        :param features: optional iterable of feature names; names missing
            from ``_featlists`` are dropped.  Empty/``None`` means "all".
        :returns: dict with keys ``'all'``, ``'collapsed'`` and
            ``'commitment'`` mapping to token-frequency-normalised
            ``defaultdict(int)`` vectors (each tagged with the post's
            ``topic_side`` under ``self.label``), or ``None`` when the post
            is rejected (fewer than two distinct environments, or no
            boundaries found).
        """
        # None default avoids the shared-mutable-default pitfall of features=[].
        if not features:
            features = list(_featlists.keys())
        else:
            features = [f for f in features if f in _featlists]
        # span := [span-id, [start, stop, {"category": (major, minor),
        #          "environment_indicators": [(word, occurrence_id)]}]]
        spans = []
        for featpair in features:
            word_pairs, phrase_pairs = _featlists[featpair]
            occ_i = 0
            for word, rx in word_pairs:
                position = 0
                # FIX: was re.split(r'\.!?', ...), which splits only on "."
                # or ".!"; the character class splits on any terminator.
                for sentence in re.split(r'[.!?]', post.text):
                    # FIX: was rx.match(post.text), which re-matched the
                    # whole post once per sentence (duplicate spans, offsets
                    # shifted by the accumulated position).  Matching the
                    # sentence makes position + match.start() a post offset.
                    match = rx.match(sentence)
                    if match:
                        spans.append(["%s-%s" % (str(post.id), len(spans) + 1),
                                      [position + match.start(),
                                       position + match.end(),
                                       {"category": featpair,
                                        "environment_indicators":
                                            [(word, str(occ_i))]}]])
                        occ_i += 1
                    # FIX: advance the offset for every sentence (+1 for the
                    # consumed delimiter), not only when a match was found.
                    position += len(sentence) + 1
            # Phrase matches carry no occurrence counter (indicator id -1).
            for phrase, rx in phrase_pairs:
                for m in rx.finditer(post.text):
                    spans.append(["%s-%s" % (str(post.id), len(spans) + 1),
                                  [m.start(),
                                   m.end(),
                                   {"category": featpair,
                                    "environment_indicators":
                                        [(m.group(0), -1)]}]])
        self.by_topic[topic][POSTS_KEY] += 1
        environments = set()
        # Count each distinct (major, minor) category once per post; the
        # major part doubles as the "environment" identifier.
        for category in set(span[1][2]['category'] for span in spans):
            self.freq[category] += 1
            self.by_topic[topic][category] += 1
            self.by_side[topic][post.topic_side][category] += 1
            environments.add(category[0]) #XXX TODO need to make sure that the environments we are checking for are in the top three as far as probabiltiy goes
        # Track the distribution of environment counts per topic.
        self.environments_topic[topic].update([len(environments)])
        if len(environments) < 2:
            return

        tuples = [(span[1][0], span[1][1], '-'.join(span[1][2]['category']))
                  for span in spans]
        b = Boundaries()
        b.initializeFromTuples(tuples)
        if len(b.boundaries) == 0:
            return
        # Walk up to the largest stop offset among the span tuples.
        b.walk(1, max(tuples, key=operator.itemgetter(1)))
        fv_all = defaultdict(int)
        fv_collapsed = defaultdict(int)
        fv_commitment = defaultdict(int)
        tokens = 0
        for partition in b.partitions[:-1]:
            # FIX: list comprehension instead of map() so len(unigrams)
            # below works on Python 3 as well (map returns an iterator).
            unigrams = [self.stemmer.stem(u.lower())
                        for u in re.split(r'\W', post.text[partition[0]:partition[1]])]
            tokens += len(unigrams)
            for _label in set(partition[2].split()):
                for unigram in unigrams:
                    fv_commitment['{}:{}'.format(_label, unigram)] += 1
                    # NOTE(review): 'none' maps to "commitment" and anything
                    # else to "non_commitment" -- looks inverted; confirm
                    # against the labelling scheme before changing.
                    if _label == 'none':
                        fv_collapsed['commitment:{unigram}'.format(unigram=unigram)] += 1
                    else:
                        fv_collapsed['non_commitment:{unigram}'.format(unigram=unigram)] += 1
            for unigram in unigrams:
                fv_all['unigram_{unigram}'.format(unigram=unigram)] += 1
        # Normalise every vector by total token count, then tag the side.
        for fv in (fv_all, fv_commitment, fv_collapsed):
            for key in fv:
                fv[key] /= float(tokens)
            fv[self.label] = post.topic_side
        return {'all': fv_all, 'collapsed': fv_collapsed, 'commitment': fv_commitment}