    def start_(self):
        dataset = Dataset('fourforums',annotation_list=['qr_dependencies', 'topic'])
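        # Walk the mechanical-turk-annotated fourforums discussions, skipping any
        # discussion that carries no quote-response metadata.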
        for discussion in dataset.get_discussions(annotation_label='mechanical_turk'):
            if 'qr_meta' not in discussion.annotations['mechanical_turk']: continue
            topic = discussion.annotations['topic']
            for post in discussion.get_posts():
                result = self.extract_(post, topic)
                if result:
                    self.feature_vectors.append(result)
    def generate_features(self):
        dataset = Dataset('convinceme',annotation_list=['topic','dependencies','used_in_wassa2011', 'side'])
        directory = "{}/convinceme/output_by_thread".format(data_root_dir)
        for discussion in dataset.get_discussions(annotation_label='topic'):
            if self.topic != discussion.annotations['topic']:
                continue
            for post in discussion.get_posts():

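                # One feature vector per post; the classification feature stores the
                # author's side on the discussion topic, which is used as the classification label.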
                feature_vector = defaultdict(int)
                post.discussion_id = discussion.id
                post.topic_side = get_topic_side(discussion, post.side)
                post.key = str((discussion.id,post.id))
                feature_vector[self.classification_feature] = post.topic_side
                try:

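                    # Load the precomputed POS tags, parse tree, dependencies, and ident data
                    # for this post from its per-thread JSON file.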
                    json_file = "{}/{}/{}.json".format(directory, discussion.id, post.id)
                    with open(json_file, 'r') as json_fh:
                        pos, parsetree, dep, ident = json.load(json_fh)
                    result = sorted(feat_vect(dep, pos, feature_vector), key=operator.itemgetter(0))
                    try:
                        text = TextObj(post.text.decode('utf-8', 'replace'))
                    except Exception:
                        # Skip posts whose text cannot be wrapped in a TextObj.
                        continue

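                    # Keep the sorted tuples from feat_vect; uni_from_boundaries below
                    # treats them as text boundaries when extracting unigrams.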
                    self.bounds.add(discussion_id=discussion.id, post_id=post.id, text=text.text, tuples=result)
                    
                    uni_from_boundaries(text.text, result, feature_vector)

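                    # Unigram features are added in two variants: lowercased and case-preserving.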
                    dependency_list = None if 'dependencies' not in post.annotations else post.annotations['dependencies']
                    if 'unigram' in self.features:
                        ngrams_from_text(text.text.lower(), feature_vector, prefix="uni_lower_", n=1, style='float')
                        ngrams_from_text(text.text, feature_vector, prefix="uni_caps_", n=1, style='float')
                    feats = set(self.features).difference(set(['unigram']))
                    get_features_by_type(feature_vector=feature_vector, features=feats, text_obj=text, dependency_list=dependency_list)

                    
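                    # Posts without dependency annotations are dropped from here on,
                    # so their feature vectors are never appended.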
                    if dependency_list is None: continue
                    if 'dependencies' in self.features:
                        get_dependency_features(feature_vector, dependency_list, generalization='opinion')

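                    # With DELETE_QUOTE set, delete the 'quote:' counterpart of every unigram
                    # feature so quoted text is not counted alongside the author's own words.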
                    if DELETE_QUOTE:
                        unigrams = map(lambda x: x[8:], filter(lambda x: x.startswith('unigram:'), feature_vector.keys()))
                        for unigram in unigrams:
                            key = 'quote: {}'.format(unigram)
                            if key in feature_vector:
                                del feature_vector[key]

                    self.feature_vectors.append(feature_vector)

                except IOError:
                    # TODO: not every post has a parse saved, so the JSON file may be
                    # missing; skip those posts.
                    pass
    def start(self):
        dataset = Dataset('convinceme',annotation_list=['side', 'topic','dependencies','used_in_wassa2011'])
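        # Build per-topic feature vectors for every post in the convinceme discussions.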
        for discussion in dataset.get_discussions(annotation_label='topic'):
            topic = discussion.annotations['topic']
            for post in discussion.get_posts():
                post.discussion_id = discussion.id
                post.topic_side = get_topic_side(discussion, post.side)
                post.key = str((discussion.id,post.id))
             
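                # extract() returns one vector per feature set ('all', 'collapsed',
                # 'commitment'); file each under the discussion's topic.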
                result = self.extract(post, topic)
                if result:
                    self.feature_vectors_by_topic[topic].append(result['all'])
                    self.collapsed_vectors[topic].append(result['collapsed'])
                    self.commitment_vectors[topic].append(result['commitment'])
     
        for topic in self.feature_vectors_by_topic.keys():
            print '{topic} has {length} elements.'.format(topic=topic, length=len(self.feature_vectors_by_topic[topic]))
    
        _feats = {'all': self.feature_vectors_by_topic,
                  'commitment': self.commitment_vectors,
                  'collapsed': self.collapsed_vectors,}
        fd = open('results', 'w')
        print 'Experimental Results:'
        fd.write('Experimental Results:\n')
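        # For each topic, write the three feature sets out as ARFF files and report
        # 10-fold cross-validation accuracy from Weka for each one.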
        for topic in ['evolution','gay marriage', 'existence of god', 'abortion']:
            for featureset, vectors in _feats.iteritems():
                self.write(vectors=vectors, topic=topic, featureset=featureset)
            classifiers = ['weka.classifiers.functions.SMO']  # alternative: 'weka.classifiers.bayes.NaiveBayes'
            arff_folder = 'arffs/{topic}/'.format(topic=topic)
            arffs = ['all.arff', 'collapsed.arff', 'commitment.arff']
            print '**RESULTS: {topic}'.format(topic=topic)
            fd.write('**RESULTS: {topic}\n'.format(topic=topic))
            for arff in arffs:
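                # cross_validate returns one record per classified instance; the 'right?'
                # flag marks whether that instance was labeled correctly.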
                results = defaultdict(dict)  # run -> classifier -> featureset -> results
                for classifier_name in classifiers:
                    run_results = weka_interface.cross_validate(arff_folder, [arff], classifier_name=classifier_name, classification_feature=self.label, n=10)
                    right = sum([1 for entry in run_results.values() if entry['right?']])
                    run_accuracy = right / float(len(run_results))
                    print '\t{arff}: accuracy - {accuracy}'.format(arff=arff, accuracy=run_accuracy)
                    fd.write('\t{arff}: accuracy - {accuracy}\n'.format(arff=arff, accuracy=run_accuracy))
        fd.close()