def start_(self):
    """Collect feature vectors from fourforums quote-response (Q/R) data."""
    dataset = Dataset('fourforums', annotation_list=['qr_dependencies', 'topic'])
    for discussion in dataset.get_discussions(annotation_label='mechanical_turk'):
        # Only discussions with quote-response metadata are usable here.
        if 'qr_meta' not in discussion.annotations['mechanical_turk']:
            continue
        topic = discussion.annotations['topic']
        for post in discussion.get_posts():
            result = self.extract_(post, topic)
            if result:
                self.feature_vectors.append(result)

def generate_features(self):
    """Build one feature vector per post for the configured topic."""
    dataset = Dataset('convinceme', annotation_list=['topic', 'dependencies', 'used_in_wassa2011', 'side'])
    directory = "{}/convinceme/output_by_thread".format(data_root_dir)
    for discussion in dataset.get_discussions(annotation_label='topic'):
        if self.topic != discussion.annotations['topic']:
            continue
        for post in discussion.get_posts():
            feature_vector = defaultdict(int)
            post.discussion_id = discussion.id
            post.topic_side = get_topic_side(discussion, post.side)
            post.key = str((discussion.id, post.id))
            feature_vector[self.classification_feature] = post.topic_side
            try:
                # Pre-computed parse for this post; see the format note
                # after this method.
                json_file = "{}/{}/{}.json".format(directory, discussion.id, post.id)
                with open(json_file, 'r') as fh:
                    pos, parsetree, dep, ident = json.load(fh)
                result = sorted(feat_vect(dep, pos, feature_vector), key=operator.itemgetter(0))
                try:
                    text = TextObj(post.text.decode('utf-8', 'replace'))
                except Exception:
                    continue
                self.bounds.add(discussion_id=discussion.id, post_id=post.id, text=text.text, tuples=result)
                uni_from_boundaries(text.text, result, feature_vector)
                dependency_list = post.annotations['dependencies'] if 'dependencies' in post.annotations else None
                if 'unigram' in self.features:
                    # Two unigram variants: case-folded and case-preserving.
                    ngrams_from_text(text.text.lower(), feature_vector, prefix="uni_lower_", n=1, style='float')
                    ngrams_from_text(text.text, feature_vector, prefix="uni_caps_", n=1, style='float')
                feats = set(self.features).difference(set(['unigram']))
                get_features_by_type(feature_vector=feature_vector, features=feats,
                                     text_obj=text, dependency_list=dependency_list)
                if dependency_list is None:
                    continue
                if 'dependencies' in self.features:
                    get_dependency_features(feature_vector, dependency_list, generalization='opinion')
                if DELETE_QUOTE:
                    # Drop quoted-text copies of any unigram features.
                    unigrams = [key[len('unigram:'):] for key in feature_vector.keys()
                                if key.startswith('unigram:')]
                    for unigram in unigrams:
                        key = 'quote: {}'.format(unigram)
                        if key in feature_vector:
                            del feature_vector[key]
                self.feature_vectors.append(feature_vector)
            except IOError:
                # TODO: not every post has a saved parse, so the JSON file
                # may be missing; skip those posts.
                pass

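# Note on the parse files consumed by generate_features() above: each
# <discussion_id>/<post_id>.json is unpacked as a four-element JSON array,
# (pos, parsetree, dep, ident). The sketch below is a hypothetical
# illustration of that contract only; the element shapes are assumptions,
# not the verified on-disk format produced by the parser.
#
#   [pos, parsetree, dep, ident] = [
#       [...],   # pos: part-of-speech tags for the post's tokens
#       "...",   # parsetree: serialized constituency parse
#       [...],   # dep: dependency tuples, fed to feat_vect()
#       "...",   # ident: identifier tying the parse back to the post
#   ]
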
def start(self):
    """Extract features per topic, write ARFF files, and cross-validate."""
    dataset = Dataset('convinceme', annotation_list=['side', 'topic', 'dependencies', 'used_in_wassa2011'])
    for discussion in dataset.get_discussions(annotation_label='topic'):
        topic = discussion.annotations['topic']
        for post in discussion.get_posts():
            post.discussion_id = discussion.id
            post.topic_side = get_topic_side(discussion, post.side)
            post.key = str((discussion.id, post.id))
            result = self.extract(post, topic)
            if result:
                # Keep the three feature-set variants separate per topic.
                self.feature_vectors_by_topic[topic].append(result['all'])
                self.collapsed_vectors[topic].append(result['collapsed'])
                self.commitment_vectors[topic].append(result['commitment'])
    for topic in self.feature_vectors_by_topic.keys():
        print '{topic} has {length} elements.'.format(topic=topic, length=len(self.feature_vectors_by_topic[topic]))
    _feats = {'all': self.feature_vectors_by_topic,
              'collapsed': self.collapsed_vectors,
              'commitment': self.commitment_vectors}
    fd = open('results', 'w')
    print 'Experimental Results:'
    fd.write('Experimental Results:\n')
    for topic in ['evolution', 'gay marriage', 'existence of god', 'abortion']:
        # Write one ARFF file per feature set, then cross-validate each.
        for featureset, vectors in _feats.iteritems():
            self.write(vectors=vectors, topic=topic, featureset=featureset)
        classifiers = ['weka.classifiers.functions.SMO']
        arff_folder = 'arffs/{topic}/'.format(topic=topic)
        arffs = ['all.arff', 'collapsed.arff', 'commitment.arff']
        print '**RESULTS: {topic}'.format(topic=topic)
        fd.write('**RESULTS: {topic}\n'.format(topic=topic))
        for arff in arffs:
            for classifier_name in classifiers:
                # 10-fold cross-validation via the Weka wrapper; accuracy is
                # the fraction of instances classified correctly.
                run_results = weka_interface.cross_validate(
                    arff_folder, [arff], classifier_name=classifier_name,
                    classification_feature=self.label, n=10)
                right = sum(1 for entry in run_results.values() if entry['right?'])
                run_accuracy = right / float(len(run_results))
                print '\t{arff}: accuracy - {accuracy}'.format(arff=arff, accuracy=run_accuracy)
                fd.write('\t{arff}: accuracy - {accuracy}\n'.format(arff=arff, accuracy=run_accuracy))
    fd.close()
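
# A minimal driver sketch, assuming the methods above belong to an
# experiment class, here called `Experiment` (a hypothetical name; the
# real class definition and entry point are not shown in this section),
# whose constructor initializes feature_vectors, feature_vectors_by_topic,
# collapsed_vectors, commitment_vectors, bounds, and label.
#
# if __name__ == '__main__':
#     experiment = Experiment()
#     experiment.start()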