コード例 #1
0
    def generate_features(self):
        """Build one feature vector per post for discussions on ``self.topic``.

        Iterates the 'convinceme' dataset, skipping discussions whose 'topic'
        annotation differs from ``self.topic``. For each remaining post it
        loads the saved parse from
        ``<data_root_dir>/convinceme/output_by_thread/<discussion.id>/<post.id>.json``,
        records the sorted boundary tuples in ``self.bounds`` and appends the
        finished vector to ``self.feature_vectors``.

        As a side effect each visited post gets ``discussion_id``,
        ``topic_side`` and ``key`` attributes assigned.

        A post contributes no feature vector when:
          * an IOError is raised while processing it (e.g. the parse file is
            missing),
          * wrapping its text in ``TextObj`` raises any exception, or
          * it has no 'dependencies' annotation (note that in this case the
            boundaries have already been added to ``self.bounds``).

        Returns:
            None. Results are accumulated on ``self``.
        """
        dataset = Dataset(
            'convinceme',
            annotation_list=['topic', 'dependencies', 'used_in_wassa2011', 'side'])
        directory = "{}/convinceme/output_by_thread".format(data_root_dir)

        for discussion in dataset.get_discussions(annotation_label='topic'):
            if self.topic != discussion.annotations['topic']:
                continue

            for post in discussion.get_posts():
                feature_vector = defaultdict(int)
                post.discussion_id = discussion.id
                post.topic_side = get_topic_side(discussion, post.side)
                post.key = str((discussion.id, post.id))
                # The class label is stored in the vector alongside the features.
                feature_vector[self.classification_feature] = post.topic_side

                try:
                    json_file = "{}/{}/{}.json".format(directory, discussion.id, post.id)
                    # Context manager so the handle is closed even if parsing fails.
                    with open(json_file, 'r') as parse_file:
                        # The file must hold exactly four items; only the POS
                        # and dependency items are used below.
                        pos, _parsetree, dep, _ident = json.load(parse_file)
                    result = sorted(feat_vect(dep, pos, feature_vector),
                                    key=operator.itemgetter(0))

                    try:
                        # NOTE(review): .decode implies post.text is a byte
                        # string (Python 2 str) -- confirm before porting.
                        text = TextObj(post.text.decode('utf-8', 'replace'))
                    except Exception:
                        # Deliberate best-effort: skip posts whose text cannot
                        # be decoded or wrapped.
                        continue

                    self.bounds.add(discussion_id=discussion.id, post_id=post.id,
                                    text=text.text, tuples=result)

                    uni_from_boundaries(text.text, result, feature_vector)

                    if 'dependencies' in post.annotations:
                        dependency_list = post.annotations['dependencies']
                    else:
                        dependency_list = None

                    if 'unigram' in self.features:
                        ngrams_from_text(text.text.lower(), feature_vector,
                                         prefix="uni_lower_", n=1, style='float')
                        ngrams_from_text(text.text, feature_vector,
                                         prefix="uni_caps_", n=1, style='float')

                    # Unigrams were handled above; delegate every other feature type.
                    feats = set(self.features).difference(['unigram'])
                    get_features_by_type(feature_vector=feature_vector, features=feats,
                                         text_obj=text, dependency_list=dependency_list)

                    # Posts without a dependency annotation are dropped entirely.
                    if dependency_list is None:
                        continue
                    if 'dependencies' in self.features:
                        get_dependency_features(feature_vector, dependency_list,
                                                generalization='opinion')

                    if DELETE_QUOTE:
                        # For every 'unigram:<w>' key, drop the matching
                        # 'quote: <w>' key. The list is fully built before any
                        # deletion, so the dict is never mutated mid-iteration.
                        prefix = 'unigram:'
                        unigrams = [key[len(prefix):] for key in list(feature_vector)
                                    if key.startswith(prefix)]
                        for unigram in unigrams:
                            quote_key = 'quote: {}'.format(unigram)
                            if quote_key in feature_vector:
                                del feature_vector[quote_key]

                    self.feature_vectors.append(feature_vector)

                except IOError:
                    # XXX TODO : we don't have all the parses saved apparently
                    # so this sometimes fails.
                    pass
コード例 #2
0
def uni_from_boundaries(text, boundaries, features):
    """Add label-prefixed unigram features for each boundary span of *text*.

    Args:
        text: Source string. Carriage returns are removed before slicing, so
            boundary offsets index into the '\\r'-free text.
        boundaries: Iterable of indexable items where item[0] and item[1] are
            the slice start/end and item[2] is a string label.
        features: Feature mapping passed through to ``ngrams_from_text``,
            which is expected to update it in place.

    For every boundary, ``ngrams_from_text`` is called twice with n=1 and
    style='float': once on the span as-is with prefix
    ``<LABEL>_uni_caps_`` and once on the lower-cased span with prefix
    ``<LABEL>_uni_lower_``, where ``<LABEL>`` is the upper-cased label.

    Returns:
        None.
    """
    # Strip carriage returns once instead of once per boundary.
    cleaned = text.replace('\r', '')

    # Resolve every span before emitting any feature so that a malformed
    # boundary fails before ``features`` is touched.
    labelled_spans = [(bound[2].upper(), cleaned[bound[0]:bound[1]])
                      for bound in boundaries]

    for label, span in labelled_spans:
        ngrams_from_text(span, features, prefix=label + "_uni_caps_",
                         n=1, style='float')
        ngrams_from_text(span.lower(), features, prefix=label + "_uni_lower_",
                         n=1, style='float')