Code example #1
File: corpus_handler.py Project: pascalc/nosy
    def get(self):
        try:
            limit = int(self.get_argument("limit", 10))
        except ValueError:
            raise tornado.web.HTTPError(400)

        query = {}

        # Search for keywords after stemming if supplied
        keywords = self.get_argument("keywords", None)
        if keywords:
            words = map(lambda k: k.lower(), keywords.split(","))
            words = map(lambda w: ClassificationObject.stem(w), words)
            query["stemmed_keywords"] = {"$all": words}

        # Search for tags if supplied
        tags = self.get_argument("tags", None)
        if tags:
            tags = map(lambda t: t.lower(), tags.split(","))
            query["tags"] = {"$all": tags}
        else:
            # Otherwise filter by tagged or untagged
            tagged = self.get_argument("tagged", False)
            if tagged:
                query["tags"] = {"$ne": []}
            else:
                query["tags"] = []

        results = ClassificationObject.find(query=query, limit=limit, sort=[("last_modified", pymongo.DESCENDING)])

        dicts = [c.to_dict() for c in results]
        json = simplejson.dumps(dicts, default=nosy.util.json_serializer)

        self.set_header("Content-Type", "application/json")
        self.write(json)
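
Note: these handlers are Python 2 code, where map() returns a list. Under Python 3, map() returns a lazy iterator that PyMongo's BSON encoder cannot serialize, so a version-independent sketch of the keyword filter (not taken from the project) would be:

words = [ClassificationObject.stem(k.lower()) for k in keywords.split(",")]
query["stemmed_keywords"] = {"$all": words}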
Code example #2
    def to_classification_object(cls, json):
        c = ClassificationObject()

        c.source = 'twitter'
        c.text = json['text']
        c.created_at = json['created_at']

        return c
Code example #3
File: tweet_harvester.py Project: byouloh/nosy
    def to_classification_object(cls, json):
        # Map the relevant fields of a raw tweet dict onto a new ClassificationObject
        c = ClassificationObject()
        
        c.source = 'twitter'
        c.text = json['text']
        c.created_at = json['created_at']

        return c
Code example #4
File: naive_bayes.py Project: antsemot/nosy
    def classify_text(self, text):
        if not hasattr(self, 'classifiers'): self.train()

        # Extract keywords like we do when learning
        c = ClassificationObject()
        c.text = text
        c.process()
        
        return self.classify(c)
Code example #5
File: corpus_handler.py Project: antsemot/nosy
    def delete(self, doc_id):
        try:
            doc_id = int(doc_id)
        except ValueError:
            raise tornado.web.HTTPError(400)

        ClassificationObject.remove({'_id': doc_id})

        json = simplejson.dumps({'success': True})
        self.set_header('Content-Type', 'application/json')
        self.write(json)
Code example #6
File: corpus_handler.py Project: pascalc/nosy
    def delete(self, doc_id):
        try:
            doc_id = int(doc_id)
        except ValueError:
            raise tornado.web.HTTPError(400)

        ClassificationObject.remove({"_id": doc_id})

        json = simplejson.dumps({"success": True})
        self.set_header("Content-Type", "application/json")
        self.write(json)
Code example #7
File: corpus_handler.py Project: antsemot/nosy
    def put(self):
        try:
            doc_id = int(self.get_argument('id'))
        except ValueError:
            raise tornado.web.HTTPError(400, "Expecting integer value")

        tags = self.get_argument('tags', None)
        if tags:
            tags = map(lambda t: t.lower(), tags.split(','))

        # update the tags for classification object
        c = ClassificationObject.find_by_id(doc_id)
        if c:
            c.tags = tags
            c.save()
        else:
            raise tornado.web.HTTPError(
                404, "Could not find document with id %i" % doc_id)

        json = simplejson.dumps({
            'success': True,
            'message': "Updated document with id %i" % doc_id,
            'tags': tags
        })
        self.set_header('Content-Type', 'application/json')
        self.write(json)
Code example #8
    def _get_from_db(cls):
        # Build (keywords, tag) training pairs from every tagged object
        objects = ClassificationObject.find({'tags': {'$ne': []}})
        data = []
        for c in objects:
            keywords = c.keywords
            tags = c.tags
            keywords = [w for w in keywords]
            for tag in tags:
                data.append((keywords, tag))
        return data
Code example #9
File: train.py Project: byouloh/nosy
    def _get_from_db(cls):
        # Build (keywords, tag) training pairs from every tagged object
        objects = ClassificationObject.find({'tags': {'$ne': []}})
        data = []
        for c in objects:
            keywords = c.keywords
            tags = c.tags
            keywords = [w for w in keywords]
            for tag in tags:
                data.append((keywords, tag))
        return data
Code example #10
File: naive_bayes.py Project: antsemot/nosy
    def load_features(self):
        self.features = {}
        for tag in self.TAGS: 
            self.features[tag] = []
            
            # Positive features
            tagged = ClassificationObject.find({'tags': tag})
            for c in tagged:
                bag_of_words = self._to_feature(c)
                positive_feature = (bag_of_words, tag)
                self.features[tag].append(positive_feature)

            # Negative features - capped at NEG_FEATURE_MULTIPLIER times
            # the number of positive features
            untagged_limit = self.NEG_FEATURE_MULTIPLIER * len(self.features[tag])
            untagged = ClassificationObject.find({'tags': {'$ne': tag}},
                limit=untagged_limit)
            for c in untagged:
                bag_of_words = {}
                for k in c.keywords: bag_of_words[k] = True

                negative_feature = (bag_of_words, "!" + tag)
                self.features[tag].append(negative_feature)
Code example #11
File: corpus_handler.py Project: antsemot/nosy
    def get(self):
        try:
            limit = int(self.get_argument('limit', 10))
        except ValueError:
            raise tornado.web.HTTPError(400)

        query = {}

        # Search for keywords after stemming if supplied
        keywords = self.get_argument('keywords', None)
        if keywords:
            words = map(lambda k: k.lower(), keywords.split(','))
            words = map(lambda w: ClassificationObject.stem(w), words)
            query['stemmed_keywords'] = {'$all': words}

        # Search for tags if supplied
        tags = self.get_argument('tags', None)
        if tags:
            tags = map(lambda t: t.lower(), tags.split(','))
            query['tags'] = {'$all': tags}
        else:
            # Otherwise filter by tagged or untagged
            tagged = self.get_argument('tagged', False)
            if tagged:
                query['tags'] = {'$ne': []}
            else:
                query['tags'] = []

        results = ClassificationObject.find(query=query,
                                            limit=limit,
                                            sort=[("last_modified",
                                                   pymongo.DESCENDING)])

        dicts = [c.to_dict() for c in results]
        json = simplejson.dumps(dicts, default=nosy.util.json_serializer)

        self.set_header("Content-Type", "application/json")
        self.write(json)
Code example #12
File: corpus_handler.py Project: joacar/nosy
    def put(self, doc_id):
        try:
            doc_id = int(doc_id)
        except ValueError:
            raise tornado.web.HTTPError(400)

        tags = self.get_argument('tags', None)
        if tags:
            tags = map(lambda t: t.lower(), tags.split(','))

        # update the tags for classification object
        c = ClassificationObject.find_by_id(doc_id)
        if c:
            c.tags = tags
            c.save()
        else:
            raise tornado.web.HTTPError(404)

        json = simplejson.dumps({'success': True})
        self.set_header('Content-Type', 'application/json')
        self.write(json)
Code example #13
File: corpus_handler.py Project: pascalc/nosy
    def put(self):
        try:
            doc_id = int(self.get_argument("id"))
        except ValueError:
            raise tornado.web.HTTPError(400, "Expecting integer value")

        tags = self.get_argument("tags", None)
        if tags:
            tags = map(lambda t: t.lower(), tags.split(","))

        # update the tags for classification object
        c = ClassificationObject.find_by_id(doc_id)
        if c:
            c.tags = tags
            c.save()
        else:
            raise tornado.web.HTTPError(404, "Could not find document with id %i" % doc_id)

        json = simplejson.dumps({"success": True, "message": "Updated document with id %i" % doc_id, "tags": tags})
        self.set_header("Content-Type", "application/json")
        self.write(json)
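
For reference, a hypothetical client call exercising this PUT handler; the host, port, and route are assumptions, not taken from the project (Tornado's get_argument reads form-encoded body parameters as well as query parameters):

import requests

# Update document 42 with two lowercased tags (id 42 and the route are made up)
resp = requests.put("http://localhost:8888/corpus",
                    data={"id": 42, "tags": "politics,finance"})
print(resp.json())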
Code example #14
File: corpus_handler.py Project: joacar/nosy
    def get(self):
        tags = ClassificationObject.tags()

        json = simplejson.dumps({'tags': tags})
        self.set_header("Content-Type", "application/json")
        self.write(json)
Code example #15
File: movie_review_inserter.py Project: byouloh/nosy
import os

from nosy.model import ClassificationObject

# Clear all tagged objects (anything with a non-empty tags list)
ClassificationObject.remove({'tags': {'$ne': [] } })

POS_DIR = 'txt_sentoken/pos'
NEG_DIR = 'txt_sentoken/neg'

def line_iterator(directory):
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename)) as f:
            for line in f:
                yield line

def save_classification_object(line, tags):
    c = ClassificationObject()
    c.text = line
    c.process()
    c.tags = tags
    c.save()

# Positive reviews
for line in line_iterator(POS_DIR):
    save_classification_object(line, ['movie'])

# Negative reviews
for line in line_iterator(NEG_DIR):
    save_classification_object(line, ['movie'])
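
txt_sentoken is the directory layout of the Cornell movie-review polarity dataset (pos/ and neg/ subdirectories of plain-text reviews). A quick sanity check after running this inserter, sketched under the assumption that ClassificationObject.find accepts a plain query dict as in the examples above:

# Count the stored objects carrying the 'movie' tag
tagged = ClassificationObject.find({'tags': 'movie'})
print(len(list(tagged)))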
Code example #16
def save_classification_object(line, tags):
    c = ClassificationObject()
    c.text = line
    c.process()
    c.tags = tags
    c.save()
Code example #17
import os

from nosy.model import ClassificationObject

# Clear all tagged objects (anything with a non-empty tags list)
ClassificationObject.remove({'tags': {'$ne': []}})

POS_DIR = 'txt_sentoken/pos'
NEG_DIR = 'txt_sentoken/neg'


def line_iterator(directory):
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename)) as f:
            for line in f:
                yield line


def save_classification_object(line, tags):
    c = ClassificationObject()
    c.text = line
    c.process()
    c.tags = tags
    c.save()


# Positive reviews
for line in line_iterator(POS_DIR):
    save_classification_object(line, ['movie'])

# Negative reviews
for line in line_iterator(NEG_DIR):
    save_classification_object(line, ['movie'])
Code example #18
File: corpus_handler.py Project: antsemot/nosy
    def get(self):
        tags = ClassificationObject.tags()

        json = simplejson.dumps({'tags': tags})
        self.set_header("Content-Type", "application/json")
        self.write(json)
Code example #19
File: naive_bayes.py Project: antsemot/nosy
class NaiveBayesClassifier(PersistentClassifier):
    TRAIN_RATIO = 0.75
    TEST_RATIO = 0.25

    TAGS = ClassificationObject.tags()

    TRAIN_SETS = {}
    TEST_SETS = {}

    # num(negative_features) = NEG_FEATURE_MULTIPLIER * num(positive_features)
    NEG_FEATURE_MULTIPLIER = 10

    # UTILITIES

    @classmethod
    def _to_feature(cls, classification_object):
        bag_of_words = {}
        for w in classification_object.keywords: bag_of_words[w] = True
        return bag_of_words

    # FEATURES

    def load_features(self):
        self.features = {}
        for tag in self.TAGS: 
            self.features[tag] = []
            
            # Positive features
            tagged = ClassificationObject.find({'tags': tag})
            for c in tagged:
                bag_of_words = self._to_feature(c)
                positive_feature = (bag_of_words, tag)
                self.features[tag].append(positive_feature)

            # Negative features - capped at NEG_FEATURE_MULTIPLIER times
            # the number of positive features
            untagged_limit = self.NEG_FEATURE_MULTIPLIER * len(self.features[tag])
            untagged = ClassificationObject.find({'tags': {'$ne': tag}},
                limit=untagged_limit)
            for c in untagged:
                bag_of_words = {}
                for k in c.keywords: bag_of_words[k] = True

                negative_feature = (bag_of_words, "!" + tag)
                self.features[tag].append(negative_feature)

    def _split_features(self):
        if not hasattr(self, 'features'): self.load_features()

        self.train_sets = {}
        self.test_sets = {}

        for tag, features in self.features.iteritems():
            random.shuffle(features)
            split_index = int(self.TRAIN_RATIO*len(features))

            self.train_sets[tag] = features[:split_index]
            self.test_sets[tag] = features[split_index:]

    # TRAINING

    def train(self):
        if not hasattr(self, 'train_sets'): self._split_features()

        self.classifiers = {}
        for tag, features in self.train_sets.iteritems():
            self.classifiers[tag] = NltkNaiveBayes.train(features)

    # TESTING

    def test(self):
        if not hasattr(self, 'test_sets'): self._split_features()
        if not hasattr(self, 'classifiers'): self.train()

        result = {}
        for tag, classifier in self.classifiers.iteritems():
            result[tag] = accuracy(classifier, self.test_sets[tag])
        return result

    def show_high_information_words(self, tag, n=10):
        if not hasattr(self, 'classifiers'): self.train()
        self.classifiers[tag].show_most_informative_features(n)

    # CLASSIFYING

    def classify(self, c):
        if not hasattr(self, 'classifiers'): self.train()

        feat = self._to_feature(c)
        result = {}
        for tag, classifier in self.classifiers.iteritems():
            result[tag] = classifier.prob_classify(feat).prob(tag)
        c.tags.update(result)
        # Return the per-tag probabilities so classify_text has a value to pass on
        return result

    def classify_text(self, text):
        if not hasattr(self, 'classifiers'): self.train()

        # Extract keywords like we do when learning
        c = ClassificationObject()
        c.text = text
        c.process()
        
        return self.classify(c)
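
A minimal driver sketch for the class above, assuming the Python 2 environment it targets and that the corpus actually contains a 'movie' tag (both assumptions, not taken from the project):

classifier = NaiveBayesClassifier()
classifier.train()                    # builds one NLTK classifier per tag
accuracies = classifier.test()        # {tag: accuracy} on the held-out sets
classifier.show_high_information_words('movie', n=5)
probs = classifier.classify_text("A gripping film with a strong cast")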
Code example #20
File: movie_review_inserter.py Project: byouloh/nosy
def save_classification_object(line, tags):
    c = ClassificationObject()
    c.text = line
    c.process()
    c.tags = tags
    c.save()