def get(self):
    try:
        limit = int(self.get_argument("limit", 10))
    except ValueError:
        raise tornado.web.HTTPError(400)

    query = {}

    # Search for keywords after stemming if supplied
    keywords = self.get_argument("keywords", None)
    if keywords:
        words = [k.lower() for k in keywords.split(",")]
        words = [ClassificationObject.stem(w) for w in words]
        query["stemmed_keywords"] = {"$all": words}

    # Search for tags if supplied
    tags = self.get_argument("tags", None)
    if tags:
        tags = [t.lower() for t in tags.split(",")]
        query["tags"] = {"$all": tags}
    else:
        # Otherwise filter by tagged or untagged
        tagged = self.get_argument("tagged", False)
        if tagged:
            query["tags"] = {"$ne": []}
        else:
            query["tags"] = []

    results = ClassificationObject.find(query=query, limit=limit,
                                        sort=[("last_modified", pymongo.DESCENDING)])
    dicts = [c.to_dict() for c in results]
    json = simplejson.dumps(dicts, default=nosy.util.json_serializer)

    self.set_header("Content-Type", "application/json")
    self.write(json)
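# A hedged usage sketch for the handler above: the URL route is an assumption
# (it is not shown here); only the query-string semantics come from the code.
#
#   GET /objects?limit=5                 -> 5 most recently modified untagged documents
#   GET /objects?tags=movie,comedy       -> documents carrying both tags
#   GET /objects?keywords=running,films  -> keyword search (terms are stemmed first)
#   GET /objects?tagged=1                -> documents with at least one tag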
def to_classification_object(cls, json):
    c = ClassificationObject()
    c.source = 'twitter'
    c.text = json['text']
    c.created_at = json['created_at']
    return c
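# A hedged usage sketch: the owning class is not shown above, so 'TweetHandler'
# is a placeholder name, and the payload is illustrative; 'text' and
# 'created_at' are the only fields the factory actually reads.
tweet = {'text': 'just saw an amazing film',
         'created_at': 'Mon Sep 24 03:35:21 +0000 2012'}
c = TweetHandler.to_classification_object(tweet)
c.process()  # stem/extract keywords, as elsewhere in this codebase
c.save()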
def delete(self, doc_id):
    try:
        doc_id = int(doc_id)
    except ValueError:
        raise tornado.web.HTTPError(400)

    ClassificationObject.remove({'_id': doc_id})

    json = simplejson.dumps({'success': True})
    self.set_header('Content-Type', 'application/json')
    self.write(json)
def put(self):
    try:
        doc_id = int(self.get_argument('id'))
    except ValueError:
        raise tornado.web.HTTPError(400, "Expecting integer value")

    tags = self.get_argument('tags', None)
    if not tags:
        # Without this guard the handler reported success without updating
        raise tornado.web.HTTPError(400, "Expecting a comma-separated 'tags' argument")
    tags = [t.lower() for t in tags.split(',')]

    # Update the tags for the classification object
    c = ClassificationObject.find_by_id(doc_id)
    if c:
        c.tags = tags
        c.save()
    else:
        raise tornado.web.HTTPError(
            404, "Could not find document with id %i" % doc_id)

    json = simplejson.dumps({
        'success': True,
        'message': "Updated document with id %i" % doc_id,
        'tags': tags
    })
    self.set_header('Content-Type', 'application/json')
    self.write(json)
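# A hedged request sketch for the handler above; host, port, and path are
# assumptions, while 'id' and 'tags' are the arguments the handler defines:
#
#   curl -X PUT 'http://localhost:8888/objects?id=42&tags=movie,comedy'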
def _get_from_db(cls):
    # Fetch every tagged object and emit one (keywords, tag) pair per tag
    objects = ClassificationObject.find({'tags': {'$ne': []}})
    data = []
    for c in objects:
        keywords = list(c.keywords)
        for tag in c.tags:
            data.append((keywords, tag))
    return data
def put(self, doc_id):
    try:
        doc_id = int(doc_id)
    except ValueError:
        raise tornado.web.HTTPError(400)

    tags = self.get_argument('tags', None)
    if not tags:
        # Without this guard the handler reported success without updating
        raise tornado.web.HTTPError(400)
    tags = [t.lower() for t in tags.split(',')]

    # Update the tags for the classification object
    c = ClassificationObject.find_by_id(doc_id)
    if c:
        c.tags = tags
        c.save()
    else:
        raise tornado.web.HTTPError(404)

    json = simplejson.dumps({'success': True})
    self.set_header('Content-Type', 'application/json')
    self.write(json)
def put(self): try: doc_id = int(self.get_argument("id")) except ValueError: raise tornado.web.HTTPError(400, "Expecting integer value") tags = self.get_argument("tags", None) if tags: tags = map(lambda t: t.lower(), tags.split(",")) # update the tags for classification object c = ClassificationObject.find_by_id(doc_id) if c: c.tags = tags c.save() else: raise tornado.web.HTTPError(404, "Could not find document with id %i" % doc_id) json = simplejson.dumps({"success": True, "message": "Updated document with id %i" % doc_id, "tags": tags}) self.set_header("Content-Type", "application/json") self.write(json)
def get(self):
    tags = ClassificationObject.tags()
    json = simplejson.dumps({'tags': tags})
    self.set_header("Content-Type", "application/json")
    self.write(json)
import os

from nosy.model import ClassificationObject

# Clear all previously tagged objects before loading the corpus
# (the query removes every tagged document, not only those tagged 'movie')
ClassificationObject.remove({'tags': {'$ne': []}})

POS_DIR = 'txt_sentoken/pos'
NEG_DIR = 'txt_sentoken/neg'


def line_iterator(directory):
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename)) as f:
            for line in f:
                yield line


def save_classification_object(line, tags):
    c = ClassificationObject()
    c.text = line
    c.process()
    c.tags = tags
    c.save()


# Positive reviews
for line in line_iterator(POS_DIR):
    save_classification_object(line, ['movie'])

# Negative reviews
for line in line_iterator(NEG_DIR):
    save_classification_object(line, ['movie'])
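# Note: the txt_sentoken/{pos,neg} layout matches the Cornell movie-review
# polarity dataset (distributed with nltk as the movie_reviews corpus). If
# that is indeed the source here, one way to fetch it locally:
#
#   import nltk
#   nltk.download('movie_reviews')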
class NaiveBayesClassifier(PersistentClassifier):

    TRAIN_RATIO = 0.75
    TEST_RATIO = 0.25

    TAGS = ClassificationObject.tags()

    # num(negative_features) = NEG_FEATURE_MULTIPLIER * num(positive_features)
    NEG_FEATURE_MULTIPLIER = 10

    # UTILITIES
    @classmethod
    def _to_feature(cls, classification_object):
        # Represent a document as a bag-of-words dict over its keywords
        bag_of_words = {}
        for w in classification_object.keywords:
            bag_of_words[w] = True
        return bag_of_words

    # FEATURES
    def load_features(self):
        self.features = {}
        for tag in self.TAGS:
            self.features[tag] = []

            # Positive features
            tagged = ClassificationObject.find({'tags': tag})
            for c in tagged:
                bag_of_words = self._to_feature(c)
                positive_feature = (bag_of_words, tag)
                self.features[tag].append(positive_feature)

            # Negative features - limited to NEG_FEATURE_MULTIPLIER times
            # the number of positive features
            untagged_limit = self.NEG_FEATURE_MULTIPLIER * len(self.features[tag])
            untagged = ClassificationObject.find({'tags': {'$ne': tag}},
                                                 limit=untagged_limit)
            for c in untagged:
                bag_of_words = self._to_feature(c)
                negative_feature = (bag_of_words, "!" + tag)
                self.features[tag].append(negative_feature)

    def _split_features(self):
        if not hasattr(self, 'features'):
            self.load_features()

        self.train_sets = {}
        self.test_sets = {}
        for tag, features in self.features.iteritems():
            random.shuffle(features)
            split_index = int(self.TRAIN_RATIO * len(features))
            self.train_sets[tag] = features[:split_index]
            self.test_sets[tag] = features[split_index:]

    # TRAINING
    def train(self):
        if not hasattr(self, 'train_sets'):
            self._split_features()

        # One binary classifier per tag: tag vs. "!" + tag
        self.classifiers = {}
        for tag, features in self.train_sets.iteritems():
            self.classifiers[tag] = NltkNaiveBayes.train(features)

    # TESTING
    def test(self):
        if not hasattr(self, 'test_sets'):
            self._split_features()
        if not hasattr(self, 'classifiers'):
            self.train()

        result = {}
        for tag, classifier in self.classifiers.iteritems():
            result[tag] = accuracy(classifier, self.test_sets[tag])
        return result

    def show_high_information_words(self, tag, n=10):
        if not hasattr(self, 'classifiers'):
            self.train()
        self.classifiers[tag].show_most_informative_features(n)

    # CLASSIFYING
    def classify(self, c):
        if not hasattr(self, 'classifiers'):
            self.train()

        feat = self._to_feature(c)
        result = {}
        for tag, classifier in self.classifiers.iteritems():
            result[tag] = classifier.prob_classify(feat).prob(tag)
        c.tags.update(result)
        return result

    def classify_text(self, text):
        if not hasattr(self, 'classifiers'):
            self.train()

        # Extract keywords the same way we do when learning
        c = ClassificationObject()
        c.text = text
        c.process()
        return self.classify(c)
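# A minimal usage sketch, assuming a populated database and this module's own
# imports (random, NltkNaiveBayes, accuracy); the 'movie' tag is the one the
# corpus-loading script above assigns.
if __name__ == '__main__':
    nb = NaiveBayesClassifier()
    nb.train()       # load features, split train/test, fit one classifier per tag
    print nb.test()  # per-tag accuracy on the held-out TEST_RATIO split
    print nb.classify_text("an unforgettable film with a brilliant cast")
    nb.show_high_information_words('movie', n=10)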