def untrain(request, key):
    classifier = docclass.naivebayes(docclass.entryfeatures)
    entry = Entry.get(key)
    if request.method == 'POST':
        # Drop the cached entry list for the category we are about to clear.
        cache.delete('%s_entries' % entry.cat_ref.category)
        classifier.untrain(entry)
        entry.cat_ref = None
        entry.is_trained = False
        entry.save()
        return JSONResponse({'success': 1})
    raise Http404()
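# The views in this module return JSON through a JSONResponse helper that is
# defined elsewhere.  A minimal sketch of such a helper, assuming it simply
# serializes a dict (this implementation is an assumption, not the project's
# actual code):
from django.http import HttpResponse
from django.utils import simplejson

def JSONResponse(data):
    # Serialize the dict and hand it back with a JSON content type.
    return HttpResponse(simplejson.dumps(data), mimetype='application/json')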
def guess(request, key):
    classifier = docclass.naivebayes(docclass.entryfeatures)
    # Set per-category classification thresholds.
    classifier.setthreshold('Google', 2.0)
    classifier.setthreshold('Apple', 2.0)
    classifier.setthreshold('Microsoft', 2.0)
    entry = Entry.get(key)
    category = classifier.classify(entry, 'unknown')
    processingtime = classifier.get_processingtime()
    return JSONResponse({'category': category,
                         'processingtime': processingtime})
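# Hypothetical usage of the guess view through Django's test client, assuming
# it is mapped to a URL such as /entry/<key>/guess/ (the URL pattern and the
# sample output are illustrative only):
from django.test.client import Client

def demo_guess(entry_key):
    c = Client()
    response = c.get('/entry/%s/guess/' % entry_key)
    # The body is JSON, e.g. {"category": "Google", "processingtime": 0.12}
    return response.content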
def train(request, key):
    classifier = docclass.naivebayes(docclass.entryfeatures)
    entry = Entry.get(key)
    category = ""
    if request.method == 'POST':
        if request.POST.get('word'):
            # Train the classifier with the submitted label and link the
            # entry to the resulting category count.
            category_count = classifier.train(entry, request.POST.get('word'))
            entry.cat_ref = category_count
            entry.is_trained = True
            entry.save()
            category = category_count.category
            cache.delete('%s_entries' % category)
    return JSONResponse({'category': category})
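# The classifier used above lives in the docclass module, which is not shown
# here.  A stub of the interface these views rely on, reconstructed from the
# call sites (signatures are inferred; the bodies below are placeholders only):
class NaiveBayesInterface(object):
    def __init__(self, getfeatures):
        self.getfeatures = getfeatures      # e.g. docclass.entryfeatures

    def setthreshold(self, category, threshold):
        raise NotImplementedError

    def train(self, entry, category):
        raise NotImplementedError           # returns a category-count entity

    def untrain(self, entry):
        raise NotImplementedError           # undoes a previous train()

    def classify(self, entry, default):
        raise NotImplementedError           # returns a category name or default

    def getbestcat(self):
        raise NotImplementedError           # best category from the last classify()

    def get_processingtime(self):
        raise NotImplementedError           # time spent in the last classify()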
def refleshclassifier(request):
    classifier = docclass.naivebayes(docclass.getwords)
    classifier.testrun()
    return HttpResponse(mimetype='application/javascript')
def crawl(request):
    now = datetime.utcnow()
    # Pick up feeds scheduled for this hour, then narrow to the current
    # quarter-hour slot using each feed's execute_minute list.
    feed_obj = Feed.all().filter('execute_hour =', now.hour).fetch(50)
    if now.minute <= 15:
        feeds = [f for f in feed_obj for em in f.execute_minute if em <= 15]
    elif now.minute <= 30:
        feeds = [f for f in feed_obj for em in f.execute_minute if 15 < em <= 30]
    elif now.minute <= 45:
        feeds = [f for f in feed_obj for em in f.execute_minute if 30 < em <= 45]
    else:
        feeds = [f for f in feed_obj for em in f.execute_minute if em > 45]

    classifier = docclass.naivebayes(docclass.entryfeatures)
    classifier.setthreshold('Google', 2.0)
    classifier.setthreshold('Apple', 2.0)
    classifier.setthreshold('Microsoft', 2.0)

    categories = []
    for feed in feeds:
        d = feedparser.parse(feed.url)
        markup = "_crawledurls_%s" % str(feed.key())
        cached_urls = cache.get(markup, [])
        i = 0
        for e in d.entries[0:7]:
            try:
                # Skip entries we have already seen, either in the cache or
                # in the datastore.
                url_hash = hashlib.md5(e.link).hexdigest()
                if url_hash in cached_urls:
                    continue
                entry = Entry.all().filter('url_hash =', url_hash).get()
                if entry:
                    continue

                # summary
                if 'summary' in e:
                    summary = e.summary
                elif 'description' in e:
                    summary = e.description
                else:
                    summary = ''

                entry = Entry(feed_ref=feed.key(),
                              title=e.title,
                              url=e.link,
                              description=summary,
                              url_hash=url_hash)

                # Classify at most the first five new entries per feed
                # (skipped entirely in DEBUG).
                if not DEBUG and i < 5:
                    classifier.classify(entry, 'Unknown')
                    processing_time = classifier.get_processingtime()
                    logging.info('processing time >>> %s' % processing_time)
                    entry.cat_ref = classifier.getbestcat()
                    categories.append(entry.cat_ref.category)
                entry.save()

                # Record any category tags that came with the feed entry.
                if 'categories' in e:
                    for c in e.categories:
                        ec = EntryCategory(entry_ref=entry,
                                           orig_category=c[1],
                                           lower_category=c[1].lower())
                        ec.save()

                cached_urls.insert(0, url_hash)
                i += 1
            except StandardError, inst:
                logging.error('Failed to parse feed %s, %s' % (feed.url, inst))

        # Update the per-feed cache of crawled URLs, keeping the newest 50
        # hashes for one day.
        del cached_urls[50:]
        cache.set(markup, cached_urls, 86400)
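# crawl() is designed to be hit on a schedule: it selects the feeds whose
# execute_hour/execute_minute match the current UTC time.  A hypothetical
# urls.py wiring for these views (module path and URL patterns are
# assumptions, not taken from the project):
from django.conf.urls.defaults import patterns, url

urlpatterns = patterns('',
    url(r'^entry/(?P<key>[^/]+)/train/$', 'views.train'),
    url(r'^entry/(?P<key>[^/]+)/untrain/$', 'views.untrain'),
    url(r'^entry/(?P<key>[^/]+)/guess/$', 'views.guess'),
    url(r'^classifier/reflesh/$', 'views.refleshclassifier'),
    url(r'^crawl/$', 'views.crawl'),
)
# On App Engine, /crawl/ would typically be requested by a cron.yaml entry
# (for example "schedule: every 15 minutes"), so that each run handles the
# feeds assigned to the current time slot.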