Example #1
File: views.py Project: ryuic/newsy
def untrain(request, key):
    classifier = docclass.naivebayes(docclass.entryfeatures)
    entry = Entry.get(key)

    if request.method == 'POST':
        # Invalidate the cached entry list for the entry's current category,
        # then untrain the classifier and detach the entry.
        cache.delete('%s_entries' % entry.cat_ref.category)
        classifier.untrain(entry)
        entry.cat_ref = None
        entry.is_trained = False
        entry.save()
        return JSONResponse({'success' : 1})
    raise Http404()
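
These view snippets share module-level setup that the excerpt leaves out. Below is a minimal sketch of what views.py presumably imports, plus one possible JSONResponse helper; only the name JSONResponse and the dict it receives appear in the examples, so the import paths and the helper body are assumptions, not code from ryuic/newsy.

import hashlib
import json        # or django.utils.simplejson on very old runtimes
import logging
from datetime import datetime

import feedparser
from django.core.cache import cache             # assumed: a Django-style cache with get(key, default)
from django.http import HttpResponse, Http404

import docclass
from models import Entry, Feed, EntryCategory   # hypothetical module path
from settings import DEBUG                      # hypothetical; crawl() reads a bare DEBUG flag


def JSONResponse(data):
    # Hypothetical helper: serialize the dict and return it as JSON
    # (mimetype= matches the old Django API used elsewhere in these examples).
    return HttpResponse(json.dumps(data), mimetype='application/json')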
Example #2
File: views.py Project: ryuic/newsy
def guess(request, key):
    classifier = docclass.naivebayes(docclass.entryfeatures)
    classifier.setthreshold('Google', 2.0)
    classifier.setthreshold('Apple', 2.0)
    classifier.setthreshold('Microsoft', 2.0)

    entry = Entry.get(key)

    category = classifier.classify(entry, 'unknown')
    processingtime = classifier.get_processingtime()

    return JSONResponse({
        'category' : category,
        'processingtime' : processingtime
        })
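
The setthreshold calls make the classifier reluctant to assign those categories: in the naive Bayes implementation this interface appears to follow (the docclass module from Programming Collective Intelligence), a category is returned only if its probability beats every rival by the configured factor, otherwise the default ('unknown' here) comes back. The project's own docclass is not shown, so the following is only a self-contained sketch of that decision rule:

def pick_category(probs, thresholds, default='unknown'):
    # probs: {'Google': 0.5, 'Apple': 0.3, ...}; thresholds: {'Google': 2.0, ...}
    best = max(probs, key=probs.get)
    factor = thresholds.get(best, 1.0)
    for cat, p in probs.items():
        # The winner must beat each rival by its threshold factor.
        if cat != best and p * factor > probs[best]:
            return default
    return best

For example, pick_category({'Google': 0.5, 'Apple': 0.3}, {'Google': 2.0}) returns 'unknown', because 0.3 * 2.0 > 0.5, so Google's lead is not decisive enough.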
Example #3
File: views.py Project: ryuic/newsy
def train(request, key):
    classifier = docclass.naivebayes(docclass.entryfeatures)
    entry = Entry.get(key)
    category = ""

    if request.method == 'POST':
        # 'word' carries the category label to train the entry toward.
        if request.POST.get('word'):
            category_count = classifier.train(entry, request.POST.get('word'))
            entry.cat_ref = category_count
            entry.is_trained = True
            entry.save()
            category = category_count.category
            cache.delete('%s_entries' % category)

    return JSONResponse({ 'category' : category })
Example #4
File: views.py Project: ryuic/newsy
def refleshclassifier(request):
    classifier = docclass.naivebayes(docclass.getwords)
    classifier.testrun()
    return HttpResponse(mimetype='application/javascript')
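
guess, train and untrain all take an Entry datastore key straight from the URL. The project's routing is not part of these excerpts, so the following urls.py wiring is only an illustration of how these views might be exposed:

from django.conf.urls.defaults import patterns

urlpatterns = patterns('views',
    (r'^guess/(?P<key>[^/]+)/$',   'guess'),
    (r'^train/(?P<key>[^/]+)/$',   'train'),
    (r'^untrain/(?P<key>[^/]+)/$', 'untrain'),
    (r'^refleshclassifier/$',      'refleshclassifier'),
    (r'^crawl/$',                  'crawl'),
)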
Example #5
File: views.py Project: ryuic/newsy
def crawl(request):
    # Pick up to 50 feeds scheduled to run during the current UTC hour.
    now = datetime.utcnow()
    feed_obj = Feed.all().filter('execute_hour =', now.hour).fetch(50)

    # Narrow to the feeds whose execute_minute schedule falls in the current
    # quarter-hour window.
    if now.minute <= 15:
        feeds = [f for f in feed_obj if any(em <= 15 for em in f.execute_minute)]
    elif now.minute <= 30:
        feeds = [f for f in feed_obj if any(15 < em <= 30 for em in f.execute_minute)]
    elif now.minute <= 45:
        feeds = [f for f in feed_obj if any(30 < em <= 45 for em in f.execute_minute)]
    else:
        feeds = [f for f in feed_obj if any(em > 45 for em in f.execute_minute)]

    classifier = docclass.naivebayes(docclass.entryfeatures)
    classifier.setthreshold('Google', 2.0)
    classifier.setthreshold('Apple', 2.0)
    classifier.setthreshold('Microsoft', 2.0)

    categories = []

    for feed in feeds:
        d = feedparser.parse(feed.url)
        # Cache key holding the URL hashes this feed produced recently.
        markup = "_crawledurls_%s" % str(feed.key())
        cached_urls = cache.get(markup, [])

        i = 0
        for e in d.entries[0:7]:
            try:
                url_hash = hashlib.md5(e.link).hexdigest()
                if url_hash in cached_urls: continue
                entry = Entry.all().filter('url_hash =', url_hash).get()
                if entry: continue

                #summary
                if 'summary' in e: summary = e.summary
                elif 'description' in e: summary = e.description
                else: summary = ''

                entry = Entry(
                    feed_ref = feed.key(),
                    title = e.title,
                    url = e.link,
                    description = summary,
                    url_hash = url_hash)

                # Outside DEBUG, auto-classify at most the first five new
                # entries per feed to bound the work done per run.
                if not DEBUG and i < 5:
                    classifier.classify(entry, 'Unknown')
                    processing_time = classifier.get_processingtime()
                    logging.info('processing time >>> %s' % processing_time)
                    entry.cat_ref = classifier.getbestcat()
                    categories.append(entry.cat_ref.category)

                entry.save()

                if 'categories' in e:
                    for c in e.categories:
                        ec = EntryCategory(entry_ref=entry, orig_category=c[1], lower_category=c[1].lower())
                        ec.save()

                cached_urls.insert(0, url_hash)
                i += 1
            except StandardError, inst:
                logging.error('Failed to parse feed %s, %s' % (feed.url, inst))

        # Keep only the 50 most recent URL hashes and cache them for a day.
        del cached_urls[50:]
        cache.set(markup, cached_urls, 86400)
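
crawl() selects feeds by the current UTC hour and quarter-hour rather than by anything in the request, which suggests it is meant to be hit by App Engine cron. A cron.yaml entry along these lines would fit that pattern; the URL path and schedule are assumptions, not taken from the project:

cron:
- description: crawl feeds scheduled for the current quarter-hour
  url: /crawl                   # assumed path; must match the project's URL routing
  schedule: every 15 minutes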