Ejemplo n.º 1
0
    def __init__(self, desc_len, summ_len):
        """Record the configured lengths and build the helper objects.

        desc_len -- description length forwarded to the text helpers
        summ_len -- summary length forwarded to the text helpers
        """
        self.desc_length, self.summ_length = desc_len, summ_len

        self.t_processor = Text_Process(desc_len, summ_len)
        self.wk_mgt = WikipediaHelper(desc_len, summ_len)
        self.t_creator = TopicManager()
        self.tl_creator = TopicLinkManager()
Ejemplo n.º 2
0
from datetime import datetime, timedelta
from diggly.util.diggly_threads import FuncThread
from django.db.models import Max, Q
from diggly.models import Topic, FeaturedTopics, TopicRedirect
from diggly.util.wikipediaAPI.wiki_api import WikipediaHelper
from bs4 import BeautifulSoup
import requests

#constants; to be moved to another file
# NOTE(review): the "__[404]24" fragment in this URL looks like copy/scrape
# damage — confirm the real trending-topics endpoint before relying on it.
TRENDING_TOPICS_URL = "http://wikipedia.trending.eu/en/__[404]24/"
# Fallback source used when the primary trending URL is unavailable.
TRENDING_TOPICS_BACKUP_URL = "http://tools.wmflabs.org/wikitrends/english-most-visited-today.html"

# Endpoint queried with ?q=<text>; presumably returns a Wikipedia page id
# for the query — verify against its callers.
GET_PAGE_ID_URL = "http://rack36.cs.drexel.edu/getpageid/?q="

# Module-level Wikipedia API helper shared by the functions in this module.
wiki_help = WikipediaHelper()


def update_featured_object(topic):
    #Get featured topics instance
    now = datetime.now()
    earlier_recent = timedelta(
        minutes=15)  #within last 15 mins (applies to recent topics)
    earlier_trending = timedelta(
        days=1)  #within last 1 day (applies to trending topics)

    topic.visit_counter += 1
    topic.save()

    try:
        featured_topics = FeaturedTopics.objects.all()[0]
        recent_topics = featured_topics.recent_topics
Ejemplo n.º 3
0
class JsonPediaManager():
    #desc_len = 6
    #summ_len = 1
    #t_processor = Text_Process(desc_len, summ_len)
    #t_mgt = TopicManager()
    #tl_mgt = TopicLinkManager()
    #wk_mgt = WikipediaHelper(description_len, summary_len)

    @classmethod
    def __init__(self, desc_len, summ_len):
        self.desc_length = desc_len
        self.summ_length = summ_len

        self.t_processor = Text_Process(self.desc_length, self.summ_length)
        self.wk_mgt = WikipediaHelper(self.desc_length, self.summ_length)
        self.t_creator = TopicManager()
        self.tl_creator = TopicLinkManager()

    def get_article(self, r_args):
        nlinks = self.__count_articles(r_args)
        titles = self.__get_article_titles(r_args, nlinks)

        print "TITLES -->", titles 
        for a_title in titles:
            r_entity = eng_entity.format(a_title)
            r_url = jpedia_base_url.format(r_entity) 

            self.__fetch_article(r_url)   

    def __fetch_article(self, url):
        print "\nURL --->\n", url  
        headers={
            "X-Mashape-Key": os.environ['MASHAPE_KEY'],
            "Accept": "application/json"
        } 

        resp = requests.get(url, headers=headers)
        print "RESP STATUS CODE -->", resp.status_code
        print "RESP -->", resp.text

        if resp.status_code != 200:
            #TODO: handle exception better
            raise requests.ApiError('GET: Jsonpedia api request error\n{}'.format(resp.status_code))
            return None 
    
        json_response = resp.json()
        pid = json_response['revid'] 

        print "JSONPEDIA PID -->", pid

    def __get_article_titles(self, r_args, nlinks):
        resources = self.__format_req(r_args) #assume you receive pageids
        #resources = "pageids=26903|26521"
        r_url = pageid_convert_url.format(resources)

        retrieveflag = "title"
        titles = []
        pages = self.wk_mgt.request_pages_plain(r_url, False) 
       
        #print "PAGES IN JSONPE -->", pages 
        for pid,page in pages.iteritems():
            a_title = page[retrieveflag].strip()
            titles.append(a_title.replace(" ", title_sep))
        
        #return arg_sep.join(titles)
        return titles

    def __count_articles(self, r_args):
        if self.__is_seq(r_args):
            return len(r_args)

        #default
        return 1

    def __format_req(self, r_args):
        if self.__is_seq(r_args): # check if arg is a list
            if self.__is_pageid(r_args[0]):
                return r_pageid.format(arg_sep.join(r_args))
            else:
                return r_title.format(arg_sep.join(r_args))

        if self.__is_pageid(r_args):
            return r_pageid.format(r_args)

        #default return single page title format
        return r_title.format(r_args)
    
    def __is_pageid(self, arg):
        try:
            pid = int(arg)
            return True;

        except ValueError:
            return False;   

    def __is_seq(self, arg):
        return (not hasattr(arg, "strip") and
                hasattr(arg, "__getitem__") or
                hasattr(arg, "__iter__"))