Esempio n. 1
0
 def __init__(self, app):
     """Initialise the scraper state.

     Creates a ``ThinBrowser``, keeps a reference to ``app`` and stores
     the site root together with a URL template that has ``comune`` and
     ``page`` placeholders.
     """
     site_root = "http://www.umbriaeventi.com/"
     self._browser = ThinBrowser()
     self.app = app
     self._url = site_root
     self._page_gen = site_root + "eventi.php?menid=&comune=%s&page=%s"
Esempio n. 2
0
class ToscanaEvents(object):
    """Scrape city links and events from www.eventiintoscana.it.

    Pages are fetched through a ``ThinBrowser`` and parsed with
    BeautifulSoup.
    """

    def __init__(self, app):
        """Store ``app`` and set up the browser and the site root URL."""
        self._browser = ThinBrowser()
        self.app = app
        self._url = "http://www.eventiintoscana.it/"

    def get_soup(self, url):
        """Fetch ``url`` and return its content as a BeautifulSoup object.

        The response is passed through ``ThinBrowser.gzipPage`` before
        parsing (presumably to decompress the body -- confirm against
        ThinBrowser).
        """
        response_page = self._browser.urlopen(url)
        html_page = self._browser.gzipPage(response_page)
        return BeautifulSoup(html_page)

    def download_cities(self):
        """Return a dict mapping city names to their page links.

        Cities are the cells of the table that follows the
        "Eventi in Provincia" text on the home page.  A name is the image
        ``alt`` text with "eventi" removed, passed through ``convertName``.
        """
        soup = self.get_soup(self._url)
        cells = soup.find(text="Eventi in Provincia").\
            findNext("table").findAll("td")

        cities_urls = dict()
        for cell in cells:
            city_name = cell.img['alt'].replace("eventi", "").strip()
            city_link = cell.a['href']
            cities_urls[convertName(city_name)] = city_link
        return cities_urls

    @staticmethod
    def _extract_title(period, heading):
        """Return the event title contained in ``heading``.

        The heading is split on the last word of the end date (the part of
        ``period`` after "al") and the text following it, up to the first
        "|", is the title.  When that word is not in the heading, the
        capitalised second-to-last word is tried instead.

        ``heading`` is returned unchanged when ``period`` yields no usable
        word or none of the words occurs in the heading, instead of raising
        ``IndexError``.
        """
        # NOTE(review): a plain substring split also matches "al" inside
        # words such as "Dal" -- confirm the period format used by the site.
        parts = period.split("al")
        end_words = parts[1].split() if len(parts) > 1 else []

        # Candidate markers, in the order they must be tried.
        markers = end_words[-1:]
        if len(end_words) > 1:
            markers.append(end_words[-2].capitalize())

        for marker in markers:
            pieces = heading.split(marker)
            if len(pieces) > 1:
                return pieces[1].split("|")[0]
        return heading

    def get_events(self, url):
        """Return all the events listed on a city page.

        :param url: address of the city page.
        :returns: list of dicts with the keys ``period``, ``title``,
            ``img``, ``text`` and ``link``.
        """
        soup = self.get_soup(url)

        events_list = list()
        for event in soup.findAll("div", {'class': "blog"}):
            period = event.find("span", {'class': "titolino2"}).text.strip()
            heading = event.find("span", {'class': "titolino1"})
            title = self._extract_title(period, heading.text.strip())
            img = event.find("img", {'class': "lazy"})['data-href']
            text = event.find("div", {'class': "entry"}).findNext("p").text

            event_obj = dict()
            event_obj['period'] = period
            event_obj['title'] = title
            event_obj['img'] = img
            event_obj['text'] = text
            event_obj['link'] = heading.a['href']
            events_list.append(event_obj)

        return events_list
Esempio n. 3
0
class UmbriaEvents(object):
    """Scrape city links and events from www.umbriaeventi.com.

    Pages are fetched through a ``ThinBrowser`` and parsed with
    BeautifulSoup.
    """

    def __init__(self, app):
        """Store ``app`` and set up the browser and the site URLs."""
        self._browser = ThinBrowser()
        self.app = app
        self._url = "http://www.umbriaeventi.com/"
        # Template taking (comune id, page number); not used yet, see the
        # TODO in get_events.
        self._page_gen = self._url + "eventi.php?menid=&comune=%s&page=%s"

    def get_soup(self, url):
        """Fetch ``url`` and return its content as a BeautifulSoup object.

        The response is passed through ``ThinBrowser.gzipPage`` before
        parsing (presumably to decompress the body -- confirm against
        ThinBrowser).
        """
        response_page = self._browser.urlopen(url)
        html_page = self._browser.gzipPage(response_page)
        return BeautifulSoup(html_page)

    def download_cities(self):
        """Return a dict mapping city names to their absolute page links.

        Cities are the ``li`` items of the "links-body" div that follows
        the "links-comuni" div on the home page.  A name is the item text
        with "Eventi" removed; a link is the site root plus the item's
        ``href``.
        """
        soup = self.get_soup(self._url)
        items = soup.find("div", {'class': "links-comuni"}).\
            findNext("div", {'class': "links-body"}).\
            findAll("li")

        cities_urls = dict()
        for item in items:
            city_name = item.text.replace("Eventi", "").strip()
            cities_urls[city_name] = self._url + item.a['href']
        return cities_urls

    def _external_link(self, detail_url):
        """Return the link found in the "network-col1-text" box of a detail page.

        ``detail_url`` itself is returned when the page has no such box or
        the box contains no anchor.
        """
        box = self.get_soup(detail_url).\
            find("div", {'class': "network-col1-text"})
        if box is None or box.a is None:
            return detail_url
        return box.a['href']

    def get_events(self, url):
        """Return all the events listed on a city page.

        Every event costs one extra request: its detail page is fetched to
        look for a link to use instead of the detail page address.

        :param url: address of the city page.
        :returns: list of dicts with the keys ``title``, ``link``, ``img``,
            ``period`` and ``text``; empty when the page has no
            "eventi-lista" container.
        """
        # TODO: to grab the JavaScript-driven pages a web driver such as
        # selenium is needed.  Sketch of the intended approach: read the
        # "comune=<id>" value from the page's <script> tags and request
        # self._page_gen % (id, page) for each page number.
        listing = self.get_soup(url).find("div", {'class': "eventi-lista"})
        if listing is None:
            # Nothing to parse: report no events instead of failing.
            return []

        events_list = list()
        for event in listing.findAll("div", {'class': "eventi-lista-value"}):
            anchor = event.\
                find("div", {'class': "eventi-lista-value-titolo"}).a
            detail_url = self._url + anchor['href']

            event_obj = dict()
            event_obj['title'] = anchor.text
            event_obj['link'] = self._external_link(detail_url)
            logo = event.find("div", {'class': "eventi-lista-value-logo"})
            event_obj['img'] = self._url + logo.img['src']
            event_obj['period'] = event.\
                find("div", {'class': "eventi-lista-value-info2"}).text
            event_obj['text'] = event.\
                find("div", {'class': "eventi-lista-value-descrizione"}).text
            events_list.append(event_obj)

        return events_list
Esempio n. 4
0
 def __init__(self, app):
     """Initialise the scraper state.

     Creates a ``ThinBrowser``, keeps a reference to ``app`` and stores
     the site root URL.
     """
     site_root = "http://www.eventiintoscana.it/"
     self._browser = ThinBrowser()
     self.app = app
     self._url = site_root