def _check_source(self):
        """
        Fetch the page at ``self.document['link']`` and parse it into entries.

        A fresh ``requests.Session`` is used for the request.  When the
        instance has an ``auth_manager`` attribute, it is asked to
        authenticate that session before the page is fetched.

        Returns whatever ``self.parse`` returns, or ``None`` when
        authentication fails, the fetch or the soup comes back empty, or
        ``self.parse`` raises.  Each of those failures is logged.
        """
        link = self.document['link']

        # The session stays open until parsing is done; the context manager
        # closes it on every exit path so its connections are released.
        with requests.Session() as session:
            if hasattr(self, 'auth_manager'):
                # Perform authentication
                self.auth_manager.auth(session)
                if not self.auth_manager.success:
                    self.logger.error('Authentication failed..')
                    return None

            html = self.fetch(link, session=session)
            if not html:
                self.logger.warning('Fetch "%s" returned nothing', link)
                return None

            bsoup = get_soup(html)
            if not bsoup:
                self.logger.warning('BeautifulSoup returned None')
                return None

            try:
                entries = self.parse(bsoup)
            except Exception as e:
                # Best effort: a parser failure is logged, not propagated.
                self.logger.error('parse: %s', unicode(e))
                return None

            return entries
Exemple #2
0
    def __check_site(self):
        """
        Fetch and parse the page referenced by 'announcements.link_site'.

        The link is looked up in ``self.document`` through
        ``self._get_document_field``.  The fetched page is turned into soup,
        handed to ``self.parse_site`` and the result is post-processed by
        ``self.fix_site_entries`` together with the link.

        Returns the fixed entries, or ``None`` (after logging) when the link
        is missing, the fetch or the soup is empty, or either processing
        step raises.
        """
        site_link = self._get_document_field(self.document, 'announcements.link_site')
        if not site_link:
            self.logger.debug('"link_site" not found in document!')
            return None

        page = self.fetch(site_link)
        if not page:
            self.logger.warning('Fetch "%s" returned nothing' % site_link)
            return None

        soup = get_soup(page)
        if not soup:
            self.logger.warning('BeautifulSoup returned None')
            return None

        # Step 1: raw entries out of the soup.
        try:
            raw_entries = self.parse_site(soup)
        except Exception as err:
            self.logger.error('parse_site: %s', unicode(err))
            return None

        # Step 2: post-process the raw entries; the link is passed along.
        try:
            return self.fix_site_entries(raw_entries, site_link)
        except Exception as err:
            self.logger.error('fix_site_entries: %s', unicode(err))
            return None
Exemple #3
0
    def parse(self, html):
        """
        Build the weekly menu from the menu page.

        The text of every ``<td>`` in ``html`` is passed through
        ``self.prettify`` and the meal rows are then picked out of that flat
        list by hardcoded positions.

        Returns a list with one dictionary per day (seven in total):

        'name'   -> self.weekdays[i]
        'date'   -> date_to_datetime(self.latest_monday + i days)
        'lunch'  -> 'main'   -> main + ". " + served_with
                    'salad'  -> salad + ". " + cheese
                    'desert' -> desert
        'dinner' -> same three keys as 'lunch'
        """
        soup = get_soup(html)
        cells = [self.prettify(td.text) for td in soup.find_all("td")]

        # Hardcoded start offsets of the seven-cell rows, in the order
        # [main, served_with, salad, cheese, desert].
        lunch_rows = [cells[start:start + 7] for start in (9, 25, 33, 41, 49)]
        dinner_rows = [cells[start:start + 7] for start in (65, 81, 89, 97, 105)]

        def meal(rows, day):
            # Combine the five rows of one meal into the three menu fields.
            main, served_with, salad, cheese, desert = rows
            return {
                "main": main[day] + ". " + served_with[day],
                "salad": salad[day] + ". " + cheese[day],
                "desert": desert[day],
            }

        return [
            {
                "name": self.weekdays[day],
                "date": date_to_datetime(self.latest_monday + timedelta(days=day)),
                "lunch": meal(lunch_rows, day),
                "dinner": meal(dinner_rows, day),
            }
            for day in xrange(7)
        ]
Exemple #4
0
    def _check_source(self):
        """
        Fetch the source, turn it into soup and hand it to ``self.parse``.

        Returns what ``self.parse`` returns, or ``None`` (after logging) when
        the fetch or the soup comes back empty or ``self.parse`` raises.
        """
        content = self.fetch()
        if not content:
            self.logger.warning('Fetch returned nothing. Make sure the file exist')
            return None

        soup = get_soup(content)
        if not soup:
            self.logger.warning('BeautifulSoup returned None')
            return None

        # A parser failure is logged and reported as "no result".
        try:
            return self.parse(soup)
        except Exception as err:
            self.logger.error('parse: %s' % unicode(err))
            return None
Exemple #5
0
    def parse(bsoup):
        """
        Extract the announcements listed in the element with id ``post``.

        Every ``article`` with class ``loop-entry clearfix`` found there
        yields one dictionary with the keys ``date``, ``has_time`` (always
        ``False``), ``title``, ``link``, ``html`` and ``plaintext``.

        Returns the list of those dictionaries, in the order ``find_all``
        returned the articles.
        """
        def to_announcement(article):
            # Left column: the post date sits in post-meta > post-date.
            left_col = article.find('div', class_='loop-entry-left')
            meta = left_col.find('div', class_='post-meta')
            posted = parse_greek_date(meta.find('div', class_='post-date').text)

            # Right column: the heading link and the body paragraphs.
            right_col = article.find('div', class_='loop-entry-right')
            title = right_col.h2.a['title']
            href = right_col.h2.a['href']

            # Join all paragraphs into a single html fragment ...
            body_html = '\n'.join(unicode(p) for p in right_col.find_all('p'))
            # ... and re-parse that fragment to obtain its plain text.
            plain = get_soup(body_html).text.strip()

            return {
                'date': posted,
                'has_time': False,
                'title': title,
                'link': href,
                'html': body_html,
                'plaintext': plain,
            }

        container = bsoup.find(id='post')
        articles = container.find_all('article', class_='loop-entry clearfix')
        return [to_announcement(article) for article in articles]