Exemple #1
0
def fetch_url(url, content_type = 'text/html'):
    '''Fetches the specified URL and returns the page content.

    The request is sent with an ``Accept-Language: ru,en`` header.

    Arguments:
        url: the URL to fetch.
        content_type: the media type the response must have. It is compared
            against the value of the response's Content-Type header (without
            its parameters).

    Returns:
        The page content. When `content_type` starts with ``text/`` the
        content is decoded using the charset given in the Content-Type header
        (UTF-8 if none is given); otherwise it is returned as received.

    Raises:
        HTTPNotFoundError: the server responded with "404 Not Found".
        Error: the request failed, the server responded with any other non-OK
            status, the Content-Type header is missing or does not match
            `content_type`, or the content can't be decoded.
    '''

    LOG.info('Fetching "%s"...', url)

    try:
        page = _fetch_url(url, headers = { 'Accept-Language': 'ru,en' })
    except urlfetch.Error as e:
        raise Error('Failed to fetch the page: {0}.', e)

    if page.status_code != httplib.OK:
        error_class = HTTPNotFoundError if page.status_code == httplib.NOT_FOUND else Error
        raise error_class('The server returned error: {0} ({1}).',
            httplib.responses.get(page.status_code, 'Unknown error'), page.status_code)

    LOG.info('"%s" has been successfully fetched.', url)

    content = page.content

    # Header names are matched case-insensitively, so look the header up by
    # iterating instead of indexing with a fixed spelling.
    for key in page.headers:
        if key.lower() != 'content-type':
            continue

        value, params = cgi.parse_header(page.headers[key])

        if value != content_type:
            raise Error('The server returned a page with invalid content type: {0}.', value)

        if content_type.startswith('text/'):
            for param in params:
                if param.lower() == 'charset':
                    content_encoding = params[param]
                    break
            else:
                content_encoding = 'UTF-8'

            try:
                content = content.decode(content_encoding)
            except (UnicodeDecodeError, LookupError):
                # LookupError is raised when the server names a charset that
                # Python doesn't know. The charset comes from the remote
                # server, so treat it like any other undecodable page instead
                # of letting a non-Error exception escape.
                raise Error('The server returned a page in invalid encoding.')

        break
    else:
        raise Error('The server returned a page with missing content type.')

    return content
Exemple #2
0
    def __fix_html(self, html):
        '''Cleans up markup that may confuse Python's HTML parser.

        Removes everything matched by the script regex, repeatedly repairs
        invalid tag attributes until nothing changes any more, and finally
        rewrites misopened tags as self-closing ones.

        Raises Error if one of the repeated repairs still finds something to
        change after 1000 passes.
        '''

        html = self.script_regex.sub('', html)

        # Each pair is applied over and over until it stops matching, because
        # one substitution may uncover another match.
        repeated_fixes = (
            (self.__invalid_tag_attr_spacing_regex, r'\1 \2'),
            (self.__invalid_tag_attr_regex, r'\1 '),
        )

        for regex, replacement in repeated_fixes:
            passes_left = 1000

            while passes_left:
                html, substitutions = regex.subn(replacement, html)

                if not substitutions:
                    break

                passes_left -= 1
            else:
                # The pass budget is exhausted and the regex still matches.
                raise Error('Too many errors in the HTML or infinite loop.')

        return self.__misopened_tag_regex.sub(r'<\1 />', html)
Exemple #3
0
def _api(method, **kwargs):
    '''Calls the specified VKontakte API method and returns its response.

    The keyword arguments are sent as the query parameters of the call.

    Returns the value of the "response" key of the decoded JSON reply.

    Raises ConnectionError if the reply can't be fetched or isn't valid JSON,
    and ServerError (carrying the API error code and a user-readable message)
    if the reply contains an "error" key or lacks a "response" key.
    '''

    base_url = '{0}method/{1}?language=0&'.format(constants.API_URL, method)
    url = base_url + urllib.urlencode(kwargs)

    try:
        data = vkfeed.utils.fetch_url(url, content_type='application/json')

        try:
            data = json.loads(data)
        except Exception as e:
            # This Error is immediately wrapped into ConnectionError by the
            # outer handler, so callers only ever see ConnectionError.
            raise Error('Failed to parse JSON data: {0}.', e)
    except Exception as e:
        raise ConnectionError('API call {0} failed: {1}', url, e)

    if 'error' not in data and 'response' in data:
        return data['response']

    # Messages shown to the user instead of the well-known API error texts.
    known_errors = {
        'Access denied: group is blocked': (
            'Страница временно заблокирована и проверяется администраторами, '
            'так как некоторые пользователи считают, что она не соответствует правилам сайта.'
        ),
        'Access denied: this wall available only for community members':
            'Это частное сообщество. Доступ только по приглашениям администраторов.',
        'User was deleted or banned':
            'Пользователь удален или забанен.',
    }

    error_info = data.get('error', {})
    message = error_info.get('error_msg', '').strip()

    if not message:
        message = 'Ошибка вызова API.'
    elif message in known_errors:
        message = known_errors[message]
    elif not message.endswith('.'):
        message += '.'

    raise ServerError(error_info.get('error_code'), message)
Exemple #4
0
 def __init__(self, code, *args, **kwargs):
     '''Initializes the error.

     `code` is stored as `self.code`; all remaining arguments are passed
     unchanged to `Error.__init__`.
     '''
     Error.__init__(self, *args, **kwargs)
     self.code = code
Exemple #5
0
 def __init__(self, *args, **kwargs):
     '''Initializes the error, passing all arguments unchanged to
     `Error.__init__`.'''
     Error.__init__(self, *args, **kwargs)
Exemple #6
0
 def __init__(self, server_error):
     '''Initializes the error with a fixed message and stores
     `server_error` as `self.server_error`.'''
     Error.__init__(self, "Server returned an error.")
     self.server_error = server_error
Exemple #7
0
 def __init__(self):
     '''Initializes the error with a fixed "profile page is not available"
     message.'''
     Error.__init__(self, "The user's profile page is not available.")
Exemple #8
0
 def __init__(self):
     '''Initializes the error with a fixed "private group" message.'''
     Error.__init__(self, "This is a private group.")
Exemple #9
0
    def get(self, profile_name):
        '''Processes a feed request for the profile named `profile_name`.

        Depending on the outcome, writes to the response either the generated
        feed (Content-Type: application/rss+xml), an HTML error page rendered
        from the 'error.html' template, or only an HTTP error status.

        Display options are read from the request parameters `show_photo`,
        `foreign_posts`, `hash_tag_title`, `text_title` and `big_photos`; any
        value other than '0' enables the option.

        NOTE(review): this docstring used to say that the VKontakte API is not
        used because it requires authorization and gives tokens with
        expiration time. That no longer matches the code: `use_api` is
        hard-coded to True below, so the API branch (`wall_reader`) always runs
        and the HTML parsing branch is currently unreachable.
        '''

        headers = self.__get_headers()
        user_agent = headers.get('user-agent', '').strip()

        if user_agent and (
                # Google Reader bot still crawls the Web. Reject it to save
                # bandwidth.
                user_agent.startswith('Feedfetcher-Google;') or

                # FeedNotifier updates feeds every minute
                user_agent.startswith('FeedNotifier/') or

                # YandexBlogs bot sends a lot of requests (2/minute) for some
                # feeds. The support doesn't respond adequately.
                'YandexBlogs' in user_agent):
            self.error(httplib.FORBIDDEN)
            return

        # State shared with the exception handler at the bottom: the inner
        # handlers fill these in right before re-raising, and the outer
        # handler uses them to choose the log level, the HTTP status and the
        # text of the error page.
        user_error = None
        http_status = httplib.OK
        unknown_user_error = False

        try:
            show_photo = (self.request.get('show_photo', '1') != '0')
            foreign_posts = (self.request.get('foreign_posts', '0') != '0')
            hash_tag_title = (self.request.get('hash_tag_title', '0') != '0')
            text_title = (self.request.get('text_title', '0') != '0')
            big_photos = (self.request.get('big_photos', '0') != '0')

            LOG.info(
                'Requested feed for "%s" (foreign_posts = %s, show_photo = %s, hash_tag_title = %s, text_title = %s, big_photos = %s).',
                profile_name, foreign_posts, show_photo, hash_tag_title,
                text_title, big_photos)

            # Hard-coded switch: the `else` branch below never runs.
            use_api = True
            if_modified_since = None

            if use_api:
                # Use VKontakte API

                from vkfeed.tools import wall_reader

                cur_time = int(time.time())
                latency = constants.MINUTE_SECONDS
                # By default posts older than a week are not requested.
                min_timestamp = cur_time - constants.WEEK_SECONDS

                ## This confuses Google Reader users because it always requests
                ## feeds with 'Cache-Control: max-age=3600' when adding
                ## subscriptions and users often get an empty feed.
                #for cache_control in headers.get('cache-control', '').split(','):
                #    cache_control = cache_control.strip()
                #    if cache_control.startswith('max-age='):
                #        LOG.info('Applying Cache-Control: %s...', cache_control)
                #        try:
                #            cache_max_age = int(cache_control[len('max-age='):])
                #        except ValueError:
                #            LOG.error('Invalid header: Cache-Control = %s.', cache_control)
                #        else:
                #            if cache_max_age:
                #                min_timestamp = max(min_timestamp, cur_time - cache_max_age - latency)

                # A usable If-Modified-Since header narrows the requested time
                # window (minus `latency` as a safety margin); an unparsable
                # one is logged and otherwise ignored.
                if 'if-modified-since' in headers and headers[
                        'if-modified-since'] != '0':
                    LOG.info('Applying If-Modified-Since: %s...',
                             headers['if-modified-since'])
                    try:
                        if_modified_since = vkfeed.utils.http_timestamp(
                            headers['if-modified-since'])
                    except Exception as e:
                        LOG.error('Invalid header: If-Modified-Since = %s.',
                                  headers['if-modified-since'])
                    else:
                        min_timestamp = max(min_timestamp,
                                            if_modified_since - latency)

                # The wider the requested time window, the fewer posts are
                # requested.
                max_age = cur_time - min_timestamp
                if max_age > constants.DAY_SECONDS:
                    max_posts_num = 10
                else:
                    max_posts_num = 50

                # NOTE(review): `/=` is integer division only under Python 2
                # without `from __future__ import division`; otherwise
                # max_posts_num becomes a float here - confirm.
                if user_agent and vkfeed.utils.zero_subscribers(user_agent):
                    max_posts_num /= 2

                LOG.info(
                    'Applying the following limits: max_age=%s, max_posts_num=%s',
                    max_age, max_posts_num)

                try:
                    data = wall_reader.read(profile_name, min_timestamp,
                                            max_posts_num, foreign_posts,
                                            show_photo, hash_tag_title,
                                            text_title, big_photos)
                except wall_reader.ConnectionError as e:
                    http_status = httplib.BAD_GATEWAY
                    user_error = 'Ошибка соединения с сервером <a href="{0}" target="_blank">{0}</a>.'.format(
                        constants.API_URL)
                    raise
                except wall_reader.ServerError as e:
                    # The text of the server error is shown to the user as is.
                    http_status = httplib.NOT_FOUND
                    user_error = unicode(e)
                    raise
            else:
                # Parse HTML from site
                # (unreachable while `use_api` is hard-coded to True)

                from vkfeed.tools.wall_parser import WallPageParser, ParseError, PrivateGroupError, ProfileNotAvailableError, ServerError

                url = constants.VK_URL + cgi.escape(profile_name)
                url_html = '<a href="{0}" target="_blank">{0}</a>'.format(url)

                if profile_name == 'feed':
                    http_status = httplib.NOT_FOUND
                    user_error = 'Страница {0} не является профилем пользователя или группы.'.format(
                        url_html)
                    raise Error('Unsupported page.')

                try:
                    profile_page = vkfeed.utils.fetch_url(url)
                except vkfeed.utils.HTTPNotFoundError:
                    http_status = httplib.NOT_FOUND
                    user_error = 'Пользователя или группы {0} не существует.'.format(
                        url_html)
                    raise
                except Error:
                    http_status = httplib.BAD_GATEWAY
                    user_error = 'Не удалось загрузить страницу {0}.'.format(
                        url_html)
                    unknown_user_error = True
                    raise

                try:
                    data = WallPageParser().parse(profile_page)
                except PrivateGroupError as e:
                    http_status = httplib.NOT_FOUND
                    user_error = 'Группа {0} является закрытой группой.'.format(
                        url_html)
                    raise
                except ProfileNotAvailableError as e:
                    http_status = httplib.NOT_FOUND
                    user_error = 'Страница пользователя {0} удалена или доступна только авторизованным пользователям.'.format(
                        url_html)
                    raise
                except ServerError as e:
                    LOG.debug('Page contents:\n%s', profile_page)
                    http_status = httplib.BAD_GATEWAY
                    user_error = 'Сервер {0} вернул ошибку{1}'.format(
                        url_html,
                        ':<br />' + e.server_error if e.server_error else '.')
                    unknown_user_error = True
                    raise
                except ParseError as e:
                    LOG.debug('Page contents:\n%s', profile_page)
                    http_status = httplib.NOT_FOUND
                    user_error = 'Сервер вернул страницу, на которой не удалось найти стену с сообщениями пользователя.'
                    unknown_user_error = True
                    raise

                data['url'] = url
                if 'user_photo' not in data:
                    data[
                        'user_photo'] = constants.APP_URL + 'images/vk-rss-logo.png'

            LOG.info('Return %s items.', len(data['posts']))

            # A conditional request that yields no posts is answered with
            # "304 Not Modified"; in that case `feed` is deliberately left
            # unset and the final `else` below only sends the status.
            if if_modified_since is not None and not data['posts']:
                http_status = httplib.NOT_MODIFIED
            else:
                feed = self.__generate_feed(data)
        except Exception as e:
            # Log level: project errors with a fully explained user message
            # are warnings, other project errors are errors, and anything else
            # is logged together with its traceback.
            if isinstance(e, Error):
                if user_error and not unknown_user_error:
                    log_function = LOG.warning
                else:
                    log_function = LOG.error
            else:
                log_function = LOG.exception

            log_function('Unable to generate a feed for "%s": %s',
                         profile_name, e)

            if user_error:
                self.error(http_status)
                error = '<p>Ошибка при генерации RSS-ленты:</p><p>{0}</p>'.format(
                    user_error)
                # For failures whose cause isn't known for sure, additionally
                # ask the user to check the profile and offer the
                # administrator's e-mail.
                if unknown_user_error:
                    error += '''<p>
                        Пожалуйста, убедитесь, что вы правильно указали профиль
                        пользователя или группы, и что данный профиль является
                        общедоступным. Если все указано верно, и ошибка
                        повторяется, пожалуйста, свяжитесь с <a
                        href="mailto:{0}">администратором</a>.
                    </p>'''.format(
                        cgi.escape(constants.ADMIN_EMAIL, quote=True))
            else:
                # No handler above prepared a user message, so this is
                # reported as an internal server error.
                self.error(httplib.INTERNAL_SERVER_ERROR)
                error = '''
                    При генерации RSS-ленты произошла внутренняя ошибка сервера.
                    Если ошибка повторяется, пожалуйста, свяжитесь с <a href="mailto:{0}">администратором</a>.
                '''.format(cgi.escape(constants.ADMIN_EMAIL, quote=True))

            self.response.headers[
                b'Content-Type'] = b'text/html; charset=utf-8'
            self.response.out.write(
                vkfeed.utils.render_template('error.html', {'error': error}))
        else:
            if http_status == httplib.OK:
                self.response.headers[b'Content-Type'] = b'application/rss+xml'
                self.response.out.write(feed)
            else:
                # Currently only reached for "304 Not Modified".
                self.error(http_status)
Exemple #10
0
 def __init__(self, code, *args, **kwargs):
     '''Initializes the error.

     `code` is stored as `self.code`; all remaining arguments are passed
     unchanged to `Error.__init__`.
     '''
     Error.__init__(self, *args, **kwargs)
     self.code = code
Exemple #11
0
 def __init__(self, *args, **kwargs):
     '''Initializes the error, passing all arguments unchanged to
     `Error.__init__`.'''
     Error.__init__(self, *args, **kwargs)
Exemple #12
0
 def __init__(self, server_error):
     '''Initializes the error with a fixed message and stores
     `server_error` as `self.server_error`.'''
     Error.__init__(self, 'Server returned an error.')
     self.server_error = server_error
Exemple #13
0
 def __init__(self):
     '''Initializes the error with a fixed "profile page is not available"
     message.'''
     Error.__init__(self, "The user's profile page is not available.")
Exemple #14
0
    def __handle_post_date(self, tag, data):
        '''Handles data inside of post replies tag.'''

        replacements = (('jan.', '1'), ('feb.', '2'), ('mar.', '3'),
                        ('apr.', '4'), ('may', '5'), ('jun.', '6'),
                        ('jul.', '7'), ('aug.', '8'), ('sep.', '9'),
                        ('oct.', '10'), ('nov.', '11'), ('dec.', '12'),
                        ('янв', '1'), ('фев', '2'), ('мар', '3'), ('апр', '4'),
                        ('мая', '5'), ('июн', '6'), ('июл', '7'), ('авг', '8'),
                        ('сен', '9'), ('окт', '10'), ('ноя', '11'),
                        ('дек', '12'), ('два', '2'), ('две', '2'), ('три',
                                                                    '3'),
                        ('четыре', '4'), ('пять', '5'), ('шесть',
                                                         '6'), ('семь', '7'),
                        ('восемь', '8'), ('девять', '9'), ('десять',
                                                           '10'), ('two', '2'),
                        ('three', '3'), ('four', '4'), ('five', '5'), ('six',
                                                                       '6'),
                        ('seven', '7'), ('eight', '8'), ('nine', '9'), ('ten',
                                                                        '10'),
                        ('вчера', 'yesterday'), ('сегодня', 'today'), (' в ',
                                                                       ' at '))

        date_string = data.strip().lower()

        is_pm = date_string.endswith(' pm')
        if date_string.endswith(' am') or date_string.endswith(' pm'):
            date_string = date_string[:-3]

        tz_delta = datetime.timedelta(hours=4)  # MSK timezone
        today = datetime.datetime.utcnow() + tz_delta

        for token, replacement in replacements:
            date_string = date_string.replace(token, replacement)

        try:
            match = re.match(ur'(\d+ ){0,1}([^ ]+) (?:назад|ago)', date_string)

            if match:
                value = match.group(1)
                if value:
                    value = int(value.strip())
                else:
                    value = 1

                unit = match.group(2)

                if unit in ('секунд', 'секунду', 'секунды', 'second',
                            'seconds'):
                    date = today - datetime.timedelta(seconds=value)
                elif unit in ('минут', 'минуту', 'минуты', 'minute',
                              'minutes'):
                    date = today - datetime.timedelta(minutes=value)
                elif unit in ('час', 'часа', 'часов', 'hour', 'hours'):
                    date = today - datetime.timedelta(hours=value)
                elif unit in ('день', 'дня', 'дней', 'day', 'days'):
                    date = today - datetime.timedelta(days=value)
                elif unit in ('неделю', 'недели', 'недель', 'week', 'weeks'):
                    date = today - datetime.timedelta(weeks=value)
                else:
                    raise Error('Invalid time dimension: {0}.', unit)
            else:
                try:
                    date = datetime.datetime.strptime(date_string,
                                                      'today at %H:%M')
                    date = datetime.datetime.combine(today, date.time())
                except ValueError:
                    try:
                        date = datetime.datetime.strptime(
                            date_string, 'yesterday at %H:%M')
                        date = datetime.datetime.combine(
                            today - datetime.timedelta(days=1), date.time())
                    except ValueError:
                        try:
                            date = datetime.datetime.strptime(
                                '{0} {1}'.format(today.year, date_string),
                                '%Y %d %m at %H:%M')
                        except ValueError:
                            date = datetime.datetime.strptime(
                                date_string, '%d %m %Y')
                            date += tz_delta

            if is_pm:
                date += datetime.timedelta(hours=12)

            date -= tz_delta

            if date - datetime.timedelta(minutes=1) > today:
                if date - datetime.timedelta(days=1) <= today:
                    date -= datetime.timedelta(days=1)
                else:
                    last_year_date = datetime.datetime(date.year - 1,
                                                       date.month, date.day,
                                                       date.hour, date.minute,
                                                       date.second,
                                                       date.microsecond,
                                                       date.tzinfo)
                    if last_year_date <= today:
                        date = last_year_date

            self.__get_cur_post()['date'] = date
        except Exception as e:
            if self.__ignore_errors:
                LOG.exception('Failed to parse date %s.', data)
            else:
                raise e
Exemple #15
0
 def __init__(self):
     '''Initializes the error with a fixed "private group" message.'''
     Error.__init__(self, 'This is a private group.')