Example #1
0
def sanitize_illegal_chars_for_xml(s):
    """Strip characters that are not allowed in XML from a string.

    Such characters can end up in text through copy/paste and would break
    the XML parser, so they are removed outright.

    The list of character ranges is courtesy of the XmlRpcPlugin
    developers, as documented here: http://stackoverflow.com/a/22273639

    Byte strings are decoded as UTF-8 before being filtered.
    """
    global ILLEGAL_XML_CHARS_RE

    # The pattern is compiled on first use and cached in the module global.
    if ILLEGAL_XML_CHARS_RE is None:
        ranges = [
            (0x00, 0x08),
            (0x0B, 0x0C),
            (0x0E, 0x1F),
            (0x7F, 0x84),
            (0x86, 0x9F),
            (0xFDD0, 0xFDDF),
            (0xFFFE, 0xFFFF),
        ]

        if sys.maxunicode > 0x10000:
            # The last two code points of each plane from 1 through 16
            # (0x1FFFE-0x1FFFF up to 0x10FFFE-0x10FFFF).
            ranges.extend(
                (plane | 0xFFFE, plane | 0xFFFF)
                for plane in range(0x10000, 0x110000, 0x10000))

        char_class = ''.join(
            six.unichr(first) + '-' + six.unichr(last)
            for first, last in ranges)
        ILLEGAL_XML_CHARS_RE = re.compile('[' + char_class + ']')

    text = s.decode('utf-8') if isinstance(s, bytes) else s
    return ILLEGAL_XML_CHARS_RE.sub('', text)
  def test_unicode_header_checks(self):
    """Headers are normalized to bytes; non-ASCII header values raise."""
    token_expiry = str(datetime.datetime.utcnow())
    token_uri = str(GOOGLE_TOKEN_URI)
    revoke_uri = str(GOOGLE_REVOKE_URI)
    # Positional arguments, in order: access_token, client_id,
    # client_secret, refresh_token, token_expiry, token_uri, user_agent.
    credentials = OAuth2Credentials(
        u'foo', u'some_client_id', u'cOuDdkfjxxnv+', u'1/0/a.df219fjls0',
        token_expiry, token_uri, u'refresh_checker/1.0',
        revoke_uri=revoke_uri)
    http = credentials.authorize(HttpMock(headers={'status': '200'}))

    # Basic objects (including a bytes key and a bytes value) must be
    # encoded correctly. oauth2client normalizes everything to bytes,
    # regardless of the Python version in use.
    sent = {u'foo': 3, b'bar': True, 'baz': b'abc'}
    expected = {b'foo': b'3', b'bar': b'True', b'baz': b'abc'}
    http.request(u'http://example.com', method=u'GET', headers=sent)
    for name, value in expected.items():
      self.assertTrue(name in http.headers)
      self.assertEqual(value, http.headers[name])

    # A header value containing a non-ASCII character must be rejected.
    non_ascii = six.unichr(40960) + 'abcd'
    with self.assertRaises(NonAsciiHeaderError):
      http.request(u'http://example.com', method=u'GET',
                   headers={u'foo': non_ascii})
Example #3
0
def _replace_entity(match):
    """Return the character for a matched HTML entity reference.

    ``match.group(1)`` holds the entity body: ``#`` followed by a decimal
    number, ``#x``/``#X`` followed by a hexadecimal number, or an entity
    name looked up in ``html_entities.name2codepoint``.

    When the reference cannot be resolved (empty or malformed number, code
    point the interpreter cannot represent, unknown name) the full matched
    text, ``match.group(0)``, is returned unchanged.
    """
    text = match.group(1)

    if text.startswith('#'):
        digits = text[1:]
        try:
            # Slice rather than index so an empty reference falls through
            # to int('') and is reported as ValueError instead of crashing.
            if digits[:1] in ('x', 'X'):
                codepoint = int(digits[1:], 16)
            else:
                codepoint = int(digits)
            return six.unichr(codepoint)
        except (ValueError, OverflowError):
            # ValueError: not a number, or outside the valid code point
            # range. OverflowError: number too large for a C int.
            return match.group(0)

    try:
        return six.unichr(html_entities.name2codepoint[text])
    except (ValueError, KeyError):
        return match.group(0)
Example #4
0
def _replace_entity(match):
    """Resolve one matched HTML entity reference to its character.

    The entity body is taken from ``match.group(1)``. A body starting with
    ``#`` is a numeric reference (hexadecimal when followed by ``x`` or
    ``X``, decimal otherwise); anything else is treated as an entity name
    and looked up in ``html_entities.name2codepoint``.

    References that cannot be resolved are left as they were: the function
    then returns ``match.group(0)``.
    """
    body = match.group(1)

    if not body.startswith('#'):
        try:
            return six.unichr(html_entities.name2codepoint[body])
        except (ValueError, KeyError):
            return match.group(0)

    number = body[1:]
    try:
        # number[:1] is '' for an empty reference, which then fails in
        # int('') with ValueError rather than raising IndexError.
        if number[:1] in ('x', 'X'):
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        return six.unichr(codepoint)
    except (ValueError, OverflowError):
        # unichr() raises ValueError for out-of-range code points and
        # OverflowError for values that do not fit a C int.
        return match.group(0)
Example #5
0
class PageForm(forms.ModelForm):
    """Model form for ``Page`` with tree-aware parent/redirect choosers."""

    meta_description = forms.CharField(required=False, widget=forms.Textarea)
    meta_keywords = forms.CharField(required=False, widget=forms.Textarea)
    # Nesting depth is shown with three U+00A0 (no-break space) characters
    # per tree level.
    parent = TreeNodeChoiceField(
        queryset=Page.tree.all(),
        level_indicator=3 * unichr(160),
        empty_label='---------',
        required=False)
    # Only pages that do not themselves redirect are offered as targets.
    redirect_page = TreeNodeChoiceField(
        label=_('Redirect page'),
        queryset=Page.objects.filter(redirect_page__isnull=True),
        level_indicator=3 * unichr(160),
        empty_label='---------',
        required=False)

    class Meta:
        model = Page
        exclude = []

    def __init__(self, *args, **kwargs):
        """Replace ``template_name`` with a choice field when choices exist."""
        super(PageForm, self).__init__(*args, **kwargs)
        if len(TEMPLATE_CHOICES) > 0:
            template_field = forms.ChoiceField(
                choices=TEMPLATE_CHOICES, required=False, label=_('Template'))
            self.fields['template_name'] = template_field

    def clean_title(self):
        """Return the title with leading and trailing whitespace removed."""
        title = self.cleaned_data.get('title', '')
        return title.strip()

    def clean_redirect_page(self):
        """Reject a redirect page combined with a quoted (named) url."""
        if not self.cleaned_data['redirect_page']:
            return self.cleaned_data['redirect_page']

        try:
            url = self.cleaned_data['url']
            if url and is_quoted_url(url):
                raise forms.ValidationError(
                    _('A named url can\'t be combined with a redirect page'))
        except KeyError:
            # No 'url' entry in cleaned_data, so there is nothing to
            # cross-check against.
            pass

        return self.cleaned_data['redirect_page']
Example #6
0
 def set_row_color(self,
                   sheet,
                   row,
                   header,
                   color='FF000000',
                   bgcolor='FFFFFFFF'):
     """Apply one cell style to every header column of a sheet row.

     For each entry in ``header`` the cell at (column, ``row``) is looked
     up as ``sheet['<letters><row>']`` and its ``style`` attribute is set
     to ``self.get_cell_style(color=color, bgcolor=bgcolor)``.

     Columns are addressed spreadsheet-style: A..Z, then AA, AB, ... so
     headers with more than 26 entries are handled correctly.

     Args:
         sheet: Object indexable by cell coordinate strings such as 'A1'
             (presumably an openpyxl worksheet - confirm with callers).
         row: Row number used in the coordinate.
         header: Iterable with one entry per column; only its length
             matters here.
         color: Passed through to ``get_cell_style``.
         bgcolor: Passed through to ``get_cell_style``.

     Returns:
         ``row``, unchanged.
     """
     for idx, _ in enumerate(header):
         # Convert the 0-based column index to bijective base-26 letters
         # (0 -> 'A', 25 -> 'Z', 26 -> 'AA', ...).
         letters = ''
         number = idx + 1
         while number:
             number, remainder = divmod(number - 1, 26)
             letters = chr(ord('A') + remainder) + letters
         sheet[letters + str(row)].style = self.get_cell_style(
             color=color, bgcolor=bgcolor)
     return row
Example #7
0
def sanitize_illegal_chars_for_xml(s):
    """Remove characters that are illegal in XML from a string.

    These characters would break the XML parser and typically get into
    the string through a copy/paste.

    The character ranges are courtesy of the XmlRpcPlugin developers, as
    documented here: http://stackoverflow.com/a/22273639

    A byte string is decoded as UTF-8 first.
    """
    global ILLEGAL_XML_CHARS_RE

    if isinstance(s, bytes):
        s = s.decode('utf-8')

    # Build the regex once; later calls reuse the cached module global.
    if ILLEGAL_XML_CHARS_RE is None:
        spans = [
            (0x00, 0x08), (0x0B, 0x0C), (0x0E, 0x1F), (0x7F, 0x84),
            (0x86, 0x9F), (0xFDD0, 0xFDDF), (0xFFFE, 0xFFFF),
        ]

        if sys.maxunicode > 0x10000:
            # Planes 1 through 16 each end in two more illegal code
            # points: 0xNFFFE and 0xNFFFF.
            for plane in range(1, 0x11):
                base = plane << 16
                spans.append((base + 0xFFFE, base + 0xFFFF))

        pieces = []
        for start, end in spans:
            pieces.append('%s-%s' % (six.unichr(start), six.unichr(end)))

        ILLEGAL_XML_CHARS_RE = re.compile('[%s]' % ''.join(pieces))

    return ILLEGAL_XML_CHARS_RE.sub('', s)
Example #8
0
def valid_javascript_identifier(identifier, escape='\\u', ucd_cat=category):
    """Return whether the given ``identifier`` is a valid Javascript identifier.

    Byte strings are decoded as UTF-8 first (undecodable input is invalid).
    Every ``escape`` marker must be followed by exactly four hexadecimal
    digits; such sequences are replaced by the character they denote before
    the identifier is checked.

    The identifier is rejected when it is empty, is a reserved word
    (``is_reserved_js_word``), or contains a character that is neither in
    ``valid_jsid_chars`` nor in an allowed Unicode category
    (``valid_jsid_categories_start`` for the first character,
    ``valid_jsid_categories`` for the rest).

    ``ucd_cat`` is the function used to look up a character's Unicode
    category.
    """
    if not identifier:
        return False

    if not isinstance(identifier, text_type):
        try:
            identifier = text_type(identifier, 'utf-8')
        except UnicodeDecodeError:
            return False

    if escape in identifier:
        hex_digits = '0123456789abcdefABCDEF'
        segments = identifier.split(escape)
        decoded = [segments[0]]

        for segment in segments[1:]:
            code = segment[:4]
            # Validate the digits explicitly: int() on its own tolerates
            # surrounding whitespace and (Python 3.6+) underscores, which
            # would let malformed escapes such as "\u41  " slip through.
            if len(code) < 4 or not all(ch in hex_digits for ch in code):
                return False
            decoded.append(unichr(int(code, 16)))
            decoded.append(segment[4:])

        identifier = ''.join(decoded)

    if is_reserved_js_word(identifier):
        return False

    first_char = identifier[0]
    if not (first_char in valid_jsid_chars or
            ucd_cat(first_char) in valid_jsid_categories_start):
        return False

    return all(
        char in valid_jsid_chars or ucd_cat(char) in valid_jsid_categories
        for char in identifier[1:])
Example #9
0
def is_valid_javascript_identifier(identifier, escape='\\u',
                                   ucd_cat=category):
    """Return whether the given ``identifier`` is a valid Javascript identifier.

    Byte strings are decoded as UTF-8 first (undecodable input is invalid).
    Every ``escape`` marker (by default a backslash followed by ``u``) must
    be followed by exactly four hexadecimal digits; such sequences are
    replaced by the character they denote before the identifier is checked.

    The identifier is rejected when it is empty, is a reserved word
    (``is_reserved_js_word``), or contains a character that is neither in
    ``valid_jsid_chars`` nor in an allowed Unicode category
    (``valid_jsid_categories_start`` for the first character,
    ``valid_jsid_categories`` for the rest).

    ``ucd_cat`` is the function used to look up a character's Unicode
    category.
    """
    # NOTE: the default for ``escape`` is deliberately the two-character
    # string backslash + "u". Writing it as the raw literal r'\\u' yields
    # three characters (two backslashes), which never matches a real
    # Javascript unicode escape.
    if not identifier:
        return False

    if not isinstance(identifier, six.text_type):
        try:
            identifier = six.text_type(identifier, 'utf-8')
        except UnicodeDecodeError:
            return False

    if escape in identifier:
        hex_digits = '0123456789abcdefABCDEF'
        segments = identifier.split(escape)
        decoded = [segments[0]]

        for segment in segments[1:]:
            code = segment[:4]
            # Validate the digits explicitly: int() on its own tolerates
            # surrounding whitespace and (Python 3.6+) underscores, which
            # would let malformed escapes slip through.
            if len(code) < 4 or not all(ch in hex_digits for ch in code):
                return False
            decoded.append(six.unichr(int(code, 16)))
            decoded.append(segment[4:])

        identifier = u''.join(decoded)

    if is_reserved_js_word(identifier):
        return False

    first_char = identifier[0]
    if not (first_char in valid_jsid_chars or
            ucd_cat(first_char) in valid_jsid_categories_start):
        return False

    return all(
        char in valid_jsid_chars or ucd_cat(char) in valid_jsid_categories
        for char in identifier[1:])
Example #10
0
    def test_get_markdown_element_tree_with_named_entities(self):
        """Testing get_markdown_element_tree with named entities"""
        # toxml() writes a select list of characters back out as named
        # entities when generating its string, so the expectation has to
        # use the entity form for those. get_markdown_element_tree() does
        # not expand them itself; they'll be &#...; entities.
        xml_escapes = {
            '&': '&amp;',
            '<': '&lt;',
            '>': '&gt;',
            '"': '&quot;',
        }

        # Start with an entity name that does not exist; the expected
        # output for it is '?'.
        rendered = ['&fooooo;']
        expected = ['?']

        for codepoint, name in six.iteritems(codepoint2name):
            char = six.unichr(codepoint)
            rendered.append('&%s;' % name)
            expected.append(xml_escapes.get(char, char))

        node = get_markdown_element_tree(''.join(rendered))
        self.assertEqual(node[0].toxml(), ''.join(expected))
Example #11
0
 def _callback(matches):
     """Turn the integer captured in group 1 into the matching character."""
     return six.unichr(int(matches.group(1)))
Example #12
0
from math import sqrt

from django.utils import six
from django.core.cache import caches
from django.utils.html import strip_tags
from django.contrib.sites.models import Site
from django.utils.functional import cached_property
from django.core.cache import InvalidCacheBackendError

from zinnia.models.entry import Entry
from zinnia.settings import STOP_WORDS
from zinnia.settings import COMPARISON_FIELDS

# Keys are the code points below sys.maxunicode whose Unicode general
# category starts with 'P' (punctuation); every value is None.
# NOTE(review): presumably used as a translate() table to strip
# punctuation - confirm at the use site.
PUNCTUATION = {
    codepoint: None
    for codepoint in range(sys.maxunicode)
    if unicodedata.category(six.unichr(codepoint)).startswith('P')
}


def pearson_score(list1, list2):
    """
    Compute the Pearson' score between 2 lists of vectors.
    """
    size = len(list1)
    sum1 = sum(list1)
    sum2 = sum(list2)
    sum_sq1 = sum([pow(l, 2) for l in list1])
    sum_sq2 = sum([pow(l, 2) for l in list2])

    prod_sum = sum([list1[i] * list2[i] for i in range(size)])

    num = prod_sum - (sum1 * sum2 / float(size))
Example #13
0
import re
from django.utils.six import unichr
from django.utils.six.moves import html_entities


# Private copy of the standard entity table, extended with ``apos``.
name2codepoint = dict(html_entities.name2codepoint)
name2codepoint['apos'] = ord("'")

# Capture groups: 1 = decimal reference, 2 = hexadecimal reference,
# 3 = entity name.
_ENTITY_REF = re.compile(r'&(?:#(\d+)|(?:#x([\da-fA-F]+))|([a-zA-Z]+));')

# One decoder per capture group above, in the same order. Each returns
# None when it cannot handle the value it was given (group did not match,
# or the entity name is unknown).
_ENTITY_REPLACE = [
    lambda digits: None if not digits else unichr(int(digits, 10)),
    lambda digits: None if not digits else unichr(int(digits, 16)),
    lambda name: (unichr(name2codepoint[name])
                  if name in name2codepoint else None),
]

def htmlentitydecode(s):
    """Replace HTML character references in ``s`` with their characters.

    Every match of ``_ENTITY_REF`` is offered to the decoders in
    ``_ENTITY_REPLACE`` (decoder N receives capture group N); the first
    non-None result replaces the reference. References that no decoder can
    resolve - unknown names, or numeric references outside the range of
    valid code points - are left in the text unchanged.
    """
    def unescape(match):
        for i, sub in enumerate(_ENTITY_REPLACE, start=1):
            try:
                replaced = sub(match.group(i))
            except (ValueError, OverflowError):
                # Numeric reference that unichr() cannot represent (for
                # example "&#99999999999;"): keep the original text rather
                # than letting the exception escape from re.sub().
                break
            if replaced is not None:
                return replaced
        return match.group(0)
    return _ENTITY_REF.sub(unescape, s)
Example #14
0
import datetime
import json

from django.db import connection
from django.test import override_settings
from django.utils import six
from rest_framework.test import APITestCase

from tests.models import Cat, Group, Location, Profile, User
from tests.serializers import NestedEphemeralSerializer
from tests.setup import create_fixture

# Code point 9629 is U+259D (QUADRANT UPPER RIGHT), a non-ASCII character.
UNICODE_STRING = six.unichr(9629)
# Percent-encoded UTF-8 bytes (E2 96 9D) of UNICODE_STRING, i.e. the value
# of urllib.quote(UNICODE_STRING.encode('utf-8')).
UNICODE_URL_STRING = '%E2%96%9D'


@override_settings(
    DYNAMIC_REST={
        'ENABLE_LINKS': False
    }
)
class TestUsersAPI(APITestCase):

    def setUp(self):
        """Create the fixture for each test and lift the diff size limit."""
        self.fixture = create_fixture()
        # maxDiff = None makes unittest show full diffs on assertion failure.
        self.maxDiff = None

    def test_get(self):
        with self.assertNumQueries(1):
            # 1 for User, 0 for Location
Example #15
0
import re
from django.utils.six import unichr
from django.utils.six.moves import html_entities

# Private copy of the standard entity table, extended with ``apos``.
name2codepoint = dict(html_entities.name2codepoint)
name2codepoint['apos'] = ord("'")

# Capture groups: 1 = decimal reference, 2 = hexadecimal reference,
# 3 = entity name.
_ENTITY_REF = re.compile(r'&(?:#(\d+)|(?:#x([\da-fA-F]+))|([a-zA-Z]+));')

# One decoder per capture group above, in the same order. Each returns
# None when it cannot handle the value it was given (group did not match,
# or the entity name is unknown).
_ENTITY_REPLACE = [
    lambda digits: None if not digits else unichr(int(digits, 10)),
    lambda digits: None if not digits else unichr(int(digits, 16)),
    lambda name: (unichr(name2codepoint[name])
                  if name in name2codepoint else None),
]


def htmlentitydecode(s):
    """Decode HTML character references found in ``s``.

    Each match of ``_ENTITY_REF`` is passed through the decoders in
    ``_ENTITY_REPLACE`` (decoder N gets capture group N) and replaced by
    the first non-None result. A reference that cannot be decoded - an
    unknown name, or a numeric reference that is not a valid code point -
    stays in the output exactly as written.
    """
    def unescape(match):
        for i, sub in enumerate(_ENTITY_REPLACE, start=1):
            try:
                replaced = sub(match.group(i))
            except (ValueError, OverflowError):
                # unichr() rejected the number (out of range or too large
                # for a C int); fall back to the untouched reference
                # instead of aborting the whole substitution.
                break
            if replaced is not None:
                return replaced
        return match.group(0)

    return _ENTITY_REF.sub(unescape, s)
Example #16
0
from __future__ import unicode_literals
Example #17
0
import datetime
import json

from django.db import connection
from django.test import override_settings
from django.utils import six
from rest_framework.test import APITestCase

from tests.models import Cat, Group, Location, Profile, User
from tests.serializers import NestedEphemeralSerializer
from tests.setup import create_fixture

# Code point 9629 is U+259D (QUADRANT UPPER RIGHT), a non-ASCII character.
UNICODE_STRING = six.unichr(9629)
# Percent-encoded UTF-8 bytes (E2 96 9D) of UNICODE_STRING, i.e. the value
# of urllib.quote(UNICODE_STRING.encode('utf-8')).
UNICODE_URL_STRING = '%E2%96%9D'


@override_settings(
    DYNAMIC_REST={
        'ENABLE_LINKS': False
    }
)
class TestUsersAPI(APITestCase):

    def setUp(self):
        """Create the fixture for each test and lift the diff size limit."""
        self.fixture = create_fixture()
        # maxDiff = None makes unittest show full diffs on assertion failure.
        self.maxDiff = None

    def test_get(self):
        with self.assertNumQueries(1):
            # 1 for User, 0 for Location
import unicodedata
from math import sqrt

from django.utils import six
from django.core.cache import caches
from django.utils.html import strip_tags
from django.contrib.sites.models import Site
from django.utils.functional import cached_property
from django.core.cache import InvalidCacheBackendError

from zinnia.models.entry import Entry
from zinnia.settings import STOP_WORDS
from zinnia.settings import COMPARISON_FIELDS


# Keys are the code points below sys.maxunicode whose Unicode general
# category starts with "P" (punctuation); every value is None.
# NOTE(review): presumably used as a translate() table to strip
# punctuation - confirm at the use site.
PUNCTUATION = {
    codepoint: None
    for codepoint in range(sys.maxunicode)
    if unicodedata.category(six.unichr(codepoint)).startswith("P")
}


def pearson_score(list1, list2):
    """
    Compute the Pearson' score between 2 lists of vectors.
    """
    size = len(list1)
    sum1 = sum(list1)
    sum2 = sum(list2)
    sum_sq1 = sum([pow(l, 2) for l in list1])
    sum_sq2 = sum([pow(l, 2) for l in list2])

    prod_sum = sum([list1[i] * list2[i] for i in range(size)])

    num = prod_sum - (sum1 * sum2 / float(size))