def test_incremental_encode(self):
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org", "idna")),
            "xn--pythn-mua.org"
        )
        self.assertEquals(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEquals(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEquals(encoder.encode(u"\xe4x"), "")
        self.assertEquals(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEquals(encoder.encode(u"", True), "")
Example #3
    def push_last(self, arg: str, extra_margin: int=0,
            human_trunc: bool=False) -> typing.Optional[str]:
        last_arg = self.args[-1]
        tags, line = self._format()
        n = len(line.encode("utf8")) # get length of current line
        n += self._margin            # margin used for :hostmask
        if " " in arg and not " " in last_arg:
            n += 1                   # +1 for colon on new arg
        n += extra_margin            # used for things like (more ...)

        overflow: typing.Optional[str] = None

        if (n+len(arg.encode("utf8"))) > LINE_MAX:
            for i, char in enumerate(codecs.iterencode(arg, "utf8")):
                n += len(char)
                if n > LINE_MAX:
                    arg, overflow = arg[:i], arg[i:]
                    if human_trunc and not overflow[0] == " ":
                        new_arg, sep, new_overflow = arg.rpartition(" ")
                        if sep:
                            arg = new_arg
                            overflow = new_overflow+overflow
                    break
        if arg:
            self.args[-1] = last_arg+arg
        return overflow
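The loop above walks the UTF-8 width of each character to find a safe split point. The same idea as a standalone helper (a sketch; utf8_truncate is a hypothetical name, not part of the project above):

import codecs

def utf8_truncate(text, limit):
    # For UTF-8, iterencode yields one encoded chunk per character here, so
    # the enumeration index is a valid codepoint offset to cut at.
    n = 0
    for i, encoded_char in enumerate(codecs.iterencode(text, "utf8")):
        n += len(encoded_char)
        if n > limit:
            return text[:i]
    return text

assert utf8_truncate("pyth\xf6n", 5) == "pyth"  # "\xf6" needs two UTF-8 bytes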
Example #4
def handler(env, start_response):
    
    inp = env['wsgi.input']
    reader = codecs.getreader("utf-8")
    state = json.load(reader(inp))
    
    ret = {}
    
    for key,vtype in STATE_KEYS.items():
        value = state[key]
        assert isinstance(value, vtype)
        ret[key] = value

    # validate the input, for sure.
    n = state["n"]
    if n < 2 or n > 201:
        ret["n"] = 101

    values = valuesiter(ret)

    ret["values"] = values
    
    start_response('200 OK', [('Content-Type','application/json')])
    je = json.JSONEncoder()
    viter = je.iterencode(ret)
    return codecs.iterencode(viter,"utf-8")
Example #5
 def handle_charref(self, name):
     if name.startswith('x') or name.startswith('X'):
         cp = int(name[1:], 16)
     else:
         cp = int(name)
     u = unichr(cp)
     self.buf += codecs.iterencode(u, 'utf-8')
Example #6
 def __init__(self, unicode_csvfile, *args, **kwargs):
     decoder = codecs.getdecoder('utf-8')
     self.decoder = lambda v: decoder(v)[0]
     utf8_csvfile = codecs.iterencode(unicode_csvfile, encoding='utf-8')
     # bollocks to csv.DictReader being an old-style class
     csv.DictReader.__init__(self, utf8_csvfile, *args, **kwargs)
     self.fieldnames = [self.decoder(f) for f in self.fieldnames]
Example #8
    def run_cgi_script(self, filesystem_path: pathlib.Path,
                       environ: dict) -> Response:
        """
        Execute the given file as a CGI script and return the script's stdout
        stream to the client.
        """
        script_name = str(filesystem_path)
        cgi_env = environ.copy()
        cgi_env["GATEWAY_INTERFACE"] = "GCI/1.1"
        cgi_env["SCRIPT_NAME"] = script_name

        # Decode the stream as unicode so we can parse the status line
        # Use surrogateescape to preserve any non-UTF8 byte sequences.
        out = subprocess.Popen(
            [script_name],
            stdout=subprocess.PIPE,
            env=cgi_env,
            bufsize=1,
            universal_newlines=True,
            errors="surrogateescape",
        )

        status_line = out.stdout.readline().strip()
        status_parts = status_line.split(maxsplit=1)
        if len(status_parts) != 2 or not status_parts[0].isdecimal():
            return Response(Status.CGI_ERROR, "Unexpected Error")

        status, meta = status_parts

        # Re-encode the rest of the body as bytes
        body = codecs.iterencode(out.stdout,
                                 encoding="utf-8",
                                 errors="surrogateescape")
        return Response(int(status), meta, body)
Example #9
def csvreader(f):
    try:
        for row in csv.reader(f):
            yield row
    except UnicodeEncodeError:
        for row in csv.reader(codecs.iterencode(f, 'utf-8')):
            yield [e.decode('utf-8') for e in row]
Example #11
 def serialize(self, out):
     """Write the units back to file."""
     # Thanks to iterencode, a possible BOM is written only once
     for chunk in iterencode(
         (unit.getoutput() for unit in self.units), self.encoding
     ):
         out.write(chunk)
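The BOM remark is easy to verify: the incremental encoder behind iterencode emits the byte order mark only with the first non-empty chunk. A quick self-check, assuming a BOM-carrying codec such as UTF-16:

import codecs

chunks = list(codecs.iterencode(["first", "second"], "utf-16"))
assert chunks[0].startswith(codecs.BOM)        # BOM on the first chunk...
assert not chunks[1].startswith(codecs.BOM)    # ...and never again
assert b"".join(chunks) == "firstsecond".encode("utf-16")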
Example #12
def iter_byte_indices(text, codec):
    '''
    Iterate over the codepoint offset of each byte (any codec).
    '''
    for i, b in enumerate(codecs.iterencode(text, codec)):
        for _ in b:
            yield i
    yield len(text)
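Usage is straightforward; a small check of the function above (multi-byte characters repeat their codepoint offset once per byte):

# "\xe9" occupies two UTF-8 bytes, so its offset 1 appears twice.
assert list(iter_byte_indices("a\xe9b", "utf-8")) == [0, 1, 1, 2, 3]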
Example #13
def _utf8_iter_recoder(stream, encoding):
    """Generator re-encodes input file's lines from a given
    encoding to utf-8.

    :param stream: file handle.
    :param encoding: str of encoding.
    """
    return codecs.iterencode(codecs.iterdecode(stream, encoding), "utf-8")
Example #14
def test_incremental_encode():
    from codecs import iterencode
    encoded = iterencode(
        (c for c in UNICODE.utf8),
        'internet'
    )
    encoded = ''.join(encoded)
    assert encoded == UNICODE.utf8.encode('UTF-8')
Example #15
    def __init__(self, f, fromenc=ENCODING, toenc=ENCODING, **kwargs):
        """ Reencoder constructor

        Args:
            f (obj): File-like object
            fromenc (str): The input encoding.
            toenc (str): The output encoding.

        Kwargs:
            remove_BOM (bool): Remove Byte Order Marker (default: True)
            decode (bool): Decode the text into a string (default: False)

        Examples:
            >>> encoding = 'utf-16-be'
            >>> eff = p.join(DATA_DIR, 'utf16_big.csv')
            >>>
            >>> with open(eff, 'rb') as f:
            ...     reenc = Reencoder(f, encoding)
            ...     first = reenc.readline(keepends=False)
            ...     first.decode('utf-8') == '\ufeffa,b,c'
            ...     reenc.read().decode('utf-8').split('\\n')[1] == '4,5,ʤ'
            True
            True
            >>> with open(eff, 'rb') as f:
            ...     reenc = Reencoder(f, encoding, decode=True)
            ...     reenc.readline(keepends=False) == '\ufeffa,b,c'
            True
            >>> with open(eff, 'rU', encoding=encoding) as f:
            ...     reenc = Reencoder(f, remove_BOM=True)
            ...     reenc.readline(keepends=False) == b'a,b,c'
            ...     reenc.readline() == b'1,2,3\\n'
            ...     reenc.readline().decode('utf-8') == '4,5,ʤ'
            True
            True
            True
        """
        self.fileno = f.fileno
        first_line = next(f)
        bytes_mode = isinstance(first_line, BYTE_TYPE)
        decode = kwargs.get('decode')
        rencode = not decode

        if kwargs.get('remove_BOM'):
            strip = BOM.encode(fromenc) if bytes_mode else BOM
            first_line = first_line.lstrip(strip)

        chained = it.chain([first_line], f)

        if bytes_mode:
            decoded = iterdecode(chained, fromenc)
            self.binary = rencode
        else:
            decoded = chained
            self.binary = bytes_mode or rencode

        self.stream = iterencode(decoded, toenc) if rencode else decoded
Example #16
 def read_decoded_data(self, f):
   # This ensures that the raw result bytes we got are, in fact, valid utf-8,
   # replacing invalid bytes with �. Because python2's unicode support is
   # wonky, we re-encode the now-valid-utf-8 back into a str object so that
   # users don't need to deal with `unicode` objects.
   # The file contents can be large, so be careful to do the conversion in
   # chunks while streaming the data in, instead of requiring a full copy.
   n = 1 << 16
   chunks = iter(lambda: f.read(n), '')
   decoded = codecs.iterdecode(chunks, 'utf-8', 'replace')
   return ''.join(codecs.iterencode(decoded, 'utf-8'))
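The sanitising step at the heart of this snippet is the chunked decode with the 'replace' error handler; in miniature, and assuming Python 3 semantics (bytes in, str out):

import codecs

raw_chunks = [b"ok \xff bad", b" tail"]        # \xff is never valid UTF-8
decoded = codecs.iterdecode(raw_chunks, "utf-8", "replace")
assert "".join(decoded) == "ok \ufffd bad tail"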
Example #17
 def __iter__(self):
     root = ElementTree.fromstringlist(codecs.iterencode(self.inputreader, 'utf'))
     table = root.findall('.//table')[5]
     for row in table[2:-1]:
         date_str = row.find('.//td[3].nobr').text
         tdate = datetime.strptime(date_str, '%d.%m.%Y')
         amount_str = row.find('.//td[5].nobr').text
         amount_str = amount_str.replace('.', '')
         tamount = float(normalize_num(amount_str))
         desc = plain_content(row.find('.//td[4]'))
         tmessage = normalize_field(desc)
         yield TransactionData(tdate, tamount, message=tmessage)
Example #18
def iter_codepoint_indices(text, codec):
    '''
    Iterate over the byte offset of each character (any codec).
    '''
    # Note: for encodings with a BOM, the first offset probably shouldn't
    # be 0, but 2, 3, or 4, depending on the BOM's length.
    # This is ignored due to the lack of expected practical applications.
    i = 0
    for b in codecs.iterencode(text, codec):
        yield i
        i += len(b)
    yield i
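This is the inverse mapping of iter_byte_indices above; a small check of the function as defined:

# "\xe9" advances the byte offset by two in UTF-8.
assert list(iter_codepoint_indices("a\xe9b", "utf-8")) == [0, 1, 3, 4]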
Example #20
File: util.py Project: marrow/cinje
def stream(input, encoding=None, errors='strict'):
	"""Safely iterate a template generator, ignoring ``None`` values and optionally stream encoding.
	
	Used internally by ``cinje.flatten``, this allows for easy use of a template generator as a WSGI body.
	"""
	
	input = (i for i in input if i)  # Omits `None` (empty wrappers) and empty chunks.
	
	if encoding:  # Automatically, and iteratively, encode the text if requested.
		input = iterencode(input, encoding, errors=errors)
	
	return input
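A sketch of driving stream() as a WSGI-style body, with made-up template chunks (the None and empty entries stand in for cinje's empty wrappers):

chunks = ["<p>", None, "caf\xe9", "", "</p>"]
body = stream(iter(chunks), encoding="utf-8")
assert b"".join(body) == b"<p>caf\xc3\xa9</p>"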
Example #21
 def write_encoded_data(self, f):
   # Sometimes users give us invalid utf-8 data. They shouldn't, but it does
   # happen every once in a while. Just ignore it, and replace with �.
   # We're assuming users only want to write text data out.
   # self.data can be large, so be careful to do the conversion in chunks
   # while streaming the data out, instead of requiring a full copy.
   n = 1 << 16
   # This is a generator expression, so this only copies one chunk of
   # self.data at any one time.
   chunks = (self.data[i:i + n] for i in xrange(0, len(self.data), n))
   decoded = codecs.iterdecode(chunks, 'utf-8', 'replace')
   for chunk in codecs.iterencode(decoded, 'utf-8'):
     f.write(chunk)
def csv_unireader(f, encoding="utf-8"):
    if PY3:
        f = codecs.open(f, encoding=encoding)
        r = csv.reader(f, delimiter='|', quotechar='"')
    else:
        r = csv.reader(
            codecs.iterencode(codecs.iterdecode(open(f), encoding), 'utf-8'),
            delimiter=b'|', quotechar=b'"')
    for row in r:
        if PY3:
            yield row
        else:
            yield [e.decode("utf-8") for e in row]
Example #23
 def error(self,
           code: int,
           message: str,
           environ: Environ,
           start_response: StartResponseCallable) -> ResponseStream:
     statuses = {
         400: 'Bad Request',
         405: 'Method Not Allowed',
     }
     start_response(
         f'{code} {statuses[code]}',
         [('Content-Type', 'text/plain')]
     )
     return codecs.iterencode(message, 'utf-8')
Example #24
    def encode_all(f=None, **kwargs):
        """
        Encode unicode into bytes (str)
        """
        names = kwargs.pop('fieldnames', None)
        encoding = kwargs.pop('encoding', None) if f else False
        decoded = codecs.iterdecode(f, encoding) if encoding else f
        ekwargs = {encode(k): encode(v) for k, v in kwargs.items()}
        fmtparams = {k: v for k, v in ekwargs.items() if k in dir(csv.Dialect)}

        res = {
            'f': codecs.iterencode(decoded, ENCODING) if f else None,
            'fieldnames': [encode(x) for x in names] if names else None,
            'kwargs': ekwargs,
            'fmtparams': fmtparams}

        return res
Example #25
def write_text_resource(foutput, text, encoding='utf-8'):
    """Write a text resource
    :param foutput: path or file handle
    :type foutput: str, file
    :param text: content to write
    :type text: str, unicode, iterable
    :param encoding: which encoding to use (default: UTF-8)
    :type encoding: str
    """
    if isinstance(foutput, file):
        for chunk in codecs.iterencode(text, encoding=encoding):
            foutput.write(chunk)
    else:
        with codecs.open(foutput, 'w', encoding=encoding) as fhandle:
            if isiterable(text):
                for line in text:
                    fhandle.write(u"%s\n" % line)
            else:
                fhandle.write(text)
Example #26
    def __iter_extended_rows(self):

        # For PY2 encode/decode
        if six.PY2:
            # Reader requires utf-8 encoded stream
            bytes = iterencode(self.__chars, 'utf-8')
            items = csv.reader(bytes, **self.__options)
            for number, item in enumerate(items, start=1):
                values = []
                for value in item:
                    value = value.decode('utf-8')
                    values.append(value)
                yield (number, None, list(values))

        # For PY3 use chars
        else:
            items = csv.reader(self.__chars, **self.__options)
            for number, item in enumerate(items, start=1):
                yield (number, None, list(item))
Example #28
    def __emit_items(self):

        # For PY2 encode/decode
        if six.PY2:
            # Reader requires utf-8 encoded stream
            bytes = iterencode(self.__chars, 'utf-8')
            items = csv.reader(bytes, **self.__options)
            for item in items:
                values = []
                for value in item:
                    value = value.decode('utf-8')
                    values.append(value)
                yield (None, tuple(values))

        # For PY3 use chars
        else:
            items = csv.reader(self.__chars, **self.__options)
            for item in items:
                yield (None, tuple(item))
Example #30
def read_docs(filename, column):
	ret_arr = []
	#print_err("reading %s"%(filename))
	dupes = 0
	with open(filename, 'r') as infile:
		coder = codecs.iterencode(codecs.iterdecode(infile, "utf-8"), "utf-8")
		csvfile = csv.reader(coder, delimiter=',', quotechar='"')
		#next(csvfile)
		dup_checker = {}
		for row in csvfile:
			test = row[column].lower()
			if test not in dup_checker:
				dup_checker[test] = 1
				ret_arr.append(row[column])
			else:
				dupes += 1
	print_err("total dupes: %d"%dupes)

	return ret_arr
Example #31
    def testParseResponse(self):

        # Parse a simple response to a subscribe:
        self.conn.write_socket(self.subscribe_cmd)
        parsed_resp = self.conn.parse_response()
        self.assertEqual([u'subscribe', u'tmp.20', 1L], parsed_resp)

        # Parse a more complicated incoming msg:
        #                 '*3\r\n' +\
        # 		    	  '$7\r\n' +\
        # 		    	  'PUBLISH\r\n' +\
        # 		    	  '$4\r\n' +\
        # 		    	  'test\r\n' +\
        # 		    	  '$11\r\n{"Hello World"}'

        # Make the OnDemandPublisher thread send us this
        # message:
        OneShotTester.answer_server.sendMessage(OneShotTester.test_msg,
                                                OneShotTester.from_channel)
        parsed_resp = self.conn.parse_response()

        # Get the message that was sent back to us:
        sent_bus_msg = OneShotTester.answer_server.outMsg
        out_id = sent_bus_msg.id
        rx_encoded = []
        for el in codecs.iterencode(parsed_resp, 'UTF-8'):
            rx_encoded.append(el)
        expected = [
            'message', OneShotTester.from_channel,
            '{"content": "Hello world", "id": "%s", ' % out_id
        ]
        # Time of sent msg will be different each time; cut it out of the
        # rx-ed string:
        rxed_msg_body = rx_encoded[2]
        rxed_body_chop_pos = rxed_msg_body.index('"time": ')
        chopped_rxed_body = rxed_msg_body[:rxed_body_chop_pos]
        # Replaced msg body part of received by the
        # truncated version that doesn't include the time:
        rx_encoded[2] = chopped_rxed_body

        self.assertEqual(expected, rx_encoded)
Example #32
    def __iter_extended_rows(self):

        # For PY2 encode/decode
        if six.PY2:
            # Reader requires utf-8 encoded stream
            bytes = iterencode(self.__chars, 'utf-8')
            sample, dialect = self.__prepare_dialect(bytes)
            items = csv.reader(chain(sample, bytes), dialect=dialect)
            for number, item in enumerate(items, start=1):
                values = []
                for value in item:
                    value = value.decode('utf-8')
                    values.append(value)
                yield (number, None, list(values))

        # For PY3 use chars
        else:
            sample, dialect = self.__prepare_dialect(self.__chars)
            items = csv.reader(chain(sample, self.__chars), dialect=dialect)
            for number, item in enumerate(items, start=1):
                yield (number, None, list(item))
Example #33
def encode_all(f=None, **kwargs):
    """
    Encode unicode into bytes (str)
    """
    names = kwargs.pop('fieldnames', None)
    encoding = kwargs.pop('encoding', None) if f else False

    if PY2:
        decoded = codecs.iterdecode(f, encoding) if encoding else f
        ekwargs = {encode(k): encode(v) for k, v in kwargs.items()}
    else:
        decoded, ekwargs = f, kwargs

    res = {
        'f': codecs.iterencode(decoded, ENCODING) if f and PY2 else decoded,
        'fieldnames': [encode(x) for x in names] if names and PY2 else names,
        'drkwargs': use_keys_from(ekwargs, READER_KEYS),
        'dwkwargs': use_keys_from(ekwargs, WRITER_KEYS),
        'fmtparams': use_keys_from(ekwargs, FMTKEYS)}

    return res
Example #34
def encode_all(f=None, **kwargs):
    """
    Encode unicode into bytes (str)
    """
    names = kwargs.pop('fieldnames', None)
    encoding = kwargs.pop('encoding', None) if f else False

    if PY2:
        decoded = codecs.iterdecode(f, encoding) if encoding else f
        ekwargs = {encode(k): encode(v) for k, v in kwargs.items()}
    else:
        decoded, ekwargs = f, kwargs

    res = {
        'f': codecs.iterencode(decoded, ENCODING) if f and PY2 else decoded,
        'fieldnames': [encode(x) for x in names] if names and PY2 else names,
        'drkwargs': use_keys_from(ekwargs, READER_KEYS),
        'dwkwargs': use_keys_from(ekwargs, WRITER_KEYS),
        'fmtparams': use_keys_from(ekwargs, FMTKEYS)}

    return res
Example #35
    def __iter_extended_rows(self):

        # For PY2 encode/decode
        if six.PY2:
            # Reader requires utf-8 encoded stream
            bytes = iterencode(self.__chars, 'utf-8')
            sample, dialect = self.__prepare_dialect(bytes)
            items = csv.reader(chain(sample, bytes), dialect=dialect)
            for row_number, item in enumerate(items, start=1):
                values = []
                for value in item:
                    value = value.decode('utf-8')
                    values.append(value)
                yield (row_number, None, list(values))

        # For PY3 use chars
        else:
            sample, dialect = self.__prepare_dialect(self.__chars)
            items = csv.reader(chain(sample, self.__chars), dialect=dialect)
            for row_number, item in enumerate(items, start=1):
                yield (row_number, None, list(item))
Example #37
    def testIncrementalEncoder(self):

        # Tests derived from Python standard library test/test_codecs.py

        incremental_tests = (
            (u"python.org", b"python.org"),
            (u"python.org.", b"python.org."),
            (u"pyth\xf6n.org", b"xn--pythn-mua.org"),
            (u"pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for decoded, encoded in incremental_tests:
            self.assertEqual(b"".join(codecs.iterencode(decoded, "idna")),
                             encoded)

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode(u"\xe4x"), b"")
        self.assertEqual(encoder.encode(u"ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode(u"", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode(u"\xe4x"), b"")
        self.assertEqual(encoder.encode(u"ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode(u"", True), b"")
Example #38
    def testIncrementalEncoder(self):

        # Tests derived from Python standard library test/test_codecs.py

        incremental_tests = (
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )
        for decoded, encoded in incremental_tests:
            self.assertEqual(b"".join(codecs.iterencode(decoded, "idna")),
                             encoded)

        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")
def csv_unireader(f, encoding="utf-8"):
    """
    abre um arquivo utf-8 com a lib csv
    """
    for row in csv.reader(codecs.iterencode(codecs.iterdecode(f, encoding), "utf-8"), delimiter="\t"):
        yield [e.decode("utf-8") for e in row]
Example #40
file_list = sorted(glob.glob(args.inputs))
file_count = len(file_list)

url_dict = {}

#
# step through each file
#

for filename_index in range(file_count):
    filename = file_list[filename_index]
    print "opening \"%s\" (%d of %d)" % (
        filename,  filename_index+1, file_count)

    with io.open(filename, "r", encoding="utf-8") as f:
        input_csv = csv.DictReader(codecs.iterencode(f, "utf-8"))

        for row in input_csv:
            url = row['url']
            if url not in url_dict:
                url_dict[url] = row


with io.open(args.output, "w", encoding="utf-8") as outfile:
    outfile.write(u"id,url,text\n")
    for row in url_dict.values():
        tweet_id = row["id"]
        url = row["url"]
        text = row["text"]
        if text is not None:
            text = text.replace("\"", "\"\"")
Example #41
    def on_get(self, request, resp, **kwargs):
        """
        Falcon resource method, for handling HTTP request GET method

        Falcon request provides: parameters embedded in URL via a keyword args
        dict, as well as convenience class variables falcon.HTTP_*

        FIXME: remove module pylint:disable= & refactor this overlong code block!
        """
        # obtain logged in API user ID (if available)
        api_session_user = auth.get_user_id(request)
        # select data
        with warehouse.get_source_model_session() as dwsupport_model:
            sources = source.SourceUtil.get_list_of_data_sources(
                 request.url
                ,auth.get_user_id(request)
                ,dwsupport_model)
            str_dataset_id = get_requested_dataset_id( sources, request, resp, kwargs)
            list_variables_requested_source = variables.get_list_of_variables( str_dataset_id)
            # convert 'datasets' into a list of variables
            list_requested_datasets = parameters.get_requested_datasets( request)
            list_variables_from_datasets = []
            for str_id in list_requested_datasets:
                if str_dataset_id == str_id:
                    list_variables_from_datasets = list_variables_requested_source
                    break
                if str_dataset_id == 'warehouse': #FIXME: refactor this into a source.warehouse function
                    #obtain the 'warehouse' field aliases for each dataset
                    list_source_variables = variables.get_list_of_variables( str_id)
                    for var in list_source_variables:
                        warehouse_utils = api.resources.source.warehouse.warehouse
                        str_alias = warehouse_utils.prefix_field_name( var, str_id)
                        list_variables_from_datasets.append( str_alias)
                else: #error; not a warehouse request & Dataset does not match requested ID
                    raise falcon.HTTPNotFound(description= "Unrecognized dataset: "
                                           + str_id)
            list_requested_variables = parameters.get_requested_variables( request)

            # add default variables
            if len(list_requested_variables) < 1:
                requested_default_query = parameters.get_list_requested_parameter(
                    defaults.PARAMETER_NAME, request)
                try:
                    default_variables = defaults.get_default_variables(
                         requested_default_query
                        ,str_dataset_id
                        ,dwsupport_model)
                except defaults.UndefinedDefaultQuery as error:
                    msg = ("Value {} is not defined for dataset: '{}'"
                           .format(error, str_dataset_id))
                    raise falcon.HTTPInvalidParam(msg, defaults.PARAMETER_NAME)
                except defaults.AmbiguousDefaultQuery as error:
                    msg = "More than one value was specified: {}".format(error)
                    raise falcon.HTTPInvalidParam(msg, defaults.PARAMETER_NAME)
                except defaults.AmbiguousQueryHierarchy as error:
                    raise falcon.HTTPBadRequest( #TODO: add functional test coverage
                        title="Missing Parameter"
                        ,description=(
                           "Selection defaults not clear for"
                           " data source: '{}'."
                           " Selection must specify one or more 'variables='"
                           " selection parameters (or a 'defaults=' parameter"
                           " value from the following list: {})"
                           ).format(str_dataset_id, error)
                    )
                list_requested_variables.extend(default_variables)

            # add variables derived from 'datasets' param
            list_requested_variables.extend( list_variables_from_datasets)
            list_requested_filters = parameters.get_requested_filters( request)
            # process pivot columns parameter
            try:
                pivot_column_variables = parameters.get_requested_pivot_columns(
                    request
                    ,str_dataset_id
                    ,dwsupport_model['tables'])
            except parameters.PivotVariableError as err:
                raise falcon.HTTPInvalidParam(
                    msg=str(err)
                    ,param_name=parameters.ReservedParameterNames.pivot_columns
                ) from err
            # process 'Empty_cells' parameter
            try:
                empty_cell_dimensions = parameters.get_requested_empty_cells(
                    request
                    ,str_dataset_id
                    ,dwsupport_model['tables']
                    ,dwsupport_model['associations']
                )
            except (parameters.EmptyCellsSourceError
                    ,parameters.EmptyCellsDimensionError) as err:
                raise falcon.HTTPInvalidParam(
                    msg=str(err)
                    ,param_name=parameters.ReservedParameterNames.empty_cells
                ) from err
            # retrieve data
            start_time = datetime.now(pytz.timezone('US/Pacific'))
            try:
                result_generator = data.get_data(str_dataset_id
                                            ,list_requested_variables
                                            ,list_requested_filters
                                            ,pivot_column_variables
                                            ,empty_cell_dimensions
                                            ,user_id=api_session_user)
            except sqlalchemy.exc.DatabaseError as err:
                raise falcon.HTTPInternalServerError(
                    title='500'
                    ,description="Please try again"
                ) from err
            except data.NoSourceException as err:
                raise falcon.HTTPNotFound(description=("Source '{}' dataset not found:"
                                                       " {}").format(str_dataset_id,err)) from err
            except parameters.FilterVariableError as err:
                #TODO: the bad HTTP parameter is not always 'filters'; sometimes it is a user-defined param (implicit filter)
                #TODO: perhaps parameters should raise two different Exceptions?
                raise falcon.HTTPInvalidParam(str(err), 'filters') from err
            except data.NotAuthorizedException as error:
                raise falcon.HTTPUnauthorized(
                    title='401'
                    ,description=("Selection from sensitive data source '{}'"
                                  " not authorized").format(str_dataset_id)
                ) from error
            str_format_type = get_requested_format_type( kwargs)
            resp.content_type = FormatUtil.get_http_content_type(str_format_type)
            for data_source in sources:
                if data_source['id'] == str_dataset_id:
                    formatter = FormatUtil(str_format_type, data_source, request, start_time)
                    result_stream = formatter.format(result_generator)
                    break
            chunked_stream = streaming.biggerchunks_stream(result_stream, 4)#2(13.6),3(13),4(
            if str_format_type == 'xlsx':
                byte_stream = chunked_stream #already bytes
            else:
                encoding = 'utf-8'
                if resp.content_type == 'text/csv':
                    encoding = 'utf-8-sig'
                byte_stream = codecs.iterencode(chunked_stream, encoding)
            resp.stream = byte_stream#content
Example #42
re_strip_mention = re.compile(u'(via\s)?@\w+', flags=re.IGNORECASE)
re_strip_cc = re.compile(u'-CC', flags=re.IGNORECASE)
re_strip_url = re.compile(u'(?i)\b((?:https?:?//|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', flags=re.IGNORECASE)
re_strip_tco_url = re.compile(u'https?:?//[\w\-\.]*(/[\w\-\%]*)*( via)?', flags=re.IGNORECASE)
re_strip_punct = re.compile(u'[\.,;\:\-\!\[\]\"\'\?\(\)\|@\/]', flags=re.IGNORECASE)
re_strip_newline = re.compile(u'[\r\n\t]+')
re_strip_ending_http = re.compile(u'(\s)ht?t?t?p?:?/?/?([\w\-\.]*)?\s?$', flags=re.IGNORECASE)

word_split_re = re.compile(u'\@?\w+|[\$?-]*[\d\.]+|[a-zA-Z0-9\-\'\/]+|[!\?\.\,\/\@\:\-\_&\$\s]*', flags=re.IGNORECASE)



for csvinput in args.inputfiles:

	with open(csvinput, 'r') as infile:
		coder = codecs.iterencode(codecs.iterdecode(infile, "utf-8"), "utf-8")
		csvreader = csv.reader(coder, delimiter=',', quotechar='"')

		# skip header
		next(csvreader)
		for row in csvreader:

			text = row[2]


			# strip retweet
			text = re_strip_rt.sub('', text)
			text = re_strip_mention.sub('', text)
			text = re_strip_cc.sub('', text)
			text = re_strip_url.sub('', text)
			text = re_strip_tco_url.sub('', text)
Example #43
 def handle(self, *args, **options):
     for file_name in args:
         csv_file = codecs.open(file_name, 'rb', 'cp1252')
         encoded_file = codecs.iterencode(csv_file, 'utf-8')
         self.consume_csv(encoded_file)
def csv_unireader(f, encoding="utf-8"):
    for row in csv.reader(codecs.iterencode(codecs.iterdecode(f, encoding), "utf-8")):
        yield [e.decode("utf-8") for e in row]
Example #45
def _csv_unireader(f, encoding='utf-8'):
    return csv.DictReader(codecs.iterencode(codecs.iterdecode(f, encoding),
                                            'utf-8'),
                          delimiter=',')
Example #46
 sniffer = csv.Sniffer()
 for f in files:
     encoding = 'UTF-8'
     if f is not sys.stdin and quiet < 2:
         print ('Processing %s...' % f.name),
         sys.stdout.flush()
     data = f.read()
     prefix = data[:1024 ** 2]
     if f is not sys.stdin and chardet:
         encoding = chardet.detect(prefix)['encoding']
     dialect = sniffer.sniff(prefix, delimiters=',\t')
     f = StringIO(data)
     if f is not sys.stdin and quiet < 2:
         print '(as %s)' % encoding
     if encoding != 'UTF-8':
         f = codecs.iterencode(
             codecs.iterdecode(f, encoding), 'utf-8')
     if args.dry_run:
         # just the headers
         data = [f.next()]
     else:
         data = list(f)
     reader = csv.DictReader(data, dialect=dialect)
     if args.dry_run:
         d = GetSupportingDefaultDict()
         importer(d)
         fieldnames = set(reader.fieldnames)
         d = set(d)
         used = fieldnames & d
         print 'Used keys (%i):' % len(used)
         for k in sorted(used):
             print '*', k
Example #48
#coding=utf-8
'''
Created on 2012-5-4

@author: zhaojp
'''
from codecs import iterencode;
from codecs import iterdecode;

echars = iterencode(['你', '好', '啊'], 'utf-8');
dchars = iterdecode(echars, 'utf-8');

for c in dchars:
    print(c)
Example #49
def encode(input, output):
    # TODO tests
    for chunk in codecs.iterencode(chunks_text(input), 'utf-8'):
        output.write(chunk)
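chunks_text is the project's own chunking helper; with a trivial stand-in and an in-memory sink, the pattern looks like this (a sketch, not the project's API):

import codecs
import io

def chunks_text(source):        # stand-in for the project's helper
    yield from source

out = io.BytesIO()
for chunk in codecs.iterencode(chunks_text(["a", "\xe9"]), "utf-8"):
    out.write(chunk)
assert out.getvalue() == b"a\xc3\xa9"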
Example #50
	def csv_unireader(self, f, encoding="utf-8"):
		import codecs
		import csv
		for row in csv.reader(codecs.iterencode(codecs.iterdecode(f, encoding), "utf-8")):
			yield [e.decode("utf-8") for e in row]
Example #51
    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            (bytes, size) = codecs.getencoder(encoding)(s)
            if encoding != "unicode_internal":
                self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")
Example #52
def csv_unireader(f, encoding="utf-8", **fmtparams):
    data = csv.reader(
        codecs.iterencode(codecs.iterdecode(f, encoding), "utf-8"),
        **fmtparams)
    for row in data:
        yield [e.decode("utf-8") for e in row]
Example #53
def csv_unireader(f, encoding="utf-8"):
    for row in csv.reader(codecs.iterencode(codecs.iterdecode(f, encoding), "utf-8"), delimiter=';', quotechar='"'):
        yield [e.decode("utf-8") for e in row]
Example #54
def csv_rows(file_obj, dialect=csv.excel, **kwargs):
    csvfile = iterencode(file_obj, 'utf-8') if PY2 else file_obj
    csvreader = csv.reader(csvfile, dialect=dialect, **kwargs)
    csvreader = (list(iterdecode(i, 'utf-8')) for i in csvreader) if PY2 else csvreader
    for row in csvreader:
        yield row
Example #55
def csv_unireader(f, encoding="utf-8", delimiter=',', quotechar='"'):
    for row in csv.reader(codecs.iterencode(codecs.iterdecode(f, encoding),
                                            "utf-8"),
                          delimiter=delimiter,
                          quotechar=quotechar):
        yield [e.decode("utf-8") for e in row]
Example #56
def csv_unireader(f, encoding="utf-8"):
    for row in csv.reader(
            codecs.iterencode(codecs.iterdecode(f, encoding), "utf-8")):
        yield [e.decode("utf-8") for e in row]
Example #57
 def writelines(self, lines):
     # function scope import, but this is a bug workaround for pandas.
     from codecs import iterencode
     encoded_lines = iterencode(lines, self.encoding)
     self.bytes_filelike.writelines(encoded_lines)
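The workaround reduces to handing writelines() a lazily encoded iterator; a self-contained check with an in-memory byte sink:

import codecs
import io

buf = io.BytesIO()
buf.writelines(codecs.iterencode(["a,b\n", "caf\xe9\n"], "utf-8"))
assert buf.getvalue() == b"a,b\ncaf\xc3\xa9\n"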